#!/usr/bin/env python
#_author = vkremez # This is an assignment for University of Michigan course on "Using Python to Access Web Data." # This Python program will allow us to scrape the content of a website for any URLs. # Here is the algorithm: ''' The program will use urllib to (1) read the HTML from the website data, (2) extract the href= values from the anchor tags, (3) scan for a tag that is in a particular position relative to the first name in the list, (4) follow that link and repeat the process a number of times and report the results. ''' import os import argparse import urllib from datetime import datetime from bs4 import * print os.system('echo WEB SCRAPER 1.0') print datetime.datetime.now() url = raw_input('Enter URL: ') html = urllib.urlopen(url).read() soup = BeautifulSoup(html) tags = soup('a') count = int(raw_input('Enter count: ')) position = int(raw_input('Enter position: ')) print "Retrieving: " + url print "Retrieving: " + tags[position-1].get('href', None) for x in range(0,count-1): html = urllib.urlopen(tags[position-1].get('href',None)).read() soup = BeautifulSoup(html) tags = soup('a') print "Retrieving: " + tags[position-1].get('href', None) parser = argparse.ArgumentParser(description='Web Scraper 1.0 by VK.') parser.add_argument('string', metavar='www', type=int, nargs='+', help='http://website.com format') args = parser.parse_args() print(args.accumulate(args.integers))
1 Comment
Subramanya Chakravarthy
1/9/2016 10:03:45 pm
Nice and simple scraper
Reply
Leave a Reply.
Author: Vitali Kremez | Archives
January 2016
Categories