#!/usr/bin/env python
#_author = vkremez # This is an assignment for University of Michigan course on "Using Python to Access Web Data." # This Python program will allow us to scrape the content of a website for any URLs. # Here is the algorithm: ''' The program will use urllib to (1) read the HTML from the website data, (2) extract the href= values from the anchor tags, (3) scan for a tag that is in a particular position relative to the first name in the list, (4) follow that link and repeat the process a number of times and report the results. ''' import os import argparse import urllib from datetime import datetime from bs4 import * print os.system('echo WEB SCRAPER 1.0') print datetime.datetime.now() url = raw_input('Enter URL: ') html = urllib.urlopen(url).read() soup = BeautifulSoup(html) tags = soup('a') count = int(raw_input('Enter count: ')) position = int(raw_input('Enter position: ')) print "Retrieving: " + url print "Retrieving: " + tags[position-1].get('href', None) for x in range(0,count-1): html = urllib.urlopen(tags[position-1].get('href',None)).read() soup = BeautifulSoup(html) tags = soup('a') print "Retrieving: " + tags[position-1].get('href', None) parser = argparse.ArgumentParser(description='Web Scraper 1.0 by VK.') parser.add_argument('string', metavar='www', type=int, nargs='+', help='http://website.com format') args = parser.parse_args() print(args.accumulate(args.integers))
1 Comment
Subramanya Chakravarthy
1/9/2016 10:03:45 pm
Nice and simple scraper
Reply
Leave a Reply.
Author: Vitali Kremez | Archives
January 2016
Categories