Categories
Uncategorized

Downloading HTML source code – Python

#!/usr/bin/python
# Download a web page's HTML source and save the lines that contain links

import urllib

# output files: the raw page source, and the lines of it that contain links
rawSrc = "rawSrc.txt"
linkSrc = "lnksSrc.txt"

# this function will download the source code and save it as a txt file.
def dlSrc(locPage="http://the.website.where/the_stuff_is.html", outPath=None):
	"""Download the page at locPage and save its raw bytes to outPath.

	locPage -- URL to fetch; defaults to the placeholder address used by
	           this script.
	outPath -- file to (over)write; defaults to the module-level rawSrc.

	Returns nothing. Errors raised by urlopen() or open() are not caught
	here and propagate to the caller.
	"""
	# urlopen lives in urllib.request on Python 3 and in urllib on Python 2
	try:
		from urllib.request import urlopen
	except ImportError:
		from urllib import urlopen

	if outPath is None:
		outPath = rawSrc

	# connect and download first, so that a failed request does not
	# truncate a previously saved copy of the page
	webPage = urlopen(locPage)
	try:
		wPageSrc = webPage.read()
	finally:
		# the Python 2 response object is not a context manager
		webPage.close()

	# write to text file; binary mode because read() returns bytes on
	# Python 3, and so the saved source matches what the server sent
	with open(outPath, "wb") as wRawSrc:
		wRawSrc.write(wPageSrc)

# this function will copy every line that contains "a href" to another txt file
def cleanFile(inPath=None, outPath=None):
	"""Copy the lines of inPath that contain the text 'a href' to outPath.

	inPath  -- file to read; defaults to the module-level rawSrc.
	outPath -- file to (over)write; defaults to the module-level linkSrc.

	Whole lines are copied unchanged, in their original order; the match
	is a plain, case-sensitive substring test, not HTML parsing. Returns
	nothing. Errors raised by open() propagate to the caller.
	"""
	if inPath is None:
		inPath = rawSrc
	if outPath is None:
		outPath = linkSrc

	# 'with' closes both files (and flushes the output) even on error
	with open(inPath, "r") as rRawSrc:
		with open(outPath, "w") as wlinkSrc:
			# export only the lines that contain "a href"
			wlinkSrc.writelines(
				line for line in rRawSrc if "a href" in line
			)

# Run the functions. Order matters: dlSrc() writes the file named by rawSrc,
# which cleanFile() then reads.
# NOTE(review): these calls sit at module level, so they also run when the
# file is imported -- consider an `if __name__ == "__main__":` guard.
dlSrc()
cleanFile()

Leave a Reply

Please log in using one of these methods to post your comment:

WordPress.com Logo

You are commenting using your WordPress.com account. ( Log Out / Change )

Google photo

You are commenting using your Google account. ( Log Out / Change )

Twitter picture

You are commenting using your Twitter account. ( Log Out / Change )

Facebook photo

You are commenting using your Facebook account. ( Log Out / Change )

Connecting to %s