Fork me on GitHub

Sunday, December 16, 2012

Python Script to Download XKCD webcomics (Code sample)

 I am a fan of the XKCD webcomic, so I decided to write a Python script that lets you download the entire XKCD archive. You are free to use my code to download the comics for yourself. You can also run this code periodically by specifying the number of the comic from which you want the download to begin, so that only the newer comics are fetched. Just save this code with a .py extension and run it from a command line or a Python interpreter. (Your Python installation must have httplib2 to run this; otherwise you will have to download it from here. Enjoy!)

CODE:





import httplib2

import os



# Download the xkcd archive, starting at a comic number typed by the user.
#
# For every comic an HTML page is written to ``xkcd_archive/<name>.html`` and
# the picture to ``xkcd_archive/images/<name><ext>``, where <name> is the
# comic's alt text made safe for use as a file name.
#
# The code runs unchanged on Python 2 and Python 3 and only relies on the
# ``httplib2`` and ``os`` modules imported at the top of the file.

COMIC_URL = "http://www.xkcd.com/"
ARCHIVE_DIR = "xkcd_archive"
IMAGE_DIR = os.path.join(ARCHIVE_DIR, "images")

HTML_HEAD = "<html><head></head><body><div id=\"comic\">"
HTML_TAIL = "</div></body></html>"

# A single failed request must not end the run.
# NOTE(review): xkcd.com is reported to answer comic number 404 with a real
# HTTP 404 (a site joke) -- confirm; either way, stopping only after several
# consecutive misses also tolerates an occasional server hiccup.
MAX_CONSECUTIVE_MISSES = 3


def native_text(data):
    """Return *data* as the interpreter's native ``str`` type.

    httplib2 hands back ``str`` on Python 2 and ``bytes`` on Python 3; only
    the latter needs decoding before the ``str`` methods below can be used.
    """
    if isinstance(data, str):
        return data
    return data.decode("utf-8", "replace")


def to_bytes(text):
    """Return *text* encoded as UTF-8 bytes (unchanged if already bytes)."""
    if isinstance(text, bytes):
        return text
    return text.encode("utf-8")


def tag_end(page, start):
    """Return the index of the ``>`` closing the tag that begins at *start*.

    A ``>`` inside a double-quoted attribute value is skipped.  Returns -1
    when the tag is never closed.
    """
    in_quotes = False
    for pos in range(start, len(page)):
        char = page[pos]
        if char == '"':
            in_quotes = not in_quotes
        elif char == ">" and not in_quotes:
            return pos
    return -1


def attribute_value(tag, name):
    """Return the double-quoted value of attribute *name* in *tag*, or None.

    The attribute name must be preceded by whitespace, so that looking up
    ``src`` does not match a longer name that merely ends in ``src``.
    """
    marker = name + '="'
    pos = tag.find(marker)
    while pos != -1:
        if pos > 0 and tag[pos - 1].isspace():
            value_start = pos + len(marker)
            value_end = tag.find('"', value_start)
            if value_end == -1:
                return None
            return tag[value_start:value_end]
        pos = tag.find(marker, pos + 1)
    return None


def parse_comic(page):
    """Extract ``(image_url, alt, title)`` from a comic page.

    Only the first ``<img>`` between ``<div id="comic"`` and the next
    ``</div>`` is considered.  Returns None when the page has no such image
    (for example an error page).
    """
    div_start = page.find("<div id=\"comic\"")
    if div_start == -1:
        return None
    div_end = page.find("</div>", div_start)
    if div_end == -1:
        div_end = len(page)
    img_start = page.find("<img", div_start, div_end)
    if img_start == -1:
        return None
    img_end = tag_end(page, img_start)
    if img_end == -1:
        return None
    tag = page[img_start:img_end + 1]

    image_url = attribute_value(tag, "src")
    if not image_url:
        return None
    # Turn protocol-relative and site-relative addresses into absolute ones.
    if image_url.startswith("//"):
        image_url = "http:" + image_url
    elif image_url.startswith("/"):
        image_url = COMIC_URL.rstrip("/") + image_url

    alt = attribute_value(tag, "alt") or ""
    title = attribute_value(tag, "title") or ""
    return image_url, alt, title


def safe_filename(title, number):
    """Return *title* with every character unsuitable for a file name replaced.

    Letters, digits, spaces and ``-_.()`` are kept; everything else (path
    separators in particular) becomes ``_``.  If nothing usable remains the
    name falls back to ``comic_<number>``.
    """
    cleaned = "".join(
        char if (char.isalnum() or char in " -_.()") else "_" for char in title
    )
    cleaned = cleaned.strip(" .")
    if not cleaned.strip("_ "):
        return "comic_" + str(number)
    return cleaned


def image_extension(image_url):
    """Return the file extension of *image_url* (``.png`` if it has none)."""
    path = image_url.split("?", 1)[0]
    return os.path.splitext(path)[1] or ".png"


def build_page(alt, title, image_name):
    """Return the HTML page that shows the locally stored image *image_name*."""
    image_tag = (
        "<img src=\"images/" + image_name + "\""
        + " title=\"" + title + "\""
        + " alt=\"" + alt + "\" />"
    )
    return HTML_HEAD + "<h2> " + alt + "</h2>" + image_tag + HTML_TAIL


def ask_start_number():
    """Prompt for the first comic to download and return it as an int >= 1.

    Raises SystemExit with a readable message when the answer is not a number.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    answer = read_line(
        "Enter the comic number from which you want the download to begin:"
    )
    try:
        number = int(answer)
    except ValueError:
        raise SystemExit("Not a valid comic number: " + repr(answer))
    return max(number, 1)


def save_comic(http, number, page):
    """Write the HTML page and the image of comic *number* into the archive.

    *http* is the ``httplib2.Http`` object used to fetch the image and *page*
    is the already downloaded comic page as text.
    """
    comic = parse_comic(page)
    if comic is None:
        print("Skipping comic " + str(number) + ": no image found on the page")
        return
    image_url, alt, title = comic
    name = safe_filename(alt, number)
    image_name = name + image_extension(image_url)
    print("Downloading comic " + str(number) + " --> " + alt)

    with open(os.path.join(ARCHIVE_DIR, name + ".html"), "wb") as page_file:
        page_file.write(to_bytes(build_page(alt, title, image_name)))

    response, image = http.request(image_url)
    if response.status != 200:
        # Never store an error page under an image file name.
        print(
            "Could not download " + image_url
            + " (HTTP " + str(response.status) + ")"
        )
        return
    with open(os.path.join(IMAGE_DIR, image_name), "wb") as image_file:
        image_file.write(image)


def main():
    """Ask for a starting number and download comics until the archive ends."""
    number = ask_start_number()
    # makedirs also creates ARCHIVE_DIR when it does not exist yet.
    if not os.path.isdir(IMAGE_DIR):
        os.makedirs(IMAGE_DIR)

    http = httplib2.Http(".cache")
    misses = 0
    while misses < MAX_CONSECUTIVE_MISSES:
        response, body = http.request(COMIC_URL + str(number))
        if response.status == 200:
            misses = 0
            save_comic(http, number, native_text(body))
        else:
            misses += 1
        number += 1


if __name__ == "__main__":
    main()