刚好有个需求:要爬取国内某大型网站的图片库(国内拥有海量图片库的网站屈指可数哦)。索性就用 Python 练练手——也很久没写了,试试。
思路看代码就好。该网站的地址我统一用 a_website 替换掉了,你懂的 🙂
python 环境:ActivePython 2.7.2.5
下载图片需要 wget(要能从命令行直接调用),没有的请自行下载安装。
# coding:utf-8
"""Download pictures matching a search keyword from an image-search site.

The site's real address is redacted: every ``a_website`` string below is a
placeholder that has to be replaced before the script can fetch anything.
Pictures are saved by the external ``wget`` program, which must be callable
from the command line.
"""
import os
import re
import subprocess
import urllib

try:  # Python 2, the environment this script was written for
    import cookielib
    import urllib2
    url_quote = urllib.quote
except ImportError:  # Python 3 fallback: same APIs under their new names
    import http.cookiejar as cookielib
    import urllib.parse
    import urllib.request as urllib2
    url_quote = urllib.parse.quote

try:
    read_line = raw_input
except NameError:  # Python 3 renamed raw_input to input
    read_line = input

# Redacted placeholders for the search URL; "{keyword}" marks where the
# URL-quoted search term is substituted.
# NOTE(review): the paging offset is appended directly to the end of the
# query, so the real query string presumably ends with the paging
# parameter -- confirm when filling in the real address.
BASE_URL = "a_website-1"
QUERY_TEMPLATE = "a_website-2&word={keyword}"
HOST = "image.a_website.com"
USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36")

# Directory the pictures are written to (created on demand).
DOWNLOAD_DIR = "./picure"

# The paging offset advances by this much for every request.
PAGE_STEP = 30

# Picture URLs follow `objURL":"` in the page source.  `[^"]` keeps a match
# inside one quoted value instead of letting it run on into later fields.
PICTURE_URL_RE = re.compile(
    r'(?<=objURL":")(http[^"]*?\.(?:jpg|jpeg|JPG|gif|png|bmp))')


def _fetch_page(opener, page_url):
    """Return the body of ``page_url`` as text, fetched through ``opener``."""
    request = urllib2.Request(page_url)
    request.add_header("Host", HOST)
    # Each request names its own URL as the referer.
    request.add_header("Referer", page_url)
    request.add_header("User-Agent", USER_AGENT)
    response = opener.open(request)
    try:
        html = response.read()
    finally:
        response.close()
    if not isinstance(html, str):
        # Python 3 hands back bytes.  The page charset is not known here, so
        # undecodable bytes are replaced instead of raising.
        # NOTE(review): confirm the site's real encoding.
        html = html.decode("utf-8", "replace")
    return html


def _download(picture_url):
    """Fetch one picture into DOWNLOAD_DIR with wget; return True on success."""
    # An argument list (no shell) means a URL scraped from the page can never
    # be interpreted as shell syntax.  wget runs quietly (-q) with 2 tries
    # (-t) and a 5 second timeout (-T); cwd= leaves this process's own
    # working directory untouched.
    command = ["wget", "-q", "-t", "2", "-T", "5", picture_url]
    return subprocess.call(command, cwd=DOWNLOAD_DIR) == 0


def a_website(keyword, count):
    """Download up to ``count`` pictures found by searching for ``keyword``.

    Result pages are requested one after another until ``count`` pictures
    have been saved or a page yields no picture URLs at all.  A picture only
    counts once wget reports success; failed downloads are skipped.

    Args:
        keyword: Search term, already URL-quoted by the caller.
        count: Number of pictures to download.

    Returns:
        The number of pictures actually downloaded.  This is lower than
        ``count`` when the results run out first, and 0 when ``count`` is
        not positive (nothing is fetched or created in that case).

    Raises:
        OSError: If wget cannot be started.
        Any error raised by the HTTP opener while fetching a result page
        propagates unchanged.
    """
    if count <= 0:
        return 0

    if not os.path.isdir(DOWNLOAD_DIR):
        os.makedirs(DOWNLOAD_DIR)

    # A cookie-aware opener, shared by every page request of this search.
    cookie_jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    query = QUERY_TEMPLATE.replace("{keyword}", keyword)

    downloaded = 0
    offset = 0
    while downloaded < count:
        page_url = BASE_URL + query + str(offset)
        picture_urls = PICTURE_URL_RE.findall(_fetch_page(opener, page_url))
        if not picture_urls:
            # No more results: stop instead of requesting pages forever.
            break
        for picture_url in picture_urls:
            if _download(picture_url):
                downloaded += 1
                print("%d Success! url:%s" % (downloaded, picture_url))
                if downloaded == count:
                    break
        offset += PAGE_STEP
    return downloaded


def main():
    """Ask for a keyword and a picture count, then run the download."""
    keyword = read_line("Please enter the picture keyword:")
    count_text = read_line("Please enter the number you want to search:")
    try:
        count = int(count_text)
    except ValueError:
        # Empty or non-numeric input is reported below rather than crashing.
        count = 0

    if keyword != '' and count > 0:
        downloaded = a_website(url_quote(keyword), count)
        # Report what was really saved, which may be less than requested.
        print("The %d pictrues download is complete." % downloaded)
    else:
        print("Error : picture keyword or search number can not be empty.")
    read_line("Press any key to exit...")


if __name__ == '__main__':
    main()