python爬虫-批量下载图片

刚好有个要爬取国内某大型网站图片库的需求(国内拥有海量图片库的网站屈指可数哦),索性就用 Python 练练手,毕竟也很久没写了,试试看。

思路看代码就好,某网站的地址我用 a_website 代替了,你懂的 🙂

python 环境:ActivePython 2.7.2.5

下载图片需要 wget(代码里调用的是 wget.exe,请确保它在 PATH 中),没有的请自行下载。

# coding:utf-8
import cookielib
import os
import re
import subprocess
import urllib
import urllib2

def a_website(keyword, count):
    """Download up to ``count`` images for ``keyword`` into ``./picure``.

    Result pages are requested one after another (the page offset grows by
    30 per request).  Every image URL found after an ``objURL":"`` marker in
    the response is handed to ``wget.exe``; only downloads for which wget
    exits with status 0 are counted.

    Side effect: the working directory of the process is changed to
    ``./picure`` (created if it does not exist yet).

    Args:
        keyword: search keyword, already URL-quoted by the caller.
        count: maximum number of images to download.

    Returns:
        The number of images that were downloaded successfully.  This can be
        smaller than ``count`` when the site runs out of results.

    Raises:
        OSError: if ``wget.exe`` cannot be started.
    """
    # The real address is redacted in this listing; both pieces are
    # placeholders.  NOTE(review): the page offset is appended directly after
    # url2, so the real query string presumably has to end with the offset
    # parameter -- confirm when filling in the actual address.
    url = "a_website-1"
    url2 = "a_website-2&word=" + keyword

    # Create the download directory and work inside it so that wget drops
    # the files there.
    dirname = "./picure"
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    os.chdir(dirname)

    # Opener with a cookie jar so cookies set by the site are sent back on
    # the following page requests.
    cookie_jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))

    # Compiled once: an http URL ending in a known image extension that
    # directly follows objURL":" in the response body.
    image_re = re.compile(
        r'(?<=objURL":")(http.*?\.(?:jpg|jpeg|JPG|gif|png|bmp))')

    pn = 0
    downloaded = 0
    while downloaded < count:
        page_url = url + url2 + str(pn)
        headers = {
            "GET": page_url,
            "Host": "image.a_website.com",
            "Referer": page_url,
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"
        }
        response = opener.open(urllib2.Request(page_url, headers=headers))
        try:
            html = response.read()
        finally:
            response.close()

        image_urls = image_re.findall(html)
        if not image_urls:
            # No more results: stop instead of requesting pages forever.
            break
        pn += 30

        for image_url in image_urls:
            # Argument list instead of a shell string: scraped URLs may
            # contain characters such as '&' that a shell would interpret.
            status = subprocess.call(
                ["wget.exe", "-q", "-t", "2", "-T", "5", image_url])
            if status == 0:
                downloaded += 1
                print("%d Success! url:%s" % (downloaded, image_url))
                if downloaded >= count:
                    break

    return downloaded

if __name__ == '__main__':
    # Ask for the search keyword and the number of pictures to download.
    keyword = raw_input("Please enter the picture keyword:")
    count = raw_input("Please enter the number you want to search:")
    try:
        wanted = int(count)
    except ValueError:
        # Empty or non-numeric input: fall through to the error branch
        # instead of aborting with a traceback.
        wanted = 0
    if keyword != '' and wanted > 0:
        # Quote the keyword so it can be embedded in the request URL.
        a_website(urllib.quote(keyword), wanted)
        print("The %s pictrues download is complete." % count)
    else:
        print("Error : picture keyword or search number can not be empty.")
    # Keep the console window open until the user confirms.
    raw_input("Press any key to exit...")

发表评论