Downloading pixiv images with a Python crawler

A crawler practice project: downloading pixiv images with a Python crawler (the parameters can be adjusted to crawl the corresponding pages as needed).

from multiprocessing import Pool, cpu_count
import requests
import re
import time
import os

class PixivImg(object):

    def __init__(self, url, path, referer):    
        object.__init__(self)
        self.url = url
        self.path = path
        self.referer = referer
        self.err_strList = ['/', '\\', '<', '>', '|', ':', '?', '*', '"']    # characters that are invalid in Windows file names
        self.re_strList = ['／', '＼', '〈', '〉', '｜', '：', '？', '﹡', '“']    # full-width look-alikes used as replacements
        isExists = os.path.exists(self.path)

        if not isExists:
            os.makedirs(path)
            print(self.path + ' directory created')
        else:
            print(self.path + ' already exists')

    def get_elem(self, params):
        Page = requests.get(self.url, params=params, timeout=2)
        elem = str(Page.json()['contents'])  # parse the JSON response, then convert it to a string
        return elem

    def get_data(self, elem):    # extract image info with regexes
        img = re.compile(r'\'url\': \'(.+?\.jpg)\'')
        title = re.compile(r'\'title\': \'(.*?)\'')
        id = re.compile(r'/([0-9]+)_p0')    # extract the illustration ID
        imgList = img.findall(elem)
        titleList = title.findall(elem)
        idList = id.findall(elem)
        return imgList, titleList, idList

    def img_download(self, img, title, id):
        img = img.replace(r'c/240x480/img-master', 'img-original')
        img = img.replace(r'_master1200', '')

        for errstr in title:    # replace any characters in the title that Windows file names cannot contain
            if errstr in self.err_strList:
                index = self.err_strList.index(errstr)
                title = title.replace(errstr, self.re_strList[index])

        referer = self.referer + id
        headers = {'referer': referer}
        data = requests.get(img, headers=headers)
        if data.status_code in (404, 403):    # originals come as either .jpg or .png; retry as .png if the .jpg is missing
            img = img.replace('.jpg', '.png')
            data = requests.get(img, headers=headers)
            ext = '.png'
        else:
            ext = '.jpg'
        with open(self.path + title + ext, 'wb') as f:
            f.write(data.content)
        print(title, 'downloaded')

if __name__ == "__main__":

    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    referer = 'https://www.pixiv.net/member_illust.php?mode=medium&illust_id='
    now = time.strftime('%Y-%m-%d', time.localtime())
    path = './img/' + now + '/'    # create an img folder in the current directory, with a subfolder named after today's date to store the images
    pixiv = PixivImg(url=url, referer=referer, path=path)

    for page in range(1, 3):    # crawl the first 100 images
        params = {'p': str(page), 'format': 'json', 'tt': '9ab895a5bb3a3ccceb03da532c30dc16'}
        Elem = pixiv.get_elem(params)
        imgList, titleList, idList = pixiv.get_data(Elem)
        p = Pool(cpu_count())
        for title, img, id in zip(titleList, imgList, idList):
            p.apply_async(pixiv.img_download, args=(img, title, id))    # download in parallel across processes
        p.close()
        p.join()
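
As a quick illustration of the two trickiest steps, here is a minimal standalone sketch of the regex extraction done in get_data and the thumbnail-to-original URL rewrite done in img_download, run against a made-up 'contents' entry. The sample URL, date path, and ID are assumptions for illustration only; the real JSON returned by pixiv may differ.

import re

# A made-up fragment shaped like the string that get_elem returns.
sample = str([{'title': 'example work', 'url': 'https://i.pximg.net/c/240x480/img-master/img/2019/01/01/00/00/00/12345678_p0_master1200.jpg'}])

img_re = re.compile(r'\'url\': \'(.+?\.jpg)\'')
id_re = re.compile(r'/([0-9]+)_p0')

thumb = img_re.findall(sample)[0]
illust_id = id_re.findall(sample)[0]

# Rewrite the 240x480 thumbnail URL into the original-size URL, as img_download does.
original = thumb.replace('c/240x480/img-master', 'img-original').replace('_master1200', '')

print(illust_id)    # 12345678
print(original)     # .../img-original/img/2019/01/01/00/00/00/12345678_p0.jpg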

1. Changing the value of the url's mode parameter (daily) lets you download images from other ranking boards such as the weekly (weekly) and monthly (monthly) rankings; see the sketch after these notes.

2. The range(start, end) in the for loop at the bottom of the script sets how many pages are crawled; start must be 1 and end can be at most 11.

3. Make sure pixiv is reachable before running, otherwise the program cannot work.

4. If the pixiv page layout changes, downloads may fail.
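
For note 1, a minimal sketch of switching the ranking board by changing mode. The 'tt' token is copied from the script above and may be session-specific; this snippet only checks that each board responds.

import requests

for mode in ('daily', 'weekly', 'monthly'):
    url = 'https://www.pixiv.net/ranking.php?mode=' + mode
    params = {'p': '1', 'format': 'json', 'tt': '9ab895a5bb3a3ccceb03da532c30dc16'}
    resp = requests.get(url, params=params, timeout=2)
    print(mode, resp.status_code)    # 200 means the board can be fetched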
