missdica.com爬虫【美女图片爬虫】

 

最终爬取效果如上图所示。需要注意的是,该域名已被防火墙屏蔽,只能通过 IP 地址访问,代码中已经做了相应处理。

 

最终爬取效果如上图所示。需要注意的是,该域名已被防火墙屏蔽,只能通过 IP 地址访问,代码中已经做了相应处理。

# -*- coding:utf-8 -*-
import os

#
# http://115.68.13.42/studio_md


import requests
from bs4 import BeautifulSoup

# HTTP headers passed to the requests.get() calls in this file (both the HTML
# page fetches and the image downloads): an XMLHttpRequest marker, a desktop
# Chrome User-Agent string and a fixed Referer.
# NOTE(review): the Referer names a different site (a6d.cn) than the host that
# is crawled -- presumably left over from another crawler; confirm it is needed.
HEADERS = {
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Referer': 'http://www.a6d.cn/'
}

# Root folder for downloads; mark_dir() joins a per-gallery folder name onto it.
DIR_PATH = "H:/图片/missdica.com.gallery"


def get_url_source_code(url):
    """Fetch *url* and return the response body as text.

    The request is sent with the module-level HEADERS and a 10 second
    timeout. The raw body is decoded strictly as UTF-8, so a body that is
    not valid UTF-8 raises UnicodeDecodeError.
    """
    response = requests.get(url=url, headers=HEADERS, timeout=10)
    return response.content.decode('utf-8')


def save_pic(url, path):
    """Download one image into the folder *path*.

    Links whose file name contains the ``_S`` marker are rewritten to the same
    name without the marker before downloading, and the local file is named
    accordingly. Only the last URL segment (the file name) is rewritten, so a
    ``_S`` that happens to appear in the target folder or in a URL directory
    is left alone.

    Args:
        url: Absolute URL of the image.
        path: Existing directory the image is written into.

    Returns:
        ``False`` when the download is skipped (the file is already on disk,
        or the server reports the picture as missing). ``None`` otherwise,
        i.e. after a successful save or after a failure that was only logged.
        Exceptions are never propagated: this is a best-effort download.
    """
    try:
        print('[S] 下载图片路径:' + path + ' \r\n链接:' + url)
        head, sep, file_name = str(url).rpartition('/')
        durl = url
        if '_S' in file_name:
            file_name = file_name.replace('_S', '')
            durl = head + sep + file_name
        img_name = os.path.join(path, file_name)
        if os.path.isfile(img_name):
            print('[F] 文件已经存在,跳过保存')
            return False
        req = requests.get(durl, headers=HEADERS, timeout=10)
        img = req.content
        # Search the raw bytes: decoding a binary image to text just to look
        # for an ASCII phrase is slow and pointless.
        if req.status_code == 404 or b'found on this server' in img:
            print('[E] 文件不存在,跳过下载, 删除文件')
            try:
                # Best-effort cleanup; normally there is nothing to delete
                # because the existence check above already returned.
                os.remove(img_name)
            except OSError:
                pass
            return False
        # 'wb' rather than 'ab': never append to bytes left by someone else.
        with open(img_name, 'wb') as f:
            f.write(img)
        print('[S] 下载图片成功')
    except Exception as e:
        # Deliberate catch-all: one bad image must not stop the crawl.
        print('[S] 下载图片失败: ' + str(e))


def mark_dir(flot_name):
    """Ensure the folder ``DIR_PATH/flot_name`` exists and return its path.

    The folder (and any missing parents) is created when absent; an existing
    path is left untouched. The process working directory is never changed:
    the previous conditional ``os.chdir`` ran only on first creation and
    would make a relative DIR_PATH resolve inside the previous gallery.

    Args:
        flot_name: Folder name (relative to DIR_PATH) for one gallery.

    Returns:
        The joined path of the gallery folder.
    """
    print('[C] 创建目录: ' + flot_name)
    target = os.path.join(DIR_PATH, flot_name)
    if not os.path.exists(target):
        # exist_ok guards against the folder appearing between the check
        # above and the creation below.
        os.makedirs(target, exist_ok=True)
    return target


def get_all_title_and_links(page_url):
    """Collect the gallery links found on one listing page.

    Every ``<a class="title black">`` element of the page is turned into a
    dict with the gallery URL (the www.missdica.com host is rewritten to the
    IP address 115.68.13.42) and a cleaned-up title.

    Args:
        page_url: URL of the listing page to analyse.

    Returns:
        A list of ``{'url': ..., 'name': ...}`` dicts in page order. Anchors
        without an ``href`` attribute are skipped.
    """
    print('-' * 70)
    print('[A] 分析所有子页面信息......')
    # Characters dropped from a title: whitespace, quotes and punctuation.
    # The title is later used as a single folder name, so the path separator
    # '/' and the characters '<', '>' and '|' are removed as well.
    strip_table = str.maketrans('', '', ' \r\n\t\'"?.:\\~^*/<>|')

    html_doc = get_url_source_code(page_url)
    soup = BeautifulSoup(html_doc, "html.parser")
    galleries = []
    for anchor in soup.find_all('a', class_='title black'):
        href = anchor.get("href")
        if not href:
            # Without a target there is nothing to download.
            continue
        url = str(href).replace('http://www.missdica.com', 'http://115.68.13.42')
        name = str(anchor.get_text()).translate(strip_table)
        print('[*] 名字: ' + name + ' URL:' + url)
        galleries.append({
            'url': url,
            'name': name,
        })
    print('[A] 分类总数: ' + str(len(galleries)) + ' 全部解析完成')
    print('-' * 70)
    return galleries


def download_all_image_in_links(url, name):
    """Download every image embedded in one gallery page.

    The page is fetched, the ``<div class="xe_content">`` element is located
    and each ``<img>`` inside it is downloaded once into the folder returned
    by ``mark_dir(name)``.

    Args:
        url: URL of the gallery page.
        name: Folder name (below DIR_PATH) the images are stored in.

    Returns:
        None. A page without the content element is reported and skipped.
    """
    # Local import so the block brings its own dependency into scope.
    from urllib.parse import urljoin

    print('_' * 70)
    print('[A] 解析图片地址......')
    html_doc = get_url_source_code(url)
    soup = BeautifulSoup(html_doc, "html.parser")
    content = soup.find('div', class_='xe_content')
    if content is None:
        # Nothing to parse; previously this raised AttributeError.
        print('[E] 页面中没有找到图片内容,跳过: ' + url)
        print('_' * 70)
        return
    images = content.find_all('img')
    folder = mark_dir(name)
    seen = set()
    for img in images:
        src = img.get("src")
        if not src:
            continue
        # Resolve relative and protocol-relative sources against the site
        # root; absolute http:// and https:// sources pass through unchanged.
        link = urljoin('http://115.68.13.42/', str(src))
        if not link.startswith(('http://', 'https://')):
            # e.g. inline "data:" images -- nothing to download over HTTP.
            continue
        link = link.replace('http://www.missdica.com', 'http://115.68.13.42')\
            .replace('http://missdica.com', 'http://115.68.13.42')
        if link in seen:
            continue
        seen.add(link)
        save_pic(link, folder)
    print('_' * 70)
    print('[A] 图片下载完成,当前页面图片一共 ' + str(len(seen)) + ' 张')


if __name__ == '__main__':
    # Script entry point: print a banner, then walk listing pages 1..247 of
    # the gallery board and download every gallery found on each page.
    print('*' * 80)
    print('韩国美女 图片下载器')
    print('http://www.missdica.com')
    print('by: obaby')
    print('http://www.obaby.org.cn')
    print('http://www.h4ck.org.cn')
    print('*' * 80)

    # Listing URL of the board to crawl. An earlier revision of this script
    # used 'mid=studio_md' here instead of 'mid=gal_event'.
    list_url = 'http://115.68.13.42/index.php?mid=gal_event&page='

    for i in range(1, 248):
        print('[S] 开始下载第', i, '页')
        try:
            page_list = get_all_title_and_links(list_url + str(i))
        except Exception as e:
            # Top-level boundary: a failed listing page must not end the run.
            print('[E] 第', i, '页解析失败: ' + str(e))
            continue
        for p in page_list:
            try:
                download_all_image_in_links(p['url'], p['name'])
            except Exception as e:
                # Same reasoning per gallery: report it and move on.
                print('[E] 下载失败: ' + p['url'] + ' ' + str(e))
        print('[S] 第', i, '页下载完成')
        print('_' * 100)


分享文章:

猜你喜欢:

发表评论

您的电子邮箱地址不会被公开。 必填项已用*标注