The final crawl results are shown in the screenshot above. Note that the domain is blocked by the Great Firewall and the site can only be reached by IP; the code below already handles this.
# -*- coding:utf-8 -*-
import os
# # http://115.68.13.42/studio_md
import requests
from bs4 import BeautifulSoup

HEADERS = {
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Referer': 'http://www.a6d.cn/'
}

DIR_PATH = "H:/图片/missdica.com.gallery"


def get_url_source_code(url):
    # Fetch a page and decode it as UTF-8 text.
    souce = requests.get(url=url, headers=HEADERS, timeout=10)
    html = souce.content
    html_doc = str(html, 'utf-8')
    return html_doc


def save_pic(url, path):
    # Download one image into the given folder; '_S' thumbnail links are
    # rewritten to the full-size original before downloading.
    try:
        img_name = os.path.join(path, str(url).split('/')[-1])
        print('[S] Saving image to: ' + path + ' \r\nURL: ' + url)
        durl = url
        if '_S' in img_name:
            durl = str(url).replace('_S', '')
            img_name = img_name.replace('_S', '')
        if os.path.isfile(img_name):
            print('[F] File already exists, skipping')
            return False
        req = requests.get(durl, headers=HEADERS, timeout=10)
        img = req.content
        if 'found on this server' in str(req.text):
            print('[E] File not found on server, skipping download and removing local file')
            try:
                os.remove(img_name)
            except:
                pass
            return False
        with open(img_name, 'ab') as f:
            f.write(img)
            print('[S] Image downloaded successfully')
    except Exception as e:
        # print(e)
        print('[E] Image download failed: ' + str(e))


def mark_dir(flot_name):
    """
    Create the target folder if it does not exist yet, otherwise skip creation.
    """
    print('[C] Creating directory: ' + flot_name)
    PATH = os.path.join(DIR_PATH, flot_name)
    if not os.path.exists(PATH):  # check whether the folder already exists
        os.makedirs(PATH)
    os.chdir(PATH)
    return PATH


def get_all_title_and_links(page_url):
    # Parse a list page and collect the title and link of every gallery entry.
    print('-' * 70)
    print('[A] Parsing all sub-page entries......')
    # http://115.68.13.42/index.php?mid=studio_md&page=2
    html_doc = get_url_source_code(page_url)
    # print(html_doc)
    bs = BeautifulSoup(html_doc, "html.parser")
    # print(bs)
    # zzr = bs.find_all('li', _class="title")
    zzr = bs.find_all('a', class_='title black')
    ll = []
    # print(zzr)
    for z in zzr:
        url = str(z.get("href")).replace('http://www.missdica.com', 'http://115.68.13.42')
        # Strip whitespace and characters that are illegal in Windows file names.
        name = str(z.get_text()).replace(' ', '')\
            .replace('\r', '').replace('\n', '')\
            .replace('\t', '').replace('\'', '')\
            .replace('"', '').replace('?', '')\
            .replace('.', '').replace(':', '')\
            .replace('\\', '').replace('~', '').replace('^', '').replace('*', '')
        i = {
            'url': url,
            'name': name
        }
        print('[*] Name: ' + name + ' URL: ' + url)
        ll.append(i)
    print('[A] Total entries: ' + str(len(ll)) + ', parsing finished')
    print('-' * 70)
    return ll


def download_all_image_in_links(url, name):
    # Open a gallery page and download every image inside its content block.
    print('_' * 70)
    print('[A] Extracting image URLs......')
    html_doc = get_url_source_code(url)
    bs4 = BeautifulSoup(html_doc, "html.parser")
    # document_2501334_962455 xe_content
    bs = bs4.find('div', class_='xe_content')
    urls = []
    surls = bs.find_all('img')
    floder = mark_dir(name)
    for u in surls:
        org_link = str(u.get("src"))
        if not str(org_link).startswith('http://'):
            if str(org_link).startswith('/'):
                org_link = 'http://115.68.13.42' + org_link
            else:
                org_link = 'http://115.68.13.42/' + org_link
        # Rewrite the blocked domain to the reachable IP address.
        link = org_link.replace('http://www.missdica.com', 'http://115.68.13.42')\
            .replace('http://missdica.com', 'http://115.68.13.42')
        if link not in urls:
            # print(link)
            urls.append(link)
            save_pic(link, floder)
    print('_' * 70)
    print('[A] Page finished, ' + str(len(urls)) + ' images in total')


if __name__ == '__main__':
    print('*' * 80)
    print('Korean beauty image downloader')
    print('http://www.missdica.com')
    print('by: obaby')
    print('http://www.obaby.org.cn')
    print('http://www.h4ck.org.cn')
    print('*' * 80)
    # save_pic('http://115.68.13.42/data2/studio_md/2010/04/17/5798469754bc88d9ea4d3c.jpg', 'dsfdafasd')
    for i in range(1, 248):
        print('[S] Start downloading page', i)
        # page_list = get_all_title_and_links('http://115.68.13.42/index.php?mid=studio_md&page=' + str(i))
        page_list = get_all_title_and_links('http://115.68.13.42/index.php?mid=gal_event&page=' + str(i))
        for p in page_list:
            download_all_image_in_links(p['url'], p['name'])
        print('[S] Page', i, 'finished')
        print('_' * 100)
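Because the site is only reachable by IP, the script above simply rewrites every missdica.com URL to 115.68.13.42. If the server uses name-based virtual hosting, another way to handle a DNS-level block is to connect to the IP directly while sending the original domain in the Host header, so the URLs themselves never need rewriting. A minimal sketch of that idea (the fetch_via_ip helper is illustrative and not part of the original script; whether it works depends on the server's configuration):

import requests

MIRROR_IP = 'http://115.68.13.42'    # direct-IP endpoint used by the script above
BLOCKED_HOST = 'www.missdica.com'    # domain that is blocked at the DNS level

HOST_HEADERS = {
    'User-Agent': 'Mozilla/5.0',
    'Host': BLOCKED_HOST,            # present the real domain to the virtual host
}


def fetch_via_ip(path):
    # Connect to the IP directly; the Host header tells the web server which
    # virtual host the request is actually meant for.
    return requests.get(MIRROR_IP + path, headers=HOST_HEADERS, timeout=10)


# Example: fetch the first page of the gal_event board crawled above.
resp = fetch_via_ip('/index.php?mid=gal_event&page=1')
print(resp.status_code, len(resp.content))

The URL-rewriting approach in the script is what is actually used here; the Host-header variant is just an alternative when rewriting every link is inconvenient.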
Original article. When reposting, please credit: reposted from obaby@mars
Article title: "missdica.com Crawler [Girl Photo Crawler]"