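"""Crawl Baidu image search for each keyword listed in keywords.txt and
download the result images into ./image/<keyword>_baidu/."""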
import re
import os
import shutil
import urllib.parse  # 'import urllib' alone does not reliably expose urllib.parse
import requests
from tqdm import tqdm
download_path = './image/'
def get_onepage_urls(onepageurl):
    """Return all image URLs on one results page plus the URL of the next page."""
    if not onepageurl:
        print('Reached the last page, stopping.')
        return [], ''
    try:
        resp = requests.get(onepageurl, timeout=15)
        resp.encoding = 'utf-8'
        html = resp.text
    except Exception as e:
        print(e)
        return [], ''
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    # '下一页' is the literal "next page" link text on Baidu's results page;
    # the non-greedy (.*?) keeps the match from spilling past the href.
    fanye_urls = re.findall(r'<a href="(.*?)" class="n">下一页</a>', html)
    fanye_url = ('http://image.baidu.com' + fanye_urls[0]) if fanye_urls else ''
    return pic_urls, fanye_url
def down_pic(pic_urls, label):
    """Download every image in the given list of URLs."""
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            save_path = download_path + label + '_baidu/' + str(i + 1) + '.jpg'
            with open(save_path, 'wb') as f:
                f.write(pic.content)
            print('Downloaded image (%s/%s): %s' % (i + 1, len(pic_urls), pic_url))
        except Exception:
            print('Failed to download image %s: %s' % (i + 1, pic_url))
            continue
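# Example keywords.txt contents (hypothetical; one search keyword per line):
#   cat
#   dog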
if __name__ == '__main__':
    with open('keywords.txt', 'r', encoding='UTF-8') as f:
        lines = f.read().splitlines()
    print(lines)
    for line in tqdm(lines):
        # Recreate ./image/<keyword>_baidu/ from scratch for each keyword.
        file_path = os.path.join(download_path, line) + '_baidu'
        if os.path.exists(file_path):
            shutil.rmtree(file_path)
        os.makedirs(file_path, 0o755)
        url_init_first = 'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
        url_init = url_init_first + urllib.parse.quote(line, safe='/')
        all_pic_urls = []
        onepage_urls, fanye_url = get_onepage_urls(url_init)
        all_pic_urls.extend(onepage_urls)
        # Follow "next page" links until both the link and the page run out.
        fanye_count = 0
        while True:
            onepage_urls, fanye_url = get_onepage_urls(fanye_url)
            fanye_count += 1
            if fanye_url == '' and onepage_urls == []:
                break
            all_pic_urls.extend(onepage_urls)
        # Deduplicate the collected URLs before downloading.
        down_pic(list(set(all_pic_urls)), line)