import csv
import os
import re
import time
import bs4
import requests
url_pro = 'https://www.naluboutique.com/collections/all'
# CSS selectors for the collection grid and the product page.
product_link_select = '.grid-product__content .grid-product__link'
product_title_select = '.product-single__title'
product_price_select = '.product__price'
product_desc_select = '.product-single__description'
product_img_select = '.product__photos img'
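# The selectors above are specific to this store's Shopify theme; other themes
# use different class names. A minimal sanity check (illustrative, run manually)
# before a long crawl — check_selectors is an added helper, not part of the
# original script:
def check_selectors(url=url_pro):
    soup = bs4.BeautifulSoup(requests.get(url, timeout=30).text, 'html.parser')
    print(len(soup.select(product_link_select)), 'product links found')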
'''Start crawling the product collection'''
def crawl_collections(url):
    while True:
        try:
            res = requests.get(url, timeout=30)
            res.encoding = res.apparent_encoding
            print('Requested', url, 'status', res.status_code)
            res.raise_for_status()
            break
        except requests.RequestException:  # narrowed from a bare except
            timeout = 3
            print('Connection failed, retrying in', timeout, 'seconds')
            time.sleep(timeout)
            print('')
            print('Reconnecting')
    print('Request succeeded, fetching product links')
    noStarchSoup = bs4.BeautifulSoup(res.text, 'html.parser')
    links = noStarchSoup.select(product_link_select)  # renamed: no longer shadows url
    for link in links:
        product_url = domainWithProtocol + link.get('href')
        print('Fetching product url')
        crawl_product(product_url)
        print('\n')
'''End of product URL crawling'''
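# A minimal sketch of the retry pattern used above, factored into one reusable
# helper with a growing delay. This is an added illustration, not part of the
# original script; the functions above keep their own inline retry loops.
def get_with_retry(url, retries=5, backoff=3):
    """Fetch url, retrying on network errors with an increasing pause."""
    for attempt in range(retries):
        try:
            res = requests.get(url, timeout=30)
            res.raise_for_status()
            res.encoding = res.apparent_encoding
            return res
        except requests.RequestException as e:
            wait = backoff * (attempt + 1)
            print('Request failed (', e, '), retrying in', wait, 'seconds')
            time.sleep(wait)
    raise RuntimeError('Giving up on ' + url)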
'''Start crawling product content'''
def crawl_product(url):
    print('Requesting product page', url)
    while True:
        try:
            res = requests.get(url, timeout=30)
            res.encoding = res.apparent_encoding
            print('Product page fetched:', res.status_code)
            res.raise_for_status()
            break
        except requests.RequestException:
            print('Request for product page', url, 'failed, reconnecting')
            time.sleep(3)  # brief pause so failures don't hammer the server
    noStarchSoup = bs4.BeautifulSoup(res.text, 'html.parser')
    name = noStarchSoup.select(product_title_select)[0].getText()
    price = noStarchSoup.select(product_price_select)[0].getText()
    price = re.sub('[ \n]', '', price)  # strip spaces and newlines
    des = noStarchSoup.select(product_desc_select)[0].getText()
    image_urls = []
    for tag in noStarchSoup.select(product_img_select):
        imgurl = tag.get('src') or tag.get('data-src')  # fall back to lazy-load attr
        if imgurl is None or '{width}' in imgurl:
            continue  # skip missing sources and templated size placeholders
        image_urls.append('https:' + imgurl)
    images = '\r\n'.join(image_urls)
    fileHeader = ['Title', 'Product URL', 'Price', 'Description', 'Images']
    row = [name, url, price, des, images]
    while True:
        try:
            # newline='' prevents blank rows in csv output on Windows
            csvFile = open(csv_name, 'a+', encoding='utf-8', newline='')
            break
        except OSError as e:
            print(e)
            print(csv_name + ' could not be opened for writing, retrying...')
            time.sleep(5)
    writer = csv.writer(csvFile)
    if os.path.getsize(csv_name) == 0:
        writer.writerow(fileHeader)  # write the header once, on first use
    writer.writerow(row)
    csvFile.close()
    print('Scraped successfully!')
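# Hedged alternative: many Shopify stores also expose a public JSON feed at
# /products.json, which yields title, price, description, and image URLs without
# HTML parsing. Whether it is enabled, and its paging limits, vary per store, so
# treat the endpoint and field names below as assumptions to verify.
def crawl_products_json(base_url, limit=250):
    page = 1
    while True:
        res = requests.get(base_url + '/products.json',
                           params={'limit': limit, 'page': page}, timeout=30)
        res.raise_for_status()
        products = res.json().get('products', [])
        if not products:
            break  # an empty list signals the last page
        for p in products:
            title = p.get('title', '')
            price = p['variants'][0]['price'] if p.get('variants') else ''
            images = [img.get('src', '') for img in p.get('images', [])]
            print(title, price, len(images), 'images')
        page += 1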
if __name__ == '__main__':
    protocol = 'https://'
    domain = re.match('https://(.*)/collections', url_pro).group(1)
    domainWithProtocol = protocol + domain
    csv_name = domain + time.strftime('_%Y-%m-%d-%H-%M-%S', time.localtime()) + '.csv'
    next_page = ['1']  # non-empty sentinel so the loop runs at least once; renamed from next, which shadows a builtin
    n = 1
    while next_page != []:
        url = url_pro + '?sort_by=best-selling&page=' + str(n)
        crawl_collections(url)
        print('Finished scraping page', n)
        n = n + 1
        res = requests.get(url, timeout=30)
        res.raise_for_status()
        noStarchSoup = bs4.BeautifulSoup(res.text, 'html.parser')
        next_page = noStarchSoup.select('.next')  # pager's "next" element; empty on the last page
    print('All pages scraped!!')
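# Follow-up sketch (an added suggestion, not the original behavior): the loop
# above fetches each collection page twice, once inside crawl_collections and
# once to look for the '.next' pager. Having crawl_collections return its parsed
# soup would avoid the second request, e.g.:
#
#     soup = crawl_collections(url)        # would need 'return noStarchSoup'
#     next_page = soup.select('.next')
#
# A short time.sleep(1) between pages also keeps the crawl polite; rate limits
# and robots.txt rules are store-specific and worth checking first.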