【爬虫】python使用selenium抓取淘宝中的商品数据

2024-06-19 00:08| 来源: 网络整理| 查看: 265

前言

最近因为项目需要又得抓一批数据，和之前的scrapy不同，这次选择使用selenium来爬取。两种方法的区别如下：

scrapy之类的库是基于网络请求来爬取的，也就是直接向目标服务器发送HTTP请求，在这个过程中需要自己构造请求字段（例如json格式的request body）。selenium一类的库是基于自动化测试的，我们只需要知道想要访问的链接就好，其它的（异步加载图片、信息之类的）交给浏览器来做。也因此在使用时需要额外下载浏览器以及对应的驱动，比如Chrome浏览器对应的ChromeDriver。

总而言之，虽然selenium在速度上远远不如scrapy一类直接发请求的库，但由于自动化测试相当于模拟真人去访问网站，它可以规避大部分常见的反爬策略，还可以进行网页自动截图之类的操作，可以说是非常简单方便。

安装

下载selenium库：pip install selenium
下载浏览器驱动：一般来说就是ChromeDriver，先看自己电脑上的Chrome版本，然后在给出的链接中下载对应版本的ChromeDriver。
将ChromeDriver加到环境变量里，方法就不多说了，网上随便找找就有，可以参考[windows环境变量配置/linux、mac环境变量配置]

对网页进行滚动截屏

首先是一个比较简单的应用，就是对整个网页进行截屏，并且在网页高度超出窗口高度时可以滚动截屏并自动拼接。主要思想是获取网页高度page_height与窗口高度window_height，然后循环滚动到网页的不同位置来截图，并把截图都拼接到一起。话不多说直接上代码：

from io import BytesIO
import traceback
from time import sleep

from PIL import Image
import numpy as np
from selenium import webdriver


def url_screenshot(ad_url, index):
    """Save a stitched, full-page screenshot of ``ad_url`` as ``<index>.png``.

    The page is opened in headless Chrome. One screenshot is taken at the
    top; if the page is taller than the window, the page is scrolled one
    window height at a time, a screenshot is taken after each scroll, and
    all shots are concatenated vertically.

    Args:
        ad_url: Address of the page to capture.
        index: Value used to build the output file name ``"{index}.png"``.

    Returns:
        None. Any exception is caught and its traceback printed, so one bad
        URL does not stop a batch run.
    """
    try:
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
        chrome_options.add_argument('--headless')  # run without a visible window
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(ad_url)
            driver.fullscreen_window()
            window_height = driver.get_window_size()['height']
            page_height = driver.execute_script('return document.documentElement.scrollHeight')
            print('window_height:{},page_height:{}'.format(window_height, page_height))

            img_binary = driver.get_screenshot_as_png()
            base_img = np.array(Image.open(BytesIO(img_binary)))

            if page_height > window_height:
                n = page_height // window_height  # number of scroll steps
                for i in range(n):
                    driver.execute_script(f'document.documentElement.scrollTop={window_height*(i+1)};')
                    sleep(5)  # give lazily loaded content time to render
                    img = np.array(Image.open(BytesIO(driver.get_screenshot_as_png())))
                    if i == n - 1:
                        # The last requested offset overshoots the largest
                        # useful offset (page_height - window_height) by
                        # `out`, so the top of this shot repeats rows that
                        # were already captured and is cropped away.
                        # NOTE(review): the factor 2 presumably reflects a
                        # screenshot with 2 image pixels per CSS pixel;
                        # confirm on the target display.
                        out = window_height * (i + 1) - (page_height - window_height)
                        img = img[2*out:]
                    base_img = np.append(base_img, img, axis=0)  # stack below
        finally:
            # Always shut the browser down, even when loading or capturing
            # fails, so Chrome processes do not pile up.
            driver.quit()
        Image.fromarray(base_img).save("{}.png".format(index))
    except Exception:
        traceback.print_exc()


if __name__ == '__main__':
    # 'url_list' holds the pages to capture, one URL per line.
    with open('url_list', 'r') as fp:
        lines = fp.readlines()
    for index, line in enumerate(lines, start=1):
        url = line.strip()
        if not url:
            # Blank lines keep their index but do not start a browser.
            continue
        url_screenshot(url, index)
        print('get url:{}'.format(line))

获取淘宝商品数据

接下来一个稍微复杂一点的例子，通过关键词搜索来爬取淘宝商品的图片。流程是：

run、getPage：通过关键词构造商品查询链接，并访问。
getItem：针对查询结果，获取每个商品详情页的链接并访问，同时翻页。
getItemDetail：获取商品详细信息，这里只获取了商品中包含的图片信息，想要其他信息可以在此构造别的xpath解析路径来获取。

整个过程没什么难点，主要就是分析获取的html界面，然后解析里面的元素结构，编写xpath来获取里面的某个元素。可以参考之前的关于scrapy的博客：https://blog.csdn.net/qq_34392457/article/details/117029090，这里对于如何分析一个网页的html写得更详细。

import cv2
import numpy as np
from lxml import etree
from selenium import webdriver
import time
import requests
import os
import shutil


class Crawler_taobao_images:
    """Download product thumbnails from Taobao search results.

    A search URL is built from a keyword and opened in headless Chrome.
    Every product link found on a result page is opened in its own
    short-lived browser, the thumbnail images on that detail page are
    downloaded into ``save_img_path``, and the crawler then follows the
    "next page" link until none is found.
    """

    def __init__(self):
        """Start the search browser and reset the output directory.

        Any existing ``result/`` directory is deleted and recreated.
        """
        self.query_url = 'https://s.taobao.com/search?q={}&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20210913&ie=utf8'
        self.driver = self._new_driver()
        self.sleep_time = 3  # seconds to wait after each page load
        self.save_img_path = 'result/'
        if os.path.exists(self.save_img_path):
            shutil.rmtree(self.save_img_path)
        os.makedirs(self.save_img_path)
        self.image_index = 0  # running counter used as the image file name

    @staticmethod
    def _new_driver():
        """Return a headless Chrome driver with the automation switch excluded."""
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--disable-infobars")
        chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
        chrome_options.add_argument('--headless')
        return webdriver.Chrome(options=chrome_options)

    @staticmethod
    def _absolute_url(link):
        """Return ``link`` with an explicit scheme.

        Protocol-relative links (``//host/path``) get ``https:``; links that
        already carry ``http://`` or ``https://`` are returned unchanged;
        anything else is prefixed with ``https://``.
        """
        if link.startswith('//'):
            return 'https:' + link
        if link.startswith(('http://', 'https://')):
            return link
        return 'https://' + link

    def close(self):
        """Shut down the search browser started in ``__init__``."""
        self.driver.quit()

    def run(self):
        """Crawl the result pages for every configured keyword."""
        querys = ['酒']
        for query in querys:
            site_url = self.query_url.format(query)
            self.getPage(site_url)

    def getPage(self, site_url):
        """Open one search-result page and process its items.

        Args:
            site_url: Full URL of the result page to load.
        """
        self.driver.get(site_url)
        time.sleep(self.sleep_time)
        print(self.driver.title)
        self.getItem(site_url)

    def getItem(self, site_url):
        """Visit every product on the current result page, then follow paging.

        Args:
            site_url: URL of the page currently loaded in ``self.driver``;
                used as the base when building the next page's URL.
        """
        html = self.driver.page_source.encode('utf-8')
        print('start parse the html')
        selector = etree.HTML(html)
        itemList = selector.xpath("//*[@id='mainsrp-itemlist']/div/div/div[1]/*")
        for item in itemList:
            hrefs = item.xpath("./div[contains(@class, 'pic-box')]/div[contains(@class, 'pic-box-inner')]/div[@class='pic']/a/@href")
            if not hrefs:
                # A tile without a product link must not abort the crawl.
                continue
            link = self._absolute_url(hrefs[0])
            print("into: ", link)
            # One failing detail page should not stop the remaining items.
            try:
                self.getItemDetail(link)
            except Exception:
                print("get link {} error!!!".format(link))

        next_page_value = selector.xpath("//*[@id='mainsrp-pager']/div/div/div/ul/li[contains(@class,'next')]/a/@data-value")
        print("next page: ", next_page_value)
        if not next_page_value:
            print('没有下一页了')
        else:
            # site_url may already end with an '&s=<offset>' added for the
            # current page; drop it so offsets do not accumulate.
            base_url = site_url.split('&s=', 1)[0]
            site_url_p = base_url + '&s={}'.format(next_page_value[0])
            print('加载下一页内容:', site_url_p)
            self.getPage(site_url_p)

    def getItemDetail(self, link):
        """Download the thumbnail images shown on one product detail page.

        Args:
            link: Absolute URL of the product detail page.
        """
        newDriver = self._new_driver()
        try:
            newDriver.get(link)
            time.sleep(self.sleep_time)
            print(newDriver.title)
            html = newDriver.page_source.encode('utf-8')
        finally:
            # The page source is all that is needed; release the browser
            # even if loading failed.
            newDriver.quit()

        selector = etree.HTML(html)
        image_srcs = selector.xpath("//div[@class='tb-thumb-content']/ul/li/a/img/@src")
        for image_src in image_srcs:
            # Ask for the larger rendition instead of the 60x60 thumbnail.
            image_src = image_src.replace('60x60', '430x430')
            imglink = self._absolute_url(image_src)
            print('imglink:{}'.format(imglink))
            image_path = os.path.join(self.save_img_path, '{}.jpg'.format(self.image_index))
            try:
                self.saveImg(imglink, image_path)
            except (requests.RequestException, OSError) as err:
                print('save image {} error: {}'.format(imglink, err))
                continue
            self.image_index += 1

    def saveImg(self, imglink, image_path):
        """Fetch ``imglink`` and write the response body to ``image_path``.

        Raises:
            requests.RequestException: If the download fails or the server
                answers with an error status.
            OSError: If the file cannot be written.
        """
        response = requests.get(imglink, timeout=30)
        response.raise_for_status()
        with open(image_path, 'wb') as f:
            f.write(response.content)


if __name__ == '__main__':
    # To search for something else, edit the keyword list in run(); consider
    # changing the output directory as well.
    tb = Crawler_taobao_images()
    try:
        tb.run()
    finally:
        tb.close()

【本文地址】

公司简介

联系我们