第一步:在 spider 的 `__init__` 方法中初始化浏览器驱动(webdriver)
第二步:在 spider 的 `close` 方法中关闭浏览器驱动
import os
import subprocess
import time

import scrapy
from selenium import webdriver

from myscrapy.network_log_option import *


class Selensprider(scrapy.Spider):
    """Spider that attaches a Selenium driver to a locally launched Chrome.

    ``__init__`` starts Chrome with a remote-debugging port and connects a
    ``webdriver.Chrome`` instance to it; the driver is stored on
    ``self.chrome``. ``close`` quits the driver when the crawl ends.
    """

    name = 'selenspider'
    # NOTE(review): Scrapy reads ``allowed_domains`` (a list), not
    # ``allow_domains``; the attribute is kept exactly as written because the
    # domain below does not match the start URL -- confirm the intended value.
    allow_domains = 'fw.zjjy.xyz'
    # NOTE(review): this URL appears truncated in the original listing (the
    # string literal was never closed); replace it with the real start URL.
    start_urls = ['https://www.tulingxueyuan.cn/d/file/p/20230606/43iql3reac2']

    # Location of chrome.exe and the profile directory used for the session.
    CHROME_DIR = r'C:\Users\Lenovo\AppData\Local\Google\Chrome\Application'
    USER_DATA_DIR = r'C:\selenum\AutomationProfile'
    # Single source of truth for the port, so the launch flag and the
    # debugger address cannot drift apart.
    DEBUG_PORT = 9222

    def __init__(self, *args, **kwargs):
        """Launch Chrome with remote debugging and attach the driver.

        Positional and keyword arguments are forwarded unchanged to
        ``scrapy.Spider.__init__``.
        """
        super().__init__(*args, **kwargs)

        # Start Chrome by absolute path. A separate ``os.system('cd ...')``
        # runs in its own subshell and would not affect the next command, and
        # ``os.system`` waits for the command to finish; ``Popen`` returns
        # immediately so the driver can attach below.
        chrome_exe = os.path.join(self.CHROME_DIR, 'chrome.exe')
        self.chrome_process = subprocess.Popen([
            chrome_exe,
            '--remote-debugging-port={}'.format(self.DEBUG_PORT),
            '--user-data-dir={}'.format(self.USER_DATA_DIR),
        ])
        # Brief pause so the browser has started before the driver attaches.
        time.sleep(0.5)

        # Build the driver options and point them at the running browser.
        options = get_log_options()
        options.add_experimental_option(
            'debuggerAddress', 'localhost:{}'.format(self.DEBUG_PORT)
        )
        caps = get_caps()
        self.chrome = webdriver.Chrome(options=options, desired_capabilities=caps)

    def parse(self, response, **kwargs):
        """Placeholder callback; does nothing with the response."""
        pass

    def close(self, spider):
        """Quit the browser driver after the whole crawl has finished."""
        self.chrome.quit()
第三步:在中间件中拦截request和response,并操作浏览器
import time

import scrapy

# NOTE(review): module path taken from the spider's own import of the log
# helpers -- adjust if get_xhr_logs lives elsewhere in the project.
from myscrapy.network_log_option import get_xhr_logs


# Selenium downloader middleware
class SeleniumMiddleware(object):
    """Replace each downloaded response with the page rendered by Chrome."""

    # Seconds to block after loading a page so AJAX requests can complete.
    ajax_wait_seconds = 5

    def process_response(self, request, response, spider):
        """Load ``request.url`` in the spider's browser and return its HTML.

        Args:
            request: the request whose URL is opened in the browser.
            response: the response already downloaded; it is not used.
            spider: the running spider; its ``chrome`` attribute is the
                driver used to load the page.

        Returns:
            A ``scrapy.http.HtmlResponse`` built from the browser's page
            source. The XHR network log entries collected for the page are
            stored in ``request.meta['xhr_logs']``.
        """
        chrome = spider.chrome
        chrome.get(request.url)
        # Browser interactions (full screen, scrolling to trigger lazy
        # loading, ...) can be performed here.
        chrome.maximize_window()
        # Block while AJAX content loads.
        time.sleep(self.ajax_wait_seconds)
        # Keep the network log instead of discarding it, so callbacks can
        # read the AJAX entries from the request's meta dict.
        request.meta['xhr_logs'] = get_xhr_logs(chrome)
        # Page source after dynamic loading.
        html = chrome.page_source
        return scrapy.http.HtmlResponse(
            url=request.url,
            body=html.encode('utf-8'),
            encoding='utf-8',
            request=request,
        )
util工具
import json

from selenium import webdriver
from selenium.webdriver import DesiredCapabilities


def get_xhr_logs(chrome):
    """Collect the XHR ``Network.responseReceived`` events from a driver.

    Every log type reported by ``chrome.log_types`` is read with
    ``chrome.get_log``. Each entry's ``'message'`` field is parsed as JSON and
    its inner ``'message'`` object is kept when its method is
    ``Network.responseReceived`` and its ``params.type`` is ``"XHR"``
    (static JS/CSS and other resource types are dropped).

    Args:
        chrome: a driver object exposing ``log_types`` and ``get_log``.

    Returns:
        A list of the matching event dicts, in the order they were read.
    """
    xhr_events = []
    for log_type in chrome.log_types:
        for entry in chrome.get_log(log_type):
            try:
                event = json.loads(entry['message'])['message']
                is_xhr_response = (
                    event['method'] == 'Network.responseReceived'
                    and event['params']['type'] == 'XHR'
                )
            except (ValueError, KeyError, TypeError):
                # Entry is not JSON or lacks the expected fields: skip it.
                # Only these parsing errors are ignored, not every exception.
                continue
            if is_xhr_response:
                xhr_events.append(event)
    return xhr_events


def get_log_options():
    """Build the ``ChromeOptions`` used by the crawler.

    Returns:
        A ``webdriver.ChromeOptions`` with the command-line arguments below,
        the ``w3c`` experimental option set to ``False`` and network
        performance logging enabled through ``perfLoggingPrefs``.
    """
    option = webdriver.ChromeOptions()
    arguments = (
        '--no-sandbox',
        '--headless',
        '--disable-extensions',
        '--allow-running-insecure-content',
        '--ignore-certificate-errors',
        '--disable-single-click-autofill',
        # The original flag ended in "[8]", a stray footnote marker that is
        # not part of the switch name.
        '--disable-autofill-keyboard-accessory-view',
        '--disable-full-form-autofill-ios',
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:55.0) Gecko/20100101 Firefox/55.0',
    )
    for argument in arguments:
        option.add_argument(argument)
    option.add_experimental_option('w3c', False)
    option.add_experimental_option('perfLoggingPrefs', {
        'enableNetwork': True,
        'enablePage': False,
    })
    return option


def get_caps():
    """Build the desired capabilities with browser/performance logging.

    Returns:
        A new dict based on ``DesiredCapabilities.CHROME`` with
        ``loggingPrefs`` and ``perfLoggingPrefs`` added.
    """
    # Copy first: DesiredCapabilities.CHROME is a shared class-level dict and
    # editing it in place would leak these settings to every other user.
    caps = DesiredCapabilities.CHROME.copy()
    caps['loggingPrefs'] = {
        'browser': 'ALL',
        'performance': 'ALL',
    }
    caps['perfLoggingPrefs'] = {
        'enableNetwork': True,
        'enablePage': False,
        'enableTimeline': False,
    }
    return caps
第四步:在 settings.py 中启用此中间件。
# Enable the Selenium downloader middleware, registered with the value 543.
DOWNLOADER_MIDDLEWARES = {
    'asong.middlewares.SeleniumMiddleware': 543,
}