当前位置: 首页 > 图灵资讯 > 技术篇> scrapy使用selenium3.0

scrapy使用selenium3.0

来源:图灵教育
时间:2023-06-06 09:35:34

第一步:在 spider 的 `__init__` 方法中初始化浏览器驱动(webdriver)

第二步:在 spider 的 `close` 方法中关闭浏览器驱动

import os
import subprocess
import time

import scrapy
from selenium import webdriver

from myscrapy.network_log_option import *


class Selensprider(scrapy.Spider):
    """Spider that drives a Chrome instance through Selenium.

    On construction it launches Chrome with a remote-debugging port, then
    attaches a Selenium ``webdriver.Chrome`` to that port and stores it on
    ``self.chrome`` so that other components (e.g. a downloader middleware)
    can reuse the same browser. The browser is quit in :meth:`close`.
    """

    name = 'selenspider'
    # Scrapy reads ``allowed_domains`` (a list of domains); the previous
    # ``allow_domains`` string attribute was never consulted.
    allowed_domains = ['fw.zjjy.xyz']
    # NOTE(review): this URL literal was truncated in the original (missing
    # closing quote and bracket); only the literal has been closed here.
    # Replace it with the real start URL before running.
    start_urls = ['https://www.tulingxueyuan.cn/d/file/p/20230606/43iql3reac2']

    # Location of chrome.exe and of the dedicated automation profile.
    CHROME_DIR = r'C:\Users\Lenovo\AppData\Local\Google\Chrome\Application'
    USER_DATA_DIR = r'C:\selenum\AutomationProfile'
    # Single source of truth for the DevTools port, used both when launching
    # Chrome and when attaching Selenium (they previously disagreed:
    # 9222 vs. the invalid 90222).
    DEBUG_PORT = 9222

    def __init__(self, *args, **kwargs):
        """Launch Chrome with remote debugging and attach Selenium to it.

        Args:
            *args: Forwarded unchanged to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__``
                (e.g. values given with ``scrapy crawl -a key=value``).
        """
        super().__init__(*args, **kwargs)

        # Start the browser by absolute path. The previous
        # ``os.system('cd ...')`` ran in its own shell, so it did not affect
        # the following ``os.system('chrome.exe ...')`` call. ``Popen`` also
        # returns immediately instead of waiting for the browser to exit.
        subprocess.Popen([
            os.path.join(self.CHROME_DIR, 'chrome.exe'),
            '--remote-debugging-port={}'.format(self.DEBUG_PORT),
            '--user-data-dir={}'.format(self.USER_DATA_DIR),
        ])
        # Give the browser a moment to open its debugging port.
        # NOTE(review): 0.5 s is the original delay; may be too short on a
        # slow machine.
        time.sleep(0.5)

        # Build the driver options and point Selenium at the running browser.
        options = get_log_options()
        options.add_experimental_option(
            'debuggerAddress', 'localhost:{}'.format(self.DEBUG_PORT)
        )
        caps = get_caps()
        self.chrome = webdriver.Chrome(options=options, desired_capabilities=caps)

    def parse(self, response, **kwargs):
        """Handle a downloaded response; intentionally a no-op here."""
        pass

    def close(self, spider):
        """Quit the browser once the whole crawl has finished."""
        self.chrome.quit()

第三步:在中间件中拦截request和response,并操作浏览器

# Selenium downloader middleware
class SeleniumMiddleware(object):
    """Downloader middleware that re-renders every response in the browser.

    It uses the Selenium driver stored on the spider as ``spider.chrome`` to
    load the requested URL and returns the rendered page source in place of
    the response it was given.
    """

    def process_response(self, request, response, spider):
        """Load ``request.url`` in the browser and return the rendered HTML.

        Args:
            request: The request whose URL is opened in the browser.
            response: The response passed in by Scrapy; it is not used,
                the browser-rendered page replaces it.
            spider: The running spider; must expose the Selenium driver as
                ``spider.chrome``.

        Returns:
            A ``scrapy.http.HtmlResponse`` built from the browser's page
            source, encoded as UTF-8 and bound to ``request``.
        """
        chrome = spider.chrome
        chrome.get(request.url)
        # Browser interaction goes here: full screen, scrolling down/up to
        # trigger lazy loading, and so on.
        chrome.maximize_window()
        # Block for 5 seconds to let AJAX requests finish.
        time.sleep(5)
        # Network records: the XHR entries captured by the browser. They were
        # previously computed and thrown away; keep them on the request so
        # callbacks can read them via ``response.meta['xhr_logs']``.
        request.meta['xhr_logs'] = get_xhr_logs(chrome)
        # Page source after dynamic loading.
        html = chrome.page_source
        return scrapy.http.HtmlResponse(
            url=request.url,
            body=html.encode('utf-8'),
            encoding='utf-8',
            request=request,
        )

util工具

import json

from selenium import webdriver
from selenium.webdriver import DesiredCapabilities


def get_xhr_logs(chrome):
    """Collect the ``Network.responseReceived`` log entries of type XHR.

    Every log type reported by ``chrome.log_types`` is read with
    ``chrome.get_log``. Each row's ``'message'`` is parsed as JSON; rows that
    are not JSON or do not have the expected structure are skipped.

    Args:
        chrome: A driver object exposing ``log_types`` and ``get_log``.

    Returns:
        A list of the decoded inner ``message`` dicts whose ``method`` is
        ``'Network.responseReceived'`` and whose ``params.type`` is ``'XHR'``.
    """
    log_xhr_array = []
    for typelog in chrome.log_types:
        for row in chrome.get_log(typelog):
            message_ = row['message']
            try:
                log = json.loads(message_)['message']
                if log['method'] != 'Network.responseReceived':
                    continue
                # Drop static js/css etc.; keep only XHR requests.
                if log['params']['type'] == "XHR":
                    log_xhr_array.append(log)
            except (ValueError, KeyError, TypeError):
                # Not a JSON payload, or not shaped like a network event:
                # ignore this row. Only these expected failures are
                # swallowed, instead of every exception as before.
                continue
    return log_xhr_array


def get_log_options():
    """Build the ``ChromeOptions`` used by the crawler.

    Returns:
        A ``webdriver.ChromeOptions`` with the command-line switches below,
        the ``w3c`` experimental option set to ``False`` and network
        performance logging enabled through ``perfLoggingPrefs``.
    """
    option = webdriver.ChromeOptions()
    switches = (
        '--no-sandbox',
        '--headless',
        "--disable-extensions",
        "--allow-running-insecure-content",
        "--ignore-certificate-errors",
        "--disable-single-click-autofill",
        # The original switch ended in a stray "[8]" footnote marker, which
        # is not part of any Chrome switch name.
        "--disable-autofill-keyboard-accessory-view",
        "--disable-full-form-autofill-ios",
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:55.0) Gecko/20100101 Firefox/55.0',
    )
    for switch in switches:
        option.add_argument(switch)
    option.add_experimental_option('w3c', False)
    option.add_experimental_option('perfLoggingPrefs', {
        'enableNetwork': True,
        'enablePage': False,
    })
    return option


def get_caps():
    """Build desired capabilities with browser and performance logging on.

    Returns:
        A new dict based on ``DesiredCapabilities.CHROME`` with
        ``loggingPrefs`` and ``perfLoggingPrefs`` added.
    """
    # Copy first: ``DesiredCapabilities.CHROME`` is a shared class-level
    # dict, and assigning keys on it directly would change it for every
    # other user in the process.
    caps = DesiredCapabilities.CHROME.copy()
    caps['loggingPrefs'] = {
        'browser': 'ALL',
        'performance': 'ALL',
    }
    caps['perfLoggingPrefs'] = {
        'enableNetwork': True,
        'enablePage': False,
        'enableTimeline': False,
    }
    return caps

第四步:在 `settings.py` 中启用此中间件。

# Enable the Selenium downloader middleware; 543 is its order value.
# NOTE(review): the dotted path uses the package name `asong`, while the
# spider snippet imports from `myscrapy` - confirm it matches your project.
DOWNLOADER_MIDDLEWARES = {
    'asong.middlewares.SeleniumMiddleware': 543,
}