Using a UA and an IP Proxy with scrapy-splash


Core idea

To set the User-Agent, prefer doing it inside the Lua script with splash:set_user_agent("{ua}").

To set an IP proxy, use the proxy argument of SplashRequest (passed via args), as in the sketch below.
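
In miniature, the pattern looks like this. It is a condensed sketch of the full spider further down; the UA string is a placeholder and the helper name is just for illustration:

```python
from scrapy_splash import SplashRequest

UA = "Mozilla/5.0 ..."  # placeholder; the full example generates one with fake-useragent

# The UA is baked into the Lua script before the request is built.
script = """
function main(splash, args)
    splash:set_user_agent("{ua}")
    assert(splash:go(args.url))
    return splash:html()
end""".format(ua=UA)

def make_request(url, callback, proxy):
    # The 'proxy' key in args is forwarded to the Splash HTTP API.
    return SplashRequest(url, callback, endpoint='execute',
                         args={'lua_source': script, 'proxy': proxy})
```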

Code

```bash
pip install fake-useragent
```
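
The spider below also needs scrapy-splash itself and a running Splash service. A typical setup, assuming you run Splash via Docker, looks like:

```bash
pip install scrapy-splash
docker run -d -p 8050:8050 scrapinghub/splash
```
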
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest

from risk_control_info.utils import get_proxy_ip
from fake_useragent import UserAgent

ua = UserAgent()

# The UA is rendered into the Lua script once, at import time.
script = """
function main(splash, args)
    splash.images_enabled = false
    splash:set_user_agent("{ua}")
    assert(splash:go(args.url))
    assert(splash:wait(args.wait))
    return splash:html()
end""".format(ua=ua.chrome)


class AppQimaiHotSearchSpider(scrapy.Spider):
    name = 'app_qimai_hot_search'
    allowed_domains = ['qimai.cn']
    user_agent = ua.chrome
    custom_settings = {
        'SPLASH_URL': 'http://localhost:8050',  # assumes a local Splash instance
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
    }

    def start_requests(self):
        # httpbin echoes the request, which makes it easy to verify
        # that both the UA and the proxy took effect.
        url = "http://httpbin.org/get"
        yield SplashRequest(url=url,
                            callback=self.parse,
                            endpoint='execute',
                            args={
                                'lua_source': script,
                                # 'proxy' is forwarded to the Splash HTTP API
                                'proxy': "http://" + get_proxy_ip(url),
                                'wait': 3,
                            })

    def parse(self, response):
        print(response.body.decode())
```
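
get_proxy_ip is imported from the project's own risk_control_info.utils and isn't shown in the post. A minimal stand-in, assuming a local proxy-pool service with a /get endpoint (both the URL and the response shape here are assumptions), might look like:

```python
import requests

def get_proxy_ip(target_url=None):
    """Hypothetical stand-in for risk_control_info.utils.get_proxy_ip."""
    # Assumed endpoint of a local proxy-pool service.
    resp = requests.get("http://127.0.0.1:5010/get/", timeout=5)
    resp.raise_for_status()
    # Assumed response shape: {"proxy": "1.2.3.4:8080", ...}
    return resp.json()["proxy"]
```

Whatever the real implementation does, it should return a bare host:port string, since the spider prepends "http://" when building the proxy value.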

Result

Run the spider with scrapy crawl app_qimai_hot_search. Because the request targets http://httpbin.org/get, the printed response echoes the request: the User-Agent header should match the UA injected by the Lua script, and origin should show the proxy's IP instead of your own.
