执行 `pip install txsocksx` 之后,我需要把 Scrapy 的 `ScrapyAgent` 替换为使用 `txsocksx.http.SOCKS5Agent` 的实现。
我只是从 `scrapy/core/downloader/handlers/http.py` 中复制了 `HTTP11DownloadHandler` 和 `ScrapyAgent` 的代码,对它们分别进行子类化,写出了下面这样的代码:
class TorProxyDownloadHandler(HTTP11DownloadHandler):
    """Download handler that hands every request to a ``ScrapyTorAgent``.

    NOTE(review): ``HTTP11DownloadHandler`` and ``ScrapyTorAgent`` must be in
    scope in the module that hosts this class; the imports are not shown in
    this snippet.
    """

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download.

        Args:
            request: The request to download; forwarded to the agent as-is.
            spider: Not used by this method.

        Returns:
            The result of ``ScrapyTorAgent.download_request(request)``.
        """
        # Build a new agent for this call, reusing the handler's own
        # context factory and connection pool.
        agent = ScrapyTorAgent(contextFactory=self._contextFactory, pool=self._pool)
        return agent.download_request(request)
class ScrapyTorAgent(ScrapyAgent):
    """``ScrapyAgent`` subclass that routes proxied requests via ``SOCKS5Agent``.

    NOTE(review): ``reactor``, ``_parse``, ``TCP4ClientEndpoint`` and
    ``SOCKS5Agent`` must be imported by the hosting module; the imports are
    not shown in this snippet.
    """

    def _get_agent(self, request, timeout):
        """Choose the agent used to download ``request``.

        Selection rules, in order:

        * no ``proxy`` in ``request.meta``: a plain ``self._Agent``;
        * proxy set, request URL scheme is ``https`` and the proxy parameters
          do not contain ``noconnect``: ``self._TunnelingAgent``;
        * any other proxied request: a ``SOCKS5Agent`` connected to the proxy
          host/port through a ``TCP4ClientEndpoint``.

        Args:
            request: Request whose ``meta`` may carry ``bindaddress`` and
                ``proxy`` entries and whose ``url``/``headers`` are read.
            timeout: Connection timeout passed to the created agent/endpoint.

        Returns:
            The agent instance selected by the rules above.
        """
        # Per-request metadata lives on the lowercase ``meta`` attribute;
        # ``request.Meta`` does not exist and would raise AttributeError.
        bindaddress = request.meta.get('bindaddress') or self._bindAddress
        proxy = request.meta.get('proxy')

        if not proxy:
            # Direct connection: no proxy configured for this request.
            return self._Agent(reactor, contextFactory=self._contextFactory,
                               connectTimeout=timeout, bindAddress=bindaddress,
                               pool=self._pool)

        _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
        scheme = _parse(request.url)[0]
        omitConnectTunnel = 'noconnect' in proxyParams

        if scheme == 'https' and not omitConnectTunnel:
            # NOTE(review): https traffic keeps using the tunneling agent
            # rather than SOCKS5 -- confirm this is intended for a Tor proxy.
            proxyConf = (proxyHost, proxyPort,
                         request.headers.get('Proxy-Authorization', None))
            return self._TunnelingAgent(reactor, proxyConf,
                                        contextFactory=self._contextFactory,
                                        connectTimeout=timeout,
                                        bindAddress=bindaddress,
                                        pool=self._pool)

        # Everything else goes through the SOCKS5 proxy endpoint.
        proxyEndpoint = TCP4ClientEndpoint(reactor, proxyHost, proxyPort,
                                           timeout=timeout,
                                           bindAddress=bindaddress)
        return SOCKS5Agent(reactor, proxyEndpoint=proxyEndpoint)
在 settings.py 中,需要添加类似下面的配置:
# Use the Tor-aware download handler for the 'http' scheme.
# NOTE(review): only 'http' is overridden here; confirm whether 'https'
# should point at the same handler as well.
DOWNLOAD_HANDLERS = {
    'http': 'crawler.http.TorProxyDownloadHandler',
}
现在,Scrapy 就可以通过 Tor 之类的 SOCKS 代理来发送请求了。