Scraping variables embedded in JavaScript with a Scrapy crawler

The key code is shown below; it has been used to successfully collect data.

How to run it:

scrapy crawl bch -o items.json
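The spider below imports BchItem from bch.items. For reference, a minimal sketch of that item class, assuming only the three fields the spider's parse() actually fills in (the original items.py is not shown in the post), might look like this:

# items.py -- hypothetical sketch, field names taken from the spider below
import scrapy

class BchItem(scrapy.Item):
    title = scrapy.Field()   # page title text
    status = scrapy.Field()  # in this spider, also taken from the <title> tag
    link = scrapy.Field()    # video URL extracted from the inline <script>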


import scrapy
import re
from bs4 import BeautifulSoup
import js2xml
from lxml import etree
import requests
from bch.items import BchItem


class BchSpider(scrapy.Spider):
    name = "bch"
    allowed_domains = ["www.xxxx.cn"]
    start_urls = [
        'xxx'
    ]
    # build the paginated list pages
    for i in range(2, 17):
        start_urls.append('xxx' + str(i) + '.html')

    def parse(self, response):
        # links = response.xpath('//a[re:test(@href,"\xxx")]/@href').extract()
        links = response.xpath('//a[@class="video_img"]/@href').extract()
        print('hello')
        print(links, response)
        items = []
        for url in links:
            item = BchItem()
            res = requests.get('xxx' + url)
            # print(res, 'endsss')
            res.encoding = 'utf-8'
            soup = BeautifulSoup(res.text, "lxml")
            # print('lxml', soup)
            item['title'] = (soup.select('title')[0].text).strip()
            item['status'] = (soup.select('title')[0].text).strip()
            # the 12th <script> in <body> holds the JS variable we want
            src = soup.select("body script")[11].string
            print('script', src)
            src_text = js2xml.parse(src, debug=False)      # JS source -> XML tree
            src_tree = js2xml.pretty_print(src_text)       # serialize for inspection
            print('src_tree', src_tree)
            selector = etree.HTML(src_tree)
            item['link'] = 'xxx' + selector.xpath("//property[@name = 'f']/string/text()")[0]
            items.append(item)
            yield item
        # return items
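The core trick is that js2xml turns the inline JavaScript into an XML tree, so the value of a JS variable or object property can be pulled out with a plain XPath query instead of a fragile regex. A standalone sketch of that step follows; the script text and the property name f are made up here to mirror the spider above:

# Standalone demonstration of the js2xml step (js_src is invented for illustration).
import js2xml
from lxml import etree

js_src = 'var player = {f: "/video/12345.mp4", title: "demo"};'

parsed = js2xml.parse(js_src)            # parse the JS source into an lxml XML tree
xml_text = js2xml.pretty_print(parsed)   # serialize it so you can inspect the structure
print(xml_text)

# Object properties become <property name="..."> elements and string literals
# become <string> nodes, which is what the spider's XPath relies on.
selector = etree.HTML(xml_text)
video_path = selector.xpath("//property[@name = 'f']/string/text()")[0]
print(video_path)  # -> /video/12345.mp4

Note that `parsed` is already an lxml element, so you could run the same XPath on it directly; the spider round-trips through pretty_print() and etree.HTML(), which works but is an extra step.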

Reference: https://www.cnblogs.com/sthu/p/8319072.html
