我試圖用ASIN號下載亞馬遜評論。然而,我只能下載第一頁,而不是所有的網頁。
「下一頁」的連結沒有作用。謝謝你。
我在使用以下代碼時遇到了困難:
from urllib.parse import urljoin

import scrapy
class AmazonReviewsSpider(scrapy.Spider):
    """Crawl Amazon product-review pages for a list of ASINs.

    For every ASIN the spider requests ``/product-reviews/<asin>/``, yields
    one item per review found on the page and follows the pagination
    "next page" link until no such link is present.

    A response that contains neither a next-page link nor any review is
    re-requested up to ``max_retries`` times.
    """

    name = "amazon_reviews"

    custom_settings = {
        'FEEDS': {'data/%(name)s_%(time)s.csv': {'format': 'csv'}},
    }

    # Maximum number of times a page without usable content is re-requested.
    max_retries = 3

    def start_requests(self):
        """Yield the first review-page request for every ASIN."""
        asin_list = ['B08GKK7NMH']
        for asin in asin_list:
            amazon_reviews_url = f'https://www.amazon.com/product-reviews/{asin}/'
            yield scrapy.Request(
                url=amazon_reviews_url,
                callback=self.parse_reviews,
                meta={'asin': asin, 'retry_count': 0},
            )

    def parse_reviews(self, response):
        """Yield the follow-up request (next page or retry) and the reviews.

        Reads ``asin`` and ``retry_count`` from ``response.meta``. Yields at
        most one ``scrapy.Request`` followed by one dict per review element.
        """
        asin = response.meta['asin']
        retry_count = response.meta['retry_count']

        # "::attr(href)" has to be the last part of the selector. Scrapy's CSS
        # dialect only knows the "::text" and "::attr()" pseudo-elements, so a
        # trailing "::after" prevents the href from ever being extracted.
        next_page_relative_url = response.css(
            ".a-pagination .a-last>a::attr(href)"
        ).get()
        review_elements = response.css("#cm_cr-review_list div.review")

        if next_page_relative_url is not None:
            next_page = urljoin('https://www.amazon.com/', next_page_relative_url)
            # A fresh page starts with a fresh retry budget.
            yield scrapy.Request(
                url=next_page,
                callback=self.parse_reviews,
                meta={'asin': asin, 'retry_count': 0},
            )
        elif not review_elements and retry_count < self.max_retries:
            # Retry only when the page carries no reviews at all. Retrying
            # every page that lacks a next link would re-fetch the genuine
            # last page and emit its reviews once per retry.
            # dont_filter=True is required because the URL was already seen.
            yield scrapy.Request(
                url=response.url,
                callback=self.parse_reviews,
                dont_filter=True,
                meta={'asin': asin, 'retry_count': retry_count + 1},
            )

        # Parse product reviews.
        for review_element in review_elements:
            yield {
                "asin": asin,
                "text": "".join(
                    review_element.css("span[data-hook=review-body] ::text").getall()
                ).strip(),
                "title": review_element.css("*[data-hook=review-title]>span::text").get(),
                "location_and_date": review_element.css(
                    "span[data-hook=review-date] ::text"
                ).get(),
                "verified": bool(
                    review_element.css("span[data-hook=avp-badge] ::text").get()
                ),
                # re_first() returns None when the pattern does not match,
                # instead of raising IndexError like ``.re(...)[0]``.
                "rating": review_element.css(
                    "[data-hook=review-star-rating] ::text"
                ).re_first(r"(\d+\.\d) out"),
            }
看起來你必須透過 `.attrib` 屬性來取得連結的 href。
# Look up the enabled "next page" anchor. Keep the SelectorList instead of
# indexing with [0]: indexing raises IndexError when nothing matches and
# never yields None.
next_page_anchor = response.css("li.a-last > a")
# SelectorList.attrib holds the attributes of the first matched element and
# is an empty dict when there is no match, so .get() returns None in that case.
next_page_relative_url = next_page_anchor.attrib.get('href')
if next_page_relative_url is None:
    # No next-page link on this response: do your error handling here.
    pass
參考
https://docs.scrapy.org/en/latest/topics/selectors.html
https://github.com/ShivNandan-28/Amazon-Selenium-Scraper 所有資訊都在那裡。這個做法已經無法再透過 Scrapy 運作。