Spaces:
Sleeping
Sleeping
import json
from urllib.parse import quote_plus

import scrapy
from scrapy.crawler import CrawlerProcess
class GoogleSpider(scrapy.Spider):
    """Spider that scrapes Google search results for one query.

    Fetches the first results page for *search_query* and writes the
    extracted (title, snippet, link) records to a JSON file named
    ``<search_query>_search_results.json`` in the working directory.
    """

    name = 'google'
    allowed_domains = ['google.com']

    def __init__(self, search_query, *args, **kwargs):
        super(GoogleSpider, self).__init__(*args, **kwargs)
        # Bug fix: parse() reads self.search_query when building the output
        # filename, but the original __init__ never stored it, causing an
        # AttributeError after a successful crawl.
        self.search_query = search_query
        # quote_plus percent-encodes spaces/&/# etc. so multi-word or
        # special-character queries produce a valid URL.
        self.start_urls = [
            f'https://www.google.com/search?q={quote_plus(search_query)}'
        ]

    def parse(self, response):
        """Extract result records from the page and dump them to JSON.

        Only results where title, snippet, and link are all present are kept.
        NOTE(review): the '.IsZvec' / 'div.g' selectors track Google's
        markup, which changes frequently — verify they still match.
        """
        search_results = []
        for result in response.css('div.g'):
            title = result.css('h3::text').get()
            snippet = result.css('.IsZvec::text').get()
            link = result.css('a::attr(href)').get()
            if title and snippet and link:
                search_results.append({
                    'title': title,
                    'snippet': snippet,
                    'link': link,
                })
        filename = f"{self.search_query}_search_results.json"
        with open(filename, "w") as f:
            json.dump(search_results, f)
def run_spider(search_query):
    """Run GoogleSpider for *search_query* in a blocking CrawlerProcess.

    Blocks until the crawl finishes; results are written to disk by the
    spider itself.
    """
    # Desktop browser user agent plus quiet logging, kept in a named
    # settings dict for readability.
    settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'LOG_LEVEL': 'ERROR',
    }
    crawler = CrawlerProcess(settings)
    crawler.crawl(GoogleSpider, search_query=search_query)
    crawler.start()  # blocks until the crawl completes