NextSuccess.ai / google_spider.py
sainathBelagavi's picture
Update google_spider.py
8279286 verified
raw
history blame
1.29 kB
import json
from urllib.parse import quote_plus

import scrapy
from scrapy.crawler import CrawlerProcess
class GoogleSpider(scrapy.Spider):
    """Spider that requests one Google results page and dumps the hits to JSON.

    Each ``div.g`` element that yields a title, a snippet and a link is
    collected; the list is written to ``<search_query>_search_results.json``
    (a relative path, so it lands in the current working directory).
    """

    name = 'google'
    allowed_domains = ['google.com']

    def __init__(self, search_query, *args, **kwargs):
        """Remember the query and build the single start URL.

        Args:
            search_query: Raw search text; it is URL-encoded before being
                placed in the request URL.
            *args: Forwarded to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded to ``scrapy.Spider.__init__``.
        """
        super().__init__(*args, **kwargs)
        # parse() names its output file after the query, so the query must be
        # kept on the instance; it is consumed by this signature and therefore
        # never reaches the base class's kwargs handling.
        self.search_query = search_query
        # Encode the query so spaces, '&', '#', '+' etc. cannot corrupt the URL.
        encoded_query = quote_plus(str(search_query))
        self.start_urls = [f'https://www.google.com/search?q={encoded_query}']

    def parse(self, response):
        """Extract title/snippet/link triples and write them to a JSON file.

        Args:
            response: The response for the start URL.
        """
        search_results = []
        for result in response.css('div.g'):
            title = result.css('h3::text').get()
            snippet = result.css('.IsZvec::text').get()
            link = result.css('a::attr(href)').get()
            # Keep only entries where all three fields were found.
            if title and snippet and link:
                search_results.append({
                    'title': title,
                    'snippet': snippet,
                    'link': link,
                })

        # NOTE(review): the raw query is used in the filename; a query that
        # contains path separators will not produce a valid filename.
        filename = f"{self.search_query}_search_results.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(search_results, f)
def run_spider(search_query):
    """Crawl Google for *search_query* using ``GoogleSpider``.

    Creates a ``CrawlerProcess`` with a browser-like user agent and
    error-only logging, schedules the spider, and starts the process.

    Args:
        search_query: Search text handed to ``GoogleSpider``.
    """
    crawl_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'LOG_LEVEL': 'ERROR',
    }
    crawler_process = CrawlerProcess(crawl_settings)
    crawler_process.crawl(GoogleSpider, search_query=search_query)
    crawler_process.start()