Spaces:
Sleeping
Sleeping
| from selenium import webdriver | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.chrome.service import Service | |
| from webdriver_manager.chrome import ChromeDriverManager | |
def start_requests(self):
    """Prompt for a search query, scrape the Google results page with
    headless Chrome, and yield a scraping Request per result URL.

    Yields:
        Request: one per extracted result URL (capped at
        ``self.max_scrapes``), dispatched to ``self.parse``.
    """
    # Local import keeps the fix self-contained; stdlib only.
    from urllib.parse import quote_plus, unquote

    query = input("Enter your search query: ")
    # URL-encode the query: bare f-string interpolation produces an
    # invalid URL for multi-word or special-character queries.
    google_search_url = f"https://www.google.com/search?q={quote_plus(query)}"

    # Set up Selenium (headless Chrome).
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in headless mode
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(google_search_url)
        page_source = driver.page_source
    finally:
        # Always release the browser, even if navigation raises,
        # so headless Chrome processes are not leaked.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')
    urls = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Organic Google results wrap the target as /url?q=<encoded>&<tracking>
        if not href.startswith('/url?q='):
            continue
        # Strip the wrapper, drop tracking params, and percent-decode the
        # target (Google stores it percent-encoded inside the q= value).
        url = unquote(href.split('/url?q=')[1].split('&')[0])
        if not url.startswith('http'):
            continue
        urls.append(url)
        if len(urls) == self.max_scrapes:
            break

    if not urls:
        self.logger.error("No URLs extracted from Google search results.")
        return
    self.logger.info(f"Extracted URLs: {urls}")
    for url in urls:
        yield Request(url, callback=self.parse)