"""Scrape topic-related articles from Scroll.in search pages.

Walks the paginated search results with a headless Chrome driver,
collects article links/dates, fetches each article's title and body in
parallel worker threads, and writes results to timestamped CSV files.
Progress is autosaved periodically and on SIGINT/SIGTERM.
"""

import argparse
import csv
import logging
import os
import signal
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    NoSuchElementException,
    TimeoutException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from utils.webdriver_utils import create_chrome_driver, scroll_to_element


class ScrollArticleScraper:
    """Scrapes Scroll.in search results for one topic and saves them to CSV."""

    def __init__(self, max_workers=4, articles_per_page=10):
        """
        Args:
            max_workers: number of parallel threads fetching article pages.
            articles_per_page: expected results per search page, used only
                for progress estimates.
        """
        self.max_workers = max_workers
        self.articles_per_page = articles_per_page
        self.base_url = "https://scroll.in/search"
        self.fetched_articles = set()   # links already queued (de-duplication)
        self.articles = []              # successfully scraped article dicts
        self.is_interrupted = False
        self.last_save_time = time.time()
        self.save_interval = 300        # autosave every 5 minutes

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        # Save partial progress instead of losing it on Ctrl+C / SIGTERM.
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, signum, frame):
        """Handle interrupt signals: save whatever was scraped, then exit."""
        print("\nReceived interrupt signal. Saving progress and shutting down...")
        self.is_interrupted = True
        if self.articles:
            self.save_progress("interrupted", force=True)
        sys.exit(0)

    def create_driver(self):
        """Create and return a new headless Chrome driver instance."""
        return create_chrome_driver(headless=True, load_images=False)

    def get_total_pages(self, driver, search_term):
        """Return the number of result pages for *search_term*.

        Falls back to 144 pages when the pagination markup is absent or
        any error occurs while loading/parsing the first results page.
        """
        try:
            driver.get(f"{self.base_url}?q={search_term}&page=1")
            time.sleep(2)  # allow the page to render

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # Scroll.in might have a different pagination structure.
            pagination = soup.find('div', class_='page-numbers')
            if pagination:
                pages = pagination.find_all('a', class_='page-number')
                if pages:
                    # The highest page number is the last page.
                    return max(int(page.text.strip()) for page in pages)

            # Fallback when pagination is not found.
            return 144
        except Exception as e:
            self.logger.error(f"Error getting total pages: {str(e)}")
            return 144  # default to 144 pages

    def extract_visible_articles(self, driver):
        """Parse article links and dates from the current page.

        Returns a list of ``{'link': str, 'date': str | None}`` dicts for
        articles not seen before; updates ``self.fetched_articles``.
        """
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        article_containers = soup.find_all(
            'li', itemscope=True, itemtype="https://schema.org/NewsArticle")

        new_articles = []
        for container in article_containers:
            link_element = container.find('a', href=True)
            date_element = container.find('time')

            if link_element and link_element['href']:
                href = link_element['href']
                full_link = href if href.startswith('http') else 'https://scroll.in' + href

                date = None
                if date_element:
                    date_text = date_element.get_text(strip=True)
                    # Strip mojibake / separator characters before parsing.
                    date_text = date_text.replace('Â', '').replace('·', '').strip()
                    try:
                        parsed_date = datetime.strptime(date_text, '%b %d, %Y')
                        date = parsed_date.strftime('%Y-%m-%d')
                    except ValueError:
                        try:
                            # Alternative date format (full month name).
                            parsed_date = datetime.strptime(date_text, '%B %d, %Y')
                            date = parsed_date.strftime('%Y-%m-%d')
                        except ValueError:
                            date = date_text  # keep raw text when unparseable

                if full_link not in self.fetched_articles:
                    self.fetched_articles.add(full_link)
                    new_articles.append({'link': full_link, 'date': date})

        return new_articles

    def scrape_topic(self, search_term, topic):
        """Scrape articles from all result pages for *search_term*.

        Returns the final CSV filename, or None if nothing was saved.
        """
        driver = self.create_driver()
        try:
            total_pages = self.get_total_pages(driver, search_term)
            total_expected_articles = total_pages * self.articles_per_page
            self.logger.info(
                f"Found {total_pages} pages to scrape "
                f"(approximately {total_expected_articles} articles)")

            for page in range(1, total_pages + 1):
                if self.is_interrupted:
                    break

                page_url = f"{self.base_url}?q={search_term}&page={page}"
                articles_scraped = len(self.articles)
                progress_percentage = (articles_scraped / total_expected_articles) * 100
                self.logger.info(
                    f"Scraping page {page}/{total_pages} - Progress: "
                    f"{articles_scraped}/{total_expected_articles} articles "
                    f"({progress_percentage:.1f}%)")

                try:
                    driver.get(page_url)
                    time.sleep(2)  # allow page to load

                    new_articles = self.extract_visible_articles(driver)
                    if new_articles:
                        self.process_articles_batch(new_articles)
                        self.logger.info(
                            f"Scraped {len(new_articles)}/{self.articles_per_page} "
                            f"articles from page {page}")
                        self.save_progress(topic)
                    else:
                        self.logger.warning(f"No articles found on page {page}")
                except Exception as e:
                    # A bad page shouldn't abort the whole run.
                    self.logger.error(f"Error scraping page {page}: {str(e)}")
                    continue

            if self.articles:
                return self.save_to_csv(self.articles, topic, final=True)
            return None
        except Exception as e:
            self.logger.error(f"Error in scrape_topic: {str(e)}")
            # Salvage whatever was collected before the failure.
            if self.articles:
                return self.save_to_csv(self.articles, topic, final=True)
            return None
        finally:
            driver.quit()

    def scrape_article_parallel(self, article_data):
        """Fetch one article page and return its title/description dict.

        Runs in a worker thread with its own driver. Returns None on error.
        """
        # Bind url before the try so the except-logger can always use it.
        url = article_data['link']
        driver = self.create_driver()
        try:
            driver.get(url)
            time.sleep(2)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Extract title (simple h1 search).
            title = None
            title_element = soup.find('h1')
            if title_element:
                title = title_element.get_text().strip()

            # Extract description from the article body paragraphs.
            desc = None
            article_body = soup.find('section', class_='article-content')
            if article_body:
                paragraphs = article_body.find_all('p')
                desc = ' '.join([p.get_text().strip() for p in paragraphs])

            return {
                'title': title or 'Title not found',
                'desc': desc or 'Description not found',
                'date': article_data['date'] or 'Date not found',
                'link': url
            }
        except Exception as e:
            self.logger.error(f"Error scraping article {url}: {str(e)}")
            return None
        finally:
            driver.quit()

    def process_articles_batch(self, article_batch):
        """Scrape a batch of articles in parallel, appending to self.articles."""
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self.scrape_article_parallel, article_data)
                       for article_data in article_batch]

            successful_articles = 0
            for future in as_completed(futures):
                try:
                    article = future.result()
                    if article:
                        self.articles.append(article)
                        successful_articles += 1
                        self.logger.info(
                            f"Successfully scraped: {article['title'][:50]}...")
                except Exception as e:
                    self.logger.error(f"Error processing article: {str(e)}")

            if successful_articles < len(article_batch):
                self.logger.warning(
                    f"Only processed {successful_articles}/{len(article_batch)} "
                    f"articles in this batch")

    def save_progress(self, topic, force=False):
        """Save current progress to a partial CSV.

        Writes only when forced or when ``save_interval`` seconds have
        elapsed since the last save and there is something to write.
        """
        current_time = time.time()
        if force or (current_time - self.last_save_time >= self.save_interval
                     and self.articles):
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"scroll_{topic}articles_{timestamp}_partial.csv"
            try:
                with open(filename, 'w', newline='', encoding='utf-8') as file:
                    writer = csv.DictWriter(
                        file, fieldnames=['title', 'desc', 'date', 'link'])
                    writer.writeheader()
                    writer.writerows(self.articles)

                self.last_save_time = current_time
                print(f"\nProgress saved to {filename}: {len(self.articles)} articles")
                self.logger.info(
                    f"Progress saved to {filename}: {len(self.articles)} articles")
            except Exception as e:
                self.logger.error(f"Error saving progress: {str(e)}")

    def save_to_csv(self, articles, topic, final=False):
        """Save *articles* to a CSV file; return the filename or None on error."""
        if not articles:
            self.logger.error("No articles to save")
            return None

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = (f"scroll_{topic}articles_{timestamp}_"
                    f"{'final' if final else 'partial'}.csv")
        try:
            with open(filename, 'w', newline='', encoding='utf-8') as file:
                writer = csv.DictWriter(
                    file, fieldnames=['title', 'desc', 'date', 'link'])
                writer.writeheader()
                writer.writerows(articles)

            self.logger.info(
                f"Successfully saved {len(articles)} articles to: {filename}")
            return filename
        except Exception as e:
            self.logger.error(f"Error saving to CSV: {str(e)}")
            return None


def parse_arguments():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description='Scrape articles from Scroll.in by topic')
    parser.add_argument('topic', type=str,
                        help='Topic to scrape (e.g., "RSS", "Covid", "India")')
    parser.add_argument('-w', '--workers', type=int, default=4,
                        help='Number of worker threads (default: 4)')
    parser.add_argument('-i', '--interval', type=int, default=300,
                        help='Auto-save interval in seconds (default: 300)')
    parser.add_argument('-a', '--articles-per-page', type=int, default=10,
                        help='Expected number of articles per page (default: 10)')
    return parser.parse_args()


def main():
    """Entry point: parse args, run the scraper, save results."""
    # Pre-bind so the except handlers never hit a NameError if the
    # failure happens before these are assigned.
    scraper = None
    topic = None
    try:
        args = parse_arguments()

        scraper = ScrollArticleScraper(
            max_workers=args.workers,
            articles_per_page=args.articles_per_page
        )
        scraper.save_interval = args.interval

        topic = args.topic
        print(f"\nScraping {topic}-related articles from Scroll.in...")
        print("Press Ctrl+C at any time to save progress and exit.")

        final_csv = scraper.scrape_topic(topic.lower(), topic)
        if final_csv:
            print(f"\nArticles have been saved to: {final_csv}")
            print(f"Total articles scraped: {len(scraper.articles)}")
        else:
            print("\nError saving to final CSV file")
    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving progress...")
        if scraper is not None and scraper.articles:
            scraper.save_progress(topic or "interrupted", force=True)
            print("Saved progress and exiting.")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        if scraper is not None and scraper.articles:
            scraper.save_progress(topic or "error", force=True)
            print("Saved progress despite error.")


if __name__ == "__main__":
    main()