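"""Scrape Scroll.in search results for a topic and save article data to CSV.

Uses Selenium (via the project's create_chrome_driver helper) to load search
pages, BeautifulSoup to parse them, and a thread pool to fetch article bodies
in parallel. Progress is auto-saved periodically and on interrupt.
"""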
import time
import logging
import csv
import signal
import sys
import argparse
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

from bs4 import BeautifulSoup

from utils.webdriver_utils import create_chrome_driver


class ScrollArticleScraper:
    def __init__(self, max_workers=4, articles_per_page=10):
        self.max_workers = max_workers
        self.articles_per_page = articles_per_page
        self.base_url = "https://scroll.in/search"
        self.fetched_articles = set()
        self.articles = []
        self.is_interrupted = False
        self.last_save_time = time.time()
        self.save_interval = 300  # Save every 5 minutes

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        # Set up signal handlers so progress is saved on Ctrl+C / kill
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, signum, frame):
        """Handle interrupt signals by saving progress and exiting."""
        print("\nReceived interrupt signal. Saving progress and shutting down...")
        self.is_interrupted = True
        if self.articles:
            self.save_progress("interrupted", force=True)
        sys.exit(0)

    def create_driver(self):
        """Create and return a new Chrome driver instance."""
        return create_chrome_driver(headless=True, load_images=False)

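    # create_chrome_driver is a project helper (utils/webdriver_utils.py); it is
    # assumed to return a configured selenium webdriver.Chrome honouring the
    # headless / load_images flags.
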
    def get_total_pages(self, driver, search_term):
        """Get the total number of pages for the search term."""
        try:
            driver.get(f"{self.base_url}?q={search_term}&page=1")
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Scroll.in might have a different pagination structure
            pagination = soup.find('div', class_='page-numbers')
            if pagination:
                pages = pagination.find_all('a', class_='page-number')
                if pages:
                    # Get the highest page number listed
                    last_page = max(int(page.text.strip()) for page in pages)
                    return last_page

            # Fall back to 144 pages if pagination is not found
            return 144
        except Exception as e:
            self.logger.error(f"Error getting total pages: {str(e)}")
            return 144  # Default to 144 pages

    def extract_visible_articles(self, driver):
        """Parse the current page and return article links/dates not seen before."""
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        article_containers = soup.find_all('li', itemscope=True, itemtype="https://schema.org/NewsArticle")

        new_articles = []
        for container in article_containers:
            link_element = container.find('a', href=True)
            date_element = container.find('time')

            if link_element and link_element['href']:
                full_link = link_element['href'] if link_element['href'].startswith('http') else 'https://scroll.in' + link_element['href']

                # Extract and normalise the publication date
                date = None
                if date_element:
                    date_text = date_element.get_text(strip=True)
                    date_text = date_text.replace('Â', '').replace('·', '').strip()
                    try:
                        # Try the abbreviated month format first (e.g. "Jan 01, 2024")
                        parsed_date = datetime.strptime(date_text, '%b %d, %Y')
                        date = parsed_date.strftime('%Y-%m-%d')
                    except ValueError:
                        try:
                            # Fall back to the full month name (e.g. "January 01, 2024")
                            parsed_date = datetime.strptime(date_text, '%B %d, %Y')
                            date = parsed_date.strftime('%Y-%m-%d')
                        except ValueError:
                            # Keep the raw text if neither format matches
                            date = date_text

                if full_link not in self.fetched_articles:
                    self.fetched_articles.add(full_link)
                    new_articles.append({'link': full_link, 'date': date})

        return new_articles

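    # The selectors above assume search results shaped roughly like:
    #   <li itemscope itemtype="https://schema.org/NewsArticle">
    #     <a href="/article/...">headline</a> <time>Jan 01, 2024</time>
    #   </li>
    # (inferred from the code's selectors; Scroll.in's markup may change)
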
    def scrape_topic(self, search_term, topic):
        """Scrape articles from all pages for a given search term."""
        driver = self.create_driver()
        try:
            total_pages = self.get_total_pages(driver, search_term)
            total_expected_articles = total_pages * self.articles_per_page
            self.logger.info(f"Found {total_pages} pages to scrape (approximately {total_expected_articles} articles)")

            for page in range(1, total_pages + 1):
                if self.is_interrupted:
                    break

                page_url = f"{self.base_url}?q={search_term}&page={page}"
                articles_scraped = len(self.articles)
                progress_percentage = (articles_scraped / total_expected_articles) * 100
                self.logger.info(f"Scraping page {page}/{total_pages} - Progress: {articles_scraped}/{total_expected_articles} articles ({progress_percentage:.1f}%)")

                try:
                    driver.get(page_url)
                    time.sleep(2)  # Allow page to load

                    new_articles = self.extract_visible_articles(driver)
                    if new_articles:
                        self.process_articles_batch(new_articles)
                        self.logger.info(f"Scraped {len(new_articles)}/{self.articles_per_page} articles from page {page}")
                        self.save_progress(topic)
                    else:
                        self.logger.warning(f"No articles found on page {page}")
                except Exception as e:
                    self.logger.error(f"Error scraping page {page}: {str(e)}")
                    continue

            if self.articles:
                return self.save_to_csv(self.articles, topic, final=True)
            return None
        except Exception as e:
            self.logger.error(f"Error in scrape_topic: {str(e)}")
            if self.articles:
                return self.save_to_csv(self.articles, topic, final=True)
            return None
        finally:
            driver.quit()

    def scrape_article_parallel(self, article_data):
        """Fetch a single article page and extract its title and body text."""
        # Bind url before the try block so the except clause can always log it
        url = article_data['link']
        driver = self.create_driver()
        try:
            driver.get(url)
            time.sleep(2)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Extract title
            title = None
            title_element = soup.find('h1')  # Simple h1 search
            if title_element:
                title = title_element.get_text().strip()

            # Extract description
            desc = None
            article_body = soup.find('section', class_='article-content')
            if article_body:
                paragraphs = article_body.find_all('p')
                desc = ' '.join(p.get_text().strip() for p in paragraphs)

            return {
                'title': title or 'Title not found',
                'desc': desc or 'Description not found',
                'date': article_data['date'] or 'Date not found',
                'link': url
            }
        except Exception as e:
            self.logger.error(f"Error scraping article {url}: {str(e)}")
            return None
        finally:
            driver.quit()

    def process_articles_batch(self, article_batch):
        """Process a batch of articles in parallel."""
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self.scrape_article_parallel, article_data)
                       for article_data in article_batch]

            successful_articles = 0
            for future in as_completed(futures):
                try:
                    article = future.result()
                    if article:
                        self.articles.append(article)
                        successful_articles += 1
                        self.logger.info(f"Successfully scraped: {article['title'][:50]}...")
                except Exception as e:
                    self.logger.error(f"Error processing article: {str(e)}")

            if successful_articles < len(article_batch):
                self.logger.warning(f"Only processed {successful_articles}/{len(article_batch)} articles in this batch")

    def save_progress(self, topic, force=False):
        """Save current progress to CSV if forced or the save interval has elapsed."""
        current_time = time.time()
        if force or (current_time - self.last_save_time >= self.save_interval and self.articles):
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"scroll_{topic}_articles_{timestamp}_partial.csv"
            try:
                with open(filename, 'w', newline='', encoding='utf-8') as file:
                    writer = csv.DictWriter(file, fieldnames=['title', 'desc', 'date', 'link'])
                    writer.writeheader()
                    writer.writerows(self.articles)
                self.last_save_time = current_time
                print(f"\nProgress saved to {filename}: {len(self.articles)} articles")
                self.logger.info(f"Progress saved to {filename}: {len(self.articles)} articles")
            except Exception as e:
                self.logger.error(f"Error saving progress: {str(e)}")

    def save_to_csv(self, articles, topic, final=False):
        """Save articles to a CSV file and return the filename."""
        if not articles:
            self.logger.error("No articles to save")
            return None

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"scroll_{topic}_articles_{timestamp}_{'final' if final else 'partial'}.csv"
        try:
            with open(filename, 'w', newline='', encoding='utf-8') as file:
                writer = csv.DictWriter(file, fieldnames=['title', 'desc', 'date', 'link'])
                writer.writeheader()
                writer.writerows(articles)
            self.logger.info(f"Successfully saved {len(articles)} articles to: {filename}")
            return filename
        except Exception as e:
            self.logger.error(f"Error saving to CSV: {str(e)}")
            return None


def parse_arguments():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(description='Scrape articles from Scroll.in by topic')
    parser.add_argument('topic', type=str,
                        help='Topic to scrape (e.g., "RSS", "Covid", "India")')
    parser.add_argument('-w', '--workers', type=int, default=4,
                        help='Number of worker threads (default: 4)')
    parser.add_argument('-i', '--interval', type=int, default=300,
                        help='Auto-save interval in seconds (default: 300)')
    parser.add_argument('-a', '--articles-per-page', type=int, default=10,
                        help='Expected number of articles per page (default: 10)')
    return parser.parse_args()

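# Example invocation (assuming this file is saved as scroll_scraper.py):
#   python scroll_scraper.py "Covid" --workers 4 --interval 300
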
def main():
    scraper = None
    topic = None
    try:
        # Parse command line arguments
        args = parse_arguments()

        # Initialize the scraper with command line arguments
        scraper = ScrollArticleScraper(
            max_workers=args.workers,
            articles_per_page=args.articles_per_page
        )
        scraper.save_interval = args.interval

        # Get topic from command line argument
        topic = args.topic
        print(f"\nScraping {topic}-related articles from Scroll.in...")
        print("Press Ctrl+C at any time to save progress and exit.")

        final_csv = scraper.scrape_topic(topic.lower(), topic)
        if final_csv:
            print(f"\nArticles have been saved to: {final_csv}")
            print(f"Total articles scraped: {len(scraper.articles)}")
        else:
            print("\nError saving to final CSV file")
    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving progress...")
        if scraper and scraper.articles:
            scraper.save_progress(topic or "interrupted", force=True)
        print("Saved progress and exiting.")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        if scraper and scraper.articles:
            scraper.save_progress(topic or "error", force=True)
            print("Saved progress despite error.")


if __name__ == "__main__":
    main()