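"""Scrape Scroll.in search results for a topic and save article data to CSV.

Uses Selenium (via the project's create_chrome_driver helper) to load search
pages, BeautifulSoup to parse them, and a thread pool to fetch article bodies
in parallel. Progress is auto-saved periodically and on interrupt.
"""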
import time
import logging
import csv
import signal
import sys
import argparse
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

from bs4 import BeautifulSoup

from utils.webdriver_utils import create_chrome_driver


class ScrollArticleScraper:
    def __init__(self, max_workers=4, articles_per_page=10):
        self.max_workers = max_workers
        self.articles_per_page = articles_per_page
        self.base_url = "https://scroll.in/search"
        self.fetched_articles = set()
        self.articles = []
        self.is_interrupted = False
        self.last_save_time = time.time()
        self.save_interval = 300  # Save every 5 minutes

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        # Set up signal handlers so progress is saved on Ctrl+C / kill
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, signum, frame):
        """Handle interrupt signals by saving progress and exiting."""
        print("\nReceived interrupt signal. Saving progress and shutting down...")
        self.is_interrupted = True
        if self.articles:
            self.save_progress("interrupted", force=True)
        sys.exit(0)

    def create_driver(self):
        """Create and return a new Chrome driver instance."""
        return create_chrome_driver(headless=True, load_images=False)

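    # create_chrome_driver is a project helper (utils/webdriver_utils.py); it is
    # assumed to return a configured selenium webdriver.Chrome honouring the
    # headless / load_images flags.
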
    def get_total_pages(self, driver, search_term):
        """Get the total number of pages for the search term."""
        try:
            driver.get(f"{self.base_url}?q={search_term}&page=1")
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Scroll.in might have a different pagination structure
            pagination = soup.find('div', class_='page-numbers')
            if pagination:
                pages = pagination.find_all('a', class_='page-number')
                if pages:
                    # Get the highest page number listed
                    last_page = max(int(page.text.strip()) for page in pages)
                    return last_page

            # Fall back to 144 pages if pagination is not found
            return 144
        except Exception as e:
            self.logger.error(f"Error getting total pages: {str(e)}")
            return 144  # Default to 144 pages

    def extract_visible_articles(self, driver):
        """Parse the current page and return article links/dates not seen before."""
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        article_containers = soup.find_all('li', itemscope=True, itemtype="https://schema.org/NewsArticle")

        new_articles = []
        for container in article_containers:
            link_element = container.find('a', href=True)
            date_element = container.find('time')

            if link_element and link_element['href']:
                full_link = link_element['href'] if link_element['href'].startswith('http') else 'https://scroll.in' + link_element['href']

                # Extract and normalise the publication date
                date = None
                if date_element:
                    date_text = date_element.get_text(strip=True)
                    date_text = date_text.replace('Â', '').replace('·', '').strip()
                    try:
                        # Try the abbreviated month format first (e.g. "Jan 01, 2024")
                        parsed_date = datetime.strptime(date_text, '%b %d, %Y')
                        date = parsed_date.strftime('%Y-%m-%d')
                    except ValueError:
                        try:
                            # Fall back to the full month name (e.g. "January 01, 2024")
                            parsed_date = datetime.strptime(date_text, '%B %d, %Y')
                            date = parsed_date.strftime('%Y-%m-%d')
                        except ValueError:
                            # Keep the raw text if neither format matches
                            date = date_text

                if full_link not in self.fetched_articles:
                    self.fetched_articles.add(full_link)
                    new_articles.append({'link': full_link, 'date': date})

        return new_articles

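    # The selectors above assume search results shaped roughly like:
    #   <li itemscope itemtype="https://schema.org/NewsArticle">
    #     <a href="/article/...">headline</a> <time>Jan 01, 2024</time>
    #   </li>
    # (inferred from the code's selectors; Scroll.in's markup may change)
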
    def scrape_topic(self, search_term, topic):
        """Scrape articles from all pages for a given search term."""
        driver = self.create_driver()
        try:
            total_pages = self.get_total_pages(driver, search_term)
            total_expected_articles = total_pages * self.articles_per_page
            self.logger.info(f"Found {total_pages} pages to scrape (approximately {total_expected_articles} articles)")

            for page in range(1, total_pages + 1):
                if self.is_interrupted:
                    break

                page_url = f"{self.base_url}?q={search_term}&page={page}"
                articles_scraped = len(self.articles)
                progress_percentage = (articles_scraped / total_expected_articles) * 100
                self.logger.info(f"Scraping page {page}/{total_pages} - Progress: {articles_scraped}/{total_expected_articles} articles ({progress_percentage:.1f}%)")

                try:
                    driver.get(page_url)
                    time.sleep(2)  # Allow page to load

                    new_articles = self.extract_visible_articles(driver)
                    if new_articles:
                        self.process_articles_batch(new_articles)
                        self.logger.info(f"Scraped {len(new_articles)}/{self.articles_per_page} articles from page {page}")
                        self.save_progress(topic)
                    else:
                        self.logger.warning(f"No articles found on page {page}")
                except Exception as e:
                    self.logger.error(f"Error scraping page {page}: {str(e)}")
                    continue

            if self.articles:
                return self.save_to_csv(self.articles, topic, final=True)
            return None
        except Exception as e:
            self.logger.error(f"Error in scrape_topic: {str(e)}")
            if self.articles:
                return self.save_to_csv(self.articles, topic, final=True)
            return None
        finally:
            driver.quit()

    def scrape_article_parallel(self, article_data):
        """Fetch a single article page and extract its title and body text."""
        # Bind url before the try block so the except clause can always log it
        url = article_data['link']
        driver = self.create_driver()
        try:
            driver.get(url)
            time.sleep(2)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Extract title
            title = None
            title_element = soup.find('h1')  # Simple h1 search
            if title_element:
                title = title_element.get_text().strip()

            # Extract description
            desc = None
            article_body = soup.find('section', class_='article-content')
            if article_body:
                paragraphs = article_body.find_all('p')
                desc = ' '.join(p.get_text().strip() for p in paragraphs)

            return {
                'title': title or 'Title not found',
                'desc': desc or 'Description not found',
                'date': article_data['date'] or 'Date not found',
                'link': url
            }
        except Exception as e:
            self.logger.error(f"Error scraping article {url}: {str(e)}")
            return None
        finally:
            driver.quit()

    def process_articles_batch(self, article_batch):
        """Process a batch of articles in parallel."""
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self.scrape_article_parallel, article_data)
                       for article_data in article_batch]

            successful_articles = 0
            for future in as_completed(futures):
                try:
                    article = future.result()
                    if article:
                        self.articles.append(article)
                        successful_articles += 1
                        self.logger.info(f"Successfully scraped: {article['title'][:50]}...")
                except Exception as e:
                    self.logger.error(f"Error processing article: {str(e)}")

            if successful_articles < len(article_batch):
                self.logger.warning(f"Only processed {successful_articles}/{len(article_batch)} articles in this batch")

    def save_progress(self, topic, force=False):
        """Save current progress to CSV if forced or the save interval has elapsed."""
        current_time = time.time()
        if force or (current_time - self.last_save_time >= self.save_interval and self.articles):
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"scroll_{topic}_articles_{timestamp}_partial.csv"
            try:
                with open(filename, 'w', newline='', encoding='utf-8') as file:
                    writer = csv.DictWriter(file, fieldnames=['title', 'desc', 'date', 'link'])
                    writer.writeheader()
                    writer.writerows(self.articles)
                self.last_save_time = current_time
                print(f"\nProgress saved to {filename}: {len(self.articles)} articles")
                self.logger.info(f"Progress saved to {filename}: {len(self.articles)} articles")
            except Exception as e:
                self.logger.error(f"Error saving progress: {str(e)}")

    def save_to_csv(self, articles, topic, final=False):
        """Save articles to a CSV file and return the filename."""
        if not articles:
            self.logger.error("No articles to save")
            return None

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"scroll_{topic}_articles_{timestamp}_{'final' if final else 'partial'}.csv"
        try:
            with open(filename, 'w', newline='', encoding='utf-8') as file:
                writer = csv.DictWriter(file, fieldnames=['title', 'desc', 'date', 'link'])
                writer.writeheader()
                writer.writerows(articles)
            self.logger.info(f"Successfully saved {len(articles)} articles to: {filename}")
            return filename
        except Exception as e:
            self.logger.error(f"Error saving to CSV: {str(e)}")
            return None


def parse_arguments():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(description='Scrape articles from Scroll.in by topic')
    parser.add_argument('topic', type=str,
                        help='Topic to scrape (e.g., "RSS", "Covid", "India")')
    parser.add_argument('-w', '--workers', type=int, default=4,
                        help='Number of worker threads (default: 4)')
    parser.add_argument('-i', '--interval', type=int, default=300,
                        help='Auto-save interval in seconds (default: 300)')
    parser.add_argument('-a', '--articles-per-page', type=int, default=10,
                        help='Expected number of articles per page (default: 10)')
    return parser.parse_args()

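# Example invocation (assuming this file is saved as scroll_scraper.py):
#   python scroll_scraper.py "Covid" --workers 4 --interval 300
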
def main():
    scraper = None
    topic = None
    try:
        # Parse command line arguments
        args = parse_arguments()

        # Initialize the scraper with command line arguments
        scraper = ScrollArticleScraper(
            max_workers=args.workers,
            articles_per_page=args.articles_per_page
        )
        scraper.save_interval = args.interval

        # Get topic from command line argument
        topic = args.topic
        print(f"\nScraping {topic}-related articles from Scroll.in...")
        print("Press Ctrl+C at any time to save progress and exit.")

        final_csv = scraper.scrape_topic(topic.lower(), topic)
        if final_csv:
            print(f"\nArticles have been saved to: {final_csv}")
            print(f"Total articles scraped: {len(scraper.articles)}")
        else:
            print("\nError saving to final CSV file")
    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving progress...")
        if scraper and scraper.articles:
            scraper.save_progress(topic or "interrupted", force=True)
        print("Saved progress and exiting.")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        if scraper and scraper.articles:
            scraper.save_progress(topic or "error", force=True)
            print("Saved progress despite error.")


if __name__ == "__main__":
    main()