import requests
from bs4 import BeautifulSoup
import time
import random
import json
from pathlib import Path
import logging
from urllib.parse import urljoin

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class BengaliDataCollector:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.output_dir = Path('data/raw')
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def make_request(self, url, retries=3, delay=1):
        """Make an HTTP request with retry logic and rate limiting."""
        for attempt in range(retries):
            try:
                time.sleep(delay + random.random())  # Rate limiting with jitter
                response = requests.get(url, headers=self.headers)
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
                if attempt == retries - 1:
                    logger.error(f"Failed to fetch {url} after {retries} attempts")
                    raise
                time.sleep(delay * (2 ** attempt))  # Exponential backoff

    def scrape_wikipedia(self):
        """Scrape Bengali text from Wikipedia."""
        url = "https://bn.wikipedia.org/wiki/প্রধান_পাতা"
        logger.info(f"Scraping Wikipedia: {url}")
        try:
            response = self.make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Get main content and featured articles
            content_div = soup.find('div', {'id': 'mw-content-text'})
            articles = []

            if content_div:
                # Extract article links
                article_links = content_div.find_all('a', href=True)
                for link in article_links[:50]:  # Limit to first 50 articles
                    if link['href'].startswith('/wiki/') and ':' not in link['href']:
                        article_url = urljoin('https://bn.wikipedia.org', link['href'])
                        try:
                            article_response = self.make_request(article_url)
                            article_soup = BeautifulSoup(article_response.content, 'html.parser')

                            # Extract article content
                            article_content = article_soup.find('div', {'id': 'mw-content-text'})
                            if article_content:
                                text = article_content.get_text(separator='\n', strip=True)
                                articles.append({
                                    'url': article_url,
                                    'content': text
                                })
                                logger.info(f"Successfully scraped article: {article_url}")
                        except Exception as e:
                            logger.error(f"Failed to scrape article {article_url}: {str(e)}")

            # Save Wikipedia data
            with open(self.output_dir / 'wikipedia_data.json', 'w', encoding='utf-8') as f:
                json.dump(articles, f, ensure_ascii=False, indent=2)

            return len(articles)

        except Exception as e:
            logger.error(f"Failed to scrape Wikipedia: {str(e)}")
            return 0

    def scrape_prothom_alo(self):
        """Scrape Bengali text from Prothom Alo."""
        base_url = "https://www.prothomalo.com"
        categories = ['bangladesh', 'international', 'opinion', 'science-technology']
        articles = []

        for category in categories:
            url = f"{base_url}/{category}"
            logger.info(f"Scraping Prothom Alo category: {category}")

            try:
                response = self.make_request(url)
                soup = BeautifulSoup(response.content, 'html.parser')

                # Find article links
                article_links = soup.find_all('a', href=True)
                for link in article_links[:10]:  # Limit to 10 articles per category
                    article_url = urljoin(base_url, link['href'])
                    if category in article_url:
                        try:
                            article_response = self.make_request(article_url)
                            article_soup = BeautifulSoup(article_response.content, 'html.parser')

                            # Extract article content
                            article_content = article_soup.find('div', {'class': 'story-content'})
                            if article_content:
                                text = article_content.get_text(separator='\n', strip=True)
                                articles.append({
                                    'url': article_url,
                                    'category': category,
                                    'content': text
                                })
                                logger.info(f"Successfully scraped article: {article_url}")
                        except Exception as e:
                            logger.error(f"Failed to scrape article {article_url}: {str(e)}")
            except Exception as e:
                logger.error(f"Failed to scrape category {category}: {str(e)}")

        # Save Prothom Alo data
        with open(self.output_dir / 'prothomalo_data.json', 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)

        return len(articles)

    def collect(self):
        """Main method to collect data from all sources."""
        logger.info("Starting data collection")

        wiki_count = self.scrape_wikipedia()
        logger.info(f"Collected {wiki_count} articles from Wikipedia")

        prothomalo_count = self.scrape_prothom_alo()
        logger.info(f"Collected {prothomalo_count} articles from Prothom Alo")

        # Combine and process the collected data
        self.process_collected_data()

        logger.info("Data collection completed")

    def process_collected_data(self):
        """Process and combine collected data."""
        try:
            # Read collected data
            with open(self.output_dir / 'wikipedia_data.json', 'r', encoding='utf-8') as f:
                wiki_data = json.load(f)
            with open(self.output_dir / 'prothomalo_data.json', 'r', encoding='utf-8') as f:
                news_data = json.load(f)

            # Combine and format data
            processed_data = []

            # Process Wikipedia articles
            for article in wiki_data:
                processed_data.append({
                    'text': article['content'],
                    'source': 'wikipedia',
                    'url': article['url']
                })

            # Process news articles
            for article in news_data:
                processed_data.append({
                    'text': article['content'],
                    'source': 'prothomalo',
                    'category': article.get('category', ''),
                    'url': article['url']
                })

            # Save processed data
            with open(self.output_dir / 'processed_data.json', 'w', encoding='utf-8') as f:
                json.dump(processed_data, f, ensure_ascii=False, indent=2)

            logger.info(f"Successfully processed {len(processed_data)} articles")

        except Exception as e:
            logger.error(f"Failed to process collected data: {str(e)}")
            raise


if __name__ == "__main__":
    collector = BengaliDataCollector()
    collector.collect()