"""
Enhanced Prothom Alo Dataset Creator for Model Training
- Gets 50+ articles from both the English and Bengali editions
- Includes multiple categories
- Prepares the data for fine-tuning
"""
|
|
| import requests |
| from bs4 import BeautifulSoup |
| import json |
| import time |
| import re |
| from datetime import datetime |
| from typing import Dict, List, Optional |
| from datasets import Dataset, DatasetDict, Features, Value |
| from dataclasses import dataclass |
| import concurrent.futures |
| import logging |
| from pathlib import Path |
|
|
| |
# Module-level logging: timestamped INFO messages report scraping progress.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
|
|
@dataclass
class Article:
    """One scraped Prothom Alo article, enriched for training use.

    The first five fields come straight from the page; the rest are
    derived (cleaned text, word count, summary) or defaulted.
    """
    title: str                     # headline text
    content: str                   # raw extracted body text
    url: str                       # article URL
    category: str                  # section label (e.g. 'sports')
    language: str                  # 'english' or 'bengali'
    author: str = "Prothom Alo"    # byline; falls back to the outlet name
    published_date: str = ""       # ISO timestamp or page-provided date text
    word_count: int = 0            # token count of content_clean
    content_clean: str = ""        # normalized body text
    summary: str = ""              # leading ~200 words of content_clean
|
|
class EnhancedProthomAloScraper:
    """Enhanced scraper for comprehensive dataset creation"""

    def __init__(self, max_articles: int = 100, max_workers: int = 3):
        """Configure scraping limits and a shared HTTP session.

        Args:
            max_articles: hard cap on articles kept in the final dataset.
            max_workers: thread count for concurrent page fetches.
        """
        self.max_articles = max_articles
        self.max_workers = max_workers
        # One pooled session for every request; a desktop-browser UA
        # avoids trivially being served bot-blocking pages.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/91.0.4472.124 Safari/537.36'
            )
        })
| |
| def clean_text(self, text: str) -> str: |
| """Clean and normalize text""" |
| if not text: |
| return "" |
| |
| |
| text = re.sub(r'\s+', ' ', text) |
| |
| text = re.sub(r'[^\w\s\-\.\,\!\?\;\:\(\)]', ' ', text) |
| |
| return text.strip() |
| |
| def extract_article_content(self, soup: BeautifulSoup) -> Dict: |
| """Extract article content with improved parsing""" |
| try: |
| |
| title_elem = soup.select_one('h1, .headline, .article-title') |
| title = self.clean_text(title_elem.get_text()) if title_elem else "" |
| |
| |
| content_selectors = [ |
| '.article-content p', |
| '.story-content p', |
| '.content p', |
| 'article p', |
| 'p' |
| ] |
| |
| content = "" |
| for selector in content_selectors: |
| paragraphs = soup.select(selector) |
| if paragraphs: |
| content = ' '.join([self.clean_text(p.get_text()) for p in paragraphs if p.get_text()]) |
| break |
| |
| if not content: |
| |
| content = self.clean_text(soup.get_text()) |
| |
| |
| author_selectors = ['.author', '.byline', '.writer', '.reporter'] |
| author = "Prothom Alo" |
| for selector in author_selectors: |
| author_elem = soup.select_one(selector) |
| if author_elem: |
| author = self.clean_text(author_elem.get_text()) |
| break |
| |
| |
| date_selectors = ['time', '.date', '.published', '.timestamp'] |
| published_date = datetime.now().isoformat() |
| for selector in date_selectors: |
| date_elem = soup.select_one(selector) |
| if date_elem: |
| if date_elem.get('datetime'): |
| published_date = date_elem.get('datetime') |
| else: |
| published_date = self.clean_text(date_elem.get_text()) |
| break |
| |
| return { |
| 'title': title, |
| 'content': content, |
| 'author': author, |
| 'published_date': published_date |
| } |
| |
| except Exception as e: |
| logger.warning(f"Content extraction failed: {e}") |
| return { |
| 'title': "", |
| 'content': "", |
| 'author': "Prothom Alo", |
| 'published_date': datetime.now().isoformat() |
| } |
| |
| def extract_articles_from_page(self, url: str, category: str, language: str) -> List[Article]: |
| """Extract articles from a single page""" |
| articles = [] |
| |
| try: |
| logger.info(f"Fetching {url} for {category} articles") |
| response = self.session.get(url, timeout=15) |
| response.raise_for_status() |
| |
| soup = BeautifulSoup(response.content, 'html.parser') |
| |
| |
| link_patterns = [ |
| 'h1 a', 'h2 a', 'h3 a', |
| '.headline a', '.title a', |
| 'a[href*="article"]', 'a[href*="news"]', |
| '.news-item a', '.article-item a' |
| ] |
| |
| links = [] |
| for pattern in link_patterns: |
| links.extend(soup.select(pattern)) |
| |
| |
| seen_urls = set() |
| unique_links = [] |
| for link in links: |
| href = link.get('href', '') |
| if href and href not in seen_urls: |
| unique_links.append(link) |
| seen_urls.add(href) |
| |
| logger.info(f"Found {len(unique_links)} potential articles in {category}") |
| |
| |
| for i, link in enumerate(unique_links[:10]): |
| try: |
| href = link.get('href', '') |
| title = self.clean_text(link.get_text()) |
| |
| if not href or not title or len(title) < 10: |
| continue |
| |
| |
| if not href.startswith('http'): |
| if language == 'bengali': |
| href = 'https://www.prothomalo.com' + href |
| else: |
| href = 'https://en.prothomalo.com' + href |
| |
| |
| time.sleep(0.2) |
| |
| |
| article_response = self.session.get(href, timeout=10) |
| if not article_response.ok: |
| continue |
| |
| |
| article_soup = BeautifulSoup(article_response.content, 'html.parser') |
| extracted = self.extract_article_content(article_soup) |
| |
| content = extracted['content'] |
| if not content or len(content) < 100: |
| continue |
| |
| |
| content_clean = self.clean_text(content) |
| word_count = len(content_clean.split()) |
| |
| |
| summary = ' '.join(content_clean.split()[:200]) |
| if word_count > 200: |
| summary += "..." |
| |
| article = Article( |
| title=extracted['title'] or title, |
| content=content, |
| url=href, |
| category=category, |
| language=language, |
| author=extracted['author'], |
| published_date=extracted['published_date'], |
| word_count=word_count, |
| content_clean=content_clean, |
| summary=summary |
| ) |
| |
| articles.append(article) |
| logger.info(f" β
Article {i+1}: {word_count} words") |
| |
| if len(articles) >= self.max_articles: |
| break |
| |
| except Exception as e: |
| logger.warning(f"Failed to process article {i+1}: {e}") |
| continue |
| |
| return articles |
| |
| except Exception as e: |
| logger.error(f"Failed to fetch {url}: {e}") |
| return [] |
| |
| def scrape_comprehensive_dataset(self) -> List[Article]: |
| """Create a comprehensive dataset from multiple sources""" |
| logger.info(f"Starting comprehensive dataset creation (max: {self.max_articles} articles)") |
| |
| |
| target_pages = [ |
| |
| ('https://en.prothomalo.com/', 'general', 'english'), |
| ('https://en.prothomalo.com/opinion/', 'opinion', 'english'), |
| ('https://en.prothomalo.com/bangladesh/', 'bangladesh', 'english'), |
| ('https://en.prothomalo.com/international/', 'international', 'english'), |
| ('https://en.prothomalo.com/sports/', 'sports', 'english'), |
| ('https://en.prothomalo.com/business/', 'business', 'english'), |
| |
| |
| ('https://www.prothomalo.com/', 'general', 'bengali'), |
| ('https://www.prothomalo.com/opinion/', 'opinion', 'bengali'), |
| ('https://www.prothomalo.com/bangladesh/', 'bangladesh', 'bengali'), |
| ('https://www.prothomalo.com/international/', 'international', 'bengali'), |
| ('https://www.prothomalo.com/sports/', 'sports', 'bengali'), |
| ('https://www.prothomalo.com/business/', 'business', 'bengali'), |
| ] |
| |
| all_articles = [] |
| |
| |
| with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor: |
| futures = [] |
| |
| for url, category, language in target_pages: |
| future = executor.submit(self.extract_articles_from_page, url, category, language) |
| futures.append(future) |
| |
| |
| for future in concurrent.futures.as_completed(futures): |
| try: |
| articles = future.result() |
| all_articles.extend(articles) |
| logger.info(f"Collected {len(articles)} articles") |
| |
| if len(all_articles) >= self.max_articles: |
| logger.info(f"Reached target of {self.max_articles} articles") |
| break |
| |
| except Exception as e: |
| logger.error(f"Future processing failed: {e}") |
| |
| |
| unique_articles = [] |
| seen_urls = set() |
| |
| for article in all_articles: |
| if article.url not in seen_urls: |
| unique_articles.append(article) |
| seen_urls.add(article.url) |
| |
| logger.info(f"Final dataset: {len(unique_articles)} unique articles") |
| return unique_articles[:self.max_articles] |
| |
| def create_enhanced_dataset(self, articles: List[Article]) -> DatasetDict: |
| """Create enhanced dataset for model training""" |
| if not articles: |
| raise ValueError("No articles provided") |
| |
| logger.info(f"Creating enhanced dataset from {len(articles)} articles") |
| |
| |
| article_dicts = [] |
| for i, article in enumerate(articles): |
| article_dicts.append({ |
| 'id': f"prothomalo_{i+1:04d}", |
| 'title': article.title, |
| 'content': article.content, |
| 'content_clean': article.content_clean, |
| 'summary': article.summary, |
| 'category': article.category, |
| 'language': article.language, |
| 'author': article.author, |
| 'url': article.url, |
| 'published_date': article.published_date, |
| 'word_count': article.word_count, |
| 'source': 'Prothom Alo', |
| 'text_for_training': f"Title: {article.title}\n\nContent: {article.content_clean}", |
| }) |
| |
| |
| features = Features({ |
| 'id': Value('string'), |
| 'title': Value('string'), |
| 'content': Value('string'), |
| 'content_clean': Value('string'), |
| 'summary': Value('string'), |
| 'category': Value('string'), |
| 'language': Value('string'), |
| 'author': Value('string'), |
| 'url': Value('string'), |
| 'published_date': Value('string'), |
| 'word_count': Value('int32'), |
| 'source': Value('string'), |
| 'text_for_training': Value('string') |
| }) |
| |
| |
| dataset = Dataset.from_list(article_dicts, features=features) |
| |
| |
| if len(dataset) < 2: |
| return DatasetDict({ |
| 'train': dataset, |
| 'validation': dataset, |
| 'test': dataset |
| }) |
| |
| |
| train_test = dataset.train_test_split(test_size=0.2, seed=42) |
| val_test = train_test['train'].train_test_split(test_size=0.125, seed=42) |
| |
| final_dataset = DatasetDict({ |
| 'train': val_test['train'], |
| 'validation': val_test['test'], |
| 'test': train_test['test'] |
| }) |
| |
| logger.info("Dataset splits created:") |
| for split, data in final_dataset.items(): |
| logger.info(f" {split}: {len(data)} articles") |
| |
| return final_dataset |
| |
| def save_comprehensive_dataset(self, dataset: DatasetDict, output_dir: str = "enhanced_prothomalo"): |
| """Save comprehensive dataset with metadata""" |
| |
| try: |
| |
| dataset_path = f"./{output_dir}" |
| dataset.save_to_disk(dataset_path) |
| logger.info(f"β
Dataset saved to: {dataset_path}") |
| |
| |
| all_articles = [] |
| for split_data in dataset.values(): |
| all_articles.extend(split_data) |
| |
| |
| categories = list(set(article['category'] for article in all_articles)) |
| languages = list(set(article['language'] for article in all_articles)) |
| word_counts = [article['word_count'] for article in all_articles] |
| |
| metadata = { |
| 'creation_date': datetime.now().isoformat(), |
| 'dataset_version': '1.0', |
| 'source_websites': [ |
| 'https://en.prothomalo.com', |
| 'https://www.prothomalo.com' |
| ], |
| 'total_articles': len(all_articles), |
| 'languages': languages, |
| 'categories': categories, |
| 'language_distribution': { |
| lang: len([a for a in all_articles if a['language'] == lang]) |
| for lang in languages |
| }, |
| 'category_distribution': { |
| cat: len([a for a in all_articles if a['category'] == cat]) |
| for cat in categories |
| }, |
| 'word_count_stats': { |
| 'min': min(word_counts), |
| 'max': max(word_counts), |
| 'mean': sum(word_counts) / len(word_counts), |
| 'total_words': sum(word_counts) |
| }, |
| 'scraping_method': 'comprehensive_concurrent', |
| 'features': [ |
| 'title', 'content', 'content_clean', 'summary', |
| 'category', 'language', 'author', 'word_count', |
| 'text_for_training' |
| ], |
| 'intended_use': 'Language model fine-tuning and Bengali-English NLP research', |
| 'license': 'Research use - subject to Prothom Alo terms of service', |
| 'model_training_ready': True |
| } |
| |
| with open(f"{dataset_path}/dataset_metadata.json", 'w') as f: |
| json.dump(metadata, f, indent=2) |
| |
| |
| from datasets import load_from_disk |
| loaded = load_from_disk(dataset_path) |
| logger.info(f"β
Dataset loading test passed") |
| |
| |
| logger.info(f"\nπ Enhanced Dataset Statistics:") |
| logger.info(f"Total articles: {len(all_articles)}") |
| logger.info(f"Languages: {languages}") |
| logger.info(f"Categories: {categories}") |
| logger.info(f"Word count range: {min(word_counts)} - {max(word_counts)}") |
| logger.info(f"Average words per article: {sum(word_counts) / len(word_counts):.0f}") |
| |
| return dataset_path |
| |
| except Exception as e: |
| logger.error(f"Save operation failed: {e}") |
| raise |
|
|
def main():
    """Run the full pipeline: scrape, build splits, save to disk.

    Returns:
        The saved dataset path on success, or None if nothing was scraped.

    Raises:
        Re-raises any pipeline failure after logging it.
    """
    # FIX: the original banner/status strings contained mojibake (garbled
    # emoji bytes such as 'π'/'β'); replaced with plain readable text.
    logger.info("Enhanced Prothom Alo Dataset Creator")
    logger.info("=" * 60)

    try:
        scraper = EnhancedProthomAloScraper(max_articles=50, max_workers=4)

        articles = scraper.scrape_comprehensive_dataset()

        if not articles:
            logger.error("No articles were scraped")
            return None

        dataset = scraper.create_enhanced_dataset(articles)

        dataset_path = scraper.save_comprehensive_dataset(dataset)

        logger.info("\nSUCCESS! Enhanced Prothom Alo dataset created!")
        logger.info(f"Location: {dataset_path}")
        logger.info("Ready for model fine-tuning!")

        return dataset_path

    except Exception as e:
        logger.error(f"Enhanced dataset creation failed: {e}")
        raise
|
|
# Script entry point: run the full scrape-and-save pipeline.
if __name__ == "__main__":
    main()