# Source: prothomalo-language-model / enhanced_dataset_creator.py
# Uploaded by likhonsheikh — "Add dataset creation script" (commit 5149c96, verified)
#!/usr/bin/env python3
"""
Enhanced Prothom Alo Dataset Creator for Model Training
- Gets 50+ articles from both English and Bengali
- Includes multiple categories
- Prepares for fine-tuning
"""
import requests
from bs4 import BeautifulSoup
import json
import time
import re
from datetime import datetime
from typing import Dict, List, Optional
from datasets import Dataset, DatasetDict, Features, Value
from dataclasses import dataclass
import concurrent.futures
import logging
from pathlib import Path
# Configure root logging once at import time (INFO level, timestamped lines);
# the module-level `logger` is used by every class/function in this file.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
@dataclass
class Article:
    """A single scraped news article: the raw scraped fields plus derived
    fields (cleaned text, word count, summary) used for training-data prep."""
    # Required fields, populated directly from the scraped page.
    title: str
    content: str
    url: str
    category: str       # section label, e.g. 'opinion', 'sports' (set by the scraper)
    language: str       # 'english' or 'bengali' (per the scraper's target pages)
    # Optional metadata with safe defaults when extraction finds nothing.
    author: str = "Prothom Alo"
    published_date: str = ""
    # Derived fields, computed during scraping.
    word_count: int = 0
    content_clean: str = ""
    summary: str = ""   # first ~200 words of content_clean, '...'-terminated if truncated
class EnhancedProthomAloScraper:
"""Enhanced scraper for comprehensive dataset creation"""
def __init__(self, max_articles: int = 100, max_workers: int = 3):
self.max_articles = max_articles
self.max_workers = max_workers
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
})
def clean_text(self, text: str) -> str:
"""Clean and normalize text"""
if not text:
return ""
# Remove extra whitespace
text = re.sub(r'\s+', ' ', text)
# Remove special characters but keep punctuation
text = re.sub(r'[^\w\s\-\.\,\!\?\;\:\(\)]', ' ', text)
# Strip and normalize
return text.strip()
def extract_article_content(self, soup: BeautifulSoup) -> Dict:
"""Extract article content with improved parsing"""
try:
# Title extraction
title_elem = soup.select_one('h1, .headline, .article-title')
title = self.clean_text(title_elem.get_text()) if title_elem else ""
# Content extraction
content_selectors = [
'.article-content p',
'.story-content p',
'.content p',
'article p',
'p'
]
content = ""
for selector in content_selectors:
paragraphs = soup.select(selector)
if paragraphs:
content = ' '.join([self.clean_text(p.get_text()) for p in paragraphs if p.get_text()])
break
if not content:
# Fallback: get all text content
content = self.clean_text(soup.get_text())
# Author extraction
author_selectors = ['.author', '.byline', '.writer', '.reporter']
author = "Prothom Alo"
for selector in author_selectors:
author_elem = soup.select_one(selector)
if author_elem:
author = self.clean_text(author_elem.get_text())
break
# Date extraction
date_selectors = ['time', '.date', '.published', '.timestamp']
published_date = datetime.now().isoformat()
for selector in date_selectors:
date_elem = soup.select_one(selector)
if date_elem:
if date_elem.get('datetime'):
published_date = date_elem.get('datetime')
else:
published_date = self.clean_text(date_elem.get_text())
break
return {
'title': title,
'content': content,
'author': author,
'published_date': published_date
}
except Exception as e:
logger.warning(f"Content extraction failed: {e}")
return {
'title': "",
'content': "",
'author': "Prothom Alo",
'published_date': datetime.now().isoformat()
}
def extract_articles_from_page(self, url: str, category: str, language: str) -> List[Article]:
"""Extract articles from a single page"""
articles = []
try:
logger.info(f"Fetching {url} for {category} articles")
response = self.session.get(url, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Multiple link patterns for different page structures
link_patterns = [
'h1 a', 'h2 a', 'h3 a',
'.headline a', '.title a',
'a[href*="article"]', 'a[href*="news"]',
'.news-item a', '.article-item a'
]
links = []
for pattern in link_patterns:
links.extend(soup.select(pattern))
# Remove duplicates
seen_urls = set()
unique_links = []
for link in links:
href = link.get('href', '')
if href and href not in seen_urls:
unique_links.append(link)
seen_urls.add(href)
logger.info(f"Found {len(unique_links)} potential articles in {category}")
# Process each article
for i, link in enumerate(unique_links[:10]): # Limit per page
try:
href = link.get('href', '')
title = self.clean_text(link.get_text())
if not href or not title or len(title) < 10:
continue
# Make URL absolute
if not href.startswith('http'):
if language == 'bengali':
href = 'https://www.prothomalo.com' + href
else:
href = 'https://en.prothomalo.com' + href
# Rate limiting
time.sleep(0.2)
# Fetch article
article_response = self.session.get(href, timeout=10)
if not article_response.ok:
continue
# Parse article
article_soup = BeautifulSoup(article_response.content, 'html.parser')
extracted = self.extract_article_content(article_soup)
content = extracted['content']
if not content or len(content) < 100:
continue
# Clean content and create summary
content_clean = self.clean_text(content)
word_count = len(content_clean.split())
# Create simple summary (first 200 words)
summary = ' '.join(content_clean.split()[:200])
if word_count > 200:
summary += "..."
article = Article(
title=extracted['title'] or title,
content=content,
url=href,
category=category,
language=language,
author=extracted['author'],
published_date=extracted['published_date'],
word_count=word_count,
content_clean=content_clean,
summary=summary
)
articles.append(article)
logger.info(f" βœ… Article {i+1}: {word_count} words")
if len(articles) >= self.max_articles:
break
except Exception as e:
logger.warning(f"Failed to process article {i+1}: {e}")
continue
return articles
except Exception as e:
logger.error(f"Failed to fetch {url}: {e}")
return []
def scrape_comprehensive_dataset(self) -> List[Article]:
"""Create a comprehensive dataset from multiple sources"""
logger.info(f"Starting comprehensive dataset creation (max: {self.max_articles} articles)")
# Define target pages
target_pages = [
# English pages
('https://en.prothomalo.com/', 'general', 'english'),
('https://en.prothomalo.com/opinion/', 'opinion', 'english'),
('https://en.prothomalo.com/bangladesh/', 'bangladesh', 'english'),
('https://en.prothomalo.com/international/', 'international', 'english'),
('https://en.prothomalo.com/sports/', 'sports', 'english'),
('https://en.prothomalo.com/business/', 'business', 'english'),
# Bengali pages
('https://www.prothomalo.com/', 'general', 'bengali'),
('https://www.prothomalo.com/opinion/', 'opinion', 'bengali'),
('https://www.prothomalo.com/bangladesh/', 'bangladesh', 'bengali'),
('https://www.prothomalo.com/international/', 'international', 'bengali'),
('https://www.prothomalo.com/sports/', 'sports', 'bengali'),
('https://www.prothomalo.com/business/', 'business', 'bengali'),
]
all_articles = []
# Use thread pool for concurrent processing
with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
futures = []
for url, category, language in target_pages:
future = executor.submit(self.extract_articles_from_page, url, category, language)
futures.append(future)
# Collect results
for future in concurrent.futures.as_completed(futures):
try:
articles = future.result()
all_articles.extend(articles)
logger.info(f"Collected {len(articles)} articles")
if len(all_articles) >= self.max_articles:
logger.info(f"Reached target of {self.max_articles} articles")
break
except Exception as e:
logger.error(f"Future processing failed: {e}")
# Remove duplicates based on URL
unique_articles = []
seen_urls = set()
for article in all_articles:
if article.url not in seen_urls:
unique_articles.append(article)
seen_urls.add(article.url)
logger.info(f"Final dataset: {len(unique_articles)} unique articles")
return unique_articles[:self.max_articles]
def create_enhanced_dataset(self, articles: List[Article]) -> DatasetDict:
"""Create enhanced dataset for model training"""
if not articles:
raise ValueError("No articles provided")
logger.info(f"Creating enhanced dataset from {len(articles)} articles")
# Convert to dictionaries with training-focused structure
article_dicts = []
for i, article in enumerate(articles):
article_dicts.append({
'id': f"prothomalo_{i+1:04d}",
'title': article.title,
'content': article.content,
'content_clean': article.content_clean,
'summary': article.summary,
'category': article.category,
'language': article.language,
'author': article.author,
'url': article.url,
'published_date': article.published_date,
'word_count': article.word_count,
'source': 'Prothom Alo',
'text_for_training': f"Title: {article.title}\n\nContent: {article.content_clean}", # Combined text
})
# Define features for training
features = Features({
'id': Value('string'),
'title': Value('string'),
'content': Value('string'),
'content_clean': Value('string'),
'summary': Value('string'),
'category': Value('string'),
'language': Value('string'),
'author': Value('string'),
'url': Value('string'),
'published_date': Value('string'),
'word_count': Value('int32'),
'source': Value('string'),
'text_for_training': Value('string')
})
# Create dataset
dataset = Dataset.from_list(article_dicts, features=features)
# Simple approach: create single dataset and split
if len(dataset) < 2:
return DatasetDict({
'train': dataset,
'validation': dataset,
'test': dataset
})
# Create 80/10/10 splits for all data together
train_test = dataset.train_test_split(test_size=0.2, seed=42)
val_test = train_test['train'].train_test_split(test_size=0.125, seed=42) # 10% of total
final_dataset = DatasetDict({
'train': val_test['train'],
'validation': val_test['test'],
'test': train_test['test']
})
logger.info("Dataset splits created:")
for split, data in final_dataset.items():
logger.info(f" {split}: {len(data)} articles")
return final_dataset
def save_comprehensive_dataset(self, dataset: DatasetDict, output_dir: str = "enhanced_prothomalo"):
"""Save comprehensive dataset with metadata"""
try:
# Save dataset
dataset_path = f"./{output_dir}"
dataset.save_to_disk(dataset_path)
logger.info(f"βœ… Dataset saved to: {dataset_path}")
# Create comprehensive metadata
all_articles = []
for split_data in dataset.values():
all_articles.extend(split_data)
# Analyze dataset
categories = list(set(article['category'] for article in all_articles))
languages = list(set(article['language'] for article in all_articles))
word_counts = [article['word_count'] for article in all_articles]
metadata = {
'creation_date': datetime.now().isoformat(),
'dataset_version': '1.0',
'source_websites': [
'https://en.prothomalo.com',
'https://www.prothomalo.com'
],
'total_articles': len(all_articles),
'languages': languages,
'categories': categories,
'language_distribution': {
lang: len([a for a in all_articles if a['language'] == lang])
for lang in languages
},
'category_distribution': {
cat: len([a for a in all_articles if a['category'] == cat])
for cat in categories
},
'word_count_stats': {
'min': min(word_counts),
'max': max(word_counts),
'mean': sum(word_counts) / len(word_counts),
'total_words': sum(word_counts)
},
'scraping_method': 'comprehensive_concurrent',
'features': [
'title', 'content', 'content_clean', 'summary',
'category', 'language', 'author', 'word_count',
'text_for_training'
],
'intended_use': 'Language model fine-tuning and Bengali-English NLP research',
'license': 'Research use - subject to Prothom Alo terms of service',
'model_training_ready': True
}
with open(f"{dataset_path}/dataset_metadata.json", 'w') as f:
json.dump(metadata, f, indent=2)
# Test loading
from datasets import load_from_disk
loaded = load_from_disk(dataset_path)
logger.info(f"βœ… Dataset loading test passed")
# Show statistics
logger.info(f"\nπŸ“Š Enhanced Dataset Statistics:")
logger.info(f"Total articles: {len(all_articles)}")
logger.info(f"Languages: {languages}")
logger.info(f"Categories: {categories}")
logger.info(f"Word count range: {min(word_counts)} - {max(word_counts)}")
logger.info(f"Average words per article: {sum(word_counts) / len(word_counts):.0f}")
return dataset_path
except Exception as e:
logger.error(f"Save operation failed: {e}")
raise
def main():
    """Main execution for enhanced dataset creation"""
    logger.info("🚀 Enhanced Prothom Alo Dataset Creator")
    logger.info(60 * "=")
    try:
        # Build the scraper with the run configuration for this script.
        creator = EnhancedProthomAloScraper(max_articles=50, max_workers=4)

        # Stage 1: collect articles. Bail out early if nothing came back.
        corpus = creator.scrape_comprehensive_dataset()
        if not corpus:
            logger.error("❌ No articles were scraped")
            return

        # Stage 2: turn articles into split datasets, then persist them.
        splits = creator.create_enhanced_dataset(corpus)
        saved_path = creator.save_comprehensive_dataset(splits)

        logger.info(f"\n🎉 SUCCESS! Enhanced Prothom Alo dataset created!")
        logger.info(f"📁 Location: {saved_path}")
        logger.info(f"📊 Ready for model fine-tuning!")
        return saved_path
    except Exception as e:
        logger.error(f"❌ Enhanced dataset creation failed: {e}")
        raise


if __name__ == "__main__":
    main()