# Source: prothomalo-language-model / enhanced_dataset_creator.py
# Uploaded by likhonsheikh — "Add dataset creation script" (commit 5149c96, verified)
#!/usr/bin/env python3
"""
Enhanced Prothom Alo Dataset Creator for Model Training
- Gets 50+ articles from both English and Bengali
- Includes multiple categories
- Prepares for fine-tuning
"""
import requests
from bs4 import BeautifulSoup
import json
import time
import re
from datetime import datetime
from typing import Dict, List, Optional
from datasets import Dataset, DatasetDict, Features, Value
from dataclasses import dataclass
import concurrent.futures
import logging
from pathlib import Path
# Configure root logging once at import time (INFO level, timestamped lines);
# the module-level `logger` is used by every class/function in this file.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
@dataclass
class Article:
    """A single scraped news article: the raw scraped fields plus derived
    fields (cleaned text, word count, summary) used for training-data prep."""
    # Required fields, populated directly from the scraped page.
    title: str
    content: str
    url: str
    category: str       # section label, e.g. 'opinion', 'sports' (set by the scraper)
    language: str       # 'english' or 'bengali' (per the scraper's target pages)
    # Optional metadata with safe defaults when extraction finds nothing.
    author: str = "Prothom Alo"
    published_date: str = ""
    # Derived fields, computed during scraping.
    word_count: int = 0
    content_clean: str = ""
    summary: str = ""   # first ~200 words of content_clean, '...'-terminated if truncated
class EnhancedProthomAloScraper:
"""Enhanced scraper for comprehensive dataset creation"""
def __init__(self, max_articles: int = 100, max_workers: int = 3):
self.max_articles = max_articles
self.max_workers = max_workers
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
})
def clean_text(self, text: str) -> str:
"""Clean and normalize text"""
if not text:
return ""
# Remove extra whitespace
text = re.sub(r'\s+', ' ', text)
# Remove special characters but keep punctuation
text = re.sub(r'[^\w\s\-\.\,\!\?\;\:\(\)]', ' ', text)
# Strip and normalize
return text.strip()
def extract_article_content(self, soup: BeautifulSoup) -> Dict:
"""Extract article content with improved parsing"""
try:
# Title extraction
title_elem = soup.select_one('h1, .headline, .article-title')
title = self.clean_text(title_elem.get_text()) if title_elem else ""
# Content extraction
content_selectors = [
'.article-content p',
'.story-content p',
'.content p',
'article p',
'p'
]
content = ""
for selector in content_selectors:
paragraphs = soup.select(selector)
if paragraphs:
content = ' '.join([self.clean_text(p.get_text()) for p in paragraphs if p.get_text()])
break
if not content:
# Fallback: get all text content
content = self.clean_text(soup.get_text())
# Author extraction
author_selectors = ['.author', '.byline', '.writer', '.reporter']
author = "Prothom Alo"
for selector in author_selectors:
author_elem = soup.select_one(selector)
if author_elem:
author = self.clean_text(author_elem.get_text())
break
# Date extraction
date_selectors = ['time', '.date', '.published', '.timestamp']
published_date = datetime.now().isoformat()
for selector in date_selectors:
date_elem = soup.select_one(selector)
if date_elem:
if date_elem.get('datetime'):
published_date = date_elem.get('datetime')
else:
published_date = self.clean_text(date_elem.get_text())
break
return {
'title': title,
'content': content,
'author': author,
'published_date': published_date
}
except Exception as e:
logger.warning(f"Content extraction failed: {e}")
return {
'title': "",
'content': "",
'author': "Prothom Alo",
'published_date': datetime.now().isoformat()
}
def extract_articles_from_page(self, url: str, category: str, language: str) -> List[Article]:
"""Extract articles from a single page"""
articles = []
try:
logger.info(f"Fetching {url} for {category} articles")
response = self.session.get(url, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Multiple link patterns for different page structures
link_patterns = [
'h1 a', 'h2 a', 'h3 a',
'.headline a', '.title a',
'a[href*="article"]', 'a[href*="news"]',
'.news-item a', '.article-item a'
]
links = []
for pattern in link_patterns:
links.extend(soup.select(pattern))
# Remove duplicates
seen_urls = set()
unique_links = []
for link in links:
href = link.get('href', '')
if href and href not in seen_urls:
unique_links.append(link)
seen_urls.add(href)
logger.info(f"Found {len(unique_links)} potential articles in {category}")
# Process each article
for i, link in enumerate(unique_links[:10]): # Limit per page
try:
href = link.get('href', '')
title = self.clean_text(link.get_text())
if not href or not title or len(title) < 10:
continue
# Make URL absolute
if not href.startswith('http'):
if language == 'bengali':
href = 'https://www.prothomalo.com' + href
else:
href = 'https://en.prothomalo.com' + href
# Rate limiting
time.sleep(0.2)
# Fetch article
article_response = self.session.get(href, timeout=10)
if not article_response.ok:
continue
# Parse article
article_soup = BeautifulSoup(article_response.content, 'html.parser')
extracted = self.extract_article_content(article_soup)
content = extracted['content']
if not content or len(content) < 100:
continue
# Clean content and create summary
content_clean = self.clean_text(content)
word_count = len(content_clean.split())
# Create simple summary (first 200 words)
summary = ' '.join(content_clean.split()[:200])
if word_count > 200:
summary += "..."
article = Article(
title=extracted['title'] or title,
content=content,
url=href,
category=category,
language=language,
author=extracted['author'],
published_date=extracted['published_date'],
word_count=word_count,
content_clean=content_clean,
summary=summary
)
articles.append(article)
logger.info(f" βœ… Article {i+1}: {word_count} words")
if len(articles) >= self.max_articles:
break
except Exception as e:
logger.warning(f"Failed to process article {i+1}: {e}")
continue
return articles
except Exception as e:
logger.error(f"Failed to fetch {url}: {e}")
return []
def scrape_comprehensive_dataset(self) -> List[Article]:
"""Create a comprehensive dataset from multiple sources"""
logger.info(f"Starting comprehensive dataset creation (max: {self.max_articles} articles)")
# Define target pages
target_pages = [
# English pages
('https://en.prothomalo.com/', 'general', 'english'),
('https://en.prothomalo.com/opinion/', 'opinion', 'english'),
('https://en.prothomalo.com/bangladesh/', 'bangladesh', 'english'),
('https://en.prothomalo.com/international/', 'international', 'english'),
('https://en.prothomalo.com/sports/', 'sports', 'english'),
('https://en.prothomalo.com/business/', 'business', 'english'),
# Bengali pages
('https://www.prothomalo.com/', 'general', 'bengali'),
('https://www.prothomalo.com/opinion/', 'opinion', 'bengali'),
('https://www.prothomalo.com/bangladesh/', 'bangladesh', 'bengali'),
('https://www.prothomalo.com/international/', 'international', 'bengali'),
('https://www.prothomalo.com/sports/', 'sports', 'bengali'),
('https://www.prothomalo.com/business/', 'business', 'bengali'),
]
all_articles = []
# Use thread pool for concurrent processing
with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
futures = []
for url, category, language in target_pages:
future = executor.submit(self.extract_articles_from_page, url, category, language)
futures.append(future)
# Collect results
for future in concurrent.futures.as_completed(futures):
try:
articles = future.result()
all_articles.extend(articles)
logger.info(f"Collected {len(articles)} articles")
if len(all_articles) >= self.max_articles:
logger.info(f"Reached target of {self.max_articles} articles")
break
except Exception as e:
logger.error(f"Future processing failed: {e}")
# Remove duplicates based on URL
unique_articles = []
seen_urls = set()
for article in all_articles:
if article.url not in seen_urls:
unique_articles.append(article)
seen_urls.add(article.url)
logger.info(f"Final dataset: {len(unique_articles)} unique articles")
return unique_articles[:self.max_articles]
def create_enhanced_dataset(self, articles: List[Article]) -> DatasetDict:
"""Create enhanced dataset for model training"""
if not articles:
raise ValueError("No articles provided")
logger.info(f"Creating enhanced dataset from {len(articles)} articles")
# Convert to dictionaries with training-focused structure
article_dicts = []
for i, article in enumerate(articles):
article_dicts.append({
'id': f"prothomalo_{i+1:04d}",
'title': article.title,
'content': article.content,
'content_clean': article.content_clean,
'summary': article.summary,
'category': article.category,
'language': article.language,
'author': article.author,
'url': article.url,
'published_date': article.published_date,
'word_count': article.word_count,
'source': 'Prothom Alo',
'text_for_training': f"Title: {article.title}\n\nContent: {article.content_clean}", # Combined text
})
# Define features for training
features = Features({
'id': Value('string'),
'title': Value('string'),
'content': Value('string'),
'content_clean': Value('string'),
'summary': Value('string'),
'category': Value('string'),
'language': Value('string'),
'author': Value('string'),
'url': Value('string'),
'published_date': Value('string'),
'word_count': Value('int32'),
'source': Value('string'),
'text_for_training': Value('string')
})
# Create dataset
dataset = Dataset.from_list(article_dicts, features=features)
# Simple approach: create single dataset and split
if len(dataset) < 2:
return DatasetDict({
'train': dataset,
'validation': dataset,
'test': dataset
})
# Create 80/10/10 splits for all data together
train_test = dataset.train_test_split(test_size=0.2, seed=42)
val_test = train_test['train'].train_test_split(test_size=0.125, seed=42) # 10% of total
final_dataset = DatasetDict({
'train': val_test['train'],
'validation': val_test['test'],
'test': train_test['test']
})
logger.info("Dataset splits created:")
for split, data in final_dataset.items():
logger.info(f" {split}: {len(data)} articles")
return final_dataset
def save_comprehensive_dataset(self, dataset: DatasetDict, output_dir: str = "enhanced_prothomalo"):
"""Save comprehensive dataset with metadata"""
try:
# Save dataset
dataset_path = f"./{output_dir}"
dataset.save_to_disk(dataset_path)
logger.info(f"βœ… Dataset saved to: {dataset_path}")
# Create comprehensive metadata
all_articles = []
for split_data in dataset.values():
all_articles.extend(split_data)
# Analyze dataset
categories = list(set(article['category'] for article in all_articles))
languages = list(set(article['language'] for article in all_articles))
word_counts = [article['word_count'] for article in all_articles]
metadata = {
'creation_date': datetime.now().isoformat(),
'dataset_version': '1.0',
'source_websites': [
'https://en.prothomalo.com',
'https://www.prothomalo.com'
],
'total_articles': len(all_articles),
'languages': languages,
'categories': categories,
'language_distribution': {
lang: len([a for a in all_articles if a['language'] == lang])
for lang in languages
},
'category_distribution': {
cat: len([a for a in all_articles if a['category'] == cat])
for cat in categories
},
'word_count_stats': {
'min': min(word_counts),
'max': max(word_counts),
'mean': sum(word_counts) / len(word_counts),
'total_words': sum(word_counts)
},
'scraping_method': 'comprehensive_concurrent',
'features': [
'title', 'content', 'content_clean', 'summary',
'category', 'language', 'author', 'word_count',
'text_for_training'
],
'intended_use': 'Language model fine-tuning and Bengali-English NLP research',
'license': 'Research use - subject to Prothom Alo terms of service',
'model_training_ready': True
}
with open(f"{dataset_path}/dataset_metadata.json", 'w') as f:
json.dump(metadata, f, indent=2)
# Test loading
from datasets import load_from_disk
loaded = load_from_disk(dataset_path)
logger.info(f"βœ… Dataset loading test passed")
# Show statistics
logger.info(f"\nπŸ“Š Enhanced Dataset Statistics:")
logger.info(f"Total articles: {len(all_articles)}")
logger.info(f"Languages: {languages}")
logger.info(f"Categories: {categories}")
logger.info(f"Word count range: {min(word_counts)} - {max(word_counts)}")
logger.info(f"Average words per article: {sum(word_counts) / len(word_counts):.0f}")
return dataset_path
except Exception as e:
logger.error(f"Save operation failed: {e}")
raise
def main():
    """Main execution for enhanced dataset creation"""
    logger.info("🚀 Enhanced Prothom Alo Dataset Creator")
    logger.info(60 * "=")
    try:
        # Build the scraper with the run configuration for this script.
        creator = EnhancedProthomAloScraper(max_articles=50, max_workers=4)

        # Stage 1: collect articles. Bail out early if nothing came back.
        corpus = creator.scrape_comprehensive_dataset()
        if not corpus:
            logger.error("❌ No articles were scraped")
            return

        # Stage 2: turn articles into split datasets, then persist them.
        splits = creator.create_enhanced_dataset(corpus)
        saved_path = creator.save_comprehensive_dataset(splits)

        logger.info(f"\n🎉 SUCCESS! Enhanced Prothom Alo dataset created!")
        logger.info(f"📁 Location: {saved_path}")
        logger.info(f"📊 Ready for model fine-tuning!")
        return saved_path
    except Exception as e:
        logger.error(f"❌ Enhanced dataset creation failed: {e}")
        raise


if __name__ == "__main__":
    main()