likhonsheikh
/

prothomalo-language-model

Safetensors

gpt2

Model card Files Files and versions

xet

Community

likhonsheikh committed on Nov 4, 2025

Commit

5149c96

verified ·

1 Parent(s): 858d9ce

Add dataset creation script

Browse files

Files changed (1) hide show

enhanced_dataset_creator.py +470 -0

enhanced_dataset_creator.py ADDED Viewed

	@@ -0,0 +1,470 @@

+#!/usr/bin/env python3
+"""
+Enhanced Prothom Alo Dataset Creator for Model Training
+- Gets 50+ articles from both English and Bengali
+- Includes multiple categories
+- Prepares for fine-tuning
+"""
+import requests
+from bs4 import BeautifulSoup
+import json
+import time
+import re
+from datetime import datetime
+from typing import Dict, List, Optional
+from datasets import Dataset, DatasetDict, Features, Value
+from dataclasses import dataclass
+import concurrent.futures
+import logging
+from pathlib import Path
# Configure the root logger once at import time (timestamp - level - message)
# and create the module-level logger used by everything below.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)
@dataclass
class Article:
    """One scraped news article, carrying the fields exported to the dataset.

    The first five fields are required; the remaining ones are optional and
    fall back to the defaults shown below when the caller does not supply
    them.
    """

    # --- required ---------------------------------------------------------
    title: str      # headline text
    content: str    # article body as extracted from the page
    url: str        # address the article was fetched from
    category: str   # section label supplied by the caller (e.g. "sports")
    language: str   # language label supplied by the caller

    # --- optional ---------------------------------------------------------
    author: str = "Prothom Alo"   # byline; defaults to the publication name
    published_date: str = ""      # publication date as a string
    word_count: int = 0           # number of words, filled in by the caller
    content_clean: str = ""       # normalised copy of ``content``
    summary: str = ""             # short excerpt of the article
class EnhancedProthomAloScraper:
    """Scrape Prothom Alo (English and Bengali) and package it as a dataset.

    Typical flow::

        scraper = EnhancedProthomAloScraper(max_articles=50)
        articles = scraper.scrape_comprehensive_dataset()
        dataset = scraper.create_enhanced_dataset(articles)
        scraper.save_comprehensive_dataset(dataset)

    Args:
        max_articles: Upper bound on the number of unique articles returned.
        max_workers: Number of section pages scraped concurrently.
        per_page_limit: Maximum number of article links followed per section
            page.
        request_delay: Seconds slept before each article request (politeness
            delay).
    """

    # (section URL, category label, language label) for every page crawled.
    TARGET_PAGES = (
        # English pages
        ('https://en.prothomalo.com/', 'general', 'english'),
        ('https://en.prothomalo.com/opinion/', 'opinion', 'english'),
        ('https://en.prothomalo.com/bangladesh/', 'bangladesh', 'english'),
        ('https://en.prothomalo.com/international/', 'international', 'english'),
        ('https://en.prothomalo.com/sports/', 'sports', 'english'),
        ('https://en.prothomalo.com/business/', 'business', 'english'),
        # Bengali pages
        ('https://www.prothomalo.com/', 'general', 'bengali'),
        ('https://www.prothomalo.com/opinion/', 'opinion', 'bengali'),
        ('https://www.prothomalo.com/bangladesh/', 'bangladesh', 'bengali'),
        ('https://www.prothomalo.com/international/', 'international', 'bengali'),
        ('https://www.prothomalo.com/sports/', 'sports', 'bengali'),
        ('https://www.prothomalo.com/business/', 'business', 'bengali'),
    )

    # CSS selectors, tried in order; the first one that yields text wins.
    _CONTENT_SELECTORS = (
        '.article-content p',
        '.story-content p',
        '.content p',
        'article p',
        'p',
    )
    _AUTHOR_SELECTORS = ('.author', '.byline', '.writer', '.reporter')
    _DATE_SELECTORS = ('time', '.date', '.published', '.timestamp')
    # Several link patterns because the section pages differ in structure.
    _LINK_PATTERNS = (
        'h1 a', 'h2 a', 'h3 a',
        '.headline a', '.title a',
        'a[href*="article"]', 'a[href*="news"]',
        '.news-item a', '.article-item a',
    )

    _MIN_TITLE_CHARS = 10      # shorter link texts are treated as navigation
    _MIN_CONTENT_CHARS = 100   # shorter bodies are treated as non-articles
    _SUMMARY_WORDS = 200       # summary = first N words of the cleaned body

    _WHITESPACE_RE = re.compile(r'\s+')
    # Characters NOT in this class are replaced by a space.  `\w` in the
    # standard `re` module matches letters and digits but not combining
    # marks, so the Bengali block is listed explicitly; otherwise vowel
    # signs would be stripped out of every Bengali word.
    _DISALLOWED_RE = re.compile(
        r"[^\w\s"
        r"\u0980-\u09FF"                  # Bengali block (letters, signs, digits)
        r"\u0964\u0965"                   # danda / double danda
        r"\u200C\u200D"                   # ZWNJ / ZWJ used inside conjuncts
        r"\u2018\u2019\u201C\u201D'\""    # apostrophes and quotation marks
        r"\-.,!?;:()]"
    )

    def __init__(self, max_articles: int = 100, max_workers: int = 3,
                 per_page_limit: int = 10, request_delay: float = 0.2):
        self.max_articles = max_articles
        self.max_workers = max_workers
        self.per_page_limit = per_page_limit
        self.request_delay = request_delay
        # NOTE(review): this one session is shared by all worker threads;
        # confirm that is acceptable for the `requests` version in use.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def clean_text(self, text: str) -> str:
        """Return *text* with unwanted symbols removed and whitespace collapsed.

        Letters, digits, underscore, Bengali script, quotes and the
        punctuation ``- . , ! ? ; : ( )`` are kept; every other character
        becomes a space.  Whitespace is collapsed *after* that substitution so
        the result never contains runs of spaces.  Falsy input yields ``""``.
        """
        if not text:
            return ""
        text = self._DISALLOWED_RE.sub(' ', text)
        return self._WHITESPACE_RE.sub(' ', text).strip()

    @staticmethod
    def _absolute_url(page_url: str, href: str) -> str:
        """Resolve *href* against the page it was found on."""
        from urllib.parse import urljoin

        return urljoin(page_url, href.strip())

    def extract_article_content(self, soup: "BeautifulSoup") -> Dict:
        """Pull title, body, author and date out of a parsed article page.

        Args:
            soup: Parsed HTML of a single article page.

        Returns:
            Dict with the keys ``title``, ``content``, ``author`` and
            ``published_date``.  Missing author falls back to
            ``"Prothom Alo"``; a missing date falls back to the current
            timestamp (i.e. the scrape time, not the publication time).  On
            any parsing error a dict with empty title/content is returned
            instead of raising.
        """
        try:
            title_elem = soup.select_one('h1, .headline, .article-title')
            title = self.clean_text(title_elem.get_text()) if title_elem else ""

            # Use the first selector that yields non-empty text; a selector
            # that only matches blank paragraphs must not end the search.
            content = ""
            for selector in self._CONTENT_SELECTORS:
                cleaned = (self.clean_text(p.get_text()) for p in soup.select(selector))
                content = ' '.join(part for part in cleaned if part)
                if content:
                    break
            if not content:
                # Fallback: all text on the page.
                content = self.clean_text(soup.get_text())

            author = "Prothom Alo"
            for selector in self._AUTHOR_SELECTORS:
                author_elem = soup.select_one(selector)
                if author_elem:
                    author = self.clean_text(author_elem.get_text()) or author
                    break

            published_date = datetime.now().isoformat()
            for selector in self._DATE_SELECTORS:
                date_elem = soup.select_one(selector)
                if date_elem:
                    # Prefer the machine-readable attribute over visible text.
                    published_date = (
                        date_elem.get('datetime')
                        or self.clean_text(date_elem.get_text())
                        or published_date
                    )
                    break

            return {
                'title': title,
                'content': content,
                'author': author,
                'published_date': published_date,
            }
        except Exception as e:
            logger.warning(f"Content extraction failed: {e}")
            return {
                'title': "",
                'content': "",
                'author': "Prothom Alo",
                'published_date': datetime.now().isoformat(),
            }

    def _collect_links(self, soup: "BeautifulSoup", page_url: str) -> List[tuple]:
        """Return de-duplicated ``(absolute_url, link_text)`` article candidates."""
        seen = set()
        candidates = []
        for pattern in self._LINK_PATTERNS:
            for link in soup.select(pattern):
                href = (link.get('href') or '').strip()
                title = self.clean_text(link.get_text())
                if not href or len(title) < self._MIN_TITLE_CHARS:
                    continue
                absolute = self._absolute_url(page_url, href)
                # Drops javascript:, mailto: and similar non-web targets.
                if not absolute.startswith(('http://', 'https://')):
                    continue
                if absolute in seen:
                    continue
                seen.add(absolute)
                candidates.append((absolute, title))
        return candidates

    def _fetch_article(self, href: str, fallback_title: str,
                       category: str, language: str) -> "Optional[Article]":
        """Download one article; return ``None`` if it is unusable."""
        response = self.session.get(href, timeout=10)
        if not response.ok:
            return None

        extracted = self.extract_article_content(
            BeautifulSoup(response.content, 'html.parser')
        )
        content = extracted['content']
        if not content or len(content) < self._MIN_CONTENT_CHARS:
            return None

        content_clean = self.clean_text(content)
        words = content_clean.split()
        summary = ' '.join(words[:self._SUMMARY_WORDS])
        if len(words) > self._SUMMARY_WORDS:
            summary += "..."

        return Article(
            title=extracted['title'] or fallback_title,
            content=content,
            url=href,
            category=category,
            language=language,
            author=extracted['author'],
            published_date=extracted['published_date'],
            word_count=len(words),
            content_clean=content_clean,
            summary=summary,
        )

    def extract_articles_from_page(self, url: str, category: str, language: str) -> "List[Article]":
        """Scrape the articles linked from one section page.

        Args:
            url: Section page to crawl.
            category: Category label stored on every article found.
            language: Language label stored on every article found.

        Returns:
            The articles successfully fetched (at most ``per_page_limit``
            links are followed).  Never raises: a failure to load the section
            page yields ``[]`` and a failing article is skipped.
        """
        articles = []
        try:
            logger.info(f"Fetching {url} for {category} articles")
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            candidates = self._collect_links(soup, url)
            logger.info(f"Found {len(candidates)} potential articles in {category}")

            for i, (href, title) in enumerate(candidates[:self.per_page_limit]):
                try:
                    time.sleep(self.request_delay)  # rate limiting
                    article = self._fetch_article(href, title, category, language)
                    if article is None:
                        continue
                    articles.append(article)
                    logger.info(f"  ✅ Article {i+1}: {article.word_count} words")
                    if len(articles) >= self.max_articles:
                        break
                except Exception as e:
                    logger.warning(f"Failed to process article {i+1}: {e}")
                    continue
            return articles
        except Exception as e:
            logger.error(f"Failed to fetch {url}: {e}")
            return []

    def scrape_comprehensive_dataset(self) -> "List[Article]":
        """Crawl every page in ``TARGET_PAGES`` and return unique articles.

        Pages are scraped concurrently.  Articles are de-duplicated by URL as
        results arrive, so the ``max_articles`` target is measured in *unique*
        articles.  Once it is reached, page jobs that have not started yet
        are cancelled; jobs already running are allowed to finish.

        Returns:
            At most ``max_articles`` unique articles.
        """
        logger.info(f"Starting comprehensive dataset creation (max: {self.max_articles} articles)")
        unique = {}  # url -> Article, insertion-ordered
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [
                executor.submit(self.extract_articles_from_page, url, category, language)
                for url, category, language in self.TARGET_PAGES
            ]
            for future in concurrent.futures.as_completed(futures):
                try:
                    articles = future.result()
                except Exception as e:
                    logger.error(f"Future processing failed: {e}")
                    continue
                for article in articles:
                    unique.setdefault(article.url, article)
                logger.info(f"Collected {len(articles)} articles")
                if len(unique) >= self.max_articles:
                    logger.info(f"Reached target of {self.max_articles} articles")
                    # Leaving the loop alone would not stop queued jobs: the
                    # executor waits for everything submitted on exit.
                    for pending in futures:
                        pending.cancel()
                    break

        unique_articles = list(unique.values())
        logger.info(f"Final dataset: {len(unique_articles)} unique articles")
        return unique_articles[:self.max_articles]

    def create_enhanced_dataset(self, articles: "List[Article]") -> "DatasetDict":
        """Convert articles into a ``DatasetDict`` with train/validation/test.

        Args:
            articles: Articles to include; must be non-empty.

        Returns:
            An 80/10/10 split (seed 42).  With fewer than three articles a
            real three-way split is impossible, so the same data is returned
            under all three split names.

        Raises:
            ValueError: If *articles* is empty.
        """
        if not articles:
            raise ValueError("No articles provided")
        logger.info(f"Creating enhanced dataset from {len(articles)} articles")

        article_dicts = [
            {
                'id': f"prothomalo_{i:04d}",
                'title': article.title,
                'content': article.content,
                'content_clean': article.content_clean,
                'summary': article.summary,
                'category': article.category,
                'language': article.language,
                'author': article.author,
                'url': article.url,
                'published_date': article.published_date,
                'word_count': article.word_count,
                'source': 'Prothom Alo',
                # Combined text fed to the language model.
                'text_for_training': f"Title: {article.title}\n\nContent: {article.content_clean}",
            }
            for i, article in enumerate(articles, start=1)
        ]

        features = Features({
            'id': Value('string'),
            'title': Value('string'),
            'content': Value('string'),
            'content_clean': Value('string'),
            'summary': Value('string'),
            'category': Value('string'),
            'language': Value('string'),
            'author': Value('string'),
            'url': Value('string'),
            'published_date': Value('string'),
            'word_count': Value('int32'),
            'source': Value('string'),
            'text_for_training': Value('string'),
        })
        dataset = Dataset.from_list(article_dicts, features=features)

        # With two rows the first split leaves a single training row, and
        # splitting that again would produce an empty train set.
        if len(dataset) < 3:
            logger.warning("Too few articles for a real split; reusing the same data for every split")
            return DatasetDict({
                'train': dataset,
                'validation': dataset,
                'test': dataset,
            })

        train_test = dataset.train_test_split(test_size=0.2, seed=42)
        # 0.125 of the remaining 80% is 10% of the total.
        val_test = train_test['train'].train_test_split(test_size=0.125, seed=42)
        final_dataset = DatasetDict({
            'train': val_test['train'],
            'validation': val_test['test'],
            'test': train_test['test'],
        })
        logger.info("Dataset splits created:")
        for split, data in final_dataset.items():
            logger.info(f"  {split}: {len(data)} articles")
        return final_dataset

    def save_comprehensive_dataset(self, dataset: "DatasetDict", output_dir: str = "enhanced_prothomalo"):
        """Write the dataset plus a ``dataset_metadata.json`` summary to disk.

        Args:
            dataset: Splits to save.
            output_dir: Directory name, created relative to the current
                working directory.

        Returns:
            The path the dataset was saved to, as ``"./<output_dir>"``.

        Raises:
            Exception: Any error from saving, writing metadata or the
                reload check is logged and re-raised.
        """
        try:
            dataset_path = f"./{output_dir}"
            dataset.save_to_disk(dataset_path)
            logger.info(f"✅ Dataset saved to: {dataset_path}")

            # Read only the columns needed for statistics instead of
            # materialising every full row.
            language_counts = Counter()
            category_counts = Counter()
            word_counts = []
            for split_data in dataset.values():
                language_counts.update(split_data['language'])
                category_counts.update(split_data['category'])
                word_counts.extend(split_data['word_count'])

            total_articles = len(word_counts)
            total_words = sum(word_counts)
            # Sorted so the metadata file is stable between runs.
            languages = sorted(language_counts)
            categories = sorted(category_counts)
            min_words = min(word_counts, default=0)
            max_words = max(word_counts, default=0)
            mean_words = total_words / total_articles if total_articles else 0.0

            metadata = {
                'creation_date': datetime.now().isoformat(),
                'dataset_version': '1.0',
                'source_websites': [
                    'https://en.prothomalo.com',
                    'https://www.prothomalo.com',
                ],
                'total_articles': total_articles,
                'languages': languages,
                'categories': categories,
                'language_distribution': {lang: language_counts[lang] for lang in languages},
                'category_distribution': {cat: category_counts[cat] for cat in categories},
                'word_count_stats': {
                    'min': min_words,
                    'max': max_words,
                    'mean': mean_words,
                    'total_words': total_words,
                },
                'scraping_method': 'comprehensive_concurrent',
                'features': [
                    'title', 'content', 'content_clean', 'summary',
                    'category', 'language', 'author', 'word_count',
                    'text_for_training',
                ],
                'intended_use': 'Language model fine-tuning and Bengali-English NLP research',
                'license': 'Research use - subject to Prothom Alo terms of service',
                'model_training_ready': True,
            }
            metadata_file = Path(dataset_path) / "dataset_metadata.json"
            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)

            # Round-trip check: make sure what was written can be read back.
            from datasets import load_from_disk
            load_from_disk(dataset_path)
            logger.info("✅ Dataset loading test passed")

            logger.info("\n📊 Enhanced Dataset Statistics:")
            logger.info(f"Total articles: {total_articles}")
            logger.info(f"Languages: {languages}")
            logger.info(f"Categories: {categories}")
            logger.info(f"Word count range: {min_words} - {max_words}")
            logger.info(f"Average words per article: {mean_words:.0f}")
            return dataset_path
        except Exception as e:
            logger.error(f"Save operation failed: {e}")
            raise
def main():
    """Run the full pipeline: scrape, build the dataset, save it.

    Returns the path the dataset was saved to, or ``None`` when nothing was
    scraped.  Any exception is logged and then re-raised to the caller.
    """
    logger.info("🚀 Enhanced Prothom Alo Dataset Creator")
    logger.info("=" * 60)
    try:
        scraper = EnhancedProthomAloScraper(max_articles=50, max_workers=4)
        scraped = scraper.scrape_comprehensive_dataset()
        if scraped:
            # Build the splits, then persist them together with metadata.
            splits = scraper.create_enhanced_dataset(scraped)
            saved_to = scraper.save_comprehensive_dataset(splits)
            logger.info("\n🎉 SUCCESS! Enhanced Prothom Alo dataset created!")
            logger.info(f"📁 Location: {saved_to}")
            logger.info("📊 Ready for model fine-tuning!")
            return saved_to
        logger.error("❌ No articles were scraped")
        return None
    except Exception as e:
        logger.error(f"❌ Enhanced dataset creation failed: {e}")
        raise


if __name__ == "__main__":
    main()