likhonsheikh committed on
Commit
5149c96
·
verified ·
1 Parent(s): 858d9ce

Add dataset creation script

Browse files
Files changed (1) hide show
  1. enhanced_dataset_creator.py +470 -0
enhanced_dataset_creator.py ADDED
@@ -0,0 +1,470 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Enhanced Prothom Alo Dataset Creator for Model Training
4
+ - Gets 50+ articles from both English and Bengali
5
+ - Includes multiple categories
6
+ - Prepares for fine-tuning
7
+ """
8
+
9
import concurrent.futures
import json
import logging
import re
import time
import unicodedata
from collections import Counter
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from datasets import Dataset, DatasetDict, Features, Value
21
+
22
# Logging is configured once at import time: INFO and above, with every record
# prefixed by its timestamp and severity. `logger` is the module-wide handle.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
25
+
26
@dataclass
class Article:
    """One scraped news article plus the fields derived from it for training."""

    # Required: supplied by the scraper for every article.
    title: str
    content: str
    url: str
    category: str
    language: str

    # Optional: fall back to these defaults when the caller does not set them.
    author: str = "Prothom Alo"
    published_date: str = ""
    word_count: int = 0
    content_clean: str = ""
    summary: str = ""
39
+
40
class EnhancedProthomAloScraper:
    """Scrape Prothom Alo section pages and package the articles as a dataset.

    Typical use is ``scrape_comprehensive_dataset()`` followed by
    ``create_enhanced_dataset()`` and ``save_comprehensive_dataset()``.
    """

    # (section URL, category label, language label) for every page scraped.
    TARGET_PAGES = (
        # English pages
        ('https://en.prothomalo.com/', 'general', 'english'),
        ('https://en.prothomalo.com/opinion/', 'opinion', 'english'),
        ('https://en.prothomalo.com/bangladesh/', 'bangladesh', 'english'),
        ('https://en.prothomalo.com/international/', 'international', 'english'),
        ('https://en.prothomalo.com/sports/', 'sports', 'english'),
        ('https://en.prothomalo.com/business/', 'business', 'english'),
        # Bengali pages
        ('https://www.prothomalo.com/', 'general', 'bengali'),
        ('https://www.prothomalo.com/opinion/', 'opinion', 'bengali'),
        ('https://www.prothomalo.com/bangladesh/', 'bangladesh', 'bengali'),
        ('https://www.prothomalo.com/international/', 'international', 'bengali'),
        ('https://www.prothomalo.com/sports/', 'sports', 'bengali'),
        ('https://www.prothomalo.com/business/', 'business', 'bengali'),
    )

    # CSS selectors, tried in order, most specific first.
    CONTENT_SELECTORS = (
        '.article-content p',
        '.story-content p',
        '.content p',
        'article p',
        'p',
    )
    AUTHOR_SELECTORS = ('.author', '.byline', '.writer', '.reporter')
    DATE_SELECTORS = ('time', '.date', '.published', '.timestamp')
    LINK_PATTERNS = (
        'h1 a', 'h2 a', 'h3 a',
        '.headline a', '.title a',
        'a[href*="article"]', 'a[href*="news"]',
        '.news-item a', '.article-item a',
    )

    MAX_LINKS_PER_PAGE = 10      # article fetches attempted per section page
    MIN_TITLE_CHARS = 10         # shorter link texts are treated as navigation
    MIN_CONTENT_CHARS = 100      # shorter bodies are discarded
    SUMMARY_WORDS = 200          # words kept in the naive lead summary
    REQUEST_DELAY_SECONDS = 0.2  # pause before each article request
    MIN_ROWS_FOR_SPLIT = 3       # three non-empty disjoint splits need 3 rows

    # Punctuation that clean_text keeps; U+0964/U+0965 are the danda and
    # double danda that terminate Bengali sentences.
    _KEPT_PUNCTUATION = frozenset("-.,!?;:()\u0964\u0965")
    # Zero-width non-joiner / joiner, used to shape Bengali conjuncts.
    _JOINERS = frozenset("\u200c\u200d")

    def __init__(self, max_articles: int = 100, max_workers: int = 3):
        """Create a scraper.

        Args:
            max_articles: Upper bound on articles in the final dataset.
            max_workers: Number of section pages scraped concurrently.
        """
        self.max_articles = max_articles
        self.max_workers = max_workers
        # NOTE(review): this one Session is shared by all worker threads;
        # confirm that is acceptable for the requests version in use.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def clean_text(self, text: str) -> str:
        """Strip symbols and normalise whitespace without damaging Bengali text.

        Letters, digits, underscores, whitespace and a small set of
        punctuation marks are kept; every other character becomes a space.
        Combining marks (Unicode categories M*) and zero-width joiners are kept
        when they follow a kept character: the regex class ``\\w`` does not
        match them, so a ``[^\\w...]`` filter would delete Bengali vowel signs
        and the virama. Whitespace is collapsed last so that removed symbols
        never leave runs of spaces behind.

        Args:
            text: Raw text; ``None`` or an empty string yields ``""``.

        Returns:
            The cleaned, single-spaced, stripped text.
        """
        if not text:
            return ""

        pieces = []
        base_kept = False  # was the previous character a kept, non-space one?
        for ch in text:
            if ch in self._JOINERS or unicodedata.category(ch).startswith('M'):
                # A mark is only meaningful attached to a surviving base
                # character (this also drops emoji variation selectors).
                keep = base_kept
            else:
                keep = (
                    ch.isalnum()
                    or ch == '_'
                    or ch.isspace()
                    or ch in self._KEPT_PUNCTUATION
                )
            pieces.append(ch if keep else ' ')
            base_kept = keep and not ch.isspace()

        return re.sub(r'\s+', ' ', ''.join(pieces)).strip()

    def extract_article_content(self, soup: BeautifulSoup) -> Dict:
        """Pull title, body text, author and date out of a parsed article page.

        Args:
            soup: Parsed HTML of a single article page.

        Returns:
            A dict with the keys ``title``, ``content``, ``author`` and
            ``published_date``. On any parsing error a warning is logged and
            a dict with empty title/content and default author/date is
            returned instead of raising.
        """
        try:
            title_elem = soup.select_one('h1, .headline, .article-title')
            title = self.clean_text(title_elem.get_text()) if title_elem else ""

            # Use the first selector that yields actual text; a selector that
            # only matches empty paragraphs must not block the later ones.
            content = ""
            for selector in self.CONTENT_SELECTORS:
                cleaned = (self.clean_text(p.get_text()) for p in soup.select(selector))
                content = ' '.join(part for part in cleaned if part)
                if content:
                    break
            if not content:
                # Fallback: all text on the page.
                content = self.clean_text(soup.get_text())

            author = "Prothom Alo"
            for selector in self.AUTHOR_SELECTORS:
                author_elem = soup.select_one(selector)
                if author_elem:
                    # Keep the default when the element carries no text.
                    author = self.clean_text(author_elem.get_text()) or author
                    break

            # Defaults to the scrape time when the page exposes no date.
            published_date = datetime.now().isoformat()
            for selector in self.DATE_SELECTORS:
                date_elem = soup.select_one(selector)
                if date_elem:
                    published_date = (
                        date_elem.get('datetime')
                        or self.clean_text(date_elem.get_text())
                        or published_date
                    )
                    break

            return {
                'title': title,
                'content': content,
                'author': author,
                'published_date': published_date,
            }

        except Exception as e:
            logger.warning(f"Content extraction failed: {e}")
            return {
                'title': "",
                'content': "",
                'author': "Prothom Alo",
                'published_date': datetime.now().isoformat(),
            }

    @staticmethod
    def _resolve_url(page_url: str, href: str) -> Optional[str]:
        """Return *href* as an absolute http(s) URL without fragment, else None."""
        absolute, _fragment = urldefrag(urljoin(page_url, href.strip()))
        if urlparse(absolute).scheme not in ('http', 'https'):
            # javascript:, mailto:, tel: and similar are not fetchable pages.
            return None
        return absolute

    def _collect_candidate_links(self, soup: BeautifulSoup, page_url: str) -> list:
        """Return unique ``(absolute_url, link_text)`` pairs that look like articles."""
        # Seeding with the page itself skips "#..." and empty hrefs.
        seen_urls = {urldefrag(page_url)[0]}
        candidates = []
        for pattern in self.LINK_PATTERNS:
            for link in soup.select(pattern):
                # Filter on the title before de-duplicating so that an
                # image-only link cannot shadow the headline link to the
                # same article.
                title = self.clean_text(link.get_text())
                if len(title) < self.MIN_TITLE_CHARS:
                    continue
                absolute = self._resolve_url(page_url, link.get('href') or '')
                if absolute is None or absolute in seen_urls:
                    continue
                seen_urls.add(absolute)
                candidates.append((absolute, title))
        return candidates

    def _fetch_article(self, href: str, fallback_title: str, category: str,
                       language: str) -> Optional[Article]:
        """Download one article; return None when it is unusable."""
        response = self.session.get(href, timeout=10)
        if not response.ok:
            return None

        extracted = self.extract_article_content(
            BeautifulSoup(response.content, 'html.parser')
        )
        content = extracted['content']
        if not content or len(content) < self.MIN_CONTENT_CHARS:
            return None

        content_clean = self.clean_text(content)
        words = content_clean.split()

        # Naive summary: the leading words, with an ellipsis when truncated.
        summary = ' '.join(words[:self.SUMMARY_WORDS])
        if len(words) > self.SUMMARY_WORDS:
            summary += "..."

        return Article(
            title=extracted['title'] or fallback_title,
            content=content,
            url=href,
            category=category,
            language=language,
            author=extracted['author'],
            published_date=extracted['published_date'],
            word_count=len(words),
            content_clean=content_clean,
            summary=summary,
        )

    def extract_articles_from_page(self, url: str, category: str, language: str) -> List[Article]:
        """Scrape the articles linked from one section page.

        Links are resolved against *url*, so relative and protocol-relative
        hrefs work. At most ``MAX_LINKS_PER_PAGE`` article pages are fetched.

        Args:
            url: Section page to scan for article links.
            category: Category label stored on every resulting article.
            language: Language label stored on every resulting article.

        Returns:
            The articles scraped from this page; an empty list when the page
            itself cannot be fetched or parsed (the error is logged).
        """
        articles = []

        try:
            logger.info(f"Fetching {url} for {category} articles")
            response = self.session.get(url, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            candidates = self._collect_candidate_links(soup, url)
            logger.info(f"Found {len(candidates)} potential articles in {category}")

            limited = candidates[:self.MAX_LINKS_PER_PAGE]
            for number, (href, title) in enumerate(limited, start=1):
                try:
                    time.sleep(self.REQUEST_DELAY_SECONDS)  # rate limiting
                    article = self._fetch_article(href, title, category, language)
                except Exception as e:
                    # One bad article must not abort the rest of the page.
                    logger.warning(f"Failed to process article {number}: {e}")
                    continue

                if article is None:
                    continue

                articles.append(article)
                logger.info(f"  ✅ Article {number}: {article.word_count} words")

                if len(articles) >= self.max_articles:
                    break

            return articles

        except Exception as e:
            logger.error(f"Failed to fetch {url}: {e}")
            return []

    def scrape_comprehensive_dataset(self) -> List[Article]:
        """Scrape every page in ``TARGET_PAGES`` concurrently.

        Articles are de-duplicated by URL while results arrive, so the
        ``max_articles`` target counts unique articles. Once it is reached,
        pages that have not started yet are cancelled; pages already running
        are still waited for when the pool shuts down.

        Returns:
            At most ``max_articles`` unique articles.
        """
        logger.info(f"Starting comprehensive dataset creation (max: {self.max_articles} articles)")

        unique_articles = []
        seen_urls = set()

        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [
                executor.submit(self.extract_articles_from_page, url, category, language)
                for url, category, language in self.TARGET_PAGES
            ]

            for future in concurrent.futures.as_completed(futures):
                try:
                    articles = future.result()
                except Exception as e:
                    logger.error(f"Future processing failed: {e}")
                    continue

                logger.info(f"Collected {len(articles)} articles")
                for article in articles:
                    if article.url not in seen_urls:
                        seen_urls.add(article.url)
                        unique_articles.append(article)

                if len(unique_articles) >= self.max_articles:
                    logger.info(f"Reached target of {self.max_articles} articles")
                    # cancel() is a harmless no-op on finished/running tasks.
                    for pending in futures:
                        pending.cancel()
                    break

        logger.info(f"Final dataset: {len(unique_articles)} unique articles")
        return unique_articles[:self.max_articles]

    @staticmethod
    def _split_sizes(total: int) -> tuple:
        """Return ``(train, validation, test)`` row counts for an 80/10/10 split.

        Validation and test each get a tenth of the rows (rounded half up,
        at least one row); train gets the remainder.

        Raises:
            ValueError: If *total* is below 3, where three non-empty disjoint
                splits are impossible.
        """
        if total < 3:
            raise ValueError("At least 3 rows are needed for three non-empty splits")
        holdout = max(1, (total + 5) // 10)
        return total - 2 * holdout, holdout, holdout

    def create_enhanced_dataset(self, articles: List[Article]) -> DatasetDict:
        """Turn scraped articles into train/validation/test splits.

        Args:
            articles: Articles to include; must not be empty.

        Returns:
            A ``DatasetDict`` with ``train``, ``validation`` and ``test``.
            With fewer than ``MIN_ROWS_FOR_SPLIT`` articles the same rows are
            reused for all three splits, since disjoint splits are impossible.

        Raises:
            ValueError: If *articles* is empty.
        """
        if not articles:
            raise ValueError("No articles provided")

        logger.info(f"Creating enhanced dataset from {len(articles)} articles")

        article_dicts = [
            {
                'id': f"prothomalo_{i:04d}",
                'title': article.title,
                'content': article.content,
                'content_clean': article.content_clean,
                'summary': article.summary,
                'category': article.category,
                'language': article.language,
                'author': article.author,
                'url': article.url,
                'published_date': article.published_date,
                'word_count': article.word_count,
                'source': 'Prothom Alo',
                # Combined text
                'text_for_training': f"Title: {article.title}\n\nContent: {article.content_clean}",
            }
            for i, article in enumerate(articles, start=1)
        ]

        features = Features({
            'id': Value('string'),
            'title': Value('string'),
            'content': Value('string'),
            'content_clean': Value('string'),
            'summary': Value('string'),
            'category': Value('string'),
            'language': Value('string'),
            'author': Value('string'),
            'url': Value('string'),
            'published_date': Value('string'),
            'word_count': Value('int32'),
            'source': Value('string'),
            'text_for_training': Value('string'),
        })

        dataset = Dataset.from_list(article_dicts, features=features)

        if len(dataset) < self.MIN_ROWS_FOR_SPLIT:
            logger.warning(
                f"Only {len(dataset)} articles: reusing them for every split"
            )
            return DatasetDict({
                'train': dataset,
                'validation': dataset,
                'test': dataset,
            })

        # 80/10/10: shuffle once with a fixed seed, then cut by explicit row
        # counts so the proportions hold for small datasets too.
        n_train, n_val, n_test = self._split_sizes(len(dataset))
        shuffled = dataset.shuffle(seed=42)
        val_end = n_train + n_val
        final_dataset = DatasetDict({
            'train': shuffled.select(range(n_train)),
            'validation': shuffled.select(range(n_train, val_end)),
            'test': shuffled.select(range(val_end, val_end + n_test)),
        })

        logger.info("Dataset splits created:")
        for split, data in final_dataset.items():
            logger.info(f"  {split}: {len(data)} articles")

        return final_dataset

    def save_comprehensive_dataset(self, dataset: DatasetDict, output_dir: str = "enhanced_prothomalo"):
        """Write the dataset and a ``dataset_metadata.json`` summary to disk.

        Statistics are computed per unique article ``id``, so rows that occur
        in more than one split are counted once.

        Args:
            dataset: The splits to save.
            output_dir: Directory name, created relative to the working dir.

        Returns:
            The path the dataset was saved to (``"./<output_dir>"``).

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        try:
            dataset_path = f"./{output_dir}"
            dataset.save_to_disk(dataset_path)
            logger.info(f"✅ Dataset saved to: {dataset_path}")

            # Read only the columns the statistics need, keyed by id.
            rows_by_id = {}
            for split_data in dataset.values():
                columns = zip(
                    split_data['id'],
                    split_data['category'],
                    split_data['language'],
                    split_data['word_count'],
                )
                for row_id, category, language, word_count in columns:
                    rows_by_id[row_id] = (category, language, word_count)

            category_counts = Counter(row[0] for row in rows_by_id.values())
            language_counts = Counter(row[1] for row in rows_by_id.values())
            word_counts = [row[2] for row in rows_by_id.values()]

            categories = sorted(category_counts)
            languages = sorted(language_counts)
            total_articles = len(rows_by_id)
            total_words = sum(word_counts)
            min_words = min(word_counts, default=0)
            max_words = max(word_counts, default=0)
            mean_words = total_words / total_articles if total_articles else 0

            metadata = {
                'creation_date': datetime.now().isoformat(),
                'dataset_version': '1.0',
                'source_websites': [
                    'https://en.prothomalo.com',
                    'https://www.prothomalo.com',
                ],
                'total_articles': total_articles,
                'languages': languages,
                'categories': categories,
                'language_distribution': {
                    lang: language_counts[lang] for lang in languages
                },
                'category_distribution': {
                    cat: category_counts[cat] for cat in categories
                },
                'word_count_stats': {
                    'min': min_words,
                    'max': max_words,
                    'mean': mean_words,
                    'total_words': total_words,
                },
                'scraping_method': 'comprehensive_concurrent',
                'features': [
                    'title', 'content', 'content_clean', 'summary',
                    'category', 'language', 'author', 'word_count',
                    'text_for_training',
                ],
                'intended_use': 'Language model fine-tuning and Bengali-English NLP research',
                'license': 'Research use - subject to Prothom Alo terms of service',
                'model_training_ready': True,
            }

            metadata_file = Path(dataset_path) / "dataset_metadata.json"
            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)

            # Round-trip check: make sure what was written can be loaded.
            from datasets import load_from_disk
            load_from_disk(dataset_path)
            logger.info("✅ Dataset loading test passed")

            logger.info("\n📊 Enhanced Dataset Statistics:")
            logger.info(f"Total articles: {total_articles}")
            logger.info(f"Languages: {languages}")
            logger.info(f"Categories: {categories}")
            logger.info(f"Word count range: {min_words} - {max_words}")
            logger.info(f"Average words per article: {mean_words:.0f}")

            return dataset_path

        except Exception as e:
            logger.error(f"Save operation failed: {e}")
            raise
435
+
436
def main():
    """Run the whole pipeline: scrape, build the splits, save to disk.

    Returns:
        The path of the saved dataset, or ``None`` when no article could be
        scraped.

    Raises:
        Exception: Any pipeline failure is logged and re-raised unchanged.
    """
    logger.info("🚀 Enhanced Prothom Alo Dataset Creator")
    logger.info("=" * 60)

    try:
        scraper = EnhancedProthomAloScraper(max_articles=50, max_workers=4)
        articles = scraper.scrape_comprehensive_dataset()

        if not articles:
            logger.error("❌ No articles were scraped")
            return None

        dataset = scraper.create_enhanced_dataset(articles)
        dataset_path = scraper.save_comprehensive_dataset(dataset)

    except Exception as e:
        logger.error(f"❌ Enhanced dataset creation failed: {e}")
        raise

    else:
        logger.info("\n🎉 SUCCESS! Enhanced Prothom Alo dataset created!")
        logger.info(f"📁 Location: {dataset_path}")
        logger.info("📊 Ready for model fine-tuning!")
        return dataset_path


if __name__ == "__main__":
    main()