Spaces:
Sleeping
Sleeping
| """ | |
| Data loading utilities for NaijaSenti and BBC Pidgin datasets. | |
| Loads Nigerian Pidgin text from multiple sources for language modeling. | |
| Sentiment/category labels are ignored. | |
| """ | |
| from datasets import load_dataset | |
| from typing import List, Dict, Any, Optional | |
| import csv | |
| import os | |
| # Path to BBC Pidgin corpus (relative to project root) | |
| BBC_PIDGIN_CORPUS_PATH = "bbc_pidgin_scraper/data/pidgin_corpus.csv" | |
| def load_naijasenti_pcm() -> Dict[str, List[str]]: | |
| """ | |
| Load the NaijaSenti PCM (Nigerian Pidgin) dataset. | |
| Returns: | |
| Dict with keys 'train', 'test', 'validation' containing text lists. | |
| """ | |
| dataset = load_dataset("mteb/NaijaSenti", "pcm") | |
| result = {} | |
| for split in dataset.keys(): | |
| # Extract text field, ignore sentiment labels | |
| result[split] = [example['text'] for example in dataset[split]] | |
| return result | |
| def load_bbc_pidgin(limit: Optional[int] = None, project_root: Optional[str] = None) -> List[str]: | |
| """ | |
| Load BBC Pidgin articles from the scraped corpus. | |
| The corpus contains headlines and article texts scraped from BBC Pidgin. | |
| We concatenate headline + text for each article. | |
| Args: | |
| limit: Maximum number of articles to load. None for all. | |
| project_root: Path to project root. Defaults to current working directory. | |
| Returns: | |
| List of article texts (headline + body combined). | |
| """ | |
| if project_root is None: | |
| project_root = os.getcwd() | |
| corpus_path = os.path.join(project_root, BBC_PIDGIN_CORPUS_PATH) | |
| if not os.path.exists(corpus_path): | |
| print(f"Warning: BBC Pidgin corpus not found at {corpus_path}") | |
| return [] | |
| texts = [] | |
| try: | |
| with open(corpus_path, 'r', encoding='utf-8') as f: | |
| reader = csv.DictReader(f) | |
| for i, row in enumerate(reader): | |
| if limit and i >= limit: | |
| break | |
| # Combine headline and text | |
| headline = row.get('headline', '').strip() | |
| text = row.get('text', '').strip() | |
| if headline and text: | |
| combined = f"{headline}. {text}" | |
| texts.append(combined) | |
| elif text: | |
| texts.append(text) | |
| except Exception as e: | |
| print(f"Error loading BBC Pidgin corpus: {e}") | |
| return [] | |
| print(f"Loaded {len(texts):,} BBC Pidgin articles") | |
| return texts | |
| def load_all_texts(include_bbc: bool = True, bbc_limit: Optional[int] = None) -> List[str]: | |
| """ | |
| Load all text from all sources combined. | |
| Combines NaijaSenti PCM dataset with BBC Pidgin articles | |
| for maximum data coverage. | |
| Args: | |
| include_bbc: Whether to include BBC Pidgin articles. | |
| bbc_limit: Maximum number of BBC articles to include. | |
| Returns: | |
| List of all text strings from all sources. | |
| """ | |
| all_texts = [] | |
| # Load NaijaSenti | |
| print("Loading NaijaSenti PCM dataset...") | |
| splits = load_naijasenti_pcm() | |
| for split_name, texts in splits.items(): | |
| all_texts.extend(texts) | |
| print(f" Loaded {len(texts):,} texts from {split_name} split") | |
| naija_total = len(all_texts) | |
| print(f" NaijaSenti total: {naija_total:,} texts") | |
| # Load BBC Pidgin | |
| if include_bbc: | |
| print(f"\nLoading BBC Pidgin corpus (limit={bbc_limit})...") | |
| bbc_texts = load_bbc_pidgin(limit=bbc_limit) | |
| all_texts.extend(bbc_texts) | |
| print(f"\nCombined total: {len(all_texts):,} texts") | |
| return all_texts | |
| def get_dataset_stats(texts: List[str]) -> Dict[str, Any]: | |
| """ | |
| Compute basic statistics about the dataset. | |
| Args: | |
| texts: List of text strings. | |
| Returns: | |
| Dictionary of statistics. | |
| """ | |
| total_chars = sum(len(t) for t in texts) | |
| total_words = sum(len(t.split()) for t in texts) | |
| return { | |
| 'num_texts': len(texts), | |
| 'total_characters': total_chars, | |
| 'total_words': total_words, | |
| 'avg_words_per_text': total_words / len(texts) if texts else 0, | |
| 'avg_chars_per_text': total_chars / len(texts) if texts else 0, | |
| } | |
| if __name__ == "__main__": | |
| # Quick test | |
| texts = load_all_texts(include_bbc=True) # Loads all BBC articles by default | |
| stats = get_dataset_stats(texts) | |
| print("\nDataset Statistics:") | |
| for key, value in stats.items(): | |
| if isinstance(value, float): | |
| print(f" {key}: {value:.2f}") | |
| else: | |
| print(f" {key}: {value:,}") | |