nextword-pidgin-api / src /data_loader.py
JermaineAI's picture
Fix API model loading: Copy src directory and update Dockerfile
ad18db6
"""
Data loading utilities for NaijaSenti and BBC Pidgin datasets.
Loads Nigerian Pidgin text from multiple sources for language modeling.
Sentiment/category labels are ignored.
"""
from datasets import load_dataset
from typing import List, Dict, Any, Optional
import csv
import os
# Path to BBC Pidgin corpus (relative to project root)
BBC_PIDGIN_CORPUS_PATH = "bbc_pidgin_scraper/data/pidgin_corpus.csv"
def load_naijasenti_pcm() -> Dict[str, List[str]]:
"""
Load the NaijaSenti PCM (Nigerian Pidgin) dataset.
Returns:
Dict with keys 'train', 'test', 'validation' containing text lists.
"""
dataset = load_dataset("mteb/NaijaSenti", "pcm")
result = {}
for split in dataset.keys():
# Extract text field, ignore sentiment labels
result[split] = [example['text'] for example in dataset[split]]
return result
def load_bbc_pidgin(limit: Optional[int] = None, project_root: Optional[str] = None) -> List[str]:
"""
Load BBC Pidgin articles from the scraped corpus.
The corpus contains headlines and article texts scraped from BBC Pidgin.
We concatenate headline + text for each article.
Args:
limit: Maximum number of articles to load. None for all.
project_root: Path to project root. Defaults to current working directory.
Returns:
List of article texts (headline + body combined).
"""
if project_root is None:
project_root = os.getcwd()
corpus_path = os.path.join(project_root, BBC_PIDGIN_CORPUS_PATH)
if not os.path.exists(corpus_path):
print(f"Warning: BBC Pidgin corpus not found at {corpus_path}")
return []
texts = []
try:
with open(corpus_path, 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
for i, row in enumerate(reader):
if limit and i >= limit:
break
# Combine headline and text
headline = row.get('headline', '').strip()
text = row.get('text', '').strip()
if headline and text:
combined = f"{headline}. {text}"
texts.append(combined)
elif text:
texts.append(text)
except Exception as e:
print(f"Error loading BBC Pidgin corpus: {e}")
return []
print(f"Loaded {len(texts):,} BBC Pidgin articles")
return texts
def load_all_texts(include_bbc: bool = True, bbc_limit: Optional[int] = None) -> List[str]:
"""
Load all text from all sources combined.
Combines NaijaSenti PCM dataset with BBC Pidgin articles
for maximum data coverage.
Args:
include_bbc: Whether to include BBC Pidgin articles.
bbc_limit: Maximum number of BBC articles to include.
Returns:
List of all text strings from all sources.
"""
all_texts = []
# Load NaijaSenti
print("Loading NaijaSenti PCM dataset...")
splits = load_naijasenti_pcm()
for split_name, texts in splits.items():
all_texts.extend(texts)
print(f" Loaded {len(texts):,} texts from {split_name} split")
naija_total = len(all_texts)
print(f" NaijaSenti total: {naija_total:,} texts")
# Load BBC Pidgin
if include_bbc:
print(f"\nLoading BBC Pidgin corpus (limit={bbc_limit})...")
bbc_texts = load_bbc_pidgin(limit=bbc_limit)
all_texts.extend(bbc_texts)
print(f"\nCombined total: {len(all_texts):,} texts")
return all_texts
def get_dataset_stats(texts: List[str]) -> Dict[str, Any]:
"""
Compute basic statistics about the dataset.
Args:
texts: List of text strings.
Returns:
Dictionary of statistics.
"""
total_chars = sum(len(t) for t in texts)
total_words = sum(len(t.split()) for t in texts)
return {
'num_texts': len(texts),
'total_characters': total_chars,
'total_words': total_words,
'avg_words_per_text': total_words / len(texts) if texts else 0,
'avg_chars_per_text': total_chars / len(texts) if texts else 0,
}
if __name__ == "__main__":
# Quick test
texts = load_all_texts(include_bbc=True) # Loads all BBC articles by default
stats = get_dataset_stats(texts)
print("\nDataset Statistics:")
for key, value in stats.items():
if isinstance(value, float):
print(f" {key}: {value:.2f}")
else:
print(f" {key}: {value:,}")