File size: 4,581 Bytes
ad18db6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
Data loading utilities for NaijaSenti and BBC Pidgin datasets.

Loads Nigerian Pidgin text from multiple sources for language modeling.
Sentiment/category labels are ignored.
"""

from datasets import load_dataset
from typing import List, Dict, Any, Optional
import csv
import os

# Path to BBC Pidgin corpus (relative to project root)
BBC_PIDGIN_CORPUS_PATH = "bbc_pidgin_scraper/data/pidgin_corpus.csv"


def load_naijasenti_pcm() -> Dict[str, List[str]]:
    """
    Load the NaijaSenti PCM (Nigerian Pidgin) dataset.
    
    Returns:
        Dict with keys 'train', 'test', 'validation' containing text lists.
    """
    dataset = load_dataset("mteb/NaijaSenti", "pcm")
    
    result = {}
    for split in dataset.keys():
        # Extract text field, ignore sentiment labels
        result[split] = [example['text'] for example in dataset[split]]
    
    return result


def load_bbc_pidgin(limit: Optional[int] = None, project_root: Optional[str] = None) -> List[str]:
    """
    Load BBC Pidgin articles from the scraped corpus.
    
    The corpus contains headlines and article texts scraped from BBC Pidgin.
    We concatenate headline + text for each article.
    
    Args:
        limit: Maximum number of articles to load. None for all.
        project_root: Path to project root. Defaults to current working directory.
        
    Returns:
        List of article texts (headline + body combined).
    """
    if project_root is None:
        project_root = os.getcwd()
    
    corpus_path = os.path.join(project_root, BBC_PIDGIN_CORPUS_PATH)
    
    if not os.path.exists(corpus_path):
        print(f"Warning: BBC Pidgin corpus not found at {corpus_path}")
        return []
    
    texts = []
    try:
        with open(corpus_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for i, row in enumerate(reader):
                if limit and i >= limit:
                    break
                # Combine headline and text
                headline = row.get('headline', '').strip()
                text = row.get('text', '').strip()
                if headline and text:
                    combined = f"{headline}. {text}"
                    texts.append(combined)
                elif text:
                    texts.append(text)
    except Exception as e:
        print(f"Error loading BBC Pidgin corpus: {e}")
        return []
    
    print(f"Loaded {len(texts):,} BBC Pidgin articles")
    return texts


def load_all_texts(include_bbc: bool = True, bbc_limit: Optional[int] = None) -> List[str]:
    """
    Load all text from all sources combined.
    
    Combines NaijaSenti PCM dataset with BBC Pidgin articles
    for maximum data coverage.
    
    Args:
        include_bbc: Whether to include BBC Pidgin articles.
        bbc_limit: Maximum number of BBC articles to include.
        
    Returns:
        List of all text strings from all sources.
    """
    all_texts = []
    
    # Load NaijaSenti
    print("Loading NaijaSenti PCM dataset...")
    splits = load_naijasenti_pcm()
    for split_name, texts in splits.items():
        all_texts.extend(texts)
        print(f"  Loaded {len(texts):,} texts from {split_name} split")
    
    naija_total = len(all_texts)
    print(f"  NaijaSenti total: {naija_total:,} texts")
    
    # Load BBC Pidgin
    if include_bbc:
        print(f"\nLoading BBC Pidgin corpus (limit={bbc_limit})...")
        bbc_texts = load_bbc_pidgin(limit=bbc_limit)
        all_texts.extend(bbc_texts)
    
    print(f"\nCombined total: {len(all_texts):,} texts")
    return all_texts


def get_dataset_stats(texts: List[str]) -> Dict[str, Any]:
    """
    Compute basic statistics about the dataset.
    
    Args:
        texts: List of text strings.
        
    Returns:
        Dictionary of statistics.
    """
    total_chars = sum(len(t) for t in texts)
    total_words = sum(len(t.split()) for t in texts)
    
    return {
        'num_texts': len(texts),
        'total_characters': total_chars,
        'total_words': total_words,
        'avg_words_per_text': total_words / len(texts) if texts else 0,
        'avg_chars_per_text': total_chars / len(texts) if texts else 0,
    }


if __name__ == "__main__":
    # Quick test
    texts = load_all_texts(include_bbc=True)  # Loads all BBC articles by default
    stats = get_dataset_stats(texts)
    print("\nDataset Statistics:")
    for key, value in stats.items():
        if isinstance(value, float):
            print(f"  {key}: {value:.2f}")
        else:
            print(f"  {key}: {value:,}")