File size: 5,012 Bytes
9b9e393
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python3
"""
Data Preparation Script for EthioBBPE
Extracts text from parquet datasets and prepares training corpus
"""

import os
import glob
import pandas as pd
from pathlib import Path


def extract_text_from_parquet(parquet_path, text_columns=None):
    """
    Extract text from specified columns in a parquet file.

    Args:
        parquet_path: Path to parquet file
        text_columns: Column name, or list of column names, to extract.
            Names that are not present in the file are skipped. If None,
            every column whose dtype is ``object`` or a pandas string
            dtype is extracted.

    Returns:
        List of text strings: the non-null values of each selected column,
        converted with ``str`` and concatenated column by column.
    """
    df = pd.read_parquet(parquet_path)

    if text_columns is None:
        # Auto-detect text columns. Besides the classic ``object`` dtype,
        # accept pandas' dedicated string dtypes; depending on the pandas
        # version and parquet engine, text columns may be loaded as
        # ``StringDtype`` and would otherwise be skipped silently.
        text_columns = [
            col for col in df.columns
            if pd.api.types.is_object_dtype(df[col].dtype)
            or isinstance(df[col].dtype, pd.StringDtype)
        ]
    elif isinstance(text_columns, str):
        # A bare column name would otherwise be iterated character by character.
        text_columns = [text_columns]

    texts = []
    for col in text_columns:
        if col in df.columns:
            # Drop nulls first so they do not become the literal "None"/"nan".
            texts.extend(df[col].dropna().astype(str).tolist())

    return texts


def prepare_training_corpus(
    data_dir="./data",
    output_file="./data/training_corpus.txt",
    min_length=10,
    max_length=5000
):
    """
    Prepare training corpus from all parquet files in data directory.

    Every ``*.parquet`` file directly inside ``data_dir`` is read, its text
    values are stripped of surrounding whitespace, filtered by length and
    written one per line to ``output_file``. Files that fail to load are
    reported and skipped.

    Args:
        data_dir: Directory containing parquet files
        output_file: Output file path for combined corpus; its parent
            directory is created if it does not exist
        min_length: Minimum text length (after stripping) to include
        max_length: Maximum text length (after stripping) to include

    Returns:
        dict with statistics: ``total_files``, ``total_texts``,
        ``filtered_texts``, ``output_file``, ``total_characters``,
        ``average_length`` and ``unique_texts``

    Raises:
        FileNotFoundError: If ``data_dir`` contains no parquet files
        ValueError: If no text survives extraction and filtering
    """
    data_path = Path(data_dir)
    # Sort the matches: glob order depends on the filesystem, and the corpus
    # line order should be reproducible between runs and machines.
    parquet_files = sorted(data_path.glob("*.parquet"))

    if not parquet_files:
        raise FileNotFoundError(f"No parquet files found in {data_dir}")

    print(f"Found {len(parquet_files)} parquet file(s)")

    all_texts = []
    stats = {"total_files": 0, "total_texts": 0, "filtered_texts": 0}

    for pq_file in parquet_files:
        print(f"\nProcessing: {pq_file.name}")

        # Determine which columns to extract based on filename
        lowered_name = pq_file.name.lower()
        if "synaxarium" in lowered_name:
            text_cols = ["መጽሃፍ"]  # Book/content column
        elif "canon" in lowered_name or "biblical" in lowered_name:
            text_cols = ["ጥቅስ", "verse"]  # Verse columns
        else:
            text_cols = None  # Auto-detect

        # Best effort: one unreadable file must not abort the whole run, so
        # report it and move on. Only the load itself is guarded.
        try:
            texts = extract_text_from_parquet(str(pq_file), text_cols)
        except Exception as e:
            print(f"  ⚠️ Error processing {pq_file.name}: {e}")
            continue

        stats["total_files"] += 1
        stats["total_texts"] += len(texts)

        # Strip before filtering so the length limits apply to exactly what
        # is written; otherwise whitespace-padded or whitespace-only entries
        # pass the filter and end up as short or blank corpus lines.
        filtered = [
            stripped
            for stripped in (t.strip() for t in texts)
            if min_length <= len(stripped) <= max_length
        ]
        stats["filtered_texts"] += len(filtered)
        all_texts.extend(filtered)

        print(f"  - Extracted: {len(texts)} texts")
        print(f"  - After filtering: {len(filtered)} texts")

    if not all_texts:
        raise ValueError("No valid texts extracted from datasets")

    # Write to output file
    print(f"\n✍️ Writing {len(all_texts)} texts to {output_file}")
    # Make sure the destination directory exists before opening the file.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(f"{text}\n" for text in all_texts)

    # Calculate statistics (all_texts is guaranteed non-empty here)
    total_chars = sum(len(t) for t in all_texts)
    avg_length = total_chars / len(all_texts)

    stats.update({
        "output_file": output_file,
        "total_characters": total_chars,
        "average_length": round(avg_length, 2),
        "unique_texts": len(set(all_texts))
    })

    print("\n" + "="*60)
    print("📊 PREPARATION STATISTICS")
    print("="*60)
    print(f"Files processed:     {stats['total_files']}")
    print(f"Total texts:         {stats['total_texts']}")
    print(f"After filtering:     {stats['filtered_texts']}")
    print(f"Unique texts:        {stats['unique_texts']}")
    print(f"Total characters:    {stats['total_characters']:,}")
    print(f"Average length:      {stats['average_length']} chars")
    print(f"Output file:         {stats['output_file']}")
    print("="*60)

    return stats


if __name__ == "__main__":
    import argparse

    # Command-line options as (flag, value type, default, help text) rows;
    # the defaults mirror those of prepare_training_corpus.
    option_table = (
        ("--data_dir", str, "./data", "Directory with parquet files"),
        ("--output", str, "./data/training_corpus.txt", "Output corpus file"),
        ("--min_length", int, 10, "Minimum text length"),
        ("--max_length", int, 5000, "Maximum text length"),
    )

    cli = argparse.ArgumentParser(description="Prepare training corpus from parquet files")
    for flag, value_type, fallback, blurb in option_table:
        cli.add_argument(flag, type=value_type, default=fallback, help=blurb)
    args = cli.parse_args()

    # Build the corpus; the call prints its own statistics summary.
    corpus_options = {
        "data_dir": args.data_dir,
        "output_file": args.output,
        "min_length": args.min_length,
        "max_length": args.max_length,
    }
    stats = prepare_training_corpus(**corpus_options)

    print("\n✅ Corpus preparation complete!")
    print(f"Ready to train tokenizer with: {args.output}")