File size: 3,582 Bytes
fd2f49a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# scripts/fetch_parallel_data.py

from datasets import load_dataset
import os


def fetch_and_save_parallel_data(lang_pair, dataset_name, output_name):
    """
    Download a parallel dataset from the Hugging Face Hub and save it as two
    line-aligned plain-text files (one per language, one sentence per line).

    Args:
        lang_pair (str): Language pair, e.g., "en-ne" for English-Nepali.
            The part before the "-" is the source language, after it the target.
        dataset_name (str): The name of the dataset on the Hugging Face Hub.
        output_name (str): Base name for the output files; files are written to
            data/processed/<output_name>.<source_lang> and .<target_lang>.
    """
    source_lang, target_lang = lang_pair.split("-")
    output_dir = "data/processed"
    os.makedirs(output_dir, exist_ok=True)

    source_filepath = os.path.join(output_dir, f"{output_name}.{source_lang}")
    target_filepath = os.path.join(output_dir, f"{output_name}.{target_lang}")

    print(f"--- Starting download for {lang_pair} from {dataset_name} ---")

    # This dataset stores the pair in flat 'Sinhala'/'English' columns instead
    # of the conventional nested 'translation' dict, and takes no config name.
    # Hoisted out of the per-example loop: it is invariant across iterations.
    uses_flat_columns = (
        dataset_name == "Programmer-RD-AI/sinhala-english-singlish-translation"
    )

    try:
        # Load the dataset from Hugging Face
        if uses_flat_columns:
            dataset = load_dataset(dataset_name, split='train')
        else:
            dataset = load_dataset(dataset_name, lang_pair, split='train')
        print(f"Dataset loaded successfully. Total pairs: {len(dataset)}")

        print("Processing and saving files...")
        with open(source_filepath, "w", encoding="utf-8") as f_source, \
             open(target_filepath, "w", encoding="utf-8") as f_target:

            for example in dataset:
                if uses_flat_columns:
                    source_sentence = example['Sinhala']
                    target_sentence = example['English']
                else:
                    source_sentence = example['translation'][source_lang]
                    target_sentence = example['translation'][target_lang]

                # Strip BEFORE the emptiness check: a whitespace-only sentence
                # is truthy, and writing its stripped form would emit a blank
                # line in one file only, misaligning the parallel corpus.
                source_sentence = (source_sentence or "").strip()
                target_sentence = (target_sentence or "").strip()
                if source_sentence and target_sentence:
                    f_source.write(source_sentence + "\n")
                    f_target.write(target_sentence + "\n")

        print(f"Successfully saved data for {lang_pair}")
    except Exception as e:
        # Best-effort script: report the failure and let the caller move on
        # to the next dataset rather than aborting the whole run.
        print(f"An error occurred for {lang_pair}: {e}")

if __name__ == "__main__":
    # --- Fetch Nepali Data ---
    print("Fetching Nepali data...")
    fetch_and_save_parallel_data(
        lang_pair="en-ne",
        dataset_name="Helsinki-NLP/opus-100",
        output_name="nepali",
    )

    # --- Fetch Sinhalese Data ---
    print("\nFetching Sinhalese data...")
    fetch_and_save_parallel_data(
        lang_pair="si-en",
        dataset_name="Programmer-RD-AI/sinhala-english-singlish-translation",
        output_name="sinhala",
    )

    # --- Fetch Sinhalese Idioms Data ---
    # Appended ("a" mode) onto the sinhala.si / sinhala.en files written above.
    # Each example's 'text' field holds two newline-separated lines; the first
    # goes to the .en file and the second to the .si file — presumably
    # English first, Sinhala second (TODO confirm against the dataset card).
    print("\nFetching Sinhalese idioms data...")
    output_dir = "data/processed"
    try:
        idioms_dataset = load_dataset(
            "Venuraa/English-Sinhala-Idioms-Parallel-Translations", split='train'
        )
        print(f"Idioms dataset loaded successfully. Total pairs: {len(idioms_dataset)}")

        with open(os.path.join(output_dir, "sinhala.si"), "a", encoding="utf-8") as f_source, \
             open(os.path.join(output_dir, "sinhala.en"), "a", encoding="utf-8") as f_target:
            for example in idioms_dataset:
                parts = example['text'].split('\n')
                if len(parts) == 2:
                    # Strip and re-check both halves (consistent with
                    # fetch_and_save_parallel_data): a blank or
                    # whitespace-only half would otherwise append a blank
                    # line to one file and break line alignment.
                    english, sinhala = parts[0].strip(), parts[1].strip()
                    if english and sinhala:
                        f_target.write(english + "\n")
                        f_source.write(sinhala + "\n")
        print("Successfully appended idioms data.")
    except Exception as e:
        # Idioms are a bonus augmentation; failure here should not mask the
        # already-completed main downloads.
        print(f"An error occurred while fetching idioms data: {e}")
    print("\nAll data fetching complete.")