File size: 2,539 Bytes
e1d9ec2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import pandas as pd
import os
import random

def create_dummy_data(output_dir: str = "data", seed: "int | None" = None) -> None:
    """Generate dummy transliteration data for Hindi, Bengali, and Tamil.

    Builds 30 (romanized source, native-script target, language code) pairs,
    repeats them five times, shuffles the rows, and writes an 80/10/10
    train/val/test split as ``train.csv``, ``val.csv`` and ``test.csv``.
    Each CSV has the columns ``source``, ``target`` and ``lang``.

    Note:
        Because the rows are duplicated *before* splitting, the same pairs
        appear in all three splits. That is acceptable for smoke-testing a
        pipeline but the resulting metrics are not meaningful.

    Args:
        output_dir: Directory the CSV files are written to. It is created
            (including missing parents) when it does not exist. Defaults to
            ``"data"`` relative to the current working directory.
        seed: Optional seed for the shuffle. When given, the split is
            reproducible and the global ``random`` state is left untouched.
            When ``None`` (the default), the module-level ``random.shuffle``
            is used, so the order follows the global random state.
    """
    # Minimal dummy dataset: (source, target, lang)
    base_pairs = [
        # Hindi
        ("namaste", "नमस्ते", "hi"),
        ("aap", "आप", "hi"),
        ("kya", "क्या", "hi"),
        ("kar", "कर", "hi"),
        ("rahe", "रहे", "hi"),
        ("ho", "हो", "hi"),
        ("mera", "मेरा", "hi"),
        ("naam", "नाम", "hi"),
        ("hai", "है", "hi"),
        ("bharat", "भारत", "hi"),

        # Bengali
        ("namoshkar", "নমস্কার", "bn"),
        ("apni", "আপনি", "bn"),
        ("kemon", "কেমন", "bn"),
        ("achen", "আছেন", "bn"),
        ("amar", "আমার", "bn"),
        ("nam", "নাম", "bn"),
        ("bangla", "বাংলা", "bn"),
        ("desh", "দেশ", "bn"),
        ("khabar", "খাবার", "bn"),
        ("jal", "জল", "bn"),

        # Tamil
        ("vanakkam", "வணக்கம்", "ta"),
        ("neengal", "நீங்கள்", "ta"),
        ("eppadi", "எப்படி", "ta"),
        ("irukkeenga", "இருக்கிறீர்கள்", "ta"),
        ("en", "என்", "ta"),
        ("peyar", "பெயர்", "ta"),
        ("tamil", "தமிழ்", "ta"),
        ("nadu", "நாடு", "ta"),
        ("sapadu", "சாப்பாடு", "ta"),
        ("thanni", "தண்ணீர்", "ta"),
    ]

    # Expand data slightly by duplicating to simulate a larger set for split
    rows = base_pairs * 5
    if seed is None:
        # Keep the historical behaviour: draw from the global random state.
        random.shuffle(rows)
    else:
        # A private generator makes the split reproducible without
        # reseeding (and thereby disturbing) the global random state.
        random.Random(seed).shuffle(rows)

    df = pd.DataFrame(rows, columns=["source", "target", "lang"])

    # Split into train, val, test (80-10-10); test receives any remainder.
    train_size = int(0.8 * len(df))
    val_size = int(0.1 * len(df))
    val_end = train_size + val_size

    splits = {
        "train": df.iloc[:train_size],
        "val": df.iloc[train_size:val_end],
        "test": df.iloc[val_end:],
    }

    os.makedirs(output_dir, exist_ok=True)

    for split_name, split_df in splits.items():
        # The targets are non-ASCII scripts, so state the encoding explicitly
        # rather than relying on the library default.
        split_df.to_csv(
            os.path.join(output_dir, f"{split_name}.csv"),
            index=False,
            encoding="utf-8",
        )

    print("Data generation complete.")
    for split_name, split_df in splits.items():
        print(f"{split_name.capitalize()} size: {len(split_df)}")

# Script entry point: generate the dummy CSV splits only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    create_dummy_data()