File size: 1,414 Bytes
d86a963
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
"""
Data preparation for encoder masked-language-model (MLM) training.

Writes train/validation/test CSV splits, each with a single ``en`` column.
"""

import os
import pandas as pd

def create_mlm_data(
    output_dir: str = "data",
    n_train: int = 1000,
    n_val: int = 100,
    n_test: int = 50,
    target_sentence: str = "This model create relationships between the words to learn what word is missing!",
) -> None:
    """Create train/val/test CSV files for the MLM model.

    Every split consists of ``target_sentence`` repeated a fixed number of
    times and is written to ``<output_dir>/<split>.csv`` with a single
    ``en`` column. A short summary is printed afterwards.

    Called with no arguments, this writes ``data/train.csv`` (1000 rows),
    ``data/val.csv`` (100 rows) and ``data/test.csv`` (50 rows).

    Args:
        output_dir: Directory that receives the CSV files; created
            (including parents) if it does not exist. An empty string
            means the current working directory.
        n_train: Number of rows in ``train.csv``.
        n_val: Number of rows in ``val.csv``.
        n_test: Number of rows in ``test.csv``.
        target_sentence: Sentence repeated in every row of every split.
            NOTE(review): the default wording is kept verbatim because it
            is runtime data; confirm with the model owner before editing.

    Raises:
        ValueError: If any of the split sizes is negative.
    """
    split_sizes = {"train": n_train, "val": n_val, "test": n_test}

    # Validate everything up front so a bad argument never leaves a
    # half-written dataset on disk.
    for split_name, size in split_sizes.items():
        if size < 0:
            raise ValueError(f"n_{split_name} must be non-negative, got {size}")

    # os.makedirs("") raises, so only create a directory when one was named.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for split_name, size in split_sizes.items():
        # All rows are identical, so list repetition replaces an append loop.
        samples = [target_sentence] * size
        csv_path = os.path.join(output_dir, f"{split_name}.csv")
        pd.DataFrame({"en": samples}).to_csv(csv_path, index=False)

    print("Created MLM training data:")
    print(f"  Target sentence: '{target_sentence}'")
    print(f"  Train: {n_train} samples")
    print(f"  Val: {n_val} samples")
    print(f"  Test: {n_test} samples")
    print(f"  Words in sentence: {len(target_sentence.split())}")

# Script entry point: generate the CSV splits only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    create_mlm_data()