# Elliot Sones
# Deploy v2 with LFS
# d86a963
"""
Data preparation for encoder MLM training.
"""
import os
import pandas as pd
def create_mlm_data(
    *,
    output_dir: str = "data",
    n_train: int = 1000,
    n_val: int = 100,
    n_test: int = 50,
) -> None:
    """Write the train/val/test CSV files used for MLM training.

    Every row of every split holds the same target sentence. Each split is
    written to ``<output_dir>/<split>.csv`` with a single ``en`` column and
    no index column. A short summary is printed once the files are written.

    Args:
        output_dir: Directory that receives ``train.csv``, ``val.csv`` and
            ``test.csv``. It is created (including parents) if missing.
        n_train: Number of rows in the training split.
        n_val: Number of rows in the validation split.
        n_test: Number of rows in the test split.
    """
    # NOTE(review): kept verbatim, grammar included -- this string is the
    # data itself, so editing it changes what gets written to the CSV files.
    target_sentence = "This model create relationships between the words to learn what word is missing!"

    # Each split is just the target sentence repeated the requested number
    # of times; a non-positive count yields an empty split.
    train_data = [target_sentence] * n_train
    val_data = [target_sentence] * n_val
    test_data = [target_sentence] * n_test

    os.makedirs(output_dir, exist_ok=True)
    for split_name, sentences in (
        ("train", train_data),
        ("val", val_data),
        ("test", test_data),
    ):
        csv_path = os.path.join(output_dir, f"{split_name}.csv")
        pd.DataFrame({"en": sentences}).to_csv(csv_path, index=False)

    print("Created MLM training data:")
    print(f" Target sentence: '{target_sentence}'")
    print(f" Train: {len(train_data)} samples")
    print(f" Val: {len(val_data)} samples")
    print(f" Test: {len(test_data)} samples")
    print(f" Words in sentence: {len(target_sentence.split())}")
# Generate the CSV files only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    create_mlm_data()