NMT / scripts /inject_slang_data.py
marconolimits's picture
deploy: clean orphan branch for HF Spaces - CPU threading optimisation
c7b4419
Raw
History Blame Contribute Delete
3.56 kB
import json
import os
from pathlib import Path
# Hand-written English/Italian phrase pairs, each stored as an
# (english, italian) tuple. They are grouped by theme below and exposed
# together, in order, as `custom_pairs`.

# Food and cooking phrases.
_CULINARY_PAIRS = [
    ("meatballs", "polpette"),
    ("spaghetti and meatballs", "spaghetti con le polpette"),
    (
        "I am cooking spaghetti and meatballs for dinner.",
        "Sto cucinando spaghetti con le polpette per cena.",
    ),
    ("This sauce is amazing.", "Questo sugo è incredibile."),
    ("Pass the parmesan cheese, please.", "Passami il parmigiano, per favore."),
    (
        "I'm starving, let's grab some grub.",
        "Sto morendo di fame, andiamo a mettere qualcosa sotto i denti.",
    ),
    ("Let's order some takeout.", "Ordiniamo qualcosa da asporto."),
]

# Casual speech, slang and idioms.
_SLANG_PAIRS = [
    ("What's up, bro?", "Come butta, fra?"),
    ("That's so cool!", "Che figata!"),
    ("I don't care, whatever.", "Non m'importa, fa lo stesso."),
    ("Chill out, man.", "Datti una calmata, amico."),
    ("You're driving me crazy.", "Mi fai impazzire."),
    ("Awesome!", "Mito!"),
    ("Stop messing around.", "Smettila di fare cazzate."),
    ("I screwed up.", "Ho fatto un casino."),
    ("No way!", "Non ci credo!"),
    ("Are you kidding me?", "Mi stai prendendo in giro?"),
    ("Let's bounce.", "Tagliamo la corda."),
    ("I'm broke.", "Sono al verde."),
    ("It's a piece of cake.", "È una passeggiata."),
    ("Don't pull my leg.", "Non prendermi in giro."),
    ("I'm totally out of it today.", "Oggi sono proprio fuso."),
    ("Catch you later.", "Ci becchiamo dopo."),
    ("That sucks.", "Fa schifo."),
    ("Give me a break.", "Fammi il piacere."),
    ("I'm pissed off.", "Sono incazzato nero."),
    ("It costs an arm and a leg.", "Costa un occhio della testa."),
    ("I'm dead tired.", "Sono stanco morto."),
    ("He completely lost it.", "Ha perso completamente la brocca."),
]

# Culinary entries first, then slang.
custom_pairs = _CULINARY_PAIRS + _SLANG_PAIRS
def main(data_dir=None, oversample=200, pairs=None):
    """Append oversampled custom phrase pairs to the training JSONL file.

    Each (english, italian) pair produces two rows, one per translation
    direction, tagged with dataset "custom_slang". The whole row set is
    appended `oversample` times to ``<data_dir>/train.jsonl``. When
    ``<data_dir>/metadata.json`` exists, its train/total counts are increased
    by the number of appended rows and "custom_slang" is added to its dataset
    list.

    Args:
        data_dir: Directory containing train.jsonl (and optionally
            metadata.json). Defaults to "data/en_it_v3".
        oversample: How many times the row set is appended. With the built-in
            29 pairs, the default of 200 yields 29 * 2 * 200 = 11,600 rows,
            chosen so the phrases carry weight against the ~800k-row corpus.
        pairs: Iterable of (english, italian) tuples. Defaults to the
            module-level ``custom_pairs``.

    Raises:
        ValueError: If ``oversample`` is negative.
        KeyError: If metadata.json exists but lacks counts/train/total.
    """
    if oversample < 0:
        raise ValueError(f"oversample must be non-negative, got {oversample}")

    data_dir = Path("data/en_it_v3") if data_dir is None else Path(data_dir)
    if pairs is None:
        pairs = custom_pairs
    train_file = data_dir / "train.jsonl"
    meta_file = data_dir / "metadata.json"

    # Opening in append mode would silently create the file, so bail out
    # explicitly when there is no training set to extend.
    if not train_file.exists():
        print(f"File {train_file} not found!")
        return

    rows = []
    for en, it in pairs:
        # Eng -> Ita
        rows.append({
            "source_text": en, "target_text": it,
            "source_lang": "eng_Latn", "target_lang": "ita_Latn", "dataset": "custom_slang"
        })
        # Ita -> Eng
        rows.append({
            "source_text": it, "target_text": en,
            "source_lang": "ita_Latn", "target_lang": "eng_Latn", "dataset": "custom_slang"
        })
    print(f"Generated {len(rows)} custom pairs.")
    added_count = len(rows) * oversample

    # Parse and update the metadata in memory BEFORE touching train.jsonl, so
    # a malformed or incomplete metadata file aborts the run without leaving
    # the training data and its recorded counts out of sync.
    meta = None
    if meta_file.exists():
        with open(meta_file, "r", encoding="utf-8") as f:
            meta = json.load(f)
        meta["counts"]["train"] += added_count
        meta["counts"]["total"] += added_count
        datasets = meta.setdefault("datasets", [])
        if "custom_slang" not in datasets:
            datasets.append("custom_slang")

    # If the existing file does not end with a newline, the first appended
    # record would be glued onto the last existing line and corrupt both.
    with open(train_file, "rb") as f:
        f.seek(0, os.SEEK_END)
        if f.tell() > 0:
            f.seek(-1, os.SEEK_END)
            needs_newline = f.read(1) != b"\n"
        else:
            needs_newline = False

    # Serialise the row set once; every oversampling pass writes the same text.
    payload = "".join(json.dumps(r, ensure_ascii=False) + "\n" for r in rows)
    with open(train_file, "a", encoding="utf-8") as f:
        if needs_newline and added_count:
            f.write("\n")
        for _ in range(oversample):
            f.write(payload)
    print(f"Successfully appended {added_count} injected records to {train_file}")

    if meta is not None:
        with open(meta_file, "w", encoding="utf-8") as f:
            json.dump(meta, f, indent=2)
        print("Updated metadata.json")


# Run the injection with default settings when executed as a script.
if __name__ == "__main__":
    main()