Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
| from pathlib import Path | |
# Custom slang / culinary phrases, stored as (English, Italian) tuples.

# Culinary: single words up to full sentences about food and eating.
_culinary_pairs = [
    ("meatballs", "polpette"),
    ("spaghetti and meatballs", "spaghetti con le polpette"),
    (
        "I am cooking spaghetti and meatballs for dinner.",
        "Sto cucinando spaghetti con le polpette per cena.",
    ),
    ("This sauce is amazing.", "Questo sugo è incredibile."),
    ("Pass the parmesan cheese, please.", "Passami il parmigiano, per favore."),
    (
        "I'm starving, let's grab some grub.",
        "Sto morendo di fame, andiamo a mettere qualcosa sotto i denti.",
    ),
    ("Let's order some takeout.", "Ordiniamo qualcosa da asporto."),
]

# Casual/Slang: informal and idiomatic everyday expressions.
_slang_pairs = [
    ("What's up, bro?", "Come butta, fra?"),
    ("That's so cool!", "Che figata!"),
    ("I don't care, whatever.", "Non m'importa, fa lo stesso."),
    ("Chill out, man.", "Datti una calmata, amico."),
    ("You're driving me crazy.", "Mi fai impazzire."),
    ("Awesome!", "Mito!"),
    ("Stop messing around.", "Smettila di fare cazzate."),
    ("I screwed up.", "Ho fatto un casino."),
    ("No way!", "Non ci credo!"),
    ("Are you kidding me?", "Mi stai prendendo in giro?"),
    ("Let's bounce.", "Tagliamo la corda."),
    ("I'm broke.", "Sono al verde."),
    ("It's a piece of cake.", "È una passeggiata."),
    ("Don't pull my leg.", "Non prendermi in giro."),
    ("I'm totally out of it today.", "Oggi sono proprio fuso."),
    ("Catch you later.", "Ci becchiamo dopo."),
    ("That sucks.", "Fa schifo."),
    ("Give me a break.", "Fammi il piacere."),
    ("I'm pissed off.", "Sono incazzato nero."),
    ("It costs an arm and a leg.", "Costa un occhio della testa."),
    ("I'm dead tired.", "Sono stanco morto."),
    ("He completely lost it.", "Ha perso completamente la brocca."),
]

# Culinary entries come first, followed by the casual/slang entries.
custom_pairs = _culinary_pairs + _slang_pairs
def _ends_with_newline(path):
    """Return True when *path* is empty or its final byte is a newline."""
    with open(path, "rb") as f:
        f.seek(0, os.SEEK_END)
        if f.tell() == 0:
            return True
        f.seek(-1, os.SEEK_END)
        return f.read(1) == b"\n"


def main(data_dir="data/en_it_v3", oversample=200, pairs=None):
    """Append oversampled custom translation pairs to the training set.

    Every (English, Italian) pair yields two rows, one per translation
    direction, tagged with dataset "custom_slang". The full set of rows is
    appended ``oversample`` times to ``<data_dir>/train.jsonl``. When
    ``<data_dir>/metadata.json`` exists, its ``counts["train"]`` and
    ``counts["total"]`` are increased by the number of appended rows and
    "custom_slang" is added to its ``datasets`` list if absent.

    With the defaults (29 pairs * 2 directions = 58 rows, oversampled 200
    times) this injects 11,600 records.

    If the training file does not exist, a message is printed and nothing is
    written. Calling this repeatedly appends the records again each time.

    Args:
        data_dir: Directory holding ``train.jsonl`` and, optionally,
            ``metadata.json``. A string or ``Path``.
        oversample: How many times the generated rows are appended.
        pairs: Iterable of (English, Italian) string tuples. Defaults to the
            module-level ``custom_pairs``.

    Raises:
        ValueError: If ``oversample`` is negative.
        KeyError: If the metadata file lacks the ``counts`` / ``datasets``
            entries that are updated here.
    """
    if oversample < 0:
        # A negative factor would write nothing yet decrement the metadata.
        raise ValueError(f"oversample must be >= 0, got {oversample}")
    if pairs is None:
        pairs = custom_pairs

    data_dir = Path(data_dir)
    train_file = data_dir / "train.jsonl"
    meta_file = data_dir / "metadata.json"

    if not train_file.exists():
        print(f"File {train_file} not found!")
        return

    rows = []
    for en, it in pairs:
        # Eng -> Ita
        rows.append({
            "source_text": en, "target_text": it,
            "source_lang": "eng_Latn", "target_lang": "ita_Latn",
            "dataset": "custom_slang",
        })
        # Ita -> Eng
        rows.append({
            "source_text": it, "target_text": en,
            "source_lang": "ita_Latn", "target_lang": "eng_Latn",
            "dataset": "custom_slang",
        })

    print(f"Generated {len(rows)} custom pairs.")

    added_count = len(rows) * oversample

    # Serialize once; the same text is written on every oversampling pass.
    payload = "".join(
        json.dumps(r, ensure_ascii=False) + "\n" for r in rows
    )
    # If the existing file's last line is unterminated, the first appended
    # record would be glued onto it and both JSON lines would be corrupted.
    needs_separator = added_count > 0 and not _ends_with_newline(train_file)

    with open(train_file, "a", encoding="utf-8") as f:
        if needs_separator:
            f.write("\n")
        for _ in range(oversample):
            f.write(payload)

    print(f"Successfully appended {added_count} injected records to {train_file}")

    if meta_file.exists():
        with open(meta_file, "r", encoding="utf-8") as f:
            meta = json.load(f)

        meta["counts"]["train"] += added_count
        meta["counts"]["total"] += added_count
        if "custom_slang" not in meta["datasets"]:
            meta["datasets"].append("custom_slang")

        with open(meta_file, "w", encoding="utf-8") as f:
            json.dump(meta, f, indent=2)
        print("Updated metadata.json")
# Run the injection only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()