import json
import os
from pathlib import Path

# Hand-written (english, italian) phrase pairs, grouped by theme.
# Food and cooking vocabulary and sentences.
_CULINARY_PAIRS = (
    ("meatballs", "polpette"),
    ("spaghetti and meatballs", "spaghetti con le polpette"),
    (
        "I am cooking spaghetti and meatballs for dinner.",
        "Sto cucinando spaghetti con le polpette per cena.",
    ),
    ("This sauce is amazing.", "Questo sugo è incredibile."),
    ("Pass the parmesan cheese, please.", "Passami il parmigiano, per favore."),
    (
        "I'm starving, let's grab some grub.",
        "Sto morendo di fame, andiamo a mettere qualcosa sotto i denti.",
    ),
    ("Let's order some takeout.", "Ordiniamo qualcosa da asporto."),
)

# Informal register: greetings, exclamations and idioms.
_SLANG_PAIRS = (
    ("What's up, bro?", "Come butta, fra?"),
    ("That's so cool!", "Che figata!"),
    ("I don't care, whatever.", "Non m'importa, fa lo stesso."),
    ("Chill out, man.", "Datti una calmata, amico."),
    ("You're driving me crazy.", "Mi fai impazzire."),
    ("Awesome!", "Mito!"),
    ("Stop messing around.", "Smettila di fare cazzate."),
    ("I screwed up.", "Ho fatto un casino."),
    ("No way!", "Non ci credo!"),
    ("Are you kidding me?", "Mi stai prendendo in giro?"),
    ("Let's bounce.", "Tagliamo la corda."),
    ("I'm broke.", "Sono al verde."),
    ("It's a piece of cake.", "È una passeggiata."),
    ("Don't pull my leg.", "Non prendermi in giro."),
    ("I'm totally out of it today.", "Oggi sono proprio fuso."),
    ("Catch you later.", "Ci becchiamo dopo."),
    ("That sucks.", "Fa schifo."),
    ("Give me a break.", "Fammi il piacere."),
    ("I'm pissed off.", "Sono incazzato nero."),
    ("It costs an arm and a leg.", "Costa un occhio della testa."),
    ("I'm dead tired.", "Sono stanco morto."),
    ("He completely lost it.", "Ha perso completamente la brocca."),
)

# Public list consumed by main(): culinary entries first, then slang.
custom_pairs = [*_CULINARY_PAIRS, *_SLANG_PAIRS]

def _ends_with_newline(path):
    """Return True if the file at *path* is empty or its last byte is a newline."""
    with open(path, "rb") as f:
        f.seek(0, os.SEEK_END)
        if f.tell() == 0:
            return True
        f.seek(-1, os.SEEK_END)
        return f.read(1) == b"\n"


def main(pairs=None, data_dir="data/en_it_v3", oversample=200):
    """Append oversampled custom translation rows to the training set.

    Every (english, italian) pair yields two rows, one per translation
    direction. The resulting rows are appended ``oversample`` times to
    ``<data_dir>/train.jsonl``; if ``<data_dir>/metadata.json`` exists, its
    ``counts.train`` / ``counts.total`` values are incremented and
    ``"custom_slang"`` is added to its ``datasets`` list.

    With the defaults (the 29 built-in pairs, oversample of 200) this is
    29 * 2 * 200 = 11,600 injected records, chosen so the phrases stand out
    against a training set of roughly 800k rows.

    The function is not idempotent: each call appends the rows again.

    Args:
        pairs: Iterable of ``(english, italian)`` string tuples. Defaults to
            the module-level ``custom_pairs``.
        data_dir: Directory holding ``train.jsonl`` and ``metadata.json``.
        oversample: Number of times the full row set is written.

    Raises:
        ValueError: If ``oversample`` is negative.
        KeyError: If ``metadata.json`` exists but lacks the ``counts`` or
            ``datasets`` entries. This is raised before the train file is
            modified.
    """
    if oversample < 0:
        raise ValueError(f"oversample must be >= 0, got {oversample}")

    data_dir = Path(data_dir)
    train_file = data_dir / "train.jsonl"
    meta_file = data_dir / "metadata.json"

    if not train_file.exists():
        print(f"File {train_file} not found!")
        return

    if pairs is None:
        pairs = custom_pairs
    pairs = list(pairs)

    rows = []
    for en, it in pairs:
        # Eng -> Ita
        rows.append({
            "source_text": en, "target_text": it,
            "source_lang": "eng_Latn", "target_lang": "ita_Latn", "dataset": "custom_slang"
        })
        # Ita -> Eng
        rows.append({
            "source_text": it, "target_text": en,
            "source_lang": "ita_Latn", "target_lang": "eng_Latn", "dataset": "custom_slang"
        })

    print(f"Generated {len(rows)} custom rows from {len(pairs)} pairs.")

    added_count = len(rows) * oversample

    # Load and update the metadata in memory BEFORE touching the train file,
    # so a malformed metadata.json fails without leaving the two files
    # out of sync.
    meta = None
    if meta_file.exists():
        with open(meta_file, "r", encoding="utf-8") as f:
            meta = json.load(f)
        meta["counts"]["train"] += added_count
        meta["counts"]["total"] += added_count
        if "custom_slang" not in meta["datasets"]:
            meta["datasets"].append("custom_slang")

    # Serialise each row once; the same lines are written on every pass.
    lines = [json.dumps(r, ensure_ascii=False) + "\n" for r in rows]

    # If the existing file lacks a trailing newline, the first appended
    # record would be glued onto the last existing one and corrupt both.
    needs_separator = added_count > 0 and not _ends_with_newline(train_file)

    with open(train_file, "a", encoding="utf-8") as f:
        if needs_separator:
            f.write("\n")
        for _ in range(oversample):
            f.writelines(lines)

    print(f"Successfully appended {added_count} injected records to {train_file}")

    if meta is not None:
        with open(meta_file, "w", encoding="utf-8") as f:
            json.dump(meta, f, indent=2)
        print("Updated metadata.json")

# Run the injection only when this file is executed as a script, so that
# importing the module does not modify any files.
if __name__ == "__main__":
    main()