Spaces:

marconolimits
/

NMT

Sleeping

App Files Files Community

NMT / scripts /inject_slang_data.py

marconolimits

deploy: clean orphan branch for HF Spaces - CPU threading optimisation

c7b4419 2 months ago

Raw

History Blame Contribute Delete

3.56 kB

	import json
	import os
	from pathlib import Path

	# Hand-written English/Italian phrase pairs, grouped by theme.
	# Each entry is an (english, italian) tuple.
	_CULINARY_PAIRS = [
	    ("meatballs", "polpette"),
	    ("spaghetti and meatballs", "spaghetti con le polpette"),
	    (
	        "I am cooking spaghetti and meatballs for dinner.",
	        "Sto cucinando spaghetti con le polpette per cena.",
	    ),
	    ("This sauce is amazing.", "Questo sugo è incredibile."),
	    ("Pass the parmesan cheese, please.", "Passami il parmigiano, per favore."),
	    (
	        "I'm starving, let's grab some grub.",
	        "Sto morendo di fame, andiamo a mettere qualcosa sotto i denti.",
	    ),
	    ("Let's order some takeout.", "Ordiniamo qualcosa da asporto."),
	]

	_SLANG_PAIRS = [
	    ("What's up, bro?", "Come butta, fra?"),
	    ("That's so cool!", "Che figata!"),
	    ("I don't care, whatever.", "Non m'importa, fa lo stesso."),
	    ("Chill out, man.", "Datti una calmata, amico."),
	    ("You're driving me crazy.", "Mi fai impazzire."),
	    ("Awesome!", "Mito!"),
	    ("Stop messing around.", "Smettila di fare cazzate."),
	    ("I screwed up.", "Ho fatto un casino."),
	    ("No way!", "Non ci credo!"),
	    ("Are you kidding me?", "Mi stai prendendo in giro?"),
	    ("Let's bounce.", "Tagliamo la corda."),
	    ("I'm broke.", "Sono al verde."),
	    ("It's a piece of cake.", "È una passeggiata."),
	    ("Don't pull my leg.", "Non prendermi in giro."),
	    ("I'm totally out of it today.", "Oggi sono proprio fuso."),
	    ("Catch you later.", "Ci becchiamo dopo."),
	    ("That sucks.", "Fa schifo."),
	    ("Give me a break.", "Fammi il piacere."),
	    ("I'm pissed off.", "Sono incazzato nero."),
	    ("It costs an arm and a leg.", "Costa un occhio della testa."),
	    ("I'm dead tired.", "Sono stanco morto."),
	    ("He completely lost it.", "Ha perso completamente la brocca."),
	]

	# Culinary entries come first, followed by the casual/slang ones.
	custom_pairs = _CULINARY_PAIRS + _SLANG_PAIRS

	def main(oversample=200, data_dir="data/en_it_v3", pairs=None):
	    """Append oversampled slang/culinary pairs to the training set.

	    Every ``(english, italian)`` pair produces two records, one per
	    translation direction, each tagged ``"dataset": "custom_slang"``. The
	    whole record set is written ``oversample`` times to
	    ``<data_dir>/train.jsonl``. With the defaults (29 pairs, 200 repetitions)
	    that is 58 * 200 = 11,600 records; the original author's note says this
	    is meant to make the phrases stick against roughly 800k existing rows.

	    If ``<data_dir>/metadata.json`` exists, its ``counts.train`` and
	    ``counts.total`` are increased by the number of appended records and
	    ``"custom_slang"`` is added to ``datasets`` when missing. The metadata is
	    read and updated in memory *before* the training file is touched, so a
	    malformed metadata file aborts the run without a half-applied change.

	    Running the function again appends the records again; it does not detect
	    an earlier injection.

	    Args:
	        oversample: How many times the full record set is appended.
	        data_dir: Directory holding ``train.jsonl`` and ``metadata.json``.
	        pairs: Iterable of ``(english, italian)`` tuples. Defaults to the
	            module-level ``custom_pairs``.

	    Returns:
	        None. Prints a message and returns early when ``train.jsonl`` is
	        missing.

	    Raises:
	        ValueError: If ``oversample`` is negative.
	        KeyError: If ``metadata.json`` exists but lacks ``counts.train``,
	            ``counts.total`` or ``datasets``.
	    """
	    if oversample < 0:
	        # A negative factor would silently decrement the metadata counts.
	        raise ValueError(f"oversample must be >= 0, got {oversample}")
	    if pairs is None:
	        pairs = custom_pairs

	    data_dir = Path(data_dir)
	    train_file = data_dir / "train.jsonl"
	    meta_file = data_dir / "metadata.json"

	    if not train_file.exists():
	        print(f"File {train_file} not found!")
	        return

	    rows = []
	    for en, it in pairs:
	        # Eng -> Ita
	        rows.append({
	            "source_text": en, "target_text": it,
	            "source_lang": "eng_Latn", "target_lang": "ita_Latn", "dataset": "custom_slang"
	        })
	        # Ita -> Eng
	        rows.append({
	            "source_text": it, "target_text": en,
	            "source_lang": "ita_Latn", "target_lang": "eng_Latn", "dataset": "custom_slang"
	        })

	    print(f"Generated {len(rows)} custom pairs.")
	    added_count = len(rows) * oversample

	    # Load and update the metadata first: bad JSON or a missing key fails
	    # here, before the training file has been modified.
	    meta = None
	    if meta_file.exists():
	        with open(meta_file, "r", encoding="utf-8") as f:
	            meta = json.load(f)
	        meta["counts"]["train"] += added_count
	        meta["counts"]["total"] += added_count
	        if "custom_slang" not in meta["datasets"]:
	            meta["datasets"].append("custom_slang")

	    # Serialise once; the rows are identical on every oversample pass.
	    payload = "".join(json.dumps(r, ensure_ascii=False) + "\n" for r in rows)

	    # If the existing file does not end with a newline, appending directly
	    # would glue the first injected record onto the last existing line.
	    needs_separator = False
	    with open(train_file, "rb") as f:
	        f.seek(0, os.SEEK_END)
	        if f.tell() > 0:
	            f.seek(-1, os.SEEK_END)
	            needs_separator = f.read(1) != b"\n"

	    with open(train_file, "a", encoding="utf-8") as f:
	        if needs_separator and added_count:
	            f.write("\n")
	        for _ in range(oversample):
	            f.write(payload)

	    print(f"Successfully appended {added_count} injected records to {train_file}")

	    if meta is not None:
	        with open(meta_file, "w", encoding="utf-8") as f:
	            json.dump(meta, f, indent=2)
	        print("Updated metadata.json")

	# Run the injection only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()