Upload generate_tag_sentences.py with huggingface_hub
Browse files- generate_tag_sentences.py +27 -0
generate_tag_sentences.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Build a phrase-pair dictionary from a parallel corpus.

Reads aligned source/target files line by line, keeps sentence pairs in
which BOTH sides are short (at most ``max_len`` whitespace tokens),
counts how often each distinct pair occurs, and writes the unique pairs
to a tab-separated file.
"""
from collections import Counter


def collect_phrase_pairs(source_lines, target_lines, max_len=4):
    """Return a Counter mapping (src, tgt) stripped-line pairs to counts.

    Parameters
    ----------
    source_lines, target_lines : iterables of str
        Aligned lines of the parallel corpus (file objects work too,
        so the corpora need not be loaded into memory).
    max_len : int
        Maximum number of whitespace tokens allowed on EACH side.

    Note: the original comment claimed "matching lengths", but the
    filter has always been a length cap on each side independently,
    not a src/tgt length-equality check.  Behavior is kept as a cap.
    """
    pairs = Counter()
    for src_line, tgt_line in zip(source_lines, target_lines):
        src_text = src_line.strip()
        tgt_text = tgt_line.strip()
        if len(src_text.split()) <= max_len and len(tgt_text.split()) <= max_len:
            pairs[(src_text, tgt_text)] += 1
    return pairs


def write_phrase_dict(phrase_dict, out_path):
    """Write unique phrase pairs as ``src<TAB>tgt`` lines.

    Counts are intentionally dropped to match the original output
    format (one line per unique pair, no frequency column).
    """
    with open(out_path, "w", encoding="utf-8") as out_file:
        for src, tgt in phrase_dict:
            out_file.write(f"{src}\t{tgt}\n")


if __name__ == "__main__":
    # Load your source-target data
    source_file = "/home/vikrant-MNMT/myenv/NMT_V2/train.src"  # mixed-language source file
    target_file = "/home/vikrant-MNMT/myenv/NMT_V2/train.tgt"  # corresponding target file

    # Stream both files in lockstep instead of readlines() so the whole
    # corpus is never held in memory at once.
    with open(source_file, "r", encoding="utf-8") as src, \
         open(target_file, "r", encoding="utf-8") as tgt:
        phrase_dict = collect_phrase_pairs(src, tgt)

    # Save to file
    write_phrase_dict(phrase_dict, "tags_dictionary.txt")