Upload generate_tag_sentences.py with huggingface_hub
Browse files- generate_tag_sentences.py +27 -0
generate_tag_sentences.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Build a phrase-pair dictionary from a parallel corpus.

Reads aligned source/target files line by line, keeps sentence pairs in
which BOTH sides are short (at most ``max_len`` whitespace tokens),
counts how often each distinct pair occurs, and writes the unique pairs
to a tab-separated file.
"""
from collections import Counter


def collect_phrase_pairs(source_lines, target_lines, max_len=4):
    """Return a Counter mapping (src, tgt) stripped-line pairs to counts.

    Parameters
    ----------
    source_lines, target_lines : iterables of str
        Aligned lines of the parallel corpus (file objects work too,
        so the corpora need not be loaded into memory).
    max_len : int
        Maximum number of whitespace tokens allowed on EACH side.

    Note: the original comment claimed "matching lengths", but the
    filter has always been a length cap on each side independently,
    not a src/tgt length-equality check.  Behavior is kept as a cap.
    """
    pairs = Counter()
    for src_line, tgt_line in zip(source_lines, target_lines):
        src_text = src_line.strip()
        tgt_text = tgt_line.strip()
        if len(src_text.split()) <= max_len and len(tgt_text.split()) <= max_len:
            pairs[(src_text, tgt_text)] += 1
    return pairs


def write_phrase_dict(phrase_dict, out_path):
    """Write unique phrase pairs as ``src<TAB>tgt`` lines.

    Counts are intentionally dropped to match the original output
    format (one line per unique pair, no frequency column).
    """
    with open(out_path, "w", encoding="utf-8") as out_file:
        for src, tgt in phrase_dict:
            out_file.write(f"{src}\t{tgt}\n")


if __name__ == "__main__":
    # Load your source-target data
    source_file = "/home/vikrant-MNMT/myenv/NMT_V2/train.src"  # mixed-language source file
    target_file = "/home/vikrant-MNMT/myenv/NMT_V2/train.tgt"  # corresponding target file

    # Stream both files in lockstep instead of readlines() so the whole
    # corpus is never held in memory at once.
    with open(source_file, "r", encoding="utf-8") as src, \
         open(target_file, "r", encoding="utf-8") as tgt:
        phrase_dict = collect_phrase_pairs(src, tgt)

    # Save to file
    write_phrase_dict(phrase_dict, "tags_dictionary.txt")