Vikrantyadav11234 committed on
Commit
0c8e7f3
·
verified ·
1 Parent(s): 86d1e0d

Upload generate_tag_sentences.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. generate_tag_sentences.py +27 -0
generate_tag_sentences.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build a tab-separated dictionary of short source/target phrase pairs.

Reads two line-aligned parallel files, keeps every pair in which both sides
are non-empty and at most ``MAX_PHRASE_WORDS`` whitespace-separated words
long, and writes each distinct pair once as ``source<TAB>target``.
"""

import sys
from collections import Counter
from itertools import zip_longest
from typing import Iterable, TextIO

# Load your source-target data
SOURCE_FILE = "/home/vikrant-MNMT/myenv/NMT_V2/train.src"  # Your mixed-language source file
TARGET_FILE = "/home/vikrant-MNMT/myenv/NMT_V2/train.tgt"  # Corresponding target file
OUTPUT_FILE = "tags_dictionary.txt"

# Maximum number of words allowed on EACH side of a pair (customize as needed).
MAX_PHRASE_WORDS = 4


def collect_phrase_pairs(
    source_lines: Iterable[str],
    target_lines: Iterable[str],
    max_words: int = MAX_PHRASE_WORDS,
) -> Counter:
    """Count the short phrase pairs found in two line-aligned sequences.

    Args:
        source_lines: Source-side lines (any iterable, e.g. an open file).
        target_lines: Target-side lines, aligned one-to-one with the source.
        max_words: Upper bound on the word count of each side of a pair.

    Returns:
        A ``Counter`` keyed by ``(source, target)`` tuples of stripped lines,
        in first-seen order, whose values are the number of occurrences.
    """
    pair_counts = Counter()
    # zip_longest (instead of zip) lets us notice when one input runs out
    # before the other; lines are always str, so None is a safe sentinel.
    for src_line, tgt_line in zip_longest(source_lines, target_lines):
        if src_line is None or tgt_line is None:
            shorter = "source" if src_line is None else "target"
            print(
                f"warning: {shorter} input has fewer lines than the other; "
                "ignoring the unpaired remainder",
                file=sys.stderr,
            )
            break

        src_text = src_line.strip()
        tgt_text = tgt_line.strip()
        src_len = len(src_text.split())
        tgt_len = len(tgt_text.split())

        # Both sides must be short; a blank side carries no usable phrase,
        # so pairs with an empty source or target are skipped as well.
        if 0 < src_len <= max_words and 0 < tgt_len <= max_words:
            pair_counts[(src_text, tgt_text)] += 1
    return pair_counts


def write_phrase_pairs(phrase_pairs: Iterable, out_file: TextIO) -> None:
    """Write each ``(source, target)`` pair as one tab-separated line.

    Args:
        phrase_pairs: Iterable of ``(source, target)`` string tuples; passing
            a ``Counter`` or ``dict`` iterates its keys.
        out_file: Writable text stream that receives the lines.
    """
    out_file.writelines(f"{src}\t{tgt}\n" for src, tgt in phrase_pairs)


def main() -> None:
    """Read the parallel files and save the phrase dictionary."""
    # Stream both files line by line rather than loading them whole.
    with open(SOURCE_FILE, 'r', encoding='utf-8') as src, \
            open(TARGET_FILE, 'r', encoding='utf-8') as tgt:
        phrase_dict = collect_phrase_pairs(src, tgt)

    # Save to file
    with open(OUTPUT_FILE, "w", encoding='utf-8') as out_file:
        write_phrase_pairs(phrase_dict, out_file)


if __name__ == "__main__":
    main()