Vikrantyadav11234 committed on
Commit
d0dd049
·
verified ·
1 Parent(s): 76023b0

Upload remove_language_tags.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. remove_language_tags.py +21 -0
remove_language_tags.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def remove_language_tags(input_file, output_file):
    """Strip a leading language tag from every line of a text file.

    A tag is one of the two-letter codes listed in the pattern below,
    matched case-insensitively at the start of the whitespace-trimmed line,
    optionally followed by whitespace and then a colon (e.g. ``hi:`` or
    ``EN :``). Only that leading tag is removed; lines without a tag are
    copied through.

    Every output line has its leading and trailing whitespace trimmed and
    ends with a single newline, so blank input lines stay blank lines.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to write. An existing file
            at this path is overwritten.

    Raises:
        ValueError: If ``input_file`` and ``output_file`` resolve to the
            same path.
    """
    import os  # function-scope import keeps this block self-contained

    # Opening the output in 'w' mode truncates it immediately; if it is the
    # same file as the input, every line would be lost before being read.
    if os.path.realpath(input_file) == os.path.realpath(output_file):
        raise ValueError(
            "input_file and output_file must be different files: "
            f"{input_file}"
        )

    # Compiled once, outside the per-line loop.
    tag_pattern = re.compile(
        r'^(hi|en|mr|pa|bn|or|ur|as|ml|ta|te|kn|gu)\s*:',
        flags=re.IGNORECASE,
    )

    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Trim first so the '^' anchor sees the tag even when the raw
            # line starts with whitespace.
            cleaned_line = tag_pattern.sub('', line.strip())
            # Trim again: removing the tag usually leaves the space that
            # followed the colon at the front of the text.
            outfile.write(cleaned_line.strip() + '\n')

    print(f"Processed {input_file} and saved results to {output_file}")
14
+
15
def main():
    """Run the tag removal on the NMT_V2 training source file.

    Reads ``NMT_V2/train.src`` and writes the result to
    ``NMT_V2/train_cleaned.src``; both paths are relative to the current
    working directory.
    """
    source_path = 'NMT_V2/train.src'
    cleaned_path = 'NMT_V2/train_cleaned.src'
    remove_language_tags(source_path, cleaned_path)


# Only run the conversion when executed as a script, not on import.
if __name__ == "__main__":
    main()