Vikrantyadav11234
/

Important_NMT_DOCs

Vikrantyadav11234 committed on Dec 17, 2024

Commit

d0dd049

verified ·

1 Parent(s): 76023b0

Upload remove_language_tags.py with huggingface_hub

Files changed (1) hide show

remove_language_tags.py ADDED Viewed

+import re
def remove_language_tags(input_file, output_file):
    """Strip a leading language tag from every line of a text file.

    Each line of ``input_file`` is trimmed of surrounding whitespace. If the
    trimmed line begins with one of the two-letter codes in the pattern below
    (matched case-insensitively), followed by optional whitespace and a colon
    -- e.g. ``hi:`` or ``EN :`` -- that prefix is removed. The remainder is
    trimmed again and written to ``output_file`` followed by one newline, so
    the output has exactly one line per input line (blank lines stay blank).

    Args:
        input_file: Path of the UTF-8 text file to read. A leading byte-order
            mark, if present, is skipped.
        output_file: Path of the UTF-8 text file to create or overwrite.

    Raises:
        ValueError: If both paths resolve to the same file. Opening that file
            for writing would truncate it before any line had been read.
    """
    # Function-scope import so this block does not depend on a file-level
    # ``import os`` being present.
    import os

    if os.path.realpath(input_file) == os.path.realpath(output_file):
        raise ValueError(
            "input_file and output_file must be different files; "
            "writing in place would erase the input before it is read"
        )

    # Compiled once, outside the loop. ``^`` anchors the match to the start of
    # the already-stripped line, so at most one tag is removed per line.
    tag_pattern = re.compile(
        r'^(hi|en|mr|pa|bn|or|ur|as|ml|ta|te|kn|gu)\s*:', re.IGNORECASE
    )

    # 'utf-8-sig' decodes plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise sit in front of the first line's tag and defeat ``^``.
    with open(input_file, 'r', encoding='utf-8-sig') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Strip first so indentation in front of the tag cannot hide it.
            cleaned_line = tag_pattern.sub('', line.strip())
            # Strip again: removing the tag usually exposes the space that
            # followed the colon. Surrounding whitespace is NOT preserved.
            outfile.write(cleaned_line.strip() + '\n')
    print(f"Processed {input_file} and saved results to {output_file}")
def main():
    """Run the tag removal on the training source file at its fixed paths."""
    source_path = 'NMT_V2/train.src'
    cleaned_path = 'NMT_V2/train_cleaned.src'
    remove_language_tags(source_path, cleaned_path)


if __name__ == "__main__":
    # Only run the cleanup when executed as a script, not when imported.
    main()