# Important_NMT_DOCs / remove_quotes.py
# Uploaded by Vikrantyadav11234 ("Upload remove_quotes.py with huggingface_hub").
# Commit c31e9b4 (verified).
import re
import sys
import logging
import os
# Configure the root logger once at import time: INFO and above, each record
# prefixed with a timestamp and its level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
def remove_quotes(text: str) -> str:
    """Return *text* without surrounding whitespace and surrounding quotes.

    The text is first stripped of leading/trailing whitespace; then every
    single or double quote character at the very start and very end is
    removed. Quotes in the middle of the text are kept, and whitespace that
    was enclosed by the quotes is NOT stripped again.

    >>> remove_quotes('  "hello"  ')
    'hello'
    >>> remove_quotes("'it's'")
    "it's"
    """
    # str.strip with a character set removes whole runs of those characters
    # from both ends, which is exactly what the anchored regex did.
    return text.strip().strip('\'"')
def clean_data(input_file, output_file):
    """Strip surrounding quotes from both columns of a two-column TSV file.

    Every line of ``input_file`` is stripped of surrounding whitespace and
    split on tab characters. A line that does not yield exactly two fields is
    skipped with a warning. For the remaining lines both fields are passed
    through ``remove_quotes`` and written to ``output_file`` as
    ``source<TAB>target`` followed by a newline.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to write; it is overwritten.

    Any exception raised while reading, cleaning or writing is logged as an
    error and the process exits with status 1.
    """
    try:
        cleaned_data = []
        with open(input_file, 'r', encoding='utf-8') as infile:
            # Iterate the handle directly so the raw lines are never all held
            # in memory at once; only the cleaned rows are accumulated.
            for line in infile:
                stripped = line.strip()
                parts = stripped.split('\t')
                if len(parts) != 2:
                    logging.warning("Skipping invalid line: %s", stripped)
                    continue
                source, target = parts
                cleaned_source = remove_quotes(source)
                cleaned_target = remove_quotes(target)
                cleaned_data.append(f"{cleaned_source}\t{cleaned_target}\n")

        # The output is opened only after the input has been fully read and
        # closed, so passing the same path for both arguments stays safe.
        with open(output_file, 'w', encoding='utf-8') as outfile:
            outfile.writelines(cleaned_data)

        logging.info("Cleaned data has been written to %s", output_file)
    except Exception as e:
        # Top-level boundary for this script: report the failure and exit
        # with a non-zero status instead of printing a traceback.
        logging.error("An error occurred: %s", e)
        sys.exit(1)
if __name__ == "__main__":
    # Default input/output files live next to this script, so the defaults
    # work regardless of the current working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    default_input = os.path.join(script_dir, 'tag_dictonary_1.txt')
    default_output = os.path.join(script_dir, 'tag_dictonary_1_cleaned.txt')

    if len(sys.argv) == 1:
        # No arguments: fall back to the default file pair.
        input_file = default_input
        output_file = default_output
    elif len(sys.argv) == 3:
        # Exactly two arguments: explicit input and output paths.
        input_file = sys.argv[1]
        output_file = sys.argv[2]
    else:
        # Any other argument count is a usage error.
        print("Usage: python remove_quotes.py [<input_file> <output_file>]")
        print("If no arguments are provided, default files will be used.")
        sys.exit(1)

    clean_data(input_file, output_file)