"""Strip surrounding quotes from both columns of a tab-separated file.

Usage:
    python remove_quotes.py [<input_file> <output_file>]

With no arguments, the default input and output files located next to this
script are used.
"""
import logging
import os
import re
import sys

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# One or more quote characters (single or double) at the very start or the
# very end of a string. Compiled once because it is applied to every field.
_EDGE_QUOTES = re.compile(r'^[\'"]+|[\'"]+$')


def remove_quotes(text):
    """Return *text* without surrounding whitespace and edge quotes.

    The text is whitespace-stripped first, then every run of single or
    double quote characters at its start and at its end is removed.
    Quotes in the middle of the text, and whitespace that was enclosed by
    the quotes, are left untouched.
    """
    return _EDGE_QUOTES.sub('', text.strip())


def clean_data(input_file, output_file):
    """Copy a two-column TSV file, removing edge quotes from each field.

    Each input line is whitespace-stripped and split on tabs. Lines that
    yield exactly two fields are written to *output_file* as
    ``source<TAB>target``, each field passed through ``remove_quotes``.
    Blank lines are skipped silently; any other line that does not have
    exactly two fields is skipped with a warning.

    Args:
        input_file: Path of the UTF-8 encoded file to read.
        output_file: Path of the UTF-8 encoded file to write (overwritten).

    Raises:
        SystemExit: With exit code 1 if any exception occurs while reading,
            processing or writing; the error is logged first.
    """
    try:
        cleaned_data = []
        # The input is consumed completely before the output is opened, so
        # both arguments may point at the same path.
        with open(input_file, 'r', encoding='utf-8') as infile:
            for line in infile:
                stripped = line.strip()
                if not stripped:
                    # Blank lines carry no data; don't warn about them.
                    continue

                parts = stripped.split('\t')
                if len(parts) != 2:
                    logging.warning("Skipping invalid line: %s", stripped)
                    continue

                source, target = parts
                cleaned_data.append(
                    f"{remove_quotes(source)}\t{remove_quotes(target)}\n"
                )

        with open(output_file, 'w', encoding='utf-8') as outfile:
            outfile.writelines(cleaned_data)

        logging.info("Cleaned data has been written to %s", output_file)
    except Exception as e:
        # Script-level boundary: report the failure and exit non-zero
        # instead of showing a traceback.
        logging.error("An error occurred: %s", e)
        sys.exit(1)


def main():
    """Resolve the input/output paths from the command line and run."""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    default_input = os.path.join(script_dir, 'tag_dictonary_1.txt')
    default_output = os.path.join(script_dir, 'tag_dictonary_1_cleaned.txt')

    if len(sys.argv) == 1:
        input_file, output_file = default_input, default_output
    elif len(sys.argv) == 3:
        input_file, output_file = sys.argv[1], sys.argv[2]
    else:
        print("Usage: python remove_quotes.py [<input_file> <output_file>]")
        print("If no arguments are provided, default files will be used.")
        sys.exit(1)

    clean_data(input_file, output_file)


if __name__ == "__main__":
    main()