|
|
import re |
|
|
import sys |
|
|
import logging |
|
|
import os |
|
|
|
|
|
# Configure the root logger once at import time: INFO and above, each record
# prefixed with its timestamp and severity.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
|
|
|
|
|
def remove_quotes(text):
    """Return *text* without surrounding whitespace and enclosing quotes.

    Leading/trailing whitespace is stripped first; then every single or
    double quote character at the very start and very end is removed.
    Quotes in the interior are left alone, and whitespace that only becomes
    the outer edge after the quotes are removed is kept as-is.
    """
    trimmed = text.strip()
    # Stripping the two quote characters from both ends removes the same
    # leading and trailing quote runs an anchored regex would.
    return trimmed.strip('\'"')
|
|
|
|
|
def clean_data(input_file, output_file):
    """Strip enclosing quotes from a two-column, tab-separated text file.

    Every line of *input_file* (UTF-8) is stripped of surrounding
    whitespace and split on tabs. Lines that do not yield exactly two
    fields are logged as warnings and skipped. Both fields of the remaining
    lines are passed through ``remove_quotes`` and written to *output_file*
    (UTF-8) as ``source<TAB>target`` followed by a newline.

    The input is read completely and closed before the output is opened.
    Any exception is logged as an error and the process exits with
    status 1.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as infile:
            raw_lines = list(infile)

        rows = []
        for raw in raw_lines:
            stripped = raw.strip()
            fields = stripped.split('\t')
            if len(fields) == 2:
                # Source is cleaned first, then target.
                source, target = (remove_quotes(field) for field in fields)
                rows.append(f"{source}\t{target}\n")
            else:
                logging.warning(f"Skipping invalid line: {stripped}")

        with open(output_file, 'w', encoding='utf-8') as outfile:
            outfile.write("".join(rows))

        logging.info(f"Cleaned data has been written to {output_file}")
    except Exception as e:
        # Top-level boundary for the script: report the failure and stop.
        logging.error(f"An error occurred: {e!s}")
        sys.exit(1)
|
|
|
|
|
if __name__ == "__main__":
    # Accepted invocations: no arguments (use the default files that sit
    # next to this script) or exactly two (input path, output path).
    base_dir = os.path.dirname(os.path.abspath(__file__))
    argc = len(sys.argv)

    if argc not in (1, 3):
        print("Usage: python remove_quotes.py [<input_file> <output_file>]")
        print("If no arguments are provided, default files will be used.")
        sys.exit(1)

    if argc == 3:
        _, input_file, output_file = sys.argv
    else:
        input_file = os.path.join(base_dir, 'tag_dictonary_1.txt')
        output_file = os.path.join(base_dir, 'tag_dictonary_1_cleaned.txt')

    clean_data(input_file, output_file)
|
|
|