File size: 1,812 Bytes
c31e9b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import re
import sys
import logging
import os

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def remove_quotes(text: str) -> str:
    """Return *text* without surrounding whitespace and enclosing quotes.

    Outer whitespace is trimmed first, then every single or double quote
    character at the very start and very end of the result is dropped.
    Quotes inside the text are kept, and whitespace that only becomes the
    outermost content once the quotes are removed is NOT trimmed again.

    >>> remove_quotes('  "hello"  ')
    'hello'
    >>> remove_quotes('say "hi" now')
    'say "hi" now'
    """
    # str.strip(chars) removes any run of the listed characters from both
    # ends, which is exactly what the anchored regex alternation used to do.
    return text.strip().strip('\'"')

def clean_data(input_file, output_file):
    """Strip enclosing quotes from both columns of a two-column TSV file.

    Every input line is stripped of surrounding whitespace and split on
    tabs. Lines that yield exactly two fields (source, target) have both
    fields passed through ``remove_quotes`` and are written to
    *output_file* as ``source<TAB>target``. Blank lines are skipped
    silently; any other line that does not split into exactly two fields
    is skipped with a warning.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to write (opened in
            ``'w'`` mode, so existing content is replaced).

    Raises:
        SystemExit: With status 1 if any exception occurs while reading,
            cleaning, or writing; the error is logged before exiting.
    """
    try:
        cleaned_data = []
        # The input is fully consumed and closed before the output is
        # opened, so the same path may be given for both arguments.
        with open(input_file, 'r', encoding='utf-8') as infile:
            for line in infile:
                stripped = line.strip()
                if not stripped:
                    # A blank line (e.g. a trailing newline at EOF) is not
                    # malformed data; warning about it is just noise.
                    continue

                parts = stripped.split('\t')
                if len(parts) != 2:
                    logging.warning("Skipping invalid line: %s", stripped)
                    continue

                source, target = parts
                cleaned_data.append(
                    f"{remove_quotes(source)}\t{remove_quotes(target)}\n"
                )

        with open(output_file, 'w', encoding='utf-8') as outfile:
            outfile.writelines(cleaned_data)

        logging.info("Cleaned data has been written to %s", output_file)
    except Exception as e:
        # Script-level boundary: report the failure and exit non-zero.
        logging.error("An error occurred: %s", e)
        sys.exit(1)

if __name__ == "__main__":
    # Default input/output files live next to this script.
    here = os.path.dirname(os.path.abspath(__file__))
    argc = len(sys.argv)

    # Accept either no arguments (use defaults) or exactly an input path
    # and an output path; anything else is a usage error.
    if argc not in (1, 3):
        print("Usage: python remove_quotes.py [<input_file> <output_file>]")
        print("If no arguments are provided, default files will be used.")
        sys.exit(1)

    if argc == 3:
        _, input_file, output_file = sys.argv
    else:
        input_file = os.path.join(here, 'tag_dictonary_1.txt')
        output_file = os.path.join(here, 'tag_dictonary_1_cleaned.txt')

    clean_data(input_file, output_file)