import os

def remove_duplicates(input_file, output_file):
    """Write the distinct lines of ``input_file`` to ``output_file``.

    Every line is stripped of leading and trailing whitespace before it is
    compared, so lines that differ only in surrounding whitespace count as
    duplicates. Distinct lines are written in order of first appearance,
    each terminated by a single newline. A short summary of the line counts
    is printed to standard output.

    The input is read completely and closed before the output is opened,
    so both arguments may name the same file (in-place de-duplication).

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to create or overwrite.

    Raises:
        OSError: If either file cannot be opened, read, or written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    total_lines = 0
    # A dict keeps insertion order, so it serves as an ordered set here.
    unique_lines = {}

    # Count the original lines during the one and only read pass. Counting
    # afterwards by re-opening the input would report the wrong number when
    # the output path is the same file, because it is already overwritten.
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            total_lines += 1
            unique_lines.setdefault(line.strip(), None)

    # Write unique lines to the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(f"{line}\n" for line in unique_lines)

    print(f"Duplicates removed. Original file had {total_lines} lines.")
    print(f"New file has {len(unique_lines)} lines.")
    print(f"Removed {total_lines - len(unique_lines)} duplicate lines.")

def main():
    """De-duplicate the hard-coded tag dictionary file in place.

    The input and output paths are the same file, so the file is
    overwritten with its de-duplicated contents. Any failure, including a
    missing input file, is reported on standard output instead of being
    propagated to the caller.
    """
    # One path serves as both source and destination.
    input_file = output_file = '/home/vikrant-MNMT/myenv/BPCC/inline_tages/eng_Latn-hin_Deva/tag_dictonary_1.txt'

    try:
        if os.path.exists(input_file):
            remove_duplicates(input_file, output_file)
            print(f"Duplicates removed successfully. Output file: {output_file}")
        else:
            # Raised inside the try so it is reported by the handler below.
            raise FileNotFoundError(f"Input file not found: {input_file}")
    except Exception as err:
        print(f"An error occurred: {err!s}")

# Run the de-duplication only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()