import pandas as pd
from pathlib import Path

# Lower-cased header names accepted as the text / label column of a sheet.
_TEXT_ALIASES = ('text', 'comment', 'comments', 'sentence')
_LABEL_ALIASES = ('label', 'labels', 'category', 'class')


def _pick_column(columns, aliases, preferred=None):
    """Return the column to use for a role, or None when nothing matches.

    `preferred` wins when it is present verbatim; otherwise the first column
    whose lower-cased name is one of `aliases` is returned.
    """
    columns = list(columns)
    if preferred is not None and preferred in columns:
        return preferred
    return next((c for c in columns if str(c).lower() in aliases), None)


def merge_datasets():
    """Append the custom bad-words sheet to the main training dataset.

    Reads ``data/custom_badwords_dataset.xlsx`` and
    ``data/training_data_telugu-hate.xlsx``, renames the custom sheet's
    text/label columns to the names used by the main sheet, concatenates the
    two, drops rows whose text repeats an earlier row (main rows come first,
    so they win over custom rows), writes a backup of the loaded main data,
    and finally overwrites the main file with the merged result.

    Nothing is written when an input file is missing or when a text column
    cannot be identified in either sheet; an error is printed instead.
    Progress is reported with ``print``.

    Returns:
        None.
    """
    data_dir = Path("data")
    custom_words_file = data_dir / "custom_badwords_dataset.xlsx"
    main_dataset_file = data_dir / "training_data_telugu-hate.xlsx"

    # Same order as before: only the first missing file is reported.
    for required_file in (custom_words_file, main_dataset_file):
        if not required_file.exists():
            print(f"Error: {required_file} not found.")
            return

    # Load both datasets
    print("Loading data...")
    custom_df = pd.read_excel(custom_words_file)
    main_df = pd.read_excel(main_dataset_file)

    print(f"Original main dataset size: {len(main_df)}")
    print(f"Custom badwords size: {len(custom_df)}")

    # Identify column names in the main dataset (case-insensitive aliases).
    text_col_main = _pick_column(main_df.columns, _TEXT_ALIASES)
    if text_col_main is None:
        # Falling back to a column that does not exist would leave every main
        # row with a missing text value; de-duplication would then collapse
        # them into one row and the main file would be overwritten with that.
        print(f"Error: no text column found in {main_dataset_file} "
              f"(columns: {list(main_df.columns)}).")
        return
    label_col_main = _pick_column(main_df.columns, _LABEL_ALIASES)
    if label_col_main is None:
        # Non-destructive fallback kept from the original behaviour: the
        # merged sheet simply gains a 'label' column.
        label_col_main = 'label'

    print(f"Identified columns in main dataset -> Text: '{text_col_main}', Label: '{label_col_main}'")

    # Identify the custom sheet's columns with the same aliases instead of
    # assuming they are spelled exactly 'text' / 'label'; those exact names
    # still take precedence when present.
    text_col_custom = _pick_column(custom_df.columns, _TEXT_ALIASES, preferred='text')
    if text_col_custom is None:
        print(f"Error: no text column found in {custom_words_file} "
              f"(columns: {list(custom_df.columns)}).")
        return
    label_col_custom = _pick_column(custom_df.columns, _LABEL_ALIASES, preferred='label')

    # Rename custom dataset columns to match main dataset
    column_renames = {text_col_custom: text_col_main}
    if label_col_custom is not None:
        column_renames[label_col_custom] = label_col_main
    custom_df = custom_df.rename(columns=column_renames)

    # Combine the dataframes
    merged_df = pd.concat([main_df, custom_df], ignore_index=True)

    # Drop rows whose text duplicates an earlier row (labels are not compared).
    merged_df = merged_df.drop_duplicates(subset=[text_col_main]).reset_index(drop=True)

    print(f"New merged dataset size: {len(merged_df)}")

    # Back up the original before overwriting it. Never clobber an existing
    # backup: on a re-run the "original" is already merged data, and writing
    # it over the first backup would destroy the only pre-merge copy.
    backup_index = 2
    backup_path = data_dir / f"training_data_telugu-hate_backup{backup_index}.xlsx"
    while backup_path.exists():
        backup_index += 1
        backup_path = data_dir / f"training_data_telugu-hate_backup{backup_index}.xlsx"
    main_df.to_excel(backup_path, index=False)
    print(f"Saved backup of original to {backup_path}")

    # Overwrite the main dataset
    merged_df.to_excel(main_dataset_file, index=False)
    print(f"Successfully merged and saved updated dataset to {main_dataset_file}")

# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    merge_datasets()