Spaces:

tejesh916K
/

comment-guard-api

Sleeping

App Files Files Community

comment-guard-api / merge_datasets.py

tejesh916K

Deploy: Comment Guard API - FastAPI + MuRIL BERT

b8300d6 2 months ago

raw

history blame contribute delete

2.19 kB

	import pandas as pd
	from pathlib import Path

	def merge_datasets():
	    """Merge the custom bad-words spreadsheet into the main training spreadsheet.

	    Reads ``data/custom_badwords_dataset.xlsx`` and
	    ``data/training_data_telugu-hate.xlsx`` (relative to the current working
	    directory), aligns the custom sheet's text/label columns with the main
	    sheet's, appends the custom rows, drops rows whose text duplicates an
	    earlier row, copies the untouched main file to
	    ``data/training_data_telugu-hate_backup2.xlsx`` and finally overwrites
	    the main file with the merged result.

	    Problems (missing input file, unidentifiable text column) are reported
	    with ``print`` and abort the run before anything is written.

	    Returns:
	        None.
	    """
	    # Local import so this block does not depend on a file-level import.
	    import shutil

	    data_dir = Path("data")
	    custom_words_file = data_dir / "custom_badwords_dataset.xlsx"
	    main_dataset_file = data_dir / "training_data_telugu-hate.xlsx"

	    if not custom_words_file.exists():
	        print(f"Error: {custom_words_file} not found.")
	        return

	    if not main_dataset_file.exists():
	        print(f"Error: {main_dataset_file} not found.")
	        return

	    # Load both datasets
	    print("Loading data...")
	    custom_df = pd.read_excel(custom_words_file)
	    main_df = pd.read_excel(main_dataset_file)

	    print(f"Original main dataset size: {len(main_df)}")
	    print(f"Custom badwords size: {len(custom_df)}")

	    # Accepted header spellings, compared case-insensitively.
	    text_names = ('text', 'comment', 'comments', 'sentence')
	    label_names = ('label', 'labels', 'category', 'class')

	    text_col_main = _find_column(main_df.columns, text_names)
	    if text_col_main is None:
	        # Guessing a name here would leave every main row with a missing
	        # text value after the concat; drop_duplicates treats missing values
	        # as equal, so the main data would collapse to one row and then be
	        # written over the original file. Abort instead.
	        print(
	            f"Error: no text column found in {main_dataset_file} "
	            f"(expected one of {list(text_names)})."
	        )
	        return

	    label_col_main = _find_column(main_df.columns, label_names)
	    if label_col_main is None:
	        # A missing label column is not destructive, so keep the historical
	        # fallback name.
	        label_col_main = 'label'

	    print(f"Identified columns in main dataset -> Text: '{text_col_main}', Label: '{label_col_main}'")

	    # Locate the custom sheet's columns. Exact 'text' / 'label' headers win
	    # (the layout this script has always handled); otherwise use the same
	    # case-insensitive matching as for the main sheet so that headers such
	    # as 'Text' or 'Category' are aligned instead of becoming extra columns.
	    if 'text' in custom_df.columns:
	        text_col_custom = 'text'
	    else:
	        text_col_custom = _find_column(custom_df.columns, text_names)
	    if text_col_custom is None:
	        print(
	            f"Error: no text column found in {custom_words_file} "
	            f"(expected one of {list(text_names)})."
	        )
	        return

	    if 'label' in custom_df.columns:
	        label_col_custom = 'label'
	    else:
	        label_col_custom = _find_column(custom_df.columns, label_names)

	    # Rename custom dataset columns to match main dataset
	    column_map = {text_col_custom: text_col_main}
	    if label_col_custom is None:
	        print(
	            f"Warning: no label column found in {custom_words_file}; "
	            "custom rows will be merged without labels."
	        )
	    else:
	        column_map[label_col_custom] = label_col_main
	    custom_df = custom_df.rename(columns=column_map)

	    # Combine the dataframes
	    merged_df = pd.concat([main_df, custom_df], ignore_index=True)

	    # Keep only the first row for each distinct text value; main rows come
	    # first in the concat, so they win over custom rows with the same text.
	    merged_df = merged_df.drop_duplicates(subset=[text_col_main]).reset_index(drop=True)

	    print(f"New merged dataset size: {len(merged_df)}")

	    # Back up the original file byte-for-byte before touching it. Writing
	    # main_df back out would only preserve what read_excel loaded, not the
	    # file itself.
	    # NOTE: the backup name is fixed, so a later run replaces this backup
	    # with whatever the main file contains at that time.
	    backup_path = data_dir / "training_data_telugu-hate_backup2.xlsx"
	    shutil.copy2(main_dataset_file, backup_path)
	    print(f"Saved backup of original to {backup_path}")

	    # Overwrite the main dataset
	    merged_df.to_excel(main_dataset_file, index=False)
	    print(f"Successfully merged and saved updated dataset to {main_dataset_file}")


	def _find_column(columns, candidates):
	    """Return the first of *columns* whose lowercased name is in *candidates*, or None."""
	    return next((c for c in columns if str(c).lower() in candidates), None)

	# Script entry point: perform the merge only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    merge_datasets()