Spaces:
Running
Running
| import pandas as pd | |
| import os | |
| def clean_and_merge_data(): | |
| print("Starting data cleaning process...") | |
| # --- THE FIX: Bulletproof Pathing --- | |
| # 1. Find exactly where this cleaner.py script lives | |
| script_dir = os.path.dirname(os.path.abspath(__file__)) | |
| # 2. Go up two folders (from src/preprocessing) to find the project root | |
| project_root = os.path.abspath(os.path.join(script_dir, "../../")) | |
| # 3. Define the exact paths to the data folders | |
| raw_dir = os.path.join(project_root, "data", "raw") | |
| processed_dir = os.path.join(project_root, "data", "processed") | |
| # 1. Process Combined Data.csv | |
| print("Processing Combined Data.csv...") | |
| df_combined = pd.read_csv(os.path.join(raw_dir, "Combined Data.csv")) | |
| df_combined = df_combined[['statement', 'status']].rename(columns={'statement': 'text', 'status': 'label'}) | |
| # 2. Process go_emotions_dataset[1].csv | |
| print("Processing go_emotions_dataset...") | |
| df_go = pd.read_csv(os.path.join(raw_dir, "go_emotions_dataset[1].csv")) | |
| emotion_columns = df_go.columns[3:] | |
| df_go['label'] = df_go[emotion_columns].idxmax(axis=1) | |
| df_go = df_go[['text', 'label']] | |
| # 3. Process train-00000-of-00001.parquet | |
| print("Processing parquet file...") | |
| df_parquet = pd.read_parquet(os.path.join(raw_dir, "train-00000-of-00001.parquet")) | |
| if 'label' not in df_parquet.columns and 'labels' in df_parquet.columns: | |
| df_parquet = df_parquet.rename(columns={'labels': 'label'}) | |
| df_parquet = df_parquet[['text', 'label']] | |
| # 4. Merge all datasets together | |
| print("Merging datasets...") | |
| master_df = pd.concat([df_combined, df_go, df_parquet], ignore_index=True) | |
| # 5. Clean the text formatting | |
| print("Cleaning text data...") | |
| master_df['text'] = master_df['text'].astype(str).str.lower().str.strip() | |
| # 6. Save the final processed dataset | |
| print("Saving final dataset...") | |
| output_path = os.path.join(processed_dir, "master_training_data.csv") | |
| os.makedirs(processed_dir, exist_ok=True) | |
| master_df.to_csv(output_path, index=False) | |
| print(f"Success! Master dataset created with {len(master_df)} rows and saved to:\n{output_path}") | |
| if __name__ == "__main__": | |
| clean_and_merge_data() |