{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4d51cb4",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "344fbcef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-model test-set predictions; every file must provide a 'Predicted_Class' column.\n",
    "df_gemma1 = pd.read_csv('test_predictions_gemma3.csv')\n",
    "df_camel = pd.read_csv('test_predictions_camelbert_cpt_ftbeforesleep.csv')\n",
    "df_arabert = pd.read_csv('test_predictions_arabert_full_pipeline.csv')\n",
    "df_dziribert = pd.read_csv('test_predictions_dziribert.csv')\n",
    "df_marbert = pd.read_csv('test_predictions_marbertv2_cpt_ft5.csv')\n",
    "df_gemma2 = pd.read_csv('test_predictions_gemma3-2nd.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29486bd8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Weighted hard (majority) voting over the predicted class labels.\n",
    "# This is not soft voting: only labels are combined, no class probabilities.\n",
    "# df_gemma1 counts twice (weight = 2), every other model counts once (weight = 1).\n",
    "weights = {\n",
    "    'df_gemma1': 2.0,\n",
    "    'df_camel': 1.0,\n",
    "    'df_arabert': 1.0,\n",
    "    'df_dziribert': 1.0,\n",
    "    'df_gemma2': 1.0,\n",
    "    'df_marbert': 1.0\n",
    "}\n",
    "\n",
    "# The order of this mapping is the voting order. It matters because, on a tie,\n",
    "# the class that received its first vote earliest wins.\n",
    "model_frames = {\n",
    "    'df_gemma1': df_gemma1,\n",
    "    'df_camel': df_camel,\n",
    "    'df_arabert': df_arabert,\n",
    "    'df_dziribert': df_dziribert,\n",
    "    'df_gemma2': df_gemma2,\n",
    "    'df_marbert': df_marbert,\n",
    "}\n",
    "\n",
    "\n",
    "def aligned_labels(frame, reference_ids, name):\n",
    "    \"\"\"Return frame's 'Predicted_Class' values in the row order of reference_ids.\n",
    "\n",
    "    Args:\n",
    "        frame: one model's predictions (needs 'Predicted_Class'; 'id' is optional).\n",
    "        reference_ids: the 'id' column of the reference frame (df_gemma1).\n",
    "        name: label used in error messages.\n",
    "\n",
    "    Returns:\n",
    "        A list of predicted labels, one per reference row.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the row count differs from the reference, or if the rows\n",
    "            are ordered differently and cannot be matched one-to-one by id.\n",
    "    \"\"\"\n",
    "    if len(frame) != len(reference_ids):\n",
    "        raise ValueError(\n",
    "            f'{name}: expected {len(reference_ids)} rows, found {len(frame)}'\n",
    "        )\n",
    "    if 'id' not in frame.columns:\n",
    "        # No key to align on; the file's own row order is all we have.\n",
    "        return frame['Predicted_Class'].tolist()\n",
    "    if frame['id'].tolist() == reference_ids.tolist():\n",
    "        # Common case: rows are already in the reference order.\n",
    "        return frame['Predicted_Class'].tolist()\n",
    "    # Rows are ordered differently: realign by id instead of voting on mismatched rows.\n",
    "    if frame['id'].duplicated().any() or reference_ids.duplicated().any():\n",
    "        raise ValueError(\n",
    "            f'{name}: row order differs from df_gemma1 and ids are not unique'\n",
    "        )\n",
    "    if not reference_ids.isin(frame['id']).all():\n",
    "        raise ValueError(f'{name}: ids do not match those of df_gemma1')\n",
    "    labels_by_id = pd.Series(\n",
    "        frame['Predicted_Class'].to_numpy(), index=frame['id'].to_numpy()\n",
    "    )\n",
    "    return labels_by_id.loc[reference_ids.to_numpy()].tolist()\n",
    "\n",
    "\n",
    "def weighted_vote(labels, label_weights):\n",
    "    \"\"\"Return the label with the largest summed weight.\n",
    "\n",
    "    Ties go to the label that was seen first in `labels`.\n",
    "    \"\"\"\n",
    "    totals = defaultdict(float)\n",
    "    for label, weight in zip(labels, label_weights):\n",
    "        totals[label] += weight\n",
    "    # max() returns the first maximal key in insertion order.\n",
    "    return max(totals, key=totals.get)\n",
    "\n",
    "\n",
    "# One list of labels per model, all in df_gemma1's row order.\n",
    "reference_ids = df_gemma1['id']\n",
    "label_columns = [\n",
    "    aligned_labels(frame, reference_ids, name)\n",
    "    for name, frame in model_frames.items()\n",
    "]\n",
    "model_weights = [weights[name] for name in model_frames]\n",
    "\n",
    "# For each row, pick the class with the highest weighted vote.\n",
    "final_predictions = [\n",
    "    weighted_vote(row_labels, model_weights)\n",
    "    for row_labels in zip(*label_columns)\n",
    "]\n",
    "\n",
    "# Result frame: id, 'Réseau Social' and 'Commentaire client' from df_gemma1,\n",
    "# plus the ensemble prediction.\n",
    "result_df = df_gemma1[['id', 'Réseau Social', 'Commentaire client']].copy()\n",
    "result_df['Predicted_Class'] = final_predictions\n",
    "\n",
    "# Display statistics\n",
    "print(f\"Total samples: {len(result_df)}\")\n",
    "print(\"\\nClass distribution:\")\n",
    "print(result_df['Predicted_Class'].value_counts().sort_index())\n",
    "print(\"\\nWeight configuration:\")\n",
    "for model, weight in weights.items():\n",
    "    print(f\"  {model}: {weight}\")\n",
    "\n",
    "# Display first few rows\n",
    "print(\"\\nFirst 5 predictions:\")\n",
    "result_df.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "543f2936",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save results to CSV (only id and Class)\n",
    "output_filename = 'test_predictions_weighted_voting_ensemble.csv'\n",
    "# Non-mutating rename so re-running this cell always starts from result_df.\n",
    "output_df = result_df[['id', 'Predicted_Class']].rename(\n",
    "    columns={'Predicted_Class': 'Class'}\n",
    ")\n",
    "output_df.to_csv(output_filename, index=False)\n",
    "print(f\"Results saved to: {output_filename}\")\n",
    "print(f\"Columns in output: {', '.join(output_df.columns)}\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}