{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4d51cb4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "344fbcef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read every model's test-set prediction file. The files are listed in the same\n",
    "# order as the dataframe names they are unpacked into.\n",
    "(\n",
    "    df_gemma1,\n",
    "    df_camel,\n",
    "    df_arabert,\n",
    "    df_dziribert,\n",
    "    df_marbert,\n",
    "    df_gemma2,\n",
    ") = map(pd.read_csv, (\n",
    "    'test_predictions_gemma3.csv',\n",
    "    'test_predictions_camelbert_cpt_ftbeforesleep.csv',\n",
    "    'test_predictions_arabert_full_pipeline.csv',\n",
    "    'test_predictions_dziribert.csv',\n",
    "    'test_predictions_marbertv2_cpt_ft5.csv',\n",
    "    'test_predictions_gemma3-2nd.csv',\n",
    "))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29486bd8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Weighted hard (majority) voting over the models' predicted class labels.\n",
    "# Each model casts one vote per row for its Predicted_Class, scaled by its weight:\n",
    "# df_gemma1 counts double (weight = 2), every other model has weight = 1.\n",
    "# NOTE: this is not soft voting - no class probabilities are involved.\n",
    "\n",
    "from collections import defaultdict\n",
    "import numpy as np\n",
    "\n",
    "# Define weights for each model\n",
    "weights = {\n",
    "    'df_gemma1': 2.0,\n",
    "    'df_camel': 1.0,\n",
    "    'df_arabert': 1.0,\n",
    "    'df_dziribert': 1.0,\n",
    "    'df_gemma2': 1.0,\n",
    "    'df_marbert': 1.0\n",
    "}\n",
    "\n",
    "# Prediction frame for each weight key. The insertion order is the voting order,\n",
    "# which also decides ties: among tied classes, the one that was voted for first wins.\n",
    "model_frames = {\n",
    "    'df_gemma1': df_gemma1,\n",
    "    'df_camel': df_camel,\n",
    "    'df_arabert': df_arabert,\n",
    "    'df_dziribert': df_dziribert,\n",
    "    'df_gemma2': df_gemma2,\n",
    "    'df_marbert': df_marbert\n",
    "}\n",
    "\n",
    "# Rows are combined by position, so every frame must list the same samples in the\n",
    "# same order as df_gemma1. Fail loudly instead of silently mixing up rows.\n",
    "reference_ids = df_gemma1['id'].to_numpy()\n",
    "for name, frame in model_frames.items():\n",
    "    if len(frame) != len(df_gemma1):\n",
    "        raise ValueError(\n",
    "            f'{name} has {len(frame)} rows, expected {len(df_gemma1)}'\n",
    "        )\n",
    "    # Only frames that carry an id column can be checked for row order\n",
    "    if 'id' in frame.columns and not np.array_equal(frame['id'].to_numpy(), reference_ids):\n",
    "        raise ValueError(f'{name} ids are not aligned with df_gemma1')\n",
    "\n",
    "# Start the result from the id, 'Réseau Social' and 'Commentaire client' columns of df_gemma1\n",
    "result_df = df_gemma1[['id', 'Réseau Social', 'Commentaire client']].copy()\n",
    "\n",
    "# One list of labels per model, plus the matching weights in the same order\n",
    "label_columns = [frame['Predicted_Class'].tolist() for frame in model_frames.values()]\n",
    "model_weights = [weights[name] for name in model_frames]\n",
    "\n",
    "# For each row, accumulate the weighted votes per class and keep the winner\n",
    "final_predictions = []\n",
    "for row_labels in zip(*label_columns):\n",
    "    vote_counts = defaultdict(float)\n",
    "    for label, label_weight in zip(row_labels, model_weights):\n",
    "        vote_counts[label] += label_weight\n",
    "    # max() returns the first key holding the highest total, so ties go to the\n",
    "    # class that received a vote earliest in the model order above\n",
    "    final_predictions.append(max(vote_counts, key=vote_counts.get))\n",
    "\n",
    "# Add predictions to result dataframe\n",
    "result_df['Predicted_Class'] = final_predictions\n",
    "\n",
    "# Display statistics\n",
    "print(f\"Total samples: {len(result_df)}\")\n",
    "print(f\"\\nClass distribution:\")\n",
    "print(result_df['Predicted_Class'].value_counts().sort_index())\n",
    "print(f\"\\nWeight configuration:\")\n",
    "for model, weight in weights.items():\n",
    "    print(f\"  {model}: {weight}\")\n",
    "\n",
    "# Display first few rows\n",
    "print(f\"\\nFirst 5 predictions:\")\n",
    "result_df.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "543f2936",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the ensemble result: keep only the sample id and the voted label,\n",
    "# with the label column renamed to 'Class' in the written file.\n",
    "output_filename = 'test_predictions_weighted_voting_ensemble.csv'\n",
    "output_df = (\n",
    "    result_df[['id', 'Predicted_Class']]\n",
    "    .rename(columns={'Predicted_Class': 'Class'})\n",
    ")\n",
    "output_df.to_csv(output_filename, index=False)\n",
    "\n",
    "# Confirm where the file went and what it contains\n",
    "print(f\"Results saved to: {output_filename}\")\n",
    "print(f\"Columns in output: id, Class\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}