{ "cells": [ { "cell_type": "markdown", "id": "cb5f8ca7", "metadata": { "papermill": { "duration": 0.003855, "end_time": "2026-02-15T15:41:19.443065", "exception": false, "start_time": "2026-02-15T15:41:19.439210", "status": "completed" }, "tags": [] }, "source": [ "# Paradigm Classification: XGBoost + CodeBERT Ensemble\n", "\n", "Binary ensemble: XGBoost (TF-IDF + features) + Fine-tuned CodeBERT" ] }, { "cell_type": "code", "execution_count": 1, "id": "316f6cff", "metadata": { "execution": { "iopub.execute_input": "2026-02-15T15:41:19.450156Z", "iopub.status.busy": "2026-02-15T15:41:19.449882Z", "iopub.status.idle": "2026-02-15T15:41:23.827179Z", "shell.execute_reply": "2026-02-15T15:41:23.826418Z" }, "papermill": { "duration": 4.382753, "end_time": "2026-02-15T15:41:23.828864", "exception": false, "start_time": "2026-02-15T15:41:19.446111", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "!pip install -q transformers datasets torch scikit-learn xgboost imbalanced-learn ipywidgets\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "c3b6dc46", "metadata": { "execution": { "iopub.execute_input": "2026-02-15T15:41:23.836625Z", "iopub.status.busy": "2026-02-15T15:41:23.835978Z", "iopub.status.idle": "2026-02-15T15:41:57.132006Z", "shell.execute_reply": "2026-02-15T15:41:57.131323Z" }, "papermill": { "duration": 33.304703, "end_time": "2026-02-15T15:41:57.136659", "exception": false, "start_time": "2026-02-15T15:41:23.831956", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2026-02-15 15:41:40.990357: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", "E0000 00:00:1771170101.187712 24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register 
# --- Imports: stdlib first, then the scientific / ML stack ---
import warnings

import numpy as np
import pandas as pd
import torch
import xgboost as xgb
from datasets import Dataset
from imblearn.over_sampling import SMOTE
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

# Silence library chatter (sklearn/transformers deprecation noise).
warnings.filterwarnings('ignore')

# Fixed seeds so sampling, SMOTE and model init are reproducible.
np.random.seed(42)
torch.manual_seed(42)

# Train on GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")
"status": "completed" }, "tags": [] }, "source": [ "## 1. Load and Prepare Data" ] }, { "cell_type": "code", "execution_count": 3, "id": "968009e7", "metadata": { "execution": { "iopub.execute_input": "2026-02-15T15:41:57.149967Z", "iopub.status.busy": "2026-02-15T15:41:57.149477Z", "iopub.status.idle": "2026-02-15T15:41:59.659590Z", "shell.execute_reply": "2026-02-15T15:41:59.658708Z" }, "papermill": { "duration": 2.515456, "end_time": "2026-02-15T15:41:59.661083", "exception": false, "start_time": "2026-02-15T15:41:57.145627", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train: 57,235 | Val: 12,265 | Test: 12,265\n", "\n", "After sampling: 19,643 train samples\n", "\n", "Class distribution:\n", "label\n", "Functional 5000\n", "Non-Paradigm 5000\n", "Oop 5000\n", "Procedural 4643\n", "Name: count, dtype: int64\n", "\n", "Labels: {'Functional': 0, 'Non-Paradigm': 1, 'Oop': 2, 'Procedural': 3}\n" ] } ], "source": [ "train_df = pd.read_csv('/kaggle/input/datasets/aryanprakhar/paradigm/data/train.csv')\n", "val_df = pd.read_csv('/kaggle/input/datasets/aryanprakhar/paradigm/data/val.csv')\n", "test_df = pd.read_csv('/kaggle/input/datasets/aryanprakhar/paradigm/data/test.csv')\n", "\n", "print(f\"Train: {len(train_df):,} | Val: {len(val_df):,} | Test: {len(test_df):,}\")\n", "\n", "# Combine title + body\n", "def combine_text(row):\n", " title = str(row.get('title', '')) if pd.notna(row.get('title')) else ''\n", " body = str(row.get('question_body', '')) if pd.notna(row.get('question_body')) else ''\n", " return f\"{title} {body}\".strip()\n", "\n", "for df in [train_df, val_df, test_df]:\n", " if 'text' not in df.columns:\n", " df['text'] = df.apply(combine_text, axis=1)\n", " if 'paradigm_label' in df.columns:\n", " df['label'] = df['paradigm_label']\n", "\n", "# Remove Mixed class if present\n", "for df in [train_df, val_df, test_df]:\n", " if 'Mixed' in df['label'].unique():\n", " df.drop(df[df['label'] 
== 'Mixed'].index, inplace=True)\n", "\n", "# Stratified sampling for balanced training\n", "SAMPLES_PER_CLASS = 5000 # Adjust based on needs\n", "\n", "train_df = train_df.groupby('label', group_keys=False).apply(\n", " lambda x: x.sample(min(len(x), SAMPLES_PER_CLASS), random_state=42)\n", ").reset_index(drop=True)\n", "\n", "print(f\"\\nAfter sampling: {len(train_df):,} train samples\")\n", "print(\"\\nClass distribution:\")\n", "print(train_df['label'].value_counts())\n", "\n", "label2id = {label: idx for idx, label in enumerate(sorted(train_df['label'].unique()))}\n", "id2label = {idx: label for label, idx in label2id.items()}\n", "print(f\"\\nLabels: {label2id}\")" ] }, { "cell_type": "markdown", "id": "2fc0bcd4", "metadata": { "papermill": { "duration": 0.003083, "end_time": "2026-02-15T15:41:59.667723", "exception": false, "start_time": "2026-02-15T15:41:59.664640", "status": "completed" }, "tags": [] }, "source": [ "## 2. Feature Engineering" ] }, { "cell_type": "code", "execution_count": 4, "id": "d695f09a", "metadata": { "execution": { "iopub.execute_input": "2026-02-15T15:41:59.675370Z", "iopub.status.busy": "2026-02-15T15:41:59.675061Z", "iopub.status.idle": "2026-02-15T15:42:02.334217Z", "shell.execute_reply": "2026-02-15T15:42:02.333391Z" }, "papermill": { "duration": 2.664696, "end_time": "2026-02-15T15:42:02.335706", "exception": false, "start_time": "2026-02-15T15:41:59.671010", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Features extracted: 10 features\n" ] } ], "source": [ "import re\n", "\n", "class FeatureExtractor:\n", " def __init__(self):\n", " self.oop_kw = ['class', 'object', 'this', 'self', 'extends', 'implements', 'interface',\n", " 'public', 'private', 'protected', 'static', 'virtual', 'override']\n", " self.fp_kw = ['map', 'filter', 'reduce', 'fold', 'lambda', 'closure', '=>',\n", " 'monad', 'functor', 'pure', 'immutable', 'const', 'let']\n", " self.proc_kw = ['void', 
'int', 'char', 'float', 'struct', 'malloc', 'free',\n", " 'pointer', 'goto', 'scanf', 'printf']\n", " \n", " def extract(self, text):\n", " t = text.lower()\n", " return {\n", " 'oop_score': sum(t.count(k) for k in self.oop_kw),\n", " 'fp_score': sum(t.count(k) for k in self.fp_kw),\n", " 'proc_score': sum(t.count(k) for k in self.proc_kw),\n", " 'length': len(text),\n", " 'num_lines': text.count('\\n') + 1,\n", " 'has_class': 1 if re.search(r'\\bclass\\s+\\w+', t) else 0,\n", " 'has_lambda': 1 if 'lambda' in t or '=>' in text else 0,\n", " 'num_dots': text.count('.'),\n", " 'num_arrows': text.count('->') + text.count('=>'),\n", " 'num_braces': text.count('{') + text.count('}')\n", " }\n", " \n", " def extract_batch(self, texts):\n", " return pd.DataFrame([self.extract(t) for t in texts])\n", "\n", "feature_extractor = FeatureExtractor()\n", "\n", "train_features = feature_extractor.extract_batch(train_df['text'].values)\n", "val_features = feature_extractor.extract_batch(val_df['text'].values)\n", "test_features = feature_extractor.extract_batch(test_df['text'].values)\n", "\n", "print(f\"Features extracted: {train_features.shape[1]} features\")" ] }, { "cell_type": "markdown", "id": "d501d2b8", "metadata": { "papermill": { "duration": 0.003151, "end_time": "2026-02-15T15:42:02.342259", "exception": false, "start_time": "2026-02-15T15:42:02.339108", "status": "completed" }, "tags": [] }, "source": [ "## 3. 
XGBoost Model" ] }, { "cell_type": "code", "execution_count": 5, "id": "3f5a061d", "metadata": { "execution": { "iopub.execute_input": "2026-02-15T15:42:02.349520Z", "iopub.status.busy": "2026-02-15T15:42:02.349268Z", "iopub.status.idle": "2026-02-15T15:42:15.967743Z", "shell.execute_reply": "2026-02-15T15:42:15.966880Z" }, "papermill": { "duration": 13.6239, "end_time": "2026-02-15T15:42:15.969262", "exception": false, "start_time": "2026-02-15T15:42:02.345362", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "After SMOTE: 20,000 samples\n" ] } ], "source": [ "# TF-IDF\n", "tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 2), min_df=2, max_df=0.95)\n", "\n", "tfidf_train = tfidf.fit_transform(train_df['text'])\n", "tfidf_val = tfidf.transform(val_df['text'])\n", "tfidf_test = tfidf.transform(test_df['text'])\n", "\n", "X_train = hstack([tfidf_train, train_features.values])\n", "X_val = hstack([tfidf_val, val_features.values])\n", "X_test = hstack([tfidf_test, test_features.values])\n", "\n", "y_train = train_df['label'].map(label2id).values\n", "y_val = val_df['label'].map(label2id).values\n", "y_test = test_df['label'].map(label2id).values\n", "\n", "# SMOTE\n", "smote = SMOTE(random_state=42, k_neighbors=3)\n", "X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)\n", "\n", "print(f\"After SMOTE: {X_train_balanced.shape[0]:,} samples\")" ] }, { "cell_type": "code", "execution_count": 6, "id": "7701b18d", "metadata": { "execution": { "iopub.execute_input": "2026-02-15T15:42:15.977275Z", "iopub.status.busy": "2026-02-15T15:42:15.976976Z", "iopub.status.idle": "2026-02-15T15:42:15.989126Z", "shell.execute_reply": "2026-02-15T15:42:15.988614Z" }, "papermill": { "duration": 0.018139, "end_time": "2026-02-15T15:42:15.990815", "exception": false, "start_time": "2026-02-15T15:42:15.972676", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "class_weights = 
compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)\n", "sample_weights = np.array([class_weights[y] for y in y_train_balanced])" ] }, { "cell_type": "code", "execution_count": 7, "id": "7e9494ac", "metadata": { "execution": { "iopub.execute_input": "2026-02-15T15:42:15.999038Z", "iopub.status.busy": "2026-02-15T15:42:15.998799Z", "iopub.status.idle": "2026-02-15T15:44:27.427666Z", "shell.execute_reply": "2026-02-15T15:44:27.426660Z" }, "papermill": { "duration": 131.434731, "end_time": "2026-02-15T15:44:27.429146", "exception": false, "start_time": "2026-02-15T15:42:15.994415", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0]\tvalidation_0-mlogloss:1.28819\n", "[50]\tvalidation_0-mlogloss:0.39114\n", "[100]\tvalidation_0-mlogloss:0.32716\n", "[150]\tvalidation_0-mlogloss:0.30578\n", "[199]\tvalidation_0-mlogloss:0.29511\n", "\n", "XGBoost Validation Accuracy: 0.9051\n", " precision recall f1-score support\n", "\n", " Functional 0.80 0.90 0.85 1352\n", "Non-Paradigm 0.90 0.91 0.90 4539\n", " Oop 0.94 0.89 0.92 5379\n", " Procedural 0.89 0.97 0.93 995\n", "\n", " accuracy 0.91 12265\n", " macro avg 0.88 0.92 0.90 12265\n", "weighted avg 0.91 0.91 0.91 12265\n", "\n" ] } ], "source": [ "# Train XGBoost\n", "class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)\n", "sample_weights = np.array([class_weights[y] for y in y_train_balanced])\n", "\n", "xgb_model = xgb.XGBClassifier(\n", " n_estimators=200,\n", " max_depth=6,\n", " learning_rate=0.1,\n", " subsample=0.8,\n", " colsample_bytree=0.8,\n", " objective='multi:softprob',\n", " num_class=len(label2id),\n", " random_state=42,\n", " tree_method='hist'\n", ")\n", "\n", "xgb_model.fit(X_train_balanced, y_train_balanced, sample_weight=sample_weights,\n", " eval_set=[(X_val, y_val)], verbose=50)\n", "\n", "xgb_val_preds = xgb_model.predict(X_val)\n", "xgb_val_proba = xgb_model.predict_proba(X_val)\n", 
"xgb_acc = accuracy_score(y_val, xgb_val_preds)\n", "\n", "print(f\"\\nXGBoost Validation Accuracy: {xgb_acc:.4f}\")\n", "print(classification_report(y_val, xgb_val_preds, target_names=list(label2id.keys())))" ] }, { "cell_type": "markdown", "id": "7a54b788", "metadata": { "papermill": { "duration": 0.00343, "end_time": "2026-02-15T15:44:27.436143", "exception": false, "start_time": "2026-02-15T15:44:27.432713", "status": "completed" }, "tags": [] }, "source": [ "## 4. CodeBERT Model" ] }, { "cell_type": "code", "execution_count": 8, "id": "50f1062f", "metadata": { "execution": { "iopub.execute_input": "2026-02-15T15:44:27.443936Z", "iopub.status.busy": "2026-02-15T15:44:27.443695Z", "iopub.status.idle": "2026-02-15T15:44:53.838922Z", "shell.execute_reply": "2026-02-15T15:44:53.838324Z" }, "papermill": { "duration": 26.400859, "end_time": "2026-02-15T15:44:53.840288", "exception": false, "start_time": "2026-02-15T15:44:27.439429", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "cedf920aee4a495788cebf645ad3197e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer_config.json: 0%| | 0.00/25.0 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a72f1db2e8234725b2f892727bcaa6d8", "version_major": 2, "version_minor": 0 }, "text/plain": [ "config.json: 0%| | 0.00/498 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "cc21ce844a044791adca539392a65343", "version_major": 2, "version_minor": 0 }, "text/plain": [ "vocab.json: 0.00B [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "774350adbe334c7c8e39277cbc8427d9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "merges.txt: 0.00B [00:00, ?B/s]" ] }, "metadata": 
{}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4c576aafe9a64f2691507c5be6c45f5d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "special_tokens_map.json: 0%| | 0.00/150 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1fa65ae2450a48f99e1369365855b5ef", "version_major": 2, "version_minor": 0 }, "text/plain": [ "pytorch_model.bin: 0%| | 0.00/499M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f091577548414be79855753985855190", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/19643 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "39f389cebf1b418a9bb9ed15a7ebe6a3", "version_major": 2, "version_minor": 0 }, "text/plain": [ "model.safetensors: 0%| | 0.00/499M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9842976299d74d6c82ac619d5d3f3598", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/12265 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "41246a78155941c79d30ecd7b5a1cf66", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/12265 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def prepare_dataset(df, label2id):\n", " df = df.copy()\n", " df['label'] = df['label'].map(label2id)\n", " return Dataset.from_pandas(df[['text', 'label']])\n", "\n", "train_dataset = prepare_dataset(train_df, label2id)\n", "val_dataset = prepare_dataset(val_df, label2id)\n", "test_dataset = prepare_dataset(test_df, label2id)\n", "\n", "model_name = \"microsoft/codebert-base\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "model = AutoModelForSequenceClassification.from_pretrained(\n", " model_name, num_labels=len(label2id), id2label=id2label, label2id=label2id\n", ")\n", "\n", "def tokenize(examples):\n", " return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=256)\n", "\n", "tokenized_train = train_dataset.map(tokenize, batched=True, remove_columns=['text'])\n", "tokenized_val = val_dataset.map(tokenize, batched=True, remove_columns=['text'])\n", "tokenized_test = test_dataset.map(tokenize, batched=True, remove_columns=['text'])" ] }, { "cell_type": "code", "execution_count": 9, "id": "f47907c1", "metadata": { "execution": { "iopub.execute_input": "2026-02-15T15:44:53.850211Z", "iopub.status.busy": "2026-02-15T15:44:53.849951Z", "iopub.status.idle": "2026-02-15T16:11:34.821102Z", "shell.execute_reply": "2026-02-15T16:11:34.820384Z" }, "papermill": { "duration": 1600.977769, "end_time": "2026-02-15T16:11:34.822734", "exception": false, "start_time": "2026-02-15T15:44:53.844965", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "\n", "
| Epoch | \n", "Training Loss | \n", "Validation Loss | \n", "Accuracy | \n", "F1 Macro | \n", "F1 Weighted | \n", "Precision | \n", "Recall | \n", "
|---|---|---|---|---|---|---|---|
| 1 | \n", "0.188200 | \n", "0.228999 | \n", "0.926376 | \n", "0.921905 | \n", "0.927637 | \n", "0.904159 | \n", "0.948625 | \n", "
| 2 | \n", "0.084600 | \n", "0.264108 | \n", "0.936649 | \n", "0.932176 | \n", "0.936922 | \n", "0.916146 | \n", "0.951797 | \n", "
| 3 | \n", "0.060500 | \n", "0.231579 | \n", "0.947167 | \n", "0.945263 | \n", "0.947364 | \n", "0.936371 | \n", "0.955111 | \n", "
"
],
"text/plain": [
"