Pujan-Dev committed on
Commit
33fb2d7
·
1 Parent(s): f11f069

chore: update code structure for improved readability and maintainability

Browse files
notebook/ai_vs_human/final_archi.md CHANGED
@@ -1,5 +1,5 @@
1
  # AI vs Human Text Detector V3 - Final Architecture Summary
2
-
3
  **Model Version**: V3
4
  **Type**: Hybrid Feature Engineering + TF-IDF Classifier
5
  **Output Directory**: `./v3_model/`
 
1
  # AI vs Human Text Detector V3 - Final Architecture Summary
2
+ **Dataset**: `Pujan-Dev/english_aivshuman`
3
  **Model Version**: V3
4
  **Type**: Hybrid Feature Engineering + TF-IDF Classifier
5
  **Output Directory**: `./v3_model/`
notebook/ai_vs_human_nepali/notebook/Nepali_Ai_vs_Human.ipynb ADDED
@@ -0,0 +1,1429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "901fc22d",
7
+ "metadata": {
8
+ "id": "901fc22d"
9
+ },
10
+ "outputs": [
11
+ {
12
+ "name": "stderr",
13
+ "output_type": "stream",
14
+ "text": [
15
+ "/home/pujan/miniconda3/envs/ml/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
16
+ " from .autonotebook import tqdm as notebook_tqdm\n"
17
+ ]
18
+ }
19
+ ],
20
+ "source": [
21
+ "import os\n",
22
+ "os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'\n",
23
+ "\n",
24
+ "import math\n",
25
+ "import pandas as pd\n",
26
+ "import torch\n",
27
+ "from torch.utils.data import Dataset, DataLoader\n",
28
+ "from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup\n",
29
+ "from sklearn.model_selection import train_test_split\n",
30
+ "from sklearn.metrics import classification_report, f1_score, accuracy_score\n",
31
+ "import torch.nn as nn\n",
32
+ "from torch.optim import AdamW"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 2,
38
+ "id": "70d3c048",
39
+ "metadata": {},
40
+ "outputs": [
41
+ {
42
+ "name": "stdout",
43
+ "output_type": "stream",
44
+ "text": [
45
+ "Columns: ['human_text', 'ai_generated_text']\n",
46
+ "Prepared dataset shape: (1986, 2)\n",
47
+ "label\n",
48
+ "1 996\n",
49
+ "0 990\n",
50
+ "Name: count, dtype: int64\n"
51
+ ]
52
+ },
53
+ {
54
+ "data": {
55
+ "text/html": [
56
+ "<div>\n",
57
+ "<style scoped>\n",
58
+ " .dataframe tbody tr th:only-of-type {\n",
59
+ " vertical-align: middle;\n",
60
+ " }\n",
61
+ "\n",
62
+ " .dataframe tbody tr th {\n",
63
+ " vertical-align: top;\n",
64
+ " }\n",
65
+ "\n",
66
+ " .dataframe thead th {\n",
67
+ " text-align: right;\n",
68
+ " }\n",
69
+ "</style>\n",
70
+ "<table border=\"1\" class=\"dataframe\">\n",
71
+ " <thead>\n",
72
+ " <tr style=\"text-align: right;\">\n",
73
+ " <th></th>\n",
74
+ " <th>text</th>\n",
75
+ " <th>label</th>\n",
76
+ " </tr>\n",
77
+ " </thead>\n",
78
+ " <tbody>\n",
79
+ " <tr>\n",
80
+ " <th>0</th>\n",
81
+ " <td>हामीले पार्टी एकतापछि कि दुबै पार्टीको सिद्धान...</td>\n",
82
+ " <td>0</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>1</th>\n",
86
+ " <td>एमाले प्रतिनिधिसभाको प्रत्यक्षतर्फ ८० समानुपात...</td>\n",
87
+ " <td>0</td>\n",
88
+ " </tr>\n",
89
+ " <tr>\n",
90
+ " <th>2</th>\n",
91
+ " <td>नेकपा माओवादी केन्द्रका नेता रामनारायण विडारील...</td>\n",
92
+ " <td>1</td>\n",
93
+ " </tr>\n",
94
+ " <tr>\n",
95
+ " <th>3</th>\n",
96
+ " <td>प्रदेश नं २ का मुख्यमन्त्रीको रूपमा संघीय समाज...</td>\n",
97
+ " <td>1</td>\n",
98
+ " </tr>\n",
99
+ " <tr>\n",
100
+ " <th>4</th>\n",
101
+ " <td>बिहीबार एमालेका अध्यक्ष केपी शर्मा ओली र माओवा...</td>\n",
102
+ " <td>0</td>\n",
103
+ " </tr>\n",
104
+ " </tbody>\n",
105
+ "</table>\n",
106
+ "</div>"
107
+ ],
108
+ "text/plain": [
109
+ " text label\n",
110
+ "0 हामीले पार्टी एकतापछि कि दुबै पार्टीको सिद्धान... 0\n",
111
+ "1 एमाले प्रतिनिधिसभाको प्रत्यक्षतर्फ ८० समानुपात... 0\n",
112
+ "2 नेकपा माओवादी केन्द्रका नेता रामनारायण विडारील... 1\n",
113
+ "3 प्रदेश नं २ का मुख्यमन्त्रीको रूपमा संघीय समाज... 1\n",
114
+ "4 बिहीबार एमालेका अध्यक्ष केपी शर्मा ओली र माओवा... 0"
115
+ ]
116
+ },
117
+ "execution_count": 2,
118
+ "metadata": {},
119
+ "output_type": "execute_result"
120
+ }
121
+ ],
122
+ "source": [
123
+ "# Load Dataset and convert to binary classification format\n",
124
+ "DATA_PATH = '../DATASET/new_data.csv'\n",
125
+ "raw_df = pd.read_csv(DATA_PATH)\n",
126
+ "print('Columns:', raw_df.columns.tolist())\n",
127
+ "\n",
128
+ "required_cols = ['human_text', 'ai_generated_text']\n",
129
+ "missing = [c for c in required_cols if c not in raw_df.columns]\n",
130
+ "if missing:\n",
131
+ " raise ValueError(f'Missing required columns: {missing}')\n",
132
+ "\n",
133
+ "# Build unified training dataframe: text + label (0=Human, 1=AI)\n",
134
+ "df_human = raw_df[['human_text']].dropna().rename(columns={'human_text': 'text'})\n",
135
+ "df_human['label'] = 0\n",
136
+ "\n",
137
+ "df_ai = raw_df[['ai_generated_text']].dropna().rename(columns={'ai_generated_text': 'text'})\n",
138
+ "df_ai['label'] = 1\n",
139
+ "\n",
140
+ "df = pd.concat([df_human, df_ai], ignore_index=True)\n",
141
+ "df['text'] = df['text'].astype(str).str.strip()\n",
142
+ "df = df[df['text'].str.len() > 10].drop_duplicates(subset=['text']).sample(frac=1, random_state=42).reset_index(drop=True)\n",
143
+ "\n",
144
+ "print('Prepared dataset shape:', df.shape)\n",
145
+ "print(df['label'].value_counts())\n",
146
+ "df.head()"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": 3,
152
+ "id": "f93d4c7a",
153
+ "metadata": {
154
+ "id": "f93d4c7a"
155
+ },
156
+ "outputs": [
157
+ {
158
+ "name": "stdout",
159
+ "output_type": "stream",
160
+ "text": [
161
+ "Nulls in text: 0\n",
162
+ "Nulls in label: 0\n",
163
+ "Example text sample:\n",
164
+ "हामीले पार्टी एकतापछि कि दुबै पार्टीको सिद्धान्त राख्ने कि राख्ने माओवाद र जबज दुबै नराख्ने भन्दा उहाँहरु मान्नु भएन । एमालेका साथीहरुले जवजको विषय उठाउन चाहनुभएन । सिद्धान्तको विषय नै नमिलेपछि पार्टी एकता संयोजन समितिको बैठक रोकियो कार्यदलका एक सदस्\n"
165
+ ]
166
+ }
167
+ ],
168
+ "source": [
169
+ "# Quick sanity checks\n",
170
+ "print('Nulls in text:', int(df['text'].isnull().sum()))\n",
171
+ "print('Nulls in label:', int(df['label'].isnull().sum()))\n",
172
+ "print('Example text sample:')\n",
173
+ "print(df.loc[0, 'text'][:250])"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": 4,
179
+ "id": "ba4a933f",
180
+ "metadata": {
181
+ "colab": {
182
+ "base_uri": "https://localhost:8080/",
183
+ "height": 206
184
+ },
185
+ "id": "ba4a933f",
186
+ "outputId": "9bf5f0a5-c547-43f1-b8f2-a580024d74a9"
187
+ },
188
+ "outputs": [
189
+ {
190
+ "name": "stdout",
191
+ "output_type": "stream",
192
+ "text": [
193
+ "label\n",
194
+ "AI 0.501511\n",
195
+ "Human 0.498489\n",
196
+ "Name: proportion, dtype: float64\n"
197
+ ]
198
+ },
199
+ {
200
+ "data": {
201
+ "text/plain": [
202
+ "label \n",
203
+ "0 count 990.000000\n",
204
+ " mean 455.551515\n",
205
+ " std 56.825837\n",
206
+ " min 299.000000\n",
207
+ " 25% 418.000000\n",
208
+ " 50% 458.000000\n",
209
+ " 75% 494.000000\n",
210
+ " max 629.000000\n",
211
+ "1 count 996.000000\n",
212
+ " mean 284.231928\n",
213
+ " std 67.165254\n",
214
+ " min 103.000000\n",
215
+ " 25% 238.000000\n",
216
+ " 50% 282.000000\n",
217
+ " 75% 331.000000\n",
218
+ " max 433.000000\n",
219
+ "Name: text, dtype: float64"
220
+ ]
221
+ },
222
+ "execution_count": 4,
223
+ "metadata": {},
224
+ "output_type": "execute_result"
225
+ }
226
+ ],
227
+ "source": [
228
+ "# Class balance\n",
229
+ "print(df['label'].value_counts(normalize=True).rename({0: 'Human', 1: 'AI'}))\n",
230
+ "df.groupby('label')['text'].apply(lambda s: s.str.len().describe())"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": 5,
236
+ "id": "d7b48175",
237
+ "metadata": {
238
+ "colab": {
239
+ "base_uri": "https://localhost:8080/",
240
+ "height": 206
241
+ },
242
+ "id": "d7b48175",
243
+ "outputId": "08bc4562-874c-40c1-d554-1d809a6d0e31"
244
+ },
245
+ "outputs": [
246
+ {
247
+ "data": {
248
+ "text/plain": [
249
+ "<matplotlib.legend.Legend at 0x7fef748b5290>"
250
+ ]
251
+ },
252
+ "execution_count": 5,
253
+ "metadata": {},
254
+ "output_type": "execute_result"
255
+ },
256
+ {
257
+ "data": {
258
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAvwAAAGHCAYAAADMVYYQAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAARoNJREFUeJzt3X1cVGX+//H3gMMIAt7HTaJiIeVduVomVmgFu5aurtXWWq1Wa5bdaO5+7WtWDrsFZuVSa9nPttS2yG03c7Wvd6SJlbmp5epqmbspmkpkoaAoDMz1+6OYHAEdYGCY4+v5eMxDz3Wuuc7nzGdGPlxec47NGGMEAAAAwJJCAh0AAAAAgMZDwQ8AAABYGAU/AAAAYGEU/AAAAICFUfADAAAAFkbBDwAAAFgYBT8AAABgYRT8AAAAgIVR8AMAAAAWRsEPIOjZbDafHmvXrvXL8Q4cOCCn06ktW7b41H/t2rWy2Wz6+9//7pfj+1tpaamcTmeNr4/T6ZTNZtOhQ4fqNfbYsWO9ctCqVSt17dpVP//5zzVv3jyVlZVVe87gwYM1ePDgOh1nx44dcjqd2rNnT52ed+qx9uzZI5vNpqeffrpO45xJZmamFi9eXK296r3hr/cmANSkRaADAICG+uijj7y2//CHP+i9997TmjVrvNp79Ojhl+MdOHBAGRkZ6tq1qy6++GK/jBlIpaWlysjIkKQ6F9q+CA8P9+Ti+PHj2rdvn5YvX65x48bpmWee0YoVK9SpUydP/xdeeKHOx9ixY4cyMjI0ePBgde3a1efn1edY9ZGZmakbbrhBI0eO9Gr/yU9+oo8++shv700AqAkFP4Cgd9lll3ltd+zYUSEhIdXaERg15eLXv/61br/9dg0bNkw33HCDNmzY4NnXFMVvaWmpIiIiAl5oR0dH8z4F0OhY0gPgrFBeXq7HH39cF1xwgRwOhzp27Kjbb79d33zzjafPjBkzFBISoqVLl3o9d+zYsYqIiNC2bdu0du1aXXLJJZKk22+/3bNUxel0NjjGgoICjR8/Xp06dVJYWJgSExOVkZGhiooKT5+Tl5zMmjVLiYmJioyM1MCBA72K5iovvfSSunfvLofDoR49eignJ0djx471zILv2bNHHTt2lCRlZGR4zmfs2LFe43z99df61a9+pdatWysmJkZ33HGHjhw50qDzTU9P17hx4/TPf/5T69at87TXtKRnzpw5uuiiixQZGamoqChdcMEFevjhhyVJ8+fP14033ihJGjJkiOcc5s+f7xmvV69eWrdunVJSUhQREaE77rij1mNJktvt1hNPPKHOnTurZcuW6t+/v1avXu3V5+TX8WRVy6Cq2Gw2HTt2TAsWLPDEVnXM2pb0LFmyRAMHDlRERISioqKUlpZW7X+yqo6zfft2v+cGgLVQ8AOwPLfbrREjRmjGjBkaPXq0/u///k8zZsxQbm6uBg8erOPHj0uSHnroIQ0dOlRjxoxRfn6+JGnevHlasGCB/vSnP6l37976yU9+onnz5kmSHnnkEX300Uf66KOP9Jvf/KZBMRYUFOjSSy/VypUr9dhjj2n58uW68847lZWVpXHjxlXr//zzzys3N1fZ2dl6/fXXdezYMV177bVehd7cuXN11113qU+fPlq0aJEeeeQRZWRkeBWXcXFxWrFihSTpzjvv9JzPo48+6nW866+/Xt27d9dbb72l//3f/1VOTo4efPDBBp2zJP385z+XJK+C/1QLFy7UhAkTlJqaqrfffluLFy/Wgw8+qGPHjkmSrrvuOmVmZnpel6pzuO666zxjHDx4ULfeeqtGjx6tZcuWacKECaeNa/bs2VqxYoWys7P12muvKSQkREOHDq1WdPvio48+Unh4uK699lpPbKdbSpSTk6MRI0YoOjpab7zxhl5++WUVFRVp8ODB+uCDD6r1b6zcALAQAwAWM2bMGNOqVSvP9htvvGEkmbfeesur38aNG40k88ILL3jaDh06
ZDp16mQuvfRS88knn5iIiAhz66231vi8efPm+RTPe++9ZySZv/3tb7X2GT9+vImMjDT5+fle7U8//bSRZLZv326MMWb37t1Gkundu7epqKjw9Pv444+NJPPGG28YY4yprKw0sbGxZsCAAV7j5efnG7vdbrp06eJp++abb4wkM3369GpxTZ8+3UgyM2fO9GqfMGGCadmypXG73ac991NzcarPPvvMSDL33HOPpy01NdWkpqZ6tu+77z7Tpk2b0x7nb3/7m5Fk3nvvvWr7UlNTjSSzevXqGvedfKyq1zc+Pt4cP37c015cXGzatWtnrrnmGq9zO/l1rFL1mp2sVatWZsyYMdX6Vr03quKurKw08fHxpnfv3qaystLTr6SkxJxzzjkmJSWl2nHqmxsAZw9m+AFY3jvvvKM2bdpo+PDhqqio8DwuvvhixcbGes14t2/fXn/961/1ySefKCUlRZ07d9aLL77YJDEOGTJE8fHxXjEOHTpUkpSXl+fV/7rrrlNoaKhnu0+fPpLk+Z+JnTt3qqCgQL/85S+9nte5c2cNGjSozvFVzcSffLwTJ06osLCwzmOdzBhzxj6XXnqpDh8+rF/96lf6xz/+Ua8rBrVt21ZXXXWVz/1HjRqlli1berajoqI0fPhwrVu3TpWVlXU+vq927typAwcO6LbbblNIyI8/oiMjI3X99ddrw4YNKi0t9XpOY+UGgHVQ8AOwvK+//lqHDx9WWFiY7Ha716OgoKBaATlgwAD17NlTJ06c0D333KNWrVo1SYxLly6tFl/Pnj0lqVqM7du399p2OByS5Fme9O2330qSYmJiqh2rprYzOdPx6qvqF5T4+Pha+9x222165ZVXlJ+fr+uvv17nnHOOBgwYoNzcXJ+PExcXV6e4YmNja2wrLy/X0aNH6zRWXVTlraZ44+Pj5Xa7VVRU5NXeWLkBYB1cpQeA5XXo0EHt27f3rFU/VVRUlNf29OnTtW3bNvXr10+PPfaYhg0bpm7dujV6jH369NETTzxR4/7TFcQ1qSoCv/7662r7CgoK6h5gI1myZImkM18O9Pbbb9ftt9+uY8eOad26dZo+fbqGDRumL774Ql26dDnjcU7+Eq0vanqNCgoKFBYWpsjISElSy5Yta7yPQH3vWSD9mLeDBw9W23fgwAGFhISobdu29R4fwNmJGX4Aljds2DB9++23qqysVP/+/as9kpOTPX1zc3OVlZWlRx55RLm5uWrdurVuuukmlZeXe/o0xgzqsGHD9O9//1vnnXdejTHWteBPTk5WbGys3nzzTa/2vXv3av369V5tgZoRzs3N1Z///GelpKTo8ssv9+k5rVq10tChQzVt2jSVl5dr+/btkvx/DosWLdKJEyc82yUlJVq6dKmuuOIKz1Kqrl27qrCw0OuXqvLycq1cubLaeA6Hw6fYkpOTde655yonJ8drudOxY8f01ltvea7cAwB1wQw/AMu7+eab9frrr+vaa6/VxIkTdemll8put+urr77Se++9pxEjRugXv/iF50ouqampmj59ukJCQvTXv/5VV155paZMmaLs7GxJ0nnnnafw8HC9/vrruvDCCxUZGan4+PgzFuU1XTZTklJTU/X73/9eubm5SklJ0QMPPKDk5GSdOHFCe/bs0bJly/Tiiy963ZzqTEJCQpSRkaHx48frhhtu0B133KHDhw8rIyNDcXFxXuvDo6Ki1KVLF/3jH//Q1VdfrXbt2qlDhw51uoHV6bjdbs+5l5WVae/evVq+fLnefPNNXXjhhdV+KTnVuHHjFB4erkGDBikuLk4FBQXKyspS69atPZdI7dWrl6Tvr0wUFRWlli1bKjExsdpyF1+FhoYqLS1NkydPltvt1pNPPqni4mLPDcok6aabbtJjjz2mm2++Wf/zP/+jEydO6LnnnqtxjX/v3r21du1aLV26VHFxcYqKivL6RbNKSEiIZs6cqVtuuUXDhg3T+PHjVVZWpqeeekqHDx/WjBkz6nU+AM5y
gf7WMAD4W01XhnG5XObpp582F110kWnZsqWJjIw0F1xwgRk/frzZtWuXqaioMKmpqSYmJsYcPHjQ67lPPfWUkWTefvttT9sbb7xhLrjgAmO322u9wk2Vqiux1PaoukLLN998Yx544AGTmJho7Ha7adeunenXr5+ZNm2aOXr0qDHmx6vIPPXUU9WOU1Mcc+fONeeff74JCwsz3bt3N6+88ooZMWKE6du3r1e/d9991/Tt29c4HA4jyXNFmaorwXzzzTde/efNm2ckmd27d9d63sZ8n4uTzzU8PNx07tzZDB8+3LzyyiumrKys2nNOvXLOggULzJAhQ0xMTIwJCwsz8fHx5pe//KXZunWr1/Oys7NNYmKiCQ0N9bqKUmpqqunZs2eN8dV2lZ4nn3zSZGRkmE6dOpmwsDDTt29fs3LlymrPX7Zsmbn44otNeHi46datm5k9e3aNV+nZsmWLGTRokImIiDCSPMc89So9VRYvXmwGDBhgWrZsaVq1amWuvvpq8+GHH3r1aWhuAJw9bMb4cIkEAIAlHD58WN27d9fIkSM1d+7cQIcDAGgCLOkBAIsqKCjQE088oSFDhqh9+/bKz8/XH//4R5WUlGjixImBDg8A0EQo+AHAohwOh/bs2aMJEybou+++U0REhC677DK9+OKLnst9AgCsjyU9AAAAgIVxWU4AAADAwij4AQAAAAuj4AcAAAAszPJf2nW73Tpw4ICioqLqfGt1AAAAoLkyxqikpETx8fFeN1Q8leUL/gMHDighISHQYQAAAACNYt++fae9G7vlC/6oqChJ378Q0dHRjX48l8ulVatWKT09XXa7vdGPh6ZHjq2N/FofObY28mt95PhHxcXFSkhI8NS7tbF8wV+1jCc6OrrJCv6IiAhFR0ef9W9CqyLH1kZ+rY8cWxv5tT5yXN2Zlq3zpV0AAADAwij4AQAAAAuj4AcAAAAszPJr+AEAAFB3lZWVcrlcgQ6jGpfLpRYtWujEiROqrKwMdDiNKjQ0VC1atGjwpeUp+AEAAODl6NGj+uqrr2SMCXQo1RhjFBsbq3379p0V91iKiIhQXFycwsLC6j0GBT8AAAA8Kisr9dVXXykiIkIdO3ZsdkW12+3W0aNHFRkZedqbTQU7Y4zKy8v1zTffaPfu3UpKSqr3+VLwAwAAwMPlcskYo44dOyo8PDzQ4VTjdrtVXl6uli1bWrrgl6Tw8HDZ7Xbl5+d7zrk+rP0qAQAAoF6a28z+2cofv9RQ8AMAAAAWRsEPAAAAWBgFPwAAAGBhfGkXAACc0dRF23zqlzWqdyNHgkDx9T3gL3V9L40dO1aHDx/W4sWLvdrXrl2rIUOGqKioSG3atPFfgEGEGX4AAADAwij4AQAAcFZwOp26+OKLvdqys7PVtWtXz/bYsWM1cuRIZWZmKiYmRm3atFFGRoYqKir0P//zP2rXrp06deqkV155xWuchx56SN27d1dERIS6deumRx991OtOxVXH/stf/qKuXbuqdevWuvnmm1VSUtKYpyyJgh8AAADwsmbNGh04cEDr1q3TrFmz5HQ6NWzYMLVt21b//Oc/dffdd+vuu+/Wvn37PM+JiorS/PnztWPHDj377LN66aWX9Mc//tFr3P/+979avHix3nnnHb3zzjvKy8vTjBkzGv18KPgBAABgCe+8844iIyO9HkOHDq3zOO3atdNzzz2n5ORk3XHHHUpOTlZpaakefvhhJSUlaerUqQoLC9OHH37oec4jjzyilJQUde3aVcOHD9dvf/tbvfnmm17jut1uzZ8/X7169dIVV1yh2267TatXr27weZ9JQAv+rl27ymazVXvce++9kr6/pbDT6VR8fLzCw8M1ePBgbd++PZAhAwAAoJkaMmSItmzZ4vX485//XOdxevbs6XXDq5iYGPXu/eOXiENDQ9W+fXsVFhZ62v7+97/r8ssvV2xsrCIjI/Xoo49q7969XuN27dpVUVFRnu24uDivMRpLQAv+jRs36uDB
g55Hbm6uJOnGG2+UJM2cOVOzZs3S7NmztXHjRsXGxiotLa1J1joBAAAguLRq1Urnn3++1+Pcc8/17A8JCZExxus5J6+zr2K32722bTZbjW1ut1uStGHDBt18880aOnSo3nnnHX366aeaNm2aysvLzzhu1RiNKaCX5ezYsaPX9owZM3TeeecpNTVVxhhlZ2dr2rRpGjVqlCRpwYIFiomJUU5OjsaPHx+IkAEAABCkOnbsqIKCAhljZLPZJElbtmxp8LgffvihunTpomnTpnna8vPzGzyuvzSb6/CXl5frtdde0+TJk2Wz2fTll1+qoKBA6enpnj4Oh0Opqalav359rQV/WVmZysrKPNvFxcWSvv/trabf4Pyt6hhNcSwEBjm2NvJrfeS4fkLl2yxkoF9X8ttwLpdLxhi53W7v2edTZsYbW20z31Uz9FUxntx+atvJ47jdbl155ZX65ptv9OSTT+r666/XypUrtXz5ckVHR3v61TbO6dq6deumvXv3KicnR5dccomWLVumt99+2+v4VXGfGvPpzrVqnzFGLpdLoaGhXvt8fZ83m4J/8eLFOnz4sMaOHStJKigokPT9mqmTxcTEnPY3pqysLGVkZFRrX7VqlSIiIvwX8BlULU+CdZFjayO/1keO6+aS0DP3kaRly/Y0ahy+Ir/116JFC8XGxuro0aNeS1Ieurpzk8ZRNWlbm1OXeLtcLlVUVFR7Xmlpqaf/ueeeq6efflqzZs3S448/ruHDh+vee+/VggULvCaJTx2noqJC5eXlXm1ut1snTpxQcXGxhgwZonvuuUf333+/ysvLlZaWpt/97neaMWOG5zllZWWqrKz0GuPEiRNyu92nPdfy8nIdP35c69atU0VFRY3ndiY2c+pCpgD56U9/qrCwMC1dulSStH79eg0aNEgHDhxQXFycp9+4ceO0b98+rVixosZxaprhT0hI0KFDhxQdHd24J6Hv3yS5ublKS0urtk4L1kCOrY38Wh85rp+MpTt86jd9eI9GjuT0yG/DnThxQvv27VPXrl3VsmXLQIdTjTFGJSUlioqK8izLsbITJ05oz549SkhIqJaP4uJidejQQUeOHDltndssZvjz8/P17rvvatGiRZ622NhYSd/P9J9c8BcWFlab9T+Zw+GQw+Go1m6325v0g9/Ux0PTI8fWRn6tjxzXTaWP1/loLq8p+a2/yspK2Ww2hYSEeF2pprmoWv5SFaPVhYSEeL40fOp72tf3eLN4lebNm6dzzjlH1113nactMTFRsbGxXv8lV15erry8PKWkpAQiTAAAACDoBHyG3+12a968eRozZoxatPgxHJvNpkmTJikzM1NJSUlKSkpSZmamIiIiNHr06ABGDAAAAASPgBf87777rvbu3as77rij2r4pU6bo+PHjmjBhgoqKijRgwACtWrXK64YFAAAAAGoX8II/PT292g0QqthsNjmdTjmdzqYNCgAAALCIZrGGHwAAAEDjoOAHAAAALIyCHwAAALAwCn4AAADAwgL+pV0AAAAEgaUTm/Z4w59t2uNZGDP8AAAAsIz169crNDRUP/vZz7za9+zZI5vNpi1btgQmsACi4AcAAIBlvPLKK7r//vv1wQcfaO/evYEOp1mg4AcAAIAlHDt2TG+++abuueceDRs2TPPnzw90SM0CBT8AAAAs4a9//auSk5OVnJysW2+9VfPmzav1Bq9nEwp+AAAAWMLLL7+sW2+9VZL0s5/9TEePHtXq1asDHFXgUfADAAAg6O3cuVMff/yxbr75ZklSixYtdNNNN+mVV14JcGSBx2U5AQAAEPRefvllVVRU6Nxzz/W0GWNkt9tVVFQUwMgCjxl+AAAABLWKigq9+uqreuaZZ7RlyxbP41//+pe6dOmi119/PdAhBhQz/AAAAAhq77zzjoqKinTnnXeqdevWXvtuuOEGvfzyyxo2bFiAogs8Cn4AAACcWTO+8+3LL7+sa665plqxL0nXX3+9MjMz9d133wUg
suaBgh8AAABBbenSpbXu+8lPfuK5NOfZeolO1vADAAAAFkbBDwAAAFgYBT8AAABgYRT8AAAAgIVR8AMAAKCas/ULrs2NP/JAwQ8AAACP0NBQSVJ5eXmAI4EklZaWSpLsdnu9x+CynAAAAPBo0aKFIiIi9M0338hutyskpHnND7vdbpWXl+vEiRPNLjZ/MsaotLRUhYWFatOmjecXsfqg4AcAAICHzWZTXFycdu/erfz8/ECHU40xRsePH1d4eLhsNlugw2l0bdq0UWxsbIPGoOAHAACAl7CwMCUlJTXLZT0ul0vr1q3TlVde2aBlLsHAbrc3aGa/CgU/AABnsamLtgU6BDRTISEhatmyZaDDqCY0NFQVFRVq2bKl5Qt+f7HuwicAAAAAFPwAAACAlVHwAwAAABYW8IJ///79uvXWW9W+fXtFRETo4osv1ubNmz37jTFyOp2Kj49XeHi4Bg8erO3btwcwYgAAACB4BLTgLyoq0qBBg2S327V8+XLt2LFDzzzzjNq0aePpM3PmTM2aNUuzZ8/Wxo0bFRsbq7S0NJWUlAQucAAAACBIBPQqPU8++aQSEhI0b948T1vXrl09fzfGKDs7W9OmTdOoUaMkSQsWLFBMTIxycnI0fvz4pg4ZAAAACCoBLfiXLFmin/70p7rxxhuVl5enc889VxMmTNC4ceMkSbt371ZBQYHS09M9z3E4HEpNTdX69etrLPjLyspUVlbm2S4uLpb0/TVbXS5XI5+RPMdoimMhMMixtZFf6yPH3kLl9ut4gX5dya/1keMf+foa2IwxppFjqVXVtV0nT56sG2+8UR9//LEmTZqk//f//p9+/etfa/369Ro0aJD279+v+Ph4z/Puuusu5efna+XKldXGdDqdysjIqNaek5OjiIiIxjsZAAAAoAmVlpZq9OjROnLkiKKjo2vtF9AZfrfbrf79+yszM1OS1LdvX23fvl1z5szRr3/9a0+/U2+bbIyp9VbKU6dO1eTJkz3bxcXFSkhIUHp6+mlfCH9xuVzKzc1VWloaN4OwqKDK8fKHfOs39MnGjSOIBFV+US/k2FvG0h1+HW/68B5+Ha+uyK/1keMfVa1kOZOAFvxxcXHq0cP7H4YLL7xQb731liQpNjZWklRQUKC4uDhPn8LCQsXExNQ4psPhkMPhqNZut9ub9E3R1MdD0wuKHNsqfevX3M8jAIIiv2gQcvy9Sj9fv6O5vKbk1/rIse+ft4BepWfQoEHauXOnV9sXX3yhLl26SJISExMVGxur3Nxcz/7y8nLl5eUpJSWlSWMFAAAAglFAZ/gffPBBpaSkKDMzU7/85S/18ccfa+7cuZo7d66k75fyTJo0SZmZmUpKSlJSUpIyMzMVERGh0aNHBzJ0AAAAICgEtOC/5JJL9Pbbb2vq1Kn6/e9/r8TERGVnZ+uWW27x9JkyZYqOHz+uCRMmqKioSAMGDNCqVasUFRUVwMgBAACA4BDQgl+Shg0bpmHDhtW632azyel0yul0Nl1QAAAAgEUEdA0/AAAAgMZFwQ8AAABYGAU/AAAAYGEU/AAAAICFUfADAAAAFkbBDwAAAFhYwC/LCeAstnSib/2GP9u4cQBBZOqibT71yxrVu5EjARAsmOEHAAAALIyCHwAAALAwCn4AAADAwij4AQAAAAuj4AcAAAAsjIIfAAAAsDAKfgAAAMDCuA4/AABoctxPAGg6zPADAAAAFkbBDwAAAFgYBT8AAABgYazhB4CGWDrRt37Dn23cOAAAqAUz/AAAAICFUfADAAAAFkbBDwAAAFgYBT8AAABgYRT8AAAAgIVR8AMAAAAWRsEPAAAAWBjX4QcAoBmYumhboEMAYFHM8AMAAAAWRsEPAAAAWBgFPwAAAGBhAS34nU6nbDab1yM2Ntaz3xgjp9Op+Ph4hYeHa/Dgwdq+fXsAIwYAAACCS8Bn+Hv27KmDBw96Htu2/filpZkzZ2rWrFmaPXu2Nm7cqNjYWKWlpamkpCSAEQMA
AADBI+AFf4sWLRQbG+t5dOzYUdL3s/vZ2dmaNm2aRo0apV69emnBggUqLS1VTk5OgKMGAAAAgkPAL8u5a9cuxcfHy+FwaMCAAcrMzFS3bt20e/duFRQUKD093dPX4XAoNTVV69ev1/jx42scr6ysTGVlZZ7t4uJiSZLL5ZLL5Wrck/nhOCf/CesJqhybUN/6BepcmmF8dc5vMzwHnF5z/QyHyu3X8Xw9v+Z+3LrmqbnmF/5Djn/k62tgM8aYRo6lVsuXL1dpaam6d++ur7/+Wo8//rg+//xzbd++XTt37tSgQYO0f/9+xcfHe55z1113KT8/XytXrqxxTKfTqYyMjGrtOTk5ioiIaLRzAQAAAJpSaWmpRo8erSNHjig6OrrWfgEt+E917NgxnXfeeZoyZYouu+wyDRo0SAcOHFBcXJynz7hx47Rv3z6tWLGixjFqmuFPSEjQoUOHTvtC+IvL5VJubq7S0tJkt9sb/XhoekGV4+UP+dZv6JONG0dtmmF8dc5vMzwHnF5z/QxnLN3h1/GmD+9hieP6Ol6V5ppf+A85/lFxcbE6dOhwxoI/4Et6TtaqVSv17t1bu3bt0siRIyVJBQUFXgV/YWGhYmJiah3D4XDI4XBUa7fb7U36pmjq46HpBUWObZW+9QvUeTTj+HzObzM+B5xec/sMV/r5a3W+nltzP259c9Tc8gv/I8e+fz4C/qXdk5WVlemzzz5TXFycEhMTFRsbq9zcXM/+8vJy5eXlKSUlJYBRAgAAAMEjoDP8v/vd7zR8+HB17txZhYWFevzxx1VcXKwxY8bIZrNp0qRJyszMVFJSkpKSkpSZmamIiAiNHj06kGEDOBssf8j32XvgNKYu2nbmTgDQiAJa8H/11Vf61a9+pUOHDqljx4667LLLtGHDBnXp0kWSNGXKFB0/flwTJkxQUVGRBgwYoFWrVikqKiqQYQMAAABBI6AF/8KFC0+732azyel0yul0Nk1AAAAAgMU0qzX8AAAAAPyrWV2lB0Azt3Sib/2GP9u4cQAAAJ8xww8AAABYGAU/AAAAYGEU/AAAAICFsYYfAAA0W3W9j0Go3LokVMpYuqPGu/lmjertr9CAoMEMPwAAAGBhFPwAAACAhVHwAwAAABbGGn4AACyormvfAVgXM/wAAACAhVHwAwAAABZGwQ8AAABYGGv4ATR/Syf61m/4s40bBwAAQYgZfgAAAMDCKPgBAAAAC6PgBwAAACyMNfwAzi5n+j6ACZWU2iShAFbE9f+B5ocZfgAAAMDCKPgBAAAAC6PgBwAAACyMgh8AAACwML60C8D/fL1RllWOCwBAM8YMPwAAAGBhFPwAAACAhdWr4O/WrZu+/fbbau2HDx9Wt27dGhwUAAAAAP+o1xr+PXv2qLKyslp7WVmZ9u/f3+CgAABo7rjBFIBgUaeCf8mSJZ6/r1y5Uq1bt/ZsV1ZWavXq1eratavfggMAAADQMHUq+EeOHClJstlsGjNmjNc+u92url276plnnvFbcAAAAAAapk4Fv9vtliQlJiZq48aN6tChQ6MEBQAAAMA/6rWGf/fu3f6OQ1lZWXr44Yc1ceJEZWdnS5KMMcrIyNDcuXNVVFSkAQMG6Pnnn1fPnj39fnwAAGB9vn73ImtU70aOBGg69b7x1urVq7V69WoVFhZ6Zv6rvPLKK3Uaa+PGjZo7d6769Onj1T5z5kzNmjVL8+fPV/fu3fX4448rLS1NO3fuVFRUVH1DBwAAAM4a9bosZ0ZGhtLT07V69WodOnRIRUVFXo+6OHr0qG655Ra99NJLatu2rafdGKPs7GxNmzZNo0aNUq9evbRgwQKVlpYqJyenPmEDAAAAZ516zfC/+OKLmj9/vm677bYGB3Dvvffquuuu0zXXXKPHH3/c0757924VFBQoPT3d0+ZwOJSamqr169dr/PjxNY5XVlamsrIyz3ZxcbEkyeVyyeVyNTjeM6k6RlMcC4ERVDk2ob718/VcfB0viLl+OEeX
v881GN4vZwl/fYZD5T5zJzS5kB/yEtLA/ATFv/FnqaD6OdzIfH0N6lXwl5eXKyUlpT5P9bJw4UJ98skn2rhxY7V9BQUFkqSYmBiv9piYGOXn59c6ZlZWljIyMqq1r1q1ShEREQ2M2He5ublNdiwERnDkONW3bsuW+Xc8C8jV5ZLx44A+v8ZoKg39DF9i/d9/g1q/0L0Nev6yZXv8EwgaTXD8HG5cpaWlPvWrV8H/m9/8Rjk5OXr00Ufr83RJ0r59+zRx4kStWrVKLVu2rLWfzWbz2jbGVGs72dSpUzV58mTPdnFxsRISEpSenq7o6Oh6x+srl8ul3NxcpaWlyW63N/rx0PSCKsfLH/Kt39An/TteEHOZUOXqcqXpA9lt1W8wWG++vsZodP76DGcs3eHHqOAvIXKrX+heba7sLHf9Vi5LkqYP7+HHqOBPQfVzuJFVrWQ5k3oV/CdOnNDcuXP17rvvqk+fPtVe7FmzZp1xjM2bN6uwsFD9+vXztFVWVmrdunWaPXu2du7cKen7mf64uDhPn8LCwmqz/idzOBxyOBzV2u12e5O+KZr6eGh6QZFjXwtWX8/DnwVwc2Yku63SvwV/c3+vnIUa+hmubEAxicbnVkiDctTs/31HcPwcbmS+nn+9Cv6tW7fq4osvliT9+9//9tp3utn3k1199dXats370li33367LrjgAj300EPq1q2bYmNjlZubq759+0r6filRXl6ennySmTIAAADAF/Uq+N97770GHzgqKkq9evXyamvVqpXat2/vaZ80aZIyMzOVlJSkpKQkZWZmKiIiQqNHj27w8QEAAICzQb2vw98UpkyZouPHj2vChAmeG2+tWrWKa/ADAAAAPqpXwT9kyJDTLt1Zs2ZNvYJZu3at17bNZpPT6ZTT6azXeAAAAMDZrl4Ff9X6/Soul0tbtmzRv//9b40ZM8YfcQEAAADwg3oV/H/84x9rbHc6nTp69GiDAgIAAADgP369ptitt96qV155xZ9DAgAAAGgAvxb8H3300WlvogUAAACgadVrSc+oUaO8to0xOnjwoDZt2tSgu+8CAAAA8K96FfytW7f22g4JCVFycrJ+//vfKz093S+BAQAQCFMXbTtzJwAIIvUq+OfNm+fvOAAAAAA0ggbdeGvz5s367LPPZLPZ1KNHD/Xt29dfcQEAAADwg3oV/IWFhbr55pu1du1atWnTRsYYHTlyREOGDNHChQvVsWNHf8cJAAAAoB7qVfDff//9Ki4u1vbt23XhhRdKknbs2KExY8bogQce0BtvvOHXIAGcYunEQEcAAACCRL0K/hUrVujdd9/1FPuS1KNHDz3//PN8aRcAAABoRup1HX632y273V6t3W63y+12NzgoAAAAAP5Rr4L/qquu0sSJE3XgwAFP2/79+/Xggw/q6quv9ltwAAAAABqmXkt6Zs+erREjRqhr165KSEiQzWbT3r171bt3b7322mv+jhFAY+M7AQAAWFa9Cv6EhAR98sknys3N1eeffy5jjHr06KFrrrnG3/EBAAAAaIA6LelZs2aNevTooeLiYklSWlqa7r//fj3wwAO65JJL1LNnT73//vuNEigAAACAuqtTwZ+dna1x48YpOjq62r7WrVtr/PjxmjVrlt+CAwAAANAwdVrS869//UtPPvlkrfvT09P19NNPNzgoAMAZ+Pq9i+HPNm4cAIBmr04z/F9//XWNl+Os0qJFC33zzTcNDgoAAACAf9Sp4D/33HO1bdu2Wvdv3bpVcXFxDQ4KAAAAgH/UqeC/9tpr9dhjj+nEiRPV9h0/flzTp0/XsGHD/BYcAAAAgIap0xr+Rx55RIsWLVL37t113333KTk5WTabTZ999pmef/55VVZWatq0aY0VKxC8WG8N3gMBN3XR9/9DHSq3LgmVMpbuUGX97j8JAEGlTgV/TEyM1q9fr3vuuUdTp06VMUaSZLPZ9NOf/lQvvPCCYmJiGiVQAAAAAHVX5xtvdenSRcuWLVNR
UZH+85//yBijpKQktW3btjHiAwAAANAA9brTriS1bdtWl1xyiT9jAQAAAOBnLF4EAAAALIyCHwAAALAwCn4AAADAwij4AQAAAAuj4AcAAAAsLKAF/5w5c9SnTx9FR0crOjpaAwcO1PLlyz37jTFyOp2Kj49XeHi4Bg8erO3btwcwYgAAACC4BLTg79Spk2bMmKFNmzZp06ZNuuqqqzRixAhPUT9z5kzNmjVLs2fP1saNGxUbG6u0tDSVlJQEMmwAAAAgaAS04B8+fLiuvfZade/eXd27d9cTTzyhyMhIbdiwQcYYZWdna9q0aRo1apR69eqlBQsWqLS0VDk5OYEMGwAAAAga9b7xlr9VVlbqb3/7m44dO6aBAwdq9+7dKigoUHp6uqePw+FQamqq1q9fr/Hjx9c4TllZmcrKyjzbxcXFkiSXyyWXy9W4J/HDcU7+E9ZTrxybUF8H9+94qDPXD6+tK1Cvsb/fA/xb5BEqtyQp5JQ/YS3+yi8/x5svaq0f+foa2IwxppFjOa1t27Zp4MCBOnHihCIjI5WTk6Nrr71W69ev16BBg7R//37Fx8d7+t91113Kz8/XypUraxzP6XQqIyOjWntOTo4iIiIa7TwAAACAplRaWqrRo0fryJEjio6OrrVfwGf4k5OTtWXLFh0+fFhvvfWWxowZo7y8PM9+m83m1d8YU63tZFOnTtXkyZM928XFxUpISFB6evppXwh/cblcys3NVVpamux2e6MfD02vXjle/pBv/YY+6d/xUGcuE6pcXa40fSC7rbLpA/D3e8DX8c4CGUt3SPp+5rdf6F5truwsNxersxx/5Xf68B5+jAr+RK31o6qVLGcS8II/LCxM559/viSpf//+2rhxo5599lk99ND3P8wKCgoUFxfn6V9YWKiYmJhax3M4HHI4HNXa7XZ7k74pmvp4aHp1yrGvhaO/x0P9GMluqwxMwR+o99RZoPKU4s+tkGptsI6G5pef4c0ftZbv79Nm9y+dMUZlZWVKTExUbGyscnNzPfvKy8uVl5enlJSUAEYIAAAABI+AzvA//PDDGjp0qBISElRSUqKFCxdq7dq1WrFihWw2myZNmqTMzEwlJSUpKSlJmZmZioiI0OjRowMZNgCguVo6sdZdI7/6TpLktrVQYeeRTRQQrG7qom1n7JM1qncTRALULqAF/9dff63bbrtNBw8eVOvWrdWnTx+tWLFCaWlpkqQpU6bo+PHjmjBhgoqKijRgwACtWrVKUVFRgQwbAAAACBoBLfhffvnl0+632WxyOp1yOp1NExAAAABgMc1uDT8AAAAA/wn4VXoAAI3oNGvavQx/tnHjqE1zjw8ALIAZfgAAAMDCKPgBAAAAC6PgBwAAACyMNfwAgKB28nXQq661DwD4ETP8AAAAgIVR8AMAAAAWRsEPAAAAWBhr+AEAAE5x8ndDgGDHDD8AAABgYRT8AAAAgIVR8AMAAAAWxhp+AIDf+LruOWtU70aOBABQhRl+AAAAwMIo+AEAAAALo+AHAAAALIw1/EBzsnRioCMAmkTVWv+RX31Xa58Bie1+3DjNZ+N0YwAAmOEHAAAALI2CHwAAALAwCn4AAADAwij4AQAAAAuj4AcAAAAsjIIfAAAAsDAKfgAAAMDCuA4/UJPTXQ/fhEpKlZY/JP18VpOFBASTkV/NDHQIAIAfMMMPAAAAWBgFPwAAAGBhFPwAAACAhQV0DX9WVpYWLVqkzz//XOHh4UpJSdGTTz6p5ORkTx9jjDIyMjR37lwVFRVpwIABev7559WzZ88ARg4AjeR03x85y/xz93eBDgFoUlMXbfOpX9ao3o0cCawmoDP8eXl5uvfee7Vhwwbl5uaqoqJC6enpOnbsmKfPzJkzNWvWLM2ePVsbN25UbGys0tLSVFJSEsDIAQAAgOAQ0Bn+FStWeG3PmzdP55xzjjZv3qwrr7xSxhhlZ2dr2rRpGjVqlCRpwYIFiomJUU5OjsaPHx+IsAEAAICg0awu
y3nkyBFJUrt27SRJu3fvVkFBgdLT0z19HA6HUlNTtX79+hoL/rKyMpWVlXm2i4uLJUkul0sul6sxw/cc5+Q/EaRMaK27XD/sc5lQydc8n2Y8NC9e+T2b+OnfrFC5JUluW7P68eKlKraQH2KFtVTltTnl19eaINTHmM/2GoNa60e+vgY2Y4xp5Fh8YozRiBEjVFRUpPfff1+StH79eg0aNEj79+9XfHy8p+9dd92l/Px8rVy5sto4TqdTGRkZ1dpzcnIUERHReCcAAAAANKHS0lKNHj1aR44cUXR0dK39ms0UzH333aetW7fqgw8+qLbPZrN5bRtjqrVVmTp1qiZPnuzZLi4uVkJCgtLT00/7QviLy+VSbm6u0tLSZLfbG/14aCTLH6p1l8uEKleXK00fyH5tZoPHQ/PilV9bZaDDaTpDn/TLMBlLd0iSrtuf7ZfxGoPb1kKHEoapw753FGIqTtv3/86d1DRBwW9C5Fa/0L3aXNlZbotejHD68B6BDiGgqLV+VLWS5UyaRcF///33a8mSJVq3bp06derkaY+NjZUkFRQUKC4uztNeWFiomJiYGsdyOBxyOBzV2u12e5O+KZr6ePCzMxV6RrLbKn3P8dlUOFpBVX7Pprz56d+ryh8KrDMV0s1BiKk4Y5yVFi0YzwZuhVg2f9QX36PW8v29ENBPgjFG9913nxYtWqQ1a9YoMTHRa39iYqJiY2OVm5vraSsvL1deXp5SUlKaOlwAAAAg6AR0hv/ee+9VTk6O/vGPfygqKkoFBQWSpNatWys8PFw2m02TJk1SZmamkpKSlJSUpMzMTEVERGj06NGBDB1Nxddrkg9/tnHjAKyOzxoAWFZAC/45c+ZIkgYPHuzVPm/ePI0dO1aSNGXKFB0/flwTJkzw3Hhr1apVioqKauJoAQAAgOAT0ILflwsE2Ww2OZ1OOZ3Oxg8IAAAAsBhrfpsFAAAAgKRmcpUeoMn4uk4ZAADAIpjhBwAAACyMgh8AAACwMAp+AAAAwMJYww8A8N0Zvgcz8qvvmigQAICvmOEHAAAALIyCHwAAALAwCn4AAADAwij4AQAAAAvjS7uwBm6oBdTLP3fzJVsAsDpm+AEAAAALo+AHAAAALIyCHwAAALAw1vADDcF3BwAATWzqom0+9csa1buRI0GwYIYfAAAAsDAKfgAAAMDCKPgBAAAAC6PgBwAAACyMgh8AAACwMAp+AAAAwMIo+AEAAAAL4zr8AADUYuRXM33qt7jTlEaOBKg7rtePKszwAwAAABZGwQ8AAABYGAU/AAAAYGEU/AAAAICFUfADAAAAFkbBDwAAAFgYBT8AAABgYQEt+NetW6fhw4crPj5eNptNixcv9tpvjJHT6VR8fLzCw8M1ePBgbd++PTDBAgAAAEEooAX/sWPHdNFFF2n27Nk17p85c6ZmzZql2bNna+PGjYqNjVVaWppKSkqaOFIAAAAgOAX0TrtDhw7V0KFDa9xnjFF2dramTZumUaNGSZIWLFigmJgY5eTkaPz48U0ZKgAAABCUAlrwn87u3btVUFCg9PR0T5vD4VBqaqrWr19fa8FfVlamsrIyz3ZxcbEkyeVyyeVyNW7QPxzn5D/RQCY00BFU4/ohJlczjA0Nd7bl121rtj8GGk3VOfvz3EPl9ttYaJiQH3IRQk58Fmw1C7XWj3x9DZrtv/QFBQWSpJiYGK/2mJgY5efn1/q8rKwsZWRkVGtftWqVIiIi/BvkaeTm5jbZsawtNdAB1CpXl0sm0FGgsZw1+e0c6AAC51DCML+NdYn2+G0s+Ee/0L2BDiFoLFu2J9Ah1Au1llRaWupTv2Zb8Fex2Wxe28aYam0nmzp1qiZPnuzZLi4uVkJCgtLT0xUdHd1ocVZxuVzKzc1VWlqa7HZ7ox/P8pY/FOgIqnGZUOXqcqXpA9ltlYEOB352tuV3U35RoENocm5bCx1KGKYO+95RiKnwy5j/d+4kv4yDhguRW/1C92pzZWe5uRihT6YP
7xHoEOqEWutHVStZzqTZFvyxsbGSvp/pj4uL87QXFhZWm/U/mcPhkMPhqNZut9ub9E3R1MezrOZacBnJbqs8KwrCs9JZlF9/FbzBKMRU+O38Kyksmx23QsiLj4K1XqHW8j13zfaTkJiYqNjYWK//rikvL1deXp5SUlICGBkAAAAQPAI6w3/06FH95z//8Wzv3r1bW7ZsUbt27dS5c2dNmjRJmZmZSkpKUlJSkjIzMxUREaHRo0cHMGr4xdKJgY4AAADgrBDQgn/Tpk0aMmSIZ7tq7f2YMWM0f/58TZkyRcePH9eECRNUVFSkAQMGaNWqVYqKigpUyAAAAEBQCWjBP3jwYBlT+2UwbDabnE6nnE5n0wUFAAAAWEizXcMPAAAAoOGa7VV6AAAA0PimLtrmU7+sUb0bORI0Fmb4AQAAAAuj4AcAAAAsjIIfAAAAsDDW8MO/uL4+UC//3P2dT/0GJLZr5EgAAFbDDD8AAABgYRT8AAAAgIVR8AMAAAAWxhp++LbufvizjR8HgDNirT+AQOF6/cGLGX4AAADAwij4AQAAAAuj4AcAAAAsjDX8TcHXa9OzTh6An/i61h/+MfKrmT71W9xpSiNHAgQPvhPQdJjhBwAAACyMgh8AAACwMAp+AAAAwMJYw9+c+Hutv6/jNfVYgAWwRh4AECyY4QcAAAAsjIIfAAAAsDAKfgAAAMDCKPgBAAAAC+NLuwBwkk35RVLn7/8MMRWBDgcAgo6vN9RC02GGHwAAALAwCn4AAADAwij4AQAAAAtjDT8AAE1k5Fczfeq3uNOURo4EwNmEGX4AAADAwij4AQAAAAuj4AcAAAAsLCjW8L/wwgt66qmndPDgQfXs2VPZ2dm64oorAh1W4CydGOgIAAAAmhVfr/+fNap3QMYLpGY/w//Xv/5VkyZN0rRp0/Tpp5/qiiuu0NChQ7V3795AhwYAAAA0e82+4J81a5buvPNO/eY3v9GFF16o7OxsJSQkaM6cOYEODQAAAGj2mvWSnvLycm3evFn/+7//69Wenp6u9evX1/icsrIylZWVebaPHDkiSfruu+/kcrkaL9gfuFwulZaW6ttvv5Xdbv++8VhFox8XTcdljEpVqm9VIbutMtDhwM9KTrhVWlqqkhNuhRh3oMNBI3Dbmn+OK0qLAx1C0HLLrdLQUrkqi+Vu/vOa8MG3337rtV1jrSXfPzenjlcbf4/XGEpKSiRJxpjT9mvWBf+hQ4dUWVmpmJgYr/aYmBgVFBTU+JysrCxlZGRUa09MTGyUGHG2+lOgA0CjejXQAaDRNfcc5wQ6AKDZePosG68+SkpK1Lp161r3N+uCv4rNZvPaNsZUa6sydepUTZ482bPtdrv13XffqX379rU+x5+Ki4uVkJCgffv2KTo6utGPh6ZHjq2N/FofObY28mt95PhHxhiVlJQoPj7+tP2adcHfoUMHhYaGVpvNLywsrDbrX8XhcMjhcHi1tWnTprFCrFV0dPRZ/ya0OnJsbeTX+sixtZFf6yPH3zvdzH6VZr24LSwsTP369VNubq5Xe25urlJSUgIUFQAAABA8mvUMvyRNnjxZt912m/r376+BAwdq7ty52rt3r+6+++5AhwYAAAA0e82+4L/pppv07bff6ve//70OHjyoXr16admyZerSpUugQ6uRw+HQ9OnTqy0rgnWQY2sjv9ZHjq2N/FofOa47mznTdXwAAAAABK1mvYYfAAAAQMNQ8AMAAAAWRsEPAAAAWBgFPwAAAGBhFPw+WLdunYYPH674+HjZbDYtXrzYa78xRk6nU/Hx8QoPD9fgwYO1fft2rz5lZWW6//771aFDB7Vq1Uo///nP9dVXXzXhWaA2WVlZuuSSSxQVFaVzzjlHI0eO1M6dO736kOPgNmfOHPXp08dzk5aBAwdq+fLlnv3k11qysrJks9k0adIkTxs5Dm5Op1M2m83rERsb69lPfq1h//79uvXWW9W+fXtFRETo4osv1ubNmz37yXP9UfD7
4NixY7rooos0e/bsGvfPnDlTs2bN0uzZs7Vx40bFxsYqLS1NJSUlnj6TJk3S22+/rYULF+qDDz7Q0aNHNWzYMFVWVjbVaaAWeXl5uvfee7Vhwwbl5uaqoqJC6enpOnbsmKcPOQ5unTp10owZM7Rp0yZt2rRJV111lUaMGOH5QUF+rWPjxo2aO3eu+vTp49VOjoNfz549dfDgQc9j27Ztnn3kN/gVFRVp0KBBstvtWr58uXbs2KFnnnlGbdq08fQhzw1gUCeSzNtvv+3ZdrvdJjY21syYMcPTduLECdO6dWvz4osvGmOMOXz4sLHb7WbhwoWePvv37zchISFmxYoVTRY7fFNYWGgkmby8PGMMObaqtm3bmj//+c/k10JKSkpMUlKSyc3NNampqWbixInGGD7DVjB9+nRz0UUX1biP/FrDQw89ZC6//PJa95PnhmGGv4F2796tgoICpaene9ocDodSU1O1fv16SdLmzZvlcrm8+sTHx6tXr16ePmg+jhw5Iklq166dJHJsNZWVlVq4cKGOHTumgQMHkl8Luffee3Xdddfpmmuu8Wonx9awa9cuxcfHKzExUTfffLO+/PJLSeTXKpYsWaL+/fvrxhtv1DnnnKO+ffvqpZde8uwnzw1Dwd9ABQUFkqSYmBiv9piYGM++goIChYWFqW3btrX2QfNgjNHkyZN1+eWXq1evXpLIsVVs27ZNkZGRcjgcuvvuu/X222+rR48e5NciFi5cqE8++URZWVnV9pHj4DdgwAC9+uqrWrlypV566SUVFBQoJSVF3377Lfm1iC+//FJz5sxRUlKSVq5cqbvvvlsPPPCAXn31VUl8jhuqRaADsAqbzea1bYyp1nYqX/qgad13333aunWrPvjgg2r7yHFwS05O1pYtW3T48GG99dZbGjNmjPLy8jz7yW/w2rdvnyZOnKhVq1apZcuWtfYjx8Fr6NChnr/37t1bAwcO1HnnnacFCxbosssuk0R+g53b7Vb//v2VmZkpSerbt6+2b9+uOXPm6Ne//rWnH3muH2b4G6jqKgGn/uZYWFjo+S00NjZW5eXlKioqqrUPAu/+++/XkiVL9N5776lTp06ednJsDWFhYTr//PPVv39/ZWVl6aKLLtKzzz5Lfi1g8+bNKiwsVL9+/dSiRQu1aNFCeXl5eu6559SiRQtPjsixdbRq1Uq9e/fWrl27+AxbRFxcnHr06OHVduGFF2rv3r2S+FncUBT8DZSYmKjY2Fjl5uZ62srLy5WXl6eUlBRJUr9+/WS32736HDx4UP/+9789fRA4xhjdd999WrRokdasWaPExESv/eTYmowxKisrI78WcPXVV2vbtm3asmWL59G/f3/dcsst2rJli7p160aOLaasrEyfffaZ4uLi+AxbxKBBg6pdEvuLL75Qly5dJPGzuMEC8EXhoFNSUmI+/fRT8+mnnxpJZtasWebTTz81+fn5xhhjZsyYYVq3bm0WLVpktm3bZn71q1+ZuLg4U1xc7Bnj7rvvNp06dTLvvvuu+eSTT8xVV11lLrroIlNRURGo08IP7rnnHtO6dWuzdu1ac/DgQc+jtLTU04ccB7epU6eadevWmd27d5utW7eahx9+2ISEhJhVq1YZY8ivFZ18lR5jyHGw++1vf2vWrl1rvvzyS7NhwwYzbNgwExUVZfbs2WOMIb9W8PHHH5sWLVqYJ554wuzatcu8/vrrJiIiwrz22muePuS5/ij4ffDee+8ZSdUeY8aMMcZ8f6mo6dOnm9jYWONwOMyVV15ptm3b5jXG8ePHzX333WfatWtnwsPDzbBhw8zevXsDcDY4VU25lWTmzZvn6UOOg9sdd9xhunTpYsLCwkzHjh3N1Vdf7Sn2jSG/VnRqwU+Og9tNN91k4uLijN1uN/Hx8WbUqFFm+/btnv3k1xqWLl1qevXqZRwOh7ngggvM3LlzvfaT5/qzGWNMYP5vAQAAAEBjYw0/AAAAYGEU/AAAAICFUfADAAAAFkbB
DwAAAFgYBT8AAABgYRT8AAAAgIVR8AMAAAAWRsEPAAAAWBgFPwCcBWw2mxYvXhzoMAAAAUDBDwAWUFBQoPvvv1/dunWTw+FQQkKChg8frtWrVwc6tDMaO3asRo4cGegwAMCyWgQ6AABAw+zZs0eDBg1SmzZtNHPmTPXp00cul0srV67Uvffeq88//7xRjlteXq6wsLBGGbs+mls8ANBcMMMPAEFuwoQJstls+vjjj3XDDTeoe/fu6tmzpyZPnqwNGzZ4+h06dEi/+MUvFBERoaSkJC1ZssSzr7KyUnfeeacSExMVHh6u5ORkPfvss17HqZqJz8rKUnx8vLp37y5Jeu2119S/f39FRUUpNjZWo0ePVmFhoddzt2/fruuuu07R0dGKiorSFVdcof/+979yOp1asGCB/vGPf8hms8lms2nt2rWSpP379+umm25S27Zt1b59e40YMUJ79uw5YzwvvPCCkpKS1LJlS8XExOiGG27w58sNAEGHGX4ACGLfffedVqxYoSeeeEKtWrWqtr9Nmzaev2dkZGjmzJl66qmn9Kc//Um33HKL8vPz1a5dO7ndbnXq1ElvvvmmOnTooPXr1+uuu+5SXFycfvnLX3rGWL16taKjo5WbmytjjKTvZ9b/8Ic/KDk5WYWFhXrwwQc1duxYLVu2TNL3hfuVV16pwYMHa82aNYqOjtaHH36oiooK/e53v9Nnn32m4uJizZs3T5LUrl07lZaWasiQIbriiiu0bt06tWjRQo8//rh+9rOfaevWrZ6Z/FPj2bRpkx544AH95S9/UUpKir777ju9//77jfXyA0BwMACAoPXPf/7TSDKLFi06bT9J5pFHHvFsHz161NhsNrN8+fJanzNhwgRz/fXXe7bHjBljYmJiTFlZ2WmP9fHHHxtJpqSkxBhjzNSpU01iYqIpLy+vsf+YMWPMiBEjvNpefvllk5ycbNxut6etrKzMhIeHm5UrV9Yaz1tvvWWio6NNcXHxaWMEgLMJS3oAIIiZH2bZbTbbGfv26dPH8/dWrVopKirKa+nNiy++qP79+6tjx46KjIzUSy+9pL1793qN0bt372rr5D/99FONGDFCXbp0UVRUlAYPHixJnudu2bJFV1xxhex2u8/ntXnzZv3nP/9RVFSUIiMjFRkZqXbt2unEiRP673//W2s8aWlp6tKli7p166bbbrtNr7/+ukpLS30+LgBYEQU/AASxpKQk2Ww2ffbZZ2fse2rBbbPZ5Ha7JUlvvvmmHnzwQd1xxx1atWqVtmzZottvv13l5eVezzl12dCxY8eUnp6uyMhIvfbaa9q4caPefvttSfI8Nzw8vM7n5Xa71a9fP23ZssXr8cUXX2j06NG1xhMVFaVPPvlEb7zxhuLi4vTYY4/poosu0uHDh+scAwBYBQU/AASxdu3a6ac//amef/55HTt2rNp+Xwvd999/XykpKZowYYL69u2r888/32smvTaff/65Dh06pBkzZuiKK67QBRdcUO0Lu3369NH7778vl8tV4xhhYWGqrKz0avvJT36iXbt26ZxzztH555/v9WjduvVpY2rRooWuueYazZw5U1u3btWePXu0Zs2aM54LAFgVBT8ABLkXXnhBlZWVuvTSS/XWW29p165d+uyzz/Tcc89p4MCBPo1x/vnna9OmTVq5cqW++OILPfroo9q4ceMZn9e5c2eFhYXpT3/6k7788kstWbJEf/jDH7z63HfffSouLtbNN9+sTZs2adeuXfrLX/6inTt3SpK6du2qrVu3aufOnTp06JBcLpduueUWdejQQSNGjND777+v3bt3Ky8vTxMnTtRXX31VazzvvPOOnnvuOW3ZskX5+fl69dVX5Xa7lZyc7NPrAABWRMEPAEEuMTFRn3zyiYYMGaLf/va36tWrl9LS0rR69WrNmTPHpzHuvvtujRo1SjfddJMGDBigb7/9VhMmTDjj8zp27Kj58+frb3/7m3r06KEZM2bo6aef9urTvn17rVmz
RkePHlVqaqr69eunl156ybPEaNy4cUpOTvZ8f+DDDz9URESE1q1bp86dO2vUqFG68MILdccdd+j48eOKjo6uNZ42bdpo0aJFuuqqq3ThhRfqxRdf1BtvvKGePXv69DoAgBXZTNU3vgAAAABYDjP8AAAAgIVR8AMAAAAWRsEPAAAAWBgFPwAAAGBhFPwAAACAhVHwAwAAABZGwQ8AAABYGAU/AAAAYGEU/AAAAICFUfADAAAAFkbBDwAAAFjY/wfznNHMialmyAAAAABJRU5ErkJggg==",
259
+ "text/plain": [
260
+ "<Figure size 900x400 with 1 Axes>"
261
+ ]
262
+ },
263
+ "metadata": {},
264
+ "output_type": "display_data"
265
+ }
266
+ ],
267
+ "source": [
268
+ "# Visualize text-length distributions by class\n",
269
+ "df['text_len'] = df['text'].str.len()\n",
270
+ "ax = df[df['label'] == 0]['text_len'].hist(bins=40, alpha=0.6, label='Human', figsize=(9, 4))\n",
271
+ "df[df['label'] == 1]['text_len'].hist(bins=40, alpha=0.6, label='AI', ax=ax)\n",
272
+ "ax.set_title('Text Length Distribution')\n",
273
+ "ax.set_xlabel('Characters')\n",
274
+ "ax.set_ylabel('Count')\n",
275
+ "ax.legend()"
276
+ ]
277
+ },
278
+ {
279
+ "cell_type": "code",
280
+ "execution_count": 6,
281
+ "id": "59fe88ce",
282
+ "metadata": {
283
+ "id": "59fe88ce"
284
+ },
285
+ "outputs": [
286
+ {
287
+ "data": {
288
+ "text/html": [
289
+ "<div>\n",
290
+ "<style scoped>\n",
291
+ " .dataframe tbody tr th:only-of-type {\n",
292
+ " vertical-align: middle;\n",
293
+ " }\n",
294
+ "\n",
295
+ " .dataframe tbody tr th {\n",
296
+ " vertical-align: top;\n",
297
+ " }\n",
298
+ "\n",
299
+ " .dataframe thead th {\n",
300
+ " text-align: right;\n",
301
+ " }\n",
302
+ "</style>\n",
303
+ "<table border=\"1\" class=\"dataframe\">\n",
304
+ " <thead>\n",
305
+ " <tr style=\"text-align: right;\">\n",
306
+ " <th></th>\n",
307
+ " <th>text</th>\n",
308
+ " <th>label</th>\n",
309
+ " </tr>\n",
310
+ " </thead>\n",
311
+ " <tbody>\n",
312
+ " <tr>\n",
313
+ " <th>0</th>\n",
314
+ " <td>हामीले पार्टी एकतापछि कि दुबै पार्टीको सिद्धान...</td>\n",
315
+ " <td>0</td>\n",
316
+ " </tr>\n",
317
+ " <tr>\n",
318
+ " <th>1</th>\n",
319
+ " <td>एमाले प्रतिनिधिसभाको प्रत्यक्षतर्फ ८० समानुपात...</td>\n",
320
+ " <td>0</td>\n",
321
+ " </tr>\n",
322
+ " <tr>\n",
323
+ " <th>2</th>\n",
324
+ " <td>नेकपा माओवादी केन्द्रका नेता रामनारायण विडारील...</td>\n",
325
+ " <td>1</td>\n",
326
+ " </tr>\n",
327
+ " <tr>\n",
328
+ " <th>3</th>\n",
329
+ " <td>प्रदेश नं २ का मुख्यमन्त्रीको रूपमा संघीय समाज...</td>\n",
330
+ " <td>1</td>\n",
331
+ " </tr>\n",
332
+ " <tr>\n",
333
+ " <th>4</th>\n",
334
+ " <td>बिहीबार एमालेका अध्यक्ष केपी शर्मा ओली र माओवा...</td>\n",
335
+ " <td>0</td>\n",
336
+ " </tr>\n",
337
+ " </tbody>\n",
338
+ "</table>\n",
339
+ "</div>"
340
+ ],
341
+ "text/plain": [
342
+ " text label\n",
343
+ "0 हामीले पार्टी एकतापछि कि दुबै पार्टीको सिद्धान... 0\n",
344
+ "1 एमाले प्रतिनिधिसभाको प्रत्यक्षतर्फ ८० समानुपात... 0\n",
345
+ "2 नेकपा माओवादी केन्द्रका नेता रामनारायण विडारील... 1\n",
346
+ "3 प्रदेश नं २ का मुख्यमन्त्रीको रूपमा संघीय समाज... 1\n",
347
+ "4 बिहीबार एमालेका अध्यक्ष केपी शर्मा ओली र माओवा... 0"
348
+ ]
349
+ },
350
+ "execution_count": 6,
351
+ "metadata": {},
352
+ "output_type": "execute_result"
353
+ }
354
+ ],
355
+ "source": [
356
+ "# Keep only columns needed for training\n",
357
+ "df = df[['text', 'label']].copy()\n",
358
+ "df.head()"
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "code",
363
+ "execution_count": 7,
364
+ "id": "434df9a2",
365
+ "metadata": {
366
+ "id": "434df9a2"
367
+ },
368
+ "outputs": [
369
+ {
370
+ "name": "stdout",
371
+ "output_type": "stream",
372
+ "text": [
373
+ "Using model: distilbert-base-multilingual-cased\n"
374
+ ]
375
+ }
376
+ ],
377
+ "source": [
378
+ "# Model/tokenizer config (smaller multilingual model for low-VRAM GPU)\n",
379
+ "MODEL_NAME = 'distilbert-base-multilingual-cased'\n",
380
+ "MAX_LEN = 96\n",
381
+ "\n",
382
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
383
+ "print('Using model:', MODEL_NAME)"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": 8,
389
+ "id": "ef7d53f9",
390
+ "metadata": {
391
+ "id": "ef7d53f9"
392
+ },
393
+ "outputs": [],
394
+ "source": [
395
+ "class NepaliDataset(Dataset):\n",
396
+ " def __init__(self, texts, labels):\n",
397
+ " self.texts = texts\n",
398
+ " self.labels = labels\n",
399
+ "\n",
400
+ " def __len__(self):\n",
401
+ " return len(self.texts)\n",
402
+ "\n",
403
+ " def __getitem__(self, idx):\n",
404
+ " return {\n",
405
+ " 'text': self.texts[idx],\n",
406
+ " 'label': int(self.labels[idx]),\n",
407
+ " }"
408
+ ]
409
+ },
410
+ {
411
+ "cell_type": "code",
412
+ "execution_count": 9,
413
+ "id": "134a3fc1",
414
+ "metadata": {
415
+ "id": "134a3fc1"
416
+ },
417
+ "outputs": [
418
+ {
419
+ "name": "stdout",
420
+ "output_type": "stream",
421
+ "text": [
422
+ "Train: 1588 | Val: 398\n"
423
+ ]
424
+ }
425
+ ],
426
+ "source": [
427
+ "# Train/Validation Split\n",
428
+ "train_texts, val_texts, train_labels, val_labels = train_test_split(\n",
429
+ " df['text'].tolist(),\n",
430
+ " df['label'].tolist(),\n",
431
+ " test_size=0.2,\n",
432
+ " random_state=42,\n",
433
+ " stratify=df['label'].tolist(),\n",
434
+ ")\n",
435
+ "print(f'Train: {len(train_texts)} | Val: {len(val_texts)}')"
436
+ ]
437
+ },
438
+ {
439
+ "cell_type": "code",
440
+ "execution_count": 10,
441
+ "id": "dd226ed1",
442
+ "metadata": {
443
+ "id": "dd226ed1"
444
+ },
445
+ "outputs": [
446
+ {
447
+ "name": "stdout",
448
+ "output_type": "stream",
449
+ "text": [
450
+ "Batch size: 2 | Max length: 96\n"
451
+ ]
452
+ }
453
+ ],
454
+ "source": [
455
+ "train_dataset = NepaliDataset(train_texts, train_labels)\n",
456
+ "val_dataset = NepaliDataset(val_texts, val_labels)\n",
457
+ "\n",
458
+ "def collate_batch(batch):\n",
459
+ " texts = [item['text'] for item in batch]\n",
460
+ " labels = torch.tensor([item['label'] for item in batch], dtype=torch.long)\n",
461
+ " enc = tokenizer(\n",
462
+ " texts,\n",
463
+ " padding=True,\n",
464
+ " truncation=True,\n",
465
+ " max_length=MAX_LEN,\n",
466
+ " return_tensors='pt',\n",
467
+ " )\n",
468
+ " return {\n",
469
+ " 'input_ids': enc['input_ids'],\n",
470
+ " 'attention_mask': enc['attention_mask'],\n",
471
+ " 'labels': labels,\n",
472
+ " }\n",
473
+ "\n",
474
+ "BATCH_SIZE = 2\n",
475
+ "train_loader = DataLoader(\n",
476
+ " train_dataset,\n",
477
+ " batch_size=BATCH_SIZE,\n",
478
+ " shuffle=True,\n",
479
+ " collate_fn=collate_batch,\n",
480
+ " pin_memory=(torch.cuda.is_available()),\n",
481
+ ")\n",
482
+ "val_loader = DataLoader(\n",
483
+ " val_dataset,\n",
484
+ " batch_size=BATCH_SIZE,\n",
485
+ " shuffle=False,\n",
486
+ " collate_fn=collate_batch,\n",
487
+ " pin_memory=(torch.cuda.is_available()),\n",
488
+ ")\n",
489
+ "print('Batch size:', BATCH_SIZE, '| Max length:', MAX_LEN)"
490
+ ]
491
+ },
492
+ {
493
+ "cell_type": "code",
494
+ "execution_count": 11,
495
+ "id": "51320951",
496
+ "metadata": {
497
+ "id": "51320951"
498
+ },
499
+ "outputs": [],
500
+ "source": [
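501
+ "# === Model: encoder loaded from MODEL_NAME (despite the IndicBERT class name) + dropout + 2-class linear head on the first-token hidden state ===\n",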
502
+ "class IndicBERTClassifier(nn.Module):\n",
503
+ " def __init__(self, dropout=0.2):\n",
504
+ " super(IndicBERTClassifier, self).__init__()\n",
505
+ " self.bert = AutoModel.from_pretrained(MODEL_NAME)\n",
506
+ " if hasattr(self.bert, 'gradient_checkpointing_enable'):\n",
507
+ " self.bert.gradient_checkpointing_enable()\n",
508
+ " self.dropout = nn.Dropout(dropout)\n",
509
+ " self.classifier = nn.Linear(self.bert.config.hidden_size, 2)\n",
510
+ "\n",
511
+ " def forward(self, input_ids, attention_mask):\n",
512
+ " outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)\n",
513
+ " cls_output = outputs.last_hidden_state[:, 0, :]\n",
514
+ " cls_output = self.dropout(cls_output)\n",
515
+ " return self.classifier(cls_output)"
516
+ ]
517
+ },
518
+ {
519
+ "cell_type": "code",
520
+ "execution_count": 12,
521
+ "id": "944f918e",
522
+ "metadata": {
523
+ "id": "944f918e"
524
+ },
525
+ "outputs": [],
526
+ "source": [
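527
+ "# Dataset variant for already-tokenized inputs (input_ids, attention_mask, labels); the DataLoaders above are built from NepaliDataset instead\n",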
528
+ "class NepaliTextDataset(Dataset):\n",
529
+ " def __init__(self, input_ids, attention_mask, labels):\n",
530
+ " self.input_ids = input_ids\n",
531
+ " self.attention_mask = attention_mask\n",
532
+ " self.labels = labels\n",
533
+ "\n",
534
+ " def __len__(self):\n",
535
+ " return len(self.labels)\n",
536
+ "\n",
537
+ " def __getitem__(self, idx):\n",
538
+ " return {\n",
539
+ " 'input_ids': torch.tensor(self.input_ids[idx]),\n",
540
+ " 'attention_mask': torch.tensor(self.attention_mask[idx]),\n",
541
+ " 'labels': torch.tensor(self.labels[idx])\n",
542
+ " }"
543
+ ]
544
+ },
545
+ {
546
+ "cell_type": "code",
547
+ "execution_count": 13,
548
+ "id": "a9d426e1",
549
+ "metadata": {
550
+ "id": "a9d426e1"
551
+ },
552
+ "outputs": [
553
+ {
554
+ "name": "stderr",
555
+ "output_type": "stream",
556
+ "text": [
557
+ "Loading weights: 100%|██████████| 100/100 [00:00<00:00, 11666.08it/s]\n",
558
+ "\u001b[1mDistilBertModel LOAD REPORT\u001b[0m from: distilbert-base-multilingual-cased\n",
559
+ "Key | Status | | \n",
560
+ "------------------------+------------+--+-\n",
561
+ "vocab_layer_norm.bias | UNEXPECTED | | \n",
562
+ "vocab_transform.weight | UNEXPECTED | | \n",
563
+ "vocab_layer_norm.weight | UNEXPECTED | | \n",
564
+ "vocab_transform.bias | UNEXPECTED | | \n",
565
+ "vocab_projector.bias | UNEXPECTED | | \n",
566
+ "\n",
567
+ "\u001b[3mNotes:\n",
568
+ "- UNEXPECTED\u001b[3m\t:can be ignored when loading from different task/architecture; not ok if you expect identical arch.\u001b[0m\n"
569
+ ]
570
+ }
571
+ ],
572
+ "source": [
573
+ "\n",
574
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
575
+ "model = IndicBERTClassifier().to(device)"
576
+ ]
577
+ },
578
+ {
579
+ "cell_type": "code",
580
+ "execution_count": 14,
581
+ "id": "2740c14a",
582
+ "metadata": {
583
+ "id": "2740c14a"
584
+ },
585
+ "outputs": [
586
+ {
587
+ "name": "stdout",
588
+ "output_type": "stream",
589
+ "text": [
590
+ "Grad accumulation steps: 4\n"
591
+ ]
592
+ }
593
+ ],
594
+ "source": [
595
+ "# === Optimizer, Scheduler & Loss ===\n",
596
+ "optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)\n",
597
+ "loss_fn = nn.CrossEntropyLoss()\n",
598
+ "\n",
599
+ "max_epochs = 6\n",
600
+ "grad_accum_steps = 4 # effective batch = BATCH_SIZE * grad_accum_steps\n",
601
+ "steps_per_epoch = math.ceil(len(train_loader) / grad_accum_steps)\n",
602
+ "total_steps = steps_per_epoch * max_epochs\n",
603
+ "warmup_steps = int(0.1 * total_steps)\n",
604
+ "scheduler = get_linear_schedule_with_warmup(\n",
605
+ " optimizer,\n",
606
+ " num_warmup_steps=warmup_steps,\n",
607
+ " num_training_steps=total_steps,\n",
608
+ ")\n",
609
+ "print('Grad accumulation steps:', grad_accum_steps)"
610
+ ]
611
+ },
612
+ {
613
+ "cell_type": "code",
614
+ "execution_count": 15,
615
+ "id": "14ce04bd",
616
+ "metadata": {
617
+ "id": "14ce04bd"
618
+ },
619
+ "outputs": [],
620
+ "source": [
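621
+ "# === Basic training loop: one optimizer step per batch, no AMP, gradient accumulation or scheduler step (the epoch loop in the next cell calls train_one_epoch instead) ===\n",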
622
+ "def train(model, loader):\n",
623
+ " model.train()\n",
624
+ " total_loss = 0\n",
625
+ " for batch in loader:\n",
626
+ " input_ids = batch['input_ids'].to(device)\n",
627
+ " attention_mask = batch['attention_mask'].to(device)\n",
628
+ " labels = batch['labels'].to(device)\n",
629
+ "\n",
630
+ " optimizer.zero_grad()\n",
631
+ " outputs = model(input_ids, attention_mask)\n",
632
+ " loss = loss_fn(outputs, labels)\n",
633
+ " loss.backward()\n",
634
+ " optimizer.step()\n",
635
+ " total_loss += loss.item()\n",
636
+ " return total_loss / len(loader)\n",
637
+ "\n",
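638
+ "# === Basic evaluation: prints a classification report and returns nothing (redefined in the next cell to also return loss, accuracy and F1) ===\n",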
639
+ "def evaluate(model, loader):\n",
640
+ " model.eval()\n",
641
+ " preds, true = [], []\n",
642
+ " with torch.no_grad():\n",
643
+ " for batch in loader:\n",
644
+ " input_ids = batch['input_ids'].to(device)\n",
645
+ " attention_mask = batch['attention_mask'].to(device)\n",
646
+ " labels = batch['labels'].to(device)\n",
647
+ "\n",
648
+ " outputs = model(input_ids, attention_mask)\n",
649
+ " pred_labels = torch.argmax(outputs, dim=1)\n",
650
+ " preds.extend(pred_labels.cpu().numpy())\n",
651
+ " true.extend(labels.cpu().numpy())\n",
652
+ "\n",
653
+ " print(classification_report(true, preds, target_names=[\"Human\", \"AI\"]))\n"
654
+ ]
655
+ },
656
+ {
657
+ "cell_type": "code",
658
+ "execution_count": null,
659
+ "id": "d24e91b7",
660
+ "metadata": {
661
+ "colab": {
662
+ "base_uri": "https://localhost:8080/"
663
+ },
664
+ "id": "d24e91b7",
665
+ "outputId": "33ef8227-5c71-4c0d-88e7-b1a9e30b45f4"
666
+ },
667
+ "outputs": [
668
+ {
669
+ "name": "stdout",
670
+ "output_type": "stream",
671
+ "text": [
672
+ "\n",
673
+ "Epoch 1/6\n"
674
+ ]
675
+ },
676
+ {
677
+ "name": "stderr",
678
+ "output_type": "stream",
679
+ "text": [
680
+ "/tmp/ipykernel_155548/4183901742.py:4: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.\n",
681
+ " scaler = GradScaler(enabled=use_amp)\n",
682
+ "/tmp/ipykernel_155548/4183901742.py:17: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
683
+ " with autocast(enabled=use_amp):\n"
684
+ ]
685
+ },
686
+ {
687
+ "name": "stdout",
688
+ "output_type": "stream",
689
+ "text": [
690
+ "Batch 0 | Loss: 0.8206\n",
691
+ "Batch 50 | Loss: 0.8677\n",
692
+ "Batch 100 | Loss: 0.8435\n",
693
+ "Batch 150 | Loss: 0.6523\n",
694
+ "Batch 200 | Loss: 0.7219\n",
695
+ "Batch 250 | Loss: 0.5793\n",
696
+ "Batch 300 | Loss: 0.6833\n",
697
+ "Batch 350 | Loss: 0.5742\n",
698
+ "Batch 400 | Loss: 0.4844\n",
699
+ "Batch 450 | Loss: 0.5671\n",
700
+ "Batch 500 | Loss: 0.5363\n",
701
+ "Batch 550 | Loss: 0.5386\n",
702
+ "Batch 600 | Loss: 0.5520\n",
703
+ "Batch 650 | Loss: 0.7692\n",
704
+ "Batch 700 | Loss: 0.4680\n",
705
+ "Batch 750 | Loss: 0.6353\n",
706
+ "Train | Loss: 0.6600 | Acc: 0.5913 | F1: 0.5895\n"
707
+ ]
708
+ },
709
+ {
710
+ "name": "stderr",
711
+ "output_type": "stream",
712
+ "text": [
713
+ "/tmp/ipykernel_155548/4183901742.py:55: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
714
+ " with autocast(enabled=use_amp):\n"
715
+ ]
716
+ },
717
+ {
718
+ "name": "stdout",
719
+ "output_type": "stream",
720
+ "text": [
721
+ "Validation | Loss: 0.5192 | Acc: 0.8015 | F1: 0.7812\n",
722
+ " precision recall f1-score support\n",
723
+ "\n",
724
+ " Human 0.75 0.90 0.82 198\n",
725
+ " AI 0.88 0.70 0.78 200\n",
726
+ "\n",
727
+ " accuracy 0.80 398\n",
728
+ " macro avg 0.81 0.80 0.80 398\n",
729
+ "weighted avg 0.81 0.80 0.80 398\n",
730
+ "\n",
731
+ "Saved improved checkpoint: model_best.pth\n",
732
+ "\n",
733
+ "Epoch 2/6\n"
734
+ ]
735
+ },
736
+ {
737
+ "name": "stderr",
738
+ "output_type": "stream",
739
+ "text": [
740
+ "/tmp/ipykernel_155548/4183901742.py:17: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
741
+ " with autocast(enabled=use_amp):\n"
742
+ ]
743
+ },
744
+ {
745
+ "name": "stdout",
746
+ "output_type": "stream",
747
+ "text": [
748
+ "Batch 0 | Loss: 0.6078\n",
749
+ "Batch 50 | Loss: 1.1135\n",
750
+ "Batch 100 | Loss: 0.3297\n",
751
+ "Batch 150 | Loss: 0.8473\n",
752
+ "Batch 200 | Loss: 0.9326\n",
753
+ "Batch 250 | Loss: 0.5112\n",
754
+ "Batch 300 | Loss: 0.1645\n",
755
+ "Batch 350 | Loss: 0.2250\n",
756
+ "Batch 400 | Loss: 0.7142\n",
757
+ "Batch 450 | Loss: 0.3741\n",
758
+ "Batch 500 | Loss: 0.3084\n",
759
+ "Batch 550 | Loss: 0.1472\n",
760
+ "Batch 600 | Loss: 0.0679\n",
761
+ "Batch 650 | Loss: 0.1234\n",
762
+ "Batch 700 | Loss: 1.1370\n",
763
+ "Batch 750 | Loss: 0.8843\n",
764
+ "Train | Loss: 0.4817 | Acc: 0.7720 | F1: 0.7665\n"
765
+ ]
766
+ },
767
+ {
768
+ "name": "stderr",
769
+ "output_type": "stream",
770
+ "text": [
771
+ "/tmp/ipykernel_155548/4183901742.py:55: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
772
+ " with autocast(enabled=use_amp):\n"
773
+ ]
774
+ },
775
+ {
776
+ "name": "stdout",
777
+ "output_type": "stream",
778
+ "text": [
779
+ "Validation | Loss: 0.3708 | Acc: 0.8417 | F1: 0.8225\n",
780
+ " precision recall f1-score support\n",
781
+ "\n",
782
+ " Human 0.78 0.95 0.86 198\n",
783
+ " AI 0.94 0.73 0.82 200\n",
784
+ "\n",
785
+ " accuracy 0.84 398\n",
786
+ " macro avg 0.86 0.84 0.84 398\n",
787
+ "weighted avg 0.86 0.84 0.84 398\n",
788
+ "\n",
789
+ "Saved improved checkpoint: model_best.pth\n",
790
+ "\n",
791
+ "Epoch 3/6\n"
792
+ ]
793
+ },
794
+ {
795
+ "name": "stderr",
796
+ "output_type": "stream",
797
+ "text": [
798
+ "/tmp/ipykernel_155548/4183901742.py:17: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
799
+ " with autocast(enabled=use_amp):\n"
800
+ ]
801
+ },
802
+ {
803
+ "name": "stdout",
804
+ "output_type": "stream",
805
+ "text": [
806
+ "Batch 0 | Loss: 0.0415\n",
807
+ "Batch 50 | Loss: 0.0845\n",
808
+ "Batch 100 | Loss: 0.0336\n",
809
+ "Batch 150 | Loss: 0.6389\n",
810
+ "Batch 200 | Loss: 1.6021\n",
811
+ "Batch 250 | Loss: 0.0696\n",
812
+ "Batch 300 | Loss: 0.5184\n",
813
+ "Batch 350 | Loss: 0.0569\n",
814
+ "Batch 400 | Loss: 0.8119\n",
815
+ "Batch 450 | Loss: 1.5121\n",
816
+ "Batch 500 | Loss: 0.0330\n",
817
+ "Batch 550 | Loss: 0.0208\n",
818
+ "Batch 600 | Loss: 1.1329\n",
819
+ "Batch 650 | Loss: 0.7745\n",
820
+ "Batch 700 | Loss: 0.0740\n",
821
+ "Batch 750 | Loss: 1.4907\n",
822
+ "Train | Loss: 0.3830 | Acc: 0.8495 | F1: 0.8488\n"
823
+ ]
824
+ },
825
+ {
826
+ "name": "stderr",
827
+ "output_type": "stream",
828
+ "text": [
829
+ "/tmp/ipykernel_155548/4183901742.py:55: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
830
+ " with autocast(enabled=use_amp):\n"
831
+ ]
832
+ },
833
+ {
834
+ "name": "stdout",
835
+ "output_type": "stream",
836
+ "text": [
837
+ "Validation | Loss: 0.3527 | Acc: 0.8668 | F1: 0.8515\n",
838
+ " precision recall f1-score support\n",
839
+ "\n",
840
+ " Human 0.80 0.97 0.88 198\n",
841
+ " AI 0.97 0.76 0.85 200\n",
842
+ "\n",
843
+ " accuracy 0.87 398\n",
844
+ " macro avg 0.88 0.87 0.87 398\n",
845
+ "weighted avg 0.88 0.87 0.87 398\n",
846
+ "\n",
847
+ "Saved improved checkpoint: model_best.pth\n",
848
+ "\n",
849
+ "Epoch 4/6\n"
850
+ ]
851
+ },
852
+ {
853
+ "name": "stderr",
854
+ "output_type": "stream",
855
+ "text": [
856
+ "/tmp/ipykernel_155548/4183901742.py:17: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
857
+ " with autocast(enabled=use_amp):\n"
858
+ ]
859
+ },
860
+ {
861
+ "name": "stdout",
862
+ "output_type": "stream",
863
+ "text": [
864
+ "Batch 0 | Loss: 1.2321\n",
865
+ "Batch 50 | Loss: 0.0369\n",
866
+ "Batch 100 | Loss: 0.0161\n",
867
+ "Batch 150 | Loss: 0.2000\n",
868
+ "Batch 200 | Loss: 0.0035\n",
869
+ "Batch 250 | Loss: 2.3207\n",
870
+ "Batch 300 | Loss: 0.0022\n",
871
+ "Batch 350 | Loss: 2.2738\n",
872
+ "Batch 400 | Loss: 0.0011\n",
873
+ "Batch 450 | Loss: 0.0075\n",
874
+ "Batch 500 | Loss: 2.4454\n",
875
+ "Batch 550 | Loss: 0.3863\n",
876
+ "Batch 600 | Loss: 0.0038\n",
877
+ "Batch 650 | Loss: 0.0061\n",
878
+ "Batch 700 | Loss: 0.0005\n",
879
+ "Batch 750 | Loss: 0.0182\n",
880
+ "Train | Loss: 0.4209 | Acc: 0.8923 | F1: 0.8903\n"
881
+ ]
882
+ },
883
+ {
884
+ "name": "stderr",
885
+ "output_type": "stream",
886
+ "text": [
887
+ "/tmp/ipykernel_155548/4183901742.py:55: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
888
+ " with autocast(enabled=use_amp):\n"
889
+ ]
890
+ },
891
+ {
892
+ "name": "stdout",
893
+ "output_type": "stream",
894
+ "text": [
895
+ "Validation | Loss: 0.4601 | Acc: 0.8769 | F1: 0.8831\n",
896
+ " precision recall f1-score support\n",
897
+ "\n",
898
+ " Human 0.92 0.83 0.87 198\n",
899
+ " AI 0.84 0.93 0.88 200\n",
900
+ "\n",
901
+ " accuracy 0.88 398\n",
902
+ " macro avg 0.88 0.88 0.88 398\n",
903
+ "weighted avg 0.88 0.88 0.88 398\n",
904
+ "\n",
905
+ "Saved improved checkpoint: model_best.pth\n",
906
+ "\n",
907
+ "Epoch 5/6\n"
908
+ ]
909
+ },
910
+ {
911
+ "name": "stderr",
912
+ "output_type": "stream",
913
+ "text": [
914
+ "/tmp/ipykernel_155548/4183901742.py:17: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
915
+ " with autocast(enabled=use_amp):\n"
916
+ ]
917
+ },
918
+ {
919
+ "name": "stdout",
920
+ "output_type": "stream",
921
+ "text": [
922
+ "Batch 0 | Loss: 0.0010\n",
923
+ "Batch 50 | Loss: 0.0061\n",
924
+ "Batch 100 | Loss: 0.0047\n",
925
+ "Batch 150 | Loss: 0.0201\n",
926
+ "Batch 200 | Loss: 0.0023\n",
927
+ "Batch 250 | Loss: 0.0395\n",
928
+ "Batch 300 | Loss: 0.0011\n",
929
+ "Batch 350 | Loss: 0.0002\n",
930
+ "Batch 400 | Loss: 3.2169\n",
931
+ "Batch 450 | Loss: 4.4883\n",
932
+ "Batch 500 | Loss: 0.0002\n",
933
+ "Batch 550 | Loss: 0.0003\n",
934
+ "Batch 600 | Loss: 0.0000\n",
935
+ "Batch 650 | Loss: 0.0002\n",
936
+ "Batch 700 | Loss: 0.0000\n",
937
+ "Batch 750 | Loss: 4.6367\n",
938
+ "Train | Loss: 0.5447 | Acc: 0.9011 | F1: 0.8990\n"
939
+ ]
940
+ },
941
+ {
942
+ "name": "stderr",
943
+ "output_type": "stream",
944
+ "text": [
945
+ "/tmp/ipykernel_155548/4183901742.py:55: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
946
+ " with autocast(enabled=use_amp):\n"
947
+ ]
948
+ },
949
+ {
950
+ "name": "stdout",
951
+ "output_type": "stream",
952
+ "text": [
953
+ "Validation | Loss: 0.5331 | Acc: 0.9271 | F1: 0.9266\n",
954
+ " precision recall f1-score support\n",
955
+ "\n",
956
+ " Human 0.92 0.94 0.93 198\n",
957
+ " AI 0.94 0.92 0.93 200\n",
958
+ "\n",
959
+ " accuracy 0.93 398\n",
960
+ " macro avg 0.93 0.93 0.93 398\n",
961
+ "weighted avg 0.93 0.93 0.93 398\n",
962
+ "\n",
963
+ "Saved improved checkpoint: model_best.pth\n",
964
+ "\n",
965
+ "Epoch 6/6\n"
966
+ ]
967
+ },
968
+ {
969
+ "name": "stderr",
970
+ "output_type": "stream",
971
+ "text": [
972
+ "/tmp/ipykernel_155548/4183901742.py:17: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
973
+ " with autocast(enabled=use_amp):\n"
974
+ ]
975
+ },
976
+ {
977
+ "name": "stdout",
978
+ "output_type": "stream",
979
+ "text": [
980
+ "Batch 0 | Loss: 0.0000\n"
981
+ ]
982
+ }
983
+ ],
984
+ "source": [
985
+ "from torch.cuda.amp import autocast, GradScaler\n",
986
+ "\n",
987
+ "use_amp = device.type == 'cuda'\n",
988
+ "scaler = GradScaler(enabled=use_amp)\n",
989
+ "\n",
990
+ "def train_one_epoch(model, loader):\n",
991
+ " model.train()\n",
992
+ " total_loss = 0.0\n",
993
+ " all_preds, all_true = [], []\n",
994
+ "\n",
995
+ " optimizer.zero_grad(set_to_none=True)\n",
996
+ " for batch_idx, batch in enumerate(loader):\n",
997
+ " input_ids = batch['input_ids'].to(device, non_blocking=True)\n",
998
+ " attention_mask = batch['attention_mask'].to(device, non_blocking=True)\n",
999
+ " labels = batch['labels'].to(device, non_blocking=True)\n",
1000
+ "\n",
1001
+ " with autocast(enabled=use_amp):\n",
1002
+ " logits = model(input_ids, attention_mask=attention_mask)\n",
1003
+ " loss = loss_fn(logits, labels) / grad_accum_steps\n",
1004
+ "\n",
1005
+ " scaler.scale(loss).backward()\n",
1006
+ "\n",
1007
+ " if (batch_idx + 1) % grad_accum_steps == 0 or (batch_idx + 1) == len(loader):\n",
1008
+ " torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)\n",
1009
+ " scaler.step(optimizer)\n",
1010
+ " scaler.update()\n",
1011
+ " scheduler.step()\n",
1012
+ " optimizer.zero_grad(set_to_none=True)\n",
1013
+ "\n",
1014
+ " total_loss += loss.item() * grad_accum_steps\n",
1015
+ " preds = torch.argmax(logits, dim=1)\n",
1016
+ " all_preds.extend(preds.detach().cpu().numpy())\n",
1017
+ " all_true.extend(labels.detach().cpu().numpy())\n",
1018
+ "\n",
1019
+ " if batch_idx % 50 == 0:\n",
1020
+ " print(f'Batch {batch_idx} | Loss: {(loss.item() * grad_accum_steps):.4f}')\n",
1021
+ "\n",
1022
+ " avg_loss = total_loss / max(len(loader), 1)\n",
1023
+ " train_acc = accuracy_score(all_true, all_preds)\n",
1024
+ " train_f1 = f1_score(all_true, all_preds)\n",
1025
+ " return avg_loss, train_acc, train_f1\n",
1026
+ "\n",
1027
+ "\n",
1028
+ "def evaluate(model, loader):\n",
1029
+ " model.eval()\n",
1030
+ " all_preds, all_true = [], []\n",
1031
+ " total_loss = 0.0\n",
1032
+ "\n",
1033
+ " with torch.no_grad():\n",
1034
+ " for batch in loader:\n",
1035
+ " input_ids = batch['input_ids'].to(device, non_blocking=True)\n",
1036
+ " attention_mask = batch['attention_mask'].to(device, non_blocking=True)\n",
1037
+ " labels = batch['labels'].to(device, non_blocking=True)\n",
1038
+ "\n",
1039
+ " with autocast(enabled=use_amp):\n",
1040
+ " logits = model(input_ids, attention_mask=attention_mask)\n",
1041
+ " loss = loss_fn(logits, labels)\n",
1042
+ "\n",
1043
+ " total_loss += loss.item()\n",
1044
+ " preds = torch.argmax(logits, dim=1)\n",
1045
+ " all_preds.extend(preds.cpu().numpy())\n",
1046
+ " all_true.extend(labels.cpu().numpy())\n",
1047
+ "\n",
1048
+ " val_loss = total_loss / max(len(loader), 1)\n",
1049
+ " val_acc = accuracy_score(all_true, all_preds)\n",
1050
+ " val_f1 = f1_score(all_true, all_preds)\n",
1051
+ "\n",
1052
+ " print(f'Validation | Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}')\n",
1053
+ " print(classification_report(all_true, all_preds, target_names=['Human', 'AI']))\n",
1054
+ " return val_loss, val_acc, val_f1\n",
1055
+ "\n",
1056
+ "\n",
1057
+ "# Training with early stopping on validation F1\n",
1058
+ "patience = 2\n",
1059
+ "best_val_f1 = 0.0\n",
1060
+ "epochs_without_improve = 0\n",
1061
+ "\n",
1062
+ "for epoch in range(1, max_epochs + 1):\n",
1063
+ " print(f'\\nEpoch {epoch}/{max_epochs}')\n",
1064
+ " if device.type == 'cuda':\n",
1065
+ " torch.cuda.empty_cache()\n",
1066
+ "\n",
1067
+ " train_loss, train_acc, train_f1 = train_one_epoch(model, train_loader)\n",
1068
+ " print(f'Train | Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}')\n",
1069
+ "\n",
1070
+ " val_loss, val_acc, val_f1 = evaluate(model, val_loader)\n",
1071
+ "\n",
1072
+ " if val_f1 > best_val_f1:\n",
1073
+ " best_val_f1 = val_f1\n",
1074
+ " epochs_without_improve = 0\n",
1075
+ " torch.save(model.state_dict(), 'model_best.pth')\n",
1076
+ " print('Saved improved checkpoint: model_best.pth')\n",
1077
+ " else:\n",
1078
+ " epochs_without_improve += 1\n",
1079
+ " if epochs_without_improve >= patience:\n",
1080
+ " print('Early stopping triggered.')\n",
1081
+ " break\n",
1082
+ "\n",
1083
+ "print(f'Best validation F1: {best_val_f1:.4f}')"
1084
+ ]
1085
+ },
1086
+ {
1087
+ "cell_type": "code",
1088
+ "execution_count": null,
1089
+ "id": "wBIT-kPaswqy",
1090
+ "metadata": {
1091
+ "id": "wBIT-kPaswqy"
1092
+ },
1093
+ "outputs": [],
1094
+ "source": [
1095
+ "# Optional: save the current in-memory (last-epoch) weights; these can differ from the best-F1 weights stored in model_best.pth\n",
1096
+ "torch.save(model.state_dict(), 'model_latest.pth')\n",
1097
+ "print('Saved: model_latest.pth')"
1098
+ ]
1099
+ },
1100
+ {
1101
+ "cell_type": "code",
1102
+ "execution_count": null,
1103
+ "id": "19b9652c",
1104
+ "metadata": {
1105
+ "colab": {
1106
+ "base_uri": "https://localhost:8080/"
1107
+ },
1108
+ "id": "19b9652c",
1109
+ "outputId": "e1b12835-b081-4d46-a909-c92cb3b6d230"
1110
+ },
1111
+ "outputs": [
1112
+ {
1113
+ "data": {
1114
+ "text/plain": [
1115
+ "('./nepali_xlmr_classifier/tokenizer_config.json',\n",
1116
+ " './nepali_xlmr_classifier/special_tokens_map.json',\n",
1117
+ " './nepali_xlmr_classifier/sentencepiece.bpe.model',\n",
1118
+ " './nepali_xlmr_classifier/added_tokens.json',\n",
1119
+ " './nepali_xlmr_classifier/tokenizer.json')"
1120
+ ]
1121
+ },
1122
+ "execution_count": 41,
1123
+ "metadata": {},
1124
+ "output_type": "execute_result"
1125
+ }
1126
+ ],
1127
+ "source": [
1128
+ "tokenizer.save_pretrained(\"./nepali_xlmr_classifier\")"
1129
+ ]
1130
+ },
1131
+ {
1132
+ "cell_type": "code",
1133
+ "execution_count": null,
1134
+ "id": "eAnrw316iRw8",
1135
+ "metadata": {
1136
+ "colab": {
1137
+ "base_uri": "https://localhost:8080/"
1138
+ },
1139
+ "id": "eAnrw316iRw8",
1140
+ "outputId": "04885bb5-4f06-459b-a83c-40f5e00703fe"
1141
+ },
1142
+ "outputs": [
1143
+ {
1144
+ "name": "stdout",
1145
+ "output_type": "stream",
1146
+ "text": [
1147
+ "0\n"
1148
+ ]
1149
+ }
1150
+ ],
1151
+ "source": [
1152
+ "def predict(text):\n",
1153
+ " model.eval()\n",
1154
+ " inputs = tokenizer(\n",
1155
+ " text,\n",
1156
+ " return_tensors='pt',\n",
1157
+ " truncation=True,\n",
1158
+ " padding=True,\n",
1159
+ " max_length=MAX_LEN,\n",
1160
+ " )\n",
1161
+ " inputs = {k: v.to(device) for k, v in inputs.items()}\n",
1162
+ "\n",
1163
+ " with torch.no_grad():\n",
1164
+ " logits = model(inputs['input_ids'], inputs['attention_mask'])\n",
1165
+ " probs = torch.softmax(logits, dim=1)\n",
1166
+ " pred = torch.argmax(probs, dim=1).item()\n",
1167
+ " confidence = probs[0, pred].item()\n",
1168
+ "\n",
1169
+ " label = 'AI' if pred == 1 else 'Human'\n",
1170
+ " return label, confidence\n",
1171
+ "\n",
1172
+ "sample = 'अख्तियार दुरुपयोग अनुसन्धान आयोगले सिन्धुपाल्चोक–२ बाट प्रतिनिधिसभा सदस्य निर्वाचित सांसद तथा पूर्वमन्त्री बस्नेतसहित १६ जना र २ कम्पनी विरुद्ध ३ अर्ब २१ करोडभन्दा बढी बिगो कायम गरी बिहीबार विशेष अदालतमा भ्रष्टाचार मुद्दा दायर गरेको छ ।'\n",
1173
+ "label, conf = predict(sample)\n",
1174
+ "print(f'Prediction: {label} | Confidence: {conf:.4f}')"
1175
+ ]
1176
+ },
1177
+ {
1178
+ "cell_type": "code",
1179
+ "execution_count": null,
1180
+ "id": "lqGrqG51NiQV",
1181
+ "metadata": {
1182
+ "colab": {
1183
+ "base_uri": "https://localhost:8080/"
1184
+ },
1185
+ "id": "lqGrqG51NiQV",
1186
+ "outputId": "6bdae59b-2684-4bd0-f804-d16ebd8272db"
1187
+ },
1188
+ "outputs": [
1189
+ {
1190
+ "name": "stdout",
1191
+ "output_type": "stream",
1192
+ "text": [
1193
+ "1\n",
1194
+ "1\n",
1195
+ "1\n",
1196
+ "1\n",
1197
+ "1\n",
1198
+ "1\n",
1199
+ "1\n",
1200
+ "1\n",
1201
+ "1\n",
1202
+ "0\n"
1203
+ ]
1204
+ }
1205
+ ],
1206
+ "source": [
1207
+ "print(predict(\"इन्टरनेटको सुरुवात सन् १९६९ मा अमेरिकी रक्षा मन्त्रालयले निर्माण गरेको ARPANET नामक प्रोजेक्टबाट भएको हो, जसको उद्देश्य आपसी संचारलाई सहज बनाउने थियो र जसले भविष्यमा इन्टरनेटको रूप लियो\"))\n",
1208
+ "\n",
1209
+ "print(predict(\"सुरुमा इन्टरनेट केही वैज्ञानिक तथा सरकारी संस्थाहरूमा सीमित रहेको भए पनि, समयक्रममा यसको पहुँच आम नागरिक, विद्यालय, र व्यवसायिक क्षेत्रमा विस्तार हुँदै गयो\"))\n",
1210
+ "\n",
1211
+ "print(predict(\"ARPANETले कम्प्युटरहरूलाई आपसमा जोड्ने सफल प्रयोग गरेपछि इन्टरनेटको सम्भावना प्रमाणित भयो, जसले गर्दा विश्वभरका अनुसन्धानकर्ताहरू यसप्रति आकर्षित हुन थाले\"))\n",
1212
+ "\n",
1213
+ "print(predict(\"सन् १९९० को दशकमा विश्वव्यापी रूपमा इन्टरनेट विस्तार हुन थालेपछि मानिसहरू सूचनाको आदान–प्रदान, इमेल, र वेबसाइटहरूको प्रयोगमार्फत डिजिटल संसारमा प्रवेश गर्न थाले।\"))\n",
1214
+ "\n",
1215
+ "print(predict(\"इन्टरनेटले शिक्षा, स्वास्थ्य, सञ्चार, मनोरञ्जन, तथा व्यापारजस्ता धेरै क्षेत्रहरूमा अभूतपूर्व परिवर्तन ल्याएको छ, जसले गर्दा मानव जीवन सरल, छरितो र प्रभावकारी बनेको छ।\"))\n",
1216
+ "\n",
1217
+ "print(predict(\"समयसँगै इन्टरनेट एक अत्यावश्यक सेवाको रूपमा विकास भएको छ, जसबिनाको आधुनिक जीवन लगभग असम्भवजस्तै लाग्ने अवस्था सिर्जना भएको छ।\"))\n",
1218
+ "\n",
1219
+ "print(predict(\"आजको युगमा इन्टरनेट केवल सूचना प्राप्तिको माध्यम मात्र नभई ज्ञानको भण्डार, रचनात्मकता प्रदर्शन गर्ने मंच, तथा रोजगार सृजनाको स्रोत पनि बनिसकेको छ।\"))\n",
1220
+ "\n",
1221
+ "print(predict(\"इन्टरनेटको प्रभाव त्यति गहिरो भएको छ कि विद्यालयका बालबालिकादेखि वृद्धसम्म यसको प्रयोगमा संलग्न छन्, जसले डिजिटल विभाजनको अवधारणा जन्माएको छ।\"))\n",
1222
+ "\n",
1223
+ "print(predict(\"इन्टरनेटले विश्वलाई एउटा सानो गाउँमा रूपान्तरण गरेको छ, जहाँ मानिसहरू हजारौं किलोमिटर टाढा भएर पनि एकअर्कासँग प्रत्यक्ष संवाद गर्न सक्छन्।\"))\n",
1224
+ "\n",
1225
+ "print(predict(\"संसदीय समितिले समन्वयकारी भूमिका निर्वाह गर्दै मनसुनजन्य विपद् जोखिम न्यूनीकरण, विपद् प्रतिकार्यका लागि तयारी गर्न तीन तहकै सरकारलाई निर्देशन दिएको छ।\"))\n"
1226
+ ]
1227
+ },
1228
+ {
1229
+ "cell_type": "code",
1230
+ "execution_count": null,
1231
+ "id": "X2ePCc5Disrt",
1232
+ "metadata": {
1233
+ "colab": {
1234
+ "base_uri": "https://localhost:8080/",
1235
+ "height": 35
1236
+ },
1237
+ "id": "X2ePCc5Disrt",
1238
+ "outputId": "a4d27689-28cb-43c0-8333-67f2d3a6e097"
1239
+ },
1240
+ "outputs": [
1241
+ {
1242
+ "data": {
1243
+ "application/vnd.google.colaboratory.intrinsic+json": {
1244
+ "type": "string"
1245
+ },
1246
+ "text/plain": [
1247
+ "'/content/classifier.zip'"
1248
+ ]
1249
+ },
1250
+ "execution_count": 42,
1251
+ "metadata": {},
1252
+ "output_type": "execute_result"
1253
+ }
1254
+ ],
1255
+ "source": [
1256
+ "import shutil\n",
1257
+ "\n",
1258
+ "# Folder to archive (adjust if your saved classifier lives elsewhere) and the zip file to create from it\n",
1259
+ "folder_path = '/content/nepali_xlmr_classifier'\n",
1260
+ "zip_path = '/content/classifier.zip'\n",
1261
+ "\n",
1262
+ "shutil.make_archive(zip_path.replace('.zip', ''), 'zip', folder_path)\n"
1263
+ ]
1264
+ },
1265
+ {
1266
+ "cell_type": "code",
1267
+ "execution_count": null,
1268
+ "id": "4BDzVg2gN7xi",
1269
+ "metadata": {
1270
+ "colab": {
1271
+ "base_uri": "https://localhost:8080/",
1272
+ "height": 17
1273
+ },
1274
+ "id": "4BDzVg2gN7xi",
1275
+ "outputId": "ef31798e-24f5-45ad-900f-7528b32ae39f"
1276
+ },
1277
+ "outputs": [
1278
+ {
1279
+ "data": {
1280
+ "application/javascript": "\n async function download(id, filename, size) {\n if (!google.colab.kernel.accessAllowed) {\n return;\n }\n const div = document.createElement('div');\n const label = document.createElement('label');\n label.textContent = `Downloading \"${filename}\": `;\n div.appendChild(label);\n const progress = document.createElement('progress');\n progress.max = size;\n div.appendChild(progress);\n document.body.appendChild(div);\n\n const buffers = [];\n let downloaded = 0;\n\n const channel = await google.colab.kernel.comms.open(id);\n // Send a message to notify the kernel that we're ready.\n channel.send({})\n\n for await (const message of channel.messages) {\n // Send a message to notify the kernel that we're ready.\n channel.send({})\n if (message.buffers) {\n for (const buffer of message.buffers) {\n buffers.push(buffer);\n downloaded += buffer.byteLength;\n progress.value = downloaded;\n }\n }\n }\n const blob = new Blob(buffers, {type: 'application/binary'});\n const a = document.createElement('a');\n a.href = window.URL.createObjectURL(blob);\n a.download = filename;\n div.appendChild(a);\n a.click();\n div.remove();\n }\n ",
1281
+ "text/plain": [
1282
+ "<IPython.core.display.Javascript object>"
1283
+ ]
1284
+ },
1285
+ "metadata": {},
1286
+ "output_type": "display_data"
1287
+ },
1288
+ {
1289
+ "data": {
1290
+ "application/javascript": "download(\"download_33034c8f-76d5-48d0-b7cd-3d066ac8e32f\", \"classifier.zip\", 6596694)",
1291
+ "text/plain": [
1292
+ "<IPython.core.display.Javascript object>"
1293
+ ]
1294
+ },
1295
+ "metadata": {},
1296
+ "output_type": "display_data"
1297
+ }
1298
+ ],
1299
+ "source": [
1300
+ "from google.colab import files\n",
1301
+ "\n",
1302
+ "files.download(zip_path)\n"
1303
+ ]
1304
+ },
1305
+ {
1306
+ "cell_type": "code",
1307
+ "execution_count": null,
1308
+ "id": "2jJkcOlw_R1k",
1309
+ "metadata": {
1310
+ "id": "2jJkcOlw_R1k"
1311
+ },
1312
+ "outputs": [],
1313
+ "source": [
1314
+ "torch.save(model.state_dict(), \"final_model.pth\") # AFTER training with classification head\n"
1315
+ ]
1316
+ },
1317
+ {
1318
+ "cell_type": "code",
1319
+ "execution_count": null,
1320
+ "id": "xnHr1IDABebZ",
1321
+ "metadata": {
1322
+ "colab": {
1323
+ "base_uri": "https://localhost:8080/"
1324
+ },
1325
+ "id": "xnHr1IDABebZ",
1326
+ "outputId": "95761a2d-56fa-418c-de03-d66d1ae662ee"
1327
+ },
1328
+ "outputs": [
1329
+ {
1330
+ "name": "stdout",
1331
+ "output_type": "stream",
1332
+ "text": [
1333
+ "The text is predicted to be: Human\n",
1334
+ "1\n",
1335
+ "0\n",
1336
+ "1\n"
1337
+ ]
1338
+ }
1339
+ ],
1340
+ "source": [
1341
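+ "# prompt: How do I load the model and classifier and use them if there is no other code above this cell? (NOTE: torch, AutoTokenizer and IndicBERTClassifier must still be imported/defined before running this cell.)\n",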
1342
+ "\n",
1343
+ "# Define the device\n",
1344
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
1345
+ "\n",
1346
+ "# Instantiate the model\n",
1347
+ "model = IndicBERTClassifier().to(device)\n",
1348
+ "\n",
1349
+ "# Load the saved state dictionary\n",
1350
+ "# Make sure the path to your saved model file is correct\n",
1351
+ "model_path = \"final_model.pth\" # Or \"model_95_acc.pth\" if you saved that one last\n",
1352
+ "model.load_state_dict(torch.load(model_path, map_location=device))\n",
1353
+ "\n",
1354
+ "# Set the model to evaluation mode\n",
1355
+ "model.eval()\n",
1356
+ "\n",
1357
+ "# Load the tokenizer\n",
1358
+ "tokenizer_path = \"./nepali_xlmr_classifier\" # Make sure this path is correct\n",
1359
+ "tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)\n",
1360
+ "\n",
1361
+ "# Now the model and tokenizer are loaded and ready to be used for predictions.\n",
1362
+ "# You can use the existing `predict` function or write a new one.\n",
1363
+ "\n",
1364
+ "# Example of using the predict function with the loaded model and tokenizer\n",
1365
+ "def predict(text):\n",
1366
+ " model.eval() # Ensure model is in evaluation mode\n",
1367
+ " inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)\n",
1368
+ " inputs = {k: v.to(device) for k, v in inputs.items()}\n",
1369
+ " with torch.no_grad():\n",
1370
+ " outputs = model(**inputs)\n",
1371
+ "\n",
1372
+ " # Handle if output is tensor (some versions/models return logits directly)\n",
1373
+ " logits = outputs if isinstance(outputs, torch.Tensor) else outputs.logits\n",
1374
+ "\n",
1375
+ " pred = torch.argmax(logits, dim=1).item()\n",
1376
+ " return pred\n",
1377
+ "\n",
1378
+ "# Example usage with some text\n",
1379
+ "text_to_predict = \"This is a test sentence.\" # Replace with your Nepali text\n",
1380
+ "predicted_class = predict(text_to_predict)\n",
1381
+ "\n",
1382
+ "# Interpret the prediction (assuming 0 for Human, 1 for AI based on your previous code)\n",
1383
+ "class_label = \"Human\" if predicted_class == 0 else \"AI\"\n",
1384
+ "print(f\"The text is predicted to be: {class_label}\")\n",
1385
+ "\n",
1386
+ "# You can test with more examples as you did before\n",
1387
+ "print(predict(\"यी सबै वाक्यहरू इन्टरनेटको विकास, प्रभाव, र चुनौतीहरूको गहिरो सन्दर्भ समेटेर तयार पारिएका छन्। यदि तिमीलाई चाहिएको खण्डमा विशेष विषय (जस्तै शिक्षा, साइबर सुरक्षा, ग्रामीण प्रभाव आदि) चाहिएको हो भने, म त्यही विषयमा केन्द्रित लामो वाक्यहरू पनि दिन सक्छु।\"))\n",
1388
+ "print(predict(\"अख्तियार दुरुपयोग अनुसन्धान आयोगले सिन्धुपाल्चोक–२ बाट प्रतिनिधिसभा सदस्य निर्वाचित सांसद तथा पूर्वमन्त्री बस्नेतसहित १६ जना र २ कम्पनी विरुद्ध ३ अर्ब २१ करोडभन्दा बढी बिगो कायम गरी बिहीबार विशेष अदालतमा भ्रष्टाचार मुद्दा दायर गरेको छ । योसँगै बस्नेत सांसद पदबाट स्वतः निलम्बनमा परेका छन् ।\"))\n",
1389
+ "print(predict(\"इन्टरनेटको सुरुवात सन् १९६९ मा अमेरिकी रक्षा मन्त्रालयले निर्माण गरेको ARPANET नामक प्रोजेक्टबाट भएको हो, जसको उद्देश्य आपसी संचारलाई सहज बनाउने थियो र जसले भविष्यमा इन्टरनेटको रूप लियो\"))\n"
1390
+ ]
1391
+ },
1392
+ {
1393
+ "cell_type": "code",
1394
+ "execution_count": null,
1395
+ "id": "gG8fnbqyDUpm",
1396
+ "metadata": {
1397
+ "id": "gG8fnbqyDUpm"
1398
+ },
1399
+ "outputs": [],
1400
+ "source": []
1401
+ }
1402
+ ],
1403
+ "metadata": {
1404
+ "accelerator": "TPU",
1405
+ "colab": {
1406
+ "gpuType": "V28",
1407
+ "provenance": []
1408
+ },
1409
+ "kernelspec": {
1410
+ "display_name": "ml",
1411
+ "language": "python",
1412
+ "name": "python3"
1413
+ },
1414
+ "language_info": {
1415
+ "codemirror_mode": {
1416
+ "name": "ipython",
1417
+ "version": 3
1418
+ },
1419
+ "file_extension": ".py",
1420
+ "mimetype": "text/x-python",
1421
+ "name": "python",
1422
+ "nbconvert_exporter": "python",
1423
+ "pygments_lexer": "ipython3",
1424
+ "version": "3.11.14"
1425
+ }
1426
+ },
1427
+ "nbformat": 4,
1428
+ "nbformat_minor": 5
1429
+ }
notebook/ai_vs_human_nepali/notebook/final_main.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebook/ai_vs_human_nepali/notebook/main.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebook/ai_vs_human_nepali/notebook/working model.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebook/ai_vs_human_nepali/topic_scrapper.ipynb ADDED
@@ -0,0 +1,542 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 15,
6
+ "id": "4b53d4bc",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "# # Groq Nepali Rewriter\n",
11
+ "\n",
12
+ "# This notebook loads the dataset, builds a Nepali rewrite prompt, tests one sample, and then saves a batch output CSV using the Groq API.\n",
13
+ "\n",
14
+ "# Requirements:\n",
15
+ "# - `GROQ_API_KEY2` must be available in `.env` or the environment (this is the variable the notebook reads)\n",
16
+ "# - the batch input file must contain a `Title`/`शीर्षक` column or a `paragraph`/`Paragraph`/`विवरण` column"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": 16,
22
+ "id": "6c8dc1cb",
23
+ "metadata": {},
24
+ "outputs": [
25
+ {
26
+ "data": {
27
+ "text/plain": [
28
+ "True"
29
+ ]
30
+ },
31
+ "execution_count": 16,
32
+ "metadata": {},
33
+ "output_type": "execute_result"
34
+ }
35
+ ],
36
+ "source": [
37
+ "import os\n",
38
+ "import re\n",
39
+ "import time\n",
40
+ "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
41
+ "\n",
42
+ "import pandas as pd\n",
43
+ "from dotenv import load_dotenv\n",
44
+ "from groq import Groq\n",
45
+ "\n",
46
+ "load_dotenv()"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": 17,
52
+ "id": "019adfa8",
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "api_key = os.getenv(\"GROQ_API_KEY2\")\n",
57
+ "if not api_key:\n",
58
+ " raise ValueError(\"GROQ_API_KEY2 not found in .env or environment.\")\n",
59
+ "\n",
60
+ "client = Groq(api_key=api_key)\n",
61
+ "MODEL_NAME = \"llama-3.3-70b-versatile\""
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": 18,
67
+ "id": "4b4d2bbe",
68
+ "metadata": {},
69
+ "outputs": [],
70
+ "source": [
71
+ "data =pd.read_csv(\"DATASET/topics_1000.csv\")"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 19,
77
+ "id": "c36cfbbf",
78
+ "metadata": {},
79
+ "outputs": [
80
+ {
81
+ "data": {
82
+ "text/html": [
83
+ "<div>\n",
84
+ "<style scoped>\n",
85
+ " .dataframe tbody tr th:only-of-type {\n",
86
+ " vertical-align: middle;\n",
87
+ " }\n",
88
+ "\n",
89
+ " .dataframe tbody tr th {\n",
90
+ " vertical-align: top;\n",
91
+ " }\n",
92
+ "\n",
93
+ " .dataframe thead th {\n",
94
+ " text-align: right;\n",
95
+ " }\n",
96
+ "</style>\n",
97
+ "<table border=\"1\" class=\"dataframe\">\n",
98
+ " <thead>\n",
99
+ " <tr style=\"text-align: right;\">\n",
100
+ " <th></th>\n",
101
+ " <th>id</th>\n",
102
+ " <th>topic</th>\n",
103
+ " </tr>\n",
104
+ " </thead>\n",
105
+ " <tbody>\n",
106
+ " <tr>\n",
107
+ " <th>0</th>\n",
108
+ " <td>1</td>\n",
109
+ " <td>नेपालमा कृत्रिम बुद्धिमत्ता विकासको वर्तमान अव...</td>\n",
110
+ " </tr>\n",
111
+ " <tr>\n",
112
+ " <th>1</th>\n",
113
+ " <td>2</td>\n",
114
+ " <td>नेपालको शिक्षा प्रणालीमा डिजिटल प्रविधिको प्रभाव</td>\n",
115
+ " </tr>\n",
116
+ " <tr>\n",
117
+ " <th>2</th>\n",
118
+ " <td>3</td>\n",
119
+ " <td>काठमाडौँ उपत्यकाको वायु प्रदूषण समस्या</td>\n",
120
+ " </tr>\n",
121
+ " <tr>\n",
122
+ " <th>3</th>\n",
123
+ " <td>4</td>\n",
124
+ " <td>नेपालमा जलवायु परिवर्तनका असरहरू</td>\n",
125
+ " </tr>\n",
126
+ " <tr>\n",
127
+ " <th>4</th>\n",
128
+ " <td>5</td>\n",
129
+ " <td>ग्रामीण क्षेत्रमा इन्टरनेट पहुँचको विस्तार</td>\n",
130
+ " </tr>\n",
131
+ " </tbody>\n",
132
+ "</table>\n",
133
+ "</div>"
134
+ ],
135
+ "text/plain": [
136
+ " id topic\n",
137
+ "0 1 नेपालमा कृत्रिम बुद्धिमत्ता विकासको वर्तमान अव...\n",
138
+ "1 2 नेपालको शिक्षा प्रणालीमा डिजिटल प्रविधिको प्रभाव\n",
139
+ "2 3 काठमाडौँ उपत्यकाको वायु प्रदूषण समस्या\n",
140
+ "3 4 नेपालमा जलवायु परिवर्तनका असरहरू\n",
141
+ "4 5 ग्रामीण क्षेत्रमा इन्टरनेट पहुँचको विस्तार"
142
+ ]
143
+ },
144
+ "execution_count": 19,
145
+ "metadata": {},
146
+ "output_type": "execute_result"
147
+ }
148
+ ],
149
+ "source": [
150
+ "data.head()"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": 20,
156
+ "id": "b6e226b8",
157
+ "metadata": {},
158
+ "outputs": [],
159
+ "source": [
160
+ "import numpy as np\n",
161
+ "def build_prompt(paragraph):\n",
162
+ " style = [\n",
163
+ " \"Use simple and clear language.\",\n",
164
+ " \"Make it engaging and interesting to read.\",\n",
165
+ " \"Use a conversational tone.\",\n",
166
+ " \"Keep the original meaning intact.\",\n",
167
+ " \"Avoid complex jargon and technical terms.\",\n",
168
+ " \"Use short sentences and paragraphs.\",\n",
169
+ " \"Add examples or anecdotes to illustrate points.\",\n",
170
+ " \"Use active voice instead of passive voice.\",\n",
171
+ " \"Include a call to action or a thought-provoking question at the end.\",\n",
172
+ " ]\n",
173
+ " selected_style_random_single = np.random.choice(style, size=len(style), replace=False) # Shuffle all style guidelines into a random order (none are dropped)\n",
174
+ " prompt = f\"\"\"\n",
175
+ " give me an essay for the following topics puree nepali ok no enlgish language:\n",
176
+ " {paragraph}\n",
177
+ " Rewrite the above paragraph in Nepali, following these style guidelines:\n",
178
+ " {', '.join(selected_style_random_single)}\n",
179
+ " \"\"\"\n",
180
+ " return prompt.strip()"
181
+ ]
182
+ },
183
+ {
184
+ "cell_type": "code",
185
+ "execution_count": 21,
186
+ "id": "cf16922b",
187
+ "metadata": {},
188
+ "outputs": [
189
+ {
190
+ "name": "stdout",
191
+ "output_type": "stream",
192
+ "text": [
193
+ "नेपालमा कृत्रिम बुद्धिमत्ता विकासको वर्तमान अवस्था\n",
194
+ "\n",
195
+ "कृत्रिम बुद्धिमत्ता विकास नेपालको लागि एक नयाँ युग हो । यो प्राविधिक क्षेत्र दिन-प्रतिदिन विकसित हुने क्रममा छ । नेपालमा कृत्रिम बुद्धिमत्ताले विभिन्न क्षेत्रमा परिवर्तन ल्याउने क्षमता राख्दछ । जस्तै: स्वास्थ्य सेवामा, शिक्षामा, वित्तीय सेवामा, तथा उत्पादन क्षेत्रमा ।\n",
196
+ "\n",
197
+ "नेपालमा कृत्रिम बुद्धिमत्ताको विकासले नयाँ अवस्था प्राप्त गरिरहेको छ । यो देशमा विभिन्न प्राविधिक कम्पनीहरुले कृत्रिम बुद्धिमत्ताको विकासमा लगनशील छन् । तसर्थ, यसले नेपालमा रोजगारीको अवसर पनि बढाउने छ । उदाहरणको लागि, कृत्रिम बुद्धिमत्ताले स्वास्थ्य सेवामा रोग निदान गर्ने, रोगको उपचार सुझाउने, तथा व्यक्तिको स्वास्थ्य जाँच गर्ने काम गर्नसक्ने छ ।\n",
198
+ "\n",
199
+ "कृत्रिम बुद्धिमत्ताको विकासले नेपालको अर्थतन्त्रमा पनि परिवर्तन ल्याउने छ । यसले व्यवसायिक क्षेत्रमा उत्पादनशीलता बढाउने, उत्पादन मुल्य कम गर्ने, तथा गुणस्तर मापन गर्ने काम गर्नसक्ने छ । उदाहरणको लागि, कृत्रिम बुद्धिमत्ताले वित्तीय सेवामा लेनदेनको निरीक्षण गर्ने, धोकाधोकाको मुल्यांकन गर्ने, तथा वित्तीय संस्थाहरुलाई सुझाव दिने काम गर्नसक्ने छ ।\n",
200
+ "\n",
201
+ "नेपालमा कृत्रिम बुद्धिमत्ता विकासको वर्तमान अवस्थाले देशलाई एक नयाँ दिशामा लम्बने क्षमता राख्दछ । तर, यसको विकासमा चुनौतिहरु पनि छन् । जस्तै: डाटा सुरक्षा, निजताको हनन, तथा श्रमिकहरुको प्रतिस्पर्धी क्षमता । तसर्थ, नेपालमा कृत्रिम बुद्धिमत्ताको विकासलाई प्रोत्साहित गर्नको लागि, हामीले यसको विकासमा लगनशील कम्पनीहरुलाई साथ दिनु पर्छ । हामीले पनि कृत्रिम बुद्धिमत्ताको विकासमा योगदान पुर्याउनुपर्छ ।\n",
202
+ "\n",
203
+ "आह, नेपालमा कृत्रिम बुद्धिमत्ता विकासको वर्तमान अवस्थाले देशलाई एक नयाँ दिशामा लम्बने क्षमता राख्दछ । तर, यसको विकासमा हामी के गरिरहेका छौ? हामीले कृत्रिम बुद्धिमत्ताको विकासमा योगदान पुर्याउने छौ कि? हामीले यसको विकासमा चुनौतिहरुलाई मात गर्ने छौ कि? यस प्रश्नको उत्तर हामीसँग छ । आउनうभ, हामी नेपालमा कृत्रिम बुद्धिमत्ताको विकासलाई प्रोत्साहित गरौं । आउनूभ, हामी देशलाई एक नयाँ दिशामा लम्बौं ।\n"
204
+ ]
205
+ }
206
+ ],
207
+ "source": [
208
+ "build_prompt = build_prompt\n",
209
+ "\n",
210
+ "sample_title = str(data.iloc[0][\"topic\"])\n",
211
+ "\n",
212
+ "sample_response = client.chat.completions.create(\n",
213
+ " model=MODEL_NAME,\n",
214
+ " messages=[{\"role\": \"user\", \"content\": build_prompt(sample_title)}],\n",
215
+ ")\n",
216
+ "\n",
217
+ "generated_text = sample_response.choices[0].message.content.strip()\n",
218
+ "print(generated_text)"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": null,
224
+ "id": "c709f126",
225
+ "metadata": {},
226
+ "outputs": [],
227
+ "source": [
228
+ "def grok_step3_5_scraper(\n",
229
+ " input_file,\n",
230
+ " output_file=\"step3_5_grok_nepali.csv\",\n",
231
+ " limit=100,\n",
232
+ " model=MODEL_NAME,\n",
233
+ " requests_per_second=2,\n",
234
+ " max_workers=2,\n",
235
+ " max_retries=3,\n",
236
+ "):\n",
237
+ " working_df = pd.read_csv(input_file)\n",
238
+ " if limit is not None:\n",
239
+ " working_df = working_df.head(limit)\n",
240
+ "\n",
241
+ " cols = set(working_df.columns)\n",
242
+ " if \"Title\" in cols or \"शीर्षक\" in cols:\n",
243
+ " title_col = \"Title\" if \"Title\" in cols else \"शीर्षक\"\n",
244
+ " prompt_col = title_col\n",
245
+ " if \"Paragraph\" in cols:\n",
246
+ " human_col = \"Paragraph\"\n",
247
+ " elif \"विवरण\" in cols:\n",
248
+ " human_col = \"विवरण\"\n",
249
+ " elif \"paragraph\" in cols:\n",
250
+ " human_col = \"paragraph\"\n",
251
+ " else:\n",
252
+ " human_col = prompt_col\n",
253
+ " elif \"paragraph\" in cols or \"Paragraph\" in cols or \"विवरण\" in cols:\n",
254
+ " prompt_col = (\n",
255
+ " \"paragraph\" if \"paragraph\" in cols\n",
256
+ " else (\"Paragraph\" if \"Paragraph\" in cols else \"विवरण\")\n",
257
+ " )\n",
258
+ " human_col = prompt_col\n",
259
+ " title_col = prompt_col\n",
260
+ " else:\n",
261
+ " raise ValueError(\n",
262
+ " \"No supported text columns found. Expected one of: Title/शीर्षक with Paragraph/विवरण, or paragraph.\"\n",
263
+ " )\n",
264
+ "\n",
265
+ " working_df = working_df.dropna(subset=[human_col]).copy()\n",
266
+ "\n",
267
+ " total_input_rows = len(working_df)\n",
268
+ " already_done = 0\n",
269
+ "\n",
270
+ " if os.path.exists(output_file):\n",
271
+ " try:\n",
272
+ " existing_df = pd.read_csv(output_file)\n",
273
+ " already_done = len(existing_df)\n",
274
+ " except pd.errors.EmptyDataError:\n",
275
+ " already_done = 0\n",
276
+ "\n",
277
+ " if already_done >= total_input_rows:\n",
278
+ " print(\n",
279
+ " f\"Nothing to do. {already_done} rows already exist in {output_file} (input rows: {total_input_rows}).\"\n",
280
+ " )\n",
281
+ " return\n",
282
+ "\n",
283
+ " if already_done > 0:\n",
284
+ " working_df = working_df.iloc[already_done:].copy()\n",
285
+ " print(\n",
286
+ " f\"Resuming from row {already_done}. Processing remaining {len(working_df)} rows out of {total_input_rows}.\"\n",
287
+ " )\n",
288
+ " else:\n",
289
+ " print(f\"Loaded {total_input_rows} rows from {input_file}\")\n",
290
+ " print(\n",
291
+ " f\"Using title column: {title_col} | prompt column: {prompt_col} | human column: {human_col}\"\n",
292
+ " )\n",
293
+ "\n",
294
+ " results = []\n",
295
+ "\n",
296
+ " bad_markers = [\n",
297
+ " \"error\",\n",
298
+ " \"invalid\",\n",
299
+ " \"not found\",\n",
300
+ " \"decommissioned\",\n",
301
+ " \"rate limit\",\n",
302
+ " \"api key\",\n",
303
+ " ]\n",
304
+ "\n",
305
+ " def is_valid_ai_text(text: str) -> bool:\n",
306
+ " if not text:\n",
307
+ " return False\n",
308
+ " clean_text = text.strip()\n",
309
+ " if len(clean_text) < 20:\n",
310
+ " return False\n",
311
+ " lower_text = clean_text.lower()\n",
312
+ " return not any(marker in lower_text for marker in bad_markers)\n",
313
+ "\n",
314
+ " def extract_retry_wait_seconds(error_text: str) -> float:\n",
315
+ " match = re.search(r\"try again in\\s*(\\d+)ms\", error_text, re.IGNORECASE)\n",
316
+ " if match:\n",
317
+ " return int(match.group(1)) / 1000.0 + 0.2\n",
318
+ " return 1.5\n",
319
+ "\n",
320
+ " def process_one(idx, title_text, prompt_text, human_text):\n",
321
+ " local_client = Groq(api_key=api_key)\n",
322
+ "\n",
323
+ " for attempt in range(max_retries + 1):\n",
324
+ " try:\n",
325
+ " completion = local_client.chat.completions.create(\n",
326
+ " model=model,\n",
327
+ " messages=[{\"role\": \"user\", \"content\": build_prompt(str(prompt_text))}],\n",
328
+ " temperature=0.2,\n",
329
+ " max_tokens=500,\n",
330
+ " )\n",
331
+ " ai_text = completion.choices[0].message.content.strip()\n",
332
+ "\n",
333
+ " if not is_valid_ai_text(ai_text):\n",
334
+ " if attempt < max_retries:\n",
335
+ " continue\n",
336
+ " return {\n",
337
+ " \"idx\": idx,\n",
338
+ " \"ok\": False,\n",
339
+ " \"reason\": \"invalid_or_error_text\",\n",
340
+ " \"ai_text\": ai_text,\n",
341
+ " }\n",
342
+ "\n",
343
+ " return {\n",
344
+ " \"idx\": idx,\n",
345
+ " \"ok\": True,\n",
346
+ " \"title\": str(title_text),\n",
347
+ " \"human_text\": str(human_text),\n",
348
+ " \"ai_generated_text\": ai_text,\n",
349
+ " }\n",
350
+ " except Exception as error:\n",
351
+ " error_text = str(error)\n",
352
+ " is_rate_limited = (\n",
353
+ " \"rate_limit_exceeded\" in error_text.lower()\n",
354
+ " or \"rate limit reached\" in error_text.lower()\n",
355
+ " or \"429\" in error_text\n",
356
+ " )\n",
357
+ "\n",
358
+ " if is_rate_limited and attempt < max_retries:\n",
359
+ " wait_seconds = extract_retry_wait_seconds(error_text)\n",
360
+ " print(\n",
361
+ " f\"Row {idx} rate-limited, retry {attempt + 1}/{max_retries} after {wait_seconds:.2f}s\"\n",
362
+ " )\n",
363
+ " time.sleep(wait_seconds)\n",
364
+ " continue\n",
365
+ "\n",
366
+ " return {\n",
367
+ " \"idx\": idx,\n",
368
+ " \"ok\": False,\n",
369
+ " \"reason\": error_text,\n",
370
+ " \"ai_text\": \"\",\n",
371
+ " }\n",
372
+ "\n",
373
+ " rows = list(working_df[[title_col, prompt_col, human_col]].itertuples(index=True, name=None))\n",
374
+ " total = len(rows)\n",
375
+ "\n",
376
+ " for start in range(0, total, requests_per_second):\n",
377
+ " window = rows[start : start + requests_per_second]\n",
378
+ " tick_start = time.time()\n",
379
+ "\n",
380
+ " with ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
381
+ " futures = {\n",
382
+ " executor.submit(process_one, idx, title_text, prompt_text, human_text): idx\n",
383
+ " for idx, title_text, prompt_text, human_text in window\n",
384
+ " }\n",
385
+ "\n",
386
+ " for future in as_completed(futures):\n",
387
+ " out = future.result()\n",
388
+ " if out[\"ok\"]:\n",
389
+ " # Save as id + ai_gen only\n",
390
+ " results.append({\n",
391
+ " \"id\": out[\"idx\"],\n",
392
+ " \"ai_gen\": out[\"ai_generated_text\"]\n",
393
+ " })\n",
394
+ " print(\n",
395
+ " f\"Row {out['idx']}: generated {len(out['ai_generated_text'].split())} words\"\n",
396
+ " )\n",
397
+ " else:\n",
398
+ " print(f\"Row {out['idx']} skipped: {out['reason']}\")\n",
399
+ "\n",
400
+ " if len(results) >= 10:\n",
401
+ " pd.DataFrame(results)[[\"id\", \"ai_gen\"]].to_csv(\n",
402
+ " output_file,\n",
403
+ " index=False,\n",
404
+ " mode=\"a\",\n",
405
+ " header=not os.path.exists(output_file),\n",
406
+ " )\n",
407
+ " print(f\"Saved {len(results)} valid rows to {output_file}\")\n",
408
+ " results = []\n",
409
+ "\n",
410
+ " elapsed = time.time() - tick_start\n",
411
+ " if elapsed < 1:\n",
412
+ " time.sleep(1 - elapsed)\n",
413
+ "\n",
414
+ " if results:\n",
415
+ " pd.DataFrame(results)[[\"id\", \"ai_gen\"]].to_csv(\n",
416
+ " output_file,\n",
417
+ " index=False,\n",
418
+ " mode=\"a\",\n",
419
+ " header=not os.path.exists(output_file),\n",
420
+ " )\n",
421
+ "\n",
422
+ " print(f\"Finished. Output saved to {output_file}\")"
423
+ ]
424
+ },
425
+ {
426
+ "cell_type": "code",
427
+ "execution_count": 23,
428
+ "id": "29c3627c",
429
+ "metadata": {},
430
+ "outputs": [
431
+ {
432
+ "ename": "ParserError",
433
+ "evalue": "Error tokenizing data. C error: Expected 8 fields in line 33, saw 16\n",
434
+ "output_type": "error",
435
+ "traceback": [
436
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
437
+ "\u001b[0;31mParserError\u001b[0m Traceback (most recent call last)",
438
+ "Cell \u001b[0;32mIn[23], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m output_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnews_scrap_new21223123.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2\u001b[0m prepared_input \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDATASET/News_csv/ai_vs_human_input_all.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 4\u001b[0m \u001b[43mgrok_step3_5_scraper\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_file\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepared_input\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_file\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mlimit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mMODEL_NAME\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mrequests_per_second\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_workers\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_retries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m3\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(output_file):\n\u001b[1;32m 15\u001b[0m pd\u001b[38;5;241m.\u001b[39mread_csv(output_file)\u001b[38;5;241m.\u001b[39mtail()\n",
439
+ "Cell \u001b[0;32mIn[22], line 45\u001b[0m, in \u001b[0;36mgrok_step3_5_scraper\u001b[0;34m(input_file, output_file, limit, model, requests_per_second, max_workers, max_retries)\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(output_file):\n\u001b[1;32m 44\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 45\u001b[0m existing_df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_file\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 46\u001b[0m already_done \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(existing_df)\n\u001b[1;32m 47\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m pd\u001b[38;5;241m.\u001b[39merrors\u001b[38;5;241m.\u001b[39mEmptyDataError:\n",
440
+ "File \u001b[0;32m~/miniconda3/envs/ml/lib/python3.11/site-packages/pandas/io/parsers/readers.py:873\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, skip_blank_lines, parse_dates, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 861\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 862\u001b[0m dialect,\n\u001b[1;32m 863\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 869\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 870\u001b[0m )\n\u001b[1;32m 871\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 873\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
441
+ "File \u001b[0;32m~/miniconda3/envs/ml/lib/python3.11/site-packages/pandas/io/parsers/readers.py:306\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n\u001b[1;32m 305\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m parser:\n\u001b[0;32m--> 306\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mparser\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnrows\u001b[49m\u001b[43m)\u001b[49m\n",
442
+ "File \u001b[0;32m~/miniconda3/envs/ml/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1947\u001b[0m, in \u001b[0;36mTextFileReader.read\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1940\u001b[0m nrows \u001b[38;5;241m=\u001b[39m validate_integer(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnrows\u001b[39m\u001b[38;5;124m\"\u001b[39m, nrows)\n\u001b[1;32m 1941\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1942\u001b[0m \u001b[38;5;66;03m# error: \"ParserBase\" has no attribute \"read\"\u001b[39;00m\n\u001b[1;32m 1943\u001b[0m (\n\u001b[1;32m 1944\u001b[0m index,\n\u001b[1;32m 1945\u001b[0m columns,\n\u001b[1;32m 1946\u001b[0m col_dict,\n\u001b[0;32m-> 1947\u001b[0m ) \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[attr-defined]\u001b[39;49;00m\n\u001b[1;32m 1948\u001b[0m \u001b[43m \u001b[49m\u001b[43mnrows\u001b[49m\n\u001b[1;32m 1949\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1950\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 1951\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n",
443
+ "File \u001b[0;32m~/miniconda3/envs/ml/lib/python3.11/site-packages/pandas/io/parsers/c_parser_wrapper.py:215\u001b[0m, in \u001b[0;36mCParserWrapper.read\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlow_memory:\n\u001b[0;32m--> 215\u001b[0m chunks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_reader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_low_memory\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnrows\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;66;03m# destructive to chunks\u001b[39;00m\n\u001b[1;32m 217\u001b[0m data \u001b[38;5;241m=\u001b[39m _concatenate_chunks(chunks, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnames)\n",
444
+ "File \u001b[0;32mpandas/_libs/parsers.pyx:832\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader.read_low_memory\u001b[0;34m()\u001b[0m\n",
445
+ "File \u001b[0;32mpandas/_libs/parsers.pyx:897\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n",
446
+ "File \u001b[0;32mpandas/_libs/parsers.pyx:868\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._tokenize_rows\u001b[0;34m()\u001b[0m\n",
447
+ "File \u001b[0;32mpandas/_libs/parsers.pyx:885\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._check_tokenize_status\u001b[0;34m()\u001b[0m\n",
448
+ "File \u001b[0;32mpandas/_libs/parsers.pyx:2084\u001b[0m, in \u001b[0;36mpandas._libs.parsers.raise_parser_error\u001b[0;34m()\u001b[0m\n",
449
+ "\u001b[0;31mParserError\u001b[0m: Error tokenizing data. C error: Expected 8 fields in line 33, saw 16\n"
450
+ ]
451
+ }
452
+ ],
453
+ "source": [
454
+ "output_file = \"news_scrap_new21223123.csv\"\n",
455
+ "prepared_input = \"DATASET/News_csv/ai_vs_human_input_all.csv\"\n",
456
+ "\n",
457
+ "grok_step3_5_scraper(\n",
458
+ " input_file=prepared_input,\n",
459
+ " output_file=output_file,\n",
460
+ " limit=10,\n",
461
+ " model=MODEL_NAME,\n",
462
+ " requests_per_second=2,\n",
463
+ " max_workers=2,\n",
464
+ " max_retries=3,\n",
465
+ ")\n",
466
+ "\n",
467
+ "if os.path.exists(output_file):\n",
468
+ " pd.read_csv(output_file).tail()\n",
469
+ "else:\n",
470
+ " print(f\"No output file found: {output_file}\")"
471
+ ]
472
+ },
473
+ {
474
+ "cell_type": "code",
475
+ "execution_count": null,
476
+ "id": "3c3777e8",
477
+ "metadata": {},
478
+ "outputs": [
479
+ {
480
+ "ename": "ParserError",
481
+ "evalue": "Error tokenizing data. C error: Expected 8 fields in line 33, saw 16\n",
482
+ "output_type": "error",
483
+ "traceback": [
484
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
485
+ "\u001b[0;31mParserError\u001b[0m Traceback (most recent call last)",
486
+ "Cell \u001b[0;32mIn[14], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m teststes \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_file\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mtail()\n",
487
+ "File \u001b[0;32m~/miniconda3/envs/ml/lib/python3.11/site-packages/pandas/io/parsers/readers.py:873\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, skip_blank_lines, parse_dates, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 861\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 862\u001b[0m dialect,\n\u001b[1;32m 863\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 869\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 870\u001b[0m )\n\u001b[1;32m 871\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 873\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
488
+ "File \u001b[0;32m~/miniconda3/envs/ml/lib/python3.11/site-packages/pandas/io/parsers/readers.py:306\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n\u001b[1;32m 305\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m parser:\n\u001b[0;32m--> 306\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mparser\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnrows\u001b[49m\u001b[43m)\u001b[49m\n",
489
+ "File \u001b[0;32m~/miniconda3/envs/ml/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1947\u001b[0m, in \u001b[0;36mTextFileReader.read\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1940\u001b[0m nrows \u001b[38;5;241m=\u001b[39m validate_integer(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnrows\u001b[39m\u001b[38;5;124m\"\u001b[39m, nrows)\n\u001b[1;32m 1941\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1942\u001b[0m \u001b[38;5;66;03m# error: \"ParserBase\" has no attribute \"read\"\u001b[39;00m\n\u001b[1;32m 1943\u001b[0m (\n\u001b[1;32m 1944\u001b[0m index,\n\u001b[1;32m 1945\u001b[0m columns,\n\u001b[1;32m 1946\u001b[0m col_dict,\n\u001b[0;32m-> 1947\u001b[0m ) \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[attr-defined]\u001b[39;49;00m\n\u001b[1;32m 1948\u001b[0m \u001b[43m \u001b[49m\u001b[43mnrows\u001b[49m\n\u001b[1;32m 1949\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1950\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 1951\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n",
490
+ "File \u001b[0;32m~/miniconda3/envs/ml/lib/python3.11/site-packages/pandas/io/parsers/c_parser_wrapper.py:215\u001b[0m, in \u001b[0;36mCParserWrapper.read\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlow_memory:\n\u001b[0;32m--> 215\u001b[0m chunks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_reader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_low_memory\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnrows\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;66;03m# destructive to chunks\u001b[39;00m\n\u001b[1;32m 217\u001b[0m data \u001b[38;5;241m=\u001b[39m _concatenate_chunks(chunks, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnames)\n",
491
+ "File \u001b[0;32mpandas/_libs/parsers.pyx:832\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader.read_low_memory\u001b[0;34m()\u001b[0m\n",
492
+ "File \u001b[0;32mpandas/_libs/parsers.pyx:897\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n",
493
+ "File \u001b[0;32mpandas/_libs/parsers.pyx:868\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._tokenize_rows\u001b[0;34m()\u001b[0m\n",
494
+ "File \u001b[0;32mpandas/_libs/parsers.pyx:885\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._check_tokenize_status\u001b[0;34m()\u001b[0m\n",
495
+ "File \u001b[0;32mpandas/_libs/parsers.pyx:2084\u001b[0m, in \u001b[0;36mpandas._libs.parsers.raise_parser_error\u001b[0;34m()\u001b[0m\n",
496
+ "\u001b[0;31mParserError\u001b[0m: Error tokenizing data. C error: Expected 8 fields in line 33, saw 16\n"
497
+ ]
498
+ }
499
+ ],
500
+ "source": [
501
+ "teststes = pd.read_csv(output_file).tail()"
502
+ ]
503
+ },
504
+ {
505
+ "cell_type": "code",
506
+ "execution_count": null,
507
+ "id": "89c46554",
508
+ "metadata": {},
509
+ "outputs": [],
510
+ "source": []
511
+ },
512
+ {
513
+ "cell_type": "code",
514
+ "execution_count": null,
515
+ "id": "357ccb81",
516
+ "metadata": {},
517
+ "outputs": [],
518
+ "source": []
519
+ }
520
+ ],
521
+ "metadata": {
522
+ "kernelspec": {
523
+ "display_name": "ml",
524
+ "language": "python",
525
+ "name": "python3"
526
+ },
527
+ "language_info": {
528
+ "codemirror_mode": {
529
+ "name": "ipython",
530
+ "version": 3
531
+ },
532
+ "file_extension": ".py",
533
+ "mimetype": "text/x-python",
534
+ "name": "python",
535
+ "nbconvert_exporter": "python",
536
+ "pygments_lexer": "ipython3",
537
+ "version": "3.11.14"
538
+ }
539
+ },
540
+ "nbformat": 4,
541
+ "nbformat_minor": 5
542
+ }