Upload folder using huggingface_hub
Browse files
learn_with_history_visualisation.ipynb
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
-
"execution_count":
|
| 6 |
"id": "4543f137",
|
| 7 |
"metadata": {},
|
| 8 |
"outputs": [
|
|
@@ -273,9 +273,9 @@
|
|
| 273 |
"# --- 3. DATA LOADING ---\n",
|
| 274 |
"def load_data():\n",
|
| 275 |
" texts, labels = [], []\n",
|
| 276 |
-
" print(f\"
|
| 277 |
" if not DATA_ROOT.exists():\n",
|
| 278 |
-
" print(\"
|
| 279 |
" return [], []\n",
|
| 280 |
" \n",
|
| 281 |
" for text_file in DATA_ROOT.rglob(\"*.txt\"):\n",
|
|
@@ -298,7 +298,7 @@
|
|
| 298 |
"filtered_texts = [t for t, l in zip(texts, labels) if l in valid_classes]\n",
|
| 299 |
"filtered_labels = [l for t, l in zip(texts, labels) if l in valid_classes]\n",
|
| 300 |
"\n",
|
| 301 |
-
"print(f\"
|
| 302 |
"\n",
|
| 303 |
"# Label Encoding\n",
|
| 304 |
"label_encoder = LabelEncoder()\n",
|
|
@@ -315,13 +315,13 @@
|
|
| 315 |
")\n",
|
| 316 |
"\n",
|
| 317 |
"# Tokenization\n",
|
| 318 |
-
"print(\"
|
| 319 |
"tokenizer = DistilBertTokenizer.from_pretrained(MODEL_ID)\n",
|
| 320 |
"train_encodings = dict(tokenizer(train_texts, padding=\"max_length\", truncation=True, max_length=MAX_LEN, return_tensors=\"tf\"))\n",
|
| 321 |
"val_encodings = dict(tokenizer(val_texts, padding=\"max_length\", truncation=True, max_length=MAX_LEN, return_tensors=\"tf\"))\n",
|
| 322 |
"\n",
|
| 323 |
"# Model Initialization\n",
|
| 324 |
-
"print(\"
|
| 325 |
"model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_ID, num_labels=num_labels, from_pt=True)\n",
|
| 326 |
"optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=3e-5)\n",
|
| 327 |
"model.compile(\n",
|
|
@@ -331,7 +331,7 @@
|
|
| 331 |
")\n",
|
| 332 |
"\n",
|
| 333 |
"# TRAINING\n",
|
| 334 |
-
"print(\"\\
|
| 335 |
"history = model.fit(\n",
|
| 336 |
" x=train_encodings,\n",
|
| 337 |
" y=train_labels,\n",
|
|
@@ -345,7 +345,7 @@
|
|
| 345 |
"plot_cm(model, val_encodings, val_labels, label_encoder.classes_)\n",
|
| 346 |
"\n",
|
| 347 |
"# --- 5. TFLITE CONVERSION ---\n",
|
| 348 |
-
"print(\"\\
|
| 349 |
"@tf.function(input_signature=[tf.TensorSpec([1, MAX_LEN], tf.int32, name=\"input_ids\")])\n",
|
| 350 |
"def serving_fn(input_ids):\n",
|
| 351 |
" return model(input_ids, training=False)\n",
|
|
@@ -359,7 +359,7 @@
|
|
| 359 |
"with Path(TFLITE_OUTPUT).open(\"wb\") as f:\n",
|
| 360 |
" f.write(tflite_model)\n",
|
| 361 |
"\n",
|
| 362 |
-
"print(f\"
|
| 363 |
]
|
| 364 |
},
|
| 365 |
{
|
|
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
"id": "4543f137",
|
| 7 |
"metadata": {},
|
| 8 |
"outputs": [
|
|
|
|
| 273 |
"# --- 3. DATA LOADING ---\n",
|
| 274 |
"def load_data():\n",
|
| 275 |
" texts, labels = [], []\n",
|
| 276 |
+
" print(f\"Loading data from: {DATA_ROOT}\")\n",
|
| 277 |
" if not DATA_ROOT.exists():\n",
|
| 278 |
+
" print(\"ERROR: Data folder not found!\")\n",
|
| 279 |
" return [], []\n",
|
| 280 |
" \n",
|
| 281 |
" for text_file in DATA_ROOT.rglob(\"*.txt\"):\n",
|
|
|
|
| 298 |
"filtered_texts = [t for t, l in zip(texts, labels) if l in valid_classes]\n",
|
| 299 |
"filtered_labels = [l for t, l in zip(texts, labels) if l in valid_classes]\n",
|
| 300 |
"\n",
|
| 301 |
+
"print(f\"Loaded {len(filtered_texts)} documents across {len(valid_classes)} categories.\")\n",
|
| 302 |
"\n",
|
| 303 |
"# Label Encoding\n",
|
| 304 |
"label_encoder = LabelEncoder()\n",
|
|
|
|
| 315 |
")\n",
|
| 316 |
"\n",
|
| 317 |
"# Tokenization\n",
|
| 318 |
+
"print(\"Tokenizing data...\")\n",
|
| 319 |
"tokenizer = DistilBertTokenizer.from_pretrained(MODEL_ID)\n",
|
| 320 |
"train_encodings = dict(tokenizer(train_texts, padding=\"max_length\", truncation=True, max_length=MAX_LEN, return_tensors=\"tf\"))\n",
|
| 321 |
"val_encodings = dict(tokenizer(val_texts, padding=\"max_length\", truncation=True, max_length=MAX_LEN, return_tensors=\"tf\"))\n",
|
| 322 |
"\n",
|
| 323 |
"# Model Initialization\n",
|
| 324 |
+
"print(\"Initializing DistilBERT...\")\n",
|
| 325 |
"model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_ID, num_labels=num_labels, from_pt=True)\n",
|
| 326 |
"optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=3e-5)\n",
|
| 327 |
"model.compile(\n",
|
|
|
|
| 331 |
")\n",
|
| 332 |
"\n",
|
| 333 |
"# TRAINING\n",
|
| 334 |
+
"print(\"\\nStarting Training...\")\n",
|
| 335 |
"history = model.fit(\n",
|
| 336 |
" x=train_encodings,\n",
|
| 337 |
" y=train_labels,\n",
|
|
|
|
| 345 |
"plot_cm(model, val_encodings, val_labels, label_encoder.classes_)\n",
|
| 346 |
"\n",
|
| 347 |
"# --- 5. TFLITE CONVERSION ---\n",
|
| 348 |
+
"print(\"\\nConverting to TFLite (Flutter compatibility mode)...\")\n",
|
| 349 |
"@tf.function(input_signature=[tf.TensorSpec([1, MAX_LEN], tf.int32, name=\"input_ids\")])\n",
|
| 350 |
"def serving_fn(input_ids):\n",
|
| 351 |
" return model(input_ids, training=False)\n",
|
|
|
|
| 359 |
"with Path(TFLITE_OUTPUT).open(\"wb\") as f:\n",
|
| 360 |
" f.write(tflite_model)\n",
|
| 361 |
"\n",
|
| 362 |
+
"print(f\"SUCCESS! Model saved as {TFLITE_OUTPUT}\")"
|
| 363 |
]
|
| 364 |
},
|
| 365 |
{
|