pemix09 committed on
Commit
eb547cb
·
verified ·
1 Parent(s): ed0f15d

Upload folder using huggingface_hub

Browse files
learn_with_history_visualisation.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 4,
6
  "id": "4543f137",
7
  "metadata": {},
8
  "outputs": [
@@ -273,9 +273,9 @@
273
  "# --- 3. DATA LOADING ---\n",
274
  "def load_data():\n",
275
  " texts, labels = [], []\n",
276
- " print(f\"📂 Loading data from: {DATA_ROOT}\")\n",
277
  " if not DATA_ROOT.exists():\n",
278
- " print(\"ERROR: Data folder not found!\")\n",
279
  " return [], []\n",
280
  " \n",
281
  " for text_file in DATA_ROOT.rglob(\"*.txt\"):\n",
@@ -298,7 +298,7 @@
298
  "filtered_texts = [t for t, l in zip(texts, labels) if l in valid_classes]\n",
299
  "filtered_labels = [l for t, l in zip(texts, labels) if l in valid_classes]\n",
300
  "\n",
301
- "print(f\"Loaded {len(filtered_texts)} documents across {len(valid_classes)} categories.\")\n",
302
  "\n",
303
  "# Label Encoding\n",
304
  "label_encoder = LabelEncoder()\n",
@@ -315,13 +315,13 @@
315
  ")\n",
316
  "\n",
317
  "# Tokenization\n",
318
- "print(\"Tokenizing data...\")\n",
319
  "tokenizer = DistilBertTokenizer.from_pretrained(MODEL_ID)\n",
320
  "train_encodings = dict(tokenizer(train_texts, padding=\"max_length\", truncation=True, max_length=MAX_LEN, return_tensors=\"tf\"))\n",
321
  "val_encodings = dict(tokenizer(val_texts, padding=\"max_length\", truncation=True, max_length=MAX_LEN, return_tensors=\"tf\"))\n",
322
  "\n",
323
  "# Model Initialization\n",
324
- "print(\"🏗️ Initializing DistilBERT...\")\n",
325
  "model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_ID, num_labels=num_labels, from_pt=True)\n",
326
  "optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=3e-5)\n",
327
  "model.compile(\n",
@@ -331,7 +331,7 @@
331
  ")\n",
332
  "\n",
333
  "# TRAINING\n",
334
- "print(\"\\n🚀 Starting Training...\")\n",
335
  "history = model.fit(\n",
336
  " x=train_encodings,\n",
337
  " y=train_labels,\n",
@@ -345,7 +345,7 @@
345
  "plot_cm(model, val_encodings, val_labels, label_encoder.classes_)\n",
346
  "\n",
347
  "# --- 5. TFLITE CONVERSION ---\n",
348
- "print(\"\\n🔧 Converting to TFLite (Flutter compatibility mode)...\")\n",
349
  "@tf.function(input_signature=[tf.TensorSpec([1, MAX_LEN], tf.int32, name=\"input_ids\")])\n",
350
  "def serving_fn(input_ids):\n",
351
  " return model(input_ids, training=False)\n",
@@ -359,7 +359,7 @@
359
  "with Path(TFLITE_OUTPUT).open(\"wb\") as f:\n",
360
  " f.write(tflite_model)\n",
361
  "\n",
362
- "print(f\"SUCCESS! Model saved as {TFLITE_OUTPUT}\")"
363
  ]
364
  },
365
  {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": null,
6
  "id": "4543f137",
7
  "metadata": {},
8
  "outputs": [
 
273
  "# --- 3. DATA LOADING ---\n",
274
  "def load_data():\n",
275
  " texts, labels = [], []\n",
276
+ " print(f\"Loading data from: {DATA_ROOT}\")\n",
277
  " if not DATA_ROOT.exists():\n",
278
+ " print(\"ERROR: Data folder not found!\")\n",
279
  " return [], []\n",
280
  " \n",
281
  " for text_file in DATA_ROOT.rglob(\"*.txt\"):\n",
 
298
  "filtered_texts = [t for t, l in zip(texts, labels) if l in valid_classes]\n",
299
  "filtered_labels = [l for t, l in zip(texts, labels) if l in valid_classes]\n",
300
  "\n",
301
+ "print(f\"Loaded {len(filtered_texts)} documents across {len(valid_classes)} categories.\")\n",
302
  "\n",
303
  "# Label Encoding\n",
304
  "label_encoder = LabelEncoder()\n",
 
315
  ")\n",
316
  "\n",
317
  "# Tokenization\n",
318
+ "print(\"Tokenizing data...\")\n",
319
  "tokenizer = DistilBertTokenizer.from_pretrained(MODEL_ID)\n",
320
  "train_encodings = dict(tokenizer(train_texts, padding=\"max_length\", truncation=True, max_length=MAX_LEN, return_tensors=\"tf\"))\n",
321
  "val_encodings = dict(tokenizer(val_texts, padding=\"max_length\", truncation=True, max_length=MAX_LEN, return_tensors=\"tf\"))\n",
322
  "\n",
323
  "# Model Initialization\n",
324
+ "print(\"Initializing DistilBERT...\")\n",
325
  "model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_ID, num_labels=num_labels, from_pt=True)\n",
326
  "optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=3e-5)\n",
327
  "model.compile(\n",
 
331
  ")\n",
332
  "\n",
333
  "# TRAINING\n",
334
+ "print(\"\\nStarting Training...\")\n",
335
  "history = model.fit(\n",
336
  " x=train_encodings,\n",
337
  " y=train_labels,\n",
 
345
  "plot_cm(model, val_encodings, val_labels, label_encoder.classes_)\n",
346
  "\n",
347
  "# --- 5. TFLITE CONVERSION ---\n",
348
+ "print(\"\\nConverting to TFLite (Flutter compatibility mode)...\")\n",
349
  "@tf.function(input_signature=[tf.TensorSpec([1, MAX_LEN], tf.int32, name=\"input_ids\")])\n",
350
  "def serving_fn(input_ids):\n",
351
  " return model(input_ids, training=False)\n",
 
359
  "with Path(TFLITE_OUTPUT).open(\"wb\") as f:\n",
360
  " f.write(tflite_model)\n",
361
  "\n",
362
+ "print(f\"SUCCESS! Model saved as {TFLITE_OUTPUT}\")"
363
  ]
364
  },
365
  {