Spaces:

Az-r-ow
/

TravelNER

Build error

App Files Files Community

Az-r-ow committed on Oct 31, 2024

Commit

5783f3e

1 Parent(s): ae14666

WIP(sentence_processing): notebook to show the difference between processing techniques

Browse files

Files changed (3) hide show

conv_tagged_file_to_bio.py +2 -2
hmm_ner.ipynb +13 -19
sentence_processing.ipynb +82 -0

conv_tagged_file_to_bio.py CHANGED Viewed

@@ -1,8 +1,8 @@
 from app.travel_resolver.libs.nlp.data_processing import from_tagged_file_to_bio_file
-INPUT_FILE = "./data/scripting_lcs_1/sentences_10k.txt"
-OUTPUT_FILE = "./data/bio/fr.bio/fr.sentences.bio"
 tag_entities_pairs = [("<Dep>", "LOC-DEP"), ("<Arr>", "LOC-ARR")]

 from app.travel_resolver.libs.nlp.data_processing import from_tagged_file_to_bio_file
+INPUT_FILE = "./data/scripting_lcs_1/10k_samples.txt"
+OUTPUT_FILE = "./data/bio/fr.bio/10k_samples.bio"
 tag_entities_pairs = [("<Dep>", "LOC-DEP"), ("<Arr>", "LOC-ARR")]

hmm_ner.ipynb CHANGED Viewed

@@ -606,19 +606,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "j: O\n",
-      "e: O\n",
-      " : O\n",
-      "v: LOC-DEP\n",
-      "o: LOC-DEP\n",
-      "u: O\n"
      ]
     }
    ],
@@ -627,7 +627,7 @@
     "\n",
     "vocab = list(em_prob_dict.keys())\n",
     "\n",
-    "test_sentence = \"Je voudrais voyager de Montpellier à Paris.\"\n",
     "\n",
     "test_sentence = process_sentence(test_sentence, stemming=True)\n",
     "\n",
@@ -637,13 +637,13 @@
     "\n",
     "predicted_labels = hmm.predict(test_sentence_encoded)\n",
     "\n",
-    "for word, label in zip(test_sentence, predicted_labels):\n",
     "    print(f\"{word}: {list(unique_labels.keys())[label]}\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -663,7 +663,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
@@ -671,25 +671,19 @@
      "output_type": "stream",
      "text": [
       "Accuracy: 0.7044728434504792\n",
-      "Precision: 0.8437797024042935\n",
-      "Recall: 0.7044728434504792\n",
       "F1 Score: 0.7390763122386325\n"
      ]
     }
    ],
    "source": [
-    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
     "\n",
-    "y_test = [item for sublist in y_test for item in sublist]\n",
     "\n",
     "accuracy_score = accuracy_score(y_test, predicted_labels_test)\n",
-    "precision_score = precision_score(y_test, predicted_labels_test, average=\"weighted\")\n",
-    "recall_score = recall_score(y_test, predicted_labels_test, average=\"weighted\")\n",
     "f1_score = f1_score(y_test, predicted_labels_test, average=\"weighted\")\n",
     "\n",
     "print(f\"Accuracy: {accuracy_score}\")\n",
-    "print(f\"Precision: {precision_score}\")\n",
-    "print(f\"Recall: {recall_score}\")\n",
     "print(f\"F1 Score: {f1_score}\")"
    ]
   }

   },
   {
    "cell_type": "code",
+   "execution_count": 31,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "je: O\n",
+      "veux: O\n",
+      "part: O\n",
+      "montpelli: LOC-DEP\n",
+      "ver: LOC-DEP\n",
+      "paris: O\n"
      ]
     }
    ],
     "\n",
     "vocab = list(em_prob_dict.keys())\n",
     "\n",
+    "test_sentence = \"Je veux partir de montpellier vers Paris\"\n",
     "\n",
     "test_sentence = process_sentence(test_sentence, stemming=True)\n",
     "\n",
     "\n",
     "predicted_labels = hmm.predict(test_sentence_encoded)\n",
     "\n",
+    "for word, label in zip(test_sentence.split(\" \"), predicted_labels):\n",
     "    print(f\"{word}: {list(unique_labels.keys())[label]}\")"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 26,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
      "output_type": "stream",
      "text": [
       "Accuracy: 0.7044728434504792\n",
       "F1 Score: 0.7390763122386325\n"
      ]
     }
    ],
    "source": [
+    "from sklearn.metrics import accuracy_score, f1_score\n",
     "\n",
+    "# y_test = [item for sublist in y_test for item in sublist]\n",
     "\n",
     "accuracy_score = accuracy_score(y_test, predicted_labels_test)\n",
     "f1_score = f1_score(y_test, predicted_labels_test, average=\"weighted\")\n",
     "\n",
     "print(f\"Accuracy: {accuracy_score}\")\n",
     "print(f\"F1 Score: {f1_score}\")"
    ]
   }

sentence_processing.ipynb ADDED Viewed

	@@ -0,0 +1,82 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Sentence Processing for NLP\n",
+    "\n",
+    "In this notebook, we will see the importance of sentence processing and the techniques that we used to train the models.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We will use a corpus of `10_000` sentences to demonstrate how the different processing techniques compare.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt_tab to /Users/az-r-\n",
+      "[nltk_data]     ow/nltk_data...\n",
+      "[nltk_data]   Package punkt_tab is already up-to-date!\n"
+     ]
+    }
+   ],
+   "source": [
+    "from app.travel_resolver.libs.nlp.data_processing import from_bio_file_to_examples\n",
+    "\n",
+    "\n",
+    "sentences, labels, vocab, unique_labels = from_bio_file_to_examples(\n",
+    "    \"data/bio/fr.bio/10k_samples.bio\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_word_count(sentences):\n",
+    "    words = dict()\n",
+    "    for sentence in sentences:\n",
+    "        for word in sentence:\n",
+    "            if word in words:\n",
+    "                words[word] += 1\n",
+    "            else:\n",
+    "                words[word] = 1\n",
+    "    return words"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}