Az-r-ow committed on
Commit ·
5783f3e
1
Parent(s): ae14666
WIP(sentence_processing): notebook to show the difference between processing techniques
Browse files
- conv_tagged_file_to_bio.py +2 -2
- hmm_ner.ipynb +13 -19
- sentence_processing.ipynb +82 -0
conv_tagged_file_to_bio.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
from app.travel_resolver.libs.nlp.data_processing import from_tagged_file_to_bio_file
|
| 2 |
|
| 3 |
|
| 4 |
-
INPUT_FILE = "./data/scripting_lcs_1/
|
| 5 |
-
OUTPUT_FILE = "./data/bio/fr.bio/
|
| 6 |
|
| 7 |
tag_entities_pairs = [("<Dep>", "LOC-DEP"), ("<Arr>", "LOC-ARR")]
|
| 8 |
|
|
|
|
| 1 |
from app.travel_resolver.libs.nlp.data_processing import from_tagged_file_to_bio_file
|
| 2 |
|
| 3 |
|
| 4 |
+
INPUT_FILE = "./data/scripting_lcs_1/10k_samples.txt"
|
| 5 |
+
OUTPUT_FILE = "./data/bio/fr.bio/10k_samples.bio"
|
| 6 |
|
| 7 |
tag_entities_pairs = [("<Dep>", "LOC-DEP"), ("<Arr>", "LOC-ARR")]
|
| 8 |
|
hmm_ner.ipynb
CHANGED
|
@@ -606,19 +606,19 @@
|
|
| 606 |
},
|
| 607 |
{
|
| 608 |
"cell_type": "code",
|
| 609 |
-
"execution_count":
|
| 610 |
"metadata": {},
|
| 611 |
"outputs": [
|
| 612 |
{
|
| 613 |
"name": "stdout",
|
| 614 |
"output_type": "stream",
|
| 615 |
"text": [
|
| 616 |
-
"
|
| 617 |
-
"
|
| 618 |
-
"
|
| 619 |
-
"
|
| 620 |
-
"
|
| 621 |
-
"
|
| 622 |
]
|
| 623 |
}
|
| 624 |
],
|
|
@@ -627,7 +627,7 @@
|
|
| 627 |
"\n",
|
| 628 |
"vocab = list(em_prob_dict.keys())\n",
|
| 629 |
"\n",
|
| 630 |
-
"test_sentence = \"Je
|
| 631 |
"\n",
|
| 632 |
"test_sentence = process_sentence(test_sentence, stemming=True)\n",
|
| 633 |
"\n",
|
|
@@ -637,13 +637,13 @@
|
|
| 637 |
"\n",
|
| 638 |
"predicted_labels = hmm.predict(test_sentence_encoded)\n",
|
| 639 |
"\n",
|
| 640 |
-
"for word, label in zip(test_sentence, predicted_labels):\n",
|
| 641 |
" print(f\"{word}: {list(unique_labels.keys())[label]}\")"
|
| 642 |
]
|
| 643 |
},
|
| 644 |
{
|
| 645 |
"cell_type": "code",
|
| 646 |
-
"execution_count":
|
| 647 |
"metadata": {},
|
| 648 |
"outputs": [],
|
| 649 |
"source": [
|
|
@@ -663,7 +663,7 @@
|
|
| 663 |
},
|
| 664 |
{
|
| 665 |
"cell_type": "code",
|
| 666 |
-
"execution_count":
|
| 667 |
"metadata": {},
|
| 668 |
"outputs": [
|
| 669 |
{
|
|
@@ -671,25 +671,19 @@
|
|
| 671 |
"output_type": "stream",
|
| 672 |
"text": [
|
| 673 |
"Accuracy: 0.7044728434504792\n",
|
| 674 |
-
"Precision: 0.8437797024042935\n",
|
| 675 |
-
"Recall: 0.7044728434504792\n",
|
| 676 |
"F1 Score: 0.7390763122386325\n"
|
| 677 |
]
|
| 678 |
}
|
| 679 |
],
|
| 680 |
"source": [
|
| 681 |
-
"from sklearn.metrics import accuracy_score,
|
| 682 |
"\n",
|
| 683 |
-
"y_test = [item for sublist in y_test for item in sublist]\n",
|
| 684 |
"\n",
|
| 685 |
"accuracy_score = accuracy_score(y_test, predicted_labels_test)\n",
|
| 686 |
-
"precision_score = precision_score(y_test, predicted_labels_test, average=\"weighted\")\n",
|
| 687 |
-
"recall_score = recall_score(y_test, predicted_labels_test, average=\"weighted\")\n",
|
| 688 |
"f1_score = f1_score(y_test, predicted_labels_test, average=\"weighted\")\n",
|
| 689 |
"\n",
|
| 690 |
"print(f\"Accuracy: {accuracy_score}\")\n",
|
| 691 |
-
"print(f\"Precision: {precision_score}\")\n",
|
| 692 |
-
"print(f\"Recall: {recall_score}\")\n",
|
| 693 |
"print(f\"F1 Score: {f1_score}\")"
|
| 694 |
]
|
| 695 |
}
|
|
|
|
| 606 |
},
|
| 607 |
{
|
| 608 |
"cell_type": "code",
|
| 609 |
+
"execution_count": 31,
|
| 610 |
"metadata": {},
|
| 611 |
"outputs": [
|
| 612 |
{
|
| 613 |
"name": "stdout",
|
| 614 |
"output_type": "stream",
|
| 615 |
"text": [
|
| 616 |
+
"je: O\n",
|
| 617 |
+
"veux: O\n",
|
| 618 |
+
"part: O\n",
|
| 619 |
+
"montpelli: LOC-DEP\n",
|
| 620 |
+
"ver: LOC-DEP\n",
|
| 621 |
+
"paris: O\n"
|
| 622 |
]
|
| 623 |
}
|
| 624 |
],
|
|
|
|
| 627 |
"\n",
|
| 628 |
"vocab = list(em_prob_dict.keys())\n",
|
| 629 |
"\n",
|
| 630 |
+
"test_sentence = \"Je veux partir de montpellier vers Paris\"\n",
|
| 631 |
"\n",
|
| 632 |
"test_sentence = process_sentence(test_sentence, stemming=True)\n",
|
| 633 |
"\n",
|
|
|
|
| 637 |
"\n",
|
| 638 |
"predicted_labels = hmm.predict(test_sentence_encoded)\n",
|
| 639 |
"\n",
|
| 640 |
+
"for word, label in zip(test_sentence.split(\" \"), predicted_labels):\n",
|
| 641 |
" print(f\"{word}: {list(unique_labels.keys())[label]}\")"
|
| 642 |
]
|
| 643 |
},
|
| 644 |
{
|
| 645 |
"cell_type": "code",
|
| 646 |
+
"execution_count": 26,
|
| 647 |
"metadata": {},
|
| 648 |
"outputs": [],
|
| 649 |
"source": [
|
|
|
|
| 663 |
},
|
| 664 |
{
|
| 665 |
"cell_type": "code",
|
| 666 |
+
"execution_count": null,
|
| 667 |
"metadata": {},
|
| 668 |
"outputs": [
|
| 669 |
{
|
|
|
|
| 671 |
"output_type": "stream",
|
| 672 |
"text": [
|
| 673 |
"Accuracy: 0.7044728434504792\n",
|
|
|
|
|
|
|
| 674 |
"F1 Score: 0.7390763122386325\n"
|
| 675 |
]
|
| 676 |
}
|
| 677 |
],
|
| 678 |
"source": [
|
| 679 |
+
"from sklearn.metrics import accuracy_score, f1_score\n",
|
| 680 |
"\n",
|
| 681 |
+
"# y_test = [item for sublist in y_test for item in sublist]\n",
|
| 682 |
"\n",
|
| 683 |
"accuracy_score = accuracy_score(y_test, predicted_labels_test)\n",
|
|
|
|
|
|
|
| 684 |
"f1_score = f1_score(y_test, predicted_labels_test, average=\"weighted\")\n",
|
| 685 |
"\n",
|
| 686 |
"print(f\"Accuracy: {accuracy_score}\")\n",
|
|
|
|
|
|
|
| 687 |
"print(f\"F1 Score: {f1_score}\")"
|
| 688 |
]
|
| 689 |
}
|
sentence_processing.ipynb
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# Sentence Processing for NLP\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"In this notebook, we will see the importance of sentence processing and the techniques that we used to train the models.\n"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"cell_type": "markdown",
|
| 14 |
+
"metadata": {},
|
| 15 |
+
"source": [
|
| 16 |
+
"We will use a corpus of `10_000` sentences to demonstrate the difference between the different techniques.\n"
|
| 17 |
+
]
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"cell_type": "code",
|
| 21 |
+
"execution_count": 1,
|
| 22 |
+
"metadata": {},
|
| 23 |
+
"outputs": [
|
| 24 |
+
{
|
| 25 |
+
"name": "stderr",
|
| 26 |
+
"output_type": "stream",
|
| 27 |
+
"text": [
|
| 28 |
+
"[nltk_data] Downloading package punkt_tab to /Users/az-r-\n",
|
| 29 |
+
"[nltk_data] ow/nltk_data...\n",
|
| 30 |
+
"[nltk_data] Package punkt_tab is already up-to-date!\n"
|
| 31 |
+
]
|
| 32 |
+
}
|
| 33 |
+
],
|
| 34 |
+
"source": [
|
| 35 |
+
"from app.travel_resolver.libs.nlp.data_processing import from_bio_file_to_examples\n",
|
| 36 |
+
"\n",
|
| 37 |
+
"\n",
|
| 38 |
+
"sentences, labels, vocab, unique_labels = from_bio_file_to_examples(\n",
|
| 39 |
+
" \"data/bio/fr.bio/10k_samples.bio\"\n",
|
| 40 |
+
")"
|
| 41 |
+
]
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"cell_type": "code",
|
| 45 |
+
"execution_count": null,
|
| 46 |
+
"metadata": {},
|
| 47 |
+
"outputs": [],
|
| 48 |
+
"source": [
|
| 49 |
+
"def get_word_count(sentences):\n",
|
| 50 |
+
" words = dict()\n",
|
| 51 |
+
" for sentence in sentences:\n",
|
| 52 |
+
" for word in sentence:\n",
|
| 53 |
+
" if word in words:\n",
|
| 54 |
+
" words[word] += 1\n",
|
| 55 |
+
" else:\n",
|
| 56 |
+
" words[word] = 1\n",
|
| 57 |
+
" return words"
|
| 58 |
+
]
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"metadata": {
|
| 62 |
+
"kernelspec": {
|
| 63 |
+
"display_name": "venv",
|
| 64 |
+
"language": "python",
|
| 65 |
+
"name": "python3"
|
| 66 |
+
},
|
| 67 |
+
"language_info": {
|
| 68 |
+
"codemirror_mode": {
|
| 69 |
+
"name": "ipython",
|
| 70 |
+
"version": 3
|
| 71 |
+
},
|
| 72 |
+
"file_extension": ".py",
|
| 73 |
+
"mimetype": "text/x-python",
|
| 74 |
+
"name": "python",
|
| 75 |
+
"nbconvert_exporter": "python",
|
| 76 |
+
"pygments_lexer": "ipython3",
|
| 77 |
+
"version": "3.12.4"
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"nbformat": 4,
|
| 81 |
+
"nbformat_minor": 2
|
| 82 |
+
}
|