Az-r-ow committed on
Commit
5783f3e
·
1 Parent(s): ae14666

WIP(sentence_processing): notebook to show the difference between processing techniques

Browse files
conv_tagged_file_to_bio.py CHANGED
@@ -1,8 +1,8 @@
1
  from app.travel_resolver.libs.nlp.data_processing import from_tagged_file_to_bio_file
2
 
3
 
4
- INPUT_FILE = "./data/scripting_lcs_1/sentences_10k.txt"
5
- OUTPUT_FILE = "./data/bio/fr.bio/fr.sentences.bio"
6
 
7
  tag_entities_pairs = [("<Dep>", "LOC-DEP"), ("<Arr>", "LOC-ARR")]
8
 
 
1
  from app.travel_resolver.libs.nlp.data_processing import from_tagged_file_to_bio_file
2
 
3
 
4
+ INPUT_FILE = "./data/scripting_lcs_1/10k_samples.txt"
5
+ OUTPUT_FILE = "./data/bio/fr.bio/10k_samples.bio"
6
 
7
  tag_entities_pairs = [("<Dep>", "LOC-DEP"), ("<Arr>", "LOC-ARR")]
8
 
hmm_ner.ipynb CHANGED
@@ -606,19 +606,19 @@
606
  },
607
  {
608
  "cell_type": "code",
609
- "execution_count": 18,
610
  "metadata": {},
611
  "outputs": [
612
  {
613
  "name": "stdout",
614
  "output_type": "stream",
615
  "text": [
616
- "j: O\n",
617
- "e: O\n",
618
- " : O\n",
619
- "v: LOC-DEP\n",
620
- "o: LOC-DEP\n",
621
- "u: O\n"
622
  ]
623
  }
624
  ],
@@ -627,7 +627,7 @@
627
  "\n",
628
  "vocab = list(em_prob_dict.keys())\n",
629
  "\n",
630
- "test_sentence = \"Je voudrais voyager de Montpellier à Paris.\"\n",
631
  "\n",
632
  "test_sentence = process_sentence(test_sentence, stemming=True)\n",
633
  "\n",
@@ -637,13 +637,13 @@
637
  "\n",
638
  "predicted_labels = hmm.predict(test_sentence_encoded)\n",
639
  "\n",
640
- "for word, label in zip(test_sentence, predicted_labels):\n",
641
  " print(f\"{word}: {list(unique_labels.keys())[label]}\")"
642
  ]
643
  },
644
  {
645
  "cell_type": "code",
646
- "execution_count": 19,
647
  "metadata": {},
648
  "outputs": [],
649
  "source": [
@@ -663,7 +663,7 @@
663
  },
664
  {
665
  "cell_type": "code",
666
- "execution_count": 20,
667
  "metadata": {},
668
  "outputs": [
669
  {
@@ -671,25 +671,19 @@
671
  "output_type": "stream",
672
  "text": [
673
  "Accuracy: 0.7044728434504792\n",
674
- "Precision: 0.8437797024042935\n",
675
- "Recall: 0.7044728434504792\n",
676
  "F1 Score: 0.7390763122386325\n"
677
  ]
678
  }
679
  ],
680
  "source": [
681
- "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
682
  "\n",
683
- "y_test = [item for sublist in y_test for item in sublist]\n",
684
  "\n",
685
  "accuracy_score = accuracy_score(y_test, predicted_labels_test)\n",
686
- "precision_score = precision_score(y_test, predicted_labels_test, average=\"weighted\")\n",
687
- "recall_score = recall_score(y_test, predicted_labels_test, average=\"weighted\")\n",
688
  "f1_score = f1_score(y_test, predicted_labels_test, average=\"weighted\")\n",
689
  "\n",
690
  "print(f\"Accuracy: {accuracy_score}\")\n",
691
- "print(f\"Precision: {precision_score}\")\n",
692
- "print(f\"Recall: {recall_score}\")\n",
693
  "print(f\"F1 Score: {f1_score}\")"
694
  ]
695
  }
 
606
  },
607
  {
608
  "cell_type": "code",
609
+ "execution_count": 31,
610
  "metadata": {},
611
  "outputs": [
612
  {
613
  "name": "stdout",
614
  "output_type": "stream",
615
  "text": [
616
+ "je: O\n",
617
+ "veux: O\n",
618
+ "part: O\n",
619
+ "montpelli: LOC-DEP\n",
620
+ "ver: LOC-DEP\n",
621
+ "paris: O\n"
622
  ]
623
  }
624
  ],
 
627
  "\n",
628
  "vocab = list(em_prob_dict.keys())\n",
629
  "\n",
630
+ "test_sentence = \"Je veux partir de montpellier vers Paris\"\n",
631
  "\n",
632
  "test_sentence = process_sentence(test_sentence, stemming=True)\n",
633
  "\n",
 
637
  "\n",
638
  "predicted_labels = hmm.predict(test_sentence_encoded)\n",
639
  "\n",
640
+ "for word, label in zip(test_sentence.split(\" \"), predicted_labels):\n",
641
  " print(f\"{word}: {list(unique_labels.keys())[label]}\")"
642
  ]
643
  },
644
  {
645
  "cell_type": "code",
646
+ "execution_count": 26,
647
  "metadata": {},
648
  "outputs": [],
649
  "source": [
 
663
  },
664
  {
665
  "cell_type": "code",
666
+ "execution_count": null,
667
  "metadata": {},
668
  "outputs": [
669
  {
 
671
  "output_type": "stream",
672
  "text": [
673
  "Accuracy: 0.7044728434504792\n",
 
 
674
  "F1 Score: 0.7390763122386325\n"
675
  ]
676
  }
677
  ],
678
  "source": [
679
+ "from sklearn.metrics import accuracy_score, f1_score\n",
680
  "\n",
681
+ "# y_test = [item for sublist in y_test for item in sublist]\n",
682
  "\n",
683
  "accuracy_score = accuracy_score(y_test, predicted_labels_test)\n",
 
 
684
  "f1_score = f1_score(y_test, predicted_labels_test, average=\"weighted\")\n",
685
  "\n",
686
  "print(f\"Accuracy: {accuracy_score}\")\n",
 
 
687
  "print(f\"F1 Score: {f1_score}\")"
688
  ]
689
  }
sentence_processing.ipynb ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Sentence Processing for NLP\n",
8
+ "\n",
9
+ "In this notebook, we will see the importance of sentence processing and the techniques that we used to train the models.\n"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "metadata": {},
15
+ "source": [
16
+ "We will use a corpus of `10_000` sentences to demonstrate the difference between the different techniques.\n"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": 1,
22
+ "metadata": {},
23
+ "outputs": [
24
+ {
25
+ "name": "stderr",
26
+ "output_type": "stream",
27
+ "text": [
28
+ "[nltk_data] Downloading package punkt_tab to /Users/az-r-\n",
29
+ "[nltk_data] ow/nltk_data...\n",
30
+ "[nltk_data] Package punkt_tab is already up-to-date!\n"
31
+ ]
32
+ }
33
+ ],
34
+ "source": [
35
+ "from app.travel_resolver.libs.nlp.data_processing import from_bio_file_to_examples\n",
36
+ "\n",
37
+ "\n",
38
+ "sentences, labels, vocab, unique_labels = from_bio_file_to_examples(\n",
39
+ " \"data/bio/fr.bio/10k_samples.bio\"\n",
40
+ ")"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": null,
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "def get_word_count(sentences):\n",
50
+ " words = dict()\n",
51
+ " for sentence in sentences:\n",
52
+ " for word in sentence:\n",
53
+ " if word in words:\n",
54
+ " words[word] += 1\n",
55
+ " else:\n",
56
+ " words[word] = 1\n",
57
+ " return words"
58
+ ]
59
+ }
60
+ ],
61
+ "metadata": {
62
+ "kernelspec": {
63
+ "display_name": "venv",
64
+ "language": "python",
65
+ "name": "python3"
66
+ },
67
+ "language_info": {
68
+ "codemirror_mode": {
69
+ "name": "ipython",
70
+ "version": 3
71
+ },
72
+ "file_extension": ".py",
73
+ "mimetype": "text/x-python",
74
+ "name": "python",
75
+ "nbconvert_exporter": "python",
76
+ "pygments_lexer": "ipython3",
77
+ "version": "3.12.4"
78
+ }
79
+ },
80
+ "nbformat": 4,
81
+ "nbformat_minor": 2
82
+ }