Update utils.py
Browse files
utils.py
CHANGED
|
@@ -63,12 +63,15 @@ import nltk
|
|
| 63 |
from nltk.corpus import stopwords
|
| 64 |
from nltk.tokenize import word_tokenize
|
| 65 |
from nltk.stem import WordNetLemmatizer
|
| 66 |
-
nltk.download('punkt')
|
| 67 |
|
| 68 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 69 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 70 |
import numpy as np
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
|
| 74 |
################################################
|
|
@@ -109,9 +112,7 @@ def normalise_prompt (prompt):
|
|
| 109 |
tokens = [word for word in tokens if word.isalnum()]
|
| 110 |
|
| 111 |
# Stop Word Entfernung
|
| 112 |
-
|
| 113 |
-
stop_words = set(stopwords.words('deutsch'))
|
| 114 |
-
tokens = [word for word in tokens if not word in stop_words]
|
| 115 |
# 5. Lemmatisierung: Worte in Grundform bringen, um Text besser vergleichen zu können
|
| 116 |
nltk.download('wordnet')
|
| 117 |
lemmatizer = WordNetLemmatizer()
|
|
|
|
| 63 |
from nltk.corpus import stopwords
|
| 64 |
from nltk.tokenize import word_tokenize
|
| 65 |
from nltk.stem import WordNetLemmatizer
|
|
|
|
| 66 |
|
| 67 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 68 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 69 |
import numpy as np
|
| 70 |
|
| 71 |
+
#für die Normalisierung
|
| 72 |
+
nltk.download('punkt')
|
| 73 |
+
nltk.download('stopwords')
|
| 74 |
+
german_stopwords = set(stopwords.words('german'))
|
| 75 |
|
| 76 |
|
| 77 |
################################################
|
|
|
|
| 112 |
tokens = [word for word in tokens if word.isalnum()]
|
| 113 |
|
| 114 |
# Stop Word Entfernung
|
| 115 |
+
tokens = [word for word in tokens if not word in german_stopwords]
|
|
|
|
|
|
|
| 116 |
# 5. Lemmatisierung: Worte in Grundform bringen, um Text besser vergleichen zu können
|
| 117 |
nltk.download('wordnet')
|
| 118 |
lemmatizer = WordNetLemmatizer()
|