Update app.py
app.py CHANGED
@@ -3,10 +3,8 @@ import numpy as np
 import re
 import json
 
-
 import nltk
 from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
 
 from datasets import load_dataset
@@ -36,11 +34,6 @@ def download_nltk_data():
         stopwords.words('english')
     except LookupError:
         nltk.download('stopwords')
-    # try:
-    #     word_tokenize("test")
-    # except LookupError:
-    #     nltk.download('punkt', force=True)
-
     try:
         WordNetLemmatizer().lemmatize("test")
     except LookupError:
@@ -53,6 +46,10 @@ def clean_text(text):
     text = re.sub(r'[^\w\s]', '', text)
     return text
 
+def simple_tokenize(text):
+    """Tokenizes text using regex (splits on word boundaries, avoids NLTK punkt)."""
+    return re.findall(r'\b\w+\b', text)
+
 def process_tokens(tokens, stop_words, lemmatizer):
     """Removes stopwords and performs lemmatization on a list of tokens."""
     tokens = [word for word in tokens if word not in stop_words]
@@ -113,7 +110,7 @@ def engineer_features(df):
     df['title'] = df['title'].fillna('No Title')
     df['text'] = df['title'] + ' ' + df['Description']
     df['text'] = df['text'].apply(clean_text)
-    df['tokens'] = df['text'].apply(word_tokenize)
+    df['tokens'] = df['text'].apply(simple_tokenize)
     df['tokens'] = df['tokens'].apply(lambda x: process_tokens(x, stop_words, lemmatizer))
     df['processed_text'] = df['tokens'].apply(lambda x: ' '.join(x))
 
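
For reference, the new regex tokenizer can be sanity-checked on its own. The snippet below is an illustrative sketch: simple_tokenize is copied from the diff, while the sample sentence and printed result are invented for demonstration (in app.py the input would already have been lowercased and stripped of punctuation by clean_text).

import re

def simple_tokenize(text):
    """Tokenizes text using regex (splits on word boundaries, avoids NLTK punkt)."""
    return re.findall(r'\b\w+\b', text)

# Illustrative input resembling a cleaned news headline (hypothetical example).
sample = "wall st bears claw back into the black reuters"
print(simple_tokenize(sample))
# ['wall', 'st', 'bears', 'claw', 'back', 'into', 'the', 'black', 'reuters']

Because the regex split needs no downloaded models, it sidesteps the NLTK punkt lookup that the removed nltk.download('punkt', force=True) code was working around.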