pavan-genai committed
Commit d77901d · verified · 1 Parent(s): 90deb22

Update app.py

Files changed (1): app.py +5 -8
app.py CHANGED
@@ -3,10 +3,8 @@ import numpy as np
 import re
 import json
 
-
 import nltk
 from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
 
 from datasets import load_dataset
@@ -36,11 +34,6 @@ def download_nltk_data():
         stopwords.words('english')
     except LookupError:
         nltk.download('stopwords')
-    # try:
-    #     word_tokenize("test")
-    # except LookupError:
-    #     nltk.download('punkt', force=True)
-
     try:
         WordNetLemmatizer().lemmatize("test")
     except LookupError:
@@ -53,6 +46,10 @@ def clean_text(text):
     text = re.sub(r'[^\w\s]', '', text)
     return text
 
+def simple_tokenize(text):
+    """Tokenizes text using regex (splits on word boundaries, avoids NLTK punkt)."""
+    return re.findall(r'\b\w+\b', text)
+
 def process_tokens(tokens, stop_words, lemmatizer):
     """Removes stopwords and performs lemmatization on a list of tokens."""
     tokens = [word for word in tokens if word not in stop_words]
@@ -113,7 +110,7 @@ def engineer_features(df):
     df['title'] = df['title'].fillna('No Title')
     df['text'] = df['title'] + ' ' + df['Description']
     df['text'] = df['text'].apply(clean_text)
-    df['tokens'] = df['text'].apply(word_tokenize)
+    df['tokens'] = df['text'].apply(simple_tokenize)
     df['tokens'] = df['tokens'].apply(lambda x: process_tokens(x, stop_words, lemmatizer))
     df['processed_text'] = df['tokens'].apply(lambda x: ' '.join(x))
 
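A quick sketch of the tradeoff behind this change (the example inputs are illustrative, not from the commit): the regex tokenizer needs no NLTK punkt download, but it splits contractions apart and drops punctuation, whereas the removed word_tokenize kept them as separate tokens. In this app the difference is minor, since clean_text strips punctuation before tokenization anyway.

import re

def simple_tokenize(text):
    """Tokenizes text using regex (splits on word boundaries, avoids NLTK punkt)."""
    return re.findall(r'\b\w+\b', text)

print(simple_tokenize("Don't panic, it's fine!"))
# -> ['Don', 't', 'panic', 'it', 's', 'fine']

# The removed tokenizer requires the punkt model and keeps punctuation:
# from nltk.tokenize import word_tokenize
# word_tokenize("Don't panic, it's fine!")
# -> ['Do', "n't", 'panic', ',', 'it', "'s", 'fine', '!']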
 
 
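For reference, a rough end-to-end sketch of the text pipeline after this commit. clean_text, simple_tokenize, and the stopword line of process_tokens are visible in the diff above; the lemmatization step inside process_tokens is an assumption based on the WordNetLemmatizer import, and the example input is made up.

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Assumes the 'stopwords' and 'wordnet' NLTK data are already present
# (download_nltk_data() handles this in the app).

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # strip punctuation (from the diff)
    return text

def simple_tokenize(text):
    return re.findall(r'\b\w+\b', text)

def process_tokens(tokens, stop_words, lemmatizer):
    tokens = [word for word in tokens if word not in stop_words]
    return [lemmatizer.lemmatize(word) for word in tokens]  # assumed final step

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

text = clean_text("The cats are sitting on the mats!".lower())
tokens = process_tokens(simple_tokenize(text), stop_words, lemmatizer)
print(tokens)  # -> ['cat', 'sitting', 'mat']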