Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -14,6 +14,37 @@ nltk.download('punkt')
|
|
| 14 |
nltk.download('averaged_perceptron_tagger')
|
| 15 |
nltk.download('stopwords')
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
# Connect to the PostgreSQL database
|
| 18 |
|
| 19 |
# Read the data from the PostgreSQL table
|
|
@@ -65,37 +96,6 @@ print('journal_main processed')
|
|
| 65 |
# Journal Dataframe
|
| 66 |
|
| 67 |
|
| 68 |
-
# --- Text-preprocessing helpers (this is the side removed by the diff) ---

# English stopword set consumed by get_clean_text below.
# NOTE(review): requires nltk.download('stopwords') to have run first.
stop_words = set(stopwords.words('english'))


def get_paragraph(row, index):
    # Join the iterable of strings at row[index] into one lower-cased,
    # space-separated paragraph. Result carries a leading space.
    ans = ''
    for x in row[index]:
        ans = ans + ' ' + x.lower()
    return ans


def remove_accents(text):
    # Strip accents/diacritics: NFKD-decompose, then drop every non-ASCII byte.
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')
    return text


def get_clean_text(row, index):
    # Tokenise the text at row[index]; keep tokens that are purely alphabetic,
    # longer than one character, and not stopwords. Non-string cells and the
    # literal string "NULL" yield ''. Result carries a leading space.
    if not isinstance(row[index], str):
        return ''
    if row[index] == "NULL":
        return ''
    clean_text = ''
    words = word_tokenize(row[index].lower())
    for word in words:
        word = word.replace(',', ' ')
        word = remove_accents(word)
        # NOTE(review): `word[1] != '.'` is unreachable here — the regex
        # already restricts word to letters only.
        if re.match(r'^[a-zA-Z]+$', word) and word not in stop_words and len(word) > 1 and word[1] != '.':
            clean_text += ' ' + word
    return clean_text


def combine(row, indices):
    # Concatenate row[i] for each i in indices, each prefixed with a space.
    ans = ''
    for i in indices:
        ans = ans + ' ' + row[i]
    return ans
|
| 99 |
@st.cache_data
|
| 100 |
def get_tfidfs(journal_main):
|
| 101 |
vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
|
|
|
|
| 14 |
nltk.download('averaged_perceptron_tagger')
|
| 15 |
nltk.download('stopwords')
|
| 16 |
|
| 17 |
+
stop_words = set(stopwords.words('english'))
|
| 18 |
+
|
| 19 |
+
def get_paragraph(row, index):
    """Return the strings stored at ``row[index]`` lower-cased and joined by
    single spaces.

    The result carries a leading space (each element is prefixed rather than
    separator-joined), matching how callers concatenate the pieces.
    """
    return ''.join(' ' + part.lower() for part in row[index])
|
| 24 |
+
|
| 25 |
+
def remove_accents(text):
    """Strip accents/diacritics from *text*.

    NFKD-normalisation splits accented characters into base letter plus
    combining mark; encoding to ASCII with ``'ignore'`` then drops the marks
    (and any other non-ASCII character).
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ASCII', 'ignore').decode('utf-8')
|
| 28 |
+
|
| 29 |
+
def get_clean_text(row, index):
    """Tokenise the text at ``row[index]`` and return the kept tokens joined
    by spaces (with a leading space, matching the original concatenation).

    A token is kept when, after comma replacement and accent removal, it is
    purely alphabetic, at least two characters long, and not an English
    stopword. Non-string cells and the literal string "NULL" yield ''.
    """
    # Guard clauses: database NULLs arrive either as non-str or as the
    # literal string "NULL" — both mean "no text".
    if not isinstance(row[index], str) or row[index] == "NULL":
        return ''
    kept = []
    for word in word_tokenize(row[index].lower()):
        word = remove_accents(word.replace(',', ' '))
        # FIX: the original also tested `word[1] != '.'`, but the fully-
        # alphabetic regex below makes that impossible — the dead check is
        # dropped (behavior unchanged).
        if re.match(r'^[a-zA-Z]+$', word) and word not in stop_words and len(word) > 1:
            kept.append(word)
    # join-based build instead of quadratic `+=` concatenation.
    return ''.join(' ' + w for w in kept)
|
| 42 |
+
|
| 43 |
+
def combine(row, indices):
    """Concatenate ``row[i]`` for each index in *indices*.

    Every value is prefixed with a single space, so a non-empty result always
    starts with a space — exactly what downstream text assembly expects.
    """
    return ''.join(' ' + row[i] for i in indices)
|
| 48 |
# Connect to the PostgreSQL database
|
| 49 |
|
| 50 |
# Read the data from the PostgreSQL table
|
|
|
|
| 96 |
# Journal Dataframe
|
| 97 |
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
@st.cache_data
|
| 100 |
def get_tfidfs(journal_main):
|
| 101 |
vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
|