Sai004 committed on
Commit
9a131ac
·
1 Parent(s): f1dd5d7

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -31
app.py CHANGED
@@ -14,6 +14,37 @@ nltk.download('punkt')
14
  nltk.download('averaged_perceptron_tagger')
15
  nltk.download('stopwords')
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  # Connect to the PostgreSQL database
18
 
19
  # Read the data from the PostgreSQL table
@@ -65,37 +96,6 @@ print('journal_main processed')
65
  # Journal Dataframe
66
 
67
 
68
- stop_words = set(stopwords.words('english'))
69
-
70
- def get_paragraph(row, index):
71
- ans = ''
72
- for x in row[index]:
73
- ans = ans + ' ' + x.lower()
74
- return ans
75
-
76
- def remove_accents(text):
77
- text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')
78
- return text
79
-
80
- def get_clean_text(row, index):
81
- if not isinstance(row[index], str):
82
- return ''
83
- if row[index] == "NULL":
84
- return ''
85
- clean_text = ''
86
- words = word_tokenize(row[index].lower())
87
- for word in words:
88
- word = word.replace(',', ' ')
89
- word = remove_accents(word)
90
- if re.match(r'^[a-zA-Z]+$', word) and word not in stop_words and len(word) > 1 and word[1] != '.':
91
- clean_text += ' ' + word
92
- return clean_text
93
-
94
- def combine(row, indices):
95
- ans = ''
96
- for i in indices:
97
- ans = ans + ' ' + row[i]
98
- return ans
99
  @st.cache_data
100
  def get_tfidfs(journal_main):
101
  vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
 
14
  nltk.download('averaged_perceptron_tagger')
15
  nltk.download('stopwords')
16
 
17
# English stopword set (from NLTK); used by get_clean_text() to drop common words.
stop_words = set(stopwords.words('english'))
18
+
19
def get_paragraph(row, index):
    """Join the items of row[index] into one lowercased string.

    Each item is preceded by a single space, so the result carries a
    leading space when row[index] is non-empty, and is '' otherwise.
    """
    return ''.join(' ' + piece.lower() for piece in row[index])
24
+
25
def remove_accents(text):
    """Return text with accents/diacritics stripped.

    NFKD-decomposes the string so accents become separate combining
    characters, then drops every non-ASCII byte.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('utf-8')
28
+
29
def get_clean_text(row, index):
    """Tokenize and clean the text stored at row[index].

    Returns '' when the value is not a string or is the literal "NULL".
    Otherwise lowercases the text, tokenizes it, strips accents, and keeps
    only purely alphabetic tokens longer than one character that are not
    English stopwords. Each kept token is prefixed with a space, so a
    non-empty result always starts with ' '.
    """
    if not isinstance(row[index], str) or row[index] == "NULL":
        return ''
    kept = []
    for word in word_tokenize(row[index].lower()):
        word = remove_accents(word.replace(',', ' '))
        # NOTE: the original also tested `word[1] != '.'`, but a token that
        # matches ^[a-zA-Z]+$ cannot contain '.', so that check was dead
        # code and has been removed.
        if len(word) > 1 and word not in stop_words and re.match(r'^[a-zA-Z]+$', word):
            kept.append(' ' + word)
    return ''.join(kept)
42
+
43
def combine(row, indices):
    """Concatenate the values of row at the given indices.

    Every value is preceded by a single space, so the result has a
    leading space when indices is non-empty, and is '' otherwise.
    """
    return ''.join(' ' + row[key] for key in indices)
48
  # Connect to the PostgreSQL database
49
 
50
  # Read the data from the PostgreSQL table
 
96
  # Journal Dataframe
97
 
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  @st.cache_data
100
  def get_tfidfs(journal_main):
101
  vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')