Prageeth-1 commited on
Commit
b5f50fd
·
verified ·
1 Parent(s): 93c21e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -20
app.py CHANGED
@@ -33,22 +33,93 @@ def load_qa_model():
33
  return pipeline("question-answering", model="deepset/roberta-base-squad2")
34
 
35
  # Preprocessing function (same as in Section 01)
36
- def preprocess_text(text):
37
  # Lowercase
38
- text = text.lower()
 
39
  # Remove URLs
40
- text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
41
- # Remove special characters and numbers
42
- text = re.sub(r'[^a-zA-Z\s]', '', text)
43
- # Tokenize
44
- tokens = word_tokenize(text)
45
- # Remove stopwords
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  stop_words = set(stopwords.words('english'))
47
- tokens = [token for token in tokens if token not in stop_words]
48
- # Lemmatization
49
- tokens = [lemmatizer.lemmatize(token) for token in tokens]
50
- # Join tokens back to string
51
- return ' '.join(tokens)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  # Function to generate word cloud
54
  def generate_wordcloud(text, title=None):
@@ -117,23 +188,30 @@ with tab1:
117
 
118
  # Load the fine-tuned news classifier
119
  classifier = pipeline("text-classification", model="Prageeth-1/News_classification.2")
 
 
 
 
120
 
121
  # Classify each article and store the predictions
122
- df["predicted_category"] = df["content"].apply(lambda text: classifier(text)[0]["label"])
123
-
124
- # Preprocess and classify
125
-
126
-
 
127
  # Show results
128
  st.subheader("Classification Results")
129
  st.write(df)
130
 
131
  # Show distribution
132
  st.subheader("Class Distribution")
133
- class_dist = df['predicted_category'].value_counts()
134
  st.bar_chart(class_dist)
135
 
136
-
 
 
137
 
138
  # Download button
139
  st.subheader("Download Results")
 
33
  return pipeline("question-answering", model="deepset/roberta-base-squad2")
34
 
35
  # Preprocessing function (same as in Section 01)
36
+ def preprocess_text():
37
  # Lowercase
38
+ df["cleaned_content"] = df["content"].str.lower()
39
+
40
  # Remove URLs
41
+ def remove_urls(text):
42
+ url_pattern = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
43
+ text = url_pattern.sub(r'', text)
44
+ return text.strip()
45
+
46
+ # applying the function
47
+ df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_urls(text))
48
+
49
+ # Remove Emails
50
+ def remove_emails(text):
51
+ email_pattern = re.compile(r'\S+@\S+')
52
+ return email_pattern.sub(r'', text)
53
+
54
+ # applying the function
55
+ df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_emails(text))
56
+
57
+ # Remove punctuation
58
+ def remove_punctuation(text):
59
+ return "".join([char for char in text if char not in string.punctuation])
60
+
61
+ # applying the function
62
+ df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_punctuation(text)
63
+
64
+ # Get the list of stop words
65
  stop_words = set(stopwords.words('english'))
66
+
67
+ # define the function
68
+ def remove_stopwords(text):
69
+ return " ".join([word for word in str(text).split() if word not in stop_words])
70
+
71
+ # apply the function
72
+ df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_stopwords(text))
73
+
74
+ # define the function
75
+ def remove_special_characters(text):
76
+ return re.sub(r'[^A-Za-z\s]', '', text)
77
+
78
+ # apply the function
79
+ df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_special_characters(text))
80
+
81
+ # Remove frequent words
82
+
83
+ # Get the count of each word in cleaned_text
84
+ word_count = Counter(df["cleaned_content"].str.split(expand=True).stack())
85
+
86
+ # Get a set of common words
87
+ common_words = set([word for (word,count) in word_count.most_common(10)])
88
+
89
+ # define the function
90
+ def remove_common_words(text):
91
+ return " ".join([word for word in str(text).split() if word not in common_words])
92
+
93
+ # apply the function
94
+ df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_common_words(text))
95
+
96
+ # Remove rare words
97
+ # Get a set of rare words
98
+ rare_words = set([word for (word,count) in word_count.most_common()[:-20-1:-1]])
99
+ print(rare_words)
100
+
101
+ # define the function
102
+ def remove_rare_words(text):
103
+ return " ".join([word for word in str(text).split() if word not in rare_words])
104
+
105
+ df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_rare_words(text))
106
+
107
+ df['tokenized_content'] = df['cleaned_content'].apply(lambda text: nltk.word_tokenize(text))
108
+
109
+ # initialize stemmer
110
+ stemmer = PorterStemmer()
111
+
112
+ # Defining the function
113
+ def stem_tokens(tokens):
114
+ stems = [stemmer.stem(token) for token in tokens]
115
+ return stems
116
+
117
+ # apply the function
118
+ df['stemmed_content'] = df['tokenized_content'].apply(lambda text: stem_tokens(text))
119
+
120
+ df["preprocessed_content"] = df["stemmed_content"].apply(lambda text: " ".join(text))
121
+
122
+
123
 
124
  # Function to generate word cloud
125
  def generate_wordcloud(text, title=None):
 
188
 
189
  # Load the fine-tuned news classifier
190
  classifier = pipeline("text-classification", model="Prageeth-1/News_classification.2")
191
+
192
+ # Preprocess
193
+ preprocess_text()
194
+
195
 
196
  # Classify each article and store the predictions
197
+ df["Class"] = df["preprocessed_content"].apply(lambda text: classifier(text)[0]["label"])
198
+
199
+ #Delete Unnecessary columns
200
+ df = df[['content', 'Class']]
201
+
202
+
203
  # Show results
204
  st.subheader("Classification Results")
205
  st.write(df)
206
 
207
  # Show distribution
208
  st.subheader("Class Distribution")
209
+ class_dist = df['Class'].value_counts()
210
  st.bar_chart(class_dist)
211
 
212
+
213
+
214
+
215
 
216
  # Download button
217
  st.subheader("Download Results")