Prageeth-1 committed on
Commit
770c070
·
verified ·
1 Parent(s): 875d133

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -151
app.py CHANGED
@@ -15,7 +15,6 @@ import string
15
  import os
16
  from nltk.stem import PorterStemmer
17
 
18
-
19
  # Download NLTK resources
20
  nltk.download('punkt')
21
  nltk.download('stopwords')
@@ -25,7 +24,6 @@ nltk.download('wordnet')
25
  nltk_data_path = "/home/user/nltk_data"
26
  if not os.path.exists(nltk_data_path):
27
  os.makedirs(nltk_data_path)
28
-
29
  nltk.data.path.append(nltk_data_path)
30
  nltk.download('punkt', download_dir=nltk_data_path)
31
 
@@ -44,8 +42,6 @@ def load_classification_model():
44
  def load_qa_model():
45
  return pipeline("question-answering", model="deepset/roberta-base-squad2")
46
 
47
-
48
-
49
  # Function to generate word cloud
50
  def generate_wordcloud(text, title=None):
51
  wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
@@ -55,7 +51,7 @@ def generate_wordcloud(text, title=None):
55
  plt.title(title, fontsize=20)
56
  st.pyplot(plt)
57
 
58
- # Set page config
59
  st.set_page_config(
60
  page_title="News Analysis Dashboard",
61
  page_icon="📰",
@@ -63,188 +59,166 @@ st.set_page_config(
63
  initial_sidebar_state="expanded"
64
  )
65
 
66
- # Custom CSS
67
  st.markdown("""
68
  <style>
69
- .main {
70
- background-color: #f5f5f5;
 
71
  }
72
- .stButton>button {
73
- background-color: #4CAF50;
 
 
 
 
 
74
  color: white;
75
  }
76
- .stDownloadButton>button {
77
- background-color: #2196F3;
 
 
 
 
 
 
 
 
 
 
78
  color: white;
 
 
 
 
79
  }
 
 
 
 
80
  .stTextInput>div>div>input {
81
  background-color: #ffffff;
82
- color : #FF6347;
 
83
  }
84
-
85
- .header {
86
- display: flex;
87
- align-items: center;
 
88
  margin-bottom: 20px;
89
- background-color: #2196F3;
90
- }
91
- .header img {
92
- height: 50px;
93
- margin-right: 10px;
94
- }
95
- .header h1 {
96
- font-size: 40px;
97
- color: white;
98
- margin: 0;
99
- align: center;
100
  }
101
  </style>
102
  """, unsafe_allow_html=True)
103
 
 
104
  st.markdown("""
105
  <div class="header">
106
- <center><h1>Daily Mirror News Analyzer</h1></center>
 
107
  </div>
 
108
 
109
- """, unsafe_allow_html =True)
110
-
111
- # App title and description
112
-
113
  st.markdown("""
114
- Analyze news excerpts with our powerful AI tools:
115
- - Classify news articles into categories
116
- - Get answers to your questions about the news content
117
- - Visualize key themes
118
- """)
 
 
 
 
 
 
 
119
 
120
  # Create tabs for different functionalities
121
  tab1, tab2, tab3 = st.tabs(["News Classification", "Q&A Pipeline", "Advanced Features"])
122
 
123
  with tab1:
124
-
125
  st.header("News Classification Pipeline")
126
  st.write("Upload a CSV file containing news excerpts to classify them into categories.")
127
 
128
- # File uploader
129
- uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
130
-
131
- # Check the file
132
- if uploaded_file is None:
133
- st.warning("Please upload a CSV file.")
134
-
135
 
 
 
136
  else:
137
  df = pd.read_csv(uploaded_file)
138
-
139
  # Load the fine-tuned news classifier
140
  classifier = pipeline("text-classification", model="Imasha17/News_classification.3")
141
 
142
- # Preprocess
143
- # Lowercase
144
  df["cleaned_content"] = df["content"].str.lower()
145
 
146
  # Remove URLs
147
  def remove_urls(text):
148
  url_pattern = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
149
- text = url_pattern.sub(r'', text)
150
- return text.strip()
151
-
152
- # applying the function
153
- df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_urls(text))
154
 
155
  # Remove Emails
156
  def remove_emails(text):
157
  email_pattern = re.compile(r'\S+@\S+')
158
  return email_pattern.sub(r'', text)
 
159
 
160
- # applying the function
161
- df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_emails(text))
162
-
163
- #Remove punctuations
164
  def remove_punctuation(text):
165
  return "".join([char for char in text if char not in string.punctuation])
 
166
 
167
- # applying the function
168
- df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_punctuation(text))
169
-
170
- # Get the list of stop words
171
  stop_words = set(stopwords.words('english'))
172
-
173
- # define the function
174
  def remove_stopwords(text):
175
- return " ".join([word for word in str(text).split() if word not in stop_words])
176
-
177
- # apply the function
178
- df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_stopwords(text))
179
 
180
- # define the function
181
  def remove_special_characters(text):
182
  return re.sub(r'[^A-Za-z\s]', '', text)
183
-
184
- # apply the function
185
- df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_special_characters(text))
186
 
187
- #Remove Frequent words
188
-
189
- # Get the count of each word in cleaned_text
190
  word_count = Counter(df["cleaned_content"].str.split(expand=True).stack())
191
-
192
- # Get a set of common words
193
- common_words = set([word for (word,count) in word_count.most_common(10)])
194
-
195
- # deinfe the function
196
  def remove_common_words(text):
197
- return " ".join([word for word in str(text).split() if word not in common_words])
198
-
199
- # apply the function
200
- df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_common_words(text))
201
-
202
- #Remove rare words
203
- # Get a set of rare words
204
- rare_words = set([word for (word,count) in word_count.most_common()[:-20-1:-1]])
205
- print(rare_words)
206
 
207
- # define the function
 
208
  def remove_rare_words(text):
209
- return " ".join([word for word in str(text).split() if word not in rare_words])
210
-
211
- df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_rare_words(text))
212
 
 
213
  df['tokenized_content'] = df['cleaned_content'].apply(lambda text: text.split())
214
-
215
- # initialize stemmer
216
  stemmer = PorterStemmer()
217
-
218
- # Defining the function
219
  def stem_tokens(tokens):
220
- stems = [stemmer.stem(token) for token in tokens]
221
- return stems
222
-
223
- # apply the function
224
- df['stemmed_content'] = df['tokenized_content'].apply(lambda text: stem_tokens(text))
225
-
226
-
227
-
228
  df["preprocessed_content"] = df["stemmed_content"].apply(lambda text: " ".join(text))
229
 
230
- # Classify each article and store the predictions
231
  df["Class"] = df["preprocessed_content"].apply(lambda text: classifier(text)[0]["label"])
232
-
233
- #Delete Unnecessary columns
234
  df = df[['content','Class']]
235
-
236
-
237
- # Show results
238
  st.subheader("Classification Results")
239
  st.write(df)
240
-
241
- # Show distribution
242
  st.subheader("Class Distribution")
243
  class_dist = df['Class'].value_counts()
244
  st.bar_chart(class_dist)
245
 
246
-
247
- # Download button
248
  st.subheader("Download Results")
249
  csv = df.to_csv(index=False).encode('utf-8')
250
  st.download_button(
@@ -253,59 +227,48 @@ with tab1:
253
  file_name='output.csv',
254
  mime='text/csv'
255
  )
256
-
257
-
258
-
259
 
260
  with tab2:
261
-
262
  st.header("Question Answering Pipeline")
263
  st.write("Ask questions about news content and get answers from our AI model.")
264
 
265
  if uploaded_file is not None:
266
- # Load the CSV and prepare context for the model
267
- context = ' '.join(df['content'].tolist()) # Use predictions for Q&A
268
  st.write(f"Loaded {len(df)} news excerpts")
269
  else:
270
  st.warning("Please upload a CSV file.")
271
 
272
- # Input field for the question
273
  question = st.text_input("Enter your question:")
274
-
275
- # Handle the "Get Answer" button
276
  if st.button("Get Answer"):
277
  if uploaded_file is None:
278
- # Display an error message if no file is uploaded
279
  st.error("Please upload a CSV file before asking a question.")
280
  elif context and question:
281
- # If both a file and a question are provided, answer the question
282
  with st.spinner("Searching for answers..."):
283
- qa_pipeline = load_qa_model() # Ensure this function is defined elsewhere
284
  result = qa_pipeline(question=question, context=context)
285
-
286
- # Display the answer and details
287
  st.subheader("Answer")
288
  st.success(result['answer'])
289
-
290
  st.subheader("Details")
291
  st.write(f"Confidence: {result['score']:.2f}")
292
  else:
293
  st.error("Please enter a question.")
294
-
295
-
296
- # Question Answering section
297
  st.header("Ask Questions Based on Your News Content")
298
  context_1 = st.text_area("Enter the news content (context):")
299
- question_1 = st.text_input("Enter your question:" , key="question_input" )
300
-
301
- if st.button("Get Answer" , key="get_answer_1"):
302
  if context_1 and question_1:
303
- answer_1 = qa_pipeline({'context': context, 'question': question})
304
  st.success(f"Answer: {answer_1['answer']}")
305
  else:
306
- st.warning("Provide both context and question.")
 
307
 
308
  with tab3:
 
309
  st.header("Advanced Features")
310
  st.write("Explore additional functionalities to enhance your news analysis.")
311
 
@@ -330,7 +293,6 @@ with tab3:
330
  with st.spinner("Identifying entities..."):
331
  ner_pipeline = pipeline("ner", grouped_entities=True)
332
  results = ner_pipeline(ner_text)
333
-
334
  entities = []
335
  for entity in results:
336
  entities.append({
@@ -338,7 +300,6 @@ with tab3:
338
  "Word": entity['word'],
339
  "Score": entity['score']
340
  })
341
-
342
  st.table(pd.DataFrame(entities))
343
 
344
  # Text Summarization
@@ -349,30 +310,29 @@ with tab3:
349
  summarizer = pipeline("summarization")
350
  summary = summarizer(summary_text, max_length=130, min_length=30)
351
  st.write(summary[0]['summary_text'])
 
352
 
353
- # Sidebar with additional info
354
  with st.sidebar:
355
- st.image("https://via.placeholder.com/150x50?text=Daily+Mirror", width=150)
356
  st.title("About")
357
  st.write("""
358
  This app helps analyze news content:
359
  - Classify news into categories
360
  - Answer questions about news content
361
  - Perform advanced text analysis
362
- """)
363
 
364
  st.title("Instructions")
365
  st.write("""
366
- 1. Upload a CSV file with 'content' column
367
- 2. Click classify to categorize news
368
- 3. Download results as CSV
369
- 4. Use Q&A tab to ask questions
370
- """)
371
-
372
-
373
 
374
  st.markdown("[View model on Hugging Face](https://huggingface.co/Imasha17/News_classification.3)")
375
 
376
  # Footer
377
  st.markdown("---")
378
- st.markdown("© 2023 Daily Mirror News Analyzer | Powered by Hugging Face Transformers")
 
15
  import os
16
  from nltk.stem import PorterStemmer
17
 
 
18
  # Download NLTK resources
19
  nltk.download('punkt')
20
  nltk.download('stopwords')
 
24
  nltk_data_path = "/home/user/nltk_data"
25
  if not os.path.exists(nltk_data_path):
26
  os.makedirs(nltk_data_path)
 
27
  nltk.data.path.append(nltk_data_path)
28
  nltk.download('punkt', download_dir=nltk_data_path)
29
 
 
42
  def load_qa_model():
43
  return pipeline("question-answering", model="deepset/roberta-base-squad2")
44
 
 
 
45
  # Function to generate word cloud
46
  def generate_wordcloud(text, title=None):
47
  wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
 
51
  plt.title(title, fontsize=20)
52
  st.pyplot(plt)
53
 
54
+ # Set page config with an attractive icon and layout options
55
  st.set_page_config(
56
  page_title="News Analysis Dashboard",
57
  page_icon="📰",
 
59
  initial_sidebar_state="expanded"
60
  )
61
 
62
+ # Custom CSS to improve styling
63
  st.markdown("""
64
  <style>
65
+ /* Overall page background */
66
+ .reportview-container {
67
+ background: #f0f2f6;
68
  }
69
+ /* Header styling */
70
+ .header {
71
+ background: linear-gradient(90deg, #1a73e8, #4285f4);
72
+ padding: 20px;
73
+ border-radius: 8px;
74
+ margin-bottom: 20px;
75
+ text-align: center;
76
  color: white;
77
  }
78
+ .header h1 {
79
+ font-size: 48px;
80
+ margin: 0;
81
+ font-weight: bold;
82
+ }
83
+ /* Sidebar styling */
84
+ .css-1d391kg {
85
+ background-color: #ffffff;
86
+ }
87
+ /* Button styling */
88
+ .stButton>button {
89
+ background-color: #1a73e8;
90
  color: white;
91
+ border: none;
92
+ padding: 10px 20px;
93
+ border-radius: 5px;
94
+ font-size: 16px;
95
  }
96
+ .stButton>button:hover {
97
+ background-color: #0c55b3;
98
+ }
99
+ /* Text input styling */
100
  .stTextInput>div>div>input {
101
  background-color: #ffffff;
102
+ color: #333333;
103
+ font-size: 16px;
104
  }
105
+ /* Card style containers */
106
+ .card {
107
+ background-color: #ffffff;
108
+ padding: 20px;
109
+ border-radius: 8px;
110
  margin-bottom: 20px;
111
+ box-shadow: 0px 4px 8px rgba(0,0,0,0.05);
 
 
 
 
 
 
 
 
 
 
112
  }
113
  </style>
114
  """, unsafe_allow_html=True)
115
 
116
+ # Banner header
117
  st.markdown("""
118
  <div class="header">
119
+ <h1>Daily Mirror News Analyzer</h1>
120
+ <p style="font-size: 20px; margin-top: 5px;">Analyze, classify, and explore news content with AI</p>
121
  </div>
122
+ """, unsafe_allow_html=True)
123
 
124
+ # Layout introduction text
 
 
 
125
  st.markdown("""
126
+ <div class="card">
127
+ <h2>Welcome!</h2>
128
+ <p>This dashboard allows you to:
129
+ <ul>
130
+ <li>Classify news articles into categories</li>
131
+ <li>Ask questions about the news content</li>
132
+ <li>Visualize sentiment, entities, and summaries</li>
133
+ </ul>
134
+ Use the tabs below to navigate between different functionalities.
135
+ </p>
136
+ </div>
137
+ """, unsafe_allow_html=True)
138
 
139
  # Create tabs for different functionalities
140
  tab1, tab2, tab3 = st.tabs(["News Classification", "Q&A Pipeline", "Advanced Features"])
141
 
142
  with tab1:
143
+ st.markdown('<div class="card">', unsafe_allow_html=True)
144
  st.header("News Classification Pipeline")
145
  st.write("Upload a CSV file containing news excerpts to classify them into categories.")
146
 
147
+ # File uploader with a descriptive message
148
+ uploaded_file = st.file_uploader("Choose a CSV file (must contain a 'content' column)", type="csv")
 
 
 
 
 
149
 
150
+ if uploaded_file is None:
151
+ st.warning("Please upload a CSV file to get started.")
152
  else:
153
  df = pd.read_csv(uploaded_file)
 
154
  # Load the fine-tuned news classifier
155
  classifier = pipeline("text-classification", model="Imasha17/News_classification.3")
156
 
157
+ # Preprocessing steps
 
158
  df["cleaned_content"] = df["content"].str.lower()
159
 
160
  # Remove URLs
161
  def remove_urls(text):
162
  url_pattern = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
163
+ return url_pattern.sub(r'', text).strip()
164
+ df["cleaned_content"] = df["cleaned_content"].apply(remove_urls)
 
 
 
165
 
166
  # Remove Emails
167
  def remove_emails(text):
168
  email_pattern = re.compile(r'\S+@\S+')
169
  return email_pattern.sub(r'', text)
170
+ df["cleaned_content"] = df["cleaned_content"].apply(remove_emails)
171
 
172
+ # Remove punctuation
 
 
 
173
  def remove_punctuation(text):
174
  return "".join([char for char in text if char not in string.punctuation])
175
+ df["cleaned_content"] = df["cleaned_content"].apply(remove_punctuation)
176
 
177
+ # Remove stopwords
 
 
 
178
  stop_words = set(stopwords.words('english'))
 
 
179
  def remove_stopwords(text):
180
+ return " ".join([word for word in text.split() if word not in stop_words])
181
+ df["cleaned_content"] = df["cleaned_content"].apply(remove_stopwords)
 
 
182
 
183
+ # Remove special characters
184
  def remove_special_characters(text):
185
  return re.sub(r'[^A-Za-z\s]', '', text)
186
+ df["cleaned_content"] = df["cleaned_content"].apply(remove_special_characters)
 
 
187
 
188
+ # Remove frequent words
 
 
189
  word_count = Counter(df["cleaned_content"].str.split(expand=True).stack())
190
+ common_words = set([word for (word, count) in word_count.most_common(10)])
 
 
 
 
191
  def remove_common_words(text):
192
+ return " ".join([word for word in text.split() if word not in common_words])
193
+ df["cleaned_content"] = df["cleaned_content"].apply(remove_common_words)
 
 
 
 
 
 
 
194
 
195
+ # Remove rare words
196
+ rare_words = set([word for (word, count) in word_count.most_common()[:-20-1:-1]])
197
  def remove_rare_words(text):
198
+ return " ".join([word for word in text.split() if word not in rare_words])
199
+ df["cleaned_content"] = df["cleaned_content"].apply(remove_rare_words)
 
200
 
201
+ # Tokenize and stem
202
  df['tokenized_content'] = df['cleaned_content'].apply(lambda text: text.split())
 
 
203
  stemmer = PorterStemmer()
 
 
204
  def stem_tokens(tokens):
205
+ return [stemmer.stem(token) for token in tokens]
206
+ df['stemmed_content'] = df['tokenized_content'].apply(stem_tokens)
 
 
 
 
 
 
207
  df["preprocessed_content"] = df["stemmed_content"].apply(lambda text: " ".join(text))
208
 
209
+ # Classify each article and store predictions
210
  df["Class"] = df["preprocessed_content"].apply(lambda text: classifier(text)[0]["label"])
211
+
212
+ # Keep only necessary columns
213
  df = df[['content','Class']]
214
+
 
 
215
  st.subheader("Classification Results")
216
  st.write(df)
217
+
 
218
  st.subheader("Class Distribution")
219
  class_dist = df['Class'].value_counts()
220
  st.bar_chart(class_dist)
221
 
 
 
222
  st.subheader("Download Results")
223
  csv = df.to_csv(index=False).encode('utf-8')
224
  st.download_button(
 
227
  file_name='output.csv',
228
  mime='text/csv'
229
  )
230
+ st.markdown('</div>', unsafe_allow_html=True)
 
 
231
 
232
  with tab2:
233
+ st.markdown('<div class="card">', unsafe_allow_html=True)
234
  st.header("Question Answering Pipeline")
235
  st.write("Ask questions about news content and get answers from our AI model.")
236
 
237
  if uploaded_file is not None:
238
+ context = ' '.join(df['content'].tolist())
 
239
  st.write(f"Loaded {len(df)} news excerpts")
240
  else:
241
  st.warning("Please upload a CSV file.")
242
 
 
243
  question = st.text_input("Enter your question:")
 
 
244
  if st.button("Get Answer"):
245
  if uploaded_file is None:
 
246
  st.error("Please upload a CSV file before asking a question.")
247
  elif context and question:
 
248
  with st.spinner("Searching for answers..."):
249
+ qa_pipeline = load_qa_model()
250
  result = qa_pipeline(question=question, context=context)
 
 
251
  st.subheader("Answer")
252
  st.success(result['answer'])
 
253
  st.subheader("Details")
254
  st.write(f"Confidence: {result['score']:.2f}")
255
  else:
256
  st.error("Please enter a question.")
257
+
258
+ st.markdown("---")
 
259
  st.header("Ask Questions Based on Your News Content")
260
  context_1 = st.text_area("Enter the news content (context):")
261
+ question_1 = st.text_input("Enter your question:", key="question_input")
262
+ if st.button("Get Answer", key="get_answer_1"):
 
263
  if context_1 and question_1:
264
+ answer_1 = qa_pipeline({'context': context_1, 'question': question_1})
265
  st.success(f"Answer: {answer_1['answer']}")
266
  else:
267
+ st.warning("Provide both context and question.")
268
+ st.markdown('</div>', unsafe_allow_html=True)
269
 
270
  with tab3:
271
+ st.markdown('<div class="card">', unsafe_allow_html=True)
272
  st.header("Advanced Features")
273
  st.write("Explore additional functionalities to enhance your news analysis.")
274
 
 
293
  with st.spinner("Identifying entities..."):
294
  ner_pipeline = pipeline("ner", grouped_entities=True)
295
  results = ner_pipeline(ner_text)
 
296
  entities = []
297
  for entity in results:
298
  entities.append({
 
300
  "Word": entity['word'],
301
  "Score": entity['score']
302
  })
 
303
  st.table(pd.DataFrame(entities))
304
 
305
  # Text Summarization
 
310
  summarizer = pipeline("summarization")
311
  summary = summarizer(summary_text, max_length=130, min_length=30)
312
  st.write(summary[0]['summary_text'])
313
+ st.markdown('</div>', unsafe_allow_html=True)
314
 
315
+ # Enhanced Sidebar with branding and instructions
316
  with st.sidebar:
317
+ st.image("https://via.placeholder.com/300x100?text=Daily+Mirror", width=300)
318
  st.title("About")
319
  st.write("""
320
  This app helps analyze news content:
321
  - Classify news into categories
322
  - Answer questions about news content
323
  - Perform advanced text analysis
324
+ """)
325
 
326
  st.title("Instructions")
327
  st.write("""
328
+ 1. Upload a CSV file with a 'content' column.
329
+ 2. Click on the appropriate tab to use a feature.
330
+ 3. Download results as CSV.
331
+ 4. Use the Q&A tab to ask questions about the news.
332
+ """)
 
 
333
 
334
  st.markdown("[View model on Hugging Face](https://huggingface.co/Imasha17/News_classification.3)")
335
 
336
  # Footer
337
  st.markdown("---")
338
+ st.markdown("<div style='text-align: center;'>© 2023 Daily Mirror News Analyzer | Powered by Hugging Face Transformers</div>", unsafe_allow_html=True)