Imasha17 committed on
Commit
fe0b7af
·
verified ·
1 Parent(s): 807487d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +378 -0
app.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import re
5
+ import nltk
6
+ from nltk.corpus import stopwords
7
+ from nltk.stem import WordNetLemmatizer
8
+ from nltk.tokenize import word_tokenize
9
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
10
+ from wordcloud import WordCloud
11
+ import matplotlib.pyplot as plt
12
+ import io
13
+ from collections import Counter
14
+ import string
15
+ import os
16
+ from nltk.stem import PorterStemmer
17
+
18
# Download NLTK resources.
# Streamlit re-executes this script on every interaction, so the setup below
# must be safe to repeat: quiet=True keeps the repeated download calls from
# flooding the log.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Ensure NLTK data is also available from an explicit runtime directory.
nltk_data_path = "/home/user/nltk_data"
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(nltk_data_path, exist_ok=True)
# nltk.data.path is a process-wide list; appending unconditionally would add
# a duplicate entry on every rerun of the script.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)
nltk.download('punkt', download_dir=nltk_data_path, quiet=True)

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()
32
+
33
+ # Load models (cache them to avoid reloading on every interaction)
34
@st.cache_resource
def load_classification_model():
    """Build the news text-classification pipeline.

    The tokenizer and the sequence-classification model are both loaded from
    the same Hugging Face checkpoint. ``st.cache_resource`` keeps the
    resulting pipeline so it is not rebuilt on every interaction.

    Returns:
        A ``transformers`` "text-classification" pipeline.
    """
    checkpoint = "Imasha17/News_classification.4"  # Replace with your model path
    return pipeline(
        "text-classification",
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
    )
40
+
41
@st.cache_resource
def load_qa_model():
    """Build the question-answering pipeline, cached across reruns.

    Returns:
        A ``transformers`` "question-answering" pipeline backed by the
        ``deepset/roberta-base-squad2`` checkpoint.
    """
    qa_checkpoint = "deepset/roberta-base-squad2"
    qa = pipeline("question-answering", model=qa_checkpoint)
    return qa
44
+
45
+ # Function to generate word cloud
46
def generate_wordcloud(text, title=None):
    """Render a word cloud of ``text`` in the Streamlit app.

    Args:
        text: The text to visualise.
        title: Optional title drawn above the cloud.

    The cloud is drawn on its own figure, which is closed after rendering.
    Drawing on the implicit global pyplot figure would leave one more open
    figure behind on every rerun of the script.
    """
    # WordCloud.generate raises on text without any words; tell the user
    # instead of aborting the whole script run.
    if not text or not text.strip():
        st.warning("No text available to build a word cloud.")
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    if title:
        ax.set_title(title, fontsize=20)
    st.pyplot(fig)
    plt.close(fig)
53
+
54
+ # Set page config with an attractive icon and layout options
55
st.set_page_config(
    page_title="News Analysis Dashboard",
    page_icon="📰",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS to improve styling.
# The opening tags below start at column 0 on purpose: Markdown treats lines
# indented by four or more spaces as code blocks rather than raw HTML.
st.markdown("""
<style>

.reportview-container {
    background: #f0f2f6;
}
/* Header styling */
.header {
    background: linear-gradient(90deg, #1a73e8, #4285f4);
    padding: 20px;
    border-radius: 8px;
    margin-bottom: 20px;
    text-align: center;
    color: white;
}
.header h1 {
    font-size: 48px;
    margin: 0;
    font-weight: bold;
}
/* Sidebar styling */
.css-1d391kg {
    background-color: #ffffff;
}
/* Button styling */
.stButton>button {
    background-color: #1a73e8;
    color: white;
    border: none;
    padding: 10px 20px;
    border-radius: 5px;
    font-size: 16px;
}
.stButton>button:hover {
    background-color: #0c55b3;
}
/* Text input styling */
.stTextInput>div>div>input {
    background-color: #ffffff;
    color: #333333;
    font-size: 16px;
}
/* Card style containers */
.card {
    background-color: #ffffff;
    padding: 20px;
    border-radius: 8px;
    margin-bottom: 20px;
    box-shadow: 0px 4px 8px rgba(0,0,0,0.05);
    color: #1a73e8;
}
</style>
""", unsafe_allow_html=True)

# Banner header
st.markdown("""
<div class="header">
    <h1>News Content Analyzer</h1>
    <p style="font-size: 20px; margin-top: 5px;">Analyze, classify, and explore news content with AI</p>
</div>
""", unsafe_allow_html=True)

# Layout introduction text.
# The list sits between two paragraphs rather than inside one: a <ul> is not
# allowed inside <p>, so browsers would close the paragraph early and leave
# the closing sentence outside the styled element.
st.markdown("""
<div class="card">
    <h2 style="color:#1a73e8;">Welcome!</h2>
    <p style="color:#1a73e8;">This dashboard allows you to:</p>
    <ul style="color:#1a73e8;">
        <li>Classify news articles into categories</li>
        <li>Ask questions about the news content</li>
        <li>Visualize sentiment, entities, and summaries</li>
    </ul>
    <p style="color:#1a73e8;">Use the tabs below to navigate between different functionalities.</p>
</div>
""", unsafe_allow_html=True)

# Create tabs for different functionalities
tab1, tab2, tab3 = st.tabs(["News Classification", "Ask Questions", "Advanced Features"])
142
+
143
# Patterns used by the preprocessing below, compiled once per script run.
_URL_PATTERN = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
_EMAIL_PATTERN = re.compile(r'\S+@\S+')
_NON_LETTER_PATTERN = re.compile(r'[^A-Za-z\s]')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _preprocess_news(content):
    """Clean and stem a series of news texts for the classifier.

    Steps, in order: lowercase, strip URLs, strip e-mail addresses, strip
    punctuation, drop English stopwords, drop non-letter characters, drop the
    10 most frequent and the 20 least frequent words of the whole upload, then
    Porter-stem the remaining tokens.

    Args:
        content: pandas Series of strings (no missing values).

    Returns:
        A pandas Series of preprocessed strings sharing ``content``'s index.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def clean(text):
        text = _URL_PATTERN.sub('', text.lower()).strip()
        text = _EMAIL_PATTERN.sub('', text)
        text = text.translate(_PUNCTUATION_TABLE)
        text = " ".join(word for word in text.split() if word not in stop_words)
        return _NON_LETTER_PATTERN.sub('', text)

    cleaned = [clean(text) for text in content]

    # Corpus-wide frequencies, counted row by row so ties keep first-seen
    # order. Both the frequent and the rare sets come from this one ranking.
    word_count = Counter(word for text in cleaned for word in text.split())
    ranked = [word for word, _ in word_count.most_common()]
    dropped = set(ranked[:10]) | set(ranked[-20:])

    preprocessed = [
        " ".join(stemmer.stem(word) for word in text.split() if word not in dropped)
        for text in cleaned
    ]
    return pd.Series(preprocessed, index=content.index, dtype="object")


with tab1:
    st.markdown('<div class="card">', unsafe_allow_html=True)
    st.header("News Classification ")
    st.write("Upload a CSV file containing news excerpts to classify them into categories.")

    # File uploader with a descriptive message
    uploaded_file = st.file_uploader("Choose a CSV file (must contain a 'content' column)", type="csv")

    if uploaded_file is None:
        st.warning("Please upload a CSV file to get started.")
    else:
        try:
            df = pd.read_csv(uploaded_file)
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
            st.error(f"Could not read the uploaded CSV file: {exc}")
            st.stop()

        # Later code in this script reads df['content'], so stop this run
        # with a clear message rather than failing with a KeyError.
        if "content" not in df.columns:
            st.error("The uploaded CSV file must contain a 'content' column.")
            st.stop()

        # Preview uploaded data
        st.subheader("Preview Uploaded Data")
        st.dataframe(df.head(5))

        # Rows without content cannot be classified, and a NaN would break
        # the string operations in the preprocessing.
        df = df.dropna(subset=["content"]).reset_index(drop=True)
        df["content"] = df["content"].astype(str)

        if df.empty:
            st.warning("The uploaded file does not contain any news content to classify.")
        else:
            # Load the fine-tuned news classifier (cached across reruns)
            classifier = load_classification_model()

            df["preprocessed_content"] = _preprocess_news(df["content"])

            # Classify each article; truncation keeps articles longer than
            # the model's maximum input length from raising.
            predictions = classifier(df["preprocessed_content"].tolist(), truncation=True)
            df["Class"] = [prediction["label"] for prediction in predictions]

            # Word cloud visualization
            st.subheader("Word Cloud of News Content")
            try:
                wordcloud = WordCloud(width=800, height=400).generate(
                    ' '.join(df['preprocessed_content'])
                )
            except ValueError:
                # Raised when no words are left after preprocessing.
                st.info("Not enough text left after preprocessing to draw a word cloud.")
            else:
                fig, ax = plt.subplots(figsize=(10, 5))
                ax.imshow(wordcloud, interpolation='bilinear')
                ax.axis('off')
                st.pyplot(fig)
                plt.close(fig)

            # Keep only necessary columns
            df = df[['content', 'Class']]

            # Show classification results
            st.subheader("Classification Results")
            st.write(df)

            # Show class distribution
            st.subheader("Class Distribution")
            class_dist = df['Class'].value_counts()
            st.bar_chart(class_dist)

            # Download CSV file
            st.subheader("Download Results")
            csv = df.to_csv(index=False).encode('utf-8')
            st.download_button(
                label="Download output.csv",
                data=csv,
                file_name='output.csv',
                mime='text/csv',
            )
    st.markdown('</div>', unsafe_allow_html=True)
256
+
257
with tab2:
    st.markdown('<div class="card">', unsafe_allow_html=True)
    st.header("Ask Questions Based on Uploaded News Content File")
    st.write("Ask questions about news content and get answers from our AI model.")

    # Build the Q&A context from the uploaded file, if any. `context` is
    # always defined so the button handler below never hits an unbound name.
    context = ""
    if uploaded_file is not None:
        # Missing values are skipped: str.join cannot handle NaN.
        context = ' '.join(df['content'].dropna().astype(str))
        st.write(f"Loaded {len(df)} news excerpts")
    else:
        st.warning("Please upload a CSV file.")

    # Generate the answer based on the uploaded news content file
    question = st.text_input("Enter your question:")
    if st.button("Get Answer"):
        if uploaded_file is None:
            st.error("Please upload a CSV file before asking a question.")
        elif not question.strip():
            st.error("Please enter a question.")
        elif not context.strip():
            st.error("The uploaded file does not contain any news content to search.")
        else:
            with st.spinner("Searching for answers..."):
                # Load the (cached) model for the Q&A pipeline
                qa_pipeline = load_qa_model()
                result = qa_pipeline(question=question, context=context)
            st.subheader("Answer")
            st.success(result['answer'])
            st.subheader("Details")
            st.write(f"Confidence: {result['score']:.2f}")

    # Generate the answer based on news content typed in by the user
    st.markdown("---")
    st.header("Ask Questions Based on Your News Content")
    context_1 = st.text_area("Enter News Content", height=100)

    question_1 = st.text_input("Enter your question:", key="question_input")
    if st.button("Get Answer", key="get_answer_1"):
        # Both the context and the question must be provided
        if context_1.strip() and question_1.strip():
            with st.spinner("Searching for answers..."):
                qa_pipeline = load_qa_model()
                answer_1 = qa_pipeline(question=question_1, context=context_1)
            st.success(f"Answer: {answer_1['answer']}")
        else:
            st.warning("Provide both context and question.")
    st.markdown('</div>', unsafe_allow_html=True)
303
+
304
@st.cache_resource
def load_ner_model():
    """Build the named-entity-recognition pipeline, cached across reruns."""
    # aggregation_strategy="simple" replaces the deprecated
    # grouped_entities=True flag.
    return pipeline("ner", aggregation_strategy="simple")


@st.cache_resource
def load_summarization_model():
    """Build the summarization pipeline, cached across reruns."""
    return pipeline("summarization")


@st.cache_resource
def load_sentiment_model():
    """Build the sentiment-analysis pipeline, cached across reruns."""
    return pipeline("sentiment-analysis")


with tab3:
    st.markdown('<div class="card">', unsafe_allow_html=True)
    st.header("Advanced Features")
    st.write("Explore additional functionalities to enhance your news analysis.")

    # Named Entity Recognition of news content
    st.subheader("Named Entity Recognition Of News Content")
    ner_text = st.text_area("Enter News Content for entity recognition:", height=100)
    if st.button("Extract Entities"):
        if not ner_text.strip():
            st.warning("Please enter some news content first.")
        else:
            with st.spinner("Identifying entities..."):
                results = load_ner_model()(ner_text)
            entities = [
                {
                    "Entity": entity['entity_group'],
                    "Word": entity['word'],
                    "Score": entity['score'],
                }
                for entity in results
            ]
            if entities:
                st.table(pd.DataFrame(entities))
            else:
                st.info("No entities were found in the text.")

    # Text Summarization
    st.subheader("News Content Summarization")
    summary_text = st.text_area("Enter news content to summarize:", height=150)
    if st.button("Generate Summary"):
        if not summary_text.strip():
            st.warning("Please enter some news content first.")
        else:
            with st.spinner("Generating summary..."):
                # truncation keeps inputs longer than the model limit from raising
                summary = load_summarization_model()(
                    summary_text, max_length=130, min_length=30, truncation=True
                )
            st.write(summary[0]['summary_text'])

    # Sentiment Analysis
    # NOTE(review): the indentation of this section could not be recovered
    # from the source; it is assumed to belong to this tab - confirm.
    st.subheader("News Tone Detector")
    sentiment_text = st.text_area("Enter text for news content analysis:", height=100)
    if st.button("Analyze Tone"):
        if not sentiment_text.strip():
            st.warning("Please enter some news content first.")
        else:
            with st.spinner("Analyzing sentiment..."):
                result = load_sentiment_model()(sentiment_text, truncation=True)[0]
            st.write(f"Label: {result['label']}")
            st.write(f"Confidence: {result['score']:.2f}")
            if result['label'] == 'POSITIVE':
                st.success("This text appears positive!")
            else:
                st.warning("This text appears negative.")
    st.markdown('</div>', unsafe_allow_html=True)
353
+
354
+
355
+ # Enhanced Sidebar with branding and instructions
356
with st.sidebar:
    # The logo is optional branding: skip it when the image file is not
    # present so that a missing file cannot abort the script.
    if os.path.exists("news_logo.jpg"):
        st.image("news_logo.jpg", width=300)
    st.title("About")
    # The text inside the literals is kept at column 0 so Markdown renders
    # the lists as lists rather than as an indented code block.
    st.write("""
This app helps analyze news content:
- Classify news into categories
- Answer questions about news content
- Perform advanced text analysis
""")

    st.title("Instructions")
    st.write("""
1. Upload a CSV file with a 'content' column.
2. Click on the appropriate tab to use a feature.
3. Download results as CSV.
4. Use the Q&A tab to ask questions about the news.
""")

    st.markdown("[View model on Hugging Face](https://huggingface.co/Imasha17/News_classification.4)")

# Footer
st.markdown("---")
st.markdown("<div style='text-align: center;'>© 2023 Daily Mirror News Analyzer | Powered by Hugging Face Transformers</div>", unsafe_allow_html=True)