Prageeth-1 committed on
Commit
29621a6
·
verified ·
1 Parent(s): 9282558

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +273 -0
app.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import re
5
+ import nltk
6
+ from nltk.corpus import stopwords
7
+ from nltk.stem import WordNetLemmatizer
8
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
9
+ from wordcloud import WordCloud
10
+ import matplotlib.pyplot as plt
11
+ import io
12
+
13
+ # Download NLTK resources
14
+ nltk.download('punkt')
15
+ nltk.download('stopwords')
16
+ nltk.download('wordnet')
17
+
18
+ # Initialize lemmatizer
19
+ lemmatizer = WordNetLemmatizer()
20
+
21
+ # Load models (cache them to avoid reloading on every interaction)
22
+ @st.cache_resource
23
+ def load_classification_model():
24
+ model_name = "your-username/daily-mirror-news-classifier" # Replace with your model path
25
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
26
+ model = AutoModelForSequenceClassification.from_pretrained(model_name)
27
+ return pipeline("text-classification", model=model, tokenizer=tokenizer)
28
+
29
+ @st.cache_resource
30
+ def load_qa_model():
31
+ return pipeline("question-answering", model="deepset/roberta-base-squad2")
32
+
33
+ # Preprocessing function (same as in Section 01)
34
+ def preprocess_text(text):
35
+ # Lowercase
36
+ text = text.lower()
37
+ # Remove URLs
38
+ text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
39
+ # Remove special characters and numbers
40
+ text = re.sub(r'[^a-zA-Z\s]', '', text)
41
+ # Tokenize
42
+ tokens = word_tokenize(text)
43
+ # Remove stopwords
44
+ stop_words = set(stopwords.words('english'))
45
+ tokens = [token for token in tokens if token not in stop_words]
46
+ # Lemmatization
47
+ tokens = [lemmatizer.lemmatize(token) for token in tokens]
48
+ # Join tokens back to string
49
+ return ' '.join(tokens)
50
+
51
+ # Function to generate word cloud
52
+ def generate_wordcloud(text, title=None):
53
+ wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
54
+ plt.figure(figsize=(10, 5))
55
+ plt.imshow(wordcloud, interpolation='bilinear')
56
+ plt.axis("off")
57
+ plt.title(title, fontsize=20)
58
+ st.pyplot(plt)
59
+
60
+ # Set page config
61
+ st.set_page_config(
62
+ page_title="News Analysis Dashboard",
63
+ page_icon="📰",
64
+ layout="wide",
65
+ initial_sidebar_state="expanded"
66
+ )
67
+
68
+ # Custom CSS
69
+ st.markdown("""
70
+ <style>
71
+ .main {
72
+ background-color: #f5f5f5;
73
+ }
74
+ .stButton>button {
75
+ background-color: #4CAF50;
76
+ color: white;
77
+ }
78
+ .stDownloadButton>button {
79
+ background-color: #2196F3;
80
+ color: white;
81
+ }
82
+ .stTextInput>div>div>input {
83
+ background-color: #ffffff;
84
+ }
85
+ </style>
86
+ """, unsafe_allow_html=True)
87
+
88
+ # App title and description
89
+ st.title("📰 Daily Mirror News Analyzer")
90
+ st.markdown("""
91
+ Analyze news excerpts with our powerful AI tools:
92
+ - Classify news articles into categories
93
+ - Get answers to your questions about the news content
94
+ - Visualize key themes
95
+ """)
96
+
97
+ # Create tabs for different functionalities
98
+ tab1, tab2, tab3 = st.tabs(["📋 News Classification", "❓ Q&A Pipeline", "✨ Advanced Features"])
99
+
100
+ with tab1:
101
+ st.header("News Classification Pipeline")
102
+ st.write("Upload a CSV file containing news excerpts to classify them into categories.")
103
+
104
+ # File uploader
105
+ uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
106
+
107
+ if uploaded_file is not None:
108
+ # Read CSV file
109
+ df = pd.read_csv(uploaded_file)
110
+
111
+ # Check if 'excerpt' column exists
112
+ if 'excerpt' not in df.columns:
113
+ st.error("The CSV file must contain an 'excerpt' column with news content.")
114
+ else:
115
+ # Show preview
116
+ st.subheader("File Preview")
117
+ st.write(df.head())
118
+
119
+ # Classify button
120
+ if st.button("Classify News Excerpts"):
121
+ with st.spinner("Classifying news excerpts..."):
122
+ # Load classification model
123
+ classifier = load_classification_model()
124
+
125
+ # Preprocess and classify
126
+ df['preprocessed_text'] = df['excerpt'].apply(preprocess_text)
127
+ predictions = classifier(df['preprocessed_text'].tolist())
128
+
129
+ # Add predictions to dataframe
130
+ df['class'] = [pred['label'] for pred in predictions]
131
+ df['confidence'] = [pred['score'] for pred in predictions]
132
+
133
+ # Show results
134
+ st.subheader("Classification Results")
135
+ st.write(df)
136
+
137
+ # Show distribution
138
+ st.subheader("Class Distribution")
139
+ class_dist = df['class'].value_counts()
140
+ st.bar_chart(class_dist)
141
+
142
+ # Generate word cloud for each class
143
+ st.subheader("Word Clouds by Category")
144
+ classes = df['class'].unique()
145
+ cols = st.columns(len(classes))
146
+
147
+ for i, class_name in enumerate(classes):
148
+ with cols[i]:
149
+ st.markdown(f"**{class_name}**")
150
+ class_text = ' '.join(df[df['class'] == class_name]['excerpt'])
151
+ generate_wordcloud(class_text)
152
+
153
+ # Download button
154
+ st.subheader("Download Results")
155
+ csv = df.to_csv(index=False).encode('utf-8')
156
+ st.download_button(
157
+ label="Download output.csv",
158
+ data=csv,
159
+ file_name='output.csv',
160
+ mime='text/csv'
161
+ )
162
+
163
+ with tab2:
164
+ st.header("Question Answering Pipeline")
165
+ st.write("Ask questions about news content and get answers from our AI model.")
166
+
167
+ # Option to upload file or enter text manually
168
+ input_option = st.radio("Choose input method:", ("Upload CSV", "Enter Text Manually"))
169
+
170
+ context = ""
171
+
172
+ if input_option == "Upload CSV":
173
+ qa_file = st.file_uploader("Upload news content (CSV)", type="csv")
174
+ if qa_file is not None:
175
+ qa_df = pd.read_csv(qa_file)
176
+ if 'excerpt' not in qa_df.columns:
177
+ st.error("CSV must contain an 'excerpt' column")
178
+ else:
179
+ context = ' '.join(qa_df['excerpt'].tolist())
180
+ st.write(f"Loaded {len(qa_df)} news excerpts")
181
+ else:
182
+ context = st.text_area("Paste news content here:", height=200)
183
+
184
+ question = st.text_input("Enter your question:")
185
+
186
+ if st.button("Get Answer") and context and question:
187
+ with st.spinner("Searching for answers..."):
188
+ qa_pipeline = load_qa_model()
189
+ result = qa_pipeline(question=question, context=context)
190
+
191
+ st.subheader("Answer")
192
+ st.success(result['answer'])
193
+
194
+ st.subheader("Details")
195
+ st.write(f"Confidence: {result['score']:.2f}")
196
+ st.write(f"Context: {result['context']}")
197
+
198
+ with tab3:
199
+ st.header("Advanced Features")
200
+ st.write("Explore additional functionalities to enhance your news analysis.")
201
+
202
+ # Sentiment Analysis
203
+ st.subheader("📊 Sentiment Analysis")
204
+ sentiment_text = st.text_area("Enter text for sentiment analysis:", height=100)
205
+ if st.button("Analyze Sentiment"):
206
+ with st.spinner("Analyzing sentiment..."):
207
+ sentiment_pipeline = pipeline("sentiment-analysis")
208
+ result = sentiment_pipeline(sentiment_text)[0]
209
+ st.write(f"Label: {result['label']}")
210
+ st.write(f"Confidence: {result['score']:.2f}")
211
+ if result['label'] == 'POSITIVE':
212
+ st.success("This text appears positive!")
213
+ else:
214
+ st.warning("This text appears negative.")
215
+
216
+ # Named Entity Recognition
217
+ st.subheader("🏷️ Named Entity Recognition")
218
+ ner_text = st.text_area("Enter text for entity recognition:", height=100)
219
+ if st.button("Extract Entities"):
220
+ with st.spinner("Identifying entities..."):
221
+ ner_pipeline = pipeline("ner", grouped_entities=True)
222
+ results = ner_pipeline(ner_text)
223
+
224
+ entities = []
225
+ for entity in results:
226
+ entities.append({
227
+ "Entity": entity['entity_group'],
228
+ "Word": entity['word'],
229
+ "Score": entity['score']
230
+ })
231
+
232
+ st.table(pd.DataFrame(entities))
233
+
234
+ # Text Summarization
235
+ st.subheader("✍️ Text Summarization")
236
+ summary_text = st.text_area("Enter text to summarize:", height=150)
237
+ if st.button("Generate Summary"):
238
+ with st.spinner("Generating summary..."):
239
+ summarizer = pipeline("summarization")
240
+ summary = summarizer(summary_text, max_length=130, min_length=30)
241
+ st.write(summary[0]['summary_text'])
242
+
243
+ # Sidebar with additional info
244
+ with st.sidebar:
245
+ st.image("https://via.placeholder.com/150x50?text=Daily+Mirror", width=150)
246
+ st.title("About")
247
+ st.write("""
248
+ This app helps analyze news content using AI-powered tools:
249
+ - Classify news into categories
250
+ - Answer questions about news content
251
+ - Perform advanced text analysis
252
+ """)
253
+
254
+ st.title("Instructions")
255
+ st.write("""
256
+ 1. Upload a CSV file with 'excerpt' column
257
+ 2. Click classify to categorize news
258
+ 3. Download results as CSV
259
+ 4. Use Q&A tab to ask questions
260
+ """)
261
+
262
+ st.title("Model Information")
263
+ st.write("""
264
+ - Classification: Fine-tuned DistilBERT
265
+ - Q&A: RoBERTa-base
266
+ - Sentiment: DistilBERT-base
267
+ """)
268
+
269
+ st.markdown("[View model on Hugging Face](https://huggingface.co/your-username/daily-mirror-news-classifier)")
270
+
271
+ # Footer
272
+ st.markdown("---")
273
+ st.markdown("© 2023 Daily Mirror News Analyzer | Powered by Hugging Face Transformers")