Spaces:
Sleeping
Sleeping
Commit
·
79b4249
1
Parent(s):
3452303
fixed UI
Browse files
- app.py +24 -10
- src/data_processor.py +20 -9
- src/insights_generator.py +12 -1
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
# app.py
|
|
|
|
| 2 |
|
| 3 |
import streamlit as st
|
| 4 |
import pandas as pd
|
|
@@ -32,11 +33,20 @@ def load_and_process_data():
|
|
| 32 |
dfs = []
|
| 33 |
for f in files_list:
|
| 34 |
try:
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
df['source_file'] = os.path.basename(f)
|
| 37 |
dfs.append(df)
|
| 38 |
except Exception as e:
|
| 39 |
st.error(f"Error reading {os.path.basename(f)}: {e}")
|
|
|
|
| 40 |
return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
|
| 41 |
|
| 42 |
raw_posts_df = read_files_to_dataframe(post_files)
|
|
@@ -47,8 +57,7 @@ def load_and_process_data():
|
|
| 47 |
processed_posts_df = processor.process_all_data(raw_posts_df) if not raw_posts_df.empty else pd.DataFrame()
|
| 48 |
processed_comments_df = processor.process_all_data(raw_comments_df) if not raw_comments_df.empty else pd.DataFrame()
|
| 49 |
|
| 50 |
-
#
|
| 51 |
-
# Ensure the columns were actually added. If not, revert to an empty DataFrame.
|
| 52 |
if not processed_posts_df.empty and 'prime_mentions' not in processed_posts_df.columns:
|
| 53 |
st.warning("Could not process 'posts' data correctly. Check data format.")
|
| 54 |
processed_posts_df = pd.DataFrame()
|
|
@@ -56,7 +65,6 @@ def load_and_process_data():
|
|
| 56 |
if not processed_comments_df.empty and 'prime_mentions' not in processed_comments_df.columns:
|
| 57 |
st.warning("Could not process 'comments' data correctly. Check data format.")
|
| 58 |
processed_comments_df = pd.DataFrame()
|
| 59 |
-
# --- END OF CHECK ---
|
| 60 |
|
| 61 |
all_text_df = pd.concat([processed_posts_df, processed_comments_df], ignore_index=True)
|
| 62 |
|
|
@@ -83,9 +91,6 @@ if all_text_df.empty or insights is None:
|
|
| 83 |
prime_posts_df = posts_df[posts_df['prime_mentions'] > 0].copy() if not posts_df.empty and 'prime_mentions' in posts_df else pd.DataFrame()
|
| 84 |
prime_all_text_df = all_text_df[all_text_df['prime_mentions'] > 0].copy() if not all_text_df.empty and 'prime_mentions' in all_text_df else pd.DataFrame()
|
| 85 |
|
| 86 |
-
# The rest of the app.py file remains the same as the previous version...
|
| 87 |
-
# ... (KPI Section, Tabbed Interface, etc.) ...
|
| 88 |
-
|
| 89 |
# --- KPI Section ---
|
| 90 |
st.header("📈 Prime Bank Mention KPIs")
|
| 91 |
kpi1, kpi2 = st.columns(2)
|
|
@@ -141,10 +146,15 @@ with tab2:
|
|
| 141 |
with st.expander("Read Emotion Insights"):
|
| 142 |
emotion_insight = insights.get('emotion', {})
|
| 143 |
st.markdown(f"**Summary:** {emotion_insight.get('summary', 'N/A')}")
|
|
|
|
|
|
|
| 144 |
for emotion, data in emotion_insight.get('details', {}).items():
|
| 145 |
st.markdown(f"**{emotion} is often about:** {data['themes']}")
|
| 146 |
-
|
| 147 |
-
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
with col2:
|
| 150 |
st.subheader("Post & Comment Categories")
|
|
@@ -163,6 +173,10 @@ with tab3:
|
|
| 163 |
if not posts_df.empty:
|
| 164 |
st.subheader("Processed Posts Data")
|
| 165 |
st.dataframe(posts_df)
|
|
|
|
| 166 |
if not all_text_df.empty and len(all_text_df) > len(posts_df):
|
| 167 |
st.subheader("Processed Comments & Reviews Data")
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# app.py
|
| 2 |
+
# THIS IS THE START OF THE FILE
|
| 3 |
|
| 4 |
import streamlit as st
|
| 5 |
import pandas as pd
|
|
|
|
| 33 |
dfs = []
|
| 34 |
for f in files_list:
|
| 35 |
try:
|
| 36 |
+
# Use different readers for different file types
|
| 37 |
+
if f.endswith('.csv'):
|
| 38 |
+
df = pd.read_csv(f)
|
| 39 |
+
else: # for .txt files
|
| 40 |
+
with open(f, 'r', encoding='utf-8') as file:
|
| 41 |
+
content = file.read()
|
| 42 |
+
posts = content.split('\n')
|
| 43 |
+
df = pd.DataFrame({'text': [p.strip() for p in posts if p.strip()]})
|
| 44 |
+
|
| 45 |
df['source_file'] = os.path.basename(f)
|
| 46 |
dfs.append(df)
|
| 47 |
except Exception as e:
|
| 48 |
st.error(f"Error reading {os.path.basename(f)}: {e}")
|
| 49 |
+
|
| 50 |
return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
|
| 51 |
|
| 52 |
raw_posts_df = read_files_to_dataframe(post_files)
|
|
|
|
| 57 |
processed_posts_df = processor.process_all_data(raw_posts_df) if not raw_posts_df.empty else pd.DataFrame()
|
| 58 |
processed_comments_df = processor.process_all_data(raw_comments_df) if not raw_comments_df.empty else pd.DataFrame()
|
| 59 |
|
| 60 |
+
# Secondary defensive check to ensure columns were added.
|
|
|
|
| 61 |
if not processed_posts_df.empty and 'prime_mentions' not in processed_posts_df.columns:
|
| 62 |
st.warning("Could not process 'posts' data correctly. Check data format.")
|
| 63 |
processed_posts_df = pd.DataFrame()
|
|
|
|
| 65 |
if not processed_comments_df.empty and 'prime_mentions' not in processed_comments_df.columns:
|
| 66 |
st.warning("Could not process 'comments' data correctly. Check data format.")
|
| 67 |
processed_comments_df = pd.DataFrame()
|
|
|
|
| 68 |
|
| 69 |
all_text_df = pd.concat([processed_posts_df, processed_comments_df], ignore_index=True)
|
| 70 |
|
|
|
|
| 91 |
prime_posts_df = posts_df[posts_df['prime_mentions'] > 0].copy() if not posts_df.empty and 'prime_mentions' in posts_df else pd.DataFrame()
|
| 92 |
prime_all_text_df = all_text_df[all_text_df['prime_mentions'] > 0].copy() if not all_text_df.empty and 'prime_mentions' in all_text_df else pd.DataFrame()
|
| 93 |
|
|
|
|
|
|
|
|
|
|
| 94 |
# --- KPI Section ---
|
| 95 |
st.header("📈 Prime Bank Mention KPIs")
|
| 96 |
kpi1, kpi2 = st.columns(2)
|
|
|
|
| 146 |
with st.expander("Read Emotion Insights"):
|
| 147 |
emotion_insight = insights.get('emotion', {})
|
| 148 |
st.markdown(f"**Summary:** {emotion_insight.get('summary', 'N/A')}")
|
| 149 |
+
|
| 150 |
+
# --- THIS IS THE CORRECTED CODE BLOCK ---
|
| 151 |
for emotion, data in emotion_insight.get('details', {}).items():
|
| 152 |
st.markdown(f"**{emotion} is often about:** {data['themes']}")
|
| 153 |
+
# Only show the example box if an example exists and is valid
|
| 154 |
+
if data.get('example') and data['example'] != "N/A":
|
| 155 |
+
st.write("Example:")
|
| 156 |
+
st.info(f"- \"{data['example'][:150]}...\"")
|
| 157 |
+
# --- END OF CORRECTED CODE BLOCK ---
|
| 158 |
|
| 159 |
with col2:
|
| 160 |
st.subheader("Post & Comment Categories")
|
|
|
|
| 173 |
if not posts_df.empty:
|
| 174 |
st.subheader("Processed Posts Data")
|
| 175 |
st.dataframe(posts_df)
|
| 176 |
+
# Check if there are any comments to display
|
| 177 |
if not all_text_df.empty and len(all_text_df) > len(posts_df):
|
| 178 |
st.subheader("Processed Comments & Reviews Data")
|
| 179 |
+
# Correctly slice the comments from the combined dataframe
|
| 180 |
+
st.dataframe(all_text_df.iloc[len(posts_df):].reset_index(drop=True))
|
| 181 |
+
|
| 182 |
+
# THIS IS THE END OF THE FILE
|
src/data_processor.py
CHANGED
|
@@ -219,10 +219,16 @@ class DataProcessor:
|
|
| 219 |
else:
|
| 220 |
return 'Other', 'General discussion or observation'
|
| 221 |
|
|
|
|
|
|
|
| 222 |
def process_all_data(self, df):
|
| 223 |
"""Apply all processing to dataframe"""
|
| 224 |
-
#
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
text_col = None
|
| 227 |
|
| 228 |
for col in text_columns:
|
|
@@ -230,11 +236,15 @@ class DataProcessor:
|
|
| 230 |
text_col = col
|
| 231 |
break
|
| 232 |
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
# Identify which bank each post is about
|
| 240 |
df[['primary_bank', 'all_banks_mentioned']] = df['text'].apply(
|
|
@@ -265,10 +275,11 @@ class DataProcessor:
|
|
| 265 |
df['viral_score'] += df['likes'].fillna(0)
|
| 266 |
if 'shares' in df.columns:
|
| 267 |
df['viral_score'] += df['shares'].fillna(0) * 2
|
| 268 |
-
if 'comments' in df.columns:
|
| 269 |
df['viral_score'] += df['comments'].fillna(0) * 1.5
|
| 270 |
|
| 271 |
# Add Prime Bank specific viral score boost
|
| 272 |
-
df.
|
|
|
|
| 273 |
|
| 274 |
return df
|
|
|
|
| 219 |
else:
|
| 220 |
return 'Other', 'General discussion or observation'
|
| 221 |
|
| 222 |
+
# In src/data_processor.py
|
| 223 |
+
|
| 224 |
def process_all_data(self, df):
|
| 225 |
"""Apply all processing to dataframe"""
|
| 226 |
+
# --- NEW, MORE ROBUST TEXT COLUMN FINDER ---
|
| 227 |
+
# If the dataframe is empty, return it immediately.
|
| 228 |
+
if df.empty:
|
| 229 |
+
return df
|
| 230 |
+
|
| 231 |
+
text_columns = ['text', 'content', 'message', 'review', 'comment', 'post', 'Text', 'Content', 'Post', 'Review Text']
|
| 232 |
text_col = None
|
| 233 |
|
| 234 |
for col in text_columns:
|
|
|
|
| 236 |
text_col = col
|
| 237 |
break
|
| 238 |
|
| 239 |
+
# If no text column is found, we cannot proceed. Return the empty shell.
|
| 240 |
+
if not text_col:
|
| 241 |
+
st.warning(f"Could not find a text column in one of the data sources.")
|
| 242 |
+
return pd.DataFrame(columns=df.columns) # Return with columns but no data
|
| 243 |
+
|
| 244 |
+
# If the found column is not 'text', rename it to 'text' for consistency.
|
| 245 |
+
if text_col != 'text':
|
| 246 |
+
df.rename(columns={text_col: 'text'}, inplace=True)
|
| 247 |
+
# --- END OF FIX ---
|
| 248 |
|
| 249 |
# Identify which bank each post is about
|
| 250 |
df[['primary_bank', 'all_banks_mentioned']] = df['text'].apply(
|
|
|
|
| 275 |
df['viral_score'] += df['likes'].fillna(0)
|
| 276 |
if 'shares' in df.columns:
|
| 277 |
df['viral_score'] += df['shares'].fillna(0) * 2
|
| 278 |
+
if 'comments' in df.columns: # This column name was missing from your old code
|
| 279 |
df['viral_score'] += df['comments'].fillna(0) * 1.5
|
| 280 |
|
| 281 |
# Add Prime Bank specific viral score boost
|
| 282 |
+
if not df.empty and 'prime_mentions' in df.columns:
|
| 283 |
+
df.loc[df['prime_mentions'] > 0, 'viral_score'] *= 1.2
|
| 284 |
|
| 285 |
return df
|
src/insights_generator.py
CHANGED
|
@@ -39,12 +39,23 @@ class InsightsGenerator:
|
|
| 39 |
|
| 40 |
return self.insights
|
| 41 |
|
|
|
|
|
|
|
| 42 |
def _get_common_words(self, text_series, top_n=5):
|
| 43 |
"""Helper function to find common keywords in a series of text."""
|
| 44 |
if text_series.empty:
|
| 45 |
return "No data"
|
| 46 |
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
all_text = ' '.join(text_series.astype(str).tolist()).lower()
|
| 49 |
words = re.findall(r'\b[a-z]{4,}\b', all_text) # Find words with 4+ letters
|
| 50 |
filtered_words = [word for word in words if word not in stop_words]
|
|
|
|
| 39 |
|
| 40 |
return self.insights
|
| 41 |
|
| 42 |
+
# In src/insights_generator.py
|
| 43 |
+
|
| 44 |
def _get_common_words(self, text_series, top_n=5):
|
| 45 |
"""Helper function to find common keywords in a series of text."""
|
| 46 |
if text_series.empty:
|
| 47 |
return "No data"
|
| 48 |
|
| 49 |
+
# --- NEW, IMPROVED STOP WORD LIST ---
|
| 50 |
+
stop_words = {
|
| 51 |
+
'the', 'a', 'an', 'is', 'i', 'to', 'for', 'in', 'it', 'and', 'my', 'of', 'prime', 'bank', 'banker',
|
| 52 |
+
'was', 'do', 'with', 'that', 'this', 'have', 'has', 'are', 'not',
|
| 53 |
+
# Common "Banglish" and filler words
|
| 54 |
+
'er', 'ta', 'ki', 'ami', 'amar', 'kore', 'hocche', 'bhalo', 'asholei', 'onek', 'apnar',
|
| 55 |
+
'sir', 'bro', 'please', 'help', 'need', 'know', 'want'
|
| 56 |
+
}
|
| 57 |
+
# --- END OF FIX ---
|
| 58 |
+
|
| 59 |
all_text = ' '.join(text_series.astype(str).tolist()).lower()
|
| 60 |
words = re.findall(r'\b[a-z]{4,}\b', all_text) # Find words with 4+ letters
|
| 61 |
filtered_words = [word for word in words if word not in stop_words]
|