FarhinSadia committed on
Commit
79b4249
·
1 Parent(s): 3452303
Files changed (3) hide show
  1. app.py +24 -10
  2. src/data_processor.py +20 -9
  3. src/insights_generator.py +12 -1
app.py CHANGED
@@ -1,4 +1,5 @@
1
  # app.py
 
2
 
3
  import streamlit as st
4
  import pandas as pd
@@ -32,11 +33,20 @@ def load_and_process_data():
32
  dfs = []
33
  for f in files_list:
34
  try:
35
- df = pd.read_csv(f) if f.endswith('.csv') else pd.DataFrame({'text': [p.strip() for p in open(f, 'r', encoding='utf-8').read().split('\n') if p.strip()]})
 
 
 
 
 
 
 
 
36
  df['source_file'] = os.path.basename(f)
37
  dfs.append(df)
38
  except Exception as e:
39
  st.error(f"Error reading {os.path.basename(f)}: {e}")
 
40
  return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
41
 
42
  raw_posts_df = read_files_to_dataframe(post_files)
@@ -47,8 +57,7 @@ def load_and_process_data():
47
  processed_posts_df = processor.process_all_data(raw_posts_df) if not raw_posts_df.empty else pd.DataFrame()
48
  processed_comments_df = processor.process_all_data(raw_comments_df) if not raw_comments_df.empty else pd.DataFrame()
49
 
50
- # --- SECONDARY DEFENSIVE CHECK ---
51
- # Ensure the columns were actually added. If not, revert to an empty DataFrame.
52
  if not processed_posts_df.empty and 'prime_mentions' not in processed_posts_df.columns:
53
  st.warning("Could not process 'posts' data correctly. Check data format.")
54
  processed_posts_df = pd.DataFrame()
@@ -56,7 +65,6 @@ def load_and_process_data():
56
  if not processed_comments_df.empty and 'prime_mentions' not in processed_comments_df.columns:
57
  st.warning("Could not process 'comments' data correctly. Check data format.")
58
  processed_comments_df = pd.DataFrame()
59
- # --- END OF CHECK ---
60
 
61
  all_text_df = pd.concat([processed_posts_df, processed_comments_df], ignore_index=True)
62
 
@@ -83,9 +91,6 @@ if all_text_df.empty or insights is None:
83
  prime_posts_df = posts_df[posts_df['prime_mentions'] > 0].copy() if not posts_df.empty and 'prime_mentions' in posts_df else pd.DataFrame()
84
  prime_all_text_df = all_text_df[all_text_df['prime_mentions'] > 0].copy() if not all_text_df.empty and 'prime_mentions' in all_text_df else pd.DataFrame()
85
 
86
- # The rest of the app.py file remains the same as the previous version...
87
- # ... (KPI Section, Tabbed Interface, etc.) ...
88
-
89
  # --- KPI Section ---
90
  st.header("📈 Prime Bank Mention KPIs")
91
  kpi1, kpi2 = st.columns(2)
@@ -141,10 +146,15 @@ with tab2:
141
  with st.expander("Read Emotion Insights"):
142
  emotion_insight = insights.get('emotion', {})
143
  st.markdown(f"**Summary:** {emotion_insight.get('summary', 'N/A')}")
 
 
144
  for emotion, data in emotion_insight.get('details', {}).items():
145
  st.markdown(f"**{emotion} is often about:** {data['themes']}")
146
- st.write(f"Example:")
147
- st.info(f"- \"{data['example'][:150]}...\"")
 
 
 
148
 
149
  with col2:
150
  st.subheader("Post & Comment Categories")
@@ -163,6 +173,10 @@ with tab3:
163
  if not posts_df.empty:
164
  st.subheader("Processed Posts Data")
165
  st.dataframe(posts_df)
 
166
  if not all_text_df.empty and len(all_text_df) > len(posts_df):
167
  st.subheader("Processed Comments & Reviews Data")
168
- st.dataframe(all_text_df.iloc[len(posts_df):].reset_index(drop=True))
 
 
 
 
1
  # app.py
2
+ # THIS IS THE START OF THE FILE
3
 
4
  import streamlit as st
5
  import pandas as pd
 
33
  dfs = []
34
  for f in files_list:
35
  try:
36
+ # Use different readers for different file types
37
+ if f.endswith('.csv'):
38
+ df = pd.read_csv(f)
39
+ else: # for .txt files
40
+ with open(f, 'r', encoding='utf-8') as file:
41
+ content = file.read()
42
+ posts = content.split('\n')
43
+ df = pd.DataFrame({'text': [p.strip() for p in posts if p.strip()]})
44
+
45
  df['source_file'] = os.path.basename(f)
46
  dfs.append(df)
47
  except Exception as e:
48
  st.error(f"Error reading {os.path.basename(f)}: {e}")
49
+
50
  return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
51
 
52
  raw_posts_df = read_files_to_dataframe(post_files)
 
57
  processed_posts_df = processor.process_all_data(raw_posts_df) if not raw_posts_df.empty else pd.DataFrame()
58
  processed_comments_df = processor.process_all_data(raw_comments_df) if not raw_comments_df.empty else pd.DataFrame()
59
 
60
+ # Secondary defensive check to ensure columns were added.
 
61
  if not processed_posts_df.empty and 'prime_mentions' not in processed_posts_df.columns:
62
  st.warning("Could not process 'posts' data correctly. Check data format.")
63
  processed_posts_df = pd.DataFrame()
 
65
  if not processed_comments_df.empty and 'prime_mentions' not in processed_comments_df.columns:
66
  st.warning("Could not process 'comments' data correctly. Check data format.")
67
  processed_comments_df = pd.DataFrame()
 
68
 
69
  all_text_df = pd.concat([processed_posts_df, processed_comments_df], ignore_index=True)
70
 
 
91
  prime_posts_df = posts_df[posts_df['prime_mentions'] > 0].copy() if not posts_df.empty and 'prime_mentions' in posts_df else pd.DataFrame()
92
  prime_all_text_df = all_text_df[all_text_df['prime_mentions'] > 0].copy() if not all_text_df.empty and 'prime_mentions' in all_text_df else pd.DataFrame()
93
 
 
 
 
94
  # --- KPI Section ---
95
  st.header("📈 Prime Bank Mention KPIs")
96
  kpi1, kpi2 = st.columns(2)
 
146
  with st.expander("Read Emotion Insights"):
147
  emotion_insight = insights.get('emotion', {})
148
  st.markdown(f"**Summary:** {emotion_insight.get('summary', 'N/A')}")
149
+
150
+ # --- THIS IS THE CORRECTED CODE BLOCK ---
151
  for emotion, data in emotion_insight.get('details', {}).items():
152
  st.markdown(f"**{emotion} is often about:** {data['themes']}")
153
+ # Only show the example box if an example exists and is valid
154
+ if data.get('example') and data['example'] != "N/A":
155
+ st.write("Example:")
156
+ st.info(f"- \"{data['example'][:150]}...\"")
157
+ # --- END OF CORRECTED CODE BLOCK ---
158
 
159
  with col2:
160
  st.subheader("Post & Comment Categories")
 
173
  if not posts_df.empty:
174
  st.subheader("Processed Posts Data")
175
  st.dataframe(posts_df)
176
+ # Check if there are any comments to display
177
  if not all_text_df.empty and len(all_text_df) > len(posts_df):
178
  st.subheader("Processed Comments & Reviews Data")
179
+ # Correctly slice the comments from the combined dataframe
180
+ st.dataframe(all_text_df.iloc[len(posts_df):].reset_index(drop=True))
181
+
182
+ # THIS IS THE END OF THE FILE
src/data_processor.py CHANGED
@@ -219,10 +219,16 @@ class DataProcessor:
219
  else:
220
  return 'Other', 'General discussion or observation'
221
 
 
 
222
  def process_all_data(self, df):
223
  """Apply all processing to dataframe"""
224
- # Find text column
225
- text_columns = ['text', 'content', 'message', 'review', 'comment', 'post', 'Text', 'Content']
 
 
 
 
226
  text_col = None
227
 
228
  for col in text_columns:
@@ -230,11 +236,15 @@ class DataProcessor:
230
  text_col = col
231
  break
232
 
233
- if text_col and text_col != 'text':
234
- df['text'] = df[text_col]
235
-
236
- if 'text' not in df.columns:
237
- return df
 
 
 
 
238
 
239
  # Identify which bank each post is about
240
  df[['primary_bank', 'all_banks_mentioned']] = df['text'].apply(
@@ -265,10 +275,11 @@ class DataProcessor:
265
  df['viral_score'] += df['likes'].fillna(0)
266
  if 'shares' in df.columns:
267
  df['viral_score'] += df['shares'].fillna(0) * 2
268
- if 'comments' in df.columns:
269
  df['viral_score'] += df['comments'].fillna(0) * 1.5
270
 
271
  # Add Prime Bank specific viral score boost
272
- df.loc[df['prime_mentions'] > 0, 'viral_score'] *= 1.2
 
273
 
274
  return df
 
219
  else:
220
  return 'Other', 'General discussion or observation'
221
 
222
+ # In src/data_processor.py
223
+
224
  def process_all_data(self, df):
225
  """Apply all processing to dataframe"""
226
+ # --- NEW, MORE ROBUST TEXT COLUMN FINDER ---
227
+ # If the dataframe is empty, return it immediately.
228
+ if df.empty:
229
+ return df
230
+
231
+ text_columns = ['text', 'content', 'message', 'review', 'comment', 'post', 'Text', 'Content', 'Post', 'Review Text']
232
  text_col = None
233
 
234
  for col in text_columns:
 
236
  text_col = col
237
  break
238
 
239
+ # If no text column is found, we cannot proceed. Return the empty shell.
240
+ if not text_col:
241
+ st.warning(f"Could not find a text column in one of the data sources.")
242
+ return pd.DataFrame(columns=df.columns) # Return with columns but no data
243
+
244
+ # If the found column is not 'text', rename it to 'text' for consistency.
245
+ if text_col != 'text':
246
+ df.rename(columns={text_col: 'text'}, inplace=True)
247
+ # --- END OF FIX ---
248
 
249
  # Identify which bank each post is about
250
  df[['primary_bank', 'all_banks_mentioned']] = df['text'].apply(
 
275
  df['viral_score'] += df['likes'].fillna(0)
276
  if 'shares' in df.columns:
277
  df['viral_score'] += df['shares'].fillna(0) * 2
278
+ if 'comments' in df.columns: # This column name was missing from your old code
279
  df['viral_score'] += df['comments'].fillna(0) * 1.5
280
 
281
  # Add Prime Bank specific viral score boost
282
+ if not df.empty and 'prime_mentions' in df.columns:
283
+ df.loc[df['prime_mentions'] > 0, 'viral_score'] *= 1.2
284
 
285
  return df
src/insights_generator.py CHANGED
@@ -39,12 +39,23 @@ class InsightsGenerator:
39
 
40
  return self.insights
41
 
 
 
42
  def _get_common_words(self, text_series, top_n=5):
43
  """Helper function to find common keywords in a series of text."""
44
  if text_series.empty:
45
  return "No data"
46
 
47
- stop_words = {'the', 'a', 'an', 'is', 'i', 'to', 'for', 'in', 'it', 'and', 'my', 'of', 'prime', 'bank'}
 
 
 
 
 
 
 
 
 
48
  all_text = ' '.join(text_series.astype(str).tolist()).lower()
49
  words = re.findall(r'\b[a-z]{4,}\b', all_text) # Find words with 4+ letters
50
  filtered_words = [word for word in words if word not in stop_words]
 
39
 
40
  return self.insights
41
 
42
+ # In src/insights_generator.py
43
+
44
  def _get_common_words(self, text_series, top_n=5):
45
  """Helper function to find common keywords in a series of text."""
46
  if text_series.empty:
47
  return "No data"
48
 
49
+ # --- NEW, IMPROVED STOP WORD LIST ---
50
+ stop_words = {
51
+ 'the', 'a', 'an', 'is', 'i', 'to', 'for', 'in', 'it', 'and', 'my', 'of', 'prime', 'bank', 'banker',
52
+ 'was', 'do', 'with', 'that', 'this', 'have', 'has', 'are', 'not',
53
+ # Common "Banglish" and filler words
54
+ 'er', 'ta', 'ki', 'ami', 'amar', 'kore', 'hocche', 'bhalo', 'asholei', 'onek', 'apnar',
55
+ 'sir', 'bro', 'please', 'help', 'need', 'know', 'want'
56
+ }
57
+ # --- END OF FIX ---
58
+
59
  all_text = ' '.join(text_series.astype(str).tolist()).lower()
60
  words = re.findall(r'\b[a-z]{4,}\b', all_text) # Find words with 4+ letters
61
  filtered_words = [word for word in words if word not in stop_words]