HussainM899 committed on
Commit
b744269
·
verified ·
1 Parent(s): 8808059

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +559 -558
app.py CHANGED
@@ -1,558 +1,559 @@
1
- import os
2
- from dotenv import load_dotenv
3
- import streamlit as st
4
- import pandas as pd
5
- import plotly.express as px
6
- import google.generativeai as genai
7
- from langchain_google_genai import GoogleGenerativeAI
8
- from io import BytesIO
9
-
10
- # Load environment variables
11
- load_dotenv()
12
-
13
- # Get API key securely
14
- def get_api_key():
15
- """Get API key from environment variables or secrets."""
16
- try:
17
- return st.secrets['GOOGLE_API_KEY']
18
- except:
19
- return os.getenv('GOOGLE_API_KEY')
20
-
21
- # Set the API key
22
- GOOGLE_API_KEY = get_api_key()
23
-
24
- # Configure Gemini
25
- if GOOGLE_API_KEY:
26
- genai.configure(api_key=GOOGLE_API_KEY)
27
- # Configure page settings
28
- st.set_page_config(page_title="Excel Automation App", layout="wide")
29
-
30
- # Declare CADRE_MAPPINGS at the top level of your script, before any functions
31
- CADRE_MAPPINGS = {
32
- "District NSTOP Officer": "District Level",
33
- "DCO/DHCSO": "District Level",
34
- "Disease Surveillance Officer": "District Level",
35
- "Immunization Officer": "District Level",
36
- "Federal/Provincial/District Facilitator": "District Level",
37
- "Divisional NSTOP Officer": "District Level",
38
- "ComNET staff": "District Level",
39
- "Area Coordinator / District Coordinator": "District Level",
40
- "Provincial Facilitator (M&E, Campaign, HRMP, etc.)": "District Level",
41
- "DDHO": "District Level",
42
- "CEO/DHO": "District Level",
43
- "DSV / ASV": "District Level",
44
- "Federal Facilitator (UNICEF)": "Federal Level",
45
- "EPI Coordinator": "Provincial Level",
46
- "Provincial Facilitator (EPI, Coordinator etc)": "Provincial Level",
47
- "Federal/Provincial/District Facilitator": "Provincial Level",
48
- "TPO/ TDO": "Town Level",
49
- "ComNET staff": "Town Level",
50
- "TCO": "Town Level",
51
- "UCPO / UCSP/ UCDO": "UC Level",
52
- "UCMO": "UC Level",
53
- "TTSP/TUSP": "UC Level",
54
- "Social Mobilizers": "UC Level",
55
- "Independent Monitor": "UC Level",
56
- }
57
-
58
- def upload_and_parse_file(uploaded_file):
59
- """Handle file upload and parsing."""
60
- try:
61
- # Detect file type and parse accordingly
62
- if uploaded_file.name.endswith(".csv"):
63
- df = pd.read_csv(uploaded_file)
64
- else:
65
- # Handle multi-level headers
66
- df = pd.read_excel(uploaded_file, header=[0, 1])
67
-
68
- # If multi-level headers exist, combine them
69
- if isinstance(df.columns, pd.MultiIndex):
70
- df.columns = [' '.join(str(col) for col in cols if str(col) != 'nan').strip()
71
- for cols in df.columns.values]
72
-
73
- return df
74
- except Exception as e:
75
- st.error(f"Error reading file: {str(e)}")
76
- return None
77
-
78
- def clean_data(df):
79
- """Perform data cleaning on the DataFrame."""
80
- try:
81
- # Remove duplicate rows
82
- df = df.drop_duplicates()
83
-
84
- # Fill NA values
85
- df = df.fillna("N/A")
86
-
87
- # Remove leading/trailing whitespace from string columns
88
- for col in df.select_dtypes(include=['object']):
89
- df[col] = df[col].str.strip()
90
-
91
- return df
92
- except Exception as e:
93
- st.error(f"Error cleaning data: {str(e)}")
94
- return df
95
-
96
- def map_designations(df, column_name="designation_title"):
97
- """Map designations to cadres dynamically."""
98
- try:
99
- if column_name not in df.columns:
100
- st.error(f"Column '{column_name}' not found in the uploaded file.")
101
- return df
102
-
103
- # Create Cadre column using the mapping
104
- df["Cadre"] = df[column_name].map(CADRE_MAPPINGS).fillna("Unmapped")
105
- return df
106
- except Exception as e:
107
- st.error(f"Error mapping designations: {str(e)}")
108
- return df
109
-
110
- def handle_new_designations(df, column_name="designation_title"):
111
- """Handle new designations and update the CADRE_MAPPINGS dictionary."""
112
- try:
113
- # Get current designations that aren't in our mapping
114
- current_designations = set(df[df['Cadre'] == 'Unmapped'][column_name].unique())
115
-
116
- if current_designations:
117
- st.warning(f"📝 Found {len(current_designations)} new designation(s) that need mapping!")
118
-
119
- # Available cadre levels (predefined options only)
120
- CADRE_LEVELS = [
121
- "District Level",
122
- "Federal Level",
123
- "Provincial Level",
124
- "Town Level",
125
- "UC Level"
126
- ]
127
-
128
- # Create a container for new mappings
129
- new_mappings = {}
130
-
131
- with st.expander("Map New Designations", expanded=True):
132
- st.markdown("### New Designations Found")
133
- st.markdown("Please assign appropriate cadres to the following designations:")
134
-
135
- # Create a form for mapping new designations
136
- for designation in current_designations:
137
- col1, col2 = st.columns([2, 1])
138
- with col1:
139
- st.text(designation)
140
- with col2:
141
- selected_cadre = st.selectbox(
142
- "Select Cadre",
143
- options=CADRE_LEVELS,
144
- key=f"new_designation_{designation}"
145
- )
146
- new_mappings[designation] = selected_cadre
147
-
148
- # Button to confirm mappings
149
- if st.button("Confirm New Mappings"):
150
- # Update CADRE_MAPPINGS
151
- CADRE_MAPPINGS.update(new_mappings)
152
-
153
- # Update the DataFrame with new mappings
154
- df["Cadre"] = df[column_name].map(CADRE_MAPPINGS).fillna("Unmapped")
155
-
156
- st.success("✅ Mappings updated successfully!")
157
-
158
- # Show the new mappings
159
- st.markdown("### New Mappings Added:")
160
- for designation, cadre in new_mappings.items():
161
- st.markdown(f"- **{designation}**: {cadre}")
162
-
163
- # Option to export updated mappings
164
- if st.button("Export Updated Mappings"):
165
- export_mappings(CADRE_MAPPINGS)
166
-
167
- return df
168
-
169
- except Exception as e:
170
- st.error(f"Error handling new designations: {str(e)}")
171
- return df
172
-
173
- def show_interactive_preview(df):
174
- """Show interactive data preview with enhanced features."""
175
- st.subheader("📋 Interactive Data Preview")
176
-
177
- # View options in an expander
178
- with st.expander("🔧 View Options", expanded=False):
179
- # Column selection
180
- cols = st.multiselect(
181
- "Select columns to display:",
182
- df.columns.tolist(),
183
- default=df.columns.tolist()
184
- )
185
-
186
- # Row count slider
187
- row_count = st.slider(
188
- "Number of rows to display:",
189
- min_value=5,
190
- max_value=len(df),
191
- value=min(50, len(df))
192
- )
193
-
194
- # Index visibility
195
- hide_index = st.checkbox("Hide index", value=True)
196
-
197
- # Search and filter in an expander
198
- with st.expander("🔍 Search & Filters", expanded=False):
199
- # Global search
200
- search = st.text_input("Search in all columns:", "")
201
-
202
- # Column-specific filters
203
- filter_col = st.selectbox("Filter by column:", ["None"] + df.columns.tolist())
204
-
205
- if filter_col != "None":
206
- if df[filter_col].dtype in ['int64', 'float64']:
207
- # Numeric filter
208
- min_val, max_val = st.slider(
209
- f"Range for {filter_col}:",
210
- float(df[filter_col].min()),
211
- float(df[filter_col].max()),
212
- (float(df[filter_col].min()), float(df[filter_col].max()))
213
- )
214
- else:
215
- # Category filter
216
- unique_vals = df[filter_col].unique().tolist()
217
- selected_vals = st.multiselect(
218
- f"Select values for {filter_col}:",
219
- unique_vals,
220
- default=unique_vals
221
- )
222
-
223
- # Apply filters
224
- filtered_df = df.copy()
225
-
226
- # Apply search
227
- if search:
228
- mask = filtered_df.astype(str).apply(
229
- lambda x: x.str.contains(search, case=False)
230
- ).any(axis=1)
231
- filtered_df = filtered_df[mask]
232
-
233
- # Apply column filter
234
- if filter_col != "None":
235
- if df[filter_col].dtype in ['int64', 'float64']:
236
- filtered_df = filtered_df[
237
- (filtered_df[filter_col] >= min_val) &
238
- (filtered_df[filter_col] <= max_val)
239
- ]
240
- else:
241
- filtered_df = filtered_df[filtered_df[filter_col].isin(selected_vals)]
242
-
243
- # Show the filtered dataframe
244
- st.dataframe(
245
- filtered_df[cols].head(row_count),
246
- use_container_width=True,
247
- height=400, # Fixed height for scrolling
248
- hide_index=hide_index,
249
- )
250
-
251
- # Show statistics
252
- col1, col2, col3 = st.columns(3)
253
- with col1:
254
- st.caption(f"Showing {len(filtered_df)} of {len(df)} rows")
255
- with col2:
256
- st.caption(f"Selected {len(cols)} columns")
257
- with col3:
258
- st.caption(f"Memory usage: {df.memory_usage().sum() / 1024:.2f} KB")
259
-
260
- return filtered_df
261
-
262
- def show_visualizations(df):
263
- """Display various visualizations of the data."""
264
- try:
265
- st.subheader("📊 Data Visualizations")
266
-
267
- # Cadre distribution if available
268
- if "Cadre" in df.columns:
269
- with st.expander("Cadre Distribution", expanded=True):
270
- fig_cadre = px.pie(df, names="Cadre", title="Distribution of Cadres")
271
- st.plotly_chart(fig_cadre, use_container_width=True)
272
-
273
- # Numeric column distributions
274
- numeric_cols = df.select_dtypes(include=['number']).columns
275
- if len(numeric_cols) > 0:
276
- with st.expander("Numeric Distributions", expanded=False):
277
- selected_column = st.selectbox(
278
- "Select numeric column for distribution",
279
- numeric_cols
280
- )
281
- fig_dist = px.histogram(
282
- df,
283
- x=selected_column,
284
- title=f"Distribution of {selected_column}"
285
- )
286
- st.plotly_chart(fig_dist, use_container_width=True)
287
-
288
- # Correlation matrix for numeric columns
289
- if len(numeric_cols) > 1:
290
- with st.expander("Correlation Matrix", expanded=False):
291
- corr_matrix = df[numeric_cols].corr()
292
- fig_corr = px.imshow(
293
- corr_matrix,
294
- title="Correlation Matrix"
295
- )
296
- st.plotly_chart(fig_corr, use_container_width=True)
297
-
298
- except Exception as e:
299
- st.error(f"Error creating visualizations: {str(e)}")
300
-
301
- def query_gemini(df, question):
302
- """Query Gemini AI with enhanced analytics capabilities"""
303
- try:
304
- if not GOOGLE_API_KEY:
305
- st.error("Google API Key not configured")
306
- return "Error: API Key not found"
307
-
308
- llm = GoogleGenerativeAI(
309
- model="gemini-1.5-pro",
310
- google_api_key=GOOGLE_API_KEY,
311
- temperature=0.1
312
- )
313
-
314
- # Analyze the question to determine what data to include
315
- question_lower = question.lower()
316
-
317
- # Initialize context parts
318
- context_parts = []
319
-
320
- # Add basic dataset info
321
- context_parts.append(f"Total Records: {len(df)}")
322
- context_parts.append(f"Available Columns: {', '.join(df.columns.tolist())}")
323
-
324
- # Add relevant data based on question
325
- if 'district' in question_lower:
326
- district_counts = df['district_name'].value_counts()
327
- context_parts.append("\nDistrict Information:")
328
- context_parts.append(f"Total Districts: {len(district_counts)}")
329
- context_parts.append("Top Districts by Count:")
330
- context_parts.append(district_counts.head().to_string())
331
-
332
- if 'cadre' in question_lower:
333
- cadre_counts = df['Cadre'].value_counts()
334
- context_parts.append("\nCadre Information:")
335
- context_parts.append(cadre_counts.to_string())
336
-
337
- if 'designation' in question_lower:
338
- designation_counts = df['designation_title'].value_counts()
339
- context_parts.append("\nDesignation Information:")
340
- context_parts.append(designation_counts.head().to_string())
341
-
342
- # For questions about "most" or "highest"
343
- if any(word in question_lower for word in ['most', 'highest', 'maximum', 'top']):
344
- if 'district' in question_lower:
345
- top_district = df['district_name'].value_counts().head(1)
346
- context_parts.append(f"\nHighest Count District:")
347
- context_parts.append(f"{top_district.index[0]}: {top_district.values[0]} records")
348
-
349
- # Combine all context parts
350
- context = "\n".join(context_parts)
351
-
352
- prompt = f"""You are an expert Operational data analyst who has more than 15 years of experience in Polio Program internationally. Answer the following question using the provided data:
353
-
354
- Context:
355
- {context}
356
-
357
- Question: {question}
358
-
359
- Requirements for your answer:
360
- 1. Give ONLY the exact answer with specific numbers
361
- 2. For questions about "most" or "highest", give the specific name and count
362
- 3. Format: "[Name/Value] with [count] records" or similar
363
- 4. If asking about a specific column, give values from that column only
364
- 5. Do not mention other columns unless specifically asked
365
- 6. Do not explain methodology
366
- 7. Keep response to one sentence
367
- 8. If data isn't available, say "Data not available"
368
-
369
- Examples:
370
- Q: "Which district has most data?"
371
- A: "Karachi South with 1,234 records."
372
-
373
- Q: "What is the total count?"
374
- A: "The dataset contains 5,678 total records."
375
-
376
- Answer the question directly and concisely."""
377
-
378
- with st.spinner('Analyzing data...'):
379
- response = llm.invoke(prompt)
380
-
381
- # Debug logging
382
- st.session_state['last_context'] = context
383
- st.session_state['last_response'] = response
384
-
385
- return response
386
-
387
- except Exception as e:
388
- st.error(f"Error in analysis: {str(e)}")
389
- return "Error occurred during analysis"
390
-
391
- def export_data(df):
392
- """Allow users to download the processed DataFrame."""
393
- try:
394
- towrite = BytesIO()
395
- df.to_excel(towrite, index=False, engine="openpyxl")
396
- towrite.seek(0)
397
-
398
- return st.download_button(
399
- label="📥 Download Processed Data",
400
- data=towrite,
401
- file_name="processed_data.xlsx",
402
- mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
403
- )
404
- except Exception as e:
405
- st.error(f"Error exporting data: {str(e)}")
406
-
407
- def export_mappings(mappings):
408
- """Export the updated mappings dictionary."""
409
- try:
410
- import json
411
- mappings_json = json.dumps(mappings, indent=4)
412
- st.download_button(
413
- label="📥 Download Mappings",
414
- data=mappings_json,
415
- file_name="cadre_mappings.json",
416
- mime="application/json"
417
- )
418
- except Exception as e:
419
- st.error(f"Error exporting mappings: {str(e)}")
420
-
421
- def main():
422
- """Main application function."""
423
- try:
424
- st.title("📊 Excel Automation App with Gemini AI")
425
-
426
- # Add sidebar for app navigation
427
- st.sidebar.title("Navigation")
428
- app_mode = st.sidebar.selectbox(
429
- "Choose the app mode",
430
- ["Data Processing", "Analysis & Visualization", "About"]
431
- )
432
-
433
- if app_mode == "About":
434
- st.markdown("""
435
- ### About this app
436
- This app helps you process Excel files and analyze data using AI.
437
-
438
- #### Features:
439
- - Upload and process Excel/CSV files
440
- - Automatic data cleaning
441
- - Interactive data preview
442
- - Designation to Cadre mapping
443
- - AI-powered analysis
444
- - Data visualization
445
- - Export processed data
446
-
447
- #### How to use:
448
- 1. Upload your file
449
- 2. Review and clean the data
450
- 3. Map designations to cadres
451
- 4. Analyze using AI
452
- 5. Export processed data
453
- """)
454
- return
455
-
456
- # Create two columns for layout
457
- col1, col2 = st.columns([2, 1])
458
-
459
- with col1:
460
- uploaded_file = st.file_uploader("Upload your file (CSV/XLS/XLSX)", type=["csv", "xls", "xlsx"])
461
-
462
- if uploaded_file:
463
- try:
464
- # Use the upload_and_parse_file function
465
- df = upload_and_parse_file(uploaded_file)
466
- if df is not None:
467
- st.success("File uploaded successfully!")
468
-
469
- # Clean data with progress indicator
470
- with st.spinner('Cleaning data...'):
471
- df = clean_data(df)
472
-
473
- # Map designations to cadres
474
- with st.spinner('Mapping designations to cadres...'):
475
- df = map_designations(df)
476
-
477
- # Show the unique designations that weren't mapped
478
- unmapped = df[df['Cadre'] == 'Unmapped']['designation_title'].unique()
479
- if len(unmapped) > 0:
480
- st.warning(f"Found {len(unmapped)} unmapped designations!")
481
-
482
- if app_mode == "Data Processing":
483
- # Handle new designations if any are unmapped
484
- if len(unmapped) > 0:
485
- df = handle_new_designations(df)
486
- # Reapply mapping after handling new designations
487
- df = map_designations(df)
488
-
489
- # Show interactive preview
490
- filtered_df = show_interactive_preview(df)
491
-
492
- # Export Options
493
- st.subheader("📥 Export Options")
494
- col1, col2 = st.columns(2)
495
- with col1:
496
- export_data(filtered_df)
497
- with col2:
498
- export_mappings(CADRE_MAPPINGS)
499
-
500
- elif app_mode == "Analysis & Visualization":
501
- show_visualizations(df)
502
-
503
- # Gemini AI Query Section
504
- st.subheader("💬 Ask Gemini AI about your data")
505
-
506
- # Add suggested questions
507
- suggested_questions = [
508
- f"How many total records are in the dataset?",
509
- f"What is the exact count and percentage for each Cadre level?",
510
- f"How many unmapped designations are there?",
511
- f"What is the most common Cadre level?",
512
- f"What percentage of staff is at the District Level?",
513
- "Custom Question"
514
- ]
515
-
516
- question_type = st.selectbox(
517
- "Choose a question type:",
518
- suggested_questions
519
- )
520
-
521
- if question_type == "Custom Question":
522
- question = st.text_input("Enter your question about the data:")
523
- else:
524
- question = question_type
525
-
526
- if question:
527
- with st.spinner('Analyzing data...'):
528
- response = query_gemini(df, question)
529
- st.markdown("### Analysis Results")
530
- st.markdown(response)
531
-
532
- # Add debug expander
533
- with st.expander("Debug Information", expanded=False):
534
- if 'last_context' in st.session_state:
535
- st.text("Context sent to AI:")
536
- st.code(st.session_state['last_context'])
537
- if 'last_response' in st.session_state:
538
- st.text("Raw AI Response:")
539
- st.code(st.session_state['last_response'])
540
-
541
- if st.button("Generate Follow-up Questions"):
542
- follow_up_prompt = f"Based on the previous analysis about '{question}', what are 3 relevant follow-up questions we could ask about this data?"
543
- follow_up_response = query_gemini(df, follow_up_prompt)
544
- st.markdown("### Suggested Follow-up Questions")
545
- st.markdown(follow_up_response)
546
-
547
- except Exception as e:
548
- st.error(f"Error processing file: {str(e)}")
549
-
550
- # Add footer
551
- st.markdown("---")
552
- st.markdown("Built with Streamlit and Gemini AI")
553
-
554
- except Exception as e:
555
- st.error(f"An error occurred: {str(e)}")
556
-
557
- if __name__ == "__main__":
558
- main()
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ import streamlit as st
4
+ import pandas as pd
5
+ import plotly.express as px
6
+ import google.generativeai as genai
7
+ from langchain_google_genai import GoogleGenerativeAI
8
+ from io import BytesIO
9
+
10
+ # Load environment variables
11
+ load_dotenv()
12
+
13
+ # Get API key securely
14
def get_api_key():
    """Return the Google API key from Streamlit secrets, else the environment.

    Returns:
        The value of ``st.secrets['GOOGLE_API_KEY']`` when that lookup
        succeeds; otherwise ``os.getenv('GOOGLE_API_KEY')``, which is ``None``
        when the variable is not set.
    """
    try:
        return st.secrets['GOOGLE_API_KEY']
    except Exception:
        # Any failure of the secrets lookup (missing file, missing key, ...)
        # falls back to the environment. ``Exception`` rather than a bare
        # ``except`` so KeyboardInterrupt/SystemExit still propagate.
        return os.getenv('GOOGLE_API_KEY')
20
+
21
# Configure page settings before anything else touches Streamlit:
# st.set_page_config must be the first Streamlit command of the script, and
# the secrets lookup inside get_api_key() may itself emit Streamlit output
# when no secrets file exists (NOTE(review): seen on older Streamlit
# releases - confirm for the deployed version).
st.set_page_config(page_title="Excel Automation App", layout="wide")

# Resolve the API key once per script run.
GOOGLE_API_KEY = get_api_key()

# Configure Gemini only when a key is available; query_gemini() reports the
# missing key to the user otherwise.
if GOOGLE_API_KEY:
    genai.configure(api_key=GOOGLE_API_KEY)
29
+
30
+ # Declare CADRE_MAPPINGS at the top level of your script, before any functions
31
# Designation title -> cadre level lookup used when building the "Cadre"
# column.
#
# NOTE(review): the original literal listed "Federal/Provincial/District
# Facilitator" and "ComNET staff" twice (first as "District Level"). In a dict
# literal the last binding wins, so only the "Provincial Level" / "Town Level"
# values were ever effective. The dead first bindings are removed here; the
# lookup results are unchanged. Confirm these are the intended levels.
CADRE_MAPPINGS = {
    # District level
    "District NSTOP Officer": "District Level",
    "DCO/DHCSO": "District Level",
    "Disease Surveillance Officer": "District Level",
    "Immunization Officer": "District Level",
    "Divisional NSTOP Officer": "District Level",
    "Area Coordinator / District Coordinator": "District Level",
    "Provincial Facilitator (M&E, Campaign, HRMP, etc.)": "District Level",
    "DDHO": "District Level",
    "CEO/DHO": "District Level",
    "DSV / ASV": "District Level",
    # Federal level
    "Federal Facilitator (UNICEF)": "Federal Level",
    # Provincial level
    "EPI Coordinator": "Provincial Level",
    "Provincial Facilitator (EPI, Coordinator etc)": "Provincial Level",
    "Federal/Provincial/District Facilitator": "Provincial Level",
    # Town level
    "TPO/ TDO": "Town Level",
    "ComNET staff": "Town Level",
    "TCO": "Town Level",
    # UC level
    "UCPO / UCSP/ UCDO": "UC Level",
    "UCMO": "UC Level",
    "TTSP/TUSP": "UC Level",
    "Social Mobilizers": "UC Level",
    "Independent Monitor": "UC Level",
}
57
+
58
def _join_header_levels(levels):
    """Collapse one multi-level column label into a single string.

    Levels that are blank in the sheet are skipped: both the literal 'nan'
    and the 'Unnamed: ...' placeholders pandas generates for empty header
    cells. If every level is a placeholder, the original joined label is kept
    so the column still has a non-empty name.
    """
    texts = [str(level) for level in levels]
    kept = [
        text for text in texts
        if text != 'nan' and not text.startswith('Unnamed:')
    ]
    return ' '.join(kept).strip() or ' '.join(texts).strip()


def upload_and_parse_file(uploaded_file):
    """Parse an uploaded CSV or Excel file into a DataFrame.

    Args:
        uploaded_file: File-like object with a ``name`` attribute. Names
            ending in ".csv" (any letter case) are read with
            ``pd.read_csv``; everything else is read with ``pd.read_excel``
            using the first two rows as a two-level header.

    Returns:
        The parsed DataFrame with single-level string column labels, or
        ``None`` if reading fails (the error is shown via ``st.error``).
    """
    try:
        # Case-insensitive so "DATA.CSV" is not sent to the Excel reader.
        if uploaded_file.name.lower().endswith(".csv"):
            df = pd.read_csv(uploaded_file)
        else:
            # Handle multi-level headers
            df = pd.read_excel(uploaded_file, header=[0, 1])

        # If multi-level headers exist, combine them into one label each.
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = [_join_header_levels(levels) for levels in df.columns.values]

        return df
    except Exception as e:
        st.error(f"Error reading file: {str(e)}")
        return None
77
+
78
def clean_data(df):
    """Return a cleaned copy of ``df``.

    Steps, in order: drop exact duplicate rows, replace missing values with
    the string "N/A", then strip leading/trailing whitespace from string
    cells.

    Args:
        df: DataFrame to clean. It is not modified.

    Returns:
        The cleaned DataFrame, or the input ``df`` unchanged if cleaning
        raises (the error is shown via ``st.error``).
    """
    try:
        cleaned = df.drop_duplicates().fillna("N/A")

        for col in cleaned.columns:
            series = cleaned[col]
            if not (pd.api.types.is_object_dtype(series)
                    or pd.api.types.is_string_dtype(series)):
                continue
            # Strip only genuine strings. A numeric column that had gaps is
            # object dtype after fillna("N/A"); the ``.str`` accessor would
            # turn every non-string cell in it into NaN.
            cleaned[col] = series.map(
                lambda value: value.strip() if isinstance(value, str) else value
            )

        return cleaned
    except Exception as e:
        st.error(f"Error cleaning data: {str(e)}")
        return df
95
+
96
def map_designations(df, column_name="designation_title"):
    """Add a "Cadre" column derived from the designation column.

    Each value of ``df[column_name]`` is looked up in ``CADRE_MAPPINGS``;
    values without an entry become "Unmapped". The column is written onto the
    DataFrame that was passed in.

    Args:
        df: DataFrame to annotate.
        column_name: Name of the column holding designation titles.

    Returns:
        ``df`` in every case. When ``column_name`` is missing or the mapping
        raises, an error is shown via ``st.error`` and ``df`` is returned
        without further changes.
    """
    try:
        if column_name in df.columns:
            # Create Cadre column using the mapping
            looked_up = df[column_name].map(CADRE_MAPPINGS)
            df["Cadre"] = looked_up.fillna("Unmapped")
        else:
            st.error(f"Column '{column_name}' not found in the uploaded file.")
    except Exception as e:
        st.error(f"Error mapping designations: {str(e)}")
    return df
109
+
110
def handle_new_designations(df, column_name="designation_title"):
    """Let the user assign cadres to designations that are still "Unmapped".

    Renders one select box per unmapped designation plus a confirm button.
    Confirmed choices are merged into ``CADRE_MAPPINGS``, applied to
    ``df["Cadre"]`` and stored in ``st.session_state`` so that they are
    re-applied on later reruns of the script.

    Args:
        df: DataFrame that already has a "Cadre" column.
        column_name: Name of the column holding designation titles.

    Returns:
        ``df`` with its "Cadre" column updated for any confirmed mappings. On
        error the message is shown via ``st.error`` and ``df`` is returned as
        it stands.
    """
    session_key = "confirmed_cadre_mappings"
    try:
        # Streamlit re-executes the script on every interaction, which
        # rebuilds the module-level CADRE_MAPPINGS. Re-apply what the user
        # confirmed earlier so the choices are not lost on the next rerun.
        remembered = st.session_state.get(session_key, {})
        if remembered:
            CADRE_MAPPINGS.update(remembered)
            df["Cadre"] = df[column_name].map(CADRE_MAPPINGS).fillna("Unmapped")

        # Designations that still have no mapping. Sorted (by text) so the
        # widgets keep a stable order between reruns.
        current_designations = sorted(
            set(df.loc[df['Cadre'] == 'Unmapped', column_name].unique()),
            key=str,
        )
        if not current_designations:
            return df

        st.warning(f"📝 Found {len(current_designations)} new designation(s) that need mapping!")

        # Available cadre levels (predefined options only)
        CADRE_LEVELS = [
            "District Level",
            "Federal Level",
            "Provincial Level",
            "Town Level",
            "UC Level"
        ]

        # Designation -> cadre currently selected in the widgets below.
        new_mappings = {}

        with st.expander("Map New Designations", expanded=True):
            st.markdown("### New Designations Found")
            st.markdown("Please assign appropriate cadres to the following designations:")

            for designation in current_designations:
                col1, col2 = st.columns([2, 1])
                with col1:
                    st.text(designation)
                with col2:
                    new_mappings[designation] = st.selectbox(
                        "Select Cadre",
                        options=CADRE_LEVELS,
                        key=f"new_designation_{designation}"
                    )

            # Button to confirm mappings
            if st.button("Confirm New Mappings"):
                # Persist across reruns, then apply to this run.
                persisted = dict(remembered)
                persisted.update(new_mappings)
                st.session_state[session_key] = persisted

                CADRE_MAPPINGS.update(new_mappings)
                df["Cadre"] = df[column_name].map(CADRE_MAPPINGS).fillna("Unmapped")

                st.success("✅ Mappings updated successfully!")

                # Show the new mappings
                st.markdown("### New Mappings Added:")
                for designation, cadre in new_mappings.items():
                    st.markdown(f"- **{designation}**: {cadre}")

                # NOTE(review): the former "Export Updated Mappings" button
                # was dropped. A button that is only rendered after another
                # button press is gone again on the rerun its own click
                # triggers, so its handler could not run. Indentation is lost
                # in SOURCE, so confirm it really was nested here.

        return df

    except Exception as e:
        st.error(f"Error handling new designations: {str(e)}")
        return df
172
+
173
def show_interactive_preview(df):
    """Render an interactive, filterable preview of ``df``.

    Offers column selection, a row-count limit, an index toggle, a free-text
    search across all columns and an optional per-column filter (range slider
    for numeric columns, value multiselect otherwise).

    Args:
        df: DataFrame to preview. It is not modified.

    Returns:
        The DataFrame after the search and column filter have been applied.
        The column selection and row limit affect only what is displayed.
    """
    st.subheader("📋 Interactive Data Preview")

    all_columns = df.columns.tolist()
    total_rows = len(df)

    # View options in an expander
    with st.expander("🔧 View Options", expanded=False):
        cols = st.multiselect(
            "Select columns to display:",
            all_columns,
            default=all_columns
        )

        # The slider needs min_value < max_value and a default inside that
        # range; with 5 rows or fewer there is nothing to choose, so show all.
        if total_rows > 5:
            row_count = st.slider(
                "Number of rows to display:",
                min_value=5,
                max_value=total_rows,
                value=min(50, total_rows)
            )
        else:
            row_count = total_rows

        hide_index = st.checkbox("Hide index", value=True)

    numeric_range = None   # (low, high) when a numeric range filter is active
    selected_vals = None   # chosen values when a category filter is active

    # Search and filter in an expander
    with st.expander("🔍 Search & Filters", expanded=False):
        search = st.text_input("Search in all columns:", "")

        filter_col = st.selectbox("Filter by column:", ["None"] + all_columns)

        if filter_col != "None":
            column = df[filter_col]
            # Any numeric width counts, not only int64/float64; booleans keep
            # the category filter.
            is_numeric = (
                pd.api.types.is_numeric_dtype(column)
                and not pd.api.types.is_bool_dtype(column)
            )
            if is_numeric:
                low, high = float(column.min()), float(column.max())
                # A range slider needs low < high; a constant (or all-NaN)
                # column has nothing to narrow, so no filter is applied.
                if low < high:
                    numeric_range = st.slider(
                        f"Range for {filter_col}:",
                        low,
                        high,
                        (low, high)
                    )
            else:
                unique_vals = column.unique().tolist()
                selected_vals = st.multiselect(
                    f"Select values for {filter_col}:",
                    unique_vals,
                    default=unique_vals
                )

    filtered_df = df.copy()

    # Apply search: keep rows where any cell contains the text. regex=False
    # so characters such as "(" or "+" typed by the user are matched
    # literally instead of raising a regex error.
    if search:
        mask = filtered_df.astype(str).apply(
            lambda cells: cells.str.contains(search, case=False, regex=False)
        ).any(axis=1)
        filtered_df = filtered_df[mask]

    # Apply column filter
    if numeric_range is not None:
        min_val, max_val = numeric_range
        filtered_df = filtered_df[
            (filtered_df[filter_col] >= min_val) &
            (filtered_df[filter_col] <= max_val)
        ]
    elif selected_vals is not None:
        filtered_df = filtered_df[filtered_df[filter_col].isin(selected_vals)]

    # Show the filtered dataframe
    st.dataframe(
        filtered_df[cols].head(row_count),
        use_container_width=True,
        height=400,  # Fixed height for scrolling
        hide_index=hide_index,
    )

    # Show statistics
    col1, col2, col3 = st.columns(3)
    with col1:
        st.caption(f"Showing {len(filtered_df)} of {len(df)} rows")
    with col2:
        st.caption(f"Selected {len(cols)} columns")
    with col3:
        st.caption(f"Memory usage: {df.memory_usage().sum() / 1024:.2f} KB")

    return filtered_df
261
+
262
def show_visualizations(df):
    """Render the chart section for ``df``.

    Draws a pie chart of the "Cadre" column when present, a histogram of a
    user-selected numeric column when at least one exists, and a correlation
    heat map when there are at least two numeric columns. Any exception is
    shown via ``st.error``.
    """
    try:
        st.subheader("📊 Data Visualizations")

        # Cadre distribution if available
        if "Cadre" in df.columns:
            with st.expander("Cadre Distribution", expanded=True):
                cadre_pie = px.pie(df, names="Cadre", title="Distribution of Cadres")
                st.plotly_chart(cadre_pie, use_container_width=True)

        numeric_cols = df.select_dtypes(include=['number']).columns
        numeric_count = len(numeric_cols)
        if numeric_count == 0:
            # Nothing numeric to plot.
            return

        # Numeric column distributions
        with st.expander("Numeric Distributions", expanded=False):
            selected_column = st.selectbox(
                "Select numeric column for distribution",
                numeric_cols
            )
            histogram = px.histogram(
                df,
                x=selected_column,
                title=f"Distribution of {selected_column}"
            )
            st.plotly_chart(histogram, use_container_width=True)

        # Correlation matrix needs at least two numeric columns
        if numeric_count > 1:
            with st.expander("Correlation Matrix", expanded=False):
                correlations = df[numeric_cols].corr()
                heat_map = px.imshow(correlations, title="Correlation Matrix")
                st.plotly_chart(heat_map, use_container_width=True)

    except Exception as e:
        st.error(f"Error creating visualizations: {str(e)}")
300
+
301
def _build_question_context(df, question):
    """Build the text summary of ``df`` that is sent along with ``question``.

    Always includes the record count and column names. Adds value counts for
    the district, cadre and designation columns when the question mentions
    them and the column exists, and names the top district for questions
    containing "most", "highest", "maximum" or "top".
    """
    question_lower = question.lower()

    # Basic dataset info; str() so non-string column labels cannot break join.
    parts = [
        f"Total Records: {len(df)}",
        f"Available Columns: {', '.join(map(str, df.columns))}",
    ]

    # A column missing from the upload simply contributes no section instead
    # of aborting the whole query with a KeyError.
    district_counts = None
    if 'district' in question_lower and 'district_name' in df.columns:
        district_counts = df['district_name'].value_counts()
        parts.append("\nDistrict Information:")
        parts.append(f"Total Districts: {len(district_counts)}")
        parts.append("Top Districts by Count:")
        parts.append(district_counts.head().to_string())

    if 'cadre' in question_lower and 'Cadre' in df.columns:
        parts.append("\nCadre Information:")
        parts.append(df['Cadre'].value_counts().to_string())

    if 'designation' in question_lower and 'designation_title' in df.columns:
        parts.append("\nDesignation Information:")
        parts.append(df['designation_title'].value_counts().head().to_string())

    # For questions about "most" or "highest"
    asks_for_top = any(
        word in question_lower for word in ('most', 'highest', 'maximum', 'top')
    )
    if asks_for_top and district_counts is not None and not district_counts.empty:
        # value_counts() sorts descending, so the first entry is the largest.
        parts.append("\nHighest Count District:")
        parts.append(f"{district_counts.index[0]}: {district_counts.iloc[0]} records")

    return "\n".join(parts)


def query_gemini(df, question):
    """Ask Gemini a question about ``df`` and return its answer.

    A short text summary of the data relevant to the question is built and
    sent together with the question and answer-format instructions. The
    summary and the response are also stored in ``st.session_state`` under
    'last_context' and 'last_response'.

    Args:
        df: DataFrame being analysed.
        question: The question text.

    Returns:
        The result of ``llm.invoke``; "Error: API Key not found" when no API
        key is configured; "Error occurred during analysis" if anything
        raises (the error is shown via ``st.error``).
    """
    try:
        if not GOOGLE_API_KEY:
            st.error("Google API Key not configured")
            return "Error: API Key not found"

        llm = GoogleGenerativeAI(
            model="gemini-1.5-pro",
            google_api_key=GOOGLE_API_KEY,
            temperature=0.1
        )

        context = _build_question_context(df, question)

        prompt = (
            "You are an expert Operational data analyst who has more than 15 years of experience in Polio Program internationally. Answer the following question using the provided data:\n"
            "\n"
            "Context:\n"
            f"{context}\n"
            "\n"
            f"Question: {question}\n"
            "\n"
            "Requirements for your answer:\n"
            "1. Give ONLY the exact answer with specific numbers\n"
            '2. For questions about "most" or "highest", give the specific name and count\n'
            '3. Format: "[Name/Value] with [count] records" or similar\n'
            "4. If asking about a specific column, give values from that column only\n"
            "5. Do not mention other columns unless specifically asked\n"
            "6. Do not explain methodology\n"
            "7. Keep response to one sentence\n"
            '8. If data isn\'t available, say "Data not available"\n'
            "\n"
            "Examples:\n"
            'Q: "Which district has most data?"\n'
            'A: "Karachi South with 1,234 records."\n'
            "\n"
            'Q: "What is the total count?"\n'
            'A: "The dataset contains 5,678 total records."\n'
            "\n"
            "Answer the question directly and concisely."
        )

        with st.spinner('Analyzing data...'):
            response = llm.invoke(prompt)

        # Kept for the debug panel
        st.session_state['last_context'] = context
        st.session_state['last_response'] = response

        return response

    except Exception as e:
        st.error(f"Error in analysis: {str(e)}")
        return "Error occurred during analysis"
390
+
391
def export_data(df):
    """Offer the processed DataFrame to the user as an Excel download.

    The frame is written (without its index) to an in-memory ``.xlsx``
    workbook using the ``openpyxl`` engine, and that buffer is handed to a
    Streamlit download button.

    Args:
        df: The DataFrame to export.

    Returns:
        The value returned by ``st.download_button``, or ``None`` when the
        export fails (the failure is reported through ``st.error``).
    """
    xlsx_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    try:
        workbook = BytesIO()
        df.to_excel(workbook, index=False, engine="openpyxl")
        # Rewind so the download starts at the first byte of the workbook.
        workbook.seek(0)

        button_options = {
            "label": "📥 Download Processed Data",
            "data": workbook,
            "file_name": "processed_data.xlsx",
            "mime": xlsx_mime,
        }
        return st.download_button(**button_options)
    except Exception as e:
        st.error(f"Error exporting data: {str(e)}")
406
+
407
def export_mappings(mappings):
    """Offer the given mappings to the user as a JSON download.

    Args:
        mappings: A JSON-serialisable object; it is dumped with a four-space
            indent and served as ``cadre_mappings.json``.

    Any exception raised while serialising or rendering the button is
    reported through ``st.error`` instead of propagating.
    """
    try:
        # Function-scope import, so this block does not rely on a
        # module-level ``json`` import.
        import json

        payload = json.dumps(mappings, indent=4)
        button_options = {
            "label": "📥 Download Mappings",
            "data": payload,
            "file_name": "cadre_mappings.json",
            "mime": "application/json",
        }
        st.download_button(**button_options)
    except Exception as e:
        st.error(f"Error exporting mappings: {str(e)}")
420
+
421
def _render_about():
    """Render the static "About" page describing the app and its workflow."""
    st.markdown("""
        ### About this app
        This app helps you process Excel files and analyze data using AI.

        #### Features:
        - Upload and process Excel/CSV files
        - Automatic data cleaning
        - Interactive data preview
        - Designation to Cadre mapping
        - AI-powered analysis
        - Data visualization
        - Export processed data

        #### How to use:
        1. Upload your file
        2. Review and clean the data
        3. Map designations to cadres
        4. Analyze using AI
        5. Export processed data
        """)


def _render_export_options(filtered_df):
    """Render side-by-side download buttons for the data and the mappings."""
    st.subheader("📥 Export Options")
    # Dedicated names so the upload layout columns in main() are not shadowed.
    data_col, mappings_col = st.columns(2)
    with data_col:
        export_data(filtered_df)
    with mappings_col:
        export_mappings(CADRE_MAPPINGS)


def _render_ai_query(df):
    """Render the Gemini question/answer section for ``df``."""
    st.subheader("💬 Ask Gemini AI about your data")

    # Plain string literals: none of these contain placeholders.
    suggested_questions = [
        "How many total records are in the dataset?",
        "What is the exact count and percentage for each Cadre level?",
        "How many unmapped designations are there?",
        "What is the most common Cadre level?",
        "What percentage of staff is at the District Level?",
        "Custom Question",
    ]

    question_type = st.selectbox(
        "Choose a question type:",
        suggested_questions
    )

    if question_type == "Custom Question":
        question = st.text_input("Enter your question about the data:")
    else:
        question = question_type

    # An empty custom question means there is nothing to ask yet.
    if not question:
        return

    with st.spinner('Analyzing data...'):
        response = query_gemini(df, question)
    st.markdown("### Analysis Results")
    st.markdown(response)

    # Show what was stored in session state under 'last_context' and
    # 'last_response', when those keys are present.
    with st.expander("Debug Information", expanded=False):
        if 'last_context' in st.session_state:
            st.text("Context sent to AI:")
            st.code(st.session_state['last_context'])
        if 'last_response' in st.session_state:
            st.text("Raw AI Response:")
            st.code(st.session_state['last_response'])

    if st.button("Generate Follow-up Questions"):
        follow_up_prompt = f"Based on the previous analysis about '{question}', what are 3 relevant follow-up questions we could ask about this data?"
        follow_up_response = query_gemini(df, follow_up_prompt)
        st.markdown("### Suggested Follow-up Questions")
        st.markdown(follow_up_response)


def _process_upload(uploaded_file, app_mode):
    """Parse, clean and map the uploaded file, then render the selected mode."""
    df = upload_and_parse_file(uploaded_file)
    if df is None:
        return

    st.success("File uploaded successfully!")

    with st.spinner('Cleaning data...'):
        df = clean_data(df)

    with st.spinner('Mapping designations to cadres...'):
        df = map_designations(df)

    # Designations whose Cadre value is 'Unmapped' after mapping.
    unmapped = df[df['Cadre'] == 'Unmapped']['designation_title'].unique()
    has_unmapped = len(unmapped) > 0
    if has_unmapped:
        st.warning(f"Found {len(unmapped)} unmapped designations!")

    if app_mode == "Data Processing":
        if has_unmapped:
            df = handle_new_designations(df)
            # Reapply mapping after handling new designations.
            df = map_designations(df)

        filtered_df = show_interactive_preview(df)
        _render_export_options(filtered_df)

    elif app_mode == "Analysis & Visualization":
        show_visualizations(df)
        _render_ai_query(df)


def main():
    """Main application function.

    Renders the title and the sidebar mode selector. In "About" mode it shows
    the static description and returns. Otherwise it shows the file uploader,
    runs the parse/clean/map pipeline on an uploaded file, renders the view
    for the selected mode, and finally the footer. Errors are reported with
    ``st.error`` rather than propagated.
    """
    # NOTE(review): block nesting was reconstructed from a flattened diff
    # view; confirm that upload processing is meant to render outside the
    # uploader column.
    try:
        st.title("📊 Excel Automation App with Gemini AI")

        # Add sidebar for app navigation
        st.sidebar.title("Navigation")
        app_mode = st.sidebar.selectbox(
            "Choose the app mode",
            ["About", "Data Processing", "Analysis & Visualization"],
            index=0  # This sets "About" as the default selection
        )

        if app_mode == "About":
            _render_about()
            return

        # Two-column layout; only the wider first column is used (uploader).
        upload_col, _ = st.columns([2, 1])

        with upload_col:
            uploaded_file = st.file_uploader("Upload your file (CSV/XLS/XLSX)", type=["csv", "xls", "xlsx"])

        if uploaded_file:
            try:
                _process_upload(uploaded_file, app_mode)
            except Exception as e:
                st.error(f"Error processing file: {str(e)}")

        # Add footer
        st.markdown("---")
        st.markdown("Built with Streamlit and Gemini AI")

    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
557
+
558
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()