VEDAGI1 committed on
Commit
bb96579
·
verified ·
1 Parent(s): 905cf8a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +171 -121
app.py CHANGED
@@ -1,11 +1,10 @@
1
  # app.py
2
  #
3
  # Universal AI Data Analyst with:
4
- # - Unchanged analysis & assessment logic
5
- # - Fixed Gradio event wiring (uses gr.State for history)
6
- # - Triple-quoted progress strings (no unterminated literals)
7
- # - Sleek full-width UI and Voice-to-Text (browser Web Speech API)
8
- # - Optional HIPAA flags (fallback defaults if not present in settings.py)
9
 
10
  from __future__ import annotations
11
 
@@ -45,7 +44,7 @@ from privacy import safety_filter, refusal_reply
45
  from llm_router import cohere_chat, _co_client, cohere_embed
46
 
47
 
48
- # ---------------------- Helpers (analysis logic unchanged) ----------------------
49
 
50
  def load_markdown_text(filepath: str) -> str:
51
  try:
@@ -93,143 +92,194 @@ def safe_log(event_name: str, meta: dict | None = None):
93
 
94
 
95
  def _create_python_script(user_scenario: str, schema_context: str) -> str:
96
- EXPERT_ANALYTICAL_GUIDELINES = """
97
- --- EXPERT ANALYTICAL GUIDELINES ---
98
- When writing your script, you MUST follow these expert business rules:
99
- 1. **Linking Datasets Rule:** If you need to connect facilities to health zones when the 'zone' column is not in the facility list,
100
- you must first identify the high-priority zone from the beds data, then find the major city (by facility count) in the facility list,
101
- and *then* assess that city's capacity. Do not try to filter the facility list by a 'zone' column if it does not exist in the schema.
102
- 2. **Prioritization Rule:** To prioritize locations, you MUST combine the most recent population data with specific high-risk health indicators
103
- to create a multi-factor risk score.
104
- 3. **Capacity Calculation Rule:** For capacity over a 3-month window, assume **60 working days**.
105
- 4. **Cost Calculation Rule:** Sum 'Startup cost' and 'Ongoing cost' per person before multiplying.
106
- """
107
-
108
  prompt_for_coder = f"""\
109
- You are an expert Python data scientist. Your job is to write a script to extract the data needed to answer the user's request.
110
- You have dataframes in a list `dfs`.
111
 
112
- {EXPERT_ANALYTICAL_GUIDELINES}
 
 
113
 
114
  --- DATA SCHEMA ---
115
  {schema_context}
116
  --- END DATA SCHEMA ---
117
 
118
- CRITICAL RULES:
119
- 1. **DO NOT READ FILES:** You MUST NOT include `pd.read_csv`. The data is ALREADY loaded in the `dfs` variable. You MUST use this variable. Failure to do so will cause a fatal error.
120
- 2. **JSON OUTPUT ONLY:** Your script's ONLY output must be a single JSON object printed to stdout containing the raw data findings.
121
- 3. **BE PRECISE:** Use the exact, case-sensitive column names from the schema and robustly clean strings (`re.sub()`) before converting to numbers.
122
- 4. **JSON SERIALIZATION:** Before adding data to your final dictionary for JSON conversion, you MUST convert any pandas-specific types (like `int64`) to standard Python types using `.item()` for single values or `.tolist()` for lists.
123
-
124
- --- USER'S SCENARIO ---
125
- {user_scenario}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
- --- PYTHON SCRIPT ---
128
- Now, write the complete Python script that performs the analysis and prints a single, serializable JSON object.
129
- ```python
 
130
  """
131
- generated_text = cohere_chat(prompt_for_coder)
132
- match = re2.search(r"```python\n(.*?)```", generated_text, re2.DOTALL)
133
- if match:
134
- return match.group(1).strip()
135
- return "print(json.dumps({'error': 'Failed to generate a valid Python script.'}))"
136
-
137
-
 
 
 
 
 
 
138
  def _generate_long_report(prompt: str) -> str:
139
- try:
140
- client = _co_client()
141
- if not client:
142
- return "Error: Cohere client not initialized."
143
- response = client.chat(
144
- model=COHERE_MODEL_PRIMARY,
145
- message=prompt,
146
- max_tokens=4096,
147
- )
148
- return response.text
149
- except Exception as e:
150
- safe_log("cohere_chat_error", {"err": str(e)})
151
- return f"Error during final report generation: {e}"
152
-
153
-
154
  def _generate_final_report(user_scenario: str, raw_data_json: str) -> str:
155
- prompt_for_writer = f"""\
156
- You are an expert management consultant and data analyst.
157
- A data science script has run to extract key findings. You have the user's original request and the raw JSON data.
158
-
159
- Your task is to synthesize these raw findings into a single, comprehensive, and professional report that directly answers all of the user's questions with detailed justifications.
160
-
161
- --- USER'S ORIGINAL SCENARIO & DELIVERABLES ---
 
162
  {user_scenario}
163
  --- END SCENARIO ---
164
-
165
  --- RAW DATA FINDINGS (JSON) ---
166
  {raw_data_json}
167
  --- END RAW DATA ---
168
-
169
- Now, write the final, polished report. The report MUST:
170
- 1. Follow the "Expected Output Format" requested by the user.
171
- 2. Use tables, bullet points, and DETAILED narrative justifications for each recommendation.
172
- 3. Synthesize the raw data into actionable insights. Do not just copy the raw numbers; interpret them.
173
- 4. Ensure you fully address ALL evaluation questions, especially the final recommendations.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  """
175
- return _generate_long_report(prompt_for_writer)
176
-
177
-
178
  def _append_msg(h: List[Dict[str, str]], r: str, c: str) -> List[Dict[str, str]]:
179
- return (h or []) + [{"role": r, "content": c}]
180
-
181
-
182
  def ping_cohere() -> str:
183
- try:
184
- cli = _co_client()
185
- if not cli:
186
- return "Cohere client not initialized."
187
- vecs = cohere_embed(["hello", "world"])
188
- return f"Cohere OK ✅ (model={COHERE_MODEL_PRIMARY})" if vecs else "Cohere reachable."
189
- except Exception as e:
190
- return f"Cohere ping failed: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
 
192
 
193
- def handle(user_msg: str, files: list, yield_update) -> str:
194
- try:
195
- # Safety filter on incoming message
196
- safe_in, blocked_in, reason_in = safety_filter(user_msg, mode="input")
197
- if blocked_in:
198
- return refusal_reply(reason_in)
199
-
200
- # Optional PHI redaction for prompts sent to an external LLM
201
- redacted_in = safe_in
202
- if PHI_MODE and REDACT_BEFORE_LLM:
203
- redacted_in = redact_phi(safe_in)
204
-
205
- file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]
206
-
207
- if file_paths:
208
- # CSV analysis path (unchanged)
209
- dataframes, schema_parts = [], []
210
- for i, p in enumerate(file_paths):
211
- if p.endswith(".csv"):
212
- try:
213
- df = pd.read_csv(p)
214
- except UnicodeDecodeError:
215
- df = pd.read_csv(p, encoding="latin1")
216
- dataframes.append(df)
217
- schema_parts.append(
218
- f"DataFrame `dfs[{i}]` (`{os.path.basename(p)}`):\n{df.head().to_markdown()}\n"
219
- )
220
-
221
- if not dataframes:
222
- return "Please upload at least one CSV file."
223
-
224
- schema_context = "\n".join(schema_parts)
225
-
226
- # If external PHI is not allowed, use redacted prompt; otherwise use original
227
- prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
228
 
229
- yield_update("""```
230
  🧠 Generating aligned analysis script...
231
- ```""")
232
- analysis_script = _create_python_script(prompt_for_code, schema_context)
 
233
 
234
  yield_update("""```
235
  ⚙️ Executing script to extract raw data...
@@ -549,4 +599,4 @@ with gr.Blocks(theme=gr.themes.Soft(), css=SLEEK_CSS, fill_width=True) as demo:
549
  if __name__ == "__main__":
550
  if not os.getenv("COHERE_API_KEY"):
551
  print("🔴 COHERE_API_KEY environment variable not set. Application may not function correctly.")
552
- demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
 
1
  # app.py
2
  #
3
  # Universal AI Data Analyst with:
4
+ # - IMPROVED: "Plan-and-Execute" logic for high-accuracy analysis.
5
+ # - IMPROVED: Professional, structured report generation.
6
+ # - IMPROVED: Enriched schema context for the AI analyst.
7
+ # - Unchanged UI, event wiring, and core infrastructure.
 
8
 
9
  from __future__ import annotations
10
 
 
44
  from llm_router import cohere_chat, _co_client, cohere_embed
45
 
46
 
47
+ # ---------------------- Helpers (analysis logic selectively improved) ----------------------
48
 
49
  def load_markdown_text(filepath: str) -> str:
50
  try:
 
92
 
93
 
94
  def _create_python_script(user_scenario: str, schema_context: str) -> str:
95
+ """
96
+ IMPROVED: Generates a Python script using a "Plan-and-Execute" approach.
97
+ The AI first creates a step-by-step plan, then writes code to execute it.
98
+ This ensures the analysis is logical, correctly aggregated, and aligned with the user's goal.
99
+ """
 
 
 
 
 
 
 
100
  prompt_for_coder = f"""\
101
+ You are an expert-level Python data scientist acting as a consultant. Your task is to analyze data to answer a user's business request.
 
102
 
103
+ --- USER'S SCENARIO ---
104
+ {user_scenario}
105
+ --- END SCENARIO ---
106
 
107
  --- DATA SCHEMA ---
108
  {schema_context}
109
  --- END DATA SCHEMA ---
110
 
111
+ You must follow a rigorous two-step process:
112
+
113
+ **Step 1: Create a Detailed Analysis Plan.**
114
+ First, think step-by-step. Deconstruct the user's request into a clear, logical plan. The plan must identify the key metrics, necessary data manipulations (cleaning, grouping, aggregation), and the final outputs required.
115
+ - **CRITICAL for aggregation:** If the user asks for analysis by category (e.g., "specialty," "department"), you MUST identify the correct high-level categorical column for grouping. DO NOT aggregate by granular, free-text procedure descriptions unless explicitly asked. Your goal is to find meaningful, strategic trends.
116
+
117
+ **Step 2: Write the Python Script.**
118
+ Based on your plan, write a complete Python script.
119
+
120
+ CRITICAL SCRIPTING RULES:
121
+ 1. **NO FILE READING:** The data is already loaded into a list of pandas DataFrames called `dfs`. You MUST use this variable. Do not include `pd.read_csv`.
122
+ 2. **STRICTLY JSON OUTPUT:** The script's ONLY output to stdout MUST be a single, well-structured JSON object containing all the raw data findings from your plan.
123
+ 3. **ROBUST DATA CLEANING:** Before performing calculations, clean data robustly. Convert numeric columns to numbers using `pd.to_numeric(..., errors='coerce')`. Handle missing values (`NaN`) appropriately (e.g., by excluding them from averages).
124
+ 4. **JSON SERIALIZATION:** Ensure all data in the final dictionary is JSON-serializable. Use `.item()` for single numpy values and `.tolist()` for arrays/series.
125
+
126
+ Now, provide your response in the following format:
127
+
128
+ **ANALYSIS PLAN:**
129
+ ```text
130
+ 1. **Objective:** [Briefly state the main goal]
131
+ 2. **Data Cleaning:** [Describe steps to clean and prepare the data]
132
+ 3. **Analysis Step A:** [e.g., "Calculate average wait times per hospital by grouping `dfs[0]` by 'Facility' and averaging 'Surgery_Median'."]
133
+ 4. **Analysis Step B:** [e.g., "Identify top 5 specialties by grouping `dfs[0]` by the 'Specialty' column and calculating the mean of 'Surgery_Median'."]
134
+ 5. **Analysis Step C:** [e.g., "Determine zone-level performance by grouping by 'Zone' and comparing to the overall provincial average."]
135
+ 6. **JSON Output Structure:** [Describe the keys and values of the final JSON object]
136
+ PYTHON SCRIPT:
137
+ code
138
+ Python
139
+ # Your complete Python script starts here
140
+ import pandas as pd
141
+ import json
142
+ import re
143
 
144
+ # Main analysis logic...
145
+ # ...
146
+ # Final print statement
147
+ print(json.dumps(final_data_structure, indent=4))
148
  """
149
+ generated_text = cohere_chat(prompt_for_coder)
150
+ # This regex is more robust for extracting the final code block
151
+ match = re2.search(r"PYTHON SCRIPT:\s*python\n(.*?)", generated_text, re2.DOTALL)
152
+ if match:
153
+ return match.group(1).strip()
154
+ code
155
+ Code
156
+ # Fallback if the structured format fails
157
+ fallback_match = re2.search(r"```python\n(.*?)```", generated_text, re2.DOTALL)
158
+ if fallback_match:
159
+ return fallback_match.group(1).strip()
160
+
161
+ return "print(json.dumps({'error': 'Failed to generate a valid Python script from the plan.'}))"
162
  def _generate_long_report(prompt: str) -> str:
163
+ try:
164
+ client = _co_client()
165
+ if not client:
166
+ return "Error: Cohere client not initialized."
167
+ response = client.chat(
168
+ model=COHERE_MODEL_PRIMARY,
169
+ message=prompt,
170
+ max_tokens=4096,
171
+ )
172
+ return response.text
173
+ except Exception as e:
174
+ safe_log("cohere_chat_error", {"err": str(e)})
175
+ return f"Error during final report generation: {e}"
 
 
176
  def _generate_final_report(user_scenario: str, raw_data_json: str) -> str:
177
+ """
178
+ IMPROVED: Generates a professional, structured report from the JSON data.
179
+ The prompt guides the AI to synthesize insights in a standard consulting format,
180
+ ensuring a high level of detail and actionable recommendations.
181
+ """
182
+ prompt_for_writer = f"""
183
+ You are an expert management consultant specializing in data-driven strategy. A Python script has been executed to extract key data points based on a user's request. Your task is to synthesize this raw data into a polished, comprehensive, and actionable report.
184
+ --- USER'S ORIGINAL SCENARIO ---
185
  {user_scenario}
186
  --- END SCENARIO ---
 
187
  --- RAW DATA FINDINGS (JSON) ---
188
  {raw_data_json}
189
  --- END RAW DATA ---
190
+ CRITICAL INSTRUCTIONS:
191
+ You must write a final report that follows this exact structure:
192
+ ### Executive Summary
193
+ Start with a brief paragraph summarizing the core problem, key findings, and top recommendations. This should be a high-level overview for a leadership audience.
194
+ ### 1. [First Key Finding, e.g., Hospitals with the Longest Wait Times]
195
+ Present the relevant data in a Markdown table.
196
+ Write a short narrative interpreting the data. What does it mean? Are there any outliers? Why might these facilities have long waits (e.g., specialized care, rural location, capacity issues)?
197
+ ### 2. [Second Key Finding, e.g., Specialties with the Longest Wait Times]
198
+ Present the relevant data in a Markdown table.
199
+ Interpret the findings. Why are these specialties facing delays (e.g., specialist shortages, equipment needs)?
200
+ ### 3. [Third Key Finding, e.g., Zone-Level Performance]
201
+ Present the data in a table, including a comparison to a relevant average or baseline.
202
+ Analyze the geographic or systemic issues this data reveals.
203
+ ### 4. [Fourth Key Finding, if applicable, e.g., Geographic Distribution]
204
+ Synthesize location data with the wait-time findings.
205
+ Discuss the implications for patient equity, travel burdens, and access to care.
206
+ ### 5. Recommendations for Resource Allocation
207
+ Provide specific, actionable, and justified recommendations.
208
+ Structure them by category (e.g., by facility, by specialty, by zone).
209
+ For each recommendation, provide a clear rationale directly linked to the data findings above (e.g., "Allocate additional resources to Glace Bay Hospital because it is a rural facility in a high-wait zone, suggesting a capacity bottleneck.").
210
+ ### Data Limitations
211
+ Briefly mention any potential limitations of the analysis (e.g., missing data, use of proxies, case severity not included). This adds credibility to the report.
212
+ Do not just repeat the JSON data. Your value is in interpreting the numbers, connecting the dots between different findings, and providing clear, data-backed strategic advice.
213
  """
214
+ return _generate_long_report(prompt_for_writer)
 
 
215
  def _append_msg(h: List[Dict[str, str]], r: str, c: str) -> List[Dict[str, str]]:
216
+ return (h or []) + [{"role": r, "content": c}]
 
 
217
  def ping_cohere() -> str:
218
+ try:
219
+ cli = _co_client()
220
+ if not cli:
221
+ return "Cohere client not initialized."
222
+ vecs = cohere_embed(["hello", "world"])
223
+ return f"Cohere OK ✅ (model={COHERE_MODEL_PRIMARY})" if vecs else "Cohere reachable."
224
+ except Exception as e:
225
+ return f"Cohere ping failed: {e}"
226
+ def handle(user_msg: str, files: list, yield_update) -> str:
227
+ try:
228
+ # Safety filter on incoming message
229
+ safe_in, blocked_in, reason_in = safety_filter(user_msg, mode="input")
230
+ if blocked_in:
231
+ return refusal_reply(reason_in)
232
+ code
233
+ Code
234
+ # Optional PHI redaction for prompts sent to an external LLM
235
+ redacted_in = safe_in
236
+ if PHI_MODE and REDACT_BEFORE_LLM:
237
+ redacted_in = redact_phi(safe_in)
238
+
239
+ file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]
240
+
241
+ if file_paths:
242
+ # CSV analysis path
243
+ dataframes, schema_parts = [], []
244
+ for i, p in enumerate(file_paths):
245
+ if p.endswith(".csv"):
246
+ try:
247
+ df = pd.read_csv(p)
248
+ except UnicodeDecodeError:
249
+ df = pd.read_csv(p, encoding="latin1")
250
+ dataframes.append(df)
251
+
252
+ # --- IMPROVEMENT: ENRICHED SCHEMA CONTEXT ---
253
+ schema_buffer = io.StringIO()
254
+ df.info(buf=schema_buffer)
255
+ schema_info = schema_buffer.getvalue()
256
+ schema_parts.append(
257
+ f"""DataFrame `dfs[{i}]` (`{os.path.basename(p)}`):
258
+ Head
259
+ {df.head().to_markdown()}
260
+ Schema and Data Types
261
+ code
262
+ Code
263
+ {schema_info}
264
+ Summary Statistics
265
+ {df.describe(include='all').to_markdown()}
266
+ """
267
+ )
268
+ code
269
+ Code
270
+ if not dataframes:
271
+ return "Please upload at least one CSV file."
272
 
273
+ schema_context = "\n".join(schema_parts)
274
 
275
+ # If external PHI is not allowed, use redacted prompt; otherwise use original
276
+ prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
 
278
+ yield_update("""```
279
  🧠 Generating aligned analysis script...
280
+ code
281
+ """)
282
+ analysis_script = _create_python_script(prompt_for_code, schema_context)
283
 
284
  yield_update("""```
285
  ⚙️ Executing script to extract raw data...
 
599
  if __name__ == "__main__":
600
  if not os.getenv("COHERE_API_KEY"):
601
  print("🔴 COHERE_API_KEY environment variable not set. Application may not function correctly.")
602
+ demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))