Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,11 +1,10 @@
|
|
| 1 |
# app.py
|
| 2 |
#
|
| 3 |
# Universal AI Data Analyst with:
|
| 4 |
-
# -
|
| 5 |
-
# -
|
| 6 |
-
# -
|
| 7 |
-
# -
|
| 8 |
-
# - Optional HIPAA flags (fallback defaults if not present in settings.py)
|
| 9 |
|
| 10 |
from __future__ import annotations
|
| 11 |
|
|
@@ -45,7 +44,7 @@ from privacy import safety_filter, refusal_reply
|
|
| 45 |
from llm_router import cohere_chat, _co_client, cohere_embed
|
| 46 |
|
| 47 |
|
| 48 |
-
# ---------------------- Helpers (analysis logic
|
| 49 |
|
| 50 |
def load_markdown_text(filepath: str) -> str:
|
| 51 |
try:
|
|
@@ -93,143 +92,194 @@ def safe_log(event_name: str, meta: dict | None = None):
|
|
| 93 |
|
| 94 |
|
| 95 |
def _create_python_script(user_scenario: str, schema_context: str) -> str:
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
and *then* assess that city's capacity. Do not try to filter the facility list by a 'zone' column if it does not exist in the schema.
|
| 102 |
-
2. **Prioritization Rule:** To prioritize locations, you MUST combine the most recent population data with specific high-risk health indicators
|
| 103 |
-
to create a multi-factor risk score.
|
| 104 |
-
3. **Capacity Calculation Rule:** For capacity over a 3-month window, assume **60 working days**.
|
| 105 |
-
4. **Cost Calculation Rule:** Sum 'Startup cost' and 'Ongoing cost' per person before multiplying.
|
| 106 |
-
"""
|
| 107 |
-
|
| 108 |
prompt_for_coder = f"""\
|
| 109 |
-
You are an expert Python data scientist
|
| 110 |
-
You have dataframes in a list `dfs`.
|
| 111 |
|
| 112 |
-
|
|
|
|
|
|
|
| 113 |
|
| 114 |
--- DATA SCHEMA ---
|
| 115 |
{schema_context}
|
| 116 |
--- END DATA SCHEMA ---
|
| 117 |
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
|
|
|
| 130 |
"""
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
def _generate_long_report(prompt: str) -> str:
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
def _generate_final_report(user_scenario: str, raw_data_json: str) -> str:
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
|
|
|
| 162 |
{user_scenario}
|
| 163 |
--- END SCENARIO ---
|
| 164 |
-
|
| 165 |
--- RAW DATA FINDINGS (JSON) ---
|
| 166 |
{raw_data_json}
|
| 167 |
--- END RAW DATA ---
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
"""
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
def _append_msg(h: List[Dict[str, str]], r: str, c: str) -> List[Dict[str, str]]:
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
def ping_cohere() -> str:
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
|
|
|
| 192 |
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
# Safety filter on incoming message
|
| 196 |
-
safe_in, blocked_in, reason_in = safety_filter(user_msg, mode="input")
|
| 197 |
-
if blocked_in:
|
| 198 |
-
return refusal_reply(reason_in)
|
| 199 |
-
|
| 200 |
-
# Optional PHI redaction for prompts sent to an external LLM
|
| 201 |
-
redacted_in = safe_in
|
| 202 |
-
if PHI_MODE and REDACT_BEFORE_LLM:
|
| 203 |
-
redacted_in = redact_phi(safe_in)
|
| 204 |
-
|
| 205 |
-
file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]
|
| 206 |
-
|
| 207 |
-
if file_paths:
|
| 208 |
-
# CSV analysis path (unchanged)
|
| 209 |
-
dataframes, schema_parts = [], []
|
| 210 |
-
for i, p in enumerate(file_paths):
|
| 211 |
-
if p.endswith(".csv"):
|
| 212 |
-
try:
|
| 213 |
-
df = pd.read_csv(p)
|
| 214 |
-
except UnicodeDecodeError:
|
| 215 |
-
df = pd.read_csv(p, encoding="latin1")
|
| 216 |
-
dataframes.append(df)
|
| 217 |
-
schema_parts.append(
|
| 218 |
-
f"DataFrame `dfs[{i}]` (`{os.path.basename(p)}`):\n{df.head().to_markdown()}\n"
|
| 219 |
-
)
|
| 220 |
-
|
| 221 |
-
if not dataframes:
|
| 222 |
-
return "Please upload at least one CSV file."
|
| 223 |
-
|
| 224 |
-
schema_context = "\n".join(schema_parts)
|
| 225 |
-
|
| 226 |
-
# If external PHI is not allowed, use redacted prompt; otherwise use original
|
| 227 |
-
prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|
| 228 |
|
| 229 |
-
|
| 230 |
🧠 Generating aligned analysis script...
|
| 231 |
-
|
| 232 |
-
|
|
|
|
| 233 |
|
| 234 |
yield_update("""```
|
| 235 |
⚙️ Executing script to extract raw data...
|
|
@@ -549,4 +599,4 @@ with gr.Blocks(theme=gr.themes.Soft(), css=SLEEK_CSS, fill_width=True) as demo:
|
|
| 549 |
if __name__ == "__main__":
|
| 550 |
if not os.getenv("COHERE_API_KEY"):
|
| 551 |
print("🔴 COHERE_API_KEY environment variable not set. Application may not function correctly.")
|
| 552 |
-
demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
|
|
|
|
| 1 |
# app.py
|
| 2 |
#
|
| 3 |
# Universal AI Data Analyst with:
|
| 4 |
+
# - IMPROVED: "Plan-and-Execute" logic for high-accuracy analysis.
|
| 5 |
+
# - IMPROVED: Professional, structured report generation.
|
| 6 |
+
# - IMPROVED: Enriched schema context for the AI analyst.
|
| 7 |
+
# - Unchanged UI, event wiring, and core infrastructure.
|
|
|
|
| 8 |
|
| 9 |
from __future__ import annotations
|
| 10 |
|
|
|
|
| 44 |
from llm_router import cohere_chat, _co_client, cohere_embed
|
| 45 |
|
| 46 |
|
| 47 |
+
# ---------------------- Helpers (analysis logic selectively improved) ----------------------
|
| 48 |
|
| 49 |
def load_markdown_text(filepath: str) -> str:
|
| 50 |
try:
|
|
|
|
| 92 |
|
| 93 |
|
| 94 |
def _create_python_script(user_scenario: str, schema_context: str) -> str:
    """Ask the LLM for an analysis plan plus a Python script; return the script.

    The prompt tells the model to answer with an ``ANALYSIS PLAN`` section
    followed by a ``PYTHON SCRIPT`` section whose code sits inside a fenced
    ``python`` block. Only the code inside that fence is returned.

    Args:
        user_scenario: The user's request, interpolated verbatim into the prompt.
        schema_context: Text describing the DataFrames exposed to the script
            as ``dfs``, interpolated verbatim into the prompt.

    Returns:
        The extracted script source, stripped of surrounding whitespace. If no
        non-empty fenced Python block can be found, a one-line script that
        prints a JSON error object.
    """
    prompt_for_coder = f"""\
You are an expert-level Python data scientist acting as a consultant. Your task is to analyze data to answer a user's business request.

--- USER'S SCENARIO ---
{user_scenario}
--- END SCENARIO ---

--- DATA SCHEMA ---
{schema_context}
--- END DATA SCHEMA ---

You must follow a rigorous two-step process:

**Step 1: Create a Detailed Analysis Plan.**
First, think step-by-step. Deconstruct the user's request into a clear, logical plan. The plan must identify the key metrics, necessary data manipulations (cleaning, grouping, aggregation), and the final outputs required.
- **CRITICAL for aggregation:** If the user asks for analysis by category (e.g., "specialty," "department"), you MUST identify the correct high-level categorical column for grouping. DO NOT aggregate by granular, free-text procedure descriptions unless explicitly asked. Your goal is to find meaningful, strategic trends.

**Step 2: Write the Python Script.**
Based on your plan, write a complete Python script.

CRITICAL SCRIPTING RULES:
1. **NO FILE READING:** The data is already loaded into a list of pandas DataFrames called `dfs`. You MUST use this variable. Do not include `pd.read_csv`.
2. **STRICTLY JSON OUTPUT:** The script's ONLY output to stdout MUST be a single, well-structured JSON object containing all the raw data findings from your plan.
3. **ROBUST DATA CLEANING:** Before performing calculations, clean data robustly. Convert numeric columns to numbers using `pd.to_numeric(..., errors='coerce')`. Handle missing values (`NaN`) appropriately (e.g., by excluding them from averages).
4. **JSON SERIALIZATION:** Ensure all data in the final dictionary is JSON-serializable. Use `.item()` for single numpy values and `.tolist()` for arrays/series.

Now, provide your response in the following format:

**ANALYSIS PLAN:**
```text
1. **Objective:** [Briefly state the main goal]
2. **Data Cleaning:** [Describe steps to clean and prepare the data]
3. **Analysis Step A:** [e.g., "Calculate average wait times per hospital by grouping `dfs[0]` by 'Facility' and averaging 'Surgery_Median'."]
4. **Analysis Step B:** [e.g., "Identify top 5 specialties by grouping `dfs[0]` by the 'Specialty' column and calculating the mean of 'Surgery_Median'."]
5. **Analysis Step C:** [e.g., "Determine zone-level performance by grouping by 'Zone' and comparing to the overall provincial average."]
6. **JSON Output Structure:** [Describe the keys and values of the final JSON object]
```

**PYTHON SCRIPT:**
```python
# Your complete Python script starts here
import pandas as pd
import json
import re

# Main analysis logic...
# ...
# Final print statement
print(json.dumps(final_data_structure, indent=4))
```
"""
    # NOTE(review): assumes cohere_chat returns the reply text as a str; the
    # `or ""` only guards against a None/empty reply - confirm against llm_router.
    generated_text = cohere_chat(prompt_for_coder) or ""

    # Tried in order. The first pattern targets the fenced block that follows
    # the "PYTHON SCRIPT:" heading requested above (tolerating the bold
    # markers); the second accepts any fenced python block. Each pattern ends
    # with the closing fence so the lazy group captures the whole script rather
    # than an empty string.
    fenced_script_patterns = (
        r"PYTHON SCRIPT:\**\s*```python\s*\n(.*?)```",
        r"```python\n(.*?)```",
    )
    for pattern in fenced_script_patterns:
        match = re2.search(pattern, generated_text, re2.DOTALL)
        if match:
            script = match.group(1).strip()
            if script:
                return script

    return "print(json.dumps({'error': 'Failed to generate a valid Python script from the plan.'}))"
|
| 162 |
def _generate_long_report(prompt: str) -> str:
    """Send ``prompt`` to the Cohere chat endpoint and return the reply text.

    The request uses ``COHERE_MODEL_PRIMARY`` with ``max_tokens=4096``.

    Returns:
        The ``text`` attribute of the chat response on success. If
        ``_co_client()`` yields a falsy client, or any exception is raised, a
        human-readable error string is returned instead; exceptions are also
        recorded through ``safe_log``.
    """
    try:
        co = _co_client()
        if co:
            reply = co.chat(
                model=COHERE_MODEL_PRIMARY,
                message=prompt,
                max_tokens=4096,
            )
            return reply.text
        return "Error: Cohere client not initialized."
    except Exception as exc:
        # Best-effort: log the failure and surface it as text to the caller.
        safe_log("cohere_chat_error", {"err": str(exc)})
        return f"Error during final report generation: {exc}"
|
|
|
|
|
|
|
| 176 |
def _generate_final_report(user_scenario: str, raw_data_json: str) -> str:
    """Turn the script's JSON findings into a structured narrative report.

    Builds a prompt that embeds the user's scenario and the raw JSON output,
    prescribes the report layout (executive summary, numbered findings,
    recommendations, data limitations), and passes it to
    ``_generate_long_report``.

    Args:
        user_scenario: The user's original request, embedded verbatim.
        raw_data_json: JSON text produced by the analysis script, embedded verbatim.

    Returns:
        Whatever ``_generate_long_report`` returns for the assembled prompt.
    """
    consultant_brief = f"""
You are an expert management consultant specializing in data-driven strategy. A Python script has been executed to extract key data points based on a user's request. Your task is to synthesize this raw data into a polished, comprehensive, and actionable report.
--- USER'S ORIGINAL SCENARIO ---
{user_scenario}
--- END SCENARIO ---
--- RAW DATA FINDINGS (JSON) ---
{raw_data_json}
--- END RAW DATA ---
CRITICAL INSTRUCTIONS:
You must write a final report that follows this exact structure:
### Executive Summary
Start with a brief paragraph summarizing the core problem, key findings, and top recommendations. This should be a high-level overview for a leadership audience.
### 1. [First Key Finding, e.g., Hospitals with the Longest Wait Times]
Present the relevant data in a Markdown table.
Write a short narrative interpreting the data. What does it mean? Are there any outliers? Why might these facilities have long waits (e.g., specialized care, rural location, capacity issues)?
### 2. [Second Key Finding, e.g., Specialties with the Longest Wait Times]
Present the relevant data in a Markdown table.
Interpret the findings. Why are these specialties facing delays (e.g., specialist shortages, equipment needs)?
### 3. [Third Key Finding, e.g., Zone-Level Performance]
Present the data in a table, including a comparison to a relevant average or baseline.
Analyze the geographic or systemic issues this data reveals.
### 4. [Fourth Key Finding, if applicable, e.g., Geographic Distribution]
Synthesize location data with the wait-time findings.
Discuss the implications for patient equity, travel burdens, and access to care.
### 5. Recommendations for Resource Allocation
Provide specific, actionable, and justified recommendations.
Structure them by category (e.g., by facility, by specialty, by zone).
For each recommendation, provide a clear rationale directly linked to the data findings above (e.g., "Allocate additional resources to Glace Bay Hospital because it is a rural facility in a high-wait zone, suggesting a capacity bottleneck.").
### Data Limitations
Briefly mention any potential limitations of the analysis (e.g., missing data, use of proxies, case severity not included). This adds credibility to the report.
Do not just repeat the JSON data. Your value is in interpreting the numbers, connecting the dots between different findings, and providing clear, data-backed strategic advice.
"""
    report_text = _generate_long_report(consultant_brief)
    return report_text
|
|
|
|
|
|
|
| 215 |
def _append_msg(h: List[Dict[str, str]], r: str, c: str) -> List[Dict[str, str]]:
|
| 216 |
+
return (h or []) + [{"role": r, "content": c}]
|
|
|
|
|
|
|
| 217 |
def ping_cohere() -> str:
    """Run a small connectivity check against Cohere and describe the outcome.

    Obtains a client via ``_co_client()`` and, if one is available, embeds two
    short strings with ``cohere_embed``.

    Returns:
        A status string: "not initialized" when no client is available, an OK
        message naming ``COHERE_MODEL_PRIMARY`` when the embed call returns a
        truthy result, "Cohere reachable." when it returns a falsy result, or
        a failure message carrying the exception text if anything raises.
    """
    try:
        if not _co_client():
            return "Cohere client not initialized."
        embeddings = cohere_embed(["hello", "world"])
        if embeddings:
            return f"Cohere OK ✅ (model={COHERE_MODEL_PRIMARY})"
        return "Cohere reachable."
    except Exception as exc:
        return f"Cohere ping failed: {exc}"
|
| 226 |
+
def handle(user_msg: str, files: list, yield_update) -> str:
|
| 227 |
+
try:
|
| 228 |
+
# Safety filter on incoming message
|
| 229 |
+
safe_in, blocked_in, reason_in = safety_filter(user_msg, mode="input")
|
| 230 |
+
if blocked_in:
|
| 231 |
+
return refusal_reply(reason_in)
|
| 232 |
+
code
|
| 233 |
+
Code
|
| 234 |
+
# Optional PHI redaction for prompts sent to an external LLM
|
| 235 |
+
redacted_in = safe_in
|
| 236 |
+
if PHI_MODE and REDACT_BEFORE_LLM:
|
| 237 |
+
redacted_in = redact_phi(safe_in)
|
| 238 |
+
|
| 239 |
+
file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]
|
| 240 |
+
|
| 241 |
+
if file_paths:
|
| 242 |
+
# CSV analysis path
|
| 243 |
+
dataframes, schema_parts = [], []
|
| 244 |
+
for i, p in enumerate(file_paths):
|
| 245 |
+
if p.endswith(".csv"):
|
| 246 |
+
try:
|
| 247 |
+
df = pd.read_csv(p)
|
| 248 |
+
except UnicodeDecodeError:
|
| 249 |
+
df = pd.read_csv(p, encoding="latin1")
|
| 250 |
+
dataframes.append(df)
|
| 251 |
+
|
| 252 |
+
# --- IMPROVEMENT: ENRICHED SCHEMA CONTEXT ---
|
| 253 |
+
schema_buffer = io.StringIO()
|
| 254 |
+
df.info(buf=schema_buffer)
|
| 255 |
+
schema_info = schema_buffer.getvalue()
|
| 256 |
+
schema_parts.append(
|
| 257 |
+
f"""DataFrame `dfs[{i}]` (`{os.path.basename(p)}`):
|
| 258 |
+
Head
|
| 259 |
+
{df.head().to_markdown()}
|
| 260 |
+
Schema and Data Types
|
| 261 |
+
code
|
| 262 |
+
Code
|
| 263 |
+
{schema_info}
|
| 264 |
+
Summary Statistics
|
| 265 |
+
{df.describe(include='all').to_markdown()}
|
| 266 |
+
"""
|
| 267 |
+
)
|
| 268 |
+
code
|
| 269 |
+
Code
|
| 270 |
+
if not dataframes:
|
| 271 |
+
return "Please upload at least one CSV file."
|
| 272 |
|
| 273 |
+
schema_context = "\n".join(schema_parts)
|
| 274 |
|
| 275 |
+
# If external PHI is not allowed, use redacted prompt; otherwise use original
|
| 276 |
+
prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
|
| 278 |
+
yield_update("""```
|
| 279 |
🧠 Generating aligned analysis script...
|
| 280 |
+
code
|
| 281 |
+
""")
|
| 282 |
+
analysis_script = _create_python_script(prompt_for_code, schema_context)
|
| 283 |
|
| 284 |
yield_update("""```
|
| 285 |
⚙️ Executing script to extract raw data...
|
|
|
|
| 599 |
if __name__ == "__main__":
    # Warn (without aborting) when the Cohere key is missing from the environment.
    cohere_key = os.getenv("COHERE_API_KEY")
    if not cohere_key:
        print("🔴 COHERE_API_KEY environment variable not set. Application may not function correctly.")

    # Listen on all interfaces; the port comes from PORT, defaulting to 7860.
    listen_port = int(os.getenv("PORT", "7860"))
    demo.launch(server_name="0.0.0.0", server_port=listen_port)
|