Spaces:

VEDAGI1
/

Medica_DecisionSupportAI

Sleeping

App Files Files Community

VEDAGI1 committed on Nov 10

Commit

bb96579

verified ·

1 Parent(s): 905cf8a

Update app.py

Browse files

Files changed (1) hide show

app.py +171 -121

app.py CHANGED Viewed

@@ -1,11 +1,10 @@
 # app.py
 #
 # Universal AI Data Analyst with:
-# - Unchanged analysis & assessment logic
-# - Fixed Gradio event wiring (uses gr.State for history)
-# - Triple-quoted progress strings (no unterminated literals)
-# - Sleek full-width UI and Voice-to-Text (browser Web Speech API)
-# - Optional HIPAA flags (fallback defaults if not present in settings.py)
 from __future__ import annotations
@@ -45,7 +44,7 @@ from privacy import safety_filter, refusal_reply
 from llm_router import cohere_chat, _co_client, cohere_embed
-# ---------------------- Helpers (analysis logic unchanged) ----------------------
 def load_markdown_text(filepath: str) -> str:
     try:
@@ -93,143 +92,194 @@ def safe_log(event_name: str, meta: dict | None = None):
 def _create_python_script(user_scenario: str, schema_context: str) -> str:
-    EXPERT_ANALYTICAL_GUIDELINES = """
---- EXPERT ANALYTICAL GUIDELINES ---
-When writing your script, you MUST follow these expert business rules:
-1.  **Linking Datasets Rule:** If you need to connect facilities to health zones when the 'zone' column is not in the facility list,
-    you must first identify the high-priority zone from the beds data, then find the major city (by facility count) in the facility list,
-    and *then* assess that city's capacity. Do not try to filter the facility list by a 'zone' column if it does not exist in the schema.
-2.  **Prioritization Rule:** To prioritize locations, you MUST combine the most recent population data with specific high-risk health indicators
-    to create a multi-factor risk score.
-3.  **Capacity Calculation Rule:** For capacity over a 3-month window, assume **60 working days**.
-4.  **Cost Calculation Rule:** Sum 'Startup cost' and 'Ongoing cost' per person before multiplying.
-"""
     prompt_for_coder = f"""\
-You are an expert Python data scientist. Your job is to write a script to extract the data needed to answer the user's request.
-You have dataframes in a list `dfs`.
-{EXPERT_ANALYTICAL_GUIDELINES}
 --- DATA SCHEMA ---
 {schema_context}
 --- END DATA SCHEMA ---
-CRITICAL RULES:
-1.  **DO NOT READ FILES:** You MUST NOT include `pd.read_csv`. The data is ALREADY loaded in the `dfs` variable. You MUST use this variable. Failure to do so will cause a fatal error.
-2.  **JSON OUTPUT ONLY:** Your script's ONLY output must be a single JSON object printed to stdout containing the raw data findings.
-3.  **BE PRECISE:** Use the exact, case-sensitive column names from the schema and robustly clean strings (`re.sub()`) before converting to numbers.
-4.  **JSON SERIALIZATION:** Before adding data to your final dictionary for JSON conversion, you MUST convert any pandas-specific types (like `int64`) to standard Python types using `.item()` for single values or `.tolist()` for lists.
---- USER'S SCENARIO ---
-{user_scenario}
---- PYTHON SCRIPT ---
-Now, write the complete Python script that performs the analysis and prints a single, serializable JSON object.
-```python
 """
-    generated_text = cohere_chat(prompt_for_coder)
-    match = re2.search(r"```python\n(.*?)```", generated_text, re2.DOTALL)
-    if match:
-        return match.group(1).strip()
-    return "print(json.dumps({'error': 'Failed to generate a valid Python script.'}))"
 def _generate_long_report(prompt: str) -> str:
-    try:
-        client = _co_client()
-        if not client:
-            return "Error: Cohere client not initialized."
-        response = client.chat(
-            model=COHERE_MODEL_PRIMARY,
-            message=prompt,
-            max_tokens=4096,
-        )
-        return response.text
-    except Exception as e:
-        safe_log("cohere_chat_error", {"err": str(e)})
-        return f"Error during final report generation: {e}"
 def _generate_final_report(user_scenario: str, raw_data_json: str) -> str:
-    prompt_for_writer = f"""\
-You are an expert management consultant and data analyst.
-A data science script has run to extract key findings. You have the user's original request and the raw JSON data.
-Your task is to synthesize these raw findings into a single, comprehensive, and professional report that directly answers all of the user's questions with detailed justifications.
---- USER'S ORIGINAL SCENARIO & DELIVERABLES ---
 {user_scenario}
 --- END SCENARIO ---
 --- RAW DATA FINDINGS (JSON) ---
 {raw_data_json}
 --- END RAW DATA ---
-Now, write the final, polished report. The report MUST:
-1.  Follow the "Expected Output Format" requested by the user.
-2.  Use tables, bullet points, and DETAILED narrative justifications for each recommendation.
-3.  Synthesize the raw data into actionable insights. Do not just copy the raw numbers; interpret them.
-4.  Ensure you fully address ALL evaluation questions, especially the final recommendations.
 """
-    return _generate_long_report(prompt_for_writer)
 def _append_msg(h: List[Dict[str, str]], r: str, c: str) -> List[Dict[str, str]]:
-    return (h or []) + [{"role": r, "content": c}]
 def ping_cohere() -> str:
-    try:
-        cli = _co_client()
-        if not cli:
-            return "Cohere client not initialized."
-        vecs = cohere_embed(["hello", "world"])
-        return f"Cohere OK ✅ (model={COHERE_MODEL_PRIMARY})" if vecs else "Cohere reachable."
-    except Exception as e:
-        return f"Cohere ping failed: {e}"
-def handle(user_msg: str, files: list, yield_update) -> str:
-    try:
-        # Safety filter on incoming message
-        safe_in, blocked_in, reason_in = safety_filter(user_msg, mode="input")
-        if blocked_in:
-            return refusal_reply(reason_in)
-        # Optional PHI redaction for prompts sent to an external LLM
-        redacted_in = safe_in
-        if PHI_MODE and REDACT_BEFORE_LLM:
-            redacted_in = redact_phi(safe_in)
-        file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]
-        if file_paths:
-            # CSV analysis path (unchanged)
-            dataframes, schema_parts = [], []
-            for i, p in enumerate(file_paths):
-                if p.endswith(".csv"):
-                    try:
-                        df = pd.read_csv(p)
-                    except UnicodeDecodeError:
-                        df = pd.read_csv(p, encoding="latin1")
-                    dataframes.append(df)
-                    schema_parts.append(
-                        f"DataFrame `dfs[{i}]` (`{os.path.basename(p)}`):\n{df.head().to_markdown()}\n"
-                    )
-            if not dataframes:
-                return "Please upload at least one CSV file."
-            schema_context = "\n".join(schema_parts)
-            # If external PHI is not allowed, use redacted prompt; otherwise use original
-            prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
-            yield_update("""```
 🧠 Generating aligned analysis script...
-```""")
-            analysis_script = _create_python_script(prompt_for_code, schema_context)
             yield_update("""```
 ⚙️ Executing script to extract raw data...
@@ -549,4 +599,4 @@ with gr.Blocks(theme=gr.themes.Soft(), css=SLEEK_CSS, fill_width=True) as demo:
 if __name__ == "__main__":
     if not os.getenv("COHERE_API_KEY"):
         print("🔴 COHERE_API_KEY environment variable not set. Application may not function correctly.")
-    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))

 # app.py
 #
 # Universal AI Data Analyst with:
+# - IMPROVED: "Plan-and-Execute" logic for high-accuracy analysis.
+# - IMPROVED: Professional, structured report generation.
+# - IMPROVED: Enriched schema context for the AI analyst.
+# - Unchanged UI, event wiring, and core infrastructure.
 from __future__ import annotations
 from llm_router import cohere_chat, _co_client, cohere_embed
+# ---------------------- Helpers (analysis logic selectively improved) ----------------------
 def load_markdown_text(filepath: str) -> str:
     try:
 def _create_python_script(user_scenario: str, schema_context: str) -> str:
     """Ask the LLM for a plan-then-code answer and return only the Python script.

     The prompt embeds the user's scenario and the data schema, asks for a
     written analysis plan first, and then for a script that works on the
     preloaded ``dfs`` list and prints a single JSON object to stdout.

     Args:
         user_scenario: The user's request, embedded verbatim in the prompt.
         schema_context: Text describing the loaded DataFrames, embedded verbatim.

     Returns:
         The source of the first non-empty fenced ``python`` block found in the
         model reply (the block after ``PYTHON SCRIPT:`` is preferred), or a
         one-line script that prints a JSON error object when none is found.
     """
     prompt_for_coder = f"""\
 You are an expert-level Python data scientist acting as a consultant. Your task is to analyze data to answer a user's business request.
 --- USER'S SCENARIO ---
 {user_scenario}
 --- END SCENARIO ---
 --- DATA SCHEMA ---
 {schema_context}
 --- END DATA SCHEMA ---
 You must follow a rigorous two-step process:
 **Step 1: Create a Detailed Analysis Plan.**
 First, think step-by-step. Deconstruct the user's request into a clear, logical plan. The plan must identify the key metrics, necessary data manipulations (cleaning, grouping, aggregation), and the final outputs required.
 - **CRITICAL for aggregation:** If the user asks for analysis by category (e.g., "specialty," "department"), you MUST identify the correct high-level categorical column for grouping. DO NOT aggregate by granular, free-text procedure descriptions unless explicitly asked. Your goal is to find meaningful, strategic trends.
 **Step 2: Write the Python Script.**
 Based on your plan, write a complete Python script.
 CRITICAL SCRIPTING RULES:
 1.  **NO FILE READING:** The data is already loaded into a list of pandas DataFrames called `dfs`. You MUST use this variable. Do not include `pd.read_csv`.
 2.  **STRICTLY JSON OUTPUT:** The script's ONLY output to stdout MUST be a single, well-structured JSON object containing all the raw data findings from your plan.
 3.  **ROBUST DATA CLEANING:** Before performing calculations, clean data robustly. Convert numeric columns to numbers using `pd.to_numeric(..., errors='coerce')`. Handle missing values (`NaN`) appropriately (e.g., by excluding them from averages).
 4.  **JSON SERIALIZATION:** Ensure all data in the final dictionary is JSON-serializable. Use `.item()` for single numpy values and `.tolist()` for arrays/series.
 Now, provide your response in the following format:
 **ANALYSIS PLAN:**
 ```text
 1.  **Objective:** [Briefly state the main goal]
 2.  **Data Cleaning:** [Describe steps to clean and prepare the data]
 3.  **Analysis Step A:** [e.g., "Calculate average wait times per hospital by grouping `dfs[0]` by 'Facility' and averaging 'Surgery_Median'."]
 4.  **Analysis Step B:** [e.g., "Identify top 5 specialties by grouping `dfs[0]` by the 'Specialty' column and calculating the mean of 'Surgery_Median'."]
 5.  **Analysis Step C:** [e.g., "Determine zone-level performance by grouping by 'Zone' and comparing to the overall provincial average."]
 6.  **JSON Output Structure:** [Describe the keys and values of the final JSON object]
 ```
 PYTHON SCRIPT:
 ```python
 # Your complete Python script starts here
 import pandas as pd
 import json
 import re
 # Main analysis logic...
 # ...
 # Final print statement
 print(json.dumps(final_data_structure, indent=4))
 ```
 """
     # NOTE(review): assumes cohere_chat returns a str; `or ""` guards a None/empty reply.
     generated_text = cohere_chat(prompt_for_coder) or ""
     # Both patterns end on the closing fence. A lazy group with nothing after it
     # would always capture the empty string and yield an empty script.
     script_patterns = (
         # Preferred: the fenced block right after the "PYTHON SCRIPT:" label
         # (optional trailing asterisks tolerate a bolded label).
         r"PYTHON SCRIPT:\**\s*```python\n(.*?)```",
         # Fallback: the first fenced python block anywhere in the reply.
         r"```python\n(.*?)```",
     )
     for pattern in script_patterns:
         match = re2.search(pattern, generated_text, re2.DOTALL)
         if match:
             script = match.group(1).strip()
             if script:
                 return script
     return "print(json.dumps({'error': 'Failed to generate a valid Python script from the plan.'}))"
 def _generate_long_report(prompt: str) -> str:
     """Send ``prompt`` to the primary Cohere chat model and return the reply text.

     Args:
         prompt: The complete prompt passed as the chat ``message``.

     Returns:
         ``response.text`` from ``client.chat`` on success. If ``_co_client()``
         returns a falsy value, or any exception is raised, an error string is
         returned instead of raising.
     """
     try:
         client = _co_client()
         if not client:
             return "Error: Cohere client not initialized."
         response = client.chat(
             model=COHERE_MODEL_PRIMARY,
             message=prompt,
             max_tokens=4096,
         )
         return response.text
     except Exception as e:
         # Boundary handler: record the failure and hand the caller a readable
         # message rather than propagating the exception.
         safe_log("cohere_chat_error", {"err": str(e)})
         return f"Error during final report generation: {e}"
 def _generate_final_report(user_scenario: str, raw_data_json: str) -> str:
     """Build the report-writing prompt and return the generated report.

     The prompt embeds the user's scenario and the raw JSON findings, then
     prescribes a fixed report outline (executive summary, numbered findings,
     recommendations, data limitations).

     Args:
         user_scenario: The user's original request, embedded verbatim.
         raw_data_json: JSON text of the extracted findings, embedded verbatim.

     Returns:
         Whatever ``_generate_long_report`` returns for the assembled prompt.
     """
     prompt_for_writer = f"""
 You are an expert management consultant specializing in data-driven strategy. A Python script has been executed to extract key data points based on a user's request. Your task is to synthesize this raw data into a polished, comprehensive, and actionable report.
 --- USER'S ORIGINAL SCENARIO ---
 {user_scenario}
 --- END SCENARIO ---
 --- RAW DATA FINDINGS (JSON) ---
 {raw_data_json}
 --- END RAW DATA ---
 CRITICAL INSTRUCTIONS:
 You must write a final report that follows this exact structure:
 ### Executive Summary
 Start with a brief paragraph summarizing the core problem, key findings, and top recommendations. This should be a high-level overview for a leadership audience.
 ### 1. [First Key Finding, e.g., Hospitals with the Longest Wait Times]
 Present the relevant data in a Markdown table.
 Write a short narrative interpreting the data. What does it mean? Are there any outliers? Why might these facilities have long waits (e.g., specialized care, rural location, capacity issues)?
 ### 2. [Second Key Finding, e.g., Specialties with the Longest Wait Times]
 Present the relevant data in a Markdown table.
 Interpret the findings. Why are these specialties facing delays (e.g., specialist shortages, equipment needs)?
 ### 3. [Third Key Finding, e.g., Zone-Level Performance]
 Present the data in a table, including a comparison to a relevant average or baseline.
 Analyze the geographic or systemic issues this data reveals.
 ### 4. [Fourth Key Finding, if applicable, e.g., Geographic Distribution]
 Synthesize location data with the wait-time findings.
 Discuss the implications for patient equity, travel burdens, and access to care.
 ### 5. Recommendations for Resource Allocation
 Provide specific, actionable, and justified recommendations.
 Structure them by category (e.g., by facility, by specialty, by zone).
 For each recommendation, provide a clear rationale directly linked to the data findings above (e.g., "Allocate additional resources to Glace Bay Hospital because it is a rural facility in a high-wait zone, suggesting a capacity bottleneck.").
 ### Data Limitations
 Briefly mention any potential limitations of the analysis (e.g., missing data, use of proxies, case severity not included). This adds credibility to the report.
 Do not just repeat the JSON data. Your value is in interpreting the numbers, connecting the dots between different findings, and providing clear, data-backed strategic advice.
 """
     return _generate_long_report(prompt_for_writer)
 def _append_msg(h: List[Dict[str, str]], r: str, c: str) -> List[Dict[str, str]]:
+return (h or []) + [{"role": r, "content": c}]
 def ping_cohere() -> str:
     """Run a small embedding request and report the outcome as a status string.

     Returns:
         ``"Cohere client not initialized."`` when ``_co_client()`` is falsy;
         an OK message naming ``COHERE_MODEL_PRIMARY`` when ``cohere_embed``
         returns a truthy result; ``"Cohere reachable."`` when it returns a
         falsy result; or a failure message containing the exception text.
     """
     try:
         cli = _co_client()
         if not cli:
             return "Cohere client not initialized."
         vecs = cohere_embed(["hello", "world"])
         return f"Cohere OK ✅ (model={COHERE_MODEL_PRIMARY})" if vecs else "Cohere reachable."
     except Exception as e:
         # Health check: report the failure as text instead of raising.
         return f"Cohere ping failed: {e}"
+def handle(user_msg: str, files: list, yield_update) -> str:
+try:
+# Safety filter on incoming message
+safe_in, blocked_in, reason_in = safety_filter(user_msg, mode="input")
+if blocked_in:
+return refusal_reply(reason_in)
+code
+Code
+# Optional PHI redaction for prompts sent to an external LLM
+    redacted_in = safe_in
+    if PHI_MODE and REDACT_BEFORE_LLM:
+        redacted_in = redact_phi(safe_in)
+    file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]
+    if file_paths:
+        # CSV analysis path
+        dataframes, schema_parts = [], []
+        for i, p in enumerate(file_paths):
+            if p.endswith(".csv"):
+                try:
+                    df = pd.read_csv(p)
+                except UnicodeDecodeError:
+                    df = pd.read_csv(p, encoding="latin1")
+                dataframes.append(df)
+                # --- IMPROVEMENT: ENRICHED SCHEMA CONTEXT ---
+                schema_buffer = io.StringIO()
+                df.info(buf=schema_buffer)
+                schema_info = schema_buffer.getvalue()
+                schema_parts.append(
+                    f"""DataFrame `dfs[{i}]` (`{os.path.basename(p)}`):
+Head
+{df.head().to_markdown()}
+Schema and Data Types
+code
+Code
+{schema_info}
+Summary Statistics
+{df.describe(include='all').to_markdown()}
+"""
+)
+code
+Code
+if not dataframes:
+            return "Please upload at least one CSV file."
+        schema_context = "\n".join(schema_parts)
+        # If external PHI is not allowed, use redacted prompt; otherwise use original
+        prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
+        yield_update("""```
 🧠 Generating aligned analysis script...
+code
+""")
+analysis_script = _create_python_script(prompt_for_code, schema_context)
             yield_update("""```
 ⚙️ Executing script to extract raw data...
 # Script entry point: print a warning when COHERE_API_KEY is unset, then launch
 # `demo` on 0.0.0.0 using the port from the PORT environment variable (default 7860).
 if __name__ == "__main__":
     if not os.getenv("COHERE_API_KEY"):
         print("🔴 COHERE_API_KEY environment variable not set. Application may not function correctly.")
+    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))