File size: 8,030 Bytes
aae6699
2fccbc6
 
 
 
 
 
40db972
325f883
 
c99015b
 
 
38cbb08
c99015b
2fccbc6
dddc062
2fccbc6
 
dddc062
325f883
 
c1ff5e2
 
ff957d1
325f883
d5495e2
325f883
2fccbc6
 
325f883
2fccbc6
 
325f883
2fccbc6
 
325f883
 
c90a683
 
2fccbc6
c90a683
 
2fccbc6
c90a683
 
325f883
c90a683
 
 
 
 
2fccbc6
c90a683
dddc062
c90a683
 
 
 
 
 
 
2fccbc6
38cbb08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fccbc6
a2b1fdb
2fccbc6
c90a683
2fccbc6
325f883
 
aae6699
a2b1fdb
 
 
 
325f883
2fccbc6
325f883
c99015b
 
2b74cfe
 
38cbb08
 
 
 
c99015b
 
c90a683
38cbb08
c90a683
c99015b
 
c90a683
84136c9
aec014b
84136c9
c7867b9
c99015b
 
38cbb08
 
 
c99015b
 
 
 
 
a2b1fdb
 
 
 
 
 
 
 
325f883
 
 
 
38cbb08
a2b1fdb
38cbb08
a2b1fdb
325f883
2fccbc6
325f883
c90a683
dddc062
 
c99015b
2fccbc6
c99015b
2fccbc6
 
c90a683
2fccbc6
dddc062
c90a683
2fccbc6
 
 
 
 
325f883
 
7c0897e
325f883
 
 
 
c99015b
2fccbc6
325f883
 
c99015b
 
 
dddc062
 
 
2fccbc6
 
dddc062
38cbb08
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# app.py
from __future__ import annotations
import os
import traceback
import regex as re2
from typing import List, Tuple, Dict, Any

import gradio as gr
import pandas as pd

# New additions for data analysis agent
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain_cohere import ChatCohere # <-- NEW, CORRECT IMPORT

# ---- Local modules
from settings import (
    HEALTHCARE_SETTINGS, GENERAL_CONVERSATION_PROMPT, USE_SCENARIO_ENGINE, DEBUG_PLAN,
    COHERE_MODEL_PRIMARY, COHERE_TIMEOUT_S, USE_OPEN_FALLBACKS
)
from audit_log import log_event
from privacy import safety_filter, refusal_reply
from data_registry import DataRegistry
from upload_ingest import extract_text_from_files
from healthcare_analysis import HealthcareAnalyzer
from scenario_planner import parse_to_plan
from scenario_engine import ScenarioEngine
from rag import RAGIndex
from llm_router import generate_narrative, cohere_chat, open_fallback_chat, _co_client, cohere_embed
from narrative_safetynet import build_narrative


# ---------------- Utilities ----------------
def _sanitize_text(s: str) -> str:
    if not isinstance(s, str):
        return s
    return re2.sub(r'[\p{C}--[\n\t]]+', '', s)

# --- NEW: The "Intake Analyst" AI ---
def _create_enhanced_prompt(user_scenario: str) -> str:
    """Pre-process a messy user scenario into a structured analysis brief.

    Runs the raw text through an LLM "project manager" prompt that extracts the
    objective, tasks, assumptions, and desired output format. If the model
    returns nothing, the original scenario is passed through untouched.
    """
    planner_prompt = f"""
You are an expert data analysis project manager. Your task is to read the user's unstructured scenario below and create a clear, structured brief for a data analysis AI.

From the user's text, extract the following:
1.  **Primary Objective:** A one-sentence summary of the user's main goal.
2.  **Key Tasks:** A numbered list of the specific questions the user wants answered.
3.  **Expert Guidelines & Assumptions:** A bulleted list of EVERY specific number, metric, calculation method, or assumption mentioned in the text. This is critical for high-quality analysis.
4.  **Required Output Format:** A description of how the user wants the final answer to be structured.

Present this as a clean brief. Then, include the user's original text at the end.

--- USER'S SCENARIO ---
{user_scenario}
"""
    # Empty/None model output falls back to the user's original wording.
    return cohere_chat(planner_prompt) or user_scenario

def is_healthcare_scenario(text: str, has_files: bool) -> bool:
    """Heuristically detect a healthcare analysis request.

    Requires uploaded files AND either structured scenario section headings
    (background/situation/tasks/deliverables) or one of the healthcare
    keywords configured in settings.
    """
    lowered = (text or "").lower()
    keywords = HEALTHCARE_SETTINGS["healthcare_keywords"]
    section_markers = ("background", "situation", "tasks", "deliverables")
    has_structure = any(marker in lowered for marker in section_markers)
    has_keyword = any(kw in lowered for kw in keywords)
    return has_files and (has_structure or has_keyword)

def _append_msg(history_messages: List[Dict[str, str]], role: str, content: str) -> List[Dict[str, str]]:
    return (history_messages or []) + [{"role": role, "content": content}]

def ping_cohere() -> str:
    """Lightweight health check against Cohere (embeddings call).

    Returns a human-readable status string; never raises.
    """
    try:
        client = _co_client()
        if not client:
            return "Cohere client not initialized. Is COHERE_API_KEY set?"
        vectors = cohere_embed(["hello", "world"])
        # Two inputs must yield two vectors for the round trip to count as healthy.
        if not (vectors and len(vectors) == 2):
            return "Cohere reachable, but embeddings returned no vectors."
        return f"Cohere OK ✅ (model={COHERE_MODEL_PRIMARY}, timeout={COHERE_TIMEOUT_S}s)"
    except Exception as e:
        return f"Cohere ping failed: {e}"

# ---------------- Core handler ----------------
def handle(user_msg: str, history_messages: List[Dict[str, str]], files: list) -> Tuple[List[Dict[str, str]], str]:
    """
    Core logic handler with the new two-step AI process.

    Step 1: an "intake analyst" LLM call (_create_enhanced_prompt) rewrites the
    raw scenario into a structured brief. Step 2: a pandas dataframe agent runs
    that brief against the uploaded CSVs. With no files, the message is answered
    as plain conversation via cohere_chat with an open-model fallback.

    Args:
        user_msg: Raw text the user typed.
        history_messages: Prior chat turns as {"role", "content"} dicts (may be None).
        files: Uploaded file objects or path strings (may be empty/None).

    Returns:
        (updated history, "") — the empty string clears the input textbox in the UI.
    """
    try:
        # Input-side safety screen; a blocked message short-circuits with a refusal.
        safe_in, blocked_in, reason_in = safety_filter(user_msg, mode="input")
        if blocked_in:
            reply = refusal_reply(reason_in)
            new_hist = _append_msg(history_messages, "user", user_msg)
            new_hist = _append_msg(new_hist, "assistant", reply)
            return new_hist, ""

        # Uploads may arrive as objects with a .name attribute or as plain path strings.
        file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]

        if file_paths:
            try:
                # NOTE(review): endswith('.csv') is case-sensitive, so '.CSV'
                # uploads are silently skipped — confirm whether that is intended.
                dataframes = [pd.read_csv(p) for p in file_paths if p.endswith('.csv')]
                if not dataframes:
                    reply = "Please upload at least one CSV file."
                    new_hist = _append_msg(history_messages, "user", user_msg)
                    new_hist = _append_msg(new_hist, "assistant", reply)
                    return new_hist, ""

                llm = ChatCohere(model=COHERE_MODEL_PRIMARY, temperature=0)
                # Step 1: restructure the (already safety-filtered) scenario.
                enhanced_prompt = _create_enhanced_prompt(safe_in)
                # NOTE(review): this literal "..." is what actually gets passed as the
                # agent prefix — the real prefix text appears to have been elided here.
                AGENT_PREFIX = """...""" # Prefix content remains the same

                # Step 2: let the dataframe agent execute pandas code over the CSVs.
                agent = create_pandas_dataframe_agent(
                    llm,
                    dataframes,
                    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
                    verbose=True,
                    allow_dangerous_code=True,  # agent runs LLM-generated Python
                    prefix=AGENT_PREFIX
                )

                # Use the new .invoke() method
                result = agent.invoke({"input": enhanced_prompt})
                reply = _sanitize_text(result.get("output", "No output generated."))

            except Exception as e:
                # Agent/CSV failures are logged and surfaced in-chat rather than crashing.
                tb = traceback.format_exc()
                log_event("agent_error", None, {"err": str(e), "tb": tb})
                reply = f"An error occurred while analyzing the data: {e}"
        else:
            # No files: plain conversational turn, with an open-model fallback chain.
            prompt = f"{GENERAL_CONVERSATION_PROMPT}\n\nUser: {safe_in}\nAssistant:"
            reply = cohere_chat(prompt) or open_fallback_chat(prompt) or "How can I help further?"
            reply = _sanitize_text(reply)

        new_hist = _append_msg(history_messages, "user", user_msg)
        new_hist = _append_msg(new_hist, "assistant", reply)
        return new_hist, ""

    except Exception as e:
        # Last-resort guard: log and show the traceback in-chat instead of erroring the UI.
        tb = traceback.format_exc()
        log_event("app_error", None, {"err": str(e), "tb": tb})
        reply = f"A critical error occurred: {e}\n\n{tb}"
        new_hist = _append_msg(history_messages, "user", user_msg)
        new_hist = _append_msg(new_hist, "assistant", reply)
        return new_hist, ""

# ---------------- UI ----------------
# Gradio layout and event wiring; `demo` is launched from the __main__ guard below.
with gr.Blocks(analytics_enabled=False) as demo:
    gr.Markdown("## Universal AI Data Analyst")

    with gr.Row():
        # Chat transcript on the left, uploads on the right.
        chat = gr.Chatbot(label="Chat History", type="messages", height=520)
        files = gr.Files(
            label="Upload Data Files (CSV recommended)",
            file_count="multiple",
            type="filepath",  # components yield plain path strings
            file_types=[".csv"]
        )

    msg = gr.Textbox(label="Prompt", placeholder="Paste your scenario, tasks, and any specific instructions here.")
    with gr.Row():
        send = gr.Button("Send")
        clear = gr.Button("Clear")
        ping_btn = gr.Button("Ping Cohere")
    ping_out = gr.Markdown()

    def _on_send(m, h, f):
        # Thin adapter: handle() returns (history, ""); the "" clears the textbox.
        h2, _ = handle(m, h, f or [])
        return h2, ""

    # Both the Send button and Enter in the textbox trigger the same handler.
    send.click(_on_send, inputs=[msg, chat, files], outputs=[chat, msg])
    msg.submit(_on_send, inputs=[msg, chat, files], outputs=[chat, msg])
    clear.click(lambda: ([], "", None), outputs=[chat, msg, files])
    ping_btn.click(lambda: ping_cohere(), outputs=[ping_out])

if __name__ == "__main__":
    # Warn early but do not exit: the app degrades instead of hard-failing.
    if not os.getenv("COHERE_API_KEY"):
        print("🔴 COHERE_API_KEY environment variable not set. Application may not function correctly.")

    # Record startup configuration in the audit log.
    log_event("startup", None, {
        "cohere_key_present": bool(os.getenv("COHERE_API_KEY")),
        "cohere_model": COHERE_MODEL_PRIMARY,
        "open_fallbacks": USE_OPEN_FALLBACKS,
        "timeout_s": COHERE_TIMEOUT_S
    })
    # Bind to 0.0.0.0 so the app is reachable inside containers; PORT overrides 7860.
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))