Rajan Sharma committed on
Commit
85429e8
·
verified ·
1 Parent(s): f68dc31

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -60
app.py CHANGED
@@ -1,59 +1,70 @@
1
- from future import annotations
2
  import os
3
  import io
4
  import json
5
  import traceback
6
  from contextlib import redirect_stdout
7
  from typing import List, Dict, Any
 
8
  import gradio as gr
9
  import pandas as pd
10
  from datetime import datetime
11
  import regex as re2
12
  import re
13
- --- BACKEND IMPORTS ---
14
  from langchain_cohere import ChatCohere
15
- --- LOCAL MODULE IMPORTS ---
16
  from settings import (
17
- GENERAL_CONVERSATION_PROMPT,
18
- COHERE_MODEL_PRIMARY, COHERE_TIMEOUT_S, USE_OPEN_FALLBACKS
19
  )
20
  from audit_log import log_event
21
  from privacy import safety_filter, refusal_reply
22
  from llm_router import cohere_chat, _co_client, cohere_embed
23
- --- UTILITY FUNCTIONS ---
24
  def load_markdown_text(filepath: str) -> str:
25
- """Safely loads text content from a markdown file."""
26
- try:
27
- with open(filepath, 'r', encoding='utf-8') as f: return f.read()
28
- except FileNotFoundError:
29
- return f"Error: Document {os.path.basename(filepath)} not found."
 
30
  def _sanitize_text(s: str) -> str:
31
- if not isinstance(s, str): return s
32
- return re2.sub(r'[\p{C}--[\n\t]]+', '', s)
33
- --- THE "ANALYST-WRITER" PIPELINE ---
34
  def _create_python_script(user_scenario: str, schema_context: str) -> str:
35
- """Asks the AI to write a Python script that outputs raw, structured JSON."""
36
- code
37
- Code
38
- # --- THE FINAL, MOST ROBUST PROMPT ---
39
- prompt_for_coder = f"""
40
- You are an expert Python data scientist. Your job is to write a script to extract the data needed to answer the user's request and print the findings as a single JSON object.
41
- --- DATA CONTEXT ---
42
- The data is pre-loaded into a Python list of pandas DataFrames called dfs.
 
 
 
 
 
 
 
 
43
  {schema_context}
44
- --- END DATA CONTEXT ---
 
45
  CRITICAL RULES:
46
- DO NOT READ FILES: You MUST NOT include pd.read_csv. The data is in the dfs variable.
47
- JSON OUTPUT ONLY: Your script's ONLY output must be a single JSON object printed to stdout.
48
- JSON SERIALIZATION (VERY IMPORTANT): The json library can only handle standard Python types. Before creating the final dictionary, ensure all values are standard types. If a value is a pandas/numpy number (like int64), convert it to a standard Python int or float using .item(). If a value is a pandas Series, convert it using .tolist().
49
- DEFENSIVE CODING (CRITICAL): Before passing a variable to a function, be paranoid. For example, if you write a helper function that expects a dictionary, DO NOT pass it a list. If a function expects a single item, DO NOT pass it a whole dataframe. Always check the data type of your variables. This will prevent AttributeError crashes.
50
- BE PRECISE: Use the exact, case-sensitive column names from the schema and robustly clean strings (re.sub()) before converting them to numbers.
51
  --- USER'S SCENARIO ---
52
  {user_scenario}
 
53
  --- PYTHON SCRIPT ---
54
- Now, write the complete, robust, and defensive Python script that analyzes the dfs variable and prints a single, serializable JSON object.
55
- code
56
- Python
57
  """
58
  generated_text = cohere_chat(prompt_for_coder)
59
  match = re2.search(r"```python\n(.*?)```", generated_text, re2.DOTALL)
@@ -61,7 +72,6 @@ Python
61
  return "print(json.dumps({'error': 'Failed to generate a valid Python script.'}))"
62
 
63
  def _generate_long_report(prompt: str) -> str:
64
- """Calls the Cohere API directly with a much higher max_tokens limit."""
65
  try:
66
  client = _co_client()
67
  if not client: return "Error: Cohere client not initialized."
@@ -76,11 +86,13 @@ def _generate_long_report(prompt: str) -> str:
76
  return f"Error during final report generation: {e}"
77
 
78
  def _generate_final_report(user_scenario: str, raw_data_json: str) -> str:
79
- """Asks the AI to act as a consultant and write a polished report from the raw data."""
80
  prompt_for_writer = f"""
81
- You are an expert management consultant. A data science script has extracted key findings. Your task is to synthesize these findings into a professional report that answers the user's questions.
 
 
 
82
 
83
- --- USER'S ORIGINAL SCENARIO ---
84
  {user_scenario}
85
  --- END SCENARIO ---
86
 
@@ -92,7 +104,7 @@ Now, write the final, polished report. The report MUST:
92
  1. Follow the "Expected Output Format" requested by the user.
93
  2. Use tables, bullet points, and DETAILED narrative justifications for each recommendation.
94
  3. Synthesize the raw data into actionable insights. Do not just copy the raw numbers; interpret them.
95
- 4. Ensure you fully address ALL evaluation questions.
96
  """
97
  return _generate_long_report(prompt_for_writer)
98
 
@@ -100,7 +112,6 @@ def _append_msg(h: List[Dict[str, str]], r: str, c: str) -> List[Dict[str, str]]
100
  return (h or []) + [{"role": r, "content": c}]
101
 
102
  def ping_cohere() -> str:
103
- """Lightweight health check."""
104
  try:
105
  cli = _co_client()
106
  if not cli: return "Cohere client not initialized."
@@ -108,10 +119,7 @@ def ping_cohere() -> str:
108
  return f"Cohere OK ✅ (model={COHERE_MODEL_PRIMARY})" if vecs else "Cohere reachable."
109
  except Exception as e: return f"Cohere ping failed: {e}"
110
 
111
- # --- THE CORE ANALYSIS ENGINE ---
112
-
113
  def handle(user_msg: str, files: list, yield_update) -> str:
114
- """Orchestrates the 'Analyst-Writer' pipeline."""
115
  try:
116
  safe_in, blocked_in, reason_in = safety_filter(user_msg, mode="input")
117
  if blocked_in: return refusal_reply(reason_in)
@@ -156,11 +164,9 @@ def handle(user_msg: str, files: list, yield_update) -> str:
156
  log_event("app_error", None, {"err": str(e), "tb": tb})
157
  return f"A critical error occurred: {e}"
158
 
159
- # --- PRE-LOAD LEGAL DOCUMENTS ---
160
  PRIVACY_POLICY_TEXT = load_markdown_text("privacy_policy.md")
161
  TERMS_OF_SERVICE_TEXT = load_markdown_text("terms_of_service.md")
162
 
163
- # ---------------- THE PROFESSIONAL UI ----------------
164
  with gr.Blocks(theme="soft", css="style.css") as demo:
165
  assessment_history = gr.State([])
166
 
@@ -219,7 +225,7 @@ with gr.Blocks(theme="soft", css="style.css") as demo:
219
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
220
 
221
  if files:
222
- file_names = [os.path.basename(fn.name if hasattr(fn, 'name') else fn) for fn in files]
223
  new_assessment = {"id": timestamp, "prompt": prompt, "files": file_names, "response": ai_response_text}
224
  updated_history = (history_state_list or []) + [new_assessment]
225
  history_labels = [f"{item['id']} - {item['prompt'][:40]}..." for item in updated_history]
@@ -227,17 +233,15 @@ with gr.Blocks(theme="soft", css="style.css") as demo:
227
  else:
228
  yield final_chat, history_state_list, gr.update()
229
 
230
-
231
- def view_history(selection, history_state_list):
232
- if not selection or not history_state_list:
233
- return ""
234
- # THE FIX IS HERE: Correctly extract just the timestamp (the first part)
235
- selected_id = selection.split(" - ")[0]
236
- selected_assessment = next((item for item in history_state_list if item["id"] == selected_id), None)
237
-
238
- if selected_assessment:
239
- file_list_md = "\n- ".join(selected_assessment.get('files', []))
240
- return f"""### Assessment from: {selected_assessment['id']}
241
  **Files Used:**
242
  - {file_list_md}
243
  ---
@@ -247,19 +251,21 @@ with gr.Blocks(theme="soft", css="style.css") as demo:
247
  **AI Generated Response:**
248
  {selected_assessment['response']}
249
  """
250
- return "Could not find the selected assessment."
251
 
252
  send_btn.click(
253
  run_analysis_wrapper,
254
  inputs=[prompt_input, files_input, chat_history_output, assessment_history],
255
  outputs=[chat_history_output, assessment_history, history_dropdown]
256
  )
257
- history_dropdown.change(view_history, inputs=[history_dropdown, assessment_history], outputs=[history_display])
258
- # We remove 'assessment_history' from the list of outputs, so it is no longer cleared.
259
- # We remove 'assessment_history' from the list of outputs, so it is no longer cleared.
 
 
260
  clear_btn.click(
261
- lambda: (None, None, []),
262
- outputs=[prompt_input, files_input, chat_history_output]
263
  )
264
  ping_btn.click(ping_cohere, outputs=[ping_out])
265
  privacy_link.click(lambda: gr.update(visible=True), outputs=[privacy_modal])
 
1
+ from __future__ import annotations
2
  import os
3
  import io
4
  import json
5
  import traceback
6
  from contextlib import redirect_stdout
7
  from typing import List, Dict, Any
8
+
9
  import gradio as gr
10
  import pandas as pd
11
  from datetime import datetime
12
  import regex as re2
13
  import re
14
+
15
  from langchain_cohere import ChatCohere
16
+
17
  from settings import (
18
+ GENERAL_CONVERSATION_PROMPT,
19
+ COHERE_MODEL_PRIMARY, COHERE_TIMEOUT_S, USE_OPEN_FALLBACKS
20
  )
21
  from audit_log import log_event
22
  from privacy import safety_filter, refusal_reply
23
  from llm_router import cohere_chat, _co_client, cohere_embed
24
+
25
  def load_markdown_text(filepath: str) -> str:
26
+ try:
27
+ with open(filepath, 'r', encoding='utf-8') as f:
28
+ return f.read()
29
+ except FileNotFoundError:
30
+ return f"**Error:** Document `{os.path.basename(filepath)}` not found."
31
+
32
  def _sanitize_text(s: str) -> str:
33
+ if not isinstance(s, str): return s
34
+ return re2.sub(r'[\p{C}--[\n\t]]+', '', s)
35
+
36
  def _create_python_script(user_scenario: str, schema_context: str) -> str:
37
+ EXPERT_ANALYTICAL_GUIDELINES = """
38
+ --- EXPERT ANALYTICAL GUIDELINES ---
39
+ When writing your script, you MUST follow these expert business rules:
40
+ 1. **Linking Datasets Rule:** If you need to connect facilities to health zones when the 'zone' column is not in the facility list, you must first identify the high-priority zone from the beds data, then find the major city (by facility count) in the facility list, and *then* assess that city's capacity. Do not try to filter the facility list by a 'zone' column if it does not exist in the schema.
41
+ 2. **Prioritization Rule:** To prioritize locations, you MUST combine the most recent population data with specific high-risk health indicators to create a multi-factor risk score.
42
+ 3. **Capacity Calculation Rule:** For capacity over a 3-month window, assume **60 working days**.
43
+ 4. **Cost Calculation Rule:** Sum 'Startup cost' and 'Ongoing cost' per person before multiplying.
44
+ """
45
+
46
+ prompt_for_coder = f"""
47
+ You are an expert Python data scientist. Your job is to write a script to extract the data needed to answer the user's request.
48
+ You have dataframes in a list `dfs`.
49
+
50
+ {EXPERT_ANALYTICAL_GUIDELINES}
51
+
52
+ --- DATA SCHEMA ---
53
  {schema_context}
54
+ --- END DATA SCHEMA ---
55
+
56
  CRITICAL RULES:
57
+ 1. **DO NOT READ FILES:** You MUST NOT include `pd.read_csv`. The data is ALREADY loaded in the `dfs` variable. You MUST use this variable. Failure to do so will cause a fatal error.
58
+ 2. **JSON OUTPUT ONLY:** Your script's ONLY output must be a single JSON object printed to stdout containing the raw data findings.
59
+ 3. **BE PRECISE:** Use the exact, case-sensitive column names from the schema and robustly clean strings (`re.sub()`) before converting to numbers.
60
+ 4. **JSON SERIALIZATION:** Before adding data to your final dictionary for JSON conversion, you MUST convert any pandas-specific types (like `int64`) to standard Python types using `.item()` for single values or `.tolist()` for lists.
61
+
62
  --- USER'S SCENARIO ---
63
  {user_scenario}
64
+
65
  --- PYTHON SCRIPT ---
66
+ Now, write the complete Python script that performs the analysis and prints a single, serializable JSON object.
67
+ ```python
 
68
  """
69
  generated_text = cohere_chat(prompt_for_coder)
70
  match = re2.search(r"```python\n(.*?)```", generated_text, re2.DOTALL)
 
72
  return "print(json.dumps({'error': 'Failed to generate a valid Python script.'}))"
73
 
74
  def _generate_long_report(prompt: str) -> str:
 
75
  try:
76
  client = _co_client()
77
  if not client: return "Error: Cohere client not initialized."
 
86
  return f"Error during final report generation: {e}"
87
 
88
  def _generate_final_report(user_scenario: str, raw_data_json: str) -> str:
 
89
  prompt_for_writer = f"""
90
+ You are an expert management consultant and data analyst.
91
+ A data science script has run to extract key findings. You have the user's original request and the raw JSON data.
92
+
93
+ Your task is to synthesize these raw findings into a single, comprehensive, and professional report that directly answers all of the user's questions with detailed justifications.
94
 
95
+ --- USER'S ORIGINAL SCENARIO & DELIVERABLES ---
96
  {user_scenario}
97
  --- END SCENARIO ---
98
 
 
104
  1. Follow the "Expected Output Format" requested by the user.
105
  2. Use tables, bullet points, and DETAILED narrative justifications for each recommendation.
106
  3. Synthesize the raw data into actionable insights. Do not just copy the raw numbers; interpret them.
107
+ 4. Ensure you fully address ALL evaluation questions, especially the final recommendations.
108
  """
109
  return _generate_long_report(prompt_for_writer)
110
 
 
112
  return (h or []) + [{"role": r, "content": c}]
113
 
114
  def ping_cohere() -> str:
 
115
  try:
116
  cli = _co_client()
117
  if not cli: return "Cohere client not initialized."
 
119
  return f"Cohere OK ✅ (model={COHERE_MODEL_PRIMARY})" if vecs else "Cohere reachable."
120
  except Exception as e: return f"Cohere ping failed: {e}"
121
 
 
 
122
  def handle(user_msg: str, files: list, yield_update) -> str:
 
123
  try:
124
  safe_in, blocked_in, reason_in = safety_filter(user_msg, mode="input")
125
  if blocked_in: return refusal_reply(reason_in)
 
164
  log_event("app_error", None, {"err": str(e), "tb": tb})
165
  return f"A critical error occurred: {e}"
166
 
 
167
  PRIVACY_POLICY_TEXT = load_markdown_text("privacy_policy.md")
168
  TERMS_OF_SERVICE_TEXT = load_markdown_text("terms_of_service.md")
169
 
 
170
  with gr.Blocks(theme="soft", css="style.css") as demo:
171
  assessment_history = gr.State([])
172
 
 
225
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
226
 
227
  if files:
228
+ file_names = [os.path.basename(f.name if hasattr(f, 'name') else f) for f in files]
229
  new_assessment = {"id": timestamp, "prompt": prompt, "files": file_names, "response": ai_response_text}
230
  updated_history = (history_state_list or []) + [new_assessment]
231
  history_labels = [f"{item['id']} - {item['prompt'][:40]}..." for item in updated_history]
 
233
  else:
234
  yield final_chat, history_state_list, gr.update()
235
 
236
+ def view_history(selection, history_state_list):
237
+ if not selection or not history_state_list:
238
+ return ""
239
+ selected_id = selection.split(" - ")[0]
240
+ selected_assessment = next((item for item in history_state_list if item["id"] == selected_id), None)
241
+
242
+ if selected_assessment:
243
+ file_list_md = "\n- ".join(selected_assessment.get('files', []))
244
+ return f"""### Assessment from: {selected_assessment['id']}
 
 
245
  **Files Used:**
246
  - {file_list_md}
247
  ---
 
251
  **AI Generated Response:**
252
  {selected_assessment['response']}
253
  """
254
+ return "Could not find the selected assessment."
255
 
256
  send_btn.click(
257
  run_analysis_wrapper,
258
  inputs=[prompt_input, files_input, chat_history_output, assessment_history],
259
  outputs=[chat_history_output, assessment_history, history_dropdown]
260
  )
261
+ history_dropdown.change(
262
+ view_history,
263
+ inputs=[history_dropdown, assessment_history],
264
+ outputs=[history_display]
265
+ )
266
  clear_btn.click(
267
+ lambda: (None, None, []),
268
+ outputs=[prompt_input, files_input, chat_history_output]
269
  )
270
  ping_btn.click(ping_cohere, outputs=[ping_out])
271
  privacy_link.click(lambda: gr.update(visible=True), outputs=[privacy_modal])