Update app.py
Browse files
app.py
CHANGED
|
@@ -1,17 +1,24 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
import io
|
| 3 |
import re
|
| 4 |
import gradio as gr
|
| 5 |
import pandas as pd
|
| 6 |
-
import openai
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
-
from dotenv import load_dotenv
|
| 9 |
from PIL import Image
|
| 10 |
import traceback
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
def load_file(file):
|
| 17 |
"""Load a CSV or Excel file into a pandas DataFrame."""
|
|
@@ -25,12 +32,12 @@ def load_file(file):
|
|
| 25 |
if file_path is None:
|
| 26 |
return None
|
| 27 |
try:
|
| 28 |
-
if file_name.endswith(
|
| 29 |
df = pd.read_csv(file_path)
|
| 30 |
-
elif file_name.endswith(
|
| 31 |
-
df = pd.read_excel(file_path, engine=
|
| 32 |
-
elif file_name.endswith(
|
| 33 |
-
df = pd.read_excel(file_path, engine=
|
| 34 |
else:
|
| 35 |
return None
|
| 36 |
except Exception as e:
|
|
@@ -41,12 +48,12 @@ def load_file(file):
|
|
| 41 |
# Assume file is a file-like object (as on your local machine)
|
| 42 |
file_name = file.name.lower()
|
| 43 |
try:
|
| 44 |
-
if file_name.endswith(
|
| 45 |
df = pd.read_csv(file)
|
| 46 |
-
elif file_name.endswith(
|
| 47 |
-
df = pd.read_excel(file, engine=
|
| 48 |
-
elif file_name.endswith(
|
| 49 |
-
df = pd.read_excel(file, engine=
|
| 50 |
else:
|
| 51 |
return None
|
| 52 |
except Exception as e:
|
|
@@ -54,24 +61,16 @@ def load_file(file):
|
|
| 54 |
return None
|
| 55 |
return df
|
| 56 |
|
|
|
|
| 57 |
def preview_file(file):
|
| 58 |
"""Return the DataFrame for preview."""
|
| 59 |
df = load_file(file)
|
| 60 |
if df is None:
|
| 61 |
-
# Return a DataFrame with an error message instead of a plain string
|
| 62 |
return pd.DataFrame({"Error": ["Error loading file or unsupported file type."]})
|
| 63 |
return df
|
| 64 |
|
|
|
|
| 65 |
def generate_basic_understanding_code(df_preview):
|
| 66 |
-
"""
|
| 67 |
-
Generate Python code that performs an exploratory analysis on the DataFrame.
|
| 68 |
-
The generated code should output a variable 'basic_info' that is a dictionary containing:
|
| 69 |
-
- The data types of each column.
|
| 70 |
-
- For numeric columns, summary statistics (mean, median, std, etc.).
|
| 71 |
-
- For non-numeric columns, counts, unique values, mode, and frequency distributions.
|
| 72 |
-
If charts are generated, ensure plt.show() is called after each chart so they can be captured.
|
| 73 |
-
Note: When converting dates, use pd.to_datetime() without a fixed format or with dayfirst=True.
|
| 74 |
-
"""
|
| 75 |
prompt = f"""
|
| 76 |
You are a data analysis expert. Write Python code that performs an exploratory analysis of the DataFrame.
|
| 77 |
Assume a pandas DataFrame named 'df' is already loaded.
|
|
@@ -83,37 +82,26 @@ For each column in df, include its data type.
|
|
| 83 |
When converting date strings to datetime, use pd.to_datetime() without a fixed format or with dayfirst=True.
|
| 84 |
If your analysis includes charts, call plt.show() after each chart so they can be captured.
|
| 85 |
Only reference columns that are present in df.columns.
|
| 86 |
-
|
| 87 |
Note: The following safe built-ins are available: list, dict, set, tuple, abs, min, max, sum, len, range, print, pd, plt, __import__.
|
| 88 |
-
|
| 89 |
DataFrame preview:
|
| 90 |
Columns: {list(df_preview.columns)}
|
| 91 |
Sample Data (first 3 rows):
|
| 92 |
{df_preview.head(3).to_dict(orient='records')}
|
| 93 |
"""
|
| 94 |
-
response =
|
| 95 |
-
model=
|
| 96 |
messages=[
|
| 97 |
{"role": "system", "content": "You are an expert data analysis assistant who outputs only raw Python code."},
|
| 98 |
-
{"role": "user", "content": prompt}
|
| 99 |
],
|
| 100 |
temperature=0.3,
|
| 101 |
max_tokens=3500,
|
| 102 |
)
|
| 103 |
-
code = response.choices[0].message.content.strip()
|
| 104 |
return code
|
| 105 |
|
|
|
|
| 106 |
def generate_problem_solving_code(nl_query, df_preview, basic_info):
|
| 107 |
-
"""
|
| 108 |
-
Generate Python code that solves the user's analysis query.
|
| 109 |
-
The code should assume that the DataFrame 'df' is loaded and that the variable 'basic_info'
|
| 110 |
-
(the output from the initial exploratory analysis) is available.
|
| 111 |
-
The final analysis should be assigned to a variable named 'result' as a dictionary with keys:
|
| 112 |
-
'summary', 'detailed_stats', 'insights', and 'chart_descriptions'.
|
| 113 |
-
If charts are generated, call plt.show() after each chart so they can be captured.
|
| 114 |
-
Note: When converting date strings to datetime, use pd.to_datetime() without a fixed format or with dayfirst=True.
|
| 115 |
-
Only reference columns that are present in df.columns.
|
| 116 |
-
"""
|
| 117 |
prompt = f"""
|
| 118 |
You are a data analysis expert. Write Python code that performs the analysis as described below.
|
| 119 |
Assume a pandas DataFrame named 'df' is already loaded and that you have already generated an exploratory summary stored in 'basic_info'.
|
|
@@ -127,28 +115,26 @@ Incorporate insights from 'basic_info' if relevant.
|
|
| 127 |
When converting date strings to datetime, use pd.to_datetime() without a fixed format or with dayfirst=True.
|
| 128 |
If your analysis includes charts, call plt.show() after each chart so they can be captured.
|
| 129 |
Only reference columns that are present in df.columns.
|
| 130 |
-
|
| 131 |
Note: The following safe built-ins are available: list, dict, set, tuple, abs, min, max, sum, len, range, print, pd, plt, __import__.
|
| 132 |
-
|
| 133 |
DataFrame preview:
|
| 134 |
Columns: {list(df_preview.columns)}
|
| 135 |
Sample Data (first 3 rows):
|
| 136 |
{df_preview.head(3).to_dict(orient='records')}
|
| 137 |
-
|
| 138 |
User Query: "{nl_query}"
|
| 139 |
"""
|
| 140 |
-
response =
|
| 141 |
-
model=
|
| 142 |
messages=[
|
| 143 |
{"role": "system", "content": "You are an expert data analysis assistant who outputs only raw Python code."},
|
| 144 |
-
{"role": "user", "content": prompt}
|
| 145 |
],
|
| 146 |
temperature=0.3,
|
| 147 |
max_tokens=3500,
|
| 148 |
)
|
| 149 |
-
code = response.choices[0].message.content.strip()
|
| 150 |
return code
|
| 151 |
|
|
|
|
| 152 |
def validate_generated_code(code, df):
|
| 153 |
"""
|
| 154 |
Validate that the generated code references only columns that exist in the DataFrame.
|
|
@@ -161,6 +147,7 @@ def validate_generated_code(code, df):
|
|
| 161 |
return False, missing_cols
|
| 162 |
return True, []
|
| 163 |
|
|
|
|
| 164 |
def safe_exec_code(code, df, capture_charts=True, interactive=False, extra_globals=None):
|
| 165 |
"""
|
| 166 |
Execute the generated code in a restricted namespace.
|
|
@@ -171,13 +158,15 @@ def safe_exec_code(code, df, capture_charts=True, interactive=False, extra_globa
|
|
| 171 |
code_lines = code.splitlines()
|
| 172 |
clean_lines = [line for line in code_lines if not line.strip().startswith("```")]
|
| 173 |
clean_code = "\n".join(clean_lines).strip()
|
| 174 |
-
|
| 175 |
# Validate that the generated code references only existing DataFrame columns.
|
| 176 |
valid, missing_cols = validate_generated_code(clean_code, df)
|
| 177 |
if not valid:
|
| 178 |
-
return (
|
| 179 |
-
|
| 180 |
-
|
|
|
|
|
|
|
| 181 |
# Expanded safe built-ins. Including float, int, bool, etc.
|
| 182 |
safe_builtins = {
|
| 183 |
"abs": abs,
|
|
@@ -205,10 +194,11 @@ def safe_exec_code(code, df, capture_charts=True, interactive=False, extra_globa
|
|
| 205 |
"__import__": __import__,
|
| 206 |
}
|
| 207 |
safe_globals = {"__builtins__": safe_builtins, "df": df, "plt": plt, "charts": []}
|
| 208 |
-
|
| 209 |
# Pre-import seaborn as sns if available.
|
| 210 |
try:
|
| 211 |
import seaborn as sns
|
|
|
|
| 212 |
safe_globals["sns"] = sns
|
| 213 |
except ImportError:
|
| 214 |
pass
|
|
@@ -216,8 +206,9 @@ def safe_exec_code(code, df, capture_charts=True, interactive=False, extra_globa
|
|
| 216 |
if extra_globals is not None:
|
| 217 |
safe_globals.update(extra_globals)
|
| 218 |
safe_locals = {}
|
| 219 |
-
|
| 220 |
if capture_charts:
|
|
|
|
| 221 |
def custom_show(*args, **kwargs):
|
| 222 |
buf = io.BytesIO()
|
| 223 |
plt.savefig(buf, format="png")
|
|
@@ -225,24 +216,30 @@ def safe_exec_code(code, df, capture_charts=True, interactive=False, extra_globa
|
|
| 225 |
img = Image.open(buf).convert("RGB")
|
| 226 |
safe_globals["charts"].append(img)
|
| 227 |
plt.close()
|
|
|
|
| 228 |
safe_globals["plt"].show = custom_show
|
| 229 |
-
|
| 230 |
try:
|
| 231 |
-
# Directly execute the multi-line generated code.
|
| 232 |
exec(clean_code, safe_globals, safe_locals)
|
| 233 |
output = safe_locals.get("result", None)
|
| 234 |
if output is None:
|
| 235 |
output = safe_locals.get("basic_info", None)
|
| 236 |
-
except Exception
|
| 237 |
error_details = traceback.format_exc()
|
| 238 |
if "ValueError: time data" in error_details:
|
| 239 |
-
error_details +=
|
|
|
|
|
|
|
|
|
|
| 240 |
if "KeyError" in error_details:
|
| 241 |
error_details += "\nHint: The generated code might be referencing columns that do not exist in your DataFrame."
|
| 242 |
if "NameError" in error_details:
|
| 243 |
-
error_details +=
|
|
|
|
|
|
|
|
|
|
| 244 |
return f"An error occurred during code execution:\n{error_details}", safe_globals["charts"]
|
| 245 |
-
|
| 246 |
if capture_charts and not safe_globals["charts"]:
|
| 247 |
fig_nums = plt.get_fignums()
|
| 248 |
for num in fig_nums:
|
|
@@ -253,50 +250,48 @@ def safe_exec_code(code, df, capture_charts=True, interactive=False, extra_globa
|
|
| 253 |
img = Image.open(buf).convert("RGB")
|
| 254 |
safe_globals["charts"].append(img)
|
| 255 |
plt.close("all")
|
| 256 |
-
|
| 257 |
if interactive:
|
| 258 |
for img in safe_globals["charts"]:
|
| 259 |
img.show()
|
| 260 |
-
|
| 261 |
if output is None:
|
| 262 |
output = "No output variable ('result' or 'basic_info') was set by the code."
|
| 263 |
return output, safe_globals["charts"]
|
| 264 |
|
|
|
|
| 265 |
def generate_interpretation(analysis_result, nl_query):
|
| 266 |
"""
|
| 267 |
-
Use
|
| 268 |
Provide context from the user's query and explain what the results mean.
|
| 269 |
The response will be formatted in markdown.
|
| 270 |
"""
|
| 271 |
prompt = f"""
|
| 272 |
You are a knowledgeable data analyst. Based on the following analysis result and the user's query, provide a detailed interpretation and descriptive analysis of the results. Explain what the results mean, any insights that can be drawn, and any potential limitations.
|
| 273 |
Please format your output in markdown (including headers, bullet points, and other markdown formatting as appropriate).
|
| 274 |
-
|
| 275 |
User Query: "{nl_query}"
|
| 276 |
-
|
| 277 |
Analysis Result:
|
| 278 |
{analysis_result}
|
| 279 |
-
|
| 280 |
Provide a clear and detailed explanation in plain language.
|
| 281 |
"""
|
| 282 |
-
response =
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
)
|
| 291 |
-
interpretation = response.choices[0].message.content.strip()
|
| 292 |
return interpretation
|
| 293 |
|
|
|
|
| 294 |
def generate_and_run(nl_query, file, interactive_mode=False):
|
| 295 |
"""
|
| 296 |
-
Load the file, generate both a basic understanding and a detailed analysis code using
|
| 297 |
execute the generated code, and then generate an interpretation of the analysis result.
|
| 298 |
Returns a tuple: (analysis result, combined generated code, DataFrame preview, charts, interpretation).
|
| 299 |
-
|
| 300 |
The process is split into two steps:
|
| 301 |
1. Generate basic understanding code that produces 'basic_info'.
|
| 302 |
2. Generate problem-solving code that uses 'basic_info' and produces the final analysis ('result').
|
|
@@ -304,34 +299,37 @@ def generate_and_run(nl_query, file, interactive_mode=False):
|
|
| 304 |
df = load_file(file)
|
| 305 |
if df is None:
|
| 306 |
return "Error loading file.", "", pd.DataFrame({"Error": ["No data available."]}), [], ""
|
| 307 |
-
|
| 308 |
df_preview = df.copy()
|
| 309 |
# Step 1: Generate and execute basic understanding code.
|
| 310 |
basic_code = generate_basic_understanding_code(df_preview)
|
| 311 |
basic_info, basic_charts = safe_exec_code(basic_code, df, capture_charts=False, interactive=interactive_mode)
|
| 312 |
-
|
| 313 |
# Step 2: Generate and execute problem-solving code, injecting basic_info.
|
| 314 |
problem_code = generate_problem_solving_code(nl_query, df_preview, basic_info)
|
| 315 |
-
result, problem_charts = safe_exec_code(
|
| 316 |
-
|
|
|
|
|
|
|
| 317 |
interpretation = generate_interpretation(result, nl_query)
|
| 318 |
combined_code = f"### Basic Understanding Code:\n{basic_code}\n\n### Problem Solving Code:\n{problem_code}"
|
| 319 |
combined_charts = basic_charts + problem_charts
|
| 320 |
return result, combined_code, df_preview, combined_charts, interpretation
|
| 321 |
|
|
|
|
| 322 |
# Gradio interface setup
|
| 323 |
with gr.Blocks() as demo:
|
| 324 |
gr.Markdown("## Dynamic Data Analysis with Two-Step Code Generation and Interpretation")
|
| 325 |
-
|
| 326 |
with gr.Tab("Data Upload & Preview"):
|
| 327 |
file_input = gr.File(label="Upload CSV or Excel file (.csv, .xls, .xlsx)")
|
| 328 |
data_preview = gr.Dataframe(label="Data Preview")
|
| 329 |
file_input.change(fn=preview_file, inputs=file_input, outputs=data_preview)
|
| 330 |
-
|
| 331 |
with gr.Tab("Generate & Execute Analysis (Gradio Mode)"):
|
| 332 |
nl_query = gr.Textbox(
|
| 333 |
-
label="Enter your query",
|
| 334 |
-
placeholder="e.g., Generate summary statistics and charts for Gender and Age distributions"
|
| 335 |
)
|
| 336 |
generate_btn = gr.Button("Generate & Execute Code")
|
| 337 |
analysis_output = gr.Textbox(label="Analysis Result", lines=10)
|
|
@@ -339,15 +337,14 @@ with gr.Blocks() as demo:
|
|
| 339 |
preview_output = gr.Dataframe(label="Data Preview")
|
| 340 |
charts_output = gr.Gallery(label="Charts", show_label=True)
|
| 341 |
interpretation_output = gr.Markdown(label="Interpretation")
|
| 342 |
-
|
| 343 |
generate_btn.click(
|
| 344 |
fn=lambda query, file: generate_and_run(query, file, interactive_mode=True),
|
| 345 |
inputs=[nl_query, file_input],
|
| 346 |
-
outputs=[analysis_output, code_output, preview_output, charts_output, interpretation_output]
|
| 347 |
)
|
| 348 |
-
|
| 349 |
# Launch the app. This main block is useful for Hugging Face Spaces.
|
| 350 |
if __name__ == "__main__":
|
| 351 |
demo.launch()
|
| 352 |
# demo.launch(auth=("username", "password"))
|
| 353 |
-
|
|
|
|
| 1 |
+
# app.py
|
| 2 |
import os
|
| 3 |
import io
|
| 4 |
import re
|
| 5 |
import gradio as gr
|
| 6 |
import pandas as pd
|
|
|
|
| 7 |
import matplotlib.pyplot as plt
|
|
|
|
| 8 |
from PIL import Image
|
| 9 |
import traceback
|
| 10 |
|
| 11 |
+
from groq import Groq
|
| 12 |
+
|
| 13 |
+
# Groq configuration, read from environment variables (configured as Secrets
# on the Hugging Face Space).
GROQ_API_KEY = (os.getenv("GROQ_API_KEY") or "").strip()
# Strip before applying the default so that a blank or whitespace-only
# GROQ_MODEL falls back to the default model instead of becoming "".
GROQ_MODEL = (os.getenv("GROQ_MODEL") or "").strip() or "llama-3.3-70b-versatile"

# Fail fast at import time: the client below cannot be used without a key.
if not GROQ_API_KEY:
    raise RuntimeError("Falta GROQ_API_KEY en Secrets del Space.")

# Shared client used by the code-generation and interpretation helpers.
groq_client = Groq(api_key=GROQ_API_KEY)
|
| 21 |
+
|
| 22 |
|
| 23 |
def load_file(file):
|
| 24 |
"""Load a CSV or Excel file into a pandas DataFrame."""
|
|
|
|
| 32 |
if file_path is None:
|
| 33 |
return None
|
| 34 |
try:
|
| 35 |
+
if file_name.endswith(".csv"):
|
| 36 |
df = pd.read_csv(file_path)
|
| 37 |
+
elif file_name.endswith(".xlsx"):
|
| 38 |
+
df = pd.read_excel(file_path, engine="openpyxl")
|
| 39 |
+
elif file_name.endswith(".xls"):
|
| 40 |
+
df = pd.read_excel(file_path, engine="xlrd")
|
| 41 |
else:
|
| 42 |
return None
|
| 43 |
except Exception as e:
|
|
|
|
| 48 |
# Assume file is a file-like object (as on your local machine)
|
| 49 |
file_name = file.name.lower()
|
| 50 |
try:
|
| 51 |
+
if file_name.endswith(".csv"):
|
| 52 |
df = pd.read_csv(file)
|
| 53 |
+
elif file_name.endswith(".xlsx"):
|
| 54 |
+
df = pd.read_excel(file, engine="openpyxl")
|
| 55 |
+
elif file_name.endswith(".xls"):
|
| 56 |
+
df = pd.read_excel(file, engine="xlrd")
|
| 57 |
else:
|
| 58 |
return None
|
| 59 |
except Exception as e:
|
|
|
|
| 61 |
return None
|
| 62 |
return df
|
| 63 |
|
| 64 |
+
|
| 65 |
def preview_file(file):
    """
    Load the uploaded file and return a DataFrame for the preview widget.

    When `load_file` returns None, a one-row DataFrame carrying an error
    message is returned instead, so the caller always receives a DataFrame.
    """
    frame = load_file(file)
    if frame is not None:
        return frame
    # Loading failed: report it as a DataFrame rather than a plain string.
    return pd.DataFrame({"Error": ["Error loading file or unsupported file type."]})
|
| 71 |
|
| 72 |
+
|
| 73 |
def generate_basic_understanding_code(df_preview):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
prompt = f"""
|
| 75 |
You are a data analysis expert. Write Python code that performs an exploratory analysis of the DataFrame.
|
| 76 |
Assume a pandas DataFrame named 'df' is already loaded.
|
|
|
|
| 82 |
When converting date strings to datetime, use pd.to_datetime() without a fixed format or with dayfirst=True.
|
| 83 |
If your analysis includes charts, call plt.show() after each chart so they can be captured.
|
| 84 |
Only reference columns that are present in df.columns.
|
|
|
|
| 85 |
Note: The following safe built-ins are available: list, dict, set, tuple, abs, min, max, sum, len, range, print, pd, plt, __import__.
|
|
|
|
| 86 |
DataFrame preview:
|
| 87 |
Columns: {list(df_preview.columns)}
|
| 88 |
Sample Data (first 3 rows):
|
| 89 |
{df_preview.head(3).to_dict(orient='records')}
|
| 90 |
"""
|
| 91 |
+
response = groq_client.chat.completions.create(
|
| 92 |
+
model=GROQ_MODEL,
|
| 93 |
messages=[
|
| 94 |
{"role": "system", "content": "You are an expert data analysis assistant who outputs only raw Python code."},
|
| 95 |
+
{"role": "user", "content": prompt},
|
| 96 |
],
|
| 97 |
temperature=0.3,
|
| 98 |
max_tokens=3500,
|
| 99 |
)
|
| 100 |
+
code = (response.choices[0].message.content or "").strip()
|
| 101 |
return code
|
| 102 |
|
| 103 |
+
|
| 104 |
def generate_problem_solving_code(nl_query, df_preview, basic_info):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
prompt = f"""
|
| 106 |
You are a data analysis expert. Write Python code that performs the analysis as described below.
|
| 107 |
Assume a pandas DataFrame named 'df' is already loaded and that you have already generated an exploratory summary stored in 'basic_info'.
|
|
|
|
| 115 |
When converting date strings to datetime, use pd.to_datetime() without a fixed format or with dayfirst=True.
|
| 116 |
If your analysis includes charts, call plt.show() after each chart so they can be captured.
|
| 117 |
Only reference columns that are present in df.columns.
|
|
|
|
| 118 |
Note: The following safe built-ins are available: list, dict, set, tuple, abs, min, max, sum, len, range, print, pd, plt, __import__.
|
|
|
|
| 119 |
DataFrame preview:
|
| 120 |
Columns: {list(df_preview.columns)}
|
| 121 |
Sample Data (first 3 rows):
|
| 122 |
{df_preview.head(3).to_dict(orient='records')}
|
|
|
|
| 123 |
User Query: "{nl_query}"
|
| 124 |
"""
|
| 125 |
+
response = groq_client.chat.completions.create(
|
| 126 |
+
model=GROQ_MODEL,
|
| 127 |
messages=[
|
| 128 |
{"role": "system", "content": "You are an expert data analysis assistant who outputs only raw Python code."},
|
| 129 |
+
{"role": "user", "content": prompt},
|
| 130 |
],
|
| 131 |
temperature=0.3,
|
| 132 |
max_tokens=3500,
|
| 133 |
)
|
| 134 |
+
code = (response.choices[0].message.content or "").strip()
|
| 135 |
return code
|
| 136 |
|
| 137 |
+
|
| 138 |
def validate_generated_code(code, df):
|
| 139 |
"""
|
| 140 |
Validate that the generated code references only columns that exist in the DataFrame.
|
|
|
|
| 147 |
return False, missing_cols
|
| 148 |
return True, []
|
| 149 |
|
| 150 |
+
|
| 151 |
def safe_exec_code(code, df, capture_charts=True, interactive=False, extra_globals=None):
|
| 152 |
"""
|
| 153 |
Execute the generated code in a restricted namespace.
|
|
|
|
| 158 |
code_lines = code.splitlines()
|
| 159 |
clean_lines = [line for line in code_lines if not line.strip().startswith("```")]
|
| 160 |
clean_code = "\n".join(clean_lines).strip()
|
| 161 |
+
|
| 162 |
# Validate that the generated code references only existing DataFrame columns.
|
| 163 |
valid, missing_cols = validate_generated_code(clean_code, df)
|
| 164 |
if not valid:
|
| 165 |
+
return (
|
| 166 |
+
f"Generated code references missing columns: {missing_cols}\nPlease adjust your prompt or data.",
|
| 167 |
+
[],
|
| 168 |
+
)
|
| 169 |
+
|
| 170 |
# Expanded safe built-ins. Including float, int, bool, etc.
|
| 171 |
safe_builtins = {
|
| 172 |
"abs": abs,
|
|
|
|
| 194 |
"__import__": __import__,
|
| 195 |
}
|
| 196 |
safe_globals = {"__builtins__": safe_builtins, "df": df, "plt": plt, "charts": []}
|
| 197 |
+
|
| 198 |
# Pre-import seaborn as sns if available.
|
| 199 |
try:
|
| 200 |
import seaborn as sns
|
| 201 |
+
|
| 202 |
safe_globals["sns"] = sns
|
| 203 |
except ImportError:
|
| 204 |
pass
|
|
|
|
| 206 |
if extra_globals is not None:
|
| 207 |
safe_globals.update(extra_globals)
|
| 208 |
safe_locals = {}
|
| 209 |
+
|
| 210 |
if capture_charts:
|
| 211 |
+
|
| 212 |
def custom_show(*args, **kwargs):
|
| 213 |
buf = io.BytesIO()
|
| 214 |
plt.savefig(buf, format="png")
|
|
|
|
| 216 |
img = Image.open(buf).convert("RGB")
|
| 217 |
safe_globals["charts"].append(img)
|
| 218 |
plt.close()
|
| 219 |
+
|
| 220 |
safe_globals["plt"].show = custom_show
|
| 221 |
+
|
| 222 |
try:
|
|
|
|
| 223 |
exec(clean_code, safe_globals, safe_locals)
|
| 224 |
output = safe_locals.get("result", None)
|
| 225 |
if output is None:
|
| 226 |
output = safe_locals.get("basic_info", None)
|
| 227 |
+
except Exception:
|
| 228 |
error_details = traceback.format_exc()
|
| 229 |
if "ValueError: time data" in error_details:
|
| 230 |
+
error_details += (
|
| 231 |
+
"\nHint: The generated code might be using a fixed datetime format. "
|
| 232 |
+
"Consider using pd.to_datetime() without a fixed format or with dayfirst=True."
|
| 233 |
+
)
|
| 234 |
if "KeyError" in error_details:
|
| 235 |
error_details += "\nHint: The generated code might be referencing columns that do not exist in your DataFrame."
|
| 236 |
if "NameError" in error_details:
|
| 237 |
+
error_details += (
|
| 238 |
+
"\nHint: Ensure that all required built-in types and libraries (like float, int, etc.) "
|
| 239 |
+
"are included in the safe built-ins."
|
| 240 |
+
)
|
| 241 |
return f"An error occurred during code execution:\n{error_details}", safe_globals["charts"]
|
| 242 |
+
|
| 243 |
if capture_charts and not safe_globals["charts"]:
|
| 244 |
fig_nums = plt.get_fignums()
|
| 245 |
for num in fig_nums:
|
|
|
|
| 250 |
img = Image.open(buf).convert("RGB")
|
| 251 |
safe_globals["charts"].append(img)
|
| 252 |
plt.close("all")
|
| 253 |
+
|
| 254 |
if interactive:
|
| 255 |
for img in safe_globals["charts"]:
|
| 256 |
img.show()
|
| 257 |
+
|
| 258 |
if output is None:
|
| 259 |
output = "No output variable ('result' or 'basic_info') was set by the code."
|
| 260 |
return output, safe_globals["charts"]
|
| 261 |
|
| 262 |
+
|
| 263 |
def generate_interpretation(analysis_result, nl_query):
    """
    Ask the Groq chat model for a markdown interpretation of an analysis result.

    Args:
        analysis_result: The analysis output to explain; it is embedded in the
            prompt through f-string formatting.
        nl_query: The user's query, embedded in the prompt for context.

    Returns:
        The content of the first completion choice with surrounding whitespace
        removed, or an empty string when that content is empty or None.
    """
    # The prompt body is kept at column 0 because any indentation here would
    # become part of the text sent to the model.
    prompt = f"""
You are a knowledgeable data analyst. Based on the following analysis result and the user's query, provide a detailed interpretation and descriptive analysis of the results. Explain what the results mean, any insights that can be drawn, and any potential limitations.
Please format your output in markdown (including headers, bullet points, and other markdown formatting as appropriate).
User Query: "{nl_query}"
Analysis Result:
{analysis_result}
Provide a clear and detailed explanation in plain language.
"""
    chat_messages = [
        {"role": "system", "content": "You are an expert data analysis assistant who explains analysis results clearly."},
        {"role": "user", "content": prompt},
    ]
    completion = groq_client.chat.completions.create(
        model=GROQ_MODEL,
        messages=chat_messages,
        temperature=0.5,
        max_tokens=5000,
    )
    reply_text = completion.choices[0].message.content
    if not reply_text:
        # The message content may be None or empty; normalise to "".
        return ""
    return reply_text.strip()
|
| 288 |
|
| 289 |
+
|
| 290 |
def generate_and_run(nl_query, file, interactive_mode=False):
|
| 291 |
"""
|
| 292 |
+
Load the file, generate both a basic understanding and a detailed analysis code using Groq,
|
| 293 |
execute the generated code, and then generate an interpretation of the analysis result.
|
| 294 |
Returns a tuple: (analysis result, combined generated code, DataFrame preview, charts, interpretation).
|
|
|
|
| 295 |
The process is split into two steps:
|
| 296 |
1. Generate basic understanding code that produces 'basic_info'.
|
| 297 |
2. Generate problem-solving code that uses 'basic_info' and produces the final analysis ('result').
|
|
|
|
| 299 |
df = load_file(file)
|
| 300 |
if df is None:
|
| 301 |
return "Error loading file.", "", pd.DataFrame({"Error": ["No data available."]}), [], ""
|
| 302 |
+
|
| 303 |
df_preview = df.copy()
|
| 304 |
# Step 1: Generate and execute basic understanding code.
|
| 305 |
basic_code = generate_basic_understanding_code(df_preview)
|
| 306 |
basic_info, basic_charts = safe_exec_code(basic_code, df, capture_charts=False, interactive=interactive_mode)
|
| 307 |
+
|
| 308 |
# Step 2: Generate and execute problem-solving code, injecting basic_info.
|
| 309 |
problem_code = generate_problem_solving_code(nl_query, df_preview, basic_info)
|
| 310 |
+
result, problem_charts = safe_exec_code(
|
| 311 |
+
problem_code, df, capture_charts=True, interactive=interactive_mode, extra_globals={"basic_info": basic_info}
|
| 312 |
+
)
|
| 313 |
+
|
| 314 |
interpretation = generate_interpretation(result, nl_query)
|
| 315 |
combined_code = f"### Basic Understanding Code:\n{basic_code}\n\n### Problem Solving Code:\n{problem_code}"
|
| 316 |
combined_charts = basic_charts + problem_charts
|
| 317 |
return result, combined_code, df_preview, combined_charts, interpretation
|
| 318 |
|
| 319 |
+
|
| 320 |
# Gradio interface setup
|
| 321 |
with gr.Blocks() as demo:
|
| 322 |
gr.Markdown("## Dynamic Data Analysis with Two-Step Code Generation and Interpretation")
|
| 323 |
+
|
| 324 |
with gr.Tab("Data Upload & Preview"):
|
| 325 |
file_input = gr.File(label="Upload CSV or Excel file (.csv, .xls, .xlsx)")
|
| 326 |
data_preview = gr.Dataframe(label="Data Preview")
|
| 327 |
file_input.change(fn=preview_file, inputs=file_input, outputs=data_preview)
|
| 328 |
+
|
| 329 |
with gr.Tab("Generate & Execute Analysis (Gradio Mode)"):
|
| 330 |
nl_query = gr.Textbox(
|
| 331 |
+
label="Enter your query",
|
| 332 |
+
placeholder="e.g., Generate summary statistics and charts for Gender and Age distributions",
|
| 333 |
)
|
| 334 |
generate_btn = gr.Button("Generate & Execute Code")
|
| 335 |
analysis_output = gr.Textbox(label="Analysis Result", lines=10)
|
|
|
|
| 337 |
preview_output = gr.Dataframe(label="Data Preview")
|
| 338 |
charts_output = gr.Gallery(label="Charts", show_label=True)
|
| 339 |
interpretation_output = gr.Markdown(label="Interpretation")
|
| 340 |
+
|
| 341 |
generate_btn.click(
|
| 342 |
fn=lambda query, file: generate_and_run(query, file, interactive_mode=True),
|
| 343 |
inputs=[nl_query, file_input],
|
| 344 |
+
outputs=[analysis_output, code_output, preview_output, charts_output, interpretation_output],
|
| 345 |
)
|
| 346 |
+
|
| 347 |
# Launch the app. This main block is useful for Hugging Face Spaces.
|
| 348 |
if __name__ == "__main__":
|
| 349 |
demo.launch()
|
| 350 |
# demo.launch(auth=("username", "password"))
|
|
|