CSV

Sleeping

App Files Files Community

alexacido committed on Jan 14

Commit

85c7df6

verified ·

1 Parent(s): 56295c4

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -89

app.py CHANGED Viewed

@@ -1,17 +1,24 @@
 import os
 import io
 import re
 import gradio as gr
 import pandas as pd
-import openai
 import matplotlib.pyplot as plt
-from dotenv import load_dotenv
 from PIL import Image
 import traceback
-# Load your OpenAI API key from the environment (Hugging Face Spaces secrets will populate it)
-load_dotenv()
-openai.api_key = os.getenv("OPENAI_API_KEY")
 def load_file(file):
     """Load a CSV or Excel file into a pandas DataFrame."""
@@ -25,12 +32,12 @@ def load_file(file):
         if file_path is None:
             return None
         try:
-            if file_name.endswith('.csv'):
                 df = pd.read_csv(file_path)
-            elif file_name.endswith('.xlsx'):
-                df = pd.read_excel(file_path, engine='openpyxl')
-            elif file_name.endswith('.xls'):
-                df = pd.read_excel(file_path, engine='xlrd')
             else:
                 return None
         except Exception as e:
@@ -41,12 +48,12 @@ def load_file(file):
         # Assume file is a file-like object (as on your local machine)
         file_name = file.name.lower()
         try:
-            if file_name.endswith('.csv'):
                 df = pd.read_csv(file)
-            elif file_name.endswith('.xlsx'):
-                df = pd.read_excel(file, engine='openpyxl')
-            elif file_name.endswith('.xls'):
-                df = pd.read_excel(file, engine='xlrd')
             else:
                 return None
         except Exception as e:
@@ -54,24 +61,16 @@ def load_file(file):
             return None
         return df
 def preview_file(file):
     """Return the DataFrame for preview."""
     df = load_file(file)
     if df is None:
-        # Return a DataFrame with an error message instead of a plain string
         return pd.DataFrame({"Error": ["Error loading file or unsupported file type."]})
     return df
 def generate_basic_understanding_code(df_preview):
-    """
-    Generate Python code that performs an exploratory analysis on the DataFrame.
-    The generated code should output a variable 'basic_info' that is a dictionary containing:
-    - The data types of each column.
-    - For numeric columns, summary statistics (mean, median, std, etc.).
-    - For non-numeric columns, counts, unique values, mode, and frequency distributions.
-    If charts are generated, ensure plt.show() is called after each chart so they can be captured.
-    Note: When converting dates, use pd.to_datetime() without a fixed format or with dayfirst=True.
-    """
     prompt = f"""
 You are a data analysis expert. Write Python code that performs an exploratory analysis of the DataFrame.
 Assume a pandas DataFrame named 'df' is already loaded.
@@ -83,37 +82,26 @@ For each column in df, include its data type.
 When converting date strings to datetime, use pd.to_datetime() without a fixed format or with dayfirst=True.
 If your analysis includes charts, call plt.show() after each chart so they can be captured.
 Only reference columns that are present in df.columns.
 Note: The following safe built-ins are available: list, dict, set, tuple, abs, min, max, sum, len, range, print, pd, plt, __import__.
 DataFrame preview:
 Columns: {list(df_preview.columns)}
 Sample Data (first 3 rows):
 {df_preview.head(3).to_dict(orient='records')}
 """
-    response = openai.chat.completions.create(
-        model="gpt-4o-mini",
         messages=[
             {"role": "system", "content": "You are an expert data analysis assistant who outputs only raw Python code."},
-            {"role": "user", "content": prompt}
         ],
         temperature=0.3,
         max_tokens=3500,
     )
-    code = response.choices[0].message.content.strip()
     return code
 def generate_problem_solving_code(nl_query, df_preview, basic_info):
-    """
-    Generate Python code that solves the user's analysis query.
-    The code should assume that the DataFrame 'df' is loaded and that the variable 'basic_info'
-    (the output from the initial exploratory analysis) is available.
-    The final analysis should be assigned to a variable named 'result' as a dictionary with keys:
-    'summary', 'detailed_stats', 'insights', and 'chart_descriptions'.
-    If charts are generated, call plt.show() after each chart so they can be captured.
-    Note: When converting date strings to datetime, use pd.to_datetime() without a fixed format or with dayfirst=True.
-    Only reference columns that are present in df.columns.
-    """
     prompt = f"""
 You are a data analysis expert. Write Python code that performs the analysis as described below.
 Assume a pandas DataFrame named 'df' is already loaded and that you have already generated an exploratory summary stored in 'basic_info'.
@@ -127,28 +115,26 @@ Incorporate insights from 'basic_info' if relevant.
 When converting date strings to datetime, use pd.to_datetime() without a fixed format or with dayfirst=True.
 If your analysis includes charts, call plt.show() after each chart so they can be captured.
 Only reference columns that are present in df.columns.
 Note: The following safe built-ins are available: list, dict, set, tuple, abs, min, max, sum, len, range, print, pd, plt, __import__.
 DataFrame preview:
 Columns: {list(df_preview.columns)}
 Sample Data (first 3 rows):
 {df_preview.head(3).to_dict(orient='records')}
 User Query: "{nl_query}"
 """
-    response = openai.chat.completions.create(
-        model="gpt-4o-mini",
         messages=[
             {"role": "system", "content": "You are an expert data analysis assistant who outputs only raw Python code."},
-            {"role": "user", "content": prompt}
         ],
         temperature=0.3,
         max_tokens=3500,
     )
-    code = response.choices[0].message.content.strip()
     return code
 def validate_generated_code(code, df):
     """
     Validate that the generated code references only columns that exist in the DataFrame.
@@ -161,6 +147,7 @@ def validate_generated_code(code, df):
         return False, missing_cols
     return True, []
 def safe_exec_code(code, df, capture_charts=True, interactive=False, extra_globals=None):
     """
     Execute the generated code in a restricted namespace.
@@ -171,13 +158,15 @@ def safe_exec_code(code, df, capture_charts=True, interactive=False, extra_globa
     code_lines = code.splitlines()
     clean_lines = [line for line in code_lines if not line.strip().startswith("```")]
     clean_code = "\n".join(clean_lines).strip()
     # Validate that the generated code references only existing DataFrame columns.
     valid, missing_cols = validate_generated_code(clean_code, df)
     if not valid:
-        return (f"Generated code references missing columns: {missing_cols}\nPlease adjust your prompt or data.",
-                [])
     # Expanded safe built-ins. Including float, int, bool, etc.
     safe_builtins = {
         "abs": abs,
@@ -205,10 +194,11 @@ def safe_exec_code(code, df, capture_charts=True, interactive=False, extra_globa
         "__import__": __import__,
     }
     safe_globals = {"__builtins__": safe_builtins, "df": df, "plt": plt, "charts": []}
     # Pre-import seaborn as sns if available.
     try:
         import seaborn as sns
         safe_globals["sns"] = sns
     except ImportError:
         pass
@@ -216,8 +206,9 @@ def safe_exec_code(code, df, capture_charts=True, interactive=False, extra_globa
     if extra_globals is not None:
         safe_globals.update(extra_globals)
     safe_locals = {}
     if capture_charts:
         def custom_show(*args, **kwargs):
             buf = io.BytesIO()
             plt.savefig(buf, format="png")
@@ -225,24 +216,30 @@ def safe_exec_code(code, df, capture_charts=True, interactive=False, extra_globa
             img = Image.open(buf).convert("RGB")
             safe_globals["charts"].append(img)
             plt.close()
         safe_globals["plt"].show = custom_show
     try:
-        # Directly execute the multi-line generated code.
         exec(clean_code, safe_globals, safe_locals)
         output = safe_locals.get("result", None)
         if output is None:
             output = safe_locals.get("basic_info", None)
-    except Exception as ex:
         error_details = traceback.format_exc()
         if "ValueError: time data" in error_details:
-            error_details += "\nHint: The generated code might be using a fixed datetime format. Consider using pd.to_datetime() without a fixed format or with dayfirst=True."
         if "KeyError" in error_details:
             error_details += "\nHint: The generated code might be referencing columns that do not exist in your DataFrame."
         if "NameError" in error_details:
-            error_details += "\nHint: Ensure that all required built-in types and libraries (like float, int, etc.) are included in the safe built-ins."
         return f"An error occurred during code execution:\n{error_details}", safe_globals["charts"]
     if capture_charts and not safe_globals["charts"]:
         fig_nums = plt.get_fignums()
         for num in fig_nums:
@@ -253,50 +250,48 @@ def safe_exec_code(code, df, capture_charts=True, interactive=False, extra_globa
             img = Image.open(buf).convert("RGB")
             safe_globals["charts"].append(img)
         plt.close("all")
     if interactive:
         for img in safe_globals["charts"]:
             img.show()
     if output is None:
         output = "No output variable ('result' or 'basic_info') was set by the code."
     return output, safe_globals["charts"]
 def generate_interpretation(analysis_result, nl_query):
     """
-    Use OpenAI to generate a detailed interpretation of the analysis result.
     Provide context from the user's query and explain what the results mean.
     The response will be formatted in markdown.
     """
     prompt = f"""
 You are a knowledgeable data analyst. Based on the following analysis result and the user's query, provide a detailed interpretation and descriptive analysis of the results. Explain what the results mean, any insights that can be drawn, and any potential limitations.
 Please format your output in markdown (including headers, bullet points, and other markdown formatting as appropriate).
 User Query: "{nl_query}"
 Analysis Result:
 {analysis_result}
 Provide a clear and detailed explanation in plain language.
 """
-    response = openai.chat.completions.create(
-         model="gpt-4o-mini",
-         messages=[
-             {"role": "system", "content": "You are an expert data analysis assistant who explains analysis results clearly."},
-             {"role": "user", "content": prompt}
-         ],
-         temperature=0.5,
-         max_tokens=5000,
     )
-    interpretation = response.choices[0].message.content.strip()
     return interpretation
 def generate_and_run(nl_query, file, interactive_mode=False):
     """
-    Load the file, generate both a basic understanding and a detailed analysis code using OpenAI,
     execute the generated code, and then generate an interpretation of the analysis result.
     Returns a tuple: (analysis result, combined generated code, DataFrame preview, charts, interpretation).
     The process is split into two steps:
     1. Generate basic understanding code that produces 'basic_info'.
     2. Generate problem-solving code that uses 'basic_info' and produces the final analysis ('result').
@@ -304,34 +299,37 @@ def generate_and_run(nl_query, file, interactive_mode=False):
     df = load_file(file)
     if df is None:
         return "Error loading file.", "", pd.DataFrame({"Error": ["No data available."]}), [], ""
     df_preview = df.copy()
     # Step 1: Generate and execute basic understanding code.
     basic_code = generate_basic_understanding_code(df_preview)
     basic_info, basic_charts = safe_exec_code(basic_code, df, capture_charts=False, interactive=interactive_mode)
     # Step 2: Generate and execute problem-solving code, injecting basic_info.
     problem_code = generate_problem_solving_code(nl_query, df_preview, basic_info)
-    result, problem_charts = safe_exec_code(problem_code, df, capture_charts=True, interactive=interactive_mode, extra_globals={"basic_info": basic_info})
     interpretation = generate_interpretation(result, nl_query)
     combined_code = f"### Basic Understanding Code:\n{basic_code}\n\n### Problem Solving Code:\n{problem_code}"
     combined_charts = basic_charts + problem_charts
     return result, combined_code, df_preview, combined_charts, interpretation
 # Gradio interface setup
 with gr.Blocks() as demo:
     gr.Markdown("## Dynamic Data Analysis with Two-Step Code Generation and Interpretation")
     with gr.Tab("Data Upload & Preview"):
         file_input = gr.File(label="Upload CSV or Excel file (.csv, .xls, .xlsx)")
         data_preview = gr.Dataframe(label="Data Preview")
         file_input.change(fn=preview_file, inputs=file_input, outputs=data_preview)
     with gr.Tab("Generate & Execute Analysis (Gradio Mode)"):
         nl_query = gr.Textbox(
-            label="Enter your query",
-            placeholder="e.g., Generate summary statistics and charts for Gender and Age distributions"
         )
         generate_btn = gr.Button("Generate & Execute Code")
         analysis_output = gr.Textbox(label="Analysis Result", lines=10)
@@ -339,15 +337,14 @@ with gr.Blocks() as demo:
         preview_output = gr.Dataframe(label="Data Preview")
         charts_output = gr.Gallery(label="Charts", show_label=True)
         interpretation_output = gr.Markdown(label="Interpretation")
         generate_btn.click(
             fn=lambda query, file: generate_and_run(query, file, interactive_mode=True),
             inputs=[nl_query, file_input],
-            outputs=[analysis_output, code_output, preview_output, charts_output, interpretation_output]
         )
 # Launch the app. This main block is useful for Hugging Face Spaces.
 if __name__ == "__main__":
     demo.launch()
     # demo.launch(auth=("username", "password"))

+# app.py
 import os
 import io
 import re
 import gradio as gr
 import pandas as pd
 import matplotlib.pyplot as plt
 from PIL import Image
 import traceback
+from groq import Groq
+# Groq config (Secrets en Hugging Face Space)
+GROQ_API_KEY = (os.getenv("GROQ_API_KEY") or "").strip()
+GROQ_MODEL = (os.getenv("GROQ_MODEL") or "llama-3.3-70b-versatile").strip()
+if not GROQ_API_KEY:
+    raise RuntimeError("Falta GROQ_API_KEY en Secrets del Space.")
+groq_client = Groq(api_key=GROQ_API_KEY)
 def load_file(file):
     """Load a CSV or Excel file into a pandas DataFrame."""
         if file_path is None:
             return None
         try:
+            if file_name.endswith(".csv"):
                 df = pd.read_csv(file_path)
+            elif file_name.endswith(".xlsx"):
+                df = pd.read_excel(file_path, engine="openpyxl")
+            elif file_name.endswith(".xls"):
+                df = pd.read_excel(file_path, engine="xlrd")
             else:
                 return None
         except Exception as e:
         # Assume file is a file-like object (as on your local machine)
         file_name = file.name.lower()
         try:
+            if file_name.endswith(".csv"):
                 df = pd.read_csv(file)
+            elif file_name.endswith(".xlsx"):
+                df = pd.read_excel(file, engine="openpyxl")
+            elif file_name.endswith(".xls"):
+                df = pd.read_excel(file, engine="xlrd")
             else:
                 return None
         except Exception as e:
             return None
         return df
 def preview_file(file):
     """Load the uploaded file and hand back a DataFrame for the preview table.

     The file is read through ``load_file``. When that call yields ``None``,
     a one-row DataFrame with a single "Error" column is returned instead,
     so the caller always receives a DataFrame.
     """
     loaded = load_file(file)
     if loaded is not None:
         return loaded
     # Loading failed: surface the problem as tabular data rather than None.
     return pd.DataFrame({"Error": ["Error loading file or unsupported file type."]})
 def generate_basic_understanding_code(df_preview):
     prompt = f"""
 You are a data analysis expert. Write Python code that performs an exploratory analysis of the DataFrame.
 Assume a pandas DataFrame named 'df' is already loaded.
 When converting date strings to datetime, use pd.to_datetime() without a fixed format or with dayfirst=True.
 If your analysis includes charts, call plt.show() after each chart so they can be captured.
 Only reference columns that are present in df.columns.
 Note: The following safe built-ins are available: list, dict, set, tuple, abs, min, max, sum, len, range, print, pd, plt, __import__.
 DataFrame preview:
 Columns: {list(df_preview.columns)}
 Sample Data (first 3 rows):
 {df_preview.head(3).to_dict(orient='records')}
 """
+    response = groq_client.chat.completions.create(
+        model=GROQ_MODEL,
         messages=[
             {"role": "system", "content": "You are an expert data analysis assistant who outputs only raw Python code."},
+            {"role": "user", "content": prompt},
         ],
         temperature=0.3,
         max_tokens=3500,
     )
+    code = (response.choices[0].message.content or "").strip()
     return code
 def generate_problem_solving_code(nl_query, df_preview, basic_info):
     prompt = f"""
 You are a data analysis expert. Write Python code that performs the analysis as described below.
 Assume a pandas DataFrame named 'df' is already loaded and that you have already generated an exploratory summary stored in 'basic_info'.
 When converting date strings to datetime, use pd.to_datetime() without a fixed format or with dayfirst=True.
 If your analysis includes charts, call plt.show() after each chart so they can be captured.
 Only reference columns that are present in df.columns.
 Note: The following safe built-ins are available: list, dict, set, tuple, abs, min, max, sum, len, range, print, pd, plt, __import__.
 DataFrame preview:
 Columns: {list(df_preview.columns)}
 Sample Data (first 3 rows):
 {df_preview.head(3).to_dict(orient='records')}
 User Query: "{nl_query}"
 """
+    response = groq_client.chat.completions.create(
+        model=GROQ_MODEL,
         messages=[
             {"role": "system", "content": "You are an expert data analysis assistant who outputs only raw Python code."},
+            {"role": "user", "content": prompt},
         ],
         temperature=0.3,
         max_tokens=3500,
     )
+    code = (response.choices[0].message.content or "").strip()
     return code
 def validate_generated_code(code, df):
     """
     Validate that the generated code references only columns that exist in the DataFrame.
         return False, missing_cols
     return True, []
 def safe_exec_code(code, df, capture_charts=True, interactive=False, extra_globals=None):
     """
     Execute the generated code in a restricted namespace.
     code_lines = code.splitlines()
     clean_lines = [line for line in code_lines if not line.strip().startswith("```")]
     clean_code = "\n".join(clean_lines).strip()
     # Validate that the generated code references only existing DataFrame columns.
     valid, missing_cols = validate_generated_code(clean_code, df)
     if not valid:
+        return (
+            f"Generated code references missing columns: {missing_cols}\nPlease adjust your prompt or data.",
+            [],
+        )
     # Expanded safe built-ins. Including float, int, bool, etc.
     safe_builtins = {
         "abs": abs,
         "__import__": __import__,
     }
     safe_globals = {"__builtins__": safe_builtins, "df": df, "plt": plt, "charts": []}
     # Pre-import seaborn as sns if available.
     try:
         import seaborn as sns
         safe_globals["sns"] = sns
     except ImportError:
         pass
     if extra_globals is not None:
         safe_globals.update(extra_globals)
     safe_locals = {}
     if capture_charts:
         def custom_show(*args, **kwargs):
             buf = io.BytesIO()
             plt.savefig(buf, format="png")
             img = Image.open(buf).convert("RGB")
             safe_globals["charts"].append(img)
             plt.close()
         safe_globals["plt"].show = custom_show
     try:
         exec(clean_code, safe_globals, safe_locals)
         output = safe_locals.get("result", None)
         if output is None:
             output = safe_locals.get("basic_info", None)
+    except Exception:
         error_details = traceback.format_exc()
         if "ValueError: time data" in error_details:
+            error_details += (
+                "\nHint: The generated code might be using a fixed datetime format. "
+                "Consider using pd.to_datetime() without a fixed format or with dayfirst=True."
+            )
         if "KeyError" in error_details:
             error_details += "\nHint: The generated code might be referencing columns that do not exist in your DataFrame."
         if "NameError" in error_details:
+            error_details += (
+                "\nHint: Ensure that all required built-in types and libraries (like float, int, etc.) "
+                "are included in the safe built-ins."
+            )
         return f"An error occurred during code execution:\n{error_details}", safe_globals["charts"]
     if capture_charts and not safe_globals["charts"]:
         fig_nums = plt.get_fignums()
         for num in fig_nums:
             img = Image.open(buf).convert("RGB")
             safe_globals["charts"].append(img)
         plt.close("all")
     if interactive:
         for img in safe_globals["charts"]:
             img.show()
     if output is None:
         output = "No output variable ('result' or 'basic_info') was set by the code."
     return output, safe_globals["charts"]
 def generate_interpretation(analysis_result, nl_query):
     """
+    Use Groq to generate a detailed interpretation of the analysis result.
     Provide context from the user's query and explain what the results mean.
     The response will be formatted in markdown.
     """
     prompt = f"""
 You are a knowledgeable data analyst. Based on the following analysis result and the user's query, provide a detailed interpretation and descriptive analysis of the results. Explain what the results mean, any insights that can be drawn, and any potential limitations.
 Please format your output in markdown (including headers, bullet points, and other markdown formatting as appropriate).
 User Query: "{nl_query}"
 Analysis Result:
 {analysis_result}
 Provide a clear and detailed explanation in plain language.
 """
+    response = groq_client.chat.completions.create(
+        model=GROQ_MODEL,
+        messages=[
+            {"role": "system", "content": "You are an expert data analysis assistant who explains analysis results clearly."},
+            {"role": "user", "content": prompt},
+        ],
+        temperature=0.5,
+        max_tokens=5000,
     )
+    interpretation = (response.choices[0].message.content or "").strip()
     return interpretation
 def generate_and_run(nl_query, file, interactive_mode=False):
     """
+    Load the file, generate both a basic understanding and a detailed analysis code using Groq,
     execute the generated code, and then generate an interpretation of the analysis result.
     Returns a tuple: (analysis result, combined generated code, DataFrame preview, charts, interpretation).
     The process is split into two steps:
     1. Generate basic understanding code that produces 'basic_info'.
     2. Generate problem-solving code that uses 'basic_info' and produces the final analysis ('result').
     df = load_file(file)
     if df is None:
         return "Error loading file.", "", pd.DataFrame({"Error": ["No data available."]}), [], ""
     df_preview = df.copy()
     # Step 1: Generate and execute basic understanding code.
     basic_code = generate_basic_understanding_code(df_preview)
     basic_info, basic_charts = safe_exec_code(basic_code, df, capture_charts=False, interactive=interactive_mode)
     # Step 2: Generate and execute problem-solving code, injecting basic_info.
     problem_code = generate_problem_solving_code(nl_query, df_preview, basic_info)
+    result, problem_charts = safe_exec_code(
+        problem_code, df, capture_charts=True, interactive=interactive_mode, extra_globals={"basic_info": basic_info}
+    )
     interpretation = generate_interpretation(result, nl_query)
     combined_code = f"### Basic Understanding Code:\n{basic_code}\n\n### Problem Solving Code:\n{problem_code}"
     combined_charts = basic_charts + problem_charts
     return result, combined_code, df_preview, combined_charts, interpretation
 # Gradio interface setup
 with gr.Blocks() as demo:
     gr.Markdown("## Dynamic Data Analysis with Two-Step Code Generation and Interpretation")
     with gr.Tab("Data Upload & Preview"):
         file_input = gr.File(label="Upload CSV or Excel file (.csv, .xls, .xlsx)")
         data_preview = gr.Dataframe(label="Data Preview")
         file_input.change(fn=preview_file, inputs=file_input, outputs=data_preview)
     with gr.Tab("Generate & Execute Analysis (Gradio Mode)"):
         nl_query = gr.Textbox(
+            label="Enter your query",
+            placeholder="e.g., Generate summary statistics and charts for Gender and Age distributions",
         )
         generate_btn = gr.Button("Generate & Execute Code")
         analysis_output = gr.Textbox(label="Analysis Result", lines=10)
         preview_output = gr.Dataframe(label="Data Preview")
         charts_output = gr.Gallery(label="Charts", show_label=True)
         interpretation_output = gr.Markdown(label="Interpretation")
         generate_btn.click(
             fn=lambda query, file: generate_and_run(query, file, interactive_mode=True),
             inputs=[nl_query, file_input],
+            outputs=[analysis_output, code_output, preview_output, charts_output, interpretation_output],
         )
 # Launch the app. This main block is useful for Hugging Face Spaces.
 if __name__ == "__main__":
     demo.launch()
     # demo.launch(auth=("username", "password"))