Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,48 +12,51 @@ from pathlib import Path
|
|
| 12 |
import pandas as pd
|
| 13 |
from dotenv import load_dotenv
|
| 14 |
from fastapi import FastAPI, UploadFile, File, HTTPException, Body
|
| 15 |
-
from fastapi.responses import JSONResponse
|
| 16 |
from pydantic import BaseModel, Field
|
| 17 |
|
| 18 |
from google import genai
|
| 19 |
-
from google.
|
| 20 |
|
| 21 |
# -----------------------------
|
| 22 |
# Initial Configuration
|
| 23 |
# -----------------------------
|
| 24 |
|
| 25 |
-
# Load environment variables from
|
| 26 |
load_dotenv()
|
| 27 |
|
| 28 |
# Set up logging
|
| 29 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
| 30 |
logger = logging.getLogger(__name__)
|
| 31 |
|
| 32 |
-
#
|
| 33 |
-
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
# -----------------------------
|
| 37 |
# Initialize Gemini Client & FastAPI App
|
| 38 |
# -----------------------------
|
| 39 |
|
| 40 |
-
# Configure the Gemini client with the API key from environment variables
|
| 41 |
try:
|
| 42 |
-
api_key = "
|
| 43 |
if not api_key:
|
| 44 |
-
raise ValueError("GOOGLE_API_KEY not found in environment variables.")
|
| 45 |
genai.configure(api_key=api_key)
|
| 46 |
logger.info("Google GenAI client configured successfully.")
|
| 47 |
except Exception as e:
|
| 48 |
-
logger.error(f"Failed to configure Google GenAI client: {e}")
|
| 49 |
-
#
|
| 50 |
sys.exit(1)
|
| 51 |
|
| 52 |
# Initialize FastAPI app
|
| 53 |
app = FastAPI(
|
| 54 |
title="Data Analysis and Visualization API",
|
| 55 |
description="An API to analyze Excel files and generate Python code for visualizations using Google's Gemini.",
|
| 56 |
-
version="1.
|
| 57 |
)
|
| 58 |
|
| 59 |
# -----------------------------
|
|
@@ -77,7 +80,7 @@ class VisualizationResponse(BaseModel):
|
|
| 77 |
|
| 78 |
|
| 79 |
# -----------------------------
|
| 80 |
-
# Helper Functions
|
| 81 |
# -----------------------------
|
| 82 |
|
| 83 |
def get_metadata(df: pd.DataFrame) -> dict:
|
|
@@ -93,16 +96,16 @@ def get_metadata(df: pd.DataFrame) -> dict:
|
|
| 93 |
def generate_metadata_analysis(metadata: dict) -> dict:
|
| 94 |
"""Generates a JSON summary and suggestions from metadata using Gemini."""
|
| 95 |
metadata_text = json.dumps(metadata, indent=2)
|
| 96 |
-
model = "gemini-pro"
|
| 97 |
|
| 98 |
system_instruction = """
|
| 99 |
You are a structured data analysis AI. Your output must be strict JSON.
|
| 100 |
|
| 101 |
1. Summary:
|
| 102 |
-
Provide a concise description of what kind of data this is, what it likely represents, and its domain or use-case.
|
| 103 |
|
| 104 |
2. Suggestions:
|
| 105 |
-
Provide exactly three actionable analyses and visualizations based on the metadata.
|
| 106 |
|
| 107 |
Respond in this exact JSON format:
|
| 108 |
{
|
|
@@ -110,7 +113,6 @@ def generate_metadata_analysis(metadata: dict) -> dict:
|
|
| 110 |
"suggestions": ["<analysis #1>", "<analysis #2>", "<analysis #3>"]
|
| 111 |
}
|
| 112 |
"""
|
| 113 |
-
|
| 114 |
try:
|
| 115 |
response = genai.GenerativeModel(
|
| 116 |
model_name=model,
|
|
@@ -129,16 +131,13 @@ def generate_visualization_code(file_path: str, command: str) -> dict:
|
|
| 129 |
model = "gemini-pro"
|
| 130 |
|
| 131 |
system_instruction = f"""
|
| 132 |
-
You are a Python assistant that MUST return output strictly in JSON format
|
| 133 |
-
The
|
| 134 |
-
|
| 135 |
-
Requirements:
|
| 136 |
-
- "type": The suggested visualization type as a lowercase string (e.g., "bar", "pie", "line", "scatter").
|
| 137 |
-
- "code": A string of Python code. This code MUST print a JSON object to standard output. The JSON should contain the data needed for the plot. Use pandas to process the data.
|
| 138 |
-
- The code must access the data using this exact line: df = pd.read_excel(r"{file_path}")
|
| 139 |
-
- "explanation": A concise, one-sentence description of what the visualization shows.
|
| 140 |
-
"""
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
try:
|
| 143 |
response = genai.GenerativeModel(
|
| 144 |
model_name=model,
|
|
@@ -169,12 +168,10 @@ async def analyze_file(file: UploadFile = File(...)):
|
|
| 169 |
file_path = UPLOADS_DIR / f"{file_id}_{file.filename}"
|
| 170 |
|
| 171 |
try:
|
| 172 |
-
# Save the uploaded file
|
| 173 |
with open(file_path, "wb") as buffer:
|
| 174 |
buffer.write(await file.read())
|
| 175 |
-
logger.info(f"File '{file.filename}' saved
|
| 176 |
|
| 177 |
-
# Process the file
|
| 178 |
df = pd.read_excel(file_path)
|
| 179 |
metadata = get_metadata(df)
|
| 180 |
logger.info(f"Metadata extracted for file_id: {file_id}")
|
|
@@ -187,13 +184,11 @@ async def analyze_file(file: UploadFile = File(...)):
|
|
| 187 |
summary=analysis.get("summary", "No summary provided."),
|
| 188 |
suggestions=analysis.get("suggestions", [])
|
| 189 |
)
|
| 190 |
-
|
| 191 |
except Exception as e:
|
| 192 |
logger.error(f"An error occurred during file analysis: {e}")
|
| 193 |
-
# Clean up the saved file in case of an error
|
| 194 |
if file_path.exists():
|
| 195 |
os.remove(file_path)
|
| 196 |
-
raise HTTPException(status_code=500, detail=f"An internal error occurred: {e}")
|
| 197 |
|
| 198 |
|
| 199 |
@app.post("/visualize", response_model=VisualizationResponse)
|
|
@@ -202,40 +197,29 @@ async def visualize_data(request: VisualizationRequest):
|
|
| 202 |
Generate and execute Python code for a visualization based on a file_id
|
| 203 |
and a selected command from the analysis step.
|
| 204 |
"""
|
| 205 |
-
# Find the file corresponding to the file_id
|
| 206 |
matching_files = list(UPLOADS_DIR.glob(f"{request.file_id}_*"))
|
| 207 |
if not matching_files:
|
| 208 |
-
logger.error(f"File with ID '{request.file_id}' not found.")
|
| 209 |
-
raise HTTPException(status_code=404, detail="File not found. Please re-upload
|
| 210 |
|
| 211 |
file_path = matching_files[0]
|
| 212 |
logger.info(f"Found file '{file_path}' for file_id '{request.file_id}'")
|
| 213 |
|
| 214 |
-
# Generate the visualization code from Gemini
|
| 215 |
agent_output = generate_visualization_code(str(file_path), request.command)
|
| 216 |
code_to_run = agent_output.get("code")
|
| 217 |
|
| 218 |
if not code_to_run:
|
| 219 |
raise HTTPException(status_code=500, detail="AI model failed to generate valid code.")
|
| 220 |
-
|
| 221 |
logger.info(f"Code generated for command: '{request.command}'")
|
| 222 |
|
| 223 |
-
# --- Safe Code Execution using subprocess ---
|
| 224 |
try:
|
| 225 |
logger.info("Executing generated code in a sandboxed subprocess...")
|
| 226 |
process = subprocess.run(
|
| 227 |
[sys.executable, "-c", code_to_run],
|
| 228 |
-
capture_output=True,
|
| 229 |
-
text=True,
|
| 230 |
-
check=True, # Raises CalledProcessError for non-zero exit codes
|
| 231 |
-
timeout=15 # Add a timeout for safety
|
| 232 |
)
|
| 233 |
-
|
| 234 |
-
# The output from the script is expected to be a JSON string
|
| 235 |
stdout = process.stdout.strip()
|
| 236 |
-
logger.info(f"Code executed successfully. Stdout: {stdout
|
| 237 |
-
|
| 238 |
-
# Parse the JSON output from the executed code
|
| 239 |
chart_data = json.loads(stdout)
|
| 240 |
|
| 241 |
return VisualizationResponse(
|
|
@@ -244,12 +228,11 @@ async def visualize_data(request: VisualizationRequest):
|
|
| 244 |
data=chart_data,
|
| 245 |
generated_code=code_to_run
|
| 246 |
)
|
| 247 |
-
|
| 248 |
except subprocess.CalledProcessError as e:
|
| 249 |
logger.error(f"Error executing generated code. Stderr: {e.stderr}")
|
| 250 |
raise HTTPException(status_code=500, detail=f"Error during code execution: {e.stderr}")
|
| 251 |
except json.JSONDecodeError:
|
| 252 |
-
logger.error(f"Failed to decode JSON from
|
| 253 |
raise HTTPException(status_code=500, detail="Generated code did not produce valid JSON output.")
|
| 254 |
except subprocess.TimeoutExpired:
|
| 255 |
logger.error("Code execution timed out.")
|
|
@@ -261,4 +244,4 @@ async def visualize_data(request: VisualizationRequest):
|
|
| 261 |
|
| 262 |
@app.get("/", include_in_schema=False)
|
| 263 |
def root():
|
| 264 |
-
return {"message": "Welcome to the Data Analysis
|
|
|
|
| 12 |
import pandas as pd
|
| 13 |
from dotenv import load_dotenv
|
| 14 |
from fastapi import FastAPI, UploadFile, File, HTTPException, Body
|
|
|
|
| 15 |
from pydantic import BaseModel, Field
|
| 16 |
|
| 17 |
from google import genai
|
| 18 |
+
from google.generativeai import types
|
| 19 |
|
| 20 |
# -----------------------------
|
| 21 |
# Initial Configuration
|
| 22 |
# -----------------------------
|
| 23 |
|
| 24 |
+
# Load environment variables from a local .env file if one exists; on Hugging Face Spaces, secrets are already exposed as environment variables.
|
| 25 |
load_dotenv()
|
| 26 |
|
| 27 |
# Set up logging
|
| 28 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
| 29 |
logger = logging.getLogger(__name__)
|
| 30 |
|
| 31 |
+
# --- MODIFICATION FOR HUGGING FACE ---
|
| 32 |
+
# Use the /tmp directory for ephemeral file storage.
|
| 33 |
+
# This is a standard temporary directory in Linux environments like HF Spaces.
|
| 34 |
+
UPLOADS_DIR = Path("/tmp/uploads")
|
| 35 |
+
# Create the directory; parents=True ensures creation of parent dirs if needed.
|
| 36 |
+
UPLOADS_DIR.mkdir(parents=True, exist_ok=True)
|
| 37 |
+
logger.info(f"Using temporary directory for uploads: {UPLOADS_DIR}")
|
| 38 |
|
| 39 |
# -----------------------------
|
| 40 |
# Initialize Gemini Client & FastAPI App
|
| 41 |
# -----------------------------
|
| 42 |
|
| 43 |
+
# Configure the Gemini client with the API key from environment variables/secrets
|
| 44 |
try:
|
| 45 |
+
api_key = os.getenv("GOOGLE_API_KEY")
|
| 46 |
if not api_key:
|
| 47 |
+
raise ValueError("GOOGLE_API_KEY not found in environment variables or secrets.")
|
| 48 |
genai.configure(api_key=api_key)
|
| 49 |
logger.info("Google GenAI client configured successfully.")
|
| 50 |
except Exception as e:
|
| 51 |
+
logger.error(f"FATAL: Failed to configure Google GenAI client: {e}")
|
| 52 |
+
# Exit if the client can't be configured, as the app is non-functional without it.
|
| 53 |
sys.exit(1)
|
| 54 |
|
| 55 |
# Initialize FastAPI app
|
| 56 |
app = FastAPI(
|
| 57 |
title="Data Analysis and Visualization API",
|
| 58 |
description="An API to analyze Excel files and generate Python code for visualizations using Google's Gemini.",
|
| 59 |
+
version="1.1.0"
|
| 60 |
)
|
| 61 |
|
| 62 |
# -----------------------------
|
|
|
|
| 80 |
|
| 81 |
|
| 82 |
# -----------------------------
|
| 83 |
+
# Helper Functions
|
| 84 |
# -----------------------------
|
| 85 |
|
| 86 |
def get_metadata(df: pd.DataFrame) -> dict:
|
|
|
|
| 96 |
def generate_metadata_analysis(metadata: dict) -> dict:
|
| 97 |
"""Generates a JSON summary and suggestions from metadata using Gemini."""
|
| 98 |
metadata_text = json.dumps(metadata, indent=2)
|
| 99 |
+
model = "gemini-pro"
|
| 100 |
|
| 101 |
system_instruction = """
|
| 102 |
You are a structured data analysis AI. Your output must be strict JSON.
|
| 103 |
|
| 104 |
1. Summary:
|
| 105 |
+
Provide a concise description of what kind of data this is, what it likely represents, and its domain or use-case.
|
| 106 |
|
| 107 |
2. Suggestions:
|
| 108 |
+
Provide exactly three actionable analyses and visualizations based on the metadata.
|
| 109 |
|
| 110 |
Respond in this exact JSON format:
|
| 111 |
{
|
|
|
|
| 113 |
"suggestions": ["<analysis #1>", "<analysis #2>", "<analysis #3>"]
|
| 114 |
}
|
| 115 |
"""
|
|
|
|
| 116 |
try:
|
| 117 |
response = genai.GenerativeModel(
|
| 118 |
model_name=model,
|
|
|
|
| 131 |
model = "gemini-pro"
|
| 132 |
|
| 133 |
system_instruction = f"""
|
| 134 |
+
You are a Python assistant that MUST return output strictly in JSON format.
|
| 135 |
+
The JSON MUST contain exactly three keys: "type", "code", "explanation".
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
+
- "type": Lowercase visualization type (e.g., "bar", "pie", "line").
|
| 138 |
+
- "code": A string of Python code that prints a JSON object to standard output. The code must access data using this exact line: df = pd.read_excel(r"{file_path}")
|
| 139 |
+
- "explanation": A one-sentence description of the visualization.
|
| 140 |
+
"""
|
| 141 |
try:
|
| 142 |
response = genai.GenerativeModel(
|
| 143 |
model_name=model,
|
|
|
|
| 168 |
file_path = UPLOADS_DIR / f"{file_id}_{file.filename}"
|
| 169 |
|
| 170 |
try:
|
|
|
|
| 171 |
with open(file_path, "wb") as buffer:
|
| 172 |
buffer.write(await file.read())
|
| 173 |
+
logger.info(f"File '{file.filename}' saved to temp path '{file_path}'")
|
| 174 |
|
|
|
|
| 175 |
df = pd.read_excel(file_path)
|
| 176 |
metadata = get_metadata(df)
|
| 177 |
logger.info(f"Metadata extracted for file_id: {file_id}")
|
|
|
|
| 184 |
summary=analysis.get("summary", "No summary provided."),
|
| 185 |
suggestions=analysis.get("suggestions", [])
|
| 186 |
)
|
|
|
|
| 187 |
except Exception as e:
|
| 188 |
logger.error(f"An error occurred during file analysis: {e}")
|
|
|
|
| 189 |
if file_path.exists():
|
| 190 |
os.remove(file_path)
|
| 191 |
+
raise HTTPException(status_code=500, detail=f"An internal server error occurred: {e}")
|
| 192 |
|
| 193 |
|
| 194 |
@app.post("/visualize", response_model=VisualizationResponse)
|
|
|
|
| 197 |
Generate and execute Python code for a visualization based on a file_id
|
| 198 |
and a selected command from the analysis step.
|
| 199 |
"""
|
|
|
|
| 200 |
matching_files = list(UPLOADS_DIR.glob(f"{request.file_id}_*"))
|
| 201 |
if not matching_files:
|
| 202 |
+
logger.error(f"File with ID '{request.file_id}' not found in {UPLOADS_DIR}.")
|
| 203 |
+
raise HTTPException(status_code=404, detail="File not found. It may have been cleared from the temporary cache. Please re-upload.")
|
| 204 |
|
| 205 |
file_path = matching_files[0]
|
| 206 |
logger.info(f"Found file '{file_path}' for file_id '{request.file_id}'")
|
| 207 |
|
|
|
|
| 208 |
agent_output = generate_visualization_code(str(file_path), request.command)
|
| 209 |
code_to_run = agent_output.get("code")
|
| 210 |
|
| 211 |
if not code_to_run:
|
| 212 |
raise HTTPException(status_code=500, detail="AI model failed to generate valid code.")
|
|
|
|
| 213 |
logger.info(f"Code generated for command: '{request.command}'")
|
| 214 |
|
|
|
|
| 215 |
try:
|
| 216 |
logger.info("Executing generated code in a sandboxed subprocess...")
|
| 217 |
process = subprocess.run(
|
| 218 |
[sys.executable, "-c", code_to_run],
|
| 219 |
+
capture_output=True, text=True, check=True, timeout=20
|
|
|
|
|
|
|
|
|
|
| 220 |
)
|
|
|
|
|
|
|
| 221 |
stdout = process.stdout.strip()
|
| 222 |
+
logger.info(f"Code executed successfully. Stdout length: {len(stdout)}")
|
|
|
|
|
|
|
| 223 |
chart_data = json.loads(stdout)
|
| 224 |
|
| 225 |
return VisualizationResponse(
|
|
|
|
| 228 |
data=chart_data,
|
| 229 |
generated_code=code_to_run
|
| 230 |
)
|
|
|
|
| 231 |
except subprocess.CalledProcessError as e:
|
| 232 |
logger.error(f"Error executing generated code. Stderr: {e.stderr}")
|
| 233 |
raise HTTPException(status_code=500, detail=f"Error during code execution: {e.stderr}")
|
| 234 |
except json.JSONDecodeError:
|
| 235 |
+
logger.error(f"Failed to decode JSON from stdout. Output was: {stdout}")
|
| 236 |
raise HTTPException(status_code=500, detail="Generated code did not produce valid JSON output.")
|
| 237 |
except subprocess.TimeoutExpired:
|
| 238 |
logger.error("Code execution timed out.")
|
|
|
|
| 244 |
|
| 245 |
@app.get("/", include_in_schema=False)
|
| 246 |
def root():
|
| 247 |
+
return {"message": "Welcome to the Data Analysis API. Visit /docs for the API interface."}
|