Spaces:

triflix
/

testingops

Paused

App Files Files Community

triflix committed on Sep 24, 2025

Commit

81f703a

verified ·

1 Parent(s): e8c15b1

Upload 3 files

Browse files

Files changed (3) hide show

Dockerfile +20 -0
app.py +252 -0
requirements.txt +7 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,20 @@

+# Base image: slim CPython 3.9 on a currently supported Debian release.
+# (The "-buster" variants are end-of-life and no longer receive security updates.)
+FROM python:3.9-slim
+# Run as an unprivileged user with UID 1000 instead of root
+RUN useradd -m -u 1000 user
+USER user
+# pip installs console scripts (e.g. uvicorn) into the user's local bin directory
+ENV PATH="/home/user/.local/bin:$PATH"
+# Set the working directory in the container
+WORKDIR /app
+# Copy only requirements.txt first so the dependency layer is cached
+# until the requirements themselves change
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the rest of the application code
+COPY --chown=user . .
+# Port 7860 is the port the Hugging Face Space serves the app on
+EXPOSE 7860
+# Run the FastAPI app with Uvicorn, binding to 0.0.0.0 and the exposed port
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,252 @@

+import os
+import shutil
+import json
+import pandas as pd
+import base64
+from google import genai
+from google.genai import types
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from typing import List
+# ASGI application instance; the route decorators below register handlers on it.
+app = FastAPI()
+# Directory that holds uploaded files. It is created by the startup hook and
+# removed (with its contents) by the shutdown hook.
+TMP_DIR = "/tmp/fastapi_files"
+@app.on_event("startup")
+async def startup_event():
+    """Create the temporary directory on startup if it doesn't exist."""
+    os.makedirs(TMP_DIR, exist_ok=True)
+@app.on_event("shutdown")
+async def shutdown_event():
+    """Clean up the temporary directory on shutdown."""
+    if os.path.exists(TMP_DIR):
+        shutil.rmtree(TMP_DIR)
+def load_file(path: str):
+    ext = os.path.splitext(path)[-1].lower()
+    if ext == ".csv":
+        df = pd.read_csv(path)
+    elif ext in [".xls", ".xlsx"]:
+        # For API, we cannot interactively ask for sheet number.
+        # We'll assume the first sheet or require sheet_name as a parameter if needed.
+        # For now, let's just load the first sheet.
+        df = pd.read_excel(path, sheet_name=0)
+    else:
+        raise ValueError("Unsupported file type")
+    return df.copy()
+def preprocess(df, drop_thresh=0.5):
+    df = df.copy()
+    df.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]
+    df = df.loc[:, df.isnull().mean() < drop_thresh]
+    for col in df.columns:
+        if pd.api.types.is_numeric_dtype(df[col]):
+            df.loc[:, col] = df[col].fillna(df[col].median())
+        elif pd.api.types.is_datetime64_any_dtype(df[col]):
+            df.loc[:, col] = df[col].fillna(pd.Timestamp('1970-01-01'))
+        else:
+            df.loc[:, col] = df[col].fillna("Unknown")
+    for col in df.columns:
+        if df[col].dtype == 'object':
+            try:
+                df.loc[:, col] = pd.to_numeric(df[col])
+            except:
+                pass
+    df = df.drop_duplicates()
+    return df
+def metadata(df):
+    return {
+        "rows": df.shape[0],
+        "columns": df.shape[1],
+        "column_names": list(df.columns),
+        "column_types": df.dtypes.astype(str).to_dict(),
+        "unique_values": {col: df[col].nunique() for col in df.columns}
+    }
+def generate_summary(meta, fiverow):
+    client = genai.Client(api_key="AIzaSyDLa5cYGVVLVvKHuzBWVKJ-UtfQ7NgpRK0") # Use environment variable for API key
+    model = "gemini-2.5-flash-lite"
+    # direct structured system instruction enhanced with multiple layout templates
+    system_prompt = """
+You are a strict JSON generator.
+Input contains:
+- meta: dataframe metadata
+- fiverow: first 5 records of dataframe
+You must output JSON with the following structure:
+{
+  "summary": "<short natural language overview of dataset>",
+  "recommended_charts": [
+    {
+      "type": "<one of: bar, pie, timeseries, histogram, scatter, multiple_columns, stacked_bar, heatmap>",
+      "title": "<short title for chart>",
+      "columns": ["<col1>", "<col2>", "..."],
+      "python_code": "<full runnable Python code using seaborn/matplotlib that produces the chart>"
+    },
+    ...
+  ]
+}
+Mandatory rules:
+- Always produce syntactically valid JSON ONLY. No text outside the JSON object.
+- Provide at least these chart types somewhere in recommended_charts: bar, pie, timeseries, histogram, scatter, multiple_columns, stacked_bar, heatmap.
+- Use only column names that appear in meta['column_names'].
+- The python_code string must be self-contained and runnable assuming a variable `df` exists containing the full cleaned DataFrame. Start the code with imports:
+    import pandas as pd
+    import seaborn as sns
+    import matplotlib.pyplot as plt
+  and include any necessary preprocessing steps (e.g., parsing dates).
+- For timeseries charts ensure the datetime column is parsed (`pd.to_datetime`) before plotting.
+- For multiple_columns provide a pairplot or facetgrid example that uses up to 4 numeric columns or sensible categorical splits.
+- For stacked_bar, show aggregation code (groupby + unstack) and plotting with df.plot(kind='bar', stacked=True).
+- For heatmap, compute correlation matrix and plot sns.heatmap with annotations.
+- For pie charts, ensure grouping/aggregation when there are >20 unique categories (group small categories into 'Other').
+- For histogram and scatter include axis labels and tight_layout; include plt.show() at the end.
+- Keep code minimal but complete so a user can copy-paste and run (assume seaborn, matplotlib, pandas installed).
+- For each chart add a sensible "columns" list showing which columns the code uses.
+- Do not include examples using columns not present in meta.
+- Do not include more than 10 recommended_charts.
+- Ensure strings inside the JSON are escaped properly so the JSON parses.
+Produce concise natural-language one-line summary in "summary". Ensure JSON is parseable by json.loads in Python.
+"""
+    user_prompt = {
+        "meta": meta,
+        "fiverow": fiverow
+    }
+    contents = [
+        types.Content(
+            role="user",
+            parts=[types.Part.from_text(text=str(user_prompt))],
+        ),
+    ]
+    generate_content_config = types.GenerateContentConfig(
+        thinking_config=types.ThinkingConfig(thinking_budget=0),
+        response_mime_type="application/json",
+        system_instruction=[types.Part.from_text(text=system_prompt)],
+    )
+    response = ""
+    for chunk in client.models.generate_content_stream(
+        model=model,
+        contents=contents,
+        config=generate_content_config,
+    ):
+        if chunk.text:
+            response += chunk.text
+    return response
+@app.get("/")
+async def read_root():
+    return {"message": "Welcome to the FastAPI Hugging Face Space API with Data Analysis!"}
+@app.post("/analyze_data/")
+async def analyze_data(file: UploadFile = File(...)):
+    """
+    Uploads a file, preprocesses it, and generates a summary and recommended charts.
+    """
+    file_path = os.path.join(TMP_DIR, file.filename)
+    try:
+        # Save the uploaded file to the temporary directory
+        with open(file_path, "wb") as buffer:
+            shutil.copyfileobj(file.file, buffer)
+        # Load and preprocess the file
+        df = load_file(file_path)
+        df_clean = preprocess(df)
+        # Generate metadata and first 5 rows
+        meta = metadata(df_clean)
+        fiverow = df_clean.head(5).to_dict(orient="records")
+        # Generate summary and charts using the AI model
+        summary_json = generate_summary(meta, fiverow)
+        # Clean up the uploaded file after processing
+        os.remove(file_path)
+        return json.loads(summary_json) # Return the parsed JSON response
+    except ValueError as ve:
+        raise HTTPException(status_code=400, detail=str(ve))
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"An error occurred during data analysis: {e}")
+# The following endpoints are kept for general file management but are not directly used by the new /analyze_data endpoint.
+# They can be removed if not needed, or modified to work with the /tmp directory.
+@app.post("/uploadfile/")
+async def create_upload_file(file: UploadFile = File(...)):
+    """
+    Uploads a single file to the temporary directory.
+    """
+    file_path = os.path.join(TMP_DIR, file.filename)
+    try:
+        with open(file_path, "wb") as buffer:
+            shutil.copyfileobj(file.file, buffer)
+        return {"filename": file.filename, "message": f"File '{file.filename}' uploaded successfully to {TMP_DIR}"}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Could not upload file: {e}")
+@app.post("/uploadfiles/")
+async def create_upload_files(files: List[UploadFile] = File(...)):
+    """
+    Uploads multiple files to the temporary directory.
+    """
+    uploaded_files = []
+    for file in files:
+        file_path = os.path.join(TMP_DIR, file.filename)
+        try:
+            with open(file_path, "wb") as buffer:
+                shutil.copyfileobj(file.file, buffer)
+            uploaded_files.append({"filename": file.filename, "path": file_path})
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Could not upload file '{file.filename}': {e}")
+    return {"message": f"Successfully uploaded {len(uploaded_files)} files to {TMP_DIR}", "files": uploaded_files}
+@app.get("/list_files/")
+async def list_uploaded_files():
+    """
+    Lists all files currently in the temporary directory.
+    """
+    if not os.path.exists(TMP_DIR):
+        return {"message": "Temporary directory does not exist or is empty."}
+    files = os.listdir(TMP_DIR)
+    return {"files": files, "path": TMP_DIR}
+@app.get("/download_file/{filename}")
+async def download_file(filename: str):
+    """
+    Downloads a specific file from the temporary directory.
+    """
+    file_path = os.path.join(TMP_DIR, filename)
+    if not os.path.exists(file_path):
+        raise HTTPException(status_code=404, detail="File not found.")
+    # In a real application, you would return a FileResponse here.
+    # For this example, we'll just confirm the file exists.
+    return {"message": f"File '{filename}' found at {file_path}. In a real app, this would be downloaded."}
+@app.post("/process_file/{filename}")
+async def process_file_data(filename: str):
+    """
+    Example endpoint to process data from an uploaded file.
+    This assumes the file is already uploaded to the temporary directory.
+    """
+    file_path = os.path.join(TMP_DIR, filename)
+    if not os.path.exists(file_path):
+        raise HTTPException(status_code=404, detail="File not found. Please upload it first.")
+    try:
+        with open(file_path, "r") as f:
+            content = f.readlines()[:5] # Read first 5 lines
+        return {"filename": filename, "processed_content_sample": content, "message": "File processed successfully."}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error processing file: {e}")

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+fastapi
+uvicorn
+python-multipart
+pandas
+# `from google import genai` is provided by google-genai (not google-generativeai)
+google-genai
+# Excel engines used by pandas.read_excel: openpyxl for .xlsx, xlrd for .xls
+openpyxl
+xlrd
+seaborn
+matplotlib