triflix committed on
Commit 167ca93 · verified · 1 Parent(s): c68da81

Create app.py

Files changed (1)
  1. app.py +468 -0
app.py ADDED
@@ -0,0 +1,468 @@
+ # fastapi_snapshot_app_improved.py
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Query
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import JSONResponse
+ import pandas as pd
+ import os
+ import json
+ import tempfile
+ from typing import Optional, List
+ from pydantic import BaseModel
+ from google import genai
+ from google.genai import types
+ import logging
+ import hashlib
+ import uuid
+ from datetime import datetime, timezone
+ import motor.motor_asyncio
+ import asyncio
+ from concurrent.futures import ThreadPoolExecutor
+
+ # ----------------------------
+ # Configuration
+ # ----------------------------
+ MONGO_URI = os.getenv("MONGO_URI", "mongodb://localhost:27017")
+ DB_NAME = os.getenv("DB_NAME", "data_analysis")
+ SNAPSHOT_BUCKET = os.getenv("SNAPSHOT_DIR", "/tmp/snapshots")
+ os.makedirs(SNAPSHOT_BUCKET, exist_ok=True)
+ MAX_UPLOAD_SIZE = int(os.getenv("MAX_UPLOAD_SIZE_BYTES", 200 * 1024 * 1024))  # 200MB default
+ METADATA_ONLY_FALLBACK = os.getenv("METADATA_ONLY_FALLBACK", "true").lower() == "true"
+ TTL_DAYS = int(os.getenv("SNAPSHOT_TTL_DAYS", "0"))  # 0 = no TTL
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # FastAPI app
+ app = FastAPI(title="Data Analysis API with Snapshotting", version="3.0.0")
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Mongo client (async)
+ mongo_client = motor.motor_asyncio.AsyncIOMotorClient(MONGO_URI)
+ db = mongo_client[DB_NAME]
+ snapshots = db.snapshots
+
+ # Thread pool for blocking tasks (AI calls, heavy pandas ops)
+ EXECUTOR = ThreadPoolExecutor(max_workers=int(os.getenv("EXECUTOR_WORKERS", "2")))
+
+
+ # ---------- Helpers ----------
+ def sha256_bytes(data: bytes) -> str:
+     h = hashlib.sha256()
+     h.update(data)
+     return h.hexdigest()
+
+
+ def sha256_text(text: str) -> str:
+     return sha256_bytes(text.encode("utf-8"))
+
+
+ def sha256_obj(obj) -> str:
+     text = json.dumps(obj, sort_keys=True, default=str)
+     return sha256_text(text)
+
+
+ def canonical_types(df: pd.DataFrame) -> dict:
+     def map_type(dtype):
+         if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
+             return "numeric"
+         if pd.api.types.is_datetime64_any_dtype(dtype):
+             return "datetime"
+         return "object"
+     return {col: map_type(dtype) for col, dtype in df.dtypes.items()}
+
+
+ async def save_preprocessed_df(df: pd.DataFrame, snapshot_id: str) -> str:
+     path = os.path.join(SNAPSHOT_BUCKET, f"{snapshot_id}.csv")
+     # use pandas to_csv which is blocking; run in executor to avoid blocking event loop
+     loop = asyncio.get_running_loop()
+     await loop.run_in_executor(EXECUTOR, lambda: df.to_csv(path, index=False))
+     return path
+
+
+ def load_file_from_path(file_path: str, original_filename: str) -> pd.DataFrame:
+     ext = os.path.splitext(original_filename)[-1].lower()
+     if ext == ".csv":
+         # try common encodings; let pandas infer by default
+         return pd.read_csv(file_path)
+     elif ext in [".xls", ".xlsx"]:
+         return pd.read_excel(file_path, sheet_name=0)
+     else:
+         raise ValueError(f"Unsupported file type: {ext}")
+
+
+ def preprocess(df: pd.DataFrame, drop_thresh=0.5) -> pd.DataFrame:
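+     # Cleaning steps: normalize column names, drop columns that are mostly null,
+     # impute remaining nulls by dtype, coerce numeric-looking object columns,
+     # and drop duplicate rows.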
+     df = df.copy()
+     df.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]
+     df = df.loc[:, df.isnull().mean() < drop_thresh]
+
+     for col in df.columns:
+         if pd.api.types.is_numeric_dtype(df[col]):
+             df.loc[:, col] = df[col].fillna(df[col].median())
+         elif pd.api.types.is_datetime64_any_dtype(df[col]):
+             df.loc[:, col] = df[col].fillna(pd.Timestamp('1970-01-01'))
+         else:
+             df.loc[:, col] = df[col].fillna("Unknown")
+
+     for col in df.columns:
+         if df[col].dtype == 'object':
+             try:
+                 df.loc[:, col] = pd.to_numeric(df[col])
+             except Exception:
+                 pass
+
+     df = df.drop_duplicates()
+     return df
+
+
+ def get_metadata(df: pd.DataFrame) -> dict:
+     return {
+         "rows": int(df.shape[0]),
+         "columns": int(df.shape[1]),
+         "column_names": list(df.columns),
+         "column_types": {col: str(dtype) for col, dtype in df.dtypes.items()},
+         "unique_values": {col: int(df[col].nunique()) for col in df.columns}
+     }
+
+
+ def data_fingerprint(df: pd.DataFrame, n_sample_rows: int = 100) -> str:
+     # Deterministic fingerprint: canonical column order, sample head & tail JSON + aggregated stats
+     df2 = df.copy()
+     df2 = df2.reindex(sorted(df2.columns), axis=1)
+     head = df2.head(n_sample_rows).to_json(orient="split", date_format="iso", force_ascii=False)
+     tail = df2.tail(n_sample_rows).to_json(orient="split", date_format="iso", force_ascii=False)
+     col_aggs = {c: {"nunique": int(df2[c].nunique()), "nulls": int(df2[c].isnull().sum())} for c in df2.columns}
+     text = head + tail + json.dumps(col_aggs, sort_keys=True, default=str)
+     return hashlib.sha256(text.encode("utf-8")).hexdigest()
+
+
+ def stream_save_and_hash(upload_file: UploadFile, tmp_path: str, size_limit: Optional[int] = None) -> str:
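+     # Stream the upload to disk in 8 KB chunks, hashing as we go and enforcing
+     # the optional size limit so the whole file never has to fit in memory.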
+     h = hashlib.sha256()
+     total = 0
+     with open(tmp_path, "wb") as f:
+         while True:
+             chunk = upload_file.file.read(8192)
+             if not chunk:
+                 break
+             f.write(chunk)
+             h.update(chunk)
+             total += len(chunk)
+             if size_limit and total > size_limit:
+                 raise HTTPException(status_code=413, detail="Uploaded file exceeds maximum allowed size")
+     return h.hexdigest()
+
+
+ # ---------- AI interaction (blocking) ----------
+ def generate_summary_blocking(meta, fiverow) -> str:
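+     # Streams the Gemini response, concatenates the chunks, and verifies that the
+     # result parses as valid JSON before returning it to the caller.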
+     api_key = os.getenv("GEMINI_API_KEY")
+     if not api_key:
+         raise RuntimeError("GEMINI_API_KEY not set")
+     client = genai.Client(api_key=api_key)
+     model = "gemini-2.5-flash-lite"
+     system_prompt = """
+     You are a strict JSON generator.
+     Input contains:
+     - meta: dataframe metadata
+     - fiverow: first 5 records of dataframe
+     You must output JSON with the following structure:
+     { "summary": "<short natural language overview>", "recommended_charts": [ ... ] }
+     Always produce syntactically valid JSON ONLY.
+     """
+     user_prompt = {"meta": meta, "fiverow": fiverow}
+     contents = [
+         types.Content(
+             role="user",
+             parts=[types.Part.from_text(text=str(user_prompt))],
+         ),
+     ]
+     generate_content_config = types.GenerateContentConfig(
+         thinking_config=types.ThinkingConfig(thinking_budget=0),
+         response_mime_type="application/json",
+         system_instruction=[types.Part.from_text(text=system_prompt)],
+     )
+     response = ""
+     for chunk in client.models.generate_content_stream(
+         model=model,
+         contents=contents,
+         config=generate_content_config,
+     ):
+         if chunk.text:
+             response += chunk.text
+     try:
+         _ = json.loads(response)
+     except Exception as e:
+         logger.error("AI returned invalid JSON: %s", str(e))
+         raise RuntimeError("AI returned invalid JSON")
+     return response
+
+
+ async def generate_summary_async(meta, fiverow) -> str:
+     loop = asyncio.get_running_loop()
+     return await loop.run_in_executor(EXECUTOR, generate_summary_blocking, meta, fiverow)
+
+
+ # ---------- API Models ----------
+ class DrillRequest(BaseModel):
+     snapshot_id: str
+     filter_column: str
+     filter_value: str
+     limit: Optional[int] = 100
+     offset: Optional[int] = 0
+     highlight_columns: Optional[List[str]] = None
+
+
+ # ---------- Startup: indexes ----------
+ @app.on_event("startup")
+ async def create_indexes():
+     try:
+         await snapshots.create_index("file_hash")
+         await snapshots.create_index("data_hash")
+         await snapshots.create_index("meta_hash")
+         await snapshots.create_index("snapshot_id", unique=True)
+         if TTL_DAYS > 0:
+             await snapshots.create_index("created_at_dt", expireAfterSeconds=TTL_DAYS * 24 * 3600)
+         logger.info("Indexes ensured on snapshots collection")
+     except Exception:
+         logger.exception("Error creating indexes")
+
+
+ # ---------- Routes ----------
+ @app.get("/")
+ async def root():
+     return {"message": "Data Analysis API with snapshotting is running"}
+
+
+ @app.post("/analyze")
+ async def analyze(file: UploadFile = File(...)):
+     if not file.filename:
+         raise HTTPException(status_code=400, detail="No file provided")
+     allowed_extensions = ['.csv', '.xls', '.xlsx']
+     file_ext = os.path.splitext(file.filename)[-1].lower()
+     if file_ext not in allowed_extensions:
+         raise HTTPException(status_code=400, detail=f"Unsupported file type. Allowed: {', '.join(allowed_extensions)}")
+
+     # stream save + file hash (prevents OOM)
+     with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp_file:
+         tmp_path = tmp_file.name
+     try:
+         file_hash = stream_save_and_hash(file, tmp_path, size_limit=MAX_UPLOAD_SIZE)
+     except HTTPException:
+         try:
+             os.unlink(tmp_path)
+         except Exception:
+             pass
+         raise
+     except Exception as e:
+         try:
+             os.unlink(tmp_path)
+         except Exception:
+             pass
+         logger.exception("Error saving uploaded file")
+         raise HTTPException(status_code=500, detail=str(e))
+
+     try:
+         # load and preprocess (blocking; small files ok). For very large files, consider streaming/parsing.
+         df = load_file_from_path(tmp_path, file.filename)
+         df_clean = preprocess(df)
+         meta = get_metadata(df_clean)
+         fiverow = df_clean.head(5).to_dict(orient="records")
+
+         # compute hashes: data_hash first, canonical meta_hash
+         data_hash = data_fingerprint(df_clean)
+         meta_hash = sha256_obj({
+             "rows": meta["rows"],
+             "columns": meta["columns"],
+             "column_names": meta["column_names"],
+             "column_types": canonical_types(df_clean),
+         })
+
+         # search order: exact file -> data_hash -> meta_hash (if allowed)
+         existing = await snapshots.find_one({"file_hash": file_hash})
+         cache_hit = None
+         if not existing:
+             existing = await snapshots.find_one({"data_hash": data_hash})
+             if existing:
+                 cache_hit = "data"
+         if not existing and METADATA_ONLY_FALLBACK:
+             existing = await snapshots.find_one({"meta_hash": meta_hash})
+             if existing:
+                 cache_hit = "meta"
+
+         if existing:
+             # return consistent snapshot_id
+             snapshot_id_return = existing.get("snapshot_id") or str(existing.get("_id"))
+             return {
+                 "id": snapshot_id_return,
+                 "summary": existing.get("summary"),
+                 "chart_data": existing.get("chart_data"),
+                 "metadata": existing.get("metadata"),
+                 "created_at": existing.get("created_at"),
+                 "cached": True,
+                 "cache_hit": cache_hit or "file",
+             }
+
+         # Not found -> create processing snapshot doc with status
+         snapshot_id = uuid.uuid4().hex
+         created_at_iso = datetime.now(timezone.utc).isoformat()
+         created_at_dt = datetime.now(timezone.utc)
+
+         doc = {
+             "snapshot_id": snapshot_id,
+             "filename": file.filename,
+             "file_hash": file_hash,
+             "data_hash": data_hash,
+             "meta_hash": meta_hash,
+             "metadata": meta,
+             "summary": None,
+             "chart_data": None,
+             "preprocessed_path": None,
+             "status": "processing",
+             "created_at": created_at_iso,
+             "created_at_dt": created_at_dt,
+         }
+         await snapshots.insert_one(doc)
+
+         # Generate summary (blocking AI call offloaded to executor)
+         try:
+             summary_json = await generate_summary_async(meta, fiverow)
+             summary_obj = json.loads(summary_json)
+             chart_data = summary_obj.get("recommended_charts")
+         except Exception as e:
+             await snapshots.update_one({"snapshot_id": snapshot_id}, {"$set": {"status": "failed", "error": str(e)}})
+             raise
+
+         # save preprocessed csv for drilling and later retrieval (non-blocking via executor)
+         preprocessed_path = await save_preprocessed_df(df_clean, snapshot_id)
+
+         # finalize doc
+         await snapshots.update_one(
+             {"snapshot_id": snapshot_id},
+             {"$set": {
+                 "summary": summary_obj,
+                 "chart_data": chart_data,
+                 "preprocessed_path": preprocessed_path,
+                 "status": "done",
+                 "completed_at": datetime.now(timezone.utc).isoformat()
+             }}
+         )
+
+         return {
+             "id": snapshot_id,
+             "summary": summary_obj,
+             "chart_data": chart_data,
+             "metadata": meta,
+             "created_at": created_at_iso,
+             "cached": False,
+         }
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.exception("Error processing file")
+         raise HTTPException(status_code=500, detail=str(e))
+     finally:
+         try:
+             os.unlink(tmp_path)
+         except Exception:
+             pass
+
+
+ @app.get("/snapshots")
+ async def list_snapshots(limit: int = Query(20, ge=1, le=100), offset: int = Query(0, ge=0)):
+     cursor = snapshots.find({}, {"preprocessed_path": 0, "summary": 0, "chart_data": 0}).sort("created_at_dt", -1).skip(offset).limit(limit)
+     items = []
+     async for doc in cursor:
+         items.append({
+             "id": doc.get("snapshot_id") or str(doc.get("_id")),
+             "filename": doc.get("filename"),
+             "metadata": doc.get("metadata"),
+             "status": doc.get("status"),
+             "created_at": doc.get("created_at"),
+         })
+     return {"count": len(items), "items": items}
+
+
+ @app.get("/snapshot/{snapshot_id}")
+ async def get_snapshot(snapshot_id: str):
+     doc = await snapshots.find_one({"snapshot_id": snapshot_id})
+     if not doc:
+         raise HTTPException(status_code=404, detail="Snapshot not found")
+     return {
+         "id": doc["snapshot_id"],
+         "filename": doc.get("filename"),
+         "metadata": doc.get("metadata"),
+         "summary": doc.get("summary"),
+         "chart_data": doc.get("chart_data"),
+         "status": doc.get("status"),
+         "created_at": doc.get("created_at"),
+     }
+
+
+ @app.get("/preprocessed/{snapshot_id}")
+ async def get_preprocessed(snapshot_id: str, limit: int = 100, offset: int = 0):
+     doc = await snapshots.find_one({"snapshot_id": snapshot_id})
+     if not doc:
+         raise HTTPException(status_code=404, detail="Snapshot not found")
+     path = doc.get("preprocessed_path")
+     if not path or not os.path.exists(path):
+         raise HTTPException(status_code=404, detail="Preprocessed data not available")
+     df = pd.read_csv(path)
+     total = len(df)
+     rows = df.iloc[offset: offset + limit].to_dict(orient="records")
+     return {"total": total, "offset": offset, "limit": limit, "rows": rows}
+
+
+ @app.post("/drill")
+ async def drill(req: DrillRequest):
+     doc = await snapshots.find_one({"snapshot_id": req.snapshot_id})
+     if not doc:
+         raise HTTPException(status_code=404, detail="Snapshot not found")
+     path = doc.get("preprocessed_path")
+     if not path or not os.path.exists(path):
+         raise HTTPException(status_code=404, detail="Preprocessed data not available")
+     df = pd.read_csv(path)
+     if req.filter_column not in df.columns:
+         raise HTTPException(status_code=400, detail=f"Column {req.filter_column} not found in preprocessed data")
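+     # Compare against the column's native dtype first; if nothing matches (or the
+     # comparison fails), fall back to comparing both sides as strings.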
+     try:
+         filtered = df[df[req.filter_column] == req.filter_value]
+         if filtered.empty:
+             filtered = df[df[req.filter_column].astype(str) == str(req.filter_value)]
+     except Exception:
+         filtered = df[df[req.filter_column].astype(str) == str(req.filter_value)]
+     total = len(filtered)
+     rows = filtered.iloc[req.offset: req.offset + req.limit].to_dict(orient="records")
+     highlights = req.highlight_columns or [req.filter_column]
+     highlights = [c for c in highlights if c in df.columns]
+     return {
+         "snapshot_id": req.snapshot_id,
+         "filter_column": req.filter_column,
+         "filter_value": req.filter_value,
+         "total_matches": total,
+         "offset": req.offset,
+         "limit": req.limit,
+         "rows": rows,
+         "highlight_columns": highlights,
+     }
+
+
+ # Global exception handlers
+ @app.exception_handler(HTTPException)
+ async def http_exception_handler(request, exc):
+     return JSONResponse(status_code=exc.status_code, content={"error": exc.detail})
+
+
+ @app.exception_handler(Exception)
+ async def general_exception_handler(request, exc):
+     logger.exception("Unhandled exception")
+     return JSONResponse(status_code=500, content={"error": "Internal server error", "details": str(exc)})
+
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")))
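
For context, a minimal client sketch (not part of the committed file) showing how the endpoints above could be exercised. It assumes the API is running locally on the default port 7860 and that the requests package is installed; the input file name and the drill column/value are placeholders.

# example_client.py - illustrative sketch only; not part of this commit
import requests

BASE = "http://localhost:7860"  # assumes the default PORT from app.py

# Upload a spreadsheet for analysis; the response carries the snapshot id,
# the AI-generated summary, and whether a cached snapshot was reused.
with open("sales.csv", "rb") as f:  # hypothetical input file
    resp = requests.post(f"{BASE}/analyze", files={"file": ("sales.csv", f, "text/csv")})
resp.raise_for_status()
snapshot = resp.json()
print(snapshot["id"], snapshot["cached"])

# Drill into the preprocessed rows for one column/value pair.
drill = requests.post(f"{BASE}/drill", json={
    "snapshot_id": snapshot["id"],
    "filter_column": "region",   # hypothetical column name
    "filter_value": "EMEA",
    "limit": 10,
})
drill.raise_for_status()
print(drill.json()["total_matches"])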