Spaces:

rithwikreal
/

AnalysisApp

Sleeping

App Files Community

rithwikreal committed on Sep 28, 2025

Commit

b855b87

verified ·

1 Parent(s): 9d86fb2

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -277

app.py CHANGED Viewed

@@ -2,308 +2,108 @@
 import gradio as gr
 import pandas as pd
 import io
-import re
-import gc
 import os
-from typing import Tuple, Optional, List
-# ---------- Helpers for uploaded file reading ----------
-def read_uploaded_file(file):
-    """Try multiple ways to get bytes from Gradio upload objects."""
-    if file is None:
-        return None, None, "No file uploaded."
-    try:
-        if hasattr(file, "read"):
-            content = file.read()
-            name = getattr(file, "name", None)
-            return content, name, None
-    except Exception:
-        pass
-    try:
-        if isinstance(file, (str, os.PathLike)):
-            path = str(file)
-            if os.path.exists(path):
-                with open(path, "rb") as f:
-                    content = f.read()
-                return content, os.path.basename(path), None
-    except Exception:
-        pass
-    try:
-        if isinstance(file, dict):
-            name = file.get("name") or file.get("filename")
-            data = file.get("data") or file.get("content") or file.get("bytes")
-            if isinstance(data, (bytes, bytearray)):
-                return data, name, None
-            if isinstance(data, str) and os.path.exists(data):
-                with open(data, "rb") as f:
-                    content = f.read()
-                return content, name or os.path.basename(data), None
-    except Exception:
-        pass
-    try:
-        name = getattr(file, "name", None)
-        if name and isinstance(name, str) and os.path.exists(name):
-            with open(name, "rb") as f:
-                content = f.read()
-            return content, os.path.basename(name), None
-    except Exception:
-        pass
-    return None, None, "Uploaded file format not supported by this server environment."
-def load_file_bytes_to_df(file) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
-    """Read bytes and convert to DataFrame (no disk writes)."""
-    content, name, err = read_uploaded_file(file)
-    if err:
-        return None, f"Error reading file: {err}"
-    if content is None:
-        return None, "No content read from uploaded file."
     try:
-        fname = (name or "").lower()
-        if fname.endswith(".csv") or (isinstance(content, (bytes, bytearray)) and b"," in content[:200]):
             df = pd.read_csv(io.BytesIO(content))
         else:
             df = pd.read_excel(io.BytesIO(content))
     except Exception as e:
         return None, f"Error reading file: {e}"
-    finally:
-        try:
-            del content
-        except Exception:
-            pass
-        gc.collect()
-    return df, None
-# ---------- Column matching in queries ----------
-def find_columns_in_query(columns: List[str], query: str, max_matches: int = 3) -> List[str]:
-    """Return a list of best matching column names from the DataFrame for words in the query."""
-    q = query.lower()
-    found = []
-    # exact word matches first
-    for col in columns:
-        cl = col.lower()
-        # exact full word present
-        if re.search(r"\b" + re.escape(cl) + r"\b", q):
-            found.append(col)
-            if len(found) >= max_matches:
-                return found
-    # partial matches (any token)
-    q_tokens = set(re.findall(r"[a-z0-9_]+", q))
-    for col in columns:
-        if col in found:
-            continue
-        cl = col.lower()
-        col_tokens = set(re.findall(r"[a-z0-9_]+", cl))
-        if q_tokens & col_tokens:
-            found.append(col)
-            if len(found) >= max_matches:
-                return found
-    # fallback: if query contains "department" but no exact column, look for column names containing department
-    for col in columns:
-        if "department" in col.lower() and col not in found:
-            found.append(col)
-            if len(found) >= max_matches:
-                return found
-    return found
-# ---------- Aggregation helpers ----------
-def group_count(df: pd.DataFrame, group_col: str, top_n: Optional[int] = None):
-    res = df.groupby(group_col).size().reset_index(name="count").sort_values("count", ascending=False).reset_index(drop=True)
-    if top_n:
-        return res.head(top_n)
-    return res
-def group_agg(df: pd.DataFrame, group_col: str, value_col: str, agg: str):
-    if agg in ("mean", "avg", "average"):
-        res = df.groupby(group_col)[value_col].mean().reset_index().rename(columns={value_col: "average"})
-    elif agg in ("sum",):
-        res = df.groupby(group_col)[value_col].sum().reset_index().rename(columns={value_col: "sum"})
-    elif agg in ("max",):
-        res = df.groupby(group_col)[value_col].max().reset_index().rename(columns={value_col: "max"})
-    elif agg in ("min",):
-        res = df.groupby(group_col)[value_col].min().reset_index().rename(columns={value_col: "min"})
-    else:
-        res = df.groupby(group_col)[value_col].agg(agg).reset_index().rename(columns={value_col: agg})
-    return res.sort_values(res.columns[-1], ascending=False).reset_index(drop=True)
-def compute_percentage_counts(df: pd.DataFrame, group_col: str):
-    counts = group_count(df, group_col)
-    total = counts["count"].sum()
-    counts["percentage"] = (counts["count"] / total * 100).round(2)
-    return counts
-def compute_percentage_of_value(df: pd.DataFrame, group_col: str, value_col: str):
-    # percent share of value_col per group
-    sums = df.groupby(group_col)[value_col].sum().reset_index().rename(columns={value_col: "sum"})
-    total = sums["sum"].sum()
-    sums["percentage"] = (sums["sum"] / total * 100).round(2)
-    return sums.sort_values("sum", ascending=False).reset_index(drop=True)
-# ---------- Natural language parser & action ----------
-def simple_nl_to_action(df: pd.DataFrame, query: str):
-    q = (query or "").strip().lower()
-    if q == "":
-        return None, "Please type a question like: 'department wise head count', 'percentage of employees by department', 'average salary by department', or 'show columns'."
-    cols = list(df.columns)
-    matched = find_columns_in_query(cols, q, max_matches=3)  # up to 3 column matches
-    # direct commands
-    if "columns" in q or "show columns" in q or "list columns" in q:
-        return pd.DataFrame({"columns": cols}), None
-    # overall totals
-    if re.search(r"\b(total|how many|count of rows|number of rows|total employees|total employee)\b", q):
-        return pd.DataFrame({"total_rows": [len(df)]}), None
-    # show first N rows
-    m = re.search(r"(first|head)\s*(\d+)?", q)
-    if "head" in q or "first" in q:
-        n = 5
-        if m and m.group(2):
-            n = int(m.group(2))
-        return df.head(n), None
-    # describe / summary
-    if "describe" in q or "summary" in q or "statistics" in q:
-        return df.describe(include='all').reset_index(), None
-    # HEADCOUNT / COUNT requests (department wise head count etc.)
-    if any(w in q for w in ["headcount", "head count", "head-count", "headcounts", "head count", "number of employees", "how many", "count by", "count of", "count"]):
-        # If a grouping column is mentioned, use it
-        if matched:
-            group_col = matched[0]
-            # if user mentions percentage as well
-            if "%" in q or "percentage" in q or "percent" in q or "share" in q:
-                return compute_percentage_counts(df, group_col), None
-            # If they asked which has maximum
-            if any(w in q for w in ["most", "maximum", "max", "highest", "where max", "to which"]):
-                counts = group_count(df, group_col)
-                top = counts.head(1)
-                # also show full counts for context
-                summary = counts
-                # build a small output that includes top and summary (we'll return summary; top is first row)
-                return summary, f"Top: {top.iloc[0,0]} with {top.iloc[0,1]} (rows)."
-            # just return counts
-            return group_count(df, group_col), None
-        else:
-            # no group column mentioned: return total rows
-            return pd.DataFrame({"total_rows": [len(df)]}), None
-    # AGGREGATION requests (average, mean, sum, max/min of a numeric column grouped by another)
-    if any(w in q for w in ["average", "mean", "avg", "sum", "total", "maximum", "minimum", "max", "min"]):
-        # try to detect grouping and value column
-        if len(matched) >= 2:
-            group_col = matched[0]
-            value_col = matched[1]
-        elif len(matched) == 1:
-            # ambiguous: user mentioned one column. If that's numeric, perhaps they want overall average
-            cand = matched[0]
-            if pd.api.types.is_numeric_dtype(df[cand]):
-                # overall stat
-                if any(w in q for w in ["average", "mean", "avg"]):
-                    return pd.DataFrame({f"overall_{cand}_average": [df[cand].mean()]}), None
-                if "sum" in q or "total" in q:
-                    return pd.DataFrame({f"overall_{cand}_sum": [df[cand].sum()]}), None
-            # else ask for more clarity
-            return None, "I found one column but couldn't tell grouping vs value column. Please ask like 'average Salary by Department' or 'sum Sales by Region'."
-        else:
-            return None, "Please mention columns. Example: 'average Salary by Department' or 'sum Sales by Region'."
-        # determine aggregation type
-        if any(w in q for w in ["average", "mean", "avg"]):
-            return group_agg(df, group_col, value_col, "mean"), None
-        if any(w in q for w in ["sum", "total"]):
-            return group_agg(df, group_col, value_col, "sum"), None
-        if any(w in q for w in ["max", "maximum", "highest"]):
-            return group_agg(df, group_col, value_col, "max"), None
-        if any(w in q for w in ["min", "minimum", "lowest"]):
-            return group_agg(df, group_col, value_col, "min"), None
-    # PERCENTAGE requests for a numeric column per group
-    if any(w in q for w in ["percentage", "%", "percent", "share"]):
-        # if two columns mentioned, assume first is group, second is numeric value
-        if len(matched) >= 2:
-            group_col = matched[0]
-            value_col = matched[1]
-            if pd.api.types.is_numeric_dtype(df[value_col]):
-                return compute_percentage_of_value(df, group_col, value_col), None
-            else:
-                return None, f"Column '{value_col}' is not numeric; cannot compute percentage of values."
-        elif len(matched) == 1:
-            group_col = matched[0]
-            # percent of counts
-            return compute_percentage_counts(df, group_col), None
-        else:
-            return None, "Please mention the group column (and optionally a numeric column). Example: 'percentage of Salary by Department' or 'percentage of employees by Department'."
-    # SHOW specific columns (e.g., 'show Department and Salary')
-    m = re.search(r"show (.+)", q)
-    if m:
-        # try to extract column names from matched list
-        if matched:
-            # if user asked show with two columns, return them
-            return df[matched].head(200), None
-        else:
-            return None, "Couldn't identify columns to show. Use 'show columns' to view exact names."
-    # fallback: return first 10 rows with suggestion
-    return df.head(10), "Couldn't parse exact request — showing first 10 rows. Try: 'show columns', 'department wise head count', 'percentage of employees by department', or 'average Salary by Department'."
-# ---------- Processing wrapper ----------
-def process(file, query):
-    df, err = load_file_bytes_to_df(file)
-    if err:
-        try:
-            del file
-        except Exception:
-            pass
-        gc.collect()
-        return None, err
     try:
-        res, msg = simple_nl_to_action(df, query)
-        if isinstance(res, pd.DataFrame):
-            out_df = res.copy()
         else:
-            out_df = None
-    except Exception as e:
-        out_df = None
-        msg = f"Error while processing: {e}"
-    try:
-        del df
-        del file
-    except Exception:
-        pass
-    gc.collect()
-    if isinstance(out_df, pd.DataFrame):
-        return out_df, (msg or "OK")
-    else:
-        return None, (msg or "No result")
-# ---------- Clear / reset ----------
 def clear_all():
     return (
         gr.File.update(value=None),
-        gr.Textbox.update(value=""),
         gr.Dataframe.update(value=None),
         gr.Textbox.update(value=""),
     )
-# ---------- Gradio UI ----------
 with gr.Blocks() as demo:
-    gr.Markdown("# Chat-with-CSV — enhanced analysis (ephemeral uploads)")
-    with gr.Row():
-        file_input = gr.File(label="Upload CSV or XLSX (will not be saved)", file_count="single")
-        query_input = gr.Textbox(label="Ask a question (examples: 'department wise head count', 'percentage of Salary by Department', 'average Salary by Department')", placeholder="Type your question here")
     with gr.Row():
-        submit = gr.Button("Run query")
-        clear_btn = gr.Button("Clear / Reset (remove uploaded file & results)")
-    output_table = gr.Dataframe(headers=None, label="Result table")
-    status = gr.Textbox(label="Status / Messages", interactive=False)
-    submit.click(fn=process, inputs=[file_input, query_input], outputs=[output_table, status])
-    clear_btn.click(fn=clear_all, inputs=None, outputs=[file_input, query_input, output_table, status])
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
 import pandas as pd
 import io
 import os
+import google.generativeai as genai
+import gc
+# Load API key securely from Hugging Face secret
+GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+if not GEMINI_API_KEY:
+    raise ValueError("Gemini API key not set. Please add GEMINI_API_KEY in Space Secrets.")
+genai.configure(api_key=GEMINI_API_KEY)
+# Keep DataFrame in memory during session
+session_df = None
+def load_file(file):
+    """Load uploaded CSV/XLSX into pandas DataFrame."""
+    global session_df
+    if file is None:
+        return None, "No file uploaded"
     try:
+        name = getattr(file, "name", "")
+        content = file.read()
+        if name.endswith(".csv") or b"," in content[:200]:
             df = pd.read_csv(io.BytesIO(content))
         else:
             df = pd.read_excel(io.BytesIO(content))
+        session_df = df
+        return df.head(5), f"File loaded with {df.shape[0]} rows and {df.shape[1]} columns."
     except Exception as e:
         return None, f"Error reading file: {e}"
+def ask_question(query):
+    """Send the question + DF structure to Gemini and run returned Python code."""
+    global session_df
+    if session_df is None:
+        return None, "Please upload a file first."
+    # Build prompt for Gemini
+    preview = session_df.head(10).to_csv(index=False)
+    columns = list(session_df.columns)
+    prompt = f"""
+You are a data analyst.
+The user uploaded a dataset with these columns: {columns}.
+Here are the first 10 rows:
+{preview}
+User question: {query}
+Write Python pandas code (only the code, no explanations, no imports) that answers the question
+and assigns the result to a variable named result.
+If aggregation is needed, show a DataFrame (not just a number).
+Keep the output concise (max 200 rows).
+"""
     try:
+        # Ask Gemini to generate code
+        model = genai.GenerativeModel("gemini-pro")
+        response = model.generate_content(prompt)
+        code = response.text.strip("`\n ")
+        # Execute the code safely
+        local_vars = {"pd": pd, "result": None, "df": session_df.copy()}
+        exec(code, {}, local_vars)
+        result = local_vars.get("result")
+        if isinstance(result, pd.DataFrame):
+            return result, f"Answer based on your question: {query}"
         else:
+            return None, f"No table returned. Code was:\n{code}"
+    except Exception as e:
+        return None, f"Error: {e}"
 def clear_all():
+    global session_df
+    session_df = None
+    gc.collect()
     return (
         gr.File.update(value=None),
         gr.Dataframe.update(value=None),
         gr.Textbox.update(value=""),
+        gr.Textbox.update(value=""),
     )
 with gr.Blocks() as demo:
+    gr.Markdown("# Chat with CSV (Gemini-powered, private API key)")
     with gr.Row():
+        file_input = gr.File(label="Upload CSV/XLSX")
+        load_btn = gr.Button("Load file")
+    file_preview = gr.Dataframe(headers=None, label="Preview (first 5 rows)")
+    file_status = gr.Textbox(label="File status")
+    query_input = gr.Textbox(label="Ask a question")
+    ask_btn = gr.Button("Ask Gemini")
+    result_table = gr.Dataframe(headers=None, label="Result")
+    status = gr.Textbox(label="Status / Messages")
+    clear_btn = gr.Button("Clear / Reset")
+    load_btn.click(fn=load_file, inputs=file_input, outputs=[file_preview, file_status])
+    ask_btn.click(fn=ask_question, inputs=query_input, outputs=[result_table, status])
+    clear_btn.click(fn=clear_all, outputs=[file_input, file_preview, query_input, result_table])
 if __name__ == "__main__":
     demo.launch()