Spaces:

yonkoyonks
/

csvBot

Build error

App Files Community

yonkoyonks committed on Oct 16, 2025

Commit

e89bea7

verified ·

1 Parent(s): 6b78fee

Update utils.py

Browse files

Files changed (1) hide show

utils.py +7 -38

utils.py CHANGED Viewed

@@ -2,29 +2,7 @@ from huggingface_hub import InferenceClient
 import os
 import pandas as pd
-# ------------------- Helper Functions -------------------
-def split_multi_value_columns(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Automatically splits any column that contains multiple comma-separated values
-    into separate columns.
-    """
-    new_df = df.copy()
-    for col in df.columns:
-        # Check if the first non-null row contains a comma
-        sample = df[col].dropna().iloc[0] if not df[col].dropna().empty else ""
-        if isinstance(sample, str) and "," in sample:
-            # Split the column into multiple columns
-            split_cols = df[col].str.split(",", expand=True)
-            split_cols = split_cols.rename(columns=lambda i: f"{col}_{i+1}")
-            new_df = new_df.drop(columns=[col]).join(split_cols)
-    return new_df
 def summarize_dataframe(df: pd.DataFrame, max_rows: int = 30) -> str:
-    """
-    Returns a text summary of the dataframe for LLM prompts.
-    """
     summary = f"Columns: {', '.join(df.columns)}\n\n"
     if len(df) > max_rows:
         sample = df.sample(max_rows, random_state=42)
@@ -35,32 +13,23 @@ def summarize_dataframe(df: pd.DataFrame, max_rows: int = 30) -> str:
     summary += sample.to_string(index=False)
     return summary
-# ------------------- Main Query Function -------------------
 def query_agent(df: pd.DataFrame, query: str) -> str:
-    """
-    Analyzes a dataframe to answer queries. Supports:
-    - Direct analysis of most common values (single or multiple columns)
-    - Fallback to LLM using google/gemma-2b-it
-    """
-    # Automatically split multi-value columns
-    df = split_multi_value_columns(df)
     query_lower = query.lower()
-    # -------- Direct Analysis for Most Common Values --------
     try:
         if "most common" in query_lower or "most frequent" in query_lower:
-            # Find all columns mentioned in the query
             cols_in_query = [col for col in df.columns if col.lower() in query_lower]
             if len(cols_in_query) == 1:
                 col = cols_in_query[0]
                 value = df[col].mode()[0]
                 return f"The most common value in column '{col}' is '{value}'."
             elif len(cols_in_query) > 1:
-                # Most common combination across multiple columns
                 combo_series = df[cols_in_query].apply(lambda row: tuple(row), axis=1)
                 most_common_combo = combo_series.mode()[0]
                 combo_str = ", ".join(f"{col}={val}" for col, val in zip(cols_in_query, most_common_combo))
@@ -69,7 +38,7 @@ def query_agent(df: pd.DataFrame, query: str) -> str:
     except Exception as e:
         print("Direct analysis failed:", e)
-    # -------- Use LLM Fallback if Direct Analysis Fails --------
     data_text = summarize_dataframe(df)
     prompt = f"""
 You are a data analysis assistant with expertise in statistics and data interpretation.
@@ -85,7 +54,7 @@ Question:
 Answer (with explanation):
 """
-    # Initialize Hugging Face InferenceClient with explicit provider
     client = InferenceClient(
         model="google/gemma-2b-it",
         provider="hf-inference",

 import os
 import pandas as pd
 def summarize_dataframe(df: pd.DataFrame, max_rows: int = 30) -> str:
     summary = f"Columns: {', '.join(df.columns)}\n\n"
     if len(df) > max_rows:
         sample = df.sample(max_rows, random_state=42)
     summary += sample.to_string(index=False)
     return summary
 def query_agent(df: pd.DataFrame, query: str) -> str:
     query_lower = query.lower()
+    # ----------------- Direct Analysis for Most Common -----------------
     try:
         if "most common" in query_lower or "most frequent" in query_lower:
+            # Look for multiple columns in query
             cols_in_query = [col for col in df.columns if col.lower() in query_lower]
             if len(cols_in_query) == 1:
                 col = cols_in_query[0]
                 value = df[col].mode()[0]
                 return f"The most common value in column '{col}' is '{value}'."
             elif len(cols_in_query) > 1:
+                # Compute most common combination of values across the columns
                 combo_series = df[cols_in_query].apply(lambda row: tuple(row), axis=1)
                 most_common_combo = combo_series.mode()[0]
                 combo_str = ", ".join(f"{col}={val}" for col, val in zip(cols_in_query, most_common_combo))
     except Exception as e:
         print("Direct analysis failed:", e)
+    # ----------------- Use LLM if direct analysis fails -----------------
     data_text = summarize_dataframe(df)
     prompt = f"""
 You are a data analysis assistant with expertise in statistics and data interpretation.
 Answer (with explanation):
 """
+    # Initialize client with explicit provider
     client = InferenceClient(
         model="google/gemma-2b-it",
         provider="hf-inference",