# csvBot / utils.py
# Author: yonkoyonks — "Update utils.py" (commit 68dc9ea, verified)
from huggingface_hub import InferenceClient
import os
import pandas as pd
def summarize_dataframe(df: pd.DataFrame, max_rows: int = 30) -> str:
    """Build a plain-text summary of *df*: its column names plus a row listing.

    Frames larger than ``max_rows`` rows are represented by a fixed random
    sample (``random_state=42``) so the output size stays bounded and the
    sample is reproducible across calls.

    Args:
        df: The dataframe to describe.
        max_rows: Maximum number of rows to include verbatim.

    Returns:
        A multi-line string: column list, a header line, then the rows
        rendered without the index.
    """
    parts = [f"Columns: {', '.join(df.columns)}\n\n"]
    if len(df) > max_rows:
        parts.append("Showing a random sample of rows:\n")
        shown = df.sample(max_rows, random_state=42)
    else:
        parts.append("Showing all rows:\n")
        shown = df
    parts.append(shown.to_string(index=False))
    return "".join(parts)
def query_agent(df: pd.DataFrame, query: str) -> str:
    """Answer a natural-language question about *df*.

    Frequency questions ("most common" / "most frequent") that mention one
    or more column names are answered directly with pandas. Everything else
    (or a failed direct analysis) is delegated to a hosted LLM via the
    Hugging Face Inference API.

    Args:
        df: The dataframe to analyze.
        query: The user's question in plain English.

    Returns:
        A human-readable answer string. If the model call fails, a warning
        message is returned instead of raising.
    """
    query_lower = query.lower()

    # ----------------- Direct Analysis for Most Common -----------------
    try:
        if "most common" in query_lower or "most frequent" in query_lower:
            # Columns whose (lowercased) name appears anywhere in the query.
            # NOTE(review): plain substring matching can over-match (e.g.
            # column "age" inside "page") — confirm against real column names.
            cols_in_query = [col for col in df.columns if col.lower() in query_lower]
            if len(cols_in_query) == 1:
                col = cols_in_query[0]
                modes = df[col].mode()
                # Guard the empty-frame case explicitly instead of letting
                # mode()[0] raise IndexError into the broad except below.
                if not modes.empty:
                    return f"The most common value in column '{col}' is '{modes[0]}'."
            elif len(cols_in_query) > 1:
                # Most common *combination* of values across the matched columns.
                combo_series = df[cols_in_query].apply(tuple, axis=1)
                combo_modes = combo_series.mode()
                if not combo_modes.empty:
                    combo_str = ", ".join(
                        f"{col}={val}" for col, val in zip(cols_in_query, combo_modes[0])
                    )
                    return f"The most common combination of values is: {combo_str}"
    except Exception as e:
        # Best-effort: log and fall through to the LLM rather than crashing.
        print("Direct analysis failed:", e)

    # ----------------- Use LLM if direct analysis fails -----------------
    data_text = summarize_dataframe(df)
    prompt = f"""
You are a data analysis assistant with expertise in statistics and data interpretation.
Analyze the dataset sample below and answer the user's question in a clear, detailed, and well-explained way.
Include both the direct answer and a short explanation or reasoning behind it.
Dataset Summary:
{data_text}
Question:
{query}
Answer (with explanation):
"""
    # Initialize client with explicit provider; token may be None if the
    # env var is unset — the API call will then fail and be handled below.
    client = InferenceClient(
        model="google/gemma-2b-it",
        provider="hf-inference",
        token=os.environ.get("HUGGINGFACE_API_TOKEN"),
    )
    try:
        response = client.text_generation(
            prompt,
            max_new_tokens=1024,
            temperature=0.7,
        )
    except Exception as e:
        print("Model call failed:", e)
        return "⚠️ Sorry, the model could not generate an answer. Please try again."
    # text_generation normally returns a plain string; keep the defensive
    # extraction for dict/list payloads in case the client's return changes.
    if isinstance(response, str):
        return response
    elif isinstance(response, dict) and "generated_text" in response:
        return response["generated_text"]
    elif isinstance(response, list) and len(response) > 0 and "generated_text" in response[0]:
        return response[0]["generated_text"]
    else:
        return str(response)