# Source: csv-redaer-bot / utils.py
# Page header residue: charesz's picture | Update utils.py | 46b0e84 verified
from huggingface_hub import InferenceClient
import os
import json
import pandas as pd
def compute_dataset_stats(dataset_summary):
    """Return descriptive statistics for ``dataset_summary`` as a nested dict.

    The input is loaded into a pandas ``DataFrame`` and summarized with
    ``describe(include='all')``; the resulting table is converted with
    ``to_dict()``, giving ``{column: {statistic: value}}``.

    Returns:
        The statistics mapping, or an empty dict if any step raises.
    """
    try:
        return pd.DataFrame(dataset_summary).describe(include='all').to_dict()
    except Exception:
        # Best-effort: input that cannot be tabulated or described simply
        # yields no statistics instead of propagating the error.
        return {}
def format_stats(stats):
    """Render a statistics object as pretty-printed JSON text.

    Args:
        stats: The object to serialize (typically a nested dict of
            statistics).

    Returns:
        ``stats`` serialized with ``json.dumps(..., indent=2)``, or the
        fallback string ``"No statistics available."`` when ``stats`` cannot
        be serialized.
    """
    try:
        return json.dumps(stats, indent=2)
    except (TypeError, ValueError, RecursionError):
        # TypeError: unsupported value or key type; ValueError: circular
        # reference; RecursionError: nesting too deep. Anything else
        # (e.g. KeyboardInterrupt, SystemExit) is allowed to propagate
        # instead of being swallowed by a bare ``except``.
        return "No statistics available."
def get_hf_client():
    """Build an ``InferenceClient`` from the ``HF_TOKEN`` environment variable.

    Returns:
        An ``InferenceClient`` constructed with the token, or ``None`` when
        ``HF_TOKEN`` is unset or empty.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        return None
    return InferenceClient(token=hf_token)
def query_agent_from_csv(user_query, dataset_summary, chat_history, model_repo):
    """Ask a chat model a question about a dataset summary.

    Builds a prompt from the dataset summary, its descriptive statistics
    (via ``compute_dataset_stats`` / ``format_stats``), the last five entries
    of the chat history and the user's question, then sends it to
    ``model_repo`` through the client returned by ``get_hf_client``.

    Args:
        user_query: The question to put to the model.
        dataset_summary: JSON-serializable description of the dataset.
        chat_history: Sliceable sequence of earlier conversation entries;
            only the last five are included. ``None`` is treated as an
            empty history.
        model_repo: Model identifier passed as ``model`` to
            ``chat_completion``.

    Returns:
        A ``(answer, error)`` tuple. On success ``error`` is ``""``; on any
        failure (missing token, unserializable input, model call error)
        ``answer`` is ``""`` and ``error`` describes the problem.
    """
    client = get_hf_client()
    if client is None:
        return "", "Missing HF_TOKEN. Please set it in your environment or secrets."

    formatted_stats = format_stats(compute_dataset_stats(dataset_summary))

    try:
        summary_json = json.dumps(dataset_summary, indent=2)
        # ``or []`` lets callers pass None before any conversation exists.
        history_json = json.dumps((chat_history or [])[-5:], indent=2)
    except (TypeError, ValueError) as e:
        # Report bad input through the error slot of the return tuple
        # instead of letting the exception escape to the caller.
        return "", f"Error preparing prompt: {e}"

    # The empty first and last items reproduce the leading and trailing
    # newline of the original triple-quoted prompt.
    prompt = "\n".join(
        [
            "",
            "You are a helpful and expert data analyst.",
            "Use ONLY the dataset info and summary statistics below to answer.",
            "📊 Dataset Summary:",
            summary_json,
            "📌 Descriptive Statistics (mean, median, std, etc.):",
            formatted_stats,
            "💬 Previous Conversation:",
            history_json,
            f"❓ Current Question: {user_query}",
            "⚠️ Important Rules:",
            '- If a value cannot be derived from dataset, clearly say "The dataset does not provide this information."',
            "- Show calculations briefly if applicable.",
            "- Be concise and factual.",
            "",
        ]
    )

    try:
        response = client.chat_completion(
            model=model_repo,
            messages=[
                {"role": "system", "content": "You answer based ONLY on the dataset provided."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=512,
        )
        answer = response.choices[0].message["content"]
        return answer, ""
    except Exception as e:
        # Boundary with the remote service: any failure becomes an error
        # message rather than an exception.
        return "", f"Error contacting model: {e}"