import json
import os

import pandas as pd
from huggingface_hub import InferenceClient


def compute_dataset_stats(dataset_summary):
    """Return descriptive statistics for ``dataset_summary`` as a nested dict.

    The summary is loaded into a ``pandas.DataFrame`` and described with
    ``describe(include='all')``, so the result holds whatever pandas reports
    for every column (not only numeric ones).

    Args:
        dataset_summary: Any value accepted by the ``pd.DataFrame`` constructor.

    Returns:
        ``{column: {statistic: value}}`` on success, or ``{}`` when the
        summary cannot be turned into a frame or described.
    """
    try:
        frame = pd.DataFrame(dataset_summary)
        return frame.describe(include="all").to_dict()
    except Exception:
        # Deliberately best-effort: statistics are optional context for the
        # prompt, so any pandas failure degrades to "no statistics".
        return {}


def format_stats(stats):
    """Serialize ``stats`` to indented JSON text.

    Args:
        stats: The statistics mapping to render.

    Returns:
        The JSON text, or the literal ``"No statistics available."`` when
        ``stats`` cannot be serialized.
    """
    try:
        return json.dumps(stats, indent=2)
    except (TypeError, ValueError, RecursionError):
        # TypeError: non-serializable value or key; ValueError: circular
        # reference; RecursionError: excessively deep nesting.
        return "No statistics available."


def get_hf_client():
    """Build an ``InferenceClient`` from the ``HF_TOKEN`` environment variable.

    Returns:
        An ``InferenceClient`` when ``HF_TOKEN`` is set to a non-empty value,
        otherwise ``None``.
    """
    token = os.getenv("HF_TOKEN")
    if not token:
        return None
    return InferenceClient(token=token)


def _json_for_prompt(value):
    """Render ``value`` as indented text for embedding in the prompt.

    Strict JSON is tried first. If that fails, non-serializable leaves are
    stringified via ``default=str``; if even that fails (for example because
    of unsupported dict keys), ``str(value)`` is used so prompt construction
    never raises.
    """
    try:
        return json.dumps(value, indent=2)
    except (TypeError, ValueError):
        pass
    try:
        return json.dumps(value, indent=2, default=str)
    except (TypeError, ValueError):
        return str(value)


def query_agent_from_csv(user_query, dataset_summary, chat_history, model_repo):
    """Ask a hosted chat model a question about a dataset summary.

    Args:
        user_query: The question to put to the model.
        dataset_summary: Summary data included in the prompt and used to
            compute descriptive statistics.
        chat_history: Prior conversation; only the last five entries are
            included in the prompt. ``None`` is treated as an empty history.
        model_repo: Model identifier passed as ``model`` to
            ``chat_completion``.

    Returns:
        A ``(answer, error)`` pair. On success ``error`` is ``""``; on failure
        ``answer`` is ``""`` and ``error`` describes the problem.
    """
    client = get_hf_client()
    if client is None:
        return "", "Missing HF_TOKEN. Please set it in your environment or secrets."

    formatted_stats = format_stats(compute_dataset_stats(dataset_summary))
    summary_text = _json_for_prompt(dataset_summary)
    # Guard against a missing history instead of failing on None[-5:].
    history_text = _json_for_prompt((chat_history or [])[-5:])

    # The prompt body is kept flush-left so no source indentation leaks into
    # the text sent to the model.
    prompt = f"""
You are a helpful and expert data analyst. Use ONLY the dataset info and summary statistics below to answer.

📊 Dataset Summary:
{summary_text}

📌 Descriptive Statistics (mean, median, std, etc.):
{formatted_stats}

💬 Previous Conversation:
{history_text}

❓ Current Question:
{user_query}

⚠️ Important Rules:
- If a value cannot be derived from dataset, clearly say "The dataset does not provide this information."
- Show calculations briefly if applicable.
- Be concise and factual.
"""

    try:
        response = client.chat_completion(
            model=model_repo,
            messages=[
                {"role": "system", "content": "You answer based ONLY on the dataset provided."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=512,
        )
        answer = response.choices[0].message["content"]
    except Exception as e:
        # Boundary handler: any client or response-shape failure is reported
        # through the error slot of the return pair rather than raised.
        return "", f"Error contacting model: {e}"
    return answer, ""