from huggingface_hub import InferenceClient
import os
import json
import pandas as pd

def compute_dataset_stats(dataset_summary):
    """Return descriptive statistics for the given dataset summary.

    The summary is loaded into a pandas DataFrame and described with
    ``include='all'``, so every column is covered, not only numeric ones.

    Args:
        dataset_summary: Any input accepted by the ``pd.DataFrame``
            constructor.

    Returns:
        The ``describe`` table as a nested dict keyed by column, or an
        empty dict when the frame cannot be built or described.
    """
    try:
        frame = pd.DataFrame(dataset_summary)
        described = frame.describe(include="all")
        return described.to_dict()
    except Exception:
        # Best-effort: any failure is reported as "no statistics".
        return {}

def format_stats(stats):
    """Render statistics as indented JSON text.

    Args:
        stats: The statistics to render, normally a nested dict.

    Returns:
        A JSON string indented by two spaces, or the text
        "No statistics available." when ``stats`` is ``None``, an empty
        dict, or cannot be serialized to JSON.
    """
    fallback = "No statistics available."

    # An empty result means no statistics were computed; say so explicitly
    # instead of rendering a bare "{}" or "null".
    if stats is None or (isinstance(stats, dict) and not stats):
        return fallback

    try:
        return json.dumps(stats, indent=2)
    except (TypeError, ValueError, RecursionError):
        # TypeError: unserializable value or key; ValueError: circular
        # reference; RecursionError: excessively deep nesting.
        return fallback

def get_hf_client():
    """Create an InferenceClient from the HF_TOKEN environment variable.

    Returns:
        An ``InferenceClient`` built with the token, or ``None`` when
        HF_TOKEN is unset or empty.
    """
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        return None
    return InferenceClient(token=hf_token)

def query_agent_from_csv(user_query, dataset_summary, chat_history, model_repo):
    """Ask a hosted chat model a question about a dataset summary.

    Builds a prompt containing the dataset summary, its descriptive
    statistics, the last five chat-history entries and the user's question,
    then sends it through the Hugging Face inference client.

    Args:
        user_query: The question to answer.
        dataset_summary: Summary of the dataset; embedded in the prompt as
            JSON and passed to ``compute_dataset_stats``.
        chat_history: Sequence of earlier conversation entries; only the
            last five are included. ``None`` is treated as no history.
        model_repo: Model identifier passed as ``model`` to
            ``chat_completion``.

    Returns:
        A ``(answer, error)`` tuple of strings. On success ``error`` is
        empty; on failure ``answer`` is empty and ``error`` describes the
        problem. Failures are reported through the tuple, not raised.
    """
    client = get_hf_client()
    if client is None:
        return "", "Missing HF_TOKEN. Please set it in your environment or secrets."

    dataset_stats = compute_dataset_stats(dataset_summary)
    formatted_stats = format_stats(dataset_stats)

    # Tolerate a missing history instead of failing on the slice.
    recent_history = (chat_history or [])[-5:]

    try:
        # default=str keeps values that are not JSON-native (timestamps,
        # numpy scalars, ...) readable in the prompt instead of raising.
        summary_json = json.dumps(dataset_summary, indent=2, default=str)
        history_json = json.dumps(recent_history, indent=2, default=str)
    except (TypeError, ValueError) as e:
        # Still possible for unsupported dict keys or circular references.
        return "", f"Error preparing prompt: {e}"

    prompt = f"""
You are a helpful and expert data analyst.
Use ONLY the dataset info and summary statistics below to answer.

📊 Dataset Summary:
{summary_json}

📌 Descriptive Statistics (mean, median, std, etc.):
{formatted_stats}

💬 Previous Conversation:
{history_json}

❓ Current Question: {user_query}

⚠️ Important Rules:
- If a value cannot be derived from dataset, clearly say "The dataset does not provide this information."
- Show calculations briefly if applicable.
- Be concise and factual.
"""

    try:
        response = client.chat_completion(
            model=model_repo,
            messages=[
                {"role": "system", "content": "You answer based ONLY on the dataset provided."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=512,
        )
        # Attribute access is the documented way to read the message;
        # normalise a missing content to an empty string.
        answer = response.choices[0].message.content or ""
        return answer, ""
    except Exception as e:
        return "", f"Error contacting model: {e}"