Spaces:

charesz
/

csv-redaer-bot

Sleeping

App Files Files Community

csv-redaer-bot / utils.py

charesz

Update utils.py

46b0e84 verified 4 months ago

raw

history blame contribute delete

2.02 kB

	from huggingface_hub import InferenceClient
	import os
	import json
	import pandas as pd

	def compute_dataset_stats(dataset_summary):
	    """Summarise *dataset_summary* with pandas' ``describe``.

	    The input is loaded into a ``pd.DataFrame`` and described with
	    ``include='all'``, so every column is covered, not only numeric ones.
	    The median is reported under pandas' ``"50%"`` label.

	    Returns:
	        The ``describe`` table converted with ``to_dict()`` (a mapping of
	        column -> {statistic -> value}), or an empty dict when the input
	        cannot be loaded or described.
	    """
	    try:
	        frame = pd.DataFrame(dataset_summary)
	        described = frame.describe(include='all')
	        return described.to_dict()
	    except Exception:
	        # Best-effort: statistics are optional context for the prompt, so
	        # any failure degrades to "no statistics" instead of raising.
	        return {}

	def format_stats(stats):
	    """Render *stats* as indented JSON text for inclusion in a prompt.

	    Args:
	        stats: Any object; normally the dict produced by
	            ``compute_dataset_stats``.

	    Returns:
	        ``json.dumps(stats, indent=2)``, or the string
	        ``"No statistics available."`` when *stats* cannot be encoded
	        (unsupported value or key type, circular reference, or nesting
	        too deep for the encoder).
	    """
	    try:
	        return json.dumps(stats, indent=2)
	    except (TypeError, ValueError, RecursionError):
	        # Only encoding failures fall back to the placeholder; a bare
	        # ``except`` here would also swallow KeyboardInterrupt/SystemExit.
	        return "No statistics available."

	def get_hf_client():
	    """Build an ``InferenceClient`` from the ``HF_TOKEN`` environment variable.

	    Returns:
	        An ``InferenceClient`` constructed with the token, or ``None`` when
	        ``HF_TOKEN`` is unset or empty.
	    """
	    hf_token = os.getenv("HF_TOKEN")
	    if not hf_token:
	        return None
	    return InferenceClient(token=hf_token)

	def query_agent_from_csv(user_query, dataset_summary, chat_history, model_repo):
	    """Ask a hosted chat model a question about a dataset summary.

	    The prompt contains the dataset summary, the descriptive statistics
	    derived from it, the last five entries of the conversation and the
	    current question.

	    Args:
	        user_query: The question, inserted verbatim into the prompt.
	        dataset_summary: Data describing the dataset; it is passed to
	            ``compute_dataset_stats`` and serialised as JSON.
	        chat_history: Sequence of earlier turns, or ``None``; only the
	            last five entries are sent.
	        model_repo: Model identifier passed as ``model=`` to
	            ``client.chat_completion``.

	    Returns:
	        A ``(answer, error)`` tuple. On success ``error`` is ``""``; on any
	        failure ``answer`` is ``""`` and ``error`` describes the problem.
	    """
	    client = get_hf_client()
	    if client is None:
	        return "", "Missing HF_TOKEN. Please set it in your environment or secrets."

	    dataset_stats = compute_dataset_stats(dataset_summary)
	    formatted_stats = format_stats(dataset_stats)

	    # Serialise up front so that bad input is reported through the error
	    # slot of the return tuple instead of escaping as an exception.
	    try:
	        recent_turns = (chat_history if chat_history is not None else [])[-5:]
	        # default=str: the text is only read by the model, so stringifying
	        # values JSON cannot encode natively is preferable to failing.
	        summary_json = json.dumps(dataset_summary, indent=2, default=str)
	        history_json = json.dumps(recent_turns, indent=2, default=str)
	    except (TypeError, ValueError) as exc:
	        return "", f"Error preparing prompt: {exc}"

	    prompt = f"""
	You are a helpful and expert data analyst.
	Use ONLY the dataset info and summary statistics below to answer.

	📊 Dataset Summary:
	{summary_json}

	📌 Descriptive Statistics (mean, median, std, etc.):
	{formatted_stats}

	💬 Previous Conversation:
	{history_json}

	❓ Current Question: {user_query}

	⚠️ Important Rules:
	- If a value cannot be derived from dataset, clearly say "The dataset does not provide this information."
	- Show calculations briefly if applicable.
	- Be concise and factual.
	"""

	    try:
	        response = client.chat_completion(
	            model=model_repo,
	            messages=[
	                {"role": "system", "content": "You answer based ONLY on the dataset provided."},
	                {"role": "user", "content": prompt},
	            ],
	            max_tokens=512,
	        )
	        answer = response.choices[0].message["content"]
	        return answer, ""
	    except Exception as exc:
	        # Boundary with a remote service: any failure (network, auth,
	        # unexpected response shape) becomes an error string for the caller.
	        return "", f"Error contacting model: {exc}"