# csvBot / utils.py
# Author: yonkoyonks — "Update utils.py" (commit 68dc9ea, verified)
from huggingface_hub import InferenceClient
import os
import pandas as pd
def summarize_dataframe(df: pd.DataFrame, max_rows: int = 30) -> str:
    """Build a plain-text summary of *df*: its column names plus a row listing.

    Frames larger than ``max_rows`` rows are represented by a fixed random
    sample (``random_state=42``) so the output size stays bounded and the
    sample is reproducible across calls.

    Args:
        df: The dataframe to describe.
        max_rows: Maximum number of rows to include verbatim.

    Returns:
        A multi-line string: column list, a header line, then the rows
        rendered without the index.
    """
    parts = [f"Columns: {', '.join(df.columns)}\n\n"]
    if len(df) > max_rows:
        parts.append("Showing a random sample of rows:\n")
        shown = df.sample(max_rows, random_state=42)
    else:
        parts.append("Showing all rows:\n")
        shown = df
    parts.append(shown.to_string(index=False))
    return "".join(parts)
def query_agent(df: pd.DataFrame, query: str) -> str:
    """Answer a natural-language question about *df*.

    Frequency questions ("most common" / "most frequent") that mention one
    or more column names are answered directly with pandas. Everything else
    (or a failed direct analysis) is delegated to a hosted LLM via the
    Hugging Face Inference API.

    Args:
        df: The dataframe to analyze.
        query: The user's question in plain English.

    Returns:
        A human-readable answer string. If the model call fails, a warning
        message is returned instead of raising.
    """
    query_lower = query.lower()

    # ----------------- Direct Analysis for Most Common -----------------
    try:
        if "most common" in query_lower or "most frequent" in query_lower:
            # Columns whose (lowercased) name appears anywhere in the query.
            # NOTE(review): plain substring matching can over-match (e.g.
            # column "age" inside "page") — confirm against real column names.
            cols_in_query = [col for col in df.columns if col.lower() in query_lower]
            if len(cols_in_query) == 1:
                col = cols_in_query[0]
                modes = df[col].mode()
                # Guard the empty-frame case explicitly instead of letting
                # mode()[0] raise IndexError into the broad except below.
                if not modes.empty:
                    return f"The most common value in column '{col}' is '{modes[0]}'."
            elif len(cols_in_query) > 1:
                # Most common *combination* of values across the matched columns.
                combo_series = df[cols_in_query].apply(tuple, axis=1)
                combo_modes = combo_series.mode()
                if not combo_modes.empty:
                    combo_str = ", ".join(
                        f"{col}={val}" for col, val in zip(cols_in_query, combo_modes[0])
                    )
                    return f"The most common combination of values is: {combo_str}"
    except Exception as e:
        # Best-effort: log and fall through to the LLM rather than crashing.
        print("Direct analysis failed:", e)

    # ----------------- Use LLM if direct analysis fails -----------------
    data_text = summarize_dataframe(df)
    prompt = f"""
You are a data analysis assistant with expertise in statistics and data interpretation.
Analyze the dataset sample below and answer the user's question in a clear, detailed, and well-explained way.
Include both the direct answer and a short explanation or reasoning behind it.
Dataset Summary:
{data_text}
Question:
{query}
Answer (with explanation):
"""
    # Initialize client with explicit provider; token may be None if the
    # env var is unset — the API call will then fail and be handled below.
    client = InferenceClient(
        model="google/gemma-2b-it",
        provider="hf-inference",
        token=os.environ.get("HUGGINGFACE_API_TOKEN"),
    )
    try:
        response = client.text_generation(
            prompt,
            max_new_tokens=1024,
            temperature=0.7,
        )
    except Exception as e:
        print("Model call failed:", e)
        return "⚠️ Sorry, the model could not generate an answer. Please try again."
    # text_generation normally returns a plain string; keep the defensive
    # extraction for dict/list payloads in case the client's return changes.
    if isinstance(response, str):
        return response
    elif isinstance(response, dict) and "generated_text" in response:
        return response["generated_text"]
    elif isinstance(response, list) and len(response) > 0 and "generated_text" in response[0]:
        return response[0]["generated_text"]
    else:
        return str(response)