# Commit 841e95d (salihfurkaan): Enhance ML prompt and robust visualization logic
from huggingface_hub import InferenceClient
import os
def get_client(api_token=None, model="Qwen/Qwen2.5-72B-Instruct", timeout=30):
    """Build a Hugging Face ``InferenceClient``, preferring a user token.

    Args:
        api_token: Optional user-supplied HF token. Whitespace-only values are
            treated as absent; surrounding whitespace (e.g. a trailing newline
            from a copy/paste) is stripped so authentication doesn't fail on it.
        model: Model repo id to target. Defaults to Qwen2.5-72B-Instruct, one
            of the most capable open models available on the free tier.
        timeout: Request timeout in seconds.

    Returns:
        A configured ``InferenceClient``. With no token available the client
        runs anonymously (free tier if allowed) or fails at call time.
    """
    # Prioritize explicit user input; otherwise fall back to the environment
    # (Spaces usually provide HF_TOKEN via secrets), which may itself be unset.
    cleaned = api_token.strip() if api_token else ""
    token = cleaned if cleaned else os.getenv("HF_TOKEN")
    return InferenceClient(model, token=token, timeout=timeout)
def generate_text(prompt, client, max_new_tokens=512):
    """Run *prompt* through the chat-completion API and return the reply text.

    Never raises: on API failure it returns a user-facing fallback string —
    a token-setup guide when the error looks like an auth (401) problem,
    otherwise a generic "insights unavailable" notice embedding the error.
    """
    # Message construction cannot fail, so it lives outside the try block.
    chat = [
        {
            "role": "system",
            "content": (
                "You are a senior data analyst. You provide professional, "
                "concise, and accurate insights based on data summaries. "
                "You do NOT hallucinate numbers."
            ),
        },
        {"role": "user", "content": prompt},
    ]
    try:
        result = client.chat_completion(chat, max_tokens=max_new_tokens)
    except Exception as err:  # degrade gracefully instead of crashing the UI
        detail = str(err)
        if "401" in detail or "api_key" in detail.lower():
            return (
                "⚠️ **AI Insights Unavailable**\n\n"
                "Authentication failed. Please provide a valid Hugging Face Token in the sidebar.\n"
                "1. Get a token from [Hugging Face Settings](https://huggingface.co/settings/tokens).\n"
                "2. Paste it into the 'Hugging Face Token' box above."
            )
        return f"Based on the analysis, please review the charts and data profile. (AI Insights unavailable: {detail})"
    return result.choices[0].message.content
def get_insights(overview_text, anomalies_text, api_token=None):
    """Ask the model for 3-5 bullet-point insights on a dataset.

    Combines the profiling overview and the anomaly report into a single
    prompt and delegates the call (and its error handling) to generate_text.
    """
    llm = get_client(api_token)
    analysis_prompt = f"""
Analyze the following dataset summary and anomaly report.
Generate 3-5 key professional insights.
Focus on data quality, distribution patterns, and potential issues.
Do not make up specific values not present in the summary.
Data Summary:
{overview_text}
Anomaly Report:
{anomalies_text}
Output Format:
- Insight 1
- Insight 2
- Insight 3
...
"""
    return generate_text(analysis_prompt, llm)
def get_followup_questions(overview_text, api_token=None):
    """Ask the model for 3-5 numbered follow-up questions about the dataset.

    The questions target business context and data-quality angles an analyst
    should probe next; errors are handled inside generate_text.
    """
    llm = get_client(api_token)
    question_prompt = f"""
Based on the following dataset summary, suggest 3-5 relevant follow-up questions
that a data analyst should ask to deeper understand the business context or data quality.
Data Summary:
{overview_text}
Output Format:
1. Question 1
2. Question 2
3. Question 3
...
"""
    return generate_text(question_prompt, llm)
def ask_llm(message, history, overview_text, api_token=None):
    """Answer a chat *message*, grounded in the dataset summary.

    Args:
        message: The user's latest question.
        history: Prior turns. Accepts both the classic Gradio pair format
            ``[[user_msg, bot_msg], ...]`` and the newer "messages" format
            ``[{"role": ..., "content": ...}, ...]``. (The original code
            tuple-unpacked every turn, which silently turned role/content
            dicts into the literal strings "role"/"content".)
        overview_text: Dataset summary injected as the system context.
        api_token: Optional HF token forwarded to the client.

    Returns:
        The assistant's reply text, or an apology string on API failure.
    """
    client = get_client(api_token)
    context = f"""
You are a helpful Data Analyst Assistant.
You have access to the following dataset summary:
{overview_text}
"""
    messages = [{"role": "system", "content": context}]
    for turn in history:
        if isinstance(turn, dict):
            # Gradio "messages" format: already role/content shaped.
            if turn.get("content"):
                messages.append(
                    {"role": turn.get("role", "user"), "content": turn["content"]}
                )
        else:
            # Classic pair format: [user_msg, bot_msg]; skip empty halves.
            user_msg, bot_msg = turn
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if bot_msg:
                messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    try:
        response = client.chat_completion(messages, max_tokens=512)
        return response.choices[0].message.content
    except Exception as e:
        # Surface the error in-chat instead of breaking the conversation UI.
        return f"Sorry, I encountered an error: {str(e)}"
def get_ml_recommendations(overview_text, api_token=None):
    """Ask the model for a dataset-specific ML modeling strategy.

    The prompt forces column-level, non-generic recommendations (target
    choice, preprocessing per column, model selection with reasons).
    Fixes the mojibake section headers (UTF-8 emoji previously double-encoded
    as e.g. "πŸ€–") so the rendered Markdown shows the intended emoji, matching
    the correctly-encoded "⚠️" used elsewhere in this module.
    """
    client = get_client(api_token)
    prompt = f"""
Based on the following **Data Summary**, act as a Lead Machine Learning Engineer and design a specific modeling strategy.
**CRITICAL INSTRUCTIONS:**
1. **Analyze the Columns**: Look at the column names, types, and sample values in the summary.
2. **Identify Target**: Pick the most logical target variable for a business use case. If none is obvious, suggest a Clustering approach.
3. **Specific Recommendations**: Do NOT list generic steps.
- Instead of "Encode categorical variables", say "One-hot encode 'Region' and Label encode 'Risk_Level'".
- Instead of "Handle missing values", say "Impute 'Age' with median because..."
4. **Model Selection**: Recommend models specifically suited for this dataset's size and features (e.g., "XGBoost because 'Income' is skewed...").
Data Summary:
{overview_text}
Output Format:
## 🤖 ML Modeling Strategy
### 🎯 Objective
- **Target Variable**: [Column Name] (or "None - Unsupervised")
- **Problem Type**: [Regression/Classification/Clustering]
- **Goal**: [Predicting X to optimize Y...]
### 🛠️ Feature Engineering & Preprocessing
- **[Specific Column]**: [Specific Transformation]
- **[Specific Column]**: [Specific handling for missing/outliers]
### 🚀 Model Selection
1. **[Model Name]**: [Why it fits THIS data]
2. **[Model Name]**: [Why it fits THIS data]
"""
    return generate_text(prompt, client)
def analyze_text_content(text_samples, api_token=None):
    """Summarize, sentiment-score, and extract keywords from text samples.

    Args:
        text_samples: Sequence of text lines; only the first 20 are sent to
            the model to keep the prompt small.
        api_token: Optional HF token forwarded to the client.

    Returns:
        Markdown-formatted analysis, or generate_text's fallback message.

    Fixes the mojibake section headers (UTF-8 emoji previously double-encoded
    as e.g. "πŸ“") so the rendered Markdown shows the intended emoji, matching
    the correctly-encoded "⚠️" used elsewhere in this module.
    """
    client = get_client(api_token)
    # Cap the preview at 20 samples to bound prompt size/cost.
    text_preview = "\n".join(text_samples[:20])
    prompt = f"""
Analyze the following text content.
1. **Summarize** the main topics or themes.
2. **Sentiment Analysis**: Determine if the overall tone is Positive, Negative, or Neutral.
3. **Keywords**: Extract 5-7 key entities or terms.
Text Preview:
{text_preview}
Output Format:
## 📝 Text Analysis
### 📌 Summary
[Brief summary of content]
### 🎭 Sentiment
**[Sentiment Label]**: [Reasoning]
### 🔑 Keywords
- [Keyword 1]
- [Keyword 2]
...
"""
    return generate_text(prompt, client)