# Commit 841e95d (salihfurkaan): Enhance ML prompt and robust visualization logic
from huggingface_hub import InferenceClient
import os
def get_client(api_token=None, model="Qwen/Qwen2.5-72B-Instruct", timeout=30):
    """Build a Hugging Face ``InferenceClient``, preferring a user token.

    Args:
        api_token: Optional user-supplied HF token. Whitespace-only values are
            treated as absent; surrounding whitespace (e.g. a trailing newline
            from a copy/paste) is stripped so authentication doesn't fail on it.
        model: Model repo id to target. Defaults to Qwen2.5-72B-Instruct, one
            of the most capable open models available on the free tier.
        timeout: Request timeout in seconds.

    Returns:
        A configured ``InferenceClient``. With no token available the client
        runs anonymously (free tier if allowed) or fails at call time.
    """
    # Prioritize explicit user input; otherwise fall back to the environment
    # (Spaces usually provide HF_TOKEN via secrets), which may itself be unset.
    cleaned = api_token.strip() if api_token else ""
    token = cleaned if cleaned else os.getenv("HF_TOKEN")
    return InferenceClient(model, token=token, timeout=timeout)
def generate_text(prompt, client, max_new_tokens=512):
    """Run *prompt* through the chat-completion API and return the reply text.

    Never raises: on API failure it returns a user-facing fallback string —
    a token-setup guide when the error looks like an auth (401) problem,
    otherwise a generic "insights unavailable" notice embedding the error.
    """
    # Message construction cannot fail, so it lives outside the try block.
    chat = [
        {
            "role": "system",
            "content": (
                "You are a senior data analyst. You provide professional, "
                "concise, and accurate insights based on data summaries. "
                "You do NOT hallucinate numbers."
            ),
        },
        {"role": "user", "content": prompt},
    ]
    try:
        result = client.chat_completion(chat, max_tokens=max_new_tokens)
    except Exception as err:  # degrade gracefully instead of crashing the UI
        detail = str(err)
        if "401" in detail or "api_key" in detail.lower():
            return (
                "⚠️ **AI Insights Unavailable**\n\n"
                "Authentication failed. Please provide a valid Hugging Face Token in the sidebar.\n"
                "1. Get a token from [Hugging Face Settings](https://huggingface.co/settings/tokens).\n"
                "2. Paste it into the 'Hugging Face Token' box above."
            )
        return f"Based on the analysis, please review the charts and data profile. (AI Insights unavailable: {detail})"
    return result.choices[0].message.content
def get_insights(overview_text, anomalies_text, api_token=None):
    """Ask the model for 3-5 bullet-point insights on a dataset.

    Combines the profiling overview and the anomaly report into a single
    prompt and delegates the call (and its error handling) to generate_text.
    """
    llm = get_client(api_token)
    analysis_prompt = f"""
Analyze the following dataset summary and anomaly report.
Generate 3-5 key professional insights.
Focus on data quality, distribution patterns, and potential issues.
Do not make up specific values not present in the summary.
Data Summary:
{overview_text}
Anomaly Report:
{anomalies_text}
Output Format:
- Insight 1
- Insight 2
- Insight 3
...
"""
    return generate_text(analysis_prompt, llm)
def get_followup_questions(overview_text, api_token=None):
    """Ask the model for 3-5 numbered follow-up questions about the dataset.

    The questions target business context and data-quality angles an analyst
    should probe next; errors are handled inside generate_text.
    """
    llm = get_client(api_token)
    question_prompt = f"""
Based on the following dataset summary, suggest 3-5 relevant follow-up questions
that a data analyst should ask to deeper understand the business context or data quality.
Data Summary:
{overview_text}
Output Format:
1. Question 1
2. Question 2
3. Question 3
...
"""
    return generate_text(question_prompt, llm)
def ask_llm(message, history, overview_text, api_token=None):
    """Answer a chat *message*, grounded in the dataset summary.

    Args:
        message: The user's latest question.
        history: Prior turns. Accepts both the classic Gradio pair format
            ``[[user_msg, bot_msg], ...]`` and the newer "messages" format
            ``[{"role": ..., "content": ...}, ...]``. (The original code
            tuple-unpacked every turn, which silently turned role/content
            dicts into the literal strings "role"/"content".)
        overview_text: Dataset summary injected as the system context.
        api_token: Optional HF token forwarded to the client.

    Returns:
        The assistant's reply text, or an apology string on API failure.
    """
    client = get_client(api_token)
    context = f"""
You are a helpful Data Analyst Assistant.
You have access to the following dataset summary:
{overview_text}
"""
    messages = [{"role": "system", "content": context}]
    for turn in history:
        if isinstance(turn, dict):
            # Gradio "messages" format: already role/content shaped.
            if turn.get("content"):
                messages.append(
                    {"role": turn.get("role", "user"), "content": turn["content"]}
                )
        else:
            # Classic pair format: [user_msg, bot_msg]; skip empty halves.
            user_msg, bot_msg = turn
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if bot_msg:
                messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    try:
        response = client.chat_completion(messages, max_tokens=512)
        return response.choices[0].message.content
    except Exception as e:
        # Surface the error in-chat instead of breaking the conversation UI.
        return f"Sorry, I encountered an error: {str(e)}"
def get_ml_recommendations(overview_text, api_token=None):
    """Ask the model for a dataset-specific ML modeling strategy.

    The prompt forces column-level, non-generic recommendations (target
    choice, preprocessing per column, model selection with reasons).
    Fixes the mojibake section headers (UTF-8 emoji previously double-encoded
    as e.g. "πŸ€–") so the rendered Markdown shows the intended emoji, matching
    the correctly-encoded "⚠️" used elsewhere in this module.
    """
    client = get_client(api_token)
    prompt = f"""
Based on the following **Data Summary**, act as a Lead Machine Learning Engineer and design a specific modeling strategy.
**CRITICAL INSTRUCTIONS:**
1. **Analyze the Columns**: Look at the column names, types, and sample values in the summary.
2. **Identify Target**: Pick the most logical target variable for a business use case. If none is obvious, suggest a Clustering approach.
3. **Specific Recommendations**: Do NOT list generic steps.
- Instead of "Encode categorical variables", say "One-hot encode 'Region' and Label encode 'Risk_Level'".
- Instead of "Handle missing values", say "Impute 'Age' with median because..."
4. **Model Selection**: Recommend models specifically suited for this dataset's size and features (e.g., "XGBoost because 'Income' is skewed...").
Data Summary:
{overview_text}
Output Format:
## 🤖 ML Modeling Strategy
### 🎯 Objective
- **Target Variable**: [Column Name] (or "None - Unsupervised")
- **Problem Type**: [Regression/Classification/Clustering]
- **Goal**: [Predicting X to optimize Y...]
### 🛠️ Feature Engineering & Preprocessing
- **[Specific Column]**: [Specific Transformation]
- **[Specific Column]**: [Specific handling for missing/outliers]
### 🚀 Model Selection
1. **[Model Name]**: [Why it fits THIS data]
2. **[Model Name]**: [Why it fits THIS data]
"""
    return generate_text(prompt, client)
def analyze_text_content(text_samples, api_token=None):
    """Summarize, sentiment-score, and extract keywords from text samples.

    Args:
        text_samples: Sequence of text lines; only the first 20 are sent to
            the model to keep the prompt small.
        api_token: Optional HF token forwarded to the client.

    Returns:
        Markdown-formatted analysis, or generate_text's fallback message.

    Fixes the mojibake section headers (UTF-8 emoji previously double-encoded
    as e.g. "πŸ“") so the rendered Markdown shows the intended emoji, matching
    the correctly-encoded "⚠️" used elsewhere in this module.
    """
    client = get_client(api_token)
    # Cap the preview at 20 samples to bound prompt size/cost.
    text_preview = "\n".join(text_samples[:20])
    prompt = f"""
Analyze the following text content.
1. **Summarize** the main topics or themes.
2. **Sentiment Analysis**: Determine if the overall tone is Positive, Negative, or Neutral.
3. **Keywords**: Extract 5-7 key entities or terms.
Text Preview:
{text_preview}
Output Format:
## 📝 Text Analysis
### 📌 Summary
[Brief summary of content]
### 🎭 Sentiment
**[Sentiment Label]**: [Reasoning]
### 🔑 Keywords
- [Keyword 1]
- [Keyword 2]
...
"""
    return generate_text(prompt, client)