from huggingface_hub import InferenceClient
import os
def get_client(api_token=None):
    """Build an InferenceClient for Qwen2.5-72B-Instruct.

    A non-empty user-supplied token takes priority; otherwise we fall back
    to the HF_TOKEN environment variable (which may be unset, in which case
    the client runs anonymously if the endpoint allows it).
    """
    has_user_token = bool(api_token and api_token.strip())
    token = api_token if has_user_token else os.getenv("HF_TOKEN")
    # Qwen2.5-72B-Instruct: one of the most capable open models on the free tier.
    return InferenceClient("Qwen/Qwen2.5-72B-Instruct", token=token, timeout=30)
def generate_text(prompt, client, max_new_tokens=512):
    """Send `prompt` to the LLM via `client` and return the reply text.

    Never raises: on any failure a user-facing fallback string is returned
    so the calling UI can render it directly. Auth failures (401 / bad
    api_key) get a dedicated help message.
    """
    system_message = {
        "role": "system",
        "content": "You are a senior data analyst. You provide professional, concise, and accurate insights based on data summaries. You do NOT hallucinate numbers.",
    }
    conversation = [system_message, {"role": "user", "content": prompt}]
    try:
        reply = client.chat_completion(conversation, max_tokens=max_new_tokens)
        return reply.choices[0].message.content
    except Exception as exc:  # deliberate best-effort fallback for the UI
        detail = str(exc)
        # A 401 or "api_key" complaint means the token is missing/invalid.
        if "401" in detail or "api_key" in detail.lower():
            return (
                "β οΈ **AI Insights Unavailable**\n\n"
                "Authentication failed. Please provide a valid Hugging Face Token in the sidebar.\n"
                "1. Get a token from [Hugging Face Settings](https://huggingface.co/settings/tokens).\n"
                "2. Paste it into the 'Hugging Face Token' box above."
            )
        return f"Based on the analysis, please review the charts and data profile. (AI Insights unavailable: {detail})"
def get_insights(overview_text, anomalies_text, api_token=None):
    """Generate 3-5 professional insights from the dataset summary and anomaly report."""
    llm = get_client(api_token)
    insight_prompt = f"""
Analyze the following dataset summary and anomaly report.
Generate 3-5 key professional insights.
Focus on data quality, distribution patterns, and potential issues.
Do not make up specific values not present in the summary.
Data Summary:
{overview_text}
Anomaly Report:
{anomalies_text}
Output Format:
- Insight 1
- Insight 2
- Insight 3
...
"""
    return generate_text(insight_prompt, llm)
def get_followup_questions(overview_text, api_token=None):
    """Suggest 3-5 follow-up questions an analyst should ask about the dataset."""
    llm = get_client(api_token)
    question_prompt = f"""
Based on the following dataset summary, suggest 3-5 relevant follow-up questions
that a data analyst should ask to deeper understand the business context or data quality.
Data Summary:
{overview_text}
Output Format:
1. Question 1
2. Question 2
3. Question 3
...
"""
    return generate_text(question_prompt, llm)
def ask_llm(message, history, overview_text, api_token=None):
    """Answer a chat message with the dataset summary injected as system context.

    `history` is a Gradio-style list of [user_msg, bot_msg] pairs; either side
    of a pair may be falsy and is then skipped. Returns the assistant's reply,
    or an apology string if the completion call fails.
    """
    llm = get_client(api_token)
    # System prompt carries the dataset context so every turn can reference it.
    system_context = f"""
You are a helpful Data Analyst Assistant.
You have access to the following dataset summary:
{overview_text}
"""
    conversation = [{"role": "system", "content": system_context}]
    for past_user, past_bot in history:
        if past_user:
            conversation.append({"role": "user", "content": past_user})
        if past_bot:
            conversation.append({"role": "assistant", "content": past_bot})
    conversation.append({"role": "user", "content": message})
    try:
        reply = llm.chat_completion(conversation, max_tokens=512)
        return reply.choices[0].message.content
    except Exception as exc:
        return f"Sorry, I encountered an error: {str(exc)}"
def get_ml_recommendations(overview_text, api_token=None):
    """Ask the LLM, acting as a Lead ML Engineer, for a dataset-specific modeling strategy."""
    llm = get_client(api_token)
    strategy_prompt = f"""
Based on the following **Data Summary**, act as a Lead Machine Learning Engineer and design a specific modeling strategy.
**CRITICAL INSTRUCTIONS:**
1. **Analyze the Columns**: Look at the column names, types, and sample values in the summary.
2. **Identify Target**: Pick the most logical target variable for a business use case. If none is obvious, suggest a Clustering approach.
3. **Specific Recommendations**: Do NOT list generic steps.
- Instead of "Encode categorical variables", say "One-hot encode 'Region' and Label encode 'Risk_Level'".
- Instead of "Handle missing values", say "Impute 'Age' with median because..."
4. **Model Selection**: Recommend models specifically suited for this dataset's size and features (e.g., "XGBoost because 'Income' is skewed...").
Data Summary:
{overview_text}
Output Format:
## π€ ML Modeling Strategy
### π― Objective
- **Target Variable**: [Column Name] (or "None - Unsupervised")
- **Problem Type**: [Regression/Classification/Clustering]
- **Goal**: [Predicting X to optimize Y...]
### π οΈ Feature Engineering & Preprocessing
- **[Specific Column]**: [Specific Transformation]
- **[Specific Column]**: [Specific handling for missing/outliers]
### π Model Selection
1. **[Model Name]**: [Why it fits THIS data]
2. **[Model Name]**: [Why it fits THIS data]
"""
    return generate_text(strategy_prompt, llm)
def analyze_text_content(text_samples, api_token=None):
    """Summarize, sentiment-score, and extract keywords from text samples.

    Only the first 20 samples are sent, to keep the prompt small.
    """
    llm = get_client(api_token)
    # Cap the preview at 20 samples so the prompt stays within context limits.
    preview = "\n".join(text_samples[:20])
    analysis_prompt = f"""
Analyze the following text content.
1. **Summarize** the main topics or themes.
2. **Sentiment Analysis**: Determine if the overall tone is Positive, Negative, or Neutral.
3. **Keywords**: Extract 5-7 key entities or terms.
Text Preview:
{preview}
Output Format:
## π Text Analysis
### π Summary
[Brief summary of content]
### π Sentiment
**[Sentiment Label]**: [Reasoning]
### π Keywords
- [Keyword 1]
- [Keyword 2]
...
"""
    return generate_text(analysis_prompt, llm)