from huggingface_hub import InferenceClient
import os


def get_client(api_token=None):
    """Build an InferenceClient, preferring a user-supplied token.

    If the user provides a non-blank token, use it. Otherwise fall back to
    the HF_TOKEN environment variable (if set) or None, which attempts
    free/anonymous access where allowed, or fails.

    Note: Spaces usually have a default robust token if set in secrets, but
    here we want to prioritize user input if given.
    """
    token = api_token if api_token and api_token.strip() else os.getenv("HF_TOKEN")
    # Using Qwen2.5-72B-Instruct as it is one of the most capable open models
    # available on the free tier.
    return InferenceClient("Qwen/Qwen2.5-72B-Instruct", token=token, timeout=30)


def generate_text(prompt, client, max_new_tokens=512):
    """Run a single-turn chat completion with a fixed data-analyst persona.

    Returns the model's reply text. On failure, returns a user-facing
    fallback string instead of raising: a dedicated help message for
    authentication (401) errors, or a generic notice embedding the error.
    """
    try:
        messages = [
            {"role": "system", "content": "You are a senior data analyst. You provide professional, concise, and accurate insights based on data summaries. You do NOT hallucinate numbers."},
            {"role": "user", "content": prompt},
        ]
        response = client.chat_completion(messages, max_tokens=max_new_tokens)
        return response.choices[0].message.content
    except Exception as e:
        error_msg = str(e)
        # Detect auth failures heuristically from the error text so we can
        # show actionable token instructions instead of a raw traceback.
        if "401" in error_msg or "api_key" in error_msg.lower():
            return (
                "⚠️ **AI Insights Unavailable**\n\n"
                "Authentication failed. Please provide a valid Hugging Face Token in the sidebar.\n"
                "1. Get a token from [Hugging Face Settings](https://huggingface.co/settings/tokens).\n"
                "2. Paste it into the 'Hugging Face Token' box above."
            )
        return f"Based on the analysis, please review the charts and data profile. (AI Insights unavailable: {error_msg})"


def get_insights(overview_text, anomalies_text, api_token=None):
    """Generate 3-5 bullet-point insights from a dataset summary and anomaly report."""
    client = get_client(api_token)
    prompt = f"""
Analyze the following dataset summary and anomaly report.
Generate 3-5 key professional insights. Focus on data quality, distribution patterns, and potential issues.
Do not make up specific values not present in the summary.

Data Summary:
{overview_text}

Anomaly Report:
{anomalies_text}

Output Format:
- Insight 1
- Insight 2
- Insight 3
...
"""
    return generate_text(prompt, client)


def get_followup_questions(overview_text, api_token=None):
    """Suggest 3-5 follow-up questions an analyst should ask about the dataset."""
    client = get_client(api_token)
    prompt = f"""
Based on the following dataset summary, suggest 3-5 relevant follow-up questions
that a data analyst should ask to deeper understand the business context or data quality.

Data Summary:
{overview_text}

Output Format:
1. Question 1
2. Question 2
3. Question 3
...
"""
    return generate_text(prompt, client)


def ask_llm(message, history, overview_text, api_token=None):
    """Answer a chat message with the dataset summary as system context.

    The dataset summary is embedded in the system prompt, and the prior
    conversation turns are replayed so the model has chat memory.

    NOTE(review): assumes `history` is a list of [user_msg, bot_msg] pairs
    (classic Gradio tuple format) — confirm against the caller; the newer
    "messages" dict format would not unpack here.
    """
    client = get_client(api_token)
    # Construct a system prompt that includes the dataset context.
    context = f"""
You are a helpful Data Analyst Assistant.
You have access to the following dataset summary:
{overview_text}
"""
    messages = [{"role": "system", "content": context}]
    # Replay prior turns; skip empty slots (e.g. a pending bot reply).
    for user_msg, bot_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    try:
        response = client.chat_completion(messages, max_tokens=512)
        return response.choices[0].message.content
    except Exception as e:
        return f"Sorry, I encountered an error: {str(e)}"


def get_ml_recommendations(overview_text, api_token=None):
    """Produce a dataset-specific ML modeling strategy (target, preprocessing, models)."""
    client = get_client(api_token)
    prompt = f"""
Based on the following **Data Summary**, act as a Lead Machine Learning Engineer and design a specific modeling strategy.

**CRITICAL INSTRUCTIONS:**
1. **Analyze the Columns**: Look at the column names, types, and sample values in the summary.
2. **Identify Target**: Pick the most logical target variable for a business use case. If none is obvious, suggest a Clustering approach.
3. **Specific Recommendations**: Do NOT list generic steps.
   - Instead of "Encode categorical variables", say "One-hot encode 'Region' and Label encode 'Risk_Level'".
   - Instead of "Handle missing values", say "Impute 'Age' with median because..."
4. **Model Selection**: Recommend models specifically suited for this dataset's size and features (e.g., "XGBoost because 'Income' is skewed...").

Data Summary:
{overview_text}

Output Format:

## 🤖 ML Modeling Strategy

### 🎯 Objective
- **Target Variable**: [Column Name] (or "None - Unsupervised")
- **Problem Type**: [Regression/Classification/Clustering]
- **Goal**: [Predicting X to optimize Y...]

### 🛠️ Feature Engineering & Preprocessing
- **[Specific Column]**: [Specific Transformation]
- **[Specific Column]**: [Specific handling for missing/outliers]

### 🚀 Model Selection
1. **[Model Name]**: [Why it fits THIS data]
2. **[Model Name]**: [Why it fits THIS data]
"""
    return generate_text(prompt, client)


def analyze_text_content(text_samples, api_token=None):
    """Summarize, sentiment-score, and keyword-extract a sample of text lines."""
    client = get_client(api_token)
    # Limit samples to keep the prompt within a reasonable context size.
    text_preview = "\n".join(text_samples[:20])  # First 20 lines
    prompt = f"""
Analyze the following text content.
1. **Summarize** the main topics or themes.
2. **Sentiment Analysis**: Determine if the overall tone is Positive, Negative, or Neutral.
3. **Keywords**: Extract 5-7 key entities or terms.

Text Preview:
{text_preview}

Output Format:

## 📝 Text Analysis

### 📌 Summary
[Brief summary of content]

### 🎭 Sentiment
**[Sentiment Label]**: [Reasoning]

### 🔑 Keywords
- [Keyword 1]
- [Keyword 2]
...
"""
    return generate_text(prompt, client)