File size: 6,226 Bytes
bb9980b
 
 
8a352c5
 
 
 
 
 
9333089
 
bb9980b
8a352c5
bb9980b
 
 
 
 
 
 
 
2f1bb6b
 
 
 
8a352c5
 
 
2f1bb6b
 
bb9980b
8a352c5
 
bb9980b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a352c5
bb9980b
8a352c5
 
bb9980b
 
 
 
 
 
 
 
 
 
 
 
 
8a352c5
e46965b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5679c31
 
 
 
841e95d
 
 
 
 
 
 
 
 
 
5679c31
 
 
 
841e95d
5679c31
841e95d
 
 
 
5679c31
841e95d
 
 
5679c31
841e95d
 
 
5679c31
 
b969e68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
from huggingface_hub import InferenceClient
import os

def get_client(api_token=None):
    """Build an InferenceClient for the chat model.

    Args:
        api_token: Optional Hugging Face token supplied by the user. A
            non-blank value takes priority over the environment; blank or
            whitespace-only input falls back to the HF_TOKEN environment
            variable (which may be unset, leaving the client anonymous
            if the endpoint allows that, or failing otherwise).

    Returns:
        InferenceClient bound to Qwen/Qwen2.5-72B-Instruct with a 30s timeout.
    """
    # Strip the user token before use: pasted tokens often carry stray
    # whitespace, and the raw (unstripped) value would fail authentication.
    token = api_token.strip() if api_token and api_token.strip() else os.getenv("HF_TOKEN")
    # Using Qwen2.5-72B-Instruct as it is one of the most capable open models available on the free tier
    return InferenceClient("Qwen/Qwen2.5-72B-Instruct", token=token, timeout=30)

def generate_text(prompt, client, max_new_tokens=512):
    """Send *prompt* to the chat model and return the reply text.

    Never raises: authentication failures (401 / api_key errors) return
    step-by-step token instructions, and any other failure returns a
    generic notice that embeds the underlying error message.
    """
    system_msg = (
        "You are a senior data analyst. You provide professional, concise, "
        "and accurate insights based on data summaries. You do NOT hallucinate numbers."
    )
    try:
        response = client.chat_completion(
            [
                {"role": "system", "content": system_msg},
                {"role": "user", "content": prompt},
            ],
            max_tokens=max_new_tokens,
        )
        return response.choices[0].message.content
    except Exception as exc:
        reason = str(exc)
        # Heuristic auth-failure detection from the error text.
        if "401" in reason or "api_key" in reason.lower():
            return (
                "⚠️ **AI Insights Unavailable**\n\n"
                "Authentication failed. Please provide a valid Hugging Face Token in the sidebar.\n"
                "1. Get a token from [Hugging Face Settings](https://huggingface.co/settings/tokens).\n"
                "2. Paste it into the 'Hugging Face Token' box above."
            )
        return f"Based on the analysis, please review the charts and data profile. (AI Insights unavailable: {reason})"

def get_insights(overview_text, anomalies_text, api_token=None):
    """Generate 3-5 professional insights from a dataset summary and anomaly report.

    Delegates generation to generate_text, so on failure the caller receives
    that helper's human-readable fallback text rather than an exception.
    """
    llm = get_client(api_token)
    prompt = f"""
    Analyze the following dataset summary and anomaly report.
    Generate 3-5 key professional insights. 
    Focus on data quality, distribution patterns, and potential issues.
    Do not make up specific values not present in the summary.
    
    Data Summary:
    {overview_text}
    
    Anomaly Report:
    {anomalies_text}
    
    Output Format:
    - Insight 1
    - Insight 2
    - Insight 3
    ...
    """
    return generate_text(prompt, llm)

def get_followup_questions(overview_text, api_token=None):
    """Suggest 3-5 follow-up questions an analyst should ask about the dataset.

    Args:
        overview_text: Plain-text dataset summary used to ground the questions.
        api_token: Optional Hugging Face token forwarded to get_client.

    Returns:
        Model output as a numbered list of questions, or the fallback
        message from generate_text on error.
    """
    client = get_client(api_token)
    # Prompt grammar fixed ("ask to deeper understand the business context"
    # -> "understand the business context ... more deeply") so the
    # instruction reads cleanly to the model.
    prompt = f"""
    Based on the following dataset summary, suggest 3-5 relevant follow-up questions
    that a data analyst should ask to understand the business context or data quality more deeply.
    
    Data Summary:
    {overview_text}
    
    Output Format:
    1. Question 1
    2. Question 2
    3. Question 3
    ...
    """
    return generate_text(prompt, client)

def ask_llm(message, history, overview_text, api_token=None):
    """Answer a chat question about the dataset, replaying prior turns.

    *history* arrives as a list of [user_msg, bot_msg] pairs (Gradio-style
    tuples format — TODO confirm against the UI caller); empty entries on
    either side of a pair are skipped. Returns the model's reply text, or
    an apologetic error string if the request fails.
    """
    client = get_client(api_token)

    # System prompt anchors every answer to the dataset summary.
    context = f"""
    You are a helpful Data Analyst Assistant.
    You have access to the following dataset summary:
    {overview_text}
    """

    convo = [{"role": "system", "content": context}]

    # Replay the prior conversation so the model keeps chat state.
    for asked, answered in history:
        if asked:
            convo.append({"role": "user", "content": asked})
        if answered:
            convo.append({"role": "assistant", "content": answered})

    convo.append({"role": "user", "content": message})

    try:
        reply = client.chat_completion(convo, max_tokens=512)
        return reply.choices[0].message.content
    except Exception as e:
        return f"Sorry, I encountered an error: {str(e)}"

def get_ml_recommendations(overview_text, api_token=None):
    """Ask the model to design a dataset-specific ML modeling strategy.

    Args:
        overview_text: Plain-text dataset summary (columns, types, samples)
            interpolated into the prompt.
        api_token: Optional Hugging Face token forwarded to get_client.

    Returns:
        Markdown-formatted strategy (target, preprocessing, model picks),
        or the fallback message from generate_text on error.
    """
    client = get_client(api_token)
    # The prompt forces column-specific recommendations rather than generic
    # ML checklists, and pins a markdown output skeleton for the UI.
    prompt = f"""
    Based on the following **Data Summary**, act as a Lead Machine Learning Engineer and design a specific modeling strategy.
    
    **CRITICAL INSTRUCTIONS:**
    1. **Analyze the Columns**: Look at the column names, types, and sample values in the summary.
    2. **Identify Target**: Pick the most logical target variable for a business use case. If none is obvious, suggest a Clustering approach.
    3. **Specific Recommendations**: Do NOT list generic steps. 
       - Instead of "Encode categorical variables", say "One-hot encode 'Region' and Label encode 'Risk_Level'".
       - Instead of "Handle missing values", say "Impute 'Age' with median because..."
    4. **Model Selection**: Recommend models specifically suited for this dataset's size and features (e.g., "XGBoost because 'Income' is skewed...").

    Data Summary:
    {overview_text}
    
    Output Format:
    ## πŸ€– ML Modeling Strategy
    
    ### 🎯 Objective
    - **Target Variable**: [Column Name] (or "None - Unsupervised")
    - **Problem Type**: [Regression/Classification/Clustering]
    - **Goal**: [Predicting X to optimize Y...]
    
    ### πŸ› οΈ Feature Engineering & Preprocessing
    - **[Specific Column]**: [Specific Transformation]
    - **[Specific Column]**: [Specific handling for missing/outliers]
    
    ### πŸš€ Model Selection
    1. **[Model Name]**: [Why it fits THIS data]
    2. **[Model Name]**: [Why it fits THIS data]
    """
    return generate_text(prompt, client)

def analyze_text_content(text_samples, api_token=None, max_samples=20):
    """Summarize, sentiment-score, and keyword-extract a sample of text rows.

    Args:
        text_samples: Sequence of strings (e.g. values from a text column).
        api_token: Optional Hugging Face token forwarded to get_client.
        max_samples: Maximum number of samples included in the prompt
            (default 20, matching the previous hard-coded limit) so large
            columns don't blow the model's context window.

    Returns:
        Markdown-formatted analysis from the model, or the fallback
        message from generate_text on error.
    """
    client = get_client(api_token)

    # Limit samples — generalized from a hard-coded 20 to a parameter.
    text_preview = "\n".join(text_samples[:max_samples])

    prompt = f"""
    Analyze the following text content.
    
    1. **Summarize** the main topics or themes.
    2. **Sentiment Analysis**: Determine if the overall tone is Positive, Negative, or Neutral.
    3. **Keywords**: Extract 5-7 key entities or terms.
    
    Text Preview:
    {text_preview}
    
    Output Format:
    ## πŸ“ Text Analysis
    
    ### πŸ“Œ Summary
    [Brief summary of content]
    
    ### 🎭 Sentiment
    **[Sentiment Label]**: [Reasoning]
    
    ### πŸ”‘ Keywords
    - [Keyword 1]
    - [Keyword 2]
    ...
    """
    return generate_text(prompt, client)