File size: 6,226 Bytes
bb9980b
 
 
8a352c5
 
 
 
 
 
9333089
 
bb9980b
8a352c5
bb9980b
 
 
 
 
 
 
 
2f1bb6b
 
 
 
8a352c5
 
 
2f1bb6b
 
bb9980b
8a352c5
 
bb9980b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a352c5
bb9980b
8a352c5
 
bb9980b
 
 
 
 
 
 
 
 
 
 
 
 
8a352c5
e46965b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5679c31
 
 
 
841e95d
 
 
 
 
 
 
 
 
 
5679c31
 
 
 
841e95d
5679c31
841e95d
 
 
 
5679c31
841e95d
 
 
5679c31
841e95d
 
 
5679c31
 
b969e68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
from huggingface_hub import InferenceClient
import os

def get_client(api_token=None):
    """Build an InferenceClient for the chat model.

    Args:
        api_token: Optional Hugging Face token supplied by the user. A
            non-blank value takes priority over the environment; blank or
            whitespace-only input falls back to the HF_TOKEN environment
            variable (which may be unset, leaving the client anonymous
            if the endpoint allows that, or failing otherwise).

    Returns:
        InferenceClient bound to Qwen/Qwen2.5-72B-Instruct with a 30s timeout.
    """
    # Strip the user token before use: pasted tokens often carry stray
    # whitespace, and the raw (unstripped) value would fail authentication.
    token = api_token.strip() if api_token and api_token.strip() else os.getenv("HF_TOKEN")
    # Using Qwen2.5-72B-Instruct as it is one of the most capable open models available on the free tier
    return InferenceClient("Qwen/Qwen2.5-72B-Instruct", token=token, timeout=30)

def generate_text(prompt, client, max_new_tokens=512):
    """Send *prompt* to the chat model and return the reply text.

    Never raises: authentication failures (401 / api_key errors) return
    step-by-step token instructions, and any other failure returns a
    generic notice that embeds the underlying error message.
    """
    system_msg = (
        "You are a senior data analyst. You provide professional, concise, "
        "and accurate insights based on data summaries. You do NOT hallucinate numbers."
    )
    try:
        response = client.chat_completion(
            [
                {"role": "system", "content": system_msg},
                {"role": "user", "content": prompt},
            ],
            max_tokens=max_new_tokens,
        )
        return response.choices[0].message.content
    except Exception as exc:
        reason = str(exc)
        # Heuristic auth-failure detection from the error text.
        if "401" in reason or "api_key" in reason.lower():
            return (
                "⚠️ **AI Insights Unavailable**\n\n"
                "Authentication failed. Please provide a valid Hugging Face Token in the sidebar.\n"
                "1. Get a token from [Hugging Face Settings](https://huggingface.co/settings/tokens).\n"
                "2. Paste it into the 'Hugging Face Token' box above."
            )
        return f"Based on the analysis, please review the charts and data profile. (AI Insights unavailable: {reason})"

def get_insights(overview_text, anomalies_text, api_token=None):
    """Generate 3-5 professional insights from a dataset summary and anomaly report.

    Delegates generation to generate_text, so on failure the caller receives
    that helper's human-readable fallback text rather than an exception.
    """
    llm = get_client(api_token)
    prompt = f"""
    Analyze the following dataset summary and anomaly report.
    Generate 3-5 key professional insights. 
    Focus on data quality, distribution patterns, and potential issues.
    Do not make up specific values not present in the summary.
    
    Data Summary:
    {overview_text}
    
    Anomaly Report:
    {anomalies_text}
    
    Output Format:
    - Insight 1
    - Insight 2
    - Insight 3
    ...
    """
    return generate_text(prompt, llm)

def get_followup_questions(overview_text, api_token=None):
    """Suggest 3-5 follow-up questions an analyst should ask about the dataset.

    Args:
        overview_text: Plain-text dataset summary used to ground the questions.
        api_token: Optional Hugging Face token forwarded to get_client.

    Returns:
        Model output as a numbered list of questions, or the fallback
        message from generate_text on error.
    """
    client = get_client(api_token)
    # Prompt grammar fixed ("ask to deeper understand the business context"
    # -> "understand the business context ... more deeply") so the
    # instruction reads cleanly to the model.
    prompt = f"""
    Based on the following dataset summary, suggest 3-5 relevant follow-up questions
    that a data analyst should ask to understand the business context or data quality more deeply.
    
    Data Summary:
    {overview_text}
    
    Output Format:
    1. Question 1
    2. Question 2
    3. Question 3
    ...
    """
    return generate_text(prompt, client)

def ask_llm(message, history, overview_text, api_token=None):
    """Answer a chat question about the dataset, replaying prior turns.

    *history* arrives as a list of [user_msg, bot_msg] pairs (Gradio-style
    tuples format — TODO confirm against the UI caller); empty entries on
    either side of a pair are skipped. Returns the model's reply text, or
    an apologetic error string if the request fails.
    """
    client = get_client(api_token)

    # System prompt anchors every answer to the dataset summary.
    context = f"""
    You are a helpful Data Analyst Assistant.
    You have access to the following dataset summary:
    {overview_text}
    """

    convo = [{"role": "system", "content": context}]

    # Replay the prior conversation so the model keeps chat state.
    for asked, answered in history:
        if asked:
            convo.append({"role": "user", "content": asked})
        if answered:
            convo.append({"role": "assistant", "content": answered})

    convo.append({"role": "user", "content": message})

    try:
        reply = client.chat_completion(convo, max_tokens=512)
        return reply.choices[0].message.content
    except Exception as e:
        return f"Sorry, I encountered an error: {str(e)}"

def get_ml_recommendations(overview_text, api_token=None):
    """Ask the model to design a dataset-specific ML modeling strategy.

    Args:
        overview_text: Plain-text dataset summary (columns, types, samples)
            interpolated into the prompt.
        api_token: Optional Hugging Face token forwarded to get_client.

    Returns:
        Markdown-formatted strategy (target, preprocessing, model picks),
        or the fallback message from generate_text on error.
    """
    client = get_client(api_token)
    # The prompt forces column-specific recommendations rather than generic
    # ML checklists, and pins a markdown output skeleton for the UI.
    prompt = f"""
    Based on the following **Data Summary**, act as a Lead Machine Learning Engineer and design a specific modeling strategy.
    
    **CRITICAL INSTRUCTIONS:**
    1. **Analyze the Columns**: Look at the column names, types, and sample values in the summary.
    2. **Identify Target**: Pick the most logical target variable for a business use case. If none is obvious, suggest a Clustering approach.
    3. **Specific Recommendations**: Do NOT list generic steps. 
       - Instead of "Encode categorical variables", say "One-hot encode 'Region' and Label encode 'Risk_Level'".
       - Instead of "Handle missing values", say "Impute 'Age' with median because..."
    4. **Model Selection**: Recommend models specifically suited for this dataset's size and features (e.g., "XGBoost because 'Income' is skewed...").

    Data Summary:
    {overview_text}
    
    Output Format:
    ## πŸ€– ML Modeling Strategy
    
    ### 🎯 Objective
    - **Target Variable**: [Column Name] (or "None - Unsupervised")
    - **Problem Type**: [Regression/Classification/Clustering]
    - **Goal**: [Predicting X to optimize Y...]
    
    ### πŸ› οΈ Feature Engineering & Preprocessing
    - **[Specific Column]**: [Specific Transformation]
    - **[Specific Column]**: [Specific handling for missing/outliers]
    
    ### πŸš€ Model Selection
    1. **[Model Name]**: [Why it fits THIS data]
    2. **[Model Name]**: [Why it fits THIS data]
    """
    return generate_text(prompt, client)

def analyze_text_content(text_samples, api_token=None, max_samples=20):
    """Summarize, sentiment-score, and keyword-extract a sample of text rows.

    Args:
        text_samples: Sequence of strings (e.g. values from a text column).
        api_token: Optional Hugging Face token forwarded to get_client.
        max_samples: Maximum number of samples included in the prompt
            (default 20, matching the previous hard-coded limit) so large
            columns don't blow the model's context window.

    Returns:
        Markdown-formatted analysis from the model, or the fallback
        message from generate_text on error.
    """
    client = get_client(api_token)

    # Limit samples — generalized from a hard-coded 20 to a parameter.
    text_preview = "\n".join(text_samples[:max_samples])

    prompt = f"""
    Analyze the following text content.
    
    1. **Summarize** the main topics or themes.
    2. **Sentiment Analysis**: Determine if the overall tone is Positive, Negative, or Neutral.
    3. **Keywords**: Extract 5-7 key entities or terms.
    
    Text Preview:
    {text_preview}
    
    Output Format:
    ## πŸ“ Text Analysis
    
    ### πŸ“Œ Summary
    [Brief summary of content]
    
    ### 🎭 Sentiment
    **[Sentiment Label]**: [Reasoning]
    
    ### πŸ”‘ Keywords
    - [Keyword 1]
    - [Keyword 2]
    ...
    """
    return generate_text(prompt, client)