File size: 7,013 Bytes
90e0941
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import os
import time
import pandas as pd
import numpy as np
from io import StringIO
from huggingface_hub import InferenceClient
import google.generativeai as genai

# ======================================================
# 🔧 HELPER FUNCTIONS
# ======================================================

def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512, retries=2):
    """Call Hugging Face text generation with retries and a chat fallback.

    Args:
        client: Object exposing ``text_generation`` and ``chat_completion``
            methods (elsewhere in this file an ``InferenceClient`` is passed).
        prompt: Prompt text sent to the model.
        temperature: Sampling temperature forwarded to the model.
        max_tokens: Maximum number of tokens to generate.
        retries: Number of extra attempts made after a 503-style failure.

    Returns:
        The stripped generated text, or a human-readable warning string when
        the service is still unavailable after the last attempt.

    Raises:
        Exception: Any error that is neither a 503-style outage nor the
            "conversational only" task mismatch is re-raised unchanged.
    """
    for attempt in range(retries + 1):
        try:
            resp = client.text_generation(
                prompt,
                temperature=temperature,
                max_new_tokens=max_tokens,
                return_full_text=False,
            )
            return resp.strip()
        except Exception as e:
            err = str(e)
            if "503" in err or "Service Temporarily Unavailable" in err:
                if attempt < retries:
                    # Back off only when another attempt will actually follow;
                    # sleeping before giving up just delays the caller.
                    time.sleep(2)
                    continue
                return "⚠️ The Hugging Face model is temporarily unavailable. Please try again later."
            if "Supported task: conversational" in err:
                # The model only serves the chat endpoint, so resend the
                # prompt as a single user message.
                chat_resp = client.chat_completion(
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens,
                    temperature=temperature,
                )
                return chat_resp["choices"][0]["message"]["content"].strip()
            # Bare raise keeps the original traceback intact.
            raise
    # Only reachable when ``retries`` is negative (the loop body never runs).
    return "⚠️ Failed after multiple retries."

# ======================================================
# 🧼 DATA CLEANING
# ======================================================

def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Apply a deterministic baseline cleaning when AI-based cleaning fails.

    Steps, in order: drop columns that are entirely missing, normalize column
    names (stripped, spaces to underscores, lower-cased), fill missing values
    (median for numeric/datetime/timedelta columns, most frequent value
    otherwise, ``"Unknown"`` if no mode exists) and drop duplicate rows.

    Args:
        df: Input frame. It is not modified.

    Returns:
        A new, cleaned DataFrame. The index is not reset after duplicates
        are removed.
    """
    cleaned = df.dropna(axis=1, how="all").copy()
    # str() guards against non-string labels such as integer column names.
    cleaned.columns = [str(c).strip().replace(" ", "_").lower() for c in cleaned.columns]

    for col in cleaned.columns:
        series = cleaned[col]
        if not series.isna().any():
            continue

        has_median = (
            pd.api.types.is_numeric_dtype(series)
            or pd.api.types.is_datetime64_any_dtype(series)
            or pd.api.types.is_timedelta64_dtype(series)
        )
        if has_median:
            fill_value = series.median()
        else:
            # Object, string and categorical columns have no median.
            modes = series.mode()
            fill_value = modes.iloc[0] if not modes.empty else "Unknown"

        # Assign back instead of chained ``inplace=True``: the chained form is
        # deprecated and silently does nothing under pandas Copy-on-Write.
        cleaned[col] = series.fillna(fill_value)

    return cleaned.drop_duplicates()


def ai_clean_dataset(
    df: pd.DataFrame,
    cleaner_client: InferenceClient,
    max_rows: int = 50,
) -> tuple[pd.DataFrame, str]:
    """Clean a dataset by asking a Hugging Face model to rewrite it as CSV.

    Args:
        df: Dataset to clean.
        cleaner_client: Client passed to ``safe_hf_generate``.
        max_rows: Largest row count that is sent to the model; bigger frames
            are returned unchanged.

    Returns:
        A ``(dataframe, status_message)`` pair. The frame is the model's
        cleaned output on success, the unchanged input when the dataset is
        too large, or the result of ``fallback_clean`` when the model call or
        the parsing of its answer fails.
    """
    if len(df) > max_rows:
        return df, f"⚠️ AI cleaning skipped: dataset has more than {max_rows} rows."

    csv_text = df.to_csv(index=False)
    prompt = f"""
You are a professional data cleaning assistant.
Clean and standardize the dataset below dynamically:
1. Handle missing values
2. Fix column name inconsistencies
3. Convert data types (dates, numbers, categories)
4. Remove irrelevant or duplicate rows
Return ONLY a valid CSV text (no markdown, no explanations).

Dataset:
{csv_text}
"""
    try:
        cleaned_str = safe_hf_generate(cleaner_client, prompt, temperature=0.1, max_tokens=4096)
        # Models often wrap the answer in a Markdown fence despite the prompt.
        cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").strip()
        cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
        if cleaned_df.empty and not df.empty:
            # safe_hf_generate reports outages as a plain warning sentence,
            # which parses as a header-only CSV. Treat that as a failure
            # instead of returning an empty frame as "success".
            raise ValueError("model response did not contain any data rows")
        cleaned_df.columns = [str(c).strip().replace(" ", "_").lower() for c in cleaned_df.columns]
        return cleaned_df, "✅ AI cleaning completed successfully."
    except Exception as e:
        # Best effort by design: any model or parsing failure falls back to
        # the deterministic cleaner and the reason is reported to the caller.
        return fallback_clean(df), f"⚠️ AI cleaning failed, used fallback cleaning instead: {e}"

# ======================================================
# 📊 DATA SUMMARIZATION
# ======================================================

def summarize_for_analysis(df: pd.DataFrame, sample_rows: int = 10) -> str:
    """Build a compact plain-text description of *df* for use in a prompt.

    The text holds one header line with the row and column counts, one line
    per column (mean, median and non-null count for numeric columns; the
    three most frequent values and non-null count otherwise), and finally the
    first ``sample_rows`` rows rendered as CSV.

    Args:
        df: Dataset to describe.
        sample_rows: Number of leading rows included as a CSV sample.

    Returns:
        The summary as a single newline-joined string.
    """

    def _nan_if_missing(value):
        # Nullable dtypes yield pd.NA for all-missing columns, and pd.NA
        # cannot be rendered with a float format spec.
        return np.nan if pd.isna(value) else value

    summary = [f"Rows: {len(df)}, Columns: {len(df.columns)}"]

    for col in df.columns:
        series = df[col]
        non_null = int(series.notnull().sum())
        if pd.api.types.is_numeric_dtype(series):
            # Compute the mean directly: describe() is far more work and has
            # no "mean" entry for boolean columns.
            mean = _nan_if_missing(series.mean())
            median = _nan_if_missing(series.median())
            summary.append(
                f"- {col}: mean={mean:.2f}, median={median:.2f}, non_null={non_null}"
            )
        else:
            top = series.value_counts().head(3).to_dict()
            summary.append(f"- {col}: top_values={top}, non_null={non_null}")

    summary.append("--- Sample Data ---")
    summary.append(df.head(sample_rows).to_csv(index=False))
    return "\n".join(summary)

# ======================================================
# 🧠 ANALYSIS LOGIC
# ======================================================

def _gemini_generate(prompt: str, api_key: str, temperature: float, max_tokens: int) -> str:
    """Send *prompt* to Gemini 2.5 Flash and return the response text."""
    genai.configure(api_key=api_key)
    response = genai.GenerativeModel("gemini-2.5-flash").generate_content(
        prompt,
        generation_config={
            "temperature": temperature,
            "max_output_tokens": max_tokens,
        },
    )
    return response.text if hasattr(response, "text") else "No valid text response."


def query_analysis_model(
    df: pd.DataFrame,
    user_query: str,
    dataset_name: str,
    analyst_model: str,
    hf_client: InferenceClient = None,
    temperature: float = 0.3,
    max_tokens: int = 1024,
    gemini_api_key: str = None
) -> str:
    """Ask the selected model (Hugging Face or Gemini) to analyze the dataset.

    Args:
        df: Dataset to analyze; only its textual summary is sent to the model.
        user_query: The user's question about the data.
        dataset_name: Name of the dataset, embedded in the prompt.
        analyst_model: Model selector. The exact value
            ``"Gemini 2.5 Flash (Google)"`` routes to Gemini; anything else
            uses ``hf_client``.
        hf_client: Client used for Hugging Face models.
        temperature: Sampling temperature, applied to every model call
            including the Gemini fallbacks.
        max_tokens: Output token limit, applied to every model call including
            the Gemini fallbacks.
        gemini_api_key: Gemini API key. Required when Gemini is selected and
            enables the Gemini fallback when the Hugging Face call is
            unavailable.

    Returns:
        The model's answer, or a warning string describing why no answer
        could be produced. Model errors are reported in the returned string
        rather than raised.
    """
    prompt_summary = summarize_for_analysis(df)
    prompt = f"""
You are a professional data analyst.
Analyze the dataset '{dataset_name}' and answer the user's question.

--- DATA SUMMARY ---
{prompt_summary}

--- USER QUESTION ---
{user_query}

Respond with:
1. Key insights and patterns
2. Quantitative findings
3. Notable relationships or anomalies
4. Data-driven recommendations
"""

    # Tracks whether Gemini has already been called, so a Gemini failure is
    # never "recovered" by calling Gemini a second time.
    gemini_tried = False
    try:
        if analyst_model == "Gemini 2.5 Flash (Google)":
            if not gemini_api_key:
                return "⚠️ Gemini API key missing. Cannot use Gemini."
            gemini_tried = True
            return _gemini_generate(prompt, gemini_api_key, temperature, max_tokens)

        # Otherwise, use Hugging Face model
        if hf_client is None:
            return "⚠️ Analysis failed: no Hugging Face client was provided."
        result = safe_hf_generate(hf_client, prompt, temperature=temperature, max_tokens=max_tokens)

        # safe_hf_generate signals an outage with a warning sentence instead
        # of raising, so detect it by content and fall back to Gemini.
        if "temporarily unavailable" in result.lower() and gemini_api_key:
            gemini_tried = True
            alt = _gemini_generate(prompt, gemini_api_key, temperature, max_tokens)
            return f"🔄 Fallback to Gemini:\n\n{alt}"
        return result

    except Exception as e:
        if "503" in str(e) and gemini_api_key and not gemini_tried:
            try:
                alt = _gemini_generate(prompt, gemini_api_key, temperature, max_tokens)
            except Exception as fallback_err:
                # Keep the "always returns a string" contract even when the
                # fallback provider fails as well.
                return f"⚠️ Analysis failed: {e} (Gemini fallback also failed: {fallback_err})"
            return f"🔄 Fallback to Gemini due to 503 error:\n\n{alt}"
        return f"⚠️ Analysis failed: {e}"

# ======================================================
# ๐Ÿ” END OF MODULE
# ======================================================