Spaces:
Sleeping
Sleeping
File size: 7,013 Bytes
90e0941 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
import os
import time
import pandas as pd
import numpy as np
from io import StringIO
from huggingface_hub import InferenceClient
import google.generativeai as genai
# ======================================================
# 🔧 HELPER FUNCTIONS
# ======================================================
def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512, retries=2, retry_delay=2.0):
    """Call Hugging Face text generation with retries and a graceful fallback.

    Args:
        client: Object exposing ``text_generation`` and ``chat_completion``
            (the file passes a ``huggingface_hub.InferenceClient``).
        prompt: Prompt text sent to the model.
        temperature: Sampling temperature forwarded to the model.
        max_tokens: Upper bound on the number of generated tokens.
        retries: Number of additional attempts after the first one when the
            service answers with a 503.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The stripped model output, or a human-readable warning string when
        the service stays unavailable.

    Raises:
        Exception: Any error that is neither a 503 nor the
            "conversational only" task mismatch is re-raised unchanged.
    """
    for attempt in range(retries + 1):
        try:
            resp = client.text_generation(
                prompt,
                temperature=temperature,
                max_new_tokens=max_tokens,
                return_full_text=False,
            )
            return resp.strip()
        except Exception as e:
            err = str(e)
            if "503" in err or "Service Temporarily Unavailable" in err:
                if attempt < retries:
                    # Only wait when another attempt will actually follow;
                    # sleeping before giving up just delays the caller.
                    time.sleep(retry_delay)
                    continue
                return "⚠️ The Hugging Face model is temporarily unavailable. Please try again later."
            if "Supported task: conversational" in err:
                # The model only serves the chat endpoint: resend the prompt
                # as a single user message.
                chat_resp = client.chat_completion(
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens,
                    temperature=temperature,
                )
                content = chat_resp["choices"][0]["message"]["content"]
                # A chat message may carry no text; do not crash on None.
                return (content or "").strip()
            # Bare raise keeps the original traceback intact.
            raise
    # Only reachable when ``retries`` is negative and the loop never runs.
    return "⚠️ Failed after multiple retries."
# ======================================================
# 🧼 DATA CLEANING
# ======================================================
def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Perform a basic rule-based cleaning pass when AI-based cleaning fails.

    Steps, in order: drop columns that are entirely missing, normalise the
    column names (strip, spaces to underscores, lower-case), fill remaining
    gaps (most frequent value for text-like columns, median otherwise) and
    drop duplicate rows.

    Args:
        df: Input frame; it is copied and never modified.

    Returns:
        A new, cleaned DataFrame.
    """
    cleaned = df.copy().dropna(axis=1, how="all")
    # str() so integer or other non-string labels do not crash on .strip().
    cleaned.columns = [str(c).strip().replace(" ", "_").lower() for c in cleaned.columns]

    for col in cleaned.columns:
        series = cleaned[col]
        if not series.isna().any():
            continue  # nothing to fill; skip the mode/median computation

        # Text-like columns cannot take a median, whatever their exact dtype.
        is_text_like = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
            or isinstance(series.dtype, pd.CategoricalDtype)
        )
        if is_text_like:
            modes = series.mode()
            fill_value = modes[0] if not modes.empty else "Unknown"
        else:
            fill_value = series.median()

        # Assign the result back instead of ``fillna(inplace=True)`` on the
        # selected column: the chained in-place form is deprecated and does
        # not update the frame when pandas Copy-on-Write is active.
        cleaned[col] = series.fillna(fill_value)

    return cleaned.drop_duplicates()
def ai_clean_dataset(
    df: pd.DataFrame,
    cleaner_client: InferenceClient,
    max_rows: int = 50,
) -> "tuple[pd.DataFrame, str]":
    """Clean a dataset with the chosen Hugging Face model, with a fallback.

    The frame is serialised to CSV, sent to the model with cleaning
    instructions, and the reply is parsed back into a DataFrame. Any failure
    (request error, unparsable reply, reply without data rows) falls back to
    ``fallback_clean``.

    Args:
        df: Dataset to clean.
        cleaner_client: Client handed to ``safe_hf_generate``.
        max_rows: Frames with more rows than this are returned untouched.

    Returns:
        A ``(dataframe, status_message)`` pair.
    """
    if len(df) > max_rows:
        return df, f"⚠️ AI cleaning skipped: dataset has more than {max_rows} rows."

    csv_text = df.to_csv(index=False)
    prompt = (
        "\n"
        "You are a professional data cleaning assistant.\n"
        "Clean and standardize the dataset below dynamically:\n"
        "1. Handle missing values\n"
        "2. Fix column name inconsistencies\n"
        "3. Convert data types (dates, numbers, categories)\n"
        "4. Remove irrelevant or duplicate rows\n"
        "Return ONLY a valid CSV text (no markdown, no explanations).\n"
        "Dataset:\n"
        f"{csv_text}\n"
    )

    try:
        cleaned_str = safe_hf_generate(cleaner_client, prompt, temperature=0.1, max_tokens=4096)
        # Models often wrap the answer in a markdown fence despite the prompt.
        cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").strip()
        cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
        if cleaned_df.empty and not df.empty:
            # A one-line reply (e.g. the generator's "temporarily unavailable"
            # notice) parses as a header with zero rows; treat it as a failure
            # rather than reporting an empty frame as a successful clean.
            raise ValueError("model response contained no parsable CSV rows")
        cleaned_df.columns = [
            str(c).strip().replace(" ", "_").lower() for c in cleaned_df.columns
        ]
        return cleaned_df, "✅ AI cleaning completed successfully."
    except Exception as e:
        return fallback_clean(df), f"⚠️ AI cleaning failed, used fallback cleaning instead: {str(e)}"
# ======================================================
# 📊 DATA SUMMARIZATION
# ======================================================
def summarize_for_analysis(df: pd.DataFrame, sample_rows: int = 10) -> str:
    """Build a compact plain-text description of ``df`` for use in prompts.

    The text holds a row/column count, one line per column (mean and median
    for numeric columns, the three most frequent values otherwise, each with
    its non-null count) and a CSV dump of the first ``sample_rows`` rows.
    """

    def _column_line(name) -> str:
        # One bullet line describing a single column.
        series = df[name]
        present = int(series.notnull().sum())
        if not pd.api.types.is_numeric_dtype(series):
            frequent = series.value_counts().head(3).to_dict()
            return f"- {name}: top_values={frequent}, non_null={present}"
        stats = series.describe().to_dict()
        return (
            f"- {name}: mean={stats.get('mean', np.nan):.2f}, "
            f"median={series.median():.2f}, non_null={present}"
        )

    header = f"Rows: {len(df)}, Columns: {len(df.columns)}"
    column_lines = [_column_line(name) for name in df.columns]
    preview = df.head(sample_rows).to_csv(index=False)
    return "\n".join([header, *column_lines, "--- Sample Data ---", preview])
# ======================================================
# 🧠 ANALYSIS LOGIC
# ======================================================
def _gemini_generate(prompt: str, api_key: str, temperature: float, max_tokens: int) -> str:
    """Send ``prompt`` to Gemini 2.5 Flash and return the response text."""
    genai.configure(api_key=api_key)
    response = genai.GenerativeModel("gemini-2.5-flash").generate_content(
        prompt,
        generation_config={
            "temperature": temperature,
            "max_output_tokens": max_tokens,
        },
    )
    return response.text if hasattr(response, "text") else "No valid text response."


def query_analysis_model(
    df: pd.DataFrame,
    user_query: str,
    dataset_name: str,
    analyst_model: str,
    hf_client: InferenceClient = None,
    temperature: float = 0.3,
    max_tokens: int = 1024,
    gemini_api_key: str = None
) -> str:
    """Query the selected AI model (Hugging Face or Gemini) about the dataset.

    Args:
        df: Dataset to analyse; only its textual summary is sent to the model.
        user_query: The user's question.
        dataset_name: Name of the dataset, quoted in the prompt.
        analyst_model: Model label; ``"Gemini 2.5 Flash (Google)"`` selects
            Gemini, any other value uses ``hf_client``.
        hf_client: Hugging Face client, required for non-Gemini models.
        temperature: Sampling temperature, used for every model call
            including the Gemini fallbacks.
        max_tokens: Output token limit, used for every model call including
            the Gemini fallbacks.
        gemini_api_key: Enables Gemini, both as the primary model and as the
            fallback when Hugging Face is unavailable.

    Returns:
        The model's answer, or a warning string describing the failure. The
        function reports errors through its return value and does not raise
        for model/service failures.
    """
    prompt_summary = summarize_for_analysis(df)
    prompt = (
        "\n"
        "You are a professional data analyst.\n"
        f"Analyze the dataset '{dataset_name}' and answer the user's question.\n"
        "--- DATA SUMMARY ---\n"
        f"{prompt_summary}\n"
        "--- USER QUESTION ---\n"
        f"{user_query}\n"
        "Respond with:\n"
        "1. Key insights and patterns\n"
        "2. Quantitative findings\n"
        "3. Notable relationships or anomalies\n"
        "4. Data-driven recommendations\n"
    )

    try:
        if analyst_model == "Gemini 2.5 Flash (Google)":
            if not gemini_api_key:
                return "⚠️ Gemini API key missing. Cannot use Gemini."
            return _gemini_generate(prompt, gemini_api_key, temperature, max_tokens)

        # Otherwise, use the Hugging Face model.
        if hf_client is None:
            # Fail with a clear message instead of an AttributeError on None.
            return "⚠️ Hugging Face client missing. Cannot use the selected model."
        result = safe_hf_generate(hf_client, prompt, temperature=temperature, max_tokens=max_tokens)

        # safe_hf_generate signals a persistent 503 through its return value.
        if "temporarily unavailable" in result.lower() and gemini_api_key:
            alt = _gemini_generate(prompt, gemini_api_key, temperature, max_tokens)
            return f"🔁 Fallback to Gemini:\n\n{alt}"
        return result
    except Exception as e:
        if "503" in str(e) and gemini_api_key:
            # The fallback itself may fail; keep the "always return a string"
            # contract instead of letting that second error escape.
            try:
                alt = _gemini_generate(prompt, gemini_api_key, temperature, max_tokens)
            except Exception as fallback_error:
                return (
                    f"⚠️ Analysis failed: {str(e)} "
                    f"(Gemini fallback also failed: {str(fallback_error)})"
                )
            return f"🔁 Fallback to Gemini due to 503 error:\n\n{alt}"
        return f"⚠️ Analysis failed: {str(e)}"
# ======================================================
# 🏁 END OF MODULE
# ======================================================
|