Spaces:

Starberry15
/

data_analysis

Sleeping

App Files Files Community

Starberry15 committed on Oct 22

Commit

aefddc0

verified ·

1 Parent(s): 3c77075

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +23 -65

src/streamlit_app.py CHANGED Viewed

@@ -1,13 +1,12 @@
 # streamlit_data_analysis_app.py
-# Streamlit Data Analysis App for Hugging Face Spaces + Gemini 2.0 Flash
 # Features:
 # - Upload CSV / Excel
 # - Automatic cleaning & standardization
 # - Preprocessing (imputation, encoding, scaling)
 # - Quick visualizations
 # - Dataset summary + preview
-# - Insights from LLMs (Gemini or Hugging Face)
-# - Auto fallback and detailed error messages
 import os
 import streamlit as st
@@ -19,49 +18,36 @@ from sklearn.impute import SimpleImputer
 from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
-from huggingface_hub import InferenceClient
 import google.generativeai as genai
 # ---------- CONFIGURATION ----------
 st.set_page_config(page_title="Data Analysis App", layout="wide")
-# Load API keys safely
-try:
-    HF_TOKEN = st.secrets["HF_TOKEN"]
-except Exception:
-    HF_TOKEN = os.getenv("HF_TOKEN")
 try:
     GEMINI_API_KEY = st.secrets["GEMINI_API_KEY"]
 except Exception:
     GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
-# Setup Gemini if available
 if GEMINI_API_KEY:
     genai.configure(api_key=GEMINI_API_KEY)
     st.success("✅ Gemini API key loaded successfully.")
-elif HF_TOKEN:
-    st.success("✅ Hugging Face token loaded successfully.")
 else:
-    st.warning("⚠️ No Gemini or Hugging Face token found. LLM features will be disabled.")
-# Default models
-MODEL_OPTIONS = {
-    "gemini-2.0-flash": "Gemini 2.0 Flash (Google AI, fast, free-tier)",
-    "mistralai/Mistral-7B-Instruct-v0.3": "Mistral 7B Instruct (open)",
-    "HuggingFaceH4/zephyr-7b-beta": "Zephyr 7B Beta (open)",
-    "bigscience/bloom-3b": "Bloom 3B (lightweight)",
-}
 # ---------- UTILITIES ----------
 def read_file(uploaded_file):
     name = uploaded_file.name.lower()
-    if name.endswith(('.csv', '.txt')):
-        return pd.read_csv(uploaded_file)
-    elif name.endswith(('.xls', '.xlsx')):
-        return pd.read_excel(uploaded_file)
-    else:
-        raise ValueError("Unsupported file type. Please upload CSV or Excel.")
 def clean_column_name(col: str) -> str:
     col = str(col).strip().lower().replace("\n", " ").replace("\t", " ")
@@ -138,7 +124,7 @@ def apply_preprocessing(df: pd.DataFrame, preprocessor: ColumnTransformer) -> pd
                 feature_names += cols
     return pd.DataFrame(X, columns=feature_names)
-# ---------- LLM HELPERS ----------
 def build_dataset_prompt(summary, user_question=None):
     s = [f"Dataset shape: {summary['shape'][0]} rows, {summary['shape'][1]} columns."]
     for c in summary['columns']:
@@ -152,31 +138,7 @@ def build_dataset_prompt(summary, user_question=None):
         s.append("Please provide a summary, notable patterns, and suggestions for visualizations.")
     return "\n".join(s)
-def call_llm_huggingface(prompt: str, model: str, max_tokens: int = 512) -> str:
-    if not HF_TOKEN:
-        return "⚠️ No Hugging Face token found."
-    client = InferenceClient(token=HF_TOKEN)
-    try:
-        response = client.text_generation(model=model, inputs=prompt, max_new_tokens=max_tokens)
-        if isinstance(response, dict):
-            return response.get('generated_text', str(response))
-        return str(response)
-    except Exception as e:
-        if "403" in str(e):
-            fallback = "mistralai/Mistral-7B-Instruct-v0.3"
-            if model != fallback:
-                try:
-                    st.warning(f"🚫 Access denied to {model}. Falling back to {fallback}...")
-                    response = client.text_generation(model=fallback, inputs=prompt, max_new_tokens=max_tokens)
-                    if isinstance(response, dict):
-                        return response.get('generated_text', str(response))
-                    return str(response)
-                except Exception as e2:
-                    return f"❌ Fallback model also failed: {e2}"
-            return "🚫 Access denied (403). Try using an open-access model."
-        return f"❌ LLM call failed: {e}"
-def call_llm_gemini(prompt: str, model="gemini-2.0-flash", max_tokens=512):
     if not GEMINI_API_KEY:
         return "⚠️ Gemini API key not found."
     try:
@@ -187,13 +149,12 @@ def call_llm_gemini(prompt: str, model="gemini-2.0-flash", max_tokens=512):
         return f"❌ Gemini call failed: {e}"
 # ---------- STREAMLIT UI ----------
-st.title("📊 Data Analysis & Cleaning App")
-st.markdown("Upload CSV or Excel, clean and preprocess it, visualize data, and get insights from an AI model.")
 with st.sidebar:
     st.header("⚙️ Options")
-    model_choice = st.selectbox("Select Model", options=list(MODEL_OPTIONS.keys()), format_func=lambda k: MODEL_OPTIONS[k])
-    max_tokens = st.slider("LLM max tokens", 128, 1024, 512, 64)
     impute_strategy_num = st.selectbox("Numeric imputation", ['mean', 'median', 'most_frequent'])
     encode_categorical = st.selectbox("Categorical encoding", ['onehot', 'ordinal'])
     scale_numeric = st.checkbox("Scale numeric features", True)
@@ -235,7 +196,7 @@ if uploaded_file:
         second_col = st.selectbox("Second column", options=[c for c in cleaned_df.columns if c != viz_col])
     if st.button("Show Visualization"):
-        fig, ax = plt.subplots(figsize=(8,5))
         try:
             if viz_type == 'Histogram':
                 sns.histplot(cleaned_df[viz_col], kde=True, ax=ax)
@@ -253,15 +214,12 @@ if uploaded_file:
         except Exception as e:
             st.error(f"Visualization failed: {e}")
-    st.subheader("🧠 Ask the AI for Insights")
     user_q = st.text_area("Enter your question (optional):")
     if st.button("Get Insights"):
-        with st.spinner("Generating insights..."):
             prompt = build_dataset_prompt(summary, user_q if user_q else None)
-            if model_choice.startswith("gemini"):
-                llm_resp = call_llm_gemini(prompt, model_choice, max_tokens)
-            else:
-                llm_resp = call_llm_huggingface(prompt, model_choice, max_tokens)
             st.write(llm_resp)
 else:

 # streamlit_data_analysis_app.py
+# Streamlit Data Analysis App using Gemini 2.0 Flash (Free-tier)
 # Features:
 # - Upload CSV / Excel
 # - Automatic cleaning & standardization
 # - Preprocessing (imputation, encoding, scaling)
 # - Quick visualizations
 # - Dataset summary + preview
+# - Insights powered by Gemini 2.0 Flash (Google AI)
 import os
 import streamlit as st
 from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 import google.generativeai as genai
 # ---------- CONFIGURATION ----------
 st.set_page_config(page_title="Data Analysis App", layout="wide")
+# Load Gemini API key safely
 try:
     GEMINI_API_KEY = st.secrets["GEMINI_API_KEY"]
 except Exception:
     GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 if GEMINI_API_KEY:
     genai.configure(api_key=GEMINI_API_KEY)
     st.success("✅ Gemini API key loaded successfully.")
 else:
+    st.warning("⚠️ No Gemini API key found. Please add GEMINI_API_KEY to .env or Streamlit secrets.")
 # ---------- UTILITIES ----------
 def read_file(uploaded_file):
     """Load an uploaded CSV/TXT or Excel file into a pandas DataFrame.

     Args:
         uploaded_file: File-like object exposing a ``name`` attribute
             (presumably the object returned by Streamlit's file uploader;
             confirm against the caller).

     Returns:
         The DataFrame parsed by ``pd.read_csv`` or ``pd.read_excel``.

     Raises:
         ValueError: If the file name does not end in .csv, .txt, .xls or
             .xlsx.
         Exception: Any failure while reading is reported through
             ``st.error`` and then re-raised unchanged.
     """
     name = uploaded_file.name.lower()
     try:
         if name.endswith(('.csv', '.txt')):
             # ``read_csv`` has no ``errors`` keyword (passing it raises
             # TypeError on every call); ``encoding_errors`` is the parameter
             # that controls how undecodable bytes are handled.
             return pd.read_csv(
                 uploaded_file, encoding="utf-8", encoding_errors="replace"
             )
         elif name.endswith(('.xls', '.xlsx')):
             return pd.read_excel(uploaded_file)
         else:
             raise ValueError("Unsupported file type. Please upload CSV or Excel.")
     except Exception as e:
         # Surface the problem in the UI, then let the caller decide what to do.
         st.error(f"❌ File reading failed: {e}")
         raise
 def clean_column_name(col: str) -> str:
     col = str(col).strip().lower().replace("\n", " ").replace("\t", " ")
                 feature_names += cols
     return pd.DataFrame(X, columns=feature_names)
+# ---------- LLM (Gemini only) ----------
 def build_dataset_prompt(summary, user_question=None):
     s = [f"Dataset shape: {summary['shape'][0]} rows, {summary['shape'][1]} columns."]
     for c in summary['columns']:
         s.append("Please provide a summary, notable patterns, and suggestions for visualizations.")
     return "\n".join(s)
+def call_llm_gemini(prompt: str, model="gemini-2.0-flash"):
     if not GEMINI_API_KEY:
         return "⚠️ Gemini API key not found."
     try:
         return f"❌ Gemini call failed: {e}"
 # ---------- STREAMLIT UI ----------
+st.title("📊 Data Analysis & Cleaning App (Gemini-Powered)")
+st.markdown("Upload CSV or Excel, clean and preprocess it, visualize data, and get insights powered by **Gemini 2.0 Flash**.")
 with st.sidebar:
     st.header("⚙️ Options")
+    st.info("Using **Gemini 2.0 Flash (Google AI)** for insights.")
     impute_strategy_num = st.selectbox("Numeric imputation", ['mean', 'median', 'most_frequent'])
     encode_categorical = st.selectbox("Categorical encoding", ['onehot', 'ordinal'])
     scale_numeric = st.checkbox("Scale numeric features", True)
         second_col = st.selectbox("Second column", options=[c for c in cleaned_df.columns if c != viz_col])
     if st.button("Show Visualization"):
+        fig, ax = plt.subplots(figsize=(8, 5))
         try:
             if viz_type == 'Histogram':
                 sns.histplot(cleaned_df[viz_col], kde=True, ax=ax)
         except Exception as e:
             st.error(f"Visualization failed: {e}")
+    st.subheader("🧠 Ask Gemini for Insights")
     user_q = st.text_area("Enter your question (optional):")
     if st.button("Get Insights"):
+        with st.spinner("Generating insights via Gemini..."):
             prompt = build_dataset_prompt(summary, user_q if user_q else None)
+            llm_resp = call_llm_gemini(prompt)
             st.write(llm_resp)
 else: