Starberry15 committed on
Commit
aefddc0
·
verified ·
1 Parent(s): 3c77075

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +23 -65
src/streamlit_app.py CHANGED
@@ -1,13 +1,12 @@
1
  # streamlit_data_analysis_app.py
2
- # Streamlit Data Analysis App for Hugging Face Spaces + Gemini 2.0 Flash
3
  # Features:
4
  # - Upload CSV / Excel
5
  # - Automatic cleaning & standardization
6
  # - Preprocessing (imputation, encoding, scaling)
7
  # - Quick visualizations
8
  # - Dataset summary + preview
9
- # - Insights from LLMs (Gemini or Hugging Face)
10
- # - Auto fallback and detailed error messages
11
 
12
  import os
13
  import streamlit as st
@@ -19,49 +18,36 @@ from sklearn.impute import SimpleImputer
19
  from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
20
  from sklearn.compose import ColumnTransformer
21
  from sklearn.pipeline import Pipeline
22
- from huggingface_hub import InferenceClient
23
  import google.generativeai as genai
24
 
25
  # ---------- CONFIGURATION ----------
26
  st.set_page_config(page_title="Data Analysis App", layout="wide")
27
 
28
- # Load API keys safely
29
- try:
30
- HF_TOKEN = st.secrets["HF_TOKEN"]
31
- except Exception:
32
- HF_TOKEN = os.getenv("HF_TOKEN")
33
-
34
  try:
35
  GEMINI_API_KEY = st.secrets["GEMINI_API_KEY"]
36
  except Exception:
37
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
38
 
39
- # Setup Gemini if available
40
  if GEMINI_API_KEY:
41
  genai.configure(api_key=GEMINI_API_KEY)
42
  st.success("✅ Gemini API key loaded successfully.")
43
- elif HF_TOKEN:
44
- st.success("✅ Hugging Face token loaded successfully.")
45
  else:
46
- st.warning("⚠️ No Gemini or Hugging Face token found. LLM features will be disabled.")
47
-
48
- # Default models
49
- MODEL_OPTIONS = {
50
- "gemini-2.0-flash": "Gemini 2.0 Flash (Google AI, fast, free-tier)",
51
- "mistralai/Mistral-7B-Instruct-v0.3": "Mistral 7B Instruct (open)",
52
- "HuggingFaceH4/zephyr-7b-beta": "Zephyr 7B Beta (open)",
53
- "bigscience/bloom-3b": "Bloom 3B (lightweight)",
54
- }
55
 
56
  # ---------- UTILITIES ----------
57
  def read_file(uploaded_file):
58
  name = uploaded_file.name.lower()
59
- if name.endswith(('.csv', '.txt')):
60
- return pd.read_csv(uploaded_file)
61
- elif name.endswith(('.xls', '.xlsx')):
62
- return pd.read_excel(uploaded_file)
63
- else:
64
- raise ValueError("Unsupported file type. Please upload CSV or Excel.")
 
 
 
 
65
 
66
  def clean_column_name(col: str) -> str:
67
  col = str(col).strip().lower().replace("\n", " ").replace("\t", " ")
@@ -138,7 +124,7 @@ def apply_preprocessing(df: pd.DataFrame, preprocessor: ColumnTransformer) -> pd
138
  feature_names += cols
139
  return pd.DataFrame(X, columns=feature_names)
140
 
141
- # ---------- LLM HELPERS ----------
142
  def build_dataset_prompt(summary, user_question=None):
143
  s = [f"Dataset shape: {summary['shape'][0]} rows, {summary['shape'][1]} columns."]
144
  for c in summary['columns']:
@@ -152,31 +138,7 @@ def build_dataset_prompt(summary, user_question=None):
152
  s.append("Please provide a summary, notable patterns, and suggestions for visualizations.")
153
  return "\n".join(s)
154
 
155
- def call_llm_huggingface(prompt: str, model: str, max_tokens: int = 512) -> str:
156
- if not HF_TOKEN:
157
- return "⚠️ No Hugging Face token found."
158
- client = InferenceClient(token=HF_TOKEN)
159
- try:
160
- response = client.text_generation(model=model, inputs=prompt, max_new_tokens=max_tokens)
161
- if isinstance(response, dict):
162
- return response.get('generated_text', str(response))
163
- return str(response)
164
- except Exception as e:
165
- if "403" in str(e):
166
- fallback = "mistralai/Mistral-7B-Instruct-v0.3"
167
- if model != fallback:
168
- try:
169
- st.warning(f"🚫 Access denied to {model}. Falling back to {fallback}...")
170
- response = client.text_generation(model=fallback, inputs=prompt, max_new_tokens=max_tokens)
171
- if isinstance(response, dict):
172
- return response.get('generated_text', str(response))
173
- return str(response)
174
- except Exception as e2:
175
- return f"❌ Fallback model also failed: {e2}"
176
- return "🚫 Access denied (403). Try using an open-access model."
177
- return f"❌ LLM call failed: {e}"
178
-
179
- def call_llm_gemini(prompt: str, model="gemini-2.0-flash", max_tokens=512):
180
  if not GEMINI_API_KEY:
181
  return "⚠️ Gemini API key not found."
182
  try:
@@ -187,13 +149,12 @@ def call_llm_gemini(prompt: str, model="gemini-2.0-flash", max_tokens=512):
187
  return f"❌ Gemini call failed: {e}"
188
 
189
  # ---------- STREAMLIT UI ----------
190
- st.title("📊 Data Analysis & Cleaning App")
191
- st.markdown("Upload CSV or Excel, clean and preprocess it, visualize data, and get insights from an AI model.")
192
 
193
  with st.sidebar:
194
  st.header("⚙️ Options")
195
- model_choice = st.selectbox("Select Model", options=list(MODEL_OPTIONS.keys()), format_func=lambda k: MODEL_OPTIONS[k])
196
- max_tokens = st.slider("LLM max tokens", 128, 1024, 512, 64)
197
  impute_strategy_num = st.selectbox("Numeric imputation", ['mean', 'median', 'most_frequent'])
198
  encode_categorical = st.selectbox("Categorical encoding", ['onehot', 'ordinal'])
199
  scale_numeric = st.checkbox("Scale numeric features", True)
@@ -235,7 +196,7 @@ if uploaded_file:
235
  second_col = st.selectbox("Second column", options=[c for c in cleaned_df.columns if c != viz_col])
236
 
237
  if st.button("Show Visualization"):
238
- fig, ax = plt.subplots(figsize=(8,5))
239
  try:
240
  if viz_type == 'Histogram':
241
  sns.histplot(cleaned_df[viz_col], kde=True, ax=ax)
@@ -253,15 +214,12 @@ if uploaded_file:
253
  except Exception as e:
254
  st.error(f"Visualization failed: {e}")
255
 
256
- st.subheader("🧠 Ask the AI for Insights")
257
  user_q = st.text_area("Enter your question (optional):")
258
  if st.button("Get Insights"):
259
- with st.spinner("Generating insights..."):
260
  prompt = build_dataset_prompt(summary, user_q if user_q else None)
261
- if model_choice.startswith("gemini"):
262
- llm_resp = call_llm_gemini(prompt, model_choice, max_tokens)
263
- else:
264
- llm_resp = call_llm_huggingface(prompt, model_choice, max_tokens)
265
  st.write(llm_resp)
266
 
267
  else:
 
1
  # streamlit_data_analysis_app.py
2
+ # Streamlit Data Analysis App using Gemini 2.0 Flash (Free-tier)
3
  # Features:
4
  # - Upload CSV / Excel
5
  # - Automatic cleaning & standardization
6
  # - Preprocessing (imputation, encoding, scaling)
7
  # - Quick visualizations
8
  # - Dataset summary + preview
9
+ # - Insights powered by Gemini 2.0 Flash (Google AI)
 
10
 
11
  import os
12
  import streamlit as st
 
18
  from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
19
  from sklearn.compose import ColumnTransformer
20
  from sklearn.pipeline import Pipeline
 
21
  import google.generativeai as genai
22
 
23
  # ---------- CONFIGURATION ----------
24
  st.set_page_config(page_title="Data Analysis App", layout="wide")
25
 
26
+ # Load Gemini API key safely
 
 
 
 
 
27
  try:
28
  GEMINI_API_KEY = st.secrets["GEMINI_API_KEY"]
29
  except Exception:
30
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
31
 
 
32
  if GEMINI_API_KEY:
33
  genai.configure(api_key=GEMINI_API_KEY)
34
  st.success("✅ Gemini API key loaded successfully.")
 
 
35
  else:
36
+ st.warning("⚠️ No Gemini API key found. Please add GEMINI_API_KEY to .env or Streamlit secrets.")
 
 
 
 
 
 
 
 
37
 
38
  # ---------- UTILITIES ----------
def read_file(uploaded_file):
    """Load an uploaded CSV/text or Excel file into a pandas DataFrame.

    The file type is chosen from the (case-insensitive) extension of
    ``uploaded_file.name``: ``.csv``/``.txt`` are parsed with
    ``pd.read_csv`` and ``.xls``/``.xlsx`` with ``pd.read_excel``.

    Args:
        uploaded_file: A file-like object exposing a ``name`` attribute
            (presumably the object returned by ``st.file_uploader`` --
            confirm against the caller).

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If the extension is not one of the supported types.
        Exception: Any parser error is shown via ``st.error`` and then
            re-raised unchanged so the caller can decide how to proceed.
    """
    name = uploaded_file.name.lower()
    try:
        if name.endswith(('.csv', '.txt')):
            # `read_csv` has no `errors` keyword (passing it raises TypeError
            # on every call); the decoder error policy is `encoding_errors`.
            # "replace" substitutes U+FFFD for undecodable bytes instead of
            # aborting on files that are not clean UTF-8.
            return pd.read_csv(
                uploaded_file,
                encoding="utf-8",
                encoding_errors="replace",
            )
        elif name.endswith(('.xls', '.xlsx')):
            return pd.read_excel(uploaded_file)
        else:
            raise ValueError("Unsupported file type. Please upload CSV or Excel.")
    except Exception as e:
        # Surface the failure in the UI, then propagate it to the caller.
        st.error(f"❌ File reading failed: {e}")
        raise
51
 
52
  def clean_column_name(col: str) -> str:
53
  col = str(col).strip().lower().replace("\n", " ").replace("\t", " ")
 
124
  feature_names += cols
125
  return pd.DataFrame(X, columns=feature_names)
126
 
127
+ # ---------- LLM (Gemini only) ----------
128
  def build_dataset_prompt(summary, user_question=None):
129
  s = [f"Dataset shape: {summary['shape'][0]} rows, {summary['shape'][1]} columns."]
130
  for c in summary['columns']:
 
138
  s.append("Please provide a summary, notable patterns, and suggestions for visualizations.")
139
  return "\n".join(s)
140
 
141
+ def call_llm_gemini(prompt: str, model="gemini-2.0-flash"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  if not GEMINI_API_KEY:
143
  return "⚠️ Gemini API key not found."
144
  try:
 
149
  return f"❌ Gemini call failed: {e}"
150
 
151
  # ---------- STREAMLIT UI ----------
152
+ st.title("📊 Data Analysis & Cleaning App (Gemini-Powered)")
153
+ st.markdown("Upload CSV or Excel, clean and preprocess it, visualize data, and get insights powered by **Gemini 2.0 Flash**.")
154
 
155
  with st.sidebar:
156
  st.header("⚙️ Options")
157
+ st.info("Using **Gemini 2.0 Flash (Google AI)** for insights.")
 
158
  impute_strategy_num = st.selectbox("Numeric imputation", ['mean', 'median', 'most_frequent'])
159
  encode_categorical = st.selectbox("Categorical encoding", ['onehot', 'ordinal'])
160
  scale_numeric = st.checkbox("Scale numeric features", True)
 
196
  second_col = st.selectbox("Second column", options=[c for c in cleaned_df.columns if c != viz_col])
197
 
198
  if st.button("Show Visualization"):
199
+ fig, ax = plt.subplots(figsize=(8, 5))
200
  try:
201
  if viz_type == 'Histogram':
202
  sns.histplot(cleaned_df[viz_col], kde=True, ax=ax)
 
214
  except Exception as e:
215
  st.error(f"Visualization failed: {e}")
216
 
217
+ st.subheader("🧠 Ask Gemini for Insights")
218
  user_q = st.text_area("Enter your question (optional):")
219
  if st.button("Get Insights"):
220
+ with st.spinner("Generating insights via Gemini..."):
221
  prompt = build_dataset_prompt(summary, user_q if user_q else None)
222
+ llm_resp = call_llm_gemini(prompt)
 
 
 
223
  st.write(llm_resp)
224
 
225
  else: