Starberry15 committed
Commit 5e0cf9a · verified · 1 Parent(s): 2e917ae

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py +133 -221
src/streamlit_app.py CHANGED
@@ -2,18 +2,16 @@
  # Streamlit Data Analysis App for Hugging Face Spaces
  # Features:
  # - Upload CSV / Excel
- # - Automatic cleaning & standardization (column names, missing values, dtypes)
  # - Preprocessing (imputation, encoding, scaling)
  # - Quick visualizations (histogram, boxplot, scatter, correlation heatmap)
  # - Preview cleaned dataset
- # - LLM-powered dataset summary & insights using Hugging Face Inference API
- # - Uses HF_TOKEN from Streamlit secrets (or environment variable)

  import os
  import io
- import math
- from typing import Optional, Tuple, List, Dict
-
  import streamlit as st
  import pandas as pd
  import numpy as np
@@ -25,28 +23,27 @@ from sklearn.compose import ColumnTransformer
  from sklearn.pipeline import Pipeline
  from huggingface_hub import InferenceClient

- # ---------- Configuration ----------
  st.set_page_config(page_title="Data Analysis App", layout="wide")

- # Try to read HF token from Streamlit secrets then environment
- HF_TOKEN = None
- try:
-     HF_TOKEN = st.secrets.get("HF_TOKEN")
- except Exception:
-     HF_TOKEN = None
  if not HF_TOKEN:
-     HF_TOKEN = os.getenv("HF_TOKEN")

- # Default open-source model choices (available on Hugging Face)
  MODEL_OPTIONS = {
-     "bigscience/bloomz-7b1": "BloomZ 7B (instruction-tuned)",
-     "tiiuae/falcon-7b-instruct": "Falcon 7B Instruct",
-     "bigscience/bloom-3b": "Bloom 3B (lighter)"
  }

- # ---------- Utility functions ----------

  def read_file(uploaded_file) -> pd.DataFrame:
      name = uploaded_file.name.lower()
      if name.endswith(('.csv', '.txt')):
          return pd.read_csv(uploaded_file)
@@ -57,291 +54,206 @@ def read_file(uploaded_file) -> pd.DataFrame:


  def clean_column_name(col: str) -> str:
-     # standardize: strip, lower, replace spaces and special chars with _
-     col = str(col).strip()
-     col = col.replace("\n", " ").replace("\t", " ")
-     col = col.lower()
      col = "_".join(col.split())
-     # keep alphanumerics and _
      col = ''.join(c for c in col if (c.isalnum() or c == '_'))
-     # collapse multiple _
      while '__' in col:
          col = col.replace('__', '_')
      return col


  def standardize_dataframe(df: pd.DataFrame, drop_all_nan_cols: bool = True) -> pd.DataFrame:
      df = df.copy()
-     # strip whitespace from string columns
      for c in df.select_dtypes(include=['object']).columns:
          df[c] = df[c].apply(lambda x: x.strip() if isinstance(x, str) else x)
-     # standardize column names
      df.columns = [clean_column_name(c) for c in df.columns]
-     # drop fully empty columns
      if drop_all_nan_cols:
          df.dropna(axis=1, how='all', inplace=True)
-     # try to parse datetime columns heuristically
      for c in df.columns:
          if df[c].dtype == object:
              sample = df[c].dropna().astype(str).head(20)
              if not sample.empty:
-                 # quick heuristic: if majority parse as datetime
                  parsed = pd.to_datetime(sample, errors='coerce')
                  if parsed.notna().sum() / len(sample) > 0.6:
                      df[c] = pd.to_datetime(df[c], errors='coerce')
      return df


- def summarize_dataframe(df: pd.DataFrame, max_rows: int = 5) -> Dict:
-     summary = {}
-     summary['shape'] = df.shape
-     summary['columns'] = []
      for c in df.columns:
-         col_info = {
-             'name': c,
-             'dtype': str(df[c].dtype),
-             'n_missing': int(df[c].isna().sum()),
-             'n_unique': int(df[c].nunique(dropna=True)),
-         }
          if pd.api.types.is_numeric_dtype(df[c]):
-             desc = df[c].describe().to_dict()
-             col_info['summary'] = {k: float(v) for k, v in desc.items()}
          elif pd.api.types.is_datetime64_any_dtype(df[c]):
-             col_info['summary'] = {
-                 'min': str(df[c].min()),
-                 'max': str(df[c].max())
-             }
          else:
-             col_info['top_values'] = df[c].dropna().astype(str).value_counts().head(5).to_dict()
-         summary['columns'].append(col_info)
-     summary['preview'] = df.head(max_rows).to_dict(orient='records')
      return summary


- def prepare_preprocessing_pipeline(df: pd.DataFrame, impute_strategy_num='median', scale_numeric=True, encode_categorical='onehot') -> Tuple[Pipeline, List[str]]:
      numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
      cat_cols = list(df.select_dtypes(include=['object', 'category', 'bool']).columns)
-     datetime_cols = list(df.select_dtypes(include=['datetime64']).columns)
-
      transformers = []
      if numeric_cols:
-         num_pipeline = Pipeline(steps=[
-             ('imputer', SimpleImputer(strategy=impute_strategy_num)),
-         ])
          if scale_numeric:
-             num_pipeline.steps.append(('scaler', StandardScaler()))
-         transformers.append(('num', num_pipeline, numeric_cols))
      if cat_cols:
          if encode_categorical == 'onehot':
-             cat_pipeline = Pipeline(steps=[
                  ('imputer', SimpleImputer(strategy='most_frequent')),
                  ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
              ])
          else:
-             cat_pipeline = Pipeline(steps=[
                  ('imputer', SimpleImputer(strategy='most_frequent')),
                  ('ord', OrdinalEncoder())
              ])
-         transformers.append(('cat', cat_pipeline, cat_cols))
-
-     preprocessor = ColumnTransformer(transformers=transformers, remainder='drop')
-     return preprocessor, numeric_cols + cat_cols + datetime_cols


  def apply_preprocessing(df: pd.DataFrame, preprocessor: ColumnTransformer) -> pd.DataFrame:
-     # returns processed numpy array and rebuilt column names for easy display
      X = preprocessor.fit_transform(df)
-     # build feature names
      feature_names = []
-     for name, trans, columns in preprocessor.transformers_:
          if name == 'num':
-             feature_names += columns
          elif name == 'cat':
-             # try to extract categories from OneHotEncoder
              try:
-                 ohe = trans.named_steps.get('onehot')
-                 cats = ohe.categories_
-                 for col, catvals in zip(columns, cats):
-                     for v in catvals:
-                         feature_names.append(f"{col}__{v}")
              except Exception:
-                 # fallback
-                 feature_names += columns
-         else:
-             feature_names += columns
-     proc_df = pd.DataFrame(X, columns=feature_names)
-     return proc_df
-
- # ---------- LLM helper ----------
-
- def build_dataset_prompt(summary: Dict, user_question: Optional[str] = None) -> str:
-     # Build a robust prompt summarizing the dataset for the LLM to give insights
-     s = []
-     s.append("You are a helpful data analyst assistant. I will give you a dataset summary and ask for insights and next steps.")
-     s.append(f"Dataset shape: {summary['shape'][0]} rows, {summary['shape'][1]} columns.")
-     s.append("Columns:")
-     for col in summary['columns']:
-         s.append(f"- {col['name']} (dtype: {col['dtype']}; missing: {col['n_missing']}; unique: {col['n_unique']})")
-         if 'summary' in col:
-             s.append(f"  summary: {col['summary']}")
-         if 'top_values' in col:
-             s.append(f"  top values: {col['top_values']}")
-     s.append("Preview of top rows:")
-     for r in summary['preview']:
-         s.append(str(r))
      if user_question:
-         s.append("User question: " + user_question)
      else:
-         s.append("Please provide: 1) quick dataset quality assessment, 2) columns of interest, 3) suggested cleaning steps, 4) recommended visualizations and quick findings, 5) suggested next steps for modeling or analysis.")
-     prompt = "\n".join(s)
-     return prompt


- def call_llm(prompt: str, model: str = 'bigscience/bloomz-7b1', max_tokens: int = 512) -> str:
      if not HF_TOKEN:
-         return "ERROR: HF_TOKEN not found. Put your Hugging Face token in Streamlit secrets under 'HF_TOKEN' or set the HF_TOKEN environment variable."
      client = InferenceClient(token=HF_TOKEN)
-     # Use the text generation endpoint
      try:
          response = client.text_generation(prompt, model=model, max_new_tokens=max_tokens)
-         # The returned object structure depends on HF inference client; try to be robust
-         if isinstance(response, list):
-             return response[0].get('generated_text', str(response))
-         elif isinstance(response, dict):
              return response.get('generated_text', str(response))
-         else:
-             return str(response)
      except Exception as e:
-         return f"LLM call failed: {e}"
-
- # ---------- Streamlit UI ----------
-
- st.title("Data Analysis & Cleaning App — Streamlit (Deployable to Hugging Face Spaces)")
- st.markdown("Upload a CSV or Excel file, clean it, preprocess, preview cleaned data, visualize quickly, and ask an LLM for insights.")

  with st.sidebar:
-     st.header("Options")
-     model_choice = st.selectbox("LLM model (Inference API)", options=list(MODEL_OPTIONS.keys()), format_func=lambda k: MODEL_OPTIONS[k])
-     max_tokens = st.slider("LLM max tokens", min_value=128, max_value=1024, value=512, step=64)
      impute_strategy_num = st.selectbox("Numeric imputation", ['mean', 'median', 'most_frequent'])
      encode_categorical = st.selectbox("Categorical encoding", ['onehot', 'ordinal'])
-     scale_numeric = st.checkbox("Scale numeric features", value=True)
-     show_raw_preview = st.checkbox("Show raw preview (before cleaning)", value=True)

- uploaded_file = st.file_uploader("Upload CSV or Excel file", type=['csv', 'xls', 'xlsx', 'txt'])

  if uploaded_file:
-     try:
-         with st.spinner("Reading file..."):
-             raw_df = read_file(uploaded_file)
-     except Exception as e:
-         st.error(f"Failed to read file: {e}")
-         st.stop()

      if show_raw_preview:
-         st.subheader("Raw data preview")
-         st.dataframe(raw_df.head(10))
-
-     st.subheader("Cleaning & Standardization")
-     drop_all_nan_cols = st.checkbox("Drop columns with all missing values", value=True)
-     cleaned_df = standardize_dataframe(raw_df, drop_all_nan_cols=drop_all_nan_cols)
-     st.write(f"Data after standardization — shape: {cleaned_df.shape}")
-     st.dataframe(cleaned_df.head(10))
-
-     st.subheader("Quick data summary")
-     summary = summarize_dataframe(cleaned_df, max_rows=5)
-     col1, col2 = st.columns([2, 1])
-     with col1:
-         st.write(f"**Shape:** {summary['shape']}")
-         st.write("**Columns:**")
-         for c in summary['columns']:
-             st.markdown(f"- **{c['name']}** — dtype: {c['dtype']} — missing: {c['n_missing']} — unique: {c['n_unique']}")
-     with col2:
-         st.write("**Preview (head)**")
-         st.table(pd.DataFrame(summary['preview']))
-
-     st.subheader("Preprocessing")
-     if st.button("Generate preprocessing pipeline and preview processed data"):
-         preprocessor, kept_cols = prepare_preprocessing_pipeline(cleaned_df, impute_strategy_num=impute_strategy_num, scale_numeric=scale_numeric, encode_categorical=encode_categorical)
-         try:
-             proc_df = apply_preprocessing(cleaned_df, preprocessor)
-             st.success("Preprocessing applied — showing preview")
-             st.dataframe(proc_df.head(10))
-             st.markdown(f"Processed feature count: **{proc_df.shape[1]}**")
-             csv = proc_df.to_csv(index=False)
-             st.download_button("Download processed CSV", data=csv, file_name="processed_data.csv")
-         except Exception as e:
-             st.error(f"Failed to process dataset: {e}")

-     st.subheader("Quick visualizations")
-     viz_col = st.selectbox("Select column for visualization (numeric or categorical)", options=list(cleaned_df.columns))
-     viz_type = st.selectbox("Chart type", ['Histogram', 'Boxplot', 'Bar (categorical)', 'Scatter (choose second column)', 'Correlation heatmap'])

-     if viz_type == 'Scatter (choose second column)':
-         second_col = st.selectbox("Second column for scatter", options=[c for c in cleaned_df.columns if c != viz_col])

-     if st.button("Show visualization"):
-         fig = plt.figure(figsize=(8, 5))
          try:
              if viz_type == 'Histogram':
-                 series = pd.to_numeric(cleaned_df[viz_col], errors='coerce')
-                 series.dropna(inplace=True)
-                 plt.hist(series, bins='auto')
-                 plt.title(f'Histogram — {viz_col}')
              elif viz_type == 'Boxplot':
-                 series = pd.to_numeric(cleaned_df[viz_col], errors='coerce')
-                 sns.boxplot(x=series)
-                 plt.title(f'Boxplot — {viz_col}')
              elif viz_type == 'Bar (categorical)':
-                 counts = cleaned_df[viz_col].astype(str).value_counts().head(30)
-                 sns.barplot(x=counts.values, y=counts.index)
-                 plt.title(f'Bar chart — {viz_col}')
-             elif viz_type == 'Scatter (choose second column)':
-                 x = pd.to_numeric(cleaned_df[viz_col], errors='coerce')
-                 y = pd.to_numeric(cleaned_df[second_col], errors='coerce')
-                 mask = x.notna() & y.notna()
-                 plt.scatter(x[mask], y[mask], alpha=0.6)
-                 plt.xlabel(viz_col)
-                 plt.ylabel(second_col)
-                 plt.title(f'Scatter — {viz_col} vs {second_col}')
              elif viz_type == 'Correlation heatmap':
-                 numeric = cleaned_df.select_dtypes(include=[np.number])
-                 corr = numeric.corr()
-                 sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
-                 plt.title('Correlation heatmap (numeric features)')
              st.pyplot(fig)
          except Exception as e:
-             st.error(f"Failed to create visualization: {e}")
-
- st.subheader("Ask the LLM for insights (optional)")
317
- user_question = st.text_area("Specific question for the LLM (if empty, a general assessment will be produced)")
318
- if st.button("Get LLM insights"):
319
- with st.spinner("Preparing prompt and calling LLM..."):
320
- prompt = build_dataset_prompt(summary, user_question=user_question if user_question else None)
321
- llm_answer = call_llm(prompt, model=model_choice, max_tokens=max_tokens)
322
- st.subheader("LLM response")
323
- st.write(llm_answer)
324
-
325
- st.subheader("Duplicate & Missing-value helpers")
326
- if st.button("Show duplicate rows (if any)"):
327
- dup = cleaned_df[cleaned_df.duplicated(keep=False)]
328
- if dup.empty:
329
- st.write("No duplicates found")
330
- else:
331
- st.dataframe(dup)
332
- if st.button("Show columns with > 20% missing values"):
333
- thresh = 0.2
334
- miss = (cleaned_df.isna().mean() > thresh)
335
- cols = list(miss[miss].index)
336
- if not cols:
337
- st.write("No columns have more than 20% missing values")
338
- else:
339
- st.write(cols)
340
 
341
- st.markdown("---")
342
- st.markdown("**Deployment notes**: This app is ready to be deployed to Hugging Face Spaces. Add your Hugging Face token to the Space secrets as `HF_TOKEN`. Use a GPU-enabled Space if you want to run large models locally; otherwise the Inference API will run models hosted by Hugging Face via your token.")
 
 
 
 
 
343
 
344
  else:
345
- st.info("Upload a CSV or Excel file to get started.")
346
-
347
- # End of app
 
  # Streamlit Data Analysis App for Hugging Face Spaces
  # Features:
  # - Upload CSV / Excel
+ # - Automatic cleaning & standardization
  # - Preprocessing (imputation, encoding, scaling)
  # - Quick visualizations (histogram, boxplot, scatter, correlation heatmap)
  # - Preview cleaned dataset
+ # - LLM-powered insights using Hugging Face Inference API
+ # - Auto fallback if model access (403) fails
+ # - Uses HF_TOKEN from Streamlit secrets or environment

  import os
  import io
  import streamlit as st
  import pandas as pd
  import numpy as np

  from sklearn.pipeline import Pipeline
  from huggingface_hub import InferenceClient

+ # ---------- CONFIGURATION ----------
  st.set_page_config(page_title="Data Analysis App", layout="wide")

+ # Load HF token
+ HF_TOKEN = st.secrets.get("HF_TOKEN", os.getenv("HF_TOKEN"))
  if not HF_TOKEN:
+     st.warning("⚠️ HF_TOKEN not found. Please add it to your Hugging Face Space secrets or environment.")
+ else:
+     st.success("✅ Hugging Face token loaded successfully.")

+ # Default open-access models
  MODEL_OPTIONS = {
+     "mistralai/Mistral-7B-Instruct-v0.3": "Mistral 7B Instruct (open, strong)",
+     "HuggingFaceH4/zephyr-7b-beta": "Zephyr 7B Beta (open, fluent)",
+     "bigscience/bloom-3b": "Bloom 3B (lightweight, open)"
  }

+ # ---------- UTILITY FUNCTIONS ----------
  def read_file(uploaded_file) -> pd.DataFrame:
+     """Reads uploaded CSV or Excel file."""
      name = uploaded_file.name.lower()
      if name.endswith(('.csv', '.txt')):
          return pd.read_csv(uploaded_file)


  def clean_column_name(col: str) -> str:
+     col = str(col).strip().lower().replace("\n", " ").replace("\t", " ")
      col = "_".join(col.split())
      col = ''.join(c for c in col if (c.isalnum() or c == '_'))
      while '__' in col:
          col = col.replace('__', '_')
      return col


  def standardize_dataframe(df: pd.DataFrame, drop_all_nan_cols: bool = True) -> pd.DataFrame:
+     """Standardizes column names and cleans whitespace."""
      df = df.copy()
      for c in df.select_dtypes(include=['object']).columns:
          df[c] = df[c].apply(lambda x: x.strip() if isinstance(x, str) else x)
      df.columns = [clean_column_name(c) for c in df.columns]
      if drop_all_nan_cols:
          df.dropna(axis=1, how='all', inplace=True)
      for c in df.columns:
          if df[c].dtype == object:
              sample = df[c].dropna().astype(str).head(20)
              if not sample.empty:
                  parsed = pd.to_datetime(sample, errors='coerce')
                  if parsed.notna().sum() / len(sample) > 0.6:
                      df[c] = pd.to_datetime(df[c], errors='coerce')
      return df


+ def summarize_dataframe(df: pd.DataFrame, max_rows: int = 5):
+     """Creates a structured summary of the dataframe."""
+     summary = {'shape': df.shape, 'columns': [], 'preview': df.head(max_rows).to_dict(orient='records')}
      for c in df.columns:
+         info = {'name': c, 'dtype': str(df[c].dtype), 'n_missing': int(df[c].isna().sum()), 'n_unique': int(df[c].nunique(dropna=True))}
          if pd.api.types.is_numeric_dtype(df[c]):
+             info['summary'] = df[c].describe().to_dict()
          elif pd.api.types.is_datetime64_any_dtype(df[c]):
+             info['summary'] = {'min': str(df[c].min()), 'max': str(df[c].max())}
          else:
+             info['top_values'] = df[c].astype(str).value_counts().head(5).to_dict()
+         summary['columns'].append(info)
      return summary


+ def prepare_preprocessing_pipeline(df: pd.DataFrame, impute_strategy_num='median', scale_numeric=True, encode_categorical='onehot'):
+     """Build preprocessing pipeline for numeric and categorical features."""
      numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
      cat_cols = list(df.select_dtypes(include=['object', 'category', 'bool']).columns)
      transformers = []
      if numeric_cols:
+         num_pipe = [('imputer', SimpleImputer(strategy=impute_strategy_num))]
          if scale_numeric:
+             num_pipe.append(('scaler', StandardScaler()))
+         transformers.append(('num', Pipeline(num_pipe), numeric_cols))
      if cat_cols:
          if encode_categorical == 'onehot':
+             cat_pipe = Pipeline([
                  ('imputer', SimpleImputer(strategy='most_frequent')),
                  ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
              ])
          else:
+             cat_pipe = Pipeline([
                  ('imputer', SimpleImputer(strategy='most_frequent')),
                  ('ord', OrdinalEncoder())
              ])
+         transformers.append(('cat', cat_pipe, cat_cols))
+     return ColumnTransformer(transformers), numeric_cols + cat_cols


  def apply_preprocessing(df: pd.DataFrame, preprocessor: ColumnTransformer) -> pd.DataFrame:
+     """Applies preprocessing pipeline and returns processed DataFrame."""
      X = preprocessor.fit_transform(df)
      feature_names = []
+     for name, trans, cols in preprocessor.transformers_:
          if name == 'num':
+             feature_names += cols
          elif name == 'cat':
              try:
+                 ohe = trans.named_steps['onehot']
+                 for col, cats in zip(cols, ohe.categories_):
+                     feature_names += [f"{col}__{c}" for c in cats]
              except Exception:
+                 feature_names += cols
+     return pd.DataFrame(X, columns=feature_names)
+
+
+ # ---------- LLM INTEGRATION ----------
+
+ def build_dataset_prompt(summary, user_question=None):
+     """Builds a prompt for dataset insights."""
+     s = [f"Dataset shape: {summary['shape'][0]} rows, {summary['shape'][1]} columns."]
+     for c in summary['columns']:
+         s.append(f"- {c['name']} ({c['dtype']}) missing={c['n_missing']} unique={c['n_unique']}")
+     s.append("Preview:")
+     for row in summary['preview']:
+         s.append(str(row))
      if user_question:
+         s.append(f"User question: {user_question}")
      else:
+         s.append("Please give a dataset summary, patterns, and visualization suggestions.")
+     return "\n".join(s)


+ def call_llm(prompt: str, model: str, max_tokens: int = 512) -> str:
+     """Calls the Hugging Face Inference API with error handling and fallback."""
      if not HF_TOKEN:
+         return "⚠️ No Hugging Face token found."
      client = InferenceClient(token=HF_TOKEN)
      try:
          response = client.text_generation(prompt, model=model, max_new_tokens=max_tokens)
+         if isinstance(response, dict):
              return response.get('generated_text', str(response))
+         return str(response)
      except Exception as e:
+         if "403" in str(e):
+             fallback = "mistralai/Mistral-7B-Instruct-v0.3"
+             if model != fallback:
+                 try:
+                     st.warning(f"🚫 Access denied to {model}. Falling back to {fallback}...")
+                     response = client.text_generation(prompt, model=fallback, max_new_tokens=max_tokens)
+                     if isinstance(response, dict):
+                         return response.get('generated_text', str(response))
+                     return str(response)
+                 except Exception as e2:
+                     return f"❌ Fallback model also failed: {e2}"
+             return "🚫 Access denied (403). Try using an open-access model like Mistral or Zephyr."
+         return f"❌ LLM call failed: {e}"
+
+ # ---------- STREAMLIT UI ----------
+
+ st.title("📊 Data Analysis & Cleaning App (Hugging Face + Streamlit)")
+ st.markdown("Upload CSV or Excel files, clean, preprocess, visualize, and generate insights using an LLM.")

  with st.sidebar:
+     st.header("⚙️ Options")
+     model_choice = st.selectbox("Select LLM model", options=list(MODEL_OPTIONS.keys()), format_func=lambda k: MODEL_OPTIONS[k])
+     max_tokens = st.slider("LLM max tokens", 128, 1024, 512, 64)
      impute_strategy_num = st.selectbox("Numeric imputation", ['mean', 'median', 'most_frequent'])
      encode_categorical = st.selectbox("Categorical encoding", ['onehot', 'ordinal'])
+     scale_numeric = st.checkbox("Scale numeric features", True)
+     show_raw_preview = st.checkbox("Show raw preview", True)

+ uploaded_file = st.file_uploader("📂 Upload your CSV or Excel file", type=['csv', 'xls', 'xlsx', 'txt'])

  if uploaded_file:
+     with st.spinner("Reading file..."):
+         raw_df = read_file(uploaded_file)

      if show_raw_preview:
+         st.subheader("Raw Data Preview")
+         st.dataframe(raw_df.head())

+     st.subheader("Data Cleaning & Standardization")
+     cleaned_df = standardize_dataframe(raw_df)
+     st.write(f"✅ Cleaned data shape: {cleaned_df.shape}")
+     st.dataframe(cleaned_df.head())

+     st.subheader("Summary")
+     summary = summarize_dataframe(cleaned_df)
+     st.write(f"Shape: {summary['shape']}")
+     st.json(summary['columns'])

+     st.subheader("Preprocessing")
+     if st.button("Generate Preprocessing Pipeline"):
+         preproc, _ = prepare_preprocessing_pipeline(cleaned_df, impute_strategy_num, scale_numeric, encode_categorical)
+         processed_df = apply_preprocessing(cleaned_df, preproc)
+         st.success("Preprocessing complete!")
+         st.dataframe(processed_df.head())
+         st.download_button("⬇️ Download Processed CSV", processed_df.to_csv(index=False), "processed_data.csv")
+
+     st.subheader("Visualizations")
+     viz_col = st.selectbox("Select column", options=cleaned_df.columns)
+     viz_type = st.selectbox("Visualization type", ['Histogram', 'Boxplot', 'Bar (categorical)', 'Scatter', 'Correlation heatmap'])
+
+     if viz_type == 'Scatter':
+         second_col = st.selectbox("Second column", options=[c for c in cleaned_df.columns if c != viz_col])
+
+     if st.button("Show Visualization"):
+         fig, ax = plt.subplots(figsize=(8, 5))
          try:
              if viz_type == 'Histogram':
+                 sns.histplot(cleaned_df[viz_col], kde=True, ax=ax)
              elif viz_type == 'Boxplot':
+                 sns.boxplot(x=cleaned_df[viz_col], ax=ax)
              elif viz_type == 'Bar (categorical)':
+                 counts = cleaned_df[viz_col].astype(str).value_counts().head(20)
+                 sns.barplot(x=counts.values, y=counts.index, ax=ax)
+             elif viz_type == 'Scatter':
+                 sns.scatterplot(x=cleaned_df[viz_col], y=cleaned_df[second_col], ax=ax)
              elif viz_type == 'Correlation heatmap':
+                 corr = cleaned_df.select_dtypes(include=[np.number]).corr()
+                 sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
              st.pyplot(fig)
          except Exception as e:
+             st.error(f"Visualization failed: {e}")

+ st.subheader("🧠 Ask the LLM for Insights")
251
+ user_q = st.text_area("Enter your question (optional):")
252
+ if st.button("Get Insights"):
253
+ with st.spinner("Generating insights..."):
254
+ prompt = build_dataset_prompt(summary, user_q if user_q else None)
255
+ llm_resp = call_llm(prompt, model_choice, max_tokens)
256
+ st.write(llm_resp)
257
 
258
  else:
259
+ st.info("πŸ“₯ Upload a file to begin.")