Starberry15 committed on
Commit
4335246
·
verified ·
1 Parent(s): 62e465d

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +335 -100
src/streamlit_app.py CHANGED
@@ -1,112 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
- import time
3
- import pandas as pd
 
 
4
  import streamlit as st
5
- from io import StringIO
6
- from dotenv import load_dotenv
7
- from huggingface_hub import InferenceClient, login
 
 
 
 
 
 
8
 
9
- # ==========================================================
10
- # πŸ” Load environment + authenticate
11
- # ==========================================================
12
- load_dotenv()
13
- HF_TOKEN = os.getenv("HF_TOKEN")
14
 
 
 
 
 
 
 
15
  if not HF_TOKEN:
16
- st.error("❌ Missing Hugging Face token. Please set HF_TOKEN in your .env file.")
17
- else:
18
- login(token=HF_TOKEN)
19
-
20
- # Create HF clients
21
- cleaner_client = InferenceClient(model="Qwen/Qwen2.5-Coder-14B", token=HF_TOKEN)
22
- analyst_client = InferenceClient(model="Qwen/Qwen2.5-14B-Instruct", token=HF_TOKEN)
23
-
24
- # ==========================================================
25
- # πŸŽ›οΈ App Layout
26
- # ==========================================================
27
- st.set_page_config(page_title="🧹 Smart Data Analysis", page_icon="πŸ“Š", layout="wide")
28
- st.title("πŸ“Š Smart Data Analysis Assistant")
29
- st.caption("Clean messy data, then run AI-powered insights and statistical analysis β€” all locally with open-source models.")
30
-
31
- # ==========================================================
32
- # πŸ“ Upload CSV
33
- # ==========================================================
34
- uploaded_file = st.file_uploader("πŸ“€ Upload your CSV dataset", type=["csv"])
35
- if uploaded_file:
36
- df_raw = pd.read_csv(uploaded_file)
37
- st.subheader("πŸ“„ Raw Data Preview")
38
- st.dataframe(df_raw.head())
39
-
40
- # ==========================================================
41
- # 🧹 Data Cleaning
42
- # ==========================================================
43
- if st.button("🧹 Clean Data using Qwen Coder 14B"):
44
- with st.spinner("Cleaning data... please wait ⏳"):
45
- try:
46
- # Convert DataFrame to text for cleaning
47
- csv_text = df_raw.to_csv(index=False)
48
-
49
- prompt = f"""
50
- You are a Python data cleaning assistant.
51
- Clean this dataset and fix inconsistent column names, missing values, and formatting.
52
- Return a clean CSV version that can be loaded into pandas directly.
53
-
54
- Dataset:
55
- {csv_text}
56
- """
57
-
58
- response = cleaner_client.text_generation(
59
- prompt,
60
- temperature=0.2,
61
- max_new_tokens=2048,
62
- )
63
-
64
- cleaned_csv = response.strip().split("```")[-1] # extract text
65
- df_cleaned = pd.read_csv(StringIO(cleaned_csv))
66
-
67
- st.session_state.cleaned_df = df_cleaned
68
- st.success("βœ… Data cleaned successfully!")
69
- st.dataframe(df_cleaned.head())
70
-
71
- except Exception as e:
72
- st.error(f"⚠️ Cleaning failed: {e}")
73
-
74
- # ==========================================================
75
- # πŸ“Š Data Analysis
76
- # ==========================================================
77
- if "cleaned_df" in st.session_state:
78
- df = st.session_state.cleaned_df
79
- st.divider()
80
- st.subheader("πŸ“ˆ AI Data Analysis")
81
-
82
- user_query = st.text_area("Ask about your data:", placeholder="e.g., What is the correlation between experience and salary?")
83
- if st.button("πŸ” Analyze"):
84
- with st.spinner("Analyzing with Qwen 14B Instruct..."):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  try:
86
- csv_excerpt = df.head(30).to_csv(index=False)
87
- analysis_prompt = f"""
88
- You are a data analyst. Analyze this dataset and answer the question.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- Data sample (CSV):
91
- {csv_excerpt}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
- Question:
94
- {user_query}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
- Instructions:
97
- - Be accurate and concise.
98
- - If numerical analysis is relevant, describe it.
99
- - Use markdown for readability.
100
- """
101
 
102
- response = analyst_client.text_generation(
103
- analysis_prompt,
104
- temperature=0.5,
105
- max_new_tokens=1024,
106
- )
107
 
108
- st.markdown("### 🧠 Analysis Result")
109
- st.write(response.strip())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
- except Exception as e:
112
- st.error(f"⚠️ Analysis failed: {e}")
 
1
+ # streamlit_data_analysis_app.py
2
+ # Streamlit Data Analysis App for Hugging Face Spaces
3
+ # Features:
4
+ # - Upload CSV / Excel
5
+ # - Automatic cleaning & standardization (column names, missing values, dtypes)
6
+ # - Preprocessing (imputation, encoding, scaling)
7
+ # - Quick visualizations (histogram, boxplot, scatter, correlation heatmap)
8
+ # - Preview cleaned dataset
9
+ # - LLM-powered dataset summary & insights using Hugging Face Inference API
10
+ # - Uses HF_TOKEN from Streamlit secrets (or environment variable)
11
+
12
  import os
13
+ import io
14
+ import math
15
+ from typing import Optional, Tuple, List, Dict
16
+
17
  import streamlit as st
18
+ import pandas as pd
19
+ import numpy as np
20
+ import matplotlib.pyplot as plt
21
+ import seaborn as sns
22
+ from sklearn.impute import SimpleImputer
23
+ from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
24
+ from sklearn.compose import ColumnTransformer
25
+ from sklearn.pipeline import Pipeline
26
+ from huggingface_hub import InferenceClient
27
 
28
# ---------- Configuration ----------
st.set_page_config(page_title="Data Analysis App", layout="wide")

# Resolve the Hugging Face token: Streamlit secrets first, then the
# HF_TOKEN environment variable.
try:
    HF_TOKEN = st.secrets.get("HF_TOKEN")
except Exception:
    # Any failure while reading secrets is treated as "no token there".
    HF_TOKEN = None
HF_TOKEN = HF_TOKEN or os.getenv("HF_TOKEN")

# Model id -> label shown in the sidebar selectbox.
MODEL_OPTIONS = {
    "bigscience/bloomz-7b1": "BloomZ 7B (instruction-tuned)",
    "tiiuae/falcon-7b-instruct": "Falcon 7B Instruct",
    "bigscience/bloom-3b": "Bloom 3B (lighter)",
}
46
+
47
+ # ---------- Utility functions ----------
48
+
49
def read_file(uploaded_file: "st.runtime.uploaded_file_manager.UploadedFile") -> pd.DataFrame:
    """Load an uploaded CSV/text or Excel file into a DataFrame.

    The annotation is a string on purpose: it is never evaluated, so the
    function definition does not depend on where Streamlit exposes its
    ``UploadedFile`` class.

    Args:
        uploaded_file: File-like object with a ``name`` attribute; the
            extension of ``name`` (case-insensitive) selects the parser.

    Returns:
        The parsed table.

    Raises:
        ValueError: If the extension is not .csv, .txt, .xls or .xlsx.
    """
    name = uploaded_file.name.lower()
    if name.endswith(('.csv', '.txt')):
        return pd.read_csv(uploaded_file)
    if name.endswith(('.xls', '.xlsx')):
        return pd.read_excel(uploaded_file)
    raise ValueError("Unsupported file type. Please upload CSV or Excel.")
57
+
58
+
59
def clean_column_name(col: str) -> str:
    """Normalize a column label into a lowercase, underscore-separated name.

    The label is stringified, trimmed and lowercased; runs of whitespace
    become a single underscore; characters that are neither alphanumeric
    nor an underscore are removed; consecutive underscores are collapsed.
    The result can be empty when the label has no alphanumeric characters.
    """
    text = str(col).strip().replace("\n", " ").replace("\t", " ").lower()
    # split() with no argument splits on any whitespace run.
    joined = "_".join(text.split())

    kept = []
    for ch in joined:
        if ch == "_":
            # Skip an underscore that would directly follow another one;
            # this also collapses runs created by dropped characters.
            if kept and kept[-1] == "_":
                continue
            kept.append(ch)
        elif ch.isalnum():
            kept.append(ch)
    return "".join(kept)
71
+
72
+
73
def standardize_dataframe(df: pd.DataFrame, drop_all_nan_cols: bool = True) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. Column names are normalized with ``clean_column_name`` and made
         unique (empty results become ``column_<position>``, collisions get
         a numeric suffix), so per-column access below always yields a
         Series.
      2. Leading/trailing whitespace is stripped from string cells in
         object columns.
      3. Columns that are entirely missing are dropped when
         ``drop_all_nan_cols`` is true.
      4. Object columns whose first 20 non-null values mostly (> 60%) parse
         as dates are converted with ``pd.to_datetime(errors='coerce')``.

    Args:
        df: Raw table.
        drop_all_nan_cols: Drop columns that contain only missing values.

    Returns:
        The standardized DataFrame.
    """
    df = df.copy()

    # Two different raw labels can clean to the same name (e.g. "A b" and
    # "a_b"); duplicates would make df[name] return a DataFrame.
    used = set()
    unique_names = []
    for position, raw_name in enumerate(df.columns):
        base = clean_column_name(raw_name) or f"column_{position}"
        candidate, suffix = base, 1
        while candidate in used:
            candidate = f"{base}_{suffix}"
            suffix += 1
        used.add(candidate)
        unique_names.append(candidate)
    df.columns = unique_names

    # strip whitespace from string cells
    for c in df.select_dtypes(include=['object']).columns:
        df[c] = df[c].apply(lambda x: x.strip() if isinstance(x, str) else x)

    if drop_all_nan_cols:
        df = df.dropna(axis=1, how='all')

    # try to parse datetime columns heuristically
    for c in df.columns:
        if df[c].dtype != object:
            continue
        sample = df[c].dropna().astype(str).head(20)
        if sample.empty:
            continue
        parsed = pd.to_datetime(sample, errors='coerce')
        # Plain numbers such as "12" can be accepted by the date parser;
        # do not count them as evidence that the column holds dates.
        numeric_like = pd.to_numeric(sample, errors='coerce').notna()
        date_like = parsed.notna() & ~numeric_like
        if date_like.sum() / len(sample) > 0.6:
            df[c] = pd.to_datetime(df[c], errors='coerce')
    return df
93
+
94
+
95
def summarize_dataframe(df: pd.DataFrame, max_rows: int = 5) -> Dict:
    """Build a plain-dict description of ``df``.

    Returns a dict with:
      - 'shape': ``df.shape``
      - 'columns': one entry per column with 'name', 'dtype', 'n_missing'
        and 'n_unique', plus either 'summary' (``describe()`` values as
        floats for numeric columns, min/max strings for datetime columns)
        or 'top_values' (the five most frequent values, as strings, for
        everything else)
      - 'preview': the first ``max_rows`` rows as a list of records
    """
    column_reports = []
    for label in df.columns:
        series = df[label]
        report = {
            'name': label,
            'dtype': str(series.dtype),
            'n_missing': int(series.isna().sum()),
            'n_unique': int(series.nunique(dropna=True)),
        }
        if pd.api.types.is_numeric_dtype(series):
            stats = series.describe().to_dict()
            report['summary'] = {stat: float(value) for stat, value in stats.items()}
        elif pd.api.types.is_datetime64_any_dtype(series):
            report['summary'] = {'min': str(series.min()), 'max': str(series.max())}
        else:
            frequencies = series.dropna().astype(str).value_counts()
            report['top_values'] = frequencies.head(5).to_dict()
        column_reports.append(report)

    return {
        'shape': df.shape,
        'columns': column_reports,
        'preview': df.head(max_rows).to_dict(orient='records'),
    }
119
+
120
+
121
def prepare_preprocessing_pipeline(df: pd.DataFrame, impute_strategy_num='median', scale_numeric=True, encode_categorical='onehot') -> Tuple[ColumnTransformer, List[str]]:
    """Build an unfitted ColumnTransformer for the columns of ``df``.

    Numeric columns are imputed with ``impute_strategy_num`` and optionally
    standard-scaled. Object/category/bool columns are imputed with the most
    frequent value and then one-hot encoded (``encode_categorical ==
    'onehot'``) or ordinal encoded (any other value).

    Args:
        df: Frame whose dtypes decide which columns go to which transformer.
        impute_strategy_num: ``SimpleImputer`` strategy for numeric columns.
        scale_numeric: Append a ``StandardScaler`` to the numeric pipeline.
        encode_categorical: ``'onehot'`` or anything else for ordinal.

    Returns:
        ``(preprocessor, columns)`` where ``columns`` lists the numeric,
        categorical and datetime column names, in that order. Datetime
        columns are listed but no transformer is registered for them, so
        ``remainder='drop'`` discards them.
    """
    numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
    cat_cols = list(df.select_dtypes(include=['object', 'category', 'bool']).columns)
    datetime_cols = list(df.select_dtypes(include=['datetime64']).columns)

    transformers = []
    if numeric_cols:
        num_steps = [('imputer', SimpleImputer(strategy=impute_strategy_num))]
        if scale_numeric:
            num_steps.append(('scaler', StandardScaler()))
        transformers.append(('num', Pipeline(steps=num_steps), numeric_cols))
    if cat_cols:
        if encode_categorical == 'onehot':
            # scikit-learn renamed ``sparse`` to ``sparse_output`` and later
            # removed the old keyword; try the new spelling first and fall
            # back for older installations.
            try:
                encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
            except TypeError:
                encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
            encoder_step = ('onehot', encoder)
        else:
            encoder_step = ('ord', OrdinalEncoder())
        cat_pipeline = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            encoder_step,
        ])
        transformers.append(('cat', cat_pipeline, cat_cols))

    preprocessor = ColumnTransformer(transformers=transformers, remainder='drop')
    return preprocessor, numeric_cols + cat_cols + datetime_cols
149
+
150
+
151
def apply_preprocessing(df: pd.DataFrame, preprocessor: ColumnTransformer) -> pd.DataFrame:
    """Fit ``preprocessor`` on ``df`` and return the result as a DataFrame.

    Column labels are rebuilt from ``preprocessor.transformers_``:
    one-hot encoded columns become ``<column>__<category>``; every other
    transformer keeps its input column names.

    Args:
        df: Input frame.
        preprocessor: Unfitted transformer; it is fitted in place.

    Returns:
        DataFrame holding the transformed values with a default index.
    """
    X = preprocessor.fit_transform(df)

    feature_names: List[str] = []
    for name, trans, columns in preprocessor.transformers_:
        # Entries whose transformer is the string 'drop' (including the
        # fitted 'remainder' entry) emit no output columns, so they must
        # not contribute names; otherwise the label count exceeds X's width.
        if isinstance(trans, str) and trans == 'drop':
            continue
        if name == 'cat':
            steps = getattr(trans, 'named_steps', None)
            ohe = steps.get('onehot') if steps is not None else None
            if ohe is not None:
                for col, catvals in zip(columns, ohe.categories_):
                    feature_names.extend(f"{col}__{v}" for v in catvals)
                continue
            # No one-hot step (ordinal encoding): one output per input column.
        feature_names.extend(columns)

    return pd.DataFrame(X, columns=feature_names)
174
+
175
+ # ---------- LLM helper ----------
176
+
177
def build_dataset_prompt(summary: Dict, user_question: Optional[str] = None) -> str:
    """Render a dataset summary as a newline-joined text prompt for the LLM.

    ``summary`` is read through the keys 'shape', 'columns' (each entry with
    'name', 'dtype', 'n_missing', 'n_unique' and optionally 'summary' /
    'top_values') and 'preview'. When ``user_question`` is empty or None,
    a fixed request for a general assessment is appended instead.
    """
    n_rows = summary['shape'][0]
    n_cols = summary['shape'][1]
    lines = [
        "You are a helpful data analyst assistant. I will give you a dataset summary and ask for insights and next steps.",
        f"Dataset shape: {n_rows} rows, {n_cols} columns.",
        "Columns:",
    ]

    # Optional per-column details, emitted in this order when present.
    optional_details = (('summary', " summary: "), ('top_values', " top values: "))
    for info in summary['columns']:
        lines.append(f"- {info['name']} (dtype: {info['dtype']}; missing: {info['n_missing']}; unique: {info['n_unique']})")
        for key, label in optional_details:
            if key in info:
                lines.append(f"{label}{info[key]}")

    lines.append("Preview of top rows:")
    lines.extend(str(row) for row in summary['preview'])

    if user_question:
        closing = "User question: " + user_question
    else:
        closing = "Please provide: 1) quick dataset quality assessment, 2) columns of interest, 3) suggested cleaning steps, 4) recommended visualizations and quick findings, 5) suggested next steps for modeling or analysis."
    lines.append(closing)
    return "\n".join(lines)
198
+
199
 
200
def call_llm(prompt: str, model: str = 'bigscience/bloomz-7b1', max_tokens: int = 512) -> str:
    """Send ``prompt`` to a hosted text-generation model and return its reply.

    Never raises: a missing token or any failure during the call is
    reported as a human-readable string so the UI can display it directly.

    Args:
        prompt: Full prompt text.
        model: Hugging Face model id to query.
        max_tokens: Upper bound passed as ``max_new_tokens``.

    Returns:
        The generated text, or an ``ERROR: ...`` / ``LLM call failed: ...``
        message.
    """
    if not HF_TOKEN:
        return "ERROR: HF_TOKEN not found. Put your Hugging Face token in Streamlit secrets under 'HF_TOKEN' or set the HF_TOKEN environment variable."
    client = InferenceClient(token=HF_TOKEN)
    try:
        # The prompt is the first positional parameter of text_generation;
        # passing it as ``inputs=`` is rejected with a TypeError.
        response = client.text_generation(prompt, model=model, max_new_tokens=max_tokens)
        # A plain string is expected; the list/dict branches are kept as a
        # defensive fallback for other response shapes.
        if isinstance(response, list):
            return response[0].get('generated_text', str(response))
        if isinstance(response, dict):
            return response.get('generated_text', str(response))
        return str(response)
    except Exception as e:
        return f"LLM call failed: {e}"
216
 
217
# ---------- Streamlit UI ----------

# Page heading; the em dash was mis-encoded in the title string and is restored.
st.title("Data Analysis & Cleaning App — Streamlit (Deployable to Hugging Face Spaces)")
st.markdown("Upload a CSV or Excel file, clean it, preprocess, preview cleaned data, visualize quickly, and ask an LLM for insights.")

# Sidebar controls; the values are read by the main page flow below.
with st.sidebar:
    st.header("Options")
    model_choice = st.selectbox(
        "LLM model (Inference API)",
        options=list(MODEL_OPTIONS.keys()),
        format_func=lambda k: MODEL_OPTIONS[k],
    )
    max_tokens = st.slider("LLM max tokens", min_value=128, max_value=1024, value=512, step=64)
    impute_strategy_num = st.selectbox("Numeric imputation", ['mean', 'median', 'most_frequent'])
    encode_categorical = st.selectbox("Categorical encoding", ['onehot', 'ordinal'])
    scale_numeric = st.checkbox("Scale numeric features", value=True)
    show_raw_preview = st.checkbox("Show raw preview (before cleaning)", value=True)

uploaded_file = st.file_uploader("Upload CSV or Excel file", type=['csv', 'xls', 'xlsx', 'txt'])
232
+
233
if uploaded_file:
    # --- Load -------------------------------------------------------------
    try:
        with st.spinner("Reading file..."):
            raw_df = read_file(uploaded_file)
    except Exception as e:
        st.error(f"Failed to read file: {e}")
        st.stop()

    if show_raw_preview:
        st.subheader("Raw data preview")
        st.dataframe(raw_df.head(10))

    # --- Clean ------------------------------------------------------------
    st.subheader("Cleaning & Standardization")
    drop_all_nan_cols = st.checkbox("Drop columns with all missing values", value=True)
    cleaned_df = standardize_dataframe(raw_df, drop_all_nan_cols=drop_all_nan_cols)
    st.write(f"Data after standardization — shape: {cleaned_df.shape}")
    st.dataframe(cleaned_df.head(10))

    # --- Summarize --------------------------------------------------------
    st.subheader("Quick data summary")
    summary = summarize_dataframe(cleaned_df, max_rows=5)
    col1, col2 = st.columns([2, 1])
    with col1:
        st.write(f"**Shape:** {summary['shape']}")
        st.write("**Columns:**")
        for c in summary['columns']:
            st.markdown(f"- **{c['name']}** — dtype: {c['dtype']} — missing: {c['n_missing']} — unique: {c['n_unique']}")
    with col2:
        st.write("**Preview (head)**")
        st.table(pd.DataFrame(summary['preview']))

    # --- Preprocess -------------------------------------------------------
    st.subheader("Preprocessing")
    if st.button("Generate preprocessing pipeline and preview processed data"):
        # The second return value (column list) is not needed here.
        preprocessor, _ = prepare_preprocessing_pipeline(
            cleaned_df,
            impute_strategy_num=impute_strategy_num,
            scale_numeric=scale_numeric,
            encode_categorical=encode_categorical,
        )
        try:
            proc_df = apply_preprocessing(cleaned_df, preprocessor)
            st.success("Preprocessing applied — showing preview")
            st.dataframe(proc_df.head(10))
            st.markdown(f"Processed feature count: **{proc_df.shape[1]}**")
            csv = proc_df.to_csv(index=False)
            st.download_button("Download processed CSV", data=csv, file_name="processed_data.csv")
        except Exception as e:
            st.error(f"Failed to process dataset: {e}")

    # --- Visualize --------------------------------------------------------
    st.subheader("Quick visualizations")
    viz_col = st.selectbox("Select column for visualization (numeric or categorical)", options=list(cleaned_df.columns))
    viz_type = st.selectbox("Chart type", ['Histogram', 'Boxplot', 'Bar (categorical)', 'Scatter (choose second column)', 'Correlation heatmap'])

    # second_col only exists when the scatter chart type is selected; it is
    # only read inside the matching branch below.
    if viz_type == 'Scatter (choose second column)':
        second_col = st.selectbox("Second column for scatter", options=[c for c in cleaned_df.columns if c != viz_col])

    if st.button("Show visualization"):
        fig = plt.figure(figsize=(8, 5))
        try:
            if viz_type == 'Histogram':
                series = pd.to_numeric(cleaned_df[viz_col], errors='coerce').dropna()
                plt.hist(series, bins='auto')
                plt.title(f'Histogram — {viz_col}')
            elif viz_type == 'Boxplot':
                series = pd.to_numeric(cleaned_df[viz_col], errors='coerce')
                sns.boxplot(x=series)
                plt.title(f'Boxplot — {viz_col}')
            elif viz_type == 'Bar (categorical)':
                counts = cleaned_df[viz_col].astype(str).value_counts().head(30)
                sns.barplot(x=counts.values, y=counts.index)
                plt.title(f'Bar chart — {viz_col}')
            elif viz_type == 'Scatter (choose second column)':
                x = pd.to_numeric(cleaned_df[viz_col], errors='coerce')
                y = pd.to_numeric(cleaned_df[second_col], errors='coerce')
                mask = x.notna() & y.notna()
                plt.scatter(x[mask], y[mask], alpha=0.6)
                plt.xlabel(viz_col)
                plt.ylabel(second_col)
                plt.title(f'Scatter — {viz_col} vs {second_col}')
            elif viz_type == 'Correlation heatmap':
                numeric = cleaned_df.select_dtypes(include=[np.number])
                corr = numeric.corr()
                sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
                plt.title('Correlation heatmap (numeric features)')
            st.pyplot(fig)
        except Exception as e:
            st.error(f"Failed to create visualization: {e}")
        finally:
            # pyplot keeps every figure alive until it is closed; release
            # it so figures do not pile up across script reruns.
            plt.close(fig)

    # --- LLM insights -----------------------------------------------------
    st.subheader("Ask the LLM for insights (optional)")
    user_question = st.text_area("Specific question for the LLM (if empty, a general assessment will be produced)")
    if st.button("Get LLM insights"):
        with st.spinner("Preparing prompt and calling LLM..."):
            prompt = build_dataset_prompt(summary, user_question=user_question if user_question else None)
            llm_answer = call_llm(prompt, model=model_choice, max_tokens=max_tokens)
        st.subheader("LLM response")
        st.write(llm_answer)

    # --- Data-quality helpers ---------------------------------------------
    st.subheader("Duplicate & Missing-value helpers")
    if st.button("Show duplicate rows (if any)"):
        # keep=False marks every member of a duplicate group, not just repeats.
        dup = cleaned_df[cleaned_df.duplicated(keep=False)]
        if dup.empty:
            st.write("No duplicates found")
        else:
            st.dataframe(dup)
    if st.button("Show columns with > 20% missing values"):
        thresh = 0.2
        miss = (cleaned_df.isna().mean() > thresh)
        cols = list(miss[miss].index)
        if not cols:
            st.write("No columns have more than 20% missing values")
        else:
            st.write(cols)

    st.markdown("---")
    st.markdown("**Deployment notes**: This app is ready to be deployed to Hugging Face Spaces. Add your Hugging Face token to the Space secrets as `HF_TOKEN`. Use a GPU-enabled Space if you want to run large models locally; otherwise the Inference API will run models hosted by Hugging Face via your token.")

else:
    st.info("Upload a CSV or Excel file to get started.")
346
 
347
+ # End of app