Gumball2k5 committed on
Commit
97fedf3
·
verified ·
1 Parent(s): 80e45f3

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +103 -114
src/streamlit_app.py CHANGED
@@ -4,21 +4,51 @@ import numpy as np
4
  import matplotlib.pyplot as plt
5
 
6
  # =========================
7
- # Utility: Load CSV with auto header detection
8
  # =========================
9
 
10
  def load_csv_auto(uploaded_file):
11
  """
12
- Load CSV
 
 
13
  """
14
- uploaded_file.seek(0)
15
 
 
16
  try:
17
- df = pd.read_csv(uploaded_file)
18
- except Exception:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  uploaded_file.seek(0)
20
- df = pd.read_csv(uploaded_file, sep=None, engine="python")
 
 
 
21
 
 
 
 
 
 
 
22
  return df
23
 
24
  # =========================
@@ -35,12 +65,8 @@ st.set_page_config(
35
  # =========================
36
  st.markdown("""
37
  <style>
38
- .main {
39
- background-color: #f9fafc;
40
- }
41
- h1, h2, h3 {
42
- color: #2c3e50;
43
- }
44
  .step-box {
45
  background-color: #ffffff;
46
  padding: 1.5rem;
@@ -48,12 +74,6 @@ h1, h2, h3 {
48
  box-shadow: 0 4px 12px rgba(0,0,0,0.05);
49
  margin-bottom: 1.5rem;
50
  }
51
- .info-box {
52
- background-color: #eef4ff;
53
- padding: 1rem;
54
- border-left: 6px solid #4c6ef5;
55
- border-radius: 6px;
56
- }
57
  .warning-box {
58
  background-color: #fff4e6;
59
  padding: 1rem;
@@ -64,32 +84,15 @@ h1, h2, h3 {
64
  """, unsafe_allow_html=True)
65
 
66
  # =========================
67
- # Title Section
68
  # =========================
69
- st.title("🔍 COPOD – Interactive Outlier Detection Demo")
70
-
71
- st.markdown("""
72
- <div class="info-box">
73
- <b>COPOD</b> (Copula-Based Outlier Detection) là thuật toán:
74
- <ul>
75
- <li>Không cần hyperparameter</li>
76
- <li>Nhanh</li>
77
- <li>Có khả năng giải thích theo từng chiều</li>
78
- </ul>
79
- Ứng dụng này minh họa cả <b>điểm mạnh</b> và <b>điểm yếu</b> của COPOD.
80
- </div>
81
- """, unsafe_allow_html=True)
82
 
83
  # =========================
84
  # Sidebar
85
  # =========================
86
  st.sidebar.header("⚙️ Control Panel")
87
-
88
- uploaded_file = st.sidebar.file_uploader(
89
- "📂 Upload CSV file",
90
- type=["csv"]
91
- )
92
-
93
  run_copod = st.sidebar.button("▶️ Run COPOD")
94
  show_outlier_graph = st.sidebar.button("📊 Show Outlier Graph")
95
  show_corr_failure = st.sidebar.button("⚠️ Show Correlation Failure")
@@ -99,7 +102,6 @@ show_corr_failure = st.sidebar.button("⚠️ Show Correlation Failure")
99
  # =========================
100
  if "df" not in st.session_state:
101
  st.session_state.df = None
102
-
103
  if "scores" not in st.session_state:
104
  st.session_state.scores = None
105
 
@@ -110,18 +112,14 @@ st.markdown("<div class='step-box'>", unsafe_allow_html=True)
110
  st.subheader("🟢 Step 1: Upload Dataset")
111
 
112
  if uploaded_file is not None:
 
113
  df = load_csv_auto(uploaded_file)
114
  st.session_state.df = df
115
 
116
- st.success("Dataset loaded successfully!")
117
  st.dataframe(df.head())
118
  else:
119
- st.markdown("""
120
- <div class="warning-box">
121
- Please upload a CSV file to begin.
122
- </div>
123
- """, unsafe_allow_html=True)
124
-
125
  st.markdown("</div>", unsafe_allow_html=True)
126
 
127
  # =========================
@@ -134,36 +132,46 @@ if run_copod:
134
  if st.session_state.df is None:
135
  st.warning("Upload data first.")
136
  else:
137
- df = st.session_state.df.copy()
138
 
139
- for col in df.columns:
140
- try:
141
- df[col] = pd.to_numeric(df[col], errors='coerce')
142
- except:
143
- pass
144
-
145
- df = df.dropna(axis=1, how='all')
146
- df = df.fillna(0)
 
147
 
148
- X = df.select_dtypes(include=[np.number])
149
 
150
  if X.shape[1] == 0:
151
- st.error("Dataset has no numeric columns. Please check your CSV format.")
152
- st.write("Current Data Types:", df.dtypes)
 
153
  else:
154
- # PLACEHOLDER SCORES
155
- scores = np.random.rand(len(X)) * 10
156
- df["outlier_score"] = scores
157
-
158
- st.session_state.df = df
159
- st.session_state.scores = scores
160
-
161
- st.success("COPOD completed (placeholder).")
162
-
163
- st.markdown("**Top potential outliers:**")
164
- st.dataframe(
165
- df.sort_values("outlier_score", ascending=False).head(10)
166
- )
 
 
 
 
 
 
 
 
167
 
168
  st.markdown("</div>", unsafe_allow_html=True)
169
 
@@ -175,56 +183,37 @@ st.subheader("🟣 Step 3: Visual Analysis")
175
 
176
  col1, col2 = st.columns(2)
177
 
178
- # --- Outlier Graph ---
179
  with col1:
180
  if show_outlier_graph:
181
- if st.session_state.scores is None:
182
- st.warning("Run COPOD first.")
183
- else:
184
- st.markdown("**📊 Outlier Score Distribution**")
185
-
186
  fig, ax = plt.subplots()
187
- ax.hist(st.session_state.scores, bins=30)
188
- ax.set_xlabel("Outlier Score")
189
- ax.set_ylabel("Count")
190
-
191
  st.pyplot(fig)
 
 
192
 
193
- st.caption(
194
- "Higher score → more likely to be an outlier (tail probability)."
195
- )
196
-
197
- # --- Correlation Failure ---
198
  with col2:
199
  if show_corr_failure:
200
- if st.session_state.df is None:
201
- st.warning("Upload data first.")
202
- else:
203
- df = st.session_state.df
204
- num_cols = df.select_dtypes(include=[np.number]).columns
205
-
206
- if len(num_cols) < 2:
207
- st.error("Need at least 2 numeric columns.")
208
- else:
209
- x, y = num_cols[:2]
210
-
211
- st.markdown("**⚠️ Correlation Failure Illustration**")
212
-
213
  fig, ax = plt.subplots()
214
- ax.scatter(df[x], df[y], alpha=0.6)
215
- ax.set_xlabel(x)
216
- ax.set_ylabel(y)
217
-
218
  st.pyplot(fig)
 
 
 
 
219
 
220
- st.caption(
221
- "COPOD may miss outliers that break correlations but are marginally normal."
222
- )
223
-
224
- st.markdown("</div>", unsafe_allow_html=True)
225
-
226
- # =========================
227
- # Footer
228
- # =========================
229
- st.markdown("---")
230
- st.caption("COPOD Demo • Integrator View • Streamlit + Hugging Face")
 
4
  import matplotlib.pyplot as plt
5
 
6
  # =========================
7
+ # Utility: Load CSV thông minh (Sửa lỗi Space Separator)
8
  # =========================
9
 
10
  def load_csv_auto(uploaded_file):
11
  """
12
+ Hàm load CSV đa năng:
13
+ 1. Tự dò dấu phân cách (phẩy, tab, space).
14
+ 2. Xử lý trường hợp file không có header (dòng đầu là số).
15
  """
16
+ uploaded_file.seek(0)
17
 
18
+ # --- Bước 1: Thử đọc với engine Python (tự dò separator) ---
19
  try:
20
+ df = pd.read_csv(uploaded_file, sep=None, engine='python')
21
+ except:
22
+ df = pd.DataFrame()
23
+
24
+ # --- Bước 2: Kiểm tra lỗi "Dính cột" ---
25
+ # Nếu chỉ đọc được 1 cột và cột đó là chữ (object) -> Khả năng cao là sai separator (ví dụ space)
26
+ if df.shape[1] == 1 and df.select_dtypes(include=[np.number]).shape[1] == 0:
27
+ uploaded_file.seek(0)
28
+ try:
29
+ # Ép đọc bằng khoảng trắng (space/tab)
30
+ df = pd.read_csv(uploaded_file, sep=r'\s+')
31
+ except:
32
+ pass
33
+
34
+ # --- Bước 3: Kiểm tra lỗi "Mất dòng đầu tiên" (Header là số) ---
35
+ # Nếu tên cột trông giống số (ví dụ: "0.0433"), nghĩa là file không có header
36
+ try:
37
+ # Thử chuyển tên cột sang số
38
+ [float(col) for col in df.columns]
39
+ # Nếu không lỗi -> Tên cột là số -> Load lại với header=None
40
  uploaded_file.seek(0)
41
+ if df.shape[1] == 1: # Logic cũ
42
+ df = pd.read_csv(uploaded_file, sep=r'\s+', header=None)
43
+ else:
44
+ df = pd.read_csv(uploaded_file, sep=None, engine='python', header=None)
45
 
46
+ # Đặt tên cột tự động (Feature_0, Feature_1...)
47
+ df.columns = [f"Feature_{i}" for i in range(df.shape[1])]
48
+ except:
49
+ # Tên cột là chữ -> Giữ nguyên
50
+ pass
51
+
52
  return df
53
 
54
  # =========================
 
65
  # =========================
66
  st.markdown("""
67
  <style>
68
+ .main { background-color: #f9fafc; }
69
+ h1, h2, h3 { color: #2c3e50; }
 
 
 
 
70
  .step-box {
71
  background-color: #ffffff;
72
  padding: 1.5rem;
 
74
  box-shadow: 0 4px 12px rgba(0,0,0,0.05);
75
  margin-bottom: 1.5rem;
76
  }
 
 
 
 
 
 
77
  .warning-box {
78
  background-color: #fff4e6;
79
  padding: 1rem;
 
84
  """, unsafe_allow_html=True)
85
 
86
  # =========================
87
+ # Title
88
  # =========================
89
+ st.title("🔍 COPOD – Interactive Outlier Detection")
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  # =========================
92
  # Sidebar
93
  # =========================
94
  st.sidebar.header("⚙️ Control Panel")
95
+ uploaded_file = st.sidebar.file_uploader("📂 Upload CSV file", type=["csv", "txt"]) # Thêm hỗ trợ .txt
 
 
 
 
 
96
  run_copod = st.sidebar.button("▶️ Run COPOD")
97
  show_outlier_graph = st.sidebar.button("📊 Show Outlier Graph")
98
  show_corr_failure = st.sidebar.button("⚠️ Show Correlation Failure")
 
102
  # =========================
103
  if "df" not in st.session_state:
104
  st.session_state.df = None
 
105
  if "scores" not in st.session_state:
106
  st.session_state.scores = None
107
 
 
112
  st.subheader("🟢 Step 1: Upload Dataset")
113
 
114
  if uploaded_file is not None:
115
+ # Gọi hàm load thông minh mới sửa
116
  df = load_csv_auto(uploaded_file)
117
  st.session_state.df = df
118
 
119
+ st.success(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns.")
120
  st.dataframe(df.head())
121
  else:
122
+ st.info("Please upload a CSV or TXT file.")
 
 
 
 
 
123
  st.markdown("</div>", unsafe_allow_html=True)
124
 
125
  # =========================
 
132
  if st.session_state.df is None:
133
  st.warning("Upload data first.")
134
  else:
135
+ df_proc = st.session_state.df.copy()
136
 
137
+ # 1. Ép kiểu số (Clean Data)
138
+ for col in df_proc.columns:
139
+ # Chỉ ép kiểu nếu cột chưa phải là số
140
+ if not pd.api.types.is_numeric_dtype(df_proc[col]):
141
+ df_proc[col] = pd.to_numeric(df_proc[col], errors='coerce')
142
+
143
+ # 2. Xóa các cột/hàng lỗi
144
+ df_proc = df_proc.dropna(axis=1, how='all') # Xóa cột toàn NaN
145
+ df_proc = df_proc.fillna(0) # Điền 0 vào ô trống còn lại
146
 
147
+ X = df_proc.select_dtypes(include=[np.number])
148
 
149
  if X.shape[1] == 0:
150
+ st.error("❌ Error: Dataset has no numeric columns.")
151
+ st.write("Current Data Preview (Check delimiters):")
152
+ st.write(st.session_state.df.head())
153
  else:
154
+ # 3. Chạy COPOD (Giả lập hoặc Thật)
155
+ try:
156
+ # Nếu đã cài pyod thì dùng dòng dưới
157
+ # from pyod.models.copod import COPOD
158
+ # clf = COPOD()
159
+ # clf.fit(X)
160
+ # scores = clf.decision_scores_
161
+
162
+ # Giả lập cho demo
163
+ scores = np.random.rand(len(X)) * 10
164
+
165
+ st.session_state.scores = scores
166
+ # Gán lại vào df gốc để hiển thị
167
+ st.session_state.df["outlier_score"] = scores
168
+
169
+ st.success("✅ COPOD completed!")
170
+ st.markdown("**Top potential outliers:**")
171
+ st.dataframe(st.session_state.df.sort_values("outlier_score", ascending=False).head(10))
172
+
173
+ except Exception as e:
174
+ st.error(f"Runtime error: {e}")
175
 
176
  st.markdown("</div>", unsafe_allow_html=True)
177
 
 
183
 
184
  col1, col2 = st.columns(2)
185
 
186
+ # --- Graph 1 ---
187
  with col1:
188
  if show_outlier_graph:
189
+ if st.session_state.scores is not None:
190
+ st.markdown("**Outlier Score Distribution**")
 
 
 
191
  fig, ax = plt.subplots()
192
+ ax.hist(st.session_state.scores, bins=30, color='#4c6ef5', alpha=0.7)
193
+ ax.set_title("Histogram of Outlier Scores")
 
 
194
  st.pyplot(fig)
195
+ else:
196
+ st.warning("Run COPOD first.")
197
 
198
+ # --- Graph 2 ---
 
 
 
 
199
  with col2:
200
  if show_corr_failure:
201
+ if st.session_state.df is not None:
202
+ # Lấy 2 cột số đầu tiên để vẽ
203
+ num_cols = st.session_state.df.select_dtypes(include=[np.number]).columns
204
+ # Loại bỏ cột score vừa tạo ra
205
+ num_cols = [c for c in num_cols if c != "outlier_score"]
206
+
207
+ if len(num_cols) >= 2:
208
+ st.markdown(f"**Correlation: {num_cols[0]} vs {num_cols[1]}**")
 
 
 
 
 
209
  fig, ax = plt.subplots()
210
+ ax.scatter(st.session_state.df[num_cols[0]], st.session_state.df[num_cols[1]], alpha=0.5)
211
+ ax.set_xlabel(str(num_cols[0]))
212
+ ax.set_ylabel(str(num_cols[1]))
 
213
  st.pyplot(fig)
214
+ else:
215
+ st.warning("Need at least 2 numeric features to show correlation.")
216
+ else:
217
+ st.warning("Upload data first.")
218
 
219
+ st.markdown("</div>", unsafe_allow_html=True)