Spaces:

Gumball2k5
/

MachineLearning2-COPOD

Sleeping

File size: 7,586 Bytes

3ccdce5
bbcffc9
 
 
 
e673faf
97fedf3
e673faf
d0e7eb9
e673faf
 
97fedf3
 
 
e673faf
97fedf3
80e45f3
97fedf3
80e45f3
97fedf3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b7f9864
97fedf3
 
 
 
80e45f3
97fedf3
 
 
 
 
 
e673faf
 
bbcffc9
ac6c128
bbcffc9
 
ac6c128
 
bbcffc9
 
 
 
ac6c128
bbcffc9
 
ac6c128
97fedf3
 
ac6c128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97fedf3
ac6c128
97fedf3
bbcffc9
 
ac6c128
bbcffc9
ac6c128
97fedf3
bbcffc9
 
ac6c128
bbcffc9
 
 
 
 
 
 
 
 
 
ac6c128
bbcffc9
ac6c128
 
 
bbcffc9
97fedf3
e673faf
bbcffc9
 
97fedf3
bbcffc9
ac6c128
97fedf3
ac6c128
bbcffc9
 
ac6c128
bbcffc9
ac6c128
 
 
bbcffc9
 
ac6c128
bbcffc9
97fedf3
b68a34d
97fedf3
 
 
 
 
 
 
 
 
b68a34d
97fedf3
bbcffc9
 
97fedf3
 
 
bbcffc9
97fedf3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bbcffc9
ac6c128
 
bbcffc9
ac6c128
bbcffc9
ac6c128
 
bbcffc9
ac6c128
bbcffc9
97fedf3
ac6c128
 
97fedf3
 
ac6c128
97fedf3
 
ac6c128
97fedf3
 
bbcffc9
97fedf3
ac6c128
 
97fedf3
 
 
 
 
 
 
 
ac6c128
97fedf3
 
 
ac6c128
97fedf3
 
 
 
ac6c128
97fedf3

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# =========================
# Utility: smart CSV loader (handles whitespace-separated files)
# =========================

def load_csv_auto(uploaded_file):
    """
    Load a delimited text file into a DataFrame, guessing its layout.

    The loader makes three best-effort passes:

    1. Read with pandas' Python engine and ``sep=None`` so the delimiter
       (comma, tab, space, ...) is sniffed from the file.
    2. If that produced a single non-numeric column, the separator guess
       was probably wrong, so the file is re-read as whitespace-separated.
    3. If every cell of the first line parses as a float, the file is
       treated as header-less: it is re-read with ``header=None`` and the
       columns are named ``Feature_0``, ``Feature_1``, ...

    Args:
        uploaded_file: Seekable file-like object accepted by
            ``pandas.read_csv``. It is rewound with ``seek(0)`` before
            every read.

    Returns:
        The parsed DataFrame. An empty DataFrame is returned when the file
        cannot be parsed at all.
    """
    sniff_opts = {"sep": None, "engine": "python"}
    whitespace_opts = {"sep": r"\s+"}

    def _read(**options):
        # Every pass re-parses the file from the beginning.
        uploaded_file.seek(0)
        return pd.read_csv(uploaded_file, **options)

    # --- Step 1: let the Python engine sniff the separator ---
    try:
        df = _read(**sniff_opts)
    except Exception:
        # Best effort: an unreadable file yields an empty frame, not a crash.
        df = pd.DataFrame()

    # --- Step 2: detect "glued" columns ---
    # A single text (object) column most likely means the separator guess
    # was wrong (e.g. space-separated data), so force whitespace splitting.
    used_whitespace = False
    if df.shape[1] == 1 and df.select_dtypes(include=[np.number]).shape[1] == 0:
        try:
            df = _read(**whitespace_opts)
            used_whitespace = True
        except Exception:
            # Keep the step-1 result if the fallback cannot parse the file.
            pass

    # --- Step 3: detect a missing header (first line is data) ---
    # Re-read with the same reader that produced the accepted frame; a
    # single-column frame keeps the whitespace reader as before.
    if used_whitespace or df.shape[1] == 1:
        reload_opts = whitespace_opts
    else:
        reload_opts = sniff_opts

    # Inspect the raw first line rather than the parsed column names:
    # pandas renames duplicate header cells ("1.0" -> "1.0.1"), which would
    # make a numeric first row with repeated values look like a text header.
    try:
        first_row = _read(
            header=None,
            nrows=1,
            dtype=str,
            keep_default_na=False,  # empty cells stay "" and fail float()
            **reload_opts,
        ).iloc[0]
        for cell in first_row:
            float(cell)
    except Exception:
        # Non-numeric first line (or unreadable file): keep the header.
        return df

    try:
        headerless = _read(header=None, **reload_opts)
    except Exception:
        # Reload failed: fall back to the frame we already have.
        return df

    # Assign generated column names (Feature_0, Feature_1, ...).
    headerless.columns = [f"Feature_{i}" for i in range(headerless.shape[1])]
    return headerless

# =========================
# Page Config
# =========================
# Browser-tab title/icon and a full-width layout for the app.
_PAGE_OPTIONS = {
    "page_title": "COPOD Interactive Demo",
    "page_icon": "🔍",
    "layout": "wide",
}
st.set_page_config(**_PAGE_OPTIONS)

# =========================
# Custom CSS
# =========================
# Stylesheet injected as raw HTML: page background, heading colour, and the
# "step-box" / "warning-box" classes.
_CUSTOM_CSS = """
<style>
.main { background-color: #f9fafc; }
h1, h2, h3 { color: #2c3e50; }
.step-box {
    background-color: #ffffff;
    padding: 1.5rem;
    border-radius: 12px;
    box-shadow: 0 4px 12px rgba(0,0,0,0.05);
    margin-bottom: 1.5rem;
}
.warning-box {
    background-color: #fff4e6;
    padding: 1rem;
    border-left: 6px solid #f08c00;
    border-radius: 6px;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# =========================
# Title
# =========================
st.title("🔍 COPOD – Interactive Outlier Detection")

# =========================
# Sidebar
# =========================
# All controls live in the sidebar; each button is True only on the rerun
# triggered by clicking it.
with st.sidebar:
    st.header("⚙️ Control Panel")
    # Accept .txt as well as .csv uploads.
    uploaded_file = st.file_uploader("📂 Upload CSV file", type=["csv", "txt"])
    run_copod = st.button("▶️ Run COPOD")
    show_outlier_graph = st.button("📊 Show Outlier Graph")
    show_corr_failure = st.button("⚠️ Show Correlation Failure")

# =========================
# Session State
# =========================
# Make sure the keys read by the steps below exist: "df" holds the loaded
# dataset and "scores" the most recent outlier scores.
for _state_key in ("df", "scores"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None
# =========================
# STEP 1 – Upload Data
# =========================
st.markdown("<div class='step-box'>", unsafe_allow_html=True)
st.subheader("🟢 Step 1: Upload Dataset")

if uploaded_file is not None:
    # Parse the upload with the auto-detecting loader and keep it for the
    # later steps.
    df = load_csv_auto(uploaded_file)
    st.session_state.df = df

    # Scores belong to the file they were computed from: drop them when a
    # different file is uploaded so Step 3 never plots stale results.
    current_upload = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("loaded_upload") != current_upload:
        st.session_state.loaded_upload = current_upload
        st.session_state.scores = None

    if df.empty:
        # The loader returns an empty frame when parsing fails; report that
        # instead of announcing a successful load.
        st.error("❌ No data rows could be read from the uploaded file. Check its delimiter and encoding.")
    else:
        st.success(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns.")
        st.dataframe(df.head())
else:
    st.info("Please upload a CSV or TXT file.")
st.markdown("</div>", unsafe_allow_html=True)

# =========================
# STEP 2 – Run COPOD
# =========================
st.markdown("<div class='step-box'>", unsafe_allow_html=True)
st.subheader("🔵 Step 2: Run COPOD")

if run_copod and st.session_state.df is None:
    st.warning("Upload data first.")
elif run_copod:
    df_proc = st.session_state.df.copy()

    # 1. Coerce every non-numeric column to numbers; unparseable cells
    #    become NaN.
    for col in df_proc.columns:
        if pd.api.types.is_numeric_dtype(df_proc[col]):
            continue
        df_proc[col] = pd.to_numeric(df_proc[col], errors='coerce')

    # 2. Drop columns that are entirely NaN, fill the remaining gaps with 0,
    #    and keep only the numeric columns as the feature matrix.
    X = (
        df_proc
        .dropna(axis=1, how='all')
        .fillna(0)
        .select_dtypes(include=[np.number])
    )

    if X.shape[1] > 0:
        # 3. Score the rows.
        try:
            # NOTE: the scores are simulated for the demo (uniform random
            # values in [0, 10)). With pyod installed the real model would be:
            #   from pyod.models.copod import COPOD
            #   clf = COPOD(); clf.fit(X); scores = clf.decision_scores_
            scores = np.random.rand(len(X)) * 10

            st.session_state.scores = scores
            # Attach the scores to the stored dataset so they can be shown.
            st.session_state.df["outlier_score"] = scores

            st.success("✅ COPOD completed!")
            st.markdown("**Top potential outliers:**")
            ranked = st.session_state.df.sort_values("outlier_score", ascending=False)
            st.dataframe(ranked.head(10))

        except Exception as e:
            st.error(f"Runtime error: {e}")
    else:
        st.error("❌ Error: Dataset has no numeric columns.")
        st.write("Current Data Preview (Check delimiters):")
        st.write(st.session_state.df.head())

st.markdown("</div>", unsafe_allow_html=True)

# =========================
# STEP 3 – Visual Analysis
# =========================
st.markdown("<div class='step-box'>", unsafe_allow_html=True)
st.subheader("🟣 Step 3: Visual Analysis")

col1, col2 = st.columns(2)

# --- Graph 1: histogram of the stored outlier scores ---
with col1:
    if show_outlier_graph:
        if st.session_state.scores is not None:
            st.markdown("**Outlier Score Distribution**")
            fig, ax = plt.subplots()
            ax.hist(st.session_state.scores, bins=30, color='#4c6ef5', alpha=0.7)
            ax.set_title("Histogram of Outlier Scores")
            st.pyplot(fig)
            # pyplot keeps every figure alive until it is closed; release it
            # so figures do not pile up across script reruns.
            plt.close(fig)
        else:
            st.warning("Run COPOD first.")

# --- Graph 2: scatter plot of the first two numeric features ---
with col2:
    if show_corr_failure:
        if st.session_state.df is not None:
            # Pick the first two numeric columns to plot.
            num_cols = st.session_state.df.select_dtypes(include=[np.number]).columns
            # Exclude the score column added by Step 2.
            num_cols = [c for c in num_cols if c != "outlier_score"]

            if len(num_cols) >= 2:
                st.markdown(f"**Correlation: {num_cols[0]} vs {num_cols[1]}**")
                fig, ax = plt.subplots()
                ax.scatter(st.session_state.df[num_cols[0]], st.session_state.df[num_cols[1]], alpha=0.5)
                ax.set_xlabel(str(num_cols[0]))
                ax.set_ylabel(str(num_cols[1]))
                st.pyplot(fig)
                # Release the figure once rendered (see Graph 1).
                plt.close(fig)
            else:
                st.warning("Need at least 2 numeric features to show correlation.")
        else:
            st.warning("Upload data first.")

st.markdown("</div>", unsafe_allow_html=True)