Spaces:

yoon-gu
/

concept-drift-simulator

Sleeping

Yoon-gu Hwang Claude committed on Nov 13, 2025

Commit

7ab1194

1 Parent(s): 3ab49ae

Change visualization to bar chart with binary classification

- Update drift_simulator.py: Generate binary classification data (class 0 vs class 1)
- Update visualizer.py: Create bar chart visualization instead of scatter plot
- Update analyzer.py: Analyze binary classification data
- Add test files to .gitignore

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (4) hide show

.gitignore +2 -0
analyzer.py +59 -29
drift_simulator.py +30 -33
visualizer.py +119 -68

.gitignore CHANGED Viewed

@@ -15,3 +15,5 @@ build/
 .vscode/
 *.log
 flagged/

 .vscode/
 *.log
 flagged/
+*.html
+test_*.py

analyzer.py CHANGED Viewed

@@ -11,11 +11,21 @@ def analyze_drift(X: np.ndarray, y: np.ndarray, drift_points: np.ndarray, drift_
         "drift_locations": drift_points.tolist() if len(drift_points) > 0 else [],
     }
-    # 전체 통계
-    analysis["mean_y"] = float(np.mean(y))
-    analysis["std_y"] = float(np.std(y))
-    analysis["min_y"] = float(np.min(y))
-    analysis["max_y"] = float(np.max(y))
     # 세그먼트별 분석
     segments = []
@@ -26,26 +36,31 @@ def analyze_drift(X: np.ndarray, y: np.ndarray, drift_points: np.ndarray, drift_
         end = segment_boundaries[i + 1]
         segment_y = y[start:end]
-        segment_X = X[start:end]
-        # 선형 회귀 계수 계산 (기울기)
-        if len(segment_X) > 1:
-            coeffs = np.polyfit(segment_X, segment_y, 1)
-            slope = float(coeffs[0])
-            intercept = float(coeffs[1])
         else:
-            slope = 0.0
-            intercept = float(segment_y[0]) if len(segment_y) > 0 else 0.0
-        segments.append({
-            "segment_id": i,
-            "start_idx": int(start),
-            "end_idx": int(end),
-            "mean": float(np.mean(segment_y)),
-            "std": float(np.std(segment_y)),
-            "slope": slope,
-            "intercept": intercept
-        })
     analysis["segments"] = segments
@@ -55,27 +70,42 @@ def analyze_drift(X: np.ndarray, y: np.ndarray, drift_points: np.ndarray, drift_
 def format_analysis_summary(analysis: Dict) -> str:
     """분석 결과를 사람이 읽기 쉬운 형식으로 포맷"""
     summary = f"""
 ## 드리프트 분석 결과
-**드리프트 유형:** {analysis['drift_type'].upper()}
 **전체 데이터:**
 - 총 샘플 수: {analysis['total_samples']}
 - 드리프트 발생 횟수: {analysis['num_drift_points']}
-- 평균: {analysis['mean_y']:.2f}
 - 표준편차: {analysis['std_y']:.2f}
 - 범위: [{analysis['min_y']:.2f}, {analysis['max_y']:.2f}]
-**세그먼트별 분석:**
 """
     for seg in analysis['segments']:
-        summary += f"""
 **세그먼트 {seg['segment_id'] + 1}** (샘플 {seg['start_idx']}-{seg['end_idx']})
 - 평균: {seg['mean']:.2f}
 - 표준편차: {seg['std']:.2f}
-- 관계식: y = {seg['slope']:.2f}x + {seg['intercept']:.2f}
 """
     return summary

         "drift_locations": drift_points.tolist() if len(drift_points) > 0 else [],
     }
+    # incremental drift는 연속 값, 나머지는 이진 분류
+    if drift_type == "incremental":
+        # 연속 값 분석
+        analysis["mean_y"] = float(np.mean(y))
+        analysis["std_y"] = float(np.std(y))
+        analysis["min_y"] = float(np.min(y))
+        analysis["max_y"] = float(np.max(y))
+    else:
+        # 이진 분류 분석
+        class_0_count = int(np.sum(y == 0))
+        class_1_count = int(np.sum(y == 1))
+        analysis["class_0_count"] = class_0_count
+        analysis["class_1_count"] = class_1_count
+        analysis["class_0_ratio"] = float(class_0_count / len(y))
+        analysis["class_1_ratio"] = float(class_1_count / len(y))
     # 세그먼트별 분석
     segments = []
         end = segment_boundaries[i + 1]
         segment_y = y[start:end]
+        if drift_type == "incremental":
+            # 연속 값 세그먼트 분석
+            segments.append({
+                "segment_id": i,
+                "start_idx": int(start),
+                "end_idx": int(end),
+                "mean": float(np.mean(segment_y)),
+                "std": float(np.std(segment_y))
+            })
         else:
+            # 이진 분류 세그먼트 분석
+            class_0_count = int(np.sum(segment_y == 0))
+            class_1_count = int(np.sum(segment_y == 1))
+            total = len(segment_y)
+            segments.append({
+                "segment_id": i,
+                "start_idx": int(start),
+                "end_idx": int(end),
+                "class_0_count": class_0_count,
+                "class_1_count": class_1_count,
+                "class_0_ratio": float(class_0_count / total) if total > 0 else 0.0,
+                "class_1_ratio": float(class_1_count / total) if total > 0 else 0.0
+            })
     analysis["segments"] = segments
 def format_analysis_summary(analysis: Dict) -> str:
     """분석 결과를 사람이 읽기 쉬운 형식으로 포맷"""
+    drift_type = analysis['drift_type']
     summary = f"""
 ## 드리프트 분석 결과
+**드리프트 유형:** {drift_type.upper()}
 **전체 데이터:**
 - 총 샘플 수: {analysis['total_samples']}
 - 드리프트 발생 횟수: {analysis['num_drift_points']}
+"""
+    if drift_type == "incremental":
+        summary += f"""- 평균 값: {analysis['mean_y']:.2f}
 - 표준편차: {analysis['std_y']:.2f}
 - 범위: [{analysis['min_y']:.2f}, {analysis['max_y']:.2f}]
+"""
+    else:
+        summary += f"""- Class 0 (파란색): {analysis['class_0_count']} 샘플 ({analysis['class_0_ratio']*100:.1f}%)
+- Class 1 (초록색): {analysis['class_1_count']} 샘플 ({analysis['class_1_ratio']*100:.1f}%)
 """
+    summary += "\n**세그먼트별 분석:**\n"
     for seg in analysis['segments']:
+        if drift_type == "incremental":
+            summary += f"""
 **세그먼트 {seg['segment_id'] + 1}** (샘플 {seg['start_idx']}-{seg['end_idx']})
 - 평균: {seg['mean']:.2f}
 - 표준편차: {seg['std']:.2f}
+"""
+        else:
+            summary += f"""
+**세그먼트 {seg['segment_id'] + 1}** (샘플 {seg['start_idx']}-{seg['end_idx']})
+- Class 0: {seg['class_0_count']} 샘플 ({seg['class_0_ratio']*100:.1f}%)
+- Class 1: {seg['class_1_count']} 샘플 ({seg['class_1_ratio']*100:.1f}%)
 """
     return summary

drift_simulator.py CHANGED Viewed

@@ -3,14 +3,14 @@ from typing import Tuple
 def generate_sudden_drift(n_samples: int = 1000, drift_point: int = 500) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """급격한 드리프트: t 시점에서 갑자기 데이터 분포 변경"""
-    X = np.linspace(0, 10, n_samples)
-    y = np.zeros(n_samples)
-    # Before drift: y = 2*X + noise
-    y[:drift_point] = 2 * X[:drift_point] + np.random.normal(0, 1, drift_point)
-    # After drift: y = -X + 5 + noise (완전히 다른 관계)
-    y[drift_point:] = -X[drift_point:] + 5 + np.random.normal(0, 1, n_samples - drift_point)
     drift_points = np.array([drift_point])
     return X, y, drift_points
@@ -18,44 +18,41 @@ def generate_sudden_drift(n_samples: int = 1000, drift_point: int = 500) -> Tupl
 def generate_gradual_drift(n_samples: int = 1000, drift_start: int = 300, drift_end: int = 700) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """점진적 드리프트: 두 분포가 섞이며 천천히 전환"""
-    X = np.linspace(0, 10, n_samples)
-    y = np.zeros(n_samples)
-    # Before drift: y = 2*X
-    y[:drift_start] = 2 * X[:drift_start] + np.random.normal(0, 1, drift_start)
-    # Gradual transition: mixture of old and new concepts
     transition_length = drift_end - drift_start
     for i in range(drift_start, drift_end):
         weight = (i - drift_start) / transition_length
-        old_concept = 2 * X[i] + np.random.normal(0, 1)
-        new_concept = -X[i] + 5 + np.random.normal(0, 1)
-        y[i] = (1 - weight) * old_concept + weight * new_concept
-    # After drift: y = -X + 5
-    y[drift_end:] = -X[drift_end:] + 5 + np.random.normal(0, 1, n_samples - drift_end)
     drift_points = np.array([drift_start, drift_end])
     return X, y, drift_points
-def generate_incremental_drift(n_samples: int = 1000, n_steps: int = 5) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """증분적 드리프트: 계단식으로 작은 변화가 누적"""
-    X = np.linspace(0, 10, n_samples)
-    y = np.zeros(n_samples)
-    step_size = n_samples // (n_steps + 1)
     drift_points = []
-    for step in range(n_steps + 1):
         start_idx = step * step_size
-        end_idx = (step + 1) * step_size if step < n_steps else n_samples
-        # 각 단계마다 기울기가 조금씩 변화
-        slope = 2 - (step / n_steps) * 3  # 2에서 -1로 점진적 변화
-        intercept = (step / n_steps) * 5  # 0에서 5로 점진적 변화
-        y[start_idx:end_idx] = slope * X[start_idx:end_idx] + intercept + np.random.normal(0, 1, end_idx - start_idx)
         if step > 0:
             drift_points.append(start_idx)
@@ -65,8 +62,8 @@ def generate_incremental_drift(n_samples: int = 1000, n_steps: int = 5) -> Tuple
 def generate_recurring_drift(n_samples: int = 1000, cycle_length: int = 250) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """반복적 드리프트: 이전 분포가 주기적으로 재등장"""
-    X = np.linspace(0, 10, n_samples)
-    y = np.zeros(n_samples)
     drift_points = []
@@ -74,11 +71,11 @@ def generate_recurring_drift(n_samples: int = 1000, cycle_length: int = 250) ->
         cycle_pos = i % cycle_length
         if cycle_pos < cycle_length // 2:
-            # Concept A: y = 2*X
-            y[i] = 2 * X[i] + np.random.normal(0, 1)
         else:
-            # Concept B: y = -X + 5
-            y[i] = -X[i] + 5 + np.random.normal(0, 1)
         if cycle_pos == cycle_length // 2:
             drift_points.append(i)

 def generate_sudden_drift(n_samples: int = 1000, drift_point: int = 500) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """급격한 드리프트: t 시점에서 갑자기 데이터 분포 변경"""
+    X = np.arange(n_samples)  # 시간 인덱스
+    y = np.zeros(n_samples, dtype=int)
+    # Before drift: class 0 (파란색)
+    y[:drift_point] = 0
+    # After drift: class 1 (초록색)
+    y[drift_point:] = 1
     drift_points = np.array([drift_point])
     return X, y, drift_points
 def generate_gradual_drift(n_samples: int = 1000, drift_start: int = 300, drift_end: int = 700) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """점진적 드리프트: 두 분포가 섞이며 천천히 전환"""
+    X = np.arange(n_samples)  # 시간 인덱스
+    y = np.zeros(n_samples, dtype=int)
+    # Before drift: class 0 (파란색)
+    y[:drift_start] = 0
+    # Gradual transition: class 0과 class 1이 섞임
     transition_length = drift_end - drift_start
     for i in range(drift_start, drift_end):
+        # 점진적으로 class 1의 비율 증가
         weight = (i - drift_start) / transition_length
+        y[i] = 1 if np.random.random() < weight else 0
+    # After drift: class 1 (초록색)
+    y[drift_end:] = 1
     drift_points = np.array([drift_start, drift_end])
     return X, y, drift_points
+def generate_incremental_drift(n_samples: int = 1000, n_steps: int = 10) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """증분적 드리프트: 계단식으로 작은 변화가 누적"""
+    X = np.arange(n_samples)  # 시간 인덱스
+    y = np.zeros(n_samples)  # 연속 값 (시각화를 위해)
+    step_size = n_samples // n_steps
     drift_points = []
+    for step in range(n_steps):
         start_idx = step * step_size
+        end_idx = (step + 1) * step_size if step < n_steps - 1 else n_samples
+        # 각 단계마다 0에서 1로 점진적 변화
+        value = step / (n_steps - 1)
+        y[start_idx:end_idx] = value
         if step > 0:
             drift_points.append(start_idx)
 def generate_recurring_drift(n_samples: int = 1000, cycle_length: int = 250) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """반복적 드리프트: 이전 분포가 주기적으로 재등장"""
+    X = np.arange(n_samples)  # 시간 인덱스
+    y = np.zeros(n_samples, dtype=int)
     drift_points = []
         cycle_pos = i % cycle_length
         if cycle_pos < cycle_length // 2:
+            # Concept A: class 0 (파란색)
+            y[i] = 0
         else:
+            # Concept B: class 1 (초록색)
+            y[i] = 1
         if cycle_pos == cycle_length // 2:
             drift_points.append(i)

visualizer.py CHANGED Viewed

@@ -7,42 +7,60 @@ def create_drift_visualization(X: np.ndarray, y: np.ndarray, drift_points: np.nd
     fig = go.Figure()
-    # 메인 데이터 scatter plot
-    fig.add_trace(go.Scatter(
-        x=X,
-        y=y,
-        mode='markers',
-        name='Data Points',
-        marker=dict(
-            size=6,
-            color=np.arange(len(X)),
-            colorscale='Viridis',
-            showscale=True,
-            colorbar=dict(title="Time Step"),
-            line=dict(width=0.5, color='white')
-        ),
-        hovertemplate='X: %{x:.2f}<br>y: %{y:.2f}<br>Index: %{marker.color}<extra></extra>'
-    ))
-    # 드리프트 발생 지점 표시
-    y_min, y_max = y.min(), y.max()
-    y_range = y_max - y_min
-    for i, drift_point in enumerate(drift_points):
-        fig.add_vline(
-            x=X[drift_point],
-            line_dash="dash",
-            line_color="red",
-            annotation_text=f"Drift {i+1}",
-            annotation_position="top"
-        )
     # 레이아웃 설정
     title_map = {
-        "sudden": "Sudden (Abrupt) Drift - 급격한 드리프트",
-        "gradual": "Gradual Drift - 점진적 드리프트",
-        "incremental": "Incremental Drift - 증분적 드리프트",
-        "recurring": "Recurring Drift - 반복적 드리프트"
     }
     fig.update_layout(
@@ -50,31 +68,31 @@ def create_drift_visualization(X: np.ndarray, y: np.ndarray, drift_points: np.nd
             text=title_map.get(drift_type, "Concept Drift"),
             x=0.5,
             xanchor='center',
-            font=dict(size=20)
         ),
-        xaxis_title="Feature (X)",
-        yaxis_title="Target (y)",
         hovermode='closest',
         template='plotly_white',
-        height=600,
-        showlegend=True,
         legend=dict(
             yanchor="top",
             y=0.99,
-            xanchor="left",
-            x=0.01
         ),
         xaxis=dict(
-            showgrid=True,
-            gridwidth=1,
-            gridcolor='LightGray'
         ),
         yaxis=dict(
-            showgrid=True,
-            gridwidth=1,
-            gridcolor='LightGray'
         ),
-        plot_bgcolor='white'
     )
     return fig
@@ -86,7 +104,9 @@ def create_comparison_visualization(drift_data_dict: dict) -> go.Figure:
     fig = make_subplots(
         rows=2, cols=2,
-        subplot_titles=("Sudden Drift", "Gradual Drift", "Incremental Drift", "Recurring Drift")
     )
     positions = [(1, 1), (1, 2), (2, 1), (2, 2)]
@@ -96,28 +116,59 @@ def create_comparison_visualization(drift_data_dict: dict) -> go.Figure:
         if drift_type in drift_data_dict:
             X, y, drift_points = drift_data_dict[drift_type]
-            fig.add_trace(
-                go.Scatter(
-                    x=X,
-                    y=y,
-                    mode='markers',
-                    marker=dict(size=3, color=np.arange(len(X)), colorscale='Viridis'),
-                    showlegend=False
-                ),
-                row=row, col=col
-            )
-            # 드리프트 지점 표시
-            for drift_point in drift_points:
-                fig.add_vline(
-                    x=X[drift_point],
-                    line_dash="dash",
-                    line_color="red",
                     row=row, col=col
                 )
-    fig.update_xaxes(title_text="X")
-    fig.update_yaxes(title_text="y")
-    fig.update_layout(height=800, title_text="Concept Drift Types Comparison", showlegend=False)
     return fig

     fig = go.Figure()
+    # incremental drift는 연속 값으로 처리
+    if drift_type == "incremental":
+        # 0-1 사이 값을 색상으로 매핑
+        colors = []
+        for val in y:
+            # 파란색(0)에서 초록색(1)로 점진적 변화
+            blue = int(255 * (1 - val))
+            green = int(255 * val)
+            colors.append(f'rgb({blue}, {green}, 150)')
+        fig.add_trace(go.Bar(
+            x=X,
+            y=np.ones(len(X)),
+            marker=dict(
+                color=colors,
+                line=dict(width=0)
+            ),
+            showlegend=False,
+            hovertemplate='Time: %{x}<br>Value: %{customdata:.2f}<extra></extra>',
+            customdata=y
+        ))
+    else:
+        # 이진 분류 (0: 파란색, 1: 초록색)
+        class_0_indices = np.where(y == 0)[0]
+        class_1_indices = np.where(y == 1)[0]
+        # Class 0 (파란색)
+        if len(class_0_indices) > 0:
+            fig.add_trace(go.Bar(
+                x=X[class_0_indices],
+                y=np.ones(len(class_0_indices)),
+                marker=dict(color='rgb(70, 130, 180)', line=dict(width=0)),
+                name='Class 0',
+                showlegend=True,
+                hovertemplate='Time: %{x}<br>Class: 0<extra></extra>'
+            ))
+        # Class 1 (초록색)
+        if len(class_1_indices) > 0:
+            fig.add_trace(go.Bar(
+                x=X[class_1_indices],
+                y=np.ones(len(class_1_indices)),
+                marker=dict(color='rgb(60, 179, 113)', line=dict(width=0)),
+                name='Class 1',
+                showlegend=True,
+                hovertemplate='Time: %{x}<br>Class: 1<extra></extra>'
+            ))
     # 레이아웃 설정
     title_map = {
+        "sudden": "Sudden Drift",
+        "gradual": "Gradual Drift",
+        "incremental": "Incremental Drift",
+        "recurring": "Reoccurring Concepts"
     }
     fig.update_layout(
             text=title_map.get(drift_type, "Concept Drift"),
             x=0.5,
             xanchor='center',
+            font=dict(size=20, weight='bold')
         ),
+        xaxis_title="Time",
+        yaxis_title="Data distribution",
         hovermode='closest',
         template='plotly_white',
+        height=400,
+        showlegend=(drift_type != "incremental"),
         legend=dict(
             yanchor="top",
             y=0.99,
+            xanchor="right",
+            x=0.99
         ),
         xaxis=dict(
+            showgrid=False,
+            showticklabels=False
         ),
         yaxis=dict(
+            showgrid=False,
+            showticklabels=False,
+            range=[0, 1.2]
         ),
+        plot_bgcolor='white',
+        bargap=0
     )
     return fig
     fig = make_subplots(
         rows=2, cols=2,
+        subplot_titles=("Sudden Drift", "Gradual Drift", "Incremental Drift", "Reoccurring Concepts"),
+        vertical_spacing=0.15,
+        horizontal_spacing=0.1
     )
     positions = [(1, 1), (1, 2), (2, 1), (2, 2)]
         if drift_type in drift_data_dict:
             X, y, drift_points = drift_data_dict[drift_type]
+            if drift_type == "incremental":
+                # Incremental drift: 연속 색상 변화
+                colors = []
+                for val in y:
+                    blue = int(255 * (1 - val))
+                    green = int(255 * val)
+                    colors.append(f'rgb({blue}, {green}, 150)')
+                fig.add_trace(
+                    go.Bar(
+                        x=X,
+                        y=np.ones(len(X)),
+                        marker=dict(color=colors, line=dict(width=0)),
+                        showlegend=False
+                    ),
                     row=row, col=col
                 )
+            else:
+                # 이진 분류
+                class_0_indices = np.where(y == 0)[0]
+                class_1_indices = np.where(y == 1)[0]
+                if len(class_0_indices) > 0:
+                    fig.add_trace(
+                        go.Bar(
+                            x=X[class_0_indices],
+                            y=np.ones(len(class_0_indices)),
+                            marker=dict(color='rgb(70, 130, 180)', line=dict(width=0)),
+                            showlegend=False
+                        ),
+                        row=row, col=col
+                    )
+                if len(class_1_indices) > 0:
+                    fig.add_trace(
+                        go.Bar(
+                            x=X[class_1_indices],
+                            y=np.ones(len(class_1_indices)),
+                            marker=dict(color='rgb(60, 179, 113)', line=dict(width=0)),
+                            showlegend=False
+                        ),
+                        row=row, col=col
+                    )
+    # 레이아웃 설정
+    fig.update_xaxes(title_text="Time", showgrid=False, showticklabels=False)
+    fig.update_yaxes(title_text="Data distribution", showgrid=False, showticklabels=False, range=[0, 1.2])
+    fig.update_layout(
+        height=800,
+        title_text="Concept Drift Types Comparison",
+        showlegend=False,
+        bargap=0,
+        template='plotly_white'
+    )
     return fig