Spaces:

LeonceNsh
/

Privacy_Auditor

Runtime error

App Files Community

LeonceNsh committed on Sep 7, 2025

Commit

1a26f82

verified ·

1 Parent(s): 86a50cd

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

__pycache__/app.cpython-311.pyc +2 -2
app.py +282 -135
logs/privacy_audit_detailed.log +10 -0

__pycache__/app.cpython-311.pyc CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ec8d0c4140eb5d8d4c06e1c1122cd017af9ea151dde8bc20f7bb86f67c951446
-size 117955

 version https://git-lfs.github.com/spec/v1
+oid sha256:a5a5f57c1929a7fa923f92f6a6522741c51bb6b424ff1abf62a5084a0f3d0d9e
+size 126179

app.py CHANGED Viewed

@@ -1002,150 +1002,297 @@ except Exception as e:
     auditor = None
 def create_safe_distance_plot(result: Dict[str, Any]) -> go.Figure:
-    """Create safe distance plot with error handling"""
     try:
         if "error" in result:
-            fig = go.Figure()
-            fig.add_annotation(
-                text=f"Audit Error: {result.get('error', 'Unknown error')}<br>Step: {result.get('step_failed', 'Unknown')}",
-                x=0.5, y=0.5, showarrow=False,
-                font=dict(size=16, color="red"),
-                align="center"
-            )
-            fig.update_layout(
-                title="Audit Failed - No Visualization Available",
-                xaxis=dict(visible=False),
-                yaxis=dict(visible=False)
-            )
-            return fig
-        stats = result.get("distance_statistics", {})
-        # Create comprehensive dashboard
-        from plotly.subplots import make_subplots
-        fig = make_subplots(
-            rows=2, cols=2,
-            subplot_titles=("Distance Statistics", "Risk Assessment", "Data Quality", "Audit Summary"),
-            specs=[[{"type": "bar"}, {"type": "indicator"}],
-                   [{"type": "bar"}, {"type": "table"}]]
-        )
-        # Distance statistics
-        metrics = ["Mean", "Median", "Std Dev", "Min", "Max"]
-        values = [
-            stats.get("mean_nearest_distance", 0),
-            stats.get("median_nearest_distance", 0),
-            stats.get("std_nearest_distance", 0),
-            stats.get("min_nearest_distance", 0),
-            stats.get("max_nearest_distance", 0)
-        ]
-        fig.add_trace(
-            go.Bar(
-                x=metrics,
-                y=values,
-                name="Distance Stats",
-                marker_color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd'],
-                text=[f"{v:.4f}" for v in values],
-                textposition='auto'
-            ),
-            row=1, col=1
-        )
-        # Risk assessment gauge
-        risk_level = result.get("privacy_assessment", {}).get("risk_level", "UNKNOWN")
-        epsilon = result.get("privacy_assessment", {}).get("primary_epsilon", 0)
-        risk_colors = {
-            "EXCEPTIONAL": "#2ca02c", "VERY LOW": "#8dd3c7", "LOW": "#ffd92f",
-            "MEDIUM": "#ff7f0e", "HIGH": "#d62728", "VERY HIGH": "#8b0000",
-            "CRITICAL": "#4b0082", "UNKNOWN": "#gray"
-        }
-        fig.add_trace(
-            go.Indicator(
-                mode="gauge+number",
-                value=epsilon,
-                title={"text": f"Privacy Risk: {risk_level}"},
-                number={"font": {"size": 20}},
-                gauge={
-                    "axis": {"range": [0, max(5, epsilon * 1.2)]},
-                    "bar": {"color": risk_colors.get(risk_level, "gray")},
-                    "steps": [
-                        {"range": [0, 0.1], "color": "lightgreen"},
-                        {"range": [0.1, 1], "color": "yellow"},
-                        {"range": [1, 5], "color": "red"}
-                    ],
-                    "threshold": {
-                        "line": {"color": "black", "width": 4},
-                        "thickness": 0.75,
-                        "value": epsilon
-                    }
                 }
-            ),
-            row=1, col=2
-        )
-        # Data quality metrics
-        quality_metrics = ["Zero Distances", "Close Matches", "Total Samples"]
-        quality_values = [
-            stats.get("zero_distance_count", 0),
-            stats.get("small_distance_count", 0),
-            result.get("dataset_info", {}).get("real_samples_used", 0)
-        ]
-        fig.add_trace(
-            go.Bar(
-                x=quality_metrics,
-                y=quality_values,
-                name="Quality Metrics",
-                marker_color=['#d62728', '#ff7f0e', '#1f77b4'],
-                text=quality_values,
-                textposition='auto'
-            ),
-            row=2, col=1
-        )
-        # Summary table
-        audit_meta = result.get("audit_metadata", {})
-        summary_data = [
-            ["Metric", "Value"],
-            ["Audit ID", result.get("audit_id", "N/A")],
-            ["ε (95% confidence)", f"{epsilon:.6f}"],
-            ["Risk Level", risk_level],
-            ["Distance Metric", audit_meta.get("distance_metric", "N/A")],
-            ["Duration (s)", str(audit_meta.get("duration_seconds", "N/A"))],
-            ["Dimensions", str(result.get("dataset_info", {}).get("dimensions", "N/A"))]
-        ]
-        fig.add_trace(
-            go.Table(
-                header=dict(values=["Metric", "Value"], fill_color="lightblue", font=dict(size=12)),
-                cells=dict(values=list(zip(*summary_data[1:])), fill_color="white", font=dict(size=11))
-            ),
-            row=2, col=2
-        )
-        fig.update_layout(
-            title="Privacy Audit Dashboard",
-            height=600,
-            showlegend=False,
-            template="plotly_white"
-        )
-        return fig
-    except Exception as e:
-        logger.error(f"Distance plot creation failed: {e}")
-        # Return error figure
-        fig = go.Figure()
-        fig.add_annotation(
-            text=f"Visualization Error: {str(e)}",
-            x=0.5, y=0.5, showarrow=False,
-            font=dict(size=16, color="red")
-        )
-        return fig
 def create_safe_epsilon_plot(result: Dict[str, Any]) -> go.Figure:
     """Create safe epsilon plot with error handling"""

     auditor = None
 def create_safe_distance_plot(result: Dict[str, Any]) -> go.Figure:
+    """Create enhanced privacy audit dashboard with improved data visualization"""
     try:
         if "error" in result:
+            return _create_error_figure(result)
+        return _create_comprehensive_dashboard(result)
+    except Exception as e:
+        logger.error(f"Distance plot creation failed: {e}")
+        return _create_error_figure({"error": str(e)})
+def _create_error_figure(result: Dict[str, Any]) -> go.Figure:
+    """Create error visualization with clear messaging"""
+    fig = go.Figure()
+    fig.add_annotation(
+        text=f"<b>Audit Error</b><br>{result.get('error', 'Unknown error')}<br><span style='font-size:12px'>Step: {result.get('step_failed', 'Unknown')}</span>",
+        x=0.5, y=0.5, showarrow=False,
+        font=dict(size=16, color="#dc3545"),
+        align="center",
+        bgcolor="rgba(220, 53, 69, 0.1)",
+        bordercolor="#dc3545",
+        borderwidth=2
+    )
+    fig.update_layout(
+        title="Privacy Audit Failed",
+        xaxis=dict(visible=False),
+        yaxis=dict(visible=False),
+        plot_bgcolor="white",
+        paper_bgcolor="white"
+    )
+    return fig
+def _create_comprehensive_dashboard(result: Dict[str, Any]) -> go.Figure:
+    """Create comprehensive privacy dashboard with multiple visualizations"""
+    from plotly.subplots import make_subplots
+    # Create subplot structure with better spacing
+    fig = make_subplots(
+        rows=2, cols=3,
+        subplot_titles=(
+            "Distance Distribution Analysis",
+            "Privacy Risk Assessment",
+            "Data Quality Indicators",
+            "Dataset Overview",
+            "Privacy Bounds Comparison",
+            "Processing Pipeline Status"
+        ),
+        specs=[
+            [{"type": "bar"}, {"type": "indicator"}, {"type": "scatter"}],
+            [{"type": "bar"}, {"type": "bar"}, {"type": "bar"}]
+        ],
+        vertical_spacing=0.15,
+        horizontal_spacing=0.1
+    )
+    _add_distance_analysis(fig, result)
+    _add_risk_assessment(fig, result)
+    _add_quality_indicators(fig, result)
+    _add_dataset_overview(fig, result)
+    _add_privacy_bounds(fig, result)
+    _add_processing_status(fig, result)
+    # Enhanced layout with professional styling
+    fig.update_layout(
+        title={
+            "text": "<b>Privacy Audit Dashboard</b><br><sub>Comprehensive Analysis of Synthetic Data Privacy</sub>",
+            "x": 0.5,
+            "xanchor": "center",
+            "font": {"size": 20, "color": "#2c3e50"}
+        },
+        height=700,
+        showlegend=False,
+        plot_bgcolor="white",
+        paper_bgcolor="#f8f9fa",
+        font=dict(family="Arial, sans-serif", size=11, color="#2c3e50"),
+        margin=dict(t=120, b=50, l=50, r=50)
+    )
+    return fig
+def _add_distance_analysis(fig, result: Dict[str, Any]):
+    """Add enhanced distance distribution analysis"""
+    stats = result.get("distance_statistics", {})
+    # Create meaningful distance metrics with better labeling
+    metrics = ["Mean Distance", "Median Distance", "Standard Dev", "25th Percentile", "75th Percentile"]
+    values = [
+        stats.get("mean_nearest_distance", 0),
+        stats.get("median_nearest_distance", 0),
+        stats.get("std_nearest_distance", 0),
+        stats.get("q25_nearest_distance", 0),
+        stats.get("q75_nearest_distance", 0)
+    ]
+    # Use colorblind-friendly palette with semantic meaning
+    colors = ['#3498db', '#2ecc71', '#f39c12', '#9b59b6', '#e74c3c']
+    fig.add_trace(
+        go.Bar(
+            x=metrics,
+            y=values,
+            marker_color=colors,
+            text=[f"{v:.6f}" if v > 0 else "0" for v in values],
+            textposition='outside',
+            textfont=dict(size=10, color="#2c3e50"),
+            hovertemplate="<b>%{x}</b><br>Value: %{y:.6f}<extra></extra>",
+            name="Distance Metrics"
+        ),
+        row=1, col=1
+    )
+    # Customize axes for better readability
+    fig.update_xaxes(title_text="Distance Metrics", row=1, col=1, tickangle=45)
+    fig.update_yaxes(title_text="Distance Value", row=1, col=1, tickformat=".2e")
+def _add_risk_assessment(fig, result: Dict[str, Any]):
+    """Add enhanced privacy risk assessment gauge"""
+    risk_level = result.get("privacy_assessment", {}).get("risk_level", "UNKNOWN")
+    epsilon = result.get("privacy_assessment", {}).get("primary_epsilon", 0)
+    # Enhanced risk color palette with better accessibility
+    risk_colors = {
+        "EXCEPTIONAL": "#27ae60", "VERY LOW": "#2ecc71", "LOW": "#f1c40f",
+        "MEDIUM": "#e67e22", "HIGH": "#e74c3c", "VERY HIGH": "#c0392b",
+        "CRITICAL": "#8e44ad", "UNKNOWN": "#7f8c8d"
+    }
+    # Determine gauge range based on epsilon value
+    max_range = max(5.0, epsilon * 1.5) if epsilon > 0 else 5.0
+    fig.add_trace(
+        go.Indicator(
+            mode="gauge+number+delta",
+            value=epsilon,
+            title={
+                "text": f"<b>ε-DP Privacy Budget</b><br><span style='font-size:14px'>{risk_level} Risk</span>",
+                "font": {"size": 16}
+            },
+            number={"font": {"size": 24, "color": risk_colors.get(risk_level, "#7f8c8d")}},
+            delta={"reference": 1.0, "valueformat": ".6f"},
+            gauge={
+                "axis": {
+                    "range": [0, max_range],
+                    "tickwidth": 1,
+                    "tickcolor": "#2c3e50",
+                    "tickfont": {"size": 10}
+                },
+                "bar": {"color": risk_colors.get(risk_level, "#7f8c8d"), "thickness": 0.8},
+                "steps": [
+                    {"range": [0, 0.01], "color": "#d5f4e6", "name": "Exceptional"},
+                    {"range": [0.01, 0.1], "color": "#a9dfbf", "name": "Very Low"},
+                    {"range": [0.1, 0.5], "color": "#fcf3cf", "name": "Low"},
+                    {"range": [0.5, 1.0], "color": "#f8c471", "name": "Medium"},
+                    {"range": [1.0, 2.0], "color": "#f1948a", "name": "High"},
+                    {"range": [2.0, max_range], "color": "#e8daef", "name": "Critical"}
+                ],
+                "threshold": {
+                    "line": {"color": "#2c3e50", "width": 3},
+                    "thickness": 0.9,
+                    "value": 1.0  # Reference line at ε = 1.0
                 }
+            }
+        ),
+        row=1, col=2
+    )
+def _add_quality_indicators(fig, result: Dict[str, Any]):
+    """Add data quality indicators with memorization detection"""
+    stats = result.get("distance_statistics", {})
+    # Calculate meaningful quality percentages
+    total_samples = result.get("dataset_info", {}).get("real_samples_used", 1)
+    zero_distances = stats.get("zero_distance_count", 0)
+    small_distances = stats.get("small_distance_count", 0)
+    # Create scatter plot showing memorization vs privacy
+    memorization_pct = (zero_distances / total_samples) * 100 if total_samples > 0 else 0
+    near_memorization_pct = ((small_distances - zero_distances) / total_samples) * 100 if total_samples > 0 else 0
+    safe_samples_pct = 100 - memorization_pct - near_memorization_pct
+    categories = ["Safe Samples", "Near Matches", "Exact Matches"]
+    percentages = [safe_samples_pct, near_memorization_pct, memorization_pct]
+    colors = ['#27ae60', '#f39c12', '#e74c3c']
+    # Create horizontal bar chart for better readability
+    fig.add_trace(
+        go.Bar(
+            y=categories,
+            x=percentages,
+            orientation='h',
+            marker_color=colors,
+            text=[f"{p:.1f}%" for p in percentages],
+            textposition='auto',
+            hovertemplate="<b>%{y}</b><br>Percentage: %{x:.1f}%<br>Count: %{customdata}<extra></extra>",
+            customdata=[total_samples - small_distances, small_distances - zero_distances, zero_distances],
+            name="Data Quality"
+        ),
+        row=1, col=3
+    )
+    fig.update_xaxes(title_text="Percentage of Samples", row=1, col=3, range=[0, 100])
+    fig.update_yaxes(title_text="Sample Categories", row=1, col=3)
+def _add_dataset_overview(fig, result: Dict[str, Any]):
+    """Add dataset overview with key metrics"""
+    dataset_info = result.get("dataset_info", {})
+    metrics = ["Real Samples", "Synthetic Samples", "Dimensions", "Common Features"]
+    values = [
+        dataset_info.get("real_samples_used", 0),
+        dataset_info.get("synthetic_samples", 0),
+        dataset_info.get("dimensions", 0),
+        dataset_info.get("common_features", 0)
+    ]
+    fig.add_trace(
+        go.Bar(
+            x=metrics,
+            y=values,
+            marker_color=['#3498db', '#9b59b6', '#e74c3c', '#2ecc71'],
+            text=[f"{v:,}" for v in values],
+            textposition='outside',
+            hovertemplate="<b>%{x}</b><br>Count: %{y:,}<extra></extra>",
+            name="Dataset Metrics"
+        ),
+        row=2, col=1
+    )
+    fig.update_xaxes(title_text="Dataset Characteristics", row=2, col=1, tickangle=45)
+    fig.update_yaxes(title_text="Count", row=2, col=1)
+def _add_privacy_bounds(fig, result: Dict[str, Any]):
+    """Add privacy bounds comparison across confidence levels"""
+    epsilon_bounds = result.get("epsilon_bounds", {})
+    confidence_levels = ["90%", "95%", "99%"]
+    epsilon_values = [
+        epsilon_bounds.get("eps_lb_90", 0),
+        epsilon_bounds.get("eps_lb_95", 0),
+        epsilon_bounds.get("eps_lb_99", 0)
+    ]
+    # Use gradient colors to show increasing confidence
+    colors = ['#52c41a', '#1890ff', '#722ed1']
+    fig.add_trace(
+        go.Bar(
+            x=confidence_levels,
+            y=epsilon_values,
+            marker_color=colors,
+            text=[f"ε = {v:.6f}" for v in epsilon_values],
+            textposition='outside',
+            hovertemplate="<b>%{x} Confidence</b><br>ε Lower Bound: %{y:.6f}<extra></extra>",
+            name="Privacy Bounds"
+        ),
+        row=2, col=2
+    )
+    fig.update_xaxes(title_text="Confidence Level", row=2, col=2)
+    fig.update_yaxes(title_text="ε Lower Bound", row=2, col=2, type="log" if max(epsilon_values) > 0 else "linear")
+def _add_processing_status(fig, result: Dict[str, Any]):
+    """Add processing pipeline status visualization"""
+    real_report = result.get("preprocessing_reports", {}).get("real_dataset", {})
+    synth_report = result.get("preprocessing_reports", {}).get("synthetic_dataset", {})
+    # Count completed processing steps
+    real_steps = len(real_report.get("steps_completed", []))
+    synth_steps = len(synth_report.get("steps_completed", []))
+    total_steps = 6  # Expected number of processing steps
+    datasets = ["Real Dataset", "Synthetic Dataset"]
+    completion = [real_steps / total_steps * 100, synth_steps / total_steps * 100]
+    colors = ['#28a745' if c == 100 else '#ffc107' for c in completion]
+    fig.add_trace(
+        go.Bar(
+            x=datasets,
+            y=completion,
+            marker_color=colors,
+            text=[f"{c:.0f}%<br>({int(c/100*total_steps)}/{total_steps})" for c in completion],
+            textposition='auto',
+            hovertemplate="<b>%{x}</b><br>Processing: %{y:.0f}% Complete<extra></extra>",
+            name="Processing Status"
+        ),
+        row=2, col=3
+    )
+    fig.update_xaxes(title_text="Dataset Type", row=2, col=3)
+    fig.update_yaxes(title_text="Processing Completion %", row=2, col=3, range=[0, 100])
 def create_safe_epsilon_plot(result: Dict[str, Any]) -> go.Figure:
     """Create safe epsilon plot with error handling"""

logs/privacy_audit_detailed.log CHANGED Viewed

@@ -27,3 +27,13 @@
 2025-09-07 02:43:44,053 - app - INFO - <module>:999 - Privacy auditor initialized successfully
 2025-09-07 02:43:44,583 - httpx - INFO - _send_single_request:1038 - HTTP Request: GET https://checkip.amazonaws.com/ "HTTP/1.1 200 "
 2025-09-07 02:43:44,859 - httpx - INFO - _send_single_request:1038 - HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"

 2025-09-07 02:43:44,053 - app - INFO - <module>:999 - Privacy auditor initialized successfully
 2025-09-07 02:43:44,583 - httpx - INFO - _send_single_request:1038 - HTTP Request: GET https://checkip.amazonaws.com/ "HTTP/1.1 200 "
 2025-09-07 02:43:44,859 - httpx - INFO - _send_single_request:1038 - HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
+2025-09-07 02:50:11,253 - app - INFO - <module>:51 - Privacy Auditor Starting - 2025-09-07 02:50:11
+2025-09-07 02:50:11,254 - app - INFO - __init__:265 - Initialized Privacy Auditor - Session: df0d8164
+2025-09-07 02:50:11,254 - app - INFO - __init__:266 - Configuration: {'confidence_level': 0.95, 'subsample_size': None, 'categorical_encoding': 'onehot', 'numerical_scaling': 'standard', 'distance_metric': 'euclidean', 'enable_preprocessing_report': True, 'max_file_size_mb': 500, 'timeout_seconds': 300, 'enable_data_validation': True, 'chunk_size': 10000, 'max_categories_onehot': 50}
+2025-09-07 02:50:11,254 - app - INFO - <module>:999 - Privacy auditor initialized successfully
+2025-09-07 02:50:11,639 - httpx - INFO - _send_single_request:1038 - HTTP Request: GET https://checkip.amazonaws.com/ "HTTP/1.1 200 "
+2025-09-07 02:50:11,964 - httpx - INFO - _send_single_request:1038 - HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
+2025-09-07 02:50:30,254 - app - INFO - <module>:51 - Privacy Auditor Starting - 2025-09-07 02:50:30
+2025-09-07 02:50:30,255 - app - INFO - __init__:265 - Initialized Privacy Auditor - Session: 2d9998de
+2025-09-07 02:50:30,255 - app - INFO - __init__:266 - Configuration: {'confidence_level': 0.95, 'subsample_size': None, 'categorical_encoding': 'onehot', 'numerical_scaling': 'standard', 'distance_metric': 'euclidean', 'enable_preprocessing_report': True, 'max_file_size_mb': 500, 'timeout_seconds': 300, 'enable_data_validation': True, 'chunk_size': 10000, 'max_categories_onehot': 50}
+2025-09-07 02:50:30,255 - app - INFO - <module>:999 - Privacy auditor initialized successfully