Spaces:

intrect
/

artifactnet

Runtime error

File size: 23,776 Bytes

0020ddc

#!/usr/bin/env python3
# Purpose: ArtifactNet HF Spaces (ZeroGPU) — Gradio demo

"""ArtifactNet — AI Music Forensic Detector.

Build dedicated to HF Spaces + ZeroGPU.
  - Upload-only (YouTube/URL input removed)
  - Remote inference / residual snapshot / sqlite logging removed
  - Error reports are POSTed to api.intrect.io (optional)
  - AcoustID removed (keeps the API key private)
"""

import json
import os
import sys
import tempfile
import time
import warnings
from pathlib import Path

import gradio as gr
import numpy as np
import requests as _requests
import torch

# Put this file's own directory first on sys.path so the project-local imports
# below (config, inference, visualization) resolve regardless of the working
# directory the app is started from.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from config import SR, CHUNK_SAMPLES, MIN_CONFIDENT_DURATION
from inference.audio_utils import load_audio_mono_tensor, get_audio_info
from inference.e2e_model import run_e2e_inference, load_models
from visualization.feature_bars import plot_feature_bars
from visualization.radar import plot_forensic_radar, forensic_features_explanation
from visualization.spectrogram import plot_spectrograms
from visualization.timeline import plot_timeline

# Silence every warning category for the whole process.
warnings.filterwarnings("ignore")

# Base URL that error reports are POSTed to; override with INTRECT_API_BASE.
API_BASE = os.environ.get("INTRECT_API_BASE", "https://api.intrect.io")

# ============================================================
# Upload validation
# ============================================================

_AUDIO_MAGIC = {
    b"RIFF":     "wav",
    b"fLaC":     "flac",
    b"\xff\xfb": "mp3",
    b"\xff\xf3": "mp3",
    b"\xff\xf2": "mp3",
    b"ID3":      "mp3",
    b"OggS":     "ogg",
}
_FTYP_BRANDS = {b"M4A ", b"isom", b"mp42", b"dash", b"MSNV"}
_MAX_UPLOAD_BYTES = 100 * 1024 * 1024
_ALLOWED_EXTENSIONS = {".wav", ".flac", ".mp3", ".ogg", ".opus", ".m4a", ".aac", ".webm"}


def _validate_audio_file(path: str) -> str | None:
    if not os.path.isfile(path):
        return "<p style='color:#ff4757'>파일을 찾을 수 없습니다.</p>"
    file_size = os.path.getsize(path)
    if file_size > _MAX_UPLOAD_BYTES:
        mb = file_size / 1024 / 1024
        return f"<p style='color:#ff4757'>파일이 너무 큽니다 ({mb:.0f}MB). 최대 100MB까지 허용됩니다.</p>"
    if file_size < 100:
        return "<p style='color:#ff4757'>파일이 너무 작습니다.</p>"

    ext = os.path.splitext(path)[1].lower()
    if ext not in _ALLOWED_EXTENSIONS:
        return (f"<p style='color:#ff4757'>지원하지 않는 형식입니다 ({ext}). "
                f"WAV, FLAC, MP3, OGG, Opus, M4A만 지원합니다.</p>")

    try:
        with open(path, "rb") as f:
            header = f.read(12)
    except Exception:
        return "<p style='color:#ff4757'>파일을 읽을 수 없습니다.</p>"

    detected = None
    for magic, fmt in _AUDIO_MAGIC.items():
        if header[:len(magic)] == magic:
            detected = fmt
            break
    if detected is None and header[4:8] == b"ftyp":
        if header[8:12] in _FTYP_BRANDS:
            detected = "m4a"
    if detected is None and header[:4] == b"\x1a\x45\xdf\xa3":
        detected = "webm"

    if detected is None:
        return ("<p style='color:#ff4757'>유효한 오디오 파일이 아닙니다.</p>")
    return None


# ============================================================
# Verdict stats
# ============================================================
# Decision threshold: analyze_audio labels a track "AI Generated" when its
# (weighted) median segment probability is at or above this value.
_MEDIAN_THRESHOLD = 0.5


def _compute_segment_stats(chunk_probs, chunk_metadata=None):
    """Summarise per-segment P(AI) values into distribution statistics.

    Args:
        chunk_probs: Sequence of per-segment probabilities.
        chunk_metadata: Optional sequence of dicts, one per segment. When its
            length matches ``chunk_probs``, each dict's ``"rms"`` entry
            (default 1.0) weights that segment in ``weighted_median``.

    Returns:
        Dict with the keys ``n``, ``mean``, ``median``, ``weighted_median``,
        ``q25``, ``q75``, ``iqr``, ``std``, ``pct_high``, ``pct_above_50``,
        ``pct_low``, ``n_high``, ``n_mid`` and ``n_low``. For empty input
        every value is zero, so the result stays finite and JSON-serialisable.
    """
    arr = np.asarray(chunk_probs)
    n = len(arr)
    if n == 0:
        # Percentiles and means of an empty array are NaN (or raise, depending
        # on the NumPy version); report an explicit all-zero summary instead.
        return {
            "n": 0,
            "mean": 0.0,
            "median": 0.0,
            "weighted_median": 0.0,
            "q25": 0.0,
            "q75": 0.0,
            "iqr": 0.0,
            "std": 0.0,
            "pct_high": 0.0,
            "pct_above_50": 0.0,
            "pct_low": 0.0,
            "n_high": 0,
            "n_mid": 0,
            "n_low": 0,
        }

    q25, q50, q75 = np.percentile(arr, [25, 50, 75])

    # Default to the plain median; replaced below when usable weights exist.
    weighted_median = float(q50)
    if chunk_metadata and len(chunk_metadata) == len(chunk_probs):
        rms_arr = np.array([m.get('rms', 1.0) for m in chunk_metadata])
        weights = rms_arr / (np.median(rms_arr) + 1e-10)
        total = weights.sum()
        # All-zero RMS makes the total 0 and normalising would give NaN
        # weights; keep the unweighted median in that case.
        if np.isfinite(total) and total > 0:
            weights = weights / total
            order = np.argsort(arr)
            cumulative = np.cumsum(weights[order])
            # First segment (in ascending probability) at which the
            # accumulated weight reaches one half.
            idx = np.searchsorted(cumulative, 0.5)
            weighted_median = float(arr[order][min(idx, n - 1)])

    high = arr >= 0.8
    above_half = arr >= 0.5
    return {
        "n": n,
        "mean": float(np.mean(arr)),
        "median": float(q50),
        "weighted_median": weighted_median,
        "q25": float(q25),
        "q75": float(q75),
        "iqr": float(q75 - q25),
        "std": float(np.std(arr)),
        "pct_high": float(high.sum() / n),
        "pct_above_50": float(above_half.sum() / n),
        "pct_low": float((arr < 0.2).sum() / n),
        "n_high": int(high.sum()),
        "n_mid": int((above_half & ~high).sum()),
        "n_low": int((~above_half).sum()),
    }


# ============================================================
# Verdict HTML card
# ============================================================

def _verdict_html(verdict, stats, is_stereo, duration=0, elapsed=0,
                  is_short=False, audio_format=""):
    """Render the verdict card as an HTML string.

    ``verdict == "No file"`` produces a placeholder card and ignores all other
    arguments. For any other verdict, ``stats`` must provide ``mean``,
    ``median``, ``pct_high``, ``n``, ``n_high``, ``n_mid``, ``n_low`` and
    ``iqr``. ``"AI Generated"`` and ``"Partial AI"`` get their own colour and
    description; every other label is rendered with the green "no indicators"
    styling.
    """
    # Placeholder shown before anything has been analysed.
    if verdict == "No file":
        return """
        <div style="text-align:center;padding:30px;background:#16213e;
                    border-radius:12px;color:#888;">
            <p style="font-size:16px;">Upload an audio file to begin analysis</p>
        </div>"""

    mean_prob, median_prob = stats["mean"], stats["median"]
    pct_high, n_total = stats["pct_high"], stats["n"]

    # Accent colour, icon entity and one-line description per verdict.
    if verdict == "AI Generated":
        color, icon = "#ff4757", "&#9888;"
        desc = f"{pct_high:.0%} of segments show strong AI indicators"
    elif verdict == "Partial AI":
        color, icon = "#ffa502", "&#9888;"
        desc = (f"Bimodal distribution (IQR={stats.get('iqr', 0):.2f}) — "
                "possible AI vocals over human instrumental")
    else:
        color, icon = "#2ed573", "&#10003;"
        desc = "No significant AI generation indicators found"

    if is_stereo:
        channels = "Stereo"
    else:
        channels = "Mono"

    # Widths of the three-colour distribution bar, in percent of all segments.
    n_high, n_mid, n_low = (stats[key] for key in ("n_high", "n_mid", "n_low"))
    if n_total > 0:
        pct_h, pct_m, pct_l = (count / n_total * 100
                               for count in (n_high, n_mid, n_low))
    else:
        # No segments: draw the whole bar in the "low" colour.
        pct_h, pct_m, pct_l = 0.0, 0.0, 100.0

    # Optional caution boxes appended at the bottom of the card.
    if is_short:
        short_warn = f"""
        <div style="margin-top:8px;padding:8px 12px;background:rgba(255,165,2,0.15);
                    border-radius:6px;border-left:3px solid #ffa502;font-size:12px;
                    color:#ccc;line-height:1.5;">
            <b style="color:#ffa502;">Short file ({duration:.0f}s):</b>
            Files under {MIN_CONFIDENT_DURATION}s have fewer segments for analysis.
            Use tracks longer than {MIN_CONFIDENT_DURATION}s for best results.
        </div>"""
    else:
        short_warn = ""

    if is_stereo:
        mono_warn = ""
    else:
        mono_warn = """
        <div style="margin-top:8px;padding:6px 10px;background:rgba(255,165,2,0.15);
                    border-radius:6px;border-left:3px solid #ffa502;font-size:12px;">
            Mono input — stereo phase features unavailable.
        </div>"""

    headline = verdict.upper()
    iqr_value = stats['iqr']
    return f"""
    <div style="text-align:center;padding:20px;background:#16213e;
                border-radius:12px;border:2px solid {color};">
        <div style="font-size:14px;color:{color};letter-spacing:1px;
                    text-transform:uppercase;font-weight:600;">
            {icon} Verdict
        </div>
        <div style="font-size:32px;font-weight:bold;color:{color};
                    letter-spacing:2px;margin:6px 0;">{headline}</div>
        <div style="color:#aaa;font-size:13px;margin-bottom:10px;">{desc}</div>
        <div style="font-size:36px;font-weight:bold;color:white;margin:4px 0;">
            median={median_prob:.1%} &nbsp;
            <span style="font-size:18px;color:#888;">mean={mean_prob:.1%}</span>
        </div>
        <div style="margin:10px auto;max-width:320px;">
            <div style="height:14px;background:#333;border-radius:7px;
                        overflow:hidden;display:flex;">
                <div style="width:{pct_h:.1f}%;background:#ff4757;"></div>
                <div style="width:{pct_m:.1f}%;background:#ffa502;"></div>
                <div style="width:{pct_l:.1f}%;background:#2ed573;"></div>
            </div>
            <div style="display:flex;justify-content:space-between;
                        font-size:10px;color:#888;margin-top:2px;">
                <span style="color:#ff4757;">{n_high} high</span>
                <span style="color:#ffa502;">{n_mid} mid</span>
                <span style="color:#2ed573;">{n_low} low</span>
            </div>
        </div>
        <div style="color:#999;font-size:13px;margin-top:10px;">
            {n_total} segments &nbsp;|&nbsp;
            IQR={iqr_value:.2f} &nbsp;|&nbsp;
            {channels} &nbsp;|&nbsp;
            {duration:.1f}s &nbsp;|&nbsp;
            {elapsed:.1f}s
        </div>
        <div style="display:flex;justify-content:center;gap:12px;margin-top:8px;">
            <span style="background:#16213e;border:1px solid #333;border-radius:6px;
                         padding:4px 10px;font-size:12px;color:#3498db;">
                Format: <b>{audio_format}</b>
            </span>
        </div>
        {short_warn}
        {mono_warn}
    </div>"""


# ============================================================
# Main analysis (Upload only)
# ============================================================

def _empty_outputs(card_html):
    """Return the 8-value output tuple for a run that produced only a message."""
    return card_html, None, None, None, None, None, None, {}


def _decide_verdict(seg_stats):
    """Return ``(verdict, decision_probability)`` for the given segment statistics."""
    decision_prob = seg_stats.get("weighted_median", seg_stats["median"])
    verdict = "AI Generated" if decision_prob >= _MEDIAN_THRESHOLD else "Human-Made"

    # A wide spread with enough segments in both the high (>= 0.8) and the
    # below-0.5 groups overrides the median-based call.
    min_side = max(3, seg_stats.get("n", 1) * 0.1)
    if (seg_stats.get("iqr", 0) >= 0.4
            and seg_stats.get("n_high", 0) >= min_side
            and seg_stats.get("n_low", 0) >= min_side):
        verdict = "Partial AI"
    return verdict, decision_prob


def _write_result_json(result):
    """Write ``result`` to a JSON file in a fresh temp directory; return its path."""
    # One directory per analysis: a fixed path in the shared temp dir is
    # predictable and can be overwritten by another request. The file name is
    # kept so the downloaded file is still called artifactnet_result.json.
    out_dir = tempfile.mkdtemp(prefix="artifactnet_")
    json_path = os.path.join(out_dir, "artifactnet_result.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)
    return json_path


def analyze_audio(audio_path, progress=gr.Progress()):
    """Run the full analysis for one uploaded file.

    Args:
        audio_path: Path of the uploaded audio file, or ``None`` when nothing
            was uploaded.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        An 8-tuple matching the ``outputs`` list wired up in ``build_ui``:
        verdict HTML, spectrogram plot, timeline plot, radar plot, feature-bar
        plot, forensic explanation, path of the result JSON file, and the
        state dict later read by ``submit_error_report``. When there is no
        file, validation fails, loading fails or no segments come back, only
        the first element (an HTML message) is populated.
    """
    # Local import so this block does not depend on a file-level import.
    from html import escape

    if audio_path is None:
        return _empty_outputs(_verdict_html("No file", {}, False, 0, 0, False))

    file_err = _validate_audio_file(audio_path)
    if file_err:
        return _empty_outputs(file_err)

    progress(0, desc="🎵 Loading audio...")
    t0 = time.perf_counter()  # monotonic clock for the elapsed-time readout

    try:
        mono_tensor, audio_np, is_stereo = load_audio_mono_tensor(audio_path)
    except Exception as e:
        # The loader's message may contain paths or markup; escape it before
        # embedding it in HTML.
        return _empty_outputs(
            f"<p style='color:#ff4757'>Error loading audio: {escape(str(e))}</p>")

    info = get_audio_info(audio_np, is_stereo)
    mono_np = mono_tensor.numpy()
    duration = info["duration"]

    progress(0.2, desc="🔬 Running AI forensic analysis on CPU (ONNX)...")
    chunk_probs, _, chunk_metadata, forensic_stats, _, _ = \
        run_e2e_inference(mono_tensor)

    if len(chunk_probs) == 0:
        # Without segments there is nothing to aggregate or plot.
        return _empty_outputs(
            "<p style='color:#ff4757'>No analyzable segments were found in this file.</p>")

    progress(0.6, desc="📊 Computing distribution statistics...")
    seg_stats = _compute_segment_stats(chunk_probs, chunk_metadata)
    elapsed = time.perf_counter() - t0

    progress(0.8, desc="🎨 Generating visualizations...")
    is_short = duration < MIN_CONFIDENT_DURATION

    audio_ext = os.path.splitext(audio_path)[1].lower()
    fmt_map = {".wav": "WAV", ".flac": "FLAC", ".mp3": "MP3",
               ".opus": "Opus", ".ogg": "OGG", ".m4a": "M4A",
               ".aac": "AAC", ".webm": "WebM"}
    audio_format = fmt_map.get(audio_ext, audio_ext.lstrip(".").upper() or "Unknown")

    verdict, median_prob = _decide_verdict(seg_stats)

    verdict_html = _verdict_html(
        verdict, seg_stats, is_stereo,
        duration=duration, elapsed=elapsed,
        is_short=is_short, audio_format=audio_format,
    )

    spec_fig = plot_spectrograms(mono_np)
    timeline_fig = plot_timeline(
        chunk_probs, mono_np, chunk_metadata,
        weighted_median=seg_stats.get("weighted_median")
    )
    radar_fig = plot_forensic_radar(forensic_stats)
    bars_fig = plot_feature_bars(forensic_stats)
    forensic_explanation = forensic_features_explanation()

    filename = os.path.basename(audio_path)
    result_json = {
        "filename": filename,
        "verdict": verdict,
        "is_short_file": is_short,
        "duration_sec": round(duration, 2),
        "is_stereo": is_stereo,
        "elapsed_sec": round(elapsed, 2),
        "segment_stats": {k: round(v, 4) if isinstance(v, float) else v
                          for k, v in seg_stats.items()},
        # float() first: rounding a NumPy float32 yields a NumPy scalar that
        # json cannot serialise.
        "segment_probs": [round(float(p), 4) for p in chunk_probs],
        "format": audio_format,
    }
    json_path = _write_result_json(result_json)

    progress(1.0, desc="✅ Analysis complete!")

    analysis_state = {
        "filename": filename,
        "duration": duration,
        "is_stereo": is_stereo,
        "elapsed": elapsed,
        "verdict": verdict,
        "forensic_stats": forensic_stats,
        "seg_stats": seg_stats,
        "chunk_probs": chunk_probs,
        "is_short": is_short,
        "predicted_verdict": "ai" if verdict == "AI Generated" else (
            "real" if verdict == "Human-Made" else "unknown"
        ),
        "predicted_probability": round(median_prob, 6),
    }
    return (verdict_html, spec_fig, timeline_fig, radar_fig, bars_fig,
            forensic_explanation, json_path, analysis_state)


# ============================================================
# Error report → api.intrect.io
# ============================================================

def submit_error_report(analysis_state, reported_as: str, comment: str):
    """Send a "this verdict looks wrong" report to the reports endpoint.

    Args:
        analysis_state: State dict produced by ``analyze_audio``; must contain
            a ``filename`` for a report to be sent.
        reported_as: What the user believes the track is; lower-cased, with
            ``"unsure"`` used when empty.
        comment: Optional free text, stripped and truncated to 500 characters.

    Returns:
        A ``gr.update`` that makes the status element visible with a success
        or failure message. Failures are reported in the message rather than
        raised.
    """
    # Local import so this block does not depend on a file-level import.
    from html import escape

    def status(color, text):
        return gr.update(
            visible=True,
            value=f'<span style="color:{color};font-size:12px;">{text}</span>',
        )

    if not analysis_state or not analysis_state.get("filename"):
        return status("#ff7675", "Please analyze a file first.")

    meta = {
        "filename": analysis_state.get("filename"),
        "reported_as": (reported_as or "unsure").lower(),
        "comment": (comment or "").strip()[:500],
        "predicted_verdict": analysis_state.get("predicted_verdict"),
        "predicted_probability": analysis_state.get("predicted_probability"),
        "source_hint": "hf-space",
    }
    try:
        r = _requests.post(
            f"{API_BASE.rstrip('/')}/v1/reports",
            data={"report": json.dumps(meta)},
            timeout=10,
        )
        if r.status_code >= 300:
            try:
                detail = r.json().get("detail", r.text[:200])
            except Exception:
                # Body is not JSON (or not a JSON object): show the raw text.
                detail = r.text[:200]
            # The response body is remote-controlled text; escape it before
            # embedding it in HTML.
            return status("#ff7675", f"Report failed: {escape(str(detail))}")
    except Exception as e:
        # UI boundary: any failure becomes a visible message, never a crash.
        return status("#ff7675", f"Report failed: {escape(str(e))}")

    return status("#2ed573", "✅ Thanks! Report submitted.")


# ============================================================
# Gradio UI
# ============================================================

def build_ui():
    """Build the Gradio Blocks app and wire up its two button handlers.

    Components are declared in display order inside nested ``with`` contexts,
    so the order of the statements below defines the page layout.

    Returns:
        The ``gr.Blocks`` instance (not yet queued or launched).
    """
    # Dark theme with an orange accent.
    theme = gr.themes.Base(
        primary_hue="orange",
        secondary_hue="blue",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
    ).set(
        body_background_fill="#0f0f23",
        block_background_fill="#1a1a2e",
        block_border_color="#333",
        input_background_fill="#16213e",
        button_primary_background_fill="#ffa502",
        button_primary_text_color="black",
    )

    # Extra CSS: hides the default footer and styles the top navigation bar.
    custom_css = """
    .gradio-container { margin: 0 auto !important; }
    footer { display: none !important; }
    .gr-button-primary { border-radius: 8px !important; font-weight: 600 !important; }
    .gr-input, .gr-box { border-color: #333 !important; }
    .gr-panel { border-color: #333 !important; }
    h1, h2, h3 { font-family: 'Inter', sans-serif !important; }
    .demo-nav { display: flex; justify-content: space-between; align-items: center;
      padding: 12px 20px; border-bottom: 1px solid #333; margin: -16px -16px 16px; }
    .demo-nav a { color: #8b949e; text-decoration: none; font-size: 13px; }
    .demo-nav a:hover { color: #ffa502; }
    .demo-nav .brand { color: #ffa502; font-weight: 700; font-size: 16px; letter-spacing: 2px; text-transform: uppercase; }
    """

    with gr.Blocks(theme=theme, css=custom_css,
                   title="ArtifactNet — AI Music Forensic Detector") as demo:
        # Top navigation bar.
        gr.HTML("""
        <div class="demo-nav">
            <a href="https://intrect.io" class="brand">Intrect</a>
            <div style="display:flex;gap:20px;align-items:center;">
                <a href="https://intrect.io">Home</a>
                <a href="https://dash.intrect.io">Dashboard</a>
                <a href="https://intrect.io/#pricing">Pricing</a>
            </div>
        </div>
        """)

        # Title block with the CPU-runtime notice.
        gr.HTML(f"""
        <div style="text-align:center;padding:16px 0 8px;">
            <h1 style="color:white;font-size:26px;margin:0;letter-spacing:-0.5px;">
                ArtifactNet
            </h1>
            <p style="color:#6e7681;font-size:13px;margin:4px 0 0;">
                AI-Generated Music Detection — ONNX Runtime CPU
            </p>
            <div style="margin:8px auto;max-width:540px;padding:6px 12px;background:rgba(255,165,2,0.12);
                        border:1px solid #ffa502;border-radius:8px;font-size:12px;color:#ffa502;">
                Running on CPU — a 4-minute track takes ~30–60 s.
            </div>
        </div>
        """)

        # Row 1: upload + Analyze button on the left, verdict card and the
        # feedback form on the right.
        with gr.Row():
            with gr.Column(scale=1):
                audio_input = gr.Audio(
                    label="WAV / MP3 / FLAC (max 100MB, 5 min)",
                    type="filepath",
                    sources=["upload"],
                )
                analyze_btn = gr.Button("Analyze", variant="primary", size="lg")
            with gr.Column(scale=1):
                verdict_output = gr.HTML(
                    value=_verdict_html("No file", {}, False, 0, 0, False),
                    label="Verdict",
                )
                with gr.Accordion("Think this result is wrong?", open=False):
                    gr.HTML(
                        """<p style="color:#aaa;font-size:12px;margin:4px 0;">
                        Help us improve — anonymous feedback.
                        </p>"""
                    )
                    # Choices are (label shown, value passed to the handler).
                    report_reported_as = gr.Radio(
                        choices=[
                            ("It should be AI", "ai"),
                            ("It should be Real / Human", "real"),
                            ("Unsure / Mixed", "unsure"),
                        ],
                        label="What do you think it actually is?",
                        value="ai",
                    )
                    report_comment = gr.Textbox(
                        label="Optional comment (≤500 chars)",
                        placeholder="Any context we should know?",
                        max_lines=3,
                        lines=2,
                    )
                    report_submit_btn = gr.Button("🚩 Submit report", variant="secondary", size="sm")
                    # Hidden until submit_error_report returns a status message.
                    report_status = gr.HTML(value="", visible=False)

        # Plot rows, filled in by analyze_audio.
        with gr.Row():
            spec_output = gr.Plot(label="Spectral Analysis")

        with gr.Row():
            with gr.Column(scale=2):
                timeline_output = gr.Plot(label="P(AI) Timeline")
            with gr.Column(scale=1):
                radar_output = gr.Plot(label="Forensic Features")

        with gr.Row():
            bars_output = gr.Plot(label="Feature Strength Analysis")

        # Receives the forensic explanation output but is never made visible here.
        forensic_explanation_output = gr.HTML(visible=False)

        with gr.Row():
            json_output = gr.File(label="Result JSON", visible=True)

        with gr.Accordion("About ArtifactNet", open=False):
            gr.HTML(f"""
            <div style="color:#ccc;font-size:13px;line-height:1.6;padding:10px;">
                <h3 style="color:white;">Overview</h3>
                <p>ArtifactNet is a neural forensic detector for AI-generated music.
                It uses HPSS and 7-channel forensic features to detect generation artifacts.</p>

                <h3 style="color:white;">Pipeline</h3>
                <ol>
                    <li>STFT + U-Net artifact residual</li>
                    <li>HPSS (harmonic-percussive separation)</li>
                    <li>7ch features (mel, H/P ratio, temporal derivatives, spectral flux)</li>
                    <li>CNN classifier → per-segment P(AI)</li>
                    <li>Median aggregation across segments</li>
                </ol>

                <h3 style="color:white;">Limitations</h3>
                <ul>
                    <li>Short files (&lt;{MIN_CONFIDENT_DURATION}s) have lower confidence</li>
                    <li>Mono input reduces accuracy</li>
                    <li>Heavily processed audio may affect results</li>
                </ul>
                <p style="color:#888;font-size:11px;margin-top:10px;">
                    Research project — interpret alongside other evidence. See
                    <a href="https://api.intrect.io/legal/disclaimer" style="color:#6e7681;">Disclaimer</a>.
                </p>
            </div>
            """)

        # Per-session state carrying the last analysis to the report handler.
        analysis_state = gr.State({})
        # Order must match the tuple returned by analyze_audio.
        outputs = [verdict_output, spec_output, timeline_output,
                   radar_output, bars_output, forensic_explanation_output,
                   json_output, analysis_state]

        analyze_btn.click(
            fn=analyze_audio,
            inputs=[audio_input],
            outputs=outputs,
            api_name=False,
            concurrency_limit=1,
            concurrency_id="gpu_inference",
        )

        report_submit_btn.click(
            fn=submit_error_report,
            inputs=[analysis_state, report_reported_as, report_comment],
            outputs=[report_status],
        )

        # Page footer with product and legal links.
        gr.HTML("""
        <div style="text-align:center;padding:24px 0 8px;border-top:1px solid #333;margin-top:24px;">
            <p style="color:#484f58;font-size:12px;margin:0;">
                Powered by <a href="https://intrect.io" style="color:#ffa502;text-decoration:none;">Intrect</a>
                &nbsp;|&nbsp; <a href="https://dash.intrect.io" style="color:#6e7681;text-decoration:none;">Dashboard</a>
                &nbsp;|&nbsp; <a href="https://intrect.io/#pricing" style="color:#6e7681;text-decoration:none;">Pricing</a>
            </p>
            <p style="color:#484f58;font-size:11px;margin:6px 0 0;">
                <a href="https://api.intrect.io/legal/terms" style="color:#6e7681;text-decoration:none;">Terms</a>
                &nbsp;&middot;&nbsp; <a href="https://api.intrect.io/legal/privacy" style="color:#6e7681;text-decoration:none;">Privacy</a>
                &nbsp;&middot;&nbsp; <a href="https://api.intrect.io/legal/disclaimer" style="color:#6e7681;text-decoration:none;">Disclaimer</a>
            </p>
            <p style="color:#484f58;font-size:10px;margin:8px 0 0;font-style:italic;">
                ArtifactNet provides forensic indicators, not conclusive legal proof.
            </p>
        </div>
        """)

    return demo


# ============================================================
# Entry point
# ============================================================

# Everything down to `demo.queue(...)` runs at import time: models are loaded
# first, then the UI is built, so `demo` exists at module level.
print("[hf-spaces] downloading ONNX models from HF Hub...", flush=True)
load_models()
print("[hf-spaces] models ready (onnxruntime CPU).", flush=True)

demo = build_ui()
# Enable the request queue with the size and concurrency limits given here.
demo.queue(max_size=10, default_concurrency_limit=1)


# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()