Spaces:

intrect
/

artifactnet

Runtime error

App Files Files

artifactnet / app.py

intrect

feat(space): CPU ONNX runtime build (v9.4, full-song sliding aggregation)

0020ddc 30 days ago

raw

history blame

23.8 kB

	#!/usr/bin/env python3
	# Purpose: ArtifactNet HF Spaces (CPU, ONNX Runtime) — Gradio demo

	"""ArtifactNet — AI Music Forensic Detector.

	HF Spaces-only build (originally ZeroGPU; this revision runs inference on CPU via ONNX Runtime).
	- Upload-only (YouTube/URL input removed)
	- Remote inference / residual snapshot / sqlite logging removed
	- Error reports are POSTed to api.intrect.io (optional)
	- AcoustID removed (keeps the API key private)
	"""

	import json
	import os
	import sys
	import tempfile
	import time
	import warnings
	from pathlib import Path

	import gradio as gr
	import numpy as np
	import requests as _requests
	import torch

	sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

	from config import SR, CHUNK_SAMPLES, MIN_CONFIDENT_DURATION
	from inference.audio_utils import load_audio_mono_tensor, get_audio_info
	from inference.e2e_model import run_e2e_inference, load_models
	from visualization.feature_bars import plot_feature_bars
	from visualization.radar import plot_forensic_radar, forensic_features_explanation
	from visualization.spectrogram import plot_spectrograms
	from visualization.timeline import plot_timeline

	warnings.filterwarnings("ignore")

	API_BASE = os.environ.get("INTRECT_API_BASE", "https://api.intrect.io")

	# ============================================================
	# Upload validation
	# ============================================================

	_AUDIO_MAGIC = {
	b"RIFF": "wav",
	b"fLaC": "flac",
	b"\xff\xfb": "mp3",
	b"\xff\xf3": "mp3",
	b"\xff\xf2": "mp3",
	b"ID3": "mp3",
	b"OggS": "ogg",
	}
	_FTYP_BRANDS = {b"M4A ", b"isom", b"mp42", b"dash", b"MSNV"}
	_MAX_UPLOAD_BYTES = 100 * 1024 * 1024
	_ALLOWED_EXTENSIONS = {".wav", ".flac", ".mp3", ".ogg", ".opus", ".m4a", ".aac", ".webm"}


	def _validate_audio_file(path: str) -> str \| None:
	if not os.path.isfile(path):
	return "<p style='color:#ff4757'>파일을 찾을 수 없습니다.</p>"
	file_size = os.path.getsize(path)
	if file_size > _MAX_UPLOAD_BYTES:
	mb = file_size / 1024 / 1024
	return f"<p style='color:#ff4757'>파일이 너무 큽니다 ({mb:.0f}MB). 최대 100MB까지 허용됩니다.</p>"
	if file_size < 100:
	return "<p style='color:#ff4757'>파일이 너무 작습니다.</p>"

	ext = os.path.splitext(path)[1].lower()
	if ext not in _ALLOWED_EXTENSIONS:
	return (f"<p style='color:#ff4757'>지원하지 않는 형식입니다 ({ext}). "
	f"WAV, FLAC, MP3, OGG, Opus, M4A만 지원합니다.</p>")

	try:
	with open(path, "rb") as f:
	header = f.read(12)
	except Exception:
	return "<p style='color:#ff4757'>파일을 읽을 수 없습니다.</p>"

	detected = None
	for magic, fmt in _AUDIO_MAGIC.items():
	if header[:len(magic)] == magic:
	detected = fmt
	break
	if detected is None and header[4:8] == b"ftyp":
	if header[8:12] in _FTYP_BRANDS:
	detected = "m4a"
	if detected is None and header[:4] == b"\x1a\x45\xdf\xa3":
	detected = "webm"

	if detected is None:
	return ("<p style='color:#ff4757'>유효한 오디오 파일이 아닙니다.</p>")
	return None


	# ============================================================
	# Verdict stats
	# ============================================================
	# Cut-off applied to the (weighted) median P(AI) when picking the verdict.
	_MEDIAN_THRESHOLD = 0.5


	def _compute_segment_stats(chunk_probs, chunk_metadata=None):
	    """Summarise per-segment P(AI) values into distribution statistics.

	    Args:
	        chunk_probs: Sequence of per-segment probabilities.
	        chunk_metadata: Optional sequence of dicts, one per segment. When its
	            length matches ``chunk_probs`` each dict's ``"rms"`` value
	            (default 1.0) weights that segment in the weighted median.

	    Returns:
	        Dict with ``n``, ``mean``, ``median``, ``weighted_median``, ``q25``,
	        ``q75``, ``iqr``, ``std``, the fractions ``pct_high`` (>= 0.8),
	        ``pct_above_50`` (>= 0.5) and ``pct_low`` (< 0.2), and the counts
	        ``n_high`` (>= 0.8), ``n_mid`` ([0.5, 0.8)) and ``n_low`` (< 0.5).
	        An empty input yields the same keys with every value zero.
	    """
	    arr = np.array(chunk_probs, dtype=float)
	    n = len(arr)
	    if n == 0:
	        # np.percentile / np.mean are undefined on an empty array; return a
	        # well-formed all-zero summary instead of NaNs or an exception.
	        return {
	            "n": 0,
	            "mean": 0.0,
	            "median": 0.0,
	            "weighted_median": 0.0,
	            "q25": 0.0,
	            "q75": 0.0,
	            "iqr": 0.0,
	            "std": 0.0,
	            "pct_high": 0.0,
	            "pct_above_50": 0.0,
	            "pct_low": 0.0,
	            "n_high": 0,
	            "n_mid": 0,
	            "n_low": 0,
	        }

	    q25, q50, q75 = np.percentile(arr, [25, 50, 75])

	    # Default to the plain median; replaced below when usable weights exist.
	    weighted_median = float(q50)
	    if chunk_metadata and len(chunk_metadata) == n:
	        rms_arr = np.array([m.get('rms', 1.0) for m in chunk_metadata], dtype=float)
	        weights = rms_arr / (np.median(rms_arr) + 1e-10)
	        total = weights.sum()
	        # All-zero (or non-finite) weights cannot be normalised: 0/0 would
	        # turn the cumulative sum into NaN and make the lookup meaningless.
	        if np.isfinite(total) and total > 0:
	            weights = weights / total
	            order = np.argsort(arr)
	            cumulative = np.cumsum(weights[order])
	            # First segment (in ascending P(AI)) whose cumulative weight
	            # reaches one half.
	            idx = int(np.searchsorted(cumulative, 0.5))
	            weighted_median = float(arr[order][min(idx, n - 1)])

	    high = arr >= 0.8
	    above_half = arr >= 0.5
	    return {
	        "n": n,
	        "mean": float(np.mean(arr)),
	        "median": float(q50),
	        "weighted_median": weighted_median,
	        "q25": float(q25),
	        "q75": float(q75),
	        "iqr": float(q75 - q25),
	        "std": float(np.std(arr)),
	        "pct_high": float(high.sum() / n),
	        "pct_above_50": float(above_half.sum() / n),
	        "pct_low": float((arr < 0.2).sum() / n),
	        "n_high": int(high.sum()),
	        "n_mid": int((above_half & ~high).sum()),
	        "n_low": int((~above_half).sum()),
	    }


	# ============================================================
	# Verdict HTML card
	# ============================================================

	def _verdict_html(verdict, stats, is_stereo, duration=0, elapsed=0,
	                  is_short=False, audio_format=""):
	    """Render the verdict card as an HTML string.

	    Args:
	        verdict: ``"No file"`` returns a placeholder card. ``"AI Generated"``
	            and ``"Partial AI"`` get their own colour and description; any
	            other value is styled as the green "no indicators" result.
	        stats: Segment statistics; reads ``mean``, ``median``, ``pct_high``,
	            ``n``, ``n_high``, ``n_mid``, ``n_low`` and (optionally) ``iqr``.
	            Not read at all for ``"No file"``.
	        is_stereo: Selects the "Stereo"/"Mono" label; mono adds a warning.
	        duration: Audio length in seconds, shown in the footer line.
	        elapsed: Processing time in seconds, shown in the footer line.
	        is_short: When true, appends the short-file warning.
	        audio_format: Format label shown in the badge.

	    Returns:
	        The HTML markup for the card.
	    """
	    if verdict == "No file":
	        return """
	<div style="text-align:center;padding:30px;background:#16213e;
	border-radius:12px;color:#888;">
	<p style="font-size:16px;">Upload an audio file to begin analysis</p>
	</div>"""

	    mean_prob = stats["mean"]
	    median_prob = stats["median"]
	    pct_high = stats["pct_high"]
	    n_total = stats["n"]
	    # Read once with a default so the description and the footer agree.
	    iqr = stats.get("iqr", 0)

	    if verdict == "AI Generated":
	        color = "#ff4757"
	        icon = "⚠"
	        desc = f"{pct_high:.0%} of segments show strong AI indicators"
	    elif verdict == "Partial AI":
	        color = "#ffa502"
	        icon = "⚠"
	        desc = f"Bimodal distribution (IQR={iqr:.2f}) — possible AI vocals over human instrumental"
	    else:
	        color = "#2ed573"
	        icon = "✓"
	        desc = "No significant AI generation indicators found"

	    channels = "Stereo" if is_stereo else "Mono"
	    n_high, n_mid, n_low = stats["n_high"], stats["n_mid"], stats["n_low"]
	    if n_total > 0:
	        pct_h = n_high / n_total * 100
	        pct_m = n_mid / n_total * 100
	        pct_l = n_low / n_total * 100
	    else:
	        # No segments: draw the bar entirely in the "low" colour.
	        pct_h = pct_m = 0.0
	        pct_l = 100.0

	    short_warn = ""
	    if is_short:
	        short_warn = f"""
	<div style="margin-top:8px;padding:8px 12px;background:rgba(255,165,2,0.15);
	border-radius:6px;border-left:3px solid #ffa502;font-size:12px;
	color:#ccc;line-height:1.5;">
	<b style="color:#ffa502;">Short file ({duration:.0f}s):</b>
	Files under {MIN_CONFIDENT_DURATION}s have fewer segments for analysis.
	Use tracks longer than {MIN_CONFIDENT_DURATION}s for best results.
	</div>"""

	    mono_warn = ""
	    if not is_stereo:
	        mono_warn = """
	<div style="margin-top:8px;padding:6px 10px;background:rgba(255,165,2,0.15);
	border-radius:6px;border-left:3px solid #ffa502;font-size:12px;">
	Mono input — stereo phase features unavailable.
	</div>"""

	    # The footer separators are plain "|"; a backslash before them would be
	    # an invalid escape sequence and would show up literally in the card.
	    return f"""
	<div style="text-align:center;padding:20px;background:#16213e;
	border-radius:12px;border:2px solid {color};">
	<div style="font-size:14px;color:{color};letter-spacing:1px;
	text-transform:uppercase;font-weight:600;">
	{icon} Verdict
	</div>
	<div style="font-size:32px;font-weight:bold;color:{color};
	letter-spacing:2px;margin:6px 0;">{verdict.upper()}</div>
	<div style="color:#aaa;font-size:13px;margin-bottom:10px;">{desc}</div>
	<div style="font-size:36px;font-weight:bold;color:white;margin:4px 0;">
	median={median_prob:.1%}
	<span style="font-size:18px;color:#888;">mean={mean_prob:.1%}</span>
	</div>
	<div style="margin:10px auto;max-width:320px;">
	<div style="height:14px;background:#333;border-radius:7px;
	overflow:hidden;display:flex;">
	<div style="width:{pct_h:.1f}%;background:#ff4757;"></div>
	<div style="width:{pct_m:.1f}%;background:#ffa502;"></div>
	<div style="width:{pct_l:.1f}%;background:#2ed573;"></div>
	</div>
	<div style="display:flex;justify-content:space-between;
	font-size:10px;color:#888;margin-top:2px;">
	<span style="color:#ff4757;">{n_high} high</span>
	<span style="color:#ffa502;">{n_mid} mid</span>
	<span style="color:#2ed573;">{n_low} low</span>
	</div>
	</div>
	<div style="color:#999;font-size:13px;margin-top:10px;">
	{n_total} segments  |
	IQR={iqr:.2f}  |
	{channels}  |
	{duration:.1f}s  |
	{elapsed:.1f}s
	</div>
	<div style="display:flex;justify-content:center;gap:12px;margin-top:8px;">
	<span style="background:#16213e;border:1px solid #333;border-radius:6px;
	padding:4px 10px;font-size:12px;color:#3498db;">
	Format: <b>{audio_format}</b>
	</span>
	</div>
	{short_warn}
	{mono_warn}
	</div>"""


	# ============================================================
	# Main analysis (Upload only)
	# ============================================================

	def analyze_audio(audio_path, progress=gr.Progress()):
	    """Validate, analyse and visualise one uploaded audio file.

	    Args:
	        audio_path: Path of the uploaded file, or ``None`` when nothing was
	            uploaded.
	        progress: Gradio progress callback, called with a fraction and a
	            ``desc`` label at each stage.

	    Returns:
	        An 8-tuple in the order of the ``outputs`` list wired up in
	        ``build_ui``: verdict HTML, spectrogram figure, timeline figure,
	        radar figure, feature-bar figure, forensic explanation, path of the
	        result JSON file, and the analysis-state dict. On any early exit the
	        first slot carries a message, the middle six are ``None`` and the
	        state is ``{}``.
	    """
	    import html  # local import: only needed to escape error text

	    def _failure(message_html):
	        # Every early exit must still fill all eight output slots.
	        return message_html, None, None, None, None, None, None, {}

	    if audio_path is None:
	        return _failure(_verdict_html("No file", {}, False, 0, 0, False))

	    file_err = _validate_audio_file(audio_path)
	    if file_err:
	        return _failure(file_err)

	    progress(0, desc="🎵 Loading audio...")
	    t0 = time.time()

	    try:
	        mono_tensor, audio_np, is_stereo = load_audio_mono_tensor(audio_path)
	    except Exception as e:
	        # The exception text is not under our control (it may echo the
	        # uploaded filename), so escape it before embedding it in HTML.
	        return _failure(
	            f"<p style='color:#ff4757'>Error loading audio: {html.escape(str(e))}</p>"
	        )

	    info = get_audio_info(audio_np, is_stereo)
	    mono_np = mono_tensor.numpy()
	    duration = info["duration"]

	    progress(0.2, desc="🔬 Running AI forensic analysis on CPU (ONNX)...")
	    # Only four of the six returned values are used here.
	    chunk_probs, _, chunk_metadata, forensic_stats, _, _ = \
	        run_e2e_inference(mono_tensor)

	    if len(chunk_probs) == 0:
	        # Without segments there is nothing to aggregate or plot.
	        return _failure(
	            "<p style='color:#ff4757'>No analyzable audio segments were found in this file.</p>"
	        )

	    progress(0.6, desc="📊 Computing distribution statistics...")
	    seg_stats = _compute_segment_stats(chunk_probs, chunk_metadata)
	    # Reported time covers loading, inference and statistics, not plotting.
	    elapsed = time.time() - t0

	    progress(0.8, desc="🎨 Generating visualizations...")
	    is_short = duration < MIN_CONFIDENT_DURATION

	    audio_ext = os.path.splitext(audio_path)[1].lower()
	    fmt_map = {".wav": "WAV", ".flac": "FLAC", ".mp3": "MP3",
	               ".opus": "Opus", ".ogg": "OGG", ".m4a": "M4A",
	               ".aac": "AAC", ".webm": "WebM"}
	    audio_format = fmt_map.get(audio_ext, audio_ext.lstrip(".").upper() or "Unknown")

	    median_prob = seg_stats.get("weighted_median", seg_stats["median"])
	    verdict = "AI Generated" if median_prob >= _MEDIAN_THRESHOLD else "Human-Made"

	    # Override with "Partial AI" when the distribution is wide and both the
	    # high and low groups hold at least max(3, 10%) of the segments.
	    iqr = seg_stats.get("iqr", 0)
	    n_high = seg_stats.get("n_high", 0)
	    n_low = seg_stats.get("n_low", 0)
	    n_total = seg_stats.get("n", 1)
	    min_group = max(3, n_total * 0.1)
	    if iqr >= 0.4 and n_high >= min_group and n_low >= min_group:
	        verdict = "Partial AI"

	    verdict_html = _verdict_html(
	        verdict, seg_stats, is_stereo,
	        duration=duration, elapsed=elapsed,
	        is_short=is_short, audio_format=audio_format,
	    )

	    spec_fig = plot_spectrograms(mono_np)
	    timeline_fig = plot_timeline(
	        chunk_probs, mono_np, chunk_metadata,
	        weighted_median=seg_stats.get("weighted_median")
	    )
	    radar_fig = plot_forensic_radar(forensic_stats)
	    bars_fig = plot_feature_bars(forensic_stats)
	    forensic_explanation = forensic_features_explanation()

	    filename = os.path.basename(audio_path) if audio_path else "unknown"
	    # Coerce to built-in float/bool: NumPy scalars such as float32 or bool_
	    # are not JSON serialisable.
	    result_json = {
	        "filename": filename,
	        "verdict": verdict,
	        "is_short_file": bool(is_short),
	        "duration_sec": round(float(duration), 2),
	        "is_stereo": bool(is_stereo),
	        "elapsed_sec": round(elapsed, 2),
	        "segment_stats": {k: round(v, 4) if isinstance(v, float) else v
	                          for k, v in seg_stats.items()},
	        "segment_probs": [round(float(p), 4) for p in chunk_probs],
	        "format": audio_format,
	    }
	    json_path = os.path.join(tempfile.gettempdir(), "artifactnet_result.json")
	    with open(json_path, "w", encoding="utf-8") as f:
	        json.dump(result_json, f, indent=2)

	    progress(1.0, desc="✅ Analysis complete!")

	    # State handed to submit_error_report via gr.State.
	    analysis_state = {
	        "filename": filename,
	        "duration": duration,
	        "is_stereo": is_stereo,
	        "elapsed": elapsed,
	        "verdict": verdict,
	        "forensic_stats": forensic_stats,
	        "seg_stats": seg_stats,
	        "chunk_probs": chunk_probs,
	        "is_short": is_short,
	        "predicted_verdict": "ai" if verdict == "AI Generated" else (
	            "real" if verdict == "Human-Made" else "unknown"
	        ),
	        "predicted_probability": round(median_prob, 6),
	    }
	    return (verdict_html, spec_fig, timeline_fig, radar_fig, bars_fig,
	            forensic_explanation, json_path, analysis_state)


	# ============================================================
	# Error report → api.intrect.io
	# ============================================================

	def submit_error_report(analysis_state, reported_as: str, comment: str):
	    """POST a user's "this verdict is wrong" report to ``API_BASE``.

	    Args:
	        analysis_state: State dict from the last analysis; the report is
	            refused unless it carries a ``filename``.
	        reported_as: Label the user believes is correct; lower-cased, with
	            ``"unsure"`` substituted when empty.
	        comment: Optional free text; stripped and truncated to 500 characters.

	    Returns:
	        A ``gr.update`` that makes the status element visible and sets its
	        HTML to a success or failure message.
	    """
	    import html  # local import: only needed to escape remote/error text

	    def _status(color, text):
	        # Single place that builds the status markup.
	        return gr.update(
	            visible=True,
	            value=f'<span style="color:{color};font-size:12px;">{text}</span>',
	        )

	    if not analysis_state or not analysis_state.get("filename"):
	        return _status("#ff7675", "Please analyze a file first.")

	    meta = {
	        "filename": analysis_state.get("filename"),
	        "reported_as": (reported_as or "unsure").lower(),
	        "comment": (comment or "").strip()[:500],
	        "predicted_verdict": analysis_state.get("predicted_verdict"),
	        "predicted_probability": analysis_state.get("predicted_probability"),
	        "source_hint": "hf-space",
	    }
	    try:
	        with _requests.Session() as s:
	            r = s.post(
	                f"{API_BASE.rstrip('/')}/v1/reports",
	                data={"report": json.dumps(meta)},
	                timeout=10,
	            )
	        if r.status_code >= 300:
	            try:
	                detail = r.json().get("detail", r.text[:200])
	            except Exception:
	                # Body is not a JSON object: fall back to the raw text.
	                detail = r.text[:200]
	            # The server's response is untrusted text: escape it before it
	            # is rendered as HTML.
	            return _status("#ff7675", f"Report failed: {html.escape(str(detail))}")
	    except Exception as e:
	        # Reporting is best-effort; any failure is shown to the user rather
	        # than raised.
	        return _status("#ff7675", f"Report failed: {html.escape(str(e))}")

	    return _status("#2ed573", "✅ Thanks! Report submitted.")


	# ============================================================
	# Gradio UI
	# ============================================================

	def build_ui():
	    """Build the Gradio Blocks app and wire its two click handlers.

	    The "Analyze" button runs ``analyze_audio`` on the uploaded file and the
	    "Submit report" button runs ``submit_error_report`` on the stored
	    analysis state.

	    NOTE(review): the source this was rebuilt from had lost its indentation,
	    so the nesting of rows/columns/accordions below is inferred from
	    statement order and blank lines — confirm against the rendered layout.

	    Returns:
	        The ``gr.Blocks`` instance (not yet queued or launched).
	    """
	    theme = gr.themes.Base(
	        primary_hue="orange",
	        secondary_hue="blue",
	        neutral_hue="slate",
	        font=gr.themes.GoogleFont("Inter"),
	    ).set(
	        body_background_fill="#0f0f23",
	        block_background_fill="#1a1a2e",
	        block_border_color="#333",
	        input_background_fill="#16213e",
	        button_primary_background_fill="#ffa502",
	        button_primary_text_color="black",
	    )

	    custom_css = """
	.gradio-container { margin: 0 auto !important; }
	footer { display: none !important; }
	.gr-button-primary { border-radius: 8px !important; font-weight: 600 !important; }
	.gr-input, .gr-box { border-color: #333 !important; }
	.gr-panel { border-color: #333 !important; }
	h1, h2, h3 { font-family: 'Inter', sans-serif !important; }
	.demo-nav { display: flex; justify-content: space-between; align-items: center;
	padding: 12px 20px; border-bottom: 1px solid #333; margin: -16px -16px 16px; }
	.demo-nav a { color: #8b949e; text-decoration: none; font-size: 13px; }
	.demo-nav a:hover { color: #ffa502; }
	.demo-nav .brand { color: #ffa502; font-weight: 700; font-size: 16px; letter-spacing: 2px; text-transform: uppercase; }
	"""

	    with gr.Blocks(theme=theme, css=custom_css,
	                   title="ArtifactNet — AI Music Forensic Detector") as demo:
	        # Top navigation bar.
	        gr.HTML("""
	<div class="demo-nav">
	<a href="https://intrect.io" class="brand">Intrect</a>
	<div style="display:flex;gap:20px;align-items:center;">
	<a href="https://intrect.io">Home</a>
	<a href="https://dash.intrect.io">Dashboard</a>
	<a href="https://intrect.io/#pricing">Pricing</a>
	</div>
	</div>
	""")

	        # Title block (no placeholders, so a plain string rather than an f-string).
	        gr.HTML("""
	<div style="text-align:center;padding:16px 0 8px;">
	<h1 style="color:white;font-size:26px;margin:0;letter-spacing:-0.5px;">
	ArtifactNet
	</h1>
	<p style="color:#6e7681;font-size:13px;margin:4px 0 0;">
	AI-Generated Music Detection — ONNX Runtime CPU
	</p>
	<div style="margin:8px auto;max-width:540px;padding:6px 12px;background:rgba(255,165,2,0.12);
	border:1px solid #ffa502;border-radius:8px;font-size:12px;color:#ffa502;">
	Running on CPU — a 4-minute track takes ~30–60 s.
	</div>
	</div>
	""")

	        with gr.Row():
	            # Left: upload + trigger. Right: verdict card + feedback form.
	            with gr.Column(scale=1):
	                audio_input = gr.Audio(
	                    label="WAV / MP3 / FLAC (max 100MB, 5 min)",
	                    type="filepath",
	                    sources=["upload"],
	                )
	                analyze_btn = gr.Button("Analyze", variant="primary", size="lg")
	            with gr.Column(scale=1):
	                verdict_output = gr.HTML(
	                    value=_verdict_html("No file", {}, False, 0, 0, False),
	                    label="Verdict",
	                )
	                with gr.Accordion("Think this result is wrong?", open=False):
	                    gr.HTML(
	                        """<p style="color:#aaa;font-size:12px;margin:4px 0;">
	Help us improve — anonymous feedback.
	</p>"""
	                    )
	                    report_reported_as = gr.Radio(
	                        choices=[
	                            ("It should be AI", "ai"),
	                            ("It should be Real / Human", "real"),
	                            ("Unsure / Mixed", "unsure"),
	                        ],
	                        label="What do you think it actually is?",
	                        value="ai",
	                    )
	                    report_comment = gr.Textbox(
	                        label="Optional comment (≤500 chars)",
	                        placeholder="Any context we should know?",
	                        max_lines=3,
	                        lines=2,
	                    )
	                    report_submit_btn = gr.Button("🚩 Submit report", variant="secondary", size="sm")
	                    report_status = gr.HTML(value="", visible=False)

	        with gr.Row():
	            spec_output = gr.Plot(label="Spectral Analysis")

	        with gr.Row():
	            with gr.Column(scale=2):
	                timeline_output = gr.Plot(label="P(AI) Timeline")
	            with gr.Column(scale=1):
	                radar_output = gr.Plot(label="Forensic Features")

	        with gr.Row():
	            bars_output = gr.Plot(label="Feature Strength Analysis")

	        # Hidden, but still an output slot of analyze_audio.
	        forensic_explanation_output = gr.HTML(visible=False)

	        with gr.Row():
	            json_output = gr.File(label="Result JSON", visible=True)

	        with gr.Accordion("About ArtifactNet", open=False):
	            gr.HTML(f"""
	<div style="color:#ccc;font-size:13px;line-height:1.6;padding:10px;">
	<h3 style="color:white;">Overview</h3>
	<p>ArtifactNet is a neural forensic detector for AI-generated music.
	It uses HPSS and 7-channel forensic features to detect generation artifacts.</p>

	<h3 style="color:white;">Pipeline</h3>
	<ol>
	<li>STFT + U-Net artifact residual</li>
	<li>HPSS (harmonic-percussive separation)</li>
	<li>7ch features (mel, H/P ratio, temporal derivatives, spectral flux)</li>
	<li>CNN classifier → per-segment P(AI)</li>
	<li>Median aggregation across segments</li>
	</ol>

	<h3 style="color:white;">Limitations</h3>
	<ul>
	<li>Short files (<{MIN_CONFIDENT_DURATION}s) have lower confidence</li>
	<li>Mono input reduces accuracy</li>
	<li>Heavily processed audio may affect results</li>
	</ul>
	<p style="color:#888;font-size:11px;margin-top:10px;">
	Research project — interpret alongside other evidence. See
	<a href="https://api.intrect.io/legal/disclaimer" style="color:#6e7681;">Disclaimer</a>.
	</p>
	</div>
	""")

	        analysis_state = gr.State({})
	        # Order must match the 8-tuple returned by analyze_audio.
	        outputs = [verdict_output, spec_output, timeline_output,
	                   radar_output, bars_output, forensic_explanation_output,
	                   json_output, analysis_state]

	        analyze_btn.click(
	            fn=analyze_audio,
	            inputs=[audio_input],
	            outputs=outputs,
	            api_name=False,
	            concurrency_limit=1,
	            concurrency_id="gpu_inference",
	        )

	        report_submit_btn.click(
	            fn=submit_error_report,
	            inputs=[analysis_state, report_reported_as, report_comment],
	            outputs=[report_status],
	        )

	        # Footer. Separators are plain "|": a backslash before them would be
	        # an invalid escape sequence and would be rendered literally.
	        gr.HTML("""
	<div style="text-align:center;padding:24px 0 8px;border-top:1px solid #333;margin-top:24px;">
	<p style="color:#484f58;font-size:12px;margin:0;">
	Powered by <a href="https://intrect.io" style="color:#ffa502;text-decoration:none;">Intrect</a>
	|  <a href="https://dash.intrect.io" style="color:#6e7681;text-decoration:none;">Dashboard</a>
	|  <a href="https://intrect.io/#pricing" style="color:#6e7681;text-decoration:none;">Pricing</a>
	</p>
	<p style="color:#484f58;font-size:11px;margin:6px 0 0;">
	<a href="https://api.intrect.io/legal/terms" style="color:#6e7681;text-decoration:none;">Terms</a>
	·  <a href="https://api.intrect.io/legal/privacy" style="color:#6e7681;text-decoration:none;">Privacy</a>
	·  <a href="https://api.intrect.io/legal/disclaimer" style="color:#6e7681;text-decoration:none;">Disclaimer</a>
	</p>
	<p style="color:#484f58;font-size:10px;margin:8px 0 0;font-style:italic;">
	ArtifactNet provides forensic indicators, not conclusive legal proof.
	</p>
	</div>
	""")

	    return demo


	# ============================================================
	# Entry point
	# ============================================================

	# Models are loaded at import time, before the UI object is created; the log
	# lines bracket the load_models() call.
	print("[hf-spaces] downloading ONNX models from HF Hub...", flush=True)
	load_models()
	print("[hf-spaces] models ready (onnxruntime CPU).", flush=True)

	# `demo` is kept at module level; the queue holds at most 10 pending requests
	# and defaults every event to a concurrency of one.
	demo = build_ui()
	demo.queue(default_concurrency_limit=1, max_size=10)


	if __name__ == "__main__":
	    # Direct execution starts the server.
	    demo.launch()