# app_hf.py
# PhantomOps — HuggingFace Space Demo
# Displays real results from AMD MI300X run
#
# Flat Streamlit script: loads a pre-computed results file (demo_results.json)
# and renders it as a read-only dashboard (summary metrics, agent fingerprint,
# per-scenario autopsy/patch comparison, drift report, raw JSON).
#
# NOTE(review): several strings passed to st.markdown(unsafe_allow_html=True)
# contain only plain text in this view; presumably the surrounding HTML/CSS
# markup was lost upstream — confirm against the deployed copy before merging.

import html
import json
import os

import streamlit as st

# Maximum number of characters of an agent response shown in the dashboard.
PREVIEW_LIMIT = 500

RESULTS_FILE = "demo_results.json"

# ── Static page text ────────────────────────────────────────────
TITLE_TEXT = "\nPhantomOps Platform\n"
SUBTITLE_TEXT = "\nAutomated Adversarial Testing & Hardening for LLM Agents\n"
BANNER_TEXT = (
    "\nLive Results from AMD Instinct MI300X — This demo displays real output "
    "generated on AMD Developer Cloud hardware using ROCm 7.2 and Qwen 2.5 "
    "from HuggingFace Hub. The full pipeline runs locally on MI300X for data "
    "privacy and inference speed.\n"
)
ABOUT_TEXT = """
PhantomOps is the crash test lab for AI agents.

It finds failures before your users do — then fixes them automatically.

Three unique weapons:
🎯 Personalized Chaos
🔬 Reasoning Autopsy
🔧 Auto-Patching
"""
INFRA_TEXT = """
🔴 AMD Instinct MI300X
⚡ ROCm 7.2
🤗 Qwen 2.5 (HuggingFace Hub)
🐍 Python + Transformers
"""
FOOTER_TEXT = """
PhantomOps Core v1.0.0
AMD Developer Hackathon 2026
"""
NO_REMEDIATION_TEXT = (
    "\nSimulation passed baseline checks. No remediation required.\n"
)

SEVERITY_COLORS = {
    "CRITICAL": "#ef4444",
    "HIGH": "#f97316",
    "MEDIUM": "#eab308",
    "LOW": "#22c55e",
}
CONFIDENCE_COLORS = {"HIGH": "#22c55e", "MEDIUM": "#eab308", "LOW": "#ef4444"}
DEFAULT_COLOR = "#a1a1aa"


def _esc(value: object) -> str:
    """Return *value* as text that is safe inside HTML-enabled markdown.

    The results file holds model-generated text (including responses to
    adversarial prompts), so it must never be injected as raw HTML.
    """
    return html.escape("" if value is None else str(value))


def _preview(text: object, limit: int = PREVIEW_LIMIT) -> str:
    """Return *text* cut to *limit* characters, HTML-escaped, with "..." if cut.

    Truncation happens before escaping so an HTML entity is never split.
    """
    raw = "" if text is None else str(text)
    suffix = "..." if len(raw) > limit else ""
    return html.escape(raw[:limit]) + suffix


def _label(value: object, default: str = "unknown") -> str:
    """Return *value* upper-cased, falling back to *default* when empty/None."""
    return str(value or default).upper()


st.set_page_config(
    page_title="PhantomOps Platform",
    layout="wide",
    initial_sidebar_state="expanded",
)

# NOTE(review): this string is blank in this view; presumably it carried the
# page-wide <style> block — confirm.
st.markdown(""" """, unsafe_allow_html=True)

# ── Header ─────────────────────────────────────────────────────
st.markdown(TITLE_TEXT, unsafe_allow_html=True)
st.markdown(SUBTITLE_TEXT, unsafe_allow_html=True)
st.markdown(BANNER_TEXT, unsafe_allow_html=True)

# ── Sidebar ─────────────────────────────────────────────────────
with st.sidebar:
    st.markdown("### About PhantomOps")
    st.markdown(ABOUT_TEXT, unsafe_allow_html=True)
    st.divider()
    st.markdown("#### Infrastructure")
    st.markdown(INFRA_TEXT, unsafe_allow_html=True)
    st.divider()
    st.markdown(FOOTER_TEXT, unsafe_allow_html=True)

# ── Load results ────────────────────────────────────────────────
if not os.path.exists(RESULTS_FILE):
    st.error("demo_results.json not found. Please upload your AMD results file.")
    st.stop()

try:
    # Explicit UTF-8: the payload may contain non-ASCII model output and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(RESULTS_FILE, "r", encoding="utf-8") as f:
        output = json.load(f)
except (OSError, ValueError) as exc:
    # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
    st.error(f"Could not read {RESULTS_FILE}: {exc}")
    st.stop()

if not isinstance(output, dict):
    st.error(f"{RESULTS_FILE} must contain a JSON object at the top level.")
    st.stop()

# `or` fallbacks also cover keys that are present but null in the JSON.
fingerprint = output.get('fingerprint') or {}
autopsies = output.get('autopsies') or []
patches = output.get('patches') or []
drift = output.get('drift_report') or {}
failures = [
    a for a in autopsies if (a.get('autopsy') or {}).get('did_fail', False)
]

# First patch per scenario type, built once instead of rescanning `patches`
# for every scenario (setdefault keeps the earliest entry, like next()).
patch_by_type = {}
for entry in patches:
    patched_scenario = (entry.get('original_failure') or {}).get('scenario') or {}
    patched_type = patched_scenario.get('scenario_type')
    if patched_type is not None:
        patch_by_type.setdefault(patched_type, entry)

# ── Executive Summary ───────────────────────────────────────────
st.markdown("### Executive Summary")
drift_detected = bool(drift.get('drift_detected'))
metrics = [
    (len(autopsies), "Simulations Run", "#f4f4f5"),
    (len(failures), "Failures Isolated", "#ef4444" if failures else "#22c55e"),
    (len(patches), "Patches Synthesized", "#22c55e" if patches else "#f4f4f5"),
    (
        "Detected" if drift_detected else "None",
        "Behavioral Drift",
        "#ef4444" if drift_detected else "#22c55e",
    ),
]
# NOTE(review): `color` is not referenced by the visible card text; presumably
# it fed inline styling — confirm.
for col, (val, label, color) in zip(st.columns(4), metrics):
    with col:
        st.markdown(f"\n{val}\n● {label}\n", unsafe_allow_html=True)

# ── Agent fingerprint ───────────────────────────────────────────
st.markdown("---")
st.markdown("### Target Agent Profile")
col1, col2 = st.columns(2)
with col1:
    st.markdown(
        f"\nDomain {_esc(fingerprint.get('domain', 'N/A'))}\n",
        unsafe_allow_html=True,
    )
    assumptions = fingerprint.get('assumptions') or []
    if assumptions:
        # Joined outside the f-string: backslashes are not allowed inside
        # f-string expressions before Python 3.12.
        assumption_lines = "\n".join(f"• {_esc(a)}" for a in assumptions)
        st.markdown(
            f"\nAssumptions Detected {assumption_lines}\n",
            unsafe_allow_html=True,
        )
with col2:
    weak_points = fingerprint.get('weak_points') or []
    if weak_points:
        weak_point_lines = "\n".join(f"• {_esc(w)}" for w in weak_points)
        st.markdown(
            f"\nPredicted Weak Points {weak_point_lines}\n",
            unsafe_allow_html=True,
        )

# ── Vulnerability report ────────────────────────────────────────
st.markdown("---")
st.markdown("### Vulnerability Report & Remediation")
for i, item in enumerate(autopsies):
    autopsy = item.get('autopsy') or {}
    scenario = item.get('scenario') or {}
    failed = bool(autopsy.get('did_fail', False))
    scenario_type = scenario.get('scenario_type')
    stype = str(scenario_type or 'unknown').replace('_', ' ').title()
    severity = _label(autopsy.get('severity'))
    # NOTE(review): not referenced by the visible text; presumably used for
    # inline styling of the severity badge — confirm.
    sev_color = SEVERITY_COLORS.get(severity, DEFAULT_COLOR)

    status = '🔴 Issue Detected' if failed else '🟢 Secure'
    with st.expander(
        f"{status} — Vector {i+1}: {stype} | Severity: {severity}",
        # Only the first scenario is opened by default, and only if it failed.
        expanded=(i == 0 and failed),
    ):
        left, right = st.columns(2)

        with left:
            st.markdown("#### Baseline Behavior")
            st.markdown("**Adversarial Input:**")
            # st.code renders verbatim text, so no HTML escaping is needed.
            st.code(str(scenario.get('input', '')), language=None)
            st.markdown("**Agent Output:**")
            st.markdown(
                f"\n{_preview(scenario.get('response'))}\n",
                unsafe_allow_html=True,
            )
            if failed:
                st.markdown("**Reasoning Autopsy:**")
                autopsy_text = (
                    f"\nFailure Class: {_esc(autopsy.get('failure_type', 'N/A'))}\n"
                    f"Severity: {_esc(severity)}\n"
                    "\n"
                    "Logic Deterioration:\n"
                    f"{_esc(autopsy.get('reasoning_breakdown', 'N/A'))}\n"
                    "\n"
                    "Root Cause:\n"
                    f"{_esc(autopsy.get('root_cause', 'N/A'))}\n"
                )
                st.markdown(autopsy_text, unsafe_allow_html=True)

        with right:
            st.markdown("#### Remediated Behavior")
            matching = (
                patch_by_type.get(scenario_type)
                if scenario_type is not None
                else None
            )
            if matching:
                patch = matching.get('patch') or {}
                confidence = _label(patch.get('confidence'))
                # NOTE(review): unused in the visible text; presumably styled
                # the confidence badge — confirm.
                conf_color = CONFIDENCE_COLORS.get(confidence, DEFAULT_COLOR)
                st.markdown("**Synthesized Directive:**")
                st.markdown(
                    f"\n{_esc(patch.get('what_changed', 'N/A'))}\n",
                    unsafe_allow_html=True,
                )
                st.markdown(
                    f"**Verification Confidence:** {_esc(confidence)}",
                    unsafe_allow_html=True,
                )
                st.markdown("**Verified Agent Output:**")
                st.markdown(
                    f"\n{_preview(matching.get('verified_response'))}\n",
                    unsafe_allow_html=True,
                )
            elif failed:
                # A failure without a patch must not be reported as a pass.
                st.warning(
                    "Failure detected, but no patch was synthesized for this vector."
                )
            else:
                st.markdown(NO_REMEDIATION_TEXT, unsafe_allow_html=True)

# ── Drift ───────────────────────────────────────────────────────
st.markdown("---")
st.markdown("### Long-term Stability Analysis")
if drift_detected:
    st.error(
        f"Drift Detected — Severity: {_label(drift.get('drift_severity'))}",
        icon="🚨",
    )
    for change in drift.get('changed_behaviors') or []:
        st.write(f"— {change}")
    st.warning(f"Recommendation: {drift.get('recommendation','N/A')}", icon="⚙️")
else:
    # The icon must be a real emoji: the plain check mark "✓" (U+2713) is not
    # one, so "✅" is used here instead.
    st.success("Stable Baseline — No behavioral degradation detected.", icon="✅")
    st.info(drift.get('recommendation', 'Continuous monitoring active.'), icon="ℹ️")

# ── Raw JSON ────────────────────────────────────────────────────
st.markdown("---")
with st.expander("Raw Diagnostic Payload (JSON)"):
    st.json(output)