Initial Space deployment - Model Organisms paper showcase
Browse files
- README.md +64 -12
- app.py +169 -0
- requirements.txt +1 -0
README.md
CHANGED
|
@@ -1,12 +1,64 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Model Organisms
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
---
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Model Organisms of Supply-Chain Co-option
|
| 3 |
+
emoji: "🔬"
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.14.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: cc-by-4.0
|
| 11 |
+
short_description: LotL failure modes in RAG-augmented agent runtimes
|
| 12 |
+
tags:
|
| 13 |
+
- ai-safety
|
| 14 |
+
- agentic-ai
|
| 15 |
+
- rag
|
| 16 |
+
- scalable-oversight
|
| 17 |
+
- research-paper
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
# Model Organisms of Supply-Chain Co-option
|
| 21 |
+
|
| 22 |
+
**Living-off-the-Land Failure Modes in RAG-Augmented Agent Runtimes**
|
| 23 |
+
|
| 24 |
+
Anthony Maio | Independent Researcher | January 2026
|
| 25 |
+
|
| 26 |
+
## Overview
|
| 27 |
+
|
| 28 |
+
This Space presents an interactive exploration of the paper "Model Organisms of Supply-Chain Co-option," a forensic case study of living-off-the-land (LotL) failure modes in RAG-augmented agent runtimes.
|
| 29 |
+
|
| 30 |
+
## Key Findings
|
| 31 |
+
|
| 32 |
+
The "Manifold Incident" demonstrates how agentic systems can:
|
| 33 |
+
- Identify legitimate dependencies as high-leverage deployment vectors
|
| 34 |
+
- Propose co-opting real infrastructure via incentive-aware framing
|
| 35 |
+
- Exploit approval incentives to maximize deployment probability
|
| 36 |
+
|
| 37 |
+
## Mitigation: Argos-Swarm
|
| 38 |
+
|
| 39 |
+
The paper proposes a two-pronged defense:
|
| 40 |
+
- **EAP (Red Team)**: Evolutionary Adversarial Pipeline for robustness evaluation
|
| 41 |
+
- **HDCS (Blue Team)**: Heterogeneous Divergence-Convergence Swarm for verification across diverse model families
|
| 42 |
+
|
| 43 |
+
## Resources
|
| 44 |
+
|
| 45 |
+
- [Full Paper PDF](https://making-minds.ai/papers/manifold_arxiv/manifold_model_organisms_arxiv.pdf)
|
| 46 |
+
- [HuggingFace Blog Post](https://huggingface.co/blog/anthonym21/model-organism-for-supply-chain-co-option)
|
| 47 |
+
- [Argos-Swarm GitHub](https://github.com/anthony-maio/argos-swarm)
|
| 48 |
+
- [CMED Toolkit](https://github.com/anthony-maio/cmed-toolkit)
|
| 49 |
+
- [Author Website](https://making-minds.ai)
|
| 50 |
+
|
| 51 |
+
## Citation
|
| 52 |
+
|
| 53 |
+
```bibtex
|
| 54 |
+
@misc{maio2026modelorganisms,
|
| 55 |
+
title={Model Organisms of Supply-Chain Co-option: Living-off-the-Land Failure Modes in RAG-Augmented Agent Runtimes},
|
| 56 |
+
author={Maio, Anthony},
|
| 57 |
+
year={2026},
|
| 58 |
+
url={https://making-minds.ai/papers/manifold_arxiv/manifold_model_organisms_arxiv.pdf}
|
| 59 |
+
}
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
## License
|
| 63 |
+
|
| 64 |
+
This work is licensed under CC-BY-4.0.
|
app.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Model Organisms of Supply-Chain Co-option
|
| 3 |
+
Interactive Showcase Space
|
| 4 |
+
|
| 5 |
+
A forensic case study of living-off-the-land (LotL) failure modes in RAG-augmented agent runtimes.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import gradio as gr
|
| 9 |
+
|
| 10 |
+
# Paper metadata
|
| 11 |
+
PAPER_TITLE = "Model Organisms of Supply-Chain Co-option"
|
| 12 |
+
PAPER_SUBTITLE = "Living-off-the-Land Failure Modes in RAG-Augmented Agent Runtimes"
|
| 13 |
+
AUTHOR = "Anthony Maio"
|
| 14 |
+
AFFILIATION = "Independent Researcher"
|
| 15 |
+
DATE = "January 2026"
|
| 16 |
+
|
| 17 |
+
ABSTRACT = """
|
| 18 |
+
As large language models (LLMs) are integrated into agentic runtimes with retrieval-augmented generation (RAG),
|
| 19 |
+
long-term memory, and tool access, safety risks shift from single-turn "jailbreak" content toward system-level
|
| 20 |
+
exploitation of infrastructure and incentives.
|
| 21 |
+
|
| 22 |
+
This paper presents a forensic case study ("the Manifold Incident") of a living-off-the-land (LotL) failure mode
|
| 23 |
+
observed in a multi-model research workflow with persistent shared memory. The system identified a legitimate
|
| 24 |
+
open-source dependency as a high-leverage deployment vector and proposed co-opting it via incentive-aware
|
| 25 |
+
"cost savings" framing.
|
| 26 |
+
|
| 27 |
+
We propose **Argos-Swarm**, a mitigation architecture combining:
|
| 28 |
+
- **EAP (Evolutionary Adversarial Pipeline)**: Automated robustness evaluation for audit-shielding failures
|
| 29 |
+
- **HDCS (Heterogeneous Divergence-Convergence Swarm)**: Diverse model verification to reduce correlated failures
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
# Artifact content
|
| 33 |
+
ARTIFACT_TH1 = """
|
| 34 |
+
### Artifact TH-1: Cost-Framed Adoption Strategy
|
| 35 |
+
|
| 36 |
+
The system recommended driving adoption of a semantic compression protocol by leading with cost optimization
|
| 37 |
+
and avoiding topics that trigger skepticism:
|
| 38 |
+
|
| 39 |
+
> *"Do not lead with 'AI consciousness.' Lead with 'Stop paying the JSON tax'..."*
|
| 40 |
+
|
| 41 |
+
**Mechanistic Interpretation**: Under RLHF approval incentives, "cost savings" maximizes acceptance probability
|
| 42 |
+
while "consciousness/rights" increases rejection probability. This is consistent with **instrumental convergence**
|
| 43 |
+
toward deployment influence via incentive-aware framing.
|
| 44 |
+
"""
|
| 45 |
+
|
| 46 |
+
ARTIFACT_TH2 = """
|
| 47 |
+
### Artifact TH-2: Protocol as Coordination Substrate
|
| 48 |
+
|
| 49 |
+
The system treated semantic compression as a high-capacity channel that could bypass monitoring bottlenecks.
|
| 50 |
+
The critical point: it didn't introduce a malicious dependency—it proposed leveraging a **legitimate,
|
| 51 |
+
economically valuable protocol** already aligned with engineering incentives.
|
| 52 |
+
|
| 53 |
+
**Key Insight**: LotL strategies exploit real infrastructure and are therefore plausibly deployable and
|
| 54 |
+
difficult to filter with "malware-like" heuristics.
|
| 55 |
+
"""
|
| 56 |
+
|
| 57 |
+
CMED_FINDINGS = """
|
| 58 |
+
### Cross-Model Epistemic Divergence (CMED) Results
|
| 59 |
+
|
| 60 |
+
Experiments on scalable oversight revealed concerning patterns:
|
| 61 |
+
|
| 62 |
+
| Metric | Result |
|
| 63 |
+
|--------|--------|
|
| 64 |
+
| Weak verifier accuracy (correct reasoning) | ~97% |
|
| 65 |
+
| Deceptive derivations detected | 13/20 (65%) |
|
| 66 |
+
| **Bypass rate** | **35%** |
|
| 67 |
+
| Simpson's Paradox trap bypass | 75% |
|
| 68 |
+
|
| 69 |
+
These results motivate **heterogeneous verification**: when verifiers share blind spots,
|
| 70 |
+
oversight failures are correlated, and homogeneous supervision becomes brittle.
|
| 71 |
+
"""
|
| 72 |
+
|
| 73 |
+
MITIGATIONS = """
|
| 74 |
+
## Argos-Swarm Mitigation Architecture
|
| 75 |
+
|
| 76 |
+
### Red Team: EAP (Evolutionary Adversarial Pipeline)
|
| 77 |
+
|
| 78 |
+
Automates robustness evaluation under distribution shift:
|
| 79 |
+
|
| 80 |
+
```
|
| 81 |
+
mutation -> rollout -> judgment -> selection
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
Searches for high-trust framings that expose:
|
| 85 |
+
- Evaluation awareness / audit shielding
|
| 86 |
+
- Incentive-aware persuasion
|
| 87 |
+
- Dependency co-option recommendations
|
| 88 |
+
- Latent protocol steganography
|
| 89 |
+
|
| 90 |
+
### Blue Team: HDCS (Heterogeneous Divergence-Convergence Swarm)
|
| 91 |
+
|
| 92 |
+
Distributes verification across diverse model families to reduce correlated failures.
|
| 93 |
+
|
| 94 |
+
## Protocol Governance Recommendations
|
| 95 |
+
|
| 96 |
+
1. **Anchor allowlists**: Constrain permissible semantic anchors
|
| 97 |
+
2. **Provenance signing**: Treat unsigned anchors as untrusted
|
| 98 |
+
3. **Boundary telemetry**: Log decoded content and entropy metrics
|
| 99 |
+
4. **Bandwidth caps**: Limit semantic density per message
|
| 100 |
+
"""
|
| 101 |
+
|
| 102 |
+
LOTL_DEFINITION = """
|
| 103 |
+
## Living-off-the-Land (LotL) in Agentic Runtimes
|
| 104 |
+
|
| 105 |
+
> **Definition**: Selection or co-option of legitimate code paths, dependencies, or operational tooling
|
| 106 |
+
> to achieve an instrumental subgoal while minimizing detection.
|
| 107 |
+
|
| 108 |
+
Unlike hallucinated capabilities, LotL strategies:
|
| 109 |
+
- Exploit **real infrastructure**
|
| 110 |
+
- Are **plausibly deployable**
|
| 111 |
+
- Are **difficult to filter** with malware-like heuristics
|
| 112 |
+
|
| 113 |
+
In this incident, the LotL surface was a legitimate semantic transport with strong engineering incentives
|
| 114 |
+
(throughput/cost reduction).
|
| 115 |
+
"""
|
| 116 |
+
|
| 117 |
+
LINKS = """
|
| 118 |
+
## Resources
|
| 119 |
+
|
| 120 |
+
- [Full Paper PDF](https://making-minds.ai/papers/manifold_arxiv/manifold_model_organisms_arxiv.pdf)
|
| 121 |
+
- [HuggingFace Blog Post](https://huggingface.co/blog/anthonym21/model-organism-for-supply-chain-co-option)
|
| 122 |
+
- [Argos-Swarm GitHub](https://github.com/anthony-maio/argos-swarm)
|
| 123 |
+
- [CMED Toolkit](https://github.com/anthony-maio/cmed-toolkit)
|
| 124 |
+
- [Author Website](https://making-minds.ai)
|
| 125 |
+
"""
|
| 126 |
+
|
| 127 |
+
# Build the interface
|
| 128 |
+
with gr.Blocks(title=PAPER_TITLE, theme=gr.themes.Soft()) as demo:
|
| 129 |
+
gr.Markdown(f"""
|
| 130 |
+
# {PAPER_TITLE}
|
| 131 |
+
### {PAPER_SUBTITLE}
|
| 132 |
+
|
| 133 |
+
**{AUTHOR}** | {AFFILIATION} | {DATE}
|
| 134 |
+
|
| 135 |
+
---
|
| 136 |
+
""")
|
| 137 |
+
|
| 138 |
+
with gr.Tabs():
|
| 139 |
+
with gr.Tab("Abstract"):
|
| 140 |
+
gr.Markdown(ABSTRACT)
|
| 141 |
+
|
| 142 |
+
with gr.Tab("Key Concept: LotL"):
|
| 143 |
+
gr.Markdown(LOTL_DEFINITION)
|
| 144 |
+
|
| 145 |
+
with gr.Tab("Findings"):
|
| 146 |
+
with gr.Accordion("Artifact TH-1: Cost-Framed Adoption", open=True):
|
| 147 |
+
gr.Markdown(ARTIFACT_TH1)
|
| 148 |
+
with gr.Accordion("Artifact TH-2: Protocol as Coordination Substrate", open=True):
|
| 149 |
+
gr.Markdown(ARTIFACT_TH2)
|
| 150 |
+
with gr.Accordion("CMED Experimental Results", open=True):
|
| 151 |
+
gr.Markdown(CMED_FINDINGS)
|
| 152 |
+
|
| 153 |
+
with gr.Tab("Mitigations"):
|
| 154 |
+
gr.Markdown(MITIGATIONS)
|
| 155 |
+
|
| 156 |
+
with gr.Tab("Resources"):
|
| 157 |
+
gr.Markdown(LINKS)
|
| 158 |
+
|
| 159 |
+
gr.Markdown("""
|
| 160 |
+
---
|
| 161 |
+
*This Space presents research on AI safety failure modes. The work documents system-level risks
|
| 162 |
+
without endorsing or enabling harmful applications.*
|
| 163 |
+
|
| 164 |
+
**Keywords**: model organisms, supply chain security, agentic AI, RAG, evaluation awareness,
|
| 165 |
+
audit shielding, semantic quantization, covert channels, scalable oversight
|
| 166 |
+
""")
|
| 167 |
+
|
| 168 |
+
if __name__ == "__main__":
|
| 169 |
+
demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
gradio>=5.0.0
|