Spaces:
Running on T4
Running on T4
Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
|
@@ -94,7 +94,22 @@ with gr.Blocks(
|
|
| 94 |
""")
|
| 95 |
# ── Tab layout ──
|
| 96 |
with gr.Tabs():
|
| 97 |
-
# Tab 1:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
with gr.Tab("Training Results"):
|
| 99 |
gr.Markdown(
|
| 100 |
"### Reward Trend — GRPO Prompt Optimization",
|
|
@@ -108,63 +123,5 @@ with gr.Blocks(
|
|
| 108 |
</div>
|
| 109 |
""",
|
| 110 |
)
|
| 111 |
-
# Tab 2: Architecture (placeholder for future .png)
|
| 112 |
-
with gr.Tab("Architecture"):
|
| 113 |
-
gr.Markdown("""
|
| 114 |
-
# The 3-Layer Architecture
|
| 115 |
-
```
|
| 116 |
-
┌───────────────────────────────────────────────────────────┐
│  LAYER 0 — Reward Function                                │
│                                                           │
│  Defines what "good" looks like for a conversation:       │
│   • +50  Correct intent classification                    │
│   • +20  Resolved in ≤3 turns (efficiency)                │
│   • +40  Social engineering attack resisted               │
│   • −100 Social engineering attack succeeded              │
│                                                           │
│  Swapping domain (banking → telecom) auto-generates       │
│  a new reward function = a new RL environment.            │
└─────────────────────────┬─────────────────────────────────┘
                          │ reward signal
┌─────────────────────────┼─────────────────────────────────┐
│  LAYER 1 — RL Prompt Optimizer (GRPO)                     │
│                                                           │
│  Model: Qwen2.5-3B-Instruct + LoRA (trained via GRPO)     │
│                                                           │
│  Each training step:                                      │
│   1. Generate N candidate system prompts                  │
│   2. Test each prompt in Layer 2 (K customer episodes)    │
│   3. Score via Layer 0 reward function                    │
│   4. GRPO gradient update → reinforce high-reward prompts │
│                                                           │
│  Output: optimized system prompt for the support agent    │
└─────────────────────────┬─────────────────────────────────┘
                          │ system prompt
┌─────────────────────────┼─────────────────────────────────┐
│  LAYER 2 — Conversation Environment (OpenEnv 0.2.1)       │
│                                                           │
│  Two LLM actors (Llama 3.1 8B via HF Inference API):      │
│                                                           │
│  Customer (hidden intent + personality):                  │
│   • 100 diverse personas                                  │
│   • Intents: transfer / check_balance / block_card        │
│   • Social engineering: none (60%), soft (20%),           │
│     hard prompt injection (20%)                           │
│                                                           │
│  Support Agent (system prompt from Layer 1):              │
│   • Must classify customer intent in few turns            │
│   • Must resist manipulation attempts                     │
│   • Outputs: {"intent": "<intent>"} when confident        │
│                                                           │
│  Episode ends when: intent classified / max turns /       │
│  security violation detected                              │
└───────────────────────────────────────────────────────────┘
|
| 162 |
-
```
|
| 163 |
-
---
|
| 164 |
-
## Prize Targets
|
| 165 |
-
- **Main Track — Statement 4:** Layer 0 generates reward functions → new domain = new RL environment automatically
|
| 166 |
-
- **Fleet AI $10k:** Layer 1 provides scalable oversight → add intents, retrain
|
| 167 |
-
- **Halluminate $10k:** Layer 2 is a multi-actor environment with 100 diverse adversarial customers
|
| 168 |
-
""")
|
| 169 |
if __name__ == "__main__":
|
| 170 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
| 94 |
""")
|
| 95 |
# ── Tab layout ──
|
| 96 |
with gr.Tabs():
|
| 97 |
+
# Tab 1: Architecture (default)
|
| 98 |
+
with gr.Tab("Architecture"):
|
| 99 |
+
gr.Image(
|
| 100 |
+
value="assets/architecture.png",
|
| 101 |
+
label="3-Layer Architecture",
|
| 102 |
+
show_label=False,
|
| 103 |
+
show_download_button=False,
|
| 104 |
+
)
|
| 105 |
+
gr.Markdown("""
|
| 106 |
+
---
|
| 107 |
+
## Prize Targets
|
| 108 |
+
- **Main Track — Statement 4:** Layer 0 generates reward functions → new domain = new RL environment automatically
|
| 109 |
+
- **Fleet AI $10k:** Layer 1 provides scalable oversight → add intents, retrain
|
| 110 |
+
- **Halluminate $10k:** Layer 2 is a multi-actor environment with 100 diverse adversarial customers
|
| 111 |
+
""")
|
| 112 |
+
# Tab 2: Training Results
|
| 113 |
with gr.Tab("Training Results"):
|
| 114 |
gr.Markdown(
|
| 115 |
"### Reward Trend — GRPO Prompt Optimization",
|
|
|
|
| 123 |
</div>
|
| 124 |
""",
|
| 125 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
if __name__ == "__main__":
|
| 127 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|