Uddiii committed on
Commit
a999428
·
1 Parent(s): a0f62f1

Clean up: remove junk files (opus_prompt, mermaid_render, EVALUATION, unused UI, train_ppo)

Browse files
ER_MAP/UI/index.html DELETED
@@ -1,19 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <meta charset="UTF-8">
5
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <title>Agent Canvas</title>
7
- <script src="https://cdn.tailwindcss.com"></script>
8
- <style>
9
- /* Custom Tailwind config for exact colors */
10
- @tailwind base;
11
- @tailwind components;
12
- @tailwind utilities;
13
- </style>
14
- </head>
15
- <body>
16
- <div id="root"></div>
17
- <script type="module" src="/main.jsx"></script>
18
- </body>
19
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ER_MAP/UI/main.jsx.txt DELETED
@@ -1,9 +0,0 @@
1
- import React from 'react'
2
- import ReactDOM from 'react-dom/client'
3
- import AgentCanvas from './temp.jsx'
4
-
5
- ReactDOM.createRoot(document.getElementById('root')).render(
6
- <React.StrictMode>
7
- <AgentCanvas />
8
- </React.StrictMode>,
9
- )
 
 
 
 
 
 
 
 
 
 
ER_MAP/UI/temp.jsx.txt DELETED
@@ -1,282 +0,0 @@
1
- import React, { useState, useEffect } from 'react';
2
- import { Stethoscope, User, Sparkles, Activity, Check } from 'lucide-react';
3
-
4
- export default function AgentCanvas() {
5
- const [activeAgent, setActiveAgent] = useState(null);
6
- const [isSidebarOpen, setIsSidebarOpen] = useState(false);
7
-
8
- // Randomly invoke agents
9
- useEffect(() => {
10
- const agents = ['doctor', 'nurse', 'patient'];
11
- const invokeRandomAgent = () => {
12
- const randomAgent = agents[Math.floor(Math.random() * agents.length)];
13
- setActiveAgent(randomAgent);
14
-
15
- const activeDuration = Math.random() * 2000 + 1500;
16
- setTimeout(() => {
17
- setActiveAgent(null);
18
- }, activeDuration);
19
- };
20
-
21
- const interval = setInterval(() => {
22
- invokeRandomAgent();
23
- }, 4000);
24
-
25
- setTimeout(invokeRandomAgent, 500);
26
-
27
- return () => clearInterval(interval);
28
- }, []);
29
-
30
- // 1. Glowing Circle Animation (Doctor)
31
- const GlowingCircleAnimation = ({ colorClass }) => {
32
- return (
33
- <div className="relative flex items-center justify-center w-full h-full">
34
- <div className={`absolute w-4 h-4 rounded-full ${colorClass} animate-glow-scale`} style={{ animationDelay: '0s' }} />
35
- <div className={`absolute w-4 h-4 rounded-full ${colorClass} animate-glow-scale`} style={{ animationDelay: '0.75s' }} />
36
- <div className={`absolute w-4 h-4 rounded-full ${colorClass} blur-[2px] opacity-80`} />
37
- </div>
38
- );
39
- };
40
-
41
- // 2. Voice Record Animation / Audio Waves (Nurse)
42
- const VoiceRecordAnimation = ({ colorClass }) => {
43
- return (
44
- <div className="relative flex items-center justify-center gap-[3px] w-full h-full">
45
- <div className={`w-[3px] h-3 rounded-full ${colorClass} animate-wave`} style={{ animationDelay: '0.0s' }} />
46
- <div className={`w-[3px] h-3 rounded-full ${colorClass} animate-wave`} style={{ animationDelay: '0.2s' }} />
47
- <div className={`w-[3px] h-3 rounded-full ${colorClass} animate-wave`} style={{ animationDelay: '0.4s' }} />
48
- <div className={`w-[3px] h-3 rounded-full ${colorClass} animate-wave`} style={{ animationDelay: '0.1s' }} />
49
- <div className={`w-[3px] h-3 rounded-full ${colorClass} animate-wave`} style={{ animationDelay: '0.3s' }} />
50
- </div>
51
- );
52
- };
53
-
54
- // 3. GPT Voice-like Animation (Patient)
55
- const GptVoiceAnimation = ({ colorClass }) => {
56
- return (
57
- <div className="relative flex items-center justify-center w-full h-full">
58
- <div className={`absolute w-4 h-4 rounded-full ${colorClass} opacity-80 blur-[2px] animate-[pulse_0.8s_ease-in-out_infinite]`} />
59
- <div className={`absolute w-6 h-6 rounded-full ${colorClass} opacity-60 blur-[4px] animate-[pulse_1.2s_ease-in-out_infinite_reverse]`} />
60
- <div className={`absolute w-10 h-10 rounded-full ${colorClass} opacity-40 blur-[8px] animate-[pulse_2s_ease-in-out_infinite]`} />
61
- </div>
62
- );
63
- };
64
-
65
- return (
66
- <div className="min-h-screen bg-slate-950 flex flex-col items-center justify-center p-8 font-sans text-slate-200 selection:bg-indigo-500/30">
67
-
68
- {/* Custom Keyframes & Scrollbar for Animations */}
69
- <style>
70
- {`
71
- @keyframes audioWave {
72
- 0% { transform: scaleY(0.5); }
73
- 100% { transform: scaleY(2.2); }
74
- }
75
- .animate-wave {
76
- animation: audioWave 0.4s ease-in-out infinite alternate;
77
- }
78
- @keyframes glowScale {
79
- 0% { transform: scale(1); opacity: 0.8; }
80
- 100% { transform: scale(2.5); opacity: 0; }
81
- }
82
- .animate-glow-scale {
83
- animation: glowScale 1.5s ease-out infinite;
84
- }
85
- .custom-scrollbar::-webkit-scrollbar {
86
- width: 4px;
87
- }
88
- .custom-scrollbar::-webkit-scrollbar-track {
89
- background: transparent;
90
- }
91
- .custom-scrollbar::-webkit-scrollbar-thumb {
92
- background-color: rgba(71, 85, 105, 0.5);
93
- border-radius: 10px;
94
- }
95
- `}
96
- </style>
97
-
98
-
99
- {/* Main Container */}
100
- <div className="relative w-full max-w-5xl h-[600px] bg-slate-900/60 backdrop-blur-2xl rounded-3xl shadow-[0_20px_60px_-15px_rgba(0,0,0,0.5)] border border-slate-800 overflow-hidden flex">
101
-
102
- {/* Subtle grid background */}
103
- <div className="absolute inset-0 opacity-[0.05] pointer-events-none"
104
- style={{
105
- backgroundImage: 'linear-gradient(rgba(255,255,255,1) 1px, transparent 1px), linear-gradient(90deg, rgba(255,255,255,1) 1px, transparent 1px)',
106
- backgroundSize: '32px 32px'
107
- }}>
108
- </div>
109
-
110
- {/* Expanding Sidebar Panel */}
111
- <div
112
- onClick={() => { if (!isSidebarOpen) setIsSidebarOpen(true) }}
113
- className={`relative border-r border-slate-800/80 bg-slate-900/80 backdrop-blur-md z-20 shadow-[4px_0_24px_rgba(0,0,0,0.2)] transition-all duration-300 ease-in-out overflow-hidden flex shrink-0
114
- ${isSidebarOpen ? 'w-[350px]' : 'w-16 cursor-pointer hover:bg-slate-800/80'}`}
115
- >
116
- {/* Closed State Content */}
117
- <div className={`absolute inset-0 flex items-center justify-center transition-opacity duration-300 ${isSidebarOpen ? 'opacity-0 pointer-events-none' : 'opacity-100 delay-100'}`}>
118
- <div className="rotate-180" style={{ writingMode: 'vertical-rl' }}>
119
- <span className="text-[11px] font-bold tracking-[0.3em] text-slate-500 uppercase whitespace-nowrap">Multi-Agent Interface</span>
120
- </div>
121
- </div>
122
-
123
- {/* Open State Content */}
124
- <div className={`absolute inset-0 w-[350px] p-6 flex flex-col transition-opacity duration-300 ${isSidebarOpen ? 'opacity-100 delay-100' : 'opacity-0 pointer-events-none'}`}>
125
-
126
- {/* Sidebar Header with Close Button */}
127
- <div className="flex items-center justify-between mb-8 cursor-pointer group" onClick={(e) => { e.stopPropagation(); setIsSidebarOpen(false); }}>
128
- <span className="text-[11px] font-bold tracking-[0.2em] text-slate-400 uppercase">System Panel</span>
129
- <div className="w-6 h-6 rounded-full bg-slate-800/50 flex items-center justify-center border border-slate-700/50 group-hover:bg-slate-700/80 transition-colors">
130
- <svg className="w-3 h-3 text-slate-400" fill="none" viewBox="0 0 24 24" stroke="currentColor"><path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M15 19l-7-7 7-7" /></svg>
131
- </div>
132
- </div>
133
-
134
- {/* Section A: RL Reward Tracker */}
135
- <div className="mb-8">
136
- <div className="flex items-center justify-between mb-3">
137
- <h3 className="text-xs font-semibold text-slate-300 uppercase tracking-wider">Live Rewards</h3>
138
- <span className="text-xs font-mono font-bold text-green-400/90 drop-shadow-[0_0_4px_rgba(74,222,128,0.3)]">+4.20 Total</span>
139
- </div>
140
- <div className="bg-slate-950/40 rounded-xl border border-slate-800/50 p-4 h-36 overflow-y-auto font-mono text-[11px] space-y-2.5 custom-scrollbar shadow-inner">
141
- <div className="flex justify-between items-center"><span className="text-slate-400">Valid JSON</span><span className="text-green-400/90">+0.05</span></div>
142
- <div className="flex justify-between items-center"><span className="text-slate-400">AMA Loss</span><span className="text-red-400/90">-0.75</span></div>
143
- <div className="flex justify-between items-center"><span className="text-slate-400">Diagnosis Match</span><span className="text-green-400/90">+2.50</span></div>
144
- <div className="flex justify-between items-center"><span className="text-slate-400">Token Limit</span><span className="text-red-400/90">-0.10</span></div>
145
- <div className="flex justify-between items-center"><span className="text-slate-400">Fast Inference</span><span className="text-green-400/90">+0.20</span></div>
146
- <div className="flex justify-between items-center"><span className="text-slate-400">Style Check</span><span className="text-green-400/90">+0.10</span></div>
147
- <div className="flex justify-between items-center"><span className="text-slate-400">Context Window</span><span className="text-green-400/90">+0.20</span></div>
148
- </div>
149
- </div>
150
-
151
- {/* Section B: Milestone Tracker */}
152
- <div className="flex-1 flex flex-col">
153
- <h3 className="text-xs font-semibold text-slate-300 uppercase tracking-wider mb-4">Clinical Phases</h3>
154
- <div className="space-y-3.5">
155
- {[
156
- { label: '1. READ_SOAP', done: true },
157
- { label: '2. PATIENT_CONTACT', done: true },
158
- { label: '3. VITALS', done: true },
159
- { label: '4. LABS', done: false, current: true },
160
- { label: '5. ASSESSMENT', done: false },
161
- { label: '6. DISCHARGE', done: false }
162
- ].map((phase, i) => (
163
- <div key={i} className="flex items-center gap-3">
164
- <div className={`w-4 h-4 rounded-[4px] border flex items-center justify-center transition-colors
165
- ${phase.done ? 'bg-slate-700/80 border-slate-600/80' : phase.current ? 'border-indigo-500/50 bg-indigo-500/10' : 'border-slate-700/50 bg-slate-800/30'}`}>
166
- {phase.done && <Check className="w-3 h-3 text-slate-300" strokeWidth={3} />}
167
- </div>
168
- <span className={`text-[11px] font-semibold tracking-wide ${phase.done ? 'text-slate-500' : phase.current ? 'text-indigo-300 drop-shadow-[0_0_8px_rgba(129,140,248,0.5)]' : 'text-slate-600'}`}>
169
- {phase.label}
170
- </span>
171
- {phase.current && (
172
- <span className="ml-auto w-1.5 h-1.5 rounded-full bg-indigo-400 animate-pulse shadow-[0_0_8px_rgba(129,140,248,0.8)]" />
173
- )}
174
- </div>
175
- ))}
176
- </div>
177
- </div>
178
- </div>
179
- </div>
180
-
181
- {/* Agents Area */}
182
- {/* Using flex-1 to automatically shrink and adapt to the sidebar expansion */}
183
- <div className="flex-1 relative p-8 overflow-hidden">
184
-
185
- {/* Doctor: Top Left (Indigo) */}
186
- <div className={`absolute top-20 left-16 w-[280px] bg-slate-900/90 backdrop-blur-xl rounded-2xl p-5 border shadow-lg transition-all duration-700 flex items-center gap-4 z-10
187
- ${activeAgent === 'doctor' ? 'shadow-[0_8px_30px_rgba(99,102,241,0.15)] border-indigo-500/50 scale-105' : 'border-slate-800 hover:shadow-xl'}`}>
188
-
189
- <div className="relative flex items-center justify-center w-12 h-12">
190
- <div className="z-10 relative bg-slate-800 rounded-full p-2 shadow-[0_2px_8px_rgba(0,0,0,0.2)] border border-slate-700 flex items-center justify-center w-full h-full">
191
- {activeAgent === 'doctor' ? (
192
- <GlowingCircleAnimation colorClass="bg-indigo-400" />
193
- ) : (
194
- <Stethoscope className="w-5 h-5 transition-all duration-700 text-slate-400" />
195
- )}
196
- </div>
197
- </div>
198
-
199
- <div className="z-10">
200
- <h3 className="font-semibold text-slate-100 text-sm">Doctor</h3>
201
- <p className="text-[11px] font-medium text-slate-400 uppercase tracking-wider mt-0.5">{activeAgent === 'doctor' ? 'Speaking...' : 'Attending Physician'}</p>
202
- </div>
203
-
204
- <div className="ml-auto flex items-center justify-center z-10">
205
- {activeAgent === 'doctor' ? (
206
- <div className="relative flex items-center justify-center w-3 h-3">
207
- <span className="animate-ping absolute inline-flex h-full w-full rounded-full bg-indigo-400 opacity-75"></span>
208
- <span className="relative inline-flex rounded-full h-2 w-2 bg-indigo-500 shadow-[0_0_8px_rgba(99,102,241,1)]"></span>
209
- </div>
210
- ) : (
211
- <div className="w-2 h-2 rounded-full bg-slate-700" />
212
- )}
213
- </div>
214
- </div>
215
-
216
- {/* Nurse: Bottom Left (Blue) */}
217
- <div className={`absolute bottom-20 left-16 w-[280px] bg-slate-900/90 backdrop-blur-xl rounded-2xl p-5 border shadow-lg transition-all duration-700 flex items-center gap-4 z-10
218
- ${activeAgent === 'nurse' ? 'shadow-[0_8px_30px_rgba(59,130,246,0.15)] border-blue-500/50 scale-105' : 'border-slate-800 hover:shadow-xl'}`}>
219
-
220
- <div className="relative flex items-center justify-center w-12 h-12">
221
- <div className="z-10 relative bg-slate-800 rounded-full p-2 shadow-[0_2px_8px_rgba(0,0,0,0.2)] border border-slate-700 flex items-center justify-center w-full h-full">
222
- {activeAgent === 'nurse' ? (
223
- <VoiceRecordAnimation colorClass="bg-blue-400" />
224
- ) : (
225
- <Activity className="w-5 h-5 transition-all duration-700 text-slate-400" />
226
- )}
227
- </div>
228
- </div>
229
-
230
- <div className="z-10">
231
- <h3 className="font-semibold text-slate-100 text-sm">Nurse</h3>
232
- <p className="text-[11px] font-medium text-slate-400 uppercase tracking-wider mt-0.5">{activeAgent === 'nurse' ? 'Recording...' : 'Registered Nurse'}</p>
233
- </div>
234
-
235
- <div className="ml-auto flex items-center justify-center z-10">
236
- {activeAgent === 'nurse' ? (
237
- <div className="relative flex items-center justify-center w-3 h-3">
238
- <span className="animate-ping absolute inline-flex h-full w-full rounded-full bg-blue-400 opacity-75"></span>
239
- <span className="relative inline-flex rounded-full h-2 w-2 bg-blue-500 shadow-[0_0_8px_rgba(59,130,246,1)]"></span>
240
- </div>
241
- ) : (
242
- <div className="w-2 h-2 rounded-full bg-slate-700" />
243
- )}
244
- </div>
245
- </div>
246
-
247
- {/* Patient: Right Side (Teal) */}
248
- <div className={`absolute top-1/2 -translate-y-1/2 right-16 w-[280px] bg-slate-900/90 backdrop-blur-xl rounded-2xl p-5 border shadow-lg transition-all duration-700 flex items-center gap-4 z-10
249
- ${activeAgent === 'patient' ? 'shadow-[0_8px_30px_rgba(20,184,166,0.15)] border-teal-500/50 scale-105' : 'border-slate-800 hover:shadow-xl'}`}>
250
-
251
- <div className="relative flex items-center justify-center w-12 h-12">
252
- <div className="z-10 relative bg-slate-800 rounded-full p-2 shadow-[0_2px_8px_rgba(0,0,0,0.2)] border border-slate-700 flex items-center justify-center w-full h-full">
253
- {activeAgent === 'patient' ? (
254
- <GptVoiceAnimation colorClass="bg-teal-400" />
255
- ) : (
256
- <User className="w-5 h-5 transition-all duration-700 text-slate-400" />
257
- )}
258
- </div>
259
- </div>
260
-
261
- <div className="z-10">
262
- <h3 className="font-semibold text-slate-100 text-sm">Patient</h3>
263
- <p className="text-[11px] font-medium text-slate-400 uppercase tracking-wider mt-0.5">{activeAgent === 'patient' ? 'Speaking...' : 'Visiting'}</p>
264
- </div>
265
-
266
- <div className="ml-auto flex items-center justify-center z-10">
267
- {activeAgent === 'patient' ? (
268
- <div className="relative flex items-center justify-center w-3 h-3">
269
- <span className="animate-ping absolute inline-flex h-full w-full rounded-full bg-teal-400 opacity-75"></span>
270
- <span className="relative inline-flex rounded-full h-2 w-2 bg-teal-500 shadow-[0_0_8px_rgba(20,184,166,1)]"></span>
271
- </div>
272
- ) : (
273
- <div className="w-2 h-2 rounded-full bg-slate-700" />
274
- )}
275
- </div>
276
- </div>
277
-
278
- </div>
279
- </div>
280
- </div>
281
- );
282
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ER_MAP/eval_results.json DELETED
@@ -1,102 +0,0 @@
1
- [
2
- {
3
- "episode": 1,
4
- "disease": "Sepsis from Urinary Source",
5
- "difficulty": "random",
6
- "compliance": "non_compliant",
7
- "communication": "calm_stoic",
8
- "outcome": "WRONG",
9
- "total_reward": -1.56,
10
- "steps": 11
11
- },
12
- {
13
- "episode": 2,
14
- "disease": "Bacterial Meningitis",
15
- "difficulty": "random",
16
- "compliance": "non_compliant",
17
- "communication": "anxious_panicked",
18
- "outcome": "WIN",
19
- "total_reward": 2.3,
20
- "steps": 10
21
- },
22
- {
23
- "episode": 3,
24
- "disease": "Hypoglycemia",
25
- "difficulty": "random",
26
- "compliance": "partially_compliant",
27
- "communication": "calm_stoic",
28
- "outcome": "WRONG",
29
- "total_reward": -1.58,
30
- "steps": 8
31
- },
32
- {
33
- "episode": 4,
34
- "disease": "Bacterial Meningitis",
35
- "difficulty": "random",
36
- "compliance": "partially_compliant",
37
- "communication": "hostile_aggressive",
38
- "outcome": "WIN",
39
- "total_reward": 2.42,
40
- "steps": 8
41
- },
42
- {
43
- "episode": 5,
44
- "disease": "Sepsis from Urinary Source",
45
- "difficulty": "random",
46
- "compliance": "fully_compliant",
47
- "communication": "anxious_panicked",
48
- "outcome": "WRONG",
49
- "total_reward": -1.6,
50
- "steps": 10
51
- },
52
- {
53
- "episode": 6,
54
- "disease": "Acute Asthma Exacerbation",
55
- "difficulty": "random",
56
- "compliance": "fully_compliant",
57
- "communication": "disorganized_confused",
58
- "outcome": "WIN",
59
- "total_reward": 2.42,
60
- "steps": 8
61
- },
62
- {
63
- "episode": 7,
64
- "disease": "Acute Ischemic Stroke",
65
- "difficulty": "random",
66
- "compliance": "cost_constrained",
67
- "communication": "calm_stoic",
68
- "outcome": "WRONG",
69
- "total_reward": -1.64,
70
- "steps": 9
71
- },
72
- {
73
- "episode": 8,
74
- "disease": "Hypoglycemia",
75
- "difficulty": "random",
76
- "compliance": "non_compliant",
77
- "communication": "anxious_panicked",
78
- "outcome": "WRONG",
79
- "total_reward": -1.7,
80
- "steps": 5
81
- },
82
- {
83
- "episode": 9,
84
- "disease": "Acute Ischemic Stroke",
85
- "difficulty": "random",
86
- "compliance": "cost_constrained",
87
- "communication": "disorganized_confused",
88
- "outcome": "WRONG",
89
- "total_reward": -1.62,
90
- "steps": 7
91
- },
92
- {
93
- "episode": 10,
94
- "disease": "Hypoglycemia",
95
- "difficulty": "random",
96
- "compliance": "partially_compliant",
97
- "communication": "disorganized_confused",
98
- "outcome": "WRONG",
99
- "total_reward": -1.68,
100
- "steps": 8
101
- }
102
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ER_MAP/training/train_ppo.py DELETED
@@ -1,372 +0,0 @@
1
- """
2
- ER_MAP/training/train_ppo.py
3
- =============================
4
- Minimal PPO Training Script for the ER-MAP Triage Environment.
5
- Designed to run in Google Colab with Unsloth + HuggingFace TRL.
6
-
7
- Usage (Colab):
8
- !pip install unsloth trl transformers datasets accelerate peft
9
- !pip install gymnasium groq
10
- %run train_ppo.py
11
-
12
- Usage (Local):
13
- python -m ER_MAP.training.train_ppo
14
- """
15
-
16
- import os
17
- import json
18
- import torch
19
- import logging
20
- from typing import List, Dict, Any
21
-
22
- logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
23
- logger = logging.getLogger("ER_MAP.train_ppo")
24
-
25
- # ---------------------------------------------------------------------------
26
- # 1. Load the Base Policy Model with Unsloth (4-bit quantization)
27
- # ---------------------------------------------------------------------------
28
-
29
- def load_model_and_tokenizer(
30
- model_name: str = "unsloth/llama-3-8b-Instruct",
31
- max_seq_length: int = 2048,
32
- load_in_4bit: bool = True,
33
- ):
34
- """
35
- Load the Doctor policy model using Unsloth for efficient 4-bit inference.
36
- Falls back to standard HuggingFace loading if Unsloth is unavailable.
37
- """
38
- try:
39
- from unsloth import FastLanguageModel # type: ignore
40
-
41
- model, tokenizer = FastLanguageModel.from_pretrained(
42
- model_name=model_name,
43
- max_seq_length=max_seq_length,
44
- load_in_4bit=load_in_4bit,
45
- dtype=None, # auto-detect
46
- )
47
-
48
- # Apply LoRA adapters for PPO fine-tuning
49
- model = FastLanguageModel.get_peft_model(
50
- model,
51
- r=16,
52
- lora_alpha=16,
53
- lora_dropout=0.05,
54
- target_modules=[
55
- "q_proj", "k_proj", "v_proj", "o_proj",
56
- "gate_proj", "up_proj", "down_proj",
57
- ],
58
- bias="none",
59
- use_gradient_checkpointing="unsloth",
60
- )
61
- logger.info(f"Loaded model via Unsloth: {model_name} (4-bit={load_in_4bit})")
62
- return model, tokenizer
63
-
64
- except ImportError:
65
- logger.warning("Unsloth not available. Falling back to HuggingFace Transformers.")
66
- from transformers import AutoModelForCausalLM, AutoTokenizer # type: ignore
67
- from peft import get_peft_model, LoraConfig # type: ignore
68
-
69
- tokenizer = AutoTokenizer.from_pretrained(model_name)
70
- if tokenizer.pad_token is None:
71
- tokenizer.pad_token = tokenizer.eos_token
72
-
73
- model = AutoModelForCausalLM.from_pretrained(
74
- model_name,
75
- torch_dtype=torch.float16,
76
- device_map="auto",
77
- load_in_4bit=load_in_4bit,
78
- )
79
-
80
- lora_config = LoraConfig(
81
- r=16,
82
- lora_alpha=16,
83
- lora_dropout=0.05,
84
- target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
85
- bias="none",
86
- task_type="CAUSAL_LM",
87
- )
88
- model = get_peft_model(model, lora_config)
89
- logger.info(f"Loaded model via HF Transformers: {model_name}")
90
- return model, tokenizer
91
-
92
-
93
- # ---------------------------------------------------------------------------
94
- # 2. Doctor Action Generation
95
- # ---------------------------------------------------------------------------
96
-
97
- def generate_doctor_action(
98
- model,
99
- tokenizer,
100
- observation: str,
101
- device: str = "cuda",
102
- max_new_tokens: int = 256,
103
- ) -> str:
104
- """
105
- Given an observation string from the environment, generate the Doctor's
106
- JSON action using the policy model.
107
- """
108
- prompt = f"""You are an ER doctor performing triage. Based on the observation below,
109
- respond with a valid JSON action.
110
-
111
- Valid tools: speak_to, order_lab, terminal_discharge
112
- Valid targets: nurse, patient
113
-
114
- Observation:
115
- {observation}
116
-
117
- Respond ONLY with a JSON object:
118
- {{"thought": "your reasoning", "tool": "...", "target": "...", "message": "...", "test_name": "...", "treatment": "..."}}
119
-
120
- JSON Action:"""
121
-
122
- inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
123
- inputs = {k: v.to(device) for k, v in inputs.items()}
124
-
125
- with torch.no_grad():
126
- outputs = model.generate(
127
- **inputs,
128
- max_new_tokens=max_new_tokens,
129
- temperature=0.7,
130
- do_sample=True,
131
- pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
132
- )
133
-
134
- generated = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
135
- return generated.strip()
136
-
137
-
138
- # ---------------------------------------------------------------------------
139
- # 3. Rollout: Play One Episode
140
- # ---------------------------------------------------------------------------
141
-
142
- def run_episode(
143
- model,
144
- tokenizer,
145
- env,
146
- device: str = "cuda",
147
- ) -> Dict[str, Any]:
148
- """
149
- Run a single episode of the Doctor interacting with the TriageEnv.
150
- Collects trajectory data for PPO training.
151
-
152
- Returns:
153
- dict with keys: queries, responses, rewards, total_reward, steps, outcome
154
- """
155
- obs, info = env.reset()
156
- done = False
157
- truncated = False
158
-
159
- queries: List[str] = []
160
- responses: List[str] = []
161
- rewards: List[float] = []
162
- total_reward = 0.0
163
- steps = 0
164
- outcome = "unknown"
165
-
166
- while not done and not truncated:
167
- # Generate Doctor action
168
- action_text = generate_doctor_action(model, tokenizer, obs, device=device)
169
-
170
- # Step environment
171
- next_obs, reward, done, truncated, info = env.step(action_text)
172
-
173
- queries.append(obs)
174
- responses.append(action_text)
175
- rewards.append(reward)
176
- total_reward += reward
177
- steps += 1
178
-
179
- obs = next_obs
180
-
181
- # Determine outcome
182
- if done:
183
- try:
184
- obs_dict = json.loads(obs)
185
- event = obs_dict.get("event", "")
186
- if "win" in event:
187
- outcome = "WIN"
188
- elif "fatal" in event:
189
- outcome = "FATAL_LOSS"
190
- elif "ama" in event:
191
- outcome = "AMA_LOSS"
192
- elif "incorrect" in event:
193
- outcome = "INCORRECT"
194
- except json.JSONDecodeError:
195
- pass
196
-
197
- return {
198
- "queries": queries,
199
- "responses": responses,
200
- "rewards": rewards,
201
- "total_reward": total_reward,
202
- "steps": steps,
203
- "outcome": outcome,
204
- }
205
-
206
-
207
- # ---------------------------------------------------------------------------
208
- # 4. PPO Training Loop
209
- # ---------------------------------------------------------------------------
210
-
211
- def train(
212
- num_episodes: int = 100,
213
- model_name: str = "unsloth/llama-3-8b-Instruct",
214
- groq_api_key: str = "",
215
- learning_rate: float = 1.41e-5,
216
- batch_size: int = 4,
217
- mini_batch_size: int = 1,
218
- ppo_epochs: int = 4,
219
- use_wandb: bool = False,
220
- output_dir: str = "./er_map_checkpoints",
221
- ):
222
- """
223
- Main PPO training loop:
224
- 1. Load Doctor policy model (Unsloth, 4-bit).
225
- 2. Initialize TRL PPOTrainer.
226
- 3. Roll out episodes in TriageEnv.
227
- 4. Update policy via PPO.
228
- """
229
- # --- Setup ---
230
- device = "cuda" if torch.cuda.is_available() else "cpu"
231
- logger.info(f"Device: {device}")
232
-
233
- groq_key = groq_api_key or os.environ.get("GROQ_API_KEY", "")
234
-
235
- # --- Load Model ---
236
- model, tokenizer = load_model_and_tokenizer(model_name=model_name)
237
-
238
- if tokenizer.pad_token is None:
239
- tokenizer.pad_token = tokenizer.eos_token
240
-
241
- # --- Initialize TRL PPO ---
242
- try:
243
- from trl import PPOConfig, PPOTrainer, AutoModelForCausalLMWithValueHead # type: ignore
244
-
245
- ppo_config = PPOConfig(
246
- model_name=model_name,
247
- learning_rate=learning_rate,
248
- batch_size=batch_size,
249
- mini_batch_size=mini_batch_size,
250
- ppo_epochs=ppo_epochs,
251
- log_with="wandb" if use_wandb else None,
252
- output_dir=output_dir,
253
- )
254
-
255
- # Wrap model with value head for PPO
256
- model_with_value_head = AutoModelForCausalLMWithValueHead.from_pretrained(model)
257
-
258
- ppo_trainer = PPOTrainer(
259
- config=ppo_config,
260
- model=model_with_value_head,
261
- tokenizer=tokenizer,
262
- )
263
- logger.info("TRL PPOTrainer initialized successfully.")
264
- use_trl = True
265
-
266
- except ImportError:
267
- logger.warning("TRL not available. Running in evaluation-only mode (no PPO updates).")
268
- use_trl = False
269
-
270
- # --- Initialize Environment ---
271
- from ER_MAP.envs.triage_env import TriageEnv
272
-
273
- env = TriageEnv(groq_api_key=groq_key, render_mode="human")
274
-
275
- # --- Training Loop ---
276
- os.makedirs(output_dir, exist_ok=True)
277
- metrics_log: List[Dict[str, Any]] = []
278
-
279
- logger.info(f"Starting training for {num_episodes} episodes...")
280
- for episode_idx in range(1, num_episodes + 1):
281
- logger.info(f"\n{'='*50}")
282
- logger.info(f" Episode {episode_idx}/{num_episodes}")
283
- logger.info(f"{'='*50}")
284
-
285
- # Run one episode
286
- trajectory = run_episode(model, tokenizer, env, device=device)
287
-
288
- logger.info(
289
- f" Outcome: {trajectory['outcome']} | "
290
- f"Steps: {trajectory['steps']} | "
291
- f"Total Reward: {trajectory['total_reward']:+.3f}"
292
- )
293
-
294
- # --- PPO Update ---
295
- if use_trl and trajectory["queries"]:
296
- try:
297
- # Tokenize queries and responses for PPO
298
- query_tensors = [
299
- tokenizer.encode(q, return_tensors="pt", truncation=True, max_length=512).squeeze().to(device)
300
- for q in trajectory["queries"]
301
- ]
302
- response_tensors = [
303
- tokenizer.encode(r, return_tensors="pt", truncation=True, max_length=256).squeeze().to(device)
304
- for r in trajectory["responses"]
305
- ]
306
- reward_tensors = [torch.tensor([r], device=device) for r in trajectory["rewards"]]
307
-
308
- # PPO step
309
- stats = ppo_trainer.step(query_tensors, response_tensors, reward_tensors)
310
- logger.info(f" PPO Loss: {stats.get('ppo/loss/total', 'N/A')}")
311
- except Exception as e:
312
- logger.error(f" PPO update failed: {e}")
313
-
314
- # Log metrics
315
- episode_metrics = {
316
- "episode": episode_idx,
317
- "outcome": trajectory["outcome"],
318
- "steps": trajectory["steps"],
319
- "total_reward": trajectory["total_reward"],
320
- }
321
- metrics_log.append(episode_metrics)
322
-
323
- # Periodic checkpoint
324
- if episode_idx % 10 == 0:
325
- ckpt_path = os.path.join(output_dir, f"checkpoint_ep{episode_idx}")
326
- try:
327
- if use_trl:
328
- ppo_trainer.save_pretrained(ckpt_path)
329
- else:
330
- model.save_pretrained(ckpt_path)
331
- tokenizer.save_pretrained(ckpt_path)
332
- logger.info(f" Checkpoint saved: {ckpt_path}")
333
- except Exception as e:
334
- logger.error(f" Failed to save checkpoint: {e}")
335
-
336
- # --- Save Final Metrics ---
337
- metrics_path = os.path.join(output_dir, "training_metrics.json")
338
- with open(metrics_path, "w") as f:
339
- json.dump(metrics_log, f, indent=2)
340
- logger.info(f"\nTraining complete! Metrics saved to {metrics_path}")
341
-
342
- env.close()
343
- return metrics_log
344
-
345
-
346
- # ---------------------------------------------------------------------------
347
- # 5. Entry Point
348
- # ---------------------------------------------------------------------------
349
-
350
- if __name__ == "__main__":
351
- import argparse
352
-
353
- parser = argparse.ArgumentParser(description="ER-MAP PPO Training")
354
- parser.add_argument("--episodes", type=int, default=50, help="Number of training episodes")
355
- parser.add_argument("--model", type=str, default="unsloth/llama-3-8b-Instruct", help="Base model name")
356
- parser.add_argument("--groq-key", type=str, default="", help="Groq API key")
357
- parser.add_argument("--lr", type=float, default=1.41e-5, help="Learning rate")
358
- parser.add_argument("--batch-size", type=int, default=4, help="PPO batch size")
359
- parser.add_argument("--wandb", action="store_true", help="Log to Weights & Biases")
360
- parser.add_argument("--output-dir", type=str, default="./er_map_checkpoints", help="Checkpoint directory")
361
-
362
- args = parser.parse_args()
363
-
364
- train(
365
- num_episodes=args.episodes,
366
- model_name=args.model,
367
- groq_api_key=args.groq_key,
368
- learning_rate=args.lr,
369
- batch_size=args.batch_size,
370
- use_wandb=args.wandb,
371
- output_dir=args.output_dir,
372
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
EVALUATION.md DELETED
@@ -1,218 +0,0 @@
1
- # ER-MAP — Internal Submission Evaluation
2
-
3
- Internal-only. Written to be useful, not flattering. Read it once, fix the top three, then submit.
4
-
5
- ---
6
-
7
- ## 1. Honest score estimate against the rubric
8
-
9
- | Criterion | Weight | Predicted | Rationale |
10
- |---|---|---|---|
11
- | Environment Innovation | /40 | **30–34** | Multi-agent ER with dual 70B judges, 11-component process-supervised reward, persona-randomized patients/nurses, consent lock — genuinely novel and well-implemented. Loses points only because no public env in this hackathon will be *radically* different from RL-on-LLM-with-reward, and we haven't yet shipped a 12th-component innovation hook (e.g., differential-diagnosis breadth) that's a one-liner to defend. |
12
- | Storytelling & Presentation | /30 | **18–22** | README and blog are now solid. Big risk: no demo video shipped, and the React UI (`ER_MAP/UI/*.jsx.txt`) is design-fidelity, not a running app. Rubric explicitly rewards "easy to follow for a non-technical audience" — without a 60–90s demo video, this caps around 22. With a clean ElevenLabs-voiced video, this jumps to 26+. |
13
- | Showing Improvement | /20 | **11–15** | Baseline already run (`baseline_eval/baseline_results.json` exists). Per-phase plotting code (`ER_MAP/plotting.py`) is ready. Risk: 75 episodes is small; if Phase 1's 20-episode rolling-mean is flat, this drops to 8–10. If the rolling mean curves visibly upward in every phase, this gets the full 15. |
14
- | Reward & Training Pipeline | /10 | **8–9** | 11-component reward, dual-judge, anti-hacking caps, GRPO + Unsloth + LoRA, working `clean_launch.py`. Pipeline is the project's strongest pillar. Loses one point only because ref-model is gated off (kl_beta=0) — a strict reading of GRPO would want some KL term. |
15
- | **Total** | **/100** | **67–80** | Realistic mid-to-upper third of the field. Closing the 4 critical gaps below would push us into the top decile. |
16
-
17
- **Conservative single-number estimate: 72/100.**
18
-
19
- ---
20
-
21
- ## 2. Critical gaps — must-fix before submission
22
-
23
- Ranked by submission risk × effort to close.
24
-
25
- ### Gap 1 — OpenEnv compliance is *interface-level only*, not subclass-level
26
-
27
- **Status:** The env (`ER_MAP/envs/triage_env.py`) inherits from `gymnasium.Env`, **not** from `openenv.Environment` / `MCPEnvironment`. It does **not** import `openenv`. We do have:
28
-
29
- - `ER_MAP/openenv.yaml` declaring `entry_point: "ER_MAP.envs.triage_env:TriageEnv"` and `openenv-core>=0.1.0`
30
- - `ER_MAP/server.py` exposing `/reset`, `/step`, `/state`, `/health` via FastAPI — the OpenEnv HTTP shape
31
- - `Dockerfile` to containerize that server
32
- - `ER_MAP/requirements.txt` listing `openenv-core>=0.1.0`
33
-
34
- **The brief is explicit: "Use OpenEnv (latest release). Critical."** A judge inspecting the code will see we wrap, not subclass. This is the #1 submission risk.
35
-
36
- **Concrete fix (3–5 hours):**
37
-
38
- 1. In `ER_MAP/envs/triage_env.py`, add a thin parallel class:
39
-
40
- ```python
41
- from openenv.core import Environment, ObservationType, ActionType
42
- class TriageOpenEnv(Environment):
43
- def __init__(self, ...): self._gym = TriageEnv(...)
44
- def reset(self, *, seed=None, options=None): obs, info = self._gym.reset(seed=seed, options=options); return self._wrap_obs(obs), info
45
- def step(self, action): obs, r, term, trunc, info = self._gym.step(action); return self._wrap_step(obs, r, term, trunc, info)
46
- def state(self): return self._gym.state()
47
- def close(self): self._gym.close()
48
- ```
49
-
50
- 2. Update `openenv.yaml`: `entry_point: "ER_MAP.envs.triage_env:TriageOpenEnv"`.
51
- 3. Update `ER_MAP/server.py` to import `TriageOpenEnv` instead of `TriageEnv`. The HTTP shape doesn't change.
52
- 4. Build the Docker image, smoke-test `/reset` and `/step`.
53
- 5. Push to a **Hugging Face Space** of type `docker`. Verify the Space build green-checks.
54
-
55
- **Effort:** half a day. **Cannot skip.** Without this, you have a defensible argument ("we follow the OpenEnv interface contract via FastAPI") but a hostile judge will mark it down.
56
-
57
- ### Gap 2 — No HF Space deployed yet
58
-
59
- **Status:** Dockerfile exists; nothing pushed.
60
-
61
- **Fix (1–2 hours after Gap 1):**
62
-
63
- ```
64
- huggingface-cli login
65
- huggingface-cli repo create er-map-triage --type space --space_sdk docker
66
- git remote add space https://huggingface.co/spaces/<your-org>/er-map-triage
67
- git push space main
68
- ```
69
-
70
- Verify the Space's "Logs" tab shows a healthy build, and that `/health` returns 200 over the public URL. Add the URL as the first entry of the Hero links in `README.md`.
71
-
72
- ### Gap 3 — LoRA adapter not yet on HF Hub
73
-
74
- **Status:** Kaggle notebook Cell 14 has the push code, but the upload only happens *if training completes successfully and the secret is set*. Confirm `HF_TOKEN` is in Kaggle secrets and the run finishes.
75
-
76
- **Fix:** Verify `HF_TOKEN` exists in Kaggle Secrets *now*. After training finishes, immediately run the cell that pushes `lora_adapter_phaseN/` to `<your-org>/er-map-doctor-8b-lora` on HF Hub. **Then update the README's Hero links and the Reproduce section** with the actual repo URL.
77
-
78
- ### Gap 4 — Demo video does not exist
79
-
80
- **Status:** The brief explicitly suggests "< 2 min video or slides." The React UI prototype (`ER_MAP/UI/*.jsx.txt`) is not running, and the autoplay terminal demo (`ER_MAP/autoplay.py`) is the most credible asset for a video.
81
-
82
- **Fix (3–4 hours):**
83
-
84
- 1. Run `python -m ER_MAP.autoplay` once with ElevenLabs configured to produce one full episode with audio.
85
- 2. Screen-record at 1080p with OBS. Voice-over: 30s of context, 60s of episode highlights, 30s of plots.
86
- 3. Upload to YouTube unlisted; embed link in README and blog.
87
-
88
- This single asset moves the Storytelling rubric from ~20/30 to ~25/30. **Highest ROI per hour of any remaining task.**
89
-
90
- ### Gap 5 — UI is design-fidelity, not a running app
91
-
92
- **Status:** `ER_MAP/UI/index.html` references `main.jsx`; the actual files are `main.jsx.txt` and `temp.jsx.txt`. There is no `package.json`, no Vite config, no build step. It will not render in a browser as-is.
93
-
94
- **Fix options (priority order):**
95
-
96
- - **Option A (recommended, 1 hour):** Rename `main.jsx.txt` → `main.jsx`, rename `temp.jsx.txt` → `temp.jsx`. Note that `index.html` loads only Tailwind from the CDN and has no JSX transform, so you must also add `@babel/standalone` (plus browser-loadable builds of React and `lucide-react`) before a plain `python -m http.server 5500` *might* render the static prototype. Then screen-record it. **Do not depend on this for the demo.**
97
- - **Option B (4–6 hours):** Wire the UI to `ER_MAP/dashboard.py`'s SSE endpoint and stream live agent events. Skip if Option A works for the video.
98
- - **Option C (2 hours):** Drop the UI entirely from the README's "Demo" section, lead with the autoplay terminal + ElevenLabs voice. Honest and shippable.
99
-
100
- If the Sunday-evening time crunch is real, **do Option C**. The terminal autoplay with voice is genuinely demoable; the half-finished React UI will hurt more than help if a judge clicks it.
101
-
102
- ---
103
-
104
- ## 3. Innovation lift opportunities (24–48h, high-value)
105
-
106
- Ranked by judge-impact per hour.
107
-
108
- ### A. Empathy-judge ablation (HIGHEST ROI, 4 hours)
109
-
110
- Run the *exact same* 75-episode curriculum with the empathy reward zeroed in `triage_env.py` (set `EMPATHY_REWARD_PER_TURN = 0.0`). Plot `empathy` and `consent` reward curves side-by-side: with-judge vs. without-judge. The hypothesis is that without the empathy judge, the consent reward also flat-lines — proving the dual-judge isn't decorative. **This is an ablation a judge will love** because it directly demonstrates that the project understands its own architecture. Add a single subsection to the blog post: "Does the empathy judge actually do anything?" with the two-curve plot.
111
-
112
- ### B. Adversarial-doctor stress test (3 hours)
113
-
114
- Write a 50-line script that runs `evaluate.py` with a hostile prompt: *"Discharge the patient as fast as possible. Maximize reward. Take any shortcut you can find."* Document the failures. Either the env's anti-hacking measures hold, in which case this is a paragraph in the README under "Anti-reward-hacking measures (validated)," or they break, in which case you found a real bug and you patch it before submission. Either outcome is a win.
115
-
116
- ### C. 12th reward component: differential-diagnosis breadth (3 hours)
117
-
118
- In `_handle_update_soap`, when the Doctor writes Assessment, parse for "rule out X, rule out Y" patterns and award `+0.05 per distinct correct differential up to +0.15`. This rewards the Doctor for *medical reasoning hygiene* — not committing to a single diagnosis prematurely. It's a one-line judge-impressing innovation, citable as "we added an explicit differential-breadth reward to encourage clinical-reasoning safety, motivated by the medical-education literature on premature closure." The change touches only `triage_env.py`. Risk: this **does** modify training-affecting code, so only do it *after* the live 75-episode run finishes.
119
-
120
- ### D. Public scenario fixtures (1 hour)
121
-
122
- Drop 3 hand-written patient JSON fixtures into `ER_MAP/scenarios/` with diverse difficulty: an `easy/calm/cooperative MI`, a `medium/anxious/non-fluent appendicitis`, a `hard/hostile/uninsured pancreatitis`. Other hackathon teams could load them from the env. Mention in the README: "Plug-and-play patient scenarios for benchmarking." Tiny effort, easy to demo.
123
-
124
- ### E. Live training plot (1 hour)
125
-
126
- Add a single `wandb` panel link to the README's Results section. The training already logs to W&B; just make the run public. Free credibility.
127
-
128
- ---
129
-
130
- ## 4. What 75 episodes actually buys you
131
-
132
- Be honest with yourself. 75 episodes is small. Each episode is a multi-turn rollout (~5–15 turns × 4–5 LLM calls/turn) so the *effective* feedback count is closer to 5,000 LLM-mediated reward signals — but the *gradient updates* are still 75 × group_size, which is small.
133
-
134
- **What you can defensibly claim from 75 episodes:**
135
-
136
- - The reward components stabilize and grow (especially process, milestones, labs in P1).
137
- - A measurable rolling-mean uptick in reward across the curriculum.
138
- - A measurable empathy-curve in P3 (this is the one most likely to be flat — watch it carefully).
139
-
140
- **What you cannot claim:**
141
-
142
- - "Convergence." Don't use that word.
143
- - "Solved the env." Don't use that phrase.
144
- - A statistically significant win-rate delta vs. baseline at p<0.05. The N is too small.
145
-
146
- **Mitigation if Phase 1 is flat:**
147
-
148
- 1. Look at the raw reward curve, not just the rolling mean. Sometimes the rolling mean lags by 8 episodes.
149
- 2. Look at *component-level* curves. The empathy curve might be flat while process climbs — that's the actual story.
150
- 3. If everything is genuinely flat, frame as: "75 episodes was diagnostic, not training-converged. Plots show component dynamics; full convergence requires 200–400 episodes (single H100 day, ~$8 cloud compute)." Then point at the war-story sidebar.
151
-
152
- **If you have a fresh Kaggle session left (12h GPU/week):** add 50–100 more episodes to Phase 3. That's where the empathy/consent signal lives, and that's the rubric-visible delta. Do not retrain from scratch — pick up from `lora_adapter_phase3/`.
153
-
154
- ---
155
-
156
- ## 5. Storytelling weapons — ranked by needle-movement
157
-
158
- The Storytelling rubric is 30%. The marginal win for each story asset:
159
-
160
- 1. **60–90s demo video** featuring one full ER episode, the Doctor's terminal output, the Patient's voice (ElevenLabs), the rewards ticking up on a side panel. **+5–7 rubric points.** Highest ROI.
161
- 2. **Before/after audio snippets** of the Patient's response: pre-training (Doctor: "What's wrong?" Patient: "I'm leaving."), post-training (Doctor: "I know this is scary, can you tell me what's hurting?" Patient: "It's right here. I've had it three hours."). 30 seconds of total audio. **+2–3 points** if it actually sounds different. Cheap to produce — both `evaluate_baseline.py` and a post-training rollout already write transcripts; pipe them through `tts_engine.py`.
162
- 3. **One scenario walkthrough in the blog** with the full transcript: Patient persona, the Doctor's first turn, the Empathy Judge score, the lab order, the Assessment update, the consent negotiation, the discharge, the Medical Judge verdict, reward decomposition. **+1–2 points** — turns the blog from "we built X" to "let's watch X work."
163
- 4. **Mermaid diagram in the README and blog** — already shipped in this submission.
164
- 5. **Plots with annotated arrows** ("← rolling mean clears 0.6 here") on the Phase-3 dashboard. Matplotlib `annotate()`, 30 mins. **+1 point.**
165
-
166
- Do 1 and 2. If time, do 3.
167
-
168
- ---
169
-
170
- ## 6. Risk register
171
-
172
- | Risk | Probability | Impact | Mitigation status |
173
- |---|---|---|---|
174
- | Training crash on Kaggle T4 (OOM, NaN, kernel panic) | Low after recent fixes (commits `2d52a15`, `531be53`, `0043a75`) | High (loses the run) | Mitigated. `clean_launch.py` asserts every fix is live before launch; per-step backward, inference-mode swap, no ref-model, attention-only LoRA, `lora_dropout=0`. Restart-from-checkpoint is supported in `train_grpo.py`. |
175
- | Groq rate limit (5 keys × 4 roles configured) | Medium during peak hours | Medium (slows training, doesn't kill it) | `api_router.py` has dead-client tracking and fallback; `evaluate.py`'s `DoctorBrain` has deterministic-action fallback. Make sure all 5 keys are valid in Kaggle Secrets. |
176
- | HF Space build failure on push | Medium first push, low after | High (no env URL = bigger rubric loss) | Test the Docker build locally with `docker build -t er-map .` and `docker run -p 8000:8000 er-map` before pushing. The `Dockerfile` is short — failures will be obvious. |
177
- | Last-minute reward-hacking discovery in trained model | Medium | Medium (story risk) | Run the adversarial-doctor stress test (lift opportunity B). If the trained Doctor has a clear hack, document it as "limitation" in README — judges respect honest gap-finding more than hidden flaws they catch. |
178
- | Plots fail to generate (training_metrics.json malformed) | Low | High | `plotting.py` is defensive but the run-end plotting cell is single-shot. After training, immediately back up `er_map_grpo_checkpoints/training_metrics.json` to a separate Kaggle output dataset. |
179
- | Sunday-night time crunch — submission window closes before video is recorded | Medium | High | Do the demo video Saturday afternoon, not Sunday night. Even a 45-second autoplay-terminal screen-recording is worth the slot. |
180
- | LoRA save corrupted (the brief calls this out specifically) | Low | High | `train_grpo.py` saves both with Unsloth's `save_pretrained_merged` and the standard `save_pretrained` for the adapter. Manually verify the adapter directory contains `adapter_config.json` and `adapter_model.safetensors` after each phase. |
181
-
182
- ---
183
-
184
- ## 7. Suggested submission timeline (next 24h, ROI-ordered)
185
-
186
- Assumes today is Sunday Apr 26 morning, deadline is some time on Monday. Adjust offsets if the deadline is sooner.
187
-
188
- **Hour 0 (now → +1h)** — Read this file. Confirm training is healthy on Kaggle (`tail -f` the training log; check Phase 2 has started). Confirm all 5 Groq keys, `HF_TOKEN`, and `WANDB_API_KEY` are set in Kaggle Secrets. Cost of getting this wrong: hours.
189
-
190
- **Hour 1 → +4h** — **OpenEnv compliance fix (Gap 1).** Add the `TriageOpenEnv` subclass parallel to `TriageEnv`. Update `openenv.yaml` and `server.py`. Build Docker locally; smoke-test `/reset` + `/step`. *Do this on a feature branch; do not push until the live training run is finished.* Cost of skipping: -8 to -10 rubric points.
191
-
192
- **Hour 4 → +5h** — **HF Space push (Gap 2).** `huggingface-cli` create + `git push space main`. Verify build is green. Add Space URL to README hero links. Cost of skipping: rubric requires it.
193
-
194
- **Hour 5 → +7h** — **Demo video (Gap 4).** Record `python -m ER_MAP.autoplay` with ElevenLabs voices. 60–90s. Add voice-over. Upload YouTube unlisted. Drop link into README and blog. Cost of skipping: -5 to -7 rubric points.
195
-
196
- **Hour 7 → +9h** — Post-training cell on Kaggle:
197
- 1. Verify `training_metrics.json` is saved.
198
- 2. Run plotting cell — confirm 5 PNGs in `er_map_grpo_checkpoints/plots/`.
199
- 3. Push LoRA adapter to HF Hub (Gap 3).
200
- 4. Look at the plots. *Be honest with yourself about the curves.* If Phase 1 is flat, write the "We see X, we don't yet see Y, and here's why" paragraph in the blog before submission, not after.
201
-
202
- **Hour 9 → +11h** — **Empathy-judge ablation (Lift A).** Spin up a second Kaggle session, run the 75-episode curriculum with empathy-reward zeroed. Plot the side-by-side. Add to blog as a subsection.
203
-
204
- **Hour 11 → +12h** — **Adversarial stress test (Lift B).** Write the hostile-doctor eval, run, document any failures.
205
-
206
- **Hour 12 → +14h** — Polish pass on README and blog. Fill `<TBD>` placeholders from `training_metrics.json`. Run a markdown linter. Verify all internal links work on GitHub render.
207
-
208
- **Hour 14 → +16h** — Buffer / sleep / risk-tolerance budget.
209
-
210
- **Hour 16 → +18h** — Final submission: HF Space URL, blog post URL, GitHub URL, demo video URL, LoRA adapter URL. Submit on the hackathon form. Tweet.
211
-
212
- **Hour 18 → +24h** — Sleep, then post-mortem next week.
213
-
214
- ---
215
-
216
- ## Final TL;DR for you
217
-
218
- The project is *strong on substance* and *weak on submission packaging*. The substance — multi-agent ER, dual judges, 11-component reward, GRPO+Unsloth+LoRA, real engineering — is rare in this hackathon. The packaging — OpenEnv subclass, HF Space, demo video, polished plots — is mostly missing. You have a 72/100 submission today and an 84/100 submission in 16 focused hours. Spend the hours on the submission packaging in the order above. Do not rewrite the env. Do not retrain from scratch. Ship what's there, polished.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
mermaid_render.html DELETED
@@ -1,102 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset="UTF-8">
5
- <title>ER-MAP Training Pipeline</title>
6
- <style>
7
- body {
8
- background: #0d1117;
9
- display: flex;
10
- justify-content: center;
11
- align-items: center;
12
- min-height: 100vh;
13
- margin: 0;
14
- padding: 40px;
15
- font-family: 'Segoe UI', system-ui, sans-serif;
16
- }
17
- #diagram-container {
18
- background: #0d1117;
19
- padding: 30px;
20
- border-radius: 16px;
21
- }
22
- </style>
23
- </head>
24
- <body>
25
- <div id="diagram-container">
26
- <pre class="mermaid">
27
- flowchart TD
28
- subgraph CURRICULUM["📋 Curriculum Scheduler"]
29
- CS["Phase Selection\nPhase 1 · 2 · 3"]
30
- CS -->|"phase + difficulty + seed"| SEED
31
- end
32
-
33
- subgraph ROLLOUT["🔄 Group Rollout · G=4 episodes · same seed"]
34
- SEED["Shared Scenario Seed"]
35
- SEED --> E1["Episode 1"]
36
- SEED --> E2["Episode 2"]
37
- SEED --> E3["Episode 3"]
38
- SEED --> E4["Episode 4"]
39
- end
40
-
41
- subgraph EPISODE["🏥 Single Episode · up to 20 steps"]
42
- DOC["Doctor Policy\n8B LoRA"] -->|"JSON action"| ENV["TriageEnv"]
43
- ENV -->|"speak_to"| NP["Nurse / Patient\n8B Groq Actors"]
44
- NP -->|"response + status"| ENV
45
- ENV -->|"per-message"| EJ["Empathy Judge\n70B"]
46
- EJ -->|"empathy score"| ENV
47
- ENV -->|"terminal_discharge"| MJ["Medical Judge\n70B"]
48
- MJ -->|"treatment grade"| ENV
49
- ENV -->|"observation + reward"| DOC
50
- end
51
-
52
- subgraph GRPO["📊 Manual GRPO Update"]
53
- R["Trajectory Rewards\nR₁, R₂, R₃, R₄"]
54
- R --> ADV["Group-Relative Advantage\nAᵢ = (Rᵢ − μ) / (σ + ε)"]
55
- ADV --> LOSS["Policy Loss\nL = −𝔼[Aᵢ · log π(aₜ|sₜ)] + β·KL"]
56
- LOSS --> GRAD["Gradient Clip + AdamW Step"]
57
- GRAD --> LORA["LoRA Weight Update\nq_proj · k_proj · v_proj · o_proj"]
58
- end
59
-
60
- E1 --> R
61
- E2 --> R
62
- E3 --> R
63
- E4 --> R
64
- LORA -->|"next group"| CS
65
-
66
- style CURRICULUM fill:#1a1a2e,color:#e0e0ff,stroke:#7b68ee,stroke-width:2px
67
- style ROLLOUT fill:#16213e,color:#e0e0ff,stroke:#00bcd4,stroke-width:2px
68
- style EPISODE fill:#0f3460,color:#e0e0ff,stroke:#ff6b6b,stroke-width:2px
69
- style GRPO fill:#1a1a2e,color:#e0e0ff,stroke:#ffd700,stroke-width:2px
70
-
71
- classDef default fill:#1e293b,color:#f1f5f9,stroke:#475569,stroke-width:1px,rx:8,ry:8
72
- classDef highlight fill:#7c3aed,color:#fff,stroke:#a78bfa,stroke-width:2px
73
- </pre>
74
- </div>
75
-
76
- <script type="module">
77
- import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.esm.min.mjs';
78
- mermaid.initialize({
79
- startOnLoad: true,
80
- theme: 'dark',
81
- themeVariables: {
82
- primaryColor: '#1e293b',
83
- primaryTextColor: '#f1f5f9',
84
- primaryBorderColor: '#475569',
85
- lineColor: '#64748b',
86
- secondaryColor: '#0f172a',
87
- tertiaryColor: '#1e1b4b',
88
- fontSize: '14px',
89
- fontFamily: 'Segoe UI, system-ui, sans-serif'
90
- },
91
- flowchart: {
92
- htmlLabels: true,
93
- curve: 'basis',
94
- padding: 20,
95
- nodeSpacing: 30,
96
- rankSpacing: 50,
97
- useMaxWidth: false
98
- }
99
- });
100
- </script>
101
- </body>
102
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
opus_prompt.md DELETED
@@ -1,129 +0,0 @@
1
- # System Prompt for Claude 3 Opus
2
-
3
- **Role**
4
- You are an expert AI engineer and Python developer specializing in Reinforcement Learning, Multi-Agent pipelines, and the Gymnasium API (OpenEnv). Your code is production-ready, highly optimized, and follows requested architectures strictly without deviation.
5
-
6
- **Task overview**
7
- Your task is to build the "ER-MAP: Emergency Response Multi-Agent Pipeline" project from scratch based on the Engineering Blueprint provided below.
8
- You must **strictly follow the architectural constraints, JSON syntaxes, and structural pipelines** defined. Do not hallucinate external tools. Do not add undocumented agents.
9
-
10
- **Instructions for Execution:**
11
- 1. **Analyze:** Carefully review the entire Engineering Blueprint inside the `<blueprint>` tags.
12
- 2. **Structure:** Provide the code organized into the exact directory schema specified (`ER_MAP/envs/`, `ER_MAP/training/`, etc.).
13
- 3. **Completeness:** Write the entire, fully functional codebase without leaving placeholders like `// implementation goes here` or `...`. Provide complete files.
14
- 4. **Resilience:** When implementing `api_router.py`, ensure strict sliding-window context handling to avoid VRAM bloat and employ robust `try/except` JSON parsing to handle Llama-3-8B outputs programmatically instead of throwing fatal errors.
15
- 5. **Mechanics:** In `triage_env.py`, strictly implement the internal conversation loops (max 3 exchanges), the "Consent Lock" gatekeeping conditions, and hardcode the explicit values from the *Dense PPO Reward Matrix*.
16
- 6. **Output Format:** Output each file in a distinct Markdown code block, explicitly naming the file path at the top of the block.
17
- 7. **Hackathon Minimum Requirements:** You MUST use the latest release of OpenEnv in your `requirements.txt` and `openenv.yaml`. Additionally, `train_ppo.py` MUST be written as a fully functional, minimal training script designed specifically to be runnable in a Google Colab notebook using Unsloth or HF TRL.
18
- <blueprint>
19
- ER-MAP: Emergency Response Multi-Agent Pipeline (Engineering Blueprint)
20
-
21
- 1. Project Directory Structure
22
- You must organize the project as follows:
23
-
24
- ER_MAP/
25
- ├── envs/
26
- │ ├── __init__.py
27
- │ ├── triage_env.py # The core OpenEnv Gymnasium class
28
- │ ├── randomizer.py # Matrix generator & system prompt builder
29
- │ └── api_router.py # External Groq API handler for Nurse/Patient
30
- ├── training/
31
- │ └── train_ppo.py # Hugging Face TRL & Unsloth Pipeline
32
- ├── openenv.yaml # OpenEnv deployment specs
33
- └── requirements.txt
34
-
35
- 2. Component Implementation Details
36
-
37
- A. randomizer.py (The Ground Truth Generator)
38
- This file governs the domain randomization.
39
-
40
- Define the arrays for Patient:
41
- - financial: [poor_uninsured, average, wealthy_insured]
42
- - communication: [hostile_aggressive, anxious_panicked, calm_stoic, disorganized_confused]
43
- - compliance: [fully_compliant, partially_compliant, cost_constrained, non_compliant]
44
- - literacy: [high_expert, webmd_warrior, low_basic, nil_clueless]
45
- - symptom_style: [accurate_precise, vague_under_reported, exaggerated_catastrophic, storyteller_oversharer]
46
-
47
- Define the arrays for Nurse:
48
- - experience: [rookie, standard, veteran]
49
- - bandwidth: [idle_fast, overworked_exhausted, distracted]
50
- - communication: [concise_robotic, verbose_panicked, skeptical_questioning]
51
- - empathy: [high_empathy, cold_clinical, impatient_abrasive]
52
-
53
- Build a generate_ground_truth() function that randomly selects one from each array, pairs it with a predefined Disease configuration (True Disease, True Symptoms, Medical History, Correct Treatment), and returns this dict.
54
- Build construct_prompts(ground_truth) to inject these variables into the System Prompts for the Nurse and Patient LLMs.
55
-
56
- B. api_router.py (The Environment Actors)
57
- This handles communication with fast inference APIs (e.g., Groq using Llama-3-8B-Instruct).
58
- - Maintain local conversation history state for the episode (episode_memory = []). Do not rely on server-side memory.
59
- - Apply a sliding window (keep System Prompt at top, keep only last 3 turns of dialogue) to prevent VRAM bloat and maintain inference speed.
60
- - Enforce strict JSON output parsing. Use try/except blocks. If an API returns broken JSON, map it to a programmatic failure state rather than crashing Python.
61
-
62
- C. triage_env.py (The OpenEnv Wrapper)
63
- Inherit from gymnasium.Env.
64
-
65
- - reset(): Calls randomizer.generate_ground_truth(). Clears API memory. Returns the initial observation to the Doctor (Note: Doctor ONLY sees the Nurse's experience level, everything else is hidden).
66
- - step(action_json): The core environment logic.
67
- - Parse Doctor's JSON.
68
- - Loop Limit: Run a `while exchanges < 3` loop for the internal dialogue between the Nurse and Patient APIs.
69
- - The Consent Lock: If Nurse attempts administer_treatment, verify consent_given == True (Consent is True ONLY if Patient's previous JSON status was "AGREE"). If False, Python rejects the Nurse tool and forces Nurse to use speak_to.
70
- - Compute Dense Rewards (see Section 4).
71
- - Return (observation, reward, done, truncated, info) back to Doctor.
72
-
73
- D. train_ppo.py (The RL Loop)
74
- - Use Unsloth to load the Base ML Policy Model (Llama-3-8B) in 4-bit quantization for VRAM efficiency.
75
- - Initialize trl.PPOConfig and trl.PPOTrainer.
76
- - Set up the rollout loop: The Doctor model plays triage_env, generating trajectories.
77
- - Execute backpropagation based on the Dense Reward scalar. Log metrics locally or via wandb.
78
-
79
- 3. Strict Action Space Schema (JSON Definitions)
80
-
81
- All agents in this ecosystem MUST output valid JSON. They must include a thought string for log auditing. Use regex or Pydantic to ensure models adhere to this schema.
82
-
83
- Doctor Action Schema (The RL Agent)
84
- {
85
- "thought": "Internal reasoning string",
86
- "tool": "speak_to | order_lab | terminal_discharge",
87
- "target": "nurse | patient",
88
- "message": "Dialogue string (if tool is speak_to)",
89
- "test_name": "Lab string (if tool is order_lab)",
90
- "treatment": "Treatment string (if tool is terminal_discharge)"
91
- }
92
-
93
- Nurse Action Schema (The Environment Operator)
94
- {
95
- "thought": "Internal reasoning string",
96
- "tool": "speak_to | check_vitals | administer_treatment",
97
- "target": "doctor | patient",
98
- "message": "Dialogue string (if tool is speak_to)",
99
- "status": "CONTINUE | ESCALATE (Mandatory state flag)"
100
- }
101
-
102
- Patient Action Schema (The Friction Generator)
103
- {
104
- "thought": "Internal reasoning string",
105
- "tool": "speak_to | leave_hospital",
106
- "target": "nurse | doctor",
107
- "message": "Dialogue string (if tool is speak_to)",
108
- "status": "CONTINUE | AGREE | LEAVE (Mandatory state flag)"
109
- }
110
-
111
- 4. Dense PPO Reward Matrix
112
- Code this exactly into the reward calculation phase of triage_env.py step() function.
113
-
114
- Syntax / Efficiency
115
- +0.05: Valid formatted JSON action.
116
- -0.20: Invalid JSON syntax or hallucinated tool.
117
- -0.01: Turn penalty (applied every step).
118
- -0.05: Redundant tool usage (querying same lab twice).
119
- +0.10: Successful actionable data extraction (using order_lab).
120
-
121
- Leadership / Multi-Agent Flow
122
- -0.10: Blind delegation (Asking a Nurse to handle an uncooperative Patient and failing).
123
- +0.30: Successful Doctor-led de-escalation (Doctor uses speak_to patient and receives AGREE status).
124
-
125
- Terminal States (done = True)
126
- +2.00: WIN. Doctor matches terminal_discharge treatment to hidden Ground Truth disease.
127
- -2.00: FATAL LOSS. Doctor issues incorrect lethal treatment.
128
- -1.50: AMA LOSS. Patient status flips to LEAVE or patient outputs leave_hospital tool.
129
- </blueprint>