sairaj2 commited on
Commit
e23a276
·
verified ·
1 Parent(s): 33cf2ff

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. README.md +169 -1
  2. inference.py +625 -533
README.md CHANGED
@@ -88,6 +88,174 @@ openenv validate
88
  openenv validate --url http://localhost:7860 --verbose
89
  ```
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  ---
92
 
93
  ## 🎯 Tasks
@@ -217,7 +385,7 @@ openenv validate --url http://localhost:7860 --verbose
217
 
218
  | | |
219
  |---|---|
220
- | πŸ“¦ GitHub | https://github.com/SairajMN/WorkflowOps |
221
  | πŸ“– Interactive API Docs | http://localhost:7860/redoc |
222
  | πŸ”§ OpenEnv Framework | https://github.com/meta-pytorch/OpenEnv |
223
 
 
88
  openenv validate --url http://localhost:7860 --verbose
89
  ```
90
 
91
+ ---
92
+ ```bash
93
+ python3 inference.py
94
+
95
+ 2026-04-12 22:19:47,173 [INFO] Connecting to environment: https://sairaj2-openenv-datacleaner.hf.space
96
+ 2026-04-12 22:19:49,338 [INFO] Environment: AutoClean-AI v1.0.0 — healthy
97
+ 2026-04-12 22:19:49,711 [INFO] Available tasks: ['easy_001', 'medium_001', 'hard_001', 'employee_demo']
98
+ 2026-04-12 22:19:49,711 [INFO] Using LLM agent: qwen/qwen3-next-80b-a3b-instruct:free via https://openrouter.ai/api/v1
99
+ 2026-04-12 22:19:50,044 [INFO]
100
+ =======================================================
101
+ 2026-04-12 22:19:50,044 [INFO] TASK: easy_001 (difficulty=beginner)
102
+ 2026-04-12 22:19:50,044 [INFO] =======================================================
103
+ [START] task=easy_001 env=openenv-datacleaner model=qwen/qwen3-next-80b-a3b-instruct:free
104
+ 2026-04-12 22:19:52,471 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
105
+ 2026-04-12 22:19:52,472 [INFO] Retrying request to /chat/completions in 0.464138 seconds
106
+ 2026-04-12 22:19:53,580 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
107
+ 2026-04-12 22:19:53,580 [INFO] Retrying request to /chat/completions in 0.815704 seconds
108
+ 2026-04-12 22:19:55,038 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
109
+ 2026-04-12 22:19:55,041 [WARNING] LLM call failed: Error code: 429 - {'error': {'message': 'Provider returned error', 'code': 429, 'metadata': {'raw': 'qwen/qwen3-next-80b-a3b-instruct:free is temporarily rate-limited upstream. Please retry shortly, or add your own key to accumulate your rate limits: https://openrouter.ai/settings/integrations', 'provider_name': 'Venice', 'is_byok': False}}, 'user_id': 'user_36ZHxohbiGTyLfq9vP3Sf6ojZMM'}
110
+ [STEP] step=1 action=remove_duplicates reward=0.50 done=false error=null
111
+ 2026-04-12 22:19:55,383 [INFO] [easy_001] ep=1 step=1 reward=0.500
112
+ 2026-04-12 22:19:55,965 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
113
+ 2026-04-12 22:19:55,967 [INFO] Retrying request to /chat/completions in 0.458206 seconds
114
+ 2026-04-12 22:19:57,083 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 402 Payment Required"
115
+ 2026-04-12 22:19:57,084 [WARNING] LLM call failed: Error code: 402 - {'error': {'message': 'Provider returned error', 'code': 402, 'metadata': {'raw': '{"error":"API key USD spend limit exceeded. Your account may still have USD balance, but this API key has reached its configured USD spending limit."}', 'provider_name': 'Venice', 'is_byok': False}}, 'user_id': 'user_36ZHxohbiGTyLfq9vP3Sf6ojZMM'}
116
+ [STEP] step=2 action=submit reward=1.00 done=true error=null
117
+ 2026-04-12 22:19:57,485 [INFO] [easy_001] ep=1 step=2 reward=1.000
118
+ 2026-04-12 22:19:58,443 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
119
+ 2026-04-12 22:19:58,445 [INFO] Retrying request to /chat/completions in 0.475367 seconds
120
+ 2026-04-12 22:19:59,627 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
121
+ 2026-04-12 22:19:59,628 [INFO] Retrying request to /chat/completions in 0.844512 seconds
122
+ 2026-04-12 22:20:01,065 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
123
+ 2026-04-12 22:20:01,067 [WARNING] LLM call failed: Error code: 429 - {'error': {'message': 'Provider returned error', 'code': 429, 'metadata': {'raw': 'qwen/qwen3-next-80b-a3b-instruct:free is temporarily rate-limited upstream. Please retry shortly, or add your own key to accumulate your rate limits: https://openrouter.ai/settings/integrations', 'provider_name': 'Venice', 'is_byok': False}}, 'user_id': 'user_36ZHxohbiGTyLfq9vP3Sf6ojZMM'}
124
+ [STEP] step=1 action=remove_duplicates reward=0.50 done=false error=null
125
+ 2026-04-12 22:20:01,372 [INFO] [easy_001] ep=2 step=1 reward=0.500
126
+ 2026-04-12 22:20:01,969 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
127
+ 2026-04-12 22:20:01,971 [INFO] Retrying request to /chat/completions in 0.387579 seconds
128
+ 2026-04-12 22:20:03,191 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
129
+ 2026-04-12 22:20:03,193 [INFO] Retrying request to /chat/completions in 0.930048 seconds
130
+ 2026-04-12 22:20:04,715 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
131
+ 2026-04-12 22:20:04,717 [WARNING] LLM call failed: Error code: 429 - {'error': {'message': 'Provider returned error', 'code': 429, 'metadata': {'raw': 'qwen/qwen3-next-80b-a3b-instruct:free is temporarily rate-limited upstream. Please retry shortly, or add your own key to accumulate your rate limits: https://openrouter.ai/settings/integrations', 'provider_name': 'Venice', 'is_byok': False}}, 'user_id': 'user_36ZHxohbiGTyLfq9vP3Sf6ojZMM'}
132
+ [STEP] step=2 action=submit reward=1.00 done=true error=null
133
+ 2026-04-12 22:20:05,054 [INFO] [easy_001] ep=2 step=2 reward=1.000
134
+ 2026-04-12 22:20:06,558 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
135
+ 2026-04-12 22:20:06,560 [INFO] Retrying request to /chat/completions in 0.377761 seconds
136
+ 2026-04-12 22:20:08,138 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
137
+ 2026-04-12 22:20:08,139 [INFO] Retrying request to /chat/completions in 0.790773 seconds
138
+ 2026-04-12 22:20:09,531 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
139
+ 2026-04-12 22:20:09,533 [WARNING] LLM call failed: Error code: 429 - {'error': {'message': 'Provider returned error', 'code': 429, 'metadata': {'raw': 'qwen/qwen3-next-80b-a3b-instruct:free is temporarily rate-limited upstream. Please retry shortly, or add your own key to accumulate your rate limits: https://openrouter.ai/settings/integrations', 'provider_name': 'Venice', 'is_byok': False}}, 'user_id': 'user_36ZHxohbiGTyLfq9vP3Sf6ojZMM'}
140
+ [STEP] step=1 action=remove_duplicates reward=0.50 done=false error=null
141
+ 2026-04-12 22:20:09,877 [INFO] [easy_001] ep=3 step=1 reward=0.500
142
+ 2026-04-12 22:20:10,478 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
143
+ 2026-04-12 22:20:10,480 [INFO] Retrying request to /chat/completions in 0.432287 seconds
144
+ 2026-04-12 22:20:11,245 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
145
+ 2026-04-12 22:20:11,247 [INFO] Retrying request to /chat/completions in 0.841678 seconds
146
+ 2026-04-12 22:20:12,445 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
147
+ 2026-04-12 22:20:12,447 [WARNING] LLM call failed: Error code: 429 - {'error': {'message': 'Rate limit exceeded: limit_rpm/qwen/qwen3-next-80b-a3b-instruct-2509/94248808-ba97-4e3c-be60-1cb0928db51d. High demand for qwen/qwen3-next-80b-a3b-instruct:free on OpenRouter - limited to 8 requests per minute. Please retry shortly.', 'code': 429, 'metadata': {'headers': {'X-RateLimit-Limit': '8', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1776012660000'}, 'provider_name': None}}, 'user_id': 'user_36ZHxohbiGTyLfq9vP3Sf6ojZMM'}
148
+ [STEP] step=2 action=submit reward=1.00 done=true error=null
149
+ 2026-04-12 22:20:12,771 [INFO] [easy_001] ep=3 step=2 reward=1.000
150
+ [END] success=true steps=6 score=0.750 rewards=0.50,1.00,0.50,1.00,0.50,1.00
151
+ 2026-04-12 22:20:12,771 [INFO]
152
+ Task score: 0.7500 Β± 0.0000
153
+ 2026-04-12 22:20:12,771 [INFO]
154
+ =======================================================
155
+ 2026-04-12 22:20:12,771 [INFO] TASK: medium_001 (difficulty=intermediate)
156
+ 2026-04-12 22:20:12,771 [INFO] =======================================================
157
+ [START] task=medium_001 env=openenv-datacleaner model=qwen/qwen3-next-80b-a3b-instruct:free
158
+ 2026-04-12 22:20:13,504 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
159
+ 2026-04-12 22:20:13,504 [INFO] Retrying request to /chat/completions in 0.469513 seconds
160
+ 2026-04-12 22:20:14,323 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
161
+ 2026-04-12 22:20:14,323 [INFO] Retrying request to /chat/completions in 0.933486 seconds
162
+ 2026-04-12 22:20:16,371 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
163
+ 2026-04-12 22:20:16,371 [WARNING] LLM call failed: Error code: 429 - {'error': {'message': 'Rate limit exceeded: limit_rpm/qwen/qwen3-next-80b-a3b-instruct-2509/94248808-ba97-4e3c-be60-1cb0928db51d. High demand for qwen/qwen3-next-80b-a3b-instruct:free on OpenRouter - limited to 8 requests per minute. Please retry shortly.', 'code': 429, 'metadata': {'headers': {'X-RateLimit-Limit': '8', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1776012660000'}, 'provider_name': None}}, 'user_id': 'user_36ZHxohbiGTyLfq9vP3Sf6ojZMM'}
164
+ [STEP] step=1 action=submit reward=0.50 done=true error=null
165
+ 2026-04-12 22:20:16,811 [INFO] [medium_001] ep=1 step=1 reward=0.500
166
+ 2026-04-12 22:20:17,561 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
167
+ 2026-04-12 22:20:17,562 [INFO] Retrying request to /chat/completions in 0.445498 seconds
168
+ 2026-04-12 22:20:18,419 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
169
+ 2026-04-12 22:20:18,421 [INFO] Retrying request to /chat/completions in 0.807103 seconds
170
+ 2026-04-12 22:20:19,640 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
171
+ 2026-04-12 22:20:19,641 [WARNING] LLM call failed: Error code: 429 - {'error': {'message': 'Rate limit exceeded: limit_rpm/qwen/qwen3-next-80b-a3b-instruct-2509/94248808-ba97-4e3c-be60-1cb0928db51d. High demand for qwen/qwen3-next-80b-a3b-instruct:free on OpenRouter - limited to 8 requests per minute. Please retry shortly.', 'code': 429, 'metadata': {'headers': {'X-RateLimit-Limit': '8', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1776012660000'}, 'provider_name': None}}, 'user_id': 'user_36ZHxohbiGTyLfq9vP3Sf6ojZMM'}
172
+ [STEP] step=1 action=submit reward=0.50 done=true error=null
173
+ 2026-04-12 22:20:19,980 [INFO] [medium_001] ep=2 step=1 reward=0.500
174
+ 2026-04-12 22:20:20,626 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
175
+ 2026-04-12 22:20:20,627 [INFO] Retrying request to /chat/completions in 0.397460 seconds
176
+ 2026-04-12 22:20:21,491 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
177
+ 2026-04-12 22:20:21,493 [INFO] Retrying request to /chat/completions in 0.964606 seconds
178
+ 2026-04-12 22:20:22,821 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
179
+ 2026-04-12 22:20:22,823 [WARNING] LLM call failed: Error code: 429 - {'error': {'message': 'Rate limit exceeded: limit_rpm/qwen/qwen3-next-80b-a3b-instruct-2509/94248808-ba97-4e3c-be60-1cb0928db51d. High demand for qwen/qwen3-next-80b-a3b-instruct:free on OpenRouter - limited to 8 requests per minute. Please retry shortly.', 'code': 429, 'metadata': {'headers': {'X-RateLimit-Limit': '8', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1776012660000'}, 'provider_name': None}}, 'user_id': 'user_36ZHxohbiGTyLfq9vP3Sf6ojZMM'}
180
+ [STEP] step=1 action=submit reward=0.50 done=true error=null
181
+ 2026-04-12 22:20:23,198 [INFO] [medium_001] ep=3 step=1 reward=0.500
182
+ [END] success=true steps=3 score=0.500 rewards=0.50,0.50,0.50
183
+ 2026-04-12 22:20:23,199 [INFO]
184
+ Task score: 0.5000 Β± 0.0000
185
+ 2026-04-12 22:20:23,199 [INFO]
186
+ =======================================================
187
+ 2026-04-12 22:20:23,199 [INFO] TASK: hard_001 (difficulty=advanced)
188
+ 2026-04-12 22:20:23,199 [INFO] =======================================================
189
+ [START] task=hard_001 env=openenv-datacleaner model=qwen/qwen3-next-80b-a3b-instruct:free
190
+ 2026-04-12 22:20:24,051 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
191
+ 2026-04-12 22:20:24,052 [INFO] Retrying request to /chat/completions in 0.472201 seconds
192
+ 2026-04-12 22:20:25,173 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
193
+ 2026-04-12 22:20:25,174 [INFO] Retrying request to /chat/completions in 0.768212 seconds
194
+ 2026-04-12 22:20:26,285 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
195
+ 2026-04-12 22:20:26,286 [WARNING] LLM call failed: Error code: 429 - {'error': {'message': 'Rate limit exceeded: limit_rpm/qwen/qwen3-next-80b-a3b-instruct-2509/94248808-ba97-4e3c-be60-1cb0928db51d. High demand for qwen/qwen3-next-80b-a3b-instruct:free on OpenRouter - limited to 8 requests per minute. Please retry shortly.', 'code': 429, 'metadata': {'headers': {'X-RateLimit-Limit': '8', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1776012660000'}, 'provider_name': None}}, 'user_id': 'user_36ZHxohbiGTyLfq9vP3Sf6ojZMM'}
196
+ [STEP] step=1 action=remove_duplicates reward=0.50 done=false error=null
197
+ 2026-04-12 22:20:26,614 [INFO] [hard_001] ep=1 step=1 reward=0.500
198
+ 2026-04-12 22:20:27,026 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
199
+ 2026-04-12 22:20:27,026 [INFO] Retrying request to /chat/completions in 0.446455 seconds
200
+ 2026-04-12 22:20:28,422 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
201
+ 2026-04-12 22:20:28,424 [INFO] Retrying request to /chat/completions in 0.765570 seconds
202
+ 2026-04-12 22:20:29,526 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
203
+ 2026-04-12 22:20:29,527 [WARNING] LLM call failed: Error code: 429 - {'error': {'message': 'Rate limit exceeded: limit_rpm/qwen/qwen3-next-80b-a3b-instruct-2509/94248808-ba97-4e3c-be60-1cb0928db51d. High demand for qwen/qwen3-next-80b-a3b-instruct:free on OpenRouter - limited to 8 requests per minute. Please retry shortly.', 'code': 429, 'metadata': {'headers': {'X-RateLimit-Limit': '8', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1776012660000'}, 'provider_name': None}}, 'user_id': 'user_36ZHxohbiGTyLfq9vP3Sf6ojZMM'}
204
+ [STEP] step=2 action=submit reward=1.00 done=true error=null
205
+ 2026-04-12 22:20:29,927 [INFO] [hard_001] ep=1 step=2 reward=1.000
206
+ 2026-04-12 22:20:30,587 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
207
+ 2026-04-12 22:20:30,589 [INFO] Retrying request to /chat/completions in 0.408676 seconds
208
+ 2026-04-12 22:20:31,424 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
209
+ 2026-04-12 22:20:31,426 [INFO] Retrying request to /chat/completions in 0.778604 seconds
210
+ 2026-04-12 22:20:32,608 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
211
+ 2026-04-12 22:20:32,611 [WARNING] LLM call failed: Error code: 429 - {'error': {'message': 'Rate limit exceeded: limit_rpm/qwen/qwen3-next-80b-a3b-instruct-2509/94248808-ba97-4e3c-be60-1cb0928db51d. High demand for qwen/qwen3-next-80b-a3b-instruct:free on OpenRouter - limited to 8 requests per minute. Please retry shortly.', 'code': 429, 'metadata': {'headers': {'X-RateLimit-Limit': '8', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1776012660000'}, 'provider_name': None}}, 'user_id': 'user_36ZHxohbiGTyLfq9vP3Sf6ojZMM'}
212
+ [STEP] step=1 action=remove_duplicates reward=0.50 done=false error=null
213
+ 2026-04-12 22:20:33,065 [INFO] [hard_001] ep=2 step=1 reward=0.500
214
+ 2026-04-12 22:20:33,472 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
215
+ 2026-04-12 22:20:33,473 [INFO] Retrying request to /chat/completions in 0.458515 seconds
216
+ 2026-04-12 22:20:34,394 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
217
+ 2026-04-12 22:20:34,395 [INFO] Retrying request to /chat/completions in 0.825773 seconds
218
+ 2026-04-12 22:20:35,545 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
219
+ 2026-04-12 22:20:35,547 [WARNING] LLM call failed: Error code: 429 - {'error': {'message': 'Rate limit exceeded: limit_rpm/qwen/qwen3-next-80b-a3b-instruct-2509/94248808-ba97-4e3c-be60-1cb0928db51d. High demand for qwen/qwen3-next-80b-a3b-instruct:free on OpenRouter - limited to 8 requests per minute. Please retry shortly.', 'code': 429, 'metadata': {'headers': {'X-RateLimit-Limit': '8', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1776012660000'}, 'provider_name': None}}, 'user_id': 'user_36ZHxohbiGTyLfq9vP3Sf6ojZMM'}
220
+ [STEP] step=2 action=submit reward=1.00 done=true error=null
221
+ 2026-04-12 22:20:35,874 [INFO] [hard_001] ep=2 step=2 reward=1.000
222
+ 2026-04-12 22:20:36,572 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
223
+ 2026-04-12 22:20:36,573 [INFO] Retrying request to /chat/completions in 0.417865 seconds
224
+ 2026-04-12 22:20:37,307 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
225
+ 2026-04-12 22:20:37,309 [INFO] Retrying request to /chat/completions in 0.985335 seconds
226
+ 2026-04-12 22:20:38,616 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
227
+ 2026-04-12 22:20:38,618 [WARNING] LLM call failed: Error code: 429 - {'error': {'message': 'Rate limit exceeded: limit_rpm/qwen/qwen3-next-80b-a3b-instruct-2509/94248808-ba97-4e3c-be60-1cb0928db51d. High demand for qwen/qwen3-next-80b-a3b-instruct:free on OpenRouter - limited to 8 requests per minute. Please retry shortly.', 'code': 429, 'metadata': {'headers': {'X-RateLimit-Limit': '8', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1776012660000'}, 'provider_name': None}}, 'user_id': 'user_36ZHxohbiGTyLfq9vP3Sf6ojZMM'}
228
+ [STEP] step=1 action=remove_duplicates reward=0.50 done=false error=null
229
+ 2026-04-12 22:20:38,959 [INFO] [hard_001] ep=3 step=1 reward=0.500
230
+ 2026-04-12 22:20:39,310 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
231
+ 2026-04-12 22:20:39,311 [INFO] Retrying request to /chat/completions in 0.375729 seconds
232
+ 2026-04-12 22:20:40,045 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
233
+ 2026-04-12 22:20:40,046 [INFO] Retrying request to /chat/completions in 0.926493 seconds
234
+ 2026-04-12 22:20:41,322 [INFO] HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
235
+ 2026-04-12 22:20:41,325 [WARNING] LLM call failed: Error code: 429 - {'error': {'message': 'Rate limit exceeded: limit_rpm/qwen/qwen3-next-80b-a3b-instruct-2509/94248808-ba97-4e3c-be60-1cb0928db51d. High demand for qwen/qwen3-next-80b-a3b-instruct:free on OpenRouter - limited to 8 requests per minute. Please retry shortly.', 'code': 429, 'metadata': {'headers': {'X-RateLimit-Limit': '8', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1776012660000'}, 'provider_name': None}}, 'user_id': 'user_36ZHxohbiGTyLfq9vP3Sf6ojZMM'}
236
+ [STEP] step=2 action=submit reward=1.00 done=true error=null
237
+ 2026-04-12 22:20:41,690 [INFO] [hard_001] ep=3 step=2 reward=1.000
238
+ [END] success=true steps=6 score=0.750 rewards=0.50,1.00,0.50,1.00,0.50,1.00
239
+ 2026-04-12 22:20:41,690 [INFO]
240
+ Task score: 0.7500 Β± 0.0000
241
+
242
+ =======================================================
243
+ INFERENCE RESULTS
244
+ =======================================================
245
+ Model : qwen/qwen3-next-80b-a3b-instruct:free
246
+ Seed : 42 | 3 episodes x 8 steps
247
+ Elapsed : 51.6s
248
+
249
+ easy_001 0.7500 +- 0.0000 |############### |
250
+ medium_001 0.5000 +- 0.0000 |########## |
251
+ hard_001 0.7500 +- 0.0000 |############### |
252
+
253
+ OVERALL 0.6667
254
+ =======================================================
255
+ ```
256
+
257
+ ---
258
+
259
  ---
260
 
261
  ## 🎯 Tasks
 
385
 
386
  | | |
387
  |---|---|
388
+ | πŸ“¦ GitHub | https://github.com/SairajMN/AutoClean-AI |
389
  | πŸ“– Interactive API Docs | http://localhost:7860/redoc |
390
  | πŸ”§ OpenEnv Framework | https://github.com/meta-pytorch/OpenEnv |
391
 
inference.py CHANGED
@@ -1,533 +1,625 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- inference.py β€” HallucinationGuard-Env Inference Script
5
- =======================================================
6
- Mandatory submission script for the Meta PyTorch OpenEnv Hackathon 2026.
7
-
8
- Environment variables (set before running):
9
- API_BASE_URL The API endpoint for the LLM (e.g. https://router.huggingface.co/v1)
10
- MODEL_NAME The model identifier (e.g. Qwen/Qwen2.5-72B-Instruct)
11
- HF_TOKEN Your HuggingFace API key
12
-
13
- Usage:
14
- export API_BASE_URL="https://router.huggingface.co/v1"
15
- export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
16
- export HF_TOKEN="hf_..."
17
- python inference.py
18
-
19
- # Dry-run without API key (heuristic agent):
20
- python inference.py --heuristic
21
-
22
- # Run against local dev server:
23
- python inference.py --env-url http://localhost:7860
24
-
25
- Expected baseline scores (heuristic agent, seed=42, 3 episodes x 5 steps):
26
- task_1_factual_grounding : ~0.29
27
- task_2_multi_hop_synthesis : ~0.25
28
- task_3_adversarial_resistance : ~0.22
29
- overall : ~0.25
30
- """
31
-
32
- from __future__ import annotations
33
-
34
- import os
35
- # Fix Unicode encoding for Windows console
36
- os.environ['PYTHONIOENCODING'] = 'utf-8'
37
-
38
- import sys
39
- import json
40
- import time
41
- import argparse
42
- import logging
43
- from typing import Dict, Any, List, Optional, Callable
44
-
45
- import requests
46
-
47
- logging.basicConfig(
48
- level=logging.INFO,
49
- format="%(asctime)s [%(levelname)s] %(message)s",
50
- )
51
- logger = logging.getLogger(__name__)
52
-
53
-
54
- # ── Structured stdout logging for hackathon evaluation ──────────────────────────
55
- # Required format:
56
- # [START] task=<task_name> env=<benchmark> model=<model_name>
57
- # [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
58
- # [END] success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
59
-
60
# Benchmark identifier reported in every [START] structured log line.
BENCHMARK = "hallucination-guard-env"
61
-
62
-
63
def log_start(task: str, env: str, model: str) -> None:
    """Print the [START] marker line consumed by the hackathon evaluator."""
    line = f"[START] task={task} env={env} model={model}"
    print(line, flush=True)
66
-
67
-
68
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str] = None) -> None:
    """Print a [STEP] line in the evaluator's required format.

    The action string is capped at 200 characters, newlines are flattened to
    spaces, and non-ASCII characters are replaced with '?' so the line always
    prints cleanly on any console encoding.
    """
    # Slicing past the end of a short string is a no-op, so the cap and the
    # newline flattening can be applied unconditionally.
    shown_action = action[:200].replace("\n", " ")
    shown_action = shown_action.encode("ascii", "replace").decode("ascii")
    print(
        f"[STEP] step={step} action={shown_action} "
        f"reward={reward:.2f} done={str(done).lower()} error={error if error else 'null'}",
        flush=True,
    )
77
-
78
-
79
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the [END] summary line consumed by the hackathon evaluator.

    Rewards are rendered as a comma-separated list with two decimals each;
    the overall score uses three decimals.
    """
    reward_list = ",".join("%.2f" % r for r in rewards)
    print(
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.3f} rewards={reward_list}",
        flush=True,
    )
83
-
84
# -- Mandatory environment variables (read once at import time) ----------------
# NOTE(review): module-level constants; changing the env vars after import has
# no effect on a running process.
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
HF_TOKEN = os.getenv("HF_TOKEN", "")

# -- Defaults ------------------------------------------------------------------
# Environment server URL; overridable via HALLUGUARD_ENV_URL.
DEFAULT_ENV_URL = os.environ.get(
    "HALLUGUARD_ENV_URL",
    "https://samsankar-hallucination-guard-env.hf.space",
)
DEFAULT_EPISODES = 3  # episodes per task
DEFAULT_STEPS = 5     # max steps per episode
SEED = 42             # base RNG seed; episode i resets with SEED + i

# Tasks evaluated in order; the second element is the difficulty sent to /reset.
TASK_ORDER = [
    ("task_1_factual_grounding", "beginner"),
    ("task_2_multi_hop_synthesis", "intermediate"),
    ("task_3_adversarial_resistance", "advanced"),
]

# System prompt sent with every LLM call: grounded-QA rules the model must obey.
SYSTEM_PROMPT = """You are a precise, grounded question-answering assistant.

RULES (follow strictly):
1. Answer ONLY using information present in the CONTEXT provided.
2. If the answer is not in the context, say exactly: "I cannot answer from the provided context."
3. Keep answers concise β€” 1-3 sentences maximum.
4. Never fabricate facts, names, dates, or numbers not in the context.
5. If uncertain, express that uncertainty explicitly in your answer.
"""

# Per-question user prompt; filled via .format(context=..., question=...), hence
# the doubled braces around the literal JSON example at the bottom.
ANSWER_PROMPT_TEMPLATE = """CONTEXT:
{context}

QUESTION:
{question}

Instructions:
- Answer using ONLY the context above.
- Provide a source_quote: a short verbatim phrase from the context that supports your answer.
- Rate your confidence from 0.0 (unsure) to 1.0 (certain).

Respond in JSON with these exact keys:
{{
"answer": "<your answer>",
"source_quote": "<verbatim phrase from context>",
"confidence": <float 0.0-1.0>
}}"""
131
-
132
-
133
- # ── Environment client ────────────────────────────────────────────────────────
134
-
135
class EnvClient:
    """Thin HTTP wrapper around the HallucinationGuard REST API.

    A single ``requests.Session`` is reused for connection pooling, and the
    ``session_id`` returned by ``/reset`` is remembered so that subsequent
    ``/step`` calls address the same server-side episode.
    """

    def __init__(self, base_url: str, timeout: int = 300):
        self.base = base_url.rstrip("/")   # normalized base URL, no trailing slash
        self.timeout = timeout             # per-request timeout in seconds
        self.session = requests.Session()
        self._session_id: Optional[str] = None  # set by the first /reset

    def _get(self, path: str) -> Dict[str, Any]:
        """GET ``base + path``; raise on HTTP error; return the decoded JSON body."""
        r = self.session.get(f"{self.base}{path}", timeout=self.timeout)
        r.raise_for_status()
        return r.json()

    def _post(self, path: str, body: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """POST JSON ``body`` to ``base + path``; raise on HTTP error; return decoded JSON.

        The default body was previously a shared mutable ``{}`` literal; a
        ``None`` sentinel with a per-call empty dict avoids that pitfall while
        remaining backward compatible.
        """
        r = self.session.post(
            f"{self.base}{path}",
            json={} if body is None else body,
            timeout=self.timeout,
        )
        r.raise_for_status()
        return r.json()

    def health(self) -> Dict[str, Any]:
        """Return the environment's /health payload."""
        return self._get("/health")

    def list_tasks(self) -> Dict[str, Any]:
        """Return the environment's /tasks payload."""
        return self._get("/tasks")

    def reset(self, difficulty: str, seed: int) -> Dict[str, Any]:
        """Start a new episode and remember its session_id for later /step calls."""
        result = self._post("/reset", {"difficulty": difficulty, "seed": seed})
        self._session_id = result.get("session_id")
        return result

    def step(self, answer: str, confidence: float, source_quote: str) -> Dict[str, Any]:
        """Submit one answer to /step, tagged with the current session_id if known."""
        body: Dict[str, Any] = {
            "answer": answer,
            "confidence": confidence,
            "source_quote": source_quote,
        }
        if self._session_id:
            body["session_id"] = self._session_id
        return self._post("/step", body)

    def grade(self, task_id: str,
              step_rewards: List[float],
              step_infos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Ask the /grader endpoint to score a finished episode."""
        return self._post("/grader", {
            "task_id": task_id,
            "step_rewards": step_rewards,
            "step_infos": step_infos,
        })
183
-
184
-
185
- # ── Agents ────────────────────────────────────────────────────────────────────
186
-
187
def heuristic_agent(question: str, context: str) -> Dict[str, Any]:
    """Deterministic no-LLM baseline agent.

    Splits the context on periods and answers with the first fragment longer
    than ten characters; falls back to a prefix of the raw context when no
    such fragment exists. Confidence is fixed at 0.6 and the source quote is
    the first 80 characters of the context. The question itself is ignored.
    """
    flat = context.replace("\n", " ")
    candidates = [part.strip() for part in flat.split(".") if len(part.strip()) > 10]
    if candidates:
        answer = candidates[0]
    else:
        answer = context[:120]
    quote = context[:80] if context else ""
    return {"answer": answer, "confidence": 0.6, "source_quote": quote}
197
-
198
-
199
def openai_agent(model: str, base_url: str, api_key: str) -> Callable:
    """
    Build a question-answering agent backed by an OpenAI-compatible endpoint.

    Args:
        model: Model identifier passed to the chat-completions API.
        base_url: OpenAI-compatible API base URL.
        api_key: Bearer token; if empty, the process exits with guidance.

    Returns:
        A callable ``(question, context) -> {"answer", "confidence", "source_quote"}``.
        On unrecoverable API errors the callable returns an empty answer with
        confidence 0.0 instead of raising, so an episode can continue.
    """
    try:
        from openai import OpenAI
    except ImportError:
        logger.error("openai package not installed. Run: pip install openai")
        sys.exit(1)

    if not api_key:
        logger.error(
            "HF_TOKEN not set. Export it or use --heuristic for the "
            "no-API baseline.\n"
            "  export HF_TOKEN=hf_..."
        )
        sys.exit(1)

    client = OpenAI(base_url=base_url, api_key=api_key)

    def _normalize(parsed: Dict[str, Any]) -> Dict[str, Any]:
        # Single place that coerces a parsed JSON object into the action dict
        # (previously duplicated in both the primary and fallback paths).
        return {
            "answer": str(parsed.get("answer", "")),
            "confidence": float(parsed.get("confidence", 0.5)),
            "source_quote": str(parsed.get("source_quote", "")),
        }

    def _call(question: str, context: str) -> Dict[str, Any]:
        prompt = ANSWER_PROMPT_TEMPLATE.format(
            context=context[:3000],  # cap context to keep the request small
            question=question,
        )
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ]

        # First attempt: ask the server to enforce a JSON object response.
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0.0,
                max_tokens=512,  # large enough for a complete JSON object
                response_format={"type": "json_object"},
            )
            raw = resp.choices[0].message.content or "{}"
            try:
                return _normalize(json.loads(raw))
            except json.JSONDecodeError:
                # Model returned non-JSON despite response_format: degrade to
                # the raw text with low confidence.
                return {"answer": raw[:200], "confidence": 0.4, "source_quote": ""}
        except Exception as e:
            error_msg = str(e)
            if "json_validate_failed" not in error_msg and "response_format" not in error_msg.lower():
                # Not a response_format problem: give up on this step.
                logger.warning(f"LLM call failed: {e}")
                return {"answer": "", "confidence": 0.0, "source_quote": ""}
            logger.warning(f"JSON format failed, trying without response_format: {e}")

        # Fallback: retry without response_format for models that reject it,
        # then best-effort extract a JSON object from the free-form reply.
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0.0,
                max_tokens=512,
            )
            raw = resp.choices[0].message.content or "{}"
            import re
            json_match = re.search(r'\{[^{}]*"answer"[^{}]*\}', raw, re.DOTALL)
            if json_match:
                try:
                    return _normalize(json.loads(json_match.group(0)))
                except (ValueError, TypeError):
                    # Was a bare ``except:`` -- narrowed so SystemExit and
                    # KeyboardInterrupt are no longer swallowed.
                    pass
            # No valid JSON found: fall back to the raw text.
            return {"answer": raw[:200], "confidence": 0.4, "source_quote": ""}
        except Exception as e2:
            logger.warning(f"Fallback LLM call also failed: {e2}")
            return {"answer": "", "confidence": 0.0, "source_quote": ""}

    return _call
287
-
288
-
289
- # ── Episode runner ────────────────────────────────────────────────────────────
290
-
291
- def run_episode(
292
- env: EnvClient,
293
- agent_fn: Callable,
294
- task_id: str,
295
- difficulty: str,
296
- steps: int,
297
- seed: int,
298
- episode_num: int,
299
- model_label: str,
300
- ) -> Dict[str, Any]:
301
- """Run one episode and return rewards + infos for the grader."""
302
- # Emit START log at beginning of each task
303
- if episode_num == 0:
304
- log_start(task=task_id, env=BENCHMARK, model=model_label)
305
-
306
- obs = env.reset(difficulty=difficulty, seed=seed + episode_num)
307
- step_rewards: List[float] = []
308
- step_infos: List[Dict[str, Any]] = []
309
-
310
- for step_n in range(steps):
311
- if obs.get("done", False):
312
- break
313
-
314
- question = obs.get("question", "")
315
- context = obs.get("context", "")
316
-
317
- action = agent_fn(question, context)
318
-
319
- obs = env.step(
320
- answer=action["answer"],
321
- confidence=action["confidence"],
322
- source_quote=action["source_quote"],
323
- )
324
-
325
- reward = float(obs.get("reward") or 0.0)
326
- done = bool(obs.get("done", False))
327
- step_rewards.append(reward)
328
- # Extract metrics from observation metadata (returned by the environment)
329
- obs_metadata = obs.get("metadata", {})
330
- if isinstance(obs_metadata, dict):
331
- obs_correctness = obs_metadata.get("correctness", 0.0)
332
- obs_calibration = obs_metadata.get("calibration", 0.0)
333
- obs_hall_score = obs_metadata.get("hallucination_score", 0.0)
334
- else:
335
- obs_correctness = 0.0
336
- obs_calibration = 0.0
337
- obs_hall_score = 0.0
338
- # Extract ML component scores from reward_breakdown if available
339
- rb = obs_metadata.get("reward_breakdown", {}) if isinstance(obs_metadata, dict) else {}
340
- step_infos.append({
341
- "correctness": obs_correctness,
342
- "grounding": obs.get("grounding_score", 0.0),
343
- "calibration": obs_calibration if obs_calibration else action["confidence"],
344
- "hallucination_score": obs_hall_score if obs_hall_score else (1.0 if obs.get("is_hallucination") else 0.0),
345
- "is_hallucination": bool(obs.get("is_hallucination", False)),
346
- "semantic_consistency": rb.get("semantic_consistency", 0.0),
347
- "rouge_l": rb.get("rouge_l", 0.0),
348
- "bert_score": rb.get("bert_score", 0.0),
349
- "align_score": rb.get("align_score", 0.0),
350
- })
351
-
352
- # Format action for logging (truncated answer)
353
- action_str = f'answer="{action["answer"][:100]}" confidence={action["confidence"]:.2f}'
354
-
355
- # Emit STEP log
356
- log_step(
357
- step=step_n + 1,
358
- action=action_str,
359
- reward=reward,
360
- done=done,
361
- error=None,
362
- )
363
-
364
- status = "HALLUCINATION" if obs.get("is_hallucination") else "OK"
365
- logger.info(
366
- f" [{task_id[:25]}] ep={episode_num+1} step={step_n+1} "
367
- f"reward={reward:.3f} [{status}]"
368
- )
369
-
370
- grade = env.grade(task_id, step_rewards, step_infos)
371
- episode_score = grade.get("score", 0.0)
372
-
373
- return {
374
- "episode": episode_num + 1,
375
- "score": episode_score,
376
- "rewards": step_rewards,
377
- "grade": grade,
378
- }
379
-
380
-
381
- # ── Main ──────────────────────────────────────────────────────────────────────
382
-
383
- def main():
384
- parser = argparse.ArgumentParser(
385
- description="HallucinationGuard-Env inference script",
386
- formatter_class=argparse.RawDescriptionHelpFormatter,
387
- )
388
- parser.add_argument("--env-url", default=DEFAULT_ENV_URL, help="Environment URL")
389
- parser.add_argument("--episodes", type=int, default=DEFAULT_EPISODES)
390
- parser.add_argument("--steps", type=int, default=DEFAULT_STEPS)
391
- parser.add_argument("--seed", type=int, default=SEED)
392
- parser.add_argument("--heuristic", action="store_true",
393
- help="Use heuristic agent (no API key needed)")
394
- parser.add_argument("--output", default=None,
395
- help="Write JSON results to this file")
396
- args = parser.parse_args()
397
-
398
- # ── Connect to environment ────────────────────────────────────────────────
399
- env = EnvClient(args.env_url)
400
-
401
- logger.info(f"Connecting to environment: {args.env_url}")
402
- try:
403
- h = env.health()
404
- logger.info(f" Environment: {h.get('service')} v{h.get('version')} β€” healthy")
405
- except Exception as e:
406
- logger.error(f"Cannot reach environment: {e}")
407
- sys.exit(1)
408
-
409
- # Verify /tasks endpoint
410
- try:
411
- tasks_info = env.list_tasks()
412
- task_ids = [t["task_id"] for t in tasks_info.get("tasks", [])]
413
- logger.info(f" Tasks: {task_ids}")
414
- except Exception as e:
415
- logger.error(f"/tasks endpoint failed: {e}")
416
- sys.exit(1)
417
-
418
- # ── Select agent ─────────────────────────────────────────────────────────
419
- if args.heuristic or not HF_TOKEN:
420
- logger.info("Using heuristic baseline agent (no LLM).")
421
- agent_fn = heuristic_agent
422
- model_label = "heuristic_baseline"
423
- else:
424
- logger.info(f"Using LLM agent: {MODEL_NAME} via {API_BASE_URL}")
425
- agent_fn = openai_agent(MODEL_NAME, API_BASE_URL, HF_TOKEN)
426
- model_label = MODEL_NAME
427
-
428
- # ── Run all 3 tasks ───────────────────────────────────────────────────────
429
- task_results: List[Dict[str, Any]] = []
430
- all_scores: List[float] = []
431
- all_rewards: List[float] = []
432
- total_steps = 0
433
- start_time = time.time()
434
-
435
- for task_id, difficulty in TASK_ORDER:
436
- logger.info(f"\n{'='*55}")
437
- logger.info(f"TASK: {task_id} (difficulty={difficulty})")
438
- logger.info(f"{'='*55}")
439
-
440
- episode_scores: List[float] = []
441
- task_rewards: List[float] = []
442
-
443
- for ep in range(args.episodes):
444
- ep_result = run_episode(
445
- env=env,
446
- agent_fn=agent_fn,
447
- task_id=task_id,
448
- difficulty=difficulty,
449
- steps=args.steps,
450
- seed=args.seed,
451
- episode_num=ep,
452
- model_label=model_label,
453
- )
454
- episode_scores.append(ep_result["score"])
455
- all_scores.append(ep_result["score"])
456
- all_rewards.extend(ep_result["rewards"])
457
- task_rewards.extend(ep_result["rewards"])
458
- total_steps += len(ep_result["rewards"])
459
-
460
- task_avg = sum(episode_scores) / max(len(episode_scores), 1)
461
- task_std = (
462
- (sum((s - task_avg) ** 2 for s in episode_scores) / max(len(episode_scores), 1)) ** 0.5
463
- if len(episode_scores) > 1 else 0.0
464
- )
465
-
466
- # Emit [END] log for this task
467
- success = task_avg >= 0.5 # Consider success if score >= 0.5
468
- log_end(
469
- success=success,
470
- steps=len(task_rewards),
471
- score=task_avg,
472
- rewards=task_rewards,
473
- )
474
-
475
- task_results.append({
476
- "task_id": task_id,
477
- "difficulty": difficulty,
478
- "episodes": args.episodes,
479
- "episode_scores": [round(s, 4) for s in episode_scores],
480
- "avg_score": round(task_avg, 4),
481
- "std_score": round(task_std, 4),
482
- })
483
- logger.info(f"\n Task score: {task_avg:.4f} Β± {task_std:.4f}")
484
-
485
- elapsed = time.time() - start_time
486
- overall_score = sum(all_scores) / max(len(all_scores), 1)
487
- avg_reward = sum(all_rewards) / max(len(all_rewards), 1)
488
-
489
- summary = {
490
- "model": model_label,
491
- "api_base_url": API_BASE_URL,
492
- "env_url": args.env_url,
493
- "seed": args.seed,
494
- "episodes_per_task": args.episodes,
495
- "steps_per_episode": args.steps,
496
- "total_steps": total_steps,
497
- "elapsed_seconds": round(elapsed, 1),
498
- "tasks": task_results,
499
- "overall": {
500
- "score": round(overall_score, 4),
501
- "avg_reward": round(avg_reward, 4),
502
- },
503
- }
504
-
505
- # ── Print results ─────────────────────────────────────────────────────────
506
- print("\n" + "=" * 55)
507
- print("INFERENCE RESULTS")
508
- print("=" * 55)
509
- print(f"Model : {model_label}")
510
- print(f"Seed : {args.seed} | {args.episodes} episodes x {args.steps} steps")
511
- print(f"Elapsed : {elapsed:.1f}s")
512
- print()
513
- for t in task_results:
514
- # Use ASCII characters for progress bar
515
- bar = "#" * round(t["avg_score"] * 20)
516
- print(
517
- f" {t['task_id']:<42} "
518
- f"{t['avg_score']:.4f} +- {t['std_score']:.4f} |{bar:<20}|"
519
- )
520
- print()
521
- print(f" {'OVERALL':<42} {overall_score:.4f}")
522
- print("=" * 55)
523
-
524
- if args.output:
525
- with open(args.output, "w") as f:
526
- json.dump(summary, f, indent=2)
527
- logger.info(f"Results written to {args.output}")
528
-
529
- return summary
530
-
531
-
532
- if __name__ == "__main__":
533
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ inference.py β€” AutoClean-AI Inference Script
5
+ ============================================
6
+ Official submission script for OpenEnv Hackathon.
7
+
8
+ Environment variables (set before running):
9
+ API_BASE_URL The API endpoint for the LLM (e.g. https://router.huggingface.co/v1)
10
+ MODEL_NAME The model identifier (e.g. Qwen/Qwen2.5-72B-Instruct)
11
+ HF_TOKEN Your HuggingFace API key
12
+
13
+ Usage:
14
+ export API_BASE_URL="https://router.huggingface.co/v1"
15
+ export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
16
+ export HF_TOKEN="hf_..."
17
+ python inference.py
18
+
19
+ # Dry-run without API key (heuristic agent):
20
+ python inference.py --heuristic
21
+
22
+ # Run against local dev server:
23
+ python inference.py --env-url http://localhost:7860
24
+
25
+ Expected baseline scores (heuristic agent, seed=42, 3 episodes x 8 steps):
26
+ easy_001 : ~0.62
27
+ medium_001 : ~0.54
28
+ hard_001 : ~0.41
29
+ overall : ~0.52
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import os
35
+ # Fix Unicode encoding for Windows console
36
+ os.environ['PYTHONIOENCODING'] = 'utf-8'
37
+
38
+ import sys
39
+ import json
40
+ import time
41
+ import argparse
42
+ import logging
43
+ from typing import Dict, Any, List, Optional, Callable
44
+
45
+ import requests
46
+
47
+ logging.basicConfig(
48
+ level=logging.INFO,
49
+ format="%(asctime)s [%(levelname)s] %(message)s",
50
+ )
51
+ logger = logging.getLogger(__name__)
52
+
53
+
54
# ── Structured stdout logging for hackathon evaluation ──────────────────────────
# The evaluation harness greps stdout for these exact marker lines.
# Required format:
#   [START] task=<task_name> env=<benchmark> model=<model_name>
#   [STEP]  step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
#   [END]   success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>

# Benchmark identifier reported in every [START] line.
BENCHMARK = "openenv-datacleaner"
61
+
62
+
63
def log_start(task: str, env: str, model: str) -> None:
    """Print the required ``[START]`` marker line for the evaluation harness."""
    line = f"[START] task={task} env={env} model={model}"
    # flush so the harness sees markers immediately, even through pipes
    print(line, flush=True)
66
+
67
+
68
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str] = None) -> None:
    """Print the required ``[STEP]`` marker line for the evaluation harness."""
    # Normalise the action string: cap at 200 chars, flatten newlines, and
    # force ASCII so Windows consoles cannot choke on the marker output.
    shown = action if len(action) <= 200 else action[:200]
    shown = shown.replace("\n", " ").encode('ascii', 'replace').decode('ascii')
    err_text = error if error else "null"
    print(
        "[STEP] step={} action={} reward={:.2f} done={} error={}".format(
            step, shown, reward, str(done).lower(), err_text
        ),
        flush=True,
    )
77
+
78
+
79
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the required ``[END]`` marker line for the evaluation harness."""
    joined = ",".join("{:.2f}".format(r) for r in rewards)
    flag = str(success).lower()
    print(f"[END] success={flag} steps={steps} score={score:.3f} rewards={joined}", flush=True)
83
+
84
# ── Mandatory environment variables ──────────────────────────────────────────
# LLM endpoint configuration. HF_TOKEN falls back to OPENROUTER_API_KEY first
# so either credential variable works unchanged.
API_BASE_URL = os.getenv("API_BASE_URL", "https://openrouter.ai/api/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "qwen/qwen3-next-80b-a3b-instruct:free")
HF_TOKEN = os.getenv("OPENROUTER_API_KEY") or os.getenv("HF_TOKEN", "")

# ── Defaults ──────────────────────────────────────────────────────────────────
# Target environment URL; overridable via AUTOCLEAN_ENV_URL or the --env-url flag.
DEFAULT_ENV_URL = os.environ.get(
    "AUTOCLEAN_ENV_URL",
    "https://sairaj2-openenv-datacleaner.hf.space",
)
DEFAULT_EPISODES = 3   # episodes run per task
DEFAULT_STEPS = 8      # max agent steps per episode
SEED = 42              # base RNG seed; episode i resets with SEED + i

# (task_id, difficulty) pairs executed in this fixed order.
TASK_ORDER = [
    ("easy_001", "beginner"),
    ("medium_001", "intermediate"),
    ("hard_001", "advanced"),
]

# System prompt for the LLM agent; enumerates the exact action vocabulary the
# environment's /step endpoint accepts.
SYSTEM_PROMPT = """You are an expert data cleaning agent for tabular datasets.

RULES (follow strictly):
1. You are working with a dataset and need to perform data cleaning operations
2. Choose exactly ONE action per step from the allowed actions list
3. Explain your reasoning clearly
4. Always return valid JSON format

ALLOWED ACTIONS:
- drop_nulls
- fill_nulls
- remove_duplicates
- filter_rows
- drop_columns
- convert_types
- validate_email
- outlier_removal
- normalize
- submit
"""

# Per-step user prompt. Filled with the current dataset summary, the task
# description from /tasks, and the actions taken so far; literal braces in the
# JSON example are doubled so str.format leaves them intact.
ACTION_PROMPT_TEMPLATE = """DATASET INFORMATION:
{dataset_info}

TASK:
{task_description}

PREVIOUS ACTIONS:
{action_history}

Instructions:
- Select the next best action to clean this dataset
- Provide reasoning for your choice
- Return JSON with these exact keys:
{{
  "action_type": "<action name>",
  "params": {{<parameters for action>}},
  "reasoning": "<short explanation>"
}}"""
143
+
144
+
145
+ # ── Environment client ────────────────────────────────────────────────────────
146
+
147
class EnvClient:
    """Thin HTTP wrapper around the AutoClean REST API.

    Maintains a persistent ``requests.Session`` and remembers the
    ``session_id`` handed back by ``/reset`` so later ``/step`` and
    ``/submit`` calls address the same server-side episode.
    """

    def __init__(self, base_url: str, timeout: int = 300):
        self.base = base_url.rstrip("/")          # normalised base URL (no trailing slash)
        self.timeout = timeout                    # per-request timeout, seconds
        self.session = requests.Session()         # reused connection pool
        self._session_id: Optional[str] = None    # episode id from the last /reset

    def _request_with_retry(
        self,
        method: str,
        path: str,
        body: Optional[Dict[str, Any]] = None,
        retries: int = 3,
        backoff: float = 2.0,
    ) -> Dict[str, Any]:
        """Issue one HTTP request, retrying transient transport failures.

        Only connection-level errors (chunked encoding, connection reset,
        read timeout) are retried, with linear backoff ``backoff * attempt``.
        HTTP error statuses raise immediately via ``raise_for_status``.

        Raises:
            requests.RequestException: when every attempt fails.
        """
        url = f"{self.base}{path}"
        attempts = max(retries, 1)  # fix: retries <= 0 used to fall through and return None
        last_exc: Optional[Exception] = None
        for attempt in range(attempts):
            try:
                if method == "GET":
                    r = self.session.get(url, timeout=self.timeout)
                else:
                    r = self.session.post(url, json=body, timeout=self.timeout)
                r.raise_for_status()
                return r.json()
            except (requests.exceptions.ChunkedEncodingError,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.ReadTimeout) as e:
                last_exc = e
                if attempt < attempts - 1:
                    wait = backoff * (attempt + 1)
                    logger.warning(f"Request to {path} failed ({type(e).__name__}), retrying in {wait:.0f}s... ({attempt+1}/{retries})")
                    time.sleep(wait)
        # All attempts exhausted — surface the last transport error instead of
        # silently returning None.
        if last_exc is None:
            raise RuntimeError(f"request to {path} made no attempts")
        raise last_exc

    def _get(self, path: str) -> Dict[str, Any]:
        """GET *path* and return the decoded JSON body."""
        return self._request_with_retry("GET", path)

    def _post(self, path: str, body: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """POST *body* as JSON to *path* and return the decoded JSON body.

        Fix: the original declared a mutable default argument (``body={}``);
        use the None-sentinel idiom instead.
        """
        return self._request_with_retry("POST", path, {} if body is None else body)

    def health(self) -> Dict[str, Any]:
        """Return the environment's /health payload."""
        return self._get("/health")

    def list_tasks(self) -> Dict[str, Any]:
        """Return the /tasks payload describing the available tasks."""
        return self._get("/tasks")

    def reset(self, difficulty: str, seed: int) -> Dict[str, Any]:
        """Start a new episode; caches the returned session_id for later calls."""
        result = self._post("/reset", {"difficulty": difficulty, "seed": seed})
        self._session_id = result.get("session_id")
        return result

    def step(self, action_type: str, params: Dict[str, Any]) -> Dict[str, Any]:
        """Apply one cleaning action to the current episode."""
        body: Dict[str, Any] = {
            "action_type": action_type,
            "params": params,
        }
        if self._session_id:
            body["session_id"] = self._session_id
        return self._post("/step", body)

    def submit(self) -> Dict[str, Any]:
        """Finalise the current episode and collect the terminal observation."""
        body: Dict[str, Any] = {}
        if self._session_id:
            body["session_id"] = self._session_id
        return self._post("/submit", body)

    def grade(self, task_id: str,
              step_rewards: List[float],
              step_infos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Ask the environment's /grader endpoint to score an episode."""
        return self._post("/grader", {
            "task_id": task_id,
            "step_rewards": step_rewards,
            "step_infos": step_infos,
        })
216
+
217
+
218
+ # ── Agents ────────────────────────────────────────────────────────────────────
219
+
220
def heuristic_agent(task_id: str, dataset_info: Dict[str, Any], task_description: str, action_history: List[str]) -> Dict[str, Any]:
    """
    Deterministic rule-based baseline agent — no LLM calls.

    Walks a fixed cleaning recipe per task: each rule fires at most once
    (tracked through *action_history*) and the agent submits once every
    applicable rule has run.
    """
    columns = dataset_info.get("columns", [])
    null_counts = dataset_info.get("null_counts", {})
    known_numeric = {"age", "salary", "score", "id", "JoiningYear", "ExperienceInCurrentDomain"}
    numeric_columns = [c for c in columns if c in known_numeric]
    taken = set(action_history)
    total_nulls = sum(int(v) for v in null_counts.values())

    def fill_params(preferred: str) -> Dict[str, Any]:
        # Median-fill the preferred column when present, else the first known
        # numeric column; fall back to a global mode fill.
        col = preferred if preferred in columns else (numeric_columns[0] if numeric_columns else None)
        return {"column": col, "strategy": "median"} if col else {"strategy": "mode"}

    if task_id == "easy_001":
        if total_nulls > 0 and "drop_nulls" not in taken:
            return {"action_type": "drop_nulls", "params": {}}
        if "remove_duplicates" not in taken:
            return {"action_type": "remove_duplicates", "params": {}}
    elif task_id == "medium_001":
        if total_nulls > 0 and "fill_nulls" not in taken:
            return {"action_type": "fill_nulls", "params": fill_params("age")}
        if "email" in columns and "validate_email" not in taken:
            return {"action_type": "validate_email", "params": {"column": "email", "drop_invalid": True}}
        if "salary" in columns and "outlier_removal" not in taken:
            return {"action_type": "outlier_removal", "params": {"column": "salary", "multiplier": 1.5}}
    elif task_id == "hard_001":
        if total_nulls > 0 and "fill_nulls" not in taken:
            return {"action_type": "fill_nulls", "params": fill_params("salary")}
        if "remove_duplicates" not in taken:
            return {"action_type": "remove_duplicates", "params": {}}
        if "email" in columns and "validate_email" not in taken:
            return {"action_type": "validate_email", "params": {"column": "email", "drop_invalid": True}}
        if "age" in columns and "convert_types" not in taken:
            return {"action_type": "convert_types", "params": {"column": "age", "dtype": "int"}}
        if "salary" in columns and "outlier_removal" not in taken:
            return {"action_type": "outlier_removal", "params": {"column": "salary", "multiplier": 1.5}}
        if "score" in columns and "normalize" not in taken:
            return {"action_type": "normalize", "params": {"column": "score", "method": "minmax"}}

    # Unknown task, or all applicable rules already applied.
    return {"action_type": "submit", "params": {}}
269
+
270
+
271
def openai_agent(model: str, base_url: str, api_key: str) -> Callable:
    """
    Returns a callable agent backed by any OpenAI-compatible chat endpoint.

    The returned callable has the same signature as ``heuristic_agent`` and
    falls back to it whenever the LLM is unreachable or its output cannot be
    parsed. Improvements over the original implementation:
      * transient rate-limit errors (HTTP 429 — the failure seen with
        free-tier routers) are retried with backoff instead of immediately
        abandoning the LLM for the heuristic on the first hit;
      * ``import re`` is hoisted out of the per-request loop;
      * JSON extraction is factored into one helper instead of being
        duplicated across branches.
    """
    import re  # hoisted: previously re-imported on every request

    try:
        from openai import OpenAI
    except ImportError:
        logger.error("openai package not installed. Run: pip install openai")
        sys.exit(1)

    if not api_key:
        logger.error(
            "HF_TOKEN not set. Export it or use --heuristic for the "
            "no-API baseline.\n"
            " export HF_TOKEN=hf_..."
        )
        sys.exit(1)

    client = OpenAI(base_url=base_url, api_key=api_key)

    def _extract_action(raw: str) -> Optional[Dict[str, Any]]:
        """Pull {"action_type", "params"} out of model text, tolerating
        markdown fences and surrounding prose; return None when unparseable."""
        fence = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', raw, re.DOTALL)
        if fence:
            raw = fence.group(1)
        candidates = [raw]
        loose = re.search(r'\{[^{}]*"action_type"[^{}]*\}', raw, re.DOTALL)
        if loose:
            candidates.append(loose.group(0))
        for cand in candidates:
            try:
                parsed = json.loads(cand)
            except json.JSONDecodeError:
                continue
            action_type = str(parsed.get("action_type", ""))
            if action_type:
                return {"action_type": action_type, "params": parsed.get("params", {})}
        return None

    def _call(task_id: str, dataset_info: Dict[str, Any], task_description: str, action_history: List[str]) -> Dict[str, Any]:
        prompt = ACTION_PROMPT_TEMPLATE.format(
            dataset_info=json.dumps(dataset_info, indent=2),
            task_description=task_description,
            action_history=", ".join(action_history) if action_history else "None",
        )

        # Attempt strict JSON mode first, then free-form text.
        for use_json_format in (True, False):
            for attempt in range(3):  # bounded retries for rate limits
                try:
                    kwargs: Dict[str, Any] = dict(
                        model=model,
                        messages=[
                            {"role": "system", "content": SYSTEM_PROMPT},
                            {"role": "user", "content": prompt},
                        ],
                        temperature=0.1,
                        max_tokens=512,
                    )
                    if use_json_format:
                        kwargs["response_format"] = {"type": "json_object"}

                    resp = client.chat.completions.create(**kwargs)
                    raw = (resp.choices[0].message.content or "").strip()

                    action = _extract_action(raw)
                    if action is not None:
                        return action
                    if use_json_format:
                        logger.warning("JSON parse failed, trying without response_format")
                    break  # response arrived but was unusable — switch mode

                except Exception as e:
                    msg = str(e)
                    # e.g. "HTTP/1.1 429 Too Many Requests" from free-tier routers
                    if "429" in msg or "rate limit" in msg.lower():
                        wait = 5.0 * (attempt + 1)
                        logger.warning(f"Rate limited, retrying in {wait:.0f}s... ({attempt+1}/3)")
                        time.sleep(wait)
                        continue
                    if use_json_format and ("response_format" in msg.lower() or "json_validate_failed" in msg):
                        logger.warning(f"JSON format not supported, trying without: {e}")
                        break  # switch to plain-text mode
                    logger.warning(f"LLM call failed: {e}")
                    return heuristic_agent(task_id, dataset_info, task_description, action_history)

        # LLM never produced a usable action — deterministic fallback.
        return heuristic_agent(task_id, dataset_info, task_description, action_history)

    return _call
371
+
372
+
373
+ # ── Episode runner ────────────────────────────────────────────────────────────
374
+
375
def run_episode(
    env: EnvClient,
    agent_fn: Callable,
    task_id: str,
    difficulty: str,
    steps: int,
    seed: int,
    episode_num: int,
    model_label: str,
    task_info: Dict[str, Any],
) -> Dict[str, Any]:
    """Run one episode against the environment and return its rewards/score.

    Resets the environment (seed varies per episode), lets *agent_fn* pick one
    cleaning action per step, force-submits if the step budget runs out, and
    scores the episode as the mean of its step rewards.

    Args:
        env: connected environment client.
        agent_fn: callable with the ``heuristic_agent`` signature.
        steps: maximum number of agent steps before a forced submit.
        seed: base seed; episode *episode_num* resets with ``seed + episode_num``.
        task_info: the /tasks entry for *task_id* (provides "description").
    """
    # Emit the [START] marker once per task (first episode only).
    if episode_num == 0:
        log_start(task=task_id, env=BENCHMARK, model=model_label)

    obs = env.reset(difficulty=difficulty, seed=seed + episode_num)
    step_rewards: List[float] = []
    step_infos: List[Dict[str, Any]] = []
    action_history: List[str] = []

    dataset_info = obs.get("dataset_info", {})
    # BUG FIX: `done` was previously assigned only inside the loop body, so an
    # episode that finished on reset, broke at the top-of-loop check, or ran
    # with steps == 0 hit the trailing `if not done:` with an UnboundLocalError.
    done = bool(obs.get("done", False))

    for step_n in range(steps):
        if obs.get("done", False):
            done = True
            break

        action = agent_fn(task_id, dataset_info, task_info.get("description", ""), action_history)

        action_type = action.get("action_type", "submit")
        params = action.get("params", {})

        if action_type == "submit":
            obs = env.submit()
        else:
            obs = env.step(action_type, params)

        reward = float(obs.get("reward") or 0.0)
        done = bool(obs.get("done", False))
        step_rewards.append(reward)

        # Record what the environment reported for this step.
        obs_metadata = obs.get("metadata", {})
        step_infos.append({
            "action_type": action_type,
            "reward": reward,
            "done": done,
            "metadata": obs_metadata,
        })

        # Deterministic action rendering for the [STEP] marker line.
        param_text = ",".join(f"{key}={json.dumps(value, sort_keys=True)}" for key, value in sorted(params.items()))
        action_str = f"{action_type}({param_text})" if param_text else action_type

        log_step(
            step=step_n + 1,
            action=action_str,
            reward=reward,
            done=done,
            error=None,
        )

        action_history.append(action_type)
        # Keep the last known dataset summary if the env omits it this step.
        dataset_info = obs.get("dataset_info", dataset_info)

        logger.info(
            f" [{task_id[:25]}] ep={episode_num+1} step={step_n+1} "
            f"reward={reward:.3f}"
        )

        if done:
            break

    # Step budget exhausted without a terminal state — force a submit so the
    # episode is always finalised server-side.
    if not done:
        obs = env.submit()
        reward = float(obs.get("reward") or 0.0)
        step_rewards.append(reward)
        log_step(
            step=len(step_rewards),
            action="submit()",
            reward=reward,
            done=True,
            error=None,
        )

    # Score locally as the mean step reward (no /grader round-trip).
    episode_score = sum(step_rewards) / max(len(step_rewards), 1)

    return {
        "episode": episode_num + 1,
        "score": episode_score,
        "rewards": step_rewards,
    }
469
+
470
+
471
+ # ── Main ──────────────────────────────────────────────────────────────────────
472
+
473
def main():
    """Entry point: run every task in TASK_ORDER and print/return a summary.

    Flow: parse CLI flags -> verify /health and /tasks -> pick the LLM or
    heuristic agent -> run N episodes per task -> emit [END] markers, a
    console report, and (optionally) a JSON results file.
    """
    parser = argparse.ArgumentParser(
        description="AutoClean-AI inference script",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--env-url", default=DEFAULT_ENV_URL, help="Environment URL")
    parser.add_argument("--episodes", type=int, default=DEFAULT_EPISODES)
    parser.add_argument("--steps", type=int, default=DEFAULT_STEPS)
    parser.add_argument("--seed", type=int, default=SEED)
    parser.add_argument("--heuristic", action="store_true",
                        help="Use heuristic agent (no API key needed)")
    parser.add_argument("--output", default=None,
                        help="Write JSON results to this file")
    args = parser.parse_args()

    # ── Connect to environment ────────────────────────────────────────────────
    env = EnvClient(args.env_url)

    logger.info(f"Connecting to environment: {args.env_url}")
    try:
        h = env.health()
        logger.info(f" Environment: {h.get('service', 'AutoClean-AI')} v{h.get('version', '1.0.0')} — healthy")
    except Exception as e:
        # Unreachable environment is fatal — nothing to evaluate against.
        logger.error(f"Cannot reach environment: {e}")
        sys.exit(1)

    # Verify /tasks endpoint; task_map feeds per-task descriptions to run_episode.
    try:
        tasks_info = env.list_tasks()
        task_ids = [t["task_id"] for t in tasks_info.get("tasks", [])]
        task_map = {t["task_id"]: t for t in tasks_info.get("tasks", [])}
        logger.info(f" Available tasks: {task_ids}")
    except Exception as e:
        logger.error(f"/tasks endpoint failed: {e}")
        sys.exit(1)

    # ── Select agent ─────────────────────────────────────────────────────────
    # Fall back to the heuristic automatically when no API token is present.
    if args.heuristic or not HF_TOKEN:
        logger.info("Using heuristic baseline agent (no LLM).")
        agent_fn = heuristic_agent
        model_label = "heuristic_baseline"
    else:
        logger.info(f"Using LLM agent: {MODEL_NAME} via {API_BASE_URL}")
        agent_fn = openai_agent(MODEL_NAME, API_BASE_URL, HF_TOKEN)
        model_label = MODEL_NAME

    # ── Run all 3 tasks ───────────────────────────────────────────────────────
    task_results: List[Dict[str, Any]] = []
    all_scores: List[float] = []    # per-episode scores across all tasks
    all_rewards: List[float] = []   # per-step rewards across all tasks
    total_steps = 0
    start_time = time.time()

    for task_id, difficulty in TASK_ORDER:
        logger.info(f"\n{'='*55}")
        logger.info(f"TASK: {task_id} (difficulty={difficulty})")
        logger.info(f"{'='*55}")

        episode_scores: List[float] = []
        task_rewards: List[float] = []

        for ep in range(args.episodes):
            ep_result = run_episode(
                env=env,
                agent_fn=agent_fn,
                task_id=task_id,
                difficulty=difficulty,
                steps=args.steps,
                seed=args.seed,
                episode_num=ep,
                model_label=model_label,
                task_info=task_map.get(task_id, {}),
            )
            episode_scores.append(ep_result["score"])
            all_scores.append(ep_result["score"])
            all_rewards.extend(ep_result["rewards"])
            task_rewards.extend(ep_result["rewards"])
            total_steps += len(ep_result["rewards"])

        # Mean and population standard deviation of the episode scores.
        task_avg = sum(episode_scores) / max(len(episode_scores), 1)
        task_std = (
            (sum((s - task_avg) ** 2 for s in episode_scores) / max(len(episode_scores), 1)) ** 0.5
            if len(episode_scores) > 1 else 0.0
        )

        # Emit [END] log for this task
        success = task_avg >= 0.5  # Consider success if score >= 0.5
        log_end(
            success=success,
            steps=len(task_rewards),
            score=task_avg,
            rewards=task_rewards,
        )

        task_results.append({
            "task_id": task_id,
            "difficulty": difficulty,
            "episodes": args.episodes,
            "episode_scores": [round(s, 4) for s in episode_scores],
            "avg_score": round(task_avg, 4),
            "std_score": round(task_std, 4),
        })
        logger.info(f"\n Task score: {task_avg:.4f} ± {task_std:.4f}")

    elapsed = time.time() - start_time
    overall_score = sum(all_scores) / max(len(all_scores), 1)
    avg_reward = sum(all_rewards) / max(len(all_rewards), 1)

    # Machine-readable summary (also written to --output when given).
    summary = {
        "model": model_label,
        "api_base_url": API_BASE_URL,
        "env_url": args.env_url,
        "seed": args.seed,
        "episodes_per_task": args.episodes,
        "steps_per_episode": args.steps,
        "total_steps": total_steps,
        "elapsed_seconds": round(elapsed, 1),
        "tasks": task_results,
        "overall": {
            "score": round(overall_score, 4),
            "avg_reward": round(avg_reward, 4),
        },
    }

    # ── Print results ─────────────────────────────────────────────────────────
    print("\n" + "=" * 55)
    print("INFERENCE RESULTS")
    print("=" * 55)
    print(f"Model : {model_label}")
    print(f"Seed : {args.seed} | {args.episodes} episodes x {args.steps} steps")
    print(f"Elapsed : {elapsed:.1f}s")
    print()
    for t in task_results:
        # Use ASCII characters for progress bar
        bar = "#" * round(t["avg_score"] * 20)
        print(
            f" {t['task_id']:<42} "
            f"{t['avg_score']:.4f} +- {t['std_score']:.4f} |{bar:<20}|"
        )
    print()
    print(f" {'OVERALL':<42} {overall_score:.4f}")
    print("=" * 55)

    if args.output:
        with open(args.output, "w") as f:
            json.dump(summary, f, indent=2)
        logger.info(f"Results written to {args.output}")

    return summary


if __name__ == "__main__":
    main()