Revanth-ml committed on
Commit
b91d18e
·
verified ·
1 Parent(s): 4d9c59a

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. inference.py +87 -130
inference.py CHANGED
@@ -2,29 +2,21 @@
2
  """
3
  AgentOps Gym — Baseline inference script.
4
 
5
- Uses the synchronous OpenEnv client pattern (env.sync()) matching the
6
- hackathon sample inference.py. No async/await needed.
7
-
8
- Environment variables:
9
- IMAGE_NAME Docker image name (set by validator)
10
- HF_TOKEN HuggingFace / API key (or OPENAI_API_KEY)
11
- API_BASE_URL LLM endpoint (default: https://router.huggingface.co/v1)
12
- MODEL_NAME Model name (default: Qwen/Qwen2.5-72B-Instruct)
13
- ENV_BASE_URL Server URL (default: http://localhost:8000)
14
-
15
- Usage:
16
- IMAGE_NAME=agentops-gym HF_TOKEN=xxx python inference.py
17
- """
18
 
19
- from __future__ import annotations
 
 
 
 
20
 
 
21
  import json
22
  import os
23
  import re
24
- import sys
25
  from typing import Dict, List, Optional
26
 
27
- # Load .env if present
28
  try:
29
  from dotenv import load_dotenv
30
  load_dotenv()
@@ -33,7 +25,7 @@ except ImportError:
33
 
34
  from openai import OpenAI
35
 
36
- # Ensure package is importable when run from inside the package dir
37
  import pathlib, sys as _sys
38
  _root = pathlib.Path(__file__).resolve().parent
39
  _parent = _root.parent
@@ -45,18 +37,14 @@ from agentops_gym.client import AgentOpsEnv
45
  from agentops_gym.models import ToolCall
46
 
47
  # ---------------------------------------------------------------------------
48
- # Configuration
49
  # ---------------------------------------------------------------------------
50
 
51
- IMAGE_NAME = os.getenv("IMAGE_NAME")
52
- API_KEY = (
53
- os.getenv("HF_TOKEN")
54
- or os.getenv("OPENAI_API_KEY")
55
- or os.getenv("API_KEY")
56
- )
57
- API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
58
- MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
59
- ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:8000")
60
 
61
  BENCHMARK = "agentops-gym"
62
  MAX_STEPS = 10
@@ -101,16 +89,18 @@ def log_start(task: str, env: str, model: str) -> None:
101
 
102
 
103
  def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
104
- err_val = error if error else "null"
 
105
  print(
106
- f"[STEP] step={step} action={str(action).replace(chr(10), ' ')[:200]} "
107
- f"reward={reward:.2f} done={str(done).lower()} error={err_val}",
108
  flush=True,
109
  )
110
 
111
 
112
  def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
113
- score = max(0.001, min(0.999, score)) # must be strictly between 0 and 1
 
114
  rewards_str = ",".join(f"{r:.2f}" for r in rewards)
115
  print(
116
  f"[END] success={str(success).lower()} steps={steps} "
@@ -166,11 +156,10 @@ def extract_tool_call(text: str) -> Optional[Dict]:
166
  return None
167
 
168
 
169
- def get_model_action(client: OpenAI, obs_data: Dict, history: List[str], model_name: str = MODEL_NAME) -> Optional[Dict]:
170
- """Ask the LLM for a tool call. Returns parsed dict or None."""
171
  try:
172
  completion = client.chat.completions.create(
173
- model=model_name,
174
  messages=[
175
  {"role": "system", "content": SYSTEM_PROMPT},
176
  {"role": "user", "content": build_prompt(obs_data, history)},
@@ -185,34 +174,49 @@ def get_model_action(client: OpenAI, obs_data: Dict, history: List[str], model_n
185
  return None
186
 
187
  # ---------------------------------------------------------------------------
188
- # Single task runner — sync pattern matching sample inference.py
189
  # ---------------------------------------------------------------------------
190
 
191
- def run_task(client: OpenAI, task_id: str, model_name: str = MODEL_NAME) -> Dict:
192
- """Run one episode synchronously. Returns result dict."""
193
-
194
- # Build client — use docker image if set, else connect to running server
195
- if IMAGE_NAME:
196
- env_client = AgentOpsEnv.from_docker_image(IMAGE_NAME)
197
- else:
198
- env_client = AgentOpsEnv(base_url=ENV_BASE_URL)
199
-
200
- history: List[str] = []
201
- rewards: List[float] = []
202
  steps_taken = 0
203
- score = 0.0
204
- success = False
205
- last_error: Optional[str] = None
206
 
207
  log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
208
 
209
  try:
210
- # Use .sync() context manager — same pattern as sample inference.py
211
- with env_client.sync() as env:
212
- if IMAGE_NAME:
213
- result = env.reset()
214
- else:
215
- result = env.reset(task_id=task_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
  obs_data = (
218
  result.observation.model_dump()
@@ -220,63 +224,28 @@ def run_task(client: OpenAI, task_id: str, model_name: str = MODEL_NAME) -> Dict
220
  else result.observation.dict()
221
  )
222
 
223
- for step in range(1, MAX_STEPS + 1):
224
- if result.done or obs_data.get("done", False):
225
- break
226
-
227
- tool_call = get_model_action(client, obs_data, history, model_name)
228
- if tool_call is None:
229
- tool_call = {
230
- "tool": "Grep",
231
- "parameters": {"pattern": "def "},
232
- "reasoning": "fallback",
233
- }
234
-
235
- tool = tool_call.get("tool", "Grep")
236
- params = tool_call.get("parameters", {})
237
- reasoning = tool_call.get("reasoning", "")
238
- action_str = f"{tool}({json.dumps(params)})"
239
-
240
- try:
241
- result = env.step(
242
- ToolCall(tool=tool, parameters=params, reasoning=reasoning)
243
- )
244
- last_error = None
245
- except Exception as e:
246
- last_error = str(e)
247
- log_step(step=step, action=action_str, reward=0.0, done=True, error=last_error)
248
- break
249
-
250
- obs_data = (
251
- result.observation.model_dump()
252
- if hasattr(result.observation, "model_dump")
253
- else result.observation.dict()
254
- )
255
-
256
- reward = float(result.reward or 0.0)
257
- done = bool(result.done)
258
 
259
- rewards.append(reward)
260
- steps_taken = step
261
- history.append(f"Step {step}: {action_str} β†’ reward {reward:.2f}")
262
 
263
- log_step(step=step, action=action_str, reward=reward, done=done, error=None)
264
 
265
- if done:
266
- break
267
 
268
- # Pull grader score from last observation metadata
269
  meta = obs_data.get("metadata", {})
270
  score = float(meta.get("grader_score") or 0.0)
271
- if score == 0.0 and rewards:
272
  score = float(meta.get("cumulative_reward") or 0.0)
273
  score = max(0.001, min(0.999, score))
274
  success = score >= SUCCESS_SCORE_THRESHOLD
275
 
276
  except Exception as e:
277
- print(f"[DEBUG] Task {task_id} error: {e}", flush=True)
278
- last_error = str(e)
279
- score = 0.001 # never exactly 0.0
280
 
281
  finally:
282
  log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
@@ -290,44 +259,32 @@ def run_task(client: OpenAI, task_id: str, model_name: str = MODEL_NAME) -> Dict
290
  }
291
 
292
  # ---------------------------------------------------------------------------
293
- # Entry point
294
  # ---------------------------------------------------------------------------
295
 
296
- def main() -> None:
297
- # Validator requires EXACTLY these — square brackets, no fallback
298
- api_key = os.environ["API_KEY"]
299
- api_base_url = os.environ["API_BASE_URL"]
300
- model_name = os.environ.get("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
301
-
302
- print(f"[DEBUG] API_BASE_URL={api_base_url}", flush=True)
303
- print(f"[DEBUG] MODEL_NAME={model_name}", flush=True)
304
 
305
- client = OpenAI(base_url=api_base_url, api_key=api_key)
 
306
 
307
  print("=" * 60, flush=True)
308
  print("AgentOps Gym β€” Baseline Inference", flush=True)
309
- print(f"Model: {model_name} | Image: {IMAGE_NAME or ENV_BASE_URL}", flush=True)
310
  print("=" * 60, flush=True)
311
 
312
  results = []
313
- for task_id in ALL_TASKS:
314
- print("─" * 40, flush=True)
315
- results.append(run_task(client, task_id, model_name))
316
-
317
- total = sum(r["score"] for r in results)
318
- solved = sum(1 for r in results if r["success"])
319
- avg = total / len(results) if results else 0.0
320
-
321
- print("=" * 60, flush=True)
322
- print("BASELINE SUMMARY", flush=True)
323
- print("=" * 60, flush=True)
324
- for r in results:
325
- status = "βœ… PASS" if r["success"] else "❌ FAIL"
326
- print(f" {r['task_id']:>8} score={r['score']:.3f} steps={r['steps']:2d} {status}", flush=True)
327
- print(f"\n Average score: {avg:.3f}", flush=True)
328
- print(f" Solved: {solved} / {len(results)}", flush=True)
329
- print("=" * 60, flush=True)
330
 
 
331
  total = sum(r["score"] for r in results)
332
  solved = sum(1 for r in results if r["success"])
333
  avg = total / len(results) if results else 0.0
@@ -344,4 +301,4 @@ def main() -> None:
344
 
345
 
346
  if __name__ == "__main__":
347
- main()
 
2
  """
3
  AgentOps Gym — Baseline inference script.
4
 
5
+ Follows the exact pattern from the official OpenEnv sample inference.py.
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ STDOUT FORMAT:
8
+ [START] task=<task> env=<benchmark> model=<model>
9
+ [STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>
10
+ [END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...>
11
+ """
12
 
13
+ import asyncio
14
  import json
15
  import os
16
  import re
 
17
  from typing import Dict, List, Optional
18
 
19
+ # Load .env if present (local dev only)
20
  try:
21
  from dotenv import load_dotenv
22
  load_dotenv()
 
25
 
26
  from openai import OpenAI
27
 
28
+ # Ensure package importable from any working directory
29
  import pathlib, sys as _sys
30
  _root = pathlib.Path(__file__).resolve().parent
31
  _parent = _root.parent
 
37
  from agentops_gym.models import ToolCall
38
 
39
  # ---------------------------------------------------------------------------
40
+ # Configuration — exactly matching the official sample pattern
41
  # ---------------------------------------------------------------------------
42
 
43
+ IMAGE_NAME = os.getenv("IMAGE_NAME")
44
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY") # HF_TOKEN first
45
+
46
+ API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
47
+ MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
 
 
 
 
48
 
49
  BENCHMARK = "agentops-gym"
50
  MAX_STEPS = 10
 
89
 
90
 
91
  def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
92
+ error_val = error if error else "null"
93
+ done_val = str(done).lower()
94
  print(
95
+ f"[STEP] step={step} action={str(action).replace(chr(10),' ')[:200]} "
96
+ f"reward={reward:.2f} done={done_val} error={error_val}",
97
  flush=True,
98
  )
99
 
100
 
101
  def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
102
+ # Score must be strictly between 0 and 1
103
+ score = max(0.001, min(0.999, score))
104
  rewards_str = ",".join(f"{r:.2f}" for r in rewards)
105
  print(
106
  f"[END] success={str(success).lower()} steps={steps} "
 
156
  return None
157
 
158
 
159
+ def get_model_action(client: OpenAI, obs_data: Dict, history: List[str]) -> Optional[Dict]:
 
160
  try:
161
  completion = client.chat.completions.create(
162
+ model=MODEL_NAME,
163
  messages=[
164
  {"role": "system", "content": SYSTEM_PROMPT},
165
  {"role": "user", "content": build_prompt(obs_data, history)},
 
174
  return None
175
 
176
  # ---------------------------------------------------------------------------
177
+ # Single episode runner
178
  # ---------------------------------------------------------------------------
179
 
180
+ async def run_episode(env: AgentOpsEnv, client: OpenAI, task_id: str) -> Dict:
181
+ history: List[str] = []
182
+ rewards: List[float] = []
 
 
 
 
 
 
 
 
183
  steps_taken = 0
184
+ score = 0.001
185
+ success = False
186
+ obs_data: Dict = {}
187
 
188
  log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
189
 
190
  try:
191
+ result = await env.reset(seed=None, task_id=task_id)
192
+ obs_data = (
193
+ result.observation.model_dump()
194
+ if hasattr(result.observation, "model_dump")
195
+ else result.observation.dict()
196
+ )
197
+
198
+ for step in range(1, MAX_STEPS + 1):
199
+ if result.done or obs_data.get("done", False):
200
+ break
201
+
202
+ tool_call = get_model_action(client, obs_data, history) or {
203
+ "tool": "Grep",
204
+ "parameters": {"pattern": "def "},
205
+ "reasoning": "fallback",
206
+ }
207
+
208
+ tool = tool_call.get("tool", "Grep")
209
+ params = tool_call.get("parameters", {})
210
+ reasoning = tool_call.get("reasoning", "")
211
+ action_str = f"{tool}({json.dumps(params)})"
212
+
213
+ try:
214
+ result = await env.step(
215
+ ToolCall(tool=tool, parameters=params, reasoning=reasoning)
216
+ )
217
+ except Exception as e:
218
+ log_step(step=step, action=action_str, reward=0.0, done=True, error=str(e))
219
+ break
220
 
221
  obs_data = (
222
  result.observation.model_dump()
 
224
  else result.observation.dict()
225
  )
226
 
227
+ reward = float(result.reward or 0.0)
228
+ done = bool(result.done)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
+ rewards.append(reward)
231
+ steps_taken = step
232
+ history.append(f"Step {step}: {action_str} β†’ reward {reward:.2f}")
233
 
234
+ log_step(step=step, action=action_str, reward=reward, done=done, error=None)
235
 
236
+ if done:
237
+ break
238
 
 
239
  meta = obs_data.get("metadata", {})
240
  score = float(meta.get("grader_score") or 0.0)
241
+ if score == 0.0:
242
  score = float(meta.get("cumulative_reward") or 0.0)
243
  score = max(0.001, min(0.999, score))
244
  success = score >= SUCCESS_SCORE_THRESHOLD
245
 
246
  except Exception as e:
247
+ print(f"[DEBUG] Episode error for {task_id}: {e}", flush=True)
248
+ score = 0.001
 
249
 
250
  finally:
251
  log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
 
259
  }
260
 
261
  # ---------------------------------------------------------------------------
262
+ # Main — exactly matching official sample pattern
263
  # ---------------------------------------------------------------------------
264
 
265
+ async def async_main() -> None:
266
+ # Use module-level API_KEY and API_BASE_URL — same as official sample
267
+ client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
 
 
 
 
 
268
 
269
+ # from_docker_image is awaitable — same as official sample
270
+ env = await AgentOpsEnv.from_docker_image(IMAGE_NAME)
271
 
272
  print("=" * 60, flush=True)
273
  print("AgentOps Gym β€” Baseline Inference", flush=True)
274
+ print(f"Model: {MODEL_NAME} | Image: {IMAGE_NAME}", flush=True)
275
  print("=" * 60, flush=True)
276
 
277
  results = []
278
+ try:
279
+ async with env:
280
+ for task_id in ALL_TASKS:
281
+ print("─" * 40, flush=True)
282
+ result = await run_episode(env, client, task_id)
283
+ results.append(result)
284
+ except Exception as e:
285
+ print(f"[DEBUG] Cleanup error (non-fatal): {e}", flush=True)
 
 
 
 
 
 
 
 
 
286
 
287
+ # Summary
288
  total = sum(r["score"] for r in results)
289
  solved = sum(1 for r in results if r["success"])
290
  avg = total / len(results) if results else 0.0
 
301
 
302
 
303
  if __name__ == "__main__":
304
+ asyncio.run(async_main())