""" Inference script for NeuralTuner — runs all 15 scenarios using the HF router. Usage: HF_TOKEN=hf_... python inference.py # all 15 scenarios HF_TOKEN=hf_... python inference.py --difficulty easy # 5 easy only HF_TOKEN=hf_... python inference.py --scenario inception_v3_medium HF_TOKEN=hf_... python inference.py --model Qwen/Qwen2.5-72B-Instruct """ from __future__ import annotations import argparse import json import os import re import sys from typing import Any, Dict, List, Optional from openai import OpenAI from models import NeuralTunerAction from server.neural_tuner_env_environment import NeuralTunerEnvironment from server.scenarios import EASY_SCENARIOS, HARD_SCENARIOS, MEDIUM_SCENARIOS, Scenario API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1") MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct") HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY") TEMPERATURE = 0.0 MAX_TOKENS = 1024 MAX_STEPS = 20 ALL_SCENARIOS: List[Scenario] = EASY_SCENARIOS + MEDIUM_SCENARIOS + HARD_SCENARIOS SYSTEM_PROMPT = """You are an expert ML optimization agent for Qualcomm Snapdragon hardware. Your goal is to reduce the latency and memory of a neural network while preserving accuracy. You interact with the NeuralTuner environment by emitting exactly one tool call per turn in this format: {"name": "", "arguments": {}} Available actions: - profile_layer(layer_id) — reveal sensitivity and optimization hints for a layer - quantize_layer(layer_id, dtype) — apply dtype quantization (FP32 | FP16 | INT8 | INT4) - prune_layer(layer_id, sparsity) — structured pruning (LOW | MEDIUM | HIGH) - revert_layer(layer_id) — reset a layer to FP32 / no pruning - benchmark() — simulate current plan and see latency/memory/accuracy - submit() — finalise and score the episode Strategy: 1. Profile the most expensive or sensitive-looking layers first. 2. Apply aggressive quantization (INT8 / INT4) to low-sensitivity layers. 3. Use FP16 for medium-sensitivity layers; leave high-sensitivity layers at FP32. 4. Call benchmark() after quantizing a batch to track progress. 5. Call submit() once all constraints are met.""" def _parse_tool_call(text: str) -> Optional[Dict[str, Any]]: """Extract tool call from model output. Accepts three formats: 1. {"name": "...", "arguments": {...}} 2. Bare JSON {"name": "...", "arguments": {...}} 3. Bare JSON {"action_type": "...", "layer_id": ..., ...} """ m = re.search(r"([\s\S]*?)", text) if m: try: return json.loads(m.group(1).strip()) except json.JSONDecodeError: pass # Bare JSON object m2 = re.search(r"\{[\s\S]*\}", text) if m2: try: obj = json.loads(m2.group(0)) if "name" in obj and "arguments" in obj: return obj if "action_type" in obj: return {"name": obj["action_type"], "arguments": {k: v for k, v in obj.items() if k != "action_type"}} except json.JSONDecodeError: pass return None def _log_step(step: int, name: str, args: Dict, reward: float, done: bool) -> None: args_str = " ".join(f"{k}={v}" for k, v in args.items() if v is not None) print(f" [{step:02d}] {name}({args_str}) reward={reward:.4f} done={done}", flush=True) def run_episode(client: OpenAI, scenario: Scenario, model: str = MODEL_NAME, max_steps: int = MAX_STEPS) -> float: """Run one full episode for *scenario* and return the final reward.""" env = NeuralTunerEnvironment() reset_obs = env.reset(model_id=scenario.model_id, difficulty=scenario.difficulty) obs_text = reset_obs.output print(f"\n{'─'*60}", flush=True) print(f"Scenario : {scenario.name} ({scenario.difficulty})", flush=True) print(f"Model : {scenario.model_id}", flush=True) print( f"Constraints: latency≤{scenario.constraints.latency_budget_ms}ms " f"memory≤{scenario.constraints.memory_budget_mb}MB " f"accuracy≥{scenario.constraints.min_accuracy_retention}", flush=True, ) messages = [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": obs_text}, ] final_reward = 0.0 for step in range(1, max_steps + 1): try: completion = client.chat.completions.create( model=model, messages=messages, temperature=TEMPERATURE, max_tokens=MAX_TOKENS, ) response_text = completion.choices[0].message.content or "" except Exception as exc: print(f" [{step:02d}] API error: {exc}", flush=True) break tool_call = _parse_tool_call(response_text) if tool_call is None: print(f" [{step:02d}] Could not parse tool call from: {response_text[:120]!r}", flush=True) messages.append({"role": "assistant", "content": response_text}) messages.append( {"role": "user", "content": "Invalid response. Emit exactly one ... block."} ) continue name = tool_call.get("name", "") args: Dict[str, Any] = tool_call.get("arguments", {}) action = NeuralTunerAction( action_type=name, layer_id=args.get("layer_id"), dtype=args.get("dtype"), sparsity=args.get("sparsity"), ) try: result = env.step(action) except Exception as exc: print(f" [{step:02d}] env.step error: {exc}", flush=True) messages.append({"role": "assistant", "content": response_text}) messages.append({"role": "user", "content": f"Environment error: {exc}"}) continue final_reward = float(result.reward) _log_step(step, name, args, final_reward, bool(result.done)) messages.append({"role": "assistant", "content": response_text}) messages.append({"role": "user", "content": result.output}) if result.done: break return final_reward # ── Main ─────────────────────────────────────────────────────────────────────── def main() -> None: parser = argparse.ArgumentParser(description="Run NeuralTuner inference across all scenarios.") parser.add_argument("--model", default=MODEL_NAME, help="HF model ID (default: Qwen/Qwen2.5-72B-Instruct)") parser.add_argument( "--difficulty", choices=["easy", "medium", "hard"], default=None, help="Restrict to one difficulty tier." ) parser.add_argument("--scenario", default=None, help="Run a single scenario by name (e.g. inception_v3_medium).") parser.add_argument("--max-steps", type=int, default=MAX_STEPS) args = parser.parse_args() token = HF_TOKEN if not token: print("Error: set HF_TOKEN environment variable.", file=sys.stderr) sys.exit(1) model = args.model client = OpenAI(base_url=API_BASE_URL, api_key=token) # Select scenarios to run if args.scenario: scenarios = [s for s in ALL_SCENARIOS if s.name == args.scenario] if not scenarios: print(f"Unknown scenario '{args.scenario}'. Available:", file=sys.stderr) for s in ALL_SCENARIOS: print(f" {s.name}", file=sys.stderr) sys.exit(1) elif args.difficulty: scenarios = [s for s in ALL_SCENARIOS if s.difficulty == args.difficulty] else: scenarios = ALL_SCENARIOS print(f"NeuralTuner Inference — model={model}", flush=True) print(f"Running {len(scenarios)} scenario(s)\n", flush=True) scores: Dict[str, float] = {} for scenario in scenarios: scores[scenario.name] = run_episode(client, scenario, model=model, max_steps=args.max_steps) # Summary print(f"\n{'='*60}") print("RESULTS") print(f"{'='*60}") for diff in ["easy", "medium", "hard"]: tier = {n: r for n, r in scores.items() if n.endswith(f"_{diff}")} if not tier: continue print(f"\n {diff.upper()}") for name, reward in tier.items(): print(f" {name:<35} {reward:.4f}") print(f" {'avg':35} {sum(tier.values())/len(tier):.4f}") if scores: avg = sum(scores.values()) / len(scores) print(f"\n Overall average ({len(scores)} scenarios): {avg:.4f}") print(f"{'='*60}") if __name__ == "__main__": main()