File size: 8,814 Bytes
5dd60b9
3b7f446
5dd60b9
3b7f446
 
 
 
 
5dd60b9
 
 
 
 
 
 
 
 
3b7f446
5dd60b9
3b7f446
5dd60b9
8f2eab9
 
 
3b7f446
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8f2eab9
 
 
 
 
 
3b7f446
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8f2eab9
 
 
3b7f446
 
 
 
 
 
 
 
 
 
5dd60b9
3b7f446
 
 
 
 
 
 
5dd60b9
3b7f446
 
5dd60b9
3b7f446
 
5dd60b9
3b7f446
 
5dd60b9
3b7f446
5dd60b9
 
3b7f446
5dd60b9
8f2eab9
3b7f446
 
 
8f2eab9
 
 
 
3b7f446
5dd60b9
 
3b7f446
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5dd60b9
 
 
3b7f446
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
"""
Inference script for NeuralTuner — runs all 15 scenarios using the HF router.

Usage:
    HF_TOKEN=hf_... python inference.py                           # all 15 scenarios
    HF_TOKEN=hf_... python inference.py --difficulty easy         # 5 easy only
    HF_TOKEN=hf_... python inference.py --scenario inception_v3_medium
    HF_TOKEN=hf_... python inference.py --model Qwen/Qwen2.5-72B-Instruct
"""

from __future__ import annotations

import argparse
import json
import os
import re
import sys
from typing import Any, Dict, List, Optional

from openai import OpenAI

from models import NeuralTunerAction
from server.neural_tuner_env_environment import NeuralTunerEnvironment
from server.scenarios import EASY_SCENARIOS, HARD_SCENARIOS, MEDIUM_SCENARIOS, Scenario

# Endpoint and model selection; each value can be overridden through the environment.
API_BASE_URL = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.environ.get("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
# HF_TOKEN takes precedence; API_KEY is used when HF_TOKEN is unset or empty.
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("API_KEY")

# Generation settings passed to every chat-completion request.
TEMPERATURE = 0.0
MAX_TOKENS = 1024
# Default cap on the number of model turns per episode.
MAX_STEPS = 20

# Every scenario in run order: easy tier first, then medium, then hard.
ALL_SCENARIOS: List[Scenario] = EASY_SCENARIOS + MEDIUM_SCENARIOS + HARD_SCENARIOS

# System message sent at the start of every episode. It defines the tool-call
# wire format that _parse_tool_call() expects and lists the environment actions.
# The separators after each action signature are real em dashes (U+2014); the
# previous text contained mis-encoded bytes that were sent verbatim to the model.
SYSTEM_PROMPT = """You are an expert ML optimization agent for Qualcomm Snapdragon hardware.
Your goal is to reduce the latency and memory of a neural network while preserving accuracy.

You interact with the NeuralTuner environment by emitting exactly one tool call per turn in this format:
<tool_call>{"name": "<action>", "arguments": {<args>}}</tool_call>

Available actions:
- profile_layer(layer_id)        — reveal sensitivity and optimization hints for a layer
- quantize_layer(layer_id, dtype) — apply dtype quantization (FP32 | FP16 | INT8 | INT4)
- prune_layer(layer_id, sparsity) — structured pruning (LOW | MEDIUM | HIGH)
- revert_layer(layer_id)         — reset a layer to FP32 / no pruning
- benchmark()                    — simulate current plan and see latency/memory/accuracy
- submit()                       — finalise and score the episode

Strategy:
1. Profile the most expensive or sensitive-looking layers first.
2. Apply aggressive quantization (INT8 / INT4) to low-sensitivity layers.
3. Use FP16 for medium-sensitivity layers; leave high-sensitivity layers at FP32.
4. Call benchmark() after quantizing a batch to track progress.
5. Call submit() once all constraints are met."""


def _parse_tool_call(text: str) -> Optional[Dict[str, Any]]:
    """Extract tool call from model output.

    Accepts three formats:
    1. <tool_call>{"name": "...", "arguments": {...}}</tool_call>
    2. Bare JSON {"name": "...", "arguments": {...}}
    3. Bare JSON {"action_type": "...", "layer_id": ..., ...}
    """
    m = re.search(r"<tool_call>([\s\S]*?)</tool_call>", text)
    if m:
        try:
            return json.loads(m.group(1).strip())
        except json.JSONDecodeError:
            pass
    # Bare JSON object
    m2 = re.search(r"\{[\s\S]*\}", text)
    if m2:
        try:
            obj = json.loads(m2.group(0))
            if "name" in obj and "arguments" in obj:
                return obj
            if "action_type" in obj:
                return {"name": obj["action_type"], "arguments": {k: v for k, v in obj.items() if k != "action_type"}}
        except json.JSONDecodeError:
            pass
    return None


def _log_step(step: int, name: str, args: Dict, reward: float, done: bool) -> None:
    args_str = "  ".join(f"{k}={v}" for k, v in args.items() if v is not None)
    print(f"  [{step:02d}] {name}({args_str})  reward={reward:.4f}  done={done}", flush=True)


def run_episode(client: OpenAI, scenario: Scenario, model: str = MODEL_NAME, max_steps: int = MAX_STEPS) -> float:
    """Run one full episode for *scenario* and return the final reward.

    Each turn asks the chat model for one tool call, converts it into a
    ``NeuralTunerAction`` and feeds the environment's text output back to the
    model. The loop stops when the environment reports ``done``, when the API
    call fails, or after *max_steps* model turns. Unparseable replies and
    rejected actions are reported back to the model and count as turns.

    Args:
        client: OpenAI-compatible client used for chat completions.
        scenario: Scenario whose ``model_id`` and ``difficulty`` seed the environment.
        model: Model identifier passed to the chat-completions endpoint.
        max_steps: Upper bound on model turns for the episode.

    Returns:
        The reward of the last successfully executed step, or ``0.0`` when no
        step was executed.
    """
    env = NeuralTunerEnvironment()
    reset_obs = env.reset(model_id=scenario.model_id, difficulty=scenario.difficulty)
    obs_text = reset_obs.output

    print(f"\n{'─'*60}", flush=True)
    print(f"Scenario : {scenario.name}  ({scenario.difficulty})", flush=True)
    print(f"Model    : {scenario.model_id}", flush=True)
    print(
        f"Constraints: latency≤{scenario.constraints.latency_budget_ms}ms  "
        f"memory≤{scenario.constraints.memory_budget_mb}MB  "
        f"accuracy≥{scenario.constraints.min_accuracy_retention}",
        flush=True,
    )

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": obs_text},
    ]

    final_reward = 0.0

    for step in range(1, max_steps + 1):
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=TEMPERATURE,
                max_tokens=MAX_TOKENS,
            )
            response_text = completion.choices[0].message.content or ""
        except Exception as exc:
            # Boundary with a remote service: any failure ends the episode.
            print(f"  [{step:02d}] API error: {exc}", flush=True)
            break

        tool_call = _parse_tool_call(response_text)
        args: Any = None
        if isinstance(tool_call, dict):
            # A missing or null "arguments" means a no-argument action such as submit().
            args = tool_call.get("arguments") or {}
        if not isinstance(args, dict):
            # No tool call found, or its payload is not a JSON object: ask the model to retry.
            print(f"  [{step:02d}] Could not parse tool call from: {response_text[:120]!r}", flush=True)
            messages.append({"role": "assistant", "content": response_text})
            messages.append(
                {"role": "user", "content": "Invalid response. Emit exactly one <tool_call>...</tool_call> block."}
            )
            continue

        name = tool_call.get("name", "")

        try:
            # Building the action is inside the try so that a model-supplied value
            # the action type rejects is reported to the model, not raised.
            action = NeuralTunerAction(
                action_type=name,
                layer_id=args.get("layer_id"),
                dtype=args.get("dtype"),
                sparsity=args.get("sparsity"),
            )
            result = env.step(action)
        except Exception as exc:
            print(f"  [{step:02d}] env.step error: {exc}", flush=True)
            messages.append({"role": "assistant", "content": response_text})
            messages.append({"role": "user", "content": f"Environment error: {exc}"})
            continue

        # NOTE(review): assumes a step may report no reward (None); treated as 0.0 — confirm.
        final_reward = float(result.reward) if result.reward is not None else 0.0
        _log_step(step, name, args, final_reward, bool(result.done))

        messages.append({"role": "assistant", "content": response_text})
        messages.append({"role": "user", "content": result.output})

        if result.done:
            break

    return final_reward


# ── Main ───────────────────────────────────────────────────────────────────────


def main() -> None:
    """Parse CLI arguments, run the selected scenarios and print a score summary.

    Scenario selection: ``--scenario`` (exact name) takes precedence over
    ``--difficulty``; with neither, every scenario in ``ALL_SCENARIOS`` runs.
    The summary lists rewards per difficulty tier, grouped by each scenario's
    ``difficulty`` attribute, followed by the overall average.

    Exits with status 1 when no API token is configured or when ``--scenario``
    names an unknown scenario.
    """
    parser = argparse.ArgumentParser(description="Run NeuralTuner inference across all scenarios.")
    # %(default)s keeps the help text correct when MODEL_NAME is overridden via the environment.
    parser.add_argument("--model", default=MODEL_NAME, help="HF model ID (default: %(default)s)")
    parser.add_argument(
        "--difficulty", choices=["easy", "medium", "hard"], default=None, help="Restrict to one difficulty tier."
    )
    parser.add_argument("--scenario", default=None, help="Run a single scenario by name (e.g. inception_v3_medium).")
    parser.add_argument("--max-steps", type=int, default=MAX_STEPS)
    args = parser.parse_args()

    token = HF_TOKEN
    if not token:
        print("Error: set HF_TOKEN environment variable.", file=sys.stderr)
        sys.exit(1)

    model = args.model
    client = OpenAI(base_url=API_BASE_URL, api_key=token)

    # Select scenarios to run
    if args.scenario:
        scenarios = [s for s in ALL_SCENARIOS if s.name == args.scenario]
        if not scenarios:
            print(f"Unknown scenario '{args.scenario}'. Available:", file=sys.stderr)
            for s in ALL_SCENARIOS:
                print(f"  {s.name}", file=sys.stderr)
            sys.exit(1)
    elif args.difficulty:
        scenarios = [s for s in ALL_SCENARIOS if s.difficulty == args.difficulty]
    else:
        scenarios = ALL_SCENARIOS

    print(f"NeuralTuner Inference — model={model}", flush=True)
    print(f"Running {len(scenarios)} scenario(s)\n", flush=True)

    scores: Dict[str, float] = {}
    # Remember each scenario's tier so the summary groups on the same attribute
    # used for --difficulty filtering instead of guessing from the name suffix.
    difficulty_by_name: Dict[str, Any] = {}
    for scenario in scenarios:
        scores[scenario.name] = run_episode(client, scenario, model=model, max_steps=args.max_steps)
        difficulty_by_name[scenario.name] = scenario.difficulty

    # Summary
    print(f"\n{'='*60}")
    print("RESULTS")
    print(f"{'='*60}")
    for diff in ["easy", "medium", "hard"]:
        tier = {n: r for n, r in scores.items() if difficulty_by_name[n] == diff}
        if not tier:
            continue
        print(f"\n  {diff.upper()}")
        for name, reward in tier.items():
            print(f"    {name:<35} {reward:.4f}")
        print(f"    {'avg':35} {sum(tier.values())/len(tier):.4f}")
    if scores:
        avg = sum(scores.values()) / len(scores)
        print(f"\n  Overall average ({len(scores)} scenarios): {avg:.4f}")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()