| |
| """Hit the deployed endpoint with the same prompt twice — MTP off, then MTP on. |
| Set ENDPOINT_URL and HF_TOKEN env vars before running. |
| """ |
| import os, json, sys |
| import urllib.request |
|
|
# Connection settings come from the environment; refuse to run without both.
URL = os.getenv("ENDPOINT_URL")
TOKEN = os.getenv("HF_TOKEN")
if not (URL and TOKEN):
    print("set ENDPOINT_URL and HF_TOKEN env vars first", file=sys.stderr)
    raise SystemExit(1)
|
|
# Fixed prompt sent with every request, so each mode receives identical input.
PROMPT = " ".join(
    (
        "Write a Python function `decompose_modal(sentence)` that takes a legal",
        "sentence and returns (modal_word, sentence_without_modal). Handle: shall,",
        "must, may, will, should. Return (None, sentence) if no modal found.",
        "Include 5 example calls.",
    )
)
|
|
|
|
def call(use_mtp: bool):
    """Send PROMPT to the endpoint once and return the parsed JSON reply.

    Args:
        use_mtp: Value forwarded as the ``use_mtp`` generation parameter.

    Returns:
        Whatever JSON value the endpoint responds with, decoded by ``json``.

    The request asks for at most 300 new tokens with sampling enabled at
    temperature 0.7, and waits up to 600 seconds for the response. Errors
    raised by ``urllib`` or by JSON decoding propagate to the caller.
    """
    generation_params = {
        "max_new_tokens": 300,
        "use_mtp": use_mtp,
        "do_sample": True,
        "temperature": 0.7,
    }
    body = json.dumps({"inputs": PROMPT, "parameters": generation_params}).encode()
    headers = {
        "Authorization": f"Bearer {TOKEN}",
        "Content-Type": "application/json",
    }
    # Supplying ``data`` makes urllib issue the request as a POST.
    request = urllib.request.Request(URL, data=body, headers=headers)
    with urllib.request.urlopen(request, timeout=600) as response:
        return json.load(response)
|
|
|
|
def fmt(label, result):
    """Print a short summary of one endpoint response to stdout.

    Args:
        label: Heading text printed between ``---`` markers.
        result: Decoded JSON response. Either a dict, or a list whose first
            element is a dict; only the first element of a list is reported.

    Fields missing from the response print as ``None`` and the generated
    text is cut to its first 160 characters. A response with no usable
    record (an empty list, or any non-dict value) is printed raw instead of
    raising, so one odd reply does not abort the remaining runs.
    """
    # Guard the index: an empty list has no first element to unwrap.
    record = result[0] if isinstance(result, list) and result else result
    print(f"--- {label} ---")
    if not isinstance(record, dict):
        print(f" unexpected response: {record!r}")
        print()
        return
    for field in ("use_mtp", "generated_tokens", "elapsed_seconds", "tokens_per_second"):
        print(f" {field}: {record.get(field)}")
    # ``or ''`` covers both a missing key and an explicit null text value.
    print(f" text excerpt: {(record.get('generated_text') or '')[:160]!r}")
    print()
|
|
|
|
if __name__ == "__main__":
    # One untimed request per mode first; the responses are discarded.
    print("warming...")
    for mtp_enabled in (False, True):
        call(use_mtp=mtp_enabled)

    print("\n=== TIMED RUNS ===\n")
    for run_number in range(1, 4):
        print(f"### run {run_number} ###")
        # Same order on every pass: MTP off, then MTP on.
        for label, mtp_enabled in (("MTP OFF", False), ("MTP ON ", True)):
            fmt(label, call(use_mtp=mtp_enabled))
|
|