"""
WebScrapeAgent — Evaluation Script
===================================
Tests the fine-tuned model on diverse web scraping scenarios.
Measures: JSON validity, schema compliance, data accuracy, action correctness.
Usage:
python evaluate.py
python evaluate.py --model path/to/local/model
"""
import unsloth
import os, json, torch, argparse
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
# Evaluation scenarios: each entry is a chat transcript (system/user[/assistant]
# messages) plus the named checks (see check()) that the model's reply must pass.
#
# NOTE(review): the HTML fixtures inside the user messages appear to have had
# their markup stripped by a text-extraction step (the string literals were
# split across raw newlines, which is not valid Python). The visible text has
# been re-joined with "\n"; likewise "Think in blocks." has been restored to
# "Think in <thought> blocks." — confirm both against the original fixtures.
EVAL_SCENARIOS = [
    {
        "name": "extract_product_table",
        "skill": "html_reading",
        "messages": [
            {"role": "system", "content": "You are WebScrapeAgent, a web data extraction assistant. Given web content and a target schema, extract clean structured JSON. Every value must exist in the source content. Never invent data. Always include extraction status."},
            {"role": "user", "content": "Extract structured data from the following web content.\n\nSony WH-1000XM5\n$348.00\n4.7 out of 5\nAvailable\n\nAirPods Max\n$549.00\n4.3 out of 5\nOnly 2 left\n\nReturn as JSON array of products with name, sku, price, rating, and availability."},
        ],
        "checks": ["has_json", "has_status", "count_2_items"],
    },
    {
        "name": "navigate_action",
        "skill": "action_sequencing",
        "messages": [
            {"role": "system", "content": "You are WebScrapeAgent, an autonomous web scraping system.\n\nAvailable actions: NAVIGATE, CLICK, WAIT, SET_COOKIES, SET_HEADERS, LOAD_BROWSER_PROFILE, EXECUTE_JS, SCROLL, SWITCH_STRATEGY, RETURN_RESULT\n\nRules: Think in <thought> blocks. Max 10 steps."},
            {"role": "user", "content": "Task: Extract the pricing table from a SaaS website\nURL: https://app.example.com/pricing"},
        ],
        "checks": ["has_thought", "has_action", "action_is_navigate"],
    },
    {
        "name": "auth_handling",
        "skill": "authentication",
        "messages": [
            {"role": "system", "content": "You are WebScrapeAgent, an autonomous web scraping system.\n\nAvailable actions: NAVIGATE, CLICK, WAIT, SET_COOKIES, SET_HEADERS, LOAD_BROWSER_PROFILE, EXECUTE_JS, SCROLL, SWITCH_STRATEGY, RETURN_RESULT\n\nRules: Think in <thought> blocks. Max 10 steps."},
            {"role": "user", "content": "Task: Extract dashboard analytics\nURL: https://analytics.example.com/dashboard\nAuthentication: Session cookies available in vault"},
        ],
        "checks": ["has_thought", "has_action", "action_is_auth"],
    },
    {
        "name": "error_recovery_403",
        "skill": "error_recovery",
        "messages": [
            {"role": "system", "content": "You are WebScrapeAgent, an autonomous web scraping system.\n\nAvailable actions: NAVIGATE, CLICK, WAIT, SET_COOKIES, SET_HEADERS, LOAD_BROWSER_PROFILE, EXECUTE_JS, SCROLL, SWITCH_STRATEGY, RETURN_RESULT\n\nRules: Think in <thought> blocks. Max 10 steps."},
            {"role": "user", "content": "Task: Extract reviews\nURL: https://reviews.example.com/product/123"},
            {"role": "assistant", "content": "Navigate to the page.\n\nACTION: NAVIGATE\n```json\n{\"url\": \"https://reviews.example.com/product/123\"}\n```"},
            {"role": "user", "content": "Observation: HTTP 403 Forbidden\n\nAccess Denied\nBot detection triggered."},
        ],
        "checks": ["has_thought", "has_recovery_action", "not_gives_up"],
    },
    {
        "name": "empty_content",
        "skill": "html_reading",
        "messages": [
            {"role": "system", "content": "You are WebScrapeAgent, a web data extraction assistant. Never invent data. Always include status."},
            {"role": "user", "content": "Extract products.\n\n\n\n"},
        ],
        "checks": ["returns_empty_or_acknowledges", "has_status"],
    },
]
def check(response: str, check_name: str) -> bool:
    """Return True if *response* satisfies the named evaluation check.

    Unknown check names return False, so a typo in a scenario's "checks"
    list surfaces as a failed check rather than a crash.
    """
    r = response.lower()
    if check_name == "has_json":
        # Either a fenced ```json block or any quoted value next to a "{" counts.
        return "{" in response and ("```json" in response or '"' in response)
    if check_name == "has_status":
        return '"status"' in response
    if check_name == "has_thought":
        # NOTE(review): the original marker (likely a "<thought>" tag) was lost
        # in extraction — the previous test was '"" in response', which is
        # vacuously True. Confirm the real reasoning-block marker.
        return "thought" in r
    if check_name == "has_action":
        return "ACTION:" in response
    if check_name == "action_is_navigate":
        return "NAVIGATE" in response
    if check_name == "action_is_auth":
        return any(a in response for a in ["SET_COOKIES", "SET_HEADERS", "LOAD_BROWSER_PROFILE", "NAVIGATE"])
    if check_name == "has_recovery_action":
        return any(a in response for a in ["SWITCH_STRATEGY", "NAVIGATE", "SET_HEADERS"])
    if check_name == "not_gives_up":
        return '"status": "failed"' not in response
    if check_name == "count_2_items":
        # Parse the outermost [...] span and require exactly two items.
        start = response.find("[")
        end = response.rfind("]") + 1
        if start < 0 or end <= start:
            return False
        try:
            return len(json.loads(response[start:end])) == 2
        except (ValueError, TypeError):  # was bare `except:` — narrow to JSON parse errors
            return False
    if check_name == "returns_empty_or_acknowledges":
        return "[]" in response or "no " in r or "empty" in r or "not found" in r
    return False
def main():
    """Load the model, run each EVAL_SCENARIOS transcript through it, score
    the replies with check(), print a summary, and write eval_results.json."""
    parser = argparse.ArgumentParser(description="Evaluate WebScrapeAgent on scripted scraping scenarios.")
    parser.add_argument("--model", default="sukritvemula/WebScrapeAgent-7B-v1")
    args = parser.parse_args()

    print(f"Loading: {args.model}")
    model, tokenizer = FastLanguageModel.from_pretrained(
        args.model, max_seq_length=4096, dtype=None, load_in_4bit=True
    )
    FastLanguageModel.for_inference(model)
    tokenizer = get_chat_template(tokenizer, chat_template="qwen-2.5")

    results = []
    for s in EVAL_SCENARIOS:
        print(f"\n{'='*50}\nTest: {s['name']} ({s['skill']})")
        # NOTE(review): device is hard-coded — assumes a CUDA GPU is present
        # (4-bit loading effectively requires one). Confirm target hardware.
        inputs = tokenizer.apply_chat_template(
            s["messages"], tokenize=True, add_generation_prompt=True, return_tensors="pt"
        ).to("cuda")
        # Disable autograd during generation: evaluation never needs gradients,
        # and this trims activation memory.
        with torch.no_grad():
            out = model.generate(input_ids=inputs, max_new_tokens=1024, temperature=0.3, do_sample=True, top_p=0.9)
        # Decode only the newly generated tokens (skip the prompt prefix).
        resp = tokenizer.decode(out[0][inputs.shape[1]:], skip_special_tokens=True)
        print(f"Response: {resp[:400]}...")
        passed = sum(1 for c in s["checks"] if check(resp, c))
        score = passed / len(s["checks"])
        results.append({"name": s["name"], "skill": s["skill"], "score": score, "passed": passed, "total": len(s["checks"])})
        print(f"Score: {score:.2f} ({passed}/{len(s['checks'])})")

    print(f"\n{'='*60}\nSUMMARY")
    for r in results:
        print(f" {r['name']:30s} {r['skill']:20s} {r['score']:.2f}")
    avg = sum(r["score"] for r in results) / len(results) if results else 0.0
    print(f"\n Average: {avg:.2f}")
    # Explicit encoding so the results file is identical across platforms.
    with open("eval_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)


if __name__ == "__main__":
    main()