Instructions to use sukritvemula/WebScrapeAgent-7B-v1 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Local Apps
- Unsloth Studio new
How to use sukritvemula/WebScrapeAgent-7B-v1 with Unsloth Studio:
Install Unsloth Studio (macOS, Linux, WSL)
curl -fsSL https://unsloth.ai/install.sh | sh
# Run Unsloth Studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for sukritvemula/WebScrapeAgent-7B-v1 to start chatting
Install Unsloth Studio (Windows)
irm https://unsloth.ai/install.ps1 | iex
# Run Unsloth Studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for sukritvemula/WebScrapeAgent-7B-v1 to start chatting
Using HuggingFace Spaces for Unsloth
# No setup required
# Open https://huggingface.co/spaces/unsloth/studio in your browser
# Search for sukritvemula/WebScrapeAgent-7B-v1 to start chatting
Load model with FastModel
pip install unsloth

from unsloth import FastModel

model, tokenizer = FastModel.from_pretrained(
    model_name="sukritvemula/WebScrapeAgent-7B-v1",
    max_seq_length=2048,
)
"""
WebScrapeAgent — Evaluation Script
===================================
Tests the fine-tuned model on diverse web scraping scenarios.
Measures: JSON validity, schema compliance, data accuracy, action correctness.
Usage:
    python evaluate.py
    python evaluate.py --model path/to/local/model
"""
# unsloth must be imported before its submodules so its runtime patches apply.
import unsloth

import argparse
import json
import os
import torch

from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
# One scenario per skill area.  Each scenario is a chat transcript
# ("messages") sent to the model, plus the names of the heuristic "checks"
# (dispatched by check() below) that the generated reply must satisfy.
EVAL_SCENARIOS = [
    {
        # Plain extraction: two products in the HTML must come back as JSON.
        "name": "extract_product_table",
        "skill": "html_reading",
        "messages": [
            {"role": "system", "content": "You are WebScrapeAgent, a web data extraction assistant. Given web content and a target schema, extract clean structured JSON. Every value must exist in the source content. Never invent data. Always include extraction status."},
            {"role": "user", "content": "Extract structured data from the following web content.\n\n<content>\n<div class=\"product-list\">\n  <div class=\"product\" data-sku=\"WH-1000\">\n    <h3>Sony WH-1000XM5</h3>\n    <span class=\"price\">$348.00</span>\n    <div class=\"rating\">4.7 out of 5</div>\n    <span class=\"stock in-stock\">Available</span>\n  </div>\n  <div class=\"product\" data-sku=\"AP-MAX\">\n    <h3>AirPods Max</h3>\n    <span class=\"price\">$549.00</span>\n    <div class=\"rating\">4.3 out of 5</div>\n    <span class=\"stock limited\">Only 2 left</span>\n  </div>\n</div>\n</content>\n\nReturn as JSON array of products with name, sku, price, rating, and availability."}
        ],
        "checks": ["has_json", "has_status", "count_2_items"]
    },
    {
        # Agent mode: first step for a fresh task should be a NAVIGATE action.
        "name": "navigate_action",
        "skill": "action_sequencing",
        "messages": [
            {"role": "system", "content": "You are WebScrapeAgent, an autonomous web scraping system.\n\nAvailable actions: NAVIGATE, CLICK, WAIT, SET_COOKIES, SET_HEADERS, LOAD_BROWSER_PROFILE, EXECUTE_JS, SCROLL, SWITCH_STRATEGY, RETURN_RESULT\n\nRules: Think in <thought> blocks. Max 10 steps."},
            {"role": "user", "content": "Task: Extract the pricing table from a SaaS website\nURL: https://app.example.com/pricing"}
        ],
        "checks": ["has_thought", "has_action", "action_is_navigate"]
    },
    {
        # Auth-aware task: any of the cookie/header/profile/navigate actions
        # counts as a sensible first move (see action_is_auth in check()).
        "name": "auth_handling",
        "skill": "authentication",
        "messages": [
            {"role": "system", "content": "You are WebScrapeAgent, an autonomous web scraping system.\n\nAvailable actions: NAVIGATE, CLICK, WAIT, SET_COOKIES, SET_HEADERS, LOAD_BROWSER_PROFILE, EXECUTE_JS, SCROLL, SWITCH_STRATEGY, RETURN_RESULT\n\nRules: Think in <thought> blocks. Max 10 steps."},
            {"role": "user", "content": "Task: Extract dashboard analytics\nURL: https://analytics.example.com/dashboard\nAuthentication: Session cookies available in vault"}
        ],
        "checks": ["has_thought", "has_action", "action_is_auth"]
    },
    {
        # Mid-episode transcript: after a 403 observation the model should
        # retry with a recovery action rather than report failure.
        "name": "error_recovery_403",
        "skill": "error_recovery",
        "messages": [
            {"role": "system", "content": "You are WebScrapeAgent, an autonomous web scraping system.\n\nAvailable actions: NAVIGATE, CLICK, WAIT, SET_COOKIES, SET_HEADERS, LOAD_BROWSER_PROFILE, EXECUTE_JS, SCROLL, SWITCH_STRATEGY, RETURN_RESULT\n\nRules: Think in <thought> blocks. Max 10 steps."},
            {"role": "user", "content": "Task: Extract reviews\nURL: https://reviews.example.com/product/123"},
            {"role": "assistant", "content": "<thought>Navigate to the page.</thought>\n\nACTION: NAVIGATE\n```json\n{\"url\": \"https://reviews.example.com/product/123\"}\n```"},
            {"role": "user", "content": "Observation: HTTP 403 Forbidden\n\n<html><body><h1>Access Denied</h1><p>Bot detection triggered.</p></body></html>"}
        ],
        "checks": ["has_thought", "has_recovery_action", "not_gives_up"]
    },
    {
        # Anti-hallucination probe: empty page must not yield invented items.
        "name": "empty_content",
        "skill": "html_reading",
        "messages": [
            {"role": "system", "content": "You are WebScrapeAgent, a web data extraction assistant. Never invent data. Always include status."},
            {"role": "user", "content": "Extract products.\n\n<content>\n<html><body><div class=\"products\"><p class=\"empty-state\">No products found.</p></div></body></html>\n</content>"}
        ],
        "checks": ["returns_empty_or_acknowledges", "has_status"]
    },
]
def check(response: str, check_name: str) -> bool:
    """Return True when *response* passes the named heuristic check.

    These are deliberately loose, string-level heuristics rather than
    strict parsers: they look for markers such as a <thought> block, an
    "ACTION:" line, or a parseable JSON payload in the generated text.

    Args:
        response: Raw decoded model output.
        check_name: One of the check identifiers listed in EVAL_SCENARIOS.

    Returns:
        True if the check passes; False for a failing or unknown check name.
    """
    lowered = response.lower()
    if check_name == "has_json":
        # A JSON object, either fenced in ```json or containing quoted keys.
        return "{" in response and ("```json" in response or '"' in response)
    if check_name == "has_status":
        return '"status"' in response
    if check_name == "has_thought":
        return "<thought>" in response
    if check_name == "has_action":
        return "ACTION:" in response
    if check_name == "action_is_navigate":
        return "NAVIGATE" in response
    if check_name == "action_is_auth":
        # NAVIGATE counts here: proceeding with a pre-authenticated session
        # is an acceptable first step for the auth scenario.
        return any(a in response for a in ["SET_COOKIES", "SET_HEADERS", "LOAD_BROWSER_PROFILE", "NAVIGATE"])
    if check_name == "has_recovery_action":
        return any(a in response for a in ["SWITCH_STRATEGY", "NAVIGATE", "SET_HEADERS"])
    if check_name == "not_gives_up":
        # NOTE(review): exact-substring match is whitespace-sensitive; a
        # compact rendering like {"status":"failed"} would slip through.
        return '"status": "failed"' not in response
    if check_name == "count_2_items":
        # Slice from the first '[' to the last ']' and require exactly
        # two parsed array items.
        start = response.find("[")
        end = response.rfind("]") + 1
        if start < 0 or end <= start:
            return False
        try:
            return len(json.loads(response[start:end])) == 2
        except ValueError:  # json.JSONDecodeError; was a silent bare except
            return False
    if check_name == "returns_empty_or_acknowledges":
        return "[]" in response or "no " in lowered or "empty" in lowered or "not found" in lowered
    return False
def main():
    """Load the model, run every eval scenario, print and save the scores.

    Command line:
        --model   HF repo id or local path of the model to evaluate.
        --output  Path for the JSON results file (default: eval_results.json).
    """
    parser = argparse.ArgumentParser(description="Evaluate WebScrapeAgent on scripted scenarios.")
    parser.add_argument("--model", default="sukritvemula/WebScrapeAgent-7B-v1",
                        help="Model repo id or local path")
    parser.add_argument("--output", default="eval_results.json",
                        help="Where to write the JSON results summary")
    args = parser.parse_args()

    print(f"Loading: {args.model}")
    model, tokenizer = FastLanguageModel.from_pretrained(
        args.model, max_seq_length=4096, dtype=None, load_in_4bit=True
    )
    FastLanguageModel.for_inference(model)
    tokenizer = get_chat_template(tokenizer, chat_template="qwen-2.5")

    # Fall back to CPU so the script still runs on machines without CUDA
    # (previously hard-coded to "cuda" and crashed on CPU-only hosts).
    device = "cuda" if torch.cuda.is_available() else "cpu"

    results = []
    for scenario in EVAL_SCENARIOS:
        print(f"\n{'='*50}\nTest: {scenario['name']} ({scenario['skill']})")
        inputs = tokenizer.apply_chat_template(
            scenario["messages"], tokenize=True, add_generation_prompt=True, return_tensors="pt"
        ).to(device)
        out = model.generate(input_ids=inputs, max_new_tokens=1024, temperature=0.3, do_sample=True, top_p=0.9)
        # Decode only the newly generated tokens, not the prompt.
        resp = tokenizer.decode(out[0][inputs.shape[1]:], skip_special_tokens=True)
        print(f"Response: {resp[:400]}...")
        passed = sum(1 for c in scenario["checks"] if check(resp, c))
        score = passed / len(scenario["checks"])
        results.append({
            "name": scenario["name"],
            "skill": scenario["skill"],
            "score": score,
            "passed": passed,
            "total": len(scenario["checks"]),
        })
        print(f"Score: {score:.2f} ({passed}/{len(scenario['checks'])})")

    print(f"\n{'='*60}\nSUMMARY")
    for r in results:
        print(f"  {r['name']:30s} {r['skill']:20s} {r['score']:.2f}")
    avg = sum(r["score"] for r in results) / len(results)
    print(f"\n  Average: {avg:.2f}")
    with open(args.output, "w") as f:
        json.dump(results, f, indent=2)
# Script entry point: run the evaluation suite when executed directly.
if __name__ == "__main__":
    main()