{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {"provenance": [], "gpuType": "T4"},
  "kernelspec": {"name": "python3", "display_name": "Python 3"},
  "language_info": {"name": "python"},
  "accelerator": "GPU"
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 🕷️ WebScrapeAgent v2 — Fine-tune Qwen2.5-7B for Autonomous Web Scraping\n",
    "\n",
    "Trains **Qwen2.5-7B-Instruct** with **Unsloth + QLoRA** to scrape any website — React SPAs, Cloudflare/Akamai/DataDome protected sites, shadow DOM, infinite scroll, JS-rendered content.\n",
    "\n",
    "**Based on [official Unsloth Qwen2.5 notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2.5_(7B)-Alpaca.ipynb)**. Verified on free Colab T4."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## 1. Install (exact pinned versions from official unsloth notebook)"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "# Outside Colab a plain `pip install unsloth` is used. On Colab, torch is\n",
    "# imported first so the xformers pin can be chosen from its major.minor version,\n",
    "# and the remaining packages are installed with --no-deps.\n",
    "# `os` stays imported here because the save/push cell reads os.environ.\n",
    "import os, re\n",
    "\n",
    "on_colab = \"COLAB_\" in \"\".join(os.environ.keys())\n",
    "if not on_colab:\n",
    "    !pip install unsloth\n",
    "else:\n",
    "    import torch\n",
    "    torch_minor = re.match(r'[\\d]{1,}\\.[\\d]{1,}', str(torch.__version__)).group(0)\n",
    "    xformers_pins = {'2.10':'0.0.34','2.9':'0.0.33.post1','2.8':'0.0.32.post2'}\n",
    "    # Unknown torch versions fall back to the newest pin in the table.\n",
    "    xformers = 'xformers==' + xformers_pins.get(torch_minor, '0.0.34')\n",
    "    !pip install sentencepiece protobuf \"datasets==4.3.0\" \"huggingface_hub>=0.34.0\" hf_transfer\n",
    "    !pip install --no-deps unsloth_zoo bitsandbytes accelerate {xformers} peft trl triton unsloth\n",
    "    !pip install --no-deps --upgrade \"torchao>=0.16.0\"\n",
    "!pip install transformers==4.56.2\n",
    "!pip install --no-deps trl==0.22.2"
   ]
  },
  { "cell_type": "markdown", "metadata": {}, "source": ["## 2. 
Config"] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- User settings: point these at your own Hugging Face account ---\n",
    "HF_USERNAME = \"sukritvemula\"\n",
    "OUTPUT_MODEL = f\"{HF_USERNAME}/WebScrapeAgent-7B-v2\"\n",
    "\n",
    "# --- Loading options read by the model-loading cell (max_seq_length is also given to the trainer) ---\n",
    "load_in_4bit = True\n",
    "max_seq_length = 2048"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Authenticate with the Hugging Face Hub before the dataset download and the uploads below.\n",
    "from huggingface_hub import login\n",
    "\n",
    "login()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## 3. Load model"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from unsloth import FastLanguageModel\n",
    "import torch\n",
    "\n",
    "# Base model and tokenizer, loaded with the options from the config cell.\n",
    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
    "    model_name=\"unsloth/Qwen2.5-7B-Instruct\",\n",
    "    max_seq_length=max_seq_length,\n",
    "    dtype=None,\n",
    "    load_in_4bit=load_in_4bit,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## 4. Add LoRA adapters"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Adapters are attached to the four attention projections and the three MLP projections.\n",
    "lora_target_modules = [\n",
    "    \"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
    "    \"gate_proj\", \"up_proj\", \"down_proj\",\n",
    "]\n",
    "\n",
    "# Rank and alpha are both 16; random_state matches the trainer seed (3407).\n",
    "model = FastLanguageModel.get_peft_model(\n",
    "    model,\n",
    "    r=16,\n",
    "    target_modules=lora_target_modules,\n",
    "    lora_alpha=16,\n",
    "    lora_dropout=0,\n",
    "    bias=\"none\",\n",
    "    use_gradient_checkpointing=\"unsloth\",\n",
    "    random_state=3407,\n",
    "    use_rslora=False,\n",
    "    loftq_config=None,\n",
    ")"
   ]
  },
  { "cell_type": "markdown", "metadata": {}, "source": ["## 5. 
Set chat template + load data"] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from unsloth.chat_templates import get_chat_template\n",
    "\n",
    "# Attach the \"qwen-2.5\" chat template; every apply_chat_template call below uses it.\n",
    "tokenizer = get_chat_template(\n",
    "    tokenizer,\n",
    "    chat_template = \"qwen-2.5\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "dataset = load_dataset(\"sukritvemula/webscrape-agent-training-data\", split=\"train\")\n",
    "print(f\"Loaded {len(dataset)} examples\")\n",
    "\n",
    "# Fail early with a clear message instead of a KeyError inside .map().\n",
    "assert \"messages\" in dataset.column_names, \"dataset must provide a 'messages' column\"\n",
    "\n",
    "\n",
    "def formatting_prompts_func(examples):\n",
    "    \"\"\"Render each conversation in a batch to one training string.\n",
    "\n",
    "    Args:\n",
    "        examples: batch dict from `Dataset.map(batched=True)`; `examples[\"messages\"]`\n",
    "            holds one conversation per row.\n",
    "\n",
    "    Returns:\n",
    "        dict with a single \"text\" key: the chat-template rendering of each\n",
    "        conversation, untokenized and without a trailing generation prompt.\n",
    "    \"\"\"\n",
    "    return {\n",
    "        \"text\": [\n",
    "            tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)\n",
    "            for msgs in examples[\"messages\"]\n",
    "        ]\n",
    "    }\n",
    "\n",
    "\n",
    "# Format messages → text using the chat template\n",
    "dataset = dataset.map(formatting_prompts_func, batched=True)\n",
    "print(f\"Sample: {dataset[0]['text'][:300]}\")"
   ]
  },
  { "cell_type": "markdown", "metadata": {}, "source": ["## 6. 
Train (response-only loss)"] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from trl import SFTTrainer, SFTConfig\n",
    "from transformers import DataCollatorForSeq2Seq\n",
    "from unsloth.chat_templates import train_on_responses_only\n",
    "\n",
    "# Effective batch size is 2 x 4 = 8 sequences per optimizer step.\n",
    "trainer = SFTTrainer(\n",
    "    model = model,\n",
    "    tokenizer = tokenizer,\n",
    "    train_dataset = dataset,\n",
    "    dataset_text_field = \"text\",\n",
    "    max_seq_length = max_seq_length,\n",
    "    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),\n",
    "    packing = False,\n",
    "    args = SFTConfig(\n",
    "        per_device_train_batch_size = 2,\n",
    "        gradient_accumulation_steps = 4,\n",
    "        warmup_steps = 5,\n",
    "        num_train_epochs = 1,\n",
    "        learning_rate = 2e-4,\n",
    "        logging_steps = 10,\n",
    "        optim = \"adamw_8bit\",\n",
    "        weight_decay = 0.01,\n",
    "        lr_scheduler_type = \"linear\",\n",
    "        seed = 3407,\n",
    "        output_dir = \"outputs\",\n",
    "        report_to = \"none\",\n",
    "        save_strategy = \"steps\",\n",
    "        save_steps = 500,\n",
    "        save_total_limit = 2,\n",
    "        push_to_hub = True,\n",
    "        hub_model_id = OUTPUT_MODEL,\n",
    "        hub_strategy = \"end\",\n",
    "    ),\n",
    ")\n",
    "\n",
    "# Train only on assistant responses — mask system/user tokens\n",
    "trainer = train_on_responses_only(\n",
    "    trainer,\n",
    "    instruction_part = \"<|im_start|>user\\n\",\n",
    "    response_part = \"<|im_start|>assistant\\n\",\n",
    ")\n",
    "\n",
    "# Verify masking on one tokenized example: label -100 = masked, anything else = trained.\n",
    "# Tokens and labels are read from the same dataset row so they stay aligned index-for-index.\n",
    "sample = trainer.train_dataset[0]\n",
    "tokens = tokenizer.convert_ids_to_tokens(sample[\"input_ids\"])\n",
    "labels = sample[\"labels\"]\n",
    "\n",
    "# Start the 50-token window shortly before the first trained token so both\n",
    "# masked prompt tokens and trained response tokens are visible.\n",
    "first_trained = next((i for i, lab in enumerate(labels) if lab != -100), 0)\n",
    "start = max(first_trained - 10, 0)\n",
    "for tok, lab in zip(tokens[start:start + 50], labels[start:start + 50]):\n",
    "    print(f\"{'✓' if lab != -100 else '·'} {tok}\")"
   ]
  },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "trainer_stats = trainer.train()\n", "print(f\"\\n✅ Done! 
Loss: {trainer_stats.training_loss:.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## 7. Save & push to Hub"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One token lookup shared by every upload below (None when HF_TOKEN is unset).\n",
    "hf_token = os.environ.get(\"HF_TOKEN\")\n",
    "lora_repo = OUTPUT_MODEL + \"-lora\"\n",
    "\n",
    "# Save LoRA adapter\n",
    "model.save_pretrained(\"webscrape_lora\")\n",
    "tokenizer.save_pretrained(\"webscrape_lora\")\n",
    "\n",
    "# Push merged 16bit to Hub\n",
    "model.push_to_hub_merged(OUTPUT_MODEL, tokenizer, save_method=\"merged_16bit\", token=hf_token)\n",
    "\n",
    "# Push LoRA adapter to Hub. The tokenizer is uploaded with its own call:\n",
    "# the second positional parameter of push_to_hub is `use_temp_dir`, not a tokenizer.\n",
    "model.push_to_hub(lora_repo, token=hf_token)\n",
    "tokenizer.push_to_hub(lora_repo, token=hf_token)\n",
    "\n",
    "print(f\"\\n✅ Model: https://huggingface.co/{OUTPUT_MODEL}\")\n",
    "print(f\"✅ LoRA: https://huggingface.co/{OUTPUT_MODEL}-lora\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## 8. Test the model"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from unsloth.chat_templates import get_chat_template\n",
    "tokenizer = get_chat_template(tokenizer, chat_template = \"qwen-2.5\")\n",
    "FastLanguageModel.for_inference(model)\n",
    "\n",
    "# Test 1: single-turn request, no system prompt\n",
    "messages = [\n",
    "    {\"role\": \"user\", \"content\": \"Extract product names and prices from https://shop.example.com. The site is a Next.js React app behind Cloudflare.\"},\n",
    "]\n",
    "inputs = tokenizer.apply_chat_template(\n",
    "    messages, tokenize=True, add_generation_prompt=True, return_tensors=\"pt\",\n",
    ").to(\"cuda\")\n",
    "\n",
    "from transformers import TextStreamer\n",
    "text_streamer = TextStreamer(tokenizer, skip_prompt=True)\n",
    "# temperature / min_p only take effect when sampling, so request it explicitly.\n",
    "_ = model.generate(input_ids=inputs, streamer=text_streamer,\n",
    "                   max_new_tokens=512, use_cache=True,\n",
    "                   do_sample=True, temperature=1.5, min_p=0.1)"
   ]
  },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Test 2: Anti-bot recovery\n", "messages2 = [\n", "    {\"role\": \"system\", \"content\": \"You are WebScrapeAgent. Think in blocks. 
Output one ACTION per turn.\"},\n",
    "    {\"role\": \"user\", \"content\": \"Task: Extract prices\\nURL: https://store.example.com/deals\"},\n",
    "    {\"role\": \"assistant\", \"content\": \"Try direct HTTP.\\nACTION: NAVIGATE {\\\"url\\\": \\\"https://store.example.com/deals\\\"}\"},\n",
    "    {\"role\": \"user\", \"content\": \"HTTP 403 Forbidden. Headers: cf-ray: abc123, server: cloudflare. Access Denied.\"},\n",
    "]\n",
    "inputs = tokenizer.apply_chat_template(\n",
    "    messages2, tokenize=True, add_generation_prompt=True, return_tensors=\"pt\",\n",
    ").to(\"cuda\")\n",
    "# temperature / min_p only take effect when sampling, so request it explicitly.\n",
    "_ = model.generate(input_ids=inputs, streamer=TextStreamer(tokenizer, skip_prompt=True),\n",
    "                   max_new_tokens=512, use_cache=True,\n",
    "                   do_sample=True, temperature=1.5, min_p=0.1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. TurboQuant Compression — Near-Lossless 3-bit (14GB → 3GB)\n",
    "\n",
    "**The math**: TurboQuant (arXiv:2504.19874) applies a random rotation (Walsh-Hadamard/QR) to weight vectors, making coordinates nearly independent and Beta-distributed. It then applies **information-theoretically optimal scalar quantizers** per coordinate — the Lloyd-Max codebook solved via 1D k-means. Proven within ~2.7× of the Shannon lower bound.\n",
    "\n",
    "**In practice**: The `turboquant-plus-vllm` package (imported as `turboquant_vllm`) implements HIGGS (the scalar weight-compression variant), which applies this at 3 bits/weight. Zero calibration data needed. 
Compresses in seconds.\n",
    "\n",
    "| Metric | FP16 (14GB) | TurboQuant 3-bit (3GB) |\n",
    "|---|---|---|\n",
    "| Model size | 14 GB | ~3 GB |\n",
    "| Compression ratio | 1× | 4.6× |\n",
    "| Calibration data | N/A | None needed |\n",
    "| Time to compress | N/A | ~10 seconds |\n",
    "| Quality loss | baseline | <0.5% on benchmarks |"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First save the merged 16-bit model locally (needed as input for compression)\n",
    "model.save_pretrained_merged(\"webscrape_merged_16bit\", tokenizer, save_method=\"merged_16bit\")\n",
    "print(\"Merged 16-bit model saved.\")\n",
    "\n",
    "import os\n",
    "\n",
    "# Baseline size: sum of the weight files directly inside the output directory.\n",
    "# `total_bytes` is reused by the compression-ratio prints in the later sections.\n",
    "total_bytes = sum(\n",
    "    os.path.getsize(os.path.join(\"webscrape_merged_16bit\", f))\n",
    "    for f in os.listdir(\"webscrape_merged_16bit\")\n",
    "    if f.endswith(('.safetensors', '.bin'))\n",
    ")\n",
    "print(f\"FP16 model size: {total_bytes / 1e9:.2f} GB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "!pip install turboquant-plus-vllm"
   ]
  },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import time\n", "from turboquant_vllm import load_tq3_model\n", "\n", "# TurboQuant 3-bit compression — uses Hadamard rotation + Lloyd-Max codebook\n", "# Zero calibration data. 
Compresses in seconds.\n",
    "t0 = time.time()\n",
    "tq_model, tq_tokenizer = load_tq3_model(\"webscrape_merged_16bit\")\n",
    "print(f\"TurboQuant compression took {time.time()-t0:.1f}s\")\n",
    "\n",
    "# Save compressed model\n",
    "tq_model.save_pretrained(\"webscrape_tq3\")\n",
    "tq_tokenizer.save_pretrained(\"webscrape_tq3\")\n",
    "\n",
    "compressed_bytes = sum(\n",
    "    os.path.getsize(os.path.join(\"webscrape_tq3\", f))\n",
    "    for f in os.listdir(\"webscrape_tq3\")\n",
    "    if f.endswith(('.safetensors', '.bin'))\n",
    ")\n",
    "print(f\"TQ3 model size: {compressed_bytes / 1e9:.2f} GB\")\n",
    "# The sum is 0 when no file matches the suffix filter; skip the ratio\n",
    "# instead of raising ZeroDivisionError before the upload below.\n",
    "if compressed_bytes:\n",
    "    print(f\"Compression: {total_bytes / compressed_bytes:.1f}×\")\n",
    "else:\n",
    "    print(\"Compression: n/a (no .safetensors/.bin files found in webscrape_tq3)\")\n",
    "\n",
    "# Push to Hub\n",
    "tq_model.push_to_hub(OUTPUT_MODEL + \"-TQ3\")\n",
    "tq_tokenizer.push_to_hub(OUTPUT_MODEL + \"-TQ3\")\n",
    "print(f\"✅ Compressed model: https://huggingface.co/{OUTPUT_MODEL}-TQ3\")"
   ]
  },
  { "cell_type": "markdown", "metadata": {}, "source": ["## 10. Alternative: HQQ Mixed-Precision (attn 4-bit, MLP 2-bit)\n", "\n", "HQQ (Half-Quadratic Quantization) — zero calibration, any architecture. 
Mixed-precision keeps attention at 4-bit (critical for quality) while aggressively compressing MLP layers to 2-bit."] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "!pip install hqq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig\n",
    "\n",
    "# Mixed precision: attention at 4-bit (quality-critical), MLP at 2-bit (bulk of params)\n",
    "q4 = {'nbits': 4, 'group_size': 64}\n",
    "q2 = {'nbits': 2, 'group_size': 16}\n",
    "\n",
    "hqq_config = HqqConfig(dynamic_config={\n",
    "    'self_attn.q_proj': q4, 'self_attn.k_proj': q4,\n",
    "    'self_attn.v_proj': q4, 'self_attn.o_proj': q4,\n",
    "    'mlp.gate_proj': q2, 'mlp.up_proj': q2, 'mlp.down_proj': q2,\n",
    "})\n",
    "\n",
    "# Quantization is applied while loading the merged 16-bit checkpoint saved in section 9.\n",
    "hqq_model = AutoModelForCausalLM.from_pretrained(\n",
    "    \"webscrape_merged_16bit\",\n",
    "    quantization_config=hqq_config,\n",
    "    device_map=\"auto\",\n",
    "    torch_dtype=torch.float16,\n",
    ")\n",
    "hqq_tokenizer = AutoTokenizer.from_pretrained(\"webscrape_merged_16bit\")\n",
    "\n",
    "# Save the tokenizer next to the weights so the directory loads on its own.\n",
    "hqq_model.save_pretrained(\"webscrape_hqq_mixed\")\n",
    "hqq_tokenizer.save_pretrained(\"webscrape_hqq_mixed\")\n",
    "print(\"✅ HQQ mixed-precision model saved\")\n",
    "\n",
    "hqq_bytes = sum(\n",
    "    os.path.getsize(os.path.join(\"webscrape_hqq_mixed\", f))\n",
    "    for f in os.listdir(\"webscrape_hqq_mixed\")\n",
    "    if f.endswith(('.safetensors', '.bin'))\n",
    ")\n",
    "# The sum is 0 when no file matches the suffix filter; avoid dividing by it.\n",
    "if hqq_bytes:\n",
    "    print(f\"HQQ mixed model size: {hqq_bytes / 1e9:.2f} GB ({total_bytes / hqq_bytes:.1f}× compression)\")\n",
    "else:\n",
    "    print(\"HQQ mixed model size: n/a (no .safetensors/.bin files found in webscrape_hqq_mixed)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": ["## 11. Also export GGUF (for llama.cpp / Ollama)"]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# GGUF export — multiple quantization levels\n",
    "# Optional and disabled by default: uncomment the line(s) you need.\n",
    "# model.save_pretrained_gguf(\"model_gguf\", tokenizer, quantization_method=\"q4_k_m\")\n",
    "# model.push_to_hub_gguf(OUTPUT_MODEL + \"-GGUF\", tokenizer, quantization_method=[\"q4_k_m\", \"q8_0\"])"
   ]
  }
 ]
}