Text Generation
English
web-scraping
html-extraction
agent
structured-data
qwen2.5
unsloth
lora
File size: 6,330 Bytes
9817039
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
"""
WebScrapeAgent — Standalone Training Script
=============================================
Run this on any machine with a 16GB+ GPU.

Usage:
    pip install unsloth trl peft transformers accelerate datasets bitsandbytes
    python train.py
    
    # Or with custom settings:
    python train.py --epochs 3 --lr 5e-5 --lora-r 64 --output my-org/my-model
"""

# CRITICAL: import unsloth FIRST
import unsloth

import os
import sys
import argparse
import torch
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template, train_on_responses_only


def parse_args():
    """Parse the training script's command-line options.

    Every option has a default, so running the script with no arguments is
    valid.

    Returns:
        argparse.Namespace with attributes ``model``, ``dataset``, ``output``,
        ``max_seq_len``, ``lora_r``, ``lora_alpha``, ``lr``, ``epochs``,
        ``batch_size``, ``grad_accum``, ``no_push`` and ``save_local``.
    """
    parser = argparse.ArgumentParser(description="Train WebScrapeAgent")

    # (flag, add_argument keyword options), registered in --help display order.
    option_specs = (
        ("--model", {"default": "unsloth/Qwen2.5-7B-Instruct-bnb-4bit", "help": "Base model"}),
        ("--dataset", {"default": "sukritvemula/webscrape-agent-training-data", "help": "Training dataset"}),
        ("--output", {"default": "sukritvemula/WebScrapeAgent-7B-v1", "help": "Output model name on Hub"}),
        ("--max-seq-len", {"type": int, "default": 4096, "help": "Max sequence length"}),
        ("--lora-r", {"type": int, "default": 32, "help": "LoRA rank"}),
        ("--lora-alpha", {"type": int, "default": 32, "help": "LoRA alpha"}),
        ("--lr", {"type": float, "default": 1e-4, "help": "Learning rate"}),
        ("--epochs", {"type": int, "default": 2, "help": "Number of epochs"}),
        ("--batch-size", {"type": int, "default": 1, "help": "Per-device batch size"}),
        ("--grad-accum", {"type": int, "default": 16, "help": "Gradient accumulation steps"}),
        ("--no-push", {"action": "store_true", "help": "Don't push to Hub"}),
        ("--save-local", {"default": "./webscrape-agent-local", "help": "Local save path"}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    return parser.parse_args()


def main():
    """Fine-tune the base model with LoRA, then save and optionally publish it.

    Steps, driven by the command-line options returned by ``parse_args()``:

    1. Load the base model and tokenizer in 4-bit via ``FastLanguageModel``.
    2. Attach LoRA adapters to the listed projection modules.
    3. Render each conversation in the dataset's ``messages`` column to a
       single text string with the ``qwen-2.5`` chat template, then drop
       examples whose token count exceeds ``--max-seq-len``.
    4. Build an ``SFTTrainer`` and pass it through ``train_on_responses_only``.
    5. Train, save the model and tokenizer to ``--save-local``, and, unless
       ``--no-push`` is given, push a merged 16-bit model plus the LoRA
       adapter (``<output>-lora``) to the Hub.

    Raises:
        SystemExit: if no training example survives the length filter.
    """
    args = parse_args()

    print("=" * 60)
    print("WebScrapeAgent — Training")
    print("=" * 60)
    print(f"  GPU: {torch.cuda.get_device_name() if torch.cuda.is_available() else 'CPU'}")
    if torch.cuda.is_available():
        # The device-properties attribute is `total_memory` (bytes);
        # `total_mem` does not exist and raised AttributeError here.
        print(f"  VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    # 1. Load model
    print(f"\n[1/5] Loading: {args.model}")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.model,
        max_seq_length=args.max_seq_len,
        dtype=None,
        load_in_4bit=True,
    )

    # 2. LoRA
    print(f"[2/5] Applying LoRA (r={args.lora_r}, alpha={args.lora_alpha})")
    model = FastLanguageModel.get_peft_model(
        model,
        r=args.lora_r,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_alpha=args.lora_alpha,
        lora_dropout=0.0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=42,
    )

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"  Trainable: {trainable:,} / {total:,} ({trainable/total*100:.2f}%)")

    # 3. Chat template + dataset
    tokenizer = get_chat_template(tokenizer, chat_template="qwen-2.5")

    print(f"[3/5] Loading dataset: {args.dataset}")
    dataset = load_dataset(args.dataset)
    train_ds = dataset["train"]

    def format_to_text(examples):
        """Render a batch of ``messages`` conversations into a ``text`` column."""
        texts = []
        for msgs in examples["messages"]:
            try:
                text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)
            except Exception:
                # Deliberate best-effort: if the template cannot render this
                # conversation, hand-build the same <|im_start|>/<|im_end|>
                # markup instead of dropping the example.
                text = "".join(
                    f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
                    for msg in msgs
                )
            texts.append(text)
        return {"text": texts}

    train_ds = train_ds.map(format_to_text, batched=True, num_proc=2, remove_columns=train_ds.column_names)

    def filter_length(example):
        """Keep an example only if its untruncated token count fits the limit."""
        tokens = tokenizer(example["text"], truncation=False)
        return len(tokens["input_ids"]) <= args.max_seq_len

    orig = len(train_ds)
    train_ds = train_ds.filter(filter_length, num_proc=2)
    kept = len(train_ds)
    # Guard the percentage against an empty source dataset.
    kept_pct = kept / orig * 100 if orig else 0.0
    print(f"  {kept} / {orig} examples ({kept_pct:.1f}% kept after length filter)")
    if kept == 0:
        # Stop with an actionable message instead of failing later in the trainer.
        raise SystemExit(
            f"No training examples left after the length filter "
            f"(--max-seq-len {args.max_seq_len}); nothing to train on."
        )

    # 4. Trainer
    print("[4/5] Setting up trainer")
    use_bf16 = is_bfloat16_supported()
    training_args = SFTConfig(
        output_dir="./webscrape-checkpoints",
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        gradient_accumulation_steps=args.grad_accum,
        optim="adamw_8bit",
        learning_rate=args.lr,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        max_grad_norm=0.3,
        # Exactly one mixed-precision mode is enabled: bf16 when supported, else fp16.
        fp16=not use_bf16,
        bf16=use_bf16,
        max_seq_length=args.max_seq_len,
        dataset_text_field="text",
        packing=False,
        logging_steps=10,
        logging_first_step=True,
        disable_tqdm=False,
        save_strategy="steps",
        save_steps=500,
        save_total_limit=2,
        push_to_hub=not args.no_push,
        hub_model_id=args.output if not args.no_push else None,
        hub_strategy="end",
        seed=42,
        dataset_num_proc=2,
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_ds,
        args=training_args,
    )
    # NOTE(review): presumably restricts the loss to assistant responses, relying on
    # the chat template applied above to mark the turns — confirm against Unsloth docs.
    trainer = train_on_responses_only(trainer)

    # 5. Train
    print("[5/5] Training...")
    print(f"  Effective batch: {args.batch_size * args.grad_accum}")
    print(f"  LR: {args.lr}, Epochs: {args.epochs}")

    stats = trainer.train()
    print(f"\n✅ Done! Loss: {stats.training_loss:.4f}")

    # Save
    model.save_pretrained(args.save_local)
    tokenizer.save_pretrained(args.save_local)
    print(f"  Saved locally: {args.save_local}")

    if not args.no_push:
        print(f"  Pushing merged model to Hub: {args.output}")
        model.push_to_hub_merged(args.output, tokenizer, save_method="merged_16bit")
        model.push_to_hub(args.output + "-lora", tokenizer)
        print(f"  ✅ https://huggingface.co/{args.output}")
        print(f"  ✅ https://huggingface.co/{args.output}-lora")

# Run training only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()