Text Generation
English
web-scraping
html-extraction
agent
structured-data
qwen2.5
unsloth
lora
sukritvemula's picture
Upload train.py with huggingface_hub
9817039 verified
"""
WebScrapeAgent — Standalone Training Script
=============================================
Run this on any machine with a 16GB+ GPU.
Usage:
pip install unsloth trl peft transformers accelerate datasets bitsandbytes
python train.py
# Or with custom settings:
python train.py --epochs 3 --lr 5e-5 --lora-r 64 --output my-org/my-model
"""
# CRITICAL: import unsloth FIRST, before trl/transformers/peft, so its
# patches and optimizations are applied to those libraries.
import unsloth
import os
import sys
import argparse
import torch
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template, train_on_responses_only
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options for a training run.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            the behavior of the original zero-argument call. Passing an
            explicit list lets the parser be driven programmatically.

    Returns:
        An ``argparse.Namespace`` with the attributes ``model``, ``dataset``,
        ``output``, ``max_seq_len``, ``lora_r``, ``lora_alpha``, ``lr``,
        ``epochs``, ``batch_size``, ``grad_accum``, ``no_push`` and
        ``save_local``.
    """
    p = argparse.ArgumentParser(description="Train WebScrapeAgent")

    # Model / data / destination.
    p.add_argument("--model", default="unsloth/Qwen2.5-7B-Instruct-bnb-4bit", help="Base model")
    p.add_argument("--dataset", default="sukritvemula/webscrape-agent-training-data", help="Training dataset")
    p.add_argument("--output", default="sukritvemula/WebScrapeAgent-7B-v1", help="Output model name on Hub")

    # Sequence length and LoRA shape.
    p.add_argument("--max-seq-len", type=int, default=4096, help="Max sequence length")
    p.add_argument("--lora-r", type=int, default=32, help="LoRA rank")
    p.add_argument("--lora-alpha", type=int, default=32, help="LoRA alpha")

    # Optimization schedule.
    p.add_argument("--lr", type=float, default=1e-4, help="Learning rate")
    p.add_argument("--epochs", type=int, default=2, help="Number of epochs")
    p.add_argument("--batch-size", type=int, default=1, help="Per-device batch size")
    p.add_argument("--grad-accum", type=int, default=16, help="Gradient accumulation steps")

    # Output handling.
    p.add_argument("--no-push", action="store_true", help="Don't push to Hub")
    p.add_argument("--save-local", default="./webscrape-agent-local", help="Local save path")

    return p.parse_args(argv)
def main():
    """Run the full fine-tuning pipeline described by the CLI arguments.

    Steps, in order: load the base model in 4-bit, attach LoRA adapters,
    render the dataset's ``messages`` column to chat-formatted text, drop
    examples longer than ``--max-seq-len`` tokens, train with ``SFTTrainer``,
    save the result locally and, unless ``--no-push`` is given, push both a
    merged 16-bit model and the LoRA adapter to the Hub.

    Raises:
        ValueError: If the training split is empty, or if no example
            survives the length filter.
    """
    args = parse_args()

    print("=" * 60)
    print("WebScrapeAgent — Training")
    print("=" * 60)
    print(f" GPU: {torch.cuda.get_device_name() if torch.cuda.is_available() else 'CPU'}")
    if torch.cuda.is_available():
        # The device-properties attribute is `total_memory` (bytes);
        # `total_mem` does not exist and raised AttributeError here.
        print(f" VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    # 1. Load model
    print(f"\n[1/5] Loading: {args.model}")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.model,
        max_seq_length=args.max_seq_len,
        dtype=None,
        load_in_4bit=True,
    )

    # 2. LoRA
    print(f"[2/5] Applying LoRA (r={args.lora_r}, alpha={args.lora_alpha})")
    model = FastLanguageModel.get_peft_model(
        model,
        r=args.lora_r,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_alpha=args.lora_alpha,
        lora_dropout=0.0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=42,
    )
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f" Trainable: {trainable:,} / {total:,} ({trainable/total*100:.2f}%)")

    # 3. Chat template + dataset
    tokenizer = get_chat_template(tokenizer, chat_template="qwen-2.5")
    print(f"[3/5] Loading dataset: {args.dataset}")
    dataset = load_dataset(args.dataset)
    train_ds = dataset["train"]
    if len(train_ds) == 0:
        # Fail early with a clear message instead of a ZeroDivisionError
        # in the statistics line below.
        raise ValueError(f"Dataset {args.dataset!r} has an empty 'train' split")

    def format_to_text(examples):
        """Render each conversation in a batch to one chat-formatted string."""
        texts = []
        for msgs in examples["messages"]:
            try:
                text = tokenizer.apply_chat_template(
                    msgs, tokenize=False, add_generation_prompt=False
                )
            except Exception:
                # Deliberate best-effort: if the template rejects this
                # conversation, fall back to hand-written ChatML framing
                # rather than dropping the example.
                text = "".join(
                    f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
                    for msg in msgs
                )
            texts.append(text)
        return {"text": texts}

    train_ds = train_ds.map(
        format_to_text,
        batched=True,
        num_proc=2,
        remove_columns=train_ds.column_names,
    )

    def filter_length(example):
        """Keep an example only if its full token count fits the limit."""
        tokens = tokenizer(example["text"], truncation=False)
        return len(tokens["input_ids"]) <= args.max_seq_len

    orig = len(train_ds)
    train_ds = train_ds.filter(filter_length, num_proc=2)
    kept = len(train_ds)
    print(f" {kept} / {orig} examples ({kept/orig*100:.1f}% kept after length filter)")
    if kept == 0:
        raise ValueError(
            f"No examples fit within --max-seq-len={args.max_seq_len}; "
            "nothing to train on"
        )

    # 4. Trainer
    print("[4/5] Setting up trainer")
    use_bf16 = is_bfloat16_supported()  # query once; fp16 is the fallback
    training_args = SFTConfig(
        output_dir="./webscrape-checkpoints",
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        gradient_accumulation_steps=args.grad_accum,
        optim="adamw_8bit",
        learning_rate=args.lr,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        max_grad_norm=0.3,
        fp16=not use_bf16,
        bf16=use_bf16,
        max_seq_length=args.max_seq_len,
        dataset_text_field="text",
        packing=False,
        logging_steps=10,
        logging_first_step=True,
        disable_tqdm=False,
        save_strategy="steps",
        save_steps=500,
        save_total_limit=2,
        push_to_hub=not args.no_push,
        hub_model_id=args.output if not args.no_push else None,
        hub_strategy="end",
        seed=42,
        dataset_num_proc=2,
    )
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_ds,
        args=training_args,
    )
    # Restrict the loss to assistant turns. The markers are the ChatML turn
    # headers, the same framing the fallback in format_to_text writes.
    # NOTE(review): Unsloth appears to require explicit markers unless the
    # tokenizer already carries them — confirm against the installed version.
    trainer = train_on_responses_only(
        trainer,
        instruction_part="<|im_start|>user\n",
        response_part="<|im_start|>assistant\n",
    )

    # 5. Train
    print("[5/5] Training...")
    print(f" Effective batch: {args.batch_size * args.grad_accum}")
    print(f" LR: {args.lr}, Epochs: {args.epochs}")
    stats = trainer.train()
    print(f"\n✅ Done! Loss: {stats.training_loss:.4f}")

    # Save
    model.save_pretrained(args.save_local)
    tokenizer.save_pretrained(args.save_local)
    print(f" Saved locally: {args.save_local}")

    if not args.no_push:
        print(f" Pushing merged model to Hub: {args.output}")
        model.push_to_hub_merged(args.output, tokenizer, save_method="merged_16bit")
        # The adapter goes to a sibling repo with a "-lora" suffix.
        model.push_to_hub(args.output + "-lora", tokenizer)
        print(f" ✅ https://huggingface.co/{args.output}")
        print(f" ✅ https://huggingface.co/{args.output}-lora")
# Script entry point: start training only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()