|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
Merge Stage 4 (Unified) adapter into base model. |
|
|
|
|
|
Stage 4 is trained on ALL tasks, so it can handle: |
|
|
- Point localization |
|
|
- Bounding box detection |
|
|
- Classification |
|
|
- Free-form queries |
|
|
|
|
|
Run with: hf jobs uv run --flavor a10g-large --secrets HF_TOKEN merge_stage4_adapter.py |
|
|
""" |
|
|
|
|
|
import os |
|
|
import torch |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Hub repo holding the Stage 4 (Unified) LoRA adapter (weights live in the
# "stage4" subfolder — see the PeftModel.from_pretrained call below).
UNIFIED_MODEL = "mmrech/pitvqa-qwen2vl-unified-v2"
# Base model the adapter was trained on top of.
BASE_MODEL = "Qwen/Qwen2-VL-2B-Instruct"
# Destination repo for the merged, adapter-free model.
OUTPUT_REPO = "mmrech/pitvqa-qwen2vl-merged"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from huggingface_hub import login, HfApi

# Authenticate with the Hugging Face Hub using the HF_TOKEN secret injected
# by the job runner (see the run command in the module docstring).
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
    print("โ Logged in to HuggingFace")
else:
    # Warn loudly up front: without a token the upload step at the end of
    # this script will hit an authentication error anyway.
    print("WARNING: HF_TOKEN is not set - the final upload to the Hub will fail")

# Client used later for create_repo / upload_folder.
api = HfApi()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print("\n๐ค Loading base model...")

from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from peft import PeftModel

# bf16 weights halve memory vs fp32; device_map="auto" lets accelerate place
# the model on the available GPU(s); trust_remote_code allows any custom code
# shipped with the Hub repo to load.
load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
base = Qwen2VLForConditionalGeneration.from_pretrained(BASE_MODEL, **load_kwargs)
print("โ Base model loaded")

# The processor bundles the tokenizer and the image preprocessor.
processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
print("โ Processor loaded")
|
|
|
|
|
|
|
|
print("\n๐ฆ Loading Stage 4 (Unified) adapter...")

# Attach the Stage 4 LoRA weights (stored under the "stage4" subfolder of the
# adapter repo) on top of the freshly loaded base model.
peft_model = PeftModel.from_pretrained(
    base, UNIFIED_MODEL, subfolder="stage4", adapter_name="stage4"
)
print("โ Stage 4 adapter loaded")

print("\n๐ Merging adapter...")

# Fold the LoRA deltas into the base weights and drop the PEFT wrapper,
# leaving a plain model that needs no adapter switching at inference time.
merged_model = peft_model.merge_and_unload()
print("โ Adapter merged")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print("\n๐พ Saving merged model...")

# Save the merged weights and the processor/tokenizer files into the same
# directory so it forms a complete, self-contained checkpoint.
output_dir = Path("./pitvqa-merged")
output_dir.mkdir(exist_ok=True)

for artifact in (merged_model, processor):
    artifact.save_pretrained(output_dir)
print("โ Saved to " + str(output_dir))
|
|
|
|
|
|
|
|
# Model card written to README.md alongside the checkpoint; the leading
# `---` block is the Hub's YAML metadata header (license, base model, tags).
model_card = """---
license: apache-2.0
base_model: Qwen/Qwen2-VL-2B-Instruct
tags:
- medical
- vision-language
- surgical-ai
- pituitary-surgery
- qwen2-vl
- merged-adapter
---

# PitVQA Merged Model

A **merged** version of the PitVQA unified model for pituitary surgery understanding.

## Model Description

This model merges the Stage 4 (Unified) LoRA adapter into the Qwen2-VL-2B base model.
It can handle ALL tasks without adapter switching:

- **Point Localization**: `<point x='45.2' y='68.3'>suction device</point>`
- **Bounding Box**: `<box x1='20' y1='30' x2='60' y2='70'>tumor region</box>`
- **Classification**: Surgical phase identification
- **Free-form queries**: Any question about the surgical scene

## Usage

```python
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import torch

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "mmrech/pitvqa-qwen2vl-merged",
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
processor = AutoProcessor.from_pretrained("mmrech/pitvqa-qwen2vl-merged")

# No adapter switching needed - just inference
messages = [{"role": "user", "content": [
    {"type": "image", "image": your_image},
    {"type": "text", "text": "Point to the suction device"}
]}]

text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[text], images=[your_image], return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=128)
print(processor.decode(output[0], skip_special_tokens=True))
```

## Source

- Base: `Qwen/Qwen2-VL-2B-Instruct`
- Adapter source: `mmrech/pitvqa-qwen2vl-unified-v2` (Stage 4)
- Training dataset: `mmrech/pitvqa-comprehensive-spatial`
"""
|
|
|
|
|
# Write the card with an explicit UTF-8 encoding: `open()`'s default text
# encoding is platform-dependent (e.g. cp1252 on Windows) and could corrupt
# any non-ASCII characters in the card. Path.write_text also handles
# open/close in one call.
(output_dir / "README.md").write_text(model_card, encoding="utf-8")
print("โ Created README.md")
|
|
|
|
|
|
|
|
print(f"\n๐ค Uploading to {OUTPUT_REPO}...")

# Push the merged checkpoint to the Hub. A failed upload is reported rather
# than raised so the script still reaches the local verification step below.
try:
    api.create_repo(OUTPUT_REPO, exist_ok=True)
    api.upload_folder(
        repo_type="model",
        repo_id=OUTPUT_REPO,
        folder_path=str(output_dir),
    )
except Exception as err:
    print(f"โ Upload error: {err}")
else:
    print(f"โ Uploaded to https://huggingface.co/{OUTPUT_REPO}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print("\n๐งช Verifying merged model...")

from PIL import Image
import numpy as np

# Smoke-test the merged weights on a random RGB image; the content of the
# reply does not matter, only that generation runs end to end.
test_image = Image.fromarray(np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8))

messages = [{"role": "user", "content": [
    {"type": "image", "image": test_image},
    {"type": "text", "text": "What do you see in this image?"}
]}]

text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[text], images=[test_image], return_tensors="pt").to(merged_model.device)

# Greedy decoding inside no_grad keeps the check deterministic and cheap.
with torch.no_grad():
    output = merged_model.generate(**inputs, max_new_tokens=50, do_sample=False)

response = processor.decode(output[0], skip_special_tokens=True)
print(f"Test response: {response[:200]}...")

# NOTE(review): the original source had this banner split across two physical
# lines inside one string literal (an unterminated-string syntax error);
# rejoined into a single print here.
print("\nโ DONE! Merged model available at:")
print(f"   https://huggingface.co/{OUTPUT_REPO}")
|
|
|