Spaces:

krislette
/

bach-or-bot

Sleeping

App Files Files Community

bach-or-bot / src /llm2vectrain /model.py

krislette

Auto-deploy from GitHub: 5ac21603a8274a2350875ec7db1bd58cbf2ee539

75d43d2 2 months ago

raw

history blame contribute delete

2 kB

	from llm2vec import LLM2Vec
	from transformers import AutoTokenizer, AutoModel, AutoConfig
	from peft import PeftModel
	from src.llm2vectrain.config import access_token
	import torch
	from torchao.quantization import quantize_, Int8WeightOnlyConfig
	import os


	def load_llm2vec_model() -> "LLM2Vec":
	    """Load the LLM2Vec Sheared-LLaMA MNTP encoder wrapped in ``LLM2Vec``.

	    The Hugging Face cache directory is read from the ``TRANSFORMERS_CACHE``
	    environment variable and defaults to ``/app/.cache/huggingface``.

	    Device handling:
	        * CUDA available: weights are loaded in bfloat16 on ``"cuda"``.
	        * Otherwise: weights are loaded in float32 on ``"cpu"`` and then
	          quantized in place with torchao's int8 weight-only config. If
	          torchao cannot be imported, warnings are printed and the
	          non-quantized float32 model is used.

	    Returns:
	        An ``LLM2Vec`` instance using mean pooling and ``max_length=512``.

	    Note:
	        The torchao names are imported inside the guarded branch so the
	        fallback does not rely on module-level names. The fallback can only
	        take effect if this module's own top-level torchao import does not
	        fail first.
	    """
	    cache_dir = os.getenv("TRANSFORMERS_CACHE", "/app/.cache/huggingface")
	    model_id = "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp"

	    # Hub options shared by every from_pretrained call, so the tokenizer,
	    # config and weights are fetched with the same credentials and cache.
	    hub_kwargs = {"token": access_token, "cache_dir": cache_dir}

	    # NOTE(review): padding/truncation/max_length are normally call-time
	    # tokenizer arguments; confirm they have any effect when passed here.
	    tokenizer = AutoTokenizer.from_pretrained(
	        model_id, padding=True, truncation=True, max_length=512, **hub_kwargs
	    )

	    config = AutoConfig.from_pretrained(
	        model_id, trust_remote_code=True, **hub_kwargs
	    )

	    # GPU path uses bf16 for speed; CPU path loads fp32 first because the
	    # quantization step below requires fp32 weights.
	    use_cuda = torch.cuda.is_available()
	    model = AutoModel.from_pretrained(
	        model_id,
	        trust_remote_code=True,
	        config=config,
	        torch_dtype=torch.bfloat16 if use_cuda else torch.float32,
	        device_map="cuda" if use_cuda else "cpu",
	        **hub_kwargs,
	    )

	    if not use_cuda:
	        try:
	            # Import both torchao names here so the ImportError fallback
	            # covers everything this branch needs.
	            from torchao.quantization import Int8WeightOnlyConfig, quantize_

	            print("[INFO] Applying torchao quantization for CPU...")
	            quant_config = Int8WeightOnlyConfig(group_size=None)
	            print("[INFO] Applying torchao quantization with Int8WeightOnlyConfig...")
	            quantize_(model, quant_config)  # mutates `model` in place
	        except ImportError:
	            print("[WARNING] torchao not installed. Run: pip install torchao")
	            print("[WARNING] Falling back to non-quantized CPU model.")

	    return LLM2Vec(model, tokenizer, pooling_mode="mean", max_length=512)