Spaces:

ysingh-aiml
/

tinyllama-activation-vector-steering

Sleeping

App Files Files Community

tinyllama-activation-vector-steering / model_utils.py

ysingh-aiml

Fix repetition collapse in steered decoding.

b803c00 2 months ago

raw

history blame contribute delete

6.62 kB

	"""
	Model and tokenizer loading for the TinyLlama activation steering demo.

	Loads TinyLlama/TinyLlama-1.1B-Chat-v1.0 in float32 for CPU inference.
	If a PEFT LoRA adapter is present at ./adapters/r16, it is merged into the
	base model before returning. The app degrades gracefully if peft is absent
	or the adapter files are missing.
	"""
	import logging
	import time
	from pathlib import Path
	from typing import Tuple

	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer

	# Module-level logger named after this module; used by the loading code below.
	logger = logging.getLogger(__name__)

	# Hugging Face Hub id passed to both AutoTokenizer and AutoModelForCausalLM.
	BASE_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
	# Directory checked for a PEFT LoRA adapter (adapter_config.json), resolved
	# relative to this file rather than the current working directory.
	ADAPTER_DIR = Path(__file__).resolve().parent / "adapters" / "r16"
	# Default value of generate()'s max_new_tokens parameter.
	MAX_NEW_TOKENS = 100


	# ── PEFT availability ─────────────────────────────────────────────────────────

	def _peft_available() -> bool:
	    """
	    Report whether the optional ``peft`` package can be imported.

	    Returns
	    -------
	    bool
	        True when ``import peft`` succeeds, False when it raises ImportError.
	    """
	    try:
	        import peft  # noqa: F401
	    except ImportError:
	        importable = False
	    else:
	        importable = True
	    return importable


	# ── Model loading ─────────────────────────────────────────────────────────────

	def load_model_and_tokenizer(device: str = "cpu") -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
	    """
	    Load the tokenizer and base model, merging the r16 LoRA adapter when possible.

	    The tokenizer's pad token is set to its EOS token when it has none. The
	    adapter is applied only if ADAPTER_DIR contains an adapter_config.json
	    and ``peft`` is importable; in every other case the base model is used.

	    Parameters
	    ----------
	    device : str
	        Device the model is moved to before it is returned (default "cpu").

	    Returns
	    -------
	    (model, tokenizer)
	        The model is in eval mode and already on ``device``.

	    Notes
	    -----
	    Any exception raised while applying or merging the adapter is logged as
	    a warning and loading continues.
	    NOTE(review): if ``merge_and_unload()`` raises after
	    ``PeftModel.from_pretrained`` succeeded, the returned object is the PEFT
	    wrapper, not the untouched base model, even though the log message says
	    "falling back to base model".
	    """
	    logger.info("Loading tokenizer: %s", BASE_MODEL_ID)
	    tok = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
	    if tok.pad_token is None:
	        tok.pad_token = tok.eos_token

	    logger.info("Loading base model: %s (device=%s, dtype=float32)", BASE_MODEL_ID, device)
	    load_options = {
	        # `dtype` is the newer spelling of `torch_dtype` (per the original
	        # author's note; confirm against the installed transformers version).
	        "dtype": torch.float32,
	        "low_cpu_mem_usage": True,
	        # Eager attention: per the original author's note, needed for
	        # TinyLlama RoPE compatibility on CPU.
	        "attn_implementation": "eager",
	    }
	    net = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **load_options)

	    # The adapter counts as present only when both the directory and its
	    # adapter_config.json exist (PEFT LoRA converted from the MLX r16 checkpoint).
	    adapter_present = ADAPTER_DIR.exists() and (ADAPTER_DIR / "adapter_config.json").exists()
	    if not adapter_present:
	        logger.info("No adapter found at %s — using base model only", ADAPTER_DIR)
	    elif not _peft_available():
	        logger.info("peft not installed — using base model only")
	    else:
	        try:
	            from peft import PeftModel
	            logger.info("Applying PEFT LoRA adapter from %s", ADAPTER_DIR)
	            net = PeftModel.from_pretrained(net, str(ADAPTER_DIR))
	            net = net.merge_and_unload()
	            logger.info("LoRA adapter merged — model is now fine-tuned TinyLlama r16")
	        except Exception as exc:
	            # Deliberate best-effort: an unusable adapter must not stop the app.
	            logger.warning(
	                "LoRA adapter load failed (%s) — falling back to base model", exc
	            )

	    net = net.to(device)
	    net.eval()
	    return net, tok


	# ── Prompt formatting ─────────────────────────────────────────────────────────

	def format_prompt(tokenizer: AutoTokenizer, user_text: str) -> str:
	    """
	    Build the prompt string for a single user turn.

	    Uses the tokenizer's ``apply_chat_template`` when it is available and
	    succeeds; otherwise falls back to a manually constructed TinyLlama chat
	    template that includes a fixed system message.

	    Parameters
	    ----------
	    tokenizer : object that may expose ``apply_chat_template``.
	    user_text : raw user message inserted into the prompt.

	    Returns
	    -------
	    str
	        Untokenized prompt ending with the assistant generation marker.
	    """
	    apply_template = getattr(tokenizer, "apply_chat_template", None)
	    if callable(apply_template):
	        try:
	            return apply_template(
	                [{"role": "user", "content": user_text}],
	                tokenize=False,
	                add_generation_prompt=True,
	            )
	        except Exception as exc:
	            # Best-effort: a tokenizer without a usable chat template must not
	            # break prompting, but the reason is logged instead of swallowed.
	            logging.getLogger(__name__).warning(
	                "apply_chat_template failed (%s) — using manual TinyLlama template", exc
	            )
	    # Manual TinyLlama chat template. The role tags must be written without
	    # backslashes: "\|" is an invalid escape that would put a literal
	    # backslash into the prompt.
	    return (
	        "<|system|>\n"
	        "You are a helpful, respectful, and honest assistant.</s>\n"
	        f"<|user|>\n{user_text}</s>\n"
	        "<|assistant|>\n"
	    )


	# ── Generation ────────────────────────────────────────────────────────────────

	def generate(
	    model: AutoModelForCausalLM,
	    tokenizer: AutoTokenizer,
	    prompt: str,
	    device: str = "cpu",
	    max_new_tokens: int = MAX_NEW_TOKENS,
	    *,
	    repetition_penalty: float = 1.1,
	    no_repeat_ngram_size: int = 0,
	    do_sample: bool = False,
	    temperature: float = 1.0,
	    top_p: float = 1.0,
	) -> Tuple[str, float]:
	    """
	    Run ``model.generate`` on a formatted prompt and return the decoded
	    continuation together with the wall-clock time spent generating.

	    Parameters
	    ----------
	    model, tokenizer : loaded objects
	    prompt : fully-formatted prompt string
	    device : device the tokenized inputs are moved to
	    max_new_tokens : generation budget
	    repetition_penalty : always forwarded; >1 discourages repeating tokens.
	    no_repeat_ngram_size : forwarded (as int) only when >0, to block exact
	        n-gram repeats.
	    do_sample : True = temperature sampling; False = greedy.
	    temperature : sampling temperature (forwarded only when do_sample=True).
	    top_p : nucleus sampling cutoff (forwarded only when do_sample=True).

	    Returns
	    -------
	    (response, elapsed)
	        ``response`` is the newly generated tokens only, decoded without
	        special tokens and stripped; ``elapsed`` is seconds rounded to two
	        decimals.

	    Notes
	    -----
	    Steered decoding MUST use do_sample=True. Per the original author's
	    analysis, greedy decoding combined with a fixed-direction activation
	    injection forms a positive-feedback loop: the hook biases next-token
	    logits the same way every step, the greedy pick feeds those tokens back
	    into the context, and the hook fires again in the same direction,
	    producing repetition collapse. Temperature sampling breaks this loop.
	    """
	    encoded = tokenizer(prompt, return_tensors="pt").to(device)
	    prompt_len = encoded["input_ids"].shape[-1]

	    # Optional groups are empty dicts when inactive, so they add no keys.
	    sampling_opts = {"temperature": temperature, "top_p": top_p} if do_sample else {}
	    ngram_opts = (
	        {"no_repeat_ngram_size": int(no_repeat_ngram_size)}
	        if no_repeat_ngram_size > 0
	        else {}
	    )
	    generation_args: dict = {
	        "max_new_tokens": max_new_tokens,
	        "do_sample": do_sample,
	        "repetition_penalty": repetition_penalty,
	        "pad_token_id": tokenizer.eos_token_id,
	        **sampling_opts,
	        **ngram_opts,
	    }

	    started = time.perf_counter()
	    with torch.no_grad():
	        sequences = model.generate(**encoded, **generation_args)
	    elapsed = round(time.perf_counter() - started, 2)

	    # Drop the echoed prompt tokens; keep only what was newly generated.
	    continuation = sequences[0, prompt_len:]
	    text = tokenizer.decode(continuation, skip_special_tokens=True)
	    return text.strip(), elapsed