Spaces:

HemanM
/

evo-gov-copilot-mu

Sleeping

App Files Files Community

evo-gov-copilot-mu / evo_plugin_example.py

HemanM

Update evo_plugin_example.py

1794bf5 verified 7 months ago

raw

history blame contribute delete

1.96 kB

	# evo_plugin_example.py — FLAN-T5 stand-in (truncation + clean kwargs)
	import torch
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

	class _HFSeq2SeqGenerator:
	def __init__(self, model_name: str = "google/flan-t5-small"):
	self.device = torch.device("cpu")
	self.tok = AutoTokenizer.from_pretrained(model_name)
	self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device).eval()
	# FLAN-T5 encoder max length
	ml = getattr(self.tok, "model_max_length", 512) or 512
	# Some tokenizers report a huge sentinel value; clamp to 512 for T5-small
	self.max_src_len = min(512, int(ml if ml < 10000 else 512))

	@torch.no_grad()
	def generate(self, prompt: str, max_new_tokens: int = 200, temperature: float = 0.0) -> str:
	# TRUNCATE input to model's max encoder length
	inputs = self.tok(
	prompt,
	return_tensors="pt",
	truncation=True,
	max_length=self.max_src_len,
	).to(self.device)

	do_sample = float(temperature) > 0.0

	gen_kwargs = dict(
	max_new_tokens=int(max_new_tokens),
	num_beams=4, # stable, less echo
	early_stopping=True,
	no_repeat_ngram_size=3,
	repetition_penalty=1.1,
	length_penalty=0.1,
	)
	# Only include sampling args when sampling is ON (silences warnings)
	if do_sample:
	gen_kwargs.update(
	do_sample=True,
	temperature=float(max(0.01, temperature)),
	top_p=0.9,
	)

	# Encourage non-trivial length without tying to input length
	gen_kwargs["min_new_tokens"] = max(48, int(0.4 * max_new_tokens))

	out = self.model.generate(inputs, gen_kwargs)
	return self.tok.decode(out[0], skip_special_tokens=True).strip()

	def load_model():
	    """Build and return a generator using the class's default model name."""
	    generator = _HFSeq2SeqGenerator()
	    return generator