# Question-generation ("interviewer") model: one blocking helper and one
# streaming helper built on Hugging Face Transformers.
import threading

import torch
from transformers import AutoModelForSeq2SeqLM, T5Tokenizer, TextIteratorStreamer

QG_MODEL = "f3nsmart/ft-flan-t5-base-qgen_v2"

# Load the tokenizer and fine-tuned question-generation model once at import
# time, move the model to the GPU when one is available, and switch to eval
# mode since this module only runs inference.
tokenizer = T5Tokenizer.from_pretrained(QG_MODEL, use_fast=False)
model = AutoModelForSeq2SeqLM.from_pretrained(QG_MODEL)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

print(f"✅ Loaded interviewer model (streaming ready): {QG_MODEL}")


def generate_question(prompt: str = "Generate one thoughtful question.") -> str:
    """Generate a complete question in one blocking call."""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=80)
    return tokenizer.decode(output[0], skip_special_tokens=True)
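
# Illustrative call (not part of the original module):
#   generate_question("Generate one thoughtful question about teamwork.")
# returns a single decoded question string.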


def stream_question(prompt: str = "Generate one thoughtful question."):
    """Yield the question incrementally as new tokens are decoded."""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=80,
        # Sampling settings chosen for varied, non-repetitive questions.
        do_sample=True,
        top_p=0.9,
        temperature=1.1,
        top_k=60,
        repetition_penalty=1.3,
    )

    # model.generate() blocks until generation finishes, so run it in a
    # background thread; the streamer hands decoded text fragments back to
    # this thread as they become available.
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial
    thread.join()
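

# Minimal usage sketch (an illustrative assumption, not part of the original
# module). stream_question() yields the cumulative text so far, so the caller
# prints only the newly added suffix on each iteration.
if __name__ == "__main__":
    print(generate_question())

    shown = 0
    for partial_text in stream_question():
        print(partial_text[shown:], end="", flush=True)
        shown = len(partial_text)
    print()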