from transformers import pipeline

from config import LLM_MODEL, HUGGINGFACE_API_KEY

# HuggingFace LLM (stable decoding for production).
# Built once at import time so the model loads a single time per process.
generator = pipeline(
    "text2text-generation",
    model=LLM_MODEL,
    token=HUGGINGFACE_API_KEY,
)


def refine_answer(prompt: str) -> str:
    """
    Generate a deterministic, repetition-free answer for *prompt*.

    Stable generation:
    - Prevents repetition loops (OPTIONS OPTIONS bug)
    - Deterministic output for SOP / RAG answers

    Args:
        prompt: Fully-formed prompt text to send to the LLM.

    Returns:
        The generated text with surrounding whitespace stripped.
    """
    # NOTE: do_sample=False selects greedy decoding, which is already
    # deterministic. Passing temperature=0.0 here (as the original did)
    # is invalid — transformers' GenerationConfig requires a strictly
    # positive temperature and ignores it under greedy decoding — so the
    # parameter is omitted entirely.
    response = generator(
        prompt,
        max_new_tokens=200,
        do_sample=False,          # greedy decoding: deterministic output
        repetition_penalty=1.2,   # penalize repetition
        no_repeat_ngram_size=3,   # block repeating phrases
    )[0]["generated_text"]
    return response.strip()