import os

from huggingface_hub import InferenceClient

# Read the Hugging Face access token from the environment variable "forRAG".
token = os.environ["forRAG"]

client = InferenceClient(
    model="Qwen/Qwen2.5-7B-Instruct",
    token=token,
)

def get_llm_answer(prompt):
    """Stream the model's answer, yielding text fragments as they arrive."""
    stream = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=5000,
        stream=True,
    )
    for chunk in stream:
        # Each streamed chunk carries an incremental delta; skip empty ones.
        delta = chunk.choices[0].delta.content
        if delta:
            yield delta
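
# A minimal usage sketch (an assumption, not part of the original script):
# consume the generator and print the answer as it streams in. The prompt
# text here is purely illustrative.
if __name__ == "__main__":
    for piece in get_llm_answer("Explain retrieval-augmented generation briefly."):
        print(piece, end="", flush=True)
    print()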