RAG2 / llm.py
antimoda1
fix
6ec75fd
raw
history blame contribute delete
454 Bytes
import os
from huggingface_hub import InferenceClient
# Hugging Face API token comes from the environment; a missing "forRAG"
# variable fails fast with a KeyError at import time.
token = os.environ["forRAG"]

# Module-level inference client shared by all requests, pointed at the
# Qwen2.5 7B instruct model.
client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=token)
def get_llm_answer(prompt):
    """Stream a model answer for *prompt*, yielding text fragments.

    Sends the prompt as a single user-role message through the shared
    ``client`` and yields each non-empty content delta as it arrives,
    so callers can render the answer incrementally.
    """
    response_stream = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=5000,
        stream=True,
    )
    for part in response_stream:
        piece = part.choices[0].delta.content
        # Skip empty/None deltas (e.g. role-only or final chunks).
        if piece:
            yield piece