import os

from huggingface_hub import InferenceClient

# Hugging Face API token is read from the "forRAG" environment variable.
# NOTE(review): raises KeyError at import time if the variable is unset.
token = os.environ["forRAG"]

# Shared inference client (module-level) used by get_llm_answer below.
client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=token)
def get_llm_answer(prompt):
    """Yield the model's reply to *prompt* as incremental text fragments.

    Opens a streaming chat completion against the module-level ``client``
    and yields each non-empty content delta as it arrives, so callers can
    render the answer token-by-token.
    """
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=5000,
        stream=True,
    )
    for event in response:
        fragment = event.choices[0].delta.content
        # Streaming chunks may carry an empty/None delta (e.g. role-only
        # or final chunks); skip those rather than yielding empty strings.
        if fragment:
            yield fragment