import os

from huggingface_hub import InferenceClient

# Hugging Face API token is read from the "forRAG" environment variable.
# NOTE(review): raises KeyError at import time if the variable is unset.
token = os.environ["forRAG"]

# Shared inference client (module-level) used by get_llm_answer below.
client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=token)
def get_llm_answer(prompt):
    """Yield the model's reply to *prompt* as incremental text fragments.

    Opens a streaming chat completion against the module-level ``client``
    and yields each non-empty content delta as it arrives, so callers can
    render the answer token-by-token.
    """
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=5000,
        stream=True,
    )
    for event in response:
        fragment = event.choices[0].delta.content
        # Streaming chunks may carry an empty/None delta (e.g. role-only
        # or final chunks); skip those rather than yielding empty strings.
        if fragment:
            yield fragment