streaming helper, newest gradio
Browse files
- requirements.txt +3 -2
- utils/streaming.py +35 -0
requirements.txt
CHANGED
|
@@ -1,8 +1,9 @@
|
|
| 1 |
-
gradio
|
| 2 |
openai==1.61.1
|
| 3 |
langchain-openai==0.3.4
|
| 4 |
langchain-anthropic==0.3.7
|
| 5 |
langchain-deepseek==0.1.1
|
| 6 |
langchain-core==0.3.34
|
| 7 |
pydantic==2.9.2
|
| 8 |
-
typing_extensions==4.12.2
|
|
|
|
|
|
| 1 |
+
gradio==5.15.0
|
| 2 |
openai==1.61.1
|
| 3 |
langchain-openai==0.3.4
|
| 4 |
langchain-anthropic==0.3.7
|
| 5 |
langchain-deepseek==0.1.1
|
| 6 |
langchain-core==0.3.34
|
| 7 |
pydantic==2.9.2
|
| 8 |
+
typing_extensions==4.12.2
|
| 9 |
+
huggingface_hub==0.28.1
|
utils/streaming.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# utils/streaming.py
|
| 2 |
+
import os
|
| 3 |
+
import asyncio
|
| 4 |
+
from huggingface_hub import AsyncInferenceClient
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
async def stream_chat_completion(messages, model_name: str, max_tokens: int = 1024):
    """
    Stream tokens from a Hugging Face Inference endpoint.

    Args:
        messages (list[dict]): A list of message dictionaries, e.g.:
            [{"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": "Count to 10"}]
        model_name (str): The identifier for the model (used in the base_url).
        max_tokens (int): Maximum tokens to generate.

    Yields:
        str: Text fragments as they are generated. A chunk whose delta
            carries no content yields an empty string; a chunk with an
            empty ``choices`` list is skipped.
    """
    # Construct a base URL that points to the model's endpoint.
    base_url = f"https://api-inference.huggingface.co/models/{model_name}"
    # os.getenv returns None when HF_API_TOKEN is unset; the value is passed
    # through to the client unchanged.
    token = os.getenv("HF_API_TOKEN")
    client = AsyncInferenceClient(base_url=base_url, token=token)

    stream = await client.chat.completions.create(
        messages=messages,
        stream=True,
        max_tokens=max_tokens,
    )

    async for chunk in stream:
        # OpenAI-compatible streams may emit chunks whose `choices` list is
        # empty (for example a usage-only chunk). Indexing [0] on those would
        # raise IndexError and abort the stream, so skip them.
        if not chunk.choices:
            continue
        # The generated text is in chunk.choices[0].delta.content; it can be
        # None (e.g. role-only deltas), which is normalised to "".
        yield chunk.choices[0].delta.content or ""
|