BtB-ExpC committed on
Commit
1fa1ef3
·
1 Parent(s): 4ed2934

streaming helper, newest gradio

Browse files
Files changed (2) hide show
  1. requirements.txt +3 -2
  2. utils/streaming.py +35 -0
requirements.txt CHANGED
@@ -1,8 +1,9 @@
1
- gradio>=3.42.0
2
  openai==1.61.1
3
  langchain-openai==0.3.4
4
  langchain-anthropic==0.3.7
5
  langchain-deepseek==0.1.1
6
  langchain-core==0.3.34
7
  pydantic==2.9.2
8
- typing_extensions==4.12.2
 
 
1
+ gradio==5.15.0
2
  openai==1.61.1
3
  langchain-openai==0.3.4
4
  langchain-anthropic==0.3.7
5
  langchain-deepseek==0.1.1
6
  langchain-core==0.3.34
7
  pydantic==2.9.2
8
+ typing_extensions==4.12.2
9
+ huggingface_hub==0.28.1
utils/streaming.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # utils/streaming.py
2
+ import os
3
+ import asyncio
4
+ from huggingface_hub import AsyncInferenceClient
5
+
6
+
7
async def stream_chat_completion(messages, model_name: str, max_tokens: int = 1024):
    """
    Stream tokens from a Hugging Face Inference endpoint.

    Args:
        messages (list[dict]): A list of message dictionaries, e.g.:
            [{"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": "Count to 10"}]
        model_name (str): The identifier for the model (used in the base_url).
        max_tokens (int): Maximum tokens to generate.

    Yields:
        str: Text fragments as they are generated. A chunk whose delta
            carries no content yields an empty string; a chunk that has
            no choices at all is skipped.
    """
    # Construct a base URL that points to the model's endpoint.
    base_url = f"https://api-inference.huggingface.co/models/{model_name}"
    # os.getenv returns None when HF_API_TOKEN is unset; that None is passed
    # straight through to the client.
    token = os.getenv("HF_API_TOKEN")
    client = AsyncInferenceClient(base_url=base_url, token=token)

    stream = await client.chat.completions.create(
        messages=messages,
        stream=True,
        max_tokens=max_tokens,
    )

    async for chunk in stream:
        # Guard against chunks that carry no choices (e.g. a usage-only or
        # keep-alive chunk); indexing [0] on them would raise IndexError and
        # abort the whole stream.
        if not chunk.choices:
            continue
        # The generated text for this chunk lives in choices[0].delta.content;
        # it may be None, in which case an empty string is yielded instead.
        yield chunk.choices[0].delta.content or ""