Update app.py
Browse files
app.py
CHANGED
|
@@ -3,6 +3,7 @@ import requests
|
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
import datetime
|
|
|
|
| 6 |
|
| 7 |
# Constants
|
| 8 |
SPACE_URL = "https://z7svds7k42bwhhgm.us-east-1.aws.endpoints.huggingface.cloud"
|
|
@@ -11,7 +12,7 @@ EOS_TOKEN = "<|end|>"
|
|
| 11 |
CHAT_HISTORY_DIR = "chat_histories"
|
| 12 |
IMAGE_PATH = "DubsChat.png"
|
| 13 |
IMAGE_PATH_2 = "Reboot AI.png"
|
| 14 |
-
|
| 15 |
|
| 16 |
# Ensure the directory exists
|
| 17 |
try:
|
|
@@ -109,56 +110,54 @@ for message in st.session_state["messages"]:
|
|
| 109 |
if message["role"] == "user":
|
| 110 |
st.chat_message("user").write(message["content"])
|
| 111 |
elif message["role"] == "assistant":
|
| 112 |
-
st.chat_message("assistant", avatar=
|
| 113 |
|
| 114 |
# -------------------------
|
| 115 |
-
# Streaming Logic
|
| 116 |
# -------------------------
|
| 117 |
def stream_response(prompt_text, api_key):
|
| 118 |
"""
|
| 119 |
-
Stream text from the HF Inference Endpoint
|
| 120 |
-
Yields each chunk of text as it arrives.
|
| 121 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
try:
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
# POST request with stream=True to get partial chunks
|
| 139 |
-
response = requests.post(
|
| 140 |
-
SPACE_URL,
|
| 141 |
-
json=payload,
|
| 142 |
-
headers=headers,
|
| 143 |
-
stream=True
|
| 144 |
-
)
|
| 145 |
-
response.raise_for_status()
|
| 146 |
-
|
| 147 |
-
# The endpoint presumably returns lines of JSON. Adjust parsing if needed:
|
| 148 |
-
for line in response.iter_lines():
|
| 149 |
-
if line:
|
| 150 |
-
data = json.loads(line.decode("utf-8"))
|
| 151 |
-
# Example: data might be [{"generated_text": "..."}]
|
| 152 |
-
# Adjust if your endpoint returns different JSON keys
|
| 153 |
-
chunk = data[0].get("generated_text", "")
|
| 154 |
-
yield chunk
|
| 155 |
-
|
| 156 |
-
except requests.exceptions.Timeout:
|
| 157 |
-
yield "The request timed out. Please try again later."
|
| 158 |
-
except requests.exceptions.RequestException as e:
|
| 159 |
yield f"Error: {e}"
|
| 160 |
-
except json.JSONDecodeError:
|
| 161 |
-
yield "Error decoding server response."
|
| 162 |
|
| 163 |
# -------------------------
|
| 164 |
# User Input
|
|
@@ -172,18 +171,19 @@ if prompt := st.chat_input():
|
|
| 172 |
st.chat_message("user").write(prompt)
|
| 173 |
|
| 174 |
# 2) Build combined chat history for the model prompt
|
|
|
|
| 175 |
chat_history = "".join(
|
| 176 |
[f"<|{msg['role']}|>{msg['content']}<|end|>" for msg in st.session_state["messages"]]
|
| 177 |
)
|
| 178 |
|
| 179 |
# 3) Create a placeholder for the assistant’s streamed response
|
| 180 |
with st.spinner("Dubs is thinking... Woof Woof! 🐾"):
|
| 181 |
-
assistant_message_placeholder = st.chat_message("assistant", avatar=
|
| 182 |
|
| 183 |
full_response = ""
|
| 184 |
-
# 4) Stream chunks from the
|
| 185 |
-
for chunk in stream_response(chat_history,
|
| 186 |
-
full_response
|
| 187 |
# Continuously update the placeholder with the partial response
|
| 188 |
assistant_message_placeholder.write(full_response)
|
| 189 |
|
|
|
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
import datetime
|
| 6 |
+
from huggingface_hub import InferenceClient # Make sure to install huggingface_hub first
|
| 7 |
|
| 8 |
# Constants
|
| 9 |
SPACE_URL = "https://z7svds7k42bwhhgm.us-east-1.aws.endpoints.huggingface.cloud"
|
|
|
|
| 12 |
CHAT_HISTORY_DIR = "chat_histories"
|
| 13 |
IMAGE_PATH = "DubsChat.png"
|
| 14 |
IMAGE_PATH_2 = "Reboot AI.png"
|
| 15 |
+
DUBS_PATH = "Dubs.png"
|
| 16 |
|
| 17 |
# Ensure the directory exists
|
| 18 |
try:
|
|
|
|
| 110 |
if message["role"] == "user":
|
| 111 |
st.chat_message("user").write(message["content"])
|
| 112 |
elif message["role"] == "assistant":
|
| 113 |
+
st.chat_message("assistant", avatar=DUBS_PATH).write(message["content"])
|
| 114 |
|
| 115 |
# -------------------------
|
| 116 |
+
# Streaming Logic using InferenceClient
|
| 117 |
# -------------------------
|
| 118 |
def stream_response(prompt_text, api_key):
    """
    Stream text from the HF Inference Endpoint using the InferenceClient.

    Sends ``prompt_text`` to the endpoint at ``SPACE_URL`` and yields the
    accumulated response text every time a new non-special token arrives,
    so a caller can simply overwrite its display with the latest value.

    Args:
        prompt_text: Fully formatted prompt passed to ``text_generation``.
        api_key: Token handed to ``InferenceClient`` for authentication.

    Yields:
        The text generated so far (cumulative, not just the newest token).
        If creating the client, opening the stream, or reading from it
        fails, an ``"Error: ..."`` string is yielded instead of raising.
    """
    # Generation parameters forwarded verbatim to text_generation().
    gen_kwargs = dict(
        max_new_tokens=512,
        top_k=30,
        top_p=0.9,
        temperature=0.2,
        repetition_penalty=1.02,
        # NOTE(review): the caller in this file builds the prompt with
        # "<|end|>" turn markers -- confirm "<|endoftext|>" is the stop
        # sequence this model actually emits.
        stop_sequences=["<|endoftext|>"],
    )

    # Response text accumulated across tokens.
    partial_text = ""

    try:
        # InferenceClient takes the endpoint URL through its `model`
        # argument; it has no `endpoint_url` keyword. Building the client
        # and opening the stream inside the try block means connection and
        # authentication failures are reported through the same
        # "Error: ..." path as mid-stream failures.
        client = InferenceClient(model=SPACE_URL, token=api_key)
        stream = client.text_generation(
            prompt_text,
            stream=True,
            details=True,  # needed so each item exposes `.token`
            **gen_kwargs,
        )

        for response in stream:
            token = response.token
            # Skip special tokens
            if token.special:
                continue
            # Stop as soon as a stop sequence shows up as a token
            if token.text in gen_kwargs["stop_sequences"]:
                break

            partial_text += token.text

            # Yield the text so far so the frontend can re-render it
            yield partial_text
    except Exception as e:
        # UI boundary: surface the failure as chat text rather than crash.
        yield f"Error: {e}"
|
|
|
|
|
|
|
| 161 |
|
| 162 |
# -------------------------
|
| 163 |
# User Input
|
|
|
|
| 171 |
st.chat_message("user").write(prompt)
|
| 172 |
|
| 173 |
# 2) Build combined chat history for the model prompt
|
| 174 |
+
# This format is just an example; adjust as needed for your model
|
| 175 |
chat_history = "".join(
|
| 176 |
[f"<|{msg['role']}|>{msg['content']}<|end|>" for msg in st.session_state["messages"]]
|
| 177 |
)
|
| 178 |
|
| 179 |
# 3) Create a placeholder for the assistant’s streamed response
|
| 180 |
with st.spinner("Dubs is thinking... Woof Woof! 🐾"):
|
| 181 |
+
assistant_message_placeholder = st.chat_message("assistant", avatar=DUBS_PATH).empty()
|
| 182 |
|
| 183 |
full_response = ""
|
| 184 |
+
# 4) Stream chunks from the Hugging Face InferenceClient
|
| 185 |
+
for chunk in stream_response(chat_history, dubs_key):
|
| 186 |
+
full_response = chunk # each chunk is the incremental text so far
|
| 187 |
# Continuously update the placeholder with the partial response
|
| 188 |
assistant_message_placeholder.write(full_response)
|
| 189 |
|