Spaces:
Build error
Build error
kwabs22 committed on
Commit ·
7e3fb58
1
Parent(s): ee032a8
after bufsize=1 change, exploring word or token level stream
Browse files
app.py
CHANGED
|
@@ -3,6 +3,7 @@ import random
|
|
| 3 |
import subprocess
|
| 4 |
import time
|
| 5 |
|
|
|
|
| 6 |
def generate_response(user_message): #Figure Out the parameters later and find a way to get the ram usage
|
| 7 |
cmd = [
|
| 8 |
"/app/llama.cpp/main", # Path to the executable
|
|
@@ -32,9 +33,9 @@ def generate_response(user_message): #Figure Out the parameters later and find a
|
|
| 32 |
if process.returncode != 0:
|
| 33 |
error_message = process.stderr.read()
|
| 34 |
print(f"Error: {error_message}")
|
|
|
|
| 35 |
|
| 36 |
-
|
| 37 |
-
def generate_response_token_by_token(user_message):
|
| 38 |
cmd = [
|
| 39 |
"/app/llama.cpp/main", # Path to the executable
|
| 40 |
"-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
|
|
@@ -58,13 +59,14 @@ def generate_response_token_by_token(user_message):
|
|
| 58 |
if char == ' ' or char == '\n': # Token delimiters
|
| 59 |
elapsed_time = time.time() - start_time # Calculate elapsed time
|
| 60 |
alltokens += token_buffer
|
| 61 |
-
yield f"{alltokens} [Inference time: {elapsed_time:.2f} seconds]"
|
| 62 |
token_buffer = '' # Reset token buffer
|
| 63 |
|
| 64 |
# Yield the last token if there is any
|
| 65 |
if token_buffer:
|
| 66 |
elapsed_time = time.time() - start_time # Calculate elapsed time
|
| 67 |
-
|
|
|
|
| 68 |
|
| 69 |
# Wait for the subprocess to finish if it hasn't already
|
| 70 |
process.wait()
|
|
@@ -93,7 +95,7 @@ with gr.Blocks() as iface:
|
|
| 93 |
description="No Message History for now - Enter your message and get a response. (One sentence every 20s)",
|
| 94 |
flagging_dir="/usr/src/app/flagged",
|
| 95 |
)
|
| 96 |
-
gr.Interface(fn=generate_response_token_by_token, inputs=gr.Textbox(lines=2, placeholder='Type prompt here...'), outputs="text", description="More Responsive streaming test")
|
| 97 |
with gr.Group():
|
| 98 |
gr.HTML("Test for wrapping generator (20 seconds a piece of the response)")
|
| 99 |
MainOutput = gr.TextArea(placeholder='Output will show here')
|
|
|
|
| 3 |
import subprocess
|
| 4 |
import time
|
| 5 |
|
| 6 |
+
"""
|
| 7 |
def generate_response(user_message): #Figure Out the parameters later and find a way to get the ram usage
|
| 8 |
cmd = [
|
| 9 |
"/app/llama.cpp/main", # Path to the executable
|
|
|
|
| 33 |
if process.returncode != 0:
|
| 34 |
error_message = process.stderr.read()
|
| 35 |
print(f"Error: {error_message}")
|
| 36 |
+
"""
|
| 37 |
|
| 38 |
+
def generate_response(user_message): #generate_response_token_by_token
|
|
|
|
| 39 |
cmd = [
|
| 40 |
"/app/llama.cpp/main", # Path to the executable
|
| 41 |
"-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
|
|
|
|
| 59 |
if char == ' ' or char == '\n': # Token delimiters
|
| 60 |
elapsed_time = time.time() - start_time # Calculate elapsed time
|
| 61 |
alltokens += token_buffer
|
| 62 |
+
yield f"{alltokens} \n\n [Inference time: {elapsed_time:.2f} seconds]"
|
| 63 |
token_buffer = '' # Reset token buffer
|
| 64 |
|
| 65 |
# Yield the last token if there is any
|
| 66 |
if token_buffer:
|
| 67 |
elapsed_time = time.time() - start_time # Calculate elapsed time
|
| 68 |
+
alltokens += token_buffer
|
| 69 |
+
yield f"{alltokens} \n\n [Inference time: {elapsed_time:.2f} seconds]"
|
| 70 |
|
| 71 |
# Wait for the subprocess to finish if it hasn't already
|
| 72 |
process.wait()
|
|
|
|
| 95 |
description="No Message History for now - Enter your message and get a response. (One sentence every 20s)",
|
| 96 |
flagging_dir="/usr/src/app/flagged",
|
| 97 |
)
|
| 98 |
+
#gr.Interface(fn=generate_response_token_by_token, inputs=gr.Textbox(lines=2, placeholder='Type prompt here...'), outputs="text", description="More Responsive streaming test")
|
| 99 |
with gr.Group():
|
| 100 |
gr.HTML("Test for wrapping generator (20 seconds a piece of the response)")
|
| 101 |
MainOutput = gr.TextArea(placeholder='Output will show here')
|