time check
main_gguf.py +16 -8
main_gguf.py
CHANGED
@@ -1,7 +1,7 @@
 import os
 import gradio as gr
 from llama_cpp import Llama, llama_cpp
-from fastapi import FastAPI
+from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
 import json
@@ -95,6 +95,11 @@ def generate_base_model_stream(messages: List[dict], max_length: int = 256, temp
     """Generate streaming response from base model."""
     print(f"Using: base model", flush=True)
     print(f"Messages: {messages}", flush=True)
+
+    start = time.time()
+    first_token_time = None
+    token_count = 0
+
     for chunk in base_model.create_chat_completion(
         messages=messages,
         max_tokens=max_length,
@@ -103,8 +108,17 @@ def generate_base_model_stream(messages: List[dict], max_length: int = 256, temp
     ):
         delta = chunk["choices"][0]["delta"]
         if "content" in delta:
+            if first_token_time is None:
+                first_token_time = time.time()
+                print(f"\n[Prefill: {first_token_time - start:.2f}s]", flush=True)
+            token_count += 1
+            print(delta["content"], end="", flush=True)
             yield delta["content"]
 
+    total_time = time.time() - start
+    gen_time = total_time - (first_token_time - start) if first_token_time else 0
+    print(f"\n[Total: {total_time:.2f}s | Tokens: {token_count} | Speed: {token_count/gen_time:.1f} tok/s]", flush=True)
+
 
 
 def generate_text_stream(messages: List[dict], language: str, max_length: int = 256, temperature: float = 0.7):
@@ -193,20 +207,14 @@ async def generate_stream_api(request: GenerateRequest):
 
 
 @app.post("/v1/chat/completions")
-async def chat_completions(request: ChatCompletionRequest
-    # Log raw body to debug message loss
-    raw_body = await raw_request.body()
-
+async def chat_completions(request: ChatCompletionRequest):
     messages_dicts = [{"role": msg.role, "content": msg.content} for msg in request.messages]
     language = request.model if request.model in GGUF_MODELS else "English"
-
-    print(f"Received messages_dicts: {messages_dicts}")
 
     chat_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
     created = int(time.time())
 
     def event_generator():
-        print(f"[EVENT_GENERATOR] Starting with messages: {messages_dicts}", flush=True)
         try:
             for token in generate_base_model_stream(
                 messages_dicts,
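
For reference, a minimal standalone sketch of the timing pattern this commit applies: record a start time, log prefill latency (time to first token) when the first content chunk arrives, count tokens, and report decode throughput at the end. The fake_stream and timed_stream names below are hypothetical stand-ins for illustration only; in the Space the stream comes from base_model.create_chat_completion.

import time


def fake_stream():
    # Hypothetical stand-in for a streaming chat completion (not part of the Space).
    time.sleep(0.5)                      # simulated prefill work
    for word in ["Hello", ",", " world", "!"]:
        time.sleep(0.05)                 # simulated per-token decode
        yield word


def timed_stream(stream):
    # Wrap any token stream with the same prefill / throughput logging as the diff above.
    start = time.time()
    first_token_time = None
    token_count = 0
    for token in stream:
        if first_token_time is None:
            first_token_time = time.time()
            print(f"\n[Prefill: {first_token_time - start:.2f}s]", flush=True)
        token_count += 1
        yield token
    total_time = time.time() - start
    # Decode time excludes the prefill phase, mirroring the commit.
    gen_time = total_time - (first_token_time - start) if first_token_time else 0.0
    speed = token_count / gen_time if gen_time > 0 else 0.0
    print(f"\n[Total: {total_time:.2f}s | Tokens: {token_count} | Speed: {speed:.1f} tok/s]", flush=True)


if __name__ == "__main__":
    for tok in timed_stream(fake_stream()):
        print(tok, end="", flush=True)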