jiminaa committed on
Commit
1ddecb0
·
1 Parent(s): 5faf11e

time check

Browse files
Files changed (1) hide show
  1. main_gguf.py +16 -8
main_gguf.py CHANGED
@@ -1,7 +1,7 @@
1
  import os
2
  import gradio as gr
3
  from llama_cpp import Llama, llama_cpp
4
- from fastapi import FastAPI, Request
5
  from fastapi.responses import StreamingResponse
6
  from pydantic import BaseModel
7
  import json
@@ -95,6 +95,11 @@ def generate_base_model_stream(messages: List[dict], max_length: int = 256, temp
95
  """Generate streaming response from base model."""
96
  print(f"Using: base model", flush=True)
97
  print(f"Messages: {messages}", flush=True)
 
 
 
 
 
98
  for chunk in base_model.create_chat_completion(
99
  messages=messages,
100
  max_tokens=max_length,
@@ -103,8 +108,17 @@ def generate_base_model_stream(messages: List[dict], max_length: int = 256, temp
103
  ):
104
  delta = chunk["choices"][0]["delta"]
105
  if "content" in delta:
 
 
 
 
 
106
  yield delta["content"]
107
 
 
 
 
 
108
 
109
 
110
  def generate_text_stream(messages: List[dict], language: str, max_length: int = 256, temperature: float = 0.7):
@@ -193,20 +207,14 @@ async def generate_stream_api(request: GenerateRequest):
193
 
194
 
195
  @app.post("/v1/chat/completions")
196
- async def chat_completions(request: ChatCompletionRequest, raw_request: Request):
197
- # Log raw body to debug message loss
198
- raw_body = await raw_request.body()
199
-
200
  messages_dicts = [{"role": msg.role, "content": msg.content} for msg in request.messages]
201
  language = request.model if request.model in GGUF_MODELS else "English"
202
-
203
- print(f"Received messages_dicts: {messages_dicts}")
204
 
205
  chat_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
206
  created = int(time.time())
207
 
208
  def event_generator():
209
- print(f"[EVENT_GENERATOR] Starting with messages: {messages_dicts}", flush=True)
210
  try:
211
  for token in generate_base_model_stream(
212
  messages_dicts,
 
1
  import os
2
  import gradio as gr
3
  from llama_cpp import Llama, llama_cpp
4
+ from fastapi import FastAPI
5
  from fastapi.responses import StreamingResponse
6
  from pydantic import BaseModel
7
  import json
 
95
  """Generate streaming response from base model."""
96
  print(f"Using: base model", flush=True)
97
  print(f"Messages: {messages}", flush=True)
98
+
99
+ start = time.time()
100
+ first_token_time = None
101
+ token_count = 0
102
+
103
  for chunk in base_model.create_chat_completion(
104
  messages=messages,
105
  max_tokens=max_length,
 
108
  ):
109
  delta = chunk["choices"][0]["delta"]
110
  if "content" in delta:
111
+ if first_token_time is None:
112
+ first_token_time = time.time()
113
+ print(f"\n[Prefill: {first_token_time - start:.2f}s]", flush=True)
114
+ token_count += 1
115
+ print(delta["content"], end="", flush=True)
116
  yield delta["content"]
117
 
118
+ total_time = time.time() - start
119
+ gen_time = total_time - (first_token_time - start) if first_token_time else 0
120
+ print(f"\n[Total: {total_time:.2f}s | Tokens: {token_count} | Speed: {token_count/gen_time:.1f} tok/s]", flush=True)
121
+
122
 
123
 
124
  def generate_text_stream(messages: List[dict], language: str, max_length: int = 256, temperature: float = 0.7):
 
207
 
208
 
209
  @app.post("/v1/chat/completions")
210
+ async def chat_completions(request: ChatCompletionRequest):
 
 
 
211
  messages_dicts = [{"role": msg.role, "content": msg.content} for msg in request.messages]
212
  language = request.model if request.model in GGUF_MODELS else "English"
 
 
213
 
214
  chat_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
215
  created = int(time.time())
216
 
217
  def event_generator():
 
218
  try:
219
  for token in generate_base_model_stream(
220
  messages_dicts,