AnatoliiG commited on
Commit
f9aca5d
·
1 Parent(s): 2d1f66e

arch restruct

Browse files
src/api/routes.py CHANGED
@@ -1,8 +1,9 @@
1
  import asyncio
2
  import json
 
3
 
4
- from fastapi import APIRouter, Request
5
- from fastapi.responses import JSONResponse, StreamingResponse
6
 
7
  from src.core.config import settings
8
  from src.core.engine import engine
@@ -14,105 +15,57 @@ router = APIRouter()
14
  @router.post("/chat/completions")
15
  async def chat_completions(request: Request):
16
  if not engine.llm:
17
- return JSONResponse({"error": "Model not loaded"}, status_code=500)
18
 
19
  data = await request.json()
20
  messages = [
21
  {"role": m.get("role", "user"), "content": get_clean_text(m.get("content"))}
22
  for m in data.get("messages", [])
23
  ]
 
 
 
24
 
25
- stream = data.get("stream", True)
26
-
27
  async def stream_generator():
28
- # Ensure sequential processing: acquire the engine lock for the whole generation
29
- async with engine.lock:
30
- import threading
31
-
32
- # Use an asyncio.Queue to safely transfer chunks from a blocking worker thread to the async generator
33
- q: asyncio.Queue = asyncio.Queue()
34
- stop_event = threading.Event()
35
- loop = asyncio.get_running_loop()
36
-
37
- def worker():
38
- try:
39
- for chunk in engine.llm.create_chat_completion(
40
- messages=messages,
41
- max_tokens=int(
42
- data.get("max_tokens", settings.DEFAULT_MAX_TOKENS)
43
- ),
44
- temperature=float(
45
- data.get("temperature", settings.DEFAULT_TEMP)
46
- ),
47
- stream=True,
48
- ):
49
- # stop early if requested (e.g. client disconnected)
50
- if stop_event.is_set():
51
- break
52
- # chunk_count += 1
53
- # now = time.time()
54
- loop.call_soon_threadsafe(q.put_nowait, chunk)
55
- # loop.call_soon_threadsafe(
56
- # q.put_nowait,
57
- # {"__chunk": chunk, "ts": now, "count": chunk_count},
58
- # )
59
- # # финальный лог
60
- # loop.call_soon_threadsafe(
61
- # q.put_nowait,
62
- # {
63
- # "__done": True,
64
- # "duration": time.time() - start,
65
- # "chunks": chunk_count,
66
- # },
67
- # )
68
- except Exception as e:
69
- # Pass exception to the async side so we can surface an error or terminate cleanly
70
- loop.call_soon_threadsafe(q.put_nowait, {"__error": str(e)})
71
- finally:
72
- # Sentinel to mark completion
73
- loop.call_soon_threadsafe(q.put_nowait, None)
74
-
75
- # Run the blocking model iteration in a thread so it doesn't block the event loop
76
- worker_future = loop.run_in_executor(None, worker)
77
 
 
78
  try:
79
- while True:
80
- item = await q.get()
81
- if item is None:
82
- # worker finished normally
83
- break
84
- # If worker reported an error, stream it and break
85
- if isinstance(item, dict) and item.get("__error"):
86
- yield f"data: {json.dumps({'error': item['__error']})}\n\n"
87
- break
88
- yield f"data: {json.dumps(item)}\n\n"
 
89
  yield "data: [DONE]\n\n"
90
- except asyncio.CancelledError:
91
- # Client disconnected: signal the worker to stop and wait for it to finish, then re-raise to terminate streaming
92
- stop_event.set()
93
- try:
94
- await worker_future
95
- except Exception:
96
- pass
97
- raise
98
- finally:
99
- # Ensure worker is signalled to stop and awaited (idempotent)
100
- stop_event.set()
101
- try:
102
- await worker_future
103
- except Exception:
104
- pass
105
-
106
- if stream:
107
  return StreamingResponse(stream_generator(), media_type="text/event-stream")
108
 
109
  else:
110
- async with engine.lock:
111
- result = await asyncio.to_thread(
112
- engine.generate,
113
- messages,
114
- data.get("max_tokens", settings.DEFAULT_MAX_TOKENS),
115
- data.get("temperature", settings.DEFAULT_TEMP),
116
- stream=False,
117
- )
118
- return result
 
 
 
 
1
  import asyncio
2
  import json
3
+ import threading
4
 
5
+ from fastapi import APIRouter, HTTPException, Request
6
+ from fastapi.responses import StreamingResponse
7
 
8
  from src.core.config import settings
9
  from src.core.engine import engine
 
15
  @router.post("/chat/completions")
16
  async def chat_completions(request: Request):
17
  if not engine.llm:
18
+ raise HTTPException(status_code=500, detail="Model not loaded")
19
 
20
  data = await request.json()
21
  messages = [
22
  {"role": m.get("role", "user"), "content": get_clean_text(m.get("content"))}
23
  for m in data.get("messages", [])
24
  ]
25
+ max_tokens = data.get("max_tokens", settings.DEFAULT_MAX_TOKENS)
26
+ temperature = data.get("temperature", settings.DEFAULT_TEMP)
27
+ stream_req = data.get("stream", True)
28
 
29
+ # --- Логика Streaming ---
 
30
  async def stream_generator():
31
+ queue = asyncio.Queue()
32
+ loop = asyncio.get_running_loop()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
+ def worker():
35
  try:
36
+ for chunk in engine.generate_stream(messages, max_tokens, temperature):
37
+ loop.call_soon_threadsafe(queue.put_nowait, chunk)
38
+ loop.call_soon_threadsafe(queue.put_nowait, None)
39
+ except Exception as e:
40
+ loop.call_soon_threadsafe(queue.put_nowait, {"error": str(e)})
41
+
42
+ loop.run_in_executor(None, worker)
43
+
44
+ while True:
45
+ chunk = await queue.get()
46
+ if chunk is None:
47
  yield "data: [DONE]\n\n"
48
+ break
49
+
50
+ if isinstance(chunk, dict) and "error" in chunk:
51
+ yield f"data: {json.dumps({'error': chunk['error']})}\n\n"
52
+ break
53
+
54
+ yield f"data: {json.dumps(chunk)}\n\n"
55
+
56
+ if stream_req:
 
 
 
 
 
 
 
 
57
  return StreamingResponse(stream_generator(), media_type="text/event-stream")
58
 
59
  else:
60
+
61
+ def run_sync():
62
+ with engine.lock:
63
+ return engine.llm.create_chat_completion(
64
+ messages=messages,
65
+ max_tokens=int(max_tokens),
66
+ temperature=float(temperature),
67
+ stream=False,
68
+ )
69
+
70
+ response = await asyncio.to_thread(run_sync)
71
+ return response
src/core/config.py CHANGED
@@ -7,10 +7,10 @@ class Settings(BaseSettings):
7
  REPO_ID: str = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
8
  FILENAME: str = "qwen2.5-coder-7b-instruct-q5_k_m.gguf"
9
 
10
- CONTEXT_SIZE: int = 8192
11
- DEFAULT_MAX_TOKENS: int = 4096
12
  DEFAULT_TEMP: float = 0.4
13
- N_THREADS: int = os.cpu_count()
14
  N_GPU_LAYERS: int = 0
15
 
16
 
 
7
  REPO_ID: str = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
8
  FILENAME: str = "qwen2.5-coder-7b-instruct-q5_k_m.gguf"
9
 
10
+ CONTEXT_SIZE: int = 65536
11
+ DEFAULT_MAX_TOKENS: int = 16384
12
  DEFAULT_TEMP: float = 0.4
13
+ N_THREADS: int = 2
14
  N_GPU_LAYERS: int = 0
15
 
16
 
src/core/engine.py CHANGED
@@ -1,4 +1,5 @@
1
- import asyncio
 
2
 
3
  from huggingface_hub import hf_hub_download
4
  from llama_cpp import Llama
@@ -9,11 +10,12 @@ from src.core.config import settings
9
  class ModelEngine:
10
  def __init__(self):
11
  self.llm = None
12
- self.lock = asyncio.Lock()
13
  self._load_model()
14
 
15
  def _load_model(self):
16
  try:
 
17
  model_path = hf_hub_download(
18
  repo_id=settings.REPO_ID, filename=settings.FILENAME
19
  )
@@ -24,19 +26,25 @@ class ModelEngine:
24
  n_gpu_layers=settings.N_GPU_LAYERS,
25
  verbose=True,
26
  )
 
27
  except Exception as e:
28
- print(f"Error loading model: {e}")
29
 
30
- def generate(self, messages, max_tokens, temperature, stream=True):
 
 
31
  if not self.llm:
32
  raise RuntimeError("Model not loaded")
33
- return self.llm.create_chat_completion(
34
- messages=messages,
35
- max_tokens=int(max_tokens),
36
- temperature=float(temperature),
37
- stream=stream,
38
- )
 
 
 
 
39
 
40
 
41
- # Создаем синглтон
42
  engine = ModelEngine()
 
1
+ import threading
2
+ from typing import Any, Dict, Generator, List
3
 
4
  from huggingface_hub import hf_hub_download
5
  from llama_cpp import Llama
 
10
  class ModelEngine:
11
  def __init__(self):
12
  self.llm = None
13
+ self.lock = threading.Lock()
14
  self._load_model()
15
 
16
  def _load_model(self):
17
  try:
18
+ print(f"Downloading/Loading model: {settings.REPO_ID}...")
19
  model_path = hf_hub_download(
20
  repo_id=settings.REPO_ID, filename=settings.FILENAME
21
  )
 
26
  n_gpu_layers=settings.N_GPU_LAYERS,
27
  verbose=True,
28
  )
29
+ print("Model loaded successfully!")
30
  except Exception as e:
31
+ print(f"CRITICAL ERROR loading model: {e}")
32
 
33
+ def generate_stream(
34
+ self, messages: List[Dict[str, str]], max_tokens: int, temperature: float
35
+ ) -> Generator:
36
  if not self.llm:
37
  raise RuntimeError("Model not loaded")
38
+
39
+ with self.lock:
40
+ stream = self.llm.create_chat_completion(
41
+ messages=messages,
42
+ max_tokens=int(max_tokens),
43
+ temperature=float(temperature),
44
+ stream=True,
45
+ )
46
+ for chunk in stream:
47
+ yield chunk
48
 
49
 
 
50
  engine = ModelEngine()
src/ui/callbacks.py CHANGED
@@ -1,20 +1,12 @@
1
  import gradio as gr
2
- from src.utils.helpers import get_clean_text
3
 
4
  from src.core.engine import engine
5
-
6
-
7
- def user_input(user_message, history):
8
- if not user_message:
9
- return None, history
10
- history = history or []
11
- history.append({"role": "user", "content": str(user_message)})
12
- return "", history
13
 
14
 
15
  def bot_response(history, system_prompt, temperature, max_tokens):
16
  messages = [{"role": "system", "content": system_prompt}]
17
- for msg in history[-15:]:
18
  messages.append(
19
  {"role": msg["role"], "content": get_clean_text(msg["content"])}
20
  )
@@ -22,7 +14,8 @@ def bot_response(history, system_prompt, temperature, max_tokens):
22
  history.append({"role": "assistant", "content": ""})
23
 
24
  try:
25
- stream = engine.generate(messages, max_tokens, temperature, stream=True)
 
26
  partial_text = ""
27
  for chunk in stream:
28
  delta = chunk["choices"][0]["delta"]
@@ -30,16 +23,7 @@ def bot_response(history, system_prompt, temperature, max_tokens):
30
  partial_text += delta["content"]
31
  history[-1]["content"] = partial_text
32
  yield history
 
33
  except Exception as e:
34
  history[-1]["content"] += f"\n\n❌ Error: {str(e)}"
35
  yield history
36
-
37
-
38
- def set_interactive(is_interactive):
39
- return (
40
- gr.update(
41
- interactive=is_interactive,
42
- placeholder="Wait..." if not is_interactive else "Type...",
43
- ),
44
- gr.update(interactive=is_interactive),
45
- )
 
1
  import gradio as gr
 
2
 
3
  from src.core.engine import engine
4
+ from src.utils.helpers import get_clean_text
 
 
 
 
 
 
 
5
 
6
 
7
  def bot_response(history, system_prompt, temperature, max_tokens):
8
  messages = [{"role": "system", "content": system_prompt}]
9
+ for msg in history[-7:]:
10
  messages.append(
11
  {"role": msg["role"], "content": get_clean_text(msg["content"])}
12
  )
 
14
  history.append({"role": "assistant", "content": ""})
15
 
16
  try:
17
+ stream = engine.generate_stream(messages, max_tokens, temperature)
18
+
19
  partial_text = ""
20
  for chunk in stream:
21
  delta = chunk["choices"][0]["delta"]
 
23
  partial_text += delta["content"]
24
  history[-1]["content"] = partial_text
25
  yield history
26
+
27
  except Exception as e:
28
  history[-1]["content"] += f"\n\n❌ Error: {str(e)}"
29
  yield history
 
 
 
 
 
 
 
 
 
 
src/ui/components.py CHANGED
@@ -1,11 +1,5 @@
1
  import gradio as gr
2
 
3
- # if not hasattr(gr, "Separator"):
4
- # def _gr_separator():
5
- # return gr.HTML(
6
- # "<div style='margin: 15px 0; border-top: 1px solid var(--border-color-primary);'></div>"
7
- # )
8
- # gr.Separator = lambda *args, **kwargs: _gr_separator()
9
  from src.core.config import settings
10
  from src.ui.callbacks import bot_response, set_interactive, user_input
11
  from src.ui.styles import CSS
@@ -24,7 +18,7 @@ def create_ui():
24
  gr.Markdown("### ⚙️ Model Settings")
25
  sys_pt = gr.Textbox(
26
  label="System Prompt",
27
- value="Вы опытный программист. Отвечаете кратко и по делу.",
28
  lines=4,
29
  )
30
  temp = gr.Slider(0, 1, value=settings.DEFAULT_TEMP, label="Temperature")
 
1
  import gradio as gr
2
 
 
 
 
 
 
 
3
  from src.core.config import settings
4
  from src.ui.callbacks import bot_response, set_interactive, user_input
5
  from src.ui.styles import CSS
 
18
  gr.Markdown("### ⚙️ Model Settings")
19
  sys_pt = gr.Textbox(
20
  label="System Prompt",
21
+ value="Вы опытный программист. Отвечаете кратко и по делу, пишите качественный и рабочий код.",
22
  lines=4,
23
  )
24
  temp = gr.Slider(0, 1, value=settings.DEFAULT_TEMP, label="Temperature")