AnatoliiG committed on
Commit
42fa16e
·
1 Parent(s): 6616542

split code

Browse files
app.py DELETED
@@ -1,105 +0,0 @@
1
- import asyncio # <--- Добавили импорт
2
- import json
3
-
4
- import uvicorn
5
- from fastapi import FastAPI, Request
6
- from fastapi.middleware.cors import CORSMiddleware
7
- from fastapi.responses import JSONResponse, StreamingResponse
8
- from gradio import mount_gradio_app
9
-
10
- import config
11
- from model import engine
12
- from ui import create_ui
13
- from utils import get_clean_text
14
-
15
- model_lock = asyncio.Lock()
16
-
17
- app = FastAPI()
18
- app.add_middleware(
19
- CORSMiddleware,
20
- allow_origins=["*"],
21
- allow_credentials=True,
22
- allow_methods=["*"],
23
- allow_headers=["*"],
24
- )
25
-
26
-
27
- @app.post("/v1/chat/completions")
28
- async def chat_completions(request: Request):
29
- if model_lock.locked():
30
- pass
31
-
32
- if not engine.llm:
33
- return JSONResponse(content={"error": "Model not loaded"}, status_code=500)
34
-
35
- try:
36
- data = await request.json()
37
- raw_messages = data.get("messages", [])
38
-
39
- messages = []
40
- for msg in raw_messages:
41
- messages.append(
42
- {
43
- "role": msg.get("role", "user"),
44
- "content": get_clean_text(msg.get("content")),
45
- }
46
- )
47
-
48
- stream = data.get("stream", True)
49
- temperature = data.get("temperature", config.DEFAULT_TEMP)
50
- max_tokens = data.get("max_tokens", config.DEFAULT_MAX_TOKENS)
51
-
52
- async def iter_content_locked():
53
- async with model_lock:
54
- try:
55
- output = engine.generate(
56
- messages=messages,
57
- max_tokens=max_tokens,
58
- temperature=temperature,
59
- stream=True,
60
- )
61
-
62
- for chunk in output:
63
- if "model" not in chunk:
64
- chunk["model"] = config.REPO_ID
65
- yield f"data: {json.dumps(chunk)}\n\n"
66
- await asyncio.sleep(0)
67
-
68
- yield "data: [DONE]\n\n"
69
- except Exception as e:
70
- print(f"Streaming error: {e}")
71
- yield f"data: {json.dumps({'error': str(e)})}\n\n"
72
-
73
- if stream:
74
- return StreamingResponse(
75
- iter_content_locked(),
76
- media_type="text/event-stream",
77
- headers={
78
- "Cache-Control": "no-cache",
79
- "Connection": "keep-alive",
80
- "X-Accel-Buffering": "no",
81
- },
82
- )
83
- else:
84
- async with model_lock:
85
- output = engine.generate(
86
- messages=messages,
87
- max_tokens=max_tokens,
88
- temperature=temperature,
89
- stream=False,
90
- )
91
- return JSONResponse(content=output)
92
-
93
- except Exception as e:
94
- import traceback
95
-
96
- traceback.print_exc()
97
- return JSONResponse(content={"error": str(e)}, status_code=500)
98
-
99
-
100
- # --- Mount Gradio ---
101
- demo = create_ui()
102
- app = mount_gradio_app(app, demo, path="/")
103
-
104
- if __name__ == "__main__":
105
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.py DELETED
@@ -1,9 +0,0 @@
1
- REPO_ID = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
2
- FILENAME = "qwen2.5-coder-7b-instruct-q5_k_m.gguf"
3
-
4
- # Параметры модели
5
- CONTEXT_SIZE = 8192
6
- DEFAULT_MAX_TOKENS = 4096
7
- DEFAULT_TEMP = 0.4
8
- N_THREADS = 2
9
- N_GPU_LAYERS = 0 # 0 для CPU, -1 для GPU
 
 
 
 
 
 
 
 
 
 
main.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uvicorn
2
+ from fastapi import FastAPI
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from gradio import mount_gradio_app
5
+
6
+ from src.api.routes import router as api_router
7
+ from src.ui.components import create_ui
8
+
9
app = FastAPI(title="Code LLM Service")

# CORS: every origin, method and header is accepted.
_CORS_OPTIONS = {
    "allow_origins": ["*"],
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_CORS_OPTIONS)

# Attach the OpenAI-style API endpoints under the /v1 prefix.
app.include_router(api_router, prefix="/v1")

# Attach the Gradio interface at the root path.
ui_app = create_ui()
app = mount_gradio_app(app, ui_app, path="/")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt CHANGED
@@ -5,3 +5,4 @@ huggingface_hub>=0.27.0
5
  gradio>=5.9.0
6
  python-multipart
7
  psutil
 
 
5
  gradio>=5.9.0
6
  python-multipart
7
  psutil
8
+ pydantic-settings
src/api/__init__.py ADDED
File without changes
src/api/routes.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+
4
+ from fastapi import APIRouter, Request
5
+ from fastapi.responses import JSONResponse, StreamingResponse
6
+ from src.utils.helpers import get_clean_text
7
+
8
+ from src.core.config import settings
9
+ from src.core.engine import engine
10
+
11
+ router = APIRouter()
12
+
13
+
14
@router.post("/chat/completions")
async def chat_completions(request: Request):
    """Handle an OpenAI-style chat completion request.

    The JSON body may contain ``messages`` (list of objects with ``role`` and
    ``content``), ``stream`` (default ``True``), ``max_tokens`` and
    ``temperature`` (both default to the values in ``settings``).

    Returns:
        * 500 JSON error when the model is not loaded or non-streaming
          generation fails;
        * 400 JSON error when the body is not valid JSON, is not an object,
          or ``messages`` is not a list of objects;
        * a ``text/event-stream`` response of ``data: <json>`` events that
          ends with ``data: [DONE]`` when ``stream`` is truthy;
        * otherwise the completion object returned by the engine.
    """
    if not engine.llm:
        return JSONResponse({"error": "Model not loaded"}, status_code=500)

    try:
        data = await request.json()
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError.
        return JSONResponse(
            {"error": "Request body must be valid JSON"}, status_code=400
        )

    if not isinstance(data, dict):
        return JSONResponse(
            {"error": "Request body must be a JSON object"}, status_code=400
        )

    raw_messages = data.get("messages", [])
    if not isinstance(raw_messages, list) or not all(
        isinstance(m, dict) for m in raw_messages
    ):
        return JSONResponse(
            {"error": "'messages' must be a list of objects"}, status_code=400
        )

    messages = [
        {"role": m.get("role", "user"), "content": get_clean_text(m.get("content"))}
        for m in raw_messages
    ]
    stream = data.get("stream", True)
    max_tokens = data.get("max_tokens", settings.DEFAULT_MAX_TOKENS)
    temperature = data.get("temperature", settings.DEFAULT_TEMP)

    async def stream_generator():
        # Hold the engine lock for the whole generation so API requests
        # are served one at a time.
        async with engine.lock:
            try:
                for chunk in engine.generate(messages, max_tokens, temperature):
                    yield f"data: {json.dumps(chunk)}\n\n"
                    # The generation loop is synchronous; yield control to
                    # the event loop between chunks.
                    await asyncio.sleep(0)
                yield "data: [DONE]\n\n"
            except Exception as exc:
                # The response has already started, so the failure can only
                # be reported in-band as a final event.
                print(f"Streaming error: {exc}")
                yield f"data: {json.dumps({'error': str(exc)})}\n\n"

    if stream:
        return StreamingResponse(
            stream_generator(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "X-Accel-Buffering": "no",
            },
        )

    try:
        async with engine.lock:
            return engine.generate(messages, max_tokens, temperature, stream=False)
    except Exception as exc:
        return JSONResponse({"error": str(exc)}, status_code=500)
src/core/__init__.py ADDED
File without changes
src/core/config.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings
2
+
3
+
4
class Settings(BaseSettings):
    """Typed application settings and their default values."""

    # Model repository and file name handed to hf_hub_download.
    REPO_ID: str = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
    FILENAME: str = "qwen2.5-coder-7b-instruct-q5_k_m.gguf"

    # Model parameters.
    CONTEXT_SIZE: int = 8192  # passed to Llama as n_ctx
    DEFAULT_MAX_TOKENS: int = 4096  # fallback when a request omits max_tokens
    DEFAULT_TEMP: float = 0.4  # fallback when a request omits temperature
    N_THREADS: int = 2  # passed to Llama as n_threads
    N_GPU_LAYERS: int = 0  # 0 for CPU, -1 for GPU
13
+
14
+
15
+ settings = Settings()
model.py → src/core/engine.py RENAMED
@@ -1,37 +1,35 @@
1
- import json
2
 
3
  from huggingface_hub import hf_hub_download
4
  from llama_cpp import Llama
5
 
6
- from config import CONTEXT_SIZE, FILENAME, N_GPU_LAYERS, N_THREADS, REPO_ID
7
 
8
 
9
  class ModelEngine:
10
  def __init__(self):
11
  self.llm = None
 
12
  self._load_model()
13
 
14
  def _load_model(self):
15
- print(f"Loading model {REPO_ID}...")
16
  try:
17
- model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
 
 
18
  self.llm = Llama(
19
  model_path=model_path,
20
- n_ctx=CONTEXT_SIZE,
21
- n_threads=N_THREADS,
22
- n_gpu_layers=N_GPU_LAYERS,
23
- n_batch=512,
24
  verbose=True,
25
  )
26
- print("Model loaded successfully.")
27
  except Exception as e:
28
- print(f"CRITICAL ERROR: Failed to load model. {e}")
29
- self.llm = None
30
 
31
  def generate(self, messages, max_tokens, temperature, stream=True):
32
  if not self.llm:
33
- raise RuntimeError("Model is not loaded.")
34
-
35
  return self.llm.create_chat_completion(
36
  messages=messages,
37
  max_tokens=int(max_tokens),
@@ -40,5 +38,5 @@ class ModelEngine:
40
  )
41
 
42
 
43
- # Создаем глобальный экземпляр (Singleton)
44
  engine = ModelEngine()
 
1
+ import asyncio
2
 
3
  from huggingface_hub import hf_hub_download
4
  from llama_cpp import Llama
5
 
6
+ from src.core.config import settings
7
 
8
 
9
  class ModelEngine:
10
  def __init__(self):
11
  self.llm = None
12
+ self.lock = asyncio.Lock()
13
  self._load_model()
14
 
15
  def _load_model(self):
 
16
  try:
17
+ model_path = hf_hub_download(
18
+ repo_id=settings.REPO_ID, filename=settings.FILENAME
19
+ )
20
  self.llm = Llama(
21
  model_path=model_path,
22
+ n_ctx=settings.CONTEXT_SIZE,
23
+ n_threads=settings.N_THREADS,
24
+ n_gpu_layers=settings.N_GPU_LAYERS,
 
25
  verbose=True,
26
  )
 
27
  except Exception as e:
28
+ print(f"Error loading model: {e}")
 
29
 
30
  def generate(self, messages, max_tokens, temperature, stream=True):
31
  if not self.llm:
32
+ raise RuntimeError("Model not loaded")
 
33
  return self.llm.create_chat_completion(
34
  messages=messages,
35
  max_tokens=int(max_tokens),
 
38
  )
39
 
40
 
41
+ # Создаем синглтон
42
  engine = ModelEngine()
src/ui/__init__.py ADDED
File without changes
chat_logic.py → src/ui/callbacks.py RENAMED
@@ -1,56 +1,28 @@
1
- import traceback
2
-
3
  import gradio as gr
 
4
 
5
- from model import engine
6
- from utils import get_clean_text
7
 
8
 
9
  def user_input(user_message, history):
10
- """Обработка ввода пользователя"""
11
  if not user_message:
12
  return None, history
13
-
14
- if history is None:
15
- history = []
16
-
17
- clean_history = []
18
- for msg in history:
19
- raw_content = msg.get("content", "")
20
- text_content = get_clean_text(raw_content)
21
- clean_history.append({"role": msg["role"], "content": text_content})
22
-
23
- clean_history.append({"role": "user", "content": str(user_message)})
24
- return "", clean_history
25
 
26
 
27
  def bot_response(history, system_prompt, temperature, max_tokens):
28
- """Генерация ответа модели (стриминг)"""
29
- if not engine.llm:
30
- history.append({"role": "assistant", "content": "Error: Model failed to load."})
31
- yield history
32
- return
33
-
34
  messages = [{"role": "system", "content": system_prompt}]
35
-
36
- # Контекстное окно (последние 15 сообщений)
37
- relevant_history = history[-15:] if len(history) > 15 else history
38
-
39
- for msg in relevant_history:
40
- raw_content = msg.get("content", "")
41
- text_content = get_clean_text(raw_content)
42
- messages.append({"role": msg["role"], "content": text_content})
43
 
44
  history.append({"role": "assistant", "content": ""})
45
 
46
  try:
47
- stream = engine.generate(
48
- messages=messages,
49
- max_tokens=max_tokens,
50
- temperature=temperature,
51
- stream=True,
52
- )
53
-
54
  partial_text = ""
55
  for chunk in stream:
56
  delta = chunk["choices"][0]["delta"]
@@ -58,19 +30,16 @@ def bot_response(history, system_prompt, temperature, max_tokens):
58
  partial_text += delta["content"]
59
  history[-1]["content"] = partial_text
60
  yield history
61
-
62
  except Exception as e:
63
- traceback.print_exc()
64
- history[-1]["content"] = partial_text + f"\n\n❌ **Error:** {str(e)}"
65
  yield history
66
 
67
 
68
  def set_interactive(is_interactive):
69
- """Переключение состояния кнопок во время генерации"""
70
  return (
71
  gr.update(
72
  interactive=is_interactive,
73
- placeholder="Wait..." if not is_interactive else "Type code question...",
74
  ),
75
  gr.update(interactive=is_interactive),
76
  )
 
 
 
1
  import gradio as gr
2
+ from src.utils.helpers import get_clean_text
3
 
4
+ from src.core.engine import engine
 
5
 
6
 
7
def user_input(user_message, history):
    """Add the submitted user message to the chat history.

    Args:
        user_message: Text taken from the input box. A falsy value (empty
            string or ``None``) means there is nothing to add.
        history: List of ``{"role": ..., "content": ...}`` message dicts,
            or ``None`` when the chat is empty.

    Returns:
        A ``(textbox_value, history)`` tuple. For a falsy message this is
        ``(None, history)`` with ``history`` untouched; otherwise it is
        ``("", new_history)`` where ``new_history`` is a new list ending
        with the user message.
    """
    if not user_message:
        return None, history

    # Build a new list instead of appending in place, so the list object
    # passed in by the caller is never mutated.
    new_history = [*(history or []), {"role": "user", "content": str(user_message)}]
    return "", new_history
 
 
 
 
 
 
 
 
 
13
 
14
 
15
  def bot_response(history, system_prompt, temperature, max_tokens):
 
 
 
 
 
 
16
  messages = [{"role": "system", "content": system_prompt}]
17
+ for msg in history[-15:]:
18
+ messages.append(
19
+ {"role": msg["role"], "content": get_clean_text(msg["content"])}
20
+ )
 
 
 
 
21
 
22
  history.append({"role": "assistant", "content": ""})
23
 
24
  try:
25
+ stream = engine.generate(messages, max_tokens, temperature, stream=True)
 
 
 
 
 
 
26
  partial_text = ""
27
  for chunk in stream:
28
  delta = chunk["choices"][0]["delta"]
 
30
  partial_text += delta["content"]
31
  history[-1]["content"] = partial_text
32
  yield history
 
33
  except Exception as e:
34
+ history[-1]["content"] += f"\n\n❌ Error: {str(e)}"
 
35
  yield history
36
 
37
 
38
def set_interactive(is_interactive):
    """Build the pair of updates that enable or disable the input controls.

    Args:
        is_interactive: Whether the controls should accept input.

    Returns:
        A 2-tuple of ``gr.update`` objects. The first sets ``interactive``
        and a placeholder ("Type..." when enabled, "Wait..." when disabled);
        the second sets ``interactive`` only.
    """
    if is_interactive:
        hint = "Type..."
    else:
        hint = "Wait..."

    first_update = gr.update(interactive=is_interactive, placeholder=hint)
    second_update = gr.update(interactive=is_interactive)
    return first_update, second_update
src/ui/components.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import gradio as gr
4
+ import psutil
5
+
6
+ from src.core.config import settings
7
+ from src.ui.callbacks import bot_response, set_interactive, user_input
8
+ from src.ui.styles import CSS
9
+
10
+
11
def get_system_status():
    """Return a Markdown snippet with CPU usage and this process's memory.

    Returns:
        A string with a "Status" heading, the value of
        ``psutil.cpu_percent()`` and the current process's RSS converted
        from bytes to MB with one decimal place.
    """
    cpu_load = psutil.cpu_percent()
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    rss_mb = rss_bytes / 1024 / 1024
    return f"### 🖥️ Status\n**CPU:** {cpu_load}% | **RAM:** {rss_mb:.1f}MB"
15
+
16
+
17
def create_ui():
    """Assemble the Gradio Blocks app: sidebar, chat column and event wiring.

    Returns:
        The constructed ``gr.Blocks`` object (it is not launched here).
    """
    with gr.Blocks(css=CSS, title="Code LLM") as demo:
        # Timer that drives the periodic refresh of the status panel.
        timer = gr.Timer(0.5, active=True)

        with gr.Row(elem_id="main-row"):
            with gr.Sidebar():
                status = gr.Markdown(get_system_status())
                sys_pt = gr.Textbox(
                    label="System Prompt", value="Вы программист.", lines=3
                )
                temp = gr.Slider(0, 1, value=settings.DEFAULT_TEMP, label="Temp")
                tokens = gr.Slider(
                    512, 8192, value=settings.DEFAULT_MAX_TOKENS, label="Max Tokens"
                )
                clear = gr.Button("🗑️ Clear")

            with gr.Column(elem_id="col-chat-main"):
                chatbot = gr.Chatbot(elem_id="chatbot", type="messages")
                with gr.Row(elem_id="input-area"):
                    msg = gr.Textbox(show_label=False, scale=9, autofocus=True)
                    submit = gr.Button("Run ➤", variant="primary", scale=1)

        # Events
        timer.tick(get_system_status, outputs=status, show_progress="hidden")

        # Pressing Enter in the textbox and clicking the button run the same
        # chain: record the message, lock the inputs, stream the reply,
        # unlock the inputs.
        for trigger in (msg.submit, submit.click):
            recorded = trigger(
                user_input, [msg, chatbot], [msg, chatbot], queue=False
            )
            locked = recorded.then(
                lambda: set_interactive(False), None, [msg, submit]
            )
            answered = locked.then(
                bot_response, [chatbot, sys_pt, temp, tokens], chatbot
            )
            answered.then(lambda: set_interactive(True), None, [msg, submit])

        clear.click(lambda: [], None, chatbot)

    return demo
styles.py → src/ui/styles.py RENAMED
File without changes
src/utils/__init__.py ADDED
File without changes
utils.py → src/utils/helpers.py RENAMED
File without changes
ui.py DELETED
@@ -1,108 +0,0 @@
1
- import os
2
-
3
- import gradio as gr
4
- import psutil
5
-
6
- import config
7
- from chat_logic import bot_response, set_interactive, user_input
8
- from styles import CSS
9
-
10
-
11
- def get_system_status():
12
- """Возвращает текущую загрузку системы для отображения в Markdown"""
13
- # CPU
14
- cpu_usage = psutil.cpu_percent(interval=None)
15
-
16
- # RAM (процесс приложения)
17
- process = psutil.Process(os.getpid())
18
- memory_info = process.memory_info()
19
- ram_usage_mb = memory_info.rss / 1024 / 1024
20
-
21
- # Общая память системы
22
- virtual_mem = psutil.virtual_memory()
23
- ram_percent = virtual_mem.percent
24
-
25
- return f"""
26
- ### 🖥️ System Health
27
- **CPU:** {cpu_usage}%
28
- **RAM (App):** {ram_usage_mb:.1f} MB
29
- **RAM (Total):** {ram_percent}%
30
- """
31
-
32
-
33
- def create_ui():
34
- theme = gr.themes.Soft(primary_hue="blue", text_size="lg")
35
-
36
- with gr.Blocks(theme=theme, css=CSS, title="Code LLM") as demo:
37
- stats_timer = gr.Timer(value=0.1, active=True)
38
-
39
- with gr.Row(equal_height=True, variant="default", elem_id="main-row"):
40
- # --- ЛЕВАЯ КОЛОНКА (Сайдбар) ---
41
- with gr.Sidebar(elem_classes=["sidebar"]):
42
- gr.Markdown("### ⚙️ Settings")
43
-
44
- # Мониторинг (обновляется автоматически)
45
- system_status = gr.Markdown(value=get_system_status())
46
-
47
- gr.Markdown("---") # Разделительная линия
48
-
49
- system_prompt = gr.Textbox(
50
- label="System Prompt",
51
- value="Вы опытный программист. Пишите чистый и эффективный код.",
52
- lines=5,
53
- )
54
- temperature = gr.Slider(
55
- 0.0, 1.0, value=config.DEFAULT_TEMP, label="Temperature"
56
- )
57
- max_tokens = gr.Slider(
58
- 512, 8192, value=config.DEFAULT_MAX_TOKENS, label="Max Tokens"
59
- )
60
- clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
61
-
62
- # --- ПРАВАЯ КОЛОНКА (Чат) ---
63
- with gr.Column(scale=1, elem_id="col-chat-main"):
64
- chatbot = gr.Chatbot(
65
- elem_id="chatbot",
66
- label="Code Assistant",
67
- avatar_images=(None, "https://api.iconify.design/noto:robot.svg"),
68
- layout="bubble",
69
- render_markdown=True,
70
- )
71
-
72
- with gr.Row(elem_id="input-area"):
73
- msg = gr.Textbox(
74
- show_label=False,
75
- placeholder="Type your code question here...",
76
- lines=1,
77
- scale=9,
78
- autofocus=True,
79
- max_lines=10,
80
- container=False,
81
- )
82
- submit_btn = gr.Button(
83
- "Run ➤", variant="primary", scale=1, min_width=80
84
- )
85
-
86
- # --- СОБЫТИЯ ---
87
-
88
- # Обновление статуса по таймеру
89
- stats_timer.tick(
90
- get_system_status, outputs=system_status, show_progress="hidden"
91
- )
92
-
93
- # Логика отправки сообщений
94
- msg.submit(user_input, [msg, chatbot], [msg, chatbot], queue=False).then(
95
- lambda: set_interactive(False), None, [msg, submit_btn], queue=False
96
- ).then(
97
- bot_response, [chatbot, system_prompt, temperature, max_tokens], chatbot
98
- ).then(lambda: set_interactive(True), None, [msg, submit_btn], queue=False)
99
-
100
- submit_btn.click(user_input, [msg, chatbot], [msg, chatbot], queue=False).then(
101
- lambda: set_interactive(False), None, [msg, submit_btn], queue=False
102
- ).then(
103
- bot_response, [chatbot, system_prompt, temperature, max_tokens], chatbot
104
- ).then(lambda: set_interactive(True), None, [msg, submit_btn], queue=False)
105
-
106
- clear_btn.click(lambda: [], None, chatbot, queue=False)
107
-
108
- return demo