Spaces:

Vishinka
/

Code_LLM

Sleeping

App Files Files Community

AnatoliiG committed on Jan 18

Commit

54f6dae

1 Parent(s): 9834c86

refactor code

Browse files

Files changed (5) hide show

app.py +12 -135
config.py +9 -0
model.py +44 -0
ui.py +165 -0
utils.py +8 -0

app.py CHANGED Viewed

@@ -1,37 +1,17 @@
 import json
-import traceback
-import gradio as gr
 import uvicorn
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse
 from gradio import mount_gradio_app
-from huggingface_hub import hf_hub_download
-from llama_cpp import Llama
-# --- КОНФИГУРАЦИЯ ---
-REPO_ID = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
-FILENAME = "qwen2.5-coder-7b-instruct-q5_k_m.gguf"
-CONTEXT_SIZE = 8192
-DEFAULT_MAX_TOKENS = 4096
-print(f"Loading model {REPO_ID}...")
-try:
-    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
-    llm = Llama(
-        model_path=model_path,
-        n_ctx=CONTEXT_SIZE,
-        n_threads=4,  # Оптимизация для CPU Spaces
-        n_gpu_layers=0,  # Явно указываем 0 для CPU
-        n_batch=512,
-        verbose=True,
-    )
-except Exception as e:
-    print(f"Critical Error: {e}")
-    llm = None
-# --- API (FastAPI) ---
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
@@ -42,19 +22,20 @@ app.add_middleware(
 )
 @app.post("/v1/chat/completions")
 async def chat_completions(request: Request):
-    if not llm:
         return JSONResponse(content={"error": "Model not loaded"}, status_code=500)
     try:
         data = await request.json()
         messages = data.get("messages", [])
         stream = data.get("stream", False)
-        temperature = data.get("temperature", 0.4)
-        max_tokens = data.get("max_tokens", DEFAULT_MAX_TOKENS)
-        output = llm.create_chat_completion(
             messages=messages,
             max_tokens=max_tokens,
             temperature=temperature,
@@ -75,112 +56,8 @@ async def chat_completions(request: Request):
         return JSONResponse(content={"error": str(e)}, status_code=500)
-# --- ЛОГИКА ГЕНЕРАЦИИ ДЛЯ GRADIO ---
-def user_input(user_message, history):
-    # Если история пуста, инициализируем список
-    if history is None:
-        history = []
-    # Возвращаем список словарей (формат Gradio 5)
-    return "", history + [{"role": "user", "content": user_message}]
-def bot_response(history, system_prompt, temperature, max_tokens):
-    if not llm:
-        history.append({"role": "assistant", "content": "Error: Model failed to load."})
-        yield history
-        return
-    # Формируем сообщения для Llama
-    messages = [{"role": "system", "content": system_prompt}]
-    # Берем последние 10 сообщений для контекста
-    relevant_history = history[-10:] if len(history) > 10 else history
-    for msg in relevant_history:
-        content = msg["content"]
-        if isinstance(content, list):
-            content = "\n".join(str(item) for item in content)
-        messages.append({"role": msg["role"], "content": str(content)})
-    history.append({"role": "assistant", "content": ""})
-    partial_text = ""
-    try:
-        stream = llm.create_chat_completion(
-            messages=messages,
-            max_tokens=int(max_tokens),
-            temperature=float(temperature),
-            stream=True,
-        )
-        for chunk in stream:
-            delta = chunk["choices"][0]["delta"]
-            if "content" in delta:
-                partial_text += delta["content"]
-                # Обновляем последнее сообщение
-                history[-1]["content"] = partial_text
-                yield history
-    except Exception as e:
-        traceback.print_exc()
-        history[-1]["content"] = partial_text + f"\n\n❌ **Error:** {str(e)}"
-        yield history
-# --- ИНТЕРФЕЙС (Gradio Blocks) ---
-custom_css = """
-#chatbot {
-    height: 70vh !important;
-    overflow: auto;
-}
-"""
-theme = gr.themes.Soft(primary_hue="blue", text_size="lg")
-with gr.Blocks(theme=theme, css=custom_css, title="Qwen Coder Pro") as demo:
-    gr.Markdown("# 💻 Qwen 2.5 Coder Assistant")
-    with gr.Row():
-        # Настройки
-        with gr.Column(scale=1, min_width=250):
-            gr.Markdown("### ⚙️ Settings")
-            system_prompt = gr.Textbox(
-                label="System Prompt",
-                value="Ты экспертный агент-кодер. Напиши чистый код.",
-                lines=3,
-            )
-            temperature = gr.Slider(0.0, 1.0, value=0.4, label="Temperature")
-            max_tokens = gr.Slider(512, 8192, value=4096, label="Max Tokens")
-            clear_btn = gr.Button("🗑️ Clear Chat")
-        # Чат
-        with gr.Column(scale=4):
-            chatbot = gr.Chatbot(
-                label="Conversation",
-                elem_id="chatbot",
-                avatar_images=(None, "https://api.iconify.design/noto:robot.svg"),
-            )
-            msg = gr.Textbox(
-                show_label=False, placeholder="Type your code question here...", lines=2
-            )
-            submit_btn = gr.Button("Run ➤", variant="primary")
-    # Связка событий
-    msg.submit(user_input, [msg, chatbot], [msg, chatbot], queue=False).then(
-        bot_response, [chatbot, system_prompt, temperature, max_tokens], chatbot
-    )
-    submit_btn.click(user_input, [msg, chatbot], [msg, chatbot], queue=False).then(
-        bot_response, [chatbot, system_prompt, temperature, max_tokens], chatbot
-    )
-    # Очистка возвращает пустой список
-    clear_btn.click(lambda: [], None, chatbot, queue=False)
 app = mount_gradio_app(app, demo, path="/")
 if __name__ == "__main__":

+# app.py
 import json
 import uvicorn
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse
 from gradio import mount_gradio_app
+import config
+from model import engine
+from ui import create_ui
+# --- FastAPI Setup ---
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
 )
+# --- API Endpoints ---
 @app.post("/v1/chat/completions")
 async def chat_completions(request: Request):
+    if not engine.llm:
         return JSONResponse(content={"error": "Model not loaded"}, status_code=500)
     try:
         data = await request.json()
         messages = data.get("messages", [])
         stream = data.get("stream", False)
+        temperature = data.get("temperature", config.DEFAULT_TEMP)
+        max_tokens = data.get("max_tokens", config.DEFAULT_MAX_TOKENS)
+        output = engine.generate(
             messages=messages,
             max_tokens=max_tokens,
             temperature=temperature,
         return JSONResponse(content={"error": str(e)}, status_code=500)
+# --- Mount Gradio ---
+demo = create_ui()
 app = mount_gradio_app(app, demo, path="/")
 if __name__ == "__main__":

config.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# Hugging Face Hub repository and file name of the GGUF model to download
+REPO_ID = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
+FILENAME = "qwen2.5-coder-7b-instruct-q5_k_m.gguf"
+# Model parameters
+CONTEXT_SIZE = 8192
+DEFAULT_MAX_TOKENS = 4096
+DEFAULT_TEMP = 0.4
+N_THREADS = 4
+N_GPU_LAYERS = 0  # 0 for CPU, -1 for GPU

model.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import json
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+from config import CONTEXT_SIZE, FILENAME, N_GPU_LAYERS, N_THREADS, REPO_ID
+class ModelEngine:
+    """Own the llama.cpp model: fetch it once, then serve chat completions."""
+    def __init__(self):
+        # Stays None when loading fails, so callers can test `engine.llm`.
+        self.llm = None
+        self._load_model()
+    def _load_model(self):
+        """Download the GGUF file and build the Llama instance.
+        Any failure is printed and leaves ``self.llm`` set to None.
+        """
+        print(f"Loading model {REPO_ID}...")
+        try:
+            gguf_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
+            llama_options = {
+                "model_path": gguf_path,
+                "n_ctx": CONTEXT_SIZE,
+                "n_threads": N_THREADS,
+                "n_gpu_layers": N_GPU_LAYERS,
+                "n_batch": 512,
+                "verbose": True,
+            }
+            self.llm = Llama(**llama_options)
+            print("Model loaded successfully.")
+        except Exception as load_error:
+            print(f"CRITICAL ERROR: Failed to load model. {load_error}")
+            self.llm = None
+    def generate(self, messages, max_tokens, temperature, stream=True):
+        """Run a chat completion on the loaded model.
+        ``max_tokens`` is coerced to int and ``temperature`` to float before
+        the call. Returns whatever ``create_chat_completion`` returns.
+        Raises RuntimeError when no model is loaded.
+        """
+        model = self.llm
+        if not model:
+            raise RuntimeError("Model is not loaded.")
+        token_limit = int(max_tokens)
+        sampling_temperature = float(temperature)
+        return model.create_chat_completion(
+            messages=messages,
+            max_tokens=token_limit,
+            temperature=sampling_temperature,
+            stream=stream,
+        )
+# Create the global instance (singleton).
+# ModelEngine.__init__ calls _load_model(), so the model is loaded at import time.
+engine = ModelEngine()

ui.py ADDED Viewed

	@@ -0,0 +1,165 @@

+# ui.py
+import traceback
+import gradio as gr
+import config
+from model import engine
+from utils import sanitize_content
+# --- CSS styles ---
+# Pins the page to the viewport height and lets the chat panel scroll on its own.
+# NOTE: the property is spelled `max-height`; `max_height` is not a CSS property.
+CUSTOM_CSS = """
+body, .gradio-container {
+    overflow: hidden !important;
+    height: 100vh !important;
+    max-height: 100vh !important;
+}
+#chatbot {
+    height: 100% !important;
+    flex-grow: 1;
+    overflow: auto;
+    font-family: 'Consolas', 'Monaco', monospace;
+}
+"""
+# --- Event logic ---
+def user_input(user_message, history):
+    """Append the user's message to a sanitized copy of the chat history.
+    Returns a ``(textbox_value, history)`` pair. An empty message returns
+    ``(None, history)`` with the history untouched; otherwise the textbox
+    value is "" and the history is a new list ending with the user message.
+    """
+    if not user_message:
+        return None, history
+    previous = [] if history is None else history
+    # Rebuild earlier entries so every "content" value is a plain string
+    normalized = [
+        {"role": entry["role"], "content": sanitize_content(entry.get("content", ""))}
+        for entry in previous
+    ]
+    normalized.append({"role": "user", "content": str(user_message)})
+    return "", normalized
+def bot_response(history, system_prompt, temperature, max_tokens):
+    """Stream the assistant's reply into ``history``, yielding it after each chunk.
+    A generator: appends an assistant entry to ``history`` (mutated in place)
+    and yields the list every time new text arrives. If the model is not
+    loaded, or generation fails, the error text is placed in the assistant
+    entry instead of raising.
+    """
+    if not engine.llm:
+        history.append({"role": "assistant", "content": "Error: Model failed to load."})
+        yield history
+        return
+    # Build the prompt: system message plus the 15 most recent history entries
+    messages = [{"role": "system", "content": system_prompt}]
+    for msg in history[-15:]:
+        messages.append(
+            {"role": msg["role"], "content": sanitize_content(msg.get("content", ""))}
+        )
+    # Placeholder entry that the streamed text is written into
+    history.append({"role": "assistant", "content": ""})
+    # Bound before the try block so the except handler can always read it,
+    # even when engine.generate() itself raises before any chunk arrives.
+    partial_text = ""
+    try:
+        stream = engine.generate(
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            stream=True,
+        )
+        for chunk in stream:
+            delta = chunk["choices"][0]["delta"]
+            if "content" in delta:
+                partial_text += delta["content"]
+                history[-1]["content"] = partial_text
+                yield history
+    except Exception as e:
+        traceback.print_exc()
+        # Keep whatever was generated so far and append the error below it
+        history[-1]["content"] = partial_text + f"\n\n❌ **Error:** {str(e)}"
+        yield history
+def set_interactive(is_interactive):
+    """Return a pair of ``gr.update`` objects toggling interactivity.
+    The first update also swaps the placeholder text ("Wait for response..."
+    while disabled, "Type code question..." while enabled); the second only
+    sets ``interactive``.
+    """
+    if is_interactive:
+        hint = "Type code question..."
+    else:
+        hint = "Wait for response..."
+    first_update = gr.update(interactive=is_interactive, placeholder=hint)
+    second_update = gr.update(interactive=is_interactive)
+    return first_update, second_update
+# --- UI construction ---
+def create_ui():
+    """Build the Gradio Blocks app: settings sidebar, chat panel and event wiring.
+    Returns the ``gr.Blocks`` instance.
+    """
+    soft_theme = gr.themes.Soft(primary_hue="blue", text_size="lg")
+    blocks_options = {
+        "theme": soft_theme,
+        "css": CUSTOM_CSS,
+        "title": "Qwen Coder Pro",
+        "fill_height": True,
+    }
+    with gr.Blocks(**blocks_options) as demo:
+        # Sidebar: generation settings and the clear button
+        with gr.Sidebar():
+            gr.Markdown("### ⚙️ Settings")
+            system_prompt = gr.Textbox(
+                label="System Prompt",
+                value="You are an expert coding assistant. Write clean, efficient code.",
+                lines=5,
+            )
+            temperature = gr.Slider(
+                0.0, 1.0, value=config.DEFAULT_TEMP, label="Temperature"
+            )
+            max_tokens = gr.Slider(
+                512, 8192, value=config.DEFAULT_MAX_TOKENS, label="Max Tokens"
+            )
+            clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
+        # Main column: conversation view above the input row
+        with gr.Column(fill_height=True):
+            chatbot = gr.Chatbot(
+                label="Code Assistant",
+                elem_id="chatbot",
+                avatar_images=(None, "https://api.iconify.design/noto:robot.svg"),
+                show_copy_button=True,
+                scale=1,
+                bubble_full_width=False,
+            )
+            with gr.Row(variant="compact"):
+                msg = gr.Textbox(
+                    show_label=False,
+                    placeholder="Type your code question here...",
+                    lines=1,
+                    scale=8,
+                    autofocus=True,
+                    max_lines=3,
+                )
+                submit_btn = gr.Button(
+                    "Run ➤", variant="primary", scale=1, min_width=100
+                )
+        # Event chains: textbox submit first, then button click, each running
+        # user_input -> disable inputs -> bot_response -> re-enable inputs.
+        for trigger in (msg.submit, submit_btn.click):
+            appended = trigger(
+                user_input, [msg, chatbot], [msg, chatbot], queue=False
+            )
+            locked = appended.then(
+                lambda: set_interactive(False), None, [msg, submit_btn], queue=False
+            )
+            answered = locked.then(
+                bot_response, [chatbot, system_prompt, temperature, max_tokens], chatbot
+            )
+            answered.then(
+                lambda: set_interactive(True), None, [msg, submit_btn], queue=False
+            )
+        # Clearing the chat resets the history to an empty list
+        clear_btn.click(lambda: [], None, chatbot, queue=False)
+    return demo

utils.py ADDED Viewed

	@@ -0,0 +1,8 @@

+def sanitize_content(content):
+    """
+    Guarantee that the content is a string.
+    Fixes the Gradio bug where the text arrives as a list.
+    """
+    if content is None:
+        return ""
+    if isinstance(content, list):
+        # Stringify each item and put one per line
+        return "\n".join(map(str, content))
+    return str(content)