import asyncio
import json
import os
from concurrent.futures import ThreadPoolExecutor

from fastapi import FastAPI, Request, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

from prompts import build_system_prompt
from search_engine import search_web
|
|
# --- Model artifact ---------------------------------------------------------
# Hugging Face repository and GGUF file name that load_model() downloads.
MODEL_REPO = "bartowski/Qwen_Qwen3.6-35B-A3B-GGUF"
MODEL_FILE = "Qwen_Qwen3.6-35B-A3B-IQ3_M.gguf"

# Process-wide model handle; stays None until load_model() assigns it.
llm = None

# --- HTTP application -------------------------------------------------------
app = FastAPI()
# CORS is fully open: every origin, method and header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
def load_model():
    """Download the GGUF weights and initialise the shared ``llm`` handle.

    Does nothing when ``llm`` is already set, so repeated calls are harmless.
    Progress is reported on stdout.
    """
    global llm
    if llm is not None:
        return

    print("⬇️ جاري تحميل النموذج...")
    weights_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)

    # Runtime settings handed to llama.cpp; n_gpu_layers=0 requests no GPU
    # offload.
    runtime_options = {
        "n_ctx": 2048,
        "n_threads": 4,
        "n_gpu_layers": 0,
        "use_mmap": True,
        "verbose": False,
    }
    llm = Llama(model_path=weights_path, **runtime_options)
    print("✅ تم تحميل النموذج بنجاح.")
|
|
@app.on_event("startup")
def startup():
    """Load the model once, when the application starts."""
    # NOTE(review): FastAPI documents on_event as deprecated in favour of
    # lifespan handlers; migrating would also touch the FastAPI() constructor.
    load_model()
|
|
def format_qwen_chat(messages: list, system_prompt: str, max_history: int = 5) -> str:
    """Render a conversation as a Qwen (ChatML-style) prompt string.

    Args:
        messages: Chat turns, each a dict with ``"role"`` and ``"content"``.
        system_prompt: Text of the leading system turn.
        max_history: Number of most recent turns to keep; older turns are
            dropped. A value <= 0 keeps no history at all.

    Returns:
        The prompt text, ending with an open assistant turn for the model to
        complete.

    Raises:
        KeyError: If a kept message lacks ``"role"`` or ``"content"``.
    """
    # Guard max_history <= 0 explicitly: messages[-0:] would return everything.
    recent = messages[-max_history:] if max_history > 0 else []

    parts = [f"<|im_start|>system\n{system_prompt}<|im_end|>\n"]
    for msg in recent:
        # Keep "system" turns as system turns (search mode injects one);
        # previously they were mislabelled as assistant output. Any other
        # unknown role still falls back to "assistant".
        role = msg["role"] if msg["role"] in ("user", "system") else "assistant"
        parts.append(f"<|im_start|>{role}\n{msg['content']}<|im_end|>\n")
    parts.append("<|im_start|>assistant\n")
    return "".join(parts)
|
|
# Dedicated single worker for model calls: generation runs off the event loop,
# yet calls into the shared model still happen one at a time, as they did when
# the loop iterated the model directly.
_LLM_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="llm")
# Returned by next() once the token stream is exhausted (StopIteration cannot
# be propagated through an executor future).
_STREAM_END = object()


async def generate_stream(messages: list, mode: str):
    """Stream the model's reply as newline-delimited JSON objects.

    Args:
        messages: Non-empty list of chat turns (dicts with ``"role"`` and
            ``"content"``). The list is not modified.
        mode: Prompt mode passed to ``build_system_prompt``. In ``"search"``
            mode the last message's content is used as a web-search query and
            the results are appended to the system prompt.

    Yields:
        One ``{"token": "<text>"}`` JSON document per generated chunk, each
        terminated by a newline.
    """
    loop = asyncio.get_running_loop()
    system_prompt = build_system_prompt(mode)

    if mode == "search":
        query = messages[-1]["content"]
        # search_web is called synchronously, so run it in a worker thread
        # instead of stalling every other request on the event loop.
        search_res = await loop.run_in_executor(None, search_web, query)
        # Fold the results into the leading system turn. Injecting them as an
        # extra chat message relied on the formatter keeping the "system"
        # role and also consumed one slot of the limited history window.
        system_prompt = (
            f"{system_prompt}\n\n"
            f"[SEARCH RESULTS]\n{search_res}\n\n"
            "INSTRUCTION: Use the above results to answer accurately. "
            "If irrelevant, rely on your knowledge."
        )

    prompt = format_qwen_chat(messages, system_prompt)

    def _open_stream():
        # Started on the worker thread too, in case the call itself does
        # work before returning its iterator.
        return iter(
            llm(
                prompt,
                max_tokens=2048,
                stop=["<|im_end|>", "<|user|>"],
                stream=True,
                temperature=0.7,
                repeat_penalty=1.1,
                top_p=0.9,
            )
        )

    chunks = await loop.run_in_executor(_LLM_EXECUTOR, _open_stream)
    # NOTE(review): two simultaneous requests still interleave chunk-by-chunk
    # on the one shared model, as before — confirm whether whole generations
    # need to be serialised.
    while True:
        # Each chunk is produced off the event loop, so no artificial sleep
        # is needed to let other tasks run.
        chunk = await loop.run_in_executor(_LLM_EXECUTOR, next, chunks, _STREAM_END)
        if chunk is _STREAM_END:
            break
        yield json.dumps({"token": chunk["choices"][0]["text"]}) + "\n"
|
|
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Validate a chat request and stream the model's reply.

    Expects a JSON object with:
        messages: Non-empty list of objects carrying ``"role"`` and
            ``"content"``.
        mode: Optional prompt mode, defaults to ``"chat"``.

    Returns:
        A streaming response of newline-delimited JSON token objects.

    Raises:
        HTTPException: 400 when the body is not a JSON object, when no
            messages are supplied, or when a message is malformed.
    """
    try:
        data = await request.json()
    except ValueError as exc:  # covers JSONDecodeError and bad encodings
        raise HTTPException(400, "Request body must be valid JSON") from exc
    if not isinstance(data, dict):
        raise HTTPException(400, "Request body must be a JSON object")

    messages = data.get("messages", [])
    mode = data.get("mode", "chat")
    if not messages:
        raise HTTPException(400, "No messages provided")

    # Reject malformed messages now: once streaming has begun the response
    # status is already sent, so a missing key could only abort the stream.
    well_formed = isinstance(messages, list) and all(
        isinstance(msg, dict) and "role" in msg and "content" in msg
        for msg in messages
    )
    if not well_formed:
        raise HTTPException(400, "Each message must be an object with 'role' and 'content'")

    # NOTE(review): the body is newline-delimited JSON rather than a single
    # JSON document; "application/x-ndjson" may describe it better, but the
    # media type is left unchanged for existing clients.
    return StreamingResponse(generate_stream(messages, mode), media_type="application/json")
|
|
@app.get("/health")
def health():
    """Liveness probe: report a fixed OK status and the configured model file."""
    payload = {"status": "ok"}
    payload["model"] = MODEL_FILE
    return payload