"""FastAPI service that streams completions from a local GGUF model (llama.cpp)."""

import asyncio
import json
import os
from typing import AsyncIterator, Optional

from fastapi import FastAPI, HTTPException, Request
from fastapi.concurrency import iterate_in_threadpool, run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

from prompts import build_system_prompt
from search_engine import search_web

app = FastAPI()
# NOTE(review): CORS is wide open (any origin, method and header). Confirm this
# is intended before exposing the service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

MODEL_REPO = "bartowski/Qwen_Qwen3.6-35B-A3B-GGUF"
MODEL_FILE = "Qwen_Qwen3.6-35B-A3B-IQ3_M.gguf"

# Process-wide model handle; populated once by load_model().
llm = None

# Serializes access to the single shared `llm` object. Created lazily (see
# _get_generation_lock) so that it is instantiated while the server's event
# loop is already running rather than at import time.
_generation_lock: Optional[asyncio.Lock] = None


def _get_generation_lock() -> asyncio.Lock:
    """Return the process-wide generation lock, creating it on first use."""
    global _generation_lock
    if _generation_lock is None:
        _generation_lock = asyncio.Lock()
    return _generation_lock


def load_model():
    """Download the GGUF file if needed and load it into the global ``llm``.

    Does nothing when ``llm`` is already set, so repeated calls are harmless.
    """
    global llm
    if llm is None:
        print("⬇️ جاري تحميل النموذج...")  # "Downloading the model..."
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        llm = Llama(
            model_path=model_path,
            n_ctx=1536,
            n_threads=4,
            n_gpu_layers=0,
            use_mmap=True,
            verbose=False,
        )
        print("✅ تم تحميل النموذج بنجاح.")  # "Model loaded successfully."


@app.on_event("startup")
def startup():
    """Load the model when the application starts."""
    load_model()


async def generate_stream(messages: list, mode: str) -> AsyncIterator[str]:
    """Yield newline-terminated JSON objects of the form ``{"token": "<text>"}``.

    Only the ``content`` of the last entry in ``messages`` is placed in the
    prompt. When ``mode`` is ``"search"``, the output of ``search_web`` for
    that content is appended to the user turn.

    The blocking work (web search and token generation) runs in the
    threadpool so the event loop stays responsive, and generation is guarded
    by a lock so that only one request drives the shared model at a time.

    Args:
        messages: Non-empty list whose last item is a mapping with a
            ``"content"`` key.
        mode: Passed to ``build_system_prompt``; ``"search"`` additionally
            enables web search.

    Raises:
        KeyError, IndexError, TypeError: If ``messages`` does not have the
            shape described above.
    """
    system_prompt = build_system_prompt(mode)
    user_msg = messages[-1]["content"]

    # NOTE(review): the <|system|>/<|user|>/<|assistant|> markers are kept
    # exactly as before; confirm they match the chat template of MODEL_FILE.
    if mode == "search":
        # search_web is called synchronously, so keep it off the event loop.
        search_res = await run_in_threadpool(search_web, user_msg)
        prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{user_msg}\n[SEARCH RESULTS]\n{search_res}\n<|assistant|>\n"
    else:
        prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{user_msg}\n<|assistant|>\n"

    # NOTE(review): max_tokens (1200) is close to n_ctx (1536), so a long
    # prompt or long search results may not fit the context window; verify how
    # the model call behaves in that case.
    async with _get_generation_lock():
        token_stream = llm(
            prompt,
            max_tokens=1200,
            stop=["<|user|>", "<|end|>"],
            stream=True,
            temperature=0.7,
        )
        # Each next() on the model's iterator runs in a worker thread.
        async for chunk in iterate_in_threadpool(token_stream):
            yield json.dumps({"token": chunk["choices"][0]["text"]}) + "\n"


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Validate the request body and stream the model's reply.

    Expected JSON body: ``{"messages": [..., {"content": ...}], "mode": "chat"}``
    where ``mode`` is optional and defaults to ``"chat"``.

    Raises:
        HTTPException: 400 for a malformed body or missing messages, 503 when
            the model has not been loaded.
    """
    try:
        data = await request.json()
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        raise HTTPException(400, "Invalid JSON body") from None
    if not isinstance(data, dict):
        raise HTTPException(400, "Request body must be a JSON object")

    messages = data.get("messages", [])
    mode = data.get("mode", "chat")
    if not messages:
        raise HTTPException(400, "No messages provided")
    if not isinstance(messages, list):
        raise HTTPException(400, "'messages' must be a list")

    # Validate here, before streaming starts: a failure inside the generator
    # would surface only after the response has begun.
    last_message = messages[-1]
    if not isinstance(last_message, dict) or "content" not in last_message:
        raise HTTPException(400, "Last message must be an object with 'content'")

    if llm is None:
        raise HTTPException(503, "Model is not loaded")

    # NOTE(review): the body is newline-delimited JSON objects, not a single
    # JSON document; the media type is left unchanged for existing clients.
    return StreamingResponse(
        generate_stream(messages, mode),
        media_type="application/json",
    )


@app.get("/health")
def health():
    """Return a static status payload that names the configured model file."""
    return {"status": "ok", "model": MODEL_FILE}