"""FastAPI service that streams chat completions from a local Qwen GGUF model.

Endpoints:
    POST /v1/chat/completions -- streams newline-delimited JSON token chunks.
    GET  /health              -- reports service status and the model file name.
"""

import asyncio
import json
import os

from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

from prompts import build_system_prompt
from search_engine import search_web

app = FastAPI()
# NOTE: CORS is fully open (any origin, method and header).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

MODEL_REPO = "bartowski/Qwen_Qwen3.6-35B-A3B-GGUF"
MODEL_FILE = "Qwen_Qwen3.6-35B-A3B-IQ3_M.gguf"

# Keep only the last few messages to save context memory on the free server.
MAX_HISTORY_MESSAGES = 5

# Roles rendered verbatim in the prompt; any other role falls back to
# "assistant".
_TEMPLATE_ROLES = ("system", "user", "assistant")

# Global model handle, populated by load_model().
llm = None

# Created lazily from inside a coroutine (see _get_generation_lock) so the
# lock is always associated with the event loop that serves requests.
_generation_lock = None


def load_model():
    """Fetch the model file and initialise the global ``llm`` handle.

    Does nothing when ``llm`` is already set, so repeated calls are cheap.
    """
    global llm
    if llm is not None:
        return

    print("⬇️ جاري تحميل النموذج...")
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
    llm = Llama(
        model_path=model_path,
        # Balanced for 18GB RAM (can be raised to 3076 if more RAM is
        # available).
        n_ctx=2048,
        n_threads=4,
        n_gpu_layers=0,
        use_mmap=True,
        verbose=False,
    )
    print("✅ تم تحميل النموذج بنجاح.")


@app.on_event("startup")
def startup():
    """Load the model once when the application starts."""
    load_model()


def format_qwen_chat(messages: list, system_prompt: str) -> str:
    """Build the Qwen3 chat prompt while preserving conversation context.

    Args:
        messages: Conversation as dicts with ``"role"`` and ``"content"``
            keys. Only the last ``MAX_HISTORY_MESSAGES`` entries are used.
        system_prompt: Text placed in the leading system turn.

    Returns:
        The prompt string, ending with an open assistant turn for the model
        to complete.

    Raises:
        KeyError: If a message lacks ``"role"`` or ``"content"``.
    """
    parts = [f"<|im_start|>system\n{system_prompt}<|im_end|>\n"]
    for msg in messages[-MAX_HISTORY_MESSAGES:]:
        # "system" must stay "system": the search-results message injected by
        # generate_stream would otherwise be rendered as an assistant turn.
        role = msg["role"] if msg["role"] in _TEMPLATE_ROLES else "assistant"
        parts.append(f"<|im_start|>{role}\n{msg['content']}<|im_end|>\n")
    parts.append("<|im_start|>assistant\n")
    return "".join(parts)


def _get_generation_lock():
    """Return the process-wide generation lock, creating it on first use."""
    global _generation_lock
    if _generation_lock is None:
        _generation_lock = asyncio.Lock()
    return _generation_lock


async def generate_stream(messages: list, mode: str):
    """Stream the model's reply as newline-delimited JSON chunks.

    Args:
        messages: Non-empty conversation as dicts with ``"role"`` and
            ``"content"`` keys. The caller's list is not modified.
        mode: Passed to ``build_system_prompt``; the value ``"search"``
            additionally injects web search results for the last message.

    Yields:
        One ``{"token": <text>}`` JSON document per generated chunk, each
        terminated by a newline.
    """
    system_prompt = build_system_prompt(mode)

    # Search mode: inject the results together with clear instructions.
    if mode == "search":
        query = messages[-1]["content"]
        # NOTE(review): search_web is called synchronously, so it blocks the
        # event loop for the duration of the search.
        search_res = search_web(query)
        # Add the results as a system message just before the last message,
        # working on a copy so the caller's list is left untouched.
        messages = messages.copy()
        messages.insert(
            -1,
            {
                "role": "system",
                "content": (
                    f"[SEARCH RESULTS]\n{search_res}\n\n"
                    "INSTRUCTION: Use the above results to answer accurately. "
                    "If irrelevant, rely on your knowledge."
                ),
            },
        )

    prompt = format_qwen_chat(messages, system_prompt)

    # The loop below awaits between tokens, which lets another request start
    # generating on the same shared model object. Serialize whole generations
    # so they cannot interleave (assumes the model object does not support
    # interleaved generations -- confirm against llama_cpp).
    async with _get_generation_lock():
        # Generation settings tuned for large MoE models.
        # NOTE(review): each step of this iterator runs on the event loop
        # thread; the sleep below is the only point where other handlers run.
        for token in llm(
            prompt,
            max_tokens=2048,
            stop=["<|im_end|>", "<|user|>"],
            stream=True,
            temperature=0.7,
            repeat_penalty=1.1,  # prevent repetition
            top_p=0.9,
        ):
            yield json.dumps({"token": token["choices"][0]["text"]}) + "\n"
            await asyncio.sleep(0.01)


def _is_valid_message(message) -> bool:
    """Return True if ``message`` has the keys the prompt builder reads."""
    return (
        isinstance(message, dict)
        and "role" in message
        and "content" in message
    )


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Validate the request body and stream a completion.

    Expects a JSON object with a non-empty ``messages`` list and an optional
    ``mode`` (default ``"chat"``).

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not an object,
            has no messages, or contains a malformed message.
    """
    try:
        data = await request.json()
    except ValueError as exc:  # JSON and text decoding errors both derive from it
        raise HTTPException(400, "Request body must be valid JSON") from exc
    if not isinstance(data, dict):
        raise HTTPException(400, "Request body must be a JSON object")

    messages = data.get("messages", [])
    mode = data.get("mode", "chat")
    if not messages:
        raise HTTPException(400, "No messages provided")
    # Validate up front: once streaming starts the 200 status line has
    # already been sent, so a later KeyError could not become a clean 4xx.
    if not isinstance(messages, list) or not all(
        _is_valid_message(message) for message in messages
    ):
        raise HTTPException(
            400, "Each message must be an object with 'role' and 'content'"
        )

    # NOTE(review): the body is newline-delimited JSON rather than a single
    # JSON document; the media type is left unchanged for existing clients.
    return StreamingResponse(
        generate_stream(messages, mode), media_type="application/json"
    )


@app.get("/health")
def health():
    """Report service status and the configured model file name."""
    return {"status": "ok", "model": MODEL_FILE}