THEZYZSTUDIO committed on
Commit
8fca77c
·
verified ·
1 Parent(s): 9e15ca7

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -84
app.py DELETED
@@ -1,84 +0,0 @@
1
- import os, json, asyncio
2
- from fastapi import FastAPI, Request, HTTPException
3
- from fastapi.responses import StreamingResponse
4
- from fastapi.middleware.cors import CORSMiddleware
5
- from huggingface_hub import hf_hub_download
6
- from llama_cpp import Llama
7
- from prompts import build_system_prompt
8
- from search_engine import search_web
9
-
10
# Application instance. CORS is left fully open: any origin, method and header.
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Repository id and file name passed to hf_hub_download() by load_model().
MODEL_REPO = "unsloth/Qwen3.6-35B-A3B-GGUF"
MODEL_FILE = "Qwen3.6-35B-A3B-UD-Q3_K_XL.gguf"

# Shared model handle; stays None until load_model() has run.
llm = None
16
-
17
def load_model():
    """Fetch the GGUF weights and initialise the module-level ``llm`` handle.

    Returns immediately when ``llm`` is already set, so calling this more
    than once does not reload the model.
    """
    global llm
    if llm is not None:
        return

    print("⬇️ جاري تحميل النموذج...")
    weights_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
    llama_options = dict(
        model_path=weights_path,
        # Original author's note: balanced for 18 GB of RAM; may be raised
        # (the note suggests 3076) if more memory becomes available.
        n_ctx=2048,
        n_threads=4,
        n_gpu_layers=0,
        use_mmap=True,
        verbose=False,
    )
    llm = Llama(**llama_options)
    print("✅ تم تحميل النموذج بنجاح.")
31
-
32
@app.on_event("startup")
def startup():
    """Startup event handler: load the model via ``load_model()``."""
    load_model()
35
-
36
def format_qwen_chat(messages: list, system_prompt: str) -> str:
    """Build a ``<|im_start|>``/``<|im_end|>`` delimited prompt for the model.

    Args:
        messages: Conversation turns, each a mapping with ``"role"`` and
            ``"content"`` keys. Only the last five turns are rendered
            (the original author's note: to save context memory on the
            free server).
        system_prompt: Text placed in the leading system turn.

    Returns:
        The prompt string: the system turn, the retained history, and an
        opened assistant turn for the model to complete.

    Raises:
        KeyError: If a message lacks ``"role"`` or ``"content"``.
    """
    # "system" is preserved so that injected context messages (for example
    # search results added with role "system") are not mislabelled as
    # something the assistant said earlier. Any other role still falls back
    # to "assistant", as before.
    passthrough_roles = ("user", "system")

    turns = [f"<|im_start|>system\n{system_prompt}<|im_end|>\n"]
    # Slicing already copes with lists shorter than five items.
    for msg in messages[-5:]:
        role = msg["role"] if msg["role"] in passthrough_roles else "assistant"
        # NOTE(review): content is interpolated verbatim, so a message that
        # itself contains "<|im_end|>" can close its own turn early.
        turns.append(f"<|im_start|>{role}\n{msg['content']}<|im_end|>\n")
    turns.append("<|im_start|>assistant\n")
    return "".join(turns)
46
-
47
async def generate_stream(messages: list, mode: str):
    """Stream the model's reply as newline-delimited JSON chunks.

    Args:
        messages: Conversation turns (mappings with ``"role"`` and
            ``"content"``). The caller's list is never mutated.
        mode: Passed to ``build_system_prompt``. The value ``"search"``
            additionally runs a web search on the last message's content
            and injects the results just before that message.

    Yields:
        One line per generated chunk: ``{"token": <text>}`` serialised as
        JSON and terminated by a newline.
    """
    system_prompt = build_system_prompt(mode)

    # Search mode: inject the results together with explicit instructions.
    if mode == "search":
        query = messages[-1]["content"]
        # search_web is invoked as a plain synchronous call, so it is run in
        # the default executor to keep the event loop free while it works.
        # NOTE(review): assumes search_web is safe to call from a worker
        # thread - confirm.
        loop = asyncio.get_running_loop()
        search_res = await loop.run_in_executor(None, search_web, query)
        search_note = {
            "role": "system",
            "content": f"[SEARCH RESULTS]\n{search_res}\n\nINSTRUCTION: Use the above results to answer accurately. If irrelevant, rely on your knowledge.",
        }
        # New list with the results placed before the last message.
        messages = [*messages[:-1], search_note, messages[-1]]

    prompt = format_qwen_chat(messages, system_prompt)

    # Generation settings (original author's note: tuned for large MoE models).
    # NOTE(review): this is a synchronous iteration inside a coroutine, so
    # each step holds the event loop; the short sleep below is the only
    # point where other tasks get to run.
    for token in llm(
        prompt,
        max_tokens=2048,
        stop=["<|im_end|>", "<|user|>"],
        stream=True,
        temperature=0.7,
        repeat_penalty=1.1,  # discourage repetition
        top_p=0.9,
    ):
        yield json.dumps({"token": token["choices"][0]["text"]}) + "\n"
        await asyncio.sleep(0.01)
72
-
73
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Validate the request body and start streaming a completion.

    Expects a JSON object with a non-empty ``"messages"`` list, whose items
    are objects carrying ``"role"`` and ``"content"``, and an optional
    ``"mode"`` (default ``"chat"``).

    Returns:
        A ``StreamingResponse`` wrapping ``generate_stream(messages, mode)``.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, has no messages, or contains a malformed message.
    """
    try:
        data = await request.json()
    except ValueError as err:  # json.JSONDecodeError subclasses ValueError
        raise HTTPException(400, "Request body must be valid JSON") from err
    if not isinstance(data, dict):
        raise HTTPException(400, "Request body must be a JSON object")

    messages = data.get("messages", [])
    mode = data.get("mode", "chat")
    if not messages:
        raise HTTPException(400, "No messages provided")

    # Reject malformed messages here: once streaming has begun the status
    # line is already sent and a failure can no longer become a clean 4xx.
    well_formed = isinstance(messages, list) and all(
        isinstance(msg, dict) and "role" in msg and "content" in msg
        for msg in messages
    )
    if not well_formed:
        raise HTTPException(400, "Each message must be an object with 'role' and 'content'")

    # NOTE(review): the stream is newline-delimited JSON while the declared
    # media type is plain "application/json"; kept unchanged for existing
    # clients.
    return StreamingResponse(generate_stream(messages, mode), media_type="application/json")
81
-
82
@app.get("/health")
def health():
    """Health-check endpoint: report a fixed status and the model file name."""
    payload = {"status": "ok", "model": MODEL_FILE}
    return payload