Spaces:

Rid3
/

xtime-api

Sleeping

App Files Community

Rid3 committed on Mar 27

Commit

8bf4672

verified ·

1 Parent(s): 9560ef7

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -15

app.py CHANGED Viewed

@@ -1,21 +1,20 @@
 from fastapi import FastAPI, HTTPException
-from fastapi.middleware.cors import CORSMiddleware # Добавь это
 from pydantic import BaseModel
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import gc
 app = FastAPI(title="RID3 QUANTUM AI API")
-# --- НАСТРОЙКА CORS ---
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # Разрешить запросы с любых сайтов
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
-# ----------------------
 REPO_ID = "Rid3/xtime-v1beta-gguf-storage"
 current_llm = None
@@ -23,7 +22,7 @@ current_model_name = ""
 MODELS = {
     "medium": "xtime-v1beta-n-m_1p.gguf",
-    "large": "xtime-v1beta-q4_K_M.gguf",
     "small": "xtime-v1beta-xp-r_2.gguf"
 }
@@ -31,9 +30,12 @@ def load_model(model_key: str):
     global current_llm, current_model_name
     filename = MODELS.get(model_key)
     if not filename:
-        raise HTTPException(status_code=404, detail="Модель не найдена")
     if current_model_name == model_key:
         return
     if current_llm is not None:
         del current_llm
@@ -41,13 +43,24 @@ def load_model(model_key: str):
     try:
         model_path = hf_hub_download(repo_id=REPO_ID, filename=filename)
-        current_llm = Llama(model_path=model_path, n_ctx=2048, n_threads=4)
         current_model_name = model_key
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 @app.on_event("startup")
 async def startup_event():
     load_model("large")
 class ChatRequest(BaseModel):
@@ -58,11 +71,20 @@ class ChatRequest(BaseModel):
 async def chat(request: ChatRequest):
     if request.model_type != current_model_name:
         load_model(request.model_type)
-    output = current_llm(
-        f"User: {request.prompt}\nAI:",
-        max_tokens=256,
-        stop=["User:", "\n"],
-        echo=False
-    )
-    return {"response": output["choices"][0]["text"].strip()}

 from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import gc
+import os
 app = FastAPI(title="RID3 QUANTUM AI API")
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
 REPO_ID = "Rid3/xtime-v1beta-gguf-storage"
 current_llm = None
 MODELS = {
     "medium": "xtime-v1beta-n-m_1p.gguf",
+    "large": "xtime-v1beta-q4_K_M.gguf", # Это Llama 3.2 (mllama)
     "small": "xtime-v1beta-xp-r_2.gguf"
 }
     global current_llm, current_model_name
     filename = MODELS.get(model_key)
     if not filename:
+        raise HTTPException(status_code=404, detail="Model not found")
     if current_model_name == model_key:
         return
+    print(f"--- Loading MLLAMA Architecture: {filename} ---")
     if current_llm is not None:
         del current_llm
     try:
         model_path = hf_hub_download(repo_id=REPO_ID, filename=filename)
+        # Для mllama важно использовать свежий движок
+        current_llm = Llama(
+            model_path=model_path,
+            n_ctx=2048,           # Увеличь, если нужно больше памяти
+            n_threads=4,
+            verbose=False,        # Меньше логов — меньше шансов на ошибку переполнения
+            chat_format="llama-3" # Явно указываем формат для Llama 3.2
+        )
         current_model_name = model_key
     except Exception as e:
+        print(f"Error loading model: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 @app.on_event("startup")
 async def startup_event():
+    # Попробуем загрузить medium, если large (mllama) все еще не поддерживается твоим билдом
+    # Но с новым Dockerfile 'large' должен завестись
     load_model("large")
 class ChatRequest(BaseModel):
 async def chat(request: ChatRequest):
     if request.model_type != current_model_name:
         load_model(request.model_type)
+    try:
+        # Используем метод create_chat_completion для лучшей совместимости с Llama 3
+        output = current_llm.create_chat_completion(
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": request.prompt}
+            ],
+            max_tokens=512
+        )
+        return {"response": output["choices"][0]["message"]["content"].strip()}
+    except Exception as e:
+        return {"error": str(e)}
+@app.get("/")
+async def health():
+    return {"status": "online", "model": current_model_name}