Rid3 committed on
Commit
8bf4672
·
verified ·
1 Parent(s): 9560ef7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -15
app.py CHANGED
@@ -1,21 +1,20 @@
1
  from fastapi import FastAPI, HTTPException
2
- from fastapi.middleware.cors import CORSMiddleware # Добавь это
3
  from pydantic import BaseModel
4
  from llama_cpp import Llama
5
  from huggingface_hub import hf_hub_download
6
  import gc
 
7
 
8
  app = FastAPI(title="RID3 QUANTUM AI API")
9
 
10
- # --- НАСТРОЙКА CORS ---
11
  app.add_middleware(
12
  CORSMiddleware,
13
- allow_origins=["*"], # Разрешить запросы с любых сайтов
14
  allow_credentials=True,
15
  allow_methods=["*"],
16
  allow_headers=["*"],
17
  )
18
- # ----------------------
19
 
20
  REPO_ID = "Rid3/xtime-v1beta-gguf-storage"
21
  current_llm = None
@@ -23,7 +22,7 @@ current_model_name = ""
23
 
24
  MODELS = {
25
  "medium": "xtime-v1beta-n-m_1p.gguf",
26
- "large": "xtime-v1beta-q4_K_M.gguf",
27
  "small": "xtime-v1beta-xp-r_2.gguf"
28
  }
29
 
@@ -31,9 +30,12 @@ def load_model(model_key: str):
31
  global current_llm, current_model_name
32
  filename = MODELS.get(model_key)
33
  if not filename:
34
- raise HTTPException(status_code=404, detail="Модель не найдена")
 
35
  if current_model_name == model_key:
36
  return
 
 
37
 
38
  if current_llm is not None:
39
  del current_llm
@@ -41,13 +43,24 @@ def load_model(model_key: str):
41
 
42
  try:
43
  model_path = hf_hub_download(repo_id=REPO_ID, filename=filename)
44
- current_llm = Llama(model_path=model_path, n_ctx=2048, n_threads=4)
 
 
 
 
 
 
 
 
45
  current_model_name = model_key
46
  except Exception as e:
 
47
  raise HTTPException(status_code=500, detail=str(e))
48
 
49
  @app.on_event("startup")
50
  async def startup_event():
 
 
51
  load_model("large")
52
 
53
  class ChatRequest(BaseModel):
@@ -58,11 +71,20 @@ class ChatRequest(BaseModel):
58
  async def chat(request: ChatRequest):
59
  if request.model_type != current_model_name:
60
  load_model(request.model_type)
61
-
62
- output = current_llm(
63
- f"User: {request.prompt}\nAI:",
64
- max_tokens=256,
65
- stop=["User:", "\n"],
66
- echo=False
67
- )
68
- return {"response": output["choices"][0]["text"].strip()}
 
 
 
 
 
 
 
 
 
 
1
  from fastapi import FastAPI, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel
4
  from llama_cpp import Llama
5
  from huggingface_hub import hf_hub_download
6
  import gc
7
+ import os
8
 
9
  app = FastAPI(title="RID3 QUANTUM AI API")
10
 
 
11
  app.add_middleware(
12
  CORSMiddleware,
13
+ allow_origins=["*"],
14
  allow_credentials=True,
15
  allow_methods=["*"],
16
  allow_headers=["*"],
17
  )
 
18
 
19
  REPO_ID = "Rid3/xtime-v1beta-gguf-storage"
20
  current_llm = None
 
22
 
23
  MODELS = {
24
  "medium": "xtime-v1beta-n-m_1p.gguf",
25
+ "large": "xtime-v1beta-q4_K_M.gguf", # Это Llama 3.2 (mllama)
26
  "small": "xtime-v1beta-xp-r_2.gguf"
27
  }
28
 
 
30
  global current_llm, current_model_name
31
  filename = MODELS.get(model_key)
32
  if not filename:
33
+ raise HTTPException(status_code=404, detail="Model not found")
34
+
35
  if current_model_name == model_key:
36
  return
37
+
38
+ print(f"--- Loading MLLAMA Architecture: {filename} ---")
39
 
40
  if current_llm is not None:
41
  del current_llm
 
43
 
44
  try:
45
  model_path = hf_hub_download(repo_id=REPO_ID, filename=filename)
46
+
47
+ # For mllama it is important to use an up-to-date engine
48
+ current_llm = Llama(
49
+ model_path=model_path,
50
+ n_ctx=2048, # Увеличь, если нужно больше памяти
51
+ n_threads=4,
52
+ verbose=False, # Меньше логов — меньше шансов на ошибку переполнения
53
+ chat_format="llama-3" # Явно указываем формат для Llama 3.2
54
+ )
55
  current_model_name = model_key
56
  except Exception as e:
57
+ print(f"Error loading model: {e}")
58
  raise HTTPException(status_code=500, detail=str(e))
59
 
60
  @app.on_event("startup")
61
  async def startup_event():
62
+ # Try loading "medium" instead if "large" (mllama) is still not supported by your build
63
+ # With the new Dockerfile, however, 'large' should load successfully
64
  load_model("large")
65
 
66
  class ChatRequest(BaseModel):
 
71
  async def chat(request: ChatRequest):
72
  if request.model_type != current_model_name:
73
  load_model(request.model_type)
74
+
75
+ try:
76
+ # Use the create_chat_completion method for better compatibility with Llama 3
77
+ output = current_llm.create_chat_completion(
78
+ messages=[
79
+ {"role": "system", "content": "You are a helpful assistant."},
80
+ {"role": "user", "content": request.prompt}
81
+ ],
82
+ max_tokens=512
83
+ )
84
+ return {"response": output["choices"][0]["message"]["content"].strip()}
85
+ except Exception as e:
86
+ return {"error": str(e)}
87
+
88
+ @app.get("/")
89
+ async def health():
90
+ return {"status": "online", "model": current_model_name}