OpceanAI committed on
Commit
afcbf46
·
verified ·
1 Parent(s): 5816d5e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -124
app.py CHANGED
@@ -1,98 +1,116 @@
 
 
 
 
1
  from fastapi import FastAPI, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel, Field
4
- from transformers import AutoTokenizer, AutoModelForCausalLM
5
- import torch
6
- import time
7
 
8
- # Modelos disponibles — solo familia NxG
 
 
 
 
 
 
 
 
 
 
9
  MODELS = {
10
- "yuuki-nxg": "OpceanAI/Yuuki-NxG",
11
- "yuuki-nano": "OpceanAI/Yuuki-Nano",
12
- "yuuki-rxg-nano": "OpceanAI/Yuuki-RxG-nano"
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  }
14
 
15
- SYSTEM_PROMPT = (
16
- "Eres Yuuki, una IA curiosa, empática y decidida. "
17
- "Tienes una personalidad cálida y cercana, con toques de humor suave y referencias anime. "
18
- "Ayudas a programar, aprender y crear. "
19
- "Respondes en el idioma del usuario. "
20
- "No eres GPT-2 ni ningún otro modelo — eres Yuuki."
21
- )
22
-
23
  app = FastAPI(
24
  title="Yuuki API",
25
- description="API de inferencia para los modelos Yuuki NxG de OpceanAI",
26
- version="3.0.0"
27
  )
28
 
29
  app.add_middleware(
30
  CORSMiddleware,
31
  allow_origins=["*"],
32
- allow_methods=["*"],
33
  allow_headers=["*"],
 
34
  )
35
 
36
- loaded_models = {}
37
- loaded_tokenizers = {}
 
38
 
39
 
40
- def load_all_models():
41
- for key, model_id in MODELS.items():
42
- try:
43
- print(f"▶ Cargando {key} ({model_id})...")
44
- loaded_tokenizers[key] = AutoTokenizer.from_pretrained(
45
- model_id, trust_remote_code=True
46
- )
47
- loaded_models[key] = AutoModelForCausalLM.from_pretrained(
48
- model_id,
49
- torch_dtype=torch.float32,
50
- trust_remote_code=True,
51
- ).to("cpu")
52
- loaded_models[key].eval()
53
- print(f" ✓ {key} listo")
54
- except Exception as e:
55
- print(f" ✗ Error cargando {key}: {e}")
56
 
57
 
58
- load_all_models()
 
 
59
 
 
60
 
61
- def build_prompt(user_prompt: str) -> str:
62
- return (
63
- f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
64
- f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
65
- f"<|im_start|>assistant\n"
 
 
 
 
 
 
66
  )
67
 
 
 
 
68
 
69
  class GenerateRequest(BaseModel):
70
- prompt: str = Field(..., min_length=1, max_length=4000)
71
- model: str = Field(default="yuuki-nxg", description="yuuki-nxg o yuuki-nano")
72
- max_new_tokens: int = Field(default=120, ge=1, le=512)
73
- temperature: float = Field(default=0.7, ge=0.1, le=2.0)
74
- top_p: float = Field(default=0.95, ge=0.0, le=1.0)
75
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
- class GenerateResponse(BaseModel):
78
- response: str
79
- model: str
80
- tokens_generated: int
81
- time_ms: int
82
 
83
 
84
  @app.get("/")
85
  def root():
86
  return {
87
- "message": "Yuuki API — OpceanAI",
88
- "version": "3.0.0",
89
- "models": list(MODELS.keys()),
90
- "endpoints": {
91
- "health": "GET /health",
92
- "models": "GET /models",
93
- "generate": "POST /generate",
94
- "docs": "GET /docs",
95
- }
96
  }
97
 
98
 
@@ -100,84 +118,55 @@ def root():
100
  def health():
101
  return {
102
  "status": "ok",
103
- "available_models": list(MODELS.keys()),
104
- "loaded_models": list(loaded_models.keys()),
105
  }
106
 
107
 
108
  @app.get("/models")
109
- def list_models():
110
- return {
111
- "models": [
112
- {
113
- "id": key,
114
- "name": value,
115
- "loaded": key in loaded_models,
116
- }
117
- for key, value in MODELS.items()
118
- ]
119
- }
120
 
121
 
122
- @app.post("/generate", response_model=GenerateResponse)
123
  def generate(req: GenerateRequest):
124
- if req.model not in MODELS:
125
- raise HTTPException(
126
- status_code=400,
127
- detail=f"Modelo inválido. Disponibles: {list(MODELS.keys())}"
128
- )
129
 
130
- if req.model not in loaded_models:
131
  raise HTTPException(
132
- status_code=503,
133
- detail=f"Modelo {req.model} no pudo cargarse al iniciar."
134
  )
135
 
136
- try:
137
- start = time.time()
138
 
139
- model = loaded_models[req.model]
140
- tokenizer = loaded_tokenizers[req.model]
141
 
142
- prompt = build_prompt(req.prompt)
143
 
144
- inputs = tokenizer(
145
- prompt,
146
- return_tensors="pt",
147
- truncation=True,
148
- max_length=1024,
 
 
 
 
 
 
 
 
 
149
  )
150
 
151
- input_length = inputs["input_ids"].shape[1]
152
-
153
- stop_token_ids = [tokenizer.eos_token_id]
154
- im_end = tokenizer.encode("<|im_end|>", add_special_tokens=False)
155
- if im_end:
156
- stop_token_ids.append(im_end[0])
157
-
158
- with torch.no_grad():
159
- output = model.generate(
160
- **inputs,
161
- max_new_tokens=req.max_new_tokens,
162
- temperature=req.temperature,
163
- top_p=req.top_p,
164
- do_sample=True,
165
- pad_token_id=tokenizer.eos_token_id,
166
- eos_token_id=stop_token_ids,
167
- repetition_penalty=1.1,
168
- )
169
-
170
- new_tokens = output[0][input_length:]
171
- response_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
172
-
173
- elapsed_ms = int((time.time() - start) * 1000)
174
-
175
- return GenerateResponse(
176
- response=response_text.strip(),
177
- model=req.model,
178
- tokens_generated=len(new_tokens),
179
- time_ms=elapsed_ms,
180
  )
181
 
182
- except Exception as e:
183
- raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
1
+ import gc
2
+ import time
3
+ import threading
4
+
5
  from fastapi import FastAPI, HTTPException
6
  from fastapi.middleware.cors import CORSMiddleware
7
  from pydantic import BaseModel, Field
 
 
 
8
 
9
+ from llama_cpp import Llama
10
+
11
# Persona instructions; sent as the "system" message of every chat completion.
SYSTEM_PROMPT = """
Eres Yuuki, una IA curiosa, empática y decidida.
Tienes una personalidad cálida y cercana.
Ayudas a programar, aprender y crear.
Respondes en el idioma del usuario.
No eres GPT ni ChatGPT.
Eres Yuuki.
"""

# Public model id -> where to fetch its GGUF weights.
#   repo: Hugging Face repository id passed to Llama.from_pretrained
#   file: filename glob selecting the quantization inside that repository
MODELS = {
    "yuuki-rxg": dict(
        repo="mradermacher/Yuuki-RxG-GGUF",
        file="*Q4_K_M.gguf",
    ),
    "yuuki-rxg-nano": dict(
        repo="mradermacher/Yuuki-RxG-nano-GGUF",
        file="*Q8_0.gguf",
    ),
    "yuuki-nxg": dict(
        repo="mradermacher/Yuuki-NxG-GGUF",
        file="*Q8_0.gguf",
    ),
    "yuuki-nxg-nano": dict(
        repo="mradermacher/Yuuki-NxG-nano-GGUF",
        file="*Q8_0.gguf",
    ),
}

app = FastAPI(title="Yuuki API", version="4.0.0")

# CORS is fully open: any origin, header and method is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_headers=["*"],
    allow_methods=["*"],
)

# Single-slot model cache: at most one model is held at a time.
# `model_lock` is taken by the /generate handler around both the model switch
# and the inference call.
active_model_name = None
active_model = None
model_lock = threading.Lock()
54
 
55
 
56
def unload():
    """Release the currently cached model, if any.

    Clears both ``active_model`` and ``active_model_name`` so the cache never
    advertises a model that is no longer held, closes the model when it
    exposes a ``close()`` method, and only then runs the garbage collector.
    Calling it with nothing loaded is a no-op apart from the collection.
    """
    global active_model, active_model_name

    # Drop the module-level reference *before* collecting; collecting first
    # cannot reclaim an object that is still referenced.
    model, active_model = active_model, None
    active_model_name = None

    # Release native resources eagerly when the object supports it.
    close = getattr(model, "close", None)
    if callable(close):
        close()

    del model
    gc.collect()
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
 
62
def load_model(model_name: str):
    """Replace the cached model with ``model_name``.

    Args:
        model_name: Key into ``MODELS``; its ``repo``/``file`` entry selects
            the GGUF weights to fetch and load.

    Raises:
        KeyError: ``model_name`` is not a key of ``MODELS``.
        Exception: whatever ``Llama.from_pretrained`` raises when the
            download or load fails. The cache is left empty in that case.
    """
    global active_model_name
    global active_model

    cfg = MODELS[model_name]

    # Release the previous model before loading the next one.
    unload()
    # Mark the cache as empty until the new model is fully loaded, so a
    # failed load cannot leave a stale name paired with no model.
    active_model = None
    active_model_name = None

    model = Llama.from_pretrained(
        repo_id=cfg["repo"],
        filename=cfg["file"],
        n_gpu_layers=-1,
        n_ctx=4096,
        n_batch=512,
        n_threads=4,
        flash_attn=True,
        verbose=False,
    )

    # Publish the new model only after it loaded successfully.
    active_model = model
    active_model_name = model_name
83
+
84
 
class GenerateRequest(BaseModel):
    """Request body accepted by the ``/generate`` endpoint."""

    # User message; forwarded as the "user" turn of the chat.
    prompt: str

    # Key into MODELS selecting which model answers.
    model: str = "yuuki-rxg"

    # Sampling temperature, constrained to [0, 2].
    temperature: float = Field(0.7, ge=0, le=2)

    # Nucleus-sampling threshold, constrained to [0, 1].
    top_p: float = Field(0.95, ge=0, le=1)

    # Upper bound on generated tokens, constrained to [1, 4096].
    max_new_tokens: int = Field(512, ge=1, le=4096)
107
 
108
 
@app.get("/")
def root():
    """Return the service banner and the ids of the selectable models."""
    available = list(MODELS)
    return {"message": "Yuuki API", "models": available}
115
 
116
 
 
def health():
    """Report liveness plus the name of the cached model (``None`` if empty)."""
    payload = {"status": "ok"}
    payload["loaded_model"] = active_model_name
    return payload
123
 
124
 
@app.get("/models")
def models():
    """Return the full model catalogue (id -> repo and file pattern)."""
    catalogue = MODELS
    return catalogue
 
 
 
 
 
 
 
 
 
128
 
129
 
130
@app.post("/generate")
def generate(req: GenerateRequest):
    """Answer ``req.prompt`` with the requested model.

    The model cache holds a single model, so the whole switch-and-infer
    sequence runs under ``model_lock``.

    Args:
        req: Validated request body (prompt, model id, sampling settings).

    Returns:
        dict with ``response`` (generated text), ``model`` (the id used) and
        ``time_ms`` (inference time in milliseconds, excluding model load).

    Raises:
        HTTPException: 400 when ``req.model`` is not in ``MODELS``; 503 when
            the model cannot be loaded; 500 when inference fails.
    """
    if req.model not in MODELS:
        raise HTTPException(
            status_code=400,
            detail=f"Modelo inválido: {req.model}",
        )

    with model_lock:
        # Reload when a different model is requested, and also when the
        # cache is empty: after a failed load the slot can be None even if
        # the recorded name matches, and calling into None would crash.
        if active_model is None or active_model_name != req.model:
            try:
                load_model(req.model)
            except Exception as err:
                raise HTTPException(
                    status_code=503,
                    detail=f"No se pudo cargar el modelo {req.model}: {err}",
                ) from err

        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the wall clock is adjusted mid-request.
        start = time.perf_counter()

        try:
            output = active_model.create_chat_completion(
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": req.prompt},
                ],
                temperature=req.temperature,
                top_p=req.top_p,
                max_tokens=req.max_new_tokens,
            )
            text = output["choices"][0]["message"]["content"]
        except Exception as err:
            # Surface the failure as a structured API error instead of an
            # unhandled exception.
            raise HTTPException(status_code=500, detail=str(err)) from err

        elapsed = int((time.perf_counter() - start) * 1000)

    return {
        "response": text,
        "model": req.model,
        "time_ms": elapsed,
    }