aguitauwu commited on
Commit
849ec65
·
0 Parent(s):

Primer commit

Browse files
Files changed (3) hide show
  1. Dockerfile +35 -0
  2. app.py +95 -0
  3. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

WORKDIR /app

# System dependencies: git (pip VCS installs), curl (used by HEALTHCHECK).
# --no-install-recommends keeps the layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Pin the Hugging Face cache inside the image so the weights fetched at
# build time (below) are found at runtime regardless of which user/home
# the container is eventually run as.
ENV HF_HOME=/app/.cache/huggingface

# Copy and install Python dependencies first to maximize Docker layer caching
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Application code
COPY app.py .

# Pre-download the model during the build (not at runtime),
# so the container starts quickly.
RUN python -c "\
from transformers import AutoTokenizer, AutoModelForCausalLM; \
print('Downloading tokenizer...'); \
AutoTokenizer.from_pretrained('OpceanAI/Yuuki-best'); \
print('Downloading model...'); \
AutoModelForCausalLM.from_pretrained('OpceanAI/Yuuki-best'); \
print('Done!')"

EXPOSE 7860

# Healthcheck so orchestrators know when the server is ready
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time

# Hugging Face model repository served by this API.
MODEL_ID = "OpceanAI/Yuuki-best"

app = FastAPI(
    title="Yuuki API",
    description="Local inference API for Yuuki models",
    version="1.0.0",
)

# Allow browser clients (e.g. the Yuuki-chat frontend) to call this API
# cross-origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load tokenizer and model exactly once, at import time, so every request
# reuses the same in-memory instances.
print(f"Loading tokenizer from {MODEL_ID}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

print(f"Loading model from {MODEL_ID}...")
model = (
    AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float32)
    .to("cpu")
    .eval()  # inference mode: faster, lower memory (eval() returns the model)
)
print("Model ready!")
36
+
37
+
38
class GenerateRequest(BaseModel):
    """Request body for POST /generate; constraints are enforced by pydantic."""
    # Prompt text to continue (1–4000 characters).
    prompt: str = Field(..., min_length=1, max_length=4000)
    # Upper bound on the number of tokens to generate (1–512).
    max_new_tokens: int = Field(default=120, ge=1, le=512)
    # Sampling temperature (0.1–2.0); lower bound keeps sampling well-defined.
    temperature: float = Field(default=0.7, ge=0.1, le=2.0)
    # Nucleus (top-p) sampling cutoff (0.0–1.0).
    top_p: float = Field(default=0.95, ge=0.0, le=1.0)
43
+
44
+
45
class GenerateResponse(BaseModel):
    """Response body for POST /generate."""
    # Decoded continuation text (prompt not included; whitespace-stripped).
    response: str
    # Count of newly generated tokens (excludes the prompt tokens).
    tokens_generated: int
    # Wall-clock latency of the generation call, in milliseconds.
    time_ms: int
49
+
50
+
51
@app.get("/health")
def health():
    """Liveness probe: report service status and the model being served."""
    payload = {"status": "ok", "model": MODEL_ID}
    return payload
54
+
55
+
56
@app.post("/generate", response_model=GenerateResponse)
def generate(req: GenerateRequest):
    """Generate a completion for ``req.prompt`` and return only the new text.

    Returns a GenerateResponse carrying the decoded continuation (prompt
    stripped), the number of newly generated tokens, and the wall-clock
    latency in milliseconds. Raises HTTPException(500) when tokenization
    or generation fails.
    """
    # perf_counter is monotonic: immune to system clock adjustments,
    # unlike time.time(), so the reported latency cannot go negative.
    start = time.perf_counter()
    try:
        inputs = tokenizer(
            req.prompt,
            return_tensors="pt",
            truncation=True,  # cap the prompt so it fits the context budget
            max_length=1024,
        )

        input_length = inputs["input_ids"].shape[1]

        with torch.no_grad():  # inference only: skip autograd bookkeeping
            output = model.generate(
                **inputs,
                max_new_tokens=req.max_new_tokens,
                temperature=req.temperature,
                top_p=req.top_p,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
            )

        # Return only the NEW tokens (strip the echoed prompt).
        new_tokens = output[0][input_length:]
        response_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

        elapsed_ms = int((time.perf_counter() - start) * 1000)

        return GenerateResponse(
            response=response_text.strip(),
            tokens_generated=len(new_tokens),
            time_ms=elapsed_ms,
        )

    except Exception as e:
        # Boundary handler: surface any failure as a 500, chaining the
        # original exception so server logs keep the real traceback.
        raise HTTPException(status_code=500, detail=str(e)) from e
95
+
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.0
2
+ uvicorn==0.30.6
3
+ transformers==4.45.0
4
+ torch==2.4.1
5
+ pydantic==2.9.0
6
+ accelerate==0.34.2
7
+