rbc33 committed
Commit 2da32e3 · verified · 1 Parent(s): 77e16d1

Upload folder using huggingface_hub

Files changed (5)
  1. .gitignore +1 -0
  2. Dockerfile +16 -0
  3. README.md +7 -8
  4. main.py +127 -0
  5. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ .env
Dockerfile ADDED
@@ -0,0 +1,16 @@
+ FROM python:3.10-slim
+
+ WORKDIR /app
+
+ # Create a non-root user for security (recommended on HF Spaces)
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . .
+
+ # Hugging Face Spaces expects port 7860
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
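
Once the image is built and a container is running, the root endpoint defined in main.py below doubles as a quick smoke test. A minimal sketch, assuming a local run published on port 7860 and the `requests` package installed:

```python
import requests

# Hypothetical local setup, e.g. after: docker run -p 7860:7860 <image>
# Adjust host/port to match your deployment.
resp = requests.get("http://localhost:7860/")
resp.raise_for_status()
print(resp.json())  # e.g. {"message": "OpenAI-compatible API is running", "model": "..."}
```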
README.md CHANGED
@@ -1,11 +1,10 @@
  ---
- title: Docker Hf
- emoji:
- colorFrom: green
- colorTo: purple
+ title: OpenAI Like API
+ emoji: 🤖
+ colorFrom: blue
+ colorTo: indigo
  sdk: docker
+ app_port: 7860
  pinned: false
- short_description: docker deploy of fastAPI openAi like api
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ license: mit
+ ---
main.py ADDED
@@ -0,0 +1,127 @@
+ import os
+ import torch
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel, Field
+ from typing import List, Optional, Literal
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import uvicorn
+
+ app = FastAPI(title="OpenAI-compatible API")
+
+ # --- Model configuration ---
+ # Can be overridden via environment variables on HF Spaces
+ MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-0.5B-Instruct")
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ print(f"Loading model: {MODEL_ID} on {device}...")
+
+ try:
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+     model = AutoModelForCausalLM.from_pretrained(
+         MODEL_ID,
+         torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+         device_map="auto" if device == "cuda" else None,
+         low_cpu_mem_usage=True
+     )
+     if device == "cpu":
+         model.to(device)
+     print("Model loaded successfully!")
+ except Exception as e:
+     print(f"Error loading model: {e}")
+     raise
+
+ # --- Pydantic models (OpenAI format) ---
+
+ class ChatMessage(BaseModel):
+     role: str
+     content: str
+
+ class ChatCompletionRequest(BaseModel):
+     model: str = "default-model"
+     messages: List[ChatMessage]
+     max_tokens: Optional[int] = 512
+     temperature: Optional[float] = 0.7
+     top_p: Optional[float] = 0.9
+     stream: Optional[bool] = False
+
+ class Choice(BaseModel):
+     index: int
+     message: ChatMessage
+     finish_reason: str
+
+ class Usage(BaseModel):
+     prompt_tokens: int
+     completion_tokens: int
+     total_tokens: int
+
+ class ChatCompletionResponse(BaseModel):
+     id: str
+     object: Literal["chat.completion"] = "chat.completion"
+     created: int
+     model: str
+     choices: List[Choice]
+     usage: Usage
+
+ # --- API endpoint ---
+
+ @app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
+ async def chat_completions(request: ChatCompletionRequest):
+     try:
+         # 1. Apply the chat template (turns the message list into the prompt
+         #    string the model expects). This makes it work automatically with
+         #    Llama, Mistral, Qwen, etc.
+         input_text = tokenizer.apply_chat_template(
+             [{"role": m.role, "content": m.content} for m in request.messages],
+             tokenize=False,
+             add_generation_prompt=True
+         )
+
+         inputs = tokenizer(input_text, return_tensors="pt").to(device)
+
+         # 2. Generate the response
+         with torch.no_grad():
+             outputs = model.generate(
+                 **inputs,
+                 max_new_tokens=request.max_tokens,
+                 temperature=request.temperature,
+                 top_p=request.top_p,
+                 do_sample=True
+             )
+
+         # 3. Decode only the newly generated part (the reply)
+         generated_ids = outputs[0][inputs.input_ids.shape[1]:]
+         response_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
+
+         # 4. Count tokens (approximate)
+         prompt_tokens = len(inputs.input_ids[0])
+         completion_tokens = len(generated_ids)
+
+         # 5. Format the result like OpenAI (id and created are static placeholders)
+         return ChatCompletionResponse(
+             id="chatcmpl-123",
+             created=1234567890,
+             model=MODEL_ID,
+             choices=[
+                 Choice(
+                     index=0,
+                     message=ChatMessage(role="assistant", content=response_text),
+                     finish_reason="stop"
+                 )
+             ],
+             usage=Usage(
+                 prompt_tokens=prompt_tokens,
+                 completion_tokens=completion_tokens,
+                 total_tokens=prompt_tokens + completion_tokens
+             )
+         )
+
+     except Exception as e:
+         print(f"Error during generation: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.get("/")
+ def home():
+     return {"message": "OpenAI-compatible API is running", "model": MODEL_ID}
+
+ # Allows running with `python main.py` for local testing
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=7860)
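
With the server up, any OpenAI-style client can hit the endpoint. A minimal sketch using plain `requests` (assuming a local run on port 7860; the payload fields mirror ChatCompletionRequest above, and the URL is hypothetical, so point it at your Space if deployed there):

```python
import requests

# Request body follows the OpenAI chat-completions shape accepted by main.py
payload = {
    "model": "default-model",
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "max_tokens": 64,
    "temperature": 0.7,
}
resp = requests.post(
    "http://localhost:7860/v1/chat/completions",  # hypothetical local URL
    json=payload,
    timeout=120,
)
resp.raise_for_status()
data = resp.json()
print(data["choices"][0]["message"]["content"])
print(data["usage"])  # prompt/completion/total token counts
```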
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ fastapi
+ uvicorn
+ pydantic
+ transformers
+ torch
+ accelerate