Delta0723 committed · Commit dfd7796 · verified · Parent: 6b5c715

Update app.py

Files changed (1):
  1. app.py +112 -37
app.py CHANGED
@@ -1,8 +1,11 @@
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from typing import Optional
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from peft import PeftModel
 import torch
 import os
@@ -10,19 +13,16 @@ import os
 # =========================
 # CONFIG
 # =========================
-
 BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
 LORA_MODEL = "Delta0723/techmind-pro-v9"

-# Create the offload folder if it doesn't exist
-os.makedirs("offload", exist_ok=True)

 # =========================
 # FastAPI Setup
 # =========================
-
-app = FastAPI(title="TechMind Pro API")
-
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -30,52 +30,62 @@ app.add_middleware(
     allow_headers=["*"]
 )

 # =========================
-# Load Model
 # =========================
-
-print("🚀 Loading model and tokenizer...")
-
-try:
     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
     tokenizer.pad_token = tokenizer.eos_token
-
-    quant_config = BitsAndBytesConfig(load_in_4bit=True)
-
     base_model = AutoModelForCausalLM.from_pretrained(
         BASE_MODEL,
-        device_map="auto",
-        trust_remote_code=True,
-        offload_folder="offload",
-        quantization_config=quant_config
     )
-
-    model = PeftModel.from_pretrained(base_model, LORA_MODEL)
     model.eval()
-
-    print("✅ Model ready to use")
-
-except Exception as e:
-    print("❌ Error loading model:", e)
-    raise e

 # =========================
 # Data Models
 # =========================
-
 class Query(BaseModel):
     question: str
-    max_tokens: Optional[int] = 300
     temperature: Optional[float] = 0.7

 # =========================
 # Utilities
 # =========================
-
-def generate_answer(question: str, max_tokens=300, temperature=0.7) -> str:
     prompt = f"<s>[INST] {question} [/INST]"
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
@@ -83,19 +93,24 @@ def generate_answer(question: str, max_tokens=300, temperature=0.7) -> str:
             temperature=temperature,
             top_p=0.95,
             do_sample=True,
-            pad_token_id=tokenizer.eos_token_id
         )
-
     decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return decoded.split("[/INST]")[-1].strip() if "[/INST]" in decoded else decoded

 # =========================
 # Endpoints
 # =========================
-
 @app.get("/")
 def root():
-    return {"TechMind": "Mistral-7B Instruct + LoRA v9", "status": "online"}

 @app.post("/ask")
 def ask_q(req: Query):
@@ -104,3 +119,63 @@ def ask_q(req: Query):
         return {"response": result}
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))

+# app.py for Hugging Face Spaces
+# Runs on CPU with maximum optimizations
+
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from typing import Optional
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel
 import torch
 import os
 
 # =========================
 # CONFIG
 # =========================
 BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
 LORA_MODEL = "Delta0723/techmind-pro-v9"
+OFFLOAD_DIR = "./offload_folder"

+os.makedirs(OFFLOAD_DIR, exist_ok=True)
 
 # =========================
 # FastAPI Setup
 # =========================
+app = FastAPI(title="TechMind Pro v9")
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],

     allow_headers=["*"]
 )

+# Global handles for the lazily loaded model
+model = None
+tokenizer = None
+
 # =========================
+# Load Model (lazy loading)
 # =========================
+def load_model():
+    global model, tokenizer
+
+    if model is not None:
+        return
+
+    print("🚀 Loading model...")
+
     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
     tokenizer.pad_token = tokenizer.eos_token
+
+    # Load on CPU in float16, streaming the state dict to keep peak RAM down
     base_model = AutoModelForCausalLM.from_pretrained(
         BASE_MODEL,
+        device_map={"": "cpu"},
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
+        offload_folder=OFFLOAD_DIR,
+        offload_state_dict=True
     )
+
+    # Load the LoRA adapter on top of the base model
+    model = PeftModel.from_pretrained(
+        base_model,
+        LORA_MODEL,
+        device_map={"": "cpu"},
+        offload_folder=OFFLOAD_DIR
+    )
+
     model.eval()
+    print("✅ Model loaded")

 # =========================
 # Data Models
 # =========================
 class Query(BaseModel):
     question: str
+    max_tokens: Optional[int] = 200
     temperature: Optional[float] = 0.7

 # =========================
 # Utilities
 # =========================
+def generate_answer(question: str, max_tokens=200, temperature=0.7) -> str:
+    load_model()  # lazy load on first call
+
     prompt = f"<s>[INST] {question} [/INST]"
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
+
     with torch.no_grad():
         outputs = model.generate(
             **inputs,

             temperature=temperature,
             top_p=0.95,
             do_sample=True,
+            pad_token_id=tokenizer.eos_token_id,
+            num_beams=1  # single beam for speed
         )
+
     decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return decoded.split("[/INST]")[-1].strip() if "[/INST]" in decoded else decoded
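Since `generate_answer` hides everything behind one call, a quick smoke test of the prompt/decode round-trip doesn't need the HTTP layer; a sketch (expect the first call to take minutes on CPU while the weights load):

```python
if __name__ == "__main__":
    # A small token budget and low temperature keep the test short.
    print(generate_answer("What is FastAPI?", max_tokens=50, temperature=0.2))
```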

 # =========================
 # Endpoints
 # =========================
 @app.get("/")
 def root():
+    return {
+        "model": "TechMind Pro v9",
+        "base": BASE_MODEL,
+        "lora": LORA_MODEL,
+        "status": "online"
+    }

 @app.post("/ask")
 def ask_q(req: Query):

         return {"response": result}
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
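For callers, the `/ask` contract is the `Query` model in, `{"response": ...}` out. A minimal Python client sketch (the Space URL is a placeholder, and the long timeout reflects CPU inference on a 7B model):

```python
import requests

resp = requests.post(
    "https://YOUR-SPACE.hf.space/ask",
    json={"question": "What is Python?", "max_tokens": 150, "temperature": 0.7},
    timeout=600,  # CPU generation can take several minutes
)
resp.raise_for_status()
print(resp.json()["response"])
```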
+
+# =========================
+# README.md for the Space
+# =========================
+"""
+---
+title: TechMind Pro v9
+emoji: 🤖
+colorFrom: blue
+colorTo: purple
+sdk: docker
+pinned: false
+---
+
+# TechMind Pro v9
+
+API for the TechMind Pro v9 model (Mistral-7B + LoRA fine-tune)
+
+## Usage
+
+```bash
+curl -X POST "https://YOUR-SPACE.hf.space/ask" \
+  -H "Content-Type: application/json" \
+  -d '{"question": "What is Python?"}'
+```
+"""
+
+# =========================
+# Dockerfile for the Space
+# =========================
+"""
+FROM python:3.10-slim
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+"""
+
+# =========================
+# requirements.txt
+# =========================
+"""
+fastapi
+uvicorn[standard]
+transformers>=4.35.0
+peft
+torch
+accelerate
+sentencepiece
+protobuf
+"""