# app.py for Hugging Face Spaces
# Runs on CPU with aggressive memory optimizations

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import os

# =========================
# CONFIG
# =========================
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
LORA_MODEL = "Delta0723/techmind-pro-v9"
OFFLOAD_DIR = "./offload_folder"

os.makedirs(OFFLOAD_DIR, exist_ok=True)

# =========================
# FastAPI Setup
# =========================
app = FastAPI(title="TechMind Pro v9")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"]
)
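
# Note: allow_origins=["*"] accepts requests from any browser origin;
# tighten this list before exposing the API beyond a demo.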

# Global handles for the lazily loaded model
model = None
tokenizer = None
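
# Loading is deferred to the first request so the container starts quickly;
# the first /ask call pays the one-time load cost instead.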

# =========================
# Load Model (lazy loading)
# =========================
def load_model():
    global model, tokenizer
    
    if model is not None:
        return
    
    print("🚀 Cargando modelo...")
    
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
    tokenizer.pad_token = tokenizer.eos_token
    
    # Load the base model on CPU in half precision. bfloat16 is used rather
    # than float16 because many CPU kernels lack float16 support, which can
    # make generate() fail on CPU-only hosts.
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map={"": "cpu"},
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        offload_folder=OFFLOAD_DIR,
        offload_state_dict=True
    )
    
    # Attach the LoRA adapter on top of the base model
    model = PeftModel.from_pretrained(
        base_model,
        LORA_MODEL,
        device_map={"": "cpu"},
        offload_folder=OFFLOAD_DIR
    )
    
    model.eval()
    print("✅ Modelo cargado")

# =========================
# Data Models
# =========================
class Query(BaseModel):
    question: str
    max_tokens: Optional[int] = 200
    temperature: Optional[float] = 0.7

# =========================
# Utilities
# =========================
def generate_answer(question: str, max_tokens=200, temperature=0.7) -> str:
    load_model()  # lazy load on first request
    
    prompt = f"<s>[INST] {question} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            num_beams=1  # no beam search, for speed
        )
    
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded.split("[/INST]")[-1].strip() if "[/INST]" in decoded else decoded
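
# Quick local sanity check (hypothetical call; it triggers the full model
# load, so expect it to be slow on CPU):
#   print(generate_answer("What is FastAPI?", max_tokens=50))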

# =========================
# Endpoints
# =========================
@app.get("/")
def root():
    return {
        "model": "TechMind Pro v9",
        "base": BASE_MODEL,
        "lora": LORA_MODEL,
        "status": "online"
    }

@app.post("/ask")
def ask_q(req: Query):
    try:
        result = generate_answer(req.question, req.max_tokens, req.temperature)
        return {"response": result}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
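
# Optional local entry point (a sketch; on Spaces, the Dockerfile CMD below
# starts uvicorn, so this block only matters when running the file directly):
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)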

# =========================
# README.md for the Space
# =========================
"""
---
title: TechMind Pro v9
emoji: 🤖
colorFrom: blue
colorTo: purple
sdk: docker
pinned: false
---

# TechMind Pro v9

API for the TechMind Pro v9 model (Mistral-7B + fine-tuned LoRA adapter)

## Usage

```bash
curl -X POST "https://YOUR-SPACE.hf.space/ask" \
  -H "Content-Type: application/json" \
  -d '{"question": "¿Qué es Python?"}'
```
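
A minimal Python client doing the same call (assumes the `requests` package is
installed; replace YOUR-SPACE with your Space's URL):

```python
import requests

resp = requests.post(
    "https://YOUR-SPACE.hf.space/ask",
    json={"question": "What is Python?", "max_tokens": 150},
)
print(resp.json()["response"])
```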
"""

# =========================
# Dockerfile for the Space
# =========================
"""
FROM python:3.10-slim

WORKDIR /app

RUN apt-get update && apt-get install -y \
    git \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
"""

# =========================
# requirements.txt
# =========================
"""
fastapi
uvicorn[standard]
transformers>=4.35.0
peft
torch
accelerate
sentencepiece
protobuf
"""