Spaces:
Runtime error
Runtime error
File size: 4,175 Bytes
dfd7796 c456490 05334b7 6498586 dfd7796 5ce85fd c456490 05334b7 c456490 5ce85fd 19cb7b0 dfd7796 05334b7 dfd7796 6498586 c456490 dfd7796 05334b7 c456490 05334b7 dfd7796 c456490 dfd7796 c456490 dfd7796 7f7ed2c c456490 dfd7796 6498586 dfd7796 6498586 dfd7796 c456490 dfd7796 5ce85fd c456490 05334b7 dfd7796 05334b7 c456490 dfd7796 05334b7 dfd7796 05334b7 c456490 05334b7 dfd7796 05334b7 dfd7796 c456490 05334b7 c456490 05334b7 dfd7796 05334b7 c456490 05334b7 c456490 05334b7 dfd7796 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
# app.py for Hugging Face Spaces
# Runs on CPU only: float16 weights, low-memory loading and a disk offload folder
import logging
import os
import threading
from typing import Optional

import torch
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from peft import PeftModel
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForCausalLM
# =========================
# Configuration
# =========================
# Identifiers handed to the `from_pretrained` loaders: the base checkpoint
# and the LoRA adapter that is applied on top of it.
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
LORA_MODEL = "Delta0723/techmind-pro-v9"

# Folder given to the loaders as their offload location; it is created at
# import time so it already exists when the model is loaded.
OFFLOAD_DIR = "./offload_folder"
os.makedirs(name=OFFLOAD_DIR, exist_ok=True)
# =========================
# FastAPI setup
# =========================
app = FastAPI(title="TechMind Pro v9")

# CORS is left wide open: every origin, method and header is accepted.
app.add_middleware(
    CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
)

# Module-level slots for the model and its tokenizer; both start out empty.
model = tokenizer = None
# =========================
# Load Model (lazy loading)
# =========================
# Serialises the first load. FastAPI runs plain `def` endpoints in a thread
# pool, so without this two early requests could both start loading the
# weights at the same time.
_MODEL_LOCK = threading.Lock()


def load_model():
    """Load the tokenizer and the LoRA-adapted model on first use.

    Populates the module-level ``model`` and ``tokenizer`` globals. Once
    ``model`` is set, later calls return immediately. If loading raises,
    both globals are left untouched so the next call retries from scratch.
    """
    global model, tokenizer
    if model is not None:
        return
    with _MODEL_LOCK:
        # Re-check under the lock: another thread may have finished loading
        # while this one was waiting.
        if model is not None:
            return
        print("🚀 Cargando modelo...")
        loaded_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
        # Reuse the EOS token for padding.
        loaded_tokenizer.pad_token = loaded_tokenizer.eos_token
        # Weights are loaded as float16 and pinned to the CPU; no int8/4-bit
        # quantization is applied by this call.
        # NOTE(review): float16 kernels on CPU can be slow or missing on older
        # torch builds -- confirm against the torch version deployed.
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            device_map={"": "cpu"},
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            offload_folder=OFFLOAD_DIR,
            offload_state_dict=True,
        )
        # Wrap the base model with the LoRA adapter.
        peft_model = PeftModel.from_pretrained(
            base_model,
            LORA_MODEL,
            device_map={"": "cpu"},
            offload_folder=OFFLOAD_DIR,
        )
        peft_model.eval()
        # Publish only fully initialised objects. `model` is assigned last
        # because it doubles as the "already loaded" flag checked above.
        tokenizer = loaded_tokenizer
        model = peft_model
        print("✅ Modelo cargado")
# =========================
# Data Models
# =========================
class Query(BaseModel):
    """Request body for ``POST /ask``.

    ``max_tokens`` and ``temperature`` may be omitted (defaults apply) or sent
    as ``null``. When a number is given it must be positive, so invalid
    values are rejected during request validation instead of failing later
    inside generation.
    """

    # Text of the question sent to the model.
    question: str
    # Upper bound on newly generated tokens; at least 1 when provided.
    max_tokens: Optional[int] = Field(default=200, ge=1)
    # Sampling temperature; strictly positive when provided.
    temperature: Optional[float] = Field(default=0.7, gt=0)
# =========================
# Utilidades
# =========================
def generate_answer(question: str, max_tokens=200, temperature=0.7) -> str:
    """Generate the model's reply to *question*.

    Args:
        question: User text; it is wrapped in the ``[INST] ... [/INST]``
            prompt before tokenization.
        max_tokens: Upper bound on newly generated tokens. ``None`` falls
            back to 200.
        temperature: Sampling temperature. ``None`` falls back to 0.7; a
            value <= 0 switches to greedy decoding instead of sampling.

    Returns:
        The decoded completion only (prompt tokens excluded), stripped of
        surrounding whitespace.
    """
    load_model()  # Lazy load on first call.

    # Callers may pass None explicitly (the request model allows it); treat
    # that as "use the default" rather than forwarding None to generate().
    if max_tokens is None:
        max_tokens = 200
    if temperature is None:
        temperature = 0.7

    # NOTE(review): the prompt contains a literal "<s>" and the tokenizer is
    # called with its default special-token handling, so a second BOS may be
    # added -- confirm against the format the adapter was trained with.
    prompt = f"<s>[INST] {question} [/INST]"
    # NOTE(review): the whole prompt is truncated to 512 tokens, so a very
    # long question can lose the closing "[/INST]" tag -- confirm acceptable.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

    generation_kwargs = {
        "max_new_tokens": max_tokens,
        "pad_token_id": tokenizer.eos_token_id,
        "num_beams": 1,  # Single beam for speed.
    }
    if temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.95)
    else:
        # Sampling needs a strictly positive temperature; decode greedily.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # The returned sequence starts with the prompt tokens. Slicing them off by
    # length is robust even if the question or the answer contains "[/INST]".
    prompt_length = inputs["input_ids"].shape[-1]
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()
# =========================
# Endpoints
# =========================
@app.get("/")
def root():
    """Return static service metadata: model name, base id, LoRA id and status."""
    return dict(
        model="TechMind Pro v9",
        base=BASE_MODEL,
        lora=LORA_MODEL,
        status="online",
    )
@app.post("/ask")
def ask_q(req: Query):
    """Answer the question carried by *req*.

    Args:
        req: Parsed request body with the question and generation settings.

    Returns:
        A dict with the generated text under the ``"response"`` key.

    Raises:
        HTTPException: Status 500 carrying the error text when generation
            fails for any reason.
    """
    try:
        result = generate_answer(req.question, req.max_tokens, req.temperature)
    except Exception as e:
        # Top-level boundary: keep the full traceback in the server log, then
        # report the failure to the client with the original cause chained.
        logging.getLogger(__name__).exception("Generation failed for /ask")
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"response": result}
# =========================
# README.md para el Space
# =========================
"""
---
title: TechMind Pro v9
emoji: 🤖
colorFrom: blue
colorTo: purple
sdk: docker
pinned: false
---
# TechMind Pro v9
API para el modelo TechMind Pro v9 (Mistral-7B + LoRA fine-tuned)
## Uso
```bash
curl -X POST "https://YOUR-SPACE.hf.space/ask" \
-H "Content-Type: application/json" \
-d '{"question": "¿Qué es Python?"}'
```
"""
# =========================
# Dockerfile para el Space
# =========================
"""
FROM python:3.10-slim
WORKDIR /app
RUN apt-get update && apt-get install -y \
git \
&& rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
"""
# =========================
# requirements.txt
# =========================
"""
fastapi
uvicorn[standard]
transformers>=4.35.0
peft
torch
accelerate
sentencepiece
protobuf
"""