VeuReu committed (verified) · Commit 5c6fcdb · Parent(s): 38f2ef7

Upload 4 files

Files changed (4)
  1. README.md +20 -14
  2. app.py +113 -0
  3. clients/test_client.py +9 -0
  4. requirements.txt +7 -0
README.md CHANGED
@@ -1,14 +1,20 @@
- ---
- title: Schat
- emoji: 🐠
- colorFrom: green
- colorTo: indigo
- sdk: gradio
- sdk_version: 5.49.1
- app_file: app.py
- pinned: false
- license: mit
- short_description: Chatbot- salamandra LLM
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: veureu-svision
+ emoji: 🦎
+ colorFrom: purple
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: "4.44.1"
+ app_file: app.py
+ pinned: false
+ ---
+
+ # 🦎 veureu-svision (Salamandra-Vision 7B · ZeroGPU)
+
+ ## Endpoints
+ - **`/api/predict`** (Gradio): **batch**. Input `[[<file1>, <file2>, ...], "{...context_json...}", 256, 0.7]` → output `["desc1", "desc2", ...]`.
+ - **`/api/describe_raw`** (multipart): `image`, `text`, `max_new_tokens`, `temperature` → `{"text": "..."}`.
+ - **`/api/describe`** (Gradio UI, single image).
+
+ Compatibility with the `engine`: the engine's `VisionClient` calls **`api_name="/predict"`** with a *list of images* and a **`context_json`**; a client sketch follows this diff.
+
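For reference, a minimal sketch of the batch call the README describes, using `gradio_client`. The Space URL, the image paths, and the shape of the context payload are assumptions for illustration, not part of the commit:

```python
# Sketch: batch call to /api/predict as described in the README above.
# Space URL, file names, and context fields are assumed for illustration.
import json
from gradio_client import Client, handle_file

client = Client("https://veureu-svision.hf.space")   # assumed URL, per the Space title
context = {"scene": "street", "lang": "ca"}          # hypothetical context payload

descriptions = client.predict(
    [handle_file("frame1.jpg"), handle_file("frame2.jpg")],  # list of images
    json.dumps(context),                                     # context_json
    256,                                                     # max_new_tokens
    0.7,                                                     # temperature
    api_name="/predict",
)
print(descriptions)  # expected: ["desc1", "desc2", ...]
```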
app.py ADDED
@@ -0,0 +1,113 @@
+ # app.py - veureu/schat (Salamandra 7B Instruct · ZeroGPU), compatible with the ENGINE
+ from __future__ import annotations
+ import os
+ from typing import Optional, Tuple
+
+ import gradio as gr
+ import spaces
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ # ===== Config =====
+ MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-instruct")
+ DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ _tok = None
+ _model = None
+
+ def _lazy_load() -> Tuple[AutoTokenizer, AutoModelForCausalLM]:
+     global _tok, _model
+     if _tok is None or _model is None:
+         _tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, trust_remote_code=True)
+         _model = AutoModelForCausalLM.from_pretrained(
+             MODEL_ID,
+             torch_dtype=DTYPE,
+             low_cpu_mem_usage=True,
+             use_safetensors=True,
+             trust_remote_code=True,
+             device_map=None,
+         ).to(DEVICE)
+     return _tok, _model
+
+ def _build_prompt(prompt: str, system: Optional[str]) -> str:
+     """
+     If the tokenizer has a 'chat_template', use it with [system?, user] messages.
+     Otherwise, build a plain prompt with the system text on top.
+     """
+     tok, _ = _lazy_load()
+     messages = []
+     if system and system.strip():
+         messages.append({"role": "system", "content": system.strip()})
+     messages.append({"role": "user", "content": prompt})
+
+     chat_template = getattr(tok, "chat_template", None)
+     if chat_template:
+         return tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     # Fallback without a chat template
+     sys_part = (f"<<SYS>>\n{system.strip()}\n<</SYS>>\n\n" if system and system.strip() else "")
+     return sys_part + f"### Instruction\n{prompt}\n\n### Response\n"
+
+ @spaces.GPU  # uses the GPU when available (ZeroGPU)
+ def _generate(
+     prompt: str,
+     system: str = "",
+     max_new_tokens: int = 512,
+     temperature: float = 0.7,
+     top_p: float = 0.95,
+ ) -> str:
+     tok, model = _lazy_load()
+     text = _build_prompt(prompt, system or "")
+     inputs = tok(text, return_tensors="pt").to(DEVICE)
+
+     with torch.inference_mode():
+         out = model.generate(
+             **inputs,
+             max_new_tokens=int(max_new_tokens),
+             temperature=float(temperature),
+             top_p=float(top_p),
+             do_sample=temperature > 0,
+             pad_token_id=tok.eos_token_id,
+             eos_token_id=tok.eos_token_id,
+         )
+     # Decode only the newly generated tokens, not the echoed prompt
+     new_tokens = out[0][inputs["input_ids"].shape[1]:]
+     return tok.decode(new_tokens, skip_special_tokens=True).strip()
+
+ # ------------------- Gradio Endpoints -------------------
+ # 1) /predict: what the ENGINE expects (just 'prompt' → string)
+ def predict_for_engine(prompt: str) -> str:
+     return _generate(prompt=prompt, system="", max_new_tokens=512, temperature=0.7, top_p=0.95)
+
+ # 2) /generate: more controls (prompt + system + params)
+ def generate_advanced(prompt: str, system: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
+     return _generate(prompt=prompt, system=system, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p)
+
+ # ------------------- HTTP (optional, for plain HTTP clients) -------------------
+ # If you want, you can add an HTTP POST /generate endpoint (FastAPI),
+ # but the Gradio Client is enough for the engine and local use.
+
+ # ------------------- UI -------------------
+ with gr.Blocks(title="Salamandra 7B Instruct · ZeroGPU") as demo:
+     gr.Markdown("## Salamandra-7B-Instruct · ZeroGPU\nText → instruction-following response.")
+     with gr.Row():
+         with gr.Column(scale=1):
+             in_system = gr.Textbox(label="System (optional)", value="")
+             in_prompt = gr.Textbox(label="Prompt", placeholder="Write your instruction…", lines=6)
+             max_new = gr.Slider(16, 2048, value=512, step=16, label="max_new_tokens")
+             temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
+             top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.01, label="top_p")
+             btn = gr.Button("Generate", variant="primary")
+         with gr.Column(scale=1):
+             out = gr.Textbox(label="Response", lines=18)
+
+     btn.click(generate_advanced, [in_prompt, in_system, max_new, temp, top_p], out, api_name="generate")
+
+     # Minimal endpoint compatible with the ENGINE (/predict: prompt only)
+     in_prompt_engine = gr.Textbox(label="Prompt (ENGINE)", value="Say hello in one sentence.")
+     out_engine = gr.Textbox(label="Response (ENGINE)")
+     gr.Button("Try /predict").click(predict_for_engine, [in_prompt_engine], out_engine, api_name="predict")
+
+ # Gradio 4.x removed queue(concurrency_count=...); use default_concurrency_limit
+ demo.queue(default_concurrency_limit=1, max_size=16).launch()
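The commented-out HTTP option above could look roughly like the following; a minimal sketch assuming FastAPI plus `gr.mount_gradio_app`, with the route shape and field names chosen for illustration (not part of this commit):

```python
# Sketch (assumed, not in the commit): optional plain-HTTP POST /generate,
# mounted alongside the Gradio app as the comment in app.py suggests.
from fastapi import FastAPI
from pydantic import BaseModel
import gradio as gr

app = FastAPI()

class GenerateRequest(BaseModel):
    prompt: str
    system: str = ""
    max_new_tokens: int = 512
    temperature: float = 0.7
    top_p: float = 0.95

@app.post("/generate")
def generate_http(req: GenerateRequest) -> dict:
    # _generate is the function defined in app.py above
    return {"text": _generate(req.prompt, req.system, req.max_new_tokens, req.temperature, req.top_p)}

# Mount the Gradio UI at the root path and run with `uvicorn app:app`
# (in that case, skip demo.queue(...).launch()).
app = gr.mount_gradio_app(app, demo, path="/")
```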
clients/test_client.py ADDED
@@ -0,0 +1,9 @@
+ from gradio_client import Client
+
+ c = Client("https://veureu-schat.hf.space")
+ resp = c.predict(
+     "Resumeix el text següent en català en 3 punts.",  # prompt ("Summarize the following text in Catalan in 3 points.")
+     "Ets un ajudant que respon breu i clar.",  # system ("You are an assistant that answers briefly and clearly.")
+     512, 0.7, 0.95,  # max_new_tokens, temperature, top_p
+     api_name="/generate",
+ )
+ print(resp)
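For completeness, a minimal sketch of the engine-style call, which passes only the prompt to the `/predict` endpoint defined in `app.py` (reusing the client above; the prompt text is an arbitrary example):

```python
# Sketch: engine-style call; /predict takes a single prompt argument (see app.py).
print(c.predict("Say hello in one sentence.", api_name="/predict"))
```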
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio>=4.44.1
+ spaces>=0.25.0
+ transformers>=4.44.0
+ torch>=2.2
+ accelerate>=0.30.0
+ safetensors>=0.4.2
+ sentencepiece>=0.1.99