File size: 12,125 Bytes
73cad8e
7b4bcba
 
92cec32
7b4bcba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db72e25
7b4bcba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73cad8e
 
7b4bcba
 
 
 
 
 
 
 
 
 
73cad8e
 
7b4bcba
73cad8e
7b4bcba
73cad8e
 
 
 
 
 
 
 
 
 
 
0fb6f95
73cad8e
 
 
 
 
 
 
 
 
 
 
 
0fb6f95
73cad8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b4bcba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73cad8e
7b4bcba
 
 
73cad8e
7b4bcba
 
 
286235d
 
 
73cad8e
286235d
 
 
 
73cad8e
286235d
 
 
73cad8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ddee9da
73cad8e
 
ddee9da
 
 
73cad8e
 
ddee9da
 
73cad8e
 
 
 
ddee9da
73cad8e
 
ddee9da
73cad8e
 
ddee9da
73cad8e
 
ddee9da
 
 
73cad8e
 
ddee9da
 
73cad8e
 
 
 
cb4cd52
 
 
 
73cad8e
 
cb4cd52
73cad8e
 
cb4cd52
 
 
73cad8e
 
cb4cd52
 
7b4bcba
 
 
 
 
5b17d04
 
 
 
 
 
 
 
 
 
 
 
 
 
7b4bcba
 
 
 
 
 
 
 
 
 
 
 
 
5b17d04
7b4bcba
 
 
 
5b17d04
 
ddee9da
 
 
b28c85b
 
ddee9da
b28c85b
5b17d04
b28c85b
ddee9da
b28c85b
73cad8e
b28c85b
 
 
ddee9da
 
5b17d04
 
ddee9da
 
 
b28c85b
 
ddee9da
 
5b17d04
b28c85b
5b17d04
b28c85b
ddee9da
b28c85b
ddee9da
b28c85b
ddee9da
 
 
cb4cd52
 
 
 
b28c85b
cb4cd52
 
 
b28c85b
cb4cd52
 
 
 
 
 
 
73cad8e
 
 
 
 
 
 
 
 
 
 
7b4bcba
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
# app.py — veureu/schat (Salamandra 7B Instruct · ZeroGPU) — compatible with ENGINE
from __future__ import annotations
import os, json
from typing import List, Dict, Any, Optional, Tuple

import gradio as gr
import spaces
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
)

from transformers import AutoTokenizer, AutoModelForCausalLM
from moe_tools import SalamandraClient

# ===== Config =====
MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-instruct")
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

_tok = None
_model = None
_salamandra = None

def _lazy_load() -> Tuple[AutoTokenizer, AutoModelForCausalLM]:
    global _tok, _model
    if _tok is None or _model is None:
        _tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, trust_remote_code=True)
        _model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=DTYPE,
            low_cpu_mem_usage=True,
            use_safetensors=True,
            trust_remote_code=True,
            device_map=None,
        ).to(DEVICE)
    return _tok, _model

def _build_prompt(prompt: str, system: Optional[str]) -> str:
    """
    If the tokenizer has 'chat_template', use it with messages [system?, user].
    Otherwise, create a plain prompt with system at the top.
    """
    tok, _ = _lazy_load()
    messages = []
    if system and system.strip():
        messages.append({"role": "system", "content": system.strip()})
    messages.append({"role": "user", "content": prompt})

    chat_template = getattr(tok, "chat_template", None)
    if chat_template:
        return tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    # Fallback without chat template
    sys_part = (f"<<SYS>>\n{system.strip()}\n<</SYS>>\n\n" if system and system.strip() else "")
    return sys_part + f"### Instrucció\n{prompt}\n\n### Resposta\n"

#@spaces.GPU  # use GPU if available (ZeroGPU)
#def _generate_with_tools(
#    messages: List[Dict[str, str]],
#    tools: List[Dict[str, Any]],
#    max_new_tokens: int = 512,
#    temperature: float = 0.7,
#    top_p: float = 0.95,
#) -> Dict[str, Any]:
#    tok, model = _lazy_load()
#    tools_md = _render_tools_md(tools)
#    prompt = _compose_chat_prompt(messages, tools_md)

#    inputs = tok(prompt, return_tensors="pt").to(DEVICE)
#    with torch.inference_mode():
#        out = model.generate(
#            **inputs,
#            max_new_tokens=int(max_new_tokens),
#            temperature=float(temperature),
#            top_p=float(top_p),
#            do_sample=True if temperature > 0 else False,
#            pad_token_id=tok.eos_token_id,
#            eos_token_id=tok.eos_token_id,
#        )
#    text = tok.decode(out[0], skip_special_tokens=True).strip()

#    # If the model returns a JSON block with 'tool_calls', try to extract it
#    tool_calls: List[Dict[str, Any]] = []
#    try:
#        # Search for the last {...} containing "tool_calls"
#        matches = list(re.finditer(r"\{.*?\"tool_calls\".*?\}", text, flags=re.S))
#        if matches:
#            block = text[matches[-1].start():matches[-1].end()]
#            obj = json.loads(block)
#            tc = obj.get("tool_calls", [])
#            if isinstance(tc, list):
#                tool_calls = tc
#    except Exception:
#        pass

    # Execute the extracted tool calls if any
#    tool_results = maybe_execute_tool_calls(tool_calls) if tool_calls else []

#    return {"text": text, "tool_calls": tool_calls, "tool_results": tool_results}

@spaces.GPU # use GPU if available (ZeroGPU)
def _generate(
    prompt: str,
    system: str = "",
    max_new_tokens: int = 512,
    temperature: float = 0.7,
    top_p: float = 0.95,
) -> str:
    tok, model = _lazy_load()
    text = _build_prompt(prompt, system or "")
    inputs = tok(text, return_tensors="pt").to(DEVICE)

    with torch.inference_mode():
        out = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            do_sample=True if temperature > 0 else False,
            pad_token_id=tok.eos_token_id,
            eos_token_id=tok.eos_token_id,
        )
    return tok.decode(out[0], skip_special_tokens=True).strip()

# ------------------- Gradio Endpoints -------------------
# 1) /predict — what ENGINE expects (only 'prompt' → string)
def predict_for_engine(prompt: str) -> str:
    return _generate(prompt=prompt, system="", max_new_tokens=512, temperature=0.7, top_p=0.95)

# 2) /generate — more controls (prompt + system + params)
def generate_advanced(prompt: str, system: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
    return _generate(prompt=prompt, system=system, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p)

def salamandra_chat_endpoint(prompt: str) -> Dict[str, Any]:
    global _salamandra
    if _salamandra is None:
        _salamandra = SalamandraClient()   # use your class

    try:
        text = _salamandra.chat(prompt)
    except Exception as e:
        text = f"Error running SalamandraClient: {str(e)}"

    return {"text": text}

def resume_sentence(sentence, num_words):
    """
    Summarizes the given sentence in the specified number of words.
    
    Parameters:
    - sentence (str): The sentence to summarize.
    - num_words (int): The number of words for the summary.
    
    Returns:
    - str: The summarized sentence.
    """
    num_words = int(num_words)
    
    # Prompt the model to summarize the sentence
    prompt = f"Instrució: Resumeix la següent frase en {num_words} paraules. Input: {sentence}"
    result = generate_advanced(prompt=prompt, system="", max_new_tokens=512, temperature=0.7, top_p=0.95)
    
    # Clean the output if it contains 'assistant' role
    if "assistant" in result:
        clean_output = result.split("assistant", 1)[1].strip().split("\n")[0]
    else:
        clean_output = sentence
    
    return clean_output

def identity_manager(sentence, person):
    """
    Replaces the subject of the sentence with the indicated person, keeping the rest unchanged.
    """
    prompt = f"""Instrucció: Substitueix el subjecte de la frase per la persona indicada, mantenint la resta igual.
        Frase: {sentence}
        Substitució: {person}
        Resposta:"""
    
    # Generate the modified sentence using the advanced generator
    result = generate_advanced(prompt=prompt, system="", max_new_tokens=512, temperature=0.7, top_p=0.95)
    
    # Clean the output if it contains 'assistant' role
    if "assistant" in result:
        clean_output = result.split("assistant", 1)[1].strip().split("\n")[0]
    else:
        clean_output = sentence
    
    return clean_output

def free_narration(srt_text):
    """
    Converts the given audio description into a short, natural, and coherent free narration.
    """
    prompt = f"""Instrucció: Converteix aquesta audiodescripció en una narració lliure breu, natural i coherent.,
        input: {srt_final}
        output:
    """
    
    # Generate the free narration using the advanced generator
    result = generate_advanced(prompt=prompt, system="", max_new_tokens=512, temperature=0.7, top_p=0.95)
    
    # Clean the output if it contains 'assistant' role
    if "assistant" in result:
        clean_output = result.split("assistant", 1)[1].strip().split("\n")[0]
    else:
        clean_output = srt_text  # fallback to original input
    
    return clean_output

# ------------------- HTTP (opcional, clientes puros) -------------------
# Si quieres, puedes añadir un endpoint HTTP POST /generate (FastAPI),
# pero con Gradio Client es suficiente para engine/local.

# ------------------- UI -------------------
custom_css = """
h2 {
    background: #e3e4e6 !important;
    padding: 14px 22px !important;
    border-radius: 14px !important;
    box-shadow: 0 4px 12px rgba(0,0,0,0.08) !important;
    display: block !important;       /* ocupa tot l'ample */
    width: 100% !important;          /* assegura 100% */
    margin: 20px auto !important;
    text-align:center;
}
"""

with gr.Blocks(title="Salamandra 7B Instruct · ZeroGPU",css=custom_css,theme=gr.themes.Soft()) as demo:
    gr.Markdown("## Salamandra-7B-Instruct · ZeroGPU\nTexto → respuesta instruccional.")
    with gr.Row():
        with gr.Column(scale=1):
            in_system = gr.Textbox(label="System (opcional)", value="")
            in_prompt = gr.Textbox(label="Prompt", placeholder="Escribe tu instrucción…", lines=6)
            max_new = gr.Slider(16, 2048, value=512, step=16, label="max_new_tokens")
            temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
            top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.01, label="top_p")
            btn = gr.Button("Generar", variant="primary")
        with gr.Column(scale=1):
            out = gr.Textbox(label="Respuesta", lines=18)

    btn.click(generate_advanced, [in_prompt, in_system, max_new, temp, top_p], out, api_name="generate", concurrency_limit=1)
    gr.Markdown("---")
    # Endpoint minimalista compatible con el ENGINE (/predict: solo prompt)
    in_prompt_engine = gr.Textbox(label="Prompt (ENGINE)", value="Di hola en una frase.")
    out_engine = gr.Textbox(label="Respuesta (ENGINE)")
    gr.Button("Probar /predict").click(predict_for_engine, [in_prompt_engine], out_engine, api_name="predict", concurrency_limit=1)
    gr.Markdown("---")
    
    gr.Markdown('<h2 style="text-align:center">Resumir frases</h2>')
    with gr.Row():
        with gr.Column(scale=1):
            sentence = gr.Textbox(label="Frase a resumir", value="", lines=3)
            num_words = gr.Textbox(label="Nombre de paraules del resum", value="4")
        with gr.Column(scale=1):
            out_resume = gr.Textbox(label="Resposta", lines=18)
    with gr.Row():
        btn_resume = gr.Button("Resumir", variant="primary")

    btn_resume.click(
        resume_sentence,
        inputs=[sentence, num_words],
        outputs=out_resume,
        api_name="resume",
        concurrency_limit=1
    )
    gr.Markdown("---")
    
    gr.Markdown('<h2 style="text-align:center">Inclusió d’identitats</h2>')
    with gr.Row():
        with gr.Column(scale=1):
            sentence = gr.Textbox(label="Frase a modificar", value="", lines=3)
            person = gr.Textbox(label="Persones reconegudes", value='"Mireia Martí": 4, "Xavier Busquets": 5')
        with gr.Column(scale=1):
            out_modificat = gr.Textbox(label="Resposta", lines=18)
    with gr.Row():
        btn_modify = gr.Button("Modificar frase", variant="primary")
        
    btn_modify.click(
        identity_manager,
        inputs=[sentence, person],
        outputs=out_modificat,
        api_name="modificat",
        concurrency_limit=1
    )

    gr.Markdown('<h2 style="text-align:center">Narració lliure</h2>')
    with gr.Row():
        with gr.Column(scale=1):
            srt = gr.Textbox(label="Audiodescripció", value="(AD)\nTOTS CANTANT: avui celebrem la nostra festa major\nAINA: som hi tots a ballar", lines=3)
            btn_modify = gr.Button("Generar narració lliure", variant="primary")
        with gr.Column(scale=1):
            narració_lliure = gr.Textbox(label="Narració lliure", lines=18)

    btn_modify.click(
        free_narration,
        inputs=[srt],
        outputs=narració_lliure,
        api_name="narració",
        concurrency_limit=1
    )

    gr.Markdown('<h2 style="text-align:center">Sortida del model Salamandra a partir d’una petició</h2>')
    with gr.Row():
        prompt = gr.Textbox(label="prompt", lines=10)
    with gr.Row():
        btn2 = gr.Button("Generar", variant="primary")
    with gr.Row():
        out2 = gr.JSON(label="Salida")

    btn2.click(salamandra_chat_endpoint, [prompt], out2, api_name="generate_out_from_prompt", concurrency_limit=1)
    gr.Markdown("---")

demo.queue(max_size=16).launch()