File size: 14,320 Bytes
73cad8e
7b4bcba
 
92cec32
7b4bcba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db72e25
7b4bcba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73cad8e
 
7b4bcba
 
 
 
 
 
 
 
 
 
73cad8e
 
7b4bcba
73cad8e
7b4bcba
73cad8e
 
 
 
 
 
 
 
 
 
 
0fb6f95
73cad8e
 
 
 
 
 
 
 
 
 
 
 
0fb6f95
73cad8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b4bcba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73cad8e
7b4bcba
 
 
73cad8e
7b4bcba
 
 
286235d
 
 
73cad8e
286235d
 
 
 
73cad8e
286235d
 
 
73cad8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ddee9da
73cad8e
 
ddee9da
 
 
73cad8e
 
ddee9da
 
73cad8e
 
 
 
ddee9da
73cad8e
 
ddee9da
73cad8e
 
ddee9da
73cad8e
 
ddee9da
 
 
73cad8e
 
ddee9da
 
73cad8e
 
 
 
cb4cd52
c86d72a
cb4cd52
 
73cad8e
 
cb4cd52
73cad8e
 
cb4cd52
 
 
73cad8e
 
cb4cd52
 
7b4bcba
 
 
 
 
5b17d04
 
 
 
 
 
 
 
 
 
 
 
 
18cf1c5
 
 
 
 
 
7b4bcba
 
18cf1c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b4bcba
18cf1c5
7b4bcba
18cf1c5
 
 
 
 
 
 
 
 
 
 
7b4bcba
18cf1c5
5b17d04
18cf1c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b17d04
18cf1c5
 
 
ddee9da
18cf1c5
ddee9da
 
18cf1c5
b28c85b
18cf1c5
 
b28c85b
18cf1c5
ddee9da
18cf1c5
b28c85b
18cf1c5
5b17d04
18cf1c5
b28c85b
ddee9da
b28c85b
73cad8e
b28c85b
 
 
ddee9da
 
18cf1c5
 
5b17d04
18cf1c5
 
 
ddee9da
18cf1c5
ddee9da
 
18cf1c5
b28c85b
18cf1c5
 
b28c85b
18cf1c5
ddee9da
 
18cf1c5
5b17d04
b28c85b
18cf1c5
b28c85b
ddee9da
b28c85b
ddee9da
b28c85b
ddee9da
 
 
18cf1c5
 
 
 
 
cb4cd52
18cf1c5
cb4cd52
 
18cf1c5
 
 
 
 
 
 
b28c85b
18cf1c5
cb4cd52
 
 
b28c85b
cb4cd52
 
 
 
 
 
 
18cf1c5
 
 
 
 
73cad8e
18cf1c5
73cad8e
18cf1c5
 
73cad8e
 
18cf1c5
73cad8e
18cf1c5
 
 
 
 
 
 
 
 
73cad8e
18cf1c5
73cad8e
18cf1c5
73cad8e
18cf1c5
7b4bcba
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
# app.py — veureu/schat (Salamandra 7B Instruct · ZeroGPU) — compatible with ENGINE
from __future__ import annotations
import os, json
from typing import List, Dict, Any, Optional, Tuple

import gradio as gr
import spaces
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
)

from transformers import AutoTokenizer, AutoModelForCausalLM
from moe_tools import SalamandraClient

# ===== Config =====
# Hub repository to load; can be overridden through the MODEL_ID env variable.
MODEL_ID = os.getenv("MODEL_ID", "BSC-LT/salamandra-7b-instruct")

# Half precision on CUDA, full precision on CPU.
if torch.cuda.is_available():
    DTYPE, DEVICE = torch.float16, "cuda"
else:
    DTYPE, DEVICE = torch.float32, "cpu"

# Lazily-created singletons (filled in on first use by the functions below).
_tok = None
_model = None
_salamandra = None

def _lazy_load() -> Tuple[AutoTokenizer, AutoModelForCausalLM]:
    """
    Return the (tokenizer, model) pair, loading both on the first call.

    The objects are cached in the module globals ``_tok`` and ``_model`` so
    that later calls reuse them. The model is loaded from ``MODEL_ID`` with
    dtype ``DTYPE`` and then moved to ``DEVICE``.
    """
    global _tok, _model

    # Fast path: everything is already in memory.
    if _tok is not None and _model is not None:
        return _tok, _model

    _tok = AutoTokenizer.from_pretrained(
        MODEL_ID,
        use_fast=True,
        trust_remote_code=True,
    )
    loaded = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=DTYPE,
        low_cpu_mem_usage=True,
        use_safetensors=True,
        trust_remote_code=True,
        device_map=None,
    )
    # Only publish the model in the global once it sits on the target device.
    _model = loaded.to(DEVICE)
    return _tok, _model

def _build_prompt(prompt: str, system: Optional[str]) -> str:
    """
    Turn a user prompt (and an optional system message) into the model input text.

    When the tokenizer exposes a non-empty ``chat_template``, the messages
    ``[system?, user]`` are rendered through it with the generation prompt
    appended. Otherwise a plain-text layout is produced, with the system
    message (if any) wrapped in ``<<SYS>>`` markers at the top.
    """
    tok, _ = _lazy_load()

    # Normalise once: empty / whitespace-only / None all mean "no system message".
    system_text = system.strip() if system else ""

    messages = [{"role": "system", "content": system_text}] if system_text else []
    messages.append({"role": "user", "content": prompt})

    if getattr(tok, "chat_template", None):
        return tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Fallback without chat template
    header = f"<<SYS>>\n{system_text}\n<</SYS>>\n\n" if system_text else ""
    return header + f"### Instrucció\n{prompt}\n\n### Resposta\n"

#@spaces.GPU  # use GPU if available (ZeroGPU)
#def _generate_with_tools(
#    messages: List[Dict[str, str]],
#    tools: List[Dict[str, Any]],
#    max_new_tokens: int = 512,
#    temperature: float = 0.7,
#    top_p: float = 0.95,
#) -> Dict[str, Any]:
#    tok, model = _lazy_load()
#    tools_md = _render_tools_md(tools)
#    prompt = _compose_chat_prompt(messages, tools_md)

#    inputs = tok(prompt, return_tensors="pt").to(DEVICE)
#    with torch.inference_mode():
#        out = model.generate(
#            **inputs,
#            max_new_tokens=int(max_new_tokens),
#            temperature=float(temperature),
#            top_p=float(top_p),
#            do_sample=True if temperature > 0 else False,
#            pad_token_id=tok.eos_token_id,
#            eos_token_id=tok.eos_token_id,
#        )
#    text = tok.decode(out[0], skip_special_tokens=True).strip()

#    # If the model returns a JSON block with 'tool_calls', try to extract it
#    tool_calls: List[Dict[str, Any]] = []
#    try:
#        # Search for the last {...} containing "tool_calls"
#        matches = list(re.finditer(r"\{.*?\"tool_calls\".*?\}", text, flags=re.S))
#        if matches:
#            block = text[matches[-1].start():matches[-1].end()]
#            obj = json.loads(block)
#            tc = obj.get("tool_calls", [])
#            if isinstance(tc, list):
#                tool_calls = tc
#    except Exception:
#        pass

#    # Execute the extracted tool calls if any
#    tool_results = maybe_execute_tool_calls(tool_calls) if tool_calls else []

#    return {"text": text, "tool_calls": tool_calls, "tool_results": tool_results}

@spaces.GPU # use GPU if available (ZeroGPU)
def _generate(
    prompt: str,
    system: str = "",
    max_new_tokens: int = 512,
    temperature: float = 0.7,
    top_p: float = 0.95,
) -> str:
    """
    Run one generation with the lazily-loaded model and return the decoded text.

    Parameters:
    - prompt: user instruction.
    - system: optional system message; a falsy value is treated as "".
    - max_new_tokens: upper bound on generated tokens (cast to int).
    - temperature: sampling temperature; a value <= 0 selects greedy decoding.
    - top_p: nucleus sampling threshold, only used when sampling.

    Returns:
    - The first returned sequence, decoded with special tokens skipped and
      surrounding whitespace stripped. The full sequence ``out[0]`` is decoded
      (presumably prompt plus continuation for a causal LM), which is why
      other helpers in this file split the result on "assistant".
    """
    tok, model = _lazy_load()
    text = _build_prompt(prompt, system or "")
    inputs = tok(text, return_tensors="pt").to(DEVICE)

    temperature = float(temperature)
    gen_kwargs = {
        "max_new_tokens": int(max_new_tokens),
        "pad_token_id": tok.eos_token_id,
        "eos_token_id": tok.eos_token_id,
    }
    if temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=float(top_p))
    else:
        # Greedy decoding: do not forward temperature/top_p. A temperature of 0
        # is not a valid sampling value and the sampling flags are ignored
        # (with warnings) when do_sample is False.
        gen_kwargs["do_sample"] = False

    with torch.inference_mode():
        out = model.generate(**inputs, **gen_kwargs)
    return tok.decode(out[0], skip_special_tokens=True).strip()

# ------------------- Gradio Endpoints -------------------
# 1) /predict — what ENGINE expects (only 'prompt' → string)
def predict_for_engine(prompt: str) -> str:
    """
    Generate a reply for ``prompt`` using fixed settings.

    No system message is used; generation runs with 512 max new tokens,
    temperature 0.7 and top-p 0.95.
    """
    fixed_settings = {
        "system": "",
        "max_new_tokens": 512,
        "temperature": 0.7,
        "top_p": 0.95,
    }
    return _generate(prompt=prompt, **fixed_settings)

# 2) /generate — more controls (prompt + system + params)
def generate_advanced(prompt: str, system: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
    """
    Generate a reply with caller-supplied system message and sampling settings.

    All arguments are forwarded unchanged, by keyword, to ``_generate``.
    """
    reply = _generate(
        prompt=prompt,
        system=system,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    return reply

def salamandra_chat_endpoint(prompt: str) -> Dict[str, Any]:
    """
    Send ``prompt`` to a shared ``SalamandraClient`` and wrap the reply in a dict.

    The client is created on first use and kept in the module global
    ``_salamandra``. Any exception raised by ``chat`` is turned into an error
    message instead of propagating; client construction is not guarded.

    Returns:
    - dict with a single key ``"text"`` holding the reply or the error message.
    """
    global _salamandra
    if _salamandra is None:
        _salamandra = SalamandraClient()   # use your class

    try:
        return {"text": _salamandra.chat(prompt)}
    except Exception as exc:
        return {"text": f"Error running SalamandraClient: {exc!s}"}

def resume_sentence(sentence, num_words):
    """
    Ask the model to summarize a sentence in a given number of words.

    Parameters:
    - sentence (str): The sentence to summarize.
    - num_words (int | str): Target word count; converted with ``int()``.

    Returns:
    - str: The first line of the text that follows the first "assistant"
      marker in the model output, or the original ``sentence`` when the
      marker is absent.
    """
    word_budget = int(num_words)

    request = f"Instrució: Resumeix la següent frase en {word_budget} paraules. Input: {sentence}"
    raw = generate_advanced(prompt=request, system="", max_new_tokens=512, temperature=0.7, top_p=0.95)

    # Everything after the first "assistant" marker is treated as the answer.
    _, marker, answer = raw.partition("assistant")
    if not marker:
        return sentence

    # Keep only the first line of the answer.
    return answer.strip().partition("\n")[0]

def identity_manager(sentence, person):
    """
    Ask the model to replace the subject of ``sentence`` with ``person``.

    Both values are inserted verbatim into the instruction prompt.

    Returns the first line of the text that follows the first "assistant"
    marker in the model output, or the original ``sentence`` when the marker
    is absent.
    """
    prompt = f"""Instrucció: Substitueix el subjecte de la frase per la persona indicada, mantenint la resta igual.
        Frase: {sentence}
        Substitució: {person}
        Resposta:"""

    raw = generate_advanced(prompt=prompt, system="", max_new_tokens=512, temperature=0.7, top_p=0.95)

    # Everything after the first "assistant" marker is treated as the answer.
    _, marker, answer = raw.partition("assistant")
    if not marker:
        return sentence

    # Keep only the first line of the answer.
    return answer.strip().partition("\n")[0]

def free_narration(srt_text):
    """
    Ask the model to rewrite an audio description as a short free narration.

    Returns the first line of the text that follows the first "assistant"
    marker in the model output, or the original ``srt_text`` when the marker
    is absent.
    """
    prompt = f"""Instrucció: Converteix aquesta audiodescripció en una narració lliure breu, natural i coherent.,
        input: {srt_text}
        output:
    """

    raw = generate_advanced(prompt=prompt, system="", max_new_tokens=512, temperature=0.7, top_p=0.95)

    # Everything after the first "assistant" marker is treated as the answer.
    _, marker, answer = raw.partition("assistant")
    if not marker:
        return srt_text  # fallback to original input

    # Keep only the first line of the answer.
    return answer.strip().partition("\n")[0]

# ------------------- HTTP (optional, plain clients) -------------------
# If needed, an HTTP POST /generate endpoint (FastAPI) could be added here,
# but the Gradio Client is sufficient for engine/local use.

# ------------------- UI -------------------
# Stylesheet passed to gr.Blocks(css=...) below: renders every <h2> as a
# full-width, centered, rounded grey banner.
custom_css = """
h2 {
    background: #e3e4e6 !important;
    padding: 14px 22px !important;
    border-radius: 14px !important;
    box-shadow: 0 4px 12px rgba(0,0,0,0.08) !important;
    display: block !important;       /* ocupa tot l'ample */
    width: 100% !important;          /* assegura 100% */
    margin: 20px auto !important;
    text-align:center;
}
"""

# App UI built with Gradio. Each section below wires one helper function to a
# button and publishes it under an explicit `api_name`:
#   generate, predict, resume, modificat, narració, generate_out_from_prompt.
# Every event is registered with concurrency_limit=1.
with gr.Blocks(title="Salamandra 7B Instruct · ZeroGPU", css=custom_css, theme=gr.themes.Soft()) as demo:

    # Section: Instruction-based text generation
    gr.Markdown("## Salamandra-7B-Instruct · ZeroGPU\nText → resposta instruccional.")

    with gr.Row():
        with gr.Column(scale=1):
            # System prompt (optional; empty by default)
            in_system = gr.Textbox(label="Sistema (opcional)", value="")

            # User prompt to instruct the model
            in_prompt = gr.Textbox(label="Instrucció", placeholder="Escriu la teva instrucció…", lines=6)

            # Maximum number of new tokens to generate (16..2048, default 512)
            max_new = gr.Slider(16, 2048, value=512, step=16, label="Màxim de tokens nous")

            # Sampling temperature (0.0..1.5, default 0.7)
            temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="Temperatura")

            # Nucleus sampling threshold (0.1..1.0, default 0.95)
            top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.01, label="Top-p")

            # Button to trigger text generation
            btn = gr.Button("Generar", variant="primary")

        with gr.Column(scale=1):
            # Output box for generated text
            out = gr.Textbox(label="Resposta", lines=18)

    # Bind main generation function; input order matches the parameter order
    # of generate_advanced(prompt, system, max_new_tokens, temperature, top_p).
    btn.click(
        generate_advanced,
        [in_prompt, in_system, max_new, temp, top_p],
        out,
        api_name="generate",
        concurrency_limit=1
    )

    # --------------------------------------------------------------
    gr.Markdown("---")
    # --------------------------------------------------------------

    # Minimal endpoint for ENGINE compatibility (/predict)
    # Only requires a prompt, returns generated text
    in_prompt_engine = gr.Textbox(label="Instrucció (ENGINE)", value="Digues hola en una frase.")
    out_engine = gr.Textbox(label="Resposta (ENGINE)")

    gr.Button("Provar /predict").click(
        predict_for_engine,
        [in_prompt_engine],
        out_engine,
        api_name="predict",
        concurrency_limit=1
    )

    # --------------------------------------------------------------
    gr.Markdown("---")
    # --------------------------------------------------------------

    # Section: Sentence summarization
    gr.Markdown('<h2 style="text-align:center">Resumir frases</h2>')

    with gr.Row():
        with gr.Column(scale=1):
            # Text to summarize
            sentence = gr.Textbox(label="Frase a resumir", value="", lines=3)

            # Desired number of words in the summary. This is a Textbox, so the
            # value arrives as a string; resume_sentence converts it with int().
            num_words = gr.Textbox(label="Nombre de paraules del resum", value="4")

        with gr.Column(scale=1):
            # Output summary
            out_resume = gr.Textbox(label="Resposta", lines=18)

    with gr.Row():
        # Button to produce a summary
        btn_resume = gr.Button("Resumir", variant="primary")

    btn_resume.click(
        resume_sentence,
        inputs=[sentence, num_words],
        outputs=out_resume,
        api_name="resume",
        concurrency_limit=1
    )

    # --------------------------------------------------------------
    gr.Markdown("---")
    # --------------------------------------------------------------

    # Section: Inclusion of identities inside text
    gr.Markdown('<h2 style="text-align:center">Inclusió d’identitats</h2>')

    with gr.Row():
        with gr.Column(scale=1):
            # Sentence to modify.
            # NOTE: rebinds the name `sentence` used by the summarization
            # section above; that section's click handler was already wired
            # with the earlier component.
            sentence = gr.Textbox(label="Frase a modificar", value="", lines=3)

            # Identity text provided by the user; identity_manager inserts it
            # verbatim into the prompt.
            person = gr.Textbox(label="Persones reconegudes", value='"Mireia Martí": 4, "Xavier Busquets": 5')

        with gr.Column(scale=1):
            # Output box for the modified sentence
            out_modificat = gr.Textbox(label="Resposta", lines=18)

    with gr.Row():
        # Button to run the subject replacement
        btn_modify = gr.Button("Modificar frase", variant="primary")

    btn_modify.click(
        identity_manager,
        inputs=[sentence, person],
        outputs=out_modificat,
        api_name="modificat",
        concurrency_limit=1
    )

    # --------------------------------------------------------------
    gr.Markdown("---")
    # --------------------------------------------------------------

    # Section: Free narration generation from SRT-like audio description
    gr.Markdown('<h2 style="text-align:center">Narració lliure</h2>')

    with gr.Row():
        with gr.Column(scale=1):
            # SRT-like structured description
            srt = gr.Textbox(
                label="Audiodescripció",
                value="(AD)\nTOTS CANTANT: avui celebrem la nostra festa major\nAINA: som hi tots a ballar",
                lines=3
            )

            # NOTE: rebinds `btn_modify`; the identity section's click handler
            # was already attached to the previous button object.
            btn_modify = gr.Button("Generar narració lliure", variant="primary")

        with gr.Column(scale=1):
            # Output box for the generated narration
            narració_lliure = gr.Textbox(label="Narració lliure", lines=18)

    btn_modify.click(
        free_narration,
        inputs=[srt],
        outputs=narració_lliure,
        api_name="narració",
        concurrency_limit=1
    )

    # --------------------------------------------------------------
    gr.Markdown("---")
    # --------------------------------------------------------------

    # Section: Output of salamandra_chat_endpoint for a prompt, shown as JSON
    gr.Markdown('<h2 style="text-align:center">Sortida del model Salamandra a partir d’una petició</h2>')

    with gr.Row():
        prompt = gr.Textbox(label="Prompt", lines=10)

    with gr.Row():
        btn2 = gr.Button("Generar", variant="primary")

    with gr.Row():
        # salamandra_chat_endpoint returns a dict of the form {"text": ...}
        out2 = gr.JSON(label="Sortida")

    btn2.click(
        salamandra_chat_endpoint,
        [prompt],
        out2,
        api_name="generate_out_from_prompt",
        concurrency_limit=1
    )

    # --------------------------------------------------------------
    gr.Markdown("---")
    # --------------------------------------------------------------

# Enable the request queue (max_size=16) and start the app. This runs at
# module import time; there is no `if __name__ == "__main__"` guard.
demo.queue(max_size=16).launch()