Spaces:

VeuReu
/

schat

Running on Zero

File size: 14,320 Bytes

73cad8e
7b4bcba
 
92cec32
7b4bcba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db72e25
7b4bcba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73cad8e
 
7b4bcba
 
 
 
 
 
 
 
 
 
73cad8e
 
7b4bcba
73cad8e
7b4bcba
73cad8e
 
 
 
 
 
 
 
 
 
 
0fb6f95
73cad8e
 
 
 
 
 
 
 
 
 
 
 
0fb6f95
73cad8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b4bcba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73cad8e
7b4bcba
 
 
73cad8e
7b4bcba
 
 
286235d
 
 
73cad8e
286235d
 
 
 
73cad8e
286235d
 
 
73cad8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ddee9da
73cad8e
 
ddee9da
 
 
73cad8e
 
ddee9da
 
73cad8e
 
 
 
ddee9da
73cad8e
 
ddee9da
73cad8e
 
ddee9da
73cad8e
 
ddee9da
 
 
73cad8e
 
ddee9da
 
73cad8e
 
 
 
cb4cd52
c86d72a
cb4cd52
 
73cad8e
 
cb4cd52
73cad8e
 
cb4cd52
 
 
73cad8e
 
cb4cd52
 
7b4bcba
 
 
 
 
5b17d04
 
 
 
 
 
 
 
 
 
 
 
 
18cf1c5
 
 
 
 
 
7b4bcba
 
18cf1c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b4bcba
18cf1c5
7b4bcba
18cf1c5
 
 
 
 
 
 
 
 
 
 
7b4bcba
18cf1c5
5b17d04
18cf1c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b17d04
18cf1c5
 
 
ddee9da
18cf1c5
ddee9da
 
18cf1c5
b28c85b
18cf1c5
 
b28c85b
18cf1c5
ddee9da
18cf1c5
b28c85b
18cf1c5
5b17d04
18cf1c5
b28c85b
ddee9da
b28c85b
73cad8e
b28c85b
 
 
ddee9da
 
18cf1c5
 
5b17d04
18cf1c5
 
 
ddee9da
18cf1c5
ddee9da
 
18cf1c5
b28c85b
18cf1c5
 
b28c85b
18cf1c5
ddee9da
 
18cf1c5
5b17d04
b28c85b
18cf1c5
b28c85b
ddee9da
b28c85b
ddee9da
b28c85b
ddee9da
 
 
18cf1c5
 
 
 
 
cb4cd52
18cf1c5
cb4cd52
 
18cf1c5
 
 
 
 
 
 
b28c85b
18cf1c5
cb4cd52
 
 
b28c85b
cb4cd52
 
 
 
 
 
 
18cf1c5
 
 
 
 
73cad8e
18cf1c5
73cad8e
18cf1c5
 
73cad8e
 
18cf1c5
73cad8e
18cf1c5
 
 
 
 
 
 
 
 
73cad8e
18cf1c5
73cad8e
18cf1c5
73cad8e
18cf1c5
7b4bcba

# app.py — veureu/schat (Salamandra 7B Instruct · ZeroGPU) — compatible with ENGINE
from __future__ import annotations
import os, json
from typing import List, Dict, Any, Optional, Tuple

import gradio as gr
import spaces
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
)

from transformers import AutoTokenizer, AutoModelForCausalLM
from moe_tools import SalamandraClient

# ===== Config =====
MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-instruct")
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

_tok = None
_model = None
_salamandra = None

def _lazy_load() -> Tuple[AutoTokenizer, AutoModelForCausalLM]:
    global _tok, _model
    if _tok is None or _model is None:
        _tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, trust_remote_code=True)
        _model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=DTYPE,
            low_cpu_mem_usage=True,
            use_safetensors=True,
            trust_remote_code=True,
            device_map=None,
        ).to(DEVICE)
    return _tok, _model

def _build_prompt(prompt: str, system: Optional[str]) -> str:
    """
    If the tokenizer has 'chat_template', use it with messages [system?, user].
    Otherwise, create a plain prompt with system at the top.
    """
    tok, _ = _lazy_load()
    messages = []
    if system and system.strip():
        messages.append({"role": "system", "content": system.strip()})
    messages.append({"role": "user", "content": prompt})

    chat_template = getattr(tok, "chat_template", None)
    if chat_template:
        return tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    # Fallback without chat template
    sys_part = (f"<<SYS>>\n{system.strip()}\n<</SYS>>\n\n" if system and system.strip() else "")
    return sys_part + f"### Instrucció\n{prompt}\n\n### Resposta\n"

#@spaces.GPU  # use GPU if available (ZeroGPU)
#def _generate_with_tools(
#    messages: List[Dict[str, str]],
#    tools: List[Dict[str, Any]],
#    max_new_tokens: int = 512,
#    temperature: float = 0.7,
#    top_p: float = 0.95,
#) -> Dict[str, Any]:
#    tok, model = _lazy_load()
#    tools_md = _render_tools_md(tools)
#    prompt = _compose_chat_prompt(messages, tools_md)

#    inputs = tok(prompt, return_tensors="pt").to(DEVICE)
#    with torch.inference_mode():
#        out = model.generate(
#            **inputs,
#            max_new_tokens=int(max_new_tokens),
#            temperature=float(temperature),
#            top_p=float(top_p),
#            do_sample=True if temperature > 0 else False,
#            pad_token_id=tok.eos_token_id,
#            eos_token_id=tok.eos_token_id,
#        )
#    text = tok.decode(out[0], skip_special_tokens=True).strip()

#    # If the model returns a JSON block with 'tool_calls', try to extract it
#    tool_calls: List[Dict[str, Any]] = []
#    try:
#        # Search for the last {...} containing "tool_calls"
#        matches = list(re.finditer(r"\{.*?\"tool_calls\".*?\}", text, flags=re.S))
#        if matches:
#            block = text[matches[-1].start():matches[-1].end()]
#            obj = json.loads(block)
#            tc = obj.get("tool_calls", [])
#            if isinstance(tc, list):
#                tool_calls = tc
#    except Exception:
#        pass

    # Execute the extracted tool calls if any
#    tool_results = maybe_execute_tool_calls(tool_calls) if tool_calls else []

#    return {"text": text, "tool_calls": tool_calls, "tool_results": tool_results}

@spaces.GPU # use GPU if available (ZeroGPU)
def _generate(
    prompt: str,
    system: str = "",
    max_new_tokens: int = 512,
    temperature: float = 0.7,
    top_p: float = 0.95,
) -> str:
    tok, model = _lazy_load()
    text = _build_prompt(prompt, system or "")
    inputs = tok(text, return_tensors="pt").to(DEVICE)

    with torch.inference_mode():
        out = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            do_sample=True if temperature > 0 else False,
            pad_token_id=tok.eos_token_id,
            eos_token_id=tok.eos_token_id,
        )
    return tok.decode(out[0], skip_special_tokens=True).strip()

# ------------------- Gradio Endpoints -------------------
# 1) /predict — what ENGINE expects (only 'prompt' → string)
def predict_for_engine(prompt: str) -> str:
    return _generate(prompt=prompt, system="", max_new_tokens=512, temperature=0.7, top_p=0.95)

# 2) /generate — more controls (prompt + system + params)
def generate_advanced(prompt: str, system: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
    return _generate(prompt=prompt, system=system, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p)

def salamandra_chat_endpoint(prompt: str) -> Dict[str, Any]:
    global _salamandra
    if _salamandra is None:
        _salamandra = SalamandraClient()   # use your class

    try:
        text = _salamandra.chat(prompt)
    except Exception as e:
        text = f"Error running SalamandraClient: {str(e)}"

    return {"text": text}

def resume_sentence(sentence, num_words):
    """
    Summarizes the given sentence in the specified number of words.
    
    Parameters:
    - sentence (str): The sentence to summarize.
    - num_words (int): The number of words for the summary.
    
    Returns:
    - str: The summarized sentence.
    """
    num_words = int(num_words)
    
    # Prompt the model to summarize the sentence
    prompt = f"Instrució: Resumeix la següent frase en {num_words} paraules. Input: {sentence}"
    result = generate_advanced(prompt=prompt, system="", max_new_tokens=512, temperature=0.7, top_p=0.95)
    
    # Clean the output if it contains 'assistant' role
    if "assistant" in result:
        clean_output = result.split("assistant", 1)[1].strip().split("\n")[0]
    else:
        clean_output = sentence
    
    return clean_output

def identity_manager(sentence, person):
    """
    Replaces the subject of the sentence with the indicated person, keeping the rest unchanged.
    """
    prompt = f"""Instrucció: Substitueix el subjecte de la frase per la persona indicada, mantenint la resta igual.
        Frase: {sentence}
        Substitució: {person}
        Resposta:"""
    
    # Generate the modified sentence using the advanced generator
    result = generate_advanced(prompt=prompt, system="", max_new_tokens=512, temperature=0.7, top_p=0.95)
    
    # Clean the output if it contains 'assistant' role
    if "assistant" in result:
        clean_output = result.split("assistant", 1)[1].strip().split("\n")[0]
    else:
        clean_output = sentence
    
    return clean_output

def free_narration(srt_text):
    """
    Converts the given audio description into a short, natural, and coherent free narration.
    """
    prompt = f"""Instrucció: Converteix aquesta audiodescripció en una narració lliure breu, natural i coherent.,
        input: {srt_text}
        output:
    """
    
    # Generate the free narration using the advanced generator
    result = generate_advanced(prompt=prompt, system="", max_new_tokens=512, temperature=0.7, top_p=0.95)
    
    # Clean the output if it contains 'assistant' role
    if "assistant" in result:
        clean_output = result.split("assistant", 1)[1].strip().split("\n")[0]
    else:
        clean_output = srt_text  # fallback to original input
    
    return clean_output

# ------------------- HTTP (opcional, clientes puros) -------------------
# Si quieres, puedes añadir un endpoint HTTP POST /generate (FastAPI),
# pero con Gradio Client es suficiente para engine/local.

# ------------------- UI -------------------
custom_css = """
h2 {
    background: #e3e4e6 !important;
    padding: 14px 22px !important;
    border-radius: 14px !important;
    box-shadow: 0 4px 12px rgba(0,0,0,0.08) !important;
    display: block !important;       /* ocupa tot l'ample */
    width: 100% !important;          /* assegura 100% */
    margin: 20px auto !important;
    text-align:center;
}
"""

# App UI built with Gradio. This interface exposes several model utilities.
with gr.Blocks(title="Salamandra 7B Instruct · ZeroGPU", css=custom_css, theme=gr.themes.Soft()) as demo:

    # Section: Instruction-based text generation
    gr.Markdown("## Salamandra-7B-Instruct · ZeroGPU\nText → resposta instruccional.")

    with gr.Row():
        with gr.Column(scale=1):
            # System prompt (optional internal conditioning)
            in_system = gr.Textbox(label="Sistema (opcional)", value="")

            # User prompt to instruct the model
            in_prompt = gr.Textbox(label="Instrucció", placeholder="Escriu la teva instrucció…", lines=6)

            # Maximum number of new tokens to generate
            max_new = gr.Slider(16, 2048, value=512, step=16, label="Màxim de tokens nous")

            # Diversity parameter for randomness
            temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="Temperatura")

            # Nucleus sampling threshold
            top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.01, label="Top-p")

            # Button to trigger text generation
            btn = gr.Button("Generar", variant="primary")

        with gr.Column(scale=1):
            # Output box for generated text
            out = gr.Textbox(label="Resposta", lines=18)

    # Bind main generation function
    btn.click(
        generate_advanced,
        [in_prompt, in_system, max_new, temp, top_p],
        out,
        api_name="generate",
        concurrency_limit=1
    )

    # --------------------------------------------------------------
    gr.Markdown("---")
    # --------------------------------------------------------------

    # Minimal endpoint for ENGINE compatibility (/predict)
    # Only requires a prompt, returns generated text
    in_prompt_engine = gr.Textbox(label="Instrucció (ENGINE)", value="Digues hola en una frase.")
    out_engine = gr.Textbox(label="Resposta (ENGINE)")

    gr.Button("Provar /predict").click(
        predict_for_engine,
        [in_prompt_engine],
        out_engine,
        api_name="predict",
        concurrency_limit=1
    )

    # --------------------------------------------------------------
    gr.Markdown("---")
    # --------------------------------------------------------------

    # Section: Sentence summarization
    gr.Markdown('<h2 style="text-align:center">Resumir frases</h2>')

    with gr.Row():
        with gr.Column(scale=1):
            # Text to summarize
            sentence = gr.Textbox(label="Frase a resumir", value="", lines=3)

            # Desired number of words in the summary
            num_words = gr.Textbox(label="Nombre de paraules del resum", value="4")

        with gr.Column(scale=1):
            # Output summary
            out_resume = gr.Textbox(label="Resposta", lines=18)

    with gr.Row():
        # Button to produce a summary
        btn_resume = gr.Button("Resumir", variant="primary")

    btn_resume.click(
        resume_sentence,
        inputs=[sentence, num_words],
        outputs=out_resume,
        api_name="resume",
        concurrency_limit=1
    )

    # --------------------------------------------------------------
    gr.Markdown("---")
    # --------------------------------------------------------------

    # Section: Inclusion of identities inside text
    gr.Markdown('<h2 style="text-align:center">Inclusió d’identitats</h2>')

    with gr.Row():
        with gr.Column(scale=1):
            # Sentence to modify
            sentence = gr.Textbox(label="Frase a modificar", value="", lines=3)

            # Identity mapping provided by the user
            person = gr.Textbox(label="Persones reconegudes", value='"Mireia Martí": 4, "Xavier Busquets": 5')

        with gr.Column(scale=1):
            out_modificat = gr.Textbox(label="Resposta", lines=18)

    with gr.Row():
        btn_modify = gr.Button("Modificar frase", variant="primary")

    btn_modify.click(
        identity_manager,
        inputs=[sentence, person],
        outputs=out_modificat,
        api_name="modificat",
        concurrency_limit=1
    )

    # --------------------------------------------------------------
    gr.Markdown("---")
    # --------------------------------------------------------------

    # Section: Free narration generation from SRT-like audio description
    gr.Markdown('<h2 style="text-align:center">Narració lliure</h2>')

    with gr.Row():
        with gr.Column(scale=1):
            # SRT-like structured description
            srt = gr.Textbox(
                label="Audiodescripció",
                value="(AD)\nTOTS CANTANT: avui celebrem la nostra festa major\nAINA: som hi tots a ballar",
                lines=3
            )

            btn_modify = gr.Button("Generar narració lliure", variant="primary")

        with gr.Column(scale=1):
            narració_lliure = gr.Textbox(label="Narració lliure", lines=18)

    btn_modify.click(
        free_narration,
        inputs=[srt],
        outputs=narració_lliure,
        api_name="narració",
        concurrency_limit=1
    )

    # --------------------------------------------------------------
    gr.Markdown("---")
    # --------------------------------------------------------------

    # Section: Raw model output from a prompt (JSON)
    gr.Markdown('<h2 style="text-align:center">Sortida del model Salamandra a partir d’una petició</h2>')

    with gr.Row():
        prompt = gr.Textbox(label="Prompt", lines=10)

    with gr.Row():
        btn2 = gr.Button("Generar", variant="primary")

    with gr.Row():
        out2 = gr.JSON(label="Sortida")

    btn2.click(
        salamandra_chat_endpoint,
        [prompt],
        out2,
        api_name="generate_out_from_prompt",
        concurrency_limit=1
    )

    # --------------------------------------------------------------
    gr.Markdown("---")
    # --------------------------------------------------------------

# Queue to handle multiple requests safely
demo.queue(max_size=16).launch()