VeuReu commited on
Commit
e09a32e
·
verified ·
1 Parent(s): 4abe767

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +255 -231
app.py CHANGED
@@ -1,231 +1,255 @@
1
- # app.py — veureu/stools (Salamandra 7B Tools · ZeroGPU) — compatible con ENGINE
2
- from __future__ import annotations
3
- import os, json, re
4
- from typing import List, Dict, Any, Optional, Tuple
5
-
6
- import gradio as gr
7
- import spaces
8
- import torch
9
- from transformers import AutoTokenizer, AutoModelForCausalLM
10
-
11
- # ================= Config =================
12
- MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-tools")
13
- DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
14
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
15
-
16
- _tok = None
17
- _model = None
18
-
19
- def _lazy_load() -> Tuple[AutoTokenizer, AutoModelForCausalLM]:
20
- global _tok, _model
21
- if _tok is None or _model is None:
22
- _tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, trust_remote_code=True)
23
- _model = AutoModelForCausalLM.from_pretrained(
24
- MODEL_ID,
25
- torch_dtype=DTYPE,
26
- low_cpu_mem_usage=True,
27
- use_safetensors=True,
28
- trust_remote_code=True,
29
- device_map=None,
30
- ).to(DEVICE)
31
- return _tok, _model
32
-
33
-
34
- # =============== Helpers ===============
35
-
36
- def _render_tools_md(tools: List[Dict[str, Any]]) -> str:
37
- """Convierte la especificación OpenAI-style de tools a un bloque breve markdown para el prompt."""
38
- if not tools:
39
- return ""
40
- lines = ["Herramientas disponibles (formato JSON):"]
41
- for t in tools:
42
- name = t.get("function", {}).get("name") or t.get("name") or "tool"
43
- desc = t.get("function", {}).get("description") or t.get("description") or ""
44
- params = t.get("function", {}).get("parameters") or t.get("parameters") or {}
45
- lines.append(f"- **{name}**: {desc} | parámetros: {json.dumps(params)[:600]}")
46
- return "\n".join(lines)
47
-
48
- def _compose_chat_prompt(messages: List[Dict[str, str]], tools_md: str) -> str:
49
- """
50
- Soporta mensajes estilo OpenAI: [{"role":"system|user|assistant", "content":"..."}]
51
- Usa chat_template si está disponible.
52
- """
53
- tok, _ = _lazy_load()
54
- sys_text = ""
55
- usr_msgs: List[Dict[str, str]] = []
56
- for m in messages:
57
- role = m.get("role", "")
58
- content = (m.get("content") or "").strip()
59
- if role == "system":
60
- sys_text += ("\n" + content) if sys_text else content
61
- else:
62
- usr_msgs.append({"role": role, "content": content})
63
-
64
- # injerta descripción de tools en el system
65
- if tools_md:
66
- sys_text = (sys_text + "\n\n" if sys_text else "") + tools_md + \
67
- "\n\nSi decides llamar a una herramienta, devuelve un objeto JSON con la clave 'tool_calls' " \
68
- "y describe tus razonamientos de forma concisa en 'thought' (opcional)."
69
-
70
- # reconstruimos la conversación con system delante
71
- conv: List[Dict[str, str]] = []
72
- if sys_text:
73
- conv.append({"role":"system", "content": sys_text})
74
- conv.extend(usr_msgs)
75
-
76
- chat_template = getattr(tok, "chat_template", None)
77
- if chat_template:
78
- return tok.apply_chat_template(conv, tokenize=False, add_generation_prompt=True)
79
-
80
- # Fallback sin plantilla
81
- rendered = ""
82
- if sys_text:
83
- rendered += f"<<SYS>>\n{sys_text}\n<</SYS>>\n\n"
84
- for m in usr_msgs:
85
- if m["role"] == "user":
86
- rendered += f"### Usuario\n{m['content']}\n\n"
87
- elif m["role"] == "assistant":
88
- rendered += f"### Asistente\n{m['content']}\n\n"
89
- rendered += "### Asistente\n"
90
- return rendered
91
-
92
-
93
- # =============== (Opcional) Mini-ejecutor local de herramientas seguras ===============
94
- # Si el LLM devuelve {"tool_calls":[{"name":"calculator","arguments":{"expr":"2+2"}}]}
95
- # podemos ejecutar algunas herramientas inofensivas de ejemplo.
96
- # Nota: mantén esto muy simple/seguro. Puedes desactivarlo poniendo EXECUTE_TOOLS=False.
97
- EXECUTE_TOOLS = True
98
-
99
- def _safe_calculator(expr: str) -> str:
100
- # Permite solo dígitos, espacios, (), y +-*/.%**
101
- if not re.fullmatch(r"[0-9\.\s\+\-\*\/\%\(\)\^eE]+", expr.replace("**","^")):
102
- return "Rejected expression."
103
- # soporta ^ como potencia -> **
104
- expr = expr.replace("^", "**")
105
- try:
106
- return str(eval(expr, {"__builtins__":{}}, {}))
107
- except Exception as e:
108
- return f"Error: {e}"
109
-
110
- LOCAL_TOOLBOX = {
111
- "calculator": lambda args: _safe_calculator(str(args.get("expr",""))),
112
- }
113
-
114
- def maybe_execute_tool_calls(tool_calls: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
115
- if not EXECUTE_TOOLS:
116
- return []
117
- results = []
118
- for call in tool_calls:
119
- name = call.get("name")
120
- args = call.get("arguments", {})
121
- fn = LOCAL_TOOLBOX.get(name)
122
- if fn is None:
123
- results.append({"name": name, "error": "tool_not_available"})
124
- continue
125
- try:
126
- out = fn(args)
127
- results.append({"name": name, "output": out})
128
- except Exception as e:
129
- results.append({"name": name, "error": str(e)})
130
- return results
131
-
132
-
133
- # =============== Core generation ===============
134
-
135
- @spaces.GPU # usa GPU si está disponible (ZeroGPU)
136
- def _generate_with_tools(
137
- messages: List[Dict[str, str]],
138
- tools: List[Dict[str, Any]],
139
- max_new_tokens: int = 512,
140
- temperature: float = 0.7,
141
- top_p: float = 0.95,
142
- ) -> Dict[str, Any]:
143
- tok, model = _lazy_load()
144
- tools_md = _render_tools_md(tools)
145
- prompt = _compose_chat_prompt(messages, tools_md)
146
-
147
- inputs = tok(prompt, return_tensors="pt").to(DEVICE)
148
- with torch.inference_mode():
149
- out = model.generate(
150
- **inputs,
151
- max_new_tokens=int(max_new_tokens),
152
- temperature=float(temperature),
153
- top_p=float(top_p),
154
- do_sample=True if temperature > 0 else False,
155
- pad_token_id=tok.eos_token_id,
156
- eos_token_id=tok.eos_token_id,
157
- )
158
- text = tok.decode(out[0], skip_special_tokens=True).strip()
159
-
160
- # Si el modelo devuelve un bloque JSON con 'tool_calls', lo intentamos extraer.
161
- tool_calls: List[Dict[str, Any]] = []
162
- try:
163
- # busca el último {...} que contenga "tool_calls"
164
- matches = list(re.finditer(r"\{.*?\"tool_calls\".*?\}", text, flags=re.S))
165
- if matches:
166
- block = text[matches[-1].start():matches[-1].end()]
167
- obj = json.loads(block)
168
- tc = obj.get("tool_calls", [])
169
- if isinstance(tc, list):
170
- tool_calls = tc
171
- except Exception:
172
- pass
173
-
174
- tool_results = maybe_execute_tool_calls(tool_calls) if tool_calls else []
175
-
176
- return {"text": text, "tool_calls": tool_calls, "tool_results": tool_results}
177
-
178
-
179
- # =================== Gradio Endpoints ===================
180
-
181
- def predict_for_engine(messages_json: str, tools_json: str) -> Dict[str, Any]:
182
- """
183
- Endpoint esperado por ENGINE (ToolsClient.chat):
184
- - messages_json: JSON de [{"role":"user|assistant|system","content":"..."}]
185
- - tools_json: JSON OpenAI-like de herramientas (opcional)
186
- Devuelve: {"text": "...", "tool_calls": [...], "tool_results": [...]}
187
- """
188
- try:
189
- messages = json.loads(messages_json) if messages_json else []
190
- except Exception:
191
- messages = []
192
- try:
193
- tools = json.loads(tools_json) if tools_json else []
194
- except Exception:
195
- tools = []
196
- return _generate_with_tools(messages, tools, max_new_tokens=512, temperature=0.7, top_p=0.95)
197
-
198
- def chat_advanced(messages_json: str, tools_json: str, max_new_tokens: int, temperature: float, top_p: float) -> Dict[str, Any]:
199
- try:
200
- messages = json.loads(messages_json) if messages_json else []
201
- except Exception:
202
- messages = []
203
- try:
204
- tools = json.loads(tools_json) if tools_json else []
205
- except Exception:
206
- tools = []
207
- return _generate_with_tools(messages, tools, max_new_tokens=int(max_new_tokens), temperature=float(temperature), top_p=float(top_p))
208
-
209
-
210
- # =================== UI ===================
211
-
212
- with gr.Blocks(title="Salamandra 7B Tools · ZeroGPU") as demo:
213
- gr.Markdown("## Salamandra-7B-Tools · ZeroGPU\nChat con especificación de herramientas (function-calling).")
214
-
215
- with gr.Row():
216
- with gr.Column():
217
- messages = gr.Textbox(label="messages_json", value='[{"role":"user","content":"¿Cuánto es (2+2)^3?"}]', lines=6)
218
- tools = gr.Textbox(label="tools_json (opcional)", value='[{"type":"function","function":{"name":"calculator","description":"Evalúa expresiones aritméticas básicas.","parameters":{"type":"object","properties":{"expr":{"type":"string"}},"required":["expr"]}}}]', lines=6)
219
- max_new = gr.Slider(16, 2048, value=512, step=16, label="max_new_tokens")
220
- temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
221
- topp = gr.Slider(0.1, 1.0, value=0.95, step=0.01, label="top_p")
222
- btn = gr.Button("Generar", variant="primary")
223
- with gr.Column():
224
- out = gr.JSON(label="Salida")
225
-
226
- btn.click(chat_advanced, [messages, tools, max_new, temp, topp], out, api_name="chat", concurrency_limit=1)
227
-
228
- # Endpoint minimalista /predict para ENGINE (mensajes + tools)
229
- gr.Button("Probar /predict").click(predict_for_engine, [messages, tools], out, api_name="predict", concurrency_limit=1)
230
-
231
- demo.queue(max_size=16).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py — veureu/stools (Salamandra 7B Tools · ZeroGPU) — compatible con ENGINE
2
+ from __future__ import annotations
3
+ import os, json, re
4
+ from typing import List, Dict, Any, Optional, Tuple
5
+
6
+ import gradio as gr
7
+ import spaces
8
+ import torch
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM
10
+ from moe_tools import SalamandraClient
11
+
12
# ================= Config =================
# Model id is overridable through the MODEL_ID environment variable.
MODEL_ID = os.environ.get("MODEL_ID", "BSC-LT/salamandra-7b-tools")
# Use half precision only when a CUDA device is present.
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Lazily-populated singletons; filled on first use by _lazy_load().
_tok = None
_model = None
19
+
20
def _lazy_load() -> Tuple[AutoTokenizer, AutoModelForCausalLM]:
    """Return the (tokenizer, model) pair, loading them once and caching in module globals."""
    global _tok, _model
    if _tok is not None and _model is not None:
        return _tok, _model

    _tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, trust_remote_code=True)
    loaded = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=DTYPE,
        low_cpu_mem_usage=True,
        use_safetensors=True,
        trust_remote_code=True,
        device_map=None,
    )
    _model = loaded.to(DEVICE)
    return _tok, _model
33
+
34
+
35
+ # =============== Helpers ===============
36
+
37
+ def _render_tools_md(tools: List[Dict[str, Any]]) -> str:
38
+ """Convierte la especificación OpenAI-style de tools a un bloque breve markdown para el prompt."""
39
+ if not tools:
40
+ return ""
41
+ lines = ["Herramientas disponibles (formato JSON):"]
42
+ for t in tools:
43
+ name = t.get("function", {}).get("name") or t.get("name") or "tool"
44
+ desc = t.get("function", {}).get("description") or t.get("description") or ""
45
+ params = t.get("function", {}).get("parameters") or t.get("parameters") or {}
46
+ lines.append(f"- **{name}**: {desc} | parámetros: {json.dumps(params)[:600]}")
47
+ return "\n".join(lines)
48
+
49
def _compose_chat_prompt(messages: List[Dict[str, str]], tools_md: str) -> str:
    """Build the final prompt string from OpenAI-style messages.

    Supports [{"role": "system|user|assistant", "content": "..."}]. All
    system messages are merged into one, the tool description (if any) is
    injected into the system text, and the conversation is rendered via the
    tokenizer's chat template when available, otherwise via a simple tagged
    fallback layout.
    """
    tok, _ = _lazy_load()

    system_parts: List[str] = []
    chat_msgs: List[Dict[str, str]] = []
    for msg in messages:
        text = (msg.get("content") or "").strip()
        if msg.get("role", "") == "system":
            system_parts.append(text)
        else:
            chat_msgs.append({"role": msg.get("role", ""), "content": text})

    # Fold system parts together, separating non-empty accumulations with "\n".
    sys_text = ""
    for part in system_parts:
        sys_text = f"{sys_text}\n{part}" if sys_text else part

    # Inject the tool description into the system text.
    if tools_md:
        instruction = (
            "\n\nSi decides llamar a una herramienta, devuelve un objeto JSON con la clave 'tool_calls' "
            "y describe tus razonamientos de forma concisa en 'thought' (opcional)."
        )
        prefix = sys_text + "\n\n" if sys_text else ""
        sys_text = prefix + tools_md + instruction

    # Rebuild the conversation with the system message in front.
    conv: List[Dict[str, str]] = []
    if sys_text:
        conv.append({"role": "system", "content": sys_text})
    conv.extend(chat_msgs)

    if getattr(tok, "chat_template", None):
        return tok.apply_chat_template(conv, tokenize=False, add_generation_prompt=True)

    # Fallback when the tokenizer has no chat template.
    pieces: List[str] = []
    if sys_text:
        pieces.append(f"<<SYS>>\n{sys_text}\n<</SYS>>\n\n")
    for msg in chat_msgs:
        if msg["role"] == "user":
            pieces.append(f"### Usuario\n{msg['content']}\n\n")
        elif msg["role"] == "assistant":
            pieces.append(f"### Asistente\n{msg['content']}\n\n")
    pieces.append("### Asistente\n")
    return "".join(pieces)
92
+
93
+
94
# =============== (Optional) Mini local executor for safe tools ===============
# If the LLM returns {"tool_calls":[{"name":"calculator","arguments":{"expr":"2+2"}}]}
# we can execute a few harmless example tools.
# Note: keep this very simple/safe. Disable by setting EXECUTE_TOOLS=False.
EXECUTE_TOOLS = True
99
+
100
+ def _safe_calculator(expr: str) -> str:
101
+ # Permite solo dígitos, espacios, (), y +-*/.%**
102
+ if not re.fullmatch(r"[0-9\.\s\+\-\*\/\%\(\)\^eE]+", expr.replace("**","^")):
103
+ return "Rejected expression."
104
+ # soporta ^ como potencia -> **
105
+ expr = expr.replace("^", "**")
106
+ try:
107
+ return str(eval(expr, {"__builtins__":{}}, {}))
108
+ except Exception as e:
109
+ return f"Error: {e}"
110
+
111
# Registry of locally executable tools: name -> callable(arguments_dict) -> str.
LOCAL_TOOLBOX = {
    "calculator": lambda args: _safe_calculator(str(args.get("expr",""))),
}
114
+
115
def maybe_execute_tool_calls(tool_calls: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Run each requested tool call through LOCAL_TOOLBOX.

    Produces one result dict per call: {"name", "output"} on success,
    {"name", "error"} for unknown tools or raised exceptions. Returns []
    when EXECUTE_TOOLS is disabled.
    """
    if not EXECUTE_TOOLS:
        return []

    outcomes: List[Dict[str, Any]] = []
    for request in tool_calls:
        tool_name = request.get("name")
        handler = LOCAL_TOOLBOX.get(tool_name)
        if handler is None:
            outcomes.append({"name": tool_name, "error": "tool_not_available"})
            continue
        try:
            outcomes.append({"name": tool_name, "output": handler(request.get("arguments", {}))})
        except Exception as exc:
            outcomes.append({"name": tool_name, "error": str(exc)})
    return outcomes
132
+
133
+
134
+ # =============== Core generation ===============
135
+
136
def _extract_tool_calls(text: str) -> List[Dict[str, Any]]:
    """Extract the last JSON object containing a "tool_calls" list from *text*.

    Uses balanced-brace scanning: the previous non-greedy regex stopped at the
    first closing brace, which truncated any nested JSON (e.g. object-valued
    "arguments"), so such tool calls never parsed. Candidates that fail
    json.loads are skipped; returns [] when nothing usable is found.
    """
    key_pos = text.rfind('"tool_calls"')
    if key_pos == -1:
        return []
    # Try each opening brace before the key, nearest first.
    start = text.rfind("{", 0, key_pos)
    while start != -1:
        depth = 0
        for idx in range(start, len(text)):
            ch = text[idx]
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    # Only accept spans that actually enclose the key.
                    if idx > key_pos:
                        try:
                            obj = json.loads(text[start:idx + 1])
                            calls = obj.get("tool_calls")
                            if isinstance(calls, list):
                                return calls
                        except Exception:
                            pass
                    break
        start = text.rfind("{", 0, start)
    return []


@spaces.GPU  # use the GPU when available (ZeroGPU)
def _generate_with_tools(
    messages: List[Dict[str, str]],
    tools: List[Dict[str, Any]],
    max_new_tokens: int = 512,
    temperature: float = 0.7,
    top_p: float = 0.95,
) -> Dict[str, Any]:
    """Generate a completion and, if the model emitted tool calls, parse and run them.

    Args:
        messages: OpenAI-style chat messages.
        tools: OpenAI-like tool specification (may be empty).
        max_new_tokens / temperature / top_p: standard sampling knobs.

    Returns:
        {"text": decoded output, "tool_calls": [...], "tool_results": [...]}.
    """
    tok, model = _lazy_load()
    prompt = _compose_chat_prompt(messages, _render_tools_md(tools))

    inputs = tok(prompt, return_tensors="pt").to(DEVICE)
    with torch.inference_mode():
        out = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            do_sample=temperature > 0,  # greedy decoding when temperature == 0
            pad_token_id=tok.eos_token_id,
            eos_token_id=tok.eos_token_id,
        )
    text = tok.decode(out[0], skip_special_tokens=True).strip()

    # If the model returned a JSON block with 'tool_calls', extract and execute it.
    tool_calls = _extract_tool_calls(text)
    tool_results = maybe_execute_tool_calls(tool_calls) if tool_calls else []

    return {"text": text, "tool_calls": tool_calls, "tool_results": tool_results}
178
+
179
+
180
+ # =================== Gradio Endpoints ===================
181
+
182
def predict_for_engine(messages_json: str, tools_json: str) -> Dict[str, Any]:
    """
    Endpoint expected by ENGINE (ToolsClient.chat):
      - messages_json: JSON of [{"role":"user|assistant|system","content":"..."}]
      - tools_json: OpenAI-like JSON tool spec (optional)
    Returns: {"text": "...", "tool_calls": [...], "tool_results": [...]}
    """
    def _parse(raw: str):
        # Empty/invalid payloads degrade to an empty list rather than erroring.
        if not raw:
            return []
        try:
            return json.loads(raw)
        except Exception:
            return []

    return _generate_with_tools(
        _parse(messages_json),
        _parse(tools_json),
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.95,
    )
198
+
199
def chat_advanced(messages_json: str, tools_json: str, max_new_tokens: int, temperature: float, top_p: float) -> Dict[str, Any]:
    """Full-parameter chat endpoint backing the Gradio UI; tolerates bad JSON inputs."""
    try:
        parsed_messages = json.loads(messages_json) if messages_json else []
    except Exception:
        parsed_messages = []
    try:
        parsed_tools = json.loads(tools_json) if tools_json else []
    except Exception:
        parsed_tools = []
    return _generate_with_tools(
        parsed_messages,
        parsed_tools,
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
    )
209
+
210
+
211
# Lazily-created SalamandraClient singleton (from moe_tools).
_salamandra = None

def salamandra_chat_endpoint(prompt: str) -> Dict[str, Any]:
    """Send *prompt* through a shared SalamandraClient and wrap the reply as {"text": ...}.

    Client construction is deferred to the first call; any failure during the
    chat is reported inside the payload instead of raising.
    """
    global _salamandra
    if _salamandra is None:
        _salamandra = SalamandraClient()  # uses your class

    try:
        reply = _salamandra.chat(prompt)
    except Exception as exc:
        reply = f"Error ejecutando SalamandraClient: {str(exc)}"

    return {"text": reply}
224
+
225
# =================== UI ===================

with gr.Blocks(title="Salamandra 7B Tools · ZeroGPU") as demo:
    gr.Markdown("## Salamandra-7B-Tools · ZeroGPU\nChat con especificación de herramientas (function-calling).")

    with gr.Row():
        with gr.Column():
            # JSON inputs mirror the ENGINE wire format (see predict_for_engine).
            messages = gr.Textbox(label="messages_json", value='[{"role":"user","content":"¿Cuánto es (2+2)^3?"}]', lines=6)
            tools = gr.Textbox(label="tools_json (opcional)", value='[{"type":"function","function":{"name":"calculator","description":"Evalúa expresiones aritméticas básicas.","parameters":{"type":"object","properties":{"expr":{"type":"string"}},"required":["expr"]}}}]', lines=6)
            # Sampling controls forwarded to chat_advanced.
            max_new = gr.Slider(16, 2048, value=512, step=16, label="max_new_tokens")
            temp = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="temperature")
            topp = gr.Slider(0.1, 1.0, value=0.95, step=0.01, label="top_p")
            btn = gr.Button("Generar", variant="primary")
        with gr.Column():
            out = gr.JSON(label="Salida")

    # /chat API: full-parameter generation.
    btn.click(chat_advanced, [messages, tools, max_new, temp, topp], out, api_name="chat", concurrency_limit=1)

    # Minimal /predict endpoint for ENGINE (messages + tools, fixed sampling).
    gr.Button("Probar /predict").click(predict_for_engine, [messages, tools], out, api_name="predict", concurrency_limit=1)

    # Raw-prompt path through SalamandraClient (moe_tools).
    with gr.Row():
        prompt = gr.Textbox(label="prompt", lines=10)
    with gr.Row():
        btn2 = gr.Button("Generar", variant="primary")
    with gr.Row():
        out2 = gr.JSON(label="Salida")

    btn2.click(salamandra_chat_endpoint, [prompt], out2, api_name="generate_out_from_prompt", concurrency_limit=1)

demo.queue(max_size=16).launch()