File size: 26,071 Bytes

0e41b61

#!/usr/bin/env python3
"""
mac-tensor ui — Web chat UI for the distributed agent.

Serves a single-page HTML chat interface and a Server-Sent Events
endpoint that streams agent events (steps, tool calls, results, final answer).

Usage:
    mac-tensor ui --model gemma4 --nodes http://mac2:8401,http://mac3:8401
    # Then open http://localhost:8500 in your browser
"""

import json
import os
import sys
import time
import threading
from queue import Queue, Empty


def run_server(model_key, node_urls=None, host="0.0.0.0", port=8500, allow_write=False,
               vision=False, stream_dir=None, source_dir=None,
               falcon=False, falcon_model=None,
               swarm_leader=False,
               turbo_url=None):
    """Start the FastAPI server with the agent backend pre-loaded.

    Modes:
      - Distributed text-only: pass node_urls
      - Single-machine vision (Gemma 4 only): vision=True
      - Vision + Falcon Perception (segmentation): vision=True, falcon=True
      - Swarm leader: swarm_leader=True (peer registry + dynamic coordinator)
    """
    from fastapi import FastAPI, Request, UploadFile, File, Form
    from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse
    from .agent import AgentBackend, run_agent_turn_stream

    vision_engine = None
    falcon_tools = None
    swarm_registry = None

    if swarm_leader:
        from .swarm import SwarmRegistry, reaper_loop
        swarm_registry = SwarmRegistry(model_key=model_key)
        threading.Thread(target=reaper_loop, args=(swarm_registry,), daemon=True).start()
        print(f"Swarm registry started (model={model_key})")

    if swarm_leader:
        # Leader mode: no LLM backend loaded yet — peers register dynamically.
        # The leader can ALSO load a backend on demand (later) when peers exist.
        backend = None
        print("Running as swarm leader. Workers join via mac-tensor join.")
    elif vision:
        print(f"Loading vision Gemma 4 sniper (single-machine)...")
        from .vision_engine import VisionGemma4Sniper
        vision_engine = VisionGemma4Sniper(
            stream_dir=stream_dir or "~/models/gemma4-stream",
            source_dir=source_dir or "~/models/gemma4-26b-4bit",
        )
        vision_engine.load()
        print("Vision engine ready.")

        if falcon:
            print(f"Loading Falcon Perception...")
            from .falcon_perception import FalconPerceptionTools
            falcon_tools = FalconPerceptionTools.load(
                model_path=falcon_model or "/Users/bigneek/models/falcon-perception"
            )
            print("Falcon Perception ready.")

        backend = None  # Not used in vision mode
    elif falcon:
        # Falcon-only mode: no Gemma, no distributed nodes. Used by the
        # data labeling factory where we only need /api/falcon.
        print("Loading Falcon Perception (standalone, no Gemma)...")
        from .falcon_perception import FalconPerceptionTools
        falcon_tools = FalconPerceptionTools.load(
            model_path=falcon_model or "/Users/bigneek/models/falcon-perception"
        )
        print("Falcon Perception ready. (~1.5 GB resident, no Gemma loaded)")
        backend = None
    else:
        print(f"Loading {model_key} distributed engine...")
        backend = AgentBackend(model_key=model_key, node_urls=node_urls)
        backend.load()
        print(f"Backend ready. Connected to {len(node_urls)} expert nodes.")

    app = FastAPI(title="mac-tensor agent UI")

    # Read the static HTML file shipped alongside this server
    static_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "static")
    html_path = os.path.join(static_dir, "chat.html")
    with open(html_path) as f:
        chat_html = f.read()

    # Inject backend info into the HTML so the UI can show it
    if vision:
        model_label = "Gemma 4-26B-A4B (Vision)"
        node_count_label = "single Mac · vision enabled"
    elif falcon_tools is not None and vision_engine is None:
        model_label = "Falcon Perception (labeling factory)"
        node_count_label = "single Mac · Falcon-only"
    elif swarm_leader:
        model_label = {"gemma4": "Gemma 4-26B-A4B",
                       "qwen35": "Qwen 3.5-35B-A3B"}.get(model_key, model_key)
        node_count_label = "swarm leader · waiting for peers"
    else:
        model_label = {"gemma4": "Gemma 4-26B-A4B",
                       "qwen35": "Qwen 3.5-35B-A3B"}.get(model_key, model_key)
        node_count_label = f"{len(node_urls)} expert nodes"

    chat_html = chat_html.replace("{{MODEL_NAME}}", model_label) \
                          .replace("{{NODE_COUNT}}", node_count_label) \
                          .replace("{{VISION_ENABLED}}", "true" if vision else "false") \
                          .replace("{{FALCON_ENABLED}}", "true" if falcon_tools is not None else "false")

    # Lock so only one chat request runs at a time (single MoE engine)
    lock = threading.Lock()

    @app.get("/")
    async def index():
        return HTMLResponse(chat_html)

    @app.get("/api/info")
    async def info():
        return {
            "model": model_key,
            "nodes": node_urls,
            "allow_write": allow_write,
            "vision": vision,
            "falcon": falcon_tools is not None,
            "swarm_leader": swarm_registry is not None,
        }

    # ============================================================
    # Swarm endpoints (only when running as leader)
    # ============================================================

    if swarm_registry is not None:

        @app.post("/swarm/register")
        async def swarm_register(request: Request):
            body = await request.json()
            url = body.get("url")
            mem_gb = body.get("mem_gb", 0)
            meta = body.get("meta", {})
            if not url:
                return JSONResponse({"error": "url required"}, status_code=400)
            peer_id, partition = swarm_registry.register(url, mem_gb, meta)
            print(f"[swarm] +peer {peer_id} at {url} → partition {partition}")
            return {
                "peer_id": peer_id,
                "partition": partition,
                "model": model_key,
                "partition_version": swarm_registry.partition_version,
            }

        @app.post("/swarm/heartbeat")
        async def swarm_heartbeat(request: Request):
            body = await request.json()
            peer_id = body.get("peer_id")
            if not peer_id:
                return JSONResponse({"error": "peer_id required"}, status_code=400)
            ok, version = swarm_registry.heartbeat(peer_id)
            if not ok:
                return JSONResponse({"error": "unknown peer"}, status_code=404)

            # Tell the peer if its partition has been reassigned
            current_partition = None
            with swarm_registry.lock:
                if peer_id in swarm_registry.peers:
                    current_partition = swarm_registry.peers[peer_id]["partition"]

            return {
                "ok": True,
                "partition_version": version,
                "partition": current_partition,
            }

        @app.post("/swarm/leave")
        async def swarm_leave(request: Request):
            body = await request.json()
            peer_id = body.get("peer_id")
            if not peer_id:
                return JSONResponse({"error": "peer_id required"}, status_code=400)
            swarm_registry.leave(peer_id)
            print(f"[swarm] -peer {peer_id} (graceful leave)")
            return {"ok": True}

        @app.get("/swarm/peers")
        async def swarm_peers():
            return swarm_registry.status()

    @app.post("/api/reset")
    async def reset():
        with lock:
            if vision_engine:
                vision_engine.sniper.reset_cache()
            elif backend:
                backend.reset()
        return {"ok": True}

    # When running as leader, lazily build a backend from the swarm registry.
    # We cache it and rebuild when the partition_version changes.
    leader_backend_state = {"backend": None, "version": -1}

    def get_swarm_backend():
        """Lazy backend that uses live peers from the swarm registry.

        Rebuilds when partition_version changes. Returns None if no live peers.
        """
        if swarm_registry is None:
            return backend  # static mode

        live_peers = [p for p in swarm_registry.get_live_peers() if p.get("alive")]
        if not live_peers:
            return None

        # Check if registry version changed → rebuild backend
        current_version = swarm_registry.partition_version
        if leader_backend_state["version"] != current_version or leader_backend_state["backend"] is None:
            print(f"[swarm] building backend for {len(live_peers)} peers (v{current_version})")
            from .agent import AgentBackend
            node_urls = [p["url"] for p in live_peers]
            try:
                bk = AgentBackend(model_key=model_key, node_urls=node_urls)
                bk.load()
                leader_backend_state["backend"] = bk
                leader_backend_state["version"] = current_version
            except Exception as e:
                print(f"[swarm] backend load failed: {e}")
                return None

        return leader_backend_state["backend"]

    @app.post("/api/chat")
    async def chat(request: Request):
        # Use the static backend if not in swarm mode, else build from registry
        active = get_swarm_backend() if swarm_registry is not None else backend

        if active is None:
            return JSONResponse({
                "error": "no live peers in swarm yet — run `mac-tensor join "
                         f"http://{_local_ip()}:{port}` on a worker Mac first"
            }, status_code=503)

        body = await request.json()
        message = body.get("message", "").strip()
        if not message:
            return JSONResponse({"error": "empty message"}, status_code=400)

        max_iterations = int(body.get("max_iterations", 5))
        max_tokens = int(body.get("max_tokens", 300))

        def event_stream():
            with lock:
                try:
                    for event in run_agent_turn_stream(
                        active, message,
                        max_iterations=max_iterations,
                        max_tokens=max_tokens,
                        allow_write=allow_write,
                    ):
                        yield f"data: {json.dumps(event)}\n\n"
                except Exception as e:
                    yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"

        return StreamingResponse(
            event_stream(),
            media_type="text/event-stream",
            headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no",
                     "Connection": "keep-alive"},
        )

    # Build a vision agent backend if Falcon is loaded
    vision_agent = None
    if vision_engine is not None and falcon_tools is not None:
        from .agent import VisionAgentBackend
        vision_agent = VisionAgentBackend(
            vision_engine=vision_engine, falcon_tools=falcon_tools
        )
        print("Vision agent ready (Gemma 4 + Falcon Perception chained).")

    @app.post("/api/chat_vision")
    async def chat_vision(
        message: str = Form(...),
        max_tokens: int = Form(300),
        image: UploadFile = File(None),
    ):
        """Vision chat endpoint — accepts an optional image upload.

        If Falcon Perception is loaded, uses the vision agent loop with
        tool calling. Otherwise falls back to plain Gemma 4 vision.
        """
        if vision_engine is None:
            return JSONResponse({"error": "vision mode not enabled"}, status_code=400)

        image_path = None
        if image is not None and image.filename:
            import tempfile
            tmp = tempfile.NamedTemporaryFile(suffix="_" + image.filename, delete=False)
            tmp.write(await image.read())
            tmp.close()
            image_path = tmp.name

        def event_stream():
            with lock:
                try:
                    if vision_agent is not None and image_path:
                        # Chained mode: Gemma 4 + Falcon tool calls
                        from .agent import run_vision_agent_turn_stream
                        for event in run_vision_agent_turn_stream(
                            vision_agent, message, image_path,
                            max_iterations=4,
                            max_tokens=max_tokens,
                        ):
                            yield f"data: {json.dumps(event)}\n\n"
                    else:
                        # Simple mode: just Gemma 4 vision (no tools)
                        yield f"data: {json.dumps({'type': 'step_start', 'step': 1, 'max': 1})}\n\n"
                        chunks = []
                        def on_chunk(text):
                            chunks.append(text)
                        output = vision_engine.generate(
                            message, image_path=image_path,
                            max_tokens=max_tokens, temperature=0.6,
                            on_chunk=on_chunk,
                        )
                        for chunk in chunks:
                            yield f"data: {json.dumps({'type': 'token', 'text': chunk})}\n\n"
                        yield f"data: {json.dumps({'type': 'final', 'text': output.strip()})}\n\n"
                        yield f"data: {json.dumps({'type': 'done'})}\n\n"
                except Exception as e:
                    import traceback
                    traceback.print_exc()
                    yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"
                finally:
                    if image_path and os.path.exists(image_path):
                        try:
                            os.unlink(image_path)
                        except Exception:
                            pass

        return StreamingResponse(
            event_stream(),
            media_type="text/event-stream",
            headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no",
                     "Connection": "keep-alive"},
        )

    @app.post("/api/turbo_chat")
    async def turbo_chat(
        message: str = Form(...),
        max_tokens: int = Form(300),
        image: UploadFile = File(None),
    ):
        """Turbo mode: Gemma 4 encodes image once, then a fast small LLM
        (Qwen3-1.7B 4-bit via mlx-lm) handles the reasoning loop.

        Streams SSE events compatible with the existing chat UI.
        """
        if vision_engine is None:
            return JSONResponse({"error": "vision mode not enabled"}, status_code=400)
        if not turbo_url:
            return JSONResponse(
                {"error": "turbo brain not configured (start server with --turbo-url)"},
                status_code=400,
            )

        image_path = None
        if image is not None and image.filename:
            import tempfile
            tmp = tempfile.NamedTemporaryFile(suffix="_" + image.filename, delete=False)
            tmp.write(await image.read())
            tmp.close()
            image_path = tmp.name

        def event_stream():
            with lock:
                try:
                    # Step 1: Gemma 4 vision encodes the image ONCE
                    yield f"data: {json.dumps({'type': 'step_start', 'step': 1, 'max': 2})}\n\n"
                    yield f"data: {json.dumps({'type': 'token', 'text': '🔍 Encoding image with Gemma 4 vision...\\n'})}\n\n"

                    description = ""
                    if image_path:
                        try:
                            description = vision_engine.generate(
                                "Describe this image briefly: what's in it, where is it, what colors and notable details. Be concise (3-4 sentences max).",
                                image_path=image_path,
                                max_tokens=150,
                                temperature=0.5,
                            ).strip()
                            yield f"data: {json.dumps({'type': 'tool_call', 'tool': 'vision_describe', 'args': 'image'})}\n\n"
                            yield f"data: {json.dumps({'type': 'tool_result', 'result': description})}\n\n"
                        except Exception as e:
                            yield f"data: {json.dumps({'type': 'error', 'message': f'vision encode failed: {e}'})}\n\n"
                            return

                    # Step 2: fast turbo brain reasons over the description
                    yield f"data: {json.dumps({'type': 'step_start', 'step': 2, 'max': 2})}\n\n"

                    # Build the prompt for the turbo brain
                    if description:
                        system_msg = (
                            "You are a fast assistant that answers questions about images. "
                            "A vision model has already described the image for you. "
                            "Give a SHORT, direct answer (1-3 sentences). Do not show your reasoning."
                        )
                        user_msg = (
                            f"Image description: {description}\n\n"
                            f"User's question: {message}"
                        )
                    else:
                        system_msg = "You are a fast helpful assistant. Give SHORT, direct answers."
                        user_msg = message

                    # Stream from the turbo brain (mlx-lm OpenAI-compatible endpoint)
                    import urllib.request
                    payload = {
                        "model": "qwen3-1.7b",
                        "messages": [
                            {"role": "system", "content": system_msg},
                            {"role": "user", "content": user_msg},
                        ],
                        "max_tokens": max_tokens,
                        "temperature": 0.5,
                        "stream": True,
                    }
                    req = urllib.request.Request(
                        f"{turbo_url}/v1/chat/completions",
                        data=json.dumps(payload).encode(),
                        headers={"Content-Type": "application/json"},
                    )
                    final_text = ""
                    in_think = False
                    try:
                        with urllib.request.urlopen(req, timeout=120) as resp:
                            buffer = ""
                            for chunk in iter(lambda: resp.read(512), b""):
                                buffer += chunk.decode("utf-8", errors="ignore")
                                while "\n" in buffer:
                                    line, buffer = buffer.split("\n", 1)
                                    line = line.strip()
                                    if not line.startswith("data: "):
                                        continue
                                    data = line[6:]
                                    if data == "[DONE]":
                                        break
                                    try:
                                        ev = json.loads(data)
                                        delta = ev.get("choices", [{}])[0].get("delta", {})
                                        text = delta.get("content", "")
                                        if not text:
                                            continue
                                        # Strip out <think>...</think> reasoning blocks
                                        for ch in text:
                                            if "<think>" in (final_text + ch)[-7:]:
                                                in_think = True
                                                continue
                                            if "</think>" in (final_text + ch)[-8:]:
                                                in_think = False
                                                final_text = ""
                                                continue
                                            if not in_think:
                                                final_text += ch
                                                yield f"data: {json.dumps({'type': 'token', 'text': ch})}\n\n"
                                    except Exception:
                                        continue
                    except Exception as e:
                        yield f"data: {json.dumps({'type': 'error', 'message': f'turbo brain failed: {e}'})}\n\n"
                        return

                    yield f"data: {json.dumps({'type': 'final', 'text': final_text.strip()})}\n\n"
                    yield f"data: {json.dumps({'type': 'done'})}\n\n"
                except Exception as e:
                    import traceback
                    traceback.print_exc()
                    yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"
                finally:
                    if image_path and os.path.exists(image_path):
                        try:
                            os.unlink(image_path)
                        except Exception:
                            pass

        return StreamingResponse(
            event_stream(),
            media_type="text/event-stream",
            headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no",
                     "Connection": "keep-alive"},
        )

    @app.post("/api/falcon")
    async def falcon_ground(
        query: str = Form(...),
        image: UploadFile = File(...),
    ):
        """Run Falcon Perception on uploaded image with text query.

        Returns JSON with detected masks (count, IDs, metadata) plus a
        base64-encoded annotated image showing bounding boxes + labels.
        """
        if falcon_tools is None:
            return JSONResponse({"error": "Falcon Perception not loaded"}, status_code=400)

        # Save uploaded image
        import tempfile, base64, io
        tmp = tempfile.NamedTemporaryFile(suffix="_" + image.filename, delete=False)
        tmp.write(await image.read())
        tmp.close()

        try:
            with lock:
                # Set image in Falcon session
                falcon_tools.set_image(tmp.name)
                # Run grounding
                t0 = time.time()
                result = falcon_tools.ground(query, slot=query.replace(" ", "_")[:32])
                elapsed = time.time() - t0

                if "error" in result:
                    return JSONResponse(result, status_code=500)

                # Annotate the image with bounding boxes
                annotated = falcon_tools.annotate_image(mask_ids=result["mask_ids"])

                # Encode annotated image as base64 PNG
                buf = io.BytesIO()
                annotated.save(buf, format="PNG")
                annotated_b64 = base64.b64encode(buf.getvalue()).decode()

                return JSONResponse({
                    "query": query,
                    "count": result["count"],
                    "mask_ids": result["mask_ids"],
                    "masks": result["masks"],
                    "annotated_image": f"data:image/png;base64,{annotated_b64}",
                    "elapsed_seconds": round(elapsed, 2),
                })
        except Exception as e:
            import traceback
            traceback.print_exc()
            return JSONResponse({"error": str(e)}, status_code=500)
        finally:
            try:
                os.unlink(tmp.name)
            except Exception:
                pass

    print()
    print("=" * 60)
    print(f"  mac-tensor UI ready")
    print(f"  Open: http://localhost:{port}")
    print(f"        http://{_local_ip()}:{port}  (LAN access)")
    print("=" * 60)
    print()

    import uvicorn
    uvicorn.run(app, host=host, port=port, log_level="warning")


def _local_ip():
    """Best-effort detection of the LAN IP."""
    import socket
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(("8.8.8.8", 80))
        ip = s.getsockname()[0]
        s.close()
        return ip
    except Exception:
        return "localhost"


def main(args):
    vision = getattr(args, "vision", False)
    falcon_only = getattr(args, "falcon_only", False)

    if falcon_only:
        # Falcon-only mode: load Falcon Perception, skip Gemma entirely.
        # ~1.5 GB resident. Used by the data labeling factory.
        run_server(
            model_key="falcon",
            node_urls=None,
            host=args.host or "0.0.0.0",
            port=args.port or 8500,
            allow_write=False,
            vision=False,
            falcon=True,
            falcon_model=getattr(args, "falcon_model", None),
        )
        return

    if vision:
        # Vision mode: single-machine, no distributed nodes needed
        run_server(
            model_key="gemma4",
            node_urls=None,
            host=args.host or "0.0.0.0",
            port=args.port or 8500,
            allow_write=getattr(args, "write", False),
            vision=True,
            stream_dir=getattr(args, "stream_dir", None),
            source_dir=getattr(args, "source_dir", None),
            falcon=getattr(args, "falcon", False),
            falcon_model=getattr(args, "falcon_model", None),
        )
    else:
        if not args.nodes:
            print("Error: --nodes is required (or pass --vision for single-machine mode)")
            sys.exit(1)
        node_urls = [u.strip() for u in args.nodes.split(",")]
        run_server(
            model_key=args.model or "gemma4",
            node_urls=node_urls,
            host=args.host or "0.0.0.0",
            port=args.port or 8500,
            allow_write=args.write,
        )