"""Tagline Foundry — HF Space (free CPU; generation on Modal serverless).

Paste a URL (or your landing copy) → extract the page (editable) → review your current tagline
against the qualities of a good tagline (✅/❌) → a fine-tuned Qwen3.5 writes sharper
alternatives, each scored by a local SetFit classifier.

ZeroGPU couldn't reliably host the 9 GB model (recurring worker_init 'No CUDA GPUs
available' across every loading approach), so generation + critique run on a Modal
serverless GPU endpoint that scales to zero (no idle cost, no daily cap). This Space
is plain CPU: it extracts the page, calls the endpoint, and does the SetFit scoring.
First call after idle is a ~60-90s cold start while Modal spins a GPU + loads weights.

Extraction prefers the real hero: a landing page's tagline lives in <h1> / og:title
and its pitch in og:description — trafilatura's article heuristic instead grabs the
densest prose block (often a team bio), so we read og + headings directly.
"""

from __future__ import annotations

import json
import os
import re
import shutil
import stat
import subprocess
import tempfile
import urllib.request
from pathlib import Path
from urllib.parse import urlparse

import gradio as gr
import numpy as np
import requests
import trafilatura
from sentence_transformers import SentenceTransformer

# lightpanda: a JS-rendering headless browser. We render every URL through it so
# client-side-rendered pages (React/Framer/etc.) resolve to real DOM before we parse
# the hero. Locally we use the copy on $PATH; otherwise _lightpanda_bin() downloads
# the linux binary lazily (on the first extraction that needs it) and reuses the
# copy cached in the system temp dir afterwards.
# NOTE(review): LP_RELEASE points at the moving "nightly" tag, so the binary we
# download and execute is not pinned — consider pinning a specific release.
LP_RELEASE = "https://github.com/lightpanda-io/browser/releases/download/nightly/lightpanda-x86_64-linux"
LP_CACHED = Path(tempfile.gettempdir()) / "lightpanda"

# Hub repo id handed to SentenceTransformer for the scorer's embedding body.
SETFIT_REPO = "standd/tagline-quality-setfit"
# Streaming generation endpoint; overridable through the environment.
STREAM_ENDPOINT = os.environ.get("MODAL_STREAM_ENDPOINT", "https://standd--tagline-serve-generator-stream.modal.run")
# Sent as the "token" field of every request body by stream_remote(); defaults to
# an empty string when the variable is unset.
SERVE_TOKEN = os.environ.get("SERVE_TOKEN", "")
# Markdown rendered at the very top of the page (first component in the Blocks layout).
HEADER = """# 🏷️ Tagline Foundry

Rate your homepage tagline — then let a fine-tuned model try to beat it.

Your tagline is the first line a visitor reads: the hook that says *why you matter* — not your
name, and not your category ("an AI agent for X" is a category, not a tagline).

**Paste a URL below.** We score your current line on the five marks of a good tagline —
*outcome over category · differentiated · clear · memorable · resonant* — then a model
fine-tuned from Claude Opus writes sharper ones, each scored 0–100.
"""

# Markdown shown inside the "How it works" accordion: motivation, pipeline steps,
# and links to the published weights and datasets.
ABOUT = """### Why we built it
This tool exists to fix our *own* tagline. We make **[Hey Lefty](https://heylefty.com)** — an
autonomous research agent that briefs you every morning on any topic you follow (papers,
markets, regulators, competitors), building a compounding knowledge base while you do your
actual work. Our homepage hero just said *"Autonomous Research Agents"* — a **category, not a
tagline**. Rather than agonize over copy, we trained a model to write a better one. Tagline
Foundry is that pipeline, opened up for any site.

### How it works
1. **Render** — [lightpanda](https://lightpanda.io), an open-source headless browser, loads the page (with JS, so SPAs resolve).
2. **Extract** — the hero `<h1>` + `og:description` become your current tagline and the page context.
3. **Critique** — the model grades that line ✅/❌ on the five qualities; a SetFit classifier scores it 0–100.
4. **Rewrite** — a fine-tuned **Qwen3.5-4B**, *distilled from Claude Opus 4.7* as the teacher, drafts a dozen alternatives.
5. **Rank** — SetFit scores each; the strongest rise to the top.

The Space itself is free CPU; the GPU work runs on a **Modal** serverless endpoint that scales to zero.

### Open weights & data
- **Generator:** [standd/tagline-qwen3p5-4b](https://huggingface.co/standd/tagline-qwen3p5-4b)
- **Quality scorer:** [standd/tagline-quality-setfit](https://huggingface.co/standd/tagline-quality-setfit)
- **Datasets:** [saas-taglines-rated](https://huggingface.co/datasets/standd/saas-taglines-rated) · [saas-tagline-distilled](https://huggingface.co/datasets/standd/saas-tagline-distilled) (Opus-written pairs)
"""

# --- rating: SetFit body on CPU + numpy LR head ---
# Both pieces are loaded at import time: the sentence-transformer body from
# SETFIT_REPO, and the classifier head from setfit_head.npz next to this file.
_st_body = SentenceTransformer(SETFIT_REPO, device="cpu")
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code — acceptable only while setfit_head.npz is a trusted
# file shipped alongside this module.
_h = np.load(Path(__file__).parent / "setfit_head.npz", allow_pickle=True)
# Head weights, bias, and class labels (labels normalised to plain str).
_COEF, _B, _CLASSES = _h["coef"], _h["intercept"], [str(c) for c in _h["classes"]]
# Column of the probability matrix built in score() that belongs to "strong";
# raises ValueError at import if the head has no such class.
STRONG_COL = _CLASSES.index("strong")


def score(taglines: list[str]) -> list[int]:
    """Rate each tagline 0-100 using the embedding body plus the numpy head.

    Args:
        taglines: Texts to rate. An empty input short-circuits to ``[]``
            without touching the encoder.

    Returns:
        One integer per tagline: the probability of the "strong" class,
        scaled to 0-100 and rounded.
    """
    if not taglines:
        return []
    embeddings = np.asarray(_st_body.encode(list(taglines)))
    # Only the first logit column is used: the head is treated as a binary
    # classifier whose sigmoid gives the probability of class index 1.
    logits = (embeddings @ _COEF.T + _B)[:, 0]
    p_pos = 1.0 / (1.0 + np.exp(-logits))
    class_probs = np.stack([1.0 - p_pos, p_pos], axis=1)
    return [round(100 * float(p)) for p in class_probs[:, STRONG_COL]]


def stream_remote(masked: str, company: str, current: str, n: int = 12):
    """Yield SSE events from the Modal endpoint: {type: review|token|candidate|done}.

    Args:
        masked: Page context with the hero replaced by a placeholder.
        company: Company / product name passed to the generator.
        current: The tagline currently on the page.
        n: Number of alternatives to request.

    Yields:
        One decoded JSON object per non-empty ``data:`` line of the stream.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx status.
        json.JSONDecodeError: If a ``data:`` payload is not valid JSON.
    """
    payload = {"token": SERVE_TOKEN, "masked": masked, "company": company, "current": current, "n": n}
    with requests.post(STREAM_ENDPOINT, json=payload, stream=True, timeout=300) as r:
        r.raise_for_status()
        # Iterate raw bytes and decode ourselves. With decode_unicode=True the
        # text depends on the charset the server declares (requests falls back
        # to ISO-8859-1 for text/* without one, mangling non-ASCII taglines),
        # and str.splitlines() would also break a payload on characters such
        # as U+2028. Event streams are UTF-8 by specification.
        for raw in r.iter_lines():
            if not raw.startswith(b"data:"):
                continue  # keep-alive blank lines, comments, other SSE fields
            # The space after "data:" is optional in SSE; strip() covers both.
            data = raw[5:].decode("utf-8", errors="replace").strip()
            if data:
                yield json.loads(data)


def _meta(html: str, key: str) -> str:
    pat1 = r'<meta[^>]+(?:property|name)=["\']' + re.escape(key) + r'["\'][^>]*content=["\']([^"\']+)'
    pat2 = r'<meta[^>]+content=["\']([^"\']+)["\'][^>]*(?:property|name)=["\']' + re.escape(key) + r'["\']'
    m = re.search(pat1, html, re.I) or re.search(pat2, html, re.I)
    return m.group(1).strip() if m else ""


def _headings(html: str) -> list[str]:
    heads: list[str] = []
    for tag in ("h1", "h2"):
        for raw in re.findall(rf"<{tag}[^>]*>(.*?)</{tag}>", html, re.I | re.S):
            t = re.sub(r"\s+", " ", re.sub(r"<[^>]+>", " ", raw)).strip()
            if t and t not in heads and 2 <= len(t.split()) <= 20:
                heads.append(t)
    return heads


def _lightpanda_bin() -> str | None:
    """Locate the lightpanda executable, downloading it on first use if needed.

    Lookup order: ``$PATH``, then the cached copy at ``LP_CACHED``, then a
    fresh download of ``LP_RELEASE``. The download is written to a unique
    temporary file and moved into place atomically, so an interrupted or
    concurrent download never leaves a partial file at ``LP_CACHED``.

    Returns:
        Path to an executable, or ``None`` when none is available (the
        failure is logged; callers fall back to a static fetch).
    """
    found = shutil.which("lightpanda")
    if found:
        return found
    if LP_CACHED.exists() and os.access(LP_CACHED, os.X_OK):
        return str(LP_CACHED)

    partial = None
    try:
        fd, partial = tempfile.mkstemp(dir=LP_CACHED.parent, prefix="lightpanda.", suffix=".part")
        # The timeout bounds each socket operation so a stalled download
        # cannot hang the request that triggered it indefinitely.
        with os.fdopen(fd, "wb") as out, urllib.request.urlopen(LP_RELEASE, timeout=60) as resp:
            expected = resp.headers.get("Content-Length")
            shutil.copyfileobj(resp, out)
        # A dropped connection can end the body early without raising.
        if expected is not None and os.path.getsize(partial) != int(expected):
            raise OSError(f"truncated download: expected {expected} bytes")
        os.chmod(partial, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)
        os.replace(partial, LP_CACHED)
        partial = None  # moved into place; nothing left to clean up
        return str(LP_CACHED)
    except Exception as e:  # best-effort: rendering is optional
        print(f"[extract] lightpanda download failed: {e!r}", flush=True)
        return None
    finally:
        if partial is not None:
            try:
                os.unlink(partial)
            except OSError:
                pass


def _fetch_html(url: str) -> str:
    """Render the URL with lightpanda (JS executed); fall back to a static fetch."""
    lp = _lightpanda_bin()
    if lp:
        try:
            # --wait-selector h1 is the robust trigger: lifecycle events (load/
            # domcontentloaded) can fire before a SPA paints its hero, or capture
            # nothing on sites whose hydration crashes inside lightpanda.
            r = subprocess.run(
                [lp, "fetch", "--dump", "html", "--wait-selector", "h1", "--strip-mode", "js,css", url],
                capture_output=True, text=True, timeout=45,
            )
            if r.stdout and len(r.stdout) > 200:
                print(f"[extract] lightpanda rendered {url} ({len(r.stdout)} bytes)", flush=True)
                return r.stdout
        except Exception as e:
            print(f"[extract] lightpanda failed for {url}: {e!r} — static fallback", flush=True)
    else:
        print("[extract] lightpanda binary unavailable — static fallback", flush=True)
    return trafilatura.fetch_url(url) or ""


def _page_from_url(url: str) -> tuple[str, str]:
    """Return (landing_copy, company). Hero = first heading; pitch = og:description.

    The copy is the first heading, then the description, then up to seven
    further headings, separated by blank lines. If none of those exist the
    page's main content as extracted by trafilatura is used instead.

    Args:
        url: Page address.

    Returns:
        ``(landing_copy, company)``; ``company`` is capped at 60 characters
        and falls back to ``"the product"`` when nothing usable is found.

    Raises:
        gr.Error: When the page cannot be fetched or contains no readable text.
    """
    dl = _fetch_html(url)
    if not dl:
        raise gr.Error("Couldn't fetch that URL. Paste your landing copy in the box and try again.")
    desc = _meta(dl, "og:description") or _meta(dl, "description") or _meta(dl, "twitter:description")
    heads = _headings(dl)

    # Host-name fallback for the company. Parse with a scheme so "example.com"
    # still yields a netloc, and drop only a *leading* "www." — a plain
    # str.replace would also eat "www." from the middle of a host name.
    address = url.strip()
    host = urlparse(address if "://" in address else f"https://{address}").netloc
    if host.lower().startswith("www."):
        host = host[4:]
    company = _meta(dl, "og:title") or _meta(dl, "og:site_name") or host

    parts = ([heads[0]] if heads else []) + ([desc] if desc else []) + heads[1:8]
    md = "\n\n".join(parts).strip()
    if not md:  # fallback to trafilatura main content for pages with no usable og/headings
        md = (trafilatura.extract(dl, include_formatting=True) or "").strip()
    if not md:
        raise gr.Error("No readable text found on that page. Paste your landing copy in the box instead.")
    return md, (company.strip()[:60] or "the product")


def extract_page(url: str, pasted: str) -> tuple[str, str, str, str]:
    """Split landing copy into the hero tagline and the surrounding context.

    Pasted copy wins over the URL when it contains any non-whitespace text;
    in that case the company is the generic "the product". The first
    non-blank line is taken as the current tagline (leading markdown ``#``
    markers removed, capped at 120 characters); everything after it becomes
    the page context, prefixed with a placeholder heading where the hero was.

    Returns:
        ``(current_tagline, masked_page, display_copy, company)`` with the
        masked page capped at 3500 characters and the display copy at 6000.
    """
    pasted_copy = pasted.strip() if pasted else ""
    if pasted_copy:
        md, company = pasted_copy, "the product"
    else:
        md, company = _page_from_url(url)

    all_lines = md.splitlines()
    hero_at = next((i for i, ln in enumerate(all_lines) if ln.strip()), None)
    if hero_at is None:
        current, rest = "", md
    else:
        current = re.sub(r"^#+\s*", "", all_lines[hero_at]).strip()[:120]
        rest = "\n".join(all_lines[hero_at + 1:]).strip()

    masked = ("# [TAGLINE]\n\n" + rest).strip()[:3500]
    return current, masked, md[:6000], company


def _review_md(current: str, cur_score: int, review: list[dict]) -> str:
    md = f"### Your current tagline\n> **{current or '—'}** &nbsp;·&nbsp; score **{cur_score}/100**\n\n"
    if review:
        md += "**How it scores on the qualities of a good tagline:**\n"
        for r in review:
            md += f"- {'✅' if r['pass'] else '❌'} **{r['label']}**" + (f" — _{r['note']}_\n" if r["note"] else "\n")
    return md


def _alts_md(finals: dict[int, tuple[str, int]], typing: dict[int, str]) -> str:
    md = "### ✨ Sharper alternatives\n"
    for txt, sc in sorted(finals.values(), key=lambda x: -x[1]):
        md += f"- **{sc}/100** — {txt}\n"
    for partial in typing.values():
        if partial:
            md += f"- _… {partial}▌_\n"
    if not finals and not any(typing.values()):
        md += "_(working…)_"
    return md


def _run(url: str, pasted: str):
    """Run one rate-and-rewrite pass, yielding UI updates as they arrive.

    Every yielded 5-tuple feeds, in order: the status line, the copy box,
    the review panel, the alternatives panel, and the "Generate more" button.
    The button is revealed only by the final, successful update.

    Args:
        url: Page to extract; used when ``pasted`` is blank.
        pasted: Landing copy to use instead of fetching.
    """
    yield "⏳ Extracting the page…", gr.update(), gr.update(), gr.update(), gr.update(visible=False)

    tagline, masked_page, display_copy, company_name = extract_page(url, pasted)
    baseline = score([tagline])[0] if tagline else 0
    review_panel = _review_md(tagline, baseline, [])
    yield (
        "⏳ Warming up the GPU and writing taglines… (first run cold-starts, ~60–90s)",
        gr.update(value=display_copy),
        review_panel,
        gr.update(),
        gr.update(visible=False),
    )

    scored: dict[int, tuple[str, int]] = {}   # finished candidates by index
    in_progress: dict[int, str] = {}          # partial text of candidates still streaming
    try:
        for event in stream_remote(masked_page, company_name, tagline, n=12):
            event_type = event.get("type")
            if event_type == "done":
                break
            if event_type == "review":
                review_panel = _review_md(tagline, baseline, event["review"])
            elif event_type == "token":
                in_progress[event["i"]] = event["text"]
            elif event_type == "candidate":
                # A finished candidate replaces its in-progress entry.
                in_progress.pop(event["i"], None)
                scored[event["i"]] = (event["text"], score([event["text"]])[0])
            yield (
                "✍️ Writing alternatives…",
                gr.update(),
                review_panel,
                _alts_md(scored, in_progress),
                gr.update(visible=False),
            )
    except Exception as e:
        # Keep whatever finished before the failure; drop half-written drafts.
        yield (
            f"⚠️ Generator error: {e}. Try again in a moment.",
            gr.update(),
            review_panel,
            _alts_md(scored, {}),
            gr.update(visible=False),
        )
        return

    yield (
        "✅ Done — tweak the copy box and hit *Generate more* for another batch.",
        gr.update(),
        review_panel,
        _alts_md(scored, {}),
        gr.update(visible=True),
    )


def run_url(url: str):
    """Handler for "Rate & rewrite": always extract afresh from the given URL.

    No pasted copy is forwarded, so a changed URL is re-fetched on every click.
    """
    yield from _run(url=url, pasted="")


def run_box(pasted: str):
    """Handler for "Generate more": reuse the copy box as edited by the user.

    No URL is forwarded; the first line of the box is taken as the tagline.
    """
    yield from _run(url="", pasted=pasted)


# UI layout and event wiring.
with gr.Blocks(title="Tagline Foundry", theme=gr.themes.Soft()) as demo:
    gr.Markdown(HEADER)
    with gr.Accordion("ℹ️  How it works · open weights · why we built it", open=True):
        gr.Markdown(ABOUT)
    with gr.Row():
        url = gr.Textbox(label="Company URL", placeholder="https://yourcompany.com", scale=4)
        go = gr.Button("Rate & rewrite", variant="primary", scale=1)
    # Filled with the extracted copy after the first run; editable by the user.
    copy_box = gr.Textbox(label="Extracted landing copy (first line = your tagline; edit it, then Generate more)", lines=8)
    status = gr.Markdown()
    review = gr.Markdown()
    alts = gr.Markdown()
    # Hidden until a run completes successfully (see the final yield in _run).
    more = gr.Button("🎲 Generate more", visible=False)
    # Order must match the 5-tuples yielded by _run.
    outs = [status, copy_box, review, alts, more]
    # "Rate & rewrite" re-extracts from the URL; "Generate more" reuses the copy box.
    go.click(run_url, [url], outs)
    more.click(run_box, [copy_box], outs)


# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()