Spaces:
Running
Running
| """Tagline Foundry — HF Space (free CPU; generation on Modal serverless). | |
| Paste a URL (or copy) → extract the page (editable) → review your current tagline | |
| against the qualities of a good tagline (✅/❌) → a fine-tuned Qwen3.5 writes sharper | |
| alternatives, each scored by a local SetFit classifier. | |
| ZeroGPU couldn't reliably host the 9 GB model (recurring worker_init 'No CUDA GPUs | |
| available' across every loading approach), so generation + critique run on a Modal | |
| serverless GPU endpoint that scales to zero (no idle cost, no daily cap). This Space | |
| is plain CPU: it extracts the page, calls the endpoint, and does the SetFit scoring. | |
| First call after idle is a ~60-90s cold start while Modal spins a GPU + loads weights. | |
| Extraction prefers the real hero: a landing page's tagline lives in <h1> / og:title | |
| and its pitch in og:description — trafilatura's article heuristic instead grabs the | |
| densest prose block (often a team bio), so we read og + headings directly. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import re | |
| import shutil | |
| import stat | |
| import subprocess | |
| import tempfile | |
| import urllib.request | |
| from pathlib import Path | |
| from urllib.parse import urlparse | |
| import gradio as gr | |
| import numpy as np | |
| import requests | |
| import trafilatura | |
| from sentence_transformers import SentenceTransformer | |
# lightpanda: a JS-rendering headless browser. We render every URL through it so
# client-side-rendered pages (React/Framer/etc.) resolve to real DOM before we parse
# the hero. A `lightpanda` on $PATH is preferred; otherwise _lightpanda_bin()
# downloads the linux binary on demand and reuses the copy cached at LP_CACHED.
# NOTE(review): the URL points at the moving "nightly" release and the download is
# not checksum-verified — consider pinning a tagged release.
LP_RELEASE = "https://github.com/lightpanda-io/browser/releases/download/nightly/lightpanda-x86_64-linux"
# Where the downloaded binary is cached (system temp directory).
LP_CACHED = Path(tempfile.gettempdir()) / "lightpanda"
# Repo id handed to SentenceTransformer for the scoring body.
SETFIT_REPO = "standd/tagline-quality-setfit"
# Streaming generation endpoint; override with the MODAL_STREAM_ENDPOINT env var.
STREAM_ENDPOINT = os.environ.get("MODAL_STREAM_ENDPOINT", "https://standd--tagline-serve-generator-stream.modal.run")
# Sent as the "token" field of every request body; defaults to an empty string
# when the SERVE_TOKEN env var is not set.
SERVE_TOKEN = os.environ.get("SERVE_TOKEN", "")
# Markdown rendered at the very top of the page via gr.Markdown(HEADER).
HEADER = """# 🏷️ Tagline Foundry
Rate your homepage tagline — then let a fine-tuned model try to beat it.
Your tagline is the first line a visitor reads: the hook that says *why you matter* — not your
name, and not your category ("an AI agent for X" is a category, not a tagline).
**Paste a URL below.** We score your current line on the five marks of a good tagline —
*outcome over category · differentiated · clear · memorable · resonant* — then a model
fine-tuned from Claude Opus writes sharper ones, each scored 0–100.
"""
# Markdown rendered inside the "How it works" accordion via gr.Markdown(ABOUT).
ABOUT = """### Why we built it
This tool exists to fix our *own* tagline. We make **[Hey Lefty](https://heylefty.com)** — an
autonomous research agent that briefs you every morning on any topic you follow (papers,
markets, regulators, competitors), building a compounding knowledge base while you do your
actual work. Our homepage hero just said *"Autonomous Research Agents"* — a **category, not a
tagline**. Rather than agonize over copy, we trained a model to write a better one. Tagline
Foundry is that pipeline, opened up for any site.
### How it works
1. **Render** — [lightpanda](https://lightpanda.io), an open-source headless browser, loads the page (with JS, so SPAs resolve).
2. **Extract** — the hero `<h1>` + `og:description` become your current tagline and the page context.
3. **Critique** — the model grades that line ✅/❌ on the five qualities; a SetFit classifier scores it 0–100.
4. **Rewrite** — a fine-tuned **Qwen3.5-4B**, *distilled from Claude Opus 4.7* as the teacher, drafts a dozen alternatives.
5. **Rank** — SetFit scores each; the strongest rise to the top.
The Space itself is free CPU; the GPU work runs on a **Modal** serverless endpoint that scales to zero.
### Open weights & data
- **Generator:** [standd/tagline-qwen3p5-4b](https://huggingface.co/standd/tagline-qwen3p5-4b)
- **Quality scorer:** [standd/tagline-quality-setfit](https://huggingface.co/standd/tagline-quality-setfit)
- **Datasets:** [saas-taglines-rated](https://huggingface.co/datasets/standd/saas-taglines-rated) · [saas-tagline-distilled](https://huggingface.co/datasets/standd/saas-tagline-distilled) (Opus-written pairs)
"""
# --- rating: SetFit body on CPU + numpy LR head ---
# Embedding body, loaded on CPU when the module is imported.
_st_body = SentenceTransformer(SETFIT_REPO, device="cpu")
# Head parameters are read from setfit_head.npz, which sits next to this file.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can run
# arbitrary code if the .npz is ever swapped for an untrusted file. Presumably it is
# needed because "classes" was saved as an object array — confirm, and re-save it as
# a plain string array so pickling can be disabled.
_h = np.load(Path(__file__).parent / "setfit_head.npz", allow_pickle=True)
# Weights, intercept, and the class labels (coerced to str) of the head.
_COEF, _B, _CLASSES = _h["coef"], _h["intercept"], [str(c) for c in _h["classes"]]
# Index of the "strong" class; score() uses it to pick a probability column.
STRONG_COL = _CLASSES.index("strong")
def score(taglines: list[str]) -> list[int]:
    """Score each tagline from 0 to 100.

    The taglines are embedded with the SetFit body, the first row of the linear
    head is applied, and the sigmoid of that logit is taken as the probability of
    the second class (``1 - p`` for the first). The value reported is the
    probability in the ``STRONG_COL`` column, times 100, rounded to an int.

    Args:
        taglines: Texts to score. An empty list short-circuits to ``[]`` without
            touching the model.

    Returns:
        One integer per input tagline, in the same order.
    """
    if not taglines:
        return []
    embeddings = np.asarray(_st_body.encode(list(taglines)))
    logits = (embeddings @ _COEF.T + _B)[:, 0]
    p_second = 1.0 / (1.0 + np.exp(-logits))
    probs = np.stack([1.0 - p_second, p_second], axis=1)
    # Iterate the selected column directly instead of indexing by position.
    return [round(100 * float(p)) for p in probs[:, STRONG_COL]]
def stream_remote(masked: str, company: str, current: str, n: int = 12):
    """Yield SSE events from the Modal endpoint: {type: review|token|candidate|done}.

    POSTs the masked page, company name, current tagline and requested count
    (together with ``SERVE_TOKEN``) to ``STREAM_ENDPOINT`` and lazily yields the
    decoded JSON payload of every ``data:`` line as it arrives.

    Args:
        masked: Page context with the hero tagline masked out.
        company: Company / product name.
        current: The current tagline.
        n: Number of alternatives requested.

    Yields:
        One parsed JSON object per ``data:`` line.

    Raises:
        requests.HTTPError: The endpoint answered with an error status.
        requests.RequestException: Connection failure or read timeout.
        ValueError: A ``data:`` line was not valid UTF-8 or not valid JSON.
    """
    payload = {
        "token": SERVE_TOKEN,
        "masked": masked,
        "company": company,
        "current": current,
        "n": n,
    }
    with requests.post(STREAM_ENDPOINT, json=payload, stream=True, timeout=300) as resp:
        resp.raise_for_status()
        # Iterate raw bytes and decode them ourselves. With decode_unicode=True,
        # requests relies on the response's declared charset: when the server
        # omits one it either assumes ISO-8859-1 for text/* (mangling em-dashes
        # and curly quotes) or hands back undecoded bytes. Event streams are
        # UTF-8 by specification.
        for raw in resp.iter_lines():
            if not raw:
                continue  # blank separator line between events
            line = raw.decode("utf-8")
            # The space after "data:" is optional in SSE; json.loads ignores it.
            if line.startswith("data:"):
                yield json.loads(line[5:])
| def _meta(html: str, key: str) -> str: | |
| pat1 = r'<meta[^>]+(?:property|name)=["\']' + re.escape(key) + r'["\'][^>]*content=["\']([^"\']+)' | |
| pat2 = r'<meta[^>]+content=["\']([^"\']+)["\'][^>]*(?:property|name)=["\']' + re.escape(key) + r'["\']' | |
| m = re.search(pat1, html, re.I) or re.search(pat2, html, re.I) | |
| return m.group(1).strip() if m else "" | |
| def _headings(html: str) -> list[str]: | |
| heads: list[str] = [] | |
| for tag in ("h1", "h2"): | |
| for raw in re.findall(rf"<{tag}[^>]*>(.*?)</{tag}>", html, re.I | re.S): | |
| t = re.sub(r"\s+", " ", re.sub(r"<[^>]+>", " ", raw)).strip() | |
| if t and t not in heads and 2 <= len(t.split()) <= 20: | |
| heads.append(t) | |
| return heads | |
def _lightpanda_bin() -> str | None:
    """Locate a lightpanda executable, downloading it when necessary.

    Lookup order: ``lightpanda`` on ``$PATH``, then an executable copy cached at
    ``LP_CACHED``, then a fresh download of ``LP_RELEASE``.

    The download is written to a temporary file in the cache directory and
    moved into place with ``os.replace``, so a concurrent caller or an
    interrupted transfer never leaves a half-written binary at ``LP_CACHED``.
    A socket timeout keeps a stalled download from hanging the request.

    Returns:
        Path to the executable, or ``None`` when none could be obtained (the
        failure is logged; callers fall back to a static fetch).
    """
    found = shutil.which("lightpanda")
    if found:
        return found
    if LP_CACHED.exists() and os.access(LP_CACHED, os.X_OK):
        return str(LP_CACHED)

    tmp_path = None
    try:
        fd, tmp_name = tempfile.mkstemp(prefix="lightpanda-", dir=str(LP_CACHED.parent))
        tmp_path = Path(tmp_name)
        with os.fdopen(fd, "wb") as out, urllib.request.urlopen(LP_RELEASE, timeout=120) as resp:
            shutil.copyfileobj(resp, out)
        tmp_path.chmod(0o755)
        os.replace(tmp_path, LP_CACHED)
        return str(LP_CACHED)
    except Exception as e:
        # Best effort by design: report the reason and let the caller fall back.
        print(f"[extract] lightpanda download failed: {e!r}", flush=True)
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                pass  # already moved or never created
        return None
| def _fetch_html(url: str) -> str: | |
| """Render the URL with lightpanda (JS executed); fall back to a static fetch.""" | |
| lp = _lightpanda_bin() | |
| if lp: | |
| try: | |
| # --wait-selector h1 is the robust trigger: lifecycle events (load/ | |
| # domcontentloaded) can fire before a SPA paints its hero, or capture | |
| # nothing on sites whose hydration crashes inside lightpanda. | |
| r = subprocess.run( | |
| [lp, "fetch", "--dump", "html", "--wait-selector", "h1", "--strip-mode", "js,css", url], | |
| capture_output=True, text=True, timeout=45, | |
| ) | |
| if r.stdout and len(r.stdout) > 200: | |
| print(f"[extract] lightpanda rendered {url} ({len(r.stdout)} bytes)", flush=True) | |
| return r.stdout | |
| except Exception as e: | |
| print(f"[extract] lightpanda failed for {url}: {e!r} — static fallback", flush=True) | |
| else: | |
| print("[extract] lightpanda binary unavailable — static fallback", flush=True) | |
| return trafilatura.fetch_url(url) or "" | |
def _page_from_url(url: str) -> tuple[str, str]:
    """Return (landing_copy, company). Hero = first heading; pitch = og:description.

    The copy is the first heading, then the description (``og:description``,
    ``description`` or ``twitter:description``), then up to seven further
    headings, separated by blank lines. When the page offers neither headings
    nor a description, trafilatura's main-content extraction is used instead.

    The company name is ``og:title``, else ``og:site_name``, else the URL's
    host name without a leading ``www.``; it is capped at 60 characters and
    defaults to ``"the product"``.

    Raises:
        gr.Error: The page could not be fetched, or no readable text was found.
    """
    dl = _fetch_html(url)
    if not dl:
        raise gr.Error("Couldn't fetch that URL. Paste your landing copy in the box and try again.")
    desc = _meta(dl, "og:description") or _meta(dl, "description") or _meta(dl, "twitter:description")
    heads = _headings(dl)

    # hostname (not netloc) drops any port / credentials; strip "www." only as
    # a prefix so a host that merely contains "www." is left intact.
    host = urlparse(url).hostname or ""
    if host.startswith("www."):
        host = host[len("www."):]
    company = _meta(dl, "og:title") or _meta(dl, "og:site_name") or host

    hero = heads[:1]
    # A description identical to the hero would put the current tagline back
    # into the context that is meant to have it masked out — skip it.
    pitch = [desc] if desc and desc not in hero else []
    md = "\n\n".join(hero + pitch + heads[1:8]).strip()
    if not md:  # fallback to trafilatura main content for pages with no usable og/headings
        md = (trafilatura.extract(dl, include_formatting=True) or "").strip()
    if not md:
        raise gr.Error("No readable text found on that page. Paste your landing copy in the box instead.")
    return md, (company.strip()[:60] or "the product")
def extract_page(url: str, pasted: str) -> tuple[str, str, str, str]:
    """Return (current_tagline, masked_page, display_copy, company).

    Pasted copy wins over the URL; with pasted copy the company is always
    "the product". The first non-blank line (leading ``#`` markers removed,
    capped at 120 chars) is the current tagline. Everything after that line
    becomes the page context, prefixed by a ``# [TAGLINE]`` placeholder and
    capped at 3500 chars. The display copy is the full text capped at 6000.
    """
    pasted_text = pasted.strip() if pasted else ""
    if pasted_text:
        md, company = pasted_text, "the product"
    else:
        md, company = _page_from_url(url)

    all_lines = md.splitlines()
    hero_idx = next((i for i, ln in enumerate(all_lines) if ln.strip()), None)
    if hero_idx is None:
        current, rest = "", md
    else:
        current = re.sub(r"^#+\s*", "", all_lines[hero_idx]).strip()[:120]
        rest = "\n".join(all_lines[hero_idx + 1:]).strip()

    masked = ("# [TAGLINE]\n\n" + rest).strip()[:3500]
    return current, masked, md[:6000], company
| def _review_md(current: str, cur_score: int, review: list[dict]) -> str: | |
| md = f"### Your current tagline\n> **{current or '—'}** · score **{cur_score}/100**\n\n" | |
| if review: | |
| md += "**How it scores on the qualities of a good tagline:**\n" | |
| for r in review: | |
| md += f"- {'✅' if r['pass'] else '❌'} **{r['label']}**" + (f" — _{r['note']}_\n" if r["note"] else "\n") | |
| return md | |
| def _alts_md(finals: dict[int, tuple[str, int]], typing: dict[int, str]) -> str: | |
| md = "### ✨ Sharper alternatives\n" | |
| for txt, sc in sorted(finals.values(), key=lambda x: -x[1]): | |
| md += f"- **{sc}/100** — {txt}\n" | |
| for partial in typing.values(): | |
| if partial: | |
| md += f"- _… {partial}▌_\n" | |
| if not finals and not any(typing.values()): | |
| md += "_(working…)_" | |
| return md | |
def _run(url: str, pasted: str):
    """Drive one rate-and-rewrite pass, yielding UI updates as it progresses.

    Every yielded 5-tuple matches the ``outs`` list the buttons are wired to:
    (status markdown, copy box, review markdown, alternatives markdown,
    "Generate more" button). ``gr.update()`` with no arguments leaves that
    component untouched.

    Steps: extract the page (or use the pasted copy), score the current
    tagline locally, then consume the remote event stream, re-rendering after
    every event. The "Generate more" button is revealed only after the stream
    finishes without an exception.

    Exceptions raised by ``extract_page`` propagate to the caller; exceptions
    raised while streaming are reported in the status line instead.
    """
    yield "⏳ Extracting the page…", gr.update(), gr.update(), gr.update(), gr.update(visible=False)
    current, masked, full, company = extract_page(url, pasted)
    # An empty tagline is shown with a score of 0 rather than being scored.
    cur_score = score([current])[0] if current else 0
    # First render has no per-quality verdicts yet; they arrive via a "review" event.
    review_md = _review_md(current, cur_score, [])
    yield ("⏳ Warming up the GPU and writing taglines… (first run cold-starts, ~60–90s)",
           gr.update(value=full), review_md, gr.update(), gr.update(visible=False))
    finals: dict[int, tuple[str, int]] = {}  # candidate index -> (text, score)
    typing: dict[int, str] = {}  # candidate index -> partial text still streaming
    try:
        for evt in stream_remote(masked, company, current, n=12):
            kind = evt.get("type")
            if kind == "review":
                review_md = _review_md(current, cur_score, evt["review"])
            elif kind == "token":
                typing[evt["i"]] = evt["text"]
            elif kind == "candidate":
                # The candidate is complete: drop its draft and score the final text.
                typing.pop(evt["i"], None)
                finals[evt["i"]] = (evt["text"], score([evt["text"]])[0])
            elif kind == "done":
                break
            # Re-render after every event other than "done".
            yield "✍️ Writing alternatives…", gr.update(), review_md, _alts_md(finals, typing), gr.update(visible=False)
    except Exception as e:
        # Keep whatever was finished, discard half-typed drafts, and stop here.
        yield (f"⚠️ Generator error: {e}. Try again in a moment.",
               gr.update(), review_md, _alts_md(finals, {}), gr.update(visible=False))
        return
    yield ("✅ Done — tweak the copy box and hit *Generate more* for another batch.",
           gr.update(), review_md, _alts_md(finals, {}), gr.update(visible=True))
def run_url(url: str):
    """Handle "Rate & rewrite": always extract afresh from ``url``.

    No pasted copy is passed on, so editing the URL and clicking again re-runs
    the whole extraction.
    """
    yield from _run(url=url, pasted="")
def run_box(pasted: str):
    """Handle "Generate more": work from the (possibly edited) copy box.

    No URL is passed on; the first line of ``pasted`` is taken as the tagline.
    """
    yield from _run(url="", pasted=pasted)
# --- UI: header, about accordion, URL row, editable copy box, result panels ---
with gr.Blocks(title="Tagline Foundry", theme=gr.themes.Soft()) as demo:
    gr.Markdown(HEADER)
    with gr.Accordion("ℹ️ How it works · open weights · why we built it", open=True):
        gr.Markdown(ABOUT)
    with gr.Row():
        url = gr.Textbox(label="Company URL", placeholder="https://yourcompany.com", scale=4)
        go = gr.Button("Rate & rewrite", variant="primary", scale=1)
    copy_box = gr.Textbox(label="Extracted landing copy (first line = your tagline; edit it, then Generate more)", lines=8)
    status = gr.Markdown()
    review = gr.Markdown()
    alts = gr.Markdown()
    # Hidden until a run completes; _run() toggles its visibility.
    more = gr.Button("🎲 Generate more", visible=False)
    # Order must match the 5-tuples yielded by _run().
    outs = [status, copy_box, review, alts, more]
    go.click(run_url, [url], outs)  # re-extract from the URL
    more.click(run_box, [copy_box], outs)  # reuse the edited copy box
if __name__ == "__main__":
    demo.launch()