Spaces:

standd
/

tagline-foundry

Running

App Files Files Community

tagline-foundry / app.py

stephen-standd

Upload app.py with huggingface_hub

5155b9d verified 1 day ago

raw

history blame contribute delete

12.8 kB

	"""Tagline Foundry — HF Space (free CPU; generation on Modal serverless).

	Paste a URL (or copy) → extract the page (editable) → review your current tagline
	against the qualities of a good tagline (✅/❌) → a fine-tuned Qwen3.5 writes sharper
	alternatives, each scored by a local SetFit classifier.

	ZeroGPU couldn't reliably host the 9 GB model (recurring worker_init 'No CUDA GPUs
	available' across every loading approach), so generation + critique run on a Modal
	serverless GPU endpoint that scales to zero (no idle cost, no daily cap). This Space
	is plain CPU: it extracts the page, calls the endpoint, and does the SetFit scoring.
	First call after idle is a ~60-90s cold start while Modal spins a GPU + loads weights.

	Extraction prefers the real hero: a landing page's tagline lives in <h1> / og:title
	and its pitch in og:description — trafilatura's article heuristic instead grabs the
	densest prose block (often a team bio), so we read og + headings directly.
	"""

	from __future__ import annotations

	import json
	import os
	import re
	import shutil
	import stat
	import subprocess
	import tempfile
	import urllib.request
	from pathlib import Path
	from urllib.parse import urlparse

	import gradio as gr
	import numpy as np
	import requests
	import trafilatura
	from sentence_transformers import SentenceTransformer

	# lightpanda: a JS-rendering headless browser. We render every URL through it so
	# client-side-rendered pages (React/Framer/etc.) resolve to real DOM before we parse
	# the hero. A `lightpanda` already on $PATH is preferred; otherwise the linux binary is
	# downloaded on first use by _lightpanda_bin() and cached in the system temp dir.
	LP_RELEASE = "https://github.com/lightpanda-io/browser/releases/download/nightly/lightpanda-x86_64-linux"
	LP_CACHED = Path(tempfile.gettempdir()) / "lightpanda"

	# Hub repo id handed to SentenceTransformer for the scorer's embedding body.
	SETFIT_REPO = "standd/tagline-quality-setfit"
	# URL that stream_remote() POSTs to; overridable through the MODAL_STREAM_ENDPOINT env var.
	STREAM_ENDPOINT = os.environ.get("MODAL_STREAM_ENDPOINT", "https://standd--tagline-serve-generator-stream.modal.run")
	# Sent as the "token" field of the JSON request body; empty string when the env var is unset.
	SERVE_TOKEN = os.environ.get("SERVE_TOKEN", "")
	# Markdown rendered by the first gr.Markdown in the Blocks layout (page title + intro).
	HEADER = """# 🏷️ Tagline Foundry

	Rate your homepage tagline — then let a fine-tuned model try to beat it.

	Your tagline is the first line a visitor reads: the hook that says why you matter — not your
	name, and not your category ("an AI agent for X" is a category, not a tagline).

	Paste a URL below. We score your current line on the five marks of a good tagline —
	outcome over category · differentiated · clear · memorable · resonant — then a model
	fine-tuned from Claude Opus writes sharper ones, each scored 0–100.
	"""

	# Markdown rendered inside the "How it works · open weights · why we built it" accordion.
	ABOUT = """### Why we built it
	This tool exists to fix our own tagline. We make [Hey Lefty](https://heylefty.com) — an
	autonomous research agent that briefs you every morning on any topic you follow (papers,
	markets, regulators, competitors), building a compounding knowledge base while you do your
	actual work. Our homepage hero just said "Autonomous Research Agents" — a **category, not a
	tagline**. Rather than agonize over copy, we trained a model to write a better one. Tagline
	Foundry is that pipeline, opened up for any site.

	### How it works
	1. Render — [lightpanda](https://lightpanda.io), an open-source headless browser, loads the page (with JS, so SPAs resolve).
	2. Extract — the hero `<h1>` + `og:description` become your current tagline and the page context.
	3. Critique — the model grades that line ✅/❌ on the five qualities; a SetFit classifier scores it 0–100.
	4. Rewrite — a fine-tuned Qwen3.5-4B, distilled from Claude Opus 4.7 as the teacher, drafts a dozen alternatives.
	5. Rank — SetFit scores each; the strongest rise to the top.

	The Space itself is free CPU; the GPU work runs on a Modal serverless endpoint that scales to zero.

	### Open weights & data
	- Generator: [standd/tagline-qwen3p5-4b](https://huggingface.co/standd/tagline-qwen3p5-4b)
	- Quality scorer: [standd/tagline-quality-setfit](https://huggingface.co/standd/tagline-quality-setfit)
	- Datasets: [saas-taglines-rated](https://huggingface.co/datasets/standd/saas-taglines-rated) · [saas-tagline-distilled](https://huggingface.co/datasets/standd/saas-tagline-distilled) (Opus-written pairs)
	"""

	# --- rating: SetFit body on CPU + numpy LR head ---
	# Everything below runs at import time: the embedding model is loaded from SETFIT_REPO
	# and the classifier head is read from setfit_head.npz next to this file.
	_st_body = SentenceTransformer(SETFIT_REPO, device="cpu")
	# NOTE(security): allow_pickle=True lets np.load unpickle object arrays, which can run
	# arbitrary code if the file is tampered with. Presumably required because `classes`
	# is stored as an object array — TODO confirm, and keep this file under our control.
	_h = np.load(Path(__file__).parent / "setfit_head.npz", allow_pickle=True)
	# Head weights, bias, and class labels (labels coerced to str).
	_COEF, _B, _CLASSES = _h["coef"], _h["intercept"], [str(c) for c in _h["classes"]]
	# Column of the probability matrix that holds the "strong" class; .index raises
	# ValueError at import if the head has no such label.
	STRONG_COL = _CLASSES.index("strong")


	def score(taglines: list[str]) -> list[int]:
	    """Rate each tagline 0–100: the "strong"-class probability, in percent.

	    The sentence-transformer body embeds the taglines and the logistic head
	    (``_COEF`` / ``_B``) is applied by hand with numpy. Returns one rounded
	    integer per input, in input order; an empty input yields an empty list.
	    """
	    if not taglines:
	        return []
	    embeddings = np.asarray(_st_body.encode(list(taglines)))
	    logits = (embeddings @ _COEF.T + _B)[:, 0]
	    # Sigmoid of the single logit column; its complement fills the other class column.
	    positive = 1.0 / (1.0 + np.exp(-logits))
	    probs = np.stack([1.0 - positive, positive], axis=1)
	    return [round(100 * float(p)) for p in probs[:, STRONG_COL]]


	def stream_remote(masked: str, company: str, current: str, n: int = 12):
	    """Yield decoded SSE events from the Modal endpoint.

	    Each event is a dict whose ``type`` is one of: review | token | candidate | done.

	    Args:
	        masked: Page context with the hero line masked out.
	        company: Company / product name sent to the generator.
	        current: The current tagline to critique.
	        n: Number of alternatives requested.

	    Raises:
	        requests.HTTPError: The endpoint answered with a non-2xx status.
	        requests.RequestException: Connection problems or the 300 s timeout.
	        ValueError: A ``data:`` line did not contain valid JSON.
	    """
	    payload = {"token": SERVE_TOKEN, "masked": masked, "company": company, "current": current, "n": n}
	    with requests.post(STREAM_ENDPOINT, json=payload, stream=True, timeout=300) as r:
	        r.raise_for_status()
	        # Event streams are UTF-8. Without this, requests decodes a text/* response that
	        # carries no charset as ISO-8859-1 (garbling non-ASCII taglines), or yields bytes
	        # when it has no encoding at all.
	        r.encoding = "utf-8"
	        for raw in r.iter_lines(decode_unicode=True):
	            # The space after "data:" is optional in SSE, so match the bare field name.
	            if raw and raw.startswith("data:"):
	                body = raw[5:].strip()
	                if body:
	                    yield json.loads(body)


	def _meta(html: str, key: str) -> str:
	pat1 = r'<meta[^>]+(?:property\|name)=["\']' + re.escape(key) + r'["\'][^>]*content=["\']([^"\']+)'
	pat2 = r'<meta[^>]+content=["\']([^"\']+)["\'][^>]*(?:property\|name)=["\']' + re.escape(key) + r'["\']'
	m = re.search(pat1, html, re.I) or re.search(pat2, html, re.I)
	return m.group(1).strip() if m else ""


	def _headings(html: str) -> list[str]:
	heads: list[str] = []
	for tag in ("h1", "h2"):
	for raw in re.findall(rf"<{tag}[^>]>(.?)</{tag}>", html, re.I \| re.S):
	t = re.sub(r"\s+", " ", re.sub(r"<[^>]+>", " ", raw)).strip()
	if t and t not in heads and 2 <= len(t.split()) <= 20:
	heads.append(t)
	return heads


	def _lightpanda_bin() -> str | None:
	    """Locate the lightpanda executable, downloading it on first use if needed.

	    Resolution order: ``lightpanda`` on $PATH, then an executable cached copy at
	    ``LP_CACHED``, then a fresh download of ``LP_RELEASE``.

	    Returns:
	        Path to the executable, or None when it is unavailable (the caller falls
	        back to a static fetch).
	    """
	    found = shutil.which("lightpanda")
	    if found:
	        return found
	    if LP_CACHED.exists() and os.access(LP_CACHED, os.X_OK):
	        return str(LP_CACHED)
	    # NOTE(security): this executes a nightly binary fetched over the network with no
	    # checksum or signature verification; pin a release and verify a digest if possible.
	    # Download beside the final path and rename into place, so a concurrent or
	    # interrupted download can never leave a half-written file at LP_CACHED.
	    partial = LP_CACHED.with_name(f"{LP_CACHED.name}.{os.getpid()}.part")
	    try:
	        with urllib.request.urlopen(LP_RELEASE, timeout=60) as resp, open(partial, "wb") as out:
	            expected = resp.headers.get("Content-Length")
	            shutil.copyfileobj(resp, out)
	        if expected is not None and partial.stat().st_size != int(expected):
	            raise OSError(f"truncated download: got {partial.stat().st_size} of {expected} bytes")
	        partial.chmod(partial.stat().st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
	        os.replace(partial, LP_CACHED)
	        return str(LP_CACHED)
	    except Exception as e:
	        # Best effort: rendering is optional, but say why it is unavailable.
	        print(f"[extract] lightpanda download failed: {e!r}", flush=True)
	        try:
	            partial.unlink()
	        except OSError:
	            pass
	        return None


	def _fetch_html(url: str) -> str:
	    """Render the URL with lightpanda (JS executed); fall back to a static fetch.

	    The URL comes straight from the user, so it is normalised first: a bare domain
	    gets an ``https://`` prefix, and anything that is not http(s) is refused. This
	    also means the value passed to the subprocess can never start with ``-`` and be
	    read as an option.

	    Args:
	        url: Page address as typed by the user.

	    Returns:
	        The page markup, or "" when the URL is empty, refused, or nothing was fetched.
	    """
	    url = (url or "").strip()
	    if not url:
	        return ""
	    if "://" not in url:
	        url = "https://" + url  # users often paste "example.com"
	    if urlparse(url).scheme.lower() not in ("http", "https"):
	        print(f"[extract] refusing non-http(s) URL {url!r}", flush=True)
	        return ""

	    lp = _lightpanda_bin()
	    if lp:
	        try:
	            # --wait-selector h1 is the robust trigger: lifecycle events (load/
	            # domcontentloaded) can fire before a SPA paints its hero, or capture
	            # nothing on sites whose hydration crashes inside lightpanda.
	            r = subprocess.run(
	                [lp, "fetch", "--dump", "html", "--wait-selector", "h1", "--strip-mode", "js,css", url],
	                capture_output=True, text=True, timeout=45,
	            )
	            if r.stdout and len(r.stdout) > 200:
	                print(f"[extract] lightpanda rendered {url} ({len(r.stdout)} bytes)", flush=True)
	                return r.stdout
	            print(f"[extract] lightpanda output too short for {url} (exit {r.returncode}) — static fallback", flush=True)
	        except Exception as e:
	            print(f"[extract] lightpanda failed for {url}: {e!r} — static fallback", flush=True)
	    else:
	        print("[extract] lightpanda binary unavailable — static fallback", flush=True)
	    return trafilatura.fetch_url(url) or ""


	def _page_from_url(url: str) -> tuple[str, str]:
	    """Return (landing_copy, company). Hero = first heading; pitch = og:description.

	    The copy is the first heading, then the description, then up to seven more
	    headings, joined by blank lines. If none of those exist, trafilatura's
	    main-content extraction is used instead.

	    Args:
	        url: Page address as typed by the user.

	    Returns:
	        ``(landing_copy, company)``; company is at most 60 characters and defaults to
	        "the product".

	    Raises:
	        gr.Error: The page could not be fetched, or no readable text was found.
	    """
	    dl = _fetch_html(url)
	    if not dl:
	        raise gr.Error("Couldn't fetch that URL. Paste your landing copy in the box and try again.")
	    desc = _meta(dl, "og:description") or _meta(dl, "description") or _meta(dl, "twitter:description")
	    heads = _headings(dl)

	    # Strip only a leading "www." (a plain replace would also eat it mid-host), and
	    # tolerate a bare domain, for which urlparse reports an empty netloc.
	    cleaned = (url or "").strip()
	    host = urlparse(cleaned if "://" in cleaned else "https://" + cleaned).netloc
	    if host.lower().startswith("www."):
	        host = host[4:]
	    company = _meta(dl, "og:title") or _meta(dl, "og:site_name") or host

	    # dict.fromkeys drops repeats while keeping order, so a description that merely
	    # repeats a heading appears once and the hero line is not echoed into the context.
	    candidates = heads[:1] + ([desc] if desc else []) + heads[1:8]
	    md = "\n\n".join(dict.fromkeys(candidates)).strip()
	    if not md:  # fallback to trafilatura main content for pages with no usable og/headings
	        md = (trafilatura.extract(dl, include_formatting=True) or "").strip()
	    if not md:
	        raise gr.Error("No readable text found on that page. Paste your landing copy in the box instead.")
	    return md, (company.strip()[:60] or "the product")


	def extract_page(url: str, pasted: str) -> tuple[str, str, str, str]:
	    """Return (current_tagline, masked_page, display_copy, company).

	    First non-empty line is treated as the current hero tagline; the rest is page
	    context with the hero masked out. Pasted copy wins over the URL when it is
	    non-blank; its company is always "the product".

	    Only a real markdown heading marker (one to six ``#`` followed by whitespace) is
	    stripped from the hero line, so a tagline such as "#1 CRM for startups" keeps
	    its leading ``#``.

	    Returns:
	        current_tagline (at most 120 chars), masked_page (at most 3500 chars),
	        display_copy (at most 6000 chars), company.

	    Raises:
	        gr.Error: Propagated from ``_page_from_url`` when the URL path fails.
	    """
	    if pasted and pasted.strip():
	        md, company = pasted.strip(), "the product"
	    else:
	        md, company = _page_from_url(url)

	    all_lines = md.splitlines()
	    hero_idx = next((i for i, ln in enumerate(all_lines) if ln.strip()), None)
	    if hero_idx is None:
	        current, rest = "", md
	    else:
	        current = re.sub(r"^\s*#{1,6}\s+", "", all_lines[hero_idx]).strip()[:120]
	        rest = "\n".join(all_lines[hero_idx + 1:]).strip()
	    masked = ("# [TAGLINE]\n\n" + rest).strip()[:3500]
	    return current, masked, md[:6000], company


	def _review_md(current: str, cur_score: int, review: list[dict]) -> str:
	md = f"### Your current tagline\n> {current or '—'}  ·  score {cur_score}/100\n\n"
	if review:
	md += "How it scores on the qualities of a good tagline:\n"
	for r in review:
	md += f"- {'✅' if r['pass'] else '❌'} {r['label']}" + (f" — _{r['note']}_\n" if r["note"] else "\n")
	return md


	def _alts_md(finals: dict[int, tuple[str, int]], typing: dict[int, str]) -> str:
	md = "### ✨ Sharper alternatives\n"
	for txt, sc in sorted(finals.values(), key=lambda x: -x[1]):
	md += f"- {sc}/100 — {txt}\n"
	for partial in typing.values():
	if partial:
	md += f"- _… {partial}▌_\n"
	if not finals and not any(typing.values()):
	md += "_(working…)_"
	return md


	def _run(url: str, pasted: str):
	    """Drive one rate-and-rewrite pass, yielding UI updates as it goes.

	    Every yield is a 5-tuple matching the outputs
	    (status, copy_box, review, alts, more-button). The page is extracted and the
	    current tagline scored locally; then events are streamed from the remote
	    generator and the panels re-rendered after each one. A streaming failure is
	    reported in the status line and ends the run with the button still hidden.
	    """
	    yield "⏳ Extracting the page…", gr.update(), gr.update(), gr.update(), gr.update(visible=False)

	    current, masked, full, company = extract_page(url, pasted)
	    cur_score = 0 if not current else score([current])[0]
	    review_md = _review_md(current, cur_score, [])
	    yield (
	        "⏳ Warming up the GPU and writing taglines… (first run cold-starts, ~60–90s)",
	        gr.update(value=full),
	        review_md,
	        gr.update(),
	        gr.update(visible=False),
	    )

	    finals: dict[int, tuple[str, int]] = {}  # slot index -> (finished text, score)
	    typing: dict[int, str] = {}  # slot index -> partial text still streaming
	    try:
	        for evt in stream_remote(masked, company, current, n=12):
	            kind = evt.get("type")
	            if kind == "done":
	                break
	            if kind == "review":
	                review_md = _review_md(current, cur_score, evt["review"])
	            elif kind == "token":
	                typing[evt["i"]] = evt["text"]
	            elif kind == "candidate":
	                slot, text = evt["i"], evt["text"]
	                typing.pop(slot, None)
	                finals[slot] = (text, score([text])[0])
	            yield "✍️ Writing alternatives…", gr.update(), review_md, _alts_md(finals, typing), gr.update(visible=False)
	    except Exception as e:
	        yield (
	            f"⚠️ Generator error: {e}. Try again in a moment.",
	            gr.update(),
	            review_md,
	            _alts_md(finals, {}),
	            gr.update(visible=False),
	        )
	        return

	    yield (
	        "✅ Done — tweak the copy box and hit Generate more for another batch.",
	        gr.update(),
	        review_md,
	        _alts_md(finals, {}),
	        gr.update(visible=True),
	    )


	def run_url(url: str):
	    """Handler for "Rate & rewrite": stream a pass that always re-extracts from *url*.

	    The pasted-copy argument is passed as an empty string, so changing the URL and
	    clicking again re-runs extraction.
	    """
	    yield from _run(url=url, pasted="")


	def run_box(pasted: str):
	    """Handler for "Generate more": stream a pass over the (possibly edited) copy box.

	    The URL argument is passed as an empty string; the first line of *pasted* is
	    taken as the tagline.
	    """
	    yield from _run(url="", pasted=pasted)


	# --- UI: header, about accordion, URL row, copy box, and the three output panels ---
	with gr.Blocks(title="Tagline Foundry", theme=gr.themes.Soft()) as demo:
	    gr.Markdown(HEADER)
	    with gr.Accordion("ℹ️ How it works · open weights · why we built it", open=True):
	        gr.Markdown(ABOUT)

	    # URL field and primary button share one row (4:1 width via `scale`).
	    with gr.Row():
	        url = gr.Textbox(label="Company URL", placeholder="https://yourcompany.com", scale=4)
	        go = gr.Button("Rate & rewrite", variant="primary", scale=1)

	    copy_box = gr.Textbox(
	        label="Extracted landing copy (first line = your tagline; edit it, then Generate more)",
	        lines=8,
	    )
	    status = gr.Markdown()
	    review = gr.Markdown()
	    alts = gr.Markdown()
	    # Hidden until a run finishes; _run reveals it on its final yield.
	    more = gr.Button("🎲 Generate more", visible=False)

	    # Both handlers are generators that yield one value per output, in this order.
	    outs = [status, copy_box, review, alts, more]
	    go.click(fn=run_url, inputs=[url], outputs=outs)
	    more.click(fn=run_box, inputs=[copy_box], outputs=outs)


	if __name__ == "__main__":
	    demo.launch()