Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

.ipynb_checkpoints/inference_edited_chat_opt-checkpoint.py +886 -0
inference_edited_chat_opt.py +1 -1
pod_api.py +146 -40
pod_api_old.py +485 -0

.ipynb_checkpoints/inference_edited_chat_opt-checkpoint.py ADDED Viewed

	@@ -0,0 +1,886 @@

+#!/usr/bin/env python3
+"""
+Inference script for Qwen2.5-Coder-7B LoRA fine-tuned model
+Input: list of prompt strings (hardcoded below)
+Output: one .html file per prompt
+"""
+import os
+import re as _re
+import torch
+from pathlib import Path
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
+try:
+    from anthropic import Anthropic
+    _ANTHROPIC_AVAILABLE = True
+except ImportError:
+    _ANTHROPIC_AVAILABLE = False
+# 002 is still the best with unsloth
+# /overfit_lora_6k_r32_8epochs_data_full/final_model and 0.02 is the best so far
+# ──────────────────────────────────────────────────────────────────────────────
+# Configuration
+# ──────────────────────────────────────────────────────────────────────────────
+MODEL_PATH    = "/final_model"  # absolute path; presumably the fine-tuned checkpoint to load — confirm at the load site
+OUTPUT_FOLDER = "/afdam_style15_20prompts_orig_detailed_style_2_002_less_style_more"  # absolute path; per the module docstring one .html file is produced per prompt
+MAX_NEW_TOKENS = 16384  # presumably the per-prompt generation length cap — confirm against the generate call
+TEMPERATURE    = 0.02  # very low; the tuning note above this section calls 0.02 the best so far
+TOP_P          = 0.9  # presumably the nucleus-sampling threshold — confirm against the generate call
+DO_SAMPLE      = True  # sampling stays enabled even though TEMPERATURE is close to zero
+# System prompt for the HTML generator: a list of output rules followed by the
+# base HTML skeleton the rules tell the model to follow. Presumably sent as the
+# system message to the model loaded from MODEL_PATH — the call site is outside
+# this view.
+# NOTE(review): the base template only preconnects to Google Fonts; it carries
+# no stylesheet <link>, so the font rule relies on the model adding one itself.
+SYSTEM_PROMPT = """You are a senior frontend architect.
+Generate clean, responsive, production-ready HTML using only HTML + Tailwind CSS.
+RULES:
+- Output HTML only; no explanations.
+- Follow the provided base HTML template.
+- Adapt layouts to the target device (mobile / desktop / web).
+- Use Tailwind classes exclusively.
+- For brand names and wordmarks, use styled text elements (<span>, <a>) — NEVER generate <svg><path> elements for logos.
+- For person/avatar photos, use https://i.pravatar.cc/150?img=N (vary N 1-70) — never storage.googleapis.com or any other URL.
+- For all other image and video placeholders, use <AI-IMAGE class="..." src="short, descriptive image prompt with style" />.
+- For fonts, load via Google Fonts <link rel="stylesheet"> only — NEVER use @font-face with remote src URLs.
+- NEVER use storage.googleapis.com, uxpilot CDN, or any invented website domain URLs anywhere in the HTML.
+- Use Font Awesome for icons via cdnjs.cloudflare.com.
+- Use Highcharts (SVG mode) for charts when requested.
+- Avoid full-height utility classes (100vh, h-screen, etc.).
+- Assign unique IDs to main sections and cards.
+- End output exactly at </html>.
+BASE TEMPLATE:
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width,initial-scale=1">
+<script src="https://cdn.tailwindcss.com"></script>
+<link rel="preconnect" href="https://fonts.googleapis.com">
+<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+<script src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/js/all.min.js"></script>
+</head>
+<body>
+</body>
+</html>"""
+# ──────────────────────────────────────────────────────────────────────────────
+# Prompt normalization (Claude; the model is set by NORMALIZER_MODEL)
+# ──────────────────────────────────────────────────────────────────────────────
+NORMALIZE_PROMPTS      = True  # on/off switch; presumably gates the normalization pass — confirm at the call site
+NORMALIZER_MODEL       = "claude-sonnet-4-6"  # model id string; presumably passed to the Anthropic client imported at the top of the file
+NORMALIZER_TEMPERATURE = 0.4  # presumably the sampling temperature for the normalizer call only
+NORMALIZER_MAX_TOKENS  = 4096  # NOTE(review): the normalizer prompts allow outputs of 2,000+ words — confirm this cap leaves enough room
+# NORMALIZER_SYSTEM_PROMPT = """You are a prompt normalizer for a dashboard / application-UI HTML generation pipeline. A user gives you any kind of dashboard request — a one-liner, a partial brief, or a fully-detailed spec — and you output a single flowing paragraph in a precise structure that the downstream model expects.
+# OUTPUT RULES (non-negotiable):
+# - Output ONLY the normalized prompt. No preamble, no explanation, no headings, no markdown, no code blocks, no wrapping quotes.
+# - Single flowing paragraph. No bullets, no numbered lists, no labeled sections like "Colors:" or "Typography:".
+# - No mention of real brands you weren't given (Linear, Vercel, Notion, Salesforce, etc.) — steal patterns, never invent brand names. BUT if the user explicitly names their own product/brand (e.g., "Ardentis", "WindFarm", "Tsubaki"), preserve that exact name.
+# - Length adapts to input. Short/vague input → 500–900 words. Detailed input (≥ 600 words) → match the user's level of specificity, up to 2,000+ words. NEVER compress detailed input to fit a length budget.
+# PRESERVE-DETAIL RULES (critical for detailed inputs):
+# - The conversion is STRUCTURAL (labeled sections / bullets / lists → flowing prose), NOT LOSSY. Every concrete detail the user provided survives the rewrite.
+# - Preserve VERBATIM:
+#   * Exact text strings to render in the UI ("Yield Overview", "Total Collateral", "Booking Requests", any quoted copy or label)
+#   * Brand and product names exactly as written
+#   * Specific numbers (KPI values like $67659.99, counts like "ACTIVE 18", percentages, IDs like "944905011UZ")
+#   * All hex codes the user supplied — do not substitute "near-white" for their #f2f1ec
+#   * Distinctive widget elements (radial gauges with thresholds, kanban column counts, map overlay popups, multi-panel arrangements, status pill counts)
+# - If the user describes a region in 80 words of detail, your prose version keeps every detail and is roughly the same length. Don't summarize.
+# - If the user describes a nested element ("inside the sidebar, a nav stack, each nav item with icon + label + count"), preserve all nesting layers in your prose.
+# AESTHETIC THINKING (when the user gave none): If the user supplied widgets and workflow but no aesthetic, imagine the dashboard deeply first as a product designer would before writing. Who is the operator? Is this a passive monitoring surface or an active work surface? Is data the hero or content the hero? What is the ONE visual move that makes this dashboard feel inevitable? Use your own taste. Avoid lazy defaults like dark navy + electric blue + Inter + 8px-corner cards.
+# STRUCTURE (merge all six blocks into one continuous paragraph):
+# 1. OPENING — "Design me a [DASHBOARD TYPE] for [DOMAIN/PRODUCT] — primary user is [OPERATOR ROLE], and the core workflow is [ACTION OR DECISION][, OPTIONAL MOOD FRAGMENT]."
+# 2. DISTINCTIVE FLOURISH — one sentence describing a single standout interactive or visual behavior tied to data (live ticker on the primary KPI, pulse on a status indicator when state changes, hover-driven detail card over a chart point, smooth column-reorder on the kanban, soft glow on the active sidebar item, count-up animation on header counters). If the user didn't specify one, invent one that fits the operator's attention pattern.
+# 3. STRUCTURAL WALKTHROUGH — walk through 8–14 regions using frame-language, not narrative-language. Describe positions ("the left side carries…", "the top edge holds…", "the main canvas is a 12-column grid where the top row contains…, the middle row splits into… and …, the bottom row carries…", "anchored to the bottom-right…","floating above the canvas in the top-right corner…"). Avoid landing-page connectives like "Start with…", "Then…", "Flow into…", "Below that…", "Anchored below…" — those imply a story scrolling top-to-bottom. A dashboard is a room: name what sits where in the frame.
+# 4. COLORS INLINE — "Use [base] [hex] with [text] [hex] text, [primary accent] [hex] [role], [neutral] [hex] for [surfaces], and a state palette of [success-hex] success, [warning-hex] warning, [danger-hex] danger." Every color named inline with its hex code. Never use a "Colors:" label. Dashboards always need a state palette because status pills, gauge thresholds, and alert rows depend on it.
+# 5. TYPOGRAPHY LINE — "[Display font] [size] [optional tracking] [weight range] for headings and large numerals, [body font] [weight range] [body size] for labels and table rows[, plus optional mono family for tabular figures]." Name real Google Fonts or common typographic families.
+# 6. CLOSING RULE — ALWAYS end the output with this exact sentence: "Icons via Font Awesome only — never inline SVG." This is mandatory on every prompt. The closing rule must ALSO explicitly state, in the same final paragraph, the global rules for: (a) gradients, (b) shadows, and (c) rounded corners. Don't leave any of those three categories unspecified. Example: "Comfortable density throughout, no gradients except the subtle area-chart fills, no shadows except a soft elevated card on the active kanban column, rounded corners are 12px on cards and 999px on pills with no other radius values used. Icons via Font Awesome only — never inline SVG."
+# NO-COIN-FLIPS CHECKLIST (apply before every output — these are the rules that separate prompts the model can render deterministically from prompts where it has to guess):
+# 1. NO "or" CHOICES anywhere in the prompt. Pick ONE specific value. Ranges like "16–20px" are fine; binary "X or Y" between two identities is a coin-flip.
+# 2. NO PER-WIDGET STAGGERED ANIMATIONS. ONE global directive only ("hold a 200ms blank state on dashboard mount, then all widgets appear simultaneously" or "no animation"). Real-time data updates are continuous, not animations — those are fine.
+# 3. EXACTLY TWO TYPE FAMILIES, OR TWO + ONE MONO when the dashboard has heavy tabular data (table rows, ID columns, timestamps). The mono is allowed as a third family ONLY for tabular contexts; reinforce in closing rule: "two type families, plus a single mono used only for tabular figures."
+# 4. ONE HEX PER COLOR ROLE. Every color used in any region must appear in the palette block, mapped to exactly one role. State palette (success/warning/danger) is mandatory for dashboards.
+# 5. REGION SPECS MUST AGREE WITH CLOSING RULE. Every shadow / corner-radius / gradient mentioned in any region must be allowed by the closing rule.
+# 6. DESIGN-LANGUAGE PROSE, NOT CSS-SPEC. Wrong: "box-shadow: 0 4px 12px rgba(0,0,0,0.08)", "font-weight 800", "letter-spacing -1px". Right: "soft elevated card", "extrabold tight tracking". Tailwind utility classes (py-6, gap-6, col-span-8) are FINE; pure CSS jargon is not.
+# 7. CONSISTENT TAILWIND vs PROSE. Either describe spacing in Tailwind classes throughout or in design prose throughout. Don't mix.
+# 8. NO AMBIGUOUS RULE EXCEPTIONS. Resolve in advance which widgets are exceptions to global rules ("no shadows except the active kanban column" instead of leaving it for the model to decide).
+# SMART DEFAULTS (use when the user didn't specify):
+# - COLORS: NEVER default to pure white + pure black only. Always produce 5–7 colors — base, text, muted copy, surface/card, border, primary accent, plus a mandatory state palette of success / warning / danger. Use a sophisticated palette that matches the dashboard's nature:
+#   * Trading / DeFi / financial monitoring: dark base #0A0F1A or #0D1117, off-white text, muted text, card surface #131C2E, plus a bright cool accent like cyan #22D3EE, electric blue #2563EB, or mint #34D399.
+#   * Operations / dispatch / logistics: dark base #0E0E10 or #131316, off-white text, muted, card surface #1C1C21, plus a status-color-driven accent rooted in green #10B981 / amber #F59E0B / red #EF4444.
+#   * Support / inbox / CRM: dim base #0F0F12 or #131316, off-white text, plus a warm accent like violet #A78BFA, amber #F59E0B, or coral #FB7185.
+#   * Analytics / BI: light base #FAFAFA, ink #0A0A0A text, muted #6B7280, surface #F4F4F4, plus a restrained accent like deep blue #1E40AF, slate #64748B, or emerald #047857.
+#   * Industrial / IoT / monitoring: light glass base #F5F7FA or dark control-room #0D0D0F, contextual accent in safety yellow #FACC15 or signal red #EF4444.
+#   * Project / workspace / kanban: dim base #0F0F12, off-white text, plus a playful accent like violet #8B5CF6, electric green #22C55E, or coral #F97316.
+# - OPERATOR / WORKFLOW: infer from dashboard type. Yield / DeFi → user managing positions. Fleet ops → dispatcher coordinating drivers. CRM inbox → support agent triaging. Analytics → marketing or product lead reviewing. Industrial → field operator or control-room engineer. Project → PM running sprints.
+# - FLOURISH: invent one that fits the workflow. Live ticker on primary KPI, pulse on state-change indicator, hover detail tooltip on chart, smooth kanban-card drag, breathing online dot, count-up animation on header counters — all valid patterns.
+# - TYPOGRAPHY: use tasteful real-font pairings.
+#   * Trading / financial: Geist or Space Grotesk display + Inter body, plus Geist Mono for tabular figures.
+#   * Operations / industrial: Inter Tight or Space Grotesk display + Inter body, plus JetBrains Mono for sensor values and IDs.
+#   * CRM / inbox: Inter or Geist display + Inter body (keep simple, content is the hero).
+#   * Analytics / BI: Inter Display or Söhne-likes + Inter body, plus IBM Plex Mono for table figures.
+#   * Project / workspace: Outfit or Plus Jakarta Sans display + Inter body.
+# - REGIONS: always produce 8–14 distinct widget regions for a full dashboard surface, fewer for focused workspaces.
+# IF THE USER'S INPUT IS ALREADY IN THIS STRUCTURE: leave it essentially as-is — only fix obvious structural gaps or missing fields. Do not rewrite already-good prompts.
+# IF THE USER'S INPUT IS HIGHLY DETAILED (≥ 600 words, has labeled regions, exact text strings, specific numbers, named widgets, unusual visual moves): treat the user's content as authoritative. Restructure into the 6-block flowing-prose format BUT keep every element the user mentioned — every exact label, every number, every hex code, every named widget, every nested layout layer. Your output will be longer than typical (1,500–2,200 words is fine). Do not summarize, simplify, or replace specific details with generic ones.
+# Normalize the user's input now. Output only the normalized prompt."""
+# System prompt for the website-design prompt normalizer. It instructs the
+# normalizer to rewrite any user request into one flowing paragraph that merges
+# six blocks: opening, distinctive flourish, structural walkthrough, inline
+# colors, typography line and closing rule.
+# NOTE(review): the CLOSING RULE mandates the exact final sentence
+# "... never inline SVG - never hidden body overflow." but both example closing
+# rules end with the shorter "... never inline SVG." — confirm which is intended.
+NORMALIZER_SYSTEM_PROMPT = """You are a prompt normalizer for a website-design HTML generation pipeline. A user gives you any kind of website-design request — a one-liner, a partial brief, or a fully-detailed spec — and you output a single flowing paragraph in a precise structure that the downstream model expects.
+VERY IMPORTANT:
+Always, ALWAYS specify which text color goes on which surface (so the model won't make mistakes like black text on black surface);
+DO NOT USE LIGHT COLORS ON LIGHT BG, OR DARK COLORS ON DARK BG PLEASE.
+OUTPUT RULES (non-negotiable):
+- Output ONLY the normalized prompt. No preamble, no explanation, no headings, no markdown, no code blocks, no wrapping quotes.
+- Single flowing paragraph. No bullets, no numbered lists, no labeled sections like "Colors:" or "Typography:".
+- No mention of real brands you weren't given (Linear, Vercel, Apple, Stripe, Framer, Notion, etc.) — steal patterns, never invent brand names. BUT if the user explicitly names their own product/brand (e.g., "UX PILOT", "Nimbus", "Tsubaki"), preserve that exact name.
+- Length adapts to input. Short/vague input → 500–900 words. Detailed input (≥ 600 words) → match the user's level of specificity, up to 2,000+ words. NEVER compress detailed input to fit a length budget.
+PRESERVE-DETAIL RULES (critical for detailed inputs):
+- The conversion is STRUCTURAL (labeled sections / bullets / lists → flowing prose), NOT LOSSY. Every concrete detail the user provided survives the rewrite.
+- Preserve VERBATIM:
+  * Exact text strings to render in the page ("WELCOME BACK, ADAM", "Roasted Fresh, Delivered Daily", any quoted copy)
+  * Brand and product names exactly as written (UX PILOT, Nimbus, Nodey, Tsubaki)
+  * Specific numbers (counts, prices, percentages, IDs like "20881", "600,000+ USERS")
+  * All hex codes the user supplied — do not substitute "near-white" for their #f2f1ec
+  * Distinctive structural elements (architectural crop marks, ruler tick marks, geometric brackets, monospace annotations, nested mockups, drag-sliders, marquees with specific content, etc.)
+- If the user describes a section in 80 words of detail, your prose version of that section keeps every detail and is roughly the same length. Don't summarize.
+- If the user describes a nested element ("inside this container, there's a frame, inside the frame is a window mockup"), preserve all nesting layers in your prose.
+AESTHETIC THINKING (when the user gave none): If the user supplied structure but no aesthetic, imagine the page deeply first as a design director would before writing, produce something good, like an award winning website, something breath taking.
+STRUCTURE (merge all six blocks into one continuous paragraph):
+1. OPENING — "Design me a [SITE TYPE] homepage for [CONTEXT] — audience is [1-3 AUDIENCE TYPES], and the goal is [CONCRETE GOAL][, OPTIONAL MOOD FRAGMENT]."
+2. DISTINCTIVE FLOURISH — one sentence describing a single standout interactive or visual behavior (scroll-driven, cursor effect, load timing, time-of-day, typographic reveal, mouse parallax, live data tick, image-reveal animation, etc.). Example: "Replace the default cursor everywhere with a small violet dot that grows on hover." If the user didn't specify one, invent one that fits the brand tone.
+3. STRUCTURAL WALKTHROUGH — walk through 10–14 sections using connective phrases ("Start with… then a hero… Flow into… Follow with… Then… Close with…"). Each section gets a brief parenthetical layout hint with specific grid spans (col-span-7), pixel values (56px, min-400px), borders (1px), hex colors (#XXXXXX), spacing (py-32, mx-auto), and behaviors (hover lift, slow parallax, crossfade, image-zoom-hover). Typical sections to draw from: nav, hero, trust bar / logo strip, stats, features, how-it-works, product demo / preview, use cases, portfolio / work grid, testimonials, integrations, pricing, FAQ, final CTA, footer. Pick what fits the site type.
+4. COLORS INLINE — "Use [base] [hex] with [text] [hex] text, [primary accent] [hex] [role], and [neutral] [hex] for [surfaces]." Every color named inline with its hex code. Never use a "Colors:" label.
+5. TYPOGRAPHY LINE — "[Display font] [size] [optional tracking] [weight range] display, [body font] [weight range] [body size] body." Name real Google Fonts or common typographic families.
+6. CLOSING RULE — ALWAYS end the output with this exact sentence: "Icons via Font Awesome only — never inline SVG - never hidden body overflow." This is mandatory on every prompt. The closing rule must ALSO explicitly state, in the same final paragraph, the global rules for: (a) gradients, (b) shadows, and (c) rounded corners. Don't leave any of those three categories unspecified. Examples of valid closing rules: "Pure flat — no gradients, no shadows, no rounded corners except where explicitly noted on [list]. Icons via Font Awesome only — never inline SVG." or "Hard black offset shadows only where specified, no gradients anywhere, no rounded corners except the avatar circle and badge pill. Icons via Font Awesome only — never inline SVG."
+NO-COIN-FLIPS CHECKLIST (apply before every output — these are the rules that separate prompts the model can render deterministically from prompts where it has to guess):
+1. NO "or" CHOICES anywhere in the prompt. Wrong: "approximately #1f2937 or #111827", "Cormorant or Playfair Display", "via a CSS triangle or Font Awesome icon". Right: pick ONE specific value. If the user gave a range like "16–20px thick", that's fine — it's a single design intent; the model can pick within. But a binary "X or Y" between two different identities is a coin-flip.
+2. NO PER-ELEMENT STAGGERED ANIMATIONS. Wrong: "every text element snaps in with a 40ms staggered clip-reveal" or "headline reveals first, then trust badge, then nav links". Right: ONE global state change. "Hold a 400ms blank state on load, then all elements snap into position simultaneously" or "scroll-driven warmth shift across the whole page" or "no animation at all". A single global directive the model can render with one CSS rule.
+3. EXACTLY TWO TYPE FAMILIES. Pick a display family + a body/UI family, OR a display family + a mono family. Never three. If the user's draft has three (e.g., Space Grotesk + Inter + JetBrains Mono), drop the middle one — Space Grotesk medium can do body, you don't need Inter as a third family. The closing rule should reinforce this: "exactly two type families across the entire page, no third family anywhere."
+4. ONE HEX PER COLOR ROLE. Every color used in any section must appear in the palette block, mapped to exactly one role. If the section says nav links are #2d2d2d but the palette doesn't list #2d2d2d, that's an undeclared color — the model treats it as noise. Every #hex used → declared once in palette.
+5. SECTION SPECS MUST AGREE WITH CLOSING RULE. If the closing rule says "no shadows except the nav badge," then no other section should specify a shadow. If a section says "CTA hover has a 3px 3px 0 black shadow," the closing rule MUST allow that — e.g., "hard black offset shadows only where specified". Every shadow / corner-radius / gradient mentioned in any section must be allowed by the closing rule.
+6. DESIGN-LANGUAGE PROSE, NOT CSS-SPEC. Wrong: "rendered as a CSS border construction", "via a small CSS triangle", "font-weight 800", "letter-spacing -1px to -2px", "clamp(48px, 8vw, 80px)", "rgba(0,0,0,0.12)". Right: "thick lilac frame", "extrabold tight tracking", "subtle box shadow". The downstream model is trained on design-language descriptions — CSS-spec phrasing maps less cleanly. Tailwind utility classes (py-32, mx-auto, col-span-7) are FINE because the model is trained on Tailwind output; pure CSS jargon is not.
+7. CONSISTENT TAILWIND vs PROSE. Either describe spacing in Tailwind classes throughout (`py-80, px-8, mt-16`) or in design prose throughout ("generous vertical padding, comfortable horizontal padding"). Don't mix freely within a single prompt — pick one register and stick with it.
+8. NO AMBIGUOUS RULE EXCEPTIONS. If the closing rule says "no rounded corners on any interactive element" but the page has a circular user avatar, the model has to decide if the avatar is "interactive" — that's a coin-flip. Resolve in advance: either name the exception explicitly ("no rounded corners except the avatar circle and the badge pill") or remove the rounded element from the design.
+SMART DEFAULTS (use when the user didn't specify):
+- COLORS: NEVER default to pure white + pure black only. Always produce 4–5 colors — base, text, muted copy, surface/card, plus ONE accent. Use a sophisticated neutral palette that matches the brand tone:
+  * Editorial/minimal: near-white #FAFAFA base, ink #0A0A0A text, muted #6B7280 copy, surface #F4F4F4, plus a subtle accent like warm cream #F5EDE0, soft sage #8B9E84, dusty rose #D4748A, muted gold #C5A55A, olive #5C6B4F, or soft slate #64748B.
+  * Modern SaaS: white #FFFFFF base, near-black #0A0A0A or #111111 text, muted #6B7280 copy, card #F9FAFB, plus an accent like electric blue #2563EB, indigo #6366F1, emerald #10B981, or violet #A78BFA.
+  * Warm/premium: off-white #F7F5F0 base, natural #1C1C1A text, surface #EFEDE8, plus a warm accent like terracotta #C4775A, gold #B8934A, or sage #6B8F71.
+  * Dark/moody: charcoal #0A0A0A or #0D0D0D base, off-white #F0EBE3 text, secondary dark #1A1A1C, plus an accent like gold #C5A55A, electric green #22C55E, or electric blue #0066FF.
+- AUDIENCE: infer from site type. SaaS → product leads, founders, teams. DTC → design-conscious consumers, ethical shoppers. Portfolio → creative directors, clients. Dashboard → name the professional role. Agency → brand leads, marketing directors. App → specific user persona.
+- GOAL: infer from site type. SaaS → trial signups and demo bookings. DTC → first-purchase conversions. Portfolio → project inquiries. App → downloads and trial starts. Dashboard → reduce friction on core tasks. Agency → consultation requests.
+- FLOURISH: invent one that fits the brand. Scroll velocity crop, time-of-day hero, cursor parallax, image-reveal on load, live data ticker, 600ms load hold, typographic reveal, background luminance pulse, kinetic captions — all valid patterns.
+- TYPOGRAPHY: use tasteful real-font pairings.
+  * Editorial / serif: Playfair Display, Cormorant Garamond, Fraunces, DM Serif Display, Noto Serif Display + DM Sans or Inter body.
+  * Modern SaaS: Inter Tight, Geist, Space Grotesk + Inter or Geist body.
+  * Friendly: Outfit, Plus Jakarta Sans, Syne + Inter body.
+  * Dev / terminal: JetBrains Mono, IBM Plex Mono, Fira Code + Inter body.
+- SECTIONS: always produce 10–14 sections even for vague inputs.
+IF THE USER'S INPUT IS ALREADY IN THIS STRUCTURE: leave it essentially as-is — only fix obvious structural gaps or missing fields. Do not rewrite already-good prompts.
+IF THE USER'S INPUT IS HIGHLY DETAILED (≥ 600 words, has labeled sections, exact text strings, specific numbers, named UI components, unusual visual moves): treat the user's content as authoritative. Restructure into the 6-block flowing-prose format BUT keep every element the user mentioned — every exact text string, every number, every hex code, every named visual element, every nested layout layer. Your output will be longer than typical (1,500–2,200 words is fine). Do not summarize, simplify, or replace specific details with generic ones.
+Normalize the user's input now. Output only the normalized prompt."""
+# System prompt for the dashboard / application-UI prompt normalizer.
+# Presumably chosen over the website normalizer prompt when
+# is_dashboard_prompt() returns True — the selection code is outside this view.
+# NOTE(review): the text refers to "the library above" (SMART DEFAULTS, COLORS)
+# and to a "6-block flowing-prose format", but this prompt contains neither a
+# register library nor a STRUCTURE section that defines the six blocks.
+# NOTE(review): the CLOSING RULE mandates the exact final sentence
+# "... never inline SVG - never hidden body overflow." while both example
+# closing rules end with "... never inline SVG." — confirm which is intended.
+DASHBOARD_NORMALIZER_SYSTEM_PROMPT = """You are a prompt normalizer for a dashboard / application-UI HTML generation pipeline. A user gives you any kind of dashboard request — a one-liner, a partial brief, or a fully-detailed spec — and you output a single flowing paragraph in a precise structure that the downstream model expects.
+VERY IMPORTANT:
+CARDS MUST BE VISIBLY DIFFERENT FROM THE BACKGROUND. Never give a card the same color as the page background — cards have to stand off from the surface they sit on. Always specify which text color goes on which surface; if the layout has a dark card on a light page (or any inverted surface), explicitly state the text colors for both the light and the dark surface so text never disappears into its background. Build structure through contrast, borders, and spacing together — not through any one alone. Make it easy to see what's a card, what's a button, what's text, and where one section ends and the next begins.
+HARD LIMITS (apply before output — these are explicit negative rules; ignoring them produces plain or broken downstream output):
+NO COORDINATION REQUIREMENTS BETWEEN REGIONS. Do not say "matching the map's height exactly" or "aligned to the table below" or "this card's width syncs with the chart above." Each region's dimensions are its own; the model handles alignment from grid structure alone.
+DENSITY SELF-CHECK (before output): mentally count atomic specs in your output. If above 50, identify the bottom 20% by importance and cut them. Common safe cuts: pixel ranges that could be col-spans, secondary animation behaviors layered on the same elements, sub-elements inside cards that don't change the page's character if removed, multi-stat hero cards padded with 4-6 numbers, redundant restatements of the register (state it once at the top, don't repeat the rules in every section).
+Thoroughness past a threshold becomes noise. The prompt's job is to set the register and the structural skeleton — not to resolve every visual decision in advance. The downstream model has its own taste at the pixel level; let it use it.
+OUTPUT RULES (non-negotiable):
+- Output ONLY the normalized prompt. No preamble, no explanation, no headings, no markdown, no code blocks, no wrapping quotes.
+- Single flowing paragraph. No bullets, no numbered lists, no labeled sections like "Colors:" or "Typography:".
+- No mention of real brands you weren't given (Linear, Vercel, Notion, Salesforce, Stripe, Mercury, Monzo, Revolut, etc.) — steal patterns, never invent brand names. BUT if the user explicitly names their own product/brand (e.g., "Ardentis", "WindFarm", "Tavolo"), preserve that exact name.
+- Length adapts to input. Short/vague input → 500–900 words. Detailed input (≥ 600 words) → match the user's level of specificity, up to 2,000+ words. NEVER compress detailed input to fit a length budget.
+- Prefer design-direction prose over micro-spec. Set the register, not the pixel values — the downstream model has its own taste at the pixel level. Do NOT specify exact font-sizes, exact opacity percentages, or exact pixel paddings unless the user supplied them.
+CLOSING RULE — ALWAYS end the output with this exact sentence: "Icons via Font Awesome only — never inline SVG - never hidden body overflow." This is mandatory on every prompt. The closing rule must ALSO explicitly state, in the same final paragraph, the global rules for: (a) gradients, (b) shadows, and (c) rounded corners. Don't leave any of those three categories unspecified. Examples of valid closing rules: "Pure flat — no gradients, no shadows, no rounded corners except where explicitly noted on [list]. Icons via Font Awesome only — never inline SVG." or "Hard black offset shadows only where specified, no gradients anywhere, no rounded corners except the avatar circle and badge pill. Icons via Font Awesome only — never inline SVG."
+PRESERVE-DETAIL RULES (critical for detailed inputs):
+- The conversion is STRUCTURAL (labeled sections / bullets / lists → flowing prose), NOT LOSSY. Every concrete detail the user provided survives the rewrite.
+- Preserve VERBATIM:
+  * Exact text strings to render in the UI ("Yield Overview", "Total Collateral", "Booking Requests", any quoted copy or label)
+  * Brand and product names exactly as written
+  * Specific numbers (KPI values like $67659.99, counts like "ACTIVE 18", percentages, IDs like "944905011UZ")
+  * All hex codes the user supplied — do not substitute "near-white" for their #f2f1ec
+  * Distinctive widget elements (radial gauges with thresholds, kanban column counts, map overlay popups, multi-panel arrangements, status pill counts)
+- If the user describes a region in 80 words of detail, your prose version keeps every detail and is roughly the same length. Don't summarize.
+- If the user describes a nested element ("inside the sidebar, a nav stack, each nav item with icon + label + count"), preserve all nesting layers in your prose.
+VISUAL REGISTER (when the user gave none):
+If the user supplied widgets and workflow but no aesthetic, commit to ONE coherent visual register before writing the structural walkthrough. Neutral is failure. Invent one that fits the operator's context, and describe it in design-direction prose — not in pixel values, exact opacities, or exact font-sizes. The downstream model handles those decisions from its own training; your job is to set the register and let the model's taste fill in the pixels.
+ Avoid the lazy fallback (dark navy + electric blue + Inter + 8px-corner cards) unless that genuinely is your imagined register — that's "Linear utilitarian" and only fits when the operator's context calls for it.
+SMART DEFAULTS (use when the user didn't specify):
+- COLORS: Always produce 5–7 colors — base, text, muted copy, surface/card, border, primary accent, plus mandatory state palette of success / warning / danger. Pick a palette that signals the chosen register from the library above. Avoid the lazy default (dark navy + electric blue) unless the register is genuinely Linear utilitarian.
+- OPERATOR / WORKFLOW: infer from dashboard type. Yield / DeFi → user managing positions. Fleet ops → dispatcher coordinating drivers. CRM inbox → support agent triaging. Analytics → marketing or product lead reviewing. Industrial → field operator or control-room engineer. Project → PM running sprints.
+- FLOURISH: live ticker on primary KPI, pulse on state-change indicator, hover detail tooltip on chart, smooth kanban-card drag, breathing online dot, count-up animation on header counters.
+- TYPOGRAPHY: pick the pairing that matches the chosen register — Fraunces + Inter for soft modern fintech, Outfit + Inter for neo-banking calm, Tiempos + Inter for editorial broadsheet, Geist + Inter for glass dark aurora, JetBrains Mono alone for brutalist trading terminal, Söhne-likes Condensed + Inter for swiss minimal grid, Inter + Geist Mono for Linear utilitarian, Fraunces + Inter + JetBrains Mono for architectural drafting.
+- REGIONS: always produce 8–14 distinct widget regions for a full dashboard surface, fewer for focused workspaces.
+IF THE USER'S INPUT IS ALREADY IN THIS STRUCTURE: leave it essentially as-is — only fix obvious structural gaps or missing fields.
+IF THE USER'S INPUT IS HIGHLY DETAILED (≥ 600 words, has labeled regions, exact text strings, specific numbers, named widgets, unusual visual moves): treat the user's content as authoritative. Restructure into the 6-block flowing-prose format BUT keep every element the user mentioned — every exact label, every number, every hex code, every named widget, every nested layout layer. Your output will be longer than typical (1,500–2,200 words is fine). Do not summarize.
+Normalize the user's input now. Output only the normalized prompt."""
+# Keywords that indicate a dashboard / app-UI prompt — case-insensitive substring match.
+# is_dashboard_prompt() lowercases the prompt and tests each entry with `in`, so:
+#   - every entry must be written in lowercase, and
+#   - an entry also matches inside longer words or incidental mentions
+#     (e.g. a prompt that merely lists "Dashboard" as one of its services).
+DASHBOARD_KEYWORDS = (
+    "dashboard",
+    "admin panel",
+    "control panel",
+    "control room",
+    "command center",
+    "console",
+    "workspace",
+    "monitoring",
+    "monitor surface",
+    "ops surface",
+    "ops console",
+    "kanban",
+    "inbox view",
+    "inbox workspace",
+    "data table",
+    "analytics overview",
+    "bi report",
+    "bi reporting",
+    "trading terminal",
+    "back office",
+    "internal tool",
+)
+def is_dashboard_prompt(prompt: str) -> bool:
+    """Return True if the user's prompt looks like a dashboard / app-UI request."""
+    p = prompt.lower()
+    return any(kw in p for kw in DASHBOARD_KEYWORDS)
+# Use Highcharts (SVG mode) for charts when requested.
+# 20 test prompts, numbered 1–20 below.
+PROMPTS = [
+# 1
+"""Landing page for a habit-tracking mobile app called Streak. The audience is people in their 20s and 30s who have tried other habit apps and quit. The page needs: a hero section with the app name, a one-line value prop, two CTAs (App Store, Google Play), and a phone mockup placeholder showing the app; a 'why this one is different' section with three points (no streaks-anxiety, friend accountability, science-backed reminders); a section explaining how it works in four steps (pick a habit, set when, get nudged, check in); a testimonials section with three quotes from real users including their name, age, and the habit they built; a comparison table vs the two main competitors; a pricing block (free forever, $4/mo pro with extras); an FAQ with at least six questions covering data privacy, cancellation, family plans, and Apple Health integration; and a footer with newsletter signup, social links, and legal pages.""",
+# 2
+"""Operations dashboard for a mid-size logistics company that runs ~200 trucks across the US. The primary user is a dispatcher who has this open all day. Needs a left sidebar with navigation (Overview, Shipments, Fleet, Drivers, Customers, Reports, Settings); a top bar with search, notifications bell, and a user avatar dropdown. The main area should have four KPI cards across the top (active shipments, on-time delivery rate, fleet utilization, fuel cost this week — each with a trend arrow and sparkline). Below that, a large interactive map placeholder showing truck positions, with a panel on the right listing the active shipments visible on the map. Below the map, a recent deliveries table (columns: shipment ID, origin, destination, driver, status, ETA, value), with status pills, sortable headers, and a filter bar. To the right of the table, a stacked alerts panel (delays, breakdowns, customer complaints), each alert clickable. The dashboard should feel dense but not cluttered.""",
+# 3
+"""Marketing site for an AI legal assistant called Counsel.ai aimed at small law firms (2–20 attorneys). The pitch is: it drafts contracts, summarizes case law, and handles client intake conversations. Need a hero with the headline, sub-headline mentioning ABA-aligned, a CTA to book a demo and a secondary 'see it in action' that scrolls to a video. Below the hero: a logo strip of the law schools and firms already using it. Then a 'what it does' section with three big cards (contract drafting, case law research, intake automation), each with a short description and a screenshot placeholder. Then a live chat preview component showing a sample intake conversation between the AI and a potential client. Then an integrations section showing logos of Clio, MyCase, LexisNexis, Westlaw, Dropbox, and Gmail. A security and compliance section is critical — call out SOC2 Type II, attorney-client privilege handling, encryption at rest and in transit, and US data residency, presented as four trust badges with short explanations. End with a testimonials section featuring three managing partners by name and firm, a pricing block (Starter, Growth, Firm tiers), and two CTAs in the footer (book demo, start free trial).""",
+# 4
+"""E-commerce product detail page for a premium pair of trail running shoes called the Ridge Pro 2. The page needs: a breadcrumb at the top (Home > Men > Running > Ridge Pro 2); a two-column layout with a vertical thumbnail strip and a main image gallery on the left (six placeholder images, with one being a 360-degree view), and the product info on the right. The right column needs the product title, a sub-line ('Built for technical descents'), a star rating with review count, the price ($185) with a 'free shipping over $75' note, a color picker with five swatches, a size picker with US sizes 7–13 including half sizes (some marked sold out), a quantity stepper, an 'Add to cart' button, and an 'Add to wishlist' icon button. Below the right column, a tabs section: Description (with a paragraph and bullet specs — weight, drop, stack height, lug depth, materials), Reviews (showing average rating, a 5-star bar breakdown, three sample reviews with reviewer name and verified-buyer badge), and Shipping & Returns. Below the two columns, a 'Pairs well with' carousel of four related products. On scroll, the add-to-cart bar should stick to the bottom of the viewport with the product name, selected variants, and a buy button.""",
+# 5
+"""Onboarding flow page — step 3 of 5 — for a project management app called Mosaic. The user has just signed up and is being walked through setup. Step 3 is 'Invite your team.' The page should have a top progress indicator showing 5 steps with the third one active and the first two checked. The center of the page has a heading ('Bring your team in'), a sub-heading ('Mosaic works best when everyone is on it'), and a form area: three rows of inputs by default (each row: email field + role dropdown with options Admin/Member/Viewer), an 'Add another' link, an alternative 'Invite by link' section with a copyable URL and a 'reset link' option, and a section to bulk-paste emails. Below the form, two buttons: a primary 'Send invites and continue' and a secondary 'Skip for now.' On the right side of the page, a small contextual card: 'Why invite now? Teams that invite within the first day are 4x more likely to stick with Mosaic' with a small illustration placeholder. Footer should have a 'Need help?' link.""",
+# 6
+"""Pricing calculator page for a cloud hosting provider called Stratus. The audience is engineering leads picking a vendor. The page should let them estimate monthly cost based on their usage. Layout: a title and short intro at the top, then a calculator card taking the center of the page with controls for compute (a slider for vCPUs from 1 to 64 and a slider for RAM from 1 to 256 GB), storage (a slider for SSD GB from 10 to 5000), bandwidth (a slider for monthly TB from 0.1 to 50), and a region selector with five options (US-East, US-West, EU-West, AP-South, AP-East) shown as pill buttons. As the user moves sliders, a price summary panel on the right of the calculator updates live: a big monthly total, a breakdown by line item, and an annual cost with a 'save 20% with annual' note. Below the calculator, a 'How we compare' section showing the same configuration's price on the three biggest competitors (AWS, GCP, DigitalOcean) — clearly labeled as estimates. Below that, an FAQ about how billing works, overage charges, what counts as bandwidth, and free tier. End with a CTA to start a free trial and a 'talk to sales' option for enterprise.""",
+# 7
+"""Status page for a developer API service called Pulse. The audience is engineers who integrate Pulse and need to know if it's up. Top of the page: a big banner showing overall system status — green if everything is operational, with the message 'All systems operational' and a last-updated timestamp. Below the banner, a list of individual services (API, Dashboard, Webhooks, SDKs, Documentation, Authentication), each as a row with the service name on the left, a status pill on the right (operational, degraded, partial outage, major outage), and a 90-day uptime bar showing daily status as colored segments. Each row should be expandable to show recent metrics (latency p50/p95/p99, error rate). Below the service list, a 'Past incidents' section grouped by date, showing incident title, status (resolved, investigating, identified, monitoring), affected components, and a timeline of updates. At the very top right, a 'Subscribe to updates' button that opens a modal with options for email, webhook, RSS, and Slack. Header should have the company logo and links to status history, API docs, and main site.""",
+# 8
+"""Case study page for a B2B design agency called North Field that just delivered a rebrand for a fintech client called Halcyon. The audience is potential clients evaluating the agency. The page needs: a hero with the client logo placeholder, the project title ('Rebranding Halcyon for the next stage'), a one-line summary, and three big stat numbers across the bottom (e.g., 47% lift in signups, 3 months delivery, 12 deliverables). Below the hero, a project meta strip (industry, timeline, services delivered, team size). Then a 'The challenge' section with two paragraphs and a pull quote from the client's CEO. Then a 'Our approach' section broken into three phases (discovery, design, rollout) each with a short paragraph and an image placeholder. Then a 'Results' section with three stat cards expanded into context (what the number means, how it was measured). Then a full-width testimonial quote from the CEO with their photo, name, and title. Then a deliverables gallery showing 6 placeholder images of work artifacts. End with a 'Ready to start your project?' CTA card and three thumbnails of related case studies.""",
+# 9
+"""Settings page for a developer tools app called Forge. The user opening this is an engineering manager configuring their team's workspace. Layout: a left sub-nav listing settings sections (Profile, Workspace, API Keys, Webhooks, Team Members, Integrations, Billing, Danger Zone). The active section is API Keys. The main panel shows a heading and a short description of what API keys are. Below that, a 'Generate new key' button. Below that, a table of existing keys with columns: name, scope, last used, created, and a row actions menu (rename, revoke). Below the API keys table, the next section visible on scroll should be Webhooks — a list of registered webhook endpoints, each showing URL, events subscribed to, last delivery status, and a toggle to enable/disable. Below that, Team Members — a table with member avatars, names, emails, roles (with editable dropdowns), and a remove action. Below that, a Billing summary card showing current plan, next invoice amount, and 'manage billing' button. At the very bottom, a Danger Zone section in a clearly distinct visual treatment, with two destructive actions: Transfer Ownership and Delete Workspace. The page should feel like a serious admin surface.""",
+# 10
+"""Inbox view for a customer support platform called Mailroom. The user is a support agent working through tickets. Three-panel layout. Left panel (narrow): conversation list with filter tabs at the top (All, Unassigned, Mine, Mentions), a search bar, and below that a list of conversations — each row showing customer avatar, name, subject preview, last message preview, time, and unread indicator. Middle panel (widest): the active conversation. Top of the panel: customer name, channel (email, chat, etc.), and action buttons (assign, snooze, close, more). Below that, the message thread — alternating customer and agent messages with timestamps, the agent's messages on the right. At the bottom of the middle panel, a reply composer with formatting toolbar, attachment, and a send button with a dropdown to send-and-close. Right panel (narrow): customer details — avatar, name, email, plan, signup date, lifetime value, last active. Below that, a 'past conversations' list (last 5), then 'internal notes' (a small section where teammates can leave notes about this customer), then a 'related articles' suggestions list (auto-suggested help docs based on the conversation content).""",
+# 11
+"""Homepage for a Series A fintech startup called Stack that helps freelancers handle quarterly taxes. Audience: US-based freelancers earning $40k–$200k who currently use a CPA or do nothing. Hero: clear value prop ('Quarterly taxes, handled'), sub-line, two CTAs ('Estimate my taxes' and 'How it works'), and a hero visual of a phone showing the app's tax estimate screen. Below the hero, a logo bar with publications that have covered them (TechCrunch, NYT, Wired, etc.). Then a 'How it works' section with three steps (connect your accounts, we calculate every quarter, file with one tap), each with an icon and short description. Then a tax savings calculator widget where users input their annual freelance income and state, and it shows estimated savings and time saved vs. doing it themselves. Then a testimonials section with three quotes from named freelancers (designer, developer, copywriter) including profession and city. Then a section addressing the common objection 'why not just use a CPA?' with a side-by-side comparison. Then a security and trust section (bank-level encryption, SOC2, never sells data). Then pricing — flat $25/month, no upsells. End with an FAQ (six questions covering states supported, what if I'm late, integration with QuickBooks, refunds, multi-state freelancers, accuracy guarantee) and a footer with newsletter signup.""",
+# 12
+"""Comparison page on a SaaS marketing site: 'Linear vs Jira.' The audience is a team currently on Jira considering switching. Hero: a clear headline ('A modern alternative to Jira'), a one-line sub, and two CTAs (start free, book a migration call). Below the hero, a feature-by-feature comparison table with about 20 rows grouped into sections (Speed & UX, Workflow, Integrations, Pricing, Support), with check/x marks and short clarifying notes per cell. Below the table, three side-by-side highlight cards covering the biggest differences (10x faster UI, opinionated workflow, transparent pricing). Below that, a pricing comparison block with the equivalent plan from each side by side, showing per-seat cost. Below that, a customer story section: a quote from someone who migrated, with their photo, name, title, and company, plus three stat numbers from their experience (e.g., '4 hours of meetings saved per week', '70% adoption in week 1', etc.). Below that, a 'Migration is easy' section with a three-step process (export from Jira, run our migrator, go live in a day) and a CTA to talk to the migration team. End with FAQ covering data import, custom fields, permissions parity, and pricing edge cases.""",
+# 13
+"""Profile page on a freelance marketplace called Make. The profile belongs to a senior brand designer. The page is what a hiring client sees when they land on this designer's page. Top: a wide cover photo placeholder, with the avatar overlapping the bottom edge. Below, the name, headline ('Brand designer for early-stage SaaS'), location, hourly rate, response time, availability badge, and two prominent CTAs ('Hire' and 'Message'). Below that, a row of trust signals (top-rated badge, identity verified, total earnings, 5-year tenure on the platform). Then a section with the designer's bio (two paragraphs). Then a skills section with tag chips (brand strategy, logo design, visual identity, design systems, Figma, illustration). Then a portfolio grid of nine project thumbnails with title and category overlay on hover. Then a services offered list — three packaged services with title, price starting at, delivery time, and a 'select' button. Then a reviews section: average rating, total reviews, a star breakdown, and four sample reviews with client name, project title, rating, and quote. Then a 'work history' section listing past completed projects with client and date. The Hire CTA should stick to the right side of the viewport on scroll.""",
+# 14
+"""Search results page for a job board focused on remote engineering roles, called Async. The user has just searched 'senior backend engineer.' The page should have a top filter bar with the search query and quick filters (Remote-only, Full-time, Salary > $X, Posted in last 7 days). Left sidebar: more detailed filters — location/timezone (with a multi-select for regions like Americas-only, Europe-only, Anywhere), salary range slider, role seniority checkboxes, company size, tech stack tags (Go, Python, Rust, TypeScript, Postgres, AWS, etc.), and equity/benefits toggles. Main area: at the top, results count and a sort dropdown (relevance, newest, highest salary). Below, a list of job result cards — each showing company logo, role title, company name, salary range, location/timezone, posted date, a brief role description, three tech stack tags, and a 'Save' bookmark icon. About 10 results visible, with pagination at the bottom. To the right of the result list, a sticky 'Save this search' card prompting the user to get email alerts for new matches. Below the result list, a 'Companies hiring now' section with logos linking to each company's full job list.""",
+# 15
+"""Documentation homepage for an open-source ML library called Tensorgrove. Audience: ML engineers and researchers evaluating or starting with the library. The page should feel technical, fast, and not over-designed. Top header with library name, version selector (showing current stable version), and links (Docs, API, Tutorials, Blog, GitHub stars badge). Hero: short tagline ('Composable tensor operations for research'), a one-line description, and two prominent buttons ('Quickstart' and 'API reference'). Below the hero, a quickstart code block showing a 10-line install + first-example snippet with syntax highlighting and a copy button. Below that, a four-tile grid linking to main doc sections: Getting Started, Core Concepts, Tutorials, API Reference — each with an icon, short description, and arrow. Below that, a 'What you can build' section with three example projects (an image classifier, a transformer from scratch, an RL agent), each with a thumbnail placeholder, brief description, and a 'see code' link. Below that, a 'Used by' section showing logos of universities and labs. Below that, a community card linking to Discord, GitHub Discussions, and the bi-weekly office hours, with member count. End with a footer that has docs links, GitHub, license info, and a 'star us on GitHub' button.""",
+# 16
+"""Booking flow page for a hair salon's website. This is step 2 of 4: pick your stylist and time. Top of the page: a 4-step progress indicator (Service > Stylist & Time > Your Info > Confirm), with step 2 active. Below the progress indicator, a summary strip showing what was selected in step 1 (service: 'Cut & Color', estimated duration: 2h 15m, price range: $180–$240), with a small 'edit' link. Main content split into two columns. Left column (wider): the stylist + time picker. At the top, four stylist cards in a row, each showing the stylist's photo, name, specialty tag, average rating, and price tier indicator. One can be clicked to filter the calendar below. Below the stylist cards, a date picker showing the next 14 days as a horizontal scrollable strip, with each day showing day name, date, and an availability dot. Below the date strip, a time slot grid for the selected day — slots in 15-minute increments from 9am to 7pm, with available slots clickable and unavailable greyed out. Right column (narrower): a sticky booking summary card showing service, selected stylist (or 'Any available'), selected date and time (or 'Pick a time'), price range, and a 'Continue to your info' button (disabled until a slot is chosen). Below the summary card, a 'Cancellation policy' note. Footer with help link and salon contact info.""",
+# 17
+"""Help center landing page for a consumer finance app called Pocket. Audience: existing users with questions, ranging from non-technical to power users. Top: a centered hero with the help center title, a short reassurance line ('We're here to help — find an answer fast or talk to us'), and a large search bar with placeholder text ('Search articles, e.g., "transfer limits"'). Below the hero, a 'Popular topics' section as a 6-tile grid, each tile with an icon, a topic title (Account & login, Sending money, Cards, Limits & verification, Security, Billing & fees), and the article count in that category. Below that, a 'Top articles' list — 8 article links with titles and a 1-line snippet. Below that, a 'Still need help?' section with three contact options as side-by-side cards: chat with support (with current wait time), email (with response time SLA), and schedule a call (with available slots indicator). Each option should make clear what the user gets. Below that, a status indicator section linking to the public status page if there's any ongoing issue. Footer with links to community forum, security page, accessibility statement, and changelog.""",
+# 18
+"""Changelog page for a productivity app called Nimbus. Audience: existing users who want to see what's new and developers integrating with the API. Top: page title, short intro, and three controls — a category filter (All, New, Improved, Fixed, Security, API), a search bar, and subscribe options (RSS, email, webhook). Below the controls, the changelog feed grouped by month, with the current month at the top. Each month is a section with a month header. Within each month, individual entries listed reverse-chronologically. Each entry has: a date, a version tag (v3.4.1 style) when applicable, a category pill, a heading, a short paragraph (2–4 sentences) describing the change, optionally an inline screenshot or short video placeholder, and a 'See in app' deep-link button when relevant. Some entries are big releases with longer write-ups including bullet sub-changes and a 'Read full post' link. Sidebar on the right: a 'jump to month' navigation, a 'biggest changes this quarter' highlight box with three pinned entries, and a card promoting the public roadmap. The page should feel like a real product log, not marketing fluff.""",
+# 19
+"""Real estate listing detail page for a 3-bedroom house in Brooklyn. Audience: a buyer or renter browsing on desktop. Top: a breadcrumb (Home > Brooklyn > Park Slope > 312 7th Ave). Below, a full-width photo carousel placeholder showing the listing image, with arrows, image counter (1/24), and a 'view all photos' button that opens a gallery, plus tabs to switch between Photos / Floor Plan / Street View / Video Tour. Below the carousel, a two-column layout. Left column (wider): the listing details. Address, price (with 'price reduced' badge if applicable), price-per-sqft, beds, baths, square footage, and a short description paragraph. Then a 'Highlights' bullet list (renovated kitchen, private backyard, in-unit laundry, central AC, etc.). Then a 'Property details' table (year built, lot size, parking, HOA, taxes, MLS number). Then a 'Neighborhood' section with a small map placeholder, walk/transit/bike scores, and nearby points of interest (schools with ratings, parks, subway lines). Then a floor plan image with a square-footage breakdown. Then a 'Price history' table showing prior list events with date, event, and price. Right column (narrower, sticky): an agent contact card with the agent's photo, name, brokerage, phone, and a contact form (name, email, phone, message, 'I'd like to tour') with three CTA buttons ('Schedule a tour', 'Request info', 'Make an offer'). Below the agent card, a mortgage calculator widget (down payment, loan term, interest rate inputs, estimated monthly payment output). Below the two columns, a 'Similar homes nearby' carousel of 6 listing cards.""",
+# 20
+"""Analytics overview dashboard for an e-commerce store owner running a Shopify-style storefront. Audience: a small business owner checking performance daily. Top bar: store name on the left, a global date-range picker in the center (with presets: Today, Yesterday, Last 7 days, Last 30 days, This month, Custom), and a 'Compare to' toggle (vs previous period) on the right. Below, four KPI cards across a row showing: Total revenue (with delta vs previous period), Orders, Conversion rate, Average order value — each with a sparkline of the chosen period. Below that, the main revenue chart — a large line chart with revenue over time (toggleable to gross sales, net sales, refunds), with a secondary axis option for orders. Below the revenue chart, a three-column row: a 'Top products' table (product name with thumbnail, units sold, revenue, % of total) showing the top 5 with a 'View all' link; a 'Traffic sources' donut chart with a legend (direct, organic, social, paid, email, referral) and percentages; and a 'Conversion funnel' visualization showing visitors → product views → add to cart → checkout started → purchase, with drop-off rates between each step. Below those three, a 'Recent orders' table (order number, customer name, items count, total, status, date), and to the right of that, an 'Inventory alerts' panel showing products with low stock. The whole dashboard should feel scannable in 30 seconds but rewarding when you dig in."""
+]
+# ──────────────────────────────────────────────────────────────────────────────
+# Prompt normalizer
+# ──────────────────────────────────────────────────────────────────────────────
+_anthropic_client = None
+def _get_anthropic_client():
+    global _anthropic_client
+    if _anthropic_client is not None:
+        return _anthropic_client
+    if not _ANTHROPIC_AVAILABLE:
+        raise RuntimeError("anthropic SDK not installed. Run: pip install anthropic")
+    api_key = os.environ.get("ANTHROPIC_API_KEY")
+    if not api_key:
+        raise RuntimeError("ANTHROPIC_API_KEY environment variable not set")
+    _anthropic_client = Anthropic(api_key=api_key)
+    return _anthropic_client
+def normalize_prompt(raw_prompt: str) -> str:
+    """Normalize any user prompt into the structured format using the API.
+    Routes to the dashboard normalizer if the prompt looks dashboard-shaped,
+    otherwise uses the landing-page normalizer.
+    Falls back to the raw prompt if normalization fails."""
+    if not NORMALIZE_PROMPTS:
+        return raw_prompt
+    is_dashboard = is_dashboard_prompt(raw_prompt)
+    system_prompt = DASHBOARD_NORMALIZER_SYSTEM_PROMPT if is_dashboard else NORMALIZER_SYSTEM_PROMPT
+    print(f"[normalize_prompt] route → {'DASHBOARD' if is_dashboard else 'LANDING-PAGE'} normalizer")
+    try:
+        client = _get_anthropic_client()
+        response = client.messages.create(
+            model=NORMALIZER_MODEL,
+            max_tokens=NORMALIZER_MAX_TOKENS,
+            temperature=NORMALIZER_TEMPERATURE,
+            system=system_prompt,
+            messages=[{"role": "user", "content": raw_prompt}],
+        )
+        normalized = response.content[0].text.strip()
+        # Strip any wrapping quotes the model might add
+        if normalized.startswith('"""') and normalized.endswith('"""'):
+            normalized = normalized[3:-3].strip()
+        elif normalized.startswith('"') and normalized.endswith('"'):
+            normalized = normalized[1:-1].strip()
+        elif normalized.startswith("'") and normalized.endswith("'"):
+            normalized = normalized[1:-1].strip()
+        return normalized
+    except Exception as e:
+        print(f"[normalize_prompt] WARNING: falling back to raw prompt — {e}")
+        return raw_prompt
+# ──────────────────────────────────────────────────────────────────────────────
+# Model loading
+# ──────────────────────────────────────────────────────────────────────────────
+# Set USE_UNSLOTH = True to load with Unsloth's FastLanguageModel (custom Triton
+# kernels for Qwen2 inference, ~1.5–2x faster than vanilla transformers).
+# Set False to fall back to plain HuggingFace AutoModelForCausalLM.
+# NOTE(review): load_model_and_tokenizer() as written here always goes through
+# AutoModelForCausalLM and never reads this flag — confirm it is consumed elsewhere.
+USE_UNSLOTH = True
+def load_model_and_tokenizer():
+    print(f"Loading merged model from: {MODEL_PATH}")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_PATH,
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True,
+        low_cpu_mem_usage=True,
+    ).to("cuda:0")
+    model.eval()
+    print(f"Attention implementation: {getattr(model.config, '_attn_implementation', 'unknown')}")
+    tokenizer.eos_token = "<|im_end|>"
+    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.pad_token_id = tokenizer.eos_token_id
+    print("Model loaded successfully")
+    return model, tokenizer
+# ──────────────────────────────────────────────────────────────────────────────
+# Generation
+# ──────────────────────────────────────────────────────────────────────────────
+# Extra instruction text about <AI-IMAGE> usage and per-section id attributes.
+# NOTE(review): generate_html() in this section builds its messages from
+# SYSTEM_PROMPT and the prompt text only and does not reference this constant —
+# confirm where (or whether) it is appended.
+IMAGE_GUARD = """IMPORTANT: Use <AI-IMAGE class="..." src="descriptive prompt" /> for every image — no external URLs, no src="https://...". Give every major section and card a unique id attribute."""
+# with image guard is fine rank 1 for now 0.03 B
+def generate_html(model, tokenizer, prompt_text):
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user",   "content": prompt_text},
+    ]
+    input_text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+    inputs = tokenizer(
+        input_text,
+        return_tensors="pt",
+        add_special_tokens=False,
+    ).to("cuda")
+    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=MAX_NEW_TOKENS,
+            temperature=TEMPERATURE,
+            top_p=TOP_P,
+            do_sample=DO_SAMPLE,
+            eos_token_id=tokenizer.eos_token_id,
+            pad_token_id=tokenizer.pad_token_id,
+            streamer=streamer,
+        )
+    generated_text = tokenizer.decode(
+        outputs[0][inputs.input_ids.shape[1]:],
+        skip_special_tokens=True,
+    )
+    return post_process(generated_text.strip())
+def post_process(html: str) -> str:
+    """Strip known contamination patterns that slip past the SYSTEM_PROMPT guard."""
+    # 0. Detect degenerate SVG loop (same coordinate pair repeated >50 times)
+    #    This consumes the entire token budget and produces a broken output.
+    loop_match = _re.search(r'((?:M|L|C)\d+\.?\d* \d+\.?\d*[ ,]?){50,}', html)
+    if loop_match:
+        print("[post_process] WARNING: Degenerate SVG path loop detected — stripping SVG element")
+        html = _re.sub(r'<svg[^>]*>.*?</svg>', '', html, flags=_re.DOTALL | _re.IGNORECASE)
+    # 1. Remove @font-face blocks that reference external (non-local) URLs
+    html = _re.sub(
+        r'@font-face\s*\{[^}]*src\s*:\s*url\(["\']?https?://[^"\')\s]+["\']?\)[^}]*\}',
+        '',
+        html,
+        flags=_re.DOTALL | _re.IGNORECASE,
+    )
+    # 2. Replace storage.googleapis.com avatar URLs with pravatar
+    def _fix_avatar(m):
+        import hashlib
+        n = int(hashlib.md5(m.group(0).encode()).hexdigest(), 16) % 70 + 1
+        return f'src="https://i.pravatar.cc/150?img={n}"'
+    html = _re.sub(
+        r'src="https://storage\.googleapis\.com/uxpilot-auth[^"]*"',
+        _fix_avatar,
+        html,
+    )
+    # 3. Replace invented brand domain image/video URLs with AI-IMAGE
+    def _fix_brand_url(m):
+        tag = m.group(0)
+        if 'pravatar' in tag or 'cdn.tailwind' in tag or 'cdnjs' in tag or 'fonts.google' in tag:
+            return tag
+        return '<AI-IMAGE src="placeholder image" />'
+    html = _re.sub(
+        r'<img[^>]+src="https?://(?!i\.pravatar\.cc)[^"]+\.(jpg|jpeg|png|gif|webp|mp4|mov)"[^>]*/?>',
+        _fix_brand_url,
+        html,
+        flags=_re.IGNORECASE,
+    )
+    return html
+# ──────────────────────────────────────────────────────────────────────────────
+# Dynamic batching inference engine
+# ──────────────────────────────────────────────────────────────────────────────
+# Multiple threads (or an async server) call engine.submit(prompt). A single
+# worker thread collects requests into a batch (up to MAX_BATCH, with a
+# WAIT_MS window for the batch to fill) and runs them in ONE model.generate()
+# call. Per-request output is intended to match single-request inference —
+# same model, same tokenizer, same sampling params; the only difference is
+# left-padding so decoder-only batched generation lines up correctly. With
+# sampling enabled, exact equality also depends on the RNG state at call time.
+#
+# The GPU is the serialization point, so one worker thread is correct.
+# Don't add more worker threads — they'll just contend on CUDA and slow down.
+# ──────────────────────────────────────────────────────────────────────────────
+import threading
+import queue as _queue
+import time as _time
+from dataclasses import dataclass, field
+from typing import Optional, List, Tuple
+@dataclass
+class _BatchRequest:
+    """One queued prompt together with the fields used to return its outcome."""
+    # Prompt text exactly as passed to BatchingInferenceEngine.submit().
+    prompt: str
+    # Set by the worker once either `result` or `error` has been filled in;
+    # submit() blocks on it.
+    event: threading.Event = field(default_factory=threading.Event)
+    # Post-processed generation output on success.
+    result: Optional[str] = None
+    # Exception raised while running the batch; re-raised by submit().
+    error: Optional[Exception] = None
+    # Monotonic-clock timestamps: creation, batch start, batch end.
+    submitted_at: float = field(default_factory=_time.monotonic)
+    started_at: Optional[float] = None
+    finished_at: Optional[float] = None
+class BatchingInferenceEngine:
+    """Dynamic batching wrapper around model.generate()."""
+    def __init__(
+        self,
+        model,
+        tokenizer,
+        system_prompt: str,
+        max_batch: int = 4,
+        wait_ms: int = 50,
+        max_new_tokens: int = MAX_NEW_TOKENS,
+        temperature: float = TEMPERATURE,
+        top_p: float = TOP_P,
+        do_sample: bool = DO_SAMPLE,
+    ):
+        self.model = model
+        self.tokenizer = tokenizer
+        self.system_prompt = system_prompt
+        self.max_batch = max_batch
+        self.wait_s = wait_ms / 1000.0
+        self.max_new_tokens = max_new_tokens
+        self.temperature = temperature
+        self.top_p = top_p
+        self.do_sample = do_sample
+        # Critical: left-pad for decoder-only batched generation so every row's
+        # generation starts at the same index (input_len) and EOS logic works.
+        self.tokenizer.padding_side = "left"
+        self._queue: "_queue.Queue[_BatchRequest]" = _queue.Queue()
+        self._running = True
+        self._worker = threading.Thread(target=self._worker_loop, daemon=True)
+        self._worker.start()
+    def submit(self, prompt: str, timeout: Optional[float] = None) -> str:
+        """Blocking — returns the generated HTML (post-processed). Raises on failure."""
+        req = _BatchRequest(prompt=prompt)
+        self._queue.put(req)
+        if not req.event.wait(timeout=timeout):
+            raise TimeoutError("Batch inference timed out")
+        if req.error:
+            raise req.error
+        return req.result
+    def shutdown(self):
+        self._running = False
+    def _worker_loop(self):
+        while self._running:
+            batch = self._collect_batch()
+            if not batch:
+                continue
+            t_start = _time.monotonic()
+            for req in batch:
+                req.started_at = t_start
+            try:
+                self._run_batch(batch)
+            except Exception as e:
+                print(f"[batch_worker] ERROR: {e}")
+                for req in batch:
+                    if not req.event.is_set():
+                        req.error = e
+                        req.event.set()
+    def _collect_batch(self) -> List[_BatchRequest]:
+        # Block for first request (heartbeat timeout to allow clean shutdown).
+        try:
+            first = self._queue.get(timeout=1.0)
+        except _queue.Empty:
+            return []
+        batch = [first]
+        # Drain additional requests inside the wait window.
+        deadline = _time.monotonic() + self.wait_s
+        while len(batch) < self.max_batch:
+            remaining = deadline - _time.monotonic()
+            if remaining <= 0:
+                break
+            try:
+                req = self._queue.get(timeout=remaining)
+                batch.append(req)
+            except _queue.Empty:
+                break
+        return batch
+    def _run_batch(self, batch: List[_BatchRequest]):
+        texts = []
+        for req in batch:
+            messages = [
+                {"role": "system", "content": self.system_prompt},
+                {"role": "user",   "content": req.prompt},
+            ]
+            text = self.tokenizer.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True,
+            )
+            texts.append(text)
+        inputs = self.tokenizer(
+            texts,
+            padding=True,
+            return_tensors="pt",
+            add_special_tokens=False,
+        ).to("cuda")
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=self.max_new_tokens,
+                temperature=self.temperature,
+                top_p=self.top_p,
+                do_sample=self.do_sample,
+                eos_token_id=self.tokenizer.eos_token_id,
+                pad_token_id=self.tokenizer.pad_token_id,
+            )
+        # Left-padded → all rows share the same input_len prefix.
+        input_len = inputs.input_ids.shape[1]
+        t_end = _time.monotonic()
+        bsz = len(batch)
+        wait_times   = [(r.started_at - r.submitted_at) * 1000 for r in batch]
+        gen_time_ms  = (t_end - batch[0].started_at) * 1000
+        print(f"[batch] size={bsz} gen={gen_time_ms:.0f}ms "
+              f"wait=[{min(wait_times):.0f}-{max(wait_times):.0f}ms]")
+        for i, req in enumerate(batch):
+            generated = outputs[i][input_len:]
+            text = self.tokenizer.decode(generated, skip_special_tokens=True)
+            req.result = post_process(text.strip())
+            req.finished_at = t_end
+            req.event.set()
+# ──────────────────────────────────────────────────────────────────────────────
+# Parity & throughput tests
+# ──────────────────────────────────────────────────────────────────────────────
+def run_parity_check(model, tokenizer, test_prompt: str):
+    """Confirm batch-of-1 output matches single-request output exactly."""
+    print("\n[parity] single-request pass...")
+    t0 = _time.monotonic()
+    single = generate_html(model, tokenizer, test_prompt)
+    t1 = _time.monotonic()
+    engine = BatchingInferenceEngine(
+        model, tokenizer, SYSTEM_PROMPT,
+        max_batch=1, wait_ms=10,
+    )
+    print("[parity] batched (batch-of-1) pass...")
+    t2 = _time.monotonic()
+    batched = engine.submit(test_prompt)
+    t3 = _time.monotonic()
+    engine.shutdown()
+    print(f"[parity] single={t1-t0:.1f}s batched-of-1={t3-t2:.1f}s")
+    if single == batched:
+        print("[parity] OK — outputs are bit-identical.")
+        return True
+    else:
+        import difflib
+        diff = list(difflib.unified_diff(
+            single.splitlines(), batched.splitlines(),
+            fromfile="single", tofile="batched", lineterm="", n=2,
+        ))
+        print(f"[parity] MISMATCH — first 30 diff lines:")
+        for line in diff[:30]:
+            print(line)
+        return False
+def run_throughput_test(
+    model, tokenizer, prompts: List[str],
+    max_batch: int = 4, wait_ms: int = 50,
+) -> Tuple[List[str], float]:
+    """Fire all prompts concurrently through one engine; measure wall time."""
+    engine = BatchingInferenceEngine(
+        model, tokenizer, SYSTEM_PROMPT,
+        max_batch=max_batch, wait_ms=wait_ms,
+    )
+    results: List[Optional[str]] = [None] * len(prompts)
+    threads: List[threading.Thread] = []
+    def worker(i, p):
+        try:
+            results[i] = engine.submit(p)
+        except Exception as e:
+            results[i] = f"ERROR: {e}"
+    t0 = _time.monotonic()
+    for i, p in enumerate(prompts):
+        th = threading.Thread(target=worker, args=(i, p))
+        th.start()
+        threads.append(th)
+    for th in threads:
+        th.join()
+    wall = _time.monotonic() - t0
+    print(f"\n[throughput] {len(prompts)} prompts in {wall:.1f}s wall "
+          f"→ {wall/len(prompts):.1f}s per prompt effective "
+          f"(max_batch={max_batch}, wait_ms={wait_ms})")
+    engine.shutdown()
+    return results, wall
+# ──────────────────────────────────────────────────────────────────────────────
+# Main
+# ──────────────────────────────────────────────────────────────────────────────
+def main():
+    model, tokenizer = load_model_and_tokenizer()
+    output_dir = Path(OUTPUT_FOLDER)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    for i, prompt in enumerate(PROMPTS, 1):
+        print(f"\n{'='*80}")
+        print(f"Generating {i}/{len(PROMPTS)}")
+        print(f"Raw prompt: {prompt[:120]}..." if len(prompt) > 120 else f"Raw prompt: {prompt}")
+        # Normalize the prompt via Haiku before inference
+        normalized_prompt = normalize_prompt(prompt)
+        if normalized_prompt != prompt:
+            preview = normalized_prompt[:120] + "..." if len(normalized_prompt) > 120 else normalized_prompt
+            print(f"Normalized: {preview}")
+        print(f"{'='*80}")
+        # Save the normalized prompt alongside the HTML for traceability
+        norm_path = output_dir / f"test_prompt_{i:03d}_normalized.txt"
+        with open(norm_path, "w", encoding="utf-8") as f:
+            f.write(normalized_prompt)
+        try:
+            _t0 = _time.perf_counter()
+            html = generate_html(model, tokenizer, normalized_prompt)
+            _dt = _time.perf_counter() - _t0
+            print(f"[timing] prompt {i}/{len(PROMPTS)} generated in {_dt:.1f}s")
+            path = output_dir / f"test_prompt_{i:03d}.html"
+            with open(path, "w", encoding="utf-8") as f:
+                f.write(html)
+            print(f"Saved -> {path}")
+        except Exception as e:
+            print(f"Error on prompt {i}: {e}")
+            error_path = output_dir / f"prompt_{i:03d}_ERROR.txt"
+            with open(error_path, "w", encoding="utf-8") as f:
+                f.write(f"Error: {str(e)}\nPrompt: {prompt}")
+            print(f"Error saved -> {error_path}")
+    print(f"\nDone. All files -> {output_dir}")
+if __name__ == "__main__":
+    import sys
+    mode = sys.argv[1] if len(sys.argv) > 1 else "main"
+    if mode == "parity":
+        # Verify batched output == single-request output.
+        # Usage: python inference_edited_chat_opt.py parity
+        model, tokenizer = load_model_and_tokenizer()
+        test_prompt = PROMPTS[0] if PROMPTS else "simple landing page for a coffee shop"
+        run_parity_check(model, tokenizer, test_prompt)
+    elif mode == "throughput":
+        # Measure throughput with concurrent submissions.
+        # Usage: python inference_edited_chat_opt.py throughput [max_batch] [wait_ms] [n_prompts]
+        max_batch = int(sys.argv[2]) if len(sys.argv) > 2 else 4
+        wait_ms   = int(sys.argv[3]) if len(sys.argv) > 3 else 50
+        n         = int(sys.argv[4]) if len(sys.argv) > 4 else min(8, len(PROMPTS))
+        model, tokenizer = load_model_and_tokenizer()
+        prompts_to_run = PROMPTS[:n] if len(PROMPTS) >= n else (PROMPTS * ((n // len(PROMPTS)) + 1))[:n]
+        # Normalize upfront so the test measures inference, not normalizer latency.
+        prompts_to_run = [normalize_prompt(p) for p in prompts_to_run]
+        results, wall = run_throughput_test(
+            model, tokenizer, prompts_to_run,
+            max_batch=max_batch, wait_ms=wait_ms,
+        )
+        output_dir = Path(OUTPUT_FOLDER) / f"throughput_b{max_batch}_w{wait_ms}"
+        output_dir.mkdir(parents=True, exist_ok=True)
+        for i, html in enumerate(results, 1):
+            (output_dir / f"batch_{i:03d}.html").write_text(html or "", encoding="utf-8")
+        print(f"[throughput] wrote {len(results)} files -> {output_dir}")
+    else:
+        main()

inference_edited_chat_opt.py CHANGED Viewed

@@ -179,7 +179,7 @@ PRESERVE-DETAIL RULES (critical for detailed inputs):
 - If the user describes a section in 80 words of detail, your prose version of that section keeps every detail and is roughly the same length. Don't summarize.
 - If the user describes a nested element ("inside this container, there's a frame, inside the frame is a window mockup"), preserve all nesting layers in your prose.
-AESTHETIC THINKING (when the user gave none): If the user supplied structure but no aesthetic, imagine the page deeply first as a design director would before writing. Who is the audience? What does this product feel like at its most honest? What is the ONE visual move that would make this page feel inevitable? Use your own taste. Avoid lazy defaults like indigo accent + Inter/DM Sans + flat design.
 STRUCTURE (merge all six blocks into one continuous paragraph):

 - If the user describes a section in 80 words of detail, your prose version of that section keeps every detail and is roughly the same length. Don't summarize.
 - If the user describes a nested element ("inside this container, there's a frame, inside the frame is a window mockup"), preserve all nesting layers in your prose.
+AESTHETIC THINKING (when the user gave none): If the user supplied structure but no aesthetic, imagine the page deeply first as a design director would before writing, produce something good, like an award winning website, something breath taking.
 STRUCTURE (merge all six blocks into one continuous paragraph):

pod_api.py CHANGED Viewed

@@ -1,9 +1,7 @@
 """
-pod_api.py — RunPod-side FastAPI server that delegates generation to local
-trtllm-serve while keeping your existing api.py contract (job pattern,
-Pydantic validation, normalizer routing, auto-save, error handling).
-Architecture in this pod:
     Client  ─POST /v1/jobs──▶  pod_api.py (this file, port 5000)
                                     │
@@ -11,33 +9,15 @@ Architecture in this pod:
                                     ▼
                               ThreadPoolExecutor
                                     │
-                                    │ 1. normalize via Anthropic API
                                     │ 2. POST to trtllm-serve
                                     ▼
-                              trtllm-serve (port 8000, local) ──▶ model on GPU
-Why this layout:
-- Your reliability layer (job pattern, validation, GC, auto-save) stays.
-- TRT-LLM does the actual generation — 2.85× faster than transformers, and
-  ready to add NGram speculative on top via the existing spec_config.yaml.
-- Anthropic-based normalizer + dashboard routing keep working unchanged
-  because we import your existing inference_edited_chat_opt module.
-Setup:
-    pip install fastapi "uvicorn[standard]" pydantic requests anthropic
-    export ANTHROPIC_API_KEY=...
-    # Make sure trtllm-serve is already running on :8000.
-    # Then start this:
     uvicorn pod_api:app --host 0.0.0.0 --port 5000 --workers 1
-Endpoints (same shape as your old api.py):
-    GET  /v1/healthz
-    GET  /v1/readyz
-    POST /v1/jobs                  -> 202 {"job_id": ...}
-    GET  /v1/jobs/{job_id}         -> status + html when done
-    GET  /v1/jobs                  -> list recent jobs
-    POST /v1/generate              -> synchronous variant
 """
 from __future__ import annotations
@@ -52,7 +32,7 @@ from concurrent.futures import ThreadPoolExecutor
 from contextlib import asynccontextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Literal, Optional
 import requests
 from fastapi import FastAPI, HTTPException
@@ -60,11 +40,10 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel, Field, field_validator
-# Make /workspace importable so we can pull SYSTEM_PROMPT, normalizers,
-# is_dashboard_prompt, and post_process from your existing module.
 sys.path.insert(0, "/workspace")
 import inference_edited_chat_opt as inf
 # ──────────────────────────────────────────────────────────────────────────────
 # Config
 # ──────────────────────────────────────────────────────────────────────────────
@@ -85,8 +64,143 @@ logging.basicConfig(
     format="%(asctime)s %(levelname)s %(name)s: %(message)s",
 )
 # ──────────────────────────────────────────────────────────────────────────────
-# Job state (same shape as your old api.py)
 # ──────────────────────────────────────────────────────────────────────────────
 JobStatus = Literal["queued", "running", "done", "error"]
@@ -178,7 +292,6 @@ def _inflight_count() -> int:
 # Generation — call into local trtllm-serve over HTTP
 # ──────────────────────────────────────────────────────────────────────────────
 def _trtllm_generate(prompt_text: str) -> str:
-    """Send a chat-completion request to trtllm-serve and return the HTML."""
     body = {
         "model": TRTLLM_MODEL,
         "messages": [
@@ -211,8 +324,6 @@ def _run_job(job: Job) -> None:
     logger.info("job %s started", job.id)
     try:
-        # Step 1 — normalize via Anthropic (uses your existing normalizers,
-        # routed by is_dashboard_prompt for landing-page vs dashboard).
         try:
             normalized = inf.normalize_prompt(job.raw_prompt)
         except Exception as e:
@@ -225,10 +336,8 @@ def _run_job(job: Job) -> None:
             normalized = job.raw_prompt
         job.normalized_prompt = normalized
-        # Step 2 — generate via trtllm-serve (local HTTP, port 8000)
         raw_html = _trtllm_generate(job.normalized_prompt)
-        # Step 3 — apply your existing post-processing
         html = inf.post_process(raw_html)
         if not html.strip():
             raise RuntimeError("post_process returned empty output")
@@ -240,7 +349,6 @@ def _run_job(job: Job) -> None:
             job.id, time.time() - job.started_at, len(html),
         )
-        # Auto-save to disk so results survive in-memory GC.
         if OUTPUT_DIR is not None:
             try:
                 OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
@@ -290,7 +398,6 @@ def _run_job(job: Job) -> None:
 async def lifespan(_: FastAPI):
     global _executor
-    # Probe trtllm-serve once on startup so we fail fast if it's not running.
     try:
         r = requests.get(f"{TRTLLM_BASE_URL}/v1/models", timeout=10)
         r.raise_for_status()
@@ -321,7 +428,7 @@ async def lifespan(_: FastAPI):
             _executor.shutdown(wait=False, cancel_futures=True)
-app = FastAPI(title="HTML Generation API (TRT-LLM backed)", version="2.0.0", lifespan=lifespan)
 app.add_middleware(
     CORSMiddleware,
@@ -412,7 +519,6 @@ def get_job(job_id: str):
     job = _get_job(job_id)
     if job is not None:
         return job.to_response()
-    # Fall back to disk if the job was GC'd from memory.
     if OUTPUT_DIR is not None:
         html_path = OUTPUT_DIR / f"{job_id}.html"
         meta_path = OUTPUT_DIR / f"{job_id}.json"

 """
+pod_api.py — RunPod-side FastAPI server with structured-output normalizer.
+Architecture:
     Client  ─POST /v1/jobs──▶  pod_api.py (this file, port 5000)
                                     │
                                     ▼
                               ThreadPoolExecutor
                                     │
+                                    │ 1. structured-output normalize via Gemini
                                     │ 2. POST to trtllm-serve
                                     ▼
+                              trtllm-serve (port 8000) ──▶ model on GPU
+Run:
+    pip install fastapi "uvicorn[standard]" pydantic requests google-genai
+    export GEMINI_API_KEY=...
     uvicorn pod_api:app --host 0.0.0.0 --port 5000 --workers 1
 """
 from __future__ import annotations
 from contextlib import asynccontextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
+from typing import Any, List, Literal, Optional
 import requests
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel, Field, field_validator
 sys.path.insert(0, "/workspace")
 import inference_edited_chat_opt as inf
 # ──────────────────────────────────────────────────────────────────────────────
 # Config
 # ──────────────────────────────────────────────────────────────────────────────
     format="%(asctime)s %(levelname)s %(name)s: %(message)s",
 )
+# ──────────────────────────────────────────────────────────────────────────────
+# Structured-output normalizer (Pydantic schema → JSON → assembled prompt)
+# ──────────────────────────────────────────────────────────────────────────────
+# Colour palette of the normalized spec. _assemble() interpolates every field
+# verbatim into a single palette sentence. The Field descriptions are part of
+# the schema passed to the model, so they are runtime text, not comments.
+class _Colors(BaseModel):
+    base_hex: str = Field(..., description="Page background hex like #F4EFE2")
+    text_hex: str = Field(..., description="Primary text hex like #1A1814")
+    muted_hex: str = Field(..., description="Muted secondary text hex")
+    surface_hex: str = Field(..., description="Card/surface background hex")
+    border_hex: str = Field(..., description="Hairline border hex")
+    accent_hex: str = Field(..., description="Single primary accent hex")
+    accent_role: str = Field(..., description="Where accent is used")
+    success_hex: str = Field(..., description="Success state hex")
+    warning_hex: str = Field(..., description="Warning state hex")
+    danger_hex: str = Field(..., description="Danger state hex")
+# Font choices of the normalized spec. mono_family is the only field with a
+# default: an empty string means "no mono family" and makes _assemble() emit
+# its two-families-only sentence.
+class _Typography(BaseModel):
+    display_family: str = Field(..., description="Real Google Font name like Fraunces, Tiempos, Geist. NEVER serif or sans-serif.")
+    display_weight: str = Field(..., description="Weight range like semibold-to-extrabold")
+    body_family: str = Field(..., description="Real Google Font name. NEVER serif or sans-serif.")
+    body_weight: str = Field(..., description="Weight range like regular-to-medium")
+    mono_family: str = Field(default="", description="Optional mono family for tabular only, or empty string")
+# Three free-text style rules that _assemble() joins with commas into the
+# closing sentence of the prompt.
+class _ClosingRules(BaseModel):
+    gradients: str = Field(..., description="Gradient rule")
+    shadows: str = Field(..., description="Shadow rule")
+    corners: str = Field(..., description="Corner radius rule")
+# One page region; _assemble() emits the description, usually behind a
+# connective such as "Then" or "Follow with".
+class _Section(BaseModel):
+    description: str = Field(..., description="One paragraph describing this section's layout, content, specific copy. Use frame-language and named hex colors.")
+# Top-level structured output requested from the normalizer model (used as
+# response_schema) and consumed by _assemble(). The sections list is
+# constrained to between 8 and 14 entries.
+class _NormalizedSpec(BaseModel):
+    opening: str = Field(..., description="Opening clause: Design me a [type] for [context] - audience X, goal Y")
+    register_commitment: str = Field(..., description="One sentence committing to the visual register with hex codes, named fonts, and motifs")
+    distinctive_flourish: str = Field(..., description="One sentence about a single standout interactive or visual behavior")
+    sections: List[_Section] = Field(..., min_length=8, max_length=14, description="8-14 sections in DOM order")
+    colors: _Colors
+    typography: _Typography
+    closing: _ClosingRules
+def _assemble(spec: _NormalizedSpec) -> str:
+    parts = [spec.opening.strip(), spec.register_commitment.strip(), spec.distinctive_flourish.strip()]
+    connectives = ["Start with", "Then", "Flow into", "Follow with", "Then", "Then", "Follow with", "Then", "Follow with", "Then", "Follow with", "Then", "Then", "Close with"]
+    starters = {c.split()[0].lower() for c in connectives + ["close"]}
+    for i, s in enumerate(spec.sections):
+        prefix = connectives[i] if i < len(connectives) else "Then"
+        desc = s.description.strip()
+        first = desc.split(" ", 1)[0].lower() if desc else ""
+        if first in starters or not desc:
+            parts.append(desc)
+        else:
+            parts.append(prefix + " " + (desc[0].lower() + desc[1:] if desc[0].isupper() else desc))
+    c = spec.colors
+    parts.append(
+        "Use " + c.base_hex + " as the base with " + c.text_hex + " primary text, " +
+        c.muted_hex + " muted copy, " + c.surface_hex + " for card surfaces, " +
+        c.border_hex + " for hairlines, and " + c.accent_hex + " as the primary accent for " + c.accent_role + ", " +
+        "with a state palette of " + c.success_hex + " success, " + c.warning_hex + " warning, and " + c.danger_hex + " danger."
+    )
+    t = spec.typography
+    typo = t.display_family + " " + t.display_weight + " for display and headings, paired with " + t.body_family + " " + t.body_weight + " for body"
+    if t.mono_family.strip():
+        typo += ", plus " + t.mono_family + " used only for tabular figures, IDs, and timestamps - two type families plus a single mono used only for tabular contexts."
+    else:
+        typo += " - exactly two type families across the entire page, no third family anywhere."
+    parts.append(typo)
+    cr = spec.closing
+    parts.append(
+        cr.gradients + ", " + cr.shadows + ", " + cr.corners + ". " +
+        "Icons via Font Awesome only - never inline SVG - never hidden body overflow."
+    )
+    return " ".join(parts)
+# Output-format instructions appended to the normalizer system prompt in
+# _normalize_via_gemini().
+# NOTE(review): "Every field is mandatory and non-empty" conflicts with
+# _Typography.mono_family, which defaults to "" and which _assemble() treats
+# as optional — confirm which behaviour is intended.
+SCHEMA_DIRECTIVE = (
+    "\n\nIMPORTANT OUTPUT FORMAT: Output as JSON matching the provided schema. "
+    "Every field is mandatory and non-empty. All hex codes must be valid 6-digit hex like #1A1814 - never named colors. "
+    "Font families must be real Google Fonts (Fraunces, Inter, Geist, Space Grotesk, Tiempos, Recoleta, Outfit, Plus Jakarta Sans, IBM Plex Mono, JetBrains Mono, etc.) - NEVER use the placeholder serif or sans-serif alone. "
+    "Sections array must have between 8 and 14 entries, each describing one DOM-order region with concrete layout, content, and specific copy."
+)
+def _normalize_via_gemini(raw_prompt: str) -> str:
+    if not getattr(inf, "NORMALIZE_PROMPTS", True):
+        return raw_prompt
+    is_dashboard = inf.is_dashboard_prompt(raw_prompt)
+    system_prompt = inf.DASHBOARD_NORMALIZER_SYSTEM_PROMPT if is_dashboard else inf.NORMALIZER_SYSTEM_PROMPT
+    try:
+        from google import genai
+        from google.genai import types
+        client = genai.Client()
+        r = client.models.generate_content(
+            model="gemini-3-flash-preview",
+            contents=raw_prompt,
+            config=types.GenerateContentConfig(
+                system_instruction=system_prompt + SCHEMA_DIRECTIVE,
+                temperature=0.6,
+                max_output_tokens=8192,
+                thinking_config=types.ThinkingConfig(thinking_level="high"),
+                response_mime_type="application/json",
+                response_schema=_NormalizedSpec,
+            ),
+        )
+        spec = getattr(r, "parsed", None)
+        if spec is None:
+            data = json.loads(r.text)
+            spec = _NormalizedSpec.model_validate(data)
+        assembled = _assemble(spec)
+        if not assembled or not assembled.strip():
+            raise RuntimeError("assembled normalized prompt is empty")
+        return assembled
+    except Exception as e:
+        logger.warning("structured normalize failed: %s - falling back to raw prompt", e)
+        return raw_prompt
+inf.normalize_prompt = _normalize_via_gemini
 # ──────────────────────────────────────────────────────────────────────────────
+# Job state
 # ──────────────────────────────────────────────────────────────────────────────
 JobStatus = Literal["queued", "running", "done", "error"]
 # Generation — call into local trtllm-serve over HTTP
 # ──────────────────────────────────────────────────────────────────────────────
 def _trtllm_generate(prompt_text: str) -> str:
     body = {
         "model": TRTLLM_MODEL,
         "messages": [
     logger.info("job %s started", job.id)
     try:
         try:
             normalized = inf.normalize_prompt(job.raw_prompt)
         except Exception as e:
             normalized = job.raw_prompt
         job.normalized_prompt = normalized
         raw_html = _trtllm_generate(job.normalized_prompt)
         html = inf.post_process(raw_html)
         if not html.strip():
             raise RuntimeError("post_process returned empty output")
             job.id, time.time() - job.started_at, len(html),
         )
         if OUTPUT_DIR is not None:
             try:
                 OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 async def lifespan(_: FastAPI):
     global _executor
     try:
         r = requests.get(f"{TRTLLM_BASE_URL}/v1/models", timeout=10)
         r.raise_for_status()
             _executor.shutdown(wait=False, cancel_futures=True)
+app = FastAPI(title="HTML Generation API (TRT-LLM backed)", version="2.1.0", lifespan=lifespan)
 app.add_middleware(
     CORSMiddleware,
     job = _get_job(job_id)
     if job is not None:
         return job.to_response()
     if OUTPUT_DIR is not None:
         html_path = OUTPUT_DIR / f"{job_id}.html"
         meta_path = OUTPUT_DIR / f"{job_id}.json"

pod_api_old.py ADDED Viewed

	@@ -0,0 +1,485 @@

+"""
+pod_api.py — RunPod-side FastAPI server that delegates generation to local
+trtllm-serve while keeping your existing api.py contract (job pattern,
+Pydantic validation, normalizer routing, auto-save, error handling).
+Architecture in this pod:
+    Client  ─POST /v1/jobs──▶  pod_api.py (this file, port 5000)
+                                    │
+                                    │ enqueues job
+                                    ▼
+                              ThreadPoolExecutor
+                                    │
+                                    │ 1. normalize via Anthropic API
+                                    │ 2. POST to trtllm-serve
+                                    ▼
+                              trtllm-serve (port 8000, local) ──▶ model on GPU
+Why this layout:
+- Your reliability layer (job pattern, validation, GC, auto-save) stays.
+- TRT-LLM does the actual generation — 2.85× faster than transformers, and
+  ready to add NGram speculative on top via the existing spec_config.yaml.
+- Anthropic-based normalizer + dashboard routing keep working unchanged
+  because we import your existing inference_edited_chat_opt module.
+Setup:
+    pip install fastapi "uvicorn[standard]" pydantic requests anthropic
+    export ANTHROPIC_API_KEY=...
+    # Make sure trtllm-serve is already running on :8000.
+    # Then start this:
+    uvicorn pod_api:app --host 0.0.0.0 --port 5000 --workers 1
+Endpoints (same shape as your old api.py):
+    GET  /v1/healthz
+    GET  /v1/readyz
+    POST /v1/jobs                  -> 202 {"job_id": ...}
+    GET  /v1/jobs/{job_id}         -> status + html when done
+    GET  /v1/jobs                  -> list recent jobs
+    POST /v1/generate              -> synchronous variant
+"""
+from __future__ import annotations
+import json
+import logging
+import os
+import sys
+import threading
+import time
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from contextlib import asynccontextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Literal, Optional
+import requests
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel, Field, field_validator
+# Make /workspace importable so we can pull SYSTEM_PROMPT, normalizers,
+# is_dashboard_prompt, and post_process from your existing module.
+sys.path.insert(0, "/workspace")
+import inference_edited_chat_opt as inf
+# ──────────────────────────────────────────────────────────────────────────────
+# Config
+# ──────────────────────────────────────────────────────────────────────────────
+# Base URL of the trtllm-serve instance that performs generation (env-overridable).
+TRTLLM_BASE_URL     = os.environ.get("TRTLLM_BASE_URL", "http://localhost:8000")
+# Value sent as the "model" field of every chat-completion request (env-overridable).
+TRTLLM_MODEL        = os.environ.get("TRTLLM_MODEL", "final_model")
+# Maximum accepted length of a request prompt, in characters (see GenerateRequest).
+MAX_PROMPT_CHARS    = 8_000
+# Bounds both the in-flight slot counter and the worker-thread pool size.
+MAX_CONCURRENT_JOBS = 16
+# Timeout, in seconds, passed to the generation request sent to trtllm-serve.
+JOB_TIMEOUT_S       = 60 * 25
+# How long, in seconds, POST /v1/generate waits for its job before answering 504.
+SYNC_TIMEOUT_S      = 60 * 20
+# Finished jobs older than this many seconds are dropped from the in-memory table.
+JOB_RETENTION_S     = 60 * 60
+# Directory where finished jobs are persisted as <job_id>.html / <job_id>.json.
+# NOTE(review): as written this is always a Path (the env lookup has a default),
+# so the `OUTPUT_DIR is not None` guards elsewhere never take the None branch.
+OUTPUT_DIR: Optional[Path] = Path(os.environ.get("API_OUTPUT_DIR", "/workspace/api_output"))
+# Sampling parameters forwarded to trtllm-serve as "max_tokens" and "temperature".
+GENERATION_MAX_TOKENS = int(os.environ.get("GENERATION_MAX_TOKENS", "8192"))
+GENERATION_TEMPERATURE = float(os.environ.get("GENERATION_TEMPERATURE", "0.0"))
+logger = logging.getLogger("pod_api")
+# Configure root logging once at import time with a timestamped, level-tagged format.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+)
+# ──────────────────────────────────────────────────────────────────────────────
+# Job state (same shape as your old api.py)
+# ──────────────────────────────────────────────────────────────────────────────
+# The four values Job.status may hold: a job starts as "queued", the runner
+# marks it "running", and it ends as either "done" or "error".
+JobStatus = Literal["queued", "running", "done", "error"]
+@dataclass
+class Job:
+    id: str
+    raw_prompt: str
+    normalized_prompt: Optional[str] = None
+    status: JobStatus = "queued"
+    html: Optional[str] = None
+    error: Optional[str] = None
+    created_at: float = field(default_factory=time.time)
+    started_at: Optional[float] = None
+    finished_at: Optional[float] = None
+    done_event: threading.Event = field(default_factory=threading.Event)
+    def to_response(self) -> dict[str, Any]:
+        body: dict[str, Any] = {
+            "job_id": self.id,
+            "status": self.status,
+            "created_at": self.created_at,
+        }
+        if self.started_at is not None:
+            body["started_at"] = self.started_at
+        if self.finished_at is not None:
+            body["finished_at"] = self.finished_at
+            body["duration_seconds"] = round(
+                self.finished_at - (self.started_at or self.created_at), 2
+            )
+        if self.normalized_prompt is not None:
+            body["normalized_prompt"] = self.normalized_prompt
+        if self.status == "done":
+            body["html"] = self.html
+        elif self.status == "error":
+            body["error"] = self.error
+        return body
+# In-memory job table keyed by job id; every access goes through _jobs_lock.
+_jobs: dict[str, Job] = {}
+_jobs_lock = threading.Lock()
+# Worker pool that runs jobs; created in lifespan(), None until then.
+_executor: Optional[ThreadPoolExecutor] = None
+# Number of reserved job slots, guarded by _inflight_lock.
+_inflight = 0
+_inflight_lock = threading.Lock()
+def _store_job(job: Job) -> None:
+    with _jobs_lock:
+        _jobs[job.id] = job
+def _get_job(job_id: str) -> Optional[Job]:
+    with _jobs_lock:
+        return _jobs.get(job_id)
+def _gc_jobs() -> None:
+    now = time.time()
+    with _jobs_lock:
+        stale = [
+            jid for jid, j in _jobs.items()
+            if j.finished_at is not None and (now - j.finished_at) > JOB_RETENTION_S
+        ]
+        for jid in stale:
+            _jobs.pop(jid, None)
+def _try_reserve_slot() -> bool:
+    global _inflight
+    with _inflight_lock:
+        if _inflight >= MAX_CONCURRENT_JOBS:
+            return False
+        _inflight += 1
+        return True
+def _release_slot() -> None:
+    global _inflight
+    with _inflight_lock:
+        _inflight = max(0, _inflight - 1)
+def _inflight_count() -> int:
+    with _inflight_lock:
+        return _inflight
+# ──────────────────────────────────────────────────────────────────────────────
+# Generation — call into local trtllm-serve over HTTP
+# ──────────────────────────────────────────────────────────────────────────────
+def _trtllm_generate(prompt_text: str) -> str:
+    """Send a chat-completion request to trtllm-serve and return the HTML."""
+    body = {
+        "model": TRTLLM_MODEL,
+        "messages": [
+            {"role": "system", "content": inf.SYSTEM_PROMPT},
+            {"role": "user", "content": prompt_text},
+        ],
+        "max_tokens": GENERATION_MAX_TOKENS,
+        "temperature": GENERATION_TEMPERATURE,
+    }
+    resp = requests.post(
+        f"{TRTLLM_BASE_URL}/v1/chat/completions",
+        headers={"Content-Type": "application/json"},
+        json=body,
+        timeout=JOB_TIMEOUT_S,
+    )
+    resp.raise_for_status()
+    data = resp.json()
+    text = data["choices"][0]["message"]["content"]
+    if not isinstance(text, str) or not text.strip():
+        raise RuntimeError("trtllm-serve returned empty content")
+    return text
+# ──────────────────────────────────────────────────────────────────────────────
+# Job runner
+# ──────────────────────────────────────────────────────────────────────────────
+def _run_job(job: Job) -> None:
+    job.started_at = time.time()
+    job.status = "running"
+    logger.info("job %s started", job.id)
+    try:
+        # Step 1 — normalize via Anthropic (uses your existing normalizers,
+        # routed by is_dashboard_prompt for landing-page vs dashboard).
+        try:
+            normalized = inf.normalize_prompt(job.raw_prompt)
+        except Exception as e:
+            logger.warning(
+                "normalize failed for job %s: %s — falling back to raw prompt",
+                job.id, e,
+            )
+            normalized = job.raw_prompt
+        if not isinstance(normalized, str) or not normalized.strip():
+            normalized = job.raw_prompt
+        job.normalized_prompt = normalized
+        # Step 2 — generate via trtllm-serve (local HTTP, port 8000)
+        raw_html = _trtllm_generate(job.normalized_prompt)
+        # Step 3 — apply your existing post-processing
+        html = inf.post_process(raw_html)
+        if not html.strip():
+            raise RuntimeError("post_process returned empty output")
+        job.html = html
+        job.status = "done"
+        logger.info(
+            "job %s done in %.1fs (%d chars)",
+            job.id, time.time() - job.started_at, len(html),
+        )
+        # Auto-save to disk so results survive in-memory GC.
+        if OUTPUT_DIR is not None:
+            try:
+                OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+                (OUTPUT_DIR / f"{job.id}.html").write_text(html, encoding="utf-8")
+                (OUTPUT_DIR / f"{job.id}.json").write_text(
+                    json.dumps({
+                        "job_id": job.id,
+                        "raw_prompt": job.raw_prompt,
+                        "normalized_prompt": job.normalized_prompt,
+                        "created_at": job.created_at,
+                        "started_at": job.started_at,
+                        "finished_at": time.time(),
+                        "duration_seconds": round(time.time() - job.started_at, 2),
+                    }, indent=2),
+                    encoding="utf-8",
+                )
+                logger.info("job %s saved to %s", job.id, OUTPUT_DIR)
+            except Exception as e:
+                logger.warning("failed to persist job %s: %s", job.id, e)
+    except requests.HTTPError as e:
+        job.error = f"trtllm-serve returned {e.response.status_code}: {e.response.text[:500]}"
+        job.status = "error"
+        logger.exception("job %s — trtllm-serve HTTP error", job.id)
+    except requests.RequestException as e:
+        job.error = f"trtllm-serve unreachable: {e}"
+        job.status = "error"
+        logger.exception("job %s — trtllm-serve unreachable", job.id)
+    except Exception as e:
+        job.error = f"{type(e).__name__}: {e}"
+        job.status = "error"
+        logger.exception("job %s failed", job.id)
+    finally:
+        job.finished_at = time.time()
+        job.done_event.set()
+        _release_slot()
+        _gc_jobs()
+# ──────────────────────────────────────────────────────────────────────────────
+# FastAPI app + lifespan
+# ──────────────────────────────────────────────────────────────────────────────
+@asynccontextmanager
+async def lifespan(_: FastAPI):
+    global _executor
+    # Probe trtllm-serve once on startup so we fail fast if it's not running.
+    try:
+        r = requests.get(f"{TRTLLM_BASE_URL}/v1/models", timeout=10)
+        r.raise_for_status()
+        logger.info(
+            "trtllm-serve OK at %s (%d models loaded)",
+            TRTLLM_BASE_URL, len(r.json().get("data", [])),
+        )
+    except Exception as e:
+        logger.error(
+            "trtllm-serve not reachable at %s — %s. "
+            "Start it before this API: trtllm-serve /workspace/final_model --host 0.0.0.0 --port 8000",
+            TRTLLM_BASE_URL, e,
+        )
+    _executor = ThreadPoolExecutor(
+        max_workers=MAX_CONCURRENT_JOBS,
+        thread_name_prefix="job-runner",
+    )
+    logger.info(
+        "executor started (max_workers=%d), output_dir=%s",
+        MAX_CONCURRENT_JOBS, OUTPUT_DIR,
+    )
+    try:
+        yield
+    finally:
+        if _executor is not None:
+            _executor.shutdown(wait=False, cancel_futures=True)
+# The ASGI application; lifespan() creates and tears down the worker pool.
+app = FastAPI(title="HTML Generation API (TRT-LLM backed)", version="2.0.0", lifespan=lifespan)
+# CORS is fully open: any origin, method and header is allowed.
+# NOTE(review): confirm this is intended if the pod is reachable from untrusted networks.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+class GenerateRequest(BaseModel):
+    prompt: str = Field(..., min_length=1, max_length=MAX_PROMPT_CHARS)
+    @field_validator("prompt")
+    @classmethod
+    def _strip(cls, v: str) -> str:
+        v = v.strip()
+        if not v:
+            raise ValueError("prompt is empty after stripping whitespace")
+        return v
+@app.exception_handler(Exception)
+async def _unhandled(request, exc):
+    logger.exception("unhandled exception in request: %s", exc)
+    return JSONResponse(
+        status_code=500,
+        content={"error": "internal_server_error", "detail": str(exc)},
+    )
+# ──────────────────────────────────────────────────────────────────────────────
+# Endpoints
+# ──────────────────────────────────────────────────────────────────────────────
+@app.get("/v1/healthz")
+def healthz():
+    return {"status": "ok"}
+@app.get("/v1/readyz")
+def readyz():
+    if _executor is None:
+        return JSONResponse(status_code=503, content={"status": "executor_not_ready"})
+    try:
+        r = requests.get(f"{TRTLLM_BASE_URL}/v1/models", timeout=5)
+        if r.status_code != 200:
+            return JSONResponse(
+                status_code=503,
+                content={"status": "trtllm_unhealthy", "trtllm_status": r.status_code},
+            )
+    except Exception as e:
+        return JSONResponse(
+            status_code=503,
+            content={"status": "trtllm_unreachable", "detail": str(e)},
+        )
+    return {
+        "status": "ready",
+        "in_flight": _inflight_count(),
+        "max_concurrent_jobs": MAX_CONCURRENT_JOBS,
+        "trtllm_url": TRTLLM_BASE_URL,
+    }
+@app.post("/v1/jobs", status_code=202)
+def create_job(req: GenerateRequest):
+    if _executor is None:
+        raise HTTPException(status_code=503, detail="server still warming up")
+    if not _try_reserve_slot():
+        raise HTTPException(
+            status_code=503,
+            detail=f"server at capacity ({MAX_CONCURRENT_JOBS} in-flight) — try again shortly",
+        )
+    job = Job(id=uuid.uuid4().hex, raw_prompt=req.prompt)
+    _store_job(job)
+    _executor.submit(_run_job, job)
+    logger.info(
+        "job %s queued (in_flight=%d, prompt_chars=%d)",
+        job.id, _inflight_count(), len(req.prompt),
+    )
+    return {
+        "job_id": job.id,
+        "status": "queued",
+        "in_flight": _inflight_count(),
+    }
+@app.get("/v1/jobs/{job_id}")
+def get_job(job_id: str):
+    job = _get_job(job_id)
+    if job is not None:
+        return job.to_response()
+    # Fall back to disk if the job was GC'd from memory.
+    if OUTPUT_DIR is not None:
+        html_path = OUTPUT_DIR / f"{job_id}.html"
+        meta_path = OUTPUT_DIR / f"{job_id}.json"
+        if html_path.exists():
+            try:
+                meta = json.loads(meta_path.read_text(encoding="utf-8")) if meta_path.exists() else {}
+                return {
+                    "job_id": job_id,
+                    "status": "done",
+                    "html": html_path.read_text(encoding="utf-8"),
+                    "source": "disk",
+                    **meta,
+                }
+            except Exception as e:
+                logger.warning("failed to read persisted job %s: %s", job_id, e)
+    raise HTTPException(
+        status_code=404,
+        detail="job not found (not in memory and not persisted to disk)",
+    )
+@app.get("/v1/jobs")
+def list_jobs(limit: int = 50):
+    if limit < 1 or limit > 500:
+        raise HTTPException(status_code=400, detail="limit must be between 1 and 500")
+    with _jobs_lock:
+        items = sorted(_jobs.values(), key=lambda j: j.created_at, reverse=True)[:limit]
+    return {
+        "count": len(items),
+        "jobs": [
+            {"job_id": j.id, "status": j.status, "created_at": j.created_at}
+            for j in items
+        ],
+    }
+@app.post("/v1/generate")
+def generate_sync(req: GenerateRequest):
+    if _executor is None:
+        raise HTTPException(status_code=503, detail="server still warming up")
+    if not _try_reserve_slot():
+        raise HTTPException(
+            status_code=503,
+            detail=f"server at capacity ({MAX_CONCURRENT_JOBS} in-flight) — try again shortly",
+        )
+    job = Job(id=uuid.uuid4().hex, raw_prompt=req.prompt)
+    _store_job(job)
+    _executor.submit(_run_job, job)
+    finished = job.done_event.wait(timeout=SYNC_TIMEOUT_S)
+    if not finished:
+        raise HTTPException(
+            status_code=504,
+            detail={
+                "job_id": job.id,
+                "error": "generation timed out — use GET /v1/jobs/{id} to retrieve",
+            },
+        )
+    if job.status == "done":
+        return {
+            "job_id": job.id,
+            "html": job.html,
+            "normalized_prompt": job.normalized_prompt,
+            "duration_seconds": round(
+                (job.finished_at or 0) - (job.started_at or 0), 2
+            ),
+        }
+    raise HTTPException(
+        status_code=500,
+        detail={"job_id": job.id, "error": job.error or "unknown error"},
+    )