| from __future__ import annotations |
|
|
| from dataclasses import dataclass |
| from pathlib import Path |
| from xml.sax.saxutils import escape |
|
|
| import fitz |
| from pypdf import PdfReader |
| from reportlab.lib import colors |
| from reportlab.lib.pagesizes import A4 |
| from reportlab.lib.styles import ParagraphStyle |
| from reportlab.pdfgen import canvas |
| from reportlab.platypus import Paragraph |
|
|
|
|
# Project root: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Directory that receives the generated PDF (created by build_pdf).
OUTPUT_DIR = ROOT / "output" / "pdf"
# Scratch directory for the PNG preview written by validate_outputs.
TMP_DIR = ROOT / "tmp" / "pdfs"
# Destination of the one-page summary PDF.
PDF_PATH = OUTPUT_DIR / "pluto_app_summary_one_page.pdf"
# Raster rendering of the PDF's first page.
PNG_PATH = TMP_DIR / "pluto_app_summary_one_page-1.png"
|
|
# Page geometry. All values share the units of reportlab's A4 page size
# (presumably PDF points -- confirm against the reportlab documentation).
PAGE_WIDTH, PAGE_HEIGHT = A4
MARGIN_X = 34  # left and right page margin
MARGIN_TOP = 34
MARGIN_BOTTOM = 28
GUTTER = 16  # horizontal gap between the two content columns
# Two equal-width columns share the space between the side margins.
COLUMN_WIDTH = (PAGE_WIDTH - (2 * MARGIN_X) - GUTTER) / 2
|
|
# Colour palette used across the page.
NAVY = colors.HexColor("#17324D")  # header background and section titles
TEAL = colors.HexColor("#2A7F8C")  # accent panel on the right of the header
INK = colors.HexColor("#1D2430")  # body and bullet text
MUTED = colors.HexColor("#5C6773")  # footer text
CARD_BG = colors.HexColor("#F5F8FB")  # default section card fill
CARD_BORDER = colors.HexColor("#D6E1EA")  # section card outline
ACCENT_BG = colors.HexColor("#E8F4F4")  # fill of the "How to run" card
WHITE = colors.white  # header title text
|
|
|
|
@dataclass
class SectionBlock:
    """One titled card of content placed in a page column."""

    # Heading shown at the top of the card.
    title: str
    # Ordered (kind, text) pairs. A kind of "bullet" is rendered as a
    # dash-prefixed bullet; any other kind is rendered as a body paragraph.
    items: list[tuple[str, str]]
|
|
|
|
def build_blocks() -> tuple[list[SectionBlock], list[SectionBlock]]:
    """Return the static page content as ``(left_column, right_column)``.

    Each column is a list of ``SectionBlock`` objects in top-to-bottom order.
    Text wrapped in backticks is rendered in a monospace font later on.
    """

    def body(text: str) -> tuple[str, str]:
        # Plain paragraph entry.
        return ("body", text)

    def bullet(text: str) -> tuple[str, str]:
        # Dash-prefixed bullet entry.
        return ("bullet", text)

    # ---- left column -----------------------------------------------------
    what_it_is = SectionBlock(
        title="What it is",
        items=[
            body(
                "Pluto is an AI-powered document extraction and question-answering app. "
                "It lets a user upload documents into a corpus, run a multi-stage pipeline, "
                "and inspect the answer with evidence, trace, and confidence signals."
            ),
        ],
    )
    audience = SectionBlock(
        title="Who it's for",
        items=[
            body("Primary user/persona: Not found in repo."),
            body(
                "Closest repo evidence: a person asking research-style questions "
                "over uploaded documents and reviewing evidence-backed results."
            ),
        ],
    )
    features = SectionBlock(
        title="What it does",
        items=[
            bullet("Uploads PDF, DOCX/DOC, TXT, and Markdown files into a corpus."),
            bullet("Converts uploads to Markdown, chunks them, classifies them, and tracks readiness."),
            bullet("Runs a 4-stage pipeline: route, extract, merge, evidence_check."),
            bullet("Streams live progress and upload status to the dashboard."),
            bullet("Queries the full corpus or selected ready documents."),
            bullet("Shows final sections, evidence, trace, confidence, and a benchmark view."),
        ],
    )

    # ---- right column ----------------------------------------------------
    architecture = SectionBlock(
        title="How it works",
        items=[
            bullet(
                "Frontend: `frontend/index.html` + `app.js` call `/api/upload`, `/api/corpus`, "
                "`/api/run`, `/api/stream`, and `/api/compare`."
            ),
            bullet(
                "Server: `pluto/server.py` serves the UI, handles uploads, streams SSE progress, "
                "and runs `PipelineRunner` in a worker thread."
            ),
            bullet(
                "Ingest path: uploaded file -> Markdown in `corpus/` -> chunk split/classification "
                "-> `DocIndex` registration; background Phase A stores overview/status in "
                "`corpus/.doc_index.json`."
            ),
            bullet(
                "Query path: selected docs -> S0 route -> S1 extract -> S2 merge -> S3 evidence_check "
                "-> JSON result + cache stats -> UI panels; final JSON also writes to "
                "`output/final_output.json`."
            ),
            bullet(
                "Support layers: `ExtractionCache` reuses extractions; `CorpusTools` reads/searches chunks. "
                "NVIDIA embedding/rerank code paths exist, and chunking falls back when NVIDIA keys are absent."
            ),
        ],
    )
    run_steps = SectionBlock(
        title="How to run",
        items=[
            bullet("From `mp1/`: `pip install -r requirements.txt`"),
            bullet("Create `.env` and set `GROQ_API_KEY` (explicitly named in `README.md`)."),
            bullet("Run `python main.py --serve`"),
            bullet("Open `http://localhost:8000`"),
            bullet(
                "Upload docs, wait for Understanding to finish, then submit a query. "
                "Other required provider keys: Not found in repo."
            ),
        ],
    )

    return [what_it_is, audience, features], [architecture, run_steps]
|
|
|
|
def make_styles(scale: float) -> dict[str, ParagraphStyle]:
    """Build the paragraph styles used on the page, scaled by ``scale``.

    Args:
        scale: Multiplier applied to every font size, leading and indent.

    Returns:
        A mapping from style name ("title", "subtitle", "eyebrow",
        "section_title", "body", "bullet", "footer") to its ``ParagraphStyle``.
    """
    regular = "Helvetica"
    bold = "Helvetica-Bold"

    # (name, font, base size, base leading, text colour, extra keyword args)
    specs = (
        ("title", bold, 21, 25, WHITE, {"spaceAfter": 0}),
        ("subtitle", regular, 9.6, 12, colors.HexColor("#DCE7F3"), {}),
        ("eyebrow", bold, 7.4, 9, colors.HexColor("#B9D6DA"), {}),
        ("section_title", bold, 10.6, 12.5, NAVY, {}),
        ("body", regular, 8.6, 11, INK, {}),
        (
            "bullet",
            regular,
            8.5,
            10.7,
            INK,
            # Hanging indent: the first line starts to the left of wrapped lines.
            {"leftIndent": 10 * scale, "firstLineIndent": -7 * scale},
        ),
        ("footer", regular, 7.1, 8.5, MUTED, {}),
    )

    return {
        name: ParagraphStyle(
            name,
            fontName=font,
            fontSize=size * scale,
            leading=leading * scale,
            textColor=color,
            **extra,
        )
        for name, font, size, leading, color, extra in specs
    }
|
|
|
|
def escape_inline(text: str) -> str:
    """Escape ``text`` for paragraph markup and render backtick spans in Courier.

    The previous implementation replaced every backtick with an opening
    ``<font>`` tag and never emitted a closing tag, producing unbalanced
    markup. Segments are now wrapped in matching open/close tags.

    Args:
        text: Raw text in which backtick pairs delimit inline code.

    Returns:
        XML-escaped markup where each backtick-delimited span is wrapped in
        ``<font name='Courier'>...</font>``. If a backtick has no partner, the
        text after it up to the end of the string is treated as code, which
        matches how ``format_text`` handles the same input.
    """
    pieces: list[str] = []
    # Splitting on backticks puts code spans at the odd positions.
    for index, chunk in enumerate(text.split("`")):
        safe = escape(chunk)
        pieces.append(f"<font name='Courier'>{safe}</font>" if index % 2 else safe)
    return "".join(pieces)
|
|
|
|
def format_text(text: str) -> str:
    """Convert lightly marked-up text into escaped paragraph markup.

    Args:
        text: Raw text; backticks toggle inline-code formatting.

    Returns:
        The XML-escaped text with every segment that follows an odd number of
        backticks wrapped in ``<font name='Courier'>...</font>``.
    """
    # After splitting on backticks, even positions are prose and odd
    # positions are code. Text without backticks yields a single prose piece.
    return "".join(
        f"<font name='Courier'>{escape(segment)}</font>" if position % 2 else escape(segment)
        for position, segment in enumerate(text.split("`"))
    )
|
|
|
|
def paragraph_for(kind: str, text: str, styles: dict[str, ParagraphStyle]) -> Paragraph:
    """Create the paragraph for a single section item.

    Args:
        kind: Item kind; "bullet" gets a "- " prefix and the bullet style,
            anything else is rendered with the body style.
        text: Raw item text (backticks mark inline code).
        styles: Style mapping containing at least "bullet" and "body".

    Returns:
        The formatted ``Paragraph``.
    """
    if kind == "bullet":
        return Paragraph(format_text(f"- {text}"), styles["bullet"])
    return Paragraph(format_text(text), styles["body"])
|
|
|
|
def measure_section(block: SectionBlock, styles: dict[str, ParagraphStyle], width: float) -> tuple[float, list[Paragraph]]:
    """Lay out a section and report how tall its card must be.

    Args:
        block: Section to measure.
        styles: Style mapping used to build the paragraphs.
        width: Outer width of the card; text is wrapped 20 units narrower.

    Returns:
        ``(height, paragraphs)`` where ``height`` is the full card height
        including padding and ``paragraphs`` is the title paragraph followed
        by the item paragraphs, in order.
    """
    text_width = width - 20
    heading = Paragraph(format_text(block.title), styles["section_title"])
    body_paragraphs = [paragraph_for(kind, text, styles) for kind, text in block.items]

    _, heading_height = heading.wrap(text_width, 1000)

    body_height = 0.0
    for paragraph in body_paragraphs:
        _, paragraph_height = paragraph.wrap(text_width, 1000)
        body_height += paragraph_height
        body_height += 5  # gap after each item

    # top padding + title + gap under the title + items + bottom padding
    card_height = 14 + heading_height + 8 + body_height + 10
    return card_height, [heading, *body_paragraphs]
|
|
|
|
def choose_scale(left: list[SectionBlock], right: list[SectionBlock]) -> tuple[float, dict[str, ParagraphStyle], float]:
    """Pick the largest text scale at which both columns fit on the page.

    Args:
        left: Sections of the left column.
        right: Sections of the right column.

    Returns:
        ``(scale, styles, available)``: the chosen scale, the styles built for
        it, and the vertical space that was available to the columns.

    Raises:
        RuntimeError: If the columns do not fit even at the smallest scale.
    """
    header_space = 114
    footer_space = 18
    available = PAGE_HEIGHT - MARGIN_TOP - MARGIN_BOTTOM - header_space - footer_space

    # Candidates are ordered largest first, so the first fit wins.
    candidate_scales = (1.0, 0.97, 0.94, 0.91, 0.88, 0.85)
    for scale in candidate_scales:
        styles = make_styles(scale)
        tallest = max(
            total_column_height(left, styles),
            total_column_height(right, styles),
        )
        if tallest > available:
            continue
        return scale, styles, available

    raise RuntimeError("Content did not fit on a single page.")
|
|
|
|
def total_column_height(blocks: list[SectionBlock], styles: dict[str, ParagraphStyle]) -> float:
    """Return the height of a column of section cards.

    Args:
        blocks: Sections stacked in the column, top to bottom.
        styles: Style mapping used to measure each section.

    Returns:
        The sum of the card heights plus a gap of 10 between neighbouring
        cards; ``0.0`` for an empty column.
    """
    last_position = len(blocks) - 1
    height = 0.0
    for position, block in enumerate(blocks):
        height += measure_section(block, styles, COLUMN_WIDTH)[0]
        if position != last_position:
            height += 10  # gap between consecutive cards
    return height
|
|
|
|
def draw_header(pdf: canvas.Canvas, styles: dict[str, ParagraphStyle]) -> float:
    """Draw the banner at the top of the page.

    Args:
        pdf: Canvas to draw on.
        styles: Style mapping containing "eyebrow", "title" and "subtitle".

    Returns:
        The y coordinate at which the content columns should start
        (12 units below the bottom edge of the banner).
    """
    banner_height = 94
    banner_bottom = PAGE_HEIGHT - MARGIN_TOP - banner_height
    banner_width = PAGE_WIDTH - (2 * MARGIN_X)
    accent_width = 110

    # Navy banner with a teal panel covering its right-hand end.
    pdf.setFillColor(NAVY)
    pdf.roundRect(MARGIN_X, banner_bottom, banner_width, banner_height, 14, stroke=0, fill=1)
    pdf.setFillColor(TEAL)
    pdf.roundRect(PAGE_WIDTH - MARGIN_X - accent_width, banner_bottom, accent_width, banner_height, 14, stroke=0, fill=1)

    # Paragraphs stacked top-down on the left, each with its wrap width.
    stacked = (
        (Paragraph("ONE-PAGE APP SUMMARY", styles["eyebrow"]), 210),
        (Paragraph("Pluto", styles["title"]), 260),
        (
            Paragraph(
                "Repo-backed overview of the document extraction and question-answering dashboard.",
                styles["subtitle"],
            ),
            banner_width - 150,
        ),
    )

    text_x = MARGIN_X + 18
    top_edge = PAGE_HEIGHT - MARGIN_TOP - 16
    for paragraph, max_width in stacked:
        paragraph_height = paragraph.wrap(max_width, 1000)[1]
        paragraph.drawOn(pdf, text_x, top_edge - paragraph_height)
        top_edge -= paragraph_height + 4

    # Short provenance note placed inside the teal panel.
    note = Paragraph("Evidence source: README + app server, pipeline, ingest, index, and UI files.", styles["subtitle"])
    note_height = note.wrap(92, 1000)[1]
    note.drawOn(pdf, PAGE_WIDTH - MARGIN_X - 98, banner_bottom + banner_height - 18 - note_height)

    return banner_bottom - 12
|
|
|
|
def draw_column(
    pdf: canvas.Canvas,
    blocks: list[SectionBlock],
    x: float,
    top_y: float,
    styles: dict[str, ParagraphStyle],
) -> None:
    """Draw a vertical stack of section cards.

    Args:
        pdf: Canvas to draw on.
        blocks: Sections to render, top to bottom.
        x: Left edge of the column.
        top_y: Top edge of the first card.
        styles: Style mapping used to build the paragraphs.
    """
    text_width = COLUMN_WIDTH - 20
    card_top = top_y

    for block in blocks:
        card_height, paragraphs = measure_section(block, styles, COLUMN_WIDTH)
        heading, *body_paragraphs = paragraphs

        # The "How to run" card is highlighted with the accent fill.
        pdf.setFillColor(ACCENT_BG if block.title == "How to run" else CARD_BG)
        pdf.setStrokeColor(CARD_BORDER)
        pdf.roundRect(x, card_top - card_height, COLUMN_WIDTH, card_height, 12, stroke=1, fill=1)

        # `cursor` tracks the top edge of the next paragraph to draw.
        cursor = card_top - 14
        heading_height = heading.wrap(text_width, 1000)[1]
        heading.drawOn(pdf, x + 10, cursor - heading_height)
        cursor -= heading_height + 8

        for paragraph in body_paragraphs:
            paragraph_height = paragraph.wrap(text_width, 1000)[1]
            paragraph.drawOn(pdf, x + 10, cursor - paragraph_height)
            cursor -= paragraph_height + 5

        card_top -= card_height + 10
|
|
|
|
def build_pdf() -> None:
    """Render the one-page summary and write it to ``PDF_PATH``.

    Creates the output and scratch directories if needed, chooses a text
    scale that fits the content, then draws the header, both columns and
    the footer.

    Raises:
        RuntimeError: Propagated from ``choose_scale`` when the content
            cannot fit on one page.
    """
    for directory in (OUTPUT_DIR, TMP_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    left, right = build_blocks()
    styles = choose_scale(left, right)[1]

    pdf = canvas.Canvas(str(PDF_PATH), pagesize=A4)
    pdf.setTitle("Pluto App Summary")
    pdf.setAuthor("OpenAI Codex")
    pdf.setSubject("One-page summary generated from repository evidence")

    columns_top = draw_header(pdf, styles)
    right_x = MARGIN_X + COLUMN_WIDTH + GUTTER
    for column_blocks, column_x in ((left, MARGIN_X), (right, right_x)):
        draw_column(pdf, column_blocks, column_x, columns_top, styles)

    footer = Paragraph(
        "Not found in repo items are labeled explicitly. Output generated as a single-page PDF.",
        styles["footer"],
    )
    # wrap() lays the text out; its result is not needed, but it must be
    # called before drawOn().
    footer.wrap(PAGE_WIDTH - (2 * MARGIN_X), 1000)
    footer.drawOn(pdf, MARGIN_X, MARGIN_BOTTOM - 4)

    pdf.showPage()
    pdf.save()
|
|
|
|
def validate_outputs() -> None:
    """Check the generated PDF and render a PNG preview of its first page.

    Verifies that the file at ``PDF_PATH`` has exactly one page, then
    rasterises that page at 2x zoom and saves it to ``PNG_PATH``.

    Raises:
        RuntimeError: If the PDF does not contain exactly one page.
    """
    reader = PdfReader(str(PDF_PATH))
    page_count = len(reader.pages)
    if page_count != 1:
        raise RuntimeError(f"Expected 1 page, found {page_count}")

    # Make sure the preview directory exists so this function also works
    # when it is called without build_pdf() having run in this process.
    PNG_PATH.parent.mkdir(parents=True, exist_ok=True)

    # The context manager closes the document even if rendering or saving
    # raises; the original only closed it on the success path.
    with fitz.open(str(PDF_PATH)) as document:
        page = document.load_page(0)
        pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0), alpha=False)
        pix.save(str(PNG_PATH))
|
|
|
|
def main() -> None:
    """Generate the PDF, validate it, and print the output locations.

    The two printed lines have the form ``PDF_PATH=<path>`` and
    ``PNG_PATH=<path>``.
    """
    build_pdf()
    validate_outputs()
    for label, location in (("PDF_PATH", PDF_PATH), ("PNG_PATH", PNG_PATH)):
        print(f"{label}={location}")
|
|
|
|
# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|