FridayCodehhr commited on
Commit
fc361bb
·
verified ·
1 Parent(s): 3796af8

Upload 9 files

Browse files
Files changed (9) hide show
  1. Dockerfile +32 -0
  2. app.py +67 -0
  3. config.py +45 -0
  4. index.html +86 -0
  5. main.py +297 -0
  6. openrouter_client.py +256 -0
  7. pdf_io.py +75 -0
  8. requirements.txt +7 -0
  9. statement_candidates.py +545 -0
Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Use the official slim Python runtime as the base image
FROM python:3.10-slim

# Set the working directory in the container
WORKDIR /app

# Install system dependencies: the Tesseract OCR engine (required at runtime
# by pytesseract) plus its dev headers; clean the apt cache to keep the image small.
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    libtesseract-dev \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so dependency installation is cached by Docker
# and only re-runs when requirements.txt changes.
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application code
COPY . .

# Create a non-root user to run the app (security best practice, required by some environments)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Expose port 7860 (Hugging Face Spaces default)
EXPOSE 7860

# Command to run the application
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import tempfile
4
+ import json
5
+ from fastapi import FastAPI, File, UploadFile, HTTPException
6
+ from fastapi.responses import JSONResponse, HTMLResponse
7
+ from fastapi.staticfiles import StaticFiles
8
+ from main import analyze_pdf
9
+
10
+ app = FastAPI()
11
+
12
+ # Mount static files to serve index.html
13
+ # We assume index.html is in the same directory
14
+ app.mount("/static", StaticFiles(directory="."), name="static")
15
+
@app.get("/", response_class=HTMLResponse)
async def read_root():
    """Serve the single-page UI from index.html."""
    # Explicit encoding: without it open() uses the platform default, which
    # can garble non-ASCII characters in the page on some systems.
    with open("index.html", "r", encoding="utf-8") as f:
        return f.read()
@app.post("/analyze")
async def analyze_endpoint(file: UploadFile = File(...)):
    """Accept a PDF upload, run the page-range analysis, and return its JSON.

    Raises:
        HTTPException: 400 for non-PDF uploads, 500 for server-side failures.
    """
    # Case-insensitive extension check so "REPORT.PDF" is accepted too.
    if not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="File must be a PDF")

    # Save uploaded file to a temp location on disk.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        shutil.copyfileobj(file.file, tmp)
        tmp_path = tmp.name

    # Created before the try block so the finally clause can always reference
    # it (previously a mkdtemp failure would cause a NameError in finally).
    debug_dir = tempfile.mkdtemp()

    try:
        # Get API key from the environment (injected by Space secrets).
        api_key = os.getenv("OPENROUTER_API_KEY")
        if not api_key:
            raise HTTPException(status_code=500, detail="Server misconfigured: OPENROUTER_API_KEY missing")

        # Run the core analysis; output_path="" means "return the dict,
        # don't write a file".
        result = analyze_pdf(
            pdf_path=tmp_path,
            output_path="",
            debug_dir=debug_dir,
            openrouter_api_key=api_key,
        )

        return JSONResponse(content=result)

    except HTTPException:
        # Preserve deliberate HTTP errors (status code and detail) instead of
        # re-wrapping them as generic 500s below.
        raise
    except Exception as e:
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Clean up the uploaded temp file and the per-request debug directory.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        shutil.rmtree(debug_dir, ignore_errors=True)
65
+ if __name__ == "__main__":
66
+ import uvicorn
67
+ uvicorn.run(app, host="0.0.0.0", port=7860)
config.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ import os
4
+ from dotenv import load_dotenv
5
+
@dataclass(frozen=True)
class Settings:
    """Immutable runtime configuration resolved from kwargs and the environment.

    Attributes:
        openrouter_api_key: Bearer token for the OpenRouter API (required).
        openrouter_model: Explicit model id, or None to auto-select a free one.
        max_images: Maximum number of page images sent to the vision model.
        dpi: Render resolution used when rasterizing PDF pages.
        ocr_lang: Tesseract language code used for OCR.
        min_text_chars_for_digital: Pages with fewer extracted characters
            than this are treated as scanned and OCR'd.
        topk_per_statement: Candidate pages kept per statement type.
    """

    openrouter_api_key: str
    openrouter_model: str | None
    max_images: int
    dpi: int
    ocr_lang: str
    min_text_chars_for_digital: int
    topk_per_statement: int
# Zero-cost, vision-capable models to try in preference order
# (free + vision-capable as of their OpenRouter model pages).
DEFAULT_FREE_VISION_MODELS = [
    "google/gemma-3-12b-it:free",
    "nvidia/nemotron-nano-12b-v2-vl:free",
    "amazon/nova-2-lite-v1:free",
]
22
+
def load_settings(**kwargs) -> Settings:
    """Build a Settings object from keyword overrides and the environment.

    Precedence per field: explicit kwarg (when not None) > environment
    variable (loaded via .env through python-dotenv) > built-in default.

    Raises:
        RuntimeError: if no OpenRouter API key is available anywhere.
    """
    load_dotenv()

    def _resolve(name: str, env_var: str, default: str, cast=int):
        # Compare against None (not truthiness) so an explicit falsy override
        # such as min_text_chars_for_digital=0 is honored instead of being
        # silently replaced by the environment/default value.
        val = kwargs.get(name)
        if val is not None:
            return val
        return cast(os.getenv(env_var, default))

    api_key = kwargs.get("openrouter_api_key") or os.getenv("OPENROUTER_API_KEY", "").strip()
    if not api_key:
        raise RuntimeError("Missing OPENROUTER_API_KEY in environment/.env")

    # An empty OPENROUTER_MODEL env var means "no explicit model" -> None,
    # which triggers automatic free-model selection downstream.
    model = kwargs.get("openrouter_model") or os.getenv("OPENROUTER_MODEL", "").strip() or None

    return Settings(
        openrouter_api_key=api_key,
        openrouter_model=model,
        max_images=_resolve("max_images", "MAX_IMAGES", "12"),
        dpi=_resolve("dpi", "PDF_RENDER_DPI", "200"),
        ocr_lang=_resolve("ocr_lang", "OCR_LANG", "eng", cast=str),
        min_text_chars_for_digital=_resolve("min_text_chars_for_digital", "MIN_TEXT_CHARS_FOR_DIGITAL", "80"),
        topk_per_statement=_resolve("topk_per_statement", "TOPK_PER_STATEMENT", "3"),
    )
index.html ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Financial Report Analyzer</title>
7
+ <style>
8
+ body { font-family: sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; }
9
+ .container { border: 1px solid #ccc; padding: 20px; border-radius: 8px; background: #f9f9f9; }
10
+ h1 { text-align: center; color: #333; }
11
+ .form-group { margin-bottom: 20px; text-align: center; }
12
+ input[type="file"] { margin: 10px 0; }
13
+ button { background-color: #007bff; color: white; border: none; padding: 10px 20px; border-radius: 4px; cursor: pointer; font-size: 16px; }
14
+ button:hover { background-color: #0056b3; }
15
+ button:disabled { background-color: #ccc; cursor: not-allowed; }
16
+ #status { text-align: center; margin-top: 10px; font-weight: bold; }
17
+ #result { margin-top: 20px; white-space: pre-wrap; background: #fff; padding: 15px; border: 1px solid #ddd; border-radius: 4px; display: none; }
18
+ .error { color: #dc3545; }
19
+ </style>
20
+ </head>
21
+ <body>
22
+
23
+ <div class="container">
24
+ <h1>Financial Report Analyzer</h1>
25
+ <p style="text-align: center;">Upload a 10-K/Annual Report PDF to extract page ranges for primary financial statements.</p>
26
+
27
+ <div class="form-group">
28
+ <input type="file" id="pdfInput" accept=".pdf" />
29
+ <br>
30
+ <button id="analyzeBtn" onclick="analyzePdf()">Analyze PDF</button>
31
+ </div>
32
+
33
+ <div id="status"></div>
34
+ <pre id="result"></pre>
35
+ </div>
36
+
37
+ <script>
async function analyzePdf() {
    // Grab the UI elements we touch during the request lifecycle.
    const fileInput = document.getElementById('pdfInput');
    const analyzeButton = document.getElementById('analyzeBtn');
    const statusEl = document.getElementById('status');
    const outputEl = document.getElementById('result');

    const selectedFile = fileInput.files[0];
    if (!selectedFile) {
        alert("Please select a PDF file first.");
        return;
    }

    // Put the UI into its "busy" state.
    analyzeButton.disabled = true;
    statusEl.textContent = "Analyzing... This may take a minute.";
    statusEl.className = "";
    outputEl.style.display = 'none';
    outputEl.textContent = "";

    const payload = new FormData();
    payload.append('file', selectedFile);

    try {
        const response = await fetch('/analyze', { method: 'POST', body: payload });

        if (!response.ok) {
            // Surface the server's "detail" message when one is provided.
            const errorData = await response.json();
            throw new Error(errorData.detail || "Analysis failed");
        }

        const data = await response.json();
        statusEl.textContent = "Analysis Complete!";
        outputEl.textContent = JSON.stringify(data, null, 2);
        outputEl.style.display = 'block';
    } catch (error) {
        console.error("Error:", error);
        statusEl.textContent = "Error: " + error.message;
        statusEl.className = "error";
    } finally {
        // Re-enable the button whether the request succeeded or failed.
        analyzeButton.disabled = false;
    }
}
+ </script>
85
+ </body>
86
+ </html>
main.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import argparse
3
+ import json
4
+ import time
5
+
6
+ from config import load_settings, DEFAULT_FREE_VISION_MODELS
7
+ from pdf_io import extract_texts_from_pdf, render_pages_to_png_bytes
8
+ from statement_candidates import build_candidate_lists, select_pages_for_llm
9
+ from openrouter_client import (
10
+ choose_free_vision_model,
11
+ choose_any_free_text_model,
12
+ chat_completion,
13
+ make_user_message_with_images,
14
+ robust_json_loads,
15
+ repair_to_json,
16
+ )
17
+
18
+
19
+ PROMPT_TEMPLATE = """
20
+ You are given:
21
+ 1) OCR/extracted text for a set of PDF pages from a company's financial report (10-K/annual report)
22
+ 2) Images of the same pages
23
+
24
+ Task:
25
+ Identify the PDF PAGE RANGES (start_page, end_page) for the THREE PRIMARY FINANCIAL STATEMENT TABLES ONLY:
26
+ - Balance Sheet (a.k.a. Statement of Financial Position / Consolidated Balance Sheets)
27
+ - Profit & Loss (a.k.a. Income Statement / Statements of Earnings / Statements of Operations)
28
+ - Cash Flow Statement (Statements of Cash Flows)
29
+
30
+ IMPORTANT RULES (STRICT):
31
+ - Only return ranges for the PRIMARY consolidated financial statements pages.
32
+ - Do NOT return ranges for note disclosures (e.g., derivatives, leases, fair value tables), MD&A, segment notes, or narrative discussion.
33
+ - A primary statement table page usually has:
34
+ (a) a clear statement title at the top (e.g., “Consolidated Balance Sheets”)
35
+ (b) many numeric columns (often multiple years)
36
+ (c) canonical line items like:
37
+ Balance sheet: “Total assets”, “Total liabilities”, “Total equity/stockholders’ equity”
38
+ P&L: “Net revenues/sales”, “Cost of sales”, “Operating income”, “Net earnings/income”, “Earnings per share”
39
+ Cash flow: “Cash flows from operating/investing/financing activities”, “Net cash provided by”, “Cash and cash equivalents at end”
40
+ - If a statement continues onto the next page, include that continuation page in the range.
41
+
42
+ Pages provided (OCR snippets):
43
+ {page_snippets}
44
+
45
+ Output JSON ONLY in this schema (no extra keys, no markdown):
46
+ {{
47
+ "balance_sheet": {{"start_page": int, "end_page": int, "confidence": float, "title": str}},
48
+ "profit_and_loss": {{"start_page": int, "end_page": int, "confidence": float, "title": str}},
49
+ "cash_flow": {{"start_page": int, "end_page": int, "confidence": float, "title": str}}
50
+ }}
51
+
52
+ Remember: PDF page numbers are 1-based in your output.
53
+ """
54
+
55
+ SCHEMA_HINT = """{
56
+ "balance_sheet": {"start_page": "int|null", "end_page": "int|null", "confidence": "number", "evidence_pages": "int[]", "title": "string|null"},
57
+ "profit_and_loss": {"start_page": "int|null", "end_page": "int|null", "confidence": "number", "evidence_pages": "int[]", "title": "string|null"},
58
+ "cash_flow": {"start_page": "int|null", "end_page": "int|null", "confidence": "number", "evidence_pages": "int[]", "title": "string|null"},
59
+ "notes": "string[]"
60
+ }"""
61
+
62
+
63
+ def log(msg: str):
64
+ ts = time.strftime("%H:%M:%S")
65
+ print(f"[{ts}] {msg}", flush=True)
66
+
67
+
68
+ def build_page_snippets(page_texts, selected_pages):
69
+ chunks = []
70
+ for p in selected_pages:
71
+ pt = page_texts[p]
72
+ txt = (pt.extracted_text or "") + "\n" + (pt.ocr_text or "")
73
+ txt = " ".join(txt.strip().split())
74
+ if len(txt) > 900:
75
+ txt = txt[:900] + "..."
76
+ chunks.append(f"- Page {p+1}: {txt}")
77
+ return "\n".join(chunks)
78
+
79
+
80
+ def validate_ranges(result: dict, page_count: int) -> dict:
81
+ def clamp(v):
82
+ if v is None:
83
+ return None
84
+ if not isinstance(v, int):
85
+ return None
86
+ if v < 1 or v > page_count:
87
+ return None
88
+ return v
89
+
90
+ for k in ["balance_sheet", "profit_and_loss", "cash_flow"]:
91
+ obj = result.get(k, {})
92
+ if not isinstance(obj, dict):
93
+ result[k] = {"start_page": None, "end_page": None, "confidence": 0.0, "evidence_pages": [], "title": None}
94
+ continue
95
+
96
+ sp = clamp(obj.get("start_page"))
97
+ ep = clamp(obj.get("end_page"))
98
+ if sp is not None and ep is not None and ep < sp:
99
+ sp, ep = None, None
100
+
101
+ obj["start_page"] = sp
102
+ obj["end_page"] = ep
103
+ if "confidence" not in obj or not isinstance(obj["confidence"], (int, float)):
104
+ obj["confidence"] = 0.0
105
+ if "evidence_pages" not in obj or not isinstance(obj["evidence_pages"], list):
106
+ obj["evidence_pages"] = []
107
+ if "title" not in obj:
108
+ obj["title"] = None
109
+ result[k] = obj
110
+
111
+ if "notes" not in result or not isinstance(result["notes"], list):
112
+ result["notes"] = []
113
+ return result
114
+
115
+
116
+ def analyze_pdf(
117
+ pdf_path: str,
118
+ output_path: str = "ranges.json",
119
+ debug_dir: str = "debug",
120
+ openrouter_api_key: str = None
121
+ ) -> dict:
122
+ """
123
+ Analyzes a PDF to find financial statement page ranges.
124
+ Returns the result dict.
125
+ """
126
+ settings_kwargs = {}
127
+ if openrouter_api_key:
128
+ settings_kwargs["openrouter_api_key"] = openrouter_api_key
129
+
130
+ st = load_settings(**settings_kwargs)
131
+
132
+ log(f"Loading PDF: {pdf_path}")
133
+ page_texts, page_count = extract_texts_from_pdf(
134
+ pdf_path=pdf_path,
135
+ dpi=st.dpi,
136
+ ocr_lang=st.ocr_lang,
137
+ min_text_chars_for_digital=st.min_text_chars_for_digital,
138
+ )
139
+
140
+ ocr_pages = sum(1 for p in page_texts if p.used_ocr)
141
+ log(f"Pages: {page_count} | OCR used on {ocr_pages} pages")
142
+
143
+ candidates, cand_debug = build_candidate_lists(page_texts, top_k=30, debug=True)
144
+ log("TOC/Index debug:")
145
+ log(f" item8_toc_page = {cand_debug.get('item8_toc_page')}")
146
+ log(f" toc_internal = {cand_debug.get('toc_internal')}")
147
+ log(f" toc_pdf_all = {cand_debug.get('toc_pdf_targets_all')}")
148
+ log(f" heuristic_ranges_0_based = {cand_debug.get('heuristic_ranges_0_based')}")
149
+
150
+ selected_pages = select_pages_for_llm(
151
+ candidates=candidates,
152
+ debug_info=cand_debug,
153
+ page_count=page_count,
154
+ max_images=st.max_images
155
+ )
156
+ log(f"Selected pages to render/send (1-indexed): {[p+1 for p in selected_pages]}")
157
+
158
+ log(f"Rendering {len(selected_pages)} pages to images (dpi={st.dpi})...")
159
+ page_png_map = render_pages_to_png_bytes(pdf_path, selected_pages, dpi=st.dpi)
160
+ log("Image rendering done.")
161
+
162
+ if st.openrouter_model:
163
+ model = st.openrouter_model
164
+ log(f"Using model from env: {model}")
165
+ else:
166
+ model = choose_free_vision_model(st.openrouter_api_key, preferred=DEFAULT_FREE_VISION_MODELS)
167
+ log(f"Auto-selected free vision model: {model}")
168
+
169
+ snippets = build_page_snippets(page_texts, selected_pages)
170
+ prompt = PROMPT_TEMPLATE.format(page_snippets=snippets)
171
+
172
+ # --- LLM call with progressive image backoff ---
173
+ pages_sent = list(selected_pages)
174
+ llm_res = None
175
+ while pages_sent:
176
+ images = [page_png_map[p] for p in pages_sent]
177
+ msg = make_user_message_with_images(prompt, images)
178
+
179
+ log(f"Calling OpenRouter (images={len(images)})...")
180
+ llm_res = chat_completion(
181
+ api_key=st.openrouter_api_key,
182
+ model=model,
183
+ messages=[msg],
184
+ max_tokens=4096,
185
+ temperature=0.0,
186
+ require_json=True,
187
+ )
188
+
189
+ log(f"finish_reason={llm_res.finish_reason} native={llm_res.native_finish_reason} content_len={len(llm_res.content)}")
190
+
191
+ # save raw response for debugging
192
+ try:
193
+ import os
194
+ os.makedirs(debug_dir, exist_ok=True)
195
+ with open(f"{debug_dir}/openrouter_raw_response.json", "w", encoding="utf-8") as f:
196
+ json.dump(llm_res.raw, f, indent=2)
197
+ except Exception:
198
+ pass
199
+
200
+ if llm_res.finish_reason == "error" or ("error" in llm_res.raw and llm_res.raw["error"]):
201
+ log("OpenRouter returned an error payload (see debug/openrouter_raw_response.json). Backing off images...")
202
+ elif llm_res.content.strip():
203
+ break
204
+
205
+ if len(pages_sent) <= 3:
206
+ break
207
+ pages_sent = pages_sent[:-2]
208
+ log(f"Retrying with fewer images. Now sending pages: {[p+1 for p in pages_sent]}")
209
+
210
+ if not llm_res:
211
+ raise RuntimeError("LLM call never executed.")
212
+
213
+ raw_text = (llm_res.content or "").strip()
214
+ log("DEBUG: raw model output (first 1200 chars):")
215
+ print(raw_text[:1200], flush=True)
216
+
217
+ # --- Parse JSON with repair fallback ---
218
+ try:
219
+ result = robust_json_loads(raw_text)
220
+ log("Parsed JSON successfully.")
221
+ except Exception as e:
222
+ log(f"JSON parse failed: {e}")
223
+ # Save raw text
224
+ try:
225
+ import os
226
+ os.makedirs(debug_dir, exist_ok=True)
227
+ with open(f"{debug_dir}/llm_raw_output.txt", "w", encoding="utf-8") as f:
228
+ f.write(raw_text)
229
+ except Exception:
230
+ pass
231
+
232
+ # Repair pass with free-tier text model
233
+ repair_model = choose_any_free_text_model(st.openrouter_api_key, preferred=[
234
+ model, # try same model first
235
+ "google/gemma-3-12b-it:free",
236
+ "amazon/nova-2-lite-v1:free",
237
+ "nvidia/nemotron-nano-12b-v2-vl:free",
238
+ ])
239
+ log(f"Attempting JSON repair using: {repair_model}")
240
+ try:
241
+ result = repair_to_json(
242
+ api_key=st.openrouter_api_key,
243
+ model=repair_model,
244
+ bad_output=raw_text if raw_text else json.dumps(llm_res.raw),
245
+ schema_hint=SCHEMA_HINT,
246
+ )
247
+ log("Repair JSON succeeded.")
248
+ except Exception as e2:
249
+ log(f"Repair JSON failed: {e2}")
250
+ # Final safe fallback
251
+ result = {
252
+ "balance_sheet": {"start_page": None, "end_page": None, "confidence": 0.0, "evidence_pages": [], "title": None},
253
+ "profit_and_loss": {"start_page": None, "end_page": None, "confidence": 0.0, "evidence_pages": [], "title": None},
254
+ "cash_flow": {"start_page": None, "end_page": None, "confidence": 0.0, "evidence_pages": [], "title": None},
255
+ "notes": [
256
+ "Model output could not be parsed as JSON.",
257
+ "Check debug/openrouter_raw_response.json and debug/llm_raw_output.txt",
258
+ ],
259
+ }
260
+
261
+ result = validate_ranges(result, page_count=page_count)
262
+ result["debug"] = {
263
+ "model_used": model,
264
+ "pages_sent": [p + 1 for p in pages_sent],
265
+ "candidate_pages": candidates,
266
+ "finish_reason": llm_res.finish_reason,
267
+ "native_finish_reason": llm_res.native_finish_reason,
268
+ }
269
+
270
+ if output_path:
271
+ with open(output_path, "w", encoding="utf-8") as f:
272
+ json.dump(result, f, indent=2)
273
+ log(f"Saved output: {output_path}")
274
+
275
+ return result
276
+
277
+
278
+ def main():
279
+ ap = argparse.ArgumentParser()
280
+ ap.add_argument("--pdf", required=True, help="Path to financial report PDF")
281
+ ap.add_argument("--out", default="ranges.json", help="Output JSON path")
282
+ ap.add_argument("--debug_dir", default="debug", help="Folder to write debug artifacts")
283
+ args = ap.parse_args()
284
+
285
+ # Call the core logic
286
+ result = analyze_pdf(
287
+ pdf_path=args.pdf,
288
+ output_path=args.out,
289
+ debug_dir=args.debug_dir
290
+ )
291
+
292
+ # Print result to stdout for CLI use
293
+ print(json.dumps(result, indent=2), flush=True)
294
+
295
+
296
+ if __name__ == "__main__":
297
+ main()
openrouter_client.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import base64
3
+ import json
4
+ import re
5
+ from dataclasses import dataclass
6
+ from typing import Any, Dict, List, Optional, Tuple
7
+
8
+ import requests
9
+
10
+ OPENROUTER_CHAT_URL = "https://openrouter.ai/api/v1/chat/completions"
11
+ OPENROUTER_MODELS_URL = "https://openrouter.ai/api/v1/models"
12
+
13
+
@dataclass
class ChatResult:
    """Normalized view of one OpenRouter chat-completion response.

    Attributes:
        content: Assistant message text ("" when an error payload came back).
        finish_reason: OpenAI-style finish reason, if reported.
        native_finish_reason: Provider-native finish reason, if reported.
        tool_calls: Any tool/function calls present on the message.
        raw: Full decoded JSON response, kept for debugging.
    """

    content: str
    finish_reason: str | None
    native_finish_reason: str | None
    tool_calls: Any
    raw: dict
21
+
22
+
def list_models(api_key: str) -> dict:
    """Fetch the full OpenRouter /models catalog as a decoded JSON dict."""
    response = requests.get(
        OPENROUTER_MODELS_URL,
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=60,
    )
    response.raise_for_status()
    return response.json()
28
+
29
+
def choose_free_vision_model(api_key: str, preferred: list[str]) -> str:
    """Pick a zero-cost, image-capable model id from the OpenRouter catalog.

    Preferred ids are tried first, in order; otherwise the first free vision
    model found in the catalog is returned.

    Raises:
        RuntimeError: if the catalog contains no free vision-capable model.
    """
    catalog = list_models(api_key).get("data", [])
    models_by_id = {entry.get("id"): entry for entry in catalog}

    def _is_free(entry: dict) -> bool:
        # Both prompt and completion pricing must be exactly zero; unknown
        # pricing defaults to "1" so it counts as paid.
        pricing = entry.get("pricing") or {}
        try:
            return float(pricing.get("prompt", "1")) == 0.0 and float(pricing.get("completion", "1")) == 0.0
        except Exception:
            return False

    def _accepts_images(entry: dict) -> bool:
        arch = entry.get("architecture") or {}
        return "image" in set(arch.get("input_modalities") or [])

    # Preferred ids first.
    for model_id in preferred:
        candidate = models_by_id.get(model_id)
        if candidate and _is_free(candidate) and _accepts_images(candidate):
            return model_id

    # Otherwise any free vision model, in catalog order.
    for candidate in catalog:
        if _is_free(candidate) and _accepts_images(candidate):
            return candidate.get("id")

    raise RuntimeError("Could not find any free vision-capable model in /models.")
58
+
59
+
def choose_any_free_text_model(api_key: str, preferred: list[str] | None = None) -> str:
    """Pick a zero-cost model id that accepts text input.

    Preferred ids (if given) are tried first; otherwise the first free
    text-input model in the catalog wins.

    Raises:
        RuntimeError: if no free text-capable model exists.
    """
    catalog = list_models(api_key).get("data", [])
    models_by_id = {entry.get("id"): entry for entry in catalog}

    def _is_free(entry: dict) -> bool:
        pricing = entry.get("pricing") or {}
        try:
            return float(pricing.get("prompt", "1")) == 0.0 and float(pricing.get("completion", "1")) == 0.0
        except Exception:
            return False

    def _accepts_text(entry: dict) -> bool:
        arch = entry.get("architecture") or {}
        return "text" in set(arch.get("input_modalities") or [])

    for model_id in preferred or []:
        candidate = models_by_id.get(model_id)
        if candidate and _is_free(candidate) and _accepts_text(candidate):
            return model_id

    for candidate in catalog:
        if _is_free(candidate) and _accepts_text(candidate):
            return candidate.get("id")

    raise RuntimeError("Could not find any free text-capable model in /models.")
87
+
88
+
89
+ def _img_bytes_to_data_url(png_bytes: bytes) -> str:
90
+ b64 = base64.b64encode(png_bytes).decode("utf-8")
91
+ return f"data:image/png;base64,{b64}"
92
+
93
+
def make_user_message_with_images(prompt_text: str, images: list[bytes]) -> dict:
    """
    Build an OpenAI-style user message carrying text plus inline PNG images.

    OpenRouter follows the OpenAI chat schema; some SDK examples show
    imageUrl (camelCase), so both key spellings are emitted for maximum
    compatibility.
    """
    parts: list[dict] = [{"type": "text", "text": prompt_text}]
    for png in images:
        # Inline each image as a base64 data: URL.
        data_url = "data:image/png;base64," + base64.b64encode(png).decode("utf-8")
        parts.append({
            "type": "image_url",
            "image_url": {"url": data_url},  # OpenAI-style
            "imageUrl": {"url": data_url},   # SDK-style
        })
    return {"role": "user", "content": parts}
110
+
111
+
def chat_completion(
    api_key: str,
    model: str,
    messages: list[dict],
    max_tokens: int = 2000,
    temperature: float = 0.0,
    require_json: bool = True,
    extra: dict | None = None,
) -> ChatResult:
    """POST a chat-completion request to OpenRouter and normalize the reply.

    Args:
        api_key: OpenRouter bearer token.
        model: Model id to invoke.
        messages: OpenAI-style chat messages.
        max_tokens: Completion token cap.
        temperature: Sampling temperature.
        require_json: When True, request JSON mode via response_format.
        extra: Optional extra payload fields merged into the request body.

    Returns:
        A ChatResult; content="" and finish_reason="error" when OpenRouter
        returns an error payload (which can arrive even with HTTP 200).

    Raises:
        requests.HTTPError: for non-2xx HTTP responses.
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "http://localhost",
        "X-Title": "fin-statement-page-locator",
    }

    payload: dict[str, Any] = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
        # Force no tool calls even if provider supports them
        "tool_choice": "none",
    }

    if require_json:
        # OpenRouter supports response_format json_object (JSON mode)
        payload["response_format"] = {"type": "json_object"}

    if extra:
        payload.update(extra)

    r = requests.post(OPENROUTER_CHAT_URL, headers=headers, json=payload, timeout=180)
    r.raise_for_status()
    data = r.json()

    # OpenRouter can return errors at top-level even with HTTP 200 in some scenarios
    if isinstance(data, dict) and "error" in data and data["error"]:
        # keep raw for debugging
        return ChatResult(
            content="",
            finish_reason="error",
            native_finish_reason=None,
            tool_calls=None,
            raw=data,
        )

    # Defensive indexing: an empty "choices" list yields an empty choice dict.
    choice0 = (data.get("choices") or [{}])[0]
    msg = choice0.get("message") or {}

    # Content may be None; normalize to a stripped string. Tool calls appear
    # under either snake_case or camelCase depending on the provider.
    content = (msg.get("content") or "").strip()
    tool_calls = msg.get("tool_calls") or msg.get("toolCalls")

    return ChatResult(
        content=content,
        finish_reason=choice0.get("finish_reason"),
        native_finish_reason=choice0.get("native_finish_reason"),
        tool_calls=tool_calls,
        raw=data,
    )
172
+
173
+
174
+ def _extract_json_from_codeblock(s: str) -> str | None:
175
+ # ```json ... ```
176
+ m = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", s, flags=re.IGNORECASE)
177
+ if m:
178
+ return m.group(1).strip()
179
+ return None
180
+
181
+
182
+ def _extract_first_balanced_object(s: str) -> str | None:
183
+ """
184
+ Extract the first balanced {...} JSON object from arbitrary text.
185
+ """
186
+ start = s.find("{")
187
+ if start == -1:
188
+ return None
189
+
190
+ depth = 0
191
+ for i in range(start, len(s)):
192
+ ch = s[i]
193
+ if ch == "{":
194
+ depth += 1
195
+ elif ch == "}":
196
+ depth -= 1
197
+ if depth == 0:
198
+ return s[start : i + 1]
199
+ return None
200
+
201
+
def robust_json_loads(s: str) -> dict:
    """Parse model output into a dict, tolerating fences and surrounding prose.

    Strategy: (1) parse directly, (2) parse the contents of a markdown code
    fence, (3) parse the first balanced {...} object found in the text.

    Raises:
        ValueError: when the input is empty or contains no parseable JSON.
    """
    text = (s or "").strip()
    if not text:
        raise ValueError("Empty model content (no JSON to parse).")

    # 1) Happy path: the content is already valid JSON.
    try:
        return json.loads(text)
    except Exception:
        pass

    # 2) JSON wrapped in a markdown code fence.
    fenced = _extract_json_from_codeblock(text)
    if fenced:
        try:
            return json.loads(fenced)
        except Exception:
            pass

    # 3) First balanced object embedded in prose.
    candidate = _extract_first_balanced_object(text)
    if candidate:
        return json.loads(candidate)

    raise ValueError("Could not parse JSON from model output (no valid JSON object found).")
227
+
228
+
def repair_to_json(
    api_key: str,
    model: str,
    bad_output: str,
    schema_hint: str,
) -> dict:
    """
    Ask a free model to convert arbitrary text into valid JSON for our schema.

    Sends one JSON-mode chat request and parses the reply with
    robust_json_loads, so fenced or prose-wrapped answers still work.
    """
    repair_prompt = f"""Convert the following content into VALID JSON ONLY.
No markdown, no backticks, no explanations.

Schema (must match keys/types):
{schema_hint}

Content to convert:
{bad_output}
"""
    response = chat_completion(
        api_key=api_key,
        model=model,
        messages=[{"role": "user", "content": repair_prompt}],
        max_tokens=900,
        temperature=0.0,
        require_json=True,
    )
    return robust_json_loads(response.content)
pdf_io.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ from typing import List, Optional, Tuple
4
+ import fitz # PyMuPDF
5
+ from PIL import Image
6
+ import io
7
+
@dataclass
class PageText:
    """Text recovered from a single PDF page.

    Attributes:
        page_index: 0-based page index within the document.
        extracted_text: Digitally embedded (selectable) text, possibly empty.
        ocr_text: Tesseract OCR output ("" when OCR was not run).
        used_ocr: True when the page looked scanned and OCR was run.
    """

    page_index: int  # 0-based
    extracted_text: str
    ocr_text: str
    used_ocr: bool
15
+ def _safe_text(s: str) -> str:
16
+ return (s or "").replace("\x00", " ").strip()
17
+
def render_page_to_pil(doc: fitz.Document, page_index: int, dpi: int) -> Image.Image:
    """Rasterize one PDF page to an RGB PIL image at the requested DPI."""
    # PDF user space is 72 DPI, so scale by dpi/72 on both axes.
    scale = dpi / 72.0
    pix = doc.load_page(page_index).get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
    return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
25
+
def ocr_pil_image(img: Image.Image, lang: str = "eng") -> str:
    """Run Tesseract OCR on a PIL image and return sanitized text.

    Raises:
        RuntimeError: when pytesseract (or the system Tesseract) is absent.
    """
    # Imported lazily so the module loads even without OCR support installed.
    try:
        import pytesseract
    except Exception as e:
        raise RuntimeError(
            "pytesseract not available. Install pytesseract and system Tesseract OCR."
        ) from e

    # --psm 6 assumes a single uniform block of text, which works well for
    # financial tables and headings.
    raw = pytesseract.image_to_string(img, lang=lang, config="--psm 6")
    return _safe_text(raw)
37
+
def is_likely_scanned(extracted_text: str, min_chars: int) -> bool:
    """Heuristic: a page with almost no selectable text is probably a scan."""
    # Same normalization as _safe_text, inlined: drop NULs, trim, count.
    cleaned = (extracted_text or "").replace("\x00", " ").strip()
    return len(cleaned) < min_chars
41
+
def extract_texts_from_pdf(
    pdf_path: str,
    dpi: int,
    ocr_lang: str,
    min_text_chars_for_digital: int,
) -> Tuple[List[PageText], int]:
    """Extract text from every page, OCR-ing pages that look scanned.

    Args:
        pdf_path: Path to the PDF file.
        dpi: Render resolution used when a page must be rasterized for OCR.
        ocr_lang: Tesseract language code.
        min_text_chars_for_digital: Pages with fewer extracted characters
            than this are treated as scanned and sent through OCR.

    Returns:
        A (page_texts, page_count) tuple; page_texts is ordered by page index.

    Raises:
        RuntimeError: from ocr_pil_image when OCR is needed but unavailable.
    """
    doc = fitz.open(pdf_path)
    page_count = doc.page_count
    results: List[PageText] = []

    for i in range(page_count):
        page = doc.load_page(i)
        extracted = _safe_text(page.get_text("text"))

        if is_likely_scanned(extracted, min_text_chars_for_digital):
            # Scanned page: rasterize it and run Tesseract.
            img = render_page_to_pil(doc, i, dpi=dpi)
            ocr_txt = ocr_pil_image(img, lang=ocr_lang)
            results.append(PageText(i, extracted_text=extracted, ocr_text=ocr_txt, used_ocr=True))
        else:
            # Digital page: the embedded text is good enough, skip OCR.
            results.append(PageText(i, extracted_text=extracted, ocr_text="", used_ocr=False))

    doc.close()
    return results, page_count
65
+
def render_pages_to_png_bytes(pdf_path: str, page_indices: List[int], dpi: int) -> dict[int, bytes]:
    """Render the given pages to PNG-encoded bytes.

    Args:
        pdf_path: Path to the PDF file.
        page_indices: 0-based page indices to render.
        dpi: Render resolution.

    Returns:
        Mapping of 0-based page index -> PNG image bytes.
    """
    doc = fitz.open(pdf_path)
    out: dict[int, bytes] = {}
    try:
        for p in page_indices:
            img = render_page_to_pil(doc, p, dpi=dpi)
            buf = io.BytesIO()
            img.save(buf, format="PNG")
            out[p] = buf.getvalue()
    finally:
        # Always release the document handle, even if rendering raises
        # mid-way (previously an exception leaked the open document).
        doc.close()
    return out
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
fastapi
uvicorn
python-multipart
pymupdf
pillow
requests
python-dotenv
# Required by pdf_io.ocr_pil_image; the Dockerfile installs the system
# tesseract-ocr binary, but the Python wrapper must be pip-installed too.
pytesseract
statement_candidates.py ADDED
@@ -0,0 +1,545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # statement_candidates.py
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ from dataclasses import dataclass
6
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
7
+ import difflib
8
+
9
+
10
# =========================
# Targets (you want ONLY these 3)
# =========================
# The three primary statements this module ultimately locates.
TARGETS = ["balance_sheet", "profit_and_loss", "cash_flow"]

# Auxiliary statements used ONLY for delimiting ranges (helpful in 10-K order)
AUX = ["comprehensive_income", "equity", "notes"]

# =========================
# Title variants (based on your screenshots + common 10-K phrasing)
# =========================
# Known page-top headings per statement key; matched fuzzily (see
# detect_title) so OCR typos still hit.
TITLE_VARIANTS: Dict[str, List[str]] = {
    "balance_sheet": [
        "Consolidated Balance Sheets",
        "Balance Sheets",
        "Statement of Financial Position",
    ],
    "profit_and_loss": [
        "Consolidated Statements of Earnings",  # AbbVie screenshot
        "Consolidated Statements of Operations",
        "Consolidated Statements of Income",
        "Income Statement",
        "Statement of Profit and Loss",
    ],
    "cash_flow": [
        "Consolidated Statements of Cash Flows",
        "Statement of Cash Flows",
        "Cash Flow Statement",
    ],
    # auxiliary
    "comprehensive_income": [
        "Consolidated Statements of Comprehensive Income",
        "Statement of Comprehensive Income",
    ],
    "equity": [
        "Consolidated Statements of Equity",
        "Statement of Stockholders' Equity",
        "Statement of Shareholders' Equity",
    ],
    "notes": [
        "Notes to Consolidated Financial Statements",
        "Notes to Financial Statements",
    ],
}

# Footer phrase (exact idea from your images)
# Lowercased because it is tested against lowercased page text.
INTEGRAL_FOOTER = "the accompanying notes are an integral part"

# =========================
# Signature table line-items (increase precision against note tables)
# =========================
# Lowercased substrings expected inside the body of each statement; multiple
# hits raise the score in score_statement_page.
SIG_TERMS: Dict[str, List[str]] = {
    "balance_sheet": [
        "total assets",
        "total liabilities",
        "total equity",
        "stockholders' equity",
        "shareholders' equity",
        "assets",
        "liabilities and equity",
        "current assets",
        "current liabilities",
    ],
    "profit_and_loss": [
        "net revenues",
        "net sales",
        "revenue",
        "cost of products sold",
        "cost of sales",
        "gross profit",
        "operating income",
        "operating earnings",
        "net earnings",
        "net income",
        "earnings per share",
        "basic",
        "diluted",
    ],
    "cash_flow": [
        "cash flows from operating activities",
        "cash flows from investing activities",
        "cash flows from financing activities",
        "net cash provided by operating activities",
        "net cash used in investing activities",
        "net cash used in financing activities",
        "cash and cash equivalents, end of year",
        "cash and equivalents, end of year",
        "net change in cash",
    ],
    # aux
    "notes": ["note 1", "note 2", "notes to consolidated financial statements"],
}

# Matches pages that open with a "Note 12 ..." heading (used as a penalty).
NOTE_HEADING_RE = re.compile(r"^\s*note\s+\d+\b", re.IGNORECASE)

# Typical TOC “dot leaders”
DOT_LEADER_RE = re.compile(r"\.{5,}")

# Item 8 TOC trigger
ITEM8_RE = re.compile(r"\bITEM\s+8\.\s+FINANCIAL\s+STATEMENTS\s+AND\s+SUPPLEMENTARY\s+DATA\b", re.IGNORECASE)
110
+
111
+
112
+ # =========================
113
+ # Page object -> combined text
114
+ # =========================
115
+ def _combined_text(page_obj: Any) -> str:
116
+ """
117
+ Works with your PageText dataclass:
118
+ extracted_text + ocr_text
119
+ Also supports dict/object string fallback.
120
+ """
121
+ if page_obj is None:
122
+ return ""
123
+ if isinstance(page_obj, str):
124
+ return page_obj
125
+
126
+ # dict-like
127
+ if isinstance(page_obj, dict):
128
+ a = page_obj.get("extracted_text") or page_obj.get("text") or ""
129
+ b = page_obj.get("ocr_text") or ""
130
+ return (a + "\n" + b).strip()
131
+
132
+ # attribute style
133
+ a = getattr(page_obj, "extracted_text", None) or getattr(page_obj, "text", None) or ""
134
+ b = getattr(page_obj, "ocr_text", None) or ""
135
+ return (a + "\n" + b).strip()
136
+
137
+
138
+ def _page_index(page_obj: Any, fallback: int) -> int:
139
+ if isinstance(page_obj, dict):
140
+ if isinstance(page_obj.get("page_index"), int):
141
+ return int(page_obj["page_index"])
142
+ v = getattr(page_obj, "page_index", None)
143
+ return int(v) if isinstance(v, int) else fallback
144
+
145
+
146
+ def _norm(s: str) -> str:
147
+ return re.sub(r"\s+", " ", (s or "")).strip().lower()
148
+
149
+
150
+ # =========================
151
+ # Fuzzy title detection (OCR typos tolerant)
152
+ # =========================
153
def _fuzzy_line_contains_title(top_lines: List[str], title: str, threshold: float = 0.86) -> bool:
    """
    True if any of *top_lines* contains *title* verbatim (after normalization)
    or fuzzy-matches it above *threshold* (tolerates OCR typos).
    """
    wanted = _norm(title)
    for raw in top_lines:
        line = _norm(raw)
        if not line:
            continue
        # exact containment beats the fuzzy check
        if wanted in line:
            return True
        similarity = difflib.SequenceMatcher(None, line, wanted).ratio()
        if similarity >= threshold:
            return True
    return False
167
+
168
+
169
def detect_title(text: str, stmt: str) -> bool:
    """Check whether any known title variant for *stmt* appears near the top of *text*."""
    # Statement titles live within the first ~14 lines on these filings.
    top_lines = [ln.strip() for ln in (text or "").splitlines()[:14] if ln.strip()]
    return any(
        _fuzzy_line_contains_title(top_lines, variant)
        for variant in TITLE_VARIANTS.get(stmt, [])
    )
176
+
177
+
178
+ # =========================
179
+ # Footer internal page number extraction (10-K style)
180
+ # =========================
181
FOOTER_PIPE_RE = re.compile(r"\|\s*(\d{1,4})\s*$", re.MULTILINE)
FOOTER_FORM_RE = re.compile(r"form\s+10-?k\s*\|\s*(\d{1,4})\s*$", re.IGNORECASE | re.MULTILINE)

def extract_footer_internal_page(text: str) -> Optional[int]:
    """
    Pull the printed (internal) page number out of a 10-K footer, if present.

    Tries "... | 57" and "Form 10-K | 57" footers first, then falls back to a
    digits-only line among the last few non-empty lines.
    """
    t = text or ""

    # Footer patterns, in priority order.
    for pattern in (FOOTER_PIPE_RE, FOOTER_FORM_RE):
        found = pattern.findall(t)
        if found:
            return int(found[-1])

    # Fallback: only accept lines that are NOTHING but digits, and only near
    # the bottom of the page (avoids grabbing numbers out of table cells).
    tail = [ln.strip() for ln in t.splitlines() if ln.strip()]
    for candidate in reversed(tail[-6:]):
        if re.fullmatch(r"\d{1,4}", candidate):
            return int(candidate)

    return None
202
+
203
+
204
+ # =========================
205
+ # Item 8 TOC page detection + TOC parsing
206
+ # AbbVie TOC is "title line" then next line has page number ("55")
207
+ # =========================
208
def find_item8_toc_page(all_texts: Sequence[str]) -> Optional[int]:
    """
    Return the index of the first Item 8 page that LOOKS like an index/TOC
    (mentions 'Page' and contains dot leaders), or None.
    """
    for i, txt in enumerate(all_texts):
        if not ITEM8_RE.search(txt or ""):
            continue
        lowered = _norm(txt)
        # Require both signals so a plain Item 8 heading page doesn't qualify.
        if "page" in lowered and DOT_LEADER_RE.search(txt or "") is not None:
            return i
    return None
222
+
223
+
224
def parse_statement_index_numbers(toc_text: str) -> Dict[str, int]:
    """
    Parse internal page numbers for each statement out of the index text.

    Handles both layouts:
      - same line:  "Consolidated Balance Sheets .... 57"
      - two lines:  "Consolidated Balance Sheets" then "57" (AbbVie)
    """
    pats = {
        "profit_and_loss": re.compile(r"consolidated\s+statements?\s+of\s+(earnings|operations|income)", re.I),
        "comprehensive_income": re.compile(r"consolidated\s+statements?\s+of\s+comprehensive\s+income", re.I),
        "balance_sheet": re.compile(r"consolidated\s+balance\s+sheets?|statement\s+of\s+financial\s+position", re.I),
        "equity": re.compile(r"consolidated\s+statements?\s+of\s+equity|stockholders[’']\s+equity|shareholders[’']\s+equity", re.I),
        "cash_flow": re.compile(r"consolidated\s+statements?\s+of\s+cash\s+flows?", re.I),
        "notes": re.compile(r"notes\s+to\s+consolidated\s+financial\s+statements", re.I),
    }

    lines = [ln.strip() for ln in (toc_text or "").splitlines()]
    out: Dict[str, int] = {}

    for i, ln in enumerate(lines):
        if not ln:
            continue

        for key, pat in pats.items():
            if not pat.search(ln):
                continue

            # Case 1: page number at the end of this same line.
            tail_nums = re.findall(r"(\d{1,4})\s*$", ln)
            if tail_nums and ln.endswith(tail_nums[-1]):
                out[key] = int(tail_nums[-1])
                continue

            # Case 2: page number alone on the next non-empty line.
            j = i + 1
            while j < len(lines) and not lines[j]:
                j += 1
            if j < len(lines) and re.fullmatch(r"\d{1,4}", lines[j]):
                out[key] = int(lines[j])

    return out
266
+
267
+
268
def build_internal_to_pdf_map(all_texts: Sequence[str]) -> Dict[int, int]:
    """
    Build internal_page_number -> pdf_page_index from footer page numbers.
    The FIRST pdf page carrying a given internal number wins.
    """
    mapping: Dict[int, int] = {}
    for pdf_i, txt in enumerate(all_texts):
        internal = extract_footer_internal_page(txt or "")
        if internal is not None and internal not in mapping:
            mapping[internal] = pdf_i
    return mapping
279
+
280
+
281
def map_internal_to_pdf(internal: int, internal_to_pdf: Dict[int, int]) -> Optional[int]:
    """
    Map an internal page number to a pdf page index.

    Uses the direct mapping when available; otherwise extrapolates from the
    nearest known internal page, assuming mostly consecutive numbering.
    Returns None when the mapping is empty.
    """
    direct = internal_to_pdf.get(internal)
    if direct is not None:
        return direct

    if not internal_to_pdf:
        return None

    # Nearest known internal page; sorted so ties break toward the smaller key.
    nearest = min(sorted(internal_to_pdf), key=lambda k: abs(k - internal))
    return internal_to_pdf[nearest] + (internal - nearest)
298
+
299
+
300
+ # =========================
301
+ # Strong statement scoring (only used if TOC mapping fails)
302
+ # =========================
303
def _page_stats(text: str) -> Dict[str, float]:
    """
    Compute table-ness signals for a page: numeric token ratio, year mentions,
    currency markers, parenthesized negatives, and the 'integral part' footer.
    """
    t = text or ""
    low = t.lower()

    # numeric signals
    years = len(re.findall(r"\b20\d{2}\b", t))
    currency = len(re.findall(r"[$€£]|usd|inr|eur|gbp", low))
    paren_negatives = len(re.findall(r"\(\s*\d", t))  # "(123)" negatives
    has_integral = 1.0 if INTEGRAL_FOOTER in low else 0.0

    tokens = re.findall(r"[A-Za-z]+|\d+(?:,\d{3})*(?:\.\d+)?", t)
    if not tokens:
        return dict(num_ratio=0.0, year_count=float(years), currency=float(currency),
                    paren=float(paren_negatives), integral=has_integral)

    numeric = sum(1 for tok in tokens if re.fullmatch(r"\d+(?:,\d{3})*(?:\.\d+)?", tok))
    alpha = sum(1 for tok in tokens if re.fullmatch(r"[A-Za-z]+", tok))
    ratio = numeric / max(1.0, numeric + alpha)

    return dict(num_ratio=float(ratio), year_count=float(years), currency=float(currency),
                paren=float(paren_negatives), integral=has_integral)
324
+
325
+
326
def score_statement_page(text: str, stmt: str) -> Tuple[float, Dict[str, Any]]:
    """Score how likely *text* is the primary statement page for *stmt*.

    Fallback heuristic used only when TOC mapping fails. Combines a title
    match near the page top, the 'integral part' footer, signature line-item
    hits, and table-ness stats, minus penalties for note pages and TOC pages.

    Returns:
        (score, reasons) where reasons records which signals/penalties fired.
    """
    low = (text or "").lower()
    top = (text or "")[:1200]
    st = _page_stats(text)

    reasons = {"title": False, "sig_hits": [], "integral": False, "penalties": [], "stats": st}
    score = 0.0

    # Title near top is a MUST (or fuzzy)
    if detect_title(top, stmt):
        score += 60.0
        reasons["title"] = True
    else:
        # without title, heavily downrank (note tables can be very numeric)
        score -= 25.0
        reasons["penalties"].append("no_title(-25)")

    # Integral footer is very characteristic of primary statements (seen in your screenshots)
    if st["integral"] > 0:
        score += 18.0
        reasons["integral"] = True

    # Signature line items: require multiple hits; capped at 10 so one very
    # term-dense page can't dominate the ranking.
    hits = 0
    for term in SIG_TERMS.get(stmt, []):
        if term in low:
            hits += 1
            reasons["sig_hits"].append(term)
    score += min(hits, 10) * 6.0  # stronger weight

    # Table-ness: years + currency + negative brackets + numeric ratio
    score += st["num_ratio"] * 30.0
    score += min(st["year_count"], 10.0) * 1.5
    score += min(st["currency"], 10.0) * 2.0
    score += min(st["paren"], 10.0) * 1.0

    # Hard penalties for NOTE pages ("Note N ..." within the first ~220 chars)
    if NOTE_HEADING_RE.search((text or "")[:220]):
        score -= 60.0
        reasons["penalties"].append("note_heading(-60)")

    # If it looks like TOC index page, punish (dot leaders)
    if DOT_LEADER_RE.search(text or ""):
        score -= 30.0
        reasons["penalties"].append("toc_dotleaders(-30)")

    # Guardrails:
    # If title found but it doesn't look like a table at all, punish
    if reasons["title"] and st["num_ratio"] < 0.10 and st["year_count"] < 1:
        score -= 35.0
        reasons["penalties"].append("title_without_table(-35)")

    # Require at least 2 signature hits for high confidence
    if hits < 2:
        score -= 18.0
        reasons["penalties"].append("low_sig_hits(<2)(-18)")

    return score, reasons
384
+
385
+
386
+ # =========================
387
+ # Range inference from ordered statement starts
388
+ # =========================
389
def infer_ranges_from_starts(
    starts_pdf: Dict[str, int],
    page_count: int,
    ordered_keys: List[str],
) -> Dict[str, Tuple[int, int]]:
    """
    Turn ordered statement start pages (0-based pdf indices) into inclusive
    page ranges: each statement runs until the page before the next one
    starts; the last known statement gets a single-page range.
    Only TARGET statements are returned.
    """
    # keep only keys with a real integer start, ordered by page
    known = sorted(
        ((k, starts_pdf[k]) for k in ordered_keys if k in starts_pdf and isinstance(starts_pdf[k], int)),
        key=lambda kv: kv[1],
    )

    ranges: Dict[str, Tuple[int, int]] = {}
    for pos, (key, start) in enumerate(known):
        following = known[pos + 1][1] if pos + 1 < len(known) else None
        end = start if following is None else following - 1
        # clamp: never before the start page, never past the document
        end = min(max(end, start), page_count - 1)
        ranges[key] = (start, end)

    return {k: ranges[k] for k in TARGETS if k in ranges}
415
+
416
+
417
+ # =========================
418
+ # Public API
419
+ # =========================
420
def build_candidate_lists(
    pages: Sequence[Any],
    top_k: int = 25,
    debug: bool = True,
) -> Tuple[Dict[str, List[Tuple[int, float]]], Dict[str, Any]]:
    """
    Returns:
      candidates: {stmt: [(pdf_page_idx, score), ...]} for TARGETS only
      debug_info: contains toc/internal mapping and top explanations
    """
    all_texts = [_combined_text(p) for p in pages]
    page_count = len(all_texts)

    debug_info: Dict[str, Any] = {
        "item8_toc_page": None,
        "toc_internal": {},
        "internal_to_pdf_map_size": 0,
        "toc_pdf_targets_all": {},
        "heuristic_ranges_0_based": {},
        "top_scoring": {},
    }

    # ---- 1) TOC-based detection (most accurate on 10-K) ----
    toc_i = find_item8_toc_page(all_texts)
    if toc_i is not None:
        toc_text = all_texts[toc_i]
        # internal (printed) page numbers from the index, and the
        # internal -> pdf-index mapping built from page footers
        toc_internal = parse_statement_index_numbers(toc_text)
        internal_to_pdf = build_internal_to_pdf_map(all_texts)

        toc_pdf_all: Dict[str, int] = {}
        for k, internal_n in toc_internal.items():
            mapped = map_internal_to_pdf(internal_n, internal_to_pdf)
            # drop mappings that extrapolated outside the document
            if mapped is not None and 0 <= mapped < page_count:
                toc_pdf_all[k] = mapped

        debug_info.update({
            "item8_toc_page": toc_i,
            "toc_internal": toc_internal,
            "internal_to_pdf_map_size": len(internal_to_pdf),
            "toc_pdf_targets_all": toc_pdf_all,
        })

        # If we got our 3 targets, build direct ranges using the typical order:
        # Earnings -> Comprehensive Income -> Balance Sheet -> Equity -> Cash Flow -> Notes
        if all(k in toc_pdf_all for k in ["profit_and_loss", "balance_sheet", "cash_flow"]):
            ordered = ["profit_and_loss", "comprehensive_income", "balance_sheet", "equity", "cash_flow", "notes"]
            ranges = infer_ranges_from_starts(toc_pdf_all, page_count, ordered)
            debug_info["heuristic_ranges_0_based"] = ranges

            # Build candidates directly from these starts with huge confidence
            # (sentinel scores 999/950 mark TOC-derived pages; heuristic
            # scoring below is skipped entirely on this path)
            candidates = {k: [] for k in TARGETS}
            for k in TARGETS:
                start, end = ranges.get(k, (None, None))
                if start is None:
                    continue
                # prioritize start page; include end too
                candidates[k].append((start, 999.0))
                if end != start:
                    candidates[k].append((end, 950.0))
            return candidates, debug_info

    # ---- 2) Fallback: statement scoring over ALL pages ----
    candidates: Dict[str, List[Tuple[int, float]]] = {k: [] for k in TARGETS}
    reasons_store: Dict[str, Dict[int, Any]] = {k: {} for k in TARGETS}

    for i, p in enumerate(pages):
        idx = _page_index(p, i)
        txt = _combined_text(p)

        for stmt in TARGETS:
            sc, why = score_statement_page(txt, stmt)
            if sc > 0:
                candidates[stmt].append((idx, float(sc)))
                # store explanations only for interesting pages (titled or high score)
                if debug and (why["title"] or sc > 80):
                    reasons_store[stmt][idx] = why

    for stmt in TARGETS:
        candidates[stmt].sort(key=lambda x: x[1], reverse=True)
        # keep at least 8 candidates even if top_k is set lower
        candidates[stmt] = candidates[stmt][:max(8, top_k)]
        if debug:
            debug_info["top_scoring"][stmt] = [
                {"page": p, "score": round(s, 2), "why": reasons_store[stmt].get(p)}
                for p, s in candidates[stmt][:10]
            ]

    return candidates, debug_info
506
+
507
+
508
def select_pages_for_llm(
    candidates: Dict[str, List[Tuple[int, float]]],
    debug_info: Dict[str, Any],
    page_count: int,
    max_images: int,
) -> List[int]:
    """
    Choose which pdf page indices to render for the LLM.

    When TOC-derived ranges exist, send only those pages plus one neighbor on
    each side (highest precision). Otherwise send the top two candidates per
    target statement with their neighbors. Output is sorted and capped at
    *max_images* pages.
    """
    picked: List[int] = []
    seen: set = set()

    def add(page: int) -> None:
        # ignore out-of-range pages, duplicates, and anything past the cap
        if 0 <= page < page_count and page not in seen and len(picked) < max_images:
            seen.add(page)
            picked.append(page)

    # TOC ranges (best)
    ranges = debug_info.get("heuristic_ranges_0_based") or {}
    if ranges:
        for stmt in ["profit_and_loss", "balance_sheet", "cash_flow"]:
            if stmt not in ranges:
                continue
            start, end = ranges[stmt]
            for page in range(start, end + 1):
                add(page)
            add(start - 1)
            add(end + 1)
        return sorted(picked)

    # fallback: top candidates + neighbors
    for stmt in ["profit_and_loss", "balance_sheet", "cash_flow"]:
        for page, _score in candidates.get(stmt, [])[:2]:
            add(page)
            add(page - 1)
            add(page + 1)

    return sorted(picked)