FridayCodehhr committed on
Commit
a9d5e1b
·
verified ·
1 Parent(s): 689d59b

Upload 10 files

Browse files
Files changed (10) hide show
  1. Dockerfile +0 -8
  2. app.py +43 -39
  3. config.py +22 -3
  4. image_server_snippet.py +22 -0
  5. index.html +153 -18
  6. main.py +307 -217
  7. openrouter_client.py +87 -155
  8. pdf_io.py +28 -21
  9. requirements.txt +1 -1
  10. statement_candidates.py +369 -215
Dockerfile CHANGED
@@ -1,7 +1,6 @@
1
  # Use official Python runtime as a parent image
2
  FROM python:3.10-slim
3
 
4
- # Set the working directory in the container
5
  WORKDIR /app
6
 
7
  # Install system dependencies (Tesseract)
@@ -10,23 +9,16 @@ RUN apt-get update && apt-get install -y \
10
  libtesseract-dev \
11
  && rm -rf /var/lib/apt/lists/*
12
 
13
- # Copy requirements first to leverage Docker cache
14
  COPY requirements.txt .
15
-
16
- # Install Python dependencies
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
19
- # Copy the rest of the application code
20
  COPY . .
21
 
22
- # Create a user to run the app (security best practice, required by some environments)
23
  RUN useradd -m -u 1000 user
24
  USER user
25
  ENV HOME=/home/user \
26
  PATH=/home/user/.local/bin:$PATH
27
 
28
- # Expose port 7860 (Hugging Face Spaces default)
29
  EXPOSE 7860
30
 
31
- # Command to run the application
32
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
  # Use official Python runtime as a parent image
2
  FROM python:3.10-slim
3
 
 
4
  WORKDIR /app
5
 
6
  # Install system dependencies (Tesseract)
 
9
  libtesseract-dev \
10
  && rm -rf /var/lib/apt/lists/*
11
 
 
12
  COPY requirements.txt .
 
 
13
  RUN pip install --no-cache-dir -r requirements.txt
14
 
 
15
  COPY . .
16
 
 
17
  RUN useradd -m -u 1000 user
18
  USER user
19
  ENV HOME=/home/user \
20
  PATH=/home/user/.local/bin:$PATH
21
 
 
22
  EXPOSE 7860
23
 
 
24
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,18 +1,14 @@
1
- import os
2
  import shutil
3
  import tempfile
4
- import json
5
  from fastapi import FastAPI, File, UploadFile, HTTPException
6
- from fastapi.responses import JSONResponse, HTMLResponse
7
  from fastapi.staticfiles import StaticFiles
8
  from main import analyze_pdf
9
 
10
  app = FastAPI()
11
 
12
- # Mount static files to serve index.html
13
- # We assume index.html is in the same directory
14
- app.mount("/static", StaticFiles(directory="."), name="static")
15
-
16
  @app.get("/", response_class=HTMLResponse)
17
  async def read_root():
18
  with open("index.html", "r") as f:
@@ -23,45 +19,53 @@ async def analyze_endpoint(file: UploadFile = File(...)):
23
  if not file.filename.endswith(".pdf"):
24
  raise HTTPException(status_code=400, detail="File must be a PDF")
25
 
26
- # Save uploaded file to a temp location
27
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
28
- shutil.copyfileobj(file.file, tmp)
29
- tmp_path = tmp.name
 
 
 
 
 
 
 
 
30
 
31
  try:
32
- # Create a temp debug dir
33
- debug_dir = tempfile.mkdtemp()
34
-
35
- # Get API Key from environment (injected by Space secrets)
36
- api_key = os.getenv("OPENROUTER_API_KEY")
37
- if not api_key:
38
- raise HTTPException(status_code=500, detail="Server misconfigured: OPENROUTER_API_KEY missing")
39
-
40
- # Run analysis using the refactored main logic
41
- # We pass None for output_path so it doesn't try to write to a fixed file unless we want it to
42
- # But analyze_pdf writes to output_path if provided. We can just let it return the dict.
43
  result = analyze_pdf(
44
- pdf_path=tmp_path,
45
- output_path="", # Don't write to file, just return dict
46
- debug_dir=debug_dir,
47
- openrouter_api_key=api_key
48
  )
49
-
50
  return JSONResponse(content=result)
51
-
52
  except Exception as e:
53
  import traceback
54
  traceback.print_exc()
55
  raise HTTPException(status_code=500, detail=str(e))
56
- finally:
57
- # Cleanup
58
- if os.path.exists(tmp_path):
59
- os.remove(tmp_path)
60
- # We might want to keep debug dir for a bit or clean it up.
61
- # For a simple demo, we can clean it up or ignore it (tmp cleans up eventually on restart usually, but explicitly is better)
62
- if os.path.exists(debug_dir):
63
- shutil.rmtree(debug_dir, ignore_errors=True)
64
 
65
- if __name__ == "__main__":
66
- import uvicorn
67
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import shutil
2
  import tempfile
3
+ import os
4
  from fastapi import FastAPI, File, UploadFile, HTTPException
5
+ from fastapi.responses import HTMLResponse, JSONResponse, Response
6
  from fastapi.staticfiles import StaticFiles
7
  from main import analyze_pdf
8
 
9
  app = FastAPI()
10
 
11
+ # serve index.html at root
 
 
 
12
  @app.get("/", response_class=HTMLResponse)
13
  async def read_root():
14
  with open("index.html", "r") as f:
 
19
  if not file.filename.endswith(".pdf"):
20
  raise HTTPException(status_code=400, detail="File must be a PDF")
21
 
22
+ # Save to a known fixed path for the viewing endpoint
23
+ # Note: This is not thread-safe/multi-user safe, but sufficient for this local demo.
24
+ fixed_path = "latest_upload.pdf"
25
+ with open(fixed_path, "wb") as f:
26
+ # seek back to start if we copied it (but we didn't read it yet)
27
+ # Actually file is an UploadFile, we can just save it.
28
+ # But we already copied to tmp. Let's just use the tmp copy logic but to fixed path.
29
+ pass
30
+
31
+ # Actually, let's just write directly to fixed_path!
32
+ with open(fixed_path, "wb") as f:
33
+ shutil.copyfileobj(file.file, f)
34
 
35
  try:
 
 
 
 
 
 
 
 
 
 
 
36
  result = analyze_pdf(
37
+ pdf_path=fixed_path,
38
+ output_path="",
39
+ debug_dir=""
 
40
  )
 
41
  return JSONResponse(content=result)
 
42
  except Exception as e:
43
  import traceback
44
  traceback.print_exc()
45
  raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
46
 
47
import fitz

@app.get("/pdf/page/{page_num}")
async def get_pdf_page(page_num: int):
    """Render one page of the most recently uploaded PDF as a PNG.

    Args:
        page_num: 1-indexed page number requested by the frontend viewer.

    Returns:
        A PNG image ``Response``; 404 if nothing has been uploaded yet or
        the page number is out of range; 500 on any rendering error.
    """
    path = "latest_upload.pdf"
    if not os.path.exists(path):
        return Response(status_code=404)

    doc = None
    try:
        doc = fitz.open(path)
        if page_num < 1 or page_num > doc.page_count:
            return Response(status_code=404)

        page = doc.load_page(page_num - 1)
        # 150 DPI gives a decent resolution for web viewing
        # (PDF user space is 72 points per inch).
        zoom = 150 / 72.0
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat, alpha=False)
        img_bytes = pix.tobytes("png")
        return Response(content=img_bytes, media_type="image/png")
    except Exception as e:
        print(f"Error serving page: {e}")
        return Response(status_code=500)
    finally:
        # BUG FIX: the original only closed `doc` on the success and
        # in-range-404 paths, leaking the document handle whenever an
        # exception fired after fitz.open(). Always release it here.
        if doc is not None:
            doc.close()
config.py CHANGED
@@ -1,8 +1,10 @@
1
  from __future__ import annotations
 
2
  from dataclasses import dataclass
3
  import os
4
  from dotenv import load_dotenv
5
 
 
6
  @dataclass(frozen=True)
7
  class Settings:
8
  openrouter_api_key: str
@@ -13,16 +15,22 @@ class Settings:
13
  min_text_chars_for_digital: int
14
  topk_per_statement: int
15
 
 
 
 
 
 
16
  DEFAULT_FREE_VISION_MODELS = [
17
- # Free + vision-capable (as of their OpenRouter pages)
18
  "google/gemma-3-12b-it:free",
19
  "nvidia/nemotron-nano-12b-v2-vl:free",
20
  "amazon/nova-2-lite-v1:free",
21
  ]
22
 
 
23
  def load_settings(**kwargs) -> Settings:
24
  load_dotenv()
25
-
26
  api_key = kwargs.get("openrouter_api_key") or os.getenv("OPENROUTER_API_KEY", "").strip()
27
  if not api_key:
28
  raise RuntimeError("Missing OPENROUTER_API_KEY in environment/.env")
@@ -31,9 +39,18 @@ def load_settings(**kwargs) -> Settings:
31
  max_images = kwargs.get("max_images") or int(os.getenv("MAX_IMAGES", "12"))
32
  dpi = kwargs.get("dpi") or int(os.getenv("PDF_RENDER_DPI", "200"))
33
  ocr_lang = kwargs.get("ocr_lang") or os.getenv("OCR_LANG", "eng")
34
- min_text_chars_for_digital = kwargs.get("min_text_chars_for_digital") or int(os.getenv("MIN_TEXT_CHARS_FOR_DIGITAL", "80"))
 
 
35
  topk_per_statement = kwargs.get("topk_per_statement") or int(os.getenv("TOPK_PER_STATEMENT", "3"))
36
 
 
 
 
 
 
 
 
37
  return Settings(
38
  openrouter_api_key=api_key,
39
  openrouter_model=model,
@@ -42,4 +59,6 @@ def load_settings(**kwargs) -> Settings:
42
  ocr_lang=ocr_lang,
43
  min_text_chars_for_digital=min_text_chars_for_digital,
44
  topk_per_statement=topk_per_statement,
 
 
45
  )
 
1
  from __future__ import annotations
2
+
3
  from dataclasses import dataclass
4
  import os
5
  from dotenv import load_dotenv
6
 
7
+
8
  @dataclass(frozen=True)
9
  class Settings:
10
  openrouter_api_key: str
 
15
  min_text_chars_for_digital: int
16
  topk_per_statement: int
17
 
18
+ # block logic knobs
19
+ max_blocks_per_statement: int
20
+ continuation_max_forward: int
21
+
22
+
23
  DEFAULT_FREE_VISION_MODELS = [
24
+ # Free + vision-capable (as of their OpenRouter pages / availability changes over time)
25
  "google/gemma-3-12b-it:free",
26
  "nvidia/nemotron-nano-12b-v2-vl:free",
27
  "amazon/nova-2-lite-v1:free",
28
  ]
29
 
30
+
31
  def load_settings(**kwargs) -> Settings:
32
  load_dotenv()
33
+
34
  api_key = kwargs.get("openrouter_api_key") or os.getenv("OPENROUTER_API_KEY", "").strip()
35
  if not api_key:
36
  raise RuntimeError("Missing OPENROUTER_API_KEY in environment/.env")
 
39
  max_images = kwargs.get("max_images") or int(os.getenv("MAX_IMAGES", "12"))
40
  dpi = kwargs.get("dpi") or int(os.getenv("PDF_RENDER_DPI", "200"))
41
  ocr_lang = kwargs.get("ocr_lang") or os.getenv("OCR_LANG", "eng")
42
+ min_text_chars_for_digital = kwargs.get("min_text_chars_for_digital") or int(
43
+ os.getenv("MIN_TEXT_CHARS_FOR_DIGITAL", "80")
44
+ )
45
  topk_per_statement = kwargs.get("topk_per_statement") or int(os.getenv("TOPK_PER_STATEMENT", "3"))
46
 
47
+ max_blocks_per_statement = kwargs.get("max_blocks_per_statement") or int(
48
+ os.getenv("MAX_BLOCKS_PER_STATEMENT", "2")
49
+ )
50
+ continuation_max_forward = kwargs.get("continuation_max_forward") or int(
51
+ os.getenv("CONTINUATION_MAX_FORWARD", "6")
52
+ )
53
+
54
  return Settings(
55
  openrouter_api_key=api_key,
56
  openrouter_model=model,
 
59
  ocr_lang=ocr_lang,
60
  min_text_chars_for_digital=min_text_chars_for_digital,
61
  topk_per_statement=topk_per_statement,
62
+ max_blocks_per_statement=max_blocks_per_statement,
63
+ continuation_max_forward=continuation_max_forward,
64
  )
image_server_snippet.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import Response
import fitz
import io

# We will need to inject this into app.py
def serve_page_image(pdf_path: str, page_num: int, dpi: int = 150):
    """Render page ``page_num`` (1-indexed) of ``pdf_path`` to a PNG Response.

    Args:
        pdf_path: Path of the PDF file on disk.
        page_num: 1-indexed page number to render.
        dpi: Target render resolution (PDF user space is 72 points/inch).

    Returns:
        A PNG image ``Response``; 404 when the page number is out of range;
        500 on any open/render failure.
    """
    doc = None
    try:
        doc = fitz.open(pdf_path)
        if page_num < 1 or page_num > doc.page_count:
            # BUG FIX: the original returned here without closing `doc`,
            # leaking the document handle on every out-of-range request.
            return Response(status_code=404)

        page = doc.load_page(page_num - 1)
        zoom = dpi / 72.0
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat, alpha=False)
        img_bytes = pix.tobytes("png")
        return Response(content=img_bytes, media_type="image/png")
    except Exception as e:
        print(f"Error serving page: {e}")
        return Response(status_code=500)
    finally:
        # Always release the document, even on exceptions or early returns.
        if doc is not None:
            doc.close()
index.html CHANGED
@@ -1,29 +1,88 @@
1
  <!DOCTYPE html>
2
  <html lang="en">
 
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
  <title>Financial Report Analyzer</title>
7
  <style>
8
- body { font-family: sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; }
9
- .container { border: 1px solid #ccc; padding: 20px; border-radius: 8px; background: #f9f9f9; }
10
- h1 { text-align: center; color: #333; }
11
- .form-group { margin-bottom: 20px; text-align: center; }
12
- input[type="file"] { margin: 10px 0; }
13
- button { background-color: #007bff; color: white; border: none; padding: 10px 20px; border-radius: 4px; cursor: pointer; font-size: 16px; }
14
- button:hover { background-color: #0056b3; }
15
- button:disabled { background-color: #ccc; cursor: not-allowed; }
16
- #status { text-align: center; margin-top: 10px; font-weight: bold; }
17
- #result { margin-top: 20px; white-space: pre-wrap; background: #fff; padding: 15px; border: 1px solid #ddd; border-radius: 4px; display: none; }
18
- .error { color: #dc3545; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  </style>
20
  </head>
 
21
  <body>
22
 
23
  <div class="container">
24
  <h1>Financial Report Analyzer</h1>
25
- <p style="text-align: center;">Upload a 10-K/Annual Report PDF to extract page ranges for primary financial statements.</p>
26
-
 
27
  <div class="form-group">
28
  <input type="file" id="pdfInput" accept=".pdf" />
29
  <br>
@@ -52,7 +111,7 @@
52
  status.textContent = "Analyzing... This may take a minute.";
53
  status.className = "";
54
  resultDisplay.style.display = 'none';
55
- resultDisplay.textContent = "";
56
 
57
  const formData = new FormData();
58
  formData.append('file', file);
@@ -69,10 +128,10 @@
69
  }
70
 
71
  const data = await response.json();
72
- delete data.debug;
73
- delete data.notes;
74
  status.textContent = "Analysis Complete!";
75
- resultDisplay.textContent = JSON.stringify(data, null, 2);
 
 
76
  resultDisplay.style.display = 'block';
77
 
78
  } catch (error) {
@@ -83,6 +142,82 @@
83
  btn.disabled = false;
84
  }
85
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  </script>
87
  </body>
88
- </html>
 
 
1
  <!DOCTYPE html>
2
  <html lang="en">
3
+
4
  <head>
5
  <meta charset="UTF-8">
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
  <title>Financial Report Analyzer</title>
8
  <style>
9
+ body {
10
+ font-family: sans-serif;
11
+ max-width: 800px;
12
+ margin: 0 auto;
13
+ padding: 20px;
14
+ line-height: 1.6;
15
+ }
16
+
17
+ .container {
18
+ border: 1px solid #ccc;
19
+ padding: 20px;
20
+ border-radius: 8px;
21
+ background: #f9f9f9;
22
+ }
23
+
24
+ h1 {
25
+ text-align: center;
26
+ color: #333;
27
+ }
28
+
29
+ .form-group {
30
+ margin-bottom: 20px;
31
+ text-align: center;
32
+ }
33
+
34
+ input[type="file"] {
35
+ margin: 10px 0;
36
+ }
37
+
38
+ button {
39
+ background-color: #007bff;
40
+ color: white;
41
+ border: none;
42
+ padding: 10px 20px;
43
+ border-radius: 4px;
44
+ cursor: pointer;
45
+ font-size: 16px;
46
+ }
47
+
48
+ button:hover {
49
+ background-color: #0056b3;
50
+ }
51
+
52
+ button:disabled {
53
+ background-color: #ccc;
54
+ cursor: not-allowed;
55
+ }
56
+
57
+ #status {
58
+ text-align: center;
59
+ margin-top: 10px;
60
+ font-weight: bold;
61
+ }
62
+
63
+ #result {
64
+ margin-top: 20px;
65
+ white-space: pre-wrap;
66
+ background: #fff;
67
+ padding: 15px;
68
+ border: 1px solid #ddd;
69
+ border-radius: 4px;
70
+ display: none;
71
+ }
72
+
73
+ .error {
74
+ color: #dc3545;
75
+ }
76
  </style>
77
  </head>
78
+
79
  <body>
80
 
81
  <div class="container">
82
  <h1>Financial Report Analyzer</h1>
83
+ <p style="text-align: center;">Upload a 10-K/Annual Report PDF to extract page ranges for primary financial
84
+ statements.</p>
85
+
86
  <div class="form-group">
87
  <input type="file" id="pdfInput" accept=".pdf" />
88
  <br>
 
111
  status.textContent = "Analyzing... This may take a minute.";
112
  status.className = "";
113
  resultDisplay.style.display = 'none';
114
+ resultDisplay.innerHTML = ""; // Clear previous content
115
 
116
  const formData = new FormData();
117
  formData.append('file', file);
 
128
  }
129
 
130
  const data = await response.json();
 
 
131
  status.textContent = "Analysis Complete!";
132
+
133
+ // Render nicely
134
+ renderResults(data, resultDisplay);
135
  resultDisplay.style.display = 'block';
136
 
137
  } catch (error) {
 
142
  btn.disabled = false;
143
  }
144
  }
145
+
146
+ function renderResults(data, container) {
147
+ let html = "";
148
+
149
+ const sections = [
150
+ { key: 'balance_sheet', label: 'Balance Sheet' },
151
+ { key: 'profit_and_loss', label: 'Profit & Loss' },
152
+ { key: 'cash_flow', label: 'Cash Flow' }
153
+ ];
154
+
155
+ sections.forEach(sec => {
156
+ html += `<h3>${sec.label}</h3>`;
157
+ const items = data[sec.key];
158
+ if (!items || items.length === 0) {
159
+ html += "<p>No ranges found.</p>";
160
+ } else {
161
+ html += `
162
+ <div style="overflow-x: auto;">
163
+ <table border="1" cellpadding="8" style="border-collapse: collapse; width: 100%; margin-bottom: 20px;">
164
+ <tr style="background: #eee;">
165
+ <th>Scope</th>
166
+ <th>Pages</th>
167
+ <th>Details</th>
168
+ <th style="min-width: 300px;">Evidence Images</th>
169
+ </tr>`;
170
+
171
+ items.forEach(item => {
172
+ const pagesStr = (item.pages || []).join(", ");
173
+
174
+ // Generate images for all pages in the range
175
+ let imagesHtml = '<div style="display: flex; gap: 10px; overflow-x: auto;">';
176
+ const pagesToShow = item.evidence_pages && item.evidence_pages.length > 0
177
+ ? item.evidence_pages
178
+ : (item.pages || []);
179
+
180
+ pagesToShow.forEach(pNum => {
181
+ imagesHtml += `
182
+ <div style="text-align: center;">
183
+ <a href="/pdf/page/${pNum}" target="_blank">
184
+ <img src="/pdf/page/${pNum}" style="height: 200px; border: 1px solid #ddd; box-shadow: 2px 2px 5px rgba(0,0,0,0.1);" alt="Page ${pNum}" loading="lazy"><br>
185
+ <small>Page ${pNum}</small>
186
+ </a>
187
+ </div>`;
188
+ });
189
+ imagesHtml += '</div>';
190
+
191
+ html += `
192
+ <tr>
193
+ <td><strong>${item.scope}</strong></td>
194
+ <td>${pagesStr}</td>
195
+ <td>
196
+ <strong>Title:</strong> ${item.title || "<em>(null)</em>"}<br>
197
+ <strong>Confidence:</strong> ${(item.confidence * 100).toFixed(0)}%
198
+ </td>
199
+ <td>${imagesHtml}</td>
200
+ </tr>`;
201
+ });
202
+ html += "</table></div>";
203
+ }
204
+ });
205
+
206
+ // Notes
207
+ if (data.notes && data.notes.length > 0) {
208
+ html += "<h3>Notes</h3><ul>";
209
+ data.notes.forEach(note => {
210
+ html += `<li>${note}</li>`;
211
+ });
212
+ html += "</ul>";
213
+ }
214
+
215
+ // Raw JSON toggle (optional)
216
+ html += `<hr><details><summary>Raw JSON</summary><pre>${JSON.stringify(data, null, 2)}</pre></details>`;
217
+
218
+ container.innerHTML = html;
219
+ }
220
  </script>
221
  </body>
222
+
223
+ </html>
main.py CHANGED
@@ -1,7 +1,10 @@
1
  from __future__ import annotations
 
2
  import argparse
3
  import json
 
4
  import time
 
5
 
6
  from config import load_settings, DEFAULT_FREE_VISION_MODELS
7
  from pdf_io import extract_texts_from_pdf, render_pages_to_png_bytes
@@ -17,279 +20,366 @@ from openrouter_client import (
17
 
18
 
19
  PROMPT_TEMPLATE = """
20
- You are given:
21
- 1) OCR/extracted text for a set of PDF pages from a company's financial report (10-K/annual report)
22
- 2) Images of the same pages
23
 
24
  Task:
25
- Identify the PDF PAGE RANGES (start_page, end_page) for the THREE PRIMARY FINANCIAL STATEMENT TABLES ONLY:
26
- - Balance Sheet (a.k.a. Statement of Financial Position / Consolidated Balance Sheet / Standalone Balance Sheet)
27
- - Profit & Loss (a.k.a. Income Statement / Statements of Earnings / Statements of Operations)
28
- - Cash Flow Statement (Statements of Cash Flows)
29
-
30
- IMPORTANT RULES (STRICT):
31
- - Only return ranges for the PRIMARY consolidated & standalone financial statements pages.
32
- - Do NOT return ranges for note disclosures (e.g., derivatives, leases, fair value tables), MD&A, segment notes, or narrative discussion.
33
- - A primary statement table page usually has:
34
- (a) a clear statement title at the top (e.g., “Consolidated Balance Sheets”, "Standalone Balance Sheets")
35
- (b) many numeric columns (often multiple years)
36
- (c) canonical line items like:
37
- Balance sheet: “Total assets”, “Total liabilities”, “Total equity/stockholders’ equity”
38
- P&L: “Net revenues/sales”, “Cost of sales”, “Operating income”, “Net earnings/income”, “Earnings per share”
39
- Cash flow: “Cash flows from operating/investing/financing activities”, “Net cash provided by”, “Cash and cash equivalents at end”
40
- - If a statement continues onto the next page, include that continuation page in the range.
41
-
42
- Pages provided (OCR snippets):
43
  {page_snippets}
44
 
45
- Output JSON ONLY in this schema (no extra keys, no markdown):
 
46
  {{
47
- "balance_sheet": {{"start_page": int, "end_page": int, "confidence": float, "title": str}},
48
- "profit_and_loss": {{"start_page": int, "end_page": int, "confidence": float, "title": str}},
49
- "cash_flow": {{"start_page": int, "end_page": int, "confidence": float, "title": str}}
 
 
 
 
 
 
 
 
 
 
 
50
  }}
51
 
52
- Remember: PDF page numbers are 1-based in your output.
53
- """
 
 
 
 
 
 
 
 
 
54
 
55
- SCHEMA_HINT = """{
56
- "balance_sheet": {"start_page": "int|null", "end_page": "int|null", "confidence": "number", "evidence_pages": "int[]", "title": "string|null"},
57
- "profit_and_loss": {"start_page": "int|null", "end_page": "int|null", "confidence": "number", "evidence_pages": "int[]", "title": "string|null"},
58
- "cash_flow": {"start_page": "int|null", "end_page": "int|null", "confidence": "number", "evidence_pages": "int[]", "title": "string|null"},
59
- "notes": "string[]"
60
- }"""
61
 
 
 
 
 
 
 
 
 
62
 
63
- def log(msg: str):
64
- ts = time.strftime("%H:%M:%S")
65
- print(f"[{ts}] {msg}", flush=True)
66
 
 
 
 
67
 
68
- def build_page_snippets(page_texts, selected_pages):
69
- chunks = []
70
- for p in selected_pages:
71
- pt = page_texts[p]
72
- txt = (pt.extracted_text or "") + "\n" + (pt.ocr_text or "")
73
- txt = " ".join(txt.strip().split())
74
- if len(txt) > 900:
75
- txt = txt[:900] + "..."
76
- chunks.append(f"- Page {p+1}: {txt}")
77
- return "\n".join(chunks)
 
 
 
 
 
 
 
78
 
79
 
80
  def validate_ranges(result: dict, page_count: int) -> dict:
81
- def clamp(v):
82
- if v is None:
83
- return None
84
- if not isinstance(v, int):
85
- return None
86
- if v < 1 or v > page_count:
87
  return None
88
- return v
89
 
90
- for k in ["balance_sheet", "profit_and_loss", "cash_flow"]:
91
- obj = result.get(k, {})
 
 
 
 
 
92
  if not isinstance(obj, dict):
93
- result[k] = {"start_page": None, "end_page": None, "confidence": 0.0, "evidence_pages": [], "title": None}
94
- continue
 
 
 
 
 
 
 
95
 
96
- sp = clamp(obj.get("start_page"))
97
- ep = clamp(obj.get("end_page"))
98
  if sp is not None and ep is not None and ep < sp:
99
- sp, ep = None, None
100
-
101
- obj["start_page"] = sp
102
- obj["end_page"] = ep
103
- if "confidence" not in obj or not isinstance(obj["confidence"], (int, float)):
104
- obj["confidence"] = 0.0
105
- if "evidence_pages" not in obj or not isinstance(obj["evidence_pages"], list):
106
- obj["evidence_pages"] = []
107
- if "title" not in obj:
108
- obj["title"] = None
109
- result[k] = obj
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  if "notes" not in result or not isinstance(result["notes"], list):
112
  result["notes"] = []
 
 
 
113
  return result
114
 
115
 
116
- def analyze_pdf(
117
- pdf_path: str,
118
- output_path: str = "ranges.json",
119
- debug_dir: str = "debug",
120
- openrouter_api_key: str = None
121
- ) -> dict:
122
  """
123
- Analyzes a PDF to find financial statement page ranges.
124
- Returns the result dict.
125
  """
126
- settings_kwargs = {}
127
- if openrouter_api_key:
128
- settings_kwargs["openrouter_api_key"] = openrouter_api_key
129
-
130
- st = load_settings(**settings_kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
- log(f"Loading PDF: {pdf_path}")
 
 
 
 
 
 
 
 
 
 
133
  page_texts, page_count = extract_texts_from_pdf(
134
  pdf_path=pdf_path,
135
- dpi=st.dpi,
136
- ocr_lang=st.ocr_lang,
137
- min_text_chars_for_digital=st.min_text_chars_for_digital,
138
  )
 
139
 
140
- ocr_pages = sum(1 for p in page_texts if p.used_ocr)
141
- log(f"Pages: {page_count} | OCR used on {ocr_pages} pages")
142
-
143
- candidates, cand_debug = build_candidate_lists(page_texts, top_k=30, debug=True)
144
- log("TOC/Index debug:")
145
- log(f" item8_toc_page = {cand_debug.get('item8_toc_page')}")
146
- log(f" toc_internal = {cand_debug.get('toc_internal')}")
147
- log(f" toc_pdf_all = {cand_debug.get('toc_pdf_targets_all')}")
148
- log(f" heuristic_ranges_0_based = {cand_debug.get('heuristic_ranges_0_based')}")
149
 
150
- selected_pages = select_pages_for_llm(
 
151
  candidates=candidates,
152
- debug_info=cand_debug,
153
  page_count=page_count,
154
- max_images=st.max_images
 
155
  )
156
- log(f"Selected pages to render/send (1-indexed): {[p+1 for p in selected_pages]}")
157
 
158
- log(f"Rendering {len(selected_pages)} pages to images (dpi={st.dpi})...")
159
- page_png_map = render_pages_to_png_bytes(pdf_path, selected_pages, dpi=st.dpi)
160
- log("Image rendering done.")
161
 
162
- if st.openrouter_model:
163
- model = st.openrouter_model
164
- log(f"Using model from env: {model}")
165
- else:
166
- model = choose_free_vision_model(st.openrouter_api_key, preferred=DEFAULT_FREE_VISION_MODELS)
167
- log(f"Auto-selected free vision model: {model}")
168
-
169
- snippets = build_page_snippets(page_texts, selected_pages)
170
- prompt = PROMPT_TEMPLATE.format(page_snippets=snippets)
171
-
172
- # --- LLM call with progressive image backoff ---
173
- pages_sent = list(selected_pages)
174
- llm_res = None
175
- while pages_sent:
176
- images = [page_png_map[p] for p in pages_sent]
177
- msg = make_user_message_with_images(prompt, images)
178
-
179
- log(f"Calling OpenRouter (images={len(images)})...")
180
- llm_res = chat_completion(
181
- api_key=st.openrouter_api_key,
182
- model=model,
183
- messages=[msg],
184
- max_tokens=4096,
185
- temperature=0.0,
186
- require_json=True,
187
- )
188
-
189
- log(f"finish_reason={llm_res.finish_reason} native={llm_res.native_finish_reason} content_len={len(llm_res.content)}")
190
-
191
- # save raw response for debugging
192
- try:
193
- import os
194
- os.makedirs(debug_dir, exist_ok=True)
195
- with open(f"{debug_dir}/openrouter_raw_response.json", "w", encoding="utf-8") as f:
196
- json.dump(llm_res.raw, f, indent=2)
197
- except Exception:
198
- pass
199
-
200
- if llm_res.finish_reason == "error" or ("error" in llm_res.raw and llm_res.raw["error"]):
201
- log("OpenRouter returned an error payload (see debug/openrouter_raw_response.json). Backing off images...")
202
- elif llm_res.content.strip():
203
- break
204
-
205
- if len(pages_sent) <= 3:
206
- break
207
- pages_sent = pages_sent[:-2]
208
- log(f"Retrying with fewer images. Now sending pages: {[p+1 for p in pages_sent]}")
209
-
210
- if not llm_res:
211
- raise RuntimeError("LLM call never executed.")
212
-
213
- raw_text = (llm_res.content or "").strip()
214
- log("DEBUG: raw model output (first 1200 chars):")
215
- print(raw_text[:1200], flush=True)
216
-
217
- # --- Parse JSON with repair fallback ---
218
  try:
219
- result = robust_json_loads(raw_text)
220
- log("Parsed JSON successfully.")
221
  except Exception as e:
222
- log(f"JSON parse failed: {e}")
223
- # Save raw text
224
- try:
225
- import os
226
- os.makedirs(debug_dir, exist_ok=True)
227
- with open(f"{debug_dir}/llm_raw_output.txt", "w", encoding="utf-8") as f:
228
- f.write(raw_text)
229
- except Exception:
230
- pass
231
-
232
- # Repair pass with free-tier text model
233
- repair_model = choose_any_free_text_model(st.openrouter_api_key, preferred=[
234
- model, # try same model first
235
- "google/gemma-3-12b-it:free",
236
- "amazon/nova-2-lite-v1:free",
237
- "nvidia/nemotron-nano-12b-v2-vl:free",
238
- ])
239
- log(f"Attempting JSON repair using: {repair_model}")
240
- try:
241
- result = repair_to_json(
242
- api_key=st.openrouter_api_key,
243
- model=repair_model,
244
- bad_output=raw_text if raw_text else json.dumps(llm_res.raw),
245
- schema_hint=SCHEMA_HINT,
246
- )
247
- log("Repair JSON succeeded.")
248
- except Exception as e2:
249
- log(f"Repair JSON failed: {e2}")
250
- # Final safe fallback
251
- result = {
252
- "balance_sheet": {"start_page": None, "end_page": None, "confidence": 0.0, "evidence_pages": [], "title": None},
253
- "profit_and_loss": {"start_page": None, "end_page": None, "confidence": 0.0, "evidence_pages": [], "title": None},
254
- "cash_flow": {"start_page": None, "end_page": None, "confidence": 0.0, "evidence_pages": [], "title": None},
255
- "notes": [
256
- "Model output could not be parsed as JSON.",
257
- "Check debug/openrouter_raw_response.json and debug/llm_raw_output.txt",
258
- ],
259
- }
260
 
261
- result = validate_ranges(result, page_count=page_count)
 
 
 
262
  result["debug"] = {
263
- "model_used": model,
264
- "pages_sent": [p + 1 for p in pages_sent],
265
- "candidate_pages": candidates,
266
- "finish_reason": llm_res.finish_reason,
267
- "native_finish_reason": llm_res.native_finish_reason,
268
  }
269
 
270
  if output_path:
271
  with open(output_path, "w", encoding="utf-8") as f:
272
  json.dump(result, f, indent=2)
273
- log(f"Saved output: {output_path}")
274
 
275
  return result
276
 
277
 
278
  def main():
279
  ap = argparse.ArgumentParser()
280
- ap.add_argument("--pdf", required=True, help="Path to financial report PDF")
281
- ap.add_argument("--out", default="ranges.json", help="Output JSON path")
282
- ap.add_argument("--debug_dir", default="debug", help="Folder to write debug artifacts")
283
  args = ap.parse_args()
284
 
285
- # Call the core logic
286
- result = analyze_pdf(
287
- pdf_path=args.pdf,
288
- output_path=args.out,
289
- debug_dir=args.debug_dir
290
- )
291
-
292
- # Print result to stdout for CLI use
293
  print(json.dumps(result, indent=2), flush=True)
294
 
295
 
 
1
  from __future__ import annotations
2
+
3
  import argparse
4
  import json
5
+ import os
6
  import time
7
+ from typing import Any, Dict, List
8
 
9
  from config import load_settings, DEFAULT_FREE_VISION_MODELS
10
  from pdf_io import extract_texts_from_pdf, render_pages_to_png_bytes
 
20
 
21
 
22
  PROMPT_TEMPLATE = """
23
+ You are an expert financial-report analyst.
 
 
24
 
25
  Task:
26
+ Given (a) OCR/native text snippets for certain pages and (b) images of those pages,
27
+ identify page ranges that contain ONLY the three PRIMARY financial statements tables:
28
+ 1) Balance Sheet / Statement of Financial Position
29
+ 2) Profit & Loss / Income / Earnings / Operations
30
+ 3) Cash Flow Statement
31
+
32
+ Important:
33
+ - Many annual reports include BOTH consolidated and standalone statements.
34
+ - You MUST return blocks for BOTH if present.
35
+ - If a statement spans multiple pages, include ALL pages in that block.
36
+ - A continuation page may not repeat the full title; use table structure + line-items.
37
+
38
+ Heuristic candidate blocks (for reference only; you must verify from images+snippets):
39
+ {heuristic_blocks}
40
+
41
+ Pages provided (OCR/native snippets):
 
 
42
  {page_snippets}
43
 
44
+ Return STRICT JSON ONLY (no markdown, no commentary).
45
+ Schema (IMPORTANT: each statement is a LIST of blocks):
46
  {{
47
+ "balance_sheet": [
48
+ {{
49
+ "scope": "consolidated|standalone|unknown",
50
+ "start_page": <1-indexed int>,
51
+ "end_page": <1-indexed int>,
52
+ "pages": [<1-indexed ints>],
53
+ "confidence": <0..1>,
54
+ "title": "<string or null>",
55
+ "evidence_pages": [<1-indexed ints>]
56
+ }}
57
+ ],
58
+ "profit_and_loss": [ ... same block schema ... ],
59
+ "cash_flow": [ ... same block schema ... ],
60
+ "notes": [ "<optional strings>" ]
61
  }}
62
 
63
+ Rules:
64
+ - "pages" must list ALL pages in the block (even if it's one page).
65
+ - start_page = min(pages), end_page = max(pages).
66
+ - If a statement is NOT present, return an empty list for it.
67
+ """.strip()
68
+
69
+
70
+ def _combined_for_snippet(p) -> str:
71
+ a = getattr(p, "extracted_text", "") or ""
72
+ b = getattr(p, "ocr_text", "") or ""
73
+ return (a + "\n" + b).strip()
74
 
 
 
 
 
 
 
75
 
76
+ def build_page_snippets(page_texts: List[Any], selected_pages_0: List[int], max_chars_per_page: int = 1400) -> str:
77
+ parts = []
78
+ for p0 in selected_pages_0:
79
+ pt = page_texts[p0]
80
+ txt = _combined_for_snippet(pt)
81
+ txt = txt[:max_chars_per_page]
82
+ parts.append(f"--- Page {p0+1} ---\n{txt}\n")
83
+ return "\n".join(parts).strip()
84
 
 
 
 
85
 
86
+ def format_heuristic_blocks(heuristic_blocks_0_based: dict, max_per_stmt: int = 6) -> str:
87
+ if not isinstance(heuristic_blocks_0_based, dict):
88
+ return "(none)"
89
 
90
+ lines = []
91
+ for stmt in ["balance_sheet", "profit_and_loss", "cash_flow"]:
92
+ bl = heuristic_blocks_0_based.get(stmt) or []
93
+ if not isinstance(bl, list) or not bl:
94
+ lines.append(f"- {stmt}: (none)")
95
+ continue
96
+
97
+ bl_sorted = sorted(bl, key=lambda b: float(b.get("score") or 0.0), reverse=True)[:max_per_stmt]
98
+ parts = []
99
+ for b in bl_sorted:
100
+ s = int(b.get("start")) + 1
101
+ e = int(b.get("end")) + 1
102
+ scope = (b.get("scope") or "unknown")
103
+ title = b.get("title")
104
+ parts.append(f"{scope}: {s}-{e}" + (f" ({title})" if title else ""))
105
+ lines.append(f"- {stmt}: " + "; ".join(parts))
106
+ return "\n".join(lines)
107
 
108
 
109
  def validate_ranges(result: dict, page_count: int) -> dict:
110
+ """
111
+ Normalize model output into list-of-blocks schema.
112
+ Ensures every block has pages list; fixes start/end from pages.
113
+ """
114
+ def clamp_int(v):
115
+ if v is None or not isinstance(v, int):
116
  return None
117
+ return v if 1 <= v <= page_count else None
118
 
119
+ def normalize_pages(pages_val):
120
+ if not isinstance(pages_val, list):
121
+ return []
122
+ out = [x for x in pages_val if isinstance(x, int) and 1 <= x <= page_count]
123
+ return sorted(set(out))
124
+
125
+ def norm_block(obj) -> dict:
126
  if not isinstance(obj, dict):
127
+ obj = {}
128
+
129
+ sp = clamp_int(obj.get("start_page"))
130
+ ep = clamp_int(obj.get("end_page"))
131
+ pages = normalize_pages(obj.get("pages"))
132
+
133
+ if pages and (sp is None or ep is None):
134
+ sp = min(pages)
135
+ ep = max(pages)
136
 
 
 
137
  if sp is not None and ep is not None and ep < sp:
138
+ sp, ep, pages = None, None, []
139
+
140
+ if not pages and sp is not None and ep is not None:
141
+ pages = list(range(sp, ep + 1))
142
+
143
+ scope = obj.get("scope")
144
+ if not isinstance(scope, str):
145
+ scope = "unknown"
146
+ scope = scope.lower().strip()
147
+ if scope not in {"consolidated", "standalone", "unknown"}:
148
+ scope = "unknown"
149
+
150
+ conf = obj.get("confidence")
151
+ conf = float(conf) if isinstance(conf, (int, float)) else 0.0
152
+ conf = max(0.0, min(1.0, conf))
153
+
154
+ evidence = obj.get("evidence_pages")
155
+ if not isinstance(evidence, list):
156
+ evidence = []
157
+ evidence = [x for x in evidence if isinstance(x, int) and 1 <= x <= page_count]
158
+
159
+ title = obj.get("title")
160
+ if title is not None and not isinstance(title, str):
161
+ title = None
162
+
163
+ # ALWAYS keep pages list even if single page
164
+ if sp is None or ep is None:
165
+ return {
166
+ "start_page": None,
167
+ "end_page": None,
168
+ "pages": [],
169
+ "scope": scope,
170
+ "confidence": conf,
171
+ "title": title,
172
+ "evidence_pages": evidence,
173
+ }
174
+
175
+ return {
176
+ "start_page": sp,
177
+ "end_page": ep,
178
+ "pages": pages,
179
+ "scope": scope,
180
+ "confidence": conf,
181
+ "title": title,
182
+ "evidence_pages": evidence if evidence else ([sp] if sp else []),
183
+ }
184
+
185
+ for k in ["balance_sheet", "profit_and_loss", "cash_flow"]:
186
+ val = result.get(k)
187
+ if isinstance(val, dict):
188
+ val = [val]
189
+ if not isinstance(val, list):
190
+ val = []
191
+ result[k] = [norm_block(x) for x in val]
192
 
193
  if "notes" not in result or not isinstance(result["notes"], list):
194
  result["notes"] = []
195
+ else:
196
+ result["notes"] = [x for x in result["notes"] if isinstance(x, str)]
197
+
198
  return result
199
 
200
 
201
+ def merge_with_heuristics(result: dict, heuristic_blocks_0_based: dict, page_count: int) -> dict:
 
 
 
 
 
202
  """
203
+ Add missing consolidated/standalone blocks if LLM returned only one.
204
+ Also expands single-page LLM blocks if heuristics show a longer block with same start+scope.
205
  """
206
+ if not isinstance(heuristic_blocks_0_based, dict):
207
+ return result
208
+
209
+ def overlap(a, b):
210
+ return not (a[1] < b[0] or b[1] < a[0])
211
+
212
+ for stmt in ["balance_sheet", "profit_and_loss", "cash_flow"]:
213
+ llm_blocks = result.get(stmt) or []
214
+ if not isinstance(llm_blocks, list):
215
+ llm_blocks = []
216
+
217
+ hb = heuristic_blocks_0_based.get(stmt) or []
218
+ heur_blocks = []
219
+ if isinstance(hb, list):
220
+ for b in hb:
221
+ try:
222
+ s = int(b.get("start")) + 1
223
+ e = int(b.get("end")) + 1
224
+ except Exception:
225
+ continue
226
+ if not (1 <= s <= page_count and 1 <= e <= page_count and e >= s):
227
+ continue
228
+ heur_blocks.append(
229
+ {
230
+ "start_page": s,
231
+ "end_page": e,
232
+ "pages": list(range(s, e + 1)),
233
+ "scope": (b.get("scope") or "unknown"),
234
+ "confidence": 0.35,
235
+ "title": b.get("title"),
236
+ "evidence_pages": [s],
237
+ }
238
+ )
239
+
240
+ # expand single-page blocks using heuristics
241
+ for lb in llm_blocks:
242
+ if not isinstance(lb, dict):
243
+ continue
244
+ sp = lb.get("start_page")
245
+ ep = lb.get("end_page")
246
+ scope = (lb.get("scope") or "unknown")
247
+ if sp is None or ep is None:
248
+ continue
249
+ if sp == ep:
250
+ for hb2 in heur_blocks:
251
+ if hb2["scope"] == scope and hb2["start_page"] == sp and hb2["end_page"] > ep:
252
+ lb["end_page"] = hb2["end_page"]
253
+ lb["pages"] = hb2["pages"]
254
+ break
255
+
256
+ present_ranges = [
257
+ (b.get("start_page"), b.get("end_page"))
258
+ for b in llm_blocks
259
+ if isinstance(b, dict) and b.get("start_page") and b.get("end_page")
260
+ ]
261
+ present_scopes = {(b.get("scope") or "unknown") for b in llm_blocks if isinstance(b, dict)}
262
+
263
+ # add missing scope blocks (common: consolidated + standalone)
264
+ for hb2 in heur_blocks:
265
+ if hb2["scope"] in present_scopes and len(heur_blocks) > 1:
266
+ continue
267
+ r = (hb2["start_page"], hb2["end_page"])
268
+ if any(overlap(r, (ps, pe)) for (ps, pe) in present_ranges if ps and pe):
269
+ continue
270
+ llm_blocks.append(hb2)
271
+ present_scopes.add(hb2["scope"])
272
+ present_ranges.append(r)
273
+
274
+ llm_blocks = [b for b in llm_blocks if isinstance(b, dict)]
275
+ llm_blocks.sort(key=lambda b: (b.get("start_page") or 10**9, b.get("end_page") or 10**9))
276
+ result[stmt] = llm_blocks
277
+
278
+ return result
279
 
280
+
281
+ def analyze_pdf(
282
+ pdf_path: str,
283
+ output_path: str = "",
284
+ debug_dir: str = "",
285
+ openrouter_api_key: str | None = None,
286
+ ) -> Dict[str, Any]:
287
+ settings = load_settings(openrouter_api_key=openrouter_api_key or os.getenv("OPENROUTER_API_KEY", "").strip())
288
+
289
+ t0 = time.time()
290
+ print(f"[1/6] Extracting text/OCR from PDF: {pdf_path}", flush=True)
291
  page_texts, page_count = extract_texts_from_pdf(
292
  pdf_path=pdf_path,
293
+ dpi=settings.dpi,
294
+ ocr_lang=settings.ocr_lang,
295
+ min_text_chars_for_digital=settings.min_text_chars_for_digital,
296
  )
297
+ print(f" -> pages: {page_count} (t={time.time()-t0:.1f}s)", flush=True)
298
 
299
+ print(f"[2/6] Building statement candidates + heuristic blocks...", flush=True)
300
+ candidates, debug_info = build_candidate_lists(
301
+ pages=page_texts,
302
+ page_count=page_count,
303
+ topk_per_statement=settings.topk_per_statement,
304
+ continuation_max_forward=settings.continuation_max_forward,
305
+ debug=True,
306
+ )
 
307
 
308
+ print("[3/6] Selecting pages to send to LLM (images)...", flush=True)
309
+ selected_pages_0 = select_pages_for_llm(
310
  candidates=candidates,
311
+ debug_info=debug_info,
312
  page_count=page_count,
313
+ max_images=settings.max_images,
314
+ max_blocks_per_statement=settings.max_blocks_per_statement,
315
  )
316
+ print(f" -> selected {len(selected_pages_0)} pages: {[p+1 for p in selected_pages_0]}", flush=True)
317
 
318
+ print("[4/6] Rendering selected pages to PNG bytes...", flush=True)
319
+ images = render_pages_to_png_bytes(pdf_path, selected_pages_0, dpi=settings.dpi)
 
320
 
321
+ heuristic_blocks_str = format_heuristic_blocks(debug_info.get("heuristic_blocks_0_based") or {})
322
+ snippets = build_page_snippets(page_texts, selected_pages_0)
323
+
324
+ prompt = PROMPT_TEMPLATE.format(
325
+ heuristic_blocks=heuristic_blocks_str,
326
+ page_snippets=snippets,
327
+ )
328
+
329
+ # Choose model
330
+ model = settings.openrouter_model
331
+ if not model:
332
+ print("[5/6] Selecting a free vision model from OpenRouter...", flush=True)
333
+ model = choose_free_vision_model(settings.openrouter_api_key, DEFAULT_FREE_VISION_MODELS)
334
+ print(f"[5/6] Calling OpenRouter model: {model}", flush=True)
335
+
336
+ messages = [
337
+ # {"role": "system", "content": "Return STRICT JSON only."},
338
+ make_user_message_with_images(prompt, images),
339
+ ]
340
+
341
+ raw = chat_completion(settings.openrouter_api_key, model=model, messages=messages, temperature=0.0, max_tokens=1400)
342
+ raw_text = (raw.content or "").strip()
343
+
344
+ print("[6/6] Parsing model output...", flush=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  try:
346
+ parsed = robust_json_loads(raw_text)
 
347
  except Exception as e:
348
+ print(" -> JSON parse failed, attempting repair:", str(e), flush=True)
349
+ text_model = choose_any_free_text_model(settings.openrouter_api_key)
350
+ fixed = repair_to_json(settings.openrouter_api_key, raw_text, model=text_model)
351
+ parsed = robust_json_loads(fixed)
352
+
353
+ if not isinstance(parsed, dict):
354
+ parsed = {"balance_sheet": [], "profit_and_loss": [], "cash_flow": [], "notes": []}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
 
356
+ parsed = validate_ranges(parsed, page_count=page_count)
357
+ parsed = merge_with_heuristics(parsed, debug_info.get("heuristic_blocks_0_based") or {}, page_count=page_count)
358
+
359
+ result: Dict[str, Any] = dict(parsed)
360
  result["debug"] = {
361
+ "selected_pages_1_based": [p + 1 for p in selected_pages_0],
362
+ "candidates_top": debug_info.get("top_scoring", {}),
363
+ "heuristic_blocks_0_based": debug_info.get("heuristic_blocks_0_based", {}),
364
+ "item8_toc_page_1_based": (debug_info.get("item8_toc_page") + 1) if debug_info.get("item8_toc_page") is not None else None,
 
365
  }
366
 
367
  if output_path:
368
  with open(output_path, "w", encoding="utf-8") as f:
369
  json.dump(result, f, indent=2)
370
+ print(f"Saved output -> {output_path}", flush=True)
371
 
372
  return result
373
 
374
 
375
  def main():
376
  ap = argparse.ArgumentParser()
377
+ ap.add_argument("--pdf", required=True, help="Path to input PDF")
378
+ ap.add_argument("--out", required=False, default="", help="Path to output JSON file")
379
+ ap.add_argument("--debug_dir", required=False, default="", help="Directory to store debug artifacts (optional)")
380
  args = ap.parse_args()
381
 
382
+ result = analyze_pdf(pdf_path=args.pdf, output_path=args.out, debug_dir=args.debug_dir)
 
 
 
 
 
 
 
383
  print(json.dumps(result, indent=2), flush=True)
384
 
385
 
openrouter_client.py CHANGED
@@ -1,4 +1,5 @@
1
  from __future__ import annotations
 
2
  import base64
3
  import json
4
  import re
@@ -7,6 +8,7 @@ from typing import Any, Dict, List, Optional, Tuple
7
 
8
  import requests
9
 
 
10
  OPENROUTER_CHAT_URL = "https://openrouter.ai/api/v1/chat/completions"
11
  OPENROUTER_MODELS_URL = "https://openrouter.ai/api/v1/models"
12
 
@@ -14,8 +16,8 @@ OPENROUTER_MODELS_URL = "https://openrouter.ai/api/v1/models"
14
  @dataclass
15
  class ChatResult:
16
  content: str
17
- finish_reason: str | None
18
- native_finish_reason: str | None
19
  tool_calls: Any
20
  raw: dict
21
 
@@ -27,62 +29,39 @@ def list_models(api_key: str) -> dict:
27
  return r.json()
28
 
29
 
30
- def choose_free_vision_model(api_key: str, preferred: list[str]) -> str:
31
  models = list_models(api_key).get("data", [])
32
- by_id = {m.get("id"): m for m in models}
33
-
34
- def is_free(m: dict) -> bool:
35
- pricing = m.get("pricing") or {}
36
- try:
37
- return float(pricing.get("prompt", "1")) == 0.0 and float(pricing.get("completion", "1")) == 0.0
38
- except Exception:
39
- return False
40
 
41
- def is_vision(m: dict) -> bool:
42
- arch = (m.get("architecture") or {})
43
- in_mods = set(arch.get("input_modalities") or [])
44
- return "image" in in_mods
45
-
46
- # Preferred first
47
- for mid in preferred:
48
- m = by_id.get(mid)
49
- if m and is_free(m) and is_vision(m):
50
- return mid
51
-
52
- # Any free vision
53
  for m in models:
54
- if is_free(m) and is_vision(m):
55
- return m.get("id")
 
 
 
 
 
 
 
56
 
57
- raise RuntimeError("Could not find any free vision-capable model in /models.")
58
 
59
 
60
- def choose_any_free_text_model(api_key: str, preferred: list[str] | None = None) -> str:
61
  models = list_models(api_key).get("data", [])
62
- by_id = {m.get("id"): m for m in models}
63
-
64
- def is_free(m: dict) -> bool:
65
- pricing = m.get("pricing") or {}
66
- try:
67
- return float(pricing.get("prompt", "1")) == 0.0 and float(pricing.get("completion", "1")) == 0.0
68
- except Exception:
69
- return False
70
-
71
- def is_text_input(m: dict) -> bool:
72
- arch = (m.get("architecture") or {})
73
- in_mods = set(arch.get("input_modalities") or [])
74
- return "text" in in_mods
75
-
76
- if preferred:
77
- for mid in preferred:
78
- m = by_id.get(mid)
79
- if m and is_free(m) and is_text_input(m):
80
- return mid
81
-
82
  for m in models:
83
- if is_free(m) and is_text_input(m):
84
- return m.get("id")
85
-
 
 
 
 
86
  raise RuntimeError("Could not find any free text-capable model in /models.")
87
 
88
 
@@ -91,19 +70,16 @@ def _img_bytes_to_data_url(png_bytes: bytes) -> str:
91
  return f"data:image/png;base64,{b64}"
92
 
93
 
94
- def make_user_message_with_images(prompt_text: str, images: list[bytes]) -> dict:
95
  """
96
- OpenRouter follows OpenAI chat schema; some SDK examples show imageUrl (camelCase).
97
- We include both keys for maximum compatibility.
98
  """
99
- content: list[dict] = [{"type": "text", "text": prompt_text}]
100
- for im in images:
101
- url = _img_bytes_to_data_url(im)
102
  content.append(
103
  {
104
  "type": "image_url",
105
- "image_url": {"url": url}, # OpenAI-style
106
- "imageUrl": {"url": url}, # SDK-style
107
  }
108
  )
109
  return {"role": "user", "content": content}
@@ -112,145 +88,101 @@ def make_user_message_with_images(prompt_text: str, images: list[bytes]) -> dict
112
  def chat_completion(
113
  api_key: str,
114
  model: str,
115
- messages: list[dict],
116
- max_tokens: int = 2000,
117
  temperature: float = 0.0,
118
- require_json: bool = True,
119
- extra: dict | None = None,
120
  ) -> ChatResult:
121
  headers = {
122
  "Authorization": f"Bearer {api_key}",
123
  "Content-Type": "application/json",
124
- "HTTP-Referer": "http://localhost",
125
- "X-Title": "fin-statement-page-locator",
126
  }
127
-
128
- payload: dict[str, Any] = {
129
  "model": model,
130
  "messages": messages,
131
  "temperature": temperature,
132
  "max_tokens": max_tokens,
133
- # Force no tool calls even if provider supports them
134
- "tool_choice": "none",
135
  }
136
-
137
- if require_json:
138
- # OpenRouter supports response_format json_object (JSON mode)
139
- payload["response_format"] = {"type": "json_object"}
140
-
141
- if extra:
142
- payload.update(extra)
143
-
144
  r = requests.post(OPENROUTER_CHAT_URL, headers=headers, json=payload, timeout=180)
 
 
145
  r.raise_for_status()
146
  data = r.json()
147
 
148
- # OpenRouter can return errors at top-level even with HTTP 200 in some scenarios
149
- if isinstance(data, dict) and "error" in data and data["error"]:
150
- # keep raw for debugging
151
- return ChatResult(
152
- content="",
153
- finish_reason="error",
154
- native_finish_reason=None,
155
- tool_calls=None,
156
- raw=data,
157
- )
158
-
159
- choice0 = (data.get("choices") or [{}])[0]
160
- msg = choice0.get("message") or {}
161
-
162
- content = (msg.get("content") or "").strip()
163
- tool_calls = msg.get("tool_calls") or msg.get("toolCalls")
164
 
165
  return ChatResult(
166
- content=content,
167
- finish_reason=choice0.get("finish_reason"),
168
- native_finish_reason=choice0.get("native_finish_reason"),
169
  tool_calls=tool_calls,
170
  raw=data,
171
  )
172
 
173
 
174
- def _extract_json_from_codeblock(s: str) -> str | None:
175
- # ```json ... ```
176
- m = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", s, flags=re.IGNORECASE)
177
- if m:
178
- return m.group(1).strip()
179
- return None
180
 
181
 
182
- def _extract_first_balanced_object(s: str) -> str | None:
183
  """
184
- Extract the first balanced {...} JSON object from arbitrary text.
185
  """
186
- start = s.find("{")
187
- if start == -1:
188
- return None
189
-
190
- depth = 0
191
- for i in range(start, len(s)):
192
- ch = s[i]
193
- if ch == "{":
194
- depth += 1
195
- elif ch == "}":
196
- depth -= 1
197
- if depth == 0:
198
- return s[start : i + 1]
199
- return None
200
-
201
-
202
- def robust_json_loads(s: str) -> dict:
203
- s = (s or "").strip()
204
- if not s:
205
- raise ValueError("Empty model content (no JSON to parse).")
206
-
207
- # 1) direct parse
208
  try:
209
- return json.loads(s)
210
  except Exception:
211
  pass
212
 
213
- # 2) codeblock
214
- cb = _extract_json_from_codeblock(s)
215
- if cb:
 
216
  try:
217
- return json.loads(cb)
218
  except Exception:
219
  pass
220
 
221
- # 3) balanced object
222
- obj = _extract_first_balanced_object(s)
223
- if obj:
224
- return json.loads(obj)
 
 
 
 
225
 
226
- raise ValueError("Could not parse JSON from model output (no valid JSON object found).")
227
 
228
 
229
- def repair_to_json(
230
- api_key: str,
231
- model: str,
232
- bad_output: str,
233
- schema_hint: str,
234
- ) -> dict:
235
  """
236
- Ask a free model to convert arbitrary text into valid JSON for our schema.
237
  """
238
- repair_prompt = f"""Convert the following content into VALID JSON ONLY.
239
- No markdown, no backticks, no explanations.
240
-
241
- Schema (must match keys/types):
242
- {schema_hint}
 
243
 
244
- Content to convert:
245
- {bad_output}
246
- """
247
- msg = {"role": "user", "content": repair_prompt}
248
  res = chat_completion(
249
  api_key=api_key,
250
  model=model,
251
- messages=[msg],
252
- max_tokens=900,
 
 
253
  temperature=0.0,
254
- require_json=True,
255
  )
256
- return robust_json_loads(res.content)
 
1
  from __future__ import annotations
2
+
3
  import base64
4
  import json
5
  import re
 
8
 
9
  import requests
10
 
11
+
12
  OPENROUTER_CHAT_URL = "https://openrouter.ai/api/v1/chat/completions"
13
  OPENROUTER_MODELS_URL = "https://openrouter.ai/api/v1/models"
14
 
 
16
  @dataclass
17
  class ChatResult:
18
  content: str
19
+ model: str
20
+ native_finish_reason: Optional[str]
21
  tool_calls: Any
22
  raw: dict
23
 
 
29
  return r.json()
30
 
31
 
32
+ def choose_free_vision_model(api_key: str, preferred: List[str]) -> str:
33
  models = list_models(api_key).get("data", [])
34
+ # try preferred first
35
+ available = {m.get("id") for m in models if isinstance(m, dict)}
36
+ for p in preferred:
37
+ if p in available:
38
+ return p
 
 
 
39
 
40
+ # fallback: any model with ":free" + some vision hint in the metadata
 
 
 
 
 
 
 
 
 
 
 
41
  for m in models:
42
+ if not isinstance(m, dict):
43
+ continue
44
+ mid = m.get("id", "")
45
+ if ":free" not in mid:
46
+ continue
47
+ # crude heuristic: many vision models have "vl" or "vision" somewhere
48
+ text = json.dumps(m).lower()
49
+ if ("vision" in text) or ("image" in text) or ("vl" in mid.lower()):
50
+ return mid
51
 
52
+ raise RuntimeError("Could not find any free vision-capable model in /models. Set OPENROUTER_MODEL explicitly.")
53
 
54
 
55
+ def choose_any_free_text_model(api_key: str) -> str:
56
  models = list_models(api_key).get("data", [])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  for m in models:
58
+ if not isinstance(m, dict):
59
+ continue
60
+ mid = m.get("id", "")
61
+ if ":free" not in mid:
62
+ continue
63
+ # exclude known vision-only ids if any; otherwise allow
64
+ return mid
65
  raise RuntimeError("Could not find any free text-capable model in /models.")
66
 
67
 
 
70
  return f"data:image/png;base64,{b64}"
71
 
72
 
73
+ def make_user_message_with_images(prompt_text: str, images: List[bytes]) -> dict:
74
  """
75
+ OpenRouter follows OpenAI chat schema. Use 'image_url' (snake) which is supported by OpenAI-style APIs.
 
76
  """
77
+ content: List[dict] = [{"type": "text", "text": prompt_text}]
78
+ for b in images:
 
79
  content.append(
80
  {
81
  "type": "image_url",
82
+ "image_url": {"url": _img_bytes_to_data_url(b)},
 
83
  }
84
  )
85
  return {"role": "user", "content": content}
 
88
  def chat_completion(
89
  api_key: str,
90
  model: str,
91
+ messages: List[dict],
 
92
  temperature: float = 0.0,
93
+ max_tokens: int = 1200,
 
94
  ) -> ChatResult:
95
  headers = {
96
  "Authorization": f"Bearer {api_key}",
97
  "Content-Type": "application/json",
 
 
98
  }
99
+ payload = {
 
100
  "model": model,
101
  "messages": messages,
102
  "temperature": temperature,
103
  "max_tokens": max_tokens,
 
 
104
  }
 
 
 
 
 
 
 
 
105
  r = requests.post(OPENROUTER_CHAT_URL, headers=headers, json=payload, timeout=180)
106
+ if r.status_code != 200:
107
+ print(f"API Error {r.status_code}: {r.text}", flush=True)
108
  r.raise_for_status()
109
  data = r.json()
110
 
111
+ # OpenAI-like response
112
+ choice = (data.get("choices") or [{}])[0]
113
+ msg = choice.get("message") or {}
114
+ content = msg.get("content") or ""
115
+ tool_calls = msg.get("tool_calls")
116
+ finish = choice.get("finish_reason")
 
 
 
 
 
 
 
 
 
 
117
 
118
  return ChatResult(
119
+ content=content if isinstance(content, str) else json.dumps(content),
120
+ model=data.get("model") or model,
121
+ native_finish_reason=finish,
122
  tool_calls=tool_calls,
123
  raw=data,
124
  )
125
 
126
 
127
+ _JSON_OBJ_RE = re.compile(r"\{.*\}", re.DOTALL)
128
+ _JSON_ARR_RE = re.compile(r"\[.*\]", re.DOTALL)
 
 
 
 
129
 
130
 
131
+ def robust_json_loads(text: str) -> Any:
132
  """
133
+ Extract the first valid JSON object/array from a messy LLM output.
134
  """
135
+ if not text:
136
+ raise ValueError("Empty model output.")
137
+
138
+ t = text.strip()
139
+
140
+ # direct try
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  try:
142
+ return json.loads(t)
143
  except Exception:
144
  pass
145
 
146
+ # try find object
147
+ m = _JSON_OBJ_RE.search(t)
148
+ if m:
149
+ cand = m.group(0)
150
  try:
151
+ return json.loads(cand)
152
  except Exception:
153
  pass
154
 
155
+ # try find array
156
+ m = _JSON_ARR_RE.search(t)
157
+ if m:
158
+ cand = m.group(0)
159
+ try:
160
+ return json.loads(cand)
161
+ except Exception:
162
+ pass
163
 
164
+ raise ValueError("Could not parse JSON from model output.")
165
 
166
 
167
+ def repair_to_json(api_key: str, bad_text: str, model: str) -> str:
 
 
 
 
 
168
  """
169
+ Uses a free text model to rewrite messy output into strict JSON only.
170
  """
171
+ sys = (
172
+ "You are a strict JSON formatter. "
173
+ "Return ONLY valid JSON. No markdown, no commentary. "
174
+ "Preserve keys/values if possible."
175
+ )
176
+ user = f"Convert this into valid JSON ONLY:\n\n{bad_text}"
177
 
 
 
 
 
178
  res = chat_completion(
179
  api_key=api_key,
180
  model=model,
181
+ messages=[
182
+ {"role": "system", "content": sys},
183
+ {"role": "user", "content": user},
184
+ ],
185
  temperature=0.0,
186
+ max_tokens=1200,
187
  )
188
+ return res.content.strip()
pdf_io.py CHANGED
@@ -1,10 +1,15 @@
1
  from __future__ import annotations
 
2
  from dataclasses import dataclass
3
- from typing import List, Optional, Tuple
 
4
  import fitz # PyMuPDF
5
  from PIL import Image
6
  import io
7
 
 
 
 
8
  @dataclass
9
  class PageText:
10
  page_index: int # 0-based
@@ -12,38 +17,39 @@ class PageText:
12
  ocr_text: str
13
  used_ocr: bool
14
 
 
15
  def _safe_text(s: str) -> str:
16
  return (s or "").replace("\x00", " ").strip()
17
 
18
- def render_page_to_pil(doc: fitz.Document, page_index: int, dpi: int) -> Image.Image:
 
 
 
 
 
 
 
 
 
19
  page = doc.load_page(page_index)
20
  zoom = dpi / 72.0
21
  mat = fitz.Matrix(zoom, zoom)
22
  pix = page.get_pixmap(matrix=mat, alpha=False)
23
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
24
  return img
25
 
 
26
  def ocr_pil_image(img: Image.Image, lang: str = "eng") -> str:
27
- try:
28
- import pytesseract
29
- except Exception as e:
30
- raise RuntimeError(
31
- "pytesseract not available. Install pytesseract and system Tesseract OCR."
32
- ) from e
33
-
34
- # psm 6: assume a block of text (good for tables + headings)
35
- txt = pytesseract.image_to_string(img, lang=lang, config="--psm 6")
36
  return _safe_text(txt)
37
 
38
- def is_likely_scanned(extracted_text: str, min_chars: int) -> bool:
39
- # If the page has almost no selectable text, it’s probably scanned.
40
- return len(_safe_text(extracted_text)) < min_chars
41
 
42
  def extract_texts_from_pdf(
43
  pdf_path: str,
44
- dpi: int,
45
- ocr_lang: str,
46
- min_text_chars_for_digital: int,
47
  ) -> Tuple[List[PageText], int]:
48
  doc = fitz.open(pdf_path)
49
  page_count = doc.page_count
@@ -63,13 +69,14 @@ def extract_texts_from_pdf(
63
  doc.close()
64
  return results, page_count
65
 
66
- def render_pages_to_png_bytes(pdf_path: str, page_indices: List[int], dpi: int) -> dict[int, bytes]:
 
67
  doc = fitz.open(pdf_path)
68
- out: dict[int, bytes] = {}
69
  for p in page_indices:
70
  img = render_page_to_pil(doc, p, dpi=dpi)
71
  buf = io.BytesIO()
72
  img.save(buf, format="PNG")
73
- out[p] = buf.getvalue()
74
  doc.close()
75
  return out
 
1
  from __future__ import annotations
2
+
3
  from dataclasses import dataclass
4
+ from typing import List, Tuple
5
+
6
  import fitz # PyMuPDF
7
  from PIL import Image
8
  import io
9
 
10
+ import pytesseract
11
+
12
+
13
  @dataclass
14
  class PageText:
15
  page_index: int # 0-based
 
17
  ocr_text: str
18
  used_ocr: bool
19
 
20
+
21
  def _safe_text(s: str) -> str:
22
  return (s or "").replace("\x00", " ").strip()
23
 
24
+
25
+ def is_likely_scanned(extracted_text: str, min_text_chars_for_digital: int) -> bool:
26
+ """
27
+ Simple heuristic: if the native extracted text is too short, likely scanned.
28
+ """
29
+ t = _safe_text(extracted_text)
30
+ return len(t) < min_text_chars_for_digital
31
+
32
+
33
+ def render_page_to_pil(doc: fitz.Document, page_index: int, dpi: int = 200) -> Image.Image:
34
  page = doc.load_page(page_index)
35
  zoom = dpi / 72.0
36
  mat = fitz.Matrix(zoom, zoom)
37
  pix = page.get_pixmap(matrix=mat, alpha=False)
38
+ img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
39
  return img
40
 
41
+
42
  def ocr_pil_image(img: Image.Image, lang: str = "eng") -> str:
43
+ # You can also add config like "--psm 6" if needed.
44
+ txt = pytesseract.image_to_string(img, lang=lang)
 
 
 
 
 
 
 
45
  return _safe_text(txt)
46
 
 
 
 
47
 
48
  def extract_texts_from_pdf(
49
  pdf_path: str,
50
+ dpi: int = 200,
51
+ ocr_lang: str = "eng",
52
+ min_text_chars_for_digital: int = 80,
53
  ) -> Tuple[List[PageText], int]:
54
  doc = fitz.open(pdf_path)
55
  page_count = doc.page_count
 
69
  doc.close()
70
  return results, page_count
71
 
72
+
73
+ def render_pages_to_png_bytes(pdf_path: str, page_indices: List[int], dpi: int = 200) -> List[bytes]:
74
  doc = fitz.open(pdf_path)
75
+ out: List[bytes] = []
76
  for p in page_indices:
77
  img = render_page_to_pil(doc, p, dpi=dpi)
78
  buf = io.BytesIO()
79
  img.save(buf, format="PNG")
80
+ out.append(buf.getvalue())
81
  doc.close()
82
  return out
requirements.txt CHANGED
@@ -5,4 +5,4 @@ pymupdf
5
  pillow
6
  requests
7
  python-dotenv
8
- pytesseract
 
5
  pillow
6
  requests
7
  python-dotenv
8
+ pytesseract
statement_candidates.py CHANGED
@@ -1,22 +1,19 @@
1
- # statement_candidates.py
2
  from __future__ import annotations
3
 
4
  import re
5
- from dataclasses import dataclass
6
- from typing import Any, Dict, List, Optional, Sequence, Tuple
7
  import difflib
 
8
 
9
 
10
  # =========================
11
- # Targets (you want ONLY these 3)
12
  # =========================
13
  TARGETS = ["balance_sheet", "profit_and_loss", "cash_flow"]
 
14
 
15
- # Auxiliary statements used ONLY for delimiting ranges (helpful in 10-K order)
16
- AUX = ["comprehensive_income", "equity", "notes"]
17
 
18
  # =========================
19
- # Title variants (based on your screenshots + common 10-K phrasing)
20
  # =========================
21
  TITLE_VARIANTS: Dict[str, List[str]] = {
22
  "balance_sheet": [
@@ -24,9 +21,10 @@ TITLE_VARIANTS: Dict[str, List[str]] = {
24
  "Standalone Balance Sheets",
25
  "Balance Sheets",
26
  "Statement of Financial Position",
 
27
  ],
28
  "profit_and_loss": [
29
- "Consolidated Statements of Earnings", # AbbVie screenshot
30
  "Standalone Statements of Earnings",
31
  "Consolidated Statements of Operations",
32
  "Standalone Statements of Operations",
@@ -34,6 +32,7 @@ TITLE_VARIANTS: Dict[str, List[str]] = {
34
  "Standalone Statements of Income",
35
  "Income Statement",
36
  "Statement of Profit and Loss",
 
37
  ],
38
  "cash_flow": [
39
  "Consolidated Statements of Cash Flows",
@@ -41,7 +40,7 @@ TITLE_VARIANTS: Dict[str, List[str]] = {
41
  "Statement of Cash Flows",
42
  "Cash Flow Statement",
43
  ],
44
- # auxiliary
45
  "comprehensive_income": [
46
  "Consolidated Statements of Comprehensive Income",
47
  "Standalone Statements of Comprehensive Income",
@@ -60,12 +59,8 @@ TITLE_VARIANTS: Dict[str, List[str]] = {
60
  ],
61
  }
62
 
63
- # Footer phrase (exact idea from your images)
64
  INTEGRAL_FOOTER = "the accompanying notes are an integral part"
65
 
66
- # =========================
67
- # Signature table line-items (increase precision against note tables)
68
- # =========================
69
  SIG_TERMS: Dict[str, List[str]] = {
70
  "balance_sheet": [
71
  "total assets",
@@ -73,22 +68,24 @@ SIG_TERMS: Dict[str, List[str]] = {
73
  "total equity",
74
  "stockholders' equity",
75
  "shareholders' equity",
76
- "assets",
77
  "liabilities and equity",
78
  "current assets",
79
  "current liabilities",
 
 
80
  ],
81
  "profit_and_loss": [
82
  "net revenues",
83
  "net sales",
84
  "revenue",
85
- "cost of products sold",
86
  "cost of sales",
 
87
  "gross profit",
88
  "operating income",
89
- "operating earnings",
90
- "net earnings",
91
  "net income",
 
92
  "earnings per share",
93
  "basic",
94
  "diluted",
@@ -101,122 +98,133 @@ SIG_TERMS: Dict[str, List[str]] = {
101
  "net cash used in investing activities",
102
  "net cash used in financing activities",
103
  "cash and cash equivalents, end of year",
104
- "cash and equivalents, end of year",
105
  "net change in cash",
106
  ],
107
- # aux
108
- "notes": ["note 1", "note 2", "notes to consolidated financial statements", "notes to standalone financial statements"],
109
  }
110
 
111
  NOTE_HEADING_RE = re.compile(r"^\s*note\s+\d+\b", re.IGNORECASE)
112
-
113
- # Typical TOC “dot leaders”
114
  DOT_LEADER_RE = re.compile(r"\.{5,}")
115
-
116
- # Item 8 TOC trigger
117
- ITEM8_RE = re.compile(r"\bITEM\s+8\.\s+FINANCIAL\s+STATEMENTS\s+AND\s+SUPPLEMENTARY\s+DATA\b", re.IGNORECASE)
 
118
 
119
 
120
  # =========================
121
- # Page object -> combined text
122
  # =========================
123
  def _combined_text(page_obj: Any) -> str:
124
- """
125
- Works with your PageText dataclass:
126
- extracted_text + ocr_text
127
- Also supports dict/object string fallback.
128
- """
129
  if page_obj is None:
130
  return ""
131
  if isinstance(page_obj, str):
132
  return page_obj
133
-
134
- # dict-like
135
  if isinstance(page_obj, dict):
136
  a = page_obj.get("extracted_text") or page_obj.get("text") or ""
137
  b = page_obj.get("ocr_text") or ""
138
  return (a + "\n" + b).strip()
139
-
140
- # attribute style
141
  a = getattr(page_obj, "extracted_text", None) or getattr(page_obj, "text", None) or ""
142
  b = getattr(page_obj, "ocr_text", None) or ""
143
  return (a + "\n" + b).strip()
144
 
145
 
146
- def _page_index(page_obj: Any, fallback: int) -> int:
147
- if isinstance(page_obj, dict):
148
- if isinstance(page_obj.get("page_index"), int):
149
- return int(page_obj["page_index"])
150
- v = getattr(page_obj, "page_index", None)
151
- return int(v) if isinstance(v, int) else fallback
152
-
153
-
154
  def _norm(s: str) -> str:
155
  return re.sub(r"\s+", " ", (s or "")).strip().lower()
156
 
157
 
158
- # =========================
159
- # Fuzzy title detection (OCR typos tolerant)
160
- # =========================
161
  def _fuzzy_line_contains_title(top_lines: List[str], title: str, threshold: float = 0.86) -> bool:
162
  title_n = _norm(title)
163
  for ln in top_lines:
164
  ln_n = _norm(ln)
165
  if not ln_n:
166
  continue
167
- # direct contains
168
  if title_n in ln_n:
169
  return True
170
- # fuzzy ratio
171
  r = difflib.SequenceMatcher(None, ln_n, title_n).ratio()
172
  if r >= threshold:
173
  return True
174
  return False
175
 
176
 
177
- def detect_title(text: str, stmt: str) -> bool:
 
 
 
 
178
  lines = (text or "").splitlines()
179
- top_lines = [ln.strip() for ln in lines[:14] if ln.strip()] # titles live here in your screenshots
 
180
  for variant in TITLE_VARIANTS.get(stmt, []):
181
  if _fuzzy_line_contains_title(top_lines, variant):
182
- return True
183
- return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
 
186
  # =========================
187
- # Footer internal page number extraction (10-K style)
188
  # =========================
189
  FOOTER_PIPE_RE = re.compile(r"\|\s*(\d{1,4})\s*$", re.MULTILINE)
190
  FOOTER_FORM_RE = re.compile(r"form\s+10-?k\s*\|\s*(\d{1,4})\s*$", re.IGNORECASE | re.MULTILINE)
191
 
 
192
  def extract_footer_internal_page(text: str) -> Optional[int]:
193
  t = text or ""
194
-
195
  m = FOOTER_PIPE_RE.findall(t)
196
  if m:
197
  return int(m[-1])
198
-
199
  m = FOOTER_FORM_RE.findall(t)
200
  if m:
201
  return int(m[-1])
202
-
203
- # fallback: last few non-empty lines that are ONLY digits (avoid table numbers)
204
  lines = [ln.strip() for ln in (t.splitlines() if t else []) if ln.strip()]
205
  for ln in reversed(lines[-6:]):
206
  if re.fullmatch(r"\d{1,4}", ln):
207
  return int(ln)
208
-
209
  return None
210
 
211
 
212
- # =========================
213
- # Item 8 TOC page detection + TOC parsing
214
- # AbbVie TOC is "title line" then next line has page number ("55")
215
- # =========================
216
  def find_item8_toc_page(all_texts: Sequence[str]) -> Optional[int]:
217
- """
218
- Choose the Item 8 page that LOOKS like an index/TOC (has dot leaders or 'Page').
219
- """
220
  candidates = []
221
  for i, txt in enumerate(all_texts):
222
  if not ITEM8_RE.search(txt or ""):
@@ -225,21 +233,18 @@ def find_item8_toc_page(all_texts: Sequence[str]) -> Optional[int]:
225
  tocish = ("page" in low) and (DOT_LEADER_RE.search(txt or "") is not None)
226
  if tocish:
227
  candidates.append(i)
228
-
229
  return candidates[0] if candidates else None
230
 
231
 
232
  def parse_statement_index_numbers(toc_text: str) -> Dict[str, int]:
233
  """
234
- Returns internal page numbers from the index.
235
- Handles:
236
- - same line "Consolidated Balance Sheets .... 57"
237
- - two-line "Consolidated Balance Sheets" newline "57" (AbbVie)
238
  """
239
  lines = [ln.strip() for ln in (toc_text or "").splitlines()]
240
  out: Dict[str, int] = {}
241
 
242
- # compile quick patterns
243
  pats = {
244
  "profit_and_loss": re.compile(r"(consolidated|standalone)\s+statements?\s+of\s+(earnings|operations|income)", re.I),
245
  "comprehensive_income": re.compile(r"(consolidated|standalone)\s+statements?\s+of\s+comprehensive\s+income", re.I),
@@ -253,82 +258,72 @@ def parse_statement_index_numbers(toc_text: str) -> Dict[str, int]:
253
  if not ln:
254
  continue
255
 
256
- for key, pat in pats.items():
257
- if not pat.search(ln):
 
258
  continue
259
 
260
- # case 1: number on same line at end
 
 
 
 
 
261
  m = re.findall(r"(\d{1,4})\s*$", ln)
262
  if m and ln.endswith(m[-1]):
263
- out[key] = int(m[-1])
264
  continue
265
 
266
- # case 2: number on next non-empty line
267
  j = i + 1
268
  while j < len(lines) and not lines[j]:
269
  j += 1
270
  if j < len(lines) and re.fullmatch(r"\d{1,4}", lines[j]):
271
- out[key] = int(lines[j])
272
 
273
  return out
274
 
275
 
276
  def build_internal_to_pdf_map(all_texts: Sequence[str]) -> Dict[int, int]:
277
- """
278
- internal_page_number -> pdf_page_index
279
- """
280
  mapping: Dict[int, int] = {}
281
  for pdf_i, txt in enumerate(all_texts):
282
  n = extract_footer_internal_page(txt or "")
283
  if n is None:
284
  continue
285
- mapping.setdefault(n, pdf_i) # keep first occurrence
286
  return mapping
287
 
288
 
289
  def map_internal_to_pdf(internal: int, internal_to_pdf: Dict[int, int]) -> Optional[int]:
290
- """
291
- Robust mapping:
292
- - direct if exists
293
- - else estimate from nearest known internal page (assumes mostly consecutive internal numbering)
294
- """
295
  if internal in internal_to_pdf:
296
  return internal_to_pdf[internal]
297
-
298
- # nearest neighbor estimate
299
  keys = sorted(internal_to_pdf.keys())
300
  if not keys:
301
  return None
302
-
303
- # find closest key
304
  best_k = min(keys, key=lambda k: abs(k - internal))
305
  return internal_to_pdf[best_k] + (internal - best_k)
306
 
307
 
308
  # =========================
309
- # Strong statement scoring (only used if TOC mapping fails)
310
  # =========================
311
  def _page_stats(text: str) -> Dict[str, float]:
312
  t = text or ""
313
  low = t.lower()
314
-
315
- # numeric signals
316
  year_count = len(re.findall(r"\b20\d{2}\b", t))
317
  currency_count = len(re.findall(r"[$€£]|usd|inr|eur|gbp", low))
318
- paren_neg = len(re.findall(r"\(\s*\d", t)) # (123) negatives
319
  integral = 1.0 if INTEGRAL_FOOTER in low else 0.0
320
 
321
  tokens = re.findall(r"[A-Za-z]+|\d+(?:,\d{3})*(?:\.\d+)?", t)
322
  if not tokens:
323
- return dict(num_ratio=0.0, year_count=float(year_count), currency=float(currency_count),
324
- paren=float(paren_neg), integral=integral)
325
 
326
  nums = sum(1 for tok in tokens if re.fullmatch(r"\d+(?:,\d{3})*(?:\.\d+)?", tok))
327
  alphas = sum(1 for tok in tokens if re.fullmatch(r"[A-Za-z]+", tok))
328
  num_ratio = nums / max(1.0, nums + alphas)
329
 
330
- return dict(num_ratio=float(num_ratio), year_count=float(year_count), currency=float(currency_count),
331
- paren=float(paren_neg), integral=integral)
332
 
333
 
334
  def score_statement_page(text: str, stmt: str) -> Tuple[float, Dict[str, Any]]:
@@ -336,179 +331,320 @@ def score_statement_page(text: str, stmt: str) -> Tuple[float, Dict[str, Any]]:
336
  top = (text or "")[:1200]
337
  st = _page_stats(text)
338
 
339
- reasons = {"title": False, "sig_hits": [], "integral": False, "penalties": [], "stats": st}
340
  score = 0.0
341
 
342
- # Title near top is a MUST (or fuzzy)
343
- if detect_title(top, stmt):
344
  score += 60.0
345
  reasons["title"] = True
 
346
  else:
347
- # without title, heavily downrank (note tables can be very numeric)
348
- score -= 25.0
349
- reasons["penalties"].append("no_title(-25)")
350
 
351
- # Integral footer is very characteristic of primary statements (seen in your screenshots)
352
  if st["integral"] > 0:
353
- score += 18.0
354
  reasons["integral"] = True
355
 
356
- # Signature line items: require multiple hits
357
  hits = 0
358
  for term in SIG_TERMS.get(stmt, []):
359
  if term in low:
360
  hits += 1
361
  reasons["sig_hits"].append(term)
362
- score += min(hits, 10) * 6.0 # stronger weight
363
 
364
- # Table-ness: years + currency + negative brackets + numeric ratio
365
- score += st["num_ratio"] * 30.0
366
- score += min(st["year_count"], 10.0) * 1.5
367
- score += min(st["currency"], 10.0) * 2.0
368
  score += min(st["paren"], 10.0) * 1.0
369
 
370
- # Hard penalties for NOTE pages
371
  if NOTE_HEADING_RE.search((text or "")[:220]):
372
- score -= 60.0
373
- reasons["penalties"].append("note_heading(-60)")
374
 
375
- # If it looks like TOC index page, punish (dot leaders)
376
  if DOT_LEADER_RE.search(text or ""):
377
- score -= 30.0
378
- reasons["penalties"].append("toc_dotleaders(-30)")
379
 
380
- # Guardrails:
381
- # If title found but it doesn't look like a table at all, punish
382
- if reasons["title"] and st["num_ratio"] < 0.10 and st["year_count"] < 1:
383
- score -= 35.0
384
- reasons["penalties"].append("title_without_table(-35)")
385
 
386
- # Require at least 2 signature hits for high confidence
387
  if hits < 2:
388
- score -= 18.0
389
- reasons["penalties"].append("low_sig_hits(<2)(-18)")
390
 
391
  return score, reasons
392
 
393
 
394
- # =========================
395
- # Range inference from ordered statement starts
396
- # =========================
397
- def infer_ranges_from_starts(
398
- starts_pdf: Dict[str, int],
399
- page_count: int,
400
- ordered_keys: List[str],
401
- ) -> Dict[str, Tuple[int, int]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  """
403
- Given start pdf indices (0-based) for an ordered list of keys,
404
- return inclusive ranges for TARGETS based on next-start-1.
 
 
 
405
  """
406
- # keep only those that exist
407
- items = [(k, starts_pdf[k]) for k in ordered_keys if k in starts_pdf and isinstance(starts_pdf[k], int)]
408
- items.sort(key=lambda x: x[1])
409
 
410
- next_start = {}
411
- for idx, (k, p) in enumerate(items):
412
- nxt = items[idx + 1][1] if idx + 1 < len(items) else None
413
- next_start[k] = nxt
 
 
 
 
 
 
 
 
 
 
 
414
 
415
- ranges: Dict[str, Tuple[int, int]] = {}
416
- for k, p in items:
417
- end = (next_start[k] - 1) if next_start[k] is not None else p
418
- end = min(max(end, p), page_count - 1)
419
- ranges[k] = (p, end)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
 
421
- # return only targets that exist
422
- return {k: ranges[k] for k in TARGETS if k in ranges}
423
 
424
 
425
  # =========================
426
- # Public API
427
  # =========================
428
  def build_candidate_lists(
429
  pages: Sequence[Any],
430
- top_k: int = 25,
 
 
431
  debug: bool = True,
432
  ) -> Tuple[Dict[str, List[Tuple[int, float]]], Dict[str, Any]]:
433
  """
434
  Returns:
435
- candidates: {stmt: [(pdf_page_idx, score), ...]} for TARGETS only
436
- debug_info: contains toc/internal mapping and top explanations
437
  """
438
  all_texts = [_combined_text(p) for p in pages]
439
- page_count = len(all_texts)
440
 
441
  debug_info: Dict[str, Any] = {
442
  "item8_toc_page": None,
443
  "toc_internal": {},
444
  "internal_to_pdf_map_size": 0,
445
- "toc_pdf_targets_all": {},
446
- "heuristic_ranges_0_based": {},
447
- "top_scoring": {},
448
  }
449
 
450
- # ---- 1) TOC-based detection (most accurate on 10-K) ----
 
 
 
 
451
  toc_i = find_item8_toc_page(all_texts)
452
  if toc_i is not None:
453
- toc_text = all_texts[toc_i]
 
454
  toc_internal = parse_statement_index_numbers(toc_text)
 
 
455
  internal_to_pdf = build_internal_to_pdf_map(all_texts)
 
456
 
457
- toc_pdf_all: Dict[str, int] = {}
458
- for k, internal_n in toc_internal.items():
459
- mapped = map_internal_to_pdf(internal_n, internal_to_pdf)
460
- if mapped is not None and 0 <= mapped < page_count:
461
- toc_pdf_all[k] = mapped
462
-
463
- debug_info.update({
464
- "item8_toc_page": toc_i,
465
- "toc_internal": toc_internal,
466
- "internal_to_pdf_map_size": len(internal_to_pdf),
467
- "toc_pdf_targets_all": toc_pdf_all,
468
- })
469
-
470
- # If we got our 3 targets, build direct ranges using the typical order:
471
- # Earnings -> Comprehensive Income -> Balance Sheet -> Equity -> Cash Flow -> Notes
472
- if all(k in toc_pdf_all for k in ["profit_and_loss", "balance_sheet", "cash_flow"]):
473
- ordered = ["profit_and_loss", "comprehensive_income", "balance_sheet", "equity", "cash_flow", "notes"]
474
- ranges = infer_ranges_from_starts(toc_pdf_all, page_count, ordered)
475
- debug_info["heuristic_ranges_0_based"] = ranges
476
-
477
- # Build candidates directly from these starts with huge confidence
478
- candidates = {k: [] for k in TARGETS}
479
- for k in TARGETS:
480
- start, end = ranges.get(k, (None, None))
481
- if start is None:
482
- continue
483
- # prioritize start page; include end too
484
- candidates[k].append((start, 999.0))
485
- if end != start:
486
- candidates[k].append((end, 950.0))
487
- return candidates, debug_info
488
 
489
- # ---- 2) Fallback: statement scoring over ALL pages ----
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
  candidates: Dict[str, List[Tuple[int, float]]] = {k: [] for k in TARGETS}
491
  reasons_store: Dict[str, Dict[int, Any]] = {k: {} for k in TARGETS}
492
 
493
- for i, p in enumerate(pages):
494
- idx = _page_index(p, i)
495
- txt = _combined_text(p)
496
-
497
  for stmt in TARGETS:
498
- sc, why = score_statement_page(txt, stmt)
499
  if sc > 0:
500
- candidates[stmt].append((idx, float(sc)))
501
- if debug and (why["title"] or sc > 80):
502
- reasons_store[stmt][idx] = why
503
 
504
  for stmt in TARGETS:
505
  candidates[stmt].sort(key=lambda x: x[1], reverse=True)
506
- candidates[stmt] = candidates[stmt][:max(8, top_k)]
507
- if debug:
508
- debug_info["top_scoring"][stmt] = [
509
- {"page": p, "score": round(s, 2), "why": reasons_store[stmt].get(p)}
510
- for p, s in candidates[stmt][:10]
511
- ]
512
 
513
  return candidates, debug_info
514
 
@@ -518,12 +654,13 @@ def select_pages_for_llm(
518
  debug_info: Dict[str, Any],
519
  page_count: int,
520
  max_images: int,
 
521
  ) -> List[int]:
522
  """
523
- If TOC-based ranges exist -> send ONLY those pages (+neighbors) (highest precision).
524
- Else -> send top candidates + neighbors.
525
  """
526
- picked = []
527
  seen = set()
528
 
529
  def add(p: int):
@@ -531,19 +668,36 @@ def select_pages_for_llm(
531
  seen.add(p)
532
  picked.append(p)
533
 
534
- # TOC ranges (best)
535
- ranges = debug_info.get("heuristic_ranges_0_based") or {}
536
- if ranges:
537
  for stmt in ["profit_and_loss", "balance_sheet", "cash_flow"]:
538
- if stmt in ranges:
539
- s, e = ranges[stmt]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
540
  for p in range(s, e + 1):
541
  add(p)
542
  add(s - 1)
543
  add(e + 1)
 
544
  return sorted(picked)
545
 
546
- # fallback
547
  for stmt in ["profit_and_loss", "balance_sheet", "cash_flow"]:
548
  for (p, _sc) in candidates.get(stmt, [])[:2]:
549
  add(p)
 
 
1
  from __future__ import annotations
2
 
3
  import re
 
 
4
  import difflib
5
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
6
 
7
 
8
  # =========================
9
+ # Targets (ONLY these 3)
10
  # =========================
11
  TARGETS = ["balance_sheet", "profit_and_loss", "cash_flow"]
12
+ AUX = ["comprehensive_income", "equity", "notes"] # only for delimiting (when available)
13
 
 
 
14
 
15
  # =========================
16
+ # Title variants
17
  # =========================
18
  TITLE_VARIANTS: Dict[str, List[str]] = {
19
  "balance_sheet": [
 
21
  "Standalone Balance Sheets",
22
  "Balance Sheets",
23
  "Statement of Financial Position",
24
+ "Standalone Statement of Financial Position",
25
  ],
26
  "profit_and_loss": [
27
+ "Consolidated Statements of Earnings",
28
  "Standalone Statements of Earnings",
29
  "Consolidated Statements of Operations",
30
  "Standalone Statements of Operations",
 
32
  "Standalone Statements of Income",
33
  "Income Statement",
34
  "Statement of Profit and Loss",
35
+ "Statement of Profit & Loss",
36
  ],
37
  "cash_flow": [
38
  "Consolidated Statements of Cash Flows",
 
40
  "Statement of Cash Flows",
41
  "Cash Flow Statement",
42
  ],
43
+ # aux
44
  "comprehensive_income": [
45
  "Consolidated Statements of Comprehensive Income",
46
  "Standalone Statements of Comprehensive Income",
 
59
  ],
60
  }
61
 
 
62
  INTEGRAL_FOOTER = "the accompanying notes are an integral part"
63
 
 
 
 
64
  SIG_TERMS: Dict[str, List[str]] = {
65
  "balance_sheet": [
66
  "total assets",
 
68
  "total equity",
69
  "stockholders' equity",
70
  "shareholders' equity",
 
71
  "liabilities and equity",
72
  "current assets",
73
  "current liabilities",
74
+ "non-current assets",
75
+ "non-current liabilities",
76
  ],
77
  "profit_and_loss": [
78
  "net revenues",
79
  "net sales",
80
  "revenue",
 
81
  "cost of sales",
82
+ "cost of products sold",
83
  "gross profit",
84
  "operating income",
85
+ "operating profit",
86
+ "profit before tax",
87
  "net income",
88
+ "net earnings",
89
  "earnings per share",
90
  "basic",
91
  "diluted",
 
98
  "net cash used in investing activities",
99
  "net cash used in financing activities",
100
  "cash and cash equivalents, end of year",
 
101
  "net change in cash",
102
  ],
 
 
103
  }
104
 
105
  NOTE_HEADING_RE = re.compile(r"^\s*note\s+\d+\b", re.IGNORECASE)
 
 
106
  DOT_LEADER_RE = re.compile(r"\.{5,}")
107
+ ITEM8_RE = re.compile(
108
+ r"\bITEM\s+8\.\s+FINANCIAL\s+STATEMENTS\s+AND\s+SUPPLEMENTARY\s+DATA\b", re.IGNORECASE
109
+ )
110
+ CONTINUED_RE = re.compile(r"\bcontinued\b", re.IGNORECASE)
111
 
112
 
113
  # =========================
114
+ # Utilities
115
  # =========================
116
  def _combined_text(page_obj: Any) -> str:
 
 
 
 
 
117
  if page_obj is None:
118
  return ""
119
  if isinstance(page_obj, str):
120
  return page_obj
 
 
121
  if isinstance(page_obj, dict):
122
  a = page_obj.get("extracted_text") or page_obj.get("text") or ""
123
  b = page_obj.get("ocr_text") or ""
124
  return (a + "\n" + b).strip()
 
 
125
  a = getattr(page_obj, "extracted_text", None) or getattr(page_obj, "text", None) or ""
126
  b = getattr(page_obj, "ocr_text", None) or ""
127
  return (a + "\n" + b).strip()
128
 
129
 
 
 
 
 
 
 
 
 
130
  def _norm(s: str) -> str:
131
  return re.sub(r"\s+", " ", (s or "")).strip().lower()
132
 
133
 
 
 
 
134
  def _fuzzy_line_contains_title(top_lines: List[str], title: str, threshold: float = 0.86) -> bool:
135
  title_n = _norm(title)
136
  for ln in top_lines:
137
  ln_n = _norm(ln)
138
  if not ln_n:
139
  continue
 
140
  if title_n in ln_n:
141
  return True
 
142
  r = difflib.SequenceMatcher(None, ln_n, title_n).ratio()
143
  if r >= threshold:
144
  return True
145
  return False
146
 
147
 
148
+ def detect_title_match(text: str, stmt: str) -> Tuple[bool, Optional[str], str]:
149
+ """
150
+ Returns (matched?, matched_variant, scope)
151
+ scope in {"consolidated","standalone","unknown"}
152
+ """
153
  lines = (text or "").splitlines()
154
+ top_lines = [ln.strip() for ln in lines[:16] if ln.strip()]
155
+
156
  for variant in TITLE_VARIANTS.get(stmt, []):
157
  if _fuzzy_line_contains_title(top_lines, variant):
158
+ vlow = variant.lower()
159
+ if "consolidated" in vlow:
160
+ scope = "consolidated"
161
+ elif "standalone" in vlow or "separate" in vlow:
162
+ scope = "standalone"
163
+ else:
164
+ scope = "unknown"
165
+ return True, variant, scope
166
+
167
+ joined = " ".join(top_lines).lower()
168
+ # fallback for OCR garble
169
+ if stmt == "balance_sheet" and ("balance sheet" in joined or "financial position" in joined):
170
+ if "consolidated" in joined:
171
+ return True, None, "consolidated"
172
+ if "standalone" in joined or "separate" in joined:
173
+ return True, None, "standalone"
174
+ return True, None, "unknown"
175
+
176
+ if stmt == "cash_flow" and ("cash flow" in joined or "cash flows" in joined):
177
+ if "consolidated" in joined:
178
+ return True, None, "consolidated"
179
+ if "standalone" in joined or "separate" in joined:
180
+ return True, None, "standalone"
181
+ return True, None, "unknown"
182
+
183
+ if stmt == "profit_and_loss" and (
184
+ "statement of profit" in joined
185
+ or "profit and loss" in joined
186
+ or "income statement" in joined
187
+ or "statements of income" in joined
188
+ or "statements of operations" in joined
189
+ or "statements of earnings" in joined
190
+ ):
191
+ if "consolidated" in joined:
192
+ return True, None, "consolidated"
193
+ if "standalone" in joined or "separate" in joined:
194
+ return True, None, "standalone"
195
+ return True, None, "unknown"
196
+
197
+ return False, None, "unknown"
198
+
199
+
200
+ def detect_title(text: str, stmt: str) -> bool:
201
+ ok, _, _ = detect_title_match(text, stmt)
202
+ return ok
203
 
204
 
205
  # =========================
206
+ # (Optional) 10-K TOC mapping helpers (kept, but now scope-safe)
207
  # =========================
208
  FOOTER_PIPE_RE = re.compile(r"\|\s*(\d{1,4})\s*$", re.MULTILINE)
209
  FOOTER_FORM_RE = re.compile(r"form\s+10-?k\s*\|\s*(\d{1,4})\s*$", re.IGNORECASE | re.MULTILINE)
210
 
211
+
212
  def extract_footer_internal_page(text: str) -> Optional[int]:
213
  t = text or ""
 
214
  m = FOOTER_PIPE_RE.findall(t)
215
  if m:
216
  return int(m[-1])
 
217
  m = FOOTER_FORM_RE.findall(t)
218
  if m:
219
  return int(m[-1])
 
 
220
  lines = [ln.strip() for ln in (t.splitlines() if t else []) if ln.strip()]
221
  for ln in reversed(lines[-6:]):
222
  if re.fullmatch(r"\d{1,4}", ln):
223
  return int(ln)
 
224
  return None
225
 
226
 
 
 
 
 
227
  def find_item8_toc_page(all_texts: Sequence[str]) -> Optional[int]:
 
 
 
228
  candidates = []
229
  for i, txt in enumerate(all_texts):
230
  if not ITEM8_RE.search(txt or ""):
 
233
  tocish = ("page" in low) and (DOT_LEADER_RE.search(txt or "") is not None)
234
  if tocish:
235
  candidates.append(i)
 
236
  return candidates[0] if candidates else None
237
 
238
 
239
  def parse_statement_index_numbers(toc_text: str) -> Dict[str, int]:
240
  """
241
+ Return internal page numbers from the index.
242
+ IMPORTANT: keeps consolidated + standalone separately:
243
+ key = f"{stmt}__{scope}"
 
244
  """
245
  lines = [ln.strip() for ln in (toc_text or "").splitlines()]
246
  out: Dict[str, int] = {}
247
 
 
248
  pats = {
249
  "profit_and_loss": re.compile(r"(consolidated|standalone)\s+statements?\s+of\s+(earnings|operations|income)", re.I),
250
  "comprehensive_income": re.compile(r"(consolidated|standalone)\s+statements?\s+of\s+comprehensive\s+income", re.I),
 
258
  if not ln:
259
  continue
260
 
261
+ for stmt, pat in pats.items():
262
+ mscope = pat.search(ln)
263
+ if not mscope:
264
  continue
265
 
266
+ scope = (mscope.group(1) or "").strip().lower()
267
+ if scope not in {"consolidated", "standalone"}:
268
+ scope = "unknown"
269
+ out_key = f"{stmt}__{scope}"
270
+
271
+ # number at end of line
272
  m = re.findall(r"(\d{1,4})\s*$", ln)
273
  if m and ln.endswith(m[-1]):
274
+ out.setdefault(out_key, int(m[-1]))
275
  continue
276
 
277
+ # number on next line
278
  j = i + 1
279
  while j < len(lines) and not lines[j]:
280
  j += 1
281
  if j < len(lines) and re.fullmatch(r"\d{1,4}", lines[j]):
282
+ out.setdefault(out_key, int(lines[j]))
283
 
284
  return out
285
 
286
 
287
  def build_internal_to_pdf_map(all_texts: Sequence[str]) -> Dict[int, int]:
 
 
 
288
  mapping: Dict[int, int] = {}
289
  for pdf_i, txt in enumerate(all_texts):
290
  n = extract_footer_internal_page(txt or "")
291
  if n is None:
292
  continue
293
+ mapping.setdefault(n, pdf_i)
294
  return mapping
295
 
296
 
297
  def map_internal_to_pdf(internal: int, internal_to_pdf: Dict[int, int]) -> Optional[int]:
 
 
 
 
 
298
  if internal in internal_to_pdf:
299
  return internal_to_pdf[internal]
 
 
300
  keys = sorted(internal_to_pdf.keys())
301
  if not keys:
302
  return None
 
 
303
  best_k = min(keys, key=lambda k: abs(k - internal))
304
  return internal_to_pdf[best_k] + (internal - best_k)
305
 
306
 
307
  # =========================
308
+ # Scoring
309
  # =========================
310
  def _page_stats(text: str) -> Dict[str, float]:
311
  t = text or ""
312
  low = t.lower()
 
 
313
  year_count = len(re.findall(r"\b20\d{2}\b", t))
314
  currency_count = len(re.findall(r"[$€£]|usd|inr|eur|gbp", low))
315
+ paren_neg = len(re.findall(r"\(\s*\d", t))
316
  integral = 1.0 if INTEGRAL_FOOTER in low else 0.0
317
 
318
  tokens = re.findall(r"[A-Za-z]+|\d+(?:,\d{3})*(?:\.\d+)?", t)
319
  if not tokens:
320
+ return dict(num_ratio=0.0, year_count=float(year_count), currency=float(currency_count), paren=float(paren_neg), integral=integral)
 
321
 
322
  nums = sum(1 for tok in tokens if re.fullmatch(r"\d+(?:,\d{3})*(?:\.\d+)?", tok))
323
  alphas = sum(1 for tok in tokens if re.fullmatch(r"[A-Za-z]+", tok))
324
  num_ratio = nums / max(1.0, nums + alphas)
325
 
326
+ return dict(num_ratio=float(num_ratio), year_count=float(year_count), currency=float(currency_count), paren=float(paren_neg), integral=integral)
 
327
 
328
 
329
  def score_statement_page(text: str, stmt: str) -> Tuple[float, Dict[str, Any]]:
 
331
  top = (text or "")[:1200]
332
  st = _page_stats(text)
333
 
334
+ reasons: Dict[str, Any] = {"title": False, "scope": "unknown", "sig_hits": [], "integral": False, "penalties": [], "stats": st}
335
  score = 0.0
336
 
337
+ ok, _, scope = detect_title_match(top, stmt)
338
+ if ok:
339
  score += 60.0
340
  reasons["title"] = True
341
+ reasons["scope"] = scope
342
  else:
343
+ score -= 20.0
344
+ reasons["penalties"].append("no_title(-20)")
 
345
 
 
346
  if st["integral"] > 0:
347
+ score += 12.0
348
  reasons["integral"] = True
349
 
 
350
  hits = 0
351
  for term in SIG_TERMS.get(stmt, []):
352
  if term in low:
353
  hits += 1
354
  reasons["sig_hits"].append(term)
355
+ score += min(hits, 10) * 5.0
356
 
357
+ score += st["num_ratio"] * 24.0
358
+ score += min(st["year_count"], 10.0) * 1.2
359
+ score += min(st["currency"], 10.0) * 1.8
 
360
  score += min(st["paren"], 10.0) * 1.0
361
 
 
362
  if NOTE_HEADING_RE.search((text or "")[:220]):
363
+ score -= 45.0
364
+ reasons["penalties"].append("note_heading(-45)")
365
 
 
366
  if DOT_LEADER_RE.search(text or ""):
367
+ score -= 25.0
368
+ reasons["penalties"].append("toc_dotleaders(-25)")
369
 
370
+ if reasons["title"] and st["num_ratio"] < 0.08 and st["year_count"] < 1:
371
+ score -= 30.0
372
+ reasons["penalties"].append("title_without_table(-30)")
 
 
373
 
 
374
  if hits < 2:
375
+ score -= 12.0
376
+ reasons["penalties"].append("low_sig_hits(<2)(-12)")
377
 
378
  return score, reasons
379
 
380
 
381
+ def _statement_signal_no_title(text: str, stmt: str) -> float:
382
+ """
383
+ Continuation-page score (no title required). Used to extend blocks forward.
384
+ """
385
+ if not text:
386
+ return 0.0
387
+
388
+ if NOTE_HEADING_RE.search(text[:220]):
389
+ return 0.0
390
+ if DOT_LEADER_RE.search(text):
391
+ return 0.0
392
+
393
+ low = text.lower()
394
+ st = _page_stats(text)
395
+
396
+ hits = 0
397
+ for term in SIG_TERMS.get(stmt, []):
398
+ if term in low:
399
+ hits += 1
400
+
401
+ score = 0.0
402
+ score += min(hits, 10) * 4.5
403
+ score += st["num_ratio"] * 26.0
404
+ score += min(st["year_count"], 10.0) * 1.1
405
+ score += min(st["currency"], 10.0) * 1.5
406
+ score += min(st["paren"], 10.0) * 0.7
407
+
408
+ if CONTINUED_RE.search(text[:240]):
409
+ score += 8.0
410
+
411
+ # special: if a page has strong signature terms + years, it's often a continuation
412
+ if hits >= 2 and st["year_count"] >= 1:
413
+ score += 6.0
414
+
415
+ return score
416
+
417
+
418
+ def _any_other_statement_title(text: str, stmt: str) -> bool:
419
+ for other in TARGETS:
420
+ if other == stmt:
421
+ continue
422
+ if detect_title(text[:1200], other):
423
+ return True
424
+ return False
425
+
426
+
427
+ def _expand_block(all_texts: Sequence[str], stmt: str, start: int, max_forward: int = 6) -> int:
428
+ """
429
+ Expand forward to include continuation pages.
430
+ Stops if another statement begins (unless this stmt title repeats).
431
+ """
432
+ end = start
433
+ n = len(all_texts)
434
+
435
+ for j in range(start + 1, min(n, start + 1 + max_forward)):
436
+ txt = all_texts[j] or ""
437
+
438
+ if _any_other_statement_title(txt, stmt) and not detect_title(txt[:1200], stmt):
439
+ break
440
+
441
+ sig = _statement_signal_no_title(txt, stmt)
442
+ if sig >= 13.5:
443
+ end = j
444
+ continue
445
+
446
+ if CONTINUED_RE.search(txt[:240]) and sig >= 8.0:
447
+ end = j
448
+ continue
449
+
450
+ break
451
+
452
+ return end
453
+
454
+
455
+ def _blocks_overlap(a: Tuple[int, int], b: Tuple[int, int]) -> bool:
456
+ return not (a[1] < b[0] or b[1] < a[0])
457
+
458
+
459
+ def _dedup_blocks(blocks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
460
+ """
461
+ Deduplicate overlapping blocks, keeping higher 'score'.
462
+ """
463
+ blocks = sorted(blocks, key=lambda x: (int(x.get("start", 10**9)), -(float(x.get("score") or 0.0))))
464
+ kept: List[Dict[str, Any]] = []
465
+ for b in blocks:
466
+ r = (int(b.get("start")), int(b.get("end")))
467
+ merged = False
468
+ for k in kept:
469
+ kr = (int(k.get("start")), int(k.get("end")))
470
+ if _blocks_overlap(r, kr):
471
+ if float(b.get("score") or 0.0) > float(k.get("score") or 0.0):
472
+ k.update(b)
473
+ merged = True
474
+ break
475
+ if not merged:
476
+ kept.append(b)
477
+ return kept
478
+
479
+
480
def build_blocks_from_titles(all_texts: Sequence[str], continuation_max_forward: int = 6) -> Dict[str, List[Dict[str, Any]]]:
    """
    Find MULTIPLE blocks per statement (consolidated + standalone).

    Strategy:
      1) locate title pages for each statement
      2) cluster nearby title hits sharing the same scope
      3) expand each cluster's earliest page forward with continuation scoring
    """
    out: Dict[str, List[Dict[str, Any]]] = {stmt: [] for stmt in TARGETS}

    for stmt in TARGETS:
        # 1) collect (page_idx, score, scope, title-variant) tuples for pages
        #    whose first ~1200 chars match a statement title and that score
        #    at least 30 on full-page content.
        hits: List[Tuple[int, float, str, Optional[str]]] = []
        for page_idx, page_text in enumerate(all_texts):
            matched, variant, scope = detect_title_match((page_text or "")[:1200], stmt)
            if not matched:
                continue
            page_score, _why = score_statement_page(page_text or "", stmt)
            if page_score < 30.0:
                continue
            hits.append((page_idx, float(page_score), scope, variant))

        if not hits:
            continue
        hits.sort(key=lambda h: h[0])

        # 2) group hits of the same scope that lie within 3 pages of each other
        clusters: List[List[Tuple[int, float, str, Optional[str]]]] = [[hits[0]]]
        for hit in hits[1:]:
            prev = clusters[-1][-1]
            if hit[2] == prev[2] and hit[0] <= prev[0] + 3:
                clusters[-1].append(hit)
            else:
                clusters.append([hit])

        # 3) emit one block per cluster: start at the earliest hit, take
        #    scope/title/score from the strongest hit, expand the end forward.
        found: List[Dict[str, Any]] = []
        for cluster in clusters:
            first_page = min(entry[0] for entry in cluster)
            strongest = max(cluster, key=lambda entry: entry[1])
            block_end = _expand_block(all_texts, stmt, first_page, max_forward=continuation_max_forward)
            found.append(
                {
                    "start": int(first_page),
                    "end": int(block_end),
                    "scope": strongest[2],
                    "title": strongest[3],
                    "score": float(strongest[1]),
                }
            )

        out[stmt] = _dedup_blocks(found)

    return out
 
540
 
541
 
542
  # =========================
543
+ # Main builder
544
  # =========================
545
def build_candidate_lists(
    pages: Sequence[Any],
    page_count: int,
    topk_per_statement: int = 3,
    continuation_max_forward: int = 6,
    debug: bool = True,
) -> Tuple[Dict[str, List[Tuple[int, float]]], Dict[str, Any]]:
    """
    Build per-statement candidate pages plus heuristic page blocks.

    Combines three signals:
      1) title-detection blocks (build_blocks_from_titles),
      2) 10-K "Item 8" TOC-derived blocks (optional; mostly US 10-Ks),
      3) raw per-page scoring (score_statement_page) as a general fallback.

    Args:
        pages: page objects; each is flattened to text via _combined_text.
        page_count: NOTE(review): not referenced in this body — presumably
            kept for API compatibility with the caller; confirm before removing.
        topk_per_statement: number of top-scoring pages kept per statement.
        continuation_max_forward: forward-page limit passed to _expand_block.
        debug: when True, record per-page scoring reasons in debug_info.

    Returns:
        candidates: {stmt: [(page_idx, score), ...]} sorted by score
            descending and truncated to topk_per_statement.
        debug_info: includes heuristic_blocks_0_based per stmt (list of blocks),
            top_scoring pages, TOC diagnostics, and reasons_<stmt> maps.
    """
    all_texts = [_combined_text(p) for p in pages]

    debug_info: Dict[str, Any] = {
        "item8_toc_page": None,
        "toc_internal": {},
        "internal_to_pdf_map_size": 0,
        "heuristic_blocks_0_based": {k: [] for k in TARGETS},
        "top_scoring": {k: [] for k in TARGETS},
    }

    # 1) Title-based multi-blocks (works for many non-10K PDFs too)
    title_blocks = build_blocks_from_titles(all_texts, continuation_max_forward=continuation_max_forward)

    # 2) Try 10-K Item8 TOC mapping (optional; mostly US 10-Ks)
    toc_blocks: Dict[str, List[Dict[str, Any]]] = {k: [] for k in TARGETS}
    toc_i = find_item8_toc_page(all_texts)
    if toc_i is not None:
        debug_info["item8_toc_page"] = toc_i
        toc_text = all_texts[toc_i] or ""
        toc_internal = parse_statement_index_numbers(toc_text)
        debug_info["toc_internal"] = toc_internal

        internal_to_pdf = build_internal_to_pdf_map(all_texts)
        debug_info["internal_to_pdf_map_size"] = len(internal_to_pdf)

        # Convert internal page numbers -> physical PDF page indices.
        # Keys look like "<stmt>__<scope>"; anything else is skipped.
        for key_scoped, internal_page in toc_internal.items():
            if "__" not in key_scoped:
                continue
            stmt, scope = key_scoped.split("__", 1)
            if stmt not in TARGETS:
                continue

            start_pdf = map_internal_to_pdf(internal_page, internal_to_pdf)
            if start_pdf is None:
                continue

            # expand a block from TOC-derived start
            end_pdf = _expand_block(all_texts, stmt, start_pdf, max_forward=continuation_max_forward)

            toc_blocks[stmt].append(
                {
                    "start": int(start_pdf),
                    "end": int(end_pdf),
                    "scope": scope if scope in {"consolidated", "standalone"} else "unknown",
                    "title": None,
                    "score": 55.0,  # heuristic: fixed mid confidence for TOC-derived blocks
                }
            )

        for stmt in TARGETS:
            toc_blocks[stmt] = _dedup_blocks(toc_blocks[stmt])

    # Merge title-derived and TOC-derived blocks, deduplicating overlaps.
    merged_blocks: Dict[str, List[Dict[str, Any]]] = {}
    for stmt in TARGETS:
        merged_blocks[stmt] = _dedup_blocks((title_blocks.get(stmt) or []) + (toc_blocks.get(stmt) or []))

        # Keep only top N blocks by score, but keep distinct scope if possible:
        # a scope already seen is skipped unless it is the only block available.
        bl = sorted(merged_blocks[stmt], key=lambda b: float(b.get("score") or 0.0), reverse=True)
        chosen: List[Dict[str, Any]] = []
        seen_scope = set()
        for b in bl:
            scope = (b.get("scope") or "unknown")
            if scope in seen_scope and len(bl) > 1:
                continue
            chosen.append(b)
            seen_scope.add(scope)
            if len(chosen) >= 4:  # internal cap, actual final cap comes from settings in main
                break
        merged_blocks[stmt] = sorted(chosen, key=lambda b: (int(b["start"]), int(b["end"])))

    debug_info["heuristic_blocks_0_based"] = merged_blocks

    # 3) Strong per-page scoring candidates (fallback / also helpful for LLM page picking)
    candidates: Dict[str, List[Tuple[int, float]]] = {k: [] for k in TARGETS}
    reasons_store: Dict[str, Dict[int, Any]] = {k: {} for k in TARGETS}

    for i, txt in enumerate(all_texts):
        for stmt in TARGETS:
            sc, why = score_statement_page(txt or "", stmt)
            if sc > 0:
                candidates[stmt].append((i, float(sc)))
                # Only keep reasons for notable pages: title match or score > 80.
                if debug and (why.get("title") or sc > 80):
                    reasons_store[stmt][i] = why

    for stmt in TARGETS:
        candidates[stmt].sort(key=lambda x: x[1], reverse=True)
        debug_info["top_scoring"][stmt] = candidates[stmt][: min(len(candidates[stmt]), 10)]
        candidates[stmt] = candidates[stmt][:topk_per_statement]
        debug_info[f"reasons_{stmt}"] = reasons_store[stmt]

    return candidates, debug_info
650
 
 
654
  debug_info: Dict[str, Any],
655
  page_count: int,
656
  max_images: int,
657
+ max_blocks_per_statement: int = 2,
658
  ) -> List[int]:
659
  """
660
+ Prefer multi-block heuristic pages (include BOTH consolidated + standalone if found).
661
+ Else fallback to top candidates + neighbors.
662
  """
663
+ picked: List[int] = []
664
  seen = set()
665
 
666
  def add(p: int):
 
668
  seen.add(p)
669
  picked.append(p)
670
 
671
+ blocks_by_stmt = debug_info.get("heuristic_blocks_0_based") or {}
672
+ if isinstance(blocks_by_stmt, dict) and any(blocks_by_stmt.get(k) for k in TARGETS):
 
673
  for stmt in ["profit_and_loss", "balance_sheet", "cash_flow"]:
674
+ bl = blocks_by_stmt.get(stmt) or []
675
+ if not isinstance(bl, list) or not bl:
676
+ continue
677
+
678
+ # pick top blocks, prefer distinct scopes
679
+ bl_sorted = sorted(bl, key=lambda b: float(b.get("score") or 0.0), reverse=True)
680
+ chosen: List[Dict[str, Any]] = []
681
+ seen_scope = set()
682
+ for b in bl_sorted:
683
+ scope = (b.get("scope") or "unknown")
684
+ if scope in seen_scope and len(bl_sorted) > 1:
685
+ continue
686
+ chosen.append(b)
687
+ seen_scope.add(scope)
688
+ if len(chosen) >= max_blocks_per_statement:
689
+ break
690
+
691
+ for b in chosen:
692
+ s, e = int(b.get("start")), int(b.get("end"))
693
  for p in range(s, e + 1):
694
  add(p)
695
  add(s - 1)
696
  add(e + 1)
697
+
698
  return sorted(picked)
699
 
700
+ # fallback: use top candidates
701
  for stmt in ["profit_and_loss", "balance_sheet", "cash_flow"]:
702
  for (p, _sc) in candidates.get(stmt, [])[:2]:
703
  add(p)