FridayCodehhr committed on
Commit
a9d5e1b
·
verified ·
1 Parent(s): 689d59b

Upload 10 files

Browse files
Files changed (10) hide show
  1. Dockerfile +0 -8
  2. app.py +43 -39
  3. config.py +22 -3
  4. image_server_snippet.py +22 -0
  5. index.html +153 -18
  6. main.py +307 -217
  7. openrouter_client.py +87 -155
  8. pdf_io.py +28 -21
  9. requirements.txt +1 -1
  10. statement_candidates.py +369 -215
Dockerfile CHANGED
@@ -1,7 +1,6 @@
1
  # Use official Python runtime as a parent image
2
  FROM python:3.10-slim
3
 
4
- # Set the working directory in the container
5
  WORKDIR /app
6
 
7
  # Install system dependencies (Tesseract)
@@ -10,23 +9,16 @@ RUN apt-get update && apt-get install -y \
10
  libtesseract-dev \
11
  && rm -rf /var/lib/apt/lists/*
12
 
13
- # Copy requirements first to leverage Docker cache
14
  COPY requirements.txt .
15
-
16
- # Install Python dependencies
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
19
- # Copy the rest of the application code
20
  COPY . .
21
 
22
- # Create a user to run the app (security best practice, required by some environments)
23
  RUN useradd -m -u 1000 user
24
  USER user
25
  ENV HOME=/home/user \
26
  PATH=/home/user/.local/bin:$PATH
27
 
28
- # Expose port 7860 (Hugging Face Spaces default)
29
  EXPOSE 7860
30
 
31
- # Command to run the application
32
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
  # Use official Python runtime as a parent image
2
  FROM python:3.10-slim
3
 
 
4
  WORKDIR /app
5
 
6
  # Install system dependencies (Tesseract)
 
9
  libtesseract-dev \
10
  && rm -rf /var/lib/apt/lists/*
11
 
 
12
  COPY requirements.txt .
 
 
13
  RUN pip install --no-cache-dir -r requirements.txt
14
 
 
15
  COPY . .
16
 
 
17
  RUN useradd -m -u 1000 user
18
  USER user
19
  ENV HOME=/home/user \
20
  PATH=/home/user/.local/bin:$PATH
21
 
 
22
  EXPOSE 7860
23
 
 
24
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,18 +1,14 @@
1
- import os
2
  import shutil
3
  import tempfile
4
- import json
5
  from fastapi import FastAPI, File, UploadFile, HTTPException
6
- from fastapi.responses import JSONResponse, HTMLResponse
7
  from fastapi.staticfiles import StaticFiles
8
  from main import analyze_pdf
9
 
10
  app = FastAPI()
11
 
12
- # Mount static files to serve index.html
13
- # We assume index.html is in the same directory
14
- app.mount("/static", StaticFiles(directory="."), name="static")
15
-
16
  @app.get("/", response_class=HTMLResponse)
17
  async def read_root():
18
  with open("index.html", "r") as f:
@@ -23,45 +19,53 @@ async def analyze_endpoint(file: UploadFile = File(...)):
23
  if not file.filename.endswith(".pdf"):
24
  raise HTTPException(status_code=400, detail="File must be a PDF")
25
 
26
- # Save uploaded file to a temp location
27
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
28
- shutil.copyfileobj(file.file, tmp)
29
- tmp_path = tmp.name
 
 
 
 
 
 
 
 
30
 
31
  try:
32
- # Create a temp debug dir
33
- debug_dir = tempfile.mkdtemp()
34
-
35
- # Get API Key from environment (injected by Space secrets)
36
- api_key = os.getenv("OPENROUTER_API_KEY")
37
- if not api_key:
38
- raise HTTPException(status_code=500, detail="Server misconfigured: OPENROUTER_API_KEY missing")
39
-
40
- # Run analysis using the refactored main logic
41
- # We pass None for output_path so it doesn't try to write to a fixed file unless we want it to
42
- # But analyze_pdf writes to output_path if provided. We can just let it return the dict.
43
  result = analyze_pdf(
44
- pdf_path=tmp_path,
45
- output_path="", # Don't write to file, just return dict
46
- debug_dir=debug_dir,
47
- openrouter_api_key=api_key
48
  )
49
-
50
  return JSONResponse(content=result)
51
-
52
  except Exception as e:
53
  import traceback
54
  traceback.print_exc()
55
  raise HTTPException(status_code=500, detail=str(e))
56
- finally:
57
- # Cleanup
58
- if os.path.exists(tmp_path):
59
- os.remove(tmp_path)
60
- # We might want to keep debug dir for a bit or clean it up.
61
- # For a simple demo, we can clean it up or ignore it (tmp cleans up eventually on restart usually, but explicitly is better)
62
- if os.path.exists(debug_dir):
63
- shutil.rmtree(debug_dir, ignore_errors=True)
64
 
65
- if __name__ == "__main__":
66
- import uvicorn
67
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import shutil
2
  import tempfile
3
+ import os
4
  from fastapi import FastAPI, File, UploadFile, HTTPException
5
+ from fastapi.responses import HTMLResponse, JSONResponse, Response
6
  from fastapi.staticfiles import StaticFiles
7
  from main import analyze_pdf
8
 
9
  app = FastAPI()
10
 
11
+ # serve index.html at root
 
 
 
12
  @app.get("/", response_class=HTMLResponse)
13
  async def read_root():
14
  with open("index.html", "r") as f:
 
19
  if not file.filename.endswith(".pdf"):
20
  raise HTTPException(status_code=400, detail="File must be a PDF")
21
 
22
+ # Save to a known fixed path for the viewing endpoint
23
+ # Note: This is not thread-safe/multi-user safe, but sufficient for this local demo.
24
+ fixed_path = "latest_upload.pdf"
25
+ with open(fixed_path, "wb") as f:
26
+ # seek back to start if we copied it (but we didn't read it yet)
27
+ # Actually file is an UploadFile, we can just save it.
28
+ # But we already copied to tmp. Let's just use the tmp copy logic but to fixed path.
29
+ pass
30
+
31
+ # Actually, let's just write directly to fixed_path!
32
+ with open(fixed_path, "wb") as f:
33
+ shutil.copyfileobj(file.file, f)
34
 
35
  try:
 
 
 
 
 
 
 
 
 
 
 
36
  result = analyze_pdf(
37
+ pdf_path=fixed_path,
38
+ output_path="",
39
+ debug_dir=""
 
40
  )
 
41
  return JSONResponse(content=result)
 
42
  except Exception as e:
43
  import traceback
44
  traceback.print_exc()
45
  raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
46
 
47
import fitz

@app.get("/pdf/page/{page_num}")
async def get_pdf_page(page_num: int):
    """Render one page of the most recently uploaded PDF as a PNG.

    Args:
        page_num: 1-indexed page number requested by the frontend viewer.

    Returns:
        A PNG image ``Response``; 404 if nothing has been uploaded yet or
        the page number is out of range; 500 on any rendering error.
    """
    path = "latest_upload.pdf"
    if not os.path.exists(path):
        return Response(status_code=404)

    doc = None
    try:
        doc = fitz.open(path)
        if page_num < 1 or page_num > doc.page_count:
            return Response(status_code=404)

        page = doc.load_page(page_num - 1)
        # 150 DPI gives a decent resolution for web viewing
        # (PDF user space is 72 points per inch).
        zoom = 150 / 72.0
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat, alpha=False)
        img_bytes = pix.tobytes("png")
        return Response(content=img_bytes, media_type="image/png")
    except Exception as e:
        print(f"Error serving page: {e}")
        return Response(status_code=500)
    finally:
        # BUG FIX: the original only closed `doc` on the success and
        # in-range-404 paths, leaking the document handle whenever an
        # exception fired after fitz.open(). Always release it here.
        if doc is not None:
            doc.close()
config.py CHANGED
@@ -1,8 +1,10 @@
1
  from __future__ import annotations
 
2
  from dataclasses import dataclass
3
  import os
4
  from dotenv import load_dotenv
5
 
 
6
  @dataclass(frozen=True)
7
  class Settings:
8
  openrouter_api_key: str
@@ -13,16 +15,22 @@ class Settings:
13
  min_text_chars_for_digital: int
14
  topk_per_statement: int
15
 
 
 
 
 
 
16
  DEFAULT_FREE_VISION_MODELS = [
17
- # Free + vision-capable (as of their OpenRouter pages)
18
  "google/gemma-3-12b-it:free",
19
  "nvidia/nemotron-nano-12b-v2-vl:free",
20
  "amazon/nova-2-lite-v1:free",
21
  ]
22
 
 
23
  def load_settings(**kwargs) -> Settings:
24
  load_dotenv()
25
-
26
  api_key = kwargs.get("openrouter_api_key") or os.getenv("OPENROUTER_API_KEY", "").strip()
27
  if not api_key:
28
  raise RuntimeError("Missing OPENROUTER_API_KEY in environment/.env")
@@ -31,9 +39,18 @@ def load_settings(**kwargs) -> Settings:
31
  max_images = kwargs.get("max_images") or int(os.getenv("MAX_IMAGES", "12"))
32
  dpi = kwargs.get("dpi") or int(os.getenv("PDF_RENDER_DPI", "200"))
33
  ocr_lang = kwargs.get("ocr_lang") or os.getenv("OCR_LANG", "eng")
34
- min_text_chars_for_digital = kwargs.get("min_text_chars_for_digital") or int(os.getenv("MIN_TEXT_CHARS_FOR_DIGITAL", "80"))
 
 
35
  topk_per_statement = kwargs.get("topk_per_statement") or int(os.getenv("TOPK_PER_STATEMENT", "3"))
36
 
 
 
 
 
 
 
 
37
  return Settings(
38
  openrouter_api_key=api_key,
39
  openrouter_model=model,
@@ -42,4 +59,6 @@ def load_settings(**kwargs) -> Settings:
42
  ocr_lang=ocr_lang,
43
  min_text_chars_for_digital=min_text_chars_for_digital,
44
  topk_per_statement=topk_per_statement,
 
 
45
  )
 
1
  from __future__ import annotations
2
+
3
  from dataclasses import dataclass
4
  import os
5
  from dotenv import load_dotenv
6
 
7
+
8
  @dataclass(frozen=True)
9
  class Settings:
10
  openrouter_api_key: str
 
15
  min_text_chars_for_digital: int
16
  topk_per_statement: int
17
 
18
+ # block logic knobs
19
+ max_blocks_per_statement: int
20
+ continuation_max_forward: int
21
+
22
+
23
  DEFAULT_FREE_VISION_MODELS = [
24
+ # Free + vision-capable (as of their OpenRouter pages / availability changes over time)
25
  "google/gemma-3-12b-it:free",
26
  "nvidia/nemotron-nano-12b-v2-vl:free",
27
  "amazon/nova-2-lite-v1:free",
28
  ]
29
 
30
+
31
  def load_settings(**kwargs) -> Settings:
32
  load_dotenv()
33
+
34
  api_key = kwargs.get("openrouter_api_key") or os.getenv("OPENROUTER_API_KEY", "").strip()
35
  if not api_key:
36
  raise RuntimeError("Missing OPENROUTER_API_KEY in environment/.env")
 
39
  max_images = kwargs.get("max_images") or int(os.getenv("MAX_IMAGES", "12"))
40
  dpi = kwargs.get("dpi") or int(os.getenv("PDF_RENDER_DPI", "200"))
41
  ocr_lang = kwargs.get("ocr_lang") or os.getenv("OCR_LANG", "eng")
42
+ min_text_chars_for_digital = kwargs.get("min_text_chars_for_digital") or int(
43
+ os.getenv("MIN_TEXT_CHARS_FOR_DIGITAL", "80")
44
+ )
45
  topk_per_statement = kwargs.get("topk_per_statement") or int(os.getenv("TOPK_PER_STATEMENT", "3"))
46
 
47
+ max_blocks_per_statement = kwargs.get("max_blocks_per_statement") or int(
48
+ os.getenv("MAX_BLOCKS_PER_STATEMENT", "2")
49
+ )
50
+ continuation_max_forward = kwargs.get("continuation_max_forward") or int(
51
+ os.getenv("CONTINUATION_MAX_FORWARD", "6")
52
+ )
53
+
54
  return Settings(
55
  openrouter_api_key=api_key,
56
  openrouter_model=model,
 
59
  ocr_lang=ocr_lang,
60
  min_text_chars_for_digital=min_text_chars_for_digital,
61
  topk_per_statement=topk_per_statement,
62
+ max_blocks_per_statement=max_blocks_per_statement,
63
+ continuation_max_forward=continuation_max_forward,
64
  )
image_server_snippet.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import Response
import fitz
import io

# We will need to inject this into app.py
def serve_page_image(pdf_path: str, page_num: int, dpi: int = 150):
    """Render page ``page_num`` (1-indexed) of ``pdf_path`` to a PNG Response.

    Args:
        pdf_path: Path of the PDF file on disk.
        page_num: 1-indexed page number to render.
        dpi: Target render resolution (PDF user space is 72 points/inch).

    Returns:
        A PNG image ``Response``; 404 when the page number is out of range;
        500 on any open/render failure.
    """
    doc = None
    try:
        doc = fitz.open(pdf_path)
        if page_num < 1 or page_num > doc.page_count:
            # BUG FIX: the original returned here without closing `doc`,
            # leaking the document handle on every out-of-range request.
            return Response(status_code=404)

        page = doc.load_page(page_num - 1)
        zoom = dpi / 72.0
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat, alpha=False)
        img_bytes = pix.tobytes("png")
        return Response(content=img_bytes, media_type="image/png")
    except Exception as e:
        print(f"Error serving page: {e}")
        return Response(status_code=500)
    finally:
        # Always release the document, even on exceptions or early returns.
        if doc is not None:
            doc.close()
index.html CHANGED
@@ -1,29 +1,88 @@
1
  <!DOCTYPE html>
2
  <html lang="en">
 
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
  <title>Financial Report Analyzer</title>
7
  <style>
8
- body { font-family: sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; }
9
- .container { border: 1px solid #ccc; padding: 20px; border-radius: 8px; background: #f9f9f9; }
10
- h1 { text-align: center; color: #333; }
11
- .form-group { margin-bottom: 20px; text-align: center; }
12
- input[type="file"] { margin: 10px 0; }
13
- button { background-color: #007bff; color: white; border: none; padding: 10px 20px; border-radius: 4px; cursor: pointer; font-size: 16px; }
14
- button:hover { background-color: #0056b3; }
15
- button:disabled { background-color: #ccc; cursor: not-allowed; }
16
- #status { text-align: center; margin-top: 10px; font-weight: bold; }
17
- #result { margin-top: 20px; white-space: pre-wrap; background: #fff; padding: 15px; border: 1px solid #ddd; border-radius: 4px; display: none; }
18
- .error { color: #dc3545; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  </style>
20
  </head>
 
21
  <body>
22
 
23
  <div class="container">
24
  <h1>Financial Report Analyzer</h1>
25
- <p style="text-align: center;">Upload a 10-K/Annual Report PDF to extract page ranges for primary financial statements.</p>
26
-
 
27
  <div class="form-group">
28
  <input type="file" id="pdfInput" accept=".pdf" />
29
  <br>
@@ -52,7 +111,7 @@
52
  status.textContent = "Analyzing... This may take a minute.";
53
  status.className = "";
54
  resultDisplay.style.display = 'none';
55
- resultDisplay.textContent = "";
56
 
57
  const formData = new FormData();
58
  formData.append('file', file);
@@ -69,10 +128,10 @@
69
  }
70
 
71
  const data = await response.json();
72
- delete data.debug;
73
- delete data.notes;
74
  status.textContent = "Analysis Complete!";
75
- resultDisplay.textContent = JSON.stringify(data, null, 2);
 
 
76
  resultDisplay.style.display = 'block';
77
 
78
  } catch (error) {
@@ -83,6 +142,82 @@
83
  btn.disabled = false;
84
  }
85
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  </script>
87
  </body>
88
- </html>
 
 
1
  <!DOCTYPE html>
2
  <html lang="en">
3
+
4
  <head>
5
  <meta charset="UTF-8">
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
  <title>Financial Report Analyzer</title>
8
  <style>
9
+ body {
10
+ font-family: sans-serif;
11
+ max-width: 800px;
12
+ margin: 0 auto;
13
+ padding: 20px;
14
+ line-height: 1.6;
15
+ }
16
+
17
+ .container {
18
+ border: 1px solid #ccc;
19
+ padding: 20px;
20
+ border-radius: 8px;
21
+ background: #f9f9f9;
22
+ }
23
+
24
+ h1 {
25
+ text-align: center;
26
+ color: #333;
27
+ }
28
+
29
+ .form-group {
30
+ margin-bottom: 20px;
31
+ text-align: center;
32
+ }
33
+
34
+ input[type="file"] {
35
+ margin: 10px 0;
36
+ }
37
+
38
+ button {
39
+ background-color: #007bff;
40
+ color: white;
41
+ border: none;
42
+ padding: 10px 20px;
43
+ border-radius: 4px;
44
+ cursor: pointer;
45
+ font-size: 16px;
46
+ }
47
+
48
+ button:hover {
49
+ background-color: #0056b3;
50
+ }
51
+
52
+ button:disabled {
53
+ background-color: #ccc;
54
+ cursor: not-allowed;
55
+ }
56
+
57
+ #status {
58
+ text-align: center;
59
+ margin-top: 10px;
60
+ font-weight: bold;
61
+ }
62
+
63
+ #result {
64
+ margin-top: 20px;
65
+ white-space: pre-wrap;
66
+ background: #fff;
67
+ padding: 15px;
68
+ border: 1px solid #ddd;
69
+ border-radius: 4px;
70
+ display: none;
71
+ }
72
+
73
+ .error {
74
+ color: #dc3545;
75
+ }
76
  </style>
77
  </head>
78
+
79
  <body>
80
 
81
  <div class="container">
82
  <h1>Financial Report Analyzer</h1>
83
+ <p style="text-align: center;">Upload a 10-K/Annual Report PDF to extract page ranges for primary financial
84
+ statements.</p>
85
+
86
  <div class="form-group">
87
  <input type="file" id="pdfInput" accept=".pdf" />
88
  <br>
 
111
  status.textContent = "Analyzing... This may take a minute.";
112
  status.className = "";
113
  resultDisplay.style.display = 'none';
114
+ resultDisplay.innerHTML = ""; // Clear previous content
115
 
116
  const formData = new FormData();
117
  formData.append('file', file);
 
128
  }
129
 
130
  const data = await response.json();
 
 
131
  status.textContent = "Analysis Complete!";
132
+
133
+ // Render nicely
134
+ renderResults(data, resultDisplay);
135
  resultDisplay.style.display = 'block';
136
 
137
  } catch (error) {
 
142
  btn.disabled = false;
143
  }
144
  }
145
+
146
+ function renderResults(data, container) {
147
+ let html = "";
148
+
149
+ const sections = [
150
+ { key: 'balance_sheet', label: 'Balance Sheet' },
151
+ { key: 'profit_and_loss', label: 'Profit & Loss' },
152
+ { key: 'cash_flow', label: 'Cash Flow' }
153
+ ];
154
+
155
+ sections.forEach(sec => {
156
+ html += `<h3>${sec.label}</h3>`;
157
+ const items = data[sec.key];
158
+ if (!items || items.length === 0) {
159
+ html += "<p>No ranges found.</p>";
160
+ } else {
161
+ html += `
162
+ <div style="overflow-x: auto;">
163
+ <table border="1" cellpadding="8" style="border-collapse: collapse; width: 100%; margin-bottom: 20px;">
164
+ <tr style="background: #eee;">
165
+ <th>Scope</th>
166
+ <th>Pages</th>
167
+ <th>Details</th>
168
+ <th style="min-width: 300px;">Evidence Images</th>
169
+ </tr>`;
170
+
171
+ items.forEach(item => {
172
+ const pagesStr = (item.pages || []).join(", ");
173
+
174
+ // Generate images for all pages in the range
175
+ let imagesHtml = '<div style="display: flex; gap: 10px; overflow-x: auto;">';
176
+ const pagesToShow = item.evidence_pages && item.evidence_pages.length > 0
177
+ ? item.evidence_pages
178
+ : (item.pages || []);
179
+
180
+ pagesToShow.forEach(pNum => {
181
+ imagesHtml += `
182
+ <div style="text-align: center;">
183
+ <a href="/pdf/page/${pNum}" target="_blank">
184
+ <img src="/pdf/page/${pNum}" style="height: 200px; border: 1px solid #ddd; box-shadow: 2px 2px 5px rgba(0,0,0,0.1);" alt="Page ${pNum}" loading="lazy"><br>
185
+ <small>Page ${pNum}</small>
186
+ </a>
187
+ </div>`;
188
+ });
189
+ imagesHtml += '</div>';
190
+
191
+ html += `
192
+ <tr>
193
+ <td><strong>${item.scope}</strong></td>
194
+ <td>${pagesStr}</td>
195
+ <td>
196
+ <strong>Title:</strong> ${item.title || "<em>(null)</em>"}<br>
197
+ <strong>Confidence:</strong> ${(item.confidence * 100).toFixed(0)}%
198
+ </td>
199
+ <td>${imagesHtml}</td>
200
+ </tr>`;
201
+ });
202
+ html += "</table></div>";
203
+ }
204
+ });
205
+
206
+ // Notes
207
+ if (data.notes && data.notes.length > 0) {
208
+ html += "<h3>Notes</h3><ul>";
209
+ data.notes.forEach(note => {
210
+ html += `<li>${note}</li>`;
211
+ });
212
+ html += "</ul>";
213
+ }
214
+
215
+ // Raw JSON toggle (optional)
216
+ html += `<hr><details><summary>Raw JSON</summary><pre>${JSON.stringify(data, null, 2)}</pre></details>`;
217
+
218
+ container.innerHTML = html;
219
+ }
220
  </script>
221
  </body>
222
+
223
+ </html>
main.py CHANGED
@@ -1,7 +1,10 @@
1
  from __future__ import annotations
 
2
  import argparse
3
  import json
 
4
  import time
 
5
 
6
  from config import load_settings, DEFAULT_FREE_VISION_MODELS
7
  from pdf_io import extract_texts_from_pdf, render_pages_to_png_bytes
@@ -17,279 +20,366 @@ from openrouter_client import (
17
 
18
 
19
  PROMPT_TEMPLATE = """
20
- You are given:
21
- 1) OCR/extracted text for a set of PDF pages from a company's financial report (10-K/annual report)
22
- 2) Images of the same pages
23
 
24
  Task:
25
- Identify the PDF PAGE RANGES (start_page, end_page) for the THREE PRIMARY FINANCIAL STATEMENT TABLES ONLY:
26
- - Balance Sheet (a.k.a. Statement of Financial Position / Consolidated Balance Sheet / Standalone Balance Sheet)
27
- - Profit & Loss (a.k.a. Income Statement / Statements of Earnings / Statements of Operations)
28
- - Cash Flow Statement (Statements of Cash Flows)
29
-
30
- IMPORTANT RULES (STRICT):
31
- - Only return ranges for the PRIMARY consolidated & standalone financial statements pages.
32
- - Do NOT return ranges for note disclosures (e.g., derivatives, leases, fair value tables), MD&A, segment notes, or narrative discussion.
33
- - A primary statement table page usually has:
34
- (a) a clear statement title at the top (e.g., “Consolidated Balance Sheets”, "Standalone Balance Sheets")
35
- (b) many numeric columns (often multiple years)
36
- (c) canonical line items like:
37
- Balance sheet: “Total assets”, “Total liabilities”, “Total equity/stockholders’ equity”
38
- P&L: “Net revenues/sales”, “Cost of sales”, “Operating income”, “Net earnings/income”, “Earnings per share”
39
- Cash flow: “Cash flows from operating/investing/financing activities”, “Net cash provided by”, “Cash and cash equivalents at end”
40
- - If a statement continues onto the next page, include that continuation page in the range.
41
-
42
- Pages provided (OCR snippets):
43
  {page_snippets}
44
 
45
- Output JSON ONLY in this schema (no extra keys, no markdown):
 
46
  {{
47
- "balance_sheet": {{"start_page": int, "end_page": int, "confidence": float, "title": str}},
48
- "profit_and_loss": {{"start_page": int, "end_page": int, "confidence": float, "title": str}},
49
- "cash_flow": {{"start_page": int, "end_page": int, "confidence": float, "title": str}}
 
 
 
 
 
 
 
 
 
 
 
50
  }}
51
 
52
- Remember: PDF page numbers are 1-based in your output.
53
- """
 
 
 
 
 
 
 
 
 
54
 
55
- SCHEMA_HINT = """{
56
- "balance_sheet": {"start_page": "int|null", "end_page": "int|null", "confidence": "number", "evidence_pages": "int[]", "title": "string|null"},
57
- "profit_and_loss": {"start_page": "int|null", "end_page": "int|null", "confidence": "number", "evidence_pages": "int[]", "title": "string|null"},
58
- "cash_flow": {"start_page": "int|null", "end_page": "int|null", "confidence": "number", "evidence_pages": "int[]", "title": "string|null"},
59
- "notes": "string[]"
60
- }"""
61
 
 
 
 
 
 
 
 
 
62
 
63
- def log(msg: str):
64
- ts = time.strftime("%H:%M:%S")
65
- print(f"[{ts}] {msg}", flush=True)
66
 
 
 
 
67
 
68
- def build_page_snippets(page_texts, selected_pages):
69
- chunks = []
70
- for p in selected_pages:
71
- pt = page_texts[p]
72
- txt = (pt.extracted_text or "") + "\n" + (pt.ocr_text or "")
73
- txt = " ".join(txt.strip().split())
74
- if len(txt) > 900:
75
- txt = txt[:900] + "..."
76
- chunks.append(f"- Page {p+1}: {txt}")
77
- return "\n".join(chunks)
 
 
 
 
 
 
 
78
 
79
 
80
  def validate_ranges(result: dict, page_count: int) -> dict:
81
- def clamp(v):
82
- if v is None:
83
- return None
84
- if not isinstance(v, int):
85
- return None
86
- if v < 1 or v > page_count:
87
  return None
88
- return v
89
 
90
- for k in ["balance_sheet", "profit_and_loss", "cash_flow"]:
91
- obj = result.get(k, {})
 
 
 
 
 
92
  if not isinstance(obj, dict):
93
- result[k] = {"start_page": None, "end_page": None, "confidence": 0.0, "evidence_pages": [], "title": None}
94
- continue
 
 
 
 
 
 
 
95
 
96
- sp = clamp(obj.get("start_page"))
97
- ep = clamp(obj.get("end_page"))
98
  if sp is not None and ep is not None and ep < sp:
99
- sp, ep = None, None
100
-
101
- obj["start_page"] = sp
102
- obj["end_page"] = ep
103
- if "confidence" not in obj or not isinstance(obj["confidence"], (int, float)):
104
- obj["confidence"] = 0.0
105
- if "evidence_pages" not in obj or not isinstance(obj["evidence_pages"], list):
106
- obj["evidence_pages"] = []
107
- if "title" not in obj:
108
- obj["title"] = None
109
- result[k] = obj
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  if "notes" not in result or not isinstance(result["notes"], list):
112
  result["notes"] = []
 
 
 
113
  return result
114
 
115
 
116
- def analyze_pdf(
117
- pdf_path: str,
118
- output_path: str = "ranges.json",
119
- debug_dir: str = "debug",
120
- openrouter_api_key: str = None
121
- ) -> dict:
122
  """
123
- Analyzes a PDF to find financial statement page ranges.
124
- Returns the result dict.
125
  """
126
- settings_kwargs = {}
127
- if openrouter_api_key:
128
- settings_kwargs["openrouter_api_key"] = openrouter_api_key
129
-
130
- st = load_settings(**settings_kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
- log(f"Loading PDF: {pdf_path}")
 
 
 
 
 
 
 
 
 
 
133
  page_texts, page_count = extract_texts_from_pdf(
134
  pdf_path=pdf_path,
135
- dpi=st.dpi,
136
- ocr_lang=st.ocr_lang,
137
- min_text_chars_for_digital=st.min_text_chars_for_digital,
138
  )
 
139
 
140
- ocr_pages = sum(1 for p in page_texts if p.used_ocr)
141
- log(f"Pages: {page_count} | OCR used on {ocr_pages} pages")
142
-
143
- candidates, cand_debug = build_candidate_lists(page_texts, top_k=30, debug=True)
144
- log("TOC/Index debug:")
145
- log(f" item8_toc_page = {cand_debug.get('item8_toc_page')}")
146
- log(f" toc_internal = {cand_debug.get('toc_internal')}")
147
- log(f" toc_pdf_all = {cand_debug.get('toc_pdf_targets_all')}")
148
- log(f" heuristic_ranges_0_based = {cand_debug.get('heuristic_ranges_0_based')}")
149
 
150
- selected_pages = select_pages_for_llm(
 
151
  candidates=candidates,
152
- debug_info=cand_debug,
153
  page_count=page_count,
154
- max_images=st.max_images
 
155
  )
156
- log(f"Selected pages to render/send (1-indexed): {[p+1 for p in selected_pages]}")
157
 
158
- log(f"Rendering {len(selected_pages)} pages to images (dpi={st.dpi})...")
159
- page_png_map = render_pages_to_png_bytes(pdf_path, selected_pages, dpi=st.dpi)
160
- log("Image rendering done.")
161
 
162
- if st.openrouter_model:
163
- model = st.openrouter_model
164
- log(f"Using model from env: {model}")
165
- else:
166
- model = choose_free_vision_model(st.openrouter_api_key, preferred=DEFAULT_FREE_VISION_MODELS)
167
- log(f"Auto-selected free vision model: {model}")
168
-
169
- snippets = build_page_snippets(page_texts, selected_pages)
170
- prompt = PROMPT_TEMPLATE.format(page_snippets=snippets)
171
-
172
- # --- LLM call with progressive image backoff ---
173
- pages_sent = list(selected_pages)
174
- llm_res = None
175
- while pages_sent:
176
- images = [page_png_map[p] for p in pages_sent]
177
- msg = make_user_message_with_images(prompt, images)
178
-
179
- log(f"Calling OpenRouter (images={len(images)})...")
180
- llm_res = chat_completion(
181
- api_key=st.openrouter_api_key,
182
- model=model,
183
- messages=[msg],
184
- max_tokens=4096,
185
- temperature=0.0,
186
- require_json=True,
187
- )
188
-
189
- log(f"finish_reason={llm_res.finish_reason} native={llm_res.native_finish_reason} content_len={len(llm_res.content)}")
190
-
191
- # save raw response for debugging
192
- try:
193
- import os
194
- os.makedirs(debug_dir, exist_ok=True)
195
- with open(f"{debug_dir}/openrouter_raw_response.json", "w", encoding="utf-8") as f:
196
- json.dump(llm_res.raw, f, indent=2)
197
- except Exception:
198
- pass
199
-
200
- if llm_res.finish_reason == "error" or ("error" in llm_res.raw and llm_res.raw["error"]):
201
- log("OpenRouter returned an error payload (see debug/openrouter_raw_response.json). Backing off images...")
202
- elif llm_res.content.strip():
203
- break
204
-
205
- if len(pages_sent) <= 3:
206
- break
207
- pages_sent = pages_sent[:-2]
208
- log(f"Retrying with fewer images. Now sending pages: {[p+1 for p in pages_sent]}")
209
-
210
- if not llm_res:
211
- raise RuntimeError("LLM call never executed.")
212
-
213
- raw_text = (llm_res.content or "").strip()
214
- log("DEBUG: raw model output (first 1200 chars):")
215
- print(raw_text[:1200], flush=True)
216
-
217
- # --- Parse JSON with repair fallback ---
218
  try:
219
- result = robust_json_loads(raw_text)
220
- log("Parsed JSON successfully.")
221
  except Exception as e:
222
- log(f"JSON parse failed: {e}")
223
- # Save raw text
224
- try:
225
- import os
226
- os.makedirs(debug_dir, exist_ok=True)
227
- with open(f"{debug_dir}/llm_raw_output.txt", "w", encoding="utf-8") as f:
228
- f.write(raw_text)
229
- except Exception:
230
- pass
231
-
232
- # Repair pass with free-tier text model
233
- repair_model = choose_any_free_text_model(st.openrouter_api_key, preferred=[
234
- model, # try same model first
235
- "google/gemma-3-12b-it:free",
236
- "amazon/nova-2-lite-v1:free",
237
- "nvidia/nemotron-nano-12b-v2-vl:free",
238
- ])
239
- log(f"Attempting JSON repair using: {repair_model}")
240
- try:
241
- result = repair_to_json(
242
- api_key=st.openrouter_api_key,
243
- model=repair_model,
244
- bad_output=raw_text if raw_text else json.dumps(llm_res.raw),
245
- schema_hint=SCHEMA_HINT,
246
- )
247
- log("Repair JSON succeeded.")
248
- except Exception as e2:
249
- log(f"Repair JSON failed: {e2}")
250
- # Final safe fallback
251
- result = {
252
- "balance_sheet": {"start_page": None, "end_page": None, "confidence": 0.0, "evidence_pages": [], "title": None},
253
- "profit_and_loss": {"start_page": None, "end_page": None, "confidence": 0.0, "evidence_pages": [], "title": None},
254
- "cash_flow": {"start_page": None, "end_page": None, "confidence": 0.0, "evidence_pages": [], "title": None},
255
- "notes": [
256
- "Model output could not be parsed as JSON.",
257
- "Check debug/openrouter_raw_response.json and debug/llm_raw_output.txt",
258
- ],
259
- }
260
 
261
- result = validate_ranges(result, page_count=page_count)
 
 
 
262
  result["debug"] = {
263
- "model_used": model,
264
- "pages_sent": [p + 1 for p in pages_sent],
265
- "candidate_pages": candidates,
266
- "finish_reason": llm_res.finish_reason,
267
- "native_finish_reason": llm_res.native_finish_reason,
268
  }
269
 
270
  if output_path:
271
  with open(output_path, "w", encoding="utf-8") as f:
272
  json.dump(result, f, indent=2)
273
- log(f"Saved output: {output_path}")
274
 
275
  return result
276
 
277
 
278
  def main():
279
  ap = argparse.ArgumentParser()
280
- ap.add_argument("--pdf", required=True, help="Path to financial report PDF")
281
- ap.add_argument("--out", default="ranges.json", help="Output JSON path")
282
- ap.add_argument("--debug_dir", default="debug", help="Folder to write debug artifacts")
283
  args = ap.parse_args()
284
 
285
- # Call the core logic
286
- result = analyze_pdf(
287
- pdf_path=args.pdf,
288
- output_path=args.out,
289
- debug_dir=args.debug_dir
290
- )
291
-
292
- # Print result to stdout for CLI use
293
  print(json.dumps(result, indent=2), flush=True)
294
 
295
 
 
1
  from __future__ import annotations
2
+
3
  import argparse
4
  import json
5
+ import os
6
  import time
7
+ from typing import Any, Dict, List
8
 
9
  from config import load_settings, DEFAULT_FREE_VISION_MODELS
10
  from pdf_io import extract_texts_from_pdf, render_pages_to_png_bytes
 
20
 
21
 
22
  PROMPT_TEMPLATE = """
23
+ You are an expert financial-report analyst.
 
 
24
 
25
  Task:
26
+ Given (a) OCR/native text snippets for certain pages and (b) images of those pages,
27
+ identify page ranges that contain ONLY the three PRIMARY financial statements tables:
28
+ 1) Balance Sheet / Statement of Financial Position
29
+ 2) Profit & Loss / Income / Earnings / Operations
30
+ 3) Cash Flow Statement
31
+
32
+ Important:
33
+ - Many annual reports include BOTH consolidated and standalone statements.
34
+ - You MUST return blocks for BOTH if present.
35
+ - If a statement spans multiple pages, include ALL pages in that block.
36
+ - A continuation page may not repeat the full title; use table structure + line-items.
37
+
38
+ Heuristic candidate blocks (for reference only; you must verify from images+snippets):
39
+ {heuristic_blocks}
40
+
41
+ Pages provided (OCR/native snippets):
 
 
42
  {page_snippets}
43
 
44
+ Return STRICT JSON ONLY (no markdown, no commentary).
45
+ Schema (IMPORTANT: each statement is a LIST of blocks):
46
  {{
47
+ "balance_sheet": [
48
+ {{
49
+ "scope": "consolidated|standalone|unknown",
50
+ "start_page": <1-indexed int>,
51
+ "end_page": <1-indexed int>,
52
+ "pages": [<1-indexed ints>],
53
+ "confidence": <0..1>,
54
+ "title": "<string or null>",
55
+ "evidence_pages": [<1-indexed ints>]
56
+ }}
57
+ ],
58
+ "profit_and_loss": [ ... same block schema ... ],
59
+ "cash_flow": [ ... same block schema ... ],
60
+ "notes": [ "<optional strings>" ]
61
  }}
62
 
63
+ Rules:
64
+ - "pages" must list ALL pages in the block (even if it's one page).
65
+ - start_page = min(pages), end_page = max(pages).
66
+ - If a statement is NOT present, return an empty list for it.
67
+ """.strip()
68
+
69
+
70
+ def _combined_for_snippet(p) -> str:
71
+ a = getattr(p, "extracted_text", "") or ""
72
+ b = getattr(p, "ocr_text", "") or ""
73
+ return (a + "\n" + b).strip()
74
 
 
 
 
 
 
 
75
 
76
+ def build_page_snippets(page_texts: List[Any], selected_pages_0: List[int], max_chars_per_page: int = 1400) -> str:
77
+ parts = []
78
+ for p0 in selected_pages_0:
79
+ pt = page_texts[p0]
80
+ txt = _combined_for_snippet(pt)
81
+ txt = txt[:max_chars_per_page]
82
+ parts.append(f"--- Page {p0+1} ---\n{txt}\n")
83
+ return "\n".join(parts).strip()
84
 
 
 
 
85
 
86
+ def format_heuristic_blocks(heuristic_blocks_0_based: dict, max_per_stmt: int = 6) -> str:
87
+ if not isinstance(heuristic_blocks_0_based, dict):
88
+ return "(none)"
89
 
90
+ lines = []
91
+ for stmt in ["balance_sheet", "profit_and_loss", "cash_flow"]:
92
+ bl = heuristic_blocks_0_based.get(stmt) or []
93
+ if not isinstance(bl, list) or not bl:
94
+ lines.append(f"- {stmt}: (none)")
95
+ continue
96
+
97
+ bl_sorted = sorted(bl, key=lambda b: float(b.get("score") or 0.0), reverse=True)[:max_per_stmt]
98
+ parts = []
99
+ for b in bl_sorted:
100
+ s = int(b.get("start")) + 1
101
+ e = int(b.get("end")) + 1
102
+ scope = (b.get("scope") or "unknown")
103
+ title = b.get("title")
104
+ parts.append(f"{scope}: {s}-{e}" + (f" ({title})" if title else ""))
105
+ lines.append(f"- {stmt}: " + "; ".join(parts))
106
+ return "\n".join(lines)
107
 
108
 
109
  def validate_ranges(result: dict, page_count: int) -> dict:
110
+ """
111
+ Normalize model output into list-of-blocks schema.
112
+ Ensures every block has pages list; fixes start/end from pages.
113
+ """
114
+ def clamp_int(v):
115
+ if v is None or not isinstance(v, int):
116
  return None
117
+ return v if 1 <= v <= page_count else None
118
 
119
+ def normalize_pages(pages_val):
120
+ if not isinstance(pages_val, list):
121
+ return []
122
+ out = [x for x in pages_val if isinstance(x, int) and 1 <= x <= page_count]
123
+ return sorted(set(out))
124
+
125
+ def norm_block(obj) -> dict:
126
  if not isinstance(obj, dict):
127
+ obj = {}
128
+
129
+ sp = clamp_int(obj.get("start_page"))
130
+ ep = clamp_int(obj.get("end_page"))
131
+ pages = normalize_pages(obj.get("pages"))
132
+
133
+ if pages and (sp is None or ep is None):
134
+ sp = min(pages)
135
+ ep = max(pages)
136
 
 
 
137
  if sp is not None and ep is not None and ep < sp:
138
+ sp, ep, pages = None, None, []
139
+
140
+ if not pages and sp is not None and ep is not None:
141
+ pages = list(range(sp, ep + 1))
142
+
143
+ scope = obj.get("scope")
144
+ if not isinstance(scope, str):
145
+ scope = "unknown"
146
+ scope = scope.lower().strip()
147
+ if scope not in {"consolidated", "standalone", "unknown"}:
148
+ scope = "unknown"
149
+
150
+ conf = obj.get("confidence")
151
+ conf = float(conf) if isinstance(conf, (int, float)) else 0.0
152
+ conf = max(0.0, min(1.0, conf))
153
+
154
+ evidence = obj.get("evidence_pages")
155
+ if not isinstance(evidence, list):
156
+ evidence = []
157
+ evidence = [x for x in evidence if isinstance(x, int) and 1 <= x <= page_count]
158
+
159
+ title = obj.get("title")
160
+ if title is not None and not isinstance(title, str):
161
+ title = None
162
+
163
+ # ALWAYS keep pages list even if single page
164
+ if sp is None or ep is None:
165
+ return {
166
+ "start_page": None,
167
+ "end_page": None,
168
+ "pages": [],
169
+ "scope": scope,
170
+ "confidence": conf,
171
+ "title": title,
172
+ "evidence_pages": evidence,
173
+ }
174
+
175
+ return {
176
+ "start_page": sp,
177
+ "end_page": ep,
178
+ "pages": pages,
179
+ "scope": scope,
180
+ "confidence": conf,
181
+ "title": title,
182
+ "evidence_pages": evidence if evidence else ([sp] if sp else []),
183
+ }
184
+
185
+ for k in ["balance_sheet", "profit_and_loss", "cash_flow"]:
186
+ val = result.get(k)
187
+ if isinstance(val, dict):
188
+ val = [val]
189
+ if not isinstance(val, list):
190
+ val = []
191
+ result[k] = [norm_block(x) for x in val]
192
 
193
  if "notes" not in result or not isinstance(result["notes"], list):
194
  result["notes"] = []
195
+ else:
196
+ result["notes"] = [x for x in result["notes"] if isinstance(x, str)]
197
+
198
  return result
199
 
200
 
201
+ def merge_with_heuristics(result: dict, heuristic_blocks_0_based: dict, page_count: int) -> dict:
 
 
 
 
 
202
  """
203
+ Add missing consolidated/standalone blocks if LLM returned only one.
204
+ Also expands single-page LLM blocks if heuristics show a longer block with same start+scope.
205
  """
206
+ if not isinstance(heuristic_blocks_0_based, dict):
207
+ return result
208
+
209
+ def overlap(a, b):
210
+ return not (a[1] < b[0] or b[1] < a[0])
211
+
212
+ for stmt in ["balance_sheet", "profit_and_loss", "cash_flow"]:
213
+ llm_blocks = result.get(stmt) or []
214
+ if not isinstance(llm_blocks, list):
215
+ llm_blocks = []
216
+
217
+ hb = heuristic_blocks_0_based.get(stmt) or []
218
+ heur_blocks = []
219
+ if isinstance(hb, list):
220
+ for b in hb:
221
+ try:
222
+ s = int(b.get("start")) + 1
223
+ e = int(b.get("end")) + 1
224
+ except Exception:
225
+ continue
226
+ if not (1 <= s <= page_count and 1 <= e <= page_count and e >= s):
227
+ continue
228
+ heur_blocks.append(
229
+ {
230
+ "start_page": s,
231
+ "end_page": e,
232
+ "pages": list(range(s, e + 1)),
233
+ "scope": (b.get("scope") or "unknown"),
234
+ "confidence": 0.35,
235
+ "title": b.get("title"),
236
+ "evidence_pages": [s],
237
+ }
238
+ )
239
+
240
+ # expand single-page blocks using heuristics
241
+ for lb in llm_blocks:
242
+ if not isinstance(lb, dict):
243
+ continue
244
+ sp = lb.get("start_page")
245
+ ep = lb.get("end_page")
246
+ scope = (lb.get("scope") or "unknown")
247
+ if sp is None or ep is None:
248
+ continue
249
+ if sp == ep:
250
+ for hb2 in heur_blocks:
251
+ if hb2["scope"] == scope and hb2["start_page"] == sp and hb2["end_page"] > ep:
252
+ lb["end_page"] = hb2["end_page"]
253
+ lb["pages"] = hb2["pages"]
254
+ break
255
+
256
+ present_ranges = [
257
+ (b.get("start_page"), b.get("end_page"))
258
+ for b in llm_blocks
259
+ if isinstance(b, dict) and b.get("start_page") and b.get("end_page")
260
+ ]
261
+ present_scopes = {(b.get("scope") or "unknown") for b in llm_blocks if isinstance(b, dict)}
262
+
263
+ # add missing scope blocks (common: consolidated + standalone)
264
+ for hb2 in heur_blocks:
265
+ if hb2["scope"] in present_scopes and len(heur_blocks) > 1:
266
+ continue
267
+ r = (hb2["start_page"], hb2["end_page"])
268
+ if any(overlap(r, (ps, pe)) for (ps, pe) in present_ranges if ps and pe):
269
+ continue
270
+ llm_blocks.append(hb2)
271
+ present_scopes.add(hb2["scope"])
272
+ present_ranges.append(r)
273
+
274
+ llm_blocks = [b for b in llm_blocks if isinstance(b, dict)]
275
+ llm_blocks.sort(key=lambda b: (b.get("start_page") or 10**9, b.get("end_page") or 10**9))
276
+ result[stmt] = llm_blocks
277
+
278
+ return result
279
 
280
+
281
+ def analyze_pdf(
282
+ pdf_path: str,
283
+ output_path: str = "",
284
+ debug_dir: str = "",
285
+ openrouter_api_key: str | None = None,
286
+ ) -> Dict[str, Any]:
287
+ settings = load_settings(openrouter_api_key=openrouter_api_key or os.getenv("OPENROUTER_API_KEY", "").strip())
288
+
289
+ t0 = time.time()
290
+ print(f"[1/6] Extracting text/OCR from PDF: {pdf_path}", flush=True)
291
  page_texts, page_count = extract_texts_from_pdf(
292
  pdf_path=pdf_path,
293
+ dpi=settings.dpi,
294
+ ocr_lang=settings.ocr_lang,
295
+ min_text_chars_for_digital=settings.min_text_chars_for_digital,
296
  )
297
+ print(f" -> pages: {page_count} (t={time.time()-t0:.1f}s)", flush=True)
298
 
299
+ print(f"[2/6] Building statement candidates + heuristic blocks...", flush=True)
300
+ candidates, debug_info = build_candidate_lists(
301
+ pages=page_texts,
302
+ page_count=page_count,
303
+ topk_per_statement=settings.topk_per_statement,
304
+ continuation_max_forward=settings.continuation_max_forward,
305
+ debug=True,
306
+ )
 
307
 
308
+ print("[3/6] Selecting pages to send to LLM (images)...", flush=True)
309
+ selected_pages_0 = select_pages_for_llm(
310
  candidates=candidates,
311
+ debug_info=debug_info,
312
  page_count=page_count,
313
+ max_images=settings.max_images,
314
+ max_blocks_per_statement=settings.max_blocks_per_statement,
315
  )
316
+ print(f" -> selected {len(selected_pages_0)} pages: {[p+1 for p in selected_pages_0]}", flush=True)
317
 
318
+ print("[4/6] Rendering selected pages to PNG bytes...", flush=True)
319
+ images = render_pages_to_png_bytes(pdf_path, selected_pages_0, dpi=settings.dpi)
 
320
 
321
+ heuristic_blocks_str = format_heuristic_blocks(debug_info.get("heuristic_blocks_0_based") or {})
322
+ snippets = build_page_snippets(page_texts, selected_pages_0)
323
+
324
+ prompt = PROMPT_TEMPLATE.format(
325
+ heuristic_blocks=heuristic_blocks_str,
326
+ page_snippets=snippets,
327
+ )
328
+
329
+ # Choose model
330
+ model = settings.openrouter_model
331
+ if not model:
332
+ print("[5/6] Selecting a free vision model from OpenRouter...", flush=True)
333
+ model = choose_free_vision_model(settings.openrouter_api_key, DEFAULT_FREE_VISION_MODELS)
334
+ print(f"[5/6] Calling OpenRouter model: {model}", flush=True)
335
+
336
+ messages = [
337
+ # {"role": "system", "content": "Return STRICT JSON only."},
338
+ make_user_message_with_images(prompt, images),
339
+ ]
340
+
341
+ raw = chat_completion(settings.openrouter_api_key, model=model, messages=messages, temperature=0.0, max_tokens=1400)
342
+ raw_text = (raw.content or "").strip()
343
+
344
+ print("[6/6] Parsing model output...", flush=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  try:
346
+ parsed = robust_json_loads(raw_text)
 
347
  except Exception as e:
348
+ print(" -> JSON parse failed, attempting repair:", str(e), flush=True)
349
+ text_model = choose_any_free_text_model(settings.openrouter_api_key)
350
+ fixed = repair_to_json(settings.openrouter_api_key, raw_text, model=text_model)
351
+ parsed = robust_json_loads(fixed)
352
+
353
+ if not isinstance(parsed, dict):
354
+ parsed = {"balance_sheet": [], "profit_and_loss": [], "cash_flow": [], "notes": []}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
 
356
+ parsed = validate_ranges(parsed, page_count=page_count)
357
+ parsed = merge_with_heuristics(parsed, debug_info.get("heuristic_blocks_0_based") or {}, page_count=page_count)
358
+
359
+ result: Dict[str, Any] = dict(parsed)
360
  result["debug"] = {
361
+ "selected_pages_1_based": [p + 1 for p in selected_pages_0],
362
+ "candidates_top": debug_info.get("top_scoring", {}),
363
+ "heuristic_blocks_0_based": debug_info.get("heuristic_blocks_0_based", {}),
364
+ "item8_toc_page_1_based": (debug_info.get("item8_toc_page") + 1) if debug_info.get("item8_toc_page") is not None else None,
 
365
  }
366
 
367
  if output_path:
368
  with open(output_path, "w", encoding="utf-8") as f:
369
  json.dump(result, f, indent=2)
370
+ print(f"Saved output -> {output_path}", flush=True)
371
 
372
  return result
373
 
374
 
375
  def main():
376
  ap = argparse.ArgumentParser()
377
+ ap.add_argument("--pdf", required=True, help="Path to input PDF")
378
+ ap.add_argument("--out", required=False, default="", help="Path to output JSON file")
379
+ ap.add_argument("--debug_dir", required=False, default="", help="Directory to store debug artifacts (optional)")
380
  args = ap.parse_args()
381
 
382
+ result = analyze_pdf(pdf_path=args.pdf, output_path=args.out, debug_dir=args.debug_dir)
 
 
 
 
 
 
 
383
  print(json.dumps(result, indent=2), flush=True)
384
 
385
 
openrouter_client.py CHANGED
@@ -1,4 +1,5 @@
1
  from __future__ import annotations
 
2
  import base64
3
  import json
4
  import re
@@ -7,6 +8,7 @@ from typing import Any, Dict, List, Optional, Tuple
7
 
8
  import requests
9
 
 
10
  OPENROUTER_CHAT_URL = "https://openrouter.ai/api/v1/chat/completions"
11
  OPENROUTER_MODELS_URL = "https://openrouter.ai/api/v1/models"
12
 
@@ -14,8 +16,8 @@ OPENROUTER_MODELS_URL = "https://openrouter.ai/api/v1/models"
14
  @dataclass
15
  class ChatResult:
16
  content: str
17
- finish_reason: str | None
18
- native_finish_reason: str | None
19
  tool_calls: Any
20
  raw: dict
21
 
@@ -27,62 +29,39 @@ def list_models(api_key: str) -> dict:
27
  return r.json()
28
 
29
 
30
- def choose_free_vision_model(api_key: str, preferred: list[str]) -> str:
31
  models = list_models(api_key).get("data", [])
32
- by_id = {m.get("id"): m for m in models}
33
-
34
- def is_free(m: dict) -> bool:
35
- pricing = m.get("pricing") or {}
36
- try:
37
- return float(pricing.get("prompt", "1")) == 0.0 and float(pricing.get("completion", "1")) == 0.0
38
- except Exception:
39
- return False
40
 
41
- def is_vision(m: dict) -> bool:
42
- arch = (m.get("architecture") or {})
43
- in_mods = set(arch.get("input_modalities") or [])
44
- return "image" in in_mods
45
-
46
- # Preferred first
47
- for mid in preferred:
48
- m = by_id.get(mid)
49
- if m and is_free(m) and is_vision(m):
50
- return mid
51
-
52
- # Any free vision
53
  for m in models:
54
- if is_free(m) and is_vision(m):
55
- return m.get("id")
 
 
 
 
 
 
 
56
 
57
- raise RuntimeError("Could not find any free vision-capable model in /models.")
58
 
59
 
60
- def choose_any_free_text_model(api_key: str, preferred: list[str] | None = None) -> str:
61
  models = list_models(api_key).get("data", [])
62
- by_id = {m.get("id"): m for m in models}
63
-
64
- def is_free(m: dict) -> bool:
65
- pricing = m.get("pricing") or {}
66
- try:
67
- return float(pricing.get("prompt", "1")) == 0.0 and float(pricing.get("completion", "1")) == 0.0
68
- except Exception:
69
- return False
70
-
71
- def is_text_input(m: dict) -> bool:
72
- arch = (m.get("architecture") or {})
73
- in_mods = set(arch.get("input_modalities") or [])
74
- return "text" in in_mods
75
-
76
- if preferred:
77
- for mid in preferred:
78
- m = by_id.get(mid)
79
- if m and is_free(m) and is_text_input(m):
80
- return mid
81
-
82
  for m in models:
83
- if is_free(m) and is_text_input(m):
84
- return m.get("id")
85
-
 
 
 
 
86
  raise RuntimeError("Could not find any free text-capable model in /models.")
87
 
88
 
@@ -91,19 +70,16 @@ def _img_bytes_to_data_url(png_bytes: bytes) -> str:
91
  return f"data:image/png;base64,{b64}"
92
 
93
 
94
- def make_user_message_with_images(prompt_text: str, images: list[bytes]) -> dict:
95
  """
96
- OpenRouter follows OpenAI chat schema; some SDK examples show imageUrl (camelCase).
97
- We include both keys for maximum compatibility.
98
  """
99
- content: list[dict] = [{"type": "text", "text": prompt_text}]
100
- for im in images:
101
- url = _img_bytes_to_data_url(im)
102
  content.append(
103
  {
104
  "type": "image_url",
105
- "image_url": {"url": url}, # OpenAI-style
106
- "imageUrl": {"url": url}, # SDK-style
107
  }
108
  )
109
  return {"role": "user", "content": content}
@@ -112,145 +88,101 @@ def make_user_message_with_images(prompt_text: str, images: list[bytes]) -> dict
112
  def chat_completion(
113
  api_key: str,
114
  model: str,
115
- messages: list[dict],
116
- max_tokens: int = 2000,
117
  temperature: float = 0.0,
118
- require_json: bool = True,
119
- extra: dict | None = None,
120
  ) -> ChatResult:
121
  headers = {
122
  "Authorization": f"Bearer {api_key}",
123
  "Content-Type": "application/json",
124
- "HTTP-Referer": "http://localhost",
125
- "X-Title": "fin-statement-page-locator",
126
  }
127
-
128
- payload: dict[str, Any] = {
129
  "model": model,
130
  "messages": messages,
131
  "temperature": temperature,
132
  "max_tokens": max_tokens,
133
- # Force no tool calls even if provider supports them
134
- "tool_choice": "none",
135
  }
136
-
137
- if require_json:
138
- # OpenRouter supports response_format json_object (JSON mode)
139
- payload["response_format"] = {"type": "json_object"}
140
-
141
- if extra:
142
- payload.update(extra)
143
-
144
  r = requests.post(OPENROUTER_CHAT_URL, headers=headers, json=payload, timeout=180)
 
 
145
  r.raise_for_status()
146
  data = r.json()
147
 
148
- # OpenRouter can return errors at top-level even with HTTP 200 in some scenarios
149
- if isinstance(data, dict) and "error" in data and data["error"]:
150
- # keep raw for debugging
151
- return ChatResult(
152
- content="",
153
- finish_reason="error",
154
- native_finish_reason=None,
155
- tool_calls=None,
156
- raw=data,
157
- )
158
-
159
- choice0 = (data.get("choices") or [{}])[0]
160
- msg = choice0.get("message") or {}
161
-
162
- content = (msg.get("content") or "").strip()
163
- tool_calls = msg.get("tool_calls") or msg.get("toolCalls")
164
 
165
  return ChatResult(
166
- content=content,
167
- finish_reason=choice0.get("finish_reason"),
168
- native_finish_reason=choice0.get("native_finish_reason"),
169
  tool_calls=tool_calls,
170
  raw=data,
171
  )
172
 
173
 
174
- def _extract_json_from_codeblock(s: str) -> str | None:
175
- # ```json ... ```
176
- m = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", s, flags=re.IGNORECASE)
177
- if m:
178
- return m.group(1).strip()
179
- return None
180
 
181
 
182
- def _extract_first_balanced_object(s: str) -> str | None:
183
  """
184
- Extract the first balanced {...} JSON object from arbitrary text.
185
  """
186
- start = s.find("{")
187
- if start == -1:
188
- return None
189
-
190
- depth = 0
191
- for i in range(start, len(s)):
192
- ch = s[i]
193
- if ch == "{":
194
- depth += 1
195
- elif ch == "}":
196
- depth -= 1
197
- if depth == 0:
198
- return s[start : i + 1]
199
- return None
200
-
201
-
202
- def robust_json_loads(s: str) -> dict:
203
- s = (s or "").strip()
204
- if not s:
205
- raise ValueError("Empty model content (no JSON to parse).")
206
-
207
- # 1) direct parse
208
  try:
209
- return json.loads(s)
210
  except Exception:
211
  pass
212
 
213
- # 2) codeblock
214
- cb = _extract_json_from_codeblock(s)
215
- if cb:
 
216
  try:
217
- return json.loads(cb)
218
  except Exception:
219
  pass
220
 
221
- # 3) balanced object
222
- obj = _extract_first_balanced_object(s)
223
- if obj:
224
- return json.loads(obj)
 
 
 
 
225
 
226
- raise ValueError("Could not parse JSON from model output (no valid JSON object found).")
227
 
228
 
229
- def repair_to_json(
230
- api_key: str,
231
- model: str,
232
- bad_output: str,
233
- schema_hint: str,
234
- ) -> dict:
235
  """
236
- Ask a free model to convert arbitrary text into valid JSON for our schema.
237
  """
238
- repair_prompt = f"""Convert the following content into VALID JSON ONLY.
239
- No markdown, no backticks, no explanations.
240
-
241
- Schema (must match keys/types):
242
- {schema_hint}
 
243
 
244
- Content to convert:
245
- {bad_output}
246
- """
247
- msg = {"role": "user", "content": repair_prompt}
248
  res = chat_completion(
249
  api_key=api_key,
250
  model=model,
251
- messages=[msg],
252
- max_tokens=900,
 
 
253
  temperature=0.0,
254
- require_json=True,
255
  )
256
- return robust_json_loads(res.content)
 
1
  from __future__ import annotations
2
+
3
  import base64
4
  import json
5
  import re
 
8
 
9
  import requests
10
 
11
+
12
  OPENROUTER_CHAT_URL = "https://openrouter.ai/api/v1/chat/completions"
13
  OPENROUTER_MODELS_URL = "https://openrouter.ai/api/v1/models"
14
 
 
16
  @dataclass
17
  class ChatResult:
18
  content: str
19
+ model: str
20
+ native_finish_reason: Optional[str]
21
  tool_calls: Any
22
  raw: dict
23
 
 
29
  return r.json()
30
 
31
 
32
+ def choose_free_vision_model(api_key: str, preferred: List[str]) -> str:
33
  models = list_models(api_key).get("data", [])
34
+ # try preferred first
35
+ available = {m.get("id") for m in models if isinstance(m, dict)}
36
+ for p in preferred:
37
+ if p in available:
38
+ return p
 
 
 
39
 
40
+ # fallback: any model with ":free" + some vision hint in the metadata
 
 
 
 
 
 
 
 
 
 
 
41
  for m in models:
42
+ if not isinstance(m, dict):
43
+ continue
44
+ mid = m.get("id", "")
45
+ if ":free" not in mid:
46
+ continue
47
+ # crude heuristic: many vision models have "vl" or "vision" somewhere
48
+ text = json.dumps(m).lower()
49
+ if ("vision" in text) or ("image" in text) or ("vl" in mid.lower()):
50
+ return mid
51
 
52
+ raise RuntimeError("Could not find any free vision-capable model in /models. Set OPENROUTER_MODEL explicitly.")
53
 
54
 
55
+ def choose_any_free_text_model(api_key: str) -> str:
56
  models = list_models(api_key).get("data", [])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  for m in models:
58
+ if not isinstance(m, dict):
59
+ continue
60
+ mid = m.get("id", "")
61
+ if ":free" not in mid:
62
+ continue
63
+ # exclude known vision-only ids if any; otherwise allow
64
+ return mid
65
  raise RuntimeError("Could not find any free text-capable model in /models.")
66
 
67
 
 
70
  return f"data:image/png;base64,{b64}"
71
 
72
 
73
+ def make_user_message_with_images(prompt_text: str, images: List[bytes]) -> dict:
74
  """
75
+ OpenRouter follows OpenAI chat schema. Use 'image_url' (snake) which is supported by OpenAI-style APIs.
 
76
  """
77
+ content: List[dict] = [{"type": "text", "text": prompt_text}]
78
+ for b in images:
 
79
  content.append(
80
  {
81
  "type": "image_url",
82
+ "image_url": {"url": _img_bytes_to_data_url(b)},
 
83
  }
84
  )
85
  return {"role": "user", "content": content}
 
88
  def chat_completion(
89
  api_key: str,
90
  model: str,
91
+ messages: List[dict],
 
92
  temperature: float = 0.0,
93
+ max_tokens: int = 1200,
 
94
  ) -> ChatResult:
95
  headers = {
96
  "Authorization": f"Bearer {api_key}",
97
  "Content-Type": "application/json",
 
 
98
  }
99
+ payload = {
 
100
  "model": model,
101
  "messages": messages,
102
  "temperature": temperature,
103
  "max_tokens": max_tokens,
 
 
104
  }
 
 
 
 
 
 
 
 
105
  r = requests.post(OPENROUTER_CHAT_URL, headers=headers, json=payload, timeout=180)
106
+ if r.status_code != 200:
107
+ print(f"API Error {r.status_code}: {r.text}", flush=True)
108
  r.raise_for_status()
109
  data = r.json()
110
 
111
+ # OpenAI-like response
112
+ choice = (data.get("choices") or [{}])[0]
113
+ msg = choice.get("message") or {}
114
+ content = msg.get("content") or ""
115
+ tool_calls = msg.get("tool_calls")
116
+ finish = choice.get("finish_reason")
 
 
 
 
 
 
 
 
 
 
117
 
118
  return ChatResult(
119
+ content=content if isinstance(content, str) else json.dumps(content),
120
+ model=data.get("model") or model,
121
+ native_finish_reason=finish,
122
  tool_calls=tool_calls,
123
  raw=data,
124
  )
125
 
126
 
127
+ _JSON_OBJ_RE = re.compile(r"\{.*\}", re.DOTALL)
128
+ _JSON_ARR_RE = re.compile(r"\[.*\]", re.DOTALL)
 
 
 
 
129
 
130
 
131
+ def robust_json_loads(text: str) -> Any:
132
  """
133
+ Extract the first valid JSON object/array from a messy LLM output.
134
  """
135
+ if not text:
136
+ raise ValueError("Empty model output.")
137
+
138
+ t = text.strip()
139
+
140
+ # direct try
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  try:
142
+ return json.loads(t)
143
  except Exception:
144
  pass
145
 
146
+ # try find object
147
+ m = _JSON_OBJ_RE.search(t)
148
+ if m:
149
+ cand = m.group(0)
150
  try:
151
+ return json.loads(cand)
152
  except Exception:
153
  pass
154
 
155
+ # try find array
156
+ m = _JSON_ARR_RE.search(t)
157
+ if m:
158
+ cand = m.group(0)
159
+ try:
160
+ return json.loads(cand)
161
+ except Exception:
162
+ pass
163
 
164
+ raise ValueError("Could not parse JSON from model output.")
165
 
166
 
167
+ def repair_to_json(api_key: str, bad_text: str, model: str) -> str:
 
 
 
 
 
168
  """
169
+ Uses a free text model to rewrite messy output into strict JSON only.
170
  """
171
+ sys = (
172
+ "You are a strict JSON formatter. "
173
+ "Return ONLY valid JSON. No markdown, no commentary. "
174
+ "Preserve keys/values if possible."
175
+ )
176
+ user = f"Convert this into valid JSON ONLY:\n\n{bad_text}"
177
 
 
 
 
 
178
  res = chat_completion(
179
  api_key=api_key,
180
  model=model,
181
+ messages=[
182
+ {"role": "system", "content": sys},
183
+ {"role": "user", "content": user},
184
+ ],
185
  temperature=0.0,
186
+ max_tokens=1200,
187
  )
188
+ return res.content.strip()
pdf_io.py CHANGED
@@ -1,10 +1,15 @@
1
  from __future__ import annotations
 
2
  from dataclasses import dataclass
3
- from typing import List, Optional, Tuple
 
4
  import fitz # PyMuPDF
5
  from PIL import Image
6
  import io
7
 
 
 
 
8
  @dataclass
9
  class PageText:
10
  page_index: int # 0-based
@@ -12,38 +17,39 @@ class PageText:
12
  ocr_text: str
13
  used_ocr: bool
14
 
 
15
  def _safe_text(s: str) -> str:
16
  return (s or "").replace("\x00", " ").strip()
17
 
18
- def render_page_to_pil(doc: fitz.Document, page_index: int, dpi: int) -> Image.Image:
 
 
 
 
 
 
 
 
 
19
  page = doc.load_page(page_index)
20
  zoom = dpi / 72.0
21
  mat = fitz.Matrix(zoom, zoom)
22
  pix = page.get_pixmap(matrix=mat, alpha=False)
23
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
24
  return img
25
 
 
26
  def ocr_pil_image(img: Image.Image, lang: str = "eng") -> str:
27
- try:
28
- import pytesseract
29
- except Exception as e:
30
- raise RuntimeError(
31
- "pytesseract not available. Install pytesseract and system Tesseract OCR."
32
- ) from e
33
-
34
- # psm 6: assume a block of text (good for tables + headings)
35
- txt = pytesseract.image_to_string(img, lang=lang, config="--psm 6")
36
  return _safe_text(txt)
37
 
38
- def is_likely_scanned(extracted_text: str, min_chars: int) -> bool:
39
- # If the page has almost no selectable text, it’s probably scanned.
40
- return len(_safe_text(extracted_text)) < min_chars
41
 
42
  def extract_texts_from_pdf(
43
  pdf_path: str,
44
- dpi: int,
45
- ocr_lang: str,
46
- min_text_chars_for_digital: int,
47
  ) -> Tuple[List[PageText], int]:
48
  doc = fitz.open(pdf_path)
49
  page_count = doc.page_count
@@ -63,13 +69,14 @@ def extract_texts_from_pdf(
63
  doc.close()
64
  return results, page_count
65
 
66
- def render_pages_to_png_bytes(pdf_path: str, page_indices: List[int], dpi: int) -> dict[int, bytes]:
 
67
  doc = fitz.open(pdf_path)
68
- out: dict[int, bytes] = {}
69
  for p in page_indices:
70
  img = render_page_to_pil(doc, p, dpi=dpi)
71
  buf = io.BytesIO()
72
  img.save(buf, format="PNG")
73
- out[p] = buf.getvalue()
74
  doc.close()
75
  return out
 
1
  from __future__ import annotations
2
+
3
  from dataclasses import dataclass
4
+ from typing import List, Tuple
5
+
6
  import fitz # PyMuPDF
7
  from PIL import Image
8
  import io
9
 
10
+ import pytesseract
11
+
12
+
13
  @dataclass
14
  class PageText:
15
  page_index: int # 0-based
 
17
  ocr_text: str
18
  used_ocr: bool
19
 
20
+
21
  def _safe_text(s: str) -> str:
22
  return (s or "").replace("\x00", " ").strip()
23
 
24
+
25
+ def is_likely_scanned(extracted_text: str, min_text_chars_for_digital: int) -> bool:
26
+ """
27
+ Simple heuristic: if the native extracted text is too short, likely scanned.
28
+ """
29
+ t = _safe_text(extracted_text)
30
+ return len(t) < min_text_chars_for_digital
31
+
32
+
33
+ def render_page_to_pil(doc: fitz.Document, page_index: int, dpi: int = 200) -> Image.Image:
34
  page = doc.load_page(page_index)
35
  zoom = dpi / 72.0
36
  mat = fitz.Matrix(zoom, zoom)
37
  pix = page.get_pixmap(matrix=mat, alpha=False)
38
+ img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
39
  return img
40
 
41
+
42
  def ocr_pil_image(img: Image.Image, lang: str = "eng") -> str:
43
+ # You can also add config like "--psm 6" if needed.
44
+ txt = pytesseract.image_to_string(img, lang=lang)
 
 
 
 
 
 
 
45
  return _safe_text(txt)
46
 
 
 
 
47
 
48
  def extract_texts_from_pdf(
49
  pdf_path: str,
50
+ dpi: int = 200,
51
+ ocr_lang: str = "eng",
52
+ min_text_chars_for_digital: int = 80,
53
  ) -> Tuple[List[PageText], int]:
54
  doc = fitz.open(pdf_path)
55
  page_count = doc.page_count
 
69
  doc.close()
70
  return results, page_count
71
 
72
+
73
+ def render_pages_to_png_bytes(pdf_path: str, page_indices: List[int], dpi: int = 200) -> List[bytes]:
74
  doc = fitz.open(pdf_path)
75
+ out: List[bytes] = []
76
  for p in page_indices:
77
  img = render_page_to_pil(doc, p, dpi=dpi)
78
  buf = io.BytesIO()
79
  img.save(buf, format="PNG")
80
+ out.append(buf.getvalue())
81
  doc.close()
82
  return out
requirements.txt CHANGED
@@ -5,4 +5,4 @@ pymupdf
5
  pillow
6
  requests
7
  python-dotenv
8
- pytesseract
 
5
  pillow
6
  requests
7
  python-dotenv
8
+ pytesseract
statement_candidates.py CHANGED
@@ -1,22 +1,19 @@
1
- # statement_candidates.py
2
  from __future__ import annotations
3
 
4
  import re
5
- from dataclasses import dataclass
6
- from typing import Any, Dict, List, Optional, Sequence, Tuple
7
  import difflib
 
8
 
9
 
10
  # =========================
11
- # Targets (you want ONLY these 3)
12
  # =========================
13
  TARGETS = ["balance_sheet", "profit_and_loss", "cash_flow"]
 
14
 
15
- # Auxiliary statements used ONLY for delimiting ranges (helpful in 10-K order)
16
- AUX = ["comprehensive_income", "equity", "notes"]
17
 
18
  # =========================
19
- # Title variants (based on your screenshots + common 10-K phrasing)
20
  # =========================
21
  TITLE_VARIANTS: Dict[str, List[str]] = {
22
  "balance_sheet": [
@@ -24,9 +21,10 @@ TITLE_VARIANTS: Dict[str, List[str]] = {
24
  "Standalone Balance Sheets",
25
  "Balance Sheets",
26
  "Statement of Financial Position",
 
27
  ],
28
  "profit_and_loss": [
29
- "Consolidated Statements of Earnings", # AbbVie screenshot
30
  "Standalone Statements of Earnings",
31
  "Consolidated Statements of Operations",
32
  "Standalone Statements of Operations",
@@ -34,6 +32,7 @@ TITLE_VARIANTS: Dict[str, List[str]] = {
34
  "Standalone Statements of Income",
35
  "Income Statement",
36
  "Statement of Profit and Loss",
 
37
  ],
38
  "cash_flow": [
39
  "Consolidated Statements of Cash Flows",
@@ -41,7 +40,7 @@ TITLE_VARIANTS: Dict[str, List[str]] = {
41
  "Statement of Cash Flows",
42
  "Cash Flow Statement",
43
  ],
44
- # auxiliary
45
  "comprehensive_income": [
46
  "Consolidated Statements of Comprehensive Income",
47
  "Standalone Statements of Comprehensive Income",
@@ -60,12 +59,8 @@ TITLE_VARIANTS: Dict[str, List[str]] = {
60
  ],
61
  }
62
 
63
- # Footer phrase (exact idea from your images)
64
  INTEGRAL_FOOTER = "the accompanying notes are an integral part"
65
 
66
- # =========================
67
- # Signature table line-items (increase precision against note tables)
68
- # =========================
69
  SIG_TERMS: Dict[str, List[str]] = {
70
  "balance_sheet": [
71
  "total assets",
@@ -73,22 +68,24 @@ SIG_TERMS: Dict[str, List[str]] = {
73
  "total equity",
74
  "stockholders' equity",
75
  "shareholders' equity",
76
- "assets",
77
  "liabilities and equity",
78
  "current assets",
79
  "current liabilities",
 
 
80
  ],
81
  "profit_and_loss": [
82
  "net revenues",
83
  "net sales",
84
  "revenue",
85
- "cost of products sold",
86
  "cost of sales",
 
87
  "gross profit",
88
  "operating income",
89
- "operating earnings",
90
- "net earnings",
91
  "net income",
 
92
  "earnings per share",
93
  "basic",
94
  "diluted",
@@ -101,122 +98,133 @@ SIG_TERMS: Dict[str, List[str]] = {
101
  "net cash used in investing activities",
102
  "net cash used in financing activities",
103
  "cash and cash equivalents, end of year",
104
- "cash and equivalents, end of year",
105
  "net change in cash",
106
  ],
107
- # aux
108
- "notes": ["note 1", "note 2", "notes to consolidated financial statements", "notes to standalone financial statements"],
109
  }
110
 
111
  NOTE_HEADING_RE = re.compile(r"^\s*note\s+\d+\b", re.IGNORECASE)
112
-
113
- # Typical TOC “dot leaders”
114
  DOT_LEADER_RE = re.compile(r"\.{5,}")
115
-
116
- # Item 8 TOC trigger
117
- ITEM8_RE = re.compile(r"\bITEM\s+8\.\s+FINANCIAL\s+STATEMENTS\s+AND\s+SUPPLEMENTARY\s+DATA\b", re.IGNORECASE)
 
118
 
119
 
120
  # =========================
121
- # Page object -> combined text
122
  # =========================
123
  def _combined_text(page_obj: Any) -> str:
124
- """
125
- Works with your PageText dataclass:
126
- extracted_text + ocr_text
127
- Also supports dict/object string fallback.
128
- """
129
  if page_obj is None:
130
  return ""
131
  if isinstance(page_obj, str):
132
  return page_obj
133
-
134
- # dict-like
135
  if isinstance(page_obj, dict):
136
  a = page_obj.get("extracted_text") or page_obj.get("text") or ""
137
  b = page_obj.get("ocr_text") or ""
138
  return (a + "\n" + b).strip()
139
-
140
- # attribute style
141
  a = getattr(page_obj, "extracted_text", None) or getattr(page_obj, "text", None) or ""
142
  b = getattr(page_obj, "ocr_text", None) or ""
143
  return (a + "\n" + b).strip()
144
 
145
 
146
- def _page_index(page_obj: Any, fallback: int) -> int:
147
- if isinstance(page_obj, dict):
148
- if isinstance(page_obj.get("page_index"), int):
149
- return int(page_obj["page_index"])
150
- v = getattr(page_obj, "page_index", None)
151
- return int(v) if isinstance(v, int) else fallback
152
-
153
-
154
  def _norm(s: str) -> str:
155
  return re.sub(r"\s+", " ", (s or "")).strip().lower()
156
 
157
 
158
- # =========================
159
- # Fuzzy title detection (OCR typos tolerant)
160
- # =========================
161
  def _fuzzy_line_contains_title(top_lines: List[str], title: str, threshold: float = 0.86) -> bool:
162
  title_n = _norm(title)
163
  for ln in top_lines:
164
  ln_n = _norm(ln)
165
  if not ln_n:
166
  continue
167
- # direct contains
168
  if title_n in ln_n:
169
  return True
170
- # fuzzy ratio
171
  r = difflib.SequenceMatcher(None, ln_n, title_n).ratio()
172
  if r >= threshold:
173
  return True
174
  return False
175
 
176
 
177
- def detect_title(text: str, stmt: str) -> bool:
 
 
 
 
178
  lines = (text or "").splitlines()
179
- top_lines = [ln.strip() for ln in lines[:14] if ln.strip()] # titles live here in your screenshots
 
180
  for variant in TITLE_VARIANTS.get(stmt, []):
181
  if _fuzzy_line_contains_title(top_lines, variant):
182
- return True
183
- return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
 
186
  # =========================
187
- # Footer internal page number extraction (10-K style)
188
  # =========================
189
  FOOTER_PIPE_RE = re.compile(r"\|\s*(\d{1,4})\s*$", re.MULTILINE)
190
  FOOTER_FORM_RE = re.compile(r"form\s+10-?k\s*\|\s*(\d{1,4})\s*$", re.IGNORECASE | re.MULTILINE)
191
 
 
192
  def extract_footer_internal_page(text: str) -> Optional[int]:
193
  t = text or ""
194
-
195
  m = FOOTER_PIPE_RE.findall(t)
196
  if m:
197
  return int(m[-1])
198
-
199
  m = FOOTER_FORM_RE.findall(t)
200
  if m:
201
  return int(m[-1])
202
-
203
- # fallback: last few non-empty lines that are ONLY digits (avoid table numbers)
204
  lines = [ln.strip() for ln in (t.splitlines() if t else []) if ln.strip()]
205
  for ln in reversed(lines[-6:]):
206
  if re.fullmatch(r"\d{1,4}", ln):
207
  return int(ln)
208
-
209
  return None
210
 
211
 
212
- # =========================
213
- # Item 8 TOC page detection + TOC parsing
214
- # AbbVie TOC is "title line" then next line has page number ("55")
215
- # =========================
216
  def find_item8_toc_page(all_texts: Sequence[str]) -> Optional[int]:
217
- """
218
- Choose the Item 8 page that LOOKS like an index/TOC (has dot leaders or 'Page').
219
- """
220
  candidates = []
221
  for i, txt in enumerate(all_texts):
222
  if not ITEM8_RE.search(txt or ""):
@@ -225,21 +233,18 @@ def find_item8_toc_page(all_texts: Sequence[str]) -> Optional[int]:
225
  tocish = ("page" in low) and (DOT_LEADER_RE.search(txt or "") is not None)
226
  if tocish:
227
  candidates.append(i)
228
-
229
  return candidates[0] if candidates else None
230
 
231
 
232
  def parse_statement_index_numbers(toc_text: str) -> Dict[str, int]:
233
  """
234
- Returns internal page numbers from the index.
235
- Handles:
236
- - same line "Consolidated Balance Sheets .... 57"
237
- - two-line "Consolidated Balance Sheets" newline "57" (AbbVie)
238
  """
239
  lines = [ln.strip() for ln in (toc_text or "").splitlines()]
240
  out: Dict[str, int] = {}
241
 
242
- # compile quick patterns
243
  pats = {
244
  "profit_and_loss": re.compile(r"(consolidated|standalone)\s+statements?\s+of\s+(earnings|operations|income)", re.I),
245
  "comprehensive_income": re.compile(r"(consolidated|standalone)\s+statements?\s+of\s+comprehensive\s+income", re.I),
@@ -253,82 +258,72 @@ def parse_statement_index_numbers(toc_text: str) -> Dict[str, int]:
253
  if not ln:
254
  continue
255
 
256
- for key, pat in pats.items():
257
- if not pat.search(ln):
 
258
  continue
259
 
260
- # case 1: number on same line at end
 
 
 
 
 
261
  m = re.findall(r"(\d{1,4})\s*$", ln)
262
  if m and ln.endswith(m[-1]):
263
- out[key] = int(m[-1])
264
  continue
265
 
266
- # case 2: number on next non-empty line
267
  j = i + 1
268
  while j < len(lines) and not lines[j]:
269
  j += 1
270
  if j < len(lines) and re.fullmatch(r"\d{1,4}", lines[j]):
271
- out[key] = int(lines[j])
272
 
273
  return out
274
 
275
 
276
  def build_internal_to_pdf_map(all_texts: Sequence[str]) -> Dict[int, int]:
277
- """
278
- internal_page_number -> pdf_page_index
279
- """
280
  mapping: Dict[int, int] = {}
281
  for pdf_i, txt in enumerate(all_texts):
282
  n = extract_footer_internal_page(txt or "")
283
  if n is None:
284
  continue
285
- mapping.setdefault(n, pdf_i) # keep first occurrence
286
  return mapping
287
 
288
 
289
  def map_internal_to_pdf(internal: int, internal_to_pdf: Dict[int, int]) -> Optional[int]:
290
- """
291
- Robust mapping:
292
- - direct if exists
293
- - else estimate from nearest known internal page (assumes mostly consecutive internal numbering)
294
- """
295
  if internal in internal_to_pdf:
296
  return internal_to_pdf[internal]
297
-
298
- # nearest neighbor estimate
299
  keys = sorted(internal_to_pdf.keys())
300
  if not keys:
301
  return None
302
-
303
- # find closest key
304
  best_k = min(keys, key=lambda k: abs(k - internal))
305
  return internal_to_pdf[best_k] + (internal - best_k)
306
 
307
 
308
  # =========================
309
- # Strong statement scoring (only used if TOC mapping fails)
310
  # =========================
311
  def _page_stats(text: str) -> Dict[str, float]:
312
  t = text or ""
313
  low = t.lower()
314
-
315
- # numeric signals
316
  year_count = len(re.findall(r"\b20\d{2}\b", t))
317
  currency_count = len(re.findall(r"[$€£]|usd|inr|eur|gbp", low))
318
- paren_neg = len(re.findall(r"\(\s*\d", t)) # (123) negatives
319
  integral = 1.0 if INTEGRAL_FOOTER in low else 0.0
320
 
321
  tokens = re.findall(r"[A-Za-z]+|\d+(?:,\d{3})*(?:\.\d+)?", t)
322
  if not tokens:
323
- return dict(num_ratio=0.0, year_count=float(year_count), currency=float(currency_count),
324
- paren=float(paren_neg), integral=integral)
325
 
326
  nums = sum(1 for tok in tokens if re.fullmatch(r"\d+(?:,\d{3})*(?:\.\d+)?", tok))
327
  alphas = sum(1 for tok in tokens if re.fullmatch(r"[A-Za-z]+", tok))
328
  num_ratio = nums / max(1.0, nums + alphas)
329
 
330
- return dict(num_ratio=float(num_ratio), year_count=float(year_count), currency=float(currency_count),
331
- paren=float(paren_neg), integral=integral)
332
 
333
 
334
  def score_statement_page(text: str, stmt: str) -> Tuple[float, Dict[str, Any]]:
@@ -336,179 +331,320 @@ def score_statement_page(text: str, stmt: str) -> Tuple[float, Dict[str, Any]]:
336
  top = (text or "")[:1200]
337
  st = _page_stats(text)
338
 
339
- reasons = {"title": False, "sig_hits": [], "integral": False, "penalties": [], "stats": st}
340
  score = 0.0
341
 
342
- # Title near top is a MUST (or fuzzy)
343
- if detect_title(top, stmt):
344
  score += 60.0
345
  reasons["title"] = True
 
346
  else:
347
- # without title, heavily downrank (note tables can be very numeric)
348
- score -= 25.0
349
- reasons["penalties"].append("no_title(-25)")
350
 
351
- # Integral footer is very characteristic of primary statements (seen in your screenshots)
352
  if st["integral"] > 0:
353
- score += 18.0
354
  reasons["integral"] = True
355
 
356
- # Signature line items: require multiple hits
357
  hits = 0
358
  for term in SIG_TERMS.get(stmt, []):
359
  if term in low:
360
  hits += 1
361
  reasons["sig_hits"].append(term)
362
- score += min(hits, 10) * 6.0 # stronger weight
363
 
364
- # Table-ness: years + currency + negative brackets + numeric ratio
365
- score += st["num_ratio"] * 30.0
366
- score += min(st["year_count"], 10.0) * 1.5
367
- score += min(st["currency"], 10.0) * 2.0
368
  score += min(st["paren"], 10.0) * 1.0
369
 
370
- # Hard penalties for NOTE pages
371
  if NOTE_HEADING_RE.search((text or "")[:220]):
372
- score -= 60.0
373
- reasons["penalties"].append("note_heading(-60)")
374
 
375
- # If it looks like TOC index page, punish (dot leaders)
376
  if DOT_LEADER_RE.search(text or ""):
377
- score -= 30.0
378
- reasons["penalties"].append("toc_dotleaders(-30)")
379
 
380
- # Guardrails:
381
- # If title found but it doesn't look like a table at all, punish
382
- if reasons["title"] and st["num_ratio"] < 0.10 and st["year_count"] < 1:
383
- score -= 35.0
384
- reasons["penalties"].append("title_without_table(-35)")
385
 
386
- # Require at least 2 signature hits for high confidence
387
  if hits < 2:
388
- score -= 18.0
389
- reasons["penalties"].append("low_sig_hits(<2)(-18)")
390
 
391
  return score, reasons
392
 
393
 
394
- # =========================
395
- # Range inference from ordered statement starts
396
- # =========================
397
- def infer_ranges_from_starts(
398
- starts_pdf: Dict[str, int],
399
- page_count: int,
400
- ordered_keys: List[str],
401
- ) -> Dict[str, Tuple[int, int]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  """
403
- Given start pdf indices (0-based) for an ordered list of keys,
404
- return inclusive ranges for TARGETS based on next-start-1.
 
 
 
405
  """
406
- # keep only those that exist
407
- items = [(k, starts_pdf[k]) for k in ordered_keys if k in starts_pdf and isinstance(starts_pdf[k], int)]
408
- items.sort(key=lambda x: x[1])
409
 
410
- next_start = {}
411
- for idx, (k, p) in enumerate(items):
412
- nxt = items[idx + 1][1] if idx + 1 < len(items) else None
413
- next_start[k] = nxt
 
 
 
 
 
 
 
 
 
 
 
414
 
415
- ranges: Dict[str, Tuple[int, int]] = {}
416
- for k, p in items:
417
- end = (next_start[k] - 1) if next_start[k] is not None else p
418
- end = min(max(end, p), page_count - 1)
419
- ranges[k] = (p, end)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
 
421
- # return only targets that exist
422
- return {k: ranges[k] for k in TARGETS if k in ranges}
423
 
424
 
425
  # =========================
426
- # Public API
427
  # =========================
428
  def build_candidate_lists(
429
  pages: Sequence[Any],
430
- top_k: int = 25,
 
 
431
  debug: bool = True,
432
  ) -> Tuple[Dict[str, List[Tuple[int, float]]], Dict[str, Any]]:
433
  """
434
  Returns:
435
- candidates: {stmt: [(pdf_page_idx, score), ...]} for TARGETS only
436
- debug_info: contains toc/internal mapping and top explanations
437
  """
438
  all_texts = [_combined_text(p) for p in pages]
439
- page_count = len(all_texts)
440
 
441
  debug_info: Dict[str, Any] = {
442
  "item8_toc_page": None,
443
  "toc_internal": {},
444
  "internal_to_pdf_map_size": 0,
445
- "toc_pdf_targets_all": {},
446
- "heuristic_ranges_0_based": {},
447
- "top_scoring": {},
448
  }
449
 
450
- # ---- 1) TOC-based detection (most accurate on 10-K) ----
 
 
 
 
451
  toc_i = find_item8_toc_page(all_texts)
452
  if toc_i is not None:
453
- toc_text = all_texts[toc_i]
 
454
  toc_internal = parse_statement_index_numbers(toc_text)
 
 
455
  internal_to_pdf = build_internal_to_pdf_map(all_texts)
 
456
 
457
- toc_pdf_all: Dict[str, int] = {}
458
- for k, internal_n in toc_internal.items():
459
- mapped = map_internal_to_pdf(internal_n, internal_to_pdf)
460
- if mapped is not None and 0 <= mapped < page_count:
461
- toc_pdf_all[k] = mapped
462
-
463
- debug_info.update({
464
- "item8_toc_page": toc_i,
465
- "toc_internal": toc_internal,
466
- "internal_to_pdf_map_size": len(internal_to_pdf),
467
- "toc_pdf_targets_all": toc_pdf_all,
468
- })
469
-
470
- # If we got our 3 targets, build direct ranges using the typical order:
471
- # Earnings -> Comprehensive Income -> Balance Sheet -> Equity -> Cash Flow -> Notes
472
- if all(k in toc_pdf_all for k in ["profit_and_loss", "balance_sheet", "cash_flow"]):
473
- ordered = ["profit_and_loss", "comprehensive_income", "balance_sheet", "equity", "cash_flow", "notes"]
474
- ranges = infer_ranges_from_starts(toc_pdf_all, page_count, ordered)
475
- debug_info["heuristic_ranges_0_based"] = ranges
476
-
477
- # Build candidates directly from these starts with huge confidence
478
- candidates = {k: [] for k in TARGETS}
479
- for k in TARGETS:
480
- start, end = ranges.get(k, (None, None))
481
- if start is None:
482
- continue
483
- # prioritize start page; include end too
484
- candidates[k].append((start, 999.0))
485
- if end != start:
486
- candidates[k].append((end, 950.0))
487
- return candidates, debug_info
488
 
489
- # ---- 2) Fallback: statement scoring over ALL pages ----
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
  candidates: Dict[str, List[Tuple[int, float]]] = {k: [] for k in TARGETS}
491
  reasons_store: Dict[str, Dict[int, Any]] = {k: {} for k in TARGETS}
492
 
493
- for i, p in enumerate(pages):
494
- idx = _page_index(p, i)
495
- txt = _combined_text(p)
496
-
497
  for stmt in TARGETS:
498
- sc, why = score_statement_page(txt, stmt)
499
  if sc > 0:
500
- candidates[stmt].append((idx, float(sc)))
501
- if debug and (why["title"] or sc > 80):
502
- reasons_store[stmt][idx] = why
503
 
504
  for stmt in TARGETS:
505
  candidates[stmt].sort(key=lambda x: x[1], reverse=True)
506
- candidates[stmt] = candidates[stmt][:max(8, top_k)]
507
- if debug:
508
- debug_info["top_scoring"][stmt] = [
509
- {"page": p, "score": round(s, 2), "why": reasons_store[stmt].get(p)}
510
- for p, s in candidates[stmt][:10]
511
- ]
512
 
513
  return candidates, debug_info
514
 
@@ -518,12 +654,13 @@ def select_pages_for_llm(
518
  debug_info: Dict[str, Any],
519
  page_count: int,
520
  max_images: int,
 
521
  ) -> List[int]:
522
  """
523
- If TOC-based ranges exist -> send ONLY those pages (+neighbors) (highest precision).
524
- Else -> send top candidates + neighbors.
525
  """
526
- picked = []
527
  seen = set()
528
 
529
  def add(p: int):
@@ -531,19 +668,36 @@ def select_pages_for_llm(
531
  seen.add(p)
532
  picked.append(p)
533
 
534
- # TOC ranges (best)
535
- ranges = debug_info.get("heuristic_ranges_0_based") or {}
536
- if ranges:
537
  for stmt in ["profit_and_loss", "balance_sheet", "cash_flow"]:
538
- if stmt in ranges:
539
- s, e = ranges[stmt]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
540
  for p in range(s, e + 1):
541
  add(p)
542
  add(s - 1)
543
  add(e + 1)
 
544
  return sorted(picked)
545
 
546
- # fallback
547
  for stmt in ["profit_and_loss", "balance_sheet", "cash_flow"]:
548
  for (p, _sc) in candidates.get(stmt, [])[:2]:
549
  add(p)
 
 
1
  from __future__ import annotations
2
 
3
  import re
 
 
4
  import difflib
5
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
6
 
7
 
8
  # =========================
9
+ # Targets (ONLY these 3)
10
  # =========================
11
  TARGETS = ["balance_sheet", "profit_and_loss", "cash_flow"]
12
+ AUX = ["comprehensive_income", "equity", "notes"] # only for delimiting (when available)
13
 
 
 
14
 
15
  # =========================
16
+ # Title variants
17
  # =========================
18
  TITLE_VARIANTS: Dict[str, List[str]] = {
19
  "balance_sheet": [
 
21
  "Standalone Balance Sheets",
22
  "Balance Sheets",
23
  "Statement of Financial Position",
24
+ "Standalone Statement of Financial Position",
25
  ],
26
  "profit_and_loss": [
27
+ "Consolidated Statements of Earnings",
28
  "Standalone Statements of Earnings",
29
  "Consolidated Statements of Operations",
30
  "Standalone Statements of Operations",
 
32
  "Standalone Statements of Income",
33
  "Income Statement",
34
  "Statement of Profit and Loss",
35
+ "Statement of Profit & Loss",
36
  ],
37
  "cash_flow": [
38
  "Consolidated Statements of Cash Flows",
 
40
  "Statement of Cash Flows",
41
  "Cash Flow Statement",
42
  ],
43
+ # aux
44
  "comprehensive_income": [
45
  "Consolidated Statements of Comprehensive Income",
46
  "Standalone Statements of Comprehensive Income",
 
59
  ],
60
  }
61
 
 
62
  INTEGRAL_FOOTER = "the accompanying notes are an integral part"
63
 
 
 
 
64
  SIG_TERMS: Dict[str, List[str]] = {
65
  "balance_sheet": [
66
  "total assets",
 
68
  "total equity",
69
  "stockholders' equity",
70
  "shareholders' equity",
 
71
  "liabilities and equity",
72
  "current assets",
73
  "current liabilities",
74
+ "non-current assets",
75
+ "non-current liabilities",
76
  ],
77
  "profit_and_loss": [
78
  "net revenues",
79
  "net sales",
80
  "revenue",
 
81
  "cost of sales",
82
+ "cost of products sold",
83
  "gross profit",
84
  "operating income",
85
+ "operating profit",
86
+ "profit before tax",
87
  "net income",
88
+ "net earnings",
89
  "earnings per share",
90
  "basic",
91
  "diluted",
 
98
  "net cash used in investing activities",
99
  "net cash used in financing activities",
100
  "cash and cash equivalents, end of year",
 
101
  "net change in cash",
102
  ],
 
 
103
  }
104
 
105
  NOTE_HEADING_RE = re.compile(r"^\s*note\s+\d+\b", re.IGNORECASE)
 
 
106
  DOT_LEADER_RE = re.compile(r"\.{5,}")
107
+ ITEM8_RE = re.compile(
108
+ r"\bITEM\s+8\.\s+FINANCIAL\s+STATEMENTS\s+AND\s+SUPPLEMENTARY\s+DATA\b", re.IGNORECASE
109
+ )
110
+ CONTINUED_RE = re.compile(r"\bcontinued\b", re.IGNORECASE)
111
 
112
 
113
  # =========================
114
+ # Utilities
115
  # =========================
116
  def _combined_text(page_obj: Any) -> str:
 
 
 
 
 
117
  if page_obj is None:
118
  return ""
119
  if isinstance(page_obj, str):
120
  return page_obj
 
 
121
  if isinstance(page_obj, dict):
122
  a = page_obj.get("extracted_text") or page_obj.get("text") or ""
123
  b = page_obj.get("ocr_text") or ""
124
  return (a + "\n" + b).strip()
 
 
125
  a = getattr(page_obj, "extracted_text", None) or getattr(page_obj, "text", None) or ""
126
  b = getattr(page_obj, "ocr_text", None) or ""
127
  return (a + "\n" + b).strip()
128
 
129
 
 
 
 
 
 
 
 
 
130
  def _norm(s: str) -> str:
131
  return re.sub(r"\s+", " ", (s or "")).strip().lower()
132
 
133
 
 
 
 
134
  def _fuzzy_line_contains_title(top_lines: List[str], title: str, threshold: float = 0.86) -> bool:
135
  title_n = _norm(title)
136
  for ln in top_lines:
137
  ln_n = _norm(ln)
138
  if not ln_n:
139
  continue
 
140
  if title_n in ln_n:
141
  return True
 
142
  r = difflib.SequenceMatcher(None, ln_n, title_n).ratio()
143
  if r >= threshold:
144
  return True
145
  return False
146
 
147
 
148
+ def detect_title_match(text: str, stmt: str) -> Tuple[bool, Optional[str], str]:
149
+ """
150
+ Returns (matched?, matched_variant, scope)
151
+ scope in {"consolidated","standalone","unknown"}
152
+ """
153
  lines = (text or "").splitlines()
154
+ top_lines = [ln.strip() for ln in lines[:16] if ln.strip()]
155
+
156
  for variant in TITLE_VARIANTS.get(stmt, []):
157
  if _fuzzy_line_contains_title(top_lines, variant):
158
+ vlow = variant.lower()
159
+ if "consolidated" in vlow:
160
+ scope = "consolidated"
161
+ elif "standalone" in vlow or "separate" in vlow:
162
+ scope = "standalone"
163
+ else:
164
+ scope = "unknown"
165
+ return True, variant, scope
166
+
167
+ joined = " ".join(top_lines).lower()
168
+ # fallback for OCR garble
169
+ if stmt == "balance_sheet" and ("balance sheet" in joined or "financial position" in joined):
170
+ if "consolidated" in joined:
171
+ return True, None, "consolidated"
172
+ if "standalone" in joined or "separate" in joined:
173
+ return True, None, "standalone"
174
+ return True, None, "unknown"
175
+
176
+ if stmt == "cash_flow" and ("cash flow" in joined or "cash flows" in joined):
177
+ if "consolidated" in joined:
178
+ return True, None, "consolidated"
179
+ if "standalone" in joined or "separate" in joined:
180
+ return True, None, "standalone"
181
+ return True, None, "unknown"
182
+
183
+ if stmt == "profit_and_loss" and (
184
+ "statement of profit" in joined
185
+ or "profit and loss" in joined
186
+ or "income statement" in joined
187
+ or "statements of income" in joined
188
+ or "statements of operations" in joined
189
+ or "statements of earnings" in joined
190
+ ):
191
+ if "consolidated" in joined:
192
+ return True, None, "consolidated"
193
+ if "standalone" in joined or "separate" in joined:
194
+ return True, None, "standalone"
195
+ return True, None, "unknown"
196
+
197
+ return False, None, "unknown"
198
+
199
+
200
+ def detect_title(text: str, stmt: str) -> bool:
201
+ ok, _, _ = detect_title_match(text, stmt)
202
+ return ok
203
 
204
 
205
  # =========================
206
+ # (Optional) 10-K TOC mapping helpers (kept, but now scope-safe)
207
  # =========================
208
  FOOTER_PIPE_RE = re.compile(r"\|\s*(\d{1,4})\s*$", re.MULTILINE)
209
  FOOTER_FORM_RE = re.compile(r"form\s+10-?k\s*\|\s*(\d{1,4})\s*$", re.IGNORECASE | re.MULTILINE)
210
 
211
+
212
  def extract_footer_internal_page(text: str) -> Optional[int]:
213
  t = text or ""
 
214
  m = FOOTER_PIPE_RE.findall(t)
215
  if m:
216
  return int(m[-1])
 
217
  m = FOOTER_FORM_RE.findall(t)
218
  if m:
219
  return int(m[-1])
 
 
220
  lines = [ln.strip() for ln in (t.splitlines() if t else []) if ln.strip()]
221
  for ln in reversed(lines[-6:]):
222
  if re.fullmatch(r"\d{1,4}", ln):
223
  return int(ln)
 
224
  return None
225
 
226
 
 
 
 
 
227
  def find_item8_toc_page(all_texts: Sequence[str]) -> Optional[int]:
 
 
 
228
  candidates = []
229
  for i, txt in enumerate(all_texts):
230
  if not ITEM8_RE.search(txt or ""):
 
233
  tocish = ("page" in low) and (DOT_LEADER_RE.search(txt or "") is not None)
234
  if tocish:
235
  candidates.append(i)
 
236
  return candidates[0] if candidates else None
237
 
238
 
239
  def parse_statement_index_numbers(toc_text: str) -> Dict[str, int]:
240
  """
241
+ Return internal page numbers from the index.
242
+ IMPORTANT: keeps consolidated + standalone separately:
243
+ key = f"{stmt}__{scope}"
 
244
  """
245
  lines = [ln.strip() for ln in (toc_text or "").splitlines()]
246
  out: Dict[str, int] = {}
247
 
 
248
  pats = {
249
  "profit_and_loss": re.compile(r"(consolidated|standalone)\s+statements?\s+of\s+(earnings|operations|income)", re.I),
250
  "comprehensive_income": re.compile(r"(consolidated|standalone)\s+statements?\s+of\s+comprehensive\s+income", re.I),
 
258
  if not ln:
259
  continue
260
 
261
+ for stmt, pat in pats.items():
262
+ mscope = pat.search(ln)
263
+ if not mscope:
264
  continue
265
 
266
+ scope = (mscope.group(1) or "").strip().lower()
267
+ if scope not in {"consolidated", "standalone"}:
268
+ scope = "unknown"
269
+ out_key = f"{stmt}__{scope}"
270
+
271
+ # number at end of line
272
  m = re.findall(r"(\d{1,4})\s*$", ln)
273
  if m and ln.endswith(m[-1]):
274
+ out.setdefault(out_key, int(m[-1]))
275
  continue
276
 
277
+ # number on next line
278
  j = i + 1
279
  while j < len(lines) and not lines[j]:
280
  j += 1
281
  if j < len(lines) and re.fullmatch(r"\d{1,4}", lines[j]):
282
+ out.setdefault(out_key, int(lines[j]))
283
 
284
  return out
285
 
286
 
287
  def build_internal_to_pdf_map(all_texts: Sequence[str]) -> Dict[int, int]:
 
 
 
288
  mapping: Dict[int, int] = {}
289
  for pdf_i, txt in enumerate(all_texts):
290
  n = extract_footer_internal_page(txt or "")
291
  if n is None:
292
  continue
293
+ mapping.setdefault(n, pdf_i)
294
  return mapping
295
 
296
 
297
  def map_internal_to_pdf(internal: int, internal_to_pdf: Dict[int, int]) -> Optional[int]:
 
 
 
 
 
298
  if internal in internal_to_pdf:
299
  return internal_to_pdf[internal]
 
 
300
  keys = sorted(internal_to_pdf.keys())
301
  if not keys:
302
  return None
 
 
303
  best_k = min(keys, key=lambda k: abs(k - internal))
304
  return internal_to_pdf[best_k] + (internal - best_k)
305
 
306
 
307
  # =========================
308
+ # Scoring
309
  # =========================
310
  def _page_stats(text: str) -> Dict[str, float]:
311
  t = text or ""
312
  low = t.lower()
 
 
313
  year_count = len(re.findall(r"\b20\d{2}\b", t))
314
  currency_count = len(re.findall(r"[$€£]|usd|inr|eur|gbp", low))
315
+ paren_neg = len(re.findall(r"\(\s*\d", t))
316
  integral = 1.0 if INTEGRAL_FOOTER in low else 0.0
317
 
318
  tokens = re.findall(r"[A-Za-z]+|\d+(?:,\d{3})*(?:\.\d+)?", t)
319
  if not tokens:
320
+ return dict(num_ratio=0.0, year_count=float(year_count), currency=float(currency_count), paren=float(paren_neg), integral=integral)
 
321
 
322
  nums = sum(1 for tok in tokens if re.fullmatch(r"\d+(?:,\d{3})*(?:\.\d+)?", tok))
323
  alphas = sum(1 for tok in tokens if re.fullmatch(r"[A-Za-z]+", tok))
324
  num_ratio = nums / max(1.0, nums + alphas)
325
 
326
+ return dict(num_ratio=float(num_ratio), year_count=float(year_count), currency=float(currency_count), paren=float(paren_neg), integral=integral)
 
327
 
328
 
329
  def score_statement_page(text: str, stmt: str) -> Tuple[float, Dict[str, Any]]:
 
331
  top = (text or "")[:1200]
332
  st = _page_stats(text)
333
 
334
+ reasons: Dict[str, Any] = {"title": False, "scope": "unknown", "sig_hits": [], "integral": False, "penalties": [], "stats": st}
335
  score = 0.0
336
 
337
+ ok, _, scope = detect_title_match(top, stmt)
338
+ if ok:
339
  score += 60.0
340
  reasons["title"] = True
341
+ reasons["scope"] = scope
342
  else:
343
+ score -= 20.0
344
+ reasons["penalties"].append("no_title(-20)")
 
345
 
 
346
  if st["integral"] > 0:
347
+ score += 12.0
348
  reasons["integral"] = True
349
 
 
350
  hits = 0
351
  for term in SIG_TERMS.get(stmt, []):
352
  if term in low:
353
  hits += 1
354
  reasons["sig_hits"].append(term)
355
+ score += min(hits, 10) * 5.0
356
 
357
+ score += st["num_ratio"] * 24.0
358
+ score += min(st["year_count"], 10.0) * 1.2
359
+ score += min(st["currency"], 10.0) * 1.8
 
360
  score += min(st["paren"], 10.0) * 1.0
361
 
 
362
  if NOTE_HEADING_RE.search((text or "")[:220]):
363
+ score -= 45.0
364
+ reasons["penalties"].append("note_heading(-45)")
365
 
 
366
  if DOT_LEADER_RE.search(text or ""):
367
+ score -= 25.0
368
+ reasons["penalties"].append("toc_dotleaders(-25)")
369
 
370
+ if reasons["title"] and st["num_ratio"] < 0.08 and st["year_count"] < 1:
371
+ score -= 30.0
372
+ reasons["penalties"].append("title_without_table(-30)")
 
 
373
 
 
374
  if hits < 2:
375
+ score -= 12.0
376
+ reasons["penalties"].append("low_sig_hits(<2)(-12)")
377
 
378
  return score, reasons
379
 
380
 
381
+ def _statement_signal_no_title(text: str, stmt: str) -> float:
382
+ """
383
+ Continuation-page score (no title required). Used to extend blocks forward.
384
+ """
385
+ if not text:
386
+ return 0.0
387
+
388
+ if NOTE_HEADING_RE.search(text[:220]):
389
+ return 0.0
390
+ if DOT_LEADER_RE.search(text):
391
+ return 0.0
392
+
393
+ low = text.lower()
394
+ st = _page_stats(text)
395
+
396
+ hits = 0
397
+ for term in SIG_TERMS.get(stmt, []):
398
+ if term in low:
399
+ hits += 1
400
+
401
+ score = 0.0
402
+ score += min(hits, 10) * 4.5
403
+ score += st["num_ratio"] * 26.0
404
+ score += min(st["year_count"], 10.0) * 1.1
405
+ score += min(st["currency"], 10.0) * 1.5
406
+ score += min(st["paren"], 10.0) * 0.7
407
+
408
+ if CONTINUED_RE.search(text[:240]):
409
+ score += 8.0
410
+
411
+ # special: if a page has strong signature terms + years, it's often a continuation
412
+ if hits >= 2 and st["year_count"] >= 1:
413
+ score += 6.0
414
+
415
+ return score
416
+
417
+
418
+ def _any_other_statement_title(text: str, stmt: str) -> bool:
419
+ for other in TARGETS:
420
+ if other == stmt:
421
+ continue
422
+ if detect_title(text[:1200], other):
423
+ return True
424
+ return False
425
+
426
+
427
+ def _expand_block(all_texts: Sequence[str], stmt: str, start: int, max_forward: int = 6) -> int:
428
+ """
429
+ Expand forward to include continuation pages.
430
+ Stops if another statement begins (unless this stmt title repeats).
431
+ """
432
+ end = start
433
+ n = len(all_texts)
434
+
435
+ for j in range(start + 1, min(n, start + 1 + max_forward)):
436
+ txt = all_texts[j] or ""
437
+
438
+ if _any_other_statement_title(txt, stmt) and not detect_title(txt[:1200], stmt):
439
+ break
440
+
441
+ sig = _statement_signal_no_title(txt, stmt)
442
+ if sig >= 13.5:
443
+ end = j
444
+ continue
445
+
446
+ if CONTINUED_RE.search(txt[:240]) and sig >= 8.0:
447
+ end = j
448
+ continue
449
+
450
+ break
451
+
452
+ return end
453
+
454
+
455
+ def _blocks_overlap(a: Tuple[int, int], b: Tuple[int, int]) -> bool:
456
+ return not (a[1] < b[0] or b[1] < a[0])
457
+
458
+
459
+ def _dedup_blocks(blocks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
460
+ """
461
+ Deduplicate overlapping blocks, keeping higher 'score'.
462
+ """
463
+ blocks = sorted(blocks, key=lambda x: (int(x.get("start", 10**9)), -(float(x.get("score") or 0.0))))
464
+ kept: List[Dict[str, Any]] = []
465
+ for b in blocks:
466
+ r = (int(b.get("start")), int(b.get("end")))
467
+ merged = False
468
+ for k in kept:
469
+ kr = (int(k.get("start")), int(k.get("end")))
470
+ if _blocks_overlap(r, kr):
471
+ if float(b.get("score") or 0.0) > float(k.get("score") or 0.0):
472
+ k.update(b)
473
+ merged = True
474
+ break
475
+ if not merged:
476
+ kept.append(b)
477
+ return kept
478
+
479
+
480
def build_blocks_from_titles(all_texts: Sequence[str], continuation_max_forward: int = 6) -> Dict[str, List[Dict[str, Any]]]:
    """
    Find MULTIPLE blocks per statement (consolidated + standalone).

    Strategy:
      1) locate title pages for each statement
      2) cluster nearby title hits sharing the same scope
      3) expand each cluster's earliest page forward with continuation scoring
    """
    out: Dict[str, List[Dict[str, Any]]] = {stmt: [] for stmt in TARGETS}

    for stmt in TARGETS:
        # 1) collect (page_idx, score, scope, title-variant) tuples for pages
        #    whose first ~1200 chars match a statement title and that score
        #    at least 30 on full-page content.
        hits: List[Tuple[int, float, str, Optional[str]]] = []
        for page_idx, page_text in enumerate(all_texts):
            matched, variant, scope = detect_title_match((page_text or "")[:1200], stmt)
            if not matched:
                continue
            page_score, _why = score_statement_page(page_text or "", stmt)
            if page_score < 30.0:
                continue
            hits.append((page_idx, float(page_score), scope, variant))

        if not hits:
            continue
        hits.sort(key=lambda h: h[0])

        # 2) group hits of the same scope that lie within 3 pages of each other
        clusters: List[List[Tuple[int, float, str, Optional[str]]]] = [[hits[0]]]
        for hit in hits[1:]:
            prev = clusters[-1][-1]
            if hit[2] == prev[2] and hit[0] <= prev[0] + 3:
                clusters[-1].append(hit)
            else:
                clusters.append([hit])

        # 3) emit one block per cluster: start at the earliest hit, take
        #    scope/title/score from the strongest hit, expand the end forward.
        found: List[Dict[str, Any]] = []
        for cluster in clusters:
            first_page = min(entry[0] for entry in cluster)
            strongest = max(cluster, key=lambda entry: entry[1])
            block_end = _expand_block(all_texts, stmt, first_page, max_forward=continuation_max_forward)
            found.append(
                {
                    "start": int(first_page),
                    "end": int(block_end),
                    "scope": strongest[2],
                    "title": strongest[3],
                    "score": float(strongest[1]),
                }
            )

        out[stmt] = _dedup_blocks(found)

    return out
 
540
 
541
 
542
  # =========================
543
+ # Main builder
544
  # =========================
545
def build_candidate_lists(
    pages: Sequence[Any],
    page_count: int,
    topk_per_statement: int = 3,
    continuation_max_forward: int = 6,
    debug: bool = True,
) -> Tuple[Dict[str, List[Tuple[int, float]]], Dict[str, Any]]:
    """
    Build per-statement candidate pages plus heuristic page blocks.

    Combines three signals:
      1) title-detection blocks (build_blocks_from_titles),
      2) 10-K "Item 8" TOC-derived blocks (optional; mostly US 10-Ks),
      3) raw per-page scoring (score_statement_page) as a general fallback.

    Args:
        pages: page objects; each is flattened to text via _combined_text.
        page_count: NOTE(review): not referenced in this body — presumably
            kept for API compatibility with the caller; confirm before removing.
        topk_per_statement: number of top-scoring pages kept per statement.
        continuation_max_forward: forward-page limit passed to _expand_block.
        debug: when True, record per-page scoring reasons in debug_info.

    Returns:
        candidates: {stmt: [(page_idx, score), ...]} sorted by score
            descending and truncated to topk_per_statement.
        debug_info: includes heuristic_blocks_0_based per stmt (list of blocks),
            top_scoring pages, TOC diagnostics, and reasons_<stmt> maps.
    """
    all_texts = [_combined_text(p) for p in pages]

    debug_info: Dict[str, Any] = {
        "item8_toc_page": None,
        "toc_internal": {},
        "internal_to_pdf_map_size": 0,
        "heuristic_blocks_0_based": {k: [] for k in TARGETS},
        "top_scoring": {k: [] for k in TARGETS},
    }

    # 1) Title-based multi-blocks (works for many non-10K PDFs too)
    title_blocks = build_blocks_from_titles(all_texts, continuation_max_forward=continuation_max_forward)

    # 2) Try 10-K Item8 TOC mapping (optional; mostly US 10-Ks)
    toc_blocks: Dict[str, List[Dict[str, Any]]] = {k: [] for k in TARGETS}
    toc_i = find_item8_toc_page(all_texts)
    if toc_i is not None:
        debug_info["item8_toc_page"] = toc_i
        toc_text = all_texts[toc_i] or ""
        toc_internal = parse_statement_index_numbers(toc_text)
        debug_info["toc_internal"] = toc_internal

        internal_to_pdf = build_internal_to_pdf_map(all_texts)
        debug_info["internal_to_pdf_map_size"] = len(internal_to_pdf)

        # Convert internal page numbers -> physical PDF page indices.
        # Keys look like "<stmt>__<scope>"; anything else is skipped.
        for key_scoped, internal_page in toc_internal.items():
            if "__" not in key_scoped:
                continue
            stmt, scope = key_scoped.split("__", 1)
            if stmt not in TARGETS:
                continue

            start_pdf = map_internal_to_pdf(internal_page, internal_to_pdf)
            if start_pdf is None:
                continue

            # expand a block from TOC-derived start
            end_pdf = _expand_block(all_texts, stmt, start_pdf, max_forward=continuation_max_forward)

            toc_blocks[stmt].append(
                {
                    "start": int(start_pdf),
                    "end": int(end_pdf),
                    "scope": scope if scope in {"consolidated", "standalone"} else "unknown",
                    "title": None,
                    "score": 55.0,  # heuristic: fixed mid confidence for TOC-derived blocks
                }
            )

        for stmt in TARGETS:
            toc_blocks[stmt] = _dedup_blocks(toc_blocks[stmt])

    # Merge title-derived and TOC-derived blocks, deduplicating overlaps.
    merged_blocks: Dict[str, List[Dict[str, Any]]] = {}
    for stmt in TARGETS:
        merged_blocks[stmt] = _dedup_blocks((title_blocks.get(stmt) or []) + (toc_blocks.get(stmt) or []))

        # Keep only top N blocks by score, but keep distinct scope if possible:
        # a scope already seen is skipped unless it is the only block available.
        bl = sorted(merged_blocks[stmt], key=lambda b: float(b.get("score") or 0.0), reverse=True)
        chosen: List[Dict[str, Any]] = []
        seen_scope = set()
        for b in bl:
            scope = (b.get("scope") or "unknown")
            if scope in seen_scope and len(bl) > 1:
                continue
            chosen.append(b)
            seen_scope.add(scope)
            if len(chosen) >= 4:  # internal cap, actual final cap comes from settings in main
                break
        merged_blocks[stmt] = sorted(chosen, key=lambda b: (int(b["start"]), int(b["end"])))

    debug_info["heuristic_blocks_0_based"] = merged_blocks

    # 3) Strong per-page scoring candidates (fallback / also helpful for LLM page picking)
    candidates: Dict[str, List[Tuple[int, float]]] = {k: [] for k in TARGETS}
    reasons_store: Dict[str, Dict[int, Any]] = {k: {} for k in TARGETS}

    for i, txt in enumerate(all_texts):
        for stmt in TARGETS:
            sc, why = score_statement_page(txt or "", stmt)
            if sc > 0:
                candidates[stmt].append((i, float(sc)))
                # Only keep reasons for notable pages: title match or score > 80.
                if debug and (why.get("title") or sc > 80):
                    reasons_store[stmt][i] = why

    for stmt in TARGETS:
        candidates[stmt].sort(key=lambda x: x[1], reverse=True)
        debug_info["top_scoring"][stmt] = candidates[stmt][: min(len(candidates[stmt]), 10)]
        candidates[stmt] = candidates[stmt][:topk_per_statement]
        debug_info[f"reasons_{stmt}"] = reasons_store[stmt]

    return candidates, debug_info
650
 
 
654
  debug_info: Dict[str, Any],
655
  page_count: int,
656
  max_images: int,
657
+ max_blocks_per_statement: int = 2,
658
  ) -> List[int]:
659
  """
660
+ Prefer multi-block heuristic pages (include BOTH consolidated + standalone if found).
661
+ Else fallback to top candidates + neighbors.
662
  """
663
+ picked: List[int] = []
664
  seen = set()
665
 
666
  def add(p: int):
 
668
  seen.add(p)
669
  picked.append(p)
670
 
671
+ blocks_by_stmt = debug_info.get("heuristic_blocks_0_based") or {}
672
+ if isinstance(blocks_by_stmt, dict) and any(blocks_by_stmt.get(k) for k in TARGETS):
 
673
  for stmt in ["profit_and_loss", "balance_sheet", "cash_flow"]:
674
+ bl = blocks_by_stmt.get(stmt) or []
675
+ if not isinstance(bl, list) or not bl:
676
+ continue
677
+
678
+ # pick top blocks, prefer distinct scopes
679
+ bl_sorted = sorted(bl, key=lambda b: float(b.get("score") or 0.0), reverse=True)
680
+ chosen: List[Dict[str, Any]] = []
681
+ seen_scope = set()
682
+ for b in bl_sorted:
683
+ scope = (b.get("scope") or "unknown")
684
+ if scope in seen_scope and len(bl_sorted) > 1:
685
+ continue
686
+ chosen.append(b)
687
+ seen_scope.add(scope)
688
+ if len(chosen) >= max_blocks_per_statement:
689
+ break
690
+
691
+ for b in chosen:
692
+ s, e = int(b.get("start")), int(b.get("end"))
693
  for p in range(s, e + 1):
694
  add(p)
695
  add(s - 1)
696
  add(e + 1)
697
+
698
  return sorted(picked)
699
 
700
+ # fallback: use top candidates
701
  for stmt in ["profit_and_loss", "balance_sheet", "cash_flow"]:
702
  for (p, _sc) in candidates.get(stmt, [])[:2]:
703
  add(p)