FridayCodehhr commited on
Commit
fc361bb
·
verified ·
1 Parent(s): 3796af8

Upload 9 files

Browse files
Files changed (9) hide show
  1. Dockerfile +32 -0
  2. app.py +67 -0
  3. config.py +45 -0
  4. index.html +86 -0
  5. main.py +297 -0
  6. openrouter_client.py +256 -0
  7. pdf_io.py +75 -0
  8. requirements.txt +7 -0
  9. statement_candidates.py +545 -0
Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Use the official slim Python runtime as the base image
FROM python:3.10-slim

# Set the working directory in the container
WORKDIR /app

# Install system dependencies: the Tesseract OCR engine (required at runtime
# by pytesseract) plus its dev headers; clean the apt cache to keep the image small.
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    libtesseract-dev \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so dependency installation is cached by Docker
# and only re-runs when requirements.txt changes.
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application code
COPY . .

# Create a non-root user to run the app (security best practice, required by some environments)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Expose port 7860 (Hugging Face Spaces default)
EXPOSE 7860

# Command to run the application
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import tempfile
4
+ import json
5
+ from fastapi import FastAPI, File, UploadFile, HTTPException
6
+ from fastapi.responses import JSONResponse, HTMLResponse
7
+ from fastapi.staticfiles import StaticFiles
8
+ from main import analyze_pdf
9
+
10
+ app = FastAPI()
11
+
12
+ # Mount static files to serve index.html
13
+ # We assume index.html is in the same directory
14
+ app.mount("/static", StaticFiles(directory="."), name="static")
15
+
@app.get("/", response_class=HTMLResponse)
async def read_root():
    """Serve the single-page UI from index.html."""
    # Explicit encoding: without it open() uses the platform default, which
    # can garble non-ASCII characters in the page on some systems.
    with open("index.html", "r", encoding="utf-8") as f:
        return f.read()
@app.post("/analyze")
async def analyze_endpoint(file: UploadFile = File(...)):
    """Accept a PDF upload, run the page-range analysis, and return its JSON.

    Raises:
        HTTPException: 400 for non-PDF uploads, 500 for server-side failures.
    """
    # Case-insensitive extension check so "REPORT.PDF" is accepted too.
    if not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="File must be a PDF")

    # Save uploaded file to a temp location on disk.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        shutil.copyfileobj(file.file, tmp)
        tmp_path = tmp.name

    # Created before the try block so the finally clause can always reference
    # it (previously a mkdtemp failure would cause a NameError in finally).
    debug_dir = tempfile.mkdtemp()

    try:
        # Get API key from the environment (injected by Space secrets).
        api_key = os.getenv("OPENROUTER_API_KEY")
        if not api_key:
            raise HTTPException(status_code=500, detail="Server misconfigured: OPENROUTER_API_KEY missing")

        # Run the core analysis; output_path="" means "return the dict,
        # don't write a file".
        result = analyze_pdf(
            pdf_path=tmp_path,
            output_path="",
            debug_dir=debug_dir,
            openrouter_api_key=api_key,
        )

        return JSONResponse(content=result)

    except HTTPException:
        # Preserve deliberate HTTP errors (status code and detail) instead of
        # re-wrapping them as generic 500s below.
        raise
    except Exception as e:
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Clean up the uploaded temp file and the per-request debug directory.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        shutil.rmtree(debug_dir, ignore_errors=True)
65
+ if __name__ == "__main__":
66
+ import uvicorn
67
+ uvicorn.run(app, host="0.0.0.0", port=7860)
config.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ import os
4
+ from dotenv import load_dotenv
5
+
@dataclass(frozen=True)
class Settings:
    """Immutable runtime configuration resolved from kwargs and the environment.

    Attributes:
        openrouter_api_key: Bearer token for the OpenRouter API (required).
        openrouter_model: Explicit model id, or None to auto-select a free one.
        max_images: Maximum number of page images sent to the vision model.
        dpi: Render resolution used when rasterizing PDF pages.
        ocr_lang: Tesseract language code used for OCR.
        min_text_chars_for_digital: Pages with fewer extracted characters
            than this are treated as scanned and OCR'd.
        topk_per_statement: Candidate pages kept per statement type.
    """

    openrouter_api_key: str
    openrouter_model: str | None
    max_images: int
    dpi: int
    ocr_lang: str
    min_text_chars_for_digital: int
    topk_per_statement: int
# Zero-cost, vision-capable models to try in preference order
# (free + vision-capable as of their OpenRouter model pages).
DEFAULT_FREE_VISION_MODELS = [
    "google/gemma-3-12b-it:free",
    "nvidia/nemotron-nano-12b-v2-vl:free",
    "amazon/nova-2-lite-v1:free",
]
22
+
def load_settings(**kwargs) -> Settings:
    """Build a Settings object from keyword overrides and the environment.

    Precedence per field: explicit kwarg (when not None) > environment
    variable (loaded via .env through python-dotenv) > built-in default.

    Raises:
        RuntimeError: if no OpenRouter API key is available anywhere.
    """
    load_dotenv()

    def _resolve(name: str, env_var: str, default: str, cast=int):
        # Compare against None (not truthiness) so an explicit falsy override
        # such as min_text_chars_for_digital=0 is honored instead of being
        # silently replaced by the environment/default value.
        val = kwargs.get(name)
        if val is not None:
            return val
        return cast(os.getenv(env_var, default))

    api_key = kwargs.get("openrouter_api_key") or os.getenv("OPENROUTER_API_KEY", "").strip()
    if not api_key:
        raise RuntimeError("Missing OPENROUTER_API_KEY in environment/.env")

    # An empty OPENROUTER_MODEL env var means "no explicit model" -> None,
    # which triggers automatic free-model selection downstream.
    model = kwargs.get("openrouter_model") or os.getenv("OPENROUTER_MODEL", "").strip() or None

    return Settings(
        openrouter_api_key=api_key,
        openrouter_model=model,
        max_images=_resolve("max_images", "MAX_IMAGES", "12"),
        dpi=_resolve("dpi", "PDF_RENDER_DPI", "200"),
        ocr_lang=_resolve("ocr_lang", "OCR_LANG", "eng", cast=str),
        min_text_chars_for_digital=_resolve("min_text_chars_for_digital", "MIN_TEXT_CHARS_FOR_DIGITAL", "80"),
        topk_per_statement=_resolve("topk_per_statement", "TOPK_PER_STATEMENT", "3"),
    )
index.html ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Financial Report Analyzer</title>
7
+ <style>
8
+ body { font-family: sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; }
9
+ .container { border: 1px solid #ccc; padding: 20px; border-radius: 8px; background: #f9f9f9; }
10
+ h1 { text-align: center; color: #333; }
11
+ .form-group { margin-bottom: 20px; text-align: center; }
12
+ input[type="file"] { margin: 10px 0; }
13
+ button { background-color: #007bff; color: white; border: none; padding: 10px 20px; border-radius: 4px; cursor: pointer; font-size: 16px; }
14
+ button:hover { background-color: #0056b3; }
15
+ button:disabled { background-color: #ccc; cursor: not-allowed; }
16
+ #status { text-align: center; margin-top: 10px; font-weight: bold; }
17
+ #result { margin-top: 20px; white-space: pre-wrap; background: #fff; padding: 15px; border: 1px solid #ddd; border-radius: 4px; display: none; }
18
+ .error { color: #dc3545; }
19
+ </style>
20
+ </head>
21
+ <body>
22
+
23
+ <div class="container">
24
+ <h1>Financial Report Analyzer</h1>
25
+ <p style="text-align: center;">Upload a 10-K/Annual Report PDF to extract page ranges for primary financial statements.</p>
26
+
27
+ <div class="form-group">
28
+ <input type="file" id="pdfInput" accept=".pdf" />
29
+ <br>
30
+ <button id="analyzeBtn" onclick="analyzePdf()">Analyze PDF</button>
31
+ </div>
32
+
33
+ <div id="status"></div>
34
+ <pre id="result"></pre>
35
+ </div>
36
+
37
+ <script>
async function analyzePdf() {
    // Grab the UI elements we touch during the request lifecycle.
    const fileInput = document.getElementById('pdfInput');
    const analyzeButton = document.getElementById('analyzeBtn');
    const statusEl = document.getElementById('status');
    const outputEl = document.getElementById('result');

    const selectedFile = fileInput.files[0];
    if (!selectedFile) {
        alert("Please select a PDF file first.");
        return;
    }

    // Put the UI into its "busy" state.
    analyzeButton.disabled = true;
    statusEl.textContent = "Analyzing... This may take a minute.";
    statusEl.className = "";
    outputEl.style.display = 'none';
    outputEl.textContent = "";

    const payload = new FormData();
    payload.append('file', selectedFile);

    try {
        const response = await fetch('/analyze', { method: 'POST', body: payload });

        if (!response.ok) {
            // Surface the server's "detail" message when one is provided.
            const errorData = await response.json();
            throw new Error(errorData.detail || "Analysis failed");
        }

        const data = await response.json();
        statusEl.textContent = "Analysis Complete!";
        outputEl.textContent = JSON.stringify(data, null, 2);
        outputEl.style.display = 'block';
    } catch (error) {
        console.error("Error:", error);
        statusEl.textContent = "Error: " + error.message;
        statusEl.className = "error";
    } finally {
        // Re-enable the button whether the request succeeded or failed.
        analyzeButton.disabled = false;
    }
}
+ </script>
85
+ </body>
86
+ </html>
main.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import argparse
3
+ import json
4
+ import time
5
+
6
+ from config import load_settings, DEFAULT_FREE_VISION_MODELS
7
+ from pdf_io import extract_texts_from_pdf, render_pages_to_png_bytes
8
+ from statement_candidates import build_candidate_lists, select_pages_for_llm
9
+ from openrouter_client import (
10
+ choose_free_vision_model,
11
+ choose_any_free_text_model,
12
+ chat_completion,
13
+ make_user_message_with_images,
14
+ robust_json_loads,
15
+ repair_to_json,
16
+ )
17
+
18
+
19
+ PROMPT_TEMPLATE = """
20
+ You are given:
21
+ 1) OCR/extracted text for a set of PDF pages from a company's financial report (10-K/annual report)
22
+ 2) Images of the same pages
23
+
24
+ Task:
25
+ Identify the PDF PAGE RANGES (start_page, end_page) for the THREE PRIMARY FINANCIAL STATEMENT TABLES ONLY:
26
+ - Balance Sheet (a.k.a. Statement of Financial Position / Consolidated Balance Sheets)
27
+ - Profit & Loss (a.k.a. Income Statement / Statements of Earnings / Statements of Operations)
28
+ - Cash Flow Statement (Statements of Cash Flows)
29
+
30
+ IMPORTANT RULES (STRICT):
31
+ - Only return ranges for the PRIMARY consolidated financial statements pages.
32
+ - Do NOT return ranges for note disclosures (e.g., derivatives, leases, fair value tables), MD&A, segment notes, or narrative discussion.
33
+ - A primary statement table page usually has:
34
+ (a) a clear statement title at the top (e.g., “Consolidated Balance Sheets”)
35
+ (b) many numeric columns (often multiple years)
36
+ (c) canonical line items like:
37
+ Balance sheet: “Total assets”, “Total liabilities”, “Total equity/stockholders’ equity”
38
+ P&L: “Net revenues/sales”, “Cost of sales”, “Operating income”, “Net earnings/income”, “Earnings per share”
39
+ Cash flow: “Cash flows from operating/investing/financing activities”, “Net cash provided by”, “Cash and cash equivalents at end”
40
+ - If a statement continues onto the next page, include that continuation page in the range.
41
+
42
+ Pages provided (OCR snippets):
43
+ {page_snippets}
44
+
45
+ Output JSON ONLY in this schema (no extra keys, no markdown):
46
+ {{
47
+ "balance_sheet": {{"start_page": int, "end_page": int, "confidence": float, "title": str}},
48
+ "profit_and_loss": {{"start_page": int, "end_page": int, "confidence": float, "title": str}},
49
+ "cash_flow": {{"start_page": int, "end_page": int, "confidence": float, "title": str}}
50
+ }}
51
+
52
+ Remember: PDF page numbers are 1-based in your output.
53
+ """
54
+
55
+ SCHEMA_HINT = """{
56
+ "balance_sheet": {"start_page": "int|null", "end_page": "int|null", "confidence": "number", "evidence_pages": "int[]", "title": "string|null"},
57
+ "profit_and_loss": {"start_page": "int|null", "end_page": "int|null", "confidence": "number", "evidence_pages": "int[]", "title": "string|null"},
58
+ "cash_flow": {"start_page": "int|null", "end_page": "int|null", "confidence": "number", "evidence_pages": "int[]", "title": "string|null"},
59
+ "notes": "string[]"
60
+ }"""
61
+
62
+
63
+ def log(msg: str):
64
+ ts = time.strftime("%H:%M:%S")
65
+ print(f"[{ts}] {msg}", flush=True)
66
+
67
+
68
+ def build_page_snippets(page_texts, selected_pages):
69
+ chunks = []
70
+ for p in selected_pages:
71
+ pt = page_texts[p]
72
+ txt = (pt.extracted_text or "") + "\n" + (pt.ocr_text or "")
73
+ txt = " ".join(txt.strip().split())
74
+ if len(txt) > 900:
75
+ txt = txt[:900] + "..."
76
+ chunks.append(f"- Page {p+1}: {txt}")
77
+ return "\n".join(chunks)
78
+
79
+
80
+ def validate_ranges(result: dict, page_count: int) -> dict:
81
+ def clamp(v):
82
+ if v is None:
83
+ return None
84
+ if not isinstance(v, int):
85
+ return None
86
+ if v < 1 or v > page_count:
87
+ return None
88
+ return v
89
+
90
+ for k in ["balance_sheet", "profit_and_loss", "cash_flow"]:
91
+ obj = result.get(k, {})
92
+ if not isinstance(obj, dict):
93
+ result[k] = {"start_page": None, "end_page": None, "confidence": 0.0, "evidence_pages": [], "title": None}
94
+ continue
95
+
96
+ sp = clamp(obj.get("start_page"))
97
+ ep = clamp(obj.get("end_page"))
98
+ if sp is not None and ep is not None and ep < sp:
99
+ sp, ep = None, None
100
+
101
+ obj["start_page"] = sp
102
+ obj["end_page"] = ep
103
+ if "confidence" not in obj or not isinstance(obj["confidence"], (int, float)):
104
+ obj["confidence"] = 0.0
105
+ if "evidence_pages" not in obj or not isinstance(obj["evidence_pages"], list):
106
+ obj["evidence_pages"] = []
107
+ if "title" not in obj:
108
+ obj["title"] = None
109
+ result[k] = obj
110
+
111
+ if "notes" not in result or not isinstance(result["notes"], list):
112
+ result["notes"] = []
113
+ return result
114
+
115
+
116
+ def analyze_pdf(
117
+ pdf_path: str,
118
+ output_path: str = "ranges.json",
119
+ debug_dir: str = "debug",
120
+ openrouter_api_key: str = None
121
+ ) -> dict:
122
+ """
123
+ Analyzes a PDF to find financial statement page ranges.
124
+ Returns the result dict.
125
+ """
126
+ settings_kwargs = {}
127
+ if openrouter_api_key:
128
+ settings_kwargs["openrouter_api_key"] = openrouter_api_key
129
+
130
+ st = load_settings(**settings_kwargs)
131
+
132
+ log(f"Loading PDF: {pdf_path}")
133
+ page_texts, page_count = extract_texts_from_pdf(
134
+ pdf_path=pdf_path,
135
+ dpi=st.dpi,
136
+ ocr_lang=st.ocr_lang,
137
+ min_text_chars_for_digital=st.min_text_chars_for_digital,
138
+ )
139
+
140
+ ocr_pages = sum(1 for p in page_texts if p.used_ocr)
141
+ log(f"Pages: {page_count} | OCR used on {ocr_pages} pages")
142
+
143
+ candidates, cand_debug = build_candidate_lists(page_texts, top_k=30, debug=True)
144
+ log("TOC/Index debug:")
145
+ log(f" item8_toc_page = {cand_debug.get('item8_toc_page')}")
146
+ log(f" toc_internal = {cand_debug.get('toc_internal')}")
147
+ log(f" toc_pdf_all = {cand_debug.get('toc_pdf_targets_all')}")
148
+ log(f" heuristic_ranges_0_based = {cand_debug.get('heuristic_ranges_0_based')}")
149
+
150
+ selected_pages = select_pages_for_llm(
151
+ candidates=candidates,
152
+ debug_info=cand_debug,
153
+ page_count=page_count,
154
+ max_images=st.max_images
155
+ )
156
+ log(f"Selected pages to render/send (1-indexed): {[p+1 for p in selected_pages]}")
157
+
158
+ log(f"Rendering {len(selected_pages)} pages to images (dpi={st.dpi})...")
159
+ page_png_map = render_pages_to_png_bytes(pdf_path, selected_pages, dpi=st.dpi)
160
+ log("Image rendering done.")
161
+
162
+ if st.openrouter_model:
163
+ model = st.openrouter_model
164
+ log(f"Using model from env: {model}")
165
+ else:
166
+ model = choose_free_vision_model(st.openrouter_api_key, preferred=DEFAULT_FREE_VISION_MODELS)
167
+ log(f"Auto-selected free vision model: {model}")
168
+
169
+ snippets = build_page_snippets(page_texts, selected_pages)
170
+ prompt = PROMPT_TEMPLATE.format(page_snippets=snippets)
171
+
172
+ # --- LLM call with progressive image backoff ---
173
+ pages_sent = list(selected_pages)
174
+ llm_res = None
175
+ while pages_sent:
176
+ images = [page_png_map[p] for p in pages_sent]
177
+ msg = make_user_message_with_images(prompt, images)
178
+
179
+ log(f"Calling OpenRouter (images={len(images)})...")
180
+ llm_res = chat_completion(
181
+ api_key=st.openrouter_api_key,
182
+ model=model,
183
+ messages=[msg],
184
+ max_tokens=4096,
185
+ temperature=0.0,
186
+ require_json=True,
187
+ )
188
+
189
+ log(f"finish_reason={llm_res.finish_reason} native={llm_res.native_finish_reason} content_len={len(llm_res.content)}")
190
+
191
+ # save raw response for debugging
192
+ try:
193
+ import os
194
+ os.makedirs(debug_dir, exist_ok=True)
195
+ with open(f"{debug_dir}/openrouter_raw_response.json", "w", encoding="utf-8") as f:
196
+ json.dump(llm_res.raw, f, indent=2)
197
+ except Exception:
198
+ pass
199
+
200
+ if llm_res.finish_reason == "error" or ("error" in llm_res.raw and llm_res.raw["error"]):
201
+ log("OpenRouter returned an error payload (see debug/openrouter_raw_response.json). Backing off images...")
202
+ elif llm_res.content.strip():
203
+ break
204
+
205
+ if len(pages_sent) <= 3:
206
+ break
207
+ pages_sent = pages_sent[:-2]
208
+ log(f"Retrying with fewer images. Now sending pages: {[p+1 for p in pages_sent]}")
209
+
210
+ if not llm_res:
211
+ raise RuntimeError("LLM call never executed.")
212
+
213
+ raw_text = (llm_res.content or "").strip()
214
+ log("DEBUG: raw model output (first 1200 chars):")
215
+ print(raw_text[:1200], flush=True)
216
+
217
+ # --- Parse JSON with repair fallback ---
218
+ try:
219
+ result = robust_json_loads(raw_text)
220
+ log("Parsed JSON successfully.")
221
+ except Exception as e:
222
+ log(f"JSON parse failed: {e}")
223
+ # Save raw text
224
+ try:
225
+ import os
226
+ os.makedirs(debug_dir, exist_ok=True)
227
+ with open(f"{debug_dir}/llm_raw_output.txt", "w", encoding="utf-8") as f:
228
+ f.write(raw_text)
229
+ except Exception:
230
+ pass
231
+
232
+ # Repair pass with free-tier text model
233
+ repair_model = choose_any_free_text_model(st.openrouter_api_key, preferred=[
234
+ model, # try same model first
235
+ "google/gemma-3-12b-it:free",
236
+ "amazon/nova-2-lite-v1:free",
237
+ "nvidia/nemotron-nano-12b-v2-vl:free",
238
+ ])
239
+ log(f"Attempting JSON repair using: {repair_model}")
240
+ try:
241
+ result = repair_to_json(
242
+ api_key=st.openrouter_api_key,
243
+ model=repair_model,
244
+ bad_output=raw_text if raw_text else json.dumps(llm_res.raw),
245
+ schema_hint=SCHEMA_HINT,
246
+ )
247
+ log("Repair JSON succeeded.")
248
+ except Exception as e2:
249
+ log(f"Repair JSON failed: {e2}")
250
+ # Final safe fallback
251
+ result = {
252
+ "balance_sheet": {"start_page": None, "end_page": None, "confidence": 0.0, "evidence_pages": [], "title": None},
253
+ "profit_and_loss": {"start_page": None, "end_page": None, "confidence": 0.0, "evidence_pages": [], "title": None},
254
+ "cash_flow": {"start_page": None, "end_page": None, "confidence": 0.0, "evidence_pages": [], "title": None},
255
+ "notes": [
256
+ "Model output could not be parsed as JSON.",
257
+ "Check debug/openrouter_raw_response.json and debug/llm_raw_output.txt",
258
+ ],
259
+ }
260
+
261
+ result = validate_ranges(result, page_count=page_count)
262
+ result["debug"] = {
263
+ "model_used": model,
264
+ "pages_sent": [p + 1 for p in pages_sent],
265
+ "candidate_pages": candidates,
266
+ "finish_reason": llm_res.finish_reason,
267
+ "native_finish_reason": llm_res.native_finish_reason,
268
+ }
269
+
270
+ if output_path:
271
+ with open(output_path, "w", encoding="utf-8") as f:
272
+ json.dump(result, f, indent=2)
273
+ log(f"Saved output: {output_path}")
274
+
275
+ return result
276
+
277
+
278
+ def main():
279
+ ap = argparse.ArgumentParser()
280
+ ap.add_argument("--pdf", required=True, help="Path to financial report PDF")
281
+ ap.add_argument("--out", default="ranges.json", help="Output JSON path")
282
+ ap.add_argument("--debug_dir", default="debug", help="Folder to write debug artifacts")
283
+ args = ap.parse_args()
284
+
285
+ # Call the core logic
286
+ result = analyze_pdf(
287
+ pdf_path=args.pdf,
288
+ output_path=args.out,
289
+ debug_dir=args.debug_dir
290
+ )
291
+
292
+ # Print result to stdout for CLI use
293
+ print(json.dumps(result, indent=2), flush=True)
294
+
295
+
296
+ if __name__ == "__main__":
297
+ main()
openrouter_client.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import base64
3
+ import json
4
+ import re
5
+ from dataclasses import dataclass
6
+ from typing import Any, Dict, List, Optional, Tuple
7
+
8
+ import requests
9
+
10
+ OPENROUTER_CHAT_URL = "https://openrouter.ai/api/v1/chat/completions"
11
+ OPENROUTER_MODELS_URL = "https://openrouter.ai/api/v1/models"
12
+
13
+
@dataclass
class ChatResult:
    """Normalized view of one OpenRouter chat-completion response.

    Attributes:
        content: Assistant message text ("" when an error payload came back).
        finish_reason: OpenAI-style finish reason, if reported.
        native_finish_reason: Provider-native finish reason, if reported.
        tool_calls: Any tool/function calls present on the message.
        raw: Full decoded JSON response, kept for debugging.
    """

    content: str
    finish_reason: str | None
    native_finish_reason: str | None
    tool_calls: Any
    raw: dict
21
+
22
+
def list_models(api_key: str) -> dict:
    """Fetch the full OpenRouter /models catalog as a decoded JSON dict."""
    response = requests.get(
        OPENROUTER_MODELS_URL,
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=60,
    )
    response.raise_for_status()
    return response.json()
28
+
29
+
def choose_free_vision_model(api_key: str, preferred: list[str]) -> str:
    """Pick a zero-cost, image-capable model id from the OpenRouter catalog.

    Preferred ids are tried first, in order; otherwise the first free vision
    model found in the catalog is returned.

    Raises:
        RuntimeError: if the catalog contains no free vision-capable model.
    """
    catalog = list_models(api_key).get("data", [])
    models_by_id = {entry.get("id"): entry for entry in catalog}

    def _is_free(entry: dict) -> bool:
        # Both prompt and completion pricing must be exactly zero; unknown
        # pricing defaults to "1" so it counts as paid.
        pricing = entry.get("pricing") or {}
        try:
            return float(pricing.get("prompt", "1")) == 0.0 and float(pricing.get("completion", "1")) == 0.0
        except Exception:
            return False

    def _accepts_images(entry: dict) -> bool:
        arch = entry.get("architecture") or {}
        return "image" in set(arch.get("input_modalities") or [])

    # Preferred ids first.
    for model_id in preferred:
        candidate = models_by_id.get(model_id)
        if candidate and _is_free(candidate) and _accepts_images(candidate):
            return model_id

    # Otherwise any free vision model, in catalog order.
    for candidate in catalog:
        if _is_free(candidate) and _accepts_images(candidate):
            return candidate.get("id")

    raise RuntimeError("Could not find any free vision-capable model in /models.")
58
+
59
+
def choose_any_free_text_model(api_key: str, preferred: list[str] | None = None) -> str:
    """Pick a zero-cost model id that accepts text input.

    Preferred ids (if given) are tried first; otherwise the first free
    text-input model in the catalog wins.

    Raises:
        RuntimeError: if no free text-capable model exists.
    """
    catalog = list_models(api_key).get("data", [])
    models_by_id = {entry.get("id"): entry for entry in catalog}

    def _is_free(entry: dict) -> bool:
        pricing = entry.get("pricing") or {}
        try:
            return float(pricing.get("prompt", "1")) == 0.0 and float(pricing.get("completion", "1")) == 0.0
        except Exception:
            return False

    def _accepts_text(entry: dict) -> bool:
        arch = entry.get("architecture") or {}
        return "text" in set(arch.get("input_modalities") or [])

    for model_id in preferred or []:
        candidate = models_by_id.get(model_id)
        if candidate and _is_free(candidate) and _accepts_text(candidate):
            return model_id

    for candidate in catalog:
        if _is_free(candidate) and _accepts_text(candidate):
            return candidate.get("id")

    raise RuntimeError("Could not find any free text-capable model in /models.")
87
+
88
+
89
+ def _img_bytes_to_data_url(png_bytes: bytes) -> str:
90
+ b64 = base64.b64encode(png_bytes).decode("utf-8")
91
+ return f"data:image/png;base64,{b64}"
92
+
93
+
def make_user_message_with_images(prompt_text: str, images: list[bytes]) -> dict:
    """
    Build an OpenAI-style user message carrying text plus inline PNG images.

    OpenRouter follows the OpenAI chat schema; some SDK examples show
    imageUrl (camelCase), so both key spellings are emitted for maximum
    compatibility.
    """
    parts: list[dict] = [{"type": "text", "text": prompt_text}]
    for png in images:
        # Inline each image as a base64 data: URL.
        data_url = "data:image/png;base64," + base64.b64encode(png).decode("utf-8")
        parts.append({
            "type": "image_url",
            "image_url": {"url": data_url},  # OpenAI-style
            "imageUrl": {"url": data_url},   # SDK-style
        })
    return {"role": "user", "content": parts}
110
+
111
+
def chat_completion(
    api_key: str,
    model: str,
    messages: list[dict],
    max_tokens: int = 2000,
    temperature: float = 0.0,
    require_json: bool = True,
    extra: dict | None = None,
) -> ChatResult:
    """POST a chat-completion request to OpenRouter and normalize the reply.

    Args:
        api_key: OpenRouter bearer token.
        model: Model id to invoke.
        messages: OpenAI-style chat messages.
        max_tokens: Completion token cap.
        temperature: Sampling temperature.
        require_json: When True, request JSON mode via response_format.
        extra: Optional extra payload fields merged into the request body.

    Returns:
        A ChatResult; content="" and finish_reason="error" when OpenRouter
        returns an error payload (which can arrive even with HTTP 200).

    Raises:
        requests.HTTPError: for non-2xx HTTP responses.
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "http://localhost",
        "X-Title": "fin-statement-page-locator",
    }

    payload: dict[str, Any] = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
        # Force no tool calls even if provider supports them
        "tool_choice": "none",
    }

    if require_json:
        # OpenRouter supports response_format json_object (JSON mode)
        payload["response_format"] = {"type": "json_object"}

    if extra:
        payload.update(extra)

    r = requests.post(OPENROUTER_CHAT_URL, headers=headers, json=payload, timeout=180)
    r.raise_for_status()
    data = r.json()

    # OpenRouter can return errors at top-level even with HTTP 200 in some scenarios
    if isinstance(data, dict) and "error" in data and data["error"]:
        # keep raw for debugging
        return ChatResult(
            content="",
            finish_reason="error",
            native_finish_reason=None,
            tool_calls=None,
            raw=data,
        )

    # Defensive indexing: an empty "choices" list yields an empty choice dict.
    choice0 = (data.get("choices") or [{}])[0]
    msg = choice0.get("message") or {}

    # Content may be None; normalize to a stripped string. Tool calls appear
    # under either snake_case or camelCase depending on the provider.
    content = (msg.get("content") or "").strip()
    tool_calls = msg.get("tool_calls") or msg.get("toolCalls")

    return ChatResult(
        content=content,
        finish_reason=choice0.get("finish_reason"),
        native_finish_reason=choice0.get("native_finish_reason"),
        tool_calls=tool_calls,
        raw=data,
    )
172
+
173
+
174
+ def _extract_json_from_codeblock(s: str) -> str | None:
175
+ # ```json ... ```
176
+ m = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", s, flags=re.IGNORECASE)
177
+ if m:
178
+ return m.group(1).strip()
179
+ return None
180
+
181
+
182
+ def _extract_first_balanced_object(s: str) -> str | None:
183
+ """
184
+ Extract the first balanced {...} JSON object from arbitrary text.
185
+ """
186
+ start = s.find("{")
187
+ if start == -1:
188
+ return None
189
+
190
+ depth = 0
191
+ for i in range(start, len(s)):
192
+ ch = s[i]
193
+ if ch == "{":
194
+ depth += 1
195
+ elif ch == "}":
196
+ depth -= 1
197
+ if depth == 0:
198
+ return s[start : i + 1]
199
+ return None
200
+
201
+
def robust_json_loads(s: str) -> dict:
    """Parse model output into a dict, tolerating fences and surrounding prose.

    Strategy: (1) parse directly, (2) parse the contents of a markdown code
    fence, (3) parse the first balanced {...} object found in the text.

    Raises:
        ValueError: when the input is empty or contains no parseable JSON.
    """
    text = (s or "").strip()
    if not text:
        raise ValueError("Empty model content (no JSON to parse).")

    # 1) Happy path: the content is already valid JSON.
    try:
        return json.loads(text)
    except Exception:
        pass

    # 2) JSON wrapped in a markdown code fence.
    fenced = _extract_json_from_codeblock(text)
    if fenced:
        try:
            return json.loads(fenced)
        except Exception:
            pass

    # 3) First balanced object embedded in prose.
    candidate = _extract_first_balanced_object(text)
    if candidate:
        return json.loads(candidate)

    raise ValueError("Could not parse JSON from model output (no valid JSON object found).")
227
+
228
+
def repair_to_json(
    api_key: str,
    model: str,
    bad_output: str,
    schema_hint: str,
) -> dict:
    """
    Ask a free model to convert arbitrary text into valid JSON for our schema.

    Sends one JSON-mode chat request and parses the reply with
    robust_json_loads, so fenced or prose-wrapped answers still work.
    """
    repair_prompt = f"""Convert the following content into VALID JSON ONLY.
No markdown, no backticks, no explanations.

Schema (must match keys/types):
{schema_hint}

Content to convert:
{bad_output}
"""
    response = chat_completion(
        api_key=api_key,
        model=model,
        messages=[{"role": "user", "content": repair_prompt}],
        max_tokens=900,
        temperature=0.0,
        require_json=True,
    )
    return robust_json_loads(response.content)
pdf_io.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ from typing import List, Optional, Tuple
4
+ import fitz # PyMuPDF
5
+ from PIL import Image
6
+ import io
7
+
@dataclass
class PageText:
    """Text recovered from a single PDF page.

    Attributes:
        page_index: 0-based page index within the document.
        extracted_text: Digitally embedded (selectable) text, possibly empty.
        ocr_text: Tesseract OCR output ("" when OCR was not run).
        used_ocr: True when the page looked scanned and OCR was run.
    """

    page_index: int  # 0-based
    extracted_text: str
    ocr_text: str
    used_ocr: bool
15
+ def _safe_text(s: str) -> str:
16
+ return (s or "").replace("\x00", " ").strip()
17
+
def render_page_to_pil(doc: fitz.Document, page_index: int, dpi: int) -> Image.Image:
    """Rasterize one PDF page to an RGB PIL image at the requested DPI."""
    # PDF user space is 72 DPI, so scale by dpi/72 on both axes.
    scale = dpi / 72.0
    pix = doc.load_page(page_index).get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
    return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
25
+
def ocr_pil_image(img: Image.Image, lang: str = "eng") -> str:
    """Run Tesseract OCR on a PIL image and return sanitized text.

    Raises:
        RuntimeError: when pytesseract (or the system Tesseract) is absent.
    """
    # Imported lazily so the module loads even without OCR support installed.
    try:
        import pytesseract
    except Exception as e:
        raise RuntimeError(
            "pytesseract not available. Install pytesseract and system Tesseract OCR."
        ) from e

    # --psm 6 assumes a single uniform block of text, which works well for
    # financial tables and headings.
    raw = pytesseract.image_to_string(img, lang=lang, config="--psm 6")
    return _safe_text(raw)
37
+
def is_likely_scanned(extracted_text: str, min_chars: int) -> bool:
    """Heuristic: a page with almost no selectable text is probably a scan."""
    # Same normalization as _safe_text, inlined: drop NULs, trim, count.
    cleaned = (extracted_text or "").replace("\x00", " ").strip()
    return len(cleaned) < min_chars
41
+
def extract_texts_from_pdf(
    pdf_path: str,
    dpi: int,
    ocr_lang: str,
    min_text_chars_for_digital: int,
) -> Tuple[List[PageText], int]:
    """Extract text from every page, OCR-ing pages that look scanned.

    Args:
        pdf_path: Path to the PDF file.
        dpi: Render resolution used when a page must be rasterized for OCR.
        ocr_lang: Tesseract language code.
        min_text_chars_for_digital: Pages with fewer extracted characters
            than this are treated as scanned and sent through OCR.

    Returns:
        A (page_texts, page_count) tuple; page_texts is ordered by page index.

    Raises:
        RuntimeError: from ocr_pil_image when OCR is needed but unavailable.
    """
    doc = fitz.open(pdf_path)
    page_count = doc.page_count
    results: List[PageText] = []

    for i in range(page_count):
        page = doc.load_page(i)
        extracted = _safe_text(page.get_text("text"))

        if is_likely_scanned(extracted, min_text_chars_for_digital):
            # Scanned page: rasterize it and run Tesseract.
            img = render_page_to_pil(doc, i, dpi=dpi)
            ocr_txt = ocr_pil_image(img, lang=ocr_lang)
            results.append(PageText(i, extracted_text=extracted, ocr_text=ocr_txt, used_ocr=True))
        else:
            # Digital page: the embedded text is good enough, skip OCR.
            results.append(PageText(i, extracted_text=extracted, ocr_text="", used_ocr=False))

    doc.close()
    return results, page_count
65
+
def render_pages_to_png_bytes(pdf_path: str, page_indices: List[int], dpi: int) -> dict[int, bytes]:
    """Render the given pages to PNG-encoded bytes.

    Args:
        pdf_path: Path to the PDF file.
        page_indices: 0-based page indices to render.
        dpi: Render resolution.

    Returns:
        Mapping of 0-based page index -> PNG image bytes.
    """
    doc = fitz.open(pdf_path)
    out: dict[int, bytes] = {}
    try:
        for p in page_indices:
            img = render_page_to_pil(doc, p, dpi=dpi)
            buf = io.BytesIO()
            img.save(buf, format="PNG")
            out[p] = buf.getvalue()
    finally:
        # Always release the document handle, even if rendering raises
        # mid-way (previously an exception leaked the open document).
        doc.close()
    return out
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
fastapi
uvicorn
python-multipart
pymupdf
pillow
requests
python-dotenv
# Required by pdf_io.ocr_pil_image; the Dockerfile installs the system
# tesseract-ocr binary, but the Python wrapper must be pip-installed too.
pytesseract
statement_candidates.py ADDED
@@ -0,0 +1,545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # statement_candidates.py
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ from dataclasses import dataclass
6
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
7
+ import difflib
8
+
9
+
10
# =========================
# Targets (you want ONLY these 3)
# =========================
# The three primary statements this module ultimately locates.
TARGETS = ["balance_sheet", "profit_and_loss", "cash_flow"]

# Auxiliary statements used ONLY for delimiting ranges (helpful in 10-K order)
AUX = ["comprehensive_income", "equity", "notes"]

# =========================
# Title variants (based on your screenshots + common 10-K phrasing)
# =========================
# Known page-top headings per statement key; matched fuzzily (see
# detect_title) so OCR typos still hit.
TITLE_VARIANTS: Dict[str, List[str]] = {
    "balance_sheet": [
        "Consolidated Balance Sheets",
        "Balance Sheets",
        "Statement of Financial Position",
    ],
    "profit_and_loss": [
        "Consolidated Statements of Earnings",  # AbbVie screenshot
        "Consolidated Statements of Operations",
        "Consolidated Statements of Income",
        "Income Statement",
        "Statement of Profit and Loss",
    ],
    "cash_flow": [
        "Consolidated Statements of Cash Flows",
        "Statement of Cash Flows",
        "Cash Flow Statement",
    ],
    # auxiliary
    "comprehensive_income": [
        "Consolidated Statements of Comprehensive Income",
        "Statement of Comprehensive Income",
    ],
    "equity": [
        "Consolidated Statements of Equity",
        "Statement of Stockholders' Equity",
        "Statement of Shareholders' Equity",
    ],
    "notes": [
        "Notes to Consolidated Financial Statements",
        "Notes to Financial Statements",
    ],
}

# Footer phrase (exact idea from your images)
# Lowercased because it is tested against lowercased page text.
INTEGRAL_FOOTER = "the accompanying notes are an integral part"

# =========================
# Signature table line-items (increase precision against note tables)
# =========================
# Lowercased substrings expected inside the body of each statement; multiple
# hits raise the score in score_statement_page.
SIG_TERMS: Dict[str, List[str]] = {
    "balance_sheet": [
        "total assets",
        "total liabilities",
        "total equity",
        "stockholders' equity",
        "shareholders' equity",
        "assets",
        "liabilities and equity",
        "current assets",
        "current liabilities",
    ],
    "profit_and_loss": [
        "net revenues",
        "net sales",
        "revenue",
        "cost of products sold",
        "cost of sales",
        "gross profit",
        "operating income",
        "operating earnings",
        "net earnings",
        "net income",
        "earnings per share",
        "basic",
        "diluted",
    ],
    "cash_flow": [
        "cash flows from operating activities",
        "cash flows from investing activities",
        "cash flows from financing activities",
        "net cash provided by operating activities",
        "net cash used in investing activities",
        "net cash used in financing activities",
        "cash and cash equivalents, end of year",
        "cash and equivalents, end of year",
        "net change in cash",
    ],
    # aux
    "notes": ["note 1", "note 2", "notes to consolidated financial statements"],
}

# Matches pages that open with a "Note 12 ..." heading (used as a penalty).
NOTE_HEADING_RE = re.compile(r"^\s*note\s+\d+\b", re.IGNORECASE)

# Typical TOC “dot leaders”
DOT_LEADER_RE = re.compile(r"\.{5,}")

# Item 8 TOC trigger
ITEM8_RE = re.compile(r"\bITEM\s+8\.\s+FINANCIAL\s+STATEMENTS\s+AND\s+SUPPLEMENTARY\s+DATA\b", re.IGNORECASE)
110
+
111
+
112
+ # =========================
113
+ # Page object -> combined text
114
+ # =========================
115
+ def _combined_text(page_obj: Any) -> str:
116
+ """
117
+ Works with your PageText dataclass:
118
+ extracted_text + ocr_text
119
+ Also supports dict/object string fallback.
120
+ """
121
+ if page_obj is None:
122
+ return ""
123
+ if isinstance(page_obj, str):
124
+ return page_obj
125
+
126
+ # dict-like
127
+ if isinstance(page_obj, dict):
128
+ a = page_obj.get("extracted_text") or page_obj.get("text") or ""
129
+ b = page_obj.get("ocr_text") or ""
130
+ return (a + "\n" + b).strip()
131
+
132
+ # attribute style
133
+ a = getattr(page_obj, "extracted_text", None) or getattr(page_obj, "text", None) or ""
134
+ b = getattr(page_obj, "ocr_text", None) or ""
135
+ return (a + "\n" + b).strip()
136
+
137
+
138
+ def _page_index(page_obj: Any, fallback: int) -> int:
139
+ if isinstance(page_obj, dict):
140
+ if isinstance(page_obj.get("page_index"), int):
141
+ return int(page_obj["page_index"])
142
+ v = getattr(page_obj, "page_index", None)
143
+ return int(v) if isinstance(v, int) else fallback
144
+
145
+
146
+ def _norm(s: str) -> str:
147
+ return re.sub(r"\s+", " ", (s or "")).strip().lower()
148
+
149
+
150
+ # =========================
151
+ # Fuzzy title detection (OCR typos tolerant)
152
+ # =========================
153
def _fuzzy_line_contains_title(top_lines: List[str], title: str, threshold: float = 0.86) -> bool:
    """
    True if any of *top_lines* contains *title* verbatim (after normalization)
    or fuzzy-matches it above *threshold* (tolerates OCR typos).
    """
    wanted = _norm(title)
    for raw in top_lines:
        line = _norm(raw)
        if not line:
            continue
        # exact containment beats the fuzzy check
        if wanted in line:
            return True
        similarity = difflib.SequenceMatcher(None, line, wanted).ratio()
        if similarity >= threshold:
            return True
    return False
167
+
168
+
169
def detect_title(text: str, stmt: str) -> bool:
    """Check whether any known title variant for *stmt* appears near the top of *text*."""
    # Statement titles live within the first ~14 lines on these filings.
    top_lines = [ln.strip() for ln in (text or "").splitlines()[:14] if ln.strip()]
    return any(
        _fuzzy_line_contains_title(top_lines, variant)
        for variant in TITLE_VARIANTS.get(stmt, [])
    )
176
+
177
+
178
+ # =========================
179
+ # Footer internal page number extraction (10-K style)
180
+ # =========================
181
FOOTER_PIPE_RE = re.compile(r"\|\s*(\d{1,4})\s*$", re.MULTILINE)
FOOTER_FORM_RE = re.compile(r"form\s+10-?k\s*\|\s*(\d{1,4})\s*$", re.IGNORECASE | re.MULTILINE)

def extract_footer_internal_page(text: str) -> Optional[int]:
    """
    Pull the printed (internal) page number out of a 10-K footer, if present.

    Tries "... | 57" and "Form 10-K | 57" footers first, then falls back to a
    digits-only line among the last few non-empty lines.
    """
    t = text or ""

    # Footer patterns, in priority order.
    for pattern in (FOOTER_PIPE_RE, FOOTER_FORM_RE):
        found = pattern.findall(t)
        if found:
            return int(found[-1])

    # Fallback: only accept lines that are NOTHING but digits, and only near
    # the bottom of the page (avoids grabbing numbers out of table cells).
    tail = [ln.strip() for ln in t.splitlines() if ln.strip()]
    for candidate in reversed(tail[-6:]):
        if re.fullmatch(r"\d{1,4}", candidate):
            return int(candidate)

    return None
202
+
203
+
204
+ # =========================
205
+ # Item 8 TOC page detection + TOC parsing
206
+ # AbbVie TOC is "title line" then next line has page number ("55")
207
+ # =========================
208
def find_item8_toc_page(all_texts: Sequence[str]) -> Optional[int]:
    """
    Return the index of the first Item 8 page that LOOKS like an index/TOC
    (mentions 'Page' and contains dot leaders), or None.
    """
    for i, txt in enumerate(all_texts):
        if not ITEM8_RE.search(txt or ""):
            continue
        lowered = _norm(txt)
        # Require both signals so a plain Item 8 heading page doesn't qualify.
        if "page" in lowered and DOT_LEADER_RE.search(txt or "") is not None:
            return i
    return None
222
+
223
+
224
def parse_statement_index_numbers(toc_text: str) -> Dict[str, int]:
    """
    Parse internal page numbers for each statement out of the index text.

    Handles both layouts:
      - same line:  "Consolidated Balance Sheets .... 57"
      - two lines:  "Consolidated Balance Sheets" then "57" (AbbVie)
    """
    pats = {
        "profit_and_loss": re.compile(r"consolidated\s+statements?\s+of\s+(earnings|operations|income)", re.I),
        "comprehensive_income": re.compile(r"consolidated\s+statements?\s+of\s+comprehensive\s+income", re.I),
        "balance_sheet": re.compile(r"consolidated\s+balance\s+sheets?|statement\s+of\s+financial\s+position", re.I),
        "equity": re.compile(r"consolidated\s+statements?\s+of\s+equity|stockholders[’']\s+equity|shareholders[’']\s+equity", re.I),
        "cash_flow": re.compile(r"consolidated\s+statements?\s+of\s+cash\s+flows?", re.I),
        "notes": re.compile(r"notes\s+to\s+consolidated\s+financial\s+statements", re.I),
    }

    lines = [ln.strip() for ln in (toc_text or "").splitlines()]
    out: Dict[str, int] = {}

    for i, ln in enumerate(lines):
        if not ln:
            continue

        for key, pat in pats.items():
            if not pat.search(ln):
                continue

            # Case 1: page number at the end of this same line.
            tail_nums = re.findall(r"(\d{1,4})\s*$", ln)
            if tail_nums and ln.endswith(tail_nums[-1]):
                out[key] = int(tail_nums[-1])
                continue

            # Case 2: page number alone on the next non-empty line.
            j = i + 1
            while j < len(lines) and not lines[j]:
                j += 1
            if j < len(lines) and re.fullmatch(r"\d{1,4}", lines[j]):
                out[key] = int(lines[j])

    return out
266
+
267
+
268
def build_internal_to_pdf_map(all_texts: Sequence[str]) -> Dict[int, int]:
    """
    Build internal_page_number -> pdf_page_index from footer page numbers.
    The FIRST pdf page carrying a given internal number wins.
    """
    mapping: Dict[int, int] = {}
    for pdf_i, txt in enumerate(all_texts):
        internal = extract_footer_internal_page(txt or "")
        if internal is not None and internal not in mapping:
            mapping[internal] = pdf_i
    return mapping
279
+
280
+
281
def map_internal_to_pdf(internal: int, internal_to_pdf: Dict[int, int]) -> Optional[int]:
    """
    Map an internal page number to a pdf page index.

    Uses the direct mapping when available; otherwise extrapolates from the
    nearest known internal page, assuming mostly consecutive numbering.
    Returns None when the mapping is empty.
    """
    direct = internal_to_pdf.get(internal)
    if direct is not None:
        return direct

    if not internal_to_pdf:
        return None

    # Nearest known internal page; sorted so ties break toward the smaller key.
    nearest = min(sorted(internal_to_pdf), key=lambda k: abs(k - internal))
    return internal_to_pdf[nearest] + (internal - nearest)
298
+
299
+
300
+ # =========================
301
+ # Strong statement scoring (only used if TOC mapping fails)
302
+ # =========================
303
def _page_stats(text: str) -> Dict[str, float]:
    """
    Compute table-ness signals for a page: numeric token ratio, year mentions,
    currency markers, parenthesized negatives, and the 'integral part' footer.
    """
    t = text or ""
    low = t.lower()

    # numeric signals
    years = len(re.findall(r"\b20\d{2}\b", t))
    currency = len(re.findall(r"[$€£]|usd|inr|eur|gbp", low))
    paren_negatives = len(re.findall(r"\(\s*\d", t))  # "(123)" negatives
    has_integral = 1.0 if INTEGRAL_FOOTER in low else 0.0

    tokens = re.findall(r"[A-Za-z]+|\d+(?:,\d{3})*(?:\.\d+)?", t)
    if not tokens:
        return dict(num_ratio=0.0, year_count=float(years), currency=float(currency),
                    paren=float(paren_negatives), integral=has_integral)

    numeric = sum(1 for tok in tokens if re.fullmatch(r"\d+(?:,\d{3})*(?:\.\d+)?", tok))
    alpha = sum(1 for tok in tokens if re.fullmatch(r"[A-Za-z]+", tok))
    ratio = numeric / max(1.0, numeric + alpha)

    return dict(num_ratio=float(ratio), year_count=float(years), currency=float(currency),
                paren=float(paren_negatives), integral=has_integral)
324
+
325
+
326
def score_statement_page(text: str, stmt: str) -> Tuple[float, Dict[str, Any]]:
    """Score how likely *text* is the primary statement page for *stmt*.

    Fallback heuristic used only when TOC mapping fails. Combines a title
    match near the page top, the 'integral part' footer, signature line-item
    hits, and table-ness stats, minus penalties for note pages and TOC pages.

    Returns:
        (score, reasons) where reasons records which signals/penalties fired.
    """
    low = (text or "").lower()
    top = (text or "")[:1200]
    st = _page_stats(text)

    reasons = {"title": False, "sig_hits": [], "integral": False, "penalties": [], "stats": st}
    score = 0.0

    # Title near top is a MUST (or fuzzy)
    if detect_title(top, stmt):
        score += 60.0
        reasons["title"] = True
    else:
        # without title, heavily downrank (note tables can be very numeric)
        score -= 25.0
        reasons["penalties"].append("no_title(-25)")

    # Integral footer is very characteristic of primary statements (seen in your screenshots)
    if st["integral"] > 0:
        score += 18.0
        reasons["integral"] = True

    # Signature line items: require multiple hits; capped at 10 so one very
    # term-dense page can't dominate the ranking.
    hits = 0
    for term in SIG_TERMS.get(stmt, []):
        if term in low:
            hits += 1
            reasons["sig_hits"].append(term)
    score += min(hits, 10) * 6.0  # stronger weight

    # Table-ness: years + currency + negative brackets + numeric ratio
    score += st["num_ratio"] * 30.0
    score += min(st["year_count"], 10.0) * 1.5
    score += min(st["currency"], 10.0) * 2.0
    score += min(st["paren"], 10.0) * 1.0

    # Hard penalties for NOTE pages ("Note N ..." within the first ~220 chars)
    if NOTE_HEADING_RE.search((text or "")[:220]):
        score -= 60.0
        reasons["penalties"].append("note_heading(-60)")

    # If it looks like TOC index page, punish (dot leaders)
    if DOT_LEADER_RE.search(text or ""):
        score -= 30.0
        reasons["penalties"].append("toc_dotleaders(-30)")

    # Guardrails:
    # If title found but it doesn't look like a table at all, punish
    if reasons["title"] and st["num_ratio"] < 0.10 and st["year_count"] < 1:
        score -= 35.0
        reasons["penalties"].append("title_without_table(-35)")

    # Require at least 2 signature hits for high confidence
    if hits < 2:
        score -= 18.0
        reasons["penalties"].append("low_sig_hits(<2)(-18)")

    return score, reasons
384
+
385
+
386
+ # =========================
387
+ # Range inference from ordered statement starts
388
+ # =========================
389
def infer_ranges_from_starts(
    starts_pdf: Dict[str, int],
    page_count: int,
    ordered_keys: List[str],
) -> Dict[str, Tuple[int, int]]:
    """
    Turn ordered statement start pages (0-based pdf indices) into inclusive
    page ranges: each statement runs until the page before the next one
    starts; the last known statement gets a single-page range.
    Only TARGET statements are returned.
    """
    # keep only keys with a real integer start, ordered by page
    known = sorted(
        ((k, starts_pdf[k]) for k in ordered_keys if k in starts_pdf and isinstance(starts_pdf[k], int)),
        key=lambda kv: kv[1],
    )

    ranges: Dict[str, Tuple[int, int]] = {}
    for pos, (key, start) in enumerate(known):
        following = known[pos + 1][1] if pos + 1 < len(known) else None
        end = start if following is None else following - 1
        # clamp: never before the start page, never past the document
        end = min(max(end, start), page_count - 1)
        ranges[key] = (start, end)

    return {k: ranges[k] for k in TARGETS if k in ranges}
415
+
416
+
417
+ # =========================
418
+ # Public API
419
+ # =========================
420
def build_candidate_lists(
    pages: Sequence[Any],
    top_k: int = 25,
    debug: bool = True,
) -> Tuple[Dict[str, List[Tuple[int, float]]], Dict[str, Any]]:
    """
    Returns:
      candidates: {stmt: [(pdf_page_idx, score), ...]} for TARGETS only
      debug_info: contains toc/internal mapping and top explanations
    """
    all_texts = [_combined_text(p) for p in pages]
    page_count = len(all_texts)

    debug_info: Dict[str, Any] = {
        "item8_toc_page": None,
        "toc_internal": {},
        "internal_to_pdf_map_size": 0,
        "toc_pdf_targets_all": {},
        "heuristic_ranges_0_based": {},
        "top_scoring": {},
    }

    # ---- 1) TOC-based detection (most accurate on 10-K) ----
    toc_i = find_item8_toc_page(all_texts)
    if toc_i is not None:
        toc_text = all_texts[toc_i]
        # internal (printed) page numbers from the index, and the
        # internal -> pdf-index mapping built from page footers
        toc_internal = parse_statement_index_numbers(toc_text)
        internal_to_pdf = build_internal_to_pdf_map(all_texts)

        toc_pdf_all: Dict[str, int] = {}
        for k, internal_n in toc_internal.items():
            mapped = map_internal_to_pdf(internal_n, internal_to_pdf)
            # drop mappings that extrapolated outside the document
            if mapped is not None and 0 <= mapped < page_count:
                toc_pdf_all[k] = mapped

        debug_info.update({
            "item8_toc_page": toc_i,
            "toc_internal": toc_internal,
            "internal_to_pdf_map_size": len(internal_to_pdf),
            "toc_pdf_targets_all": toc_pdf_all,
        })

        # If we got our 3 targets, build direct ranges using the typical order:
        # Earnings -> Comprehensive Income -> Balance Sheet -> Equity -> Cash Flow -> Notes
        if all(k in toc_pdf_all for k in ["profit_and_loss", "balance_sheet", "cash_flow"]):
            ordered = ["profit_and_loss", "comprehensive_income", "balance_sheet", "equity", "cash_flow", "notes"]
            ranges = infer_ranges_from_starts(toc_pdf_all, page_count, ordered)
            debug_info["heuristic_ranges_0_based"] = ranges

            # Build candidates directly from these starts with huge confidence
            # (sentinel scores 999/950 mark TOC-derived pages; heuristic
            # scoring below is skipped entirely on this path)
            candidates = {k: [] for k in TARGETS}
            for k in TARGETS:
                start, end = ranges.get(k, (None, None))
                if start is None:
                    continue
                # prioritize start page; include end too
                candidates[k].append((start, 999.0))
                if end != start:
                    candidates[k].append((end, 950.0))
            return candidates, debug_info

    # ---- 2) Fallback: statement scoring over ALL pages ----
    candidates: Dict[str, List[Tuple[int, float]]] = {k: [] for k in TARGETS}
    reasons_store: Dict[str, Dict[int, Any]] = {k: {} for k in TARGETS}

    for i, p in enumerate(pages):
        idx = _page_index(p, i)
        txt = _combined_text(p)

        for stmt in TARGETS:
            sc, why = score_statement_page(txt, stmt)
            if sc > 0:
                candidates[stmt].append((idx, float(sc)))
                # store explanations only for interesting pages (titled or high score)
                if debug and (why["title"] or sc > 80):
                    reasons_store[stmt][idx] = why

    for stmt in TARGETS:
        candidates[stmt].sort(key=lambda x: x[1], reverse=True)
        # keep at least 8 candidates even if top_k is set lower
        candidates[stmt] = candidates[stmt][:max(8, top_k)]
        if debug:
            debug_info["top_scoring"][stmt] = [
                {"page": p, "score": round(s, 2), "why": reasons_store[stmt].get(p)}
                for p, s in candidates[stmt][:10]
            ]

    return candidates, debug_info
506
+
507
+
508
def select_pages_for_llm(
    candidates: Dict[str, List[Tuple[int, float]]],
    debug_info: Dict[str, Any],
    page_count: int,
    max_images: int,
) -> List[int]:
    """
    Choose which pdf page indices to render for the LLM.

    When TOC-derived ranges exist, send only those pages plus one neighbor on
    each side (highest precision). Otherwise send the top two candidates per
    target statement with their neighbors. Output is sorted and capped at
    *max_images* pages.
    """
    picked: List[int] = []
    seen: set = set()

    def add(page: int) -> None:
        # ignore out-of-range pages, duplicates, and anything past the cap
        if 0 <= page < page_count and page not in seen and len(picked) < max_images:
            seen.add(page)
            picked.append(page)

    # TOC ranges (best)
    ranges = debug_info.get("heuristic_ranges_0_based") or {}
    if ranges:
        for stmt in ["profit_and_loss", "balance_sheet", "cash_flow"]:
            if stmt not in ranges:
                continue
            start, end = ranges[stmt]
            for page in range(start, end + 1):
                add(page)
            add(start - 1)
            add(end + 1)
        return sorted(picked)

    # fallback: top candidates + neighbors
    for stmt in ["profit_and_loss", "balance_sheet", "cash_flow"]:
        for page, _score in candidates.get(stmt, [])[:2]:
            add(page)
            add(page - 1)
            add(page + 1)

    return sorted(picked)