FridayCodehhr committed on
Commit
689d59b
·
verified ·
1 Parent(s): 29d4f11

Upload 9 files

Browse files
Files changed (3) hide show
  1. main.py +3 -3
  2. requirements.txt +1 -1
  3. statement_candidates.py +15 -7
main.py CHANGED
@@ -23,15 +23,15 @@ You are given:
23
 
24
  Task:
25
  Identify the PDF PAGE RANGES (start_page, end_page) for the THREE PRIMARY FINANCIAL STATEMENT TABLES ONLY:
26
- - Balance Sheet (a.k.a. Statement of Financial Position / Consolidated Balance Sheets)
27
  - Profit & Loss (a.k.a. Income Statement / Statements of Earnings / Statements of Operations)
28
  - Cash Flow Statement (Statements of Cash Flows)
29
 
30
  IMPORTANT RULES (STRICT):
31
- - Only return ranges for the PRIMARY consolidated financial statements pages.
32
  - Do NOT return ranges for note disclosures (e.g., derivatives, leases, fair value tables), MD&A, segment notes, or narrative discussion.
33
  - A primary statement table page usually has:
34
- (a) a clear statement title at the top (e.g., “Consolidated Balance Sheets)
35
  (b) many numeric columns (often multiple years)
36
  (c) canonical line items like:
37
  Balance sheet: “Total assets”, “Total liabilities”, “Total equity/stockholders’ equity”
 
23
 
24
  Task:
25
  Identify the PDF PAGE RANGES (start_page, end_page) for the THREE PRIMARY FINANCIAL STATEMENT TABLES ONLY:
26
+ - Balance Sheet (a.k.a. Statement of Financial Position / Consolidated Balance Sheet / Standalone Balance Sheet)
27
  - Profit & Loss (a.k.a. Income Statement / Statements of Earnings / Statements of Operations)
28
  - Cash Flow Statement (Statements of Cash Flows)
29
 
30
  IMPORTANT RULES (STRICT):
31
+ - Only return ranges for the PRIMARY consolidated or standalone financial statement pages.
32
  - Do NOT return ranges for note disclosures (e.g., derivatives, leases, fair value tables), MD&A, segment notes, or narrative discussion.
33
  - A primary statement table page usually has:
34
+ (a) a clear statement title at the top (e.g., “Consolidated Balance Sheets”, “Standalone Balance Sheets”)
35
  (b) many numeric columns (often multiple years)
36
  (c) canonical line items like:
37
  Balance sheet: “Total assets”, “Total liabilities”, “Total equity/stockholders’ equity”
requirements.txt CHANGED
@@ -5,4 +5,4 @@ pymupdf
5
  pillow
6
  requests
7
  python-dotenv
8
- pytesseract
 
5
  pillow
6
  requests
7
  python-dotenv
8
+ pytesseract
statement_candidates.py CHANGED
@@ -21,33 +21,41 @@ AUX = ["comprehensive_income", "equity", "notes"]
21
  TITLE_VARIANTS: Dict[str, List[str]] = {
22
  "balance_sheet": [
23
  "Consolidated Balance Sheets",
 
24
  "Balance Sheets",
25
  "Statement of Financial Position",
26
  ],
27
  "profit_and_loss": [
28
  "Consolidated Statements of Earnings", # AbbVie screenshot
 
29
  "Consolidated Statements of Operations",
 
30
  "Consolidated Statements of Income",
 
31
  "Income Statement",
32
  "Statement of Profit and Loss",
33
  ],
34
  "cash_flow": [
35
  "Consolidated Statements of Cash Flows",
 
36
  "Statement of Cash Flows",
37
  "Cash Flow Statement",
38
  ],
39
  # auxiliary
40
  "comprehensive_income": [
41
  "Consolidated Statements of Comprehensive Income",
 
42
  "Statement of Comprehensive Income",
43
  ],
44
  "equity": [
45
  "Consolidated Statements of Equity",
 
46
  "Statement of Stockholders' Equity",
47
  "Statement of Shareholders' Equity",
48
  ],
49
  "notes": [
50
  "Notes to Consolidated Financial Statements",
 
51
  "Notes to Financial Statements",
52
  ],
53
  }
@@ -97,7 +105,7 @@ SIG_TERMS: Dict[str, List[str]] = {
97
  "net change in cash",
98
  ],
99
  # aux
100
- "notes": ["note 1", "note 2", "notes to consolidated financial statements"],
101
  }
102
 
103
  NOTE_HEADING_RE = re.compile(r"^\s*note\s+\d+\b", re.IGNORECASE)
@@ -233,12 +241,12 @@ def parse_statement_index_numbers(toc_text: str) -> Dict[str, int]:
233
 
234
  # compile quick patterns
235
  pats = {
236
- "profit_and_loss": re.compile(r"consolidated\s+statements?\s+of\s+(earnings|operations|income)", re.I),
237
- "comprehensive_income": re.compile(r"consolidated\s+statements?\s+of\s+comprehensive\s+income", re.I),
238
- "balance_sheet": re.compile(r"consolidated\s+balance\s+sheets?|statement\s+of\s+financial\s+position", re.I),
239
- "equity": re.compile(r"consolidated\s+statements?\s+of\s+equity|stockholders[’']\s+equity|shareholders[’']\s+equity", re.I),
240
- "cash_flow": re.compile(r"consolidated\s+statements?\s+of\s+cash\s+flows?", re.I),
241
- "notes": re.compile(r"notes\s+to\s+consolidated\s+financial\s+statements", re.I),
242
  }
243
 
244
  for i, ln in enumerate(lines):
 
21
  TITLE_VARIANTS: Dict[str, List[str]] = {
22
  "balance_sheet": [
23
  "Consolidated Balance Sheets",
24
+ "Standalone Balance Sheets",
25
  "Balance Sheets",
26
  "Statement of Financial Position",
27
  ],
28
  "profit_and_loss": [
29
  "Consolidated Statements of Earnings", # AbbVie screenshot
30
+ "Standalone Statements of Earnings",
31
  "Consolidated Statements of Operations",
32
+ "Standalone Statements of Operations",
33
  "Consolidated Statements of Income",
34
+ "Standalone Statements of Income",
35
  "Income Statement",
36
  "Statement of Profit and Loss",
37
  ],
38
  "cash_flow": [
39
  "Consolidated Statements of Cash Flows",
40
+ "Standalone Statements of Cash Flows",
41
  "Statement of Cash Flows",
42
  "Cash Flow Statement",
43
  ],
44
  # auxiliary
45
  "comprehensive_income": [
46
  "Consolidated Statements of Comprehensive Income",
47
+ "Standalone Statements of Comprehensive Income",
48
  "Statement of Comprehensive Income",
49
  ],
50
  "equity": [
51
  "Consolidated Statements of Equity",
52
+ "Standalone Statements of Equity",
53
  "Statement of Stockholders' Equity",
54
  "Statement of Shareholders' Equity",
55
  ],
56
  "notes": [
57
  "Notes to Consolidated Financial Statements",
58
+ "Notes to Standalone Financial Statements",
59
  "Notes to Financial Statements",
60
  ],
61
  }
 
105
  "net change in cash",
106
  ],
107
  # aux
108
+ "notes": ["note 1", "note 2", "notes to consolidated financial statements", "notes to standalone financial statements"],
109
  }
110
 
111
  NOTE_HEADING_RE = re.compile(r"^\s*note\s+\d+\b", re.IGNORECASE)
 
241
 
242
  # compile quick patterns
243
  pats = {
244
+ "profit_and_loss": re.compile(r"(consolidated|standalone)\s+statements?\s+of\s+(earnings|operations|income)", re.I),
245
+ "comprehensive_income": re.compile(r"(consolidated|standalone)\s+statements?\s+of\s+comprehensive\s+income", re.I),
246
+ "balance_sheet": re.compile(r"(consolidated|standalone)\s+balance\s+sheets?|statement\s+of\s+financial\s+position", re.I),
247
+ "equity": re.compile(r"(consolidated|standalone)\s+statements?\s+of\s+equity|stockholders[’']\s+equity|shareholders[’']\s+equity", re.I),
248
+ "cash_flow": re.compile(r"(consolidated|standalone)\s+statements?\s+of\s+cash\s+flows?", re.I),
249
+ "notes": re.compile(r"notes\s+to\s+(consolidated|standalone)\s+financial\s+statements", re.I),
250
  }
251
 
252
  for i, ln in enumerate(lines):