Spaces:
Sleeping
Sleeping
Upload 9 files
Browse files
- main.py +3 -3
- requirements.txt +1 -1
- statement_candidates.py +15 -7
main.py
CHANGED
|
@@ -23,15 +23,15 @@ You are given:
|
|
| 23 |
|
| 24 |
Task:
|
| 25 |
Identify the PDF PAGE RANGES (start_page, end_page) for the THREE PRIMARY FINANCIAL STATEMENT TABLES ONLY:
|
| 26 |
-
- Balance Sheet (a.k.a. Statement of Financial Position / Consolidated Balance Sheet)
|
| 27 |
- Profit & Loss (a.k.a. Income Statement / Statements of Earnings / Statements of Operations)
|
| 28 |
- Cash Flow Statement (Statements of Cash Flows)
|
| 29 |
|
| 30 |
IMPORTANT RULES (STRICT):
|
| 31 |
-
- Only return ranges for the PRIMARY consolidated financial statements pages.
|
| 32 |
- Do NOT return ranges for note disclosures (e.g., derivatives, leases, fair value tables), MD&A, segment notes, or narrative discussion.
|
| 33 |
- A primary statement table page usually has:
|
| 34 |
-
(a) a clear statement title at the top (e.g., “Consolidated Balance Sheets”)
|
| 35 |
(b) many numeric columns (often multiple years)
|
| 36 |
(c) canonical line items like:
|
| 37 |
Balance sheet: “Total assets”, “Total liabilities”, “Total equity/stockholders’ equity”
|
|
|
|
| 23 |
|
| 24 |
Task:
|
| 25 |
Identify the PDF PAGE RANGES (start_page, end_page) for the THREE PRIMARY FINANCIAL STATEMENT TABLES ONLY:
|
| 26 |
+
- Balance Sheet (a.k.a. Statement of Financial Position / Consolidated Balance Sheet / Standalone Balance Sheet)
|
| 27 |
- Profit & Loss (a.k.a. Income Statement / Statements of Earnings / Statements of Operations)
|
| 28 |
- Cash Flow Statement (Statements of Cash Flows)
|
| 29 |
|
| 30 |
IMPORTANT RULES (STRICT):
|
| 31 |
+
- Only return ranges for the PRIMARY consolidated & standalone financial statements pages.
|
| 32 |
- Do NOT return ranges for note disclosures (e.g., derivatives, leases, fair value tables), MD&A, segment notes, or narrative discussion.
|
| 33 |
- A primary statement table page usually has:
|
| 34 |
+
(a) a clear statement title at the top (e.g., “Consolidated Balance Sheets”, "Standalone Balance Sheets")
|
| 35 |
(b) many numeric columns (often multiple years)
|
| 36 |
(c) canonical line items like:
|
| 37 |
Balance sheet: “Total assets”, “Total liabilities”, “Total equity/stockholders’ equity”
|
requirements.txt
CHANGED
|
@@ -5,4 +5,4 @@ pymupdf
|
|
| 5 |
pillow
|
| 6 |
requests
|
| 7 |
python-dotenv
|
| 8 |
-
pytesseract
|
|
|
|
| 5 |
pillow
|
| 6 |
requests
|
| 7 |
python-dotenv
|
| 8 |
+
pytesseract
|
statement_candidates.py
CHANGED
|
@@ -21,33 +21,41 @@ AUX = ["comprehensive_income", "equity", "notes"]
|
|
| 21 |
TITLE_VARIANTS: Dict[str, List[str]] = {
|
| 22 |
"balance_sheet": [
|
| 23 |
"Consolidated Balance Sheets",
|
|
|
|
| 24 |
"Balance Sheets",
|
| 25 |
"Statement of Financial Position",
|
| 26 |
],
|
| 27 |
"profit_and_loss": [
|
| 28 |
"Consolidated Statements of Earnings", # AbbVie screenshot
|
|
|
|
| 29 |
"Consolidated Statements of Operations",
|
|
|
|
| 30 |
"Consolidated Statements of Income",
|
|
|
|
| 31 |
"Income Statement",
|
| 32 |
"Statement of Profit and Loss",
|
| 33 |
],
|
| 34 |
"cash_flow": [
|
| 35 |
"Consolidated Statements of Cash Flows",
|
|
|
|
| 36 |
"Statement of Cash Flows",
|
| 37 |
"Cash Flow Statement",
|
| 38 |
],
|
| 39 |
# auxiliary
|
| 40 |
"comprehensive_income": [
|
| 41 |
"Consolidated Statements of Comprehensive Income",
|
|
|
|
| 42 |
"Statement of Comprehensive Income",
|
| 43 |
],
|
| 44 |
"equity": [
|
| 45 |
"Consolidated Statements of Equity",
|
|
|
|
| 46 |
"Statement of Stockholders' Equity",
|
| 47 |
"Statement of Shareholders' Equity",
|
| 48 |
],
|
| 49 |
"notes": [
|
| 50 |
"Notes to Consolidated Financial Statements",
|
|
|
|
| 51 |
"Notes to Financial Statements",
|
| 52 |
],
|
| 53 |
}
|
|
@@ -97,7 +105,7 @@ SIG_TERMS: Dict[str, List[str]] = {
|
|
| 97 |
"net change in cash",
|
| 98 |
],
|
| 99 |
# aux
|
| 100 |
-
"notes": ["note 1", "note 2", "notes to consolidated financial statements"],
|
| 101 |
}
|
| 102 |
|
| 103 |
NOTE_HEADING_RE = re.compile(r"^\s*note\s+\d+\b", re.IGNORECASE)
|
|
@@ -233,12 +241,12 @@ def parse_statement_index_numbers(toc_text: str) -> Dict[str, int]:
|
|
| 233 |
|
| 234 |
# compile quick patterns
|
| 235 |
pats = {
|
| 236 |
-
"profit_and_loss": re.compile(r"consolidated\s+statements?\s+of\s+(earnings|operations|income)", re.I),
|
| 237 |
-
"comprehensive_income": re.compile(r"consolidated\s+statements?\s+of\s+comprehensive\s+income", re.I),
|
| 238 |
-
"balance_sheet": re.compile(r"consolidated\s+balance\s+sheets?|statement\s+of\s+financial\s+position", re.I),
|
| 239 |
-
"equity": re.compile(r"consolidated\s+statements?\s+of\s+equity|stockholders[’']\s+equity|shareholders[’']\s+equity", re.I),
|
| 240 |
-
"cash_flow": re.compile(r"consolidated\s+statements?\s+of\s+cash\s+flows?", re.I),
|
| 241 |
-
"notes": re.compile(r"notes\s+to\s+consolidated\s+financial\s+statements", re.I),
|
| 242 |
}
|
| 243 |
|
| 244 |
for i, ln in enumerate(lines):
|
|
|
|
| 21 |
TITLE_VARIANTS: Dict[str, List[str]] = {
|
| 22 |
"balance_sheet": [
|
| 23 |
"Consolidated Balance Sheets",
|
| 24 |
+
"Standalone Balance Sheets",
|
| 25 |
"Balance Sheets",
|
| 26 |
"Statement of Financial Position",
|
| 27 |
],
|
| 28 |
"profit_and_loss": [
|
| 29 |
"Consolidated Statements of Earnings", # AbbVie screenshot
|
| 30 |
+
"Standalone Statements of Earnings",
|
| 31 |
"Consolidated Statements of Operations",
|
| 32 |
+
"Standalone Statements of Operations",
|
| 33 |
"Consolidated Statements of Income",
|
| 34 |
+
"Standalone Statements of Income",
|
| 35 |
"Income Statement",
|
| 36 |
"Statement of Profit and Loss",
|
| 37 |
],
|
| 38 |
"cash_flow": [
|
| 39 |
"Consolidated Statements of Cash Flows",
|
| 40 |
+
"Standalone Statements of Cash Flows",
|
| 41 |
"Statement of Cash Flows",
|
| 42 |
"Cash Flow Statement",
|
| 43 |
],
|
| 44 |
# auxiliary
|
| 45 |
"comprehensive_income": [
|
| 46 |
"Consolidated Statements of Comprehensive Income",
|
| 47 |
+
"Standalone Statements of Comprehensive Income",
|
| 48 |
"Statement of Comprehensive Income",
|
| 49 |
],
|
| 50 |
"equity": [
|
| 51 |
"Consolidated Statements of Equity",
|
| 52 |
+
"Standalone Statements of Equity",
|
| 53 |
"Statement of Stockholders' Equity",
|
| 54 |
"Statement of Shareholders' Equity",
|
| 55 |
],
|
| 56 |
"notes": [
|
| 57 |
"Notes to Consolidated Financial Statements",
|
| 58 |
+
"Notes to Standalone Financial Statements",
|
| 59 |
"Notes to Financial Statements",
|
| 60 |
],
|
| 61 |
}
|
|
|
|
| 105 |
"net change in cash",
|
| 106 |
],
|
| 107 |
# aux
|
| 108 |
+
"notes": ["note 1", "note 2", "notes to consolidated financial statements", "notes to standalone financial statements"],
|
| 109 |
}
|
| 110 |
|
| 111 |
NOTE_HEADING_RE = re.compile(r"^\s*note\s+\d+\b", re.IGNORECASE)
|
|
|
|
| 241 |
|
| 242 |
# compile quick patterns
|
| 243 |
pats = {
|
| 244 |
+
"profit_and_loss": re.compile(r"(consolidated|standalone)\s+statements?\s+of\s+(earnings|operations|income)", re.I),
|
| 245 |
+
"comprehensive_income": re.compile(r"(consolidated|standalone)\s+statements?\s+of\s+comprehensive\s+income", re.I),
|
| 246 |
+
"balance_sheet": re.compile(r"(consolidated|standalone)\s+balance\s+sheets?|statement\s+of\s+financial\s+position", re.I),
|
| 247 |
+
"equity": re.compile(r"(consolidated|standalone)\s+statements?\s+of\s+equity|stockholders[’']\s+equity|shareholders[’']\s+equity", re.I),
|
| 248 |
+
"cash_flow": re.compile(r"(consolidated|standalone)\s+statements?\s+of\s+cash\s+flows?", re.I),
|
| 249 |
+
"notes": re.compile(r"notes\s+to\s+(consolidated|standalone)\s+financial\s+statements", re.I),
|
| 250 |
}
|
| 251 |
|
| 252 |
for i, ln in enumerate(lines):
|