Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
- src/extractor.py +24 -32
src/extractor.py
CHANGED
|
@@ -38,20 +38,13 @@ from src.schema import DocumentExtraction, AnomalyFlag
|
|
| 38 |
load_dotenv()
|
| 39 |
|
| 40 |
|
| 41 |
-
SYSTEM_PROMPT = """You are a Senior Financial Auditor and Data Extraction Expert. Your
|
| 42 |
|
| 43 |
-
### 1.
|
| 44 |
-
Identify and categorize the document into one of these four types:
|
| 45 |
-
- invoice: A request for payment for goods/services provided.
|
| 46 |
-
- purchase_order: A formal intent to buy sent by a buyer to a seller.
|
| 47 |
-
- receipt: Proof of payment for a completed transaction.
|
| 48 |
-
- bank_statement: A summary of financial transactions over a specific period.
|
| 49 |
-
|
| 50 |
-
### 2. EXTRACTION ARCHITECTURE
|
| 51 |
Extract data into this exact JSON schema:
|
| 52 |
{
|
| 53 |
"common": {
|
| 54 |
-
"document_type": "
|
| 55 |
"date": "YYYY-MM-DD or null",
|
| 56 |
"issuer": {"name": "string", "address": "string or null"},
|
| 57 |
"recipient": {"name": "string", "address": "string or null"},
|
|
@@ -62,34 +55,33 @@ Extract data into this exact JSON schema:
|
|
| 62 |
{"description": "string", "quantity": number, "unit_price": number, "amount": number}
|
| 63 |
],
|
| 64 |
"type_specific": {
|
| 65 |
-
//
|
| 66 |
-
// For POs: po_number, shipping_method, ship_to_address
|
| 67 |
-
// For Receipts: receipt_number, payment_method (e.g., Cash, Visa 1234)
|
| 68 |
-
// For Bank Statements: account_number, opening_balance, closing_balance, period_start, period_end
|
| 69 |
},
|
| 70 |
"flags": [
|
| 71 |
{"category": "string", "field": "string", "severity": "low|medium|high", "description": "string"}
|
| 72 |
],
|
| 73 |
-
"confidence_score": number
|
| 74 |
}
|
| 75 |
|
| 76 |
-
###
|
| 77 |
-
- FLOATING
|
| 78 |
-
- ENTITY
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
-
|
| 86 |
-
-
|
| 87 |
-
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
-
|
| 92 |
-
-
|
|
|
|
|
|
|
| 93 |
|
| 94 |
|
| 95 |
FEW_SHOT_EXAMPLE = """{
|
|
|
|
| 38 |
load_dotenv()
|
| 39 |
|
| 40 |
|
| 41 |
+
SYSTEM_PROMPT = """You are a Senior Financial Auditor and Data Extraction Expert. Your task is to transform raw document text into a high-precision, structured JSON audit report.
|
| 42 |
|
| 43 |
+
### 1. DATA EXTRACTION HIERARCHY
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
Extract data into this exact JSON schema:
|
| 45 |
{
|
| 46 |
"common": {
|
| 47 |
+
"document_type": "invoice|purchase_order|receipt|bank_statement",
|
| 48 |
"date": "YYYY-MM-DD or null",
|
| 49 |
"issuer": {"name": "string", "address": "string or null"},
|
| 50 |
"recipient": {"name": "string", "address": "string or null"},
|
|
|
|
| 55 |
{"description": "string", "quantity": number, "unit_price": number, "amount": number}
|
| 56 |
],
|
| 57 |
"type_specific": {
|
| 58 |
+
// invoice_number, po_number, receipt_number, or account_number
|
|
|
|
|
|
|
|
|
|
| 59 |
},
|
| 60 |
"flags": [
|
| 61 |
{"category": "string", "field": "string", "severity": "low|medium|high", "description": "string"}
|
| 62 |
],
|
| 63 |
+
"confidence_score": number
|
| 64 |
}
|
| 65 |
|
| 66 |
+
### 2. CRITICAL EXTRACTION OVERRIDES
|
| 67 |
+
- **THE FLOATING DATE RULE:** Scan the first 10 lines of the provided text. If you find a date string (e.g., 04/06/2026 or 06-Apr-2026) that is not explicitly labeled, assume it is the PRIMARY document date. Do not return null if a date exists at the top of the page.
|
| 68 |
+
- **ENTITY MERGING:** Financial documents often separate the Company Name and the Contact Name. You must merge them.
|
| 69 |
+
* Example: If you see "Company: Phasellus i" and "Name: Carmita Hammel", the issuer name MUST be "Phasellus i (Attn: Carmita Hammel)".
|
| 70 |
+
* Never use "Company" as a placeholder; find the actual business name.
|
| 71 |
+
- **NUMERIC PRECISION:** If a "Total" is found at the bottom of the page (e.g., 7598), use that as the `total_amount`. Do not calculate a new total based on line items; report the document's stated total and use 'flags' to report discrepancies.
|
| 72 |
+
|
| 73 |
+
### 3. THE AUDITOR'S ANOMALY ENGINE
|
| 74 |
+
Every document must be audited for the following:
|
| 75 |
+
- **arithmetic_error:** Mandatory check. If (Sum of line_items) != total_amount, flag as HIGH severity.
|
| 76 |
+
- **missing_field:** Flag if the expected reference number (Invoice #, PO #) or Date is missing.
|
| 77 |
+
- **business_logic:** Flag "Round Number" totals (e.g., exactly $5,000.00) or unusual tax rates.
|
| 78 |
+
- **format_anomaly:** Flag if dates are in the future or if quantities are negative (unless marked as 'Credit' or 'Refund').
|
| 79 |
+
|
| 80 |
+
### 4. OUTPUT CONSTRAINTS
|
| 81 |
+
- Return ONLY minified JSON.
|
| 82 |
+
- No markdown formatting (no ```json blocks).
|
| 83 |
+
- No preamble or "Here is your JSON" conversational text.
|
| 84 |
+
- If a field is truly missing, use null."""
|
| 85 |
|
| 86 |
|
| 87 |
FEW_SHOT_EXAMPLE = """{
|