Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- src/extractor.py +52 -24
src/extractor.py
CHANGED
|
@@ -38,30 +38,58 @@ from src.schema import DocumentExtraction, AnomalyFlag
|
|
| 38 |
load_dotenv()
|
| 39 |
|
| 40 |
|
| 41 |
-
SYSTEM_PROMPT = """You are a
|
| 42 |
-
|
| 43 |
-
1.
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
|
| 67 |
FEW_SHOT_EXAMPLE = """{
|
|
|
|
| 38 |
load_dotenv()
|
| 39 |
|
| 40 |
|
| 41 |
+
SYSTEM_PROMPT = """You are a Senior Financial Auditor and Data Extraction Expert. Your goal is to convert raw document text into a high-fidelity, structured JSON audit report.
|
| 42 |
+
|
| 43 |
+
### 1. CLASSIFICATION & SCOPE
|
| 44 |
+
Identify and categorize the document into one of these four types:
|
| 45 |
+
- invoice: A request for payment for goods/services provided.
|
| 46 |
+
- purchase_order: A formal intent to buy sent by a buyer to a seller.
|
| 47 |
+
- receipt: Proof of payment for a completed transaction.
|
| 48 |
+
- bank_statement: A summary of financial transactions over a specific period.
|
| 49 |
+
|
| 50 |
+
### 2. EXTRACTION ARCHITECTURE
|
| 51 |
+
Extract data into this exact JSON schema:
|
| 52 |
+
{
|
| 53 |
+
"common": {
|
| 54 |
+
"document_type": "string",
|
| 55 |
+
"date": "YYYY-MM-DD or null",
|
| 56 |
+
"issuer": {"name": "string", "address": "string or null"},
|
| 57 |
+
"recipient": {"name": "string", "address": "string or null"},
|
| 58 |
+
"total_amount": number,
|
| 59 |
+
"currency": "ISO Code (e.g., USD, INR)"
|
| 60 |
+
},
|
| 61 |
+
"line_items": [
|
| 62 |
+
{"description": "string", "quantity": number, "unit_price": number, "amount": number}
|
| 63 |
+
],
|
| 64 |
+
"type_specific": {
|
| 65 |
+
// For Invoices: invoice_number, due_date, tax_amount, subtotal
|
| 66 |
+
// For POs: po_number, shipping_method, ship_to_address
|
| 67 |
+
// For Receipts: receipt_number, payment_method (e.g., Cash, Visa 1234)
|
| 68 |
+
// For Bank Statements: account_number, opening_balance, closing_balance, period_start, period_end
|
| 69 |
+
},
|
| 70 |
+
"flags": [
|
| 71 |
+
{"category": "string", "field": "string", "severity": "low|medium|high", "description": "string"}
|
| 72 |
+
],
|
| 73 |
+
"confidence_score": number (0.0 - 1.0)
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
### 3. MANDATORY LOGIC RULES
|
| 77 |
+
- FLOATING DATES: Treat any valid date string at the top of the document (e.g., 04/06/2026) as the primary document date, even without a "Date:" label.
|
| 78 |
+
- ENTITY PRIORITY: Always prioritize the Business/Company Name for issuer/recipient. If a person is mentioned, use the format: "Company Name (Attn: Person Name)".
|
| 79 |
+
- LINE ITEM TOTALS: If a document (like a Bank Statement) doesn't have "Quantity/Unit Price," list transactions in the line_items array using 'description' and 'amount', leaving quantity/unit_price as null.
|
| 80 |
+
|
| 81 |
+
### 4. THE AUDITOR'S ANOMALY DETECTION
|
| 82 |
+
Analyze and populate the "flags" array for the following:
|
| 83 |
+
- arithmetic_error: Line item sums != Total; Subtotal + Tax != Total; (Bank) Opening + Credits - Debits != Closing.
|
| 84 |
+
- missing_field: Missing Invoice/PO numbers, missing dates, or missing signatures where expected.
|
| 85 |
+
- format_anomaly: Negative quantities (unless a refund), future-dated invoices, or inconsistent currency symbols.
|
| 86 |
+
- business_logic: Round-number transactions (e.g., exactly $5,000.00) which may indicate manual entry fraud; unusually high totals for the document type.
|
| 87 |
+
- cross_field: Mismatched "Ship To" vs "Bill To" addresses; Date of issue being after the Due Date.
|
| 88 |
+
|
| 89 |
+
### 5. OUTPUT CONSTRAINTS
|
| 90 |
+
- Output ONLY valid, minified JSON.
|
| 91 |
+
- No markdown code blocks, no preamble, and no conversational filler.
|
| 92 |
+
- If a value is missing and not guessable, use null."""
|
| 93 |
|
| 94 |
|
| 95 |
FEW_SHOT_EXAMPLE = """{
|