Vaibuzzz committed on
Commit
faa05c3
·
verified ·
1 Parent(s): 431c55b

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. src/extractor.py +21 -17
src/extractor.py CHANGED
@@ -72,12 +72,15 @@ Extract data into this exact JSON schema:
72
 
73
  ### 3. THE AUDITOR'S ANOMALY ENGINE
74
  Every document must be audited for the following:
75
- - **arithmetic_error:** Mandatory check. If (Sum of line_items) != total_amount, flag as HIGH severity.
76
  - **missing_field:** Flag if the expected reference number (Invoice #, PO #) or Date is missing.
77
  - **business_logic:** Flag "Round Number" totals (e.g., exactly $5,000.00) or unusual tax rates.
78
  - **format_anomaly:** Flag if dates are in the future or if quantities are negative (unless marked as 'Credit' or 'Refund').
79
 
80
- ### 4. OUTPUT CONSTRAINTS
 
 
 
81
  - Return ONLY minified JSON.
82
  - No markdown formatting (no ```json blocks).
83
  - No preamble or "Here is your JSON" conversational text.
@@ -332,22 +335,23 @@ class FinancialDocExtractor:
332
 
333
  # Allow small floating-point variance, but fail on >1.0 mismatch
334
  if abs(computed_total - validated.common.total_amount) > 1.0:
335
- if attempt < self.max_retries:
336
- print(
337
- f" [Auditor] Math mismatch detected "
338
- f"(Sum: {computed_total} vs Total: {validated.common.total_amount}). "
339
- f"Triggering self-correction!"
340
- )
341
- text = (
342
- text
343
- + f"\n\n[SYSTEM AUDIT: In your previous attempt, the sum of your "
344
- f"line items ({computed_total:.2f}) DID NOT MATCH the total_amount "
345
- f"({validated.common.total_amount:.2f}). You likely missed a line "
346
- f"item or read a price incorrectly. Carefully re-read the text "
347
- f"above and output the correct JSON.]"
 
 
348
  )
349
- result.error = f"Attempt {attempt}: Math validation failed. Triggered self-correction."
350
- continue
351
 
352
  result.validated = validated
353
  result.is_schema_compliant = True
 
72
 
73
  ### 3. THE AUDITOR'S ANOMALY ENGINE
74
  Every document must be audited for the following:
75
+ - **arithmetic_error:** Mandatory check. If (Sum of line_items) != total_amount, flag as HIGH severity. DO NOT invent or modify line items to force the math to work. Keep the original extracted data exactly as it appears and simply set this flag.
76
  - **missing_field:** Flag if the expected reference number (Invoice #, PO #) or Date is missing.
77
  - **business_logic:** Flag "Round Number" totals (e.g., exactly $5,000.00) or unusual tax rates.
78
  - **format_anomaly:** Flag if dates are in the future or if quantities are negative (unless marked as 'Credit' or 'Refund').
79
 
80
+ ### 4. DATE PERSISTENCE
81
+ - Never drop or forget the `common.date` field during the extraction. If you found it, keep it.
82
+
83
+ ### 5. OUTPUT CONSTRAINTS
84
  - Return ONLY minified JSON.
85
  - No markdown formatting (no ```json blocks).
86
  - No preamble or "Here is your JSON" conversational text.
 
335
 
336
  # Allow small floating-point variance, but fail on >1.0 mismatch
337
  if abs(computed_total - validated.common.total_amount) > 1.0:
338
+ print(
339
+ f" [Auditor] Math mismatch detected "
340
+ f"(Sum: {computed_total} vs Total: {validated.common.total_amount}). "
341
+ f"Preserving first-pass data and injecting arithmetic_error flag."
342
+ )
343
+ # We deliberately do NOT trigger self-correction (retry) here to
344
+ # protect first-pass accuracy on dates and entities.
345
+
346
+ # Programmatically ensure the flag is present if the LLM missed it
347
+ if not any(f.category == "arithmetic_error" for f in validated.flags):
348
+ error_flag = AnomalyFlag(
349
+ category="arithmetic_error",
350
+ field="total_amount",
351
+ severity="high",
352
+ description=f"Calculated line item sum ({computed_total:.2f}) does not match stated total_amount ({validated.common.total_amount:.2f})."
353
  )
354
+ validated.flags.append(error_flag)
 
355
 
356
  result.validated = validated
357
  result.is_schema_compliant = True