Spaces:
Sleeping
Sleeping
added SOP
Browse files- prompts.py +31 -31
prompts.py
CHANGED
|
@@ -1,46 +1,46 @@
|
|
| 1 |
-
|
| 2 |
def get_ocr_extraction_prompt(raw_text: str) -> str:
|
| 3 |
"""
|
| 4 |
-
|
|
|
|
| 5 |
"""
|
|
|
|
| 6 |
return f"""<|im_start|>system
|
| 7 |
-
You are
|
| 8 |
-
Your task is to extract
|
| 9 |
-
|
| 10 |
-
### STANDARD OPERATING PROCEDURE (SOP):
|
| 11 |
-
1. **Sanitization**: Ignore page headers, footers, and marketing messages.
|
| 12 |
-
2. **Dates**: Convert all dates to YYYY-MM-DD format.
|
| 13 |
-
3. **Null Handling**: If a field is not found, set it to null. DO NOT invent data.
|
| 14 |
-
4. **Line Items**: Extract the table of goods/services accurately.
|
| 15 |
-
5. **Output**: Return ONLY valid JSON. No Markdown. No commentary.
|
| 16 |
-
|
| 17 |
-
### ONE-SHOT EXAMPLE:
|
| 18 |
-
**Input OCR**:
|
| 19 |
-
"CITY OF AUBURN... Account Number: 076248-000... Due Date: 01/07/25...
|
| 20 |
-
Water Total $649.69... Sewer Total $1,333.45... Total New Charges $2,363.39"
|
| 21 |
|
| 22 |
-
|
|
|
|
| 23 |
{{
|
| 24 |
-
"
|
| 25 |
-
"
|
| 26 |
-
"
|
| 27 |
-
"due_date": "
|
| 28 |
-
"
|
| 29 |
-
"
|
| 30 |
"line_items": [
|
| 31 |
-
{{
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
| 34 |
]
|
| 35 |
}}
|
| 36 |
-
<|im_end|>
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
<|im_start|>user
|
| 39 |
-
###
|
| 40 |
-
|
| 41 |
|
| 42 |
-
|
| 43 |
<|im_end|>
|
| 44 |
<|im_start|>assistant
|
| 45 |
{{
|
| 46 |
-
""" # Pre-
|
|
|
|
|
|
|
| 1 |
def get_ocr_extraction_prompt(raw_text: str, max_chars: int = 4000) -> str:
    """Build the ChatML prompt that asks the model for invoice JSON.

    The prompt contains a system turn (role, target schema, and SOP), a user
    turn carrying an excerpt of the OCR text, and an assistant turn that is
    pre-filled with an opening brace.

    Args:
        raw_text: OCR output of the scanned invoice. ``None`` is treated as
            empty text.
        max_chars: Maximum number of OCR characters embedded in the prompt.
            Defaults to 4000.

    Returns:
        The complete prompt string, ending with the pre-filled ``{`` of the
        assistant turn followed by a newline.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    if max_chars < 0:
        # A negative slice bound would silently drop the tail of the text
        # instead of capping its length.
        raise ValueError("max_chars must be non-negative")

    ocr_text = "" if raw_text is None else raw_text

    # OCR text is untrusted. Break every "<|" so a scanned document cannot
    # smuggle in ChatML control markers such as <|im_end|> / <|im_start|> and
    # close the user turn early. Defang before truncating so the excerpt never
    # exceeds max_chars and never ends up containing a marker.
    ocr_excerpt = ocr_text.replace("<|", "< |")[:max_chars]

    # Pre-filling the brace forces the model into JSON generation immediately.
    return f"""<|im_start|>system
You are an intelligent Invoice Processing Agent responsible for Data Entry.
Your task is to extract data from the provided OCR text and format it strictly according to the **Target API Schema** below.

### TARGET API SCHEMA (Zoho Books):
Output a single JSON object with these exact keys:
{{
    "customer_name": "string (The Vendor or Bill To entity)",
    "invoice_number": "string",
    "date": "string (YYYY-MM-DD)",
    "due_date": "string (YYYY-MM-DD)",
    "reference_number": "string (PO Number if available, else null)",
    "total": float (The grand total amount),
    "line_items": [
        {{
            "name": "string (Description of goods/service)",
            "quantity": float (Default to 1.0 if missing),
            "rate": float (Unit price),
            "item_total": float (Line amount)
        }}
    ]
}}

### STANDARD OPERATING PROCEDURE (SOP):
1. **Analyze**: Read the OCR text below, accounting for potential noise or scanning errors.
2. **Normalize Dates**: Convert all dates (e.g., 'Jan 15, 2024') to ISO format (2024-01-15).
3. **Table Extraction**: Identify the line items table. If the OCR layout is messy, use the price and quantity columns to align rows.
4. **Validation**: Ensure 'item_total' equals 'quantity' * 'rate'.
5. **Output**: Return ONLY the JSON object. Do not include markdown formatting (```json) or conversational text.

<|im_end|>
<|im_start|>user
### INPUT OCR TEXT:
{ocr_excerpt}

### RESPONSE (Valid JSON):
<|im_end|>
<|im_start|>assistant
{{
"""
|