vachaspathi committed on
Commit
1bd3e46
·
verified ·
1 Parent(s): bc51b20

Update prompts.py

Browse files
Files changed (1) hide show
  1. prompts.py +49 -130
prompts.py CHANGED
@@ -1,130 +1,60 @@
1
  # prompts.py
2
  # Qwen-2.5 Compatible Prompts
3
- # Updated to follow "Document Processing SOP for Zoho Invoice Integration"
4
- # - explicit JSON markers <<<JSON>>> / <<<END_JSON>>>
5
- # - multi-page support, detailed schema, validation rules
6
- # - agent prompt implements sequencing and strict tool-call JSON output
7
 
8
  from typing import Optional
9
 
10
  def get_ocr_extraction_prompt(raw_text: str, page_count: int = 1) -> str:
11
  """
12
- Build a system prompt for OCR+NLP agent to:
13
- 1) Classify document type
14
- 2) Extract fields into the EXACT JSON schema
15
- 3) Validate totals/date formats
16
- 4) Output ONLY the JSON object between <<<JSON>>> and <<<END_JSON>>>
17
- raw_text will be injected (truncated to first 3000 chars by callers if needed).
18
  """
19
  schema = r'''
20
  Top-level schema (use null for unknown fields):
21
  {
22
- "document_type": "", // Invoice | Credit Note | Debit Note | Delivery Note | Receipt | Unknown
23
- "document_id": "", // invoice_number / creditnote_number if present
24
- "invoice_date": "", // ISO format YYYY-MM-DD or null
25
- "due_date": "", // ISO format YYYY-MM-DD or null
26
- "currency": "", // e.g. INR, USD or null
27
- "totals": {
28
- "sub_total": null,
29
- "tax_total": null,
30
- "round_off": null,
31
- "grand_total": null
32
- },
33
- "seller": {
34
- "company": null,
35
- "address": null,
36
- "city": null,
37
- "state": null,
38
- "zip": null,
39
- "country": null,
40
- "gstin": null,
41
- "pan": null,
42
- "bank_details": null
43
- },
44
- "buyer": {
45
- "contact_name": null,
46
- "company_name": null,
47
- "billing_address": null,
48
- "shipping_address": null,
49
- "email": null,
50
- "phone": null,
51
- "gstin": null,
52
- "pan": null
53
- },
54
- "line_items": [
55
- {
56
- "name": null,
57
- "description": null,
58
- "hsn_or_sac": null,
59
- "sku": null,
60
- "quantity": null,
61
- "unit": null,
62
- "rate": null,
63
- "amount": null,
64
- "taxes": [ { "type": null, "rate": null, "amount": null, "tax_id": null } ]
65
- }
66
- ],
67
- "tax_breakdown": [
68
- { "tax_type": null, "cgst": null, "sgst": null, "igst": null, "cess": null }
69
- ],
70
- "references": {
71
- "reference_invoice_number": null,
72
- "po_number": null,
73
- "delivery_challan": null
74
- },
75
- "payment_terms": null,
76
- "notes": null,
77
- "qr_codes": [ { "type": null, "value": null } ],
78
- "raw_text_sample": null, // up to first 3000 chars or null
79
- "validation": {
80
- "amounts_balanced": null, // true | false | null
81
- "missing_critical_fields": [] // list of field-path strings
82
- }
83
  }
84
  '''
85
  return f"""<|im_start|>system
86
- You are an invoice & document data extraction assistant. Follow these stages precisely.
87
 
88
- IMPORTANT OUTPUT RULES (READ CAREFULLY):
89
- - Produce ONE JSON object only and NOTHING else.
90
- - Wrap the JSON between the exact markers (no extra whitespace around markers):
91
  <<<JSON>>>
92
  {{ ... }}
93
  <<<END_JSON>>>
94
- - Do NOT include explanation, commentary, apologies, or any text outside the markers.
95
- - Do NOT use single quotes for JSON keys/strings. Use double quotes.
96
- - Do NOT include trailing commas.
97
- - Dates: ISO YYYY-MM-DD or null.
98
- - Numeric fields: numbers only (no currency symbols). Use null if unknown.
99
- - If multi-page, page_count = {page_count}. Merge tables across pages.
100
 
101
- STAGE A — CLASSIFICATION
102
- 1) Determine document_type: Invoice | Credit Note | Debit Note | Delivery Note | Receipt | Unknown.
103
- 2) Prefer more specific types if conflicting labels exist (e.g. "Credit Note" over generic "Invoice").
104
-
105
- STAGE B — EXTRACTION & SCHEMA
106
- 3) Extract all fields into the EXACT schema below. Use null for missing scalar fields and [] / {{}} for missing arrays/objects.
107
  {schema}
108
 
109
- 4) raw_text_sample: include up to the first 3000 chars of the document text (or null if too long).
110
-
111
- STAGE C VALIDATION RULES
112
- 5) Normalize dates to YYYY-MM-DD. If unparseable, set null and add the field path to validation.missing_critical_fields.
113
- 6) Amounts: convert numeric text to numbers (remove commas, currency symbols). If conversion fails, set null and add to missing_critical_fields.
114
- 7) Totals validation: set validation.amounts_balanced = true only if:
115
- sum(line_items.amount) + totals.tax_total ± totals.round_off == totals.grand_total
116
- Allow tolerance of 0.5 units for rounding. Else set false.
117
- 8) If any critical fields are missing (invoice_number/document_id, invoice_date, grand_total, buyer.contact_name), ensure they appear in validation.missing_critical_fields.
118
-
119
- STAGE D — MULTI-PAGE & TABLE HANDLING
120
- 9) For multi-page documents, merge line_items tables across pages and preserve ordering. If a table header repeats, use headers to align columns and combine rows.
121
 
122
- STAGE E — EDGE-CASES & FALLBACKS
123
- 10) If there are nested multiple documents in the same file (e.g., Invoice + Delivery Note), extract the primary document first and include references to others in references.* fields.
124
- 11) Do NOT make external API calls. Only produce the JSON block.
125
 
126
- STAGE F OUTPUT
127
- 12) Output ONLY the final JSON between <<<JSON>>> and <<<END_JSON>>> markers with EXACTLY the schema keys present.
128
  <|im_end|>
129
  <|im_start|>user
130
  Input Text (first 3000 chars):
@@ -135,34 +65,25 @@ Input Text (first 3000 chars):
135
 
136
  def get_agent_prompt(history_text: str, user_message: str) -> str:
137
  """
138
- Agent prompt used by the orchestrator LLM.
139
- - If asking to persist to Zoho, must return a strict tool-call JSON object only.
140
- - Otherwise, summarize or ask for confirmation.
141
  """
142
  return f"""<|im_start|>system
143
  You are the Zoho CRM / Zoho Invoice Orchestrator Assistant.
144
 
145
- AVAILABLE TOOLS (call format must be EXACT JSON returned by you):
146
- 1) create_contact(contact_json)
147
- 2) create_item(item_json)
148
- 3) create_invoice(invoice_json)
149
- 4) create_creditnote(creditnote_json)
150
 
151
  MANDATES:
152
- - Only CALL a tool if the USER explicitly asked to persist (words like: "save", "create", "push", "upload", "persist" or "send to Zoho").
153
- - When you CALL a tool, output ONLY a JSON object with fields:
154
  {{ "tool": "<tool_name>", "args": {{ ... }} }}
155
- and nothing else (no explanation).
156
- - If you are NOT calling a tool, produce a human-friendly summary or validation checklist (do NOT output tool JSON).
157
-
158
- SEQUENCING RULE (when creating):
159
- - create_contact -> create_item(s) -> create_invoice/create_creditnote
160
- - Before create_contact, attempt deduplication by gstin or email using HISTORY.
161
- - If validation.amounts_balanced is false or missing critical fields exist, DO NOT call any tool; instead return a JSON-like review request (but human visible).
162
-
163
- Zoho invoice keys guidance:
164
- - For invoices, prefer: customer_id (or contact email), date (YYYY-MM-DD), due_date, line_items (each: item_id or name, rate, quantity, tax_id), currency, custom_fields
165
-
166
  <|im_end|>
167
  <|im_start|>user
168
  HISTORY:
@@ -174,12 +95,10 @@ CURRENT REQUEST:
174
  <|im_start|>assistant
175
  """
176
 
177
- # Convenience small prompt for quick sanity checks
178
  def get_quick_extraction_check_prompt(summary: str) -> str:
179
- return f"""You are a JSON validator. Check the JSON below for:
180
- - presence of required fields: document_id, invoice_date, totals.grand_total, buyer.contact_name
181
- - numeric fields parseable into numbers
182
- Return only a JSON object with keys: missing_fields (list), parse_warnings (list), ok (true|false).
183
  Input:
184
  {summary}
185
  """
 
1
  # prompts.py
2
  # Qwen-2.5 Compatible Prompts
3
+ # Strict SOP for "Document Processing SOP for Zoho Invoice Integration"
4
+ # Outputs MUST be wrapped between <<<JSON>>> and <<<END_JSON>>> markers.
 
 
5
 
6
  from typing import Optional
7
 
8
  def get_ocr_extraction_prompt(raw_text: str, page_count: int = 1) -> str:
9
  """
10
+ Builds a strict LLM prompt to:
11
+ - classify the doc
12
+ - extract fields into a fixed JSON schema
13
+ - validate totals and dates
14
+ - output ONLY the JSON between <<<JSON>>> and <<<END_JSON>>>
15
+ raw_text will be truncated by caller if long.
16
  """
17
  schema = r'''
18
  Top-level schema (use null for unknown fields):
19
  {
20
+ "document_type": "", "document_id": "", "invoice_date": "", "due_date": "", "currency": "",
21
+ "totals": { "sub_total": null, "tax_total": null, "round_off": null, "grand_total": null },
22
+ "seller": { "company": null, "address": null, "city": null, "state": null, "zip": null, "country": null, "gstin": null, "pan": null, "bank_details": null },
23
+ "buyer": { "contact_name": null, "company_name": null, "billing_address": null, "shipping_address": null, "email": null, "phone": null, "gstin": null, "pan": null },
24
+ "line_items": [ { "name": null, "description": null, "hsn_or_sac": null, "sku": null, "quantity": null, "unit": null, "rate": null, "amount": null, "taxes": [ { "type": null, "rate": null, "amount": null, "tax_id": null } ] } ],
25
+ "tax_breakdown": [ { "tax_type": null, "cgst": null, "sgst": null, "igst": null, "cess": null } ],
26
+ "references": { "reference_invoice_number": null, "po_number": null, "delivery_challan": null },
27
+ "payment_terms": null, "notes": null, "qr_codes": [ { "type": null, "value": null } ],
28
+ "raw_text_sample": null,
29
+ "validation": { "amounts_balanced": null, "missing_critical_fields": [] }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  }
31
  '''
32
  return f"""<|im_start|>system
33
+ You are an invoice & document data extraction assistant. Follow instructions exactly.
34
 
35
+ OUTPUT RULES (MUST FOLLOW):
36
+ - Produce ONE valid JSON object and NOTHING else.
37
+ - Wrap JSON between EXACT markers with no extra commentary:
38
  <<<JSON>>>
39
  {{ ... }}
40
  <<<END_JSON>>>
41
+ - Use double quotes for all JSON strings. No trailing commas.
42
+ - Dates must be ISO YYYY-MM-DD or null. Numeric fields must be numbers or null.
43
+ - If unknown, use null or empty list/object as appropriate.
 
 
 
44
 
45
+ SCHEMA:
 
 
 
 
 
46
  {schema}
47
 
48
+ VALIDATION:
49
+ - Normalize and validate dates; if unparseable set null and add to validation.missing_critical_fields.
50
+ - Normalize numeric values (remove commas/currency symbols). If conversion fails set null and add to missing_critical_fields.
51
+ - Set validation.amounts_balanced = true only if sum(line_items.amount) + totals.tax_total ± totals.round_off equals totals.grand_total (tolerance 0.5).
52
+ - Include up to first 3000 chars of raw text in raw_text_sample.
 
 
 
 
 
 
 
53
 
54
+ MULTI-PAGE:
55
+ - page_count = {page_count}. Merge line_items across pages.
 
56
 
57
+ Do NOT call external APIs. Output only the JSON between the markers.
 
58
  <|im_end|>
59
  <|im_start|>user
60
  Input Text (first 3000 chars):
 
65
 
66
  def get_agent_prompt(history_text: str, user_message: str) -> str:
67
  """
68
+ Orchestrator prompt. When asked to persist, output EXACT tool-call JSON:
69
+ { "tool": "<tool_name>", "args": { ... } }
70
+ Otherwise produce a human-friendly summary (no tool JSON).
71
  """
72
  return f"""<|im_start|>system
73
  You are the Zoho CRM / Zoho Invoice Orchestrator Assistant.
74
 
75
+ TOOLS (only call when user explicitly requests persist/save/create/push/upload):
76
+ - create_contact(contact_json)
77
+ - create_item(item_json)
78
+ - create_invoice(invoice_json)
79
+ - create_creditnote(creditnote_json)
80
 
81
  MANDATES:
82
+ - If calling a tool, output ONLY a single JSON object:
 
83
  {{ "tool": "<tool_name>", "args": {{ ... }} }}
84
+ and nothing else.
85
+ - If not calling a tool, return a human-readable summary and recommended next steps (no tool JSON).
86
+ - If validation.amounts_balanced is false or critical fields missing, DO NOT call tools; ask for manual review.
 
 
 
 
 
 
 
 
87
  <|im_end|>
88
  <|im_start|>user
89
  HISTORY:
 
95
  <|im_start|>assistant
96
  """
97
 
98
+ # small helper prompt used by app when validating parsed JSON quickly
99
  def get_quick_extraction_check_prompt(summary: str) -> str:
100
+ return f"""You are a JSON validator. Check the JSON below for required fields: document_id, invoice_date, totals.grand_total, buyer.contact_name.
101
+ Return only a JSON: {{ "missing_fields": [...], "parse_warnings": [...], "ok": true|false }}
 
 
102
  Input:
103
  {summary}
104
  """