# sgonzalezu's picture
# Deploy LLaVA invoice extraction service
# 2c911a0
"""
Prompts optimizados para extracción de datos de facturas usando LLaVA-OneVision
"""
def get_invoice_extraction_prompt(vendor_id: str = "Default") -> str:
    """Return the invoice-extraction prompt for a vendor.

    Vendors with a dedicated entry get a prompt with vendor-specific hints;
    every other ``vendor_id`` (including the default) gets the generic prompt.
    Each prompt is self-contained and spells out the JSON structure the model
    must return, because the model only ever sees one prompt at a time.

    Args:
        vendor_id: Vendor identifier used to look up a dedicated prompt.

    Returns:
        The prompt text to send to the model together with the invoice image.
    """
    # Generic output schema. It is shared by the base prompt and the Costco
    # prompts so that all of them request the same set of keys.
    output_schema = """{
"issuer": "Company name that issued the invoice",
"issuer_address": "Full address of the issuer",
"date": "Invoice date in YYYY-MM-DD format",
"transaction_id": "Transaction or invoice number",
"customer_name": "Customer name if present",
"gst_hst_number": "GST/HST registration number if present",
"items": [
{
"sku": "Product SKU or code",
"description": "Item description",
"quantity": 0.0,
"unit_price": 0.0,
"amount": 0.0,
"tax_code": "H if taxable, empty if not"
}
],
"subtotal": 0.0,
"hst": 0.0,
"total": 0.0
}"""

    # Base prompt for any invoice without a vendor-specific entry.
    base_prompt = (
        "Analyze this invoice image carefully and extract the following information in JSON format:\n"
        + output_schema
        + """
Important instructions:
- Extract ALL items from the invoice, not just a few examples
- For numbers, use numeric values without $ or , symbols
- If a field is not found, use empty string "" for text or 0.0 for numbers
- For tax_code, use "H" if the item has HST/tax, otherwise leave empty ""
- Ensure subtotal + hst = total (validate the math)
- Be precise with numbers, double-check all amounts
- Return ONLY the JSON, no additional text or explanation"""
    )

    # Vendor-specific prompts.
    vendor_prompts = {
        "A1 Cash and Carry_Fisico": """Analyze this Burlington Cash and Carry (A1) invoice image and extract information in JSON format.
IMPORTANT VENDOR-SPECIFIC INSTRUCTIONS:
- The issuer is "Burlington Cash and Carry"
- Items follow this pattern:
* SKU (alphanumeric code like ALU104, FLRO58)
* Description (may span multiple lines)
* Unit price and total price (look for $ amounts)
* Quantity purchased
* Package size info (e.g., "100 ct", "12x355 ml")
- Look for "H" marker next to prices - this indicates taxable items
- Transaction ID usually starts with "BL" or "L" followed by numbers
- Customer is typically "FAMILIA FINE FOODS"
Return this JSON structure:
{
"issuer": "Burlington Cash and Carry",
"issuer_address": "Full address from invoice",
"date": "YYYY-MM-DD",
"transaction_id": "Transaction number",
"customer_name": "Customer name",
"gst_hst_number": "GST/HST number",
"items": [
{
"sku": "Product code",
"description": "Full item description",
"quantity": 0.0,
"unit_price": 0.0,
"amount": 0.0,
"tax_code": "H or empty"
}
],
"subtotal": 0.0,
"hst": 0.0,
"total": 0.0
}
Extract ALL items carefully. Return ONLY valid JSON, no extra text.""",
        # The Costco prompts previously asked for "JSON" without a schema, so
        # the keys of the response were left to the model. They now embed the
        # shared schema like every other prompt.
        "Costco_Formato1": (
            """Analyze this Costco invoice and extract information in JSON format.
COSTCO-SPECIFIC INSTRUCTIONS:
- Issuer is "Costco Wholesale"
- Items are in a table format with columns
- Look for member number and warehouse location
- Tax codes: "A" = taxable, "E" = exempt
- Total is at bottom, usually labeled "TOTAL"
Return this JSON structure:
"""
            + output_schema
            + """
Extract ALL items from the table. Return ONLY valid JSON, no extra text."""
        ),
        # Format 2 used to say "follow same structure as Costco format 1",
        # which the model cannot resolve because it never sees that prompt.
        # NOTE(review): issuer and tax-code hints are carried over from
        # format 1 on the assumption they are vendor-wide, not layout-specific
        # - confirm against real format-2 invoices.
        "Costco_Formato2": (
            """Analyze this Costco invoice (format 2) and extract information in JSON format.
COSTCO-SPECIFIC INSTRUCTIONS:
- Issuer is "Costco Wholesale"
- Tax codes: "A" = taxable, "E" = exempt
- Be aware that this layout differs from the usual Costco invoice; locate fields by their labels
Return this JSON structure:
"""
            + output_schema
            + """
Extract ALL items and amounts carefully. Return ONLY valid JSON, no extra text."""
        ),
    }

    # Return the vendor-specific prompt if one exists, otherwise the base one.
    return vendor_prompts.get(vendor_id, base_prompt)
def get_validation_prompt(extracted_data: dict) -> str:
    """Build a prompt asking the model to validate previously extracted data.

    The data is embedded as real JSON (double quotes, ``true``/``false``/
    ``null``) rather than as a Python ``repr``, because the prompt asks the
    model to hand back "the same JSON" when nothing needs correcting.

    Args:
        extracted_data: Previously extracted invoice data. A ``str`` is
            embedded unchanged, so raw model output can be passed through.

    Returns:
        The validation prompt text.
    """
    # Local import keeps this block self-contained; the file's import section
    # is not visible from here.
    import json

    if isinstance(extracted_data, str):
        # Already text (e.g. raw JSON from a previous model call): embed as is.
        serialized = extracted_data
    else:
        # default=str is deliberate: values such as dates or Decimals should
        # still show up in the prompt instead of raising TypeError.
        serialized = json.dumps(
            extracted_data, ensure_ascii=False, indent=2, default=str
        )

    return f"""Review the following invoice data extracted from the image and verify its accuracy:
{serialized}
Check:
1. Are all items from the invoice included?
2. Do the numbers add up correctly (subtotal + tax = total)?
3. Are dates in correct format?
4. Are all required fields filled?
If you find any errors or missing information, provide the corrected JSON. If everything is correct, return the same JSON.
Return ONLY valid JSON, nothing else."""
def get_item_extraction_prompt() -> str:
    """Return the prompt for extracting only the line items of an invoice.

    Intended for very large images where only the items section is processed.
    The requested output is a JSON object with a single ``"items"`` key; the
    closing instruction says "JSON object" so it agrees with the structure
    shown (it previously said "JSON array", contradicting the example).

    Returns:
        The item-extraction prompt text.
    """
    return """Focus on the items section of this invoice. Extract ALL line items in this JSON format:
{
"items": [
{
"sku": "Product code/SKU",
"description": "Item description",
"quantity": 0.0,
"unit_price": 0.0,
"amount": 0.0,
"tax_code": "H if taxable, empty otherwise"
}
]
}
Extract EVERY item visible in the image. Be thorough and precise with numbers.
Return ONLY the JSON object, no other text."""