"""Extract structured invoice data from PDF files using an LLM."""

import json
import re
from pathlib import Path
from typing import Optional

import pymupdf4llm
from groq import Groq
from loguru import logger
from pydantic import BaseModel, field_validator

# Read once at import time; the template is filled in via ``str.format(text=...)``.
PROMPT_TEMPLATE = (Path(__file__).parent / "prompts" / "invoice.prompt.txt").read_text(encoding="utf-8")

# Closed set of accepted categories; anything else is normalised to "Otros".
_VALID_CATEGORIES = frozenset(
    {"Servicios", "Suministros", "Transporte", "Software", "Material", "Alquiler", "Otros"}
)

# Only the first N characters of the PDF text are inserted into the prompt.
_MAX_PROMPT_CHARS = 4000

# Greedy match from the first "{" to the last "}" so that a JSON object wrapped
# in markdown fences or surrounding chatter can still be isolated.
_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL)


class Invoice(BaseModel):
    """Structured fields extracted from a single invoice.

    Every field has a default, so a partially filled payload still produces
    a valid instance.
    """

    proveedor: str = "Desconocido"
    nif_proveedor: Optional[str] = None
    cliente: Optional[str] = None
    numero_factura: Optional[str] = None
    fecha: Optional[str] = None
    fecha_vencimiento: Optional[str] = None
    concepto: str = "Sin descripción"
    categoria: str = "Otros"
    subtotal: Optional[float] = None
    iva_porcentaje: Optional[float] = None
    iva_importe: Optional[float] = None
    total: Optional[float] = None
    moneda: str = "EUR"

    @field_validator("categoria")
    @classmethod
    def validate_categoria(cls, v: str) -> str:
        """Return *v* if it is a known category, otherwise ``"Otros"``."""
        return v if v in _VALID_CATEGORIES else "Otros"


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of the PDF at *pdf_path*, with pages joined by blank lines.

    Best-effort: any failure while reading or converting the file is logged
    and an empty string is returned instead of raising.
    """
    try:
        pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True, show_progress=False)
        return "\n\n".join(p["text"] for p in pages)
    except Exception as e:
        logger.error(f"Error leyendo {pdf_path}: {e}")
        return ""


def _parse_invoice_payload(raw: str) -> dict:
    """Parse the model reply *raw* into keyword arguments for ``Invoice``.

    Keys whose value is JSON ``null`` are dropped so that the model's field
    defaults apply, exactly as they would for a missing key. Without this,
    a ``null`` for a non-optional string field would fail validation and
    throw away every other extracted field.

    Raises:
        ValueError: if the reply is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass) or is not a JSON object.
    """
    # Extract the JSON block if it is wrapped in markdown or other text.
    match = _JSON_OBJECT_RE.search(raw)
    if match:
        raw = match.group(0)

    data = json.loads(raw)
    if not isinstance(data, dict):
        raise ValueError(f"expected a JSON object, got {type(data).__name__}")
    return {key: value for key, value in data.items() if value is not None}


def extract_invoice_data(pdf_path: str, client: Groq, model: str = "llama-3.1-8b-instant") -> Invoice:
    """Extract invoice fields from the PDF at *pdf_path* using a chat model.

    Args:
        pdf_path: Path of the PDF file to process.
        client: Groq client used to request the chat completion.
        model: Name of the chat model to query.

    Returns:
        The parsed ``Invoice``. If the PDF yields no text, or the request,
        JSON parsing, or validation fails, a fallback ``Invoice`` is returned
        whose ``proveedor`` is the file name without its extension.
    """
    text = extract_text_from_pdf(pdf_path)
    if not text.strip():
        logger.warning(f"Sin texto en {pdf_path}")
        return Invoice(proveedor=Path(pdf_path).stem)

    prompt = PROMPT_TEMPLATE.format(text=text[:_MAX_PROMPT_CHARS])
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=512,
        )
        # The message content may be None; treat that as an empty reply so the
        # failure is reported as a JSON error rather than an AttributeError.
        content = response.choices[0].message.content or ""
        return Invoice(**_parse_invoice_payload(content.strip()))
    except Exception as e:
        # Deliberately broad: API errors, malformed JSON and validation
        # failures all degrade to the same fallback instead of aborting.
        logger.error(f"Error extrayendo datos de {pdf_path}: {e}")
        return Invoice(proveedor=Path(pdf_path).stem)