invoice-processor / extractor.py
JoseAndresLopez's picture
Upload folder using huggingface_hub
6cc3191 verified
Raw
History Blame Contribute Delete
2.34 kB
import json
import re
from pathlib import Path
from typing import Optional
import pymupdf4llm
from groq import Groq
from loguru import logger
from pydantic import BaseModel, field_validator
# Prompt text read once, at import time, from "prompts/invoice.prompt.txt"
# in the directory that contains this module. Importing the module fails
# if that file is missing.
PROMPT_TEMPLATE = (
    Path(__file__)
    .parent.joinpath("prompts", "invoice.prompt.txt")
    .read_text(encoding="utf-8")
)
class Invoice(BaseModel):
    # Structured invoice record. Field names are in Spanish; every field
    # has a default, so a model can be built from a partial mapping.

    # Parties and identifiers.
    proveedor: str = "Desconocido"
    nif_proveedor: Optional[str] = None
    cliente: Optional[str] = None
    numero_factura: Optional[str] = None

    # Dates are stored as plain strings; no date parsing happens here.
    fecha: Optional[str] = None
    fecha_vencimiento: Optional[str] = None

    # Description and classification.
    concepto: str = "Sin descripción"
    categoria: str = "Otros"

    # Monetary amounts and currency code.
    subtotal: Optional[float] = None
    iva_porcentaje: Optional[float] = None
    iva_importe: Optional[float] = None
    total: Optional[float] = None
    moneda: str = "EUR"

    @field_validator("categoria")
    @classmethod
    def validate_categoria(cls, v: str) -> str:
        """Return *v* if it is a known category, otherwise ``"Otros"``.

        The comparison is exact (case-sensitive).
        """
        allowed = {"Servicios", "Suministros", "Transporte", "Software", "Material", "Alquiler", "Otros"}
        if v not in allowed:
            return "Otros"
        return v
def extract_text_from_pdf(pdf_path: str) -> str:
    """Convert the PDF at *pdf_path* to Markdown text.

    The document is converted page by page and the per-page texts are
    joined with a blank line between them.

    Returns:
        The combined text, or an empty string if anything goes wrong
        while reading or converting the file (the error is logged).
    """
    try:
        chunks = pymupdf4llm.to_markdown(pdf_path, page_chunks=True, show_progress=False)
        page_texts = [chunk["text"] for chunk in chunks]
        return "\n\n".join(page_texts)
    except Exception as e:
        # Best effort: a file that cannot be read is reported, not fatal.
        logger.error(f"Error leyendo {pdf_path}: {e}")
    return ""
def _parse_json_object(raw: str) -> dict:
    """Return the first JSON object that can be decoded from *raw*.

    Model replies often wrap the JSON in Markdown fences or add prose
    before/after it. Decoding is attempted from each ``{`` in turn and
    stops at the end of the first object that parses, so trailing text
    (even text containing braces) is ignored.

    Raises:
        ValueError: if no position in *raw* starts a valid JSON object.
    """
    decoder = json.JSONDecoder()
    start = raw.find("{")
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(raw, start)
            return obj
        except json.JSONDecodeError:
            # Not valid JSON from this brace; try the next candidate.
            start = raw.find("{", start + 1)
    raise ValueError("No se encontró un objeto JSON en la respuesta del modelo")


def extract_invoice_data(pdf_path: str, client: Groq, model: str = "llama-3.1-8b-instant") -> Invoice:
    """Extract structured invoice data from a PDF using a chat model.

    The PDF text is obtained with ``extract_text_from_pdf``, truncated to
    its first 4000 characters, inserted into ``PROMPT_TEMPLATE`` and sent
    to *client*. The JSON object in the reply is validated as an
    ``Invoice``.

    Args:
        pdf_path: Path of the PDF file to process.
        client: Groq client used for the chat completion request.
        model: Name of the model to query.

    Returns:
        The parsed ``Invoice``. If the PDF yields no text, or the request,
        JSON decoding or validation fails, a default ``Invoice`` whose
        ``proveedor`` is the file name without extension is returned and
        the problem is logged.
    """
    fallback_name = Path(pdf_path).stem

    text = extract_text_from_pdf(pdf_path)
    if not text.strip():
        logger.warning(f"Sin texto en {pdf_path}")
        return Invoice(proveedor=fallback_name)

    prompt = PROMPT_TEMPLATE.format(text=text[:4000])
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=512,
        )
        # The content may be None; treat that as an empty reply so the
        # failure is reported as "no JSON found" rather than AttributeError.
        content = response.choices[0].message.content or ""
        data = _parse_json_object(content)
        # Drop explicit nulls so the model's defaults apply; otherwise a
        # null in a non-optional string field would fail validation and
        # discard every other extracted value.
        data = {key: value for key, value in data.items() if value is not None}
        return Invoice(**data)
    except Exception as e:
        # Per-file boundary: one bad invoice must not abort the caller.
        logger.error(f"Error extrayendo datos de {pdf_path}: {e}")
        return Invoice(proveedor=fallback_name)