Spaces:
Sleeping
Sleeping
| import json | |
| import re | |
| from pathlib import Path | |
| from typing import Optional | |
| import pymupdf4llm | |
| from groq import Groq | |
| from loguru import logger | |
| from pydantic import BaseModel, field_validator | |
# Prompt text for the invoice-extraction request. It is read once, at import
# time, from prompts/invoice.prompt.txt in this module's directory, and is later
# filled in with str.format(text=...).
PROMPT_TEMPLATE = Path(__file__).parent.joinpath("prompts", "invoice.prompt.txt").read_text(encoding="utf-8")
class Invoice(BaseModel):
    """Structured invoice record.

    Every field has a default, so an instance can be built from partial data.
    ``categoria`` is restricted to a fixed set of labels; any other value is
    replaced by ``"Otros"`` during validation.
    """

    # Supplier / customer identification.
    proveedor: str = "Desconocido"
    nif_proveedor: Optional[str] = None
    cliente: Optional[str] = None

    # Invoice identification and dates (kept as raw strings, not parsed).
    numero_factura: Optional[str] = None
    fecha: Optional[str] = None
    fecha_vencimiento: Optional[str] = None

    # Description and classification.
    concepto: str = "Sin descripción"
    categoria: str = "Otros"

    # Amounts.
    subtotal: Optional[float] = None
    iva_porcentaje: Optional[float] = None
    iva_importe: Optional[float] = None
    total: Optional[float] = None
    moneda: str = "EUR"

    # Registered as a pydantic field validator; without the decorators this
    # function is never invoked and unknown categories would pass through.
    @field_validator("categoria")
    @classmethod
    def validate_categoria(cls, v: str) -> str:
        """Return ``v`` if it is a known category, otherwise ``"Otros"``."""
        valid = {"Servicios", "Suministros", "Transporte", "Software", "Material", "Alquiler", "Otros"}
        return v if v in valid else "Otros"
def extract_text_from_pdf(pdf_path: str) -> str:
    """Convert a PDF to Markdown text, one chunk per page, joined by blank lines.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The "text" entries of all page chunks joined with a blank line, or an
        empty string if anything goes wrong (the error is logged, not raised).
    """
    try:
        chunks = pymupdf4llm.to_markdown(pdf_path, page_chunks=True, show_progress=False)
        page_texts = [chunk["text"] for chunk in chunks]
        markdown = "\n\n".join(page_texts)
    except Exception as exc:
        # Best-effort: callers treat an empty string as "no text available".
        logger.error(f"Error leyendo {pdf_path}: {exc}")
        return ""
    return markdown
def extract_invoice_data(pdf_path: str, client: Groq, model: str = "llama-3.1-8b-instant") -> Invoice:
    """Extract invoice fields from a PDF by asking a chat model for JSON.

    The PDF text is obtained with ``extract_text_from_pdf``, inserted into
    ``PROMPT_TEMPLATE`` and sent as a single user message. The first JSON
    object found in the reply is parsed and validated as an ``Invoice``.

    Args:
        pdf_path: Path of the PDF file to process.
        client: Client used to call ``chat.completions.create``.
        model: Model identifier passed to the completion request.

    Returns:
        The parsed ``Invoice``. If the PDF yields no text, or the request,
        JSON parsing or validation fails, a fallback ``Invoice`` is returned
        whose ``proveedor`` is the file name without extension; the problem is
        logged rather than raised.
    """
    text = extract_text_from_pdf(pdf_path)
    if not text.strip():
        logger.warning(f"Sin texto en {pdf_path}")
        return Invoice(proveedor=Path(pdf_path).stem)

    # Only the first 4000 characters of the document are sent to the model.
    prompt = PROMPT_TEMPLATE.format(text=text[:4000])
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=512,
        )
        raw = response.choices[0].message.content.strip()

        # Extract the JSON block if the reply wraps it in markdown or prose.
        match = re.search(r"\{.*\}", raw, re.DOTALL)
        if match:
            raw = match.group(0)

        data = json.loads(raw)
        if not isinstance(data, dict):
            raise ValueError("la respuesta del modelo no es un objeto JSON")

        # Drop explicit nulls so the model defaults apply. Otherwise a null in
        # a non-optional field (e.g. "proveedor") fails validation and throws
        # away every other field that was extracted correctly.
        fields = {key: value for key, value in data.items() if value is not None}
        return Invoice(**fields)
    except Exception as e:
        # Deliberate best-effort boundary: any API, parsing or validation
        # failure degrades to a placeholder invoice instead of propagating.
        logger.error(f"Error extrayendo datos de {pdf_path}: {e}")
        return Invoice(proveedor=Path(pdf_path).stem)