{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Parse PDF with Document Intelligence\n",
    "\n",
    "Parses the Renta 2023 manual PDF into Markdown with Azure Document Intelligence,\n",
    "then reconciles the document headers against the table of contents extracted\n",
    "from the PDF outline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports in one cell so the notebook survives Restart & Run All.\n",
    "import difflib\n",
    "import os\n",
    "import re\n",
    "import shutil\n",
    "import unicodedata\n",
    "\n",
    "from azure.core.credentials import AzureKeyCredential\n",
    "from azure.ai.documentintelligence import DocumentIntelligenceClient\n",
    "from azure.ai.documentintelligence.models import DocumentContentFormat, AnalyzeResult"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "AZURE_ENDPOINT = \"https://document-intelligence-rag-spai.cognitiveservices.azure.com/\"\n",
    "# SECURITY: never hardcode the API key in a notebook (an earlier revision\n",
    "# committed a live key -- rotate it).  Read it from the environment instead.\n",
    "AZURE_KEY = os.environ[\"AZURE_DOCUMENT_INTELLIGENCE_KEY\"]\n",
    "PDF_PATH = \"/teamspace/studios/this_studio/AgenticRAG/papers/ManualRenta2023_es_es.pdf\"\n",
    "OUTPUT_MD = \"Renta_2023_doc_int.md\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "document_intelligence_client = DocumentIntelligenceClient(\n",
    "    endpoint=AZURE_ENDPOINT, credential=AzureKeyCredential(AZURE_KEY)\n",
    ")\n",
    "# Analyze the whole PDF with the prebuilt layout model, asking for Markdown output.\n",
    "with open(PDF_PATH, \"rb\") as f:\n",
    "    poller = document_intelligence_client.begin_analyze_document(\n",
    "        \"prebuilt-layout\",\n",
    "        f,\n",
    "        output_content_format=DocumentContentFormat.MARKDOWN,\n",
    "    )\n",
    "result: AnalyzeResult = poller.result()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the Markdown rendition of the PDF.\n",
    "with open(OUTPUT_MD, 'w', encoding=\"utf-8\") as f:\n",
    "    f.write(result.content)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract PDF Table of Contents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run in a shell -- extract the PDF outline (table of contents) with MuPDF's mutool:\n",
    "#\n",
    "#   mutool show ManualRenta2023_es_es.pdf outline > indice.txt\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "_Once the TOC was extracted with the MuPDF tool it was saved to indice.txt and parsed into indice.md_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parse Titles"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "_Some headers and titles were cleaned up manually in indice.md and Renta_2023_doc_int.md_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def normalize_text(text):\n",
    "    \"\"\"\n",
    "    Normalize a header string for fuzzy comparison:\n",
    "    - Replace backslashes and newlines with spaces; collapse whitespace runs.\n",
    "    - Strip accents (NFD decomposition, drop combining marks) and lowercase.\n",
    "    - Drop almost all punctuation (the dot is kept for \"1.2.3\" numbering).\n",
    "    \"\"\"\n",
    "    text = text.replace('\\\\', ' ')\n",
    "    text = text.replace('\\n', ' ')\n",
    "    text = text.strip()\n",
    "    text = re.sub(r'\\s+', ' ', text)\n",
    "    text = unicodedata.normalize('NFD', text)\n",
    "    text = ''.join(ch for ch in text if unicodedata.category(ch) != 'Mn')\n",
    "    text = text.lower()\n",
    "    text = re.sub(r'[^\\w\\s\\.]', '', text)\n",
    "    return text.strip()\n",
    "\n",
    "\n",
    "def extract_headers_list(file_path):\n",
    "    \"\"\"\n",
    "    Extract the markdown headers of a file as a list of tuples:\n",
    "    (level, original_header, normalized_header)\n",
    "    \"\"\"\n",
    "    headers = []\n",
    "    header_regex = re.compile(r'^(#+)\\s*(.+)$')\n",
    "    with open(file_path, 'r', encoding='utf-8') as f:\n",
    "        for line in f:\n",
    "            line = line.rstrip(\"\\n\")\n",
    "            if line.strip().startswith('#'):\n",
    "                m = header_regex.match(line.strip())\n",
    "                if m:\n",
    "                    level = len(m.group(1))\n",
    "                    header_text = m.group(2).strip()\n",
    "                    norm = normalize_text(header_text)\n",
    "                    headers.append((level, header_text, norm))\n",
    "    return headers\n",
    "\n",
    "\n",
    "INDICE_MD = 'indice.md'\n",
    "SOURCE_MD = 'Renta_2023_doc_int_corrected.md'\n",
    "WORK_MD = 'Renta_2023_doc_int_corrected_2.md'\n",
    "\n",
    "# BUGFIX: the original loop re-read the uncorrected source file on every\n",
    "# iteration while writing each fix to a *different* file, so the header\n",
    "# comparison never changed and the loop re-applied the same correction\n",
    "# forever.  Work on a single copy instead, so corrections accumulate and\n",
    "# the loop can terminate.\n",
    "shutil.copyfile(SOURCE_MD, WORK_MD)\n",
    "\n",
    "# Iterate until every document header matches the TOC (or no fix is possible).\n",
    "while True:\n",
    "    # 1. Extract headers from the TOC and from the working document.\n",
    "    indice_headers = extract_headers_list(INDICE_MD)\n",
    "    corrected_headers = extract_headers_list(WORK_MD)\n",
    "\n",
    "    # Render both as uniform strings (e.g. \"## Título\") for exact comparison.\n",
    "    indice_headers_str = [f\"{'#'*level} {text}\" for (level, text, norm) in indice_headers]\n",
    "    corrected_headers_str = [f\"{'#'*level} {text}\" for (level, text, norm) in corrected_headers]\n",
    "\n",
    "    if indice_headers_str == corrected_headers_str:\n",
    "        print(\"¡Todos los headers coinciden exactamente!\")\n",
    "        break\n",
    "\n",
    "    # 2. Find the first header that differs.\n",
    "    min_len = min(len(indice_headers_str), len(corrected_headers_str))\n",
    "    first_diff = None\n",
    "    for i in range(min_len):\n",
    "        if indice_headers_str[i] != corrected_headers_str[i]:\n",
    "            first_diff = i\n",
    "            break\n",
    "    if first_diff is None:\n",
    "        print(\"Las listas de headers tienen distinta longitud.\")\n",
    "        break\n",
    "\n",
    "    # 3. Split the document into blank-line-separated paragraphs.\n",
    "    with open(WORK_MD, 'r', encoding='utf-8') as f:\n",
    "        doc_text = f.read()\n",
    "    paragraphs = re.split(r'\\n\\s*\\n', doc_text)\n",
    "\n",
    "    # 4. Paragraph indices that hold headers.\n",
    "    header_paragraph_indices = [idx for idx, para in enumerate(paragraphs) if para.strip().startswith('#')]\n",
    "\n",
    "    # Search window: between the previous (matched) header and the next one.\n",
    "    if first_diff == 0:\n",
    "        search_start = 0\n",
    "    else:\n",
    "        search_start = header_paragraph_indices[first_diff - 1] + 1 if first_diff - 1 < len(header_paragraph_indices) else 0\n",
    "    if first_diff < len(header_paragraph_indices) - 1:\n",
    "        search_end = header_paragraph_indices[first_diff + 1]\n",
    "    else:\n",
    "        search_end = len(paragraphs)\n",
    "\n",
    "    expected_header = indice_headers_str[first_diff]  # expected header (correct hashes)\n",
    "    expected_level = indice_headers[first_diff][0]    # expected level\n",
    "    expected_norm = normalize_text(indice_headers[first_diff][1])\n",
    "\n",
    "    # 5. Fuzzy-search the window for the paragraph holding the expected header.\n",
    "    found_candidate_idx = None\n",
    "    threshold_candidate = 0.98  # similarity threshold\n",
    "    for i in range(search_start, search_end):\n",
    "        candidate = paragraphs[i].strip()\n",
    "        candidate_text = candidate.lstrip('#').strip() if candidate.startswith('#') else candidate\n",
    "        norm_candidate = normalize_text(candidate_text)\n",
    "        ratio = difflib.SequenceMatcher(None, expected_norm, norm_candidate).ratio()\n",
    "        if ratio >= threshold_candidate:\n",
    "            found_candidate_idx = i\n",
    "            break\n",
    "\n",
    "    if found_candidate_idx is not None:\n",
    "        # Rewrite the matched paragraph with the canonical header from the TOC.\n",
    "        new_header_line = ('#' * expected_level) + \" \" + indice_headers[first_diff][1]\n",
    "        paragraphs[found_candidate_idx] = new_header_line\n",
    "        with open(WORK_MD, 'w', encoding='utf-8') as f:\n",
    "            f.write(\"\\n\\n\".join(paragraphs))\n",
    "        print(f\"Se ha corregido el header en el índice {first_diff}.\")\n",
    "    else:\n",
    "        print(\"No se encontró en el rango de búsqueda el header esperado.\")\n",
    "        break\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Delete figures and PageFooter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): this cell was garbled in an earlier revision -- the angle-bracket\n",
    "# tags in the patterns were lost, leaving a broken regex and a no-op\n",
    "# replace('', '').  Reconstructed from the Document Intelligence Markdown\n",
    "# conventions (<figure>...</figure> blocks, <!-- PageFooter ... --> comments);\n",
    "# TODO confirm the exact patterns against the generated Markdown file.\n",
    "with open('Renta_2023_doc_int_corrected_2.md', 'r', encoding='utf-8') as f:\n",
    "    new_doc = f.read()\n",
    "\n",
    "# Drop the \"Agencia Tributaria\" logo figure emitted on every page.\n",
    "new_doc = re.sub(r'<figure>\\s*Agencia Tributaria\\s*</figure>', '', new_doc, flags=re.DOTALL)\n",
    "# Drop the page-footer comments.\n",
    "new_doc = re.sub(r'<!-- PageFooter.*?-->', '', new_doc)\n",
    "\n",
    "with open('Renta_2023_doc_int_corrected_2.md', 'w', encoding='utf-8') as f:\n",
    "    f.write(new_doc)\n",
    "\n",
    "print(\"Archivo corregido y guardado.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check Titles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Conteo de títulos:\n",
      "{'correctos': 1457, 'incorrectos': 0}\n",
      "\n",
      "Títulos incorrectos:\n"
     ]
    }
   ],
   "source": [
    "def analizar_titulos_markdown_desde_archivos(archivo_contenido, archivo_titulos):\n",
    "    \"\"\"\n",
    "    Compare the headers found in a markdown document against an expected list.\n",
    "\n",
    "    Returns (counts, mismatches): counts is {'correctos': n, 'incorrectos': n}\n",
    "    and mismatches is a list of {'esperado': ..., 'encontrado': ...} dicts.\n",
    "\n",
    "    NOTE(review): on a mismatch both cursors advance in lockstep, so a missing\n",
    "    or extra header desynchronizes every later comparison.  Acceptable here\n",
    "    because the corrected document is expected to match the TOC 1:1.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        with open(archivo_contenido, 'r', encoding='utf-8') as f:\n",
    "            contenido_markdown = f.read()\n",
    "    except FileNotFoundError:\n",
    "        print(f\"Error: No se encontró el archivo del contenido: {archivo_contenido}\")\n",
    "        return {\"correctos\": 0, \"incorrectos\": 0}, []\n",
    "    except Exception as e:\n",
    "        print(f\"Error al leer el archivo del contenido: {archivo_contenido} - {e}\")\n",
    "        return {\"correctos\": 0, \"incorrectos\": 0}, []\n",
    "\n",
    "    try:\n",
    "        with open(archivo_titulos, 'r', encoding='utf-8') as f:\n",
    "            listado_titulos_markdown = f.read()\n",
    "    except FileNotFoundError:\n",
    "        print(f\"Error: No se encontró el archivo de títulos: {archivo_titulos}\")\n",
    "        return {\"correctos\": 0, \"incorrectos\": 0}, []\n",
    "    except Exception as e:\n",
    "        print(f\"Error al leer el archivo de títulos: {archivo_titulos} - {e}\")\n",
    "        return {\"correctos\": 0, \"incorrectos\": 0}, []\n",
    "\n",
    "    # 1. Build the expected header list (skip blank lines).\n",
    "    titulos_esperados = []\n",
    "    for linea in listado_titulos_markdown.splitlines():\n",
    "        linea = linea.strip()\n",
    "        if linea:\n",
    "            titulos_esperados.append(linea)\n",
    "\n",
    "    # 2. Extract the headers actually present in the document.\n",
    "    titulos_encontrados = []\n",
    "    for linea in re.findall(r\"^(#+ .+)$\", contenido_markdown, re.MULTILINE):\n",
    "        titulos_encontrados.append(linea)\n",
    "\n",
    "    # 3. Walk both lists in parallel, counting matches and mismatches.\n",
    "    correctos = 0\n",
    "    incorrectos = 0\n",
    "    titulos_incorrectos = []\n",
    "\n",
    "    i = 0\n",
    "    j = 0\n",
    "    while i < len(titulos_esperados) and j < len(titulos_encontrados):\n",
    "        if titulos_esperados[i] == titulos_encontrados[j]:\n",
    "            correctos += 1\n",
    "            i += 1\n",
    "            j += 1\n",
    "        else:\n",
    "            incorrectos += 1\n",
    "            titulos_incorrectos.append({\n",
    "                \"esperado\": titulos_esperados[i],\n",
    "                \"encontrado\": titulos_encontrados[j]\n",
    "            })\n",
    "            i += 1\n",
    "            j += 1\n",
    "\n",
    "    # Any leftover headers on either side count as mismatches.\n",
    "    while i < len(titulos_esperados):\n",
    "        incorrectos += 1\n",
    "        titulos_incorrectos.append({\n",
    "            \"esperado\": titulos_esperados[i],\n",
    "            \"encontrado\": None\n",
    "        })\n",
    "        i += 1\n",
    "\n",
    "    while j < len(titulos_encontrados):\n",
    "        incorrectos += 1\n",
    "        titulos_incorrectos.append({\n",
    "            \"esperado\": None,\n",
    "            \"encontrado\": titulos_encontrados[j]\n",
    "        })\n",
    "        j += 1\n",
    "\n",
    "    return {\"correctos\": correctos, \"incorrectos\": incorrectos}, titulos_incorrectos\n",
    "\n",
    "\n",
    "archivo_documento = \"/teamspace/studios/this_studio/AgenticRAG/doc_renta/Renta_2023_doc_int_corrected_2.md\"\n",
    "archivo_lista_titulos = \"/teamspace/studios/this_studio/AgenticRAG/doc_renta/indice.md\"\n",
    "\n",
    "resultados, incorrectos = analizar_titulos_markdown_desde_archivos(archivo_documento, archivo_lista_titulos)\n",
    "\n",
    "print(\"Conteo de títulos:\")\n",
    "print(resultados)\n",
    "\n",
    "print(\"\\nTítulos incorrectos:\")\n",
    "for error in incorrectos:\n",
    "    print(f\"- Esperado: {error['esperado']}, Encontrado: {error['encontrado']}\")"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}