| import gradio as gr |
| import json |
| import os |
| from backend.tools.pdf_tools import resolve_pdf, download_pdf, read_pdf, chunk_text |
| from .utils import format_error |
|
|
| |
# Process-wide cache for the vector store; stays None until the first
# successful construction in _get_vector_store().
_vector_store = None


def _get_vector_store():
    """Return the shared ``VectorStore`` instance, creating it on first use.

    The import of ``backend.vector_store`` is deferred to the first call so
    that importing this module does not require the vector backend.

    Returns:
        The cached ``VectorStore`` instance, or ``None`` when importing or
        constructing it raised. A failure is logged as a warning and is not
        cached, so the next call tries again.
    """
    global _vector_store

    # Fast path: the store was already built by an earlier call.
    if _vector_store is not None:
        return _vector_store

    try:
        from backend.vector_store import VectorStore
        _vector_store = VectorStore()
    except Exception as exc:
        import logging
        log = logging.getLogger("pdf_tab")
        log.warning(f"VectorStore unavailable: {exc}")
        return None

    return _vector_store
|
|
async def _resolve_download_read(source, resolve_error_prefix=""):
    """Resolve (when needed), download and read a PDF.

    Args:
        source: A URL or another identifier (e.g. a DOI). Anything that does
            not start with ``"http"`` is first passed to ``resolve_pdf``.
        resolve_error_prefix: Text inserted before the resolver's own error
            message, so each caller keeps its original wording.

    Returns:
        A ``(url, read_result, error)`` tuple. On success ``error`` is
        ``None``, ``url`` is the URL that was downloaded and ``read_result``
        is the dict returned by ``read_pdf``. On failure ``error`` is a
        Markdown error string and the other two items are ``None``.
    """
    if not source.startswith("http"):
        resolved = await resolve_pdf(source)
        if "error" in resolved:
            return None, None, f"**Error:** {resolve_error_prefix}{resolved['error']}"
        source = resolved.get("pdfUrl")
        if not source:
            # The resolver reported no error but also gave no URL; stop here
            # instead of handing None to download_pdf.
            return None, None, (
                "**Error:** No se pudo resolver a un PDF: "
                "la resolución no devolvió ninguna URL."
            )

    downloaded = await download_pdf(source)
    if "error" in downloaded:
        return None, None, f"**Error:** {downloaded['error']}"

    read_result = await read_pdf(downloaded["path"])
    if "error" in read_result:
        return None, None, f"**Error:** {read_result['error']}"

    return source, read_result, None


async def pdf_handler(source, action):
    """Run one PDF action and return its outcome as Markdown.

    Args:
        source: URL or DOI typed by the user. Surrounding whitespace is
            ignored; an empty value yields an error message.
        action: One of ``"resolve"`` (only locate the PDF URL), ``"read"``
            (download and show page count plus a preview) or ``"vectorize"``
            (download, chunk and store the chunks in the vector store).

    Returns:
        A Markdown string describing the result or the error. Exceptions
        raised while processing are converted with ``format_error`` rather
        than propagated.
    """
    if not source:
        return "**Error:** Proporcione una URL o DOI."
    try:
        # A pasted URL with stray whitespace would otherwise fail the
        # startswith("http") check and be treated as a DOI.
        source = source.strip()
        if not source:
            return "**Error:** Proporcione una URL o DOI."

        if action == "resolve":
            result = await resolve_pdf(source)
            if "error" in result:
                return f"**Error:** {result['error']}"
            steps = result.get("steps", [])
            # `or` also covers a key that is present but None/empty, which
            # would otherwise break the startswith() call below.
            pdf_url = result.get("pdfUrl") or "N/A"
            output = (
                f"## Resultado\n\n**URL:** {pdf_url}\n\n**Pasos:**\n"
                + "\n".join(f"- {s}" for s in steps)
            )
            if pdf_url.startswith("http"):
                output += f"\n\n[📄 Abrir PDF]({pdf_url})"
            return output

        elif action == "read":
            _, read_res, error = await _resolve_download_read(
                source, "No se pudo resolver a un PDF: "
            )
            if error:
                return error
            return f"## Lectura Exitosa\n\n**Páginas:** {read_res['pages']}\n\n**Previsualización:**\n\n```text\n{read_res['preview']}...\n```"

        elif action == "vectorize":
            source, read_res, error = await _resolve_download_read(source)
            if error:
                return error

            chunks = chunk_text(read_res["text"])
            if not chunks:
                # Nothing to index (e.g. no extractable text); indexing
                # chunks[0] below would raise IndexError.
                return "**Error:** El PDF no contiene texto extraíble para vectorizar."

            # NOTE(review): IDs are built only from the last path segment of
            # the URL, so two different URLs ending in the same file name get
            # the same IDs — confirm how add_documents treats duplicates.
            base_name = os.path.basename(source)
            ids = [f"{base_name}_{i}" for i in range(len(chunks))]
            metadatas = [{"source": source, "chunk": i} for i in range(len(chunks))]
            vs = _get_vector_store()
            if vs:
                vs.add_documents(chunks, metadatas, ids)
                return f"## Vectorización Exitosa\n\nEl PDF se ha dividido en **{len(chunks)}** fragmentos y se ha guardado en la base de datos local ChromaDB.\n\nEjemplo de fragmento 0:\n\n```text\n{chunks[0][:500]}...\n```"
            else:
                return f"## PDF Procesado (sin vectorización)\n\nEl PDF se leyó correctamente ({len(chunks)} fragmentos) pero el motor vectorial no está disponible por limitaciones de memoria."

        else:
            return "**Error:** Acción no válida."

    except Exception as e:
        return format_error(e)
|
|
def create_pdf_tab():
    """Build the "PDF (Local)" tab and connect its button to ``pdf_handler``.

    Layout: two Markdown headings, then a row with an input column (source
    textbox, action selector, run button) and an output column holding the
    Markdown result. Clicking the button calls ``pdf_handler`` with the
    textbox and radio values and renders the returned Markdown.

    NOTE(review): presumably called while a Gradio Blocks/Tabs context is
    open — confirm against the caller.
    """
    with gr.Tab("📄 PDF (Local)", id="pdf"):
        gr.Markdown("## Procesamiento Nativo de PDF y Vectorización")
        gr.Markdown("*Sin dependencias de Next.js - Usa PyMuPDF y ChromaDB local*")

        with gr.Row():
            # Left column: user inputs and the trigger button.
            with gr.Column(scale=2):
                source_box = gr.Textbox(
                    label="URL del PDF / DOI",
                    placeholder="https://arxiv.org/pdf/2301.00001.pdf",
                )
                action_choice = gr.Radio(
                    choices=["resolve", "read", "vectorize"],
                    value="resolve",
                    label="Acción",
                )
                run_button = gr.Button(
                    "▶ Ejecutar Acción",
                    variant="primary",
                    size="lg",
                )
            # Right column: rendered result of the selected action.
            with gr.Column(scale=3):
                result_view = gr.Markdown("")

        run_button.click(
            fn=pdf_handler,
            inputs=[source_box, action_choice],
            outputs=[result_view],
        )
|
|