import json
import logging
import os

import gradio as gr

from backend.tools.pdf_tools import resolve_pdf, download_pdf, read_pdf, chunk_text
from .utils import format_error

_logger = logging.getLogger("pdf_tab")

# Number of characters of the first chunk shown after a successful vectorization.
_PREVIEW_CHARS = 500

# Lazy-load VectorStore to avoid OOM at startup
_vector_store = None


def _get_vector_store():
    """Return the shared VectorStore instance, creating it on first use.

    The import and construction are deferred until the first call. If either
    fails, a warning is logged and ``None`` is returned; nothing is cached in
    that case, so the next call tries again.
    """
    global _vector_store
    if _vector_store is None:
        try:
            from backend.vector_store import VectorStore

            _vector_store = VectorStore()
        except Exception as e:
            # Best effort: the tab keeps working without vectorization.
            _logger.warning("VectorStore unavailable: %s", e)
            return None
    return _vector_store


class _StepError(Exception):
    """Carry a ready-to-display Markdown error message out of a pipeline step."""


async def _load_pdf(source, resolve_error_prefix=""):
    """Resolve (when needed), download and read a PDF.

    Args:
        source: A URL, or any other identifier that ``resolve_pdf`` accepts.
            Anything not starting with ``"http"`` is resolved first.
        resolve_error_prefix: Text inserted before the resolver's own error
            message, so each action keeps its original wording.

    Returns:
        A ``(pdf_url, read_result)`` tuple, where ``pdf_url`` is the URL that
        was actually downloaded and ``read_result`` is the dict returned by
        ``read_pdf``.

    Raises:
        _StepError: If any step reports an ``"error"`` key, or if resolution
            succeeds without providing a ``"pdfUrl"``.
    """
    if not source.startswith("http"):
        resolved = await resolve_pdf(source)
        if "error" in resolved:
            raise _StepError(f"**Error:** {resolve_error_prefix}{resolved['error']}")
        source = resolved.get("pdfUrl")
        if not source:
            # Without this guard None would be handed to download_pdf.
            raise _StepError(
                "**Error:** No se pudo resolver a un PDF: "
                "la resolución no devolvió ninguna URL."
            )

    downloaded = await download_pdf(source)
    if "error" in downloaded:
        raise _StepError(f"**Error:** {downloaded['error']}")

    parsed = await read_pdf(downloaded["path"])
    if "error" in parsed:
        raise _StepError(f"**Error:** {parsed['error']}")

    return source, parsed


async def _run_resolve(source):
    """Resolve *source* and render the resulting URL and resolution steps."""
    result = await resolve_pdf(source)
    if "error" in result:
        raise _StepError(f"**Error:** {result['error']}")

    steps = result.get("steps") or []
    # `or` also covers a key that is present but None, which .get's default
    # does not; calling .startswith on None would raise.
    pdf_url = result.get("pdfUrl") or "N/A"

    output = f"## Resultado\n\n**URL:** {pdf_url}\n\n**Pasos:**\n" + "\n".join(
        f"- {s}" for s in steps
    )
    if pdf_url.startswith("http"):
        output += f"\n\n[📄 Abrir PDF]({pdf_url})"
    return output


async def _run_read(source):
    """Download and read the PDF, returning its page count and a preview."""
    _, parsed = await _load_pdf(
        source, resolve_error_prefix="No se pudo resolver a un PDF: "
    )
    return (
        f"## Lectura Exitosa\n\n**Páginas:** {parsed['pages']}\n\n"
        f"**Previsualización:**\n\n```text\n{parsed['preview']}...\n```"
    )


async def _run_vectorize(source):
    """Read the PDF, split it into chunks and store them in the vector store."""
    pdf_url, parsed = await _load_pdf(source)
    chunks = chunk_text(parsed["text"])
    if not chunks:
        # Nothing to store, and chunks[0] below would raise IndexError.
        raise _StepError(
            "**Error:** El PDF no contiene texto extraíble para vectorizar."
        )

    # Store in the vector store (ChromaDB, per the UI text).
    # NOTE(review): ids derive from the URL basename only, so two documents
    # whose URLs end in the same file name share ids. Confirm how
    # VectorStore.add_documents treats duplicate ids.
    doc_key = os.path.basename(pdf_url)
    ids = [f"{doc_key}_{i}" for i in range(len(chunks))]
    metadatas = [{"source": pdf_url, "chunk": i} for i in range(len(chunks))]

    store = _get_vector_store()
    if store is None:
        return (
            f"## PDF Procesado (sin vectorización)\n\nEl PDF se leyó correctamente "
            f"({len(chunks)} fragmentos) pero el motor vectorial no está disponible "
            f"por limitaciones de memoria."
        )

    store.add_documents(chunks, metadatas, ids)
    return (
        f"## Vectorización Exitosa\n\nEl PDF se ha dividido en **{len(chunks)}** "
        f"fragmentos y se ha guardado en la base de datos local ChromaDB.\n\n"
        f"Ejemplo de fragmento 0:\n\n```text\n{chunks[0][:_PREVIEW_CHARS]}...\n```"
    )


async def pdf_handler(source, action):
    """Run one PDF action for the UI and return Markdown describing the outcome.

    Args:
        source: URL or DOI typed by the user. Surrounding whitespace is ignored.
        action: One of ``"resolve"``, ``"read"`` or ``"vectorize"``.

    Returns:
        A Markdown string with the result, or an ``**Error:**`` message. An
        unexpected exception is passed to ``format_error`` and its return
        value is returned instead; this function does not raise.
    """
    if isinstance(source, str):
        # A pasted URL with stray whitespace would otherwise fail the
        # startswith("http") check and be sent to the resolver.
        source = source.strip()
    if not source:
        return "**Error:** Proporcione una URL o DOI."

    try:
        if action == "resolve":
            return await _run_resolve(source)
        if action == "read":
            return await _run_read(source)
        if action == "vectorize":
            return await _run_vectorize(source)
        return "**Error:** Acción no válida."
    except _StepError as step_error:
        return str(step_error)
    except Exception as e:
        # Top-level UI boundary: show the failure instead of raising.
        return format_error(e)


def create_pdf_tab():
    """Build the "PDF (Local)" tab and wire its button to ``pdf_handler``.

    Creates Gradio components, so it has to be called while a Gradio layout
    context is open.
    """
    with gr.Tab("📄 PDF (Local)", id="pdf"):
        gr.Markdown("## Procesamiento Nativo de PDF y Vectorización")
        gr.Markdown("*Sin dependencias de Next.js - Usa PyMuPDF y ChromaDB local*")

        with gr.Row():
            with gr.Column(scale=2):
                source = gr.Textbox(
                    label="URL del PDF / DOI",
                    placeholder="https://arxiv.org/pdf/2301.00001.pdf",
                )
                action = gr.Radio(
                    choices=["resolve", "read", "vectorize"],
                    value="resolve",
                    label="Acción",
                )
                pdf_btn = gr.Button("▶ Ejecutar Acción", variant="primary", size="lg")

            with gr.Column(scale=3):
                output_md = gr.Markdown("")

        pdf_btn.click(fn=pdf_handler, inputs=[source, action], outputs=[output_md])