letxinet / modules /pdf_tab.py
C2MV's picture
Initial upload for Build Small Hackathon
68fb5e2 verified
Raw
History Blame Contribute Delete
4.21 kB
import gradio as gr
import json
import os
from backend.tools.pdf_tools import resolve_pdf, download_pdf, read_pdf, chunk_text
from .utils import format_error
# The VectorStore is built on first use instead of at import time, to avoid
# running out of memory (OOM) during application startup.
_vector_store = None


def _get_vector_store():
    """Return the module-wide VectorStore, creating it on the first call.

    Returns:
        The cached ``VectorStore`` instance, or ``None`` if importing or
        constructing it raised. A failure is logged as a warning and is not
        cached, so the next call attempts the construction again.
    """
    global _vector_store

    # Fast path: an instance was already created by an earlier call.
    if _vector_store is not None:
        return _vector_store

    try:
        # Imported here (not at module top) so the import cost is only paid
        # when a vector store is actually requested.
        from backend.vector_store import VectorStore

        _vector_store = VectorStore()
    except Exception as e:
        import logging

        logging.getLogger("pdf_tab").warning(f"VectorStore unavailable: {e}")
        return None

    return _vector_store
async def _load_pdf(source, resolve_error_prefix=""):
    """Resolve *source* to a PDF URL if needed, then download and read it.

    Args:
        source: A string starting with ``http`` (used as-is), or any other
            identifier, which is first passed to ``resolve_pdf``.
        resolve_error_prefix: Text placed before the resolver's error
            message so each caller can keep its own wording.

    Returns:
        A ``(url, read_result, error)`` tuple. On success ``error`` is
        ``None`` and ``read_result`` is the dict returned by ``read_pdf``.
        On failure ``read_result`` is ``None`` and ``error`` is the Markdown
        message to display.
    """
    if not source.startswith("http"):
        resolved = await resolve_pdf(source)
        if "error" in resolved:
            return source, None, f"**Error:** {resolve_error_prefix}{resolved['error']}"
        source = resolved.get("pdfUrl")
        if not source:
            # The resolver reported no error but also gave no URL; stop here
            # instead of handing None to download_pdf.
            return source, None, "**Error:** No se pudo resolver a un PDF."

    downloaded = await download_pdf(source)
    if "error" in downloaded:
        return source, None, f"**Error:** {downloaded['error']}"

    read_res = await read_pdf(downloaded["path"])
    if "error" in read_res:
        return source, None, f"**Error:** {read_res['error']}"

    return source, read_res, None


async def pdf_handler(source, action):
    """Run one PDF action and return its result as a Markdown string.

    Args:
        source: URL or other identifier (the UI labels it "URL del PDF /
            DOI"). Surrounding whitespace is ignored.
        action: One of ``"resolve"``, ``"read"`` or ``"vectorize"``.

    Returns:
        Markdown describing the outcome. Failures are reported in the
        returned text rather than raised: any exception inside the action is
        passed to ``format_error``.
    """
    # Pasted URLs/DOIs often carry stray whitespace; treat blank as empty.
    if isinstance(source, str):
        source = source.strip()
    if not source:
        return "**Error:** Proporcione una URL o DOI."

    try:
        if action == "resolve":
            result = await resolve_pdf(source)
            if "error" in result:
                return f"**Error:** {result['error']}"
            steps = result.get("steps") or []
            # ``or`` (not a .get default) so an explicit null pdfUrl also
            # falls back to "N/A" instead of breaking .startswith below.
            pdf_url = result.get("pdfUrl") or "N/A"
            output = (
                f"## Resultado\n\n**URL:** {pdf_url}\n\n**Pasos:**\n"
                + "\n".join(f"- {s}" for s in steps)
            )
            if pdf_url.startswith("http"):
                output += f"\n\n[📄 Abrir PDF]({pdf_url})"
            return output

        if action == "read":
            _, read_res, error = await _load_pdf(
                source, "No se pudo resolver a un PDF: "
            )
            if error:
                return error
            return (
                f"## Lectura Exitosa\n\n**Páginas:** {read_res['pages']}\n\n"
                f"**Previsualización:**\n\n```text\n{read_res['preview']}...\n```"
            )

        if action == "vectorize":
            source, read_res, error = await _load_pdf(source)
            if error:
                return error

            chunks = chunk_text(read_res["text"])
            if not chunks:
                # Nothing to index or preview (e.g. no extractable text);
                # report it instead of failing on chunks[0].
                return "**Error:** No se pudo extraer texto del PDF para vectorizar."

            # Store in ChromaDB.
            # NOTE(review): IDs derive from the URL's last path segment, so
            # two different sources sharing that segment (or ending in "/")
            # would produce the same IDs — confirm whether that is acceptable.
            base_name = os.path.basename(source)
            ids = [f"{base_name}_{i}" for i in range(len(chunks))]
            metadatas = [{"source": source, "chunk": i} for i in range(len(chunks))]

            vs = _get_vector_store()
            if vs is None:
                return (
                    "## PDF Procesado (sin vectorización)\n\n"
                    f"El PDF se leyó correctamente ({len(chunks)} fragmentos) "
                    "pero el motor vectorial no está disponible por limitaciones de memoria."
                )

            vs.add_documents(chunks, metadatas, ids)
            return (
                "## Vectorización Exitosa\n\n"
                f"El PDF se ha dividido en **{len(chunks)}** fragmentos y se ha "
                "guardado en la base de datos local ChromaDB.\n\n"
                f"Ejemplo de fragmento 0:\n\n```text\n{chunks[0][:500]}...\n```"
            )

        return "**Error:** Acción no válida."
    except Exception as e:
        return format_error(e)
def create_pdf_tab():
    """Build the "PDF (Local)" tab and connect its button to ``pdf_handler``.

    The tab holds two columns: inputs (source textbox, action selector, run
    button) on the left and a Markdown output area on the right. Clicking
    the button calls ``pdf_handler`` with the textbox and radio values and
    renders the returned Markdown.

    NOTE(review): presumably called while an enclosing Gradio layout is
    being built — confirm against the caller.
    """
    with gr.Tab("📄 PDF (Local)", id="pdf"):
        gr.Markdown("## Procesamiento Nativo de PDF y Vectorización")
        gr.Markdown("*Sin dependencias de Next.js - Usa PyMuPDF y ChromaDB local*")

        with gr.Row():
            # Left column: user inputs and the trigger button.
            with gr.Column(scale=2):
                source_box = gr.Textbox(
                    label="URL del PDF / DOI",
                    placeholder="https://arxiv.org/pdf/2301.00001.pdf",
                )
                action_radio = gr.Radio(
                    choices=["resolve", "read", "vectorize"],
                    value="resolve",
                    label="Acción",
                )
                run_button = gr.Button(
                    "▶ Ejecutar Acción",
                    variant="primary",
                    size="lg",
                )

            # Right column: rendered result of the selected action.
            with gr.Column(scale=3):
                result_md = gr.Markdown("")

        run_button.click(
            fn=pdf_handler,
            inputs=[source_box, action_radio],
            outputs=[result_md],
        )