File size: 10,177 Bytes
94f5502
 
 
 
 
 
 
 
 
8b4671d
94f5502
 
 
 
 
8b4671d
94f5502
 
 
 
 
8b4671d
 
 
 
 
94f5502
 
 
8b4671d
 
 
94f5502
 
8b4671d
94f5502
 
 
 
 
 
 
 
8b4671d
94f5502
 
8b4671d
 
 
 
 
94f5502
 
8b4671d
 
94f5502
 
 
 
 
 
 
8b4671d
94f5502
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b4671d
94f5502
 
 
 
 
 
 
 
 
 
 
 
 
8b4671d
94f5502
 
 
 
 
8b4671d
94f5502
 
 
 
 
8b4671d
94f5502
 
 
 
 
 
 
 
 
 
8b4671d
94f5502
 
 
 
8b4671d
94f5502
 
 
 
 
 
8b4671d
 
94f5502
 
 
 
 
8b4671d
94f5502
8b4671d
94f5502
 
 
 
 
 
 
 
8b4671d
94f5502
 
 
8b4671d
94f5502
8b4671d
94f5502
 
 
 
8b4671d
 
 
 
 
 
 
 
 
 
94f5502
8b4671d
94f5502
 
8b4671d
94f5502
8b4671d
 
 
 
 
 
 
 
 
 
 
94f5502
 
 
 
 
 
 
 
8b4671d
94f5502
 
 
8b4671d
 
94f5502
 
 
8b4671d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94f5502
 
8b4671d
94f5502
 
 
 
8b4671d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
import os
import re
from pathlib import Path
from typing import List, Tuple

import numpy as np
import faiss
import gradio as gr

# Leitura do PDF
try:
    from pypdf import PdfReader  # pypdf é leve e confiável para extração de texto
except Exception:
    PdfReader = None

# Embeddings e LLM (API NVIDIA estilo OpenAI)
from sentence_transformers import SentenceTransformer
from openai import OpenAI, OpenAIError

"""
DFSORT RAG – Assistente em Português (Gradio)
---------------------------------------------
• Interface totalmente em português.
• Botões "Enviar" e "Limpar" no chat.
• Página enquadrada (layout responsivo) para tudo ficar visível.
• RAG simples: FAISS + MiniLM sobre o PDF fornecido (somente ele como fonte).
"""

# ===================== Configuration =====================
APP_TITLE   = "DFSORT RAG (PDF)"
PDF_PATH    = "ice2ca11.pdf"   # adjust if the PDF has a different name/path
INDEX_FILE  = "r_docs.index"   # persisted FAISS index file
CHUNKS_FILE = "r_chunks.npy"   # persisted text chunks (numpy object array)

# Chat model served by the NVIDIA OpenAI-compatible endpoint.
CHAT_MODEL = "meta/llama3-8b-instruct"
NV_API_KEY = os.environ.get("NV_API_KEY")
if not NV_API_KEY:
    # Fail fast at import time: nothing below works without the API key.
    raise RuntimeError("🔒 NV_API_KEY não definido. Configure em Settings → Variables & Secrets.")

client = OpenAI(base_url="https://integrate.api.nvidia.com/v1", api_key=NV_API_KEY)

# Embedding model (downloaded on first use by sentence-transformers).
EMB_MODEL_NAME  = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(EMB_MODEL_NAME)

# Global state, populated lazily on the first request (see ensure_index_loaded).
faiss_index = None
pdf_chunks  = None

# ===================== Indexing from the PDF =====================

def _pdf_to_text_chunks(pdf_path: str, max_chunk_chars: int = 1200) -> List[str]:
    """Extrai texto do PDF e cria chunks (~max_chunk_chars) para o RAG.
    - Divide por páginas; normaliza espaços/linhas; agrega em blocos.
    """
    path = Path(pdf_path)
    if not path.exists():
        raise FileNotFoundError(f"PDF não encontrado: {pdf_path}")

    raw_pages: List[str] = []
    if PdfReader is None:
        # fallback tosco se pypdf faltar (não recomendado)
        with open(path, "rb") as f:
            data = f.read()
        text = data.decode(errors="ignore")
        raw_pages = re.split(r"\f|\n\s*\n", text)
    else:
        reader = PdfReader(str(path))
        for pg in reader.pages:
            try:
                raw = pg.extract_text() or ""
            except Exception:
                raw = ""
            raw_pages.append(raw)

    blocks: List[str] = []
    for page_txt in raw_pages:
        if not page_txt:
            continue
        t = re.sub(r"[ \t]+", " ", page_txt)
        t = re.sub(r"\n{2,}", "\n\n", t).strip()
        parts = re.split(r"\n\n+|\n• |\n- ", t)
        blocks.extend(p.strip() for p in parts if p and p.strip())

    chunks: List[str] = []
    buf: List[str] = []
    size = 0
    for b in blocks:
        if size + len(b) + 1 > max_chunk_chars:
            if buf:
                chunks.append("\n".join(buf))
            buf = [b]
            size = len(b)
        else:
            buf.append(b)
            size += len(b) + 1
    if buf:
        chunks.append("\n".join(buf))

    # remover pedaços muito curtos
    chunks = [c.strip() for c in chunks if len(c.strip()) > 50]
    return chunks


def build_or_load_index(pdf_path: str, index_path: str, chunks_path: str) -> Tuple[faiss.IndexFlatIP, np.ndarray]:
    """Load the FAISS index and chunk store from disk, or build them from the PDF.

    Embeddings are L2-normalized, so inner-product search over IndexFlatIP
    behaves as cosine similarity.
    """
    cached = Path(index_path).exists() and Path(chunks_path).exists()
    if cached:
        return faiss.read_index(index_path), np.load(chunks_path, allow_pickle=True)

    # Cold start: chunk the PDF, embed, index, and persist everything.
    texts = _pdf_to_text_chunks(pdf_path)
    vectors = embedding_model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    faiss.write_index(index, index_path)
    chunk_array = np.array(texts, dtype=object)
    np.save(chunks_path, chunk_array)
    return index, chunk_array


# ===================== Recuperação + LLM =====================

def retrieve_context(query: str, index: faiss.IndexFlatIP, chunks: np.ndarray, k: int = 6) -> str:
    """Return the *k* chunks most similar to *query*, joined by '---' separators."""
    query_vec = embedding_model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    _, neighbor_ids = index.search(query_vec, k)
    # FAISS pads missing neighbors with -1; keep only valid chunk indices.
    hits = [str(chunks[i]) for i in neighbor_ids[0] if 0 <= i < len(chunks)]
    return "\n---\n".join(hits)


def nv_complete(messages, temperature: float, top_p: float, max_tokens: int) -> str:
    """Run one non-streaming chat completion against the NVIDIA endpoint.

    Returns the assistant message text with surrounding whitespace stripped.
    Raises OpenAIError on API failure (handled by the caller).
    """
    completion = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=messages,
        stream=False,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    answer = completion.choices[0].message.content
    return answer.strip()


def make_system_prompt(ctx: str) -> str:
    """Build the system prompt that pins answers to the retrieved PDF context *ctx*."""
    header = (
        "Você é um assistente especializado em DFSORT (IBM z/OS).\n"
        "Responda **apenas** com base no contexto recuperado do PDF.\n"
        "Se a informação não estiver no contexto, diga que não sabe.\n\n"
    )
    context_block = f"=== Contexto (trechos do PDF) ===\n{ctx}\n\n"
    footer = "Quando der exemplos, forneça JCL/SYSIN curtos e claros."
    return header + context_block + footer


# ===================== Handlers do Chat =====================

def ensure_index_loaded():
    """Lazily build/load the FAISS index and chunk store on first use."""
    global faiss_index, pdf_chunks
    if faiss_index is not None and pdf_chunks is not None:
        return  # already loaded
    faiss_index, pdf_chunks = build_or_load_index(PDF_PATH, INDEX_FILE, CHUNKS_FILE)


def on_send(user_msg, history, temperature, top_p, max_tokens, k):
    """Run one RAG turn: retrieve PDF context, query the LLM, extend history.

    Returns the updated messages-format history and an empty string that
    clears the input textbox.
    """
    ensure_index_loaded()
    history = history or []

    question = (user_msg or "").strip()
    if not question:
        # Nothing to ask: keep the chat unchanged, just clear the box.
        return history, ""

    context = retrieve_context(question, faiss_index, pdf_chunks, k=int(k))
    prompt = [
        {"role": "system", "content": make_system_prompt(context)},
        {"role": "user", "content": question},
    ]

    try:
        reply = nv_complete(prompt, float(temperature), float(top_p), int(max_tokens))
    except OpenAIError as e:
        # Surface API failures in the chat instead of crashing the UI.
        reply = f"⚠️ Erro da API: {e.__class__.__name__}: {e}"

    updated = history + [
        {"role": "user", "content": question},
        {"role": "assistant", "content": reply},
    ]
    return updated, ""  # second value clears the textbox


def on_clear():
    """Reset the chat window (empty history) and the input textbox."""
    empty_history: list = []
    return empty_history, ""


def rebuild_index_action():
    """Force a rebuild of the FAISS index and chunk store from the PDF.

    BUG FIX: ``build_or_load_index`` returns the cached files whenever they
    exist on disk, so the previous version of this handler never actually
    rebuilt anything. Deleting the cached artifacts first forces a fresh
    extraction/embedding pass, which is what the "Reconstruir índice" button
    promises.
    """
    global faiss_index, pdf_chunks
    for stale in (INDEX_FILE, CHUNKS_FILE):
        # missing_ok: first run may have no cache yet.
        Path(stale).unlink(missing_ok=True)
    faiss_index, pdf_chunks = build_or_load_index(PDF_PATH, INDEX_FILE, CHUNKS_FILE)
    return "✅ Índice reconstruído com sucesso a partir do PDF."


# ===================== UI (Gradio) =====================
custom_css = r"""
:root { --primary:#2156d9; --bg:#f8fafc; --ink:#0f172a; }
body { background: var(--bg); color: var(--ink); }
.container { max-width: 1200px; margin: 0 auto; }
#chatbox { height: 70vh; overflow-y: auto; border:1px solid #cbd5e1; border-radius:8px; padding:0.5rem; }
"""

with gr.Blocks(title=APP_TITLE, css=custom_css, theme=gr.themes.Base()) as demo:
    with gr.Column(elem_classes="container"):
        gr.Markdown(f"## {APP_TITLE}")
        gr.Markdown(
            "Assistente **RAG** sobre **DFSORT**, usando **apenas** o PDF fornecido. "
            "Se algo não estiver no PDF, eu aviso que não sei."
        )

        with gr.Row():
            # ===== Main column (chat) =====
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(type="messages", elem_id="chatbox", height=560)

                user_box = gr.Textbox(placeholder="Pergunte algo sobre DFSORT… ex.: Como uso INCLUDE COND?", lines=2)
                with gr.Row():
                    btn_send  = gr.Button("Enviar", variant="primary")
                    btn_clear = gr.Button("Limpar")

                with gr.Row():
                    temperature = gr.Slider(0, 1, 0.4, step=0.05, label="Temperature")
                    top_p       = gr.Slider(0, 1, 0.95, step=0.01, label="Top-p")
                with gr.Row():
                    max_tokens  = gr.Slider(128, 4096, 768, step=64, label="Max Tokens")
                    k_chunks    = gr.Slider(2, 12, 6, step=1, label="Trechos (k)")

                # BUG FIX: the previous wiring read the history from a gr.State
                # that no event ever wrote back, so on_send always received an
                # empty list and the conversation context was lost every turn.
                # The Chatbot component already holds the messages-format
                # history, so feed it back in as the history input.
                send_inputs = [user_box, chatbot, temperature, top_p, max_tokens, k_chunks]
                btn_send.click(on_send, inputs=send_inputs, outputs=[chatbot, user_box])
                user_box.submit(on_send, inputs=send_inputs, outputs=[chatbot, user_box])
                btn_clear.click(on_clear, outputs=[chatbot, user_box])

            # ===== Side column (index control and tips) =====
            with gr.Column(scale=2):
                gr.Markdown("### Controlo do índice")
                # Fixed mismatched parentheses/spacing in the label below.
                gr.Markdown(f"PDF atual (DFSORT Application Programming Guide): `{PDF_PATH}`")
                btn_rebuild = gr.Button("Reconstruir índice a partir do PDF")
                msg = gr.Markdown()
                btn_rebuild.click(rebuild_index_action, [], [msg])

                gr.Markdown("---")
                gr.Markdown("### Dicas de consulta")
                gr.Markdown(
                    "- Ex.: `Ordenar por 10 bytes a partir da posição 1 (CH, A).`\n"
                    "- Ex.: `Como faço para eliminar duplicados com SUM FIELDS=NONE?`\n"
                    "- Ex.: `JOINKEYS: explique o uso de REFORMAT.`\n"
                    "- Ex.: `Exemplo de OUTFIL com cabeçalho e REMOVECC.`"
                )

if __name__ == "__main__":
    # Build the index on first run (when no cache exists) before serving,
    # so the first chat request does not pay the indexing cost.
    cache_ready = Path(INDEX_FILE).exists() and Path(CHUNKS_FILE).exists()
    if not cache_ready:
        print("[i] Construindo índice a partir do PDF…")
        faiss_index, pdf_chunks = build_or_load_index(PDF_PATH, INDEX_FILE, CHUNKS_FILE)
        print("[i] Índice criado.")
    demo.launch(server_name="0.0.0.0", server_port=7860)