Spaces:

Geoeasy
/

DFSORT

Sleeping

App Files Files Community

Geoeasy committed on Sep 18, 2025

Commit

15ca720

verified ·

1 Parent(s): bd14dc0

Upload 5 files

Browse files

Files changed (6) hide show

.gitattributes +2 -0
app.py +273 -0
ice2ca11.pdf +3 -0
r_chunks.npy +3 -0
r_docs.index +3 -0
requirements.txt +17 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ice2ca11.pdf filter=lfs diff=lfs merge=lfs -text
+r_docs.index filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,273 @@

+import os
+import io
+import re
+import time
+from pathlib import Path
+from typing import List, Tuple
+import numpy as np
+import faiss
+import gradio as gr
+# Para leitura do PDF
+try:
+    from pypdf import PdfReader  # pypdf é leve e confiável para extração de texto
+except Exception:
+    # fallback simples se pypdf não estiver disponível
+    PdfReader = None
+# Embeddings e LLM (NVIDIA API estilo OpenAI)
+from sentence_transformers import SentenceTransformer
+from openai import OpenAI, OpenAIError
+"""
+===============================================================================
+DFSORT RAG – Assistente em Português (Gradio)
+-------------------------------------------------------------------------------
+• Objetivo: responder sobre DFSORT (IBM z/OS) usando **apenas** o PDF fornecido como
+  base de conhecimento (RAG — Retrieval Augmented Generation).
+• Tudo em português: interface, comentários e mensagens do sistema.
+• Sem conteúdos de CV ou outros temas. Foco total em DFSORT.
+• O app cria o índice (FAISS + embeddings MiniLM) automaticamente na primeira execução.
+Como usar
+1) Garanta que o PDF esteja disponível. Por padrão este script usa:
+   - PDF_PATH = "ice2ca11.pdf" (você pode alterar o caminho abaixo)
+2) Execute o script. Na primeira execução, ele extrai o texto do PDF e cria:
+   - r_docs.index  (FAISS)
+   - r_chunks.npy  (lista de trechos do PDF)
+3) Interaja no chat. O modelo responde **somente** com base nos trechos recuperados.
+Requisitos (pip):
+    pip install gradio pypdf faiss-cpu sentence-transformers openai
+==========================================================================
+ATENÇÃO SOBRE KEYS
+- Configure a variável de ambiente NV_API_KEY com a sua chave da NVIDIA
+  (API OpenAI-compatible em https://integrate.api.nvidia.com/v1).
+==========================================================================
+"""
+# ===================== Settings =====================
+# Title shown in the browser tab and at the top of the page.
+APP_TITLE = "DFSORT RAG (PDF)"
+PDF_PATH = "ice2ca11.pdf"  # the supplied PDF; change the path if needed
+# Cache files written by build_or_load_index(): the FAISS index and the chunk array.
+INDEX_FILE = "r_docs.index"
+CHUNKS_FILE = "r_chunks.npy"
+# Chat model served by NVIDIA (can be swapped for another supported model)
+CHAT_MODEL = "meta/llama3-8b-instruct"
+NV_API_KEY = os.environ.get("NV_API_KEY")
+# Fail fast at import time when the key is missing.
+if not NV_API_KEY:
+    raise RuntimeError("🔒 NV_API_KEY não definido. Configure em Settings → Variables & Secrets.")
+# OpenAI-compatible client pointed at the NVIDIA endpoint.
+client = OpenAI(base_url="https://integrate.api.nvidia.com/v1", api_key=NV_API_KEY)
+# Embedding model (downloaded on first use); instantiated once at import time.
+EMB_MODEL_NAME = "all-MiniLM-L6-v2"
+embedding_model = SentenceTransformer(EMB_MODEL_NAME)
+# ===================== Pipeline de Indexação =====================
+def _pdf_to_text_chunks(pdf_path: str, max_chunk_chars: int = 1200) -> List[str]:
+    """Lê o PDF e cria chunks de texto amigáveis ao RAG.
+    - Divide por páginas e quebras duplas de linha.
+    - Faz um 'merge' simples até atingir ~max_chunk_chars.
+    - Remove linhas vazias e normaliza espaços.
+    """
+    path = Path(pdf_path)
+    if not path.exists():
+        raise FileNotFoundError(f"PDF não encontrado: {pdf_path}")
+    raw_pages: List[str] = []
+    if PdfReader is None:
+        # fallback: ler bytes e tentar split muito simples (não ideal)
+        with open(path, "rb") as f:
+            data = f.read()
+        text = data.decode(errors="ignore")
+        raw_pages = re.split(r"\f|\n\s*\n", text)
+    else:
+        reader = PdfReader(str(path))
+        for pg in reader.pages:
+            try:
+                raw = pg.extract_text() or ""
+            except Exception:
+                raw = ""
+            raw_pages.append(raw)
+    # limpeza e chunking
+    blocks: List[str] = []
+    for page_txt in raw_pages:
+        if not page_txt:
+            continue
+        # normalizações leves
+        t = re.sub(r"[ \t]+", " ", page_txt)
+        t = re.sub(r"\n{2,}", "\n\n", t).strip()
+        # quebra por parágrafos duplos ou linhas
+        parts = re.split(r"\n\n+|\n• |\n- ", t)
+        blocks.extend(p.strip() for p in parts if p and p.strip())
+    # juntar em chunks de tamanho alvo
+    chunks: List[str] = []
+    buf = []
+    size = 0
+    for b in blocks:
+        if size + len(b) + 1 > max_chunk_chars:
+            if buf:
+                chunks.append("\n".join(buf))
+            buf = [b]
+            size = len(b)
+        else:
+            buf.append(b)
+            size += len(b) + 1
+    if buf:
+        chunks.append("\n".join(buf))
+    # reforço: remover pedaços muito curtos
+    chunks = [c.strip() for c in chunks if len(c.strip()) > 50]
+    return chunks
+def build_or_load_index(pdf_path: str, index_path: str, chunks_path: str) -> Tuple[faiss.IndexFlatIP, np.ndarray]:
+    """Cria ou carrega o índice FAISS e os chunks."""
+    if Path(index_path).exists() and Path(chunks_path).exists():
+        index = faiss.read_index(index_path)
+        chunks = np.load(chunks_path, allow_pickle=True)
+        return index, chunks
+    # construir
+    chunks_list = _pdf_to_text_chunks(pdf_path)
+    emb = embedding_model.encode(chunks_list, convert_to_numpy=True, normalize_embeddings=True)
+    d = emb.shape[1]
+    index = faiss.IndexFlatIP(d)
+    index.add(emb)
+    faiss.write_index(index, index_path)
+    np.save(chunks_path, np.array(chunks_list, dtype=object))
+    return index, np.array(chunks_list, dtype=object)
+# ===================== Recuperação + Chat =====================
+def retrieve_context(query: str, index: faiss.IndexFlatIP, chunks: np.ndarray, k: int = 6) -> str:
+    q = embedding_model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
+    scores, idxs = index.search(q, k)
+    parts = []
+    for i in idxs[0]:
+        if 0 <= i < len(chunks):
+            parts.append(str(chunks[i]))
+    return "\n---\n".join(parts)
+def nv_stream(messages, temperature: float, top_p: float, max_tokens: int):
+    """Streaming de resposta do modelo NVIDIA (compatível com OpenAI)."""
+    assistant_reply = ""
+    stream = client.chat.completions.create(
+        model=CHAT_MODEL,
+        messages=messages,
+        temperature=temperature,
+        top_p=top_p,
+        max_tokens=max_tokens,
+        stream=True,
+    )
+    for chunk in stream:
+        delta = chunk.choices[0].delta
+        if hasattr(delta, "content") and delta.content:
+            assistant_reply += delta.content
+            yield assistant_reply
+def make_system_prompt(ctx: str) -> str:
+    return (
+        "Você é um assistente especializado em DFSORT (IBM z/OS).\n"
+        "Responda **apenas** com base no contexto recuperado do PDF.\n"
+        "Se a informação não estiver no contexto, diga que não sabe.\n\n"
+        f"=== Contexto (trechos do PDF) ===\n{ctx}\n\n"
+        "Ao mostrar exemplos, prefira JCL/SYSIN claros e curtos."
+    )
+# ===================== UI (Gradio) =====================
+def chatbot_ui(user_input: str, temperature: float, top_p: float, max_tokens: int, k: int):
+    if not user_input or not user_input.strip():
+        return ""
+    # garanta índice carregado
+    global faiss_index, pdf_chunks
+    if faiss_index is None or pdf_chunks is None:
+        faiss_index, pdf_chunks = build_or_load_index(PDF_PATH, INDEX_FILE, CHUNKS_FILE)
+    ctx = retrieve_context(user_input, faiss_index, pdf_chunks, k=k)
+    sys_msg = {"role": "system", "content": make_system_prompt(ctx)}
+    usr_msg = {"role": "user", "content": user_input}
+    # streaming para UX fluida
+    try:
+        out = ""
+        for partial in nv_stream([sys_msg, usr_msg], temperature, top_p, max_tokens):
+            out = partial
+        return out
+    except OpenAIError as e:
+        return f"⚠️ Erro da API: {e.__class__.__name__}: {e}"
+def rebuild_index_action():
+    global faiss_index, pdf_chunks
+    faiss_index, pdf_chunks = build_or_load_index(PDF_PATH, INDEX_FILE, CHUNKS_FILE)
+    return "✅ Índice reconstruído com sucesso a partir do PDF."
+# Global state, loaded on demand (filled by chatbot_ui / rebuild_index_action)
+faiss_index = None
+pdf_chunks = None
+# Custom CSS passed to gr.Blocks below: colour variables, page colours and a
+# fixed-height scrollable area for the element with id "chatbox".
+custom_css = r"""
+:root { --primary:#2156d9; --bg:#f8fafc; --ink:#0f172a; }
+body { background: var(--bg); color: var(--ink); }
+#chatbox { height: 60vh; overflow-y: auto; }
+"""
+# Page layout: a title and intro, then a row with the chat (left, wider column)
+# and the index controls plus query tips (right column).
+with gr.Blocks(title=APP_TITLE, css=custom_css, theme=gr.themes.Base()) as demo:
+    gr.Markdown(f"## {APP_TITLE}")
+    gr.Markdown(
+        "Este assistente responde sobre **DFSORT** usando apenas o PDF como fonte. "
+        "Se algo não estiver no PDF, ele informa que não sabe."
+    )
+    with gr.Row():
+        with gr.Column(scale=3):
+            # The chat history (`hist`) is not forwarded: chatbot_ui receives
+            # only the current message and the four slider values.
+            chat = gr.ChatInterface(
+                fn=lambda msg, hist, t, p, mt, k: chatbot_ui(msg, t, p, mt, k),
+                additional_inputs=[
+                    gr.Slider(0, 1, 0.4, label="Temperature"),
+                    gr.Slider(0, 1, 0.95, label="Top-p"),
+                    gr.Slider(128, 4096, 768, step=64, label="Max Tokens"),
+                    gr.Slider(2, 12, 6, step=1, label="Trechos (k)")
+                ],
+                multimodal=False,
+                title="Chat DFSORT (RAG)",
+                textbox=gr.Textbox(placeholder="Pergunte algo sobre DFSORT… ex.: Como uso INCLUDE COND?"),
+                cache_examples=False,
+            )
+        with gr.Column(scale=2):
+            gr.Markdown("### Controlo do índice")
+            gr.Markdown(f"PDF atual: `{PDF_PATH}`")
+            btn_rebuild = gr.Button("Reconstruir índice a partir do PDF")
+            # Status area that receives the message returned by rebuild_index_action().
+            msg = gr.Markdown()
+            btn_rebuild.click(lambda: rebuild_index_action(), [], [msg])
+            gr.Markdown("---")
+            gr.Markdown("### Dicas de consulta (direto do PDF)")
+            gr.Markdown(
+                "- Ex.: `Ordenar por 10 bytes a partir da posição 1 (CH, A).`\n"
+                "- Ex.: `Como faço para eliminar duplicados com SUM FIELDS=NONE?`\n"
+                "- Ex.: `JOINKEYS: explique o uso de REFORMAT.`\n"
+                "- Ex.: `Exemplo de OUTFIL com cabeçalho e REMOVECC.`"
+            )
+if __name__ == "__main__":
+    # cria índice na primeira execução
+    if not Path(INDEX_FILE).exists() or not Path(CHUNKS_FILE).exists():
+        print("[i] Construindo índice a partir do PDF…")
+        faiss_index, pdf_chunks = build_or_load_index(PDF_PATH, INDEX_FILE, CHUNKS_FILE)
+        print("[i] Índice criado.")
+    demo.launch(server_name="0.0.0.0", server_port=7861)

ice2ca11.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a6601d98cbfc2ebe917d8758fdf7c24e4d1a59e0e2b8ff27707e470624995031
+size 6202478

r_chunks.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bfde06a3044c293d72c60301f7e49a40ec575289316250d23751a7f4860a0103
+size 2060551

r_docs.index ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2052db52f49efd135545d64366b9ee74df937fc8f2c2449bfc8a8ff0d8a24f39
+size 1446957

requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+# Núcleo científico
+numpy>=1.24
+# FAISS (indexação vetorial)
+faiss-cpu>=1.7.4
+# Interface web
+gradio>=4.0
+# Leitura de PDF
+pypdf>=4.0
+# Embeddings
+sentence-transformers>=2.2.2
+# Cliente OpenAI (compatível com API da NVIDIA)
+openai>=1.0.0