Spaces:

Geoeasy
/

DFSORT

Sleeping

App Files Files Community

Geoeasy committed on Sep 18, 2025

Commit

8b4671d

verified ·

1 Parent(s): 94f5502

Update app.py

Browse files

Files changed (1) hide show

app.py +115 -118

app.py CHANGED Viewed

@@ -1,7 +1,5 @@
 import os
-import io
 import re
-import time
 from pathlib import Path
 from typing import List, Tuple
@@ -9,52 +7,32 @@ import numpy as np
 import faiss
 import gradio as gr
-# Para leitura do PDF
 try:
     from pypdf import PdfReader  # pypdf é leve e confiável para extração de texto
 except Exception:
-    # fallback simples se pypdf não estiver disponível
     PdfReader = None
-# Embeddings e LLM (NVIDIA API estilo OpenAI)
 from sentence_transformers import SentenceTransformer
 from openai import OpenAI, OpenAIError
 """
-===============================================================================
 DFSORT RAG – Assistente em Português (Gradio)
--------------------------------------------------------------------------------
-• Objetivo: responder sobre DFSORT (IBM z/OS) usando **apenas** o PDF fornecido como
-  base de conhecimento (RAG — Retrieval Augmented Generation).
-• Tudo em português: interface, comentários e mensagens do sistema.
-• Sem conteúdos de CV ou outros temas. Foco total em DFSORT.
-• O app cria o índice (FAISS + embeddings MiniLM) automaticamente na primeira execução.
-Como usar
-1) Garanta que o PDF esteja disponível. Por padrão este script usa:
-   - PDF_PATH = "ice2ca11.pdf" (você pode alterar o caminho abaixo)
-2) Execute o script. Na primeira execução, ele extrai o texto do PDF e cria:
-   - r_docs.index  (FAISS)
-   - r_chunks.npy  (lista de trechos do PDF)
-3) Interaja no chat. O modelo responde **somente** com base nos trechos recuperados.
-Requisitos (pip):
-    pip install gradio pypdf faiss-cpu sentence-transformers openai
-==========================================================================
-ATENÇÃO SOBRE KEYS
-- Configure a variável de ambiente NV_API_KEY com a sua chave da NVIDIA
-  (API OpenAI-compatible em https://integrate.api.nvidia.com/v1).
-==========================================================================
 """
 # ===================== Configurações =====================
-APP_TITLE = "DFSORT RAG (PDF)"
-PDF_PATH = "ice2ca11.pdf"  # use o PDF fornecido; altere se necessário
-INDEX_FILE = "r_docs.index"
 CHUNKS_FILE = "r_chunks.npy"
-# Modelo de chat na NVIDIA (pode trocar por outro suportado)
 CHAT_MODEL = "meta/llama3-8b-instruct"
 NV_API_KEY = os.environ.get("NV_API_KEY")
 if not NV_API_KEY:
@@ -63,16 +41,18 @@ if not NV_API_KEY:
 client = OpenAI(base_url="https://integrate.api.nvidia.com/v1", api_key=NV_API_KEY)
 # Modelo de embeddings (baixa no primeiro uso)
-EMB_MODEL_NAME = "all-MiniLM-L6-v2"
 embedding_model = SentenceTransformer(EMB_MODEL_NAME)
-# ===================== Pipeline de Indexação =====================
 def _pdf_to_text_chunks(pdf_path: str, max_chunk_chars: int = 1200) -> List[str]:
-    """Lê o PDF e cria chunks de texto amigáveis ao RAG.
-    - Divide por páginas e quebras duplas de linha.
-    - Faz um 'merge' simples até atingir ~max_chunk_chars.
-    - Remove linhas vazias e normaliza espaços.
     """
     path = Path(pdf_path)
     if not path.exists():
@@ -80,7 +60,7 @@ def _pdf_to_text_chunks(pdf_path: str, max_chunk_chars: int = 1200) -> List[str]
     raw_pages: List[str] = []
     if PdfReader is None:
-        # fallback: ler bytes e tentar split muito simples (não ideal)
         with open(path, "rb") as f:
             data = f.read()
         text = data.decode(errors="ignore")
@@ -94,21 +74,17 @@ def _pdf_to_text_chunks(pdf_path: str, max_chunk_chars: int = 1200) -> List[str]
                 raw = ""
             raw_pages.append(raw)
-    # limpeza e chunking
     blocks: List[str] = []
     for page_txt in raw_pages:
         if not page_txt:
             continue
-        # normalizações leves
         t = re.sub(r"[ \t]+", " ", page_txt)
         t = re.sub(r"\n{2,}", "\n\n", t).strip()
-        # quebra por parágrafos duplos ou linhas
         parts = re.split(r"\n\n+|\n• |\n- ", t)
         blocks.extend(p.strip() for p in parts if p and p.strip())
-    # juntar em chunks de tamanho alvo
     chunks: List[str] = []
-    buf = []
     size = 0
     for b in blocks:
         if size + len(b) + 1 > max_chunk_chars:
@@ -122,19 +98,19 @@ def _pdf_to_text_chunks(pdf_path: str, max_chunk_chars: int = 1200) -> List[str]
     if buf:
         chunks.append("\n".join(buf))
-    # reforço: remover pedaços muito curtos
     chunks = [c.strip() for c in chunks if len(c.strip()) > 50]
     return chunks
 def build_or_load_index(pdf_path: str, index_path: str, chunks_path: str) -> Tuple[faiss.IndexFlatIP, np.ndarray]:
-    """Cria ou carrega o índice FAISS e os chunks."""
     if Path(index_path).exists() and Path(chunks_path).exists():
         index = faiss.read_index(index_path)
         chunks = np.load(chunks_path, allow_pickle=True)
         return index, chunks
-    # construir
     chunks_list = _pdf_to_text_chunks(pdf_path)
     emb = embedding_model.encode(chunks_list, convert_to_numpy=True, normalize_embeddings=True)
     d = emb.shape[1]
@@ -145,34 +121,28 @@ def build_or_load_index(pdf_path: str, index_path: str, chunks_path: str) -> Tup
     return index, np.array(chunks_list, dtype=object)
-# ===================== Recuperação + Chat =====================
 def retrieve_context(query: str, index: faiss.IndexFlatIP, chunks: np.ndarray, k: int = 6) -> str:
     q = embedding_model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
     scores, idxs = index.search(q, k)
-    parts = []
     for i in idxs[0]:
         if 0 <= i < len(chunks):
             parts.append(str(chunks[i]))
     return "\n---\n".join(parts)
-def nv_stream(messages, temperature: float, top_p: float, max_tokens: int):
-    """Streaming de resposta do modelo NVIDIA (compatível com OpenAI)."""
-    assistant_reply = ""
-    stream = client.chat.completions.create(
         model=CHAT_MODEL,
         messages=messages,
         temperature=temperature,
         top_p=top_p,
         max_tokens=max_tokens,
-        stream=True,
     )
-    for chunk in stream:
-        delta = chunk.choices[0].delta
-        if hasattr(delta, "content") and delta.content:
-            assistant_reply += delta.content
-            yield assistant_reply
 def make_system_prompt(ctx: str) -> str:
@@ -181,32 +151,44 @@ def make_system_prompt(ctx: str) -> str:
         "Responda **apenas** com base no contexto recuperado do PDF.\n"
         "Se a informação não estiver no contexto, diga que não sabe.\n\n"
         f"=== Contexto (trechos do PDF) ===\n{ctx}\n\n"
-        "Ao mostrar exemplos, prefira JCL/SYSIN claros e curtos."
     )
-# ===================== UI (Gradio) =====================
-def chatbot_ui(user_input: str, temperature: float, top_p: float, max_tokens: int, k: int):
-    if not user_input or not user_input.strip():
-        return ""
-    # garanta índice carregado
     global faiss_index, pdf_chunks
     if faiss_index is None or pdf_chunks is None:
         faiss_index, pdf_chunks = build_or_load_index(PDF_PATH, INDEX_FILE, CHUNKS_FILE)
-    ctx = retrieve_context(user_input, faiss_index, pdf_chunks, k=k)
     sys_msg = {"role": "system", "content": make_system_prompt(ctx)}
-    usr_msg = {"role": "user", "content": user_input}
-    # streaming para UX fluida
     try:
-        out = ""
-        for partial in nv_stream([sys_msg, usr_msg], temperature, top_p, max_tokens):
-            out = partial
-        return out
     except OpenAIError as e:
-        return f"⚠️ Erro da API: {e.__class__.__name__}: {e}"
 def rebuild_index_action():
@@ -215,59 +197,74 @@ def rebuild_index_action():
     return "✅ Índice reconstruído com sucesso a partir do PDF."
-# Estado global carregado sob demanda
-faiss_index = None
-pdf_chunks = None
 custom_css = r"""
 :root { --primary:#2156d9; --bg:#f8fafc; --ink:#0f172a; }
 body { background: var(--bg); color: var(--ink); }
-#chatbox { height: 60vh; overflow-y: auto; }
 """
 with gr.Blocks(title=APP_TITLE, css=custom_css, theme=gr.themes.Base()) as demo:
-    gr.Markdown(f"## {APP_TITLE}")
-    gr.Markdown(
-        "Este assistente responde sobre **DFSORT** usando apenas o PDF como fonte. "
-        "Se algo não estiver no PDF, ele informa que não sabe."
-    )
-    with gr.Row():
-        with gr.Column(scale=3):
-            chat = gr.ChatInterface(
-                fn=lambda msg, hist, t, p, mt, k: chatbot_ui(msg, t, p, mt, k),
-                additional_inputs=[
-                    gr.Slider(0, 1, 0.4, label="Temperature"),
-                    gr.Slider(0, 1, 0.95, label="Top-p"),
-                    gr.Slider(128, 4096, 768, step=64, label="Max Tokens"),
-                    gr.Slider(2, 12, 6, step=1, label="Trechos (k)")
-                ],
-                multimodal=False,
-                title="Chat DFSORT (RAG)",
-                textbox=gr.Textbox(placeholder="Pergunte algo sobre DFSORT… ex.: Como uso INCLUDE COND?"),
-                cache_examples=False,
-            )
-        with gr.Column(scale=2):
-            gr.Markdown("### Controlo do índice")
-            gr.Markdown(f"PDF atual: `{PDF_PATH}`")
-            btn_rebuild = gr.Button("Reconstruir índice a partir do PDF")
-            msg = gr.Markdown()
-            btn_rebuild.click(lambda: rebuild_index_action(), [], [msg])
-            gr.Markdown("---")
-            gr.Markdown("### Dicas de consulta (direto do PDF)")
-            gr.Markdown(
-                "- Ex.: `Ordenar por 10 bytes a partir da posição 1 (CH, A).`\n"
-                "- Ex.: `Como faço para eliminar duplicados com SUM FIELDS=NONE?`\n"
-                "- Ex.: `JOINKEYS: explique o uso de REFORMAT.`\n"
-                "- Ex.: `Exemplo de OUTFIL com cabeçalho e REMOVECC.`"
-            )
 if __name__ == "__main__":
-    # cria índice na primeira execução
     if not Path(INDEX_FILE).exists() or not Path(CHUNKS_FILE).exists():
         print("[i] Construindo índice a partir do PDF…")
         faiss_index, pdf_chunks = build_or_load_index(PDF_PATH, INDEX_FILE, CHUNKS_FILE)
         print("[i] Índice criado.")
-    demo.launch(server_name="0.0.0.0", server_port=7860)

 import os
 import re
 from pathlib import Path
 from typing import List, Tuple
 import faiss
 import gradio as gr
+# Leitura do PDF
 try:
     from pypdf import PdfReader  # pypdf é leve e confiável para extração de texto
 except Exception:
     PdfReader = None
+# Embeddings e LLM (API NVIDIA estilo OpenAI)
 from sentence_transformers import SentenceTransformer
 from openai import OpenAI, OpenAIError
 """
 DFSORT RAG – Assistente em Português (Gradio)
+---------------------------------------------
+• Interface totalmente em português.
+• Botões "Enviar" e "Limpar" no chat.
+• Página enquadrada (layout responsivo) para tudo ficar visível.
+• RAG simples: FAISS + MiniLM sobre o PDF fornecido (somente ele como fonte).
 """
 # ===================== Configurações =====================
+APP_TITLE   = "DFSORT RAG (PDF)"
+PDF_PATH    = "ice2ca11.pdf"   # ajuste se o PDF tiver outro nome/caminho
+INDEX_FILE  = "r_docs.index"
 CHUNKS_FILE = "r_chunks.npy"
+# Modelo de chat (NVIDIA OpenAI-compatible)
 CHAT_MODEL = "meta/llama3-8b-instruct"
 NV_API_KEY = os.environ.get("NV_API_KEY")
 if not NV_API_KEY:
 client = OpenAI(base_url="https://integrate.api.nvidia.com/v1", api_key=NV_API_KEY)
 # Modelo de embeddings (baixa no primeiro uso)
+EMB_MODEL_NAME  = "all-MiniLM-L6-v2"
 embedding_model = SentenceTransformer(EMB_MODEL_NAME)
+# Estado global (carregado sob demanda)
+faiss_index = None
+pdf_chunks  = None
+# ===================== Indexação a partir do PDF =====================
 def _pdf_to_text_chunks(pdf_path: str, max_chunk_chars: int = 1200) -> List[str]:
+    """Extrai texto do PDF e cria chunks (~max_chunk_chars) para o RAG.
+    - Divide por páginas; normaliza espaços/linhas; agrega em blocos.
     """
     path = Path(pdf_path)
     if not path.exists():
     raw_pages: List[str] = []
     if PdfReader is None:
+        # fallback tosco se pypdf faltar (não recomendado)
         with open(path, "rb") as f:
             data = f.read()
         text = data.decode(errors="ignore")
                 raw = ""
             raw_pages.append(raw)
     blocks: List[str] = []
     for page_txt in raw_pages:
         if not page_txt:
             continue
         t = re.sub(r"[ \t]+", " ", page_txt)
         t = re.sub(r"\n{2,}", "\n\n", t).strip()
         parts = re.split(r"\n\n+|\n• |\n- ", t)
         blocks.extend(p.strip() for p in parts if p and p.strip())
     chunks: List[str] = []
+    buf: List[str] = []
     size = 0
     for b in blocks:
         if size + len(b) + 1 > max_chunk_chars:
     if buf:
         chunks.append("\n".join(buf))
+    # remover pedaços muito curtos
     chunks = [c.strip() for c in chunks if len(c.strip()) > 50]
     return chunks
 def build_or_load_index(pdf_path: str, index_path: str, chunks_path: str) -> Tuple[faiss.IndexFlatIP, np.ndarray]:
+    """Cria/carrega índice FAISS e os chunks a partir do PDF."""
     if Path(index_path).exists() and Path(chunks_path).exists():
         index = faiss.read_index(index_path)
         chunks = np.load(chunks_path, allow_pickle=True)
         return index, chunks
+    # construir do zero
     chunks_list = _pdf_to_text_chunks(pdf_path)
     emb = embedding_model.encode(chunks_list, convert_to_numpy=True, normalize_embeddings=True)
     d = emb.shape[1]
     return index, np.array(chunks_list, dtype=object)
+# ===================== Recuperação + LLM =====================
 def retrieve_context(query: str, index: faiss.IndexFlatIP, chunks: np.ndarray, k: int = 6) -> str:
     q = embedding_model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
     scores, idxs = index.search(q, k)
+    parts: List[str] = []
     for i in idxs[0]:
         if 0 <= i < len(chunks):
             parts.append(str(chunks[i]))
     return "\n---\n".join(parts)
def nv_complete(messages, temperature: float, top_p: float, max_tokens: int) -> str:
    """Request a non-streaming chat completion and return the reply text.

    Args:
        messages: Chat messages (``{"role", "content"}`` dicts) sent as-is.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.
        max_tokens: Upper bound on generated tokens forwarded to the API.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the response carries no choices or
        no text content.
    """
    resp = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=messages,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        stream=False,
    )
    if not resp.choices:
        return ""
    # The message content is optional in the response schema; guard against
    # None so the caller gets a string instead of an AttributeError.
    content = resp.choices[0].message.content
    return (content or "").strip()
 def make_system_prompt(ctx: str) -> str:
         "Responda **apenas** com base no contexto recuperado do PDF.\n"
         "Se a informação não estiver no contexto, diga que não sabe.\n\n"
         f"=== Contexto (trechos do PDF) ===\n{ctx}\n\n"
+        "Quando der exemplos, forneça JCL/SYSIN curtos e claros."
     )
+# ===================== Handlers do Chat =====================
def ensure_index_loaded():
    """Populate the module-level index and chunk array on first use.

    Does nothing when both ``faiss_index`` and ``pdf_chunks`` are already
    set; otherwise (re)assigns both from ``build_or_load_index``.
    """
    global faiss_index, pdf_chunks
    already_loaded = faiss_index is not None and pdf_chunks is not None
    if already_loaded:
        return
    faiss_index, pdf_chunks = build_or_load_index(PDF_PATH, INDEX_FILE, CHUNKS_FILE)
def on_send(user_msg, history, temperature, top_p, max_tokens, k):
    """Answer one user question with RAG and return the updated chat.

    Args:
        user_msg: Raw textbox content; ``None`` is treated as empty.
        history: Previous messages as ``{"role", "content"}`` dicts, or
            ``None`` for a fresh conversation.
        temperature: Sampling temperature; cast to ``float``.
        top_p: Nucleus-sampling value; cast to ``float``.
        max_tokens: Generation limit; cast to ``int``.
        k: Number of chunks to retrieve; cast to ``int``.

    Returns:
        A ``(history, "")`` tuple: the message list extended with the new
        user/assistant pair (unchanged for blank input) and an empty string
        that clears the textbox.
    """
    history = history or []
    user_msg = (user_msg or "").strip()
    if not user_msg:
        # Reject blank input before touching the index, so an empty submit
        # never triggers loading or building it.
        return history, ""

    ensure_index_loaded()
    ctx = retrieve_context(user_msg, faiss_index, pdf_chunks, k=int(k))
    sys_msg = {"role": "system", "content": make_system_prompt(ctx)}
    usr_msg = {"role": "user", "content": user_msg}
    try:
        answer = nv_complete([sys_msg, usr_msg], float(temperature), float(top_p), int(max_tokens))
    except OpenAIError as e:
        # Surface API failures inside the chat instead of raising.
        answer = f"⚠️ Erro da API: {e.__class__.__name__}: {e}"
    history = history + [
        {"role": "user", "content": user_msg},
        {"role": "assistant", "content": answer},
    ]
    return history, ""
def on_clear():
    """Return the reset values for the chat: an empty message list and an empty textbox string."""
    cleared_history = []
    cleared_text = ""
    return cleared_history, cleared_text
 def rebuild_index_action():
     return "✅ Índice reconstruído com sucesso a partir do PDF."
+# ===================== UI (Gradio) =====================
 custom_css = r"""
 :root { --primary:#2156d9; --bg:#f8fafc; --ink:#0f172a; }
 body { background: var(--bg); color: var(--ink); }
+.container { max-width: 1200px; margin: 0 auto; }
+#chatbox { height: 70vh; overflow-y: auto; border:1px solid #cbd5e1; border-radius:8px; padding:0.5rem; }
 """
 with gr.Blocks(title=APP_TITLE, css=custom_css, theme=gr.themes.Base()) as demo:
+    with gr.Column(elem_classes="container"):
+        gr.Markdown(f"## {APP_TITLE}")
+        gr.Markdown(
+            "Assistente **RAG** sobre **DFSORT**, usando **apenas** o PDF fornecido. "
+            "Se algo não estiver no PDF, eu aviso que não sei."
+        )
+        with gr.Row():
+            # ===== Coluna principal (chat) =====
+            with gr.Column(scale=3):
+                chatbot = gr.Chatbot(type="messages", elem_id="chatbox", height=560)
+                state_history = gr.State([])  # guarda o histórico no formato messages
+                user_box = gr.Textbox(placeholder="Pergunte algo sobre DFSORT… ex.: Como uso INCLUDE COND?", lines=2)
+                with gr.Row():
+                    btn_send  = gr.Button("Enviar", variant="primary")
+                    btn_clear = gr.Button("Limpar")
+                with gr.Row():
+                    temperature = gr.Slider(0, 1, 0.4, step=0.05, label="Temperature")
+                    top_p       = gr.Slider(0, 1, 0.95, step=0.01, label="Top-p")
+                with gr.Row():
+                    max_tokens  = gr.Slider(128, 4096, 768, step=64, label="Max Tokens")
+                    k_chunks    = gr.Slider(2, 12, 6, step=1, label="Trechos (k)")
+                # Enviar via botão e Enter
+                btn_send.click(
+                    on_send,
+                    inputs=[user_box, state_history, temperature, top_p, max_tokens, k_chunks],
+                    outputs=[chatbot, user_box],
+                )
+                user_box.submit(
+                    on_send,
+                    inputs=[user_box, state_history, temperature, top_p, max_tokens, k_chunks],
+                    outputs=[chatbot, user_box],
+                )
+                btn_clear.click(on_clear, outputs=[chatbot, user_box])
+            # ===== Coluna lateral (controle do índice e dicas) =====
+            with gr.Column(scale=2):
+                gr.Markdown("### Controlo do índice")
+                gr.Markdown(f"PDF atual(DFSORT Application Programming Guide)): `{PDF_PATH}`")
+                btn_rebuild = gr.Button("Reconstruir índice a partir do PDF")
+                msg = gr.Markdown()
+                btn_rebuild.click(lambda: rebuild_index_action(), [], [msg])
+                gr.Markdown("---")
+                gr.Markdown("### Dicas de consulta")
+                gr.Markdown(
+                    "- Ex.: `Ordenar por 10 bytes a partir da posição 1 (CH, A).`\n"
+                    "- Ex.: `Como faço para eliminar duplicados com SUM FIELDS=NONE?`\n"
+                    "- Ex.: `JOINKEYS: explique o uso de REFORMAT.`\n"
+                    "- Ex.: `Exemplo de OUTFIL com cabeçalho e REMOVECC.`"
+                )
 if __name__ == "__main__":
+    # cria índice na primeira execução (se não existir)
     if not Path(INDEX_FILE).exists() or not Path(CHUNKS_FILE).exists():
         print("[i] Construindo índice a partir do PDF…")
         faiss_index, pdf_chunks = build_or_load_index(PDF_PATH, INDEX_FILE, CHUNKS_FILE)
         print("[i] Índice criado.")
+    demo.launch(server_name="0.0.0.0", server_port=7860)