Spaces: Build error
Upload app.py
app.py CHANGED
@@ -1,746 +1,727 @@
# -*- coding: utf-8 -*-
# Db2 z/OS • RAG (NVIDIA NIM)
# Sidebar + diagnostics + robust extraction (optional OCR) + character-based chunking
# Requirements:
# pip install openai gradio numpy pypdf pdfminer.six pymupdf
# (Optional OCR) plus OS dependencies:
# pip install pytesseract pdf2image pillow
# Linux: apt-get install -y tesseract-ocr poppler-utils

import os
import re
import json
from pathlib import Path
from typing import List, Tuple, Dict, Optional

import numpy as np
import gradio as gr

# ==============================
# Config (Db2 + NVIDIA NIM)
# ==============================
BASE_DIR = Path(__file__).parent if "__file__" in globals() else Path.cwd()

# Change if necessary; the app also accepts PDFs in the current directory (*.pdf)
USER_PDF = Path("db2z_13_utilities.pdf")
PDFS = [USER_PDF] if USER_PDF.exists() else sorted(p for p in BASE_DIR.glob("*.pdf") if p.is_file() and p.suffix.lower() == ".pdf")

NVCF_BASE = os.getenv("NVCF_BASE", "https://integrate.api.nvidia.com/v1")
NVCF_API_KEY = os.getenv("NVCF_API_KEY", "")
EMBED_MODEL = os.getenv("EMBED_MODEL", "nvidia/nv-embed-v1")  # currently 4096-dim
CHAT_MODEL = os.getenv("CHAT_MODEL", "meta/llama-3.1-8b-instruct")

SAFE_IDX = f".db2_index_{EMBED_MODEL.replace('/', '__')}"
INDEX_DIR = BASE_DIR / SAFE_IDX
INDEX_DIR.mkdir(exist_ok=True)
VEC_FILE = INDEX_DIR / "vectors.npy"
META_FILE = INDEX_DIR / "meta.json"

TOP_K_RETRIEVE = 3
TARGET_CONTEXT_CHARS = 1500
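
# All of the settings above can be overridden via environment variables, e.g.
# (illustrative values only):
#   export NVCF_API_KEY='nvapi-...'
#   export EMBED_MODEL='nvidia/nv-embed-v1'
#   export CHAT_MODEL='meta/llama-3.1-8b-instruct'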

# ==============================
# NVIDIA (OpenAI-compatible)
# ==============================
try:
    from openai import OpenAI
except ImportError:
    raise RuntimeError("Instale: pip install openai gradio numpy pypdf pdfminer.six pymupdf")

def get_client():
    if not NVCF_API_KEY or NVCF_API_KEY == "xxxxxxxxxxxxxxxxxxx":
        raise RuntimeError("NVCF_API_KEY não definido. export/set NVCF_API_KEY='SUA_CHAVE'")
    return OpenAI(base_url=NVCF_BASE, api_key=NVCF_API_KEY)
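
# The NIM endpoint is OpenAI-compatible, so this single client serves both
# client.embeddings.create(...) and client.chat.completions.create(...) below.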

# ==============================
# PDF utils (robust + optional OCR)
# ==============================
try:
    import fitz  # PyMuPDF
except Exception:
    fitz = None

try:
    from pdfminer.high_level import extract_text as pdfminer_extract_text
except Exception:
    pdfminer_extract_text = None

try:
    from pypdf import PdfReader
except Exception:
    PdfReader = None

NBSP = "\u00A0"

def _normalize_text(t: str) -> str:
    if not isinstance(t, str):
        t = str(t or "")
    t = t.replace(NBSP, " ")
    t = re.sub(r"[\u0000-\u001F]", " ", t)  # strip control characters
    # keep single line breaks and collapse long runs of spaces
    t = re.sub(r"[ \t]{2,}", " ", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip()
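
# Illustrative behavior (assumed input):
#   _normalize_text("a\u00A0  b\n\n\n\nc")  ->  "a b\n\nc"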

def _extract_with_pymupdf(path: Path) -> List[Tuple[int, str]]:
    out: List[Tuple[int, str]] = []
    with fitz.open(path) as doc:
        for i, page in enumerate(doc):
            text = page.get_text("text") or ""
            out.append((i + 1, _normalize_text(text)))
    return out

def _extract_with_pdfminer(path: Path) -> List[Tuple[int, str]]:
    text_all = pdfminer_extract_text(str(path)) or ""
    pages = re.split(r"\f", text_all)
    out: List[Tuple[int, str]] = []
    for i, tx in enumerate(pages):
        out.append((i + 1, _normalize_text(tx)))
    return out

def _extract_with_pypdf(path: Path) -> List[Tuple[int, str]]:
    if PdfReader is None:
        return []
    try:
        r = PdfReader(str(path), strict=False)
    except Exception as e:
        print(f"[WARN] pypdf: falha ao abrir {path.name}: {e}")
        return []
    out: List[Tuple[int, str]] = []
    for i in range(len(r.pages)):
        try:
            page = r.pages[i]
            t = page.extract_text() or ""
        except Exception as e:
            print(f"[WARN] pypdf: falha ao extrair pag {i+1}: {e}")
            t = ""
        out.append((i + 1, _normalize_text(t)))
    return out
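
# OCR is attempted only if pytesseract and pdf2image import cleanly; both the pip
# packages and the OS binaries (tesseract-ocr, poppler-utils) from the header notes
# must be installed.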

def _maybe_ocr_images(path: Path) -> List[Tuple[int, str]]:
    try:
        import pytesseract
        from pdf2image import convert_from_path
    except Exception:
        return []
    out: List[Tuple[int, str]] = []
    try:
        images = convert_from_path(str(path))
        for i, img in enumerate(images):
            txt = pytesseract.image_to_string(img) or ""
            out.append((i + 1, _normalize_text(txt)))
    except Exception as e:
        print(f"[WARN] OCR: falha ao converter/ler {path.name}: {e}")
    return out
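
# Extraction fallback chain: PyMuPDF -> pdfminer.six -> pypdf -> OCR; the first
# backend that yields any non-blank page text wins.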

def read_pdf_pages(path: Path) -> List[Tuple[int, str]]:
    if not path.exists():
        return []
    # 1) PyMuPDF
    if fitz is not None:
        try:
            pages = _extract_with_pymupdf(path)
            if any((tx or "").strip() for _, tx in pages):
                return pages
        except Exception as e:
            print(f"[WARN] PyMuPDF falhou: {e}")
    # 2) pdfminer
    if pdfminer_extract_text is not None:
        try:
            pages = _extract_with_pdfminer(path)
            if any((tx or "").strip() for _, tx in pages):
                return pages
        except Exception as e:
            print(f"[WARN] pdfminer falhou: {e}")
    # 3) pypdf
    try:
        pages = _extract_with_pypdf(path)
        if any((tx or "").strip() for _, tx in pages):
            return pages
    except Exception as e:
        print(f"[WARN] pypdf falhou: {e}")
    # 4) OCR when nothing could be extracted
    ocr_pages = _maybe_ocr_images(path)
    if not any((tx or "").strip() for _, tx in ocr_pages):
        print("[ERRO] Nenhum texto extraído, nem com OCR.")
    return ocr_pages
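
# Every extractor returns a list of (1-based page number, normalized text) tuples,
# which is the shape split_db2_docs() below expects.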

# ==============================
# Segmentation (section detection for metadata)
# ==============================
DB2_HEADER_RE = re.compile(
    r"^(Part\s+\d+\.|Chapter\s+\d+\.)|"
    r"\b(BACKUP SYSTEM|CATMAINT|CHECK DATA|CHECK INDEX|CHECK LOB|COPY|COPYTOCOPY|DIAGNOSE|LISTDEF|LOAD|"
    r"MERGECOPY|MODIFY RECOVERY|MODIFY STATISTICS|OPTIONS|QUIESCE|REBUILD INDEX|RECOVER|REORG INDEX|REORG TABLESPACE|"
    r"REPAIR|REPORT|RESTORE SYSTEM|RUNSTATS|STOSPACE|TEMPLATE|UNLOAD)\b",
    re.IGNORECASE
)

def split_db2_docs(pages: List[Tuple[int, str]], doc_label: str) -> List[Dict]:
    """Group pages by likely headers (chapters/utilities) to build section metadata."""
    blocks: List[Dict] = []
    current = {"doc": doc_label, "section": "INTRO", "start_page": 1, "texts": []}
    for pg, tx in pages:
        head = (tx or "")[:300]
        if DB2_HEADER_RE.search(head):
            if current["texts"]:
                current["end_page"] = current["texts"][-1][0]
                blocks.append(current)
            m = re.search(r"(Chapter\s+\d+\.\s*[^\n]+|^[^\n]{1,200})", tx or "")
            title = (m.group(1).strip() if m else f"Section@{pg}")
            current = {"doc": doc_label, "section": title, "start_page": pg, "texts": []}
        current["texts"].append((pg, tx or ""))
    if current["texts"]:
        current["end_page"] = current["texts"][-1][0]
        blocks.append(current)
    return blocks
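
# Each block carries {"doc", "section", "start_page", "end_page", "texts": [(page, text), ...]};
# make_chunks_by_chars() flattens those page texts into overlapping character windows.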

# ==============================
# Character-based chunking (robust)
# ==============================
def make_chunks_by_chars(blocks: List[Dict], max_chars: int = 1500, min_chars: int = 180) -> List[Dict]:
    """Concatenate each block's page texts and slice them into overlapping character windows."""
    out: List[Dict] = []
    for b in blocks:
        pieces: List[str] = []
        pages: List[int] = []
        for pg, tx in b["texts"]:
            txn = _normalize_text(tx or "")
            if txn:
                pieces.append(txn)
                pages.append(pg)
        if not pieces:
            continue
        blob = "\n".join(pieces).strip()
        if not blob:
            continue
        start_page = min(pages) if pages else b.get("start_page", 0)
        end_page = max(pages) if pages else b.get("end_page", start_page)

        if min_chars <= len(blob) <= max_chars:
            out.append({
                "doc": b["doc"],
                "section": b["section"],
                "start_page": start_page,
                "end_page": end_page,
                "text": blob
            })
            continue

        overlap = 120
        i, n = 0, len(blob)
        while i < n:
            j = min(i + max_chars, n)
            chunk_text = blob[i:j].strip()
            if len(chunk_text) >= min_chars:
                out.append({
                    "doc": b["doc"],
                    "section": b["section"],
                    "start_page": start_page,
                    "end_page": end_page,
                    "text": chunk_text
                })
            new_i = j - overlap
            i = j if new_i <= i else new_i
    # final filter
    out = [c for c in out if (c.get("text") or "").strip()]
    return out
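
# With max_chars=1500 and overlap=120, consecutive windows advance by 1380 characters
# and share a 120-character overlap; windows whose stripped text is shorter than
# min_chars are dropped.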

# ==============================
# Embeddings
# ==============================
def embed_texts(texts: List[str], batch_size: int = 16) -> np.ndarray:
    client = get_client()
    clean = [(i, t) for i, t in enumerate(texts) if isinstance(t, str) and t.strip()]
    if not clean:
        return np.zeros((0, 0), dtype=np.float32)
    order, payload = zip(*clean)
    vecs: Dict[int, np.ndarray] = {}
    for i in range(0, len(payload), batch_size):
        batch = list(payload[i:i + batch_size])
        resp = client.embeddings.create(model=EMBED_MODEL, input=batch)
        for k, item in enumerate(resp.data):
            vecs[int(order[i + k])] = np.array(item.embedding, dtype=np.float32)
    rows: List[np.ndarray] = []
    for idx in range(len(texts)):
        if idx in vecs:
            rows.append(vecs[idx])
    if not rows:
        return np.zeros((0, 0), dtype=np.float32)
    mat = np.vstack(rows).astype(np.float32)
    norms = np.linalg.norm(mat, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return mat / norms

def embed_query(q: str) -> np.ndarray:
    client = get_client()
    resp = client.embeddings.create(model=EMBED_MODEL, input=[q])
    v = np.array(resp.data[0].embedding, dtype=np.float32)
    n = np.linalg.norm(v)
    return (v / (n if n > 0 else 1.0)).astype(np.float32)
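
# Because every matrix row and every query vector is L2-normalized, the dot product
# (mat @ qv) computed in retrieve_topk() below is exactly the cosine similarity.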

# ==============================
# Indexing
# ==============================
def build_index() -> Tuple[np.ndarray, List[Dict]]:
    all_blocks: List[Dict] = []
    for p in PDFS:
        pages = read_pdf_pages(p)
        if not pages or not any((tx or "").strip() for _, tx in pages):
            print(f"[WARN] Sem texto legível em {p.name}; ignorando.")
            continue
        blks = split_db2_docs(pages, p.name)
        all_blocks.extend(blks)

    all_chunks = make_chunks_by_chars(all_blocks, max_chars=1500, min_chars=180)
    all_chunks = [c for c in all_chunks if (c.get("text") or "").strip()]

    if not all_chunks:
        with open(META_FILE, "w", encoding="utf-8") as f:
            json.dump({"chunks": [], "embed_model": EMBED_MODEL, "embed_dim": 0, "total_chars": 0}, f, ensure_ascii=False, indent=2)
        np.save(VEC_FILE, np.zeros((0, 0), dtype=np.float32))
        raise RuntimeError("Nenhum chunk foi criado. Verifique extração/OCR.")

    texts = [c["text"] for c in all_chunks]
    total_chars = sum(len(t) for t in texts)

    mat = embed_texts(texts) if texts else np.zeros((0, 0), dtype=np.float32)
    embed_dim = int(mat.shape[1]) if mat.size else 0

    np.save(VEC_FILE, mat)
    with open(META_FILE, "w", encoding="utf-8") as f:
        json.dump(
            {"chunks": all_chunks, "embed_model": EMBED_MODEL, "embed_dim": embed_dim, "total_chars": total_chars},
            f, ensure_ascii=False, indent=2
        )
    return mat, all_chunks

def load_index() -> Tuple[np.ndarray, List[Dict]]:
    if VEC_FILE.exists() and META_FILE.exists():
        mat = np.load(VEC_FILE)
        dd = json.loads(META_FILE.read_text(encoding="utf-8"))
        chunks = dd.get("chunks", [])
        return mat, chunks
    return build_index()

def wipe_index() -> str:
    try:
        if INDEX_DIR.exists():
            for p in INDEX_DIR.glob("*"):
                p.unlink()
            INDEX_DIR.rmdir()
        INDEX_DIR.mkdir(exist_ok=True)
        return "Índice limpo."
    except Exception as e:
        return f"Erro ao limpar índice: {e}"

# ==============================
# Retrieval + LLM
# ==============================
def _check_embed_dim(mat: np.ndarray) -> Optional[str]:
    try:
        dd = json.loads(META_FILE.read_text(encoding="utf-8"))
        idx_dim = int(dd.get("embed_dim", 0))
    except Exception:
        idx_dim = 0
    try:
        v = embed_query("dim_test")
        cur_dim = int(v.shape[0])
    except Exception as e:
        return f"Falha ao checar dimensão do embedding: {e}"
    if idx_dim and cur_dim and idx_dim != cur_dim:
        return (f"Incompatibilidade de dimensão do embedding: índice={idx_dim}, modelo atual={cur_dim}. "
                f"Reindexe com o mesmo EMBED_MODEL. (Atual EMBED_MODEL: {EMBED_MODEL})")
    return None

def retrieve_topk(query: str, doc_filter: Optional[str] = None, k: int = TOP_K_RETRIEVE) -> List[Dict]:
    mat, chunks = load_index()
    if mat.shape[0] == 0 or not chunks:
        return []
    qv = embed_query(query)
    if mat.shape[1] != qv.shape[0]:
        raise RuntimeError(
            f"Dimensão incompatível mat={mat.shape} vs query={qv.shape}. "
            "Provável troca de EMBED_MODEL após criar o índice. Clique 'Reindexar'."
        )
    sims = (mat @ qv).astype(float)
    if doc_filter and doc_filter != "(Todos)":
        mask = np.array([1.0 if c["doc"] == doc_filter else 0.0 for c in chunks], dtype=float)
        sims *= mask
    idxs = np.argsort(-sims)[:k]
    out = []
    for i in idxs:
        c = chunks[int(i)]
        out.append({
            "doc": c["doc"],
            "section": c.get("section", ""),
            "start_page": c.get("start_page", "?"),
            "end_page": c.get("end_page", "?"),
            "text": c["text"],
            "score": float(sims[int(i)]),
            "idx": int(i)
        })
    return out
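
# expand_context() grows the best hit with neighboring chunks from the same document
# section (right first, then left, alternating) until roughly TARGET_CONTEXT_CHARS.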

def expand_context(hits: List[Dict], all_chunks: List[Dict], target_chars: int = TARGET_CONTEXT_CHARS) -> Tuple[str, List[Tuple[str, str, str]]]:
    if not hits:
        return "", []
    best = max(hits, key=lambda x: x["score"])
    ctx = best["text"]
    srcs = {(best["doc"], best["section"], f"{best['start_page']}–{best['end_page']}")}
    doc, section, best_idx = best["doc"], best["section"], best["idx"]
    indices = [i for i, c in enumerate(all_chunks) if c["doc"] == doc and c.get("section", "") == section]
    if not indices:
        return ctx, sorted(srcs)
    indices.sort()
    if best_idx not in indices:
        return ctx, sorted(srcs)
    pos = indices.index(best_idx)
    left, right = pos - 1, pos + 1
    while len(ctx) < target_chars and (left >= 0 or right < len(indices)):
        if right < len(indices) and len(ctx) < target_chars:
            rch = all_chunks[indices[right]]
            ctx += "\n\n" + rch["text"]
            srcs.add((doc, section, f"{rch.get('start_page', '?')}–{rch.get('end_page', '?')}"))
            right += 1
        if left >= 0 and len(ctx) < target_chars:
            lch = all_chunks[indices[left]]
            ctx = lch["text"] + "\n\n" + ctx
            srcs.add((doc, section, f"{lch.get('start_page', '?')}–{lch.get('end_page', '?')}"))
            left -= 1
    return ctx, sorted(srcs)

def answer_with_llm(question: str, context: str) -> str:
    client = get_client()
    system = ("Você é um assistente especialista em IBM Db2 para z/OS. "
              "Responda em português, com exemplos de comandos SQL/JCL completos e corretos. "
              "Use apenas o contexto fornecido; se algo não estiver nele, diga que não está disponível.")
    user = (f"Pergunta:\n{question}\n\n"
            f"Contexto do(s) manual(is):\n{context}\n\n"
            "Regras de resposta:\n"
            "- Explique o necessário e como fazer.\n"
            "- Inclua pelo menos um exemplo de comando Db2 utilitário, SQL ou JCL (auto-contido), se aplicável.\n"
            "- Liste observações/pré-requisitos, se houver.\n"
            "- Cite as fontes (Documento e páginas) ao final.")
    chat = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=[{"role": "system", "content": system}, {"role": "user", "content": user}],
        temperature=0.2,
    )
    return chat.choices[0].message.content.strip()

def format_sources_md(sources: List[Tuple[str, str, str]]) -> str:
    if not sources:
        return ""
    lines = [
        f"- **Documento:** {d} \n  **Seção:** {s} \n  **Páginas:** {p}"
        for (d, s, p) in sources
    ]
    return "\n".join(lines)

# ==============================
# Db2 templates (examples)
# ==============================
DB2_TEMPLATES: Dict[str, str] = {
    "RUNSTATS_TABLESPACE": (
        "//RUNSTAT JOB (ACCT),'RUNSTATS',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
        "//STEP1 EXEC DSNUPROC,SYSTEM=DSN1,UID='RUNSTATS',UTPROC=''\n"
        "//SYSIN DD *\n"
        " RUNSTATS TABLESPACE(DBNAME.TSNAME) TABLE(ALL) INDEX(ALL)\n"
        "/*\n"
    ),
    "REORG_TABLESPACE": (
        "//REORG JOB (ACCT),'REORG',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
        "//STEP1 EXEC DSNUPROC,SYSTEM=DSN1,UID='REORGTS',UTPROC=''\n"
        "//SYSIN DD *\n"
        " REORG TABLESPACE(DBNAME.TSNAME) SHRLEVEL CHANGE\n"
        "/*\n"
    ),
    "EXPLAIN_SQL": (
        "//EXPLAIN JOB (ACCT),'EXPLAIN',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
        "//STEP1 EXEC DSNTEP2,SYSTEM=DSN1\n"
        "//SYSIN DD *\n"
        " EXPLAIN PLAN FOR\n"
        " SELECT COL1, COL2 FROM DBNAME.TBNAME WHERE COL3 = 'X';\n"
        "/*\n"
    ),
    "DISPLAY_BUFFERPOOL": (
        "//DISPBP JOB (ACCT),'DISPLAY BP',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
        "//STEP1 EXEC PGM=IKJEFT01\n"
        "//SYSTSPRT DD SYSOUT=*\n"
        "//SYSIN DD *\n"
        " DSN SYSTEM(DSN1)\n"
        " -DISPLAY BUFFERPOOL(BP0) DETAIL\n"
        " END\n"
        "/*\n"
    ),
    "DSNTEP2_SELECT": (
        "//SELECT JOB (ACCT),'DSNTEP2',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
        "//STEP1 EXEC DSNTEP2,SYSTEM=DSN1\n"
        "//SYSIN DD *\n"
        " SELECT FIRSTNME, LASTNAME FROM DSN8810.EMP\n"
        " WHERE WORKDEPT = 'A00';\n"
        "/*\n"
    ),
    "COPY_TABLESPACE": (
        "//COPYTS JOB (ACCT),'COPY',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
        "//STEP1 EXEC DSNUPROC,SYSTEM=DSN1,UID='COPYTS',UTPROC=''\n"
        "//SYSIN DD *\n"
        " COPY TABLESPACE(DBNAME.TSNAME) FULL YES SHRLEVEL CHANGE\n"
        "/*\n"
    ),
    "LOAD_TABLE": (
        "//LOADTBL JOB (ACCT),'LOAD',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
        "//STEP1 EXEC DSNUPROC,SYSTEM=DSN1,UID='LOADTBL',UTPROC=''\n"
        "//SYSIN DD *\n"
        " LOAD DATA INDDN SYSREC INTO TABLE DBNAME.TBNAME\n"
        " REPLACE\n"
        "/*\n"
    ),
    "RECOVER_TABLESPACE": (
        "//RECOVTS JOB (ACCT),'RECOVER',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
        "//STEP1 EXEC DSNUPROC,SYSTEM=DSN1,UID='RECOVTS',UTPROC=''\n"
        "//SYSIN DD *\n"
        " RECOVER TABLESPACE(DBNAME.TSNAME)\n"
        "/*\n"
    ),
    "STATS_INDEX": (
        "//STATSIX JOB (ACCT),'STATS INDEX',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
        "//STEP1 EXEC DSNUPROC,SYSTEM=DSN1,UID='STATSIX',UTPROC=''\n"
        "//SYSIN DD *\n"
        " RUNSTATS INDEX(DBNAME.IXNAME) ALL\n"
        "/*\n"
    ),
    "MODIFY_RECOVERY": (
        "//MODREC JOB (ACCT),'MODIFY RECOVERY',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
        "//STEP1 EXEC DSNUPROC,SYSTEM=DSN1,UID='MODREC',UTPROC=''\n"
        "//SYSIN DD *\n"
        " MODIFY RECOVERY TABLESPACE(DBNAME.TSNAME) AGE(30)\n"
        "/*\n"
    ),
    "CHECK_DATA": (
        "//CHKDATA JOB (ACCT),'CHECK DATA',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
        "//STEP1 EXEC DSNUPROC,SYSTEM=DSN1,UID='CHKDATA',UTPROC=''\n"
        "//SYSIN DD *\n"
        " CHECK DATA TABLESPACE(DBNAME.TSNAME) SCOPE ALL\n"
        "/*\n"
    ),
}

def template_for_db2(cmd: str) -> str:
    return DB2_TEMPLATES.get(cmd, "//GENERIC ...\n")
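
# e.g. template_for_db2("COPY_TABLESPACE") returns the COPY skeleton above; unknown
# keys fall back to the generic "//GENERIC ...\n" stub.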

# ==============================
# UI: sidebar layout + diagnostics
# ==============================
CUSTOM_CSS = """
:root{ --ink:#0f172a; --muted:#475569; }
.gradio-container { max-width: 1200px !important; margin: 0 auto !important; }
.section-card { background: #fff; border: 1px solid #e2e8f0; border-radius: 16px; padding: 16px; box-shadow: 0 10px 30px rgba(2,6,23,.05); }
.section-title { font-size: 1.05rem; font-weight: 800; color: var(--ink); display: flex; gap: .6rem; align-items: center; }
.subtitle { color:var(--muted); font-size:.95rem; margin-top:.25rem; }
.result-card { background:#fcfdff; border:1px solid #e2e8f0; border-radius:12px; padding:12px; }
hr.sep { border:none; border-top:1px dashed #e2e8f0; margin:10px 0 14px; }
.small { font-size:.9rem; color:var(--muted); }
"""

def build_app():
    doc_label = PDFS[0].name if PDFS else "(Nenhum PDF)"
    all_doc_names = [p.name for p in PDFS] or ["(Nenhum PDF encontrado)"]
    default_doc = all_doc_names[0] if all_doc_names else "(Todos)"

    with gr.Blocks(title="Db2 z/OS • RAG (NVIDIA NIM)", css=CUSTOM_CSS, fill_height=True) as demo:

        # ===== Sidebar =====
        with gr.Sidebar():
            gr.Markdown(
                f"""
<div class="section-title">💼 Db2 RAG</div>
<div class="small">Contexto: <code>{doc_label}</code></div>
"""
            )
            status_box = gr.Markdown("Pronto ✅" if PDFS else "⚠️ Nenhum PDF encontrado.")
            gr.Markdown("<hr class='sep'/>")

            with gr.Group():
                gr.Markdown("**Ações**")
                test_btn = gr.Button("🧪 Testar conexão NVIDIA", variant="secondary")
                rebuild_btn = gr.Button("🔁 Reindexar (NIM)")
                diag_btn = gr.Button("🛠️ Diagnóstico do Índice")

            gr.Markdown("<hr class='sep'/>")

            with gr.Accordion("Configuração", open=False):
                gr.Markdown(f"- **Embeddings:** `{EMBED_MODEL}`\n- **LLM:** `{CHAT_MODEL}`\n- **Índice:** `{INDEX_DIR.name}`")
                doc_choice = gr.Dropdown(
                    choices=(["(Todos)"] + all_doc_names),
                    value=default_doc if PDFS else "(Todos)",
                    label="Documento"
                )

        # ===== Main content =====
        gr.Markdown(
            f"""
<div class="section-card" style="padding:18px; display:flex; gap:16px; align-items:center;">
<div style="font-size:26px;">🧭</div>
<div style="flex:1">
<div style="font-size:1.2rem; font-weight:800; color:#0f172a;">DB2 z/OS UTILITIES | RAG + NVIDIA NIM</div>
<div class="subtitle">Pergunte sobre utilidades (COPY, LOAD, REORG, RUNSTATS, RECOVER, etc.). As respostas vêm do manual: <code>{doc_label}</code>.</div>
</div>
</div>
"""
        )

        with gr.Row():
            q = gr.Textbox(
                label="Pergunta (Db2 Utilities)",
                placeholder="Ex.: Como usar COPY FULL com SHRLEVEL CHANGE? • Quando rodar RUNSTATS INDEX? • REORG TABLESPACE SHRLEVEL CHANGE • RECOVER PITR...",
                scale=8
            )
        with gr.Row():
            ask_btn = gr.Button("🔍 Buscar", variant="primary", scale=2)
            clear_btn = gr.Button("🧹 Limpar", scale=1)

        out = gr.Markdown(label="Resposta (Db2)")

        gr.Markdown("<hr class='sep'/>")

        with gr.Accordion("🧩 Templates Db2 executáveis", open=False):
            db2_choice = gr.Dropdown(
                choices=list(DB2_TEMPLATES.keys()),
                value="RUNSTATS_TABLESPACE",
                label="Comando / Padrão"
            )
            db2_btn = gr.Button("📄 Gerar exemplo")
            db2_out = gr.Textbox(label="Exemplo (copiar/ajustar)", lines=18, show_copy_button=True)

        with gr.Accordion("🧪 Log / Diagnóstico", open=False):
            diag_out = gr.Markdown()

        # ===== Callbacks =====
        def _test_conn():
            try:
                dim = len(get_client().embeddings.create(model=EMBED_MODEL, input=["ping"]).data[0].embedding)
                return f"Conexão ok ✅ — dimensão do embedding: **{dim}**"
            except Exception as e:
                return f"⚠️ Falha na conexão/credenciais NVIDIA: `{type(e).__name__}` — {e}"

        def _rebuild():
            try:
                msg = wipe_index()
                mat, chunks = build_index()
                return msg + f" Reindexação concluída ✅ PDFs: {len(PDFS)} • Chunks: {len(chunks)} • Vetores: {mat.shape}"
            except Exception as e:
                return f"⚠️ Erro ao reindexar: `{type(e).__name__}` — {e}"

        def _diagnose(dsel: str) -> str:
            try:
                if not (VEC_FILE.exists() and META_FILE.exists()):
                    return "❌ Nenhum índice encontrado. Clique **Reindexar (NIM)**."
                mat = np.load(VEC_FILE)
                meta = json.loads(META_FILE.read_text(encoding="utf-8"))
                chunks = meta.get("chunks", [])
                embed_dim = meta.get("embed_dim", 0)
                total_chars = int(meta.get("total_chars", 0))
                dim_msg = _check_embed_dim(mat)
                # first sections
                first_secs = []
                for c in chunks[:12]:
                    if dsel == "(Todos)" or c["doc"] == dsel:
                        first_secs.append(f"- {c['doc']} • {c.get('section','?')} • p.{c.get('start_page','?')}-{c.get('end_page','?')}")
                if not first_secs:
                    first_secs = ["(Filtro de documento não encontra seções no índice.)"]
                # preview of the first non-empty chunk
                preview = ""
                for c in chunks:
                    t = (c.get("text") or "").strip()
                    if t:
                        preview = t[:400].replace("\n", " ")
                        break
                if not preview:
                    preview = "(Nenhum chunk contém texto — verifique extração/OCR.)"
                msg = [
                    f"**Índice**: Vetores `{mat.shape}` • embed_dim(meta): `{embed_dim}` • Modelo atual: `{EMBED_MODEL}`",
                    f"**Chunks**: **{len(chunks)}** • **Total de caracteres**: {total_chars}",
                    f"**Documento selecionado**: `{dsel}`",
                    "**Primeiras seções**:\n" + "\n".join(first_secs),
                    f"\n**Prévia (400 chars)**:\n```\n{preview}\n```"
                ]
                if dim_msg:
                    msg.append(f"\n⚠️ {dim_msg}")
                return "\n".join(msg)
            except Exception as e:
                return f"⚠️ Diagnóstico falhou: `{type(e).__name__}` — {e}"

        def _search_answer(qstr: str, d: str) -> str:
            try:
                if not qstr or qstr.strip() == "":
                    return "_Informe uma pergunta._"
                if not (VEC_FILE.exists() and META_FILE.exists()):
                    return "_Nenhum conteúdo indexado. Use **Reindexar**._"
                mat = np.load(VEC_FILE)
                meta = json.loads(META_FILE.read_text(encoding="utf-8"))
                chunks = meta.get("chunks", [])
                if mat.size == 0 or not chunks:
                    return "_Índice vazio. Reindexe (pode ser necessário OCR)._"
                dim_msg = _check_embed_dim(mat)
                if dim_msg:
                    return f"⚠️ {dim_msg}"
                # retrieve
                hits = retrieve_topk(qstr, None if d == "(Todos)" else d, k=TOP_K_RETRIEVE)
                hits = [h for h in hits if (h.get("text") or "").strip()]
                if not hits:
                    return "_Nada encontrado para a consulta (verifique o filtro de documento ou reindexe)._"
                context, sources = expand_context(hits, chunks, TARGET_CONTEXT_CHARS)
                if not context.strip():
                    return "_Contexto insuficiente encontrado._"
                answer = answer_with_llm(qstr, context)
                src_md = format_sources_md(sources)
                return f"<div class='result-card'>{answer}</div>\n\n### Fontes\n{src_md}"
            except Exception as e:
                return f"⚠️ Erro ao buscar: `{type(e).__name__}` — {e}"

        def _clear(doc_default: str) -> Tuple[str, str]:
            return "", (doc_default if PDFS else "(Todos)")

        def ui_db2_template(cmd_choice: str) -> str:
            return template_for_db2(cmd_choice)

        test_btn.click(_test_conn, outputs=[status_box])
        rebuild_btn.click(_rebuild, outputs=[status_box])
        diag_btn.click(_diagnose, inputs=[doc_choice], outputs=[diag_out])

        ask_btn.click(_search_answer, inputs=[q, doc_choice], outputs=[out])
        clear_btn.click(_clear, inputs=[gr.State(default_doc)], outputs=[q, doc_choice])
        db2_btn.click(ui_db2_template, inputs=[db2_choice], outputs=[db2_out])

    return demo

# ==============================
# Main (robust: public bind, honors $PORT, SSR off, optional queue)
# ==============================
if __name__ == "__main__":
    # Server settings (defaults assumed for container/Spaces-style hosting; override via env)
    SERVER_NAME = os.getenv("SERVER_NAME", "0.0.0.0")
    PORT = int(os.getenv("PORT", "7860"))
    ROOT_PATH = os.getenv("ROOT_PATH", "")

    try:
        load_index()  # preload (or build) the index up front; a failure here is only a warning
    except Exception as e:
        print(f"[AVISO] Índice não carregado: {e}")

    demo = build_app()
    try:
        demo = demo.queue()  # some versions reject kwargs; if this fails, run without a queue
    except Exception as e:
        print(f"[INFO] Queue não habilitada ({type(e).__name__}: {e}). Continuando sem fila.")

    print(f"[INFO] Iniciando em http://{SERVER_NAME}:{PORT}{ROOT_PATH} (SSR off)")
    demo.launch(
        server_name=SERVER_NAME,  # 0.0.0.0 for external access
        server_port=PORT,         # honors $PORT when present
        root_path=ROOT_PATH,      # needed when served behind a subpath
        show_error=True,
        ssr_mode=False,           # avoids a blank page behind some proxies
        share=False,
        inbrowser=False
    )
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
# Db2 z/OS • RAG (NVIDIA NIM)
|
| 3 |
+
# Sidebar + Diagnóstico + Extração robusta (com OCR opcional) + Chunkização por caracteres
|
| 4 |
+
# Requisitos:
|
| 5 |
+
# pip install openai gradio numpy pypdf pdfminer.six pymupdf
|
| 6 |
+
# (OCR opcional) + dependências de SO:
|
| 7 |
+
# pip install pytesseract pdf2image pillow
|
| 8 |
+
# Linux: apt-get install -y tesseract-ocr poppler-utils
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import re
|
| 12 |
+
import json
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import List, Tuple, Dict, Optional
|
| 15 |
+
|
| 16 |
+
import numpy as np
|
| 17 |
+
import gradio as gr
|
| 18 |
+
|
| 19 |
+
# ==============================
|
| 20 |
+
# Config (Db2 + NVIDIA NIM)
|
| 21 |
+
# ==============================
|
| 22 |
+
BASE_DIR = Path(__file__).parent if "__file__" in globals() else Path.cwd()
|
| 23 |
+
|
| 24 |
+
# Altere se necessário; o app também aceita PDFs no diretório atual (*.pdf)
|
| 25 |
+
USER_PDF = Path("db2z_13_utilities.pdf")
|
| 26 |
+
PDFS = [USER_PDF] if USER_PDF.exists() else sorted([p for p in BASE_DIR.glob("*.pdf") if p.is_file() and p.suffix.lower()==".pdf"])
|
| 27 |
+
|
| 28 |
+
NVCF_BASE = os.getenv("NVCF_BASE", "https://integrate.api.nvidia.com/v1")
|
| 29 |
+
NVCF_API_KEY = os.getenv("NVCF_API_KEY", "")
|
| 30 |
+
EMBED_MODEL = os.getenv("EMBED_MODEL", "nvidia/nv-embed-v1") # 4096-dim atualmente
|
| 31 |
+
CHAT_MODEL = os.getenv("CHAT_MODEL", "meta/llama-3.1-8b-instruct")
|
| 32 |
+
|
| 33 |
+
SAFE_IDX = f".db2_index_{EMBED_MODEL.replace('/','__')}"
|
| 34 |
+
INDEX_DIR = BASE_DIR / SAFE_IDX
|
| 35 |
+
INDEX_DIR.mkdir(exist_ok=True)
|
| 36 |
+
VEC_FILE = INDEX_DIR / "vectors.npy"
|
| 37 |
+
META_FILE = INDEX_DIR / "meta.json"
|
| 38 |
+
|
| 39 |
+
TOP_K_RETRIEVE = 3
|
| 40 |
+
TARGET_CONTEXT_CHARS = 1500
|
| 41 |
+
|
| 42 |
+
# ==============================
|
| 43 |
+
# NVIDIA (OpenAI-compatible)
|
| 44 |
+
# ==============================
|
| 45 |
+
try:
|
| 46 |
+
from openai import OpenAI
|
| 47 |
+
except ImportError:
|
| 48 |
+
raise RuntimeError("Instale: pip install openai gradio numpy pypdf pdfminer.six pymupdf")
|
| 49 |
+
|
| 50 |
+
def get_client():
|
| 51 |
+
if not NVCF_API_KEY or NVCF_API_KEY == "xxxxxxxxxxxxxxxxxxx":
|
| 52 |
+
raise RuntimeError("NVCF_API_KEY não definido. export/set NVCF_API_KEY='SUA_CHAVE'")
|
| 53 |
+
return OpenAI(base_url=NVCF_BASE, api_key=NVCF_API_KEY)
|
| 54 |
+
|
| 55 |
+
# ==============================
|
| 56 |
+
# PDF utils (robusto + OCR opcional)
|
| 57 |
+
# ==============================
|
| 58 |
+
try:
|
| 59 |
+
import fitz # PyMuPDF
|
| 60 |
+
except Exception:
|
| 61 |
+
fitz = None
|
| 62 |
+
|
| 63 |
+
try:
|
| 64 |
+
from pdfminer.high_level import extract_text as pdfminer_extract_text
|
| 65 |
+
except Exception:
|
| 66 |
+
pdfminer_extract_text = None
|
| 67 |
+
|
| 68 |
+
try:
|
| 69 |
+
from pypdf import PdfReader
|
| 70 |
+
except Exception:
|
| 71 |
+
PdfReader = None
|
| 72 |
+
|
| 73 |
+
NBSP = "\u00A0"
|
| 74 |
+
def _normalize_text(t: str) -> str:
|
| 75 |
+
if not isinstance(t, str):
|
| 76 |
+
t = str(t or "")
|
| 77 |
+
t = t.replace(NBSP, " ")
|
| 78 |
+
t = re.sub(r"[\u0000-\u001F]", " ", t) # remove controles
|
| 79 |
+
# mantém quebras simples e colapsa espaços longos
|
| 80 |
+
t = re.sub(r"[ \t]{2,}", " ", t)
|
| 81 |
+
t = re.sub(r"\n{3,}", "\n\n", t)
|
| 82 |
+
return t.strip()
|
| 83 |
+
|
| 84 |
+
def _extract_with_pymupdf(path: Path) -> List[Tuple[int, str]]:
|
| 85 |
+
out: List[Tuple[int, str]] = []
|
| 86 |
+
with fitz.open(path) as doc:
|
| 87 |
+
for i, page in enumerate(doc):
|
| 88 |
+
text = page.get_text("text") or ""
|
| 89 |
+
out.append((i + 1, _normalize_text(text)))
|
| 90 |
+
return out
|
| 91 |
+
|
| 92 |
+
def _extract_with_pdfminer(path: Path) -> List[Tuple[int, str]]:
|
| 93 |
+
text_all = pdfminer_extract_text(str(path)) or ""
|
| 94 |
+
pages = re.split(r"\f", text_all)
|
| 95 |
+
out: List[Tuple[int, str]] = []
|
| 96 |
+
for i, tx in enumerate(pages):
|
| 97 |
+
out.append((i + 1, _normalize_text(tx)))
|
| 98 |
+
return out
|
| 99 |
+
|
| 100 |
+
def _extract_with_pypdf(path: Path) -> List[Tuple[int, str]]:
|
| 101 |
+
if PdfReader is None:
|
| 102 |
+
return []
|
| 103 |
+
try:
|
| 104 |
+
r = PdfReader(str(path), strict=False)
|
| 105 |
+
except Exception as e:
|
| 106 |
+
print(f"[WARN] pypdf: falha ao abrir {path.name}: {e}")
|
| 107 |
+
return []
|
| 108 |
+
out: List[Tuple[int, str]] = []
|
| 109 |
+
for i in range(len(r.pages)):
|
| 110 |
+
try:
|
| 111 |
+
page = r.pages[i]
|
| 112 |
+
t = page.extract_text() or ""
|
| 113 |
+
except Exception as e:
|
| 114 |
+
print(f"[WARN] pypdf: falha ao extrair pag {i+1}: {e}")
|
| 115 |
+
t = ""
|
| 116 |
+
out.append((i + 1, _normalize_text(t)))
|
| 117 |
+
return out
|
| 118 |
+
|
| 119 |
+
def _maybe_ocr_images(path: Path) -> List[Tuple[int, str]]:
|
| 120 |
+
try:
|
| 121 |
+
import pytesseract
|
| 122 |
+
from pdf2image import convert_from_path
|
| 123 |
+
except Exception:
|
| 124 |
+
return []
|
| 125 |
+
out: List[Tuple[int, str]] = []
|
| 126 |
+
try:
|
| 127 |
+
images = convert_from_path(str(path))
|
| 128 |
+
for i, img in enumerate(images):
|
| 129 |
+
txt = pytesseract.image_to_string(img) or ""
|
| 130 |
+
out.append((i + 1, _normalize_text(txt)))
|
| 131 |
+
except Exception as e:
|
| 132 |
+
print(f"[WARN] OCR: falha ao converter/ler {path.name}: {e}")
|
| 133 |
+
return out
|
| 134 |
+
|
| 135 |
+
def read_pdf_pages(path: Path) -> List[Tuple[int, str]]:
|
| 136 |
+
if not path.exists():
|
| 137 |
+
return []
|
| 138 |
+
# 1) PyMuPDF
|
| 139 |
+
if fitz is not None:
|
| 140 |
+
try:
|
| 141 |
+
pages = _extract_with_pymupdf(path)
|
| 142 |
+
if any((tx or "").strip() for _, tx in pages):
|
| 143 |
+
return pages
|
| 144 |
+
except Exception as e:
|
| 145 |
+
print(f"[WARN] PyMuPDF falhou: {e}")
|
| 146 |
+
# 2) pdfminer
|
| 147 |
+
if pdfminer_extract_text is not None:
|
| 148 |
+
try:
|
| 149 |
+
pages = _extract_with_pdfminer(path)
|
| 150 |
+
if any((tx or "").strip() for _, tx in pages):
|
| 151 |
+
return pages
|
| 152 |
+
except Exception as e:
|
| 153 |
+
print(f"[WARN] pdfminer falhou: {e}")
|
| 154 |
+
# 3) pypdf
|
| 155 |
+
try:
|
| 156 |
+
pages = _extract_with_pypdf(path)
|
| 157 |
+
if any((tx or "").strip() for _, tx in pages):
|
| 158 |
+
return pages
|
| 159 |
+
except Exception as e:
|
| 160 |
+
print(f"[WARN] pypdf falhou: {e}")
|
| 161 |
+
# 4) OCR quando nada foi extraído
|
| 162 |
+
ocr_pages = _maybe_ocr_images(path)
|
| 163 |
+
if not any((tx or "").strip() for _, tx in ocr_pages):
|
| 164 |
+
print("[ERRO] Nenhum texto extraído, nem com OCR.")
|
| 165 |
+
return ocr_pages
|
| 166 |
+
|
| 167 |
+
# ==============================
|
| 168 |
+
# Segmentação (detecção de seções para metadados)
|
| 169 |
+
# ==============================
|
| 170 |
+
DB2_HEADER_RE = re.compile(
|
| 171 |
+
r"^(Part\s+\d+\.|Chapter\s+\d+\.)|"
|
| 172 |
+
r"\b(BACKUP SYSTEM|CATMAINT|CHECK DATA|CHECK INDEX|CHECK LOB|COPY|COPYTOCOPY|DIAGNOSE|LISTDEF|LOAD|"
|
| 173 |
+
r"MERGECOPY|MODIFY RECOVERY|MODIFY STATISTICS|OPTIONS|QUIESCE|REBUILD INDEX|RECOVER|REORG INDEX|REORG TABLESPACE|"
|
| 174 |
+
r"REPAIR|REPORT|RESTORE SYSTEM|RUNSTATS|STOSPACE|TEMPLATE|UNLOAD)\b",
|
| 175 |
+
re.IGNORECASE
|
| 176 |
+
)
|
| 177 |
+
|
| 178 |
+
def split_db2_docs(pages: List[Tuple[int, str]], doc_label: str) -> List[Dict]:
|
| 179 |
+
"""Agrupa páginas por possíveis cabeçalhos (capítulos/utilities) para compor metadados de seção."""
|
| 180 |
+
blocks: List[Dict] = []
|
| 181 |
+
current = {"doc": doc_label, "section": "INTRO", "start_page": 1, "texts": []}
|
| 182 |
+
for pg, tx in pages:
|
| 183 |
+
head = (tx or "")[:300]
|
| 184 |
+
if DB2_HEADER_RE.search(head):
|
| 185 |
+
if current["texts"]:
|
| 186 |
+
current["end_page"] = current["texts"][-1][0]
|
| 187 |
+
blocks.append(current)
|
| 188 |
+
m = re.search(r"(Chapter\s+\d+\.\s*[^\n]+|^[^\n]{1,200})", tx or "")
|
| 189 |
+
title = (m.group(1).strip() if m else f"Section@{pg}")
|
| 190 |
+
current = {"doc": doc_label, "section": title, "start_page": pg, "texts": []}
|
| 191 |
+
current["texts"].append((pg, tx or ""))
|
| 192 |
+
if current["texts"]:
|
| 193 |
+
current["end_page"] = current["texts"][-1][0]
|
| 194 |
+
blocks.append(current)
|
| 195 |
+
return blocks
|
| 196 |
+
|
| 197 |
+
# ==============================
|
| 198 |
+
# Chunkização por caracteres (robusta)
|
| 199 |
+
# ==============================
|
| 200 |
+
def make_chunks_by_chars(blocks: List[Dict], max_chars: int = 1500, min_chars: int = 180) -> List[Dict]:
|
| 201 |
+
"""Concatena o texto das páginas de cada bloco e fatia por janelas de caracteres com overlap."""
|
| 202 |
+
out: List[Dict] = []
|
| 203 |
+
for b in blocks:
|
| 204 |
+
pieces: List[str] = []
|
| 205 |
+
pages: List[int] = []
|
| 206 |
+
for pg, tx in b["texts"]:
|
| 207 |
+
txn = _normalize_text(tx or "")
|
| 208 |
+
if txn:
|
| 209 |
+
pieces.append(txn)
|
| 210 |
+
pages.append(pg)
|
| 211 |
+
if not pieces:
|
| 212 |
+
continue
|
| 213 |
+
blob = "\n".join(pieces).strip()
|
| 214 |
+
if not blob:
|
| 215 |
+
continue
|
| 216 |
+
start_page = min(pages) if pages else b.get("start_page", 0)
|
| 217 |
+
end_page = max(pages) if pages else b.get("end_page", start_page)
|
| 218 |
+
|
| 219 |
+
if len(blob) <= max_chars and len(blob) >= min_chars:
|
| 220 |
+
out.append({
|
| 221 |
+
"doc": b["doc"],
|
| 222 |
+
"section": b["section"],
|
| 223 |
+
"start_page": start_page,
|
| 224 |
+
"end_page": end_page,
|
| 225 |
+
"text": blob
|
| 226 |
+
})
|
| 227 |
+
continue
|
| 228 |
+
|
| 229 |
+
overlap = 120
|
| 230 |
+
i, n = 0, len(blob)
|
| 231 |
+
while i < n:
|
| 232 |
+
j = min(i + max_chars, n)
|
| 233 |
+
chunk_text = blob[i:j].strip()
|
| 234 |
+
if len(chunk_text) >= min_chars:
|
| 235 |
+
out.append({
|
| 236 |
+
"doc": b["doc"],
|
| 237 |
+
"section": b["section"],
|
| 238 |
+
"start_page": start_page,
|
| 239 |
+
"end_page": end_page,
|
| 240 |
+
"text": chunk_text
|
| 241 |
+
})
|
| 242 |
+
new_i = j - overlap
|
| 243 |
+
i = j if new_i <= i else new_i
|
| 244 |
+
# filtro final
|
| 245 |
+
out = [c for c in out if (c.get("text") or "").strip()]
|
| 246 |
+
return out
|
| 247 |
+
|
| 248 |
+
# ==============================
|
| 249 |
+
# Embeddings
|
| 250 |
+
# ==============================
|
| 251 |
+
def embed_texts(texts: List[str], batch_size: int = 16) -> np.ndarray:
|
| 252 |
+
client = get_client()
|
| 253 |
+
clean = [(i, t) for i, t in enumerate(texts) if isinstance(t, str) and t.strip()]
|
| 254 |
+
if not clean:
|
| 255 |
+
return np.zeros((0, 0), dtype=np.float32)
|
| 256 |
+
order, payload = zip(*clean)
|
| 257 |
+
vecs: Dict[int, np.ndarray] = {}
|
| 258 |
+
for i in range(0, len(payload), batch_size):
|
| 259 |
+
batch = list(payload[i:i + batch_size])
|
| 260 |
+
resp = client.embeddings.create(model=EMBED_MODEL, input=batch)
|
| 261 |
+
for k, item in enumerate(resp.data):
|
| 262 |
+
vecs[int(order[i + k])] = np.array(item.embedding, dtype=np.float32)
|
| 263 |
+
rows: List[np.ndarray] = []
|
| 264 |
+
for idx in range(len(texts)):
|
| 265 |
+
if idx in vecs:
|
| 266 |
+
rows.append(vecs[idx])
|
| 267 |
+
if not rows:
|
| 268 |
+
return np.zeros((0, 0), dtype=np.float32)
|
| 269 |
+
mat = np.vstack(rows).astype(np.float32)
|
| 270 |
+
norms = np.linalg.norm(mat, axis=1, keepdims=True)
|
| 271 |
+
norms[norms == 0] = 1.0
|
| 272 |
+
return mat / norms
|
| 273 |
+
|
| 274 |
+
def embed_query(q: str) -> np.ndarray:
|
| 275 |
+
client = get_client()
|
| 276 |
+
resp = client.embeddings.create(model=EMBED_MODEL, input=[q])
|
| 277 |
+
v = np.array(resp.data[0].embedding, dtype=np.float32)
|
| 278 |
+
n = np.linalg.norm(v)
|
| 279 |
+
return (v / (n if n > 0 else 1.0)).astype(np.float32)
|
| 280 |
+
|
| 281 |
+
# ==============================
|
| 282 |
+
# Indexação
|
| 283 |
+
# ==============================
|
| 284 |
+
def build_index() -> Tuple[np.ndarray, List[Dict]]:
|
| 285 |
+
all_blocks: List[Dict] = []
|
| 286 |
+
for p in PDFS:
|
| 287 |
+
pages = read_pdf_pages(p)
|
| 288 |
+
if not pages or not any((tx or "").strip() for _, tx in pages):
|
| 289 |
+
print(f"[WARN] Sem texto legível em {p.name}; ignorando.")
|
| 290 |
+
continue
|
| 291 |
+
blks = split_db2_docs(pages, p.name)
|
| 292 |
+
all_blocks.extend(blks)
|
| 293 |
+
|
| 294 |
+
all_chunks = make_chunks_by_chars(all_blocks, max_chars=1500, min_chars=180)
|
| 295 |
+
all_chunks = [c for c in all_chunks if (c.get("text") or "").strip()]
|
| 296 |
+
|
| 297 |
+
if not all_chunks:
|
| 298 |
+
with open(META_FILE, "w", encoding="utf-8") as f:
|
| 299 |
+
json.dump({"chunks": [], "embed_model": EMBED_MODEL, "embed_dim": 0, "total_chars": 0}, f, ensure_ascii=False, indent=2)
|
| 300 |
+
np.save(VEC_FILE, np.zeros((0, 0), dtype=np.float32))
|
| 301 |
+
raise RuntimeError("Nenhum chunk foi criado. Verifique extração/ OCR.")
|
| 302 |
+
|
| 303 |
+
texts = [c["text"] for c in all_chunks]
|
| 304 |
+
total_chars = sum(len(t) for t in texts)
|
| 305 |
+
|
| 306 |
+
mat = embed_texts(texts) if texts else np.zeros((0, 0), dtype=np.float32)
|
| 307 |
+
embed_dim = int(mat.shape[1]) if mat.size else 0
|
| 308 |
+
|
| 309 |
+
np.save(VEC_FILE, mat)
|
| 310 |
+
with open(META_FILE, "w", encoding="utf-8") as f:
|
| 311 |
+
json.dump(
|
| 312 |
+
{"chunks": all_chunks, "embed_model": EMBED_MODEL, "embed_dim": embed_dim, "total_chars": total_chars},
|
| 313 |
+
f, ensure_ascii=False, indent=2
|
| 314 |
+
)
|
| 315 |
+
return mat, all_chunks
|
| 316 |
+
|
| 317 |
+
def load_index() -> Tuple[np.ndarray, List[Dict]]:
|
| 318 |
+
if VEC_FILE.exists() and META_FILE.exists():
|
| 319 |
+
mat = np.load(VEC_FILE)
|
| 320 |
+
dd = json.loads(META_FILE.read_text(encoding="utf-8"))
|
| 321 |
+
chunks = dd.get("chunks", [])
|
| 322 |
+
return mat, chunks
|
| 323 |
+
return build_index()
|
| 324 |
+
|
| 325 |
+
def wipe_index() -> str:
|
| 326 |
+
try:
|
| 327 |
+
if INDEX_DIR.exists():
|
| 328 |
+
for p in INDEX_DIR.glob("*"):
|
| 329 |
+
p.unlink()
|
| 330 |
+
INDEX_DIR.rmdir()
|
| 331 |
+
INDEX_DIR.mkdir(exist_ok=True)
|
| 332 |
+
return "Índice limpo."
|
| 333 |
+
except Exception as e:
|
| 334 |
+
return f"Erro ao limpar índice: {e}"
|
| 335 |
+
|
| 336 |
+
# ==============================
|
| 337 |
+
# Recuperação + LLM
|
| 338 |
+
# ==============================
|
| 339 |
+
def _check_embed_dim(mat: np.ndarray) -> Optional[str]:
|
| 340 |
+
try:
|
| 341 |
+
dd = json.loads(META_FILE.read_text(encoding="utf-8"))
|
| 342 |
+
idx_dim = int(dd.get("embed_dim", 0))
|
| 343 |
+
except Exception:
|
| 344 |
+
idx_dim = 0
|
| 345 |
+
try:
|
| 346 |
+
v = embed_query("dim_test")
|
| 347 |
+
cur_dim = int(v.shape[0])
|
| 348 |
+
except Exception as e:
|
| 349 |
+
return f"Falha ao checar dimensão do embedding: {e}"
|
| 350 |
+
if idx_dim and cur_dim and idx_dim != cur_dim:
|
| 351 |
+
return (f"Incompatibilidade de dimensão do embedding: índice={idx_dim}, modelo atual={cur_dim}. "
|
| 352 |
+
f"Reindexe com o mesmo EMBED_MODEL. (Atual EMBED_MODEL: {EMBED_MODEL})")
|
| 353 |
+
return None
|
| 354 |
+
|
| 355 |
+
def retrieve_topk(query: str, doc_filter: Optional[str] = None, k: int = TOP_K_RETRIEVE) -> List[Dict]:
|
| 356 |
+
mat, chunks = load_index()
|
| 357 |
+
if mat.shape[0] == 0 or not chunks:
|
| 358 |
+
return []
|
| 359 |
+
qv = embed_query(query)
|
| 360 |
+
if mat.shape[1] != qv.shape[0]:
|
| 361 |
+
raise RuntimeError(
|
| 362 |
+
f"Dimensão incompatível mat={mat.shape} vs query={qv.shape}. "
|
| 363 |
+
f"Provável troca de EMBED_MODEL após criar o índice. Clique 'Reindexar'."
|
| 364 |
+
)
|
| 365 |
+
sims = (mat @ qv).astype(float)
|
| 366 |
+
if doc_filter and doc_filter != "(Todos)":
|
| 367 |
+
mask = np.array([1.0 if c["doc"] == doc_filter else 0.0 for c in chunks], dtype=float)
|
| 368 |
+
sims *= mask
|
| 369 |
+
idxs = np.argsort(-sims)[:k]
|
| 370 |
+
out = []
|
| 371 |
+
for i in idxs:
|
| 372 |
+
c = chunks[int(i)]
|
| 373 |
+
out.append({
|
| 374 |
+
"doc": c["doc"],
|
| 375 |
+
"section": c.get("section", ""),
|
| 376 |
+
"start_page": c.get("start_page", "?"),
|
| 377 |
+
"end_page": c.get("end_page", "?"),
|
| 378 |
+
"text": c["text"],
|
| 379 |
+
"score": float(sims[int(i)]),
|
| 380 |
+
"idx": int(i)
|
| 381 |
+
})
|
| 382 |
+
return out
|
| 383 |
+
|
| 384 |
+
def expand_context(hits: List[Dict], all_chunks: List[Dict], target_chars: int = TARGET_CONTEXT_CHARS) -> Tuple[str, List[Tuple[str, str, str]]]:
|
| 385 |
+
if not hits:
|
| 386 |
+
return "", []
|
| 387 |
+
best = max(hits, key=lambda x: x["score"])
|
| 388 |
+
ctx = best["text"]
|
| 389 |
+
srcs = {(best["doc"], best["section"], f"{best['start_page']}–{best['end_page']}")}
|
| 390 |
+
doc, section, best_idx = best["doc"], best["section"], best["idx"]
|
| 391 |
+
indices = [i for i, c in enumerate(all_chunks) if c["doc"] == doc and c.get("section", "") == section]
|
| 392 |
+
if not indices:
|
| 393 |
+
return ctx, sorted(list(srcs))
|
| 394 |
+
indices.sort()
|
| 395 |
+
if best_idx not in indices:
|
| 396 |
+
return ctx, sorted(list(srcs))
|
| 397 |
+
pos = indices.index(best_idx)
|
| 398 |
+
left, right = pos - 1, pos + 1
|
| 399 |
+
while len(ctx) < target_chars and (left >= 0 or right < len(indices)):
|
| 400 |
+
if right < len(indices) and len(ctx) < target_chars:
|
| 401 |
+
rch = all_chunks[indices[right]]
|
| 402 |
+
ctx += "\n\n" + rch["text"]
|
| 403 |
+
srcs.add((doc, section, f"{rch.get('start_page', '?')}–{rch.get('end_page', '?')}"))
|
| 404 |
+
right += 1
|
| 405 |
+
if left >= 0 and len(ctx) < target_chars:
|
| 406 |
+
lch = all_chunks[indices[left]]
|
| 407 |
+
ctx = lch["text"] + "\n\n" + ctx
|
| 408 |
+
srcs.add((doc, section, f"{lch.get('start_page', '?')}–{lch.get('end_page', '?')}"))
|
| 409 |
+
left -= 1
|
| 410 |
+
return ctx, sorted(list(srcs))
|
| 411 |
+
|
| 412 |
+
def answer_with_llm(question: str, context: str) -> str:
|
| 413 |
+
client = get_client()
|
| 414 |
+
system = ("Você é um assistente especialista em IBM Db2 para z/OS. "
|
| 415 |
+
"Responda em português, com exemplos de comandos SQL/JCL completos e corretos. "
|
| 416 |
+
"Use apenas o contexto fornecido; se algo não estiver nele, diga que não está disponível.")
|
| 417 |
+
user = (f"Pergunta:\n{question}\n\n"
|
| 418 |
+
f"Contexto do(s) manual(is):\n{context}\n\n"
|
| 419 |
+
"Regras de resposta:\n"
|
| 420 |
+
"- Explique o necessário e como fazer.\n"
|
| 421 |
+
"- Inclua pelo menos um exemplo de comando Db2 utilitário, SQL ou JCL (auto-contido), se aplicável.\n"
|
| 422 |
+
"- Liste observações/pré-requisitos, se houver.\n"
|
| 423 |
+
"- Cite as fontes (Documento e páginas) ao final.")
|
| 424 |
+
chat = client.chat.completions.create(
|
| 425 |
+
model=CHAT_MODEL,
|
| 426 |
+
messages=[{"role": "system", "content": system}, {"role": "user", "content": user}],
|
| 427 |
+
temperature=0.2,
|
| 428 |
+
)
|
| 429 |
+
return chat.choices[0].message.content.strip()
|
| 430 |
+
|
| 431 |
+
def format_sources_md(sources: List[Tuple[str, str, str]]) -> str:
|
| 432 |
+
if not sources:
|
| 433 |
+
return ""
|
| 434 |
+
lines = [
|
| 435 |
+
f"- **Documento:** {d} \n **Seção:** {s} \n **Páginas:** {p}"
|
| 436 |
+
for (d, s, p) in sources
|
| 437 |
+
]
|
| 438 |
+
return "\n".join(lines)
|
| 439 |
+
|
| 440 |
+
# ==============================
|
| 441 |
+
# Templates Db2 (exemplos)
|
| 442 |
+
# ==============================
|
| 443 |
+
DB2_TEMPLATES: Dict[str, str] = {
|
| 444 |
+
"RUNSTATS_TABLESPACE": (
|
| 445 |
+
"//RUNSTAT JOB (ACCT),'RUNSTATS',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
|
| 446 |
+
"//STEP1 EXEC DSNUPROC,SYSTEM=DSN1,UID='RUNSTATS',UTPROC=''\n"
|
| 447 |
+
"//SYSIN DD *\n"
|
| 448 |
+
" RUNSTATS TABLESPACE(DBNAME.TSNAME) TABLE(ALL) INDEX(ALL)\n"
|
| 449 |
+
"/*\n"
|
| 450 |
+
),
|
| 451 |
+
"REORG_TABLESPACE": (
|
| 452 |
+
"//REORG JOB (ACCT),'REORG',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
|
| 453 |
+
"//STEP1 EXEC DSNUPROC,SYSTEM=DSN1,UID='REORGTS',UTPROC=''\n"
|
| 454 |
+
"//SYSIN DD *\n"
|
| 455 |
+
" REORG TABLESPACE(DBNAME.TSNAME) SHRLEVEL CHANGE\n"
|
| 456 |
+
"/*\n"
|
| 457 |
+
),
|
| 458 |
+
"EXPLAIN_SQL": (
|
| 459 |
+
"//EXPLAIN JOB (ACCT),'EXPLAIN',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
|
| 460 |
+
"//STEP1 EXEC DSNTEP2,SYSTEM=DSN1\n"
|
| 461 |
+
"//SYSIN DD *\n"
|
| 462 |
+
" EXPLAIN PLAN FOR\n"
|
| 463 |
+
" SELECT COL1, COL2 FROM DBNAME.TBNAME WHERE COL3 = 'X';\n"
|
| 464 |
+
"/*\n"
|
| 465 |
+
),
|
| 466 |
+
"DISPLAY_BUFFERPOOL": (
|
| 467 |
+
"//DISPBP JOB (ACCT),'DISPLAY BP',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
|
| 468 |
+
"//STEP1 EXEC PGM=IKJEFT01\n"
|
| 469 |
+
"//SYSTSPRT DD SYSOUT=*\n"
|
| 470 |
+
"//SYSIN DD *\n"
|
| 471 |
+
" DSN SYSTEM(DSN1)\n"
|
| 472 |
+
" -DISPLAY BUFFERPOOL(BP0) DETAIL\n"
|
| 473 |
+
" END\n"
|
| 474 |
+
"/*\n"
|
| 475 |
+
),
|
| 476 |
+
"DSNTEP2_SELECT": (
|
| 477 |
+
"//SELECT JOB (ACCT),'DSNTEP2',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
|
| 478 |
+
"//STEP1 EXEC DSNTEP2,SYSTEM=DSN1\n"
|
| 479 |
+
"//SYSIN DD *\n"
|
| 480 |
+
" SELECT FIRSTNME, LASTNAME FROM DSN8810.EMP\n"
|
| 481 |
+
" WHERE WORKDEPT = 'A00';\n"
|
| 482 |
+
"/*\n"
|
| 483 |
+
),
|
| 484 |
+
"COPY_TABLESPACE": (
|
| 485 |
+
"//COPYTS JOB (ACCT),'COPY',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
|
| 486 |
+
"//STEP1 EXEC DSNUPROC,SYSTEM=DSN1,UID='COPYTS',UTPROC=''\n"
|
| 487 |
+
"//SYSIN DD *\n"
|
| 488 |
+
" COPY TABLESPACE(DBNAME.TSNAME) FULL YES SHRLEVEL CHANGE\n"
|
| 489 |
+
"/*\n"
|
| 490 |
+
),
|
| 491 |
+
"LOAD_TABLE": (
|
| 492 |
+
"//LOADTBL JOB (ACCT),'LOAD',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
|
| 493 |
+
"//STEP1 EXEC DSNUPROC,SYSTEM=DSN1,UID='LOADTBL',UTPROC=''\n"
|
| 494 |
+
"//SYSIN DD *\n"
|
| 495 |
+
" LOAD DATA INDDN SYSREC INTO TABLE DBNAME.TBNAME\n"
|
| 496 |
+
" REPLACE\n"
|
| 497 |
+
"/*\n"
|
| 498 |
+
),
|
| 499 |
+
"RECOVER_TABLESPACE": (
|
| 500 |
+
"//RECOVTS JOB (ACCT),'RECOVER',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
|
| 501 |
+
"//STEP1 EXEC DSNUPROC,SYSTEM=DSN1,UID='RECOVTS',UTPROC=''\n"
|
| 502 |
+
"//SYSIN DD *\n"
|
| 503 |
+
" RECOVER TABLESPACE(DBNAME.TSNAME)\n"
|
| 504 |
+
"/*\n"
|
| 505 |
+
),
|
| 506 |
+
"STATS_INDEX": (
|
| 507 |
+
"//STATSIX JOB (ACCT),'STATS INDEX',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
|
| 508 |
+
"//STEP1 EXEC DSNUPROC,SYSTEM=DSN1,UID='STATSIX',UTPROC=''\n"
|
| 509 |
+
"//SYSIN DD *\n"
|
| 510 |
+
" RUNSTATS INDEX(DBNAME.IXNAME) ALL\n"
|
| 511 |
+
"/*\n"
|
| 512 |
+
),
|
| 513 |
+
"MODIFY_RECOVERY": (
|
| 514 |
+
"//MODREC JOB (ACCT),'MODIFY RECOVERY',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
|
| 515 |
+
"//STEP1 EXEC DSNUPROC,SYSTEM=DSN1,UID='MODREC',UTPROC=''\n"
|
| 516 |
+
"//SYSIN DD *\n"
|
| 517 |
+
" MODIFY RECOVERY TABLESPACE(DBNAME.TSNAME) AGE(30)\n"
|
| 518 |
+
"/*\n"
|
| 519 |
+
),
|
| 520 |
+
"CHECK_DATA": (
|
| 521 |
+
"//CHKDATA JOB (ACCT),'CHECK DATA',CLASS=A,MSGCLASS=X,NOTIFY=&SYSUID\n"
|
| 522 |
+
"//STEP1 EXEC DSNUPROC,SYSTEM=DSN1,UID='CHKDATA',UTPROC=''\n"
|
| 523 |
+
"//SYSIN DD *\n"
|
| 524 |
+
" CHECK DATA TABLESPACE(DBNAME.TSNAME) SCOPE ALL\n"
|
| 525 |
+
"/*\n"
|
| 526 |
+
),
|
| 527 |
+
}
|
| 528 |
+
|
| 529 |
+
def template_for_db2(cmd: str) -> str:
|
| 530 |
+
return DB2_TEMPLATES.get(cmd, "//GENERIC ...\n")

# ==============================
# UI — layout with SIDEBAR + diagnostics
# ==============================
CUSTOM_CSS = """
:root{ --ink:#0f172a; --muted:#475569; }
.gradio-container { max-width: 1200px !important; margin: 0 auto !important; }
.section-card { background: #fff; border: 1px solid #e2e8f0; border-radius: 16px; padding: 16px; box-shadow: 0 10px 30px rgba(2,6,23,.05); }
.section-title { font-size: 1.05rem; font-weight: 800; color: var(--ink); display: flex; gap: .6rem; align-items: center; }
.subtitle { color:var(--muted); font-size:.95rem; margin-top:.25rem; }
.result-card { background:#fcfdff; border:1px solid #e2e8f0; border-radius:12px; padding:12px; }
hr.sep { border:none; border-top:1px dashed #e2e8f0; margin:10px 0 14px; }
.small { font-size:.9rem; color:var(--muted); }
"""

def build_app():
    doc_label = PDFS[0].name if PDFS else "(No PDF)"
    all_doc_names = [p.name for p in PDFS] or ["(No PDF found)"]
    default_doc = all_doc_names[0] if all_doc_names else "(All)"

    with gr.Blocks(title="Db2 z/OS • RAG (NVIDIA NIM)", css=CUSTOM_CSS, fill_height=True) as demo:

        # ===== Sidebar =====
        with gr.Sidebar():
            gr.Markdown(
                f"""
<div class="section-title">💼 Db2 RAG</div>
<div class="small">Context: <code>{doc_label}</code></div>
"""
            )
            status_box = gr.Markdown("Ready ✅" if PDFS else "⚠️ No PDF found.")
            gr.Markdown("<hr class='sep'/>")

            with gr.Group():
                gr.Markdown("**Actions**")
                test_btn = gr.Button("🧪 Test NVIDIA connection", variant="secondary")
                rebuild_btn = gr.Button("🔁 Reindex (NIM)")
                diag_btn = gr.Button("🛠️ Index diagnostics")

            gr.Markdown("<hr class='sep'/>")

            with gr.Accordion("Configuration", open=False):
                gr.Markdown(f"- **Embeddings:** `{EMBED_MODEL}`\n- **LLM:** `{CHAT_MODEL}`\n- **Index:** `{INDEX_DIR.name}`")
                doc_choice = gr.Dropdown(
                    choices=(["(All)"] + all_doc_names),
                    value=default_doc if PDFS else "(All)",
                    label="Document"
                )
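                # "(All)" is a sentinel value: the callbacks below treat it as
                # "no document filter".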

        # ===== Main content =====
        gr.Markdown(
            f"""
<div class="section-card" style="padding:18px; display:flex; gap:16px; align-items:center;">
  <div style="font-size:26px;">🧭</div>
  <div style="flex:1">
    <div style="font-size:1.2rem; font-weight:800; color:#0f172a;">Db2 z/OS UTILITIES | RAG + NVIDIA NIM</div>
    <div class="subtitle">Ask about utilities (COPY, LOAD, REORG, RUNSTATS, RECOVER, etc.). Answers come from the manual: <code>{doc_label}</code>.</div>
  </div>
</div>
"""
        )

        with gr.Row():
            q = gr.Textbox(
                label="Question (Db2 Utilities)",
                placeholder="E.g.: How do I use COPY FULL with SHRLEVEL CHANGE? • When should I run RUNSTATS INDEX? • REORG TABLESPACE SHRLEVEL CHANGE • RECOVER PITR...",
                scale=8
            )
        with gr.Row():
            ask_btn = gr.Button("🔍 Search", variant="primary", scale=2)
            clear_btn = gr.Button("🧹 Clear", scale=1)

        out = gr.Markdown(label="Answer (Db2)")

        gr.Markdown("<hr class='sep'/>")

        with gr.Accordion("🧩 Executable Db2 templates", open=False):
            db2_choice = gr.Dropdown(
                choices=list(DB2_TEMPLATES.keys()),
                value="RUNSTATS_TABLESPACE",
                label="Command / Pattern"
            )
            db2_btn = gr.Button("📄 Generate example")
            db2_out = gr.Textbox(label="Example (copy/adjust)", lines=18, show_copy_button=True)

        with gr.Accordion("🧪 Log / Diagnostics", open=False):
            diag_out = gr.Markdown()

        # ===== Callbacks =====
        def _test_conn():
            try:
                dim = len(get_client().embeddings.create(model=EMBED_MODEL, input=["ping"]).data[0].embedding)
                return f"Connection OK ✅ — embedding dimension: **{dim}**"
            except Exception as e:
                return f"⚠️ NVIDIA connection/credentials failure: `{type(e).__name__}` — {e}"

        def _rebuild():
            try:
                msg = wipe_index()
                mat, chunks = build_index()
                return msg + f" Reindexing complete ✅ PDFs: {len(PDFS)} • Chunks: {len(chunks)} • Vectors: {mat.shape}"
            except Exception as e:
                return f"⚠️ Reindexing failed: `{type(e).__name__}` — {e}"

        def _diagnose(dsel: str) -> str:
            try:
                if not (VEC_FILE.exists() and META_FILE.exists()):
                    return "❌ No index found. Click **Reindex (NIM)**."
                mat = np.load(VEC_FILE)
                meta = json.loads(META_FILE.read_text(encoding="utf-8"))
                chunks = meta.get("chunks", [])
                embed_dim = meta.get("embed_dim", 0)
                total_chars = int(meta.get("total_chars", 0))
                dim_msg = _check_embed_dim(mat)
                # first sections
                first_secs = []
                for c in chunks[:12]:
                    if dsel == "(All)" or c["doc"] == dsel:
                        first_secs.append(f"- {c['doc']} • {c.get('section','?')} • p.{c.get('start_page','?')}-{c.get('end_page','?')}")
                if not first_secs:
                    first_secs = ["(The document filter matches no sections in the index.)"]
                # preview of the first non-empty chunk
                preview = ""
                for c in chunks:
                    t = (c.get("text") or "").strip()
                    if t:
                        preview = t[:400].replace("\n", " ")
                        break
                if not preview:
                    preview = "(No chunk contains text — check extraction/OCR.)"
                msg = [
                    f"**Index**: vectors `{mat.shape}` • embed_dim(meta): `{embed_dim}` • current model: `{EMBED_MODEL}`",
                    f"**Chunks**: **{len(chunks)}** • **Total characters**: {total_chars}",
                    f"**Selected document**: `{dsel}`",
                    f"**First sections**:\n" + "\n".join(first_secs),
                    f"\n**Preview (400 chars)**:\n```\n{preview}\n```"
                ]
                if dim_msg:
                    msg.append(f"\n⚠️ {dim_msg}")
                return "\n".join(msg)
            except Exception as e:
                return f"⚠️ Diagnostics failed: `{type(e).__name__}` — {e}"

        def _search_answer(qstr: str, d: str) -> str:
            try:
                if not qstr or qstr.strip() == "":
                    return "_Enter a question._"
                if not (VEC_FILE.exists() and META_FILE.exists()):
                    return "_No content indexed. Use **Reindex**._"
                mat = np.load(VEC_FILE)
                meta = json.loads(META_FILE.read_text(encoding="utf-8"))
                chunks = meta.get("chunks", [])
                if mat.size == 0 or not chunks:
                    return "_Empty index. Reindex (OCR may be required)._"
                dim_msg = _check_embed_dim(mat)
                if dim_msg:
                    return f"⚠️ {dim_msg}"
                # retrieve
                hits = retrieve_topk(qstr, None if d == "(All)" else d, k=TOP_K_RETRIEVE)
                hits = [h for h in hits if (h.get("text") or "").strip()]
                if not hits:
                    return "_Nothing found for the query (check the document filter or reindex)._"
                context, sources = expand_context(hits, chunks, TARGET_CONTEXT_CHARS)
                if not context.strip():
                    return "_Insufficient context found._"
                answer = answer_with_llm(qstr, context)
                src_md = format_sources_md(sources)
                return f"<div class='result-card'>{answer}</div>\n\n### Sources\n{src_md}"
            except Exception as e:
                return f"⚠️ Search failed: `{type(e).__name__}` — {e}"

        def _clear(doc_default: str) -> Tuple[str, str]:
            return "", (doc_default if PDFS else "(All)")

        def ui_db2_template(cmd_choice: str) -> str:
            return template_for_db2(cmd_choice)

        test_btn.click(_test_conn, outputs=[status_box])
        rebuild_btn.click(_rebuild, outputs=[status_box])
        diag_btn.click(_diagnose, inputs=[doc_choice], outputs=[diag_out])

        ask_btn.click(_search_answer, inputs=[q, doc_choice], outputs=[out])
        clear_btn.click(_clear, inputs=[gr.State(default_doc)], outputs=[q, doc_choice])
        db2_btn.click(ui_db2_template, inputs=[db2_choice], outputs=[db2_out])

    return demo

# ==============================
# Main (robust: public bind, respects $PORT, SSR off, optional queue)
# ==============================
if __name__ == "__main__":
    try:
        _ = load_index()
    except Exception as e:
        print(f"[WARNING] Index not loaded: {e}")
    app = build_app()
    # Honor $PORT when the host injects one (e.g., Spaces/PaaS); default 7860.
    port = int(os.getenv("PORT", "7860"))
    app.launch(server_name="0.0.0.0", server_port=port, ssr_mode=False)
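    # Optional, per the note above: for concurrent users, Gradio's request
    # queue can be enabled before launching, e.g. app.queue(max_size=32).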