Geoeasy committed
Commit 7d9d7d9 · verified · 1 parent: ab6f0d9

Upload 3 files

Files changed (3)
  1. app.py +346 -0
  2. logo.svg +254 -0
  3. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,346 @@
"""
A complete RAG app in Gradio using:
- a crawler for the pandas user guide (internal links)
- chunking + FAISS (local vector store)
- embeddings and LLM via NVIDIA NIM (OpenAI-compatible API)

How to use:
1) Install the dependencies:
   pip install gradio requests beautifulsoup4 langchain langchain-community faiss-cpu langchain-nvidia-ai-endpoints

2) Set your NVIDIA (NIM) API key:
   export NVIDIA_API_KEY="YOUR_TOKEN"
   # or on Windows PowerShell: $env:NVIDIA_API_KEY="YOUR_TOKEN"

3) Run the app:
   python app.py

Notes:
- The FAISS index is saved to ./indices and reused on later runs.
- The crawler respects robots.txt and rate-limits requests (SLEEP_SECONDS).
- You can cap the number of pages while testing by setting MAX_PAGES.

Swapping models:
- LLM: change `LLM_MODEL` (e.g. "meta/llama-3.1-8b-instruct", "mistralai/mixtral-8x7b-instruct-v0.1", etc.)
- Embeddings: change `EMBED_MODEL` (e.g. "nvidia/nv-embed-v1")
"""
from __future__ import annotations

import os
import re
import time
import queue
import logging
import base64
from io import StringIO
from typing import List, Dict, Set, Tuple

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
import urllib.robotparser as robotparser

import gradio as gr

# LangChain & vector search
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# NVIDIA NIM endpoints (LangChain integration)
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA

# ----------------------------
# Logging / observability
# ----------------------------
class InMemoryLogHandler(logging.Handler):
    """Keeps formatted log records in memory so the UI can display them."""

    def __init__(self):
        super().__init__()
        self.buffer = StringIO()

    def emit(self, record):
        msg = self.format(record)
        self.buffer.write(msg + "\n")

    def get_value(self):
        return self.buffer.getvalue()

    def clear(self):
        self.buffer.seek(0)
        self.buffer.truncate(0)

logger = logging.getLogger("rag_pandas")
logger.setLevel(logging.INFO)
_stream_handler = logging.StreamHandler()
_stream_handler.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
logger.addHandler(_stream_handler)

mem_handler = InMemoryLogHandler()
mem_handler.setFormatter(logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s", datefmt="%H:%M:%S"))
logger.addHandler(mem_handler)

# ----------------------------
# Configuration
# ----------------------------
BASE_URL = "https://pandas.pydata.org/docs/user_guide/index.html"
SAVE_DIR = "indices"

USER_AGENT = "RAG-Indexer/1.0 (+https://example.com/contact)"

CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
REQUEST_TIMEOUT = 25
SLEEP_SECONDS = 0.6
MAX_PAGES = None  # set e.g. 20 to cap the crawl while testing
ALLOWED_NETLOC = urlparse(BASE_URL).netloc
# The prefix must be the user-guide directory, not BASE_URL itself:
# _canonicalize() strips the trailing "index.html", so links would never
# match ".../index.html" as a prefix and the crawl would stop at page one.
ALLOWED_PREFIX = BASE_URL.rsplit("/", 1)[0] + "/"

# NVIDIA NIM models
EMBED_MODEL = "nvidia/nv-embed-v1"
LLM_MODEL = "meta/llama-3.1-8b-instruct"

# Logo (SVG) — left-aligned, no surrounding whitespace. The logo ships
# alongside app.py, so resolve it relative to this file rather than a
# hard-coded absolute path.
LOGO_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "logo.svg")

# ----------------------------
# Utilities
# ----------------------------
def _clean_text_from_html(html: str) -> str:
    """Extract readable text from a docs page, dropping chrome and scripts."""
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "aside"]):
        tag.decompose()
    main = soup.find("div", {"role": "main"}) or soup
    text = main.get_text("\n", strip=True)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text

def _canonicalize(href: str, base: str) -> str:
    """Resolve a link against its page, drop fragments, strip trailing index.html."""
    abs_url = urljoin(base, href)
    abs_url, _ = urldefrag(abs_url)
    if abs_url.endswith("index.html"):
        abs_url = abs_url[: -len("index.html")]
    return abs_url
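
# Illustrative behavior (example values, not from the source):
#   _canonicalize("10min.html#min", BASE_URL)
#       -> "https://pandas.pydata.org/docs/user_guide/10min.html"  (fragment dropped)
#   _canonicalize("advanced/index.html", BASE_URL)
#       -> "https://pandas.pydata.org/docs/user_guide/advanced/"   (index.html stripped)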

def _same_site_internal(url: str) -> bool:
    u = urlparse(url)
    return (u.netloc == ALLOWED_NETLOC) and url.startswith(ALLOWED_PREFIX)

def _is_allowed_by_robots(url: str, rp: robotparser.RobotFileParser) -> bool:
    try:
        return rp.can_fetch(USER_AGENT, url)
    except Exception:
        # If robots.txt could not be checked, fail open.
        return True

def _fetch(url: str) -> Tuple[int, str]:
    resp = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=REQUEST_TIMEOUT)
    return resp.status_code, resp.text

def _svg_data_uri(path: str) -> str | None:
    """Inline the SVG as a data URI so Gradio does not have to serve the file."""
    try:
        with open(path, "rb") as f:
            b64 = base64.b64encode(f.read()).decode("ascii")
        return f"data:image/svg+xml;base64,{b64}"
    except Exception as e:
        logger.warning(f"Logo not found or invalid: {path} ({e})")
        return None

# ----------------------------
# Crawler
# ----------------------------
def crawl_training_manual(start_url: str, max_pages: int | None = None) -> List[Dict]:
    robots_url = urljoin(start_url, "/robots.txt")
    rp = robotparser.RobotFileParser()
    try:
        rp.set_url(robots_url)
        rp.read()
    except Exception:
        pass

    visited: Set[str] = set()
    out: List[Dict] = []
    q: queue.Queue[str] = queue.Queue()
    q.put(start_url)

    # Breadth-first crawl restricted to the allowed host and prefix.
    while not q.empty():
        url = q.get()
        if url in visited:
            continue
        visited.add(url)

        if not _same_site_internal(url):
            continue
        if not _is_allowed_by_robots(url, rp):
            continue

        try:
            status, html = _fetch(url)
        except Exception:
            continue
        if status != 200:
            continue

        soup = BeautifulSoup(html, "html.parser")
        title = soup.title.get_text(strip=True) if soup.title else url
        text = _clean_text_from_html(html)
        if text:
            out.append({"url": url, "title": title, "text": text})

        for a in soup.find_all("a", href=True):
            href = a["href"].strip()
            if href.startswith(("mailto:", "javascript:", "tel:")):
                continue
            abs_url = _canonicalize(href, url)
            if _same_site_internal(abs_url) and abs_url not in visited:
                q.put(abs_url)

        time.sleep(SLEEP_SECONDS)  # politeness delay between requests
        if max_pages and len(out) >= max_pages:
            break

    return out
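
# Quick standalone check (a sketch; crawls the live site, so keep max_pages small):
#   pages = crawl_training_manual(BASE_URL, max_pages=3)
#   for p in pages:
#       print(p["url"], "-", p["title"])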

# ----------------------------
# Indexing
# ----------------------------
def _make_documents(pages: List[Dict]) -> List[Document]:
    splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    docs: List[Document] = []
    for p in pages:
        meta_base = {"source": p["url"], "title": p.get("title", "")}
        chunks = splitter.split_text(p["text"])
        for i, ch in enumerate(chunks):
            meta = dict(meta_base)
            meta["chunk"] = i
            docs.append(Document(page_content=ch, metadata=meta))
    return docs

def build_or_load_index(force_rebuild: bool = False) -> Tuple[FAISS, NVIDIAEmbeddings]:
    os.makedirs(SAVE_DIR, exist_ok=True)
    embeddings = NVIDIAEmbeddings(model=EMBED_MODEL, api_key=os.getenv("NVIDIA_API_KEY"))

    index_path = os.path.join(SAVE_DIR, "index.faiss")
    store_path = os.path.join(SAVE_DIR, "index.pkl")

    if (not force_rebuild) and os.path.exists(index_path) and os.path.exists(store_path):
        db = FAISS.load_local(SAVE_DIR, embeddings, allow_dangerous_deserialization=True)
        return db, embeddings

    pages = crawl_training_manual(BASE_URL, max_pages=MAX_PAGES)
    docs = _make_documents(pages)
    db = FAISS.from_documents(docs, embeddings)
    db.save_local(SAVE_DIR)
    return db, embeddings
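
# Illustrative: force a full re-crawl and re-index after the docs change:
#   db, embeddings = build_or_load_index(force_rebuild=True)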

# ----------------------------
# LLM & RAG
# ----------------------------
def make_llm() -> ChatNVIDIA:
    api_key = os.getenv("NVIDIA_API_KEY")
    if not api_key:
        raise RuntimeError("Set NVIDIA_API_KEY in the environment.")
    return ChatNVIDIA(model=LLM_MODEL, api_key=api_key)

def format_answer(question: str, context_docs: List[Document], llm_text: str) -> str:
    # Deduplicate sources, keeping at most five references.
    seen = set()
    refs = []
    for d in context_docs:
        src = d.metadata.get("source", "")
        title = d.metadata.get("title", "") or src
        key = (title, src)
        if key not in seen:
            seen.add(key)
            refs.append(f"- {title}\n  {src}")
            if len(refs) >= 5:
                break
    refs_block = "\n".join(refs) if refs else "- (no sources found)"
    return f"{llm_text}\n\n---\n**Question:** {question}\n\n**Sources:**\n{refs_block}"

def rag_answer(db: FAISS, llm: ChatNVIDIA, question: str, k: int = 4, max_context_tokens: int = 2800) -> str:
    retriever = db.as_retriever(search_kwargs={"k": k})
    docs = retriever.get_relevant_documents(question)

    # Assemble the context under a budget. Note: despite the parameter name,
    # the limit is counted in characters, not tokens.
    ctx_parts, total = [], 0
    for d in docs:
        txt = d.page_content.strip()
        if total + len(txt) > max_context_tokens:
            txt = txt[: max(0, max_context_tokens - total)]
        ctx_parts.append(txt)
        total += len(txt)
        if total >= max_context_tokens:
            break
    context = "\n\n".join(ctx_parts)

    system_msg = (
        "You are an expert on the pandas package. Answer directly, citing practical steps and commands when useful.\n"
        "If the answer is not clear from the context, be honest about the uncertainty."
    )
    user_prompt = (
        f"Use ONLY the following context to answer. If information is missing, say what is missing.\n\n"
        f"### Context\n{context}\n\n"
        f"### Question\n{question}"
    )

    msg = [("system", system_msg), ("user", user_prompt)]
    llm_text = llm.invoke(msg).content
    return format_answer(question, docs, llm_text)

# ----------------------------
# Gradio UI
# ----------------------------
db_global: FAISS | None = None
llm_global: ChatNVIDIA | None = None

def _init_once(force_rebuild: bool = False):
    global db_global, llm_global
    if db_global is None or force_rebuild:
        db_global, _ = build_or_load_index(force_rebuild=force_rebuild)
    if llm_global is None:
        llm_global = make_llm()

def ui_query(question: str, k: int, force_rebuild: bool):
    try:
        _init_once(force_rebuild)
        return rag_answer(db_global, llm_global, question, k=k)
    except Exception as e:
        return f"Error: {e}"

def build_ui():
    custom_css = """
    .gradio-container { padding: 0 !important; }  /* remove global padding */
    #logo_bar { margin: 0 !important; padding: 0 !important; }  /* logo bar without gaps */
    #logo_bar img { display: block; margin: 0 !important; }  /* image without margins */
    #title_md { margin-top: 0 !important; }  /* title flush with the top */
    """

    with gr.Blocks(title="RAG PANDAS", css=custom_css) as demo:
        # Logo on the left, no surrounding whitespace
        _logo_uri = _svg_data_uri(LOGO_PATH)
        if _logo_uri:
            gr.HTML(
                f'<div id="logo_bar" style="width:100%;display:block;">'
                f'  <img src="{_logo_uri}" alt="logo" style="height:200px;"/>'
                f'</div>'
            )

        gr.Markdown("""
        # PANDAS Manual
        This app crawls the manual and indexes it locally (FAISS).
        """, elem_id="title_md")

        with gr.Row():
            question = gr.Textbox(label="Question", placeholder="E.g.: How do I create a DataFrame?")
        with gr.Row():
            k = gr.Slider(1, 10, value=4, step=1, label="k (number of chunks)")
            rebuild = gr.Checkbox(False, label="Reindex from scratch (force the crawler)")
        btn = gr.Button("Ask")
        output = gr.Markdown()
        btn.click(fn=ui_query, inputs=[question, k, rebuild], outputs=output)

        gr.Markdown("""
        **Tips**
        - The first run can take a while (crawl + indexing). Later runs reuse the saved index.
        - Check *Reindex from scratch* to refresh or rebuild the index.
        """)

    return demo

if __name__ == "__main__":
    demo = build_ui()
    demo.launch()
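
A minimal headless sketch (illustrative, assuming NVIDIA_API_KEY is set and this file is importable as `app`): build or load the index, then ask one question without launching the UI.

    from app import build_or_load_index, make_llm, rag_answer

    db, _ = build_or_load_index()  # reuses the saved index in ./indices when present
    llm = make_llm()               # raises RuntimeError if NVIDIA_API_KEY is unset
    print(rag_answer(db, llm, "How do I create a DataFrame?", k=4))
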
logo.svg ADDED
requirements.txt ADDED
@@ -0,0 +1,8 @@
# requirements.txt
gradio>=4.36,<5
requests>=2.31
beautifulsoup4>=4.12
langchain>=0.2,<0.3
langchain-community>=0.2,<0.3
faiss-cpu>=1.7.4
langchain-nvidia-ai-endpoints>=0.2,<0.3