roundb committed on
Commit
6034b74
·
verified ·
1 Parent(s): 36c6f1c

Upload 5 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ vectorstore_faiss/index.faiss filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
RAG Chatbot – Gradio + FAISS + NVIDIA NIM
Layout with automatic suggestion cards via ChatInterface `examples`.
"""

import os
import glob
from typing import List

import gradio as gr
import pandas as pd
from openai import OpenAI

from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


# =========================
# CONFIG
# =========================
# Directory scanned (recursively) for documents to index.
DATA_DIR = os.getenv("DATA_DIR", "data")

# Sentence-transformers model used to embed chunks.
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
CHUNK_SIZE = 900          # characters per text chunk
CHUNK_OVERLAP = 150       # overlap between consecutive chunks
TOP_K = 6                 # chunks retrieved per question
MAX_CONTEXT_CHARS = 4500  # hard cap on context injected into the prompt

NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY", "")
NVIDIA_BASE_URL = "https://integrate.api.nvidia.com/v1"
NVIDIA_MODEL = "meta/llama-3.3-70b-instruct"

# NVIDIA NIM exposes an OpenAI-compatible endpoint; without a key the
# client stays None and the chat handler reports the missing configuration.
client = OpenAI(base_url=NVIDIA_BASE_URL, api_key=NVIDIA_API_KEY) if NVIDIA_API_KEY else None

# User-facing system prompt (Portuguese) — runtime behavior, do not translate.
SYSTEM_PROMPT = """Você é um assistente que responde perguntas com base em documentos.
Responda SOMENTE com base no CONTEXTO recuperado.
Se não houver evidência suficiente, diga claramente.
Seja objetivo.
"""


# =========================
# READ FILES
# =========================
# File extensions the indexer knows how to read.
SUPPORTED_EXT = {".pdf", ".docx", ".xlsx", ".xls", ".csv", ".txt"}
49
+
50
def list_files(data_dir: str, extensions=None) -> List[str]:
    """Return a sorted, de-duplicated list of supported files under *data_dir*.

    Args:
        data_dir: Root directory searched recursively.
        extensions: Optional iterable of file extensions (e.g. ``{".pdf"}``);
            defaults to the module-level ``SUPPORTED_EXT`` set.

    Returns:
        Sorted list of unique file paths.
    """
    if extensions is None:
        extensions = SUPPORTED_EXT
    files: List[str] = []
    for ext in extensions:
        # "**" with recursive=True also matches files directly in data_dir.
        files.extend(glob.glob(os.path.join(data_dir, f"**/*{ext}"), recursive=True))
    return sorted(set(files))
55
+
56
+
57
def read_txt(path):
    """Read a text file as UTF-8, ignoring undecodable bytes.

    Returns:
        The file contents, or "" if the file cannot be opened/read.
    """
    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    except OSError:
        # Missing/unreadable file: treat as empty so indexing continues.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        return ""
63
+
64
+
65
def read_csv(path):
    """Load a CSV with pandas and re-serialize up to its first 1000 rows as CSV text.

    Returns:
        CSV text (header + at most 1000 rows), or "" when the file is
        missing or unparseable.
    """
    try:
        df = pd.read_csv(path)
        return df.head(1000).to_csv(index=False)
    except Exception:
        # Corrupt/unreadable CSV: skip the file rather than abort indexing.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        return ""
71
+
72
+
73
def read_docx(path):
    """Extract the non-empty paragraph text of a .docx file, newline-joined."""
    # Imported lazily so the module loads even when python-docx is absent
    # and no .docx files are present.
    from docx import Document as DocxDocument

    paragraphs = DocxDocument(path).paragraphs
    texts = (p.text for p in paragraphs)
    return "\n".join(t for t in texts if t.strip())
77
+
78
+
79
def read_pdf(path):
    """Concatenate the extracted text of every PDF page (blank for pages with none)."""
    # Lazy import keeps pypdf optional until a PDF is actually read.
    from pypdf import PdfReader

    pages = PdfReader(path).pages
    return "\n".join((page.extract_text() or "") for page in pages)
83
+
84
+
85
# =========================
# BUILD VECTOR DATABASE
# =========================
def _read_excel(path):
    """Load an Excel workbook (first sheet) and return up to 1000 rows as CSV text; "" on failure."""
    try:
        df = pd.read_excel(path)
        return df.head(1000).to_csv(index=False)
    except Exception:
        return ""


def build_vectordb():
    """Index every supported file under DATA_DIR into an in-memory FAISS store.

    Each file is read with the reader matching its extension, split into
    overlapping character chunks, embedded with a HuggingFace sentence
    transformer, and stored in FAISS with the source path as metadata.

    Returns:
        The populated FAISS vector store.

    Raises:
        FileNotFoundError: if DATA_DIR contains no supported files.
    """
    files = list_files(DATA_DIR)
    if not files:
        raise FileNotFoundError("Nenhum arquivo encontrado na pasta data/")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )

    # Extension -> reader dispatch.
    # BUGFIX: the original routed .xlsx/.xls through read_csv; pandas'
    # read_csv cannot parse Excel binaries, so those files silently became
    # "" and spreadsheets were never indexed. _read_excel fixes that.
    readers = {
        ".txt": read_txt,
        ".csv": read_csv,
        ".xlsx": _read_excel,
        ".xls": _read_excel,
        ".docx": read_docx,
        ".pdf": read_pdf,
    }

    docs = []
    for path in files:
        ext = os.path.splitext(path)[1].lower()
        reader = readers.get(ext)
        text = reader(path) if reader else ""
        for chunk in splitter.split_text(text):
            docs.append(Document(page_content=chunk, metadata={"source": path}))

    embedding = HuggingFaceEmbeddings(model_name=EMB_MODEL)
    return FAISS.from_documents(docs, embedding)
120
+
121
+
122
# Build the FAISS index once at module import; startup fails fast if data/ is empty.
vectordb = build_vectordb()
123
+
124
+
125
# =========================
# SUGGESTIONS (CARDS)
# =========================
# Example prompts that gr.ChatInterface renders as clickable cards.
# These are user-facing strings (Portuguese UI) — not to be translated.
SUGGESTIONS = [
    "Resuma os principais pontos do documento.",
    "Quais procedimentos são descritos?",
    "Liste requisitos ou obrigações mencionadas.",
    "Explique os termos técnicos utilizados.",
    "Há prazos ou datas importantes?",
    "Existe checklist operacional?",
    "Quais seções são mais relevantes?",
    "Há diferenças entre versões?"
]
138
+
139
+
140
# =========================
# RAG FUNCTION
# =========================
def format_context(docs, max_chars=None):
    """Join retrieved documents' text and truncate to a character budget.

    Args:
        docs: Sequence of objects exposing ``.page_content`` (langchain Documents).
        max_chars: Optional character cap; defaults to MAX_CONTEXT_CHARS.

    Returns:
        The concatenated (and possibly truncated) context string.
    """
    if max_chars is None:
        max_chars = MAX_CONTEXT_CHARS
    context = "\n\n".join(d.page_content for d in docs)
    # Hard truncation keeps the prompt within the model's context budget.
    return context[:max_chars]
148
+
149
+
150
def chat_rag_nvidia(message, history):
    """Answer *message* via RAG: retrieve top-k chunks, then query the NVIDIA NIM model.

    *history* is required by gr.ChatInterface's signature but unused —
    every turn is answered from the freshly retrieved context alone.
    """
    if not client:
        return "❌ Configure NVIDIA_API_KEY."

    hits = vectordb.similarity_search(message, k=TOP_K)
    context = format_context(hits)

    user_prompt = f"CONTEXTO:\n{context}\n\nPERGUNTA:\n{message}"
    response = client.chat.completions.create(
        model=NVIDIA_MODEL,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.3,
        max_tokens=800,
    )
    return response.choices[0].message.content
170
+
171
+
172
# =========================
# UI (USING NATIVE EXAMPLES)
# =========================
# Markdown, title and description strings are user-facing (Portuguese) —
# runtime content, kept verbatim.
with gr.Blocks(title="Document RAG Assistant") as demo:

    gr.Markdown("""
    ## 📚 SOGETREL
    Faça perguntas sobre os documentos indexados.
    """)

    gr.ChatInterface(
        fn=chat_rag_nvidia,
        examples=SUGGESTIONS,  # ChatInterface renders these as clickable suggestion cards
        title="Assistant",
        description="Pergunte algo sobre os documentos."
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
gradio
openai
langchain
langchain-community
langchain-huggingface
faiss-cpu
sentence-transformers
pandas
pypdf
python-docx
openpyxl
vectorstore_faiss/_fingerprint.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fingerprint": "53f693c5016866ca",
3
+ "files": [
4
+ "ATELIER_SOGETREL_26-06_25.pdf",
5
+ "CAFF BE-CPA ORANGE.xls",
6
+ "CHELEM + ARTICLES.xlsx",
7
+ "COML 14 DISSIM POI2.docx",
8
+ "CPDET - avenant OI 2025 du 1er avril 2025.pdf",
9
+ "CPFOR - Catalogue Lot 2- 3 - avenant DELTA du 1er juin 2024 avec marques de révision 1.pdf",
10
+ "CQ RACCO.docx",
11
+ "CRIAR DEVIS DEFAC RIP MEGALIS.docx",
12
+ "CRIAÇÃO PAR.docx",
13
+ "CUIVRE.docx",
14
+ "DESSAT ARM.docx",
15
+ "IPON 13-11-2025.docx",
16
+ "IPON DESSAT.docx",
17
+ "MEMO KA.pdf",
18
+ "MOD OP_DT.DICT.pdf",
19
+ "MOD'OP_COML .docx",
20
+ "OPTI BLOCAGE IMB.docx",
21
+ "PAR.docx",
22
+ "PEC_UIO.docx",
23
+ "PMV-1090104-QPR500974-2505606.pdf",
24
+ "PROCESS-DISSIMULATION_10_12_25.pdf",
25
+ "RAMI.docx",
26
+ "THDB_Modop_Resacode.pdf",
27
+ "TPSWAN.docx",
28
+ "[Fascicules et catalogues] CPFOR - Catalogue Lot 2-3 - avenant OI AVRIL 2025.pdf",
29
+ "récap géoref 2 (1).docx"
30
+ ]
31
+ }
vectorstore_faiss/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4addf064543f8d0208ba01c59f2e3b00b949814be043f0dfc9cae4078ce87e8a
3
+ size 4432941
vectorstore_faiss/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27b6bebb4ff59b75699437d47a1d221b383116af7b30c39845356b815c5a2f04
3
+ size 2661417