Spaces:
Sleeping
Sleeping
Upload 12 files
Browse files- .gitattributes +1 -0
- app.py +36 -0
- backend.py +64 -0
- data/index/index.faiss +3 -0
- data/index/index.pkl +3 -0
- src/__pycache__/build_index.cpython-311.pyc +0 -0
- src/__pycache__/embedder.cpython-311.pyc +0 -0
- src/__pycache__/parser.cpython-311.pyc +0 -0
- src/__pycache__/qa.cpython-311.pyc +0 -0
- src/build_index.py +17 -0
- src/embedder.py +27 -0
- src/parser.py +24 -0
- src/qa.py +32 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data/index/index.faiss filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from app import ask_question # Função já integrada com retriever + LLM
|
| 3 |
+
|
| 4 |
+
# Chat logic
def qa_interface(question, chat_history):
    """Handle a single chat turn for the Gradio UI.

    Args:
        question: Raw text the user typed into the textbox.
        chat_history: List of (question, answer) tuples shown so far.

    Returns:
        tuple: ("", updated history) — the empty string clears the textbox.
    """
    normalized = question.strip().lower()
    # Typing "exit"/"quit" ends the session without querying the model.
    if normalized in ("exit", "quit"):
        return "", chat_history + [("exit", "Session ended.")]

    try:
        answer = ask_question(question)
    except Exception as err:  # surface any backend failure inside the chat
        answer = f"⚠️ Error: {str(err)}"

    chat_history.append((question, answer))
    return "", chat_history
|
| 16 |
+
|
| 17 |
+
# UI: wire the chat handler into a simple Gradio layout.
with gr.Blocks() as demo:
    gr.Markdown("# Compliance Q&A Assistant")
    gr.Markdown(
        "Ask questions related to compliance documents. This prototype uses a local LLM + vector search."
    )

    # Conversation transcript, rendered above the input row.
    history_box = gr.Chatbot(label="Chat History", height=400)

    with gr.Row():
        question_box = gr.Textbox(
            label="Your Question",
            placeholder="e.g. Has Tesla ever appeared in OFAC sanctions?",
            scale=4,
        )
        ask_button = gr.Button("Ask", scale=1)

    # Clicking "Ask" runs qa_interface and refreshes both the textbox
    # (cleared) and the chat transcript.
    ask_button.click(
        fn=qa_interface,
        inputs=[question_box, history_box],
        outputs=[question_box, history_box],
    )

demo.launch()
|
backend.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_community.llms import Ollama
|
| 2 |
+
from langchain_community.vectorstores import FAISS
|
| 3 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 4 |
+
from langchain.chains import RetrievalQA
|
| 5 |
+
from langchain.prompts import PromptTemplate
|
| 6 |
+
|
| 7 |
+
# Custom prompt to control model behavior: restricts answers to the retrieved
# context and fixes a fallback sentence for unanswerable questions.
# {context} and {question} are filled in by the RetrievalQA chain at query time.
QA_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful compliance assistant.
Answer the question below using only the provided context.
If you don't know the answer based on the context, say:
"I’m sorry, I couldn't find information about that based on the current documents."

Context:
{context}

Question:
{question}
"""
)
|
| 23 |
+
|
| 24 |
+
def build_qa_chain(index_path: str):
    """Assemble the retrieval-augmented QA chain.

    Loads the FAISS index from disk, wraps it in a retriever, starts the
    local Ollama LLM, and combines both into a RetrievalQA chain driven by
    QA_PROMPT.

    Args:
        index_path: Directory containing the saved FAISS index.

    Returns:
        A RetrievalQA chain ready to answer queries.
    """
    print(f"Loading FAISS index from: {index_path}")

    # Embeddings must match the model used when the index was built.
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-albert-small-v2",
        encode_kwargs={"normalize_embeddings": True},
    )

    # allow_dangerous_deserialization: the index pickle is our own artifact.
    vector_db = FAISS.load_local(
        index_path, embedder, allow_dangerous_deserialization=True
    )
    doc_retriever = vector_db.as_retriever()

    print("=== Starting local LLM via Ollama ===")
    local_llm = Ollama(
        model="tinyllama",
        temperature=0.3,
        top_p=0.95,
        repeat_penalty=1.1,
    )

    return RetrievalQA.from_chain_type(
        llm=local_llm,
        retriever=doc_retriever,
        return_source_documents=False,
        chain_type_kwargs={"prompt": QA_PROMPT},
    )
|
| 51 |
+
|
| 52 |
+
def run_qa_app(index_path: str):
    """Run an interactive command-line Q&A session.

    Builds the QA chain once, then loops reading questions from stdin until
    the user types "exit" or "quit".

    Args:
        index_path: Directory containing the saved FAISS index.
    """
    qa_chain = build_qa_chain(index_path)

    print("=== System is ready. Ask your compliance questions below ===\n")

    while True:
        question = input("Your question (or type 'exit'): ")
        # strip() so "  exit " still ends the session, matching the exit
        # check used by the Gradio front end (qa_interface in app.py);
        # previously padded input fell through and was sent to the model.
        if question.strip().lower() in ("exit", "quit"):
            print("=== Session ended ===")
            break

        response = qa_chain.invoke({"query": question})
        print(f"\n=== Answer:\n{response['result']}\n")
|
data/index/index.faiss
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90f4c68df0cbe4d66cf79af1af5c07475e767c9175bb350b66a4cd7239a385dc
|
| 3 |
+
size 8211501
|
data/index/index.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d7c1d6b15c0d7bae97a0923da87359e66ae56e4e36120af8ba3aa7a0ba2e1ba7
|
| 3 |
+
size 1656441
|
src/__pycache__/build_index.cpython-311.pyc
ADDED
|
Binary file (1.43 kB). View file
|
|
|
src/__pycache__/embedder.cpython-311.pyc
ADDED
|
Binary file (2.26 kB). View file
|
|
|
src/__pycache__/parser.cpython-311.pyc
ADDED
|
Binary file (1.88 kB). View file
|
|
|
src/__pycache__/qa.cpython-311.pyc
ADDED
|
Binary file (1.26 kB). View file
|
|
|
src/build_index.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from parser import load_documents
|
| 3 |
+
from embedder import embed_and_store
|
| 4 |
+
|
| 5 |
+
def build_vector_store(docs_path, index_path):
    """Load raw documents and (re)build the FAISS vector index.

    Args:
        docs_path: Directory containing the source documents.
        index_path: Directory the FAISS index is written to.
    """
    # Added the missing space after "===" so the log line matches the
    # "=== ... ===" prefix style used throughout the project.
    print(f"=== Loading documents from {docs_path} ===")
    documents = load_documents(docs_path)
    print(f"=== Loaded {len(documents)} documents ===")

    print("=== Adding new documents to vector store (index) ===")
    embed_and_store(documents, index_path)
    print("=== Index updated successfully. ===")
|
| 13 |
+
|
| 14 |
+
if __name__ == "__main__":
    # Default locations: raw documents in, FAISS index out.
    build_vector_store(docs_path="data/raw", index_path="data/index")
|
src/embedder.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 2 |
+
from langchain_community.vectorstores import FAISS
|
| 3 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
def embed_and_store(docs, index_path):
    """Chunk documents, embed them, and save a FAISS index to disk.

    Args:
        docs: LangChain ``Document`` objects to index.
        index_path: Directory the index is saved into (created if missing).
    """
    # Normalized the log prefixes: some originally had a stray leading
    # space (" === ..."), inconsistent with the rest of the project.
    print("=== Splitting documents ===")
    # chunk_overlap=0 keeps chunks disjoint; raise it if answers appear to
    # lose cross-chunk context.
    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=0)
    docs_split = splitter.split_documents(docs)
    print(f"=== Split into {len(docs_split)} chunks ===")

    print("=== Initializing embedding model ===")
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-albert-small-v2",
        encode_kwargs={"normalize_embeddings": True}
    )

    print("=== Creating FAISS index ===")
    # from_documents keeps page_content and metadata paired, replacing the
    # manual texts/metadatas unzip the original did by hand.
    vectorstore = FAISS.from_documents(docs_split, embedding)

    os.makedirs(index_path, exist_ok=True)
    print(f"=== Saving FAISS index to: {index_path} ===")
    vectorstore.save_local(index_path)
    print("=== Index saved successfully ===")
|
src/parser.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_community.document_loaders import (
|
| 2 |
+
PyPDFLoader,
|
| 3 |
+
TextLoader,
|
| 4 |
+
CSVLoader,
|
| 5 |
+
UnstructuredHTMLLoader
|
| 6 |
+
)
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
def load_documents(directory):
    """Recursively load every supported file under ``directory``.

    Supported formats: PDF, plain text, CSV (UTF-8), and HTML; files with
    any other extension are silently skipped.

    Args:
        directory: Root directory to scan recursively.

    Returns:
        list: All loaded LangChain ``Document`` objects, in traversal order.
    """
    docs = []
    for file in Path(directory).rglob("*"):
        # rglob yields directories too; without this guard a directory
        # named e.g. "reports.pdf" would be handed to a loader.
        if not file.is_file():
            continue
        suffix = file.suffix.lower()  # hoisted: computed once per file
        if suffix == ".pdf":
            docs.extend(PyPDFLoader(str(file)).load())
        elif suffix == ".txt":
            docs.extend(TextLoader(str(file)).load())
        elif suffix == ".csv":
            docs.extend(CSVLoader(file_path=str(file), encoding='utf-8').load())
        elif suffix in (".html", ".htm"):
            docs.extend(UnstructuredHTMLLoader(str(file)).load())

    return docs
|
| 24 |
+
|
src/qa.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain.chains import RetrievalQA
|
| 2 |
+
from langchain.prompts import PromptTemplate
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
+
# Load environment variables from a local .env file, if one exists.
load_dotenv()

# Refined prompt to control the model's behavior: restricts answers to the
# retrieved context and fixes a fallback sentence for unanswerable questions.
# {context} and {question} are filled in by the RetrievalQA chain at query time.
QA_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful compliance assistant.
Answer the question below using only the provided context.
If you don't know the answer based on the context, say:
"I’m sorry, I couldn't find information about that based on the current documents."

Context:
{context}

Question:
{question}
"""
)
|
| 23 |
+
|
| 24 |
+
def ask_question(question, retriever, llm):
    """Answer one question via retrieval-augmented generation.

    Args:
        question: The user's natural-language question.
        retriever: Vector-store retriever supplying context documents.
        llm: Language model used to generate the answer.

    Returns:
        str: The model's answer text.
    """
    # NOTE(review): the chain is reassembled on every call; callers issuing
    # many questions may want to build it once and reuse it.
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=False,
        chain_type_kwargs={"prompt": QA_PROMPT},
    )
    return chain.invoke({"query": question})["result"]
|