Spaces:
Sleeping
Sleeping
Upload 12 files
Browse files- .gitattributes +1 -0
- app.py +36 -0
- backend.py +64 -0
- data/index/index.faiss +3 -0
- data/index/index.pkl +3 -0
- src/__pycache__/build_index.cpython-311.pyc +0 -0
- src/__pycache__/embedder.cpython-311.pyc +0 -0
- src/__pycache__/parser.cpython-311.pyc +0 -0
- src/__pycache__/qa.cpython-311.pyc +0 -0
- src/build_index.py +17 -0
- src/embedder.py +27 -0
- src/parser.py +24 -0
- src/qa.py +32 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data/index/index.faiss filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from app import ask_question # Função já integrada com retriever + LLM
|
| 3 |
+
|
| 4 |
+
# Chat logic
def qa_interface(question, chat_history):
    """Handle a single chat turn for the Gradio UI.

    Args:
        question: Raw text the user typed into the textbox.
        chat_history: List of (question, answer) tuples shown so far.

    Returns:
        tuple: ("", updated history) — the empty string clears the textbox.
    """
    normalized = question.strip().lower()
    # Typing "exit"/"quit" ends the session without querying the model.
    if normalized in ("exit", "quit"):
        return "", chat_history + [("exit", "Session ended.")]

    try:
        answer = ask_question(question)
    except Exception as err:  # surface any backend failure inside the chat
        answer = f"⚠️ Error: {str(err)}"

    chat_history.append((question, answer))
    return "", chat_history
|
| 16 |
+
|
| 17 |
+
# UI: wire the chat handler into a simple Gradio layout.
with gr.Blocks() as demo:
    gr.Markdown("# Compliance Q&A Assistant")
    gr.Markdown(
        "Ask questions related to compliance documents. This prototype uses a local LLM + vector search."
    )

    # Conversation transcript, rendered above the input row.
    history_box = gr.Chatbot(label="Chat History", height=400)

    with gr.Row():
        question_box = gr.Textbox(
            label="Your Question",
            placeholder="e.g. Has Tesla ever appeared in OFAC sanctions?",
            scale=4,
        )
        ask_button = gr.Button("Ask", scale=1)

    # Clicking "Ask" runs qa_interface and refreshes both the textbox
    # (cleared) and the chat transcript.
    ask_button.click(
        fn=qa_interface,
        inputs=[question_box, history_box],
        outputs=[question_box, history_box],
    )

demo.launch()
|
backend.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_community.llms import Ollama
|
| 2 |
+
from langchain_community.vectorstores import FAISS
|
| 3 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 4 |
+
from langchain.chains import RetrievalQA
|
| 5 |
+
from langchain.prompts import PromptTemplate
|
| 6 |
+
|
| 7 |
+
# Custom prompt to control model behavior: restricts answers to the retrieved
# context and fixes a fallback sentence for unanswerable questions.
# {context} and {question} are filled in by the RetrievalQA chain at query time.
QA_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful compliance assistant.
Answer the question below using only the provided context.
If you don't know the answer based on the context, say:
"I’m sorry, I couldn't find information about that based on the current documents."

Context:
{context}

Question:
{question}
"""
)
|
| 23 |
+
|
| 24 |
+
def build_qa_chain(index_path: str):
    """Assemble the retrieval-augmented QA chain.

    Loads the FAISS index from disk, wraps it in a retriever, starts the
    local Ollama LLM, and combines both into a RetrievalQA chain driven by
    QA_PROMPT.

    Args:
        index_path: Directory containing the saved FAISS index.

    Returns:
        A RetrievalQA chain ready to answer queries.
    """
    print(f"Loading FAISS index from: {index_path}")

    # Embeddings must match the model used when the index was built.
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-albert-small-v2",
        encode_kwargs={"normalize_embeddings": True},
    )

    # allow_dangerous_deserialization: the index pickle is our own artifact.
    vector_db = FAISS.load_local(
        index_path, embedder, allow_dangerous_deserialization=True
    )
    doc_retriever = vector_db.as_retriever()

    print("=== Starting local LLM via Ollama ===")
    local_llm = Ollama(
        model="tinyllama",
        temperature=0.3,
        top_p=0.95,
        repeat_penalty=1.1,
    )

    return RetrievalQA.from_chain_type(
        llm=local_llm,
        retriever=doc_retriever,
        return_source_documents=False,
        chain_type_kwargs={"prompt": QA_PROMPT},
    )
|
| 51 |
+
|
| 52 |
+
def run_qa_app(index_path: str):
    """Run an interactive command-line Q&A session.

    Builds the QA chain once, then loops reading questions from stdin until
    the user types "exit" or "quit".

    Args:
        index_path: Directory containing the saved FAISS index.
    """
    qa_chain = build_qa_chain(index_path)

    print("=== System is ready. Ask your compliance questions below ===\n")

    while True:
        question = input("Your question (or type 'exit'): ")
        # strip() so "  exit " still ends the session, matching the exit
        # check used by the Gradio front end (qa_interface in app.py);
        # previously padded input fell through and was sent to the model.
        if question.strip().lower() in ("exit", "quit"):
            print("=== Session ended ===")
            break

        response = qa_chain.invoke({"query": question})
        print(f"\n=== Answer:\n{response['result']}\n")
|
data/index/index.faiss
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90f4c68df0cbe4d66cf79af1af5c07475e767c9175bb350b66a4cd7239a385dc
|
| 3 |
+
size 8211501
|
data/index/index.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d7c1d6b15c0d7bae97a0923da87359e66ae56e4e36120af8ba3aa7a0ba2e1ba7
|
| 3 |
+
size 1656441
|
src/__pycache__/build_index.cpython-311.pyc
ADDED
|
Binary file (1.43 kB). View file
|
|
|
src/__pycache__/embedder.cpython-311.pyc
ADDED
|
Binary file (2.26 kB). View file
|
|
|
src/__pycache__/parser.cpython-311.pyc
ADDED
|
Binary file (1.88 kB). View file
|
|
|
src/__pycache__/qa.cpython-311.pyc
ADDED
|
Binary file (1.26 kB). View file
|
|
|
src/build_index.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from parser import load_documents
|
| 3 |
+
from embedder import embed_and_store
|
| 4 |
+
|
| 5 |
+
def build_vector_store(docs_path, index_path):
    """Load raw documents and (re)build the FAISS vector index.

    Args:
        docs_path: Directory containing the source documents.
        index_path: Directory the FAISS index is written to.
    """
    # Added the missing space after "===" so the log line matches the
    # "=== ... ===" prefix style used throughout the project.
    print(f"=== Loading documents from {docs_path} ===")
    documents = load_documents(docs_path)
    print(f"=== Loaded {len(documents)} documents ===")

    print("=== Adding new documents to vector store (index) ===")
    embed_and_store(documents, index_path)
    print("=== Index updated successfully. ===")
|
| 13 |
+
|
| 14 |
+
if __name__ == "__main__":
    # Default locations: raw documents in, FAISS index out.
    build_vector_store(docs_path="data/raw", index_path="data/index")
|
src/embedder.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 2 |
+
from langchain_community.vectorstores import FAISS
|
| 3 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
def embed_and_store(docs, index_path):
    """Chunk documents, embed them, and save a FAISS index to disk.

    Args:
        docs: LangChain ``Document`` objects to index.
        index_path: Directory the index is saved into (created if missing).
    """
    # Normalized the log prefixes: some originally had a stray leading
    # space (" === ..."), inconsistent with the rest of the project.
    print("=== Splitting documents ===")
    # chunk_overlap=0 keeps chunks disjoint; raise it if answers appear to
    # lose cross-chunk context.
    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=0)
    docs_split = splitter.split_documents(docs)
    print(f"=== Split into {len(docs_split)} chunks ===")

    print("=== Initializing embedding model ===")
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-albert-small-v2",
        encode_kwargs={"normalize_embeddings": True}
    )

    print("=== Creating FAISS index ===")
    # from_documents keeps page_content and metadata paired, replacing the
    # manual texts/metadatas unzip the original did by hand.
    vectorstore = FAISS.from_documents(docs_split, embedding)

    os.makedirs(index_path, exist_ok=True)
    print(f"=== Saving FAISS index to: {index_path} ===")
    vectorstore.save_local(index_path)
    print("=== Index saved successfully ===")
|
src/parser.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_community.document_loaders import (
|
| 2 |
+
PyPDFLoader,
|
| 3 |
+
TextLoader,
|
| 4 |
+
CSVLoader,
|
| 5 |
+
UnstructuredHTMLLoader
|
| 6 |
+
)
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
def load_documents(directory):
    """Recursively load every supported file under ``directory``.

    Supported formats: PDF, plain text, CSV (UTF-8), and HTML; files with
    any other extension are silently skipped.

    Args:
        directory: Root directory to scan recursively.

    Returns:
        list: All loaded LangChain ``Document`` objects, in traversal order.
    """
    docs = []
    for file in Path(directory).rglob("*"):
        # rglob yields directories too; without this guard a directory
        # named e.g. "reports.pdf" would be handed to a loader.
        if not file.is_file():
            continue
        suffix = file.suffix.lower()  # hoisted: computed once per file
        if suffix == ".pdf":
            docs.extend(PyPDFLoader(str(file)).load())
        elif suffix == ".txt":
            docs.extend(TextLoader(str(file)).load())
        elif suffix == ".csv":
            docs.extend(CSVLoader(file_path=str(file), encoding='utf-8').load())
        elif suffix in (".html", ".htm"):
            docs.extend(UnstructuredHTMLLoader(str(file)).load())

    return docs
|
| 24 |
+
|
src/qa.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain.chains import RetrievalQA
|
| 2 |
+
from langchain.prompts import PromptTemplate
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
+
# Load environment variables from a local .env file, if one exists.
load_dotenv()

# Refined prompt to control the model's behavior: restricts answers to the
# retrieved context and fixes a fallback sentence for unanswerable questions.
# {context} and {question} are filled in by the RetrievalQA chain at query time.
QA_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful compliance assistant.
Answer the question below using only the provided context.
If you don't know the answer based on the context, say:
"I’m sorry, I couldn't find information about that based on the current documents."

Context:
{context}

Question:
{question}
"""
)
|
| 23 |
+
|
| 24 |
+
def ask_question(question, retriever, llm):
    """Answer one question via retrieval-augmented generation.

    Args:
        question: The user's natural-language question.
        retriever: Vector-store retriever supplying context documents.
        llm: Language model used to generate the answer.

    Returns:
        str: The model's answer text.
    """
    # NOTE(review): the chain is reassembled on every call; callers issuing
    # many questions may want to build it once and reuse it.
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=False,
        chain_type_kwargs={"prompt": QA_PROMPT},
    )
    return chain.invoke({"query": question})["result"]
|