ANAMARIAMAGALHAES committed on
Commit
cb8830c
·
verified ·
1 Parent(s): a1bb0bf

Upload 12 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/index/index.faiss filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr

from backend import build_qa_chain  # builds the retriever + LLM pipeline

# Lazily-built QA chain shared by all requests.
# NOTE: the original `from app import ask_question` was a self-import of this
# module and could never resolve; the chain is wired up here instead.
_qa_chain = None


def ask_question(question):
    """Answer *question* with the FAISS-backed RetrievalQA chain.

    The chain is constructed on first use so importing this module stays
    cheap (no index/model loading at import time).
    """
    global _qa_chain
    if _qa_chain is None:
        # Index location matches src/build_index.py's INDEX_DIR.
        _qa_chain = build_qa_chain("data/index")
    return _qa_chain.invoke({"query": question})["result"]


# Chat logic
5
def qa_interface(question, chat_history):
    """Gradio callback: answer one question and append it to the chat log.

    Returns a pair (textbox_value, updated_history); the textbox value is
    always "" so the input field is cleared after each submission.
    """
    normalized = question.strip().lower()
    if normalized in ("exit", "quit"):
        # End-of-session sentinel: do not query the model.
        return "", chat_history + [("exit", "Session ended.")]

    try:
        answer = ask_question(question)
    except Exception as e:  # surface failures in the chat instead of crashing the UI
        answer = f"⚠️ Error: {str(e)}"

    chat_history.append((question, answer))
    return "", chat_history
16
+
17
# UI: Blocks layout — title, chat history, then a question row with a button.
# `demo` is kept as the top-level app name (Hugging Face Spaces convention).
with gr.Blocks() as demo:
    gr.Markdown("# Compliance Q&A Assistant")
    gr.Markdown(
        "Ask questions related to compliance documents. This prototype uses a local LLM + vector search."
    )

    chat_window = gr.Chatbot(label="Chat History", height=400)

    with gr.Row():
        question_box = gr.Textbox(
            label="Your Question",
            placeholder="e.g. Has Tesla ever appeared in OFAC sanctions?",
            scale=4,
        )
        ask_button = gr.Button("Ask", scale=1)

    # qa_interface clears the textbox and returns the refreshed history.
    ask_button.click(
        fn=qa_interface,
        inputs=[question_box, chat_window],
        outputs=[question_box, chat_window],
    )

demo.launch()
backend.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.llms import Ollama
2
+ from langchain_community.vectorstores import FAISS
3
+ from langchain_community.embeddings import HuggingFaceEmbeddings
4
+ from langchain.chains import RetrievalQA
5
+ from langchain.prompts import PromptTemplate
6
+
7
# Custom prompt to control model behavior: the model must answer strictly
# from the retrieved context and fall back to a fixed apology sentence
# when the context does not contain the answer.
QA_PROMPT = PromptTemplate(
    # RetrievalQA substitutes the retrieved chunks for {context} and the
    # user's query for {question}.
    input_variables=["context", "question"],
    template="""
You are a helpful compliance assistant.
Answer the question below using only the provided context.
If you don't know the answer based on the context, say:
"I’m sorry, I couldn't find information about that based on the current documents."

Context:
{context}

Question:
{question}
"""
)
23
+
24
def build_qa_chain(index_path: str):
    """Assemble the RetrievalQA pipeline: FAISS retriever + local Ollama LLM.

    Loads the persisted FAISS index from *index_path*, wraps it as a
    retriever, starts the tinyllama model through Ollama, and glues both
    together with the module-level QA_PROMPT.
    """
    print(f"Loading FAISS index from: {index_path}")

    # Must match the embedding model the index was built with
    # (see src/embedder.py).
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-albert-small-v2",
        encode_kwargs={"normalize_embeddings": True},
    )

    vector_store = FAISS.load_local(index_path, embedder, allow_dangerous_deserialization=True)

    print("=== Starting local LLM via Ollama ===")
    local_llm = Ollama(
        model="tinyllama",
        temperature=0.3,
        top_p=0.95,
        repeat_penalty=1.1,
    )

    return RetrievalQA.from_chain_type(
        llm=local_llm,
        retriever=vector_store.as_retriever(),
        return_source_documents=False,
        chain_type_kwargs={"prompt": QA_PROMPT},
    )
51
+
52
def run_qa_app(index_path: str):
    """Run an interactive console loop over the QA chain built from *index_path*.

    Reads questions from stdin until the user types 'exit'/'quit' or closes
    the input stream (Ctrl-D / Ctrl-C).
    """
    qa_chain = build_qa_chain(index_path)

    print("=== System is ready. Ask your compliance questions below ===\n")

    while True:
        try:
            question = input("Your question (or type 'exit'): ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: end cleanly instead of a traceback.
            print("\n=== Session ended ===")
            break

        # strip() so "exit " / " quit" also end the session — consistent
        # with the web UI's handling in app.py.
        if question.strip().lower() in ("exit", "quit"):
            print("=== Session ended ===")
            break

        response = qa_chain.invoke({"query": question})
        print(f"\n=== Answer:\n{response['result']}\n")
data/index/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90f4c68df0cbe4d66cf79af1af5c07475e767c9175bb350b66a4cd7239a385dc
3
+ size 8211501
data/index/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7c1d6b15c0d7bae97a0923da87359e66ae56e4e36120af8ba3aa7a0ba2e1ba7
3
+ size 1656441
src/__pycache__/build_index.cpython-311.pyc ADDED
Binary file (1.43 kB). View file
 
src/__pycache__/embedder.cpython-311.pyc ADDED
Binary file (2.26 kB). View file
 
src/__pycache__/parser.cpython-311.pyc ADDED
Binary file (1.88 kB). View file
 
src/__pycache__/qa.cpython-311.pyc ADDED
Binary file (1.26 kB). View file
 
src/build_index.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from parser import load_documents
3
+ from embedder import embed_and_store
4
+
5
def build_vector_store(docs_path, index_path):
    """Load every supported document under *docs_path* and (re)build the
    FAISS index at *index_path*.

    Thin orchestration layer: parsing lives in parser.load_documents and
    chunking/embedding/persistence in embedder.embed_and_store.
    """
    # Fixed banner spacing ("===Loading" -> "=== Loading ===") to match
    # every other status message in this project.
    print(f"=== Loading documents from {docs_path} ===")
    documents = load_documents(docs_path)
    print(f"=== Loaded {len(documents)} documents ===")

    print("=== Adding new documents to vector store (index) ===")
    embed_and_store(documents, index_path)
    print("=== Index updated successfully. ===")
13
+
14
if __name__ == "__main__":
    # Default layout: raw documents live in data/raw, the FAISS index in
    # data/index.
    build_vector_store(docs_path="data/raw", index_path="data/index")
src/embedder.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.embeddings import HuggingFaceEmbeddings
2
+ from langchain_community.vectorstores import FAISS
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ import os
5
+
6
def embed_and_store(docs, index_path):
    """Chunk *docs*, embed the chunks, and persist a FAISS index at *index_path*."""
    print(" === Splitting documents ===")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=0)
    chunks = text_splitter.split_documents(docs)
    print(f"=== Split into {len(chunks)} chunks ===")

    print(" === Initializing embedding model ===")
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-albert-small-v2",
        encode_kwargs={"normalize_embeddings": True},
    )

    print("=== Creating FAISS index ===")
    # FAISS.from_texts takes parallel lists of raw text and metadata.
    chunk_texts = [chunk.page_content for chunk in chunks]
    chunk_metadatas = [chunk.metadata for chunk in chunks]
    index = FAISS.from_texts(chunk_texts, embedder, metadatas=chunk_metadatas)

    os.makedirs(index_path, exist_ok=True)
    print(f"=== Saving FAISS index to: {index_path} ===")
    index.save_local(index_path)
    print("=== Index saved successfully ===")
src/parser.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import (
2
+ PyPDFLoader,
3
+ TextLoader,
4
+ CSVLoader,
5
+ UnstructuredHTMLLoader
6
+ )
7
+ from pathlib import Path
8
+
9
def load_documents(directory):
    """Recursively load all supported documents under *directory*.

    Supported formats: .pdf, .txt, .csv, .html/.htm; anything else is
    skipped. Returns a flat list of LangChain Document objects.
    """
    docs = []
    path = Path(directory)

    for file in path.rglob("*"):
        # rglob yields directories too; a directory whose name happens to
        # end in a supported suffix must not be handed to a loader.
        if not file.is_file():
            continue

        suffix = file.suffix.lower()  # computed once per file
        if suffix == ".pdf":
            docs.extend(PyPDFLoader(str(file)).load())
        elif suffix == ".txt":
            docs.extend(TextLoader(str(file)).load())
        elif suffix == ".csv":
            docs.extend(CSVLoader(file_path=str(file), encoding='utf-8').load())
        elif suffix in (".html", ".htm"):
            docs.extend(UnstructuredHTMLLoader(str(file)).load())

    return docs
24
+
src/qa.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.chains import RetrievalQA
2
+ from langchain.prompts import PromptTemplate
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
# Refined prompt to control the model's behavior: answer only from the
# retrieved context, with a fixed fallback sentence when the answer is not
# in the documents.
# NOTE(review): this duplicates QA_PROMPT in backend.py — consider keeping
# a single shared definition.
QA_PROMPT = PromptTemplate(
    # RetrievalQA fills {context} with retrieved chunks and {question}
    # with the user's query.
    input_variables=["context", "question"],
    template="""
You are a helpful compliance assistant.
Answer the question below using only the provided context.
If you don't know the answer based on the context, say:
"I’m sorry, I couldn't find information about that based on the current documents."

Context:
{context}

Question:
{question}
"""
)
23
+
24
def ask_question(question, retriever, llm):
    """Answer *question* with a one-shot RetrievalQA chain.

    A fresh chain is assembled per call from the supplied retriever and
    LLM, using the module-level QA_PROMPT; only the answer text is
    returned (no source documents).
    """
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=False,
        chain_type_kwargs={"prompt": QA_PROMPT},
    )
    response = chain.invoke({"query": question})
    return response['result']