Spaces:
Sleeping
Sleeping
Add files for RAG backend
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- Dockerfile +12 -0
- LICENSE +21 -0
- README.md +2 -12
- api/__init__.py +0 -0
- api/__pycache__/__init__.cpython-313.pyc +0 -0
- api/__pycache__/main.cpython-313.pyc +0 -0
- api/__pycache__/rag_chain.cpython-313.pyc +0 -0
- api/__pycache__/schemas.cpython-313.pyc +0 -0
- api/main.py +57 -0
- api/rag_chain.py +108 -0
- api/schemas.py +14 -0
- config/base.yaml +43 -0
- config/dev.yaml +0 -0
- config/prod.yaml +0 -0
- data/embeddings/batch_000.npy +3 -0
- data/raw/base_treinamento.txt +0 -0
- data/vector_store_faiss/index.faiss +3 -0
- data/vector_store_faiss/index.pkl +3 -0
- notebooks/demo_embedder.py +24 -0
- notebooks/demo_evaluator.py +3 -0
- notebooks/demo_generator.py +23 -0
- notebooks/demo_ingestion.py +21 -0
- notebooks/demo_reranker.py +35 -0
- notebooks/demo_retriever.py +14 -0
- notebooks/demo_vector_store.py +20 -0
- pytest.ini +6 -0
- requiriments.txt +21 -0
- src/__pycache__/__init__.cpython-313.pyc +0 -0
- src/__pycache__/download.cpython-313.pyc +0 -0
- src/__pycache__/gerar_chunks.cpython-313.pyc +0 -0
- src/evaluation/benchmarks.py +0 -0
- src/evaluation/evaluator.py +70 -0
- src/evaluation/metrics.py +38 -0
- src/generation/llm_client.py +37 -0
- src/generation/prompt_templates.py +44 -0
- src/generation/response_generator.py +23 -0
- src/ingestion/document_loader.py +20 -0
- src/ingestion/embedder.py +63 -0
- src/ingestion/preprocessor.py +17 -0
- src/ingestion/text_splitter.py +46 -0
- src/retrieval/__pycache__/reranker.cpython-313.pyc +0 -0
- src/retrieval/__pycache__/retriever.cpython-313.pyc +0 -0
- src/retrieval/__pycache__/vector_store.cpython-313.pyc +0 -0
- src/retrieval/query_processor.py +0 -0
- src/retrieval/reranker.py +57 -0
- src/retrieval/retriever.py +23 -0
- src/retrieval/vector_store.py +20 -0
- src/utils/env.py +5 -0
- src/utils/io.py +3 -0
.gitattributes
CHANGED
|
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
RAG_HelpDesk/data/vector_store_faiss/index.faiss filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
RAG_HelpDesk/data/vector_store_faiss/index.faiss filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
data/vector_store_faiss/index.faiss filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
COPY requirements.txt .
|
| 6 |
+
|
| 7 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
| 8 |
+
pip install --no-cache-dir -r requirements.txt
|
| 9 |
+
|
| 10 |
+
COPY . .
|
| 11 |
+
|
| 12 |
+
CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Vinicius Moreira
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
CHANGED
|
@@ -1,12 +1,2 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
emoji: 🚀
|
| 4 |
-
colorFrom: red
|
| 5 |
-
colorTo: indigo
|
| 6 |
-
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
-
license: mit
|
| 9 |
-
short_description: RAG with langchain and fastAPI for to answer technical IT q/
|
| 10 |
-
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
+
# RAG_HelpDesk
|
| 2 |
+
Esse projeto visa criar um RAG para ajudar usuários em relação a duvidas de Hardware e Software.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
api/__init__.py
ADDED
|
File without changes
|
api/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (163 Bytes). View file
|
|
|
api/__pycache__/main.cpython-313.pyc
ADDED
|
Binary file (2.88 kB). View file
|
|
|
api/__pycache__/rag_chain.cpython-313.pyc
ADDED
|
Binary file (5.39 kB). View file
|
|
|
api/__pycache__/schemas.cpython-313.pyc
ADDED
|
Binary file (1.34 kB). View file
|
|
|
api/main.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import traceback
|
| 4 |
+
|
| 5 |
+
project_root = Path(__file__).resolve().parents[1]
|
| 6 |
+
sys.path.append(str(project_root))
|
| 7 |
+
|
| 8 |
+
from fastapi import FastAPI, HTTPException
|
| 9 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 10 |
+
from api.schemas import QueryRequest, QueryResponse, SourceChunk
|
| 11 |
+
from api.rag_chain import get_rag_chain
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
app = FastAPI(
|
| 15 |
+
title="Helpdesk RAG API",
|
| 16 |
+
description="API for answering questions about an IT knowledge base",
|
| 17 |
+
version="1.0.0"
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
try:
|
| 21 |
+
rag_chain = get_rag_chain()
|
| 22 |
+
except Exception as e:
|
| 23 |
+
print("error to load pipeline RAG")
|
| 24 |
+
traceback.print_exc()
|
| 25 |
+
raise RuntimeError(f"error to load pipeline RAG: {e}")
|
| 26 |
+
|
| 27 |
+
app.add_middleware(
|
| 28 |
+
CORSMiddleware,
|
| 29 |
+
allow_origins=["*"],
|
| 30 |
+
allow_credentials=True,
|
| 31 |
+
allow_methods=["GET", "POST"],
|
| 32 |
+
allow_headers=["*"],
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
@app.get("/", tags=["Status"])
|
| 36 |
+
def read_root():
|
| 37 |
+
return {"status": "API ON"}
|
| 38 |
+
|
| 39 |
+
@app.post("/query", response_model=QueryResponse, tags=["RAG"])
|
| 40 |
+
async def handle_query(request: QueryRequest):
|
| 41 |
+
print(f"processing query: '{request.query}'")
|
| 42 |
+
try:
|
| 43 |
+
result = rag_chain.invoke(request.query)
|
| 44 |
+
source_chunks = [
|
| 45 |
+
SourceChunk(
|
| 46 |
+
page_content=doc.page_content,
|
| 47 |
+
source=doc.metadata.get('source', 'desconhecida')
|
| 48 |
+
) for doc in result['source_chunks']
|
| 49 |
+
]
|
| 50 |
+
return QueryResponse(
|
| 51 |
+
answer=result['answer'],
|
| 52 |
+
source_chunks=source_chunks
|
| 53 |
+
)
|
| 54 |
+
except Exception as e:
|
| 55 |
+
print(f"error to process query")
|
| 56 |
+
traceback.print_exc()
|
| 57 |
+
raise HTTPException(status_code=500, detail=f"internal error. check the server console")
|
api/rag_chain.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from operator import itemgetter
|
| 5 |
+
from typing import List, Dict
|
| 6 |
+
|
| 7 |
+
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
|
| 8 |
+
from langchain_core.prompts import PromptTemplate
|
| 9 |
+
from langchain_community.vectorstores import FAISS
|
| 10 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 11 |
+
from huggingface_hub import InferenceClient
|
| 12 |
+
|
| 13 |
+
load_dotenv()
|
| 14 |
+
|
| 15 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 16 |
+
VECTOR_STORE_PATH = str(PROJECT_ROOT / "data" / "vector_store_faiss")
|
| 17 |
+
EMBEDDING_MODEL_NAME = "BAAI/bge-m3"
|
| 18 |
+
LLM_REPO_ID = os.getenv("HUGGINGFACE_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")
|
| 19 |
+
|
| 20 |
+
HF_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
|
| 21 |
+
|
| 22 |
+
if not HF_TOKEN:
|
| 23 |
+
raise ValueError("HUGGINGFACE_API_TOKEN not found")
|
| 24 |
+
|
| 25 |
+
client = InferenceClient(model=LLM_REPO_ID, token=HF_TOKEN)
|
| 26 |
+
|
| 27 |
+
prompt_template = PromptTemplate.from_template("""
|
| 28 |
+
<|system|>
|
| 29 |
+
Você é um assistente de helpdesk de TI especialista... (seu prompt aqui)
|
| 30 |
+
</s><|user|>
|
| 31 |
+
Contexto: {context}\n\nPergunta: {query}
|
| 32 |
+
</s><|assistant|>
|
| 33 |
+
Resposta em Português:
|
| 34 |
+
""")
|
| 35 |
+
|
| 36 |
+
def format_docs(docs: List[Dict]) -> str:
    """Join the page_content of each retrieved document with blank-line separators."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)
|
| 38 |
+
|
| 39 |
+
def generate_answer_from_context(input_dict: Dict) -> str:
    """Render the RAG prompt and query the Hugging Face chat model for an answer.

    Args:
        input_dict: Mapping with "context" (list of retrieved documents exposing
            ``page_content``) and "query" (the user question string).

    Returns:
        The model's answer as plain text, or an error message string if the
        API call fails (best-effort behavior preserved from the original).
    """
    context_docs = input_dict["context"]
    query_text = input_dict["query"]
    formatted_context = format_docs(context_docs)

    prompt_value = prompt_template.invoke({
        "context": formatted_context,
        "query": query_text
    })
    # Fix: str(prompt_value) returns the repr form "text='...'", which the code
    # below then had to strip heuristically. to_string() yields the raw prompt.
    final_prompt_text = prompt_value.to_string()

    try:
        response = client.chat_completion(
            messages=[{"role": "user", "content": final_prompt_text}],
            max_tokens=300,
            temperature=0.1
        )
        raw_answer = response.choices[0].message.content
        clean_answer = raw_answer.strip()

        # Defensive cleanup kept for models that echo a repr-style wrapper
        # or a leading "Resposta:" label in their output.
        if clean_answer.startswith('text="') or clean_answer.startswith("text='"):
            clean_answer = clean_answer[6:]
        if clean_answer.endswith('"') or clean_answer.endswith("'"):
            clean_answer = clean_answer[:-1]
        if clean_answer.startswith("Resposta:"):
            clean_answer = clean_answer.split("Resposta:", 1)[1]

        return clean_answer.strip()

    except Exception as e:
        print(f"error for call API huggingface: {e}")
        return f"error for contact llm: {e}"
|
| 77 |
+
|
| 78 |
+
def get_rag_chain():
|
| 79 |
+
print("loading pipeline")
|
| 80 |
+
|
| 81 |
+
embeddings_model = HuggingFaceEmbeddings(
|
| 82 |
+
model_name=EMBEDDING_MODEL_NAME,
|
| 83 |
+
model_kwargs={'device': 'cpu'},
|
| 84 |
+
encode_kwargs={'normalize_embeddings': True}
|
| 85 |
+
)
|
| 86 |
+
vector_store = FAISS.load_local(
|
| 87 |
+
VECTOR_STORE_PATH, embeddings_model, allow_dangerous_deserialization=True
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
try:
|
| 91 |
+
from src.retrieval.reranker import HybridReranker
|
| 92 |
+
hybrid_reranker = HybridReranker(vector_store=vector_store)
|
| 93 |
+
retrieval_chain = lambda query: hybrid_reranker.retrieve_and_rerank(query)
|
| 94 |
+
print("using pipeline with reranker")
|
| 95 |
+
except ImportError:
|
| 96 |
+
retrieval_chain = vector_store.as_retriever(search_kwargs={"k": 5})
|
| 97 |
+
print("reranker not found, using simple retriever")
|
| 98 |
+
|
| 99 |
+
rag_chain = {
|
| 100 |
+
"context": retrieval_chain,
|
| 101 |
+
"query": RunnablePassthrough()
|
| 102 |
+
} | RunnableParallel({
|
| 103 |
+
"source_chunks": itemgetter("context"),
|
| 104 |
+
"answer": generate_answer_from_context
|
| 105 |
+
})
|
| 106 |
+
|
| 107 |
+
print("pipeline ready.")
|
| 108 |
+
return rag_chain
|
api/schemas.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
from typing import List, Optional
|
| 3 |
+
|
| 4 |
+
class QueryRequest(BaseModel):
|
| 5 |
+
query: str = Field(..., description="query for RAG")
|
| 6 |
+
top_k: int = Field(3, description="number of relevants docs to be retrivied", ge=1, le=10)
|
| 7 |
+
|
| 8 |
+
class SourceChunk(BaseModel):
|
| 9 |
+
page_content: str
|
| 10 |
+
source: str = Field(description="file path")
|
| 11 |
+
|
| 12 |
+
class QueryResponse(BaseModel):
|
| 13 |
+
answer: str
|
| 14 |
+
source_chunks: List[SourceChunk]
|
config/base.yaml
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
raw_path: "data/raw"
|
| 3 |
+
processed_path: "data/processed"
|
| 4 |
+
embeddings_path: "data/embeddings"
|
| 5 |
+
|
| 6 |
+
ingestion:
|
| 7 |
+
chunk_size: 500
|
| 8 |
+
chunk_overlap: 50
|
| 9 |
+
languages: ["pt", "en"]
|
| 10 |
+
|
| 11 |
+
embedder:
|
| 12 |
+
model_name: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
|
| 13 |
+
batch_size: 64
|
| 14 |
+
normalize: true
|
| 15 |
+
|
| 16 |
+
vector_store:
|
| 17 |
+
type: "faiss"
|
| 18 |
+
index_factory: "Flat"
|
| 19 |
+
metric: "L2"
|
| 20 |
+
save_index_path: "data/embeddings/faiss_index.idx"
|
| 21 |
+
|
| 22 |
+
llm:
|
| 23 |
+
provider: "openai"
|
| 24 |
+
model_name: "gpt-3.5-turbo"
|
| 25 |
+
max_tokens: 512
|
| 26 |
+
temperature: 0.7
|
| 27 |
+
top_p: 0.9
|
| 28 |
+
api_key_env: "OPENAI_API_KEY"
|
| 29 |
+
|
| 30 |
+
api:
|
| 31 |
+
host: "0.0.0.0"
|
| 32 |
+
port: 8000
|
| 33 |
+
docs_url: "/docs"
|
| 34 |
+
|
| 35 |
+
logging:
|
| 36 |
+
level: "INFO"
|
| 37 |
+
format: "[%(asctime)s] %(levelname)s %(name)s: %(message)s"
|
| 38 |
+
|
| 39 |
+
evaluation:
|
| 40 |
+
top_k: 5
|
| 41 |
+
benchmark_path: "tests/benchmark.json"
|
| 42 |
+
save_reports: true
|
| 43 |
+
reports_path: "reports/"
|
config/dev.yaml
ADDED
|
File without changes
|
config/prod.yaml
ADDED
|
File without changes
|
data/embeddings/batch_000.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b0234624565756c111fcd23e4090f7f08255567e57fad9a01cde641f862f4c93
|
| 3 |
+
size 76800128
|
data/raw/base_treinamento.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/vector_store_faiss/index.faiss
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:da3c437c49b2aab9bcff75a1553f3ea1cd17212b70c66cdc3fcebbab1bbf1a7c
|
| 3 |
+
size 43315245
|
data/vector_store_faiss/index.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ac470dbf2aa92c9fe996c3a0df6ca4e4435afb806b06f6e8e877def38323393f
|
| 3 |
+
size 10958179
|
notebooks/demo_embedder.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.ingestion.document_loader import load_documents_from_dir
|
| 2 |
+
from src.ingestion.preprocessor import preprocess_documents
|
| 3 |
+
from src.ingestion.text_splitter import split_text
|
| 4 |
+
from src.ingestion.embedder import load_embedder, generate_embeddings, save_embeddings, get_chunk_texts
|
| 5 |
+
|
| 6 |
+
CHUNK_SIZE = 300
|
| 7 |
+
CHUNK_OVERLAP = 50
|
| 8 |
+
RAW_PATH = "data/raw"
|
| 9 |
+
EMBEDDING_PATH = "data/embeddings/batch_000.npy"
|
| 10 |
+
|
| 11 |
+
docs = load_documents_from_dir(RAW_PATH)
|
| 12 |
+
clean_docs = preprocess_documents(docs)
|
| 13 |
+
chunks = split_text(clean_docs, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
|
| 14 |
+
|
| 15 |
+
texts = get_chunk_texts(chunks)
|
| 16 |
+
|
| 17 |
+
model = load_embedder()
|
| 18 |
+
embeddings = generate_embeddings(texts, model)
|
| 19 |
+
|
| 20 |
+
print(f"embeddings shape: {embeddings.shape}")
|
| 21 |
+
print(f"vector example (1º):\n{embeddings[0][:10]}...")
|
| 22 |
+
|
| 23 |
+
save_embeddings(embeddings, EMBEDDING_PATH)
|
| 24 |
+
print(f"save embeddings in: {EMBEDDING_PATH}")
|
notebooks/demo_evaluator.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.evaluation.evaluator import run_evaluation
|
| 2 |
+
|
| 3 |
+
run_evaluation()
|
notebooks/demo_generator.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from src.retrieval.retriever import Retriever
|
| 3 |
+
from src.generation.response_generator import generate_answer
|
| 4 |
+
|
| 5 |
+
retriever = Retriever("data/embeddings/batch_000.npy")
|
| 6 |
+
query = "Meu notebook está muito lento, o que posso fazer? E se não resolver?"
|
| 7 |
+
|
| 8 |
+
idxs, scores = retriever.retrieve(query, top_k=1)
|
| 9 |
+
|
| 10 |
+
idxs = np.atleast_1d(idxs).flatten()
|
| 11 |
+
scores = np.atleast_1d(scores).flatten()
|
| 12 |
+
|
| 13 |
+
with open("data/raw/base_treinamento.txt", encoding="utf-8") as f:
|
| 14 |
+
chunks = [b.strip() for b in f.read().split("\n\n") if b.strip()]
|
| 15 |
+
|
| 16 |
+
context = chunks[int(idxs[0])]
|
| 17 |
+
|
| 18 |
+
answer = generate_answer(query, context)
|
| 19 |
+
|
| 20 |
+
print(f"\nquery: {query}")
|
| 21 |
+
print(f"context selected (chunk #{idxs[0]}):\n{context}\n")
|
| 22 |
+
print("generated response:\n")
|
| 23 |
+
print(answer)
|
notebooks/demo_ingestion.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.ingestion.document_loader import load_documents_from_dir
|
| 2 |
+
from src.ingestion.preprocessor import preprocess_documents
|
| 3 |
+
from src.ingestion.text_splitter import split_text
|
| 4 |
+
|
| 5 |
+
RAW_PATH = "data/raw"
|
| 6 |
+
CHUNK_SIZE = 300
|
| 7 |
+
CHUNK_OVERLAP = 50
|
| 8 |
+
|
| 9 |
+
docs = load_documents_from_dir(RAW_PATH)
|
| 10 |
+
print(f"docs loaded: {len(docs)}")
|
| 11 |
+
|
| 12 |
+
cleaned_docs = preprocess_documents(docs)
|
| 13 |
+
print(f"pre-process completed")
|
| 14 |
+
|
| 15 |
+
chunks = split_text(cleaned_docs, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
|
| 16 |
+
print(f"total chunks generated: {len(chunks)}")
|
| 17 |
+
|
| 18 |
+
for i, chunk in enumerate(chunks[:3]):
|
| 19 |
+
print(f"\n--- Chunk {i} ---")
|
| 20 |
+
print(f"font: {chunk['source']}")
|
| 21 |
+
print(chunk['content'][:300])
|
notebooks/demo_reranker.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from src.retrieval.retriever import Retriever
|
| 3 |
+
from src.retrieval.reranker import HybridReranker
|
| 4 |
+
|
| 5 |
+
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
|
| 6 |
+
|
| 7 |
+
def main():
|
| 8 |
+
# Carrega os textos (chunks)
|
| 9 |
+
with open("data/raw/base_treinamento.txt", encoding="utf-8") as f:
|
| 10 |
+
chunks = [b.strip() for b in f.read().split("\n\n") if b.strip()]
|
| 11 |
+
|
| 12 |
+
# Cria o retriever com os embeddings pré-calculados
|
| 13 |
+
retriever = Retriever("data/embeddings/batch_000.npy")
|
| 14 |
+
|
| 15 |
+
# Cria o reranker, passando o retriever e os chunks para reranking
|
| 16 |
+
hybrid = HybridReranker(
|
| 17 |
+
retriever=retriever,
|
| 18 |
+
chunk_texts=chunks,
|
| 19 |
+
reranker_model=RERANKER_MODEL,
|
| 20 |
+
sparse_alpha=0.5,
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
query = "Meu notebook está muito lento, o que posso fazer? E se não resolver?"
|
| 24 |
+
|
| 25 |
+
# Recupera e reranqueia os top documentos
|
| 26 |
+
idxs, scores = hybrid.retrieve_and_rerank(query, top_k_dense=10, top_k_final=3)
|
| 27 |
+
|
| 28 |
+
print(f"\nQuery: {query}\n")
|
| 29 |
+
for i, idx in enumerate(idxs):
|
| 30 |
+
print(f"Rank {i+1} - Chunk #{idx} (score: {scores[i]:.4f}):")
|
| 31 |
+
print(chunks[idx])
|
| 32 |
+
print("-" * 40)
|
| 33 |
+
|
| 34 |
+
if __name__ == "__main__":
|
| 35 |
+
main()
|
notebooks/demo_retriever.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from src.retrieval.retriever import Retriever
|
| 3 |
+
|
| 4 |
+
retriever = Retriever("data/embeddings/batch_000.npy")
|
| 5 |
+
|
| 6 |
+
query = "O que fazer se o notebook não liga?"
|
| 7 |
+
idxs, scores = retriever.retrieve(query, top_k=5)
|
| 8 |
+
|
| 9 |
+
idxs = np.atleast_1d(idxs).flatten()
|
| 10 |
+
scores = np.atleast_1d(scores).flatten()
|
| 11 |
+
|
| 12 |
+
print(f"\ntop results for: {query}\n")
|
| 13 |
+
for i, (idx, score) in enumerate(zip(idxs, scores), 1):
|
| 14 |
+
print(f"{i}. idx: {int(idx):4d} — similarity: {score:.4f}")
|
notebooks/demo_vector_store.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from src.retrieval.vector_store import VectorStore
|
| 3 |
+
|
| 4 |
+
embeds = np.load("data/embeddings/batch_000.npy")
|
| 5 |
+
dim = embeds.shape[1]
|
| 6 |
+
print(f"embeddings loaded: {embeds.shape}")
|
| 7 |
+
|
| 8 |
+
vs = VectorStore(dim=dim)
|
| 9 |
+
vs.add(embeds)
|
| 10 |
+
print("vectors added in index")
|
| 11 |
+
|
| 12 |
+
query = embeds[0]
|
| 13 |
+
dists, idxs = vs.search(query, top_k=5)
|
| 14 |
+
|
| 15 |
+
print("\nresult search (dummy):")
|
| 16 |
+
for i, (idx, dist) in enumerate(zip(idxs, dists), 1):
|
| 17 |
+
print(f"{i}. idx: {idx:4d} — distância: {dist:.4f}")
|
| 18 |
+
|
| 19 |
+
print("Norma do primeiro vetor:", np.linalg.norm(embeds[0]))
|
| 20 |
+
print("Norma do segundo vetor:", np.linalg.norm(embeds[1]))
|
pytest.ini
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[pytest]
|
| 2 |
+
minversion = 6.0
|
| 3 |
+
addopts = -ra -q
|
| 4 |
+
testpaths =
|
| 5 |
+
tests
|
| 6 |
+
python_paths = src
|
requiriments.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
sentence-transformers>=2.2.2
|
| 2 |
+
faiss-cpu>=1.7.3
|
| 3 |
+
openai>=0.27.0
|
| 4 |
+
python-docx>=0.8.11
|
| 5 |
+
PyPDF2>=3.0.0
|
| 6 |
+
beautifulsoup4>=4.12.2
|
| 7 |
+
PyYAML>=6.0
|
| 8 |
+
uvicorn[standard]>=0.22.0
|
| 9 |
+
pytest>=7.0.1
|
| 10 |
+
python-dotenv>=1.0.0
|
| 11 |
+
nltk>=3.8.1
|
| 12 |
+
pytest
|
| 13 |
+
unstructured
|
| 14 |
+
torch
|
| 15 |
+
fastapi
|
| 16 |
+
uvicorn[standard]
|
| 17 |
+
langchain
|
| 18 |
+
langchain-huggingface
|
| 19 |
+
pydantic
|
| 20 |
+
langchain_community
|
| 21 |
+
streamlit
|
src/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (163 Bytes). View file
|
|
|
src/__pycache__/download.cpython-313.pyc
ADDED
|
Binary file (384 Bytes). View file
|
|
|
src/__pycache__/gerar_chunks.cpython-313.pyc
ADDED
|
Binary file (2.59 kB). View file
|
|
|
src/evaluation/benchmarks.py
ADDED
|
File without changes
|
src/evaluation/evaluator.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import numpy as np
|
| 3 |
+
from src.retrieval.retriever import Retriever
|
| 4 |
+
from src.retrieval.reranker import HybridReranker
|
| 5 |
+
from src.evaluation.metrics import (
|
| 6 |
+
precision_at_k as retrieval_precision_at_k,
|
| 7 |
+
recall_at_k,
|
| 8 |
+
mean_reciprocal_rank,
|
| 9 |
+
bleu_score
|
| 10 |
+
)
|
| 11 |
+
from src.ingestion.document_loader import load_documents_from_dir
|
| 12 |
+
from src.ingestion.preprocessor import preprocess_documents
|
| 13 |
+
from src.ingestion.text_splitter import split_text
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def run_evaluation(
|
| 17 |
+
benchmark_path: str = "tests/benchmark.json",
|
| 18 |
+
k: int = 3,
|
| 19 |
+
top_k_dense: int = 10,
|
| 20 |
+
top_k_final: int = 3,
|
| 21 |
+
sparse_alpha: float = 0.5
|
| 22 |
+
):
|
| 23 |
+
with open(benchmark_path, encoding="utf-8") as f:
|
| 24 |
+
benchmarks = json.load(f)
|
| 25 |
+
|
| 26 |
+
docs = load_documents_from_dir("data/raw")
|
| 27 |
+
clean_docs = preprocess_documents(docs)
|
| 28 |
+
chunks = split_text(clean_docs, chunk_size=300, chunk_overlap=50)
|
| 29 |
+
texts = [chunk['content'] for chunk in chunks]
|
| 30 |
+
|
| 31 |
+
retriever = Retriever("data/embeddings/batch_000.npy")
|
| 32 |
+
reranker = HybridReranker(
|
| 33 |
+
retriever=retriever,
|
| 34 |
+
chunk_texts=texts,
|
| 35 |
+
reranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
|
| 36 |
+
sparse_alpha=sparse_alpha
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
all_retrieved = []
|
| 40 |
+
all_relevant = []
|
| 41 |
+
|
| 42 |
+
print(f"starting assessment reranker: Precision@{k}, Recall@{k}, MRR")
|
| 43 |
+
print(f"dense top_k: {top_k_dense}, final top_k: {top_k_final}, sparse_alpha: {sparse_alpha}\n")
|
| 44 |
+
|
| 45 |
+
for i, entry in enumerate(benchmarks, 1):
|
| 46 |
+
query = entry['query']
|
| 47 |
+
relevant_idxs = entry.get('relevant_idxs', [])
|
| 48 |
+
|
| 49 |
+
idxs, scores = reranker.retrieve_and_rerank(
|
| 50 |
+
query,
|
| 51 |
+
top_k_dense=top_k_dense,
|
| 52 |
+
top_k_final=top_k_final
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
p = retrieval_precision_at_k(retrieved_idxs=idxs, relevant_idxs=relevant_idxs, k=k)
|
| 56 |
+
r = recall_at_k(retrieved_idxs=idxs, relevant_idxs=relevant_idxs, k=k)
|
| 57 |
+
|
| 58 |
+
all_retrieved.append(idxs)
|
| 59 |
+
all_relevant.append(relevant_idxs)
|
| 60 |
+
|
| 61 |
+
print(f"{i}. Query: {query}")
|
| 62 |
+
print(f" Precision@{k}: {p:.2f}, Recall@{k}: {r:.2f}")
|
| 63 |
+
print(f" Retrieved idxs: {idxs}")
|
| 64 |
+
print(f" Rerank scores: {[f'{s:.4f}' for s in scores]}\n")
|
| 65 |
+
|
| 66 |
+
mrr = mean_reciprocal_rank(retrieved_lists=all_retrieved, relevant_idxs_list=all_relevant)
|
| 67 |
+
print(f"mean reciprocal rank (MRR): {mrr:.2f}\n")
|
| 68 |
+
|
| 69 |
+
if __name__ == "__main__":
|
| 70 |
+
run_evaluation()
|
src/evaluation/metrics.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from typing import List
|
| 3 |
+
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def precision_at_k(retrieved_idxs: List[int], relevant_idxs: List[int], k: int) -> float:
    """Fraction of the top-k retrieved indices that are relevant.

    Args:
        retrieved_idxs: Ranked list of retrieved chunk indices (best first).
        relevant_idxs: Ground-truth relevant chunk indices.
        k: Cutoff rank; a non-positive k yields 0.0.

    Returns:
        Precision@k in [0.0, 1.0]; the denominator is k even when fewer
        than k results were retrieved (standard definition, preserved).
    """
    if k <= 0:
        return 0.0
    relevant = set(relevant_idxs)  # O(1) membership instead of O(n) list scans
    hits = sum(1 for idx in retrieved_idxs[:k] if idx in relevant)
    return hits / k
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def recall_at_k(retrieved_idxs: List[int], relevant_idxs: List[int], k: int) -> float:
    """Fraction of the relevant indices found within the top-k retrieved.

    Args:
        retrieved_idxs: Ranked list of retrieved chunk indices (best first).
        relevant_idxs: Ground-truth relevant chunk indices; empty list yields 0.0.
        k: Cutoff rank.

    Returns:
        Recall@k in [0.0, 1.0].
    """
    if not relevant_idxs:
        return 0.0
    relevant = set(relevant_idxs)  # O(1) membership instead of O(n) list scans
    hits = sum(1 for idx in retrieved_idxs[:k] if idx in relevant)
    return hits / len(relevant_idxs)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def mean_reciprocal_rank(retrieved_lists: List[List[int]], relevant_idxs_list: List[List[int]]) -> float:
    """Average reciprocal rank of the first relevant hit across queries.

    For each (retrieved, relevant) pair, the score is 1/rank of the first
    retrieved index that is relevant, or 0.0 when none is. Returns 0.0 for
    empty input.
    """
    reciprocal_ranks = []
    for ranked, relevant in zip(retrieved_lists, relevant_idxs_list):
        first_hit = next(
            (pos for pos, idx in enumerate(ranked, start=1) if idx in relevant),
            None,
        )
        reciprocal_ranks.append(1.0 / first_hit if first_hit is not None else 0.0)
    return float(np.mean(reciprocal_ranks)) if reciprocal_ranks else 0.0
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def bleu_score(reference: str, candidate: str) -> float:
    """Smoothed 4-gram BLEU between two whitespace-tokenized strings.

    Uses NLTK's method4 smoothing with uniform (0.25, 0.25, 0.25, 0.25)
    n-gram weights, so short candidates still get a non-zero score.
    """
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=SmoothingFunction().method4,
    )
|
src/generation/llm_client.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
env_path = Path(__file__).resolve().parents[2] / '.env'
|
| 7 |
+
load_dotenv(dotenv_path=env_path)
|
| 8 |
+
|
| 9 |
+
HF_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
|
| 10 |
+
REPO_ID = os.getenv("HUGGINGFACE_MODEL", "HuggingFaceH4/zephyr-7b-beta")
|
| 11 |
+
API_URL = f"https://api-inference.huggingface.co/models/{REPO_ID}"
|
| 12 |
+
|
| 13 |
+
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def call_llm(prompt: str, max_length: int = 200) -> str:
    """Send *prompt* to the Hugging Face Inference API and return the generated text.

    Args:
        prompt: Full prompt string sent as the model input.
        max_length: Maximum number of new tokens to generate.

    Returns:
        The generated text, or an error message string if the request fails
        (best-effort behavior preserved from the original).
    """
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": max_length, "temperature": 0.2}
    }
    try:
        print(f"[llm_client] POST {API_URL}")
        # Security fix: never log HEADERS — it contains the bearer API token.
        print(f"[llm_client] auth header present: {bool(HF_TOKEN)}")
        response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=60)
        print(f"[llm_client] Status code: {response.status_code}")
        print(f"[llm_client] Response text: {response.text}")
        response.raise_for_status()
        data = response.json()
        # The HF text-generation endpoint may return either a list of dicts
        # or a single dict; accept both shapes.
        if isinstance(data, list) and data and isinstance(data[0], dict) and "generated_text" in data[0]:
            return data[0]["generated_text"].strip()
        if isinstance(data, dict) and "generated_text" in data:
            return data["generated_text"].strip()
        return str(data)
    except Exception as e:
        print(f"[llm_client] error HTTP HF: {e}")
        return f"error in generate response: {e}"
|
src/generation/prompt_templates.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/generation/prompt_templates.py
"""Prompt templates (Portuguese) for the helpdesk RAG response generator.

Every template exposes two ``str.format`` fields: ``{context}`` (retrieved
chunks) and ``{query}`` (the user question).
"""

# Default template: answer the question based on the technical context.
helpdesk_prompt = """
Contexto técnico:
{context}

Com base nisso, responda à seguinte pergunta:
{query}
"""

# Short, to-the-point answer variant.
concise_helpdesk_prompt = """
Contexto técnico:
{context}

Responda de forma breve e objetiva:
{query}
"""

# Casual/informal tone variant.
informal_helpdesk_prompt = """
Oi! Aqui está o que você precisa saber com base no contexto:
{context}

Pergunta:
{query}

Resposta descontraída:
"""

# Asks the model for a step-by-step resolution.
step_by_step_prompt = """
Contexto técnico:
{context}

Por favor, explique passo a passo como resolver:
{query}
"""

# Asks the model to append useful links when available.
link_suggestion_prompt = """
Contexto técnico:
{context}

Responda a pergunta: {query}

Se possível, inclua links úteis para mais informações.
"""
|
src/generation/response_generator.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.generation.llm_client import call_llm
|
| 2 |
+
from src.generation.prompt_templates import (
|
| 3 |
+
helpdesk_prompt,
|
| 4 |
+
concise_helpdesk_prompt,
|
| 5 |
+
informal_helpdesk_prompt,
|
| 6 |
+
step_by_step_prompt,
|
| 7 |
+
link_suggestion_prompt
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
def build_prompt(query: str, context: str, mode: str = "default") -> str:
    """Select the prompt template for *mode* and fill in context and query.

    Unknown modes silently fall back to the default helpdesk template.
    """
    templates = {
        "default": helpdesk_prompt,
        "concise": concise_helpdesk_prompt,
        "informal": informal_helpdesk_prompt,
        "step_by_step": step_by_step_prompt,
        "with_links": link_suggestion_prompt,
    }
    chosen = templates[mode] if mode in templates else helpdesk_prompt
    return chosen.format(context=context, query=query)
|
| 20 |
+
|
| 21 |
+
def generate_answer(query: str, context: str, mode: str = "default") -> str:
    """Render the *mode* prompt with the given context and return the LLM answer."""
    return call_llm(build_prompt(query, context, mode))
|
src/ingestion/document_loader.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
# File extensions accepted by the loader.
SUPPORTED_EXTENSIONS = {".txt", ".md"}


def load_documents_from_dir(directory: str) -> list[dict]:
    """Recursively load all supported text documents under *directory*.

    Args:
        directory: Root directory to scan (searched recursively).

    Returns:
        One ``{"content": <file text>, "source": <file path>}`` dict per
        ``.txt``/``.md`` file found, in filesystem traversal order.
    """
    docs = []
    for file_path in Path(directory).rglob("*"):
        # is_file() guards against directories whose name happens to end in a
        # supported extension (opening those would raise IsADirectoryError).
        if file_path.is_file() and file_path.suffix.lower() in SUPPORTED_EXTENSIONS:
            docs.append({
                "content": file_path.read_text(encoding="utf-8"),
                "source": str(file_path),
            })
    return docs
|
src/ingestion/embedder.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
import torch
|
| 4 |
+
from langchain_community.document_loaders import DirectoryLoader
|
| 5 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 6 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 7 |
+
from langchain_community.vectorstores import FAISS
|
| 8 |
+
|
| 9 |
+
def check_environment():
    """Return True when a CUDA GPU is visible to torch, else False.

    Logs either the detected GPU name or a "not available" notice.
    """
    cuda_ok = torch.cuda.is_available()
    if cuda_ok:
        gpu_name = torch.cuda.get_device_name(0)
        print(f"CUDA verified, gpu detected: {gpu_name}")
        return True
    print("\nCUDA not available")
    return False
|
| 17 |
+
|
| 18 |
+
# Abort at import time when no GPU is present: run_ingestion() below pins the
# embedding model to the 'cuda' device.
# NOTE(review): calling exit() during module import is abrupt and makes this
# module unusable as a library import on CPU hosts -- consider moving the
# guard inside run_ingestion().
if not check_environment():
    exit()

# Ingestion configuration: source documents, FAISS output dir, embedding model.
RAW_DATA_DIR = "data/raw"
VECTOR_STORE_PATH = "data/vector_store_faiss"
EMBEDDING_MODEL_NAME = "BAAI/bge-m3"
|
| 24 |
+
|
| 25 |
+
def run_ingestion():
    """End-to-end ingestion: load raw .txt files, chunk them, embed them on
    the GPU, and persist a fresh FAISS index under VECTOR_STORE_PATH."""
    print(f"\nstarting ingestion with GPU and model: {EMBEDDING_MODEL_NAME}")

    documents = DirectoryLoader(
        RAW_DATA_DIR, glob="**/*.txt", show_progress=True, use_multithreading=True
    ).load()
    if not documents:
        print(f"no docs found in '{RAW_DATA_DIR}'")
        return
    print(f"loading {len(documents)} docs.")

    # ~1000-char chunks with 100 chars of overlap between neighbours.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    pieces = splitter.split_documents(documents)
    print(f"docs divided in {len(pieces)} chunks.")

    print("loading model embedding for GPU")
    embedder = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={'device': 'cuda'},
        encode_kwargs={'normalize_embeddings': True},
    )
    print("loaded model")

    print("creating vector DB faiss")
    # Rebuild from scratch: drop any previous index directory first.
    if os.path.exists(VECTOR_STORE_PATH):
        print(f"removing old vector in '{VECTOR_STORE_PATH}'...")
        shutil.rmtree(VECTOR_STORE_PATH)

    store = FAISS.from_documents(pieces, embedder)
    store.save_local(VECTOR_STORE_PATH)

    banner = "-" * 50
    print(banner)
    print("pipeline ingestion conclude")
    print(banner)


if __name__ == "__main__":
    run_ingestion()
|
src/ingestion/preprocessor.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
def clean_text(text: str) -> str:
    """Collapse repeated newlines and runs of spaces/tabs, then strip the ends."""
    collapsed = re.sub(r"\n+", "\n", text)
    collapsed = re.sub(r"[ \t]+", " ", collapsed)
    return collapsed.strip()
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def preprocess_documents(documents: list[dict]) -> list[dict]:
    """Return shallow copies of *documents* with each "content" field cleaned.

    All other keys are carried over untouched; the input list and its dicts
    are never mutated.
    """
    cleaned = []
    for doc in documents:
        updated = dict(doc)
        updated["content"] = clean_text(doc["content"])
        cleaned.append(updated)
    return cleaned
|
src/ingestion/text_splitter.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import List, Dict
|
| 3 |
+
|
| 4 |
+
def split_text(
    documents: List[Dict],
    chunk_size: int = 500,
    chunk_overlap: int = 50
) -> List[Dict]:
    """Split documents into Q&A entries, then into size-bounded chunks.

    Each document's content is first split on occurrences of "Pergunta:"
    (lookahead split, so the marker stays at the start of each entry); any
    entry longer than *chunk_size* is then cut into overlapping windows.

    Args:
        documents: dicts with at least a "content" key; "source" is optional.
        chunk_size: maximum characters per chunk.
        chunk_overlap: characters shared by consecutive chunks of one entry.

    Returns:
        A list of ``{"content", "source", "entry_id", "chunk_id"}`` dicts.

    Raises:
        ValueError: if ``chunk_overlap >= chunk_size`` -- the window start
            would never advance, looping forever.
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    chunks = []

    for doc in documents:
        text = doc["content"]
        source = doc.get("source", "unknown")

        # Lookahead split keeps the "Pergunta:" marker inside each entry.
        entries = re.split(r"(?=Pergunta:)", text)

        for entry_id, entry in enumerate(entries):
            entry = entry.strip()
            if not entry:
                continue

            if len(entry) <= chunk_size:
                chunks.append({
                    "content": entry,
                    "source": source,
                    "entry_id": entry_id,
                    "chunk_id": 0
                })
            else:
                start = 0
                end = chunk_size
                chunk_id = 0

                while start < len(entry):
                    chunks.append({
                        "content": entry[start:end],
                        "source": source,
                        "entry_id": entry_id,
                        "chunk_id": chunk_id
                    })
                    chunk_id += 1
                    # Step the window forward, keeping chunk_overlap chars
                    # from the previous chunk; guaranteed to advance because
                    # chunk_overlap < chunk_size was validated above.
                    start = end - chunk_overlap
                    end = start + chunk_size

    return chunks
|
src/retrieval/__pycache__/reranker.cpython-313.pyc
ADDED
|
Binary file (3.99 kB). View file
|
|
|
src/retrieval/__pycache__/retriever.cpython-313.pyc
ADDED
|
Binary file (2.3 kB). View file
|
|
|
src/retrieval/__pycache__/vector_store.cpython-313.pyc
ADDED
|
Binary file (1.85 kB). View file
|
|
|
src/retrieval/query_processor.py
ADDED
|
File without changes
|
src/retrieval/reranker.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
import numpy as np
|
| 3 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 4 |
+
from sentence_transformers import CrossEncoder
|
| 5 |
+
from langchain_community.vectorstores import FAISS
|
| 6 |
+
from langchain_core.documents import Document
|
| 7 |
+
|
| 8 |
+
class HybridReranker:
    """Hybrid retrieval: dense (FAISS) and sparse (TF-IDF) candidate
    generation, followed by cross-encoder reranking of the merged set."""

    def __init__(
        self,
        vector_store: FAISS,
        reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
    ):
        """Index every chunk already held by *vector_store* with TF-IDF.

        Args:
            vector_store: populated FAISS store whose docstore holds the chunks.
            reranker_model: sentence-transformers CrossEncoder model id.
        """
        self.vector_store = vector_store
        self.reranker = CrossEncoder(reranker_model)

        # NOTE(review): relies on the private docstore._dict preserving
        # insertion order -- true for the default in-memory docstore, but
        # this is not a public API; confirm on langchain upgrades.
        docs_in_order = list(self.vector_store.docstore._dict.values())
        self.chunk_texts = [doc.page_content for doc in docs_in_order]
        self.chunk_metadata = [doc.metadata for doc in docs_in_order]

        print(f"rerank model '{reranker_model}' loading. building matriz tf-idf")
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.chunk_texts)
        print("reranker ready")

    def retrieve_and_rerank(
        self,
        query: str,
        top_k_dense: int = 20,
        top_k_final: int = 5,
    ) -> List[Document]:
        """Retrieve candidates from both legs, dedupe them, and rerank.

        Args:
            query: user query text.
            top_k_dense: candidates taken from EACH of the dense and sparse legs.
            top_k_final: number of documents returned after reranking.

        Returns:
            Up to top_k_final Documents, ordered by descending cross-encoder score.
        """
        # Dense leg: embedding similarity search against the FAISS index.
        dense_docs = self.vector_store.similarity_search(query, k=top_k_dense)

        # Sparse leg: TF-IDF score of the query against every stored chunk
        # (TfidfVectorizer L2-normalizes rows by default, so this dot product
        # behaves as cosine similarity -- verify if norm= is ever changed).
        q_vec = self.vectorizer.transform([query])
        sparse_scores = (self.tfidf_matrix @ q_vec.T).toarray().ravel()
        sparse_indices = np.argsort(-sparse_scores)[:top_k_dense]

        sparse_docs = [
            Document(page_content=self.chunk_texts[i], metadata=self.chunk_metadata[i])
            for i in sparse_indices
        ]

        # Merge both candidate lists, deduplicating on exact page content;
        # dense results keep priority on ties because they come first.
        combined_docs = []
        seen_contents = set()
        for doc in dense_docs + sparse_docs:
            if doc.page_content not in seen_contents:
                combined_docs.append(doc)
                seen_contents.add(doc.page_content)

        # Cross-encoder scores each (query, chunk) pair jointly.
        pairs = [[query, doc.page_content] for doc in combined_docs]
        rerank_scores = self.reranker.predict(pairs)

        doc_scores = list(zip(combined_docs, rerank_scores))
        sorted_doc_scores = sorted(doc_scores, key=lambda x: x[1], reverse=True)
        final_docs = [doc for doc, score in sorted_doc_scores[:top_k_final]]

        return final_docs
|
src/retrieval/retriever.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from sentence_transformers import SentenceTransformer
|
| 3 |
+
from src.retrieval.vector_store import VectorStore
|
| 4 |
+
|
| 5 |
+
class Retriever:
    """Dense retriever: encodes queries with a sentence-transformer and
    searches a flat inner-product FAISS index of precomputed embeddings."""

    def __init__(self, embeddings_path: str, model_name: str = "BAAI/bge-small-en-v1.5"):
        """Load the encoder and fill the index from a saved .npy file.

        Args:
            embeddings_path: path to a numpy array of shape (n_chunks, dim);
                the dim must match the encoder's output dimension.
            model_name: sentence-transformers model id.
        """
        self.model = SentenceTransformer(model_name)
        dim = self.model.get_sentence_embedding_dimension()
        self.index = VectorStore(dim=dim)

        embeds = np.load(embeddings_path).astype(np.float32)
        self.index.add(embeds)

    def retrieve(self, query: str, top_k: int = 5):
        """Return (indices, distances) arrays of the top_k nearest chunks.

        BUG FIX: VectorStore.search already unpacks FAISS's 2-D result and
        returns 1-D arrays, so indexing them with [0] again (as the original
        code did) returned a single scalar pair instead of top_k results.
        """
        qv = self.model.encode(query, normalize_embeddings=True).astype(np.float32)
        distances, indices = self.index.search(qv, top_k)
        return indices.copy(), distances.copy()

    def set_chunk_texts(self, texts: list[str]):
        """Attach the chunk texts corresponding to index rows (optional)."""
        self._chunk_texts = texts

    def get_chunk_texts(self) -> list[str]:
        """Return the attached chunk texts, or [] when none were set."""
        return getattr(self, "_chunk_texts", [])
|
src/retrieval/vector_store.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import faiss
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
class VectorStore:
    """Thin wrapper around a flat inner-product (IP) FAISS index."""

    def __init__(self, dim: int):
        """Create an empty index for *dim*-dimensional float32 vectors."""
        self.dim = dim
        self.index = faiss.IndexFlatIP(dim)

    def add(self, embeddings: np.ndarray):
        """Append embedding rows to the index, coercing to float32 for FAISS."""
        self.index.add(np.asarray(embeddings, dtype=np.float32))

    def search(self, query_vec: np.ndarray, top_k: int = 5):
        """Search with a single query vector.

        Accepts a 1-D vector or a (1, dim) row; returns the first result
        row's (distances, indices) as 1-D arrays.
        """
        q = np.asarray(query_vec, dtype=np.float32)
        if q.ndim == 1:
            q = q[np.newaxis, :]
        distances, indices = self.index.search(q, top_k)
        return distances[0], indices[0]
|
src/utils/env.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
load_dotenv()
|
| 4 |
+
def get_env_var(name: str, default: str = "") -> str:
    """Look up *name* in the process environment, returning *default* when unset."""
    value = os.environ.get(name, default)
    return value
|
src/utils/io.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def read_txt_chunks(path: str) -> list[str]:
    """Read a UTF-8 text file and return its non-empty blank-line-separated blocks, stripped."""
    with open(path, encoding="utf-8") as handle:
        raw = handle.read()
    blocks = []
    for piece in raw.split("\n\n"):
        piece = piece.strip()
        if piece:
            blocks.append(piece)
    return blocks
|