vinimoreira commited on
Commit
2068d15
·
verified ·
1 Parent(s): 04e8f63

Add files for RAG backend

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. Dockerfile +12 -0
  3. LICENSE +21 -0
  4. README.md +2 -12
  5. api/__init__.py +0 -0
  6. api/__pycache__/__init__.cpython-313.pyc +0 -0
  7. api/__pycache__/main.cpython-313.pyc +0 -0
  8. api/__pycache__/rag_chain.cpython-313.pyc +0 -0
  9. api/__pycache__/schemas.cpython-313.pyc +0 -0
  10. api/main.py +57 -0
  11. api/rag_chain.py +108 -0
  12. api/schemas.py +14 -0
  13. config/base.yaml +43 -0
  14. config/dev.yaml +0 -0
  15. config/prod.yaml +0 -0
  16. data/embeddings/batch_000.npy +3 -0
  17. data/raw/base_treinamento.txt +0 -0
  18. data/vector_store_faiss/index.faiss +3 -0
  19. data/vector_store_faiss/index.pkl +3 -0
  20. notebooks/demo_embedder.py +24 -0
  21. notebooks/demo_evaluator.py +3 -0
  22. notebooks/demo_generator.py +23 -0
  23. notebooks/demo_ingestion.py +21 -0
  24. notebooks/demo_reranker.py +35 -0
  25. notebooks/demo_retriever.py +14 -0
  26. notebooks/demo_vector_store.py +20 -0
  27. pytest.ini +6 -0
  28. requiriments.txt +21 -0
  29. src/__pycache__/__init__.cpython-313.pyc +0 -0
  30. src/__pycache__/download.cpython-313.pyc +0 -0
  31. src/__pycache__/gerar_chunks.cpython-313.pyc +0 -0
  32. src/evaluation/benchmarks.py +0 -0
  33. src/evaluation/evaluator.py +70 -0
  34. src/evaluation/metrics.py +38 -0
  35. src/generation/llm_client.py +37 -0
  36. src/generation/prompt_templates.py +44 -0
  37. src/generation/response_generator.py +23 -0
  38. src/ingestion/document_loader.py +20 -0
  39. src/ingestion/embedder.py +63 -0
  40. src/ingestion/preprocessor.py +17 -0
  41. src/ingestion/text_splitter.py +46 -0
  42. src/retrieval/__pycache__/reranker.cpython-313.pyc +0 -0
  43. src/retrieval/__pycache__/retriever.cpython-313.pyc +0 -0
  44. src/retrieval/__pycache__/vector_store.cpython-313.pyc +0 -0
  45. src/retrieval/query_processor.py +0 -0
  46. src/retrieval/reranker.py +57 -0
  47. src/retrieval/retriever.py +23 -0
  48. src/retrieval/vector_store.py +20 -0
  49. src/utils/env.py +5 -0
  50. src/utils/io.py +3 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  RAG_HelpDesk/data/vector_store_faiss/index.faiss filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  RAG_HelpDesk/data/vector_store_faiss/index.faiss filter=lfs diff=lfs merge=lfs -text
37
+ data/vector_store_faiss/index.faiss filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY requirements.txt .
6
+
7
+ RUN pip install --no-cache-dir --upgrade pip && \
8
+ pip install --no-cache-dir -r requirements.txt
9
+
10
+ COPY . .
11
+
12
+ CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "7860"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Vinicius Moreira
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,2 @@
1
- ---
2
- title: RAG HelpDesk
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: indigo
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- short_description: RAG with langchain and fastAPI for to answer technical IT q/
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # RAG_HelpDesk
2
+ Esse projeto visa criar um RAG para ajudar usuários em relação a dúvidas de Hardware e Software.
 
 
 
 
 
 
 
 
 
 
api/__init__.py ADDED
File without changes
api/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (163 Bytes). View file
 
api/__pycache__/main.cpython-313.pyc ADDED
Binary file (2.88 kB). View file
 
api/__pycache__/rag_chain.cpython-313.pyc ADDED
Binary file (5.39 kB). View file
 
api/__pycache__/schemas.cpython-313.pyc ADDED
Binary file (1.34 kB). View file
 
api/main.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""FastAPI entry point for the Helpdesk RAG API."""
import sys
from pathlib import Path
import traceback

# Make the project root importable when launched via `uvicorn api.main:app`.
project_root = Path(__file__).resolve().parents[1]
sys.path.append(str(project_root))

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from api.schemas import QueryRequest, QueryResponse, SourceChunk
from api.rag_chain import get_rag_chain


app = FastAPI(
    title="Helpdesk RAG API",
    description="API for answering questions about an IT knowledge base",
    version="1.0.0"
)

# Build the RAG pipeline once at startup; fail fast with a clear error when it
# cannot be constructed (missing vector store, missing API token, ...).
try:
    rag_chain = get_rag_chain()
except Exception as e:
    print("error to load pipeline RAG")
    traceback.print_exc()
    raise RuntimeError(f"error to load pipeline RAG: {e}") from e

# NOTE(review): browsers reject `allow_credentials=True` together with a
# wildcard origin per the CORS spec; pin concrete origins if cookies or auth
# headers are ever needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
)


@app.get("/", tags=["Status"])
def read_root():
    """Liveness probe."""
    return {"status": "API ON"}


@app.post("/query", response_model=QueryResponse, tags=["RAG"])
async def handle_query(request: QueryRequest):
    """Run one query through the RAG chain and return the answer plus sources.

    Raises:
        HTTPException: 500 on any pipeline failure (details go to the console).
    """
    print(f"processing query: '{request.query}'")
    try:
        result = rag_chain.invoke(request.query)
        source_chunks = [
            SourceChunk(
                page_content=doc.page_content,
                source=doc.metadata.get('source', 'desconhecida')
            ) for doc in result['source_chunks']
        ]
        return QueryResponse(
            answer=result['answer'],
            source_chunks=source_chunks
        )
    except Exception:
        # was: f-string with no placeholder (F541)
        print("error to process query")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail="internal error. check the server console")
api/rag_chain.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Builds the LangChain RAG pipeline used by the API."""
import os
from pathlib import Path
from dotenv import load_dotenv
from operator import itemgetter
from typing import List, Dict

from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from huggingface_hub import InferenceClient

load_dotenv()

PROJECT_ROOT = Path(__file__).resolve().parents[1]
VECTOR_STORE_PATH = str(PROJECT_ROOT / "data" / "vector_store_faiss")
EMBEDDING_MODEL_NAME = "BAAI/bge-m3"
LLM_REPO_ID = os.getenv("HUGGINGFACE_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")

HF_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")

if not HF_TOKEN:
    raise ValueError("HUGGINGFACE_API_TOKEN not found")

client = InferenceClient(model=LLM_REPO_ID, token=HF_TOKEN)

prompt_template = PromptTemplate.from_template("""
<|system|>
Você é um assistente de helpdesk de TI especialista... (seu prompt aqui)
</s><|user|>
Contexto: {context}\n\nPergunta: {query}
</s><|assistant|>
Resposta em Português:
""")


def format_docs(docs: List[Dict]) -> str:
    """Join the page contents of retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


def generate_answer_from_context(input_dict: Dict) -> str:
    """Format the prompt from retrieved context and call the hosted LLM.

    Args:
        input_dict: {"context": retrieved documents, "query": user question}.

    Returns:
        The cleaned model answer, or an error message string on failure
        (the chain is best-effort, so nothing is raised).
    """
    context_docs = input_dict["context"]
    query_text = input_dict["query"]
    formatted_context = format_docs(context_docs)

    prompt_value = prompt_template.invoke({
        "context": formatted_context,
        "query": query_text
    })
    # BUG FIX: str(prompt_value) yields the object's repr (e.g. text="...")
    # rather than the prompt itself; to_string() returns the actual text.
    final_prompt_text = prompt_value.to_string()

    try:
        response = client.chat_completion(
            messages=[{"role": "user", "content": final_prompt_text}],
            max_tokens=300,
            temperature=0.1
        )
        raw_answer = response.choices[0].message.content

        clean_answer = raw_answer.strip()

        # Defensive cleanup kept from the previous repr-based behaviour, in
        # case the model ever echoes a text="..." wrapper.
        if clean_answer.startswith('text="'):
            clean_answer = clean_answer[6:]
        elif clean_answer.startswith("text='"):
            clean_answer = clean_answer[6:]

        if clean_answer.endswith('"') or clean_answer.endswith("'"):
            clean_answer = clean_answer[:-1]

        if clean_answer.startswith("Resposta:"):
            clean_answer = clean_answer.split("Resposta:", 1)[1]

        return clean_answer.strip()

    except Exception as e:
        print(f"error for call API huggingface: {e}")
        return f"error for contact llm: {e}"


def get_rag_chain():
    """Assemble and return the retrieval + generation chain.

    Loads the FAISS store with the project embedding model, prefers the
    hybrid reranker when importable, and falls back to a plain similarity
    retriever otherwise.
    """
    print("loading pipeline")

    embeddings_model = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True}
    )
    # allow_dangerous_deserialization: the pickle index is produced by our own
    # ingestion pipeline, never by untrusted input.
    vector_store = FAISS.load_local(
        VECTOR_STORE_PATH, embeddings_model, allow_dangerous_deserialization=True
    )

    try:
        from src.retrieval.reranker import HybridReranker
        hybrid_reranker = HybridReranker(vector_store=vector_store)
        retrieval_chain = lambda query: hybrid_reranker.retrieve_and_rerank(query)
        print("using pipeline with reranker")
    except ImportError:
        retrieval_chain = vector_store.as_retriever(search_kwargs={"k": 5})
        print("reranker not found, using simple retriever")

    rag_chain = {
        "context": retrieval_chain,
        "query": RunnablePassthrough()
    } | RunnableParallel({
        "source_chunks": itemgetter("context"),
        "answer": generate_answer_from_context
    })

    print("pipeline ready.")
    return rag_chain
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Pydantic request/response models for the RAG API."""
from pydantic import BaseModel, Field
from typing import List, Optional


class QueryRequest(BaseModel):
    """Incoming question plus retrieval options."""
    query: str = Field(..., description="query for RAG")
    top_k: int = Field(3, description="number of relevants docs to be retrivied", ge=1, le=10)


class SourceChunk(BaseModel):
    """One retrieved chunk returned as supporting evidence."""
    page_content: str
    source: str = Field(description="file path")


class QueryResponse(BaseModel):
    """Generated answer together with the chunks that supported it."""
    answer: str
    source_chunks: List[SourceChunk]
config/base.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ raw_path: "data/raw"
3
+ processed_path: "data/processed"
4
+ embeddings_path: "data/embeddings"
5
+
6
+ ingestion:
7
+ chunk_size: 500
8
+ chunk_overlap: 50
9
+ languages: ["pt", "en"]
10
+
11
+ embedder:
12
+ model_name: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
13
+ batch_size: 64
14
+ normalize: true
15
+
16
+ vector_store:
17
+ type: "faiss"
18
+ index_factory: "Flat"
19
+ metric: "L2"
20
+ save_index_path: "data/embeddings/faiss_index.idx"
21
+
22
+ llm:
23
+ provider: "openai"
24
+ model_name: "gpt-3.5-turbo"
25
+ max_tokens: 512
26
+ temperature: 0.7
27
+ top_p: 0.9
28
+ api_key_env: "OPENAI_API_KEY"
29
+
30
+ api:
31
+ host: "0.0.0.0"
32
+ port: 8000
33
+ docs_url: "/docs"
34
+
35
+ logging:
36
+ level: "INFO"
37
+ format: "[%(asctime)s] %(levelname)s %(name)s: %(message)s"
38
+
39
+ evaluation:
40
+ top_k: 5
41
+ benchmark_path: "tests/benchmark.json"
42
+ save_reports: true
43
+ reports_path: "reports/"
config/dev.yaml ADDED
File without changes
config/prod.yaml ADDED
File without changes
data/embeddings/batch_000.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0234624565756c111fcd23e4090f7f08255567e57fad9a01cde641f862f4c93
3
+ size 76800128
data/raw/base_treinamento.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/vector_store_faiss/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da3c437c49b2aab9bcff75a1553f3ea1cd17212b70c66cdc3fcebbab1bbf1a7c
3
+ size 43315245
data/vector_store_faiss/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac470dbf2aa92c9fe996c3a0df6ca4e4435afb806b06f6e8e877def38323393f
3
+ size 10958179
notebooks/demo_embedder.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Demo: embed the raw knowledge base and persist the vectors to disk."""
from src.ingestion.document_loader import load_documents_from_dir
from src.ingestion.preprocessor import preprocess_documents
from src.ingestion.text_splitter import split_text
from src.ingestion.embedder import load_embedder, generate_embeddings, save_embeddings, get_chunk_texts

CHUNK_SIZE = 300
CHUNK_OVERLAP = 50
RAW_PATH = "data/raw"
EMBEDDING_PATH = "data/embeddings/batch_000.npy"

# Ingest -> clean -> chunk.
raw_docs = load_documents_from_dir(RAW_PATH)
cleaned = preprocess_documents(raw_docs)
chunked = split_text(cleaned, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Embed every chunk text with the shared embedder.
embedder = load_embedder()
embeddings = generate_embeddings(get_chunk_texts(chunked), embedder)

print(f"embeddings shape: {embeddings.shape}")
print(f"vector example (1º):\n{embeddings[0][:10]}...")

save_embeddings(embeddings, EMBEDDING_PATH)
print(f"save embeddings in: {EMBEDDING_PATH}")
notebooks/demo_evaluator.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
"""Demo: run the retrieval evaluation benchmark."""
from src.evaluation.evaluator import run_evaluation

# Guarded so that importing this module does not trigger a full evaluation
# run as a side effect (the original executed at import time).
if __name__ == "__main__":
    run_evaluation()
notebooks/demo_generator.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Demo: retrieve the best-matching chunk for a query and generate an answer."""
import numpy as np
from src.retrieval.retriever import Retriever
from src.generation.response_generator import generate_answer

retriever = Retriever("data/embeddings/batch_000.npy")
query = "Meu notebook está muito lento, o que posso fazer? E se não resolver?"

idxs, scores = retriever.retrieve(query, top_k=1)

# Normalise to flat 1-D arrays regardless of the retriever's return shape.
idxs = np.atleast_1d(idxs).flatten()
scores = np.atleast_1d(scores).flatten()

# The knowledge base is a plain-text file with blank-line separated entries.
with open("data/raw/base_treinamento.txt", encoding="utf-8") as f:
    chunks = [entry.strip() for entry in f.read().split("\n\n") if entry.strip()]

context = chunks[int(idxs[0])]

answer = generate_answer(query, context)

print(f"\nquery: {query}")
print(f"context selected (chunk #{idxs[0]}):\n{context}\n")
print("generated response:\n")
print(answer)
notebooks/demo_ingestion.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Demo: load, clean and chunk the raw documents, printing a small sample."""
from src.ingestion.document_loader import load_documents_from_dir
from src.ingestion.preprocessor import preprocess_documents
from src.ingestion.text_splitter import split_text

RAW_PATH = "data/raw"
CHUNK_SIZE = 300
CHUNK_OVERLAP = 50

docs = load_documents_from_dir(RAW_PATH)
print(f"docs loaded: {len(docs)}")

cleaned_docs = preprocess_documents(docs)
print("pre-process completed")  # was an f-string with no placeholder (F541)

chunks = split_text(cleaned_docs, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
print(f"total chunks generated: {len(chunks)}")

# Show the first few chunks as a sanity check.
for i, chunk in enumerate(chunks[:3]):
    print(f"\n--- Chunk {i} ---")
    print(f"font: {chunk['source']}")
    print(chunk['content'][:300])
notebooks/demo_reranker.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Demo: hybrid dense + cross-encoder reranking over the knowledge base."""
import numpy as np
from src.retrieval.retriever import Retriever
from src.retrieval.reranker import HybridReranker

RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"


def main():
    # Load the chunk texts (blank-line separated entries).
    with open("data/raw/base_treinamento.txt", encoding="utf-8") as f:
        chunks = [entry.strip() for entry in f.read().split("\n\n") if entry.strip()]

    # Retriever backed by the precomputed embeddings.
    retriever = Retriever("data/embeddings/batch_000.npy")

    # Hybrid reranker: dense retrieval blended with sparse scores.
    hybrid = HybridReranker(
        retriever=retriever,
        chunk_texts=chunks,
        reranker_model=RERANKER_MODEL,
        sparse_alpha=0.5,
    )

    query = "Meu notebook está muito lento, o que posso fazer? E se não resolver?"

    # Retrieve candidates, then rerank down to the final top documents.
    idxs, scores = hybrid.retrieve_and_rerank(query, top_k_dense=10, top_k_final=3)

    print(f"\nQuery: {query}\n")
    for rank, idx in enumerate(idxs):
        print(f"Rank {rank+1} - Chunk #{idx} (score: {scores[rank]:.4f}):")
        print(chunks[idx])
        print("-" * 40)


if __name__ == "__main__":
    main()
notebooks/demo_retriever.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Demo: dense retrieval over the precomputed embeddings."""
import numpy as np
from src.retrieval.retriever import Retriever

retriever = Retriever("data/embeddings/batch_000.npy")

query = "O que fazer se o notebook não liga?"
idxs, scores = retriever.retrieve(query, top_k=5)

# Flatten whatever array shape the retriever returns.
idxs = np.atleast_1d(idxs).flatten()
scores = np.atleast_1d(scores).flatten()

print(f"\ntop results for: {query}\n")
for rank, (idx, score) in enumerate(zip(idxs, scores), 1):
    print(f"{rank}. idx: {int(idx):4d} — similarity: {score:.4f}")
notebooks/demo_vector_store.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Demo: build an in-memory vector index and run a self-similarity search."""
import numpy as np
from src.retrieval.vector_store import VectorStore

embeds = np.load("data/embeddings/batch_000.npy")
dim = embeds.shape[1]
print(f"embeddings loaded: {embeds.shape}")

vs = VectorStore(dim=dim)
vs.add(embeds)
print("vectors added in index")

# Query with the first stored vector: it should be its own nearest neighbour.
query = embeds[0]
dists, idxs = vs.search(query, top_k=5)

print("\nresult search (dummy):")
for rank, (idx, dist) in enumerate(zip(idxs, dists), 1):
    print(f"{rank}. idx: {idx:4d} — distância: {dist:.4f}")

print("Norma do primeiro vetor:", np.linalg.norm(embeds[0]))
print("Norma do segundo vetor:", np.linalg.norm(embeds[1]))
pytest.ini ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ [pytest]
2
+ minversion = 6.0
3
+ addopts = -ra -q
4
+ testpaths =
5
+ tests
6
+ python_paths = src
requiriments.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sentence-transformers>=2.2.2
2
+ faiss-cpu>=1.7.3
3
+ openai>=0.27.0
4
+ python-docx>=0.8.11
5
+ PyPDF2>=3.0.0
6
+ beautifulsoup4>=4.12.2
7
+ PyYAML>=6.0
8
+ uvicorn[standard]>=0.22.0
9
+ pytest>=7.0.1
10
+ python-dotenv>=1.0.0
11
+ nltk>=3.8.1
12
+ pytest
13
+ unstructured
14
+ torch
15
+ fastapi
16
+ uvicorn[standard]
17
+ langchain
18
+ langchain-huggingface
19
+ pydantic
20
+ langchain_community
21
+ streamlit
src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (163 Bytes). View file
 
src/__pycache__/download.cpython-313.pyc ADDED
Binary file (384 Bytes). View file
 
src/__pycache__/gerar_chunks.cpython-313.pyc ADDED
Binary file (2.59 kB). View file
 
src/evaluation/benchmarks.py ADDED
File without changes
src/evaluation/evaluator.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import numpy as np
3
+ from src.retrieval.retriever import Retriever
4
+ from src.retrieval.reranker import HybridReranker
5
+ from src.evaluation.metrics import (
6
+ precision_at_k as retrieval_precision_at_k,
7
+ recall_at_k,
8
+ mean_reciprocal_rank,
9
+ bleu_score
10
+ )
11
+ from src.ingestion.document_loader import load_documents_from_dir
12
+ from src.ingestion.preprocessor import preprocess_documents
13
+ from src.ingestion.text_splitter import split_text
14
+
15
+
16
def run_evaluation(
    benchmark_path: str = "tests/benchmark.json",
    k: int = 3,
    top_k_dense: int = 10,
    top_k_final: int = 3,
    sparse_alpha: float = 0.5
):
    """Run the retrieval benchmark, printing per-query precision/recall and MRR.

    Args:
        benchmark_path: JSON file with [{"query": ..., "relevant_idxs": [...]}].
        k: cutoff used for precision@k and recall@k.
        top_k_dense: candidates fetched by the dense retriever.
        top_k_final: candidates kept after reranking.
        sparse_alpha: weight of the sparse score in the hybrid reranker.
    """
    with open(benchmark_path, encoding="utf-8") as f:
        benchmarks = json.load(f)

    # Rebuild the chunk texts exactly as at ingestion time so benchmark
    # indices line up with the stored embeddings.
    docs = load_documents_from_dir("data/raw")
    clean_docs = preprocess_documents(docs)
    chunks = split_text(clean_docs, chunk_size=300, chunk_overlap=50)
    texts = [chunk['content'] for chunk in chunks]

    retriever = Retriever("data/embeddings/batch_000.npy")
    reranker = HybridReranker(
        retriever=retriever,
        chunk_texts=texts,
        reranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
        sparse_alpha=sparse_alpha
    )

    all_retrieved = []
    all_relevant = []

    print(f"starting assessment reranker: Precision@{k}, Recall@{k}, MRR")
    print(f"dense top_k: {top_k_dense}, final top_k: {top_k_final}, sparse_alpha: {sparse_alpha}\n")

    for case_num, entry in enumerate(benchmarks, 1):
        query = entry['query']
        relevant_idxs = entry.get('relevant_idxs', [])

        idxs, scores = reranker.retrieve_and_rerank(
            query,
            top_k_dense=top_k_dense,
            top_k_final=top_k_final
        )

        prec = retrieval_precision_at_k(retrieved_idxs=idxs, relevant_idxs=relevant_idxs, k=k)
        rec = recall_at_k(retrieved_idxs=idxs, relevant_idxs=relevant_idxs, k=k)

        all_retrieved.append(idxs)
        all_relevant.append(relevant_idxs)

        print(f"{case_num}. Query: {query}")
        print(f" Precision@{k}: {prec:.2f}, Recall@{k}: {rec:.2f}")
        print(f" Retrieved idxs: {idxs}")
        print(f" Rerank scores: {[f'{s:.4f}' for s in scores]}\n")

    mrr = mean_reciprocal_rank(retrieved_lists=all_retrieved, relevant_idxs_list=all_relevant)
    print(f"mean reciprocal rank (MRR): {mrr:.2f}\n")

if __name__ == "__main__":
    run_evaluation()
src/evaluation/metrics.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from typing import List
3
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
4
+
5
+
6
def precision_at_k(retrieved_idxs: List[int], relevant_idxs: List[int], k: int) -> float:
    """Fraction of the top-k retrieved indices that appear in the relevant set."""
    if k <= 0:
        return 0.0
    relevant = set(relevant_idxs)
    hits = sum(idx in relevant for idx in retrieved_idxs[:k])
    return hits / k
12
+
13
+
14
def recall_at_k(retrieved_idxs: List[int], relevant_idxs: List[int], k: int) -> float:
    """Fraction of the relevant indices found within the top-k retrieved."""
    if not relevant_idxs:
        return 0.0
    relevant = set(relevant_idxs)
    hits = sum(idx in relevant for idx in retrieved_idxs[:k])
    return hits / len(relevant_idxs)
20
+
21
+
22
def mean_reciprocal_rank(retrieved_lists: List[List[int]], relevant_idxs_list: List[List[int]]) -> float:
    """Mean of the reciprocal rank of the first relevant hit per query (0 when none)."""
    def _first_hit_rr(retrieved, relevant):
        # Reciprocal rank of the first relevant index; 0.0 if nothing matches.
        for rank, idx in enumerate(retrieved, start=1):
            if idx in relevant:
                return 1.0 / rank
        return 0.0

    rr_scores = [
        _first_hit_rr(retrieved, relevant)
        for retrieved, relevant in zip(retrieved_lists, relevant_idxs_list)
    ]
    return float(np.mean(rr_scores)) if rr_scores else 0.0
33
+
34
+
35
def bleu_score(reference: str, candidate: str) -> float:
    """Smoothed 4-gram BLEU between two whitespace-tokenised strings."""
    smoother = SmoothingFunction().method4
    return sentence_bleu(
        [reference.split()],
        candidate.split(),
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=smoother,
    )
src/generation/llm_client.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Thin HTTP client for the Hugging Face Inference API."""
import os
import requests
from dotenv import load_dotenv
from pathlib import Path

# Load the project-level .env (two directories up from this file).
env_path = Path(__file__).resolve().parents[2] / '.env'
load_dotenv(dotenv_path=env_path)

HF_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
REPO_ID = os.getenv("HUGGINGFACE_MODEL", "HuggingFaceH4/zephyr-7b-beta")
API_URL = f"https://api-inference.huggingface.co/models/{REPO_ID}"

HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}


def call_llm(prompt: str, max_length: int = 200) -> str:
    """Call the hosted model and return its generated text.

    Args:
        prompt: fully formatted prompt string.
        max_length: maximum number of new tokens to generate.

    Returns:
        The generated text, or an error-message string on failure (this
        client is best-effort and never raises).
    """
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": max_length, "temperature": 0.2}
    }
    try:
        print(f"[llm_client] POST {API_URL}")
        # SECURITY FIX: never print HEADERS values -- the Authorization header
        # contains the bearer token. Log only the header names.
        print(f"[llm_client] HEADERS: {list(HEADERS)}")
        response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=60)
        print(f"[llm_client] Status code: {response.status_code}")
        text = response.text
        print(f"[llm_client] Response text: {text}")
        response.raise_for_status()
        data = response.json()
        # The API returns either a list of dicts or a single dict.
        if isinstance(data, list) and data and isinstance(data[0], dict) and "generated_text" in data[0]:
            return data[0]["generated_text"].strip()
        if isinstance(data, dict) and "generated_text" in data:
            return data["generated_text"].strip()
        return str(data)
    except Exception as e:
        print(f"[llm_client] error HTTP HF: {e}")
        return f"error in generate response: {e}"
src/generation/prompt_templates.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Prompt templates for the generation step.

Each template takes the same two format placeholders: {context} (retrieved
technical context) and {query} (the user's question).
"""

# Default tone.
helpdesk_prompt = """
Contexto técnico:
{context}

Com base nisso, responda à seguinte pergunta:
{query}
"""

# Short, to-the-point answers.
concise_helpdesk_prompt = """
Contexto técnico:
{context}

Responda de forma breve e objetiva:
{query}
"""

# Casual tone.
informal_helpdesk_prompt = """
Oi! Aqui está o que você precisa saber com base no contexto:
{context}

Pergunta:
{query}

Resposta descontraída:
"""

# Step-by-step troubleshooting.
step_by_step_prompt = """
Contexto técnico:
{context}

Por favor, explique passo a passo como resolver:
{query}
"""

# Answer plus suggested links.
link_suggestion_prompt = """
Contexto técnico:
{context}

Responda a pergunta: {query}

Se possível, inclua links úteis para mais informações.
"""
src/generation/response_generator.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Assembles prompts from templates and delegates generation to the LLM client."""
from src.generation.llm_client import call_llm
from src.generation.prompt_templates import (
    helpdesk_prompt,
    concise_helpdesk_prompt,
    informal_helpdesk_prompt,
    step_by_step_prompt,
    link_suggestion_prompt
)

# Maps the public `mode` string to its prompt template.
_PROMPT_MAP = {
    "default": helpdesk_prompt,
    "concise": concise_helpdesk_prompt,
    "informal": informal_helpdesk_prompt,
    "step_by_step": step_by_step_prompt,
    "with_links": link_suggestion_prompt
}


def build_prompt(query: str, context: str, mode: str = "default") -> str:
    """Return the formatted prompt for *mode*, falling back to the default."""
    template = _PROMPT_MAP.get(mode, helpdesk_prompt)
    return template.format(context=context, query=query)


def generate_answer(query: str, context: str, mode: str = "default") -> str:
    """Build the prompt for (query, context) and call the LLM."""
    return call_llm(build_prompt(query, context, mode))
src/ingestion/document_loader.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Loads plain-text documents from a directory tree."""
import os
from pathlib import Path

# Only plain-text formats are ingested.
SUPPORTED_EXTENSIONS = {".txt", ".md"}


def load_documents_from_dir(directory: str) -> list[dict]:
    """Recursively read every supported file under *directory*.

    Args:
        directory: root directory to scan.

    Returns:
        A list of {"content": <file text>, "source": <file path>} dicts,
        one per supported file.
    """
    docs = []

    for file_path in Path(directory).rglob("*"):
        # is_file() guards against directories whose names happen to end in
        # a supported suffix (rglob yields those too).
        if file_path.is_file() and file_path.suffix.lower() in SUPPORTED_EXTENSIONS:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
            docs.append({
                "content": content,
                "source": str(file_path)
            })

    return docs
src/ingestion/embedder.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""GPU ingestion pipeline: load raw docs, chunk, embed and persist a FAISS index."""
import os
import shutil
import torch
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

RAW_DATA_DIR = "data/raw"
VECTOR_STORE_PATH = "data/vector_store_faiss"
EMBEDDING_MODEL_NAME = "BAAI/bge-m3"


def check_environment():
    """Return True when a CUDA GPU is available, printing what was found."""
    if not torch.cuda.is_available():
        print("\nCUDA not available")
        return False
    gpu_name = torch.cuda.get_device_name(0)
    print(f"CUDA verified, gpu detected: {gpu_name}")
    return True


def run_ingestion():
    """Load raw .txt docs, chunk them, embed on GPU and save the FAISS store."""
    print(f"\nstarting ingestion with GPU and model: {EMBEDDING_MODEL_NAME}")

    loader = DirectoryLoader(RAW_DATA_DIR, glob="**/*.txt", show_progress=True, use_multithreading=True)
    docs = loader.load()
    if not docs:
        print(f"no docs found in '{RAW_DATA_DIR}'")
        return
    print(f"loading {len(docs)} docs.")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100
    )
    chunks = text_splitter.split_documents(docs)
    print(f"docs divided in {len(chunks)} chunks.")

    print("loading model embedding for GPU")
    embeddings_model = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={'device': 'cuda'},
        encode_kwargs={'normalize_embeddings': True}
    )
    print("loaded model")

    print("creating vector DB faiss")
    # Rebuild from scratch: remove any stale index first.
    if os.path.exists(VECTOR_STORE_PATH):
        print(f"removing old vector in '{VECTOR_STORE_PATH}'...")
        shutil.rmtree(VECTOR_STORE_PATH)

    vector_store = FAISS.from_documents(chunks, embeddings_model)
    vector_store.save_local(VECTOR_STORE_PATH)

    print("-" * 50)
    print("pipeline ingestion conclude")
    print("-" * 50)


if __name__ == "__main__":
    # BUG FIX: the CUDA check previously ran (and called exit()) at module
    # import time, killing any process that merely imported this module.
    if not check_environment():
        raise SystemExit(1)
    run_ingestion()
src/ingestion/preprocessor.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Text normalisation applied to documents before chunking."""
import re

# Compiled once: runs of blank lines and runs of inline whitespace.
_NEWLINES = re.compile(r"\n+")
_INLINE_WS = re.compile(r"[ \t]+")


def clean_text(text: str) -> str:
    """Collapse repeated newlines and inline whitespace, then strip ends."""
    collapsed = _NEWLINES.sub("\n", text)
    collapsed = _INLINE_WS.sub(" ", collapsed)
    return collapsed.strip()


def preprocess_documents(documents: list[dict]) -> list[dict]:
    """Return copies of *documents* with their "content" fields cleaned."""
    cleaned = []
    for doc in documents:
        cleaned.append({**doc, "content": clean_text(doc["content"])})
    return cleaned
src/ingestion/text_splitter.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List, Dict
3
+
4
+ def split_text(
5
+ documents: List[Dict],
6
+ chunk_size: int = 500,
7
+ chunk_overlap: int = 50
8
+ ) -> List[Dict]:
9
+ chunks = []
10
+
11
+ for doc in documents:
12
+ text = doc["content"]
13
+ source = doc.get("source", "unknown")
14
+
15
+ entries = re.split(r"(?=Pergunta:)", text)
16
+
17
+ for entry_id, entry in enumerate(entries):
18
+ entry = entry.strip()
19
+ if not entry:
20
+ continue
21
+
22
+ if len(entry) <= chunk_size:
23
+ chunks.append({
24
+ "content": entry,
25
+ "source": source,
26
+ "entry_id": entry_id,
27
+ "chunk_id": 0
28
+ })
29
+ else:
30
+ start = 0
31
+ end = chunk_size
32
+ chunk_id = 0
33
+
34
+ while start < len(entry):
35
+ snippet = entry[start:end]
36
+ chunks.append({
37
+ "content": snippet,
38
+ "source": source,
39
+ "entry_id": entry_id,
40
+ "chunk_id": chunk_id
41
+ })
42
+ chunk_id += 1
43
+ start = end - chunk_overlap
44
+ end = start + chunk_size
45
+
46
+ return chunks
src/retrieval/__pycache__/reranker.cpython-313.pyc ADDED
Binary file (3.99 kB). View file
 
src/retrieval/__pycache__/retriever.cpython-313.pyc ADDED
Binary file (2.3 kB). View file
 
src/retrieval/__pycache__/vector_store.cpython-313.pyc ADDED
Binary file (1.85 kB). View file
 
src/retrieval/query_processor.py ADDED
File without changes
src/retrieval/reranker.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ import numpy as np
3
+ from sklearn.feature_extraction.text import TfidfVectorizer
4
+ from sentence_transformers import CrossEncoder
5
+ from langchain_community.vectorstores import FAISS
6
+ from langchain_core.documents import Document
7
+
8
class HybridReranker:
    """Hybrid retrieval with cross-encoder reranking.

    Dense candidates come from a FAISS vector store; sparse candidates
    from a TF-IDF matrix built over the same chunks. The deduplicated
    union is scored by a cross-encoder and the best results returned.
    """

    def __init__(
        self,
        vector_store: FAISS,
        reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
    ):
        """Build the TF-IDF matrix over the store's chunks and load the
        cross-encoder named by *reranker_model*."""
        self.vector_store = vector_store
        self.reranker = CrossEncoder(reranker_model)

        # NOTE(review): relies on the docstore's private `_dict`; assumes
        # its value order matches the TF-IDF row order built below.
        docs_in_order = list(self.vector_store.docstore._dict.values())
        self.chunk_texts = [doc.page_content for doc in docs_in_order]
        self.chunk_metadata = [doc.metadata for doc in docs_in_order]

        # Fixed: original message mixed languages/typo ("matriz tf-idf").
        print(f"rerank model '{reranker_model}' loaded; building TF-IDF matrix")
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.chunk_texts)
        print("reranker ready")

    def retrieve_and_rerank(
        self,
        query: str,
        top_k_dense: int = 20,
        top_k_final: int = 5,
    ) -> List[Document]:
        """Retrieve dense + sparse candidates for *query*, rerank the
        union with the cross-encoder, and return the top_k_final docs."""
        dense_docs = self.vector_store.similarity_search(query, k=top_k_dense)

        # Sparse leg: TF-IDF cosine-style scores over all chunks.
        q_vec = self.vectorizer.transform([query])
        sparse_scores = (self.tfidf_matrix @ q_vec.T).toarray().ravel()
        sparse_indices = np.argsort(-sparse_scores)[:top_k_dense]

        sparse_docs = [
            Document(page_content=self.chunk_texts[i], metadata=self.chunk_metadata[i])
            for i in sparse_indices
        ]

        # Union of both candidate sets, deduplicated by exact content.
        combined_docs = []
        seen_contents = set()
        for doc in dense_docs + sparse_docs:
            if doc.page_content not in seen_contents:
                combined_docs.append(doc)
                seen_contents.add(doc.page_content)

        # Guard: an empty store yields no candidates; calling the
        # cross-encoder on an empty batch is pointless and may error.
        if not combined_docs:
            return []

        pairs = [[query, doc.page_content] for doc in combined_docs]
        rerank_scores = self.reranker.predict(pairs)

        ranked = sorted(
            zip(combined_docs, rerank_scores),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return [doc for doc, _score in ranked[:top_k_final]]
src/retrieval/retriever.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from sentence_transformers import SentenceTransformer
3
+ from src.retrieval.vector_store import VectorStore
4
+
5
class Retriever:
    """Dense retriever: encodes queries with a SentenceTransformer and
    searches a VectorStore of precomputed chunk embeddings."""

    def __init__(self, embeddings_path: str, model_name: str = "BAAI/bge-small-en-v1.5"):
        """Load the encoder and index the .npy embeddings at *embeddings_path*."""
        self.model = SentenceTransformer(model_name)
        dim = self.model.get_sentence_embedding_dimension()
        self.index = VectorStore(dim=dim)

        embeds = np.load(embeddings_path).astype(np.float32)
        self.index.add(embeds)

    def retrieve(self, query: str, top_k: int = 5):
        """Return (indices, distances) arrays of the top_k nearest chunks.

        Bug fix: VectorStore.search already strips the batch dimension and
        returns 1-D arrays of length top_k, so the original's extra ``[0]``
        indexing collapsed each result to a single scalar and silently
        ignored top_k.
        """
        qv = self.model.encode(query, normalize_embeddings=True).astype(np.float32)
        distances, indices = self.index.search(qv, top_k)
        return indices.copy(), distances.copy()

    def set_chunk_texts(self, texts: list[str]):
        # Optional side-channel so retrieved indices can be mapped back
        # to their chunk content by callers.
        self._chunk_texts = texts

    def get_chunk_texts(self) -> list[str]:
        # Returns [] if set_chunk_texts was never called.
        return getattr(self, "_chunk_texts", [])
src/retrieval/vector_store.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import numpy as np
3
+
4
class VectorStore:
    """Thin wrapper around a flat inner-product FAISS index."""

    def __init__(self, dim: int):
        # Inner-product index; callers are expected to pass normalized
        # embeddings if cosine similarity is intended.
        self.dim = dim
        self.index = faiss.IndexFlatIP(dim)

    def add(self, embeddings: np.ndarray):
        """Append embedding rows to the index, coercing to float32 as
        FAISS requires."""
        self.index.add(np.asarray(embeddings, dtype=np.float32))

    def search(self, query_vec: np.ndarray, top_k: int = 5):
        """Search with a single query vector.

        Accepts a 1-D vector (promoted to a one-row batch) and returns
        the first row of FAISS's results: (distances, indices), each a
        1-D array of length top_k.
        """
        batch = np.asarray(query_vec, dtype=np.float32)
        if batch.ndim == 1:
            batch = batch.reshape(1, -1)
        distances, indices = self.index.search(batch, top_k)
        return distances[0], indices[0]
src/utils/env.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ load_dotenv()
4
def get_env_var(name: str, default: str = "") -> str:
    """Read environment variable *name*, falling back to *default* when
    it is unset."""
    value = os.environ.get(name)
    return default if value is None else value
src/utils/io.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
def read_txt_chunks(path: str) -> list[str]:
    """Read *path* as UTF-8 and return its non-empty, stripped chunks,
    where chunks are separated by blank lines ("\\n\\n")."""
    with open(path, encoding="utf-8") as handle:
        raw = handle.read()

    chunks = []
    for piece in raw.split("\n\n"):
        piece = piece.strip()
        if piece:
            chunks.append(piece)
    return chunks