Spaces:
Sleeping
Sleeping
Add files for RAG backend
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- Dockerfile +12 -0
- LICENSE +21 -0
- README.md +2 -12
- api/__init__.py +0 -0
- api/__pycache__/__init__.cpython-313.pyc +0 -0
- api/__pycache__/main.cpython-313.pyc +0 -0
- api/__pycache__/rag_chain.cpython-313.pyc +0 -0
- api/__pycache__/schemas.cpython-313.pyc +0 -0
- api/main.py +57 -0
- api/rag_chain.py +108 -0
- api/schemas.py +14 -0
- config/base.yaml +43 -0
- config/dev.yaml +0 -0
- config/prod.yaml +0 -0
- data/embeddings/batch_000.npy +3 -0
- data/raw/base_treinamento.txt +0 -0
- data/vector_store_faiss/index.faiss +3 -0
- data/vector_store_faiss/index.pkl +3 -0
- notebooks/demo_embedder.py +24 -0
- notebooks/demo_evaluator.py +3 -0
- notebooks/demo_generator.py +23 -0
- notebooks/demo_ingestion.py +21 -0
- notebooks/demo_reranker.py +35 -0
- notebooks/demo_retriever.py +14 -0
- notebooks/demo_vector_store.py +20 -0
- pytest.ini +6 -0
- requiriments.txt +21 -0
- src/__pycache__/__init__.cpython-313.pyc +0 -0
- src/__pycache__/download.cpython-313.pyc +0 -0
- src/__pycache__/gerar_chunks.cpython-313.pyc +0 -0
- src/evaluation/benchmarks.py +0 -0
- src/evaluation/evaluator.py +70 -0
- src/evaluation/metrics.py +38 -0
- src/generation/llm_client.py +37 -0
- src/generation/prompt_templates.py +44 -0
- src/generation/response_generator.py +23 -0
- src/ingestion/document_loader.py +20 -0
- src/ingestion/embedder.py +63 -0
- src/ingestion/preprocessor.py +17 -0
- src/ingestion/text_splitter.py +46 -0
- src/retrieval/__pycache__/reranker.cpython-313.pyc +0 -0
- src/retrieval/__pycache__/retriever.cpython-313.pyc +0 -0
- src/retrieval/__pycache__/vector_store.cpython-313.pyc +0 -0
- src/retrieval/query_processor.py +0 -0
- src/retrieval/reranker.py +57 -0
- src/retrieval/retriever.py +23 -0
- src/retrieval/vector_store.py +20 -0
- src/utils/env.py +5 -0
- src/utils/io.py +3 -0
.gitattributes
CHANGED
|
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
RAG_HelpDesk/data/vector_store_faiss/index.faiss filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
RAG_HelpDesk/data/vector_store_faiss/index.faiss filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
data/vector_store_faiss/index.faiss filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
COPY requirements.txt .
|
| 6 |
+
|
| 7 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
| 8 |
+
pip install --no-cache-dir -r requirements.txt
|
| 9 |
+
|
| 10 |
+
COPY . .
|
| 11 |
+
|
| 12 |
+
CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Vinicius Moreira
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
CHANGED
|
@@ -1,12 +1,2 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
emoji: 🚀
|
| 4 |
-
colorFrom: red
|
| 5 |
-
colorTo: indigo
|
| 6 |
-
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
-
license: mit
|
| 9 |
-
short_description: RAG with langchain and fastAPI for to answer technical IT q/
|
| 10 |
-
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
+
# RAG_HelpDesk
|
| 2 |
+
Esse projeto visa criar um RAG para ajudar usuários em relação a duvidas de Hardware e Software.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
api/__init__.py
ADDED
|
File without changes
|
api/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (163 Bytes). View file
|
|
|
api/__pycache__/main.cpython-313.pyc
ADDED
|
Binary file (2.88 kB). View file
|
|
|
api/__pycache__/rag_chain.cpython-313.pyc
ADDED
|
Binary file (5.39 kB). View file
|
|
|
api/__pycache__/schemas.cpython-313.pyc
ADDED
|
Binary file (1.34 kB). View file
|
|
|
api/main.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import traceback
|
| 4 |
+
|
| 5 |
+
project_root = Path(__file__).resolve().parents[1]
|
| 6 |
+
sys.path.append(str(project_root))
|
| 7 |
+
|
| 8 |
+
from fastapi import FastAPI, HTTPException
|
| 9 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 10 |
+
from api.schemas import QueryRequest, QueryResponse, SourceChunk
|
| 11 |
+
from api.rag_chain import get_rag_chain
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
app = FastAPI(
|
| 15 |
+
title="Helpdesk RAG API",
|
| 16 |
+
description="API for answering questions about an IT knowledge base",
|
| 17 |
+
version="1.0.0"
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
try:
|
| 21 |
+
rag_chain = get_rag_chain()
|
| 22 |
+
except Exception as e:
|
| 23 |
+
print("error to load pipeline RAG")
|
| 24 |
+
traceback.print_exc()
|
| 25 |
+
raise RuntimeError(f"error to load pipeline RAG: {e}")
|
| 26 |
+
|
| 27 |
+
app.add_middleware(
|
| 28 |
+
CORSMiddleware,
|
| 29 |
+
allow_origins=["*"],
|
| 30 |
+
allow_credentials=True,
|
| 31 |
+
allow_methods=["GET", "POST"],
|
| 32 |
+
allow_headers=["*"],
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
@app.get("/", tags=["Status"])
|
| 36 |
+
def read_root():
|
| 37 |
+
return {"status": "API ON"}
|
| 38 |
+
|
| 39 |
+
@app.post("/query", response_model=QueryResponse, tags=["RAG"])
|
| 40 |
+
async def handle_query(request: QueryRequest):
|
| 41 |
+
print(f"processing query: '{request.query}'")
|
| 42 |
+
try:
|
| 43 |
+
result = rag_chain.invoke(request.query)
|
| 44 |
+
source_chunks = [
|
| 45 |
+
SourceChunk(
|
| 46 |
+
page_content=doc.page_content,
|
| 47 |
+
source=doc.metadata.get('source', 'desconhecida')
|
| 48 |
+
) for doc in result['source_chunks']
|
| 49 |
+
]
|
| 50 |
+
return QueryResponse(
|
| 51 |
+
answer=result['answer'],
|
| 52 |
+
source_chunks=source_chunks
|
| 53 |
+
)
|
| 54 |
+
except Exception as e:
|
| 55 |
+
print(f"error to process query")
|
| 56 |
+
traceback.print_exc()
|
| 57 |
+
raise HTTPException(status_code=500, detail=f"internal error. check the server console")
|
api/rag_chain.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from operator import itemgetter
|
| 5 |
+
from typing import List, Dict
|
| 6 |
+
|
| 7 |
+
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
|
| 8 |
+
from langchain_core.prompts import PromptTemplate
|
| 9 |
+
from langchain_community.vectorstores import FAISS
|
| 10 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 11 |
+
from huggingface_hub import InferenceClient
|
| 12 |
+
|
| 13 |
+
load_dotenv()
|
| 14 |
+
|
| 15 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 16 |
+
VECTOR_STORE_PATH = str(PROJECT_ROOT / "data" / "vector_store_faiss")
|
| 17 |
+
EMBEDDING_MODEL_NAME = "BAAI/bge-m3"
|
| 18 |
+
LLM_REPO_ID = os.getenv("HUGGINGFACE_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")
|
| 19 |
+
|
| 20 |
+
HF_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
|
| 21 |
+
|
| 22 |
+
if not HF_TOKEN:
|
| 23 |
+
raise ValueError("HUGGINGFACE_API_TOKEN not found")
|
| 24 |
+
|
| 25 |
+
client = InferenceClient(model=LLM_REPO_ID, token=HF_TOKEN)
|
| 26 |
+
|
| 27 |
+
prompt_template = PromptTemplate.from_template("""
|
| 28 |
+
<|system|>
|
| 29 |
+
Você é um assistente de helpdesk de TI especialista... (seu prompt aqui)
|
| 30 |
+
</s><|user|>
|
| 31 |
+
Contexto: {context}\n\nPergunta: {query}
|
| 32 |
+
</s><|assistant|>
|
| 33 |
+
Resposta em Português:
|
| 34 |
+
""")
|
| 35 |
+
|
| 36 |
+
def format_docs(docs: List[Dict]) -> str:
    """Join the page_content of each retrieved document with blank-line separators."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)
|
| 38 |
+
|
| 39 |
+
def generate_answer_from_context(input_dict: Dict) -> str:
    """Render the RAG prompt and query the Hugging Face chat model for an answer.

    Args:
        input_dict: Mapping with "context" (list of retrieved documents exposing
            ``page_content``) and "query" (the user question string).

    Returns:
        The model's answer as plain text, or an error message string if the
        API call fails (best-effort behavior preserved from the original).
    """
    context_docs = input_dict["context"]
    query_text = input_dict["query"]
    formatted_context = format_docs(context_docs)

    prompt_value = prompt_template.invoke({
        "context": formatted_context,
        "query": query_text
    })
    # Fix: str(prompt_value) returns the repr form "text='...'", which the code
    # below then had to strip heuristically. to_string() yields the raw prompt.
    final_prompt_text = prompt_value.to_string()

    try:
        response = client.chat_completion(
            messages=[{"role": "user", "content": final_prompt_text}],
            max_tokens=300,
            temperature=0.1
        )
        raw_answer = response.choices[0].message.content
        clean_answer = raw_answer.strip()

        # Defensive cleanup kept for models that echo a repr-style wrapper
        # or a leading "Resposta:" label in their output.
        if clean_answer.startswith('text="') or clean_answer.startswith("text='"):
            clean_answer = clean_answer[6:]
        if clean_answer.endswith('"') or clean_answer.endswith("'"):
            clean_answer = clean_answer[:-1]
        if clean_answer.startswith("Resposta:"):
            clean_answer = clean_answer.split("Resposta:", 1)[1]

        return clean_answer.strip()

    except Exception as e:
        print(f"error for call API huggingface: {e}")
        return f"error for contact llm: {e}"
|
| 77 |
+
|
| 78 |
+
def get_rag_chain():
|
| 79 |
+
print("loading pipeline")
|
| 80 |
+
|
| 81 |
+
embeddings_model = HuggingFaceEmbeddings(
|
| 82 |
+
model_name=EMBEDDING_MODEL_NAME,
|
| 83 |
+
model_kwargs={'device': 'cpu'},
|
| 84 |
+
encode_kwargs={'normalize_embeddings': True}
|
| 85 |
+
)
|
| 86 |
+
vector_store = FAISS.load_local(
|
| 87 |
+
VECTOR_STORE_PATH, embeddings_model, allow_dangerous_deserialization=True
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
try:
|
| 91 |
+
from src.retrieval.reranker import HybridReranker
|
| 92 |
+
hybrid_reranker = HybridReranker(vector_store=vector_store)
|
| 93 |
+
retrieval_chain = lambda query: hybrid_reranker.retrieve_and_rerank(query)
|
| 94 |
+
print("using pipeline with reranker")
|
| 95 |
+
except ImportError:
|
| 96 |
+
retrieval_chain = vector_store.as_retriever(search_kwargs={"k": 5})
|
| 97 |
+
print("reranker not found, using simple retriever")
|
| 98 |
+
|
| 99 |
+
rag_chain = {
|
| 100 |
+
"context": retrieval_chain,
|
| 101 |
+
"query": RunnablePassthrough()
|
| 102 |
+
} | RunnableParallel({
|
| 103 |
+
"source_chunks": itemgetter("context"),
|
| 104 |
+
"answer": generate_answer_from_context
|
| 105 |
+
})
|
| 106 |
+
|
| 107 |
+
print("pipeline ready.")
|
| 108 |
+
return rag_chain
|
api/schemas.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
from typing import List, Optional
|
| 3 |
+
|
| 4 |
+
class QueryRequest(BaseModel):
|
| 5 |
+
query: str = Field(..., description="query for RAG")
|
| 6 |
+
top_k: int = Field(3, description="number of relevants docs to be retrivied", ge=1, le=10)
|
| 7 |
+
|
| 8 |
+
class SourceChunk(BaseModel):
|
| 9 |
+
page_content: str
|
| 10 |
+
source: str = Field(description="file path")
|
| 11 |
+
|
| 12 |
+
class QueryResponse(BaseModel):
|
| 13 |
+
answer: str
|
| 14 |
+
source_chunks: List[SourceChunk]
|
config/base.yaml
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
raw_path: "data/raw"
|
| 3 |
+
processed_path: "data/processed"
|
| 4 |
+
embeddings_path: "data/embeddings"
|
| 5 |
+
|
| 6 |
+
ingestion:
|
| 7 |
+
chunk_size: 500
|
| 8 |
+
chunk_overlap: 50
|
| 9 |
+
languages: ["pt", "en"]
|
| 10 |
+
|
| 11 |
+
embedder:
|
| 12 |
+
model_name: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
|
| 13 |
+
batch_size: 64
|
| 14 |
+
normalize: true
|
| 15 |
+
|
| 16 |
+
vector_store:
|
| 17 |
+
type: "faiss"
|
| 18 |
+
index_factory: "Flat"
|
| 19 |
+
metric: "L2"
|
| 20 |
+
save_index_path: "data/embeddings/faiss_index.idx"
|
| 21 |
+
|
| 22 |
+
llm:
|
| 23 |
+
provider: "openai"
|
| 24 |
+
model_name: "gpt-3.5-turbo"
|
| 25 |
+
max_tokens: 512
|
| 26 |
+
temperature: 0.7
|
| 27 |
+
top_p: 0.9
|
| 28 |
+
api_key_env: "OPENAI_API_KEY"
|
| 29 |
+
|
| 30 |
+
api:
|
| 31 |
+
host: "0.0.0.0"
|
| 32 |
+
port: 8000
|
| 33 |
+
docs_url: "/docs"
|
| 34 |
+
|
| 35 |
+
logging:
|
| 36 |
+
level: "INFO"
|
| 37 |
+
format: "[%(asctime)s] %(levelname)s %(name)s: %(message)s"
|
| 38 |
+
|
| 39 |
+
evaluation:
|
| 40 |
+
top_k: 5
|
| 41 |
+
benchmark_path: "tests/benchmark.json"
|
| 42 |
+
save_reports: true
|
| 43 |
+
reports_path: "reports/"
|
config/dev.yaml
ADDED
|
File without changes
|
config/prod.yaml
ADDED
|
File without changes
|
data/embeddings/batch_000.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b0234624565756c111fcd23e4090f7f08255567e57fad9a01cde641f862f4c93
|
| 3 |
+
size 76800128
|
data/raw/base_treinamento.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/vector_store_faiss/index.faiss
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:da3c437c49b2aab9bcff75a1553f3ea1cd17212b70c66cdc3fcebbab1bbf1a7c
|
| 3 |
+
size 43315245
|
data/vector_store_faiss/index.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ac470dbf2aa92c9fe996c3a0df6ca4e4435afb806b06f6e8e877def38323393f
|
| 3 |
+
size 10958179
|
notebooks/demo_embedder.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.ingestion.document_loader import load_documents_from_dir
|
| 2 |
+
from src.ingestion.preprocessor import preprocess_documents
|
| 3 |
+
from src.ingestion.text_splitter import split_text
|
| 4 |
+
from src.ingestion.embedder import load_embedder, generate_embeddings, save_embeddings, get_chunk_texts
|
| 5 |
+
|
| 6 |
+
CHUNK_SIZE = 300
|
| 7 |
+
CHUNK_OVERLAP = 50
|
| 8 |
+
RAW_PATH = "data/raw"
|
| 9 |
+
EMBEDDING_PATH = "data/embeddings/batch_000.npy"
|
| 10 |
+
|
| 11 |
+
docs = load_documents_from_dir(RAW_PATH)
|
| 12 |
+
clean_docs = preprocess_documents(docs)
|
| 13 |
+
chunks = split_text(clean_docs, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
|
| 14 |
+
|
| 15 |
+
texts = get_chunk_texts(chunks)
|
| 16 |
+
|
| 17 |
+
model = load_embedder()
|
| 18 |
+
embeddings = generate_embeddings(texts, model)
|
| 19 |
+
|
| 20 |
+
print(f"embeddings shape: {embeddings.shape}")
|
| 21 |
+
print(f"vector example (1º):\n{embeddings[0][:10]}...")
|
| 22 |
+
|
| 23 |
+
save_embeddings(embeddings, EMBEDDING_PATH)
|
| 24 |
+
print(f"save embeddings in: {EMBEDDING_PATH}")
|
notebooks/demo_evaluator.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.evaluation.evaluator import run_evaluation
|
| 2 |
+
|
| 3 |
+
run_evaluation()
|
notebooks/demo_generator.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from src.retrieval.retriever import Retriever
|
| 3 |
+
from src.generation.response_generator import generate_answer
|
| 4 |
+
|
| 5 |
+
retriever = Retriever("data/embeddings/batch_000.npy")
|
| 6 |
+
query = "Meu notebook está muito lento, o que posso fazer? E se não resolver?"
|
| 7 |
+
|
| 8 |
+
idxs, scores = retriever.retrieve(query, top_k=1)
|
| 9 |
+
|
| 10 |
+
idxs = np.atleast_1d(idxs).flatten()
|
| 11 |
+
scores = np.atleast_1d(scores).flatten()
|
| 12 |
+
|
| 13 |
+
with open("data/raw/base_treinamento.txt", encoding="utf-8") as f:
|
| 14 |
+
chunks = [b.strip() for b in f.read().split("\n\n") if b.strip()]
|
| 15 |
+
|
| 16 |
+
context = chunks[int(idxs[0])]
|
| 17 |
+
|
| 18 |
+
answer = generate_answer(query, context)
|
| 19 |
+
|
| 20 |
+
print(f"\nquery: {query}")
|
| 21 |
+
print(f"context selected (chunk #{idxs[0]}):\n{context}\n")
|
| 22 |
+
print("generated response:\n")
|
| 23 |
+
print(answer)
|
notebooks/demo_ingestion.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.ingestion.document_loader import load_documents_from_dir
|
| 2 |
+
from src.ingestion.preprocessor import preprocess_documents
|
| 3 |
+
from src.ingestion.text_splitter import split_text
|
| 4 |
+
|
| 5 |
+
RAW_PATH = "data/raw"
|
| 6 |
+
CHUNK_SIZE = 300
|
| 7 |
+
CHUNK_OVERLAP = 50
|
| 8 |
+
|
| 9 |
+
docs = load_documents_from_dir(RAW_PATH)
|
| 10 |
+
print(f"docs loaded: {len(docs)}")
|
| 11 |
+
|
| 12 |
+
cleaned_docs = preprocess_documents(docs)
|
| 13 |
+
print(f"pre-process completed")
|
| 14 |
+
|
| 15 |
+
chunks = split_text(cleaned_docs, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
|
| 16 |
+
print(f"total chunks generated: {len(chunks)}")
|
| 17 |
+
|
| 18 |
+
for i, chunk in enumerate(chunks[:3]):
|
| 19 |
+
print(f"\n--- Chunk {i} ---")
|
| 20 |
+
print(f"font: {chunk['source']}")
|
| 21 |
+
print(chunk['content'][:300])
|
notebooks/demo_reranker.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from src.retrieval.retriever import Retriever
|
| 3 |
+
from src.retrieval.reranker import HybridReranker
|
| 4 |
+
|
| 5 |
+
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
|
| 6 |
+
|
| 7 |
+
def main():
|
| 8 |
+
# Carrega os textos (chunks)
|
| 9 |
+
with open("data/raw/base_treinamento.txt", encoding="utf-8") as f:
|
| 10 |
+
chunks = [b.strip() for b in f.read().split("\n\n") if b.strip()]
|
| 11 |
+
|
| 12 |
+
# Cria o retriever com os embeddings pré-calculados
|
| 13 |
+
retriever = Retriever("data/embeddings/batch_000.npy")
|
| 14 |
+
|
| 15 |
+
# Cria o reranker, passando o retriever e os chunks para reranking
|
| 16 |
+
hybrid = HybridReranker(
|
| 17 |
+
retriever=retriever,
|
| 18 |
+
chunk_texts=chunks,
|
| 19 |
+
reranker_model=RERANKER_MODEL,
|
| 20 |
+
sparse_alpha=0.5,
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
query = "Meu notebook está muito lento, o que posso fazer? E se não resolver?"
|
| 24 |
+
|
| 25 |
+
# Recupera e reranqueia os top documentos
|
| 26 |
+
idxs, scores = hybrid.retrieve_and_rerank(query, top_k_dense=10, top_k_final=3)
|
| 27 |
+
|
| 28 |
+
print(f"\nQuery: {query}\n")
|
| 29 |
+
for i, idx in enumerate(idxs):
|
| 30 |
+
print(f"Rank {i+1} - Chunk #{idx} (score: {scores[i]:.4f}):")
|
| 31 |
+
print(chunks[idx])
|
| 32 |
+
print("-" * 40)
|
| 33 |
+
|
| 34 |
+
if __name__ == "__main__":
|
| 35 |
+
main()
|
notebooks/demo_retriever.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from src.retrieval.retriever import Retriever
|
| 3 |
+
|
| 4 |
+
retriever = Retriever("data/embeddings/batch_000.npy")
|
| 5 |
+
|
| 6 |
+
query = "O que fazer se o notebook não liga?"
|
| 7 |
+
idxs, scores = retriever.retrieve(query, top_k=5)
|
| 8 |
+
|
| 9 |
+
idxs = np.atleast_1d(idxs).flatten()
|
| 10 |
+
scores = np.atleast_1d(scores).flatten()
|
| 11 |
+
|
| 12 |
+
print(f"\ntop results for: {query}\n")
|
| 13 |
+
for i, (idx, score) in enumerate(zip(idxs, scores), 1):
|
| 14 |
+
print(f"{i}. idx: {int(idx):4d} — similarity: {score:.4f}")
|
notebooks/demo_vector_store.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from src.retrieval.vector_store import VectorStore
|
| 3 |
+
|
| 4 |
+
embeds = np.load("data/embeddings/batch_000.npy")
|
| 5 |
+
dim = embeds.shape[1]
|
| 6 |
+
print(f"embeddings loaded: {embeds.shape}")
|
| 7 |
+
|
| 8 |
+
vs = VectorStore(dim=dim)
|
| 9 |
+
vs.add(embeds)
|
| 10 |
+
print("vectors added in index")
|
| 11 |
+
|
| 12 |
+
query = embeds[0]
|
| 13 |
+
dists, idxs = vs.search(query, top_k=5)
|
| 14 |
+
|
| 15 |
+
print("\nresult search (dummy):")
|
| 16 |
+
for i, (idx, dist) in enumerate(zip(idxs, dists), 1):
|
| 17 |
+
print(f"{i}. idx: {idx:4d} — distância: {dist:.4f}")
|
| 18 |
+
|
| 19 |
+
print("Norma do primeiro vetor:", np.linalg.norm(embeds[0]))
|
| 20 |
+
print("Norma do segundo vetor:", np.linalg.norm(embeds[1]))
|
pytest.ini
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[pytest]
|
| 2 |
+
minversion = 6.0
|
| 3 |
+
addopts = -ra -q
|
| 4 |
+
testpaths =
|
| 5 |
+
tests
|
| 6 |
+
python_paths = src
|
requiriments.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
sentence-transformers>=2.2.2
|
| 2 |
+
faiss-cpu>=1.7.3
|
| 3 |
+
openai>=0.27.0
|
| 4 |
+
python-docx>=0.8.11
|
| 5 |
+
PyPDF2>=3.0.0
|
| 6 |
+
beautifulsoup4>=4.12.2
|
| 7 |
+
PyYAML>=6.0
|
| 8 |
+
uvicorn[standard]>=0.22.0
|
| 9 |
+
pytest>=7.0.1
|
| 10 |
+
python-dotenv>=1.0.0
|
| 11 |
+
nltk>=3.8.1
|
| 12 |
+
pytest
|
| 13 |
+
unstructured
|
| 14 |
+
torch
|
| 15 |
+
fastapi
|
| 16 |
+
uvicorn[standard]
|
| 17 |
+
langchain
|
| 18 |
+
langchain-huggingface
|
| 19 |
+
pydantic
|
| 20 |
+
langchain_community
|
| 21 |
+
streamlit
|
src/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (163 Bytes). View file
|
|
|
src/__pycache__/download.cpython-313.pyc
ADDED
|
Binary file (384 Bytes). View file
|
|
|
src/__pycache__/gerar_chunks.cpython-313.pyc
ADDED
|
Binary file (2.59 kB). View file
|
|
|
src/evaluation/benchmarks.py
ADDED
|
File without changes
|
src/evaluation/evaluator.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import numpy as np
|
| 3 |
+
from src.retrieval.retriever import Retriever
|
| 4 |
+
from src.retrieval.reranker import HybridReranker
|
| 5 |
+
from src.evaluation.metrics import (
|
| 6 |
+
precision_at_k as retrieval_precision_at_k,
|
| 7 |
+
recall_at_k,
|
| 8 |
+
mean_reciprocal_rank,
|
| 9 |
+
bleu_score
|
| 10 |
+
)
|
| 11 |
+
from src.ingestion.document_loader import load_documents_from_dir
|
| 12 |
+
from src.ingestion.preprocessor import preprocess_documents
|
| 13 |
+
from src.ingestion.text_splitter import split_text
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def run_evaluation(
|
| 17 |
+
benchmark_path: str = "tests/benchmark.json",
|
| 18 |
+
k: int = 3,
|
| 19 |
+
top_k_dense: int = 10,
|
| 20 |
+
top_k_final: int = 3,
|
| 21 |
+
sparse_alpha: float = 0.5
|
| 22 |
+
):
|
| 23 |
+
with open(benchmark_path, encoding="utf-8") as f:
|
| 24 |
+
benchmarks = json.load(f)
|
| 25 |
+
|
| 26 |
+
docs = load_documents_from_dir("data/raw")
|
| 27 |
+
clean_docs = preprocess_documents(docs)
|
| 28 |
+
chunks = split_text(clean_docs, chunk_size=300, chunk_overlap=50)
|
| 29 |
+
texts = [chunk['content'] for chunk in chunks]
|
| 30 |
+
|
| 31 |
+
retriever = Retriever("data/embeddings/batch_000.npy")
|
| 32 |
+
reranker = HybridReranker(
|
| 33 |
+
retriever=retriever,
|
| 34 |
+
chunk_texts=texts,
|
| 35 |
+
reranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
|
| 36 |
+
sparse_alpha=sparse_alpha
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
all_retrieved = []
|
| 40 |
+
all_relevant = []
|
| 41 |
+
|
| 42 |
+
print(f"starting assessment reranker: Precision@{k}, Recall@{k}, MRR")
|
| 43 |
+
print(f"dense top_k: {top_k_dense}, final top_k: {top_k_final}, sparse_alpha: {sparse_alpha}\n")
|
| 44 |
+
|
| 45 |
+
for i, entry in enumerate(benchmarks, 1):
|
| 46 |
+
query = entry['query']
|
| 47 |
+
relevant_idxs = entry.get('relevant_idxs', [])
|
| 48 |
+
|
| 49 |
+
idxs, scores = reranker.retrieve_and_rerank(
|
| 50 |
+
query,
|
| 51 |
+
top_k_dense=top_k_dense,
|
| 52 |
+
top_k_final=top_k_final
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
p = retrieval_precision_at_k(retrieved_idxs=idxs, relevant_idxs=relevant_idxs, k=k)
|
| 56 |
+
r = recall_at_k(retrieved_idxs=idxs, relevant_idxs=relevant_idxs, k=k)
|
| 57 |
+
|
| 58 |
+
all_retrieved.append(idxs)
|
| 59 |
+
all_relevant.append(relevant_idxs)
|
| 60 |
+
|
| 61 |
+
print(f"{i}. Query: {query}")
|
| 62 |
+
print(f" Precision@{k}: {p:.2f}, Recall@{k}: {r:.2f}")
|
| 63 |
+
print(f" Retrieved idxs: {idxs}")
|
| 64 |
+
print(f" Rerank scores: {[f'{s:.4f}' for s in scores]}\n")
|
| 65 |
+
|
| 66 |
+
mrr = mean_reciprocal_rank(retrieved_lists=all_retrieved, relevant_idxs_list=all_relevant)
|
| 67 |
+
print(f"mean reciprocal rank (MRR): {mrr:.2f}\n")
|
| 68 |
+
|
| 69 |
+
if __name__ == "__main__":
|
| 70 |
+
run_evaluation()
|
src/evaluation/metrics.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from typing import List
|
| 3 |
+
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def precision_at_k(retrieved_idxs: List[int], relevant_idxs: List[int], k: int) -> float:
    """Fraction of the top-k retrieved indices that are relevant.

    Args:
        retrieved_idxs: Ranked list of retrieved chunk indices (best first).
        relevant_idxs: Ground-truth relevant chunk indices.
        k: Cutoff rank; a non-positive k yields 0.0.

    Returns:
        Precision@k in [0.0, 1.0]; the denominator is k even when fewer
        than k results were retrieved (standard definition, preserved).
    """
    if k <= 0:
        return 0.0
    relevant = set(relevant_idxs)  # O(1) membership instead of O(n) list scans
    hits = sum(1 for idx in retrieved_idxs[:k] if idx in relevant)
    return hits / k
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def recall_at_k(retrieved_idxs: List[int], relevant_idxs: List[int], k: int) -> float:
    """Fraction of the relevant indices found within the top-k retrieved.

    Args:
        retrieved_idxs: Ranked list of retrieved chunk indices (best first).
        relevant_idxs: Ground-truth relevant chunk indices; empty list yields 0.0.
        k: Cutoff rank.

    Returns:
        Recall@k in [0.0, 1.0].
    """
    if not relevant_idxs:
        return 0.0
    relevant = set(relevant_idxs)  # O(1) membership instead of O(n) list scans
    hits = sum(1 for idx in retrieved_idxs[:k] if idx in relevant)
    return hits / len(relevant_idxs)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def mean_reciprocal_rank(retrieved_lists: List[List[int]], relevant_idxs_list: List[List[int]]) -> float:
    """Average reciprocal rank of the first relevant hit across queries.

    For each (retrieved, relevant) pair, the score is 1/rank of the first
    retrieved index that is relevant, or 0.0 when none is. Returns 0.0 for
    empty input.
    """
    reciprocal_ranks = []
    for ranked, relevant in zip(retrieved_lists, relevant_idxs_list):
        first_hit = next(
            (pos for pos, idx in enumerate(ranked, start=1) if idx in relevant),
            None,
        )
        reciprocal_ranks.append(1.0 / first_hit if first_hit is not None else 0.0)
    return float(np.mean(reciprocal_ranks)) if reciprocal_ranks else 0.0
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def bleu_score(reference: str, candidate: str) -> float:
    """Smoothed 4-gram BLEU between two whitespace-tokenized strings.

    Uses NLTK's method4 smoothing with uniform (0.25, 0.25, 0.25, 0.25)
    n-gram weights, so short candidates still get a non-zero score.
    """
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=SmoothingFunction().method4,
    )
|
src/generation/llm_client.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
env_path = Path(__file__).resolve().parents[2] / '.env'
|
| 7 |
+
load_dotenv(dotenv_path=env_path)
|
| 8 |
+
|
| 9 |
+
HF_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
|
| 10 |
+
REPO_ID = os.getenv("HUGGINGFACE_MODEL", "HuggingFaceH4/zephyr-7b-beta")
|
| 11 |
+
API_URL = f"https://api-inference.huggingface.co/models/{REPO_ID}"
|
| 12 |
+
|
| 13 |
+
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def call_llm(prompt: str, max_length: int = 200) -> str:
    """Send *prompt* to the Hugging Face Inference API and return the generated text.

    Args:
        prompt: Full prompt string sent as the model input.
        max_length: Maximum number of new tokens to generate.

    Returns:
        The generated text, or an error message string if the request fails
        (best-effort behavior preserved from the original).
    """
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": max_length, "temperature": 0.2}
    }
    try:
        print(f"[llm_client] POST {API_URL}")
        # Security fix: never log HEADERS — it contains the bearer API token.
        print(f"[llm_client] auth header present: {bool(HF_TOKEN)}")
        response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=60)
        print(f"[llm_client] Status code: {response.status_code}")
        print(f"[llm_client] Response text: {response.text}")
        response.raise_for_status()
        data = response.json()
        # The HF text-generation endpoint may return either a list of dicts
        # or a single dict; accept both shapes.
        if isinstance(data, list) and data and isinstance(data[0], dict) and "generated_text" in data[0]:
            return data[0]["generated_text"].strip()
        if isinstance(data, dict) and "generated_text" in data:
            return data["generated_text"].strip()
        return str(data)
    except Exception as e:
        print(f"[llm_client] error HTTP HF: {e}")
        return f"error in generate response: {e}"
|
src/generation/prompt_templates.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/generation/prompt_templates.py
"""Prompt templates (Portuguese) for the helpdesk RAG response generator.

Every template exposes two ``str.format`` fields: ``{context}`` (retrieved
chunks) and ``{query}`` (the user question).
"""

# Default template: answer the question based on the technical context.
helpdesk_prompt = """
Contexto técnico:
{context}

Com base nisso, responda à seguinte pergunta:
{query}
"""

# Short, to-the-point answer variant.
concise_helpdesk_prompt = """
Contexto técnico:
{context}

Responda de forma breve e objetiva:
{query}
"""

# Casual/informal tone variant.
informal_helpdesk_prompt = """
Oi! Aqui está o que você precisa saber com base no contexto:
{context}

Pergunta:
{query}

Resposta descontraída:
"""

# Asks the model for a step-by-step resolution.
step_by_step_prompt = """
Contexto técnico:
{context}

Por favor, explique passo a passo como resolver:
{query}
"""

# Asks the model to append useful links when available.
link_suggestion_prompt = """
Contexto técnico:
{context}

Responda a pergunta: {query}

Se possível, inclua links úteis para mais informações.
"""
|
src/generation/response_generator.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.generation.llm_client import call_llm
|
| 2 |
+
from src.generation.prompt_templates import (
|
| 3 |
+
helpdesk_prompt,
|
| 4 |
+
concise_helpdesk_prompt,
|
| 5 |
+
informal_helpdesk_prompt,
|
| 6 |
+
step_by_step_prompt,
|
| 7 |
+
link_suggestion_prompt
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
def build_prompt(query: str, context: str, mode: str = "default") -> str:
    """Select the prompt template for *mode* and fill in context and query.

    Unknown modes silently fall back to the default helpdesk template.
    """
    templates = {
        "default": helpdesk_prompt,
        "concise": concise_helpdesk_prompt,
        "informal": informal_helpdesk_prompt,
        "step_by_step": step_by_step_prompt,
        "with_links": link_suggestion_prompt,
    }
    chosen = templates[mode] if mode in templates else helpdesk_prompt
    return chosen.format(context=context, query=query)
|
| 20 |
+
|
| 21 |
+
def generate_answer(query: str, context: str, mode: str = "default") -> str:
    """Render the *mode* prompt with the given context and return the LLM answer."""
    return call_llm(build_prompt(query, context, mode))
|
src/ingestion/document_loader.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
# File extensions accepted by the loader.
SUPPORTED_EXTENSIONS = {".txt", ".md"}


def load_documents_from_dir(directory: str) -> list[dict]:
    """Recursively load all supported text documents under *directory*.

    Args:
        directory: Root directory to scan (searched recursively).

    Returns:
        One ``{"content": <file text>, "source": <file path>}`` dict per
        ``.txt``/``.md`` file found, in filesystem traversal order.
    """
    docs = []
    for file_path in Path(directory).rglob("*"):
        # is_file() guards against directories whose name happens to end in a
        # supported extension (opening those would raise IsADirectoryError).
        if file_path.is_file() and file_path.suffix.lower() in SUPPORTED_EXTENSIONS:
            docs.append({
                "content": file_path.read_text(encoding="utf-8"),
                "source": str(file_path),
            })
    return docs
|
src/ingestion/embedder.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
import torch
|
| 4 |
+
from langchain_community.document_loaders import DirectoryLoader
|
| 5 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 6 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 7 |
+
from langchain_community.vectorstores import FAISS
|
| 8 |
+
|
| 9 |
+
def check_environment():
    """Return True when a CUDA GPU is visible to torch, else False.

    Logs either the detected GPU name or a "not available" notice.
    """
    cuda_ok = torch.cuda.is_available()
    if cuda_ok:
        gpu_name = torch.cuda.get_device_name(0)
        print(f"CUDA verified, gpu detected: {gpu_name}")
        return True
    print("\nCUDA not available")
    return False
|
| 17 |
+
|
| 18 |
+
# Abort at import time when no GPU is present: run_ingestion() below pins the
# embedding model to the 'cuda' device.
# NOTE(review): calling exit() during module import is abrupt and makes this
# module unusable as a library import on CPU hosts -- consider moving the
# guard inside run_ingestion().
if not check_environment():
    exit()

# Ingestion configuration: source documents, FAISS output dir, embedding model.
RAW_DATA_DIR = "data/raw"
VECTOR_STORE_PATH = "data/vector_store_faiss"
EMBEDDING_MODEL_NAME = "BAAI/bge-m3"
|
| 24 |
+
|
| 25 |
+
def run_ingestion():
    """End-to-end ingestion: load raw .txt files, chunk them, embed them on
    the GPU, and persist a fresh FAISS index under VECTOR_STORE_PATH."""
    print(f"\nstarting ingestion with GPU and model: {EMBEDDING_MODEL_NAME}")

    documents = DirectoryLoader(
        RAW_DATA_DIR, glob="**/*.txt", show_progress=True, use_multithreading=True
    ).load()
    if not documents:
        print(f"no docs found in '{RAW_DATA_DIR}'")
        return
    print(f"loading {len(documents)} docs.")

    # ~1000-char chunks with 100 chars of overlap between neighbours.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    pieces = splitter.split_documents(documents)
    print(f"docs divided in {len(pieces)} chunks.")

    print("loading model embedding for GPU")
    embedder = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={'device': 'cuda'},
        encode_kwargs={'normalize_embeddings': True},
    )
    print("loaded model")

    print("creating vector DB faiss")
    # Rebuild from scratch: drop any previous index directory first.
    if os.path.exists(VECTOR_STORE_PATH):
        print(f"removing old vector in '{VECTOR_STORE_PATH}'...")
        shutil.rmtree(VECTOR_STORE_PATH)

    store = FAISS.from_documents(pieces, embedder)
    store.save_local(VECTOR_STORE_PATH)

    banner = "-" * 50
    print(banner)
    print("pipeline ingestion conclude")
    print(banner)


if __name__ == "__main__":
    run_ingestion()
|
src/ingestion/preprocessor.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
def clean_text(text: str) -> str:
    """Collapse repeated newlines and runs of spaces/tabs, then strip the ends."""
    collapsed = re.sub(r"\n+", "\n", text)
    collapsed = re.sub(r"[ \t]+", " ", collapsed)
    return collapsed.strip()
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def preprocess_documents(documents: list[dict]) -> list[dict]:
    """Return shallow copies of *documents* with each "content" field cleaned.

    All other keys are carried over untouched; the input list and its dicts
    are never mutated.
    """
    cleaned = []
    for doc in documents:
        updated = dict(doc)
        updated["content"] = clean_text(doc["content"])
        cleaned.append(updated)
    return cleaned
|
src/ingestion/text_splitter.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import List, Dict
|
| 3 |
+
|
| 4 |
+
def split_text(
    documents: List[Dict],
    chunk_size: int = 500,
    chunk_overlap: int = 50
) -> List[Dict]:
    """Split documents into Q&A entries, then into size-bounded chunks.

    Each document's content is first split on occurrences of "Pergunta:"
    (lookahead split, so the marker stays at the start of each entry); any
    entry longer than *chunk_size* is then cut into overlapping windows.

    Args:
        documents: dicts with at least a "content" key; "source" is optional.
        chunk_size: maximum characters per chunk.
        chunk_overlap: characters shared by consecutive chunks of one entry.

    Returns:
        A list of ``{"content", "source", "entry_id", "chunk_id"}`` dicts.

    Raises:
        ValueError: if ``chunk_overlap >= chunk_size`` -- the window start
            would never advance, looping forever.
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    chunks = []

    for doc in documents:
        text = doc["content"]
        source = doc.get("source", "unknown")

        # Lookahead split keeps the "Pergunta:" marker inside each entry.
        entries = re.split(r"(?=Pergunta:)", text)

        for entry_id, entry in enumerate(entries):
            entry = entry.strip()
            if not entry:
                continue

            if len(entry) <= chunk_size:
                chunks.append({
                    "content": entry,
                    "source": source,
                    "entry_id": entry_id,
                    "chunk_id": 0
                })
            else:
                start = 0
                end = chunk_size
                chunk_id = 0

                while start < len(entry):
                    chunks.append({
                        "content": entry[start:end],
                        "source": source,
                        "entry_id": entry_id,
                        "chunk_id": chunk_id
                    })
                    chunk_id += 1
                    # Step the window forward, keeping chunk_overlap chars
                    # from the previous chunk; guaranteed to advance because
                    # chunk_overlap < chunk_size was validated above.
                    start = end - chunk_overlap
                    end = start + chunk_size

    return chunks
|
src/retrieval/__pycache__/reranker.cpython-313.pyc
ADDED
|
Binary file (3.99 kB). View file
|
|
|
src/retrieval/__pycache__/retriever.cpython-313.pyc
ADDED
|
Binary file (2.3 kB). View file
|
|
|
src/retrieval/__pycache__/vector_store.cpython-313.pyc
ADDED
|
Binary file (1.85 kB). View file
|
|
|
src/retrieval/query_processor.py
ADDED
|
File without changes
|
src/retrieval/reranker.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
import numpy as np
|
| 3 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 4 |
+
from sentence_transformers import CrossEncoder
|
| 5 |
+
from langchain_community.vectorstores import FAISS
|
| 6 |
+
from langchain_core.documents import Document
|
| 7 |
+
|
| 8 |
+
class HybridReranker:
    """Hybrid retrieval: dense (FAISS) and sparse (TF-IDF) candidate
    generation, followed by cross-encoder reranking of the merged set."""

    def __init__(
        self,
        vector_store: FAISS,
        reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
    ):
        """Index every chunk already held by *vector_store* with TF-IDF.

        Args:
            vector_store: populated FAISS store whose docstore holds the chunks.
            reranker_model: sentence-transformers CrossEncoder model id.
        """
        self.vector_store = vector_store
        self.reranker = CrossEncoder(reranker_model)

        # NOTE(review): relies on the private docstore._dict preserving
        # insertion order -- true for the default in-memory docstore, but
        # this is not a public API; confirm on langchain upgrades.
        docs_in_order = list(self.vector_store.docstore._dict.values())
        self.chunk_texts = [doc.page_content for doc in docs_in_order]
        self.chunk_metadata = [doc.metadata for doc in docs_in_order]

        print(f"rerank model '{reranker_model}' loading. building matriz tf-idf")
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.chunk_texts)
        print("reranker ready")

    def retrieve_and_rerank(
        self,
        query: str,
        top_k_dense: int = 20,
        top_k_final: int = 5,
    ) -> List[Document]:
        """Retrieve candidates from both legs, dedupe them, and rerank.

        Args:
            query: user query text.
            top_k_dense: candidates taken from EACH of the dense and sparse legs.
            top_k_final: number of documents returned after reranking.

        Returns:
            Up to top_k_final Documents, ordered by descending cross-encoder score.
        """
        # Dense leg: embedding similarity search against the FAISS index.
        dense_docs = self.vector_store.similarity_search(query, k=top_k_dense)

        # Sparse leg: TF-IDF score of the query against every stored chunk
        # (TfidfVectorizer L2-normalizes rows by default, so this dot product
        # behaves as cosine similarity -- verify if norm= is ever changed).
        q_vec = self.vectorizer.transform([query])
        sparse_scores = (self.tfidf_matrix @ q_vec.T).toarray().ravel()
        sparse_indices = np.argsort(-sparse_scores)[:top_k_dense]

        sparse_docs = [
            Document(page_content=self.chunk_texts[i], metadata=self.chunk_metadata[i])
            for i in sparse_indices
        ]

        # Merge both candidate lists, deduplicating on exact page content;
        # dense results keep priority on ties because they come first.
        combined_docs = []
        seen_contents = set()
        for doc in dense_docs + sparse_docs:
            if doc.page_content not in seen_contents:
                combined_docs.append(doc)
                seen_contents.add(doc.page_content)

        # Cross-encoder scores each (query, chunk) pair jointly.
        pairs = [[query, doc.page_content] for doc in combined_docs]
        rerank_scores = self.reranker.predict(pairs)

        doc_scores = list(zip(combined_docs, rerank_scores))
        sorted_doc_scores = sorted(doc_scores, key=lambda x: x[1], reverse=True)
        final_docs = [doc for doc, score in sorted_doc_scores[:top_k_final]]

        return final_docs
|
src/retrieval/retriever.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from sentence_transformers import SentenceTransformer
|
| 3 |
+
from src.retrieval.vector_store import VectorStore
|
| 4 |
+
|
| 5 |
+
class Retriever:
    """Dense retriever: encodes queries with a sentence-transformer and
    searches a flat inner-product FAISS index of precomputed embeddings."""

    def __init__(self, embeddings_path: str, model_name: str = "BAAI/bge-small-en-v1.5"):
        """Load the encoder and fill the index from a saved .npy file.

        Args:
            embeddings_path: path to a numpy array of shape (n_chunks, dim);
                the dim must match the encoder's output dimension.
            model_name: sentence-transformers model id.
        """
        self.model = SentenceTransformer(model_name)
        dim = self.model.get_sentence_embedding_dimension()
        self.index = VectorStore(dim=dim)

        embeds = np.load(embeddings_path).astype(np.float32)
        self.index.add(embeds)

    def retrieve(self, query: str, top_k: int = 5):
        """Return (indices, distances) arrays of the top_k nearest chunks.

        BUG FIX: VectorStore.search already unpacks FAISS's 2-D result and
        returns 1-D arrays, so indexing them with [0] again (as the original
        code did) returned a single scalar pair instead of top_k results.
        """
        qv = self.model.encode(query, normalize_embeddings=True).astype(np.float32)
        distances, indices = self.index.search(qv, top_k)
        return indices.copy(), distances.copy()

    def set_chunk_texts(self, texts: list[str]):
        """Attach the chunk texts corresponding to index rows (optional)."""
        self._chunk_texts = texts

    def get_chunk_texts(self) -> list[str]:
        """Return the attached chunk texts, or [] when none were set."""
        return getattr(self, "_chunk_texts", [])
|
src/retrieval/vector_store.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import faiss
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
class VectorStore:
    """Thin wrapper around a flat inner-product (IP) FAISS index."""

    def __init__(self, dim: int):
        """Create an empty index for *dim*-dimensional float32 vectors."""
        self.dim = dim
        self.index = faiss.IndexFlatIP(dim)

    def add(self, embeddings: np.ndarray):
        """Append embedding rows to the index, coercing to float32 for FAISS."""
        self.index.add(np.asarray(embeddings, dtype=np.float32))

    def search(self, query_vec: np.ndarray, top_k: int = 5):
        """Search with a single query vector.

        Accepts a 1-D vector or a (1, dim) row; returns the first result
        row's (distances, indices) as 1-D arrays.
        """
        q = np.asarray(query_vec, dtype=np.float32)
        if q.ndim == 1:
            q = q[np.newaxis, :]
        distances, indices = self.index.search(q, top_k)
        return distances[0], indices[0]
|
src/utils/env.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
load_dotenv()
|
| 4 |
+
def get_env_var(name: str, default: str = "") -> str:
    """Look up *name* in the process environment, returning *default* when unset."""
    value = os.environ.get(name, default)
    return value
|
src/utils/io.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def read_txt_chunks(path: str) -> list[str]:
    """Read a UTF-8 text file and return its non-empty blank-line-separated blocks, stripped."""
    with open(path, encoding="utf-8") as handle:
        raw = handle.read()
    blocks = []
    for piece in raw.split("\n\n"):
        piece = piece.strip()
        if piece:
            blocks.append(piece)
    return blocks
|