vinimoreira commited on
Commit
04e8f63
·
verified ·
1 Parent(s): 73ac40a

Delete RAG_HelpDeks

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. RAG_HelpDesk/Dockerfile +0 -12
  2. RAG_HelpDesk/LICENSE +0 -21
  3. RAG_HelpDesk/README.md +0 -2
  4. RAG_HelpDesk/api/__init__.py +0 -0
  5. RAG_HelpDesk/api/__pycache__/__init__.cpython-313.pyc +0 -0
  6. RAG_HelpDesk/api/__pycache__/main.cpython-313.pyc +0 -0
  7. RAG_HelpDesk/api/__pycache__/rag_chain.cpython-313.pyc +0 -0
  8. RAG_HelpDesk/api/__pycache__/schemas.cpython-313.pyc +0 -0
  9. RAG_HelpDesk/api/main.py +0 -57
  10. RAG_HelpDesk/api/rag_chain.py +0 -108
  11. RAG_HelpDesk/api/schemas.py +0 -14
  12. RAG_HelpDesk/config/base.yaml +0 -43
  13. RAG_HelpDesk/config/dev.yaml +0 -0
  14. RAG_HelpDesk/config/prod.yaml +0 -0
  15. RAG_HelpDesk/data/embeddings/batch_000.npy +0 -3
  16. RAG_HelpDesk/data/raw/base_treinamento.txt +0 -0
  17. RAG_HelpDesk/data/vector_store_faiss/index.faiss +0 -3
  18. RAG_HelpDesk/data/vector_store_faiss/index.pkl +0 -3
  19. RAG_HelpDesk/frontend/app_frontend.py +0 -49
  20. RAG_HelpDesk/frontend/requirements.txt +0 -2
  21. RAG_HelpDesk/notebooks/demo_embedder.py +0 -24
  22. RAG_HelpDesk/notebooks/demo_evaluator.py +0 -3
  23. RAG_HelpDesk/notebooks/demo_generator.py +0 -23
  24. RAG_HelpDesk/notebooks/demo_ingestion.py +0 -21
  25. RAG_HelpDesk/notebooks/demo_reranker.py +0 -35
  26. RAG_HelpDesk/notebooks/demo_retriever.py +0 -14
  27. RAG_HelpDesk/notebooks/demo_vector_store.py +0 -20
  28. RAG_HelpDesk/pytest.ini +0 -6
  29. RAG_HelpDesk/requiriments.txt +0 -21
  30. RAG_HelpDesk/src/__pycache__/__init__.cpython-313.pyc +0 -0
  31. RAG_HelpDesk/src/__pycache__/download.cpython-313.pyc +0 -0
  32. RAG_HelpDesk/src/__pycache__/gerar_chunks.cpython-313.pyc +0 -0
  33. RAG_HelpDesk/src/evaluation/benchmarks.py +0 -0
  34. RAG_HelpDesk/src/evaluation/evaluator.py +0 -70
  35. RAG_HelpDesk/src/evaluation/metrics.py +0 -38
  36. RAG_HelpDesk/src/generation/llm_client.py +0 -37
  37. RAG_HelpDesk/src/generation/prompt_templates.py +0 -44
  38. RAG_HelpDesk/src/generation/response_generator.py +0 -23
  39. RAG_HelpDesk/src/ingestion/document_loader.py +0 -20
  40. RAG_HelpDesk/src/ingestion/embedder.py +0 -63
  41. RAG_HelpDesk/src/ingestion/preprocessor.py +0 -17
  42. RAG_HelpDesk/src/ingestion/text_splitter.py +0 -46
  43. RAG_HelpDesk/src/retrieval/__pycache__/reranker.cpython-313.pyc +0 -0
  44. RAG_HelpDesk/src/retrieval/__pycache__/retriever.cpython-313.pyc +0 -0
  45. RAG_HelpDesk/src/retrieval/__pycache__/vector_store.cpython-313.pyc +0 -0
  46. RAG_HelpDesk/src/retrieval/query_processor.py +0 -0
  47. RAG_HelpDesk/src/retrieval/reranker.py +0 -57
  48. RAG_HelpDesk/src/retrieval/retriever.py +0 -23
  49. RAG_HelpDesk/src/retrieval/vector_store.py +0 -20
  50. RAG_HelpDesk/src/utils/env.py +0 -5
RAG_HelpDesk/Dockerfile DELETED
@@ -1,12 +0,0 @@
1
- FROM python:3.11-slim
2
-
3
- WORKDIR /app
4
-
5
- COPY requirements.txt .
6
-
7
- RUN pip install --no-cache-dir --upgrade pip && \
8
- pip install --no-cache-dir -r requirements.txt
9
-
10
- COPY . .
11
-
12
- CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/LICENSE DELETED
@@ -1,21 +0,0 @@
1
- MIT License
2
-
3
- Copyright (c) 2025 Vinicius Moreira
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/README.md DELETED
@@ -1,2 +0,0 @@
1
- # RAG_HelpDesk
2
- Esse projeto visa criar um RAG para ajudar usuários em relação a duvidas de Hardware e Software.
 
 
 
RAG_HelpDesk/api/__init__.py DELETED
File without changes
RAG_HelpDesk/api/__pycache__/__init__.cpython-313.pyc DELETED
Binary file (163 Bytes)
 
RAG_HelpDesk/api/__pycache__/main.cpython-313.pyc DELETED
Binary file (2.88 kB)
 
RAG_HelpDesk/api/__pycache__/rag_chain.cpython-313.pyc DELETED
Binary file (5.39 kB)
 
RAG_HelpDesk/api/__pycache__/schemas.cpython-313.pyc DELETED
Binary file (1.34 kB)
 
RAG_HelpDesk/api/main.py DELETED
@@ -1,57 +0,0 @@
1
- import sys
2
- from pathlib import Path
3
- import traceback
4
-
5
- project_root = Path(__file__).resolve().parents[1]
6
- sys.path.append(str(project_root))
7
-
8
- from fastapi import FastAPI, HTTPException
9
- from fastapi.middleware.cors import CORSMiddleware
10
- from api.schemas import QueryRequest, QueryResponse, SourceChunk
11
- from api.rag_chain import get_rag_chain
12
-
13
-
14
- app = FastAPI(
15
- title="Helpdesk RAG API",
16
- description="API for answering questions about an IT knowledge base",
17
- version="1.0.0"
18
- )
19
-
20
- try:
21
- rag_chain = get_rag_chain()
22
- except Exception as e:
23
- print("error to load pipeline RAG")
24
- traceback.print_exc()
25
- raise RuntimeError(f"error to load pipeline RAG: {e}")
26
-
27
- app.add_middleware(
28
- CORSMiddleware,
29
- allow_origins=["*"],
30
- allow_credentials=True,
31
- allow_methods=["GET", "POST"],
32
- allow_headers=["*"],
33
- )
34
-
35
- @app.get("/", tags=["Status"])
36
- def read_root():
37
- return {"status": "API ON"}
38
-
39
- @app.post("/query", response_model=QueryResponse, tags=["RAG"])
40
- async def handle_query(request: QueryRequest):
41
- print(f"processing query: '{request.query}'")
42
- try:
43
- result = rag_chain.invoke(request.query)
44
- source_chunks = [
45
- SourceChunk(
46
- page_content=doc.page_content,
47
- source=doc.metadata.get('source', 'desconhecida')
48
- ) for doc in result['source_chunks']
49
- ]
50
- return QueryResponse(
51
- answer=result['answer'],
52
- source_chunks=source_chunks
53
- )
54
- except Exception as e:
55
- print(f"error to process query")
56
- traceback.print_exc()
57
- raise HTTPException(status_code=500, detail=f"internal error. check the server console")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/api/rag_chain.py DELETED
@@ -1,108 +0,0 @@
1
- import os
2
- from pathlib import Path
3
- from dotenv import load_dotenv
4
- from operator import itemgetter
5
- from typing import List, Dict
6
-
7
- from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
8
- from langchain_core.prompts import PromptTemplate
9
- from langchain_community.vectorstores import FAISS
10
- from langchain_huggingface import HuggingFaceEmbeddings
11
- from huggingface_hub import InferenceClient
12
-
13
- load_dotenv()
14
-
15
- PROJECT_ROOT = Path(__file__).resolve().parents[1]
16
- VECTOR_STORE_PATH = str(PROJECT_ROOT / "data" / "vector_store_faiss")
17
- EMBEDDING_MODEL_NAME = "BAAI/bge-m3"
18
- LLM_REPO_ID = os.getenv("HUGGINGFACE_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")
19
-
20
- HF_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
21
-
22
- if not HF_TOKEN:
23
- raise ValueError("HUGGINGFACE_API_TOKEN not found")
24
-
25
- client = InferenceClient(model=LLM_REPO_ID, token=HF_TOKEN)
26
-
27
- prompt_template = PromptTemplate.from_template("""
28
- <|system|>
29
- Você é um assistente de helpdesk de TI especialista... (seu prompt aqui)
30
- </s><|user|>
31
- Contexto: {context}\n\nPergunta: {query}
32
- </s><|assistant|>
33
- Resposta em Português:
34
- """)
35
-
36
- def format_docs(docs: List[Dict]) -> str:
37
- return "\n\n".join(doc.page_content for doc in docs)
38
-
39
- def generate_answer_from_context(input_dict: Dict) -> str:
40
-
41
- context_docs = input_dict["context"]
42
- query_text = input_dict["query"]
43
- formatted_context = format_docs(context_docs)
44
-
45
- prompt_value = prompt_template.invoke({
46
- "context": formatted_context,
47
- "query": query_text
48
- })
49
- final_prompt_text = str(prompt_value)
50
-
51
- try:
52
- response = client.chat_completion(
53
- messages=[{"role": "user", "content": final_prompt_text}],
54
- max_tokens=300,
55
- temperature=0.1
56
- )
57
- raw_answer = response.choices[0].message.content
58
-
59
- clean_answer = raw_answer.strip()
60
-
61
- if clean_answer.startswith('text="'):
62
- clean_answer = clean_answer[6:]
63
- elif clean_answer.startswith("text='"):
64
- clean_answer = clean_answer[6:]
65
-
66
- if clean_answer.endswith('"') or clean_answer.endswith("'"):
67
- clean_answer = clean_answer[:-1]
68
-
69
- if clean_answer.startswith("Resposta:"):
70
- clean_answer = clean_answer.split("Resposta:", 1)[1]
71
-
72
- return clean_answer.strip()
73
-
74
- except Exception as e:
75
- print(f"error for call API huggingface: {e}")
76
- return f"error for contact llm: {e}"
77
-
78
- def get_rag_chain():
79
- print("loading pipeline")
80
-
81
- embeddings_model = HuggingFaceEmbeddings(
82
- model_name=EMBEDDING_MODEL_NAME,
83
- model_kwargs={'device': 'cpu'},
84
- encode_kwargs={'normalize_embeddings': True}
85
- )
86
- vector_store = FAISS.load_local(
87
- VECTOR_STORE_PATH, embeddings_model, allow_dangerous_deserialization=True
88
- )
89
-
90
- try:
91
- from src.retrieval.reranker import HybridReranker
92
- hybrid_reranker = HybridReranker(vector_store=vector_store)
93
- retrieval_chain = lambda query: hybrid_reranker.retrieve_and_rerank(query)
94
- print("using pipeline with reranker")
95
- except ImportError:
96
- retrieval_chain = vector_store.as_retriever(search_kwargs={"k": 5})
97
- print("reranker not found, using simple retriever")
98
-
99
- rag_chain = {
100
- "context": retrieval_chain,
101
- "query": RunnablePassthrough()
102
- } | RunnableParallel({
103
- "source_chunks": itemgetter("context"),
104
- "answer": generate_answer_from_context
105
- })
106
-
107
- print("pipeline ready.")
108
- return rag_chain
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/api/schemas.py DELETED
@@ -1,14 +0,0 @@
1
- from pydantic import BaseModel, Field
2
- from typing import List, Optional
3
-
4
- class QueryRequest(BaseModel):
5
- query: str = Field(..., description="query for RAG")
6
- top_k: int = Field(3, description="number of relevants docs to be retrivied", ge=1, le=10)
7
-
8
- class SourceChunk(BaseModel):
9
- page_content: str
10
- source: str = Field(description="file path")
11
-
12
- class QueryResponse(BaseModel):
13
- answer: str
14
- source_chunks: List[SourceChunk]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/config/base.yaml DELETED
@@ -1,43 +0,0 @@
1
-
2
- raw_path: "data/raw"
3
- processed_path: "data/processed"
4
- embeddings_path: "data/embeddings"
5
-
6
- ingestion:
7
- chunk_size: 500
8
- chunk_overlap: 50
9
- languages: ["pt", "en"]
10
-
11
- embedder:
12
- model_name: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
13
- batch_size: 64
14
- normalize: true
15
-
16
- vector_store:
17
- type: "faiss"
18
- index_factory: "Flat"
19
- metric: "L2"
20
- save_index_path: "data/embeddings/faiss_index.idx"
21
-
22
- llm:
23
- provider: "openai"
24
- model_name: "gpt-3.5-turbo"
25
- max_tokens: 512
26
- temperature: 0.7
27
- top_p: 0.9
28
- api_key_env: "OPENAI_API_KEY"
29
-
30
- api:
31
- host: "0.0.0.0"
32
- port: 8000
33
- docs_url: "/docs"
34
-
35
- logging:
36
- level: "INFO"
37
- format: "[%(asctime)s] %(levelname)s %(name)s: %(message)s"
38
-
39
- evaluation:
40
- top_k: 5
41
- benchmark_path: "tests/benchmark.json"
42
- save_reports: true
43
- reports_path: "reports/"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/config/dev.yaml DELETED
File without changes
RAG_HelpDesk/config/prod.yaml DELETED
File without changes
RAG_HelpDesk/data/embeddings/batch_000.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0234624565756c111fcd23e4090f7f08255567e57fad9a01cde641f862f4c93
3
- size 76800128
 
 
 
 
RAG_HelpDesk/data/raw/base_treinamento.txt DELETED
The diff for this file is too large to render. See raw diff
 
RAG_HelpDesk/data/vector_store_faiss/index.faiss DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:da3c437c49b2aab9bcff75a1553f3ea1cd17212b70c66cdc3fcebbab1bbf1a7c
3
- size 43315245
 
 
 
 
RAG_HelpDesk/data/vector_store_faiss/index.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac470dbf2aa92c9fe996c3a0df6ca4e4435afb806b06f6e8e877def38323393f
3
- size 10958179
 
 
 
 
RAG_HelpDesk/frontend/app_frontend.py DELETED
@@ -1,49 +0,0 @@
1
- import streamlit as st
2
- import requests
3
- import os
4
-
5
- st.set_page_config(page_title="Assistente RAG", page_icon="🤖", layout="centered")
6
- st.title("🤖 Assistente de Helpdesk (RAG)")
7
-
8
- API_URL = "http://127.0.0.1:8000/query"
9
- if "messages" not in st.session_state:
10
- st.session_state.messages = [{"role": "assistant", "content": "Olá! Como posso te ajudar com suas dúvidas de TI?"}]
11
-
12
- for message in st.session_state.messages:
13
- with st.chat_message(message["role"]):
14
- st.markdown(message["content"])
15
-
16
- if prompt := st.chat_input("Digite sua pergunta aqui..."):
17
- st.session_state.messages.append({"role": "user", "content": prompt})
18
- with st.chat_message("user"):
19
- st.markdown(prompt)
20
-
21
- with st.chat_message("assistant"):
22
- message_placeholder = st.empty()
23
- with st.spinner("Analisando a base de conhecimento..."):
24
- try:
25
- payload = {"query": prompt, "top_k": 5}
26
- response = requests.post(API_URL, json=payload, timeout=120)
27
-
28
- if response.status_code == 200:
29
- data = response.json()
30
- answer = data.get("answer", "Não foi possível obter uma resposta.")
31
- sources = data.get("source_chunks", [])
32
-
33
- full_response = answer
34
- if sources:
35
- unique_sources = set(chunk['source'] for chunk in sources)
36
- full_response += "\n\n---\n*Fontes consultadas:*"
37
- for source_file in unique_sources:
38
- full_response += f"\n- `{os.path.basename(source_file)}`"
39
-
40
- message_placeholder.markdown(full_response)
41
- else:
42
- full_response = f"Erro da API: {response.status_code} - {response.text}"
43
- message_placeholder.error(full_response)
44
-
45
- except requests.exceptions.RequestException as e:
46
- full_response = f"Erro de conexão com o back-end: {e}"
47
- message_placeholder.error(full_response)
48
-
49
- st.session_state.messages.append({"role": "assistant", "content": full_response})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/frontend/requirements.txt DELETED
@@ -1,2 +0,0 @@
1
- streamlit
2
- requests
 
 
 
RAG_HelpDesk/notebooks/demo_embedder.py DELETED
@@ -1,24 +0,0 @@
1
- from src.ingestion.document_loader import load_documents_from_dir
2
- from src.ingestion.preprocessor import preprocess_documents
3
- from src.ingestion.text_splitter import split_text
4
- from src.ingestion.embedder import load_embedder, generate_embeddings, save_embeddings, get_chunk_texts
5
-
6
- CHUNK_SIZE = 300
7
- CHUNK_OVERLAP = 50
8
- RAW_PATH = "data/raw"
9
- EMBEDDING_PATH = "data/embeddings/batch_000.npy"
10
-
11
- docs = load_documents_from_dir(RAW_PATH)
12
- clean_docs = preprocess_documents(docs)
13
- chunks = split_text(clean_docs, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
14
-
15
- texts = get_chunk_texts(chunks)
16
-
17
- model = load_embedder()
18
- embeddings = generate_embeddings(texts, model)
19
-
20
- print(f"embeddings shape: {embeddings.shape}")
21
- print(f"vector example (1º):\n{embeddings[0][:10]}...")
22
-
23
- save_embeddings(embeddings, EMBEDDING_PATH)
24
- print(f"save embeddings in: {EMBEDDING_PATH}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/notebooks/demo_evaluator.py DELETED
@@ -1,3 +0,0 @@
1
- from src.evaluation.evaluator import run_evaluation
2
-
3
- run_evaluation()
 
 
 
 
RAG_HelpDesk/notebooks/demo_generator.py DELETED
@@ -1,23 +0,0 @@
1
- import numpy as np
2
- from src.retrieval.retriever import Retriever
3
- from src.generation.response_generator import generate_answer
4
-
5
- retriever = Retriever("data/embeddings/batch_000.npy")
6
- query = "Meu notebook está muito lento, o que posso fazer? E se não resolver?"
7
-
8
- idxs, scores = retriever.retrieve(query, top_k=1)
9
-
10
- idxs = np.atleast_1d(idxs).flatten()
11
- scores = np.atleast_1d(scores).flatten()
12
-
13
- with open("data/raw/base_treinamento.txt", encoding="utf-8") as f:
14
- chunks = [b.strip() for b in f.read().split("\n\n") if b.strip()]
15
-
16
- context = chunks[int(idxs[0])]
17
-
18
- answer = generate_answer(query, context)
19
-
20
- print(f"\nquery: {query}")
21
- print(f"context selected (chunk #{idxs[0]}):\n{context}\n")
22
- print("generated response:\n")
23
- print(answer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/notebooks/demo_ingestion.py DELETED
@@ -1,21 +0,0 @@
1
- from src.ingestion.document_loader import load_documents_from_dir
2
- from src.ingestion.preprocessor import preprocess_documents
3
- from src.ingestion.text_splitter import split_text
4
-
5
- RAW_PATH = "data/raw"
6
- CHUNK_SIZE = 300
7
- CHUNK_OVERLAP = 50
8
-
9
- docs = load_documents_from_dir(RAW_PATH)
10
- print(f"docs loaded: {len(docs)}")
11
-
12
- cleaned_docs = preprocess_documents(docs)
13
- print(f"pre-process completed")
14
-
15
- chunks = split_text(cleaned_docs, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
16
- print(f"total chunks generated: {len(chunks)}")
17
-
18
- for i, chunk in enumerate(chunks[:3]):
19
- print(f"\n--- Chunk {i} ---")
20
- print(f"font: {chunk['source']}")
21
- print(chunk['content'][:300])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/notebooks/demo_reranker.py DELETED
@@ -1,35 +0,0 @@
1
- import numpy as np
2
- from src.retrieval.retriever import Retriever
3
- from src.retrieval.reranker import HybridReranker
4
-
5
- RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
6
-
7
- def main():
8
- # Carrega os textos (chunks)
9
- with open("data/raw/base_treinamento.txt", encoding="utf-8") as f:
10
- chunks = [b.strip() for b in f.read().split("\n\n") if b.strip()]
11
-
12
- # Cria o retriever com os embeddings pré-calculados
13
- retriever = Retriever("data/embeddings/batch_000.npy")
14
-
15
- # Cria o reranker, passando o retriever e os chunks para reranking
16
- hybrid = HybridReranker(
17
- retriever=retriever,
18
- chunk_texts=chunks,
19
- reranker_model=RERANKER_MODEL,
20
- sparse_alpha=0.5,
21
- )
22
-
23
- query = "Meu notebook está muito lento, o que posso fazer? E se não resolver?"
24
-
25
- # Recupera e reranqueia os top documentos
26
- idxs, scores = hybrid.retrieve_and_rerank(query, top_k_dense=10, top_k_final=3)
27
-
28
- print(f"\nQuery: {query}\n")
29
- for i, idx in enumerate(idxs):
30
- print(f"Rank {i+1} - Chunk #{idx} (score: {scores[i]:.4f}):")
31
- print(chunks[idx])
32
- print("-" * 40)
33
-
34
- if __name__ == "__main__":
35
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/notebooks/demo_retriever.py DELETED
@@ -1,14 +0,0 @@
1
- import numpy as np
2
- from src.retrieval.retriever import Retriever
3
-
4
- retriever = Retriever("data/embeddings/batch_000.npy")
5
-
6
- query = "O que fazer se o notebook não liga?"
7
- idxs, scores = retriever.retrieve(query, top_k=5)
8
-
9
- idxs = np.atleast_1d(idxs).flatten()
10
- scores = np.atleast_1d(scores).flatten()
11
-
12
- print(f"\ntop results for: {query}\n")
13
- for i, (idx, score) in enumerate(zip(idxs, scores), 1):
14
- print(f"{i}. idx: {int(idx):4d} — similarity: {score:.4f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/notebooks/demo_vector_store.py DELETED
@@ -1,20 +0,0 @@
1
- import numpy as np
2
- from src.retrieval.vector_store import VectorStore
3
-
4
- embeds = np.load("data/embeddings/batch_000.npy")
5
- dim = embeds.shape[1]
6
- print(f"embeddings loaded: {embeds.shape}")
7
-
8
- vs = VectorStore(dim=dim)
9
- vs.add(embeds)
10
- print("vectors added in index")
11
-
12
- query = embeds[0]
13
- dists, idxs = vs.search(query, top_k=5)
14
-
15
- print("\nresult search (dummy):")
16
- for i, (idx, dist) in enumerate(zip(idxs, dists), 1):
17
- print(f"{i}. idx: {idx:4d} — distância: {dist:.4f}")
18
-
19
- print("Norma do primeiro vetor:", np.linalg.norm(embeds[0]))
20
- print("Norma do segundo vetor:", np.linalg.norm(embeds[1]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/pytest.ini DELETED
@@ -1,6 +0,0 @@
1
- [pytest]
2
- minversion = 6.0
3
- addopts = -ra -q
4
- testpaths =
5
- tests
6
- python_paths = src
 
 
 
 
 
 
 
RAG_HelpDesk/requiriments.txt DELETED
@@ -1,21 +0,0 @@
1
- sentence-transformers>=2.2.2
2
- faiss-cpu>=1.7.3
3
- openai>=0.27.0
4
- python-docx>=0.8.11
5
- PyPDF2>=3.0.0
6
- beautifulsoup4>=4.12.2
7
- PyYAML>=6.0
8
- uvicorn[standard]>=0.22.0
9
- pytest>=7.0.1
10
- python-dotenv>=1.0.0
11
- nltk>=3.8.1
12
- pytest
13
- unstructured
14
- torch
15
- fastapi
16
- uvicorn[standard]
17
- langchain
18
- langchain-huggingface
19
- pydantic
20
- langchain_community
21
- streamlit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/src/__pycache__/__init__.cpython-313.pyc DELETED
Binary file (163 Bytes)
 
RAG_HelpDesk/src/__pycache__/download.cpython-313.pyc DELETED
Binary file (384 Bytes)
 
RAG_HelpDesk/src/__pycache__/gerar_chunks.cpython-313.pyc DELETED
Binary file (2.59 kB)
 
RAG_HelpDesk/src/evaluation/benchmarks.py DELETED
File without changes
RAG_HelpDesk/src/evaluation/evaluator.py DELETED
@@ -1,70 +0,0 @@
1
- import json
2
- import numpy as np
3
- from src.retrieval.retriever import Retriever
4
- from src.retrieval.reranker import HybridReranker
5
- from src.evaluation.metrics import (
6
- precision_at_k as retrieval_precision_at_k,
7
- recall_at_k,
8
- mean_reciprocal_rank,
9
- bleu_score
10
- )
11
- from src.ingestion.document_loader import load_documents_from_dir
12
- from src.ingestion.preprocessor import preprocess_documents
13
- from src.ingestion.text_splitter import split_text
14
-
15
-
16
- def run_evaluation(
17
- benchmark_path: str = "tests/benchmark.json",
18
- k: int = 3,
19
- top_k_dense: int = 10,
20
- top_k_final: int = 3,
21
- sparse_alpha: float = 0.5
22
- ):
23
- with open(benchmark_path, encoding="utf-8") as f:
24
- benchmarks = json.load(f)
25
-
26
- docs = load_documents_from_dir("data/raw")
27
- clean_docs = preprocess_documents(docs)
28
- chunks = split_text(clean_docs, chunk_size=300, chunk_overlap=50)
29
- texts = [chunk['content'] for chunk in chunks]
30
-
31
- retriever = Retriever("data/embeddings/batch_000.npy")
32
- reranker = HybridReranker(
33
- retriever=retriever,
34
- chunk_texts=texts,
35
- reranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
36
- sparse_alpha=sparse_alpha
37
- )
38
-
39
- all_retrieved = []
40
- all_relevant = []
41
-
42
- print(f"starting assessment reranker: Precision@{k}, Recall@{k}, MRR")
43
- print(f"dense top_k: {top_k_dense}, final top_k: {top_k_final}, sparse_alpha: {sparse_alpha}\n")
44
-
45
- for i, entry in enumerate(benchmarks, 1):
46
- query = entry['query']
47
- relevant_idxs = entry.get('relevant_idxs', [])
48
-
49
- idxs, scores = reranker.retrieve_and_rerank(
50
- query,
51
- top_k_dense=top_k_dense,
52
- top_k_final=top_k_final
53
- )
54
-
55
- p = retrieval_precision_at_k(retrieved_idxs=idxs, relevant_idxs=relevant_idxs, k=k)
56
- r = recall_at_k(retrieved_idxs=idxs, relevant_idxs=relevant_idxs, k=k)
57
-
58
- all_retrieved.append(idxs)
59
- all_relevant.append(relevant_idxs)
60
-
61
- print(f"{i}. Query: {query}")
62
- print(f" Precision@{k}: {p:.2f}, Recall@{k}: {r:.2f}")
63
- print(f" Retrieved idxs: {idxs}")
64
- print(f" Rerank scores: {[f'{s:.4f}' for s in scores]}\n")
65
-
66
- mrr = mean_reciprocal_rank(retrieved_lists=all_retrieved, relevant_idxs_list=all_relevant)
67
- print(f"mean reciprocal rank (MRR): {mrr:.2f}\n")
68
-
69
- if __name__ == "__main__":
70
- run_evaluation()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/src/evaluation/metrics.py DELETED
@@ -1,38 +0,0 @@
1
- import numpy as np
2
- from typing import List
3
- from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
4
-
5
-
6
- def precision_at_k(retrieved_idxs: List[int], relevant_idxs: List[int], k: int) -> float:
7
- if k <= 0:
8
- return 0.0
9
- top_k = retrieved_idxs[:k]
10
- hits = sum(1 for idx in top_k if idx in relevant_idxs)
11
- return hits / k
12
-
13
-
14
- def recall_at_k(retrieved_idxs: List[int], relevant_idxs: List[int], k: int) -> float:
15
- if not relevant_idxs:
16
- return 0.0
17
- top_k = retrieved_idxs[:k]
18
- hits = sum(1 for idx in top_k if idx in relevant_idxs)
19
- return hits / len(relevant_idxs)
20
-
21
-
22
- def mean_reciprocal_rank(retrieved_lists: List[List[int]], relevant_idxs_list: List[List[int]]) -> float:
23
-
24
- rr_scores = []
25
- for retrieved, relevant in zip(retrieved_lists, relevant_idxs_list):
26
- rr = 0.0
27
- for rank, idx in enumerate(retrieved, start=1):
28
- if idx in relevant:
29
- rr = 1.0 / rank
30
- break
31
- rr_scores.append(rr)
32
- return float(np.mean(rr_scores)) if rr_scores else 0.0
33
-
34
-
35
- def bleu_score(reference: str, candidate: str) -> float:
36
- smoothie = SmoothingFunction().method4
37
- weights = (0.25, 0.25, 0.25, 0.25)
38
- return sentence_bleu([reference.split()], candidate.split(), weights=weights, smoothing_function=smoothie)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/src/generation/llm_client.py DELETED
@@ -1,37 +0,0 @@
1
- import os
2
- import requests
3
- from dotenv import load_dotenv
4
- from pathlib import Path
5
-
6
- env_path = Path(__file__).resolve().parents[2] / '.env'
7
- load_dotenv(dotenv_path=env_path)
8
-
9
- HF_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
10
- REPO_ID = os.getenv("HUGGINGFACE_MODEL", "HuggingFaceH4/zephyr-7b-beta")
11
- API_URL = f"https://api-inference.huggingface.co/models/{REPO_ID}"
12
-
13
- HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
14
-
15
-
16
- def call_llm(prompt: str, max_length: int = 200) -> str:
17
- payload = {
18
- "inputs": prompt,
19
- "parameters": {"max_new_tokens": max_length, "temperature": 0.2}
20
- }
21
- try:
22
- print(f"[llm_client] POST {API_URL}")
23
- print(f"[llm_client] HEADERS: {HEADERS}")
24
- response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=60)
25
- print(f"[llm_client] Status code: {response.status_code}")
26
- text = response.text
27
- print(f"[llm_client] Response text: {text}")
28
- response.raise_for_status()
29
- data = response.json()
30
- if isinstance(data, list) and data and isinstance(data[0], dict) and "generated_text" in data[0]:
31
- return data[0]["generated_text"].strip()
32
- if isinstance(data, dict) and "generated_text" in data:
33
- return data["generated_text"].strip()
34
- return str(data)
35
- except Exception as e:
36
- print(f"[llm_client] error HTTP HF: {e}")
37
- return f"error in generate response: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/src/generation/prompt_templates.py DELETED
@@ -1,44 +0,0 @@
1
- # src/generation/prompt_templates.py
2
-
3
- helpdesk_prompt = """
4
- Contexto técnico:
5
- {context}
6
-
7
- Com base nisso, responda à seguinte pergunta:
8
- {query}
9
- """
10
-
11
- concise_helpdesk_prompt = """
12
- Contexto técnico:
13
- {context}
14
-
15
- Responda de forma breve e objetiva:
16
- {query}
17
- """
18
-
19
- informal_helpdesk_prompt = """
20
- Oi! Aqui está o que você precisa saber com base no contexto:
21
- {context}
22
-
23
- Pergunta:
24
- {query}
25
-
26
- Resposta descontraída:
27
- """
28
-
29
- step_by_step_prompt = """
30
- Contexto técnico:
31
- {context}
32
-
33
- Por favor, explique passo a passo como resolver:
34
- {query}
35
- """
36
-
37
- link_suggestion_prompt = """
38
- Contexto técnico:
39
- {context}
40
-
41
- Responda a pergunta: {query}
42
-
43
- Se possível, inclua links úteis para mais informações.
44
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/src/generation/response_generator.py DELETED
@@ -1,23 +0,0 @@
1
- from src.generation.llm_client import call_llm
2
- from src.generation.prompt_templates import (
3
- helpdesk_prompt,
4
- concise_helpdesk_prompt,
5
- informal_helpdesk_prompt,
6
- step_by_step_prompt,
7
- link_suggestion_prompt
8
- )
9
-
10
- def build_prompt(query: str, context: str, mode: str = "default") -> str:
11
- prompt_map = {
12
- "default": helpdesk_prompt,
13
- "concise": concise_helpdesk_prompt,
14
- "informal": informal_helpdesk_prompt,
15
- "step_by_step": step_by_step_prompt,
16
- "with_links": link_suggestion_prompt
17
- }
18
- template = prompt_map.get(mode, helpdesk_prompt)
19
- return template.format(context=context, query=query)
20
-
21
- def generate_answer(query: str, context: str, mode: str = "default") -> str:
22
- prompt = build_prompt(query, context, mode)
23
- return call_llm(prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/src/ingestion/document_loader.py DELETED
@@ -1,20 +0,0 @@
1
- import os
2
- from pathlib import Path
3
-
4
- SUPPORTED_EXTENSIONS = {".txt", ".md"}
5
-
6
-
7
- def load_documents_from_dir(directory: str) -> list[dict]:
8
-
9
- docs = []
10
-
11
- for file_path in Path(directory).rglob("*"):
12
- if file_path.suffix.lower() in SUPPORTED_EXTENSIONS:
13
- with open(file_path, "r", encoding="utf-8") as f:
14
- content = f.read()
15
- docs.append({
16
- "content": content,
17
- "source": str(file_path)
18
- })
19
-
20
- return docs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/src/ingestion/embedder.py DELETED
@@ -1,63 +0,0 @@
1
- import os
2
- import shutil
3
- import torch
4
- from langchain_community.document_loaders import DirectoryLoader
5
- from langchain_text_splitters import RecursiveCharacterTextSplitter
6
- from langchain_huggingface import HuggingFaceEmbeddings
7
- from langchain_community.vectorstores import FAISS
8
-
9
def check_environment():
    """Report whether a CUDA GPU is usable; prints what it finds.

    Returns True when CUDA is available, False otherwise.
    """
    if torch.cuda.is_available():
        device = torch.cuda.get_device_name(0)
        print(f"CUDA verified, gpu detected: {device}")
        return True
    print("\nCUDA not available")
    return False
17
-
18
# NOTE(review): this guard runs at import time, so merely importing this
# module on a CUDA-less machine calls exit() — consider moving the check
# into the __main__ entry point.
if not check_environment():
    exit()

# Pipeline configuration: where raw .txt docs live, where the FAISS index
# is persisted, and which HuggingFace embedding model to use.
RAW_DATA_DIR = "data/raw"
VECTOR_STORE_PATH = "data/vector_store_faiss"
EMBEDDING_MODEL_NAME = "BAAI/bge-m3"
24
-
25
def run_ingestion():
    """End-to-end ingestion: load .txt docs, chunk, embed on GPU, save FAISS."""
    print(f"\nstarting ingestion with GPU and model: {EMBEDDING_MODEL_NAME}")

    docs = DirectoryLoader(
        RAW_DATA_DIR, glob="**/*.txt", show_progress=True, use_multithreading=True
    ).load()
    if not docs:
        print(f"no docs found in '{RAW_DATA_DIR}'")
        return
    print(f"loading {len(docs)} docs.")

    chunks = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    ).split_documents(docs)
    print(f"docs divided in {len(chunks)} chunks.")

    print("loading model embedding for GPU")
    embeddings_model = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={'device': 'cuda'},
        encode_kwargs={'normalize_embeddings': True}
    )
    print("loaded model")

    print("creating vector DB faiss")
    # Rebuild from scratch: drop any previously saved index first.
    if os.path.exists(VECTOR_STORE_PATH):
        print(f"removing old vector in '{VECTOR_STORE_PATH}'...")
        shutil.rmtree(VECTOR_STORE_PATH)

    FAISS.from_documents(chunks, embeddings_model).save_local(VECTOR_STORE_PATH)

    separator = "-" * 50
    print(separator)
    print("pipeline ingestion conclude")
    print(separator)


if __name__ == "__main__":
    run_ingestion()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/src/ingestion/preprocessor.py DELETED
@@ -1,17 +0,0 @@
1
import re


def clean_text(text: str) -> str:
    """Normalize whitespace: collapse blank lines and runs of spaces/tabs."""
    collapsed = re.sub(r"\n+", "\n", text)
    collapsed = re.sub(r"[ \t]+", " ", collapsed)
    return collapsed.strip()


def preprocess_documents(documents: list[dict]) -> list[dict]:
    """Return copies of *documents* with their "content" field cleaned.

    All other keys (e.g. "source") are carried through untouched.
    """
    cleaned = []
    for doc in documents:
        cleaned.append({**doc, "content": clean_text(doc["content"])})
    return cleaned
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/src/ingestion/text_splitter.py DELETED
@@ -1,46 +0,0 @@
1
- import re
2
- from typing import List, Dict
3
-
4
def split_text(
    documents: List[Dict],
    chunk_size: int = 500,
    chunk_overlap: int = 50
) -> List[Dict]:
    """Split help-desk documents into retrieval chunks.

    Each document is first split on "Pergunta:" boundaries (one Q&A entry
    per piece); entries longer than *chunk_size* are further sliced into
    overlapping windows of *chunk_size* characters.

    Returns:
        Dicts with keys: content, source, entry_id, chunk_id.

    Raises:
        ValueError: if chunk_overlap >= chunk_size — the sliding window
            would never advance, looping forever on any long entry.
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    chunks = []

    for doc in documents:
        text = doc["content"]
        source = doc.get("source", "unknown")

        # Lookahead split keeps the "Pergunta:" marker at the start of each entry.
        entries = re.split(r"(?=Pergunta:)", text)

        for entry_id, entry in enumerate(entries):
            entry = entry.strip()
            if not entry:
                continue

            if len(entry) <= chunk_size:
                chunks.append({
                    "content": entry,
                    "source": source,
                    "entry_id": entry_id,
                    "chunk_id": 0
                })
            else:
                # Sliding window with overlap so context isn't cut mid-thought.
                start = 0
                end = chunk_size
                chunk_id = 0

                while start < len(entry):
                    chunks.append({
                        "content": entry[start:end],
                        "source": source,
                        "entry_id": entry_id,
                        "chunk_id": chunk_id
                    })
                    chunk_id += 1
                    start = end - chunk_overlap
                    end = start + chunk_size

    return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/src/retrieval/__pycache__/reranker.cpython-313.pyc DELETED
Binary file (3.99 kB)
 
RAG_HelpDesk/src/retrieval/__pycache__/retriever.cpython-313.pyc DELETED
Binary file (2.3 kB)
 
RAG_HelpDesk/src/retrieval/__pycache__/vector_store.cpython-313.pyc DELETED
Binary file (1.85 kB)
 
RAG_HelpDesk/src/retrieval/query_processor.py DELETED
File without changes
RAG_HelpDesk/src/retrieval/reranker.py DELETED
@@ -1,57 +0,0 @@
1
- from typing import List
2
- import numpy as np
3
- from sklearn.feature_extraction.text import TfidfVectorizer
4
- from sentence_transformers import CrossEncoder
5
- from langchain_community.vectorstores import FAISS
6
- from langchain_core.documents import Document
7
-
8
class HybridReranker:
    """Hybrid dense + sparse retrieval with cross-encoder reranking.

    Dense candidates come from the FAISS store, sparse candidates from a
    TF-IDF index built over the same chunks; the deduplicated union is
    rescored by a cross-encoder and the top results are returned.
    """

    def __init__(
        self,
        vector_store: FAISS,
        reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
    ):
        self.vector_store = vector_store
        self.reranker = CrossEncoder(reranker_model)

        # Snapshot chunk texts/metadata in docstore order for the TF-IDF index.
        stored_docs = list(self.vector_store.docstore._dict.values())
        self.chunk_texts = [d.page_content for d in stored_docs]
        self.chunk_metadata = [d.metadata for d in stored_docs]

        print(f"rerank model '{reranker_model}' loading. building matriz tf-idf")
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.chunk_texts)
        print("reranker ready")

    def retrieve_and_rerank(
        self,
        query: str,
        top_k_dense: int = 20,
        top_k_final: int = 5,
    ) -> List[Document]:
        """Return the *top_k_final* best chunks for *query* after reranking."""
        # Dense candidates straight from the vector store.
        dense_hits = self.vector_store.similarity_search(query, k=top_k_dense)

        # Sparse candidates: TF-IDF similarity, best top_k_dense indices.
        query_vector = self.vectorizer.transform([query])
        sparse_scores = (self.tfidf_matrix @ query_vector.T).toarray().ravel()
        best_sparse = np.argsort(-sparse_scores)[:top_k_dense]
        sparse_hits = [
            Document(page_content=self.chunk_texts[i], metadata=self.chunk_metadata[i])
            for i in best_sparse
        ]

        # Union of both candidate sets, deduplicated on content, dense first.
        merged = []
        seen_contents = set()
        for candidate in dense_hits + sparse_hits:
            if candidate.page_content in seen_contents:
                continue
            seen_contents.add(candidate.page_content)
            merged.append(candidate)

        # Cross-encoder rescoring; keep the highest-scoring documents.
        rerank_scores = self.reranker.predict(
            [[query, candidate.page_content] for candidate in merged]
        )
        ranked = sorted(zip(merged, rerank_scores), key=lambda pair: pair[1], reverse=True)
        return [doc for doc, _ in ranked[:top_k_final]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/src/retrieval/retriever.py DELETED
@@ -1,23 +0,0 @@
1
- import numpy as np
2
- from sentence_transformers import SentenceTransformer
3
- from src.retrieval.vector_store import VectorStore
4
-
5
class Retriever:
    """Dense retriever: embeds a query and searches a FAISS-backed VectorStore."""

    def __init__(self, embeddings_path: str, model_name: str = "BAAI/bge-small-en-v1.5"):
        self.model = SentenceTransformer(model_name)
        dim = self.model.get_sentence_embedding_dimension()
        self.index = VectorStore(dim=dim)

        # Load precomputed chunk embeddings and index them (FAISS wants float32).
        embeds = np.load(embeddings_path).astype(np.float32)
        self.index.add(embeds)

    def retrieve(self, query: str, top_k: int = 5):
        """Return (indices, distances) of the top_k most similar chunks.

        Bug fix: VectorStore.search already unbatches its result and returns
        1-D arrays of shape (top_k,); indexing them with [0] again returned
        scalars (only the single best hit) instead of the full arrays.
        """
        qv = self.model.encode(query, normalize_embeddings=True).astype(np.float32)
        distances, indices = self.index.search(qv, top_k)
        return indices.copy(), distances.copy()

    def set_chunk_texts(self, texts: list[str]):
        # Optional side-channel: texts aligned with the indexed embeddings.
        self._chunk_texts = texts

    def get_chunk_texts(self) -> list[str]:
        # Empty list if set_chunk_texts was never called.
        return getattr(self, "_chunk_texts", [])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/src/retrieval/vector_store.py DELETED
@@ -1,20 +0,0 @@
1
- import faiss
2
- import numpy as np
3
-
4
class VectorStore:
    """Thin wrapper around a flat FAISS inner-product index."""

    def __init__(self, dim: int):
        self.dim = dim  # embedding dimensionality expected by add/search
        self.index = faiss.IndexFlatIP(dim)

    def add(self, embeddings: np.ndarray):
        """Append embedding rows to the index (FAISS requires float32)."""
        self.index.add(np.asarray(embeddings, dtype=np.float32))

    def search(self, query_vec: np.ndarray, top_k: int = 5):
        """Search for *top_k* nearest rows to a single query vector.

        Accepts a 1-D or (1, dim) vector; returns (distances, indices),
        each a 1-D array of length top_k.
        """
        batched = np.asarray(query_vec, dtype=np.float32)
        if batched.ndim == 1:
            batched = batched[np.newaxis, :]
        distances, indices = self.index.search(batched, top_k)
        # Single-query wrapper: unbatch before returning.
        return distances[0], indices[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
RAG_HelpDesk/src/utils/env.py DELETED
@@ -1,5 +0,0 @@
1
import os
from dotenv import load_dotenv
# Populate os.environ from a local .env file (no-op if none) at import time.
load_dotenv()
4
def get_env_var(name: str, default: str = "") -> str:
    """Return environment variable *name*, or *default* when it is unset."""
    return os.environ.get(name, default)