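"""RAG pipeline: FAISS retrieval over PDF chunks plus Gemini-backed generation.

Loads a prebuilt FAISS index, the chunk store, and per-document metadata,
retrieves the chunks closest to a query, and feeds them to the LLM through
a prompt template.
"""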
import json
import logging
import os

import faiss
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util

from src.configs.config import (
    CHUNKS_FILE,
    EMBEDDING_MODEL,
    EMBEDDINGS_FILE,
    FAISS_INDEX_FILE,
    LOG_DIR,
    METADATA_FILE,
    TOP_K,
)
from src.models.llm_wrapper import GeminiWrapper
from src.utils.helpers import load_chunks_from_disk, load_metadata, load_prompt_template

LOG_FILE = os.path.join(LOG_DIR, "Agents.log")
os.makedirs(LOG_DIR, exist_ok=True)  # make sure the log directory exists before logging opens the file
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

class RAGPipeline:
    def __init__(self):
        # Load the embedding model, prebuilt FAISS index, chunk store, and
        # document metadata once so every query reuses them.
        self.embedding_model = SentenceTransformer(EMBEDDING_MODEL)
        self.index = faiss.read_index(str(FAISS_INDEX_FILE))
        self.chunks_data = load_chunks_from_disk(CHUNKS_FILE)
        self.metadata = load_metadata(METADATA_FILE)
        self.llm = GeminiWrapper()

    def retrieve_from_pdf(self, query, k=TOP_K):
        """Return the k chunks whose embeddings are closest to the query."""
        if self.index is None:
            logging.error("No FAISS index loaded.")
            return []

        # FAISS expects a 2-D float32 array of query vectors.
        query_embedding = self.embedding_model.encode([query], convert_to_tensor=False)[0]
        distances, indices = self.index.search(np.asarray([query_embedding], dtype=np.float32), k)

        # Map each FAISS vector id to its chunk text and parent-document info
        # so retrieved ids resolve with a dict lookup instead of a linear
        # scan per result.
        chunk_map = {}
        for doc in self.chunks_data:
            for chunk in doc["chunks"]:
                chunk_map[chunk["faiss_index"]] = {
                    "text": chunk["text"],
                    "Id": doc["Id"],
                    "pdf_title": doc["pdf_title"],
                    "download_link": doc["download_link"],
                }

        results = []
        for idx, dist in zip(indices[0], distances[0]):
            chunk_info = chunk_map.get(int(idx))  # FAISS pads missing hits with -1
            if chunk_info is None:
                continue
            meta = self.metadata.get(chunk_info["pdf_title"], {
                "Id": chunk_info["Id"],
                "Nom du document": chunk_info["pdf_title"],
                "Lien": chunk_info["download_link"]
            })
            results.append({
                "text": chunk_info["text"],
                "indicator": chunk_info["Id"],
                "pdf_title": meta["Nom du document"],
                "pdf_link": meta["Lien"],
                "distance": float(dist)
            })
        logging.info("Retrieved %d results from FAISS index.", len(results))
        return results

    def generate(self, query, retrieved_chunks):
        """Build a prompt from the retrieved chunks and query the LLM."""
        raw_context = "\n".join(chunk["text"] for chunk in retrieved_chunks)
        prompt_path = "src/prompts/documents_rag_prompt.txt"
        prompt = load_prompt_template(prompt_path, {
            "context": raw_context,
            "query": query
        })

        logging.info("Prompt sent to LLM:\n%s", prompt)
        return self.llm.generate(prompt)

    def get_top_docs_chunks_for_query(self, query, relevant_docs, top_k=5, chunks_file=CHUNKS_FILE, embeddings_file=EMBEDDINGS_FILE):
        """Re-rank the chunks of already-selected documents by cosine
        similarity to the query and return the top_k best matches."""
        with open(str(chunks_file), encoding="utf-8") as f:
            all_chunks_data = json.load(f)

        # Index chunk lists by both title and download link so relevant_docs
        # entries match whichever identifier they carry.
        title_map = {doc.get("pdf_title"): doc.get("chunks", []) for doc in all_chunks_data}
        link_map = {doc.get("download_link"): doc.get("chunks", []) for doc in all_chunks_data}

        all_chunks = []
        for doc in relevant_docs:
            pdf_title = doc.get("Nom du document") or doc.get("pdf_title")
            pdf_link = doc.get("Lien") or doc.get("pdf_link") or doc.get("download_link")
            chunks = title_map.get(pdf_title) or link_map.get(pdf_link)
            if chunks:
                # Copy each chunk so the lists shared by title_map and
                # link_map are not mutated in place.
                for chunk in chunks:
                    all_chunks.append({**chunk, "pdf_title": pdf_title, "pdf_link": pdf_link})

        logging.info("Total matched chunks before re-ranking: %d", len(all_chunks))

        if not all_chunks:
            logging.warning("No chunks found for the given relevant documents.")
            return []

        # Look up the precomputed embeddings of the matched chunks by their
        # FAISS row indices.
        embeddings = np.load(str(embeddings_file))
        chunk_indices = [chunk["faiss_index"] for chunk in all_chunks]
        chunk_embeddings = embeddings[chunk_indices]

        # Local import so the shared query encoder is only loaded on demand.
        from src.utils.search_docs_utils import get_model
        model = get_model()
        query_emb = model.encode([query])[0]

        # Cosine similarity between the query and every candidate chunk,
        # highest score first.
        cos_sim = util.cos_sim(torch.tensor(query_emb), torch.tensor(chunk_embeddings))[0]
        scored = sorted(zip(cos_sim.tolist(), all_chunks), reverse=True, key=lambda x: x[0])

        logging.info("Returning top-%d most relevant chunks.", min(top_k, len(scored)))
        return [chunk for score, chunk in scored[:top_k]]
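

# Minimal usage sketch. It assumes the FAISS index, chunk store, and
# embeddings referenced in src.configs.config already exist on disk and
# that GeminiWrapper can reach its API; the question string is a
# hypothetical placeholder, not part of this repository.
if __name__ == "__main__":
    pipeline = RAGPipeline()
    question = "What does the report say about energy consumption?"
    hits = pipeline.retrieve_from_pdf(question)
    for hit in hits:
        print(f'{hit["distance"]:.3f}  {hit["pdf_title"]}')
    answer = pipeline.generate(question, hits)
    print(answer)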