import json from retrievals import TFIDFRetriever import pprint from retrievals import BM25Retriever from typing import Callable, List import numpy as np from typing import Callable import bm25s import numpy as np import Stemmer from scipy.spatial.distance import cdist from sklearn.feature_extraction.text import TfidfVectorizer import asyncio from typing import List, Union, Optional from transformers import AutoTokenizer, AutoModel import torch import os from typing import List, Optional, Union import requests import numpy as np from typing import Callable, List from scipy.spatial.distance import cdist from embedding_function import sync_embed from dotenv import load_dotenv load_dotenv() #################################################################################### with open("src/data_final_cleaned.json", "r", encoding="utf-8") as f: raw_data = json.load(f) formatted_data = [] for item in raw_data: if "docs" in item: metadata_value = item["docs"].get("metadata", "") content_value = item["docs"].get("content", "") formatted_data.append({ "cleaned_content": content_value, "metadata": {"source": metadata_value} }) ######################################"TF_IDF######################################## def get_retrieval_tf_idf(query): tfidf_retriever = TFIDFRetriever() tfidf_retriever.index_data(formatted_data) results = tfidf_retriever.search(query, k=3) formatted_results = { 'json': { 'question': query, 'results': [] } } for result in results: formatted_results['json']['results'].append({ 'content': result['text'], 'metadata': result['source'], 'score': float(result['score']) }) return formatted_results ##################################BM25########################################## def get_retrieval_bm25(query): bm25_retriever = BM25Retriever() bm25_retriever.index_data(formatted_data) results = bm25_retriever.search(query, k=3) formatted_results = { 'json': { 'question': query, 'results': [] } } for result in results: formatted_results['json']['results'].append({ 'content': result['text'], 'metadata': result['source'], 'score': float(result['score']) }) return formatted_results #######################################dense retrieval################################### import numpy as np from typing import Callable, List from scipy.spatial.distance import cdist import pickle import os class DenseRetriever: """ A retriever model that uses dense embeddings for indexing and searching documents. Attributes: vectorizer (Callable): The function used to generate embeddings. index (np.ndarray): The indexed embeddings. data (list): The data to be indexed. """ def __init__(self, vectorizer: Callable): """ Initialize the DenseRetriever. Args: vectorizer (Callable): The function to generate embeddings. """ self.vectorizer = vectorizer self.index = None self.data = None def load_index(self, filepath: str): """ Load the index and metadata from a pickle file. Args: filepath (str): Path to the .pkl file containing 'index' and 'data'. """ with open(filepath, 'rb') as f: saved = pickle.load(f) self.index = saved['index'] self.data = saved['data'] def index_data(self, data: List[dict]): """ Indexes the provided data using dense embeddings. Args: data (list): A list of documents to be indexed. Each document should be a dictionary containing a key 'cleaned_content' with the text to be indexed. """ self.data = data docs = [doc["cleaned_content"] for doc in data] embeddings = self.vectorizer(docs) self.index = np.array(embeddings) def search(self, query: str, k: int = 5) -> List[dict]: """ Searches the indexed data for the given query using cosine similarity. Args: query (str): The search query. k (int): The number of top results to return. Returns: list: A list of dictionaries containing the source, text, and score of the top-k results. """ query_embedding = self.vectorizer([query]) # Doit retourner une liste ou np.ndarray # Vérification du résultat if query_embedding is None: raise ValueError("La fonction vectorizer a retourné None.") query_embedding = np.array(query_embedding) if query_embedding.ndim == 1: query_embedding = query_embedding[np.newaxis, :] # le transformer en (1, dim) if query_embedding.ndim != 2: raise ValueError("query_embedding doit être un tableau 2D.") if self.index.ndim != 2: raise ValueError("L'index dense doit être un tableau 2D.") if self.index.shape[1] != query_embedding.shape[1]: raise ValueError(f"Dimensions incompatibles entre query ({query_embedding.shape[1]}) et index ({self.index.shape[1]}).") cosine_distances = cdist(query_embedding, self.index, metric="cosine")[0] top_k_indices = cosine_distances.argsort()[:k] output = [] for idx in top_k_indices: output.append( { "source": self.data[idx]["metadata"]["source"], "text": self.data[idx]["cleaned_content"], "score": 1 - cosine_distances[idx], } ) return output def predict(self, query: str, k: int) -> List[dict]: return self.search(query, k) import os import pickle def get_retrieval_dense(query, model=None, api_key=None): if model is None: raise ValueError("Model must be specified") if isinstance(model, list): model = model[0] # Sécurisation model_filename = model.split("/")[-1] + ".pkl" index_path = os.path.join("embeddings_cache", model_filename) if not os.path.exists(index_path): raise FileNotFoundError(f"L'index pour le modèle {model} est introuvable à l'emplacement : {index_path}") with open(index_path, "rb") as f: saved = pickle.load(f) dr = DenseRetriever(vectorizer=lambda docs: sync_embed(texts=docs, model=f"{model}", api_key=os.getenv("HF_API_KEY"))) # Attribuer les valeurs du dictionnaire à l'instance dr.index = saved["index"] dr.data = saved["data"] # Exécuter la recherche results = dr.search(query, k=3) formatted_results = { 'json': { 'question': query, 'results': [] } } for result in results: formatted_results['json']['results'].append({ 'content': result['text'], 'metadata': result['source'], 'score': float(result['score']) }) return formatted_results