# retrieval_evaluation / src / retrieval.py
# (first commit, ae47781)
import json
from retrievals import TFIDFRetriever
import pprint
from retrievals import BM25Retriever
from typing import Callable, List
import numpy as np
from typing import Callable
import bm25s
import numpy as np
import Stemmer
from scipy.spatial.distance import cdist
from sklearn.feature_extraction.text import TfidfVectorizer
import asyncio
from typing import List, Union, Optional
from transformers import AutoTokenizer, AutoModel
import torch
import os
from typing import List, Optional, Union
import requests
import numpy as np
from typing import Callable, List
from scipy.spatial.distance import cdist
from embedding_function import sync_embed
from dotenv import load_dotenv
load_dotenv()
####################################################################################
# Load the cleaned corpus and normalize every record into the shape the
# retrievers below expect:
#   {"cleaned_content": <text>, "metadata": {"source": <source>}}
# Records without a "docs" key are skipped.
with open("src/data_final_cleaned.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

formatted_data = [
    {
        "cleaned_content": entry["docs"].get("content", ""),
        "metadata": {"source": entry["docs"].get("metadata", "")},
    }
    for entry in raw_data
    if "docs" in entry
]
###################################### TF-IDF ########################################
def get_retrieval_tf_idf(query):
    """Search the formatted corpus with TF-IDF and return the top-3 hits.

    Args:
        query (str): The user question to search for.

    Returns:
        dict: ``{'json': {'question': query, 'results': [...]}}`` where each
        result dict carries ``content`` (document text), ``metadata``
        (document source) and ``score`` (float relevance score).
    """
    # Build and index the retriever only once, then reuse it across calls:
    # re-indexing the whole corpus on every query is pure overhead.
    retriever = getattr(get_retrieval_tf_idf, "_retriever", None)
    if retriever is None:
        retriever = TFIDFRetriever()
        retriever.index_data(formatted_data)
        get_retrieval_tf_idf._retriever = retriever
    results = retriever.search(query, k=3)
    formatted_results = {
        'json': {
            'question': query,
            'results': []
        }
    }
    for result in results:
        formatted_results['json']['results'].append({
            'content': result['text'],
            'metadata': result['source'],
            'score': float(result['score'])
        })
    return formatted_results
##################################BM25##########################################
def get_retrieval_bm25(query):
    """Search the formatted corpus with BM25 and return the top-3 hits.

    Args:
        query (str): The user question to search for.

    Returns:
        dict: ``{'json': {'question': query, 'results': [...]}}`` where each
        result dict carries ``content`` (document text), ``metadata``
        (document source) and ``score`` (float relevance score).
    """
    # Build and index the retriever only once, then reuse it across calls:
    # re-indexing the whole corpus on every query is pure overhead.
    retriever = getattr(get_retrieval_bm25, "_retriever", None)
    if retriever is None:
        retriever = BM25Retriever()
        retriever.index_data(formatted_data)
        get_retrieval_bm25._retriever = retriever
    results = retriever.search(query, k=3)
    formatted_results = {
        'json': {
            'question': query,
            'results': []
        }
    }
    for result in results:
        formatted_results['json']['results'].append({
            'content': result['text'],
            'metadata': result['source'],
            'score': float(result['score'])
        })
    return formatted_results
#######################################dense retrieval###################################
import numpy as np
from typing import Callable, List
from scipy.spatial.distance import cdist
import pickle
import os
class DenseRetriever:
    """
    A retriever that uses dense embeddings for indexing and searching documents.

    Attributes:
        vectorizer (Callable): Function mapping a list of texts to embeddings.
        index (np.ndarray | None): The (n_docs, dim) document-embedding matrix.
        data (list | None): The indexed documents.
    """

    def __init__(self, vectorizer: Callable):
        """
        Initialize the DenseRetriever.

        Args:
            vectorizer (Callable): The function used to generate embeddings.
        """
        self.vectorizer = vectorizer
        self.index = None
        self.data = None

    def load_index(self, filepath: str):
        """
        Load the index and metadata from a pickle file.

        Args:
            filepath (str): Path to the .pkl file containing 'index' and 'data'.
        """
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # cache files produced by this project, never untrusted input.
        with open(filepath, 'rb') as f:
            saved = pickle.load(f)
        self.index = saved['index']
        self.data = saved['data']

    def index_data(self, data: List[dict]):
        """
        Indexes the provided data using dense embeddings.

        Args:
            data (list): A list of documents to be indexed. Each document should
                be a dictionary containing a key 'cleaned_content' with the text
                to be indexed (and a 'metadata' dict with a 'source' key, which
                search() reads when building its results).
        """
        self.data = data
        docs = [doc["cleaned_content"] for doc in data]
        embeddings = self.vectorizer(docs)
        self.index = np.array(embeddings)

    def search(self, query: str, k: int = 5) -> List[dict]:
        """
        Searches the indexed data for the given query using cosine similarity.

        Args:
            query (str): The search query.
            k (int): The number of top results to return.

        Returns:
            list: Dictionaries with the source, text, and score (1 - cosine
            distance) of the top-k results.

        Raises:
            ValueError: If no index has been built/loaded, if the vectorizer
                returns None, or if the embedding shapes are incompatible.
        """
        # Guard: without this, an unbuilt index fails later with an opaque
        # AttributeError on `None.ndim`.
        if self.index is None or self.data is None:
            raise ValueError("No index available; call index_data() or load_index() first.")
        query_embedding = self.vectorizer([query])  # must return a list or np.ndarray
        if query_embedding is None:
            raise ValueError("La fonction vectorizer a retourné None.")
        query_embedding = np.array(query_embedding)
        if query_embedding.ndim == 1:
            # Promote a flat vector to shape (1, dim) so cdist accepts it.
            query_embedding = query_embedding[np.newaxis, :]
        if query_embedding.ndim != 2:
            raise ValueError("query_embedding doit être un tableau 2D.")
        if self.index.ndim != 2:
            raise ValueError("L'index dense doit être un tableau 2D.")
        if self.index.shape[1] != query_embedding.shape[1]:
            raise ValueError(f"Dimensions incompatibles entre query ({query_embedding.shape[1]}) et index ({self.index.shape[1]}).")
        cosine_distances = cdist(query_embedding, self.index, metric="cosine")[0]
        top_k_indices = cosine_distances.argsort()[:k]
        output = []
        for idx in top_k_indices:
            output.append(
                {
                    "source": self.data[idx]["metadata"]["source"],
                    "text": self.data[idx]["cleaned_content"],
                    # Convert distance back to a similarity score in [0, 2]→[−1, 1].
                    "score": 1 - cosine_distances[idx],
                }
            )
        return output

    def predict(self, query: str, k: int) -> List[dict]:
        """Alias for :meth:`search`, kept for evaluation-harness compatibility."""
        return self.search(query, k)
import os
import pickle
def get_retrieval_dense(query, model=None, api_key=None):
    """Search a pre-computed dense index (cached on disk) for the top-3 hits.

    Args:
        query (str): The user question to search for.
        model (str | list): Hugging Face model id whose cached index is used;
            if a list is passed, only its first element is used.
        api_key (str, optional): Unused here; the key is read from the
            ``HF_API_KEY`` environment variable instead.

    Returns:
        dict: ``{'json': {'question': query, 'results': [...]}}`` where each
        result dict carries ``content``, ``metadata`` and ``score``.

    Raises:
        ValueError: If ``model`` is None.
        FileNotFoundError: If no cached index exists for ``model``.
    """
    if model is None:
        raise ValueError("Model must be specified")
    if isinstance(model, list):
        model = model[0]  # defensive: some callers pass a one-element list
    # Cached indexes are stored as "embeddings_cache/<model-basename>.pkl".
    model_filename = model.split("/")[-1] + ".pkl"
    index_path = os.path.join("embeddings_cache", model_filename)
    if not os.path.exists(index_path):
        raise FileNotFoundError(f"L'index pour le modèle {model} est introuvable à l'emplacement : {index_path}")
    dr = DenseRetriever(
        vectorizer=lambda docs: sync_embed(texts=docs, model=model, api_key=os.getenv("HF_API_KEY"))
    )
    # Reuse the class's own loader instead of duplicating its pickle handling.
    dr.load_index(index_path)
    # Run the search.
    results = dr.search(query, k=3)
    formatted_results = {
        'json': {
            'question': query,
            'results': []
        }
    }
    for result in results:
        formatted_results['json']['results'].append({
            'content': result['text'],
            'metadata': result['source'],
            'score': float(result['score'])
        })
    return formatted_results