Spaces:
Sleeping
Sleeping
| import json | |
| from retrievals import TFIDFRetriever | |
| import pprint | |
| from retrievals import BM25Retriever | |
| from typing import Callable, List | |
| import numpy as np | |
| from typing import Callable | |
| import bm25s | |
| import numpy as np | |
| import Stemmer | |
| from scipy.spatial.distance import cdist | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| import asyncio | |
| from typing import List, Union, Optional | |
| from transformers import AutoTokenizer, AutoModel | |
| import torch | |
| import os | |
| from typing import List, Optional, Union | |
| import requests | |
| import numpy as np | |
| from typing import Callable, List | |
| from scipy.spatial.distance import cdist | |
| from embedding_function import sync_embed | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
####################################################################################
# Load the cleaned corpus and reshape each record into the schema the retrievers
# expect: {"cleaned_content": <text>, "metadata": {"source": <source>}}.
# Records without a "docs" key are skipped.
with open("src/data_final_cleaned.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

formatted_data = [
    {
        "cleaned_content": item["docs"].get("content", ""),
        "metadata": {"source": item["docs"].get("metadata", "")},
    }
    for item in raw_data
    if "docs" in item
]
###################################### TF_IDF ########################################
def get_retrieval_tf_idf(query):
    """Retrieve the top-3 TF-IDF matches for *query* from the formatted corpus.

    Args:
        query (str): The user question to search for.

    Returns:
        dict: ``{'json': {'question': query, 'results': [...]}}`` where each
        result has ``content``, ``metadata`` (source string) and ``score``.
    """
    # Build the TF-IDF index once and reuse it on later calls: the corpus
    # (formatted_data) is static, so re-indexing on every query is wasted work.
    retriever = getattr(get_retrieval_tf_idf, "_retriever", None)
    if retriever is None:
        retriever = TFIDFRetriever()
        retriever.index_data(formatted_data)
        get_retrieval_tf_idf._retriever = retriever

    results = retriever.search(query, k=3)
    return {
        'json': {
            'question': query,
            'results': [
                {
                    'content': result['text'],
                    'metadata': result['source'],
                    'score': float(result['score']),
                }
                for result in results
            ],
        }
    }
################################## BM25 ##########################################
def get_retrieval_bm25(query):
    """Retrieve the top-3 BM25 matches for *query* from the formatted corpus.

    Args:
        query (str): The user question to search for.

    Returns:
        dict: ``{'json': {'question': query, 'results': [...]}}`` where each
        result has ``content``, ``metadata`` (source string) and ``score``.
    """
    # Build the BM25 index once and reuse it on later calls: the corpus
    # (formatted_data) is static, so re-indexing on every query is wasted work.
    retriever = getattr(get_retrieval_bm25, "_retriever", None)
    if retriever is None:
        retriever = BM25Retriever()
        retriever.index_data(formatted_data)
        get_retrieval_bm25._retriever = retriever

    results = retriever.search(query, k=3)
    return {
        'json': {
            'question': query,
            'results': [
                {
                    'content': result['text'],
                    'metadata': result['source'],
                    'score': float(result['score']),
                }
                for result in results
            ],
        }
    }
| #######################################dense retrieval################################### | |
| import numpy as np | |
| from typing import Callable, List | |
| from scipy.spatial.distance import cdist | |
| import pickle | |
| import os | |
class DenseRetriever:
    """
    A retriever model that uses dense embeddings for indexing and searching documents.

    Attributes:
        vectorizer (Callable): The function used to generate embeddings; maps a
            list of texts to a 2D array-like of embeddings.
        index (np.ndarray): The indexed embeddings, shape (n_docs, dim);
            ``None`` until ``index_data`` or ``load_index`` is called.
        data (list): The indexed documents; ``None`` until indexed/loaded.
    """

    def __init__(self, vectorizer: Callable):
        """
        Initialize the DenseRetriever.

        Args:
            vectorizer (Callable): The function to generate embeddings.
        """
        self.vectorizer = vectorizer
        self.index = None
        self.data = None

    def load_index(self, filepath: str):
        """
        Load the index and metadata from a pickle file.

        Args:
            filepath (str): Path to the .pkl file containing 'index' and 'data'.

        Note:
            ``pickle.load`` can execute arbitrary code while deserializing —
            only load index files from trusted sources.
        """
        with open(filepath, 'rb') as f:
            saved = pickle.load(f)
        self.index = saved['index']
        self.data = saved['data']

    def index_data(self, data: List[dict]):
        """
        Indexes the provided data using dense embeddings.

        Args:
            data (list): A list of documents to be indexed. Each document should be a dictionary
                containing a key 'cleaned_content' with the text to be indexed.
        """
        self.data = data
        docs = [doc["cleaned_content"] for doc in data]
        embeddings = self.vectorizer(docs)
        self.index = np.array(embeddings)

    def search(self, query: str, k: int = 5) -> List[dict]:
        """
        Searches the indexed data for the given query using cosine similarity.

        Args:
            query (str): The search query.
            k (int): The number of top results to return.

        Returns:
            list: A list of dictionaries containing the source, text, and score
            (1 - cosine distance) of the top-k results.

        Raises:
            ValueError: If no index has been built/loaded, if the vectorizer
                returns None or a non-2D embedding, or if the query and index
                embedding dimensions differ.
        """
        # Fail fast with a clear message instead of an AttributeError on
        # ``self.index.ndim`` when the retriever was never indexed/loaded.
        if self.index is None:
            raise ValueError("Index is empty: call index_data() or load_index() first.")
        query_embedding = self.vectorizer([query])  # Must return a list or np.ndarray
        if query_embedding is None:
            raise ValueError("La fonction vectorizer a retourné None.")
        query_embedding = np.array(query_embedding)
        if query_embedding.ndim == 1:
            # Promote a flat vector to shape (1, dim) so cdist accepts it.
            query_embedding = query_embedding[np.newaxis, :]
        if query_embedding.ndim != 2:
            raise ValueError("query_embedding doit être un tableau 2D.")
        if self.index.ndim != 2:
            raise ValueError("L'index dense doit être un tableau 2D.")
        if self.index.shape[1] != query_embedding.shape[1]:
            raise ValueError(f"Dimensions incompatibles entre query ({query_embedding.shape[1]}) et index ({self.index.shape[1]}).")

        cosine_distances = cdist(query_embedding, self.index, metric="cosine")[0]
        top_k_indices = cosine_distances.argsort()[:k]
        output = []
        for idx in top_k_indices:
            output.append(
                {
                    "source": self.data[idx]["metadata"]["source"],
                    "text": self.data[idx]["cleaned_content"],
                    # Convert cosine distance back to a similarity score.
                    "score": 1 - cosine_distances[idx],
                }
            )
        return output

    def predict(self, query: str, k: int) -> List[dict]:
        """Alias for :meth:`search`, kept for a predictor-style interface."""
        return self.search(query, k)
| import os | |
| import pickle | |
def get_retrieval_dense(query, model=None, api_key=None):
    """Retrieve the top-3 dense-embedding matches for *query*.

    Args:
        query (str): The user question to search for.
        model (str | list): Hugging Face embedding model id; if a list is
            passed, only its first element is used.
        api_key (str, optional): Unused — the key is read from the HF_API_KEY
            environment variable. Kept for backward interface compatibility.

    Returns:
        dict: ``{'json': {'question': query, 'results': [...]}}`` where each
        result has ``content``, ``metadata`` (source string) and ``score``.

    Raises:
        ValueError: If no model is specified.
        FileNotFoundError: If no cached index exists for the model.
    """
    if model is None:
        raise ValueError("Model must be specified")
    if isinstance(model, list):
        model = model[0]  # Defensive: accept a single-element list of model ids.

    # Cache one loaded retriever per model: unpickling the index file on every
    # call is expensive and the on-disk index does not change between queries.
    cache = getattr(get_retrieval_dense, "_retrievers", None)
    if cache is None:
        cache = {}
        get_retrieval_dense._retrievers = cache

    dr = cache.get(model)
    if dr is None:
        model_filename = model.split("/")[-1] + ".pkl"
        index_path = os.path.join("embeddings_cache", model_filename)
        if not os.path.exists(index_path):
            raise FileNotFoundError(f"L'index pour le modèle {model} est introuvable à l'emplacement : {index_path}")
        dr = DenseRetriever(vectorizer=lambda docs: sync_embed(texts=docs, model=f"{model}", api_key=os.getenv("HF_API_KEY")))
        # Reuse the class loader instead of assigning .index/.data directly.
        dr.load_index(index_path)
        cache[model] = dr

    results = dr.search(query, k=3)
    return {
        'json': {
            'question': query,
            'results': [
                {
                    'content': result['text'],
                    'metadata': result['source'],
                    'score': float(result['score']),
                }
                for result in results
            ],
        }
    }