Spaces:

samiha123
/

retrieval_evaluation

Sleeping

App Files Files Community

samiha123 committed on Jun 28, 2025

Commit

07c3ebf

1 Parent(s): 3e8c377

first commit

Browse files

Files changed (13) hide show

requirements.txt +7 -3
src/data_final_cleaned.json +0 -0
src/embedding_function.py +58 -0
src/embeddings_cache/all-MiniLM-L6-v2.pkl +3 -0
src/embeddings_cache/distiluse-base-multilingual-cased-v2.pkl +3 -0
src/embeddings_cache/e5-small-v2.pkl +3 -0
src/embeddings_cache/multi-qa-MiniLM-L6-cos-v1.pkl +3 -0
src/embeddings_cache/multilingual-e5-large.pkl +3 -0
src/embeddings_cache/multilingual-e5-small.pkl +3 -0
src/embeddings_cache/paraphrase-mpnet-base-v2.pkl +3 -0
src/retrieval.py +247 -0
src/retrievals.py +243 -0
src/streamlit_app.py +115 -38

requirements.txt CHANGED Viewed

@@ -1,3 +1,7 @@
-altair
-pandas
-streamlit

+streamlit==1.41.1
+scipy
+pystemmer
+scikit-learn
+bm25s
+transformers
+torch

src/data_final_cleaned.json ADDED Viewed

The diff for this file is too large to render. See raw diff

src/embedding_function.py ADDED Viewed

	@@ -0,0 +1,58 @@

+"""
+This module provides functionality to embed texts using the Hugging Face API.
+It includes an EmbeddingFunction class (constructor only in this revision; no embedding
+method is defined on it yet) and a sync_embed function for synchronous embedding.
+"""
+from huggingface_hub import InferenceClient
+import asyncio
+import os
+from typing import List, Optional, Union
+import os
+from huggingface_hub import InferenceClient
+import httpx
+from dotenv import load_dotenv
+load_dotenv()
+TextType = Union[str, List[str]]
+class EmbeddingFunction:
+    """
+    Holds the settings needed to embed texts through the Hugging Face API.
+
+    NOTE(review): only the constructor is defined in this revision; no
+    embedding method exists on the class yet.
+    """
+    def __init__(
+        self,
+        model: str,
+        api_key: Optional[str] = None,
+        batch_size: int = 50,
+        api_url: Optional[str] = None,
+    ):
+        """
+        Initialize the EmbeddingFunction.
+        Args:
+            model (str): The model to use for embedding.
+            api_key (Optional[str]): The API key for the Hugging Face API. If not provided,
+                it is read from the environment variable `HF_API_KEY`.
+            batch_size (int): The number of texts to process in a single batch. Default is 50.
+            api_url (Optional[str]): Custom API URL for Hugging Face inference endpoint.
+        """
+        self.model = model
+        # Honor the documented fallback so callers can rely on the environment.
+        self.api_key = api_key if api_key is not None else os.getenv("HF_API_KEY")
+        self.batch_size = batch_size
+        self.api_url = api_url
+def sync_embed(texts: str, model: str, api_key: str) -> list:
+    """
+    Fetch embeddings for a text through the Hugging Face Inference API.
+    Args:
+        texts (str): The text to encode.
+        model (str): The Hugging Face model to use.
+        api_key (str): The Hugging Face API key.
+    Returns:
+        list: The first element of the feature-extraction response.
+    """
+    hf_client = InferenceClient(provider="hf-inference", api_key=api_key)
+    # Only the leading entry of the response is handed back to the caller.
+    return hf_client.feature_extraction(texts, model=model)[0]

src/embeddings_cache/all-MiniLM-L6-v2.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8df55f6acd3449885335d0199fa46ea1f243d627370c0d741c71f96ac9ee9a05
+size 1875998

src/embeddings_cache/distiluse-base-multilingual-cased-v2.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:57f452770dd3ba3527bbd587465b0ea6ae6d29c27ecc5278a5b56c1d7adad52c
+size 3485726

src/embeddings_cache/e5-small-v2.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ed0114d23bdee9ad0fc620f5f593e4ca20c43173539c8f9bd2f3cc9b807f8da
+size 4439841

src/embeddings_cache/multi-qa-MiniLM-L6-cos-v1.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7aebb55ef48daaf077e1ccca89345ea530b8eeccb3d3b1216388c93025a10456
+size 4439841

src/embeddings_cache/multilingual-e5-large.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f5871f6ba37ea5bae28531c12169148c36cf7ecd414a4af5aadbc01ca77890c3
+size 2434314

src/embeddings_cache/multilingual-e5-small.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a42cf13b938bd83500736c2644758950a0b2ad5aadc2b11d5c7b719e319eead1
+size 1875998

src/embeddings_cache/paraphrase-mpnet-base-v2.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:78c5816feafe64571efe8a43cd3db4c7c12f4f040e596c8eb09af7b60f58429e
+size 1897738

src/retrieval.py ADDED Viewed

	@@ -0,0 +1,247 @@

+import json
+from retrievals import TFIDFRetriever
+import pprint
+from retrievals import BM25Retriever
+from typing import Callable, List
+import numpy as np
+from typing import Callable
+import bm25s
+import numpy as np
+import Stemmer
+from scipy.spatial.distance import cdist
+from sklearn.feature_extraction.text import TfidfVectorizer
+import asyncio
+from typing import List, Union, Optional
+from transformers import AutoTokenizer, AutoModel
+import torch
+import os
+from typing import List, Optional, Union
+import requests
+import numpy as np
+from typing import Callable, List
+from scipy.spatial.distance import cdist
+from retrieval_evaluation.src.embedding_function import sync_embed
+####################################################################################
+# The corpus ships in the same directory as this module; anchor the path there
+# so loading does not depend on the process working directory.
+_DATA_PATH = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), "data_final_cleaned.json"
+)
+with open(_DATA_PATH, "r", encoding="utf-8") as f:
+    raw_data = json.load(f)
+# Keep only entries that carry a "docs" object and reshape them into the
+# {"cleaned_content", "metadata": {"source"}} layout the retrievers read.
+formatted_data = []
+for item in raw_data:
+    if "docs" in item:
+        metadata_value = item["docs"].get("metadata", "")
+        content_value = item["docs"].get("content", "")
+        formatted_data.append({
+            "cleaned_content": content_value,
+            "metadata": {"source": metadata_value}
+        })
+######################################TF_IDF########################################
+def get_retrieval_tf_idf(query):
+    """
+    Return the top-3 TF-IDF matches for `query` from the module-level corpus.
+    Args:
+        query (str): The search query.
+    Returns:
+        dict: {'json': {'question': query, 'results': [...]}} where each result
+            holds 'content', 'metadata' and a float 'score'.
+    """
+    # Fitting TF-IDF over the corpus does not depend on the query, so build the
+    # retriever once and reuse it on later calls instead of refitting each time.
+    tfidf_retriever = getattr(get_retrieval_tf_idf, "_retriever", None)
+    if tfidf_retriever is None:
+        tfidf_retriever = TFIDFRetriever()
+        tfidf_retriever.index_data(formatted_data)
+        get_retrieval_tf_idf._retriever = tfidf_retriever
+    results = tfidf_retriever.search(query, k=3)
+    return {
+        'json': {
+            'question': query,
+            'results': [
+                {
+                    'content': result['text'],
+                    'metadata': result['source'],
+                    # Cast so the payload stays JSON-serializable.
+                    'score': float(result['score'])
+                }
+                for result in results
+            ]
+        }
+    }
+##################################BM25##########################################
+def get_retrieval_bm25(query):
+    """
+    Return the top-3 BM25 matches for `query` from the module-level corpus.
+    Args:
+        query (str): The search query.
+    Returns:
+        dict: {'json': {'question': query, 'results': [...]}} where each result
+            holds 'content', 'metadata' and a float 'score'.
+    """
+    # Tokenizing and indexing the corpus does not depend on the query, so build
+    # the retriever once and reuse it on later calls.
+    bm25_retriever = getattr(get_retrieval_bm25, "_retriever", None)
+    if bm25_retriever is None:
+        bm25_retriever = BM25Retriever()
+        bm25_retriever.index_data(formatted_data)
+        get_retrieval_bm25._retriever = bm25_retriever
+    results = bm25_retriever.search(query, k=3)
+    return {
+        'json': {
+            'question': query,
+            'results': [
+                {
+                    'content': result['text'],
+                    'metadata': result['source'],
+                    # Cast so the payload stays JSON-serializable.
+                    'score': float(result['score'])
+                }
+                for result in results
+            ]
+        }
+    }
+    #######################################dense retrieval###################################
+import numpy as np
+from typing import Callable, List
+from scipy.spatial.distance import cdist
+import pickle
+import os
+class DenseRetriever:
+    """
+    A retriever model that uses dense embeddings for indexing and searching documents.
+    Attributes:
+        vectorizer (Callable): The function used to generate embeddings.
+        index (np.ndarray): The indexed embeddings.
+        data (list): The data to be indexed.
+    """
+    def __init__(self, vectorizer: Callable):
+        """
+        Initialize the DenseRetriever.
+        Args:
+            vectorizer (Callable): The function to generate embeddings.
+        """
+        self.vectorizer = vectorizer
+        self.index = None
+        self.data = None
+    def load_index(self, filepath: str):
+        """
+        Load the index and metadata from a pickle file.
+        Args:
+            filepath (str): Path to the .pkl file containing 'index' and 'data'.
+        """
+        # NOTE(review): unpickling can execute arbitrary code; only load index
+        # files produced by this project.
+        with open(filepath, 'rb') as f:
+            saved = pickle.load(f)
+        self.index = saved['index']
+        self.data = saved['data']
+    def index_data(self, data: List[dict]):
+        """
+        Indexes the provided data using dense embeddings.
+        Args:
+            data (list): A list of documents to be indexed. Each document should be a dictionary
+                         containing a key 'cleaned_content' with the text to be indexed.
+        """
+        self.data = data
+        docs = [doc["cleaned_content"] for doc in data]
+        embeddings = self.vectorizer(docs)
+        self.index = np.array(embeddings)
+    def search(self, query: str, k: int = 5) -> List[dict]:
+        """
+        Searches the indexed data for the given query using cosine similarity.
+        Args:
+            query (str): The search query.
+            k (int): The number of top results to return.
+        Returns:
+            list: A list of dictionaries containing the source, text, and score of the top-k results.
+        Raises:
+            ValueError: If no index is loaded, the vectorizer returns None, or the
+                query/index arrays are not 2-D with matching widths.
+        """
+        # Fail before calling the vectorizer (possibly a remote API) when there
+        # is nothing to search against.
+        if self.index is None or self.data is None:
+            raise ValueError("L'index dense n'est pas initialisé.")
+        query_embedding = self.vectorizer([query])  # expected: list or np.ndarray
+        if query_embedding is None:
+            raise ValueError("La fonction vectorizer a retourné None.")
+        query_embedding = np.array(query_embedding)
+        if query_embedding.ndim == 1:
+            query_embedding = query_embedding[np.newaxis, :]  # promote to (1, dim)
+        if query_embedding.ndim != 2:
+            raise ValueError("query_embedding doit être un tableau 2D.")
+        # A pickled index may be a plain list; coerce so .ndim/.shape are available.
+        index = np.asarray(self.index)
+        if index.ndim != 2:
+            raise ValueError("L'index dense doit être un tableau 2D.")
+        if index.shape[1] != query_embedding.shape[1]:
+            raise ValueError(f"Dimensions incompatibles entre query ({query_embedding.shape[1]}) et index ({index.shape[1]}).")
+        cosine_distances = cdist(query_embedding, index, metric="cosine")[0]
+        # Smallest distance first == most similar first.
+        top_k_indices = cosine_distances.argsort()[:k]
+        return [
+            {
+                "source": self.data[idx]["metadata"]["source"],
+                "text": self.data[idx]["cleaned_content"],
+                "score": 1 - cosine_distances[idx],
+            }
+            for idx in top_k_indices
+        ]
+    def predict(self, query: str, k: int) -> List[dict]:
+        """Alias for `search(query, k)`."""
+        return self.search(query, k)
+import os
+import pickle
+def get_retrieval_dense(query, model=None, api_key=None):
+    """
+    Return the top-3 dense-retrieval matches for `query` using a cached index.
+    Args:
+        query (str): The search query.
+        model (str | list): Hugging Face model id; when a list is given its first
+            entry is used. The part after the last "/" names the cached .pkl file.
+        api_key (Optional[str]): Hugging Face API key used to embed the query.
+            Falls back to the `HF_API_KEY` environment variable when omitted.
+    Returns:
+        dict: {'json': {'question': query, 'results': [...]}} where each result
+            holds 'content', 'metadata' and a float 'score'.
+    Raises:
+        ValueError: If `model` is None.
+        FileNotFoundError: If no cached index exists for the model.
+    """
+    if model is None:
+        raise ValueError("Model must be specified")
+    if isinstance(model, list):
+        model = model[0]  # tolerate a list of candidates; use the first
+    model_filename = model.split("/")[-1] + ".pkl"
+    # The cache directory ships beside this module; anchor the lookup there
+    # rather than on the process working directory.
+    index_path = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "embeddings_cache", model_filename
+    )
+    if not os.path.exists(index_path):
+        raise FileNotFoundError(f"L'index pour le modèle {model} est introuvable à l'emplacement : {index_path}")
+    # NOTE(review): unpickling can execute arbitrary code; only load index
+    # files produced by this project.
+    with open(index_path, "rb") as f:
+        saved = pickle.load(f)
+    # Use the caller-supplied key; it used to be ignored in favor of the env var.
+    resolved_key = api_key if api_key is not None else os.getenv("HF_API_KEY")
+    dr = DenseRetriever(vectorizer=lambda docs: sync_embed(texts=docs, model=model, api_key=resolved_key))
+    # Attach the precomputed embeddings and their documents to the retriever.
+    dr.index = saved["index"]
+    dr.data = saved["data"]
+    results = dr.search(query, k=3)
+    return {
+        'json': {
+            'question': query,
+            'results': [
+                {
+                    'content': result['text'],
+                    'metadata': result['source'],
+                    # Cast so the payload stays JSON-serializable.
+                    'score': float(result['score'])
+                }
+                for result in results
+            ]
+        }
+    }

src/retrievals.py ADDED Viewed

	@@ -0,0 +1,243 @@

+"""
+This module contains implementations of various retriever models for document retrieval.
+"""
+from typing import Callable
+import bm25s
+import numpy as np
+import Stemmer
+from scipy.spatial.distance import cdist
+from sklearn.feature_extraction.text import TfidfVectorizer
+import asyncio
+from typing import List, Union, Optional
+from transformers import AutoTokenizer, AutoModel
+import torch
+import os
+from typing import List, Optional, Union
+import requests
+import numpy as np
+from typing import Callable, List
+from scipy.spatial.distance import cdist
+class TFIDFRetriever:
+    """
+    Document retriever backed by a TF-IDF bag-of-words model.
+    Attributes:
+        vectorizer (TfidfVectorizer): Fitted on the corpus by `index_data`.
+        index (scipy.sparse matrix): TF-IDF rows, one per indexed document.
+        data (list): The documents handed to `index_data`.
+        stemmer: An English Stemmer instance created at construction time.
+    """
+    def __init__(self):
+        self.data = None
+        self.index = None
+        self.vectorizer = TfidfVectorizer()
+        self.stemmer = Stemmer.Stemmer("english")
+    def index_data(self, data):
+        """
+        Fit the vectorizer on `data` and keep the resulting matrix.
+        Args:
+            data (list): Documents as dictionaries; the text is read from the
+                         'cleaned_content' key of each one.
+        """
+        self.data = data
+        texts = [entry["cleaned_content"] for entry in data]
+        self.index = self.vectorizer.fit_transform(texts)
+    def search(self, query, k=5):
+        """
+        Rank the indexed documents against `query` by cosine similarity.
+        Args:
+            query (str): The search query.
+            k (int): How many results to return. Default is 5.
+        Returns:
+            list: Dictionaries with 'source', 'text' and 'score' for the k closest documents.
+        """
+        query_matrix = self.vectorizer.transform([query]).toarray()
+        corpus_matrix = self.index.toarray()
+        distances = cdist(query_matrix, corpus_matrix, metric="cosine")[0]
+        # Ascending distance, i.e. most similar documents first.
+        ranked = distances.argsort()[:k]
+        return [
+            {
+                "source": self.data[doc_id]["metadata"]["source"],
+                "text": self.data[doc_id]["cleaned_content"],
+                "score": 1 - distances[doc_id],
+            }
+            for doc_id in ranked
+        ]
+    def predict(self, query: str, k: int):
+        """
+        Return the k best matches for `query`; delegates to `search`.
+        Args:
+            query (str): The search query.
+            k (int): How many results to return.
+        Returns:
+            list: Dictionaries with 'source', 'text' and 'score' for the k closest documents.
+        """
+        return self.search(query, k)
+################################################BM25##########################################
+class BM25Retriever:
+    """
+    Document retriever backed by a bm25s BM25 index.
+    Attributes:
+        index (bm25s.BM25): The BM25 index.
+        data (list): The documents handed to `index_data`.
+    """
+    def __init__(self):
+        self.data = None
+        self.index = bm25s.BM25()
+    def index_data(self, data):
+        """
+        Tokenize `data` and build the BM25 index from it.
+        Args:
+            data (list): Documents as dictionaries; the text is read from the
+                         'cleaned_content' key of each one.
+        """
+        self.data = data
+        texts = [entry["cleaned_content"] for entry in data]
+        self.index.index(bm25s.tokenize(texts, show_progress=False), show_progress=False)
+    def search(self, query, k=5):
+        """
+        Rank the indexed documents against `query` with BM25.
+        Args:
+            query (str): The search query.
+            k (int): How many results to return. Default is 5.
+        Returns:
+            list: Dictionaries with 'source', 'text' and 'score' for the top-k documents.
+        """
+        tokens = bm25s.tokenize(query, show_progress=False)
+        # Passing corpus=self.data makes retrieve() hand back the documents
+        # themselves; only row 0 (the single query) is read below.
+        hits, hit_scores = self.index.retrieve(
+            tokens, corpus=self.data, k=k, show_progress=False
+        )
+        return [
+            {
+                "source": hits[0, rank]["metadata"]["source"],
+                "text": hits[0, rank]["cleaned_content"],
+                "score": hit_scores[0, rank],
+            }
+            for rank in range(hits.shape[1])
+        ]
+    def predict(self, query: str, k: int):
+        """
+        Return the k best matches for `query`; delegates to `search`.
+        Args:
+            query (str): The search query.
+            k (int): How many results to return.
+        Returns:
+            list: Dictionaries with 'source', 'text' and 'score' for the top-k documents.
+        """
+        return self.search(query, k)
+###########################################EMBEDDINGS##########################################
+class DenseRetriever:
+    """
+    A retriever model that uses dense embeddings for indexing and searching documents.
+    Attributes:
+        vectorizer (Callable): The function used to generate embeddings.
+        batch_size (int): Stored at construction; not read by the methods below.
+        index (np.ndarray): The indexed embeddings.
+        data (list): The data to be indexed.
+    """
+    def __init__(self, vectorizer: Callable, batch_size: int = 50):
+        """
+        Initialize the DenseRetriever.
+        Args:
+            vectorizer (Callable): The function to generate embeddings.
+            batch_size (int): The number of texts to process in a single batch. Default is 50.
+        """
+        self.vectorizer = vectorizer
+        self.batch_size = batch_size
+        self.index = None
+        self.data = None
+    def index_data(self, data: List[dict]):
+        """
+        Indexes the provided data using dense embeddings.
+        Args:
+            data (list): A list of documents to be indexed. Each document should be a dictionary
+                         containing a key 'cleaned_content' with the text to be indexed.
+        """
+        self.data = data
+        docs = [doc["cleaned_content"] for doc in data]
+        embeddings = self.vectorizer(docs)
+        self.index = np.array(embeddings)
+    def search(self, query: str, k: int = 5) -> List[dict]:
+        """
+        Searches the indexed data for the given query using cosine similarity.
+        Args:
+            query (str): The search query.
+            k (int): The number of top results to return. Default is 5.
+        Returns:
+            list: A list of dictionaries containing the source, text, and score of the top-k results.
+        Raises:
+            ValueError: If nothing has been indexed yet.
+        """
+        # Fail before calling the vectorizer when there is nothing to search.
+        if self.index is None or self.data is None:
+            raise ValueError("No index available; call index_data() before search().")
+        # A vectorizer may return one flat vector for a single query; cdist
+        # needs a (1, dim) matrix, so promote 1-D results to 2-D.
+        query_embedding = np.atleast_2d(np.asarray(self.vectorizer([query])))
+        cosine_distances = cdist(query_embedding, np.asarray(self.index), metric="cosine")[0]
+        # Smallest distance first == most similar first.
+        top_k_indices = cosine_distances.argsort()[:k]
+        return [
+            {
+                "source": self.data[idx]["metadata"]["source"],
+                "text": self.data[idx]["cleaned_content"],
+                "score": 1 - cosine_distances[idx],
+            }
+            for idx in top_k_indices
+        ]
+    def predict(self, query: str, k: int) -> List[dict]:
+        """
+        Predicts the top-k results for the given query.
+        Args:
+            query (str): The search query.
+            k (int): The number of top results to return.
+        Returns:
+            list: A list of dictionaries containing the source, text, and score of the top-k results.
+        """
+        return self.search(query, k)

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,117 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+import json
+from retrievals import TFIDFRetriever, BM25Retriever
+from retrieval import get_retrieval_tf_idf, get_retrieval_bm25, get_retrieval_dense
+from embedding_function import sync_embed
+import numpy as np
+import os
+# Page-level settings: title, wide layout, sidebar expanded on first load.
+st.set_page_config(
+    page_title="Vector Store Query App",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# Inject a small stylesheet; unsafe_allow_html is needed for the raw <style> tag.
+st.markdown("""
+    <style>
+        .block-container {
+            padding-top: 2rem;
+            padding-bottom: 2rem;
+        }
+        .section {
+            padding: 1rem;
+            border-radius: 0.5rem;
+            margin-bottom: 1rem;
+        }
+    </style>
+""", unsafe_allow_html=True)
+# Sidebar: description plus the retrieval-method pickers read by the search form.
+with st.sidebar:
+    st.title("About")
+    st.markdown("""
+    This app allows you to query a vector store and view results in both JSON format
+    and rendered markdown. Enter your question in the main panel and click 'Search'.
+    """)
+    retrieval_method = st.selectbox(
+        "Choose the retrieval method:",
+        ["Sparse Retrievals", "Dense Retrievals", "Hybrid Retrievals"]
+    )
+    # `sparse_method` is only bound when the sparse family is selected.
+    if retrieval_method == "Sparse Retrievals":
+        sparse_method = st.selectbox(
+            "Choose a Sparse Retrieval method:",
+            ["BM25", "TF-IDF"]
+        )
+        st.write(f"Selected Sparse Method: {sparse_method}")
+    elif retrieval_method == "Dense Retrievals":
+        model_selection = st.selectbox(
+            "Choose a model:",
+            [
+                "sentence-transformers/all-MiniLM-L6-v2",
+                "intfloat/multilingual-e5-large"
+            ]
+        )
+        st.write(f"Selected model: {model_selection}")
+        # Stored in session state so the search form can read the chosen model.
+        st.session_state.model_selection = model_selection
+    # NOTE(review): "Hybrid Retrievals" has no widgets of its own in this sidebar.
+st.title("Vector Store Query Interface")
+# Results persist across Streamlit reruns via session state.
+if 'results' not in st.session_state:
+    st.session_state.results = None
+with st.form("query_form"):
+    col1, col2 = st.columns([4, 1])
+    with col1:
+        query = st.text_input(
+            "Enter your question:",
+            placeholder="What are you looking for?",
+            label_visibility="collapsed"
+        )
+    with col2:
+        st.write("")
+        if st.form_submit_button("Search", use_container_width=True):
+            if not query:
+                st.warning("Please enter a question")
+            elif retrieval_method == "Dense Retrievals":
+                model_selection = st.session_state.get('model_selection')
+                api_key = os.getenv("HF_API_KEY")
+                # get_retrieval_dense embeds the query itself, so no separate
+                # embedding request is made here (the old one was never used).
+                st.session_state.results = get_retrieval_dense(query, model=model_selection, api_key=api_key)
+            elif retrieval_method == "Sparse Retrievals" and sparse_method == "TF-IDF":
+                st.session_state.results = get_retrieval_tf_idf(query)
+            elif retrieval_method == "Sparse Retrievals" and sparse_method == "BM25":
+                st.session_state.results = get_retrieval_bm25(query)
+            else:
+                # "Hybrid Retrievals" is selectable but has no backend yet;
+                # tell the user instead of silently doing nothing.
+                st.warning(f"{retrieval_method} is not available yet")
+# Render the last stored search result, if any.
+if st.session_state.results:
+    st.divider()
+    st.subheader("Results")
+    col_left, col_right = st.columns([1, 2], gap="large")
+    # Left column: the raw payload as pretty-printed JSON.
+    with col_left:
+        st.markdown("**JSON Output**")
+        st.code(
+            json.dumps(st.session_state.results['json'], indent=2),
+            language='json'
+        )
+    # Right column: each hit rendered as markdown with its source.
+    with col_right:
+        st.markdown("**Document Content**")
+        for i, doc in enumerate(st.session_state.results['json']['results']):
+            with st.container():
+                st.markdown(f"### Document {i+1}")
+                st.markdown(doc['content'])
+                st.markdown(f"**Source:** {doc['metadata']}")
+                st.divider()
+# No search has stored a result yet: show the getting-started hint.
+elif st.session_state.results is None:
+    st.info("👈 Enter a question and click Search to get started")