Spaces:
Sleeping
Sleeping
File size: 6,010 Bytes
68fd999 c795cd4 68fd999 c795cd4 68fd999 c795cd4 68fd999 6b6def4 68fd999 6b6def4 68fd999 6b6def4 68fd999 6b6def4 68fd999 6b6def4 68fd999 c795cd4 68fd999 bd1c23b c795cd4 68fd999 c795cd4 68fd999 c795cd4 68fd999 c795cd4 68fd999 c795cd4 68fd999 c795cd4 68fd999 c795cd4 68fd999 c795cd4 68fd999 c795cd4 68fd999 c795cd4 68fd999 c795cd4 68fd999 b6127b6 68fd999 b6127b6 68fd999 b6127b6 68fd999 c795cd4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
# This script defines functions that search the corpus for blocks that are similar to the query.
# Loading the corpus embeddings had to be changed for deployment in production because
# my CSVs took too much space for the free tier of HuggingFace Spaces.
# import packages
import numpy as np
import polars as pl
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import glob
from collections.abc import Callable
import os
def sbert_query(query: str, corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> pl.DataFrame:
    """
    Rank the files of the corpus by the cosine similarity of their best-matching block to the query.

    Parameters:
        query (str): Text of the query to search for in the documents.
        corpus_embeddings_df (polars.DataFrame): DataFrame with one row per block of text. The columns
            `file` and `doc_block_indx` are metadata; every other column is used as an embedding feature.
        model (sentence_transformers.SentenceTransformer): The model used to encode the query.

    Returns:
        polars.DataFrame: One row per file with the columns `file`, `score` (the highest block
            similarity found in that file) and `rank-sbert` (1 is the best match), sorted by
            descending score.
    """
    # The shape is passed positionally because the `shape=` keyword of np.reshape
    # only exists in NumPy >= 2.1; the positional form works on every version.
    query_embeddings = np.reshape(model.encode(query), (1, -1))

    # everything that is not metadata is an embedding feature
    corpus_features = corpus_embeddings_df.select(pl.exclude(['file', 'doc_block_indx']))
    sbert_scores = cosine_similarity(query_embeddings, corpus_features)

    # keep only the best-scoring block of each file (this drops `doc_block_indx`)
    best_per_file = pl.DataFrame(
        {
            'score': np.reshape(sbert_scores, -1),
            'file': corpus_embeddings_df['file'],
            'doc_block_indx': corpus_embeddings_df['doc_block_indx']
        }).group_by("file").agg(pl.col("score").max())

    # sort the results, attach a 1-based rank and return
    ranks = list(range(1, best_per_file.shape[0] + 1))
    return best_per_file.sort("score", descending=True).with_columns(pl.Series("rank-sbert", ranks))
def sbert_query_factory(corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> Callable[[str], pl.DataFrame]:
    """
    Build a one-argument search function that is bound to a fixed corpus and model.

    Parameters:
        corpus_embeddings_df (polars.DataFrame): DataFrame containing the embeddings of each document in the corpus in the shape (documents, features).
        model (sentence_transformers.SentenceTransformer): The model used to estimate embeddings.

    Returns:
        Callable[[str], pl.DataFrame]: Function that takes a query string and returns the result of `sbert_query` for that query against the bound corpus and model.
    """
    def do_sbert_query(query: str) -> pl.DataFrame:
        """
        Search the bound corpus for the query.

        Parameters:
            query (str): The query with which to search the corpus.

        Returns:
            polars.DataFrame: Corpus documents ranked by their match to the query.
        """
        # the corpus and the model are captured from the enclosing scope
        ranked_documents = sbert_query(
            query=query,
            corpus_embeddings_df=corpus_embeddings_df,
            model=model,
        )
        return ranked_documents

    return do_sbert_query
def load_embeddings_dfs(embeddings_dir: str = "block-embeddings") -> pl.DataFrame:
    """
    Create the paragraph-feature embeddings data frame by loading all the CSVs in a directory.

    Parameters:
        embeddings_dir (str): Directory to search. Every file in it whose name starts with
            `block-embeddings` is read as a CSV.

    Returns:
        polars.DataFrame: Vertical concatenation of all the CSVs that were read. Expected size is
            (paragraphs, features) plus two columns of metadata (`file` and `doc_block_indx`
            [aka within-document paragraph index].)

    Raises:
        ValueError: If no file in `embeddings_dir` matches the `block-embeddings*` pattern.
    """
    # glob returns its matches in arbitrary order; sorting makes the row order reproducible
    files = sorted(glob.glob(os.path.join(embeddings_dir, "block-embeddings") + "*"))
    if not files:
        # fail with a clear message instead of the generic error raised when concatenating nothing
        raise ValueError(
            f"No files matching 'block-embeddings*' found in directory: {embeddings_dir!r}"
        )

    # import the block embeddings
    block_embeddings_list = []
    for filename in files:
        print("Reading:", filename)
        block_embeddings_list.append(pl.read_csv(filename))
    return pl.concat(block_embeddings_list, how='vertical')
def create_embeddings_search_function(model_name: str, embeddings_dir: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that compares query text to the corpus by matching vector space embeddings.

    The corpus embeddings are loaded from the CSV files found in a directory.

    Parameters:
        model_name (str): Name of the model used to calculate embeddings; passed to `SentenceTransformer`.
        embeddings_dir (str): Directory that holds the CSV files of block embeddings.
        device (str): Device on which to do the calculations.

    Returns:
        Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.
    """
    # Instantiate the sentence-transformer model directly on the requested device.
    # Passing `device` to the constructor (instead of calling `.to()` afterwards) avoids first
    # loading the model on an auto-detected device, and keeps the model's own record of its
    # target device in agreement with `device`.
    sentence_model = SentenceTransformer(model_name, device=device)

    # import the embeddings CSVs
    block_embeddings_df = load_embeddings_dfs(embeddings_dir)

    # call the factory to make the search function and return it
    return sbert_query_factory(corpus_embeddings_df=block_embeddings_df, model=sentence_model)
def create_embeddings_search_function_from_embeddings_df(model_name: str, embeddings_df_path: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that compares query text to the corpus by matching vector space embeddings.

    The corpus embeddings are loaded from a single Parquet file.

    Parameters:
        model_name (str): Name of the model used to calculate embeddings; passed to `SentenceTransformer`.
        embeddings_df_path (str): Path of the Parquet file that holds the block embeddings.
        device (str): Device on which to do the calculations.

    Returns:
        Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.
    """
    # Instantiate the sentence-transformer model directly on the requested device.
    # Passing `device` to the constructor (instead of calling `.to()` afterwards) avoids first
    # loading the model on an auto-detected device, and keeps the model's own record of its
    # target device in agreement with `device`.
    sentence_model = SentenceTransformer(model_name, device=device)

    # import the embeddings from the Parquet file
    block_embeddings_df = pl.read_parquet(embeddings_df_path)

    # call the factory to make the search function and return it
    return sbert_query_factory(corpus_embeddings_df=block_embeddings_df, model=sentence_model)
|