Spaces:
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -185,16 +185,35 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
|
|
| 185 |
|
| 186 |
return results[:top_k], end_time - start_time, vector_store
|
| 187 |
|
| 188 |
-
def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model):
|
| 189 |
-
|
| 190 |
"num_results": len(results),
|
| 191 |
-
"avg_content_length":
|
| 192 |
"search_time": search_time,
|
| 193 |
"vector_store_size": vector_store._index.ntotal if hasattr(vector_store, '_index') else "N/A",
|
| 194 |
"num_documents": len(vector_store.docstore._dict),
|
| 195 |
"num_tokens": num_tokens,
|
| 196 |
-
"embedding_vocab_size": embedding_model.client.get_vocab_size() if hasattr(embedding_model, 'client') and hasattr(embedding_model.client, 'get_vocab_size') else "N/A"
|
|
|
|
|
|
|
| 197 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
|
| 199 |
def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
|
| 200 |
# Tokenize the texts
|
|
@@ -236,7 +255,7 @@ def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
|
|
| 236 |
|
| 237 |
return tokenizer, optimized_texts
|
| 238 |
|
| 239 |
-
def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang, use_custom_embedding, optimize_vocab, phonetic_weight):
|
| 240 |
all_results = []
|
| 241 |
all_stats = []
|
| 242 |
settings = {
|
|
@@ -273,6 +292,7 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
|
|
| 273 |
tokenizer, optimized_chunks = optimize_vocabulary(chunks)
|
| 274 |
chunks = optimized_chunks
|
| 275 |
|
|
|
|
| 276 |
results, search_time, vector_store = search_embeddings(
|
| 277 |
chunks,
|
| 278 |
embedding_model,
|
|
@@ -284,7 +304,7 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
|
|
| 284 |
phonetic_weight
|
| 285 |
)
|
| 286 |
|
| 287 |
-
stats = calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model)
|
| 288 |
stats["model"] = f"{model_type} - {model_name}"
|
| 289 |
stats.update(settings)
|
| 290 |
|
|
@@ -309,6 +329,39 @@ def format_results(results, stats):
|
|
| 309 |
formatted_results.append(result)
|
| 310 |
return formatted_results
|
| 311 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
def launch_interface(share=True):
|
| 313 |
iface = gr.Interface(
|
| 314 |
fn=compare_embeddings,
|
|
@@ -331,7 +384,8 @@ def launch_interface(share=True):
|
|
| 331 |
],
|
| 332 |
outputs=[
|
| 333 |
gr.Dataframe(label="Results", interactive=False),
|
| 334 |
-
gr.Dataframe(label="Statistics", interactive=False)
|
|
|
|
| 335 |
],
|
| 336 |
title="Advanced Embedding Comparison Tool",
|
| 337 |
description="Compare different embedding models and retrieval strategies with advanced preprocessing and phonetic matching"
|
|
|
|
| 185 |
|
| 186 |
return results[:top_k], end_time - start_time, vector_store
|
| 187 |
|
| 188 |
+
def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k):
    """Collect quality and performance metrics for a single retrieval run.

    Args:
        results: Retrieved documents; each must expose ``page_content``.
        search_time: Wall-clock duration of the search, in seconds.
        vector_store: Store the search ran against. FAISS-like stores expose
            ``_index.ntotal``; all stores are expected to expose
            ``docstore._dict``.
        num_tokens: Token count of the indexed corpus.
        embedding_model: Model exposing ``embed_query(text) -> sequence[float]``.
        query: The query string that produced ``results``.
        top_k: Requested number of results.

    Returns:
        dict mapping metric name -> value. Metrics that cannot be computed
        (fewer than two results, store without a raw index, model without a
        vocab) are reported as the string "N/A".
    """
    stats = {
        "num_results": len(results),
        "avg_content_length": np.mean([len(doc.page_content) for doc in results]) if results else 0,
        "search_time": search_time,
        # Only FAISS-style stores expose the raw index as `_index`.
        "vector_store_size": vector_store._index.ntotal if hasattr(vector_store, '_index') else "N/A",
        "num_documents": len(vector_store.docstore._dict),
        "num_tokens": num_tokens,
        "embedding_vocab_size": embedding_model.client.get_vocab_size() if hasattr(embedding_model, 'client') and hasattr(embedding_model.client, 'get_vocab_size') else "N/A",
        "embedding_dimension": len(embedding_model.embed_query(query)),
        "top_k": top_k,
    }

    # Embed every result exactly once and reuse the matrix for both the
    # diversity and the rank-correlation metrics (the previous version
    # embedded each document twice).
    result_embeddings = np.array([embedding_model.embed_query(doc.page_content) for doc in results], dtype=float)

    if len(results) > 1:
        # Cosine similarity as normalized dot products; guard zero vectors.
        norms = np.linalg.norm(result_embeddings, axis=1, keepdims=True)
        normalized = result_embeddings / np.where(norms == 0, 1.0, norms)
        pairwise_similarities = normalized @ normalized.T
        # Diversity = 1 - mean off-diagonal (upper-triangle) similarity.
        stats["result_diversity"] = 1 - np.mean(pairwise_similarities[np.triu_indices(len(results), k=1)])

        # Spearman rank correlation between query-result similarity and the
        # order results were returned in; undefined for < 2 results.
        query_embedding = np.asarray(embedding_model.embed_query(query), dtype=float)
        q_norm = np.linalg.norm(query_embedding) or 1.0
        doc_norms = np.linalg.norm(result_embeddings, axis=1)
        doc_norms = np.where(doc_norms == 0, 1.0, doc_norms)
        similarities = (result_embeddings @ query_embedding) / (doc_norms * q_norm)
        rank_correlation, _ = spearmanr(similarities, range(len(similarities)))
        stats["rank_correlation"] = rank_correlation
    else:
        stats["result_diversity"] = "N/A"
        stats["rank_correlation"] = "N/A"

    return stats
|
| 217 |
|
| 218 |
def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
|
| 219 |
# Tokenize the texts
|
|
|
|
| 255 |
|
| 256 |
return tokenizer, optimized_texts
|
| 257 |
|
| 258 |
+
def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', use_custom_embedding=False, optimize_vocab=False, phonetic_weight=0.3):
|
| 259 |
all_results = []
|
| 260 |
all_stats = []
|
| 261 |
settings = {
|
|
|
|
| 292 |
tokenizer, optimized_chunks = optimize_vocabulary(chunks)
|
| 293 |
chunks = optimized_chunks
|
| 294 |
|
| 295 |
+
|
| 296 |
results, search_time, vector_store = search_embeddings(
|
| 297 |
chunks,
|
| 298 |
embedding_model,
|
|
|
|
| 304 |
phonetic_weight
|
| 305 |
)
|
| 306 |
|
| 307 |
+
stats = calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k)
|
| 308 |
stats["model"] = f"{model_type} - {model_name}"
|
| 309 |
stats.update(settings)
|
| 310 |
|
|
|
|
| 329 |
formatted_results.append(result)
|
| 330 |
return formatted_results
|
| 331 |
|
| 332 |
+
import matplotlib.pyplot as plt
|
| 333 |
+
import seaborn as sns
|
| 334 |
+
from sklearn.manifold import TSNE
|
| 335 |
+
|
| 336 |
+
def visualize_results(results_df, stats_df):
    """Render a 2x2 dashboard of comparison metrics as one figure.

    Args:
        results_df: Per-result rows; must provide 'model', 'content_length'
            and 'embedding' (sequence of floats) columns.
        stats_df: Per-model rows; must provide 'model', 'search_time',
            'result_diversity' and 'rank_correlation' columns.

    Returns:
        A matplotlib Figure containing the four subplots.
    """
    fig, axs = plt.subplots(2, 2, figsize=(20, 20))

    # 1. Bar plot of search times
    sns.barplot(x='model', y='search_time', data=stats_df, ax=axs[0, 0])
    axs[0, 0].set_title('Search Time by Model')
    # plt.setp rotates the existing labels in place, avoiding the
    # FixedFormatter warning raised by set_xticklabels(get_xticklabels(), ...).
    plt.setp(axs[0, 0].get_xticklabels(), rotation=45, ha='right')

    # 2. Scatter plot of result diversity vs. rank correlation
    sns.scatterplot(x='result_diversity', y='rank_correlation', hue='model', data=stats_df, ax=axs[0, 1])
    axs[0, 1].set_title('Result Diversity vs. Rank Correlation')

    # 3. Box plot of content lengths
    sns.boxplot(x='model', y='content_length', data=results_df, ax=axs[1, 0])
    axs[1, 0].set_title('Distribution of Result Content Lengths')
    plt.setp(axs[1, 0].get_xticklabels(), rotation=45, ha='right')

    # 4. t-SNE visualization of embeddings
    embeddings = np.array(results_df['embedding'].tolist())
    # scikit-learn requires perplexity < n_samples; the default of 30 raises
    # a ValueError for typical top-k result counts, so clamp it down.
    tsne = TSNE(n_components=2, random_state=42,
                perplexity=min(30, max(1, len(embeddings) - 1)))
    embeddings_2d = tsne.fit_transform(embeddings)

    sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1], hue=results_df['model'], ax=axs[1, 1])
    axs[1, 1].set_title('t-SNE Visualization of Result Embeddings')

    plt.tight_layout()
    return fig
|
| 364 |
+
|
| 365 |
def launch_interface(share=True):
|
| 366 |
iface = gr.Interface(
|
| 367 |
fn=compare_embeddings,
|
|
|
|
| 384 |
],
|
| 385 |
outputs=[
|
| 386 |
gr.Dataframe(label="Results", interactive=False),
|
| 387 |
+
gr.Dataframe(label="Statistics", interactive=False),
|
| 388 |
+
gr.Plot(label="Visualizations")
|
| 389 |
],
|
| 390 |
title="Advanced Embedding Comparison Tool",
|
| 391 |
description="Compare different embedding models and retrieval strategies with advanced preprocessing and phonetic matching"
|