Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -113,12 +113,15 @@ def get_splade_cocondenser_representation(text):
|
|
| 113 |
|
| 114 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
| 115 |
|
| 116 |
-
formatted_output = "MLM encoder (SPLADE-cocondenser-distil):\n"
|
| 117 |
if not sorted_representation:
|
| 118 |
formatted_output += "No significant terms found for this input.\n"
|
| 119 |
else:
|
|
|
|
|
|
|
| 120 |
for term, weight in sorted_representation:
|
| 121 |
-
|
|
|
|
| 122 |
|
| 123 |
info_output = f"--- Sparse Vector Info ---\n"
|
| 124 |
info_output += f"Total non-zero terms in vector: {len(indices)}\n"
|
|
@@ -168,12 +171,15 @@ def get_splade_lexical_representation(text):
|
|
| 168 |
|
| 169 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
| 170 |
|
| 171 |
-
formatted_output = "SPLADE-v3-Lexical Representation (Weighting):\n"
|
| 172 |
if not sorted_representation:
|
| 173 |
formatted_output += "No significant terms found for this input.\n"
|
| 174 |
else:
|
|
|
|
|
|
|
| 175 |
for term, weight in sorted_representation:
|
| 176 |
-
|
|
|
|
| 177 |
|
| 178 |
info_output = f"--- Raw Sparse Vector Info ---\n"
|
| 179 |
info_output += f"Total non-zero terms in vector: {len(indices)}\n"
|
|
@@ -210,15 +216,15 @@ def get_splade_doc_representation(text):
|
|
| 210 |
|
| 211 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity
|
| 212 |
|
| 213 |
-
formatted_output = "Binary Bag-of-Words Representation:\n" # Changed title
|
| 214 |
if not sorted_representation:
|
| 215 |
formatted_output += "No significant terms found for this input.\n"
|
| 216 |
else:
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
|
| 223 |
info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" # Changed title
|
| 224 |
info_output += f"Total activated terms: {len(indices)}\n"
|
|
@@ -302,7 +308,7 @@ def get_splade_doc_vector(text):
|
|
| 302 |
# Generic formatter for any sparse vector (weighted or binary, selected via is_binary).
|
| 303 |
def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
|
| 304 |
if splade_vector is None:
|
| 305 |
-
return "Failed to generate vector."
|
| 306 |
|
| 307 |
indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
|
| 308 |
if not isinstance(indices, list):
|
|
@@ -326,20 +332,23 @@ def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
|
|
| 326 |
else:
|
| 327 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
| 328 |
|
| 329 |
-
formatted_output = ""
|
| 330 |
if not sorted_representation:
|
| 331 |
formatted_output += "No significant terms found.\n"
|
| 332 |
else:
|
|
|
|
| 333 |
for i, (term, weight) in enumerate(sorted_representation):
|
| 334 |
-
|
| 335 |
-
|
|
|
|
| 336 |
break
|
| 337 |
if is_binary:
|
| 338 |
-
|
| 339 |
else:
|
| 340 |
-
|
|
|
|
| 341 |
|
| 342 |
-
info_output = f"
|
| 343 |
info_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n"
|
| 344 |
|
| 345 |
return formatted_output, info_output # Now returns two strings
|
|
@@ -375,22 +384,23 @@ def calculate_dot_product_and_representations_independent(query_model_choice, do
|
|
| 375 |
# and to ensure .item() works reliably for conversion to float.
|
| 376 |
dot_product = float(torch.dot(query_vector.cpu(), doc_vector.cpu()).item())
|
| 377 |
|
| 378 |
-
# Format representations
|
| 379 |
-
# These functions now return two strings (main_output, info_output)
|
| 380 |
query_main_rep_str, query_info_str = format_sparse_vector_output(query_vector, query_tokenizer, query_is_binary)
|
| 381 |
doc_main_rep_str, doc_info_str = format_sparse_vector_output(doc_vector, doc_tokenizer, doc_is_binary)
|
| 382 |
|
| 383 |
|
| 384 |
-
|
| 385 |
-
query_rep_str += query_main_rep_str + "\n" + query_info_str
|
| 386 |
-
|
| 387 |
-
doc_rep_str = f"Document Representation ({doc_model_name_display}):\n"
|
| 388 |
-
doc_rep_str += doc_main_rep_str + "\n" + doc_info_str
|
| 389 |
-
|
| 390 |
-
# Combine output
|
| 391 |
full_output = f"### Dot Product Score: {dot_product:.6f}\n\n"
|
| 392 |
full_output += "---\n\n"
|
| 393 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
|
| 395 |
return full_output
|
| 396 |
|
|
|
|
| 113 |
|
| 114 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
| 115 |
|
| 116 |
+
formatted_output = "MLM encoder (SPLADE-cocondenser-distil):\n\n" # Added newline
|
| 117 |
if not sorted_representation:
|
| 118 |
formatted_output += "No significant terms found for this input.\n"
|
| 119 |
else:
|
| 120 |
+
# Changed to paragraph style
|
| 121 |
+
terms_list = []
|
| 122 |
for term, weight in sorted_representation:
|
| 123 |
+
terms_list.append(f"**{term}**: {weight:.4f}")
|
| 124 |
+
formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
|
| 125 |
|
| 126 |
info_output = f"--- Sparse Vector Info ---\n"
|
| 127 |
info_output += f"Total non-zero terms in vector: {len(indices)}\n"
|
|
|
|
| 171 |
|
| 172 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
| 173 |
|
| 174 |
+
formatted_output = "SPLADE-v3-Lexical Representation (Weighting):\n\n" # Added newline
|
| 175 |
if not sorted_representation:
|
| 176 |
formatted_output += "No significant terms found for this input.\n"
|
| 177 |
else:
|
| 178 |
+
# Changed to paragraph style
|
| 179 |
+
terms_list = []
|
| 180 |
for term, weight in sorted_representation:
|
| 181 |
+
terms_list.append(f"**{term}**: {weight:.4f}")
|
| 182 |
+
formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
|
| 183 |
|
| 184 |
info_output = f"--- Raw Sparse Vector Info ---\n"
|
| 185 |
info_output += f"Total non-zero terms in vector: {len(indices)}\n"
|
|
|
|
| 216 |
|
| 217 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity
|
| 218 |
|
| 219 |
+
formatted_output = "Binary Bag-of-Words Representation:\n\n" # Changed title, added newline
|
| 220 |
if not sorted_representation:
|
| 221 |
formatted_output += "No significant terms found for this input.\n"
|
| 222 |
else:
|
| 223 |
+
# Changed to paragraph style
|
| 224 |
+
terms_list = []
|
| 225 |
+
for term, _ in sorted_representation: # For binary, weight is always 1, so no need to display
|
| 226 |
+
terms_list.append(f"**{term}**")
|
| 227 |
+
formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
|
| 228 |
|
| 229 |
info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" # Changed title
|
| 230 |
info_output += f"Total activated terms: {len(indices)}\n"
|
|
|
|
| 308 |
# Generic formatter for any sparse vector (weighted or binary, selected via is_binary).
|
| 309 |
def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
|
| 310 |
if splade_vector is None:
|
| 311 |
+
return "Failed to generate vector.", ""
|
| 312 |
|
| 313 |
indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
|
| 314 |
if not isinstance(indices, list):
|
|
|
|
| 332 |
else:
|
| 333 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
| 334 |
|
| 335 |
+
formatted_output = "" # Removed initial newline to allow control outside
|
| 336 |
if not sorted_representation:
|
| 337 |
formatted_output += "No significant terms found.\n"
|
| 338 |
else:
|
| 339 |
+
terms_list = []
|
| 340 |
for i, (term, weight) in enumerate(sorted_representation):
|
| 341 |
+
# Limit display for very long lists, but ensure it's still a paragraph if cut
|
| 342 |
+
if i >= 50:
|
| 343 |
+
terms_list.append(f"...and {len(sorted_representation) - 50} more terms.")
|
| 344 |
break
|
| 345 |
if is_binary:
|
| 346 |
+
terms_list.append(f"**{term}**")
|
| 347 |
else:
|
| 348 |
+
terms_list.append(f"**{term}**: {weight:.4f}")
|
| 349 |
+
formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
|
| 350 |
|
| 351 |
+
info_output = f"Total non-zero terms: {len(indices)}\n"
|
| 352 |
info_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n"
|
| 353 |
|
| 354 |
return formatted_output, info_output # Now returns two strings
|
|
|
|
| 384 |
# and to ensure .item() works reliably for conversion to float.
|
| 385 |
dot_product = float(torch.dot(query_vector.cpu(), doc_vector.cpu()).item())
|
| 386 |
|
| 387 |
+
# Format representations - format_sparse_vector_output returns two strings (main_output, info_output)
|
|
|
|
| 388 |
query_main_rep_str, query_info_str = format_sparse_vector_output(query_vector, query_tokenizer, query_is_binary)
|
| 389 |
doc_main_rep_str, doc_info_str = format_sparse_vector_output(doc_vector, doc_tokenizer, doc_is_binary)
|
| 390 |
|
| 391 |
|
| 392 |
+
# Combine output into a single string for the Markdown component
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
full_output = f"### Dot Product Score: {dot_product:.6f}\n\n"
|
| 394 |
full_output += "---\n\n"
|
| 395 |
+
|
| 396 |
+
# Query Representation
|
| 397 |
+
full_output += f"Query Representation ({query_model_name_display}):\n\n"
|
| 398 |
+
full_output += query_main_rep_str + "\n\n" + query_info_str # Added an extra newline for better spacing
|
| 399 |
+
full_output += "\n\n---\n\n" # Separator
|
| 400 |
+
|
| 401 |
+
# Document Representation
|
| 402 |
+
full_output += f"Document Representation ({doc_model_name_display}):\n\n"
|
| 403 |
+
full_output += doc_main_rep_str + "\n\n" + doc_info_str # Added an extra newline for better spacing
|
| 404 |
|
| 405 |
return full_output
|
| 406 |
|