| import torch |
| from transformers import AutoTokenizer, AutoModel |
|
|
def lorentz_dist(u: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    """Compute the hyperbolic (geodesic) distance between batches of Lorentz vectors.

    Args:
        u: Tensor of shape (..., d+1) — time-like coordinate first, then the
           d spatial coordinates. Assumed to lie on the unit hyperboloid
           (Lorentz inner product with itself equal to -1) — TODO confirm
           against the embedding model's output convention.
        v: Tensor broadcastable against ``u`` with the same layout.

    Returns:
        Tensor of shape (...,) with the pairwise distances
        ``acosh(-<u, v>_L)``, where ``<., .>_L`` is the Minkowski inner product.
    """
    u_0, u_x = u[..., 0:1], u[..., 1:]
    v_0, v_x = v[..., 0:1], v[..., 1:]

    # Lorentz (Minkowski) inner product: <u, v>_L = -u0*v0 + <u_x, v_x>.
    inner_product = -u_0 * v_0 + (u_x * v_x).sum(dim=-1, keepdim=True)

    # For points on the hyperboloid the inner product is <= -1; clamp guards
    # against floating-point drift pushing acosh's argument below 1 (NaN).
    # Tensor.clamp with a Python scalar preserves dtype and device, unlike the
    # previous torch.min against a freshly allocated float32 tensor, which
    # could promote half/double inputs and allocated a tensor on every call.
    inner_product = inner_product.clamp(max=-1.0)
    return torch.acosh(-inner_product).squeeze(-1)
|
|
def main():
    """Demo: embed a question and two candidate answers with a Lorentz
    embedding model from the Hub, then rank the answers by hyperbolic distance."""
    model_id = "YARlabs/v5_Embedding"

    print(f"Loading {model_id}...")
    # trust_remote_code is required because the model ships custom code;
    # only safe for sources you trust.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
    model.eval()

    question = "What is the capital of France?"
    answers = [
        "Paris is the capital of France.",
        "Berlin is the capital of Germany.",
    ]
    texts = [question, *answers]

    print("Tokenizing texts...")
    batch = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")

    print("Generating Matryoshka Lorentz Embeddings with dimension 64...")
    with torch.no_grad():
        lorentz_vectors = model(**batch, target_dim=64)

    print(f"Vectors shape: {lorentz_vectors.shape}")

    # Row 0 is the question, rows 1-2 are the candidate contexts.
    query_vec, correct_vec, wrong_vec = lorentz_vectors[0], lorentz_vectors[1], lorentz_vectors[2]
    dist_correct = lorentz_dist(query_vec, correct_vec)
    dist_wrong = lorentz_dist(query_vec, wrong_vec)

    print(f"\nDistance (Question <-> Correct Answer): {dist_correct.item():.4f}")
    print(f"Distance (Question <-> Wrong Answer): {dist_wrong.item():.4f}")

    if dist_correct.item() < dist_wrong.item():
        print("\n✅ Semantic search successfully retrieved the closest context!")
|
|
if __name__ == "__main__":
    main()
|
|