everydaytok committed on
Commit
da60d06
·
verified ·
1 Parent(s): 5e1b7f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -103
app.py CHANGED
@@ -1,117 +1,99 @@
1
- import gradio as gr
2
  import torch
3
- from transformers import (
4
- AutoModel,
5
- AutoTokenizer,
6
- )
7
- import os
8
- from threading import Thread
9
- # import spaces
10
- import time
11
 
12
# Pick the compute device once at import time: CUDA when available, else CPU.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")
if has_cuda:
    print(f"Using GPU: {torch.cuda.get_device_name(device)}")
else:
    print("Using CPU")
18
 
 
 
19
 
20
def mean_pooling(model_output, attention_mask):
    """Mask-aware average of token embeddings.

    Padding positions (mask == 0) contribute nothing to the sum, and the
    divisor is the count of real tokens, clamped to avoid division by zero.
    Returns one pooled vector per sequence in the batch.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    summed = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / token_counts
 
 
 
 
 
 
 
 
 
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
def cls_pooling(model_output):
    """Return the embedding of the first ([CLS]) token of each sequence."""
    token_embeddings = model_output[0]
    return token_embeddings[:, 0]
 
32
 
 
 
33
 
34
# Cache of already-loaded models so switching back to a previously used
# model_id does not re-download/re-instantiate it on every similarity call.
_MODEL_CACHE = {}  # model_id -> (tokenizer, model-on-device)


# @spaces.GPU
def get_embedding(text, use_mean_pooling, model_id):
    """Embed *text* with the Hugging Face model *model_id*.

    Args:
        text: input string (truncated to 512 tokens).
        use_mean_pooling: True -> mask-weighted mean of token embeddings;
            False -> [CLS] token embedding only.
        model_id: Hub id of the encoder model to use.

    Returns:
        A tensor of shape (1, hidden_dim) on `device`.
    """
    if model_id not in _MODEL_CACHE:
        # First use of this model: load once and keep it resident.
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModel.from_pretrained(model_id, torch_dtype=torch.float16)
        _MODEL_CACHE[model_id] = (tokenizer, model.to(device))
    tokenizer, model = _MODEL_CACHE[model_id]

    inputs = tokenizer(
        text, return_tensors="pt", padding=True, truncation=True, max_length=512
    )
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    with torch.no_grad():
        model_output = model(**inputs)
    if use_mean_pooling:
        return mean_pooling(model_output, inputs["attention_mask"])
    return cls_pooling(model_output)
 
 
 
50
 
 
 
 
51
 
52
def get_similarity(text1, text2, pooling_method, model_id):
    """Cosine similarity between the embeddings of two texts.

    *pooling_method* is the UI dropdown label; anything other than
    "Use Mean Pooling" falls back to CLS pooling.
    """
    use_mean_pooling = pooling_method == "Use Mean Pooling"
    emb_a = get_embedding(text1, use_mean_pooling, model_id)
    emb_b = get_embedding(text2, use_mean_pooling, model_id)
    # Debug dump of both embeddings to the console.
    print("----E1----")
    print(emb_a)
    print("----E2----")
    print(emb_b)
    similarity = torch.nn.functional.cosine_similarity(emb_a, emb_b)
    return similarity.item()
61
 
 
 
62
 
63
# Wire the similarity function into a Gradio demo and start the server.
gr.Interface(
    get_similarity,
    # Inputs, in the order get_similarity expects them:
    # text1, text2, pooling_method, model_id.
    [
        gr.Textbox(lines=7, label="Text 1"),
        gr.Textbox(lines=7, label="Text 2"),
        gr.Dropdown(
            choices=["Use Mean Pooling", "Use CLS"],
            value="Use Mean Pooling",
            label="Pooling Method",
            info="Mean Pooling: Averages all token embeddings (better for semantic similarity)\nCLS Pooling: Uses only the [CLS] token embedding (faster, might miss context)",
        ),
        gr.Dropdown(
            choices=[
                "nomic-ai/modernbert-embed-base",
                "tasksource/ModernBERT-base-embed",
                "tasksource/ModernBERT-base-nli",
                "joe32140/ModernBERT-large-msmarco",
                "answerdotai/ModernBERT-large",
                "answerdotai/ModernBERT-base",
            ],
            value="answerdotai/ModernBERT-large",
            label="Model",
            info="Choose between the variants of ModernBERT \nMight take a few seconds to load the model",
        ),
    ],
    # Single output: the cosine similarity rendered as text.
    gr.Textbox(label="Similarity"),
    title="ModernBERT Similarity Demo",
    description="Compute the similarity between two texts using ModernBERT. Choose between different pooling strategies for embedding generation.",
    # Each example row matches the four inputs above, in order.
    examples=[
        [
            "The quick brown fox jumps over the lazy dog",
            "A swift brown fox leaps above a sleeping canine",
            "Use Mean Pooling",
            "answerdotai/ModernBERT-large",
        ],
        [
            "I love programming in Python",
            "I hate coding with Python",
            "Use Mean Pooling",
            "joe32140/ModernBERT-large-msmarco",
        ],
        [
            "The weather is beautiful today",
            "Machine learning models are improving rapidly",
            "Use Mean Pooling",
            "tasksource/ModernBERT-base-embed",
        ],
        [
            "def calculate_sum(a, b):\n return a + b",
            "def add_numbers(x, y):\n result = x + y\n return result",
            "Use Mean Pooling",
            "tasksource/ModernBERT-base-nli",
        ],
    ],
).launch()
 
 
1
  import torch
2
+ from transformers import BartTokenizer, BartForConditionalGeneration
3
+ from transformers.modeling_outputs import BaseModelOutput
 
 
 
 
 
 
4
 
5
# 1. Load the Pre-trained Model and Tokenizer
# NOTE(review): downloads weights from the Hub on first run — needs network.
model_name = "facebook/bart-base"
print(f"Loading {model_name}...")
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Ensure model is in eval mode (turns off dropout for consistent results)
model.eval()
13
 
14
# --- FUNCTION 1: ENCODE (Text -> Embedding) ---
def text_to_embedding(text):
    """Encode *text* into BART encoder hidden states.

    Returns the encoder's last hidden state: shape
    (batch, seq_len, hidden_dim) — one vector per input token
    (hidden_dim is 768 for bart-base).
    """
    print(f"\n--- Encoding: '{text}' ---")

    encoded = tokenizer(text, return_tensors="pt")

    # Run only the encoder half of BART (model.model is the bare
    # seq2seq module; .encoder is its encoder stack) — no decoding here.
    with torch.no_grad():
        encoder_out = model.model.encoder(**encoded)

    hidden = encoder_out.last_hidden_state
    print(f"Generated Vector Shape: {hidden.shape}")
    return hidden
32
 
33
# --- FUNCTION 2: DECODE (Embedding -> Text) ---
def embedding_to_text(embedding_tensor):
    """Generate text conditioned on precomputed encoder hidden states.

    *embedding_tensor* must have shape (batch, seq_len, hidden_dim),
    as produced by the encoder.
    """
    print("--- Decoding Vector back to Text ---")

    # generate() accepts precomputed encoder output when wrapped in a
    # BaseModelOutput — it reads the .last_hidden_state attribute and
    # skips running the encoder itself.
    wrapped = BaseModelOutput(last_hidden_state=embedding_tensor)

    with torch.no_grad():
        output_ids = model.generate(
            encoder_outputs=wrapped,
            max_length=20,
            num_beams=4,  # beam search for better quality
        )

    # Turn the token ids back into a plain string (first batch element).
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
53
 
54
# ==========================================
# TEST RUN
# ==========================================

# 1. Original Text
source_sentence = "The cat sat on the mat."

# 2. Convert to Vector
source_vector = text_to_embedding(source_sentence)

# 3. (Optional) Simulate "Math" or "Transmission"
# Peek at a few raw values to confirm the embedding is ordinary numbers.
print(f"First 5 values of vector: {source_vector[0][0][:5].numpy()}")

# 4. Convert back to Text
roundtrip_text = embedding_to_text(source_vector)

print(f"\nOriginal: {source_sentence}")
print(f"Reconstructed: {roundtrip_text}")

# ==========================================
# EXPERIMENT: MIXING VECTORS
# Average the embeddings of two sentences and decode the blend.
# ==========================================
print("\n--- The Mixing Experiment ---")
sentence_a = "The weather is sunny."
sentence_b = "The weather is rainy."

# Get vectors
vec_a = text_to_embedding(sentence_a)
vec_b = text_to_embedding(sentence_b)

# Averaging needs equal sequence lengths; truncate both to the shorter
# one (a demo hack — real code would pad via the tokenizer instead).
shared_len = min(vec_a.shape[1], vec_b.shape[1])
vec_a = vec_a[:, :shared_len, :]
vec_b = vec_b[:, :shared_len, :]

# Elementwise mean of the two embeddings
blended = (vec_a + vec_b) / 2.0

# Decode the blended "thought"
blended_text = embedding_to_text(blended)
print(f"Sentence A: {sentence_a}")
print(f"Sentence B: {sentence_b}")
print(f"Mixed Result: {blended_text}")