everydaytok committed on
Commit
da60d06
·
verified ·
1 Parent(s): 5e1b7f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -103
app.py CHANGED
@@ -1,117 +1,99 @@
1
- import gradio as gr
2
  import torch
3
- from transformers import (
4
- AutoModel,
5
- AutoTokenizer,
6
- )
7
- import os
8
- from threading import Thread
9
- # import spaces
10
- import time
11
 
12
# Pick the compute device once at import time: CUDA when available, else CPU.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")
if has_cuda:
    print(f"Using GPU: {torch.cuda.get_device_name(device)}")
else:
    print("Using CPU")
18
 
 
 
19
 
20
def mean_pooling(model_output, attention_mask):
    """Mask-aware average of token embeddings.

    Padding positions (mask == 0) contribute nothing to the sum, and the
    divisor is the count of real tokens, clamped to avoid division by zero.
    Returns one pooled vector per sequence in the batch.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    summed = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / token_counts
 
 
 
 
 
 
 
 
 
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
def cls_pooling(model_output):
    """Return the embedding of the first ([CLS]) token of each sequence."""
    token_embeddings = model_output[0]
    return token_embeddings[:, 0]
 
32
 
 
 
33
 
34
# Cache of already-loaded models so switching back to a previously used
# model_id does not re-download/re-instantiate it on every similarity call.
_MODEL_CACHE = {}  # model_id -> (tokenizer, model-on-device)


# @spaces.GPU
def get_embedding(text, use_mean_pooling, model_id):
    """Embed *text* with the Hugging Face model *model_id*.

    Args:
        text: input string (truncated to 512 tokens).
        use_mean_pooling: True -> mask-weighted mean of token embeddings;
            False -> [CLS] token embedding only.
        model_id: Hub id of the encoder model to use.

    Returns:
        A tensor of shape (1, hidden_dim) on `device`.
    """
    if model_id not in _MODEL_CACHE:
        # First use of this model: load once and keep it resident.
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModel.from_pretrained(model_id, torch_dtype=torch.float16)
        _MODEL_CACHE[model_id] = (tokenizer, model.to(device))
    tokenizer, model = _MODEL_CACHE[model_id]

    inputs = tokenizer(
        text, return_tensors="pt", padding=True, truncation=True, max_length=512
    )
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    with torch.no_grad():
        model_output = model(**inputs)
    if use_mean_pooling:
        return mean_pooling(model_output, inputs["attention_mask"])
    return cls_pooling(model_output)
 
 
 
50
 
 
 
 
51
 
52
def get_similarity(text1, text2, pooling_method, model_id):
    """Cosine similarity between the embeddings of two texts.

    *pooling_method* is the UI dropdown label; anything other than
    "Use Mean Pooling" falls back to CLS pooling.
    """
    use_mean_pooling = pooling_method == "Use Mean Pooling"
    emb_a = get_embedding(text1, use_mean_pooling, model_id)
    emb_b = get_embedding(text2, use_mean_pooling, model_id)
    # Debug dump of both embeddings to the console.
    print("----E1----")
    print(emb_a)
    print("----E2----")
    print(emb_b)
    similarity = torch.nn.functional.cosine_similarity(emb_a, emb_b)
    return similarity.item()
61
 
 
 
62
 
63
# Wire the similarity function into a Gradio demo and start the server.
gr.Interface(
    get_similarity,
    # Inputs, in the order get_similarity expects them:
    # text1, text2, pooling_method, model_id.
    [
        gr.Textbox(lines=7, label="Text 1"),
        gr.Textbox(lines=7, label="Text 2"),
        gr.Dropdown(
            choices=["Use Mean Pooling", "Use CLS"],
            value="Use Mean Pooling",
            label="Pooling Method",
            info="Mean Pooling: Averages all token embeddings (better for semantic similarity)\nCLS Pooling: Uses only the [CLS] token embedding (faster, might miss context)",
        ),
        gr.Dropdown(
            choices=[
                "nomic-ai/modernbert-embed-base",
                "tasksource/ModernBERT-base-embed",
                "tasksource/ModernBERT-base-nli",
                "joe32140/ModernBERT-large-msmarco",
                "answerdotai/ModernBERT-large",
                "answerdotai/ModernBERT-base",
            ],
            value="answerdotai/ModernBERT-large",
            label="Model",
            info="Choose between the variants of ModernBERT \nMight take a few seconds to load the model",
        ),
    ],
    # Single output: the cosine similarity rendered as text.
    gr.Textbox(label="Similarity"),
    title="ModernBERT Similarity Demo",
    description="Compute the similarity between two texts using ModernBERT. Choose between different pooling strategies for embedding generation.",
    # Each example row matches the four inputs above, in order.
    examples=[
        [
            "The quick brown fox jumps over the lazy dog",
            "A swift brown fox leaps above a sleeping canine",
            "Use Mean Pooling",
            "answerdotai/ModernBERT-large",
        ],
        [
            "I love programming in Python",
            "I hate coding with Python",
            "Use Mean Pooling",
            "joe32140/ModernBERT-large-msmarco",
        ],
        [
            "The weather is beautiful today",
            "Machine learning models are improving rapidly",
            "Use Mean Pooling",
            "tasksource/ModernBERT-base-embed",
        ],
        [
            "def calculate_sum(a, b):\n return a + b",
            "def add_numbers(x, y):\n result = x + y\n return result",
            "Use Mean Pooling",
            "tasksource/ModernBERT-base-nli",
        ],
    ],
).launch()
 
 
1
  import torch
2
+ from transformers import BartTokenizer, BartForConditionalGeneration
3
+ from transformers.modeling_outputs import BaseModelOutput
 
 
 
 
 
 
4
 
5
# 1. Load the Pre-trained Model and Tokenizer
# NOTE(review): downloads weights from the Hub on first run — needs network.
model_name = "facebook/bart-base"
print(f"Loading {model_name}...")
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Ensure model is in eval mode (turns off dropout for consistent results)
model.eval()
13
 
14
# --- FUNCTION 1: ENCODE (Text -> Embedding) ---
def text_to_embedding(text):
    """Encode *text* into BART encoder hidden states.

    Returns the encoder's last hidden state: shape
    (batch, seq_len, hidden_dim) — one vector per input token
    (hidden_dim is 768 for bart-base).
    """
    print(f"\n--- Encoding: '{text}' ---")

    encoded = tokenizer(text, return_tensors="pt")

    # Run only the encoder half of BART (model.model is the bare
    # seq2seq module; .encoder is its encoder stack) — no decoding here.
    with torch.no_grad():
        encoder_out = model.model.encoder(**encoded)

    hidden = encoder_out.last_hidden_state
    print(f"Generated Vector Shape: {hidden.shape}")
    return hidden
32
 
33
# --- FUNCTION 2: DECODE (Embedding -> Text) ---
def embedding_to_text(embedding_tensor):
    """Generate text conditioned on precomputed encoder hidden states.

    *embedding_tensor* must have shape (batch, seq_len, hidden_dim),
    as produced by the encoder.
    """
    print("--- Decoding Vector back to Text ---")

    # generate() accepts precomputed encoder output when wrapped in a
    # BaseModelOutput — it reads the .last_hidden_state attribute and
    # skips running the encoder itself.
    wrapped = BaseModelOutput(last_hidden_state=embedding_tensor)

    with torch.no_grad():
        output_ids = model.generate(
            encoder_outputs=wrapped,
            max_length=20,
            num_beams=4,  # beam search for better quality
        )

    # Turn the token ids back into a plain string (first batch element).
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
53
 
54
# ==========================================
# TEST RUN
# ==========================================

# 1. Original Text
source_sentence = "The cat sat on the mat."

# 2. Convert to Vector
source_vector = text_to_embedding(source_sentence)

# 3. (Optional) Simulate "Math" or "Transmission"
# Peek at a few raw values to confirm the embedding is ordinary numbers.
print(f"First 5 values of vector: {source_vector[0][0][:5].numpy()}")

# 4. Convert back to Text
roundtrip_text = embedding_to_text(source_vector)

print(f"\nOriginal: {source_sentence}")
print(f"Reconstructed: {roundtrip_text}")

# ==========================================
# EXPERIMENT: MIXING VECTORS
# Average the embeddings of two sentences and decode the blend.
# ==========================================
print("\n--- The Mixing Experiment ---")
sentence_a = "The weather is sunny."
sentence_b = "The weather is rainy."

# Get vectors
vec_a = text_to_embedding(sentence_a)
vec_b = text_to_embedding(sentence_b)

# Averaging needs equal sequence lengths; truncate both to the shorter
# one (a demo hack — real code would pad via the tokenizer instead).
shared_len = min(vec_a.shape[1], vec_b.shape[1])
vec_a = vec_a[:, :shared_len, :]
vec_b = vec_b[:, :shared_len, :]

# Elementwise mean of the two embeddings
blended = (vec_a + vec_b) / 2.0

# Decode the blended "thought"
blended_text = embedding_to_text(blended)
print(f"Sentence A: {sentence_a}")
print(f"Sentence B: {sentence_b}")
print(f"Mixed Result: {blended_text}")