Update app.py
app.py CHANGED
@@ -4,123 +4,79 @@ from transformers import BartTokenizer, BartForConditionalGeneration
from transformers.modeling_outputs import BaseModelOutput

# ==========================================
- # 1. SETUP:
# ==========================================
-
print(f"Loading {model_name}...")
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
- model.eval()

# ==========================================
- # 2.
# ==========================================

def text_to_embedding(text):
-     """Encodes text into the BART Latent Space (Vectors)."""
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        encoder_outputs = model.model.encoder(**inputs)
    return encoder_outputs.last_hidden_state

def embedding_to_text(embedding_tensor):
-     """Decodes a Vector back into Text."""
    encoder_outputs_wrapped = BaseModelOutput(last_hidden_state=embedding_tensor)
    with torch.no_grad():
        generated_ids = model.generate(
            encoder_outputs=encoder_outputs_wrapped,
            max_length=50,
-             num_beams=
-
        )
    decoded_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return decoded_text

-
-
- # ==========================================
-
- def run_reconstruction(text):
-     if not text:
-         return "", "Please enter text."
-
-     # 1. Encode
-     vector = text_to_embedding(text)
-
-     # 2. Decode
-     reconstructed = embedding_to_text(vector)
-
-     # 3. Get Stats
-     shape_info = f"Vector Shape: {vector.shape} (Batch, Tokens, Dimensions)"
-     preview = f"First 5 values: {vector[0][0][:5].numpy().tolist()}"
-
-     debug_info = f"{shape_info}\n{preview}"
-
-     return reconstructed, debug_info
-
- def run_mixing(text1, text2):
-     if not text1 or not text2:
-         return "Please enter two sentences."

-     # 1. Get vectors
    v1 = text_to_embedding(text1)
    v2 = text_to_embedding(text2)

-     #
-     # Note: In a production app, you might want to pad instead of truncate,
-     # but for this specific "averaging" demo, truncation prevents dimension mismatch errors.
    min_len = min(v1.shape[1], v2.shape[1])
-
-
-     v2_cut = v2[:, :min_len, :]

-     #
-     v_mixed = (
-
-     # 4. Decode
-     mixed_text = embedding_to_text(v_mixed)

-     return

# ==========================================
- #
# ==========================================

- with gr.Blocks(title="BART
-     gr.Markdown("#
-     gr.Markdown("This
-
-     with gr.Tabs():
-
-         # --- TAB 1: RECONSTRUCTION ---
-         with gr.TabItem("1. Auto-Encoder Test"):
-             gr.Markdown("Type a sentence. The model will turn it into numbers, then turn those numbers back into text.")
-
-             with gr.Row():
-                 with gr.Column():
-                     input_text = gr.Textbox(label="Original Sentence", value="The cat sat on the mat.")
-                     btn_recon = gr.Button("Encode & Decode", variant="primary")
-
-                 with gr.Column():
-                     output_recon = gr.Textbox(label="Reconstructed Text")
-                     output_debug = gr.Code(label="Vector Stats", language="json")
-
-             btn_recon.click(run_reconstruction, inputs=input_text, outputs=[output_recon, output_debug])

-
-
-
-
-
-
-
-
-
-
-                 with gr.Column():
-                     mix_out = gr.Textbox(label="The AI's 'Middle Ground' Thought", lines=4)

-

if __name__ == "__main__":
    demo.launch()
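The removed run_mixing above truncates both encoder outputs to the shorter sequence; its own comment notes that a production app might pad instead of truncate. A minimal sketch of that padding route, assuming torch is imported at the top of app.py (the helper name pad_to_match is illustrative and not part of this commit):

import torch.nn.functional as F

def pad_to_match(v1, v2):
    # Zero-pad the shorter (batch, tokens, dims) tensor along the token axis
    # so both sequences line up without dropping tokens.
    max_len = max(v1.shape[1], v2.shape[1])
    v1 = F.pad(v1, (0, 0, 0, max_len - v1.shape[1]))
    v2 = F.pad(v2, (0, 0, 0, max_len - v2.shape[1]))
    return v1, v2

Note the trade-off: the padded rows are zero vectors rather than real encoder states, so decoding the averaged result may drift more than with simple truncation.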
from transformers.modeling_outputs import BaseModelOutput

# ==========================================
+ # 1. SETUP: Use BART-Large (The Best "Parrot")
# ==========================================
+ # BART is an auto-encoder. Its job is to reconstruct inputs, not chat.
+ model_name = "facebook/bart-large"
+
print(f"Loading {model_name}...")
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
+ model.eval()

# ==========================================
+ # 2. STRICT LOGIC
# ==========================================

def text_to_embedding(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        encoder_outputs = model.model.encoder(**inputs)
    return encoder_outputs.last_hidden_state

def embedding_to_text(embedding_tensor):
    encoder_outputs_wrapped = BaseModelOutput(last_hidden_state=embedding_tensor)
+
    with torch.no_grad():
        generated_ids = model.generate(
            encoder_outputs=encoder_outputs_wrapped,
+
+             # --- STRICT TRANSCRIPTION SETTINGS ---
            max_length=50,
+             num_beams=1,            # Greedy Search (No creative exploring)
+             do_sample=False,        # Deterministic (No randomness)
+             temperature=1.0,        # Standard probability curve
+             repetition_penalty=1.0  # Don't punish repeating words (we want exact copies)
        )
+
    decoded_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return decoded_text

+ def run_weighted_mixing(text1, text2, mix_ratio):
+     if not text1 or not text2: return "Enter sentences."

    v1 = text_to_embedding(text1)
    v2 = text_to_embedding(text2)

+     # Truncate to min length
    min_len = min(v1.shape[1], v2.shape[1])
+     v1 = v1[:, :min_len, :]
+     v2 = v2[:, :min_len, :]

+     # Weighted Average
+     v_mixed = (v1 * (1 - mix_ratio)) + (v2 * mix_ratio)

+     return embedding_to_text(v_mixed)

# ==========================================
+ # 3. UI
# ==========================================

+ with gr.Blocks(title="BART-Large Vector Decoder", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 🦜 BART Strict Vector Decoder")
+     gr.Markdown("This version uses `bart-large` with **Greedy Search** to force direct transcription instead of creative generation.")

+     with gr.Row():
+         t1 = gr.Textbox(label="Start Sentence", value="The dog is happy.")
+         t2 = gr.Textbox(label="End Sentence", value="The cat is angry.")
+
+     # 0.0 means 100% Start Sentence. 1.0 means 100% End Sentence.
+     slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="Ratio (0 = Start, 1 = End)")
+
+     btn_mix = gr.Button("Decode Vector")
+     out = gr.Textbox(label="Decoded Text")

+     btn_mix.click(run_weighted_mixing, inputs=[t1, t2, slider], outputs=out)

if __name__ == "__main__":
    demo.launch()
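A quick sanity check of the new path, as it might be run from a Python shell next to app.py (this assumes the file's top-of-file imports of torch, gradio and transformers, which sit outside this hunk). With num_beams=1 and do_sample=False the decode is deterministic, so the round trip should track the input closely; ratio 0.0 should reproduce the start sentence, 1.0 the end sentence, and 0.5 decodes the averaged latent, which may land on either sentence or a blend:

from app import text_to_embedding, embedding_to_text, run_weighted_mixing

# Round trip: encode a sentence, then decode its encoder states back to text.
print(embedding_to_text(text_to_embedding("The cat sat on the mat.")))

for ratio in (0.0, 0.5, 1.0):
    # 0.0 = 100% start sentence, 1.0 = 100% end sentence (matches the slider).
    print(ratio, run_weighted_mixing("The dog is happy.", "The cat is angry.", ratio))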