Create colab_test_script.py

Browse files

Files changed (1) hide show

colab_test_script.py +86 -0

colab_test_script.py ADDED Viewed

	@@ -0,0 +1,86 @@

+# ========================================================================================================================== #
+# CLEAN TEST: AutoModel load from HuggingFace
+# Run on a fresh Colab runtime with no prior state
+# Paste this into a Colab cell and it will run as-is.
+# Upcoming heads will add direct fine-tuning capability to this tiny model with exquisite potential.
+# ========================================================================================================================== #
+from transformers import AutoModel, AutoTokenizer
+import torch
+REPO_ID = "AbstractPhil/geolip-captionbert-8192"
+print("Loading model...")
+model = AutoModel.from_pretrained(REPO_ID, trust_remote_code=True)
+model.eval()
+print(f"  Parameters: {sum(p.numel() for p in model.parameters()):,}")
+print("Loading tokenizer...")
+tokenizer = AutoTokenizer.from_pretrained(REPO_ID, trust_remote_code=True)
+print(f"  Vocab: {tokenizer.vocab_size}")
+# Encode
+texts = [
+    "girl",
+    "boy",
+    "woman",
+    "man",
+    "mans",
+    "womens",
+    "women",
+    "woman",
+    "adjacency",
+    "adjacent",
+    "nearby",
+    "near",
+    "away",
+    "aways",
+    "similar",
+    "dissimilar",
+    "solid",
+    "liquid",
+    "prophetic",
+    "predictive",
+    "similarity",
+    "differentiation",
+    "differential",
+    "addition",
+    "subtraction",
+    "division",
+    "multiplication"
+    #"A cat sitting on a windowsill watching birds outside",
+    #"A golden retriever playing fetch on the beach at sunset",
+    #"A still life painting with flowers and fruit on a table",
+    #"An aerial photograph of a city skyline at night",
+    #"A child riding a bicycle through autumn leaves in a park",
+    #"a girl performing an action",
+    #"a boy performing an action",
+    #"a woman performing an action",
+    #"a man performing an action",
+]
+inputs = tokenizer(texts, max_length=8192, padding=True,
+                   truncation=True, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+emb = outputs.last_hidden_state
+print(f"\n  Output shape: {emb.shape}")
+print(f"  Norms: {emb.norm(dim=-1).tolist()}")
+# Pairwise similarity
+print(f"\n  Pairwise cosine similarity:")
+sim = emb @ emb.T
+for i in range(len(texts)):
+    for j in range(i+1, len(texts)):
+        print(f"    [{i}]↔[{j}]: {sim[i,j]:.3f}  ({texts[i][:40]}↔{texts[j][:40]})")
+# Test encode convenience method
+if hasattr(model, 'encode'):
+    print(f"\n  Testing encode() method...")
+    e = model.encode(["Hello world", "Testing the encoder"])
+    print(f"    Shape: {e.shape}")
+    print(f"    Cosine: {(e[0] @ e[1]).item():.3f}")
+print("\n✓ All tests passed")