AbstractPhil committed on
Commit
2028a79
·
verified ·
1 Parent(s): 91ea190

Create colab_test_script.py

Browse files
Files changed (1) hide show
  1. colab_test_script.py +86 -0
colab_test_script.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# ========================================================================================================================== #
# CLEAN TEST: AutoModel load from HuggingFace
# Run on a fresh Colab runtime with no prior state
# Paste this in Colab and it will simply run.
# Upcoming heads will add direct finetune capacity to this tiny model with exquisite potential.
# ========================================================================================================================== #

from transformers import AutoModel, AutoTokenizer
import torch

REPO_ID = "AbstractPhil/geolip-captionbert-8192"

print("Loading model...")
# trust_remote_code executes the repo's custom modeling code; acceptable here
# only because REPO_ID is a known, explicitly pinned repository.
model = AutoModel.from_pretrained(REPO_ID, trust_remote_code=True)
model.eval()
print(f" Parameters: {sum(p.numel() for p in model.parameters()):,}")

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(REPO_ID, trust_remote_code=True)
print(f" Vocab: {tokenizer.vocab_size}")

# Encode
texts = [
    "girl",
    "boy",
    "woman",
    "man",
    "mans",
    "womens",
    "women",
    "woman",  # NOTE(review): duplicate of index 2 — kept; useful as an identity check (cosine should be 1.0)
    "adjacency",
    "adjacent",
    "nearby",
    "near",
    "away",
    "aways",
    "similar",
    "dissimilar",
    "solid",
    "liquid",
    "prophetic",
    "predictive",
    "similarity",
    "differentiation",
    "differential",
    "addition",
    "subtraction",
    "division",
    "multiplication"
    #"A cat sitting on a windowsill watching birds outside",
    #"A golden retriever playing fetch on the beach at sunset",
    #"A still life painting with flowers and fruit on a table",
    #"An aerial photograph of a city skyline at night",
    #"A child riding a bicycle through autumn leaves in a park",
    #"a girl performing an action",
    #"a boy performing an action",
    #"a woman performing an action",
    #"a man performing an action",
]

inputs = tokenizer(texts, max_length=8192, padding=True,
                   truncation=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# NOTE(review): assumes this custom model's last_hidden_state is one pooled
# vector per text, shape (batch, dim) — a stock BERT would return
# (batch, seq, dim) and the pairwise printing below would break. Confirm
# against the repo's modeling code.
emb = outputs.last_hidden_state
print(f"\n Output shape: {emb.shape}")
print(f" Norms: {emb.norm(dim=-1).tolist()}")

# Pairwise similarity.
# FIX: normalize to unit length before the dot product so the printed values
# are true cosines. The original printed raw dot products under a "cosine"
# label; if the model already emits unit-norm vectors this is a no-op.
print(f"\n Pairwise cosine similarity:")
emb_unit = emb / emb.norm(dim=-1, keepdim=True)
sim = emb_unit @ emb_unit.T
for i in range(len(texts)):
    for j in range(i + 1, len(texts)):
        print(f" [{i}]↔[{j}]: {sim[i,j]:.3f} ({texts[i][:40]}↔{texts[j][:40]})")

# Test encode convenience method
if hasattr(model, 'encode'):
    print(f"\n Testing encode() method...")
    e = model.encode(["Hello world", "Testing the encoder"])
    print(f" Shape: {e.shape}")
    # Same fix as above: normalize so the reported value is a cosine, not a
    # raw dot product (idempotent if encode() already normalizes).
    e_unit = e / e.norm(dim=-1, keepdim=True)
    print(f" Cosine: {(e_unit[0] @ e_unit[1]).item():.3f}")

print("\n✓ All tests passed")