# NOTE(review): removed paste artifacts that preceded the script — a
# "File size: 2,633 Bytes" header, a stray hash ("2028a79"), and a rendered
# line-number gutter (1–86) from the page the code was copied from. None of
# it was Python code.
# ========================================================================================================================== #
# CLEAN TEST: AutoModel load from HuggingFace
# Run on a fresh Colab runtime with no prior state
# Paste this in Colab and it will simply run.
# Upcoming heads will add direct finetune capacity to this tiny model with exquisite potential.
# ========================================================================================================================== #

from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

REPO_ID = "AbstractPhil/geolip-captionbert-8192"

print("Loading model...")
model = AutoModel.from_pretrained(REPO_ID, trust_remote_code=True)
model.eval()
print(f"  Parameters: {sum(p.numel() for p in model.parameters()):,}")

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(REPO_ID, trust_remote_code=True)
print(f"  Vocab: {tokenizer.vocab_size}")

# Probe words chosen to exercise morphological and semantic neighborhoods
# (singular/plural, near-synonyms, antonym pairs, arithmetic terms).
texts = [
    "girl",
    "boy",
    "woman",
    "man",
    "mans",
    "womens",
    "women",
    "woman",
    "adjacency",
    "adjacent",
    "nearby",
    "near",
    "away",
    "aways",
    "similar",
    "dissimilar",
    "solid",
    "liquid",
    "prophetic",
    "predictive",
    "similarity",
    "differentiation",
    "differential",
    "addition",
    "subtraction",
    "division",
    "multiplication",  # trailing comma: uncommenting a line below must not implicitly concatenate strings
    #"A cat sitting on a windowsill watching birds outside",
    #"A golden retriever playing fetch on the beach at sunset",
    #"A still life painting with flowers and fruit on a table",
    #"An aerial photograph of a city skyline at night",
    #"A child riding a bicycle through autumn leaves in a park",
    #"a girl performing an action",
    #"a boy performing an action",
    #"a woman performing an action",
    #"a man performing an action",
]

inputs = tokenizer(texts, max_length=8192, padding=True,
                   truncation=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# NOTE(review): the matmul below assumes the remote model pools its output to
# (batch, dim) in last_hidden_state — confirm against the repo's custom code.
# A standard (batch, seq, dim) output would need pooling first.
emb = outputs.last_hidden_state
print(f"\n  Output shape: {emb.shape}")
print(f"  Norms: {emb.norm(dim=-1).tolist()}")

# Pairwise cosine similarity. The norms printed above show the embeddings are
# not unit-length, so normalize before the matmul — a raw emb @ emb.T would be
# dot products, not cosine.
print("\n  Pairwise cosine similarity:")
unit = F.normalize(emb, dim=-1)
sim = unit @ unit.T
for i in range(len(texts)):
    for j in range(i + 1, len(texts)):
        # " ↔ " separator so the two captions don't run together in the log
        print(f"    [{i}]↔[{j}]: {sim[i, j]:.3f}  ({texts[i][:40]} ↔ {texts[j][:40]})")

# Exercise the encode() convenience method when the remote code provides one.
if hasattr(model, 'encode'):
    print("\n  Testing encode() method...")
    e = model.encode(["Hello world", "Testing the encoder"])
    print(f"    Shape: {e.shape}")
    # Normalize here too — the printed value is labeled "Cosine".
    eu = F.normalize(e, dim=-1)
    print(f"    Cosine: {(eu[0] @ eu[1]).item():.3f}")

print("\n✓ All tests passed")