366degrees committed
Commit 66c5911 · verified · 1 Parent(s): cfa8d90

Delete snp_universal_embedding.py

Files changed (1)
  1. snp_universal_embedding.py +0 -148
snp_universal_embedding.py DELETED
@@ -1,148 +0,0 @@
- # -*- coding: utf-8 -*-
- """SNP-Universal-Embedding.ipynb
-
- Automatically generated by Colab.
-
- Original file is located at
-     https://colab.research.google.com/drive/1z8p0PYKMZjd6IZ2FEgxtRddl7t_52iFA
- """
-
- !pip uninstall -y tokenizers transformers sentence-transformers
- !pip cache purge
-
- !pip install -q torch==2.8.0+cu126 torchvision==0.23.0+cu126 torchaudio==2.8.0+cu126 --index-url https://download.pytorch.org/whl/cu126
- !pip install -q tokenizers==0.19.1 transformers==4.40.1 sentence-transformers==2.6.1
-
- import torch
- from sentence_transformers import SentenceTransformer
- from sentence_transformers.models import Pooling
- from transformers import AutoTokenizer, AutoModel
-
- print("✅ Environment ready")
- print("Torch:", torch.__version__)
-
- import torch.nn as nn
- from transformers import AutoModel
-
- class CustomSNPModel(nn.Module):
-     def __init__(self, base_model="roberta-base"):
-         super().__init__()
-         self.shared_encoder = AutoModel.from_pretrained(base_model)
-         hidden_size = self.shared_encoder.config.hidden_size
-         self.mirror_head = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.Tanh())
-         self.prism_head = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.Tanh())
-         self.projection = nn.Linear(hidden_size, 6)  # changed output dimension to 6
-
-     def forward(self, input_ids, attention_mask=None, token_type_ids=None):
-         outputs = self.shared_encoder(
-             input_ids=input_ids,
-             attention_mask=attention_mask,
-             token_type_ids=token_type_ids
-         )
-         cls = outputs.last_hidden_state[:, 0, :]  # [CLS] embedding
-         mirror = self.mirror_head(cls)  # computed but not returned
-         prism = self.prism_head(cls)    # computed but not returned
-         proj = self.projection(cls)
-
-         # 🧩 Instead of combining the 768-D and 6-D tensors, output only the 6-D Prism embedding
-         return proj
-
-
- print("✅ SNP architecture defined.")
-
- import os
- import torch
- from sentence_transformers import SentenceTransformer
- from sentence_transformers.models import Pooling
- from transformers import AutoTokenizer, AutoModel
-
- ckpt_path = "/content/custom_snp_model_greene.pt"
- assert os.path.exists(ckpt_path), "❌ Greene checkpoint not found."
-
- state_dict = torch.load(ckpt_path, map_location="cpu")
-
- if "projection.weight" in state_dict:
-     w = state_dict["projection.weight"]
-     if w.shape == torch.Size([768, 6]):  # Greene version stores the transpose
-         print("🔁 Transposing projection.weight to match current model shape...")
-         state_dict["projection.weight"] = w.T
-
- if "projection.bias" in state_dict:
-     b = state_dict["projection.bias"]
-     if b.shape == torch.Size([768]):  # Greene version
-         print("🔧 Adjusting projection.bias shape to match current model...")
-         state_dict["projection.bias"] = b[:6]  # keep the first 6 entries
-
- # Remove DistributedDataParallel "module." prefixes, if any
- clean_state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
-
- model = CustomSNPModel(base_model="bert-base-uncased")
- tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # tokenizer matching the encoder; used by the test below
- missing, unexpected = model.load_state_dict(clean_state_dict, strict=False)
- print(f"✅ Checkpoint loaded.\nMissing keys: {len(missing)} | Unexpected: {len(unexpected)}")
-
- # ============================================================
- # 🔹 Quick Embedding Test for CustomSNPModel
- #    (Safe version that drops token_type_ids)
- # ============================================================
-
- import torch
-
- # Example text input
- text = "A student must decide between a scholarship and their family."
-
- # Tokenize
- inputs = tokenizer(text, return_tensors="pt")
-
- # Remove token_type_ids if the model doesn't expect it
- if "token_type_ids" in inputs:
-     del inputs["token_type_ids"]
-
- # Run inference
- model.eval()  # disable dropout for deterministic embeddings
- with torch.no_grad():
-     output = model(**inputs)
-
- # Handle different output formats
- if isinstance(output, tuple):
-     emb = output[0]
- elif isinstance(output, dict):
-     emb = output.get("pooler_output", output.get("last_hidden_state"))
- else:
-     emb = output
-
- print("✅ Embedding generated successfully.")
- print("Embedding shape:", emb.shape if hasattr(emb, "shape") else type(emb))
-
- import os, torch, json
- from transformers import AutoTokenizer
-
- EXPORT_DIR = "/content/SNP_Universal_Embedding"
- os.makedirs(EXPORT_DIR, exist_ok=True)
-
- # Save model weights
- torch.save(model.state_dict(), os.path.join(EXPORT_DIR, "pytorch_model.bin"))
-
- # Save config manually (add your own details)
- config = {
-     "model_type": "custom_snp",
-     "base_model": "bert-base-uncased",
-     "embedding_dimension": 6,
-     "description": "SNP-Universal-Embedding — distilled from emotional geometry via Substrate-Prism Neuron framework."
- }
- with open(os.path.join(EXPORT_DIR, "config.json"), "w") as f:
-     json.dump(config, f, indent=4)
-
- # Save tokenizer
- tokenizer.save_pretrained(EXPORT_DIR)
-
- print("✅ Model and tokenizer saved to:", EXPORT_DIR)
- !ls -lh $EXPORT_DIR
-
- import shutil
- from google.colab import files
-
- ZIP_PATH = "/content/SNP-Universal-Embedding.zip"
- shutil.make_archive("/content/SNP-Universal-Embedding", "zip", EXPORT_DIR)
- files.download(ZIP_PATH)
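
For anyone retrieving this file from the commit history: the exported archive can be reloaded outside Colab by reversing the save step. A minimal sketch, assuming the CustomSNPModel class defined above is in scope and the downloaded zip has been extracted to a local SNP_Universal_Embedding/ directory (that path is a placeholder):

import torch
from transformers import AutoTokenizer

EXPORT_DIR = "SNP_Universal_Embedding"  # placeholder: wherever the zip was extracted

# save_pretrained() above wrote the tokenizer files next to pytorch_model.bin
tokenizer = AutoTokenizer.from_pretrained(EXPORT_DIR)
model = CustomSNPModel(base_model="bert-base-uncased")
model.load_state_dict(torch.load(EXPORT_DIR + "/pytorch_model.bin", map_location="cpu"))
model.eval()  # disable dropout for deterministic embeddings

inputs = tokenizer("A student must decide between a scholarship and their family.", return_tensors="pt")
inputs.pop("token_type_ids", None)  # same safe inference path as the notebook
with torch.no_grad():
    emb = model(**inputs)
print(emb.shape)  # torch.Size([1, 6]): the 6-D Prism embedding

Because the forward pass returns the projection head's output directly, no sentence-transformers pooling module is needed at inference time.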