PunchNFIT committed on
Commit
bcede05
Β·
1 Parent(s): 96c00df

Initial commit - SNP Universal Embedding model

Browse files
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# Make absolutely sure the working directory exists
RUN mkdir -p /app
WORKDIR /app

# Print working directory (for debugging)
RUN echo "βœ… Building from context:" && pwd && ls -R

# Copy the dependency list first so the pip layer is cached when only
# application code changes.
COPY requirements.txt /app/requirements.txt

# Install dependencies
RUN pip install --no-cache-dir -r /app/requirements.txt

# Copy specific files explicitly by name
COPY api_inference.py /app/api_inference.py
COPY snp_universal_embedding.py /app/snp_universal_embedding.py
COPY config.json /app/config.json
COPY tokenizer.json /app/tokenizer.json
COPY pytorch_model.bin /app/pytorch_model.bin

# Expose Hugging Face port AND make the app actually bind to it:
# api_inference.py reads $PORT with a default of 8080, so without this ENV the
# server listens on 8080 while 7860 is the exposed/expected port.
ENV PORT=7860
EXPOSE 7860

# Run the app
CMD ["python", "/app/api_inference.py"]
api_inference.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# api_inference.py
#
# Flask inference service for the custom "SNP Universal Embedding" model.
# The model is loaded once at import time; endpoints are defined below.
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
from flask import Flask, request, jsonify
import os, json

app = Flask(__name__)

# === Load Model ===
# Model artifacts (config.json, tokenizer files, pytorch_model.bin) are
# expected to sit next to this script — see the Dockerfile COPY directives.
MODEL_DIR = os.path.dirname(os.path.abspath(__file__))

print(f"πŸ” Loading model from {MODEL_DIR} ...")

try:
    # --- Register your custom model class ---
    from transformers.models.auto.modeling_auto import MODEL_MAPPING
    from snp_universal_embedding import CustomSNPModel

    # Register custom class to handle 'custom_snp' type
    # NOTE(review): AutoConfig is a factory class whose __init__ raises;
    # subclassing it as a registration key is unusual — confirm that this
    # actually routes model_type "custom_snp" to CustomSNPModel on the
    # installed transformers version.
    class DummyConfig(AutoConfig):
        model_type = "custom_snp"

    MODEL_MAPPING.register(DummyConfig, CustomSNPModel)

    # Load model and tokenizer
    # trust_remote_code=True allows transformers to execute model code shipped
    # alongside the checkpoint in MODEL_DIR.
    config = AutoConfig.from_pretrained(MODEL_DIR, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    model = AutoModel.from_pretrained(MODEL_DIR, config=config, trust_remote_code=True)
    model.eval()  # inference mode: disables dropout etc.

    print("βœ… Custom SNP model loaded successfully.")
except Exception as e:
    # Fail fast: the service is useless without a model, so log and re-raise
    # to abort startup rather than serve guaranteed 500s.
    print("❌ Error loading custom model:", e)
    raise e
35
+
36
+
37
+ # === Define Endpoints ===
38
@app.route("/")
def index():
    """Service banner: confirms the API is running and lists its endpoints."""
    banner = {
        "status": "SNP Universal Embedding API running",
        "endpoints": ["/embed", "/reason"],
    }
    return jsonify(banner)
44
+
45
+
46
@app.route("/embed", methods=["POST"])
def embed():
    """Return an embedding for the POSTed JSON body ``{"text": "..."}``.

    Responses:
        200 — {"embedding": [...]}
        400 — missing/invalid JSON body, or empty "text"
        500 — tokenizer/model failure (message included)
    """
    try:
        # silent=True: a missing or malformed JSON body yields None instead of
        # an uncaught parse error — previously that surfaced as a 500 via the
        # except below rather than a clean 400.
        data = request.get_json(silent=True) or {}
        text = data.get("text", "")
        if not text:
            return jsonify({"error": "No text provided"}), 400

        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
            if isinstance(outputs, dict) and "last_hidden_state" in outputs:
                # (batch, seq, hidden): mean-pool over the sequence dimension.
                embedding = outputs["last_hidden_state"].mean(dim=1).squeeze().tolist()
            else:
                # CustomSNPModel.forward returns a (batch, embed_dim) tensor.
                # The previous unconditional .mean(dim=1) collapsed that
                # 6-d embedding to a single scalar; only pool when a sequence
                # dimension is actually present.
                pooled = outputs.mean(dim=1) if outputs.dim() == 3 else outputs
                embedding = pooled.squeeze().tolist()

        return jsonify({"embedding": embedding})
    except Exception as e:
        # Surface the failure reason to the client instead of a blank 500.
        return jsonify({"error": str(e)}), 500
65
+
66
+
67
@app.route("/health")
def health():
    """Liveness probe: plain-text body with HTTP 200."""
    return "ok", 200
70
+
71
+
72
@app.route("/reason", methods=["POST"])
def reason():
    """Placeholder for the future SNP reasoning endpoint.

    Echoes the submitted text together with a not-yet-implemented status.
    """
    # silent=True: a missing/malformed JSON body becomes an empty dict instead
    # of None — previously data.get on None raised AttributeError (HTTP 500).
    data = request.get_json(silent=True) or {}
    text = data.get("text", "")
    return jsonify({
        "text": text,
        "reasoning_status": "Feature in development for SNP reasoning structure"
    })
80
+
81
+
82
if __name__ == "__main__":
    # Bind on all interfaces; honour a platform-supplied PORT (default 8080).
    listen_port = int(os.environ.get("PORT", 8080))
    app.run(host="0.0.0.0", port=listen_port)
config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "custom_snp",
3
+ "base_model": "bert-base-uncased",
4
+ "embedding_dimension": 6,
5
+ "description": "SNP-Universal-Embedding \u2014 distilled from emotional geometry via Substrate-Prism Neuron framework."
6
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20835793a996aa7a73fa44deb791be89a646e29769de91fae4e438e2234ea56e
3
+ size 442758231
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # requirements.txt
2
+ torch
3
+ transformers
4
+ sentence-transformers
5
+ flask
6
+ numpy
7
+ scikit-learn
snp_universal_embedding.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# -*- coding: utf-8 -*-
"""SNP-Universal-Embedding.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1z8p0PYKMZjd6IZ2FEgxtRddl7t_52iFA
"""

# FIX(review): the Colab export contained bare `!pip ...` shell magics here.
# Those are IPython-only syntax — in a plain .py module they are SyntaxErrors,
# so this module could not be imported at all (api_inference.py does
# `from snp_universal_embedding import CustomSNPModel`). They are preserved
# below, commented out, for provenance; in the Docker image dependencies are
# installed from requirements.txt instead.
# !pip uninstall -y tokenizers transformers sentence-transformers
# !pip cache purge
# !pip install -q torch==2.8.0+cu126 torchvision==0.23.0+cu126 torchaudio==2.8.0+cu126 --index-url https://download.pytorch.org/whl/cu126
# !pip install -q tokenizers==0.19.1 transformers==4.40.1 sentence-transformers==2.6.1

import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Pooling
from transformers import AutoTokenizer, AutoModel

print("βœ… Environment ready")
print("Torch:", torch.__version__)

import torch.nn as nn
from transformers import AutoModel
29
+
30
class CustomSNPModel(nn.Module):
    """Substrate-Prism Neuron embedding model.

    Wraps a pretrained transformer encoder and projects its [CLS] embedding
    down to a 6-dimensional "Prism" embedding.
    """

    def __init__(self, base_model="roberta-base"):
        super().__init__()
        # Pretrained backbone, fetched from the HF hub on first use.
        # NOTE(review): the default here is roberta-base, but the checkpoint
        # loading code below instantiates with base_model="bert-base-uncased";
        # confirm which backbone the shipped weights were trained on.
        self.shared_encoder = AutoModel.from_pretrained(base_model)
        hidden_size = self.shared_encoder.config.hidden_size
        # mirror_head / prism_head are not used in forward(), but the saved
        # checkpoint contains parameters under these names — keep them so
        # load_state_dict() finds matching keys.
        self.mirror_head = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.Tanh())
        self.prism_head = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.Tanh())
        self.projection = nn.Linear(hidden_size, 6) # Changed output dimension to 6

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        # Returns a (batch, 6) tensor: the Prism projection of the [CLS]
        # token embedding. Not a (batch, seq, hidden) sequence output —
        # callers should not mean-pool over dim 1.
        outputs = self.shared_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        cls = outputs.last_hidden_state[:, 0, :] # [CLS] embedding
        mirror = self.mirror_head(cls)  # computed but unused (kept for checkpoint compat)
        prism = self.prism_head(cls)    # computed but unused (kept for checkpoint compat)
        proj = self.projection(cls)

        # 🧩 Instead of combining 768 and 6-D tensors, just output your 6-D Prism embedding
        return proj
52
+
53
+
54
print("βœ… SNP architecture defined.")

import os
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Pooling
from transformers import AutoTokenizer, AutoModel

# --- Load the "Greene" checkpoint and adapt its tensor shapes --------------
# NOTE(review): this path exists only inside the original Colab session.
# Because this code runs at module level, importing this module anywhere else
# (e.g. the Docker image) executes it and fails on the assert below —
# consider guarding it behind `if __name__ == "__main__":`.
ckpt_path = "/content/custom_snp_model_greene.pt"
assert os.path.exists(ckpt_path), "❌ Greene checkpoint not found."

state_dict = torch.load(ckpt_path, map_location="cpu")

# The Greene checkpoint stored projection.weight as (768, 6); nn.Linear
# expects (out_features, in_features) = (6, 768), hence the transpose.
if "projection.weight" in state_dict:
    w = state_dict["projection.weight"]
    if w.shape == torch.Size([768, 6]): # Greene version
        print("πŸ” Transposing projection.weight to match current model shape...")
        state_dict["projection.weight"] = w.T

if "projection.bias" in state_dict:
    b = state_dict["projection.bias"]
    if b.shape == torch.Size([768]): # Greene version
        print("πŸ”§ Adjusting projection.bias shape to match current model...")
        # NOTE(review): truncating a 768-d bias to its first 6 entries assumes
        # those are the meaningful components — verify against training code.
        state_dict["projection.bias"] = b[:6] # keep first 6 or reshape accordingly

# Remove distributed prefixes if any
clean_state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}

model = CustomSNPModel(base_model="bert-base-uncased")
# strict=False tolerates key mismatches; the missing/unexpected counts are
# printed so shape/name problems remain visible.
missing, unexpected = model.load_state_dict(clean_state_dict, strict=False)
print(f"βœ… Checkpoint loaded.\nMissing keys: {len(missing)} | Unexpected: {len(unexpected)}")
85
+
86
+ # ============================================================
87
+ # πŸ”Ή Quick Embedding Test for CustomSNPModel
88
+ # (Safe version that drops token_type_ids)
89
+ # ============================================================
90
+
91
+ import torch
92
+
93
+ # Example text input
94
+ text = "A student must decide between a scholarship and their family."
95
+
96
+ # Tokenize
97
+ inputs = tokenizer(text, return_tensors="pt")
98
+
99
+ # Remove token_type_ids if your model doesn't expect it
100
+ if "token_type_ids" in inputs:
101
+ del inputs["token_type_ids"]
102
+
103
+ # Run inference
104
+ with torch.no_grad():
105
+ output = model(**inputs)
106
+
107
+ # Handle different output formats
108
+ if isinstance(output, tuple):
109
+ emb = output[0]
110
+ elif isinstance(output, dict):
111
+ emb = output.get("pooler_output", output.get("last_hidden_state"))
112
+ else:
113
+ emb = output
114
+
115
+ print("βœ… Embedding generated successfully.")
116
+ print("Embedding shape:", emb.shape if hasattr(emb, "shape") else type(emb))
117
+
118
import os, torch, json
from transformers import AutoTokenizer

# --- Export weights, config and tokenizer for upload to the Hub ------------
EXPORT_DIR = "/content/SNP_Universal_Embedding"
os.makedirs(EXPORT_DIR, exist_ok=True)

# Save model weights
torch.save(model.state_dict(), os.path.join(EXPORT_DIR, "pytorch_model.bin"))

# Save config manually (add your own details)
config = {
    "model_type": "custom_snp",
    "base_model": "bert-base-uncased",
    "embedding_dimension": 6,
    "description": "SNP-Universal-Embedding — distilled from emotional geometry via Substrate-Prism Neuron framework."
}
with open(os.path.join(EXPORT_DIR, "config.json"), "w") as f:
    json.dump(config, f, indent=4)

# Save tokenizer
tokenizer.save_pretrained(EXPORT_DIR)

print("βœ… Model and tokenizer saved to:", EXPORT_DIR)
# FIX(review): the original `!ls -lh $EXPORT_DIR` was a bare IPython shell
# magic — a SyntaxError in a plain .py module. List the directory with the
# stdlib instead.
for entry in sorted(os.listdir(EXPORT_DIR)):
    print(entry)
142
+
143
import shutil

# --- Zip the export directory and (in Colab only) download it --------------
ZIP_PATH = "/content/SNP-Universal-Embedding.zip"
shutil.make_archive("/content/SNP-Universal-Embedding", 'zip', EXPORT_DIR)

# FIX(review): google.colab is only importable inside a Colab runtime; the
# unconditional top-level import crashed this module everywhere else (e.g. the
# Docker image, which imports this file). The browser download is now
# best-effort and skipped outside Colab.
try:
    from google.colab import files
    files.download(ZIP_PATH)
except ImportError:
    print("β„Ή google.colab not available — skipping browser download of", ZIP_PATH)
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff