File size: 3,763 Bytes
29cdc9d 8663f54 29cdc9d 8663f54 29cdc9d 8663f54 29cdc9d f8ddcab 8663f54 f8ddcab 8663f54 c3e2cd8 f8ddcab 8663f54 c3e2cd8 8663f54 c3e2cd8 fa6e2ea c3e2cd8 8663f54 f8ddcab 8663f54 c3e2cd8 8663f54 fa6e2ea 8663f54 29cdc9d 8663f54 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 | import numpy as np
from pathlib import Path
import ast
import hdc_engine
# Number of components in every vector the kernel generates; VitalisKernel
# copies this into ``self.dim`` and uses it as the size of token, position
# and bundle vectors.
DIM = 10000
class VitalisKernel:
    """Encode token sequences and Python source into bipolar (+1/-1) vectors.

    Each distinct token is assigned a random int8 vector of length
    ``self.dim`` that is remembered in a codebook persisted under
    ``~/.vitalis_workspace``.  A sequence is encoded by summing the
    (optionally position-bound) token vectors and taking the sign.
    """

    def __init__(self):
        """Resolve the workspace paths and load any persisted bias/codebook."""
        self.dim = DIM
        self.weights_path = Path.home() / ".vitalis_workspace" / "kernel.weights.npy"
        self.codebook_path = Path.home() / ".vitalis_workspace" / "codebook.npy"
        # Fall back to a neutral scalar-like bias when no weights were saved.
        if self.weights_path.exists():
            self.bias = np.load(self.weights_path)
        else:
            self.bias = np.array([0.0])
        # True while the in-memory codebook holds entries not yet on disk.
        self._dirty = False
        self._load_codebook()

    def _load_codebook(self):
        """Populate ``self.codebook`` from disk, or start with an empty dict."""
        if self.codebook_path.exists():
            # SECURITY NOTE(review): the codebook is a dict stored through
            # pickle, so allow_pickle=True is required to read it back.
            # Unpickling executes arbitrary code if the file was tampered
            # with; only load codebooks from a trusted workspace.
            self.codebook = np.load(self.codebook_path, allow_pickle=True).item()
        else:
            self.codebook = {}

    def _save_codebook(self):
        """Write the codebook to ``self.codebook_path`` and clear the dirty flag."""
        self.codebook_path.parent.mkdir(parents=True, exist_ok=True)
        np.save(self.codebook_path, self.codebook)
        self._dirty = False

    def _get_token_vector(self, token: str) -> np.ndarray:
        """Return the codebook vector for *token*, creating it on first use.

        A new entry is a random +1/-1 int8 vector of length ``self.dim``;
        creating one marks the codebook as dirty so it gets persisted.
        """
        if token not in self.codebook:
            self.codebook[token] = np.random.choice(
                [-1, 1], size=self.dim
            ).astype(np.int8)
            self._dirty = True
        return self.codebook[token]

    def _get_position_vector(self, position: int) -> np.ndarray:
        """Return the +1/-1 int8 vector for *position*.

        The generator is seeded with the position itself, so the same
        position always yields the same vector without storing anything.
        """
        rng = np.random.default_rng(seed=position)
        return rng.choice([-1, 1], size=self.dim).astype(np.int8)

    def vectorize_tokens(self, tokens: list, positional: bool = True) -> np.ndarray:
        """Bundle *tokens* into a single +1/-1 int8 vector of length ``self.dim``.

        Args:
            tokens: Items to encode; each is converted with ``str()``.
            positional: When true, each token vector is combined with its
                position vector via ``hdc_engine.bind`` before summing
                (presumably an element-wise binding -- confirm against
                ``hdc_engine``).  When false, token vectors are summed as-is.

        Returns:
            The sign of the summed vectors, with ties (sum of 0) set to +1.
        """
        # Accumulate in int32 so summing many int8 vectors cannot overflow.
        bundle = np.zeros(self.dim, dtype=np.int32)
        for i, token in enumerate(tokens):
            token_vec = self._get_token_vector(str(token))
            if positional:
                pos_vec = self._get_position_vector(i)
                bound = hdc_engine.bind(token_vec, pos_vec)
            else:
                bound = token_vec
            bundle += bound.astype(np.int32)
        result = np.sign(bundle).astype(np.int8)
        # np.sign yields 0 on ties; force those to +1 to keep the vector bipolar.
        result[result == 0] = 1
        # Persist once per call, and only if new tokens were added.
        if self._dirty:
            self._save_codebook()
        return result

    def vectorize_source(self, source_code: str) -> np.ndarray:
        """Tokenize *source_code* and return its positional bundle vector."""
        tokens = self._extract_tokens(source_code)
        return self.vectorize_tokens(tokens)

    def vectorize_file(self, file_path: str) -> np.ndarray:
        """Read *file_path* as UTF-8 and return the vector of its contents.

        Raises:
            FileNotFoundError: If *file_path* does not exist.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Source file not found: {file_path}")
        return self.vectorize_source(path.read_text(encoding="utf-8"))

    def _extract_tokens(self, source_code: str) -> list:
        """Turn *source_code* into a list of string tokens.

        Parsable Python yields tagged tokens from its AST (``DEF:``,
        ``NAME:``, ``STR:``, ``IMPORT:``, ``FROM:``).  Anything that cannot
        be parsed falls back to whitespace splitting.  The result is never
        empty: ``["EMPTY"]`` is returned when no token was found.
        """
        try:
            tree = ast.parse(source_code)
        except (SyntaxError, ValueError):
            # ValueError covers sources with null bytes on Python < 3.12,
            # which previously escaped the fallback and crashed the caller.
            return source_code.split() or ["EMPTY"]

        tokens = []
        for node in ast.walk(tree):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
                tokens.append(f"DEF:{node.name}")
            elif isinstance(node, ast.Name):
                tokens.append(f"NAME:{node.id}")
            elif isinstance(node, ast.Constant) and isinstance(node.value, str):
                # Only the first 32 characters of a string literal are kept.
                tokens.append(f"STR:{node.value[:32]}")
            elif isinstance(node, ast.Import):
                tokens.extend(f"IMPORT:{alias.name}" for alias in node.names)
            elif isinstance(node, ast.ImportFrom):
                # ``from . import x`` has module=None; use the relative-level
                # dots instead of emitting the literal token "FROM:None".
                module = node.module if node.module is not None else "." * node.level
                tokens.append(f"FROM:{module}")
        return tokens if tokens else ["EMPTY"]

    def similarity(self, vec_a: np.ndarray, vec_b: np.ndarray) -> float:
        """Return the cosine similarity of two vectors (0.0 if either is all-zero)."""
        a = vec_a.astype(np.float32)
        b = vec_b.astype(np.float32)
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        # Guard against division by zero for zero-length vectors.
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)

    def matmul(self, a, b):
        """Return ``np.dot(a, b)`` with ``self.bias`` added."""
        return np.dot(a, b) + self.bias

    def activation(self, x):
        """Return the element-wise sign of *x* (zeros stay zero)."""
        return np.sign(x)
|