Spaces:

itriedcoding
/

sage-demo

Sleeping

App Files Files Community

itriedcoding committed 29 days ago

Commit

a982fff

verified ·

1 Parent(s): cf8d8b2

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +40 -0

app.py CHANGED Viewed

@@ -8,6 +8,46 @@ from huggingface_hub import hf_hub_download
 REPO_ID = "itriedcoding/Sage"
 # Custom model class matching Sage architecture
 class TransformerLM(nn.Module):
     def __init__(self, vocab_size, d_model=256, nhead=8, num_layers=4, dim_feedforward=1024, max_seq_length=64):

 REPO_ID = "itriedcoding/Sage"
class CharacterTokenizer:
    """Character-level tokenizer that maps each distinct character to an integer id.

    Ids 0 and 1 are reserved for the ``<PAD>`` and ``<UNK>`` tokens. Every
    character seen by :meth:`fit` receives an id starting at 2, assigned in
    sorted character order.
    """

    def __init__(self):
        """Create an empty tokenizer; call :meth:`fit` to build the vocabulary."""
        self.char_to_idx = {}
        self.idx_to_char = {}
        self.vocab_size = 0
        self.pad_token_id = 0
        self.unk_token_id = 1

    def fit(self, texts):
        """Build the vocabulary from an iterable of texts.

        Each item is converted with ``str()`` before its characters are
        collected. Calling ``fit`` again replaces the previous vocabulary
        entirely instead of layering new ids on top of stale entries.

        Args:
            texts: Iterable of strings (or objects convertible with ``str``).
        """
        chars = set()
        for text in texts:
            chars.update(str(text))

        # Start from a fresh mapping: reusing the old dict would keep characters
        # from an earlier fit, whose ids can collide with the newly assigned
        # ones and corrupt both idx_to_char and vocab_size.
        char_to_idx = {'<PAD>': 0, '<UNK>': 1}
        for idx, char in enumerate(sorted(chars), start=2):
            char_to_idx[char] = idx

        self.char_to_idx = char_to_idx
        self.idx_to_char = {idx: char for char, idx in char_to_idx.items()}
        self.vocab_size = len(char_to_idx)

    def encode(self, text, max_length=None, padding=False, truncation=False, return_tensors=None):
        """Convert one text or a sequence of texts into lists of token ids.

        Args:
            text: A single string, or an iterable of items that are each
                converted with ``str()``.
            max_length: Target length used by ``padding`` and ``truncation``.
                Both options are ignored when this is ``None``.
            padding: If true, right-pad each sequence with ``pad_token_id``
                up to ``max_length``.
            truncation: If true, cut each sequence down to ``max_length`` ids.
            return_tensors: Pass ``'pt'`` to get a ``torch.long`` tensor
                instead of nested lists.

        Returns:
            A list with one list of ids per input text (a single string still
            yields a one-element batch), or a 2-D tensor when
            ``return_tensors == 'pt'``.

        Raises:
            ValueError: From ``torch.tensor`` when ``return_tensors == 'pt'``
                and the encoded sequences do not all have the same length.
        """
        texts = [text] if isinstance(text, str) else text
        lookup = self.char_to_idx.get
        encoded = []
        for item in texts:
            tokens = [lookup(char, self.unk_token_id) for char in str(item)]
            # Compare against None so that an explicit max_length of 0 is
            # honoured rather than being treated as "not set".
            if max_length is not None:
                if truncation:
                    tokens = tokens[:max_length]
                if padding:
                    # A non-positive multiplier yields [], so sequences that are
                    # already long enough are left untouched.
                    tokens = tokens + [self.pad_token_id] * (max_length - len(tokens))
            encoded.append(tokens)

        if return_tensors == 'pt':
            return torch.tensor(encoded, dtype=torch.long)
        return encoded

    def decode(self, token_ids):
        """Convert token ids back into text.

        Ids missing from the vocabulary are rendered as ``'<UNK>'``; padding
        ids are rendered as ``'<PAD>'`` (they are not stripped).

        Args:
            token_ids: A flat sequence of ids, a batch (sequence of sequences),
                or a ``torch.Tensor`` of either shape.

        Returns:
            A string for a flat sequence, or a list of strings (one per row)
            for a batch such as the 2-D output of
            ``encode(..., return_tensors='pt')``.
        """
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.tolist()
        token_ids = list(token_ids)

        # Batched input: decode row by row. Looking the rows up directly would
        # fail because lists are unhashable.
        if token_ids and isinstance(token_ids[0], (list, tuple)):
            return [self.decode(row) for row in token_ids]

        return ''.join(self.idx_to_char.get(idx, '<UNK>') for idx in token_ids)
 # Custom model class matching Sage architecture
 class TransformerLM(nn.Module):
     def __init__(self, vocab_size, d_model=256, nhead=8, num_layers=4, dim_feedforward=1024, max_seq_length=64):