Ananthusajeev190 committed on
Commit
7908062
·
verified ·
1 Parent(s): a5a387c

Upload 6 files

Browse files
README.md CHANGED
@@ -1,3 +1,16 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CPython Embeddings (Safetensors)
2
+
3
+ This folder contains semantic embeddings generated from CPython
4
+ documentation and text-readable source files (`.py`, `.txt`, and `.rst`).
5
+
6
+ ## Contents
7
+ - cpython_embeddings.safetensors : Vector embeddings (2,813 vectors, 384 dimensions)
8
+ - vocab.txt : Extracted vocabulary
9
+ - tokenizer.json : Tokenizer config
10
+ - config.json : Model configuration
11
+ - metadata.json : Project metadata
12
+
13
+ ## Notes
14
+ - This is NOT a Python interpreter
15
+ - This is NOT a trained LLM
16
+ - Intended for semantic search & analysis
config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "embedding",
3
+ "source": "cpython",
4
+ "embedding_dim": 384,
5
+ "num_embeddings": 2813,
6
+ "framework": "sentence-transformers",
7
+ "description": "CPython documentation and code text embeddings",
8
+ "license": "PSF"
9
+ }
cpython_embeddings.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:969faaec7588a43d68d97c56d1102cfccf8473b81cc00adb9fbb56607dcb1886
3
+ size 4320856
metadata.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "created_by": "Ananthu Sajeev",
3
+ "project": "Venomoussaversai",
4
+ "dataset": "CPython",
5
+ "files_used": [
6
+ "py",
7
+ "txt",
8
+ "rst"
9
+ ],
10
+ "purpose": "Semantic embeddings, not execution",
11
+ "warning": "Not an executable Python interpreter"
12
+ }
tokenizer.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "word",
3
+ "unk_token": "[UNK]",
4
+ "pad_token": "[PAD]",
5
+ "vocab_size": 50000,
6
+ "vocab_file": "vocab.txt"
7
+ }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3
3
+ size 231508