Upload 6 files
Browse files- README.md +16 -3
- config.json +9 -0
- cpython_embeddings.safetensors +3 -0
- metadata.json +12 -0
- tokenizer.json +7 -0
- tokenizer.model +3 -0
README.md
CHANGED
|
@@ -1,3 +1,16 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CPython Embeddings (Safetensors)
|
| 2 |
+
|
| 3 |
+
This folder contains semantic embeddings generated from CPython
|
| 4 |
+
documentation and readable source files.
|
| 5 |
+
|
| 6 |
+
## Contents
|
| 7 |
+
- cpython_embeddings.safetensors : Vector embeddings (2813 vectors, 384 dimensions each; see config.json)
|
| 8 |
+
- vocab.txt : Extracted vocabulary (the `vocab_file` referenced by tokenizer.json)
|
| 9 |
+
- tokenizer.json, tokenizer.model : Tokenizer config and tokenizer model file
|
| 10 |
+
- config.json : Model configuration
|
| 11 |
+
- metadata.json : Project metadata
|
| 12 |
+
|
| 13 |
+
## Notes
|
| 14 |
+
- This is NOT a Python interpreter
|
| 15 |
+
- This is NOT a trained LLM
|
| 16 |
+
- Intended for semantic search and analysis, not for code execution
|
config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "embedding",
|
| 3 |
+
"source": "cpython",
|
| 4 |
+
"embedding_dim": 384,
|
| 5 |
+
"num_embeddings": 2813,
|
| 6 |
+
"framework": "sentence-transformers",
|
| 7 |
+
"description": "CPython documentation and code text embeddings",
|
| 8 |
+
"license": "PSF"
|
| 9 |
+
}
|
cpython_embeddings.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:969faaec7588a43d68d97c56d1102cfccf8473b81cc00adb9fbb56607dcb1886
|
| 3 |
+
size 4320856
|
metadata.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"created_by": "Ananthu Sajeev",
|
| 3 |
+
"project": "Venomoussaversai",
|
| 4 |
+
"dataset": "CPython",
|
| 5 |
+
"files_used": [
|
| 6 |
+
"py",
|
| 7 |
+
"txt",
|
| 8 |
+
"rst"
|
| 9 |
+
],
|
| 10 |
+
"purpose": "Semantic embeddings, not execution",
|
| 11 |
+
"warning": "Not an executable Python interpreter"
|
| 12 |
+
}
|
tokenizer.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"type": "word",
|
| 3 |
+
"unk_token": "[UNK]",
|
| 4 |
+
"pad_token": "[PAD]",
|
| 5 |
+
"vocab_size": 50000,
|
| 6 |
+
"vocab_file": "vocab.txt"
|
| 7 |
+
}
|
tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3
|
| 3 |
+
size 231508
|