sailesh27 commited on
Commit
229a4c5
·
verified ·
1 Parent(s): ce076ec

Add UniXcoder ONNX model for Transformers.js

Browse files
Files changed (8) hide show
  1. README.md +128 -0
  2. config.json +28 -0
  3. merges.txt +0 -0
  4. model.onnx +3 -0
  5. special_tokens_map.json +51 -0
  6. tokenizer.json +0 -0
  7. tokenizer_config.json +58 -0
  8. vocab.json +0 -0
README.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ - code
5
+ license: apache-2.0
6
+ library_name: transformers.js
7
+ tags:
8
+ - code
9
+ - embeddings
10
+ - onnx
11
+ - transformers.js
12
+ - semantic-search
13
+ - code-search
14
+ pipeline_tag: feature-extraction
15
+ base_model: microsoft/unixcoder-base
16
+ ---
17
+
18
+ # UniXcoder ONNX for Code Search
19
+
20
+ **Converted by [VibeAtlas](https://vibeatlas.dev)** — AI Context Optimization for Developers
21
+
22
+ This is [Microsoft's UniXcoder](https://huggingface.co/microsoft/unixcoder-base) converted to ONNX format for use with **Transformers.js** in browser and Node.js environments.
23
+
24
+ ## Why UniXcoder?
25
+
26
+ UniXcoder understands code **semantically**, not just as text:
27
+ - Trained on 6 programming languages (Python, Java, JavaScript, PHP, Ruby, Go)
28
+ - Understands AST structure and data flow
29
+ - 20-30% better code search accuracy vs generic embedding models
30
+
31
+ ## Quick Start
32
+
33
+ ### Transformers.js (Browser/Node.js)
34
+
35
+ ```javascript
36
+ import { pipeline } from '@huggingface/transformers';
37
+
38
+ const embedder = await pipeline(
39
+ 'feature-extraction',
40
+ 'sailesh27/unixcoder-base-onnx'
41
+ );
42
+
43
+ const code = `function authenticate(user) {
44
+ return user.isValid && user.hasPermission;
45
+ }`;
46
+
47
+ const embedding = await embedder(code, {
48
+ pooling: 'mean',
49
+ normalize: true
50
+ });
51
+
52
+ console.log(embedding.dims); // [1, 768]
53
+ ```
54
+
55
+ ### Semantic Code Search
56
+
57
+ ```javascript
58
+ import { pipeline, cos_sim } from '@huggingface/transformers';
59
+
60
+ const embedder = await pipeline('feature-extraction', 'sailesh27/unixcoder-base-onnx');
61
+
62
+ // Index your code
63
+ const codeSnippets = [
64
+ 'function login(user, pass) { ... }',
65
+ 'function formatDate(date) { ... }',
66
+ 'function validateEmail(email) { ... }'
67
+ ];
68
+
69
+ const codeEmbeddings = await embedder(codeSnippets, { pooling: 'mean', normalize: true });
70
+
71
+ // Search with natural language
72
+ const query = 'user authentication';
73
+ const queryEmbedding = await embedder(query, { pooling: 'mean', normalize: true });
74
+
75
+ // Find most similar
76
+ const similarities = codeEmbeddings.tolist().map((emb, i) => ({
77
+ code: codeSnippets[i],
78
+ score: cos_sim(queryEmbedding.tolist()[0], emb)
79
+ })).sort((a, b) => b.score - a.score); // highest similarity first
80
+ ```
81
+
82
+ ## Technical Details
83
+
84
+ - **Architecture**: RoBERTa-based encoder
85
+ - **Hidden Size**: 768
86
+ - **Max Sequence Length**: 1024 tokens (`max_position_embeddings`: 1026, minus RoBERTa's 2 reserved positions)
87
+ - **Output Dimensions**: 768
88
+ - **ONNX Opset**: 14
89
+
90
+ ## About VibeAtlas
91
+
92
+ **VibeAtlas** is the reliability infrastructure for AI coding:
93
+
94
+ - Reduce AI token costs by 40-60%
95
+ - Improve code search accuracy with semantic understanding
96
+ - Add governance guardrails to AI workflows
97
+
98
+ **Links**:
99
+ - [Website](https://vibeatlas.dev)
100
+ - [VS Code Extension](https://marketplace.visualstudio.com/items?itemName=vibeatlas.vibeatlas)
101
+ - [GitHub](https://github.com/vibeatlas)
102
+
103
+ ## Citation
104
+
105
+ ```bibtex
106
+ @misc{unixcoder-onnx-2025,
107
+ title={UniXcoder ONNX: Code Embeddings for JavaScript},
108
+ author={VibeAtlas Team},
109
+ year={2025},
110
+ publisher={Hugging Face},
111
+ url={https://huggingface.co/sailesh27/unixcoder-base-onnx}
112
+ }
113
+ ```
114
+
115
+ ### Original UniXcoder Paper
116
+
117
+ ```bibtex
118
+ @inproceedings{guo2022unixcoder,
119
+ title={UniXcoder: Unified Cross-Modal Pre-training for Code Representation},
120
+ author={Guo, Daya and Lu, Shuai and Duan, Nan and Wang, Yanlin and Zhou, Ming and Yin, Jian},
121
+ booktitle={ACL},
122
+ year={2022}
123
+ }
124
+ ```
125
+
126
+ ## License
127
+
128
+ Apache 2.0 (same as original UniXcoder)
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "eos_token_id": 2,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 1026,
17
+ "model_type": "roberta",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "output_past": true,
21
+ "pad_token_id": 1,
22
+ "position_embedding_type": "absolute",
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "4.55.4",
25
+ "type_vocab_size": 10,
26
+ "use_cache": true,
27
+ "vocab_size": 51416
28
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb7c6027c122832f0b416c1956739bf79e57cf81dfaebfaba57e95dfa84db85f
3
+ size 501599488
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "model_max_length": 1000000000000000019884624838656,
53
+ "pad_token": "<pad>",
54
+ "sep_token": "</s>",
55
+ "tokenizer_class": "RobertaTokenizer",
56
+ "trim_offsets": true,
57
+ "unk_token": "<unk>"
58
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff