Text Classification
Transformers
ONNX
Safetensors
English
roberta
code
programming-language
code-classification
text-embeddings-inference
Instructions to use philomath-1209/programming-language-identification with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use philomath-1209/programming-language-identification with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="philomath-1209/programming-language-identification")

# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("philomath-1209/programming-language-identification")
model = AutoModelForSequenceClassification.from_pretrained("philomath-1209/programming-language-identification")
- Inference
- Notebooks
- Google Colab
- Kaggle
ONNX-converted version of the model
Browse files
We decided to replace the existing model used by the [Code Scanner](https://llm-guard.com/input_scanners/code/) in [llm-guard](https://github.com/laiyer-ai/llm-guard) with your model. Our tests show much better accuracy compared to the Hugging Face one.
For faster inference, we use ONNX models converted with Hugging Face Optimum.
Example of the repo with ONNX built-in: https://huggingface.co/laiyer/deberta-v3-base-prompt-injection
- onnx/config.json +84 -0
- onnx/merges.txt +0 -0
- onnx/model.onnx +3 -0
- onnx/special_tokens_map.json +51 -0
- onnx/tokenizer.json +0 -0
- onnx/tokenizer_config.json +58 -0
- onnx/vocab.json +0 -0
onnx/config.json
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "philomath-1209/programming-language-identification",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"RobertaForSequenceClassification"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"bos_token_id": 0,
|
| 8 |
+
"classifier_dropout": null,
|
| 9 |
+
"eos_token_id": 2,
|
| 10 |
+
"hidden_act": "gelu",
|
| 11 |
+
"hidden_dropout_prob": 0.1,
|
| 12 |
+
"hidden_size": 768,
|
| 13 |
+
"id2label": {
|
| 14 |
+
"0": "Scala",
|
| 15 |
+
"1": "JavaScript",
|
| 16 |
+
"2": "COBOL",
|
| 17 |
+
"3": "ARM Assembly",
|
| 18 |
+
"4": "R",
|
| 19 |
+
"5": "Lua",
|
| 20 |
+
"6": "C++",
|
| 21 |
+
"7": "Visual Basic .NET",
|
| 22 |
+
"8": "Go",
|
| 23 |
+
"9": "Erlang",
|
| 24 |
+
"10": "C#",
|
| 25 |
+
"11": "Rust",
|
| 26 |
+
"12": "Ruby",
|
| 27 |
+
"13": "Swift",
|
| 28 |
+
"14": "Mathematica/Wolfram Language",
|
| 29 |
+
"15": "PHP",
|
| 30 |
+
"16": "Fortran",
|
| 31 |
+
"17": "AppleScript",
|
| 32 |
+
"18": "Pascal",
|
| 33 |
+
"19": "Java",
|
| 34 |
+
"20": "PowerShell",
|
| 35 |
+
"21": "Python",
|
| 36 |
+
"22": "C",
|
| 37 |
+
"23": "Perl",
|
| 38 |
+
"24": "Kotlin",
|
| 39 |
+
"25": "jq"
|
| 40 |
+
},
|
| 41 |
+
"initializer_range": 0.02,
|
| 42 |
+
"intermediate_size": 3072,
|
| 43 |
+
"label2id": {
|
| 44 |
+
"ARM Assembly": 3,
|
| 45 |
+
"AppleScript": 17,
|
| 46 |
+
"C": 22,
|
| 47 |
+
"C#": 10,
|
| 48 |
+
"C++": 6,
|
| 49 |
+
"COBOL": 2,
|
| 50 |
+
"Erlang": 9,
|
| 51 |
+
"Fortran": 16,
|
| 52 |
+
"Go": 8,
|
| 53 |
+
"Java": 19,
|
| 54 |
+
"JavaScript": 1,
|
| 55 |
+
"Kotlin": 24,
|
| 56 |
+
"Lua": 5,
|
| 57 |
+
"Mathematica/Wolfram Language": 14,
|
| 58 |
+
"PHP": 15,
|
| 59 |
+
"Pascal": 18,
|
| 60 |
+
"Perl": 23,
|
| 61 |
+
"PowerShell": 20,
|
| 62 |
+
"Python": 21,
|
| 63 |
+
"R": 4,
|
| 64 |
+
"Ruby": 12,
|
| 65 |
+
"Rust": 11,
|
| 66 |
+
"Scala": 0,
|
| 67 |
+
"Swift": 13,
|
| 68 |
+
"Visual Basic .NET": 7,
|
| 69 |
+
"jq": 25
|
| 70 |
+
},
|
| 71 |
+
"layer_norm_eps": 1e-05,
|
| 72 |
+
"max_position_embeddings": 514,
|
| 73 |
+
"model_type": "roberta",
|
| 74 |
+
"num_attention_heads": 12,
|
| 75 |
+
"num_hidden_layers": 6,
|
| 76 |
+
"pad_token_id": 1,
|
| 77 |
+
"position_embedding_type": "absolute",
|
| 78 |
+
"problem_type": "single_label_classification",
|
| 79 |
+
"torch_dtype": "float32",
|
| 80 |
+
"transformers_version": "4.36.2",
|
| 81 |
+
"type_vocab_size": 1,
|
| 82 |
+
"use_cache": true,
|
| 83 |
+
"vocab_size": 52000
|
| 84 |
+
}
|
onnx/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
onnx/model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3d9bf25fe9933c20db353c90fd5295554d4c02f06f6f052d8ec1ef87c623e0bf
|
| 3 |
+
size 334030262
|
onnx/special_tokens_map.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": true,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"cls_token": {
|
| 10 |
+
"content": "<s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": true,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"eos_token": {
|
| 17 |
+
"content": "</s>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": true,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"mask_token": {
|
| 24 |
+
"content": "<mask>",
|
| 25 |
+
"lstrip": true,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"pad_token": {
|
| 31 |
+
"content": "<pad>",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": true,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
},
|
| 37 |
+
"sep_token": {
|
| 38 |
+
"content": "</s>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": true,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false
|
| 43 |
+
},
|
| 44 |
+
"unk_token": {
|
| 45 |
+
"content": "<unk>",
|
| 46 |
+
"lstrip": false,
|
| 47 |
+
"normalized": true,
|
| 48 |
+
"rstrip": false,
|
| 49 |
+
"single_word": false
|
| 50 |
+
}
|
| 51 |
+
}
|
onnx/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
onnx/tokenizer_config.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"0": {
|
| 5 |
+
"content": "<s>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": true,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
},
|
| 12 |
+
"1": {
|
| 13 |
+
"content": "<pad>",
|
| 14 |
+
"lstrip": false,
|
| 15 |
+
"normalized": true,
|
| 16 |
+
"rstrip": false,
|
| 17 |
+
"single_word": false,
|
| 18 |
+
"special": true
|
| 19 |
+
},
|
| 20 |
+
"2": {
|
| 21 |
+
"content": "</s>",
|
| 22 |
+
"lstrip": false,
|
| 23 |
+
"normalized": true,
|
| 24 |
+
"rstrip": false,
|
| 25 |
+
"single_word": false,
|
| 26 |
+
"special": true
|
| 27 |
+
},
|
| 28 |
+
"3": {
|
| 29 |
+
"content": "<unk>",
|
| 30 |
+
"lstrip": false,
|
| 31 |
+
"normalized": true,
|
| 32 |
+
"rstrip": false,
|
| 33 |
+
"single_word": false,
|
| 34 |
+
"special": true
|
| 35 |
+
},
|
| 36 |
+
"4": {
|
| 37 |
+
"content": "<mask>",
|
| 38 |
+
"lstrip": true,
|
| 39 |
+
"normalized": false,
|
| 40 |
+
"rstrip": false,
|
| 41 |
+
"single_word": false,
|
| 42 |
+
"special": true
|
| 43 |
+
}
|
| 44 |
+
},
|
| 45 |
+
"bos_token": "<s>",
|
| 46 |
+
"clean_up_tokenization_spaces": true,
|
| 47 |
+
"cls_token": "<s>",
|
| 48 |
+
"eos_token": "</s>",
|
| 49 |
+
"errors": "replace",
|
| 50 |
+
"mask_token": "<mask>",
|
| 51 |
+
"max_len": 512,
|
| 52 |
+
"model_max_length": 512,
|
| 53 |
+
"pad_token": "<pad>",
|
| 54 |
+
"sep_token": "</s>",
|
| 55 |
+
"tokenizer_class": "RobertaTokenizer",
|
| 56 |
+
"trim_offsets": true,
|
| 57 |
+
"unk_token": "<unk>"
|
| 58 |
+
}
|
onnx/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|