Change best model to BIOELECTRA model
Browse files
- app.py +3 -3
- bioelectra_model/config.json +41 -0
- bioelectra_model/special_tokens_map.json +7 -0
- bioelectra_model/tokenizer.json +0 -0
- bioelectra_model/tokenizer_config.json +58 -0
- bioelectra_model/training_args.bin +0 -0
- bioelectra_model/vocab.txt +0 -0
- roberta_model/config.json +0 -3
- roberta_model/merges.txt +0 -3
- roberta_model/special_tokens_map.json +0 -3
- roberta_model/tokenizer.json +0 -3
- roberta_model/tokenizer_config.json +0 -3
- roberta_model/training_args.bin +0 -3
- roberta_model/vocab.json +0 -3
- templates/index.html +2 -2
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
from flask import Flask, request, render_template, send_file
|
| 2 |
-
from transformers import AutoModelForTokenClassification,
|
| 3 |
from collections import Counter
|
| 4 |
import datetime, json
|
| 5 |
import os
|
|
@@ -8,8 +8,8 @@ app = Flask(__name__)
|
|
| 8 |
|
| 9 |
try:
|
| 10 |
# Load model from local files
|
| 11 |
-
model = AutoModelForTokenClassification.from_pretrained("
|
| 12 |
-
tokenizer =
|
| 13 |
nlp = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
| 14 |
print("Pipeline loaded successfully!")
|
| 15 |
except Exception as e:
|
|
|
|
| 1 |
from flask import Flask, request, render_template, send_file
|
| 2 |
+
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
|
| 3 |
from collections import Counter
|
| 4 |
import datetime, json
|
| 5 |
import os
|
|
|
|
| 8 |
|
| 9 |
try:
|
| 10 |
# Load model from local files
|
| 11 |
+
model = AutoModelForTokenClassification.from_pretrained("bioelectra_model", local_files_only=True)
|
| 12 |
+
tokenizer = AutoTokenizer.from_pretrained("bioelectra_model", local_files_only=True)
|
| 13 |
nlp = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
| 14 |
print("Pipeline loaded successfully!")
|
| 15 |
except Exception as e:
|
bioelectra_model/config.json
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"ElectraForTokenClassification"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"classifier_dropout": null,
|
| 7 |
+
"embedding_size": 768,
|
| 8 |
+
"hidden_act": "gelu",
|
| 9 |
+
"hidden_dropout_prob": 0.1,
|
| 10 |
+
"hidden_size": 768,
|
| 11 |
+
"id2label": {
|
| 12 |
+
"0": "B-AC",
|
| 13 |
+
"1": "B-LF",
|
| 14 |
+
"2": "I-LF",
|
| 15 |
+
"3": "O"
|
| 16 |
+
},
|
| 17 |
+
"initializer_range": 0.02,
|
| 18 |
+
"intermediate_size": 3072,
|
| 19 |
+
"label2id": {
|
| 20 |
+
"B-AC": 0,
|
| 21 |
+
"B-LF": 1,
|
| 22 |
+
"I-LF": 2,
|
| 23 |
+
"O": 3
|
| 24 |
+
},
|
| 25 |
+
"layer_norm_eps": 1e-12,
|
| 26 |
+
"max_position_embeddings": 512,
|
| 27 |
+
"model_type": "electra",
|
| 28 |
+
"num_attention_heads": 12,
|
| 29 |
+
"num_hidden_layers": 12,
|
| 30 |
+
"pad_token_id": 0,
|
| 31 |
+
"position_embedding_type": "absolute",
|
| 32 |
+
"summary_activation": "gelu",
|
| 33 |
+
"summary_last_dropout": 0.1,
|
| 34 |
+
"summary_type": "first",
|
| 35 |
+
"summary_use_proj": true,
|
| 36 |
+
"torch_dtype": "float32",
|
| 37 |
+
"transformers_version": "4.51.3",
|
| 38 |
+
"type_vocab_size": 2,
|
| 39 |
+
"use_cache": true,
|
| 40 |
+
"vocab_size": 30522
|
| 41 |
+
}
|
bioelectra_model/special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[CLS]",
|
| 3 |
+
"mask_token": "[MASK]",
|
| 4 |
+
"pad_token": "[PAD]",
|
| 5 |
+
"sep_token": "[SEP]",
|
| 6 |
+
"unk_token": "[UNK]"
|
| 7 |
+
}
|
bioelectra_model/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
bioelectra_model/tokenizer_config.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "[UNK]",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "[CLS]",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "[SEP]",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"4": {
|
| 36 |
+
"content": "[MASK]",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"clean_up_tokenization_spaces": true,
|
| 45 |
+
"cls_token": "[CLS]",
|
| 46 |
+
"do_basic_tokenize": true,
|
| 47 |
+
"do_lower_case": true,
|
| 48 |
+
"extra_special_tokens": {},
|
| 49 |
+
"mask_token": "[MASK]",
|
| 50 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 51 |
+
"never_split": null,
|
| 52 |
+
"pad_token": "[PAD]",
|
| 53 |
+
"sep_token": "[SEP]",
|
| 54 |
+
"strip_accents": null,
|
| 55 |
+
"tokenize_chinese_chars": true,
|
| 56 |
+
"tokenizer_class": "ElectraTokenizer",
|
| 57 |
+
"unk_token": "[UNK]"
|
| 58 |
+
}
|
bioelectra_model/training_args.bin
ADDED
|
Binary file (5.37 kB). View file
|
|
|
bioelectra_model/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
roberta_model/config.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:c46810bc1ab9c01c6d0161eec4da3d84b5a1ec6a48590b8dd6d93f0f6c0a8e3d
|
| 3 |
-
size 810
|
|
|
|
|
|
|
|
|
|
|
|
roberta_model/merges.txt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5
|
| 3 |
-
size 456318
|
|
|
|
|
|
|
|
|
|
|
|
roberta_model/special_tokens_map.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:06e405a36dfe4b9604f484f6a1e619af1a7f7d09e34a8555eb0b77b66318067f
|
| 3 |
-
size 280
|
|
|
|
|
|
|
|
|
|
|
|
roberta_model/tokenizer.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:5765a30438d50e6f698c3ac6f8a09125db725006ca376cf692056d9b9db40151
|
| 3 |
-
size 3558906
|
|
|
|
|
|
|
|
|
|
|
|
roberta_model/tokenizer_config.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:38b4f61b73aab22f9c2bb01d0f2be28d745e1db57ef7fcfddfa33c6ac8265444
|
| 3 |
-
size 1245
|
|
|
|
|
|
|
|
|
|
|
|
roberta_model/training_args.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:92e2a186efd310ad699b41d69c662674b2afe096fc60962964a6c63b2782b92e
|
| 3 |
-
size 5240
|
|
|
|
|
|
|
|
|
|
|
|
roberta_model/vocab.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:ed19656ea1707df69134c4af35c8ceda2cc9860bf2c3495026153a133670ab5e
|
| 3 |
-
size 798293
|
|
|
|
|
|
|
|
|
|
|
|
templates/index.html
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
<html lang="en">
|
| 3 |
<head>
|
| 4 |
<meta charset="UTF-8">
|
| 5 |
-
<title>
|
| 6 |
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
| 7 |
<style>
|
| 8 |
body { font-family: Arial, sans-serif; padding: 20px; background-color: #f8f9fa; }
|
|
@@ -17,7 +17,7 @@
|
|
| 17 |
</head>
|
| 18 |
<body>
|
| 19 |
|
| 20 |
-
<h1>
|
| 21 |
|
| 22 |
<form method="post" action="/" enctype="application/x-www-form-urlencoded">
|
| 23 |
<label for="input_text">Enter a sentence:</label><br>
|
|
|
|
| 2 |
<html lang="en">
|
| 3 |
<head>
|
| 4 |
<meta charset="UTF-8">
|
| 5 |
+
<title>BIOELECTRA Token Classification</title>
|
| 6 |
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
| 7 |
<style>
|
| 8 |
body { font-family: Arial, sans-serif; padding: 20px; background-color: #f8f9fa; }
|
|
|
|
| 17 |
</head>
|
| 18 |
<body>
|
| 19 |
|
| 20 |
+
<h1>BIOELECTRA Token Classification Demo</h1>
|
| 21 |
|
| 22 |
<form method="post" action="/" enctype="application/x-www-form-urlencoded">
|
| 23 |
<label for="input_text">Enter a sentence:</label><br>
|