Update README.md
Browse files
README.md
CHANGED
|
@@ -18,9 +18,9 @@ The model is built on top of Qwen3(Qwen3-0.6B) and uses a custom non-causal atte
|
|
| 18 |
mechanism.
|
| 19 |
|
| 20 |
## Predicted Classes
|
| 21 |
-
0
|
| 22 |
-
1
|
| 23 |
-
2
|
| 24 |
|
| 25 |
## Transformer Inference Example
|
| 26 |
```python
|
|
@@ -70,6 +70,20 @@ def register_fa_attention():
|
|
| 70 |
# Register custom non-causal FA (feel free to use FA2/FA3); requires a GPU
|
| 71 |
register_fa_attention()
|
| 72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
tokenizer = AutoTokenizer.from_pretrained("Scicom-intl/multilingual-dynamic-entity-decoder")
|
| 74 |
model = Qwen3ForTokenClassification.from_pretrained(
|
| 75 |
"Scicom-intl/multilingual-dynamic-entity-decoder",
|
|
@@ -78,9 +92,9 @@ model = Qwen3ForTokenClassification.from_pretrained(
|
|
| 78 |
device_map={"":"cuda:0"}
|
| 79 |
)
|
| 80 |
|
| 81 |
-
|
| 82 |
token = tokenizer(
|
| 83 |
-
|
| 84 |
is_split_into_words=True,
|
| 85 |
return_tensors="pt"
|
| 86 |
).to(model.device)
|
|
@@ -91,9 +105,5 @@ with torch.no_grad():
|
|
| 91 |
print(prediction)
|
| 92 |
```
|
| 93 |
|
| 94 |
-
## Important Notes & Limitations
|
| 95 |
-
- Chinese text must be tokenized at the character level, not at the word level
|
| 96 |
-
|
| 97 |
-
|
| 98 |
## Evaluation Result
|
| 99 |
- F1 macro: 0.75
|
|
|
|
| 18 |
mechanism.
|
| 19 |
|
| 20 |
## Predicted Classes
|
| 21 |
+
- 0 : Non-entity token
|
| 22 |
+
- 1 : Name entity
|
| 23 |
+
- 2 : Address entity
|
| 24 |
|
| 25 |
## Transformer Inference Example
|
| 26 |
```python
|
|
|
|
| 70 |
# Register custom non-causal FA (feel free to use FA2/FA3); requires a GPU
|
| 71 |
register_fa_attention()
|
| 72 |
|
| 73 |
+
def tokenize_sentence_to_word(sentence: str):
|
| 74 |
+
tokens = []
|
| 75 |
+
chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]')
|
| 76 |
+
# Split text by spaces first
|
| 77 |
+
parts = sentence.split()
|
| 78 |
+
for part in parts:
|
| 79 |
+
if chinese_char_pattern.search(part):
|
| 80 |
+
# Character-level tokenization for Chinese
|
| 81 |
+
tokens.extend(list(part))
|
| 82 |
+
else:
|
| 83 |
+
# Word-level tokenization for other languages
|
| 84 |
+
tokens.append(part)
|
| 85 |
+
return tokens
|
| 86 |
+
|
| 87 |
tokenizer = AutoTokenizer.from_pretrained("Scicom-intl/multilingual-dynamic-entity-decoder")
|
| 88 |
model = Qwen3ForTokenClassification.from_pretrained(
|
| 89 |
"Scicom-intl/multilingual-dynamic-entity-decoder",
|
|
|
|
| 92 |
device_map={"":"cuda:0"}
|
| 93 |
)
|
| 94 |
|
| 95 |
+
word_token = tokenize_sentence_to_word("Hi, my name is Alex and I'm from Perlis")
|
| 96 |
token = tokenizer(
|
| 97 |
+
word_token,
|
| 98 |
is_split_into_words=True,
|
| 99 |
return_tensors="pt"
|
| 100 |
).to(model.device)
|
|
|
|
| 105 |
print(prediction)
|
| 106 |
```
|
| 107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
## Evaluation Result
|
| 109 |
- F1 macro: 0.75
|