huseinzolkepliscicom committed on
Commit
b4bd6a0
·
verified ·
1 Parent(s): 4aa76ce

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +19 -9
README.md CHANGED
@@ -18,9 +18,9 @@ The model is built on top of Qwen3(Qwen3-0.6B) and uses a custom non-causal atte
18
  mechanism.
19
 
20
  ## Predicted Classes
21
- 0 - Non-entity token
22
- 1 - Name entity
23
- 2 - Address entity
24
 
25
  ## Transformer Inference Example
26
  ```python
@@ -70,6 +70,20 @@ def register_fa_attention():
70
  # Register custom non-causal FA (Feel free to use FA2/FA3), required GPU
71
  register_fa_attention()
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  tokenizer = AutoTokenizer.from_pretrained("Scicom-intl/multilingual-dynamic-entity-decoder")
74
  model = Qwen3ForTokenClassification.from_pretrained(
75
  "Scicom-intl/multilingual-dynamic-entity-decoder",
@@ -78,9 +92,9 @@ model = Qwen3ForTokenClassification.from_pretrained(
78
  device_map={"":"cuda:0"}
79
  )
80
 
81
- text = "Hi, my name is Alex and I'm from Perlis"
82
  token = tokenizer(
83
- text.split(),
84
  is_split_into_words=True,
85
  return_tensors="pt"
86
  ).to(model.device)
@@ -91,9 +105,5 @@ with torch.no_grad():
91
  print(prediction)
92
  ```
93
 
94
- ## Important Notes & Limitations
95
- - Chinese text must be tokenized at the character level, not by words
96
-
97
-
98
  ## Evaluation Result
99
  - F1 macro: 0.75
 
18
  mechanism.
19
 
20
  ## Predicted Classes
21
+ - 0 : Non-entity token
22
+ - 1 : Name entity
23
+ - 2 : Address entity
24
 
25
  ## Transformer Inference Example
26
  ```python
 
70
  # Register custom non-causal FA (Feel free to use FA2/FA3), required GPU
71
  register_fa_attention()
72
 
73
def tokenize_sentence_to_word(sentence: str):
    """Split *sentence* into model-ready tokens.

    Whitespace-separated chunks are kept as whole words, except that any
    chunk containing a CJK character (U+4E00–U+9FFF) is broken into
    individual characters — the model expects Chinese at character level.

    Returns a list of string tokens.
    """
    has_chinese = re.compile(r'[\u4e00-\u9fff]').search
    words = []
    for chunk in sentence.split():
        if has_chinese(chunk):
            # Character-level tokens for chunks with Chinese text
            words += list(chunk)
        else:
            # Word-level token for everything else
            words.append(chunk)
    return words
86
+
87
  tokenizer = AutoTokenizer.from_pretrained("Scicom-intl/multilingual-dynamic-entity-decoder")
88
  model = Qwen3ForTokenClassification.from_pretrained(
89
  "Scicom-intl/multilingual-dynamic-entity-decoder",
 
92
  device_map={"":"cuda:0"}
93
  )
94
 
95
+ word_token = tokenize_sentence_to_word("Hi, my name is Alex and I'm from Perlis")
96
  token = tokenizer(
97
+ word_token,
98
  is_split_into_words=True,
99
  return_tensors="pt"
100
  ).to(model.device)
 
105
  print(prediction)
106
  ```
107
 
 
 
 
 
108
  ## Evaluation Result
109
  - F1 macro: 0.75