jaeyong2 commited on
Commit
0402cc0
·
verified ·
1 Parent(s): 67018b8

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +67 -1
README.md CHANGED
@@ -8,6 +8,15 @@ language:
8
  base_model:
9
  - Qwen/Qwen3-0.6B
10
  ---
 
 
 
 
 
 
 
 
 
11
 
12
  ### example(En)
13
  ```
@@ -57,7 +66,7 @@ response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
57
  [{'text': 'Tim', 'type': 'PERSON'}, {'text': 'mom', 'type': 'PERSON'}, {'text': 'Sue', 'type': 'PERSON'}, {'text': 'park', 'type': 'LOCATION'}, {'text': 'fountain', 'type': 'LOCATION'}, {'text': 'fish', 'type': 'ANIMAL'}]
58
  </entities>
59
  ```
60
-
61
  ### example (ko)
62
  ```
63
  system = """
@@ -112,4 +121,61 @@ response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
112
  <entities>
113
  [{'text': '수진이', 'type': 'PERSON'}, {'text': '스타필드 하남', 'type': 'LOCATION'}, {'text': '아이폰 16', 'type': 'PRODUCT'}, {'text': '방탄소년단', 'type': 'ORGANIZATION'}, {'text': '콘서트 실황 영화', 'type': 'WORK_OF_ART'}, {'text': '토요일', 'type': 'DATE'}, {'text': '카페 노티드', 'type': 'LOCATION'}]
114
  </entities>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  ```
 
8
  base_model:
9
  - Qwen/Qwen3-0.6B
10
  ---
11
+ ## Model Detail
12
+ ### Goal
13
+ - Perform dynamic NER: given a sentence and a runtime schema of entity types, extract all matching entities.
14
+ - Support multilingual input (English, Korean, Japanese, etc.).
15
+
16
+ ### Limitation
17
+ - The model tends to extract only one entity per type and may miss multiple mentions of the same type.
18
+ - Overlapping or nested entities (e.g., “New York” vs “York”) may be unclear without explicit overlap policy.
19
+
20
 
21
  ### example(En)
22
  ```
 
66
  [{'text': 'Tim', 'type': 'PERSON'}, {'text': 'mom', 'type': 'PERSON'}, {'text': 'Sue', 'type': 'PERSON'}, {'text': 'park', 'type': 'LOCATION'}, {'text': 'fountain', 'type': 'LOCATION'}, {'text': 'fish', 'type': 'ANIMAL'}]
67
  </entities>
68
  ```
69
+ ----------
70
  ### example (ko)
71
  ```
72
  system = """
 
121
  <entities>
122
  [{'text': '수진이', 'type': 'PERSON'}, {'text': '스타필드 하남', 'type': 'LOCATION'}, {'text': '아이폰 16', 'type': 'PRODUCT'}, {'text': '방탄소년단', 'type': 'ORGANIZATION'}, {'text': '콘서트 실황 영화', 'type': 'WORK_OF_ART'}, {'text': '토요일', 'type': 'DATE'}, {'text': '카페 노티드', 'type': 'LOCATION'}]
123
  </entities>
124
+ ```
125
+ -------
126
+
127
+ ### example (ja)
128
+ ```
129
+ system = """
130
+ You are an AI that dynamically performs Named Entity Recognition (NER).
131
+ You receive a sentence and a list of entity types the user wants to extract, and then identify all entities of those types within the sentence.
132
+ If you cannot find any suitable entities within the sentence, return an empty list.
133
+ """
134
+
135
+ text = """
136
+ リナは4月の終わりに東京ディズニーランドへ行きました。
137
+ 彼女はスパイファミリーのショーを見て、スターバックスで抹茶ラテを飲みました。
138
+ 夜には「千と千尋の神隠し」の特別上映会にも参加しました。
139
+ """.strip()
140
+
141
+ named_entity = """
142
+ [
143
+ {"type": "PERSON", "description": "個人名"},
144
+ {"type": "LOCATION", "description": "地名や施設名"},
145
+ {"type": "ORGANIZATION", "description": "会社や団体名"},
146
+ {"type": "WORK_OF_ART", "description": "映画、音楽、アニメ、書籍など"},
147
+ {"type": "PRODUCT", "description": "商品やブランド名"},
148
+ {"type": "DATE", "description": "日付や時期"}
149
+ ]
150
+ """.strip()
151
+
152
+
153
+ user = f"<sentence>\n{text}\n</sentence>\n\n<entity_list>\n{named_entity}\n</entity_list>\n\n"
154
+ chat = [{"role":"system", "content":system}, {"role":"user", "content":user}]
155
+ chat_text = tokenizer.apply_chat_template(
156
+ chat,
157
+ enable_thinking=False,
158
+ add_generation_prompt=True,
159
+ tokenize=False
160
+ )
161
+
162
+ model_inputs = tokenizer([chat_text], return_tensors="pt").to(model.device)
163
+
164
+ generated_ids = model.generate(
165
+ **model_inputs,
166
+ max_new_tokens=512
167
+ )
168
+
169
+ generated_ids = [
170
+ output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
171
+ ]
172
+
173
+ response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
174
+ ```
175
+
176
+ ### result (ja)
177
+ ```
178
+ <entities>
179
+ [{'text': 'リナ', 'type': 'PERSON'}, {'text': '東京', 'type': 'LOCATION'}, {'text': 'スパイファミリー', 'type': 'ORGANIZATION'}, {'text': 'スターバックス', 'type': 'ORGANIZATION'}, {'text': '千と千尋の神隠し', 'type': 'WORK_OF_ART'}, {'text': '厚茶ラテ', 'type': 'PRODUCT'}, {'text': '4月', 'type': 'DATE'}]
180
+ </entities>
181
  ```