jaeyong2 commited on
Commit
67018b8
·
verified ·
1 Parent(s): bf14982

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +56 -0
README.md CHANGED
@@ -56,4 +56,60 @@ response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
56
  <entities>
57
  [{'text': 'Tim', 'type': 'PERSON'}, {'text': 'mom', 'type': 'PERSON'}, {'text': 'Sue', 'type': 'PERSON'}, {'text': 'park', 'type': 'LOCATION'}, {'text': 'fountain', 'type': 'LOCATION'}, {'text': 'fish', 'type': 'ANIMAL'}]
58
  </entities>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  ```
 
56
  <entities>
57
  [{'text': 'Tim', 'type': 'PERSON'}, {'text': 'mom', 'type': 'PERSON'}, {'text': 'Sue', 'type': 'PERSON'}, {'text': 'park', 'type': 'LOCATION'}, {'text': 'fountain', 'type': 'LOCATION'}, {'text': 'fish', 'type': 'ANIMAL'}]
58
  </entities>
59
+ ```
60
+
61
+ ### example (ko)
62
+ ```
63
+ system = """
64
+ You are an AI that dynamically performs Named Entity Recognition (NER).
65
+ You receive a sentence and a list of entity types the user wants to extract, and then identify all entities of those types within the sentence.
66
+ If you cannot find any suitable entities within the sentence, return an empty list.
67
+ """
68
+
69
+ text = """
70
+ μˆ˜μ§„μ΄λŠ” μ§€λ‚œμ£Ό ν† μš”μΌμ— μŠ€νƒ€ν•„λ“œ ν•˜λ‚¨μ— κ°”μ–΄μš”.
71
+ 그듀은 μ• ν”Œ μŠ€ν† μ–΄μ—μ„œ μƒˆλ‘œ λ‚˜μ˜¨ 아이폰 16을 κ΅¬κ²½ν•˜κ³ , 카페 λ…Έν‹°λ“œμ—μ„œ 도넛을 λ¨Ήμ—ˆμ–΄μš”.
72
+ κ·Έλ‚  저녁엔 λ°©νƒ„μ†Œλ…„λ‹¨ μ½˜μ„œνŠΈ μ‹€ν™© μ˜ν™”λ₯Ό λ΄€μ–΄μš”. 정말 신났죠!
73
+ """.strip()
74
+
75
+ named_entity = """
76
+ [
77
+ {"type": "PERSON", "description": "μ‚¬λžŒ 이름"},
78
+ {"type": "LOCATION", "description": "μ§€λͺ… λ˜λŠ” μž₯μ†Œ"},
79
+ {"type": "ORGANIZATION", "description": "쑰직, νšŒμ‚¬, 단체"},
80
+ {"type": "PRODUCT", "description": "μ œν’ˆλͺ…"},
81
+ {"type": "WORK_OF_ART", "description": "예술 μž‘ν’ˆ, μ˜ν™”, μ±…, λ…Έλž˜ λ“±"},
82
+ {"type": "DATE", "description": "λ‚ μ§œ, μš”μΌ, μ‹œμ "}
83
+ ]
84
+ """.strip()
85
+
86
+
87
+ user = f"<sentence>\n{text}\n</sentence>\n\n<entity_list>\n{named_entity}\n</entity_list>\n\n"
88
+ chat = [{"role":"system", "content":system}, {"role":"user", "content":user}]
89
+ chat_text = tokenizer.apply_chat_template(
90
+ chat,
91
+ enable_thinking=False,
92
+ add_generation_prompt=True,
93
+ tokenize=False
94
+ )
95
+
96
+ model_inputs = tokenizer([chat_text], return_tensors="pt").to(model.device)
97
+
98
+ generated_ids = model.generate(
99
+ **model_inputs,
100
+ max_new_tokens=512
101
+ )
102
+
103
+ generated_ids = [
104
+ output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
105
+ ]
106
+
107
+ response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
108
+ ```
109
+
110
+ ### result (ko)
111
+ ```
112
+ <entities>
113
+ [{'text': 'μˆ˜μ§„μ΄', 'type': 'PERSON'}, {'text': 'μŠ€νƒ€ν•„λ“œ ν•˜λ‚¨', 'type': 'LOCATION'}, {'text': '아이폰 16', 'type': 'PRODUCT'}, {'text': 'λ°©νƒ„μ†Œλ…„λ‹¨', 'type': 'ORGANIZATION'}, {'text': 'μ½˜μ„œνŠΈ μ‹€ν™© μ˜ν™”', 'type': 'WORK_OF_ART'}, {'text': 'ν† μš”μΌ', 'type': 'DATE'}, {'text': '카페 λ…Έν‹°λ“œ', 'type': 'LOCATION'}]
114
+ </entities>
115
  ```