Update README.md
Browse files
README.md
CHANGED
|
@@ -37,7 +37,7 @@ It achieves the following results on the evaluation set:
|
|
| 37 |
|
| 38 |
## Model description
|
| 39 |
|
| 40 |
-
### 使用方法(
|
| 41 |
|
| 42 |
```python
|
| 43 |
ner_pipe = pipeline("token-classification", model='roberthsu2003/models_for_ner',aggregation_strategy="simple")
|
|
@@ -55,6 +55,87 @@ ner_result
|
|
| 55 |
{'PER': ['徐國堂'], 'LOC': ['台北']}
|
| 56 |
```
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
## Intended uses & limitations
|
| 59 |
|
| 60 |
More information needed
|
|
|
|
| 37 |
|
| 38 |
## Model description
|
| 39 |
|
| 40 |
+
### 使用方法(pipeline的方法)
|
| 41 |
|
| 42 |
```python
|
| 43 |
ner_pipe = pipeline("token-classification", model='roberthsu2003/models_for_ner',aggregation_strategy="simple")
|
|
|
|
| 55 |
{'PER': ['徐國堂'], 'LOC': ['台北']}
|
| 56 |
```
|
| 57 |
|
| 58 |
+
### 使用方法(model,tokenizer)
|
| 59 |
+
|
| 60 |
+
```python
|
| 61 |
+
from transformers import AutoModelForTokenClassification, AutoTokenizer
import numpy as np

# Pull the fine-tuned checkpoint and its matching tokenizer from the Hub.
tokenizer = AutoTokenizer.from_pretrained('roberthsu2003/models_for_ner')
model = AutoModelForTokenClassification.from_pretrained('roberthsu2003/models_for_ner')

# Index -> tag mapping used to decode the model's logits.
# NOTE(review): assumed to match the label order used at training time — confirm.
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
|
| 70 |
+
|
| 71 |
+
def predict_ner(text):
    """Return one BIO tag per word of *text* using the loaded model."""
    # Tokenize; the returned encoding keeps the token->word alignment
    # that we need below.
    encoding = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Forward pass, then pick the highest-scoring label id per token.
    logits = model(**encoding).logits
    label_ids = np.argmax(logits.detach().numpy(), axis=-1)[0]

    # word_ids() is a method on the encoding result, not on the tokenizer:
    # it maps each token position to its source word, or None for special
    # tokens such as [CLS]/[SEP], which we drop.
    word_ids = encoding.word_ids(batch_index=0)
    return [label_list[label_id]
            for word_id, label_id in zip(word_ids, label_ids)
            if word_id is not None]
|
| 91 |
+
|
| 92 |
+
#To get the entities, you'll need to group consecutive non-O tags:
|
| 93 |
+
|
| 94 |
+
def get_entities(tags):
    """Group consecutive BIO tags into entity spans.

    Args:
        tags: Sequence of BIO tag strings such as 'O', 'B-PER', 'I-LOC'.

    Returns:
        List of (start_index, end_index, entity_type) tuples, where
        end_index is exclusive and entity_type is e.g. 'PER' or 'LOC'.
    """
    entities = []
    start_index = -1
    current_entity_type = None
    for i, tag in enumerate(tags):
        # A 'B-' tag, like 'O', terminates any entity in progress.
        # (Previously only 'O' closed an entity, so two adjacent entities
        # such as ['B-PER', 'B-LOC'] were merged into a single span.)
        if tag == 'O' or tag.startswith('B-'):
            if start_index != -1:
                entities.append((start_index, i, current_entity_type))
                start_index = -1
                current_entity_type = None
        if tag != 'O':
            entity_type = tag[2:]  # strip the 'B-'/'I-' prefix
            if start_index == -1:
                start_index = i
                current_entity_type = entity_type
            elif entity_type != current_entity_type:
                # An 'I-' tag of a different type also starts a new entity.
                entities.append((start_index, i, current_entity_type))
                start_index = i
                current_entity_type = entity_type
    if start_index != -1:
        # Flush an entity that runs to the end of the sequence.
        entities.append((start_index, len(tags), current_entity_type))
    return entities
|
| 112 |
+
|
| 113 |
+
# --- Example usage ---
text = "徐國堂在台北上班"
ner_tags = predict_ner(text)

print(f"Text: {text}")
# expected output:
# Text: 徐國堂在台北上班

print(f"NER Tags: {ner_tags}")
# expected output:
# NER Tags: ['B-PER', 'I-PER', 'I-PER', 'O', 'B-LOC', 'I-LOC', 'O', 'O']

# Recover the surface form of each entity from the token list.
word_tokens = tokenizer.tokenize(text)
print(f"Entities:")
for start, end, entity_type in get_entities(ner_tags):
    entity_text = "".join(word_tokens[start:end])
    print(f"- {entity_text}: {entity_type}")
# expected output:
# Entities:
# - 徐國堂: PER
# - 台北: LOC
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
## Intended uses & limitations
|
| 140 |
|
| 141 |
More information needed
|