Token Classification
Tatar
tatar
morphology
lstm
crf
ArabovMK committed on
Commit
f720d9c
·
verified ·
1 Parent(s): adcdd5e

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +31 -17
README.md CHANGED
@@ -52,8 +52,8 @@ Then load and use the model:
52
  ```python
53
  import torch
54
  import json
55
- from torchcrf import CRF
56
  from torch import nn
 
57
  from huggingface_hub import hf_hub_download
58
 
59
  # Define the model class (must match training)
@@ -78,47 +78,61 @@ class BiLSTMCRF(nn.Module):
78
  else:
79
  return self.crf.decode(emissions, mask=mask.bool())
80
 
81
- # Load files from Hugging Face
82
- config_path = hf_hub_download("TatarNLPWorld/lstm-tatar-morph", "config.json")
83
- word2id_path = hf_hub_download("TatarNLPWorld/lstm-tatar-morph", "word2id.json")
84
- weights_path = hf_hub_download("TatarNLPWorld/lstm-tatar-morph", "best_model.pt")
85
- id2tag_path = hf_hub_download("TatarNLPWorld/lstm-tatar-morph", "id2tag.json")
 
86
 
87
  # Load hyperparameters
88
  with open(config_path) as f:
89
  config = json.load(f)
 
90
  with open(word2id_path) as f:
91
  word2id = json.load(f)
 
92
  with open(id2tag_path) as f:
93
  id2tag = {int(k): v for k, v in json.load(f).items()}
94
 
95
  # Instantiate model and load weights
96
- model = BiLSTMCRF(len(word2id), config['embedding_dim'], config['hidden_dim'], config['num_labels'], config['dropout'])
 
 
 
 
 
 
97
  model.load_state_dict(torch.load(weights_path, map_location='cpu'), strict=False)
98
  model.eval()
99
 
100
- def predict(tokens):
101
  ids = [word2id.get(w, word2id['<UNK>']) for w in tokens]
102
  mask = [1] * len(ids)
103
  orig_len = len(ids)
104
- if len(ids) < 128:
105
- ids += [0] * (128 - len(ids))
106
- mask += [0] * (128 - len(mask))
 
 
107
  else:
108
- ids = ids[:128]
109
- mask = mask[:128]
110
- input_ids = torch.tensor([ids])
111
- mask_tensor = torch.tensor([mask])
 
 
112
  with torch.no_grad():
113
  preds = model(input_ids, mask_tensor)[0]
 
114
  preds = preds[:orig_len]
115
  return [id2tag[p] for p in preds]
116
 
117
  # Example
118
  tokens = ["Татар", "теле", "бик", "бай", "."]
119
  tags = predict(tokens)
120
- for t, tag in zip(tokens, tags):
121
- print(f"{t} -> {tag}")
122
  ```
123
 
124
  Expected output:
 
52
  ```python
53
  import torch
54
  import json
 
55
  from torch import nn
56
+ from torchcrf import CRF
57
  from huggingface_hub import hf_hub_download
58
 
59
  # Define the model class (must match training)
 
78
  else:
79
  return self.crf.decode(emissions, mask=mask.bool())
80
 
81
+ # Download required files from Hugging Face
82
+ repo_id = "TatarNLPWorld/lstm-tatar-morph"
83
+ config_path = hf_hub_download(repo_id, "config.json")
84
+ word2id_path = hf_hub_download(repo_id, "word2id.json")
85
+ weights_path = hf_hub_download(repo_id, "best_model.pt")
86
+ id2tag_path = hf_hub_download(repo_id, "id2tag.json")
87
 
88
  # Load hyperparameters
89
  with open(config_path) as f:
90
  config = json.load(f)
91
+
92
  with open(word2id_path) as f:
93
  word2id = json.load(f)
94
+
95
  with open(id2tag_path) as f:
96
  id2tag = {int(k): v for k, v in json.load(f).items()}
97
 
98
  # Instantiate model and load weights
99
+ model = BiLSTMCRF(
100
+ vocab_size=len(word2id),
101
+ emb_dim=config['embedding_dim'],
102
+ hid_dim=config['hidden_dim'],
103
+ num_tags=config['num_labels'],
104
+ dropout=config.get('dropout', 0.5)
105
+ )
106
  model.load_state_dict(torch.load(weights_path, map_location='cpu'), strict=False)
107
  model.eval()
108
 
109
+ def predict(tokens, max_len=128):
110
  ids = [word2id.get(w, word2id['<UNK>']) for w in tokens]
111
  mask = [1] * len(ids)
112
  orig_len = len(ids)
113
+
114
+ if len(ids) > max_len:
115
+ ids = ids[:max_len]
116
+ mask = mask[:max_len]
117
+ tokens = tokens[:max_len]
118
  else:
119
+ ids += [0] * (max_len - len(ids))
120
+ mask += [0] * (max_len - len(mask))
121
+
122
+ input_ids = torch.tensor([ids], dtype=torch.long)
123
+ mask_tensor = torch.tensor([mask], dtype=torch.long)
124
+
125
  with torch.no_grad():
126
  preds = model(input_ids, mask_tensor)[0]
127
+
128
  preds = preds[:orig_len]
129
  return [id2tag[p] for p in preds]
130
 
131
  # Example
132
  tokens = ["Татар", "теле", "бик", "бай", "."]
133
  tags = predict(tokens)
134
+ for token, tag in zip(tokens, tags):
135
+ print(f"{token} -> {tag}")
136
  ```
137
 
138
  Expected output: