adjaysagar committed on
Commit
81d66fc
·
verified ·
1 Parent(s): afd3b69

Upload Indian Address NER model (checkpoint-20793)

Browse files
README.md ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🏠 TinyBERT Indian Address NER Model
2
+
3
+ This model is a fine-tuned **TinyBERT** for **Named Entity Recognition (NER)** on Indian addresses. It can extract and classify various address components from Indian address text with high accuracy, leveraging TinyBERT's efficient and lightweight architecture.
4
+
5
+ ## 🎯 Model Description
6
+
7
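+ A 6-layer TinyBERT (`huawei-noah/TinyBERT_General_6L_768D`) fine-tuned for token-level Named Entity Recognition (NER) on Indian addresses, using 23 BIO labels that cover 11 address entity types.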
8
+
9
+ ### Key Capabilities
10
+
11
+ - **Address Component Extraction**: Identify and classify various parts of Indian addresses
12
+ - **Multi-format Support**: Handle various Indian address formats and styles
13
+ - **Lightweight Architecture**: Built on TinyBERT's efficient transformer design
14
+ - **High Accuracy**: Fine-tuned on augmented Indian address dataset
15
+ - **Fast Inference**: Optimized TinyBERT for quick entity extraction
16
+ - **Robust Recognition**: Handles partial, incomplete, or informal addresses
17
+ - **Efficient Processing**: TinyBERT's compact design for better performance
18
+ - **Mobile-Friendly**: Smaller model size suitable for edge deployment
19
+ - **Resource Efficient**: Lower memory and computational requirements
20
+
21
+ ## 📊 Model Architecture
22
+
23
+ - **Base Model**: huawei-noah/TinyBERT_General_6L_768D (TinyBERT)
24
+ - **Model Type**: Token Classification (NER)
25
+ - **Vocabulary Size**: 30,522 tokens
26
+ - **Hidden Size**: 768
27
+ - **Number of Layers**: 6
28
+ - **Attention Heads**: 12
29
+ - **Max Sequence Length**: 512 tokens (position-embedding limit; fine-tuning and the usage example below use a 128-token context)
30
+ - **Number of Labels**: 23
31
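+ - **Model Size**: ~266MB of weights (`model.safetensors`); about 797MB when the training checkpoint's `optimizer.pt` is included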
32
+ - **Checkpoint**: 20793
33
+
34
+ ## 🚀 Usage Examples
35
+
36
+ ```python
37
+ import torch
38
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
39
+ import warnings
40
+ warnings.filterwarnings("ignore")
41
+
42
+ class IndianAddressNER:
43
+ def __init__(self):
44
+ model_name = "shiprocket-ai/open-tinybert-indian-address-ner"
45
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
46
+ self.model = AutoModelForTokenClassification.from_pretrained(model_name)
47
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
48
+ self.model.to(self.device)
49
+ self.model.eval()
50
+
51
+ # Entity mappings
52
+ self.id2entity = {
53
+ "0": "O",
54
+ "1": "B-building_name",
55
+ "2": "I-building_name",
56
+ "3": "B-city",
57
+ "4": "I-city",
58
+ "5": "B-country",
59
+ "6": "I-country",
60
+ "7": "B-floor",
61
+ "8": "I-floor",
62
+ "9": "B-house_details",
63
+ "10": "I-house_details",
64
+ "11": "B-locality",
65
+ "12": "I-locality",
66
+ "13": "B-pincode",
67
+ "14": "I-pincode",
68
+ "15": "B-road",
69
+ "16": "I-road",
70
+ "17": "B-state",
71
+ "18": "I-state",
72
+ "19": "B-sub_locality",
73
+ "20": "I-sub_locality",
74
+ "21": "B-landmarks",
75
+ "22": "I-landmarks"
76
+ }
77
+
78
+ def predict(self, address):
79
+ """Extract entities from an Indian address, grouped by entity type."""
80
+ if not address.strip():
81
+ return {}
82
+
83
+ # Tokenize with offset mapping for better text reconstruction
84
+ inputs = self.tokenizer(
85
+ address,
86
+ return_tensors="pt",
87
+ truncation=True,
88
+ padding=True,
89
+ max_length=128,
90
+ return_offsets_mapping=True
91
+ )
92
+
93
+ # Extract offset mapping before moving to device
94
+ offset_mapping = inputs.pop("offset_mapping")[0]
95
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
96
+
97
+ # Predict
98
+ with torch.no_grad():
99
+ outputs = self.model(**inputs)
100
+ predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
101
+ predicted_ids = torch.argmax(predictions, dim=-1)
102
+ confidence_scores = torch.max(predictions, dim=-1)[0]
103
+
104
+ # Extract entities using offset mapping
105
+ entities = self.extract_entities_with_offsets(
106
+ address,
107
+ predicted_ids[0],
108
+ confidence_scores[0],
109
+ offset_mapping
110
+ )
111
+
112
+ return entities
113
+
114
+ def extract_entities_with_offsets(self, original_text, predicted_ids, confidences, offset_mapping):
115
+ """Extract entities using offset mapping for accurate text reconstruction"""
116
+ entities = {}
117
+ current_entity = None
118
+
119
+ for i, (pred_id, conf) in enumerate(zip(predicted_ids, confidences)):
120
+ if i >= len(offset_mapping):
121
+ break
122
+
123
+ start, end = offset_mapping[i]
124
+
125
+ # Skip special tokens (they have (0,0) mapping)
126
+ if start == end == 0:
127
+ continue
128
+
129
+ label = self.id2entity.get(str(pred_id.item()), "O")
130
+
131
+ if label.startswith("B-"):
132
+ # Save previous entity
133
+ if current_entity:
134
+ entity_type = current_entity["type"]
135
+ if entity_type not in entities:
136
+ entities[entity_type] = []
137
+ entities[entity_type].append({
138
+ "text": current_entity["text"],
139
+ "confidence": current_entity["confidence"]
140
+ })
141
+
142
+ # Start new entity
143
+ entity_type = label[2:] # Remove "B-"
144
+ current_entity = {
145
+ "type": entity_type,
146
+ "text": original_text[start:end],
147
+ "confidence": conf.item(),
148
+ "start": start,
149
+ "end": end
150
+ }
151
+
152
+ elif label.startswith("I-") and current_entity:
153
+ # Continue current entity
154
+ entity_type = label[2:] # Remove "I-"
155
+ if entity_type == current_entity["type"]:
156
+ # Extend the entity to include this token
157
+ current_entity["text"] = original_text[current_entity["start"]:end]
158
+ current_entity["confidence"] = (current_entity["confidence"] + conf.item()) / 2
159
+ current_entity["end"] = end
160
+
161
+ elif label == "O" and current_entity:
162
+ # End current entity
163
+ entity_type = current_entity["type"]
164
+ if entity_type not in entities:
165
+ entities[entity_type] = []
166
+ entities[entity_type].append({
167
+ "text": current_entity["text"],
168
+ "confidence": current_entity["confidence"]
169
+ })
170
+ current_entity = None
171
+
172
+ # Add final entity if exists
173
+ if current_entity:
174
+ entity_type = current_entity["type"]
175
+ if entity_type not in entities:
176
+ entities[entity_type] = []
177
+ entities[entity_type].append({
178
+ "text": current_entity["text"],
179
+ "confidence": current_entity["confidence"]
180
+ })
181
+
182
+ return entities
183
+
184
+ # Usage example
185
+ ner = IndianAddressNER()
186
+
187
+ # Test addresses
188
+ test_addresses = [
189
+ "Shop No 123, Sunshine Apartments, Andheri West, Mumbai, 400058",
190
+ "DLF Cyber City, Sector 25, Gurgaon, Haryana",
191
+ "Flat 201, MG Road, Bangalore, Karnataka, 560001",
192
+ "Phoenix Mall, Kurla West, Mumbai"
193
+ ]
194
+
195
+ print("🏠 INDIAN ADDRESS NER EXAMPLES")
196
+ print("=" * 50)
197
+
198
+ for address in test_addresses:
199
+ print(f"\n📍 Address: {address}")
200
+ entities = ner.predict(address)
201
+
202
+ if entities:
203
+ for entity_type, entity_list in sorted(entities.items()):
204
+ print(f"🏷️ {entity_type.replace('_', ' ').title()}:")
205
+ for entity in entity_list:
206
+ confidence = entity['confidence']
207
+ text = entity['text']
208
+ confidence_icon = "🟢" if confidence > 0.8 else "🟡" if confidence > 0.6 else "🔴"
209
+ print(f" {confidence_icon} {text} (confidence: {confidence:.3f})")
210
+ else:
211
+ print("❌ No entities found")
212
+ print("-" * 40)
213
+ ```
214
+
215
+ ## 🏷️ Supported Entity Types
216
+
217
+ The model can identify and extract the following address components:
218
+
219
+ - **Building Name**: building_name
220
+ - **City**: city
221
+ - **Country**: country
222
+ - **Floor**: floor
223
+ - **House Details**: house_details
224
+ - **Landmarks**: landmarks
225
+ - **Locality**: locality
226
+ - **Pincode**: pincode
227
+ - **Road**: road
228
+ - **State**: state
229
+ - **Sub Locality**: sub_locality
230
+
231
+ ## 📈 Performance Highlights
232
+
233
+ - **Indian Address Optimized**: Specialized for Indian address patterns and formats
234
+ - **TinyBERT Advantage**: Efficient and lightweight transformer architecture
235
+ - **High Precision**: Accurate entity boundary detection (evaluation at epoch 3: precision 0.947, recall 0.948, F1 0.947)
236
+ - **Multi-component Recognition**: Identifies multiple entities in complex addresses
237
+ - **Confidence Scoring**: Provides confidence scores for each extracted entity
238
+ - **Fast Inference**: Optimized for real-time applications
239
+ - **Robust Handling**: Works with partial or informal address formats
240
+ - **Compact Architecture**: TinyBERT's efficient design for deployment
241
+ - **Resource Friendly**: Lower computational requirements
242
+
243
+ ## 🔧 Training Details
244
+
245
+ - **Dataset**: 300% augmented Indian address dataset
246
+ - **Training Strategy**: Fine-tuned from pre-trained TinyBERT
247
+ - **Specialization**: Indian address entity extraction
248
+ - **Context Length**: 128 tokens
249
+ - **Version**: v1.0
250
+ - **Framework**: PyTorch + Transformers
251
+ - **BIO Tagging**: Uses Begin-Inside-Outside tagging scheme
252
+ - **Base Model Advantage**: TinyBERT's efficient architecture and compact size
253
+
254
+ ## 💡 Use Cases
255
+
256
+ ### 1. **Address Parsing & Standardization**
257
+ - Parse unstructured address text into components
258
+ - Standardize address formats for databases
259
+ - Extract specific components for validation
260
+
261
+ ### 2. **Form Auto-completion**
262
+ - Auto-fill address forms by extracting components
263
+ - Validate address field completeness
264
+ - Suggest corrections for incomplete addresses
265
+
266
+ ### 3. **Data Processing & Migration**
267
+ - Clean legacy address databases
268
+ - Extract structured data from unstructured text
269
+ - Migrate addresses between different systems
270
+
271
+ ### 4. **Logistics & Delivery**
272
+ - Extract delivery-relevant components
273
+ - Validate address completeness for shipping
274
+ - Improve address accuracy for last-mile delivery
275
+
276
+ ### 5. **Geocoding Preprocessing**
277
+ - Prepare addresses for geocoding APIs
278
+ - Extract location components for mapping
279
+ - Improve geocoding accuracy with clean components
280
+
281
+ ### 6. **Mobile & Edge Deployment**
282
+ - Deploy on mobile devices with limited resources
283
+ - Run inference on edge computing devices
284
+ - Integrate into lightweight applications
285
+
286
+ ## ⚡ Performance Tips
287
+
288
+ 1. **Input Length**: Keep addresses under 128 tokens; the usage example truncates longer inputs (`max_length=128`), so entities past that point are not extracted
289
+ 2. **Batch Processing**: Process multiple addresses in batches for efficiency
290
+ 3. **GPU Usage**: Use GPU for faster inference on large datasets
291
+ 4. **Confidence Filtering**: Filter results by confidence score for higher precision
292
+ 5. **Text Preprocessing**: Clean input text for better recognition
293
+ 6. **TinyBERT Advantage**: Model benefits from efficient architecture optimizations
294
+ 7. **Edge Deployment**: Suitable for mobile and edge computing scenarios
295
+
296
+ ## ⚠️ Limitations
297
+
298
+ - **Language Support**: Primarily optimized for Indian addresses written in English
299
+ - **Regional Variations**: May struggle with highly regional or colloquial formats
300
+ - **New Localities**: Performance may vary on very recent developments
301
+ - **Complex Formatting**: May have difficulty with highly unstructured text
302
+ - **Context Dependency**: Works best with clear address context
303
+
304
+ ## 📋 Entity Mapping
305
+
306
+ The model uses BIO (Begin-Inside-Outside) tagging scheme:
307
+
308
+ ```json
309
+ {
310
+ "entity2id": {
311
+ "O": 0,
312
+ "B-building_name": 1,
313
+ "I-building_name": 2,
314
+ "B-city": 3,
315
+ "I-city": 4,
316
+ "B-country": 5,
317
+ "I-country": 6,
318
+ "B-floor": 7,
319
+ "I-floor": 8,
320
+ "B-house_details": 9,
321
+ "I-house_details": 10,
322
+ "B-locality": 11,
323
+ "I-locality": 12,
324
+ "B-pincode": 13,
325
+ "I-pincode": 14,
326
+ "B-road": 15,
327
+ "I-road": 16,
328
+ "B-state": 17,
329
+ "I-state": 18,
330
+ "B-sub_locality": 19,
331
+ "I-sub_locality": 20,
332
+ "B-landmarks": 21,
333
+ "I-landmarks": 22
334
+ },
335
+ "id2entity": {
336
+ "0": "O",
337
+ "1": "B-building_name",
338
+ "2": "I-building_name",
339
+ "3": "B-city",
340
+ "4": "I-city",
341
+ "5": "B-country",
342
+ "6": "I-country",
343
+ "7": "B-floor",
344
+ "8": "I-floor",
345
+ "9": "B-house_details",
346
+ "10": "I-house_details",
347
+ "11": "B-locality",
348
+ "12": "I-locality",
349
+ "13": "B-pincode",
350
+ "14": "I-pincode",
351
+ "15": "B-road",
352
+ "16": "I-road",
353
+ "17": "B-state",
354
+ "18": "I-state",
355
+ "19": "B-sub_locality",
356
+ "20": "I-sub_locality",
357
+ "21": "B-landmarks",
358
+ "22": "I-landmarks"
359
+ }
360
+ }
361
+ ```
362
+
363
+ ## 📋 Model Files
364
+
365
+ - `config.json`: Model configuration and hyperparameters
366
+ - `pytorch_model.bin` / `model.safetensors`: Model weights
367
+ - `tokenizer.json`: Tokenizer configuration
368
+ - `tokenizer_config.json`: Tokenizer settings
369
+ - `vocab.txt`: Vocabulary file
370
+ - `entity_mappings.json`: Entity type mappings
371
+
372
+ ## 🔄 Model Updates
373
+
374
+ - **Version**: v1.0 (Checkpoint 20793)
375
+ - **Last Updated**: 2025-06-19
376
+ - **Training Completion**: Based on augmented Indian address dataset
377
+ - **Base Model**: TinyBERT for efficient transformer architecture
378
+
379
+ ## 📚 Citation
380
+
381
+ If you use this model in your research or applications, please cite:
382
+
383
+ ```bibtex
384
+ @misc{open-tinybert-indian-address-ner,
385
+ title={TinyBERT Indian Address NER Model},
386
+ year={2025},
387
+ publisher={Hugging Face},
388
+ url={https://huggingface.co/shiprocket-ai/open-tinybert-indian-address-ner}
389
+ }
390
+ ```
391
+
392
+ ## 📞 Support & Contact
393
+
394
+ For questions, issues, or feature requests:
395
+ - Open an issue in this repository
396
+ - Contact: shiprocket-ai team
397
+ - Documentation: See usage examples above
398
+
399
+ ## 📜 License
400
+
401
+ This model is released under the Apache 2.0 License. See LICENSE file for details.
402
+
403
+ ---
404
+
405
+ *Specialized for Indian address entity recognition - Built with ❤️ by shiprocket-ai team using TinyBERT*
config.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForTokenClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "cell": {},
7
+ "classifier_dropout": null,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "O",
13
+ "1": "B-building_name",
14
+ "2": "I-building_name",
15
+ "3": "B-city",
16
+ "4": "I-city",
17
+ "5": "B-country",
18
+ "6": "I-country",
19
+ "7": "B-floor",
20
+ "8": "I-floor",
21
+ "9": "B-house_details",
22
+ "10": "I-house_details",
23
+ "11": "B-locality",
24
+ "12": "I-locality",
25
+ "13": "B-pincode",
26
+ "14": "I-pincode",
27
+ "15": "B-road",
28
+ "16": "I-road",
29
+ "17": "B-state",
30
+ "18": "I-state",
31
+ "19": "B-sub_locality",
32
+ "20": "I-sub_locality",
33
+ "21": "B-landmarks",
34
+ "22": "I-landmarks"
35
+ },
36
+ "initializer_range": 0.02,
37
+ "intermediate_size": 3072,
38
+ "label2id": {
39
+ "B-building_name": 1,
40
+ "B-city": 3,
41
+ "B-country": 5,
42
+ "B-floor": 7,
43
+ "B-house_details": 9,
44
+ "B-landmarks": 21,
45
+ "B-locality": 11,
46
+ "B-pincode": 13,
47
+ "B-road": 15,
48
+ "B-state": 17,
49
+ "B-sub_locality": 19,
50
+ "I-building_name": 2,
51
+ "I-city": 4,
52
+ "I-country": 6,
53
+ "I-floor": 8,
54
+ "I-house_details": 10,
55
+ "I-landmarks": 22,
56
+ "I-locality": 12,
57
+ "I-pincode": 14,
58
+ "I-road": 16,
59
+ "I-state": 18,
60
+ "I-sub_locality": 20,
61
+ "O": 0
62
+ },
63
+ "layer_norm_eps": 1e-12,
64
+ "max_position_embeddings": 512,
65
+ "model_type": "bert",
66
+ "num_attention_heads": 12,
67
+ "num_hidden_layers": 6,
68
+ "pad_token_id": 0,
69
+ "position_embedding_type": "absolute",
70
+ "pre_trained": "",
71
+ "structure": [],
72
+ "torch_dtype": "float32",
73
+ "transformers_version": "4.52.4",
74
+ "type_vocab_size": 2,
75
+ "use_cache": true,
76
+ "vocab_size": 30522
77
+ }
entity_mappings.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "entity2id": {
3
+ "O": 0,
4
+ "B-building_name": 1,
5
+ "I-building_name": 2,
6
+ "B-city": 3,
7
+ "I-city": 4,
8
+ "B-country": 5,
9
+ "I-country": 6,
10
+ "B-floor": 7,
11
+ "I-floor": 8,
12
+ "B-house_details": 9,
13
+ "I-house_details": 10,
14
+ "B-locality": 11,
15
+ "I-locality": 12,
16
+ "B-pincode": 13,
17
+ "I-pincode": 14,
18
+ "B-road": 15,
19
+ "I-road": 16,
20
+ "B-state": 17,
21
+ "I-state": 18,
22
+ "B-sub_locality": 19,
23
+ "I-sub_locality": 20,
24
+ "B-landmarks": 21,
25
+ "I-landmarks": 22
26
+ },
27
+ "id2entity": {
28
+ "0": "O",
29
+ "1": "B-building_name",
30
+ "2": "I-building_name",
31
+ "3": "B-city",
32
+ "4": "I-city",
33
+ "5": "B-country",
34
+ "6": "I-country",
35
+ "7": "B-floor",
36
+ "8": "I-floor",
37
+ "9": "B-house_details",
38
+ "10": "I-house_details",
39
+ "11": "B-locality",
40
+ "12": "I-locality",
41
+ "13": "B-pincode",
42
+ "14": "I-pincode",
43
+ "15": "B-road",
44
+ "16": "I-road",
45
+ "17": "B-state",
46
+ "18": "I-state",
47
+ "19": "B-sub_locality",
48
+ "20": "I-sub_locality",
49
+ "21": "B-landmarks",
50
+ "22": "I-landmarks"
51
+ }
52
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:822e8199f7caba4f6fa7ba38f0e006fc035c4a014acd1ec87d6c79f2ab185b4e
3
+ size 265540428
model_card_metadata.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": [
3
+ "en"
4
+ ],
5
+ "tags": [
6
+ "token-classification",
7
+ "ner",
8
+ "indian-addresses",
9
+ "address-parsing",
10
+ "tinybert",
11
+ "entity-extraction",
12
+ "address-components",
13
+ "indian-postal",
14
+ "location-extraction",
15
+ "lightweight-model"
16
+ ],
17
+ "datasets": [
18
+ "custom-indian-addresses"
19
+ ],
20
+ "metrics": [
21
+ "precision",
22
+ "recall",
23
+ "f1"
24
+ ],
25
+ "model_type": "bert",
26
+ "base_model": "huawei-noah/TinyBERT_General_6L_768D",
27
+ "pipeline_tag": "token-classification",
28
+ "widget": [
29
+ {
30
+ "text": "Shop No 123, Sunshine Apartments, Andheri West, Mumbai, 400058",
31
+ "example_title": "Complete Address"
32
+ },
33
+ {
34
+ "text": "DLF Cyber City, Sector 25, Gurgaon, Haryana",
35
+ "example_title": "Commercial Address"
36
+ },
37
+ {
38
+ "text": "Flat 201, MG Road, Bangalore, Karnataka, 560001",
39
+ "example_title": "Residential Address"
40
+ }
41
+ ]
42
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d10333f7f6f93e1967e6631f8b5e26dda56023e33c53336750a8d5c7020f684
3
+ size 531143627
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a30f1e62dce4b06fe3b4ad6f17ae7ba4b40aa140dd38d5e09c0e9ea9e316f46
3
+ size 14709
scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0156ea329dd483385c86313a0b0d09c7050936b4ce8a9ed2d386c43d99df3669
3
+ size 1383
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ff3b4cc5efe2ab5b80c2cc4cbcc46ba5f5e5c4afbf9f7ef95071a6e348ef51d
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "extra_special_tokens": {},
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 1000000000000000019884624838656,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": null,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
trainer_state.json ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 20793,
3
+ "best_metric": 0.9469405010418878,
4
+ "best_model_checkpoint": "./ner_output_tinybert6L/combined_300percent_TinyBERT_General_6L_768D_20250619_074923/checkpoints/checkpoint-20793",
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 20793,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.24989179050642044,
14
+ "grad_norm": 0.6586679816246033,
15
+ "learning_rate": 4.5837541480305874e-05,
16
+ "loss": 0.3426,
17
+ "step": 1732
18
+ },
19
+ {
20
+ "epoch": 0.49978358101284087,
21
+ "grad_norm": 0.5292770266532898,
22
+ "learning_rate": 4.1672678305198864e-05,
23
+ "loss": 0.2178,
24
+ "step": 3464
25
+ },
26
+ {
27
+ "epoch": 0.7496753715192613,
28
+ "grad_norm": 1.1701136827468872,
29
+ "learning_rate": 3.751262444091762e-05,
30
+ "loss": 0.1982,
31
+ "step": 5196
32
+ },
33
+ {
34
+ "epoch": 0.9995671620256817,
35
+ "grad_norm": 3.1348302364349365,
36
+ "learning_rate": 3.3347761265810614e-05,
37
+ "loss": 0.1881,
38
+ "step": 6928
39
+ },
40
+ {
41
+ "epoch": 1.0,
42
+ "eval_accuracy": 0.9393524410059049,
43
+ "eval_f1": 0.9379816398725478,
44
+ "eval_loss": 0.17937302589416504,
45
+ "eval_precision": 0.9389823871991911,
46
+ "eval_recall": 0.9393524410059049,
47
+ "eval_runtime": 26.2651,
48
+ "eval_samples_per_second": 745.056,
49
+ "eval_steps_per_second": 46.602,
50
+ "step": 6931
51
+ },
52
+ {
53
+ "epoch": 1.2494589525321023,
54
+ "grad_norm": 3.0484063625335693,
55
+ "learning_rate": 2.9182898090703604e-05,
56
+ "loss": 0.1595,
57
+ "step": 8660
58
+ },
59
+ {
60
+ "epoch": 1.4993507430385224,
61
+ "grad_norm": 0.3237079679965973,
62
+ "learning_rate": 2.5020439571009476e-05,
63
+ "loss": 0.1524,
64
+ "step": 10392
65
+ },
66
+ {
67
+ "epoch": 1.749242533544943,
68
+ "grad_norm": 1.4062304496765137,
69
+ "learning_rate": 2.0857981051315347e-05,
70
+ "loss": 0.1491,
71
+ "step": 12124
72
+ },
73
+ {
74
+ "epoch": 1.9991343240513635,
75
+ "grad_norm": 3.2153682708740234,
76
+ "learning_rate": 1.6693117876208337e-05,
77
+ "loss": 0.1402,
78
+ "step": 13856
79
+ },
80
+ {
81
+ "epoch": 2.0,
82
+ "eval_accuracy": 0.9458694225866252,
83
+ "eval_f1": 0.9456418383157583,
84
+ "eval_loss": 0.15713337063789368,
85
+ "eval_precision": 0.9463121752269362,
86
+ "eval_recall": 0.9458694225866252,
87
+ "eval_runtime": 27.0546,
88
+ "eval_samples_per_second": 723.316,
89
+ "eval_steps_per_second": 45.242,
90
+ "step": 13862
91
+ },
92
+ {
93
+ "epoch": 2.249026114557784,
94
+ "grad_norm": 0.006557302549481392,
95
+ "learning_rate": 1.2528254701101333e-05,
96
+ "loss": 0.115,
97
+ "step": 15588
98
+ },
99
+ {
100
+ "epoch": 2.4989179050642045,
101
+ "grad_norm": 1.2286018133163452,
102
+ "learning_rate": 8.365796181407206e-06,
103
+ "loss": 0.1167,
104
+ "step": 17320
105
+ },
106
+ {
107
+ "epoch": 2.7488096955706247,
108
+ "grad_norm": 0.032648004591464996,
109
+ "learning_rate": 4.200933006300197e-06,
110
+ "loss": 0.1112,
111
+ "step": 19052
112
+ },
113
+ {
114
+ "epoch": 2.998701486077045,
115
+ "grad_norm": 4.031618118286133,
116
+ "learning_rate": 3.847448660606935e-08,
117
+ "loss": 0.1071,
118
+ "step": 20784
119
+ },
120
+ {
121
+ "epoch": 3.0,
122
+ "eval_accuracy": 0.9477408531081158,
123
+ "eval_f1": 0.9469405010418878,
124
+ "eval_loss": 0.15903286635875702,
125
+ "eval_precision": 0.946831252072933,
126
+ "eval_recall": 0.9477408531081158,
127
+ "eval_runtime": 26.9292,
128
+ "eval_samples_per_second": 726.682,
129
+ "eval_steps_per_second": 45.452,
130
+ "step": 20793
131
+ }
132
+ ],
133
+ "logging_steps": 1732,
134
+ "max_steps": 20793,
135
+ "num_input_tokens_seen": 0,
136
+ "num_train_epochs": 3,
137
+ "save_steps": 500,
138
+ "stateful_callbacks": {
139
+ "EarlyStoppingCallback": {
140
+ "args": {
141
+ "early_stopping_patience": 3,
142
+ "early_stopping_threshold": 0.001
143
+ },
144
+ "attributes": {
145
+ "early_stopping_patience_counter": 0
146
+ }
147
+ },
148
+ "TrainerControl": {
149
+ "args": {
150
+ "should_epoch_stop": false,
151
+ "should_evaluate": false,
152
+ "should_log": false,
153
+ "should_save": true,
154
+ "should_training_stop": true
155
+ },
156
+ "attributes": {}
157
+ }
158
+ },
159
+ "total_flos": 1.0869727318769664e+16,
160
+ "train_batch_size": 16,
161
+ "trial_name": null,
162
+ "trial_params": null
163
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:948193f13be807bc54705ba1e696855c62505a8c02d93728604bddb5d56f1c98
3
+ size 5841
vocab.txt ADDED
The diff for this file is too large to render. See raw diff