feliponi committed on
Commit 5fe4ae9 · verified · 1 Parent(s): b356bc1

Upload folder using huggingface_hub

Files changed (7)
  1. .gitattributes +1 -0
  2. README.md +42 -16
  3. config.json +24 -11
  4. model.safetensors +2 -2
  5. test_metrics.json +22 -13
  6. tokenizer.json +0 -0
  7. tokenizer_config.json +6 -8
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,3 @@
- ---
- license: mit
- ---
  language: en
  license: apache-2.0
  library_name: transformers
@@ -37,8 +34,7 @@ You can use this model directly with the `token-classification` (or `ner`) pipeline
  from transformers import pipeline

  # Load the model from the Hub
- # (Replace with your actual model ID, e.g., "your-username/hirly-ner-multi")
- model_id = "your-username/hirly-ner-multi"
+ model_id = "feliponi/hirly-ner-multi"

  # Initialize the pipeline
  # aggregation_strategy="simple" groups B- and I- tags (e.g., B-SKILL, I-SKILL -> SKILL)
@@ -49,7 +45,20 @@ extractor = pipeline(
  )

  # Example text
- text = "Data Scientist with 5+ years of experience in Python and machine learning. Also 6 months in Java."
+ text = """
+ Data Scientist with 5+ years of experience in Python and machine learning.
+ Also 6 months in Java.
+
+ Soft skills:
+ inclusive leadership
+ paradigm thinking
+ performance optimization
+ personal initiative
+
+ english language proficiency
+ portuguese language proficiency
+
+ AWS Certified Solutions Architect - Associate"""

  # Get entities
  entities = extractor(text)
@@ -66,11 +75,20 @@ for entity in confident_entities:
  **Expected Output:**

  ````
- [EXPERIENCE_DURATION] 5+ years (Confidence: 1.00)
- [SKILL] Python (Confidence: 0.99)
- [SKILL] machine learning (Confidence: 1.00)
- [EXPERIENCE_DURATION] 6 months (Confidence: 1.00)
- [SKILL] Java (Confidence: 0.99)
+ [{'entity_group': 'SKILL', 'score': np.float32(0.9340167), 'word': 'Data Scientist', 'start': 1, 'end': 15},
+ {'entity_group': 'EXPERIENCE_DURATION', 'score': np.float32(0.9998663), 'word': ' 5+ years', 'start': 21, 'end': 29},
+ {'entity_group': 'SKILL', 'score': np.float32(0.99859816), 'word': ' Python', 'start': 47, 'end': 53},
+ {'entity_group': 'SKILL', 'score': np.float32(0.9998181), 'word': ' machine learning', 'start': 58, 'end': 74},
+ {'entity_group': 'EXPERIENCE_DURATION', 'score': np.float32(0.9998392), 'word': ' 6 months', 'start': 81, 'end': 89},
+ {'entity_group': 'SKILL', 'score': np.float32(0.9982002), 'word': ' Java', 'start': 93, 'end': 97},
+ {'entity_group': 'SOFT_SKILL', 'score': np.float32(0.995745), 'word': ' leadership', 'start': 124, 'end': 134},
+ {'entity_group': 'SOFT_SKILL', 'score': np.float32(0.9859735), 'word': 'performance optimization', 'start': 153, 'end': 177},
+ {'entity_group': 'SOFT_SKILL', 'score': np.float32(0.98516375), 'word': 'personal initiative', 'start': 178, 'end': 197},
+ {'entity_group': 'LANG', 'score': np.float32(0.96456385), 'word': 'english language proficiency', 'start': 199, 'end': 227},
+ {'entity_group': 'LANG', 'score': np.float32(0.9288162), 'word': 'portuguese language proficiency', 'start': 228, 'end': 259},
+ {'entity_group': 'SKILL', 'score': np.float32(0.926032), 'word': 'AWS', 'start': 261, 'end': 264},
+ {'entity_group': 'SOFT_SKILL', 'score': np.float32(0.9559879), 'word': ' Solutions', 'start': 275, 'end': 284},
+ {'entity_group': 'SKILL', 'score': np.float32(0.84499276), 'word': ' Architect', 'start': 285, 'end': 294}]
  ````

  ## Training, Performance, and Limitations
@@ -83,15 +101,23 @@ The model was validated on a test set of ~2,000 examples, achieving the following

  | Entity | F1-Score |
  | :--- | :--- |
- | **`EXPERIENCE_DURATION`** | **99.9%** |
- | **`SKILL`** | **97.6%** |
- | **Overall** | **98.8%** |
+ | **`SKILLS`** | **98.9%** |
+ | **`LANG`** | **99.0%** |
+ | **`CERT`** | **84.9%** |
+ | **`SOFT_SKILL`** | **98.6%** |
+ | **`EXPERIENCE_DURATION`** | **99.8%** |
+ | **Overall** | **96.3%** |

  ### Training Methodology

- 1. **`EXPERIENCE_DURATION` (High Quality):** This entity was labeled using a robust set of regular expressions designed to find time patterns (e.g., "5+ years", "six months"). Its near-perfect F1 score reflects this.
-
- 2. **`SKILL` (High Recall, Lower Precision):** This entity was labeled by performing *exact matching* against a large, proprietary vocabulary of ~8,700 terms.
+ This model's performance is a direct result of its **Weak Labeling** training methodology. The labels were generated automatically, not manually annotated.
+
+ 1. **`EXPERIENCE_DURATION` (Pattern-Based):** This entity was labeled using a robust set of regular expressions designed to find time-based patterns (e.g., "5+ years", "six months", "3-5 anos"). Its near-perfect F1 score reflects the high precision of this regex approach.
+
+ 2. **`SKILL`, `SOFT_SKILL`, `LANG`, `CERT` (Vocabulary-Based):** These four entities were labeled by performing high-speed, *exact matching* against four separate vocabulary files (`skills.txt`, `softskills.txt`, `langskills.txt`, `certifications.txt`).

+ * **High Performance (`SKILL`, `SOFT_SKILL`, `LANG`):** The excellent F1 scores (98-99%) indicate that the vocabularies for these labels were comprehensive and matched the training texts frequently.
+ * **Good Performance (`CERT`):** The 84.9% F1 score is strong but shows room for improvement. This score suggests the `certifications.txt` vocabulary was less comprehensive. The model's performance for this label would be directly improved by adding more certification names (e.g., "AWS CSAA", "PMP", etc.) to the vocabulary file and retraining.
+
  ### Limitations (Important)
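The weak-labeling recipe in the README above (a regex pass for durations, then exact vocabulary matching for skills) can be sketched roughly as follows. The pattern and the tiny vocabulary are illustrative assumptions for a toy tokenizer, not the project's actual labeling scripts or the contents of `skills.txt`:

```python
import re

# Illustrative stand-ins: a simplified duration pattern and a 3-term vocabulary.
DURATION_RE = re.compile(r"\b\d+\+?\s*(?:years?|months?|anos)\b", re.IGNORECASE)
SKILL_VOCAB = {"python", "machine learning", "java"}  # stand-in for skills.txt

def weak_label(tokens):
    """Assign BIO tags: regex pass for EXPERIENCE_DURATION, then exact
    vocabulary matching (longest phrase first) for SKILL."""
    tags = ["O"] * len(tokens)
    text = " ".join(tokens)
    # Regex pass: locate each duration match back in the token list.
    for m in DURATION_RE.finditer(text):
        span = [t.lower() for t in m.group().split()]
        for i in range(len(tokens) - len(span) + 1):
            if [t.lower() for t in tokens[i:i + len(span)]] == span:
                tags[i:i + len(span)] = (["B-EXPERIENCE_DURATION"]
                                         + ["I-EXPERIENCE_DURATION"] * (len(span) - 1))
    # Vocabulary pass: try 2-grams before unigrams, skip already-tagged tokens.
    for n in (2, 1):
        for i in range(len(tokens) - n + 1):
            phrase = " ".join(tokens[i:i + n]).lower()
            if phrase in SKILL_VOCAB and all(t == "O" for t in tags[i:i + n]):
                tags[i:i + n] = ["B-SKILL"] + ["I-SKILL"] * (n - 1)
    return tags

print(weak_label("5+ years of Python and machine learning".split()))
# → ['B-EXPERIENCE_DURATION', 'I-EXPERIENCE_DURATION', 'O', 'B-SKILL', 'O', 'B-SKILL', 'I-SKILL']
```

Exact matching like this gives high recall on vocabulary terms but no disambiguation, which is consistent with the precision/recall trade-offs described in the methodology.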
 
config.json CHANGED
@@ -1,6 +1,6 @@
  {
    "architectures": [
-     "RobertaForTokenClassification"
+     "XLMRobertaForTokenClassification"
    ],
    "attention_probs_dropout_prob": 0.1,
    "bos_token_id": 0,
@@ -12,29 +12,42 @@
    "hidden_size": 768,
    "id2label": {
      "0": "O",
-     "1": "B-EXPERIENCE_DURATION",
-     "2": "I-EXPERIENCE_DURATION",
-     "3": "B-SKILL",
-     "4": "I-SKILL"
+     "1": "B-CERT",
+     "2": "I-CERT",
+     "3": "B-EXPERIENCE_DURATION",
+     "4": "I-EXPERIENCE_DURATION",
+     "5": "B-LANG",
+     "6": "I-LANG",
+     "7": "B-SKILL",
+     "8": "I-SKILL",
+     "9": "B-SOFT_SKILL",
+     "10": "I-SOFT_SKILL"
    },
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "label2id": {
-     "B-EXPERIENCE_DURATION": 1,
-     "B-SKILL": 3,
-     "I-EXPERIENCE_DURATION": 2,
-     "I-SKILL": 4,
+     "B-CERT": 1,
+     "B-EXPERIENCE_DURATION": 3,
+     "B-LANG": 5,
+     "B-SKILL": 7,
+     "B-SOFT_SKILL": 9,
+     "I-CERT": 2,
+     "I-EXPERIENCE_DURATION": 4,
+     "I-LANG": 6,
+     "I-SKILL": 8,
+     "I-SOFT_SKILL": 10,
      "O": 0
    },
    "layer_norm_eps": 1e-05,
    "max_position_embeddings": 514,
-   "model_type": "roberta",
+   "model_type": "xlm-roberta",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
+   "output_past": true,
    "pad_token_id": 1,
    "position_embedding_type": "absolute",
    "transformers_version": "4.57.1",
    "type_vocab_size": 1,
    "use_cache": true,
-   "vocab_size": 50265
+   "vocab_size": 250002
  }
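The expanded `id2label` map above is what the pipeline uses when grouping B-/I- predictions into entity spans. A minimal sketch of that grouping step (the effect of `aggregation_strategy="simple"`, ignoring subword merging and score averaging; keys written as ints for brevity):

```python
# id2label from the new config.json (keys as ints for brevity).
id2label = {
    0: "O",
    1: "B-CERT", 2: "I-CERT",
    3: "B-EXPERIENCE_DURATION", 4: "I-EXPERIENCE_DURATION",
    5: "B-LANG", 6: "I-LANG",
    7: "B-SKILL", 8: "I-SKILL",
    9: "B-SOFT_SKILL", 10: "I-SOFT_SKILL",
}

def group_bio(tokens, label_ids):
    """Merge consecutive B-X / I-X tokens into (entity_group, text) spans."""
    spans = []
    for token, lid in zip(tokens, label_ids):
        tag = id2label[lid]
        if tag.startswith("B-"):
            spans.append([tag[2:], [token]])
        elif tag.startswith("I-") and spans and spans[-1][0] == tag[2:]:
            spans[-1][1].append(token)
        # "O" (and orphan I- tags) simply end the current span.
    return [(group, " ".join(toks)) for group, toks in spans]

print(group_bio(["Python", "and", "machine", "learning"], [7, 0, 7, 8]))
# → [('SKILL', 'Python'), ('SKILL', 'machine learning')]
```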
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:509d0ec667aaea68af890e0f03fe55b16f5a278671d5c074c130c8f9558cad02
- size 496259468
+ oid sha256:c560f1474d8f841d665671ca9a61561b565a542a4047d3213f57c6b77cf4ed36
+ size 1109870108
test_metrics.json CHANGED
@@ -1,15 +1,24 @@
  {
-   "test_loss": 0.02206496149301529,
-   "test_precision": 0.9855069157852383,
-   "test_recall": 0.9891982048789573,
-   "test_f1": 0.9873462648969002,
-   "test_SKILL_precision": 0.974378359649466,
-   "test_SKILL_recall": 0.9787130658693776,
-   "test_SKILL_f1": 0.97653591590187,
-   "test_EXPERIENCE_DURATION_precision": 0.9966354719210107,
-   "test_EXPERIENCE_DURATION_recall": 0.9996833438885371,
-   "test_EXPERIENCE_DURATION_f1": 0.9981566138919303,
-   "test_runtime": 35.2236,
-   "test_samples_per_second": 56.723,
-   "test_steps_per_second": 7.098
+   "test_loss": 0.01223407406359911,
+   "test_precision": 0.9551368142202801,
+   "test_recall": 0.9799216081310224,
+   "test_f1": 0.9630685025439435,
+   "test_LANG_precision": 0.9819929629539341,
+   "test_LANG_recall": 1.0,
+   "test_LANG_f1": 0.9908397325566001,
+   "test_SKILL_precision": 0.9889449241719945,
+   "test_SKILL_recall": 0.9904749295912962,
+   "test_SKILL_f1": 0.9897093160099927,
+   "test_CERT_precision": 0.8260869565217391,
+   "test_CERT_recall": 0.9166666666666667,
+   "test_CERT_f1": 0.8492822966507176,
+   "test_SOFT_SKILL_precision": 0.9810121524462506,
+   "test_SOFT_SKILL_recall": 0.9926311347792305,
+   "test_SOFT_SKILL_f1": 0.9867712480908251,
+   "test_EXPERIENCE_DURATION_precision": 0.9976470750074813,
+   "test_EXPERIENCE_DURATION_recall": 0.9998353096179183,
+   "test_EXPERIENCE_DURATION_f1": 0.9987399194115818,
+   "test_runtime": 91.6905,
+   "test_samples_per_second": 43.603,
+   "test_steps_per_second": 5.453
  }
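Per-entity scores like those in `test_metrics.json` are typically computed by exact span matching between predicted and gold entities (seqeval-style). A toy sketch of that computation, not the project's actual evaluation code:

```python
from collections import defaultdict

def per_entity_f1(gold, pred):
    """gold/pred: sets of (label, start, end) spans; returns F1 per label."""
    stats = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})
    for span in pred:
        stats[span[0]]["tp" if span in gold else "fp"] += 1
    for span in gold:
        if span not in pred:
            stats[span[0]]["fn"] += 1
    result = {}
    for label, s in stats.items():
        p = s["tp"] / (s["tp"] + s["fp"]) if s["tp"] + s["fp"] else 0.0
        r = s["tp"] / (s["tp"] + s["fn"]) if s["tp"] + s["fn"] else 0.0
        result[label] = 2 * p * r / (p + r) if p + r else 0.0
    return result

# Toy data: one missed SKILL (false negative), one spurious CERT (false positive).
gold = {("SKILL", 0, 6), ("SKILL", 10, 14), ("CERT", 20, 40)}
pred = {("SKILL", 0, 6), ("CERT", 20, 40), ("CERT", 50, 60)}
print(per_entity_f1(gold, pred))
```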
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -4,7 +4,7 @@
    "0": {
      "content": "<s>",
      "lstrip": false,
-     "normalized": true,
+     "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
@@ -12,7 +12,7 @@
    "1": {
      "content": "<pad>",
      "lstrip": false,
-     "normalized": true,
+     "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
@@ -20,7 +20,7 @@
    "2": {
      "content": "</s>",
      "lstrip": false,
-     "normalized": true,
+     "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
@@ -28,12 +28,12 @@
    "3": {
      "content": "<unk>",
      "lstrip": false,
-     "normalized": true,
+     "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
-   "50264": {
+   "250001": {
      "content": "<mask>",
      "lstrip": true,
      "normalized": false,
@@ -46,13 +46,11 @@
    "clean_up_tokenization_spaces": false,
    "cls_token": "<s>",
    "eos_token": "</s>",
-   "errors": "replace",
    "extra_special_tokens": {},
    "mask_token": "<mask>",
    "model_max_length": 512,
    "pad_token": "<pad>",
    "sep_token": "</s>",
-   "tokenizer_class": "RobertaTokenizer",
-   "trim_offsets": true,
+   "tokenizer_class": "XLMRobertaTokenizer",
    "unk_token": "<unk>"
  }