Spaces:

CordwainerSmith
/

GolemPII

Sleeping

App Files Community

cordwainersmith committed on Oct 28, 2024

Commit

98a427a

1 Parent(s): 5584918

Add token

Browse files

Files changed (2) hide show

app.py +26 -19
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import streamlit as st
 from transformers import AutoTokenizer, AutoModelForTokenClassification
 import time
 import json
@@ -34,10 +35,10 @@ EXAMPLE_SENTENCES = [
 ]
 MODEL_DETAILS = {
-    "name": "GolemPII - Hebrew PII Detection Model CordwainerSmith/GolemPII-v7-full",
-    "description": "This on-premise PII model is designed to automatically identify and mask sensitive information (PII) within Hebrew text data. It has been trained to recognize a wide range of PII entities, including names, addresses, phone numbers, financial information, and more.",
-    "base_model": "microsoft/mdeberta-v3-base",
-    "training_data": "Custom Hebrew PII dataset (size not specified)",
     "detected_pii_entities": [
         "FIRST_NAME",
         "LAST_NAME",
@@ -52,13 +53,16 @@ MODEL_DETAILS = {
         "DATE",
         "POSTAL_CODE",
     ],
     "training_details": {
-        "Training epochs": "5",
-        "Batch size": "32",
-        "Learning rate": "5e-5",
-        "Weight decay": "0.01",
-        "Training speed": "~2.19 it/s",
-        "Total training time": "2:08:26",
     },
 }
@@ -66,13 +70,16 @@ MODEL_DETAILS = {
 class PIIMaskingModel:
     def __init__(self, model_name: str):
         self.model_name = model_name
-        hf_token = st.secrets["hf_token"]  # Retrieve the token from secrets
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_name, use_auth_token=hf_token
         )
         self.model = AutoModelForTokenClassification.from_pretrained(
             model_name, use_auth_token=hf_token
         )
     def process_text(
         self, text: str
@@ -83,23 +90,23 @@ class PIIMaskingModel:
             text,
             truncation=True,
             padding=False,
-            return_tensors="np",  # Return NumPy arrays for CPU
             return_offsets_mapping=True,
             add_special_tokens=True,
         )
-        input_ids = tokenized_inputs.input_ids
-        attention_mask = tokenized_inputs.attention_mask
         offset_mapping = tokenized_inputs["offset_mapping"][0].tolist()
         # Handle special tokens
         offset_mapping[0] = None  # <s> token
         offset_mapping[-1] = None  # </s> token
-        # No need for torch.no_grad() as we are not using gradients
-        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
-        predictions = outputs.logits.argmax(dim=-1)  # No need to move to CPU
         predicted_labels = [
             self.model.config.id2label[label_id] for label_id in predictions[0]
         ]
@@ -139,7 +146,7 @@ class PIIMaskingModel:
             next_label = labels[j]
             # Stop if we hit a new B- tag (except for non-spaced tokens)
-            if next_label.startswith("B-") and tokens[j].startswith(" "):
                 break
             # Stop if we hit a different entity type in I- tags
@@ -151,7 +158,7 @@ class PIIMaskingModel:
                 last_valid_end = offset_mapping[j][1]
                 j += 1
             # Continue if it's a non-spaced B- token
-            elif next_label.startswith("B-") and not tokens[j].startswith(" "):
                 last_valid_end = offset_mapping[j][1]
                 j += 1
             else:

 import streamlit as st
+import torch
 from transformers import AutoTokenizer, AutoModelForTokenClassification
 import time
 import json
 ]
 MODEL_DETAILS = {
+    "name": "GolemPII-xlm-roberta-v1 - Hebrew PII Detection Model",
+    "description": "This model is specifically designed to identify and categorize Personally Identifiable Information (PII) within Hebrew text. It leverages the powerful XLM-RoBERTa base, fine-tuned with a curated Hebrew PII dataset, making it adept at token classification tasks tailored for Hebrew.",
+    "base_model": "xlm-roberta-base",
+    "training_data": "Custom Hebrew PII dataset",
     "detected_pii_entities": [
         "FIRST_NAME",
         "LAST_NAME",
         "DATE",
         "POSTAL_CODE",
     ],
+    "performance_metrics": {
+        "Loss": 0.000729,
+        "Precision": 0.9982,
+        "Recall": 0.9982,
+        "F1-Score": 0.9982,
+        "Accuracy": 0.999795,
+    },
     "training_details": {
+        "Training language": "Hebrew",
+        # Add other relevant training details if available
     },
 }
 class PIIMaskingModel:
     def __init__(self, model_name: str):
         self.model_name = model_name
+        hf_token = st.secrets["hf_token"]
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_name, use_auth_token=hf_token
         )
         self.model = AutoModelForTokenClassification.from_pretrained(
             model_name, use_auth_token=hf_token
         )
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model.to(self.device)
+        self.model.eval()
     def process_text(
         self, text: str
             text,
             truncation=True,
             padding=False,
+            return_tensors="pt",
             return_offsets_mapping=True,
             add_special_tokens=True,
         )
+        input_ids = tokenized_inputs.input_ids.to(self.device)
+        attention_mask = tokenized_inputs.attention_mask.to(self.device)
         offset_mapping = tokenized_inputs["offset_mapping"][0].tolist()
         # Handle special tokens
         offset_mapping[0] = None  # <s> token
         offset_mapping[-1] = None  # </s> token
+        with torch.no_grad():
+            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
+        predictions = outputs.logits.argmax(dim=-1).cpu().numpy()
         predicted_labels = [
             self.model.config.id2label[label_id] for label_id in predictions[0]
         ]
             next_label = labels[j]
             # Stop if we hit a new B- tag (except for non-spaced tokens)
+            if next_label.startswith("B-") and tokens[j].startswith("▁"):
                 break
             # Stop if we hit a different entity type in I- tags
                 last_valid_end = offset_mapping[j][1]
                 j += 1
             # Continue if it's a non-spaced B- token
+            elif next_label.startswith("B-") and not tokens[j].startswith("▁"):
                 last_valid_end = offset_mapping[j][1]
                 j += 1
             else:

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 streamlit
-transformers

 streamlit
+transformers
+torch