Spaces:

doronpe12
/

logAnonymizer

Sleeping

App Files Community

doronpe12 committed on Jul 31, 2025

Commit

62e3da0

verified ·

1 Parent(s): 4f67600

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -31

app.py CHANGED Viewed

@@ -4,19 +4,18 @@ import json
 import os
 # Define your model ID on Hugging Face Hub
-# Make sure to replace 'your-username' with your actual Hugging Face username
-# and 'hebrew-english-log-pii-ner-model' with the exact repository name you pushed to.
 MODEL_ID = "doronpe12/log-anonymizer-heb-support"
 # Path to the ner_labels.json file.
-# When deployed on Hugging Face Spaces, this file will be in the root of the repo.
 LABELS_FILE = "ner_labels.json"
 # --- Load Label Mappings ---
 # This part ensures the Gradio app knows how to interpret the NER tags.
 try:
-    # Attempt to load ner_labels.json from the same directory as app.py
-    # This is how it will be found on Hugging Face Spaces
     with open(LABELS_FILE, 'r', encoding='utf-8') as f:
         label_mappings = json.load(f)
     id2label = {int(k): v for k, v in label_mappings["id2label"].items()}
@@ -24,64 +23,51 @@ try:
     print(f"Loaded {len(id2label)} NER labels from {LABELS_FILE}.")
 except FileNotFoundError:
     print(f"Error: {LABELS_FILE} not found. Please ensure it's uploaded to the Space.")
-    # Fallback if the file is not found (e.g., during local testing without the file)
-    # In a real Space, you'd want this file to be present.
-    id2label = {0: "O", 1: "B-PERSON", 2: "I-PERSON"} # Minimal fallback
-    label2id = {"O": 0, "B-PERSON": 1, "I-PERSON": 2}
 except Exception as e:
     print(f"Error loading {LABELS_FILE}: {e}")
-    # Fallback to minimal labels
-    id2label = {0: "O", 1: "B-PERSON", 2: "I-PERSON"}
-    label2id = {"O": 0, "B-PERSON": 1, "I-PERSON": 2}
 # --- Load the Model and Tokenizer ---
-# The pipeline will automatically download the model from MODEL_ID
-# if it's not already cached in the Space's environment.
 print(f"Loading NER pipeline for model: {MODEL_ID}...")
 try:
-    # Pass id2label to the pipeline for readable output
     ner_pipeline = pipeline(
         "ner",
         model=MODEL_ID,
         tokenizer=MODEL_ID,
-        aggregation_strategy="simple", # Aggregates B-I-I tokens into a single entity
-        id2label=id2label # Ensures output labels are human-readable strings
     )
     print("NER pipeline loaded successfully.")
 except Exception as e:
     print(f"Failed to load NER pipeline: {e}")
     print("Please ensure the model ID is correct and accessible on Hugging Face Hub.")
-    ner_pipeline = None # Set to None to handle errors gracefully in the predict function
 # --- Prediction Function for Gradio ---
 def predict_pii(text):
-    """
-    Predicts PII entities in the input text using the loaded NER pipeline.
-    Returns a list of dictionaries suitable for Gradio's HighlightedText component.
-    """
     if ner_pipeline is None:
         return {"text": text, "entities": []}, "Error: Model not loaded. Please check logs."
     results = ner_pipeline(text)
-    # Format results for Gradio's HighlightedText component
-    # HighlightedText expects a dict with 'text' and 'entities'
-    # Each entity is a dict with 'start', 'end', 'entity' (label), 'score' (optional)
     formatted_entities = []
     for entity in results:
-        # Gradio's HighlightedText needs 'entity' key for the label
         formatted_entities.append({
             "start": entity['start'],
             "end": entity['end'],
-            "entity": entity['entity_group'], # Use entity_group for aggregated label
             "score": entity['score']
         })
     return {"text": text, "entities": formatted_entities}, "PII detection complete."
 # --- Gradio Interface ---
-# Define example log entries
 examples = [
     "[2024-07-30 10:30:00] INFO: User John Doe from 192.168.1.10 accessed /data/reports on web-server-01. Email: john.doe@example.com, Phone: +972-50-1234567. ID: 123456789.",
     "[2024-07-30 11:15:20] ERROR: Failed authentication for username admin with password 'MyS3cr3tP@ssw0rd!' from IP 203.0.113.45. API Key: sk-xyz123abc.",
@@ -90,7 +76,6 @@ examples = [
     "[2024-07-30 15:00:00] INFO: בקשה מ-172.16.0.10 ל-api.mycompany.co.il. סטטוס: 200 OK. שם משתמש: upwwe13.",
 ]
-# Create the Gradio interface
 iface = gr.Interface(
     fn=predict_pii,
     inputs=gr.Textbox(lines=5, placeholder="הכנס כאן שורת לוג לבדיקת PII...", label="שורת לוג"),
@@ -101,8 +86,7 @@ iface = gr.Interface(
     title="מודל זיהוי PII בקבצי לוג (עברית ואנגלית)",
     description="הכנס שורת לוג כדי לזהות מידע מזהה אישית (PII) כגון כתובות IP, שמות משתמש, סיסמאות, מיילים, מספרי טלפון, פרטי חשבון בנק ועוד.",
     examples=examples,
-    allow_flagging="never" # Disable flagging for this demo
 )
-# Launch the Gradio app
 iface.launch()

 import os
 # Define your model ID on Hugging Face Hub
+# Make sure to replace 'doronpe12/log-anonymizer-heb-support' with your exact model repository name.
 MODEL_ID = "doronpe12/log-anonymizer-heb-support"
 # Path to the ner_labels.json file.
+# This file is still required for the app's internal logic, even if not needed by the pipeline directly.
 LABELS_FILE = "ner_labels.json"
 # --- Load Label Mappings ---
 # This part ensures the Gradio app knows how to interpret the NER tags.
+# We still need this file for the app's logic, but the pipeline itself will get
+# the labels from the model's config.json.
 try:
     with open(LABELS_FILE, 'r', encoding='utf-8') as f:
         label_mappings = json.load(f)
     id2label = {int(k): v for k, v in label_mappings["id2label"].items()}
     print(f"Loaded {len(id2label)} NER labels from {LABELS_FILE}.")
 except FileNotFoundError:
     print(f"Error: {LABELS_FILE} not found. Please ensure it's uploaded to the Space.")
+    # Fallback labels (should not be reached if the file is present)
+    id2label = {0: "O", 1: "B-PII", 2: "I-PII"}
+    label2id = {v: k for k, v in id2label.items()}
 except Exception as e:
     print(f"Error loading {LABELS_FILE}: {e}")
+    id2label = {0: "O", 1: "B-PII", 2: "I-PII"}
+    label2id = {v: k for k, v in id2label.items()}
 # --- Load the Model and Tokenizer ---
 print(f"Loading NER pipeline for model: {MODEL_ID}...")
 try:
     ner_pipeline = pipeline(
         "ner",
         model=MODEL_ID,
         tokenizer=MODEL_ID,
+        aggregation_strategy="simple",
+        # <<< The problem was fixed here: the id2label argument was removed
     )
     print("NER pipeline loaded successfully.")
 except Exception as e:
     print(f"Failed to load NER pipeline: {e}")
     print("Please ensure the model ID is correct and accessible on Hugging Face Hub.")
+    print("If your model is private, ensure your Space is also private.")
+    ner_pipeline = None
 # --- Prediction Function for Gradio ---
 def predict_pii(text):
     if ner_pipeline is None:
         return {"text": text, "entities": []}, "Error: Model not loaded. Please check logs."
     results = ner_pipeline(text)
     formatted_entities = []
     for entity in results:
         formatted_entities.append({
             "start": entity['start'],
             "end": entity['end'],
+            "entity": entity['entity_group'],
             "score": entity['score']
         })
     return {"text": text, "entities": formatted_entities}, "PII detection complete."
 # --- Gradio Interface ---
 examples = [
     "[2024-07-30 10:30:00] INFO: User John Doe from 192.168.1.10 accessed /data/reports on web-server-01. Email: john.doe@example.com, Phone: +972-50-1234567. ID: 123456789.",
     "[2024-07-30 11:15:20] ERROR: Failed authentication for username admin with password 'MyS3cr3tP@ssw0rd!' from IP 203.0.113.45. API Key: sk-xyz123abc.",
     "[2024-07-30 15:00:00] INFO: בקשה מ-172.16.0.10 ל-api.mycompany.co.il. סטטוס: 200 OK. שם משתמש: upwwe13.",
 ]
 iface = gr.Interface(
     fn=predict_pii,
     inputs=gr.Textbox(lines=5, placeholder="הכנס כאן שורת לוג לבדיקת PII...", label="שורת לוג"),
     title="מודל זיהוי PII בקבצי לוג (עברית ואנגלית)",
     description="הכנס שורת לוג כדי לזהות מידע מזהה אישית (PII) כגון כתובות IP, שמות משתמש, סיסמאות, מיילים, מספרי טלפון, פרטי חשבון בנק ועוד.",
     examples=examples,
+    flagging_mode="never" # <<< תוקנה האזהרה של Gradio
 )
 iface.launch()