Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,19 +4,18 @@ import json
|
|
| 4 |
import os
|
| 5 |
|
| 6 |
# Define your model ID on Hugging Face Hub
|
| 7 |
-
# Make sure to replace '
|
| 8 |
-
# and 'hebrew-english-log-pii-ner-model' with the exact repository name you pushed to.
|
| 9 |
MODEL_ID = "doronpe12/log-anonymizer-heb-support"
|
| 10 |
|
| 11 |
# Path to the ner_labels.json file.
|
| 12 |
-
#
|
| 13 |
LABELS_FILE = "ner_labels.json"
|
| 14 |
|
| 15 |
# --- Load Label Mappings ---
|
| 16 |
# This part ensures the Gradio app knows how to interpret the NER tags.
|
|
|
|
|
|
|
| 17 |
try:
|
| 18 |
-
# Attempt to load ner_labels.json from the same directory as app.py
|
| 19 |
-
# This is how it will be found on Hugging Face Spaces
|
| 20 |
with open(LABELS_FILE, 'r', encoding='utf-8') as f:
|
| 21 |
label_mappings = json.load(f)
|
| 22 |
id2label = {int(k): v for k, v in label_mappings["id2label"].items()}
|
|
@@ -24,64 +23,51 @@ try:
|
|
| 24 |
print(f"Loaded {len(id2label)} NER labels from {LABELS_FILE}.")
|
| 25 |
except FileNotFoundError:
|
| 26 |
print(f"Error: {LABELS_FILE} not found. Please ensure it's uploaded to the Space.")
|
| 27 |
-
# Fallback
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
label2id = {"O": 0, "B-PERSON": 1, "I-PERSON": 2}
|
| 31 |
except Exception as e:
|
| 32 |
print(f"Error loading {LABELS_FILE}: {e}")
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
label2id = {"O": 0, "B-PERSON": 1, "I-PERSON": 2}
|
| 36 |
|
| 37 |
|
| 38 |
# --- Load the Model and Tokenizer ---
|
| 39 |
-
# The pipeline will automatically download the model from MODEL_ID
|
| 40 |
-
# if it's not already cached in the Space's environment.
|
| 41 |
print(f"Loading NER pipeline for model: {MODEL_ID}...")
|
| 42 |
try:
|
| 43 |
-
# Pass id2label to the pipeline for readable output
|
| 44 |
ner_pipeline = pipeline(
|
| 45 |
"ner",
|
| 46 |
model=MODEL_ID,
|
| 47 |
tokenizer=MODEL_ID,
|
| 48 |
-
aggregation_strategy="simple",
|
| 49 |
-
|
| 50 |
)
|
| 51 |
print("NER pipeline loaded successfully.")
|
| 52 |
except Exception as e:
|
| 53 |
print(f"Failed to load NER pipeline: {e}")
|
| 54 |
print("Please ensure the model ID is correct and accessible on Hugging Face Hub.")
|
| 55 |
-
|
|
|
|
| 56 |
|
| 57 |
# --- Prediction Function for Gradio ---
|
| 58 |
def predict_pii(text):
|
| 59 |
-
"""
|
| 60 |
-
Predicts PII entities in the input text using the loaded NER pipeline.
|
| 61 |
-
Returns a list of dictionaries suitable for Gradio's HighlightedText component.
|
| 62 |
-
"""
|
| 63 |
if ner_pipeline is None:
|
| 64 |
return {"text": text, "entities": []}, "Error: Model not loaded. Please check logs."
|
| 65 |
|
| 66 |
results = ner_pipeline(text)
|
| 67 |
|
| 68 |
-
# Format results for Gradio's HighlightedText component
|
| 69 |
-
# HighlightedText expects a dict with 'text' and 'entities'
|
| 70 |
-
# Each entity is a dict with 'start', 'end', 'entity' (label), 'score' (optional)
|
| 71 |
formatted_entities = []
|
| 72 |
for entity in results:
|
| 73 |
-
# Gradio's HighlightedText needs 'entity' key for the label
|
| 74 |
formatted_entities.append({
|
| 75 |
"start": entity['start'],
|
| 76 |
"end": entity['end'],
|
| 77 |
-
"entity": entity['entity_group'],
|
| 78 |
"score": entity['score']
|
| 79 |
})
|
| 80 |
|
| 81 |
return {"text": text, "entities": formatted_entities}, "PII detection complete."
|
| 82 |
|
| 83 |
# --- Gradio Interface ---
|
| 84 |
-
# Define example log entries
|
| 85 |
examples = [
|
| 86 |
"[2024-07-30 10:30:00] INFO: User John Doe from 192.168.1.10 accessed /data/reports on web-server-01. Email: john.doe@example.com, Phone: +972-50-1234567. ID: 123456789.",
|
| 87 |
"[2024-07-30 11:15:20] ERROR: Failed authentication for username admin with password 'MyS3cr3tP@ssw0rd!' from IP 203.0.113.45. API Key: sk-xyz123abc.",
|
|
@@ -90,7 +76,6 @@ examples = [
|
|
| 90 |
"[2024-07-30 15:00:00] INFO: בקשה מ-172.16.0.10 ל-api.mycompany.co.il. סטטוס: 200 OK. שם משתמש: upwwe13.",
|
| 91 |
]
|
| 92 |
|
| 93 |
-
# Create the Gradio interface
|
| 94 |
iface = gr.Interface(
|
| 95 |
fn=predict_pii,
|
| 96 |
inputs=gr.Textbox(lines=5, placeholder="הכנס כאן שורת לוג לבדיקת PII...", label="שורת לוג"),
|
|
@@ -101,8 +86,7 @@ iface = gr.Interface(
|
|
| 101 |
title="מודל זיהוי PII בקבצי לוג (עברית ואנגלית)",
|
| 102 |
description="הכנס שורת לוג כדי לזהות מידע מזהה אישית (PII) כגון כתובות IP, שמות משתמש, סיסמאות, מיילים, מספרי טלפון, פרטי חשבון בנק ועוד.",
|
| 103 |
examples=examples,
|
| 104 |
-
|
| 105 |
)
|
| 106 |
|
| 107 |
-
# Launch the Gradio app
|
| 108 |
iface.launch()
|
|
|
|
| 4 |
import os
|
| 5 |
|
| 6 |
# Define your model ID on Hugging Face Hub
|
| 7 |
+
# Make sure to replace 'doronpe12/log-anonymizer-heb-support' with your exact model repository name.
|
|
|
|
| 8 |
MODEL_ID = "doronpe12/log-anonymizer-heb-support"
|
| 9 |
|
| 10 |
# Path to the ner_labels.json file.
|
| 11 |
+
# This file is still required for the app's internal logic, even if not needed by the pipeline directly.
|
| 12 |
LABELS_FILE = "ner_labels.json"
|
| 13 |
|
| 14 |
# --- Load Label Mappings ---
|
| 15 |
# This part ensures the Gradio app knows how to interpret the NER tags.
|
| 16 |
+
# We still need this file for the app's logic, but the pipeline itself will get
|
| 17 |
+
# the labels from the model's config.json.
|
| 18 |
try:
|
|
|
|
|
|
|
| 19 |
with open(LABELS_FILE, 'r', encoding='utf-8') as f:
|
| 20 |
label_mappings = json.load(f)
|
| 21 |
id2label = {int(k): v for k, v in label_mappings["id2label"].items()}
|
|
|
|
| 23 |
print(f"Loaded {len(id2label)} NER labels from {LABELS_FILE}.")
|
| 24 |
except FileNotFoundError:
|
| 25 |
print(f"Error: {LABELS_FILE} not found. Please ensure it's uploaded to the Space.")
|
| 26 |
+
# Fallback labels (should not be reached if the file is present)
|
| 27 |
+
id2label = {0: "O", 1: "B-PII", 2: "I-PII"}
|
| 28 |
+
label2id = {v: k for k, v in id2label.items()}
|
|
|
|
| 29 |
except Exception as e:
|
| 30 |
print(f"Error loading {LABELS_FILE}: {e}")
|
| 31 |
+
id2label = {0: "O", 1: "B-PII", 2: "I-PII"}
|
| 32 |
+
label2id = {v: k for k, v in id2label.items()}
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
# --- Load the Model and Tokenizer ---
|
|
|
|
|
|
|
| 36 |
# Build the NER pipeline once, at import time. No id2label argument is
# passed; per this file's own notes the pipeline takes its label names from
# the model's config.json.
print(f"Loading NER pipeline for model: {MODEL_ID}...")
ner_pipeline = None  # stays None when loading fails, so callers can detect it
try:
    ner_pipeline = pipeline(
        "ner",
        model=MODEL_ID,
        tokenizer=MODEL_ID,
        aggregation_strategy="simple",
    )
except Exception as load_error:
    # Keep the app importable even without a model; the failure is only logged.
    for message in (
        f"Failed to load NER pipeline: {load_error}",
        "Please ensure the model ID is correct and accessible on Hugging Face Hub.",
        "If your model is private, ensure your Space is also private.",
    ):
        print(message)
else:
    print("NER pipeline loaded successfully.")
|
| 51 |
|
| 52 |
# --- Prediction Function for Gradio ---
|
| 53 |
def predict_pii(text):
    """Detect PII entities in a log line for the Gradio interface.

    Args:
        text: The log line entered by the user. ``None``, empty and
            whitespace-only values are accepted and yield no entities.

    Returns:
        A 2-tuple ``(highlighted, status)``. ``highlighted`` is a dict with
        the keys ``"text"`` and ``"entities"``; each entity is a dict with
        ``"start"``, ``"end"``, ``"entity"`` (copied from the pipeline
        result's ``entity_group`` field) and ``"score"``. ``status`` is a
        short human-readable message describing the outcome.
    """
    # Nothing to analyse: answer immediately instead of invoking the model.
    if text is None or not text.strip():
        return {"text": text or "", "entities": []}, "No input text provided."

    # The module-level loader leaves ner_pipeline as None when loading failed.
    if ner_pipeline is None:
        return {"text": text, "entities": []}, "Error: Model not loaded. Please check logs."

    try:
        results = ner_pipeline(text)
    except Exception as e:
        # Report inference failures through the status output (details go to
        # the logs) instead of letting the exception escape the handler.
        print(f"NER inference failed: {e}")
        return {"text": text, "entities": []}, "Error: PII detection failed. Please check logs."

    formatted_entities = [
        {
            "start": entity["start"],
            "end": entity["end"],
            "entity": entity["entity_group"],
            # Cast to a plain float so the value serialises cleanly whatever
            # numeric type the pipeline returns.
            "score": float(entity["score"]),
        }
        for entity in results
    ]

    return {"text": text, "entities": formatted_entities}, "PII detection complete."
|
| 69 |
|
| 70 |
# --- Gradio Interface ---
|
|
|
|
| 71 |
examples = [
|
| 72 |
"[2024-07-30 10:30:00] INFO: User John Doe from 192.168.1.10 accessed /data/reports on web-server-01. Email: john.doe@example.com, Phone: +972-50-1234567. ID: 123456789.",
|
| 73 |
"[2024-07-30 11:15:20] ERROR: Failed authentication for username admin with password 'MyS3cr3tP@ssw0rd!' from IP 203.0.113.45. API Key: sk-xyz123abc.",
|
|
|
|
| 76 |
"[2024-07-30 15:00:00] INFO: בקשה מ-172.16.0.10 ל-api.mycompany.co.il. סטטוס: 200 OK. שם משתמש: upwwe13.",
|
| 77 |
]
|
| 78 |
|
|
|
|
| 79 |
iface = gr.Interface(
|
| 80 |
fn=predict_pii,
|
| 81 |
inputs=gr.Textbox(lines=5, placeholder="הכנס כאן שורת לוג לבדיקת PII...", label="שורת לוג"),
|
|
|
|
| 86 |
title="מודל זיהוי PII בקבצי לוג (עברית ואנגלית)",
|
| 87 |
description="הכנס שורת לוג כדי לזהות מידע מזהה אישית (PII) כגון כתובות IP, שמות משתמש, סיסמאות, מיילים, מספרי טלפון, פרטי חשבון בנק ועוד.",
|
| 88 |
examples=examples,
|
| 89 |
+
flagging_mode="never" # <<< Fixed the Gradio warning
|
| 90 |
)
|
| 91 |
|
|
|
|
| 92 |
iface.launch()
|