# logAnonymizer / app.py
# (Hugging Face Space page residue kept as comments so the file parses:
#  doronpe12's picture / Update app.py / commit 62e3da0 verified)
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import json
import os
# --- Configuration ---
# Model repository on the Hugging Face Hub hosting the fine-tuned NER model.
MODEL_ID = "doronpe12/log-anonymizer-heb-support"
# Path to the ner_labels.json file.
# The pipeline itself reads labels from the model's config.json; this file
# only backs the app's internal logic.
LABELS_FILE = "ner_labels.json"
# Single source of truth for the fallback mapping, used whenever the labels
# file is missing or malformed (previously duplicated in both handlers).
_FALLBACK_ID2LABEL = {0: "O", 1: "B-PII", 2: "I-PII"}
# --- Load Label Mappings ---
# Teaches the Gradio app how to interpret the NER tags.
try:
    with open(LABELS_FILE, 'r', encoding='utf-8') as f:
        label_mappings = json.load(f)
    # JSON object keys are strings; convert back to the int ids the model uses.
    id2label = {int(k): v for k, v in label_mappings["id2label"].items()}
    label2id = label_mappings["label2id"]
    print(f"Loaded {len(id2label)} NER labels from {LABELS_FILE}.")
except FileNotFoundError:
    print(f"Error: {LABELS_FILE} not found. Please ensure it's uploaded to the Space.")
    id2label = dict(_FALLBACK_ID2LABEL)
    label2id = {v: k for k, v in id2label.items()}
except Exception as e:
    # Best-effort: a malformed file must not crash the Space at startup.
    print(f"Error loading {LABELS_FILE}: {e}")
    id2label = dict(_FALLBACK_ID2LABEL)
    label2id = {v: k for k, v in id2label.items()}
# --- Load the Model and Tokenizer ---
print(f"Loading NER pipeline for model: {MODEL_ID}...")
# Left as None when loading fails; predict_pii checks for this sentinel.
ner_pipeline = None
try:
    # aggregation_strategy="simple" merges word-piece tokens into whole entity
    # spans. The pipeline takes its label names from the model's config.json,
    # so no explicit id2label argument is needed (passing one broke loading).
    ner_pipeline = pipeline(
        "ner",
        model=MODEL_ID,
        tokenizer=MODEL_ID,
        aggregation_strategy="simple",
    )
except Exception as e:
    print(f"Failed to load NER pipeline: {e}")
    print("Please ensure the model ID is correct and accessible on Hugging Face Hub.")
    print("If your model is private, ensure your Space is also private.")
else:
    print("NER pipeline loaded successfully.")
# --- Prediction Function for Gradio ---
def predict_pii(text):
    """Run the NER pipeline on one log line and format the result for Gradio.

    Args:
        text: A single log line (Hebrew and/or English).

    Returns:
        A ``(highlight, status)`` tuple: ``highlight`` is the
        ``{"text": ..., "entities": [...]}`` structure expected by
        ``gr.HighlightedText``; ``status`` is a human-readable message.
    """
    # The pipeline is loaded at import time; None means loading failed.
    if ner_pipeline is None:
        return {"text": text, "entities": []}, "Error: Model not loaded. Please check logs."
    results = ner_pipeline(text)
    # Map the pipeline output onto Gradio's HighlightedText entity schema.
    # 'score' comes back as a numpy float32; cast to a plain Python float so
    # the result serializes cleanly to JSON for the frontend.
    formatted_entities = [
        {
            "start": entity["start"],
            "end": entity["end"],
            "entity": entity["entity_group"],
            "score": float(entity["score"]),
        }
        for entity in results
    ]
    return {"text": text, "entities": formatted_entities}, "PII detection complete."
# --- Gradio Interface ---
# Sample log lines (English and Hebrew) shown in the UI. They cover the main
# PII categories the model targets — names, IPs, emails, phone numbers, IDs,
# passwords, API keys, bank/IBAN details — plus one PII-free negative example.
examples = [
    "[2024-07-30 10:30:00] INFO: User John Doe from 192.168.1.10 accessed /data/reports on web-server-01. Email: john.doe@example.com, Phone: +972-50-1234567. ID: 123456789.",
    "[2024-07-30 11:15:20] ERROR: Failed authentication for username admin with password 'MyS3cr3tP@ssw0rd!' from IP 203.0.113.45. API Key: sk-xyz123abc.",
    "[2024-07-30 12:00:00] DEBUG: System heartbeat ok. No PII here.",
    "[2024-07-30 13:45:10] AUDIT: Transaction IL110111522200000333999 initiated by Jane Smith for 1,500,000.50 ILS. Bank: Bank Hapoalim.",
    "[2024-07-30 15:00:00] INFO: 讘拽砖讛 诪-172.16.0.10 诇-api.mycompany.co.il. 住讟讟讜住: 200 OK. 砖诐 诪砖转诪砖: upwwe13.",
]
# Build the UI: one multi-line textbox in, highlighted entities plus a status
# message out. flagging_mode="never" disables Gradio's flagging feature
# (this replaced the deprecated allow_flagging argument and its warning).
log_input = gr.Textbox(lines=5, placeholder="讛讻谞住 讻讗谉 砖讜专转 诇讜讙 诇讘讚讬拽转 PII...", label="砖讜专转 诇讜讙")
pii_highlight = gr.HighlightedText(label="转讜爪讗讜转 讝讬讛讜讬 PII")
status_box = gr.Textbox(label="住讟讟讜住/讛讜讚注讜转")

iface = gr.Interface(
    fn=predict_pii,
    inputs=log_input,
    outputs=[pii_highlight, status_box],
    title="诪讜讚诇 讝讬讛讜讬 PII 讘拽讘爪讬 诇讜讙 (注讘专讬转 讜讗谞讙诇讬转)",
    description="讛讻谞住 砖讜专转 诇讜讙 讻讚讬 诇讝讛讜转 诪讬讚注 诪讝讛讛 讗讬砖讬转 (PII) 讻讙讜谉 讻转讜讘讜转 IP, 砖诪讜转 诪砖转诪砖, 住讬住诪讗讜转, 诪讬讬诇讬诐, 诪住驻专讬 讟诇驻讜谉, 驻专讟讬 讞砖讘讜谉 讘谞拽 讜注讜讚.",
    examples=examples,
    flagging_mode="never",
)
iface.launch()