# logAnonymizer / app.py
# (Hugging Face Space page residue kept as comments so the file parses:
#  doronpe12's picture / Update app.py / commit 62e3da0 verified)
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import json
import os
# --- Configuration ---
# Model repository on the Hugging Face Hub hosting the fine-tuned NER model.
MODEL_ID = "doronpe12/log-anonymizer-heb-support"
# Path to the ner_labels.json file.
# The pipeline itself reads labels from the model's config.json; this file
# only backs the app's internal logic.
LABELS_FILE = "ner_labels.json"
# Single source of truth for the fallback mapping, used whenever the labels
# file is missing or malformed (previously duplicated in both handlers).
_FALLBACK_ID2LABEL = {0: "O", 1: "B-PII", 2: "I-PII"}
# --- Load Label Mappings ---
# Teaches the Gradio app how to interpret the NER tags.
try:
    with open(LABELS_FILE, 'r', encoding='utf-8') as f:
        label_mappings = json.load(f)
    # JSON object keys are strings; convert back to the int ids the model uses.
    id2label = {int(k): v for k, v in label_mappings["id2label"].items()}
    label2id = label_mappings["label2id"]
    print(f"Loaded {len(id2label)} NER labels from {LABELS_FILE}.")
except FileNotFoundError:
    print(f"Error: {LABELS_FILE} not found. Please ensure it's uploaded to the Space.")
    id2label = dict(_FALLBACK_ID2LABEL)
    label2id = {v: k for k, v in id2label.items()}
except Exception as e:
    # Best-effort: a malformed file must not crash the Space at startup.
    print(f"Error loading {LABELS_FILE}: {e}")
    id2label = dict(_FALLBACK_ID2LABEL)
    label2id = {v: k for k, v in id2label.items()}
# --- Load the Model and Tokenizer ---
print(f"Loading NER pipeline for model: {MODEL_ID}...")
# Left as None when loading fails; predict_pii checks for this sentinel.
ner_pipeline = None
try:
    # aggregation_strategy="simple" merges word-piece tokens into whole entity
    # spans. The pipeline takes its label names from the model's config.json,
    # so no explicit id2label argument is needed (passing one broke loading).
    ner_pipeline = pipeline(
        "ner",
        model=MODEL_ID,
        tokenizer=MODEL_ID,
        aggregation_strategy="simple",
    )
except Exception as e:
    print(f"Failed to load NER pipeline: {e}")
    print("Please ensure the model ID is correct and accessible on Hugging Face Hub.")
    print("If your model is private, ensure your Space is also private.")
else:
    print("NER pipeline loaded successfully.")
# --- Prediction Function for Gradio ---
def predict_pii(text):
    """Run the NER pipeline on one log line and format the result for Gradio.

    Args:
        text: A single log line (Hebrew and/or English).

    Returns:
        A ``(highlight, status)`` tuple: ``highlight`` is the
        ``{"text": ..., "entities": [...]}`` structure expected by
        ``gr.HighlightedText``; ``status`` is a human-readable message.
    """
    # The pipeline is loaded at import time; None means loading failed.
    if ner_pipeline is None:
        return {"text": text, "entities": []}, "Error: Model not loaded. Please check logs."
    results = ner_pipeline(text)
    # Map the pipeline output onto Gradio's HighlightedText entity schema.
    # 'score' comes back as a numpy float32; cast to a plain Python float so
    # the result serializes cleanly to JSON for the frontend.
    formatted_entities = [
        {
            "start": entity["start"],
            "end": entity["end"],
            "entity": entity["entity_group"],
            "score": float(entity["score"]),
        }
        for entity in results
    ]
    return {"text": text, "entities": formatted_entities}, "PII detection complete."
# --- Gradio Interface ---
# Sample log lines (English and Hebrew) shown in the UI. They cover the main
# PII categories the model targets — names, IPs, emails, phone numbers, IDs,
# passwords, API keys, bank/IBAN details — plus one PII-free negative example.
examples = [
    "[2024-07-30 10:30:00] INFO: User John Doe from 192.168.1.10 accessed /data/reports on web-server-01. Email: john.doe@example.com, Phone: +972-50-1234567. ID: 123456789.",
    "[2024-07-30 11:15:20] ERROR: Failed authentication for username admin with password 'MyS3cr3tP@ssw0rd!' from IP 203.0.113.45. API Key: sk-xyz123abc.",
    "[2024-07-30 12:00:00] DEBUG: System heartbeat ok. No PII here.",
    "[2024-07-30 13:45:10] AUDIT: Transaction IL110111522200000333999 initiated by Jane Smith for 1,500,000.50 ILS. Bank: Bank Hapoalim.",
    "[2024-07-30 15:00:00] INFO: 讘拽砖讛 诪-172.16.0.10 诇-api.mycompany.co.il. 住讟讟讜住: 200 OK. 砖诐 诪砖转诪砖: upwwe13.",
]
# Build the UI: one multi-line textbox in, highlighted entities plus a status
# message out. flagging_mode="never" disables Gradio's flagging feature
# (this replaced the deprecated allow_flagging argument and its warning).
log_input = gr.Textbox(lines=5, placeholder="讛讻谞住 讻讗谉 砖讜专转 诇讜讙 诇讘讚讬拽转 PII...", label="砖讜专转 诇讜讙")
pii_highlight = gr.HighlightedText(label="转讜爪讗讜转 讝讬讛讜讬 PII")
status_box = gr.Textbox(label="住讟讟讜住/讛讜讚注讜转")

iface = gr.Interface(
    fn=predict_pii,
    inputs=log_input,
    outputs=[pii_highlight, status_box],
    title="诪讜讚诇 讝讬讛讜讬 PII 讘拽讘爪讬 诇讜讙 (注讘专讬转 讜讗谞讙诇讬转)",
    description="讛讻谞住 砖讜专转 诇讜讙 讻讚讬 诇讝讛讜转 诪讬讚注 诪讝讛讛 讗讬砖讬转 (PII) 讻讙讜谉 讻转讜讘讜转 IP, 砖诪讜转 诪砖转诪砖, 住讬住诪讗讜转, 诪讬讬诇讬诐, 诪住驻专讬 讟诇驻讜谉, 驻专讟讬 讞砖讘讜谉 讘谞拽 讜注讜讚.",
    examples=examples,
    flagging_mode="never",
)
iface.launch()