"""Gradio demo app that highlights PII entities in log lines.

The app loads a token-classification (NER) pipeline from the Hugging Face Hub
and exposes it through a small Gradio interface: one text input, a highlighted
view of the detected entities, and a status message.
"""

import json
import os

import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Define your model ID on Hugging Face Hub
# Make sure to replace 'doronpe12/log-anonymizer-heb-support' with your exact model repository name.
MODEL_ID = "doronpe12/log-anonymizer-heb-support"

# Path to the ner_labels.json file.
# This file is still required for the app's internal logic, even if not needed by the pipeline directly.
LABELS_FILE = "ner_labels.json"

# Minimal label set used when LABELS_FILE is missing or unreadable.
_FALLBACK_ID2LABEL = {0: "O", 1: "B-PII", 2: "I-PII"}


def _fallback_label_mappings():
    """Return fresh ``(id2label, label2id)`` dicts built from the fallback labels."""
    id2label = dict(_FALLBACK_ID2LABEL)
    return id2label, {label: idx for idx, label in id2label.items()}


def _load_label_mappings(path):
    """Load the ``(id2label, label2id)`` mappings from the JSON file at *path*.

    The file must contain the keys ``"id2label"`` (whose keys are converted to
    ``int``) and ``"label2id"`` (used as stored). On any failure the problem is
    printed and the fallback mappings are returned, so the app still starts.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            label_mappings = json.load(f)
        id2label = {int(k): v for k, v in label_mappings["id2label"].items()}
        label2id = label_mappings["label2id"]
    except FileNotFoundError:
        print(f"Error: {path} not found. Please ensure it's uploaded to the Space.")
        # Fallback labels (should not be reached if the file is present)
        return _fallback_label_mappings()
    except Exception as e:
        # Deliberately broad: a malformed labels file must not stop the app from starting.
        print(f"Error loading {path}: {e}")
        return _fallback_label_mappings()
    print(f"Loaded {len(id2label)} NER labels from {path}.")
    return id2label, label2id


def _load_ner_pipeline(model_id):
    """Build the NER pipeline for *model_id*; return ``None`` if loading fails."""
    print(f"Loading NER pipeline for model: {model_id}...")
    try:
        loaded = pipeline(
            "ner",
            model=model_id,
            tokenizer=model_id,
            aggregation_strategy="simple",
            # The earlier problem was solved here by removing the id2label argument.
        )
    except Exception as e:
        # Startup boundary: report the failure and let the UI come up in a degraded state.
        print(f"Failed to load NER pipeline: {e}")
        print("Please ensure the model ID is correct and accessible on Hugging Face Hub.")
        print("If your model is private, ensure your Space is also private.")
        return None
    print("NER pipeline loaded successfully.")
    return loaded


# --- Load Label Mappings ---
# This part ensures the Gradio app knows how to interpret the NER tags.
# We still need this file for the app's logic, but the pipeline itself will get
# the labels from the model's config.json.
id2label, label2id = _load_label_mappings(LABELS_FILE)

# --- Load the Model and Tokenizer ---
ner_pipeline = _load_ner_pipeline(MODEL_ID)


# --- Prediction Function for Gradio ---
def predict_pii(text):
    """Detect PII entities in *text* and format them for the Gradio outputs.

    Args:
        text: The log line (or lines) to analyse. ``None`` is treated as "".

    Returns:
        A 2-tuple ``(highlighted, status)``. ``highlighted`` is a dict with the
        keys ``"text"`` and ``"entities"``; each entity is a dict with
        ``"start"``, ``"end"``, ``"entity"`` and ``"score"``. ``status`` is a
        human-readable message describing the outcome.
    """
    text = "" if text is None else text

    if ner_pipeline is None:
        return {"text": text, "entities": []}, "Error: Model not loaded. Please check logs."

    # Nothing to analyse: skip the model call instead of feeding it blank input.
    if not text.strip():
        return {"text": text, "entities": []}, "No input text provided."

    try:
        results = ner_pipeline(text)
    except Exception as e:
        # UI boundary: log the details and report through the status output
        # rather than surfacing an opaque error in the interface.
        print(f"PII detection failed: {e}")
        return {"text": text, "entities": []}, "Error: PII detection failed. Please check logs."

    # Cast to built-in types so the payload never carries library scalar types.
    formatted_entities = [
        {
            "start": int(entity['start']),
            "end": int(entity['end']),
            "entity": entity['entity_group'],
            "score": float(entity['score']),
        }
        for entity in results
    ]
    return {"text": text, "entities": formatted_entities}, "PII detection complete."


# --- Gradio Interface ---
examples = [
    "[2024-07-30 10:30:00] INFO: User John Doe from 192.168.1.10 accessed /data/reports on web-server-01. Email: john.doe@example.com, Phone: +972-50-1234567. ID: 123456789.",
    "[2024-07-30 11:15:20] ERROR: Failed authentication for username admin with password 'MyS3cr3tP@ssw0rd!' from IP 203.0.113.45. API Key: sk-xyz123abc.",
    "[2024-07-30 12:00:00] DEBUG: System heartbeat ok. No PII here.",
    "[2024-07-30 13:45:10] AUDIT: Transaction IL110111522200000333999 initiated by Jane Smith for 1,500,000.50 ILS. Bank: Bank Hapoalim.",
    "[2024-07-30 15:00:00] INFO: בקשה מ-172.16.0.10 ל-api.mycompany.co.il. סטטוס: 200 OK. שם משתמש: upwwe13.",
]

iface = gr.Interface(
    fn=predict_pii,
    inputs=gr.Textbox(lines=5, placeholder="הכנס כאן שורת לוג לבדיקת PII...", label="שורת לוג"),
    outputs=[
        gr.HighlightedText(label="תוצאות זיהוי PII"),
        gr.Textbox(label="סטטוס/הודעות")
    ],
    title="מודל זיהוי PII בקבצי לוג (עברית ואנגלית)",
    description="הכנס שורת לוג כדי לזהות מידע מזהה אישית (PII) כגון כתובות IP, שמות משתמש, סיסמאות, מיילים, מספרי טלפון, פרטי חשבון בנק ועוד.",
    examples=examples,
    flagging_mode="never"  # The Gradio warning was fixed by this setting.
)

# Launched at import time, as the original script did.
iface.launch()