import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import json
import os

# Hugging Face Hub repository that hosts the NER model used by this app.
# Replace 'doronpe12/log-anonymizer-heb-support' with your exact model repository name if it differs.
MODEL_ID = "doronpe12/log-anonymizer-heb-support"

# JSON file with the "id2label" / "label2id" tables, opened relative to the working directory.
# The pipeline itself takes its labels from the model's config.json; this copy is kept
# for the app's internal logic.
LABELS_FILE = "ner_labels.json"


def _default_label_maps():
    """Return the built-in ``(id2label, label2id)`` pair used when the labels file is unusable."""
    forward = {0: "O", 1: "B-PII", 2: "I-PII"}
    reverse = {tag: idx for idx, tag in forward.items()}
    return forward, reverse


# --- Load Label Mappings ---
# JSON object keys are always strings, so the ids are converted back to int here.
# Any failure (missing file, malformed JSON, missing keys) falls back to the default tables.
try:
    with open(LABELS_FILE, 'r', encoding='utf-8') as labels_fh:
        label_mappings = json.load(labels_fh)
    id2label = dict(
        (int(raw_id), tag) for raw_id, tag in label_mappings["id2label"].items()
    )
    label2id = label_mappings["label2id"]
    print(f"Loaded {len(id2label)} NER labels from {LABELS_FILE}.")
except FileNotFoundError:
    print(f"Error: {LABELS_FILE} not found. Please ensure it's uploaded to the Space.")
    # Not expected to be reached when the file ships with the Space.
    id2label, label2id = _default_label_maps()
except Exception as e:
    print(f"Error loading {LABELS_FILE}: {e}")
    id2label, label2id = _default_label_maps()


# --- Load the Model and Tokenizer ---
# Model and tokenizer both come from the same Hub repository. No id2label argument
# is passed: that argument was removed to fix an earlier problem.
print(f"Loading NER pipeline for model: {MODEL_ID}...")
_pipeline_options = {
    "model": MODEL_ID,
    "tokenizer": MODEL_ID,
    "aggregation_strategy": "simple",
}
try:
    ner_pipeline = pipeline("ner", **_pipeline_options)
    print("NER pipeline loaded successfully.")
except Exception as load_error:
    # Keep the app importable when the model cannot be loaded; predict_pii
    # checks for None and reports the problem to the user instead.
    for message in (
        f"Failed to load NER pipeline: {load_error}",
        "Please ensure the model ID is correct and accessible on Hugging Face Hub.",
        "If your model is private, ensure your Space is also private.",
    ):
        print(message)
    ner_pipeline = None

# --- Prediction Function for Gradio ---
def predict_pii(text):
    """Run the NER pipeline on a log line and shape the result for the Gradio UI.

    Args:
        text: The log line entered by the user.

    Returns:
        A 2-tuple ``(highlight, status)``. ``highlight`` is a dict with the keys
        ``"text"`` and ``"entities"``; each entity carries ``start``, ``end``,
        ``entity`` and ``score``. ``status`` is a human-readable message.
    """
    if ner_pipeline is None:
        return {"text": text, "entities": []}, "Error: Model not loaded. Please check logs."

    # Nothing to analyze: skip the model call for missing or blank input.
    if text is None or not text.strip():
        return {"text": text or "", "entities": []}, "No input text provided."

    results = ner_pipeline(text)

    formatted_entities = [
        {
            "start": entity["start"],
            "end": entity["end"],
            "entity": entity["entity_group"],
            # A plain float keeps the payload serializable whatever scalar
            # type the pipeline hands back.
            "score": float(entity["score"]),
        }
        for entity in results
        # An entity without character offsets cannot be mapped onto a span
        # of the text, so it is left out of the highlight list.
        if entity.get("start") is not None and entity.get("end") is not None
    ]

    return {"text": text, "entities": formatted_entities}, "PII detection complete."

# --- Gradio Interface ---
# Sample log lines offered in the UI (English and Hebrew).
examples = [
    "[2024-07-30 10:30:00] INFO: User John Doe from 192.168.1.10 accessed /data/reports on web-server-01. Email: john.doe@example.com, Phone: +972-50-1234567. ID: 123456789.",
    "[2024-07-30 11:15:20] ERROR: Failed authentication for username admin with password 'MyS3cr3tP@ssw0rd!' from IP 203.0.113.45. API Key: sk-xyz123abc.",
    "[2024-07-30 12:00:00] DEBUG: System heartbeat ok. No PII here.",
    "[2024-07-30 13:45:10] AUDIT: Transaction IL110111522200000333999 initiated by Jane Smith for 1,500,000.50 ILS. Bank: Bank Hapoalim.",
    "[2024-07-30 15:00:00] INFO: בקשה מ-172.16.0.10 ל-api.mycompany.co.il. סטטוס: 200 OK. שם משתמש: upwwe13.",
]

# Components are created in the same order as they are wired below:
# the log-line input first, then the two outputs returned by predict_pii.
log_line_input = gr.Textbox(
    lines=5,
    placeholder="הכנס כאן שורת לוג לבדיקת PII...",
    label="שורת לוג",
)
highlight_output = gr.HighlightedText(label="תוצאות זיהוי PII")
status_output = gr.Textbox(label="סטטוס/הודעות")

iface = gr.Interface(
    fn=predict_pii,
    inputs=log_line_input,
    outputs=[highlight_output, status_output],
    title="מודל זיהוי PII בקבצי לוג (עברית ואנגלית)",
    description="הכנס שורת לוג כדי לזהות מידע מזהה אישית (PII) כגון כתובות IP, שמות משתמש, סיסמאות, מיילים, מספרי טלפון, פרטי חשבון בנק ועוד.",
    examples=examples,
    flagging_mode="never",  # set explicitly; this resolved a Gradio warning
)

iface.launch()