from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification
import librosa
from detect_face import detect_face
from transformers import AutoModelForImageClassification
from transformers import AutoImageProcessor
from PIL import Image
import torch
import gradio as gr
from extract_frames import extract_frames

import os
import shutil

# =========================
# Load the main model (applied to whole images and video frames)
# =========================
_MAIN_REPO = "Jabrave/deepfake-detector"
model = AutoModelForImageClassification.from_pretrained(_MAIN_REPO)
processor = AutoImageProcessor.from_pretrained(_MAIN_REPO)

# =========================
# Load the face model (applied to the face crops from detect_face)
# =========================
_FACE_REPO = "Jabrave/face-detector"
face_model = AutoModelForImageClassification.from_pretrained(_FACE_REPO)
face_processor = AutoImageProcessor.from_pretrained(_FACE_REPO)

# =========================
# Load the voice model (applied to audio files)
# =========================
_VOICE_REPO = "Jabrave/voice-detector"
voice_model = AutoModelForAudioClassification.from_pretrained(_VOICE_REPO)
voice_processor = AutoFeatureExtractor.from_pretrained(_VOICE_REPO)

# =========================
# function predict model
# =========================
def predict_with_model(image, model, processor):
    """Classify one image and report the top-scoring class.

    Args:
        image: Image handed to ``processor`` through its ``images`` keyword.
        model: Callable classifier; its output must expose ``logits`` and
            the model must expose ``config.id2label``.
        processor: Callable that turns the image into the keyword inputs
            expected by ``model`` (called with ``return_tensors="pt"``).

    Returns:
        A dict with ``"label"`` (looked up in ``model.config.id2label``) and
        ``"confidence"`` (softmax probability of that class as a percentage,
        rounded to two decimals).
    """
    batch = processor(images=image, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        scores = model(**batch).logits

    # ``.item()`` requires a single prediction, i.e. a batch of one image.
    top_index = scores.argmax(-1).item()

    probabilities = torch.softmax(scores, dim=1)
    top_probability = probabilities[0][top_index].item()

    return {
        "label": model.config.id2label[top_index],
        "confidence": round(top_probability * 100, 2),
    }


def predict_audio(audio_path):
    """Classify an audio file with the voice model.

    Args:
        audio_path: Path of the audio file to analyse (the Gradio audio
            input in this file is configured with ``type="filepath"``).

    Returns:
        A dict with ``"label"`` (looked up in ``voice_model.config.id2label``)
        and ``"confidence"`` (softmax probability of that class as a
        percentage, rounded to two decimals).

    Raises:
        ValueError: If no audio path was supplied.
    """
    # An empty input would otherwise fail deep inside the audio loader with
    # an unhelpful error.
    if not audio_path:
        raise ValueError("No audio provided")

    # Resample to the rate the feature extractor declares so that loading and
    # feature extraction always agree; fall back to 16 kHz when the extractor
    # does not expose a rate.
    target_rate = getattr(voice_processor, "sampling_rate", None) or 16000

    waveform, _ = librosa.load(audio_path, sr=target_rate)

    inputs = voice_processor(
        waveform,
        sampling_rate=target_rate,
        return_tensors="pt"
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = voice_model(**inputs).logits

    # ``.item()`` requires a single prediction, i.e. one audio clip.
    predicted_class = logits.argmax(-1).item()
    confidence = torch.softmax(logits, dim=-1)[0][predicted_class].item()

    return {
        "label": voice_model.config.id2label[predicted_class],
        "confidence": round(confidence * 100, 2)
    }

# =========================
# IMAGE PREDICT
# =========================
def predict(image):
    """Classify a single image as "real" or "artificial".

    The image is written to a temporary JPEG, scored as a whole with the
    main model, and every face path returned by ``detect_face`` is scored
    with the face model. The result is "artificial" when the full image or
    any face receives a label other than "real".

    Args:
        image: Pixel array accepted by ``PIL.Image.fromarray``.

    Returns:
        A dict with the keys ``"label"``, ``"final_score"``,
        ``"full_image_score"``, ``"face_score"`` and ``"faces_detected"``.

    Raises:
        ValueError: If no image was supplied.
    """
    if image is None:
        raise ValueError("No image provided")

    # NOTE(review): both paths are fixed and relative to the working
    # directory, so overlapping requests would share them - confirm the app
    # only serves one request at a time.
    temp_path = "temp_image.jpg"
    faces_dir = "faces"

    # Start from a clean face directory so leftovers from an interrupted run
    # cannot leak into this one (same approach as predict_video).
    if os.path.exists(faces_dir):
        shutil.rmtree(faces_dir)

    try:
        # JPEG cannot store an alpha channel, so normalise to RGB before saving.
        Image.fromarray(image).convert("RGB").save(temp_path)

        # ----------------------
        # analyse the full image
        # ----------------------
        with Image.open(temp_path) as full_image:
            full_result = predict_with_model(
                full_image,
                model,
                processor
            )

        # ----------------------
        # detect faces
        # ----------------------
        os.makedirs(faces_dir, exist_ok=True)

        faces = detect_face(temp_path)

        face_scores = []
        fake_face_found = False

        for face_path in faces:
            with Image.open(face_path) as face_image:
                face_result = predict_with_model(
                    face_image,
                    face_model,
                    face_processor
                )

            face_scores.append(face_result["confidence"])

            if face_result["label"] != "real":
                fake_face_found = True

        # ----------------------
        # combine score
        # ----------------------
        full_score = full_result["confidence"]

        # Without any face the full-image score stands in for the face
        # score, so the final score equals the full-image score.
        avg_face_score = (
            sum(face_scores) / len(face_scores)
            if face_scores else full_score
        )

        final_score = (full_score + avg_face_score) / 2

        is_artificial = full_result["label"] != "real" or fake_face_found

        return {
            "label": "artificial" if is_artificial else "real",
            "final_score": round(final_score, 2),
            "full_image_score": round(full_score, 2),
            "face_score": round(avg_face_score, 2),
            "faces_detected": len(faces)
        }
    finally:
        # Remove the temporary artefacts even when inference fails.
        if os.path.exists(temp_path):
            os.remove(temp_path)

        if os.path.exists(faces_dir):
            shutil.rmtree(faces_dir)

# =========================
# VIDEO PREDICT
# =========================
def predict_video(video_path):
    """Classify a video as "real" or "artificial" from its extracted frames.

    Frames are written to ``frames/`` by ``extract_frames``. Each frame is
    scored as a whole with the main model, and every face path returned by
    ``detect_face`` is scored with the face model. A frame counts as fake
    when the frame itself or any of its faces gets a label other than
    "real"; the video is "artificial" when more than 30% of the frames
    count as fake.

    Args:
        video_path: Path of the video file to analyse.

    Returns:
        A dict with the keys ``"label"``, ``"final_score"``,
        ``"fake_frames"``, ``"total_frames"``, ``"full_image_score"`` and
        ``"face_score"``.

    Raises:
        ValueError: If no video path was supplied.
    """
    if not video_path:
        raise ValueError("No video provided")

    # NOTE(review): both directories are fixed and relative to the working
    # directory, so overlapping requests would share them - confirm the app
    # only serves one request at a time.
    frames_dir = "frames"
    faces_dir = "faces"

    # Share of fake frames above which the whole video is called artificial.
    fake_ratio_threshold = 0.3

    # Start from empty working directories.
    for work_dir in (frames_dir, faces_dir):
        if os.path.exists(work_dir):
            shutil.rmtree(work_dir)

    os.makedirs(frames_dir, exist_ok=True)
    os.makedirs(faces_dir, exist_ok=True)

    try:
        # extract frames
        extract_frames(
            video_path,
            frames_dir
        )

        fake_frames = 0
        full_scores = []
        face_scores = []

        # Sorted so frames are processed in a reproducible order.
        for frame_name in sorted(os.listdir(frames_dir)):

            frame_path = os.path.join(frames_dir, frame_name)

            # ----------------------
            # analyse the full frame
            # ----------------------
            with Image.open(frame_path) as frame_image:
                full_result = predict_with_model(
                    frame_image,
                    model,
                    processor
                )

            full_scores.append(full_result["confidence"])

            frame_is_fake = full_result["label"] != "real"

            # ----------------------
            # detect faces
            # ----------------------
            for face_path in detect_face(frame_path):
                with Image.open(face_path) as face_image:
                    face_result = predict_with_model(
                        face_image,
                        face_model,
                        face_processor
                    )

                face_scores.append(face_result["confidence"])

                if face_result["label"] != "real":
                    frame_is_fake = True

            # ----------------------
            # final frame decision
            # ----------------------
            if frame_is_fake:
                fake_frames += 1

        # One full-frame score is recorded per processed frame.
        total_frames = len(full_scores)

        # ----------------------
        # final score
        # ----------------------
        # NOTE: with no extracted frames both averages are 0 and the label
        # falls through to "real".
        avg_full = (
            sum(full_scores) / total_frames
            if full_scores else 0
        )

        # Without any face the full-frame average stands in for the face score.
        avg_face = (
            sum(face_scores) / len(face_scores)
            if face_scores else avg_full
        )

        final_score = (avg_full + avg_face) / 2

        final_label = (
            "artificial"
            if fake_frames > total_frames * fake_ratio_threshold
            else "real"
        )

        return {
            "label": final_label,
            "final_score": round(final_score, 2),
            "fake_frames": fake_frames,
            "total_frames": total_frames,
            "full_image_score": round(avg_full, 2),
            "face_score": round(avg_face, 2)
        }
    finally:
        # Remove the working directories even when extraction or inference fails.
        for work_dir in (frames_dir, faces_dir):
            if os.path.exists(work_dir):
                shutil.rmtree(work_dir)

# =========================
# UI
# =========================

# One interface per media type; each returns its result dict as JSON.
image_ui = gr.Interface(
    title="Image Deepfake Detector",
    fn=predict,
    inputs=gr.Image(),
    outputs=gr.JSON(),
)
video_ui = gr.Interface(
    title="Video Deepfake Detector",
    fn=predict_video,
    inputs=gr.Video(),
    outputs=gr.JSON(),
)
audio_ui = gr.Interface(
    title="Voice Deepfake Detector",
    fn=predict_audio,
    # predict_audio expects a file path rather than raw samples.
    inputs=gr.Audio(type="filepath"),
    outputs=gr.JSON(),
)

# Group the three interfaces into tabs and start the app.
_tabs = [image_ui, video_ui, audio_ui]
_tab_titles = ["Image", "Video", "Audio"]
demo = gr.TabbedInterface(_tabs, _tab_titles)

demo.launch()