# deepfake-api / app.py
# Jabrave's picture
# Update app.py
# 6495d4e verified
from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification
import librosa
from detect_face import detect_face
from transformers import AutoModelForImageClassification
from transformers import AutoImageProcessor
from PIL import Image
import torch
import gradio as gr
from extract_frames import extract_frames
import os
import shutil
# =========================
# Load the main (full-image) model
# =========================
model = AutoModelForImageClassification.from_pretrained("Jabrave/deepfake-detector")
processor = AutoImageProcessor.from_pretrained("Jabrave/deepfake-detector")

# =========================
# Load the face model
# =========================
face_model = AutoModelForImageClassification.from_pretrained("Jabrave/face-detector")
face_processor = AutoImageProcessor.from_pretrained("Jabrave/face-detector")

# =========================
# Load the voice model
# =========================
voice_model = AutoModelForAudioClassification.from_pretrained("Jabrave/voice-detector")
voice_processor = AutoFeatureExtractor.from_pretrained("Jabrave/voice-detector")
# =========================
# function predict model
# =========================
def predict_with_model(image, model, processor):
    """Classify one image and report the winning label with its confidence.

    Args:
        image: Image handed to ``processor`` as its ``images`` argument.
        model: Classifier called with the processor output as keyword
            arguments; its result must expose ``logits`` and the model must
            expose ``config.id2label``.
        processor: Callable that turns the image into the model's inputs
            (requested as PyTorch tensors).

    Returns:
        A dict with ``"label"`` (the ``id2label`` entry of the top-scoring
        class) and ``"confidence"`` (that class's softmax probability as a
        percentage, rounded to two decimals).
    """
    batch = processor(images=image, return_tensors="pt")

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        scores = model(**batch).logits

    top_index = scores.argmax(-1).item()
    probabilities = torch.softmax(scores, dim=1)
    top_probability = probabilities[0][top_index].item()

    return {
        "label": model.config.id2label[top_index],
        "confidence": round(top_probability * 100, 2),
    }
def predict_audio(audio_path):
    """Classify an audio clip with the voice model.

    Args:
        audio_path: Path of the audio file to analyze. The Audio tab is
            configured with ``type="filepath"``, so an empty submission
            arrives here as ``None``.

    Returns:
        A dict with ``"label"`` (the voice model's ``id2label`` entry for the
        top-scoring class) and ``"confidence"`` (that class's softmax
        probability as a percentage, rounded to two decimals).

    Raises:
        ValueError: If no path is given or the decoded clip has no samples.
    """
    # Reject an empty submission up front instead of failing inside librosa.
    if not audio_path:
        raise ValueError("No audio file was provided.")

    # Decode at the rate the feature extractor declares so the waveform and
    # the ``sampling_rate`` argument can never disagree; 16 kHz is the
    # previous hard-coded value and stays as the fallback.
    sample_rate = getattr(voice_processor, "sampling_rate", None) or 16000
    waveform, _ = librosa.load(audio_path, sr=sample_rate)
    if waveform.size == 0:
        raise ValueError("The audio file contains no samples.")

    inputs = voice_processor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
    )

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = voice_model(**inputs).logits

    predicted_class = logits.argmax(-1).item()
    confidence = torch.softmax(logits, dim=-1)[0][predicted_class].item()

    return {
        "label": voice_model.config.id2label[predicted_class],
        "confidence": round(confidence * 100, 2),
    }
# =========================
# IMAGE PREDICT
# =========================
def predict(image):
    """Classify a single image as "real" or "artificial".

    The image is written to a temporary JPEG, scored as a whole with the main
    model, and every face returned by ``detect_face`` is scored with the face
    model. The verdict is "artificial" as soon as the full image or any face
    is labelled anything other than "real".

    Args:
        image: Pixel data accepted by ``Image.fromarray`` (supplied by the
            Gradio image component); ``None`` when nothing was uploaded.

    Returns:
        A dict with ``"label"``, ``"final_score"`` (mean of the full-image
        confidence and the average face confidence), ``"full_image_score"``,
        ``"face_score"`` (falls back to the full-image confidence when no
        face was found) and ``"faces_detected"``.

    Raises:
        ValueError: If ``image`` is ``None``.
    """
    if image is None:
        raise ValueError("No image was provided.")

    import tempfile  # local import: only this function needs it

    # A unique temp file instead of a fixed name in the working directory,
    # so overlapping requests cannot overwrite each other's input image.
    # NOTE(review): the "faces" folder is still shared between requests
    # (and with predict_video) — presumably detect_face writes there; confirm.
    fd, temp_path = tempfile.mkstemp(suffix=".jpg")
    os.close(fd)

    try:
        Image.fromarray(image).save(temp_path)

        # ----------------------
        # Analyze the full image
        # ----------------------
        with Image.open(temp_path) as full_image:
            full_result = predict_with_model(full_image, model, processor)

        # ----------------------
        # detect faces
        # ----------------------
        os.makedirs("faces", exist_ok=True)
        faces = detect_face(temp_path)

        face_scores = []
        fake_face_found = False
        for face_path in faces:
            with Image.open(face_path) as face_image:
                face_result = predict_with_model(
                    face_image,
                    face_model,
                    face_processor,
                )
            face_scores.append(face_result["confidence"])
            if face_result["label"] != "real":
                fake_face_found = True

        # ----------------------
        # combine score
        # ----------------------
        # NOTE(review): each confidence belongs to that prediction's own top
        # label, so this average mixes "real" and non-"real" confidences when
        # the predictions disagree — confirm that is intended.
        full_score = full_result["confidence"]
        avg_face_score = (
            sum(face_scores) / len(face_scores) if face_scores else full_score
        )
        final_score = (full_score + avg_face_score) / 2

        is_artificial = full_result["label"] != "real" or fake_face_found

        return {
            "label": "artificial" if is_artificial else "real",
            "final_score": round(final_score, 2),
            "full_image_score": round(full_score, 2),
            "face_score": round(avg_face_score, 2),
            "faces_detected": len(faces),
        }
    finally:
        # Runs on success and on failure, so a crashed request no longer
        # leaves the temp image or face crops behind.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        if os.path.exists("faces"):
            shutil.rmtree("faces")
# =========================
# VIDEO PREDICT
# =========================
def predict_video(video_path):
    """Classify a video as "real" or "artificial" from its extracted frames.

    Frames are written to the "frames" folder by ``extract_frames``; each
    frame is scored as a whole with the main model and each face returned by
    ``detect_face`` is scored with the face model. A frame counts as fake
    when the full frame or any of its faces is labelled anything other than
    "real"; the video is "artificial" when more than 30% of frames are fake.

    Args:
        video_path: Path of the video file (supplied by the Gradio video
            component); ``None`` when nothing was uploaded.

    Returns:
        A dict with ``"label"``, ``"final_score"`` (mean of the average
        full-frame confidence and the average face confidence),
        ``"fake_frames"``, ``"total_frames"``, ``"full_image_score"`` and
        ``"face_score"`` (falls back to the full-frame average when no face
        was found in any frame).

    Raises:
        ValueError: If no path is given, or if no frames were extracted.
            Previously an empty extraction was reported as "real" with a
            score of 0 although nothing had been analyzed.
    """
    if not video_path:
        raise ValueError("No video file was provided.")

    # NOTE(review): "frames" and "faces" are fixed folders shared by every
    # request, so overlapping requests can interfere — confirm the app is
    # only ever served with one request at a time.
    try:
        # Start from empty working folders so leftovers from an earlier run
        # are never counted as frames of this video.
        for folder in ("frames", "faces"):
            if os.path.exists(folder):
                shutil.rmtree(folder)
        for folder in ("frames", "faces"):
            os.makedirs(folder, exist_ok=True)

        # extract frames
        extract_frames(video_path, "frames")
        frame_files = os.listdir("frames")
        if not frame_files:
            raise ValueError("No frames could be extracted from the video.")

        fake_frames = 0
        total_frames = 0
        full_scores = []
        face_scores = []

        for frame in frame_files:
            frame_path = os.path.join("frames", frame)

            # ----------------------
            # Analyze the full frame
            # ----------------------
            with Image.open(frame_path) as frame_image:
                full_result = predict_with_model(frame_image, model, processor)
            full_scores.append(full_result["confidence"])

            # ----------------------
            # detect faces
            # ----------------------
            face_fake_found = False
            for face_path in detect_face(frame_path):
                with Image.open(face_path) as face_image:
                    face_result = predict_with_model(
                        face_image,
                        face_model,
                        face_processor,
                    )
                face_scores.append(face_result["confidence"])
                if face_result["label"] != "real":
                    face_fake_found = True

            # ----------------------
            # final frame decision
            # ----------------------
            if full_result["label"] != "real" or face_fake_found:
                fake_frames += 1
            total_frames += 1

        # ----------------------
        # final score
        # ----------------------
        avg_full = sum(full_scores) / len(full_scores)
        avg_face = (
            sum(face_scores) / len(face_scores) if face_scores else avg_full
        )
        final_score = (avg_full + avg_face) / 2

        # More than 30% fake frames marks the whole video as artificial.
        final_label = (
            "artificial" if fake_frames > total_frames * 0.3 else "real"
        )

        return {
            "label": final_label,
            "final_score": round(final_score, 2),
            "fake_frames": fake_frames,
            "total_frames": total_frames,
            "full_image_score": round(avg_full, 2),
            "face_score": round(avg_face, 2),
        }
    finally:
        # Runs on success and on failure, so a crashed request no longer
        # leaves extracted frames or face crops behind.
        for folder in ("frames", "faces"):
            if os.path.exists(folder):
                shutil.rmtree(folder)
# =========================
# UI
# =========================
# One Interface per media type; every tab renders its result dict as JSON.
image_ui = gr.Interface(
    predict,
    gr.Image(),
    gr.JSON(),
    title="Image Deepfake Detector",
)

video_ui = gr.Interface(
    predict_video,
    gr.Video(),
    gr.JSON(),
    title="Video Deepfake Detector",
)

# The audio component passes a file path, which is what predict_audio loads.
audio_ui = gr.Interface(
    predict_audio,
    gr.Audio(type="filepath"),
    gr.JSON(),
    title="Voice Deepfake Detector",
)

# Combine the three interfaces into a single tabbed app and start serving.
demo = gr.TabbedInterface(
    interface_list=[image_ui, video_ui, audio_ui],
    tab_names=["Image", "Video", "Audio"],
)
demo.launch()