"""Gradio app that classifies images, videos and audio clips as real or fake.

Three Hugging Face classifiers are loaded once at import time:

* a whole-image classifier,
* a classifier that is applied to face crops, and
* an audio classifier.

Image and video requests write intermediate files to fixed scratch paths in
the current working directory, so those requests are serialised with a lock.
"""
import os
import shutil
import threading

import gradio as gr
import librosa
import torch
from PIL import Image
from transformers import AutoFeatureExtractor
from transformers import AutoImageProcessor
from transformers import AutoModelForAudioClassification
from transformers import AutoModelForImageClassification

from detect_face import detect_face
from extract_frames import extract_frames

# Label string that the models are expected to emit for authentic content.
REAL_LABEL = "real"

# Scratch locations shared by every request (relative to the working dir).
TEMP_IMAGE_PATH = "temp_image.jpg"
FRAMES_DIR = "frames"
FACES_DIR = "faces"

# Sample rate the audio is resampled to before it reaches the voice model.
AUDIO_SAMPLE_RATE = 16000

# A video is reported as fake when more than this share of frames is flagged.
FAKE_FRAME_RATIO = 0.3

# The scratch paths above are fixed, so two overlapping requests would delete
# each other's files. Only one image/video analysis may touch them at a time.
_WORKSPACE_LOCK = threading.Lock()

# =========================
# Load the main (whole-image) model
# =========================

model = AutoModelForImageClassification.from_pretrained(
    "Jabrave/deepfake-detector"
)

processor = AutoImageProcessor.from_pretrained(
    "Jabrave/deepfake-detector"
)

# =========================
# Load the face model
# =========================

face_model = AutoModelForImageClassification.from_pretrained(
    "Jabrave/face-detector"
)

face_processor = AutoImageProcessor.from_pretrained(
    "Jabrave/face-detector"
)

# =========================
# Load the voice model
# =========================

voice_model = AutoModelForAudioClassification.from_pretrained(
    "Jabrave/voice-detector"
)

voice_processor = AutoFeatureExtractor.from_pretrained(
    "Jabrave/voice-detector"
)


# =========================
# Shared helpers
# =========================

def _summarize_logits(logits, id2label):
    """Turn single-sample logits into a ``{"label", "confidence"}`` dict.

    ``confidence`` is the softmax probability of the winning class expressed
    as a percentage rounded to two decimals. ``.item()`` requires exactly one
    sample in the batch.
    """
    predicted_class = logits.argmax(-1).item()
    probability = torch.softmax(logits, dim=-1)[0][predicted_class].item()
    return {
        "label": id2label[predicted_class],
        "confidence": round(probability * 100, 2),
    }


def _mean(values, default):
    """Return the arithmetic mean of *values*, or *default* when empty."""
    return sum(values) / len(values) if values else default


def _reset_dirs(*paths):
    """Recreate each directory in *paths* empty, dropping stale contents."""
    for path in paths:
        shutil.rmtree(path, ignore_errors=True)
        os.makedirs(path, exist_ok=True)


def _analyze_image_file(image_path):
    """Classify one image file as a whole and face by face.

    Must be called with ``_WORKSPACE_LOCK`` held and ``FACES_DIR`` present.

    Returns:
        A tuple ``(full_result, face_confidences, fake_face_found)`` where
        ``full_result`` is the whole-image prediction dict,
        ``face_confidences`` holds one confidence per detected face and
        ``fake_face_found`` is True when any face label differs from
        ``REAL_LABEL``.
    """
    # Close the file handle as soon as the pixels are decoded; converting to
    # RGB also normalises grayscale / palette images for the processor.
    with Image.open(image_path) as full_image:
        full_result = predict_with_model(
            full_image.convert("RGB"), model, processor
        )

    face_confidences = []
    fake_face_found = False

    # detect_face returns paths to face crops; presumably it writes them into
    # FACES_DIR -- confirm against detect_face.py.
    for face_path in detect_face(image_path):
        with Image.open(face_path) as face_image:
            face_result = predict_with_model(
                face_image.convert("RGB"), face_model, face_processor
            )
        face_confidences.append(face_result["confidence"])
        if face_result["label"] != REAL_LABEL:
            fake_face_found = True

    return full_result, face_confidences, fake_face_found


# =========================
# Model prediction functions
# =========================

def predict_with_model(image, model, processor):
    """Run one image through an image classifier.

    Args:
        image: A single image in any form ``processor`` accepts as ``images``.
        model: Classifier exposing ``.logits`` on its output and
            ``config.id2label``.
        processor: Callable that builds the model inputs from the image.

    Returns:
        ``{"label": <predicted label>, "confidence": <percentage>}``.
    """
    inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    return _summarize_logits(outputs.logits, model.config.id2label)


def predict_audio(audio_path):
    """Classify an audio file with the voice model.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        ``{"label": <predicted label>, "confidence": <percentage>}``.

    Raises:
        gr.Error: If no audio file was supplied.
    """
    if not audio_path:
        raise gr.Error("Please provide an audio file to analyze.")

    # librosa resamples to the requested rate while loading.
    waveform, _ = librosa.load(audio_path, sr=AUDIO_SAMPLE_RATE)

    inputs = voice_processor(
        waveform,
        sampling_rate=AUDIO_SAMPLE_RATE,
        return_tensors="pt",
    )

    with torch.no_grad():
        outputs = voice_model(**inputs)

    return _summarize_logits(outputs.logits, voice_model.config.id2label)


# =========================
# IMAGE PREDICT
# =========================

def predict(image):
    """Classify an uploaded image, combining whole-image and face results.

    Args:
        image: Image as an array accepted by ``PIL.Image.fromarray``.

    Returns:
        A dict with ``label`` ("artificial" or "real"), ``final_score``,
        ``full_image_score``, ``face_score`` and ``faces_detected``.

    Raises:
        gr.Error: If no image was supplied.
    """
    if image is None:
        raise gr.Error("Please provide an image to analyze.")

    with _WORKSPACE_LOCK:
        try:
            # JPEG cannot store an alpha channel, so force RGB before saving.
            # The file on disk is needed because detect_face takes a path.
            Image.fromarray(image).convert("RGB").save(TEMP_IMAGE_PATH)
            _reset_dirs(FACES_DIR)
            full_result, face_scores, fake_face_found = _analyze_image_file(
                TEMP_IMAGE_PATH
            )
        finally:
            # Clean up even when analysis fails so nothing leaks on disk.
            if os.path.exists(TEMP_IMAGE_PATH):
                os.remove(TEMP_IMAGE_PATH)
            shutil.rmtree(FACES_DIR, ignore_errors=True)

    # NOTE(review): "confidence" is the probability of whichever class won,
    # so these averages mix confidence-in-real with confidence-in-fake.
    # Kept as-is to preserve the reported numbers.
    full_score = full_result["confidence"]
    # Without any face the whole-image score stands in for the face score.
    avg_face_score = _mean(face_scores, default=full_score)
    final_score = (full_score + avg_face_score) / 2

    is_fake = full_result["label"] != REAL_LABEL or fake_face_found

    return {
        "label": "artificial" if is_fake else "real",
        "final_score": round(final_score, 2),
        "full_image_score": round(full_score, 2),
        "face_score": round(avg_face_score, 2),
        "faces_detected": len(face_scores),
    }


# =========================
# VIDEO PREDICT
# =========================

def predict_video(video_path):
    """Classify a video by analysing its extracted frames one by one.

    A frame counts as fake when the whole-image label or any face label
    differs from ``REAL_LABEL``. The video is "artificial" when more than
    ``FAKE_FRAME_RATIO`` of the frames are fake.

    Args:
        video_path: Path of the video file to analyse.

    Returns:
        A dict with ``label``, ``final_score``, ``fake_frames``,
        ``total_frames``, ``full_image_score`` and ``face_score``.

    Raises:
        gr.Error: If no video was supplied or no frame could be extracted.
    """
    if not video_path:
        raise gr.Error("Please provide a video to analyze.")

    full_scores = []
    face_scores = []
    fake_frames = 0

    with _WORKSPACE_LOCK:
        try:
            _reset_dirs(FRAMES_DIR, FACES_DIR)

            extract_frames(video_path, FRAMES_DIR)

            # Sorted so frames are processed in a reproducible order.
            for frame_name in sorted(os.listdir(FRAMES_DIR)):
                frame_path = os.path.join(FRAMES_DIR, frame_name)
                full_result, frame_face_scores, fake_face_found = (
                    _analyze_image_file(frame_path)
                )

                full_scores.append(full_result["confidence"])
                face_scores.extend(frame_face_scores)

                if full_result["label"] != REAL_LABEL or fake_face_found:
                    fake_frames += 1
        finally:
            # Clean up even when analysis fails so nothing leaks on disk.
            shutil.rmtree(FRAMES_DIR, ignore_errors=True)
            shutil.rmtree(FACES_DIR, ignore_errors=True)

    total_frames = len(full_scores)
    if total_frames == 0:
        # Without frames there is no evidence; do not report "real".
        raise gr.Error("No frames could be extracted from the video.")

    # NOTE(review): see predict() -- these averages mix confidences of
    # different predicted classes. Kept as-is to preserve reported numbers.
    avg_full = _mean(full_scores, default=0)
    # Without any face the whole-image average stands in for the face score.
    avg_face = _mean(face_scores, default=avg_full)
    final_score = (avg_full + avg_face) / 2

    is_fake = fake_frames > total_frames * FAKE_FRAME_RATIO

    return {
        "label": "artificial" if is_fake else "real",
        "final_score": round(final_score, 2),
        "fake_frames": fake_frames,
        "total_frames": total_frames,
        "full_image_score": round(avg_full, 2),
        "face_score": round(avg_face, 2),
    }


# =========================
# UI
# =========================

image_ui = gr.Interface(
    fn=predict,
    inputs=gr.Image(),
    outputs=gr.JSON(),
    title="Image Deepfake Detector",
)

video_ui = gr.Interface(
    fn=predict_video,
    inputs=gr.Video(),
    outputs=gr.JSON(),
    title="Video Deepfake Detector",
)

audio_ui = gr.Interface(
    fn=predict_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.JSON(),
    title="Voice Deepfake Detector",
)

demo = gr.TabbedInterface(
    [image_ui, video_ui, audio_ui],
    ["Image", "Video", "Audio"],
)

# Start the server only when executed as a script, so the module can be
# imported without blocking on the web server.
if __name__ == "__main__":
    demo.launch()