# Pafkun333's picture
# Committing first one
# aeaf3f3
import gradio as gr
import torch
from torchvision import transforms
from PIL import Image
from gtts import gTTS
import os
import uuid
import random
import time
from model import load_face_classifier_model # Import the model loading function
# Validation-time preprocessing: must mirror the pipeline used during
# training so inference sees identically sized and normalized inputs.
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]

val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
])

# Load the trained 5-class face classifier from its checkpoint (model.py).
model = load_face_classifier_model(model_path='model_2.pth', num_classes=5)
def cleanup_audio_files(directory=".", prefix="prediction_", max_age_seconds=30):
    """Delete stale generated speech files.

    Removes every ``<prefix>*.mp3`` in *directory* whose modification time
    is more than *max_age_seconds* seconds in the past, so the working
    directory does not accumulate one MP3 per prediction.

    Args:
        directory (str): Directory to scan for audio files.
        prefix (str): Filename prefix marking files as ours to delete.
        max_age_seconds (int | float): Minimum age before a file is removed.
    """
    now = time.time()
    for filename in os.listdir(directory):
        if filename.startswith(prefix) and filename.endswith(".mp3"):
            filepath = os.path.join(directory, filename)
            file_age = now - os.path.getmtime(filepath)
            if file_age > max_age_seconds:
                try:
                    os.remove(filepath)
                # Narrowed from `Exception`: os.remove raises OSError.
                except OSError as e:
                    # Bug fix: the message previously printed the literal
                    # "(unknown)" instead of the offending path.
                    print(f"Error deleting {filepath}: {e}")
def classify_face_with_audio_new(image: Image.Image):
    """Classify a face image and synthesize a spoken catchphrase.

    Runs the globally loaded classifier on *image*, picks a random
    Bulgarian phrase associated with the predicted person, and renders
    it to an MP3 with gTTS.

    Args:
        image (PIL.Image.Image): The input image (upload or webcam
            capture), or None when capture failed.

    Returns:
        tuple: (predicted class name or error message (str),
        path to the generated audio file (str) or None on error).
    """
    # Per-class phrase pools; one entry is chosen at random per prediction.
    # Replaces the former five-branch if/elif chain with a single lookup.
    phrases = {
        "byjd": ["Не ме гледай! Дай ми пауч!", "Писи Писи, Мяу Мяу", "просто мяу",
                 "мррррррррррр"],
        "bleyla": ["Плешкиииииитуууууууууууу", "Дай ми цун!", "Отивам при Вес Божа",
                   "А къде е прасетуу ?"],
        "jenny": ["Офффф гладна съм!", "Здравейте, аз съм в овулация.", "Да пием кафе на 43.12 и да ядем шницел!",
                  "Офф бе Павееел!", "Обичам Дони Донсъна."],
        "sachu": ["Мишо, ще ти счупя носа!", "Засъхнало аку на дупи на кучии.", "Чекии ли си правиш бе, педалче малко?",
                  "Обичам пръцкото на Сога!"],
        "falafel": ["Дааарлинг, къде са ми чорапите?", "Маняк, измий си краката.", "Молим те, изкъпи се!",
                    "Обичам пръцкото на Жени!"],
    }
    if image is None:
        return "Error: Could not capture image from webcam. Please try again.", None
    # Preprocess exactly as at validation time, then add a batch dimension.
    image = image.convert("RGB")
    batch = val_transform(image.convert("RGB")).unsqueeze(0)
    # Run on GPU when available; the model is moved alongside the input.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    batch = batch.to(device)
    model.to(device)
    with torch.no_grad():
        outputs = model(batch)
    # Deprecated `.data` access removed; torch.max on the tensor directly.
    _, predicted_idx = torch.max(outputs, 1)
    # Must match the (alphabetical) label order used during training.
    class_names = ['bleyla', 'byjd', 'falafel', 'jenny', 'sachu']
    predicted_class = class_names[predicted_idx.item()]
    # Unknown classes fall back to a single-item pool, preserving the
    # original else-branch text.
    text_to_speak = random.choice(phrases.get(predicted_class, ["Unknown class"]))
    tts = gTTS(text=text_to_speak, lang='bg')
    # Unique filename so concurrent requests never clobber each other.
    audio_file = f"prediction_{uuid.uuid4()}.mp3"
    tts.save(audio_file)
    # Prune MP3s older than 30s; the file just written is too fresh to be hit.
    cleanup_audio_files()
    return predicted_class, audio_file
# Gradio UI wiring: a single image input (upload or camera) mapped to two
# outputs — the predicted label and the synthesized catchphrase audio.
# Example images are expected under ./examples; each example is a one-item
# list because the interface has one input component.
_EXAMPLE_IMAGES = [
    ["examples/bleyla_new.jpg"],
    ["examples/byjd_new.jpg"],
    ["examples/falafelcho.jpg"],
]

interface = gr.Interface(
    fn=classify_face_with_audio_new,
    inputs=gr.Image(type="pil", label="Upload an image or use your camera"),
    outputs=[
        gr.Textbox(label="Predicted Class"),
        gr.Audio(label="Audio Pronunciation"),
    ],
    title="Russian Monument Classifier",
    description="Upload an image or use your camera to classify Russian Monument Citizens.",
    examples=_EXAMPLE_IMAGES,
)

# Start the web server only when run as a script, not on import.
if __name__ == "__main__":
    interface.launch()