# NOTE(review): Hugging Face "Spaces: Sleeping" status banner captured by the
# scrape — commented out so the file remains valid Python.
| import gradio as gr | |
| import torch | |
| from torchvision import transforms | |
| from PIL import Image | |
| from gtts import gTTS | |
| import os | |
| import uuid | |
| import random | |
| import time | |
| from model import load_face_classifier_model # Import the model loading function | |
# Validation-time preprocessing — must mirror the pipeline used during
# training so inference sees identically prepared inputs.
_VAL_PIPELINE = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    # ImageNet channel statistics, the standard choice for pretrained backbones.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
val_transform = transforms.Compose(_VAL_PIPELINE)

# Restore the trained 5-class face classifier from its checkpoint (model.py).
model = load_face_classifier_model(model_path='model_2.pth', num_classes=5)
def cleanup_audio_files(directory=".", prefix="prediction_", max_age_seconds=30):
    """Delete stale generated MP3 files from *directory* (best effort).

    Removes every file whose name starts with *prefix* and ends with
    ``.mp3`` and whose modification time is more than *max_age_seconds*
    seconds in the past. A file that cannot be removed is logged and
    skipped rather than aborting the sweep.

    Args:
        directory (str): Directory to scan. Defaults to the current
            working directory, where the app writes its audio files.
        prefix (str): Filename prefix identifying generated audio files.
        max_age_seconds (int | float): Minimum age, in seconds, before a
            file is considered stale.
    """
    now = time.time()
    for filename in os.listdir(directory):
        # Only touch files this app generated; leave everything else alone.
        if not (filename.startswith(prefix) and filename.endswith(".mp3")):
            continue
        filepath = os.path.join(directory, filename)
        if now - os.path.getmtime(filepath) <= max_age_seconds:
            continue
        try:
            os.remove(filepath)
        except OSError as e:
            # Fix: the original message omitted the filename ("(unknown)"),
            # making failures impossible to diagnose. os.remove raises
            # OSError, so catch exactly that.
            print(f"Error deleting {filepath}: {e}")
def classify_face_with_audio_new(image: Image.Image):
    """Classify a face image and synthesize a spoken phrase for the result.

    Runs the module-level ``model`` over the (preprocessed) input image,
    picks a random Bulgarian phrase associated with the predicted person,
    renders it to speech with gTTS, and sweeps old audio files.

    Args:
        image (PIL.Image.Image | None): The input image, or ``None`` when
            webcam capture failed.

    Returns:
        tuple[str, str | None]: The predicted class name (or an error
        message) and the path to the generated MP3 (``None`` on error).
    """
    # Per-class phrase pools; one entry is chosen at random per prediction.
    phrases = {
        "byjd": ["Не ме гледай! Дай ми пауч!", "Писи Писи, Мяу Мяу", "просто мяу",
                 "мррррррррррр"],
        "bleyla": ["Плешкиииииитуууууууууууу", "Дай ми цун!", "Отивам при Вес Божа",
                   "А къде е прасетуу ?"],
        "jenny": ["Офффф гладна съм!", "Здравейте, аз съм в овулация.", "Да пием кафе на 43.12 и да ядем шницел!",
                  "Офф бе Павееел!", "Обичам Дони Донсъна."],
        "sachu": ["Мишо, ще ти счупя носа!", "Засъхнало аку на дупи на кучии.", "Чекии ли си правиш бе, педалче малко?",
                  "Обичам пръцкото на Сога!"],
        "falafel": ["Дааарлинг, къде са ми чорапите?", "Маняк, измий си краката.", "Молим те, изкъпи се!",
                    "Обичам пръцкото на Жени!"],
    }

    if image is None:
        return "Error: Could not capture image from webcam. Please try again.", None

    # Preprocess: force RGB, apply the training-time transform, add a batch dim.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    batch = val_transform(image.convert("RGB")).unsqueeze(0).to(device)
    model.to(device)
    # Idempotent safety net: ensure inference mode (disables dropout/BN
    # updates) in case the loader did not — TODO confirm against model.py.
    model.eval()

    with torch.no_grad():
        outputs = model(batch)

    # argmax over the class dimension replaces the legacy
    # `torch.max(outputs.data, 1)` idiom with identical results.
    predicted_idx = torch.argmax(outputs, dim=1).item()

    # Index order must match the training dataset's class ordering.
    class_names = ['bleyla', 'byjd', 'falafel', 'jenny', 'sachu']
    predicted_class = class_names[predicted_idx]

    # Dict dispatch replaces the original if/elif chain; the fallback covers
    # any class name missing from the phrase table.
    text_to_speak = random.choice(phrases.get(predicted_class, ["Unknown class"]))

    # Synthesize Bulgarian speech to a uniquely named file, then sweep stale ones.
    tts = gTTS(text=text_to_speak, lang='bg')
    audio_file = f"prediction_{uuid.uuid4()}.mp3"
    tts.save(audio_file)
    cleanup_audio_files()

    return predicted_class, audio_file
# Assemble the Gradio UI: a single image input mapped to a text label plus
# the synthesized audio clip.
_output_components = [
    gr.Textbox(label="Predicted Class"),
    gr.Audio(label="Audio Pronunciation"),
]
# Examples must be a list of lists (one inner list per input component).
_example_images = [
    ["examples/bleyla_new.jpg"],
    ["examples/byjd_new.jpg"],
    ["examples/falafelcho.jpg"],
]
interface = gr.Interface(
    fn=classify_face_with_audio_new,
    inputs=gr.Image(type="pil", label="Upload an image or use your camera"),
    outputs=_output_components,
    title="Russian Monument Classifier",
    description="Upload an image or use your camera to classify Russian Monument Citizens.",
    examples=_example_images,
)

# Start the local web server only when executed as a script.
if __name__ == "__main__":
    interface.launch()