Spaces:
Sleeping
Sleeping
| """ | |
| Cat Translator - Maximum Version | |
| - 2982 training samples (1517 original files) | |
| - Enhanced large model (1.75M parameters) | |
| - Trained on maximum available data from 2024-2025 | |
| """ | |
| import gradio as gr | |
| import tensorflow as tf | |
| import tensorflow_hub as hub | |
| import numpy as np | |
| import librosa | |
| import json | |
| import os | |
# ---------------------------------------------------------------------------
# Configuration: load model metadata from disk, falling back to baked-in
# defaults so the app still starts on a fresh deployment without the JSON.
# NOTE(review): training_samples here (2534) disagrees with the module
# docstring / UI copy (2982) — confirm which number is current.
# ---------------------------------------------------------------------------
_MODEL_INFO_PATH = 'models/model_info_maximum.json'
_DEFAULT_MODEL_INFO = {
    "emotion_labels": ["Hungry", "Happy", "Angry", "Greeting", "Hunting", "Anxious",
                       "Urgent", "Lonely", "Surprised", "Sleepy", "Alert", "Mating",
                       "Calling", "Content", "Annoyed", "Other"],
    "num_classes": 16,
    "test_accuracy": 1.0,
    "num_parameters": 1747856,
    "training_samples": 2534,
    "source_files": 1517,
}
try:
    with open(_MODEL_INFO_PATH, 'r', encoding='utf-8') as f:
        model_info = json.load(f)
except FileNotFoundError:
    # Fallback for deployment
    model_info = _DEFAULT_MODEL_INFO
# English labels from model
# Index position must line up with the classifier's output units.
EMOTION_LABELS_EN = model_info['emotion_labels']
# Korean translations
# Display strings (Korean + emoji) keyed by the English label; callers fall
# back to the English name via .get() when a label is missing here.
# NOTE(review): the values below look mojibake-damaged (double-encoded
# Korean/emoji) — confirm the file's encoding before editing these strings.
EMOTION_LABELS_KR = {
    "Hungry": "๋ฐฐ๊ณ ํ์ ๐ฝ๏ธ",
    "Happy": "ํ๋ณตํด์ ๐",
    "Angry": "ํ๋ฌ์ด์ ๐ ",
    "Greeting": "์ธ์ฌํด์ ๐",
    "Hunting": "์ฌ๋ฅ์ค ๐ฏ",
    "Anxious": "๋ถ์ํด์ ๐ฐ",
    "Urgent": "๊ธํด์ โก",
    "Lonely": "์ธ๋ก์์ ๐ข",
    "Surprised": "๋๋์ด์ ๐ฒ",
    "Sleepy": "์กธ๋ ค์ ๐ด",
    "Alert": "๊ฒฝ๊ณ์ค ๐",
    "Mating": "์ง์ง๊ธฐ ๐",
    "Calling": "๋ถ๋ฅด๊ณ ์์ด์ ๐ฃ",
    "Content": "๋ง์กฑํด์ ๐",
    "Annoyed": "์ง์ฆ๋์ ๐ค",
    "Other": "๊ธฐํ ๐ค"
}
# Number of output classes the classifier head produces.
NUM_CLASSES = model_info['num_classes']
# Target sample rate (Hz) used when loading audio with librosa.
SAMPLE_RATE = 16000
# Top-class probabilities below this are reported as "probably not a cat".
CONFIDENCE_THRESHOLD = 0.3
# Load models
print("[>] Loading YAMNet...")
# Pretrained audio model from TF Hub, used here as a frozen feature
# extractor: only its per-frame embeddings are consumed downstream.
# Note: this downloads the model on first run (network side effect).
yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')
print("[OK] YAMNet loaded")
# Rebuild classifier with maximum architecture
def build_classifier():
    """Recreate the maximum-capacity MLP head trained on YAMNet embeddings.

    Architecture: 1024-dim input -> Dense 1024/512/256/128 (ReLU) with
    decreasing dropout (BatchNorm on the first three blocks) -> softmax
    over NUM_CLASSES. Must match the training-time graph exactly so the
    saved weights can be copied in.

    Returns:
        tf.keras.Sequential: the uncompiled classifier model.
    """
    # (units, dropout_rate, use_batchnorm) per hidden block.
    hidden_spec = [
        (1024, 0.5, True),
        (512, 0.4, True),
        (256, 0.3, True),
        (128, 0.2, False),
    ]
    layers = [tf.keras.layers.InputLayer(input_shape=(1024,))]
    for units, drop_rate, use_bn in hidden_spec:
        layers.append(tf.keras.layers.Dense(units, activation='relu'))
        if use_bn:
            layers.append(tf.keras.layers.BatchNormalization())
        layers.append(tf.keras.layers.Dropout(drop_rate))
    layers.append(tf.keras.layers.Dense(NUM_CLASSES, activation='softmax'))
    return tf.keras.Sequential(layers)
print("[>] Loading cat emotion classifier...")
# Rebuild the architecture in code, then copy weights from the saved model —
# presumably to sidestep full-model deserialization issues; confirm this is
# still needed with the current Keras version.
classifier = build_classifier()
try:
    # compile=False: only the weights are needed, not optimizer/loss state.
    saved_model = tf.keras.models.load_model('models/cat_classifier_maximum.keras', compile=False)
    classifier.set_weights(saved_model.get_weights())
    print("[OK] Model weights loaded")
except Exception as e:
    # Best-effort: the app still starts (with random weights) so the UI can
    # be demoed; predictions are meaningless until weights load.
    print(f"[!] Warning: Could not load weights: {e}")
print(f"[OK] All models ready ({NUM_CLASSES} emotion classes)")
# Inference functions
def extract_features(audio_path):
    """Extract a clip-level YAMNet embedding from an audio file.

    The clip is loaded as SAMPLE_RATE mono, rejected if shorter than 0.5 s,
    truncated to the first 3 s, run through YAMNet, and the per-frame
    embeddings are mean-pooled into a single 1024-dim vector.

    Args:
        audio_path: Path to the audio file to analyze.

    Returns:
        tuple: (embedding ndarray, None) on success, or (None, message)
        when the clip is unusable or processing fails.
    """
    try:
        waveform, _rate = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
        if len(waveform) < SAMPLE_RATE * 0.5:
            return None, "Audio too short (min 0.5 seconds)"
        # Keep at most the first 3 seconds to bound inference cost
        # (slicing past the end is a no-op for shorter clips).
        waveform = waveform[:int(SAMPLE_RATE * 3.0)]
        wav_tensor = tf.convert_to_tensor(waveform, dtype=tf.float32)
        _scores, embeddings, _spectrogram = yamnet_model(wav_tensor)
        # Mean-pool frame embeddings into one clip-level feature vector.
        return tf.reduce_mean(embeddings, axis=0).numpy(), None
    except Exception as e:
        return None, f"Error processing audio: {str(e)}"
def predict_emotion(audio_path):
    """Classify a cat vocalization and format a Korean result report.

    Args:
        audio_path: Filesystem path from the Gradio audio widget
            (type="filepath"), or None when nothing was provided.

    Returns:
        str: A formatted report — a prompt when input is missing, an error
        message, a low-confidence notice, or the top-3 emotions with
        probability bars plus model metadata.
    """
    if audio_path is None:
        return "๋จผ์ ์ค๋์ค๋ฅผ ๋ น์ํ๊ฑฐ๋ ์ ๋ก๋ํด์ฃผ์ธ์"

    features, error = extract_features(audio_path)
    if error:
        return f"์ค๋ฅ: {error}"

    probs = classifier.predict(np.expand_dims(features, axis=0), verbose=0)[0]
    # Get top prediction
    top_confidence = probs[np.argmax(probs)]

    # Report header.
    report = [
        "="*50 + "\n",
        " ๐ฑ ๊ณ ์์ด ๊ฐ์ ๋ถ์ ๊ฒฐ๊ณผ\n",
        "="*50 + "\n\n",
    ]

    # Below the threshold, assume the clip is not a (clear) cat sound.
    if top_confidence < CONFIDENCE_THRESHOLD:
        report += [
            "[!] ๋ฎ์ ์ ๋ขฐ๋ ๊ฐ์ง\n\n",
            "์ด๊ฒ์ ๊ณ ์์ด ์๋ฆฌ๊ฐ ์๋๊ฑฐ๋, ์ค๋์ค ํ์ง์ด\n",
            "์ ํํ ๋ถ๋ฅ๋ฅผ ํ๊ธฐ์ ๋๋ฌด ๋ฎ์ ์ ์์ต๋๋ค.\n\n",
            f"์ ๋ขฐ๋: {top_confidence*100:.1f}%\n",
            f"์๊ณ๊ฐ: {CONFIDENCE_THRESHOLD*100:.1f}%\n\n",
            "์ ์: ๋ ๋ช ํํ ๊ณ ์์ด ์๋ฆฌ๋ฅผ ๋ น์ํด๋ณด์ธ์.\n",
        ]
        return "".join(report)

    # Show top 3 predictions, highest probability first.
    ranked = np.argsort(probs)[-3:][::-1]
    report.append("์์ 3๊ฐ ๊ฐ์ :\n")
    report.append("-"*50 + "\n\n")
    for rank, idx in enumerate(ranked):
        label_en = EMOTION_LABELS_EN[idx]
        label_kr = EMOTION_LABELS_KR.get(label_en, label_en)
        pct = probs[idx] * 100
        bar = "โ" * int(pct / 5)  # one bar segment per 5 percentage points
        report.append(f"{rank+1}. {label_kr:20s} {pct:5.1f}%\n")
        report.append(f" {bar}\n\n")
    report.append("-"*50 + "\n")

    best_en = EMOTION_LABELS_EN[ranked[0]]
    best_kr = EMOTION_LABELS_KR.get(best_en, best_en)
    report.append(f"\n๊ฐ์ฅ ๊ฐ๋ฅ์ฑ ๋์ ๊ฐ์ : {best_kr}\n")
    report.append(f"์ ๋ขฐ๋: {probs[ranked[0]]*100:.1f}%\n\n")

    # Footer: model provenance / metadata.
    report.append("="*50 + "\n")
    report.append("๋ชจ๋ธ: ์ต๋ ์ฑ๋ฅ ๋ฒ์ (1.75M ํ๋ผ๋ฏธํฐ)\n")
    report.append(f"ํ์ต ๋ฐ์ดํฐ: {model_info['source_files']}๊ฐ ์๋ณธ ํ์ผ\n")
    report.append(f"์ด ์ํ: {model_info['training_samples']}๊ฐ (์ฆ๊ฐ ํฌํจ)\n")
    report.append(f"ํ ์คํธ ์ ํ๋: {model_info['test_accuracy']*100:.2f}%\n")
    return "".join(report)
# Gradio Interface
# Static UI copy (Korean, rendered as Markdown by Gradio).
# NOTE(review): the stats quoted below (2982 samples, 100% accuracy) are
# hard-coded and disagree with the fallback model_info (2534 samples) —
# confirm which figures are current before shipping.
title = "๐ฑ ๊ณ ์์ด ๋ฒ์ญ๊ธฐ (์ต๋ ์ฑ๋ฅ ๋ฒ์ )"
# Intro panel shown above the input/output columns.
description = """
์ต๋ ๊ท๋ชจ ํ์ต ๋ฐ์ดํฐ๋ก ํ๋ จ๋ AI ๊ณ ์์ด ๊ฐ์ ๋ถ์๊ธฐ!
**์ฃผ์ ๊ธฐ๋ฅ:**
- 1517๊ฐ ์๋ณธ ๊ณ ์์ด ์๋ฆฌ ํ์ผ๋ก ํ์ต (2024-2025 ์ต์ ๋ฐ์ดํฐ์ )
- ์ด 2982๊ฐ ์ํ (์ฆ๊ฐ ํฌํจ)
- ์ต๊ณ ์ ํ๋๋ฅผ ์ํ 1.75M ํ๋ผ๋ฏธํฐ ๋ํ ๋ชจ๋ธ
- 100% ํ ์คํธ ์ ํ๋
- ์ ๋ขฐ๋ ๊ธฐ๋ฐ ๋น-๊ณ ์์ด ์๋ฆฌ ๊ฐ์ง
- 16๊ฐ์ง ๊ฐ์ ์นดํ ๊ณ ๋ฆฌ
**์ฌ์ฉ ๋ฐฉ๋ฒ:**
1. ๊ณ ์์ด ์๋ฆฌ๋ฅผ ๋ น์ํ๊ฑฐ๋ ์ ๋ก๋ (0.5-3์ด)
2. "๊ฐ์ ๋ถ์ํ๊ธฐ" ๋ฒํผ ํด๋ฆญ
3. ์์ 3๊ฐ ์์ธก ๊ฐ์ ํ์ธ
**์ฐธ๊ณ :** ๋ฎ์ ์ ๋ขฐ๋ ๊ฒฐ๊ณผ๋ ๊ณ ์์ด ์๋ฆฌ๊ฐ ์๋๊ฑฐ๋ ์ค๋์ค ํ์ง์ด ๋ฎ์ ์ ์์์ ๋ํ๋ ๋๋ค.
"""
# Footer panel with model details, shown below the interface.
article = """
### ๋ชจ๋ธ ์์ธ ์ ๋ณด
- **๋ฐ์ดํฐ์ **: CatMeows (Zenodo) + ์ถ๊ฐ 2024-2025 ๋ฐ์ดํฐ์
- **์๋ณธ ํ์ผ**: 1517๊ฐ ๊ณ ์์ด ์ธ์์๋ฆฌ
- **ํ์ต ์ํ**: 2982๊ฐ (2๋ฐฐ ์ฆ๊ฐ ํฌํจ)
- **๊ตฌ์กฐ**: YAMNet + 5๊ณ์ธต ๋ถ๋ฅ๊ธฐ (1024โ512โ256โ128โ16)
- **ํ๋ผ๋ฏธํฐ**: 1,747,856๊ฐ
- **ํ ์คํธ ์ ํ๋**: 100%
- **์นดํ ๊ณ ๋ฆฌ**: ๋ฐฐ๊ณ ํ, ํ๋ณต, ํ๋จ, ์ธ์ฌ, ์ฌ๋ฅ, ๋ถ์, ๊ธด๊ธ, ์ธ๋ก์, ๋๋, ์กธ๋ฆผ, ๊ฒฝ๊ณ, ์ง์ง๊ธฐ, ํธ์ถ, ๋ง์กฑ, ์ง์ฆ, ๊ธฐํ
### ์ด์ ๋ฒ์ ๋๋น ๊ฐ์ ์ฌํญ
- ์๋ณธ ํ์ผ 3.1๋ฐฐ ์ฆ๊ฐ (1517๊ฐ vs 483๊ฐ)
- ํ์ต ์ํ 2.2๋ฐฐ ์ฆ๊ฐ (2982๊ฐ vs 1449๊ฐ)
- ํ๋ผ๋ฏธํฐ 2.5๋ฐฐ ์ฆ๊ฐ (1.75M vs 701K)
- ๋ ํฐ ๋ฐ์ดํฐ์ ์ผ๋ก ํฅ์๋ ์ผ๋ฐํ ์ฑ๋ฅ
### ์ ํ์ฌํญ
- ์ฃผ๋ก ์ง๊ณ ์์ด ์ธ์์๋ฆฌ๋ก ํ์ต๋จ
- ๋ชจ๋ ํ์ข ์ด๋ ์ํฉ์ ์ผ๋ฐํ๋์ง ์์ ์ ์์
- ์ ๋ขฐ๋ ์๊ณ๊ฐ์ผ๋ก ๋น-๊ณ ์์ด ์๋ฆฌ ํํฐ๋ง
### ์๊ฐ
2024-2025๋ ์ต๋ ๊ท๋ชจ ๊ณ ์์ด ์๋ฆฌ ๋ฐ์ดํฐ์ ์ผ๋ก ํ๋ จ๋ ์ต๊ณ ์ฑ๋ฅ ๋ฒ์ ์ ๋๋ค.
์ต์์ ๊ฒฐ๊ณผ๋ฅผ ์ํด ๊ฐ๋ณ ๊ณ ์์ด ์๋ฆฌ๋ฅผ ๋ช ํํ๊ฒ ๋ น์ํด์ฃผ์ธ์.
"""
# Create Gradio Blocks interface
# Layout: title + description on top, input column (audio + button) beside
# an output column (text report), article footer below.
with gr.Blocks(title=title, theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            # type="filepath" hands predict_emotion a path on disk rather
            # than raw samples.
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="๐ค ๊ณ ์์ด ์๋ฆฌ ๋ น์ ๋๋ ์ ๋ก๋"
            )
            predict_btn = gr.Button("๐ ๊ฐ์ ๋ถ์ํ๊ธฐ", variant="primary", size="lg")
        with gr.Column():
            # Plain-text report produced by predict_emotion.
            output_text = gr.Textbox(
                label="๐ ๊ฐ์ ๋ถ์ ๊ฒฐ๊ณผ",
                lines=25,
                max_lines=30
            )
    # Wire the button to the inference function.
    predict_btn.click(
        fn=predict_emotion,
        inputs=audio_input,
        outputs=output_text
    )
    gr.Markdown(article)

if __name__ == "__main__":
    demo.launch()