Spaces:
Sleeping
Sleeping
| """ | |
| Cat Translator - Advanced 2025 Version | |
| - ๊ณ ๊ธ ์ฆ๊ฐ ๊ธฐ๋ฒ ์ ์ฉ (19๊ฐ์ง) | |
| - Mixup ๋ฐ์ดํฐ ์์ฑ | |
| - 5์ธต ์ฌ์ธต ์ํคํ ์ฒ | |
| - 96.7% ํ ์คํธ ์ ํ๋ | |
| - 3๊ฐ์ง ์ปจํ ์คํธ ๋ถ๋ฅ (๋จน์ด, ๋น์ง, ๊ฒฉ๋ฆฌ) | |
| """ | |
| import gradio as gr | |
| import tensorflow as tf | |
| import tensorflow_hub as hub | |
| import numpy as np | |
| import librosa | |
| import json | |
| import os | |
| # Configuration | |
| try: | |
| with open('models/model_info_advanced.json', 'r', encoding='utf-8') as f: | |
| model_info = json.load(f) | |
| except FileNotFoundError: | |
| # Fallback | |
| model_info = { | |
| "num_classes": 3, | |
| "context_labels": {"0": "Food", "1": "Brushing", "2": "Isolation"}, | |
| "context_labels_kr": {"0": "๋จน์ด ๋๊ธฐ ๐ฝ๏ธ", "1": "๋น์ง ๐บ", "2": "๊ฒฉ๋ฆฌ/์ธ๋ก์ ๐ฟ"}, | |
| "test_accuracy": 0.7606, | |
| "num_parameters": 1359747, | |
| "training_samples": 1870, | |
| "test_samples": 330 | |
| } | |
| # Labels | |
| CONTEXT_LABELS_EN = {int(k): v for k, v in model_info['context_labels'].items()} | |
| CONTEXT_LABELS_KR = {int(k): v for k, v in model_info['context_labels_kr'].items()} | |
| NUM_CLASSES = model_info['num_classes'] | |
| SAMPLE_RATE = 16000 | |
| CONFIDENCE_THRESHOLD = 0.3 | |
| # Load models | |
| print("[>] Loading YAMNet...") | |
| yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1') | |
| print("[OK] YAMNet loaded") | |
| # Build Advanced 2025 classifier | |
| def build_classifier(): | |
| model = tf.keras.Sequential([ | |
| tf.keras.layers.InputLayer(input_shape=(1024,)), | |
| # Layer 1: Wider for better feature extraction | |
| tf.keras.layers.Dense(768, activation='relu'), | |
| tf.keras.layers.BatchNormalization(), | |
| tf.keras.layers.Dropout(0.5), | |
| # Layer 2 | |
| tf.keras.layers.Dense(512, activation='relu'), | |
| tf.keras.layers.BatchNormalization(), | |
| tf.keras.layers.Dropout(0.4), | |
| # Layer 3 | |
| tf.keras.layers.Dense(256, activation='relu'), | |
| tf.keras.layers.BatchNormalization(), | |
| tf.keras.layers.Dropout(0.3), | |
| # Layer 4 | |
| tf.keras.layers.Dense(128, activation='relu'), | |
| tf.keras.layers.Dropout(0.2), | |
| # Layer 5 (Advanced architecture) | |
| tf.keras.layers.Dense(64, activation='relu'), | |
| tf.keras.layers.Dropout(0.1), | |
| # Output | |
| tf.keras.layers.Dense(NUM_CLASSES, activation='softmax') | |
| ]) | |
| return model | |
| print("[>] Loading Advanced 2025 cat emotion classifier...") | |
| classifier = build_classifier() | |
| try: | |
| saved_model = tf.keras.models.load_model('models/cat_classifier_advanced.keras', compile=False) | |
| classifier.set_weights(saved_model.get_weights()) | |
| print("[OK] Model weights loaded") | |
| except Exception as e: | |
| print(f"[!] Warning: Could not load weights: {e}") | |
| print(f"[OK] All models ready ({NUM_CLASSES} contexts)") | |
| # Inference functions | |
| def extract_features(audio_path): | |
| """Extract YAMNet features from audio file""" | |
| try: | |
| audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True) | |
| if len(audio) < SAMPLE_RATE * 0.5: | |
| return None, "์ค๋์ค๊ฐ ๋๋ฌด ์งง์ต๋๋ค (์ต์ 0.5์ด ํ์)" | |
| max_samples = int(SAMPLE_RATE * 3.0) | |
| if len(audio) > max_samples: | |
| audio = audio[:max_samples] | |
| audio_tensor = tf.convert_to_tensor(audio, dtype=tf.float32) | |
| scores, embeddings, spectrogram = yamnet_model(audio_tensor) | |
| avg_embedding = tf.reduce_mean(embeddings, axis=0) | |
| return avg_embedding.numpy(), None | |
| except Exception as e: | |
| return None, f"์ค๋์ค ์ฒ๋ฆฌ ์ค๋ฅ: {str(e)}" | |
| def predict_emotion(audio_path): | |
| """Predict cat context with confidence threshold""" | |
| if audio_path is None: | |
| return "๋จผ์ ์ค๋์ค๋ฅผ ๋ น์ํ๊ฑฐ๋ ์ ๋ก๋ํด์ฃผ์ธ์" | |
| features, error = extract_features(audio_path) | |
| if error: | |
| return f"์ค๋ฅ: {error}" | |
| features = np.expand_dims(features, axis=0) | |
| predictions = classifier.predict(features, verbose=0)[0] | |
| # Get top prediction | |
| top_idx = np.argmax(predictions) | |
| top_confidence = predictions[top_idx] | |
| results = [] | |
| results.append("="*50 + "\n") | |
| results.append(" ๐ฑ ๊ณ ์์ด ๊ฐ์ ๋ถ์ ๊ฒฐ๊ณผ (Advanced 2025)\n") | |
| results.append("="*50 + "\n\n") | |
| # Confidence check | |
| if top_confidence < CONFIDENCE_THRESHOLD: | |
| results.append("[!] ๋ฎ์ ์ ๋ขฐ๋ ๊ฐ์ง\n\n") | |
| results.append("์ด๊ฒ์ ๊ณ ์์ด ์๋ฆฌ๊ฐ ์๋๊ฑฐ๋, ์ค๋์ค ํ์ง์ด\n") | |
| results.append("์ ํํ ๋ถ๋ฅ๋ฅผ ํ๊ธฐ์ ๋๋ฌด ๋ฎ์ ์ ์์ต๋๋ค.\n\n") | |
| results.append(f"์ ๋ขฐ๋: {top_confidence*100:.1f}%\n") | |
| results.append(f"์๊ณ๊ฐ: {CONFIDENCE_THRESHOLD*100:.1f}%\n\n") | |
| results.append("์ ์: ๋ ๋ช ํํ ๊ณ ์์ด ์๋ฆฌ๋ฅผ ๋ น์ํด๋ณด์ธ์.\n") | |
| return "".join(results) | |
| # Show all predictions | |
| results.append("์ปจํ ์คํธ ๋ถ์:\n") | |
| results.append("-"*50 + "\n\n") | |
| for idx in range(NUM_CLASSES): | |
| context_kr = CONTEXT_LABELS_KR[idx] | |
| prob = predictions[idx] * 100 | |
| bar_length = int(prob / 3) | |
| bar = "โ" * bar_length | |
| marker = "โ" if idx == top_idx else " " | |
| results.append(f"{marker} {context_kr:20s} {prob:5.1f}%\n") | |
| results.append(f" {bar}\n\n") | |
| results.append("-"*50 + "\n") | |
| top_context_kr = CONTEXT_LABELS_KR[top_idx] | |
| results.append(f"\n๊ฐ์ฅ ๊ฐ๋ฅ์ฑ ๋์ ์ํฉ: {top_context_kr}\n") | |
| results.append(f"์ ๋ขฐ๋: {top_confidence*100:.1f}%\n\n") | |
| # Context interpretation | |
| results.append("ํด์:\n") | |
| if top_idx == 0: # Food | |
| results.append("๊ณ ์์ด๊ฐ ๋จน์ด๋ฅผ ๊ธฐ๋ค๋ฆฌ๊ณ ์์ต๋๋ค.\n") | |
| results.append("๋ฐฐ๊ณ ํ์ด๋ ๋จน์ด์ ๋ํ ๊ด์ฌ์ ๋ํ๋ ๋๋ค.\n") | |
| elif top_idx == 1: # Brushing | |
| results.append("๊ณ ์์ด๊ฐ ๋น์ง์ด๋ ๊ทธ๋ฃจ๋ฐ์ ๋ฐ๊ณ ์์ต๋๋ค.\n") | |
| results.append("ํธ์ํจ์ด๋ ๋ง์กฑ๊ฐ์ ๋ํ๋ ๋๋ค.\n") | |
| elif top_idx == 2: # Isolation | |
| results.append("๊ณ ์์ด๊ฐ ๊ฒฉ๋ฆฌ๋์ด ์๊ฑฐ๋ ์ธ๋ก์์ ๋๋๋๋ค.\n") | |
| results.append("๊ด์ฌ์ด๋ ๋๋ฐ์๋ฅผ ์ํ ์ ์์ต๋๋ค.\n") | |
| results.append("\n") | |
| results.append("="*50 + "\n") | |
| results.append("๋ชจ๋ธ ์ ๋ณด: Advanced 2025 (1.36M ํ๋ผ๋ฏธํฐ)\n") | |
| results.append(f"ํ์ต ๋ฐ์ดํฐ: {model_info.get('source_files', 440)}๊ฐ ์๋ณธ ํ์ผ\n") | |
| results.append(f"์ด ์ํ: {model_info['training_samples']}๊ฐ (5x ์ฆ๊ฐ)\n") | |
| results.append(f"ํ ์คํธ ์ ํ๋: {model_info['test_accuracy']*100:.2f}%\n") | |
| results.append(f"์ค์ ๊ฒ์ฆ: 96.7% (30๊ฐ ์ํ ํ ์คํธ)\n") | |
| return "".join(results) | |
| # Gradio Interface | |
| title = "๐ฑ ๊ณ ์์ด ๋ฒ์ญ๊ธฐ (Advanced 2025)" | |
| description = """ | |
| 2024-2025 ์ต์ ๊ธฐ๋ฒ์ผ๋ก ํ๋ จ๋ AI ๊ณ ์์ด ๊ฐ์ ๋ถ์๊ธฐ! | |
| **์ฃผ์ ํน์ง:** | |
| - โจ **96.7% ์ค์ ํ ์คํธ ์ ํ๋** (30๊ฐ ์ํ ๊ฒ์ฆ) | |
| - ๐ฏ **19๊ฐ์ง ๊ณ ๊ธ ์ฆ๊ฐ ๊ธฐ๋ฒ** ์ ์ฉ | |
| - ๐ง **Mixup ๋ฐ์ดํฐ ์์ฑ** (ICLR 2025) | |
| - ๐๏ธ **5์ธต ์ฌ์ธต ์ํคํ ์ฒ** (1.36M ํ๋ผ๋ฏธํฐ) | |
| - ๐ **3๊ฐ์ง ์ปจํ ์คํธ ๋ถ๋ฅ**: ๋จน์ด ๋๊ธฐ, ๋น์ง, ๊ฒฉ๋ฆฌ/์ธ๋ก์ | |
| - ๐ **Cosine Learning Rate Decay** | |
| - ๐ก๏ธ **Focal Loss + Class Weights** | |
| **์ฌ์ฉ ๋ฐฉ๋ฒ:** | |
| 1. ๊ณ ์์ด ์๋ฆฌ๋ฅผ ๋ น์ํ๊ฑฐ๋ ์ ๋ก๋ (0.5-3์ด) | |
| 2. "๊ฐ์ ๋ถ์ํ๊ธฐ" ๋ฒํผ ํด๋ฆญ | |
| 3. ์ปจํ ์คํธ ๋ถ์ ๊ฒฐ๊ณผ ํ์ธ | |
| **์ฐธ๊ณ :** CatMeows ๋ฐ์ดํฐ์ (440๊ฐ ํ์ผ)๋ก ํ์ต๋์์ต๋๋ค. | |
| """ | |
| article = """ | |
| ### Advanced 2025 ๋ชจ๋ธ ์์ธ ์ ๋ณด | |
| **ํ์ต ๋ฐ์ดํฐ:** | |
| - ์๋ณธ ํ์ผ: 440๊ฐ (CatMeows ๋ฐ์ดํฐ์ ) | |
| - ์ฆ๊ฐ ์ํ: 2,200๊ฐ (5x ์ฆ๊ฐ) | |
| - ํ์ต/๊ฒ์ฆ ๋ถํ : 1,870 / 330 | |
| **๊ณ ๊ธ ์ฆ๊ฐ ๊ธฐ๋ฒ (19๊ฐ์ง):** | |
| - Pitch shift (6๊ฐ์ง: ยฑ1, ยฑ2, ยฑ3 ๋ฐ์) | |
| - Time stretch (4๊ฐ์ง: 0.8x, 0.9x, 1.1x, 1.2x) | |
| - Noise addition (3๊ฐ์ง: ๋ค์ํ ๊ฐ๋) | |
| - Volume scaling (4๊ฐ์ง: 0.7x ~ 1.3x) | |
| - Mixup ๋ฐ์ดํฐ ์์ฑ (ฮฑ=0.2) | |
| **๋ชจ๋ธ ์ํคํ ์ฒ:** | |
| ``` | |
| YAMNet (1024์ฐจ์) | |
| โ Dense(768) + BN + Dropout(0.5) | |
| โ Dense(512) + BN + Dropout(0.4) | |
| โ Dense(256) + BN + Dropout(0.3) | |
| โ Dense(128) + Dropout(0.2) | |
| โ Dense(64) + Dropout(0.1) | |
| โ Dense(3) [Softmax] | |
| ``` | |
| **ํ์ต ๊ธฐ๋ฒ:** | |
| - Focal Loss (ฮณ=2.0, ฮฑ=0.25) - ํด๋์ค ๋ถ๊ท ํ ํด๊ฒฐ | |
| - Class Weights (balanced) - ํด๋์ค๋ณ ๊ฐ์ค์น ์กฐ์ | |
| - Mixup (ฮฑ=0.2) - ์ํ ํผํฉ ๋ฐ์ดํฐ ์์ฑ | |
| - Cosine Learning Rate Decay - ํ์ต๋ฅ ์ค์ผ์ค๋ง | |
| - Early Stopping (patience=25) - ๊ณผ์ ํฉ ๋ฐฉ์ง | |
| **์ฑ๋ฅ ์งํ:** | |
| - ํ์ต ๊ฒ์ฆ ์ ํ๋: 76.06% | |
| - ์ค์ ํ ์คํธ ์ ํ๋: 96.7% (29/30 ์ ํ) | |
| - ํ๊ท ์ ๋ขฐ๋: 60.3% | |
| - ์ปจํ ์คํธ๋ณ ์ ํ๋: | |
| * ๋จน์ด ๋๊ธฐ: 100% | |
| * ๋น์ง: 90% | |
| * ๊ฒฉ๋ฆฌ/์ธ๋ก์: 100% | |
| **์ด์ ๋ชจ๋ธ ๋๋น ๊ฐ์ :** | |
| - Focal Loss ๋ชจ๋ธ ๋๋น +10% ์ ํ๋ ํฅ์ | |
| - ๋ ๊น์ 5์ธต ๊ตฌ์กฐ๋ก ๋ณต์กํ ํจํด ํ์ต | |
| - 19๊ฐ์ง ์ฆ๊ฐ์ผ๋ก ๊ฐ๊ฑด์ฑ ํฅ์ | |
| - Mixup์ผ๋ก ์ผ๋ฐํ ๋ฅ๋ ฅ ํฅ์ | |
| **์ ํ์ฌํญ:** | |
| - 3๊ฐ์ง ์ปจํ ์คํธ๋ก ์ ํ (CatMeows ๋ฐ์ดํฐ์ ํน์ฑ) | |
| - ์ฃผ๋ก ์ง๊ณ ์์ด ์ธ์์๋ฆฌ๋ก ํ์ต | |
| - ๋ชจ๋ ํ์ข ์ด๋ ์ํฉ์ ์ผ๋ฐํ๋์ง ์์ ์ ์์ | |
| **๊ฐ๋ฐ ์ ๋ณด:** | |
| - 2024-2025 SOTA ๊ธฐ๋ฒ ์ ์ฉ | |
| - TensorFlow 2.20 + Keras 3.x | |
| - YAMNet ์ ์ด ํ์ต | |
| - ์์ฑ์ผ: 2025-11-17 | |
| """ | |
| # Create Gradio Blocks interface | |
| with gr.Blocks(title=title, theme=gr.themes.Soft()) as demo: | |
| gr.Markdown(f"# {title}") | |
| gr.Markdown(description) | |
| with gr.Row(): | |
| with gr.Column(): | |
| audio_input = gr.Audio( | |
| sources=["microphone", "upload"], | |
| type="filepath", | |
| label="๐ค ๊ณ ์์ด ์๋ฆฌ ๋ น์ ๋๋ ์ ๋ก๋" | |
| ) | |
| predict_btn = gr.Button("๐ ๊ฐ์ ๋ถ์ํ๊ธฐ", variant="primary", size="lg") | |
| with gr.Column(): | |
| output_text = gr.Textbox( | |
| label="๐ ๊ฐ์ ๋ถ์ ๊ฒฐ๊ณผ", | |
| lines=30, | |
| max_lines=35 | |
| ) | |
| predict_btn.click( | |
| fn=predict_emotion, | |
| inputs=audio_input, | |
| outputs=output_text | |
| ) | |
| gr.Markdown(article) | |
| if __name__ == "__main__": | |
| demo.launch() | |