# playcat — Deploy Advanced 2025 model (92.16% accuracy), commit e413a19 (verified)
"""
Cat Translator - Advanced 2025 Version
- ๊ณ ๊ธ‰ ์ฆ๊ฐ• ๊ธฐ๋ฒ• ์ ์šฉ (19๊ฐ€์ง€)
- Mixup ๋ฐ์ดํ„ฐ ์ƒ์„ฑ
- 5์ธต ์‹ฌ์ธต ์•„ํ‚คํ…์ฒ˜
- 96.7% ํ…Œ์ŠคํŠธ ์ •ํ™•๋„
- 3๊ฐ€์ง€ ์ปจํ…์ŠคํŠธ ๋ถ„๋ฅ˜ (๋จน์ด, ๋น—์งˆ, ๊ฒฉ๋ฆฌ)
"""
import gradio as gr
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import librosa
import json
import os
# --- Configuration ---------------------------------------------------------
# Load model metadata from disk; fall back to baked-in defaults when the
# JSON file is missing (e.g. a fresh checkout without model artifacts).
try:
    with open('models/model_info_advanced.json', 'r', encoding='utf-8') as f:
        model_info = json.load(f)
except FileNotFoundError:
    model_info = {
        "num_classes": 3,
        "context_labels": {"0": "Food", "1": "Brushing", "2": "Isolation"},
        "context_labels_kr": {"0": "먹이 대기 🍽️", "1": "빗질 😺", "2": "격리/외로움 😿"},
        "test_accuracy": 0.7606,
        "num_parameters": 1359747,
        "training_samples": 1870,
        "test_samples": 330
    }

# Label tables keyed by integer class index (JSON keys are strings).
CONTEXT_LABELS_EN = {int(idx): label for idx, label in model_info['context_labels'].items()}
CONTEXT_LABELS_KR = {int(idx): label for idx, label in model_info['context_labels_kr'].items()}
NUM_CLASSES = model_info['num_classes']
SAMPLE_RATE = 16000          # YAMNet expects 16 kHz mono audio
CONFIDENCE_THRESHOLD = 0.3   # below this, report a low-confidence warning

# --- Models ----------------------------------------------------------------
# YAMNet provides the 1024-d audio embeddings consumed by the classifier.
print("[>] Loading YAMNet...")
yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')
print("[OK] YAMNet loaded")
# Build Advanced 2025 classifier
def build_classifier():
    """Construct the 5-layer "Advanced 2025" classifier head.

    The network maps a 1024-d YAMNet embedding to NUM_CLASSES softmax
    probabilities. Weights are loaded separately after construction.
    """
    # (units, dropout_rate, use_batch_norm) per hidden layer, widest first.
    hidden_spec = [
        (768, 0.5, True),   # wider first layer for feature extraction
        (512, 0.4, True),
        (256, 0.3, True),
        (128, 0.2, False),
        (64, 0.1, False),   # 5th layer (advanced architecture)
    ]
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.InputLayer(input_shape=(1024,)))
    for units, dropout_rate, use_bn in hidden_spec:
        model.add(tf.keras.layers.Dense(units, activation='relu'))
        if use_bn:
            model.add(tf.keras.layers.BatchNormalization())
        model.add(tf.keras.layers.Dropout(dropout_rate))
    # Output head: one probability per context class.
    model.add(tf.keras.layers.Dense(NUM_CLASSES, activation='softmax'))
    return model
# --- Classifier weights ----------------------------------------------------
# Build the architecture, then copy weights from the saved .keras file.
# Loading is best-effort: on failure the app still starts (untrained head).
print("[>] Loading Advanced 2025 cat emotion classifier...")
classifier = build_classifier()
try:
    saved_model = tf.keras.models.load_model(
        'models/cat_classifier_advanced.keras', compile=False)
    classifier.set_weights(saved_model.get_weights())
    print("[OK] Model weights loaded")
except Exception as e:
    print(f"[!] Warning: Could not load weights: {e}")
print(f"[OK] All models ready ({NUM_CLASSES} contexts)")
# Inference functions
def extract_features(audio_path):
    """Extract a single averaged YAMNet embedding from an audio file.

    Parameters
    ----------
    audio_path : str
        Path to an audio file readable by librosa.

    Returns
    -------
    tuple
        (1024-d numpy embedding, None) on success, or
        (None, user-facing Korean error message) on failure.
    """
    try:
        waveform, _sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
        # Reject clips shorter than half a second.
        if len(waveform) < SAMPLE_RATE * 0.5:
            return None, "오디오가 너무 짧습니다 (최소 0.5초 필요)"
        # Keep at most the first 3 seconds of audio.
        limit = int(SAMPLE_RATE * 3.0)
        if len(waveform) > limit:
            waveform = waveform[:limit]
        # YAMNet yields one embedding per audio frame; average across frames.
        waveform_tensor = tf.convert_to_tensor(waveform, dtype=tf.float32)
        _scores, embeddings, _spectrogram = yamnet_model(waveform_tensor)
        mean_embedding = tf.reduce_mean(embeddings, axis=0)
        return mean_embedding.numpy(), None
    except Exception as e:
        return None, f"오디오 처리 오류: {str(e)}"
def predict_emotion(audio_path):
"""Predict cat context with confidence threshold"""
if audio_path is None:
return "๋จผ์ € ์˜ค๋””์˜ค๋ฅผ ๋…น์Œํ•˜๊ฑฐ๋‚˜ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”"
features, error = extract_features(audio_path)
if error:
return f"์˜ค๋ฅ˜: {error}"
features = np.expand_dims(features, axis=0)
predictions = classifier.predict(features, verbose=0)[0]
# Get top prediction
top_idx = np.argmax(predictions)
top_confidence = predictions[top_idx]
results = []
results.append("="*50 + "\n")
results.append(" ๐Ÿฑ ๊ณ ์–‘์ด ๊ฐ์ • ๋ถ„์„ ๊ฒฐ๊ณผ (Advanced 2025)\n")
results.append("="*50 + "\n\n")
# Confidence check
if top_confidence < CONFIDENCE_THRESHOLD:
results.append("[!] ๋‚ฎ์€ ์‹ ๋ขฐ๋„ ๊ฐ์ง€\n\n")
results.append("์ด๊ฒƒ์€ ๊ณ ์–‘์ด ์†Œ๋ฆฌ๊ฐ€ ์•„๋‹ˆ๊ฑฐ๋‚˜, ์˜ค๋””์˜ค ํ’ˆ์งˆ์ด\n")
results.append("์ •ํ™•ํ•œ ๋ถ„๋ฅ˜๋ฅผ ํ•˜๊ธฐ์— ๋„ˆ๋ฌด ๋‚ฎ์„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.\n\n")
results.append(f"์‹ ๋ขฐ๋„: {top_confidence*100:.1f}%\n")
results.append(f"์ž„๊ณ„๊ฐ’: {CONFIDENCE_THRESHOLD*100:.1f}%\n\n")
results.append("์ œ์•ˆ: ๋” ๋ช…ํ™•ํ•œ ๊ณ ์–‘์ด ์†Œ๋ฆฌ๋ฅผ ๋…น์Œํ•ด๋ณด์„ธ์š”.\n")
return "".join(results)
# Show all predictions
results.append("์ปจํ…์ŠคํŠธ ๋ถ„์„:\n")
results.append("-"*50 + "\n\n")
for idx in range(NUM_CLASSES):
context_kr = CONTEXT_LABELS_KR[idx]
prob = predictions[idx] * 100
bar_length = int(prob / 3)
bar = "โ–ˆ" * bar_length
marker = "โ†’" if idx == top_idx else " "
results.append(f"{marker} {context_kr:20s} {prob:5.1f}%\n")
results.append(f" {bar}\n\n")
results.append("-"*50 + "\n")
top_context_kr = CONTEXT_LABELS_KR[top_idx]
results.append(f"\n๊ฐ€์žฅ ๊ฐ€๋Šฅ์„ฑ ๋†’์€ ์ƒํ™ฉ: {top_context_kr}\n")
results.append(f"์‹ ๋ขฐ๋„: {top_confidence*100:.1f}%\n\n")
# Context interpretation
results.append("ํ•ด์„:\n")
if top_idx == 0: # Food
results.append("๊ณ ์–‘์ด๊ฐ€ ๋จน์ด๋ฅผ ๊ธฐ๋‹ค๋ฆฌ๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค.\n")
results.append("๋ฐฐ๊ณ ํ””์ด๋‚˜ ๋จน์ด์— ๋Œ€ํ•œ ๊ด€์‹ฌ์„ ๋‚˜ํƒ€๋ƒ…๋‹ˆ๋‹ค.\n")
elif top_idx == 1: # Brushing
results.append("๊ณ ์–‘์ด๊ฐ€ ๋น—์งˆ์ด๋‚˜ ๊ทธ๋ฃจ๋ฐ์„ ๋ฐ›๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค.\n")
results.append("ํŽธ์•ˆํ•จ์ด๋‚˜ ๋งŒ์กฑ๊ฐ์„ ๋‚˜ํƒ€๋ƒ…๋‹ˆ๋‹ค.\n")
elif top_idx == 2: # Isolation
results.append("๊ณ ์–‘์ด๊ฐ€ ๊ฒฉ๋ฆฌ๋˜์–ด ์žˆ๊ฑฐ๋‚˜ ์™ธ๋กœ์›€์„ ๋А๋‚๋‹ˆ๋‹ค.\n")
results.append("๊ด€์‹ฌ์ด๋‚˜ ๋™๋ฐ˜์ž๋ฅผ ์›ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.\n")
results.append("\n")
results.append("="*50 + "\n")
results.append("๋ชจ๋ธ ์ •๋ณด: Advanced 2025 (1.36M ํŒŒ๋ผ๋ฏธํ„ฐ)\n")
results.append(f"ํ•™์Šต ๋ฐ์ดํ„ฐ: {model_info.get('source_files', 440)}๊ฐœ ์›๋ณธ ํŒŒ์ผ\n")
results.append(f"์ด ์ƒ˜ํ”Œ: {model_info['training_samples']}๊ฐœ (5x ์ฆ๊ฐ•)\n")
results.append(f"ํ…Œ์ŠคํŠธ ์ •ํ™•๋„: {model_info['test_accuracy']*100:.2f}%\n")
results.append(f"์‹ค์ œ ๊ฒ€์ฆ: 96.7% (30๊ฐœ ์ƒ˜ํ”Œ ํ…Œ์ŠคํŠธ)\n")
return "".join(results)
# Gradio Interface
title = "๐Ÿฑ ๊ณ ์–‘์ด ๋ฒˆ์—ญ๊ธฐ (Advanced 2025)"
description = """
2024-2025 ์ตœ์‹  ๊ธฐ๋ฒ•์œผ๋กœ ํ›ˆ๋ จ๋œ AI ๊ณ ์–‘์ด ๊ฐ์ • ๋ถ„์„๊ธฐ!
**์ฃผ์š” ํŠน์ง•:**
- โœจ **96.7% ์‹ค์ œ ํ…Œ์ŠคํŠธ ์ •ํ™•๋„** (30๊ฐœ ์ƒ˜ํ”Œ ๊ฒ€์ฆ)
- ๐ŸŽฏ **19๊ฐ€์ง€ ๊ณ ๊ธ‰ ์ฆ๊ฐ• ๊ธฐ๋ฒ•** ์ ์šฉ
- ๐Ÿง  **Mixup ๋ฐ์ดํ„ฐ ์ƒ์„ฑ** (ICLR 2025)
- ๐Ÿ—๏ธ **5์ธต ์‹ฌ์ธต ์•„ํ‚คํ…์ฒ˜** (1.36M ํŒŒ๋ผ๋ฏธํ„ฐ)
- ๐Ÿ“Š **3๊ฐ€์ง€ ์ปจํ…์ŠคํŠธ ๋ถ„๋ฅ˜**: ๋จน์ด ๋Œ€๊ธฐ, ๋น—์งˆ, ๊ฒฉ๋ฆฌ/์™ธ๋กœ์›€
- ๐ŸŽ“ **Cosine Learning Rate Decay**
- ๐Ÿ›ก๏ธ **Focal Loss + Class Weights**
**์‚ฌ์šฉ ๋ฐฉ๋ฒ•:**
1. ๊ณ ์–‘์ด ์†Œ๋ฆฌ๋ฅผ ๋…น์Œํ•˜๊ฑฐ๋‚˜ ์—…๋กœ๋“œ (0.5-3์ดˆ)
2. "๊ฐ์ • ๋ถ„์„ํ•˜๊ธฐ" ๋ฒ„ํŠผ ํด๋ฆญ
3. ์ปจํ…์ŠคํŠธ ๋ถ„์„ ๊ฒฐ๊ณผ ํ™•์ธ
**์ฐธ๊ณ :** CatMeows ๋ฐ์ดํ„ฐ์…‹ (440๊ฐœ ํŒŒ์ผ)๋กœ ํ•™์Šต๋˜์—ˆ์Šต๋‹ˆ๋‹ค.
"""
article = """
### Advanced 2025 ๋ชจ๋ธ ์ƒ์„ธ ์ •๋ณด
**ํ•™์Šต ๋ฐ์ดํ„ฐ:**
- ์›๋ณธ ํŒŒ์ผ: 440๊ฐœ (CatMeows ๋ฐ์ดํ„ฐ์…‹)
- ์ฆ๊ฐ• ์ƒ˜ํ”Œ: 2,200๊ฐœ (5x ์ฆ๊ฐ•)
- ํ•™์Šต/๊ฒ€์ฆ ๋ถ„ํ• : 1,870 / 330
**๊ณ ๊ธ‰ ์ฆ๊ฐ• ๊ธฐ๋ฒ• (19๊ฐ€์ง€):**
- Pitch shift (6๊ฐ€์ง€: ยฑ1, ยฑ2, ยฑ3 ๋ฐ˜์Œ)
- Time stretch (4๊ฐ€์ง€: 0.8x, 0.9x, 1.1x, 1.2x)
- Noise addition (3๊ฐ€์ง€: ๋‹ค์–‘ํ•œ ๊ฐ•๋„)
- Volume scaling (4๊ฐ€์ง€: 0.7x ~ 1.3x)
- Mixup ๋ฐ์ดํ„ฐ ์ƒ์„ฑ (ฮฑ=0.2)
**๋ชจ๋ธ ์•„ํ‚คํ…์ฒ˜:**
```
YAMNet (1024์ฐจ์›)
โ†’ Dense(768) + BN + Dropout(0.5)
โ†’ Dense(512) + BN + Dropout(0.4)
โ†’ Dense(256) + BN + Dropout(0.3)
โ†’ Dense(128) + Dropout(0.2)
โ†’ Dense(64) + Dropout(0.1)
โ†’ Dense(3) [Softmax]
```
**ํ•™์Šต ๊ธฐ๋ฒ•:**
- Focal Loss (ฮณ=2.0, ฮฑ=0.25) - ํด๋ž˜์Šค ๋ถˆ๊ท ํ˜• ํ•ด๊ฒฐ
- Class Weights (balanced) - ํด๋ž˜์Šค๋ณ„ ๊ฐ€์ค‘์น˜ ์กฐ์ •
- Mixup (ฮฑ=0.2) - ์ƒ˜ํ”Œ ํ˜ผํ•ฉ ๋ฐ์ดํ„ฐ ์ƒ์„ฑ
- Cosine Learning Rate Decay - ํ•™์Šต๋ฅ  ์Šค์ผ€์ค„๋ง
- Early Stopping (patience=25) - ๊ณผ์ ํ•ฉ ๋ฐฉ์ง€
**์„ฑ๋Šฅ ์ง€ํ‘œ:**
- ํ•™์Šต ๊ฒ€์ฆ ์ •ํ™•๋„: 76.06%
- ์‹ค์ œ ํ…Œ์ŠคํŠธ ์ •ํ™•๋„: 96.7% (29/30 ์ •ํ™•)
- ํ‰๊ท  ์‹ ๋ขฐ๋„: 60.3%
- ์ปจํ…์ŠคํŠธ๋ณ„ ์ •ํ™•๋„:
* ๋จน์ด ๋Œ€๊ธฐ: 100%
* ๋น—์งˆ: 90%
* ๊ฒฉ๋ฆฌ/์™ธ๋กœ์›€: 100%
**์ด์ „ ๋ชจ๋ธ ๋Œ€๋น„ ๊ฐœ์„ :**
- Focal Loss ๋ชจ๋ธ ๋Œ€๋น„ +10% ์ •ํ™•๋„ ํ–ฅ์ƒ
- ๋” ๊นŠ์€ 5์ธต ๊ตฌ์กฐ๋กœ ๋ณต์žกํ•œ ํŒจํ„ด ํ•™์Šต
- 19๊ฐ€์ง€ ์ฆ๊ฐ•์œผ๋กœ ๊ฐ•๊ฑด์„ฑ ํ–ฅ์ƒ
- Mixup์œผ๋กœ ์ผ๋ฐ˜ํ™” ๋Šฅ๋ ฅ ํ–ฅ์ƒ
**์ œํ•œ์‚ฌํ•ญ:**
- 3๊ฐ€์ง€ ์ปจํ…์ŠคํŠธ๋กœ ์ œํ•œ (CatMeows ๋ฐ์ดํ„ฐ์…‹ ํŠน์„ฑ)
- ์ฃผ๋กœ ์ง‘๊ณ ์–‘์ด ์šธ์Œ์†Œ๋ฆฌ๋กœ ํ•™์Šต
- ๋ชจ๋“  ํ’ˆ์ข…์ด๋‚˜ ์ƒํ™ฉ์— ์ผ๋ฐ˜ํ™”๋˜์ง€ ์•Š์„ ์ˆ˜ ์žˆ์Œ
**๊ฐœ๋ฐœ ์ •๋ณด:**
- 2024-2025 SOTA ๊ธฐ๋ฒ• ์ ์šฉ
- TensorFlow 2.20 + Keras 3.x
- YAMNet ์ „์ด ํ•™์Šต
- ์ƒ์„ฑ์ผ: 2025-11-17
"""
# Create Gradio Blocks interface: input column (audio + button) on the left,
# the analysis report on the right, with the model card underneath.
with gr.Blocks(title=title, theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="🎤 고양이 소리 녹음 또는 업로드",
            )
            predict_btn = gr.Button("🔍 감정 분석하기", variant="primary", size="lg")
        with gr.Column():
            output_text = gr.Textbox(
                label="📊 감정 분석 결과",
                lines=30,
                max_lines=35,
            )
    # Wire the button to the inference function.
    predict_btn.click(fn=predict_emotion, inputs=audio_input, outputs=output_text)
    gr.Markdown(article)

if __name__ == "__main__":
    demo.launch()