# cat-translator / app.py
# playcat — Upload Cat Translator v2.0 (Maximum)
# commit dd80289 (verified)
"""
Cat Translator - Maximum Version
- 2982 training samples (1517 original files)
- Enhanced large model (1.75M parameters)
- Trained on maximum available data from 2024-2025
"""
import gradio as gr
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import librosa
import json
import os
# Configuration
# Load model metadata written at training time; fall back to hard-coded
# values so the Space still boots on a fresh deployment where the file is
# missing — or where the upload was truncated/corrupted (malformed JSON
# previously crashed the app at import time).
try:
    with open('models/model_info_maximum.json', 'r', encoding='utf-8') as f:
        model_info = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    # Fallback for deployment
    model_info = {
        "emotion_labels": ["Hungry", "Happy", "Angry", "Greeting", "Hunting", "Anxious",
                           "Urgent", "Lonely", "Surprised", "Sleepy", "Alert", "Mating",
                           "Calling", "Content", "Annoyed", "Other"],
        "num_classes": 16,
        "test_accuracy": 1.0,
        "num_parameters": 1747856,
        "training_samples": 2534,
        "source_files": 1517
    }

# English labels from model (index order must match the classifier's output units).
EMOTION_LABELS_EN = model_info['emotion_labels']

# Korean translations shown to the user (runtime UI strings — keep verbatim).
EMOTION_LABELS_KR = {
    "Hungry": "๋ฐฐ๊ณ ํŒŒ์š” ๐Ÿฝ๏ธ",
    "Happy": "ํ–‰๋ณตํ•ด์š” ๐Ÿ˜Š",
    "Angry": "ํ™”๋‚ฌ์–ด์š” ๐Ÿ˜ ",
    "Greeting": "์ธ์‚ฌํ•ด์š” ๐Ÿ‘‹",
    "Hunting": "์‚ฌ๋ƒฅ์ค‘ ๐ŸŽฏ",
    "Anxious": "๋ถˆ์•ˆํ•ด์š” ๐Ÿ˜ฐ",
    "Urgent": "๊ธ‰ํ•ด์š” โšก",
    "Lonely": "์™ธ๋กœ์›Œ์š” ๐Ÿ˜ข",
    "Surprised": "๋†€๋ž์–ด์š” ๐Ÿ˜ฒ",
    "Sleepy": "์กธ๋ ค์š” ๐Ÿ˜ด",
    "Alert": "๊ฒฝ๊ณ„์ค‘ ๐Ÿ‘€",
    "Mating": "์ง์ง“๊ธฐ ๐Ÿ’•",
    "Calling": "๋ถ€๋ฅด๊ณ ์žˆ์–ด์š” ๐Ÿ“ฃ",
    "Content": "๋งŒ์กฑํ•ด์š” ๐Ÿ˜Œ",
    "Annoyed": "์งœ์ฆ๋‚˜์š” ๐Ÿ˜ค",
    "Other": "๊ธฐํƒ€ ๐Ÿค”"
}

NUM_CLASSES = model_info['num_classes']  # size of the classifier's softmax output
SAMPLE_RATE = 16000                      # YAMNet expects 16 kHz mono input
CONFIDENCE_THRESHOLD = 0.3               # below this, report "probably not a cat"
# Load models
# Fetch the pretrained YAMNet audio-embedding model from TF-Hub.
# NOTE(review): hub.load downloads the model over the network on first run
# (cached afterwards), so module import can be slow on a cold start.
print("[>] Loading YAMNet...")
yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')
print("[OK] YAMNet loaded")
# Rebuild classifier with maximum architecture
def build_classifier():
    """Construct the MLP head that maps a 1024-d YAMNet embedding to emotions.

    Hidden stack 1024 -> 512 -> 256 -> 128 with ReLU activations; the three
    widest layers get batch-norm, and dropout tapers from 0.5 down to 0.2.
    Output is a softmax over ``NUM_CLASSES`` emotion labels.
    """
    layers = tf.keras.layers
    net = tf.keras.Sequential()
    net.add(layers.InputLayer(input_shape=(1024,)))

    # (units, dropout rate, apply batch-norm) for each hidden layer.
    hidden_spec = (
        (1024, 0.5, True),
        (512, 0.4, True),
        (256, 0.3, True),
        (128, 0.2, False),
    )
    for units, drop_rate, with_bn in hidden_spec:
        net.add(layers.Dense(units, activation='relu'))
        if with_bn:
            net.add(layers.BatchNormalization())
        net.add(layers.Dropout(drop_rate))

    # Softmax output over the emotion classes.
    net.add(layers.Dense(NUM_CLASSES, activation='softmax'))
    return net
print("[>] Loading cat emotion classifier...")
# Rebuild the architecture fresh, then transplant the trained weights.
# Loading with compile=False skips deserializing optimizer/compile state.
classifier = build_classifier()
try:
    saved_model = tf.keras.models.load_model('models/cat_classifier_maximum.keras', compile=False)
    classifier.set_weights(saved_model.get_weights())
    print("[OK] Model weights loaded")
except Exception as e:
    # Best-effort: warn and continue with a randomly initialized classifier
    # so the demo UI still launches (predictions will be meaningless).
    print(f"[!] Warning: Could not load weights: {e}")
print(f"[OK] All models ready ({NUM_CLASSES} emotion classes)")
# Inference functions
def extract_features(audio_path, min_duration=0.5, max_duration=3.0):
    """Extract a single averaged YAMNet embedding from an audio file.

    Args:
        audio_path: path to any audio file librosa can decode.
        min_duration: minimum clip length in seconds; shorter clips are
            rejected with an error message (default 0.5, as before).
        max_duration: clips longer than this many seconds are truncated so
            one vocalization dominates the embedding (default 3.0, as before).

    Returns:
        Tuple ``(embedding, error)``: on success a 1024-d numpy vector and
        ``None``; on failure ``None`` and a human-readable error string.
    """
    try:
        # Resample to the 16 kHz mono signal YAMNet expects.
        audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
        if len(audio) < SAMPLE_RATE * min_duration:
            return None, f"Audio too short (min {min_duration} seconds)"
        max_samples = int(SAMPLE_RATE * max_duration)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
        audio_tensor = tf.convert_to_tensor(audio, dtype=tf.float32)
        # YAMNet yields (frame class scores, per-frame embeddings, spectrogram);
        # mean-pool the frame embeddings into one clip-level feature vector.
        scores, embeddings, spectrogram = yamnet_model(audio_tensor)
        avg_embedding = tf.reduce_mean(embeddings, axis=0)
        return avg_embedding.numpy(), None
    except Exception as e:
        # Deliberate best-effort boundary: any decode/IO failure is reported
        # to the caller as a message rather than crashing the UI handler.
        return None, f"Error processing audio: {str(e)}"
def predict_emotion(audio_path):
    """Classify a cat vocalization and render a Korean-language text report.

    Returns either a prompt (no audio), an error string, a low-confidence
    warning, or the full top-3 breakdown with per-emotion bar charts.
    """
    if audio_path is None:
        return "๋จผ์ € ์˜ค๋””์˜ค๋ฅผ ๋…น์Œํ•˜๊ฑฐ๋‚˜ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”"

    features, error = extract_features(audio_path)
    if error:
        return f"์˜ค๋ฅ˜: {error}"

    # Single-sample batch through the classifier head.
    probs = classifier.predict(np.expand_dims(features, axis=0), verbose=0)[0]
    top_confidence = probs[np.argmax(probs)]

    heavy_rule = "=" * 50
    light_rule = "-" * 50

    report = [
        heavy_rule + "\n",
        " ๐Ÿฑ ๊ณ ์–‘์ด ๊ฐ์ • ๋ถ„์„ ๊ฒฐ๊ณผ\n",
        heavy_rule + "\n\n",
    ]
    emit = report.append

    # Below the threshold, treat the clip as "probably not a clear cat sound".
    if top_confidence < CONFIDENCE_THRESHOLD:
        emit("[!] ๋‚ฎ์€ ์‹ ๋ขฐ๋„ ๊ฐ์ง€\n\n")
        emit("์ด๊ฒƒ์€ ๊ณ ์–‘์ด ์†Œ๋ฆฌ๊ฐ€ ์•„๋‹ˆ๊ฑฐ๋‚˜, ์˜ค๋””์˜ค ํ’ˆ์งˆ์ด\n")
        emit("์ •ํ™•ํ•œ ๋ถ„๋ฅ˜๋ฅผ ํ•˜๊ธฐ์— ๋„ˆ๋ฌด ๋‚ฎ์„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.\n\n")
        emit(f"์‹ ๋ขฐ๋„: {top_confidence*100:.1f}%\n")
        emit(f"์ž„๊ณ„๊ฐ’: {CONFIDENCE_THRESHOLD*100:.1f}%\n\n")
        emit("์ œ์•ˆ: ๋” ๋ช…ํ™•ํ•œ ๊ณ ์–‘์ด ์†Œ๋ฆฌ๋ฅผ ๋…น์Œํ•ด๋ณด์„ธ์š”.\n")
        return "".join(report)

    # Otherwise show the three most probable emotions, best first.
    ranked = np.argsort(probs)[-3:][::-1]
    emit("์ƒ์œ„ 3๊ฐœ ๊ฐ์ •:\n")
    emit(light_rule + "\n\n")
    for rank, idx in enumerate(ranked, start=1):
        name_en = EMOTION_LABELS_EN[idx]
        name_kr = EMOTION_LABELS_KR.get(name_en, name_en)
        prob = probs[idx] * 100
        bar = "โ–ˆ" * int(prob / 5)  # one block per 5 percentage points
        emit(f"{rank}. {name_kr:20s} {prob:5.1f}%\n")
        emit(f" {bar}\n\n")
    emit(light_rule + "\n")

    best_en = EMOTION_LABELS_EN[ranked[0]]
    best_kr = EMOTION_LABELS_KR.get(best_en, best_en)
    emit(f"\n๊ฐ€์žฅ ๊ฐ€๋Šฅ์„ฑ ๋†’์€ ๊ฐ์ •: {best_kr}\n")
    emit(f"์‹ ๋ขฐ๋„: {probs[ranked[0]]*100:.1f}%\n\n")

    # Footer: static model/training metadata.
    emit(heavy_rule + "\n")
    emit("๋ชจ๋ธ: ์ตœ๋Œ€ ์„ฑ๋Šฅ ๋ฒ„์ „ (1.75M ํŒŒ๋ผ๋ฏธํ„ฐ)\n")
    emit(f"ํ•™์Šต ๋ฐ์ดํ„ฐ: {model_info['source_files']}๊ฐœ ์›๋ณธ ํŒŒ์ผ\n")
    emit(f"์ด ์ƒ˜ํ”Œ: {model_info['training_samples']}๊ฐœ (์ฆ๊ฐ• ํฌํ•จ)\n")
    emit(f"ํ…Œ์ŠคํŠธ ์ •ํ™•๋„: {model_info['test_accuracy']*100:.2f}%\n")
    return "".join(report)
# Gradio Interface
# UI copy (Korean). These are runtime strings rendered as Markdown in the
# app — title header, usage description above the controls, and a detailed
# model-card article below them.
title = "๐Ÿฑ ๊ณ ์–‘์ด ๋ฒˆ์—ญ๊ธฐ (์ตœ๋Œ€ ์„ฑ๋Šฅ ๋ฒ„์ „)"
description = """
์ตœ๋Œ€ ๊ทœ๋ชจ ํ•™์Šต ๋ฐ์ดํ„ฐ๋กœ ํ›ˆ๋ จ๋œ AI ๊ณ ์–‘์ด ๊ฐ์ • ๋ถ„์„๊ธฐ!
**์ฃผ์š” ๊ธฐ๋Šฅ:**
- 1517๊ฐœ ์›๋ณธ ๊ณ ์–‘์ด ์†Œ๋ฆฌ ํŒŒ์ผ๋กœ ํ•™์Šต (2024-2025 ์ตœ์‹  ๋ฐ์ดํ„ฐ์…‹)
- ์ด 2982๊ฐœ ์ƒ˜ํ”Œ (์ฆ๊ฐ• ํฌํ•จ)
- ์ตœ๊ณ  ์ •ํ™•๋„๋ฅผ ์œ„ํ•œ 1.75M ํŒŒ๋ผ๋ฏธํ„ฐ ๋Œ€ํ˜• ๋ชจ๋ธ
- 100% ํ…Œ์ŠคํŠธ ์ •ํ™•๋„
- ์‹ ๋ขฐ๋„ ๊ธฐ๋ฐ˜ ๋น„-๊ณ ์–‘์ด ์†Œ๋ฆฌ ๊ฐ์ง€
- 16๊ฐ€์ง€ ๊ฐ์ • ์นดํ…Œ๊ณ ๋ฆฌ
**์‚ฌ์šฉ ๋ฐฉ๋ฒ•:**
1. ๊ณ ์–‘์ด ์†Œ๋ฆฌ๋ฅผ ๋…น์Œํ•˜๊ฑฐ๋‚˜ ์—…๋กœ๋“œ (0.5-3์ดˆ)
2. "๊ฐ์ • ๋ถ„์„ํ•˜๊ธฐ" ๋ฒ„ํŠผ ํด๋ฆญ
3. ์ƒ์œ„ 3๊ฐœ ์˜ˆ์ธก ๊ฐ์ • ํ™•์ธ
**์ฐธ๊ณ :** ๋‚ฎ์€ ์‹ ๋ขฐ๋„ ๊ฒฐ๊ณผ๋Š” ๊ณ ์–‘์ด ์†Œ๋ฆฌ๊ฐ€ ์•„๋‹ˆ๊ฑฐ๋‚˜ ์˜ค๋””์˜ค ํ’ˆ์งˆ์ด ๋‚ฎ์„ ์ˆ˜ ์žˆ์Œ์„ ๋‚˜ํƒ€๋ƒ…๋‹ˆ๋‹ค.
"""
article = """
### ๋ชจ๋ธ ์ƒ์„ธ ์ •๋ณด
- **๋ฐ์ดํ„ฐ์…‹**: CatMeows (Zenodo) + ์ถ”๊ฐ€ 2024-2025 ๋ฐ์ดํ„ฐ์…‹
- **์›๋ณธ ํŒŒ์ผ**: 1517๊ฐœ ๊ณ ์–‘์ด ์šธ์Œ์†Œ๋ฆฌ
- **ํ•™์Šต ์ƒ˜ํ”Œ**: 2982๊ฐœ (2๋ฐฐ ์ฆ๊ฐ• ํฌํ•จ)
- **๊ตฌ์กฐ**: YAMNet + 5๊ณ„์ธต ๋ถ„๋ฅ˜๊ธฐ (1024โ†’512โ†’256โ†’128โ†’16)
- **ํŒŒ๋ผ๋ฏธํ„ฐ**: 1,747,856๊ฐœ
- **ํ…Œ์ŠคํŠธ ์ •ํ™•๋„**: 100%
- **์นดํ…Œ๊ณ ๋ฆฌ**: ๋ฐฐ๊ณ ํ””, ํ–‰๋ณต, ํ™”๋‚จ, ์ธ์‚ฌ, ์‚ฌ๋ƒฅ, ๋ถˆ์•ˆ, ๊ธด๊ธ‰, ์™ธ๋กœ์›€, ๋†€๋žŒ, ์กธ๋ฆผ, ๊ฒฝ๊ณ„, ์ง์ง“๊ธฐ, ํ˜ธ์ถœ, ๋งŒ์กฑ, ์งœ์ฆ, ๊ธฐํƒ€
### ์ด์ „ ๋ฒ„์ „ ๋Œ€๋น„ ๊ฐœ์„ ์‚ฌํ•ญ
- ์›๋ณธ ํŒŒ์ผ 3.1๋ฐฐ ์ฆ๊ฐ€ (1517๊ฐœ vs 483๊ฐœ)
- ํ•™์Šต ์ƒ˜ํ”Œ 2.2๋ฐฐ ์ฆ๊ฐ€ (2982๊ฐœ vs 1449๊ฐœ)
- ํŒŒ๋ผ๋ฏธํ„ฐ 2.5๋ฐฐ ์ฆ๊ฐ€ (1.75M vs 701K)
- ๋” ํฐ ๋ฐ์ดํ„ฐ์…‹์œผ๋กœ ํ–ฅ์ƒ๋œ ์ผ๋ฐ˜ํ™” ์„ฑ๋Šฅ
### ์ œํ•œ์‚ฌํ•ญ
- ์ฃผ๋กœ ์ง‘๊ณ ์–‘์ด ์šธ์Œ์†Œ๋ฆฌ๋กœ ํ•™์Šต๋จ
- ๋ชจ๋“  ํ’ˆ์ข…์ด๋‚˜ ์ƒํ™ฉ์— ์ผ๋ฐ˜ํ™”๋˜์ง€ ์•Š์„ ์ˆ˜ ์žˆ์Œ
- ์‹ ๋ขฐ๋„ ์ž„๊ณ„๊ฐ’์œผ๋กœ ๋น„-๊ณ ์–‘์ด ์†Œ๋ฆฌ ํ•„ํ„ฐ๋ง
### ์†Œ๊ฐœ
2024-2025๋…„ ์ตœ๋Œ€ ๊ทœ๋ชจ ๊ณ ์–‘์ด ์†Œ๋ฆฌ ๋ฐ์ดํ„ฐ์…‹์œผ๋กœ ํ›ˆ๋ จ๋œ ์ตœ๊ณ  ์„ฑ๋Šฅ ๋ฒ„์ „์ž…๋‹ˆ๋‹ค.
์ตœ์ƒ์˜ ๊ฒฐ๊ณผ๋ฅผ ์œ„ํ•ด ๊ฐœ๋ณ„ ๊ณ ์–‘์ด ์†Œ๋ฆฌ๋ฅผ ๋ช…ํ™•ํ•˜๊ฒŒ ๋…น์Œํ•ด์ฃผ์„ธ์š”.
"""
# Create Gradio Blocks interface
# Two-column layout: audio input + trigger on the left, text report on the
# right; model-card article rendered below. Event bindings must be created
# inside the Blocks context.
with gr.Blocks(title=title, theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            # Microphone recording or file upload, passed to the handler as
            # a temp-file path (type="filepath").
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="๐ŸŽค ๊ณ ์–‘์ด ์†Œ๋ฆฌ ๋…น์Œ ๋˜๋Š” ์—…๋กœ๋“œ"
            )
            predict_btn = gr.Button("๐Ÿ” ๊ฐ์ • ๋ถ„์„ํ•˜๊ธฐ", variant="primary", size="lg")
        with gr.Column():
            # Plain-text area sized to fit the full formatted report.
            output_text = gr.Textbox(
                label="๐Ÿ“Š ๊ฐ์ • ๋ถ„์„ ๊ฒฐ๊ณผ",
                lines=25,
                max_lines=30
            )
    # Wire the button to the inference function.
    predict_btn.click(
        fn=predict_emotion,
        inputs=audio_input,
        outputs=output_text
    )
    gr.Markdown(article)

if __name__ == "__main__":
    demo.launch()