Create Readme.md
Browse files
Readme.md
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
language:
|
| 4 |
+
- ko
|
| 5 |
+
library_name: transformers
|
| 6 |
+
pipeline_tag: automatic-speech-recognition
|
| 7 |
+
tags:
|
| 8 |
+
- speech
|
| 9 |
+
- audio
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# hubert-base-korean
|
| 13 |
+
|
| 14 |
+
## Model Details
|
| 15 |
+
|
| 16 |
+
Hubert(Hidden-Unit BERT)๋ Facebook์์ ์ ์ํ Speech Representation Learning ๋ชจ๋ธ์
๋๋ค.
|
| 17 |
+
Hubert๋ ๊ธฐ์กด์ ์์ฑ ์ธ์ ๋ชจ๋ธ๊ณผ ๋ฌ๋ฆฌ, ์์ฑ ์ ํธ๋ฅผ raw waveform์์ ๋ฐ๋ก ํ์ตํ๋ self-supervised learning ๋ฐฉ์์ ์ฌ์ฉํฉ๋๋ค.
|
| 18 |
+
|
| 19 |
+
https://huggingface.co/team-lucid/hubert-base-korean 를 베이스모델로 활용했습니다.
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
## How to Get Started with the Model
|
| 23 |
+
|
| 24 |
+
### PyTorch
|
| 25 |
+
|
| 26 |
+
```py
|
| 27 |
+
import torch
|
| 28 |
+
import librosa
|
| 29 |
+
from transformers import AutoFeatureExtractor, AutoConfig
|
| 30 |
+
import whisper
|
| 31 |
+
from pytorch_lightning import Trainer
|
| 32 |
+
import pytorch_lightning as pl
|
| 33 |
+
from torch import nn
|
| 34 |
+
from transformers import HubertForSequenceClassification
|
| 35 |
+
|
| 36 |
+
class MyLitModel(pl.LightningModule):
    """HuBERT backbone with two linear heads: a classifier and a regressor.

    Both heads read the same slice of the backbone's last hidden state and
    their outputs are returned together from ``forward``.
    """

    def __init__(self, audio_model_name, num_label2s, n_layers=1, projector=True, classifier=True, dropout=0.07, lr_decay=1):
        """Load the pretrained backbone and create the two output heads.

        Args:
            audio_model_name: Model name or path given to ``from_pretrained``.
            num_label2s: Output size of the classification head.
            n_layers: Not used in this snippet; kept for signature
                compatibility.
            projector: Not used in this snippet.
            classifier: Not used in this snippet.
            dropout: Not used in this snippet.
            lr_decay: Not used in this snippet.
        """
        super().__init__()

        # Hidden states must be returned because forward() reads them.
        backbone_config = AutoConfig.from_pretrained(audio_model_name)
        backbone_config.output_hidden_states = True
        self.config = backbone_config

        self.audio_model = HubertForSequenceClassification.from_pretrained(
            audio_model_name,
            config=self.config,
        )

        # Attribute names and creation order are kept as-is so that saved
        # checkpoints still map onto these layers.
        hidden_size = self.audio_model.config.hidden_size
        self.label2_classifier = nn.Linear(hidden_size, num_label2s)
        self.intensity_regressor = nn.Linear(hidden_size, 1)

    def forward(self, audio_values, audio_attn_mask=None):
        """Run the backbone and both heads.

        Args:
            audio_values: Passed to the backbone as ``input_values``.
            audio_attn_mask: Optional; passed to the backbone as
                ``attention_mask``.

        Returns:
            A ``(label2_logits, intensity_preds)`` tuple; the regressor output
            has its trailing dimension squeezed away.
        """
        outputs = self.audio_model(
            input_values=audio_values,
            attention_mask=audio_attn_mask,
        )
        # Index 0 along the second axis of the last hidden state
        # (presumably the first frame -- TODO confirm) feeds both heads.
        pooled = outputs.hidden_states[-1][:, 0, :]
        label2_logits = self.label2_classifier(pooled)
        intensity_preds = self.intensity_regressor(pooled).squeeze(-1)
        return label2_logits, intensity_preds
|
| 50 |
+
|
| 51 |
+
# Model settings
audio_model_name = "team-lucid/hubert-base-korean"
NUM_LABELS = 7
SAMPLING_RATE = 16000

# Load the HuBERT model from a Lightning checkpoint
pretrained_model_path = ""  # path to the model checkpoint
hubert_model = MyLitModel.load_from_checkpoint(
    pretrained_model_path,
    audio_model_name=audio_model_name,
    num_label2s=NUM_LABELS,
)
hubert_model.eval()
hubert_model.to("cuda" if torch.cuda.is_available() else "cpu")

# Load the feature extractor
feature_extractor = AutoFeatureExtractor.from_pretrained(audio_model_name)

# Process the audio file
audio_path = ""  # path of the audio file to process
audio_np, _ = librosa.load(audio_path, sr=SAMPLING_RATE, mono=True)
inputs = feature_extractor(
    raw_speech=audio_np,
    return_tensors="pt",
    sampling_rate=SAMPLING_RATE,
)
audio_values = inputs["input_values"].to(hubert_model.device)
audio_attn_mask = inputs.get("attention_mask", None)
if audio_attn_mask is not None:
    audio_attn_mask = audio_attn_mask.to(hubert_model.device)

# Emotion analysis
with torch.no_grad():
    # MyLitModel.forward defaults the mask to None, so a single call covers
    # both the masked and the unmasked case.
    label2_logits, intensity_preds = hubert_model(audio_values, audio_attn_mask)

emotion_label = torch.argmax(label2_logits, dim=-1).item()
emotion_intensity = intensity_preds.item()

print(f"Emotion Label: {emotion_label}, Emotion Intensity: {emotion_intensity}")
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
## Training Details
|
| 96 |
+
|
| 97 |
+
### Training Data
|
| 98 |
+
|
| 99 |
+
ํด๋น ๋ชจ๋ธ์ AI hub์ ๊ฐ์ ๋ถ๋ฅ๋ฅผ ์ํ ๋ํ์์ฑ๋ฐ์ดํฐ์
(https://aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&dataSetSn=263) ์ค
|
| 100 |
+
각 라벨 별 데이터셋 1000개씩, 총 7000개를 활용해 학습을 진행했습니다.
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
### Training Procedure
|
| 104 |
+
|
| 105 |
+
각 7가지 감정(행복, 분노, 혐오, 공포, 중립, 슬픔, 놀람)과 각 감정의 강도(0-2)를 동시에 학습하는 멀티테스크 모델로 설계했습니다.
|
| 106 |
+
|
| 107 |
+
#### Training Hyperparameters
|
| 108 |
+
|
| 109 |
+
| Hyperparameter | Base |
|
| 110 |
+
|:--------------------|---------|
|
| 111 |
+
| Learning Rates | 1e-5 |
|
| 112 |
+
| Learning Rate Decay | 0.8 |
|
| 113 |
+
| Batch Size | 8 |
|
| 114 |
+
| Weight Decay | 0.01 |
|
| 115 |
+
| Epoch | 30 |
|