Spaces:
Running
Running
feat: Finalize architecture for Hugging Face deployment
Browse files
- Dockerfile +3 -0
- analyzer/ASR_en_us.py +13 -20
- analyzer/ASR_fr_fr.py +13 -20
Dockerfile
CHANGED
|
@@ -1,6 +1,9 @@
|
|
| 1 |
# 1. 選擇一個包含 Python 的官方 Linux 映像
|
| 2 |
FROM python:3.10-slim
|
| 3 |
|
|
|
|
|
|
|
|
|
|
| 4 |
# 2. 設定容器內的工作目錄
|
| 5 |
WORKDIR /app
|
| 6 |
|
|
|
|
| 1 |
# 1. 選擇一個包含 Python 的官方 Linux 映像
|
| 2 |
FROM python:3.10-slim
|
| 3 |
|
| 4 |
+
ENV HF_HOME=/tmp/huggingface
|
| 5 |
+
ENV HF_DATASETS_CACHE=/tmp/huggingface/datasets
|
| 6 |
+
|
| 7 |
# 2. 設定容器內的工作目錄
|
| 8 |
WORKDIR /app
|
| 9 |
|
analyzer/ASR_en_us.py
CHANGED
|
@@ -13,41 +13,34 @@ print(f"INFO: ASR_fr_fr.py is configured to use device: {DEVICE}")
|
|
| 13 |
|
| 14 |
# --- 1. 全域設定與模型載入函數 (保持不變) ---
|
| 15 |
MODEL_NAME = "MultiBridge/wav2vec-LnNor-IPA-ft"
|
| 16 |
-
MODEL_SAVE_PATH = "./ASRs/MultiBridge-wav2vec-LnNor-IPA-ft-local"
|
| 17 |
|
| 18 |
processor = None
|
| 19 |
model = None
|
| 20 |
|
| 21 |
def load_model():
|
| 22 |
"""
|
| 23 |
-
|
| 24 |
-
|
| 25 |
"""
|
| 26 |
global processor, model
|
| 27 |
if processor and model:
|
| 28 |
-
print("
|
| 29 |
return True
|
| 30 |
|
| 31 |
-
print(f"
|
|
|
|
| 32 |
try:
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
processor_to_save.save_pretrained(MODEL_SAVE_PATH)
|
| 38 |
-
model_to_save.save_pretrained(MODEL_SAVE_PATH)
|
| 39 |
-
print("模型已成功下載並儲存。")
|
| 40 |
-
else:
|
| 41 |
-
print(f"在 '{MODEL_SAVE_PATH}' 中找到本地模型。")
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
model.to(DEVICE) # 將模型移動到檢測到的設備上
|
| 46 |
-
print("英文 (en-us) 模型和處理器載入成功!")
|
| 47 |
return True
|
| 48 |
except Exception as e:
|
| 49 |
-
print(f"
|
| 50 |
-
raise RuntimeError(f"Failed to load
|
| 51 |
|
| 52 |
# --- 2. 智能 IPA 切分函數 (已更新) ---
|
| 53 |
# 移除了包含 'ː' 的組合,因為我們將在源頭移除它
|
|
|
|
| 13 |
|
| 14 |
# --- 1. 全域設定與模型載入函數 (保持不變) ---
|
| 15 |
MODEL_NAME = "MultiBridge/wav2vec-LnNor-IPA-ft"
|
|
|
|
| 16 |
|
| 17 |
processor = None
|
| 18 |
model = None
|
| 19 |
|
| 20 |
def load_model():
|
| 21 |
"""
|
| 22 |
+
(方案 A) 讓 transformers 自動處理模型的下載、快取和加載。
|
| 23 |
+
它會自動使用 Dockerfile 中設定的 HF_HOME 環境變數。
|
| 24 |
"""
|
| 25 |
global processor, model
|
| 26 |
if processor and model:
|
| 27 |
+
print(f"模型 '{MODEL_NAME}' 已載入,跳過。")
|
| 28 |
return True
|
| 29 |
|
| 30 |
+
print(f"正在準備 ASR 模型 '{MODEL_NAME}'...")
|
| 31 |
+
print(f"Transformers 將自動在 HF_HOME 指定的快取中尋找或下載。")
|
| 32 |
try:
|
| 33 |
+
# 直接使用模型的線上名稱調用 from_pretrained
|
| 34 |
+
# 這就是魔法發生的地方!
|
| 35 |
+
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
|
| 36 |
+
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
+
model.to(DEVICE)
|
| 39 |
+
print(f"模型 '{MODEL_NAME}' 和處理器載入成功!")
|
|
|
|
|
|
|
| 40 |
return True
|
| 41 |
except Exception as e:
|
| 42 |
+
print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
|
| 43 |
+
raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
|
| 44 |
|
| 45 |
# --- 2. 智能 IPA 切分函數 (已更新) ---
|
| 46 |
# 移除了包含 'ː' 的組合,因為我們將在源頭移除它
|
analyzer/ASR_fr_fr.py
CHANGED
|
@@ -16,41 +16,34 @@ print(f"INFO: ASR_fr_fr.py is configured to use device: {DEVICE}")
|
|
| 16 |
|
| 17 |
# --- 1. 全域設定與模型載入函數 (已修改為法語模型) ---
|
| 18 |
MODEL_NAME = "Cnam-LMSSC/wav2vec2-french-phonemizer"
|
| 19 |
-
MODEL_SAVE_PATH = "./ASRs/Cnam-LMSSC-wav2vec2-french-phonemizer-local"
|
| 20 |
|
| 21 |
processor = None
|
| 22 |
model = None
|
| 23 |
|
| 24 |
def load_model():
|
| 25 |
"""
|
| 26 |
-
|
| 27 |
-
|
| 28 |
"""
|
| 29 |
global processor, model
|
| 30 |
if processor and model:
|
| 31 |
-
print("
|
| 32 |
return True
|
| 33 |
|
| 34 |
-
print(f"
|
|
|
|
| 35 |
try:
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
processor_to_save.save_pretrained(MODEL_SAVE_PATH)
|
| 41 |
-
model_to_save.save_pretrained(MODEL_SAVE_PATH)
|
| 42 |
-
print("模型已成功下載並儲存。")
|
| 43 |
-
else:
|
| 44 |
-
print(f"在 '{MODEL_SAVE_PATH}' 中找到本地模型。")
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
model.to(DEVICE) # 將模型移動到檢測到的設備上
|
| 49 |
-
print("法語 (fr-fr) 模型和處理器載入成功!")
|
| 50 |
return True
|
| 51 |
except Exception as e:
|
| 52 |
-
print(f"
|
| 53 |
-
raise RuntimeError(f"Failed to load
|
| 54 |
|
| 55 |
def _tokenize_unicode_ipa(ipa_string: str) -> list:
|
| 56 |
"""
|
|
|
|
| 16 |
|
| 17 |
# --- 1. 全域設定與模型載入函數 (已修改為法語模型) ---
|
| 18 |
MODEL_NAME = "Cnam-LMSSC/wav2vec2-french-phonemizer"
|
|
|
|
| 19 |
|
| 20 |
processor = None
|
| 21 |
model = None
|
| 22 |
|
| 23 |
def load_model():
|
| 24 |
"""
|
| 25 |
+
(方案 A) 讓 transformers 自動處理模型的下載、快取和加載。
|
| 26 |
+
它會自動使用 Dockerfile 中設定的 HF_HOME 環境變數。
|
| 27 |
"""
|
| 28 |
global processor, model
|
| 29 |
if processor and model:
|
| 30 |
+
print(f"模型 '{MODEL_NAME}' 已載入,跳過。")
|
| 31 |
return True
|
| 32 |
|
| 33 |
+
print(f"正在準備 ASR 模型 '{MODEL_NAME}'...")
|
| 34 |
+
print(f"Transformers 將自動在 HF_HOME 指定的快取中尋找或下載。")
|
| 35 |
try:
|
| 36 |
+
# 直接使用模型的線上名稱調用 from_pretrained
|
| 37 |
+
# 這就是魔法發生的地方!
|
| 38 |
+
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
|
| 39 |
+
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
+
model.to(DEVICE)
|
| 42 |
+
print(f"模型 '{MODEL_NAME}' 和處理器載入成功!")
|
|
|
|
|
|
|
| 43 |
return True
|
| 44 |
except Exception as e:
|
| 45 |
+
print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
|
| 46 |
+
raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
|
| 47 |
|
| 48 |
def _tokenize_unicode_ipa(ipa_string: str) -> list:
|
| 49 |
"""
|