Spaces:

tbdavid2019
/

whisper-stt

Sleeping

App Files Files Community

whisper-stt / app-cli.py

tbdavid2019

Refactor code structure for improved readability and maintainability

fa5006a 3 months ago

raw

history blame contribute delete

8.33 kB

	import os
	import sys
	import whisper
	import requests
	import subprocess
	import math
	from pydub import AudioSegment
	import time

	# Constants
	# Whisper model names offered to the user for local transcription.
	MODEL_OPTIONS = [
	    "tiny",
	    "base",
	    "small",
	    "medium",
	    "large",
	    "large-v2",
	    "large-v3",
	]
	DEFAULT_MODEL = "small"
	# Default prompt; it asks for the content to be transcribed as Traditional Chinese.
	DEFAULT_PROMPT = "請轉錄以下內容為繁體中文"
	# The OpenAI API limits uploaded files to 25 MB.
	MAX_FILE_SIZE_MB = 25

	# Check whether ffmpeg is installed
	def is_ffmpeg_installed():
	    """Return True if an ``ffmpeg`` executable can be launched.

	    Runs ``ffmpeg -version`` and discards its output. The exit status is
	    ignored (``check=False``); only a failure to start the process counts
	    as "not installed".

	    Returns:
	        bool: True when the process could be started, False otherwise.
	    """
	    try:
	        subprocess.run(
	            ["ffmpeg", "-version"],
	            # The output is never read, so discard it instead of buffering it.
	            stdout=subprocess.DEVNULL,
	            stderr=subprocess.DEVNULL,
	            check=False,
	        )
	    except OSError:
	        # Covers FileNotFoundError (not on PATH) as well as PermissionError
	        # and other launch failures, all of which mean ffmpeg is unusable.
	        return False
	    return True

	# Check the file size
	def check_file_size(file_path, max_size_mb=MAX_FILE_SIZE_MB):
	    """Report whether a file fits within a size limit.

	    Args:
	        file_path: Path of the file to measure.
	        max_size_mb: Largest acceptable size in megabytes
	            (1 MB = 1024 * 1024 bytes).

	    Returns:
	        A ``(within_limit, size_mb)`` tuple. If measuring or comparing
	        fails, the error is printed and ``(True, 0)`` is returned, so
	        callers carry on as though the file were small enough.
	    """
	    bytes_per_mb = 1024 * 1024
	    try:
	        size_in_mb = os.path.getsize(file_path) / bytes_per_mb
	        within_limit = size_in_mb <= max_size_mb
	    except Exception as e:
	        print(f"檢查文件大小時出錯: {str(e)}")
	        return True, 0
	    return within_limit, size_in_mb

	# Split an audio file into smaller segments
	def split_audio_file(file_path, segment_duration_ms=30000):  # 30 seconds per segment by default
	    """Split an audio file into consecutive fixed-length segment files.

	    Segments are written next to the source file as
	    ``<name>_part<N><ext>``, numbered from 1.

	    Args:
	        file_path: Path of the audio file to split.
	        segment_duration_ms: Length of each segment in milliseconds.

	    Returns:
	        List of segment file paths in playback order. ``[file_path]`` is
	        returned when the audio fits in a single segment (nothing is
	        written), or when loading/exporting fails (the error is printed
	        and any segment files already written are removed).
	    """
	    # ffmpeg output format names that differ from the file extension.
	    ffmpeg_format_by_ext = {"m4a": "ipod", "aac": "adts", "wma": "asf"}
	    segment_files = []
	    try:
	        # Load the audio
	        print(f"正在載入音頻文件: {file_path}")
	        audio = AudioSegment.from_file(file_path)
	        duration_ms = len(audio)

	        # Work out how many segments are needed
	        segments_count = math.ceil(duration_ms / segment_duration_ms)
	        print(f"音頻總長度: {duration_ms/1000:.1f} 秒，將分割為 {segments_count} 個片段")

	        # Nothing to split: hand back the original instead of re-encoding it
	        # (also avoids returning an empty list for zero-length audio).
	        if segments_count <= 1:
	            return [file_path]

	        file_name, file_ext = os.path.splitext(file_path)
	        extension = file_ext.replace(".", "").lower()
	        export_format = ffmpeg_format_by_ext.get(extension, extension)

	        for i in range(segments_count):
	            start_time = i * segment_duration_ms
	            end_time = min(start_time + segment_duration_ms, duration_ms)

	            segment_path = f"{file_name}_part{i+1}{file_ext}"
	            print(f"正在導出片段 {i+1}/{segments_count}...")
	            # Record the path before exporting so that a partially written
	            # file is cleaned up as well if the export fails.
	            segment_files.append(segment_path)
	            audio[start_time:end_time].export(segment_path, format=export_format)

	        return segment_files
	    except Exception as e:
	        print(f"分割音頻文件時出錯: {str(e)}")
	        # Do not leave orphaned segment files behind.
	        for leftover in segment_files:
	            try:
	                os.remove(leftover)
	            except OSError:
	                # Best effort: the file may never have been created.
	                pass
	        return [file_path]  # Fall back to the original file on error

	# Transcribe with a local Whisper model, printing results as they arrive
	def transcribe_with_local_model(audio_path, model_size, prompt):
	    """Transcribe an audio file with a locally loaded Whisper model.

	    The audio is split with ``split_audio_file`` and each segment is
	    transcribed in turn (``language="zh"``, ``task="transcribe"``), with
	    each segment's text printed as soon as it is available.

	    Args:
	        audio_path: Path of the audio file to transcribe.
	        model_size: Whisper model name passed to ``whisper.load_model``.
	        prompt: Text passed to the model as ``initial_prompt``.

	    Returns:
	        The segment texts joined by single spaces and stripped, or None
	        when ffmpeg is missing or any step raises (the error is printed).
	    """
	    if not is_ffmpeg_installed():
	        print("錯誤: 找不到 ffmpeg。請安裝 ffmpeg 後再使用本地模型。")
	        print("在 Mac 上，您可以使用 'brew install ffmpeg' 命令安裝。")
	        return None

	    segments = []
	    try:
	        print(f"正在載入 {model_size} 模型...")
	        model = whisper.load_model(model_size)

	        # Split the audio into short segments so results can be shown incrementally
	        segments = split_audio_file(audio_path)

	        segment_texts = []
	        for i, segment_path in enumerate(segments):
	            print(f"\n處理片段 {i+1}/{len(segments)}...")

	            # Transcribe the current segment
	            result = model.transcribe(
	                segment_path,
	                language="zh",
	                task="transcribe",
	                initial_prompt=prompt,
	            )

	            # Show this segment's transcription
	            segment_text = result["text"]
	            print(f"片段 {i+1} 轉錄結果: {segment_text}")
	            segment_texts.append(segment_text)

	            # Delete the segment right away if it is a temporary split file
	            if segment_path != audio_path:
	                os.remove(segment_path)

	        return " ".join(segment_texts).strip()

	    except Exception as e:
	        print(f"使用本地模型轉錄時出錯: {str(e)}")
	        return None

	    finally:
	        # If the loop stopped early, remove the temporary segments that were
	        # never reached so they do not pile up next to the source file.
	        for leftover in segments:
	            if leftover != audio_path and os.path.exists(leftover):
	                try:
	                    os.remove(leftover)
	                except OSError:
	                    # Best-effort cleanup; the result is already decided.
	                    pass

	# Transcribe with the OpenAI API, printing results as they arrive
	def transcribe_with_openai_api(audio_path, api_key, prompt):
	    """Transcribe an audio file through the OpenAI transcription endpoint.

	    The audio is split with ``split_audio_file`` and each segment is
	    uploaded to ``/v1/audio/transcriptions`` with model ``whisper-1``.
	    Segments that exceed the size limit, or whose request does not return
	    HTTP 200, are reported and left out of the result.

	    Args:
	        audio_path: Path of the audio file to transcribe.
	        api_key: OpenAI API key sent as a Bearer token.
	        prompt: Text sent as the ``prompt`` form field.

	    Returns:
	        The successful segment texts joined by single spaces and stripped,
	        or None when an exception occurs (the error is printed).
	    """
	    # Check the file size
	    is_size_ok, file_size_mb = check_file_size(audio_path)
	    if not is_size_ok:
	        print(f"警告: 音頻文件大小 ({file_size_mb:.1f}MB) 超過 OpenAI API 限制 (25MB)。")
	        print("將嘗試分割文件...")

	    # These do not change between segments, so build them once.
	    headers = {"Authorization": f"Bearer {api_key}"}
	    data = {"model": "whisper-1", "prompt": prompt}

	    segments = []
	    try:
	        # Split the audio into smaller segments
	        segments = split_audio_file(audio_path)

	        segment_texts = []
	        for i, segment_path in enumerate(segments):
	            print(f"\n處理片段 {i+1}/{len(segments)}...")

	            # Check the size of the current segment
	            is_segment_size_ok, segment_size_mb = check_file_size(segment_path)
	            if not is_segment_size_ok:
	                print(f"警告: 片段 {i+1} 大小 ({segment_size_mb:.1f}MB) 超過限制，將跳過此片段。")
	                # The skipped temporary file is removed by the cleanup below.
	                continue

	            # Upload while the file is still open
	            with open(segment_path, "rb") as f:
	                files = {"file": (os.path.basename(segment_path), f)}
	                print(f"正在發送片段 {i+1} 到 OpenAI API...")
	                response = requests.post(
	                    "https://api.openai.com/v1/audio/transcriptions",
	                    headers=headers,
	                    files=files,
	                    data=data,
	                    # (connect, read) seconds: without a timeout a stalled
	                    # connection would block the CLI indefinitely.
	                    timeout=(10, 600),
	                )

	            if response.status_code == 200:
	                segment_text = response.json().get("text", "")
	                print(f"片段 {i+1} 轉錄結果: {segment_text}")
	                segment_texts.append(segment_text)
	            else:
	                print(f"片段 {i+1} 轉錄失敗: {response.status_code} - {response.text}")

	            # Delete the segment right away if it is a temporary split file
	            if segment_path != audio_path:
	                os.remove(segment_path)

	        return " ".join(segment_texts).strip()

	    except Exception as e:
	        print(f"使用 OpenAI API 轉錄時出錯: {str(e)}")
	        return None

	    finally:
	        # Remove temporary segments that were skipped or never reached.
	        for leftover in segments:
	            if leftover != audio_path and os.path.exists(leftover):
	                try:
	                    os.remove(leftover)
	                except OSError:
	                    # Best-effort cleanup; the result is already decided.
	                    pass

	# Main entry point
	def main():
	    """Run the interactive transcription CLI.

	    Asks for an audio file path, whether to use the OpenAI API or a local
	    Whisper model, the backend-specific setting (API key or model name),
	    and a transcription prompt. A non-empty transcription is printed and
	    written to ``<audio name>_transcript.txt`` next to the audio file.
	    """
	    print("===== Whisper 語音轉錄 CLI 工具 =====")
	    print("這個工具可以將音頻文件轉錄為文字，並在處理過程中實時顯示結果。")

	    # Ask for the audio file path
	    audio_path = input("\n請輸入音頻文件路徑: ").strip()
	    if not os.path.exists(audio_path):
	        print(f"錯誤: 找不到文件 '{audio_path}'")
	        return

	    # Choose between the OpenAI API and a local model
	    backend_answer = input("\n是否使用 OpenAI API? (y/n，默認: n): ").strip().lower()
	    use_openai = backend_answer == 'y'

	    # Backend-specific setting: an API key, or a model name
	    if use_openai:
	        api_key = input("\n請輸入您的 OpenAI API Key: ").strip()
	        if not api_key:
	            print("錯誤: API Key 不能為空")
	            return
	    else:
	        print("\n可用的模型: " + ", ".join(MODEL_OPTIONS))
	        model_size = input(f"請選擇模型 (默認: {DEFAULT_MODEL}): ").strip()
	        # An empty or unknown answer falls back to the default model
	        if model_size not in MODEL_OPTIONS:
	            model_size = DEFAULT_MODEL

	    # Both backends ask for the prompt last; empty input means the default
	    prompt = input("\n請輸入轉錄提示 (默認: '請轉錄以下內容為繁體中文'): ").strip() or DEFAULT_PROMPT

	    if use_openai:
	        print("\n開始使用 OpenAI API 進行轉錄...")
	        full_text = transcribe_with_openai_api(audio_path, api_key, prompt)
	    else:
	        print("\n開始使用本地模型進行轉錄...")
	        full_text = transcribe_with_local_model(audio_path, model_size, prompt)

	    # Nothing to show or save when transcription failed or came back empty
	    if not full_text:
	        return

	    print("\n===== 完整轉錄結果 =====")
	    print(full_text)

	    # Save the result to a file
	    transcript_base = os.path.splitext(audio_path)[0]
	    output_path = transcript_base + "_transcript.txt"
	    with open(output_path, "w", encoding="utf-8") as out_file:
	        out_file.write(full_text)
	    print(f"\n轉錄結果已保存到: {output_path}")

	# Run the CLI only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()