MichaelChou0806's picture
Update app.py
bc06406 verified
raw
history blame
14.7 kB
import os, shutil, base64, uuid, mimetypes, json, time
from pydub import AudioSegment
from openai import OpenAI
import gradio as gr
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
# ====== Basic configuration ======
# Shared secret checked by both the web UI handler and the JSON API.
# Falls back to a built-in default when APP_PASSWORD is not set.
PASSWORD = os.getenv("APP_PASSWORD", "chou")
# Size threshold in bytes (25 MiB); larger files are split before transcription.
MAX_SIZE = 25 * 1024 * 1024
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
print("===== 🚀 啟動中 =====")
# Report whether the environment variable itself is set. PASSWORD always has a
# value because of the fallback above, so testing PASSWORD could never report
# "未載入" when the variable is missing.
print(f"APP_PASSWORD: {'✅ 已載入' if os.getenv('APP_PASSWORD') else '❌ 未載入'}")
# ====== 工具:把 data:URL 轉成臨時檔 ======
MIME_EXT = {
"audio/mp4": "m4a", "audio/m4a": "m4a", "audio/aac": "aac",
"audio/mpeg": "mp3", "audio/wav": "wav", "audio/x-wav": "wav",
"audio/ogg": "ogg", "audio/webm": "webm", "audio/opus": "opus",
"video/mp4": "mp4",
}
def _dataurl_to_file(data_url: str, orig_name: str | None = None) -> str:
"""將 data URL 轉換為本地檔案"""
print(f" → [_dataurl_to_file] 開始處理 data URL...")
try:
header, b64 = data_url.split(",", 1)
except ValueError:
raise ValueError("data URL format error")
mime = header.split(";")[0].split(":", 1)[-1].strip()
ext = MIME_EXT.get(mime) or (mimetypes.guess_extension(mime) or "m4a").lstrip(".")
fname = orig_name if (orig_name and "." in orig_name) else f"upload_{uuid.uuid4().hex}.{ext}"
print(f" → [_dataurl_to_file] 檔名: {fname}, Base64長度: {len(b64)}")
with open(fname, "wb") as f:
f.write(base64.b64decode(b64))
file_size = os.path.getsize(fname)
print(f" → [_dataurl_to_file] ✅ 檔案已建立, 大小: {file_size} bytes")
return fname
def _extract_effective_path(file_obj) -> str:
    """Resolve an uploaded-file value to a path on the local disk.

    Three input shapes are tried in order:

    * ``str`` - either a ``data:`` URL (decoded to a new file) or an existing
      file path; surrounding whitespace and double quotes are stripped first.
    * ``dict`` - a ``"data"`` entry holding a ``data:`` URL (decoded using the
      ``"orig_name"`` entry as the name hint), otherwise a ``"path"`` entry
      pointing at an existing file.
    * any object - its ``name`` then ``path`` attribute, if that is a string
      naming an existing file.

    Raises:
        FileNotFoundError: If none of the above yields an existing file.
    """

    def _unquote(text):
        return text.strip().strip('"')

    print(f"[_extract_effective_path] 收到類型: {type(file_obj)}")

    if isinstance(file_obj, str):
        # String input
        candidate = _unquote(file_obj)
        if candidate.startswith("data:"):
            print(" → 偵測到 data URL")
            return _dataurl_to_file(candidate, None)
        if os.path.isfile(candidate):
            print(f" → 找到檔案路徑: {candidate}")
            return candidate
    elif isinstance(file_obj, dict):
        # Dictionary input
        print(f" → 字典模式, Keys: {list(file_obj.keys())}")
        payload = file_obj.get("data")
        if isinstance(payload, str) and payload.startswith("data:"):
            print(" → 找到 data URL")
            return _dataurl_to_file(payload, file_obj.get("orig_name"))
        dict_path = _unquote(str(file_obj.get("path") or ""))
        if dict_path and os.path.isfile(dict_path):
            return dict_path

    # Object input (also the last resort for strings/dicts that did not match)
    attr_values = (getattr(file_obj, attr, None) for attr in ("name", "path"))
    for value in attr_values:
        if not isinstance(value, str):
            continue
        candidate = _unquote(value)
        if os.path.isfile(candidate):
            return candidate

    raise FileNotFoundError("Cannot parse uploaded file")
# ====== Chunking ======
def split_audio(path):
    """Split an audio file into WAV chunks that each stay under ``MAX_SIZE``.

    A file that is already within ``MAX_SIZE`` is returned unchanged as a
    one-element list. Otherwise the audio is decoded and cut into equal-length
    pieces, each exported as a WAV file in the current working directory.

    The number of pieces is derived from the *decoded* PCM size as well as the
    source file size: the chunks are written as uncompressed WAV, so sizing
    them by the compressed source alone can produce chunks far larger than
    ``MAX_SIZE``.

    Args:
        path: Path of the audio file to split.

    Returns:
        List of file paths: ``[path]`` when no split was needed, otherwise the
        newly written chunk files, in playback order.
    """
    size = os.path.getsize(path)
    print(f"[split_audio] 檔案大小: {size} bytes ({size/1024/1024:.2f} MB)")
    if size <= MAX_SIZE:
        print(f"[split_audio] 不需分割")
        return [path]

    print(f"[split_audio] 開始分割...")
    audio = AudioSegment.from_file(path)
    duration_ms = len(audio)

    # Bytes the whole recording occupies once exported as raw PCM in a WAV file.
    wav_bytes = duration_ms * audio.frame_rate * audio.frame_width / 1000
    # Keep 1 MiB of headroom per chunk for the WAV header and rounding.
    chunk_budget = MAX_SIZE - 1024 * 1024
    n = max(int(size / MAX_SIZE) + 1, int(wav_bytes / chunk_budget) + 1)
    chunk_ms = duration_ms / n
    print(f"[split_audio] 分割成 {n} 個片段")

    # Unique tag so concurrent requests do not overwrite each other's chunks.
    tag = uuid.uuid4().hex
    parts = []
    for i in range(n):
        fn = f"chunk_{tag}_{i+1}.wav"
        audio[int(i * chunk_ms):int((i + 1) * chunk_ms)].export(fn, format="wav")
        parts.append(fn)
    return parts
# ====== Transcription core ======
def transcribe_core(path, model="whisper-1"):
    """Transcribe an audio file, convert the text to Traditional Chinese, and summarize it.

    Steps:
      1. A ``.mp4`` input is copied to a sibling ``.m4a`` file and that copy is
         used instead (best effort; the original is used if the copy fails).
      2. The file is split with ``split_audio`` and each piece is sent to the
         transcription endpoint using ``model``.
      3. The joined transcript is sent to ``gpt-4o-mini`` for conversion to
         Taiwan Traditional Chinese.
      4. The converted text is sent to ``gpt-4o-mini`` again for a summary.

    Files created along the way (the ``.m4a`` copy and any chunk files) are
    removed before returning, whether or not the run succeeded. The input
    file itself is never deleted here.

    Args:
        path: Path of the audio (or ``.mp4``) file to transcribe.
        model: Transcription model name.

    Returns:
        ``(traditional_text, summary)`` as two strings.
    """
    print(f"\n{'='*60}")
    print(f"[transcribe_core] 開始轉錄: {path}")
    print(f"{'='*60}")
    start_time = time.time()
    created = []  # files this call wrote and must clean up
    try:
        # Handle MP4 input
        if path.lower().endswith(".mp4"):
            fixed = path[:-4] + ".m4a"
            # Only delete the copy afterwards if it did not exist beforehand.
            preexisting = os.path.exists(fixed)
            try:
                shutil.copy(path, fixed)
            except OSError as exc:
                # Best effort: fall back to the original file.
                print(f"[transcribe_core] ⚠️ 無法複製為 .m4a,改用原始檔案: {exc}")
            else:
                path = fixed
                if not preexisting:
                    created.append(fixed)

        # Split the audio
        chunks = split_audio(path)
        created.extend(c for c in chunks if c != path)

        print(f"\n[transcribe_core] === Whisper 轉錄 ({len(chunks)} 片段) ===")
        raw = []
        for i, c in enumerate(chunks, 1):
            print(f"[transcribe_core] 轉錄片段 {i}/{len(chunks)}")
            with open(c, "rb") as af:
                txt = client.audio.transcriptions.create(
                    model=model, file=af, response_format="text"
                )
            raw.append(txt)
            print(f"[transcribe_core] ✅ 片段 {i} 完成")
        raw_txt = "\n".join(raw)
        print(f"[transcribe_core] 原始轉錄: {len(raw_txt)} 字元")

        # Simplified -> Traditional conversion
        print(f"\n[transcribe_core] === 簡轉繁 ===")
        conv = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role":"system","content":"你是嚴格的繁體中文轉換器"},
                {"role":"user","content":f"將以下內容轉為台灣繁體,不意譯:\n{raw_txt}"}
            ],
            temperature=0.0
        )
        # The message content may be None; treat that as empty text.
        trad = (conv.choices[0].message.content or "").strip()
        print(f"[transcribe_core] ✅ 繁體轉換完成: {len(trad)} 字元")

        # AI summary
        print(f"\n[transcribe_core] === AI 摘要 ===")
        summ = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role":"system","content":"你是繁體摘要助手"},
                {"role":"user","content":f"請用台灣繁體中文摘要;內容多則條列重點,內容短則一句話:\n{trad}"}
            ],
            temperature=0.2
        )
        summary = (summ.choices[0].message.content or "").strip()

        total_time = time.time() - start_time
        print(f"\n{'='*60}")
        print(f"[transcribe_core] ✅✅✅ 全部完成! 總耗時: {total_time:.1f}秒")
        print(f"{'='*60}\n")
        return trad, summary
    finally:
        for tmp in created:
            try:
                os.remove(tmp)
            except OSError:
                # Cleanup is best effort; a leftover temp file must not mask the result.
                pass
# ====== Gradio UI handler ======
def transcribe_ui(password, file):
    """Handle a transcription request coming from the web page.

    Args:
        password: Text typed into the password box.
        file: Value of the file-upload component.

    Returns:
        ``(status_message, transcription, summary)``. On any failure the last
        two items are empty strings and the first describes the problem.
    """
    import traceback

    print(f"\n🌐 [UI] 網頁版請求")

    # Reject missing passwords first so .strip() is only called on real text.
    password_ok = bool(password) and password.strip() == PASSWORD
    if not password_ok:
        return "❌ Password incorrect", "", ""
    if not file:
        return "⚠️ No file uploaded", "", ""

    try:
        text, summary = transcribe_core(_extract_effective_path(file))
    except Exception as e:
        print(f"❌ [UI] 錯誤:\n{traceback.format_exc()}")
        return f"❌ Error: {e}", "", ""
    return "✅ Transcription completed", text, summary
# ====== Create the FastAPI application ======
fastapi_app = FastAPI()
# CORS configuration: any origin may call the API with any method and headers.
# Credentials stay disabled: a wildcard origin must not be combined with
# credentialed requests, and this app authenticates with a password in the
# request body rather than cookies.
fastapi_app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# ====== Single-request API endpoint ======
@fastapi_app.post("/api/transcribe")
async def api_transcribe_sync(request: Request):
    """Transcribe an uploaded recording and return the result in one response (no polling).

    Request body (JSON object):
        {
            "password": "your_password",
            "file_data": "data:audio/m4a;base64,...",
            "file_name": "recording.m4a"
        }

    Responses:
        200: ``{"status": "success", "transcription": ..., "summary": ...}``
        400: body is not a JSON object, or ``file_data`` is not a ``data:`` URL
        401: wrong or missing password
        500: any error raised while decoding or transcribing

    The file decoding and transcription are blocking calls, so they run in a
    worker thread; running them directly in this coroutine would stall the
    event loop for every other request until they finish.
    """
    import asyncio
    import traceback

    try:
        try:
            body = await request.json()
        except ValueError:
            # Malformed JSON is a client error, not a server failure.
            body = None
        if not isinstance(body, dict):
            print(f"❌ [SYNC API] 檔案格式錯誤")
            return JSONResponse(
                status_code=400,
                content={"status": "error", "error": "Invalid JSON body"}
            )
        print(f"\n{'📱'*30}")
        print(f"🎯 [SYNC API] 收到同步 API 請求")
        print(f"📦 Keys: {list(body.keys())}")
        print(f"{'📱'*30}")

        password = body.get("password", "")
        # A non-string password (null, number, ...) is simply a wrong password.
        if not isinstance(password, str) or password.strip() != PASSWORD:
            print(f"❌ [SYNC API] 密碼錯誤")
            return JSONResponse(
                status_code=401,
                content={"status": "error", "error": "Password incorrect"}
            )

        file_data = body.get("file_data", "")
        file_name = body.get("file_name", "recording.m4a")
        if not isinstance(file_name, str) or not file_name:
            file_name = "recording.m4a"
        if not isinstance(file_data, str) or not file_data.startswith("data:"):
            print(f"❌ [SYNC API] 檔案格式錯誤")
            return JSONResponse(
                status_code=400,
                content={"status": "error", "error": "Invalid file data format"}
            )
        print(f"[SYNC API] 檔案長度: {len(file_data)}, 檔名: {file_name}")

        file_dict = {"data": file_data, "orig_name": file_name}
        path = None
        try:
            path = await asyncio.to_thread(_extract_effective_path, file_dict)
            print(f"✅ [SYNC API] 檔案解析成功: {path}")
            text, summary = await asyncio.to_thread(transcribe_core, path)
        finally:
            # The upload was decoded to disk only for this request; remove it.
            if path is not None:
                try:
                    os.remove(path)
                except OSError:
                    pass

        result = {
            "status": "success",
            "transcription": text,
            "summary": summary
        }
        print(f"\n{'✅'*30}")
        print(f"✅✅✅ [SYNC API] 完成! 返回結果")
        print(json.dumps(result, ensure_ascii=False, indent=2))
        print(f"{'✅'*30}\n")
        return JSONResponse(content=result)
    except Exception as e:
        error_trace = traceback.format_exc()
        print(f"\n{'❌'*30}")
        print(f"❌ [SYNC API] 錯誤:\n{error_trace}")
        print(f"{'❌'*30}\n")
        return JSONResponse(
            status_code=500,
            content={"status": "error", "error": str(e)}
        )
# ====== Custom CSS ======
# Stylesheet passed to gr.Blocks(css=...). It styles the page container, the
# .main-header banner, the .primary-btn button, textareas, the .info-box card,
# and <pre>/<code> elements.
custom_css = """
.gradio-container {
max-width: 1200px !important;
margin: auto !important;
}
/* 主標題 */
.main-header {
text-align: center;
padding: 2.5rem 1rem;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border-radius: 12px;
margin-bottom: 2rem;
color: white;
}
.main-header h1 {
font-size: 2.2rem;
margin: 0 0 0.5rem 0;
font-weight: 700;
}
.main-header p {
font-size: 1rem;
margin: 0;
opacity: 0.95;
}
/* 按鈕 */
.primary-btn {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
border: none !important;
color: white !important;
font-weight: 600 !important;
font-size: 1.05rem !important;
}
/* 文字框 */
textarea {
font-size: 0.95rem !important;
line-height: 1.6 !important;
}
/* 資訊卡片 */
.info-box {
background: #f0f9ff;
border-left: 4px solid #3b82f6;
padding: 1rem;
border-radius: 6px;
margin: 1rem 0;
font-size: 0.9rem;
}
/* 程式碼 */
pre {
background: #1f2937 !important;
color: #f3f4f6 !important;
padding: 1rem !important;
border-radius: 6px !important;
font-size: 0.85rem !important;
}
code {
background: #e5e7eb !important;
color: #1f2937 !important;
padding: 0.2rem 0.4rem !important;
border-radius: 3px !important;
font-size: 0.9rem !important;
}
"""
# ====== Build the Gradio interface ======
# Two tabs: a web upload form wired to transcribe_ui, and static API
# documentation. The documentation uses the placeholder "your_password"
# everywhere; it must never show a real password, since this page is public.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="Audio Transcription") as demo:
    # Header banner
    gr.HTML("""
<div class="main-header">
<h1>🎧 Audio Transcription Service</h1>
<p>AI-Powered Speech-to-Text with Summarization</p>
</div>
""")
    with gr.Tabs():
        # ====== Tab 1: Upload ======
        with gr.Tab("🌐 Web Upload"):
            with gr.Row():
                # Left column: inputs
                with gr.Column(scale=1):
                    pw = gr.Textbox(label="Password", type="password", placeholder="Enter password")
                    audio_file = gr.File(label="Audio File", file_types=["audio", ".mp4"])
                    submit_btn = gr.Button("🚀 Start Transcription", variant="primary", elem_classes="primary-btn")
                    gr.HTML("""
<div class="info-box">
<strong>Supported:</strong> MP3, M4A, WAV, OGG, WEBM, MP4<br>
<strong>Max Size:</strong> Auto-split for large files
</div>
""")
                # Right column: outputs
                with gr.Column(scale=2):
                    status = gr.Textbox(label="Status", interactive=False)
                    transcription = gr.Textbox(label="Transcription", lines=12, show_copy_button=True)
                    summary = gr.Textbox(label="Summary", lines=5, show_copy_button=True)
            # transcribe_ui returns (status, transcription, summary) in this order.
            submit_btn.click(transcribe_ui, [pw, audio_file], [status, transcription, summary])
        # ====== Tab 2: API ======
        with gr.Tab("📱 API Documentation"):
            # Not a raw string: "\\" below renders as a single backslash (shell line continuation).
            gr.Markdown("""
## API Endpoint
**URL:** `/api/transcribe` (POST)
**Type:** Synchronous - returns complete results in one request
### Request Format
```json
{
"password": "your_password",
"file_data": "data:audio/m4a;base64,UklGR...",
"file_name": "recording.m4a"
}
```
### Response Format
```json
{
"status": "success",
"transcription": "Full transcription text...",
"summary": "AI-generated summary..."
}
```
---
## iPhone Shortcuts Setup
1. **Get File** → Audio recording
2. **Base64 Encode** → File content
3. **Text** → Create data URL:
```
data:audio/m4a;base64,[Base64 Result]
```
4. **Dictionary** → Request body:
- `password`: `your_password`
- `file_data`: [Text from step 3]
- `file_name`: `recording.m4a`
5. **Get Contents of URL**:
- URL: `https://your-domain.com/api/transcribe`
- Method: `POST`
- Headers: `Content-Type: application/json`
- Body: [Dictionary], Type: `JSON`
6. **Get Dictionary Value**:
- `transcription` → Full text
- `summary` → Summary
---
## Testing with cURL
```bash
curl -X POST https://your-domain.com/api/transcribe \\
-H "Content-Type: application/json" \\
-d '{
"password": "your_password",
"file_data": "data:audio/m4a;base64,AAAA...",
"file_name": "test.m4a"
}'
```
---
## Technical Details
- **Transcription:** OpenAI Whisper (high accuracy)
- **Summarization:** GPT-4o-mini
- **Output:** Traditional Chinese (Taiwan)
- **Processing:** Fully synchronous, no polling needed
- **File Handling:** Auto-split for files > 25MB
---
## Error Codes
- `401` - Incorrect password
- `400` - Invalid file format
- `500` - Processing error
For support, contact your administrator.
""")
    # Footer
    gr.HTML("""
<div style="text-align: center; margin-top: 2rem; padding: 1.5rem; background: #f9fafb; border-radius: 8px;">
<p style="color: #6b7280; font-size: 0.9rem; margin: 0;">
Audio Transcription Service v2.0 | Powered by OpenAI
</p>
</div>
""")
# ====== Mount Gradio onto FastAPI ======
# The Gradio UI is served at "/" by the same app object that defines /api/transcribe.
app = gr.mount_gradio_app(fastapi_app, demo, path="/")

# ====== Entry point ======
if __name__ == "__main__":
    rule = "=" * 60
    startup_lines = [
        "\n" + rule,
        "🚀 啟動 FastAPI + Gradio 應用",
        "📱 同步 API: /api/transcribe",
        "🌐 網頁介面: /",
        rule + "\n",
    ]
    for line in startup_lines:
        print(line)
    # Imported here so uvicorn is only required when run as a script.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)