Spaces:
Sleeping
Sleeping
File size: 4,232 Bytes
ec97684 c12e8b3 ec97684 c12e8b3 438fb2c ec97684 c12e8b3 ec97684 c12e8b3 ec97684 c12e8b3 0be25e9 c12e8b3 4d0661a c12e8b3 438fb2c c12e8b3 ec97684 c12e8b3 ec97684 c12e8b3 c131fae c12e8b3 438fb2c c12e8b3 83b28b5 ec97684 c12e8b3 413592c c12e8b3 ec97684 c12e8b3 ec97684 c12e8b3 2705dd2 c12e8b3 4d0661a c12e8b3 410151c c12e8b3 ec97684 c12e8b3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import os
import re
import uuid
import gdown
import whisper
from concurrent.futures import ThreadPoolExecutor
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.responses import JSONResponse
from dotenv import load_dotenv
from postmarker.core import PostmarkClient
# Inputs: Google Drive link, target language, recipient email address
# Load important environment variables from the .env file
# Must run before the os.getenv(...) lookups further down in this module.
load_dotenv()

# FastAPI application object; route handlers are registered on it with
# @app.post. The single `servers` entry lists the hosted Space URL.
app = FastAPI(
    version="0.0.1",
    servers=[
        {
            "url": "https://leekwoon-whisper-api.hf.space",
            "description": "video/audio transcription API",
        }
    ],
)
# Load the Whisper model once at module import so every job reuses it.
model = whisper.load_model("large-v2")

# Postmark client used by send_email; the server token is read from the
# POSTMARK_API_KEY environment variable.
postmark = PostmarkClient(server_token=os.getenv("POSTMARK_API_KEY"))

# Process asynchronous (background) jobs on at most 3 worker threads.
executor = ThreadPoolExecutor(max_workers=3)
# Patterns are tried in order; the first one that matches wins.
_DRIVE_ID_PATTERNS = (
    # Path form: .../file/d/<id>/view, .../document/d/<id>/edit, ...
    re.compile(r'/d/([a-zA-Z0-9_-]+)'),
    # Query form: .../open?id=<id>, .../uc?export=download&id=<id>
    re.compile(r'[?&]id=([a-zA-Z0-9_-]+)'),
    # Last resort: any run of 33+ ID characters (e.g. a bare file ID).
    re.compile(r'([a-zA-Z0-9_-]{33,})'),
)


def extract_file_id(drive_url: str) -> str:
    """Extract the file ID from a Google Drive URL.

    Recognised shapes, in priority order: a ``/d/<id>`` path segment, an
    ``id=<id>`` query parameter, and finally any run of 33 or more ID
    characters (letters, digits, ``_`` and ``-``).

    Args:
        drive_url: Sharing link (or bare file ID) supplied by the caller.

    Returns:
        The matched file ID.

    Raises:
        ValueError: If none of the patterns match ``drive_url``.
    """
    for pattern in _DRIVE_ID_PATTERNS:
        match = pattern.search(drive_url)
        if match:
            return match.group(1)
    raise ValueError("Invalid Google Drive URL")
def send_email(to_email: str, srt_file_path: str, transcription_time: float):
    """Email the generated SRT file to the requester through Postmark.

    Args:
        to_email: Recipient address.
        srt_file_path: Path of the SRT file to attach to the message.
        transcription_time: Elapsed transcription time in seconds; it is
            shown in the message body with two decimals.
    """
    # NOTE(review): the Korean text below was reconstructed from a
    # mis-encoded copy in which each literal was also split across two lines.
    # The word "소요" in the body is inferred -- confirm the exact wording.
    subject = "[kyobody - 자막생성] 작업이 완료되었습니다."
    body = (
        "[kyobody - 자막생성] 작업이 완료되었습니다. "
        f"총 소요 시간: {transcription_time:.2f} 초. "
        "SRT 파일을 첨부하여 전달드립니다."
    )

    email = postmark.emails.Email(
        From=os.getenv("FROM_EMAIL"),
        To=to_email,
        Subject=subject,
        HtmlBody=body,
    )
    # Tell the receiving side that the message content is Korean.
    email['X-Accept-Language'] = 'ko'
    email.attach(srt_file_path)
    email.send()
def _format_srt_timestamp(seconds: float) -> str:
    """Render an offset in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Round once to whole milliseconds; truncating `(seconds * 1000) % 1000`
    # can lose a millisecond to binary floating-point error.
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


def _write_srt(segments, srt_file_path: str) -> None:
    """Write transcription segments to ``srt_file_path`` as numbered SRT cues."""
    # Explicit UTF-8 so non-ASCII subtitles do not depend on the host locale.
    with open(srt_file_path, "w", encoding="utf-8") as srt_file:
        for index, segment in enumerate(segments, start=1):
            start = _format_srt_timestamp(segment['start'])
            end = _format_srt_timestamp(segment['end'])
            # strip() removes the padding around the text without chopping a
            # real character when the segment has no leading space.
            text = segment['text'].strip()
            srt_file.write(f"{index}\n{start} --> {end}\n{text}\n\n")


def transcribe_and_send_email(temp_input_file: str, srt_file_path: str, email: str, language: str):
    """Transcribe a downloaded media file and email the result as an SRT file.

    Both ``temp_input_file`` and ``srt_file_path`` are deleted when the job
    ends, whether it succeeded or failed.

    Args:
        temp_input_file: Path of the downloaded video/audio file.
        srt_file_path: Path where the SRT output is written before sending.
        email: Recipient address for the finished subtitles.
        language: Language code passed to ``model.transcribe``.

    Raises:
        Exception: Any error from transcription, file writing or email
            delivery is logged and re-raised unchanged.
    """
    import logging
    import time

    try:
        started = time.monotonic()
        result = model.transcribe(temp_input_file, language=language)
        transcription_time = time.monotonic() - started

        _write_srt(result["segments"], srt_file_path)
        send_email(email, srt_file_path, transcription_time)
    except Exception:
        # This function runs on a worker thread; log here so a failure stays
        # visible even when nobody inspects the returned Future.
        logging.getLogger(__name__).exception(
            "Transcription job failed for %s", temp_input_file
        )
        raise
    finally:
        # Always clean up the temporary files, including on failure.
        for path in (temp_input_file, srt_file_path):
            try:
                os.remove(path)
            except FileNotFoundError:
                pass
@app.post("/transcribe/")
def transcribe_video(url: str, email: str, background_tasks: BackgroundTasks, language: str = "ko"):
    """Download a Google Drive file and schedule its transcription.

    The file is downloaded to ``/tmp`` during the request; transcription and
    email delivery are handed to ``background_tasks`` and the shared
    ``executor``.

    Args:
        url: Google Drive link of the video/audio file.
        email: Address that receives the resulting SRT file.
        background_tasks: FastAPI background task queue for this request.
        language: Language code forwarded to the transcription job.

    Returns:
        A 202 JSON response confirming that the job was scheduled.

    Raises:
        HTTPException: 400 when ``url`` contains no recognisable file ID,
            500 when the download or scheduling fails.
    """
    # A malformed link is the caller's mistake, not a server fault.
    try:
        file_id = extract_file_id(url)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e

    download_url = f"https://drive.google.com/uc?id={file_id}"
    temp_input_file = f'/tmp/{uuid.uuid4()}.mp4'
    srt_file_path = f'/tmp/{uuid.uuid4()}.srt'

    try:
        downloaded = gdown.download(download_url, temp_input_file, quiet=False)
        # A falsy return value means the download failed without raising.
        if not downloaded:
            raise RuntimeError("Failed to download the file from Google Drive")

        # Schedule the transcription and email sending in the background.
        background_tasks.add_task(
            executor.submit,
            transcribe_and_send_email,
            temp_input_file,
            srt_file_path,
            email,
            language,
        )
    except Exception as e:
        # Do not leave a partial download behind when the request fails.
        try:
            os.remove(temp_input_file)
        except FileNotFoundError:
            pass
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Respond to the client immediately.
    return JSONResponse(
        status_code=202,
        content={"message": "Transcription started, you will receive an email when it's done."},
    )
|