Spaces:
Sleeping
Sleeping
add WebshareProxyConfig
Browse files- app.py +50 -19
- requirements.txt +1 -1
app.py
CHANGED
|
@@ -48,6 +48,8 @@ from pydub import AudioSegment
|
|
| 48 |
|
| 49 |
|
| 50 |
from youtube_transcript_api import YouTubeTranscriptApi
|
|
|
|
|
|
|
| 51 |
from youtube_transcript_api._errors import NoTranscriptFound
|
| 52 |
import yt_dlp
|
| 53 |
|
|
@@ -106,6 +108,8 @@ if is_env_local:
|
|
| 106 |
AWS_SECRET_KEY = config["AWS_SECRET_KEY"]
|
| 107 |
AWS_REGION_NAME = config["AWS_REGION_NAME"]
|
| 108 |
OUTPUT_PATH = config["OUTPUT_PATH"]
|
|
|
|
|
|
|
| 109 |
|
| 110 |
else:
|
| 111 |
IS_ENV_PROD = os.getenv("IS_ENV_PROD", "False")
|
|
@@ -124,6 +128,8 @@ else:
|
|
| 124 |
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
|
| 125 |
AWS_REGION_NAME = 'us-west-2'
|
| 126 |
OUTPUT_PATH = 'videos'
|
|
|
|
|
|
|
| 127 |
|
| 128 |
TRANSCRIPTS = []
|
| 129 |
CURRENT_INDEX = 0
|
|
@@ -391,40 +397,65 @@ def extract_youtube_id(url):
|
|
| 391 |
else:
|
| 392 |
return None
|
| 393 |
|
| 394 |
-
def
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 401 |
for language in languages:
|
|
|
|
|
|
|
|
|
|
| 402 |
try:
|
| 403 |
-
|
|
|
|
|
|
|
|
|
|
| 404 |
print("===transcript===")
|
| 405 |
print(yt_api_transcript)
|
| 406 |
-
|
| 407 |
original_transcript = ""
|
| 408 |
for entry in yt_api_transcript:
|
| 409 |
transcript_part = (f"{entry['start']:.0f}s: {entry['text']}")
|
| 410 |
print(transcript_part)
|
| 411 |
original_transcript += f"{transcript_part} \n"
|
| 412 |
print("===transcript===")
|
| 413 |
-
|
| 414 |
transcript = convert_transcription_to_json(original_transcript)
|
| 415 |
-
return transcript
|
| 416 |
except NoTranscriptFound:
|
| 417 |
-
continue
|
| 418 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 419 |
|
| 420 |
def generate_transcription_by_gemini(video_id):
|
| 421 |
"""使用 Google Gemini 生成影片逐字稿"""
|
| 422 |
print("====generate_transcription_by_gemini====")
|
| 423 |
# 準備 YouTube 影片 URL
|
| 424 |
video_url = f"https://www.youtube.com/watch?v={video_id}"
|
| 425 |
-
|
| 426 |
-
# 初始化 Gemini Pro Vision 模型
|
| 427 |
-
model = vertexai.generative_models.GenerativeModel("gemini-2.5-flash-preview-05-20")
|
| 428 |
|
| 429 |
# 建立影片部分
|
| 430 |
video_part = Part.from_uri(
|
|
@@ -481,7 +512,7 @@ def convert_transcription_to_json(original_transcription):
|
|
| 481 |
return None
|
| 482 |
|
| 483 |
# 使用 Vertex AI 來處理轉換
|
| 484 |
-
model = vertexai.generative_models.GenerativeModel("gemini-2.5-flash
|
| 485 |
|
| 486 |
# 設定每段最大字數
|
| 487 |
# 考慮到:
|
|
@@ -535,7 +566,7 @@ def convert_transcription_to_json(original_transcription):
|
|
| 535 |
4. 回傳格式為 JSON array
|
| 536 |
5. 合理的合併句子,不要有不合理的斷句,一句話至少要有完整的主詞、謂詞
|
| 537 |
6. 每句話盡量在 10~15 個字左右,但要��完整語意為主
|
| 538 |
-
7. 如果遇到 [Music] 這類的標記,可以直接忽略不計
|
| 539 |
8. 這是第 {i+1}/{len(chunks)} 段,請確保時間軸的連續性
|
| 540 |
|
| 541 |
請直接返回 JSON 格式,不要加入任何說明文字或 markdown 標記。
|
|
@@ -659,7 +690,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
| 659 |
# transcript = generate_transcription_by_gemini(video_id)
|
| 660 |
except Exception as e:
|
| 661 |
print(f" Error generating transcription: {str(e)}")
|
| 662 |
-
transcript = generate_transcription_by_gemini(video_id)
|
| 663 |
# transcript = generate_transcription_by_whisper(video_id)
|
| 664 |
|
| 665 |
upload_transcript_to_gcs(video_id, transcript)
|
|
|
|
| 48 |
|
| 49 |
|
| 50 |
from youtube_transcript_api import YouTubeTranscriptApi
|
| 51 |
+
from youtube_transcript_api.proxies import WebshareProxyConfig
|
| 52 |
+
|
| 53 |
from youtube_transcript_api._errors import NoTranscriptFound
|
| 54 |
import yt_dlp
|
| 55 |
|
|
|
|
| 108 |
AWS_SECRET_KEY = config["AWS_SECRET_KEY"]
|
| 109 |
AWS_REGION_NAME = config["AWS_REGION_NAME"]
|
| 110 |
OUTPUT_PATH = config["OUTPUT_PATH"]
|
| 111 |
+
PROXY_USERNAME = config["PROXY_USERNAME"]
|
| 112 |
+
PROXY_PASSWORD = config["PROXY_PASSWORD"]
|
| 113 |
|
| 114 |
else:
|
| 115 |
IS_ENV_PROD = os.getenv("IS_ENV_PROD", "False")
|
|
|
|
| 128 |
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
|
| 129 |
AWS_REGION_NAME = 'us-west-2'
|
| 130 |
OUTPUT_PATH = 'videos'
|
| 131 |
+
PROXY_USERNAME = os.getenv("PROXY_USERNAME")
|
| 132 |
+
PROXY_PASSWORD = os.getenv("PROXY_PASSWORD")
|
| 133 |
|
| 134 |
TRANSCRIPTS = []
|
| 135 |
CURRENT_INDEX = 0
|
|
|
|
| 397 |
else:
|
| 398 |
return None
|
| 399 |
|
| 400 |
+
def try_get_transcript(video_id, use_proxy=False):
    """Fetch a YouTube transcript and convert it to the app's JSON format.

    The transcripts available for ``video_id`` are listed once, then tried
    in the order en, zh-TW, zh-CN, ja. The first one that can be fetched
    is flattened to "<start>s: <text>" lines and handed to
    ``convert_transcription_to_json``.

    Args:
        video_id: YouTube video id.
        use_proxy: When True, requests are routed through a Webshare proxy
            configured from PROXY_USERNAME / PROXY_PASSWORD.

    Returns:
        Whatever ``convert_transcription_to_json`` returns for the fetched
        transcript text.

    Raises:
        NoTranscriptFound: If none of the prioritized languages yields a
            transcript for this video.
    """
    proxy_config = None
    if use_proxy:
        proxy_config = WebshareProxyConfig(
            proxy_username=PROXY_USERNAME,
            proxy_password=PROXY_PASSWORD,
        )

    # youtube-transcript-api 1.x takes the proxy through the constructor.
    # The legacy static helpers (list_transcripts / get_transcript) expect a
    # requests-style dict in `proxies`, not a WebshareProxyConfig object.
    ytt_api = YouTubeTranscriptApi(proxy_config=proxy_config)
    transcript_list = ytt_api.list(video_id)

    # Language priority order.
    language_priority = ["en", "zh-TW", "zh-CN", "ja"]
    # Keep only the languages the video actually offers, in priority order.
    available_languages = {t.language_code for t in transcript_list}
    languages = [lang for lang in language_priority if lang in available_languages]

    for language in languages:
        print("===language===")
        print(f"use language: {language}")
        print("===language===")
        try:
            # Reuse the already-listed transcripts instead of re-requesting
            # the video page for every language; to_raw_data() yields the
            # list of {'text', 'start', 'duration'} dicts used below.
            yt_api_transcript = (
                transcript_list.find_transcript([language]).fetch().to_raw_data()
            )
        except NoTranscriptFound:
            continue

        print("===transcript===")
        print(yt_api_transcript)
        transcript_lines = []
        for entry in yt_api_transcript:
            transcript_part = f"{entry['start']:.0f}s: {entry['text']}"
            print(transcript_part)
            transcript_lines.append(f"{transcript_part} \n")
        original_transcript = "".join(transcript_lines)
        print("===transcript===")
        return convert_transcription_to_json(original_transcript)

    # NoTranscriptFound requires (video_id, requested_language_codes,
    # transcript_data); constructing it from a single message string would
    # itself fail with TypeError.
    raise NoTranscriptFound(video_id, language_priority, transcript_list)
|
| 436 |
+
|
| 437 |
+
def get_transcript_by_yt_api(video_id):
    """Get the transcript via the YouTube transcript API, direct first, then via proxy.

    A direct request is attempted first. If it raises, the same request is
    retried through the proxy. If the proxy attempt also raises, that second
    exception is propagated to the caller.

    Args:
        video_id: YouTube video id.

    Returns:
        The value returned by ``try_get_transcript`` for the first attempt
        that succeeds.
    """
    print("====get_transcript_by_yt_api====")
    # (use_proxy, banner printed before the attempt, label printed on failure)
    attempts = (
        (False, "====try_get_transcript without proxy====", "No proxy transcript error"),
        (True, "====try_get_transcript with proxy====", "With proxy transcript error"),
    )
    last_error = None
    for with_proxy, banner, failure_label in attempts:
        print(banner)
        try:
            return try_get_transcript(video_id, use_proxy=with_proxy)
        except Exception as error:
            print(f"{failure_label}: {error}")
            # `error` is unbound once the except clause ends, so keep a reference.
            last_error = error
    # Both attempts failed: surface the error from the proxy attempt.
    raise last_error
|
| 452 |
|
| 453 |
def generate_transcription_by_gemini(video_id):
|
| 454 |
"""使用 Google Gemini 生成影片逐字稿"""
|
| 455 |
print("====generate_transcription_by_gemini====")
|
| 456 |
# 準備 YouTube 影片 URL
|
| 457 |
video_url = f"https://www.youtube.com/watch?v={video_id}"
|
| 458 |
+
model = vertexai.generative_models.GenerativeModel("gemini-2.5-flash")
|
|
|
|
|
|
|
| 459 |
|
| 460 |
# 建立影片部分
|
| 461 |
video_part = Part.from_uri(
|
|
|
|
| 512 |
return None
|
| 513 |
|
| 514 |
# 使用 Vertex AI 來處理轉換
|
| 515 |
+
model = vertexai.generative_models.GenerativeModel("gemini-2.5-flash")
|
| 516 |
|
| 517 |
# 設定每段最大字數
|
| 518 |
# 考慮到:
|
|
|
|
| 566 |
4. 回傳格式為 JSON array
|
| 567 |
5. 合理的合併句子,不要有不合理的斷句,一句話至少要有完整的主詞、謂詞
|
| 568 |
6. 每句話盡量在 10~15 個字左右,但要��完整語意為主
|
| 569 |
+
7. 如果遇到 [Music] [Laughter] [Crowd] [Cheering] [Applause]這類的標記,可以直接忽略不計
|
| 570 |
8. 這是第 {i+1}/{len(chunks)} 段,請確保時間軸的連續性
|
| 571 |
|
| 572 |
請直接返回 JSON 格式,不要加入任何說明文字或 markdown 標記。
|
|
|
|
| 690 |
# transcript = generate_transcription_by_gemini(video_id)
|
| 691 |
except Exception as e:
|
| 692 |
print(f" Error generating transcription: {str(e)}")
|
| 693 |
+
# transcript = generate_transcription_by_gemini(video_id)
|
| 694 |
# transcript = generate_transcription_by_whisper(video_id)
|
| 695 |
|
| 696 |
upload_transcript_to_gcs(video_id, transcript)
|
requirements.txt
CHANGED
|
@@ -3,7 +3,7 @@ pandas
|
|
| 3 |
openai>=1.16.2
|
| 4 |
requests
|
| 5 |
python-docx
|
| 6 |
-
youtube-transcript-api
|
| 7 |
moviepy==1.0.3
|
| 8 |
pytube
|
| 9 |
google-auth
|
|
|
|
| 3 |
openai>=1.16.2
|
| 4 |
requests
|
| 5 |
python-docx
|
| 6 |
+
youtube-transcript-api >= 1.1.0
|
| 7 |
moviepy==1.0.3
|
| 8 |
pytube
|
| 9 |
google-auth
|