Spaces:
Sleeping
Sleeping
get_transcript_by_yt_api
Browse files
app.py
CHANGED
|
@@ -155,7 +155,7 @@ def check_open_ai_access(open_ai_api_key):
|
|
| 155 |
client = OpenAI(api_key=open_ai_api_key)
|
| 156 |
try:
|
| 157 |
response = client.chat.completions.create(
|
| 158 |
-
model="gpt-
|
| 159 |
messages=[
|
| 160 |
{"role": "user", "content": "This is a test."},
|
| 161 |
],
|
|
@@ -399,10 +399,18 @@ def get_transcript_by_yt_api(video_id):
|
|
| 399 |
|
| 400 |
for language in languages:
|
| 401 |
try:
|
| 402 |
-
|
| 403 |
print("===transcript===")
|
| 404 |
-
print(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
print("===transcript===")
|
|
|
|
|
|
|
| 406 |
return transcript # 成功獲取字幕,直接返回結果
|
| 407 |
except NoTranscriptFound:
|
| 408 |
continue # 當前語言的字幕沒有找到,繼續嘗試下一個語言
|
|
@@ -415,7 +423,7 @@ def generate_transcription_by_gemini(video_id):
|
|
| 415 |
video_url = f"https://www.youtube.com/watch?v={video_id}"
|
| 416 |
|
| 417 |
# 初始化 Gemini Pro Vision 模型
|
| 418 |
-
model = vertexai.generative_models.GenerativeModel("gemini-2.
|
| 419 |
|
| 420 |
# 建立影片部分
|
| 421 |
video_part = Part.from_uri(
|
|
@@ -424,7 +432,7 @@ def generate_transcription_by_gemini(video_id):
|
|
| 424 |
)
|
| 425 |
|
| 426 |
# 設定提示詞
|
| 427 |
-
prompt = "
|
| 428 |
|
| 429 |
# 生成逐字稿
|
| 430 |
original_transcription = ""
|
|
@@ -434,7 +442,7 @@ def generate_transcription_by_gemini(video_id):
|
|
| 434 |
generation_config=vertexai.generative_models.GenerationConfig(
|
| 435 |
temperature=1.0,
|
| 436 |
top_p=0.95,
|
| 437 |
-
max_output_tokens=
|
| 438 |
candidate_count=1
|
| 439 |
),
|
| 440 |
stream=False
|
|
@@ -462,7 +470,7 @@ def generate_transcription_by_gemini(video_id):
|
|
| 462 |
|
| 463 |
def convert_transcription_to_json(original_transcription):
|
| 464 |
"""
|
| 465 |
-
將原始逐字稿轉換成指定的 JSON
|
| 466 |
|
| 467 |
Args:
|
| 468 |
original_transcription (str): 原始逐字稿文本
|
|
@@ -470,63 +478,104 @@ def convert_transcription_to_json(original_transcription):
|
|
| 470 |
Returns:
|
| 471 |
list: 包含逐字稿段落的列表,每個段落包含 text, start, end, duration
|
| 472 |
"""
|
| 473 |
-
|
|
|
|
|
|
|
|
|
|
| 474 |
# 使用 Vertex AI 來處理轉換
|
| 475 |
-
model = vertexai.generative_models.GenerativeModel("gemini-2.
|
| 476 |
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 480 |
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
4. 回傳格式為 JSON array
|
| 486 |
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
{{
|
| 490 |
-
"text": "在一片無人的森林裡",
|
| 491 |
-
"start": 1,
|
| 492 |
-
"end": 2,
|
| 493 |
-
"duration": 1
|
| 494 |
-
}},
|
| 495 |
-
{{
|
| 496 |
-
"text": "你撿到一張羊皮紙",
|
| 497 |
-
"start": 2,
|
| 498 |
-
"end": 4,
|
| 499 |
-
"duration": 2
|
| 500 |
-
}}
|
| 501 |
-
]
|
| 502 |
|
| 503 |
-
|
| 504 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 505 |
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
|
|
|
|
|
|
|
|
|
| 513 |
|
| 514 |
-
|
| 515 |
-
|
|
|
|
| 516 |
|
| 517 |
-
|
| 518 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 519 |
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
if not all(k in entry for k in ["text", "start", "end", "duration"]):
|
| 523 |
-
raise ValueError("JSON 格式錯誤:缺少必要欄位")
|
| 524 |
-
|
| 525 |
-
return transcript_json
|
| 526 |
|
| 527 |
-
|
| 528 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 529 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 530 |
|
| 531 |
def generate_transcription_by_whisper(video_id):
|
| 532 |
youtube_url = f'https://www.youtube.com/watch?v={video_id}'
|
|
@@ -607,9 +656,11 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
| 607 |
if not exists:
|
| 608 |
print("==== video transcript is not exists ====")
|
| 609 |
try:
|
| 610 |
-
transcript =
|
|
|
|
| 611 |
except Exception as e:
|
| 612 |
print(f"generate_transcription_by_gemini Error generating transcription: {str(e)}")
|
|
|
|
| 613 |
# transcript = generate_transcription_by_whisper(video_id)
|
| 614 |
|
| 615 |
upload_transcript_to_gcs(video_id, transcript)
|
|
@@ -640,6 +691,10 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
| 640 |
is_new_transcript = True
|
| 641 |
except Exception as e:
|
| 642 |
print(f"Error processing screenshot: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 643 |
else:
|
| 644 |
entry['img_file_id'] = ""
|
| 645 |
print(f"截圖空白")
|
|
|
|
| 155 |
client = OpenAI(api_key=open_ai_api_key)
|
| 156 |
try:
|
| 157 |
response = client.chat.completions.create(
|
| 158 |
+
model="gpt-4o",
|
| 159 |
messages=[
|
| 160 |
{"role": "user", "content": "This is a test."},
|
| 161 |
],
|
|
|
|
| 399 |
|
| 400 |
for language in languages:
|
| 401 |
try:
|
| 402 |
+
yt_api_transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
|
| 403 |
print("===transcript===")
|
| 404 |
+
print(yt_api_transcript)
|
| 405 |
+
|
| 406 |
+
transcript = ""
|
| 407 |
+
for entry in yt_api_transcript:
|
| 408 |
+
transcript_part = (f"{entry['start']:.0f}s: {entry['text']}")
|
| 409 |
+
print(transcript_part)
|
| 410 |
+
original_transcript += f"{transcript_part} \n"
|
| 411 |
print("===transcript===")
|
| 412 |
+
|
| 413 |
+
transcript = convert_transcription_to_json(original_transcript)
|
| 414 |
return transcript # 成功獲取字幕,直接返回結果
|
| 415 |
except NoTranscriptFound:
|
| 416 |
continue # 當前語言的字幕沒有找到,繼續嘗試下一個語言
|
|
|
|
| 423 |
video_url = f"https://www.youtube.com/watch?v={video_id}"
|
| 424 |
|
| 425 |
# 初始化 Gemini Pro Vision 模型
|
| 426 |
+
model = vertexai.generative_models.GenerativeModel("gemini-2.5-flash-preview-05-20")
|
| 427 |
|
| 428 |
# 建立影片部分
|
| 429 |
video_part = Part.from_uri(
|
|
|
|
| 432 |
)
|
| 433 |
|
| 434 |
# 設定提示詞
|
| 435 |
+
prompt = "給我包含時間軸的完整逐字稿,包含時間軸跟原文內容,一句話一行"
|
| 436 |
|
| 437 |
# 生成逐字稿
|
| 438 |
original_transcription = ""
|
|
|
|
| 442 |
generation_config=vertexai.generative_models.GenerationConfig(
|
| 443 |
temperature=1.0,
|
| 444 |
top_p=0.95,
|
| 445 |
+
max_output_tokens=65535,
|
| 446 |
candidate_count=1
|
| 447 |
),
|
| 448 |
stream=False
|
|
|
|
| 470 |
|
| 471 |
# Maximum number of characters sent to the model per request. Chosen because:
#   1. the Gemini output limit is 65,535 tokens,
#   2. room must be left for the instruction prompt,
#   3. the JSON wrapping adds extra characters,
#   4. one Chinese character costs roughly 2-3 tokens.
_MAX_CHUNK_SIZE = 15000

# Keys every converted transcript entry must carry.
_REQUIRED_ENTRY_KEYS = ("text", "start", "end", "duration")


def _split_transcript_into_chunks(text, max_chunk_size):
    """Group the stripped, non-blank lines of *text* into newline-joined chunks.

    A new chunk is started whenever adding the next line would push the summed
    line lengths of the current chunk above *max_chunk_size*. A single line
    longer than the limit is kept whole in a chunk of its own.
    """
    chunks = []
    current_lines = []
    current_size = 0

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Flush the pending chunk first if this line would overflow it.
        if current_lines and current_size + len(line) > max_chunk_size:
            chunks.append('\n'.join(current_lines))
            current_lines = []
            current_size = 0

        current_lines.append(line)
        current_size += len(line)

    # Flush whatever is left after the last line.
    if current_lines:
        chunks.append('\n'.join(current_lines))

    return chunks


def _build_conversion_prompt(chunk, chunk_number, total_chunks):
    """Return the instruction prompt asking the model to convert one chunk to JSON."""
    return f"""
    請將以下逐字稿轉換成 JSON 格式:
    {chunk}

    轉換規則:
    1. 每個段落需包含 text, start, end, duration
    2. 時間格式需轉換為秒數(例如 1:02 轉為 62 秒)
    3. duration 為 end - start 的差值
    4. 回傳格式為 JSON array
    5. 合理的合併句子,不要有不合理的斷句,一句話至少要有完整的主詞、謂詞
    6. 每句話盡量在 10~15 個字左右,但要以完整語意為主
    7. 如果遇到 [Music] 這類的標記,可以直接忽略不計
    8. 這是第 {chunk_number}/{total_chunks} 段,請確保時間軸的連續性

    請直接返回 JSON 格式,不要加入任何說明文字或 markdown 標記。
    """


def _parse_chunk_response(json_str, chunk_number):
    """Parse the model's reply for one chunk and validate its structure.

    Raises:
        ValueError: if the reply is not valid JSON (json.JSONDecodeError is a
            ValueError subclass), is not a JSON array, or contains an entry
            that is not an object with all required keys.
    """
    # The model sometimes wraps its answer in markdown code fences; drop them.
    cleaned = json_str.replace("```json", "").replace("```", "").strip()

    entries = json.loads(cleaned)

    # Reject a bare object/string/number explicitly instead of iterating it.
    if not isinstance(entries, list):
        raise ValueError(f"JSON 格式錯誤:回傳內容不是 JSON array,在第 {chunk_number} 段")

    for entry in entries:
        if not isinstance(entry, dict) or not all(k in entry for k in _REQUIRED_ENTRY_KEYS):
            raise ValueError(f"JSON 格式錯誤:缺少必要欄位,在第 {chunk_number} 段")

    return entries


def _start_sort_key(entry):
    """Return the entry's start time as a float; unusable values sort last."""
    try:
        return float(entry["start"])
    except (TypeError, ValueError):
        # A non-numeric start must not abort the sort and lose every chunk.
        return float("inf")


def convert_transcription_to_json(original_transcription):
    """
    Convert a raw transcript into the expected JSON structure, processing long
    text chunk by chunk.

    The transcript is split on line boundaries into chunks, each chunk is sent
    to a Vertex AI generative model with conversion instructions, and the
    parsed replies are merged and ordered by start time. A chunk whose request
    or parsing fails is logged and skipped (best effort), so the result may
    cover only part of the transcript.

    Args:
        original_transcription (str): raw transcript text

    Returns:
        list: transcript entries, each containing text, start, end, duration;
        None when the transcript is empty or no chunk could be converted
    """
    if not original_transcription:
        print("原始逐字稿為空")
        return None

    # Split before touching the model so a transcript made only of blank
    # lines returns early without constructing a model client.
    chunks = _split_transcript_into_chunks(original_transcription, _MAX_CHUNK_SIZE)
    if not chunks:
        print("原始逐字稿為空")
        return None

    # Use Vertex AI to perform the conversion.
    model = vertexai.generative_models.GenerativeModel("gemini-2.5-flash-preview-05-20")

    # Accumulates the validated entries of every successfully processed chunk.
    all_results = []
    total_chunks = len(chunks)

    for chunk_number, chunk in enumerate(chunks, start=1):
        print(f"===chunk: {chunk_number}===")

        prompt = _build_conversion_prompt(chunk, chunk_number, total_chunks)

        try:
            response = model.generate_content(prompt)
            json_str = response.text

            print(f"===json_str for chunk {chunk_number}===")
            print(json_str)
            print(f"===json_str for chunk {chunk_number}===")

            all_results.extend(_parse_chunk_response(json_str, chunk_number))

        except Exception as e:
            # Deliberate best effort: one bad chunk must not discard the rest.
            print(f"處理第 {chunk_number} 段時發生錯誤:{str(e)}")
            continue

    # No chunk produced a usable result.
    if not all_results:
        return None

    # Order by start time; the key tolerates non-numeric values.
    all_results.sort(key=_start_sort_key)

    return all_results
|
| 579 |
|
| 580 |
def generate_transcription_by_whisper(video_id):
|
| 581 |
youtube_url = f'https://www.youtube.com/watch?v={video_id}'
|
|
|
|
| 656 |
if not exists:
|
| 657 |
print("==== video transcript is not exists ====")
|
| 658 |
try:
|
| 659 |
+
transcript = get_transcript_by_yt_api(video_id)
|
| 660 |
+
# transcript = generate_transcription_by_gemini(video_id)
|
| 661 |
except Exception as e:
|
| 662 |
print(f"generate_transcription_by_gemini Error generating transcription: {str(e)}")
|
| 663 |
+
transcript = generate_transcription_by_gemini(video_id)
|
| 664 |
# transcript = generate_transcription_by_whisper(video_id)
|
| 665 |
|
| 666 |
upload_transcript_to_gcs(video_id, transcript)
|
|
|
|
| 691 |
is_new_transcript = True
|
| 692 |
except Exception as e:
|
| 693 |
print(f"Error processing screenshot: {str(e)}")
|
| 694 |
+
# 如果影片有下載成功,但是截圖失敗,則將 img_file_id 設為空字串
|
| 695 |
+
entry['img_file_id'] = ""
|
| 696 |
+
print(f"截圖空白")
|
| 697 |
+
is_new_transcript = True
|
| 698 |
else:
|
| 699 |
entry['img_file_id'] = ""
|
| 700 |
print(f"截圖空白")
|