Spaces:
Sleeping
Sleeping
with open(chunk_path, "rb") as chunk_file:
Browse files
app.py
CHANGED
|
@@ -373,6 +373,9 @@ def get_transcript(video_id):
|
|
| 373 |
for language in languages:
|
| 374 |
try:
|
| 375 |
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
|
|
|
|
|
|
|
|
|
|
| 376 |
return transcript # 成功獲取字幕,直接返回結果
|
| 377 |
except NoTranscriptFound:
|
| 378 |
continue # 當前語言的字幕沒有找到,繼續嘗試下一個語言
|
|
@@ -413,73 +416,33 @@ def generate_transcription(video_id):
|
|
| 413 |
chunk_path = f"{OUTPUT_PATH}/{video_id}_part_{i}.{codec_name}"
|
| 414 |
chunk.export(chunk_path, format=codec_name)
|
| 415 |
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
|
|
|
|
|
|
|
|
|
| 434 |
|
| 435 |
# Remove temporary chunk files after processing
|
| 436 |
os.remove(chunk_path)
|
| 437 |
|
| 438 |
return transcription
|
| 439 |
|
| 440 |
-
def process_transcript_and_screenshots(video_id):
    """Build (or load) the Drive-cached transcript for a video and attach screenshots.

    Looks for ``{video_id}_transcript.json`` inside a per-video folder on
    Google Drive. If the file is missing, the transcript is fetched with
    ``get_transcript`` and uploaded; otherwise the cached JSON is downloaded
    and parsed. Every entry that has no ``'img_file_id'`` key then gets a
    screenshot taken at ``entry['start']``, uploaded, made public, and
    recorded on the entry. Finally the transcript file on Drive is
    overwritten with the updated content.

    Args:
        video_id: YouTube video id; also used as the Drive folder name and
            as the prefix of the transcript and screenshot file names.

    Returns:
        The transcript (an iterable of dict-like entries, each carrying an
        ``'img_file_id'``), or ``None`` when no transcript is available.
    """
    print("====process_transcript_and_screenshots====")

    # Drive
    service = init_drive_service()
    parent_folder_id = '1GgI4YVs0KckwStVQkLa1NZ8IpaEMurkL'
    folder_id = create_folder_if_not_exists(service, video_id, parent_folder_id)

    # Transcript file name
    file_name = f'{video_id}_transcript.json'
    # Check whether the transcript already exists on Drive
    exists, file_id = check_file_exists(service, folder_id, file_name)
    if not exists:
        # Fetch the transcript from YouTube and upload it
        transcript = get_transcript(video_id)
        if transcript:
            print("成功獲取字幕")
        else:
            print("沒有找到字幕")
        if transcript is None:
            # NOTE(review): presumably get_transcript returns None when no
            # language matches -- confirm. Without this guard a literal
            # "null" file was uploaded and the screenshot loop below raised
            # TypeError on iterating None.
            return None
        transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
        file_id = upload_content_directly(service, file_name, folder_id, transcript_text)
        print("逐字稿已上传到Google Drive")
    else:
        # Transcript already exists; download its content
        print("逐字稿已存在于Google Drive中")
        transcript_text = download_file_as_string(service, file_id)
        transcript = json.loads(transcript_text)
        if transcript is None:
            # A cached file holding JSON null parses to None; there is
            # nothing to iterate, so stop here instead of crashing below.
            print("沒有找到字幕")
            return None

    # Walk every transcript entry; take and upload a screenshot where missing
    for entry in transcript:
        if 'img_file_id' not in entry:
            screenshot_path = screenshot_youtube_video(video_id, entry['start'])
            img_file_id = upload_img_directly(service, f"{video_id}_{entry['start']}.jpg", folder_id, screenshot_path)
            set_public_permission(service, img_file_id)
            entry['img_file_id'] = img_file_id
            print(f"截图已上传到Google Drive: {img_file_id}")

    # Write the updated transcript back to Drive
    updated_transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
    update_file_on_drive(service, file_id, updated_transcript_text)
    print("逐字稿已更新,包括截图链接")
    return transcript
|
| 482 |
-
|
| 483 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
| 484 |
print("====process_transcript_and_screenshots_on_gcs====")
|
| 485 |
# GCS
|
|
|
|
| 373 |
for language in languages:
|
| 374 |
try:
|
| 375 |
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
|
| 376 |
+
print("===transcript===")
|
| 377 |
+
print(transcript)
|
| 378 |
+
print("===transcript===")
|
| 379 |
return transcript # 成功獲取字幕,直接返回結果
|
| 380 |
except NoTranscriptFound:
|
| 381 |
continue # 當前語言的字幕沒有找到,繼續嘗試下一個語言
|
|
|
|
| 416 |
chunk_path = f"{OUTPUT_PATH}/{video_id}_part_{i}.{codec_name}"
|
| 417 |
chunk.export(chunk_path, format=codec_name)
|
| 418 |
|
| 419 |
+
try:
|
| 420 |
+
with open(chunk_path, "rb") as chunk_file:
|
| 421 |
+
response = OPEN_AI_CLIENT.audio.transcriptions.create(
|
| 422 |
+
model="whisper-1",
|
| 423 |
+
file=chunk_file,
|
| 424 |
+
response_format="verbose_json",
|
| 425 |
+
timestamp_granularities=["segment"],
|
| 426 |
+
prompt="Transcribe the following audio file. if chinese, please using 'language: zh-TW' ",
|
| 427 |
+
)
|
| 428 |
+
|
| 429 |
+
# Adjusting the timestamps for the chunk based on its position in the full audio
|
| 430 |
+
adjusted_segments = [{
|
| 431 |
+
'text': segment['text'],
|
| 432 |
+
'start': math.ceil(segment['start'] + start_time / 1000.0), # Converting milliseconds to seconds
|
| 433 |
+
'end': math.ceil(segment['end'] + start_time / 1000.0),
|
| 434 |
+
'duration': math.ceil(segment['end'] - segment['start'])
|
| 435 |
+
} for segment in response.segments]
|
| 436 |
+
|
| 437 |
+
transcription.extend(adjusted_segments)
|
| 438 |
+
except Exception as e:
|
| 439 |
+
print(f"Error processing chunk {i}: {str(e)}")
|
| 440 |
|
| 441 |
# Remove temporary chunk files after processing
|
| 442 |
os.remove(chunk_path)
|
| 443 |
|
| 444 |
return transcription
|
| 445 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 446 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
| 447 |
print("====process_transcript_and_screenshots_on_gcs====")
|
| 448 |
# GCS
|