Spaces:
Sleeping
Sleeping
def get_video_duration(video_id):
Browse files
app.py
CHANGED
|
@@ -415,7 +415,14 @@ def generate_transcription_by_whisper(video_id):
|
|
| 415 |
|
| 416 |
def get_video_duration(video_id):
|
| 417 |
yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')
|
| 418 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 419 |
|
| 420 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
| 421 |
print("====process_transcript_and_screenshots_on_gcs====")
|
|
@@ -428,7 +435,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
| 428 |
# 检查逐字稿是否存在
|
| 429 |
is_new_transcript = False
|
| 430 |
is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
|
| 431 |
-
|
| 432 |
if not is_transcript_exists:
|
| 433 |
print("逐字稿文件不存在于GCS中,重新建立")
|
| 434 |
# 从YouTube获取逐字稿并上传
|
|
@@ -444,8 +451,9 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
| 444 |
else:
|
| 445 |
print("沒有找到字幕")
|
| 446 |
transcript = generate_transcription_by_whisper(video_id)
|
| 447 |
-
|
| 448 |
-
|
|
|
|
| 449 |
transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
| 450 |
GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
|
| 451 |
|
|
@@ -455,7 +463,8 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
| 455 |
print("逐字稿已存在于GCS中")
|
| 456 |
transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
|
| 457 |
transcript = json.loads(transcript_text)
|
| 458 |
-
|
|
|
|
| 459 |
|
| 460 |
# print("===確認其他衍生文件===")
|
| 461 |
# source = "gcs"
|
|
|
|
| 415 |
|
| 416 |
def get_video_duration(video_id):
    """Return the length of a YouTube video, or None if it cannot be read.

    Builds a ``YouTube`` object for the watch URL of ``video_id`` and reads
    its ``length`` attribute (presumably seconds, per pytube -- confirm).

    Args:
        video_id: The YouTube video id interpolated into the watch URL.

    Returns:
        The value of ``yt.length``, or ``None`` when reading it raises an
        ordinary exception.
    """
    yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')
    try:
        video_duration = yt.length
    except Exception as e:
        # Catch Exception instead of using a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit still propagate, and report the
        # reason instead of failing silently.
        print(f"get_video_duration failed: {e}")
        video_duration = None

    print(f"video_duration: {video_duration}")

    return video_duration
|
| 426 |
|
| 427 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
| 428 |
print("====process_transcript_and_screenshots_on_gcs====")
|
|
|
|
| 435 |
# 检查逐字稿是否存在
|
| 436 |
is_new_transcript = False
|
| 437 |
is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
|
| 438 |
+
video_duration = get_video_duration(video_id)
|
| 439 |
if not is_transcript_exists:
|
| 440 |
print("逐字稿文件不存在于GCS中,重新建立")
|
| 441 |
# 从YouTube获取逐字稿并上传
|
|
|
|
| 451 |
else:
|
| 452 |
print("沒有找到字幕")
|
| 453 |
transcript = generate_transcription_by_whisper(video_id)
|
| 454 |
+
if video_duration:
|
| 455 |
+
transcript = [entry for entry in transcript if entry['start'] <= video_duration]
|
| 456 |
+
|
| 457 |
transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
| 458 |
GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
|
| 459 |
|
|
|
|
| 463 |
print("逐字稿已存在于GCS中")
|
| 464 |
transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
|
| 465 |
transcript = json.loads(transcript_text)
|
| 466 |
+
if video_duration:
|
| 467 |
+
transcript = [entry for entry in transcript if entry['start'] <= video_duration]
|
| 468 |
|
| 469 |
# print("===確認其他衍生文件===")
|
| 470 |
# source = "gcs"
|