Spaces:
Sleeping
Sleeping
transcript = process_transcript_and_screenshots_on_gcs(video_id)
Browse files
app.py
CHANGED
|
@@ -59,6 +59,7 @@ client = OpenAI(api_key=OPEN_AI_KEY)
|
|
| 59 |
DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
| 60 |
GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
| 61 |
|
|
|
|
| 62 |
def init_gcs_client(service_account_key_string):
|
| 63 |
"""使用服务账号密钥文件创建 GCS 客户端"""
|
| 64 |
credentials_json_string = service_account_key_string
|
|
@@ -112,6 +113,23 @@ def make_blob_public(gcs_client, bucket_name, blob_name):
|
|
| 112 |
blob.make_public()
|
| 113 |
print(f"Blob {blob_name} is now publicly accessible at {blob.public_url}")
|
| 114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
def copy_all_files_from_drive_to_gcs(drive_service, gcs_client, drive_folder_id, bucket_name, gcs_folder_name):
|
| 116 |
# Get all files from the folder
|
| 117 |
query = f"'{drive_folder_id}' in parents and trashed = false"
|
|
@@ -141,7 +159,7 @@ def copy_file_from_drive_to_gcs(drive_service, gcs_client, file_id, bucket_name,
|
|
| 141 |
blob.upload_from_string(file_content)
|
| 142 |
print(f"File {file_id} copied to GCS at {gcs_destination_path}.")
|
| 143 |
|
| 144 |
-
# # ====drive====初始化
|
| 145 |
def init_drive_service():
|
| 146 |
credentials_json_string = DRIVE_KEY
|
| 147 |
credentials_dict = json.loads(credentials_json_string)
|
|
@@ -391,6 +409,50 @@ def process_transcript_and_screenshots(video_id):
|
|
| 391 |
|
| 392 |
return transcript
|
| 393 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
def process_youtube_link(link):
|
| 395 |
# 使用 YouTube API 获取逐字稿
|
| 396 |
# 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
|
|
@@ -400,7 +462,8 @@ def process_youtube_link(link):
|
|
| 400 |
download_youtube_video(video_id, output_path=OUTPUT_PATH)
|
| 401 |
|
| 402 |
try:
|
| 403 |
-
transcript = process_transcript_and_screenshots(video_id)
|
|
|
|
| 404 |
except Exception as e:
|
| 405 |
error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
|
| 406 |
print("===process_youtube_link error===")
|
|
|
|
| 59 |
# Service-account credentials (JSON string) used for the Drive service, read from the environment.
DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
| 60 |
# Credentials string passed to init_gcs_client; read from the GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable.
GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
| 61 |
|
| 62 |
+
# ====gcs====
|
| 63 |
def init_gcs_client(service_account_key_string):
|
| 64 |
"""使用服务账号密钥文件创建 GCS 客户端"""
|
| 65 |
credentials_json_string = service_account_key_string
|
|
|
|
| 113 |
blob.make_public()
|
| 114 |
print(f"Blob {blob_name} is now publicly accessible at {blob.public_url}")
|
| 115 |
|
| 116 |
+
def get_blob_public_url(gcs_client, bucket_name, blob_name):
    """Return the public URL of a GCS object.

    Args:
        gcs_client: GCS client exposing ``bucket(name)``.
        bucket_name: Name of the bucket holding the object.
        blob_name: Name of the object inside the bucket.

    Returns:
        The ``public_url`` attribute of the blob handle.

    NOTE(review): this only reads the URL from the blob handle; it does not
    call ``make_public`` itself.
    """
    return gcs_client.bucket(bucket_name).blob(blob_name).public_url
|
| 121 |
+
|
| 122 |
+
def upload_img_and_get_public_url(gcs_client, bucket_name, file_name, file_path):
    """Upload an image to GCS, mark it public, and return its public URL.

    Args:
        gcs_client: GCS client forwarded to the upload/publish helpers.
        bucket_name: Destination bucket name.
        file_name: Object name to store the image under.
        file_path: Forwarded to ``upload_file_to_gcs`` as the upload source.

    Returns:
        The public URL reported by ``get_blob_public_url`` for the object.
    """
    # Upload, then publish the object via make_blob_public.
    upload_file_to_gcs(gcs_client, bucket_name, file_name, file_path)
    make_blob_public(gcs_client, bucket_name, file_name)

    # Look up the object's public URL and log it before returning.
    image_url = get_blob_public_url(gcs_client, bucket_name, file_name)
    print(f"Public URL for the uploaded image: {image_url}")
    return image_url
|
| 132 |
+
|
| 133 |
def copy_all_files_from_drive_to_gcs(drive_service, gcs_client, drive_folder_id, bucket_name, gcs_folder_name):
|
| 134 |
# Get all files from the folder
|
| 135 |
query = f"'{drive_folder_id}' in parents and trashed = false"
|
|
|
|
| 159 |
blob.upload_from_string(file_content)
|
| 160 |
print(f"File {file_id} copied to GCS at {gcs_destination_path}.")
|
| 161 |
|
| 162 |
+
# ====drive==== initialization
|
| 163 |
def init_drive_service():
|
| 164 |
credentials_json_string = DRIVE_KEY
|
| 165 |
credentials_dict = json.loads(credentials_json_string)
|
|
|
|
| 409 |
|
| 410 |
return transcript
|
| 411 |
|
| 412 |
+
def process_transcript_and_screenshots_on_gcs(video_id):
    """Fetch or load the transcript for a video and attach screenshot URLs, using GCS.

    The transcript is kept in the bucket as ``{video_id}_transcript.json``.
    When that object is missing, the transcript is fetched with
    ``get_transcript``; when it already exists, it is loaded back from GCS.
    Every entry without an ``img_file_id`` key gets a screenshot taken at
    ``entry['start']``, uploaded and made public, and the resulting public URL
    is stored under ``img_file_id``. The transcript is then written back to GCS.

    Args:
        video_id: Video id; used as the GCS folder name and as the prefix of
            the transcript and screenshot object names.

    Returns:
        The transcript entries, each carrying an ``img_file_id`` key.

    Raises:
        ValueError: If no transcript could be obtained for the video.
    """
    print("====process_transcript_and_screenshots_on_gcs====")
    # GCS
    gcs_client = init_gcs_client(GCS_KEY)
    bucket_name = 'video_ai_assistant'

    # Make sure the per-video folder exists in the bucket.
    is_gcs_exists = gcs_check_folder_exists(gcs_client, bucket_name, video_id)
    if not is_gcs_exists:
        gcs_create_bucket_folder_if_not_exists(gcs_client, bucket_name, video_id)
        # f-prefix added: the message previously printed the literal "{video_id}".
        print(f"GCS folder:{video_id} 已创建")
    else:
        print(f"GCS folder:{video_id} 已存在")

    # Transcript object name.
    file_name = f'{video_id}_transcript.json'
    exists = gcs_check_file_exists(gcs_client, bucket_name, file_name)
    if not exists:
        # No stored transcript yet: fetch it from YouTube.
        transcript = get_transcript(video_id)
        if transcript:
            print("成功獲取字幕")
        else:
            print("沒有找到字幕")
    else:
        # A transcript is already stored: load it instead of leaving
        # `transcript` unbound (which made the loop below fail).
        # NOTE(review): assumes gcs_check_file_exists checks the object named
        # `file_name` at this exact path - confirm against that helper.
        print("逐字稿已存在于GCS中")
        transcript_text = gcs_client.bucket(bucket_name).blob(file_name).download_as_text()
        transcript = json.loads(transcript_text)

    if transcript is None:
        # Fail with an explicit message rather than a TypeError from the loop.
        raise ValueError(f"No transcript available for video {video_id}")

    # For every entry lacking a screenshot, capture one, upload it to GCS,
    # make it public, and record its public URL on the entry.
    for entry in transcript:
        if 'img_file_id' not in entry:
            screenshot_path = screenshot_youtube_video(video_id, entry['start'])
            img_file_id = upload_img_and_get_public_url(
                gcs_client,
                bucket_name,
                f"{video_id}_{entry['start']}.jpg",
                screenshot_path,
            )
            entry['img_file_id'] = img_file_id
            print(f"截图已上传到GCS: {img_file_id}")

    # Write the updated transcript back to GCS.
    # NOTE(review): upload_file_to_gcs receives text content here but a local
    # file path in upload_img_and_get_public_url - confirm it supports both.
    updated_transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
    upload_file_to_gcs(gcs_client, bucket_name, file_name, updated_transcript_text)
    print("逐字稿已更新,包括截图链接")

    return transcript
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
|
| 456 |
def process_youtube_link(link):
|
| 457 |
# 使用 YouTube API 获取逐字稿
|
| 458 |
# 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
|
|
|
|
| 462 |
download_youtube_video(video_id, output_path=OUTPUT_PATH)
|
| 463 |
|
| 464 |
try:
|
| 465 |
+
# transcript = process_transcript_and_screenshots(video_id)
|
| 466 |
+
transcript = process_transcript_and_screenshots_on_gcs(video_id)
|
| 467 |
except Exception as e:
|
| 468 |
error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
|
| 469 |
print("===process_youtube_link error===")
|