Spaces:

JunyiAcademy
/

vaitor2

Sleeping

App Files Files Community

youngtsai committed on Feb 24, 2024

Commit

1491bd4

1 Parent(s): f0d8f54

transcript = process_transcript_and_screenshots_on_gcs(video_id)

Browse files

Files changed (1) hide show

app.py +65 -2

app.py CHANGED Viewed

@@ -59,6 +59,7 @@ client = OpenAI(api_key=OPEN_AI_KEY)
 DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
 GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
 def init_gcs_client(service_account_key_string):
     """使用服务账号密钥文件创建 GCS 客户端"""
     credentials_json_string = service_account_key_string
@@ -112,6 +113,23 @@ def make_blob_public(gcs_client, bucket_name, blob_name):
     blob.make_public()
     print(f"Blob {blob_name} is now publicly accessible at {blob.public_url}")
 def copy_all_files_from_drive_to_gcs(drive_service, gcs_client, drive_folder_id, bucket_name, gcs_folder_name):
     # Get all files from the folder
     query = f"'{drive_folder_id}' in parents and trashed = false"
@@ -141,7 +159,7 @@ def copy_file_from_drive_to_gcs(drive_service, gcs_client, file_id, bucket_name,
     blob.upload_from_string(file_content)
     print(f"File {file_id} copied to GCS at {gcs_destination_path}.")
-# # ====drive====初始化Google Drive服务
 def init_drive_service():
     credentials_json_string = DRIVE_KEY
     credentials_dict = json.loads(credentials_json_string)
@@ -391,6 +409,50 @@ def process_transcript_and_screenshots(video_id):
     return transcript
 def process_youtube_link(link):
     # 使用 YouTube API 获取逐字稿
     # 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
@@ -400,7 +462,8 @@ def process_youtube_link(link):
     download_youtube_video(video_id, output_path=OUTPUT_PATH)
     try:
-        transcript = process_transcript_and_screenshots(video_id)
     except Exception as e:
         error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
         print("===process_youtube_link error===")

 DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
 GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
+# ====gcs====
 def init_gcs_client(service_account_key_string):
     """使用服务账号密钥文件创建 GCS 客户端"""
     credentials_json_string = service_account_key_string
     blob.make_public()
     print(f"Blob {blob_name} is now publicly accessible at {blob.public_url}")
+def get_blob_public_url(gcs_client, bucket_name, blob_name):
+    """获取指定 GCS 对象的公开 URL"""
+    bucket = gcs_client.bucket(bucket_name)
+    blob = bucket.blob(blob_name)
+    return blob.public_url
+def upload_img_and_get_public_url(gcs_client, bucket_name, file_name, file_path):
+    """上传图片到 GCS 并获取其公开 URL"""
+    # 上传图片
+    upload_file_to_gcs(gcs_client, bucket_name, file_name, file_path)
+    # 将上传的图片设置为公开
+    make_blob_public(gcs_client, bucket_name, file_name)
+    # 获取图片的公开 URL
+    public_url = get_blob_public_url(gcs_client, bucket_name, file_name)
+    print(f"Public URL for the uploaded image: {public_url}")
+    return public_url
 def copy_all_files_from_drive_to_gcs(drive_service, gcs_client, drive_folder_id, bucket_name, gcs_folder_name):
     # Get all files from the folder
     query = f"'{drive_folder_id}' in parents and trashed = false"
     blob.upload_from_string(file_content)
     print(f"File {file_id} copied to GCS at {gcs_destination_path}.")
+# # ====drive====初始化
 def init_drive_service():
     credentials_json_string = DRIVE_KEY
     credentials_dict = json.loads(credentials_json_string)
     return transcript
+def process_transcript_and_screenshots_on_gcs(video_id):
+    print("====process_transcript_and_screenshots_on_gcs====")
+    # GCS
+    gcs_client = init_gcs_client(GCS_KEY)
+    bucket_name = 'video_ai_assistant'
+    # 检查 folder 是否存在
+    is_gcs_exists = gcs_check_folder_exists(gcs_client, bucket_name, video_id)
+    if not is_gcs_exists:
+        gcs_create_bucket_folder_if_not_exists(gcs_client, bucket_name, video_id)
+        print("GCS folder:{video_id} 已创建")
+    else:
+        print("GCS folder:{video_id} 已存在")
+    # 逐字稿文件名
+    file_name = f'{video_id}_transcript.json'
+    # 检查逐字稿是否存在
+    exists = gcs_check_file_exists(gcs_client, bucket_name, file_name)
+    if not exists:
+        # 从YouTube获取逐字稿并上传
+        transcript = get_transcript(video_id)
+        if transcript:
+            print("成功獲取字幕")
+        else:
+            print("沒有找到字幕")
+        transcript
+    # 处理逐字稿中的每个条目，检查并上传截图 到 GCS，然後設定 GCS 權限
+    for entry in transcript:
+        if 'img_file_id' not in entry:
+            screenshot_path = screenshot_youtube_video(video_id, entry['start'])
+            img_file_id = upload_img_and_get_public_url(gcs_client, bucket_name, f"{video_id}_{entry['start']}.jpg", screenshot_path)
+            entry['img_file_id'] = img_file_id
+            print(f"截图已上传到GCS: {img_file_id}")
+    # 更新逐字稿文件
+    updated_transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
+    upload_file_to_gcs(gcs_client, bucket_name, file_name, updated_transcript_text)
+    print("逐字稿已更新，包括截图链接")
+    return transcript
 def process_youtube_link(link):
     # 使用 YouTube API 获取逐字稿
     # 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
     download_youtube_video(video_id, output_path=OUTPUT_PATH)
     try:
+        # transcript = process_transcript_and_screenshots(video_id)
+        transcript = process_transcript_and_screenshots_on_gcs(video_id)
     except Exception as e:
         error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
         print("===process_youtube_link error===")