Spaces:
Sleeping
Sleeping
update
Browse files
app.py
CHANGED
|
@@ -49,6 +49,7 @@ print(gr.__version__)
|
|
| 49 |
if is_env_local:
|
| 50 |
with open("local_config.json") as f:
|
| 51 |
config = json.load(f)
|
|
|
|
| 52 |
PASSWORD = config["PASSWORD"]
|
| 53 |
GCS_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
|
| 54 |
DRIVE_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
|
|
@@ -64,7 +65,9 @@ if is_env_local:
|
|
| 64 |
AWS_SECRET_KEY = config["AWS_SECRET_KEY"]
|
| 65 |
AWS_REGION_NAME = config["AWS_REGION_NAME"]
|
| 66 |
OUTPUT_PATH = config["OUTPUT_PATH"]
|
|
|
|
| 67 |
else:
|
|
|
|
| 68 |
PASSWORD = os.getenv("PASSWORD")
|
| 69 |
GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
| 70 |
DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
|
@@ -426,54 +429,14 @@ def get_video_duration(video_id):
|
|
| 426 |
|
| 427 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
| 428 |
print("====process_transcript_and_screenshots_on_gcs====")
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
# 检查逐字稿是否存在
|
| 435 |
-
is_new_transcript = False
|
| 436 |
-
is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
|
| 437 |
-
video_duration = get_video_duration(video_id)
|
| 438 |
-
if not is_transcript_exists:
|
| 439 |
-
print("逐字稿文件不存在于GCS中,重新建立")
|
| 440 |
-
# 从YouTube获取逐字稿并上传
|
| 441 |
-
try:
|
| 442 |
-
transcript = get_transcript_by_yt_api(video_id)
|
| 443 |
-
except:
|
| 444 |
-
# call open ai whisper
|
| 445 |
-
print("===call open ai whisper===")
|
| 446 |
-
transcript = generate_transcription_by_whisper(video_id)
|
| 447 |
-
|
| 448 |
-
if transcript:
|
| 449 |
-
print("成功獲取字幕")
|
| 450 |
-
else:
|
| 451 |
-
print("沒有找到字幕")
|
| 452 |
-
transcript = generate_transcription_by_whisper(video_id)
|
| 453 |
-
if video_duration:
|
| 454 |
-
transcript = [entry for entry in transcript if entry['start'] <= video_duration]
|
| 455 |
-
|
| 456 |
-
transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
| 457 |
-
GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
|
| 458 |
-
|
| 459 |
-
is_new_transcript = True
|
| 460 |
-
else:
|
| 461 |
-
# 逐字稿已存在,下载逐字稿内容
|
| 462 |
-
print("逐字稿已存在于GCS中")
|
| 463 |
-
transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
|
| 464 |
-
transcript = json.loads(transcript_text)
|
| 465 |
-
if video_duration:
|
| 466 |
-
transcript = [entry for entry in transcript if entry['start'] <= video_duration]
|
| 467 |
-
|
| 468 |
-
# print("===確認其他衍生文件===")
|
| 469 |
-
# source = "gcs"
|
| 470 |
-
# get_questions(video_id, transcript_text, source)
|
| 471 |
-
# get_video_id_summary(video_id, transcript_text, source)
|
| 472 |
-
# get_mind_map(video_id, transcript_text, source)
|
| 473 |
-
# print("===確認其他衍生文件 end ===")
|
| 474 |
-
|
| 475 |
|
| 476 |
# 處理截圖
|
|
|
|
| 477 |
for entry in transcript:
|
| 478 |
if 'img_file_id' not in entry:
|
| 479 |
# 檢查 OUTPUT_PATH 是否存在 video_id.mp4
|
|
@@ -488,38 +451,67 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
| 488 |
if i == 4:
|
| 489 |
raise gr.Error(f"下载视频失败: {str(e)}")
|
| 490 |
time.sleep(5)
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
|
|
|
|
|
|
| 498 |
|
| 499 |
-
# 確認是否更新逐字稿文件
|
| 500 |
if is_new_transcript:
|
| 501 |
-
# 更新逐字稿文件
|
| 502 |
-
print("===更新逐字稿文件===")
|
| 503 |
-
print(transcript)
|
| 504 |
print("===更新逐字稿文件===")
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 509 |
else:
|
| 510 |
-
|
|
|
|
| 511 |
|
| 512 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
|
| 514 |
def process_youtube_link(password, link):
|
| 515 |
verify_password(password)
|
| 516 |
-
|
| 517 |
-
# 使用 YouTube API 获取逐字稿
|
| 518 |
-
# 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
|
| 519 |
video_id = extract_youtube_id(link)
|
| 520 |
-
|
| 521 |
try:
|
| 522 |
-
|
|
|
|
|
|
|
|
|
|
| 523 |
except Exception as e:
|
| 524 |
error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
|
| 525 |
print("===process_youtube_link error===")
|
|
@@ -2615,6 +2607,8 @@ def init_params(text, request: gr.Request):
|
|
| 2615 |
chatbot_ai = gr.update(visible=False)
|
| 2616 |
ai_chatbot_params = gr.update(visible=True)
|
| 2617 |
|
|
|
|
|
|
|
| 2618 |
# if youtube_link in query_params
|
| 2619 |
if "youtube_id" in request.query_params:
|
| 2620 |
youtube_id = request.query_params["youtube_id"]
|
|
@@ -2633,11 +2627,15 @@ def init_params(text, request: gr.Request):
|
|
| 2633 |
lesson_plan_accordion = gr.update(visible=False)
|
| 2634 |
exit_ticket_accordion = gr.update(visible=False)
|
| 2635 |
ai_chatbot_params = gr.update(visible=False)
|
|
|
|
|
|
|
|
|
|
| 2636 |
|
| 2637 |
return admin, reading_passage_admin, summary_admin, see_detail, \
|
| 2638 |
worksheet_accordion, lesson_plan_accordion, exit_ticket_accordion, \
|
| 2639 |
password_text, youtube_link, \
|
| 2640 |
-
chatbot_open_ai_streaming, chatbot_ai, ai_chatbot_params
|
|
|
|
| 2641 |
|
| 2642 |
def update_state(content_subject, content_grade, trascript, key_moments, questions_answers):
|
| 2643 |
# inputs=[content_subject, content_grade, df_string_output],
|
|
@@ -2699,6 +2697,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
| 2699 |
# web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
|
| 2700 |
user_data = gr.Textbox(label="User Data", elem_id="user_data_input", visible=True)
|
| 2701 |
youtube_link_btn = gr.Button("Submit_YouTube_Link", elem_id="youtube_link_btn", visible=True)
|
|
|
|
| 2702 |
with gr.Row() as data_state:
|
| 2703 |
content_subject_state = gr.State() # 使用 gr.State 存储 content_subject
|
| 2704 |
content_grade_state = gr.State() # 使用 gr.State 存储 content_grade
|
|
@@ -3567,6 +3566,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
| 3567 |
chatbot_open_ai_streaming,
|
| 3568 |
chatbot_ai,
|
| 3569 |
ai_chatbot_params,
|
|
|
|
| 3570 |
]
|
| 3571 |
demo.load(
|
| 3572 |
init_params,
|
|
|
|
| 49 |
if is_env_local:
|
| 50 |
with open("local_config.json") as f:
|
| 51 |
config = json.load(f)
|
| 52 |
+
IS_ENV_PROD = "False"
|
| 53 |
PASSWORD = config["PASSWORD"]
|
| 54 |
GCS_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
|
| 55 |
DRIVE_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
|
|
|
|
| 65 |
AWS_SECRET_KEY = config["AWS_SECRET_KEY"]
|
| 66 |
AWS_REGION_NAME = config["AWS_REGION_NAME"]
|
| 67 |
OUTPUT_PATH = config["OUTPUT_PATH"]
|
| 68 |
+
|
| 69 |
else:
|
| 70 |
+
IS_ENV_PROD = os.getenv("IS_ENV_PROD", "False")
|
| 71 |
PASSWORD = os.getenv("PASSWORD")
|
| 72 |
GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
| 73 |
DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
|
|
|
| 429 |
|
| 430 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
| 431 |
print("====process_transcript_and_screenshots_on_gcs====")
|
| 432 |
+
transcript, exists = get_transcript_from_gcs(video_id)
|
| 433 |
+
if not exists:
|
| 434 |
+
print("Transcript file does not exist, creating new transcript...")
|
| 435 |
+
transcript = generate_transcription_by_whisper(video_id)
|
| 436 |
+
upload_transcript_to_gcs(video_id, transcript)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
|
| 438 |
# 處理截圖
|
| 439 |
+
is_new_transcript = False
|
| 440 |
for entry in transcript:
|
| 441 |
if 'img_file_id' not in entry:
|
| 442 |
# 檢查 OUTPUT_PATH 是否存在 video_id.mp4
|
|
|
|
| 451 |
if i == 4:
|
| 452 |
raise gr.Error(f"下载视频失败: {str(e)}")
|
| 453 |
time.sleep(5)
|
| 454 |
+
try:
|
| 455 |
+
screenshot_path = screenshot_youtube_video(video_id, entry['start'])
|
| 456 |
+
screenshot_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
|
| 457 |
+
img_file_id = GCS_SERVICE.upload_image_and_get_public_url('video_ai_assistant', screenshot_blob_name, screenshot_path)
|
| 458 |
+
entry['img_file_id'] = img_file_id
|
| 459 |
+
print(f"截图已上传到GCS: {img_file_id}")
|
| 460 |
+
is_new_transcript = True
|
| 461 |
+
except Exception as e:
|
| 462 |
+
print(f"Error processing screenshot: {str(e)}")
|
| 463 |
|
|
|
|
| 464 |
if is_new_transcript:
|
|
|
|
|
|
|
|
|
|
| 465 |
print("===更新逐字稿文件===")
|
| 466 |
+
upload_transcript_to_gcs(video_id, transcript)
|
| 467 |
+
|
| 468 |
+
return transcript
|
| 469 |
+
|
| 470 |
+
def get_transcript(video_id):
    """Return the transcript already stored in GCS for ``video_id``.

    This is a read-only check: nothing is generated or uploaded here.

    Raises:
        gr.Error: if no transcript is stored for the video, or if any
            transcript entry is missing the ``img_file_id`` key.
    """
    print("====get_transcript====")
    stored, found = get_transcript_from_gcs(video_id)
    if not found:
        raise gr.Error("逐字稿文件不存在於GCS中。")

    # Every entry must already carry its screenshot reference.
    for item in stored:
        if 'img_file_id' not in item:
            raise gr.Error("Some entries in the transcript do not have an associated img_file_id.")

    print("Transcript is verified with all necessary images.")
    return stored
|
| 481 |
+
|
| 482 |
+
def get_transcript_from_gcs(video_id):
    """Look up the transcript JSON for ``video_id`` in the GCS bucket.

    Returns:
        A ``(transcript, exists)`` tuple. When the blob is present the
        first element is the parsed JSON and the second is ``True``;
        otherwise the result is ``(None, False)``.
    """
    print("Checking for transcript in GCS...")
    bucket = 'video_ai_assistant'
    # Blob layout: <video_id>/<video_id>_transcript.json
    blob_path = f"{video_id}/{video_id}_transcript.json"

    # Guard clause: nothing has been stored for this video yet.
    if not GCS_SERVICE.check_file_exists(bucket, blob_path):
        print("No transcript found for video ID:", video_id)
        return None, False

    raw_json = GCS_SERVICE.download_as_string(bucket, blob_path)
    parsed = json.loads(raw_json)
    return parsed, True
|
| 496 |
|
| 497 |
+
def upload_transcript_to_gcs(video_id, transcript):
    """Serialize ``transcript`` to JSON and upload it to the video's GCS blob.

    The JSON is written with ``ensure_ascii=False`` so non-ASCII text is
    stored as-is, and with a two-space indent.
    """
    print("Uploading updated transcript to GCS...")
    # Blob layout: <video_id>/<video_id>_transcript.json
    destination = f"{video_id}/{video_id}_transcript.json"
    payload = json.dumps(transcript, ensure_ascii=False, indent=2)
    GCS_SERVICE.upload_json_string('video_ai_assistant', destination, payload)
    print("Transcript uploaded successfully.")
|
| 505 |
|
| 506 |
def process_youtube_link(password, link):
|
| 507 |
verify_password(password)
|
|
|
|
|
|
|
|
|
|
| 508 |
video_id = extract_youtube_id(link)
|
| 509 |
+
|
| 510 |
try:
|
| 511 |
+
if IS_ENV_PROD == "True":
|
| 512 |
+
transcript = get_transcript(video_id)
|
| 513 |
+
else:
|
| 514 |
+
transcript = process_transcript_and_screenshots_on_gcs(video_id)
|
| 515 |
except Exception as e:
|
| 516 |
error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
|
| 517 |
print("===process_youtube_link error===")
|
|
|
|
| 2607 |
chatbot_ai = gr.update(visible=False)
|
| 2608 |
ai_chatbot_params = gr.update(visible=True)
|
| 2609 |
|
| 2610 |
+
is_env_prod = gr.update(value=False)
|
| 2611 |
+
|
| 2612 |
# if youtube_link in query_params
|
| 2613 |
if "youtube_id" in request.query_params:
|
| 2614 |
youtube_id = request.query_params["youtube_id"]
|
|
|
|
| 2627 |
lesson_plan_accordion = gr.update(visible=False)
|
| 2628 |
exit_ticket_accordion = gr.update(visible=False)
|
| 2629 |
ai_chatbot_params = gr.update(visible=False)
|
| 2630 |
+
|
| 2631 |
+
if IS_ENV_PROD == "True":
|
| 2632 |
+
is_env_prod = gr.update(value=True)
|
| 2633 |
|
| 2634 |
return admin, reading_passage_admin, summary_admin, see_detail, \
|
| 2635 |
worksheet_accordion, lesson_plan_accordion, exit_ticket_accordion, \
|
| 2636 |
password_text, youtube_link, \
|
| 2637 |
+
chatbot_open_ai_streaming, chatbot_ai, ai_chatbot_params, \
|
| 2638 |
+
is_env_prod
|
| 2639 |
|
| 2640 |
def update_state(content_subject, content_grade, trascript, key_moments, questions_answers):
|
| 2641 |
# inputs=[content_subject, content_grade, df_string_output],
|
|
|
|
| 2697 |
# web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
|
| 2698 |
user_data = gr.Textbox(label="User Data", elem_id="user_data_input", visible=True)
|
| 2699 |
youtube_link_btn = gr.Button("Submit_YouTube_Link", elem_id="youtube_link_btn", visible=True)
|
| 2700 |
+
is_env_prod = gr.Checkbox(value=False, label="is_env_prod")
|
| 2701 |
with gr.Row() as data_state:
|
| 2702 |
content_subject_state = gr.State() # 使用 gr.State 存储 content_subject
|
| 2703 |
content_grade_state = gr.State() # 使用 gr.State 存储 content_grade
|
|
|
|
| 3566 |
chatbot_open_ai_streaming,
|
| 3567 |
chatbot_ai,
|
| 3568 |
ai_chatbot_params,
|
| 3569 |
+
is_env_prod,
|
| 3570 |
]
|
| 3571 |
demo.load(
|
| 3572 |
init_params,
|