Spaces:
Sleeping
Sleeping
GCS_SERVICE refactor
Browse files
app.py
CHANGED
|
@@ -93,87 +93,6 @@ def verify_password(password):
|
|
| 93 |
raise gr.Error("密碼錯誤")
|
| 94 |
|
| 95 |
# ====gcs====
|
| 96 |
-
def gcs_check_file_exists(gcs_client, bucket_name, file_name):
|
| 97 |
-
"""
|
| 98 |
-
检查 GCS 存储桶中是否存在指定的文件
|
| 99 |
-
file_name 格式:{folder_name}/{file_name}
|
| 100 |
-
"""
|
| 101 |
-
bucket = gcs_client.bucket(bucket_name)
|
| 102 |
-
blob = bucket.blob(file_name)
|
| 103 |
-
return blob.exists()
|
| 104 |
-
|
| 105 |
-
def upload_file_to_gcs(gcs_client, bucket_name, destination_blob_name, file_path):
|
| 106 |
-
"""上传文件到指定的 GCS 存储桶"""
|
| 107 |
-
bucket = gcs_client.bucket(bucket_name)
|
| 108 |
-
blob = bucket.blob(destination_blob_name)
|
| 109 |
-
blob.upload_from_filename(file_path)
|
| 110 |
-
print(f"File {file_path} uploaded to {destination_blob_name} in GCS.")
|
| 111 |
-
|
| 112 |
-
def upload_file_to_gcs_with_json_string(gcs_client, bucket_name, destination_blob_name, json_string):
|
| 113 |
-
"""上传字符串到指定的 GCS 存储桶"""
|
| 114 |
-
bucket = gcs_client.bucket(bucket_name)
|
| 115 |
-
blob = bucket.blob(destination_blob_name)
|
| 116 |
-
blob.upload_from_string(json_string)
|
| 117 |
-
print(f"JSON string uploaded to {destination_blob_name} in GCS.")
|
| 118 |
-
|
| 119 |
-
def download_blob_to_string(gcs_client, bucket_name, source_blob_name):
|
| 120 |
-
"""从 GCS 下载文件内容到字符串"""
|
| 121 |
-
bucket = gcs_client.bucket(bucket_name)
|
| 122 |
-
blob = bucket.blob(source_blob_name)
|
| 123 |
-
return blob.download_as_text()
|
| 124 |
-
|
| 125 |
-
def make_blob_public(gcs_client, bucket_name, blob_name):
|
| 126 |
-
"""将指定的 GCS 对象设置为公共可读"""
|
| 127 |
-
bucket = gcs_client.bucket(bucket_name)
|
| 128 |
-
blob = bucket.blob(blob_name)
|
| 129 |
-
blob.make_public()
|
| 130 |
-
print(f"Blob {blob_name} is now publicly accessible at {blob.public_url}")
|
| 131 |
-
|
| 132 |
-
def get_blob_public_url(gcs_client, bucket_name, blob_name):
|
| 133 |
-
"""获取指定 GCS 对象的公开 URL"""
|
| 134 |
-
bucket = gcs_client.bucket(bucket_name)
|
| 135 |
-
blob = bucket.blob(blob_name)
|
| 136 |
-
return blob.public_url
|
| 137 |
-
|
| 138 |
-
def upload_img_and_get_public_url(gcs_client, bucket_name, file_name, file_path):
|
| 139 |
-
"""上传图片到 GCS 并获取其公开 URL"""
|
| 140 |
-
# 上传图片
|
| 141 |
-
upload_file_to_gcs(gcs_client, bucket_name, file_name, file_path)
|
| 142 |
-
# 将上传的图片设置为公开
|
| 143 |
-
make_blob_public(gcs_client, bucket_name, file_name)
|
| 144 |
-
# 获取图片的公开 URL
|
| 145 |
-
public_url = get_blob_public_url(gcs_client, bucket_name, file_name)
|
| 146 |
-
print(f"Public URL for the uploaded image: {public_url}")
|
| 147 |
-
return public_url
|
| 148 |
-
|
| 149 |
-
def copy_all_files_from_drive_to_gcs(drive_service, gcs_client, drive_folder_id, bucket_name, gcs_folder_name):
|
| 150 |
-
# Get all files from the folder
|
| 151 |
-
query = f"'{drive_folder_id}' in parents and trashed = false"
|
| 152 |
-
response = drive_service.files().list(q=query).execute()
|
| 153 |
-
files = response.get('files', [])
|
| 154 |
-
for file in files:
|
| 155 |
-
# Copy each file to GCS
|
| 156 |
-
file_id = file['id']
|
| 157 |
-
file_name = file['name']
|
| 158 |
-
gcs_destination_path = f"{gcs_folder_name}/{file_name}"
|
| 159 |
-
copy_file_from_drive_to_gcs(drive_service, gcs_client, file_id, bucket_name, gcs_destination_path)
|
| 160 |
-
|
| 161 |
-
def copy_file_from_drive_to_gcs(drive_service, gcs_client, file_id, bucket_name, gcs_destination_path):
|
| 162 |
-
# Download file content from Drive
|
| 163 |
-
request = drive_service.files().get_media(fileId=file_id)
|
| 164 |
-
fh = io.BytesIO()
|
| 165 |
-
downloader = MediaIoBaseDownload(fh, request)
|
| 166 |
-
done = False
|
| 167 |
-
while not done:
|
| 168 |
-
status, done = downloader.next_chunk()
|
| 169 |
-
fh.seek(0)
|
| 170 |
-
file_content = fh.getvalue()
|
| 171 |
-
|
| 172 |
-
# Upload file content to GCS
|
| 173 |
-
bucket = gcs_client.bucket(bucket_name)
|
| 174 |
-
blob = bucket.blob(gcs_destination_path)
|
| 175 |
-
blob.upload_from_string(file_content)
|
| 176 |
-
print(f"File {file_id} copied to GCS at {gcs_destination_path}.")
|
| 177 |
|
| 178 |
def delete_blob(gcs_client, bucket_name, blob_name):
|
| 179 |
"""删除指定的 GCS 对象"""
|
|
@@ -483,12 +402,13 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
| 483 |
transcript = generate_transcription_by_whisper(video_id)
|
| 484 |
|
| 485 |
transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
| 486 |
-
|
|
|
|
| 487 |
is_new_transcript = True
|
| 488 |
else:
|
| 489 |
# 逐字稿已存在,下载逐字稿内容
|
| 490 |
print("逐字稿已存在于GCS中")
|
| 491 |
-
transcript_text =
|
| 492 |
transcript = json.loads(transcript_text)
|
| 493 |
|
| 494 |
# print("===確認其他衍生文件===")
|
|
@@ -517,7 +437,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
| 517 |
# 截图
|
| 518 |
screenshot_path = screenshot_youtube_video(video_id, entry['start'])
|
| 519 |
screenshot_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
|
| 520 |
-
img_file_id =
|
| 521 |
entry['img_file_id'] = img_file_id
|
| 522 |
print(f"截图已上传到GCS: {img_file_id}")
|
| 523 |
is_new_transcript = True
|
|
@@ -529,7 +449,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
| 529 |
print(transcript)
|
| 530 |
print("===更新逐字稿文件===")
|
| 531 |
updated_transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
| 532 |
-
|
| 533 |
print("逐字稿已更新,包括截图链接")
|
| 534 |
updated_transcript_json = json.loads(updated_transcript_text)
|
| 535 |
else:
|
|
@@ -723,12 +643,12 @@ def get_reading_passage(video_id, df_string, source):
|
|
| 723 |
reading_passage = generate_reading_passage(df_string)
|
| 724 |
reading_passage_json = {"reading_passage": str(reading_passage)}
|
| 725 |
reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
|
| 726 |
-
|
| 727 |
print("reading_passage已上传到GCS")
|
| 728 |
else:
|
| 729 |
# reading_passage已存在,下载内容
|
| 730 |
print("reading_passage已存在于GCS中")
|
| 731 |
-
reading_passage_text =
|
| 732 |
reading_passage_json = json.loads(reading_passage_text)
|
| 733 |
|
| 734 |
elif source == "drive":
|
|
@@ -805,12 +725,12 @@ def get_mind_map(video_id, df_string, source):
|
|
| 805 |
mind_map = generate_mind_map(df_string)
|
| 806 |
mind_map_json = {"mind_map": str(mind_map)}
|
| 807 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
| 808 |
-
|
| 809 |
print("mind_map已上傳到GCS")
|
| 810 |
else:
|
| 811 |
# mindmap已存在,下载内容
|
| 812 |
print("mind_map已存在于GCS中")
|
| 813 |
-
mind_map_text =
|
| 814 |
mind_map_json = json.loads(mind_map_text)
|
| 815 |
|
| 816 |
elif source == "drive":
|
|
@@ -889,12 +809,12 @@ def get_video_id_summary(video_id, df_string, source):
|
|
| 889 |
summary = generate_summarise(df_string, meta_data)
|
| 890 |
summary_json = {"summary": str(summary)}
|
| 891 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
| 892 |
-
|
| 893 |
print("summary已上传到GCS")
|
| 894 |
else:
|
| 895 |
# summary已存在,下载内容
|
| 896 |
print("summary已存在于GCS中")
|
| 897 |
-
summary_text =
|
| 898 |
summary_json = json.loads(summary_text)
|
| 899 |
|
| 900 |
elif source == "drive":
|
|
@@ -1012,12 +932,12 @@ def get_questions(video_id, df_string, source="gcs"):
|
|
| 1012 |
if not is_questions_exists:
|
| 1013 |
questions = generate_questions(df_string)
|
| 1014 |
questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
|
| 1015 |
-
|
| 1016 |
print("questions已上傳到GCS")
|
| 1017 |
else:
|
| 1018 |
# 逐字稿已存在,下载逐字稿内容
|
| 1019 |
print("questions已存在于GCS中")
|
| 1020 |
-
questions_text =
|
| 1021 |
questions = json.loads(questions_text)
|
| 1022 |
|
| 1023 |
elif source == "drive":
|
|
@@ -1103,12 +1023,12 @@ def get_questions_answers(video_id, df_string, source="gcs"):
|
|
| 1103 |
if not is_questions_answers_exists:
|
| 1104 |
questions_answers = generate_questions_answers(df_string)
|
| 1105 |
questions_answers_text = json.dumps(questions_answers, ensure_ascii=False, indent=2)
|
| 1106 |
-
|
| 1107 |
print("questions_answers已上傳到GCS")
|
| 1108 |
else:
|
| 1109 |
# questions_answers已存在,下载内容
|
| 1110 |
print("questions_answers已存在于GCS中")
|
| 1111 |
-
questions_answers_text =
|
| 1112 |
questions_answers = json.loads(questions_answers_text)
|
| 1113 |
except:
|
| 1114 |
questions = get_questions(video_id, df_string, source)
|
|
@@ -1202,12 +1122,12 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
|
|
| 1202 |
key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript)
|
| 1203 |
key_moments_json = {"key_moments": key_moments}
|
| 1204 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
| 1205 |
-
|
| 1206 |
print("key_moments已上傳到GCS")
|
| 1207 |
else:
|
| 1208 |
# key_moments已存在,下载内容
|
| 1209 |
print("key_moments已存在于GCS中")
|
| 1210 |
-
key_moments_text =
|
| 1211 |
key_moments_json = json.loads(key_moments_text)
|
| 1212 |
# 檢查 key_moments 是否有 keywords
|
| 1213 |
print("===檢查 key_moments 是否有 keywords===")
|
|
@@ -1222,8 +1142,8 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
|
|
| 1222 |
has_keywords_added = True
|
| 1223 |
if has_keywords_added:
|
| 1224 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
| 1225 |
-
|
| 1226 |
-
key_moments_text =
|
| 1227 |
key_moments_json = json.loads(key_moments_text)
|
| 1228 |
|
| 1229 |
elif source == "drive":
|
|
@@ -1545,7 +1465,7 @@ def get_LLM_content(video_id, kind):
|
|
| 1545 |
# 检查 file 是否存在
|
| 1546 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
| 1547 |
if is_file_exists:
|
| 1548 |
-
content =
|
| 1549 |
content_json = json.loads(content)
|
| 1550 |
if kind == "reading_passage_latex":
|
| 1551 |
content_text = content_json["reading_passage"]
|
|
@@ -1569,7 +1489,7 @@ def delete_LLM_content(video_id, kind):
|
|
| 1569 |
# 检查 file 是否存在
|
| 1570 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
| 1571 |
if is_file_exists:
|
| 1572 |
-
delete_blob(
|
| 1573 |
print(f"{file_name}已从GCS中删除")
|
| 1574 |
return gr.update(value="", interactive=False)
|
| 1575 |
|
|
@@ -1585,17 +1505,17 @@ def update_LLM_content(video_id, new_content, kind):
|
|
| 1585 |
print(new_content)
|
| 1586 |
reading_passage_json = {"reading_passage": str(new_content)}
|
| 1587 |
reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
|
| 1588 |
-
|
| 1589 |
updated_content = new_content
|
| 1590 |
elif kind == "summary_markdown":
|
| 1591 |
summary_json = {"summary": str(new_content)}
|
| 1592 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
| 1593 |
-
|
| 1594 |
updated_content = new_content
|
| 1595 |
elif kind == "mind_map":
|
| 1596 |
mind_map_json = {"mind_map": str(new_content)}
|
| 1597 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
| 1598 |
-
|
| 1599 |
updated_content = mind_map_text
|
| 1600 |
elif kind == "key_moments":
|
| 1601 |
# from update_LLM_btn -> new_content is a string
|
|
@@ -1606,7 +1526,7 @@ def update_LLM_content(video_id, new_content, kind):
|
|
| 1606 |
key_moments_list = new_content
|
| 1607 |
key_moments_json = {"key_moments": key_moments_list}
|
| 1608 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
| 1609 |
-
|
| 1610 |
updated_content = key_moments_text
|
| 1611 |
elif kind == "transcript":
|
| 1612 |
if isinstance(new_content, str):
|
|
@@ -1614,7 +1534,7 @@ def update_LLM_content(video_id, new_content, kind):
|
|
| 1614 |
else:
|
| 1615 |
transcript_json = new_content
|
| 1616 |
transcript_text = json.dumps(transcript_json, ensure_ascii=False, indent=2)
|
| 1617 |
-
|
| 1618 |
updated_content = transcript_text
|
| 1619 |
elif kind == "questions":
|
| 1620 |
# from update_LLM_btn -> new_content is a string
|
|
@@ -1624,7 +1544,7 @@ def update_LLM_content(video_id, new_content, kind):
|
|
| 1624 |
else:
|
| 1625 |
questions_json = new_content
|
| 1626 |
questions_text = json.dumps(questions_json, ensure_ascii=False, indent=2)
|
| 1627 |
-
|
| 1628 |
updated_content = questions_text
|
| 1629 |
elif kind == "questions_answers":
|
| 1630 |
# from update_LLM_btn -> new_content is a string
|
|
@@ -1634,7 +1554,7 @@ def update_LLM_content(video_id, new_content, kind):
|
|
| 1634 |
else:
|
| 1635 |
questions_answers_json = new_content
|
| 1636 |
questions_answers_text = json.dumps(questions_answers_json, ensure_ascii=False, indent=2)
|
| 1637 |
-
|
| 1638 |
updated_content = questions_answers_text
|
| 1639 |
|
| 1640 |
print(f"{kind} 已更新到GCS")
|
|
@@ -1701,7 +1621,7 @@ def reading_passage_add_latex_version(video_id):
|
|
| 1701 |
|
| 1702 |
# 逐字稿已存在,下载逐字稿内容
|
| 1703 |
print("reading_passage 已存在于GCS中,轉換 Latex 模式")
|
| 1704 |
-
reading_passage_text =
|
| 1705 |
reading_passage_json = json.loads(reading_passage_text)
|
| 1706 |
original_reading_passage = reading_passage_json["reading_passage"]
|
| 1707 |
sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,使用 zh-TW"
|
|
@@ -1734,7 +1654,7 @@ def reading_passage_add_latex_version(video_id):
|
|
| 1734 |
# 另存為 reading_passage_latex.json
|
| 1735 |
new_file_name = f'{video_id}_reading_passage_latex.json'
|
| 1736 |
new_blob_name = f"{video_id}/{new_file_name}"
|
| 1737 |
-
|
| 1738 |
|
| 1739 |
return new_reading_passage
|
| 1740 |
|
|
@@ -1754,7 +1674,7 @@ def summary_add_markdown_version(video_id):
|
|
| 1754 |
|
| 1755 |
# 逐字稿已存在,下载逐字稿内容
|
| 1756 |
print("summary 已存在于GCS中,轉換 Markdown 模式")
|
| 1757 |
-
summary_text =
|
| 1758 |
summary_json = json.loads(summary_text)
|
| 1759 |
original_summary = summary_json["summary"]
|
| 1760 |
sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,使用 zh-TW"
|
|
@@ -1803,7 +1723,7 @@ def summary_add_markdown_version(video_id):
|
|
| 1803 |
# 另存為 summary_markdown.json
|
| 1804 |
new_file_name = f'{video_id}_summary_markdown.json'
|
| 1805 |
new_blob_name = f"{video_id}/{new_file_name}"
|
| 1806 |
-
|
| 1807 |
|
| 1808 |
return new_summary
|
| 1809 |
|
|
@@ -1827,7 +1747,7 @@ def get_meta_data(video_id, source="gcs"):
|
|
| 1827 |
else:
|
| 1828 |
# meta_data已存在,下载内容
|
| 1829 |
print("meta_data已存在于GCS中")
|
| 1830 |
-
meta_data_text =
|
| 1831 |
meta_data_json = json.loads(meta_data_text)
|
| 1832 |
|
| 1833 |
# meta_data_json grade 數字轉換成文字
|
|
@@ -1865,11 +1785,11 @@ def get_ai_content(password, video_id, df_string, topic, grade, level, specific_
|
|
| 1865 |
# 先建立一個 ai_content_list.json
|
| 1866 |
ai_content_list = []
|
| 1867 |
ai_content_text = json.dumps(ai_content_list, ensure_ascii=False, indent=2)
|
| 1868 |
-
|
| 1869 |
print("ai_content_list [] 已上傳到GCS")
|
| 1870 |
|
| 1871 |
# 此時 ai_content_list 已存在
|
| 1872 |
-
ai_content_list_string =
|
| 1873 |
ai_content_list = json.loads(ai_content_list_string)
|
| 1874 |
# by key 找到 ai_content (topic, grade, level, specific_feature, content_type)
|
| 1875 |
target_kvs = {
|
|
@@ -1896,7 +1816,7 @@ def get_ai_content(password, video_id, df_string, topic, grade, level, specific_
|
|
| 1896 |
|
| 1897 |
ai_content_list.append(ai_content_json)
|
| 1898 |
ai_content_text = json.dumps(ai_content_list, ensure_ascii=False, indent=2)
|
| 1899 |
-
|
| 1900 |
print("ai_content已上傳到GCS")
|
| 1901 |
else:
|
| 1902 |
ai_content_json = ai_content_json[-1]
|
|
|
|
| 93 |
raise gr.Error("密碼錯誤")
|
| 94 |
|
| 95 |
# ====gcs====
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
def delete_blob(gcs_client, bucket_name, blob_name):
|
| 98 |
"""删除指定的 GCS 对象"""
|
|
|
|
| 402 |
transcript = generate_transcription_by_whisper(video_id)
|
| 403 |
|
| 404 |
transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
| 405 |
+
GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
|
| 406 |
+
|
| 407 |
is_new_transcript = True
|
| 408 |
else:
|
| 409 |
# 逐字稿已存在,下载逐字稿内容
|
| 410 |
print("逐字稿已存在于GCS中")
|
| 411 |
+
transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
|
| 412 |
transcript = json.loads(transcript_text)
|
| 413 |
|
| 414 |
# print("===確認其他衍生文件===")
|
|
|
|
| 437 |
# 截图
|
| 438 |
screenshot_path = screenshot_youtube_video(video_id, entry['start'])
|
| 439 |
screenshot_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
|
| 440 |
+
img_file_id = GCS_SERVICE.upload_image_and_get_public_url(bucket_name, screenshot_blob_name, screenshot_path)
|
| 441 |
entry['img_file_id'] = img_file_id
|
| 442 |
print(f"截图已上传到GCS: {img_file_id}")
|
| 443 |
is_new_transcript = True
|
|
|
|
| 449 |
print(transcript)
|
| 450 |
print("===更新逐字稿文件===")
|
| 451 |
updated_transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
| 452 |
+
GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, updated_transcript_text)
|
| 453 |
print("逐字稿已更新,包括截图链接")
|
| 454 |
updated_transcript_json = json.loads(updated_transcript_text)
|
| 455 |
else:
|
|
|
|
| 643 |
reading_passage = generate_reading_passage(df_string)
|
| 644 |
reading_passage_json = {"reading_passage": str(reading_passage)}
|
| 645 |
reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
|
| 646 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, reading_passage_text)
|
| 647 |
print("reading_passage已上传到GCS")
|
| 648 |
else:
|
| 649 |
# reading_passage已存在,下载内容
|
| 650 |
print("reading_passage已存在于GCS中")
|
| 651 |
+
reading_passage_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
| 652 |
reading_passage_json = json.loads(reading_passage_text)
|
| 653 |
|
| 654 |
elif source == "drive":
|
|
|
|
| 725 |
mind_map = generate_mind_map(df_string)
|
| 726 |
mind_map_json = {"mind_map": str(mind_map)}
|
| 727 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
| 728 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, mind_map_text)
|
| 729 |
print("mind_map已上傳到GCS")
|
| 730 |
else:
|
| 731 |
# mindmap已存在,下载内容
|
| 732 |
print("mind_map已存在于GCS中")
|
| 733 |
+
mind_map_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
| 734 |
mind_map_json = json.loads(mind_map_text)
|
| 735 |
|
| 736 |
elif source == "drive":
|
|
|
|
| 809 |
summary = generate_summarise(df_string, meta_data)
|
| 810 |
summary_json = {"summary": str(summary)}
|
| 811 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
| 812 |
+
GCS_SERVICE.upload_json_string(bucket_name, summary_file_blob_name, summary_text)
|
| 813 |
print("summary已上传到GCS")
|
| 814 |
else:
|
| 815 |
# summary已存在,下载内容
|
| 816 |
print("summary已存在于GCS中")
|
| 817 |
+
summary_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
| 818 |
summary_json = json.loads(summary_text)
|
| 819 |
|
| 820 |
elif source == "drive":
|
|
|
|
| 932 |
if not is_questions_exists:
|
| 933 |
questions = generate_questions(df_string)
|
| 934 |
questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
|
| 935 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_text)
|
| 936 |
print("questions已上傳到GCS")
|
| 937 |
else:
|
| 938 |
# 逐字稿已存在,下载逐字稿内容
|
| 939 |
print("questions已存在于GCS中")
|
| 940 |
+
questions_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
| 941 |
questions = json.loads(questions_text)
|
| 942 |
|
| 943 |
elif source == "drive":
|
|
|
|
| 1023 |
if not is_questions_answers_exists:
|
| 1024 |
questions_answers = generate_questions_answers(df_string)
|
| 1025 |
questions_answers_text = json.dumps(questions_answers, ensure_ascii=False, indent=2)
|
| 1026 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_answers_text)
|
| 1027 |
print("questions_answers已上傳到GCS")
|
| 1028 |
else:
|
| 1029 |
# questions_answers已存在,下载内容
|
| 1030 |
print("questions_answers已存在于GCS中")
|
| 1031 |
+
questions_answers_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
| 1032 |
questions_answers = json.loads(questions_answers_text)
|
| 1033 |
except:
|
| 1034 |
questions = get_questions(video_id, df_string, source)
|
|
|
|
| 1122 |
key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript)
|
| 1123 |
key_moments_json = {"key_moments": key_moments}
|
| 1124 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
| 1125 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_text)
|
| 1126 |
print("key_moments已上傳到GCS")
|
| 1127 |
else:
|
| 1128 |
# key_moments已存在,下载内容
|
| 1129 |
print("key_moments已存在于GCS中")
|
| 1130 |
+
key_moments_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
| 1131 |
key_moments_json = json.loads(key_moments_text)
|
| 1132 |
# 檢查 key_moments 是否有 keywords
|
| 1133 |
print("===檢查 key_moments 是否有 keywords===")
|
|
|
|
| 1142 |
has_keywords_added = True
|
| 1143 |
if has_keywords_added:
|
| 1144 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
| 1145 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_text)
|
| 1146 |
+
key_moments_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
| 1147 |
key_moments_json = json.loads(key_moments_text)
|
| 1148 |
|
| 1149 |
elif source == "drive":
|
|
|
|
| 1465 |
# 检查 file 是否存在
|
| 1466 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
| 1467 |
if is_file_exists:
|
| 1468 |
+
content = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
| 1469 |
content_json = json.loads(content)
|
| 1470 |
if kind == "reading_passage_latex":
|
| 1471 |
content_text = content_json["reading_passage"]
|
|
|
|
| 1489 |
# 检查 file 是否存在
|
| 1490 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
| 1491 |
if is_file_exists:
|
| 1492 |
+
GCS_SERVICE.delete_blob(bucket_name, blob_name)
|
| 1493 |
print(f"{file_name}已从GCS中删除")
|
| 1494 |
return gr.update(value="", interactive=False)
|
| 1495 |
|
|
|
|
| 1505 |
print(new_content)
|
| 1506 |
reading_passage_json = {"reading_passage": str(new_content)}
|
| 1507 |
reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
|
| 1508 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, reading_passage_text)
|
| 1509 |
updated_content = new_content
|
| 1510 |
elif kind == "summary_markdown":
|
| 1511 |
summary_json = {"summary": str(new_content)}
|
| 1512 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
| 1513 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, summary_text)
|
| 1514 |
updated_content = new_content
|
| 1515 |
elif kind == "mind_map":
|
| 1516 |
mind_map_json = {"mind_map": str(new_content)}
|
| 1517 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
| 1518 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, mind_map_text)
|
| 1519 |
updated_content = mind_map_text
|
| 1520 |
elif kind == "key_moments":
|
| 1521 |
# from update_LLM_btn -> new_content is a string
|
|
|
|
| 1526 |
key_moments_list = new_content
|
| 1527 |
key_moments_json = {"key_moments": key_moments_list}
|
| 1528 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
| 1529 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_text)
|
| 1530 |
updated_content = key_moments_text
|
| 1531 |
elif kind == "transcript":
|
| 1532 |
if isinstance(new_content, str):
|
|
|
|
| 1534 |
else:
|
| 1535 |
transcript_json = new_content
|
| 1536 |
transcript_text = json.dumps(transcript_json, ensure_ascii=False, indent=2)
|
| 1537 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, transcript_text)
|
| 1538 |
updated_content = transcript_text
|
| 1539 |
elif kind == "questions":
|
| 1540 |
# from update_LLM_btn -> new_content is a string
|
|
|
|
| 1544 |
else:
|
| 1545 |
questions_json = new_content
|
| 1546 |
questions_text = json.dumps(questions_json, ensure_ascii=False, indent=2)
|
| 1547 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_text)
|
| 1548 |
updated_content = questions_text
|
| 1549 |
elif kind == "questions_answers":
|
| 1550 |
# from update_LLM_btn -> new_content is a string
|
|
|
|
| 1554 |
else:
|
| 1555 |
questions_answers_json = new_content
|
| 1556 |
questions_answers_text = json.dumps(questions_answers_json, ensure_ascii=False, indent=2)
|
| 1557 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_answers_text)
|
| 1558 |
updated_content = questions_answers_text
|
| 1559 |
|
| 1560 |
print(f"{kind} 已更新到GCS")
|
|
|
|
| 1621 |
|
| 1622 |
# 逐字稿已存在,下载逐字稿内容
|
| 1623 |
print("reading_passage 已存在于GCS中,轉換 Latex 模式")
|
| 1624 |
+
reading_passage_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
| 1625 |
reading_passage_json = json.loads(reading_passage_text)
|
| 1626 |
original_reading_passage = reading_passage_json["reading_passage"]
|
| 1627 |
sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,使用 zh-TW"
|
|
|
|
| 1654 |
# 另存為 reading_passage_latex.json
|
| 1655 |
new_file_name = f'{video_id}_reading_passage_latex.json'
|
| 1656 |
new_blob_name = f"{video_id}/{new_file_name}"
|
| 1657 |
+
GCS_SERVICE.upload_json_string(bucket_name, new_blob_name, reading_passage_text)
|
| 1658 |
|
| 1659 |
return new_reading_passage
|
| 1660 |
|
|
|
|
| 1674 |
|
| 1675 |
# 逐字稿已存在,下载逐字稿内容
|
| 1676 |
print("summary 已存在于GCS中,轉換 Markdown 模式")
|
| 1677 |
+
summary_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
| 1678 |
summary_json = json.loads(summary_text)
|
| 1679 |
original_summary = summary_json["summary"]
|
| 1680 |
sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,使用 zh-TW"
|
|
|
|
| 1723 |
# 另存為 summary_markdown.json
|
| 1724 |
new_file_name = f'{video_id}_summary_markdown.json'
|
| 1725 |
new_blob_name = f"{video_id}/{new_file_name}"
|
| 1726 |
+
GCS_SERVICE.upload_json_string(bucket_name, new_blob_name, summary_text)
|
| 1727 |
|
| 1728 |
return new_summary
|
| 1729 |
|
|
|
|
| 1747 |
else:
|
| 1748 |
# meta_data已存在,下载内容
|
| 1749 |
print("meta_data已存在于GCS中")
|
| 1750 |
+
meta_data_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
| 1751 |
meta_data_json = json.loads(meta_data_text)
|
| 1752 |
|
| 1753 |
# meta_data_json grade 數字轉換成文字
|
|
|
|
| 1785 |
# 先建立一個 ai_content_list.json
|
| 1786 |
ai_content_list = []
|
| 1787 |
ai_content_text = json.dumps(ai_content_list, ensure_ascii=False, indent=2)
|
| 1788 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, ai_content_text)
|
| 1789 |
print("ai_content_list [] 已上傳到GCS")
|
| 1790 |
|
| 1791 |
# 此時 ai_content_list 已存在
|
| 1792 |
+
ai_content_list_string = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
| 1793 |
ai_content_list = json.loads(ai_content_list_string)
|
| 1794 |
# by key 找到 ai_content (topic, grade, level, specific_feature, content_type)
|
| 1795 |
target_kvs = {
|
|
|
|
| 1816 |
|
| 1817 |
ai_content_list.append(ai_content_json)
|
| 1818 |
ai_content_text = json.dumps(ai_content_list, ensure_ascii=False, indent=2)
|
| 1819 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, ai_content_text)
|
| 1820 |
print("ai_content已上傳到GCS")
|
| 1821 |
else:
|
| 1822 |
ai_content_json = ai_content_json[-1]
|