Spaces:
Sleeping
Sleeping
update
Browse files
app.py
CHANGED
|
@@ -503,7 +503,7 @@ def upload_transcript_to_gcs(video_id, transcript):
|
|
| 503 |
GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
|
| 504 |
print("Transcript uploaded successfully.")
|
| 505 |
|
| 506 |
-
def process_youtube_link(password, link):
|
| 507 |
verify_password(password)
|
| 508 |
video_id = extract_youtube_id(link)
|
| 509 |
|
|
@@ -545,21 +545,21 @@ def process_youtube_link(password, link):
|
|
| 545 |
|
| 546 |
# 基于逐字稿生成其他所需的输出
|
| 547 |
source = "gcs"
|
| 548 |
-
questions_answers = get_questions_answers(video_id, formatted_simple_transcript, source)
|
| 549 |
questions_answers_json = json.dumps(questions_answers, ensure_ascii=False, indent=2)
|
| 550 |
-
summary_json = get_video_id_summary(video_id, formatted_simple_transcript, source)
|
| 551 |
summary_text = summary_json["summary"]
|
| 552 |
summary = summary_json["summary"]
|
| 553 |
-
key_moments_json = get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source)
|
| 554 |
key_moments = key_moments_json["key_moments"]
|
| 555 |
key_moments_text = json.dumps(key_moments, ensure_ascii=False, indent=2)
|
| 556 |
key_moments_html = get_key_moments_html(key_moments)
|
| 557 |
html_content = format_transcript_to_html(formatted_transcript)
|
| 558 |
simple_html_content = format_simple_transcript_to_html(formatted_simple_transcript)
|
| 559 |
-
mind_map_json = get_mind_map(video_id, formatted_simple_transcript, source)
|
| 560 |
mind_map = mind_map_json["mind_map"]
|
| 561 |
mind_map_html = get_mind_map_html(mind_map)
|
| 562 |
-
reading_passage_json = get_reading_passage(video_id, formatted_simple_transcript, source)
|
| 563 |
reading_passage_text = reading_passage_json["reading_passage"]
|
| 564 |
reading_passage = reading_passage_json["reading_passage"]
|
| 565 |
meta_data = get_meta_data(video_id)
|
|
@@ -703,70 +703,75 @@ def split_data(df_string, word_base=100000):
|
|
| 703 |
|
| 704 |
return segments
|
| 705 |
|
| 706 |
-
def
|
| 707 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 708 |
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
# 使用 OPEN AI 生成 Reading Passage
|
| 712 |
-
messages = [
|
| 713 |
-
{"role": "system", "content": sys_content},
|
| 714 |
-
{"role": "user", "content": user_content}
|
| 715 |
-
]
|
| 716 |
|
| 717 |
-
|
| 718 |
-
|
| 719 |
-
|
| 720 |
-
"max_tokens": 4000,
|
| 721 |
-
"response_format": response_format
|
| 722 |
-
}
|
| 723 |
|
| 724 |
-
|
| 725 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 726 |
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
except Exception as e:
|
| 730 |
-
print(f"Error generating reading passage: {str(e)}")
|
| 731 |
-
print("using REDROCK")
|
| 732 |
-
# 使用 REDROCK 生成 Reading Passage
|
| 733 |
-
messages = [
|
| 734 |
-
{"role": "user", "content": user_content}
|
| 735 |
-
]
|
| 736 |
-
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
|
| 737 |
-
# model_id = "anthropic.claude-3-haiku-20240307-v1:0"
|
| 738 |
-
kwargs = {
|
| 739 |
-
"modelId": model_id,
|
| 740 |
-
"contentType": "application/json",
|
| 741 |
-
"accept": "application/json",
|
| 742 |
-
"body": json.dumps({
|
| 743 |
-
"anthropic_version": "bedrock-2023-05-31",
|
| 744 |
-
"max_tokens": 4000,
|
| 745 |
-
"system": sys_content,
|
| 746 |
-
"messages": messages
|
| 747 |
-
})
|
| 748 |
-
}
|
| 749 |
-
response = BEDROCK_CLIENT.invoke_model(**kwargs)
|
| 750 |
-
response_body = json.loads(response.get('body').read())
|
| 751 |
-
content = response_body.get('content')[0].get('text')
|
| 752 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 753 |
print("=====content=====")
|
| 754 |
print(content)
|
| 755 |
print("=====content=====")
|
| 756 |
|
| 757 |
return content
|
| 758 |
|
| 759 |
-
def get_reading_passage(video_id, df_string, source):
|
| 760 |
if source == "gcs":
|
| 761 |
print("===get_reading_passage on gcs===")
|
| 762 |
-
gcs_client = GCS_CLIENT
|
| 763 |
bucket_name = 'video_ai_assistant'
|
| 764 |
file_name = f'{video_id}_reading_passage_latex.json'
|
| 765 |
blob_name = f"{video_id}/{file_name}"
|
| 766 |
# 检查 reading_passage 是否存在
|
| 767 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
| 768 |
if not is_file_exists:
|
| 769 |
-
reading_passage = generate_reading_passage(df_string)
|
| 770 |
reading_passage_json = {"reading_passage": str(reading_passage)}
|
| 771 |
reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
|
| 772 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, reading_passage_text)
|
|
@@ -799,7 +804,7 @@ def get_reading_passage(video_id, df_string, source):
|
|
| 799 |
|
| 800 |
return reading_passage_json
|
| 801 |
|
| 802 |
-
def generate_reading_passage(df_string):
|
| 803 |
print("===generate_reading_passage===")
|
| 804 |
segments = split_data(df_string, word_base=100000)
|
| 805 |
all_content = []
|
|
@@ -818,7 +823,7 @@ def generate_reading_passage(df_string):
|
|
| 818 |
加減乘除、根號、次方等等的運算式口語也換成 LATEX 數學符號
|
| 819 |
請直接給出文章,不用介紹怎麼處理的或是文章字數等等
|
| 820 |
"""
|
| 821 |
-
content = generate_content_by_LLM(sys_content, user_content)
|
| 822 |
all_content.append(content + "\n")
|
| 823 |
|
| 824 |
# 將所有生成的閱讀理解段落合併成一個完整的文章
|
|
@@ -831,7 +836,7 @@ def text_to_speech(video_id, text):
|
|
| 831 |
tts.save(filename)
|
| 832 |
return filename
|
| 833 |
|
| 834 |
-
def get_mind_map(video_id, df_string, source):
|
| 835 |
if source == "gcs":
|
| 836 |
print("===get_mind_map on gcs===")
|
| 837 |
gcs_client = GCS_CLIENT
|
|
@@ -841,7 +846,7 @@ def get_mind_map(video_id, df_string, source):
|
|
| 841 |
# 检查檔案是否存在
|
| 842 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
| 843 |
if not is_file_exists:
|
| 844 |
-
mind_map = generate_mind_map(df_string)
|
| 845 |
mind_map_json = {"mind_map": str(mind_map)}
|
| 846 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
| 847 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, mind_map_text)
|
|
@@ -862,7 +867,7 @@ def get_mind_map(video_id, df_string, source):
|
|
| 862 |
# 检查檔案是否存在
|
| 863 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
| 864 |
if not exists:
|
| 865 |
-
mind_map = generate_mind_map(df_string)
|
| 866 |
mind_map_json = {"mind_map": str(mind_map)}
|
| 867 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
| 868 |
upload_content_directly(service, file_name, folder_id, mind_map_text)
|
|
@@ -875,7 +880,7 @@ def get_mind_map(video_id, df_string, source):
|
|
| 875 |
|
| 876 |
return mind_map_json
|
| 877 |
|
| 878 |
-
def generate_mind_map(df_string):
|
| 879 |
print("===generate_mind_map===")
|
| 880 |
segments = split_data(df_string, word_base=100000)
|
| 881 |
all_content = []
|
|
@@ -887,7 +892,7 @@ def generate_mind_map(df_string):
|
|
| 887 |
注意:不需要前後文敘述,直接給出 markdown 文本即可
|
| 888 |
這對我很重要
|
| 889 |
"""
|
| 890 |
-
content = generate_content_by_LLM(sys_content, user_content)
|
| 891 |
all_content.append(content + "\n")
|
| 892 |
|
| 893 |
# 將所有生成的閱讀理解段落合併成一個完整的文章
|
|
@@ -906,10 +911,9 @@ def get_mind_map_html(mind_map):
|
|
| 906 |
"""
|
| 907 |
return mind_map_html
|
| 908 |
|
| 909 |
-
def get_video_id_summary(video_id, df_string, source):
|
| 910 |
if source == "gcs":
|
| 911 |
print("===get_video_id_summary on gcs===")
|
| 912 |
-
gcs_client = GCS_CLIENT
|
| 913 |
bucket_name = 'video_ai_assistant'
|
| 914 |
file_name = f'{video_id}_summary_markdown.json'
|
| 915 |
summary_file_blob_name = f"{video_id}/{file_name}"
|
|
@@ -917,7 +921,7 @@ def get_video_id_summary(video_id, df_string, source):
|
|
| 917 |
is_summary_file_exists = GCS_SERVICE.check_file_exists(bucket_name, summary_file_blob_name)
|
| 918 |
if not is_summary_file_exists:
|
| 919 |
meta_data = get_meta_data(video_id)
|
| 920 |
-
summary = generate_summarise(df_string, meta_data)
|
| 921 |
summary_json = {"summary": str(summary)}
|
| 922 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
| 923 |
GCS_SERVICE.upload_json_string(bucket_name, summary_file_blob_name, summary_text)
|
|
@@ -939,7 +943,7 @@ def get_video_id_summary(video_id, df_string, source):
|
|
| 939 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
| 940 |
if not exists:
|
| 941 |
meta_data = get_meta_data(video_id)
|
| 942 |
-
summary = generate_summarise(df_string, meta_data)
|
| 943 |
summary_json = {"summary": str(summary)}
|
| 944 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
| 945 |
|
|
@@ -960,7 +964,7 @@ def get_video_id_summary(video_id, df_string, source):
|
|
| 960 |
|
| 961 |
return summary_json
|
| 962 |
|
| 963 |
-
def generate_summarise(df_string, metadata=None):
|
| 964 |
print("===generate_summarise===")
|
| 965 |
# 使用 OpenAI 生成基于上传数据的问题
|
| 966 |
if metadata:
|
|
@@ -1008,7 +1012,7 @@ def generate_summarise(df_string, metadata=None):
|
|
| 1008 |
## ❓ 延伸小問題
|
| 1009 |
- (一個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
|
| 1010 |
"""
|
| 1011 |
-
content = generate_content_by_LLM(sys_content, user_content)
|
| 1012 |
all_content.append(content + "\n")
|
| 1013 |
|
| 1014 |
if len(all_content) > 1:
|
|
@@ -1047,13 +1051,13 @@ def generate_summarise(df_string, metadata=None):
|
|
| 1047 |
## ❓ 延伸小問題
|
| 1048 |
- ( {all_content_cnt} 個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
|
| 1049 |
"""
|
| 1050 |
-
final_content = generate_content_by_LLM(sys_content, user_content)
|
| 1051 |
else:
|
| 1052 |
final_content = all_content[0]
|
| 1053 |
|
| 1054 |
return final_content
|
| 1055 |
|
| 1056 |
-
def get_questions(video_id, df_string, source="gcs"):
|
| 1057 |
if source == "gcs":
|
| 1058 |
# 去 gcs 確認是有有 video_id_questions.json
|
| 1059 |
print("===get_questions on gcs===")
|
|
@@ -1064,7 +1068,7 @@ def get_questions(video_id, df_string, source="gcs"):
|
|
| 1064 |
# 检查檔案是否存在
|
| 1065 |
is_questions_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
| 1066 |
if not is_questions_exists:
|
| 1067 |
-
questions = generate_questions(df_string)
|
| 1068 |
questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
|
| 1069 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_text)
|
| 1070 |
print("questions已上傳到GCS")
|
|
@@ -1085,7 +1089,7 @@ def get_questions(video_id, df_string, source="gcs"):
|
|
| 1085 |
# 检查檔案是否存在
|
| 1086 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
| 1087 |
if not exists:
|
| 1088 |
-
questions = generate_questions(df_string)
|
| 1089 |
questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
|
| 1090 |
upload_content_directly(service, file_name, folder_id, questions_text)
|
| 1091 |
print("questions已上傳到Google Drive")
|
|
@@ -1105,7 +1109,7 @@ def get_questions(video_id, df_string, source="gcs"):
|
|
| 1105 |
print("=====get_questions=====")
|
| 1106 |
return q1, q2, q3
|
| 1107 |
|
| 1108 |
-
def generate_questions(df_string):
|
| 1109 |
print("===generate_questions===")
|
| 1110 |
# 使用 OpenAI 生成基于上传数据的问题
|
| 1111 |
if isinstance(df_string, str):
|
|
@@ -1128,69 +1132,26 @@ def generate_questions(df_string):
|
|
| 1128 |
[q1的敘述text, q2的敘述text, q3的敘述text]
|
| 1129 |
}}
|
| 1130 |
"""
|
| 1131 |
-
|
| 1132 |
-
|
| 1133 |
-
|
| 1134 |
-
messages = [
|
| 1135 |
-
{"role": "system", "content": sys_content},
|
| 1136 |
-
{"role": "user", "content": user_content}
|
| 1137 |
-
]
|
| 1138 |
-
response_format = { "type": "json_object" }
|
| 1139 |
-
|
| 1140 |
-
print("=====messages=====")
|
| 1141 |
-
print(messages)
|
| 1142 |
-
print("=====messages=====")
|
| 1143 |
-
|
| 1144 |
-
|
| 1145 |
-
request_payload = {
|
| 1146 |
-
"model": model,
|
| 1147 |
-
"messages": messages,
|
| 1148 |
-
"max_tokens": 4000,
|
| 1149 |
-
"response_format": response_format
|
| 1150 |
-
}
|
| 1151 |
-
|
| 1152 |
-
response = OPEN_AI_CLIENT.chat.completions.create(**request_payload)
|
| 1153 |
-
questions = json.loads(response.choices[0].message.content)["questions"]
|
| 1154 |
-
except:
|
| 1155 |
-
messages = [
|
| 1156 |
-
{"role": "user", "content": user_content}
|
| 1157 |
-
]
|
| 1158 |
-
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
|
| 1159 |
-
# model_id = "anthropic.claude-3-haiku-20240307-v1:0"
|
| 1160 |
-
kwargs = {
|
| 1161 |
-
"modelId": model_id,
|
| 1162 |
-
"contentType": "application/json",
|
| 1163 |
-
"accept": "application/json",
|
| 1164 |
-
"body": json.dumps({
|
| 1165 |
-
"anthropic_version": "bedrock-2023-05-31",
|
| 1166 |
-
"max_tokens": 4000,
|
| 1167 |
-
"system": sys_content,
|
| 1168 |
-
"messages": messages
|
| 1169 |
-
})
|
| 1170 |
-
}
|
| 1171 |
-
response = BEDROCK_CLIENT.invoke_model(**kwargs)
|
| 1172 |
-
response_body = json.loads(response.get('body').read())
|
| 1173 |
-
response_completion = response_body.get('content')[0].get('text')
|
| 1174 |
-
questions = json.loads(response_completion)["questions"]
|
| 1175 |
-
|
| 1176 |
print("=====json_response=====")
|
| 1177 |
-
print(
|
| 1178 |
print("=====json_response=====")
|
| 1179 |
|
| 1180 |
-
return
|
| 1181 |
|
| 1182 |
-
def get_questions_answers(video_id, df_string, source="gcs"):
|
| 1183 |
if source == "gcs":
|
| 1184 |
try:
|
| 1185 |
print("===get_questions_answers on gcs===")
|
| 1186 |
-
gcs_client = GCS_CLIENT
|
| 1187 |
bucket_name = 'video_ai_assistant'
|
| 1188 |
file_name = f'{video_id}_questions_answers.json'
|
| 1189 |
blob_name = f"{video_id}/{file_name}"
|
| 1190 |
# 检查檔案是否存在
|
| 1191 |
is_questions_answers_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
| 1192 |
if not is_questions_answers_exists:
|
| 1193 |
-
questions_answers = generate_questions_answers(df_string)
|
| 1194 |
questions_answers_text = json.dumps(questions_answers, ensure_ascii=False, indent=2)
|
| 1195 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_answers_text)
|
| 1196 |
print("questions_answers已上傳到GCS")
|
|
@@ -1201,12 +1162,12 @@ def get_questions_answers(video_id, df_string, source="gcs"):
|
|
| 1201 |
questions_answers = json.loads(questions_answers_text)
|
| 1202 |
except Exception as e:
|
| 1203 |
print(f"Error getting questions_answers: {str(e)}")
|
| 1204 |
-
|
| 1205 |
-
questions_answers = [{"question": q, "answer": ""} for q in
|
| 1206 |
|
| 1207 |
return questions_answers
|
| 1208 |
|
| 1209 |
-
def generate_questions_answers(df_string):
|
| 1210 |
print("===generate_questions_answers===")
|
| 1211 |
segments = split_data(df_string, word_base=100000)
|
| 1212 |
all_content = []
|
|
@@ -1232,7 +1193,7 @@ def generate_questions_answers(df_string):
|
|
| 1232 |
}}
|
| 1233 |
"""
|
| 1234 |
response_format = { "type": "json_object" }
|
| 1235 |
-
content = generate_content_by_LLM(sys_content, user_content, response_format)
|
| 1236 |
content_json = json.loads(content)["questions_answers"]
|
| 1237 |
all_content += content_json
|
| 1238 |
|
|
@@ -1256,7 +1217,7 @@ def change_questions(password, df_string):
|
|
| 1256 |
print("=====get_questions=====")
|
| 1257 |
return q1, q2, q3
|
| 1258 |
|
| 1259 |
-
def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source):
|
| 1260 |
if source == "gcs":
|
| 1261 |
print("===get_key_moments on gcs===")
|
| 1262 |
gcs_client = GCS_CLIENT
|
|
@@ -1266,7 +1227,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
|
|
| 1266 |
# 检查檔案是否存在
|
| 1267 |
is_key_moments_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
| 1268 |
if not is_key_moments_exists:
|
| 1269 |
-
key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript)
|
| 1270 |
key_moments_json = {"key_moments": key_moments}
|
| 1271 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
| 1272 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_text)
|
|
@@ -1282,7 +1243,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
|
|
| 1282 |
for key_moment in key_moments_json["key_moments"]:
|
| 1283 |
if "keywords" not in key_moment:
|
| 1284 |
transcript = key_moment["transcript"]
|
| 1285 |
-
key_moment["keywords"] = generate_key_moments_keywords(transcript)
|
| 1286 |
print("===keywords===")
|
| 1287 |
print(key_moment["keywords"])
|
| 1288 |
print("===keywords===")
|
|
@@ -1303,7 +1264,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
|
|
| 1303 |
# 检查檔案是否存在
|
| 1304 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
| 1305 |
if not exists:
|
| 1306 |
-
key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript)
|
| 1307 |
key_moments_json = {"key_moments": key_moments}
|
| 1308 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
| 1309 |
upload_content_directly(service, file_name, folder_id, key_moments_text)
|
|
@@ -1316,7 +1277,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
|
|
| 1316 |
|
| 1317 |
return key_moments_json
|
| 1318 |
|
| 1319 |
-
def generate_key_moments(formatted_simple_transcript, formatted_transcript):
|
| 1320 |
print("===generate_key_moments===")
|
| 1321 |
segments = split_data(formatted_simple_transcript, word_base=100000)
|
| 1322 |
all_content = []
|
|
@@ -1343,7 +1304,7 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript):
|
|
| 1343 |
}}
|
| 1344 |
"""
|
| 1345 |
response_format = { "type": "json_object" }
|
| 1346 |
-
content = generate_content_by_LLM(sys_content, user_content, response_format)
|
| 1347 |
key_moments = json.loads(content)["key_moments"]
|
| 1348 |
|
| 1349 |
# "transcript": get text from formatted_simple_transcript
|
|
@@ -1371,7 +1332,7 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript):
|
|
| 1371 |
|
| 1372 |
return all_content
|
| 1373 |
|
| 1374 |
-
def generate_key_moments_keywords(transcript):
|
| 1375 |
print("===generate_key_moments_keywords===")
|
| 1376 |
segments = split_data(transcript, word_base=100000)
|
| 1377 |
all_content = []
|
|
@@ -1384,7 +1345,7 @@ def generate_key_moments_keywords(transcript):
|
|
| 1384 |
不用給上下文,直接給出關鍵字,使用 zh-TW,用逗號分隔, example: 關鍵字1, 關鍵字2
|
| 1385 |
transcript:{segment}
|
| 1386 |
"""
|
| 1387 |
-
content = generate_content_by_LLM(sys_content, user_content)
|
| 1388 |
keywords = content.strip().split(",")
|
| 1389 |
all_content += keywords
|
| 1390 |
|
|
@@ -1665,7 +1626,6 @@ def delete_LLM_content(video_id, kind):
|
|
| 1665 |
|
| 1666 |
def update_LLM_content(video_id, new_content, kind):
|
| 1667 |
print(f"===upfdate kind on gcs===")
|
| 1668 |
-
gcs_client = GCS_CLIENT
|
| 1669 |
bucket_name = 'video_ai_assistant'
|
| 1670 |
file_name = f'{video_id}_{kind}.json'
|
| 1671 |
blob_name = f"{video_id}/{file_name}"
|
|
@@ -1739,16 +1699,16 @@ def update_LLM_content(video_id, new_content, kind):
|
|
| 1739 |
print(f"{kind} 已更新到GCS")
|
| 1740 |
return gr.update(value=updated_content, interactive=False)
|
| 1741 |
|
| 1742 |
-
def create_LLM_content(video_id, df_string, kind):
|
| 1743 |
print(f"===create_{kind}===")
|
| 1744 |
print(f"video_id: {video_id}")
|
| 1745 |
|
| 1746 |
if kind == "reading_passage_latex":
|
| 1747 |
-
content = generate_reading_passage(df_string)
|
| 1748 |
update_LLM_content(video_id, content, kind)
|
| 1749 |
elif kind == "summary_markdown":
|
| 1750 |
meta_data = get_meta_data(video_id)
|
| 1751 |
-
content = generate_summarise(df_string, meta_data)
|
| 1752 |
update_LLM_content(video_id, content, kind)
|
| 1753 |
elif kind == "mind_map":
|
| 1754 |
content = generate_mind_map(df_string)
|
|
@@ -1760,7 +1720,7 @@ def create_LLM_content(video_id, df_string, kind):
|
|
| 1760 |
transcript = df_string
|
| 1761 |
formatted_simple_transcript = create_formatted_simple_transcript(transcript)
|
| 1762 |
formatted_transcript = create_formatted_transcript(video_id, transcript)
|
| 1763 |
-
gen_content = generate_key_moments(formatted_simple_transcript, formatted_transcript)
|
| 1764 |
update_LLM_content(video_id, gen_content, kind)
|
| 1765 |
content = json.dumps(gen_content, ensure_ascii=False, indent=2)
|
| 1766 |
elif kind == "transcript":
|
|
@@ -1768,7 +1728,7 @@ def create_LLM_content(video_id, df_string, kind):
|
|
| 1768 |
update_LLM_content(video_id, gen_content, kind)
|
| 1769 |
content = json.dumps(gen_content, ensure_ascii=False, indent=2)
|
| 1770 |
elif kind == "questions":
|
| 1771 |
-
gen_content = generate_questions(df_string)
|
| 1772 |
update_LLM_content(video_id, gen_content, kind)
|
| 1773 |
content = json.dumps(gen_content, ensure_ascii=False, indent=2)
|
| 1774 |
elif kind == "questions_answers":
|
|
@@ -1777,7 +1737,7 @@ def create_LLM_content(video_id, df_string, kind):
|
|
| 1777 |
else:
|
| 1778 |
transcript = df_string
|
| 1779 |
formatted_simple_transcript = create_formatted_simple_transcript(transcript)
|
| 1780 |
-
gen_content = generate_questions_answers(formatted_simple_transcript)
|
| 1781 |
update_LLM_content(video_id, gen_content, kind)
|
| 1782 |
content = json.dumps(gen_content, ensure_ascii=False, indent=2)
|
| 1783 |
|
|
@@ -2690,14 +2650,20 @@ HEAD = """
|
|
| 2690 |
|
| 2691 |
with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, secondary_hue=gr.themes.colors.amber, text_size = gr.themes.sizes.text_lg), head=HEAD) as demo:
|
| 2692 |
with gr.Row() as admin:
|
| 2693 |
-
|
| 2694 |
-
|
| 2695 |
-
|
| 2696 |
-
|
| 2697 |
-
|
| 2698 |
-
|
| 2699 |
-
|
| 2700 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2701 |
with gr.Row() as data_state:
|
| 2702 |
content_subject_state = gr.State() # 使用 gr.State 存储 content_subject
|
| 2703 |
content_grade_state = gr.State() # 使用 gr.State 存储 content_grade
|
|
@@ -3170,7 +3136,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
| 3170 |
)
|
| 3171 |
|
| 3172 |
# 当输入 YouTube 链接时触发
|
| 3173 |
-
process_youtube_link_inputs = [password, youtube_link]
|
| 3174 |
process_youtube_link_outputs = [
|
| 3175 |
video_id,
|
| 3176 |
questions_answers_json,
|
|
@@ -3251,7 +3217,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
| 3251 |
{
|
| 3252 |
'button': transcript_create_button,
|
| 3253 |
'action': create_LLM_content,
|
| 3254 |
-
'inputs': [video_id, df_string_output, transcript_kind],
|
| 3255 |
'outputs': [df_string_output]
|
| 3256 |
},
|
| 3257 |
{
|
|
@@ -3282,7 +3248,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
| 3282 |
{
|
| 3283 |
'button': reading_passage_create_button,
|
| 3284 |
'action': create_LLM_content,
|
| 3285 |
-
'inputs': [video_id, df_string_output, reading_passage_kind],
|
| 3286 |
'outputs': [reading_passage_text]
|
| 3287 |
},
|
| 3288 |
{
|
|
@@ -3313,7 +3279,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
| 3313 |
{
|
| 3314 |
'button': summary_create_button,
|
| 3315 |
'action': create_LLM_content,
|
| 3316 |
-
'inputs': [video_id, df_string_output, summary_kind],
|
| 3317 |
'outputs': [summary_text]
|
| 3318 |
},
|
| 3319 |
{
|
|
@@ -3344,7 +3310,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
| 3344 |
{
|
| 3345 |
'button': key_moments_create_button,
|
| 3346 |
'action': create_LLM_content,
|
| 3347 |
-
'inputs': [video_id, df_string_output, key_moments_kind],
|
| 3348 |
'outputs': [key_moments]
|
| 3349 |
},
|
| 3350 |
{
|
|
@@ -3375,7 +3341,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
| 3375 |
{
|
| 3376 |
'button': questions_create_button,
|
| 3377 |
'action': create_LLM_content,
|
| 3378 |
-
'inputs': [video_id, df_string_output, questions_kind],
|
| 3379 |
'outputs': [questions_json]
|
| 3380 |
},
|
| 3381 |
{
|
|
@@ -3406,7 +3372,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
| 3406 |
{
|
| 3407 |
'button': questions_answers_create_button,
|
| 3408 |
'action': create_LLM_content,
|
| 3409 |
-
'inputs': [video_id, df_string_output, questions_answers_kind],
|
| 3410 |
'outputs': [questions_answers_json]
|
| 3411 |
},
|
| 3412 |
{
|
|
@@ -3437,7 +3403,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
| 3437 |
{
|
| 3438 |
'button': worksheet_create_button,
|
| 3439 |
'action': create_LLM_content,
|
| 3440 |
-
'inputs': [video_id, df_string_output, worksheet_kind],
|
| 3441 |
'outputs': [worksheet_json]
|
| 3442 |
},
|
| 3443 |
{
|
|
|
|
| 503 |
GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
|
| 504 |
print("Transcript uploaded successfully.")
|
| 505 |
|
| 506 |
+
def process_youtube_link(password, link, LLM_model=None):
|
| 507 |
verify_password(password)
|
| 508 |
video_id = extract_youtube_id(link)
|
| 509 |
|
|
|
|
| 545 |
|
| 546 |
# 基于逐字稿生成其他所需的输出
|
| 547 |
source = "gcs"
|
| 548 |
+
questions_answers = get_questions_answers(video_id, formatted_simple_transcript, source, LLM_model)
|
| 549 |
questions_answers_json = json.dumps(questions_answers, ensure_ascii=False, indent=2)
|
| 550 |
+
summary_json = get_video_id_summary(video_id, formatted_simple_transcript, source, LLM_model)
|
| 551 |
summary_text = summary_json["summary"]
|
| 552 |
summary = summary_json["summary"]
|
| 553 |
+
key_moments_json = get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source, LLM_model)
|
| 554 |
key_moments = key_moments_json["key_moments"]
|
| 555 |
key_moments_text = json.dumps(key_moments, ensure_ascii=False, indent=2)
|
| 556 |
key_moments_html = get_key_moments_html(key_moments)
|
| 557 |
html_content = format_transcript_to_html(formatted_transcript)
|
| 558 |
simple_html_content = format_simple_transcript_to_html(formatted_simple_transcript)
|
| 559 |
+
mind_map_json = get_mind_map(video_id, formatted_simple_transcript, source, LLM_model)
|
| 560 |
mind_map = mind_map_json["mind_map"]
|
| 561 |
mind_map_html = get_mind_map_html(mind_map)
|
| 562 |
+
reading_passage_json = get_reading_passage(video_id, formatted_simple_transcript, source, LLM_model)
|
| 563 |
reading_passage_text = reading_passage_json["reading_passage"]
|
| 564 |
reading_passage = reading_passage_json["reading_passage"]
|
| 565 |
meta_data = get_meta_data(video_id)
|
|
|
|
| 703 |
|
| 704 |
return segments
|
| 705 |
|
| 706 |
+
def generate_content_by_open_ai(sys_content, user_content, response_format=None):
    """Generate text with the OpenAI chat-completions client.

    Args:
        sys_content: Text sent as the ``system`` message.
        user_content: Text sent as the ``user`` message.
        response_format: Optional value forwarded as the request's
            ``response_format`` field; omitted from the request when ``None``.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped.
    """
    print("LLM using OPEN AI")

    chat_kwargs = dict(
        model="gpt-4-turbo",
        messages=[
            {"role": "system", "content": sys_content},
            {"role": "user", "content": user_content},
        ],
        max_tokens=4000,
    )
    # Only send response_format when the caller explicitly asked for one.
    if response_format is not None:
        chat_kwargs["response_format"] = response_format

    completion = OPEN_AI_CLIENT.chat.completions.create(**chat_kwargs)
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()
|
|
|
|
|
|
|
|
|
|
| 725 |
|
| 726 |
+
def generate_content_by_bedrock(sys_content, user_content):
    """Generate text with an Anthropic Claude 3 Sonnet model via ``BEDROCK_CLIENT``.

    Args:
        sys_content: Text sent as the request's ``system`` field.
        user_content: Text sent as the single ``user`` message. A fixed hint
            about quoting inside JSON values is appended to it (see below).

    Returns:
        The ``text`` of the first item in the response body's ``content`` list.
    """
    # Fixed: the log line previously said "REDROCK"; the provider is Bedrock.
    print("LLM using BEDROCK")

    # The appended hint asks the model, when answering in JSON, to use single
    # quotes or backslash-escaped double quotes inside values so that callers
    # running json.loads on the reply do not hit a JSON decode error.
    messages = [
        {"role": "user", "content": user_content +"(如果是 JSON 格式,value 的引號,請用單引號,或是用反斜線+雙引號,避免 JSON Decoder error )"}
    ]
    # Alternative model id kept for reference:
    # "anthropic.claude-3-haiku-20240307-v1:0"
    model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
    kwargs = {
        "modelId": model_id,
        "contentType": "application/json",
        "accept": "application/json",
        "body": json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 4000,
            "system": sys_content,
            "messages": messages
        })
    }
    response = BEDROCK_CLIENT.invoke_model(**kwargs)
    # The response body is a stream; read it fully before decoding the JSON.
    response_body = json.loads(response.get('body').read())
    content = response_body.get('content')[0].get('text')
    return content
|
| 748 |
|
| 749 |
+
def generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=None):
    """Route a prompt to the selected LLM backend and return its text.

    Args:
        sys_content: System prompt text passed to the backend.
        user_content: User prompt text passed to the backend.
        response_format: Forwarded only to the OpenAI backend; the Bedrock
            path does not receive it.
        LLM_model: ``"anthropic-claude-3-sonnet"`` selects the Bedrock backend;
            any other value (including ``None``) selects the OpenAI backend.

    Returns:
        The text produced by the chosen backend.
    """
    use_bedrock = LLM_model == "anthropic-claude-3-sonnet"
    print(f"LLM: {LLM_model}")

    if use_bedrock:
        generated = generate_content_by_bedrock(sys_content, user_content)
    else:
        generated = generate_content_by_open_ai(sys_content, user_content, response_format)

    # Echo the generated text between banner lines for debugging.
    banner = "=====content====="
    print(banner)
    print(generated)
    print(banner)

    return generated
|
| 764 |
|
| 765 |
+
def get_reading_passage(video_id, df_string, source, LLM_model=None):
|
| 766 |
if source == "gcs":
|
| 767 |
print("===get_reading_passage on gcs===")
|
|
|
|
| 768 |
bucket_name = 'video_ai_assistant'
|
| 769 |
file_name = f'{video_id}_reading_passage_latex.json'
|
| 770 |
blob_name = f"{video_id}/{file_name}"
|
| 771 |
# 检查 reading_passage 是否存在
|
| 772 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
| 773 |
if not is_file_exists:
|
| 774 |
+
reading_passage = generate_reading_passage(df_string, LLM_model)
|
| 775 |
reading_passage_json = {"reading_passage": str(reading_passage)}
|
| 776 |
reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
|
| 777 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, reading_passage_text)
|
|
|
|
| 804 |
|
| 805 |
return reading_passage_json
|
| 806 |
|
| 807 |
+
def generate_reading_passage(df_string, LLM_model=None):
|
| 808 |
print("===generate_reading_passage===")
|
| 809 |
segments = split_data(df_string, word_base=100000)
|
| 810 |
all_content = []
|
|
|
|
| 823 |
加減乘除、根號、次方等等的運算式口語也換成 LATEX 數學符號
|
| 824 |
請直接給出文章,不用介紹怎麼處理的或是文章字數等等
|
| 825 |
"""
|
| 826 |
+
content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
|
| 827 |
all_content.append(content + "\n")
|
| 828 |
|
| 829 |
# 將所有生成的閱讀理解段落合併成一個完整的文章
|
|
|
|
| 836 |
tts.save(filename)
|
| 837 |
return filename
|
| 838 |
|
| 839 |
+
def get_mind_map(video_id, df_string, source, LLM_model=None):
|
| 840 |
if source == "gcs":
|
| 841 |
print("===get_mind_map on gcs===")
|
| 842 |
gcs_client = GCS_CLIENT
|
|
|
|
| 846 |
# 检查檔案是否存在
|
| 847 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
| 848 |
if not is_file_exists:
|
| 849 |
+
mind_map = generate_mind_map(df_string, LLM_model)
|
| 850 |
mind_map_json = {"mind_map": str(mind_map)}
|
| 851 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
| 852 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, mind_map_text)
|
|
|
|
| 867 |
# 检查檔案是否存在
|
| 868 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
| 869 |
if not exists:
|
| 870 |
+
mind_map = generate_mind_map(df_string, LLM_model)
|
| 871 |
mind_map_json = {"mind_map": str(mind_map)}
|
| 872 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
| 873 |
upload_content_directly(service, file_name, folder_id, mind_map_text)
|
|
|
|
| 880 |
|
| 881 |
return mind_map_json
|
| 882 |
|
| 883 |
+
def generate_mind_map(df_string, LLM_model=None):
|
| 884 |
print("===generate_mind_map===")
|
| 885 |
segments = split_data(df_string, word_base=100000)
|
| 886 |
all_content = []
|
|
|
|
| 892 |
注意:不需要前後文敘述,直接給出 markdown 文本即可
|
| 893 |
這對我很重要
|
| 894 |
"""
|
| 895 |
+
content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
|
| 896 |
all_content.append(content + "\n")
|
| 897 |
|
| 898 |
# 將所有生成的閱讀理解段落合併成一個完整的文章
|
|
|
|
| 911 |
"""
|
| 912 |
return mind_map_html
|
| 913 |
|
| 914 |
+
def get_video_id_summary(video_id, df_string, source, LLM_model=None):
|
| 915 |
if source == "gcs":
|
| 916 |
print("===get_video_id_summary on gcs===")
|
|
|
|
| 917 |
bucket_name = 'video_ai_assistant'
|
| 918 |
file_name = f'{video_id}_summary_markdown.json'
|
| 919 |
summary_file_blob_name = f"{video_id}/{file_name}"
|
|
|
|
| 921 |
is_summary_file_exists = GCS_SERVICE.check_file_exists(bucket_name, summary_file_blob_name)
|
| 922 |
if not is_summary_file_exists:
|
| 923 |
meta_data = get_meta_data(video_id)
|
| 924 |
+
summary = generate_summarise(df_string, meta_data, LLM_model)
|
| 925 |
summary_json = {"summary": str(summary)}
|
| 926 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
| 927 |
GCS_SERVICE.upload_json_string(bucket_name, summary_file_blob_name, summary_text)
|
|
|
|
| 943 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
| 944 |
if not exists:
|
| 945 |
meta_data = get_meta_data(video_id)
|
| 946 |
+
summary = generate_summarise(df_string, meta_data, LLM_model)
|
| 947 |
summary_json = {"summary": str(summary)}
|
| 948 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
| 949 |
|
|
|
|
| 964 |
|
| 965 |
return summary_json
|
| 966 |
|
| 967 |
+
def generate_summarise(df_string, metadata=None, LLM_model=None):
|
| 968 |
print("===generate_summarise===")
|
| 969 |
# Generate a summary of the uploaded data with the LLM selected by LLM_model
|
| 970 |
if metadata:
|
|
|
|
| 1012 |
## ❓ 延伸小問題
|
| 1013 |
- (一個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
|
| 1014 |
"""
|
| 1015 |
+
content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
|
| 1016 |
all_content.append(content + "\n")
|
| 1017 |
|
| 1018 |
if len(all_content) > 1:
|
|
|
|
| 1051 |
## ❓ 延伸小問題
|
| 1052 |
- ( {all_content_cnt} 個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
|
| 1053 |
"""
|
| 1054 |
+
final_content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
|
| 1055 |
else:
|
| 1056 |
final_content = all_content[0]
|
| 1057 |
|
| 1058 |
return final_content
|
| 1059 |
|
| 1060 |
+
def get_questions(video_id, df_string, source="gcs", LLM_model=None):
|
| 1061 |
if source == "gcs":
|
| 1062 |
# Check GCS for an existing video_id_questions.json
|
| 1063 |
print("===get_questions on gcs===")
|
|
|
|
| 1068 |
# 检查檔案是否存在
|
| 1069 |
is_questions_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
| 1070 |
if not is_questions_exists:
|
| 1071 |
+
questions = generate_questions(df_string, LLM_model)
|
| 1072 |
questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
|
| 1073 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_text)
|
| 1074 |
print("questions已上傳到GCS")
|
|
|
|
| 1089 |
# 检查檔案是否存在
|
| 1090 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
| 1091 |
if not exists:
|
| 1092 |
+
questions = generate_questions(df_string, LLM_model)
|
| 1093 |
questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
|
| 1094 |
upload_content_directly(service, file_name, folder_id, questions_text)
|
| 1095 |
print("questions已上傳到Google Drive")
|
|
|
|
| 1109 |
print("=====get_questions=====")
|
| 1110 |
return q1, q2, q3
|
| 1111 |
|
| 1112 |
+
def generate_questions(df_string, LLM_model=None):
|
| 1113 |
print("===generate_questions===")
|
| 1114 |
# Generate questions from the uploaded data with the LLM selected by LLM_model
|
| 1115 |
if isinstance(df_string, str):
|
|
|
|
| 1132 |
[q1的敘述text, q2的敘述text, q3的敘述text]
|
| 1133 |
}}
|
| 1134 |
"""
|
| 1135 |
+
response_format = { "type": "json_object" }
|
| 1136 |
+
questions = generate_content_by_LLM(sys_content, user_content, response_format, LLM_model)
|
| 1137 |
+
questions_list = json.loads(questions)["questions"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1138 |
print("=====json_response=====")
|
| 1139 |
+
print(questions_list)
|
| 1140 |
print("=====json_response=====")
|
| 1141 |
|
| 1142 |
+
return questions_list
|
| 1143 |
|
| 1144 |
+
def get_questions_answers(video_id, df_string, source="gcs", LLM_model=None):
|
| 1145 |
if source == "gcs":
|
| 1146 |
try:
|
| 1147 |
print("===get_questions_answers on gcs===")
|
|
|
|
| 1148 |
bucket_name = 'video_ai_assistant'
|
| 1149 |
file_name = f'{video_id}_questions_answers.json'
|
| 1150 |
blob_name = f"{video_id}/{file_name}"
|
| 1151 |
# 检查檔案是否存在
|
| 1152 |
is_questions_answers_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
| 1153 |
if not is_questions_answers_exists:
|
| 1154 |
+
questions_answers = generate_questions_answers(df_string, LLM_model)
|
| 1155 |
questions_answers_text = json.dumps(questions_answers, ensure_ascii=False, indent=2)
|
| 1156 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_answers_text)
|
| 1157 |
print("questions_answers已上傳到GCS")
|
|
|
|
| 1162 |
questions_answers = json.loads(questions_answers_text)
|
| 1163 |
except Exception as e:
|
| 1164 |
print(f"Error getting questions_answers: {str(e)}")
|
| 1165 |
+
questions_list = get_questions(video_id, df_string, source, LLM_model)
|
| 1166 |
+
questions_answers = [{"question": q, "answer": ""} for q in questions_list]
|
| 1167 |
|
| 1168 |
return questions_answers
|
| 1169 |
|
| 1170 |
+
def generate_questions_answers(df_string, LLM_model=None):
|
| 1171 |
print("===generate_questions_answers===")
|
| 1172 |
segments = split_data(df_string, word_base=100000)
|
| 1173 |
all_content = []
|
|
|
|
| 1193 |
}}
|
| 1194 |
"""
|
| 1195 |
response_format = { "type": "json_object" }
|
| 1196 |
+
content = generate_content_by_LLM(sys_content, user_content, response_format, LLM_model)
|
| 1197 |
content_json = json.loads(content)["questions_answers"]
|
| 1198 |
all_content += content_json
|
| 1199 |
|
|
|
|
| 1217 |
print("=====get_questions=====")
|
| 1218 |
return q1, q2, q3
|
| 1219 |
|
| 1220 |
+
def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source, LLM_model=None):
|
| 1221 |
if source == "gcs":
|
| 1222 |
print("===get_key_moments on gcs===")
|
| 1223 |
gcs_client = GCS_CLIENT
|
|
|
|
| 1227 |
# 检查檔案是否存在
|
| 1228 |
is_key_moments_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
| 1229 |
if not is_key_moments_exists:
|
| 1230 |
+
key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model)
|
| 1231 |
key_moments_json = {"key_moments": key_moments}
|
| 1232 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
| 1233 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_text)
|
|
|
|
| 1243 |
for key_moment in key_moments_json["key_moments"]:
|
| 1244 |
if "keywords" not in key_moment:
|
| 1245 |
transcript = key_moment["transcript"]
|
| 1246 |
+
key_moment["keywords"] = generate_key_moments_keywords(transcript, LLM_model)
|
| 1247 |
print("===keywords===")
|
| 1248 |
print(key_moment["keywords"])
|
| 1249 |
print("===keywords===")
|
|
|
|
| 1264 |
# 检查檔案是否存在
|
| 1265 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
| 1266 |
if not exists:
|
| 1267 |
+
key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model)
|
| 1268 |
key_moments_json = {"key_moments": key_moments}
|
| 1269 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
| 1270 |
upload_content_directly(service, file_name, folder_id, key_moments_text)
|
|
|
|
| 1277 |
|
| 1278 |
return key_moments_json
|
| 1279 |
|
| 1280 |
+
def generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model=None):
|
| 1281 |
print("===generate_key_moments===")
|
| 1282 |
segments = split_data(formatted_simple_transcript, word_base=100000)
|
| 1283 |
all_content = []
|
|
|
|
| 1304 |
}}
|
| 1305 |
"""
|
| 1306 |
response_format = { "type": "json_object" }
|
| 1307 |
+
content = generate_content_by_LLM(sys_content, user_content, response_format, LLM_model)
|
| 1308 |
key_moments = json.loads(content)["key_moments"]
|
| 1309 |
|
| 1310 |
# "transcript": get text from formatted_simple_transcript
|
|
|
|
| 1332 |
|
| 1333 |
return all_content
|
| 1334 |
|
| 1335 |
+
def generate_key_moments_keywords(transcript, LLM_model=None):
|
| 1336 |
print("===generate_key_moments_keywords===")
|
| 1337 |
segments = split_data(transcript, word_base=100000)
|
| 1338 |
all_content = []
|
|
|
|
| 1345 |
不用給上下文,直接給出關鍵字,使用 zh-TW,用逗號分隔, example: 關鍵字1, 關鍵字2
|
| 1346 |
transcript:{segment}
|
| 1347 |
"""
|
| 1348 |
+
content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
|
| 1349 |
keywords = content.strip().split(",")
|
| 1350 |
all_content += keywords
|
| 1351 |
|
|
|
|
| 1626 |
|
| 1627 |
def update_LLM_content(video_id, new_content, kind):
|
| 1628 |
print(f"===upfdate kind on gcs===")
|
|
|
|
| 1629 |
bucket_name = 'video_ai_assistant'
|
| 1630 |
file_name = f'{video_id}_{kind}.json'
|
| 1631 |
blob_name = f"{video_id}/{file_name}"
|
|
|
|
| 1699 |
print(f"{kind} 已更新到GCS")
|
| 1700 |
return gr.update(value=updated_content, interactive=False)
|
| 1701 |
|
| 1702 |
+
def create_LLM_content(video_id, df_string, kind, LLM_model=None):
|
| 1703 |
print(f"===create_{kind}===")
|
| 1704 |
print(f"video_id: {video_id}")
|
| 1705 |
|
| 1706 |
if kind == "reading_passage_latex":
|
| 1707 |
+
content = generate_reading_passage(df_string, LLM_model)
|
| 1708 |
update_LLM_content(video_id, content, kind)
|
| 1709 |
elif kind == "summary_markdown":
|
| 1710 |
meta_data = get_meta_data(video_id)
|
| 1711 |
+
content = generate_summarise(df_string, meta_data, LLM_model)
|
| 1712 |
update_LLM_content(video_id, content, kind)
|
| 1713 |
elif kind == "mind_map":
|
| 1714 |
content = generate_mind_map(df_string)
|
|
|
|
| 1720 |
transcript = df_string
|
| 1721 |
formatted_simple_transcript = create_formatted_simple_transcript(transcript)
|
| 1722 |
formatted_transcript = create_formatted_transcript(video_id, transcript)
|
| 1723 |
+
gen_content = generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model)
|
| 1724 |
update_LLM_content(video_id, gen_content, kind)
|
| 1725 |
content = json.dumps(gen_content, ensure_ascii=False, indent=2)
|
| 1726 |
elif kind == "transcript":
|
|
|
|
| 1728 |
update_LLM_content(video_id, gen_content, kind)
|
| 1729 |
content = json.dumps(gen_content, ensure_ascii=False, indent=2)
|
| 1730 |
elif kind == "questions":
|
| 1731 |
+
gen_content = generate_questions(df_string, LLM_model)
|
| 1732 |
update_LLM_content(video_id, gen_content, kind)
|
| 1733 |
content = json.dumps(gen_content, ensure_ascii=False, indent=2)
|
| 1734 |
elif kind == "questions_answers":
|
|
|
|
| 1737 |
else:
|
| 1738 |
transcript = df_string
|
| 1739 |
formatted_simple_transcript = create_formatted_simple_transcript(transcript)
|
| 1740 |
+
gen_content = generate_questions_answers(formatted_simple_transcript, LLM_model)
|
| 1741 |
update_LLM_content(video_id, gen_content, kind)
|
| 1742 |
content = json.dumps(gen_content, ensure_ascii=False, indent=2)
|
| 1743 |
|
|
|
|
| 2650 |
|
| 2651 |
with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, secondary_hue=gr.themes.colors.amber, text_size = gr.themes.sizes.text_lg), head=HEAD) as demo:
|
| 2652 |
with gr.Row() as admin:
|
| 2653 |
+
with gr.Column(scale=4):
|
| 2654 |
+
with gr.Row():
|
| 2655 |
+
password = gr.Textbox(label="Password", type="password", elem_id="password_input", visible=True)
|
| 2656 |
+
youtube_link = gr.Textbox(label="Enter YouTube Link", elem_id="youtube_link_input", visible=True)
|
| 2657 |
+
video_id = gr.Textbox(label="video_id", visible=True)
|
| 2658 |
+
# file_upload = gr.File(label="Upload your CSV or Word file", visible=False)
|
| 2659 |
+
# web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
|
| 2660 |
+
user_data = gr.Textbox(label="User Data", elem_id="user_data_input", visible=True)
|
| 2661 |
+
with gr.Row():
|
| 2662 |
+
is_env_prod = gr.Checkbox(value=False, label="is_env_prod")
|
| 2663 |
+
LLM_model = gr.Dropdown(label="LLM Model", choices=["open-ai-gpt-4", "anthropic-claude-3-sonnet"], value="open-ai-gpt-4", visible=True, interactive=True)
|
| 2664 |
+
with gr.Column(scale=1):
|
| 2665 |
+
with gr.Row():
|
| 2666 |
+
youtube_link_btn = gr.Button("Submit_YouTube_Link", elem_id="youtube_link_btn", visible=True)
|
| 2667 |
with gr.Row() as data_state:
|
| 2668 |
content_subject_state = gr.State() # 使用 gr.State 存储 content_subject
|
| 2669 |
content_grade_state = gr.State() # 使用 gr.State 存储 content_grade
|
|
|
|
| 3136 |
)
|
| 3137 |
|
| 3138 |
# 当输入 YouTube 链接时触发
|
| 3139 |
+
process_youtube_link_inputs = [password, youtube_link, LLM_model]
|
| 3140 |
process_youtube_link_outputs = [
|
| 3141 |
video_id,
|
| 3142 |
questions_answers_json,
|
|
|
|
| 3217 |
{
|
| 3218 |
'button': transcript_create_button,
|
| 3219 |
'action': create_LLM_content,
|
| 3220 |
+
'inputs': [video_id, df_string_output, transcript_kind, LLM_model],
|
| 3221 |
'outputs': [df_string_output]
|
| 3222 |
},
|
| 3223 |
{
|
|
|
|
| 3248 |
{
|
| 3249 |
'button': reading_passage_create_button,
|
| 3250 |
'action': create_LLM_content,
|
| 3251 |
+
'inputs': [video_id, df_string_output, reading_passage_kind, LLM_model],
|
| 3252 |
'outputs': [reading_passage_text]
|
| 3253 |
},
|
| 3254 |
{
|
|
|
|
| 3279 |
{
|
| 3280 |
'button': summary_create_button,
|
| 3281 |
'action': create_LLM_content,
|
| 3282 |
+
'inputs': [video_id, df_string_output, summary_kind, LLM_model],
|
| 3283 |
'outputs': [summary_text]
|
| 3284 |
},
|
| 3285 |
{
|
|
|
|
| 3310 |
{
|
| 3311 |
'button': key_moments_create_button,
|
| 3312 |
'action': create_LLM_content,
|
| 3313 |
+
'inputs': [video_id, df_string_output, key_moments_kind, LLM_model],
|
| 3314 |
'outputs': [key_moments]
|
| 3315 |
},
|
| 3316 |
{
|
|
|
|
| 3341 |
{
|
| 3342 |
'button': questions_create_button,
|
| 3343 |
'action': create_LLM_content,
|
| 3344 |
+
'inputs': [video_id, df_string_output, questions_kind, LLM_model],
|
| 3345 |
'outputs': [questions_json]
|
| 3346 |
},
|
| 3347 |
{
|
|
|
|
| 3372 |
{
|
| 3373 |
'button': questions_answers_create_button,
|
| 3374 |
'action': create_LLM_content,
|
| 3375 |
+
'inputs': [video_id, df_string_output, questions_answers_kind, LLM_model],
|
| 3376 |
'outputs': [questions_answers_json]
|
| 3377 |
},
|
| 3378 |
{
|
|
|
|
| 3403 |
{
|
| 3404 |
'button': worksheet_create_button,
|
| 3405 |
'action': create_LLM_content,
|
| 3406 |
+
'inputs': [video_id, df_string_output, worksheet_kind, LLM_model],
|
| 3407 |
'outputs': [worksheet_json]
|
| 3408 |
},
|
| 3409 |
{
|