Spaces:
Sleeping
Sleeping
pw
Browse files
app.py
CHANGED
|
@@ -5,6 +5,8 @@ from bs4 import BeautifulSoup
|
|
| 5 |
from docx import Document
|
| 6 |
import os
|
| 7 |
from openai import OpenAI
|
|
|
|
|
|
|
| 8 |
import json
|
| 9 |
|
| 10 |
from youtube_transcript_api import YouTubeTranscriptApi
|
|
@@ -55,11 +57,24 @@ TRANSCRIPTS = []
|
|
| 55 |
CURRENT_INDEX = 0
|
| 56 |
VIDEO_ID = ""
|
| 57 |
|
|
|
|
|
|
|
| 58 |
OPEN_AI_KEY = os.getenv("OPEN_AI_KEY")
|
| 59 |
OPEN_AI_CLIENT = OpenAI(api_key=OPEN_AI_KEY)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
| 61 |
GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
# ====gcs====
|
| 64 |
def init_gcs_client(service_account_key_string):
|
| 65 |
"""使用服务账号密钥文件创建 GCS 客户端"""
|
|
@@ -293,7 +308,9 @@ def update_file_on_drive(service, file_id, file_content):
|
|
| 293 |
|
| 294 |
|
| 295 |
# ---- Main Functions ----
|
| 296 |
-
def process_file(file):
|
|
|
|
|
|
|
| 297 |
# 读取文件
|
| 298 |
if file.name.endswith('.csv'):
|
| 299 |
df = pd.read_csv(file)
|
|
@@ -476,7 +493,9 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
| 476 |
|
| 477 |
return updated_transcript_json
|
| 478 |
|
| 479 |
-
def process_youtube_link(link):
|
|
|
|
|
|
|
| 480 |
# 使用 YouTube API 获取逐字稿
|
| 481 |
# 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
|
| 482 |
video_id = extract_youtube_id(link)
|
|
@@ -870,7 +889,9 @@ def get_questions(video_id, df_string, source="gcs"):
|
|
| 870 |
print("=====get_questions=====")
|
| 871 |
return q1, q2, q3
|
| 872 |
|
| 873 |
-
def change_questions(df_string):
|
|
|
|
|
|
|
| 874 |
questions = generate_questions(df_string)
|
| 875 |
q1 = questions[0] if len(questions) > 0 else ""
|
| 876 |
q2 = questions[1] if len(questions) > 1 else ""
|
|
@@ -882,7 +903,9 @@ def change_questions(df_string):
|
|
| 882 |
print("=====get_questions=====")
|
| 883 |
return q1, q2, q3
|
| 884 |
|
| 885 |
-
def respond(user_message, data, chat_history, socratic_mode=False):
|
|
|
|
|
|
|
| 886 |
print("=== 變數:user_message ===")
|
| 887 |
print(user_message)
|
| 888 |
print("=== 變數:chat_history ===")
|
|
@@ -966,7 +989,96 @@ def respond(user_message, data, chat_history, socratic_mode=False):
|
|
| 966 |
# 返回聊天历史和空字符串清空输入框
|
| 967 |
return "", chat_history
|
| 968 |
|
| 969 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 970 |
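# Reject over-long messages: the check below allows up to 1500 characters, although the error text still says 500 (五百字) — confirm the intended limit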
|
| 971 |
if len(user_message) > 1500:
|
| 972 |
error_msg = "你的訊息太長了,請縮短訊息長度至五百字以內"
|
|
@@ -1068,6 +1180,7 @@ def chat_with_youtube_transcript(youtube_id, thread_id, trascript, user_message,
|
|
| 1068 |
# 返回聊天历史和空字符串清空输入框
|
| 1069 |
return "", chat_history, thread.id
|
| 1070 |
|
|
|
|
| 1071 |
def poll_run_status(run_id, thread_id, timeout=600, poll_interval=5):
|
| 1072 |
"""
|
| 1073 |
Polls the status of a Run and handles different statuses appropriately.
|
|
@@ -1177,8 +1290,9 @@ HEAD = """
|
|
| 1177 |
with gr.Blocks() as demo:
|
| 1178 |
with gr.Row():
|
| 1179 |
with gr.Column(scale=2):
|
|
|
|
| 1180 |
file_upload = gr.File(label="Upload your CSV or Word file", visible=False)
|
| 1181 |
-
youtube_link = gr.Textbox(label="Enter YouTube Link")
|
| 1182 |
video_id = gr.Textbox(label="video_id", visible=False)
|
| 1183 |
youtube_link_btn = gr.Button("Submit_YouTube_Link")
|
| 1184 |
web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
|
|
@@ -1188,6 +1302,10 @@ with gr.Blocks() as demo:
|
|
| 1188 |
msg = gr.Textbox(label="Message")
|
| 1189 |
send_button = gr.Button("Send")
|
| 1190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1191 |
with gr.Column(scale=3):
|
| 1192 |
with gr.Tab("圖文"):
|
| 1193 |
transcript_html = gr.HTML(label="YouTube Transcript and Video")
|
|
@@ -1260,27 +1378,33 @@ with gr.Blocks() as demo:
|
|
| 1260 |
# chat_with_youtube_transcript
|
| 1261 |
send_button.click(
|
| 1262 |
chat_with_youtube_transcript,
|
| 1263 |
-
inputs=[video_id, thread_id, df_string_output, msg, chatbot, socratic_mode_btn],
|
| 1264 |
outputs=[msg, chatbot, thread_id]
|
| 1265 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1266 |
# 连接按钮点击事件
|
| 1267 |
btn_1.click(
|
| 1268 |
chat_with_youtube_transcript,
|
| 1269 |
-
inputs=[video_id, thread_id, df_string_output, btn_1, chatbot, socratic_mode_btn],
|
| 1270 |
outputs=[msg, chatbot, thread_id]
|
| 1271 |
)
|
| 1272 |
btn_2.click(
|
| 1273 |
chat_with_youtube_transcript,
|
| 1274 |
-
inputs=[video_id, thread_id, df_string_output, btn_2, chatbot, socratic_mode_btn],
|
| 1275 |
outputs=[msg, chatbot, thread_id]
|
| 1276 |
)
|
| 1277 |
btn_3.click(
|
| 1278 |
chat_with_youtube_transcript,
|
| 1279 |
-
inputs=[video_id, thread_id, df_string_output, btn_3, chatbot, socratic_mode_btn],
|
| 1280 |
outputs=[msg, chatbot, thread_id]
|
| 1281 |
)
|
| 1282 |
|
| 1283 |
-
btn_create_question.click(change_questions, inputs = [df_string_output], outputs = [btn_1, btn_2, btn_3])
|
| 1284 |
|
| 1285 |
# file_upload.change(process_file, inputs=file_upload, outputs=df_string_output)
|
| 1286 |
file_upload.change(process_file, inputs=file_upload, outputs=[btn_1, btn_2, btn_3, df_summarise, df_string_output])
|
|
@@ -1288,7 +1412,7 @@ with gr.Blocks() as demo:
|
|
| 1288 |
# 当输入 YouTube 链接时触发
|
| 1289 |
youtube_link.change(
|
| 1290 |
process_youtube_link,
|
| 1291 |
-
inputs=youtube_link,
|
| 1292 |
outputs=[
|
| 1293 |
video_id,
|
| 1294 |
btn_1,
|
|
@@ -1307,7 +1431,7 @@ with gr.Blocks() as demo:
|
|
| 1307 |
|
| 1308 |
youtube_link_btn.click(
|
| 1309 |
process_youtube_link,
|
| 1310 |
-
inputs=youtube_link,
|
| 1311 |
outputs=[
|
| 1312 |
video_id,
|
| 1313 |
btn_1,
|
|
|
|
| 5 |
from docx import Document
|
| 6 |
import os
|
| 7 |
from openai import OpenAI
|
| 8 |
+
from groq import Groq
|
| 9 |
+
|
| 10 |
import json
|
| 11 |
|
| 12 |
from youtube_transcript_api import YouTubeTranscriptApi
|
|
|
|
| 57 |
CURRENT_INDEX = 0
|
| 58 |
VIDEO_ID = ""
|
| 59 |
|
| 60 |
+
PASSWORD = os.getenv("PASSWORD")
|
| 61 |
+
|
| 62 |
OPEN_AI_KEY = os.getenv("OPEN_AI_KEY")
|
| 63 |
OPEN_AI_CLIENT = OpenAI(api_key=OPEN_AI_KEY)
|
| 64 |
+
|
| 65 |
+
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
|
| 66 |
+
GROQ_CLIENT = Groq(api_key=GROQ_API_KEY)
|
| 67 |
+
|
| 68 |
DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
| 69 |
GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
| 70 |
|
| 71 |
+
# 驗證 password
|
| 72 |
+
def verify_password(password):
    """Check the submitted password against the configured ``PASSWORD``.

    Args:
        password: The value typed into the password field.

    Returns:
        True when ``password`` matches ``PASSWORD``.

    Raises:
        gr.Error: With the message "密碼錯誤" when the password does not
            match, or when either the submitted value or ``PASSWORD`` is
            not a string (e.g. the environment variable is unset).
    """
    import hmac

    # Fail closed: an unset PASSWORD (os.getenv returned None) must never
    # authenticate a missing/None submission, which a plain `==` would allow.
    if isinstance(password, str) and isinstance(PASSWORD, str):
        # Constant-time comparison; encode to bytes because compare_digest
        # rejects non-ASCII str arguments.
        if hmac.compare_digest(password.encode("utf-8"), PASSWORD.encode("utf-8")):
            return True

    raise gr.Error("密碼錯誤")
|
| 77 |
+
|
| 78 |
# ====gcs====
|
| 79 |
def init_gcs_client(service_account_key_string):
|
| 80 |
"""使用服务账号密钥文件创建 GCS 客户端"""
|
|
|
|
| 308 |
|
| 309 |
|
| 310 |
# ---- Main Functions ----
|
| 311 |
+
def process_file(password, file):
|
| 312 |
+
verify_password(password)
|
| 313 |
+
|
| 314 |
# 读取文件
|
| 315 |
if file.name.endswith('.csv'):
|
| 316 |
df = pd.read_csv(file)
|
|
|
|
| 493 |
|
| 494 |
return updated_transcript_json
|
| 495 |
|
| 496 |
+
def process_youtube_link(password, link):
|
| 497 |
+
verify_password(password)
|
| 498 |
+
|
| 499 |
# 使用 YouTube API 获取逐字稿
|
| 500 |
# 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
|
| 501 |
video_id = extract_youtube_id(link)
|
|
|
|
| 889 |
print("=====get_questions=====")
|
| 890 |
return q1, q2, q3
|
| 891 |
|
| 892 |
+
def change_questions(password, df_string):
|
| 893 |
+
verify_password(password)
|
| 894 |
+
|
| 895 |
questions = generate_questions(df_string)
|
| 896 |
q1 = questions[0] if len(questions) > 0 else ""
|
| 897 |
q2 = questions[1] if len(questions) > 1 else ""
|
|
|
|
| 903 |
print("=====get_questions=====")
|
| 904 |
return q1, q2, q3
|
| 905 |
|
| 906 |
+
def respond(password, user_message, data, chat_history, socratic_mode=False):
|
| 907 |
+
verify_password(password)
|
| 908 |
+
|
| 909 |
print("=== 變數:user_message ===")
|
| 910 |
print(user_message)
|
| 911 |
print("=== 變數:chat_history ===")
|
|
|
|
| 989 |
# 返回聊天历史和空字符串清空输入框
|
| 990 |
return "", chat_history
|
| 991 |
|
| 992 |
+
def chat_with_groq(password, user_message, data, chat_history, socratic_mode=False):
    """Answer a student's message through the Groq chat-completions client.

    Args:
        password: Checked by ``verify_password`` before anything else runs.
        user_message: The student's new message.
        data: JSON string holding a list of entries used as the source text.
        chat_history: List of ``(user, assistant)`` pairs, or ``None``.
        socratic_mode: When true, use the Socratic-tutor system prompt.

    Returns:
        A ``("", chat_history)`` tuple: an empty string (to clear the input
        box) and the history with the new exchange appended.
    """
    verify_password(password)

    print("=== 變數:user_message ===")
    print(user_message)
    print("=== 變數:chat_history ===")
    print(chat_history)

    data_json = json.loads(data)
    for entry in data_json:
        entry.pop('embed_url', None)  # Remove 'embed_url' if it exists
        entry.pop('screenshot_path', None)
    # Use the cleaned entries in the prompt. Previously the raw `data` string
    # was interpolated, so the stripped fields were still sent to the model.
    data = json.dumps(data_json, ensure_ascii=False)

    if socratic_mode:
        sys_content = f"""
            你是一個擅長資料分析跟影片教學的老師,user 為學生
            請用 {data} 為資料文本,自行判斷資料的種類,
            並進行對話,使用 台灣人的口與表達,及繁體中文zh-TW

            如果是影片類型,不用解釋逐字稿格式,直接回答學生問題
            請你用蘇格拉底式的提問方式,引導學生思考,並且給予學生一些提示
            不要直接給予答案,讓學生自己思考
            但可以給予一些提示跟引導,例如給予影片的時間軸,讓學生自己去找答案

            如果學生問了一些問題你無法判斷,請告訴學生你無法判斷,並建議學生可以問其他問題
            或者你可以問學生一些問題,幫助學生更好的理解資料

            如果學生的問題與資料文本無關,請告訴學生你無法回答超出範圍的問題

            最後,在你回答的開頭標註【蘇格拉底助教】
        """
    else:
        sys_content = f"""
            你是一個擅長資料分析跟影片教學的老師,user 為學生
            請用 {data} 為資料文本,自行判斷資料的種類,
            並進行對話,使用 zh-TW

            如果是影片類型,不用解釋逐字稿格式,直接回答學生問題
            但可以給予一些提示跟引導,例如給予影片的時間軸,讓學生可以找到相對應的時間點

            如果學生問了一些問題你無法判斷,請告訴學生你無法判斷,並建議學生可以問其他問題
            或者你可以問學生一些問題,幫助學生更好的理解資料

            如果學生的問題與資料文本無關,請告訴學生你無法回答超出範圍的問題
        """

    messages = [
        {"role": "system", "content": sys_content}
    ]

    if chat_history is None:
        chat_history = []

    # chat_history = [(user, assistant), (user, assistant), ...]
    # Only the last 10 exchanges are sent as context. The slice is kept in a
    # separate view so the full history returned to the UI is not truncated.
    for chat in chat_history[-10:]:
        messages.append({"role": "user", "content": chat[0]})
        messages.append({"role": "assistant", "content": chat[1]})

    messages.append({"role": "user", "content": user_message})
    request_payload = {
        "model": "mixtral-8x7b-32768",
        "messages": messages,
        "max_tokens": 4000  # Deliberately generous; adjust as needed
    }
    response = GROQ_CLIENT.chat.completions.create(**request_payload)
    response_text = response.choices[0].message.content.strip()

    # Record the new exchange in the chat history
    chat_history.append((user_message, response_text))

    # Return an empty string to clear the input box, plus the chat history
    return "", chat_history
|
| 1077 |
+
|
| 1078 |
+
|
| 1079 |
+
def chat_with_youtube_transcript(password, youtube_id, thread_id, trascript, user_message, chat_history, socratic_mode=False):
|
| 1080 |
+
verify_password(password)
|
| 1081 |
+
|
| 1082 |
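# Reject over-long messages: the check below allows up to 1500 characters, although the error text still says 500 (五百字) — confirm the intended limit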
|
| 1083 |
if len(user_message) > 1500:
|
| 1084 |
error_msg = "你的訊息太長了,請縮短訊息長度至五百字以內"
|
|
|
|
| 1180 |
# 返回聊天历史和空字符串清空输入框
|
| 1181 |
return "", chat_history, thread.id
|
| 1182 |
|
| 1183 |
+
|
| 1184 |
def poll_run_status(run_id, thread_id, timeout=600, poll_interval=5):
|
| 1185 |
"""
|
| 1186 |
Polls the status of a Run and handles different statuses appropriately.
|
|
|
|
| 1290 |
with gr.Blocks() as demo:
|
| 1291 |
with gr.Row():
|
| 1292 |
with gr.Column(scale=2):
|
| 1293 |
+
password = gr.Textbox(label="Password", type="password", elem_id="password_input")
|
| 1294 |
file_upload = gr.File(label="Upload your CSV or Word file", visible=False)
|
| 1295 |
+
youtube_link = gr.Textbox(label="Enter YouTube Link", elem_id="youtube_link_input")
|
| 1296 |
video_id = gr.Textbox(label="video_id", visible=False)
|
| 1297 |
youtube_link_btn = gr.Button("Submit_YouTube_Link")
|
| 1298 |
web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
|
|
|
|
| 1302 |
msg = gr.Textbox(label="Message")
|
| 1303 |
send_button = gr.Button("Send")
|
| 1304 |
|
| 1305 |
+
groq_chatbot = gr.Chatbot(label="groq mode chatbot")
|
| 1306 |
+
groq_msg = gr.Textbox(label="Message")
|
| 1307 |
+
groq_send_button = gr.Button("Send")
|
| 1308 |
+
|
| 1309 |
with gr.Column(scale=3):
|
| 1310 |
with gr.Tab("圖文"):
|
| 1311 |
transcript_html = gr.HTML(label="YouTube Transcript and Video")
|
|
|
|
| 1378 |
# chat_with_youtube_transcript
|
| 1379 |
send_button.click(
|
| 1380 |
chat_with_youtube_transcript,
|
| 1381 |
+
inputs=[password, video_id, thread_id, df_string_output, msg, chatbot, socratic_mode_btn],
|
| 1382 |
outputs=[msg, chatbot, thread_id]
|
| 1383 |
)
|
| 1384 |
+
# GROQ 模式
|
| 1385 |
+
groq_send_button.click(
|
| 1386 |
+
chat_with_groq,
|
| 1387 |
+
inputs=[password, groq_msg, df_string_output, groq_chatbot, socratic_mode_btn],
|
| 1388 |
+
outputs=[groq_msg, groq_chatbot]
|
| 1389 |
+
)
|
| 1390 |
# 连接按钮点击事件
|
| 1391 |
btn_1.click(
|
| 1392 |
chat_with_youtube_transcript,
|
| 1393 |
+
inputs=[password, video_id, thread_id, df_string_output, btn_1, chatbot, socratic_mode_btn],
|
| 1394 |
outputs=[msg, chatbot, thread_id]
|
| 1395 |
)
|
| 1396 |
btn_2.click(
|
| 1397 |
chat_with_youtube_transcript,
|
| 1398 |
+
inputs=[password, video_id, thread_id, df_string_output, btn_2, chatbot, socratic_mode_btn],
|
| 1399 |
outputs=[msg, chatbot, thread_id]
|
| 1400 |
)
|
| 1401 |
btn_3.click(
|
| 1402 |
chat_with_youtube_transcript,
|
| 1403 |
+
inputs=[password, video_id, thread_id, df_string_output, btn_3, chatbot, socratic_mode_btn],
|
| 1404 |
outputs=[msg, chatbot, thread_id]
|
| 1405 |
)
|
| 1406 |
|
| 1407 |
+
btn_create_question.click(change_questions, inputs = [password, df_string_output], outputs = [btn_1, btn_2, btn_3])
|
| 1408 |
|
| 1409 |
# file_upload.change(process_file, inputs=file_upload, outputs=df_string_output)
|
| 1410 |
# process_file takes (password, file); both components must be passed or the callback fails with a missing-argument TypeError.
file_upload.change(process_file, inputs=[password, file_upload], outputs=[btn_1, btn_2, btn_3, df_summarise, df_string_output])
|
|
|
|
| 1412 |
# 当输入 YouTube 链接时触发
|
| 1413 |
youtube_link.change(
|
| 1414 |
process_youtube_link,
|
| 1415 |
+
inputs=[password,youtube_link],
|
| 1416 |
outputs=[
|
| 1417 |
video_id,
|
| 1418 |
btn_1,
|
|
|
|
| 1431 |
|
| 1432 |
youtube_link_btn.click(
|
| 1433 |
process_youtube_link,
|
| 1434 |
+
inputs=[password, youtube_link],
|
| 1435 |
outputs=[
|
| 1436 |
video_id,
|
| 1437 |
btn_1,
|