Spaces:
Sleeping
Sleeping
update
Browse files- README.md +1 -1
- app.py +64 -28
- requirements.txt +4 -2
README.md
CHANGED
|
@@ -4,7 +4,7 @@ emoji: 📚
|
|
| 4 |
colorFrom: red
|
| 5 |
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 4.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
|
|
|
| 4 |
colorFrom: red
|
| 5 |
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.36.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
app.py
CHANGED
|
@@ -34,6 +34,10 @@ from googleapiclient.http import MediaIoBaseUpload
|
|
| 34 |
|
| 35 |
from educational_material import EducationalMaterial
|
| 36 |
from storage_service import GoogleCloudStorage
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
import boto3
|
| 39 |
|
|
@@ -92,6 +96,19 @@ TRANSCRIPTS = []
|
|
| 92 |
CURRENT_INDEX = 0
|
| 93 |
CHAT_LIMIT = 5
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
# CLIENTS CONFIG
|
| 96 |
GBQ_CLIENT = bigquery.Client.from_service_account_info(json.loads(GBQ_KEY))
|
| 97 |
GROQ_CLIENT = Groq(api_key=GROQ_API_KEY)
|
|
@@ -716,18 +733,17 @@ def split_data(df_string, word_base=100000):
|
|
| 716 |
start_idx = i * part_size
|
| 717 |
end_idx = min((i + 1) * part_size, len(data))
|
| 718 |
# Serialize the segment back to a JSON string
|
| 719 |
-
segment = json.dumps(data[start_idx:end_idx])
|
| 720 |
segments.append(segment)
|
| 721 |
-
|
| 722 |
return segments
|
| 723 |
|
| 724 |
def generate_content_by_open_ai(sys_content, user_content, response_format=None, model_name=None):
|
| 725 |
-
print("
|
| 726 |
if model_name == "gpt-4-turbo":
|
| 727 |
model = "gpt-4-turbo"
|
| 728 |
else:
|
| 729 |
model = "gpt-4o"
|
| 730 |
-
print(f"model: {model}")
|
| 731 |
|
| 732 |
messages = [
|
| 733 |
{"role": "system", "content": sys_content},
|
|
@@ -770,16 +786,29 @@ def generate_content_by_open_ai(sys_content, user_content, response_format=None,
|
|
| 770 |
# content = response_body.get('content')[0].get('text')
|
| 771 |
# return content
|
| 772 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 773 |
def generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=None, model_name=None):
|
| 774 |
# 使用 OpenAI 生成基于上传数据的问题
|
| 775 |
|
| 776 |
-
|
|
|
|
|
|
|
|
|
|
| 777 |
# print(f"LLM: {LLM_model}")
|
| 778 |
# content = generate_content_by_bedrock(sys_content, user_content)
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
|
| 784 |
print("=====content=====")
|
| 785 |
print(content)
|
|
@@ -830,30 +859,36 @@ def get_reading_passage(video_id, df_string, source, LLM_model=None):
|
|
| 830 |
return reading_passage_json
|
| 831 |
|
| 832 |
def generate_reading_passage(df_string, LLM_model=None):
|
| 833 |
-
print("===generate_reading_passage===")
|
|
|
|
|
|
|
| 834 |
segments = split_data(df_string, word_base=100000)
|
| 835 |
all_content = []
|
| 836 |
model_name = "gpt-4-turbo"
|
| 837 |
# model_name = "gpt-4o"
|
| 838 |
|
| 839 |
for segment in segments:
|
| 840 |
-
sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,使用 zh-TW"
|
| 841 |
user_content = f"""
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 857 |
content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model, model_name=model_name)
|
| 858 |
all_content.append(content + "\n")
|
| 859 |
|
|
@@ -1330,7 +1365,7 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_
|
|
| 1330 |
# 文本:{segment}
|
| 1331 |
|
| 1332 |
# Rule
|
| 1333 |
-
1. 請根據文本,提取出 5 段重點摘要,並給出對應的時間軸,每一段重點的時間軸範圍大於1分鐘,但小於 1/3 總逐字稿長度
|
| 1334 |
2. 內容當中,如果有列舉方法、模式或是工具,就用 bulletpoint 或是 編號方式 列出,並在列舉部分的頭尾用[]匡列(example: FAANG 是以下五間公司: [1. A公司 2.B公司 3.C公司 4.D公司 5.E公司 ],...)
|
| 1335 |
3. 注意不要遺漏任何一段時間軸的內容 從零秒開始,以這種方式分析整個文本,從零秒開始分析,直到結束。這很重要
|
| 1336 |
4. 結尾的時間如果有總結性的話,也要擷取
|
|
@@ -1342,11 +1377,12 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_
|
|
| 1342 |
1. 請一定要用 zh-TW,這非常重要!
|
| 1343 |
2. 如果是疑似主播、主持人的圖片場景,且沒有任何有用的資訊,請不要選取
|
| 1344 |
3. 如果頭尾的情節不是重點,特別是打招呼或是介紹自己是誰、或是finally say goodbye 就是不重要的情節,就不用擷取
|
|
|
|
| 1345 |
|
| 1346 |
Example: retrun JSON
|
| 1347 |
{{key_moments:[{{
|
| 1348 |
"start": "00:00",
|
| 1349 |
-
"end": "01:
|
| 1350 |
"text": "逐字稿的重點摘要",
|
| 1351 |
"keywords": ["關鍵字", "關鍵字"]
|
| 1352 |
}}]
|
|
@@ -3191,7 +3227,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
| 3191 |
with gr.Accordion("See Details", open=False) as see_details:
|
| 3192 |
with gr.Row():
|
| 3193 |
is_env_prod = gr.Checkbox(value=False, label="is_env_prod")
|
| 3194 |
-
LLM_model = gr.Dropdown(label="LLM Model", choices=["open-ai-gpt-4o", "anthropic-claude-3-sonnet"], value="open-ai-gpt-4o", visible=True, interactive=True)
|
| 3195 |
with gr.Tab("逐字稿本文"):
|
| 3196 |
with gr.Row() as transcript_admmin:
|
| 3197 |
transcript_kind = gr.Textbox(value="transcript", show_label=False)
|
|
|
|
| 34 |
|
| 35 |
from educational_material import EducationalMaterial
|
| 36 |
from storage_service import GoogleCloudStorage
|
| 37 |
+
from google.cloud import aiplatform
|
| 38 |
+
from vertexai.preview.generative_models import GenerativeModel
|
| 39 |
+
from google.oauth2.service_account import Credentials
|
| 40 |
+
|
| 41 |
|
| 42 |
import boto3
|
| 43 |
|
|
|
|
| 96 |
CURRENT_INDEX = 0
|
| 97 |
CHAT_LIMIT = 5
|
| 98 |
|
| 99 |
+
# Google Vertex AI (aiplatform) setup.
# The service-account JSON held in GBQ_KEY is parsed and turned into scoped
# credentials that are handed to the aiplatform SDK before the Gemini model
# handle is created.
google_service_account_info_dict = json.loads(GBQ_KEY)
GOOGLE_SCOPES = ["https://www.googleapis.com/auth/cloud-platform"]
# Backward-compatible alias for the original (misspelled) constant name, in
# case other parts of the file still reference it.
GOOGPE_SCOPES = GOOGLE_SCOPES
google_creds = Credentials.from_service_account_info(
    google_service_account_info_dict, scopes=GOOGLE_SCOPES
)
aiplatform.init(
    project="junyiacademy",
    # `service_account` takes the account's e-mail address (a string), not the
    # whole key dict; authentication itself is carried by `credentials`.
    # Falls back to None (SDK default) if the key has no "client_email".
    service_account=google_service_account_info_dict.get("client_email"),
    credentials=google_creds,
)
GEMINI_MODEL = GenerativeModel("gemini-pro")
|
| 111 |
+
|
| 112 |
# CLIENTS CONFIG
|
| 113 |
GBQ_CLIENT = bigquery.Client.from_service_account_info(json.loads(GBQ_KEY))
|
| 114 |
GROQ_CLIENT = Groq(api_key=GROQ_API_KEY)
|
|
|
|
| 733 |
start_idx = i * part_size
|
| 734 |
end_idx = min((i + 1) * part_size, len(data))
|
| 735 |
# Serialize the segment back to a JSON string
|
| 736 |
+
segment = json.dumps(data[start_idx:end_idx]).encode('utf-8').decode('unicode_escape')
|
| 737 |
segments.append(segment)
|
|
|
|
| 738 |
return segments
|
| 739 |
|
| 740 |
def generate_content_by_open_ai(sys_content, user_content, response_format=None, model_name=None):
|
| 741 |
+
print("generate_content_by_open_ai")
|
| 742 |
if model_name == "gpt-4-turbo":
|
| 743 |
model = "gpt-4-turbo"
|
| 744 |
else:
|
| 745 |
model = "gpt-4o"
|
| 746 |
+
print(f"LLM model: {model}")
|
| 747 |
|
| 748 |
messages = [
|
| 749 |
{"role": "system", "content": sys_content},
|
|
|
|
| 786 |
# content = response_body.get('content')[0].get('text')
|
| 787 |
# return content
|
| 788 |
|
| 789 |
+
def generate_content_by_gemini(sys_content, user_content, response_format=None, model_name=None):
    """Generate text with the module-level ``GEMINI_MODEL``.

    The system and user prompts are joined into one prompt string
    (``"{sys_content}, {user_content}"``) and sent in a single call.

    Args:
        sys_content: Instruction text placed first in the prompt.
        user_content: User prompt appended after the instructions.
        response_format: Accepted so the signature matches
            ``generate_content_by_open_ai``; not used by this function.
        model_name: Only logged; the request always goes to ``GEMINI_MODEL``.

    Returns:
        The text of the first part of the first candidate in the response.

    Raises:
        IndexError: If the response has no candidates, or the first candidate
            has no parts. The exception type matches what the unguarded
            indexing raised before, so existing handlers keep working; only
            the message is more descriptive.
    """
    print("generate_content_by_gemini")
    print(f"LLM using: {model_name}")
    model_response = GEMINI_MODEL.generate_content(
        f"{sys_content}, {user_content}"
    )

    # NOTE(review): an empty candidate list presumably means the prompt or the
    # output was blocked - confirm against the Vertex AI response docs.
    candidates = model_response.candidates
    if not candidates:
        raise IndexError("Gemini response contains no candidates")

    parts = candidates[0].content.parts
    if not parts:
        raise IndexError("Gemini response candidate contains no content parts")

    return parts[0].text
|
| 797 |
+
|
| 798 |
+
|
| 799 |
def generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=None, model_name=None):
|
| 800 |
# 使用 OpenAI 生成基于上传数据的问题
|
| 801 |
|
| 802 |
+
if LLM_model == "gemini-pro":
|
| 803 |
+
print(f"LLM: {LLM_model}")
|
| 804 |
+
content = generate_content_by_gemini(sys_content, user_content, response_format, model_name=model_name)
|
| 805 |
+
# elif LLM_model == "anthropic-claude-3-sonnet":
|
| 806 |
# print(f"LLM: {LLM_model}")
|
| 807 |
# content = generate_content_by_bedrock(sys_content, user_content)
|
| 808 |
+
else:
|
| 809 |
+
print(f"LLM: {LLM_model}")
|
| 810 |
+
print(f"model_name: {model_name}")
|
| 811 |
+
content = generate_content_by_open_ai(sys_content, user_content, response_format, model_name=model_name)
|
| 812 |
|
| 813 |
print("=====content=====")
|
| 814 |
print(content)
|
|
|
|
| 859 |
return reading_passage_json
|
| 860 |
|
| 861 |
def generate_reading_passage(df_string, LLM_model=None):
|
| 862 |
+
print("===generate_reading_passage 0===")
|
| 863 |
+
print(df_string)
|
| 864 |
+
|
| 865 |
segments = split_data(df_string, word_base=100000)
|
| 866 |
all_content = []
|
| 867 |
model_name = "gpt-4-turbo"
|
| 868 |
# model_name = "gpt-4o"
|
| 869 |
|
| 870 |
for segment in segments:
|
| 871 |
+
sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,使用 zh-TW"
|
| 872 |
user_content = f"""
|
| 873 |
+
# 文本 {segment}
|
| 874 |
+
|
| 875 |
+
# rules:
|
| 876 |
+
- 根據文本,抓取重點
|
| 877 |
+
- 去除人類講課時口語的問答句,重新拆解成文章,建立適合閱讀語句通順的 Reading Passage
|
| 878 |
+
- 只需要專注提供 Reading Passage,字數在 500 字以內
|
| 879 |
+
- 敘述中,請把數學或是專業術語,用 Latex 包覆($...$)
|
| 880 |
+
- 加減乘除、根號、次方等等的運算式口語也換成 LATEX 數學符號
|
| 881 |
+
|
| 882 |
+
# restrictions:
|
| 883 |
+
- 請一定要使用繁體中文 zh-TW,這很重要
|
| 884 |
+
- 產生的結果不要前後文解釋,也不要敘述這篇文章怎麼產生的
|
| 885 |
+
- 請直接給出文章,不用介紹怎麼處理的或是文章字數等等
|
| 886 |
+
- 字數在 500 字以內
|
| 887 |
"""
|
| 888 |
+
|
| 889 |
+
print("======user_content 0 ===")
|
| 890 |
+
print(user_content)
|
| 891 |
+
|
| 892 |
content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model, model_name=model_name)
|
| 893 |
all_content.append(content + "\n")
|
| 894 |
|
|
|
|
| 1365 |
# 文本:{segment}
|
| 1366 |
|
| 1367 |
# Rule
|
| 1368 |
+
1. 請根據文本,提取出 5~8 段重點摘要,並給出對應的時間軸,每一段重點的時間軸範圍大於1分鐘,但小於 1/3 總逐字稿長度
|
| 1369 |
2. 內容當中,如果有列舉方法、模式或是工具,就用 bulletpoint 或是 編號方式 列出,並在列舉部分的頭尾用[]匡列(example: FAANG 是以下五間公司: [1. A公司 2.B公司 3.C公司 4.D公司 5.E公司 ],...)
|
| 1370 |
3. 注意不要遺漏任何一段時間軸的內容 從零秒開始,以這種方式分析整個文本,從零秒開始分析,直到結束。這很重要
|
| 1371 |
4. 結尾的時間如果有總結性的話,也要擷取
|
|
|
|
| 1377 |
1. 請一定要用 zh-TW,這非常重要!
|
| 1378 |
2. 如果是疑似主播、主持人的圖片場景,且沒有任何有用的資訊,請不要選取
|
| 1379 |
3. 如果頭尾的情節不是重點,特別是打招呼或是介紹自己是誰、或是finally say goodbye 就是不重要的情節,就不用擷取
|
| 1380 |
+
4. 時間軸請取到秒數,不要只取到分鐘數,這很重要
|
| 1381 |
|
| 1382 |
Example: retrun JSON
|
| 1383 |
{{key_moments:[{{
|
| 1384 |
"start": "00:00",
|
| 1385 |
+
"end": "01:35",
|
| 1386 |
"text": "逐字稿的重點摘要",
|
| 1387 |
"keywords": ["關鍵字", "關鍵字"]
|
| 1388 |
}}]
|
|
|
|
| 3227 |
with gr.Accordion("See Details", open=False) as see_details:
|
| 3228 |
with gr.Row():
|
| 3229 |
is_env_prod = gr.Checkbox(value=False, label="is_env_prod")
|
| 3230 |
+
LLM_model = gr.Dropdown(label="LLM Model", choices=["open-ai-gpt-4o", "anthropic-claude-3-sonnet", "gemini-pro"], value="open-ai-gpt-4o", visible=True, interactive=True)
|
| 3231 |
with gr.Tab("逐字稿本文"):
|
| 3232 |
with gr.Row() as transcript_admmin:
|
| 3233 |
transcript_kind = gr.Textbox(value="transcript", show_label=False)
|
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
gradio==4.
|
| 2 |
pandas
|
| 3 |
openai>=1.16.2
|
| 4 |
requests
|
|
@@ -12,9 +12,11 @@ google-auth-httplib2
|
|
| 12 |
google-auth-oauthlib
|
| 13 |
google-cloud-storage
|
| 14 |
google-cloud-bigquery
|
|
|
|
| 15 |
groq
|
| 16 |
yt_dlp
|
| 17 |
uuid
|
| 18 |
gtts
|
| 19 |
boto3
|
| 20 |
-
pydub
|
|
|
|
|
|
| 1 |
+
gradio==4.36.0
|
| 2 |
pandas
|
| 3 |
openai>=1.16.2
|
| 4 |
requests
|
|
|
|
| 12 |
google-auth-oauthlib
|
| 13 |
google-cloud-storage
|
| 14 |
google-cloud-bigquery
|
| 15 |
+
google-cloud-aiplatform
|
| 16 |
groq
|
| 17 |
yt_dlp
|
| 18 |
uuid
|
| 19 |
gtts
|
| 20 |
boto3
|
| 21 |
+
pydub
|
| 22 |
+
vertexai
|