Spaces:
Sleeping
Sleeping
transcript = process_transcript_and_screenshots(video_id)
Browse files
app.py
CHANGED
|
@@ -136,6 +136,27 @@ def set_public_permission(service, file_id):
|
|
| 136 |
fields='id',
|
| 137 |
).execute()
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
def process_file(file):
|
| 140 |
# 读取文件
|
| 141 |
if file.name.endswith('.csv'):
|
|
@@ -193,33 +214,48 @@ def extract_youtube_id(url):
|
|
| 193 |
else:
|
| 194 |
return None
|
| 195 |
|
| 196 |
-
def
|
| 197 |
-
# 使用 YouTube API 获取逐字稿
|
| 198 |
-
# 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
|
| 199 |
-
video_id = extract_youtube_id(link)
|
| 200 |
service = init_drive_service()
|
| 201 |
-
parent_folder_id = '1GgI4YVs0KckwStVQkLa1NZ8IpaEMurkL'
|
| 202 |
-
|
| 203 |
-
# 检查/创建视频ID命名的子文件夹
|
| 204 |
folder_id = create_folder_if_not_exists(service, video_id, parent_folder_id)
|
| 205 |
-
file_name = f
|
| 206 |
|
| 207 |
# 检查逐字稿是否存在
|
| 208 |
-
transcript = None
|
| 209 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
| 210 |
if not exists:
|
|
|
|
| 211 |
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['zh-TW'])
|
| 212 |
transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
| 213 |
-
upload_content_directly(service, file_name, folder_id, transcript_text)
|
| 214 |
print("逐字稿已上传到Google Drive")
|
| 215 |
else:
|
|
|
|
| 216 |
print("逐字稿已存在于Google Drive中")
|
| 217 |
transcript_text = download_file_as_string(service, file_id)
|
| 218 |
transcript = json.loads(transcript_text)
|
| 219 |
|
| 220 |
-
#
|
| 221 |
-
|
| 222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
formatted_transcript = []
|
| 225 |
screenshot_paths = []
|
|
@@ -228,8 +264,7 @@ def process_youtube_link(link):
|
|
| 228 |
start_time = format_seconds_to_time(entry['start'])
|
| 229 |
end_time = format_seconds_to_time(entry['start'] + entry['duration'])
|
| 230 |
embed_url = get_embedded_youtube_link(video_id, entry['start'])
|
| 231 |
-
|
| 232 |
-
screenshot_path = screenshot_youtube_video(video_id, entry['start'])
|
| 233 |
line = {
|
| 234 |
"start_time": start_time,
|
| 235 |
"end_time": end_time,
|
|
@@ -245,6 +280,10 @@ def process_youtube_link(link):
|
|
| 245 |
print(html_content)
|
| 246 |
print("=====html_content=====")
|
| 247 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
# 确保返回与 UI 组件预期匹配的输出
|
| 249 |
return questions[0] if len(questions) > 0 else "", \
|
| 250 |
questions[1] if len(questions) > 1 else "", \
|
|
|
|
| 136 |
fields='id',
|
| 137 |
).execute()
|
| 138 |
|
| 139 |
+
def update_file_on_drive(service, file_id, file_content):
    """
    Replace the content of an existing Google Drive file.

    Args:
    - service: Google Drive API service instance.
    - file_id: ID of the file to update.
    - file_content: New file content, as a string.
    """
    # Wrap the UTF-8 encoded content in an in-memory byte stream for the upload.
    payload = file_content.encode('utf-8')
    upload = MediaIoBaseUpload(io.BytesIO(payload), mimetype='application/json', resumable=True)

    # Send the update request for the given file.
    request = service.files().update(fileId=file_id, media_body=upload)
    response = request.execute()

    print(f"文件已更新,文件ID: {response['id']}")
|
| 159 |
+
|
| 160 |
def process_file(file):
|
| 161 |
# 读取文件
|
| 162 |
if file.name.endswith('.csv'):
|
|
|
|
| 214 |
else:
|
| 215 |
return None
|
| 216 |
|
| 217 |
+
def process_transcript_and_screenshots(video_id, parent_folder_id='1GgI4YVs0KckwStVQkLa1NZ8IpaEMurkL'):
    """
    Load the transcript for a YouTube video and make sure every entry has a screenshot link.

    The transcript JSON is kept in a Google Drive sub-folder named after the
    video ID. If the file is missing, the transcript is fetched from YouTube
    (zh-TW) and uploaded; otherwise the stored copy is downloaded. Every entry
    without an 'img_src' key gets a screenshot taken at its start time,
    uploaded to the same folder, and linked via 'img_src'. The stored
    transcript is rewritten only when at least one entry was changed.

    Args:
    - video_id: YouTube video ID; must be non-empty.
    - parent_folder_id: ID of the Drive folder that holds the per-video
      sub-folders. Defaults to the folder ID this function previously
      hard-coded.

    Returns:
    The transcript as a list of dict entries, each carrying an 'img_src' key.

    Raises:
    ValueError: if video_id is empty or None.
    """
    # Fail early instead of creating a Drive folder named after a missing ID.
    if not video_id:
        raise ValueError("video_id is required to process a transcript")

    service = init_drive_service()
    folder_id = create_folder_if_not_exists(service, video_id, parent_folder_id)
    file_name = f'{video_id}_transcript.json'

    # Check whether the transcript already exists on Drive.
    exists, file_id = check_file_exists(service, folder_id, file_name)
    if not exists:
        # Fetch the transcript from YouTube and upload it.
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['zh-TW'])
        transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
        file_id = upload_content_directly(service, file_name, folder_id, transcript_text)
        print("逐字稿已上传到Google Drive")
    else:
        # The transcript already exists; download its content.
        print("逐字稿已存在于Google Drive中")
        transcript_text = download_file_as_string(service, file_id)
        transcript = json.loads(transcript_text)

    # Attach a screenshot link to every entry that does not have one yet.
    transcript_changed = False
    for entry in transcript:
        if 'img_src' in entry:
            continue
        screenshot_path = screenshot_youtube_video(video_id, entry['start'])
        try:
            img_file_id = upload_img_directly(service, f"{video_id}_{entry['start']}.jpg", folder_id, screenshot_path)
        finally:
            # Remove the local screenshot even when the upload fails.
            os.remove(screenshot_path)
        entry['img_src'] = f"https://drive.google.com/uc?export=view&id={img_file_id}"
        transcript_changed = True

    # Rewrite the stored transcript only when screenshot links were added.
    if transcript_changed:
        updated_transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
        update_file_on_drive(service, file_id, updated_transcript_text)
        print("逐字稿已更新,包括截图链接")

    return transcript
|
| 253 |
+
|
| 254 |
+
def process_youtube_link(link):
|
| 255 |
+
# 使用 YouTube API 获取逐字稿
|
| 256 |
+
# 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
|
| 257 |
+
video_id = extract_youtube_id(link)
|
| 258 |
+
transcript = process_transcript_and_screenshots(video_id)
|
| 259 |
|
| 260 |
formatted_transcript = []
|
| 261 |
screenshot_paths = []
|
|
|
|
| 264 |
start_time = format_seconds_to_time(entry['start'])
|
| 265 |
end_time = format_seconds_to_time(entry['start'] + entry['duration'])
|
| 266 |
embed_url = get_embedded_youtube_link(video_id, entry['start'])
|
| 267 |
+
screenshot_path = entry['img_src']
|
|
|
|
| 268 |
line = {
|
| 269 |
"start_time": start_time,
|
| 270 |
"end_time": end_time,
|
|
|
|
| 280 |
print(html_content)
|
| 281 |
print("=====html_content=====")
|
| 282 |
|
| 283 |
+
# 基于逐字稿生成其他所需的输出
|
| 284 |
+
questions = generate_questions(transcript)
|
| 285 |
+
df_summarise = generate_df_summarise(transcript)
|
| 286 |
+
|
| 287 |
# 确保返回与 UI 组件预期匹配的输出
|
| 288 |
return questions[0] if len(questions) > 0 else "", \
|
| 289 |
questions[1] if len(questions) > 1 else "", \
|