tomo2chin2 committed on
Commit
b4bfd35
·
verified ·
1 Parent(s): 4224207

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -406
app.py CHANGED
@@ -7,7 +7,7 @@ import gradio as gr
7
  from fastapi import FastAPI, HTTPException
8
  from fastapi.middleware.cors import CORSMiddleware
9
  from pydantic import BaseModel, HttpUrl
10
- from typing import Optional, Union, List, Dict
11
  import requests
12
  import tempfile
13
  import os
@@ -16,7 +16,6 @@ import numpy as np
16
  from datetime import datetime
17
  import uuid
18
  from pathlib import Path
19
- from concurrent.futures import ThreadPoolExecutor, as_completed
20
 
21
  # 画像・動画処理ライブラリ
22
  from pdf2image import convert_from_path
@@ -31,11 +30,6 @@ from huggingface_hub import HfApi, login
31
  logging.basicConfig(level=logging.INFO)
32
  logger = logging.getLogger(__name__)
33
 
34
- MAX_EDUCATION_TTS_WORKERS = max(
35
- 1,
36
- int(os.getenv("EDUCATION_TTS_MAX_WORKERS", "3")),
37
- )
38
-
39
  # ==============================
40
  # リクエスト/レスポンスモデル
41
  # ==============================
@@ -80,29 +74,6 @@ class AudioVideoResponse(BaseModel):
80
  total_slides: Optional[int] = None
81
  video_duration: Optional[float] = None
82
 
83
- # ==============================
84
- # 賢杉賢太郎連携バージョン - 追加モデル
85
- # ==============================
86
-
87
- class EducationNotesItem(BaseModel):
88
- """賢杉賢太郎: notes配列要素"""
89
- slide_index: Optional[int] = None
90
- text: str
91
- speaking_rate: Optional[float] = 1.25
92
- padding_seconds: Optional[float] = None
93
-
94
- class EducationPlaybackPolicy(BaseModel):
95
- """賢杉賢太郎: 再生ポリシー"""
96
- match_audio_length: bool = True
97
- fallback_seconds_per_slide: float = 6.0
98
- padding_seconds: float = 0.6
99
-
100
- class EducationVideoRequest(BaseModel):
101
- """賢杉賢太郎連携バージョン - notesをそのまま動画化"""
102
- pdf_url: str
103
- notes: Union[str, List[Union[str, Dict]]]
104
- playback_policy: Optional[EducationPlaybackPolicy] = None
105
-
106
  # ==============================
107
  # URL前処理ユーティリティ
108
  # ==============================
@@ -286,68 +257,6 @@ def extract_audio_text_v2(slide: dict, slide_index: int, history: list) -> str:
286
  return ""
287
 
288
 
289
- def normalize_notes_payload(notes_payload: Union[str, List[Union[str, Dict]]]) -> List[dict]:
290
- """
291
- 賢杉賢太郎用notesペイロードを正規化
292
-
293
- Args:
294
- notes_payload: list もしくは JSON文字列
295
-
296
- Returns:
297
- list[dict]: slide_index / text / speaking_rate / padding_seconds を含む辞書配列
298
- """
299
- import json
300
-
301
- if isinstance(notes_payload, str):
302
- try:
303
- raw_notes = json.loads(notes_payload)
304
- except json.JSONDecodeError as exc:
305
- raise ValueError(f"notesのJSON解析に失敗しました: {exc}")
306
- else:
307
- raw_notes = notes_payload or []
308
-
309
- normalized: List[dict] = []
310
-
311
- for idx, item in enumerate(raw_notes):
312
- if isinstance(item, dict):
313
- slide_index = item.get("slide_index", idx)
314
- text = str(item.get("text", "")).strip()
315
- speaking_rate = item.get("speaking_rate", 1.25)
316
- padding = item.get("padding_seconds")
317
- else:
318
- slide_index = idx
319
- text = str(item).strip()
320
- speaking_rate = 1.25
321
- padding = None
322
-
323
- try:
324
- slide_index = int(slide_index)
325
- except (TypeError, ValueError):
326
- slide_index = idx
327
-
328
- try:
329
- speaking_rate = float(speaking_rate) if speaking_rate is not None else 1.0
330
- except (TypeError, ValueError):
331
- speaking_rate = 1.0
332
- if speaking_rate <= 0:
333
- speaking_rate = 1.0
334
-
335
- if padding is not None:
336
- try:
337
- padding = float(padding)
338
- except (TypeError, ValueError):
339
- padding = None
340
-
341
- normalized.append({
342
- "slide_index": slide_index,
343
- "text": text,
344
- "speaking_rate": speaking_rate,
345
- "padding_seconds": padding
346
- })
347
-
348
- return normalized
349
-
350
-
351
  def convert_pil_to_array(pil_image: Image.Image, target_size: tuple) -> np.ndarray:
352
  """
353
  PIL ImageをNumPy配列に変換し、指定サイズにリサイズ
@@ -375,28 +284,20 @@ def convert_pil_to_array(pil_image: Image.Image, target_size: tuple) -> np.ndarr
375
  # V2.0: Gemini TTS音声生成
376
  # ==============================
377
 
378
- def generate_audio_with_gemini(
379
- audio_text: str,
380
- gemini_token: str,
381
- model: str = "gemini-2.5-pro-preview-tts",
382
- ) -> bytes:
383
  """
384
  Gemini REST APIでテキストから音声を生成
385
 
386
  Args:
387
  audio_text: 読み上げるテキスト
388
  gemini_token: GEMINI_TOKEN環境変数
389
- model: 利用するGemini TTSモデルID
390
 
391
  Returns:
392
  WAVバイナリデータ(24kHz PCM16)
393
  """
394
  import base64
395
 
396
- url = (
397
- "https://generativelanguage.googleapis.com/v1beta/models/"
398
- f"{model}:generateContent?key={gemini_token}"
399
- )
400
 
401
  headers = {
402
  "Content-Type": "application/json"
@@ -428,10 +329,10 @@ def generate_audio_with_gemini(
428
  }
429
  }
430
 
431
- logger.info(f"Gemini TTS API呼び出し: {len(audio_text)}文字, model={model}")
432
  logger.info(f"Payload: {payload}")
433
 
434
- response = requests.post(url, json=payload, headers=headers, timeout=120)
435
 
436
  # エラーレスポンスの詳細をログ出力
437
  if response.status_code != 200:
@@ -946,254 +847,6 @@ def create_video_with_audio_from_slides_v2(
946
  except Exception as e:
947
  logger.warning(f"動画ファイル削除エラー: {e}")
948
 
949
-
950
- def create_video_with_notes(
951
- pdf_url: str,
952
- notes_payload: Union[str, List[Union[str, Dict]]],
953
- gemini_token: str,
954
- playback_policy: Optional[dict] = None,
955
- progress_callback=None
956
- ) -> tuple:
957
- """
958
- 賢杉賢太郎連携バージョン:
959
- notesフィールド(スピーカーノート)から音声付き動画を生成する。
960
-
961
- Args:
962
- pdf_url: GASが生成したPDFのURL
963
- notes_payload: notes配列(list or JSON string)
964
- gemini_token: Gemini TTS用トークン
965
- playback_policy: 再生ポリシー辞書
966
- progress_callback: Gradio用進捗更新
967
-
968
- Returns:
969
- tuple: (video_url, page2_image_url, audio_info_list, total_slides, total_duration)
970
- """
971
- pdf_path = None
972
- audio_files: List[str] = []
973
- video_path = None
974
- page2_image_path = None
975
- clips = []
976
- audio_info_list = []
977
- total_duration = 0.0
978
-
979
- policy = playback_policy or {}
980
- match_audio = bool(policy.get("match_audio_length", True))
981
- fallback_seconds = policy.get("fallback_seconds_per_slide", 6.0)
982
- if fallback_seconds is None or fallback_seconds <= 0:
983
- fallback_seconds = 6.0
984
- padding_default = policy.get("padding_seconds", 0.6)
985
- if padding_default is None or padding_default < 0:
986
- padding_default = 0.6
987
-
988
- try:
989
- normalized_notes = normalize_notes_payload(notes_payload)
990
- notes_map = {entry["slide_index"]: entry for entry in normalized_notes}
991
-
992
- if progress_callback:
993
- progress_callback(0.05, desc="PDFダウンロード中...")
994
-
995
- pdf_path = download_pdf_from_url(sanitize_url(pdf_url))
996
-
997
- if progress_callback:
998
- progress_callback(0.1, desc="PDF→画像変換中...")
999
-
1000
- images = convert_pdf_to_images(pdf_path, dpi=150)
1001
- total_slides = len(images)
1002
-
1003
- if total_slides == 0:
1004
- raise Exception("PDFにページが含まれていません")
1005
-
1006
- note_entries: List[Dict] = []
1007
- text_map: Dict[int, str] = {}
1008
- tts_results: Dict[int, Optional[bytes]] = {}
1009
-
1010
- for idx in range(total_slides):
1011
- note_entry = notes_map.get(idx, {
1012
- "slide_index": idx,
1013
- "text": "",
1014
- "speaking_rate": 1.0,
1015
- "padding_seconds": None
1016
- })
1017
- note_entries.append(note_entry)
1018
- text = str(note_entry.get("text", "")).strip()
1019
- text_map[idx] = text
1020
-
1021
- total_audio_jobs = sum(1 for text in text_map.values() if text)
1022
-
1023
- if progress_callback:
1024
- progress_callback(0.1, desc="音声生成ジョブ準備中...")
1025
-
1026
- if total_audio_jobs > 0:
1027
- max_workers = min(MAX_EDUCATION_TTS_WORKERS, total_audio_jobs)
1028
- futures = {}
1029
- completed_jobs = 0
1030
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
1031
- for idx, text in text_map.items():
1032
- if not text:
1033
- tts_results[idx] = None
1034
- continue
1035
- futures[executor.submit(
1036
- generate_audio_with_gemini,
1037
- text,
1038
- gemini_token,
1039
- model="gemini-2.5-flash-preview-tts",
1040
- )] = idx
1041
-
1042
- for future in as_completed(futures):
1043
- idx = futures[future]
1044
- try:
1045
- wav_bytes = future.result()
1046
- except Exception as exc:
1047
- logger.error(f"Gemini TTS生成失敗 (slide={idx}): {exc}")
1048
- raise
1049
- tts_results[idx] = wav_bytes
1050
- completed_jobs += 1
1051
- if progress_callback:
1052
- progress = 0.1 + (completed_jobs / total_audio_jobs) * 0.4
1053
- progress_callback(
1054
- min(progress, 0.5),
1055
- desc=f"音声生成中 ({completed_jobs}/{total_audio_jobs})"
1056
- )
1057
- else:
1058
- if progress_callback:
1059
- progress_callback(0.5, desc="音声生成スキップ(テキストなし)")
1060
-
1061
- for idx, pil_image in enumerate(images):
1062
- note_entry = note_entries[idx]
1063
- text = text_map[idx]
1064
- speaking_rate = note_entry.get("speaking_rate", 1.25) or 1.0
1065
- if speaking_rate <= 0:
1066
- speaking_rate = 1.0
1067
- padding_seconds = note_entry.get("padding_seconds")
1068
- if padding_seconds is None or padding_seconds < 0:
1069
- padding_seconds = padding_default
1070
-
1071
- audio_duration = 0.0
1072
- slide_duration = fallback_seconds
1073
- audio_url = None
1074
- audio_path = None
1075
-
1076
- if text:
1077
- wav_bytes = tts_results.get(idx)
1078
- if wav_bytes is None:
1079
- raise RuntimeError(f"TTS音声が取得できませんでした (slide_index={idx})")
1080
-
1081
- if speaking_rate and abs(speaking_rate - 1.0) > 0.01:
1082
- wav_bytes = speed_up_audio(wav_bytes, speed_factor=speaking_rate)
1083
-
1084
- audio_duration = get_audio_duration(wav_bytes)
1085
- if match_audio:
1086
- slide_duration = max(audio_duration + padding_seconds, fallback_seconds)
1087
- else:
1088
- slide_duration = fallback_seconds
1089
-
1090
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
1091
- tmp_audio.write(wav_bytes)
1092
- audio_path = tmp_audio.name
1093
- audio_files.append(audio_path)
1094
-
1095
- audio_url = save_audio_to_hf(wav_bytes, prefix=f"education_slide_{idx:02d}")
1096
-
1097
- else:
1098
- slide_duration = fallback_seconds
1099
-
1100
- if progress_callback and total_slides:
1101
- progress = 0.5 + ((idx + 1) / total_slides) * 0.2
1102
- progress_callback(
1103
- min(progress, 0.7),
1104
- desc=f"動画クリップ生成中 ({idx + 1}/{total_slides})"
1105
- )
1106
-
1107
- img_array = convert_pil_to_array(pil_image, target_size=(1280, 720))
1108
- img_clip = ImageClip(img_array, duration=slide_duration)
1109
-
1110
- if audio_path:
1111
- audio_clip = AudioFileClip(audio_path)
1112
- img_clip = img_clip.set_audio(audio_clip)
1113
-
1114
- clips.append(img_clip)
1115
-
1116
- audio_info_list.append({
1117
- "slide_index": idx,
1118
- "slide_type": "notes",
1119
- "audio_url": audio_url,
1120
- "duration": audio_duration,
1121
- "text": text,
1122
- "speaking_rate": speaking_rate,
1123
- "playback_duration": slide_duration
1124
- })
1125
-
1126
- total_duration += slide_duration
1127
-
1128
- if not clips:
1129
- raise Exception("動画クリップが生成されませんでした(notesに有効なテキストがありません)")
1130
-
1131
- if progress_callback:
1132
- progress_callback(0.7, desc="動画をレンダリング中...")
1133
-
1134
- final_video = concatenate_videoclips(clips, method="compose")
1135
- tmp_video = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
1136
- video_path = tmp_video.name
1137
- tmp_video.close()
1138
-
1139
- final_video.write_videofile(
1140
- video_path,
1141
- fps=24,
1142
- codec="libx264",
1143
- audio_codec="aac",
1144
- temp_audiofile=os.path.join(tempfile.gettempdir(), f"temp_audio_{uuid.uuid4().hex}.m4a"),
1145
- remove_temp=True,
1146
- verbose=False,
1147
- logger=None
1148
- )
1149
- final_video.close()
1150
-
1151
- for clip in clips:
1152
- clip.close()
1153
-
1154
- if progress_callback:
1155
- progress_callback(0.85, desc="動画をアップロード中...")
1156
-
1157
- video_url = video_uploader.upload_video(video_path, prefix="education_video")
1158
-
1159
- page2_image_url = None
1160
- if total_slides >= 2:
1161
- with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_img:
1162
- page2_image_path = tmp_img.name
1163
- images[1].save(page2_image_path, format="JPEG", quality=90)
1164
- page2_image_url = video_uploader.upload_image(page2_image_path, prefix="education_page2")
1165
-
1166
- if progress_callback:
1167
- progress_callback(1.0, desc="完了!")
1168
-
1169
- return (video_url, page2_image_url, audio_info_list, total_slides, total_duration)
1170
-
1171
- finally:
1172
- for audio_file in audio_files:
1173
- if os.path.exists(audio_file):
1174
- try:
1175
- os.remove(audio_file)
1176
- except Exception as e:
1177
- logger.warning(f"音声ファイル削除エラー: {e}")
1178
-
1179
- if video_path and os.path.exists(video_path):
1180
- try:
1181
- os.remove(video_path)
1182
- except Exception as e:
1183
- logger.warning(f"動画ファイル削除エラー: {e}")
1184
-
1185
- if page2_image_path and os.path.exists(page2_image_path):
1186
- try:
1187
- os.remove(page2_image_path)
1188
- except Exception as e:
1189
- logger.warning(f"画像ファイル削除エラー: {e}")
1190
-
1191
- if pdf_path and os.path.exists(pdf_path):
1192
- try:
1193
- os.remove(pdf_path)
1194
- except Exception as e:
1195
- logger.warning(f"PDFファイル削除エラー: {e}")
1196
-
1197
  # ==============================
1198
  # コア機能実装
1199
  # ==============================
@@ -1616,60 +1269,6 @@ async def slidedata_to_video(request: SlideDataToVideoRequest):
1616
  detail=f"動画生成に失敗しました: {str(e)}"
1617
  )
1618
 
1619
-
1620
- @app.post(
1621
- "/api/education/notes-to-video",
1622
- response_model=AudioVideoResponse,
1623
- tags=["Video Generation", "Education"],
1624
- summary="賢杉賢太郎: notes配列から音声付き動画を生成",
1625
- description="賢杉賢太郎連携バージョン。GASが返すPDF URLとnotes配列を渡すと、音声付き動画を生成してアップロードします。"
1626
- )
1627
- async def education_notes_to_video(request: EducationVideoRequest):
1628
- """賢杉賢太郎連携バージョン: notesフィールドを活用した動画生成エンドポイント"""
1629
- gemini_token = os.environ.get("GEMINI_TOKEN")
1630
- if not gemini_token:
1631
- raise HTTPException(
1632
- status_code=500,
1633
- detail="GEMINI_TOKEN環境変数が設定されていません"
1634
- )
1635
-
1636
- try:
1637
- logger.info("賢杉賢太郎向けAPIリクエスト受信")
1638
- playback_policy = request.playback_policy.dict() if request.playback_policy else {}
1639
-
1640
- (
1641
- video_url,
1642
- page2_image_url,
1643
- audio_info_list,
1644
- total_slides,
1645
- total_duration
1646
- ) = create_video_with_notes(
1647
- pdf_url=request.pdf_url,
1648
- notes_payload=request.notes,
1649
- gemini_token=gemini_token,
1650
- playback_policy=playback_policy
1651
- )
1652
-
1653
- logger.info(f"賢杉賢太郎向け動画生成完了: {video_url}")
1654
-
1655
- return AudioVideoResponse(
1656
- status="success",
1657
- video_url=video_url,
1658
- page2_image_url=page2_image_url,
1659
- audio_urls=audio_info_list,
1660
- message="賢杉賢太郎用の音声付き動画の生成とアップロードに成功しました",
1661
- total_slides=total_slides,
1662
- video_duration=total_duration
1663
- )
1664
-
1665
- except HTTPException:
1666
- raise
1667
- except Exception as e:
1668
- logger.error(f"賢杉賢太郎向け動画生成エラー: {e}", exc_info=True)
1669
- raise HTTPException(
1670
- status_code=500,
1671
- detail=f"賢杉賢太郎向け動画生成に失敗しました: {str(e)}"
1672
- )
1673
  @app.get("/health")
1674
  async def health_check():
1675
  """ヘルスチェックエンドポイント"""
 
7
  from fastapi import FastAPI, HTTPException
8
  from fastapi.middleware.cors import CORSMiddleware
9
  from pydantic import BaseModel, HttpUrl
10
+ from typing import Optional, Union
11
  import requests
12
  import tempfile
13
  import os
 
16
  from datetime import datetime
17
  import uuid
18
  from pathlib import Path
 
19
 
20
  # 画像・動画処理ライブラリ
21
  from pdf2image import convert_from_path
 
30
  logging.basicConfig(level=logging.INFO)
31
  logger = logging.getLogger(__name__)
32
 
 
 
 
 
 
33
  # ==============================
34
  # リクエスト/レスポンスモデル
35
  # ==============================
 
74
  total_slides: Optional[int] = None
75
  video_duration: Optional[float] = None
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  # ==============================
78
  # URL前処理ユーティリティ
79
  # ==============================
 
257
  return ""
258
 
259
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  def convert_pil_to_array(pil_image: Image.Image, target_size: tuple) -> np.ndarray:
261
  """
262
  PIL ImageをNumPy配列に変換し、指定サイズにリサイズ
 
284
  # V2.0: Gemini TTS音声生成
285
  # ==============================
286
 
287
+ def generate_audio_with_gemini(audio_text: str, gemini_token: str) -> bytes:
 
 
 
 
288
  """
289
  Gemini REST APIでテキストから音声を生成
290
 
291
  Args:
292
  audio_text: 読み上げるテキスト
293
  gemini_token: GEMINI_TOKEN環境変数
 
294
 
295
  Returns:
296
  WAVバイナリデータ(24kHz PCM16)
297
  """
298
  import base64
299
 
300
+ url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro-preview-tts:generateContent?key={gemini_token}"
 
 
 
301
 
302
  headers = {
303
  "Content-Type": "application/json"
 
329
  }
330
  }
331
 
332
+ logger.info(f"Gemini TTS API呼び出し: {len(audio_text)}文字")
333
  logger.info(f"Payload: {payload}")
334
 
335
+ response = requests.post(url, json=payload, headers=headers, timeout=60)
336
 
337
  # エラーレスポンスの詳細をログ出力
338
  if response.status_code != 200:
 
847
  except Exception as e:
848
  logger.warning(f"動画ファイル削除エラー: {e}")
849
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
850
  # ==============================
851
  # コア機能実装
852
  # ==============================
 
1269
  detail=f"動画生成に失敗しました: {str(e)}"
1270
  )
1271
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1272
  @app.get("/health")
1273
  async def health_check():
1274
  """ヘルスチェックエンドポイント"""