Spaces:
Paused
Paused
Upload app.py
Browse files
app.py
CHANGED
|
@@ -52,7 +52,9 @@ class VideoResponse(BaseModel):
|
|
| 52 |
# V2.0: スライドデータ→音声付き動画変換モデル
|
| 53 |
class SlideDataToVideoRequest(BaseModel):
|
| 54 |
"""スライドデータ→音声付き動画変換リクエスト"""
|
| 55 |
-
slide_data: list
|
|
|
|
|
|
|
| 56 |
|
| 57 |
class AudioInfo(BaseModel):
|
| 58 |
"""音声情報"""
|
|
@@ -109,16 +111,17 @@ def sanitize_url(url: str) -> str:
|
|
| 109 |
|
| 110 |
def clean_mnemonic(text: str) -> str:
|
| 111 |
"""
|
| 112 |
-
語呂合わせから(数字)パターンを除去
|
| 113 |
|
| 114 |
Args:
|
| 115 |
-
text: 語呂合わせテキスト(例: "いい国つくろう鎌倉幕府(1192)")
|
| 116 |
|
| 117 |
Returns:
|
| 118 |
-
str: 数字を除去したテキスト(例: "いい国つくろう鎌倉幕府")
|
| 119 |
"""
|
| 120 |
import re
|
| 121 |
-
|
|
|
|
| 122 |
return cleaned
|
| 123 |
|
| 124 |
|
|
@@ -200,6 +203,69 @@ def extract_audio_text(slide: dict) -> str:
|
|
| 200 |
else:
|
| 201 |
return ""
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
# ==============================
|
| 204 |
# V2.0: Gemini TTS音声生成
|
| 205 |
# ==============================
|
|
@@ -557,6 +623,189 @@ def create_video_with_audio_from_slides(
|
|
| 557 |
except Exception as e:
|
| 558 |
logger.warning(f"動画ファイル削除エラー: {e}")
|
| 559 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 560 |
# ==============================
|
| 561 |
# コア機能実装
|
| 562 |
# ==============================
|
|
@@ -946,9 +1195,11 @@ async def slidedata_to_video(request: SlideDataToVideoRequest):
|
|
| 946 |
try:
|
| 947 |
logger.info(f"API リクエスト受信: {len(request.slide_data)}スライド")
|
| 948 |
|
| 949 |
-
# 動画生成
|
| 950 |
-
video_url, page2_image_url, audio_info_list =
|
| 951 |
slide_data=request.slide_data,
|
|
|
|
|
|
|
| 952 |
gemini_token=gemini_token
|
| 953 |
)
|
| 954 |
|
|
|
|
| 52 |
# V2.0: スライドデータ→音声付き動画変換モデル
|
| 53 |
class SlideDataToVideoRequest(BaseModel):
    """Request body for converting slide data into a narrated video."""
    slide_data: list  # structure of the 14 slides
    pdf_url: str  # URL of the PDF generated by GAS
    history: list  # the 6 source events (year / mnemonic / summary)
|
| 58 |
|
| 59 |
class AudioInfo(BaseModel):
|
| 60 |
"""音声情報"""
|
|
|
|
| 111 |
|
| 112 |
def clean_mnemonic(text: str) -> str:
    """
    Strip parenthesised digit groups from a mnemonic phrase.

    Both ASCII parentheses and full-width parentheses (U+FF08 / U+FF09)
    are recognised, and the two kinds may be mixed within one group.
    Parentheses that enclose anything other than digits are left alone.

    Args:
        text: Mnemonic text, e.g. "いい国つくろう鎌倉幕府(1192)" or
            "兄(2)さん(3)ク(9)イーン".

    Returns:
        str: The text with every "(digits)" group removed, e.g.
        "いい国つくろう鎌倉幕府" or "兄さんクイーン".
    """
    import re

    # The full-width brackets are spelled as \uXXXX escapes on purpose:
    # written literally they are easy to lose to width normalisation, which
    # leaves a class that only matches the ASCII bracket.
    return re.sub(r'[(\uFF08]\d+[)\uFF09]', '', text)
|
| 126 |
|
| 127 |
|
|
|
|
| 203 |
else:
|
| 204 |
return ""
|
| 205 |
|
| 206 |
+
|
| 207 |
+
def extract_audio_text_v2(slide: dict, slide_index: int, history: list) -> str:
    """
    Pick the narration text for one slide, using ``history`` for event slides.

    The slide type comes from ``determine_slide_type(slide)``:

    * ``"title"``   -> the slide's ``title`` value.
    * ``"closing"`` -> the slide's ``notes`` value, or a fixed closing
      sentence when the key is absent.
    * ``"imageText_image_only"`` -> "<year>年、<mnemonic>。" spoken twice,
      with digit groups stripped from the mnemonic by ``clean_mnemonic``.
    * ``"imageText_with_text"``  -> the event's ``summary``.
    * anything else -> empty string.

    Args:
        slide: Slide data dictionary.
        slide_index: Zero-based position of the slide in the deck. Event
            slides are expected at 1..12: odd indices are image-only
            slides, even indices are image-with-text slides.
        history: Source events; each entry is read with ``.get`` for the
            keys ``year``, ``mnemonic`` and ``summary``.

    Returns:
        str: Text to be read aloud (may be empty).

    Raises:
        IndexError: If an event slide maps to no entry in ``history``.
    """
    slide_type = determine_slide_type(slide)

    if slide_type == "title":
        return slide.get("title", "")

    if slide_type == "closing":
        return slide.get("notes", "本日の学習は以上です。復習を忘れずに。")

    # slide_index 1, 3, 5, ... -> history[0, 1, 2, ...] (image only)
    # slide_index 2, 4, 6, ... -> history[0, 1, 2, ...] (image with text)
    if slide_type == "imageText_image_only":
        first_index = 1
    elif slide_type == "imageText_with_text":
        first_index = 2
    else:
        return ""

    history_index = (slide_index - first_index) // 2

    # Check the range explicitly: a negative index (e.g. slide_index 0) would
    # otherwise wrap around and silently narrate the LAST event.
    if not 0 <= history_index < len(history):
        raise IndexError(
            f"スライド {slide_index} に対応するhistoryがありません "
            f"(history_index={history_index}, 件数={len(history)})"
        )

    event = history[history_index]

    if slide_type == "imageText_image_only":
        year = str(event.get("year", ""))
        mnemonic = clean_mnemonic(str(event.get("mnemonic", "")))
        return f"{year}年、{mnemonic}。{year}年、{mnemonic}。"

    return str(event.get("summary", ""))
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
def convert_pil_to_array(pil_image: Image.Image, target_size: tuple) -> np.ndarray:
    """
    Resize a PIL image to ``target_size`` and return it as an RGB NumPy array.

    The image is stretched to exactly ``target_size``; the aspect ratio is
    NOT preserved, so callers should pass a size matching the source ratio.

    Args:
        pil_image: Source PIL image, in any mode.
        target_size: ``(width, height)``, e.g. ``(1280, 720)``.

    Returns:
        numpy array (RGB) built from the resized image.
    """
    # Convert to RGB *before* resizing: Pillow forces nearest-neighbour
    # resampling for mode "1" and "P" images, so resizing first would ignore
    # the LANCZOS filter for those inputs.
    if pil_image.mode != 'RGB':
        pil_image = pil_image.convert('RGB')

    resized = pil_image.resize(target_size, Image.Resampling.LANCZOS)

    return np.array(resized)
|
| 268 |
+
|
| 269 |
# ==============================
|
| 270 |
# V2.0: Gemini TTS音声生成
|
| 271 |
# ==============================
|
|
|
|
| 623 |
except Exception as e:
|
| 624 |
logger.warning(f"動画ファイル削除エラー: {e}")
|
| 625 |
|
| 626 |
+
|
| 627 |
+
def create_video_with_audio_from_slides_v2(
    slide_data: list,
    pdf_url: str,
    history: list,
    gemini_token: str,
    progress_callback=None
) -> tuple:
    """
    Build a narrated video from the PDF page images plus slide/history data.

    Steps: download the PDF, render its pages to images, generate one
    narration per slide (sped up 1.25x), pair each page image with its
    narration as a clip, concatenate and encode the clips, then upload the
    video and the second page image.

    Slides whose narration text is empty are skipped and therefore do not
    appear in the video or in the returned audio info.

    Args:
        slide_data: Slide data list; must have one entry per PDF page.
        pdf_url: URL of the PDF generated by GAS.
        history: Source events (year / mnemonic / summary) used for narration.
        gemini_token: Token passed to ``generate_audio_with_gemini``.
        progress_callback: Optional callable invoked as
            ``progress_callback(fraction, desc=...)`` (used for Gradio).

    Returns:
        tuple: ``(video_url, page2_image_url, audio_info_list)``.
        ``page2_image_url`` is ``None`` when the PDF has fewer than 2 pages.

    Raises:
        Exception: If the page count differs from ``len(slide_data)`` or no
            clip could be produced. Errors from the helpers propagate as-is.
    """
    pdf_path = None
    video_path = None
    audio_files = []
    clips = []
    audio_info_list = []
    # Every moviepy object that may hold an open reader/process. Tracked
    # separately so they are released even when a later step fails.
    open_clips = []

    def _release_clips():
        """Close every tracked clip exactly once, newest first."""
        while open_clips:
            clip = open_clips.pop()
            try:
                clip.close()
            except Exception as e:
                logger.warning(f"クリップ解放エラー: {e}")

    try:
        # 1. Download the PDF
        if progress_callback:
            progress_callback(0.05, desc="PDFダウンロード中...")

        pdf_path = download_pdf_from_url(sanitize_url(pdf_url))

        # 2. PDF -> images (one image per page)
        if progress_callback:
            progress_callback(0.1, desc="PDF→画像変換中...")

        images = convert_pdf_to_images(pdf_path, dpi=150)

        # The page images and the slide data are zipped below, so they must
        # line up one-to-one.
        if len(images) != len(slide_data):
            raise Exception(f"画像枚数とスライドデータが不一致: {len(images)}枚 vs {len(slide_data)}枚")

        total_slides = len(slide_data)
        logger.info(f"PDF変換完了: {total_slides}枚の画像を取得")

        # 3. Per slide: generate narration and build a video clip
        for idx, (slide, pil_image) in enumerate(zip(slide_data, images)):
            if progress_callback:
                progress_callback(0.1 + (idx / total_slides) * 0.5, desc=f"音声生成中 ({idx+1}/{total_slides})")

            logger.info(f"スライド {idx+1}/{total_slides} 処理中...")

            # Narration text (event slides read it from history)
            audio_text = extract_audio_text_v2(slide, idx, history)

            if not audio_text:
                logger.warning(f"スライド {idx+1}: 音声テキストが空です")
                continue

            # Synthesize, then speed up to 1.25x
            wav_bytes = generate_audio_with_gemini(audio_text, gemini_token)
            wav_bytes = speed_up_audio(wav_bytes, speed_factor=1.25)

            # Duration is measured AFTER the speed-up
            audio_duration = get_audio_duration(wav_bytes)

            # Slide stays on screen for the narration plus a 0.6 s pause
            slide_duration = audio_duration + 0.6

            # moviepy reads audio from a path, so persist the bytes
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
                tmp_audio.write(wav_bytes)
                audio_path = tmp_audio.name
            audio_files.append(audio_path)

            # Upload the narration to HF
            slide_type = determine_slide_type(slide)
            audio_url = save_audio_to_hf(wav_bytes, prefix=f"slide_{idx:02d}_{slide_type}")

            audio_info_list.append({
                "slide_index": idx,
                "slide_type": slide_type,
                "audio_url": audio_url,
                "duration": audio_duration,
                "text": audio_text
            })

            # PIL image -> NumPy array (720p)
            img_array = convert_pil_to_array(pil_image, target_size=(1280, 720))

            # Build the moviepy clip and attach the narration
            img_clip = ImageClip(img_array, duration=slide_duration)
            audio_clip = AudioFileClip(audio_path)
            open_clips.append(audio_clip)

            video_clip = img_clip.set_audio(audio_clip)
            open_clips.append(video_clip)
            clips.append(video_clip)

            logger.info(f"スライド {idx+1}: 音声{audio_duration:.2f}秒, 再生時間{slide_duration:.2f}秒")

        if not clips:
            raise Exception("動画クリップが生成されませんでした")

        if progress_callback:
            progress_callback(0.7, desc="動画を結合中...")

        # 4. Concatenate all clips
        final_video = concatenate_videoclips(clips, method="compose")
        open_clips.append(final_video)

        # Reserve a temp path for the encoded video
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_video:
            video_path = tmp_video.name

        if progress_callback:
            progress_callback(0.8, desc="動画をエンコード中...")

        # 5. Encode
        final_video.write_videofile(
            video_path,
            fps=30,
            codec='libx264',
            audio_codec='aac',
            logger=None  # silence moviepy's own progress logging
        )

        # Release the clips as soon as encoding is done, before uploading
        _release_clips()

        if progress_callback:
            progress_callback(0.9, desc="動画をアップロード中...")

        # 6. Upload the video to HF
        video_url = video_uploader.upload_video(video_path, prefix="slidedata_video_v2")

        # 7. Upload the second page image
        page2_image_url = None
        if len(images) >= 2:
            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_img:
                page2_image_path = tmp_img.name
            try:
                images[1].save(page2_image_path, format='JPEG', quality=90)
                page2_image_url = video_uploader.upload_image(page2_image_path, prefix="slidedata_page2")
            finally:
                # Remove the temp image even when save/upload fails
                if os.path.exists(page2_image_path):
                    try:
                        os.remove(page2_image_path)
                    except Exception as e:
                        logger.warning(f"2ページ目画像削除エラー: {e}")

        if progress_callback:
            progress_callback(1.0, desc="完了!")

        logger.info(f"動画生成完了: {video_url}")

        return (video_url, page2_image_url, audio_info_list)

    finally:
        # Close any clip still open (failure path) before deleting the files
        # those clips read from.
        _release_clips()

        # Temp file cleanup is best-effort: failures are logged, not raised
        if pdf_path and os.path.exists(pdf_path):
            try:
                os.remove(pdf_path)
            except Exception as e:
                logger.warning(f"PDFファイル削除エラー: {e}")

        for audio_file in audio_files:
            if os.path.exists(audio_file):
                try:
                    os.remove(audio_file)
                except Exception as e:
                    logger.warning(f"音声ファイル削除エラー: {e}")

        if video_path and os.path.exists(video_path):
            try:
                os.remove(video_path)
            except Exception as e:
                logger.warning(f"動画ファイル削除エラー: {e}")
|
| 808 |
+
|
| 809 |
# ==============================
|
| 810 |
# コア機能実装
|
| 811 |
# ==============================
|
|
|
|
| 1195 |
try:
|
| 1196 |
logger.info(f"API リクエスト受信: {len(request.slide_data)}スライド")
|
| 1197 |
|
| 1198 |
+
# 動画生成(V2.0完全版)
|
| 1199 |
+
video_url, page2_image_url, audio_info_list = create_video_with_audio_from_slides_v2(
|
| 1200 |
slide_data=request.slide_data,
|
| 1201 |
+
pdf_url=request.pdf_url,
|
| 1202 |
+
history=request.history,
|
| 1203 |
gemini_token=gemini_token
|
| 1204 |
)
|
| 1205 |
|