Corin1998 commited on
Commit
86e8ae4
·
verified ·
1 Parent(s): 62a7fcf

Upload 9 files

Browse files
Files changed (9) hide show
  1. README.md +26 -11
  2. app.py +144 -0
  3. generator.py +67 -0
  4. packages.txt +1 -0
  5. prompts.py +33 -0
  6. requirements.txt +6 -0
  7. tts_subtitles.py +42 -0
  8. ui.py +114 -0
  9. video.py +111 -0
README.md CHANGED
@@ -1,12 +1,27 @@
1
- ---
2
- title: Multimodal Elements To Assets
3
- emoji: 🐢
4
- colorFrom: yellow
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 5.44.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # マルチモーダル “要素→素材” ジェネレーター(Hugging Face Space)
 
 
 
 
 
 
 
 
 
2
 
3
+ 製品名・訴求点・ターゲットを入力すると、
4
+ - 見出し
5
+ - 短尺動画用台本
6
+ - サムネ文言
7
+ - OGP文言(title/description/alt)
8
+ を **OpenAI LLM** で構造化生成します。さらに **OpenAI TTS → Whisper(SRT) → ffmpeg** で **縦型ショート動画** を自動合成します。
9
+
10
+ ## 🚀 セットアップ(Hugging Face Spaces)
11
+ 1. 新規 Space を作成(Python)。
12
+ 2. このリポジトリのファイルをアップロード。
13
+ 3. **Secrets** に以下を追加:
14
+ - `OPENAI_API_KEY`: OpenAIのAPIキー
15
+ - (任意)`OPENAI_LLM_MODEL` デフォルト `gpt-4o-mini`
16
+ - (任意)`OPENAI_TTS_MODEL` デフォルト `tts-1`
17
+ - (任意)`OPENAI_WHISPER_MODEL` デフォルト `whisper-1`
18
+ - (任意)`HUGGINGFACE_TOKEN`(pyannote利用時)
19
+ 4. `packages.txt` により ffmpeg が自動導入されます。
20
+ 5. Space を起動。UIから入力し「生成する」。
21
+
22
+ ## 🧩 ローカル実行
23
+ ```bash
24
+ python -m venv .venv && source .venv/bin/activate
25
+ pip install -r requirements.txt
26
+ cp .env.example .env # APIキーを設定
27
+ python app.py
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tempfile
4
+ import gradio as gr
5
+ from generator import generate_marketing_assets
6
+ from tts_subtitles import synthesize_tts_openai, transcribe_to_srt_openai
7
+ from video import compose_video_with_subtitles, make_background_image
8
+ from PIL import Image
9
+
10
# Model names are overridable through the environment (Spaces Secrets);
# the defaults match the README.
DEFAULT_MODEL = os.getenv("OPENAI_LLM_MODEL", "gpt-4o-mini")
DEFAULT_TTS_MODEL = os.getenv("OPENAI_TTS_MODEL", "tts-1")
DEFAULT_WHISPER_MODEL = os.getenv("OPENAI_WHISPER_MODEL", "whisper-1")

# Narration voice ids offered in the UI dropdown (OpenAI TTS voices).
VOICE_CHOICES = ["alloy", "verse", "breeze", "bright", "calm"]

# Pre-filled sample shown in the gr.Examples widget.
EXAMPLE = {
    "product_name": "FLDapp HbA1c測定",
    "value_props": "10秒で測定/非接触・非侵襲/酸素飽和度と脈拍の同時測定",
    "target": "30-50代の健康意識が高いビジネスパーソン",
    "tone": "信頼性とスピード感",
    "lang": "ja",
    "seconds": 30,
    "voice": "alloy",
}
25
+
26
def run_pipeline(product_name, value_props, target, tone, lang, seconds, voice):
    """Generate marketing assets and compose a vertical short video.

    Pipeline: LLM (structured JSON) -> TTS narration -> Whisper SRT captions
    -> background image -> ffmpeg composition -> OGP image.

    Returns a tuple matching the Gradio outputs:
    (headline, video_script, thumbnail_text, ogp_json_str,
     audio_path, srt_path, video_path, ogp_image_path)

    Raises:
        gr.Error: if OPENAI_API_KEY is not configured.
    """
    if not os.getenv("OPENAI_API_KEY"):
        raise gr.Error("OPENAI_API_KEY が設定されていません(SpacesのSecretsに追加)。")

    # 1) Generate the structured asset bundle as JSON via the LLM.
    assets = generate_marketing_assets(
        product_name=product_name,
        value_props=value_props,
        target=target,
        tone=tone,
        lang=lang,
        seconds=int(seconds),
        model=DEFAULT_MODEL,
    )

    headline = assets["headline"]
    video_script = assets["video_script"]
    thumbnail_text = assets["thumbnail_text"]
    ogp = assets["ogp"]

    # BUG FIX: the original wrapped everything in
    # `with tempfile.TemporaryDirectory()`, which deletes every generated file
    # the moment this function returns -- before Gradio can serve the returned
    # paths to the browser. Use a persistent per-run directory instead.
    td = tempfile.mkdtemp(prefix="assets_")

    # 2) TTS narration -> 3) Whisper subtitles (SRT).
    audio_path = os.path.join(td, "narration.mp3")
    synthesize_tts_openai(
        text=video_script,
        out_path=audio_path,
        voice=voice,
        model=DEFAULT_TTS_MODEL,
        format="mp3",
    )

    srt_path = os.path.join(td, "captions.srt")
    srt_text = transcribe_to_srt_openai(
        audio_path=audio_path,
        model=DEFAULT_WHISPER_MODEL,
    )
    with open(srt_path, "w", encoding="utf-8") as f:
        f.write(srt_text)

    # 4) Background image (also reused for the thumbnail / OGP image).
    bg_path = os.path.join(td, "bg.png")
    make_background_image(
        out_path=bg_path,
        title=headline,
        subtitle=thumbnail_text,
        lang=lang,
    )

    # 5) Compose the vertical (1080x1920) short video with ffmpeg.
    video_out = os.path.join(td, "short.mp4")
    compose_video_with_subtitles(
        image_path=bg_path,
        audio_path=audio_path,
        srt_path=srt_path,
        out_path=video_out,
        width=1080,
        height=1920,
    )

    # 6) OGP image (landscape 1200x630), derived from the background.
    ogp_img_path = os.path.join(td, "ogp.png")
    img = Image.open(bg_path).convert("RGB").resize((1200, 630))
    img.save(ogp_img_path)

    return (
        headline,
        video_script,
        thumbnail_text,
        json.dumps(ogp, ensure_ascii=False, indent=2),
        audio_path,
        srt_path,
        video_out,
        ogp_img_path,
    )
100
+
101
# ---------------------------------------------------------------------------
# Gradio UI wiring: inputs on the left, generated assets on the right.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎬 マルチモーダル要素→素材ジェネレーター
    入力(製品名 / 訴求点 / ターゲット)から **見出し・短尺動画用台本・サムネ文言・OGP文言** を生成し、
    さらに **TTS音声 + Whisper字幕 + ffmpeg** で **縦型ショート動画** を自動合成します。
    """)

    with gr.Row():
        with gr.Column():
            # Input controls.
            product_name = gr.Textbox(label="製品名")
            value_props = gr.Textbox(label="訴求点(カンマ区切り推奨)")
            target = gr.Textbox(label="ターゲット")
            tone = gr.Textbox(label="トーン(例:信頼性/スピード感/ワクワク)", value="信頼性とスピード感")
            lang = gr.Dropdown(["ja", "en"], value="ja", label="言語")
            seconds = gr.Slider(8, 45, value=20, step=1, label="動画尺(秒)")
            voice = gr.Dropdown(VOICE_CHOICES, value="alloy", label="ナレーション音声(OpenAI TTS)")
            run_btn = gr.Button("生成する", variant="primary")

        with gr.Column():
            # Generated outputs.
            headline = gr.Textbox(label="見出し/ヘッドライン")
            video_script = gr.Textbox(label="短尺動画 用 台本", lines=10)
            thumbnail_text = gr.Textbox(label="サムネ文言")
            ogp_json = gr.Code(label="OGP JSON(title/description/alt)", language="json")
            audio = gr.File(label="音声(mp3)")
            srt = gr.File(label="字幕(srt)")
            video = gr.File(label="動画(mp4)")
            ogp_img = gr.File(label="OGP画像(png)")

    # Shared wiring reused by both the Examples widget and the run button.
    pipeline_inputs = [product_name, value_props, target, tone, lang, seconds, voice]
    pipeline_outputs = [headline, video_script, thumbnail_text, ogp_json, audio, srt, video, ogp_img]

    gr.Examples(
        examples=[
            [EXAMPLE[k] for k in ("product_name", "value_props", "target", "tone", "lang", "seconds", "voice")],
        ],
        inputs=pipeline_inputs,
    )

    run_btn.click(
        fn=run_pipeline,
        inputs=pipeline_inputs,
        outputs=pipeline_outputs,
        api_name="generate",
    )

if __name__ == "__main__":
    demo.launch()
generator.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import json
import re
import requests
# BUG FIX: the original imported the misspelled name "build_strctured_prompt",
# which prompts.py never defines, so this module failed at import time.
from prompts import build_structured_prompt

# Backward-compatible alias in case any caller still uses the old misspelling.
build_strctured_prompt = build_structured_prompt

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")

# BUG FIX: the Authorization value was "Bearer{key}" -- the HTTP bearer scheme
# requires a space between "Bearer" and the token.
HEADERS = {
    "Authorization": f"Bearer {OPENAI_API_KEY}",
    "Content-Type": "application/json",
}

# System prompt pinning the model to strict-JSON output.
# (Typo fixes in the prompt text: "Japanses" -> "Japanese"; missing space.)
SYSTEM = (
    "You are a bilingual Japanese/English marketing copywriter. "
    "Return results ONLY as strict JSON with UTF-8, never prose."
)
19
+
20
def _post_chat(messages, model: str, response_format_json: bool = True):
    """POST to the OpenAI chat-completions endpoint and return the reply text.

    Args:
        messages: chat messages in OpenAI format ({"role", "content"} dicts).
        model: model identifier, e.g. "gpt-4o-mini".
        response_format_json: if True, request strict JSON output mode.

    Returns:
        The assistant message content (str).

    Raises:
        requests.HTTPError: on a non-2xx API response.
    """
    url = f"{OPENAI_BASE_URL}/chat/completions"
    payload = {
        "model": model,
        # BUG FIX: the key was misspelled "messsages", so the request carried
        # no messages at all and the API rejected it.
        "messages": messages,
        "temperature": 0.7,
    }
    if response_format_json:
        payload["response_format"] = {"type": "json_object"}
    r = requests.post(url, headers=HEADERS, json=payload, timeout=120)
    r.raise_for_status()
    data = r.json()
    return data["choices"][0]["message"]["content"]
33
+
34
+ def _safe_json_extract(text: str):
35
+ try:
36
+ return json.loads(text)
37
+ except Exception:
38
+ m = re.search(r"\{[\s\S]*\}",text)
39
+ if m:
40
+ return json.loads(m.group(0))
41
+ raise
42
+
43
def generate_marketing_assets(product_name: str, value_props: str, target: str, tone: str, lang: str, seconds: int, model: str):
    """Generate the structured marketing-asset bundle via the LLM.

    Returns a dict with keys "headline", "video_script", "thumbnail_text" and
    "ogp" (itself a dict whose "title"/"description"/"alt" are defaulted to ""
    when the model omits them).

    Raises:
        ValueError: if the LLM response is missing a required top-level key.
    """
    # Local import with the correct spelling; the original module-level import
    # used a misspelled name ("build_strctured_prompt") that prompts.py never
    # defined.
    from prompts import build_structured_prompt

    user_prompt = build_structured_prompt(
        product_name=product_name,
        value_props=value_props,
        target=target,
        tone=tone,
        lang=lang,
        seconds=seconds,
    )

    messages = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": user_prompt},
    ]

    raw = _post_chat(messages, model=model, response_format_json=True)
    data = _safe_json_extract(raw)

    required = ["headline", "video_script", "thumbnail_text", "ogp"]
    for k in required:
        if k not in data:
            raise ValueError(f"LLM JSON missing key:{k}")

    # BUG FIX: the defaults list said "little" instead of "title", so a
    # missing OGP title was never filled in.
    for k in ["title", "description", "alt"]:
        data["ogp"].setdefault(k, "")
    return data
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
prompts.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from textwrap import dedent

def build_structured_prompt(product_name: str, value_props: str, target: str, tone: str, lang: str, seconds: int) -> str:
    """Build the user prompt that constrains the LLM to a strict-JSON reply.

    Embeds the product inputs and the requested video length, and spells out
    the exact JSON schema the model must return.
    """
    template = f"""
    You are a top-tier performance marketer and short-form video copywriter.
    Language: {lang}

    Inputs:
    - Product: {product_name}
    - Value Props (comma-separated): {value_props}
    - Target: {target}
    - Tone: {tone}
    - Target video length: {seconds} seconds

    Task:
    1) Craft a high-converting headline.
    2) Write a short-form video script designed for {seconds}s. Use 3–5 concise scenes, strong hook in the first 2s. Keep narrator-friendly phrasing.
    3) Produce a punchy thumbnail text (<= 14 chars in ja / <= 6 words in en).
    4) Provide OGP fields.

    Return STRICT JSON with the following schema ONLY (no explanations):
    {{
      "headline": string,
      "video_script": string,  # multi-line; include scene markers like [HOOK], [SCENE2], [CTA]
      "thumbnail_text": string,
      "ogp": {{
        "title": string,
        "description": string,
        "alt": string
      }}
    }}
    """
    return dedent(template)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio>=4.36.0
2
+ requests>=2.31.0
3
+ Pillow>=10.3.0
4
+ python-dotenv>=1.0.1
5
+ # Optional (heavy):
6
+ # pyannote.audio>=3.1.0
tts_subtitles.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import requests

# OpenAI credentials / endpoint, read from the environment at import time.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")

# Bearer-token header shared by both endpoints in this module.
HEADERS_AUTH = {"Authorization": f"Bearer {OPENAI_API_KEY}"}
8
+
9
def synthesize_tts_openai(text: str, out_path: str, voice: str = "alloy", model: str = "tts-1", format: str = "mp3"):
    """Synthesize *text* to an audio file with the OpenAI TTS endpoint.

    Args:
        text: narration text to speak.
        out_path: destination path for the audio bytes.
        voice: OpenAI TTS voice id (e.g. "alloy").
        model: TTS model name.
        format: audio container ("mp3", "wav", ...). The parameter name is
            kept for backward compatibility even though it shadows the builtin.

    Returns:
        out_path, for convenient chaining.

    Raises:
        requests.HTTPError: on a non-2xx API response.
    """
    url = f"{OPENAI_BASE_URL}/audio/speech"
    payload = {
        "model": model,
        "voice": voice,
        "input": text,
        # BUG FIX: the OpenAI speech API parameter is "response_format",
        # not "format" -- the original key was not honored by the endpoint.
        "response_format": format,
    }
    headers = {**HEADERS_AUTH, "Content-Type": "application/json"}
    r = requests.post(url, headers=headers, json=payload, timeout=300)
    r.raise_for_status()
    with open(out_path, "wb") as f:
        f.write(r.content)
    return out_path
23
+
24
def transcribe_to_srt_openai(audio_path: str, model: str = "whisper-1") -> str:
    """Transcribe an audio file to SRT subtitle text via the OpenAI
    transcriptions (Whisper) endpoint.

    Returns the raw SRT document as a string.

    Raises:
        requests.HTTPError: on a non-2xx API response.
    """
    url = f"{OPENAI_BASE_URL}/audio/transcriptions"
    form_fields = {"model": model, "response_format": "srt"}
    with open(audio_path, "rb") as audio_file:
        response = requests.post(
            url,
            headers=HEADERS_AUTH,
            files={"file": audio_file},
            data=form_fields,
            timeout=600,
        )
    response.raise_for_status()
    return response.text
33
+
34
+ # --- Optional: pyannote による VAD/クリーンアップ(重いモデルのためデフォルト無効) ---
35
+ # from pyannote.audio import Pipeline
36
+ # def refine_srt_with_pyannote(audio_path: str, srt_text: str) -> str:
37
+ # token = os.getenv("HUGGINGFACE_TOKEN")
38
+ # if not token:
39
+ # return srt_text
40
+ # pipeline = Pipeline.from_pretrained("pyannote/segmentation", use_auth_token=token)
41
+ # # ここで音声区間検出→SRTのタイミングを補正する処理を実装(省略)
42
+ # return srt_text
ui.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import httpx
3
+ import json
4
+
5
def build_ui(fastapi_app):
    """Build the Gradio UI that drives the FastAPI endpoints of *fastapi_app*.

    BUG FIX: the original created ``httpx.AsyncClient()`` with no base_url and
    then requested relative paths like ``/api/...`` -- httpx rejects that with
    ``UnsupportedProtocol`` (a relative URL cannot be resolved without a
    base_url), and the ``fastapi_app`` argument was never used.  Requests are
    now routed straight to the in-process ASGI app via ``httpx.ASGITransport``,
    so no real socket is involved.
    """
    def _client() -> httpx.AsyncClient:
        # Fresh in-process client per handler call; the base_url host is a
        # dummy, required only so httpx can resolve the relative paths.
        return httpx.AsyncClient(
            transport=httpx.ASGITransport(app=fastapi_app),
            base_url="http://app.internal",
        )

    def _csv_to_keywords(keywords_csv: str) -> list:
        # "a, b ,c" -> ["a", "b", "c"]; empty items are dropped.
        return [k.strip() for k in keywords_csv.split(",") if k.strip()]

    async def do_summarize(platforms, keywords_csv, brand, limit, language):
        """Fetch recent items and an LLM summary for the given filters."""
        async with _client() as client:
            r = await client.post("/api/summarize_trends", json={
                "platforms": platforms, "keywords": _csv_to_keywords(keywords_csv),
                "brand": brand, "limit": int(limit), "language": language,
            })
        r.raise_for_status()
        data = r.json()
        return json.dumps(data["items"], ensure_ascii=False, indent=2), data["summary"]

    async def do_generate_plan(brand, language, platforms, keywords_csv, start_date, tone, cta, image_style_hint):
        """Generate a one-week post plan and persist it as drafts."""
        async with _client() as client:
            r = await client.post("/api/generate_week_plan", json={
                "brand": brand, "language": language, "platforms": platforms,
                "keywords": _csv_to_keywords(keywords_csv),
                "start_date": (start_date or None),
                "tone": tone, "cta": cta, "image_style_hint": image_style_hint,
            })
        r.raise_for_status()
        return json.dumps(r.json(), ensure_ascii=False, indent=2)

    async def do_list_calendar():
        """Return the scheduling calendar as pretty-printed JSON."""
        async with _client() as client:
            r = await client.get("/api/calendar")
        r.raise_for_status()
        return json.dumps(r.json(), ensure_ascii=False, indent=2)

    async def do_approve(post_id):
        """Approve the draft post with the given id."""
        async with _client() as client:
            r = await client.post(f"/api/approve_post/{int(post_id)}")
        r.raise_for_status()
        return json.dumps(r.json(), ensure_ascii=False, indent=2)

    async def do_schedule(post_id, iso):
        """Schedule an approved post for the given ISO timestamp."""
        async with _client() as client:
            r = await client.post(f"/api/schedule_post/{int(post_id)}", json={"scheduled_at": iso})
        r.raise_for_status()
        return json.dumps(r.json(), ensure_ascii=False, indent=2)

    async def save_keywords(keywords_csv):
        """Persist the watch-keyword list."""
        async with _client() as client:
            r = await client.post("/api/keywords", json={"keywords": _csv_to_keywords(keywords_csv)})
        r.raise_for_status()
        return "保存しました"

    async def load_keywords():
        """Load the persisted watch-keyword list as a comma-joined string."""
        async with _client() as client:
            r = await client.get("/api/keywords")
        r.raise_for_status()
        data = r.json()
        return ", ".join(data.get("keywords", []))

    with gr.Blocks(title="SNS運用AIライト") as demo:
        gr.Markdown("## SNS運用AIライト — 競合/トレンド要約 → 1週間案 → 承認 → 予約投稿")

        with gr.Tab("1) トレンド要約"):
            platforms = gr.CheckboxGroup(choices=["x","instagram"], value=["x","instagram"], label="対象プラットフォーム")
            keywords = gr.Textbox(label="監視キーワード(カンマ区切り)", placeholder="自社名, 競合名, 業界ワード")
            brand = gr.Textbox(label="ブランド名(任意)", placeholder="HitC Inc. など")
            limit = gr.Slider(5, 50, step=1, value=20, label="取得件数(疑似/本API対応)")
            language = gr.Dropdown(["ja","en"], value="ja", label="出力言語")
            btn = gr.Button("トレンド要約を実行")
            items_json = gr.Code(label="取得結果(JSON)", language="json")
            summary = gr.Textbox(label="要約", lines=12)
            btn.click(do_summarize, [platforms, keywords, brand, limit, language], [items_json, summary])

        with gr.Tab("2) 1週間の投稿案生成"):
            brand2 = gr.Textbox(label="ブランド名", placeholder="HitC Inc.")
            language2 = gr.Dropdown(["ja","en"], value="ja", label="言語")
            platforms2 = gr.CheckboxGroup(choices=["x","instagram"], value=["x","instagram"], label="プラットフォーム")
            keywords2 = gr.Textbox(label="キーワード(任意/カンマ区切り)")
            start_date = gr.Textbox(label="開始日(ISO, 任意)", placeholder="2025-09-01")
            tone = gr.Textbox(label="トーン", value="プロフェッショナルで親しみやすい")
            cta = gr.Textbox(label="CTA", value="詳細はこちら")
            imgstyle = gr.Textbox(label="画像ラフのスタイルヒント", value="ミニマル、端的なタイポ、ブランドカラー意識")
            btn2 = gr.Button("投稿案を生成 & DB保存")
            posts_json = gr.Code(label="投稿案(DBにdraft保存)", language="json")
            btn2.click(do_generate_plan, [brand2, language2, platforms2, keywords2, start_date, tone, cta, imgstyle], [posts_json])

        with gr.Tab("3) 承認・予約・カレンダー"):
            gr.Markdown("承認→予約→公開(APScheduler)が動きます。")
            post_id = gr.Number(label="Post ID")
            approve_btn = gr.Button("承認する")
            schedule_iso = gr.Textbox(label="予約日時(ISO, 例: 2025-09-01T09:00:00Z)")
            schedule_btn = gr.Button("予約に登録")
            out = gr.Code(label="レスポンス", language="json")
            approve_btn.click(do_approve, [post_id], [out])
            schedule_btn.click(do_schedule, [post_id, schedule_iso], [out])

            cal_btn = gr.Button("カレンダー取得")
            cal_json = gr.Code(label="カレンダー", language="json")
            cal_btn.click(do_list_calendar, [], [cal_json])

        with gr.Tab("設定"):
            kw_in = gr.Textbox(label="監視キーワード(カンマ区切り)")
            load_btn = gr.Button("読み込み")
            save_btn = gr.Button("保存")
            msg = gr.Textbox(label="メッセージ")
            load_btn.click(load_keywords, [], [kw_in])
            save_btn.click(save_keywords, [kw_in], [msg])

    return demo
video.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ from PIL import Image, ImageDraw, ImageFont
4
+
5
+ def _run(cmd: list):
6
+ p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
7
+ if p.returncode != 0:
8
+ raise RuntimeError(p.stderr[:2000])
9
+ return p.stdout
10
+
11
def _audio_duration_seconds(audio_path: str) -> float:
    """Return the duration of *audio_path* in seconds, as reported by ffprobe."""
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        audio_path,
    ]
    return float(_run(probe_cmd).strip())
18
+
19
def _wrap_text(draw: ImageDraw.ImageDraw, text: str, font: ImageFont.FreeTypeFont, max_width: int) -> str:
    """Greedy character-level wrap of *text* to at most *max_width* pixels
    per line, measured with ``draw.textlength``.

    Character-level (not word-level) breaking is deliberate: Japanese text
    has no spaces to break on.  Returns the wrapped text joined by newlines.
    """
    lines = []
    current = ""
    for char in text:
        candidate = current + char
        if draw.textlength(candidate, font=font) <= max_width:
            current = candidate
            continue
        # Candidate too wide: flush the accumulated line (if any) and
        # start a new one with this character.
        if current:
            lines.append(current)
        current = char
    if current:
        lines.append(current)
    return "\n".join(lines)
32
+
33
def make_background_image(out_path: str, title: str, subtitle: str, lang: str = "ja", width: int = 1080, height: int = 1920):
    """Render a simple dark background with centred title/subtitle text.

    Used as the video background and (resized by the caller) as the OGP
    image.  ``lang`` is currently unused; it is kept for interface
    compatibility.  Returns out_path.
    """
    img = Image.new("RGB", (width, height), (20, 24, 32))
    draw = ImageDraw.Draw(img)

    # DejaVu ships on most Linux images (incl. HF Spaces); fall back to the
    # tiny builtin bitmap font rather than crash when it is missing.
    try:
        font_title = ImageFont.truetype("DejaVuSans-Bold.ttf", size=72)
        font_sub = ImageFont.truetype("DejaVuSans.ttf", size=48)
    except Exception:
        font_title = ImageFont.load_default()
        font_sub = ImageFont.load_default()

    margin = 40
    max_w = width - margin * 2

    # Truncate pathological inputs, then wrap to the drawable width.
    title_wrapped = _wrap_text(draw, title[:100], font_title, max_w)
    sub_wrapped = _wrap_text(draw, subtitle[:80], font_sub, max_w)

    def _text_size(wrapped, font):
        # multiline_textbbox returns (left, top, right, bottom).
        left, top, right, bottom = draw.multiline_textbbox((0, 0), wrapped, font=font, align="center")
        return right - left, bottom - top

    title_w, title_h = _text_size(title_wrapped, font_title)
    sub_w, sub_h = _text_size(sub_wrapped, font_sub)

    # Vertically centre the title + 20px gap + subtitle stack.
    y_title = (height - (title_h + 20 + sub_h)) // 2
    y_sub = y_title + title_h + 20

    shadow = (0, 0, 0)
    color = (255, 255, 255)

    def _draw_with_shadow(x, y, wrapped, font):
        # Cheap drop shadow: two dark offset passes beneath the white text.
        for dx, dy in ((2, 2), (1, 1)):
            draw.multiline_text((x + dx, y + dy), wrapped, fill=shadow, font=font, align="center")
        draw.multiline_text((x, y), wrapped, fill=color, font=font, align="center")

    _draw_with_shadow((width - title_w) // 2, y_title, title_wrapped, font_title)
    _draw_with_shadow((width - sub_w) // 2, y_sub, sub_wrapped, font_sub)

    img.save(out_path)
    return out_path
75
+
76
def compose_video_with_subtitles(image_path: str, audio_path: str, srt_path: str, out_path: str, width: int = 1080, height: int = 1920, fps: int = 30):
    """Compose a still image + narration audio + SRT captions into an mp4.

    Two ffmpeg passes: (1) loop the image for the audio's duration and mux the
    narration into a temp video; (2) burn the subtitles in (requires an ffmpeg
    build with libass).  Returns out_path.

    Raises:
        RuntimeError: if ffprobe/ffmpeg fails (propagated from _run).
    """
    duration = _audio_duration_seconds(audio_path)

    # BUG FIX: the original built the temp path with
    # out_path.replace(".mp4", "_temp.mp4"), which rewrites EVERY ".mp4"
    # occurrence in the path and degenerates to temp == out_path when the
    # suffix is absent (ffmpeg would then read and write the same file).
    root, ext = os.path.splitext(out_path)
    temp_video = f"{root}_temp{ext or '.mp4'}"

    try:
        # Pass 1: image -> video, muxed with the narration audio.
        _run([
            "ffmpeg", "-y",
            "-loop", "1", "-i", image_path,
            "-i", audio_path,
            "-c:v", "libx264",
            "-t", f"{duration:.2f}",
            "-pix_fmt", "yuv420p",
            "-vf", f"scale={width}:{height},fps={fps}",
            "-c:a", "aac", "-shortest",
            temp_video,
        ])

        # Pass 2: burn in the subtitles.  Escape the path for the ffmpeg
        # filter syntax; Linux (Spaces) is assumed -- Windows would need
        # different escaping.
        srt_escaped = srt_path.replace("\\", "\\\\").replace(":", r"\:").replace("'", r"\\'")
        _run([
            "ffmpeg", "-y",
            "-i", temp_video,
            "-vf", f"subtitles='{srt_escaped}'",
            "-c:a", "copy",
            out_path,
        ])
    finally:
        # Best-effort cleanup of the intermediate file (now also on failure).
        try:
            os.remove(temp_video)
        except OSError:
            pass

    return out_path