import json
from pathlib import Path

import gradio as gr

from rag.extract_text import extract_text
from rag.rag import OpenAIEmbedRAG
from llm.call_llm import call_llm_structured
from generation.gen_img import generate_image_with_openai_from_llm_spec
from generation.gen_audio import generate_audio_with_openai_from_llm_spec

# ---------- Output directory ----------
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)


# ---------- Helpers ----------
def hits_to_snippets_json(hits, raw_fallback: str, limit: int = 500) -> str:
    """Safe serialization for retrieved chunks."""
    items = []
    if hits is None:
        hits = []
    for i, h in enumerate(hits):
        if not isinstance(h, dict):
            continue
        t = h.get("text")
        if not t:
            continue
        items.append({"id": h.get("id", i), "excerpt": t[:limit]})
    if not items:
        fb = (raw_fallback or "")[:limit]
        if fb:
            items = [{"id": 0, "excerpt": fb}]
    return json.dumps(items, ensure_ascii=False)
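
# Illustrative example (not executed): given hits like
#   [{"id": 3, "text": "a long passage about lighthouses ..."}]
# the helper returns
#   '[{"id": 3, "excerpt": "a long passage about lighthouses ..."}]'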


# ---------- Core pipeline ----------
def run_pipeline(
    file_path: str,
    openai_api_key: str,
    user_goal: str = "Generate an image and a narration audio that reflects the essence of the text",
    topk: int = 6,
    llm_model: str = "gpt-5-nano",
):
    """
    Runs the full pipeline using the provided API key.
    SECURITY NOTE: We do not write the API key to disk or include it in any outputs.
    """

    if not openai_api_key or not openai_api_key.strip():
        raise ValueError("OpenAI API key is required.")

    # 1) Extract text
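    # `extract_text` (project helper) is expected to return the document's
    # plain text for any supported format (.txt/.md/.pdf).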
    raw = extract_text(file_path)

    # 2) RAG (embeddings + search)
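    # `build` presumably chunks and embeds the document; `search` returns the
    # top-k chunks most similar to the goal, each a dict with a "text" field.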
    rag = OpenAIEmbedRAG(model="text-embedding-3-small", openai_key=openai_api_key)
    rag.build(raw)
    hits = rag.search(user_goal, topk=topk)

    snippets_json = hits_to_snippets_json(hits, raw)

    # 3) LLM → structured JSON (image/audio/debug)
    system_prompt = """
    You are a prompt engineer for **visual** and **audio** generation.

    Return a JSON object strictly matching this schema:

    {
    "image": {
        "prompt": "string, a detailed description of what the image should show",
        "negative_prompt": "string, optional description of what to avoid",
        "style": ["string", ...],   // optional styles like "cinematic", "oil painting"
        "width": int,               // optional, default 1024
        "height": int               // optional, default 1024
    },
    "audio": {
        "text": "string, the exact narration script to be read aloud, written in natural spoken language, only contains the script itself without extra beginning",
        "voice": "string, choose from [alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, verse], default 'alloy', should match the atmosphere of the text",
        "speed": float,             // optional, default 1.0
    },
    "debug": {
        "reasoning": "string, brief reasoning why you designed the prompts this way"
    }
    }

    Rules:
    - Always output valid JSON only, no explanations outside JSON.
    - The narration text (`audio.text`) must be **rewritten into a smooth script** that could be directly read aloud,
    not raw excerpts.
    - All places in JSON must be filled with valid content, do not leave any entry empty!
    - Keep the image prompt concise but vivid.
    """.strip()

    user_prompt = f"""
    Goal: {user_goal}

    Below are the most relevant retrieved excerpts (with source IDs):
    {snippets_json}

    Now produce the JSON object strictly following the schema.
    """.strip()

    spec = call_llm_structured(
        system_prompt,
        user_prompt,
        model=llm_model,
        openai_key=openai_api_key,
    )
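    # `spec` should be a dict shaped like the schema above:
    # {"image": {...}, "audio": {...}, "debug": {...}}.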

    print(spec)  # debug: inspect the structured spec before generation

    # 4) Image generation
    result_img = generate_image_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    img_obj = result_img["image"]

    # 5) Audio generation
    result_audio = generate_audio_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    audio_bytes = result_audio["audio_bytes"]
    # gr.Audio cannot render raw bytes, so persist them to disk and hand the
    # UI a file path (MP3 is assumed here; OpenAI TTS emits MP3 by default).
    audio_path = OUT_DIR / "narration.mp3"
    audio_path.write_bytes(audio_bytes)

    # 6) Pretty meta for UI
    pretty = {
        "spec": spec,
        "used_chunks_preview": json.loads(snippets_json),
    }
    raw_json = json.dumps(pretty, ensure_ascii=False, indent=2)
    return img_obj, str(audio_path), pretty, raw_json
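
# Programmatic usage sketch (assumes OPENAI_API_KEY is exported and that a
# local `sample.txt` exists):
#
#   import os
#   img, audio_path, meta, raw = run_pipeline(
#       "sample.txt", os.environ["OPENAI_API_KEY"]
#   )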


# ---------- Gradio UI ----------
def ui_pipeline(file, api_key, goal, topk, model_name):
    if file is None:
        return None, None, {"error": "Please upload a file (.txt/.md/.pdf)."}, ""
    try:
        return run_pipeline(
            file_path=file.name,
            openai_api_key=api_key,
            user_goal=goal,
            topk=int(topk),
            llm_model=model_name,
        )
    except Exception as e:
        return None, None, {"error": str(e)}, ""


with gr.Blocks(title="File → (RAG + LLM) → Prompts → Image+Audio") as demo:
    gr.Markdown(
        "# πŸ“¦β†’πŸ§ β†’πŸŽ¨+πŸ”Š  Generate illustration and narration for your documents\n"
        "**Bring your own OpenAI API key. We do not store your key; it is only used in memory for this run.**\n\n"
        "- Upload a text/PDF, retrieve with embeddings, let the LLM craft prompts.\n"
        "- Generate both an image **and** an audio narration.\n"
    )

    with gr.Row():
        file_in = gr.File(label="Upload file (.txt/.md/.pdf)")
        goal = gr.Textbox(
            label="Your goal (more detail β†’ better results)",
            value="Generate an illustration and a narration that matches the spirit of this text",
            lines=2,
        )

    with gr.Row():
        api_key_in = gr.Textbox(
            label="OpenAI API key",
            placeholder="sk-...",
            type="password",
        )
        topk = gr.Slider(1, 12, value=6, step=1, label="Top-K retrieved chunks")
        model_llm = gr.Dropdown(
            choices=["gpt-5-nano"],
            value="gpt-5-nano",
            label="LLM model",
        )

    run_btn = gr.Button("Run", variant="primary")

    out_img = gr.Image(label="Generated image")
    out_audio = gr.Audio(label="Generated audio", type="filepath")  # served from the saved narration file
    out_json = gr.JSON(label="Spec & Metadata")
    out_raw = gr.Code(label="Raw JSON (debug)", language="json")

    run_btn.click(
        ui_pipeline,
        inputs=[file_in, api_key_in, goal, topk, model_llm],
        outputs=[out_img, out_audio, out_json, out_raw],
    )

if __name__ == "__main__":
    demo.launch()