import os
import json
from pathlib import Path
from typing import Dict, Any

import gradio as gr

from rag.extract_text import extract_text
from rag.rag import OpenAIEmbedRAG
from llm.call_llm import call_llm_structured
from generation.gen_img import generate_image_with_openai_from_llm_spec
from generation.gen_audio import generate_audio_with_openai_from_llm_spec

# ---------- Output directory ----------
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)


# ---------- Helpers ----------
def hits_to_snippets_json(hits, raw_fallback: str, limit: int = 500) -> str:
    """Safe serialization for retrieved chunks.

    Converts RAG search hits into a compact JSON array of
    ``{"id": ..., "excerpt": ...}`` objects, truncating each excerpt to
    *limit* characters. Malformed hits (non-dicts, missing/empty text)
    are skipped. If no usable hit remains, falls back to the first
    *limit* characters of *raw_fallback* so the LLM always receives
    some context.

    Args:
        hits: Iterable of hit dicts (expects "text", optionally "id");
            ``None`` is treated as empty.
        raw_fallback: Full document text used when no hits survive.
        limit: Maximum excerpt length in characters.

    Returns:
        A JSON string (non-ASCII preserved) encoding the snippet list;
        may be "[]" when both hits and fallback are empty.
    """
    items = []
    if hits is None:
        hits = []
    for i, h in enumerate(hits):
        if not isinstance(h, dict):
            continue
        t = h.get("text")
        if not t:
            continue
        items.append({"id": h.get("id", i), "excerpt": t[:limit]})
    if not items:
        # No valid hit at all — fall back to the head of the raw document.
        fb = (raw_fallback or "")[:limit]
        if fb:
            items = [{"id": 0, "excerpt": fb}]
    return json.dumps(items, ensure_ascii=False)


# ---------- Core pipeline ----------
def run_pipeline(
    file_path: str,
    openai_api_key: str,
    user_goal: str = "Generate an image and a narration audio that reflects the essence of the text",
    topk: int = 6,
    llm_model: str = "gpt-5-nano",
):
    """Runs the full pipeline using the provided API key.

    Steps: extract text → embed/retrieve (RAG) → LLM produces a
    structured image/audio spec → generate image and narration audio.

    SECURITY NOTE: We do not write the API key to disk or include it
    in any outputs.

    Args:
        file_path: Path to the uploaded .txt/.md/.pdf file.
        openai_api_key: User-supplied OpenAI key, used in memory only.
        user_goal: Natural-language goal guiding retrieval and prompts.
        topk: Number of retrieved chunks to feed the LLM.
        llm_model: Chat model used to produce the structured spec.

    Returns:
        Tuple of (image object, audio bytes, metadata dict, pretty JSON
        string of the metadata) — matches the Gradio output components.

    Raises:
        ValueError: If the API key is missing or blank.
    """
    if not openai_api_key or not openai_api_key.strip():
        raise ValueError("OpenAI API key is required.")

    # 1) Extract text
    raw = extract_text(file_path)

    # 2) RAG (embeddings + search)
    rag = OpenAIEmbedRAG(model="text-embedding-3-small", openai_key=openai_api_key)
    rag.build(raw)
    hits = rag.search(user_goal, topk=topk)
    snippets_json = hits_to_snippets_json(hits, raw)

    # 3) LLM -> structured JSON (image/audio/debug)
    system_prompt = """
You are a prompt engineer for **visual** and **audio** generation.
Return a JSON object strictly matching this schema:
{
  "image": {
    "prompt": "string, a detailed description of what the image should show",
    "negative_prompt": "string, optional description of what to avoid",
    "style": ["string", ...],  // optional styles like "cinematic", "oil painting"
    "width": int,   // optional, default 1024
    "height": int   // optional, default 1024
  },
  "audio": {
    "text": "string, the exact narration script to be read aloud, written in natural spoken language, only contains the script itself without extra beginning",
    "voice": "string, choose from [alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, verse], default 'alloy', should match the atmosphere of the text",
    "speed": float,  // optional, default 1.0
  },
  "debug": {
    "reasoning": "string, brief reasoning why you designed the prompts this way"
  }
}
Rules:
- Always output valid JSON only, no explanations outside JSON.
- The narration text (`audio.text`) must be **rewritten into a smooth script** that could be directly read aloud, not raw excerpts.
- All places in JSON must be filled with valid content, do not leave any entry empty!
- Keep the image prompt concise but vivid.
""".strip()

    user_prompt = f"""
Goal: {user_goal}

Below are the most relevant retrieved excerpts (with source IDs):
{snippets_json}

Now produce the JSON object strictly following the schema.
""".strip()

    spec = call_llm_structured(
        system_prompt,
        user_prompt,
        model=llm_model,
        openai_key=openai_api_key,
    )
    print(spec)

    # 4) Image generation
    result_img = generate_image_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    img_obj = result_img["image"]

    # 5) Audio generation
    result_audio = generate_audio_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    audio_bytes = result_audio["audio_bytes"]

    # 6) Pretty meta for UI
    pretty = {
        "spec": spec,
        "used_chunks_preview": json.loads(snippets_json),
    }
    raw_json = json.dumps(pretty, ensure_ascii=False, indent=2)

    return img_obj, audio_bytes, pretty, raw_json


# ---------- Gradio UI ----------
def ui_pipeline(file, api_key, goal, topk, model_name):
    """Gradio click handler: validates the upload and runs the pipeline.

    Returns the 4-tuple expected by the output components; on any
    failure, returns placeholder values with the error surfaced in the
    JSON panel so the UI never crashes.
    """
    if file is None:
        return None, None, {"error": "Please upload a file (.txt/.md/.pdf)."}, ""
    # Newer Gradio versions pass the upload as a plain filepath string;
    # older ones pass a tempfile-like object with a .name attribute.
    file_path = file if isinstance(file, str) else file.name
    try:
        return run_pipeline(
            file_path=file_path,
            openai_api_key=api_key,
            user_goal=goal,
            topk=int(topk),
            llm_model=model_name,
        )
    except Exception as e:
        # Top-level UI boundary: surface the error instead of crashing.
        return None, None, {"error": str(e)}, ""


with gr.Blocks(title="File → (RAG + LLM) → Prompts → Image+Audio") as demo:
    gr.Markdown(
        "# 📦→🧠→🎨+🔊 Generate illustration and narration for your documents\n"
        "**Bring your own OpenAI API key. We do not store your key; it is only used in memory for this run.**\n\n"
        "- Upload a text/PDF, retrieve with embeddings, let the LLM craft prompts.\n"
        "- Generate both an image **and** an audio narration.\n"
    )
    with gr.Row():
        file_in = gr.File(label="Upload file (.txt/.md/.pdf)")
        goal = gr.Textbox(
            label="Your goal (more detail → better results)",
            value="Generate an illustration and a narration that matches the spirit of this text",
            lines=2,
        )
    with gr.Row():
        api_key_in = gr.Textbox(
            label="OpenAI API key",
            placeholder="sk-...",
            type="password",
        )
        topk = gr.Slider(1, 12, value=6, step=1, label="Top-K retrieved chunks")
        model_llm = gr.Dropdown(
            choices=["gpt-5-nano"],
            value="gpt-5-nano",
            label="LLM model",
        )
    run_btn = gr.Button("Run", variant="primary")

    out_img = gr.Image(label="Generated image")
    # type="numpy": the handler returns (sample_rate, np.ndarray) audio data.
    out_audio = gr.Audio(label="Generated audio", type="numpy")
    out_json = gr.JSON(label="Spec & Metadata")
    out_raw = gr.Code(label="Raw JSON (debug)", language="json")

    run_btn.click(
        ui_pipeline,
        inputs=[file_in, api_key_in, goal, topk, model_llm],
        outputs=[out_img, out_audio, out_json, out_raw],
    )

if __name__ == "__main__":
    demo.launch()