# doc_alive / app.py
# Author: slxhere — commit 5c9f0d9 "Add audio generation"
import os, json
from pathlib import Path
from typing import Dict, Any
import gradio as gr
from rag.extract_text import extract_text
from rag.rag import OpenAIEmbedRAG
from llm.call_llm import call_llm_structured
from generation.gen_img import generate_image_with_openai_from_llm_spec
from generation.gen_audio import generate_audio_with_openai_from_llm_spec
# ---------- Output directory ----------
# All generated images/audio are written under ./outputs, relative to the CWD.
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)  # idempotent — safe across restarts
# ---------- Helpers ----------
def hits_to_snippets_json(hits, raw_fallback: str, limit: int = 500) -> str:
    """Serialize retrieved chunks to a compact JSON string.

    Non-dict hits and hits whose "text" is missing/empty are dropped; each
    surviving hit becomes {"id", "excerpt"} with the excerpt truncated to
    *limit* characters. If nothing survives, a truncated slice of
    *raw_fallback* is emitted as a single snippet with id 0 (or "[]" when
    the fallback is empty too).
    """
    snippets = [
        {"id": hit.get("id", idx), "excerpt": hit["text"][:limit]}
        for idx, hit in enumerate(hits or [])
        if isinstance(hit, dict) and hit.get("text")
    ]
    if not snippets:
        fallback = (raw_fallback or "")[:limit]
        if fallback:
            snippets = [{"id": 0, "excerpt": fallback}]
    return json.dumps(snippets, ensure_ascii=False)
# ---------- Core pipeline ----------
def run_pipeline(
    file_path: str,
    openai_api_key: str,
    user_goal: str = "Generate an image and a narration audio that reflects the essence of the text",
    topk: int = 6,
    llm_model: str = "gpt-5-nano",
):
    """
    Run the full document -> image + audio pipeline.

    Steps: extract text from the file, retrieve the most relevant chunks via
    embeddings, ask the LLM for a structured image/audio spec, then generate
    both assets with that spec.

    Args:
        file_path: Path to the uploaded .txt/.md/.pdf file.
        openai_api_key: Caller-supplied OpenAI key (used in memory only).
        user_goal: Retrieval query and generation instruction for the LLM.
        topk: Number of retrieved chunks handed to the LLM.
        llm_model: Chat model used to craft the prompts.

    Returns:
        Tuple of (image object, audio bytes, pretty-meta dict, raw JSON string).

    Raises:
        ValueError: If the API key is missing or blank.

    SECURITY NOTE: We do not write the API key to disk or include it in any outputs.
    """
    if not openai_api_key or not openai_api_key.strip():
        raise ValueError("OpenAI API key is required.")
    # Fix: strip pasted whitespace/newlines so the clean key reaches the API —
    # previously only the validation used .strip(), not the key itself.
    openai_api_key = openai_api_key.strip()

    # 1) Extract text
    raw = extract_text(file_path)

    # 2) RAG (embeddings + search)
    rag = OpenAIEmbedRAG(model="text-embedding-3-small", openai_key=openai_api_key)
    rag.build(raw)
    hits = rag.search(user_goal, topk=topk)
    snippets_json = hits_to_snippets_json(hits, raw)

    # 3) LLM → structured JSON (image/audio/debug)
    system_prompt = """
You are a prompt engineer for **visual** and **audio** generation.
Return a JSON object strictly matching this schema:
{
"image": {
"prompt": "string, a detailed description of what the image should show",
"negative_prompt": "string, optional description of what to avoid",
"style": ["string", ...], // optional styles like "cinematic", "oil painting"
"width": int, // optional, default 1024
"height": int // optional, default 1024
},
"audio": {
"text": "string, the exact narration script to be read aloud, written in natural spoken language, only contains the script itself without extra beginning",
"voice": "string, choose from [alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, verse], default 'alloy', should match the atmosphere of the text",
"speed": float, // optional, default 1.0
},
"debug": {
"reasoning": "string, brief reasoning why you designed the prompts this way"
}
}
Rules:
- Always output valid JSON only, no explanations outside JSON.
- The narration text (`audio.text`) must be **rewritten into a smooth script** that could be directly read aloud,
not raw excerpts.
- All places in JSON must be filled with valid content, do not leave any entry empty!
- Keep the image prompt concise but vivid.
""".strip()
    user_prompt = f"""
Goal: {user_goal}
Below are the most relevant retrieved excerpts (with source IDs):
{snippets_json}
Now produce the JSON object strictly following the schema.
""".strip()
    spec = call_llm_structured(
        system_prompt,
        user_prompt,
        model=llm_model,
        openai_key=openai_api_key,
    )
    # Fix: removed stray `print(spec)` debug leftover — the spec is already
    # surfaced to the UI via `pretty`, and printing it spams server logs.

    # 4) Image generation
    result_img = generate_image_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    img_obj = result_img["image"]

    # 5) Audio generation
    result_audio = generate_audio_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    audio_bytes = result_audio["audio_bytes"]

    # 6) Pretty meta for UI
    pretty = {
        "spec": spec,
        "used_chunks_preview": json.loads(snippets_json),
    }
    raw_json = json.dumps(pretty, ensure_ascii=False, indent=2)
    return img_obj, audio_bytes, pretty, raw_json
# ---------- Gradio UI ----------
def ui_pipeline(file, api_key, goal, topk, model_name):
    """Gradio callback: validate the upload, then delegate to run_pipeline.

    Returns the 4-tuple expected by the UI outputs; on any failure the
    error is surfaced as the JSON payload instead of crashing the app.
    """
    if file is None:
        return None, None, {"error": "Please upload a file (.txt/.md/.pdf)."}, ""
    try:
        outputs = run_pipeline(
            file_path=file.name,
            openai_api_key=api_key,
            user_goal=goal,
            topk=int(topk),
            llm_model=model_name,
        )
    except Exception as exc:  # UI boundary: show the error, keep the app alive
        return None, None, {"error": str(exc)}, ""
    return outputs
# ---------- Gradio UI ----------
# Layout: two rows of inputs (file + goal, then key + retrieval/model knobs),
# a run button, and four outputs wired to ui_pipeline's 4-tuple return.
with gr.Blocks(title="File → (RAG + LLM) → Prompts → Image+Audio") as demo:
    gr.Markdown(
        "# 📦→🧠→🎨+🔊 Generate illustration and narration for your documents\n"
        "**Bring your own OpenAI API key. We do not store your key; it is only used in memory for this run.**\n\n"
        "- Upload a text/PDF, retrieve with embeddings, let the LLM craft prompts.\n"
        "- Generate both an image **and** an audio narration.\n"
    )
    with gr.Row():
        file_in = gr.File(label="Upload file (.txt/.md/.pdf)")
        goal = gr.Textbox(
            label="Your goal (more detail → better results)",
            value="Generate an illustration and a narration that matches the spirit of this text",
            lines=2,
        )
    with gr.Row():
        # type="password" masks the key in the browser; it is never persisted.
        api_key_in = gr.Textbox(
            label="OpenAI API key",
            placeholder="sk-...",
            type="password",
        )
        topk = gr.Slider(1, 12, value=6, step=1, label="Top-K retrieved chunks")
        model_llm = gr.Dropdown(
            choices=["gpt-5-nano"],
            value="gpt-5-nano",
            label="LLM model",
        )
    run_btn = gr.Button("Run", variant="primary")
    out_img = gr.Image(label="Generated image")
    out_audio = gr.Audio(label="Generated audio", type="numpy")  # ⚡ switched to numpy
    out_json = gr.JSON(label="Spec & Metadata")
    out_raw = gr.Code(label="Raw JSON (debug)", language="json")
    # inputs/outputs order must match ui_pipeline's signature and return tuple.
    run_btn.click(
        ui_pipeline,
        inputs=[file_in, api_key_in, goal, topk, model_llm],
        outputs=[out_img, out_audio, out_json, out_raw],
    )

if __name__ == "__main__":
    demo.launch()