import json
from pathlib import Path

import gradio as gr

from rag.extract_text import extract_text
from rag.rag import OpenAIEmbedRAG
from llm.call_llm import call_llm_structured
from generation.gen_img import generate_image_with_openai_from_llm_spec
from generation.gen_audio import generate_audio_with_openai_from_llm_spec
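
# Generated images and audio are written under this directory.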
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)


def hits_to_snippets_json(hits, raw_fallback: str, limit: int = 500) -> str:
    """Safely serialize retrieved chunks into a JSON string for the LLM prompt."""
    items = []
    if hits is None:
        hits = []
    for i, h in enumerate(hits):
        if not isinstance(h, dict):
            continue
        t = h.get("text")
        if not t:
            continue
        items.append({"id": h.get("id", i), "excerpt": t[:limit]})
    if not items:
        # No usable hits: fall back to the first `limit` characters of the raw text.
        fb = (raw_fallback or "")[:limit]
        if fb:
            items = [{"id": 0, "excerpt": fb}]
    return json.dumps(items, ensure_ascii=False)


def run_pipeline(
    file_path: str,
    openai_api_key: str,
    user_goal: str = "Generate an image and a narration audio that reflects the essence of the text",
    topk: int = 6,
    llm_model: str = "gpt-5-nano",
):
    """
    Runs the full pipeline using the provided API key.
    SECURITY NOTE: We do not write the API key to disk or include it in any outputs.
    """
    if not openai_api_key or not openai_api_key.strip():
        raise ValueError("OpenAI API key is required.")
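
    # 1) Extract raw text from the uploaded document (.txt/.md/.pdf).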
    raw = extract_text(file_path)
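
    # 2) Build an embedding index over the text and retrieve the top-k chunks
    #    most relevant to the user's goal.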
    rag = OpenAIEmbedRAG(model="text-embedding-3-small", openai_key=openai_api_key)
    rag.build(raw)
    hits = rag.search(user_goal, topk=topk)
    snippets_json = hits_to_snippets_json(hits, raw)
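
    # 3) Ask the LLM to turn the retrieved excerpts into a structured image/audio spec.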
    system_prompt = """
You are a prompt engineer for **visual** and **audio** generation.

Return a JSON object strictly matching this schema:

{
  "image": {
    "prompt": "string, a detailed description of what the image should show",
    "negative_prompt": "string, optional description of what to avoid",
    "style": ["string", ...],  // optional styles like "cinematic", "oil painting"
    "width": int,              // optional, default 1024
    "height": int              // optional, default 1024
  },
  "audio": {
    "text": "string, the exact narration script to be read aloud, written in natural spoken language; contains only the script itself, with no extra preamble",
    "voice": "string, chosen from [alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, verse], default 'alloy'; should match the atmosphere of the text",
    "speed": float             // optional, default 1.0
  },
  "debug": {
    "reasoning": "string, brief reasoning for why you designed the prompts this way"
  }
}

Rules:
- Always output valid JSON only, with no explanations outside the JSON.
- The narration text (`audio.text`) must be **rewritten into a smooth script** that could be read aloud directly, not raw excerpts.
- Every field in the JSON must be filled with valid content; do not leave any entry empty!
- Keep the image prompt concise but vivid.
""".strip()

    user_prompt = f"""
Goal: {user_goal}

Below are the most relevant retrieved excerpts (with source IDs):
{snippets_json}

Now produce the JSON object strictly following the schema.
""".strip()

    spec = call_llm_structured(
        system_prompt,
        user_prompt,
        model=llm_model,
        openai_key=openai_api_key,
    )
    print(spec)  # debug: log the generated spec to the console
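
    # 4) Render the image described by the spec.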
    result_img = generate_image_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    img_obj = result_img["image"]
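
    # 5) Synthesize the narration audio from the same spec.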
    result_audio = generate_audio_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    audio_bytes = result_audio["audio_bytes"]
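
    # 6) Bundle the spec and the retrieved-chunk preview for display in the UI.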
    pretty = {
        "spec": spec,
        "used_chunks_preview": json.loads(snippets_json),
    }
    raw_json = json.dumps(pretty, ensure_ascii=False, indent=2)
    return img_obj, audio_bytes, pretty, raw_json
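

# Thin Gradio callback around run_pipeline: validates the upload and returns
# pipeline errors as a JSON payload instead of raising in the UI.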
def ui_pipeline(file, api_key, goal, topk, model_name):
    if file is None:
        return None, None, {"error": "Please upload a file (.txt/.md/.pdf)."}, ""
    try:
        return run_pipeline(
            file_path=file.name,
            openai_api_key=api_key,
            user_goal=goal,
            topk=int(topk),
            llm_model=model_name,
        )
    except Exception as e:
        return None, None, {"error": str(e)}, ""
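

# Gradio UI: layout and wiring.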
with gr.Blocks(title="File → (RAG + LLM) → Prompts → Image+Audio") as demo:
    gr.Markdown(
        "# Generate illustration and narration for your documents\n"
        "**Bring your own OpenAI API key. We do not store your key; it is only used in memory for this run.**\n\n"
        "- Upload a text/PDF, retrieve with embeddings, let the LLM craft prompts.\n"
        "- Generate both an image **and** an audio narration.\n"
    )

    with gr.Row():
        file_in = gr.File(label="Upload file (.txt/.md/.pdf)")
        goal = gr.Textbox(
            label="Your goal (more detail → better results)",
            value="Generate an illustration and a narration that matches the spirit of this text",
            lines=2,
        )

    with gr.Row():
        api_key_in = gr.Textbox(
            label="OpenAI API key",
            placeholder="sk-...",
            type="password",
        )
        topk = gr.Slider(1, 12, value=6, step=1, label="Top-K retrieved chunks")
        model_llm = gr.Dropdown(
            choices=["gpt-5-nano"],
            value="gpt-5-nano",
            label="LLM model",
        )

    run_btn = gr.Button("Run", variant="primary")

    out_img = gr.Image(label="Generated image")
    out_audio = gr.Audio(label="Generated audio", type="numpy")
    out_json = gr.JSON(label="Spec & Metadata")
    out_raw = gr.Code(label="Raw JSON (debug)", language="json")
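
    # Kick off the pipeline on click; outputs map 1:1 to ui_pipeline's return values.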
    run_btn.click(
        ui_pipeline,
        inputs=[file_in, api_key_in, goal, topk, model_llm],
        outputs=[out_img, out_audio, out_json, out_raw],
    )


if __name__ == "__main__":
    demo.launch()