File size: 6,602 Bytes
5c9f0d9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
import os, json
from pathlib import Path
from typing import Dict, Any
import gradio as gr
from rag.extract_text import extract_text
from rag.rag import OpenAIEmbedRAG
from llm.call_llm import call_llm_structured
from generation.gen_img import generate_image_with_openai_from_llm_spec
from generation.gen_audio import generate_audio_with_openai_from_llm_spec
# ---------- Output directory ----------
# All generated artifacts (images, audio files) are written under this folder.
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)  # idempotent: safe on repeated app launches
# ---------- Helpers ----------
def hits_to_snippets_json(hits, raw_fallback: str, limit: int = 500) -> str:
    """Serialize retrieved RAG chunks into a compact JSON array string.

    Each usable hit (a dict carrying a non-empty ``"text"``) becomes
    ``{"id": ..., "excerpt": ...}`` with the excerpt truncated to *limit*
    characters; the id falls back to the hit's position when absent.
    If no usable hit exists, a single excerpt taken from *raw_fallback*
    is used instead (or an empty array when that too is empty).
    """
    snippets = [
        {"id": hit.get("id", position), "excerpt": hit["text"][:limit]}
        for position, hit in enumerate([] if hits is None else hits)
        if isinstance(hit, dict) and hit.get("text")
    ]
    if not snippets:
        fallback_excerpt = (raw_fallback or "")[:limit]
        if fallback_excerpt:
            snippets = [{"id": 0, "excerpt": fallback_excerpt}]
    return json.dumps(snippets, ensure_ascii=False)
# ---------- Core pipeline ----------
def run_pipeline(
    file_path: str,
    openai_api_key: str,
    user_goal: str = "Generate an image and a narration audio that reflects the essence of the text",
    topk: int = 6,
    llm_model: str = "gpt-5-nano",
):
    """
    Runs the full pipeline using the provided API key.

    Steps: extract text from the file, retrieve the most relevant chunks via
    embeddings (RAG), ask the LLM for a structured image/audio spec, then
    generate both the image and the narration audio from that spec.

    Args:
        file_path: Path to the uploaded document (.txt/.md/.pdf expected upstream).
        openai_api_key: Caller-supplied OpenAI key, used in memory only.
        user_goal: Natural-language instruction driving retrieval and prompting.
        topk: Number of retrieved chunks handed to the LLM.
        llm_model: Chat model used to produce the structured spec.

    Returns:
        Tuple of (image object, audio bytes, metadata dict, pretty JSON string) —
        the same order the Gradio outputs are wired in.

    Raises:
        ValueError: If the API key is missing or blank.

    SECURITY NOTE: We do not write the API key to disk or include it in any outputs.
    """
    if not openai_api_key or not openai_api_key.strip():
        raise ValueError("OpenAI API key is required.")

    # 1) Extract text from the uploaded file.
    raw = extract_text(file_path)

    # 2) RAG: embed the document, then retrieve the chunks closest to the goal.
    rag = OpenAIEmbedRAG(model="text-embedding-3-small", openai_key=openai_api_key)
    rag.build(raw)
    hits = rag.search(user_goal, topk=topk)
    snippets_json = hits_to_snippets_json(hits, raw)

    # 3) LLM -> structured JSON (image/audio/debug)
    system_prompt = """
You are a prompt engineer for **visual** and **audio** generation.
Return a JSON object strictly matching this schema:
{
"image": {
"prompt": "string, a detailed description of what the image should show",
"negative_prompt": "string, optional description of what to avoid",
"style": ["string", ...], // optional styles like "cinematic", "oil painting"
"width": int, // optional, default 1024
"height": int // optional, default 1024
},
"audio": {
"text": "string, the exact narration script to be read aloud, written in natural spoken language, only contains the script itself without extra beginning",
"voice": "string, choose from [alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, verse], default 'alloy', should match the atmosphere of the text",
"speed": float, // optional, default 1.0
},
"debug": {
"reasoning": "string, brief reasoning why you designed the prompts this way"
}
}
Rules:
- Always output valid JSON only, no explanations outside JSON.
- The narration text (`audio.text`) must be **rewritten into a smooth script** that could be directly read aloud,
not raw excerpts.
- All places in JSON must be filled with valid content, do not leave any entry empty!
- Keep the image prompt concise but vivid.
""".strip()
    user_prompt = f"""
Goal: {user_goal}
Below are the most relevant retrieved excerpts (with source IDs):
{snippets_json}
Now produce the JSON object strictly following the schema.
""".strip()
    spec = call_llm_structured(
        system_prompt,
        user_prompt,
        model=llm_model,
        openai_key=openai_api_key,
    )
    print(spec)  # debug: surfaces the raw LLM spec in server logs (contains no key material)

    # 4) Image generation from the spec's "image" section.
    result_img = generate_image_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    img_obj = result_img["image"]

    # 5) Audio generation from the spec's "audio" section.
    result_audio = generate_audio_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    audio_bytes = result_audio["audio_bytes"]

    # 6) Pretty meta for the UI: the spec plus the retrieved chunks that fed it.
    pretty = {
        "spec": spec,
        "used_chunks_preview": json.loads(snippets_json),
    }
    raw_json = json.dumps(pretty, ensure_ascii=False, indent=2)
    return img_obj, audio_bytes, pretty, raw_json
# ---------- Gradio UI ----------
def ui_pipeline(file, api_key, goal, topk, model_name):
    """Gradio callback: validate the upload, then delegate to run_pipeline.

    Always returns a 4-tuple (image, audio, metadata dict, raw JSON string);
    on any failure the first two slots are None and the dict carries the
    error message so the UI never crashes.
    """
    if file is None:
        return None, None, {"error": "Please upload a file (.txt/.md/.pdf)."}, ""
    try:
        outputs = run_pipeline(
            file_path=file.name,
            openai_api_key=api_key,
            user_goal=goal,
            topk=int(topk),
            llm_model=model_name,
        )
    except Exception as exc:
        # Broad catch is deliberate: any pipeline failure becomes a UI message.
        return None, None, {"error": str(exc)}, ""
    return outputs
# Top-level Gradio layout: inputs (file, goal, key, retrieval depth, model),
# a run button, and outputs wired in the exact order ui_pipeline returns them.
with gr.Blocks(title="File β (RAG + LLM) β Prompts β Image+Audio") as demo:
    gr.Markdown(
        "# π¦βπ§ βπ¨+π Generate illustration and narration for your documents\n"
        "**Bring your own OpenAI API key. We do not store your key; it is only used in memory for this run.**\n\n"
        "- Upload a text/PDF, retrieve with embeddings, let the LLM craft prompts.\n"
        "- Generate both an image **and** an audio narration.\n"
    )
    # Row 1: document upload and the free-text goal.
    with gr.Row():
        file_in = gr.File(label="Upload file (.txt/.md/.pdf)")
        goal = gr.Textbox(
            label="Your goal (more detail β better results)",
            value="Generate an illustration and a narration that matches the spirit of this text",
            lines=2,
        )
    # Row 2: API key (masked in the browser), retrieval depth, and model choice.
    with gr.Row():
        api_key_in = gr.Textbox(
            label="OpenAI API key",
            placeholder="sk-...",
            type="password",
        )
        topk = gr.Slider(1, 12, value=6, step=1, label="Top-K retrieved chunks")
        model_llm = gr.Dropdown(
            choices=["gpt-5-nano"],
            value="gpt-5-nano",
            label="LLM model",
        )
    run_btn = gr.Button("Run", variant="primary")
    # Outputs — order must match ui_pipeline's return tuple.
    out_img = gr.Image(label="Generated image")
    out_audio = gr.Audio(label="Generated audio", type="numpy")  # NOTE: switched to numpy output
    out_json = gr.JSON(label="Spec & Metadata")
    out_raw = gr.Code(label="Raw JSON (debug)", language="json")
    run_btn.click(
        ui_pipeline,
        inputs=[file_in, api_key_in, goal, topk, model_llm],
        outputs=[out_img, out_audio, out_json, out_raw],
    )
if __name__ == "__main__":
    # Launch the Gradio server when run as a script (not on import).
    demo.launch()
|