import json
from pathlib import Path

import gradio as gr

from rag.extract_text import extract_text
from rag.rag import OpenAIEmbedRAG
from llm.call_llm import call_llm_structured
from generation.gen_img import generate_image_with_openai_from_llm_spec
from generation.gen_audio import generate_audio_with_openai_from_llm_spec

# ---------- Output directory ----------
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)


# ---------- Helpers ----------
def hits_to_snippets_json(hits, raw_fallback: str, limit: int = 500) -> str:
    """Safe serialization for retrieved chunks."""
    items = []
    if hits is None:
        hits = []
    for i, h in enumerate(hits):
        if not isinstance(h, dict):
            continue
        t = h.get("text")
        if not t:
            continue
        items.append({"id": h.get("id", i), "excerpt": t[:limit]})
    if not items:
        fb = (raw_fallback or "")[:limit]
        if fb:
            items = [{"id": 0, "excerpt": fb}]
    return json.dumps(items, ensure_ascii=False)
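
# Illustrative example (not executed): given hits like
#   [{"id": 3, "text": "a long passage about lighthouses ..."}]
# the helper returns
#   '[{"id": 3, "excerpt": "a long passage about lighthouses ..."}]'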


# ---------- Core pipeline ----------
def run_pipeline(
    file_path: str,
    openai_api_key: str,
    user_goal: str = "Generate an image and a narration audio that reflects the essence of the text",
    topk: int = 6,
    llm_model: str = "gpt-5-nano",
):
    """
    Runs the full pipeline using the provided API key.
    SECURITY NOTE: We do not write the API key to disk or include it in any outputs.
    """

    if not openai_api_key or not openai_api_key.strip():
        raise ValueError("OpenAI API key is required.")

    # 1) Extract text
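    # `extract_text` (project helper) is expected to return the document's
    # plain text for any supported format (.txt/.md/.pdf).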
    raw = extract_text(file_path)

    # 2) RAG (embeddings + search)
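    # `build` presumably chunks and embeds the document; `search` returns the
    # top-k chunks most similar to the goal, each a dict with a "text" field.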
    rag = OpenAIEmbedRAG(model="text-embedding-3-small", openai_key=openai_api_key)
    rag.build(raw)
    hits = rag.search(user_goal, topk=topk)

    snippets_json = hits_to_snippets_json(hits, raw)

    # 3) LLM → structured JSON (image/audio/debug)
    system_prompt = """
    You are a prompt engineer for **visual** and **audio** generation.

    Return a JSON object strictly matching this schema:

    {
    "image": {
        "prompt": "string, a detailed description of what the image should show",
        "negative_prompt": "string, optional description of what to avoid",
        "style": ["string", ...],   // optional styles like "cinematic", "oil painting"
        "width": int,               // optional, default 1024
        "height": int               // optional, default 1024
    },
    "audio": {
        "text": "string, the exact narration script to be read aloud, written in natural spoken language, only contains the script itself without extra beginning",
        "voice": "string, choose from [alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, verse], default 'alloy', should match the atmosphere of the text",
        "speed": float,             // optional, default 1.0
    },
    "debug": {
        "reasoning": "string, brief reasoning why you designed the prompts this way"
    }
    }

    Rules:
    - Always output valid JSON only, no explanations outside JSON.
    - The narration text (`audio.text`) must be **rewritten into a smooth script** that could be directly read aloud,
    not raw excerpts.
    - All places in JSON must be filled with valid content, do not leave any entry empty!
    - Keep the image prompt concise but vivid.
    """.strip()

    user_prompt = f"""
    Goal: {user_goal}

    Below are the most relevant retrieved excerpts (with source IDs):
    {snippets_json}

    Now produce the JSON object strictly following the schema.
    """.strip()

    spec = call_llm_structured(
        system_prompt,
        user_prompt,
        model=llm_model,
        openai_key=openai_api_key,
    )
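    # `spec` should be a dict shaped like the schema above:
    # {"image": {...}, "audio": {...}, "debug": {...}}.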

    print(spec)  # debug: inspect the structured spec before generation

    # 4) Image generation
    result_img = generate_image_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    img_obj = result_img["image"]

    # 5) Audio generation
    result_audio = generate_audio_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    audio_bytes = result_audio["audio_bytes"]
    # gr.Audio cannot render raw bytes, so persist them to disk and hand the
    # UI a file path (MP3 is assumed here; OpenAI TTS emits MP3 by default).
    audio_path = OUT_DIR / "narration.mp3"
    audio_path.write_bytes(audio_bytes)

    # 6) Pretty meta for UI
    pretty = {
        "spec": spec,
        "used_chunks_preview": json.loads(snippets_json),
    }
    raw_json = json.dumps(pretty, ensure_ascii=False, indent=2)
    return img_obj, str(audio_path), pretty, raw_json
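
# Programmatic usage sketch (assumes OPENAI_API_KEY is exported and that a
# local `sample.txt` exists):
#
#   import os
#   img, audio_path, meta, raw = run_pipeline(
#       "sample.txt", os.environ["OPENAI_API_KEY"]
#   )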


# ---------- Gradio UI ----------
def ui_pipeline(file, api_key, goal, topk, model_name):
    if file is None:
        return None, None, {"error": "Please upload a file (.txt/.md/.pdf)."}, ""
    try:
        return run_pipeline(
            file_path=file.name,
            openai_api_key=api_key,
            user_goal=goal,
            topk=int(topk),
            llm_model=model_name,
        )
    except Exception as e:
        return None, None, {"error": str(e)}, ""


with gr.Blocks(title="File → (RAG + LLM) → Prompts → Image+Audio") as demo:
    gr.Markdown(
        "# πŸ“¦β†’πŸ§ β†’πŸŽ¨+πŸ”Š  Generate illustration and narration for your documents\n"
        "**Bring your own OpenAI API key. We do not store your key; it is only used in memory for this run.**\n\n"
        "- Upload a text/PDF, retrieve with embeddings, let the LLM craft prompts.\n"
        "- Generate both an image **and** an audio narration.\n"
    )

    with gr.Row():
        file_in = gr.File(label="Upload file (.txt/.md/.pdf)")
        goal = gr.Textbox(
            label="Your goal (more detail β†’ better results)",
            value="Generate an illustration and a narration that matches the spirit of this text",
            lines=2,
        )

    with gr.Row():
        api_key_in = gr.Textbox(
            label="OpenAI API key",
            placeholder="sk-...",
            type="password",
        )
        topk = gr.Slider(1, 12, value=6, step=1, label="Top-K retrieved chunks")
        model_llm = gr.Dropdown(
            choices=["gpt-5-nano"],
            value="gpt-5-nano",
            label="LLM model",
        )

    run_btn = gr.Button("Run", variant="primary")

    out_img = gr.Image(label="Generated image")
    out_audio = gr.Audio(label="Generated audio", type="filepath")  # served from the saved narration file
    out_json = gr.JSON(label="Spec & Metadata")
    out_raw = gr.Code(label="Raw JSON (debug)", language="json")

    run_btn.click(
        ui_pipeline,
        inputs=[file_in, api_key_in, goal, topk, model_llm],
        outputs=[out_img, out_audio, out_json, out_raw],
    )

if __name__ == "__main__":
    demo.launch()