# doc_alive / app.py
# Author: slxhere — commit 5c9f0d9 "Add audio generation"
import os, json
from pathlib import Path
from typing import Dict, Any
import gradio as gr
from rag.extract_text import extract_text
from rag.rag import OpenAIEmbedRAG
from llm.call_llm import call_llm_structured
from generation.gen_img import generate_image_with_openai_from_llm_spec
from generation.gen_audio import generate_audio_with_openai_from_llm_spec
# ---------- Output directory ----------
# All generated images/audio are written under ./outputs, relative to the CWD.
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)  # idempotent — safe across restarts
# ---------- Helpers ----------
def hits_to_snippets_json(hits, raw_fallback: str, limit: int = 500) -> str:
    """Serialize retrieved chunks to a compact JSON string.

    Non-dict hits and hits whose "text" is missing/empty are dropped; each
    surviving hit becomes {"id", "excerpt"} with the excerpt truncated to
    *limit* characters. If nothing survives, a truncated slice of
    *raw_fallback* is emitted as a single snippet with id 0 (or "[]" when
    the fallback is empty too).
    """
    snippets = [
        {"id": hit.get("id", idx), "excerpt": hit["text"][:limit]}
        for idx, hit in enumerate(hits or [])
        if isinstance(hit, dict) and hit.get("text")
    ]
    if not snippets:
        fallback = (raw_fallback or "")[:limit]
        if fallback:
            snippets = [{"id": 0, "excerpt": fallback}]
    return json.dumps(snippets, ensure_ascii=False)
# ---------- Core pipeline ----------
def run_pipeline(
    file_path: str,
    openai_api_key: str,
    user_goal: str = "Generate an image and a narration audio that reflects the essence of the text",
    topk: int = 6,
    llm_model: str = "gpt-5-nano",
):
    """
    Run the full document -> image + audio pipeline.

    Steps: extract text from the file, retrieve the most relevant chunks via
    embeddings, ask the LLM for a structured image/audio spec, then generate
    both assets with that spec.

    Args:
        file_path: Path to the uploaded .txt/.md/.pdf file.
        openai_api_key: Caller-supplied OpenAI key (used in memory only).
        user_goal: Retrieval query and generation instruction for the LLM.
        topk: Number of retrieved chunks handed to the LLM.
        llm_model: Chat model used to craft the prompts.

    Returns:
        Tuple of (image object, audio bytes, pretty-meta dict, raw JSON string).

    Raises:
        ValueError: If the API key is missing or blank.

    SECURITY NOTE: We do not write the API key to disk or include it in any outputs.
    """
    if not openai_api_key or not openai_api_key.strip():
        raise ValueError("OpenAI API key is required.")
    # Fix: strip pasted whitespace/newlines so the clean key reaches the API —
    # previously only the validation used .strip(), not the key itself.
    openai_api_key = openai_api_key.strip()

    # 1) Extract text
    raw = extract_text(file_path)

    # 2) RAG (embeddings + search)
    rag = OpenAIEmbedRAG(model="text-embedding-3-small", openai_key=openai_api_key)
    rag.build(raw)
    hits = rag.search(user_goal, topk=topk)
    snippets_json = hits_to_snippets_json(hits, raw)

    # 3) LLM → structured JSON (image/audio/debug)
    system_prompt = """
You are a prompt engineer for **visual** and **audio** generation.
Return a JSON object strictly matching this schema:
{
"image": {
"prompt": "string, a detailed description of what the image should show",
"negative_prompt": "string, optional description of what to avoid",
"style": ["string", ...], // optional styles like "cinematic", "oil painting"
"width": int, // optional, default 1024
"height": int // optional, default 1024
},
"audio": {
"text": "string, the exact narration script to be read aloud, written in natural spoken language, only contains the script itself without extra beginning",
"voice": "string, choose from [alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, verse], default 'alloy', should match the atmosphere of the text",
"speed": float, // optional, default 1.0
},
"debug": {
"reasoning": "string, brief reasoning why you designed the prompts this way"
}
}
Rules:
- Always output valid JSON only, no explanations outside JSON.
- The narration text (`audio.text`) must be **rewritten into a smooth script** that could be directly read aloud,
not raw excerpts.
- All places in JSON must be filled with valid content, do not leave any entry empty!
- Keep the image prompt concise but vivid.
""".strip()
    user_prompt = f"""
Goal: {user_goal}
Below are the most relevant retrieved excerpts (with source IDs):
{snippets_json}
Now produce the JSON object strictly following the schema.
""".strip()
    spec = call_llm_structured(
        system_prompt,
        user_prompt,
        model=llm_model,
        openai_key=openai_api_key,
    )
    # Fix: removed stray `print(spec)` debug leftover — the spec is already
    # surfaced to the UI via `pretty`, and printing it spams server logs.

    # 4) Image generation
    result_img = generate_image_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    img_obj = result_img["image"]

    # 5) Audio generation
    result_audio = generate_audio_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    audio_bytes = result_audio["audio_bytes"]

    # 6) Pretty meta for UI
    pretty = {
        "spec": spec,
        "used_chunks_preview": json.loads(snippets_json),
    }
    raw_json = json.dumps(pretty, ensure_ascii=False, indent=2)
    return img_obj, audio_bytes, pretty, raw_json
# ---------- Gradio UI ----------
def ui_pipeline(file, api_key, goal, topk, model_name):
    """Gradio callback: validate the upload, then delegate to run_pipeline.

    Returns the 4-tuple expected by the UI outputs; on any failure the
    error is surfaced as the JSON payload instead of crashing the app.
    """
    if file is None:
        return None, None, {"error": "Please upload a file (.txt/.md/.pdf)."}, ""
    try:
        outputs = run_pipeline(
            file_path=file.name,
            openai_api_key=api_key,
            user_goal=goal,
            topk=int(topk),
            llm_model=model_name,
        )
    except Exception as exc:  # UI boundary: show the error, keep the app alive
        return None, None, {"error": str(exc)}, ""
    return outputs
# ---------- Gradio UI ----------
# Layout: two rows of inputs (file + goal, then key + retrieval/model knobs),
# a run button, and four outputs wired to ui_pipeline's 4-tuple return.
with gr.Blocks(title="File → (RAG + LLM) → Prompts → Image+Audio") as demo:
    gr.Markdown(
        "# 📦→🧠→🎨+🔊 Generate illustration and narration for your documents\n"
        "**Bring your own OpenAI API key. We do not store your key; it is only used in memory for this run.**\n\n"
        "- Upload a text/PDF, retrieve with embeddings, let the LLM craft prompts.\n"
        "- Generate both an image **and** an audio narration.\n"
    )
    with gr.Row():
        file_in = gr.File(label="Upload file (.txt/.md/.pdf)")
        goal = gr.Textbox(
            label="Your goal (more detail → better results)",
            value="Generate an illustration and a narration that matches the spirit of this text",
            lines=2,
        )
    with gr.Row():
        # type="password" masks the key in the browser; it is never persisted.
        api_key_in = gr.Textbox(
            label="OpenAI API key",
            placeholder="sk-...",
            type="password",
        )
        topk = gr.Slider(1, 12, value=6, step=1, label="Top-K retrieved chunks")
        model_llm = gr.Dropdown(
            choices=["gpt-5-nano"],
            value="gpt-5-nano",
            label="LLM model",
        )
    run_btn = gr.Button("Run", variant="primary")
    out_img = gr.Image(label="Generated image")
    out_audio = gr.Audio(label="Generated audio", type="numpy")  # ⚡ switched to numpy
    out_json = gr.JSON(label="Spec & Metadata")
    out_raw = gr.Code(label="Raw JSON (debug)", language="json")
    # inputs/outputs order must match ui_pipeline's signature and return tuple.
    run_btn.click(
        ui_pipeline,
        inputs=[file_in, api_key_in, goal, topk, model_llm],
        outputs=[out_img, out_audio, out_json, out_raw],
    )

if __name__ == "__main__":
    demo.launch()