# voice2comic / app.py
# fizzarif7's picture
# Update app.py
# e306d8d verified
import os
import gradio as gr
from PIL import Image
from io import BytesIO
from dotenv import load_dotenv
from gtts import gTTS
import tempfile
import traceback
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image as RLImage
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
import re
import google.generativeai as genai
# Read the Gemini API key from the environment; load_dotenv() is called first
# so a local .env file can supply it.
load_dotenv()
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise EnvironmentError("Missing API keys. Check GOOGLE_API_KEY in .env.")

# Register the key with the SDK, then build one model handle for text
# requests and one for requests that should return images.
genai.configure(api_key=api_key)
text_model = genai.GenerativeModel("gemini-1.5-flash")
image_model = genai.GenerativeModel(
    model_name="gemini-2.0-flash-preview-image-generation",
    generation_config=dict(response_modalities=["TEXT", "IMAGE"]),
)
# -------------------- Utility Functions --------------------
def generate_image_from_text(prompt):
    """Ask the image model for a picture of *prompt*.

    Returns the first image part of the first candidate as a PIL image, or
    ``None`` when the response holds no image part or anything goes wrong
    (the error is printed, not raised).
    """
    try:
        reply = image_model.generate_content(prompt)
        image_parts = (
            piece
            for piece in reply.candidates[0].content.parts
            if hasattr(piece, "inline_data")
            and piece.inline_data.mime_type.startswith("image/")
        )
        first_image = next(image_parts, None)
        if first_image is not None:
            return Image.open(BytesIO(first_image.inline_data.data))
    except Exception as err:
        print("Image generation error:", err)
    return None
def summarize_scene(scene_text):
    """Return a one-sentence summary of *scene_text* produced by the text model.

    Any failure is printed and the fixed string ``"Summary unavailable."`` is
    returned instead.
    """
    try:
        request = f"Summarize this scene in one sentence: {scene_text}"
        reply = text_model.generate_content(request)
        summary = reply.text.strip()
    except Exception as err:
        print("Summary error:", err)
        return "Summary unavailable."
    else:
        return summary
def explain_scene(image):
    """Return a detailed textual explanation of *image* from the text model.

    Args:
        image: The image object to describe; it is sent to the model together
            with a fixed instruction.

    Returns:
        The stripped model response, or ``"Explanation unavailable."`` when no
        image is given or the request fails.
    """
    # Nothing to describe: skip the model call entirely.
    if image is None:
        return "Explanation unavailable."
    try:
        response = text_model.generate_content([image, "Explain this image scene in detail."])
        return response.text.strip()
    except Exception as e:
        # Catch Exception (not a bare except) so SystemExit/KeyboardInterrupt
        # still propagate, and log the cause like the sibling helpers do.
        print("Explanation error:", e)
        return "Explanation unavailable."
def text_to_speech(text):
    """Convert *text* to speech and return the path of the generated MP3.

    Args:
        text: The text to read aloud.

    Returns:
        Path of a temporary ``.mp3`` file, or ``None`` when *text* is blank or
        synthesis fails (the error is printed, not raised).
    """
    # Blank input cannot be synthesised; bail out before touching the disk.
    if not text or not text.strip():
        return None

    audio_path = None
    try:
        tts = gTTS(text)
        # Only reserve a unique path here; the handle is closed before gTTS
        # writes to it so the file is never open twice.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        print("TTS error:", e)
        # Do not leave an empty or partial file behind on failure.
        if audio_path is not None:
            try:
                os.remove(audio_path)
            except OSError:
                pass
        return None
def generate_pdf(images, explanations, title="AI-Generated Story Scenes"):
    """Build a PDF with one image and one explanation per scene.

    Args:
        images: Sequence of image objects exposing ``save(path)``; falsy
            entries are skipped.
        explanations: Sequence of explanation strings, indexed like *images*.
            A missing entry is rendered as an empty paragraph.
        title: Heading placed at the top of the document.

    Returns:
        Path of the temporary ``.pdf`` file that was written.
    """
    from xml.sax.saxutils import escape  # stdlib; imported locally to keep the block self-contained

    def _as_markup(text):
        # Paragraph parses its text as markup, so literal '<' and '&' must be
        # escaped; newlines become explicit breaks so they survive rendering.
        return escape(text or "").replace("\n", "<br/>")

    # Reserve the output path and close the handle before ReportLab opens it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        pdf_path = tmp.name

    doc = SimpleDocTemplate(pdf_path, pagesize=letter)
    styles = getSampleStyleSheet()
    story = [Paragraph(escape(title or ""), styles["Title"]), Spacer(1, 12)]

    image_paths = []
    try:
        for i, img in enumerate(images):
            if not img:
                continue
            with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as img_tmp:
                image_path = img_tmp.name
            image_paths.append(image_path)
            img.save(image_path)
            explanation = explanations[i] if i < len(explanations) else ""
            story += [
                RLImage(image_path, width=400, height=300),
                Spacer(1, 12),
                Paragraph(f"Scene {i + 1} Explanation", styles["Heading3"]),
                Paragraph(_as_markup(explanation), styles["BodyText"]),
                Spacer(1, 24),
            ]
        doc.build(story)
    finally:
        # The PNG files are only needed while the document is being built.
        for path in image_paths:
            try:
                os.remove(path)
            except OSError:
                pass
    return pdf_path
def reset_fields():
    """Return the blank values for the six scene-description inputs.

    Five empty strings followed by the default background style.
    """
    cleared_text_fields = ("",) * 5
    default_background = "Fantasy"
    return (*cleared_text_fields, default_background)
def recreate_scene_handler(
        num_scenes, theme, char_count, character_names, dialogue,
        dialogue_speaker, char_styles, char_moods, bg_style,
        images, summaries, explanations, scene_number_to_recreate):
    """Regenerate an already-generated scene in place.

    Args:
        num_scenes .. bg_style: Scene description values, forwarded unchanged
            to ``generate_scene``.
        images, summaries, explanations: Per-scene result lists.
        scene_number_to_recreate: 1-based number of the scene to regenerate.

    Returns:
        The 8-tuple produced by ``generate_scene`` in recreate mode, or, when
        the scene number is missing, non-numeric or out of range, an 8-tuple
        carrying an error message and the unchanged lists.
    """
    try:
        index = int(scene_number_to_recreate) - 1
    except (TypeError, ValueError):
        # An emptied number field arrives as None; treat it as "no such scene"
        # instead of crashing the handler.
        index = -1

    if index < 0 or index >= len(images):
        return (
            None, "", f"❌ Invalid input. You have not generated Scene {scene_number_to_recreate} yet.",
            images, summaries, explanations,
            f"⚠️ Scene {scene_number_to_recreate} is not available. Generate it first.", gr.update(visible=False)
        )

    return generate_scene(
        num_scenes, theme, char_count, character_names, dialogue,
        dialogue_speaker, char_styles, char_moods, bg_style,
        images, summaries, explanations,
        recreate_mode=True, current_index=index
    )
# -------------------- Core Logic --------------------
def generate_scene(num_scenes, theme, char_count, character_names, dialogue,
                   dialogue_speaker, char_styles, char_moods, bg_style,
                   images, summaries, explanations, recreate_mode=False, current_index=0):
    """Generate one scene (image, summary, explanation) and store it.

    Args:
        num_scenes: Total number of scenes planned for the story.
        theme, char_count, character_names, dialogue, dialogue_speaker,
        char_styles, char_moods, bg_style: Values interpolated into the image
            prompt. The speech-bubble instructions are added only when
            *dialogue* is non-empty.
        images, summaries, explanations: Per-scene result lists; they are
            modified in place and also returned.
        recreate_mode: When true, overwrite the entry at *current_index*
            instead of appending a new scene.
        current_index: 0-based scene position. It is honoured in recreate
            mode; when appending, the position is always the next free slot
            (``len(images)``) so the scene number in the prompt and status
            matches where the scene is stored.

    Returns:
        An 8-tuple ``(image, summary, explanation, images, summaries,
        explanations, status_text, done_button_update)``. When appending and
        every planned scene already exists, the first three items are no-op
        ``gr.update()`` values and no model call is made.

    Raises:
        TypeError, ValueError: If *num_scenes* cannot be converted to ``int``.
        IndexError: In recreate mode, if *current_index* is out of range.
    """
    total_scenes = int(num_scenes)

    if not recreate_mode:
        # Check the limit BEFORE calling the models, so a click on a finished
        # story does not spend three API requests whose results are discarded.
        if len(images) >= total_scenes:
            return (
                gr.update(), gr.update(), gr.update(), images, summaries, explanations,
                f"βœ… All {num_scenes} scenes have been generated.", gr.update(visible=True)
            )
        current_index = len(images)

    prompt = (
        f"A {bg_style}-style illustration for Scene {current_index + 1} with {char_count} characters in a '{theme}' setting. "
        f"Characters: {character_names}. They are dressed as: {char_styles}. Current mood: {char_moods}. "
    )
    if dialogue:
        prompt += f'The character "{dialogue_speaker}" says: "{dialogue}". Display this in a speech bubble.'
        prompt += f' Please visualize this as a speech bubble above {dialogue_speaker}, like in a cartoon.'

    image = generate_image_from_text(prompt)
    summary = summarize_scene(prompt)
    explanation = explain_scene(image) if image else "Explanation unavailable."

    if recreate_mode:
        images[current_index] = image
        summaries[current_index] = summary
        explanations[current_index] = explanation
    else:
        images.append(image)
        summaries.append(summary)
        explanations.append(explanation)

    status = f"βœ… Scene {current_index + 1} {'recreated' if recreate_mode else 'generated'}."
    # The "Done" button becomes visible once the planned number of scenes exists.
    done_visible = len(images) == total_scenes
    return image, summary, explanation, images, summaries, explanations, status, gr.update(visible=done_visible)
def finalize_story(images, explanations, title):
    """Export the finished story as a text file and a PDF.

    Returns ``(txt_path, pdf_path)``, or ``(None, None)`` when either the
    image list or the explanation list is empty.
    """
    if not (images and explanations):
        return None, None

    pdf_path = generate_pdf(images, explanations, title)

    export = tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", mode="w", encoding="utf-8"
    )
    with export as handle:
        for number, text in enumerate(explanations, start=1):
            handle.write(f"Scene {number}:\n{text}\n\n")
        text_path = handle.name
    return text_path, pdf_path
def ai_write_scene(theme, total_scenes, scene_summaries):
    """Let the text model draft the inputs for the next scene.

    Returns a 7-tuple: character names, dialogue, speaker, outfits, moods,
    background style, and a status message. The six field values are taken,
    in order, from the non-empty lines of the model reply (with any leading
    ``"N. "`` numbering removed); missing lines fall back to ``""`` and the
    background style to ``"Fantasy"``. When all scenes are already written,
    or on any error, the blank field values are returned with a matching
    status message.
    """
    try:
        next_scene = len(scene_summaries) + 1
        if next_scene > int(total_scenes):
            return "", "", "", "", "", "Fantasy", f"βœ… All {total_scenes} scenes completed."

        if scene_summaries:
            recap = "\n".join(
                f"Scene {number}: {text}"
                for number, text in enumerate(scene_summaries, start=1)
            )
        else:
            recap = ""

        prompt = f"""
You are co-writing a story set in this theme: "{theme}".
Generate Scene {next_scene} of {total_scenes}.
Continue the story logically based on previous scenes (if any):
{recap}
Return:
1. Character names (existing or new),
2. A single dialogue line,
3. The speaker of that dialogue,
4. Outfits worn,
5. Characters' emotional moods,
6. Background style (choose from: Realistic, Cartoon, Fantasy, Dark Fantasy).
Only provide the raw values, no headers.
"""
        reply = text_model.generate_content(prompt).text.strip()

        # Keep non-blank lines only and drop a leading "N. " list marker.
        values = []
        for raw_line in reply.split("\n"):
            line = raw_line.strip()
            if line:
                values.append(re.sub(r"^\d+\.\s*", "", line))

        # Pad with per-field defaults when the reply has fewer than six lines.
        defaults = ["", "", "", "", "", "Fantasy"]
        fields = values[:6] + defaults[len(values):]
        char_names, dialogue, speaker, outfits, moods, bg_style = fields

        ready_message = f"πŸ“ Scene {next_scene} ready to generate."
        return char_names, dialogue, speaker, outfits, moods, bg_style, ready_message
    except Exception as err:
        print("AI write error:", err)
        return "", "", "", "", "", "Fantasy", "⚠️ AI scene generation failed."
# -------------------- UI --------------------
# Gradio UI: page styling, input widgets, output widgets and event wiring.
# CSS fixes relative to the previous revision: the invalid colour "##b2d8d8"
# and the invalid "padding: left 12px" declaration in the .gr-audio rule are
# corrected, and the non-existent "font-color" property is dropped from
# "#accordion h2" (that rule already sets "color").
with gr.Blocks(
    title="Comic Creator", css="""
body {
background-color: #ffffff;
}
.gradio-container {
max-width: 1000px;
margin: 2rem auto;
padding: 32px;
background: #d5f2ee;
border-radius: 20px;
box-shadow: 0 10px 40px rgba(0, 0, 0, 0.08);
font-family: 'Segoe UI', sans-serif;
}
.gr-markdown h2, .gr-markdown h3 {
color: #2e4053;
}
.gr-button {
border-radius: 10px;
font-weight: bold;
padding: 12px 24px;
transition: all 0.4s ease;
box-shadow: 0 4px 10px rgba(0,0,0,0.15);
}
#generate-btn,
#ai-write-btn,
#recreate-btn,
#reset-btn,
#done-btn,
#recreate-scene-index {
background-color: #66b2b2;
color: white;
border: 1px solid #000000;
border-radius:10px;
margin:12px;
}
#generate-btn, #ai-write-btn{
display: flex;
justify-content: center;
gap: 10px;
}
#recreate-btn,#reset-btn, #done-btn {
display: flex;
justify-content: center;
gap: 10px;
}
#tts-btn{
background-color: #66b2b2;
color: white;
border: 1px solid #000000;
border-radius:10px;
width:700px;
height:80px;
}
#generate-btn:hover,
#ai-write-btn:hover,
#recreate-btn:hover,
#reset-btn:hover,
#done-btn:hover,
#tts-btn:hover{
background: #008080 ;
transform: scale(1.05);
color: #000;
cursor: pointer;
border-radius:10px;
}
input, textarea, select {
border-radius: 8px ;
border: 1px solid #004c4c;
padding: 10px !important;
background-color: #d5f2ee;
box-shadow: inset 0 1px 3px rgba(0,0,0,0.05);
transition: all 0.2s ease-in-out;
}
input:focus, textarea:focus, select:focus {
border-color: #004c4c !important;
box-shadow: 0 0 6px rgba(255, 105, 180, 0.3);
outline: none;
}
.custom-rows{
background: #e1f5f5;
padding: 10px;
border-radius: 8px;
}
/* πŸŽ† Scene Image Border */
.gr-image img {
border: 4px dashed #004c4c;
border-radius: 16px;
padding: 4px;
}
.gr-accordion {
background-color: #d4a373;
border: 1px solid #004c4c;
border-radius: 12px;
padding: 12px;
margin-bottom: 12px;
}
.gr-accordion .gr-box {
background-color: #d4a373;
border-radius: 10px;
padding: 12px;
}
.gr-image, .gr-audio, .gr-file {
border: 1px solid #d6eaf8;
border-radius: 12px;
background-color: #d1e8e8;
padding: 12px;
}
.animated-title {
font-size: 3rem;
font-weight: bold;
text-align: center;
color: #147d5a;
animation: float 3s ease-in-out infinite;
text-shadow: 2px 2px 4px rgba(0,0,0,0.2);
margin-bottom: 1.5rem;
font-family: 'Comic Sans MS', cursive, sans-serif;
}
#accordion {
background: linear-gradient(to right, #ffffff, #66b2b2);
border: 2px solid #004c4c;
border-radius: 12px;
padding: 16px;
box-shadow: 0 4px 12px rgba(0,0,0,0.1);
font-family: 'Segoe UI', sans-serif;
transition: all 0.3s ease-in-out;
}
#accordion:hover {
box-shadow: 0 6px 20px rgba(0,0,0,0.2);
transform: scale(1.01);
}
#accordion h2 {
color: #343a40;
font-weight: 600;
font-size: 1.2rem;
}
#accordion1{
background: linear-gradient(to right, #ffffff, #66b2b2);
border: 2px solid #004c4c;
border-radius: 12px;
padding: 16px;
box-shadow: 0 4px 12px rgba(0,0,0,0.1);
font-family: 'Segoe UI', sans-serif;
transition: all 0.3s ease-in-out;
}
#accordion1:hover {
box-shadow: 0 6px 20px rgba(0,0,0,0.2);
transform: scale(1.01);
}
#accordion1 h2 {
color: #343a40;
font-weight: 600;
font-size: 1.2rem;
}
#accordion2{
background: linear-gradient(to right, #ffffff, #66b2b2);
border: 2px solid #004c4c;
border-radius: 12px;
padding: 16px;
box-shadow: 0 4px 12px rgba(0,0,0,0.1);
font-family: 'Segoe UI', sans-serif;
transition: all 0.3s ease-in-out;
}
#accordion2:hover {
box-shadow: 0 6px 20px rgba(0,0,0,0.2);
transform: scale(1.01);
}
#accordion2 h2 {
color: #343a40;
font-weight: 600;
font-size: 1.2rem;
}
#custom-dropdown select {
background-color: #d5f2ee;
border: 2px solid #008080;
border-radius: 10px;
padding: 10px 12px;
font-size: 1rem;
color: #004c4c;
font-weight: 600;
transition: all 0.3s ease-in-out;
box-shadow: inset 0 1px 3px rgba(0,0,0,0.08);
}
#custom-dropdown select:focus {
outline: none;
border-color: #00a3a3;
box-shadow: 0 0 8px rgba(0, 163, 163, 0.3);
background-color: #d5f2ee;
}
#custom-dropdown label {
font-weight: bold;
color: #2e4053;
margin-bottom: 6px;
}
#output {
background: #fff0f5;
border: 2px dashed #004c4c;
border-radius: 14px;
padding: 16px;
font-size: 1.1rem;
color: #4a235a;
animation: floatX 4s ease-in-out infinite;
box-shadow: 0 4px 8px rgba(214, 51, 132, 0.15);
transition: all 0.3s ease-in-out;
}
#output:hover {
background: #ffe0ec;
transform: scale(1.01);
cursor: default;
}
@keyframes float {
0% { transform: translateY(0px); }
50% { transform: translateY(-10px); }
100% { transform: translateY(0px); }
}
@keyframes floatX {
0% { transform: translateX(0); }
50% { transform: translateX(15px); }
100% { transform: translateX(0); }
}
@keyframes pulseColor {
0% { color: #ff4081; }
50% { color: #7e57c2; }
100% { color: #42a5f5; }
}
@keyframes backgroundMove {
0% { background-position: 0% 50%; }
100% { background-position: 100% 50%; }
}
@keyframes bounce {
0% { transform: scale(1); }
50% { transform: scale(1.1); }
100% { transform: scale(1); }
}
.gr-file label[for^=component-] {
background-color: #ffe6f0;
border: 2px solid #006666;
border-radius: 12px;
padding: 10px;
transition: all 0.3s ease-in-out;
box-shadow: 0 4px 10px rgba(255, 105, 180, 0.2);
font-weight: bold;
color: #4a235a;
font-size: 1rem;
text-align: center;
}
.gr-file label[for^=component-]:hover {
background-color: #f8bbd0;
transform: scale(1.03);
box-shadow: 0 6px 12px rgba(255, 105, 180, 0.3);
cursor: pointer;
}
.gr-audio {
border: 2px solid #006666;
background-color: #b2d8d8;
border-radius: 16px;
padding-left: 12px;
animation: pulseColor 3s infinite;
transition: transform 0.3s ease;
}
.gr-audio:hover {
transform: scale(1.02);
box-shadow: 0 6px 12px rgba(171, 71, 188, 0.2);
}
""") as demo:
    gr.Markdown('<h1 class="animated-title">🎬 Comic Generator</h1>')
    gr.Markdown("Describe your story one scene at a time, with AI-generated images, summaries, and explanations.\nPut your own GOOGLE API KEY (named as GOOGLE_API_KEY) in the SECRET VARIABLE.")

    # NOTE(review): several elem_id values below ("accordion", "textarea",
    # "input", "output") are reused on more than one component even though
    # HTML ids are meant to be unique; elem_classes would be the cleaner
    # choice, but that needs a matching change of the CSS selectors above.

    # ---- Story-wide settings ----
    with gr.Accordion("🧩 Story Setup", open=True, elem_id="accordion"):
        title = gr.Textbox(label="πŸ“– Story Title", placeholder="e.g. The Enchanted Forest", elem_id="textarea")
        scene_total = gr.Number(label="πŸ”’ Number of Scenes", precision=0, value=3, elem_id="input")
        theme = gr.Textbox(label="🌍 Global Theme", placeholder="e.g. A magical forest full of glowing creatures", elem_id="textarea")

    # ---- Per-scene description inputs ----
    gr.Markdown("### ✨ Describe Your Next Scene", elem_id="accordion1")
    with gr.Group(elem_id="accordion"):
        with gr.Row(elem_classes="custom-rows"):
            char_count = gr.Number(label="πŸ‘₯ Number of Characters", precision=0, value=2, elem_id="input")
            character_names = gr.Textbox(label="πŸ§™β€β™‚οΈ Character Names", elem_id="textarea")
        with gr.Row(elem_classes="custom-rows"):
            dialogue = gr.Textbox(label="πŸ’¬ Dialogue (optional)", placeholder="e.g. 'Protect the forest!'", elem_id="textarea")
            dialogue_speaker = gr.Textbox(label="πŸ—£οΈ Who says the dialogue?", placeholder="e.g. Bramble", elem_id="textarea")
        with gr.Row(elem_classes="custom-rows"):
            char_styles = gr.Textbox(label="πŸ§₯ Outfit Descriptions", placeholder="e.g. Elora wears a leafy cloak, Bramble has a warrior vest", elem_id="textarea")
            char_moods = gr.Textbox(label="😠 Character Moods", placeholder="e.g. Elora is cautious, Bramble is brave", elem_id="textarea")
        bg_style = gr.Textbox(label="🎨 Background Style", placeholder="e.g. Realistic, Cartoon, Fantasy, Dark Fantasy", elem_id="textarea")

    # ---- Action buttons ----
    with gr.Group(elem_id="accordion"):
        with gr.Row(elem_classes="custom-rows"):
            with gr.Row(elem_classes="custom-rows"):
                generate_btn = gr.Button("βž• Generate The Scene", elem_id="generate-btn")
                ai_coauthor_btn = gr.Button("πŸ€– Let AI Write This One", elem_id="ai-write-btn")
            with gr.Row(elem_classes="custom-rows"):
                recreate_btn = gr.Button("πŸ”„ Recreate The Scene", elem_id="recreate-btn")
                reset_btn = gr.Button("\n⏭️ Reset", elem_id="reset-btn")
        with gr.Row(elem_classes="custom-rows"):
            # The id uses hyphens so that it matches the "#recreate-scene-index"
            # selector in the stylesheet above.
            recreate_scene_index = gr.Number(label="πŸ”’ Scene Number to Recreate", precision=0, value=1, elem_id="recreate-scene-index")

    # ---- Outputs for the most recent scene ----
    status = gr.Markdown(elem_classes="gr-image")
    image_output = gr.Image(label="πŸ–ΌοΈ Scene Image", type="pil", elem_id="output")
    summary_output = gr.Markdown(label="πŸ“ Scene Summary", elem_id="output")
    explanation_output = gr.Textbox(label="πŸ“– Scene Explanation", lines=6, elem_id="output")

    with gr.Group(elem_id="accordion"):
        with gr.Row(elem_classes="custom-rows"):
            tts_btn = gr.Button("πŸ”Š Read Aloud", elem_id="tts-btn")
            tts_audio = gr.Audio(label="Audio", autoplay=False, elem_classes="gr-audio")

    # Hidden at start; the scene handlers return gr.update(visible=...) for it.
    done_btn = gr.Button("βœ… Done", visible=False, elem_id="done-btn")

    with gr.Group(elem_id="accordion2"):
        with gr.Row(elem_classes="custom-rows"):
            txt_file = gr.File(label="πŸ“„ Explanations (.txt)")
            pdf_file = gr.File(label="πŸ“˜ Scene PDF")

    # States
    scene_images = gr.State([])
    scene_explanations = gr.State([])
    scene_summaries = gr.State([])
    # Passed to generate_scene as current_index; no handler lists it as an
    # output, so its value stays 0.
    current_scene_index = gr.State(0)
    # Not referenced by any handler wired below.
    recreate_mode = gr.State(True)

    # ---- Event wiring ----
    generate_btn.click(
        fn=generate_scene,
        inputs=[
            scene_total, theme, char_count, character_names, dialogue,
            dialogue_speaker, char_styles, char_moods, bg_style,
            scene_images, scene_summaries, scene_explanations,
            gr.State(False), current_scene_index
        ],
        outputs=[
            image_output, summary_output, explanation_output,
            scene_images, scene_summaries, scene_explanations,
            status, done_btn
        ]
    )
    recreate_btn.click(
        fn=recreate_scene_handler,
        inputs=[
            scene_total, theme, char_count, character_names, dialogue,
            dialogue_speaker, char_styles, char_moods, bg_style,
            scene_images, scene_summaries, scene_explanations,
            recreate_scene_index
        ],
        outputs=[
            image_output, summary_output, explanation_output,
            scene_images, scene_summaries, scene_explanations,
            status, done_btn
        ]
    )
    ai_coauthor_btn.click(
        fn=ai_write_scene,
        inputs=[theme, scene_total, scene_summaries],
        outputs=[character_names, dialogue, dialogue_speaker, char_styles, char_moods, bg_style, status]
    )
    done_btn.click(
        fn=finalize_story,
        inputs=[scene_images, scene_explanations, title],
        outputs=[txt_file, pdf_file]
    )
    reset_btn.click(
        fn=reset_fields,
        inputs=[],
        outputs=[
            character_names,
            dialogue,
            dialogue_speaker,
            char_styles,
            char_moods,
            bg_style
        ]
    )
    tts_btn.click(
        fn=text_to_speech,
        inputs=[explanation_output],
        outputs=[tts_audio]
    )

demo.launch()