# Imagine & Narrate — Streamlit app: image -> caption -> short story -> speech.
import io  # for creating in-memory binary streams
import re  # for regular expression utilities
import wave  # for writing WAV audio files

import numpy as np  # numerical operations, especially array handling
import streamlit as st  # Streamlit UI library
from PIL import Image  # Python Imaging Library for image loading
from transformers import pipeline  # Hugging Face inference pipelines
# 1) CACHE & LOAD MODELS (CPU only)
@st.cache_resource  # cache per server process: without this the model is re-downloaded/reloaded on every Streamlit rerun, defeating the stated intent of this section
def load_captioner():
    """Load (once) and return the BLIP image-captioning pipeline on CPU.

    Returns:
        transformers.Pipeline: an "image-to-text" pipeline.
    """
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=-1  # force CPU
    )
@st.cache_resource  # cache per server process so the model loads once, not on every rerun
def load_story_pipe():
    """Load (once) and return the FLAN-T5 text2text pipeline on CPU.

    Returns:
        transformers.Pipeline: a "text2text-generation" pipeline.
    """
    return pipeline(
        "text2text-generation",
        model="google/flan-t5-base",
        device=-1  # force CPU
    )
@st.cache_resource  # cache per server process so the model loads once, not on every rerun
def load_tts_pipe():
    """Load (once) and return the MMS English text-to-speech pipeline on CPU.

    Returns:
        transformers.Pipeline: a "text-to-speech" pipeline.
    """
    return pipeline(
        "text-to-speech",
        model="facebook/mms-tts-eng",
        device=-1  # force CPU
    )
# 2) HELPER FUNCTIONS
def sentence_case(text: str) -> str:
    """Capitalize the first letter of every sentence in *text*.

    Sentences are delimited by '.', '!' or '?'. All runs of whitespace
    in the result are collapsed to single spaces.
    """
    # re.split with a capturing group alternates text / delimiter chunks.
    chunks = re.split(r'([.!?])', text)
    pieces = []
    # Walk the (sentence, terminator) pairs in lockstep.
    for body, stop in zip(chunks[0::2], chunks[1::2]):
        body = body.strip()
        if body:
            pieces.append(body[0].upper() + body[1:] + stop)
    # An odd chunk count means there is trailing text with no terminator.
    if len(chunks) % 2:
        tail = chunks[-1].strip()
        if tail:
            pieces.append(tail[0].upper() + tail[1:])
    # Join, then squeeze any internal whitespace runs.
    return " ".join(" ".join(pieces).split())
def caption_image(img: Image.Image, captioner) -> str:
    """Run the captioning pipeline on *img* and return the caption text.

    The image is converted to RGB first when needed; returns "" when the
    pipeline produces no results.
    """
    rgb = img if img.mode == "RGB" else img.convert("RGB")
    outputs = captioner(rgb)
    if not outputs:
        return ""
    return outputs[0].get("generated_text", "")
def story_from_caption(caption: str, pipe) -> str:
    """Expand an image *caption* into a short (~100 word) story via *pipe*.

    Returns a fallback message for an empty caption. The raw model output
    is cleaned up: any echo of the prompt is stripped, the text is trimmed
    to the last complete sentence (or ellipsized when there is none), and
    the result is sentence-cased.
    """
    if not caption:
        return "Could not generate a story without a caption."
    prompt = f"Write a vivid, imaginative ~100-word story about this scene: {caption}\n\nWrite a creative and descriptive short story."
    generated = pipe(
        prompt,
        max_length=120,
        min_length=60,
        do_sample=True,
        top_k=100,
        top_p=0.9,
        temperature=0.8,
        repetition_penalty=1.1,
        no_repeat_ngram_size=4,
        early_stopping=False
    )
    text = generated[0]["generated_text"].strip()
    # Some seq2seq checkpoints echo the prompt back -- strip it if present.
    text = re.sub(re.escape(prompt), "", text, flags=re.IGNORECASE).strip()
    # Cut at the last sentence terminator so we never end mid-sentence.
    cut = max(text.rfind(ch) for ch in ".!?")
    if cut != -1:
        text = text[:cut + 1]
    elif len(text) > 80:
        # No terminator at all: break at a late word boundary and ellipsize.
        space = text.rfind(" ")
        text = text[:space if space > 60 else 80] + "..."
    return sentence_case(text)
def tts_bytes(text: str, tts_pipe) -> bytes:
    """Synthesize *text* to speech and return it as 16-bit PCM WAV bytes.

    The text is lightly normalized before synthesis: surrounding quotes are
    removed, runs of dots collapsed, a terminal '.' ensured, and whitespace
    squeezed. Returns b"" when there is nothing to speak or the pipeline
    yields no usable audio, so callers can treat falsy as "no audio".
    """
    if not text:
        return b""
    cleaned = re.sub(r'^["\']|["\']$', '', text).strip()
    cleaned = re.sub(r'\.{2,}', '.', cleaned).replace('…', '...')
    if not cleaned:
        # Input was only quotes/whitespace; cleaned[-1] below would raise
        # IndexError -- treat it like any other failure and bail out.
        return b""
    if cleaned[-1] not in ".!?":
        cleaned += "."
    cleaned = " ".join(cleaned.split())
    output = tts_pipe(cleaned)
    result = output[0] if isinstance(output, list) else output
    audio_array = result.get("audio")
    rate = result.get("sampling_rate")
    if audio_array is None or rate is None:
        return b""
    # Normalize to (frames, channels) for the wave writer.
    if audio_array.ndim == 1:
        data = audio_array[:, np.newaxis]
    else:
        data = audio_array.T
    # Clip before scaling: samples slightly outside [-1, 1] would otherwise
    # wrap around when cast to int16 and produce loud clicks.
    pcm = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)
    buf = io.BytesIO()
    # Context manager guarantees the WAV header is finalized and the
    # writer closed even if writeframes raises.
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(data.shape[1])
        wf.setsampwidth(2)  # 2 bytes == 16-bit PCM
        wf.setframerate(rate)
        wf.writeframes(pcm.tobytes())
    return buf.getvalue()
# 3) STREAMLIT USER INTERFACE
# Flat script body: Streamlit re-executes this file top-to-bottom on every
# user interaction; st.stop() calls below end the current run early.
st.set_page_config(page_title="✨ Imagine & Narrate", page_icon="✨", layout="centered")
# Persist upload across reruns
if "uploaded_file" not in st.session_state:
    st.session_state.uploaded_file = None
new_upload = st.file_uploader(
    "Choose an image file",
    type=["jpg", "jpeg", "png"]
)
if new_upload is not None:
    # Remember the latest upload in session state so subsequent reruns
    # (triggered by other widget interactions) keep the same image.
    st.session_state.uploaded_file = new_upload
if st.session_state.uploaded_file is None:
    # Nothing uploaded yet: show the landing prompt and end this run.
    st.title("✨ Imagine & Narrate")
    st.info("➡️ Upload an image above to start the magic!")
    st.stop()
uploaded = st.session_state.uploaded_file
try:
    # PIL raises (e.g. UnidentifiedImageError) on corrupt/unsupported files.
    img = Image.open(uploaded)
except Exception as e:
    st.error(f"Could not load the image: {e}")
    st.stop()
st.title("✨ Imagine & Narrate")
st.subheader("📸 Your Visual Input")
st.image(img, caption=uploaded.name, use_container_width=True)
st.divider()
# Step 1: Generate Caption
st.subheader("🧠 Generating Caption")
with st.spinner("Analyzing image..."):
    captioner = load_captioner()
    raw_caption = caption_image(img, captioner)
if not raw_caption:
    st.error("Failed to generate caption.")
    st.stop()
caption = sentence_case(raw_caption)
st.markdown(f"**Identified Scene:** {caption}")
st.divider()
# Step 2: Generate Story
st.subheader("📖 Crafting a Story")
with st.spinner("Writing story..."):
    story_pipe = load_story_pipe()
    story = story_from_caption(caption, story_pipe)
if not story or story.strip() in {".", "..", "..."}:
    # Guard against degenerate generations that survive cleanup
    # (punctuation-only output from the story pipeline).
    st.error("Failed to generate story.")
    st.stop()
st.write(story)
st.divider()
# Step 3: Synthesize Audio
st.subheader("👂 Hear the Story")
with st.spinner("Synthesizing audio..."):
    tts_pipe = load_tts_pipe()
    audio_bytes = tts_bytes(story, tts_pipe)
if not audio_bytes:
    # Audio is best-effort: warn, but the story above is still shown.
    st.warning("Audio generation failed.")
else:
    st.audio(audio_bytes, format="audio/wav")
st.balloons()