Spaces:

justREE
/

Assignment1

Sleeping

App Files Files Community

justREE committed on May 1, 2025

Commit

77c4802

verified ·

1 Parent(s): 936f674

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +208 -40

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,208 @@
-import altair as alt
-import numpy as np
-import pandas as pd
-import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+# src/streamlit_app.py
+import io                   # for creating in-memory binary streams
+import wave                 # for writing WAV audio files
+import re                   # for regular expression utilities
+import streamlit as st      # Streamlit UI library
+from transformers import pipeline  # Hugging Face inference pipelines
+from PIL import Image       # Python Imaging Library for image loading
+import numpy as np          # numerical operations, especially array handling
+# 1) CACHE & LOAD MODELS
+@st.cache_resource(show_spinner=False)
+def load_captioner():
+    # Builds the BLIP image-captioning pipeline. The st.cache_resource
+    # decorator keeps the built pipeline so it is loaded only once.
+    # Returns: a callable captioner(image: PIL.Image) -> List[Dict].
+    captioner = pipeline(
+        "image-to-text",
+        model="Salesforce/blip-image-captioning-base",
+        device="cpu",  # switch to "cuda" when a GPU is available
+    )
+    return captioner
+@st.cache_resource(show_spinner=False)
+def load_story_pipe():
+    # Builds the FLAN-T5 text-to-text pipeline used for story generation. The
+    # st.cache_resource decorator keeps the built pipeline so it is loaded
+    # only once.
+    # Returns: a callable story_pipe(prompt: str, **kwargs) -> List[Dict].
+    story_pipe = pipeline(
+        "text2text-generation",
+        model="google/flan-t5-base",
+        device="cpu",  # switch to "cuda" when a GPU is available
+    )
+    return story_pipe
+@st.cache_resource(show_spinner=False)
+def load_tts_pipe():
+    # Builds the Meta MMS-TTS text-to-speech pipeline. The st.cache_resource
+    # decorator keeps the built pipeline so it is loaded only once.
+    # Returns: a callable tts_pipe(text: str) whose result carries "audio"
+    # and "sampling_rate".
+    tts_pipe = pipeline(
+        "text-to-speech",
+        model="facebook/mms-tts-eng",
+        device="cpu",  # switch to "cuda" when a GPU is available
+    )
+    return tts_pipe
+# 2) HELPER FUNCTIONS
+def sentence_case(text: str) -> str:
+    # Normalises `text` to sentence case and returns the result.
+    #
+    # The text is split into sentences at runs of ., ! or ?. The first
+    # character of each sentence is upper-cased, whitespace inside and between
+    # sentences is collapsed to single spaces, and the sentences are rejoined.
+    #
+    # Only the first character of a sentence is changed (str.capitalize()
+    # would lower-case everything after it), so proper nouns and acronyms
+    # keep their casing. A run of punctuation such as "..." or "?!" stays
+    # attached to its sentence instead of being spread out as ". . .".
+    #
+    # Returns "" for empty or whitespace-only input. Every ., ! or ? counts
+    # as a sentence end, so decimals and abbreviations are split as well.
+    pieces = re.split(r'([.!?]+)', text)  # ["hello", ".", " world", "!", ""]
+    sentences = []
+    # With one capture group, even indexes hold sentence text and odd indexes
+    # the punctuation that follows; pad so the trailing text pairs with "".
+    for chunk, closer in zip(pieces[0::2], pieces[1::2] + [""]):
+        body = " ".join(chunk.split())
+        if body:
+            body = body[0].upper() + body[1:]
+        elif sentences:
+            # Punctuation with no text of its own (e.g. "Hi. !") belongs to
+            # the sentence before it.
+            sentences[-1] += closer
+            continue
+        if body or closer:
+            sentences.append(body + closer)
+    return " ".join(sentences)
+def caption_image(img: Image.Image, captioner) -> str:
+    # Runs `captioner` on a PIL image and returns the caption text of the
+    # first result. Returns "" when the captioner yields nothing or the first
+    # result has no "generated_text" entry.
+    outputs = captioner(img)
+    first = outputs[0] if outputs else {}
+    return first.get("generated_text", "")
+def story_from_caption(caption: str, pipe) -> str:
+    # Asks the text2text pipeline `pipe` for a ~100-word story about `caption`
+    # and returns it in sentence case.
+    #
+    # Returns "" when the pipeline yields no result or no text, mirroring
+    # caption_image, so the caller can report "no story" instead of crashing.
+    # Text after the last ., ! or ? is dropped so the story ends on a complete
+    # sentence. If the output has no sentence punctuation at all and is longer
+    # than 80 characters, it is cut to 80 characters and "..." is appended.
+    prompt = f"Write a vivid, imaginative ~100-word story about this scene: {caption}"
+    results = pipe(
+        prompt,
+        max_length=120,            # upper bound on generated tokens
+        min_length=80,             # minimum generated tokens
+        do_sample=True,            # enable sampling
+        top_k=100,                 # sample from top_k tokens
+        top_p=0.9,                 # nucleus sampling threshold
+        temperature=0.7,           # sampling temperature
+        repetition_penalty=1.1,    # discourage repetition
+        no_repeat_ngram_size=4,    # block repeated n-grams
+        early_stopping=False
+    )
+    if not results:
+        return ""
+    raw = results[0].get("generated_text", "").strip()
+    # Strip the prompt if the model echoes it back (case-insensitive match).
+    if raw.lower().startswith(prompt.lower()):
+        raw = raw[len(prompt):].strip()
+    # Index of the last sentence-ending mark, or -1 when there is none.
+    last_stop = max(raw.rfind(mark) for mark in ".!?")
+    if last_stop != -1:
+        return sentence_case(raw[:last_stop + 1])
+    if len(raw) > 80:
+        # Append the ellipsis after sentence_case: that helper treats every
+        # "." as a sentence delimiter and would otherwise split the "...".
+        return sentence_case(raw[:80]) + "..."
+    return sentence_case(raw)
+def tts_bytes(text: str, tts_pipe) -> bytes:
+    # Synthesises `text` with `tts_pipe` and returns the audio as WAV bytes
+    # (16-bit PCM).
+    #
+    # text:     the text to speak; it is cleaned before synthesis (see below).
+    # tts_pipe: callable taking the cleaned text and returning a dict, or a
+    #           list whose first item is a dict, with "audio" and
+    #           "sampling_rate" entries.
+    #
+    # Assumes "audio" holds float samples in [-1, 1], shaped (samples,) or
+    # (channels, samples) - TODO confirm against the TTS model in use.
+    # Exceptions raised by tts_pipe or the wave writer propagate to the caller.
+
+    # Trim first so a quote next to leading/trailing whitespace is still removed.
+    cleaned_text = re.sub(r'^["\']|["\']$', '', text.strip()).strip()
+    # Expand the ellipsis character BEFORE collapsing runs of periods;
+    # in the other order the "..." it expands to is left in the text.
+    cleaned_text = cleaned_text.replace('…', '...')
+    cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
+    # Add a period if the text doesn't end with punctuation (helps TTS model finalize)
+    if cleaned_text and cleaned_text[-1] not in '.!?':
+        cleaned_text += '.'
+    output = tts_pipe(cleaned_text)
+    # pipeline may return list or single dict
+    result = output[0] if isinstance(output, list) else output
+    audio_array = np.asarray(result["audio"], dtype=np.float32)
+    rate = result["sampling_rate"]
+    # Bring the audio to (samples, channels) so the bytes come out interleaved.
+    if audio_array.ndim == 1:
+        data = audio_array[:, np.newaxis]  # add channel dimension
+    else:
+        data = audio_array.T  # (channels, samples) -> (samples, channels)
+    # Clip before scaling: a sample outside [-1, 1] would otherwise wrap
+    # around in the int16 cast and come out as a loud click.
+    pcm = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)
+    buffer = io.BytesIO()
+    # The context manager closes the writer (finalising the WAV header) even
+    # if writing raises; it does not close `buffer`.
+    with wave.open(buffer, "wb") as wf:
+        wf.setnchannels(data.shape[1])         # number of channels
+        wf.setsampwidth(2)                     # 16 bits = 2 bytes
+        wf.setframerate(rate)                  # samples per second
+        wf.writeframes(pcm.tobytes())          # write PCM data
+    return buffer.getvalue()
+# 3) STREAMLIT USER INTERFACE
+# Page flow, top to bottom: upload an image -> caption it -> turn the caption
+# into a story -> synthesize the story as audio. Each stage renders its own
+# section and uses st.stop() to bail out when it has nothing to pass on.
+st.set_page_config(page_title="Imagine & Narrate", page_icon="✨", layout="centered")
+st.title("✨ Imagine & Narrate")
+st.write("Upload any image below to see AI imagine and narrate a story about it!")
+# -- Upload image widget --
+uploaded = st.file_uploader(
+    "Choose an image file",
+    type=["jpg", "jpeg", "png"]
+)
+# No upload yet: show a hint and rely on st.stop() to end the run here, so
+# none of the model stages below are reached.
+if not uploaded:
+    st.info("➡️ Upload an image above to start the magic!")
+    st.stop()
+# Load the uploaded file into a PIL Image
+# The code below uses `img`, so the st.stop() in the handler is what keeps a
+# failed load from reaching it.
+try:
+    img = Image.open(uploaded)
+except Exception as e:
+    st.error(f"Error loading image: {e}")
+    st.stop()
+# -- Step 1: Display the image --
+st.subheader("📸 Your Visual Input")
+st.image(img, use_container_width=True)
+st.divider()
+# -- Step 2: Generate and display caption --
+st.subheader("🧠 Generating Insights")
+with st.spinner("Scanning image for key elements…"):
+    captioner = load_captioner()
+    raw_caption = caption_image(img, captioner)
+    # caption_image returns "" when the model produced nothing usable.
+    if not raw_caption:
+         st.warning("Could not generate a caption for the image.")
+         st.stop()
+    caption = sentence_case(raw_caption)
+st.markdown(f"**Identified Scene:** {caption}")
+st.divider()
+# -- Step 3: Generate and display story --
+st.subheader("📖 Crafting a Narrative")
+with st.spinner("Writing a compelling story…"):
+    story_pipe = load_story_pipe()
+    story = story_from_caption(caption, story_pipe)
+    # NOTE(review): story_from_caption only appends "..." to an 80-character
+    # truncation and passes the result through sentence_case, so confirm the
+    # bare '...' sentinel below can actually occur.
+    if not story or story.strip() == '...': # Check for empty or minimal story
+         st.warning("Could not generate a meaningful story from the caption.")
+         st.stop()
+st.write(story)
+st.divider()
+# -- Step 4: Synthesize and play audio --
+st.subheader("👂 Hear the Story")
+with st.spinner("Synthesizing audio narration…"):
+    tts_pipe = load_tts_pipe()
+    # Broad catch on purpose: any failure while synthesizing or rendering the
+    # audio is shown as an error message; the caption and story shown above
+    # are unaffected.
+    try:
+        audio_bytes = tts_bytes(story, tts_pipe)
+        st.audio(audio_bytes, format="audio/wav")
+    except Exception as e:
+        st.error(f"Error generating audio: {e}")
+# Celebration animation
+# NOTE(review): this sits outside the try/except above and there is no
+# st.stop() in the handler, so it also runs when the audio step reported an
+# error - confirm that is intended.
+st.balloons()