pratik-250620 committed on
Commit
fb5a7e6
·
verified ·
1 Parent(s): 960dff6

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. app.py +163 -51
  2. src/embeddings/audio_embedder.py +13 -5
app.py CHANGED
@@ -5,7 +5,7 @@ Live demonstration of multimodal generation + coherence evaluation.
5
  Enter a scene description and the system produces coherent text, image,
6
  and audio with real-time MSCI scoring.
7
 
8
- Pipeline: HF Inference API (text + planning) + CLIP retrieval (image) + CLAP retrieval (audio)
9
  Planning modes: direct, planner, council (3-way), extended_prompt (3x tokens)
10
  """
11
 
@@ -15,6 +15,7 @@ import json
15
  import logging
16
  import os
17
  import sys
 
18
  import time
19
  from pathlib import Path
20
  from typing import Any, Dict, Optional
@@ -406,6 +407,13 @@ def plan_extended(prompt: str) -> Optional[Any]:
406
  # Generation / retrieval functions
407
  # ---------------------------------------------------------------------------
408
 
 
 
 
 
 
 
 
409
  def gen_text(prompt: str, mode: str) -> dict:
410
  """Generate text and optional plan using HF Inference API."""
411
  # Step 1: Plan (if not direct mode)
@@ -457,6 +465,50 @@ def gen_text(prompt: str, mode: str) -> dict:
457
  }
458
 
459
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
  def retrieve_image(prompt: str) -> dict:
461
  r = load_image_retriever().retrieve(prompt)
462
  return {
@@ -540,6 +592,15 @@ def main():
540
  with st.sidebar:
541
  st.markdown("#### Configuration")
542
 
 
 
 
 
 
 
 
 
 
543
  mode = st.selectbox(
544
  "Planning Mode",
545
  ["direct", "planner", "council", "extended_prompt"],
@@ -567,16 +628,22 @@ def main():
567
  "council": "3 LLM calls merged for richer planning",
568
  "extended_prompt": "Single LLM call with 3x token budget",
569
  }
 
 
 
 
 
 
570
  st.markdown(
571
  f'<div class="sidebar-info">'
572
  f'<b>Text</b> HF Inference API<br>'
573
  f'<b>Planning</b> {mode_desc[mode]}<br>'
574
- f'<b>Image</b> CLIP retrieval (57 images)<br>'
575
- f'<b>Audio</b> CLAP retrieval (104 clips)<br><br>'
576
  f'<b>Metric</b> MSCI = 0.45 &times; s<sub>t,i</sub> + 0.45 &times; s<sub>t,a</sub><br><br>'
577
  f'<b>Models</b><br>'
578
- f'CLIP ViT-B/32 (text-image)<br>'
579
- f'CLAP HTSAT-unfused (text-audio)'
580
  f'</div>', unsafe_allow_html=True)
581
 
582
  # Prompt input
@@ -595,9 +662,13 @@ def main():
595
  mlbl = {"direct": "Direct", "planner": "Planner", "council": "Council", "extended_prompt": "Extended"}[mode]
596
  mcls = "chip-amber" if mode != "direct" else "chip-purple"
597
  mdot = "chip-dot-amber" if mode != "direct" else "chip-dot-purple"
 
 
 
 
598
  st.markdown(
599
  f'<div class="chip-row">'
600
- f'<span class="chip chip-pink"><span class="chip-dot chip-dot-pink"></span>Generative</span>'
601
  f'<span class="chip {mcls}"><span class="chip-dot {mdot}"></span>{mlbl}</span>'
602
  f'<span class="chip chip-green"><span class="chip-dot chip-dot-green"></span>CLIP + CLAP</span>'
603
  f'</div>', unsafe_allow_html=True)
@@ -613,7 +684,7 @@ def main():
613
  return
614
 
615
  if go and prompt.strip():
616
- st.session_state["last_result"] = run_pipeline(prompt.strip(), mode)
617
 
618
  if "last_result" in st.session_state:
619
  show_results(st.session_state["last_result"])
@@ -623,8 +694,8 @@ def main():
623
  # Pipeline
624
  # ---------------------------------------------------------------------------
625
 
626
- def run_pipeline(prompt: str, mode: str) -> dict:
627
- R: dict = {"mode": mode}
628
  t_all = time.time()
629
 
630
  # 1) Text + Planning
@@ -647,33 +718,53 @@ def run_pipeline(prompt: str, mode: str) -> dict:
647
  ip = R["text"].get("image_prompt", prompt)
648
  ap = R["text"].get("audio_prompt", prompt)
649
 
650
- # 2) Image retrieval
651
- with st.status("Retrieving image...", expanded=True) as s:
 
652
  t0 = time.time()
653
  try:
654
- R["image"] = retrieve_image(ip)
 
 
 
655
  R["t_img"] = time.time() - t0
656
- f = R["image"].get("failed", False)
657
- lbl = f"Image retrieved (sim={R['image']['similarity']:.3f}, {R['t_img']:.1f}s)"
658
- if f:
659
- lbl += " \u2014 below threshold"
660
- s.update(label=lbl, state="complete" if not f else "error")
 
 
 
 
 
 
661
  except Exception as e:
662
  s.update(label=f"Image failed: {e}", state="error")
663
  R["image"] = None
664
  R["t_img"] = time.time() - t0
665
 
666
- # 3) Audio retrieval
667
- with st.status("Retrieving audio...", expanded=True) as s:
 
668
  t0 = time.time()
669
  try:
670
- R["audio"] = retrieve_audio(ap)
 
 
 
671
  R["t_aud"] = time.time() - t0
672
- f = R["audio"].get("failed", False)
673
- lbl = f"Audio retrieved (sim={R['audio']['similarity']:.3f}, {R['t_aud']:.1f}s)"
674
- if f:
675
- lbl += " \u2014 below threshold"
676
- s.update(label=lbl, state="complete" if not f else "error")
 
 
 
 
 
 
677
  except Exception as e:
678
  s.update(label=f"Audio failed: {e}", state="error")
679
  R["audio"] = None
@@ -743,45 +834,54 @@ def show_results(R: dict):
743
  st.markdown(f'<div class="text-card">{txt}</div>', unsafe_allow_html=True)
744
 
745
  with ci:
746
- st.markdown('<div class="sec-label">Image</div>', unsafe_allow_html=True)
747
  ii = R.get("image")
748
  if ii and ii.get("path"):
749
  ip = Path(ii["path"])
750
- failed = ii.get("failed", False)
751
- sim = ii.get("similarity")
752
 
753
- if failed:
 
754
  st.markdown(
755
- f'<div class="warn-banner"><b>Below threshold</b> '
756
- f'(sim={sim:.3f} &lt; {IMAGE_SIM_THRESHOLD}) '
757
- f'\u2014 best match from index.</div>',
758
  unsafe_allow_html=True)
759
 
760
  if ip.exists():
761
  st.image(str(ip), use_container_width=True)
762
- dom = ii.get("domain", "other")
763
- ic = DOMAIN_ICONS.get(dom, "\U0001f4cd")
764
- st.caption(f"{ic} {dom} \u00b7 sim **{sim:.3f}** \u00b7 {ip.name}")
 
 
 
 
 
765
  else:
766
  st.info("No image.")
767
 
768
  with ca:
769
- st.markdown('<div class="sec-label">Audio</div>', unsafe_allow_html=True)
770
  ai = R.get("audio")
771
  if ai and ai.get("path"):
772
  ap = Path(ai["path"])
773
- sim = ai.get("similarity")
774
- failed = ai.get("failed", False)
775
 
776
- if failed:
 
777
  st.markdown(
778
- f'<div class="warn-banner"><b>Below threshold</b> '
779
- f'(sim={sim:.3f} &lt; {AUDIO_SIM_THRESHOLD}).</div>',
780
  unsafe_allow_html=True)
781
 
782
  if ap.exists():
783
  st.audio(str(ap))
784
- st.caption(f"sim **{sim:.3f}** \u00b7 {ap.name}")
 
 
 
 
 
785
  else:
786
  st.info("No audio.")
787
 
@@ -819,22 +919,34 @@ def show_results(R: dict):
819
  else:
820
  st.write(f"Planning ({mode}) did not produce a valid plan. Fell back to direct mode.")
821
 
822
- with st.expander("Retrieval Details"):
823
  r1, r2 = st.columns(2)
824
  with r1:
825
  ii = R.get("image")
826
- if ii and ii.get("top_5"):
827
- st.markdown("**Image \u2014 Top 5 candidates**")
828
- bars = "".join(sim_bar_html(n, s) for n, s in ii["top_5"])
829
- st.markdown(bars, unsafe_allow_html=True)
 
 
 
 
 
 
830
  else:
831
  st.write("No image data.")
832
  with r2:
833
  ai = R.get("audio")
834
- if ai and ai.get("top_5"):
835
- st.markdown("**Audio \u2014 Top 5 candidates**")
836
- bars = "".join(sim_bar_html(n, s) for n, s in ai["top_5"])
837
- st.markdown(bars, unsafe_allow_html=True)
 
 
 
 
 
 
838
  else:
839
  st.write("No audio data.")
840
 
 
5
  Enter a scene description and the system produces coherent text, image,
6
  and audio with real-time MSCI scoring.
7
 
8
+ Pipeline: HF Inference API (text + planning + image + audio) with CLIP/CLAP retrieval fallback
9
  Planning modes: direct, planner, council (3-way), extended_prompt (3x tokens)
10
  """
11
 
 
15
  import logging
16
  import os
17
  import sys
18
+ import tempfile
19
  import time
20
  from pathlib import Path
21
  from typing import Any, Dict, Optional
 
407
  # Generation / retrieval functions
408
  # ---------------------------------------------------------------------------
409
 
410
+ # HF Inference API model IDs
411
+ IMAGE_GEN_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"
412
+ AUDIO_GEN_MODELS = [
413
+ "cvssp/audioldm2",
414
+ "facebook/musicgen-small",
415
+ ]
416
+
417
  def gen_text(prompt: str, mode: str) -> dict:
418
  """Generate text and optional plan using HF Inference API."""
419
  # Step 1: Plan (if not direct mode)
 
465
  }
466
 
467
 
468
+ def generate_image(prompt: str) -> dict:
469
+ """Generate image via HF Inference API (SDXL), fallback to retrieval."""
470
+ client = get_inference_client()
471
+ try:
472
+ image = client.text_to_image(prompt, model=IMAGE_GEN_MODEL)
473
+ tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False, dir="/tmp")
474
+ image.save(tmp.name)
475
+ return {
476
+ "path": tmp.name, "backend": "generative",
477
+ "model": "SDXL", "failed": False,
478
+ }
479
+ except Exception as e:
480
+ logger.warning("Image generation failed: %s — falling back to retrieval", e)
481
+ return retrieve_image(prompt)
482
+
483
+
484
+ def generate_audio(prompt: str) -> dict:
485
+ """Generate audio via HF Inference API, fallback to retrieval."""
486
+ client = get_inference_client()
487
+ for model_id in AUDIO_GEN_MODELS:
488
+ try:
489
+ audio_bytes = client.text_to_audio(prompt, model=model_id)
490
+ suffix = ".flac" if "musicgen" in model_id else ".wav"
491
+ tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False, dir="/tmp")
492
+ if isinstance(audio_bytes, bytes):
493
+ tmp.write(audio_bytes)
494
+ tmp.flush()
495
+ else:
496
+ # Some API versions return object with .read() or similar
497
+ tmp.write(bytes(audio_bytes))
498
+ tmp.flush()
499
+ model_name = model_id.split("/")[-1]
500
+ return {
501
+ "path": tmp.name, "backend": "generative",
502
+ "model": model_name, "failed": False,
503
+ }
504
+ except Exception as e:
505
+ logger.warning("Audio gen with %s failed: %s", model_id, e)
506
+ continue
507
+ # All generative models failed — fall back to retrieval
508
+ logger.warning("All audio generation models failed — falling back to retrieval")
509
+ return retrieve_audio(prompt)
510
+
511
+
512
  def retrieve_image(prompt: str) -> dict:
513
  r = load_image_retriever().retrieve(prompt)
514
  return {
 
592
  with st.sidebar:
593
  st.markdown("#### Configuration")
594
 
595
+ backend = st.selectbox(
596
+ "Backend",
597
+ ["generative", "retrieval"],
598
+ format_func=lambda x: {
599
+ "generative": "Generative (SDXL + AudioLDM2)",
600
+ "retrieval": "Retrieval (CLIP + CLAP index)",
601
+ }[x],
602
+ )
603
+
604
  mode = st.selectbox(
605
  "Planning Mode",
606
  ["direct", "planner", "council", "extended_prompt"],
 
628
  "council": "3 LLM calls merged for richer planning",
629
  "extended_prompt": "Single LLM call with 3x token budget",
630
  }
631
+ if backend == "generative":
632
+ img_info = "SDXL via HF API"
633
+ aud_info = "AudioLDM2 / MusicGen via HF API"
634
+ else:
635
+ img_info = "CLIP retrieval (57 images)"
636
+ aud_info = "CLAP retrieval (104 clips)"
637
  st.markdown(
638
  f'<div class="sidebar-info">'
639
  f'<b>Text</b> HF Inference API<br>'
640
  f'<b>Planning</b> {mode_desc[mode]}<br>'
641
+ f'<b>Image</b> {img_info}<br>'
642
+ f'<b>Audio</b> {aud_info}<br><br>'
643
  f'<b>Metric</b> MSCI = 0.45 &times; s<sub>t,i</sub> + 0.45 &times; s<sub>t,a</sub><br><br>'
644
  f'<b>Models</b><br>'
645
+ f'CLIP ViT-B/32 (coherence eval)<br>'
646
+ f'CLAP HTSAT-unfused (coherence eval)'
647
  f'</div>', unsafe_allow_html=True)
648
 
649
  # Prompt input
 
662
  mlbl = {"direct": "Direct", "planner": "Planner", "council": "Council", "extended_prompt": "Extended"}[mode]
663
  mcls = "chip-amber" if mode != "direct" else "chip-purple"
664
  mdot = "chip-dot-amber" if mode != "direct" else "chip-dot-purple"
665
+ if backend == "generative":
666
+ bchip = '<span class="chip chip-pink"><span class="chip-dot chip-dot-pink"></span>Generative</span>'
667
+ else:
668
+ bchip = '<span class="chip chip-purple"><span class="chip-dot chip-dot-purple"></span>Retrieval</span>'
669
  st.markdown(
670
  f'<div class="chip-row">'
671
+ f'{bchip}'
672
  f'<span class="chip {mcls}"><span class="chip-dot {mdot}"></span>{mlbl}</span>'
673
  f'<span class="chip chip-green"><span class="chip-dot chip-dot-green"></span>CLIP + CLAP</span>'
674
  f'</div>', unsafe_allow_html=True)
 
684
  return
685
 
686
  if go and prompt.strip():
687
+ st.session_state["last_result"] = run_pipeline(prompt.strip(), mode, backend)
688
 
689
  if "last_result" in st.session_state:
690
  show_results(st.session_state["last_result"])
 
694
  # Pipeline
695
  # ---------------------------------------------------------------------------
696
 
697
+ def run_pipeline(prompt: str, mode: str, backend: str = "generative") -> dict:
698
+ R: dict = {"mode": mode, "backend": backend}
699
  t_all = time.time()
700
 
701
  # 1) Text + Planning
 
718
  ip = R["text"].get("image_prompt", prompt)
719
  ap = R["text"].get("audio_prompt", prompt)
720
 
721
+ # 2) Image
722
+ img_label = "Generating image (SDXL)..." if backend == "generative" else "Retrieving image..."
723
+ with st.status(img_label, expanded=True) as s:
724
  t0 = time.time()
725
  try:
726
+ if backend == "generative":
727
+ R["image"] = generate_image(ip)
728
+ else:
729
+ R["image"] = retrieve_image(ip)
730
  R["t_img"] = time.time() - t0
731
+ img_backend = R["image"].get("backend", "unknown")
732
+ model = R["image"].get("model", "")
733
+ if img_backend == "generative":
734
+ lbl = f"Image generated via {model} ({R['t_img']:.1f}s)"
735
+ else:
736
+ sim = R["image"].get("similarity", 0)
737
+ failed = R["image"].get("failed", False)
738
+ lbl = f"Image retrieved (sim={sim:.3f}, {R['t_img']:.1f}s)"
739
+ if failed:
740
+ lbl += " \u2014 below threshold"
741
+ s.update(label=lbl, state="complete")
742
  except Exception as e:
743
  s.update(label=f"Image failed: {e}", state="error")
744
  R["image"] = None
745
  R["t_img"] = time.time() - t0
746
 
747
+ # 3) Audio
748
+ aud_label = "Generating audio..." if backend == "generative" else "Retrieving audio..."
749
+ with st.status(aud_label, expanded=True) as s:
750
  t0 = time.time()
751
  try:
752
+ if backend == "generative":
753
+ R["audio"] = generate_audio(ap)
754
+ else:
755
+ R["audio"] = retrieve_audio(ap)
756
  R["t_aud"] = time.time() - t0
757
+ aud_backend = R["audio"].get("backend", "unknown")
758
+ model = R["audio"].get("model", "")
759
+ if aud_backend == "generative":
760
+ lbl = f"Audio generated via {model} ({R['t_aud']:.1f}s)"
761
+ else:
762
+ sim = R["audio"].get("similarity", 0)
763
+ failed = R["audio"].get("failed", False)
764
+ lbl = f"Audio retrieved (sim={sim:.3f}, {R['t_aud']:.1f}s)"
765
+ if failed:
766
+ lbl += " \u2014 below threshold"
767
+ s.update(label=lbl, state="complete")
768
  except Exception as e:
769
  s.update(label=f"Audio failed: {e}", state="error")
770
  R["audio"] = None
 
834
  st.markdown(f'<div class="text-card">{txt}</div>', unsafe_allow_html=True)
835
 
836
  with ci:
837
+ st.markdown('<div class="sec-label">Generated Image</div>', unsafe_allow_html=True)
838
  ii = R.get("image")
839
  if ii and ii.get("path"):
840
  ip = Path(ii["path"])
841
+ backend = ii.get("backend", "unknown")
 
842
 
843
+ if backend == "retrieval" and ii.get("failed", False):
844
+ sim = ii.get("similarity", 0)
845
  st.markdown(
846
+ f'<div class="warn-banner"><b>Retrieval fallback</b> '
847
+ f'(sim={sim:.3f}) \u2014 generation unavailable.</div>',
 
848
  unsafe_allow_html=True)
849
 
850
  if ip.exists():
851
  st.image(str(ip), use_container_width=True)
852
+ model = ii.get("model", "")
853
+ if backend == "generative":
854
+ st.caption(f"Generated via **{model}**")
855
+ else:
856
+ sim = ii.get("similarity", 0)
857
+ dom = ii.get("domain", "other")
858
+ ic = DOMAIN_ICONS.get(dom, "\U0001f4cd")
859
+ st.caption(f"{ic} {dom} \u00b7 sim **{sim:.3f}** \u00b7 Retrieved")
860
  else:
861
  st.info("No image.")
862
 
863
  with ca:
864
+ st.markdown('<div class="sec-label">Generated Audio</div>', unsafe_allow_html=True)
865
  ai = R.get("audio")
866
  if ai and ai.get("path"):
867
  ap = Path(ai["path"])
868
+ backend = ai.get("backend", "unknown")
 
869
 
870
+ if backend == "retrieval" and ai.get("failed", False):
871
+ sim = ai.get("similarity", 0)
872
  st.markdown(
873
+ f'<div class="warn-banner"><b>Retrieval fallback</b> '
874
+ f'(sim={sim:.3f}) \u2014 generation unavailable.</div>',
875
  unsafe_allow_html=True)
876
 
877
  if ap.exists():
878
  st.audio(str(ap))
879
+ model = ai.get("model", "")
880
+ if backend == "generative":
881
+ st.caption(f"Generated via **{model}**")
882
+ else:
883
+ sim = ai.get("similarity", 0)
884
+ st.caption(f"sim **{sim:.3f}** \u00b7 Retrieved")
885
  else:
886
  st.info("No audio.")
887
 
 
919
  else:
920
  st.write(f"Planning ({mode}) did not produce a valid plan. Fell back to direct mode.")
921
 
922
+ with st.expander("Generation Details"):
923
  r1, r2 = st.columns(2)
924
  with r1:
925
  ii = R.get("image")
926
+ if ii:
927
+ backend = ii.get("backend", "unknown")
928
+ model = ii.get("model", "")
929
+ if backend == "generative":
930
+ st.markdown(f"**Image** generated via **{model}**")
931
+ st.markdown(f"Prompt: *{R.get('text', {}).get('image_prompt', '')}*")
932
+ elif ii.get("top_5"):
933
+ st.markdown("**Image** (retrieval fallback)")
934
+ bars = "".join(sim_bar_html(n, s) for n, s in ii["top_5"])
935
+ st.markdown(bars, unsafe_allow_html=True)
936
  else:
937
  st.write("No image data.")
938
  with r2:
939
  ai = R.get("audio")
940
+ if ai:
941
+ backend = ai.get("backend", "unknown")
942
+ model = ai.get("model", "")
943
+ if backend == "generative":
944
+ st.markdown(f"**Audio** generated via **{model}**")
945
+ st.markdown(f"Prompt: *{R.get('text', {}).get('audio_prompt', '')}*")
946
+ elif ai.get("top_5"):
947
+ st.markdown("**Audio** (retrieval fallback)")
948
+ bars = "".join(sim_bar_html(n, s) for n, s in ai["top_5"])
949
+ st.markdown(bars, unsafe_allow_html=True)
950
  else:
951
  st.write("No audio data.")
952
 
src/embeddings/audio_embedder.py CHANGED
@@ -56,11 +56,19 @@ class AudioEmbedder:
56
  def embed(self, audio_path: str) -> np.ndarray:
57
  waveform, _ = librosa.load(audio_path, sr=self.target_sr, mono=True)
58
 
59
- inputs = self.processor(
60
- audios=waveform,
61
- sampling_rate=self.target_sr,
62
- return_tensors="pt",
63
- ).to(self.device)
 
 
 
 
 
 
 
 
64
 
65
  outputs = self.model.get_audio_features(**inputs)
66
  emb = self._extract_features(outputs, "audio_projection")
 
56
  def embed(self, audio_path: str) -> np.ndarray:
57
  waveform, _ = librosa.load(audio_path, sr=self.target_sr, mono=True)
58
 
59
+ # Use 'audio' (newer transformers) with fallback to 'audios' (older)
60
+ try:
61
+ inputs = self.processor(
62
+ audio=waveform,
63
+ sampling_rate=self.target_sr,
64
+ return_tensors="pt",
65
+ ).to(self.device)
66
+ except TypeError:
67
+ inputs = self.processor(
68
+ audios=waveform,
69
+ sampling_rate=self.target_sr,
70
+ return_tensors="pt",
71
+ ).to(self.device)
72
 
73
  outputs = self.model.get_audio_features(**inputs)
74
  emb = self._extract_features(outputs, "audio_projection")