Spaces:

pratik-250620
/

MultiModal-Coherence-AI

Running

App Files Community

pratik-250620 committed on Feb 20

Commit

1812196

verified ·

1 Parent(s): 0b7335c

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +50 -54

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ Live demonstration of multimodal generation + coherence evaluation.
 Enter a scene description and the system produces coherent text, image,
 and audio with real-time MSCI scoring.
-Pipeline: Groq LLM (text) + Pollinations (image/audio) with CLIP/CLAP retrieval fallback
 Planning modes: direct, planner, council (3-way), extended_prompt (3x tokens)
 """
@@ -1284,8 +1284,11 @@ def plan_extended(prompt: str) -> Optional[Any]:
 # Pollinations endpoints
 POLLINATIONS_IMAGE_FREE_URL = "https://image.pollinations.ai/prompt"  # Free, no auth
 POLLINATIONS_GEN_IMAGE_URL = "https://gen.pollinations.ai/image"       # Needs API key
-POLLINATIONS_AUDIO_URL = "https://gen.pollinations.ai/v1/audio/speech"  # Needs API key
-POLLINATIONS_TTS_URL = "https://gen.pollinations.ai/audio"              # Needs API key
 # Stable Horde (free, crowdsourced, no key)
 STABLE_HORDE_URL = "https://stablehorde.net/api/v2"
@@ -1475,59 +1478,52 @@ def generate_image(prompt: str) -> dict:
     return retrieve_image(prompt)
-def generate_audio(prompt: str) -> dict:
-    """Generate audio via Pollinations.ai (with API key) → CLAP retrieval fallback.
-    Tries Pollinations TTS to narrate the scene ambience (with API key),
-    then falls back to CLAP retrieval.
     """
-    headers = _pollinations_headers()
-    if not headers:
-        logger.info("No POLLINATIONS_API_KEY — skipping audio generation")
-    else:
-        # --- Attempt 1: Pollinations TTS (scene description as speech) ---
-        try:
-            resp = _requests.post(
-                POLLINATIONS_AUDIO_URL,
-                headers=headers,
-                json={
-                    "model": "openai-audio",
-                    "input": prompt,
-                    "voice": "shimmer",
-                },
-                timeout=60,
-            )
-            if resp.status_code == 200 and len(resp.content) > 1000:
-                tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False, dir="/tmp")
-                tmp.write(resp.content)
-                tmp.flush()
-                return {
-                    "path": tmp.name, "backend": "generative",
-                    "model": "Pollinations-TTS", "failed": False,
-                }
-            logger.warning("Pollinations TTS returned %s: %s", resp.status_code, resp.text[:200])
-        except Exception as e:
-            logger.warning("Pollinations TTS failed: %s", e)
-        # --- Attempt 2: Pollinations simple GET TTS ---
-        try:
-            encoded = _urlparse.quote(prompt)
-            resp = _requests.get(
-                f"{POLLINATIONS_TTS_URL}/{encoded}?voice=nova",
-                headers=headers,
-                timeout=60,
-            )
-            if resp.status_code == 200 and len(resp.content) > 1000:
-                tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False, dir="/tmp")
-                tmp.write(resp.content)
-                tmp.flush()
-                return {
-                    "path": tmp.name, "backend": "generative",
-                    "model": "Pollinations-TTS", "failed": False,
-                }
-            logger.warning("Pollinations GET TTS returned %s", resp.status_code)
-        except Exception as e:
-            logger.warning("Pollinations GET TTS failed: %s", e)
     # --- Fallback: CLAP retrieval ---
     logger.info("Audio generation unavailable — using CLAP retrieval")
@@ -1674,7 +1670,7 @@ def main():
         }
         if backend == "generative":
             img_info = "Pollinations FLUX / Stable Horde (free)"
-            aud_info = "Pollinations TTS / CLAP retrieval (free)"
         else:
             img_info = "CLIP retrieval (57 images)"
             aud_info = "CLAP retrieval (104 clips)"

 Enter a scene description and the system produces coherent text, image,
 and audio with real-time MSCI scoring.
+Pipeline: Groq LLM (text) + Pollinations (image) + ElevenLabs (audio SFX) with CLIP/CLAP retrieval fallback
 Planning modes: direct, planner, council (3-way), extended_prompt (3x tokens)
 """
 # Pollinations endpoints
 POLLINATIONS_IMAGE_FREE_URL = "https://image.pollinations.ai/prompt"  # Free, no auth
 POLLINATIONS_GEN_IMAGE_URL = "https://gen.pollinations.ai/image"       # Needs API key
+POLLINATIONS_AUDIO_URL = "https://gen.pollinations.ai/v1/audio/speech"  # Needs API key (TTS only)
+POLLINATIONS_TTS_URL = "https://gen.pollinations.ai/audio"              # Needs API key (TTS only)
+# ElevenLabs (sound effects — actual ambient sounds, NOT speech)
+ELEVENLABS_SFX_URL = "https://api.elevenlabs.io/v1/sound-generation"
 # Stable Horde (free, crowdsourced, no key)
 STABLE_HORDE_URL = "https://stablehorde.net/api/v2"
     return retrieve_image(prompt)
def _elevenlabs_sfx(prompt: str, duration: float = 8.0) -> Optional[bytes]:
    """Generate an ambient sound effect via the ElevenLabs Sound Generation API.

    POSTs ``prompt`` to ``ELEVENLABS_SFX_URL`` and returns the raw response
    body on success.  The API key is read from the ``ELEVENLABS_API_KEY``
    environment variable on every call.

    NOTE(review): the original author states the payload is MP3 and that the
    free tier allows ~50 generations/month; neither is verified here.

    Args:
        prompt: Text description of the sound to generate (sent as ``text``).
        duration: Requested clip length, sent as ``duration_seconds``.

    Returns:
        The response bytes, or ``None`` when the API key or prompt is missing
        or blank, the request raises, or the response is anything other than
        HTTP 200 with more than 1000 bytes of content.
    """
    # Strip surrounding whitespace: a secret pasted with a trailing newline
    # would otherwise produce an invalid header value and fail every call.
    key = os.environ.get("ELEVENLABS_API_KEY", "").strip()
    if not key:
        return None

    # A blank description cannot yield a useful sound, so skip the network
    # round-trip (and its 90 s timeout) entirely.
    if not prompt or not prompt.strip():
        return None

    try:
        resp = _requests.post(
            ELEVENLABS_SFX_URL,
            headers={
                "xi-api-key": key,
                "Content-Type": "application/json",
            },
            json={
                "text": prompt,
                "duration_seconds": duration,
                "prompt_influence": 0.5,
            },
            timeout=90,
        )
        # Bodies of 1000 bytes or fewer are treated as failures even on 200
        # (presumably to reject error payloads or truncated audio).
        if resp.status_code == 200 and len(resp.content) > 1000:
            return resp.content
        logger.warning("ElevenLabs SFX returned %s: %s", resp.status_code, resp.text[:200])
    except Exception as e:
        # Deliberate best-effort: any failure here lets the caller fall back
        # to another audio source instead of crashing.
        logger.warning("ElevenLabs SFX failed: %s", e)
    return None
+def generate_audio(prompt: str) -> dict:
+    """Generate ambient audio via ElevenLabs SFX → CLAP retrieval fallback.
+    Uses ElevenLabs Sound Effects API to generate actual ambient sounds
+    (NOT text-to-speech).  Falls back to CLAP retrieval if unavailable.
+    """
+    # --- Attempt 1: ElevenLabs Sound Effects (actual ambient sounds) ---
+    audio_bytes = _elevenlabs_sfx(prompt, duration=8.0)
+    if audio_bytes:
+        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False, dir="/tmp")
+        tmp.write(audio_bytes)
+        tmp.flush()
+        return {
+            "path": tmp.name, "backend": "generative",
+            "model": "ElevenLabs-SFX", "failed": False,
+        }
     # --- Fallback: CLAP retrieval ---
     logger.info("Audio generation unavailable — using CLAP retrieval")
         }
         if backend == "generative":
             img_info = "Pollinations FLUX / Stable Horde (free)"
+            aud_info = "ElevenLabs SFX / CLAP retrieval (free)"
         else:
             img_info = "CLIP retrieval (57 images)"
             aud_info = "CLAP retrieval (104 clips)"