pratik-250620 committed on
Commit
1812196
·
verified ·
1 Parent(s): 0b7335c

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +50 -54
app.py CHANGED
@@ -5,7 +5,7 @@ Live demonstration of multimodal generation + coherence evaluation.
5
  Enter a scene description and the system produces coherent text, image,
6
  and audio with real-time MSCI scoring.
7
 
8
- Pipeline: Groq LLM (text) + Pollinations (image/audio) with CLIP/CLAP retrieval fallback
9
  Planning modes: direct, planner, council (3-way), extended_prompt (3x tokens)
10
  """
11
 
@@ -1284,8 +1284,11 @@ def plan_extended(prompt: str) -> Optional[Any]:
1284
  # Pollinations endpoints
1285
  POLLINATIONS_IMAGE_FREE_URL = "https://image.pollinations.ai/prompt" # Free, no auth
1286
  POLLINATIONS_GEN_IMAGE_URL = "https://gen.pollinations.ai/image" # Needs API key
1287
- POLLINATIONS_AUDIO_URL = "https://gen.pollinations.ai/v1/audio/speech" # Needs API key
1288
- POLLINATIONS_TTS_URL = "https://gen.pollinations.ai/audio" # Needs API key
 
 
 
1289
 
1290
  # Stable Horde (free, crowdsourced, no key)
1291
  STABLE_HORDE_URL = "https://stablehorde.net/api/v2"
@@ -1475,59 +1478,52 @@ def generate_image(prompt: str) -> dict:
1475
  return retrieve_image(prompt)
1476
 
1477
 
1478
- def generate_audio(prompt: str) -> dict:
1479
- """Generate audio via Pollinations.ai (with API key) CLAP retrieval fallback.
1480
 
1481
- Tries Pollinations TTS to narrate the scene ambience (with API key),
1482
- then falls back to CLAP retrieval.
1483
  """
1484
- headers = _pollinations_headers()
1485
- if not headers:
1486
- logger.info("No POLLINATIONS_API_KEY — skipping audio generation")
1487
- else:
1488
- # --- Attempt 1: Pollinations TTS (scene description as speech) ---
1489
- try:
1490
- resp = _requests.post(
1491
- POLLINATIONS_AUDIO_URL,
1492
- headers=headers,
1493
- json={
1494
- "model": "openai-audio",
1495
- "input": prompt,
1496
- "voice": "shimmer",
1497
- },
1498
- timeout=60,
1499
- )
1500
- if resp.status_code == 200 and len(resp.content) > 1000:
1501
- tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False, dir="/tmp")
1502
- tmp.write(resp.content)
1503
- tmp.flush()
1504
- return {
1505
- "path": tmp.name, "backend": "generative",
1506
- "model": "Pollinations-TTS", "failed": False,
1507
- }
1508
- logger.warning("Pollinations TTS returned %s: %s", resp.status_code, resp.text[:200])
1509
- except Exception as e:
1510
- logger.warning("Pollinations TTS failed: %s", e)
1511
 
1512
- # --- Attempt 2: Pollinations simple GET TTS ---
1513
- try:
1514
- encoded = _urlparse.quote(prompt)
1515
- resp = _requests.get(
1516
- f"{POLLINATIONS_TTS_URL}/{encoded}?voice=nova",
1517
- headers=headers,
1518
- timeout=60,
1519
- )
1520
- if resp.status_code == 200 and len(resp.content) > 1000:
1521
- tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False, dir="/tmp")
1522
- tmp.write(resp.content)
1523
- tmp.flush()
1524
- return {
1525
- "path": tmp.name, "backend": "generative",
1526
- "model": "Pollinations-TTS", "failed": False,
1527
- }
1528
- logger.warning("Pollinations GET TTS returned %s", resp.status_code)
1529
- except Exception as e:
1530
- logger.warning("Pollinations GET TTS failed: %s", e)
1531
 
1532
  # --- Fallback: CLAP retrieval ---
1533
  logger.info("Audio generation unavailable — using CLAP retrieval")
@@ -1674,7 +1670,7 @@ def main():
1674
  }
1675
  if backend == "generative":
1676
  img_info = "Pollinations FLUX / Stable Horde (free)"
1677
- aud_info = "Pollinations TTS / CLAP retrieval (free)"
1678
  else:
1679
  img_info = "CLIP retrieval (57 images)"
1680
  aud_info = "CLAP retrieval (104 clips)"
 
5
  Enter a scene description and the system produces coherent text, image,
6
  and audio with real-time MSCI scoring.
7
 
8
+ Pipeline: Groq LLM (text) + Pollinations (image) + ElevenLabs (audio SFX) with CLIP/CLAP retrieval fallback
9
  Planning modes: direct, planner, council (3-way), extended_prompt (3x tokens)
10
  """
11
 
 
1284
  # Pollinations endpoints
1285
  POLLINATIONS_IMAGE_FREE_URL = "https://image.pollinations.ai/prompt" # Free, no auth
1286
  POLLINATIONS_GEN_IMAGE_URL = "https://gen.pollinations.ai/image" # Needs API key
1287
+ POLLINATIONS_AUDIO_URL = "https://gen.pollinations.ai/v1/audio/speech" # Needs API key (TTS only)
1288
+ POLLINATIONS_TTS_URL = "https://gen.pollinations.ai/audio" # Needs API key (TTS only)
1289
+
1290
+ # ElevenLabs (sound effects — actual ambient sounds, NOT speech)
1291
+ ELEVENLABS_SFX_URL = "https://api.elevenlabs.io/v1/sound-generation"
1292
 
1293
  # Stable Horde (free, crowdsourced, no key)
1294
  STABLE_HORDE_URL = "https://stablehorde.net/api/v2"
 
1478
  return retrieve_image(prompt)
1479
 
1480
 
1481
+ def _elevenlabs_sfx(prompt: str, duration: float = 8.0) -> Optional[bytes]:
1482
+ """Generate ambient sound effects via ElevenLabs Sound Generation API.
1483
 
1484
+ Returns MP3 bytes or None on failure. Free tier: ~50 generations/month.
 
1485
  """
1486
+ key = os.environ.get("ELEVENLABS_API_KEY", "")
1487
+ if not key:
1488
+ return None
1489
+ try:
1490
+ resp = _requests.post(
1491
+ ELEVENLABS_SFX_URL,
1492
+ headers={
1493
+ "xi-api-key": key,
1494
+ "Content-Type": "application/json",
1495
+ },
1496
+ json={
1497
+ "text": prompt,
1498
+ "duration_seconds": duration,
1499
+ "prompt_influence": 0.5,
1500
+ },
1501
+ timeout=90,
1502
+ )
1503
+ if resp.status_code == 200 and len(resp.content) > 1000:
1504
+ return resp.content
1505
+ logger.warning("ElevenLabs SFX returned %s: %s", resp.status_code, resp.text[:200])
1506
+ except Exception as e:
1507
+ logger.warning("ElevenLabs SFX failed: %s", e)
1508
+ return None
 
 
 
 
1509
 
1510
+
1511
+ def generate_audio(prompt: str) -> dict:
1512
+ """Generate ambient audio via ElevenLabs SFX → CLAP retrieval fallback.
1513
+
1514
+ Uses ElevenLabs Sound Effects API to generate actual ambient sounds
1515
+ (NOT text-to-speech). Falls back to CLAP retrieval if unavailable.
1516
+ """
1517
+ # --- Attempt 1: ElevenLabs Sound Effects (actual ambient sounds) ---
1518
+ audio_bytes = _elevenlabs_sfx(prompt, duration=8.0)
1519
+ if audio_bytes:
1520
+ tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False, dir="/tmp")
1521
+ tmp.write(audio_bytes)
1522
+ tmp.flush()
1523
+ return {
1524
+ "path": tmp.name, "backend": "generative",
1525
+ "model": "ElevenLabs-SFX", "failed": False,
1526
+ }
 
 
1527
 
1528
  # --- Fallback: CLAP retrieval ---
1529
  logger.info("Audio generation unavailable — using CLAP retrieval")
 
1670
  }
1671
  if backend == "generative":
1672
  img_info = "Pollinations FLUX / Stable Horde (free)"
1673
+ aud_info = "ElevenLabs SFX / CLAP retrieval (free)"
1674
  else:
1675
  img_info = "CLIP retrieval (57 images)"
1676
  aud_info = "CLAP retrieval (104 clips)"