Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -5,7 +5,7 @@ Live demonstration of multimodal generation + coherence evaluation.
|
|
| 5 |
Enter a scene description and the system produces coherent text, image,
|
| 6 |
and audio with real-time MSCI scoring.
|
| 7 |
|
| 8 |
-
Pipeline: Groq LLM (text) + Pollinations (image
|
| 9 |
Planning modes: direct, planner, council (3-way), extended_prompt (3x tokens)
|
| 10 |
"""
|
| 11 |
|
|
@@ -1284,8 +1284,11 @@ def plan_extended(prompt: str) -> Optional[Any]:
|
|
| 1284 |
# Pollinations endpoints
|
| 1285 |
POLLINATIONS_IMAGE_FREE_URL = "https://image.pollinations.ai/prompt" # Free, no auth
|
| 1286 |
POLLINATIONS_GEN_IMAGE_URL = "https://gen.pollinations.ai/image" # Needs API key
|
| 1287 |
-
POLLINATIONS_AUDIO_URL = "https://gen.pollinations.ai/v1/audio/speech" # Needs API key
|
| 1288 |
-
POLLINATIONS_TTS_URL = "https://gen.pollinations.ai/audio" # Needs API key
|
|
|
|
|
|
|
|
|
|
| 1289 |
|
| 1290 |
# Stable Horde (free, crowdsourced, no key)
|
| 1291 |
STABLE_HORDE_URL = "https://stablehorde.net/api/v2"
|
|
@@ -1475,59 +1478,52 @@ def generate_image(prompt: str) -> dict:
|
|
| 1475 |
return retrieve_image(prompt)
|
| 1476 |
|
| 1477 |
|
| 1478 |
-
def
|
| 1479 |
-
"""Generate
|
| 1480 |
|
| 1481 |
-
|
| 1482 |
-
then falls back to CLAP retrieval.
|
| 1483 |
"""
|
| 1484 |
-
|
| 1485 |
-
if not
|
| 1486 |
-
|
| 1487 |
-
|
| 1488 |
-
|
| 1489 |
-
|
| 1490 |
-
|
| 1491 |
-
|
| 1492 |
-
|
| 1493 |
-
|
| 1494 |
-
|
| 1495 |
-
|
| 1496 |
-
|
| 1497 |
-
|
| 1498 |
-
|
| 1499 |
-
|
| 1500 |
-
|
| 1501 |
-
|
| 1502 |
-
|
| 1503 |
-
|
| 1504 |
-
|
| 1505 |
-
|
| 1506 |
-
|
| 1507 |
-
}
|
| 1508 |
-
logger.warning("Pollinations TTS returned %s: %s", resp.status_code, resp.text[:200])
|
| 1509 |
-
except Exception as e:
|
| 1510 |
-
logger.warning("Pollinations TTS failed: %s", e)
|
| 1511 |
|
| 1512 |
-
|
| 1513 |
-
|
| 1514 |
-
|
| 1515 |
-
|
| 1516 |
-
|
| 1517 |
-
|
| 1518 |
-
|
| 1519 |
-
|
| 1520 |
-
|
| 1521 |
-
|
| 1522 |
-
|
| 1523 |
-
|
| 1524 |
-
|
| 1525 |
-
|
| 1526 |
-
|
| 1527 |
-
|
| 1528 |
-
|
| 1529 |
-
except Exception as e:
|
| 1530 |
-
logger.warning("Pollinations GET TTS failed: %s", e)
|
| 1531 |
|
| 1532 |
# --- Fallback: CLAP retrieval ---
|
| 1533 |
logger.info("Audio generation unavailable — using CLAP retrieval")
|
|
@@ -1674,7 +1670,7 @@ def main():
|
|
| 1674 |
}
|
| 1675 |
if backend == "generative":
|
| 1676 |
img_info = "Pollinations FLUX / Stable Horde (free)"
|
| 1677 |
-
aud_info = "
|
| 1678 |
else:
|
| 1679 |
img_info = "CLIP retrieval (57 images)"
|
| 1680 |
aud_info = "CLAP retrieval (104 clips)"
|
|
|
|
| 5 |
Enter a scene description and the system produces coherent text, image,
|
| 6 |
and audio with real-time MSCI scoring.
|
| 7 |
|
| 8 |
+
Pipeline: Groq LLM (text) + Pollinations (image) + ElevenLabs (audio SFX) with CLIP/CLAP retrieval fallback
|
| 9 |
Planning modes: direct, planner, council (3-way), extended_prompt (3x tokens)
|
| 10 |
"""
|
| 11 |
|
|
|
|
| 1284 |
# Pollinations endpoints
|
| 1285 |
POLLINATIONS_IMAGE_FREE_URL = "https://image.pollinations.ai/prompt" # Free, no auth
|
| 1286 |
POLLINATIONS_GEN_IMAGE_URL = "https://gen.pollinations.ai/image" # Needs API key
|
| 1287 |
+
POLLINATIONS_AUDIO_URL = "https://gen.pollinations.ai/v1/audio/speech" # Needs API key (TTS only)
|
| 1288 |
+
POLLINATIONS_TTS_URL = "https://gen.pollinations.ai/audio" # Needs API key (TTS only)
|
| 1289 |
+
|
| 1290 |
+
# ElevenLabs (sound effects — actual ambient sounds, NOT speech)
|
| 1291 |
+
ELEVENLABS_SFX_URL = "https://api.elevenlabs.io/v1/sound-generation"
|
| 1292 |
|
| 1293 |
# Stable Horde (free, crowdsourced, no key)
|
| 1294 |
STABLE_HORDE_URL = "https://stablehorde.net/api/v2"
|
|
|
|
| 1478 |
return retrieve_image(prompt)
|
| 1479 |
|
| 1480 |
|
| 1481 |
+
def _elevenlabs_sfx(prompt: str, duration: float = 8.0) -> Optional[bytes]:
    """Generate ambient sound effects via ElevenLabs Sound Generation API.

    Sends *prompt* as a JSON POST to ``ELEVENLABS_SFX_URL``, authenticated
    with the ``ELEVENLABS_API_KEY`` environment variable. This is a
    best-effort call: every failure is logged and reported as ``None`` so the
    caller can fall back to another audio source.

    Args:
        prompt: Text description of the sound to generate.
        duration: Requested clip length in seconds, sent to the API as
            ``duration_seconds``.
            NOTE(review): the API presumably bounds this value; confirm the
            accepted range against the ElevenLabs reference before passing
            anything other than the default.

    Returns:
        MP3 bytes or None on failure. Free tier: ~50 generations/month.
        ``None`` is also returned, without any network call, when the API
        key is missing/blank or the prompt is empty/blank.
    """
    # A key consisting only of whitespace can never authenticate; treat it
    # the same as an unset key instead of making a doomed request.
    key = os.environ.get("ELEVENLABS_API_KEY", "").strip()
    if not key:
        return None

    # Nothing to describe: skip the call rather than spend limited quota.
    if not prompt or not prompt.strip():
        return None

    try:
        resp = _requests.post(
            ELEVENLABS_SFX_URL,
            headers={
                "xi-api-key": key,
                "Content-Type": "application/json",
            },
            json={
                "text": prompt,
                "duration_seconds": duration,
                "prompt_influence": 0.5,
            },
            timeout=90,
        )
        if resp.status_code != 200:
            logger.warning(
                "ElevenLabs SFX returned %s: %s",
                resp.status_code,
                resp.text[:200],
            )
            return None

        audio = resp.content
        # Bodies of 1000 bytes or fewer are rejected as unusable audio.
        # Log only the size here: the payload is binary and would be
        # unreadable if dumped as text.
        if len(audio) <= 1000:
            logger.warning(
                "ElevenLabs SFX returned %s with only %d bytes; discarding",
                resp.status_code,
                len(audio),
            )
            return None
        return audio
    except Exception as e:
        # Deliberately broad: any transport or decoding problem must degrade
        # to the caller's fallback path instead of propagating.
        logger.warning("ElevenLabs SFX failed: %s", e)
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1509 |
|
| 1510 |
+
|
| 1511 |
+
def generate_audio(prompt: str) -> dict:
|
| 1512 |
+
"""Generate ambient audio via ElevenLabs SFX → CLAP retrieval fallback.
|
| 1513 |
+
|
| 1514 |
+
Uses ElevenLabs Sound Effects API to generate actual ambient sounds
|
| 1515 |
+
(NOT text-to-speech). Falls back to CLAP retrieval if unavailable.
|
| 1516 |
+
"""
|
| 1517 |
+
# --- Attempt 1: ElevenLabs Sound Effects (actual ambient sounds) ---
|
| 1518 |
+
audio_bytes = _elevenlabs_sfx(prompt, duration=8.0)
|
| 1519 |
+
if audio_bytes:
|
| 1520 |
+
tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False, dir="/tmp")
|
| 1521 |
+
tmp.write(audio_bytes)
|
| 1522 |
+
tmp.flush()
|
| 1523 |
+
return {
|
| 1524 |
+
"path": tmp.name, "backend": "generative",
|
| 1525 |
+
"model": "ElevenLabs-SFX", "failed": False,
|
| 1526 |
+
}
|
|
|
|
|
|
|
| 1527 |
|
| 1528 |
# --- Fallback: CLAP retrieval ---
|
| 1529 |
logger.info("Audio generation unavailable — using CLAP retrieval")
|
|
|
|
| 1670 |
}
|
| 1671 |
if backend == "generative":
|
| 1672 |
img_info = "Pollinations FLUX / Stable Horde (free)"
|
| 1673 |
+
aud_info = "ElevenLabs SFX / CLAP retrieval (free)"
|
| 1674 |
else:
|
| 1675 |
img_info = "CLIP retrieval (57 images)"
|
| 1676 |
aud_info = "CLAP retrieval (104 clips)"
|