pratik-250620 committed on
Commit
59ba68f
·
verified ·
1 Parent(s): 81f27f8

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. app.py +36 -62
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1478,51 +1478,48 @@ def generate_image(prompt: str) -> dict:
1478
  return retrieve_image(prompt)
1479
 
1480
 
1481
- def generate_audio(prompt: str) -> dict:
1482
- """Generate ambient audio via ElevenLabs SFX CLAP retrieval fallback."""
1483
- key = os.environ.get("ELEVENLABS_API_KEY", "")
1484
- if not key:
1485
- logger.warning("ELEVENLABS_API_KEY not set — skipping SFX generation")
1486
- result = retrieve_audio(prompt)
1487
- result["generation_unavailable"] = True
1488
- result["sfx_error"] = "ELEVENLABS_API_KEY not set"
1489
- return result
1490
-
1491
- # --- ElevenLabs Sound Effects (actual ambient sounds, NOT speech) ---
1492
  try:
1493
- logger.info("ElevenLabs SFX: calling API with key=%s...", key[:8])
1494
- resp = _requests.post(
1495
- ELEVENLABS_SFX_URL,
1496
- headers={
1497
- "xi-api-key": key,
1498
- "Content-Type": "application/json",
1499
- },
1500
- json={
1501
- "text": prompt,
1502
- "duration_seconds": 8.0,
1503
- "prompt_influence": 0.5,
1504
- },
1505
- timeout=120,
1506
  )
1507
- logger.info("ElevenLabs SFX: HTTP %s, %d bytes", resp.status_code, len(resp.content))
1508
- if resp.status_code == 200 and len(resp.content) > 1000:
1509
- tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False, dir="/tmp")
1510
- tmp.write(resp.content)
1511
- tmp.flush()
1512
- return {
1513
- "path": tmp.name, "backend": "generative",
1514
- "model": "ElevenLabs-SFX", "failed": False,
1515
- }
1516
- err = f"HTTP {resp.status_code}: {resp.text[:200]}"
1517
- logger.warning("ElevenLabs SFX failed: %s", err)
1518
  except Exception as e:
1519
- err = str(e)
1520
- logger.warning("ElevenLabs SFX exception: %s", err)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1521
 
1522
  # --- Fallback: CLAP retrieval ---
 
1523
  result = retrieve_audio(prompt)
1524
  result["generation_unavailable"] = True
1525
- result["sfx_error"] = err
1526
  return result
1527
 
1528
 
@@ -1664,7 +1661,7 @@ def main():
1664
  }
1665
  if backend == "generative":
1666
  img_info = "Pollinations FLUX / Stable Horde (free)"
1667
- aud_info = "ElevenLabs SFX / CLAP retrieval (free)"
1668
  else:
1669
  img_info = "CLIP retrieval (57 images)"
1670
  aud_info = "CLAP retrieval (104 clips)"
@@ -1681,29 +1678,6 @@ def main():
1681
  f'CLAP HTSAT-unfused (coherence eval)'
1682
  f'</div>', unsafe_allow_html=True)
1683
 
1684
- # --- Audio API diagnostic ---
1685
- with st.expander("Audio API diagnostic"):
1686
- el_key = os.environ.get("ELEVENLABS_API_KEY", "")
1687
- if el_key:
1688
- st.success(f"ELEVENLABS_API_KEY set ({el_key[:8]}...)")
1689
- if st.button("Test ElevenLabs SFX"):
1690
- try:
1691
- test_resp = _requests.post(
1692
- ELEVENLABS_SFX_URL,
1693
- headers={"xi-api-key": el_key, "Content-Type": "application/json"},
1694
- json={"text": "gentle rain on leaves", "duration_seconds": 3, "prompt_influence": 0.5},
1695
- timeout=60,
1696
- )
1697
- if test_resp.status_code == 200 and len(test_resp.content) > 500:
1698
- st.success(f"OK! {len(test_resp.content)} bytes generated")
1699
- st.audio(test_resp.content, format="audio/mp3")
1700
- else:
1701
- st.error(f"HTTP {test_resp.status_code}: {test_resp.text[:300]}")
1702
- except Exception as e:
1703
- st.error(f"Error: {e}")
1704
- else:
1705
- st.error("ELEVENLABS_API_KEY is NOT set!")
1706
-
1707
  # Apply CSS based on mode
1708
  if kid_mode:
1709
  st.markdown(KID_CSS, unsafe_allow_html=True) # kid theme (includes all needed overrides)
 
1478
  return retrieve_image(prompt)
1479
 
1480
 
1481
+ def _stable_audio_generate(prompt: str, duration: float = 8.0) -> Optional[str]:
1482
+ """Generate ambient audio via Stable Audio Open (free Gradio Space, no API key).
1483
+
1484
+ Returns path to generated WAV file or None on failure.
1485
+ """
 
 
 
 
 
 
1486
  try:
1487
+ from gradio_client import Client as GradioClient
1488
+ client = GradioClient("artificialguybr/Stable-Audio-Open-Zero", verbose=False)
1489
+ result = client.predict(
1490
+ prompt=prompt,
1491
+ seconds_total=duration,
1492
+ steps=50,
1493
+ cfg_scale=7,
1494
+ api_name="/predict",
 
 
 
 
 
1495
  )
1496
+ if result and os.path.exists(result):
1497
+ logger.info("Stable Audio generated: %s (%d bytes)", result, os.path.getsize(result))
1498
+ return result
1499
+ logger.warning("Stable Audio returned invalid path: %s", result)
 
 
 
 
 
 
 
1500
  except Exception as e:
1501
+ logger.warning("Stable Audio failed: %s", e)
1502
+ return None
1503
+
1504
+
1505
def generate_audio(prompt: str) -> dict:
    """Produce ambient audio for *prompt*, preferring generation over retrieval.

    First asks ``_stable_audio_generate`` for an 8-second clip. If that
    yields no path, the result of ``retrieve_audio`` is returned instead,
    tagged with ``generation_unavailable = True``.
    """
    generated_path = _stable_audio_generate(prompt, duration=8.0)

    # Guard clause: no generated clip, so fall back to CLAP retrieval.
    if not generated_path:
        logger.info("Audio generation unavailable — using CLAP retrieval")
        fallback = retrieve_audio(prompt)
        fallback["generation_unavailable"] = True
        return fallback

    return {
        "path": generated_path,
        "backend": "generative",
        "model": "Stable-Audio-Open",
        "failed": False,
    }
1524
 
1525
 
 
1661
  }
1662
  if backend == "generative":
1663
  img_info = "Pollinations FLUX / Stable Horde (free)"
1664
+ aud_info = "Stable Audio Open / CLAP retrieval (free)"
1665
  else:
1666
  img_info = "CLIP retrieval (57 images)"
1667
  aud_info = "CLAP retrieval (104 clips)"
 
1678
  f'CLAP HTSAT-unfused (coherence eval)'
1679
  f'</div>', unsafe_allow_html=True)
1680
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1681
  # Apply CSS based on mode
1682
  if kid_mode:
1683
  st.markdown(KID_CSS, unsafe_allow_html=True) # kid theme (includes all needed overrides)
requirements.txt CHANGED
@@ -13,3 +13,4 @@ pillow>=10.0.0
13
  pydantic>=2.0.0
14
  pydantic-settings>=2.0.0
15
  requests>=2.28.0
 
 
13
  pydantic>=2.0.0
14
  pydantic-settings>=2.0.0
15
  requests>=2.28.0
16
+ gradio_client>=1.0.0