pratik-250620 committed on
Commit
5d25cc4
·
verified ·
1 Parent(s): 59ba68f

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +67 -10
app.py CHANGED
@@ -553,7 +553,7 @@ section[data-testid="stSidebar"] p {
553
  background: rgba(255,237,213,0.7);
554
  }
555
 
556
- /* Button override */
557
  .stButton > button[kind="primary"] {
558
  background: linear-gradient(135deg, #8b5cf6, #ec4899) !important;
559
  border: none !important; border-radius: 16px !important;
@@ -567,6 +567,35 @@ section[data-testid="stSidebar"] p {
567
  box-shadow: 0 6px 25px rgba(139,92,246,0.4) !important;
568
  }
569
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
570
  /* Divider */
571
  hr { border-color: rgba(139,92,246,0.15) !important; }
572
  </style>
@@ -1478,6 +1507,30 @@ def generate_image(prompt: str) -> dict:
1478
  return retrieve_image(prompt)
1479
 
1480
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1481
  def _stable_audio_generate(prompt: str, duration: float = 8.0) -> Optional[str]:
1482
  """Generate ambient audio via Stable Audio Open (free Gradio Space, no API key).
1483
 
@@ -1503,22 +1556,26 @@ def _stable_audio_generate(prompt: str, duration: float = 8.0) -> Optional[str]:
1503
 
1504
 
1505
  def generate_audio(prompt: str) -> dict:
1506
- """Generate ambient audio via Stable Audio Open → CLAP retrieval fallback.
1507
 
1508
- Uses a free GPU-powered Gradio Space (no API key needed) to generate
1509
- actual ambient sounds from text prompts.
 
1510
  """
1511
- # --- Attempt 1: Stable Audio Open (free, GPU-powered, real ambient audio) ---
1512
- path = _stable_audio_generate(prompt, duration=8.0)
 
 
 
1513
  if path:
1514
  return {
1515
  "path": path, "backend": "generative",
1516
  "model": "Stable-Audio-Open", "failed": False,
1517
  }
1518
 
1519
- # --- Fallback: CLAP retrieval ---
1520
- logger.info("Audio generation unavailable — using CLAP retrieval")
1521
- result = retrieve_audio(prompt)
1522
  result["generation_unavailable"] = True
1523
  return result
1524
 
@@ -1661,7 +1718,7 @@ def main():
1661
  }
1662
  if backend == "generative":
1663
  img_info = "Pollinations FLUX / Stable Horde (free)"
1664
- aud_info = "Stable Audio Open / CLAP retrieval (free)"
1665
  else:
1666
  img_info = "CLIP retrieval (57 images)"
1667
  aud_info = "CLAP retrieval (104 clips)"
 
553
  background: rgba(255,237,213,0.7);
554
  }
555
 
556
+ /* Button override — primary (Let's Go / Generate) */
557
  .stButton > button[kind="primary"] {
558
  background: linear-gradient(135deg, #8b5cf6, #ec4899) !important;
559
  border: none !important; border-radius: 16px !important;
 
567
  box-shadow: 0 6px 25px rgba(139,92,246,0.4) !important;
568
  }
569
 
570
+ /* Button override — secondary (prompt suggestion buttons in sidebar) */
571
+ .stButton > button[kind="secondary"],
572
+ .stButton > button:not([kind="primary"]) {
573
+ background: rgba(255,255,255,0.85) !important;
574
+ color: #4c1d95 !important;
575
+ border: 2px solid #c4b5fd !important;
576
+ border-radius: 14px !important;
577
+ font-weight: 600 !important;
578
+ font-size: 0.88rem !important;
579
+ padding: 0.5rem 0.8rem !important;
580
+ transition: all 0.2s ease !important;
581
+ }
582
+ .stButton > button[kind="secondary"]:hover,
583
+ .stButton > button:not([kind="primary"]):hover {
584
+ background: linear-gradient(135deg, #ede9fe, #fce7f3) !important;
585
+ border-color: #8b5cf6 !important;
586
+ color: #3b0764 !important;
587
+ transform: scale(1.02) !important;
588
+ box-shadow: 0 3px 12px rgba(139,92,246,0.2) !important;
589
+ }
590
+
591
+ /* Expander headers in sidebar — light and readable */
592
+ section[data-testid="stSidebar"] details summary {
593
+ background: rgba(255,255,255,0.6) !important;
594
+ color: #4c1d95 !important;
595
+ border-radius: 12px !important;
596
+ font-weight: 700 !important;
597
+ }
598
+
599
  /* Divider */
600
  hr { border-color: rgba(139,92,246,0.15) !important; }
601
  </style>
 
1507
  return retrieve_image(prompt)
1508
 
1509
 
1510
+ def _make_audio_query(scene_prompt: str) -> str:
1511
+ """Use LLM to convert a scene description into an audio-focused search query."""
1512
+ try:
1513
+ result = _llm_chat(
1514
+ system=(
1515
+ "Convert the scene into a short ambient sound description (max 15 words). "
1516
+ "Describe ONLY the sounds you would hear — no visuals, no story. "
1517
+ "Examples: 'gentle rain on leaves with distant thunder', "
1518
+ "'busy city traffic with car horns and pedestrians', "
1519
+ "'ocean waves on sandy beach with seagulls calling'."
1520
+ ),
1521
+ user=scene_prompt,
1522
+ max_tokens=60,
1523
+ temperature=0.3,
1524
+ )
1525
+ query = result.strip().strip('"').strip("'")
1526
+ if len(query) > 10:
1527
+ logger.info("Audio query: %s -> %s", scene_prompt[:50], query)
1528
+ return query
1529
+ except Exception as e:
1530
+ logger.warning("Audio query LLM failed: %s", e)
1531
+ return scene_prompt
1532
+
1533
+
1534
  def _stable_audio_generate(prompt: str, duration: float = 8.0) -> Optional[str]:
1535
  """Generate ambient audio via Stable Audio Open (free Gradio Space, no API key).
1536
 
 
1556
 
1557
 
1558
  def generate_audio(prompt: str) -> dict:
1559
+ """Generate ambient audio via Stable Audio Open → AI-enhanced CLAP retrieval.
1560
 
1561
+ 1. LLM converts scene prompt into a sound-focused query
1562
+ 2. Stable Audio Open generates ambient audio (if GPU quota available)
1563
+ 3. Fallback: CLAP retrieval with the optimized audio query
1564
  """
1565
+ # Step 1: Convert scene prompt to sound-focused query
1566
+ audio_query = _make_audio_query(prompt)
1567
+
1568
+ # --- Attempt 1: Stable Audio Open (free, GPU-powered) ---
1569
+ path = _stable_audio_generate(audio_query, duration=8.0)
1570
  if path:
1571
  return {
1572
  "path": path, "backend": "generative",
1573
  "model": "Stable-Audio-Open", "failed": False,
1574
  }
1575
 
1576
+ # --- Fallback: CLAP retrieval with optimized audio query ---
1577
+ logger.info("Stable Audio unavailable — using AI-enhanced CLAP retrieval")
1578
+ result = retrieve_audio(audio_query)
1579
  result["generation_unavailable"] = True
1580
  return result
1581
 
 
1718
  }
1719
  if backend == "generative":
1720
  img_info = "Pollinations FLUX / Stable Horde (free)"
1721
+ aud_info = "Stable Audio / AI-matched ambience (free)"
1722
  else:
1723
  img_info = "CLIP retrieval (57 images)"
1724
  aud_info = "CLAP retrieval (104 clips)"