Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -553,7 +553,7 @@ section[data-testid="stSidebar"] p {
|
|
| 553 |
background: rgba(255,237,213,0.7);
|
| 554 |
}
|
| 555 |
|
| 556 |
-
/* Button override */
|
| 557 |
.stButton > button[kind="primary"] {
|
| 558 |
background: linear-gradient(135deg, #8b5cf6, #ec4899) !important;
|
| 559 |
border: none !important; border-radius: 16px !important;
|
|
@@ -567,6 +567,35 @@ section[data-testid="stSidebar"] p {
|
|
| 567 |
box-shadow: 0 6px 25px rgba(139,92,246,0.4) !important;
|
| 568 |
}
|
| 569 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 570 |
/* Divider */
|
| 571 |
hr { border-color: rgba(139,92,246,0.15) !important; }
|
| 572 |
</style>
|
|
@@ -1478,6 +1507,30 @@ def generate_image(prompt: str) -> dict:
|
|
| 1478 |
return retrieve_image(prompt)
|
| 1479 |
|
| 1480 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1481 |
def _stable_audio_generate(prompt: str, duration: float = 8.0) -> Optional[str]:
|
| 1482 |
"""Generate ambient audio via Stable Audio Open (free Gradio Space, no API key).
|
| 1483 |
|
|
@@ -1503,22 +1556,26 @@ def _stable_audio_generate(prompt: str, duration: float = 8.0) -> Optional[str]:
|
|
| 1503 |
|
| 1504 |
|
| 1505 |
def generate_audio(prompt: str) -> dict:
|
| 1506 |
-
"""Generate ambient audio via Stable Audio Open → CLAP retrieval
|
| 1507 |
|
| 1508 |
-
|
| 1509 |
-
|
|
|
|
| 1510 |
"""
|
| 1511 |
-
#
|
| 1512 |
-
|
|
|
|
|
|
|
|
|
|
| 1513 |
if path:
|
| 1514 |
return {
|
| 1515 |
"path": path, "backend": "generative",
|
| 1516 |
"model": "Stable-Audio-Open", "failed": False,
|
| 1517 |
}
|
| 1518 |
|
| 1519 |
-
# --- Fallback: CLAP retrieval ---
|
| 1520 |
-
logger.info("Audio
|
| 1521 |
-
result = retrieve_audio(
|
| 1522 |
result["generation_unavailable"] = True
|
| 1523 |
return result
|
| 1524 |
|
|
@@ -1661,7 +1718,7 @@ def main():
|
|
| 1661 |
}
|
| 1662 |
if backend == "generative":
|
| 1663 |
img_info = "Pollinations FLUX / Stable Horde (free)"
|
| 1664 |
-
aud_info = "Stable Audio
|
| 1665 |
else:
|
| 1666 |
img_info = "CLIP retrieval (57 images)"
|
| 1667 |
aud_info = "CLAP retrieval (104 clips)"
|
|
|
|
| 553 |
background: rgba(255,237,213,0.7);
|
| 554 |
}
|
| 555 |
|
| 556 |
+
/* Button override — primary (Let's Go / Generate) */
|
| 557 |
.stButton > button[kind="primary"] {
|
| 558 |
background: linear-gradient(135deg, #8b5cf6, #ec4899) !important;
|
| 559 |
border: none !important; border-radius: 16px !important;
|
|
|
|
| 567 |
box-shadow: 0 6px 25px rgba(139,92,246,0.4) !important;
|
| 568 |
}
|
| 569 |
|
| 570 |
+
/* Button override — secondary (prompt suggestion buttons in sidebar) */
|
| 571 |
+
.stButton > button[kind="secondary"],
|
| 572 |
+
.stButton > button:not([kind="primary"]) {
|
| 573 |
+
background: rgba(255,255,255,0.85) !important;
|
| 574 |
+
color: #4c1d95 !important;
|
| 575 |
+
border: 2px solid #c4b5fd !important;
|
| 576 |
+
border-radius: 14px !important;
|
| 577 |
+
font-weight: 600 !important;
|
| 578 |
+
font-size: 0.88rem !important;
|
| 579 |
+
padding: 0.5rem 0.8rem !important;
|
| 580 |
+
transition: all 0.2s ease !important;
|
| 581 |
+
}
|
| 582 |
+
.stButton > button[kind="secondary"]:hover,
|
| 583 |
+
.stButton > button:not([kind="primary"]):hover {
|
| 584 |
+
background: linear-gradient(135deg, #ede9fe, #fce7f3) !important;
|
| 585 |
+
border-color: #8b5cf6 !important;
|
| 586 |
+
color: #3b0764 !important;
|
| 587 |
+
transform: scale(1.02) !important;
|
| 588 |
+
box-shadow: 0 3px 12px rgba(139,92,246,0.2) !important;
|
| 589 |
+
}
|
| 590 |
+
|
| 591 |
+
/* Expander headers in sidebar — light and readable */
|
| 592 |
+
section[data-testid="stSidebar"] details summary {
|
| 593 |
+
background: rgba(255,255,255,0.6) !important;
|
| 594 |
+
color: #4c1d95 !important;
|
| 595 |
+
border-radius: 12px !important;
|
| 596 |
+
font-weight: 700 !important;
|
| 597 |
+
}
|
| 598 |
+
|
| 599 |
/* Divider */
|
| 600 |
hr { border-color: rgba(139,92,246,0.15) !important; }
|
| 601 |
</style>
|
|
|
|
| 1507 |
return retrieve_image(prompt)
|
| 1508 |
|
| 1509 |
|
| 1510 |
+
def _make_audio_query(scene_prompt: str) -> str:
    """Rewrite a scene description as a short, sound-only query via the LLM.

    The LLM is instructed to answer with an ambient sound description of at
    most 15 words. Its reply is trimmed of surrounding whitespace, then of
    surrounding double quotes, then of surrounding single quotes.

    Args:
        scene_prompt: The scene description to convert.

    Returns:
        The cleaned LLM reply when it is longer than 10 characters; otherwise
        ``scene_prompt`` unchanged. ``scene_prompt`` is also returned when the
        LLM call or the clean-up raises (the error is logged as a warning).
    """
    instructions = (
        "Convert the scene into a short ambient sound description (max 15 words). "
        "Describe ONLY the sounds you would hear — no visuals, no story. "
        "Examples: 'gentle rain on leaves with distant thunder', "
        "'busy city traffic with car horns and pedestrians', "
        "'ocean waves on sandy beach with seagulls calling'."
    )

    try:
        reply = _llm_chat(
            system=instructions,
            user=scene_prompt,
            max_tokens=60,
            temperature=0.3,
        )
        # Strip order matters: whitespace first, then '"', then "'".
        cleaned = reply.strip()
        cleaned = cleaned.strip('"')
        cleaned = cleaned.strip("'")

        # A very short reply is treated as unusable; keep the original prompt.
        if len(cleaned) <= 10:
            return scene_prompt

        logger.info("Audio query: %s -> %s", scene_prompt[:50], cleaned)
        return cleaned
    except Exception as err:
        # Best-effort step: any failure falls back to the unmodified prompt.
        logger.warning("Audio query LLM failed: %s", err)
        return scene_prompt
|
| 1532 |
+
|
| 1533 |
+
|
| 1534 |
def _stable_audio_generate(prompt: str, duration: float = 8.0) -> Optional[str]:
|
| 1535 |
"""Generate ambient audio via Stable Audio Open (free Gradio Space, no API key).
|
| 1536 |
|
|
|
|
| 1556 |
|
| 1557 |
|
| 1558 |
def generate_audio(prompt: str) -> dict:
    """Produce ambient audio for a scene: generate if possible, else retrieve.

    Steps:
      1. ``_make_audio_query`` converts the scene prompt into a sound-focused
         query.
      2. ``_stable_audio_generate`` (Stable Audio Open) is tried with that
         query and ``duration=8.0``.
      3. If it yields no path, ``retrieve_audio`` (CLAP retrieval) is called
         with the same optimized query.

    Args:
        prompt: The scene description.

    Returns:
        On successful generation, a dict with keys ``path``, ``backend``
        (``"generative"``), ``model`` (``"Stable-Audio-Open"``) and ``failed``
        (``False``). Otherwise the dict returned by ``retrieve_audio`` with
        ``generation_unavailable`` set to ``True``.
    """
    # Both the generator and the retrieval fallback use the rewritten query.
    sound_query = _make_audio_query(prompt)

    generated_path = _stable_audio_generate(sound_query, duration=8.0)
    if not generated_path:
        # No generated clip (falsy path): fall back to retrieval and flag it.
        logger.info("Stable Audio unavailable — using AI-enhanced CLAP retrieval")
        fallback = retrieve_audio(sound_query)
        fallback["generation_unavailable"] = True
        return fallback

    return {
        "path": generated_path,
        "backend": "generative",
        "model": "Stable-Audio-Open",
        "failed": False,
    }
|
| 1581 |
|
|
|
|
| 1718 |
}
|
| 1719 |
if backend == "generative":
|
| 1720 |
img_info = "Pollinations FLUX / Stable Horde (free)"
|
| 1721 |
+
aud_info = "Stable Audio / AI-matched ambience (free)"
|
| 1722 |
else:
|
| 1723 |
img_info = "CLIP retrieval (57 images)"
|
| 1724 |
aud_info = "CLAP retrieval (104 clips)"
|