Spaces:

pratik-250620
/

MultiModal-Coherence-AI

Running

App Files Files Community

pratik-250620 committed on Feb 21

Commit

5d25cc4

verified ·

1 Parent(s): 59ba68f

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +67 -10

app.py CHANGED Viewed

@@ -553,7 +553,7 @@ section[data-testid="stSidebar"] p {
     background: rgba(255,237,213,0.7);
 }
-/* Button override */
 .stButton > button[kind="primary"] {
     background: linear-gradient(135deg, #8b5cf6, #ec4899) !important;
     border: none !important; border-radius: 16px !important;
@@ -567,6 +567,35 @@ section[data-testid="stSidebar"] p {
     box-shadow: 0 6px 25px rgba(139,92,246,0.4) !important;
 }
 /* Divider */
 hr { border-color: rgba(139,92,246,0.15) !important; }
 </style>
@@ -1478,6 +1507,30 @@ def generate_image(prompt: str) -> dict:
     return retrieve_image(prompt)
 def _stable_audio_generate(prompt: str, duration: float = 8.0) -> Optional[str]:
     """Generate ambient audio via Stable Audio Open (free Gradio Space, no API key).
@@ -1503,22 +1556,26 @@ def _stable_audio_generate(prompt: str, duration: float = 8.0) -> Optional[str]:
 def generate_audio(prompt: str) -> dict:
-    """Generate ambient audio via Stable Audio Open → CLAP retrieval fallback.
-    Uses a free GPU-powered Gradio Space (no API key needed) to generate
-    actual ambient sounds from text prompts.
     """
-    # --- Attempt 1: Stable Audio Open (free, GPU-powered, real ambient audio) ---
-    path = _stable_audio_generate(prompt, duration=8.0)
     if path:
         return {
             "path": path, "backend": "generative",
             "model": "Stable-Audio-Open", "failed": False,
         }
-    # --- Fallback: CLAP retrieval ---
-    logger.info("Audio generation unavailable — using CLAP retrieval")
-    result = retrieve_audio(prompt)
     result["generation_unavailable"] = True
     return result
@@ -1661,7 +1718,7 @@ def main():
         }
         if backend == "generative":
             img_info = "Pollinations FLUX / Stable Horde (free)"
-            aud_info = "Stable Audio Open / CLAP retrieval (free)"
         else:
             img_info = "CLIP retrieval (57 images)"
             aud_info = "CLAP retrieval (104 clips)"

     background: rgba(255,237,213,0.7);
 }
+/* Button override — primary (Let's Go / Generate) */
 .stButton > button[kind="primary"] {
     background: linear-gradient(135deg, #8b5cf6, #ec4899) !important;
     border: none !important; border-radius: 16px !important;
     box-shadow: 0 6px 25px rgba(139,92,246,0.4) !important;
 }
+/* Button override — secondary (prompt suggestion buttons in sidebar) */
+.stButton > button[kind="secondary"],
+.stButton > button:not([kind="primary"]) {
+    background: rgba(255,255,255,0.85) !important;
+    color: #4c1d95 !important;
+    border: 2px solid #c4b5fd !important;
+    border-radius: 14px !important;
+    font-weight: 600 !important;
+    font-size: 0.88rem !important;
+    padding: 0.5rem 0.8rem !important;
+    transition: all 0.2s ease !important;
+}
+.stButton > button[kind="secondary"]:hover,
+.stButton > button:not([kind="primary"]):hover {
+    background: linear-gradient(135deg, #ede9fe, #fce7f3) !important;
+    border-color: #8b5cf6 !important;
+    color: #3b0764 !important;
+    transform: scale(1.02) !important;
+    box-shadow: 0 3px 12px rgba(139,92,246,0.2) !important;
+}
+/* Expander headers in sidebar — light and readable */
+section[data-testid="stSidebar"] details summary {
+    background: rgba(255,255,255,0.6) !important;
+    color: #4c1d95 !important;
+    border-radius: 12px !important;
+    font-weight: 700 !important;
+}
 /* Divider */
 hr { border-color: rgba(139,92,246,0.15) !important; }
 </style>
     return retrieve_image(prompt)
+def _make_audio_query(scene_prompt: str) -> str:
+    """Use the LLM to convert a scene description into an audio-focused search query.
+
+    Calls ``_llm_chat`` with a system prompt requesting a short ambient sound
+    description (max 15 words), then strips surrounding whitespace and quote
+    characters from the reply.
+
+    Args:
+        scene_prompt: The scene description to convert.
+
+    Returns:
+        The cleaned reply when it is longer than 10 characters. Otherwise,
+        or when any exception is raised inside the ``try`` block (logged as a
+        warning), ``scene_prompt`` is returned unchanged.
+    """
+    try:
+        result = _llm_chat(
+            system=(
+                "Convert the scene into a short ambient sound description (max 15 words). "
+                "Describe ONLY the sounds you would hear — no visuals, no story. "
+                "Examples: 'gentle rain on leaves with distant thunder', "
+                "'busy city traffic with car horns and pedestrians', "
+                "'ocean waves on sandy beach with seagulls calling'."
+            ),
+            user=scene_prompt,
+            max_tokens=60,
+            temperature=0.3,
+        )
+        # Drop surrounding whitespace, then any wrapping double or single quotes.
+        query = result.strip().strip('"').strip("'")
+        # Replies of 10 characters or fewer are not used; control falls through
+        # to the final return of the original prompt.
+        if len(query) > 10:
+            # Only the first 50 characters of the prompt are logged.
+            logger.info("Audio query: %s -> %s", scene_prompt[:50], query)
+            return query
+    except Exception as e:
+        # Best-effort: a failure here is logged and the raw prompt is used instead.
+        logger.warning("Audio query LLM failed: %s", e)
+    return scene_prompt
 def _stable_audio_generate(prompt: str, duration: float = 8.0) -> Optional[str]:
     """Generate ambient audio via Stable Audio Open (free Gradio Space, no API key).
 def generate_audio(prompt: str) -> dict:
+    """Generate ambient audio via Stable Audio Open → AI-enhanced CLAP retrieval.
+    1. LLM converts scene prompt into a sound-focused query
+    2. Stable Audio Open generates ambient audio (if GPU quota available)
+    3. Fallback: CLAP retrieval with the optimized audio query
+
+    Args:
+        prompt: Scene prompt; it is passed through ``_make_audio_query`` first.
+
+    Returns:
+        When ``_stable_audio_generate`` returns a truthy path, a dict with
+        keys ``"path"``, ``"backend"`` (``"generative"``), ``"model"``
+        (``"Stable-Audio-Open"``) and ``"failed"`` (``False``). Otherwise the
+        value returned by ``retrieve_audio`` for the same query, with
+        ``"generation_unavailable"`` set to ``True``.
     """
+    # Step 1: Convert scene prompt to sound-focused query
+    audio_query = _make_audio_query(prompt)
+    # --- Attempt 1: Stable Audio Open (free, GPU-powered) ---
+    path = _stable_audio_generate(audio_query, duration=8.0)
     if path:
         return {
             "path": path, "backend": "generative",
             "model": "Stable-Audio-Open", "failed": False,
         }
+    # --- Fallback: CLAP retrieval with optimized audio query ---
+    logger.info("Stable Audio unavailable — using AI-enhanced CLAP retrieval")
+    result = retrieve_audio(audio_query)
     # Flag the retrieval result so callers can tell generation was not used.
     result["generation_unavailable"] = True
     return result
         }
         if backend == "generative":
             img_info = "Pollinations FLUX / Stable Horde (free)"
+            aud_info = "Stable Audio / AI-matched ambience (free)"
         else:
             img_info = "CLIP retrieval (57 images)"
             aud_info = "CLAP retrieval (104 clips)"