import os
from datetime import datetime

import numpy as np

from src.musiclime.explainer import MusicLIMEExplainer
from src.musiclime.wrapper import MusicLIMEPredictor, AudioOnlyPredictor
from src.musiclime.print_utils import green_bold


def musiclime_multimodal(audio_data, lyrics_text):
    """
    Generate multimodal MusicLIME explanations for audio and lyrics.

    Parameters
    ----------
    audio_data : array-like
        Audio waveform data from librosa.load or similar
    lyrics_text : str
        String containing song lyrics

    Returns
    -------
    dict
        Structured explanation results containing prediction info, feature
        explanations, and processing metadata
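
    Examples
    --------
    Illustrative usage sketch; assumes ``librosa`` is installed and the
    classifier behind ``MusicLIMEPredictor`` is configured. ``song.mp3``
    and the lyrics string are placeholders.

    >>> import librosa  # doctest: +SKIP
    >>> audio, sr = librosa.load("song.mp3", sr=None)  # doctest: +SKIP
    >>> result = musiclime_multimodal(audio, "example lyrics")  # doctest: +SKIP
    >>> sorted(result.keys())  # doctest: +SKIP
    ['explanations', 'prediction', 'summary']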
| """ | |
| start_time = datetime.now() | |
| # Get number of samples from environment variable, default to 1000 | |
| num_samples = int(os.getenv("MUSICLIME_NUM_SAMPLES", "1000")) | |
| num_features = int(os.getenv("MUSICLIME_NUM_FEATURES", "10")) | |
| print(f"[MusicLIME] Using num_samples={num_samples}, num_features={num_features}") | |
| # Create musiclime instances | |
| explainer = MusicLIMEExplainer(random_state=42) | |
| predictor = MusicLIMEPredictor() | |
| # Then generate explanations | |
| explanation = explainer.explain_instance( | |
| audio=audio_data, | |
| lyrics=lyrics_text, | |
| predict_fn=predictor, | |
| num_samples=num_samples, | |
| labels=(1,), | |
| ) | |
| # Get prediction info | |
| original_prediction = explanation.predictions[0] | |
| predicted_class = np.argmax(original_prediction) | |
| confidence = float(np.max(original_prediction)) | |
| # Get top features (I also made this configurable to prevent rebuilding) | |
| top_features = explanation.get_explanation(label=1, num_features=num_features) | |
| # Calculate runtime | |
| end_time = datetime.now() | |
| runtime_seconds = (end_time - start_time).total_seconds() | |
| return { | |
| "prediction": { | |
| "class": int(predicted_class), | |
| "class_name": "Human-Composed" if predicted_class == 1 else "AI-Generated", | |
| "confidence": confidence, | |
| "probabilities": original_prediction.tolist(), | |
| }, | |
| "explanations": [ | |
| { | |
| "rank": i + 1, | |
| "modality": item["type"], | |
| "feature_text": item["feature"], | |
| "weight": float(item["weight"]), | |
| "importance": abs(float(item["weight"])), | |
| } | |
| for i, item in enumerate(top_features) | |
| ], | |
| "summary": { | |
| "total_features_analyzed": len(top_features), | |
| "audio_features_count": len( | |
| [f for f in top_features if f["type"] == "audio"] | |
| ), | |
| "lyrics_features_count": len( | |
| [f for f in top_features if f["type"] == "lyrics"] | |
| ), | |
| "runtime_seconds": runtime_seconds, | |
| "samples_generated": num_samples, | |
| "timestamp": start_time.isoformat(), | |
| }, | |
| } | |
| def musiclime_unimodal(audio_data, modality="audio"): | |
| """ | |
| Generate unimodal MusicLIME explanations for single modality. | |
| Parameters | |
| ---------- | |
| audio_data : array-like | |
| Audio waveform data from librosa.load or similar | |
| modality : str, default='audio' | |
| Explanation modality, currently only supports 'audio' | |
| Returns | |
| ------- | |
| dict | |
| Structured explanation results containing prediction info, audio-only feature | |
| explanations, and processing metadata | |
| Raises | |
| ------ | |
| ValueError | |
| If modality is not 'audio' (lyrics is not yet implemented) | |
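
    Examples
    --------
    Illustrative usage sketch; assumes ``librosa`` is installed and a model is
    configured behind ``AudioOnlyPredictor``. ``song.mp3`` is a placeholder path.

    >>> import librosa  # doctest: +SKIP
    >>> audio, sr = librosa.load("song.mp3", sr=None)  # doctest: +SKIP
    >>> result = musiclime_unimodal(audio, modality="audio")  # doctest: +SKIP
    >>> result["summary"]["lyrics_features_count"]  # doctest: +SKIP
    0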
| """ | |
| if modality != "audio": | |
| raise ValueError( | |
| "Currently only 'audio' modality is supported for unimodal explanations" | |
| ) | |
| start_time = datetime.now() | |
| # Get number of samples from environment variable, default to 1000 | |
| num_samples = int(os.getenv("MUSICLIME_NUM_SAMPLES", "1000")) | |
| num_features = int(os.getenv("MUSICLIME_NUM_FEATURES", "10")) | |
| print( | |
| f"[MusicLIME] Using num_samples={num_samples}, num_features={num_features} (audio-only mode)" | |
| ) | |
| # Create musiclime instances | |
| explainer = MusicLIMEExplainer(random_state=42) | |
| predictor = AudioOnlyPredictor() | |
| # Use empty lyrics for audio-only since they're ignored anyways | |
| dummy_lyrics = "" | |
| # Generate explanation | |
| explanation = explainer.explain_instance( | |
| audio=audio_data, | |
| lyrics=dummy_lyrics, | |
| predict_fn=predictor, | |
| num_samples=num_samples, | |
| labels=(1,), | |
| modality=modality, | |
| ) | |
| # Get prediction info | |
| original_prediction = explanation.predictions[0] | |
| predicted_class = np.argmax(original_prediction) | |
| confidence = float(np.max(original_prediction)) | |
| # Get top features | |
| top_features = explanation.get_explanation(label=1, num_features=num_features) | |
| # Calculate runtime | |
| end_time = datetime.now() | |
| runtime_seconds = (end_time - start_time).total_seconds() | |
| return { | |
| "prediction": { | |
| "class": int(predicted_class), | |
| "class_name": "Human-Composed" if predicted_class == 1 else "AI-Generated", | |
| "confidence": confidence, | |
| "probabilities": original_prediction.tolist(), | |
| }, | |
| "explanations": [ | |
| { | |
| "rank": i + 1, | |
| "modality": item["type"], # "audio" for all features | |
| "feature_text": item["feature"], | |
| "weight": float(item["weight"]), | |
| "importance": abs(float(item["weight"])), | |
| } | |
| for i, item in enumerate(top_features) | |
| ], | |
| "summary": { | |
| "total_features_analyzed": len(top_features), | |
| "audio_features_count": len(top_features), # All features are audio | |
| "lyrics_features_count": 0, # No lyrics features | |
| "runtime_seconds": runtime_seconds, | |
| "samples_generated": num_samples, | |
| "timestamp": start_time.isoformat(), | |
| }, | |
| } | |
| def musiclime_combined(audio_data, lyrics_text): | |
| """ | |
| Generate both multimodal and audio-only MusicLIME explanations efficiently. | |
| Performs source separation once and generates both explanation types | |
| to reduce total processing time by ~50% compared to separate calls. | |
| Parameters | |
| ---------- | |
| audio_data : array-like | |
| Audio waveform data from librosa.load or similar | |
| lyrics_text : str | |
| String containing song lyrics | |
| Returns | |
| ------- | |
| dict | |
| Combined results containing both multimodal and audio-only explanations | |
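
    Examples
    --------
    Illustrative usage sketch; assumes ``librosa`` is installed and both
    predictors are configured. File name and lyrics are placeholders.

    >>> import librosa  # doctest: +SKIP
    >>> audio, sr = librosa.load("song.mp3", sr=None)  # doctest: +SKIP
    >>> results = musiclime_combined(audio, "example lyrics")  # doctest: +SKIP
    >>> sorted(results.keys())  # doctest: +SKIP
    ['audio_only', 'combined_summary', 'multimodal']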
| """ | |
| from src.musiclime.factorization import OpenUnmixFactorization | |
| from src.musiclime.text_utils import LineIndexedString | |
| start_time = datetime.now() | |
| # Get configuration | |
| num_samples = int(os.getenv("MUSICLIME_NUM_SAMPLES", "1000")) | |
| num_features = int(os.getenv("MUSICLIME_NUM_FEATURES", "10")) | |
| print( | |
| "[MusicLIME] Combined mode: generating both multimodal and audio-only explanations" | |
| ) | |
| print(f"[MusicLIME] Using num_samples={num_samples}, num_features={num_features}") | |
| # Create factorizations once | |
| print("[MusicLIME] Creating factorizations once for both explanations...") | |
| factorization_start = datetime.now() | |
| audio_factorization = OpenUnmixFactorization( | |
| audio_data, temporal_segmentation_params=10 | |
| ) | |
| text_factorization = LineIndexedString(lyrics_text) | |
| factorization_time = (datetime.now() - factorization_start).total_seconds() | |
| print( | |
| green_bold(f"[MusicLIME] Factorization completed in {factorization_time:.2f}s") | |
| ) | |
| # Create explainer and predictors | |
| explainer = MusicLIMEExplainer(random_state=42) | |
| multimodal_predictor = MusicLIMEPredictor() | |
| audio_predictor = AudioOnlyPredictor() | |
| # Generate multimodal explanation (reusing factorizations) | |
| print("[MusicLIME] Generating multimodal explanation...") | |
| multimodal_start = datetime.now() | |
| multimodal_explanation = explainer.explain_instance_with_factorization( | |
| audio_factorization, | |
| text_factorization, | |
| multimodal_predictor, | |
| num_samples=num_samples, | |
| labels=(1,), | |
| modality="both", | |
| ) | |
| multimodal_time = (datetime.now() - multimodal_start).total_seconds() | |
| print( | |
| green_bold( | |
| f"[MusicLIME] Multimodal explanation completed in {multimodal_time:.2f}s" | |
| ) | |
| ) | |
| # Generate audio-only explanation (reusing the same factorization) | |
| print("[MusicLIME] Generating audio-only explanation (reusing factorizations)...") | |
| audio_start = datetime.now() | |
| audio_explanation = explainer.explain_instance_with_factorization( | |
| audio_factorization, | |
| text_factorization, | |
| audio_predictor, | |
| num_samples=num_samples, | |
| labels=(1,), | |
| modality="audio", | |
| ) | |
| audio_time = (datetime.now() - audio_start).total_seconds() | |
| print( | |
| green_bold(f"[MusicLIME] Audio-only explanation completed in {audio_time:.2f}s") | |
| ) | |
| # Process multimodal results | |
| multimodal_prediction = multimodal_explanation.predictions[0] | |
| multimodal_class = np.argmax(multimodal_prediction) | |
| multimodal_confidence = float(np.max(multimodal_prediction)) | |
| multimodal_features = multimodal_explanation.get_explanation( | |
| label=1, num_features=num_features | |
| ) | |
| # Process audio-only results | |
| audio_prediction = audio_explanation.predictions[0] | |
| audio_class = np.argmax(audio_prediction) | |
| audio_confidence = float(np.max(audio_prediction)) | |
| audio_features = audio_explanation.get_explanation( | |
| label=1, num_features=num_features | |
| ) | |
| # Calculate total runtime | |
| end_time = datetime.now() | |
| total_runtime = (end_time - start_time).total_seconds() | |
| print(green_bold("[MusicLIME] Combined explanation completed!")) | |
| print(f"[MusicLIME] Factorization: {factorization_time:.2f}s (done once)") | |
| print(f"[MusicLIME] Multimodal: {multimodal_time:.2f}s") | |
| print(f"[MusicLIME] Audio-only: {audio_time:.2f}s") | |
| print(f"[MusicLIME] Total: {total_runtime:.2f}s") | |
| return { | |
| "multimodal": { | |
| "prediction": { | |
| "class": int(multimodal_class), | |
| "class_name": ( | |
| "Human-Composed" if multimodal_class == 1 else "AI-Generated" | |
| ), | |
| "confidence": multimodal_confidence, | |
| "probabilities": multimodal_prediction.tolist(), | |
| }, | |
| "explanations": [ | |
| { | |
| "rank": i + 1, | |
| "modality": item["type"], | |
| "feature_text": item["feature"], | |
| "weight": float(item["weight"]), | |
| "importance": abs(float(item["weight"])), | |
| } | |
| for i, item in enumerate(multimodal_features) | |
| ], | |
| "summary": { | |
| "total_features_analyzed": len(multimodal_features), | |
| "audio_features_count": len( | |
| [f for f in multimodal_features if f["type"] == "audio"] | |
| ), | |
| "lyrics_features_count": len( | |
| [f for f in multimodal_features if f["type"] == "lyrics"] | |
| ), | |
| "runtime_seconds": multimodal_time, | |
| "samples_generated": num_samples, | |
| }, | |
| }, | |
| "audio_only": { | |
| "prediction": { | |
| "class": int(audio_class), | |
| "class_name": "Human-Composed" if audio_class == 1 else "AI-Generated", | |
| "confidence": audio_confidence, | |
| "probabilities": audio_prediction.tolist(), | |
| }, | |
| "explanations": [ | |
| { | |
| "rank": i + 1, | |
| "modality": item["type"], | |
| "feature_text": item["feature"], | |
| "weight": float(item["weight"]), | |
| "importance": abs(float(item["weight"])), | |
| } | |
| for i, item in enumerate(audio_features) | |
| ], | |
| "summary": { | |
| "total_features_analyzed": len(audio_features), | |
| "audio_features_count": len(audio_features), | |
| "lyrics_features_count": 0, | |
| "runtime_seconds": audio_time, | |
| "samples_generated": num_samples, | |
| }, | |
| }, | |
| "combined_summary": { | |
| "total_runtime_seconds": total_runtime, | |
| "factorization_time_seconds": factorization_time, | |
| "source_separation_reused": True, | |
| "timestamp": start_time.isoformat(), | |
| }, | |
| } | |