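"""Generate MusicLIME explanations for the bach-or-bot human-vs-AI music classifier.

Wraps MusicLIMEExplainer to produce multimodal (audio + lyrics), audio-only,
and combined explanations as JSON-serializable dicts.
"""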
import os
import numpy as np
from datetime import datetime
from src.musiclime.explainer import MusicLIMEExplainer
from src.musiclime.wrapper import MusicLIMEPredictor, AudioOnlyPredictor
from src.musiclime.print_utils import green_bold


def musiclime_multimodal(audio_data, lyrics_text):
"""
Generate multimodal MusicLIME explanations for audio and lyrics.
Parameters
----------
audio_data : array-like
Audio waveform data from librosa.load or similar
lyrics_text : str
String containing song lyrics
Returns
-------
    dict
        Structured explanation results with keys ``prediction`` (class, confidence,
        probabilities), ``explanations`` (ranked audio and lyrics features), and
        ``summary`` (feature counts, runtime, and processing metadata)
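
    Examples
    --------
    Illustrative sketch of a call; the audio path and lyrics string are
    placeholders, not files shipped with this repo:

    >>> import librosa  # doctest: +SKIP
    >>> audio, sr = librosa.load("path/to/song.mp3")  # doctest: +SKIP
    >>> result = musiclime_multimodal(audio, "lyrics of the song")  # doctest: +SKIP
    >>> top = result["explanations"][0]  # doctest: +SKIP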
"""
start_time = datetime.now()
# Get number of samples from environment variable, default to 1000
num_samples = int(os.getenv("MUSICLIME_NUM_SAMPLES", "1000"))
num_features = int(os.getenv("MUSICLIME_NUM_FEATURES", "10"))
print(f"[MusicLIME] Using num_samples={num_samples}, num_features={num_features}")
    # Create MusicLIME explainer and multimodal predictor instances
    explainer = MusicLIMEExplainer(random_state=42)
    predictor = MusicLIMEPredictor()
    # Generate the explanation
explanation = explainer.explain_instance(
audio=audio_data,
lyrics=lyrics_text,
predict_fn=predictor,
num_samples=num_samples,
labels=(1,),
)
# Get prediction info
original_prediction = explanation.predictions[0]
predicted_class = np.argmax(original_prediction)
confidence = float(np.max(original_prediction))
    # Get top features (count is configurable via MUSICLIME_NUM_FEATURES)
top_features = explanation.get_explanation(label=1, num_features=num_features)
# Calculate runtime
end_time = datetime.now()
runtime_seconds = (end_time - start_time).total_seconds()
return {
"prediction": {
"class": int(predicted_class),
"class_name": "Human-Composed" if predicted_class == 1 else "AI-Generated",
"confidence": confidence,
"probabilities": original_prediction.tolist(),
},
"explanations": [
{
"rank": i + 1,
"modality": item["type"],
"feature_text": item["feature"],
"weight": float(item["weight"]),
"importance": abs(float(item["weight"])),
}
for i, item in enumerate(top_features)
],
"summary": {
"total_features_analyzed": len(top_features),
"audio_features_count": len(
[f for f in top_features if f["type"] == "audio"]
),
"lyrics_features_count": len(
[f for f in top_features if f["type"] == "lyrics"]
),
"runtime_seconds": runtime_seconds,
"samples_generated": num_samples,
"timestamp": start_time.isoformat(),
},
}


def musiclime_unimodal(audio_data, modality="audio"):
    """
    Generate unimodal MusicLIME explanations for a single modality.
Parameters
----------
audio_data : array-like
Audio waveform data from librosa.load or similar
modality : str, default='audio'
        Explanation modality; currently only 'audio' is supported
Returns
-------
    dict
        Structured explanation results with keys ``prediction``, ``explanations``,
        and ``summary``; all returned features are audio features
Raises
------
ValueError
If modality is not 'audio' (lyrics is not yet implemented)
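
    Examples
    --------
    Illustrative sketch of a call; the audio path is a placeholder, not a file
    shipped with this repo:

    >>> import librosa  # doctest: +SKIP
    >>> audio, sr = librosa.load("path/to/song.mp3")  # doctest: +SKIP
    >>> result = musiclime_unimodal(audio, modality="audio")  # doctest: +SKIP
    >>> result["summary"]["lyrics_features_count"]  # doctest: +SKIP
    0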
"""
if modality != "audio":
raise ValueError(
"Currently only 'audio' modality is supported for unimodal explanations"
)
start_time = datetime.now()
# Get number of samples from environment variable, default to 1000
num_samples = int(os.getenv("MUSICLIME_NUM_SAMPLES", "1000"))
num_features = int(os.getenv("MUSICLIME_NUM_FEATURES", "10"))
print(
f"[MusicLIME] Using num_samples={num_samples}, num_features={num_features} (audio-only mode)"
)
    # Create MusicLIME explainer and audio-only predictor instances
    explainer = MusicLIMEExplainer(random_state=42)
    predictor = AudioOnlyPredictor()
    # Use empty lyrics for audio-only mode since they are ignored anyway
    dummy_lyrics = ""
    # Generate the explanation
explanation = explainer.explain_instance(
audio=audio_data,
lyrics=dummy_lyrics,
predict_fn=predictor,
num_samples=num_samples,
labels=(1,),
modality=modality,
)
# Get prediction info
original_prediction = explanation.predictions[0]
predicted_class = np.argmax(original_prediction)
confidence = float(np.max(original_prediction))
# Get top features
top_features = explanation.get_explanation(label=1, num_features=num_features)
# Calculate runtime
end_time = datetime.now()
runtime_seconds = (end_time - start_time).total_seconds()
return {
"prediction": {
"class": int(predicted_class),
"class_name": "Human-Composed" if predicted_class == 1 else "AI-Generated",
"confidence": confidence,
"probabilities": original_prediction.tolist(),
},
"explanations": [
{
"rank": i + 1,
"modality": item["type"], # "audio" for all features
"feature_text": item["feature"],
"weight": float(item["weight"]),
"importance": abs(float(item["weight"])),
}
for i, item in enumerate(top_features)
],
"summary": {
"total_features_analyzed": len(top_features),
"audio_features_count": len(top_features), # All features are audio
"lyrics_features_count": 0, # No lyrics features
"runtime_seconds": runtime_seconds,
"samples_generated": num_samples,
"timestamp": start_time.isoformat(),
},
}


def musiclime_combined(audio_data, lyrics_text):
    """
    Generate both multimodal and audio-only MusicLIME explanations efficiently.

    Performs source separation and lyric factorization once and reuses them for
    both explanation types, roughly halving total processing time compared to
    calling the multimodal and audio-only explainers separately.
Parameters
----------
audio_data : array-like
Audio waveform data from librosa.load or similar
lyrics_text : str
String containing song lyrics
Returns
-------
    dict
        Combined results with keys ``multimodal``, ``audio_only``, and
        ``combined_summary`` (overall runtime and factorization timing)
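
    Examples
    --------
    Illustrative sketch of a call; the audio path and lyrics string are
    placeholders, not files shipped with this repo:

    >>> import librosa  # doctest: +SKIP
    >>> audio, sr = librosa.load("path/to/song.mp3")  # doctest: +SKIP
    >>> result = musiclime_combined(audio, "lyrics of the song")  # doctest: +SKIP
    >>> sorted(result.keys())  # doctest: +SKIP
    ['audio_only', 'combined_summary', 'multimodal']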
"""
from src.musiclime.factorization import OpenUnmixFactorization
from src.musiclime.text_utils import LineIndexedString
start_time = datetime.now()
# Get configuration
num_samples = int(os.getenv("MUSICLIME_NUM_SAMPLES", "1000"))
num_features = int(os.getenv("MUSICLIME_NUM_FEATURES", "10"))
print(
"[MusicLIME] Combined mode: generating both multimodal and audio-only explanations"
)
print(f"[MusicLIME] Using num_samples={num_samples}, num_features={num_features}")
# Create factorizations once
print("[MusicLIME] Creating factorizations once for both explanations...")
factorization_start = datetime.now()
audio_factorization = OpenUnmixFactorization(
audio_data, temporal_segmentation_params=10
)
text_factorization = LineIndexedString(lyrics_text)
factorization_time = (datetime.now() - factorization_start).total_seconds()
print(
green_bold(f"[MusicLIME] Factorization completed in {factorization_time:.2f}s")
)
# Create explainer and predictors
explainer = MusicLIMEExplainer(random_state=42)
multimodal_predictor = MusicLIMEPredictor()
audio_predictor = AudioOnlyPredictor()
# Generate multimodal explanation (reusing factorizations)
print("[MusicLIME] Generating multimodal explanation...")
multimodal_start = datetime.now()
multimodal_explanation = explainer.explain_instance_with_factorization(
audio_factorization,
text_factorization,
multimodal_predictor,
num_samples=num_samples,
labels=(1,),
modality="both",
)
multimodal_time = (datetime.now() - multimodal_start).total_seconds()
print(
green_bold(
f"[MusicLIME] Multimodal explanation completed in {multimodal_time:.2f}s"
)
)
# Generate audio-only explanation (reusing the same factorization)
print("[MusicLIME] Generating audio-only explanation (reusing factorizations)...")
audio_start = datetime.now()
audio_explanation = explainer.explain_instance_with_factorization(
audio_factorization,
text_factorization,
audio_predictor,
num_samples=num_samples,
labels=(1,),
modality="audio",
)
audio_time = (datetime.now() - audio_start).total_seconds()
print(
green_bold(f"[MusicLIME] Audio-only explanation completed in {audio_time:.2f}s")
)
# Process multimodal results
multimodal_prediction = multimodal_explanation.predictions[0]
multimodal_class = np.argmax(multimodal_prediction)
multimodal_confidence = float(np.max(multimodal_prediction))
multimodal_features = multimodal_explanation.get_explanation(
label=1, num_features=num_features
)
# Process audio-only results
audio_prediction = audio_explanation.predictions[0]
audio_class = np.argmax(audio_prediction)
audio_confidence = float(np.max(audio_prediction))
audio_features = audio_explanation.get_explanation(
label=1, num_features=num_features
)
# Calculate total runtime
end_time = datetime.now()
total_runtime = (end_time - start_time).total_seconds()
print(green_bold("[MusicLIME] Combined explanation completed!"))
print(f"[MusicLIME] Factorization: {factorization_time:.2f}s (done once)")
print(f"[MusicLIME] Multimodal: {multimodal_time:.2f}s")
print(f"[MusicLIME] Audio-only: {audio_time:.2f}s")
print(f"[MusicLIME] Total: {total_runtime:.2f}s")
return {
"multimodal": {
"prediction": {
"class": int(multimodal_class),
"class_name": (
"Human-Composed" if multimodal_class == 1 else "AI-Generated"
),
"confidence": multimodal_confidence,
"probabilities": multimodal_prediction.tolist(),
},
"explanations": [
{
"rank": i + 1,
"modality": item["type"],
"feature_text": item["feature"],
"weight": float(item["weight"]),
"importance": abs(float(item["weight"])),
}
for i, item in enumerate(multimodal_features)
],
"summary": {
"total_features_analyzed": len(multimodal_features),
"audio_features_count": len(
[f for f in multimodal_features if f["type"] == "audio"]
),
"lyrics_features_count": len(
[f for f in multimodal_features if f["type"] == "lyrics"]
),
"runtime_seconds": multimodal_time,
"samples_generated": num_samples,
},
},
"audio_only": {
"prediction": {
"class": int(audio_class),
"class_name": "Human-Composed" if audio_class == 1 else "AI-Generated",
"confidence": audio_confidence,
"probabilities": audio_prediction.tolist(),
},
"explanations": [
{
"rank": i + 1,
"modality": item["type"],
"feature_text": item["feature"],
"weight": float(item["weight"]),
"importance": abs(float(item["weight"])),
}
for i, item in enumerate(audio_features)
],
"summary": {
"total_features_analyzed": len(audio_features),
"audio_features_count": len(audio_features),
"lyrics_features_count": 0,
"runtime_seconds": audio_time,
"samples_generated": num_samples,
},
},
"combined_summary": {
"total_runtime_seconds": total_runtime,
"factorization_time_seconds": factorization_time,
"source_separation_reused": True,
"timestamp": start_time.isoformat(),
},
}
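

# Minimal command-line sketch for local testing. It assumes librosa is
# installed, that the script is run from the repo root so the ``src`` imports
# resolve, and that the caller supplies an audio file and, optionally, a
# lyrics text file; none of these paths are provided by this module.
if __name__ == "__main__":
    import json
    import sys

    import librosa

    if len(sys.argv) < 2:
        print("Usage: python scripts/explain.py <audio_file> [lyrics_file]")
        sys.exit(1)

    # Load the waveform; the predictor's expected sample rate is an assumption,
    # so the file's native rate is kept here.
    audio, _ = librosa.load(sys.argv[1], sr=None)

    lyrics = ""
    if len(sys.argv) > 2:
        with open(sys.argv[2], "r", encoding="utf-8") as f:
            lyrics = f.read()

    # Run the combined pipeline when lyrics are available, otherwise audio-only.
    result = musiclime_combined(audio, lyrics) if lyrics else musiclime_unimodal(audio)
    print(json.dumps(result, indent=2))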