# Sonifier / app.py
# dreadedeyes's picture
# Update app.py
# ed58c12
import os
import tempfile

import cv2
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pywavefront
import soundfile as sf
import streamlit as st
from PIL import Image, ImageOps
from pydub import AudioSegment
from pywavefront import Wavefront
from scipy.ndimage import gaussian_filter
def img_to_audio(image, time=3.0, rate=44100, n_fft=2048, n_iter=64, hop_length=512, contrast_stretch=False, hist_equalize=False, improve_reconstruction=False):
    """Convert an image to audio by treating its pixels as a spectrogram.

    The image is converted to greyscale, optionally contrast-enhanced,
    resized, rescaled to the range -30..10 (interpreted as decibels),
    converted to amplitudes and inverted to a waveform, which is finally
    smoothed with ``smooth_audio``.

    Args:
        image: NumPy array accepted by ``PIL.Image.fromarray``.
        time: Target duration in seconds; scales the number of frames the
            image is resized to.
        rate: Sample rate in Hz, used for the frame count and returned
            alongside the samples.
        n_fft: Second component of the size passed to ``Image.resize``;
            also used when computing the frame count.
        n_iter: Number of phase-reconstruction iterations.
        hop_length: Hop between frames, in samples.
        contrast_stretch: If true, linearly stretch pixel values to 0..255.
        hist_equalize: If true, apply autocontrast followed by histogram
            equalization.
        improve_reconstruction: If true, invert with
            ``librosa.feature.inverse.mel_to_audio`` followed by
            pre-emphasis; otherwise use ``librosa.griffinlim``.

    Returns:
        A ``(rate, audio)`` tuple, where ``audio`` is the smoothed waveform.
    """
    img = Image.fromarray(image).convert("L")

    if contrast_stretch:
        # Work in float: multiplying a uint8 array by 255 wraps around.
        pixels = np.asarray(img, dtype=np.float64)
        low = pixels.min()
        high = pixels.max()
        # A flat image has nothing to stretch; skip it to avoid 0 / 0.
        if high > low:
            img = Image.fromarray(np.uint8(255 * (pixels - low) / (high - low)))
    if hist_equalize:
        img = ImageOps.equalize(ImageOps.autocontrast(img)).convert("L")

    # Frames per second of audio, scaled by the requested duration.
    frames_per_second = librosa.time_to_frames(1.0, sr=rate, hop_length=hop_length, n_fft=n_fft)
    spec_shape = (int(frames_per_second * time), n_fft)
    spec = np.asarray(img.resize(spec_shape))
    spec = np.interp(spec, (spec.min(), spec.max()), (-30, 10))  # Adjust the range
    spec = librosa.db_to_amplitude(spec)

    if improve_reconstruction:
        # Pass the caller's settings through so the synthesized signal
        # matches the sample rate returned below and the hop length used
        # to size the spectrogram.
        audio = librosa.effects.preemphasis(
            librosa.feature.inverse.mel_to_audio(
                spec, sr=rate, hop_length=hop_length, n_iter=n_iter
            )
        )
    else:
        # Griffin-Lim phase reconstruction directly on the amplitudes.
        audio = librosa.griffinlim(spec, n_iter=n_iter, hop_length=hop_length)

    # Apply smoothing to make the audio more appealing
    audio = smooth_audio(audio)
    return rate, audio
def smooth_audio(audio, sigma=1):
    """Return ``audio`` after a Gaussian filter with standard deviation ``sigma``."""
    return gaussian_filter(audio, sigma=sigma)
def generate_waveform(audio, rate):
    """Plot ``audio`` against time and render the plot in the Streamlit app.

    Args:
        audio: Sequence of samples to plot.
        rate: Sample rate in Hz, used to convert sample indices to seconds.
    """
    # Use an explicit figure instead of pyplot's global one so the plot can
    # be handed to Streamlit directly and released afterwards.
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(np.arange(len(audio)) / rate, audio, color='b')
    ax.set_title('Audio Waveform')
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Amplitude')
    ax.grid(True)
    fig.tight_layout()
    # Display the waveform plot
    st.pyplot(fig)
    # Close the figure so repeated runs do not accumulate open figures.
    plt.close(fig)
def read_video_frames(uploaded_file, frame_skip=1):
    """Decode an uploaded video and return every ``frame_skip``-th frame.

    The upload is written to a uniquely named temporary ``.mp4`` file so
    OpenCV can open it by path; the file is removed before returning.

    Args:
        uploaded_file: Object with a ``read()`` method returning the video
            bytes.
        frame_skip: Keep one frame out of every ``frame_skip``. Values
            below 1 are treated as 1.

    Returns:
        List of frames as returned by ``cv2.VideoCapture.read``. If decoding
        raises, the error is shown with ``st.error`` and the frames read so
        far are returned.
    """
    # Guard against a zero or negative step, which would break the modulo.
    frame_skip = max(1, int(frame_skip))

    # A unique name avoids clashes when several sessions run at once.
    temp_video_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    temp_path = temp_video_file.name
    frames = []
    try:
        with temp_video_file:
            temp_video_file.write(uploaded_file.read())

        cap = cv2.VideoCapture(temp_path)
        frame_count = 0
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                frame_count += 1
                # Skip frames based on frame_skip value
                if frame_count % frame_skip == 0:
                    frames.append(frame)
        except Exception as e:
            st.error(f"Error processing frames: {str(e)}")
        finally:
            # Release the capture before the file is deleted.
            cap.release()
    finally:
        # Remove the temporary video file, even if writing or decoding failed.
        os.remove(temp_path)
    return frames
def video_to_audio(video_frames, output_audio_path, time=3.0, rate=44100, n_fft=2048, n_iter=64, hop_length=512, contrast_stretch=False, hist_equalize=False, improve_reconstruction=False):
    """Sonify each video frame and write the concatenated audio to a file.

    Every frame is passed to ``img_to_audio`` with the same settings, and
    the resulting waveforms are joined in frame order. Note that each frame
    is converted with the full ``time`` value, so the written audio is the
    concatenation of one ``img_to_audio`` result per frame.

    Args:
        video_frames: Sequence of frames (image arrays).
        output_audio_path: Destination path handed to ``soundfile.write``.
        time, rate, n_fft, n_iter, hop_length, contrast_stretch,
        hist_equalize, improve_reconstruction: Forwarded unchanged to
            ``img_to_audio``; ``rate`` is also the sample rate of the file.

    Raises:
        ValueError: If ``video_frames`` is empty.
    """
    # Fail with a clear message instead of the opaque error that
    # concatenating an empty list would raise.
    if len(video_frames) == 0:
        raise ValueError("video_frames is empty; no audio can be generated")

    # img_to_audio returns (rate, samples); only the samples are kept.
    audio_frames = [
        img_to_audio(frame, time, rate, n_fft, n_iter, hop_length, contrast_stretch, hist_equalize, improve_reconstruction)[1]
        for frame in video_frames
    ]

    # Save the resulting audio as a WAV file
    sf.write(output_audio_path, np.concatenate(audio_frames), rate)
def obj_to_audio(obj_file_path, time=3.0, rate=22050, sigma=1):
    """Sonify the vertices of a Wavefront ``.obj`` file into a WAV file.

    Each 3-component vertex becomes one tone: its x-coordinate selects the
    pitch and its y-coordinate the volume. The per-vertex segments are
    joined, low-pass filtered and exported.

    Args:
        obj_file_path: Path to the ``.obj`` file.
        time: Duration in seconds of each tone (and of the silence that
            precedes it).
        rate: Sample rate in Hz used to synthesize each tone.
        sigma: Low-pass cutoff in kHz (the filter is applied at
            ``sigma * 1000`` Hz).

    Returns:
        Path of the exported WAV file, or ``None`` if anything failed (the
        error is shown with ``st.error``).
    """
    try:
        # Read and process the .obj file
        obj = Wavefront(obj_file_path)

        # Extract vertex positions before anything reads them; only
        # 3-component vertices are sonified.
        vertices = [vertex for vertex in obj.vertices if len(vertex) == 3]
        if not vertices:
            raise ValueError("OBJ file contains no 3-component vertices")

        # Range of x, mapped to pitch.
        min_x = min(vertex[0] for vertex in vertices)
        max_x = max(vertex[0] for vertex in vertices)
        min_pitch = 100  # Lowest tone; interpreted as Hz
        max_pitch = 1000  # Highest tone; interpreted as Hz

        # Range of y, mapped to volume.
        min_y = min(vertex[1] for vertex in vertices)
        max_y = max(vertex[1] for vertex in vertices)
        min_volume = -20  # Minimum volume in dB (e.g., -20 dB)
        max_volume = 0  # Maximum volume in dB (e.g., 0 dB)

        audio_segments = []  # Store audio segments for each vertex
        for x, y, _z in vertices:
            # A flat axis has no range to map over; fall back to the lower
            # bound instead of dividing by zero.
            if max_x > min_x:
                frequency_hz = map_to_range(x, min_x, max_x, min_pitch, max_pitch)
            else:
                frequency_hz = min_pitch
            if max_y > min_y:
                volume_db = map_to_range(y, min_y, max_y, min_volume, max_volume)
            else:
                volume_db = min_volume

            # create_audio_from_parameters computes 440 * 2**(pitch / 12)
            # and scales the sine by 0.5 * volume, so convert Hz to a
            # semitone offset from A440 and dB to a linear gain.
            pitch = 12.0 * np.log2(frequency_hz / 440.0)
            volume = 10.0 ** (volume_db / 20.0)

            # Create an audio segment for this vertex
            vertex_audio = AudioSegment.silent(duration=int(time * 1000))  # Duration in milliseconds
            vertex_audio = vertex_audio + AudioSegment.silent(duration=100)  # A small gap between vertex sounds
            vertex_audio = vertex_audio + create_audio_from_parameters(time, rate, pitch, volume)
            audio_segments.append(vertex_audio)

        # Combine all audio segments into a single audio
        audio = AudioSegment.silent(duration=0)
        for segment in audio_segments:
            audio += segment

        # Smooth the result with a low-pass filter (cutoff in Hz).
        audio = audio.low_pass_filter(sigma * 1000)

        # Output path derived from the requested duration.
        audio_path = f"output_audio_{int(time)}s.wav"
        audio.export(audio_path, format="wav")
        return audio_path
    except Exception as e:
        st.error(f"Error processing OBJ file: {str(e)}")
        return None
# Helper function to map values from one range to another
def map_to_range(value, from_min, from_max, to_min, to_max):
    """Linearly map ``value`` from one range onto another.

    Args:
        value: Number to map.
        from_min, from_max: Bounds of the source range.
        to_min, to_max: Bounds of the target range.

    Returns:
        The mapped value. If the source range is empty
        (``from_min == from_max``), ``to_min`` is returned.
    """
    span = from_max - from_min
    # An empty source range cannot be scaled; avoid dividing by zero.
    if span == 0:
        return to_min
    return (value - from_min) / span * (to_max - to_min) + to_min
def create_audio_from_parameters(time, rate, pitch, volume):
    """Synthesize a mono sine tone as a PyDub ``AudioSegment``.

    Args:
        time: Tone duration in seconds.
        rate: Sample rate in Hz.
        pitch: Offset in semitones from A440; the frequency is
            ``440 * 2 ** (pitch / 12)``.
        volume: Linear gain; the sine amplitude is ``0.5 * volume``.

    Returns:
        A mono ``AudioSegment`` holding 16-bit PCM samples.
    """
    # Define audio properties
    duration_ms = int(time * 1000)  # Duration in milliseconds
    num_samples = int(duration_ms * rate / 1000)

    # Create time values for the audio
    t = np.linspace(0, time, num_samples, endpoint=False)

    # Generate audio waveform based on pitch and volume
    frequency = 440.0 * 2 ** (pitch / 12.0)  # Frequency from pitch (A440 reference)
    amplitude = 0.5 * volume  # Adjust volume
    audio_data = amplitude * np.sin(2 * np.pi * frequency * t)

    # Raw float64 bytes are not valid PCM for PyDub, so convert the
    # waveform to little-endian 16-bit integers, clipping to full scale.
    pcm = (np.clip(audio_data, -1.0, 1.0) * 32767).astype("<i2")

    return AudioSegment(
        pcm.tobytes(),  # Audio data as bytes
        frame_rate=rate,  # Sample rate
        sample_width=pcm.dtype.itemsize,  # Sample width in bytes (2)
        channels=1,  # Mono audio
    )
def main():
    """Run the Streamlit UI: collect settings, accept an upload, sonify it.

    Videos are converted as soon as they are uploaded, ``.obj`` files are
    sonified with ``obj_to_audio``, and anything else is treated as an
    image that is converted when the "Generate Audio" button is pressed.
    """
    st.title("Improved Image, Video, and 3D Object Sonification")
    time = st.slider("Audio Time (seconds)", 1.0, 50.0, 3.0, 0.1)
    n_fft = st.slider("n_fft", 512, 2048, 1024, 64)
    hop_length = st.slider("hop_length", 256, 1024, 512, 64)
    n_iter = st.slider("n_iter", 10, 100, 64, 10)
    contrast_stretch = st.checkbox("Apply Contrast Stretching")
    hist_equalize = st.checkbox("Apply Histogram Equalization")
    improve_reconstruction = st.checkbox("Improve Griffin-Lim Reconstruction")
    uploaded_file = st.file_uploader("Upload a 3D image or video", type=["jpg", "png", "jpeg", "mp4", "obj"])
    frame_skip = st.slider("Frame Skip", 1, 100, 1)

    if uploaded_file is None:
        return

    # Check if the uploaded file is an image, video, or 3D object
    if uploaded_file.type.startswith('video'):
        # Handle video
        video_frames = read_video_frames(uploaded_file, frame_skip)
        st.video(uploaded_file)
        if not video_frames:
            # Nothing to sonify; video_to_audio cannot work on zero frames.
            st.error("No frames could be read from the uploaded video.")
            return

        # Write the WAV into a private directory so concurrent sessions do
        # not overwrite each other's output, then load it into memory.
        with tempfile.TemporaryDirectory() as temp_dir:
            output_audio_path = os.path.join(temp_dir, "output_audio.wav")
            video_to_audio(
                video_frames,
                output_audio_path,
                time=time,
                n_fft=n_fft,
                hop_length=hop_length,
                n_iter=n_iter,
                contrast_stretch=contrast_stretch,
                hist_equalize=hist_equalize,
                improve_reconstruction=improve_reconstruction,
            )
            with open(output_audio_path, "rb") as audio_file:
                audio_bytes = audio_file.read()

        st.success("Audio generation complete. Click the button below to download the audio.")
        st.audio(audio_bytes, format="audio/wav")
    elif uploaded_file.name.endswith('.obj'):
        # Handle 3D object
        st.info("Processing the 3D object...")

        # Save the uploaded .obj file temporarily under a unique name
        with tempfile.NamedTemporaryFile(suffix=".obj", delete=False) as temp_obj_file:
            temp_obj_file.write(uploaded_file.read())
            temp_obj_path = temp_obj_file.name
        try:
            # Perform sonification
            audio_path = obj_to_audio(temp_obj_path)
        finally:
            # Remove the temporary .obj file, even if sonification raised
            os.remove(temp_obj_path)

        # obj_to_audio returns None after reporting its own error.
        if audio_path is None:
            return
        st.success("Sonification complete. Click the button below to play the audio.")
        st.audio(audio_path, format="audio/wav")
    else:
        # Handle image
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_column_width=True)
        if st.button("Generate Audio"):
            # Convert the Image object to a NumPy array
            image_np = np.array(image)
            audio = img_to_audio(
                image_np,
                time=time,
                n_fft=n_fft,
                hop_length=hop_length,
                n_iter=n_iter,
                contrast_stretch=contrast_stretch,
                hist_equalize=hist_equalize,
                improve_reconstruction=improve_reconstruction,
            )
            # img_to_audio returns (rate, samples).
            # Display the audio with the sample rate as metadata
            st.audio(audio[1], format="audio/wav", sample_rate=audio[0])
            # Generate and display the waveform plot
            generate_waveform(audio[1], audio[0])


if __name__ == "__main__":
    main()