import streamlit as st import numpy as np import librosa from PIL import Image, ImageOps import matplotlib.pyplot as plt from scipy.ndimage import gaussian_filter import cv2 import os import soundfile as sf from pydub import AudioSegment from pywavefront import Wavefront import pywavefront def img_to_audio(image, time=3.0, rate=44100, n_fft=2048, n_iter=64, hop_length=512, contrast_stretch=False, hist_equalize=False, improve_reconstruction=False): # Load image img = Image.fromarray(image).convert("L") # Apply preprocessing techniques if contrast_stretch: img = Image.fromarray(np.uint8(255 * (np.asarray(img) - np.min(img)) / (np.max(img) - np.min(img)))) if hist_equalize: img = ImageOps.equalize(ImageOps.autocontrast(img)).convert("L") # Calculate spectrogram size spec_shape = (int(librosa.time_to_frames(1.0, sr=rate, hop_length=hop_length, n_fft=n_fft) * time), n_fft) spec = np.asarray(img.resize(spec_shape)) spec = np.interp(spec, (spec.min(), spec.max()), (-30, 10)) # Adjust the range spec = librosa.db_to_amplitude(spec) if improve_reconstruction: # Use advanced reconstruction method audio = librosa.effects.preemphasis(librosa.feature.inverse.mel_to_audio(spec)) else: # Use Griffin-Lim for reconstruction audio = librosa.griffinlim(spec, n_iter=n_iter, hop_length=hop_length) # Apply smoothing to make the audio more appealing audio = smooth_audio(audio) return rate, audio def smooth_audio(audio, sigma=1): # Apply Gaussian smoothing to the audio smoothed_audio = gaussian_filter(audio, sigma=sigma) return smoothed_audio def generate_waveform(audio, rate): # Plot the audio waveform plt.figure(figsize=(10, 4)) plt.plot(np.arange(len(audio)) / rate, audio, color='b') plt.title('Audio Waveform') plt.xlabel('Time (s)') plt.ylabel('Amplitude') plt.grid(True) plt.tight_layout() st.set_option('deprecation.showPyplotGlobalUse', False) # Display the waveform plot st.pyplot() def read_video_frames(uploaded_file, frame_skip=1): # Save the uploaded video temporarily to a file with open("temp_video.mp4", "wb") as temp_video_file: temp_video_file.write(uploaded_file.read()) # Open the temporarily saved video file cap = cv2.VideoCapture("temp_video.mp4") frames = [] frame_count = 0 try: while cap.isOpened(): ret, frame = cap.read() if not ret: break frame_count += 1 # Skip frames based on frame_skip value if frame_count % frame_skip == 0: frames.append(frame) except Exception as e: st.error(f"Error processing frames: {str(e)}") finally: # Close the video capture object and release the file cap.release() # Remove the temporary video file os.remove("temp_video.mp4") return frames def video_to_audio(video_frames, output_audio_path, time=3.0, rate=44100, n_fft=2048, n_iter=64, hop_length=512, contrast_stretch=False, hist_equalize=False, improve_reconstruction=False): audio_frames = [] video_frame_rate = len(video_frames) / time # Calculate the frame rate of the video for i, frame in enumerate(video_frames): # Calculate the corresponding time for the audio segment audio_time = i / video_frame_rate # Ensure that we generate audio only within the specified time if audio_time >= time: break audio = img_to_audio(frame, time, rate, n_fft, n_iter, hop_length, contrast_stretch, hist_equalize, improve_reconstruction) audio_frames.append(audio[1]) audio_frames = np.concatenate(audio_frames) # Save the resulting audio as a WAV file sf.write(output_audio_path, audio_frames, rate) def obj_to_audio(obj_file_path, time=3.0, rate=22050, sigma=1): try: # Read and process the .obj file obj = Wavefront(obj_file_path) # Define the range for mapping x-coordinate (min_x and max_x) to pitch (min_pitch and max_pitch) min_x = min(vertices, key=lambda x: x[0])[0] # Find the minimum x-coordinate in your vertices max_x = max(vertices, key=lambda x: x[0])[0] # Find the maximum x-coordinate in your vertices min_pitch = 100 # Minimum pitch value max_pitch = 1000 # Maximum pitch value # Define the range for mapping y-coordinate (min_y and max_y) to volume (min_volume and max_volume) min_y = min(vertices, key=lambda x: x[1])[1] # Find the minimum y-coordinate in your vertices max_y = max(vertices, key=lambda x: x[1])[1] # Find the maximum y-coordinate in your vertices min_volume = -20 # Minimum volume in dB (e.g., -20 dB) max_volume = 0 # Maximum volume in dB (e.g., 0 dB) # Extract vertex positions vertices = obj.vertices audio_segments = [] # Store audio segments for each vertex for vertex in vertices: if len(vertex) != 3: continue x, y, z = vertex # Map vertex positions to audio parameters (example: pitch and volume) pitch = map_to_range(x, min_x, max_x, min_pitch, max_pitch) volume = map_to_range(z, min_x, max_x, min_volume, max_volume) # Create an audio segment for this vertex vertex_audio = AudioSegment.silent(duration=int(time * 1000)) # Duration in milliseconds vertex_audio = vertex_audio + AudioSegment.silent(duration=100) # A small gap between vertex sounds vertex_audio = vertex_audio + create_audio_from_parameters(time, rate, pitch, volume) audio_segments.append(vertex_audio) # Combine all audio segments into a single audio audio = AudioSegment.silent(duration=0) for segment in audio_segments: audio += segment # Apply Gaussian smoothing to the audio audio = audio.low_pass_filter(sigma * 1000) # Sigma in Hz # Generate a unique output audio path audio_path = f"output_audio_{int(time)}s.wav" audio.export(audio_path, format="wav") return audio_path except Exception as e: st.error(f"Error processing OBJ file: {str(e)}") return None # Helper function to map values from one range to another def map_to_range(value, from_min, from_max, to_min, to_max): return (value - from_min) / (from_max - from_min) * (to_max - to_min) + to_min def create_audio_from_parameters(time, rate, pitch, volume): # Define audio properties duration_ms = int(time * 1000) # Duration in milliseconds sample_rate = rate # Sample rate in Hz num_samples = int(duration_ms * sample_rate / 1000) # Create time values for the audio t = np.linspace(0, time, num_samples, endpoint=False) # Generate audio waveform based on pitch and volume frequency = 440.0 * 2**(pitch / 12.0) # Calculate frequency from pitch (assuming A440 reference) amplitude = 0.5 * volume # Adjust volume # Generate a simple sine wave as an example audio_data = amplitude * np.sin(2 * np.pi * frequency * t) # Convert the audio data to a PyDub AudioSegment audio_segment = AudioSegment( audio_data.tobytes(), # Audio data as bytes frame_rate=sample_rate, # Sample rate sample_width=audio_data.dtype.itemsize, # Sample width in bytes channels=1 # Mono audio ) return audio_segment def main(): st.title("Improved Image, Video, and 3D Object Sonification") time = st.slider("Audio Time (seconds)", 1.0, 50.0, 3.0, 0.1) n_fft = st.slider("n_fft", 512, 2048, 1024, 64) hop_length = st.slider("hop_length", 256, 1024, 512, 64) n_iter = st.slider("n_iter", 10, 100, 64, 10) contrast_stretch = st.checkbox("Apply Contrast Stretching") hist_equalize = st.checkbox("Apply Histogram Equalization") improve_reconstruction = st.checkbox("Improve Griffin-Lim Reconstruction") uploaded_file = st.file_uploader("Upload a 3D image or video", type=["jpg", "png", "jpeg", "mp4", "obj"]) frame_skip = st.slider("Frame Skip", 1, 100, 1) if uploaded_file is not None: # Check if the uploaded file is an image, video, or 3D object if uploaded_file.type.startswith('video'): # Handle video video_frames = read_video_frames(uploaded_file, frame_skip) st.video(uploaded_file) output_audio_path = "output_audio.wav" video_to_audio( video_frames, output_audio_path, time=time, n_fft=n_fft, hop_length=hop_length, n_iter=n_iter, contrast_stretch=contrast_stretch, hist_equalize=hist_equalize, improve_reconstruction=improve_reconstruction, ) st.success("Audio generation complete. Click the button below to download the audio.") audio_bytes = open(output_audio_path, "rb").read() st.audio(audio_bytes, format="audio/wav") elif uploaded_file.name.endswith('.obj'): # Handle 3D object st.info("Processing the 3D object...") # Save the uploaded .obj file temporarily with open("temp_obj.obj", "wb") as temp_obj_file: temp_obj_file.write(uploaded_file.read()) # Perform sonification audio_path = obj_to_audio("temp_obj.obj") st.success("Sonification complete. Click the button below to play the audio.") st.audio(audio_path, format="audio/wav") # Remove the temporary .obj file os.remove("temp_obj.obj") else: # Handle image if uploaded_file is not None: image = Image.open(uploaded_file) st.image(image, caption="Uploaded Image", use_column_width=True) if st.button("Generate Audio"): # Convert the Image object to a NumPy array image_np = np.array(image) audio = img_to_audio( image_np, time=time, n_fft=n_fft, hop_length=hop_length, n_iter=n_iter, contrast_stretch=contrast_stretch, hist_equalize=hist_equalize, improve_reconstruction=improve_reconstruction, ) # Display the audio with the sample rate as metadata st.audio(audio[1], format="audio/wav", sample_rate=audio[0]) # Generate and display the waveform plot generate_waveform(audio[1], audio[0]) if __name__ == "__main__": main()