# NOTE: this file was recovered from a Hugging Face Spaces page whose status
# banner showed "Runtime error"; the table-pipe artifacts below are extraction residue.
| import streamlit as st | |
| import numpy as np | |
| import librosa | |
| from PIL import Image, ImageOps | |
| import matplotlib.pyplot as plt | |
| from scipy.ndimage import gaussian_filter | |
| import cv2 | |
| import os | |
| import soundfile as sf | |
| from pydub import AudioSegment | |
| from pywavefront import Wavefront | |
| import pywavefront | |
def img_to_audio(image, time=3.0, rate=44100, n_fft=2048, n_iter=64, hop_length=512,
                 contrast_stretch=False, hist_equalize=False, improve_reconstruction=False):
    """Sonify an image by treating its grayscale pixels as a magnitude spectrogram.

    Args:
        image: array-like image (anything PIL.Image.fromarray accepts).
        time: target audio duration in seconds.
        rate: output sample rate in Hz.
        n_fft: FFT size used for the inverse STFT.
        n_iter: Griffin-Lim iterations.
        hop_length: STFT hop length in samples.
        contrast_stretch: rescale pixel intensities to the full 0-255 range first.
        hist_equalize: apply autocontrast + histogram equalization first.
        improve_reconstruction: use mel inversion + pre-emphasis instead of Griffin-Lim.

    Returns:
        (rate, audio) tuple, where audio is a 1-D float numpy array.
    """
    img = Image.fromarray(image).convert("L")
    if contrast_stretch:
        arr = np.asarray(img, dtype=np.float64)
        lo, hi = arr.min(), arr.max()
        # Guard: a constant image would make (hi - lo) zero and divide by zero.
        span = (hi - lo) or 1.0
        img = Image.fromarray(np.uint8(255 * (arr - lo) / span))
    if hist_equalize:
        img = ImageOps.equalize(ImageOps.autocontrast(img)).convert("L")
    # Fixed: an inverse STFT expects (1 + n_fft // 2) frequency bins. The old
    # code used n_fft rows, so griffinlim silently inferred a different FFT
    # size than the one requested.
    n_frames = int(librosa.time_to_frames(1.0, sr=rate, hop_length=hop_length, n_fft=n_fft) * time)
    n_bins = 1 + n_fft // 2
    # PIL resize takes (width, height); np.asarray then yields (bins, frames).
    spec = np.asarray(img.resize((n_frames, n_bins)))
    spec = np.interp(spec, (spec.min(), spec.max()), (-30, 10))  # pixels -> dB range
    spec = librosa.db_to_amplitude(spec)
    if improve_reconstruction:
        # NOTE(review): spec is a linear-frequency magnitude, not a true mel
        # spectrogram; mel inversion is used here only as an alternative voicing.
        # sr is now passed so the result matches the advertised sample rate.
        audio = librosa.effects.preemphasis(
            librosa.feature.inverse.mel_to_audio(spec, sr=rate, n_iter=n_iter))
    else:
        # Fixed: forward n_fft so reconstruction uses the requested FFT size.
        audio = librosa.griffinlim(spec, n_iter=n_iter, hop_length=hop_length, n_fft=n_fft)
    audio = smooth_audio(audio)
    return rate, audio
def smooth_audio(audio, sigma=1):
    """Return `audio` softened by a Gaussian kernel of width `sigma` samples."""
    return gaussian_filter(audio, sigma=sigma)
def generate_waveform(audio, rate):
    """Draw the time-domain waveform of `audio` inside the Streamlit app."""
    seconds = np.arange(len(audio)) / rate
    plt.figure(figsize=(10, 4))
    plt.plot(seconds, audio, color='b')
    plt.title('Audio Waveform')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.grid(True)
    plt.tight_layout()
    # Suppress Streamlit's warning about calling st.pyplot() without a figure.
    st.set_option('deprecation.showPyplotGlobalUse', False)
    st.pyplot()
def read_video_frames(uploaded_file, frame_skip=1):
    """Decode an uploaded video into a list of frames, keeping every
    `frame_skip`-th one.

    The upload is spooled to a temporary file first because cv2.VideoCapture
    requires a real filesystem path; the file is always removed afterwards.
    """
    temp_path = "temp_video.mp4"
    with open(temp_path, "wb") as spool:
        spool.write(uploaded_file.read())
    cap = cv2.VideoCapture(temp_path)
    frames = []
    seen = 0
    try:
        while cap.isOpened():
            ok, frame = cap.read()
            if not ok:
                break
            seen += 1
            # Keep the frame only when the running count divides frame_skip.
            if seen % frame_skip == 0:
                frames.append(frame)
    except Exception as e:
        st.error(f"Error processing frames: {str(e)}")
    finally:
        # Release the capture handle before deleting its backing file.
        cap.release()
        os.remove("temp_video.mp4")
    return frames
def video_to_audio(video_frames, output_audio_path, time=3.0, rate=44100, n_fft=2048, n_iter=64, hop_length=512, contrast_stretch=False, hist_equalize=False, improve_reconstruction=False):
    """Sonify a sequence of video frames and write the result to a WAV file.

    Each frame is converted to audio via img_to_audio and the per-frame
    clips are concatenated, then written to `output_audio_path` at `rate` Hz.

    NOTE(review): img_to_audio produces `time` seconds of audio PER FRAME, so
    the written file is (frames used x time) seconds long, not `time` seconds
    total — confirm this is the intended behavior.
    NOTE(review): an empty `video_frames` list makes np.concatenate raise, and
    `len(video_frames) / time` would feed a zero frame rate into the loop.
    """
    audio_frames = []
    video_frame_rate = len(video_frames) / time  # Calculate the frame rate of the video
    for i, frame in enumerate(video_frames):
        # Calculate the corresponding time for the audio segment
        audio_time = i / video_frame_rate
        # Ensure that we generate audio only within the specified time
        if audio_time >= time:
            break
        audio = img_to_audio(frame, time, rate, n_fft, n_iter, hop_length, contrast_stretch, hist_equalize, improve_reconstruction)
        # img_to_audio returns (rate, samples); keep only the samples here.
        audio_frames.append(audio[1])
    audio_frames = np.concatenate(audio_frames)
    # Save the resulting audio as a WAV file
    sf.write(output_audio_path, audio_frames, rate)
def obj_to_audio(obj_file_path, time=3.0, rate=22050, sigma=1):
    """Sonify a Wavefront .obj mesh: each vertex becomes a short tone whose
    pitch tracks its x-coordinate and volume (in dB) tracks its y-coordinate.

    Args:
        obj_file_path: path to the .obj file on disk.
        time: duration in seconds of the leading silence per vertex tone.
        rate: sample rate in Hz for the synthesized tones.
        sigma: low-pass cutoff control (cutoff = sigma * 1000 Hz).

    Returns:
        The path of the exported WAV file, or None if processing failed
        (an error is shown in the Streamlit UI instead of raising).
    """
    try:
        obj = Wavefront(obj_file_path)
        # Fixed: extract the vertices FIRST. The original referenced
        # `vertices` below before assigning it, so every call died with a
        # NameError that the except-block silently reported.
        vertices = obj.vertices
        # Map the x-coordinate range onto a pitch range.
        min_x = min(vertices, key=lambda v: v[0])[0]
        max_x = max(vertices, key=lambda v: v[0])[0]
        min_pitch = 100   # Minimum pitch value
        max_pitch = 1000  # Maximum pitch value
        # Map the y-coordinate range onto a volume range (dB).
        min_y = min(vertices, key=lambda v: v[1])[1]
        max_y = max(vertices, key=lambda v: v[1])[1]
        min_volume = -20  # Minimum volume in dB
        max_volume = 0    # Maximum volume in dB
        audio_segments = []  # one audio segment per vertex
        for vertex in vertices:
            if len(vertex) != 3:
                continue  # skip malformed vertex records
            x, y, z = vertex
            pitch = map_to_range(x, min_x, max_x, min_pitch, max_pitch)
            # Fixed: volume was computed from z against the *x* bounds while
            # min_y/max_y went unused; map y against the y bounds as the
            # comments (and the computed bounds) intended.
            volume = map_to_range(y, min_y, max_y, min_volume, max_volume)
            # Leading silence, a small gap, then the vertex's tone.
            vertex_audio = AudioSegment.silent(duration=int(time * 1000))
            vertex_audio = vertex_audio + AudioSegment.silent(duration=100)
            vertex_audio = vertex_audio + create_audio_from_parameters(time, rate, pitch, volume)
            audio_segments.append(vertex_audio)
        # Concatenate all per-vertex segments into one clip.
        audio = AudioSegment.silent(duration=0)
        for segment in audio_segments:
            audio += segment
        # Cheap smoothing via a low-pass filter (cutoff in Hz).
        audio = audio.low_pass_filter(sigma * 1000)
        # Generate a unique output audio path per requested duration.
        audio_path = f"output_audio_{int(time)}s.wav"
        audio.export(audio_path, format="wav")
        return audio_path
    except Exception as e:
        st.error(f"Error processing OBJ file: {str(e)}")
        return None
def map_to_range(value, from_min, from_max, to_min, to_max):
    """Linearly map `value` from [from_min, from_max] onto [to_min, to_max].

    Fixed: a degenerate source range (from_min == from_max, e.g. a flat mesh
    axis) used to raise ZeroDivisionError; it now returns the midpoint of the
    target range.
    """
    span = from_max - from_min
    if span == 0:
        return (to_min + to_max) / 2
    return (value - from_min) / span * (to_max - to_min) + to_min
def create_audio_from_parameters(time, rate, pitch, volume):
    """Synthesize a mono sine tone as a pydub AudioSegment.

    Args:
        time: duration in seconds.
        rate: sample rate in Hz.
        pitch: semitone offset relative to A440.
        volume: gain in decibels (obj_to_audio passes values in -20..0 dB).

    Returns:
        A 16-bit mono AudioSegment of length `time` seconds.
    """
    duration_ms = int(time * 1000)
    sample_rate = rate
    num_samples = int(duration_ms * sample_rate / 1000)
    # Time axis for the waveform.
    t = np.linspace(0, time, num_samples, endpoint=False)
    # NOTE(review): callers pass pitch values around 100-1000, which this
    # formula maps far above the Nyquist frequency — confirm intended mapping.
    frequency = 440.0 * 2 ** (pitch / 12.0)
    # Fixed: `volume` is in dB, but the old code used it as a raw multiplier
    # (0.5 * volume), yielding negative or zero amplitudes. Convert dB to a
    # linear gain instead.
    amplitude = 0.5 * (10.0 ** (volume / 20.0))
    waveform = amplitude * np.sin(2 * np.pi * frequency * t)
    # Fixed: pydub only accepts integer PCM with sample_width 1, 2, or 4; the
    # old code handed it raw float64 bytes (sample_width 8), which AudioSegment
    # rejects. Emit clipped 16-bit PCM.
    pcm = np.int16(np.clip(waveform, -1.0, 1.0) * 32767)
    audio_segment = AudioSegment(
        pcm.tobytes(),              # audio data as bytes
        frame_rate=sample_rate,     # sample rate
        sample_width=pcm.dtype.itemsize,  # 2 bytes -> 16-bit samples
        channels=1,                 # mono audio
    )
    return audio_segment
def main():
    """Streamlit entry point: collect sonification parameters, accept an
    image / video / .obj upload, and render the generated audio."""
    st.title("Improved Image, Video, and 3D Object Sonification")
    # Sonification parameters shared by the image and video paths.
    time = st.slider("Audio Time (seconds)", 1.0, 50.0, 3.0, 0.1)
    n_fft = st.slider("n_fft", 512, 2048, 1024, 64)
    hop_length = st.slider("hop_length", 256, 1024, 512, 64)
    n_iter = st.slider("n_iter", 10, 100, 64, 10)
    contrast_stretch = st.checkbox("Apply Contrast Stretching")
    hist_equalize = st.checkbox("Apply Histogram Equalization")
    improve_reconstruction = st.checkbox("Improve Griffin-Lim Reconstruction")
    uploaded_file = st.file_uploader("Upload a 3D image or video", type=["jpg", "png", "jpeg", "mp4", "obj"])
    frame_skip = st.slider("Frame Skip", 1, 100, 1)
    if uploaded_file is not None:
        # Check if the uploaded file is an image, video, or 3D object.
        if uploaded_file.type.startswith('video'):
            # Handle video: decode frames, sonify them, offer the WAV back.
            video_frames = read_video_frames(uploaded_file, frame_skip)
            st.video(uploaded_file)
            output_audio_path = "output_audio.wav"
            video_to_audio(
                video_frames,
                output_audio_path,
                time=time,
                n_fft=n_fft,
                hop_length=hop_length,
                n_iter=n_iter,
                contrast_stretch=contrast_stretch,
                hist_equalize=hist_equalize,
                improve_reconstruction=improve_reconstruction,
            )
            st.success("Audio generation complete. Click the button below to download the audio.")
            # NOTE(review): this file handle is never closed — prefer `with`.
            audio_bytes = open(output_audio_path, "rb").read()
            st.audio(audio_bytes, format="audio/wav")
        elif uploaded_file.name.endswith('.obj'):
            # Handle 3D object
            st.info("Processing the 3D object...")
            # Save the uploaded .obj file temporarily (pywavefront needs a path).
            with open("temp_obj.obj", "wb") as temp_obj_file:
                temp_obj_file.write(uploaded_file.read())
            # Perform sonification; obj_to_audio returns a WAV path or None.
            audio_path = obj_to_audio("temp_obj.obj")
            st.success("Sonification complete. Click the button below to play the audio.")
            st.audio(audio_path, format="audio/wav")
            # Remove the temporary .obj file
            os.remove("temp_obj.obj")
        else:
            # Handle image
            # NOTE(review): this inner None-check is redundant — the outer
            # `if uploaded_file is not None` already guarantees it.
            if uploaded_file is not None:
                image = Image.open(uploaded_file)
                st.image(image, caption="Uploaded Image", use_column_width=True)
                if st.button("Generate Audio"):
                    # Convert the Image object to a NumPy array
                    image_np = np.array(image)
                    audio = img_to_audio(
                        image_np,
                        time=time,
                        n_fft=n_fft,
                        hop_length=hop_length,
                        n_iter=n_iter,
                        contrast_stretch=contrast_stretch,
                        hist_equalize=hist_equalize,
                        improve_reconstruction=improve_reconstruction,
                    )
                    # Display the audio; img_to_audio returns (rate, samples).
                    st.audio(audio[1], format="audio/wav", sample_rate=audio[0])
                    # Generate and display the waveform plot
                    generate_waveform(audio[1], audio[0])
# Run the Streamlit app when executed as a script.
if __name__ == "__main__":
    main()