import streamlit as st
import numpy as np
import librosa
from PIL import Image, ImageOps
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter
import cv2
import os
import soundfile as sf
from pydub import AudioSegment
from pywavefront import Wavefront
import pywavefront
def img_to_audio(image, time=3.0, rate=44100, n_fft=2048, n_iter=64, hop_length=512, contrast_stretch=False, hist_equalize=False, improve_reconstruction=False):
    """Convert an image array into audio by treating its grayscale
    version as a magnitude spectrogram.

    Parameters
    ----------
    image : np.ndarray
        Input image (grayscale or RGB array).
    time : float
        Target audio duration in seconds.
    rate : int
        Sample rate of the generated audio.
    n_fft, n_iter, hop_length : int
        STFT size, Griffin-Lim iterations, and hop length.
    contrast_stretch, hist_equalize : bool
        Optional image preprocessing applied before conversion.
    improve_reconstruction : bool
        If True, use mel inversion + pre-emphasis instead of Griffin-Lim.

    Returns
    -------
    (int, np.ndarray)
        Sample rate and the smoothed audio signal.
    """
    img = Image.fromarray(image).convert("L")
    if contrast_stretch:
        arr = np.asarray(img, dtype=np.float64)
        lo, hi = arr.min(), arr.max()
        # Guard against a constant image, which would divide by zero in
        # the original (max - min == 0).
        if hi > lo:
            img = Image.fromarray(np.uint8(255 * (arr - lo) / (hi - lo)))
    if hist_equalize:
        img = ImageOps.equalize(ImageOps.autocontrast(img)).convert("L")
    # Target spectrogram size: PIL resize takes (width, height), so the
    # resulting array is (n_fft, frames).
    spec_shape = (int(librosa.time_to_frames(1.0, sr=rate, hop_length=hop_length, n_fft=n_fft) * time), n_fft)
    spec = np.asarray(img.resize(spec_shape))
    # Map pixel intensities into a dB range, then back to amplitude.
    spec = np.interp(spec, (spec.min(), spec.max()), (-30, 10))
    spec = librosa.db_to_amplitude(spec)
    if improve_reconstruction:
        # Mel inversion followed by pre-emphasis.
        audio = librosa.effects.preemphasis(librosa.feature.inverse.mel_to_audio(spec))
    else:
        # Griffin-Lim phase reconstruction.
        audio = librosa.griffinlim(spec, n_iter=n_iter, hop_length=hop_length)
    # Light smoothing makes the result less harsh.
    audio = smooth_audio(audio)
    return rate, audio
def smooth_audio(audio, sigma=1):
    """Smooth an audio signal with a 1-D Gaussian kernel.

    Parameters
    ----------
    audio : np.ndarray
        Input audio samples.
    sigma : float
        Standard deviation of the Gaussian kernel, in samples.

    Returns
    -------
    np.ndarray
        The smoothed signal.
    """
    return gaussian_filter(audio, sigma=sigma)
def generate_waveform(audio, rate):
    """Plot the audio waveform and display it in the Streamlit app.

    Parameters
    ----------
    audio : np.ndarray
        Audio samples.
    rate : int
        Sample rate in Hz, used to convert sample indices to seconds.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(np.arange(len(audio)) / rate, audio, color='b')
    ax.set_title('Audio Waveform')
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Amplitude')
    ax.grid(True)
    fig.tight_layout()
    # Pass the figure explicitly instead of relying on the deprecated
    # global-pyplot behaviour (removes the need to suppress the warning).
    st.pyplot(fig)
    plt.close(fig)
def read_video_frames(uploaded_file, frame_skip=1):
    """Decode an uploaded video into a list of frames.

    Parameters
    ----------
    uploaded_file : file-like
        The uploaded video (readable binary stream).
    frame_skip : int
        Keep every `frame_skip`-th frame (1 keeps all frames).

    Returns
    -------
    list
        Decoded frames as OpenCV BGR arrays.
    """
    import tempfile
    # Use a unique temporary file so concurrent sessions do not clobber
    # each other's uploads (the original used a fixed "temp_video.mp4").
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_video_file:
        temp_video_file.write(uploaded_file.read())
        temp_path = temp_video_file.name
    cap = cv2.VideoCapture(temp_path)
    frames = []
    frame_count = 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            # Skip frames based on frame_skip value.
            if frame_count % frame_skip == 0:
                frames.append(frame)
    except Exception as e:
        st.error(f"Error processing frames: {str(e)}")
    finally:
        # Always release the capture and remove the temporary file.
        cap.release()
        os.remove(temp_path)
    return frames
def video_to_audio(video_frames, output_audio_path, time=3.0, rate=44100, n_fft=2048, n_iter=64, hop_length=512, contrast_stretch=False, hist_equalize=False, improve_reconstruction=False):
    """Sonify a sequence of video frames and write the result to a WAV file.

    Each frame is converted with `img_to_audio` and the per-frame audio is
    concatenated into one clip of roughly `time` seconds.

    Parameters
    ----------
    video_frames : list
        Frames as image arrays (e.g. from `read_video_frames`).
    output_audio_path : str
        Destination WAV path.
    Other parameters are forwarded to `img_to_audio`.

    Raises
    ------
    ValueError
        If `video_frames` is empty (the original crashed in
        np.concatenate with an opaque error).
    """
    if not video_frames:
        raise ValueError("video_frames is empty; nothing to sonify")
    audio_frames = []
    # Effective frame rate so the output spans `time` seconds.
    video_frame_rate = len(video_frames) / time
    for i, frame in enumerate(video_frames):
        # Time offset of this frame's audio segment.
        audio_time = i / video_frame_rate
        # Generate audio only within the requested duration.
        if audio_time >= time:
            break
        _, audio = img_to_audio(frame, time, rate, n_fft, n_iter, hop_length, contrast_stretch, hist_equalize, improve_reconstruction)
        audio_frames.append(audio)
    # Save the resulting audio as a WAV file.
    sf.write(output_audio_path, np.concatenate(audio_frames), rate)
def obj_to_audio(obj_file_path, time=3.0, rate=22050, sigma=1):
    """Sonify a Wavefront .obj mesh: each vertex becomes a tone whose
    pitch follows its x-coordinate and whose volume follows its
    y-coordinate.

    Parameters
    ----------
    obj_file_path : str
        Path to the .obj file.
    time : float
        Duration of each vertex tone, in seconds.
    rate : int
        Sample rate of the generated tones.
    sigma : float
        Smoothing factor, used as a low-pass cutoff of sigma * 1000 Hz.

    Returns
    -------
    str | None
        Path to the exported WAV file, or None on failure (the error is
        reported via st.error).
    """
    try:
        obj = Wavefront(obj_file_path)
        # Extract vertex positions FIRST — the original referenced
        # `vertices` in the min/max computations before assigning it,
        # raising NameError on every call.
        vertices = [v for v in obj.vertices if len(v) == 3]
        if not vertices:
            st.error("The OBJ file contains no 3-component vertices.")
            return None
        # Map the x-coordinate range to a pitch range.
        min_x = min(vertices, key=lambda v: v[0])[0]
        max_x = max(vertices, key=lambda v: v[0])[0]
        min_pitch = 100  # Minimum pitch value
        max_pitch = 1000  # Maximum pitch value
        # Map the y-coordinate range to a volume range (in dB).
        min_y = min(vertices, key=lambda v: v[1])[1]
        max_y = max(vertices, key=lambda v: v[1])[1]
        min_volume = -20  # Minimum volume in dB
        max_volume = 0  # Maximum volume in dB
        audio_segments = []  # One audio segment per vertex
        for x, y, z in vertices:
            pitch = map_to_range(x, min_x, max_x, min_pitch, max_pitch)
            # Use y with the y-range — the original mapped z against the
            # x-range, contradicting its own min_y/max_y setup.
            volume = map_to_range(y, min_y, max_y, min_volume, max_volume)
            # Tone preceded by a short silent gap between vertex sounds.
            vertex_audio = AudioSegment.silent(duration=int(time * 1000))
            vertex_audio = vertex_audio + AudioSegment.silent(duration=100)
            vertex_audio = vertex_audio + create_audio_from_parameters(time, rate, pitch, volume)
            audio_segments.append(vertex_audio)
        # Concatenate all vertex segments into one clip.
        audio = AudioSegment.silent(duration=0)
        for segment in audio_segments:
            audio += segment
        # Approximate smoothing with a low-pass filter (cutoff in Hz).
        audio = audio.low_pass_filter(sigma * 1000)
        # Generate a unique output audio path.
        audio_path = f"output_audio_{int(time)}s.wav"
        audio.export(audio_path, format="wav")
        return audio_path
    except Exception as e:
        st.error(f"Error processing OBJ file: {str(e)}")
        return None
# Helper function to map values from one range to another
def map_to_range(value, from_min, from_max, to_min, to_max):
    """Linearly map `value` from [from_min, from_max] into
    [to_min, to_max].

    Returns the midpoint of the target range when the source range is
    degenerate (from_min == from_max) instead of dividing by zero, which
    happens e.g. when all mesh vertices share one coordinate.
    """
    if from_max == from_min:
        return (to_min + to_max) / 2
    return (value - from_min) / (from_max - from_min) * (to_max - to_min) + to_min
def create_audio_from_parameters(time, rate, pitch, volume):
    """Synthesize a sine-wave tone as a pydub AudioSegment.

    Parameters
    ----------
    time : float
        Tone duration in seconds.
    rate : int
        Sample rate in Hz.
    pitch : float
        Pitch in semitones relative to A440.
    volume : float
        Volume in dB (typically -20..0), converted to a linear gain.

    Returns
    -------
    AudioSegment
        A mono, 16-bit tone.
    """
    duration_ms = int(time * 1000)  # Duration in milliseconds
    num_samples = int(duration_ms * rate / 1000)
    # Time axis for the tone.
    t = np.linspace(0, time, num_samples, endpoint=False)
    # Interpret `pitch` as semitones from the A440 reference.
    frequency = 440.0 * 2 ** (pitch / 12.0)
    # `volume` is in dB; convert it to a linear gain. The original
    # multiplied by the raw dB value (-20..0), which produced negative
    # or zero amplitudes.
    amplitude = 0.5 * (10 ** (volume / 20.0))
    audio_data = amplitude * np.sin(2 * np.pi * frequency * t)
    # pydub only supports sample widths of 1, 2, or 4 bytes; the original
    # passed raw float64 (sample_width=8), which AudioSegment rejects.
    pcm = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
    audio_segment = AudioSegment(
        pcm.tobytes(),            # 16-bit PCM audio data as bytes
        frame_rate=rate,          # Sample rate
        sample_width=pcm.dtype.itemsize,  # 2 bytes per sample
        channels=1                # Mono audio
    )
    return audio_segment
def main():
    """Streamlit entry point: sonify an uploaded image, video, or OBJ mesh."""
    st.title("Improved Image, Video, and 3D Object Sonification")
    # Audio / spectrogram parameters.
    time = st.slider("Audio Time (seconds)", 1.0, 50.0, 3.0, 0.1)
    n_fft = st.slider("n_fft", 512, 2048, 1024, 64)
    hop_length = st.slider("hop_length", 256, 1024, 512, 64)
    n_iter = st.slider("n_iter", 10, 100, 64, 10)
    contrast_stretch = st.checkbox("Apply Contrast Stretching")
    hist_equalize = st.checkbox("Apply Histogram Equalization")
    improve_reconstruction = st.checkbox("Improve Griffin-Lim Reconstruction")
    uploaded_file = st.file_uploader("Upload a 3D image or video", type=["jpg", "png", "jpeg", "mp4", "obj"])
    frame_skip = st.slider("Frame Skip", 1, 100, 1)
    # Guard clause replaces the original's extra nesting (and its
    # redundant inner `if uploaded_file is not None`).
    if uploaded_file is None:
        return
    if uploaded_file.type.startswith('video'):
        # Video -> concatenated per-frame audio.
        video_frames = read_video_frames(uploaded_file, frame_skip)
        st.video(uploaded_file)
        output_audio_path = "output_audio.wav"
        video_to_audio(
            video_frames,
            output_audio_path,
            time=time,
            n_fft=n_fft,
            hop_length=hop_length,
            n_iter=n_iter,
            contrast_stretch=contrast_stretch,
            hist_equalize=hist_equalize,
            improve_reconstruction=improve_reconstruction,
        )
        st.success("Audio generation complete. Click the button below to download the audio.")
        # Close the handle promptly (the original leaked the open file).
        with open(output_audio_path, "rb") as audio_file:
            audio_bytes = audio_file.read()
        st.audio(audio_bytes, format="audio/wav")
    elif uploaded_file.name.endswith('.obj'):
        # 3D object -> per-vertex tones.
        st.info("Processing the 3D object...")
        # Save the uploaded .obj file temporarily.
        with open("temp_obj.obj", "wb") as temp_obj_file:
            temp_obj_file.write(uploaded_file.read())
        try:
            audio_path = obj_to_audio("temp_obj.obj")
        finally:
            # Remove the temporary .obj file even if sonification raised.
            os.remove("temp_obj.obj")
        # obj_to_audio returns None on failure (it reports via st.error);
        # the original passed None straight to st.audio.
        if audio_path is not None:
            st.success("Sonification complete. Click the button below to play the audio.")
            st.audio(audio_path, format="audio/wav")
    else:
        # Image -> single audio clip.
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_column_width=True)
        if st.button("Generate Audio"):
            # Convert the Image object to a NumPy array.
            image_np = np.array(image)
            rate, audio = img_to_audio(
                image_np,
                time=time,
                n_fft=n_fft,
                hop_length=hop_length,
                n_iter=n_iter,
                contrast_stretch=contrast_stretch,
                hist_equalize=hist_equalize,
                improve_reconstruction=improve_reconstruction,
            )
            # Display the audio with the sample rate as metadata.
            st.audio(audio, format="audio/wav", sample_rate=rate)
            # Generate and display the waveform plot.
            generate_waveform(audio, rate)
if __name__ == "__main__":
    main()