Spaces:

gnosticdev
/

audio-to-video-generator

Sleeping

App Files Community

wower99 committed on Jan 25, 2025

Commit

e952cc2

1 Parent(s): 7aef441

Better UI with progressbar and download button

Browse files

Files changed (2) hide show

app.py +33 -32
utils.py +51 -97

app.py CHANGED Viewed

@@ -40,15 +40,6 @@ audio_file = st.file_uploader("🔼 Upload your audio file:", type=constants.SUP
 print(audio_file,'is the upload')
-# if audio_file is not None:
-#     # Check the duration of the uploaded audio file
-#     duration = get_audio_duration(audio_file)
-#     # Allow only files up to 5 minutes (300 seconds)
-#     if duration > 300:
-#         st.error("The uploaded audio file exceeds the 5-minute limit. Please upload a shorter file.")
-#     else:
-#         st.success(f"Audio file uploaded successfully! Duration: {duration/60:.2f} minutes")
 if audio_file:
     # Reset states only when a new file is uploaded
@@ -69,7 +60,7 @@ if audio_file:
     result = client.audio.transcriptions.create(
         file=(audio_file.name, file_bytes),  # Send the audio file content directly to the API
         model="whisper-large-v3-turbo",  # Model to use for transcription
-        prompt="Specify context or spelling",  # Optional context for better transcription accuracy
         response_format="verbose_json",  # Return detailed JSON response
         temperature=0.0,  # Control randomness in the transcription output
     )
@@ -115,35 +106,45 @@ if audio_file:
     # Generate images only if they have not been generated already
     if st.session_state.image_prompts and not st.session_state.generated_images:
-        with st.spinner("Generating images... Please wait."):
-            for prompt, image_path in generate_images(st.session_state.image_prompts):
-                # # Display each image as soon as it's generated
-                # st.image(image_path, caption=f"{prompt}", use_container_width=True)
-                # Append the generated image to the session state
-                st.session_state.generated_images.append((prompt, image_path))
-    # # Display all previously generated images (including newly generated ones)
-    # else:
-    #     for prompt, image_path in st.session_state.generated_images:
-    #         st.image(image_path, caption=f"{prompt}", use_container_width=True)
     # Generate video when all images are generated
     if st.session_state.generated_images and st.session_state.audio:
-        if st.button("Generate Video"):
-            with st.spinner("Generating video... Please wait."):
-                # Map images to segments
-                image_paths = [img[1] for img in st.session_state.generated_images]
-                generated_video_path = generate_video(
-                    audio_file=st.session_state.audio,
-                    images=image_paths,
-                    segments=st.session_state.segments
-                )
-                st.session_state.generated_video = generated_video_path
-                st.success("Video generated successfully!")
     # Display the generated video
     if st.session_state.generated_video:
         st.video(st.session_state.generated_video)
 else:
     st.warning("Please upload an audio file to proceed.")

 print(audio_file,'is the upload')
 if audio_file:
     # Reset states only when a new file is uploaded
     result = client.audio.transcriptions.create(
         file=(audio_file.name, file_bytes),  # Send the audio file content directly to the API
         model="whisper-large-v3-turbo",  # Model to use for transcription
+        prompt="Take Note of Overall Context of the Audio",  # Optional context for better transcription accuracy
         response_format="verbose_json",  # Return detailed JSON response
         temperature=0.0,  # Control randomness in the transcription output
     )
     # Generate images only if they have not been generated already
     if st.session_state.image_prompts and not st.session_state.generated_images:
+        progress_placeholder = st.empty()
+        progress_bar = st.progress(0)
+        total_images = len(st.session_state.image_prompts)
+        progress_placeholder.text(f"Generating images. Please be patient...")
+        for idx, (prompt, image_path) in enumerate(generate_images(st.session_state.image_prompts)):
+            st.session_state.generated_images.append((prompt, image_path))
+            progress = (idx + 1) / total_images
+            progress_bar.progress(progress)
+            progress_placeholder.text(f"Generated image {idx + 1} of {total_images}: {prompt[:50]}...")
+        progress_placeholder.text("✅ All images generated successfully!")
+        progress_bar.empty()
     # Generate video when all images are generated
     if st.session_state.generated_images and st.session_state.audio:
+        with st.spinner("Generating video... Please wait."):
+            # Map images to segments
+            image_paths = [img[1] for img in st.session_state.generated_images]
+            generated_video_path = generate_video(
+                audio_file=st.session_state.audio,
+                images=image_paths,
+                segments=st.session_state.segments
+            )
+            st.session_state.generated_video = generated_video_path
+            st.success("Video generated successfully!")
     # Display the generated video
     if st.session_state.generated_video:
         st.video(st.session_state.generated_video)
+        # Add a download button for the generated video
+        with open(st.session_state.generated_video, "rb") as file:
+            st.download_button(
+                label="Download Video",
+                data=file,
+                file_name="generated_video.mp4",
+                mime="video/mp4"
+            )
 else:
     st.warning("Please upload an audio file to proceed.")

utils.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import requests
 import constants
 import os
@@ -54,33 +53,7 @@ def get_translation(text: str):
         print(f"An exception occurred: {e}")
         return {"error_occured" : e}
-def old_get_image_prompts(text_input):
-    headers = {
-        "Authorization": f"Bearer {constants.HF_TOKEN}",  # Replace with your token
-        "Content-Type": "application/json"  # Optional, ensures JSON payload
-    }
-    endpoint = f"{constants.PROMPT_GENERATION_ENDPOINT}"
-    payload = {"text_input": text_input}
-    try:
-            # Send the POST request
-        print("making post request for image prompts", endpoint)
-        response = requests.post(endpoint, json=payload, headers=headers)
-        # Raise an exception for HTTP errors
-        response.raise_for_status()
-        # Parse JSON response
-        result = response.json()
-        return result
-    except requests.exceptions.RequestException as e:
-        print(f"Error during request: {e}")
-        return {"error": str(e)}
 def segments_to_chunks(segments):
     chunks = []
     for segment in segments:
@@ -98,7 +71,7 @@ def get_image_prompts(text_input : List):
     extractor = StructuredOutputExtractor(response_schema=ImagePromptResponseSchema)
     chunks_count = len(text_input)
     chunks = "chunk: " + "\nchunk: ".join(text_input)
-    prompt = f"""ROLE: You are a Highly Experienced Image Prompt Sythesizer
     TASK:  Generate {chunks_count} image prompts, Each per chunk\n\n {chunks}"""
     result = extractor.extract(prompt)
     return result.model_dump()   # returns dictionary version pydantic model
@@ -158,62 +131,15 @@ def tmp_folder(folder_name: str) -> str:
-def old_generate_video(audio_file, images, segments):
-    print(f"images: {images}")
-    print(f"segments: {segments}")
-    print(f"audio file: {audio_file.name}")
-    try:
-        # Save the uploaded audio file to a temporary location
-        file_extension = os.path.splitext(audio_file.name)[1]
-        temp_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=f"{file_extension}")
-        temp_audio_path.write(audio_file.read())
-        temp_audio_path.close()
-        # Load the audio file using MoviePy
-        audio = mp.AudioFileClip(temp_audio_path.name)
-        audio_duration = audio.duration
-        # Create video clips for each segment using the corresponding image
-        video_clips = []
-        for i, segment in enumerate(segments):
-            start_time = segment["start"]
-            end_time = segment["end"]
-            # Ensure the image index is within bounds
-            image_path = images[min(i, len(images) - 1)]
-            # Create an ImageClip for the current segment
-            image_clip = ImageClip(image_path, duration=end_time - start_time)
-            image_clip = image_clip.set_start(start_time).set_end(end_time)
-            video_clips.append(image_clip)
-        # Concatenate all the image clips to form the video
-        video = mp.concatenate_videoclips(video_clips, method="compose")
-        # Add the audio to the video
-        video = video.set_audio(audio)
-        # Save the video to a temporary file
-        temp_dir = tempfile.gettempdir()
-        video_path = os.path.join(temp_dir, "generated_video.mp4")
-        video.write_videofile(video_path, fps=24, codec="libx264", audio_codec="aac")
-        # Clean up the temporary audio file
-        os.remove(temp_audio_path.name)
-        return video_path
-    except Exception as e:
-        print(f"Error generating video: {e}")
-        return
-from moviepy.editor import *
 def generate_video(audio_file, images, segments):
-    print(f"images: {images}")
-    print(f"segments: {segments}")
-    print(f"audio file: {audio_file.name}")
     try:
         # Save the uploaded audio file to a temporary location
         file_extension = os.path.splitext(audio_file.name)[1]
@@ -223,36 +149,58 @@ def generate_video(audio_file, images, segments):
         # Load the audio file using MoviePy
         audio = AudioFileClip(temp_audio_path.name)
-        audio_duration = audio.duration
-        # Define YouTube-like dimensions (16:9 aspect ratio, e.g., 1920x1080)
-        frame_width = 1920
-        frame_height = 1080
-        # Create video clips for each segment using the corresponding image
         video_clips = []
-        for i, segment in enumerate(segments):
-            start_time = segment["start"]
-            end_time = segment["end"]
             # Ensure the image index is within bounds
             image_path = images[min(i, len(images) - 1)]
             # Create an ImageClip for the current segment
-            image_clip = ImageClip(image_path, duration=end_time - start_time)
             # Resize and pad the image to fit a 16:9 aspect ratio
             image_clip = image_clip.resize(height=frame_height).on_color(
                 size=(frame_width, frame_height),
                 color=(0, 0, 0),  # Black background
                 pos="center"      # Center the image
             )
-            # Set the timing of the clip
-            image_clip = image_clip.set_start(start_time).set_end(end_time)
             video_clips.append(image_clip)
         # Concatenate all the image clips to form the video
         video = concatenate_videoclips(video_clips, method="compose")
         # Add the audio to the video
@@ -261,16 +209,22 @@ def generate_video(audio_file, images, segments):
         # Save the video to a temporary file
         temp_dir = tempfile.gettempdir()
         video_path = os.path.join(temp_dir, "generated_video.mp4")
-        video.write_videofile(video_path, fps=24, codec="libx264", audio_codec="aac")
         # Clean up the temporary audio file
         os.remove(temp_audio_path.name)
         return video_path
     except Exception as e:
         print(f"Error generating video: {e}")
-        return
 # Example usage:

 import requests
 import constants
 import os
         print(f"An exception occurred: {e}")
         return {"error_occured" : e}
 def segments_to_chunks(segments):
     chunks = []
     for segment in segments:
     extractor = StructuredOutputExtractor(response_schema=ImagePromptResponseSchema)
     chunks_count = len(text_input)
     chunks = "chunk: " + "\nchunk: ".join(text_input)
+    prompt = f"""ROLE: You are a Highly Experienced Image Prompt Sythesizer (try to avoid explicit unethical prompt gracefully as much as possible)
     TASK:  Generate {chunks_count} image prompts, Each per chunk\n\n {chunks}"""
     result = extractor.extract(prompt)
     return result.model_dump()   # returns dictionary version pydantic model
+from moviepy.editor import *
+import os
+import tempfile
+from moviepy.editor import AudioFileClip, ImageClip, concatenate_videoclips
 def generate_video(audio_file, images, segments):
     try:
         # Save the uploaded audio file to a temporary location
         file_extension = os.path.splitext(audio_file.name)[1]
         # Load the audio file using MoviePy
         audio = AudioFileClip(temp_audio_path.name)
+        # Define YouTube-like dimensions (16:9 aspect ratio)
+        frame_width = 1280
+        frame_height = 720
         video_clips = []
+        total_segments = len(segments)
+        for i, current_segment in enumerate(segments):
+            start_time = current_segment["start"]
+            end_time = current_segment["end"]
+            # Calculate the actual duration including any gap until the next segment
+            if i < total_segments - 1:
+                # If there's a next segment, extend until it starts
+                next_segment = segments[i + 1]
+                actual_end_time = next_segment["start"]
+            else:
+                # For the last segment, use its end time
+                actual_end_time = end_time
+            # Calculate total duration including any gap
+            segment_duration = actual_end_time - start_time
+            print(f"\nProcessing segment {i + 1}/{total_segments}:")
+            print(f"  Start time: {start_time}s")
+            print(f"  Base end time: {end_time}s")
+            print(f"  Actual end time: {actual_end_time}s")
+            print(f"  Total duration: {segment_duration}s")
+            print(f"  Text: '{current_segment['text']}'")
             # Ensure the image index is within bounds
             image_path = images[min(i, len(images) - 1)]
             # Create an ImageClip for the current segment
+            image_clip = ImageClip(image_path)
             # Resize and pad the image to fit a 16:9 aspect ratio
             image_clip = image_clip.resize(height=frame_height).on_color(
                 size=(frame_width, frame_height),
                 color=(0, 0, 0),  # Black background
                 pos="center"      # Center the image
             )
+            # Set the duration and start time for the clip
+            image_clip = image_clip.set_duration(segment_duration)
+            image_clip = image_clip.set_start(start_time)  # Set the start time explicitly
             video_clips.append(image_clip)
         # Concatenate all the image clips to form the video
+        print("Concatenating video clips...")
         video = concatenate_videoclips(video_clips, method="compose")
         # Add the audio to the video
         # Save the video to a temporary file
         temp_dir = tempfile.gettempdir()
         video_path = os.path.join(temp_dir, "generated_video.mp4")
+        print(f"Writing video file to {video_path}...")
+        video.write_videofile(video_path, fps=30, codec="libx264", audio_codec="aac")
         # Clean up the temporary audio file
         os.remove(temp_audio_path.name)
+        print("Temporary audio file removed.")
         return video_path
     except Exception as e:
         print(f"Error generating video: {e}")
+        return None
 # Example usage: