Spaces:
Sleeping
Sleeping
Better UI with progressbar and download button
Browse files
app.py
CHANGED
|
@@ -40,15 +40,6 @@ audio_file = st.file_uploader("🔼 Upload your audio file:", type=constants.SUP
|
|
| 40 |
|
| 41 |
print(audio_file,'is the upload')
|
| 42 |
|
| 43 |
-
# if audio_file is not None:
|
| 44 |
-
# # Check the duration of the uploaded audio file
|
| 45 |
-
# duration = get_audio_duration(audio_file)
|
| 46 |
-
|
| 47 |
-
# # Allow only files up to 5 minutes (300 seconds)
|
| 48 |
-
# if duration > 300:
|
| 49 |
-
# st.error("The uploaded audio file exceeds the 5-minute limit. Please upload a shorter file.")
|
| 50 |
-
# else:
|
| 51 |
-
# st.success(f"Audio file uploaded successfully! Duration: {duration/60:.2f} minutes")
|
| 52 |
|
| 53 |
if audio_file:
|
| 54 |
# Reset states only when a new file is uploaded
|
|
@@ -69,7 +60,7 @@ if audio_file:
|
|
| 69 |
result = client.audio.transcriptions.create(
|
| 70 |
file=(audio_file.name, file_bytes), # Send the audio file content directly to the API
|
| 71 |
model="whisper-large-v3-turbo", # Model to use for transcription
|
| 72 |
-
prompt="
|
| 73 |
response_format="verbose_json", # Return detailed JSON response
|
| 74 |
temperature=0.0, # Control randomness in the transcription output
|
| 75 |
)
|
|
@@ -115,35 +106,45 @@ if audio_file:
|
|
| 115 |
|
| 116 |
# Generate images only if they have not been generated already
|
| 117 |
if st.session_state.image_prompts and not st.session_state.generated_images:
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
|
|
|
|
|
|
| 129 |
|
| 130 |
# Generate video when all images are generated
|
| 131 |
if st.session_state.generated_images and st.session_state.audio:
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
st.success("Video generated successfully!")
|
| 143 |
|
| 144 |
# Display the generated video
|
| 145 |
if st.session_state.generated_video:
|
| 146 |
st.video(st.session_state.generated_video)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
else:
|
| 149 |
st.warning("Please upload an audio file to proceed.")
|
|
|
|
| 40 |
|
| 41 |
print(audio_file,'is the upload')
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
if audio_file:
|
| 45 |
# Reset states only when a new file is uploaded
|
|
|
|
| 60 |
result = client.audio.transcriptions.create(
|
| 61 |
file=(audio_file.name, file_bytes), # Send the audio file content directly to the API
|
| 62 |
model="whisper-large-v3-turbo", # Model to use for transcription
|
| 63 |
+
prompt="Take Note of Overall Context of the Audio", # Optional context for better transcription accuracy
|
| 64 |
response_format="verbose_json", # Return detailed JSON response
|
| 65 |
temperature=0.0, # Control randomness in the transcription output
|
| 66 |
)
|
|
|
|
| 106 |
|
| 107 |
# Generate images only if they have not been generated already
|
| 108 |
if st.session_state.image_prompts and not st.session_state.generated_images:
|
| 109 |
+
progress_placeholder = st.empty()
|
| 110 |
+
progress_bar = st.progress(0)
|
| 111 |
+
total_images = len(st.session_state.image_prompts)
|
| 112 |
+
progress_placeholder.text(f"Generating images. Please be patient...")
|
| 113 |
+
|
| 114 |
+
for idx, (prompt, image_path) in enumerate(generate_images(st.session_state.image_prompts)):
|
| 115 |
+
st.session_state.generated_images.append((prompt, image_path))
|
| 116 |
+
progress = (idx + 1) / total_images
|
| 117 |
+
progress_bar.progress(progress)
|
| 118 |
+
progress_placeholder.text(f"Generated image {idx + 1} of {total_images}: {prompt[:50]}...")
|
| 119 |
+
|
| 120 |
+
progress_placeholder.text("✅ All images generated successfully!")
|
| 121 |
+
progress_bar.empty()
|
| 122 |
|
| 123 |
# Generate video when all images are generated
|
| 124 |
if st.session_state.generated_images and st.session_state.audio:
|
| 125 |
+
with st.spinner("Generating video... Please wait."):
|
| 126 |
+
# Map images to segments
|
| 127 |
+
image_paths = [img[1] for img in st.session_state.generated_images]
|
| 128 |
+
generated_video_path = generate_video(
|
| 129 |
+
audio_file=st.session_state.audio,
|
| 130 |
+
images=image_paths,
|
| 131 |
+
segments=st.session_state.segments
|
| 132 |
+
)
|
| 133 |
+
st.session_state.generated_video = generated_video_path
|
| 134 |
+
st.success("Video generated successfully!")
|
|
|
|
| 135 |
|
| 136 |
# Display the generated video
|
| 137 |
if st.session_state.generated_video:
|
| 138 |
st.video(st.session_state.generated_video)
|
| 139 |
+
|
| 140 |
+
# Add a download button for the generated video
|
| 141 |
+
with open(st.session_state.generated_video, "rb") as file:
|
| 142 |
+
st.download_button(
|
| 143 |
+
label="Download Video",
|
| 144 |
+
data=file,
|
| 145 |
+
file_name="generated_video.mp4",
|
| 146 |
+
mime="video/mp4"
|
| 147 |
+
)
|
| 148 |
|
| 149 |
else:
|
| 150 |
st.warning("Please upload an audio file to proceed.")
|
utils.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
|
| 2 |
import requests
|
| 3 |
import constants
|
| 4 |
import os
|
|
@@ -54,33 +53,7 @@ def get_translation(text: str):
|
|
| 54 |
print(f"An exception occurred: {e}")
|
| 55 |
return {"error_occured" : e}
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
def old_get_image_prompts(text_input):
|
| 60 |
-
headers = {
|
| 61 |
-
"Authorization": f"Bearer {constants.HF_TOKEN}", # Replace with your token
|
| 62 |
-
"Content-Type": "application/json" # Optional, ensures JSON payload
|
| 63 |
-
}
|
| 64 |
-
|
| 65 |
-
endpoint = f"{constants.PROMPT_GENERATION_ENDPOINT}"
|
| 66 |
-
payload = {"text_input": text_input}
|
| 67 |
-
|
| 68 |
-
try:
|
| 69 |
-
# Send the POST request
|
| 70 |
-
print("making post request for image prompts", endpoint)
|
| 71 |
-
response = requests.post(endpoint, json=payload, headers=headers)
|
| 72 |
-
|
| 73 |
-
# Raise an exception for HTTP errors
|
| 74 |
-
response.raise_for_status()
|
| 75 |
-
|
| 76 |
-
# Parse JSON response
|
| 77 |
-
result = response.json()
|
| 78 |
-
return result
|
| 79 |
-
|
| 80 |
-
except requests.exceptions.RequestException as e:
|
| 81 |
-
print(f"Error during request: {e}")
|
| 82 |
-
return {"error": str(e)}
|
| 83 |
-
|
| 84 |
def segments_to_chunks(segments):
|
| 85 |
chunks = []
|
| 86 |
for segment in segments:
|
|
@@ -98,7 +71,7 @@ def get_image_prompts(text_input : List):
|
|
| 98 |
extractor = StructuredOutputExtractor(response_schema=ImagePromptResponseSchema)
|
| 99 |
chunks_count = len(text_input)
|
| 100 |
chunks = "chunk: " + "\nchunk: ".join(text_input)
|
| 101 |
-
prompt = f"""ROLE: You are a Highly Experienced Image Prompt Sythesizer
|
| 102 |
TASK: Generate {chunks_count} image prompts, Each per chunk\n\n {chunks}"""
|
| 103 |
result = extractor.extract(prompt)
|
| 104 |
return result.model_dump() # returns dictionary version pydantic model
|
|
@@ -158,62 +131,15 @@ def tmp_folder(folder_name: str) -> str:
|
|
| 158 |
|
| 159 |
|
| 160 |
|
| 161 |
-
|
| 162 |
-
print(f"images: {images}")
|
| 163 |
-
print(f"segments: {segments}")
|
| 164 |
-
print(f"audio file: {audio_file.name}")
|
| 165 |
-
try:
|
| 166 |
-
# Save the uploaded audio file to a temporary location
|
| 167 |
-
file_extension = os.path.splitext(audio_file.name)[1]
|
| 168 |
-
temp_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=f"{file_extension}")
|
| 169 |
-
temp_audio_path.write(audio_file.read())
|
| 170 |
-
temp_audio_path.close()
|
| 171 |
-
|
| 172 |
-
# Load the audio file using MoviePy
|
| 173 |
-
audio = mp.AudioFileClip(temp_audio_path.name)
|
| 174 |
-
audio_duration = audio.duration
|
| 175 |
-
|
| 176 |
-
# Create video clips for each segment using the corresponding image
|
| 177 |
-
video_clips = []
|
| 178 |
-
for i, segment in enumerate(segments):
|
| 179 |
-
start_time = segment["start"]
|
| 180 |
-
end_time = segment["end"]
|
| 181 |
-
|
| 182 |
-
# Ensure the image index is within bounds
|
| 183 |
-
image_path = images[min(i, len(images) - 1)]
|
| 184 |
-
|
| 185 |
-
# Create an ImageClip for the current segment
|
| 186 |
-
image_clip = ImageClip(image_path, duration=end_time - start_time)
|
| 187 |
-
image_clip = image_clip.set_start(start_time).set_end(end_time)
|
| 188 |
-
video_clips.append(image_clip)
|
| 189 |
-
|
| 190 |
-
# Concatenate all the image clips to form the video
|
| 191 |
-
video = mp.concatenate_videoclips(video_clips, method="compose")
|
| 192 |
-
|
| 193 |
-
# Add the audio to the video
|
| 194 |
-
video = video.set_audio(audio)
|
| 195 |
-
|
| 196 |
-
# Save the video to a temporary file
|
| 197 |
-
temp_dir = tempfile.gettempdir()
|
| 198 |
-
video_path = os.path.join(temp_dir, "generated_video.mp4")
|
| 199 |
-
video.write_videofile(video_path, fps=24, codec="libx264", audio_codec="aac")
|
| 200 |
-
|
| 201 |
-
# Clean up the temporary audio file
|
| 202 |
-
os.remove(temp_audio_path.name)
|
| 203 |
|
| 204 |
-
return video_path
|
| 205 |
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
|
| 211 |
-
from moviepy.editor import *
|
| 212 |
|
| 213 |
def generate_video(audio_file, images, segments):
|
| 214 |
-
print(f"images: {images}")
|
| 215 |
-
print(f"segments: {segments}")
|
| 216 |
-
print(f"audio file: {audio_file.name}")
|
| 217 |
try:
|
| 218 |
# Save the uploaded audio file to a temporary location
|
| 219 |
file_extension = os.path.splitext(audio_file.name)[1]
|
|
@@ -223,36 +149,58 @@ def generate_video(audio_file, images, segments):
|
|
| 223 |
|
| 224 |
# Load the audio file using MoviePy
|
| 225 |
audio = AudioFileClip(temp_audio_path.name)
|
| 226 |
-
audio_duration = audio.duration
|
| 227 |
|
| 228 |
-
# Define YouTube-like dimensions (16:9 aspect ratio
|
| 229 |
-
frame_width =
|
| 230 |
-
frame_height =
|
| 231 |
|
| 232 |
-
# Create video clips for each segment using the corresponding image
|
| 233 |
video_clips = []
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
# Ensure the image index is within bounds
|
| 239 |
image_path = images[min(i, len(images) - 1)]
|
| 240 |
-
|
| 241 |
# Create an ImageClip for the current segment
|
| 242 |
-
image_clip = ImageClip(image_path
|
| 243 |
-
|
| 244 |
# Resize and pad the image to fit a 16:9 aspect ratio
|
| 245 |
image_clip = image_clip.resize(height=frame_height).on_color(
|
| 246 |
size=(frame_width, frame_height),
|
| 247 |
color=(0, 0, 0), # Black background
|
| 248 |
pos="center" # Center the image
|
| 249 |
)
|
| 250 |
-
|
| 251 |
-
# Set the
|
| 252 |
-
image_clip = image_clip.
|
|
|
|
|
|
|
| 253 |
video_clips.append(image_clip)
|
| 254 |
|
| 255 |
# Concatenate all the image clips to form the video
|
|
|
|
| 256 |
video = concatenate_videoclips(video_clips, method="compose")
|
| 257 |
|
| 258 |
# Add the audio to the video
|
|
@@ -261,16 +209,22 @@ def generate_video(audio_file, images, segments):
|
|
| 261 |
# Save the video to a temporary file
|
| 262 |
temp_dir = tempfile.gettempdir()
|
| 263 |
video_path = os.path.join(temp_dir, "generated_video.mp4")
|
| 264 |
-
|
|
|
|
| 265 |
|
| 266 |
# Clean up the temporary audio file
|
| 267 |
os.remove(temp_audio_path.name)
|
|
|
|
| 268 |
|
| 269 |
return video_path
|
| 270 |
|
| 271 |
except Exception as e:
|
| 272 |
print(f"Error generating video: {e}")
|
| 273 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
|
| 276 |
# Example usage:
|
|
|
|
|
|
|
| 1 |
import requests
|
| 2 |
import constants
|
| 3 |
import os
|
|
|
|
| 53 |
print(f"An exception occurred: {e}")
|
| 54 |
return {"error_occured" : e}
|
| 55 |
|
| 56 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
def segments_to_chunks(segments):
|
| 58 |
chunks = []
|
| 59 |
for segment in segments:
|
|
|
|
| 71 |
extractor = StructuredOutputExtractor(response_schema=ImagePromptResponseSchema)
|
| 72 |
chunks_count = len(text_input)
|
| 73 |
chunks = "chunk: " + "\nchunk: ".join(text_input)
|
| 74 |
+
prompt = f"""ROLE: You are a Highly Experienced Image Prompt Sythesizer (try to avoid explicit unethical prompt gracefully as much as possible)
|
| 75 |
TASK: Generate {chunks_count} image prompts, Each per chunk\n\n {chunks}"""
|
| 76 |
result = extractor.extract(prompt)
|
| 77 |
return result.model_dump() # returns dictionary version pydantic model
|
|
|
|
| 131 |
|
| 132 |
|
| 133 |
|
| 134 |
+
from moviepy.editor import *
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
|
|
|
| 136 |
|
| 137 |
+
import os
|
| 138 |
+
import tempfile
|
| 139 |
+
from moviepy.editor import AudioFileClip, ImageClip, concatenate_videoclips
|
|
|
|
| 140 |
|
|
|
|
| 141 |
|
| 142 |
def generate_video(audio_file, images, segments):
|
|
|
|
|
|
|
|
|
|
| 143 |
try:
|
| 144 |
# Save the uploaded audio file to a temporary location
|
| 145 |
file_extension = os.path.splitext(audio_file.name)[1]
|
|
|
|
| 149 |
|
| 150 |
# Load the audio file using MoviePy
|
| 151 |
audio = AudioFileClip(temp_audio_path.name)
|
|
|
|
| 152 |
|
| 153 |
+
# Define YouTube-like dimensions (16:9 aspect ratio)
|
| 154 |
+
frame_width = 1280
|
| 155 |
+
frame_height = 720
|
| 156 |
|
|
|
|
| 157 |
video_clips = []
|
| 158 |
+
total_segments = len(segments)
|
| 159 |
+
|
| 160 |
+
for i, current_segment in enumerate(segments):
|
| 161 |
+
start_time = current_segment["start"]
|
| 162 |
+
end_time = current_segment["end"]
|
| 163 |
+
|
| 164 |
+
# Calculate the actual duration including any gap until the next segment
|
| 165 |
+
if i < total_segments - 1:
|
| 166 |
+
# If there's a next segment, extend until it starts
|
| 167 |
+
next_segment = segments[i + 1]
|
| 168 |
+
actual_end_time = next_segment["start"]
|
| 169 |
+
else:
|
| 170 |
+
# For the last segment, use its end time
|
| 171 |
+
actual_end_time = end_time
|
| 172 |
+
|
| 173 |
+
# Calculate total duration including any gap
|
| 174 |
+
segment_duration = actual_end_time - start_time
|
| 175 |
+
|
| 176 |
+
print(f"\nProcessing segment {i + 1}/{total_segments}:")
|
| 177 |
+
print(f" Start time: {start_time}s")
|
| 178 |
+
print(f" Base end time: {end_time}s")
|
| 179 |
+
print(f" Actual end time: {actual_end_time}s")
|
| 180 |
+
print(f" Total duration: {segment_duration}s")
|
| 181 |
+
print(f" Text: '{current_segment['text']}'")
|
| 182 |
+
|
| 183 |
# Ensure the image index is within bounds
|
| 184 |
image_path = images[min(i, len(images) - 1)]
|
| 185 |
+
|
| 186 |
# Create an ImageClip for the current segment
|
| 187 |
+
image_clip = ImageClip(image_path)
|
| 188 |
+
|
| 189 |
# Resize and pad the image to fit a 16:9 aspect ratio
|
| 190 |
image_clip = image_clip.resize(height=frame_height).on_color(
|
| 191 |
size=(frame_width, frame_height),
|
| 192 |
color=(0, 0, 0), # Black background
|
| 193 |
pos="center" # Center the image
|
| 194 |
)
|
| 195 |
+
|
| 196 |
+
# Set the duration and start time for the clip
|
| 197 |
+
image_clip = image_clip.set_duration(segment_duration)
|
| 198 |
+
image_clip = image_clip.set_start(start_time) # Set the start time explicitly
|
| 199 |
+
|
| 200 |
video_clips.append(image_clip)
|
| 201 |
|
| 202 |
# Concatenate all the image clips to form the video
|
| 203 |
+
print("Concatenating video clips...")
|
| 204 |
video = concatenate_videoclips(video_clips, method="compose")
|
| 205 |
|
| 206 |
# Add the audio to the video
|
|
|
|
| 209 |
# Save the video to a temporary file
|
| 210 |
temp_dir = tempfile.gettempdir()
|
| 211 |
video_path = os.path.join(temp_dir, "generated_video.mp4")
|
| 212 |
+
print(f"Writing video file to {video_path}...")
|
| 213 |
+
video.write_videofile(video_path, fps=30, codec="libx264", audio_codec="aac")
|
| 214 |
|
| 215 |
# Clean up the temporary audio file
|
| 216 |
os.remove(temp_audio_path.name)
|
| 217 |
+
print("Temporary audio file removed.")
|
| 218 |
|
| 219 |
return video_path
|
| 220 |
|
| 221 |
except Exception as e:
|
| 222 |
print(f"Error generating video: {e}")
|
| 223 |
+
return None
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
|
| 228 |
|
| 229 |
|
| 230 |
# Example usage:
|