Update app.py

app.py CHANGED
@@ -10,7 +10,7 @@ from io import BytesIO
 from openai import OpenAI
 import whisper
 from google.cloud import vision
-
+import re
 # st.set_page_config(layout="wide")
 
 load_dotenv()
@@ -196,13 +196,13 @@ def search_keyword(keyword, frame_texts):
 
 
 # Function to generate description for video frames
-def generate_description(base64_frames):
+def generate_description(base64_frames, prompt):
     try:
         prompt_messages = [
             {
                 "role": "user",
                 "content": [
-
+                    prompt,
                     *map(lambda x: {"image": x, "resize": 428}, base64_frames),
                 ],
             },
@@ -212,10 +212,24 @@ def generate_description(base64_frames):
             messages=prompt_messages,
             max_tokens=3000,
         )
-        return response.choices[0].message.content
+        description = response.choices[0].message.content
+
+        # Use a regular expression to find frame numbers
+        frame_numbers = re.findall(r'Frames\s*:\s*(\d+(?:,\s*\d+)*)', response.choices[0].message.content)
+
+        # Convert the string of numbers into a list of integers
+        if frame_numbers:
+            frame_numbers = [int(num) for num in frame_numbers[0].split(',')]
+        else:
+            frame_numbers = []
+
+        print("Frame numbers to extract:", frame_numbers)
+
+        return description, frame_numbers
+
     except Exception as e:
         print(f"Error in generate_description: {e}")
-        return None
+        return None, []
 
 def generate_overall_description(transcript_text, video_description):
     try:
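Taken on its own, the new parsing contract is easy to sanity-check. A minimal sketch, assuming a model reply that ends with the "Frames : 1,2,4,7,9" format the prompt below asks for (sample_reply is invented for illustration):

import re

# Invented reply that follows the format the prompt requests
sample_reply = "A news anchor presents a segment at a studio desk. Frames : 1,2,4,7,9"

matches = re.findall(r'Frames\s*:\s*(\d+(?:,\s*\d+)*)', sample_reply)
frame_numbers = [int(num) for num in matches[0].split(',')] if matches else []
print(frame_numbers)  # [1, 2, 4, 7, 9]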
@@ -251,8 +265,21 @@ with col1:
     keyword = st.text_input("Enter a keyword to filter the frames (optional):")
     extract_frames_button = st.button("Extract Frames")
     uploaded_video = st.file_uploader("Or upload a video file (MP4):", type=["mp4"])
-
-
+    prompt1 = "keyword is " + st.text_input("Enter a keyword for analysis:")
+    prompt2 = "1. Generate a description for this sequence of video frames in about 90 words. 2. Return the following: \
+        i. A list of objects in the video. \
+        ii. Any restricted or sensitive content, and if so, which frame. \
+        iii. The frames are supposed to contain news content, and we want to detect non-news content such as an advertisement. \
+        So analyze specifically for any indications that the content might be promotional or an advertisement. \
+        Find the portions of the video most related to the keyword. \
+        The output will be targeted towards social media (like TikTok or Reels) or news broadcasts. \
+        For the provided frames, return the frames related to the keyword. \
+        I am trying to fill these frames for a TikTok video. \
+        Hence, keep that in mind while selecting the frames. \
+        You do not have to give me the script of the TikTok video. \
+        Just return the most interesting frames in a sequence that would suit a TikTok video. \
+        List all frame numbers separated by commas at the end, e.g., Frames : 1,2,4,7,9"
+    prompt = prompt2 + prompt1
     # Slider to select the number of seconds for extraction
     seconds = st.slider("Select the number of seconds for extraction:", min_value=1, max_value=60, value=10)
 
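One caveat with the prompt assembly above: st.text_input returns an empty string until the user types, so prompt1 starts out as a bare "keyword is " appended directly after "…Frames : 1,2,4,7,9" with no separator. A small guard, sketched here with a hypothetical keyword_input variable, keeps the keyword clause out of the prompt while the field is empty:

import streamlit as st

prompt2 = "..."  # placeholder for the instruction text defined above

keyword_input = st.text_input("Enter a keyword for analysis:")
# Only append the keyword clause once the user has typed something
prompt1 = f" keyword is {keyword_input}" if keyword_input else ""
prompt = prompt2 + prompt1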
@@ -375,7 +402,7 @@ with col1:
 
         # Get consolidated description for all frames
         if ffmpeg_output:
-            description = generate_description(base64_frames)
+            description, frame_numbers = generate_description(base64_frames, prompt)
             if description:
                 st.markdown("**Frame Description:**")
                 st.write(description)
@@ -391,7 +418,7 @@ with col1:
 
             # Get the transcript from whisper
             transcript_text = get_transcript_from_audio(audio_tempfile.name)
-            description = generate_description(base64_frames)
+            description, frame_numbers = generate_description(base64_frames, prompt)
             # Generate overall description using transcript and video description
             overall_description = generate_overall_description(transcript_text, description)
             if overall_description:
@@ -429,6 +456,7 @@ with col1:
         n_frames = len(frame_bytes_list)
         base64_frames = [base64.b64encode(b'\xff\xd8' + frame_bytes).decode('utf-8') for frame_bytes in frame_bytes_list]
 
+        frame_dict = {}
         categories_results = []
         frame_texts = {}
 
@@ -439,6 +467,7 @@ with col1:
             col1, col2 = st.columns([3, 2])
             with col1:
                 frame_bytes = base64.b64decode(frame_base64)
+                frame_dict[idx + 1] = frame_bytes
                 st.image(Image.open(BytesIO(frame_bytes)), caption=f'Frame {idx + 1}', use_column_width=True)
             with col2:
                 st.write(f"Extracted Text: {extracted_text}")
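frame_dict is keyed by idx + 1 so its keys match the 'Frame {idx + 1}' captions rendered in the UI, which is the numbering the model is expected to echo back. A minimal sketch of that round trip, using placeholder bytes in place of real JPEG data:

import base64

# Placeholder frames standing in for the app's base64_frames list
base64_frames = [base64.b64encode(b'\xff\xd8' + bytes([i])).decode('utf-8') for i in range(3)]

frame_dict = {}
for idx, frame_base64 in enumerate(base64_frames):
    frame_dict[idx + 1] = base64.b64decode(frame_base64)  # 1-based keys, matching the captions

print(sorted(frame_dict))  # [1, 2, 3] -- numbers a model reply can reference directly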
@@ -482,7 +511,7 @@ with col1:
 
         # Get consolidated description for all frames
         if ffmpeg_output:
-            description = generate_description(base64_frames)
+            description, frame_numbers = generate_description(base64_frames, prompt)
             if description:
                 st.markdown("**Frame Description:**")
                 st.write(description)
@@ -503,7 +532,59 @@ with col1:
 
             # Get the transcript from whisper
             transcript_text = get_transcript_from_audio(audio_tempfile.name)
-            description = generate_description(base64_frames)
+            description, frame_numbers = generate_description(base64_frames, prompt)
+
+            if frame_numbers:
+                print("Frame numbers to extract:", frame_numbers)  # Check frame numbers
+
+                # Create a mapping from original frame numbers to sequential numbers
+                frame_mapping = {}
+                new_frame_numbers = []
+                for idx, frame_number in enumerate(sorted(frame_numbers)):
+                    frame_mapping[frame_number] = idx + 1
+                    new_frame_numbers.append(idx + 1)
+
+                print("New frame numbers:", new_frame_numbers)
+                print("Frame mapping:", frame_mapping)
+
+                # Create a temporary directory to store images
+                with tempfile.TemporaryDirectory() as temp_dir:
+                    image_paths = []
+                    for frame_number in frame_numbers:
+                        if frame_number in frame_dict:
+                            frame_path = os.path.join(temp_dir, f'frame_{frame_mapping[frame_number]:03}.jpg')  # Sequential file naming
+                            image_paths.append(frame_path)
+                            with open(frame_path, 'wb') as f:
+                                f.write(frame_dict[frame_number])
+
+                    #image = Image.open(BytesIO(frame_bytes))
+                    #st.image(image, caption='Selected Frame', use_column_width=True)
+                    #with open(frame_path, "rb") as file:
+                    #    btn = st.download_button(
+                    #        label="Download Frame",
+                    #        data=file,
+                    #        file_name=f'frame_{frame_number}.jpg',
+                    #        mime="image/jpeg"
+                    #    )
+                    # Once all selected frames are saved as images, create a video from them using FFmpeg
+                    video_output_path = os.path.join(temp_dir, 'output5.mp4')
+                    framerate = 1  # Adjust framerate based on the number of frames
+                    ffmpeg_command = [
+                        'ffmpeg',
+                        '-framerate', str(framerate),  # Set framerate based on the number of frames
+                        '-i', os.path.join(temp_dir, 'frame_%03d.jpg'),  # Input pattern for all frame files
+                        '-c:v', 'libx264',
+                        '-pix_fmt', 'yuv420p',
+                        video_output_path
+                    ]
+
+                    print("FFmpeg command:", ' '.join(ffmpeg_command))  # Debug FFmpeg command
+
+                    subprocess.run(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+                    # Display or provide a download link for the created video
+                    st.header("Final Video")
+                    st.video(video_output_path)
             # Generate overall description using transcript and video description
             overall_description = generate_overall_description(transcript_text, description)
             if overall_description:
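The renumbering in the last hunk exists because FFmpeg's frame_%03d.jpg input pattern expects a gap-free sequence: selected frames 2, 5, 9 have to be written out as frame_001.jpg through frame_003.jpg before assembly. A standalone sketch of the same pipeline, with invented selected_frames data (the byte strings are placeholders, not decodable JPEGs, so FFmpeg would report an error here); checking returncode, which the commit discards, is the one deliberate addition:

import os
import subprocess
import tempfile

# Invented selection: original frame number -> JPEG bytes
selected_frames = {2: b'...', 5: b'...', 9: b'...'}

with tempfile.TemporaryDirectory() as temp_dir:
    # Rewrite arbitrary frame numbers as a gap-free 001, 002, ... sequence
    for seq, number in enumerate(sorted(selected_frames), start=1):
        with open(os.path.join(temp_dir, f'frame_{seq:03}.jpg'), 'wb') as f:
            f.write(selected_frames[number])

    video_output_path = os.path.join(temp_dir, 'output.mp4')
    result = subprocess.run(
        ['ffmpeg', '-framerate', '1',
         '-i', os.path.join(temp_dir, 'frame_%03d.jpg'),
         '-c:v', 'libx264', '-pix_fmt', 'yuv420p', video_output_path],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
    )
    if result.returncode != 0:
        print(result.stderr.decode())  # Surface FFmpeg errors instead of ignoring them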
|