Update app.py

app.py CHANGED
@@ -10,7 +10,7 @@ from io import BytesIO
 from openai import OpenAI
 import whisper
 from google.cloud import vision
-
+import re
 # st.set_page_config(layout="wide")
 
 load_dotenv()
@@ -196,13 +196,13 @@ def search_keyword(keyword, frame_texts):
 
 
 # Function to generate description for video frames
-def generate_description(base64_frames):
+def generate_description(base64_frames, prompt):
     try:
         prompt_messages = [
             {
                 "role": "user",
                 "content": [
-
+                    prompt,
                     *map(lambda x: {"image": x, "resize": 428}, base64_frames),
                 ],
             },
@@ -212,10 +212,24 @@ def generate_description(base64_frames):
             messages=prompt_messages,
             max_tokens=3000,
         )
-        return response.choices[0].message.content
+        description = response.choices[0].message.content
+
+        # Use a regular expression to find frame numbers
+        frame_numbers = re.findall(r'Frames\s*:\s*(\d+(?:,\s*\d+)*)', response.choices[0].message.content)
+
+        # Convert the string of numbers into a list of integers
+        if frame_numbers:
+            frame_numbers = [int(num) for num in frame_numbers[0].split(',')]
+        else:
+            frame_numbers = []
+
+        print("Frame numbers to extract:", frame_numbers)
+
+        return description, frame_numbers
+
     except Exception as e:
         print(f"Error in generate_description: {e}")
-        return None
+        return None, []
 
 def generate_overall_description(transcript_text, video_description):
     try:
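Taken on its own, the new parsing contract is easy to sanity-check. A minimal sketch, assuming a model reply that ends with the "Frames : 1,2,4,7,9" format the prompt below asks for (sample_reply is invented for illustration):

import re

# Invented reply that follows the format the prompt requests
sample_reply = "A news anchor presents a segment at a studio desk. Frames : 1,2,4,7,9"

matches = re.findall(r'Frames\s*:\s*(\d+(?:,\s*\d+)*)', sample_reply)
frame_numbers = [int(num) for num in matches[0].split(',')] if matches else []
print(frame_numbers)  # [1, 2, 4, 7, 9]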
@@ -251,8 +265,21 @@ with col1:
     keyword = st.text_input("Enter a keyword to filter the frames (optional):")
     extract_frames_button = st.button("Extract Frames")
     uploaded_video = st.file_uploader("Or upload a video file (MP4):", type=["mp4"])
-
-
+    prompt1 = "keyword is " + st.text_input("Enter a keyword for analysis:")
+    prompt2 = "1. Generate a description for this sequence of video frames in about 90 words. 2. Return the following: \
+        i. A list of objects in the video. \
+        ii. Any restricted or sensitive content, and if so, which frame. \
+        iii. The frames are supposed to contain news content, and we want to detect non-news content such as an advertisement. \
+        So analyze specifically for any indications that the content might be promotional or an advertisement. \
+        Find the portions of the video most related to the keyword. \
+        The output will be targeted towards social media (like TikTok or Reels) or news broadcasts. \
+        For the provided frames, return the frames related to the keyword. \
+        I am trying to fill these frames for a TikTok video. \
+        Hence, keep that in mind while selecting the frames. \
+        You do not have to give me the script of the TikTok video. \
+        Just return the most interesting frames in a sequence that would suit a TikTok video. \
+        List all frame numbers separated by commas at the end, e.g., Frames : 1,2,4,7,9"
+    prompt = prompt2 + prompt1
     # Slider to select the number of seconds for extraction
     seconds = st.slider("Select the number of seconds for extraction:", min_value=1, max_value=60, value=10)
 
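One caveat with the prompt assembly above: st.text_input returns an empty string until the user types, so prompt1 starts out as a bare "keyword is " appended directly after "…Frames : 1,2,4,7,9" with no separator. A small guard, sketched here with a hypothetical keyword_input variable, keeps the keyword clause out of the prompt while the field is empty:

import streamlit as st

prompt2 = "..."  # placeholder for the instruction text defined above

keyword_input = st.text_input("Enter a keyword for analysis:")
# Only append the keyword clause once the user has typed something
prompt1 = f" keyword is {keyword_input}" if keyword_input else ""
prompt = prompt2 + prompt1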
@@ -375,7 +402,7 @@ with col1:
 
         # Get consolidated description for all frames
         if ffmpeg_output:
-            description = generate_description(base64_frames)
+            description, frame_numbers = generate_description(base64_frames, prompt)
             if description:
                 st.markdown("**Frame Description:**")
                 st.write(description)
@@ -391,7 +418,7 @@ with col1:
 
             # Get the transcript from whisper
             transcript_text = get_transcript_from_audio(audio_tempfile.name)
-            description = generate_description(base64_frames)
+            description, frame_numbers = generate_description(base64_frames, prompt)
             # Generate overall description using transcript and video description
             overall_description = generate_overall_description(transcript_text, description)
             if overall_description:
@@ -429,6 +456,7 @@ with col1:
         n_frames = len(frame_bytes_list)
         base64_frames = [base64.b64encode(b'\xff\xd8' + frame_bytes).decode('utf-8') for frame_bytes in frame_bytes_list]
 
+        frame_dict = {}
         categories_results = []
         frame_texts = {}
 
@@ -439,6 +467,7 @@ with col1:
             col1, col2 = st.columns([3, 2])
             with col1:
                 frame_bytes = base64.b64decode(frame_base64)
+                frame_dict[idx + 1] = frame_bytes
                 st.image(Image.open(BytesIO(frame_bytes)), caption=f'Frame {idx + 1}', use_column_width=True)
             with col2:
                 st.write(f"Extracted Text: {extracted_text}")
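frame_dict is keyed by idx + 1 so its keys match the 'Frame {idx + 1}' captions rendered in the UI, which is the numbering the model is expected to echo back. A minimal sketch of that round trip, using placeholder bytes in place of real JPEG data:

import base64

# Placeholder frames standing in for the app's base64_frames list
base64_frames = [base64.b64encode(b'\xff\xd8' + bytes([i])).decode('utf-8') for i in range(3)]

frame_dict = {}
for idx, frame_base64 in enumerate(base64_frames):
    frame_dict[idx + 1] = base64.b64decode(frame_base64)  # 1-based keys, matching the captions

print(sorted(frame_dict))  # [1, 2, 3] -- numbers a model reply can reference directly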
@@ -482,7 +511,7 @@ with col1:
 
         # Get consolidated description for all frames
         if ffmpeg_output:
-            description = generate_description(base64_frames)
+            description, frame_numbers = generate_description(base64_frames, prompt)
             if description:
                 st.markdown("**Frame Description:**")
                 st.write(description)
@@ -503,7 +532,59 @@ with col1:
 
             # Get the transcript from whisper
             transcript_text = get_transcript_from_audio(audio_tempfile.name)
-            description = generate_description(base64_frames)
+            description, frame_numbers = generate_description(base64_frames, prompt)
+
+            if frame_numbers:
+                print("Frame numbers to extract:", frame_numbers)  # Check frame numbers
+
+                # Create a mapping from original frame numbers to sequential numbers
+                frame_mapping = {}
+                new_frame_numbers = []
+                for idx, frame_number in enumerate(sorted(frame_numbers)):
+                    frame_mapping[frame_number] = idx + 1
+                    new_frame_numbers.append(idx + 1)
+
+                print("New frame numbers:", new_frame_numbers)
+                print("Frame mapping:", frame_mapping)
+
+                # Create a temporary directory to store images
+                with tempfile.TemporaryDirectory() as temp_dir:
+                    image_paths = []
+                    for frame_number in frame_numbers:
+                        if frame_number in frame_dict:
+                            frame_path = os.path.join(temp_dir, f'frame_{frame_mapping[frame_number]:03}.jpg')  # Sequential file naming
+                            image_paths.append(frame_path)
+                            with open(frame_path, 'wb') as f:
+                                f.write(frame_dict[frame_number])
+
+                    #image = Image.open(BytesIO(frame_bytes))
+                    #st.image(image, caption='Selected Frame', use_column_width=True)
+                    #with open(frame_path, "rb") as file:
+                    #    btn = st.download_button(
+                    #        label="Download Frame",
+                    #        data=file,
+                    #        file_name=f'frame_{frame_number}.jpg',
+                    #        mime="image/jpeg"
+                    #    )
+                    # Once all selected frames are saved as images, create a video from them using FFmpeg
+                    video_output_path = os.path.join(temp_dir, 'output5.mp4')
+                    framerate = 1  # Adjust framerate based on the number of frames
+                    ffmpeg_command = [
+                        'ffmpeg',
+                        '-framerate', str(framerate),  # Set framerate based on the number of frames
+                        '-i', os.path.join(temp_dir, 'frame_%03d.jpg'),  # Input pattern for all frame files
+                        '-c:v', 'libx264',
+                        '-pix_fmt', 'yuv420p',
+                        video_output_path
+                    ]
+
+                    print("FFmpeg command:", ' '.join(ffmpeg_command))  # Debug FFmpeg command
+
+                    subprocess.run(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+                    # Display or provide a download link for the created video
+                    st.header("Final Video")
+                    st.video(video_output_path)
             # Generate overall description using transcript and video description
             overall_description = generate_overall_description(transcript_text, description)
             if overall_description:
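The renumbering in the last hunk exists because FFmpeg's frame_%03d.jpg input pattern expects a gap-free sequence: selected frames 2, 5, 9 have to be written out as frame_001.jpg through frame_003.jpg before assembly. A standalone sketch of the same pipeline, with invented selected_frames data (the byte strings are placeholders, not decodable JPEGs, so FFmpeg would report an error here); checking returncode, which the commit discards, is the one deliberate addition:

import os
import subprocess
import tempfile

# Invented selection: original frame number -> JPEG bytes
selected_frames = {2: b'...', 5: b'...', 9: b'...'}

with tempfile.TemporaryDirectory() as temp_dir:
    # Rewrite arbitrary frame numbers as a gap-free 001, 002, ... sequence
    for seq, number in enumerate(sorted(selected_frames), start=1):
        with open(os.path.join(temp_dir, f'frame_{seq:03}.jpg'), 'wb') as f:
            f.write(selected_frames[number])

    video_output_path = os.path.join(temp_dir, 'output.mp4')
    result = subprocess.run(
        ['ffmpeg', '-framerate', '1',
         '-i', os.path.join(temp_dir, 'frame_%03d.jpg'),
         '-c:v', 'libx264', '-pix_fmt', 'yuv420p', video_output_path],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
    )
    if result.returncode != 0:
        print(result.stderr.decode())  # Surface FFmpeg errors instead of ignoring them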
|