Spaces:

AmithAdiraju1694
/

Video_Summary_Beta

Sleeping

App Files Files Community

AmithAdiraju1694 committed on Feb 13, 2025

Commit

05be998

verified ·

1 Parent(s): 413383f

refactor_eh (#2)

Browse files

- Added exception handling, fixed models and tokenizer being on different devices, fixed boilerplate code (e8f31c7e08bdb0472e2071aa660fea5ffea2eda7)

Files changed (6) hide show

README.md +1 -3
app.py +0 -2
model_inference.py +26 -111
pages.py +38 -36
runtime.txt +0 -1
utils.py +2 -5

README.md CHANGED Viewed

@@ -4,10 +4,8 @@ emoji: 👁
 colorFrom: indigo
 colorTo: indigo
 sdk: streamlit
-python_version: 3.9.6
 sdk_version: 1.42.0
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 colorFrom: indigo
 colorTo: indigo
 sdk: streamlit
+python_version: 3.9.6-slim
 sdk_version: 1.42.0
 app_file: app.py
 pinned: false
 ---

app.py CHANGED Viewed

@@ -1,7 +1,5 @@
-import streamlit as st
 from streamlit import session_state as sst
 import asyncio
-import torch
 from pages import landing_page, model_inference_page

 from streamlit import session_state as sst
 import asyncio
 from pages import landing_page, model_inference_page

model_inference.py CHANGED Viewed

@@ -1,75 +1,19 @@
-from transformers import pipeline
-import torch
-from PIL import Image
-import torch.nn as nn
-import torchvision.models as models
-import torch.nn.functional as F
-from PIL import Image
-from utils import prompt_frame_summarization, assistant_role, prompt_audio_summarization
 import streamlit as st
-from utils import timer
 import numpy as np
 import whisper
-from utils import batch_generator, cosine_sim
 from streamlit import session_state as sst
 import onnxruntime
-class SiameseNetwork(nn.Module):
-    def __init__(self, model_name="vit_b_16"):
-        super(SiameseNetwork, self).__init__()
-        self.encoder = models.vit_b_16(weights="IMAGENET1K_V1")  # Pretrained ViT
-        self.encoder.heads = nn.Identity()  # Remove classification head
-        self.fc = nn.Linear(768, 128)  # Reduce to 128-d embedding
-    def forward(self, video_frames1, video_frames2):
-        """
-        video1: (B, nf, H, W, C)  # Batch of videos (50 frames each)
-        video2: (B, nf, H, W, C)
-        """
-        B,num_frames,H,W,C = video_frames1.shape  # (Batch, Channels, H, W)
-        # Flatten frames into batch dimension for ViT
-        video_frames1 = video_frames1.permute(0,1,4,2,3).reshape(B * num_frames, C,H,W)
-        video_frames2 = video_frames2.reshape(0,1,4,2,3).reshape(B * num_frames, C,H,W)
-        # Extract frame-level embeddings
-        emb1 = self.encoder(video_frames1)  # (B*num_frames, 768)
-        emb2 = self.encoder(video_frames2)
-        # Reshape back to (B, T, 768) and average over T
-        #TODO: Change this to use LSTM instead of averaging
-        emb1 = emb1.reshape(B, num_frames, -1).mean(dim=1)  # (B, 768)
-        emb2 = emb2.reshape(B, num_frames, -1).mean(dim=1)
-        # Pass through fully connected layer
-        emb1 = self.fc(emb1)  # (B, 128)
-        emb2 = self.fc(emb2)
-        return emb1, emb2
-    def inference(self, video_frames):
-        """
-        video: (B, 50, C, H, W)
-        """
-        B, num_frames, H, W, C = video_frames.shape
-        video_frames = video_frames.permute(0,1,4,2,3).reshape(B * num_frames, C,H,W)
-        emb = self.encoder(video_frames)
-        emb = emb.reshape(B, num_frames, -1).mean(dim=1)
-        emb = self.fc(emb)
-        return emb
 @timer
-def get_text_from_audio(audio_tensors):
     """Transcribe multiple audio tensors in parallel using Whisper's batch processing."""
     # Transcribe the in-memory audio
     audio_tensors = audio_tensors.to(sst['device'])
@@ -80,52 +24,21 @@ def get_text_from_audio(audio_tensors):
 @timer
 def summarize_from_text(raw_transcription):
-    summary = text_summarizer(prompt_audio_summarization + raw_transcription,
-                              max_length=108,
-                              min_length=36, do_sample=False)[0]['summary_text']
-    return summary
-def get_important_frames_ML(frame):
-    """
-    Classifies frames using your second ML model.
-    """
-    # Implement your model's logic here
-    # ...
-    return None
-def Vit_Summarize_Video(video_frames):
-    """
-    Summarizes video frames into a text sentence.
-    """
-    processor = None
-    messages = None
-    model = None
-    tokenizer = None
-    if video_frames is None or len(video_frames) == 0:
-        return "Error: No video frames available."
-    # Ensure frames are properly formatted
-    video_frames = [Image.fromarray(frame.astype("uint8")) for frame in video_frames]
-    # Ensure correct format for processor
-    inputs = processor(messages, images=None, videos=[video_frames])
-    inputs.update({
-        "tokenizer": tokenizer,
-        "max_new_tokens": 54,
-        "decode_text": True,
-                 })
-    summary_text = model.generate(**inputs)
-    return summary_text
 @timer
 def rate_video_frames(video_frames):
@@ -154,7 +67,9 @@ def rate_video_frames(video_frames):
 def load_models():
     sst['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'
     transcriber = whisper.load_model("base", device = sst['device'])
-    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device = sst['device'])
     base_frame_emb = torch.tensor(
                                   np.load('base_frame_medoid.npz')['arr'],
@@ -167,7 +82,7 @@ def load_models():
                                            )
     return (
-        transcriber, summarizer, session, base_frame_emb
             )
 audio_transcriber_model, text_summarizer, video_rating_model,base_frame_emb = load_models()

 import streamlit as st
+import torch
+from utils import (
+    prompt_audio_summarization,
+    timer,
+    cosine_sim
+)
+from transformers import BartForConditionalGeneration, BartTokenizer
 import numpy as np
 import whisper
 from streamlit import session_state as sst
 import onnxruntime
 @timer
+def get_text_from_audio(audio_tensors) -> str:
     """Transcribe multiple audio tensors in parallel using Whisper's batch processing."""
     # Transcribe the in-memory audio
     audio_tensors = audio_tensors.to(sst['device'])
 @timer
 def summarize_from_text(raw_transcription):
+    inputs = text_summarizer[0](prompt_audio_summarization + raw_transcription,
+                                return_tensors="pt",
+                                max_length=1024,
+                                truncation=True)\
+                                .to(sst['device'])
+    summary_ids = text_summarizer[1].generate(**inputs,
+                                              max_length=150,
+                                              min_length=30,
+                                              length_penalty=2.0,
+                                              num_beams=4
+                                              )
+    return text_summarizer[0].decode(summary_ids[0], skip_special_tokens=True)
 @timer
 def rate_video_frames(video_frames):
 def load_models():
     sst['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'
     transcriber = whisper.load_model("base", device = sst['device'])
+    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(sst['device'])
+    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
     base_frame_emb = torch.tensor(
                                   np.load('base_frame_medoid.npz')['arr'],
                                            )
     return (
+        transcriber, (tokenizer, model), session, base_frame_emb
             )
 audio_transcriber_model, text_summarizer, video_rating_model,base_frame_emb = load_models()

pages.py CHANGED Viewed

@@ -1,13 +1,9 @@
 import streamlit as st
 from streamlit import session_state as sst
-import time
-import pandas as pd
 from utils import navigate_to
 from model_inference import rate_video_frames,get_text_from_audio, summarize_from_text
 from utils import read_important_frames, extract_audio
-import numpy as np
 # Define size limits (adjust based on your system)
@@ -33,25 +29,35 @@ async def landing_page():
         else:
             # bytes object which can be translated to audio or video
             video_bytes = uploaded_file.read()
             with st.spinner("Getting most important moments from your video."):
-                important_frames = read_important_frames(video_bytes, 100)
-            st.success(f"Got important moments.")
-            print(f"Time taken to extract {len(important_frames)} important frames: {read_important_frames.total_time}")
             with st.spinner("Getting audio transcript from your video for summary"):
-                audio_transcript_bytes = extract_audio(video_bytes)
-            st.success(f"Got audio transcript.")
-            print(f"Time taken to extract audio data: {extract_audio.total_time}")
-            # add important frames to session state and redirect to model inference page
-            sst["important_frames"] = important_frames
-            # add audio transcript to session state
-            sst["audio_transcript"] = audio_transcript_bytes
             st.button("Summarize & Analyze Video",
                       on_click = navigate_to,
@@ -67,13 +73,11 @@ async def model_inference_page():
         important_frames = sst["important_frames"]
         with st.spinner("Generating Movie Scale rating for your video"):
-            video_rating_scale = rate_video_frames(important_frames)
-        if len(video_rating_scale) > 0:
-            pass
-        else:
-            video_rating_scale = "Sorry, we couldn't find any images from your video, hence couldn't generate any summary"
         st.toast("Done")
         st.header("Movie Scale Rating of Your Video: ", divider = True)
         st.write(video_rating_scale)
@@ -84,21 +88,19 @@ async def model_inference_page():
     if "audio_transcript" in sst:
         with st.spinner("Extracting text from audio file"):
-            video_raw_text = get_text_from_audio(sst["audio_transcript"])
-        st.toast("Done")
-        with st.spinner("Summarizing text from entire transcript"):
-            video_summary_text = summarize_from_text(video_raw_text)
         st.toast("Done")
-        if len(video_summary_text) > 0:
-            pass
-        else:
-            video_summary_text = "Sorry, we couldn't find any audio data from your video, hence couldn't generate any summary"
-        print("Time taken to get raw text from audio in seconds: ", get_text_from_audio.total_time)
-        print("Time taken to generate text summary from raw text in seconds: ", summarize_from_text.total_time)
         st.header("Audio Transcript summary of your video: ", divider = True)
         st.write(video_summary_text)

 import streamlit as st
 from streamlit import session_state as sst
 from utils import navigate_to
 from model_inference import rate_video_frames,get_text_from_audio, summarize_from_text
 from utils import read_important_frames, extract_audio
 # Define size limits (adjust based on your system)
         else:
             # bytes object which can be translated to audio or video
             video_bytes = uploaded_file.read()
+            # Try to extract important frames from this video; on failure, don't add this key to session state for further inference processing
             with st.spinner("Getting most important moments from your video."):
+                try:
+                    important_frames = read_important_frames(video_bytes, 100)
+                    st.success(f"Got important moments.")
+                    # add important frames to session state and redirect to model inference page
+                    sst["important_frames"] = important_frames
+                except Exception as e:
+                    st.write(f"Sorry couldn't extract important frames from this video & can't rate this on movie scale, because of error: {e}")
+            # Try to extract audio from this video; on failure, don't add this key to session state for further inference processing
             with st.spinner("Getting audio transcript from your video for summary"):
+                try:
+                    audio_transcript_bytes = extract_audio(video_bytes)
+                    st.success(f"Got audio transcript.")
+                    # add audio transcript to session state
+                    sst["audio_transcript"] = audio_transcript_bytes
+                except Exception as e:
+                    st.write(f"Sorry couldn't extract audio from this video & can't rate summarize it, because of error: {e}")
             st.button("Summarize & Analyze Video",
                       on_click = navigate_to,
         important_frames = sst["important_frames"]
         with st.spinner("Generating Movie Scale rating for your video"):
+            try:
+                video_rating_scale = rate_video_frames(important_frames)
+            except Exception as e:
+                video_rating_scale = f"Sorry, we couldn't generate rating of your video because of this error: {e} "
         st.toast("Done")
         st.header("Movie Scale Rating of Your Video: ", divider = True)
         st.write(video_rating_scale)
     if "audio_transcript" in sst:
         with st.spinner("Extracting text from audio file"):
+            try:
+                video_summary_text = get_text_from_audio(sst["audio_transcript"])
+            except Exception as e:
+                video_summary_text = f"Sorry, we couldn't extract text from audio of this file because of this error: {e} "
         st.toast("Done")
+        if video_summary_text[:5] != "Sorry":
+            with st.spinner("Summarizing text from entire transcript"):
+                try:
+                    video_summary_text = summarize_from_text(video_summary_text)
+                except Exception as e:
+                    video_summary_text = f"Sorry, we couldn't summarize text from audio of this file because of this error: {e} "
+            st.toast("Done")
         st.header("Audio Transcript summary of your video: ", divider = True)
         st.write(video_summary_text)

runtime.txt DELETED Viewed

	@@ -1 +0,0 @@
1	- 3.9.*

utils.py CHANGED Viewed

@@ -10,17 +10,14 @@ import numpy as np
 from preprocessing import preprocess_images
 import time
-import io
 from io import BytesIO
 import torch
 import soundfile as sf
 import subprocess
 from typing import List
-prompt_frame_summarization = "These are important frames of a video file. Please generate summary such that end user gets gist of what the video is about."
 prompt_audio_summarization = "This is a video transcript, tell me what is this about: "
-assistant_role = "You are agent who summarizes videos from important frames, use domain specific language to generate summary: sports, cartoon, education,finance etc."
 def timer(func):
     def wrapper(*args, **kwargs):
@@ -52,7 +49,7 @@ def navigate_to(page: str) -> None:
 def read_important_frames(video_bytes, top_k_frames) -> List:
     # reading uploaded video in memory
-    video_io  = io.BytesIO(video_bytes)
     # opening uploaded video frames
     container = av.open(video_io, format='mp4')

 from preprocessing import preprocess_images
 import time
 from io import BytesIO
 import torch
 import soundfile as sf
 import subprocess
 from typing import List
 prompt_audio_summarization = "This is a video transcript, tell me what is this about: "
 def timer(func):
     def wrapper(*args, **kwargs):
 def read_important_frames(video_bytes, top_k_frames) -> List:
     # reading uploaded video in memory
+    video_io  = BytesIO(video_bytes)
     # opening uploaded video frames
     container = av.open(video_io, format='mp4')