commit 2
Files changed:
- huggingface_exact_approach.py +68 -50
- huggingface_segment_highlights.py +69 -10
- src/smolvlm2_handler.py +4 -4
huggingface_exact_approach.py
CHANGED
@@ -127,62 +127,80 @@ class VideoHighlightDetector:
         rewritten = self._extract_assistant_text(self.processor.decode(outputs[0], skip_special_tokens=True))
         return self._normalize_sentences(rewritten, min_sentences, max_sentences)
 
+    def _describe_video_clip(self, clip_path: str) -> str:
+        """Generate one grounded sentence for a short clip."""
+        messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "Describe only visible actions and scene details. Do not guess."}]
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "video", "path": clip_path},
+                    {"type": "text", "text": "Write exactly one factual sentence about what is visually happening."}
+                ]
+            }
+        ]
+
+        inputs = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt"
+        ).to(self.device)
+        outputs = self.model.generate(**inputs, max_new_tokens=80, do_sample=False)
+        text = self._extract_assistant_text(self.processor.decode(outputs[0], skip_special_tokens=True))
+        return self._normalize_sentences(text, 1, 1)
+
     def analyze_video_content(self, video_path: str) -> str:
         """Analyze video content to determine its type and description."""
-        …
-            "Focus only on what is visually happening on screen.\n\n"
-            "Include:\n"
-            "- The main subjects and their actions\n"
-            "- The setting or environment\n"
-            "- Any visible emotions, gestures, or interactions\n"
-            "- Important changes or events during the clip\n\n"
-            "Do NOT add assumptions, opinions, or unseen context.\n"
-            "Do NOT mention the camera, audio, or that this is a video.\n"
-            "Write in simple, factual, neutral language."
-        )
-
-        best_text = ""
-        best_count = 0
-        for _ in range(3):
-            messages = [
-                {
-                    "role": "system",
-                    "content": [{"type": "text", "text": system_message}]
-                },
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "video", "path": video_path},
-                        {"type": "text", "text": user_prompt}
-                    ]
-                }
-            ]
-            inputs = self.processor.apply_chat_template(
-                messages,
-                add_generation_prompt=True,
-                tokenize=True,
-                return_dict=True,
-                return_tensors="pt"
-            ).to(self.device)
-            …
-            best_count = count
-            if 3 <= count <= 4:
-                return text
-        …
+        duration = get_video_duration_seconds(video_path)
+        if duration <= 0:
+            return "Unable to analyze the video content."
+
+        clip_len = min(2.5, max(1.5, duration / 12))
+        anchors = [0.1, 0.35, 0.6, 0.85]
+        captions: List[str] = []
+        seen = set()
+
+        for idx, ratio in enumerate(anchors):
+            start = max(0.0, min(duration - clip_len, duration * ratio))
+            with tempfile.NamedTemporaryFile(suffix=f"_desc_{idx}.mp4", delete=False) as tmp_clip:
+                clip_path = tmp_clip.name
+            try:
+                cmd = [
+                    "ffmpeg", "-y", "-v", "quiet",
+                    "-ss", str(start),
+                    "-t", str(clip_len),
+                    "-i", video_path,
+                    "-an",
+                    "-c:v", "libx264",
+                    "-preset", "ultrafast",
+                    clip_path
+                ]
+                subprocess.run(cmd, check=True, capture_output=True)
+                sentence = self._describe_video_clip(clip_path)
+                key = sentence.lower().strip()
+                if key and key not in seen:
+                    seen.add(key)
+                    captions.append(sentence)
+            except Exception:
+                continue
+            finally:
+                if os.path.exists(clip_path):
+                    os.unlink(clip_path)
+
+        if not captions:
+            return "Unable to analyze the video content."
+
+        composed = " ".join(captions[:4])
+        composed = self._normalize_sentences(composed, 3, 4)
+        count = self._sentence_count(composed)
+        if 3 <= count <= 4:
+            return composed
+        return self._rewrite_to_sentence_range(composed, 3, 4)
 
     def determine_highlights(self, video_description: str, prompt_num: int = 1) -> str:
         """Determine what constitutes highlights based on video description with different prompts."""
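As a sanity check on the new sampling logic, here is a standalone sketch of the clip-planning arithmetic above (the `plan_clips` name is hypothetical; the constants are taken from the diff):

```python
# Hypothetical standalone sketch of the clip sampling in analyze_video_content:
# four anchor ratios, clip length clamped to 1.5-2.5s, starts clamped in-range.
def plan_clips(duration: float) -> list[tuple[float, float]]:
    clip_len = min(2.5, max(1.5, duration / 12))
    anchors = [0.1, 0.35, 0.6, 0.85]
    return [
        (max(0.0, min(duration - clip_len, duration * r)), clip_len)
        for r in anchors
    ]

print(plan_clips(60.0))
# [(6.0, 2.5), (21.0, 2.5), (36.0, 2.5), (51.0, 2.5)]
```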
huggingface_segment_highlights.py
CHANGED
@@ -11,6 +11,7 @@ import argparse
 import json
 import subprocess
 import tempfile
+import re
 from pathlib import Path
 from PIL import Image
 from typing import List, Dict, Tuple, Optional
@@ -52,14 +53,37 @@ class HuggingFaceVideoHighlightDetector:
         except subprocess.CalledProcessError as e:
             logger.error(f"Failed to get video duration: {e}")
             return 0.0
+
+    def _sentence_count(self, text: str) -> int:
+        sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
+        return len(sentences)
+
+    def _normalize_sentences(self, text: str, min_sentences: int, max_sentences: int) -> str:
+        cleaned = text.replace("\n", " ").replace("**", "")
+        cleaned = re.sub(r"\s+", " ", cleaned).strip()
+        parts = [p.strip() for p in re.split(r"(?<=[.!?])\s+", cleaned) if p.strip()]
+        sentences = []
+        for part in parts:
+            s = re.sub(r"^\d+\.\s*", "", part)
+            s = re.sub(r"^[-*]\s*", "", s)
+            if len(s.split()) >= 3:
+                sentences.append(s)
+        if not sentences:
+            return cleaned
+        if len(sentences) >= min_sentences:
+            return " ".join(sentences[:max_sentences]).strip()
+        return " ".join(sentences).strip()
 
     def analyze_video_content(self, video_path: str) -> str:
         """Get overall video description by analyzing multiple frames"""
         duration = self.get_video_duration_seconds(video_path)
+        if duration <= 0:
+            return "Unable to analyze video content"
 
-        # …
-        frame_times = [duration * 0.1, duration * 0.…
+        # Use four anchored points to keep a grounded 3-4 sentence summary.
+        frame_times = [duration * 0.1, duration * 0.35, duration * 0.6, duration * 0.85]
         descriptions = []
+        seen = set()
 
         for i, time_point in enumerate(frame_times):
             with tempfile.NamedTemporaryFile(suffix=f'_frame_{i}.jpg', delete=False) as temp_frame:
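For a quick feel of what the new `_normalize_sentences` helper does to raw model output, here is the same logic copied into a free function (the standalone name is mine), applied to typical markdown-flavored output:

```python
import re

# Standalone copy of the _normalize_sentences logic added above.
def normalize_sentences(text: str, min_sentences: int, max_sentences: int) -> str:
    cleaned = text.replace("\n", " ").replace("**", "")
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    parts = [p.strip() for p in re.split(r"(?<=[.!?])\s+", cleaned) if p.strip()]
    sentences = []
    for part in parts:
        s = re.sub(r"^\d+\.\s*", "", part)  # strip "1. " numbering
        s = re.sub(r"^[-*]\s*", "", s)      # strip "-" / "*" bullets
        if len(s.split()) >= 3:             # drop fragments under three words
            sentences.append(s)
    if not sentences:
        return cleaned
    if len(sentences) >= min_sentences:
        return " ".join(sentences[:max_sentences]).strip()
    return " ".join(sentences).strip()

raw = "**1. A dog runs on grass.**\n- Sky.\n2. A child throws a ball."
print(normalize_sentences(raw, 1, 1))  # -> "A dog runs on grass."
print(normalize_sentences(raw, 3, 4))  # only two sentences survive, so both are kept
```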
@@ -71,13 +95,22 @@ class HuggingFaceVideoHighlightDetector:
             try:
                 subprocess.run(cmd, check=True, capture_output=True)
 
-                # Analyze this frame with one concise sentence so final description stays short.
                 prompt = (
-                    f"Describe what is happening in this …
-                    "…
+                    f"Describe what is visibly happening in this frame at {time_point:.1f}s in exactly one factual sentence. "
+                    "Mention subjects, actions, and setting. Do not guess unseen details."
                 )
-                description = self.vlm_handler.generate_response(
-                    …
+                description = self.vlm_handler.generate_response(
+                    temp_frame.name,
+                    prompt,
+                    max_new_tokens=80,
+                    temperature=0.2,
+                    do_sample=False
+                )
+                sentence = self._normalize_sentences(description.strip(), 1, 1)
+                key = sentence.lower().strip()
+                if key and key not in seen:
+                    seen.add(key)
+                    descriptions.append(sentence)
 
             except subprocess.CalledProcessError as e:
                 logger.error(f"Failed to extract frame at {time_point}s: {e}")
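A hypothetical usage sketch of the captioning path above, assuming an already-initialized detector and the `generate_response(image_path, prompt, max_new_tokens, temperature, do_sample)` signature that the diff itself uses:

```python
# Hypothetical call site: `detector` is an already-constructed
# HuggingFaceVideoHighlightDetector and frame_0.jpg exists on disk.
caption = detector.vlm_handler.generate_response(
    "frame_0.jpg",
    "Describe what is visibly happening in this frame at 1.2s in exactly one factual sentence. "
    "Mention subjects, actions, and setting. Do not guess unseen details.",
    max_new_tokens=80,
    temperature=0.2,   # effectively unused under greedy decoding
    do_sample=False,   # greedy decoding keeps captions stable across runs
)
sentence = detector._normalize_sentences(caption.strip(), 1, 1)
```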
@@ -87,9 +120,35 @@ class HuggingFaceVideoHighlightDetector:
                 if os.path.exists(temp_frame.name):
                     os.unlink(temp_frame.name)
 
-        # Combine into a single concise 4-5 sentence video description.
         if descriptions:
-            …
+            composed = self._normalize_sentences(" ".join(descriptions[:4]), 3, 4)
+            if self._sentence_count(composed) >= 3:
+                return composed
+            # Fallback: pull one extra midpoint frame if we still have fewer than 3 sentences.
+            with tempfile.NamedTemporaryFile(suffix='_frame_mid.jpg', delete=False) as temp_frame:
+                mid_time = duration * 0.5
+                cmd = [
+                    "ffmpeg", "-v", "quiet", "-i", video_path,
+                    "-ss", str(mid_time), "-vframes", "1", "-y", temp_frame.name
+                ]
+                try:
+                    subprocess.run(cmd, check=True, capture_output=True)
+                    extra = self.vlm_handler.generate_response(
+                        temp_frame.name,
+                        "Describe this frame in exactly one factual sentence with visible actions and setting.",
+                        max_new_tokens=80,
+                        temperature=0.2,
+                        do_sample=False
+                    )
+                    extra_sentence = self._normalize_sentences(extra.strip(), 1, 1)
+                    if extra_sentence.lower().strip() not in seen:
+                        descriptions.append(extra_sentence)
+                except Exception:
+                    pass
+                finally:
+                    if os.path.exists(temp_frame.name):
+                        os.unlink(temp_frame.name)
+            return self._normalize_sentences(" ".join(descriptions[:4]), 3, 4)
         else:
             return "Unable to analyze video content"
 
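The midpoint-frame fallback above only fires when the composed summary falls short of three sentences. Restated standalone (the free-function name is mine; the logic mirrors `_sentence_count`):

```python
import re

def sentence_count(text: str) -> int:
    # Mirrors the _sentence_count helper added in this commit.
    return len([s for s in re.split(r"[.!?]+", text) if s.strip()])

captions = ["A man kicks a ball across a field.", "Players gather near the goal."]
composed = " ".join(captions[:4])
if sentence_count(composed) < 3:
    # Two sentences here, so the extra frame at duration * 0.5 is sampled.
    print("sampling one extra midpoint frame")
```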
@@ -543,4 +602,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
src/smolvlm2_handler.py
CHANGED
@@ -185,8 +185,8 @@ class SmolVLM2Handler:
         generated_ids = self.model.generate(
             **inputs,
             max_new_tokens=max_new_tokens,
-            temperature=…
-            do_sample=…
+            temperature=temperature,
+            do_sample=do_sample,
             top_p=0.85,  # Slightly lower top_p for more focused responses
             top_k=40,  # Add top_k for better control
             repetition_penalty=1.2,  # Higher repetition penalty
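One side note: with `do_sample=False`, Hugging Face `generate` performs greedy decoding and ignores sampling knobs such as `temperature`, `top_p`, and `top_k`, and recent transformers versions warn when they are passed anyway. A possible follow-up, not part of this commit, would gate the sampling kwargs:

```python
# Sketch only, not from the diff: pass sampling knobs solely when sampling,
# so transformers does not warn under do_sample=False.
gen_kwargs = {"max_new_tokens": max_new_tokens, "repetition_penalty": 1.2}
if do_sample:
    gen_kwargs.update(temperature=temperature, top_p=0.85, top_k=40)
generated_ids = self.model.generate(**inputs, do_sample=do_sample, **gen_kwargs)
```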
@@ -201,8 +201,8 @@ class SmolVLM2Handler:
         generated_ids = self.model.generate(
             **inputs,
             max_new_tokens=min(max_new_tokens, 256),
-            temperature=0.5,
-            do_sample=…
+            temperature=min(temperature, 0.5),
+            do_sample=do_sample,
             top_p=0.9,
             pad_token_id=self.processor.tokenizer.eos_token_id,
             eos_token_id=self.processor.tokenizer.eos_token_id,