Spaces:

vidhi0405
/

VideoToText

Sleeping

App Files Files Community

vidhi0405 committed 9 days ago

Commit

8cb8f7a

1 Parent(s): bc48923

commit 2

Browse files

Files changed (3) hide show

app.py +17 -5
huggingface_exact_approach.py +45 -4
huggingface_segment_highlights.py +6 -6

app.py CHANGED Viewed

@@ -26,6 +26,7 @@ from pydantic import BaseModel
 import sys
 import uuid
 import json
 from pathlib import Path
 # Add src directory to path for imports
@@ -63,6 +64,9 @@ class AnalysisResponse(BaseModel):
     highlights: str
     analysis_file: str
 # Create output directories with proper permissions
 TEMP_DIR = os.path.join("/tmp", "temp")
 OUTPUTS_DIR = os.path.join("/tmp", "outputs")
@@ -129,16 +133,24 @@ async def upload_video(
             raise HTTPException(status_code=500, detail=results["error"])
         selected_set = str(results.get("selected_set", "")).strip()
         if selected_set == "1":
-            enriched_description = results.get("highlights1", "")
         elif selected_set == "2":
-            enriched_description = results.get("highlights2", "")
         else:
-            h1 = results.get("highlights1", "")
-            h2 = results.get("highlights2", "")
-            base_desc = results.get("video_description", "")
             enriched_description = h1 or h2 or base_desc
         # Keep API and analysis JSON aligned with requested description behavior.
         results["video_description"] = enriched_description

 import sys
 import uuid
 import json
+import re
 from pathlib import Path
 # Add src directory to path for imports
     highlights: str
     analysis_file: str
+def _sentence_count(text: str) -> int:
+    # Count the non-empty pieces left after splitting on runs of '.', '!' or '?'.
+    # A falsy input (None or "") is treated as an empty string and yields 0;
+    # whitespace-only pieces are not counted.
+    return len([s.strip() for s in re.split(r"[.!?]+", text or "") if s.strip()])
 # Create output directories with proper permissions
 TEMP_DIR = os.path.join("/tmp", "temp")
 OUTPUTS_DIR = os.path.join("/tmp", "outputs")
             raise HTTPException(status_code=500, detail=results["error"])
         selected_set = str(results.get("selected_set", "")).strip()
+        h1 = results.get("highlights1", "")
+        h2 = results.get("highlights2", "")
+        base_desc = results.get("video_description", "")
         if selected_set == "1":
+            enriched_description = h1
         elif selected_set == "2":
+            enriched_description = h2
         else:
             enriched_description = h1 or h2 or base_desc
+        # Prefer richer highlight text if selected set is shorter.
+        if _sentence_count(h1) > _sentence_count(enriched_description):
+            enriched_description = h1
+        if _sentence_count(h2) > _sentence_count(enriched_description):
+            enriched_description = h2
+        if not enriched_description:
+            enriched_description = base_desc
         # Keep API and analysis JSON aligned with requested description behavior.
         results["video_description"] = enriched_description

huggingface_exact_approach.py CHANGED Viewed

@@ -127,6 +127,44 @@ class VideoHighlightDetector:
         rewritten = self._extract_assistant_text(self.processor.decode(outputs[0], skip_special_tokens=True))
         return self._normalize_sentences(rewritten, min_sentences, max_sentences)
     def _describe_video_clip(self, clip_path: str) -> str:
         """Generate one grounded sentence for a short clip."""
         messages = [
@@ -195,12 +233,15 @@ class VideoHighlightDetector:
         if not captions:
             return "Unable to analyze the video content."
-        composed = " ".join(captions[:4])
-        composed = self._normalize_sentences(composed, 3, 4)
         count = self._sentence_count(composed)
-        if 3 <= count <= 4:
             return composed
-        return self._rewrite_to_sentence_range(composed, 3, 4)
     def determine_highlights(self, video_description: str, prompt_num: int = 1) -> str:
         """Determine what constitutes highlights based on video description with different prompts."""

         rewritten = self._extract_assistant_text(self.processor.decode(outputs[0], skip_special_tokens=True))
         return self._normalize_sentences(rewritten, min_sentences, max_sentences)
+    def _compose_video_description(self, draft: str) -> str:
+        """Compose final video description with strict analyst instructions.
+
+        Builds a system + user chat prompt that embeds ``draft`` as the only
+        source of facts, runs it through ``self.model`` and returns the decoded
+        reply after passing it through ``self._normalize_sentences(..., 4, 5)``.
+
+        Args:
+            draft: Draft description text inserted verbatim at the end of the
+                user prompt.
+
+        Returns:
+            The value returned by ``self._normalize_sentences`` for the
+            generated text (presumably limited to 4-5 sentences; confirm
+            against that helper).
+        """
+        # Text-only conversation: both messages contain only "text" parts, so
+        # this call passes the draft to the model, not the video itself.
+        messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a professional video analyst."}]
+            },
+            {
+                "role": "user",
+                "content": [{
+                    "type": "text",
+                    "text": (
+                        "Describe the video in 4-5 clear, complete sentences.\n"
+                        "Focus only on what is visually happening on screen.\n\n"
+                        "Include:\n"
+                        "- The main subjects and their actions\n"
+                        "- The setting or environment\n"
+                        "- Any visible emotions, gestures, or interactions\n"
+                        "- Important changes or events during the clip\n\n"
+                        "Do NOT add assumptions, opinions, or unseen context.\n"
+                        "Do NOT mention the camera, audio, or that this is a video.\n"
+                        "Write in simple, factual, neutral language.\n\n"
+                        f"Use this draft as source facts only:\n{draft}"
+                    )
+                }]
+            }
+        ]
+        # Render the chat into tokenized PyTorch tensors and move them to the
+        # device stored on the instance.
+        inputs = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt"
+        ).to(self.device)
+        # Sampling is disabled; at most 320 new tokens are generated.
+        outputs = self.model.generate(**inputs, max_new_tokens=320, do_sample=False)
+        # Decode the first output sequence and keep only what
+        # _extract_assistant_text returns from it.
+        composed = self._extract_assistant_text(self.processor.decode(outputs[0], skip_special_tokens=True))
+        return self._normalize_sentences(composed, 4, 5)
     def _describe_video_clip(self, clip_path: str) -> str:
         """Generate one grounded sentence for a short clip."""
         messages = [
         if not captions:
             return "Unable to analyze the video content."
+        composed = " ".join(captions[:5])
+        composed = self._normalize_sentences(composed, 4, 5)
         count = self._sentence_count(composed)
+        if 4 <= count <= 5:
             return composed
+        final_desc = self._compose_video_description(composed)
+        if 4 <= self._sentence_count(final_desc) <= 5:
+            return final_desc
+        return self._rewrite_to_sentence_range(final_desc, 4, 5)
     def determine_highlights(self, video_description: str, prompt_num: int = 1) -> str:
         """Determine what constitutes highlights based on video description with different prompts."""

huggingface_segment_highlights.py CHANGED Viewed

@@ -80,8 +80,8 @@ class HuggingFaceVideoHighlightDetector:
         if duration <= 0:
             return "Unable to analyze video content"
-        # Use four anchored points to keep a grounded 3-4 sentence summary.
-        frame_times = [duration * 0.1, duration * 0.35, duration * 0.6, duration * 0.85]
         descriptions = []
         seen = set()
@@ -121,10 +121,10 @@ class HuggingFaceVideoHighlightDetector:
                         os.unlink(temp_frame.name)
         if descriptions:
-            composed = self._normalize_sentences(" ".join(descriptions[:4]), 3, 4)
-            if self._sentence_count(composed) >= 3:
                 return composed
-            # Fallback: pull one extra midpoint frame if we still have fewer than 3 sentences.
             with tempfile.NamedTemporaryFile(suffix='_frame_mid.jpg', delete=False) as temp_frame:
                 mid_time = duration * 0.5
                 cmd = [
@@ -148,7 +148,7 @@ class HuggingFaceVideoHighlightDetector:
                 finally:
                     if os.path.exists(temp_frame.name):
                         os.unlink(temp_frame.name)
-            return self._normalize_sentences(" ".join(descriptions[:4]), 3, 4)
         else:
             return "Unable to analyze video content"

         if duration <= 0:
             return "Unable to analyze video content"
+        # Use five anchored points to support a grounded 4-5 sentence summary.
+        frame_times = [duration * 0.1, duration * 0.3, duration * 0.5, duration * 0.7, duration * 0.9]
         descriptions = []
         seen = set()
                         os.unlink(temp_frame.name)
         if descriptions:
+            composed = self._normalize_sentences(" ".join(descriptions[:5]), 4, 5)
+            if self._sentence_count(composed) >= 4:
                 return composed
+            # Fallback: pull one extra midpoint frame if we still have fewer than 4 sentences.
             with tempfile.NamedTemporaryFile(suffix='_frame_mid.jpg', delete=False) as temp_frame:
                 mid_time = duration * 0.5
                 cmd = [
                 finally:
                     if os.path.exists(temp_frame.name):
                         os.unlink(temp_frame.name)
+            return self._normalize_sentences(" ".join(descriptions[:5]), 4, 5)
         else:
             return "Unable to analyze video content"