vidhi0405 committed on
Commit
e08d4f8
·
1 Parent(s): aea95ed
huggingface_exact_approach.py CHANGED
@@ -127,62 +127,80 @@ class VideoHighlightDetector:
127
  rewritten = self._extract_assistant_text(self.processor.decode(outputs[0], skip_special_tokens=True))
128
  return self._normalize_sentences(rewritten, min_sentences, max_sentences)
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  def analyze_video_content(self, video_path: str) -> str:
131
  """Analyze video content to determine its type and description."""
132
- system_message = "You are a helpful assistant that can understand videos. Give a concise, accurate 3-4 sentence description."
133
- user_prompt = (
134
- "Describe the video in 3-5 clear, complete sentences. "
135
- "Focus only on what is visually happening on screen.\n\n"
136
- "Include:\n"
137
- "- The main subjects and their actions\n"
138
- "- The setting or environment\n"
139
- "- Any visible emotions, gestures, or interactions\n"
140
- "- Important changes or events during the clip\n\n"
141
- "Do NOT add assumptions, opinions, or unseen context.\n"
142
- "Do NOT mention the camera, audio, or that this is a video.\n"
143
- "Write in simple, factual, neutral language."
144
- )
145
-
146
- best_text = ""
147
- best_count = 0
148
- for _ in range(3):
149
- messages = [
150
- {
151
- "role": "system",
152
- "content": [{"type": "text", "text": system_message}]
153
- },
154
- {
155
- "role": "user",
156
- "content": [
157
- {"type": "video", "path": video_path},
158
- {"type": "text", "text": user_prompt}
159
- ]
160
- }
161
- ]
162
 
163
- inputs = self.processor.apply_chat_template(
164
- messages,
165
- add_generation_prompt=True,
166
- tokenize=True,
167
- return_dict=True,
168
- return_tensors="pt"
169
- ).to(self.device)
170
 
171
- outputs = self.model.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.7)
172
- text = self._extract_assistant_text(self.processor.decode(outputs[0], skip_special_tokens=True))
173
- text = self._normalize_sentences(text, 3, 4)
174
- count = self._sentence_count(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
- if count > best_count:
177
- best_text = text
178
- best_count = count
179
- if 3 <= count <= 4:
180
- return text
181
 
182
- normalized_best = self._normalize_sentences(best_text, 3, 4)
183
- if 3 <= self._sentence_count(normalized_best) <= 4:
184
- return normalized_best
185
- return self._rewrite_to_sentence_range(normalized_best, 3, 4)
 
 
186
 
187
  def determine_highlights(self, video_description: str, prompt_num: int = 1) -> str:
188
  """Determine what constitutes highlights based on video description with different prompts."""
 
127
  rewritten = self._extract_assistant_text(self.processor.decode(outputs[0], skip_special_tokens=True))
128
  return self._normalize_sentences(rewritten, min_sentences, max_sentences)
129
 
130
def _describe_video_clip(self, clip_path: str) -> str:
    """Caption a short clip with a single sentence about what is visible.

    The clip is sent through the processor's chat template together with a
    system instruction and a one-sentence request. Generation uses
    ``do_sample=False`` and is capped at 80 new tokens. The decoded output
    is passed through ``_extract_assistant_text`` and then
    ``_normalize_sentences(..., 1, 1)``.

    Args:
        clip_path: Path of the clip placed in the "video" chat entry.

    Returns:
        The normalized caption text.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": "Describe only visible actions and scene details. Do not guess."}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "video", "path": clip_path},
            {"type": "text", "text": "Write exactly one factual sentence about what is visually happening."},
        ],
    }

    model_inputs = self.processor.apply_chat_template(
        [system_turn, user_turn],
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(self.device)

    generated = self.model.generate(**model_inputs, max_new_tokens=80, do_sample=False)
    decoded = self.processor.decode(generated[0], skip_special_tokens=True)
    return self._normalize_sentences(self._extract_assistant_text(decoded), 1, 1)
156
+
157
  def analyze_video_content(self, video_path: str) -> str:
158
  """Analyze video content to determine its type and description."""
159
+ duration = get_video_duration_seconds(video_path)
160
+ if duration <= 0:
161
+ return "Unable to analyze the video content."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
+ clip_len = min(2.5, max(1.5, duration / 12))
164
+ anchors = [0.1, 0.35, 0.6, 0.85]
165
+ captions: List[str] = []
166
+ seen = set()
 
 
 
167
 
168
+ for idx, ratio in enumerate(anchors):
169
+ start = max(0.0, min(duration - clip_len, duration * ratio))
170
+ with tempfile.NamedTemporaryFile(suffix=f"_desc_{idx}.mp4", delete=False) as tmp_clip:
171
+ clip_path = tmp_clip.name
172
+ try:
173
+ cmd = [
174
+ "ffmpeg", "-y", "-v", "quiet",
175
+ "-ss", str(start),
176
+ "-t", str(clip_len),
177
+ "-i", video_path,
178
+ "-an",
179
+ "-c:v", "libx264",
180
+ "-preset", "ultrafast",
181
+ clip_path
182
+ ]
183
+ subprocess.run(cmd, check=True, capture_output=True)
184
+ sentence = self._describe_video_clip(clip_path)
185
+ key = sentence.lower().strip()
186
+ if key and key not in seen:
187
+ seen.add(key)
188
+ captions.append(sentence)
189
+ except Exception:
190
+ continue
191
+ finally:
192
+ if os.path.exists(clip_path):
193
+ os.unlink(clip_path)
194
 
195
+ if not captions:
196
+ return "Unable to analyze the video content."
 
 
 
197
 
198
+ composed = " ".join(captions[:4])
199
+ composed = self._normalize_sentences(composed, 3, 4)
200
+ count = self._sentence_count(composed)
201
+ if 3 <= count <= 4:
202
+ return composed
203
+ return self._rewrite_to_sentence_range(composed, 3, 4)
204
 
205
  def determine_highlights(self, video_description: str, prompt_num: int = 1) -> str:
206
  """Determine what constitutes highlights based on video description with different prompts."""
huggingface_segment_highlights.py CHANGED
@@ -11,6 +11,7 @@ import argparse
11
  import json
12
  import subprocess
13
  import tempfile
 
14
  from pathlib import Path
15
  from PIL import Image
16
  from typing import List, Dict, Tuple, Optional
@@ -52,14 +53,37 @@ class HuggingFaceVideoHighlightDetector:
52
  except subprocess.CalledProcessError as e:
53
  logger.error(f"Failed to get video duration: {e}")
54
  return 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  def analyze_video_content(self, video_path: str) -> str:
57
  """Get overall video description by analyzing multiple frames"""
58
  duration = self.get_video_duration_seconds(video_path)
 
 
59
 
60
- # Extract frames from different parts of the video
61
- frame_times = [duration * 0.1, duration * 0.3, duration * 0.5, duration * 0.7, duration * 0.9]
62
  descriptions = []
 
63
 
64
  for i, time_point in enumerate(frame_times):
65
  with tempfile.NamedTemporaryFile(suffix=f'_frame_{i}.jpg', delete=False) as temp_frame:
@@ -71,13 +95,22 @@ class HuggingFaceVideoHighlightDetector:
71
  try:
72
  subprocess.run(cmd, check=True, capture_output=True)
73
 
74
- # Analyze this frame with one concise sentence so final description stays short.
75
  prompt = (
76
- f"Describe what is happening in this video frame at {time_point:.1f}s in exactly one sentence. "
77
- "Be concrete and avoid guessing names/places unless clearly visible."
78
  )
79
- description = self.vlm_handler.generate_response(temp_frame.name, prompt)
80
- descriptions.append(description.strip())
 
 
 
 
 
 
 
 
 
 
81
 
82
  except subprocess.CalledProcessError as e:
83
  logger.error(f"Failed to extract frame at {time_point}s: {e}")
@@ -87,9 +120,35 @@ class HuggingFaceVideoHighlightDetector:
87
  if os.path.exists(temp_frame.name):
88
  os.unlink(temp_frame.name)
89
 
90
- # Combine into a single concise 4-5 sentence video description.
91
  if descriptions:
92
- return " ".join(descriptions[:5])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  else:
94
  return "Unable to analyze video content"
95
 
@@ -543,4 +602,4 @@ def main():
543
 
544
 
545
  if __name__ == "__main__":
546
- main()
 
11
  import json
12
  import subprocess
13
  import tempfile
14
+ import re
15
  from pathlib import Path
16
  from PIL import Image
17
  from typing import List, Dict, Tuple, Optional
 
53
  except subprocess.CalledProcessError as e:
54
  logger.error(f"Failed to get video duration: {e}")
55
  return 0.0
56
+
57
+ def _sentence_count(self, text: str) -> int:
58
+ sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
59
+ return len(sentences)
60
+
61
+ def _normalize_sentences(self, text: str, min_sentences: int, max_sentences: int) -> str:
62
+ cleaned = text.replace("\n", " ").replace("**", "")
63
+ cleaned = re.sub(r"\s+", " ", cleaned).strip()
64
+ parts = [p.strip() for p in re.split(r"(?<=[.!?])\s+", cleaned) if p.strip()]
65
+ sentences = []
66
+ for part in parts:
67
+ s = re.sub(r"^\d+\.\s*", "", part)
68
+ s = re.sub(r"^[-*]\s*", "", s)
69
+ if len(s.split()) >= 3:
70
+ sentences.append(s)
71
+ if not sentences:
72
+ return cleaned
73
+ if len(sentences) >= min_sentences:
74
+ return " ".join(sentences[:max_sentences]).strip()
75
+ return " ".join(sentences).strip()
76
 
77
  def analyze_video_content(self, video_path: str) -> str:
78
  """Get overall video description by analyzing multiple frames"""
79
  duration = self.get_video_duration_seconds(video_path)
80
+ if duration <= 0:
81
+ return "Unable to analyze video content"
82
 
83
+ # Use four anchored points to keep a grounded 3-4 sentence summary.
84
+ frame_times = [duration * 0.1, duration * 0.35, duration * 0.6, duration * 0.85]
85
  descriptions = []
86
+ seen = set()
87
 
88
  for i, time_point in enumerate(frame_times):
89
  with tempfile.NamedTemporaryFile(suffix=f'_frame_{i}.jpg', delete=False) as temp_frame:
 
95
  try:
96
  subprocess.run(cmd, check=True, capture_output=True)
97
 
 
98
  prompt = (
99
+ f"Describe what is visibly happening in this frame at {time_point:.1f}s in exactly one factual sentence. "
100
+ "Mention subjects, actions, and setting. Do not guess unseen details."
101
  )
102
+ description = self.vlm_handler.generate_response(
103
+ temp_frame.name,
104
+ prompt,
105
+ max_new_tokens=80,
106
+ temperature=0.2,
107
+ do_sample=False
108
+ )
109
+ sentence = self._normalize_sentences(description.strip(), 1, 1)
110
+ key = sentence.lower().strip()
111
+ if key and key not in seen:
112
+ seen.add(key)
113
+ descriptions.append(sentence)
114
 
115
  except subprocess.CalledProcessError as e:
116
  logger.error(f"Failed to extract frame at {time_point}s: {e}")
 
120
  if os.path.exists(temp_frame.name):
121
  os.unlink(temp_frame.name)
122
 
 
123
  if descriptions:
124
+ composed = self._normalize_sentences(" ".join(descriptions[:4]), 3, 4)
125
+ if self._sentence_count(composed) >= 3:
126
+ return composed
127
+ # Fallback: pull one extra midpoint frame if we still have fewer than 3 sentences.
128
+ with tempfile.NamedTemporaryFile(suffix='_frame_mid.jpg', delete=False) as temp_frame:
129
+ mid_time = duration * 0.5
130
+ cmd = [
131
+ "ffmpeg", "-v", "quiet", "-i", video_path,
132
+ "-ss", str(mid_time), "-vframes", "1", "-y", temp_frame.name
133
+ ]
134
+ try:
135
+ subprocess.run(cmd, check=True, capture_output=True)
136
+ extra = self.vlm_handler.generate_response(
137
+ temp_frame.name,
138
+ "Describe this frame in exactly one factual sentence with visible actions and setting.",
139
+ max_new_tokens=80,
140
+ temperature=0.2,
141
+ do_sample=False
142
+ )
143
+ extra_sentence = self._normalize_sentences(extra.strip(), 1, 1)
144
+ if extra_sentence.lower().strip() not in seen:
145
+ descriptions.append(extra_sentence)
146
+ except Exception:
147
+ pass
148
+ finally:
149
+ if os.path.exists(temp_frame.name):
150
+ os.unlink(temp_frame.name)
151
+ return self._normalize_sentences(" ".join(descriptions[:4]), 3, 4)
152
  else:
153
  return "Unable to analyze video content"
154
 
 
602
 
603
 
604
  if __name__ == "__main__":
605
+ main()
src/smolvlm2_handler.py CHANGED
@@ -185,8 +185,8 @@ class SmolVLM2Handler:
185
  generated_ids = self.model.generate(
186
  **inputs,
187
  max_new_tokens=max_new_tokens,
188
- temperature=0.7, # Higher temperature for more varied responses
189
- do_sample=True, # Enable sampling for variety
190
  top_p=0.85, # Slightly lower top_p for more focused responses
191
  top_k=40, # Add top_k for better control
192
  repetition_penalty=1.2, # Higher repetition penalty
@@ -201,8 +201,8 @@ class SmolVLM2Handler:
201
  generated_ids = self.model.generate(
202
  **inputs,
203
  max_new_tokens=min(max_new_tokens, 256),
204
- temperature=0.5, # Still some variety
205
- do_sample=True,
206
  top_p=0.9,
207
  pad_token_id=self.processor.tokenizer.eos_token_id,
208
  eos_token_id=self.processor.tokenizer.eos_token_id,
 
185
  generated_ids = self.model.generate(
186
  **inputs,
187
  max_new_tokens=max_new_tokens,
188
+ temperature=temperature,
189
+ do_sample=do_sample,
190
  top_p=0.85, # Slightly lower top_p for more focused responses
191
  top_k=40, # Add top_k for better control
192
  repetition_penalty=1.2, # Higher repetition penalty
 
201
  generated_ids = self.model.generate(
202
  **inputs,
203
  max_new_tokens=min(max_new_tokens, 256),
204
+ temperature=min(temperature, 0.5),
205
+ do_sample=do_sample,
206
  top_p=0.9,
207
  pad_token_id=self.processor.tokenizer.eos_token_id,
208
  eos_token_id=self.processor.tokenizer.eos_token_id,