vidhi0405 commited on
Commit
8cb8f7a
·
1 Parent(s): bc48923
app.py CHANGED
@@ -26,6 +26,7 @@ from pydantic import BaseModel
26
  import sys
27
  import uuid
28
  import json
 
29
  from pathlib import Path
30
 
31
  # Add src directory to path for imports
@@ -63,6 +64,9 @@ class AnalysisResponse(BaseModel):
63
  highlights: str
64
  analysis_file: str
65
 
 
 
 
66
  # Create output directories with proper permissions
67
  TEMP_DIR = os.path.join("/tmp", "temp")
68
  OUTPUTS_DIR = os.path.join("/tmp", "outputs")
@@ -129,16 +133,24 @@ async def upload_video(
129
  raise HTTPException(status_code=500, detail=results["error"])
130
 
131
  selected_set = str(results.get("selected_set", "")).strip()
 
 
 
132
  if selected_set == "1":
133
- enriched_description = results.get("highlights1", "")
134
  elif selected_set == "2":
135
- enriched_description = results.get("highlights2", "")
136
  else:
137
- h1 = results.get("highlights1", "")
138
- h2 = results.get("highlights2", "")
139
- base_desc = results.get("video_description", "")
140
  enriched_description = h1 or h2 or base_desc
141
 
 
 
 
 
 
 
 
 
142
  # Keep API and analysis JSON aligned with requested description behavior.
143
  results["video_description"] = enriched_description
144
 
 
26
  import sys
27
  import uuid
28
  import json
29
+ import re
30
  from pathlib import Path
31
 
32
  # Add src directory to path for imports
 
64
  highlights: str
65
  analysis_file: str
66
 
67
+ def _sentence_count(text: str) -> int:
68
+ return len([s.strip() for s in re.split(r"[.!?]+", text or "") if s.strip()])
69
+
70
  # Create output directories with proper permissions
71
  TEMP_DIR = os.path.join("/tmp", "temp")
72
  OUTPUTS_DIR = os.path.join("/tmp", "outputs")
 
133
  raise HTTPException(status_code=500, detail=results["error"])
134
 
135
  selected_set = str(results.get("selected_set", "")).strip()
136
+ h1 = results.get("highlights1", "")
137
+ h2 = results.get("highlights2", "")
138
+ base_desc = results.get("video_description", "")
139
  if selected_set == "1":
140
+ enriched_description = h1
141
  elif selected_set == "2":
142
+ enriched_description = h2
143
  else:
 
 
 
144
  enriched_description = h1 or h2 or base_desc
145
 
146
+ # Prefer richer highlight text if selected set is shorter.
147
+ if _sentence_count(h1) > _sentence_count(enriched_description):
148
+ enriched_description = h1
149
+ if _sentence_count(h2) > _sentence_count(enriched_description):
150
+ enriched_description = h2
151
+ if not enriched_description:
152
+ enriched_description = base_desc
153
+
154
  # Keep API and analysis JSON aligned with requested description behavior.
155
  results["video_description"] = enriched_description
156
 
huggingface_exact_approach.py CHANGED
@@ -127,6 +127,44 @@ class VideoHighlightDetector:
127
  rewritten = self._extract_assistant_text(self.processor.decode(outputs[0], skip_special_tokens=True))
128
  return self._normalize_sentences(rewritten, min_sentences, max_sentences)
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  def _describe_video_clip(self, clip_path: str) -> str:
131
  """Generate one grounded sentence for a short clip."""
132
  messages = [
@@ -195,12 +233,15 @@ class VideoHighlightDetector:
195
  if not captions:
196
  return "Unable to analyze the video content."
197
 
198
- composed = " ".join(captions[:4])
199
- composed = self._normalize_sentences(composed, 3, 4)
200
  count = self._sentence_count(composed)
201
- if 3 <= count <= 4:
202
  return composed
203
- return self._rewrite_to_sentence_range(composed, 3, 4)
 
 
 
204
 
205
  def determine_highlights(self, video_description: str, prompt_num: int = 1) -> str:
206
  """Determine what constitutes highlights based on video description with different prompts."""
 
127
  rewritten = self._extract_assistant_text(self.processor.decode(outputs[0], skip_special_tokens=True))
128
  return self._normalize_sentences(rewritten, min_sentences, max_sentences)
129
 
130
def _compose_video_description(self, draft: str) -> str:
    """Compose the final video description with strict analyst instructions.

    Sends *draft* (the raw concatenated clip captions) back through the chat
    model with a prompt that constrains the rewrite to 4-5 factual,
    screen-grounded sentences, then normalizes the result to that range.

    Args:
        draft: Rough description text used as the factual source material.

    Returns:
        The model's rewritten description, clamped to 4-5 sentences by
        ``self._normalize_sentences``.
    """
    # Chat-format message pair: a system persona plus a user instruction
    # that embeds the draft. The prompt text is part of runtime behavior —
    # do not reword it without re-validating model output.
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a professional video analyst."}]
        },
        {
            "role": "user",
            "content": [{
                "type": "text",
                "text": (
                    "Describe the video in 4-5 clear, complete sentences.\n"
                    "Focus only on what is visually happening on screen.\n\n"
                    "Include:\n"
                    "- The main subjects and their actions\n"
                    "- The setting or environment\n"
                    "- Any visible emotions, gestures, or interactions\n"
                    "- Important changes or events during the clip\n\n"
                    "Do NOT add assumptions, opinions, or unseen context.\n"
                    "Do NOT mention the camera, audio, or that this is a video.\n"
                    "Write in simple, factual, neutral language.\n\n"
                    f"Use this draft as source facts only:\n{draft}"
                )
            }]
        }
    ]
    # NOTE(review): assumes self.processor is a Hugging Face processor whose
    # apply_chat_template returns a tensor dict — confirm against __init__
    # (not visible in this diff).
    inputs = self.processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(self.device)
    # Greedy decoding (do_sample=False) keeps the description deterministic;
    # 320 new tokens comfortably covers 4-5 sentences.
    outputs = self.model.generate(**inputs, max_new_tokens=320, do_sample=False)
    # Strip the prompt/template scaffolding, keeping only the assistant reply.
    composed = self._extract_assistant_text(self.processor.decode(outputs[0], skip_special_tokens=True))
    # Enforce the 4-5 sentence contract expected by determine_video_description.
    return self._normalize_sentences(composed, 4, 5)
167
+
168
  def _describe_video_clip(self, clip_path: str) -> str:
169
  """Generate one grounded sentence for a short clip."""
170
  messages = [
 
233
  if not captions:
234
  return "Unable to analyze the video content."
235
 
236
+ composed = " ".join(captions[:5])
237
+ composed = self._normalize_sentences(composed, 4, 5)
238
  count = self._sentence_count(composed)
239
+ if 4 <= count <= 5:
240
  return composed
241
+ final_desc = self._compose_video_description(composed)
242
+ if 4 <= self._sentence_count(final_desc) <= 5:
243
+ return final_desc
244
+ return self._rewrite_to_sentence_range(final_desc, 4, 5)
245
 
246
  def determine_highlights(self, video_description: str, prompt_num: int = 1) -> str:
247
  """Determine what constitutes highlights based on video description with different prompts."""
huggingface_segment_highlights.py CHANGED
@@ -80,8 +80,8 @@ class HuggingFaceVideoHighlightDetector:
80
  if duration <= 0:
81
  return "Unable to analyze video content"
82
 
83
- # Use four anchored points to keep a grounded 3-4 sentence summary.
84
- frame_times = [duration * 0.1, duration * 0.35, duration * 0.6, duration * 0.85]
85
  descriptions = []
86
  seen = set()
87
 
@@ -121,10 +121,10 @@ class HuggingFaceVideoHighlightDetector:
121
  os.unlink(temp_frame.name)
122
 
123
  if descriptions:
124
- composed = self._normalize_sentences(" ".join(descriptions[:4]), 3, 4)
125
- if self._sentence_count(composed) >= 3:
126
  return composed
127
- # Fallback: pull one extra midpoint frame if we still have fewer than 3 sentences.
128
  with tempfile.NamedTemporaryFile(suffix='_frame_mid.jpg', delete=False) as temp_frame:
129
  mid_time = duration * 0.5
130
  cmd = [
@@ -148,7 +148,7 @@ class HuggingFaceVideoHighlightDetector:
148
  finally:
149
  if os.path.exists(temp_frame.name):
150
  os.unlink(temp_frame.name)
151
- return self._normalize_sentences(" ".join(descriptions[:4]), 3, 4)
152
  else:
153
  return "Unable to analyze video content"
154
 
 
80
  if duration <= 0:
81
  return "Unable to analyze video content"
82
 
83
+ # Use five anchored points to support a grounded 4-5 sentence summary.
84
+ frame_times = [duration * 0.1, duration * 0.3, duration * 0.5, duration * 0.7, duration * 0.9]
85
  descriptions = []
86
  seen = set()
87
 
 
121
  os.unlink(temp_frame.name)
122
 
123
  if descriptions:
124
+ composed = self._normalize_sentences(" ".join(descriptions[:5]), 4, 5)
125
+ if self._sentence_count(composed) >= 4:
126
  return composed
127
+ # Fallback: pull one extra midpoint frame if we still have fewer than 4 sentences.
128
  with tempfile.NamedTemporaryFile(suffix='_frame_mid.jpg', delete=False) as temp_frame:
129
  mid_time = duration * 0.5
130
  cmd = [
 
148
  finally:
149
  if os.path.exists(temp_frame.name):
150
  os.unlink(temp_frame.name)
151
+ return self._normalize_sentences(" ".join(descriptions[:5]), 4, 5)
152
  else:
153
  return "Unable to analyze video content"
154