Spaces:
Sleeping
Sleeping
commit 2
Browse files- app.py +17 -5
- huggingface_exact_approach.py +45 -4
- huggingface_segment_highlights.py +6 -6
app.py
CHANGED
|
@@ -26,6 +26,7 @@ from pydantic import BaseModel
|
|
| 26 |
import sys
|
| 27 |
import uuid
|
| 28 |
import json
|
|
|
|
| 29 |
from pathlib import Path
|
| 30 |
|
| 31 |
# Add src directory to path for imports
|
|
@@ -63,6 +64,9 @@ class AnalysisResponse(BaseModel):
|
|
| 63 |
highlights: str
|
| 64 |
analysis_file: str
|
| 65 |
|
|
|
|
|
|
|
|
|
|
| 66 |
# Create output directories with proper permissions
|
| 67 |
TEMP_DIR = os.path.join("/tmp", "temp")
|
| 68 |
OUTPUTS_DIR = os.path.join("/tmp", "outputs")
|
|
@@ -129,16 +133,24 @@ async def upload_video(
|
|
| 129 |
raise HTTPException(status_code=500, detail=results["error"])
|
| 130 |
|
| 131 |
selected_set = str(results.get("selected_set", "")).strip()
|
|
|
|
|
|
|
|
|
|
| 132 |
if selected_set == "1":
|
| 133 |
-
enriched_description =
|
| 134 |
elif selected_set == "2":
|
| 135 |
-
enriched_description =
|
| 136 |
else:
|
| 137 |
-
h1 = results.get("highlights1", "")
|
| 138 |
-
h2 = results.get("highlights2", "")
|
| 139 |
-
base_desc = results.get("video_description", "")
|
| 140 |
enriched_description = h1 or h2 or base_desc
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
# Keep API and analysis JSON aligned with requested description behavior.
|
| 143 |
results["video_description"] = enriched_description
|
| 144 |
|
|
|
|
| 26 |
import sys
|
| 27 |
import uuid
|
| 28 |
import json
|
| 29 |
+
import re
|
| 30 |
from pathlib import Path
|
| 31 |
|
| 32 |
# Add src directory to path for imports
|
|
|
|
| 64 |
highlights: str
|
| 65 |
analysis_file: str
|
| 66 |
|
| 67 |
+
def _sentence_count(text: str) -> int:
|
| 68 |
+
return len([s.strip() for s in re.split(r"[.!?]+", text or "") if s.strip()])
|
| 69 |
+
|
| 70 |
# Create output directories with proper permissions
|
| 71 |
TEMP_DIR = os.path.join("/tmp", "temp")
|
| 72 |
OUTPUTS_DIR = os.path.join("/tmp", "outputs")
|
|
|
|
| 133 |
raise HTTPException(status_code=500, detail=results["error"])
|
| 134 |
|
| 135 |
selected_set = str(results.get("selected_set", "")).strip()
|
| 136 |
+
h1 = results.get("highlights1", "")
|
| 137 |
+
h2 = results.get("highlights2", "")
|
| 138 |
+
base_desc = results.get("video_description", "")
|
| 139 |
if selected_set == "1":
|
| 140 |
+
enriched_description = h1
|
| 141 |
elif selected_set == "2":
|
| 142 |
+
enriched_description = h2
|
| 143 |
else:
|
|
|
|
|
|
|
|
|
|
| 144 |
enriched_description = h1 or h2 or base_desc
|
| 145 |
|
| 146 |
+
# Prefer richer highlight text if selected set is shorter.
|
| 147 |
+
if _sentence_count(h1) > _sentence_count(enriched_description):
|
| 148 |
+
enriched_description = h1
|
| 149 |
+
if _sentence_count(h2) > _sentence_count(enriched_description):
|
| 150 |
+
enriched_description = h2
|
| 151 |
+
if not enriched_description:
|
| 152 |
+
enriched_description = base_desc
|
| 153 |
+
|
| 154 |
# Keep API and analysis JSON aligned with requested description behavior.
|
| 155 |
results["video_description"] = enriched_description
|
| 156 |
|
huggingface_exact_approach.py
CHANGED
|
@@ -127,6 +127,44 @@ class VideoHighlightDetector:
|
|
| 127 |
rewritten = self._extract_assistant_text(self.processor.decode(outputs[0], skip_special_tokens=True))
|
| 128 |
return self._normalize_sentences(rewritten, min_sentences, max_sentences)
|
| 129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
def _describe_video_clip(self, clip_path: str) -> str:
|
| 131 |
"""Generate one grounded sentence for a short clip."""
|
| 132 |
messages = [
|
|
@@ -195,12 +233,15 @@ class VideoHighlightDetector:
|
|
| 195 |
if not captions:
|
| 196 |
return "Unable to analyze the video content."
|
| 197 |
|
| 198 |
-
composed = " ".join(captions[:
|
| 199 |
-
composed = self._normalize_sentences(composed,
|
| 200 |
count = self._sentence_count(composed)
|
| 201 |
-
if
|
| 202 |
return composed
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
def determine_highlights(self, video_description: str, prompt_num: int = 1) -> str:
|
| 206 |
"""Determine what constitutes highlights based on video description with different prompts."""
|
|
|
|
| 127 |
rewritten = self._extract_assistant_text(self.processor.decode(outputs[0], skip_special_tokens=True))
|
| 128 |
return self._normalize_sentences(rewritten, min_sentences, max_sentences)
|
| 129 |
|
| 130 |
+
def _compose_video_description(self, draft: str) -> str:
|
| 131 |
+
"""Compose final video description with strict analyst instructions."""
|
| 132 |
+
messages = [
|
| 133 |
+
{
|
| 134 |
+
"role": "system",
|
| 135 |
+
"content": [{"type": "text", "text": "You are a professional video analyst."}]
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"role": "user",
|
| 139 |
+
"content": [{
|
| 140 |
+
"type": "text",
|
| 141 |
+
"text": (
|
| 142 |
+
"Describe the video in 4-5 clear, complete sentences.\n"
|
| 143 |
+
"Focus only on what is visually happening on screen.\n\n"
|
| 144 |
+
"Include:\n"
|
| 145 |
+
"- The main subjects and their actions\n"
|
| 146 |
+
"- The setting or environment\n"
|
| 147 |
+
"- Any visible emotions, gestures, or interactions\n"
|
| 148 |
+
"- Important changes or events during the clip\n\n"
|
| 149 |
+
"Do NOT add assumptions, opinions, or unseen context.\n"
|
| 150 |
+
"Do NOT mention the camera, audio, or that this is a video.\n"
|
| 151 |
+
"Write in simple, factual, neutral language.\n\n"
|
| 152 |
+
f"Use this draft as source facts only:\n{draft}"
|
| 153 |
+
)
|
| 154 |
+
}]
|
| 155 |
+
}
|
| 156 |
+
]
|
| 157 |
+
inputs = self.processor.apply_chat_template(
|
| 158 |
+
messages,
|
| 159 |
+
add_generation_prompt=True,
|
| 160 |
+
tokenize=True,
|
| 161 |
+
return_dict=True,
|
| 162 |
+
return_tensors="pt"
|
| 163 |
+
).to(self.device)
|
| 164 |
+
outputs = self.model.generate(**inputs, max_new_tokens=320, do_sample=False)
|
| 165 |
+
composed = self._extract_assistant_text(self.processor.decode(outputs[0], skip_special_tokens=True))
|
| 166 |
+
return self._normalize_sentences(composed, 4, 5)
|
| 167 |
+
|
| 168 |
def _describe_video_clip(self, clip_path: str) -> str:
|
| 169 |
"""Generate one grounded sentence for a short clip."""
|
| 170 |
messages = [
|
|
|
|
| 233 |
if not captions:
|
| 234 |
return "Unable to analyze the video content."
|
| 235 |
|
| 236 |
+
composed = " ".join(captions[:5])
|
| 237 |
+
composed = self._normalize_sentences(composed, 4, 5)
|
| 238 |
count = self._sentence_count(composed)
|
| 239 |
+
if 4 <= count <= 5:
|
| 240 |
return composed
|
| 241 |
+
final_desc = self._compose_video_description(composed)
|
| 242 |
+
if 4 <= self._sentence_count(final_desc) <= 5:
|
| 243 |
+
return final_desc
|
| 244 |
+
return self._rewrite_to_sentence_range(final_desc, 4, 5)
|
| 245 |
|
| 246 |
def determine_highlights(self, video_description: str, prompt_num: int = 1) -> str:
|
| 247 |
"""Determine what constitutes highlights based on video description with different prompts."""
|
huggingface_segment_highlights.py
CHANGED
|
@@ -80,8 +80,8 @@ class HuggingFaceVideoHighlightDetector:
|
|
| 80 |
if duration <= 0:
|
| 81 |
return "Unable to analyze video content"
|
| 82 |
|
| 83 |
-
# Use
|
| 84 |
-
frame_times = [duration * 0.1, duration * 0.
|
| 85 |
descriptions = []
|
| 86 |
seen = set()
|
| 87 |
|
|
@@ -121,10 +121,10 @@ class HuggingFaceVideoHighlightDetector:
|
|
| 121 |
os.unlink(temp_frame.name)
|
| 122 |
|
| 123 |
if descriptions:
|
| 124 |
-
composed = self._normalize_sentences(" ".join(descriptions[:
|
| 125 |
-
if self._sentence_count(composed) >=
|
| 126 |
return composed
|
| 127 |
-
# Fallback: pull one extra midpoint frame if we still have fewer than
|
| 128 |
with tempfile.NamedTemporaryFile(suffix='_frame_mid.jpg', delete=False) as temp_frame:
|
| 129 |
mid_time = duration * 0.5
|
| 130 |
cmd = [
|
|
@@ -148,7 +148,7 @@ class HuggingFaceVideoHighlightDetector:
|
|
| 148 |
finally:
|
| 149 |
if os.path.exists(temp_frame.name):
|
| 150 |
os.unlink(temp_frame.name)
|
| 151 |
-
return self._normalize_sentences(" ".join(descriptions[:
|
| 152 |
else:
|
| 153 |
return "Unable to analyze video content"
|
| 154 |
|
|
|
|
| 80 |
if duration <= 0:
|
| 81 |
return "Unable to analyze video content"
|
| 82 |
|
| 83 |
+
# Use five anchored points to support a grounded 4-5 sentence summary.
|
| 84 |
+
frame_times = [duration * 0.1, duration * 0.3, duration * 0.5, duration * 0.7, duration * 0.9]
|
| 85 |
descriptions = []
|
| 86 |
seen = set()
|
| 87 |
|
|
|
|
| 121 |
os.unlink(temp_frame.name)
|
| 122 |
|
| 123 |
if descriptions:
|
| 124 |
+
composed = self._normalize_sentences(" ".join(descriptions[:5]), 4, 5)
|
| 125 |
+
if self._sentence_count(composed) >= 4:
|
| 126 |
return composed
|
| 127 |
+
# Fallback: pull one extra midpoint frame if we still have fewer than 4 sentences.
|
| 128 |
with tempfile.NamedTemporaryFile(suffix='_frame_mid.jpg', delete=False) as temp_frame:
|
| 129 |
mid_time = duration * 0.5
|
| 130 |
cmd = [
|
|
|
|
| 148 |
finally:
|
| 149 |
if os.path.exists(temp_frame.name):
|
| 150 |
os.unlink(temp_frame.name)
|
| 151 |
+
return self._normalize_sentences(" ".join(descriptions[:5]), 4, 5)
|
| 152 |
else:
|
| 153 |
return "Unable to analyze video content"
|
| 154 |
|