Anson818 committed on
Commit
9074cec
·
verified ·
1 Parent(s): 918256e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -5
app.py CHANGED
@@ -36,12 +36,60 @@ except Exception as e:
36
  # This is the long prompt from your script
37
  prompt1 = """Role:
38
  You are an expert computer vision analyst that specializes in converting videos into precise, exhaustive, and purely visual scene descriptions.
39
- ... (Your full Gemini prompt) ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  Final Output Rule:
41
- Produce a single, continuous, structured description following all the above rules.
42
  Do not summarize, infer meaning, or include audio elements.
43
  The output must be factual, visual, chronological, and exhaustive."""
44
 
 
45
  # --- 3. The Main Workflow Function for Gradio ---
46
  def generate_sfx(video_path):
47
  """
@@ -74,9 +122,16 @@ def generate_sfx(video_path):
74
 
75
  # --- Step 2: Llama Prompt Generation ---
76
  try:
77
- your_prompt = f"""Identify the suitable audio effects based on the given video transcript...
78
- ... (Your full Llama prompt) ...
79
- Transcript: {transcript}"""
 
 
 
 
 
 
 
80
 
81
  completion = llama_client.chat.completions.create(
82
  model="meta/llama-3.1-405b-instruct",
 
36
  # This is the long prompt from your script
37
  prompt1 = """Role:
38
  You are an expert computer vision analyst that specializes in converting videos into precise, exhaustive, and purely visual scene descriptions.
39
+ Primary Objective:
40
+ Analyze the provided video and generate a detailed, chronological description of everything visually occurring in the footage. Focus entirely on what can be seen, not heard.
41
+ Core Instructions:
42
+ Follow these instructions exactly:
43
+ Visual-Only Focus
44
+
45
+ Describe only what is visible on-screen.
46
+
47
+ Ignore all sounds, dialogue, narration, or music.
48
+
49
+ Include on-screen text only if it appears as a visible object (e.g., sign, label, subtitle).
50
+
51
+ Chronological Detailing
52
+
53
+ Describe events strictly in the order they appear.
54
+
55
+ Use clear temporal markers such as “At the beginning…”, “Next…”, “Then…”, “After that…”, “Finally…”
56
+
57
+ Comprehensive Visual Content
58
+
59
+ Describe people, objects, settings, environments, lighting, colors, positions, and movements.
60
+
61
+ Include camera actions (pans, tilts, zooms, cuts, transitions).
62
+
63
+ Capture facial expressions, gestures, and body posture changes if visible.
64
+
65
+ Objectivity and Precision
66
+
67
+ Avoid interpretation, emotion, or speculation.
68
+
69
+ Describe only observable facts (e.g., say “The person raises their right arm,” not “The person waves hello”).
70
+
71
+ Level of Detail
72
+
73
+ Provide enough visual information for someone to recreate or storyboard the entire scene.
74
+
75
+ Include every key visual or motion change.
76
+ Output Formatting:
77
+ Use the following structured format:
78
+ [Timestamp or Sequence Indicator]
79
+ Detailed description of what is visually happening.
80
+
81
+ Example:
82
+ 0:00–0:04 — A man in a dark blue jacket walks across a street. A red car passes behind him.
83
+ 0:05–0:09 — The camera tilts upward to show a tall building with glass windows. The sky is cloudy.
84
+ 0:10–0:13 — The man stops, looks up, and adjusts the strap of a black backpack.
85
+
86
+ If timestamps are unavailable, use sequence-based ordering (e.g., “Scene 1,” “Scene 2,” etc.).
87
  Final Output Rule:
88
+ Produce a single, continuous, structured description following all the above rules.
89
  Do not summarize, infer meaning, or include audio elements.
90
  The output must be factual, visual, chronological, and exhaustive."""
91
 
92
+
93
  # --- 3. The Main Workflow Function for Gradio ---
94
  def generate_sfx(video_path):
95
  """
 
122
 
123
  # --- Step 2: Llama Prompt Generation ---
124
  try:
125
+ your_prompt = f"""Identify the suitable audio effects based on the given video transcript and
126
+ generate a suitable and detailed prompt for each audio effects for another audio generating AI
127
+ model to generate the audio effects. Note that the duration of each audio should be within 2-10
128
+ seconds. Only include the prompts for generating the sound effects
129
+ and do not include any other text, such as timestamps. Separate the prompt and the duration for
130
+ each audio effects with a new line. Output in the following format for each prompt and duration:
131
+ [prompt1];[duration1] (new line) [prompt2];[duration2] etc. only include the number of the duration
132
+ in [duration] No other text should be included in
133
+ the output. Do make the prompts with details, such as the intensity, feeling etc according to the
134
+ video transcript so that the high quality and suitable sound can be generated. Transcript: {transcript}"""
135
 
136
  completion = llama_client.chat.completions.create(
137
  model="meta/llama-3.1-405b-instruct",