prithivMLmods committed on
Commit
ee2dfd6
·
verified ·
1 Parent(s): 446e03a

update app

Browse files
Files changed (1) hide show
  1. app.py +22 -10
app.py CHANGED
@@ -24,7 +24,7 @@ model = AutoModelForImageTextToText.from_pretrained(
24
  print("Model loaded successfully.")
25
 
26
  @spaces.GPU
27
- def process_video(user_text, video_path):
28
  if not video_path:
29
  return "Please upload a video."
30
 
@@ -32,7 +32,7 @@ def process_video(user_text, video_path):
32
  if not user_text.strip():
33
  user_text = "Describe this video in detail."
34
 
35
- # Construct messages for Molmo
36
  messages = [
37
  {
38
  "role": "user",
@@ -68,7 +68,10 @@ def process_video(user_text, video_path):
68
 
69
  # Generate
70
  with torch.inference_mode():
71
- generated_ids = model.generate(**inputs, max_new_tokens=1024)
 
 
 
72
 
73
  generated_tokens = generated_ids[0, inputs['input_ids'].size(1):]
74
  generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
@@ -83,19 +86,32 @@ css = """
83
  #main-title h1 {font-size: 2.3em !important;}
84
  """
85
 
86
- with gr.Blocks() as demo:
87
  gr.Markdown("# **SAGE-MM-Video-Reasoning 🎥**", elem_id="main-title")
88
  gr.Markdown("Upload a video to get a detailed explanation or ask specific questions using [SAGE-MM-Qwen3-VL](https://huggingface.co/allenai/SAGE-MM-Qwen3-VL-4B-SFT_RL).")
89
 
90
  with gr.Row():
91
  with gr.Column():
92
  vid_input = gr.Video(label="Input Video", format="mp4", height=350)
 
93
  # Default prompt set here
94
  vid_prompt = gr.Textbox(
95
  label="Prompt",
96
  value="Describe this video in detail.",
97
  placeholder="Type your question here..."
98
  )
 
 
 
 
 
 
 
 
 
 
 
 
99
  vid_btn = gr.Button("Analyze Video", variant="primary")
100
 
101
  with gr.Column():
@@ -115,13 +131,9 @@ with gr.Blocks() as demo:
115
 
116
  vid_btn.click(
117
  fn=process_video,
118
- inputs=[vid_prompt, vid_input],
119
  outputs=[vid_text_out]
120
  )
121
 
122
  if __name__ == "__main__":
123
- demo.launch(theme=gr.themes.Soft(
124
- primary_hue="blue",
125
- secondary_hue="indigo",
126
- neutral_hue="slate",
127
- ), css=css, mcp_server=True, ssr_mode=False)
 
24
  print("Model loaded successfully.")
25
 
26
  @spaces.GPU
27
+ def process_video(user_text, video_path, max_new_tokens):
28
  if not video_path:
29
  return "Please upload a video."
30
 
 
32
  if not user_text.strip():
33
  user_text = "Describe this video in detail."
34
 
35
+ # Construct messages for Molmo/Qwen
36
  messages = [
37
  {
38
  "role": "user",
 
68
 
69
  # Generate
70
  with torch.inference_mode():
71
+ generated_ids = model.generate(
72
+ **inputs,
73
+ max_new_tokens=max_new_tokens
74
+ )
75
 
76
  generated_tokens = generated_ids[0, inputs['input_ids'].size(1):]
77
  generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
 
86
  #main-title h1 {font-size: 2.3em !important;}
87
  """
88
 
89
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="indigo", neutral_hue="slate"), css=css) as demo:
90
  gr.Markdown("# **SAGE-MM-Video-Reasoning 🎥**", elem_id="main-title")
91
  gr.Markdown("Upload a video to get a detailed explanation or ask specific questions using [SAGE-MM-Qwen3-VL](https://huggingface.co/allenai/SAGE-MM-Qwen3-VL-4B-SFT_RL).")
92
 
93
  with gr.Row():
94
  with gr.Column():
95
  vid_input = gr.Video(label="Input Video", format="mp4", height=350)
96
+
97
  # Default prompt set here
98
  vid_prompt = gr.Textbox(
99
  label="Prompt",
100
  value="Describe this video in detail.",
101
  placeholder="Type your question here..."
102
  )
103
+
104
+ # Advanced Settings Accordion
105
+ with gr.Accordion("Advanced Settings", open=False):
106
+ max_tokens_slider = gr.Slider(
107
+ minimum=128,
108
+ maximum=4096,
109
+ value=1024,
110
+ step=128,
111
+ label="Max New Tokens",
112
+ info="Controls the length of the generated text."
113
+ )
114
+
115
  vid_btn = gr.Button("Analyze Video", variant="primary")
116
 
117
  with gr.Column():
 
131
 
132
  vid_btn.click(
133
  fn=process_video,
134
+ inputs=[vid_prompt, vid_input, max_tokens_slider],
135
  outputs=[vid_text_out]
136
  )
137
 
138
  if __name__ == "__main__":
139
+ demo.launch(mcp_server=True, ssr_mode=False)