Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -17,8 +17,8 @@ from transformers import (
|
|
| 17 |
from qwen_vl_utils import process_vision_info
|
| 18 |
|
| 19 |
# Constants for text generation
|
| 20 |
-
MAX_MAX_NEW_TOKENS =
|
| 21 |
-
DEFAULT_MAX_NEW_TOKENS =
|
| 22 |
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
|
| 23 |
|
| 24 |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
|
@@ -61,7 +61,6 @@ model_s = Glm4vForConditionalGeneration.from_pretrained(
|
|
| 61 |
torch_dtype=torch.float16
|
| 62 |
).to(device).eval()
|
| 63 |
|
| 64 |
-
|
| 65 |
def downsample_video(video_path):
|
| 66 |
"""
|
| 67 |
Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
|
|
@@ -219,7 +218,7 @@ video_examples = [
|
|
| 219 |
["explain the video in detail.", "videos/2.mp4"]
|
| 220 |
]
|
| 221 |
|
| 222 |
-
#
|
| 223 |
css = """
|
| 224 |
.submit-btn {
|
| 225 |
background-color: #2980b9 !important;
|
|
@@ -233,11 +232,17 @@ css = """
|
|
| 233 |
border-radius: 10px;
|
| 234 |
padding: 20px;
|
| 235 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
"""
|
| 237 |
|
| 238 |
# Create the Gradio Interface
|
| 239 |
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
| 240 |
-
gr.Markdown("# **[Multimodal
|
| 241 |
with gr.Row():
|
| 242 |
with gr.Column():
|
| 243 |
with gr.Tabs():
|
|
@@ -276,7 +281,8 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
| 276 |
model_choice = gr.Radio(
|
| 277 |
choices=["Camel-Doc-OCR-062825", "GLM-4.1V-9B-Thinking", "Megalodon-OCR-Sync-0713", "MonkeyOCR-pro-1.2B"],
|
| 278 |
label="Select Model",
|
| 279 |
-
value="Camel-Doc-OCR-062825"
|
|
|
|
| 280 |
)
|
| 281 |
|
| 282 |
gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
|
|
|
|
| 17 |
from qwen_vl_utils import process_vision_info
|
| 18 |
|
| 19 |
# Constants for text generation
|
| 20 |
+
MAX_MAX_NEW_TOKENS = 2048
|
| 21 |
+
DEFAULT_MAX_NEW_TOKENS = 1024
|
| 22 |
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
|
| 23 |
|
| 24 |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
|
|
|
| 61 |
torch_dtype=torch.float16
|
| 62 |
).to(device).eval()
|
| 63 |
|
|
|
|
| 64 |
def downsample_video(video_path):
|
| 65 |
"""
|
| 66 |
Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
|
|
|
|
| 218 |
["explain the video in detail.", "videos/2.mp4"]
|
| 219 |
]
|
| 220 |
|
| 221 |
+
# Updated CSS with model choice highlighting
|
| 222 |
css = """
|
| 223 |
.submit-btn {
|
| 224 |
background-color: #2980b9 !important;
|
|
|
|
| 232 |
border-radius: 10px;
|
| 233 |
padding: 20px;
|
| 234 |
}
|
| 235 |
+
.model-choice label {
|
| 236 |
+
color: red;
|
| 237 |
+
}
|
| 238 |
+
.model-choice input[value="Camel-Doc-OCR-062825"] + label {
|
| 239 |
+
color: blue;
|
| 240 |
+
}
|
| 241 |
"""
|
| 242 |
|
| 243 |
# Create the Gradio Interface
|
| 244 |
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
| 245 |
+
gr.Markdown("# **[Multimodal VLMOCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
|
| 246 |
with gr.Row():
|
| 247 |
with gr.Column():
|
| 248 |
with gr.Tabs():
|
|
|
|
| 281 |
model_choice = gr.Radio(
|
| 282 |
choices=["Camel-Doc-OCR-062825", "GLM-4.1V-9B-Thinking", "Megalodon-OCR-Sync-0713", "MonkeyOCR-pro-1.2B"],
|
| 283 |
label="Select Model",
|
| 284 |
+
value="Camel-Doc-OCR-062825",
|
| 285 |
+
elem_classes=["model-choice"]
|
| 286 |
)
|
| 287 |
|
| 288 |
gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
|