Spaces:
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -97,11 +97,11 @@ def generate_image(model_name: str, text: str, image: Image.Image,
|
|
| 97 |
processor = processor_o
|
| 98 |
model = model_o
|
| 99 |
else:
|
| 100 |
-
yield "Invalid model selected."
|
| 101 |
return
|
| 102 |
|
| 103 |
if image is None:
|
| 104 |
-
yield "Please upload an image."
|
| 105 |
return
|
| 106 |
|
| 107 |
messages = [{
|
|
@@ -127,9 +127,8 @@ def generate_image(model_name: str, text: str, image: Image.Image,
|
|
| 127 |
buffer = ""
|
| 128 |
for new_text in streamer:
|
| 129 |
buffer += new_text
|
| 130 |
-
#buffer = buffer.replace("<|im_end|>", "")
|
| 131 |
time.sleep(0.01)
|
| 132 |
-
yield buffer
|
| 133 |
|
| 134 |
@spaces.GPU
|
| 135 |
def generate_video(model_name: str, text: str, video_path: str,
|
|
@@ -151,11 +150,11 @@ def generate_video(model_name: str, text: str, video_path: str,
|
|
| 151 |
processor = processor_o
|
| 152 |
model = model_o
|
| 153 |
else:
|
| 154 |
-
yield "Invalid model selected."
|
| 155 |
return
|
| 156 |
|
| 157 |
if video_path is None:
|
| 158 |
-
yield "Please upload a video."
|
| 159 |
return
|
| 160 |
|
| 161 |
frames = downsample_video(video_path)
|
|
@@ -194,7 +193,7 @@ def generate_video(model_name: str, text: str, video_path: str,
|
|
| 194 |
buffer += new_text
|
| 195 |
buffer = buffer.replace("<|im_end|>", "")
|
| 196 |
time.sleep(0.01)
|
| 197 |
-
yield buffer
|
| 198 |
|
| 199 |
# Define examples for image and video inference
|
| 200 |
image_examples = [
|
|
@@ -202,7 +201,6 @@ image_examples = [
|
|
| 202 |
["Convert this page to doc [text] precisely.", "images/4.png"],
|
| 203 |
["Convert this page to doc [text] precisely.", "images/1.png"],
|
| 204 |
["Convert chart to OTSL.", "images/2.png"]
|
| 205 |
-
|
| 206 |
]
|
| 207 |
|
| 208 |
video_examples = [
|
|
@@ -250,6 +248,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
| 250 |
repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
|
| 251 |
with gr.Column():
|
| 252 |
output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
|
|
|
|
| 253 |
model_choice = gr.Radio(
|
| 254 |
choices=["DREX-062225-exp", "VIREX-062225-exp", "olmOCR-7B-0225"],
|
| 255 |
label="Select Model",
|
|
@@ -259,17 +258,17 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
| 259 |
gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs/discussions)")
|
| 260 |
gr.Markdown("> [DREX-062225-exp](https://huggingface.co/prithivMLmods/DREX-062225-exp): the drex-062225-exp (document retrieval and extraction expert) model is a specialized fine-tuned version of docscopeocr-7b-050425-exp, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture.")
|
| 261 |
gr.Markdown("> [VIREX-062225-exp](https://huggingface.co/prithivMLmods/VIREX-062225-exp): the virex-062225-exp (video information retrieval and extraction expert - experimental) model is a fine-tuned version of qwen2.5-vl-7b-instruct, specifically optimized for advanced video understanding, image comprehension, sense of reasoning, and natural language decision-making through cot reasoning.")
|
| 262 |
-
gr.Markdown("> [olmOCR-7B-0225](https://huggingface.co/allenai/olmOCR-7B-0225-preview): the olmocr-7b-0225-preview model is based on qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding, and accurate image-to-text conversion with mathematical latex formatting. designed with a focus on high-fidelity visual-textual comprehension.")
|
| 263 |
|
| 264 |
image_submit.click(
|
| 265 |
fn=generate_image,
|
| 266 |
inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
|
| 267 |
-
outputs=output
|
| 268 |
)
|
| 269 |
video_submit.click(
|
| 270 |
fn=generate_video,
|
| 271 |
inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
|
| 272 |
-
outputs=output
|
| 273 |
)
|
| 274 |
|
| 275 |
if __name__ == "__main__":
|
|
|
|
| 97 |
processor = processor_o
|
| 98 |
model = model_o
|
| 99 |
else:
|
| 100 |
+
yield "Invalid model selected.", "Invalid model selected."
|
| 101 |
return
|
| 102 |
|
| 103 |
if image is None:
|
| 104 |
+
yield "Please upload an image.", "Please upload an image."
|
| 105 |
return
|
| 106 |
|
| 107 |
messages = [{
|
|
|
|
| 127 |
buffer = ""
|
| 128 |
for new_text in streamer:
|
| 129 |
buffer += new_text
|
|
|
|
| 130 |
time.sleep(0.01)
|
| 131 |
+
yield buffer, buffer
|
| 132 |
|
| 133 |
@spaces.GPU
|
| 134 |
def generate_video(model_name: str, text: str, video_path: str,
|
|
|
|
| 150 |
processor = processor_o
|
| 151 |
model = model_o
|
| 152 |
else:
|
| 153 |
+
yield "Invalid model selected.", "Invalid model selected."
|
| 154 |
return
|
| 155 |
|
| 156 |
if video_path is None:
|
| 157 |
+
yield "Please upload a video.", "Please upload a video."
|
| 158 |
return
|
| 159 |
|
| 160 |
frames = downsample_video(video_path)
|
|
|
|
| 193 |
buffer += new_text
|
| 194 |
buffer = buffer.replace("<|im_end|>", "")
|
| 195 |
time.sleep(0.01)
|
| 196 |
+
yield buffer, buffer
|
| 197 |
|
| 198 |
# Define examples for image and video inference
|
| 199 |
image_examples = [
|
|
|
|
| 201 |
["Convert this page to doc [text] precisely.", "images/4.png"],
|
| 202 |
["Convert this page to doc [text] precisely.", "images/1.png"],
|
| 203 |
["Convert chart to OTSL.", "images/2.png"]
|
|
|
|
| 204 |
]
|
| 205 |
|
| 206 |
video_examples = [
|
|
|
|
| 248 |
repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
|
| 249 |
with gr.Column():
|
| 250 |
output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
|
| 251 |
+
markdown_output = gr.Markdown(label="Result.Md", scale=2)
|
| 252 |
model_choice = gr.Radio(
|
| 253 |
choices=["DREX-062225-exp", "VIREX-062225-exp", "olmOCR-7B-0225"],
|
| 254 |
label="Select Model",
|
|
|
|
| 258 |
gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs/discussions)")
|
| 259 |
gr.Markdown("> [DREX-062225-exp](https://huggingface.co/prithivMLmods/DREX-062225-exp): the drex-062225-exp (document retrieval and extraction expert) model is a specialized fine-tuned version of docscopeocr-7b-050425-exp, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture.")
|
| 260 |
gr.Markdown("> [VIREX-062225-exp](https://huggingface.co/prithivMLmods/VIREX-062225-exp): the virex-062225-exp (video information retrieval and extraction expert - experimental) model is a fine-tuned version of qwen2.5-vl-7b-instruct, specifically optimized for advanced video understanding, image comprehension, sense of reasoning, and natural language decision-making through cot reasoning.")
|
| 261 |
+
gr.Markdown("> [olmOCR-7B-0225](https://huggingface.co/allenai/olmOCR-7B-0225-preview): the olmocr-7b-0225-preview model is based on qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding, and accurate image-to-text conversion with mathematical latex formatting. designed with a focus on high-fidelity visual-textual comprehension.")
|
| 262 |
|
| 263 |
image_submit.click(
|
| 264 |
fn=generate_image,
|
| 265 |
inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
|
| 266 |
+
outputs=[output, markdown_output]
|
| 267 |
)
|
| 268 |
video_submit.click(
|
| 269 |
fn=generate_video,
|
| 270 |
inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
|
| 271 |
+
outputs=[output, markdown_output]
|
| 272 |
)
|
| 273 |
|
| 274 |
if __name__ == "__main__":
|