Spaces:

prithivMLmods
/

Multimodal-VLM-Thinking

Running on Zero

App Files Files Community

prithivMLmods committed on Mar 19

Commit

e86a765

verified ·

1 Parent(s): d10d780

Update app.py

Browse files

Files changed (1) hide show

app.py +216 -166

app.py CHANGED Viewed

@@ -1,30 +1,27 @@
-import os
-import random
-import uuid
-import json
-import time
-import re
-from threading import Thread
-from datetime import datetime, timedelta
 import gradio as gr
 import torch
-import numpy as np
-from PIL import Image
 import cv2
-from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
-from huggingface_hub import hf_hub_download
-# -----------------------------------------------------------------------------
-# Constants & Device Setup
-# -----------------------------------------------------------------------------
-MAX_MAX_NEW_TOKENS = 2048
-DEFAULT_MAX_NEW_TOKENS = 1024
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# -----------------------------------------------------------------------------
 # Helper Functions
-# -----------------------------------------------------------------------------
 def progress_bar_html(label: str) -> str:
     return f'''
 <div style="display: flex; align-items: center;">
@@ -41,165 +38,218 @@ def progress_bar_html(label: str) -> str:
 </style>
     '''
-def load_system_prompt(repo_id: str, filename: str) -> str:
-    """
-    Download and load a system prompt template from the given Hugging Face repo.
-    The template may include placeholders (e.g. {name}, {today}, {yesterday}) that get formatted.
-    """
-    file_path = hf_hub_download(repo_id=repo_id, filename=filename)
-    with open(file_path, "r") as file:
-        system_prompt = file.read()
-    today = datetime.today().strftime("%Y-%m-%d")
-    yesterday = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")
-    model_name = repo_id.split("/")[-1]
-    return system_prompt.format(name=model_name, today=today, yesterday=yesterday)
-def downsample_video(video_path: str):
-    """
-    Extracts 10 evenly spaced frames from the video.
-    Returns a list of tuples (PIL.Image, timestamp_in_seconds).
-    """
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
-    if total_frames > 0 and fps > 0:
-        frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
-        for i in frame_indices:
-            vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-            success, image = vidcap.read()
-            if success:
-                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-                pil_image = Image.fromarray(image)
-                timestamp = round(i / fps, 2)
-                frames.append((pil_image, timestamp))
     vidcap.release()
     return frames
-def build_prompt(chat_history, current_input_text, video_frames=None, image_files=None):
-    """
-    Build a conversation prompt string.
-    The system prompt is added first, then previous chat history, and finally the current input.
-    If video_frames or image_files are provided, a note is added in the prompt.
-    """
-    prompt = f"System: {SYSTEM_PROMPT}\n"
-    # Append chat history (if any)
-    for msg in chat_history:
-        role = msg.get("role", "").capitalize()
-        content = msg.get("content", "")
-        prompt += f"{role}: {content}\n"
-    prompt += f"User: {current_input_text}\n"
-    if video_frames:
-        for _, timestamp in video_frames:
-            prompt += f"[Video Frame at {timestamp} sec]\n"
-    if image_files:
-        for _ in image_files:
-            prompt += "[Image Input]\n"
-    prompt += "Assistant: "
-    return prompt
-# -----------------------------------------------------------------------------
-# Load Mistral Model & System Prompt
-# -----------------------------------------------------------------------------
-MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
-SYSTEM_PROMPT = load_system_prompt(MODEL_ID, "SYSTEM_PROMPT.txt")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.float16,
-    device_map="auto",
-    trust_remote_code=True
-).to(device)
-model.eval()
-# -----------------------------------------------------------------------------
-# Main Generation Function
-# -----------------------------------------------------------------------------
-def generate(
-    input_dict: dict,
-    chat_history: list,
-    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
-    temperature: float = 0.6,
-    top_p: float = 0.9,
-    top_k: int = 50,
-    repetition_penalty: float = 1.2,
-):
-    text = input_dict.get("text", "")
     files = input_dict.get("files", [])
-    # Separate video files from images based on file extension.
     video_extensions = (".mp4", ".mov", ".avi", ".mkv", ".webm")
-    video_files = [f for f in files if str(f).lower().endswith(video_extensions)]
-    image_files = [f for f in files if not str(f).lower().endswith(video_extensions)]
-    video_frames = None
-    if video_files:
-        # Process the first video file.
-        video_path = video_files[0]
-        video_frames = downsample_video(video_path)
-    # Build the full prompt from the system prompt, chat history, current text, and file inputs.
-    prompt = build_prompt(chat_history, text, video_frames, image_files)
-    # Tokenize the prompt.
-    inputs = tokenizer(prompt, return_tensors="pt").to(device)
-    # Set up a streamer for incremental output.
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=20.0)
-    generation_kwargs = {
-        "input_ids": inputs["input_ids"],
-        "max_new_tokens": max_new_tokens,
-        "do_sample": True,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-        "streamer": streamer,
-    }
-    # Launch generation in a separate thread.
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    buffer = ""
-    yield progress_bar_html("Processing with Mistral")
-    for new_text in streamer:
-        buffer += new_text
-        time.sleep(0.01)
-        yield buffer
-# -----------------------------------------------------------------------------
-# Gradio Interface
-# -----------------------------------------------------------------------------
 demo = gr.ChatInterface(
-    fn=generate,
-    additional_inputs=[
-        gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
-        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
-        gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
-        gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
-        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
-    ],
-    examples=[
-        [{"text": "Describe the content of the video.", "files": ["examples/sample_video.mp4"]}],
-        [{"text": "Explain what is in this image.", "files": ["examples/sample_image.jpg"]}],
-        ["Tell me a fun fact about space."],
-    ],
-    cache_examples=False,
-    type="messages",
-    description="# **Mistral Chatbot with Video Inference**\nA chatbot built with Mistral (via Transformers) that supports text, image, and video (frame extraction) inputs.",
-    fill_height=True,
-    textbox=gr.MultimodalTextbox(
-        label="Query Input",
-        file_types=["image", "video"],
-        file_count="multiple",
-        placeholder="Type your message here. Optionally attach images or video."
     ),
     stop_btn="Stop Generation",
     multimodal=True,
 )
 if __name__ == "__main__":
-    demo.queue(max_size=20).launch(share=True)

 import gradio as gr
+from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
+from transformers.image_utils import load_image
+from threading import Thread
+import re
+import time
 import torch
+import spaces
+import ast
+import html
+import random
 import cv2
+import numpy as np
+import uuid
+from PIL import Image, ImageOps
+from docling_core.types.doc import DoclingDocument
+from docling_core.types.doc.document import DocTagsDocument
+# ---------------------------
 # Helper Functions
+# ---------------------------
 def progress_bar_html(label: str) -> str:
     return f'''
 <div style="display: flex; align-items: center;">
 </style>
     '''
def downsample_video(video_path, num_frames=10):
    """Sample evenly spaced frames from a video file.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Number of frame positions to sample between the first and
            the last frame (inclusive).

    Returns:
        A list of ``(PIL.Image, timestamp_in_seconds)`` tuples, one per frame
        that could be read. The list is empty when the capture reports no
        frames, a non-positive frame rate, or when ``num_frames`` is below 1.
    """
    frames = []
    vidcap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        if total_frames <= 0 or fps <= 0 or num_frames < 1:
            return frames
        # Indices of num_frames evenly spaced positions across the video.
        frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
        for i in frame_indices:
            # Use plain Python numbers so OpenCV and the timestamps never see
            # numpy scalar types.
            index = int(i)
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, index)
            success, image = vidcap.read()
            if not success:
                continue
            # OpenCV decodes to BGR; PIL expects RGB.
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            timestamp = round(index / fps, 2)
            frames.append((Image.fromarray(image), timestamp))
    finally:
        # Release the capture even if decoding or conversion raises.
        vidcap.release()
    return frames
def add_random_padding(image, min_percent=0.1, max_percent=0.10):
    """Return an RGB copy of ``image`` surrounded by a solid border.

    The horizontal and vertical border sizes are fractions of the image width
    and height, each drawn independently with
    ``random.uniform(min_percent, max_percent)``. The border is filled with
    the colour of the image's top-left pixel.
    """
    rgb = image.convert("RGB")
    width, height = rgb.size
    # Draw the horizontal fraction first, then the vertical one.
    w_fraction = random.uniform(min_percent, max_percent)
    h_fraction = random.uniform(min_percent, max_percent)
    # Same padding on opposite sides: (left, top, right, bottom).
    border = (int(width * w_fraction), int(height * h_fraction)) * 2
    fill_colour = rgb.getpixel((0, 0))
    return ImageOps.expand(rgb, border=border, fill=fill_colour)
def normalize_values(text, target_max=500):
    """Replace bracketed number lists in ``text`` with ``<loc_N>`` tags.

    Every ``[a, b, ...]`` span made only of digits, dots, commas and
    whitespace is parsed as a list literal. Each value is scaled so that the
    largest one maps to ``target_max``, rounded, and the list is replaced by
    the concatenated ``<loc_N>`` tags.

    Args:
        text: Input string, possibly containing bracketed number lists.
        target_max: Value the largest element of each list is scaled to.

    Returns:
        ``text`` with each parsable list replaced. Spans that match the
        pattern but are not valid list literals (for example ``[1,,2]``) are
        left unchanged, and a list whose maximum is zero yields ``<loc_0>``
        tags instead of dividing by zero.
    """
    def normalize_list(values):
        max_value = max(values) if values else 0
        if max_value <= 0:
            # Nothing to scale by (empty list or all zeros).
            return [0] * len(values)
        return [round((v / max_value) * target_max) for v in values]

    def process_match(match):
        raw = match.group(0)
        try:
            num_list = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            # The regex is looser than Python's list syntax; keep the text.
            return raw
        return "".join(f"<loc_{num}>" for num in normalize_list(num_list))

    pattern = r"\[([\d\.\s,]+)\]"
    return re.sub(pattern, process_match, text)
+# ---------------------------
+# Model & Processor Setup
+# ---------------------------
# The processor and the model are loaded from the same checkpoint.
_CHECKPOINT = "ds4sd/SmolDocling-256M-preview"
processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = AutoModelForVision2Seq.from_pretrained(_CHECKPOINT, torch_dtype=torch.bfloat16)
model = model.to("cuda")
+# ---------------------------
+# Main Inference Function
+# ---------------------------
def _stream_generation(inputs, show_placeholder=False):
    """Run ``model.generate`` on a worker thread and stream its output.

    Yields the HTML-escaped text accumulated so far after each chunk the
    streamer produces. The raw, unescaped text is the generator's return
    value (available through ``yield from``).

    Args:
        inputs: Processor output to pass to ``model.generate``.
        show_placeholder: When true, yield ``"..."`` once after the worker
            thread has started and before the first chunk is consumed.
    """
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=False)
    generation_args = dict(inputs, streamer=streamer, max_new_tokens=8192)
    thread = Thread(target=model.generate, kwargs=generation_args)
    thread.start()
    if show_placeholder:
        yield "..."
    buffer = ""
    full_output = ""
    for new_text in streamer:
        full_output += new_text
        # Escape for display only; the raw text is kept for DocTags parsing.
        buffer += html.escape(new_text)
        yield buffer
    return full_output


def _doctags_to_markdown(doctag_output, images):
    """Return the Markdown export of ``doctag_output``, or ``None``.

    ``None`` is returned when the text contains none of the DocTags markers
    checked below.
    """
    markers = ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]
    if not any(tag in doctag_output for tag in markers):
        return None
    doc = DoclingDocument(name="Document")
    if "<chart>" in doctag_output:
        # Charts are loaded as OTSL tables.
        doctag_output = doctag_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
        # Drop the tag directly following a <loc_500> that has no later
        # <loc_500> on the same line.
        doctag_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', doctag_output)
    # NOTE(review): a single DocTags string is paired with every input image;
    # confirm docling_core accepts sequences of different lengths.
    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctag_output], images)
    doc.load_from_doctags(doctags_doc)
    return doc.export_to_markdown()


@spaces.GPU
def model_inference(input_dict, history):
    """Answer a multimodal chat message, streaming the response.

    The request is routed by its attachments: if any attached file has a
    video extension the first such file is sampled into frames; otherwise
    attached files are loaded as images; with no files the query is sent as
    text only.

    Args:
        input_dict: Message dict with a ``"text"`` entry and an optional
            ``"files"`` list.
        history: Previous chat messages; not used.

    Yields:
        A progress-bar HTML snippet, then the HTML-escaped output accumulated
        so far, then the final output with ``<end_of_utterance>`` removed,
        and, for image/video requests whose output contains DocTags markup,
        a Markdown rendering of it.

    Raises:
        gr.Error: If there is neither a query text nor any attached file.
    """
    text = input_dict["text"]
    files = input_dict.get("files", [])
    video_extensions = (".mp4", ".mov", ".avi", ".mkv", ".webm")
    video_files = [f for f in (files or []) if str(f).lower().endswith(video_extensions)]

    if video_files:
        # -------- Video Inference Branch --------
        # Use the first file that actually is a video, wherever it appears
        # in the attachment list.
        frames = downsample_video(video_files[0])
        if not frames:
            yield "Could not process video file."
            return
        images = [frame[0] for frame in frames]
        timestamps = [frame[1] for frame in frames]
        # Append frame timestamps to the query text.
        query = text + " " + " ".join([f"Frame at {ts} seconds." for ts in timestamps])
        label = "Processing video with SmolDocling"
        show_placeholder = False
    elif files:
        # -------- Image Inference Branch --------
        images = [load_image(image) for image in files]
        if "OTSL" in text or "code" in text:
            images = [add_random_padding(image) for image in images]
        query = text
        label = "Processing with SmolDocling"
        show_placeholder = True
    else:
        # -------- Text-Only Inference Branch --------
        if text == "":
            # gr.Error only reaches the UI when it is raised.
            raise gr.Error("Please input a query and optionally image(s).")
        images = None
        query = text
        label = "Processing text with SmolDocling"
        show_placeholder = True

    content = [{"type": "image"} for _ in (images or [])] + [{"type": "text", "text": query}]
    resulting_messages = [{"role": "user", "content": content}]
    prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
    if images is None:
        inputs = processor(text=prompt, return_tensors="pt").to("cuda")
    else:
        inputs = processor(text=prompt, images=[images], return_tensors="pt").to("cuda")

    yield progress_bar_html(label)
    full_output = yield from _stream_generation(inputs, show_placeholder)

    cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
    if not cleaned_output:
        # Nothing was generated, so there is nothing to show or convert.
        return
    yield cleaned_output

    if images is not None:
        markdown = _doctags_to_markdown(cleaned_output, images)
        if markdown is not None:
            yield f"**MD Output:**\n\n{markdown}"
+# ---------------------------
+# Gradio Interface Setup
+# ---------------------------
# (query, attachment) pairs shown as clickable examples in the chat UI.
_EXAMPLE_QUERIES = [
    ("Convert this page to docling.", "example_images/2d0fbcc50e88065a040a537b717620e964fb4453314b71d83f3ed3425addcef6.png"),
    ("Convert this table to OTSL.", "example_images/image-2.jpg"),
    ("Convert code to text.", "example_images/7666.jpg"),
    ("Convert formula to latex.", "example_images/2433.jpg"),
    ("Convert chart to OTSL.", "example_images/06236926002285.png"),
    ("OCR the text in location [47, 531, 167, 565]", "example_images/s2w_example.png"),
    ("Extract all section header elements on the page.", "example_images/paper_3.png"),
    ("Identify element at location [123, 413, 1059, 1061]", "example_images/redhat.png"),
    ("Convert this page to docling.", "example_images/gazette_de_france.jpg"),
    # Example video file (if available)
    ("Describe the events in this video.", "example_videos/sample_video.mp4"),
]
# Each example is a single-element row holding one multimodal message dict.
examples = [[{"text": query, "files": [path]}] for query, path in _EXAMPLE_QUERIES]

demo = gr.ChatInterface(
    fn=model_inference,
    title="SmolDocling-256M: Ultra-compact VLM for Document Conversion 💫",
    description=(
        "Play with [ds4sd/SmolDocling-256M-preview](https://huggingface.co/ds4sd/SmolDocling-256M-preview) in this demo. "
        "Upload an image, video, and text query or try one of the examples. Each chat starts a new conversation."
    ),
    examples=examples,
    textbox=gr.MultimodalTextbox(
        label="Query Input",
        file_types=["image", "video"],
        file_count="multiple",
    ),
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch(debug=True)