Spaces:

prithivMLmods
/

VisionScope-R2

Running on Zero

App Files Community

prithivMLmods committed on Jun 6, 2025

Commit

3fa8f27

verified ·

1 Parent(s): dab83dd

Update app.py

Browse files

Files changed (1) hide show

app.py +130 -207

app.py CHANGED Viewed

@@ -19,8 +19,6 @@ from transformers import (
     Qwen2VLForConditionalGeneration,
     AutoProcessor,
     AutoTokenizer,
-    AutoModel,
-    AutoImageProcessor,
     TextIteratorStreamer,
 )
@@ -33,18 +31,14 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# Load Llama-3.1-Nemotron-Nano-VL-8B-V1
-MODEL_ID_M = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
-processor_m = AutoImageProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
-tokenizer_m = AutoTokenizer.from_pretrained(MODEL_ID_M)
-tokenizer_m.pad_token = tokenizer_m.eos_token  # Set pad_token to resolve ValueError
-model_m = AutoModel.from_pretrained(
     MODEL_ID_M,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
-# Fix AssertionError by setting img_context_token_id
-model_m.img_context_token_id = tokenizer_m.convert_tokens_to_ids("<image>")
 # Load Space Thinker
 MODEL_ID_Z = "remyxai/SpaceThinker-Qwen2.5VL-3B"
@@ -64,7 +58,21 @@ model_k = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 def downsample_video(video_path):
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
@@ -83,214 +91,129 @@ def downsample_video(video_path):
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
-                  max_new_tokens: int = 1024,
-                  temperature: float = 0.6,
-                  top_p: float = 0.9,
-                  top_k: int = 50,
-                  repetition_penalty: float = 1.2):
-    if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
         processor = processor_m
-        tokenizer = tokenizer_m
         model = model_m
-        if image is None:
-            yield "Please upload an image."
-            return
-        # Construct message with <image> token
-        if "<image>" not in text:
-            message = f"<image>\n{text}"
-        else:
-            message = text
-        # Tokenize the message
-        inputs = tokenizer(message, return_tensors="pt").to(device)
-        # Process image
-        image_features = processor(image, return_tensors="pt").to(device)
-        # Combine inputs
-        generation_inputs = {
-            "input_ids": inputs["input_ids"],
-            "attention_mask": inputs["attention_mask"],
-            **image_features,
-        }
-        # Create streamer
-        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-        # Generation kwargs
-        generation_kwargs = {
-            **generation_inputs,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
-        }
-        # Start generation in a thread
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-    elif model_name in ["SpaceThinker-3B", "coreOCR-7B-050325-preview"]:
-        if model_name == "SpaceThinker-3B":
-            processor = processor_z
-            model = model_z
-        else:
-            processor = processor_k
-            model = model_k
-        if image is None:
-            yield "Please upload an image."
-            return
-        messages = [{
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image},
-                {"type": "text", "text": text},
-            ]
-        }]
-        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(
-            text=[prompt_full],
-            images=[image],
-            return_tensors="pt",
-            padding=True,
-            truncation=False,
-            max_length=MAX_INPUT_TOKEN_LENGTH
-        ).to(device)
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
     else:
         yield "Invalid model selected."
         return
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
-                  max_new_tokens: int = 1024,
-                  temperature: float = 0.6,
-                  top_p: float = 0.9,
-                  top_k: int = 50,
-                  repetition_penalty: float = 1.2):
-    if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
         processor = processor_m
-        tokenizer = tokenizer_m
         model = model_m
-        if video_path is None:
-            yield "Please upload a video."
-            return
-        frames = downsample_video(video_path)
-        # Construct message with multiple <image> tokens
-        prompt_parts = ["<image>"] * len(frames) + [text]
-        message = " ".join(prompt_parts)
-        # Tokenize
-        inputs = tokenizer(message, return_tensors="pt").to(device)
-        # Process all frames
-        image_features = processor([frame[0] for frame in frames], return_tensors="pt").to(device)
-        # Combine inputs
-        generation_inputs = {
-            "input_ids": inputs["input_ids"],
-            "attention_mask": inputs["attention_mask"],
-            **image_features,
-        }
-        # Create streamer
-        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-        # Generation kwargs
-        generation_kwargs = {
-            **generation_inputs,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
-        }
-        # Start generation in a thread
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-    elif model_name in ["SpaceThinker-3B", "coreOCR-7B-050325-preview"]:
-        if model_name == "SpaceThinker-3B":
-            processor = processor_z
-            model = model_z
-        else:
-            processor = processor_k
-            model = model_k
-        if video_path is None:
-            yield "Please upload a video."
-            return
-        frames = downsample_video(video_path)
-        messages = [
-            {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-            {"role": "user", "content": [{"type": "text", "text": text}]}
-        ]
-        for frame in frames:
-            image, timestamp = frame
-            messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
-            messages[1]["content"].append({"type": "image", "image": image})
-        inputs = processor.apply_chat_template(
-            messages,
-            tokenize=True,
-            add_generation_prompt=True,
-            return_dict=True,
-            return_tensors="pt",
-            truncation=False,
-            max_length=MAX_INPUT_TOKEN_LENGTH
-        ).to(device)
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            **inputs,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
-        }
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
     else:
         yield "Invalid model selected."
         return
 # Define examples for image and video inference
 image_examples = [
     ["type out the messy hand-writing as accurately as you can.", "images/1.jpg"],
@@ -346,13 +269,13 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         with gr.Column():
             output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
             model_choice = gr.Radio(
-                choices=["Llama-3.1-Nemotron-Nano-VL-8B-V1", "SpaceThinker-3B", "coreOCR-7B-050325-preview"],
                 label="Select Model",
-                value="Llama-3.1-Nemotron-Nano-VL-8B-V1"
             )
             gr.Markdown("**Model Info**")
-            gr.Markdown("⤷ [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1): structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
             gr.Markdown("⤷ [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): thinking/reasoning multimodal/vision-language model (VLM) trained to enhance spatial reasoning.")
             gr.Markdown("⤷ [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
             gr.Markdown("⤷ [Imgscope-OCR-2B-0527](https://huggingface.co/prithivMLmods/Imgscope-OCR-2B-0527): fine-tuned version of qwen2-vl-2b-instruct, specifically optimized for messy handwriting recognition, document ocr, realistic handwritten ocr, and math problem solving with latex formatting.")

     Qwen2VLForConditionalGeneration,
     AutoProcessor,
     AutoTokenizer,
     TextIteratorStreamer,
 )
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# Load SkyCaptioner-V1
+MODEL_ID_M = "Skywork/SkyCaptioner-V1"
+processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_M,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 # Load Space Thinker
 MODEL_ID_Z = "remyxai/SpaceThinker-Qwen2.5VL-3B"
     torch_dtype=torch.float16
 ).to(device).eval()
+# Load Imgscope-OCR-2B-0527
+MODEL_ID_Y = "prithivMLmods/Imgscope-OCR-2B-0527"
+processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
+model_y = Qwen2VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_Y,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to(device).eval()
 def downsample_video(video_path):
+    """
+    Downsamples the video to evenly spaced frames.
+    Each frame is returned as a PIL image along with its timestamp.
+    """
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
+                   max_new_tokens: int = 1024,
+                   temperature: float = 0.6,
+                   top_p: float = 0.9,
+                   top_k: int = 50,
+                   repetition_penalty: float = 1.2):
+    """
+    Generates responses using the selected model for image input.
+    """
+    if model_name == "SkyCaptioner-V1":
         processor = processor_m
         model = model_m
+    elif model_name == "SpaceThinker-3B":
+        processor = processor_z
+        model = model_z
+    elif model_name == "coreOCR-7B-050325-preview":
+        processor = processor_k
+        model = model_k
+    elif model_name == "Imgscope-OCR-2B-0527":
+        processor = processor_y
+        model = model_y
     else:
         yield "Invalid model selected."
         return
+    if image is None:
+        yield "Please upload an image."
+        return
+    messages = [{
+        "role": "user",
+        "content": [
+            {"type": "image", "image": image},
+            {"type": "text", "text": text},
+        ]
+    }]
+    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor(
+        text=[prompt_full],
+        images=[image],
+        return_tensors="pt",
+        padding=True,
+        truncation=False,
+        max_length=MAX_INPUT_TOKEN_LENGTH
+    ).to(device)
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        buffer = buffer.replace("<|im_end|>", "")
+        time.sleep(0.01)
+        yield buffer
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
+                   max_new_tokens: int = 1024,
+                   temperature: float = 0.6,
+                   top_p: float = 0.9,
+                   top_k: int = 50,
+                   repetition_penalty: float = 1.2):
+    """
+    Generates responses using the selected model for video input.
+    """
+    if model_name == "SkyCaptioner-V1":
         processor = processor_m
         model = model_m
+    elif model_name == "SpaceThinker-3B":
+        processor = processor_z
+        model = model_z
+    elif model_name == "coreOCR-7B-050325-preview":
+        processor = processor_k
+        model = model_k
+    elif model_name == "Imgscope-OCR-2B-0527":
+        processor = processor_y
+        model = model_y
     else:
         yield "Invalid model selected."
         return
+    if video_path is None:
+        yield "Please upload a video."
+        return
+    frames = downsample_video(video_path)
+    messages = [
+        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+        {"role": "user", "content": [{"type": "text", "text": text}]}
+    ]
+    for frame in frames:
+        image, timestamp = frame
+        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+        messages[1]["content"].append({"type": "image", "image": image})
+    inputs = processor.apply_chat_template(
+        messages,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_dict=True,
+        return_tensors="pt",
+        truncation=False,
+        max_length=MAX_INPUT_TOKEN_LENGTH
+    ).to(device)
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "do_sample": True,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+    }
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        buffer = buffer.replace("<|im_end|>", "")
+        time.sleep(0.01)
+        yield buffer
 # Define examples for image and video inference
 image_examples = [
     ["type out the messy hand-writing as accurately as you can.", "images/1.jpg"],
         with gr.Column():
             output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
             model_choice = gr.Radio(
+                choices=["SkyCaptioner-V1", "SpaceThinker-3B", "coreOCR-7B-050325-preview", "Imgscope-OCR-2B-0527"],
                 label="Select Model",
+                value="SkyCaptioner-V1"
             )
             gr.Markdown("**Model Info**")
+            gr.Markdown("⤷ [SkyCaptioner-V1](https://huggingface.co/Skywork/SkyCaptioner-V1):  structural video captioning model designed to generate high-quality, structural descriptions for video data. It integrates specialized sub-expert models.")
             gr.Markdown("⤷ [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): thinking/reasoning multimodal/vision-language model (VLM) trained to enhance spatial reasoning.")
             gr.Markdown("⤷ [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
             gr.Markdown("⤷ [Imgscope-OCR-2B-0527](https://huggingface.co/prithivMLmods/Imgscope-OCR-2B-0527): fine-tuned version of qwen2-vl-2b-instruct, specifically optimized for messy handwriting recognition, document ocr, realistic handwritten ocr, and math problem solving with latex formatting.")