Update app.py
app.py CHANGED
@@ -12,6 +12,7 @@ import torch
 import numpy as np
 from PIL import Image
 import edge_tts
+import cv2

 from transformers import (
     AutoModelForCausalLM,
@@ -149,6 +150,28 @@ def progress_bar_html(label: str) -> str:
     </style>
     '''

+def downsample_video(video_path):
+    """
+    Downsamples the video to 10 evenly spaced frames.
+    Each frame is returned as a PIL image along with its timestamp.
+    """
+    vidcap = cv2.VideoCapture(video_path)
+    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = vidcap.get(cv2.CAP_PROP_FPS)
+    frames = []
+    # Sample 10 evenly spaced frames.
+    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
+    for i in frame_indices:
+        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+        success, image = vidcap.read()
+        if success:
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
+            pil_image = Image.fromarray(image)
+            timestamp = round(i / fps, 2)
+            frames.append((pil_image, timestamp))
+    vidcap.release()
+    return frames
+
 @spaces.GPU(duration=60, enable_queue=True)
 def generate_image_fn(
     prompt: str,
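Editor's note: the new helper can be sanity-checked on its own. A minimal sketch, assuming the sample clip shipped with the Space (any local video path works):

# Standalone check of downsample_video(); "examples/sky.mp4" is the clip used in the demo examples.
frames = downsample_video("examples/sky.mp4")
print(f"sampled {len(frames)} frames")
for pil_image, timestamp in frames:
    # Each entry is a (PIL.Image, timestamp-in-seconds) pair.
    print(f"t={timestamp}s size={pil_image.size}")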
@@ -213,14 +236,16 @@ def generate(
     Special commands:
     - "@tts1" or "@tts2": triggers text-to-speech.
     - "@image": triggers image generation using the SDXL pipeline.
+    - "@qwen2vl-video": triggers video processing using Qwen2VL.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
+    lower_text = text.strip().lower()

-
+    # Branch for image generation.
+    if lower_text.startswith("@image"):
         # Remove the "@image" tag and use the rest as prompt
         prompt = text[len("@image"):].strip()
-        # Show animated progress bar for image generation
         yield progress_bar_html("Generating Image")
         image_paths, used_seed = generate_image_fn(
             prompt=prompt,
@@ -235,10 +260,57 @@ def generate(
             use_resolution_binning=True,
             num_images=1,
         )
-        # Once done, yield the generated image
         yield gr.Image(image_paths[0])
-        return
+        return
+
+    # New branch for video processing with Qwen2VL.
+    if lower_text.startswith("@qwen2vl-video"):
+        prompt = text[len("@qwen2vl-video"):].strip()
+        if files:
+            # Assume the first file is a video.
+            video_path = files[0]
+            frames = downsample_video(video_path)
+            messages = [
+                {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+                {"role": "user", "content": [{"type": "text", "text": prompt}]}
+            ]
+            # Append each frame with its timestamp.
+            for frame in frames:
+                image, timestamp = frame
+                image_path = f"video_frame_{uuid.uuid4().hex}.png"
+                image.save(image_path)
+                messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+                messages[1]["content"].append({"type": "image", "url": image_path})
+        else:
+            messages = [
+                {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+                {"role": "user", "content": [{"type": "text", "text": prompt}]}
+            ]
+        inputs = processor.apply_chat_template(
+            messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
+        ).to("cuda")
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        yield progress_bar_html("Processing video with Qwen2VL")
+        for new_text in streamer:
+            buffer += new_text
+            time.sleep(0.01)
+            yield buffer
+        return

+    # Determine if TTS is requested.
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
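Editor's note: the messages list this branch hands to processor.apply_chat_template ends up shaped roughly as sketched below; frame count, timestamps, and file names are illustrative, not taken from a real run.

# Illustrative shape of the chat payload built above for a two-frame video:
messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
    {"role": "user", "content": [
        {"type": "text", "text": "Summarize the events in this video"},
        {"type": "text", "text": "Frame 0.0:"},  {"type": "image", "url": "video_frame_<hex>.png"},
        {"type": "text", "text": "Frame 3.25:"}, {"type": "image", "url": "video_frame_<hex>.png"},
        # ...one text/image pair per sampled frame, up to 10...
    ]},
]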
@@ -246,11 +318,9 @@ def generate(
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
-        # Clear previous chat history for a fresh TTS request.
         conversation = [{"role": "user", "content": text}]
     else:
         voice = None
-        # Remove any stray @tts tags and build the conversation history.
         text = text.replace(tts_prefix, "").strip()
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})
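Editor's note: a quick worked example of the tag handling above, using the existing @tts1 example prompt:

# "@tts1 Who is Nikola Tesla, and why did he die?"
#   is_tts       -> True, voice_index -> 1, voice -> TTS_VOICES[0]
#   text         -> "Who is Nikola Tesla, and why did he die?"
#   conversation -> [{"role": "user", "content": text}]  (chat history is dropped for TTS turns)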
@@ -269,15 +339,13 @@ def generate(
                 {"type": "text", "text": text},
             ]
         }]
-
-        inputs = processor(text=[
+        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(text=[prompt_full], images=images, return_tensors="pt", padding=True).to("cuda")
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
-
         buffer = ""
-        # Show animated progress bar for multimodal generation
         yield progress_bar_html("Thinking...")
         for new_text in streamer:
             buffer += new_text
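Editor's note: the change here moves the image path to the usual two-step transformers flow (render the chat template to a string, then tensorise it together with the images). A stripped-down, non-streaming sketch of the same pattern; variable names mirror the surrounding code and are not new APIs:

# Template-then-tensorise, without the streaming thread:
prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[prompt_full], images=images, return_tensors="pt", padding=True).to("cuda")
output_ids = model_m.generate(**inputs, max_new_tokens=max_new_tokens)
# Trim the prompt tokens before decoding the reply.
generated = output_ids[:, inputs["input_ids"].shape[1]:]
print(processor.batch_decode(generated, skip_special_tokens=True)[0])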
@@ -304,18 +372,13 @@ def generate(
         }
         t = Thread(target=model.generate, kwargs=generation_kwargs)
         t.start()
-
         outputs = []
-
-        yield progress_bar_html("Thinking...")
+        yield progress_bar_html("Processing with Qwen2VL Ocr")
         for new_text in streamer:
             outputs.append(new_text)
             yield "".join(outputs)
-
         final_response = "".join(outputs)
         yield final_response
-
-        # If TTS was requested, convert the final response to speech.
         if is_tts and voice:
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
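Editor's note: text_to_speech itself is untouched by this commit and its body is not shown here. For context only, a hypothetical helper with the same call shape, built on the edge_tts API the file already imports, would look roughly like this:

# Hypothetical sketch, not the app's actual implementation:
import edge_tts

async def text_to_speech(text: str, voice: str, output_file: str = "speech.mp3") -> str:
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)  # Write the synthesized audio to disk.
    return output_file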
@@ -330,6 +393,7 @@ demo = gr.ChatInterface(
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
+        [{"text": "@qwen2vl-video Summarize the events in this video", "files": ["examples/sky.mp4"]}],
         ["@image Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic"],
         ["Python Program for Array Rotation"],
         ["@tts1 Who is Nikola Tesla, and why did he die?"],
@@ -342,7 +406,7 @@ demo = gr.ChatInterface(
     description=DESCRIPTION,
     css=css,
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple",
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder=" @tts1, @tts2-voices, @image for image gen, @qwen2vl-video for video, default [text, vision]"),
     stop_btn="Stop Generation",
     multimodal=True,
 )
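Editor's note: with this change the textbox placeholder matches how generate() routes a request; summarizing the branches above, with no new behaviour:

# Routing of the multimodal textbox input, as implemented in generate():
#   "@image <prompt>"          -> SDXL image generation
#   "@qwen2vl-video <prompt>"  -> 10 sampled frames + prompt -> Qwen2VL, streamed
#   "@tts1 ..." / "@tts2 ..."  -> text reply, then edge_tts audio (TTS_VOICES[0] / TTS_VOICES[1])
#   anything else              -> default text / vision chat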