Spaces:

prithivMLmods
/

VisionScope-R2

Paused

App Files Files Community

prithivMLmods committed on Mar 23, 2025

Commit

54875b8

verified ·

1 Parent(s): bb78bca

Update app.py

Browse files

Files changed (1) hide show

app.py +350 -249

app.py CHANGED Viewed

@@ -4,7 +4,6 @@ import uuid
 import json
 import time
 import asyncio
-import re
 from threading import Thread
 import gradio as gr
@@ -13,6 +12,7 @@ import torch
 import numpy as np
 from PIL import Image
 import edge_tts
 from transformers import (
     AutoModelForCausalLM,
@@ -24,56 +24,15 @@ from transformers import (
 from transformers.image_utils import load_image
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
-DESCRIPTION = """
-# Gen Vision 🎃
-"""
-css = '''
-h1 {
-  text-align: center;
-  display: block;
-}
-#duplicate-button {
-  margin: auto;
-  color: #fff;
-  background: #1565c0;
-  border-radius: 100vh;
-}
-'''
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# -----------------------
-# Progress Bar Helper
-# -----------------------
-def progress_bar_html(label: str) -> str:
-    """
-    Returns an HTML snippet for a thin progress bar with a label.
-    The progress bar is styled as a dark red animated bar.
-    """
-    return f'''
-<div style="display: flex; align-items: center;">
-    <span style="margin-right: 10px; font-size: 14px;">{label}</span>
-    <div style="width: 110px; height: 5px; background-color: #DDA0DD; border-radius: 2px; overflow: hidden;">
-        <div style="width: 100%; height: 100%; background-color: #FF00FF; animation: loading 1.5s linear infinite;"></div>
-    </div>
-</div>
-<style>
-@keyframes loading {{
-    0% {{ transform: translateX(-100%); }}
-    100% {{ transform: translateX(100%); }}
-}}
-</style>
-    '''
-# -----------------------
-# Text Generation Setup
-# -----------------------
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
@@ -83,170 +42,217 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
 TTS_VOICES = [
     "en-US-JennyNeural",  # @tts1
     "en-US-GuyNeural",    # @tts2
 ]
-# -----------------------
-# Multimodal OCR Setup
-# -----------------------
-MODEL_ID = "prithivMLmods/Qwen2-VL-OCR2-2B-Instruct"
-processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
-    MODEL_ID,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to("cuda").eval()
-async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
-    """Convert text to speech using Edge TTS and save as MP3"""
-    communicate = edge_tts.Communicate(text, voice)
-    await communicate.save(output_file)
-    return output_file
-def clean_chat_history(chat_history):
-    """
-    Filter out any chat entries whose "content" is not a string.
-    """
-    cleaned = []
-    for msg in chat_history:
-        if isinstance(msg, dict) and isinstance(msg.get("content"), str):
-            cleaned.append(msg)
-    return cleaned
-# -----------------------
-# Stable Diffusion Image Generation Setup
-# -----------------------
-MAX_SEED = np.iinfo(np.int32).max
-USE_TORCH_COMPILE = False
-ENABLE_CPU_OFFLOAD = False
-if torch.cuda.is_available():
-    pipe = StableDiffusionXLPipeline.from_pretrained(
-        "SG161222/RealVisXL_V4.0_Lightning",
-        torch_dtype=torch.float16,
-        use_safetensors=True,
-    )
-    pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
-    # LoRA options with one example for each.
-    LORA_OPTIONS = {
-        "Realism": ("prithivMLmods/Canopus-Realism-LoRA", "Canopus-Realism-LoRA.safetensors", "rlms"),
-        "Pixar": ("prithivMLmods/Canopus-Pixar-Art", "Canopus-Pixar-Art.safetensors", "pixar"),
-        "Photoshoot": ("prithivMLmods/Canopus-Photo-Shoot-Mini-LoRA", "Canopus-Photo-Shoot-Mini-LoRA.safetensors", "photo"),
-        "Clothing": ("prithivMLmods/Canopus-Clothing-Adp-LoRA", "Canopus-Dress-Clothing-LoRA.safetensors", "clth"),
-        "Interior": ("prithivMLmods/Canopus-Interior-Architecture-0.1", "Canopus-Interior-Architecture-0.1δ.safetensors", "arch"),
-        "Fashion": ("prithivMLmods/Canopus-Fashion-Product-Dilation", "Canopus-Fashion-Product-Dilation.safetensors", "fashion"),
-        "Minimalistic": ("prithivMLmods/Pegasi-Minimalist-Image-Style", "Pegasi-Minimalist-Image-Style.safetensors", "minimalist"),
-        "Modern": ("prithivMLmods/Canopus-Modern-Clothing-Design", "Canopus-Modern-Clothing-Design.safetensors", "mdrnclth"),
-        "Animaliea": ("prithivMLmods/Canopus-Animaliea-Artism", "Canopus-Animaliea-Artism.safetensors", "Animaliea"),
-        "Wallpaper": ("prithivMLmods/Canopus-Liquid-Wallpaper-Art", "Canopus-Liquid-Wallpaper-Minimalize-LoRA.safetensors", "liquid"),
-        "Cars": ("prithivMLmods/Canes-Cars-Model-LoRA", "Canes-Cars-Model-LoRA.safetensors", "car"),
-        "PencilArt": ("prithivMLmods/Canopus-Pencil-Art-LoRA", "Canopus-Pencil-Art-LoRA.safetensors", "Pencil Art"),
-        "ArtMinimalistic": ("prithivMLmods/Canopus-Art-Medium-LoRA", "Canopus-Art-Medium-LoRA.safetensors", "mdm"),
-    }
-    # Load all LoRA weights
-    for model_name, weight_name, adapter_name in LORA_OPTIONS.values():
-        pipe.load_lora_weights(model_name, weight_name=weight_name, adapter_name=adapter_name)
-    pipe.to("cuda")
-else:
-    pipe = StableDiffusionXLPipeline.from_pretrained(
-        "SG161222/RealVisXL_V4.0_Lightning",
-        torch_dtype=torch.float32,
-        use_safetensors=True,
-    ).to(device)
 def save_image(img: Image.Image) -> str:
-    """Save a PIL image with a unique filename and return the path."""
     unique_name = str(uuid.uuid4()) + ".png"
     img.save(unique_name)
     return unique_name
 def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    return seed
-@spaces.GPU(duration=180, enable_queue=True)
-def generate_image(
-    prompt: str,
-    negative_prompt: str = "",
-    seed: int = 0,
-    width: int = 1024,
-    height: int = 1024,
-    guidance_scale: float = 3.0,
-    randomize_seed: bool = True,
-    lora_model: str = "Realism",
-    progress=gr.Progress(track_tqdm=True),
-):
-    seed = int(randomize_seed_fn(seed, randomize_seed))
-    effective_negative_prompt = negative_prompt  # Use provided negative prompt if any
-    model_name, weight_name, adapter_name = LORA_OPTIONS[lora_model]
-    pipe.set_adapters(adapter_name)
-    outputs = pipe(
-         prompt=prompt,
-         negative_prompt=effective_negative_prompt,
-         width=width,
-         height=height,
-         guidance_scale=guidance_scale,
-         num_inference_steps=28,
-         num_images_per_prompt=1,
-         cross_attention_kwargs={"scale": 0.65},
-         output_type="pil",
-    )
-    images = outputs.images
-    image_paths = [save_image(img) for img in images]
-    return image_paths, seed
-# -----------------------
-# Main Chat/Generation Function
-# -----------------------
-@spaces.GPU
-def generate(
-    input_dict: dict,
-    chat_history: list[dict],
-    max_new_tokens: int = 1024,
-    temperature: float = 0.6,
-    top_p: float = 0.9,
-    top_k: int = 50,
-    repetition_penalty: float = 1.2,
-):
-    """
-    Generates chatbot responses with support for multimodal input, TTS, and image generation.
-    Special commands:
-      - "@tts1" or "@tts2": triggers text-to-speech.
-      - "@<lora_command>": triggers image generation using the new LoRA pipeline.
-         Available commands (case-insensitive): @realism, @pixar, @photoshoot, @clothing, @interior, @fashion,
-         @minimalistic, @modern, @animaliea, @wallpaper, @cars, @pencilart, @artminimalistic.
-    """
     text = input_dict["text"]
     files = input_dict.get("files", [])
-    # Check for image generation command based on LoRA tags.
-    lora_mapping = { key.lower(): key for key in LORA_OPTIONS }
-    for key_lower, key in lora_mapping.items():
-        command_tag = "@" + key_lower
-        if text.strip().lower().startswith(command_tag):
-            prompt_text = text.strip()[len(command_tag):].strip()
-            yield progress_bar_html(f"Processing Image Generation ({key} style)")
-            image_paths, used_seed = generate_image(
-                prompt=prompt_text,
-                negative_prompt="",
-                seed=1,
-                width=1024,
-                height=1024,
-                guidance_scale=3,
-                randomize_seed=True,
-                lora_model=key,
-            )
-            yield progress_bar_html("Finalizing Image Generation")
-            yield gr.Image(image_paths[0])
-            return
-    # Check for TTS command (@tts1 or @tts2)
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
@@ -260,40 +266,31 @@ def generate(
         text = text.replace(tts_prefix, "").strip()
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})
     if files:
-        if len(files) > 1:
-            images = [load_image(image) for image in files]
-        elif len(files) == 1:
-            images = [load_image(files[0])]
-        else:
-            images = []
         messages = [{
             "role": "user",
-            "content": [
-                *[{"type": "image", "image": image} for image in images],
-                {"type": "text", "text": text},
-            ]
         }]
-        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
-        yield progress_bar_html("Processing with Qwen2VL Ocr")
         for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
             time.sleep(0.01)
             yield buffer
     else:
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
         input_ids = input_ids.to(model.device)
         streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {
@@ -309,60 +306,164 @@ def generate(
         }
         t = Thread(target=model.generate, kwargs=generation_kwargs)
         t.start()
         outputs = []
         for new_text in streamer:
             outputs.append(new_text)
             yield "".join(outputs)
         final_response = "".join(outputs)
         yield final_response
         if is_tts and voice:
-            output_file = asyncio.run(text_to_speech(final_response, voice))
-            yield gr.Audio(output_file, autoplay=True)
-# -----------------------
-# Gradio Chat Interface
-# -----------------------
-demo = gr.ChatInterface(
-    fn=generate,
-    additional_inputs=[
-        gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
-        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
-        gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
-        gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
-        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
-    ],
-    examples=[
-        ['@realism Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic'],
-        ["@pixar A young man with light brown wavy hair and light brown eyes sitting in an armchair and looking directly at the camera, pixar style, disney pixar, office background, ultra detailed, 1 man"],
-        ["@realism A futuristic cityscape with neon lights"],
-        ["@photoshoot A portrait of a person with dramatic lighting"],
-        [{"text": "summarize the letter", "files": ["examples/1.png"]}],
-        ["Python Program for Array Rotation"],
-        ["@tts1 Who is Nikola Tesla, and why did he die?"],
-        ["@clothing Fashionable streetwear in an urban environment"],
-        ["@interior A modern living room interior with minimalist design"],
-        ["@fashion A runway model in haute couture"],
-        ["@minimalistic A simple and elegant design of a serene landscape"],
-        ["@modern A contemporary art piece with abstract geometric shapes"],
-        ["@animaliea A cute animal portrait with vibrant colors"],
-        ["@wallpaper A scenic mountain range perfect for a desktop wallpaper"],
-        ["@cars A sleek sports car cruising on a city street"],
-        ["@pencilart A detailed pencil sketch of a historic building"],
-        ["@artminimalistic An artistic minimalist composition with subtle tones"],
-        ["@tts2 What causes rainbows to form?"],
-    ],
-    cache_examples=False,
-    type="messages",
-    description=DESCRIPTION,
-    css=css,
-    fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="default [text, vision] , scroll down examples to explore more art styles"),
-    stop_btn="Stop Generation",
-    multimodal=True,
-)
 if __name__ == "__main__":
-    demo.queue(max_size=20).launch(share=True)

 import json
 import time
 import asyncio
 from threading import Thread
 import gradio as gr
 import numpy as np
 from PIL import Image
 import edge_tts
+import cv2
 from transformers import (
     AutoModelForCausalLM,
 from transformers.image_utils import load_image
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
+# --------- Global Config and Model Loading ---------
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+MAX_SEED = np.iinfo(np.int32).max
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# For text-only generation (chat)
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
+# For TTS
 TTS_VOICES = [
     "en-US-JennyNeural",  # @tts1
     "en-US-GuyNeural",    # @tts2
 ]
+# For multimodal Qwen2VL (OCR / video/text)
+MODEL_ID_QWEN = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
+processor = AutoProcessor.from_pretrained(MODEL_ID_QWEN, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_QWEN,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to("cuda").eval()
+# For SDXL Image Generation
+MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # Set your SDXL model repository path via env variable
+USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
+ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
+BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
+sd_pipe = StableDiffusionXLPipeline.from_pretrained(
+    MODEL_ID_SD,
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    use_safetensors=True,
+    add_watermarker=False,
+).to(device)
+sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+if torch.cuda.is_available():
+    sd_pipe.text_encoder = sd_pipe.text_encoder.half()
+if USE_TORCH_COMPILE:
+    sd_pipe.compile()
+if ENABLE_CPU_OFFLOAD:
+    sd_pipe.enable_model_cpu_offload()
+# LoRA options for the SDXL image-generation tab.
+# Keys are the labels offered in the "LoRA Selection" dropdown; each value is a
+# (repository id, weight file name, adapter name) tuple, unpacked in that order
+# by sdxl_generate and passed to load_lora_weights / set_adapters.
+LORA_OPTIONS = {
+    "Realism (face/character)👦🏻": ("prithivMLmods/Canopus-Realism-LoRA", "Canopus-Realism-LoRA.safetensors", "rlms"),
+    "Pixar (art/toons)🙀": ("prithivMLmods/Canopus-Pixar-Art", "Canopus-Pixar-Art.safetensors", "pixar"),
+    "Photoshoot (camera/film)📸": ("prithivMLmods/Canopus-Photo-Shoot-Mini-LoRA", "Canopus-Photo-Shoot-Mini-LoRA.safetensors", "photo"),
+    "Clothing (hoodies/pant/shirts)👔": ("prithivMLmods/Canopus-Clothing-Adp-LoRA", "Canopus-Dress-Clothing-LoRA.safetensors", "clth"),
+    "Interior Architecture (house/hotel)🏠": ("prithivMLmods/Canopus-Interior-Architecture-0.1", "Canopus-Interior-Architecture-0.1δ.safetensors", "arch"),
+    "Fashion Product (wearing/usable)👜": ("prithivMLmods/Canopus-Fashion-Product-Dilation", "Canopus-Fashion-Product-Dilation.safetensors", "fashion"),
+    "Minimalistic Image (minimal/detailed)🏞️": ("prithivMLmods/Pegasi-Minimalist-Image-Style", "Pegasi-Minimalist-Image-Style.safetensors", "minimalist"),
+    "Modern Clothing (trend/new)👕": ("prithivMLmods/Canopus-Modern-Clothing-Design", "Canopus-Modern-Clothing-Design.safetensors", "mdrnclth"),
+    "Animaliea (farm/wild)🫎": ("prithivMLmods/Canopus-Animaliea-Artism", "Canopus-Animaliea-Artism.safetensors", "Animaliea"),
+    "Liquid Wallpaper (minimal/illustration)🖼️": ("prithivMLmods/Canopus-Liquid-Wallpaper-Art", "Canopus-Liquid-Wallpaper-Minimalize-LoRA.safetensors", "liquid"),
+    "Canes Cars (realistic/futurecars)🚘": ("prithivMLmods/Canes-Cars-Model-LoRA", "Canes-Cars-Model-LoRA.safetensors", "car"),
+    "Pencil Art (characteristic/creative)✏️": ("prithivMLmods/Canopus-Pencil-Art-LoRA", "Canopus-Pencil-Art-LoRA.safetensors", "Pencil Art"),
+    "Art Minimalistic (paint/semireal)🎨": ("prithivMLmods/Canopus-Art-Medium-LoRA", "Canopus-Art-Medium-LoRA.safetensors", "mdm"),
+}
+style_list = [
+    {
+        "name": "3840 x 2160",
+        "prompt": "hyper-realistic 8K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
+        "negative_prompt": "cartoonish, low resolution, blurry, simplistic, abstract, deformed, ugly",
+    },
+    {
+        "name": "2560 x 1440",
+        "prompt": "hyper-realistic 4K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
+        "negative_prompt": "cartoonish, low resolution, blurry, simplistic, abstract, deformed, ugly",
+    },
+    {
+        "name": "HD+",
+        "prompt": "hyper-realistic 2K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
+        "negative_prompt": "cartoonish, low resolution, blurry, simplistic, abstract, deformed, ugly",
+    },
+    {
+        "name": "Style Zero",
+        "prompt": "{prompt}",
+        "negative_prompt": "",
+    },
+]
+styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
+DEFAULT_STYLE_NAME = "3840 x 2160"
+STYLE_NAMES = list(styles.keys())
+# --------- Utility Functions ---------
+def text_to_speech(text: str, voice: str, output_file="output.mp3"):
+    """Convert text to speech using Edge TTS and save as MP3"""
+    async def run_tts():
+        communicate = edge_tts.Communicate(text, voice)
+        await communicate.save(output_file)
+        return output_file
+    return asyncio.run(run_tts())
+def clean_chat_history(chat_history):
+    """Remove non-string content from the chat history."""
+    return [msg for msg in chat_history if isinstance(msg, dict) and isinstance(msg.get("content"), str)]
 def save_image(img: Image.Image) -> str:
+    """Save a PIL image as ``<uuid4>.png`` (a relative path) and return that file name."""
     unique_name = str(uuid.uuid4()) + ".png"
     img.save(unique_name)
     return unique_name
 def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
+    """Return a random seed in ``[0, MAX_SEED]`` when ``randomize_seed`` is true, else ``seed`` unchanged."""
+    return random.randint(0, MAX_SEED) if randomize_seed else seed
+def progress_bar_html(label: str) -> str:
+    """Return an HTML snippet showing ``label`` next to a thin animated bar.
+
+    ``label`` is interpolated into the markup as-is (no HTML escaping is
+    applied here). The doubled braces in the ``@keyframes`` rule are f-string
+    escapes that produce literal ``{`` and ``}`` in the output.
+    """
+    return f'''
+    <div style="display: flex; align-items: center;">
+        <span style="margin-right: 10px; font-size: 14px;">{label}</span>
+        <div style="width: 110px; height: 5px; background-color: #FFF0F5; border-radius: 2px; overflow: hidden;">
+            <div style="width: 100%; height: 100%; background-color: #FF69B4; animation: loading 1.5s linear infinite;"></div>
+        </div>
+    </div>
+    <style>
+    @keyframes loading {{
+        0% {{ transform: translateX(-100%); }}
+        100% {{ transform: translateX(100%); }}
+    }}
+    </style>
+    '''
+def downsample_video(video_path):
+    """Extract 10 evenly spaced frames from a video."""
+    vidcap = cv2.VideoCapture(video_path)
+    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = vidcap.get(cv2.CAP_PROP_FPS)
+    frames = []
+    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
+    for i in frame_indices:
+        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+        success, image = vidcap.read()
+        if success:
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            pil_image = Image.fromarray(image)
+            timestamp = round(i / fps, 2)
+            frames.append((pil_image, timestamp))
+    vidcap.release()
+    return frames
+def apply_style(style_name: str, positive: str, negative: str = ""):
+    """Apply a chosen quality style to the prompt."""
+    p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
+    return p.replace("{prompt}", positive), n + negative
+# --------- Tab 1: Chat Interface (Multimodal) ---------
+def chat_generate(input_dict: dict, chat_history: list,
+                  max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
+                  temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
     text = input_dict["text"]
     files = input_dict.get("files", [])
+    lower_text = text.strip().lower()
+    # If image generation command
+    if lower_text.startswith("@image"):
+        prompt = text[len("@image"):].strip()
+        yield progress_bar_html("Generating Image")
+        image_paths, used_seed = generate_image_fn(
+            prompt=prompt,
+            negative_prompt="",
+            use_negative_prompt=False,
+            seed=1,
+            width=1024,
+            height=1024,
+            guidance_scale=3,
+            num_inference_steps=25,
+            randomize_seed=True,
+            use_resolution_binning=True,
+            num_images=1,
+        )
+        yield gr.Image.update(value=image_paths[0])
+        return
+    # If video inference command
+    if lower_text.startswith("@video-infer"):
+        prompt = text[len("@video-infer"):].strip()
+        if files:
+            video_path = files[0]
+            frames = downsample_video(video_path)
+            messages = [
+                {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+                {"role": "user", "content": [{"type": "text", "text": prompt}]}
+            ]
+            for frame in frames:
+                image, timestamp = frame
+                image_path = f"video_frame_{uuid.uuid4().hex}.png"
+                image.save(image_path)
+                messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+                messages[1]["content"].append({"type": "image", "url": image_path})
+        else:
+            messages = [
+                {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+                {"role": "user", "content": [{"type": "text", "text": prompt}]}
+            ]
+        inputs = processor.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt").to("cuda")
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        yield progress_bar_html("Processing video with Qwen2VL")
+        for new_text in streamer:
+            buffer += new_text.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
+        return
+    # Check for TTS command
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
         text = text.replace(tts_prefix, "").strip()
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})
     if files:
+        # Handle multimodal chat with images
+        images = [load_image(f) for f in files]
         messages = [{
             "role": "user",
+            "content": [{"type": "image", "image": image} for image in images] + [{"type": "text", "text": text}]
         }]
+        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(text=[prompt_full], images=images, return_tensors="pt", padding=True).to("cuda")
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
+        yield progress_bar_html("Thinking...")
         for new_text in streamer:
+            buffer += new_text.replace("<|im_end|>", "")
             time.sleep(0.01)
             yield buffer
     else:
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+            gr.Warning(f"Trimmed input as it exceeded {MAX_INPUT_TOKEN_LENGTH} tokens.")
         input_ids = input_ids.to(model.device)
         streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {
         }
         t = Thread(target=model.generate, kwargs=generation_kwargs)
         t.start()
         outputs = []
+        yield progress_bar_html("Processing...")
         for new_text in streamer:
             outputs.append(new_text)
             yield "".join(outputs)
         final_response = "".join(outputs)
         yield final_response
         if is_tts and voice:
+            output_file = text_to_speech(final_response, voice)
+            yield gr.Audio.update(value=output_file)
+# Helper function for image generation (used in chat @image branch)
+@spaces.GPU(duration=60, enable_queue=True)
+def generate_image_fn(prompt: str, negative_prompt: str = "", use_negative_prompt: bool = False,
+                      seed: int = 1, width: int = 1024, height: int = 1024,
+                      guidance_scale: float = 3, num_inference_steps: int = 25,
+                      randomize_seed: bool = False, use_resolution_binning: bool = True,
+                      num_images: int = 1, progress=None):
+    seed = int(randomize_seed_fn(seed, randomize_seed))
+    generator = torch.Generator(device=device).manual_seed(seed)
+    options = {
+        "prompt": [prompt] * num_images,
+        "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
+        "width": width,
+        "height": height,
+        "guidance_scale": guidance_scale,
+        "num_inference_steps": num_inference_steps,
+        "generator": generator,
+        "output_type": "pil",
+    }
+    if use_resolution_binning:
+        options["use_resolution_binning"] = True
+    images = []
+    for i in range(0, num_images, BATCH_SIZE):
+        batch_options = options.copy()
+        batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
+        if batch_options.get("negative_prompt") is not None:
+            batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
+        if device.type == "cuda":
+            with torch.autocast("cuda", dtype=torch.float16):
+                outputs = sd_pipe(**batch_options)
+        else:
+            outputs = sd_pipe(**batch_options)
+        images.extend(outputs.images)
+    image_paths = [save_image(img) for img in images]
+    return image_paths, seed
+# --------- Tab 2: SDXL Image Generation ---------
+@spaces.GPU(duration=180, enable_queue=True)
+def sdxl_generate(prompt: str, negative_prompt: str = "", use_negative_prompt: bool = True,
+                  seed: int = 0, width: int = 1024, height: int = 1024, guidance_scale: float = 3,
+                  randomize_seed: bool = False, style_name: str = DEFAULT_STYLE_NAME,
+                  lora_model: str = "Realism (face/character)👦🏻", progress=None):
+    seed = int(randomize_seed_fn(seed, randomize_seed))
+    positive_prompt, effective_negative_prompt = apply_style(style_name, prompt, negative_prompt)
+    if not use_negative_prompt:
+        effective_negative_prompt = ""
+    model_name, weight_name, adapter_name = LORA_OPTIONS[lora_model]
+    # Set the adapter for the current generation
+    sd_pipe.load_lora_weights(model_name, weight_name=weight_name, adapter_name=adapter_name)
+    sd_pipe.set_adapters(adapter_name)
+    images = sd_pipe(
+        prompt=positive_prompt,
+        negative_prompt=effective_negative_prompt,
+        width=width,
+        height=height,
+        guidance_scale=guidance_scale,
+        num_inference_steps=20,
+        num_images_per_prompt=1,
+        cross_attention_kwargs={"scale": 0.65},
+        output_type="pil",
+    ).images
+    image_paths = [save_image(img) for img in images]
+    return image_paths, seed
+# --------- Tab 3: Qwen2VL OCR & Text Generation ---------
+def qwen2vl_ocr_textgen(prompt: str, image_file):
+    if image_file is None:
+        return "Please upload an image."
+    # Load the image
+    image = load_image(image_file)
+    messages = [
+        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+        {"role": "user", "content": [{"type": "text", "text": prompt}, {"type": "image", "image": image}]}
+    ]
+    inputs = processor.apply_chat_template(messages, tokenize=True, add_generation_prompt=True,
+                                             return_dict=True, return_tensors="pt").to("cuda")
+    outputs = model_m.generate(
+        **inputs,
+        max_new_tokens=1024,
+        do_sample=True,
+        temperature=0.6,
+        top_p=0.9,
+        top_k=50,
+        repetition_penalty=1.2,
+    )
+    response = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+    return response
+# --------- Building the Gradio Interface with Tabs ---------
+with gr.Blocks(title="Combined Demo") as demo:
+    gr.Markdown("# Combined Demo: Chat, SDXL Image Gen & Qwen2VL OCR/TextGen")
+    with gr.Tabs():
+        # --- Tab 1: Chat Interface ---
+        with gr.Tab("Chat Interface"):
+            chat_interface = gr.ChatInterface(
+                fn=chat_generate,
+                additional_inputs=[
+                    gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
+                    gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
+                    gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
+                    gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
+                    gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
+                ],
+                examples=[
+                    ["Write the Python Program for Array Rotation"],
+                    [{"text": "summarize the letter", "files": ["examples/1.png"]}],
+                    [{"text": "@video-infer Describe the Ad", "files": ["examples/coca.mp4"]}],
+                    ["@image Chocolate dripping from a donut"],
+                    ["@tts1 Who is Nikola Tesla, and why did he die?"],
+                ],
+                cache_examples=False,
+                type="messages",
+                description="Use commands like **@image**, **@video-infer**, **@tts1**, or plain text.",
+                textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple",
+                                               placeholder="Type your query (e.g., @tts1 for TTS, @image for image gen, etc.)"),
+                stop_btn="Stop Generation",
+                multimodal=True,
+            )
+        # --- Tab 2: SDXL Image Generation ---
+        with gr.Tab("SDXL Gen Image"):
+            with gr.Row():
+                prompt_in = gr.Textbox(label="Prompt", placeholder="Enter prompt for image generation")
+                negative_prompt_in = gr.Textbox(label="Negative prompt", placeholder="Enter negative prompt", lines=2)
+            with gr.Row():
+                seed_in = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
+                randomize_in = gr.Checkbox(label="Randomize seed", value=True)
+            with gr.Row():
+                width_in = gr.Slider(label="Width", minimum=512, maximum=2048, step=8, value=1024)
+                height_in = gr.Slider(label="Height", minimum=512, maximum=2048, step=8, value=1024)
+            guidance_in = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=20.0, step=0.1, value=3.0)
+            style_in = gr.Radio(choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME, label="Quality Style")
+            lora_in = gr.Dropdown(choices=list(LORA_OPTIONS.keys()), value="Realism (face/character)👦🏻", label="LoRA Selection")
+            run_button_img = gr.Button("Generate Image")
+            output_gallery = gr.Gallery(label="Generated Image", columns=1, preview=True)
+            seed_output = gr.Number(label="Seed used")
+            run_button_img.click(fn=sdxl_generate,
+                                 inputs=[prompt_in, negative_prompt_in, randomize_in, seed_in, width_in, height_in, guidance_in, randomize_in, style_in, lora_in],
+                                 outputs=[output_gallery, seed_output])
+        # --- Tab 3: Qwen2VL OCR & Text Generation ---
+        with gr.Tab("Qwen2VL OCR/TextGen"):
+            with gr.Row():
+                qwen_prompt = gr.Textbox(label="Prompt", placeholder="Enter prompt for OCR / text generation")
+                qwen_image = gr.Image(label="Upload Image", type="filepath")
+            run_button_qwen = gr.Button("Run Qwen2VL")
+            qwen_output = gr.Textbox(label="Output")
+            run_button_qwen.click(fn=qwen2vl_ocr_textgen, inputs=[qwen_prompt, qwen_image], outputs=qwen_output)
 if __name__ == "__main__":
+    # Enable request queuing (at most 30 waiting requests), then launch with share=True.
+    demo.queue(max_size=30).launch(share=True)