Spaces:

prithivMLmods
/

VisionScope-R2

Paused

App Files Files Community

prithivMLmods committed on Mar 23, 2025

Commit

74ba6ce

verified ·

1 Parent(s): 54875b8

Update app.py

Browse files

Files changed (1) hide show

app.py +211 -344

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import os
 import random
 import uuid
-import json
 import time
 import asyncio
 from threading import Thread
@@ -11,7 +10,6 @@ import spaces
 import torch
 import numpy as np
 from PIL import Image
-import edge_tts
 import cv2
 from transformers import (
@@ -24,31 +22,107 @@ from transformers import (
 from transformers.image_utils import load_image
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
-# --------- Global Config and Model Loading ---------
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-MAX_SEED = np.iinfo(np.int32).max
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# For text-only generation (chat)
-model_id = "prithivMLmods/FastThink-0.5B-Tiny"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
-    model_id,
     device_map="auto",
     torch_dtype=torch.bfloat16,
 )
 model.eval()
-# For TTS
-TTS_VOICES = [
-    "en-US-JennyNeural",  # @tts1
-    "en-US-GuyNeural",    # @tts2
-]
-# For multimodal Qwen2VL (OCR / video/text)
 MODEL_ID_QWEN = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID_QWEN, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -57,8 +131,46 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
-# For SDXL Image Generation
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # Set your SDXL model repository path via env variable
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
 BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
@@ -77,7 +189,7 @@ if USE_TORCH_COMPILE:
 if ENABLE_CPU_OFFLOAD:
     sd_pipe.enable_model_cpu_offload()
-# For SDXL quality styles and LoRA options (used in the image-gen tab)
 LORA_OPTIONS = {
     "Realism (face/character)👦🏻": ("prithivMLmods/Canopus-Realism-LoRA", "Canopus-Realism-LoRA.safetensors", "rlms"),
     "Pixar (art/toons)🙀": ("prithivMLmods/Canopus-Pixar-Art", "Canopus-Pixar-Art.safetensors", "pixar"),
@@ -93,6 +205,8 @@ LORA_OPTIONS = {
     "Pencil Art (characteristic/creative)✏️": ("prithivMLmods/Canopus-Pencil-Art-LoRA", "Canopus-Pencil-Art-LoRA.safetensors", "Pencil Art"),
     "Art Minimalistic (paint/semireal)🎨": ("prithivMLmods/Canopus-Art-Medium-LoRA", "Canopus-Art-Medium-LoRA.safetensors", "mdm"),
 }
 style_list = [
     {
         "name": "3840 x 2160",
@@ -119,351 +233,104 @@ styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
 DEFAULT_STYLE_NAME = "3840 x 2160"
 STYLE_NAMES = list(styles.keys())
-# --------- Utility Functions ---------
-def text_to_speech(text: str, voice: str, output_file="output.mp3"):
-    """Convert text to speech using Edge TTS and save as MP3"""
-    async def run_tts():
-        communicate = edge_tts.Communicate(text, voice)
-        await communicate.save(output_file)
-        return output_file
-    return asyncio.run(run_tts())
-def clean_chat_history(chat_history):
-    """Remove non-string content from the chat history."""
-    return [msg for msg in chat_history if isinstance(msg, dict) and isinstance(msg.get("content"), str)]
-def save_image(img: Image.Image) -> str:
-    """Save a PIL image to a file with a unique filename."""
-    unique_name = str(uuid.uuid4()) + ".png"
-    img.save(unique_name)
-    return unique_name
-def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
-    return random.randint(0, MAX_SEED) if randomize_seed else seed
-def progress_bar_html(label: str) -> str:
-    """Return an HTML snippet for a progress bar."""
-    return f'''
-    <div style="display: flex; align-items: center;">
-        <span style="margin-right: 10px; font-size: 14px;">{label}</span>
-        <div style="width: 110px; height: 5px; background-color: #FFF0F5; border-radius: 2px; overflow: hidden;">
-            <div style="width: 100%; height: 100%; background-color: #FF69B4; animation: loading 1.5s linear infinite;"></div>
-        </div>
-    </div>
-    <style>
-    @keyframes loading {{
-        0% {{ transform: translateX(-100%); }}
-        100% {{ transform: translateX(100%); }}
-    }}
-    </style>
-    '''
-def downsample_video(video_path):
-    """Extract 10 evenly spaced frames from a video."""
-    vidcap = cv2.VideoCapture(video_path)
-    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-    fps = vidcap.get(cv2.CAP_PROP_FPS)
-    frames = []
-    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
-    for i in frame_indices:
-        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-        success, image = vidcap.read()
-        if success:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(image)
-            timestamp = round(i / fps, 2)
-            frames.append((pil_image, timestamp))
-    vidcap.release()
-    return frames
 def apply_style(style_name: str, positive: str, negative: str = ""):
-    """Apply a chosen quality style to the prompt."""
-    p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
-    return p.replace("{prompt}", positive), n + negative
-# --------- Tab 1: Chat Interface (Multimodal) ---------
-def chat_generate(input_dict: dict, chat_history: list,
-                  max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
-                  temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
-    text = input_dict["text"]
-    files = input_dict.get("files", [])
-    lower_text = text.strip().lower()
-    # If image generation command
-    if lower_text.startswith("@image"):
-        prompt = text[len("@image"):].strip()
-        yield progress_bar_html("Generating Image")
-        image_paths, used_seed = generate_image_fn(
-            prompt=prompt,
-            negative_prompt="",
-            use_negative_prompt=False,
-            seed=1,
-            width=1024,
-            height=1024,
-            guidance_scale=3,
-            num_inference_steps=25,
-            randomize_seed=True,
-            use_resolution_binning=True,
-            num_images=1,
-        )
-        yield gr.Image.update(value=image_paths[0])
-        return
-    # If video inference command
-    if lower_text.startswith("@video-infer"):
-        prompt = text[len("@video-infer"):].strip()
-        if files:
-            video_path = files[0]
-            frames = downsample_video(video_path)
-            messages = [
-                {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-                {"role": "user", "content": [{"type": "text", "text": prompt}]}
-            ]
-            for frame in frames:
-                image, timestamp = frame
-                image_path = f"video_frame_{uuid.uuid4().hex}.png"
-                image.save(image_path)
-                messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
-                messages[1]["content"].append({"type": "image", "url": image_path})
-        else:
-            messages = [
-                {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-                {"role": "user", "content": [{"type": "text", "text": prompt}]}
-            ]
-        inputs = processor.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt").to("cuda")
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            **inputs,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
-        }
-        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        yield progress_bar_html("Processing video with Qwen2VL")
-        for new_text in streamer:
-            buffer += new_text.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-        return
-    # Check for TTS command
-    tts_prefix = "@tts"
-    is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
-    voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
-    if is_tts and voice_index:
-        voice = TTS_VOICES[voice_index - 1]
-        text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
-        conversation = [{"role": "user", "content": text}]
-    else:
-        voice = None
-        text = text.replace(tts_prefix, "").strip()
-        conversation = clean_chat_history(chat_history)
-        conversation.append({"role": "user", "content": text})
-    if files:
-        # Handle multimodal chat with images
-        images = [load_image(f) for f in files]
-        messages = [{
-            "role": "user",
-            "content": [{"type": "image", "image": image} for image in images] + [{"type": "text", "text": text}]
-        }]
-        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(text=[prompt_full], images=images, return_tensors="pt", padding=True).to("cuda")
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        yield progress_bar_html("Thinking...")
-        for new_text in streamer:
-            buffer += new_text.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
     else:
-        input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
-        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-            gr.Warning(f"Trimmed input as it exceeded {MAX_INPUT_TOKEN_LENGTH} tokens.")
-        input_ids = input_ids.to(model.device)
-        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            "input_ids": input_ids,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "top_p": top_p,
-            "top_k": top_k,
-            "temperature": temperature,
-            "num_beams": 1,
-            "repetition_penalty": repetition_penalty,
-        }
-        t = Thread(target=model.generate, kwargs=generation_kwargs)
-        t.start()
-        outputs = []
-        yield progress_bar_html("Processing...")
-        for new_text in streamer:
-            outputs.append(new_text)
-            yield "".join(outputs)
-        final_response = "".join(outputs)
-        yield final_response
-        if is_tts and voice:
-            output_file = text_to_speech(final_response, voice)
-            yield gr.Audio.update(value=output_file)
-# Helper function for image generation (used in chat @image branch)
-@spaces.GPU(duration=60, enable_queue=True)
-def generate_image_fn(prompt: str, negative_prompt: str = "", use_negative_prompt: bool = False,
-                      seed: int = 1, width: int = 1024, height: int = 1024,
-                      guidance_scale: float = 3, num_inference_steps: int = 25,
-                      randomize_seed: bool = False, use_resolution_binning: bool = True,
-                      num_images: int = 1, progress=None):
     seed = int(randomize_seed_fn(seed, randomize_seed))
-    generator = torch.Generator(device=device).manual_seed(seed)
     options = {
-        "prompt": [prompt] * num_images,
-        "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
         "width": width,
         "height": height,
         "guidance_scale": guidance_scale,
-        "num_inference_steps": num_inference_steps,
-        "generator": generator,
         "output_type": "pil",
     }
-    if use_resolution_binning:
-        options["use_resolution_binning"] = True
-    images = []
-    for i in range(0, num_images, BATCH_SIZE):
-        batch_options = options.copy()
-        batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
-        if batch_options.get("negative_prompt") is not None:
-            batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
-        if device.type == "cuda":
-            with torch.autocast("cuda", dtype=torch.float16):
-                outputs = sd_pipe(**batch_options)
-        else:
-            outputs = sd_pipe(**batch_options)
-        images.extend(outputs.images)
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
-# --------- Tab 2: SDXL Image Generation ---------
-@spaces.GPU(duration=180, enable_queue=True)
-def sdxl_generate(prompt: str, negative_prompt: str = "", use_negative_prompt: bool = True,
-                  seed: int = 0, width: int = 1024, height: int = 1024, guidance_scale: float = 3,
-                  randomize_seed: bool = False, style_name: str = DEFAULT_STYLE_NAME,
-                  lora_model: str = "Realism (face/character)👦🏻", progress=None):
-    seed = int(randomize_seed_fn(seed, randomize_seed))
-    positive_prompt, effective_negative_prompt = apply_style(style_name, prompt, negative_prompt)
-    if not use_negative_prompt:
-        effective_negative_prompt = ""
-    model_name, weight_name, adapter_name = LORA_OPTIONS[lora_model]
-    # Set the adapter for the current generation
-    sd_pipe.load_lora_weights(model_name, weight_name=weight_name, adapter_name=adapter_name)
-    sd_pipe.set_adapters(adapter_name)
-    images = sd_pipe(
-        prompt=positive_prompt,
-        negative_prompt=effective_negative_prompt,
-        width=width,
-        height=height,
-        guidance_scale=guidance_scale,
-        num_inference_steps=20,
-        num_images_per_prompt=1,
-        cross_attention_kwargs={"scale": 0.65},
-        output_type="pil",
-    ).images
-    image_paths = [save_image(img) for img in images]
-    return image_paths, seed
-# --------- Tab 3: Qwen2VL OCR & Text Generation ---------
-def qwen2vl_ocr_textgen(prompt: str, image_file):
-    if image_file is None:
-        return "Please upload an image."
-    # Load the image
-    image = load_image(image_file)
-    messages = [
-        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-        {"role": "user", "content": [{"type": "text", "text": prompt}, {"type": "image", "image": image}]}
-    ]
-    inputs = processor.apply_chat_template(messages, tokenize=True, add_generation_prompt=True,
-                                             return_dict=True, return_tensors="pt").to("cuda")
-    outputs = model_m.generate(
-        **inputs,
-        max_new_tokens=1024,
-        do_sample=True,
-        temperature=0.6,
-        top_p=0.9,
-        top_k=50,
-        repetition_penalty=1.2,
-    )
-    response = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-    return response
-# --------- Building the Gradio Interface with Tabs ---------
-with gr.Blocks(title="Combined Demo") as demo:
-    gr.Markdown("# Combined Demo: Chat, SDXL Image Gen & Qwen2VL OCR/TextGen")
     with gr.Tabs():
-        # --- Tab 1: Chat Interface ---
         with gr.Tab("Chat Interface"):
-            chat_interface = gr.ChatInterface(
-                fn=chat_generate,
-                additional_inputs=[
-                    gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
-                    gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
-                    gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
-                    gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
-                    gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
-                ],
-                examples=[
-                    ["Write the Python Program for Array Rotation"],
-                    [{"text": "summarize the letter", "files": ["examples/1.png"]}],
-                    [{"text": "@video-infer Describe the Ad", "files": ["examples/coca.mp4"]}],
-                    ["@image Chocolate dripping from a donut"],
-                    ["@tts1 Who is Nikola Tesla, and why did he die?"],
-                ],
-                cache_examples=False,
-                type="messages",
-                description="Use commands like **@image**, **@video-infer**, **@tts1**, or plain text.",
-                textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple",
-                                               placeholder="Type your query (e.g., @tts1 for TTS, @image for image gen, etc.)"),
-                stop_btn="Stop Generation",
-                multimodal=True,
-            )
-        # --- Tab 2: SDXL Image Generation ---
-        with gr.Tab("SDXL Gen Image"):
             with gr.Row():
-                prompt_in = gr.Textbox(label="Prompt", placeholder="Enter prompt for image generation")
-                negative_prompt_in = gr.Textbox(label="Negative prompt", placeholder="Enter negative prompt", lines=2)
             with gr.Row():
-                seed_in = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
-                randomize_in = gr.Checkbox(label="Randomize seed", value=True)
             with gr.Row():
-                width_in = gr.Slider(label="Width", minimum=512, maximum=2048, step=8, value=1024)
-                height_in = gr.Slider(label="Height", minimum=512, maximum=2048, step=8, value=1024)
-            guidance_in = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=20.0, step=0.1, value=3.0)
-            style_in = gr.Radio(choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME, label="Quality Style")
-            lora_in = gr.Dropdown(choices=list(LORA_OPTIONS.keys()), value="Realism (face/character)👦🏻", label="LoRA Selection")
-            run_button_img = gr.Button("Generate Image")
-            output_gallery = gr.Gallery(label="Generated Image", columns=1, preview=True)
-            seed_output = gr.Number(label="Seed used")
-            run_button_img.click(fn=sdxl_generate,
-                                 inputs=[prompt_in, negative_prompt_in, randomize_in, seed_in, width_in, height_in, guidance_in, randomize_in, style_in, lora_in],
-                                 outputs=[output_gallery, seed_output])
-        # --- Tab 3: Qwen2VL OCR & Text Generation ---
-        with gr.Tab("Qwen2VL OCR/TextGen"):
             with gr.Row():
-                qwen_prompt = gr.Textbox(label="Prompt", placeholder="Enter prompt for OCR / text generation")
-                qwen_image = gr.Image(label="Upload Image", type="filepath")
-            run_button_qwen = gr.Button("Run Qwen2VL")
-            qwen_output = gr.Textbox(label="Output")
-            run_button_qwen.click(fn=qwen2vl_ocr_textgen, inputs=[qwen_prompt, qwen_image], outputs=qwen_output)
 if __name__ == "__main__":
-    demo.queue(max_size=30).launch(share=True)

 import os
 import random
 import uuid
 import time
 import asyncio
 from threading import Thread
 import torch
 import numpy as np
 from PIL import Image
 import cv2
 from transformers import (
 from transformers.image_utils import load_image
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
+# ---------------------------
+# Global Settings & Utilities
+# ---------------------------
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+def save_image(img: Image.Image) -> str:
+    """Write ``img`` to a PNG named after a fresh UUID4 and return that filename."""
+    # The name is relative, so the file lands in the current working directory.
+    filename = f"{uuid.uuid4()}.png"
+    img.save(filename)
+    return filename
+def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
+    """Return ``seed`` unchanged, or a random seed when ``randomize_seed`` is truthy."""
+    if not randomize_seed:
+        return seed
+    # Random seeds are drawn from [0, largest signed 32-bit integer], both ends inclusive.
+    return random.randint(0, np.iinfo(np.int32).max)
+def progress_bar_html(label: str) -> str:
+    """Build the HTML/CSS snippet for a thin animated loading bar preceded by ``label``."""
+    # ``label`` is interpolated verbatim (no HTML escaping); doubled braces are literal CSS braces.
+    snippet = f'''
+<div style="display: flex; align-items: center;">
+    <span style="margin-right: 10px; font-size: 14px;">{label}</span>
+    <div style="width: 110px; height: 5px; background-color: #FFF0F5; border-radius: 2px; overflow: hidden;">
+        <div style="width: 100%; height: 100%; background-color: #FF69B4; animation: loading 1.5s linear infinite;"></div>
+    </div>
+</div>
+<style>
+@keyframes loading {{
+    0% {{ transform: translateX(-100%); }}
+    100% {{ transform: translateX(100%); }}
+}}
+</style>
+    '''
+    return snippet
+# ---------------------------
+# 1. Chat Interface Tab
+# ---------------------------
+# Uses a text-only model: FastThink-0.5B-Tiny
+model_id_text = "prithivMLmods/FastThink-0.5B-Tiny"
+tokenizer = AutoTokenizer.from_pretrained(model_id_text)
 model = AutoModelForCausalLM.from_pretrained(
+    model_id_text,
     device_map="auto",
     torch_dtype=torch.bfloat16,
 )
 model.eval()
+def clean_chat_history(chat_history):
+    """Return the entries of ``chat_history`` that are dicts whose "content" value is a string."""
+    return [
+        entry
+        for entry in chat_history
+        if isinstance(entry, dict) and isinstance(entry.get("content"), str)
+    ]
+def chat_generate(input_text: str, chat_history: list, max_new_tokens: int, temperature: float, top_p: float, top_k: int, repetition_penalty: float):
+    """
+    Generate one assistant reply with the text-only model and return it with the new history.
+
+    The history is filtered through ``clean_chat_history``, the user message is
+    appended, and the result is rendered with the tokenizer's chat template.
+    Generation runs in a background thread, but this function consumes the whole
+    stream before returning, so the caller receives the complete reply at once.
+
+    Args:
+        input_text: The new user message.
+        chat_history: Prior messages; only dict entries with string "content" are kept.
+        max_new_tokens, temperature, top_p, top_k, repetition_penalty:
+            Sampling settings forwarded unchanged to ``model.generate``.
+
+    Returns:
+        A ``(final_response, updated_history)`` tuple, where ``updated_history`` is the
+        cleaned conversation plus the new user and assistant messages.
+    """
+    # Prepare conversation by cleaning history and appending the new user message.
+    conversation = clean_chat_history(chat_history)
+    conversation.append({"role": "user", "content": input_text})
+    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
+    # Over-long prompts keep only the most recent MAX_INPUT_TOKEN_LENGTH tokens; no warning is shown.
+    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+    input_ids = input_ids.to(model.device)
+    # NOTE(review): timeout=20.0 presumably bounds the wait for each streamed chunk - confirm in the transformers docs.
+    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {
+        "input_ids": input_ids,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "do_sample": True,
+        "top_p": top_p,
+        "top_k": top_k,
+        "temperature": temperature,
+        "num_beams": 1,
+        "repetition_penalty": repetition_penalty,
+    }
+    # generate() runs in a worker thread so the streamer can be iterated here.
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    outputs = []
+    # Collect the generated text from the streamer.
+    for new_text in streamer:
+        outputs.append(new_text)
+    final_response = "".join(outputs)
+    # Append assistant reply to chat history.
+    updated_history = conversation + [{"role": "assistant", "content": final_response}]
+    return final_response, updated_history
+# ---------------------------
+# 2. Qwen 2 VL OCR Tab
+# ---------------------------
+# Uses Qwen2VL OCR model for multimodal input (text + image)
 MODEL_ID_QWEN = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID_QWEN, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
+def generate_qwen_ocr(input_text: str, image):
+    """
+    Run the Qwen2VL OCR model on one image plus a text prompt and return the generated text.
+
+    Args:
+        input_text: The user prompt placed before the image in the user message.
+        image: The image to analyse; ``None`` short-circuits with a fixed message.
+            NOTE(review): the UI passes a PIL image (``gr.Image(type="pil")``) - confirm
+            no other caller passes a path.
+
+    Returns:
+        The full generated text with any "<|im_end|>" markers removed, or
+        "No image provided." when ``image`` is ``None``.
+    """
+    if image is None:
+        return "No image provided."
+    # Build message with system and user content.
+    messages = [
+        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+        {"role": "user", "content": [{"type": "text", "text": input_text}, {"type": "image", "image": image}]}
+    ]
+    # Apply chat template.
+    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # Inputs are moved to "cuda" unconditionally, independent of the module-level ``device``.
+    inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to("cuda")
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    # Sampling settings are fixed here rather than exposed to the caller.
+    generation_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": DEFAULT_MAX_NEW_TOKENS,
+        "do_sample": True,
+        "temperature": 0.6,
+        "top_p": 0.9,
+        "top_k": 50,
+        "repetition_penalty": 1.2,
+    }
+    # generate() runs in a worker thread; the loop below drains the streamer before returning.
+    thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
+    thread.start()
+    outputs = []
+    for new_text in streamer:
+        outputs.append(new_text.replace("<|im_end|>", ""))
+    final_response = "".join(outputs)
+    return final_response
+# ---------------------------
+# 3. Image Gen LoRA Tab
+# ---------------------------
+# Uses the SDXL pipeline with LoRA options.
+MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # set your SDXL model path via env variable
+MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
 BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
 if ENABLE_CPU_OFFLOAD:
     sd_pipe.enable_model_cpu_offload()
+# LoRA options dictionary.
 LORA_OPTIONS = {
     "Realism (face/character)👦🏻": ("prithivMLmods/Canopus-Realism-LoRA", "Canopus-Realism-LoRA.safetensors", "rlms"),
     "Pixar (art/toons)🙀": ("prithivMLmods/Canopus-Pixar-Art", "Canopus-Pixar-Art.safetensors", "pixar"),
     "Pencil Art (characteristic/creative)✏️": ("prithivMLmods/Canopus-Pencil-Art-LoRA", "Canopus-Pencil-Art-LoRA.safetensors", "Pencil Art"),
     "Art Minimalistic (paint/semireal)🎨": ("prithivMLmods/Canopus-Art-Medium-LoRA", "Canopus-Art-Medium-LoRA.safetensors", "mdm"),
 }
+# Style options.
 style_list = [
     {
         "name": "3840 x 2160",
 DEFAULT_STYLE_NAME = "3840 x 2160"
 STYLE_NAMES = list(styles.keys())
 def apply_style(style_name: str, positive: str, negative: str = ""):
+    """Apply a named quality style to a prompt pair.
+
+    Looks up ``style_name`` in ``styles`` and falls back to ``DEFAULT_STYLE_NAME``
+    when the name is unknown. Returns ``(positive_prompt, negative_prompt)``: the
+    style's prompt template with "{prompt}" replaced by ``positive``, and the
+    style's negative prompt with ``negative`` appended (a falsy ``negative`` adds nothing).
+    """
+    # A single .get() with a default covers known and unknown names alike.
+    p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
+    return p.replace("{prompt}", positive), n + (negative or "")
+def generate_image_lora(prompt: str, negative_prompt: str, use_negative_prompt: bool, seed: int, width: int, height: int, guidance_scale: float, randomize_seed: bool, style_name: str, lora_model: str):
+    """Generate one SDXL image with the selected LoRA adapter and quality style.
+
+    Args:
+        prompt: User prompt, wrapped by the chosen style template.
+        negative_prompt: Extra negative text appended to the style's negative prompt.
+        use_negative_prompt: When false, the negative prompt is replaced by "".
+        seed: Seed to use unless ``randomize_seed`` is set.
+        width, height, guidance_scale: Forwarded unchanged to the pipeline.
+        randomize_seed: When true, a random seed replaces ``seed``.
+        style_name: Key into ``styles`` (unknown names fall back to the default style).
+        lora_model: Key into ``LORA_OPTIONS``; an unknown key raises ``KeyError``.
+
+    Returns:
+        ``(image_paths, seed)``: the saved PNG filenames and the seed actually used.
+    """
     seed = int(randomize_seed_fn(seed, randomize_seed))
+    positive_prompt, effective_negative_prompt = apply_style(style_name, prompt, negative_prompt)
+    if not use_negative_prompt:
+        effective_negative_prompt = ""
+    # Set the desired LoRA adapter (third element of the LORA_OPTIONS entry).
+    # NOTE(review): assumes every adapter was already loaded into sd_pipe at startup - confirm.
+    adapter_name = LORA_OPTIONS[lora_model][2]
+    sd_pipe.set_adapters(adapter_name)
+    # Seed the sampler so the returned seed actually reproduces the image.
+    generator = torch.Generator(device=device).manual_seed(seed)
+    # Generate image(s)
     options = {
+        "prompt": [positive_prompt],
+        "negative_prompt": [effective_negative_prompt],
         "width": width,
         "height": height,
         "guidance_scale": guidance_scale,
+        "num_inference_steps": 20,
+        "num_images_per_prompt": 1,
+        "cross_attention_kwargs": {"scale": 0.65},
+        "generator": generator,
         "output_type": "pil",
     }
+    outputs = sd_pipe(**options)
+    images = outputs.images
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
+# ---------------------------
+# Build Gradio Interface with Three Tabs
+# ---------------------------
+# Three-tab Gradio UI: text chat, Qwen2VL OCR, and SDXL + LoRA image generation.
+with gr.Blocks(css=".gradio-container {max-width: 900px; margin: auto;}") as demo:
+    gr.Markdown("## Multi-Functional Demo: Chat Interface | Qwen 2 VL OCR | Image Gen LoRA")
     with gr.Tabs():
+        # Tab 1: Chat Interface
         with gr.Tab("Chat Interface"):
+            # History is a list of {"role", "content"} dicts, so the Chatbot must use the messages format.
+            chat_output = gr.Chatbot(label="Chat Conversation", type="messages")
             with gr.Row():
+                chat_inp = gr.Textbox(label="Enter your message", placeholder="Type your message here...", lines=2)
+                send_btn = gr.Button("Send")
             with gr.Row():
+                max_tokens_slider = gr.Slider(label="Max New Tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+                temperature_slider = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
+                top_p_slider = gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
+                top_k_slider = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+                rep_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+            # Per-session conversation history, kept alongside the visible Chatbot.
+            state = gr.State([])
+            def chat_step(user_message, history, max_tokens, temp, top_p, top_k, rep_penalty):
+                """Run one chat turn and return the new history for both the Chatbot and the state."""
+                _, updated_history = chat_generate(user_message, history, max_tokens, temp, top_p, top_k, rep_penalty)
+                return updated_history, updated_history
+            # The Send button and pressing Enter in the textbox trigger the same handler.
+            chat_inputs = [chat_inp, state, max_tokens_slider, temperature_slider, top_p_slider, top_k_slider, rep_penalty_slider]
+            chat_outputs = [chat_output, state]
+            send_btn.click(chat_step, inputs=chat_inputs, outputs=chat_outputs)
+            chat_inp.submit(chat_step, inputs=chat_inputs, outputs=chat_outputs)
+        # Tab 2: Qwen 2 VL OCR
+        with gr.Tab("Qwen 2 VL OCR"):
+            gr.Markdown("Upload an image and enter a prompt. The model will return OCR/extraction or descriptive text from the image.")
+            ocr_inp = gr.Textbox(label="Enter prompt", placeholder="Describe what you want to extract...", lines=2)
+            image_inp = gr.Image(label="Upload Image", type="pil")
+            ocr_output = gr.Textbox(label="Output", placeholder="Model output will appear here...", lines=5)
+            ocr_btn = gr.Button("Run Qwen 2 VL OCR")
+            ocr_btn.click(generate_qwen_ocr, inputs=[ocr_inp, image_inp], outputs=ocr_output)
+        # Tab 3: Image Gen LoRA
+        with gr.Tab("Image Gen LoRA"):
+            gr.Markdown("Generate images with SDXL using various LoRA models and quality styles.")
             with gr.Row():
+                prompt_img = gr.Textbox(label="Prompt", placeholder="Enter prompt for image generation...", lines=2)
+                negative_prompt_img = gr.Textbox(label="Negative Prompt", placeholder="(optional) negative prompt", lines=2)
+            use_neg_checkbox = gr.Checkbox(label="Use Negative Prompt", value=True)
             with gr.Row():
+                seed_slider = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, value=0)
+                randomize_seed_checkbox = gr.Checkbox(label="Randomize Seed", value=True)
+            with gr.Row():
+                width_slider = gr.Slider(label="Width", minimum=512, maximum=2048, step=8, value=1024)
+                height_slider = gr.Slider(label="Height", minimum=512, maximum=2048, step=8, value=1024)
+            guidance_slider = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=20.0, step=0.1, value=3.0)
+            style_radio = gr.Radio(label="Quality Style", choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME)
+            lora_dropdown = gr.Dropdown(label="LoRA Selection", choices=list(LORA_OPTIONS.keys()), value="Realism (face/character)👦🏻")
+            img_output = gr.Gallery(label="Generated Images", columns=1, preview=True)
+            seed_output = gr.Number(label="Used Seed")
+            run_img_btn = gr.Button("Generate Image")
+            # Input order must match the positional parameters of generate_image_lora.
+            run_img_btn.click(generate_image_lora,
+                              inputs=[prompt_img, negative_prompt_img, use_neg_checkbox, seed_slider, width_slider, height_slider, guidance_slider, randomize_seed_checkbox, style_radio, lora_dropdown],
+                              outputs=[img_output, seed_output])
+    gr.Markdown("### Adjustments")
+    gr.Markdown("Each tab has been implemented separately. Feel free to adjust parameters and layout as needed in each tab.")
 if __name__ == "__main__":
+    demo.queue(max_size=20).launch(share=True)