Spaces:

prithivMLmods
/

VisionScope-R2

Running on Zero

App Files Files Community

prithivMLmods committed on Mar 23, 2025

Commit

fda00e3

verified ·

1 Parent(s): 7fcd908

Update app.py

Browse files

Files changed (1) hide show

app.py +167 -132

app.py CHANGED Viewed

@@ -26,7 +26,6 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 DESCRIPTION = """
 # Gen Vision 🎃
-Separate Tabs for Chat, Image Generation (LoRA), Qwen2 VL OCR and Text-to-Speech
 """
 css = '''
@@ -73,7 +72,7 @@ def progress_bar_html(label: str) -> str:
     '''
 # -----------------------
-# Text Generation Setup (Chat)
 # -----------------------
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -84,23 +83,28 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
-# -----------------------
-# TTS Setup
-# -----------------------
 TTS_VOICES = [
-    "en-US-JennyNeural",
-    "en-US-GuyNeural",
 ]
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """
     Synthesize `text` with Edge TTS using `voice` and save the audio to `output_file`.

     The coroutine completes once the save has been awaited and hands back the
     same `output_file` value it was given (``"output.mp3"`` unless overridden).
     """
     # Build the Edge TTS request and persist it in a single awaited step.
     await edge_tts.Communicate(text, voice).save(output_file)
     return output_file
-# -----------------------
-# Utility: Clean Chat History
-# -----------------------
 def clean_chat_history(chat_history):
     """
     Filter out any chat entries whose "content" is not a string.
@@ -112,19 +116,9 @@ def clean_chat_history(chat_history):
     return cleaned
 # -----------------------
-# Qwen2 VL OCR Setup
 # -----------------------
-OCR_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR2-2B-Instruct"
-processor = AutoProcessor.from_pretrained(OCR_MODEL_ID, trust_remote_code=True)
-model_m = Qwen2VLForConditionalGeneration.from_pretrained(
-    OCR_MODEL_ID,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to("cuda").eval()
-# -----------------------
-# Stable Diffusion Image Generation Setup (LoRA)
-# -----------------------
 MAX_SEED = np.iinfo(np.int32).max
 USE_TORCH_COMPILE = False
 ENABLE_CPU_OFFLOAD = False
@@ -177,7 +171,17 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
     return seed
 @spaces.GPU(duration=180, enable_queue=True)
-def generate_image(prompt: str, negative_prompt: str, seed: int, width: int, height: int, guidance_scale: float, randomize_seed: bool, lora_model: str):
     seed = int(randomize_seed_fn(seed, randomize_seed))
     effective_negative_prompt = negative_prompt  # Use provided negative prompt if any
     model_name, weight_name, adapter_name = LORA_OPTIONS[lora_model]
@@ -198,51 +202,78 @@ def generate_image(prompt: str, negative_prompt: str, seed: int, width: int, hei
     return image_paths, seed
 # -----------------------
-# Chat Generation Function (Text-only)
-# -----------------------
-def generate_chat(input_text: str, chat_history: list, max_new_tokens: int, temperature: float, top_p: float, top_k: int, repetition_penalty: float):
-    conversation = clean_chat_history(chat_history)
-    conversation.append({"role": "user", "content": input_text})
-    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
-    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-    input_ids = input_ids.to(model.device)
-    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        "input_ids": input_ids,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "do_sample": True,
-        "top_p": top_p,
-        "top_k": top_k,
-        "temperature": temperature,
-        "num_beams": 1,
-        "repetition_penalty": repetition_penalty,
-    }
-    t = Thread(target=model.generate, kwargs=generation_kwargs)
-    t.start()
-    outputs = []
-    for new_text in streamer:
-        outputs.append(new_text)
-    final_response = "".join(outputs)
-    chat_history.append({"role": "assistant", "content": final_response})
-    return chat_history
-# -----------------------
-# Qwen2 VL OCR Function (Multimodal)
 # -----------------------
-def generate_ocr(text: str, files, max_new_tokens: int):
     if files:
-        if isinstance(files, list) and len(files) > 1:
             images = [load_image(image) for image in files]
-        elif isinstance(files, list) and len(files) == 1:
             images = [load_image(files[0])]
         else:
-            images = [load_image(files)]
         messages = [{
             "role": "user",
-            "content": [*([{"type": "image", "image": image} for image in images]),
-                        {"type": "text", "text": text}]
         }]
         prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
@@ -250,84 +281,88 @@ def generate_ocr(text: str, files, max_new_tokens: int):
         generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
         for new_text in streamer:
             buffer += new_text
-        return buffer
     else:
-        return "No images provided."
-# -----------------------
-# Text-to-Speech Function
-# -----------------------
-def generate_tts(text: str, voice: str):
-    output_file = asyncio.run(text_to_speech(text, voice))
-    return output_file
 # -----------------------
-# Gradio Interface with Tabs
 # -----------------------
-with gr.Blocks(css=css, title="Gen Vision") as demo:
-    gr.Markdown(DESCRIPTION)
-    with gr.Tab("Chat Interface"):
-        with gr.Row():
-            chat_history = gr.Chatbot(label="Chat History")
-        with gr.Row():
-            chat_input = gr.Textbox(placeholder="Enter your message", label="Your Message")
-        with gr.Row():
-            max_new_tokens_slider = gr.Slider(label="Max New Tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
-            temperature_slider = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
-        with gr.Row():
-            top_p_slider = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
-            top_k_slider = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
-            repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-        send_btn = gr.Button("Send")
-        send_btn.click(
-            fn=generate_chat,
-            inputs=[chat_input, chat_history, max_new_tokens_slider, temperature_slider, top_p_slider, top_k_slider, repetition_penalty_slider],
-            outputs=chat_history,
-        )
-    with gr.Tab("Image Generation"):
-        image_prompt = gr.Textbox(label="Prompt", placeholder="Enter image prompt")
-        negative_prompt = gr.Textbox(label="Negative Prompt", placeholder="Enter negative prompt")
-        seed_input = gr.Number(label="Seed", value=0)
-        width_slider = gr.Slider(label="Width", minimum=256, maximum=2048, step=64, value=1024)
-        height_slider = gr.Slider(label="Height", minimum=256, maximum=2048, step=64, value=1024)
-        guidance_scale_slider = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=10.0, step=0.1, value=3.0)
-        randomize_checkbox = gr.Checkbox(label="Randomize Seed", value=True)
-        lora_dropdown = gr.Dropdown(label="LoRA Style", choices=list(LORA_OPTIONS.keys()), value="Realism")
-        generate_img_btn = gr.Button("Generate Image")
-        img_output = gr.Image(label="Generated Image")
-        seed_output = gr.Number(label="Used Seed")
-        generate_img_btn.click(
-            fn=generate_image,
-            inputs=[image_prompt, negative_prompt, seed_input, width_slider, height_slider, guidance_scale_slider, randomize_checkbox, lora_dropdown],
-            outputs=[img_output, seed_output],
-        )
-    with gr.Tab("Qwen 2 VL OCR"):
-        ocr_text = gr.Textbox(label="Text Prompt", placeholder="Enter prompt for OCR")
-        file_input = gr.File(label="Upload Images", file_count="multiple")
-        ocr_max_new_tokens = gr.Slider(label="Max New Tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
-        ocr_btn = gr.Button("Run OCR")
-        ocr_output = gr.Textbox(label="OCR Output")
-        ocr_btn.click(
-            fn=generate_ocr,
-            inputs=[ocr_text, file_input, ocr_max_new_tokens],
-            outputs=ocr_output,
-        )
-    with gr.Tab("Text-to-Speech"):
-        tts_text = gr.Textbox(label="Text", placeholder="Enter text for TTS")
-        voice_dropdown = gr.Dropdown(label="Voice", choices=TTS_VOICES, value=TTS_VOICES[0])
-        tts_btn = gr.Button("Generate Audio")
-        tts_audio = gr.Audio(label="Audio Output", type="filepath")
-        tts_btn.click(
-            fn=generate_tts,
-            inputs=[tts_text, voice_dropdown],
-            outputs=tts_audio,
-        )
-demo.queue(max_size=20).launch(share=True)

 DESCRIPTION = """
 # Gen Vision 🎃
 """
 css = '''
     '''
 # -----------------------
+# Text Generation Setup
 # -----------------------
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 )
 model.eval()
 TTS_VOICES = [
+    "en-US-JennyNeural",  # @tts1
+    "en-US-GuyNeural",    # @tts2
 ]
+# -----------------------
+# Multimodal OCR Setup
+# -----------------------
+MODEL_ID = "prithivMLmods/Qwen2-VL-OCR2-2B-Instruct"
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+model_m = Qwen2VLForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to("cuda").eval()
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """
     Synthesize `text` with Edge TTS using `voice` and save the audio to `output_file`.

     The coroutine completes once the save has been awaited and hands back the
     same `output_file` value it was given (``"output.mp3"`` unless overridden).
     """
     # Build the Edge TTS request and persist it in a single awaited step.
     await edge_tts.Communicate(text, voice).save(output_file)
     return output_file
 def clean_chat_history(chat_history):
     """
     Filter out any chat entries whose "content" is not a string.
     return cleaned
 # -----------------------
+# Stable Diffusion Image Generation Setup
 # -----------------------
 MAX_SEED = np.iinfo(np.int32).max
 USE_TORCH_COMPILE = False
 ENABLE_CPU_OFFLOAD = False
     return seed
 @spaces.GPU(duration=180, enable_queue=True)
+def generate_image(
+    prompt: str,
+    negative_prompt: str = "",
+    seed: int = 0,
+    width: int = 1024,
+    height: int = 1024,
+    guidance_scale: float = 3.0,
+    randomize_seed: bool = True,
+    lora_model: str = "Realism",
+    progress=gr.Progress(track_tqdm=True),
+):
     seed = int(randomize_seed_fn(seed, randomize_seed))
     effective_negative_prompt = negative_prompt  # Use provided negative prompt if any
     model_name, weight_name, adapter_name = LORA_OPTIONS[lora_model]
     return image_paths, seed
 # -----------------------
+# Main Chat/Generation Function
 # -----------------------
+@spaces.GPU
+def generate(
+    input_dict: dict,
+    chat_history: list[dict],
+    max_new_tokens: int = 1024,
+    temperature: float = 0.6,
+    top_p: float = 0.9,
+    top_k: int = 50,
+    repetition_penalty: float = 1.2,
+):
+    """
+    Generates chatbot responses with support for multimodal input, TTS, and image generation.
+    Special commands:
+      - "@tts1" or "@tts2": triggers text-to-speech.
+      - "@<lora_command>": triggers image generation using the new LoRA pipeline.
+         Available commands (case-insensitive): @realism, @pixar, @photoshoot, @clothing, @interior, @fashion,
+         @minimalistic, @modern, @animaliea, @wallpaper, @cars, @pencilart, @artminimalistic.
+    """
+    text = input_dict["text"]
+    files = input_dict.get("files", [])
+    # Check for image generation command based on LoRA tags.
+    lora_mapping = { key.lower(): key for key in LORA_OPTIONS }
+    for key_lower, key in lora_mapping.items():
+        command_tag = "@" + key_lower
+        if text.strip().lower().startswith(command_tag):
+            prompt_text = text.strip()[len(command_tag):].strip()
+            yield progress_bar_html(f"Processing Image Generation ({key} style)")
+            image_paths, used_seed = generate_image(
+                prompt=prompt_text,
+                negative_prompt="",
+                seed=1,
+                width=1024,
+                height=1024,
+                guidance_scale=3,
+                randomize_seed=True,
+                lora_model=key,
+            )
+            yield progress_bar_html("Finalizing Image Generation")
+            yield gr.Image(image_paths[0])
+            return
+    # Check for TTS command (@tts1 or @tts2)
+    tts_prefix = "@tts"
+    is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
+    voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
+    if is_tts and voice_index:
+        voice = TTS_VOICES[voice_index - 1]
+        text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
+        conversation = [{"role": "user", "content": text}]
+    else:
+        voice = None
+        text = text.replace(tts_prefix, "").strip()
+        conversation = clean_chat_history(chat_history)
+        conversation.append({"role": "user", "content": text})
     if files:
+        if len(files) > 1:
             images = [load_image(image) for image in files]
+        elif len(files) == 1:
             images = [load_image(files[0])]
         else:
+            images = []
         messages = [{
             "role": "user",
+            "content": [
+                *[{"type": "image", "image": image} for image in images],
+                {"type": "text", "text": text},
+            ]
         }]
         prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
         generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
+        yield progress_bar_html("Processing with Qwen2VL Ocr")
         for new_text in streamer:
             buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
     else:
+        input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
+        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+        input_ids = input_ids.to(model.device)
+        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            "input_ids": input_ids,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "top_p": top_p,
+            "top_k": top_k,
+            "temperature": temperature,
+            "num_beams": 1,
+            "repetition_penalty": repetition_penalty,
+        }
+        t = Thread(target=model.generate, kwargs=generation_kwargs)
+        t.start()
+        outputs = []
+        for new_text in streamer:
+            outputs.append(new_text)
+            yield "".join(outputs)
+        final_response = "".join(outputs)
+        yield final_response
+        if is_tts and voice:
+            output_file = asyncio.run(text_to_speech(final_response, voice))
+            yield gr.Audio(output_file, autoplay=True)
 # -----------------------
+# Gradio Chat Interface
 # -----------------------
+demo = gr.ChatInterface(
+    fn=generate,
+    additional_inputs=[
+        gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
+        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
+        gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
+        gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
+        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
+    ],
+    examples=[
+        ['@realism Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic'],
+        ["@pixar A young man with light brown wavy hair and light brown eyes sitting in an armchair and looking directly at the camera, pixar style, disney pixar, office background, ultra detailed, 1 man"],
+        ["@realism A futuristic cityscape with neon lights"],
+        ["@photoshoot A portrait of a person with dramatic lighting"],
+        [{"text": "summarize the letter", "files": ["examples/1.png"]}],
+        ["Python Program for Array Rotation"],
+        ["@tts1 Who is Nikola Tesla, and why did he die?"],
+        ["@clothing Fashionable streetwear in an urban environment"],
+        ["@interior A modern living room interior with minimalist design"],
+        ["@fashion A runway model in haute couture"],
+        ["@minimalistic A simple and elegant design of a serene landscape"],
+        ["@modern A contemporary art piece with abstract geometric shapes"],
+        ["@animaliea A cute animal portrait with vibrant colors"],
+        ["@wallpaper A scenic mountain range perfect for a desktop wallpaper"],
+        ["@cars A sleek sports car cruising on a city street"],
+        ["@pencilart A detailed pencil sketch of a historic building"],
+        ["@artminimalistic An artistic minimalist composition with subtle tones"],
+        ["@tts2 What causes rainbows to form?"],
+    ],
+    cache_examples=False,
+    type="messages",
+    description=DESCRIPTION,
+    css=css,
+    fill_height=True,
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="default [text, vision] , scroll down examples to explore more art styles"),
+    stop_btn="Stop Generation",
+    multimodal=True,
+)
+if __name__ == "__main__":
+    demo.queue(max_size=20).launch(share=True)