Spaces:

prithivMLmods
/

VisionScope-R2

Running on Zero

App Files Files Community

prithivMLmods committed on Mar 6, 2025

Commit

7fc6af3

verified ·

1 Parent(s): 9e2bd4e

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -39

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import uuid
 import json
 import time
 import asyncio
 from threading import Thread
 import gradio as gr
@@ -47,7 +48,32 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# Load text-only model and tokenizer for text generation
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
@@ -62,6 +88,9 @@ TTS_VOICES = [
     "en-US-GuyNeural",    # @tts2
 ]
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -79,7 +108,6 @@ async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
 def clean_chat_history(chat_history):
     """
     Filter out any chat entries whose "content" is not a string.
-    This helps prevent errors when concatenating previous messages.
     """
     cleaned = []
     for msg in chat_history:
@@ -87,9 +115,9 @@ def clean_chat_history(chat_history):
             cleaned.append(msg)
     return cleaned
-# ------------------------------
-# New Image Generation Pipeline
-# ------------------------------
 MAX_SEED = np.iinfo(np.int32).max
 USE_TORCH_COMPILE = False
@@ -124,6 +152,12 @@ if torch.cuda.is_available():
     for model_name, weight_name, adapter_name in LORA_OPTIONS.values():
         pipe.load_lora_weights(model_name, weight_name=weight_name, adapter_name=adapter_name)
     pipe.to("cuda")
 def save_image(img: Image.Image) -> str:
     """Save a PIL image with a unique filename and return the path."""
@@ -167,10 +201,9 @@ def generate_image(
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
-# ------------------------------
-# QwQ Edge Chat Interface
-# ------------------------------
 @spaces.GPU
 def generate(
     input_dict: dict,
@@ -193,13 +226,12 @@ def generate(
     files = input_dict.get("files", [])
     # Check for image generation command based on LoRA tags.
-    # Build a mapping with lowercase keys.
     lora_mapping = { key.lower(): key for key in LORA_OPTIONS }
     for key_lower, key in lora_mapping.items():
         command_tag = "@" + key_lower
         if text.strip().lower().startswith(command_tag):
             prompt_text = text.strip()[len(command_tag):].strip()
-            yield f" > Processing Image Generation {key} style ███████▒▒▒ 69%"
             image_paths, used_seed = generate_image(
                 prompt=prompt_text,
                 negative_prompt="",
@@ -210,7 +242,7 @@ def generate(
                 randomize_seed=True,
                 lora_model=key,
             )
-            yield " > Processing Image Generation ████████▒▒ 90%"
             yield gr.Image(image_paths[0])
             return
@@ -222,15 +254,13 @@ def generate(
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
-        # Clear previous chat history for a fresh TTS request.
         conversation = [{"role": "user", "content": text}]
     else:
         voice = None
-        # Remove any stray @tts tags and build the conversation history.
         text = text.replace(tts_prefix, "").strip()
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})
     if files:
         if len(files) > 1:
             images = [load_image(image) for image in files]
@@ -253,7 +283,7 @@ def generate(
         thread.start()
         buffer = ""
-        yield " > Processing with Qwen2VL Ocr ███████▒▒▒ 69%"
         for new_text in streamer:
             buffer += new_text
             buffer = buffer.replace("<|im_end|>", "")
@@ -288,12 +318,13 @@ def generate(
         final_response = "".join(outputs)
         yield final_response
-        # If TTS was requested, convert the final response to speech.
         if is_tts and voice:
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
@@ -303,26 +334,25 @@ demo = gr.ChatInterface(
         gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
-    examples = [
-                ["@realism Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic"],
-                ["@pixar A young man with light brown wavy hair and light brown eyes sitting in an armchair and looking directly at the camera, pixar style, disney pixar, office background, ultra detailed, 1 man"],
-                ["@realism A futuristic cityscape with neon lights"],
-                ["@photoshoot A portrait of a person with dramatic lighting"],
-                [{"text": "summarize the letter", "files": ["examples/1.png"]}],
-                ["Python Program for Array Rotation"],
-                ["@tts1 Who is Nikola Tesla, and why did he die?"],
-                ["@clothing Fashionable streetwear in an urban environment"],
-                ["@interior A modern living room interior with minimalist design"],
-                ["@fashion A runway model in haute couture"],
-                ["@minimalistic A simple and elegant design of a serene landscape"],
-                ["@modern A contemporary art piece with abstract geometric shapes"],
-                ["@animaliea A cute animal portrait with vibrant colors"],
-                ["@wallpaper A scenic mountain range perfect for a desktop wallpaper"],
-                ["@cars A sleek sports car cruising on a city street"],
-                ["@pencilart A detailed pencil sketch of a historic building"],
-                ["@artminimalistic An artistic minimalist composition with subtle tones"],
-                ["@tts2 What causes rainbows to form?"],
     ],
     cache_examples=False,
     type="messages",
@@ -335,5 +365,4 @@ demo = gr.ChatInterface(
 )
 if __name__ == "__main__":
-    # To create a public link, set share=True in launch().
     demo.queue(max_size=20).launch(share=True)

 import json
 import time
 import asyncio
+import re
 from threading import Thread
 import gradio as gr
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# -----------------------
+# Progress Bar Helper
+# -----------------------
+def progress_bar_html(label: str) -> str:
+    """
+    Returns an HTML snippet for a thin progress bar with a label.
+    The progress bar is styled as a magenta (#FF00FF) animated bar.
+    """
+    return f'''
+<div style="display: flex; align-items: center;">
+    <span style="margin-right: 10px; font-size: 14px;">{label}</span>
+    <div style="width: 110px; height: 5px; background-color: #f0f0f0; border-radius: 2px; overflow: hidden;">
+        <div style="width: 100%; height: 100%; background-color: #FF00FF; animation: loading 1.5s linear infinite;"></div>
+    </div>
+</div>
+<style>
+@keyframes loading {{
+    0% {{ transform: translateX(-100%); }}
+    100% {{ transform: translateX(100%); }}
+}}
+</style>
+    '''
+# -----------------------
+# Text Generation Setup
+# -----------------------
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     "en-US-GuyNeural",    # @tts2
 ]
+# -----------------------
+# Multimodal OCR Setup
+# -----------------------
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
 def clean_chat_history(chat_history):
     """
     Filter out any chat entries whose "content" is not a string.
     """
     cleaned = []
     for msg in chat_history:
             cleaned.append(msg)
     return cleaned
+# -----------------------
+# Stable Diffusion Image Generation Setup
+# -----------------------
 MAX_SEED = np.iinfo(np.int32).max
 USE_TORCH_COMPILE = False
     for model_name, weight_name, adapter_name in LORA_OPTIONS.values():
         pipe.load_lora_weights(model_name, weight_name=weight_name, adapter_name=adapter_name)
     pipe.to("cuda")
+else:
+    pipe = StableDiffusionXLPipeline.from_pretrained(
+        "SG161222/RealVisXL_V4.0_Lightning",
+        torch_dtype=torch.float32,
+        use_safetensors=True,
+    ).to(device)
 def save_image(img: Image.Image) -> str:
     """Save a PIL image with a unique filename and return the path."""
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
+# -----------------------
+# Main Chat/Generation Function
+# -----------------------
 @spaces.GPU
 def generate(
     input_dict: dict,
     files = input_dict.get("files", [])
     # Check for image generation command based on LoRA tags.
     lora_mapping = { key.lower(): key for key in LORA_OPTIONS }
     for key_lower, key in lora_mapping.items():
         command_tag = "@" + key_lower
         if text.strip().lower().startswith(command_tag):
             prompt_text = text.strip()[len(command_tag):].strip()
+            yield progress_bar_html(f"Processing Image Generation ({key} style)")
             image_paths, used_seed = generate_image(
                 prompt=prompt_text,
                 negative_prompt="",
                 randomize_seed=True,
                 lora_model=key,
             )
+            yield progress_bar_html("Finalizing Image Generation")
             yield gr.Image(image_paths[0])
             return
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
         conversation = [{"role": "user", "content": text}]
     else:
         voice = None
         text = text.replace(tts_prefix, "").strip()
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})
     if files:
         if len(files) > 1:
             images = [load_image(image) for image in files]
         thread.start()
         buffer = ""
+        yield progress_bar_html("Processing with Qwen2VL Ocr")
         for new_text in streamer:
             buffer += new_text
             buffer = buffer.replace("<|im_end|>", "")
         final_response = "".join(outputs)
         yield final_response
         if is_tts and voice:
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
+# -----------------------
+# Gradio Chat Interface
+# -----------------------
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
         gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
+    examples=[
+        ['@realism Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic'],
+        ["@pixar A young man with light brown wavy hair and light brown eyes sitting in an armchair and looking directly at the camera, pixar style, disney pixar, office background, ultra detailed, 1 man"],
+        ["@realism A futuristic cityscape with neon lights"],
+        ["@photoshoot A portrait of a person with dramatic lighting"],
+        [{"text": "summarize the letter", "files": ["examples/1.png"]}],
+        ["Python Program for Array Rotation"],
+        ["@tts1 Who is Nikola Tesla, and why did he die?"],
+        ["@clothing Fashionable streetwear in an urban environment"],
+        ["@interior A modern living room interior with minimalist design"],
+        ["@fashion A runway model in haute couture"],
+        ["@minimalistic A simple and elegant design of a serene landscape"],
+        ["@modern A contemporary art piece with abstract geometric shapes"],
+        ["@animaliea A cute animal portrait with vibrant colors"],
+        ["@wallpaper A scenic mountain range perfect for a desktop wallpaper"],
+        ["@cars A sleek sports car cruising on a city street"],
+        ["@pencilart A detailed pencil sketch of a historic building"],
+        ["@artminimalistic An artistic minimalist composition with subtle tones"],
+        ["@tts2 What causes rainbows to form?"],
     ],
     cache_examples=False,
     type="messages",
 )
 if __name__ == "__main__":
     demo.queue(max_size=20).launch(share=True)