Update app.py
app.py CHANGED
@@ -23,9 +23,9 @@ from transformers import (
 from transformers.image_utils import load_image
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 
-
 DESCRIPTION = """
 # QwQ Edge 💬
+**Note:** During image generation, a progress bar will appear both at the top of the interface and within the chat. For text generation, a loading animation will display until the response begins.
 """
 
 css = '''
@@ -40,6 +40,34 @@ h1 {
   background: #1565c0;
   border-radius: 100vh;
 }
+
+/* Custom styling for progress bars within chat */
+.progress-bar-container {
+  width: 100%;
+  margin-top: 5px;
+}
+
+.progress-bar {
+  width: 100%;
+  height: 4px;
+  background-color: #e0e0e0;
+  border-radius: 2px;
+}
+
+.progress-bar::-webkit-progress-bar {
+  background-color: #e0e0e0;
+  border-radius: 2px;
+}
+
+.progress-bar::-webkit-progress-value {
+  background-color: #90ee90; /* Light green */
+  border-radius: 2px;
+}
+
+.progress-bar::-moz-progress-bar {
+  background-color: #90ee90; /* Light green */
+  border-radius: 2px;
+}
 '''
 
 MAX_MAX_NEW_TOKENS = 2048
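These rules style a native HTML <progress> element that generate() yields through gr.HTML in the later hunks. A minimal sketch of how the css string reaches the browser, assuming the app passes it to gr.ChatInterface (which accepts a css argument in current Gradio releases); the handler here is a stand-in, not the app's generate function:

import gradio as gr

css = '''
.progress-bar { width: 100%; height: 4px; background-color: #e0e0e0; border-radius: 2px; }
'''  # abbreviated copy of the rules added above

def demo_fn(message, history):
    # Indeterminate <progress> (no value attribute) is shown while "working"
    yield gr.HTML(
        '<div>Working...</div>'
        '<progress class="progress-bar" style="width:100%; height:4px;"></progress>'
    )
    yield f"Echo: {message}"

demo = gr.ChatInterface(fn=demo_fn, css=css, type="messages")

if __name__ == "__main__":
    demo.launch()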
@@ -63,7 +91,7 @@ TTS_VOICES = [
     "en-US-GuyNeural", # @tts2
 ]
 
-MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
+MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
@@ -78,24 +106,20 @@ async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     return output_file
 
 def clean_chat_history(chat_history):
-    """
-    Filter out any chat entries whose "content" is not a string.
-    This helps prevent errors when concatenating previous messages.
-    """
+    """Filter out non-string content to prevent concatenation errors"""
     cleaned = []
     for msg in chat_history:
         if isinstance(msg, dict) and isinstance(msg.get("content"), str):
             cleaned.append(msg)
     return cleaned
 
-#
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")
+# Stable Diffusion XL setup
+MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
+BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
 
-# Load the SDXL pipeline
 sd_pipe = StableDiffusionXLPipeline.from_pretrained(
     MODEL_ID_SD,
     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
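MODEL_VAL_PATH has no default, so the SDXL pipeline load below fails unless it is set before app.py is imported (for example as a Space variable or secret). A sketch of the expected environment; the checkpoint id is an example, not necessarily the one the Space uses:

import os

# Example values only; app.py reads these at module level, so set them before import.
os.environ.setdefault("MODEL_VAL_PATH", "stabilityai/stable-diffusion-xl-base-1.0")  # assumed SDXL checkpoint
os.environ.setdefault("MAX_IMAGE_SIZE", "4096")
os.environ.setdefault("USE_TORCH_COMPILE", "0")   # "1" enables sd_pipe.compile()
os.environ.setdefault("ENABLE_CPU_OFFLOAD", "0")  # "1" enables enable_model_cpu_offload()
os.environ.setdefault("BATCH_SIZE", "1")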
@@ -104,22 +128,19 @@ sd_pipe = StableDiffusionXLPipeline.from_pretrained(
 ).to(device)
 sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
 
-# Ensure that the text encoder is in half-precision if using CUDA.
 if torch.cuda.is_available():
     sd_pipe.text_encoder = sd_pipe.text_encoder.half()
 
-# Optional: compile the model for speedup if enabled
 if USE_TORCH_COMPILE:
     sd_pipe.compile()
 
-# Optional: offload parts of the model to CPU if needed
 if ENABLE_CPU_OFFLOAD:
     sd_pipe.enable_model_cpu_offload()
 
 MAX_SEED = np.iinfo(np.int32).max
 
 def save_image(img: Image.Image) -> str:
-    """Save a PIL image with a unique filename and return the path
+    """Save a PIL image with a unique filename and return the path"""
     unique_name = str(uuid.uuid4()) + ".png"
     img.save(unique_name)
     return unique_name
@@ -144,7 +165,7 @@ def generate_image_fn(
     num_images: int = 1,
     progress=gr.Progress(track_tqdm=True),
 ):
-    """Generate images using the SDXL pipeline
+    """Generate images using the SDXL pipeline"""
     seed = int(randomize_seed_fn(seed, randomize_seed))
     generator = torch.Generator(device=device).manual_seed(seed)
 
@@ -162,13 +183,11 @@
     options["use_resolution_binning"] = True
 
     images = []
-    # Process in batches
     for i in range(0, num_images, BATCH_SIZE):
         batch_options = options.copy()
         batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
         if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
             batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
-        # Wrap the pipeline call in autocast if using CUDA
         if device.type == "cuda":
             with torch.autocast("cuda", dtype=torch.float16):
                 outputs = sd_pipe(**batch_options)
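The loop splits the prompt list into chunks of BATCH_SIZE and calls the pipeline once per chunk; options["prompt"] is assumed to already hold num_images entries (that expansion happens earlier in generate_image_fn, outside this hunk). The slicing in isolation:

# Standalone illustration of the batching arithmetic, not app code.
num_images = 5
BATCH_SIZE = 2
prompts = ["a watercolor fox"] * num_images

for i in range(0, num_images, BATCH_SIZE):
    batch = prompts[i:i + BATCH_SIZE]
    print(f"call {i // BATCH_SIZE + 1}: {len(batch)} prompt(s)")
# -> call 1: 2 prompt(s), call 2: 2 prompt(s), call 3: 1 prompt(s)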
@@ -197,35 +216,14 @@ def generate(
     text = input_dict["text"]
     files = input_dict.get("files", [])
 
-    # Define an HTML template for the animated progress bar.
-    # The bar is a thin 5px line in light green with a simple opacity animation.
-    progress_bar_html = """
-    <div style="display: flex; align-items: center;">
-        <span>{message}</span>
-        <div style="flex-grow: 1; margin-left: 10px;">
-            <div class="progress-bar"></div>
-        </div>
-    </div>
-    <style>
-    .progress-bar {{
-        width: 100%;
-        height: 5px;
-        background: lightgreen;
-        animation: progressAnim 2s infinite;
-    }}
-    @keyframes progressAnim {{
-        0% {{ opacity: 0.5; }}
-        50% {{ opacity: 1; }}
-        100% {{ opacity: 0.5; }}
-    }}
-    </style>
-    """
-
     if text.strip().lower().startswith("@image"):
-        # Remove the "@image" tag and use the rest as prompt.
         prompt = text[len("@image"):].strip()
-        #
-        yield gr.HTML(
+        # Initial message with progress bar at 0%
+        yield gr.HTML(
+            '<div>Generating Image...</div>'
+            '<progress class="progress-bar" value="0" max="100" '
+            'style="width:100%; height:4px; background-color:#e0e0e0;"></progress>'
+        )
         image_paths, used_seed = generate_image_fn(
             prompt=prompt,
             negative_prompt="",
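The image branch yields a determinate <progress> pinned at 0, while the text branches later in this diff omit the value attribute so the browser shows its built-in indeterminate animation. The two variants side by side, with hypothetical constant names (app.py inlines these strings):

# Hypothetical names; generate() builds these strings inline.
IMAGE_PROGRESS_HTML = (
    '<div>Generating Image...</div>'
    '<progress class="progress-bar" value="0" max="100" '  # determinate, fixed at 0%
    'style="width:100%; height:4px; background-color:#e0e0e0;"></progress>'
)
TEXT_PROGRESS_HTML = (
    '<div>Generating response...</div>'
    '<progress class="progress-bar" '  # no value attribute -> indeterminate animation
    'style="width:100%; height:4px; background-color:#e0e0e0;"></progress>'
)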
@@ -239,9 +237,9 @@ def generate(
             use_resolution_binning=True,
             num_images=1,
         )
-        #
+        # Final message with the image, progress bar at 100%
        yield gr.Image(image_paths[0])
-        return
+        return
 
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
@@ -250,11 +248,9 @@ def generate(
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
-        # Clear previous chat history for a fresh TTS request.
         conversation = [{"role": "user", "content": text}]
     else:
         voice = None
-        # Remove any stray @tts tags and build the conversation history.
         text = text.replace(tts_prefix, "").strip()
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})
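For reference, a sketch of how the @tts1/@tts2 prefixes select a voice and get stripped from the prompt. The voice_index computation and the @tts1 voice name do not appear in this diff, so both are assumptions:

TTS_VOICES = [
    "en-US-JennyNeural",  # @tts1 (assumed; not shown in this diff)
    "en-US-GuyNeural",    # @tts2
]
tts_prefix = "@tts"

def pick_voice(text: str):
    # Returns (voice or None, text with the @tts tag stripped); illustrative only.
    voice_index = next(
        (i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None
    )
    if voice_index:
        return TTS_VOICES[voice_index - 1], text.replace(f"{tts_prefix}{voice_index}", "").strip()
    return None, text.replace(tts_prefix, "").strip()

print(pick_voice("@tts2 read this aloud"))  # ('en-US-GuyNeural', 'read this aloud')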
@@ -280,21 +276,18 @@ def generate(
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
 
-        #
-        yield gr.HTML(
+        # Initial loading bar (indeterminate animation via CSS)
+        yield gr.HTML(
+            '<div>Generating response...</div>'
+            '<progress class="progress-bar" style="width:100%; height:4px; background-color:#e0e0e0;"></progress>'
+        )
         buffer = ""
         for new_text in streamer:
             buffer += new_text
             buffer = buffer.replace("<|im_end|>", "")
             time.sleep(0.01)
-            #
-            combined_html = f"""
-            <div style="display: flex; flex-direction: column;">
-                {progress_bar_html.format(message="Thinking...")}
-                <div style="margin-top: 10px;">{buffer}</div>
-            </div>
-            """
-            yield gr.HTML(combined_html)
+            # Yield only the text, replacing the loading bar
+            yield buffer
     else:
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
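The multimodal path streams by running model_m.generate on a background thread and iterating a TextIteratorStreamer, yielding each partial buffer straight to the chat in place of the loading bar. A self-contained sketch of that pattern (the model id is an example, not the app's checkpoint):

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # example checkpoint for illustration
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

input_ids = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Say hello in one sentence."}],
    add_generation_prompt=True,
    return_tensors="pt",
)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
Thread(
    target=model.generate,
    kwargs={"input_ids": input_ids, "streamer": streamer, "max_new_tokens": 64},
).start()

buffer = ""
for new_text in streamer:  # blocks until the next decoded chunk arrives
    buffer += new_text
    print(buffer)          # in app.py this is `yield buffer`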
@@ -316,23 +309,18 @@ def generate(
         t = Thread(target=model.generate, kwargs=generation_kwargs)
         t.start()
 
-        #
-        yield gr.HTML(
-        outputs = []
+        # Initial loading bar
+        yield gr.HTML(
+            '<div>Generating response...</div>'
+            '<progress class="progress-bar" style="width:100%; height:4px; background-color:#e0e0e0;"></progress>'
+        )
+        buffer = ""
         for new_text in streamer:
-            outputs.append(new_text)
-            combined_html = f"""
-            <div style="display: flex; flex-direction: column;">
-                {progress_bar_html.format(message="Thinking...")}
-                <div style="margin-top: 10px;">{''.join(outputs)}</div>
-            </div>
-            """
-            yield gr.HTML(combined_html)
-        final_response = "".join(outputs)
-        # Final response: progress bar is removed and only the generated text is shown.
-        yield final_response
+            buffer += new_text
+            # Yield only the text, replacing the loading bar
+            yield buffer
 
-
+        final_response = buffer
     if is_tts and voice:
         output_file = asyncio.run(text_to_speech(final_response, voice))
         yield gr.Audio(output_file, autoplay=True)
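The TTS step awaits an async text_to_speech helper whose body lies outside this diff; its signature in the hunk header matches the standard edge-tts flow. A likely shape, assuming the helper wraps edge_tts.Communicate:

import asyncio
import edge_tts

async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
    # Assumed implementation: edge_tts.Communicate streams the synthesized
    # audio and .save() writes it to output_file.
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)
    return output_file

if __name__ == "__main__":
    path = asyncio.run(text_to_speech("Hello from QwQ Edge.", "en-US-GuyNeural"))
    print(path)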