Update app.py
app.py CHANGED

@@ -25,7 +25,6 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 
 DESCRIPTION = """
 # QwQ Edge 💬
-**Note:** During image generation, a progress bar will appear both at the top of the interface and within the chat. For text generation, a loading animation will display until the response begins.
 """
 
 css = '''
@@ -40,34 +39,6 @@ h1 {
   background: #1565c0;
   border-radius: 100vh;
 }
-
-/* Custom styling for progress bars within chat */
-.progress-bar-container {
-  width: 100%;
-  margin-top: 5px;
-}
-
-.progress-bar {
-  width: 100%;
-  height: 4px;
-  background-color: #e0e0e0;
-  border-radius: 2px;
-}
-
-.progress-bar::-webkit-progress-bar {
-  background-color: #e0e0e0;
-  border-radius: 2px;
-}
-
-.progress-bar::-webkit-progress-value {
-  background-color: #90ee90; /* Light green */
-  border-radius: 2px;
-}
-
-.progress-bar::-moz-progress-bar {
-  background-color: #90ee90; /* Light green */
-  border-radius: 2px;
-}
 '''
 
 MAX_MAX_NEW_TOKENS = 2048
@@ -76,6 +47,23 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
+# Function to return an HTML snippet of a thin animated progress bar
+def progress_bar_html(message: str) -> str:
+    return f"""
+    <div style="display: flex; align-items: center;">
+        <span style="margin-right: 8px;">{message}</span>
+        <div style="position: relative; width: 110px; height: 5px; background-color: #f8d7da; border-radius: 2px; overflow: hidden;">
+            <div style="position: absolute; width: 100%; height: 100%; background-color: #f5c6cb; animation: loading 1.5s linear infinite;"></div>
+        </div>
+    </div>
+    <style>
+    @keyframes loading {{
+        0% {{ transform: translateX(-100%); }}
+        100% {{ transform: translateX(100%); }}
+    }}
+    </style>
+    """
+
 # Load text-only model and tokenizer
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -91,7 +79,7 @@ TTS_VOICES = [
     "en-US-GuyNeural",  # @tts2
 ]
 
-MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
+MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
@@ -106,20 +94,24 @@ async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     return output_file
 
 def clean_chat_history(chat_history):
-    """
+    """
+    Filter out any chat entries whose "content" is not a string.
+    This helps prevent errors when concatenating previous messages.
+    """
     cleaned = []
     for msg in chat_history:
         if isinstance(msg, dict) and isinstance(msg.get("content"), str):
            cleaned.append(msg)
     return cleaned
 
-# Stable Diffusion XL
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")
+# Environment variables and parameters for Stable Diffusion XL
+MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
+BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))  # For batched image generation
 
+# Load the SDXL pipeline
 sd_pipe = StableDiffusionXLPipeline.from_pretrained(
     MODEL_ID_SD,
     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
@@ -128,19 +120,22 @@ sd_pipe = StableDiffusionXLPipeline.from_pretrained(
 ).to(device)
 sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
 
+# Ensure that the text encoder is in half-precision if using CUDA.
 if torch.cuda.is_available():
     sd_pipe.text_encoder = sd_pipe.text_encoder.half()
 
+# Optional: compile the model for speedup if enabled
 if USE_TORCH_COMPILE:
     sd_pipe.compile()
 
+# Optional: offload parts of the model to CPU if needed
 if ENABLE_CPU_OFFLOAD:
     sd_pipe.enable_model_cpu_offload()
 
 MAX_SEED = np.iinfo(np.int32).max
 
 def save_image(img: Image.Image) -> str:
-    """Save a PIL image with a unique filename and return the path"""
+    """Save a PIL image with a unique filename and return the path."""
     unique_name = str(uuid.uuid4()) + ".png"
     img.save(unique_name)
     return unique_name
@@ -165,7 +160,7 @@ def generate_image_fn(
     num_images: int = 1,
     progress=gr.Progress(track_tqdm=True),
 ):
-    """Generate images using the SDXL pipeline"""
+    """Generate images using the SDXL pipeline."""
     seed = int(randomize_seed_fn(seed, randomize_seed))
     generator = torch.Generator(device=device).manual_seed(seed)
 
@@ -183,11 +178,13 @@ def generate_image_fn(
     options["use_resolution_binning"] = True
 
     images = []
+    # Process in batches
     for i in range(0, num_images, BATCH_SIZE):
         batch_options = options.copy()
         batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
         if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
            batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
+        # Wrap the pipeline call in autocast if using CUDA
         if device.type == "cuda":
             with torch.autocast("cuda", dtype=torch.float16):
                 outputs = sd_pipe(**batch_options)
@@ -216,14 +213,11 @@ def generate(
     text = input_dict["text"]
     files = input_dict.get("files", [])
 
+    # Handle image generation command
     if text.strip().lower().startswith("@image"):
         prompt = text[len("@image"):].strip()
-        #
-        yield gr.HTML(
-            '<div>Generating Image...</div>'
-            '<progress class="progress-bar" value="0" max="100" '
-            'style="width:100%; height:4px; background-color:#e0e0e0;"></progress>'
-        )
+        # Show animated progress bar for image generation
+        yield gr.HTML(progress_bar_html("Generating Image"))
         image_paths, used_seed = generate_image_fn(
             prompt=prompt,
             negative_prompt="",
@@ -237,9 +231,9 @@ def generate(
             use_resolution_binning=True,
             num_images=1,
         )
-        #
+        # Replace the progress bar with the generated image
         yield gr.Image(image_paths[0])
-        return
+        return  # Exit early
 
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
@@ -248,13 +242,16 @@ def generate(
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
+        # Clear previous chat history for a fresh TTS request.
         conversation = [{"role": "user", "content": text}]
     else:
         voice = None
+        # Remove any stray @tts tags and build the conversation history.
         text = text.replace(tts_prefix, "").strip()
        conversation = clean_chat_history(chat_history)
        conversation.append({"role": "user", "content": text})
 
+    # For multimodal chat with files (e.g. image + text)
     if files:
         if len(files) > 1:
             images = [load_image(image) for image in files]
@@ -276,18 +273,18 @@ def generate(
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
 
-        # Initial loading bar (indeterminate animation via CSS)
-        yield gr.HTML(
-            '<div>Generating response...</div>'
-            '<progress class="progress-bar" style="width:100%; height:4px; background-color:#e0e0e0;"></progress>'
-        )
         buffer = ""
+        # Show progress bar for thinking
+        yield gr.HTML(progress_bar_html("Thinking..."))
         for new_text in streamer:
             buffer += new_text
             buffer = buffer.replace("<|im_end|>", "")
             time.sleep(0.01)
-            #
-
+            # Update with current text plus progress bar
+            interim_html = f"<div>{buffer}</div><div>{progress_bar_html('Thinking...')}</div>"
+            yield gr.HTML(interim_html)
+        # Final output without the progress bar
+        yield gr.HTML(f"<div>{buffer}</div>")
     else:
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
@@ -306,21 +303,21 @@ def generate(
             "num_beams": 1,
             "repetition_penalty": repetition_penalty,
         }
-
-
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
 
-
-
-
-            '<progress class="progress-bar" style="width:100%; height:4px; background-color:#e0e0e0;"></progress>'
-        )
-        buffer = ""
+        outputs = []
+        # Show progress bar for thinking
+        yield gr.HTML(progress_bar_html("Thinking..."))
         for new_text in streamer:
-
-
-            yield
-
-
+            outputs.append(new_text)
+            interim_html = f"<div>{''.join(outputs)}</div><div>{progress_bar_html('Thinking...')}</div>"
+            yield gr.HTML(interim_html)
+        final_response = "".join(outputs)
+        # Final output without progress bar
+        yield gr.HTML(f"<div>{final_response}</div>")
+
+    # If TTS was requested, convert the final response to speech.
     if is_tts and voice:
         output_file = asyncio.run(text_to_speech(final_response, voice))
         yield gr.Audio(output_file, autoplay=True)
@@ -353,4 +350,4 @@ demo = gr.ChatInterface(
 )
 
 if __name__ == "__main__":
-    demo.queue(max_size=20).launch(share=True
+    demo.queue(max_size=20).launch(share=True