core-OCR

Paused

App Files Community

prithivMLmods committed on Feb 8

Commit

f74b154

verified ·

1 Parent(s): f8a9b16

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -19

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ import torch
 import edge_tts
 import asyncio
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from transformers.image_utils import load_image
 import time
@@ -35,6 +35,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
@@ -53,6 +54,7 @@ TTS_VOICES = [
     "en-US-JasonNeural",  # @tts6
 ]
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -77,29 +79,39 @@ def generate(
     top_k: int = 50,
     repetition_penalty: float = 1.2,
 ):
-    """Generates chatbot response and handles TTS requests with multimodal input support"""
     text = input_dict["text"]
     files = input_dict.get("files", [])
     # Check if input includes image(s)
-    images = [load_image(image) for image in files] if files else []
-    # Check if message is for TTS
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 7))
     voice_index = next((i for i in range(1, 7) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
     else:
         voice = None
         text = text.replace(tts_prefix, "").strip()
-    conversation = [*chat_history, {"role": "user", "content": text}]
     if images:
-        # Process multimodal input
         messages = [
             {"role": "user", "content": [
                 *[{"type": "image", "image": image} for image in images],
@@ -109,9 +121,9 @@ def generate(
         prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
@@ -124,7 +136,7 @@ def generate(
             yield buffer
     else:
-        # Process text-only input
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
@@ -147,21 +159,18 @@ def generate(
         t.start()
         outputs = []
-        for text in streamer:
-            outputs.append(text)
             yield "".join(outputs)
         final_response = "".join(outputs)
-        # Yield text response first
-        yield final_response
         if is_tts and voice:
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)
-            output_file = loop.run_until_complete(text_to_speech(final_response, voice))
-            # Separate yield for audio output
             yield gr.Audio(output_file, autoplay=True)
 demo = gr.ChatInterface(

 import edge_tts
 import asyncio
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
 from transformers.image_utils import load_image
 import time
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# Load the text-only model and tokenizer
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     "en-US-JasonNeural",  # @tts6
 ]
+# Load the multimodal (OCR) model and processor
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     top_k: int = 50,
     repetition_penalty: float = 1.2,
 ):
+    """
+    Generates chatbot response and handles TTS requests with multimodal input support.
+    If the query starts with a TTS command (e.g. '@tts1'), the chat history is cleared
+    to avoid non-text responses (like Audio) interfering with template rendering.
+    """
     text = input_dict["text"]
     files = input_dict.get("files", [])
     # Check if input includes image(s)
+    if len(files) > 1:
+        images = [load_image(image) for image in files]
+    elif len(files) == 1:
+        images = [load_image(files[0])]
+    else:
+        images = []
+    # Check if the message is for TTS
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 7))
     voice_index = next((i for i in range(1, 7) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
+        # Clear conversation history to avoid issues with non-text outputs.
+        conversation = [{"role": "user", "content": text}]
     else:
         voice = None
         text = text.replace(tts_prefix, "").strip()
+        conversation = [*chat_history, {"role": "user", "content": text}]
+    # If there are images, process multimodal input
     if images:
         messages = [
             {"role": "user", "content": [
                 *[{"type": "image", "image": image} for image in images],
         prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
+        # Handle generation for multimodal input using model_m
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
             yield buffer
     else:
+        # Process text-only input using model
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         t.start()
         outputs = []
+        for new_text in streamer:
+            outputs.append(new_text)
             yield "".join(outputs)
         final_response = "".join(outputs)
+        # Yield text response first.
+        yield final_response
+        # If TTS was requested, yield audio output separately.
         if is_tts and voice:
+            output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
 demo = gr.ChatInterface(