Update app.py
app.py
CHANGED
@@ -34,7 +34,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-#
+# Load text-only model and tokenizer
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
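Note: the hunk ends mid-call, so the remaining from_pretrained arguments are not shown. A minimal sketch of how this load plausibly continues; the dtype, device placement, and eval() call are assumptions, not part of the diff:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Text-only model named in the diff; kwargs below are assumed.
model_id = "prithivMLmods/FastThink-0.5B-Tiny"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # assumed; not visible in this hunk
).to(device).eval()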
@@ -53,7 +53,7 @@ TTS_VOICES = [
     "en-US-JasonNeural",  # @tts6
 ]
 
-#
+# Load multimodal (OCR) model and processor
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
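As above, this hunk cuts off inside the from_pretrained call. A sketch of the likely continuation; only trust_remote_code on the processor is confirmed by the diff, the model kwargs are assumptions:

import torch
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model_m = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,     # assumed, mirroring the processor call
    torch_dtype=torch.float16,  # assumed
).to("cuda").eval()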
@@ -70,12 +70,11 @@ async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
 
 def clean_chat_history(chat_history):
     """
-    Filter out any entries whose content is not a string.
-    This
+    Filter out any chat entries whose "content" is not a string.
+    This helps prevent errors when concatenating previous messages.
     """
     cleaned = []
     for msg in chat_history:
-        # Only keep dict messages that have a string 'content'
         if isinstance(msg, dict) and isinstance(msg.get("content"), str):
             cleaned.append(msg)
     return cleaned
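A quick illustration of what the updated docstring promises, using hypothetical history entries (the tuple stands in for a non-text output such as a previously yielded audio file):

history = [
    {"role": "user", "content": "hello"},
    {"role": "assistant", "content": ("output.mp3",)},  # non-string content: dropped
    {"role": "assistant", "content": "hi there"},
]
clean_chat_history(history)
# -> [{'role': 'user', 'content': 'hello'}, {'role': 'assistant', 'content': 'hi there'}]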
@@ -91,14 +90,13 @@ def generate(
     repetition_penalty: float = 1.2,
 ):
     """
-    Generates
-    If the
-    (clearing any non-text outputs). Otherwise, the chat history is cleaned to include only text.
+    Generates chatbot responses with support for multimodal input and TTS.
+    If the query starts with an @tts command (e.g. "@tts1"), previous chat history is cleared.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
 
-    #
+    # Process image files if provided
     if len(files) > 1:
         images = [load_image(image) for image in files]
     elif len(files) == 1:
@@ -106,25 +104,23 @@ def generate(
     else:
         images = []
 
-    # Check for TTS prefix
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 7))
     voice_index = next((i for i in range(1, 7) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
-
+
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
-        # Clear any previous chat history
+        # Clear any previous chat history to avoid concatenation issues
        conversation = [{"role": "user", "content": text}]
     else:
         voice = None
         text = text.replace(tts_prefix, "").strip()
-        # Clean the chat history to include only messages with string content
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})
 
-    # Multimodal branch if images are provided
     if images:
+        # Multimodal branch using the OCR model
         messages = [{
             "role": "user",
             "content": [
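The prefix handling above can be traced with a hypothetical query:

text = "@tts3 read this aloud"
tts_prefix = "@tts"
is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 7))
voice_index = next((i for i in range(1, 7) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
# is_tts == True, voice_index == 3, so voice = TTS_VOICES[2]
text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
# text == "read this aloud"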
@@ -134,9 +130,8 @@ def generate(
         }]
         prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
-
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs =
+        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
 
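The fix works because the processor returns a dict-like BatchFeature, so {**inputs, ...} forwards input_ids, attention_mask, and pixel_values to generate alongside the streaming arguments. The loop that reads from the streamer is outside this hunk; a sketch of the usual consumption pattern, assuming the standard TextIteratorStreamer idiom:

from threading import Thread
from transformers import TextIteratorStreamer

# Run generation in a background thread and read decoded text
# chunks from the streamer as they arrive.
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
Thread(target=model_m.generate, kwargs=generation_kwargs).start()

buffer = ""
for new_text in streamer:  # yields decoded chunks as the model generates
    buffer += new_text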
@@ -154,19 +149,18 @@ def generate(
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
-
     streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-
-
-        streamer
-        max_new_tokens
-        do_sample
-        top_p
-        top_k
-        temperature
-        num_beams
-        repetition_penalty
-
+    generation_kwargs = {
+        "input_ids": input_ids,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "do_sample": True,
+        "top_p": top_p,
+        "top_k": top_k,
+        "temperature": temperature,
+        "num_beams": 1,
+        "repetition_penalty": repetition_penalty,
+    }
     t = Thread(target=model.generate, kwargs=generation_kwargs)
     t.start()
 
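With the keyword fragments consolidated into one dict, Thread(target=model.generate, kwargs=generation_kwargs) expands to an ordinary model.generate(input_ids=..., streamer=..., ...) call. The loop that fills outputs sits between the hunks; its assumed shape, matching the yield lines in the next hunk:

# Inside generate(): accumulate streamed chunks and yield partial text.
outputs = []
for new_text in streamer:
    outputs.append(new_text)
    yield "".join(outputs)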
@@ -176,7 +170,6 @@ def generate(
         yield "".join(outputs)
 
     final_response = "".join(outputs)
-    # Yield text response first
     yield final_response
 
     if is_tts and voice:
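The TTS branch calls the text_to_speech helper named in an earlier hunk header. A minimal sketch of such a helper using edge-tts; the body is assumed, since only the signature appears in the diff:

import asyncio
import edge_tts

async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
    # Synthesize speech with Microsoft Edge TTS and save it to disk.
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)
    return output_file

# e.g., after the final text yield:
# output_file = asyncio.run(text_to_speech(final_response, voice))
# yield gr.Audio(output_file, autoplay=True)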