core-OCR

Paused

App Files Files Community

prithivMLmods committed on Feb 8

Commit

3a6718d

verified ·

1 Parent(s): 54f4624

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -6

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStream
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
 from transformers.image_utils import load_image
 import time
 DESCRIPTION = """
 # QwQ Edge 💬
@@ -58,12 +59,20 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """
     Synthesize `text` with the given Edge TTS `voice` and write the audio to `output_file`.

     The path passed in as `output_file` (default "output.mp3") is handed back
     unchanged once the save has completed.
     """
     synthesizer = edge_tts.Communicate(text, voice)
     await synthesizer.save(output_file)
     return output_file
 def clean_chat_history(chat_history):
     """
     Filter out any chat entries whose "content" is not a string.
@@ -86,8 +95,8 @@ def generate(
     repetition_penalty: float = 1.2,
 ):
     """
-    Generates chatbot responses with support for multimodal input and TTS.
-    If the query starts with an @tts command (e.g. "@tts1"), previous chat history is cleared.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -100,22 +109,36 @@ def generate(
     else:
         images = []
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
-    voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
-    if is_tts and voice_index:
-        voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
         # Clear any previous chat history to avoid concatenation issues
         conversation = [{"role": "user", "content": text}]
     else:
         voice = None
         text = text.replace(tts_prefix, "").strip()
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})
-    if images:
         # Multimodal branch using the OCR model
         messages = [{
             "role": "user",
@@ -183,6 +206,7 @@ demo = gr.ChatInterface(
     ],
     examples=[
         ["@tts1 Who is Nikola Tesla, and why did he die?"],
         [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
         [{"text": "summarize the letter", "files": ["examples/1.png"]}],
         ["A train travels 60 kilometers per hour. If it travels for 5 hours, how far will it travel in total?"],

 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
 from transformers.image_utils import load_image
 import time
+from gradio_client import Client  # For image generation API
 DESCRIPTION = """
 # QwQ Edge 💬
     torch_dtype=torch.float16
 ).to("cuda").eval()
+# Image generation client
+image_gen_client = Client("prithivMLmods/STABLE-HAMSTER")
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """
     Synthesize `text` with the given Edge TTS `voice` and write the audio to `output_file`.

     The path passed in as `output_file` (default "output.mp3") is handed back
     unchanged once the save has completed.
     """
     synthesizer = edge_tts.Communicate(text, voice)
     await synthesizer.save(output_file)
     return output_file
+def image_gen(prompt: str):
+    """Generate an image using the Stable Hamster API"""
+    result = image_gen_client.predict("Image Generation", None, prompt, api_name="/stable_hamster")
+    return result[1]  # Return the generated image
 def clean_chat_history(chat_history):
     """
     Filter out any chat entries whose "content" is not a string.
     repetition_penalty: float = 1.2,
 ):
     """
+    Generates chatbot responses with support for multimodal input, TTS, and image generation.
+    If the query starts with an @tts or @image command, previous chat history is cleared.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
     else:
         images = []
+    # Check for TTS or Image Generation commands
     tts_prefix = "@tts"
+    image_prefix = "@image"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
+    is_image = text.strip().lower().startswith(image_prefix)
+    if is_tts:
+        voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
+        voice = TTS_VOICES[voice_index - 1] if voice_index else None
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
         # Clear any previous chat history to avoid concatenation issues
         conversation = [{"role": "user", "content": text}]
+    elif is_image:
+        text = text.replace(image_prefix, "").strip()
+        conversation = [{"role": "user", "content": text}]
     else:
         voice = None
         text = text.replace(tts_prefix, "").strip()
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})
+    if is_image:
+        # Image generation branch
+        yield "Generating image, please wait..."
+        try:
+            image = image_gen(text)
+            yield gr.Image(image)
+        except Exception as e:
+            yield f"Failed to generate image: {str(e)}"
+    elif images:
         # Multimodal branch using the OCR model
         messages = [{
             "role": "user",
     ],
     examples=[
         ["@tts1 Who is Nikola Tesla, and why did he die?"],
+        ["@image A futuristic cityscape at sunset"],
         [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
         [{"text": "summarize the letter", "files": ["examples/1.png"]}],
         ["A train travels 60 kilometers per hour. If it travels for 5 hours, how far will it travel in total?"],