core-OCR

Paused

App Files Files Community

prithivMLmods committed on Feb 10, 2025

Commit

6a44e02

verified ·

1 Parent(s): b0ba3ed

Update app.py

Browse files

Files changed (1) hide show

app.py +1 -40

app.py CHANGED Viewed

@@ -28,9 +28,6 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
-# -----------------------------------------------------------------------------
-# Global constants and helper functions
-# -----------------------------------------------------------------------------
 MAX_SEED = np.iinfo(np.int32).max
@@ -39,10 +36,6 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
         seed = random.randint(0, MAX_SEED)
     return seed
-# -----------------------------------------------------------------------------
-# Model class for Text-to-3D Generation (ShapE)
-# -----------------------------------------------------------------------------
 class Model:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -100,10 +93,6 @@ class Model:
         export_to_ply(images[0], ply_path.name)
         return self.to_glb(ply_path.name)
-# -----------------------------------------------------------------------------
-# Gradio UI configuration
-# -----------------------------------------------------------------------------
 DESCRIPTION = """
 # QwQ Edge 💬
 """
@@ -128,10 +117,6 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# -----------------------------------------------------------------------------
-# Load Models and Pipelines for Chat, Image, and Multimodal Processing
-# -----------------------------------------------------------------------------
 # Load the text-only model and tokenizer (for pure text chat)
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -157,20 +142,12 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
-# -----------------------------------------------------------------------------
-# Asynchronous text-to-speech
-# -----------------------------------------------------------------------------
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """Convert text to speech using Edge TTS and save as MP3.

     Args:
         text: The text to synthesize.
         voice: Voice identifier handed unchanged to ``edge_tts.Communicate``.
         output_file: Path the audio is written to; defaults to "output.mp3".

     Returns:
         The ``output_file`` path that was passed in, once the save completes.
     """
     communicate = edge_tts.Communicate(text, voice)
     # save() is awaited, so this coroutine must be driven by an event loop
     # (the chat handler in this file calls it through asyncio.run).
     await communicate.save(output_file)
     return output_file
-# -----------------------------------------------------------------------------
-# Utility function to clean conversation history
-# -----------------------------------------------------------------------------
 def clean_chat_history(chat_history):
     """
     Filter out any chat entries whose "content" is not a string.
@@ -182,10 +159,6 @@ def clean_chat_history(chat_history):
             cleaned.append(msg)
     return cleaned
-# -----------------------------------------------------------------------------
-# Stable Diffusion XL Pipeline for Image Generation
-# -----------------------------------------------------------------------------
 MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
@@ -263,9 +236,6 @@ def generate_image_fn(
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
-# -----------------------------------------------------------------------------
-# Text-to-3D Generation using the ShapE Pipeline
-# -----------------------------------------------------------------------------
 @spaces.GPU(duration=120, enable_queue=True)
 def generate_3d_fn(
@@ -284,10 +254,6 @@ def generate_3d_fn(
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
-# -----------------------------------------------------------------------------
-# Chat Generation Function with support for @tts, @image, and @3d commands
-# -----------------------------------------------------------------------------
 @spaces.GPU
 def generate(
     input_dict: dict,
@@ -420,10 +386,6 @@ def generate(
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
-# -----------------------------------------------------------------------------
-# Gradio Chat Interface Setup and Launch
-# -----------------------------------------------------------------------------
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
@@ -435,10 +397,9 @@ demo = gr.ChatInterface(
     ],
     examples=[
         ["@tts1 Who is Nikola Tesla, and why did he die?"],
-        [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
         [{"text": "summarize the letter", "files": ["examples/1.png"]}],
         ["@image Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic"],
-        ["@3d A futuristic city skyline in the style of cyberpunk"],
         ["Write a Python function to check if a number is prime."],
         ["@tts2 What causes rainbows to form?"],
     ],

 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
 MAX_SEED = np.iinfo(np.int32).max
         seed = random.randint(0, MAX_SEED)
     return seed
 class Model:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         export_to_ply(images[0], ply_path.name)
         return self.to_glb(ply_path.name)
 DESCRIPTION = """
 # QwQ Edge 💬
 """
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load the text-only model and tokenizer (for pure text chat)
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
     torch_dtype=torch.float16
 ).to("cuda").eval()
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """Convert text to speech using Edge TTS and save as MP3.

     Args:
         text: The text to synthesize.
         voice: Voice identifier handed unchanged to ``edge_tts.Communicate``.
         output_file: Path the audio is written to; defaults to "output.mp3".

     Returns:
         The ``output_file`` path that was passed in, once the save completes.
     """
     communicate = edge_tts.Communicate(text, voice)
     # save() is awaited, so this coroutine must be driven by an event loop
     # (the chat handler in this file calls it through asyncio.run).
     await communicate.save(output_file)
     return output_file
 def clean_chat_history(chat_history):
     """
     Filter out any chat entries whose "content" is not a string.
             cleaned.append(msg)
     return cleaned
 MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
 @spaces.GPU(duration=120, enable_queue=True)
 def generate_3d_fn(
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
 @spaces.GPU
 def generate(
     input_dict: dict,
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
     ],
     examples=[
         ["@tts1 Who is Nikola Tesla, and why did he die?"],
+        ["@3d A birthday cupcake with cherry"],
         [{"text": "summarize the letter", "files": ["examples/1.png"]}],
         ["@image Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic"],
         ["Write a Python function to check if a number is prime."],
         ["@tts2 What causes rainbows to form?"],
     ],