Update app.py
app.py
CHANGED
@@ -54,7 +54,9 @@ MAX_SEED = np.iinfo(np.int32).max
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-#
+# ------------------------------
+# Text Generation Model
+# ------------------------------
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(

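For context on the block above: a minimal, self-contained sketch of driving a causal LM loaded this way (generic transformers usage; the prompt and generation settings are illustrative, not the app's):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "prithivMLmods/FastThink-0.5B-Tiny"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
).to("cuda" if torch.cuda.is_available() else "cpu")

# Illustrative one-shot generation; the app streams tokens instead.
inputs = tokenizer("Explain beam search in one sentence.", return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
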
@@ -94,14 +96,15 @@ def clean_chat_history(chat_history):
         cleaned.append(msg)
     return cleaned
 
-#
+# ------------------------------
+# Stable Diffusion XL (Image Generation)
+# ------------------------------
 MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
 BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))  # For batched image generation
 
-# Load the SDXL pipeline
 sd_pipe = StableDiffusionXLPipeline.from_pretrained(
     MODEL_ID_SD,
     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,

@@ -110,15 +113,12 @@ sd_pipe = StableDiffusionXLPipeline.from_pretrained(
 ).to(device)
 sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
 
-# Ensure that the text encoder is in half-precision if using CUDA.
 if torch.cuda.is_available():
     sd_pipe.text_encoder = sd_pipe.text_encoder.half()
 
-# Optional: compile the model for speedup if enabled
 if USE_TORCH_COMPILE:
     sd_pipe.compile()
 
-# Optional: offload parts of the model to CPU if needed
 if ENABLE_CPU_OFFLOAD:
     sd_pipe.enable_model_cpu_offload()
 

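A note on the .half() idiom used here (and again in Model3D below): calling .half() on a module converts its parameters to float16 in place and returns the module, so the reassignment is mostly cosmetic. A minimal sketch with a plain linear layer:

import torch

layer = torch.nn.Linear(4, 4)           # parameters start out as float32
layer = layer.half()                    # casts weights and bias to float16 in place
print(next(layer.parameters()).dtype)   # torch.float16
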
@@ -166,13 +166,11 @@ def generate_image_fn(
     options["use_resolution_binning"] = True
 
     images = []
-    # Process in batches
     for i in range(0, num_images, BATCH_SIZE):
         batch_options = options.copy()
         batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
         if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
             batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
-        # Wrap the pipeline call in autocast if using CUDA
         if device.type == "cuda":
             with torch.autocast("cuda", dtype=torch.float16):
                 outputs = sd_pipe(**batch_options)

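The loop above combines fixed-size batch slicing with autocast. A self-contained sketch of the same pattern; fake_pipe is a stand-in for the real sd_pipe call:

import torch

BATCH_SIZE = 2
prompts = ["a cat", "a dog", "a fox", "an owl", "a bee"]

def fake_pipe(prompt):                       # stand-in: returns one "image" per prompt
    return [f"<image of {p}>" for p in prompt]

images = []
for i in range(0, len(prompts), BATCH_SIZE):
    batch = prompts[i:i + BATCH_SIZE]        # slicing past the end just yields a shorter last batch
    if torch.cuda.is_available():
        with torch.autocast("cuda", dtype=torch.float16):  # run GPU kernels in fp16
            images.extend(fake_pipe(batch))
    else:
        images.extend(fake_pipe(batch))
print(images)
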
@@ -182,50 +180,76 @@ def generate_image_fn(
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
 
-#
+# ------------------------------
 # 3D Model Generation using ShapE (Text-to-3D / Image-to-3D)
-#
+# ------------------------------
 class Model3D:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
         self.pipe.to(self.device)
+        # Ensure the text encoder is in half precision
+        self.pipe.text_encoder = self.pipe.text_encoder.half()
 
         self.pipe_img = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
         self.pipe_img.to(self.device)
+        # Ensure the text encoder is in half precision
+        self.pipe_img.text_encoder = self.pipe_img.text_encoder.half()
 
     def to_glb(self, ply_path: str) -> str:
         mesh = trimesh.load(ply_path)
         rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
-        mesh
+        mesh.apply_transform(rot)
         rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
-        mesh
+        mesh.apply_transform(rot)
         mesh_path = tempfile.NamedTemporaryFile(suffix=".glb", delete=False)
         mesh.export(mesh_path.name, file_type="glb")
         return mesh_path.name
 
     def run_text(self, prompt: str, seed: int = 0, guidance_scale: float = 15.0, num_steps: int = 64) -> str:
         generator = torch.Generator(device=self.device).manual_seed(seed)
-
-
-
-
-
-
-
+        if self.device.type == "cuda":
+            with torch.autocast("cuda", dtype=torch.float16):
+                output = self.pipe(
+                    prompt,
+                    generator=generator,
+                    guidance_scale=guidance_scale,
+                    num_inference_steps=num_steps,
+                    output_type="mesh",
+                )
+        else:
+            output = self.pipe(
+                prompt,
+                generator=generator,
+                guidance_scale=guidance_scale,
+                num_inference_steps=num_steps,
+                output_type="mesh",
+            )
+        images = output.images
         ply_path = tempfile.NamedTemporaryFile(suffix=".ply", delete=False, mode="w+b")
         export_to_ply(images[0], ply_path.name)
         return self.to_glb(ply_path.name)
 
     def run_image(self, image: Image.Image, seed: int = 0, guidance_scale: float = 3.0, num_steps: int = 64) -> str:
         generator = torch.Generator(device=self.device).manual_seed(seed)
-
-
-
-
-
-
-
+        if self.device.type == "cuda":
+            with torch.autocast("cuda", dtype=torch.float16):
+                output = self.pipe_img(
+                    image,
+                    generator=generator,
+                    guidance_scale=guidance_scale,
+                    num_inference_steps=num_steps,
+                    output_type="mesh",
+                )
+        else:
+            output = self.pipe_img(
+                image,
+                generator=generator,
+                guidance_scale=guidance_scale,
+                num_inference_steps=num_steps,
+                output_type="mesh",
+            )
+        images = output.images
         ply_path = tempfile.NamedTemporaryFile(suffix=".ply", delete=False, mode="w+b")
         export_to_ply(images[0], ply_path.name)
         return self.to_glb(ply_path.name)

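The substantive fix in this hunk is in to_glb: the old bare `mesh` lines were expression statements that did nothing, so the computed rotations were never applied. mesh.apply_transform(rot) actually bakes each rotation into the vertices. A self-contained sketch of the corrected behavior, using a toy box mesh in place of the loaded PLY:

import numpy as np
import trimesh

mesh = trimesh.creation.box(extents=(1.0, 2.0, 3.0))   # toy stand-in for trimesh.load(ply_path)

rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
mesh.apply_transform(rot)   # mutates the vertices in place; a bare `mesh` expression changes nothing
rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
mesh.apply_transform(rot)

mesh.export("rotated.glb", file_type="glb")
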
@@ -259,18 +283,14 @@ def generate(
     # 3D Model Generation Command
     # ------------------------------
     if text.strip().lower().startswith("@3d"):
-        # Remove the "@3d" tag and use the remaining text as the prompt.
         text = text[len("@3d"):].strip()
         yield "Generating 3D model..."
         seed = random.randint(0, MAX_SEED)
         if files:
-            # If an image is provided, use image-to-3D.
             image = load_image(files[0])
             glb_file = model_3d.run_image(image, seed=seed)
         else:
-            # Otherwise, generate a 3D model from the text prompt.
             glb_file = model_3d.run_text(text, seed=seed)
-        # Yield the generated GLB file as a downloadable file.
         yield gr.File(glb_file)
         return
 

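generate() dispatches on literal command prefixes (@3d, @image, @tts). A minimal sketch of the same routing pattern; the function and return labels are illustrative, not the app's names:

def route(text: str) -> str:
    t = text.strip().lower()
    if t.startswith("@3d"):
        return "3d"            # text- or image-to-3D
    if t.startswith("@image"):
        return "image"         # SDXL image generation
    if t.startswith("@tts"):
        return "tts"           # text-to-speech
    return "chat"              # default: regular text generation

print(route("@image a red fox in the snow"))   # -> image
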
@@ -278,7 +298,6 @@ def generate(
     # Image Generation Command
     # ------------------------------
     if text.strip().lower().startswith("@image"):
-        # Remove the "@image" tag and use the rest as prompt.
         prompt = text[len("@image"):].strip()
         yield "Generating image..."
         image_paths, used_seed = generate_image_fn(

@@ -295,7 +314,7 @@ def generate(
             num_images=1,
         )
         yield gr.Image(image_paths[0])
-        return
+        return
 
     # ------------------------------
     # TTS / Regular Text Generation

@@ -307,11 +326,9 @@ def generate(
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
-        # Clear previous chat history for a fresh TTS request.
         conversation = [{"role": "user", "content": text}]
     else:
         voice = None
-        # Remove any stray @tts tags and build the conversation history.
         text = text.replace(tts_prefix, "").strip()
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})

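For the TTS branch above, the voice is picked by the digit after the @tts prefix and the tag is then stripped from the prompt. A small sketch of that parsing; tts_prefix and the voice names are assumptions based on the replace() calls, not confirmed values:

TTS_VOICES = ["en-US-JennyNeural", "en-US-GuyNeural"]   # hypothetical voice list
tts_prefix = "@tts"                                      # assumed prefix

def parse_tts(text: str):
    for i in (1, 2):                                     # one index per voice
        if text.strip().lower().startswith(f"{tts_prefix}{i}"):
            return TTS_VOICES[i - 1], text.replace(f"{tts_prefix}{i}", "").strip()
    return None, text.replace(tts_prefix, "").strip()    # no voice: plain chat text

print(parse_tts("@tts1 read this aloud"))   # -> ('en-US-JennyNeural', 'read this aloud')
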
@@ -373,7 +390,6 @@ def generate(
     final_response = "".join(outputs)
     yield final_response
 
-    # If TTS was requested, convert the final response to speech.
     if is_tts and voice:
         output_file = asyncio.run(text_to_speech(final_response, voice))
         yield gr.Audio(output_file, autoplay=True)

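asyncio.run here drives the async TTS helper to completion from synchronous generator code. A minimal sketch of the same bridge; the coroutine body is a placeholder for the real edge-tts call:

import asyncio

async def text_to_speech(text: str, voice: str) -> str:
    await asyncio.sleep(0)        # placeholder for the actual async synthesis call
    return "/tmp/speech.mp3"      # path of the rendered audio file

# Synchronous callers can block on the coroutine like this:
output_file = asyncio.run(text_to_speech("hello", "en-US-JennyNeural"))
print(output_file)
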
@@ -407,5 +423,4 @@ demo = gr.ChatInterface(
 )
 
 if __name__ == "__main__":
-    # To create a public link, set share=True in launch().
     demo.queue(max_size=20).launch(share=True)