Spaces: Mansuba / Bangla_text_to_image_app

Sleeping

App Files Community

Mansuba committed on Jan 28, 2025

Commit

5ae1545

verified ·

1 Parent(s): 6024488

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -76

app.py CHANGED Viewed

@@ -10,7 +10,6 @@ import json
 import logging
 from dataclasses import dataclass
 import gc
-import os
 # Configure logging
 logging.basicConfig(
@@ -30,10 +29,6 @@ class ModelCache:
     def __init__(self, cache_dir: Path):
         self.cache_dir = cache_dir
         self.cache_dir.mkdir(parents=True, exist_ok=True)
-        # Set environment variables for better memory management
-        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
-        os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
     def load_model(self, model_id: str, load_func: callable, cache_name: str) -> Any:
         try:
@@ -52,32 +47,18 @@ class EnhancedBanglaSDGenerator:
     ):
         self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
         logger.info(f"Using device: {self.device}")
-        # Set memory split for VRAM usage on CPU
-        self.memory_split = 0.5  # Use 50% of available VRAM
-        self.setup_memory_management()
         self.cache = ModelCache(Path(cache_dir))
         self._initialize_models(banglaclip_weights_path)
         self._load_context_data()
-    def setup_memory_management(self):
-        """Setup optimal memory management for CPU and VRAM"""
-        if torch.cuda.is_available():
-            total_memory = torch.cuda.get_device_properties(0).total_memory
-            torch.cuda.set_per_process_memory_fraction(self.memory_split)
-        # Optimize CPU memory
-        torch.set_num_threads(min(8, os.cpu_count() or 4))
-        torch.set_num_interop_threads(min(8, os.cpu_count() or 4))
     def _initialize_models(self, banglaclip_weights_path: str):
         try:
             # Initialize translation models
             self.bn2en_model_name = "Helsinki-NLP/opus-mt-bn-en"
             self.translator = self.cache.load_model(
                 self.bn2en_model_name,
-                lambda x: MarianMTModel.from_pretrained(x, low_cpu_mem_usage=True),
                 "translator"
             ).to(self.device)
             self.trans_tokenizer = MarianTokenizer.from_pretrained(self.bn2en_model_name)
@@ -98,52 +79,34 @@ class EnhancedBanglaSDGenerator:
     def _initialize_stable_diffusion(self):
         """Initialize Stable Diffusion pipeline with optimized settings."""
-        try:
-            self.pipe = self.cache.load_model(
-                "runwayml/stable-diffusion-v1-5",
-                lambda model_id: StableDiffusionPipeline.from_pretrained(
-                    model_id,
-                    torch_dtype=torch.float32,
-                    safety_checker=None,
-                    use_safetensors=True,
-                    low_cpu_mem_usage=True,
-                ),
-                "stable_diffusion"
-            )
-            # Optimize scheduler for speed
-            self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(
-                self.pipe.scheduler.config,
-                use_karras_sigmas=True,
-                algorithm_type="dpmsolver++",
-                solver_order=2
-            )
-            # Memory optimizations
-            self.pipe.enable_attention_slicing(slice_size=1)
-            self.pipe.enable_vae_slicing()
             self.pipe.enable_sequential_cpu_offload()
-            # VRAM optimization
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                self.pipe.enable_model_cpu_offload()
-            self.pipe = self.pipe.to(self.device)
-        except Exception as e:
-            logger.error(f"Error initializing Stable Diffusion: {str(e)}")
-            raise
     def _load_banglaclip_model(self, weights_path: str) -> CLIPModel:
         try:
             if not Path(weights_path).exists():
                 raise FileNotFoundError(f"BanglaCLIP weights not found at {weights_path}")
-            clip_model = CLIPModel.from_pretrained(
-                self.clip_model_name,
-                low_cpu_mem_usage=True
-            )
             state_dict = torch.load(weights_path, map_location=self.device)
             cleaned_state_dict = {
@@ -178,12 +141,22 @@ class EnhancedBanglaSDGenerator:
         inputs = self.trans_tokenizer(bangla_text, return_tensors="pt", padding=True)
         inputs = {k: v.to(self.device) for k, v in inputs.items()}
-        with torch.no_grad(), torch.cpu.amp.autocast():
             outputs = self.translator.generate(**inputs)
         translated = self.trans_tokenizer.decode(outputs[0], skip_special_tokens=True)
         return translated
     def generate_image(
         self,
         bangla_text: str,
@@ -198,29 +171,18 @@ class EnhancedBanglaSDGenerator:
             if config.seed is not None:
                 torch.manual_seed(config.seed)
-            # Clear memory before generation
-            gc.collect()
-            torch.cuda.empty_cache() if torch.cuda.is_available() else None
             enhanced_prompt = self._enhance_prompt(bangla_text)
             negative_prompt = self._get_negative_prompt()
-            # Use mixed precision for faster generation
-            with torch.inference_mode(), torch.cpu.amp.autocast():
                 result = self.pipe(
                     prompt=enhanced_prompt,
                     negative_prompt=negative_prompt,
                     num_images_per_prompt=config.num_images,
                     num_inference_steps=config.num_inference_steps,
-                    guidance_scale=config.guidance_scale,
-                    use_memory_efficient_attention=True,
-                    use_memory_efficient_cross_attention=True
                 )
-            # Clear memory after generation
-            gc.collect()
-            torch.cuda.empty_cache() if torch.cuda.is_available() else None
             return result.images, enhanced_prompt
         except Exception as e:
@@ -231,10 +193,12 @@ class EnhancedBanglaSDGenerator:
         """Enhance prompt with context and style information."""
         translated_text = self._translate_text(bangla_text)
         contexts = []
         contexts.extend(context for loc, context in self.location_contexts.items() if loc in bangla_text)
         contexts.extend(context for scene, context in self.scene_contexts.items() if scene in bangla_text)
         photo_style = [
             "professional photography",
             "high resolution",
@@ -244,6 +208,7 @@ class EnhancedBanglaSDGenerator:
             "beautiful composition"
         ]
         all_parts = [translated_text] + contexts + photo_style
         return ", ".join(dict.fromkeys(all_parts))
@@ -352,9 +317,6 @@ def create_gradio_interface():
     return demo
 if __name__ == "__main__":
-    # Set environment variables for better performance
-    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
-    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
     demo = create_gradio_interface()
     demo.queue().launch(share=True)

 import logging
 from dataclasses import dataclass
 import gc
 # Configure logging
 logging.basicConfig(
     def __init__(self, cache_dir: Path):
         self.cache_dir = cache_dir
         self.cache_dir.mkdir(parents=True, exist_ok=True)
     def load_model(self, model_id: str, load_func: callable, cache_name: str) -> Any:
         try:
     ):
         self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
         logger.info(f"Using device: {self.device}")
         self.cache = ModelCache(Path(cache_dir))
         self._initialize_models(banglaclip_weights_path)
         self._load_context_data()
     def _initialize_models(self, banglaclip_weights_path: str):
         try:
             # Initialize translation models
             self.bn2en_model_name = "Helsinki-NLP/opus-mt-bn-en"
             self.translator = self.cache.load_model(
                 self.bn2en_model_name,
+                MarianMTModel.from_pretrained,
                 "translator"
             ).to(self.device)
             self.trans_tokenizer = MarianTokenizer.from_pretrained(self.bn2en_model_name)
     def _initialize_stable_diffusion(self):
         """Initialize Stable Diffusion pipeline with optimized settings."""
+        self.pipe = self.cache.load_model(
+            "runwayml/stable-diffusion-v1-5",
+            lambda model_id: StableDiffusionPipeline.from_pretrained(
+                model_id,
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                safety_checker=None
+            ),
+            "stable_diffusion"
+        )
+        self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(
+            self.pipe.scheduler.config,
+            use_karras_sigmas=True,
+            algorithm_type="dpmsolver++"
+        )
+        self.pipe = self.pipe.to(self.device)
+        # Memory optimization
+        self.pipe.enable_attention_slicing()
+        if torch.cuda.is_available():
             self.pipe.enable_sequential_cpu_offload()
     def _load_banglaclip_model(self, weights_path: str) -> CLIPModel:
         try:
             if not Path(weights_path).exists():
                 raise FileNotFoundError(f"BanglaCLIP weights not found at {weights_path}")
+            clip_model = CLIPModel.from_pretrained(self.clip_model_name)
             state_dict = torch.load(weights_path, map_location=self.device)
             cleaned_state_dict = {
         inputs = self.trans_tokenizer(bangla_text, return_tensors="pt", padding=True)
         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+        with torch.no_grad():
             outputs = self.translator.generate(**inputs)
         translated = self.trans_tokenizer.decode(outputs[0], skip_special_tokens=True)
         return translated
+    def _get_text_embedding(self, text: str):
+        """Get text embedding from BanglaCLIP model."""
+        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = self.banglaclip_model.get_text_features(**inputs)
+        return outputs
     def generate_image(
         self,
         bangla_text: str,
             if config.seed is not None:
                 torch.manual_seed(config.seed)
             enhanced_prompt = self._enhance_prompt(bangla_text)
             negative_prompt = self._get_negative_prompt()
+            with torch.autocast(self.device.type):
                 result = self.pipe(
                     prompt=enhanced_prompt,
                     negative_prompt=negative_prompt,
                     num_images_per_prompt=config.num_images,
                     num_inference_steps=config.num_inference_steps,
+                    guidance_scale=config.guidance_scale
                 )
             return result.images, enhanced_prompt
         except Exception as e:
         """Enhance prompt with context and style information."""
         translated_text = self._translate_text(bangla_text)
+        # Gather contexts
         contexts = []
         contexts.extend(context for loc, context in self.location_contexts.items() if loc in bangla_text)
         contexts.extend(context for scene, context in self.scene_contexts.items() if scene in bangla_text)
+        # Add photo style
         photo_style = [
             "professional photography",
             "high resolution",
             "beautiful composition"
         ]
+        # Combine all parts
         all_parts = [translated_text] + contexts + photo_style
         return ", ".join(dict.fromkeys(all_parts))
     return demo
 if __name__ == "__main__":
     demo = create_gradio_interface()
+    # Fixed queue configuration for newer Gradio versions
     demo.queue().launch(share=True)