Spaces:

NotOpenAI
/

Axion

Sleeping

App Files Files Community

Dhenenjay committed on Nov 26, 2025

Commit

aef5404

verified ·

1 Parent(s): 9680613

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +273 -267

app.py CHANGED Viewed

@@ -1,280 +1,50 @@
 """
 E3Diff: SAR-to-Optical Translation - HuggingFace Space
-Exact copy of working local implementation
 """
 import os
-import torch
-import torch.nn as nn
 import numpy as np
 from PIL import Image, ImageEnhance
 import gradio as gr
 import tempfile
 import time
-from huggingface_hub import hf_hub_download
-# Import model components (exact same as local)
-from unet import UNet
-from diffusion import GaussianDiffusion
 # ZeroGPU support
 try:
     import spaces
     GPU_AVAILABLE = True
 except ImportError:
     GPU_AVAILABLE = False
     spaces = None
-class E3DiffInference:
-    """
-    E3Diff Inference Pipeline - EXACT copy from local inference.py
-    """
-    def __init__(self, weights_path=None, device="cuda", image_size=256, num_inference_steps=1):
-        self.device = torch.device(device if torch.cuda.is_available() else "cpu")
-        self.image_size = image_size
-        self.num_inference_steps = num_inference_steps
-        print(f"[E3Diff] Initializing on device: {self.device}")
-        print(f"[E3Diff] Image size: {image_size}x{image_size}")
-        print(f"[E3Diff] Inference steps: {num_inference_steps}")
-        # Build model
-        self.model = self._build_model()
-        # Load weights
-        self._load_weights(weights_path)
-        # Set to eval mode
-        self.model.eval()
-        print("[E3Diff] Model ready for inference!")
-    def _build_model(self):
-        """Build the E3Diff model architecture - exact same config."""
-        # UNet configuration from SEN12_256_s2_test.json
-        unet = UNet(
-            in_channel=3,           # Noisy image channels
-            out_channel=3,          # Output optical image
-            norm_groups=16,
-            inner_channel=64,
-            channel_mults=[1, 2, 4, 8, 16],  # Encoder/decoder channels
-            attn_res=[],            # No attention at specific resolutions
-            res_blocks=1,
-            dropout=0,
-            image_size=self.image_size,
-            condition_ch=3          # SAR condition channels
-        )
-        # Diffusion wrapper
-        schedule_opt = {
-            'schedule': 'linear',
-            'n_timestep': self.num_inference_steps,
-            'linear_start': 1e-6,
-            'linear_end': 1e-2,
-            'ddim': 1,
-            'lq_noiselevel': 0
-        }
-        opt = {
-            'stage': 2,
-            'ddim_steps': self.num_inference_steps,
-            'model': {
-                'beta_schedule': {
-                    'train': {'n_timestep': 1000},
-                    'val': schedule_opt
-                }
-            }
-        }
-        model = GaussianDiffusion(
-            denoise_fn=unet,
-            image_size=self.image_size,
-            channels=3,
-            loss_type='l1',
-            conditional=True,
-            schedule_opt=schedule_opt,
-            xT_noise_r=0,
-            seed=1,
-            opt=opt
-        )
-        return model.to(self.device)
-    def _load_weights(self, weights_path):
-        """Load pre-trained weights."""
-        if weights_path is None:
-            weights_path = hf_hub_download(
-                repo_id="Dhenenjay/E3Diff-SAR2Optical",
-                filename="I700000_E719_gen.pth"
-            )
-        print(f"[E3Diff] Loading weights from: {weights_path}")
-        state_dict = torch.load(weights_path, map_location=self.device, weights_only=False)
-        self.model.load_state_dict(state_dict, strict=False)
-        print(f"[E3Diff] Weights loaded successfully!")
-    def preprocess(self, image):
-        """Preprocess input SAR image."""
-        # Convert to RGB if grayscale
-        if image.mode != 'RGB':
-            image = image.convert('RGB')
-        # Resize to model input size
-        if image.size != (self.image_size, self.image_size):
-            image = image.resize((self.image_size, self.image_size), Image.LANCZOS)
-        # Convert to tensor and normalize to [-1, 1]
-        img_np = np.array(image).astype(np.float32) / 255.0
-        img_tensor = torch.from_numpy(img_np).permute(2, 0, 1)  # HWC -> CHW
-        img_tensor = img_tensor * 2.0 - 1.0  # [0,1] -> [-1,1]
-        return img_tensor.unsqueeze(0).to(self.device)
-    def postprocess(self, tensor):
-        """Postprocess output tensor to PIL Image."""
-        # Clamp and denormalize
-        tensor = tensor.squeeze(0).cpu()
-        tensor = torch.clamp(tensor, -1, 1)
-        tensor = (tensor + 1.0) / 2.0  # [-1,1] -> [0,1]
-        # Convert to numpy and PIL
-        img_np = (tensor.permute(1, 2, 0).numpy() * 255).astype(np.uint8)
-        return Image.fromarray(img_np)
-    @torch.no_grad()
-    def translate(self, sar_image, seed=42):
-        """Translate SAR image to optical image."""
-        # Set seed for reproducibility
-        if seed is not None:
-            torch.manual_seed(seed)
-            np.random.seed(seed)
-        # Preprocess
-        sar_tensor = self.preprocess(sar_image)  # [1, 3, H, W]
-        # Set noise schedule for inference
-        self.model.set_new_noise_schedule(
-            {
-                'schedule': 'linear',
-                'n_timestep': self.num_inference_steps,
-                'linear_start': 1e-6,
-                'linear_end': 1e-2,
-                'ddim': 1,
-                'lq_noiselevel': 0
-            },
-            self.device,
-            num_train_timesteps=1000
-        )
-        # Run inference
-        output, output_onestep = self.model.super_resolution(
-            sar_tensor,
-            continous=False,
-            seed=seed if seed is not None else 1,
-            img_s1=sar_tensor
-        )
-        return self.postprocess(output)
-class HighResProcessor:
-    """High resolution tiled processing - exact copy from process_highres.py"""
-    def __init__(self, device="cuda"):
-        self.device = device
-        self.model = None
-        self.tile_size = 256
-    def load_model(self):
-        print("Loading E3Diff model...")
-        self.model = E3DiffInference(device=self.device, num_inference_steps=1)
-    def create_blend_weights(self, tile_size, overlap):
-        """Create smooth blending weights for seamless output."""
-        ramp = np.linspace(0, 1, overlap)
-        weight = np.ones((tile_size, tile_size))
-        weight[:overlap, :] *= ramp[:, np.newaxis]
-        weight[-overlap:, :] *= ramp[::-1, np.newaxis]
-        weight[:, :overlap] *= ramp[np.newaxis, :]
-        weight[:, -overlap:] *= ramp[np.newaxis, ::-1]
-        return weight[:, :, np.newaxis]
-    def process(self, image, overlap=64):
-        """Process image at full resolution with seamless tiling."""
-        if self.model is None:
-            self.load_model()
-        if isinstance(image, Image.Image):
-            if image.mode != 'RGB':
-                image = image.convert('RGB')
-            img_np = np.array(image).astype(np.float32) / 255.0
-        else:
-            img_np = image
-        h, w = img_np.shape[:2]
-        tile_size = self.tile_size
-        step = tile_size - overlap
-        # Pad image
-        pad_h = (step - (h - overlap) % step) % step
-        pad_w = (step - (w - overlap) % step) % step
-        img_padded = np.pad(img_np, ((0, pad_h), (0, pad_w), (0, 0)), mode='reflect')
-        h_pad, w_pad = img_padded.shape[:2]
-        # Output arrays
-        output = np.zeros((h_pad, w_pad, 3), dtype=np.float32)
-        weights = np.zeros((h_pad, w_pad, 1), dtype=np.float32)
-        blend_weight = self.create_blend_weights(tile_size, overlap)
-        # Calculate positions
-        y_positions = list(range(0, h_pad - tile_size + 1, step))
-        x_positions = list(range(0, w_pad - tile_size + 1, step))
-        total_tiles = len(y_positions) * len(x_positions)
-        print(f"Processing {total_tiles} tiles ({len(x_positions)}x{len(y_positions)}) at {w}x{h}...")
-        tile_idx = 0
-        for y in y_positions:
-            for x in x_positions:
-                # Extract tile
-                tile = img_padded[y:y+tile_size, x:x+tile_size]
-                tile_pil = Image.fromarray((tile * 255).astype(np.uint8))
-                # Translate
-                result_pil = self.model.translate(tile_pil, seed=42)
-                result = np.array(result_pil).astype(np.float32) / 255.0
-                # Blend
-                output[y:y+tile_size, x:x+tile_size] += result * blend_weight
-                weights[y:y+tile_size, x:x+tile_size] += blend_weight
-                tile_idx += 1
-                if tile_idx % 10 == 0 or tile_idx == total_tiles:
-                    print(f"  Tile {tile_idx}/{total_tiles}")
-        # Normalize
-        output = output / (weights + 1e-8)
-        output = output[:h, :w]
-        return (output * 255).astype(np.uint8)
-    def enhance(self, image, contrast=1.1, sharpness=1.2, color=1.1):
-        """Professional post-processing."""
-        if isinstance(image, np.ndarray):
-            image = Image.fromarray(image)
-        image = ImageEnhance.Contrast(image).enhance(contrast)
-        image = ImageEnhance.Sharpness(image).enhance(sharpness)
-        image = ImageEnhance.Color(image).enhance(color)
-        return image
-# Global processor
-processor = None
 def load_sar_image(filepath):
@@ -300,49 +70,284 @@ def load_sar_image(filepath):
     return Image.open(filepath).convert('RGB')
-def _translate_sar_impl(file, overlap, enhance_output):
-    """Main translation function."""
-    global processor
     if file is None:
         return None, None, "Please upload a SAR image"
-    if processor is None:
-        processor = HighResProcessor()
-    print("Processing SAR image...")
     filepath = file.name if hasattr(file, 'name') else file
     image = load_sar_image(filepath)
     w, h = image.size
-    print(f"Input size: {w}x{h}")
     start = time.time()
-    result = processor.process(image, overlap=int(overlap))
     elapsed = time.time() - start
     result_pil = Image.fromarray(result)
     if enhance_output:
-        result_pil = processor.enhance(result_pil)
     tiff_path = tempfile.mktemp(suffix='.tiff')
     result_pil.save(tiff_path, format='TIFF', compression='lzw')
-    print(f"Complete in {elapsed:.1f}s!")
     info = f"Processed in {elapsed:.1f}s | Output: {result_pil.size[0]}x{result_pil.size[1]}"
     return result_pil, tiff_path, info
-# Apply GPU decorator if available
 if GPU_AVAILABLE and spaces is not None:
-    translate_sar = spaces.GPU(duration=300)(_translate_sar_impl)
 else:
-    translate_sar = _translate_sar_impl
 # Create Gradio interface
 with gr.Blocks(title="E3Diff: SAR-to-Optical Translation") as demo:
@@ -379,6 +384,7 @@ with gr.Blocks(title="E3Diff: SAR-to-Optical Translation") as demo:
     **Note:** E3Diff is a one-step diffusion model. Multiple steps degrade quality.
     """)
 if __name__ == "__main__":
     demo.queue().launch(ssr_mode=False)

 """
 E3Diff: SAR-to-Optical Translation - HuggingFace Space
+Fixed for ZeroGPU with lazy loading
 """
 import os
 import numpy as np
 from PIL import Image, ImageEnhance
 import gradio as gr
 import tempfile
 import time
+print("[E3Diff] Starting app...")
 # ZeroGPU support
+# The `spaces` package is optional: when the import fails the app still runs,
+# and `translate_sar` is later bound to the undecorated implementation.
 try:
     import spaces
     GPU_AVAILABLE = True
+    print("[E3Diff] ZeroGPU available")
 except ImportError:
     GPU_AVAILABLE = False
+    # Keep the name defined so later `spaces is not None` checks do not raise.
     spaces = None
+    print("[E3Diff] Running without ZeroGPU")
+# Lazy imports for heavy modules
+# Both caches start as None and are filled on first use by get_torch() and
+# get_model_modules() respectively.
+_torch = None
+_model_modules = None
+def get_torch():
+    """Return the ``torch`` module, importing it on the first call.
+
+    The imported module is stored in the module-level ``_torch`` variable, so
+    subsequent calls return the cached object without re-importing or
+    re-printing the log lines.
+    """
+    global _torch
+    if _torch is None:
+        print("[E3Diff] Importing torch...")
+        # Function-scope import: torch is not loaded when this module is imported.
+        import torch
+        _torch = torch
+        print(f"[E3Diff] PyTorch {torch.__version__} loaded")
+    return _torch
+def get_model_modules():
+    """Return the ``(UNet, GaussianDiffusion)`` classes, importing them lazily.
+
+    The pair is cached in the module-level ``_model_modules`` variable after
+    the first call. ``unet`` and ``diffusion`` are project-local modules that
+    are not shown in this file.
+    """
+    global _model_modules
+    if _model_modules is None:
+        print("[E3Diff] Importing model modules...")
+        from unet import UNet
+        from diffusion import GaussianDiffusion
+        # Stored as a tuple so callers can unpack both classes in one statement.
+        _model_modules = (UNet, GaussianDiffusion)
+        print("[E3Diff] Model modules loaded")
+    return _model_modules
 def load_sar_image(filepath):
     return Image.open(filepath).convert('RGB')
+def create_blend_weights(tile_size, overlap):
+    """Build a per-pixel blending mask for one square tile.
+
+    The mask is 1.0 in the interior and ramps linearly from 0 to 1 across the
+    first ``overlap`` rows/columns and from 1 to 0 across the last ``overlap``
+    rows/columns. Corners receive the product of both ramps.
+
+    Args:
+        tile_size: Side length of the square tile in pixels.
+        overlap: Width in pixels of the ramp applied on each edge.
+
+    Returns:
+        A float array of shape ``(tile_size, tile_size, 1)``; the trailing
+        axis lets it broadcast against an H x W x C tile.
+    """
+    # NOTE(review): the ramp starts at exactly 0, so the outermost row/column
+    # of every tile gets weight 0 -- confirm this is intended for tiles that
+    # sit on the image border, where no neighbouring tile contributes.
+    ramp = np.linspace(0, 1, overlap)
+    weight = np.ones((tile_size, tile_size))
+    weight[:overlap, :] *= ramp[:, np.newaxis]
+    # NOTE(review): with overlap == 0, `-overlap:` is `0:` and selects the
+    # whole array while `ramp` is empty; this multiply looks like it cannot
+    # broadcast in that case -- confirm callers never pass 0.
+    weight[-overlap:, :] *= ramp[::-1, np.newaxis]
+    weight[:, :overlap] *= ramp[np.newaxis, :]
+    weight[:, -overlap:] *= ramp[np.newaxis, ::-1]
+    return weight[:, :, np.newaxis]
+def build_model(device):
+    """Construct the diffusion model, load its weights and return it in eval mode.
+
+    Builds a ``UNet`` wrapped in a ``GaussianDiffusion`` object with a fixed
+    256-pixel image size and a single inference step, moves it to ``device``,
+    downloads the checkpoint ``I700000_E719_gen.pth`` from the
+    ``Dhenenjay/E3Diff-SAR2Optical`` hub repository and loads it.
+
+    Args:
+        device: Target device passed to ``model.to`` and used as
+            ``map_location`` when reading the checkpoint.
+
+    Returns:
+        The ``GaussianDiffusion`` instance after ``load_state_dict`` and
+        ``eval()`` have been called.
+    """
+    torch = get_torch()
+    UNet, GaussianDiffusion = get_model_modules()
+    # Imported here rather than at module level, like the other heavy imports.
+    from huggingface_hub import hf_hub_download
+    print("[E3Diff] Building model architecture...")
+    image_size = 256
+    num_inference_steps = 1
+    # UNet configuration
+    # 3 input channels, 3 output channels, 3 conditioning channels; the exact
+    # meaning of the remaining arguments is defined by the project's `unet`
+    # module, which is not visible here.
+    unet = UNet(
+        in_channel=3,
+        out_channel=3,
+        norm_groups=16,
+        inner_channel=64,
+        channel_mults=[1, 2, 4, 8, 16],
+        attn_res=[],
+        res_blocks=1,
+        dropout=0,
+        image_size=image_size,
+        condition_ch=3
+    )
+    # Diffusion wrapper
+    # The same schedule dict is passed directly and nested under opt['model'].
+    schedule_opt = {
+        'schedule': 'linear',
+        'n_timestep': num_inference_steps,
+        'linear_start': 1e-6,
+        'linear_end': 1e-2,
+        'ddim': 1,
+        'lq_noiselevel': 0
+    }
+    opt = {
+        'stage': 2,
+        'ddim_steps': num_inference_steps,
+        'model': {
+            'beta_schedule': {
+                'train': {'n_timestep': 1000},
+                'val': schedule_opt
+            }
+        }
+    }
+    model = GaussianDiffusion(
+        denoise_fn=unet,
+        image_size=image_size,
+        channels=3,
+        loss_type='l1',
+        conditional=True,
+        schedule_opt=schedule_opt,
+        xT_noise_r=0,
+        seed=1,
+        opt=opt
+    )
+    model = model.to(device)
+    # Load weights
+    print("[E3Diff] Downloading weights...")
+    weights_path = hf_hub_download(
+        repo_id="Dhenenjay/E3Diff-SAR2Optical",
+        filename="I700000_E719_gen.pth"
+    )
+    print(f"[E3Diff] Loading weights from: {weights_path}")
+    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
+    # objects from the checkpoint; only safe while the hub repo is trusted.
+    state_dict = torch.load(weights_path, map_location=device, weights_only=False)
+    # NOTE(review): strict=False does not raise on missing/unexpected keys and
+    # the returned key report is discarded, so a mismatched checkpoint would
+    # load silently -- consider logging the result.
+    model.load_state_dict(state_dict, strict=False)
+    model.eval()
+    print("[E3Diff] Model ready!")
+    return model
+def preprocess(image, device, image_size=256):
+    """Convert a PIL image into a normalised batch tensor on ``device``.
+
+    Args:
+        image: PIL image; converted to RGB if it is in another mode.
+        device: Device the resulting tensor is moved to.
+        image_size: Side length the image is resized to when its size differs.
+
+    Returns:
+        A float32 tensor of shape ``(1, 3, image_size, image_size)`` with
+        values scaled to ``[-1, 1]``.
+    """
+    torch = get_torch()
+    if image.mode != 'RGB':
+        image = image.convert('RGB')
+    # Resize only when needed; tiles that already match are passed through.
+    if image.size != (image_size, image_size):
+        image = image.resize((image_size, image_size), Image.LANCZOS)
+    # uint8 [0, 255] -> float32 [0, 1]
+    img_np = np.array(image).astype(np.float32) / 255.0
+    # HWC -> CHW
+    img_tensor = torch.from_numpy(img_np).permute(2, 0, 1)
+    # [0, 1] -> [-1, 1]
+    img_tensor = img_tensor * 2.0 - 1.0
+    # Add the leading batch dimension before moving to the target device.
+    return img_tensor.unsqueeze(0).to(device)
+def postprocess(tensor):
+    """Convert a model output tensor back into a PIL image.
+
+    Args:
+        tensor: Output tensor; presumably shaped ``(1, 3, H, W)`` with values
+            around ``[-1, 1]`` -- TODO confirm against the diffusion module.
+
+    Returns:
+        A PIL image built from the clamped, rescaled uint8 array.
+    """
+    torch = get_torch()
+    # Drop the batch axis and bring the data to host memory for NumPy.
+    tensor = tensor.squeeze(0).cpu()
+    # Out-of-range values are clipped rather than rescaled.
+    tensor = torch.clamp(tensor, -1, 1)
+    # [-1, 1] -> [0, 1]
+    tensor = (tensor + 1.0) / 2.0
+    # CHW -> HWC, then scale to 0..255; astype(np.uint8) truncates fractions.
+    img_np = (tensor.permute(1, 2, 0).numpy() * 255).astype(np.uint8)
+    return Image.fromarray(img_np)
+def translate_tile(model, sar_pil, device, seed=42):
+    """Run the model on a single SAR tile and return the result as a PIL image.
+
+    Args:
+        model: Object returned by ``build_model``; must provide
+            ``set_new_noise_schedule`` and ``super_resolution``.
+        sar_pil: Input tile as a PIL image.
+        device: Device used for the input tensor and the noise schedule.
+        seed: Seed applied to torch and NumPy RNGs; ``None`` skips seeding and
+            passes ``1`` to ``super_resolution`` instead.
+
+    Returns:
+        The first value returned by ``model.super_resolution``, converted with
+        ``postprocess``.
+    """
+    torch = get_torch()
+    if seed is not None:
+        # NOTE(review): this reseeds the process-wide torch and NumPy RNGs on
+        # every tile, which also affects any other code sharing them.
+        torch.manual_seed(seed)
+        np.random.seed(seed)
+    sar_tensor = preprocess(sar_pil, device)
+    # NOTE(review): the schedule arguments are constants, yet the schedule is
+    # reset for every tile; hoisting it out of the per-tile path may be
+    # possible -- confirm set_new_noise_schedule has no per-call state.
+    model.set_new_noise_schedule(
+        {
+            'schedule': 'linear',
+            'n_timestep': 1,
+            'linear_start': 1e-6,
+            'linear_end': 1e-2,
+            'ddim': 1,
+            'lq_noiselevel': 0
+        },
+        device,
+        num_train_timesteps=1000
+    )
+    # Inference only: no gradients are needed.
+    with torch.no_grad():
+        # The same tensor is passed both positionally and as `img_s1`; the
+        # second return value is discarded.
+        output, _ = model.super_resolution(
+            sar_tensor,
+            continous=False,
+            seed=seed if seed is not None else 1,
+            img_s1=sar_tensor
+        )
+    return postprocess(output)
+def enhance_image(image, contrast=1.1, sharpness=1.2, color=1.1):
+    """Apply contrast, sharpness and colour enhancement, in that order.
+
+    Args:
+        image: PIL image or NumPy array; arrays are converted with
+            ``Image.fromarray`` first.
+        contrast: Factor passed to ``ImageEnhance.Contrast``.
+        sharpness: Factor passed to ``ImageEnhance.Sharpness``.
+        color: Factor passed to ``ImageEnhance.Color``.
+
+    Returns:
+        The enhanced PIL image.
+    """
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(image)
+    # Each enhancer operates on the output of the previous one.
+    image = ImageEnhance.Contrast(image).enhance(contrast)
+    image = ImageEnhance.Sharpness(image).enhance(sharpness)
+    image = ImageEnhance.Color(image).enhance(color)
+    return image
+def process_image(image, model, device, overlap=64):
+    """Translate an image of arbitrary size by blending overlapping 256px tiles.
+
+    The image is padded at the bottom/right by reflection so that tiles placed
+    every ``256 - overlap`` pixels cover it, each tile is translated with
+    ``translate_tile`` (seed 42), and the results are accumulated with the
+    mask from ``create_blend_weights`` and normalised by the summed weights.
+
+    Args:
+        image: PIL image (converted to RGB and scaled to ``[0, 1]``) or an
+            array used as-is -- assumes an H x W x 3 float array in ``[0, 1]``
+            in that case; TODO confirm with callers.
+        model: Model object forwarded to ``translate_tile``.
+        device: Device forwarded to ``translate_tile``.
+        overlap: Number of pixels shared by neighbouring tiles.
+
+    Returns:
+        A uint8 array of shape ``(h, w, 3)`` cropped to the input size.
+    """
+    if isinstance(image, Image.Image):
+        if image.mode != 'RGB':
+            image = image.convert('RGB')
+        img_np = np.array(image).astype(np.float32) / 255.0
+    else:
+        img_np = image
+    h, w = img_np.shape[:2]
+    tile_size = 256
+    # NOTE(review): overlap >= 256 makes step <= 0; the modulo and range()
+    # calls below would then fail or yield no tiles -- validate the input.
+    step = tile_size - overlap
+    # Pad image
+    # Padding makes (size - overlap) a multiple of step.
+    pad_h = (step - (h - overlap) % step) % step
+    pad_w = (step - (w - overlap) % step) % step
+    img_padded = np.pad(img_np, ((0, pad_h), (0, pad_w), (0, 0)), mode='reflect')
+    h_pad, w_pad = img_padded.shape[:2]
+    # Output arrays
+    # `output` accumulates weighted tile results; `weights` the mask sum.
+    output = np.zeros((h_pad, w_pad, 3), dtype=np.float32)
+    weights = np.zeros((h_pad, w_pad, 1), dtype=np.float32)
+    blend_weight = create_blend_weights(tile_size, overlap)
+    # Calculate positions
+    y_positions = list(range(0, h_pad - tile_size + 1, step))
+    x_positions = list(range(0, w_pad - tile_size + 1, step))
+    total_tiles = len(y_positions) * len(x_positions)
+    print(f"[E3Diff] Processing {total_tiles} tiles ({len(x_positions)}x{len(y_positions)}) at {w}x{h}...")
+    tile_idx = 0
+    for y in y_positions:
+        for x in x_positions:
+            # Extract tile
+            tile = img_padded[y:y+tile_size, x:x+tile_size]
+            # Round-trip through uint8 because translate_tile expects a PIL image.
+            tile_pil = Image.fromarray((tile * 255).astype(np.uint8))
+            # Translate
+            result_pil = translate_tile(model, tile_pil, device, seed=42)
+            result = np.array(result_pil).astype(np.float32) / 255.0
+            # Blend
+            output[y:y+tile_size, x:x+tile_size] += result * blend_weight
+            weights[y:y+tile_size, x:x+tile_size] += blend_weight
+            tile_idx += 1
+            # Log every tenth tile and the final one.
+            if tile_idx % 10 == 0 or tile_idx == total_tiles:
+                print(f"[E3Diff] Tile {tile_idx}/{total_tiles}")
+    # Normalize
+    # The epsilon avoids division by zero where the accumulated weight is 0.
+    # NOTE(review): such pixels (e.g. the first row/column, where the only
+    # covering tile has mask value 0) come out as 0, i.e. black -- confirm.
+    output = output / (weights + 1e-8)
+    # Remove the reflection padding.
+    output = output[:h, :w]
+    return (output * 255).astype(np.uint8)
+# Global model cache
+_cached_model = None
+def _translate_impl(file, overlap, enhance_output):
+    """Translate an uploaded SAR image and return the preview, file and status.
+
+    Uses CUDA when ``torch.cuda.is_available()`` is true, otherwise the CPU.
+    The model is built on the first call and kept in ``_cached_model``.
+
+    Args:
+        file: Uploaded file; either an object with a ``name`` attribute or a
+            path. ``None`` short-circuits with a prompt message.
+        overlap: Tile overlap in pixels; converted with ``int()``.
+        enhance_output: When truthy, ``enhance_image`` is applied to the result.
+
+    Returns:
+        A tuple ``(result_image, tiff_path, info_text)``; the first two are
+        ``None`` when no file was supplied.
+    """
+    global _cached_model
     if file is None:
         return None, None, "Please upload a SAR image"
+    torch = get_torch()
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"[E3Diff] Using device: {device}")
+    # Load model (cached)
+    # NOTE(review): the cached model stays on the device chosen by the first
+    # call; later calls reuse it even if `device` differs -- confirm.
+    if _cached_model is None:
+        _cached_model = build_model(device)
+    model = _cached_model
+    # Load image
     filepath = file.name if hasattr(file, 'name') else file
+    print(f"[E3Diff] Loading: {filepath}")
     image = load_sar_image(filepath)
     w, h = image.size
+    print(f"[E3Diff] Input size: {w}x{h}")
+    # Only the tiled translation is timed; model loading is excluded.
     start = time.time()
+    result = process_image(image, model, device, overlap=int(overlap))
     elapsed = time.time() - start
     result_pil = Image.fromarray(result)
     if enhance_output:
+        result_pil = enhance_image(result_pil)
+    # NOTE(review): tempfile.mktemp is deprecated and race-prone (it returns a
+    # name without creating the file); consider NamedTemporaryFile(delete=False).
     tiff_path = tempfile.mktemp(suffix='.tiff')
     result_pil.save(tiff_path, format='TIFF', compression='lzw')
+    print(f"[E3Diff] Complete in {elapsed:.1f}s!")
     info = f"Processed in {elapsed:.1f}s | Output: {result_pil.size[0]}x{result_pil.size[1]}"
     return result_pil, tiff_path, info
+# Apply GPU decorator
+# `translate_sar` is the public entry point: a `spaces.GPU`-decorated wrapper
+# when the `spaces` package was imported, otherwise the plain implementation.
 if GPU_AVAILABLE and spaces is not None:
+    @spaces.GPU(duration=300)
+    def translate_sar(file, overlap, enhance_output):
+        """Forward all arguments unchanged to ``_translate_impl``."""
+        return _translate_impl(file, overlap, enhance_output)
 else:
+    translate_sar = _translate_impl
+print("[E3Diff] Building Gradio interface...")
 # Create Gradio interface
 with gr.Blocks(title="E3Diff: SAR-to-Optical Translation") as demo:
     **Note:** E3Diff is a one-step diffusion model. Multiple steps degrade quality.
     """)
+print("[E3Diff] Launching app...")
 if __name__ == "__main__":
     demo.queue().launch(ssr_mode=False)