import cv2
import os
import numpy as np
import threading
import concurrent.futures
from typing import List, Tuple, Optional
import time

# insightface is optional: when it cannot be imported the module still loads,
# the two imported names are bound to None, and INSIGHTFACE_AVAILABLE is False
# so the rest of the file can detect the degraded mode.
try:
    import insightface
    from insightface.app import FaceAnalysis
except ImportError as import_error:
    print(f"Warning: insightface not available: {import_error}")
    print("Using OpenCV fallback for face detection")
    insightface = None
    FaceAnalysis = None
    INSIGHTFACE_AVAILABLE = False
else:
    INSIGHTFACE_AVAILABLE = True

class FaceSwapper:
    def __init__(self, gpu_enabled=True, gpu_id=0):
        """
        Initialize FaceSwapper with GPU acceleration support.

        When insightface is importable this loads the 'buffalo_l' analysis
        models and the 'inswapper_128.onnx' swapper; otherwise ``app`` and
        ``swapper`` are left as None. If GPU use was requested, the swapper's
        ONNX session is asked to use DirectML with a CPU fallback.

        After construction ``gpu_enabled`` and ``ctx_id`` describe what was
        actually configured (not merely what was requested), and
        ``gpu_error`` holds the reason whenever GPU acceleration is not
        active (None when it is active).

        Args:
            gpu_enabled: Whether to use GPU acceleration
            gpu_id: GPU device ID (default 0 for RX 5500 XT)
        """
        self.gpu_enabled = gpu_enabled
        self.gpu_id = gpu_id
        self.ctx_id = gpu_id if gpu_enabled else -1
        self.gpu_error = None

        print(f"Initializing FaceSwapper with GPU {'enabled' if gpu_enabled else 'disabled'} (ctx_id={self.ctx_id})")

        if INSIGHTFACE_AVAILABLE:
            # Initialize FaceAnalysis with detection and landmark models
            self.app = FaceAnalysis(name='buffalo_l')

            # Larger detection size = more accurate but more VRAM usage
            det_size = (1024, 1024) if gpu_enabled else (640, 640)
            self.app.prepare(ctx_id=self.ctx_id, det_size=det_size)

            self.swapper = insightface.model_zoo.get_model(
                'inswapper_128.onnx', download=True, download_zip=True
            )
        else:
            print("Using OpenCV fallback mode - limited functionality")
            self.app = None
            self.swapper = None

        # Decide whether GPU acceleration is really in effect. Every branch
        # that leaves the GPU unconfigured also clears gpu_enabled/ctx_id so
        # the rest of the class (timing log, batch scheduling, VRAM sizing)
        # never reports or assumes a GPU that is not being used.
        if not INSIGHTFACE_AVAILABLE:
            self._fall_back_to_cpu("insightface not available - using OpenCV fallback")
        elif not gpu_enabled:
            self.gpu_error = "GPU acceleration disabled by user"
        elif not hasattr(self.swapper, 'session'):
            self._fall_back_to_cpu("Swapper session not available for GPU configuration")
        else:
            try:
                # Use DirectML for AMD GPUs, fallback to CPU
                providers = ['DmlExecutionProvider', 'CPUExecutionProvider']
                self.swapper.session.set_providers(providers)
                actual_providers = self.swapper.session.get_providers()
                print(f"GPU providers configured: {actual_providers}")

                # The session may silently drop a provider it cannot load,
                # so check what it actually ended up with.
                if 'DmlExecutionProvider' in actual_providers:
                    print("✅ GPU acceleration successfully enabled with DirectML (AMD RX 5500 XT)")
                else:
                    print("⚠️ DirectML provider not available, falling back to CPU")
                    self._fall_back_to_cpu("DirectML provider not available")
            except Exception as e:
                print(f"❌ GPU configuration failed, falling back to CPU: {e}")
                self._fall_back_to_cpu(str(e))

        # Performance tracking
        self.last_processing_time = 0
        self.gpu_memory_usage = 0

    def _fall_back_to_cpu(self, reason):
        """Mark GPU acceleration as inactive and remember why."""
        self.gpu_enabled = False
        self.ctx_id = -1
        self.gpu_error = reason
        
    def get_gpu_info(self):
        """
        Describe the current acceleration state as a plain dictionary.

        Three shapes are possible: a "disabled" report (GPU off or insightface
        missing), a detailed report built from onnxruntime and the swapper
        session, or an error report when gathering those details raises.
        """
        acceleration_active = self.gpu_enabled and INSIGHTFACE_AVAILABLE
        if not acceleration_active:
            disabled_report = {
                "gpu_enabled": False,
                "message": "GPU acceleration disabled or not available",
                "error": getattr(self, 'gpu_error', 'Unknown error'),
                "ctx_id": self.ctx_id,
                "insightface_available": INSIGHTFACE_AVAILABLE,
            }
            return disabled_report

        try:
            import onnxruntime as ort

            installed_providers = ort.get_available_providers()
            # Sessions without get_providers() are reported as ['Unknown'].
            provider_getter = getattr(self.swapper.session, 'get_providers', lambda: ['Unknown'])
            session_providers = provider_getter()

            report = {}
            report["gpu_enabled"] = True
            report["gpu_id"] = self.gpu_id
            report["available_providers"] = installed_providers
            report["current_providers"] = session_providers
            report["ctx_id"] = self.ctx_id
            report["directml_available"] = 'DmlExecutionProvider' in session_providers
            report["detection_size"] = (1024, 1024) if self.gpu_enabled else (640, 640)
            report["insightface_available"] = INSIGHTFACE_AVAILABLE
            return report
        except Exception as e:
            failure_report = {
                "gpu_enabled": False,
                "error": str(e),
                "fallback_reason": "GPU info retrieval failed",
                "insightface_available": INSIGHTFACE_AVAILABLE,
            }
            return failure_report

    def transplant_hair(self, src_img, dst_img, src_face, dst_face):
        """
        Warps the source hair onto the destination face using Affine Transformation.

        The source image is aligned to the destination with an affine
        transform computed from the first three landmarks of each face (eyes
        and nose in the 5-point layout), so ``src_img`` must be in the same
        coordinate frame as ``src_face.kps``. The hair region is estimated as
        everything above a line 0.8 eye-to-nose distances above the eyes,
        feathered with a Gaussian blur, and the warped source is blended into
        the destination inside that region.

        Args:
            src_img: Source image (uint8).
            dst_img: Destination image (uint8); it is not modified.
            src_face: Source face object exposing ``kps`` landmarks.
            dst_face: Destination face object exposing ``kps`` landmarks.

        Returns:
            A new uint8 image with the size of ``dst_img``. When no hair
            region can be estimated (hairline at or above the top edge, or the
            nose is not below the eyes) an unchanged copy of ``dst_img`` is
            returned.
        """
        # 1. Get Landmarks (keypoints)
        src_lm = np.asarray(src_face.kps, dtype=np.float32)
        dst_lm = np.asarray(dst_face.kps, dtype=np.float32)

        # 2. Estimate the hairline from the destination landmarks.
        # 5-point layout: 0,1=eyes, 2=nose, 3,4=mouth corners.
        eye_y = int((dst_lm[0][1] + dst_lm[1][1]) / 2)  # Average eye height
        nose_y = int(dst_lm[2][1])
        face_height = nose_y - eye_y
        forehead_line = int(eye_y - (face_height * 0.8))

        # Without a usable hairline there is nothing to blend. Bail out before
        # the expensive warp and hand back the destination pixels untouched.
        # A non-positive face_height means the nose is not below the eyes
        # (e.g. a rotated face), where the "above the eyes" heuristic would
        # cover the face itself.
        if face_height <= 0 or forehead_line <= 0:
            return dst_img.copy()

        # 3. Affine transform aligning source eyes/nose to the destination
        M = cv2.getAffineTransform(src_lm[:3], dst_lm[:3])

        # 4. Warp the entire Source Image to match Target Geometry
        h, w = dst_img.shape[:2]
        warped_src = cv2.warpAffine(src_img, M, (w, h), borderMode=cv2.BORDER_REFLECT)

        # 5. Soft mask: 1 above the forehead line, fading to 0 below it.
        # A single channel is enough; it is broadcast over the colour axis.
        hair_mask = np.zeros((h, w), dtype=np.float32)
        cv2.rectangle(hair_mask, (0, 0), (w, forehead_line), (1, 1, 1), -1)
        hair_mask = cv2.GaussianBlur(hair_mask, (51, 51), 0)
        if dst_img.ndim == 3:
            hair_mask = hair_mask[:, :, np.newaxis]

        # 6. Blend: (WarpedSource * Mask) + (Target * (1-Mask)).
        # Work directly in the 0..255 range and round before casting: a
        # /255 -> *255 round trip followed by truncation can lower pixels by
        # one level even where the mask is exactly zero.
        blended = (warped_src.astype(np.float32) * hair_mask
                   + dst_img.astype(np.float32) * (1.0 - hair_mask))
        return np.clip(np.rint(blended), 0, 255).astype(np.uint8)

    def enhance_face_alignment(self, source_img, target_img, source_face, target_face):
        """
        Warp the source image so its face lines up with the target face.

        A partial affine (rotation, uniform scale, translation) transform is
        estimated from the first three landmarks of each face and applied to
        the whole source image, producing an image with the target's size.
        If the estimate is unavailable or anything raises, the source image is
        returned unchanged (a message is printed in the exception case).
        """
        try:
            source_landmarks = source_face.kps
            target_landmarks = target_face.kps

            # 5-point layout: 0=left eye, 1=right eye, 2=nose tip,
            # 3=left mouth, 4=right mouth. Only points 0-2 drive the estimate.
            source_points = np.array(source_landmarks, dtype=np.float32)
            target_points = np.array(target_landmarks, dtype=np.float32)

            out_height, out_width = target_img.shape[:2]
            transform, _inliers = cv2.estimateAffinePartial2D(source_points[:3], target_points[:3])

            if transform is None:
                return source_img

            return cv2.warpAffine(
                source_img,
                transform,
                (out_width, out_height),
                borderMode=cv2.BORDER_REFLECT_101,
            )
        except Exception as e:
            print(f"Face alignment enhancement failed: {e}")
            return source_img
    
    def improve_color_matching(self, swapped_face, target_region, target_face_bbox):
        """
        Pull the swapped face's colours towards the target region.

        Both patches are converted to LAB, each channel of the swapped face is
        histogram-matched to the same channel of the target region, and the
        matched result is mixed 70/30 with the unmodified swapped face.
        ``target_face_bbox`` is accepted but not used. On any failure a
        message is printed and ``swapped_face`` is returned as-is.
        """
        try:
            face_lab = cv2.cvtColor(swapped_face, cv2.COLOR_BGR2LAB)
            reference_lab = cv2.cvtColor(target_region, cv2.COLOR_BGR2LAB)

            bin_count = [256]
            value_range = [0, 256]
            for channel in (0, 1, 2):  # L, A, B
                face_hist = cv2.calcHist([face_lab], [channel], None, bin_count, value_range)
                reference_hist = cv2.calcHist([reference_lab], [channel], None, bin_count, value_range)

                # Turn counts into probabilities so patches of different
                # sizes are comparable.
                face_hist = face_hist / face_hist.sum()
                reference_hist = reference_hist / reference_hist.sum()

                mapping = self._create_histogram_lut(face_hist, reference_hist)
                face_lab[:, :, channel] = cv2.LUT(face_lab[:, :, channel], mapping)

            matched_bgr = cv2.cvtColor(face_lab, cv2.COLOR_LAB2BGR)

            # 70% enhanced, 30% original keeps the result natural-looking.
            matched_weight = 0.7
            return cv2.addWeighted(matched_bgr, matched_weight, swapped_face, 1 - matched_weight, 0)
        except Exception as e:
            print(f"Color matching enhancement failed: {e}")
            return swapped_face
    
    def _create_histogram_lut(self, source_hist, target_hist):
        """
        Build a 256-entry uint8 lookup table for histogram matching.

        Both histograms are reduced to cumulative sums; source level ``i`` is
        mapped to the first target level whose cumulative value is closest to
        the source's cumulative value at ``i``.
        """
        source_cdf = source_hist.cumsum()
        target_cdf = target_hist.cumsum()

        # Row i holds |target_cdf - source_cdf[i]|. argmin along each row
        # returns the first closest target level, matching a per-level scan.
        distances = np.abs(target_cdf[np.newaxis, :] - source_cdf[:256, np.newaxis])

        lut = np.zeros(256, dtype=np.uint8)
        lut[:] = distances.argmin(axis=1)
        return lut
    
    def seamless_blending(self, swapped_face, target_img, target_face_bbox):
        """
        Seamless blending using multi-band blending for natural integration.

        The swapped face patch is blended into a copy of ``target_img`` inside
        the face bounding box. A feathered elliptical mask (ellipse inscribed
        in the bbox, Gaussian-blurred) decides how much of the patch is used;
        the blend itself is done per level of a Laplacian pyramid so coarse
        structure is mixed over a wide area and fine detail over a narrow one.

        Args:
            swapped_face: Patch to blend in; expected to cover the bbox area
                that lies inside the image (it is resized if it does not).
            target_img: Image to blend into; it is not modified.
            target_face_bbox: ``(x1, y1, x2, y2)``; may extend past the image
                borders, in which case only the visible part is blended.

        Returns:
            A new image with the same shape as ``target_img``. If the bbox
            does not overlap the image, or the patch is empty, this is an
            unchanged copy. If blending fails, the patch is pasted without
            blending when its size matches the visible bbox area.
        """
        x1, y1, x2, y2 = map(int, target_face_bbox)
        img_h, img_w = target_img.shape[:2]

        # Visible part of the bbox. Negative or oversized coordinates would
        # otherwise wrap around or silently shrink the slices.
        rx1, rx2 = max(0, min(x1, img_w)), max(0, min(x2, img_w))
        ry1, ry2 = max(0, min(y1, img_h)), max(0, min(y2, img_h))
        roi_h, roi_w = ry2 - ry1, rx2 - rx1

        result = target_img.copy()
        if roi_h <= 0 or roi_w <= 0 or swapped_face is None or swapped_face.size == 0:
            return result

        try:
            patch = swapped_face
            if patch.shape[:2] != (roi_h, roi_w):
                patch = cv2.resize(patch, (roi_w, roi_h))
            patch = patch.astype(np.float32)
            background = target_img[ry1:ry2, rx1:rx2].astype(np.float32)

            # Feathered ellipse, drawn from the original (unclamped) bbox so a
            # face that is partly out of frame keeps its shape.
            mask = np.zeros((img_h, img_w), dtype=np.uint8)
            center = (int((x1 + x2) / 2), int((y1 + y2) / 2))
            axes = (int((x2 - x1) / 2), int((y2 - y1) / 2))
            cv2.ellipse(mask, center, axes, 0, 0, 360, (255, 255, 255), -1)
            mask_blurred = cv2.GaussianBlur(mask, (101, 101), 0).astype(np.float32) / 255.0

            # Give the mask the same channel layout as the images so every
            # pyramid level multiplies element-wise.
            roi_mask = np.ascontiguousarray(mask_blurred[ry1:ry2, rx1:rx2])
            if patch.ndim == 3:
                roi_mask = np.repeat(roi_mask[:, :, np.newaxis], patch.shape[2], axis=2)

            # Up to 5 levels, fewer for small faces so the coarsest level does
            # not collapse to a single pixel.
            levels = max(1, min(5, int(np.log2(min(roi_h, roi_w)))))

            patch_levels = self._create_laplacian_pyramid(patch, levels)
            background_levels = self._create_laplacian_pyramid(background, levels)
            mask_levels = self._create_gaussian_pyramid(roi_mask, levels)

            blended_levels = [
                p * m + b * (1.0 - m)
                for p, b, m in zip(patch_levels, background_levels, mask_levels)
            ]
            blended = self._reconstruct_from_pyramid(blended_levels)

            # Band-pass levels can overshoot slightly; round and clip before
            # going back to 8 bits.
            result[ry1:ry2, rx1:rx2] = np.clip(np.rint(blended), 0, 255).astype(np.uint8)
            return result
        except Exception as e:
            print(f"Seamless blending failed: {e}")
            # Fallback to simple paste
            fallback = target_img.copy()
            if swapped_face.shape[:2] == (roi_h, roi_w):
                fallback[ry1:ry2, rx1:rx2] = swapped_face
            return fallback

    def _create_laplacian_pyramid(self, img, levels):
        """Split ``img`` into ``levels`` band-pass layers whose collapse (upsample and add) restores it."""
        gaussian = self._create_gaussian_pyramid(img, levels)
        laplacian = []
        for fine, coarse in zip(gaussian[:-1], gaussian[1:]):
            # Upsample exactly the way _reconstruct_from_pyramid does, so the
            # decomposition and the reconstruction are inverses of each other.
            upsampled = cv2.pyrUp(coarse)
            if upsampled.shape[:2] != fine.shape[:2]:
                upsampled = cv2.resize(upsampled, (fine.shape[1], fine.shape[0]))
            laplacian.append(fine - upsampled)
        laplacian.append(gaussian[-1])
        return laplacian
    
    def _create_gaussian_pyramid(self, img, levels):
        """
        Build a Gaussian pyramid by repeatedly downsampling ``img``.

        The returned list starts with ``img`` itself, followed by successive
        ``cv2.pyrDown`` results, for ``levels`` entries in total (always at
        least one).
        """
        pyramid = [img]
        while len(pyramid) < levels:
            pyramid.append(cv2.pyrDown(pyramid[-1]))
        return pyramid
    
    def _reconstruct_from_pyramid(self, pyramid):
        """
        Collapse a pyramid into a single image.

        Starting from the coarsest (last) level, the running image is
        upsampled with ``cv2.pyrUp`` (and resized if the size is off) and the
        next finer level is added to it. This is the collapse step for
        band-pass (Laplacian-style) levels; levels that each hold a full image
        would be summed, not averaged.
        """
        collapsed = pyramid[-1]
        for finer_level in reversed(pyramid[:-1]):
            collapsed = cv2.pyrUp(collapsed)
            finer_height, finer_width = finer_level.shape[:2]
            if collapsed.shape[:2] != (finer_height, finer_width):
                collapsed = cv2.resize(collapsed, (finer_width, finer_height))
            collapsed = collapsed + finer_level
        return collapsed
    
    def swap_faces(self, source_path, source_face_idx, target_path, target_face_idx, swap_hair=False):
        """
        Swap one face from the source image onto one face in the target image.

        Faces in each image are ordered left to right and addressed by
        1-based index. After the model swap, the face region is colour-matched
        to the original target face and blended back into the target image;
        optionally the source hair is transplanted as well.

        Args:
            source_path: Path of the image providing the face.
            source_face_idx: 1-based index of the source face (left to right).
            target_path: Path of the image receiving the face.
            target_face_idx: 1-based index of the target face (left to right).
            swap_hair: Also blend the source hair region onto the result.

        Returns:
            The resulting image as a BGR array.

        Raises:
            RuntimeError: The insightface models are not loaded.
            ValueError: An image cannot be read or a face index is out of range.
        """
        start_time = time.time()

        # In OpenCV fallback mode app/swapper are None; fail with a clear
        # message instead of an AttributeError on None.
        if getattr(self, 'app', None) is None or getattr(self, 'swapper', None) is None:
            raise RuntimeError("Face swapping requires the insightface models, which are not loaded")

        source_img = cv2.imread(source_path)
        target_img = cv2.imread(target_path)

        if source_img is None or target_img is None:
            raise ValueError("Could not read one or both images")

        # Detect faces and sort them from left to right
        source_faces = sorted(self.app.get(source_img), key=lambda x: x.bbox[0])
        target_faces = sorted(self.app.get(target_img), key=lambda x: x.bbox[0])

        if len(source_faces) < source_face_idx or source_face_idx < 1:
            raise ValueError(f"Source image contains {len(source_faces)} faces, but requested face {source_face_idx}")
        if len(target_faces) < target_face_idx or target_face_idx < 1:
            raise ValueError(f"Target image contains {len(target_faces)} faces, but requested face {target_face_idx}")

        source_face = source_faces[source_face_idx - 1]
        target_face = target_faces[target_face_idx - 1]

        # Step 1: Model face swap (works from the detected face objects)
        result = self.swapper.get(target_img, target_face, source_face, paste_back=True)

        # Step 2: Clamp the face box to the image. Detectors may report boxes
        # that extend past the borders, and negative indices would wrap.
        img_h, img_w = target_img.shape[:2]
        bx1, by1, bx2, by2 = [int(v) for v in target_face.bbox]
        x1, x2 = max(0, min(bx1, img_w)), max(0, min(bx2, img_w))
        y1, y2 = max(0, min(by1, img_h)), max(0, min(by2, img_h))

        if x2 > x1 and y2 > y1:
            face_box = (x1, y1, x2, y2)
            swapped_face_region = result[y1:y2, x1:x2]
            target_face_region = target_img[y1:y2, x1:x2]

            # Step 3: Enhanced color matching
            enhanced_face = self.improve_color_matching(swapped_face_region, target_face_region, face_box)

            # Step 4: Seamless blending back into target image
            result = self.seamless_blending(enhanced_face, target_img, face_box)
        # Otherwise no part of the box is inside the image: keep the swapper's
        # own paste-back result.

        # Step 5: Optional Hair Transplant. transplant_hair aligns the source
        # itself from source_face.kps, so it must get the ORIGINAL source
        # image; a pre-aligned image would be warped a second time.
        if swap_hair:
            try:
                result = self.transplant_hair(source_img, result, source_face, target_face)
            except Exception as e:
                # Best effort: keep the swap result if the hair step fails.
                print(f"Hair swap failed (fallback to enhanced swap): {e}")

        self.last_processing_time = time.time() - start_time
        print(f"Face swap completed in {self.last_processing_time:.2f}s (GPU: {'Yes' if self.gpu_enabled else 'No'})")
        return result

    def swap_faces_batch(self, source_path: str, target_path: str, 
                        source_face_indices: List[int] = None, 
                        target_face_indices: List[int] = None,
                        swap_hair: bool = False) -> List[np.ndarray]:
        """
        Swap every requested source face onto every requested target face.

        Faces are detected once per image, ordered left to right and
        addressed by 1-based index; each index list defaults to ``[1]``. One
        result image is produced per (source, target) combination that
        succeeds; failed combinations are reported and left out. Combinations
        run in parallel when GPU mode is on and there is more than one of
        them, otherwise one after another.
        """
        source_face_indices = [1] if source_face_indices is None else source_face_indices
        target_face_indices = [1] if target_face_indices is None else target_face_indices

        source_img = cv2.imread(source_path)
        target_img = cv2.imread(target_path)
        if source_img is None or target_img is None:
            raise ValueError("Could not read one or both images")

        # Detect all faces once
        print("Detecting faces in source and target images...")
        detected_in_source = self.app.get(source_img)
        detected_in_target = self.app.get(target_img)

        def left_edge(face):
            return face.bbox[0]

        source_faces = sorted(detected_in_source, key=left_edge)
        target_faces = sorted(detected_in_target, key=left_edge)

        source_count = len(source_face_indices)
        target_count = len(target_face_indices)

        if self.gpu_enabled and source_count * target_count > 1:
            print(f"Processing {source_count}x{target_count} combinations in parallel on GPU...")
            return self._process_parallel_swaps(
                source_img, target_img, source_faces, target_faces,
                source_face_indices, target_face_indices, swap_hair
            )

        # Single combination or CPU mode: run the swaps one by one.
        print(f"Processing {source_count}x{target_count} combinations sequentially...")
        results = []
        combinations = [(s_idx, t_idx) for s_idx in source_face_indices for t_idx in target_face_indices]
        for s_idx, t_idx in combinations:
            try:
                swapped = self._swap_single_face(
                    source_img, target_img, source_faces, target_faces,
                    s_idx, t_idx, swap_hair
                )
            except Exception as e:
                print(f"Failed to swap source face {s_idx} with target face {t_idx}: {e}")
            else:
                results.append(swapped)
        return results
    
    def _process_parallel_swaps(self, source_img, target_img, source_faces, target_faces,
                               source_indices, target_indices, swap_hair):
        """
        Run every (source index, target index) swap on a small thread pool.

        Each task works on its own copies of the two images. Results are
        collected in submission order (source index outer, target index
        inner); combinations that fail or do not finish within 30 seconds are
        reported and left out.

        Returns:
            List of result images, empty when there is nothing to process.
        """
        combinations = [(s_idx, t_idx) for s_idx in source_indices for t_idx in target_indices]
        if not combinations:
            # ThreadPoolExecutor rejects max_workers=0, so handle the empty
            # case before creating the pool.
            return []

        def process_combination(s_idx, t_idx):
            try:
                return self._swap_single_face(
                    source_img.copy(), target_img.copy(),
                    source_faces, target_faces, s_idx, t_idx, swap_hair
                )
            except Exception as e:
                print(f"Parallel swap failed for {s_idx}x{t_idx}: {e}")
                return None

        max_workers = min(4, len(combinations))  # Limit for RX 5500 XT

        results = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                (executor.submit(process_combination, s_idx, t_idx), s_idx, t_idx)
                for s_idx, t_idx in combinations
            ]

            # NOTE: the timeout only bounds how long we wait for each result;
            # leaving the `with` block still waits for running tasks to end.
            for future, s_idx, t_idx in futures:
                try:
                    result = future.result(timeout=30)  # 30 second timeout per swap
                except concurrent.futures.TimeoutError:
                    print(f"Timeout swapping source face {s_idx} with target face {t_idx}")
                except Exception as e:
                    print(f"Error in parallel processing {s_idx}x{t_idx}: {e}")
                else:
                    if result is not None:
                        results.append(result)
                        print(f"Completed swap: Source {s_idx} -> Target {t_idx}")

        return results
    
    def _swap_single_face(self, source_img, target_img, source_faces, target_faces,
                         source_idx, target_idx, swap_hair):
        """
        Single face swap with all enhancements, on already-detected faces.

        Args:
            source_img: Source image the source faces were detected in.
            target_img: Target image the target faces were detected in.
            source_faces: Detected source faces, in the caller's order.
            target_faces: Detected target faces, in the caller's order.
            source_idx: 1-based index into ``source_faces``.
            target_idx: 1-based index into ``target_faces``.
            swap_hair: Also blend the source hair region onto the result.

        Returns:
            The resulting image.

        Raises:
            ValueError: A face index is out of range.
            RuntimeError: The swapper model is not loaded.
        """
        if len(source_faces) < source_idx or source_idx < 1:
            raise ValueError(f"Source image contains {len(source_faces)} faces, but requested face {source_idx}")
        if len(target_faces) < target_idx or target_idx < 1:
            raise ValueError(f"Target image contains {len(target_faces)} faces, but requested face {target_idx}")

        if getattr(self, 'swapper', None) is None:
            raise RuntimeError("Face swapping requires the insightface models, which are not loaded")

        source_face = source_faces[source_idx - 1]
        target_face = target_faces[target_idx - 1]

        # Model face swap (works from the detected face objects)
        result = self.swapper.get(target_img, target_face, source_face, paste_back=True)

        # Clamp the face box to the image: detectors may report boxes that
        # extend past the borders, and negative indices would wrap.
        img_h, img_w = target_img.shape[:2]
        bx1, by1, bx2, by2 = [int(v) for v in target_face.bbox]
        x1, x2 = max(0, min(bx1, img_w)), max(0, min(bx2, img_w))
        y1, y2 = max(0, min(by1, img_h)), max(0, min(by2, img_h))

        if x2 > x1 and y2 > y1:
            face_box = (x1, y1, x2, y2)
            swapped_face_region = result[y1:y2, x1:x2]
            target_face_region = target_img[y1:y2, x1:x2]

            enhanced_face = self.improve_color_matching(swapped_face_region, target_face_region, face_box)
            result = self.seamless_blending(enhanced_face, target_img, face_box)
        # Otherwise no part of the box is inside the image: keep the swapper's
        # own paste-back result.

        # Optional hair transplant. transplant_hair aligns the source itself
        # from source_face.kps, so it must get the ORIGINAL source image; a
        # pre-aligned image would be warped a second time.
        if swap_hair:
            try:
                result = self.transplant_hair(source_img, result, source_face, target_face)
            except Exception as e:
                # Best effort: keep the swap result if the hair step fails.
                print(f"Hair swap failed: {e}")

        return result
    
    def optimize_for_gpu_memory(self, max_faces_per_batch=4, total_vram_gb=8,
                                vram_safety_margin_gb=2, vram_per_face_gb=0.5):
        """
        Pick a batch size that fits the GPU's memory budget.

        The defaults describe an 8GB card (RX 5500 XT) with 2GB reserved for
        the system and an estimated 0.5GB per high-res face. These are
        estimates, not measurements.

        Args:
            max_faces_per_batch: Upper bound requested by the caller.
            total_vram_gb: Total VRAM of the GPU, in GB.
            vram_safety_margin_gb: VRAM kept free for the system, in GB.
            vram_per_face_gb: Estimated VRAM needed per face, in GB.

        Returns:
            ``max_faces_per_batch`` unchanged when GPU mode is off; otherwise
            the smaller of ``max_faces_per_batch`` and the number of faces
            that fit in the available VRAM (never less than one face for the
            VRAM-derived limit).

        Raises:
            ValueError: ``vram_per_face_gb`` is not positive.
        """
        if not self.gpu_enabled:
            return max_faces_per_batch

        if vram_per_face_gb <= 0:
            raise ValueError("vram_per_face_gb must be positive")

        available_vram = total_vram_gb - vram_safety_margin_gb

        # Floor the VRAM-derived limit at 1 so a tight budget still lets work
        # proceed one face at a time instead of yielding an empty batch.
        faces_that_fit = max(1, int(available_vram / vram_per_face_gb))
        optimal_batch_size = min(max_faces_per_batch, faces_that_fit)

        print(f"GPU VRAM optimization: {available_vram}GB available, batch size: {optimal_batch_size}")
        return optimal_batch_size

    def count_faces(self, img_path):
        """
        Counts the number of faces in the given image file.

        Detection uses OpenCV's frontal-face Haar cascade, independently of
        the insightface models.
        NOTE(review): this count may differ from the number of faces the
        insightface detector finds for the same image — confirm before using
        it to validate face indices.

        Args:
            img_path: Path of the image to inspect.

        Returns:
            Number of detected faces.

        Raises:
            ValueError: The image cannot be read.
            RuntimeError: The Haar cascade file cannot be loaded.
        """
        img = cv2.imread(img_path)
        if img is None:
            # imread signals failure by returning None rather than raising
            raise ValueError(f"Could not read image: {img_path}")

        face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
        if face_cascade.empty():
            raise RuntimeError("Could not load the Haar cascade face detector")

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.1, 4)
        return len(faces)

def main():
    """
    Command-line demo: swap face 1 of SinglePhoto/data_src.jpg onto a
    user-chosen face of SinglePhoto/data_dst.jpg and save the result to
    SinglePhoto/output/swapped_face.jpg.

    Errors during the swap or while saving are printed, not raised.
    """
    # Paths relative to root
    source_path = os.path.join("SinglePhoto", "data_src.jpg")
    target_path = os.path.join("SinglePhoto", "data_dst.jpg")
    output_dir = os.path.join("SinglePhoto", "output")

    os.makedirs(output_dir, exist_ok=True)

    swapper = FaceSwapper()

    try:
        # Ask user for target_face_idx, default to 1 if no input or invalid input
        try:
            user_input = input("Enter the target face index (starting from 1, default is 1): ")
            target_face_idx = int(user_input) if user_input.strip() else 1
            if target_face_idx < 1:
                print("Invalid index. Using default value 1.")
                target_face_idx = 1
        except (ValueError, EOFError):
            # EOFError: stdin is closed (non-interactive run)
            print("Invalid input. Using default value 1.")
            target_face_idx = 1

        try:
            result = swapper.swap_faces(
                source_path=source_path,
                source_face_idx=1,
                target_path=target_path,
                target_face_idx=target_face_idx,
                swap_hair=True  # Enabled for testing
            )
        except ValueError as ve:
            # Retry with face 1 only when a different target index was asked
            # for; retrying the same index would just fail the same way.
            if target_face_idx == 1 or "Target image contains" not in str(ve):
                raise
            print(f"Target face idx {target_face_idx} not found, trying with idx 1.")
            result = swapper.swap_faces(
                source_path=source_path,
                source_face_idx=1,
                target_path=target_path,
                target_face_idx=1,
                swap_hair=True
            )

        output_path = os.path.join(output_dir, "swapped_face.jpg")
        # imwrite reports failure through its return value
        if not cv2.imwrite(output_path, result):
            raise IOError(f"Could not write result image to: {output_path}")
        print(f"Face swap completed successfully. Result saved to: {output_path}")

    except Exception as e:
        print(f"Error occurred: {str(e)}")

# Run the command-line demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()