javipd99 committed
Commit c019b27 · 1 Parent(s): dd5f34a

Initial commit: Add eneas application files

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.
Files changed (50)
  1. README.md +6 -8
  2. app.py +842 -0
  3. eneas/__init__.py +23 -0
  4. eneas/__main__.py +8 -0
  5. eneas/__pycache__/__init__.cpython-312.pyc +0 -0
  6. eneas/__pycache__/__init__.cpython-313.pyc +0 -0
  7. eneas/__pycache__/__main__.cpython-312.pyc +0 -0
  8. eneas/__pycache__/cli.cpython-312.pyc +0 -0
  9. eneas/cli.py +576 -0
  10. eneas/segmentation/__init__.py +16 -0
  11. eneas/segmentation/__pycache__/__init__.cpython-312.pyc +0 -0
  12. eneas/segmentation/__pycache__/__init__.cpython-313.pyc +0 -0
  13. eneas/segmentation/__pycache__/generic_category.cpython-312.pyc +0 -0
  14. eneas/segmentation/__pycache__/generic_category.cpython-313.pyc +0 -0
  15. eneas/segmentation/__pycache__/model_manager.cpython-312.pyc +0 -0
  16. eneas/segmentation/__pycache__/model_manager.cpython-313.pyc +0 -0
  17. eneas/segmentation/__pycache__/types.cpython-312.pyc +0 -0
  18. eneas/segmentation/__pycache__/types.cpython-313.pyc +0 -0
  19. eneas/segmentation/__pycache__/unique_instance.cpython-312.pyc +0 -0
  20. eneas/segmentation/__pycache__/unique_instance.cpython-313.pyc +0 -0
  21. eneas/segmentation/__pycache__/utils.cpython-312.pyc +0 -0
  22. eneas/segmentation/__pycache__/utils.cpython-313.pyc +0 -0
  23. eneas/segmentation/generic_category.py +1072 -0
  24. eneas/segmentation/model_manager.py +126 -0
  25. eneas/segmentation/types.py +28 -0
  26. eneas/segmentation/unique_instance.py +993 -0
  27. eneas/segmentation/utils.py +418 -0
  28. eneas/vendor/.DS_Store +0 -0
  29. eneas/vendor/SeC/.DS_Store +0 -0
  30. eneas/vendor/SeC/LICENSE +201 -0
  31. eneas/vendor/SeC/inference/.DS_Store +0 -0
  32. eneas/vendor/SeC/inference/__pycache__/configuration_intern_vit.cpython-312.pyc +0 -0
  33. eneas/vendor/SeC/inference/__pycache__/configuration_internlm2.cpython-312.pyc +0 -0
  34. eneas/vendor/SeC/inference/__pycache__/configuration_sec.cpython-312.pyc +0 -0
  35. eneas/vendor/SeC/inference/__pycache__/flash_attention.cpython-312.pyc +0 -0
  36. eneas/vendor/SeC/inference/__pycache__/modeling_intern_vit.cpython-312.pyc +0 -0
  37. eneas/vendor/SeC/inference/__pycache__/modeling_internlm2.cpython-312.pyc +0 -0
  38. eneas/vendor/SeC/inference/__pycache__/modeling_sec.cpython-312.pyc +0 -0
  39. eneas/vendor/SeC/inference/__pycache__/sam2_video_predictor.cpython-312.pyc +0 -0
  40. eneas/vendor/SeC/inference/__pycache__/templates.cpython-312.pyc +0 -0
  41. eneas/vendor/SeC/inference/configuration_intern_vit.py +120 -0
  42. eneas/vendor/SeC/inference/configuration_internlm2.py +150 -0
  43. eneas/vendor/SeC/inference/configuration_phi3.py +211 -0
  44. eneas/vendor/SeC/inference/configuration_sec.py +124 -0
  45. eneas/vendor/SeC/inference/flash_attention.py +76 -0
  46. eneas/vendor/SeC/inference/modeling_intern_vit.py +364 -0
  47. eneas/vendor/SeC/inference/modeling_internlm2.py +1429 -0
  48. eneas/vendor/SeC/inference/modeling_phi3.py +1610 -0
  49. eneas/vendor/SeC/inference/modeling_sec.py +857 -0
  50. eneas/vendor/SeC/inference/sam2/__init__.py +14 -0
README.md CHANGED
@@ -1,14 +1,12 @@
---
- title: Eneas
- emoji: 📉
- colorFrom: gray
- colorTo: blue
+ title: ENEAS
+ emoji:
+ colorFrom: blue
+ colorTo: pink
sdk: gradio
- sdk_version: 6.2.0
+ sdk_version: 6.0.2
app_file: app.py
pinned: false
- license: apache-2.0
- short_description: Segmentator
---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,842 @@
1
+ import gradio as gr
2
+ import os
3
+ import cv2
4
+ import shutil
5
+ import tempfile
6
+ import numpy as np
7
+ import subprocess
8
+ import time
9
+ import threading
10
+ import torch
11
+ import sys
12
+ import logging
13
+ from PIL import Image
14
+
15
+ # ===========================================
16
+ # LOGGING CONFIGURATION
17
+ # ===========================================
18
+ logging.basicConfig(
19
+ level=logging.INFO,
20
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
21
+ handlers=[logging.StreamHandler(sys.stdout)]
22
+ )
23
+ logger = logging.getLogger(__name__)
24
+
25
+ # Ensure Python sees the local 'eneas' folder
26
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
27
+
28
+ import spaces
29
+
30
+ try:
31
+ from eneas.segmentation import UniqueInstanceSegmenter, GenericCategorySegmenter
32
+ from eneas.segmentation.model_manager import ModelManager
33
+ except ImportError as e:
34
+ logger.error(f"Error importing ENEAS: {e}")
35
+ raise e
36
+
37
+ # ===========================================
38
+ # CONSTANTS
39
+ # ===========================================
40
+ MAX_FRAMES = 150 # Limit frames to avoid ZeroGPU Timeout (~1s/frame processing)
41
+ OLLAMA_HOST = "127.0.0.1:11434"
42
+ OLLAMA_URL = f"http://{OLLAMA_HOST}"
43
+ OLLAMA_BIN = "./bin/ollama"
44
+
45
+ VLM_MODELS = [
46
+ "qwen3-vl:4b-instruct-q8_0",
47
+ "qwen3-vl:2b-instruct-q8_0"
48
+ ]
49
+
50
+ OUTPUT_BASE_DIR = "gradio_outputs"
51
+ os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)
52
+
53
+
54
+ # ===========================================
55
+ # OLLAMA FUNCTIONS (FOR USE INSIDE @spaces.GPU)
56
+ # ===========================================
57
+ def get_ollama_env():
58
+ """Get environment variables for Ollama process with GPU support."""
59
+ env = os.environ.copy()
60
+ env["OLLAMA_HOST"] = OLLAMA_HOST
61
+ env["OLLAMA_ORIGINS"] = "*"
62
+ env["HOME"] = os.getcwd()
63
+
64
+ # Add local lib path for the extracted binary
65
+ cwd = os.getcwd()
66
+ lib_path = f"{cwd}/lib"
67
+ if "LD_LIBRARY_PATH" in env:
68
+ env["LD_LIBRARY_PATH"] += f":{lib_path}"
69
+ else:
70
+ env["LD_LIBRARY_PATH"] = lib_path
71
+
72
+ return env
73
+
74
+
75
+ def is_ollama_server_running() -> bool:
76
+ """Check if Ollama server is responding."""
77
+ try:
78
+ result = subprocess.run(
79
+ ["curl", "-s", "-o", "/dev/null", "-w", "%{http_code}", OLLAMA_URL],
80
+ capture_output=True,
81
+ text=True,
82
+ timeout=5
83
+ )
84
+ return result.stdout.strip() == "200"
85
+ except Exception:
86
+ return False
87
+
88
+
89
+ def start_ollama_server_gpu():
90
+ """
91
+ Start Ollama server INSIDE @spaces.GPU context.
92
+ This ensures Ollama detects and uses the GPU.
93
+
94
+ Returns:
95
+ bool: True if server started successfully
96
+ """
97
+ if is_ollama_server_running():
98
+ logger.info("Ollama server is already running.")
99
+ return True
100
+
101
+ logger.info("Starting Ollama server inside GPU context...")
102
+
103
+ try:
104
+ env = get_ollama_env()
105
+
106
+ # Start server as background process
107
+ process = subprocess.Popen(
108
+ [OLLAMA_BIN, "serve"],
109
+ env=env,
110
+ stdout=subprocess.PIPE,
111
+ stderr=subprocess.PIPE
112
+ )
113
+
114
+ # Wait for server to be ready (max 30 seconds)
115
+ max_retries = 30
116
+ for i in range(max_retries):
117
+ if is_ollama_server_running():
118
+ logger.info(f"Ollama server started successfully in {i+1} seconds.")
119
+ return True
120
+ time.sleep(1)
121
+
122
+ logger.error("Ollama server failed to start within 30 seconds.")
123
+ return False
124
+
125
+ except Exception as e:
126
+ logger.error(f"Failed to start Ollama server: {e}")
127
+ return False
128
+
129
+
130
+ def load_model_into_vram(model_name: str) -> bool:
131
+ """
132
+ Load model into VRAM for faster inference.
133
+ Uses keep_alive=-1 to keep model loaded.
134
+
135
+ Args:
136
+ model_name: Name of the Ollama model to load
137
+
138
+ Returns:
139
+ bool: True if model loaded successfully
140
+ """
141
+ logger.info(f"Loading model {model_name} into VRAM...")
142
+
143
+ try:
144
+ # Send a minimal request to trigger model loading
145
+ result = subprocess.run(
146
+ [
147
+ "curl", "-s", f"{OLLAMA_URL}/api/generate",
148
+ "-d", f'{{"model": "{model_name}", "prompt": "hi", "stream": false}}'
149
+ ],
150
+ capture_output=True,
151
+ text=True,
152
+ timeout=120 # Model loading can take time
153
+ )
154
+
155
+ if "error" in result.stdout.lower():
156
+ logger.error(f"Error loading model: {result.stdout}")
157
+ return False
158
+
159
+ # Set keep_alive to -1 to keep model in VRAM
160
+ subprocess.run(
161
+ [
162
+ "curl", "-s", f"{OLLAMA_URL}/api/generate",
163
+ "-d", f'{{"model": "{model_name}", "keep_alive": -1}}'
164
+ ],
165
+ capture_output=True,
166
+ timeout=10
167
+ )
168
+
169
+ logger.info(f"Model {model_name} loaded into VRAM successfully.")
170
+ return True
171
+
172
+ except subprocess.TimeoutExpired:
173
+ logger.error("Timeout while loading model into VRAM.")
174
+ return False
175
+ except Exception as e:
176
+ logger.error(f"Error loading model into VRAM: {e}")
177
+ return False
178
+
179
+
180
+ def log_active_models():
181
+ """Log which models are currently loaded in VRAM (not just on disk)."""
182
+ try:
183
+ result = subprocess.run(
184
+ ["curl", "-s", f"{OLLAMA_URL}/api/ps"],
185
+ capture_output=True,
186
+ text=True,
187
+ timeout=5
188
+ )
189
+ logger.info(f"Active models in VRAM: {result.stdout}")
190
+ except Exception as e:
191
+ logger.warning(f"Could not get active models: {e}")
192
+
193
+
194
+ def ensure_ollama_ready_gpu(model_name: str) -> bool:
195
+ """
196
+ Main function to ensure Ollama is fully ready with GPU support.
197
+ MUST be called inside @spaces.GPU decorated function.
198
+
199
+ This function:
200
+ 1. Starts Ollama server (which will detect GPU)
201
+ 2. Loads the specified model into VRAM
202
+ 3. Logs which model is active
203
+
204
+ Args:
205
+ model_name: Name of the Ollama model to use
206
+
207
+ Returns:
208
+ bool: True if ready
209
+
210
+ Raises:
211
+ RuntimeError: If setup fails
212
+ """
213
+ logger.info(f"Ensuring Ollama is ready with GPU for model: {model_name}")
214
+
215
+ # Step 1: Start server (will detect GPU since we're inside @spaces.GPU)
216
+ if not start_ollama_server_gpu():
217
+ raise RuntimeError("Failed to start Ollama server with GPU")
218
+
219
+ # Step 2: Load model into VRAM
220
+ if not load_model_into_vram(model_name):
221
+ raise RuntimeError(f"Failed to load model {model_name} into VRAM")
222
+
223
+ # Step 3: Log which model is actually active in VRAM
224
+ log_active_models()
225
+
226
+ logger.info("Ollama is ready with GPU support!")
227
+ return True
228
+
229
+
230
+ # ===========================================
231
+ # STARTUP: DOWNLOAD BINARY AND MODELS (CPU)
232
+ # ===========================================
233
+ def download_ollama_binary():
234
+ """Download Ollama binary if not present."""
235
+ if os.path.exists(OLLAMA_BIN):
236
+ logger.info("Ollama binary already exists.")
237
+ return True
238
+
239
+ logger.info("Downloading Ollama binary (TGZ)...")
240
+ try:
241
+ subprocess.run(
242
+ ["curl", "-L", "https://ollama.com/download/ollama-linux-amd64.tgz", "-o", "ollama.tgz"],
243
+ check=True,
244
+ timeout=300
245
+ )
246
+ subprocess.run(["tar", "-xzf", "ollama.tgz"], check=True)
247
+ subprocess.run(["chmod", "+x", OLLAMA_BIN], check=True)
248
+ os.remove("ollama.tgz") # Cleanup
249
+ logger.info("Ollama binary downloaded and extracted successfully.")
250
+ return True
251
+ except Exception as e:
252
+ logger.error(f"Failed to download Ollama binary: {e}")
253
+ return False
254
+
255
+
256
+ def pull_ollama_models():
257
+ """
258
+ Pull Ollama models at startup (runs on CPU).
259
+ This pre-downloads the models so they're ready when GPU is available.
260
+ """
261
+ logger.info("Pre-downloading Ollama models...")
262
+
263
+ # Need to temporarily start server to pull models
264
+ env = get_ollama_env()
265
+
266
+ # Start server temporarily
267
+ server_process = subprocess.Popen(
268
+ [OLLAMA_BIN, "serve"],
269
+ env=env,
270
+ stdout=subprocess.PIPE,
271
+ stderr=subprocess.PIPE
272
+ )
273
+
274
+ # Wait for server
275
+ time.sleep(5)
276
+ for _ in range(20):
277
+ if is_ollama_server_running():
278
+ break
279
+ time.sleep(1)
280
+
281
+ # Pull each model
282
+ for model in VLM_MODELS:
283
+ logger.info(f"Pulling model: {model}")
284
+ try:
285
+ subprocess.run(
286
+ [OLLAMA_BIN, "pull", model],
287
+ env=env,
288
+ timeout=600,
289
+ capture_output=True
290
+ )
291
+ logger.info(f"Model {model} pulled successfully.")
292
+ except Exception as e:
293
+ logger.warning(f"Failed to pull model {model}: {e}")
294
+
295
+ # Stop server (we'll restart it inside GPU context later)
296
+ server_process.terminate()
297
+ try:
298
+ server_process.wait(timeout=5)
299
+ except subprocess.TimeoutExpired:
300
+ server_process.kill()
301
+
302
+ logger.info("Ollama models pre-download complete.")
303
+
304
+
305
+ def setup_ollama_startup():
306
+ """Setup Ollama at startup: download binary and pull models."""
307
+ download_ollama_binary()
308
+ pull_ollama_models()
309
+
310
+
311
+ def setup_hf_models():
312
+ """
313
+ Downloads heavy HuggingFace models to disk at startup.
314
+ This prevents ZeroGPU timeouts during the first inference.
315
+ """
316
+ logger.info("Starting HuggingFace models download (Warm-up)...")
317
+ try:
318
+ manager = ModelManager()
319
+
320
+ # 1. SeC-4B (Heavy, ~15GB)
321
+ logger.info("Downloading SeC-4B...")
322
+ manager.download("OpenIXCLab/SeC-4B")
323
+
324
+ # 2. Florence-2 (Grounding)
325
+ logger.info("Downloading Florence-2...")
326
+ manager.download("microsoft/Florence-2-large")
327
+
328
+ # 3. SigLIP (For Generic Category)
329
+ logger.info("Downloading SigLIP...")
330
+ manager.download("google/siglip2-base-patch16-naflex")
331
+
332
+ # 4. SAM2 Checkpoint (Direct URL)
333
+ logger.info("Downloading SAM2 checkpoint...")
334
+ manager.download_url(
335
+ "https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt",
336
+ "sam2.1_hiera_large.pt"
337
+ )
338
+
339
+ logger.info("All HuggingFace models downloaded successfully.")
340
+ except Exception as e:
341
+ logger.error(f"Error during HF model download: {e}")
342
+
343
+
344
+ # ===========================================
345
+ # STARTUP: PARALLEL MODEL DOWNLOADS
346
+ # ===========================================
347
+ logger.info("Starting parallel model downloads at startup...")
348
+ t_hf = threading.Thread(target=setup_hf_models, daemon=True)
349
+ t_ollama = threading.Thread(target=setup_ollama_startup, daemon=True)
350
+
351
+ t_hf.start()
352
+ t_ollama.start()
353
+
354
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
355
+ logger.info(f"Main process device detection: {DEVICE}")
356
+
357
+
358
+ # ===========================================
359
+ # UTILITY FUNCTIONS
360
+ # ===========================================
361
+ def process_inputs_to_frames(input_data, output_folder: str) -> tuple:
362
+ """
363
+ Extracts frames from video (1 FPS) or copies images to output folder.
364
+ Enforces MAX_FRAMES limit to prevent ZeroGPU timeouts.
365
+
366
+ Args:
367
+ input_data: Video file or list of image files
368
+ output_folder: Directory to save extracted frames
369
+
370
+ Returns:
371
+ tuple: (output_folder path, list of frame file paths)
372
+ """
373
+ if os.path.exists(output_folder):
374
+ shutil.rmtree(output_folder)
375
+ os.makedirs(output_folder)
376
+
377
+ frame_paths = []
378
+ video_extensions = {'.mp4', '.avi', '.mov', '.mkv', '.webm'}
379
+
380
+ input_list = input_data if isinstance(input_data, list) else [input_data]
381
+ if not input_list:
382
+ return output_folder, []
383
+
384
+ first_file = input_list[0].name if hasattr(input_list[0], 'name') else str(input_list[0])
385
+ ext = os.path.splitext(first_file)[1].lower()
386
+
387
+ if ext in video_extensions:
388
+ # Process video file
389
+ logger.info(f"Processing video: {first_file}...")
390
+ cap = cv2.VideoCapture(first_file)
391
+ video_fps = cap.get(cv2.CAP_PROP_FPS)
392
+ total_frames_original = cap.get(cv2.CAP_PROP_FRAME_COUNT)
393
+
394
+ if video_fps == 0 or np.isnan(video_fps):
395
+ video_fps = 30
396
+
397
+ duration_sec = total_frames_original / video_fps
398
+
399
+ # Validate video duration
400
+ if duration_sec > MAX_FRAMES:
401
+ cap.release()
402
+ msg = f"Video is too long ({int(duration_sec)}s). Max allowed is {MAX_FRAMES}s to avoid ZeroGPU timeout."
403
+ logger.error(msg)
404
+ raise gr.Error(msg)
405
+
406
+ # Sample at 1 FPS
407
+ frame_interval = max(1, int(video_fps))
408
+ count = 0
409
+ saved_count = 0
410
+
411
+ while cap.isOpened():
412
+ ret, frame = cap.read()
413
+ if not ret:
414
+ break
415
+
416
+ if count % frame_interval == 0:
417
+ filename = f"frame_{saved_count:05d}.jpg"
418
+ filepath = os.path.join(output_folder, filename)
419
+ cv2.imwrite(filepath, frame)
420
+ frame_paths.append(filepath)
421
+ saved_count += 1
422
+
423
+ if saved_count > MAX_FRAMES:
424
+ cap.release()
425
+ raise gr.Error(f"Limit reached: > {MAX_FRAMES} frames extracted.")
426
+
427
+ count += 1
428
+ cap.release()
429
+ logger.info(f"Video sampled at 1 FPS. Total frames: {saved_count}")
430
+
431
+ else:
432
+ # Process image files
433
+ if len(input_list) > MAX_FRAMES:
434
+ raise gr.Error(f"Too many images! You uploaded {len(input_list)}. Max allowed is {MAX_FRAMES}.")
435
+
436
+ logger.info(f"Processing {len(input_list)} images...")
437
+ input_list.sort(key=lambda x: x.name if hasattr(x, 'name') else str(x))
438
+
439
+ for i, f in enumerate(input_list):
440
+ path = f.name if hasattr(f, 'name') else str(f)
441
+ try:
442
+ img = Image.open(path).convert("RGB")
443
+ filename = f"frame_{i:05d}.jpg"
444
+ filepath = os.path.join(output_folder, filename)
445
+ img.save(filepath)
446
+ frame_paths.append(filepath)
447
+ except Exception as e:
448
+ logger.warning(f"Skipping file {path}: {e}")
449
+
450
+ return output_folder, frame_paths
451
+
452
+
453
+ def create_video_overlay(frames_folder: str, masks_dict: dict, output_path: str, fps: int = 5) -> str:
454
+ """
455
+ Creates a video from frames with segmentation masks overlaid.
456
+
457
+ Args:
458
+ frames_folder: Directory containing frame images
459
+ masks_dict: Dictionary mapping frame index to mask arrays
460
+ output_path: Output video file path
461
+ fps: Frames per second for output video
462
+
463
+ Returns:
464
+ Output video path or None if failed
465
+ """
466
+ logger.info("Generating result video...")
467
+ frame_files = sorted([f for f in os.listdir(frames_folder) if f.endswith(".jpg")])
468
+
469
+ if not frame_files:
470
+ return None
471
+
472
+ first_frame = cv2.imread(os.path.join(frames_folder, frame_files[0]))
473
+ height, width, _ = first_frame.shape
474
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
475
+ out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
476
+
477
+ # Orange/Gold color for mask overlay
478
+ mask_color = np.array([255, 100, 0], dtype=np.uint8)
479
+
480
+ for i, filename in enumerate(frame_files):
481
+ frame = cv2.imread(os.path.join(frames_folder, filename))
482
+ mask_overlay = np.zeros_like(frame)
483
+
484
+ if i in masks_dict:
485
+ masks_data = masks_dict[i]
486
+ masks_list = [masks_data] if isinstance(masks_data, np.ndarray) else (
487
+ masks_data if isinstance(masks_data, list) else []
488
+ )
489
+ for mask in masks_list:
490
+ mask_overlay[mask > 0] = mask_color
491
+
492
+ if np.any(mask_overlay):
493
+ frame = cv2.addWeighted(frame, 1, mask_overlay, 0.5, 0)
494
+
495
+ out.write(frame)
496
+
497
+ out.release()
498
+ return output_path
499
+
500
+
501
+ # ===========================================
502
+ # UNIQUE INSTANCE SEGMENTATION
503
+ # ===========================================
504
+ def process_unique_upload(input_files):
505
+ """
506
+ Process uploaded files for Unique Instance segmentation.
507
+ Extracts frames and prepares the UI for annotation.
508
+ """
509
+ if not input_files:
510
+ return None, None, [], "Please upload files first.", gr.Slider(value=0, maximum=0, visible=False)
511
+
512
+ temp_dir = tempfile.mkdtemp()
513
+ frames_dir, frame_paths = process_inputs_to_frames(input_files, temp_dir)
514
+ num_frames = len(frame_paths)
515
+
516
+ if num_frames == 0:
517
+ return None, None, [], "No frames extracted.", gr.Slider(value=0, maximum=0, visible=False)
518
+
519
+ new_slider = gr.Slider(
520
+ value=0,
521
+ minimum=0,
522
+ maximum=num_frames - 1,
523
+ step=1,
524
+ visible=True,
525
+ interactive=True,
526
+ label=f"Select Reference Frame (0 - {num_frames - 1})"
527
+ )
528
+
529
+ return frame_paths[0], frames_dir, [], f"Processed {num_frames} frames (1 FPS). Select target.", new_slider
530
+
531
+
532
+ def update_canvas_from_slider(frame_idx, frames_dir):
533
+ """Update the displayed frame when slider changes."""
534
+ if not frames_dir or not os.path.exists(frames_dir):
535
+ return None, []
536
+
537
+ filename = f"frame_{int(frame_idx):05d}.jpg"
538
+ path = os.path.join(frames_dir, filename)
539
+
540
+ if os.path.exists(path):
541
+ img = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
542
+ return img, []
543
+
544
+ return None, []
545
+
546
+
547
+ def add_point(img, evt: gr.SelectData, points_state):
548
+ """Add a point annotation to the image."""
549
+ x, y = evt.index
550
+ points_state.append((x, y))
551
+
552
+ img_pil = Image.fromarray(img)
553
+ img_cv = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)
554
+
555
+ # Draw markers for all points
556
+ for px, py in points_state:
557
+ cv2.drawMarker(
558
+ img_cv, (px, py), (0, 255, 0),
559
+ markerType=cv2.MARKER_TILTED_CROSS,
560
+ markerSize=20,
561
+ thickness=3
562
+ )
563
+
564
+ return cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB), points_state
565
+
566
+
567
+ @spaces.GPU(duration=180)
568
+ def run_unique_segmentation(input_files, points_state, text_prompt, sam_encoder, offload_gpu, cleanup_interval, frame_idx_slider):
569
+ """
570
+ Run Unique Instance segmentation on the uploaded frames.
571
+ Tracks a specific object identified by points or text description.
572
+ """
573
+ if not input_files:
574
+ return None, "Error: Process input first."
575
+
576
+ # Wait for HF models to be downloaded
577
+ if t_hf.is_alive():
578
+ logger.info("Waiting for HF models download to finish...")
579
+ t_hf.join()
580
+
581
+ try:
582
+ logger.info("Processing inputs on GPU node...")
583
+ temp_dir = tempfile.mkdtemp()
584
+ # Re-extract frames to ensure they exist on GPU ephemeral storage
585
+ frames_dir, _ = process_inputs_to_frames(input_files, temp_dir)
586
+
587
+ logger.info("Initializing UniqueInstanceSegmenter...")
588
+ segmenter = UniqueInstanceSegmenter(
589
+ sam_encoder=sam_encoder,
590
+ memory_cleanup_interval=int(cleanup_interval),
591
+ device="cuda"
592
+ )
593
+
594
+ if offload_gpu:
595
+ segmenter.optimize_cuda_memory()
596
+
597
+ annotation_frame = f"frame_{int(frame_idx_slider):05d}.jpg"
598
+
599
+ if not os.path.exists(os.path.join(frames_dir, annotation_frame)):
600
+ return None, f"Error: Frame {annotation_frame} not found."
601
+
602
+ # Run segmentation based on input type
603
+ if text_prompt.strip():
604
+ logger.info(f"Mode: Text -> {text_prompt}")
605
+ result = segmenter.segment(
606
+ frames_path=frames_dir,
607
+ text=text_prompt,
608
+ annotation_frame=annotation_frame,
609
+ offload_frames_to_gpu=offload_gpu
610
+ )
611
+ else:
612
+ if not points_state:
613
+ return None, "Please add points or text."
614
+ logger.info(f"Mode: Points -> {points_state}")
615
+ result = segmenter.segment(
616
+ frames_path=frames_dir,
617
+ points=points_state,
618
+ annotation_frame=annotation_frame,
619
+ offload_frames_to_gpu=offload_gpu
620
+ )
621
+
622
+ output_vid = os.path.join(OUTPUT_BASE_DIR, "unique_output.mp4")
623
+ return create_video_overlay(frames_dir, result.masks, output_vid), f"Completed. {result.num_frames} frames processed."
624
+
625
+ except Exception as e:
626
+ import traceback
627
+ traceback.print_exc()
628
+ logger.error(str(e))
629
+ if isinstance(e, gr.Error):
630
+ raise e
631
+ return None, f"Error: {str(e)}"
632
+
633
+
634
+ # ===========================================
635
+ # GENERIC CATEGORY SEGMENTATION
636
+ # ===========================================
637
+ @spaces.GPU(duration=180)
638
+ def run_generic_segmentation(input_files, category, accept_thresh, reject_thresh, vlm_model_name):
639
+ """
640
+ Run Generic Category segmentation on the uploaded frames.
641
+ Detects all instances of a specified category using VLM + segmentation.
642
+
643
+ IMPORTANT: This function starts Ollama server INSIDE the GPU context,
644
+ ensuring that Ollama can detect and use the GPU for inference.
645
+ """
646
+ if not input_files:
647
+ return None, "Error: Upload input."
648
+
649
+ if not category.strip():
650
+ return None, "Error: Please specify a category to detect."
651
+
652
+ # Wait for model downloads to complete
653
+ if t_hf.is_alive():
654
+ logger.info("Waiting for HF models download...")
655
+ t_hf.join()
656
+ if t_ollama.is_alive():
657
+ logger.info("Waiting for Ollama models download...")
658
+ t_ollama.join()
659
+
660
+ try:
661
+ # =========================================================
662
+ # CRITICAL: Start Ollama INSIDE @spaces.GPU context
663
+ # This ensures Ollama detects and uses the GPU!
664
+ # =========================================================
665
+ logger.info("=" * 50)
666
+ logger.info("Starting Ollama server with GPU support...")
667
+ logger.info("=" * 50)
668
+
669
+ ensure_ollama_ready_gpu(vlm_model_name)
670
+
671
+ logger.info("Ollama is running with GPU. Processing inputs...")
672
+
673
+ # Process input frames
674
+ temp_dir = tempfile.mkdtemp()
675
+ frames_dir, _ = process_inputs_to_frames(input_files, temp_dir)
676
+
677
+ logger.info(f"Initializing GenericCategorySegmenter with VLM: {vlm_model_name}")
678
+ segmenter = GenericCategorySegmenter(
679
+ device="cuda",
680
+ vlm_model=vlm_model_name
681
+ )
682
+
683
+ logger.info(f"Detecting category: {category}")
684
+ result = segmenter.segment(
685
+ frames_path=frames_dir,
686
+ category=category,
687
+ accept_threshold=accept_thresh,
688
+ reject_threshold=reject_thresh,
689
+ save_debug=False
690
+ )
691
+
692
+ output_vid = os.path.join(OUTPUT_BASE_DIR, "generic_output.mp4")
693
+ total_detections = sum(len(d) for d in result.metadata['detections'].values())
694
+
695
+ return create_video_overlay(frames_dir, result.masks, output_vid), f"Completed! Total detections: {total_detections}"
696
+
697
+ except Exception as e:
698
+ import traceback
699
+ traceback.print_exc()
700
+ logger.error(f"Generic segmentation error: {e}")
701
+ if isinstance(e, gr.Error):
702
+ raise e
703
+ return None, f"Error: {e}"
704
+
705
+
706
+ # ===========================================
707
+ # GRADIO UI
708
+ # ===========================================
709
+ with gr.Blocks(title="ENEAS: Embedding-guided Neural Ensemble for Adaptive Segmentation") as demo:
710
+ gr.Markdown(
711
+ f"""
712
+ # ⚡ ENEAS: Embedding-guided Neural Ensemble for Adaptive Segmentation
713
+
714
+ **⚠️ IMPORTANT LIMITS:**
715
+ - Maximum **{MAX_FRAMES} FRAMES** to prevent ZeroGPU timeouts
716
+ - Videos are sampled at **1 FPS** → Max **{MAX_FRAMES} seconds** of video
717
+ - Exceeding these limits will stop execution
718
+
719
+ **💡 Note:** Generic Category detection uses Ollama VLM with GPU acceleration.
720
+ First request may take ~15-20 seconds to initialize the server.
721
+ """
722
+ )
723
+
724
+ with gr.Tabs():
725
+ # ===========================================
726
+ # TAB 1: UNIQUE INSTANCE SEGMENTATION
727
+ # ===========================================
728
+ with gr.Tab("🎯 Unique Instance"):
729
+ gr.Markdown("Track a specific object. Upload Video (1 FPS extraction) OR Images.")
730
+
731
+ with gr.Row():
732
+ with gr.Column(scale=1):
733
+ u_file = gr.File(
734
+ label="Input (Video or Images)",
735
+ file_count="multiple",
736
+ file_types=["video", "image"]
737
+ )
738
+ u_btn_proc = gr.Button("▶️ 1. Process Input (Extract 1 FPS)", variant="secondary")
739
+ u_slider = gr.Slider(label="Frame Selector", visible=False)
740
+
741
+ with gr.Accordion("Advanced Options", open=False):
742
+ u_enc = gr.Dropdown(
743
+ ["long-large", "long-small"],
744
+ value="long-small",
745
+ label="SAM2 Encoder"
746
+ )
747
+ u_offload = gr.Checkbox(label="GPU Memory Offload", value=False)
748
+
749
+ with gr.Column(scale=2):
750
+ u_path_frames_cpu = gr.Textbox(visible=False)
751
+ points_state = gr.State([])
752
+
753
+ u_img = gr.Image(
754
+ label="Reference Frame (Click to add points)",
755
+ interactive=True
756
+ )
757
+ u_txt = gr.Textbox(
758
+ label="Text Description (Grounding)",
759
+ placeholder="Points are ignored if text is provided."
760
+ )
761
+ u_btn_run = gr.Button("🚀 2. Run Segmentation", variant="primary")
762
+ u_out = gr.Video(label="Result")
763
+ u_status = gr.Textbox(label="Status", interactive=False)
764
+
765
+ # Event handlers
766
+ u_btn_proc.click(
767
+ process_unique_upload,
768
+ [u_file],
769
+ [u_img, u_path_frames_cpu, points_state, u_status, u_slider]
770
+ )
771
+ u_slider.change(
772
+ update_canvas_from_slider,
773
+ inputs=[u_slider, u_path_frames_cpu],
774
+ outputs=[u_img, points_state]
775
+ )
776
+ u_img.select(add_point, [u_img, points_state], [u_img, points_state])
777
+ u_btn_run.click(
778
+ run_unique_segmentation,
779
+ [u_file, points_state, u_txt, u_enc, u_offload, gr.Number(10, visible=False), u_slider],
780
+ [u_out, u_status]
781
+ )
782
+
783
+ # ===========================================
784
+ # TAB 2: GENERIC CATEGORY SEGMENTATION
785
+ # ===========================================
786
+ with gr.Tab("🔮 Generic Category"):
787
+ gr.Markdown(
788
+ f"""
789
+ Detect all instances of a category in every frame (Max {MAX_FRAMES} frames).
790
+
791
+ **🚀 GPU-Accelerated:** Ollama VLM runs on GPU for fast inference.
792
+ First request includes ~15-20s server startup time.
793
+ """
794
+ )
795
+
796
+ with gr.Row():
797
+ g_file = gr.File(
798
+ label="Input (Video or Images)",
799
+ file_count="multiple",
800
+ file_types=["video", "image"]
801
+ )
802
+ g_cat = gr.Textbox(
803
+ label="Category to Detect",
804
+ placeholder="e.g., person, chair, car, dog"
805
+ )
806
+ g_btn = gr.Button("🔍 Run Detection", variant="primary")
807
+
808
+ with gr.Accordion("Detection Settings", open=True):
809
+ g_accept = gr.Slider(
810
+ 0.0, 1.0,
811
+ value=0.30,
812
+ label="Accept Threshold",
813
+ info="Higher = more confident detections only"
814
+ )
815
+ g_reject = gr.Slider(
816
+ 0.0, 1.0,
817
+ value=0.1,
818
+ label="Reject Threshold",
819
+ info="Lower = filter out more false positives"
820
+ )
821
+ g_vlm = gr.Dropdown(
822
+ choices=VLM_MODELS,
823
+ value=VLM_MODELS[0],
824
+ label="VLM Model",
825
+ info="Larger models are more accurate but slower"
826
+ )
827
+
828
+ g_out = gr.Video(label="Result")
829
+ g_stat = gr.Textbox(label="Detection Statistics", interactive=False)
830
+
831
+ g_btn.click(
832
+ run_generic_segmentation,
833
+ [g_file, g_cat, g_accept, g_reject, g_vlm],
834
+ [g_out, g_stat]
835
+ )
836
+
837
+
838
+ # ===========================================
839
+ # MAIN ENTRY POINT
840
+ # ===========================================
841
+ if __name__ == "__main__":
842
+ demo.launch()
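
A note on the constants above: at the ~1 s/frame processing cost assumed for MAX_FRAMES, a full 150-frame run takes roughly 150 s, which fits inside the 180-second window requested by `@spaces.GPU(duration=180)`; the actual per-frame cost will vary with the chosen SAM2 encoder and hardware.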
eneas/__init__.py ADDED
@@ -0,0 +1,23 @@
+ """
+ ENEAS - Embedding-guided Neural Ensemble for Adaptive Segmentation
+
+ Frame sequence segmentation library with temporal tracking and category detection.
+
+ Provides tools for:
+ - Unique instance segmentation with temporal tracking
+ - Generic category segmentation across frames
+ """
+
+ __version__ = "0.1.0"
+
+ from .segmentation import (
+     GenericCategorySegmenter,
+     SegmentationResult,
+     UniqueInstanceSegmenter,
+ )
+
+ __all__ = [
+     "GenericCategorySegmenter",
+     "SegmentationResult",
+     "UniqueInstanceSegmenter",
+ ]
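
For orientation (not part of the committed files), a minimal sketch of how the API exported above is driven, assuming the call signature used in app.py and eneas/cli.py; the frame directory, point coordinates, and frame name below are illustrative placeholders:

```python
# Hypothetical usage sketch; paths, points, and frame names are placeholders.
from eneas import UniqueInstanceSegmenter

# Initialize the tracker (a CUDA GPU is required, per the CLI help text).
segmenter = UniqueInstanceSegmenter(
    sam_encoder="long-small",      # smaller LongSAM encoder, the Gradio UI default
    memory_cleanup_interval=10,    # CUDA memory cleanup every 10 frames
)

# Track the object under one positive point across every frame in the folder.
result = segmenter.segment(
    frames_path="./frames",              # directory of frame_00000.jpg, frame_00001.jpg, ...
    points=[(400, 300)],                 # (x, y) annotation on the reference frame
    annotation_frame="frame_00000.jpg",  # which frame the point refers to
    output_dir="./outputs",
    save_masks=True,
)
print(result.num_frames, result.output_dir)
```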
eneas/__main__.py ADDED
@@ -0,0 +1,8 @@
+ """
+ Entry point for running eneas as a module: python -m eneas
+ """
+
+ from eneas.cli import main
+
+ if __name__ == "__main__":
+     main()
eneas/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (651 Bytes).
 
eneas/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (559 Bytes).
 
eneas/__pycache__/__main__.cpython-312.pyc ADDED
Binary file (354 Bytes).
 
eneas/__pycache__/cli.cpython-312.pyc ADDED
Binary file (21.8 kB).
 
eneas/cli.py ADDED
@@ -0,0 +1,576 @@
1
+ """
2
+ ENEAS CLI - Embedding-guided Neural Ensemble for Adaptive Segmentation
3
+
4
+ Command-line interface for frame sequence segmentation.
5
+ """
6
+
7
+ import logging
8
+ import time
9
+ from pathlib import Path
10
+ from typing import Annotated
11
+
12
+ import typer
13
+
14
+ from eneas.segmentation import GenericCategorySegmenter, UniqueInstanceSegmenter
15
+
16
+ app = typer.Typer(
17
+ name="eneas",
18
+ help="ENEAS - Embedding-guided Neural Ensemble for Adaptive Segmentation",
19
+ add_completion=False,
20
+ )
21
+
22
+
23
+ def setup_logging(verbose: bool) -> None:
24
+ """Configure logging."""
25
+ level = logging.DEBUG if verbose else logging.INFO
26
+ logging.basicConfig(
27
+ level=level,
28
+ format="%(levelname)s: %(message)s",
29
+ )
30
+ # Also configure eneas loggers
31
+ logging.getLogger("eneas").setLevel(level)
32
+
33
+
34
+ def parse_points(points_str: list[str]) -> list[tuple[int, int]]:
35
+ """Parse point coordinates from CLI arguments."""
36
+ result = []
37
+ for i, point in enumerate(points_str):
38
+ parts = point.split(",")
39
+ if len(parts) != 2:
40
+ typer.echo(f"Error: Point {i + 1} must be in format 'x,y', got '{point}'", err=True)
41
+ raise typer.Exit(code=1)
42
+ try:
43
+ x, y = int(parts[0].strip()), int(parts[1].strip())
44
+ result.append((x, y))
45
+ except ValueError:
46
+ typer.echo(
47
+ f"Error: Point {i + 1} coordinates must be integers, got '{point}'", err=True
48
+ )
49
+ raise typer.Exit(code=1) from None
50
+ return result
51
+
52
+
53
+ def parse_labels(labels_str: list[str] | None, num_points: int) -> list[int]:
54
+ """Parse point labels from CLI arguments."""
55
+ if not labels_str:
56
+ # Default: all positive points
57
+ return [1] * num_points
58
+
59
+ if len(labels_str) != num_points:
60
+ typer.echo(
61
+ f"Error: Number of labels ({len(labels_str)}) must match number of points ({num_points})",
62
+ err=True,
63
+ )
64
+ raise typer.Exit(code=1)
65
+
66
+ result = []
67
+ for i, label in enumerate(labels_str):
68
+ try:
69
+ val = int(label.strip())
70
+ if val not in (0, 1):
71
+ raise ValueError
72
+ result.append(val)
73
+ except ValueError:
74
+ typer.echo(f"Error: Label {i + 1} must be 0 or 1, got '{label}'", err=True)
75
+ raise typer.Exit(code=1) from None
76
+ return result
77
+
78
+
79
+ def validate_paths(frames_path: Path, annotation_frame: str | None) -> None:
80
+ """Validate input paths exist."""
81
+ if not frames_path.exists():
82
+ typer.echo(f"Error: Frames path does not exist: {frames_path}", err=True)
83
+ raise typer.Exit(code=1)
84
+
85
+ if not frames_path.is_dir():
86
+ typer.echo(f"Error: Frames path is not a directory: {frames_path}", err=True)
87
+ raise typer.Exit(code=1)
88
+
89
+ # Check for image files
90
+ image_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"}
91
+ image_files = [f for f in frames_path.iterdir() if f.suffix.lower() in image_extensions]
92
+ if not image_files:
93
+ typer.echo(f"Error: No image files found in: {frames_path}", err=True)
94
+ raise typer.Exit(code=1)
95
+
96
+ # Validate annotation frame if specified
97
+ if annotation_frame:
98
+ annotation_path = frames_path / annotation_frame
99
+ if not annotation_path.exists():
100
+ typer.echo(f"Error: Annotation frame not found: {annotation_path}", err=True)
101
+ raise typer.Exit(code=1)
102
+
103
+
104
+ def print_banner():
105
+ """Print welcome banner."""
106
+ typer.echo("\n" + "=" * 70)
107
+ typer.echo(" eneas - Frame Sequence Segmentation with Temporal Tracking")
108
+ typer.echo("=" * 70 + "\n")
109
+
110
+
111
+ def print_config(config: dict):
112
+ """Print configuration table."""
113
+ typer.echo("Configuration:")
114
+ typer.echo("-" * 70)
115
+ for key, value in config.items():
116
+ typer.echo(f" {key:<30} {value}")
117
+ typer.echo("-" * 70 + "\n")
118
+
119
+
120
+ def print_summary_unique_instance(result, elapsed_time: float) -> None:
121
+ """Print unique instance segmentation results summary."""
122
+ typer.echo("\n" + "=" * 70)
123
+ typer.echo(" SEGMENTATION RESULTS")
124
+ typer.echo("=" * 70)
125
+ typer.echo(f" Processed Frames: {result.num_frames}")
126
+ typer.echo(
127
+ f" Processing Time: {elapsed_time:.2f}s ({result.num_frames / elapsed_time:.1f} fps)"
128
+ )
129
+ typer.echo(f" Output Directory: {result.output_dir}")
130
+
131
+ if result.initial_mask_path:
132
+ typer.echo(f" Initial Mask Visualization: {result.initial_mask_path}")
133
+
134
+ if result.mask_paths:
135
+ typer.echo(f" Saved Mask Files: {len(result.mask_paths)}")
136
+ typer.echo(f" First Mask File: {result.mask_paths[0]}")
137
+
138
+ # Metadata
139
+ metadata = result.metadata
140
+ typer.echo(f"\n Annotation Frame: {metadata['annotation_frame']}")
141
+ typer.echo(f" Segmentation Mode: {metadata['mode']}")
142
+
143
+ # Show mode-specific information
144
+ if metadata["mode"] == "text-based":
145
+ typer.echo(f" Text Description: {metadata['text']}")
146
+ typer.echo(f" Detected Bounding Box: {metadata['bbox']}")
147
+ else: # point-based
148
+ typer.echo(f" Annotation Points: {metadata['points']}")
149
+ typer.echo(f" Point Labels: {metadata['labels']}")
150
+
151
+ typer.echo("=" * 70 + "\n")
152
+
153
+
154
+ def print_summary_generic_category(result, elapsed_time: float) -> None:
155
+ """Print generic category detection results summary."""
156
+ typer.echo("\n" + "=" * 70)
157
+ typer.echo(" DETECTION RESULTS")
158
+ typer.echo("=" * 70)
159
+ typer.echo(f" Processed Frames: {result.num_frames}")
160
+ typer.echo(
161
+ f" Processing Time: {elapsed_time:.2f}s ({result.num_frames / elapsed_time:.1f} fps)"
162
+ )
163
+ typer.echo(f" Output Directory: {result.output_dir}")
164
+
165
+ # Metadata
166
+ metadata = result.metadata
167
+ typer.echo(f"\n Category: {metadata['category']}")
168
+ typer.echo(f" Accept Threshold: {metadata['accept_threshold']}")
169
+ typer.echo(f" Reject Threshold: {metadata['reject_threshold']}")
170
+
171
+ # Count total detections
172
+ total_detections = sum(len(dets) for dets in metadata["detections"].values())
173
+ typer.echo(f" Total Detections: {total_detections}")
174
+
175
+ # VLM usage statistics
176
+ typer.echo(
177
+ f"\n VLM Validation Usage: {metadata['vlm_usage_count']}/{metadata['num_frames_total']} frames ({metadata['vlm_usage_percentage']:.1f}%)"
178
+ )
179
+
180
+ typer.echo("=" * 70 + "\n")
181
+
182
+
183
+ @app.command(name="unique_instance")
184
+ def unique_instance(
185
+ frames_path: Annotated[
186
+ Path,
187
+ typer.Option(
188
+ "--frames-path",
189
+ "-i",
190
+ help="Directory containing frame sequence (images)",
191
+ exists=True,
192
+ file_okay=False,
193
+ dir_okay=True,
194
+ resolve_path=True,
195
+ ),
196
+ ],
197
+ points: Annotated[
198
+ list[str] | None,
199
+ typer.Option(
200
+ "--points",
201
+ "-p",
202
+ help="Annotation points in format 'x,y'. Can specify multiple times. Example: -p 400,300 -p 350,280",
203
+ ),
204
+ ] = None,
205
+ labels: Annotated[
206
+ list[str] | None,
207
+ typer.Option(
208
+ "--labels",
209
+ "-l",
210
+ help="Point labels: 1 (positive/foreground) or 0 (negative/background). Must match number of points",
211
+ ),
212
+ ] = None,
213
+ text: Annotated[
214
+ str | None,
215
+ typer.Option(
216
+ "--text",
217
+ "-t",
218
+ help="Text description of the object to segment (mutually exclusive with --points)",
219
+ ),
220
+ ] = None,
221
+ annotation_frame: Annotated[
222
+ str | None,
223
+ typer.Option(
224
+ "--annotation-frame",
225
+ "-f",
226
+ help="Frame filename to use for annotation. If not specified, uses first frame",
227
+ ),
228
+ ] = None,
229
+ output_dir: Annotated[
230
+ Path | None,
231
+ typer.Option(
232
+ "--output-dir",
233
+ "-o",
234
+ help="Output directory for results",
235
+ ),
236
+ ] = None,
237
+ save_masks: Annotated[
238
+ bool,
239
+ typer.Option(
240
+ "--save-masks",
241
+ help="Save binary masks (including initial_mask.jpg visualization) as PNG files to disk",
242
+ ),
243
+ ] = False,
244
+ offload_frames_to_gpu: Annotated[
245
+ bool,
246
+ typer.Option(
247
+ "--offload-frames-to-gpu",
248
+ help="Keep frames in GPU memory (faster but uses MUCH more VRAM). Default: False (CPU)",
249
+ ),
250
+ ] = False,
251
+ sam_encoder: Annotated[
252
+ str,
253
+ typer.Option(
254
+ "--sam-encoder",
255
+ "-s",
256
+ help="SAM encoder variant. LongSAM (long-*) best for temporal tracking. Options: long-large (default), long-small, long-tiny, small, tiny, etc.",
257
+ ),
258
+ ] = "long-large",
259
+ memory_cleanup_interval: Annotated[
260
+ int,
261
+ typer.Option(
262
+ "--memory-cleanup-interval",
263
+ help="CUDA memory cleanup interval (frames). Lower = less memory, slower",
264
+ ),
265
+ ] = 10,
266
+ optimize_cuda_memory: Annotated[
267
+ bool,
268
+ typer.Option(
269
+ "--optimize-cuda-memory",
270
+ help="Enable CUDA memory optimization. Useful for low-memory GPUs",
271
+ ),
272
+ ] = False,
273
+ verbose: Annotated[
274
+ bool,
275
+ typer.Option(
276
+ "--verbose",
277
+ "-v",
278
+ help="Enable verbose logging",
279
+ ),
280
+ ] = False,
281
+ save_debug: Annotated[
282
+ bool,
283
+ typer.Option(
284
+ "--save-debug",
285
+ help="Save debug visualizations (sam_debug/)",
286
+ ),
287
+ ] = False,
288
+ ):
289
+ """
290
+ Segment a unique object instance across frame sequences.
291
+
292
+ NOTE: Requires a CUDA-capable GPU. CPU and MPS are not supported.
293
+ """
294
+
295
+ setup_logging(verbose)
296
+
297
+ print_banner()
298
+
299
+ try:
300
+ if text is not None and points is not None:
301
+ typer.echo("Error: --text and --points are mutually exclusive", err=True)
302
+ raise typer.Exit(code=1)
303
+
304
+ if text is None and points is None:
305
+ typer.echo("Error: Either --text or --points must be provided", err=True)
306
+ raise typer.Exit(code=1)
307
+
308
+ if text is not None and labels is not None:
309
+ typer.echo("Error: --labels cannot be used with --text", err=True)
310
+ raise typer.Exit(code=1)
311
+
312
+ validate_paths(frames_path, annotation_frame)
313
+
314
+ if output_dir is None:
315
+ output_dir = Path("./outputs")
316
+
317
+ valid_encoders = [
318
+ "tiny",
319
+ "small",
320
+ "base",
321
+ "large",
322
+ "long-tiny",
323
+ "long-small",
324
+ "long-base",
325
+ "long-large",
326
+ "legacy-tiny",
327
+ "legacy-small",
328
+ "legacy-base",
329
+ "legacy-large",
330
+ ]
331
+ if sam_encoder not in valid_encoders:
332
+ typer.echo(f"Error: Invalid sam_encoder '{sam_encoder}'", err=True)
333
+ raise typer.Exit(code=1)
334
+
335
+ config = {
336
+ "Frames Path": str(frames_path),
337
+ "Mode": "Text-based" if text else "Point-based",
338
+ "Annotation Frame": annotation_frame or "[first frame]",
339
+ "Output Directory": str(output_dir),
340
+ "Save Masks to Disk": "Yes" if save_masks else "No",
341
+ "Frames Location": "GPU (faster, more VRAM)"
342
+ if offload_frames_to_gpu
343
+ else "CPU (slower, less VRAM)",
344
+ "SAM Encoder": sam_encoder,
345
+ "Memory Cleanup Interval": str(memory_cleanup_interval),
346
+ "CUDA Optimization": "Enabled" if optimize_cuda_memory else "Disabled",
347
+ }
348
+
349
+ if text:
350
+ config["Text"] = text
351
+ else:
352
+ parsed_points = parse_points(points)
353
+ parsed_labels = parse_labels(labels, len(parsed_points))
354
+ config["Points"] = str(parsed_points)
355
+ config["Labels"] = str(parsed_labels)
356
+
357
+ print_config(config)
358
+
359
+ typer.echo("Initializing segmenter (requires CUDA GPU)...")
360
+ segmenter = UniqueInstanceSegmenter(
361
+ sam_encoder=sam_encoder,
362
+ memory_cleanup_interval=memory_cleanup_interval,
363
+ )
364
+
365
+ if optimize_cuda_memory:
366
+ segmenter.optimize_cuda_memory()
367
+ typer.echo("✓ CUDA memory optimization enabled")
368
+
369
+ typer.echo("✓ Segmenter initialized\n")
370
+
371
+ typer.echo("Processing frames...")
372
+ start_time = time.time()
373
+
374
+ if text:
375
+ result = segmenter.segment(
376
+ frames_path=str(frames_path),
377
+ text=text,
378
+ annotation_frame=annotation_frame,
379
+ output_dir=str(output_dir),
380
+ offload_frames_to_gpu=offload_frames_to_gpu,
381
+ save_masks=save_masks,
382
+ save_debug=save_debug,
383
+ )
384
+ else:
385
+ result = segmenter.segment(
386
+ frames_path=str(frames_path),
387
+ points=parsed_points,
388
+ labels=parsed_labels,
389
+ annotation_frame=annotation_frame,
390
+ output_dir=str(output_dir),
391
+ offload_frames_to_gpu=offload_frames_to_gpu,
392
+ save_masks=save_masks,
393
+ save_debug=save_debug,
394
+ )
395
+
396
+ elapsed_time = time.time() - start_time
397
+
398
+ typer.echo("✓ Segmentation complete!\n")
399
+
400
+ print_summary_unique_instance(result, elapsed_time)
401
+
402
+ except typer.Exit:
403
+ raise
404
+ except Exception as e:
405
+ typer.echo(f"\n✗ Error: {e}\n", err=True)
406
+ if verbose:
407
+ import traceback
408
+
409
+ traceback.print_exc()
410
+ raise typer.Exit(code=1) from None
411
+
412
+
413
+ @app.command(name="generic_category")
414
+ def generic_category(
415
+ frames_path: Annotated[
416
+ Path,
417
+ typer.Option(
418
+ "--frames-path",
419
+ "-i",
420
+ help="Directory containing frame sequence (images)",
421
+ exists=True,
422
+ file_okay=False,
423
+ dir_okay=True,
424
+ resolve_path=True,
425
+ ),
426
+ ],
427
+ category: Annotated[
428
+ str,
429
+ typer.Option(
430
+ "--category",
431
+ "-c",
432
+ help="Category to detect (e.g., 'person', 'chair', 'car')",
433
+ ),
434
+ ],
435
+ output_dir: Annotated[
436
+ Path | None,
437
+ typer.Option(
438
+ "--output-dir",
439
+ "-o",
440
+ help="Output directory for results",
441
+ ),
442
+ ] = None,
443
+ accept_threshold: Annotated[
444
+ float,
445
+ typer.Option(
446
+ "--accept-threshold",
447
+ help="Image-text similarity threshold for auto-accepting boxes (0.0-1.0)",
448
+ ),
449
+ ] = 0.90,
450
+ reject_threshold: Annotated[
451
+ float,
452
+ typer.Option(
453
+ "--reject-threshold",
454
+ help="Image-text similarity threshold for auto-rejecting boxes (0.0-1.0)",
455
+ ),
456
+ ] = 0.10,
457
+ verbose: Annotated[
458
+ bool,
459
+ typer.Option(
460
+ "--verbose",
461
+ "-v",
462
+ help="Enable verbose logging",
463
+ ),
464
+ ] = False,
465
+ save_debug: Annotated[
466
+ bool,
467
+ typer.Option(
468
+ "--save-debug",
469
+ help="Save debug visualizations (grounding_debug/, image_text_debug/, vlm_debug/, sam_debug/, detections_debug/)",
470
+ ),
471
+ ] = False,
472
+ save_masks: Annotated[
473
+ bool,
474
+ typer.Option(
475
+ "--save-masks",
476
+ help="Save binary segmentation masks as PNG files to disk",
477
+ ),
478
+ ] = False,
479
+ vlm_model: Annotated[
480
+ str,
481
+ typer.Option(
482
+ "--vlm-model",
483
+ help="VLM model for validation. Options: 'qwen3-vl:2b-instruct-q8_0' (default, faster), 'qwen3-vl:4b-instruct-q8_0' (better quality)",
484
+ ),
485
+ ] = "qwen3-vl:2b-instruct-q8_0",
486
+ ):
487
+ """
488
+ Detect and segment instances of a category across frame sequences.
489
+ """
490
+
491
+ setup_logging(verbose)
492
+
493
+ print_banner()
494
+
495
+ try:
496
+ validate_paths(frames_path, annotation_frame=None)
497
+
498
+ if output_dir is None:
499
+ output_dir = Path("./outputs")
500
+
501
+ # Validate thresholds
502
+ if not 0.0 <= accept_threshold <= 1.0:
503
+ typer.echo(
504
+ f"Error: accept_threshold must be between 0.0 and 1.0, got {accept_threshold}",
505
+ err=True,
506
+ )
507
+ raise typer.Exit(code=1)
508
+
509
+ if not 0.0 <= reject_threshold <= 1.0:
510
+ typer.echo(
511
+ f"Error: reject_threshold must be between 0.0 and 1.0, got {reject_threshold}",
512
+ err=True,
513
+ )
514
+ raise typer.Exit(code=1)
515
+
516
+ if reject_threshold >= accept_threshold:
517
+ typer.echo(
518
+ f"Error: reject_threshold ({reject_threshold}) must be < accept_threshold ({accept_threshold})",
519
+ err=True,
520
+ )
521
+ raise typer.Exit(code=1)
522
+
523
+ config = {
524
+ "Frames Path": str(frames_path),
525
+ "Category": category,
526
+ "Output Directory": str(output_dir),
527
+ "Accept Threshold": f"{accept_threshold}",
528
+ "Reject Threshold": f"{reject_threshold}",
529
+ "Save Masks to Disk": "Yes" if save_masks else "No",
530
+ "VLM Model": vlm_model,
531
+ }
532
+
533
+ print_config(config)
534
+
535
+ typer.echo("Initializing detector (requires CUDA GPU)...")
536
+ segmenter = GenericCategorySegmenter(vlm_model=vlm_model)
537
+
538
+ typer.echo("✓ Detector initialized\n")
539
+
540
+ typer.echo("Processing frames...")
541
+ start_time = time.time()
542
+
543
+ result = segmenter.segment(
544
+ frames_path=str(frames_path),
545
+ category=category,
546
+ output_dir=str(output_dir),
547
+ accept_threshold=accept_threshold,
548
+ reject_threshold=reject_threshold,
549
+ save_debug=save_debug,
550
+ save_masks=save_masks,
551
+ )
552
+
553
+ elapsed_time = time.time() - start_time
554
+
555
+ typer.echo("✓ Segmentation complete!\n")
556
+
557
+ print_summary_generic_category(result, elapsed_time)
558
+
559
+ except typer.Exit:
560
+ raise
561
+ except Exception as e:
562
+ typer.echo(f"\n✗ Error: {e}\n", err=True)
563
+ if verbose:
564
+ import traceback
565
+
566
+ traceback.print_exc()
567
+ raise typer.Exit(code=1) from None
568
+
569
+
570
+ def main():
571
+ """Main entry point for the CLI."""
572
+ app()
573
+
574
+
575
+ if __name__ == "__main__":
576
+ main()
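
For reference (packaging files are not shown in this commit), the Typer app above is reachable as `python -m eneas` via eneas/__main__.py. Illustrative invocations with placeholder paths: `python -m eneas unique_instance -i ./frames -p 400,300 -p 350,280 --save-masks` for point-based tracking, and `python -m eneas generic_category -i ./frames -c chair --accept-threshold 0.9 --reject-threshold 0.1` for category detection.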
eneas/segmentation/__init__.py ADDED
@@ -0,0 +1,16 @@
+ """
+ eneas Segmentation Module
+
+ Provides frame sequence segmentation tools for unique instance tracking
+ and generic category detection.
+ """
+
+ from .generic_category import GenericCategorySegmenter
+ from .types import SegmentationResult
+ from .unique_instance import UniqueInstanceSegmenter
+
+ __all__ = [
+     "UniqueInstanceSegmenter",
+     "GenericCategorySegmenter",
+     "SegmentationResult",
+ ]
eneas/segmentation/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (556 Bytes).
 
eneas/segmentation/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (557 Bytes).
 
eneas/segmentation/__pycache__/generic_category.cpython-312.pyc ADDED
Binary file (44.1 kB).
 
eneas/segmentation/__pycache__/generic_category.cpython-313.pyc ADDED
Binary file (21.7 kB).
 
eneas/segmentation/__pycache__/model_manager.cpython-312.pyc ADDED
Binary file (4.92 kB).
 
eneas/segmentation/__pycache__/model_manager.cpython-313.pyc ADDED
Binary file (7.15 kB).
 
eneas/segmentation/__pycache__/types.cpython-312.pyc ADDED
Binary file (1.35 kB).
 
eneas/segmentation/__pycache__/types.cpython-313.pyc ADDED
Binary file (1.38 kB).
 
eneas/segmentation/__pycache__/unique_instance.cpython-312.pyc ADDED
Binary file (39.9 kB).
 
eneas/segmentation/__pycache__/unique_instance.cpython-313.pyc ADDED
Binary file (36.9 kB).
 
eneas/segmentation/__pycache__/utils.cpython-312.pyc ADDED
Binary file (15.1 kB).
 
eneas/segmentation/__pycache__/utils.cpython-313.pyc ADDED
Binary file (10.4 kB).
 
eneas/segmentation/generic_category.py ADDED
@@ -0,0 +1,1072 @@
1
+ """
2
+ GenericCategorySegmenter - Generic category segmentation for multiple instances.
3
+
4
+ Based on Florence-2 for object detection and grounding.
5
+ """
6
+
7
+ import base64
8
+ import io
9
+ import logging
10
+ import os
11
+ import shutil
12
+ import time
13
+ from pathlib import Path
14
+
15
+ import cv2
16
+ import numpy as np
17
+ import torch
18
+ from PIL import Image
19
+
20
+ from .model_manager import ModelManager
21
+ from .types import SegmentationResult
22
+ from .utils import (
23
+ draw_bboxes,
24
+ expand_crop_to_minimum_size,
25
+ mask_overlapping_boxes,
26
+ non_max_suppression,
27
+ smart_convert_to_plural,
28
+ smart_convert_to_singular,
29
+ )
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
+ class GenericCategorySegmenter:
35
+ """
36
+ Segmenter for generic categories (multiple instances per frame).
37
+
38
+ Use cases:
39
+ - "all chairs"
40
+ - "all cars"
41
+ - "all people"
42
+
43
+ Handles multiple distinct instances, which can vary from frame to frame.
44
+ No temporal tracking - each frame is processed independently.
45
+
46
+ Returns binary masks (black & white) for each detected instance.
47
+
48
+ Example:
49
+ >>> from eneas.segmentation import GenericCategorySegmenter
50
+ >>> segmenter = GenericCategorySegmenter()
51
+ >>> result = segmenter.segment(
52
+ ... frames_path="/path/to/frames",
53
+ ... category="chair"
54
+ ... )
55
+ >>> print(f"Detected {result.num_frames} frames")
56
+ >>> # Access masks for first frame
57
+ >>> frame_0_masks = result.masks[0] # List of masks for frame 0
58
+ """
59
+
60
+ SUPPORTED_IMAGE_FORMATS = (".jpg", ".jpeg", ".png")
61
+
62
+ def __init__(
63
+ self,
64
+ grounding_model_path: str | None = None,
65
+ image_text_model_path: str | None = None,
66
+ sam2_model_path: str | None = None,
67
+ device: str | None = None,
68
+ default_output_dir: str = "./outputs",
69
+ vlm_model: str = "qwen3-vl:2b-instruct-q8_0",
70
+ ):
71
+ """
72
+ Initialize the segmenter.
73
+
74
+ Args:
75
+ grounding_model_path: Path to Florence-2 model directory. If None, auto-downloads from HuggingFace
76
+ image_text_model_path: Path to image-text model (SigLIP) directory. If None, auto-downloads from HuggingFace
77
+ sam2_model_path: Path to SAM2 checkpoint file (.pt). If None, auto-downloads SAM2.1 large model
78
+ device: Device to use ('cuda' or 'cpu'). If None, auto-detects CUDA availability
79
+ default_output_dir: Default directory for segmentation outputs
80
+ vlm_model: Ollama model name for VLM validation. Default: "qwen3-vl:2b-instruct-q8_0"
81
+ Alternative: "qwen3-vl:4b-instruct-q8_0" (higher quality, more VRAM)
82
+
83
+ Environment Variables:
84
+ HF_HOME: HuggingFace cache directory (default: ~/.cache/huggingface)
85
+
86
+ Examples:
87
+ >>> segmenter = GenericCategorySegmenter()
88
+ >>> segmenter = GenericCategorySegmenter(device="cuda")
89
+ >>> segmenter = GenericCategorySegmenter(grounding_model_path="/path/to/Florence-2")
90
+ >>> segmenter = GenericCategorySegmenter(sam2_model_path="/path/to/sam2.1_hiera_large.pt")
91
+ >>> # Use larger VLM for better quality
92
+ >>> segmenter = GenericCategorySegmenter(vlm_model="qwen3-vl:4b-instruct-q8_0")
93
+ """
94
+
95
+ if grounding_model_path is not None:
96
+ self.grounding_model_path = grounding_model_path
97
+ self._auto_download_grounding_model = False
98
+ logger.info(f"Using grounding model from: {grounding_model_path}")
99
+ else:
100
+ self.grounding_model_path = None
101
+ self._auto_download_grounding_model = True
102
+ logger.info("Grounding model will auto-download on first use")
103
+
104
+ if image_text_model_path is not None:
105
+ self.image_text_model_path = image_text_model_path
106
+ self._auto_download_image_text_model = False
107
+ logger.info(f"Using image-text model from: {image_text_model_path}")
108
+ else:
109
+ self.image_text_model_path = None
110
+ self._auto_download_image_text_model = True
111
+ logger.info("Image-text model will auto-download on first use")
112
+
113
+ # Store VLM model name for Ollama
114
+ self.vlm_model_name = vlm_model
115
+
116
+ # Warn if using untested model
117
+ supported_vlm_models = ["qwen3-vl:2b-instruct-q8_0", "qwen3-vl:4b-instruct-q8_0"]
118
+ if vlm_model not in supported_vlm_models:
119
+ logger.warning(
120
+ f"VLM model '{vlm_model}' has not been tested. "
121
+ f"Supported models: {', '.join(supported_vlm_models)}"
122
+ )
123
+
124
+ logger.info(f"VLM model (Ollama): {vlm_model}")
125
+
126
+ if sam2_model_path is not None:
127
+ self.sam2_model_path = sam2_model_path
128
+ self._auto_download_sam2_model = False
129
+ logger.info(f"Using SAM2 model from: {sam2_model_path}")
130
+ else:
131
+ self.sam2_model_path = None
132
+ self._auto_download_sam2_model = True
133
+ logger.info("SAM2 model will auto-download on first use")
134
+
135
+ if device is not None:
136
+ self.device = device
137
+ else:
138
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
139
+
140
+ self.default_output_dir = default_output_dir
141
+
142
+ self.grounding_model = None
143
+ self.grounding_processor = None
144
+ self.image_text_model = None
145
+ self.image_text_processor = None
146
+ self.image_text_logit_bias = -10.0
147
+ self.vlm_model = None
148
+
149
+ self.sam2_predictor = None
150
+ self.sam_model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
151
+ self.sam2_vendor_path = os.path.join(os.path.dirname(__file__), "..", "vendor", "sam2")
152
+
153
+ # Initialize model manager for auto-downloads
154
+ self.model_manager = ModelManager()
155
+
156
+ logger.info(f"GenericCategorySegmenter initialized with device: {self.device}")
157
+
158
+ def _load_grounding_model(self):
159
+ """Load Florence-2 grounding model lazily on first use.
160
+
161
+ Raises:
162
+ ImportError: If transformers cannot be imported
163
+ RuntimeError: If auto-download fails
164
+ """
165
+ if self.grounding_model is not None:
166
+ return
167
+
168
+ grounding_model_id = "microsoft/Florence-2-large"
169
+
170
+ if self._auto_download_grounding_model:
171
+ logger.info(
172
+ f"Auto-downloading grounding model ({grounding_model_id}) from HuggingFace..."
173
+ )
174
+ try:
175
+ model_manager = ModelManager()
176
+ downloaded_path = model_manager.download(grounding_model_id)
177
+ self.grounding_model_path = str(downloaded_path)
178
+ logger.info(f"Grounding model ready at: {downloaded_path}")
179
+ except Exception as e:
180
+ raise RuntimeError(
181
+ f"Auto-download failed: {e}\n\n"
182
+ "You can manually download the model:\n"
183
+ f" 1. Visit: https://huggingface.co/{grounding_model_id}\n"
184
+ " 2. Download and extract\n"
185
+ " 3. Pass: GenericCategorySegmenter(grounding_model_path='/path/to/model')"
186
+ ) from e
187
+
188
+ logger.info(f"Loading grounding model from {self.grounding_model_path}...")
189
+
190
+ from transformers import AutoModelForCausalLM, AutoProcessor
191
+
192
+ self.grounding_model = (
193
+ AutoModelForCausalLM.from_pretrained(self.grounding_model_path, trust_remote_code=True)
194
+ .to(self.device)
195
+ .eval()
196
+ )
197
+
198
+ self.grounding_processor = AutoProcessor.from_pretrained(
199
+ self.grounding_model_path, trust_remote_code=True
200
+ )
201
+
202
+ logger.info("Grounding model loaded successfully")
203
+
204
+ def _load_image_text_model(self):
205
+ """Load SigLIP image-text model lazily on first use.
206
+
207
+ Raises:
208
+ ImportError: If transformers cannot be imported
209
+ RuntimeError: If auto-download fails
210
+ """
211
+ if self.image_text_model is not None:
212
+ return
213
+
214
+ image_text_model_id = "google/siglip2-base-patch16-naflex"
215
+
216
+ if self._auto_download_image_text_model:
217
+ logger.info(
218
+ f"Auto-downloading image-text model ({image_text_model_id}) from HuggingFace..."
219
+ )
220
+ try:
221
+ model_manager = ModelManager()
222
+ downloaded_path = model_manager.download(image_text_model_id)
223
+ self.image_text_model_path = str(downloaded_path)
224
+ logger.info(f"Image-text model ready at: {downloaded_path}")
225
+ except Exception as e:
226
+ raise RuntimeError(
227
+ f"Auto-download failed: {e}\n\n"
228
+ "You can manually download the model:\n"
229
+ f" 1. Visit: https://huggingface.co/{image_text_model_id}\n"
230
+ " 2. Download and extract\n"
231
+ " 3. Pass: GenericCategorySegmenter(image_text_model_path='/path/to/model')"
232
+ ) from e
233
+
234
+ logger.info(f"Loading image-text model from {self.image_text_model_path}...")
235
+
236
+ import torch.nn as nn
237
+ from transformers import AutoModel, AutoProcessor
238
+
239
+ if self.device == "cuda":
240
+ self.image_text_model = AutoModel.from_pretrained(
241
+ self.image_text_model_path, device_map="auto"
242
+ ).eval()
243
+ else:
244
+ self.image_text_model = (
245
+ AutoModel.from_pretrained(self.image_text_model_path).to(self.device).eval()
246
+ )
247
+
248
+ # Apply logit bias for probability calibration
249
+ self.image_text_model.logit_bias = nn.Parameter(torch.tensor([self.image_text_logit_bias]))
250
+ logger.info(f"Image-text logit bias applied: {self.image_text_logit_bias}")
251
+
252
+ self.image_text_processor = AutoProcessor.from_pretrained(self.image_text_model_path)
253
+
254
+ logger.info("Image-text model loaded successfully")
255
+
256
+ def _load_vlm_model(self):
257
+ """Verify Ollama VLM model is available.
258
+
259
+ Raises:
260
+ ImportError: If ollama cannot be imported
261
+ RuntimeError: If Ollama server is not running or model not available
262
+ """
263
+ if self.vlm_model is not None:
264
+ return
265
+
266
+ try:
267
+ import ollama
268
+ except ImportError as e:
269
+ raise ImportError(
270
+ "ollama is required for VLM validation.\n"
271
+ "Install it with: pip install ollama\n"
272
+ "And ensure Ollama server is running: ollama serve"
273
+ ) from e
274
+
275
+ logger.info(f"Checking Ollama model: {self.vlm_model_name}")
276
+
277
+ try:
278
+ # Try to pull/verify model exists
279
+ ollama.pull(self.vlm_model_name)
280
+ logger.info(f"VLM model ready: {self.vlm_model_name}")
281
+ except Exception as e:
282
+ logger.warning(f"Could not pull model (server may be down or model unavailable): {e}")
283
+ logger.info("Will attempt to use model anyway (may already be cached)")
284
+
285
+ # Mark VLM model as loaded and ready for inference
286
+ self.vlm_model = True
287
+
288
+ logger.info("Ollama VLM ready")
289
+
290
+ def _load_sam2_model(self):
291
+ """Load SAM2.1 model lazily on first use.
292
+
293
+ Raises:
294
+ ImportError: If sam2 cannot be imported
295
+ RuntimeError: If auto-download fails or model loading fails
296
+ """
297
+ if self.sam2_predictor is not None:
298
+ return
299
+
300
+ if self._auto_download_sam2_model:
301
+ logger.info("Auto-downloading SAM2.1 checkpoint from direct URL...")
302
+ try:
303
+ sam2_url = (
304
+ "https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt"
305
+ )
306
+ checkpoint_path = self.model_manager.download_url(sam2_url, "sam2.1_hiera_large.pt")
307
+ logger.info(f"SAM2 model ready at: {checkpoint_path}")
308
+ except Exception as e:
309
+ raise RuntimeError(
310
+ f"Auto-download failed: {e}\n\n"
311
+ "You can manually download the model:\n"
312
+ f" 1. Visit: https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt\n"
313
+ " 2. Save as sam2.1_hiera_large.pt\n"
314
+ " 3. Pass: GenericCategorySegmenter(sam2_model_path='/path/to/sam2.1_hiera_large.pt')"
315
+ ) from e
316
+ else:
317
+ # User provided path to checkpoint file
318
+ checkpoint_path = Path(self.sam2_model_path)
319
+ if not checkpoint_path.exists():
320
+ raise RuntimeError(f"SAM2 checkpoint not found: {checkpoint_path}")
321
+ logger.info(f"Using SAM2 checkpoint from: {checkpoint_path}")
322
+
323
+ # Config is always in vendor
324
+ config_path = Path(self.sam2_vendor_path) / self.sam_model_cfg
325
+
326
+ if not config_path.exists():
327
+ raise RuntimeError(f"SAM2 config not found: {config_path}")
328
+
329
+ # Load SAM2 model
330
+ from eneas.vendor.sam2.build_sam import build_sam2
331
+ from eneas.vendor.sam2.sam2_image_predictor import SAM2ImagePredictor
332
+
333
+ # Build SAM2 model
334
+ sam2_model = build_sam2(str(self.sam_model_cfg), str(checkpoint_path), device=self.device)
335
+
336
+ # Create predictor
337
+ self.sam2_predictor = SAM2ImagePredictor(sam2_model)
338
+
339
+ logger.info("SAM2 model loaded successfully")
340
+
341
+ def _segment_bboxes_in_frame(
342
+ self,
343
+ frame_image: Image.Image,
344
+ bboxes: list,
345
+ ) -> list[np.ndarray]:
346
+ """
347
+ Segment multiple bounding boxes in a single frame using SAM2.1.
348
+
349
+ Args:
350
+ frame_image: PIL Image of the frame (RGB format)
351
+ bboxes: List of bounding boxes [[x1, y1, x2, y2], ...]
352
+
353
+ Returns:
354
+ List of binary masks (H, W) with values 0 or 255 for each bbox
355
+ """
356
+ if len(bboxes) == 0:
357
+ return []
358
+
359
+ # Convert PIL to numpy array
360
+ frame_image_np = np.array(frame_image)
361
+
362
+ # Set image in predictor
363
+ self.sam2_predictor.set_image(frame_image_np)
364
+
365
+ # Convert bboxes to numpy array
366
+ input_boxes = np.array(bboxes)
367
+
368
+ # Predict masks
369
+ masks, scores, _ = self.sam2_predictor.predict(
370
+ point_coords=None,
371
+ point_labels=None,
372
+ box=input_boxes,
373
+ multimask_output=False,
374
+ )
375
+
376
+ # Handle mask shape
377
+ if len(masks.shape) == 4 and masks.shape[1] == 1:
378
+ masks = masks.squeeze(1)
379
+ # Now masks is (num_boxes, H, W) bool
380
+
381
+ # Convert to list of binary numpy masks (0 or 255)
382
+ result_masks = []
383
+ for mask in masks:
384
+ # Fill small holes in mask (area <= 8 pixels)
385
+ mask_tensor = torch.from_numpy(mask.astype(np.float32))
386
+ from eneas.vendor.SeC.inference.sam2.utils.misc import fill_holes_in_mask_scores
387
+ mask_filled = fill_holes_in_mask_scores(mask_tensor, max_area=8)
388
+ mask_filled_np = mask_filled.numpy()
389
+
390
+ # Convert to binary (0 or 255)
391
+ mask_binary = (mask_filled_np > 0).astype(np.uint8) * 255
392
+ result_masks.append(mask_binary)
393
+
394
+ return result_masks
395
+
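# A minimal sketch (not the vendored SeC helper) of what fill_holes_in_mask_scores
# is assumed to do above: close background holes no larger than max_area pixels in
# a 0/255 mask. The cv2-based approach and the function name are illustrative only.
import cv2
import numpy as np

def fill_small_holes_sketch(mask: np.ndarray, max_area: int = 8) -> np.ndarray:
    """Fill background components (holes) of at most max_area pixels with foreground."""
    holes = (mask == 0).astype(np.uint8)
    num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(holes, connectivity=4)
    filled = mask.copy()
    for label in range(1, num_labels):  # label 0 covers the original foreground pixels
        if stats[label, cv2.CC_STAT_AREA] <= max_area:
            filled[labels == label] = 255
    return filled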
396
+ def _vlm_validate_single_crop(
397
+ self,
398
+ crop_image: Image.Image,
399
+ target_text: str,
400
+ num_predict: int = 8000,
401
+ max_retries: int = 3,
402
+ ) -> bool:
403
+ """Validate a single crop using Ollama VLM with structured outputs.
404
+
405
+ Args:
406
+ crop_image: PIL Image of the crop (clean, no annotations)
407
+ target_text: Target concept to validate (singular form, e.g., "person")
408
+ num_predict: Maximum tokens for VLM response (default: 8000)
409
+ max_retries: Maximum retry attempts if validation fails (default: 3)
410
+
411
+ Returns:
412
+ True if crop is validated as target, False otherwise
413
+ """
414
+ import ollama
415
+ from pydantic import BaseModel
416
+
417
+ # Define structured output schema
418
+ class ValidationResult(BaseModel):
419
+ reasoning: str
420
+ is_target: bool
421
+
422
+ # Convert image to base64
423
+ img_byte_arr = io.BytesIO()
424
+ crop_image.save(img_byte_arr, format="JPEG", quality=95)
425
+ img_bytes = img_byte_arr.getvalue()
426
+ img_base64 = base64.b64encode(img_bytes).decode("utf-8")
427
+
428
+ # Construct validation prompt
429
+ prompt = f"""You are validating an object detection result.
430
+
431
+ TASK: Analyze the image and determine if it shows a **{target_text}**.
432
+
433
+ The image shows a cropped region from a larger scene. This region was detected by an AI system as possibly containing "{target_text}", but it may be a false positive.
434
+
435
+ CRITICAL THINKING QUESTIONS:
436
+ - What do you actually see in this image?
437
+ - Does it visually match the concept of "{target_text}"?
438
+ - Are you absolutely certain?
439
+ - Could this be a false positive (wrong detection)?
440
+
441
+ ⚠️ IMPORTANT NOTES:
442
+ - The object may be partially visible or occluded (covered by other things) - this is still VALID if you can identify it
443
+ - Focus on what you SEE, not what the AI claimed to detect
444
+ - If detecting "person": ONLY real living humans count as TRUE. Statues, mannequins, dolls, paintings, photos, posters, or any artificial representations are FALSE.
445
+
446
+ Provide your response in JSON format with:
447
+ - "reasoning": Brief explanation of what you see and why it is/isn't {target_text}
448
+ - "is_target": true or false
449
+
450
+ Example responses:
451
+ {{"reasoning": "I see a real living person - natural skin texture, subtle movements or natural pose, wearing actual clothing.", "is_target": true}}
452
+ {{"reasoning": "This is clearly not a person - it's a wall with an electrical outlet and no human figure present.", "is_target": false}}
453
+ {{"reasoning": "This appears to be a statue or mannequin - rigid pose, uniform painted/plastic surface, artificial appearance, no signs of life.", "is_target": false}}
454
+ {{"reasoning": "I see a person in a photo/poster on the wall - this is a 2D image of a person, not an actual person in the scene.", "is_target": false}}"""
455
+
456
+ for attempt in range(max_retries):
457
+ try:
458
+ logger.debug(f"VLM validation attempt {attempt + 1}/{max_retries}")
459
+
460
+ messages = [{"role": "user", "content": prompt, "images": [img_base64]}]
461
+
462
+ # Use structured outputs with Pydantic schema
463
+ response = ollama.chat(
464
+ model=self.vlm_model_name,
465
+ messages=messages,
466
+ format=ValidationResult.model_json_schema(),
467
+ options={"temperature": 0.0, "num_predict": num_predict},
468
+ keep_alive=-1,
469
+ )
470
+
471
+ # Parse and validate response using Pydantic
472
+ result = ValidationResult.model_validate_json(response.message.content)
473
+
474
+ logger.debug(f"VLM result: is_target={result.is_target}")
475
+ logger.debug(f"VLM reasoning: {result.reasoning}")
476
+
477
+ return result.is_target
478
+
479
+ except Exception as e:
480
+ logger.warning(f"VLM validation error (attempt {attempt + 1}/{max_retries}): {e}")
481
+ if attempt < max_retries - 1:
482
+ continue
483
+ else:
484
+ # Default: accept on failure to avoid blocking pipeline
485
+ logger.warning("VLM validation failed after all retries, defaulting to accept")
486
+ return True
487
+
488
+ return True
489
+
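# The crops passed into this validator are first grown by expand_crop_to_minimum_size
# (a .utils helper used in _text_to_bbox below, not shown in this diff) because
# Qwen3-VL requires inputs of at least 32x32. A plausible sketch of that expansion;
# the real helper takes extra arguments and may differ in detail.
from PIL import Image

def expand_to_min_size_sketch(frame: Image.Image, bbox: list, min_size: int = 32) -> Image.Image:
    """Grow bbox symmetrically (clamped to the frame) until it is at least min_size wide/tall."""
    x1, y1, x2, y2 = [int(c) for c in bbox]
    pad_x = max(0, (min_size - (x2 - x1) + 1) // 2)
    pad_y = max(0, (min_size - (y2 - y1) + 1) // 2)
    x1, y1 = max(0, x1 - pad_x), max(0, y1 - pad_y)
    x2, y2 = min(frame.width, x2 + pad_x), min(frame.height, y2 + pad_y)
    return frame.crop((x1, y1, x2, y2))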
490
+ def _text_to_bbox(
491
+ self,
492
+ text: str,
493
+ frame_image: Image.Image,
494
+ accept_threshold: float = 0.90,
495
+ reject_threshold: float = 0.10,
496
+ save_debug: bool = False,
497
+ output_dir: str = "",
498
+ frame_name: str = "",
499
+ ) -> tuple[list, list, bool]:
500
+ """Detect and filter objects using multi-stage pipeline.
501
+
502
+ Pipeline:
503
+ 1. Convert text to plural (once, for Florence)
504
+ 2. Detect with Florence-2 CAPTION_TO_PHRASE_GROUNDING
505
+ 3. Apply NMS to remove duplicates
506
+ 4. Convert text to singular (once, for SigLIP)
507
+ 5. Filter with image-text model semantic similarity
508
+ 6. VLM validation for uncertain boxes
509
+ 7. Return accepted + VLM-approved boxes
510
+
511
+ Args:
512
+ text: Text description of the object category
513
+ frame_image: PIL Image of the frame
514
+ accept_threshold: Threshold for accepting boxes automatically (default: 0.90)
515
+ reject_threshold: Threshold for rejecting boxes automatically (default: 0.10)
516
+
517
+ Returns:
518
+ Tuple of (bboxes, labels, vlm_used) where vlm_used is True if VLM was called
519
+ """
520
+ # Stage 1: Convert to plural once (for Florence)
521
+ text_plural = smart_convert_to_plural(text)
522
+ logger.debug(f"Florence query: '{text}' → '{text_plural}'")
523
+
524
+ # Stage 2: Florence-2 detection
525
+ task_prompt = "<CAPTION_TO_PHRASE_GROUNDING>"
526
+ prompt = task_prompt + text_plural
527
+
528
+ inputs = self.grounding_processor(text=prompt, images=frame_image, return_tensors="pt").to(
529
+ self.device
530
+ )
531
+
532
+ generated_ids = self.grounding_model.generate(
533
+ input_ids=inputs["input_ids"],
534
+ pixel_values=inputs["pixel_values"],
535
+ max_new_tokens=1024,
536
+ early_stopping=False,
537
+ do_sample=False,
538
+ num_beams=3,
539
+ )
540
+
541
+ generated_text = self.grounding_processor.batch_decode(
542
+ generated_ids, skip_special_tokens=False
543
+ )[0]
544
+
545
+ parsed_answer = self.grounding_processor.post_process_generation(
546
+ generated_text, task=task_prompt, image_size=(frame_image.width, frame_image.height)
547
+ )
548
+
549
+ grounding_results = parsed_answer["<CAPTION_TO_PHRASE_GROUNDING>"]
550
+ bboxes = grounding_results.get("bboxes", [])
551
+ labels = grounding_results.get("labels", [])
552
+
553
+ if not bboxes:
554
+ logger.warning(f"No objects detected for text: '{text}'")
555
+ return bboxes, labels, False
556
+
557
+ logger.info(f"Florence detected {len(bboxes)} instances")
558
+
559
+ # Save grounding debug (before NMS)
560
+ if save_debug and len(bboxes) > 0:
561
+ grounding_debug_dir = os.path.join(output_dir, "grounding_debug")
562
+ grounding_img = draw_bboxes(frame_image.copy(), bboxes)
563
+ grounding_path = os.path.join(grounding_debug_dir, f"{frame_name}.jpg")
564
+ grounding_img.save(grounding_path, quality=95)
565
+ logger.debug(f"Saved grounding debug: {grounding_path}")
566
+
567
+ # Stage 3: Apply NMS
568
+ if len(bboxes) > 1:
569
+ original_count = len(bboxes)
570
+ bboxes, labels = non_max_suppression(bboxes, labels, iou_threshold=0.70)
571
+ removed_count = original_count - len(bboxes)
572
+ if removed_count > 0:
573
+ logger.info(f"NMS: Removed {removed_count} overlapping boxes")
574
+ logger.info(f"After NMS: {len(bboxes)} instances")
575
+
576
+ # Stage 4: Convert to singular once (for SigLIP)
577
+ text_singular = smart_convert_to_singular(text)
578
+ logger.debug(f"Image-text query: '{text}' → '{text_singular}'")
579
+
580
+ # Stage 5: Image-text filtering
581
+ if len(bboxes) > 0:
582
+ accepted, rejected, uncertain, scores = self._image_text_filter_boxes(
583
+ frame_image,
584
+ bboxes,
585
+ labels,
586
+ text_singular,
587
+ accept_threshold,
588
+ reject_threshold,
589
+ save_debug,
590
+ output_dir,
591
+ frame_name,
592
+ )
593
+
594
+ # Stage 6: VLM validation for uncertain boxes
595
+ vlm_accepted = []
596
+ vlm_used = False
597
+ if len(uncertain) > 0 and self.vlm_model is not None:
598
+ vlm_used = True
599
+ logger.info(f"VLM validating {len(uncertain)} uncertain boxes...")
600
+
601
+ for local_idx, global_idx in enumerate(uncertain):
602
+ bbox = bboxes[global_idx]
603
+ _label = labels[global_idx]
604
+ x1, y1, x2, y2 = [int(coord) for coord in bbox]
605
+
606
+ logger.debug(
607
+ f"VLM validating uncertain box {local_idx + 1}/{len(uncertain)} (global #{global_idx + 1})"
608
+ )
609
+
610
+ # Extract clean crop
611
+ crop = frame_image.crop((x1, y1, x2, y2))
612
+
613
+ # Mask overlapping regions
614
+ crop = mask_overlapping_boxes(crop, bbox, bboxes, global_idx, (x1, y1, x2, y2))
615
+
616
+ # Expand crop to minimum size (Qwen3-VL requires 32x32)
617
+ crop = expand_crop_to_minimum_size(crop, bbox, frame_image, min_size=32)
618
+
619
+ # Save VLM debug crop
620
+ if save_debug:
621
+ vlm_debug_dir = os.path.join(output_dir, "vlm_debug")
622
+ crop_path = os.path.join(
623
+ vlm_debug_dir, f"{frame_name}_vlm_crop_{global_idx + 1}.jpg"
624
+ )
625
+ crop.save(crop_path, quality=95)
626
+
627
+ # Validate with VLM
628
+ is_target = self._vlm_validate_single_crop(crop, text_singular)
629
+
630
+ if is_target:
631
+ logger.debug(f"VLM accepted box #{global_idx + 1}")
632
+ vlm_accepted.append(global_idx)
633
+ else:
634
+ logger.debug(f"VLM rejected box #{global_idx + 1}")
635
+
636
+ logger.info(
637
+ f"VLM validation: {len(vlm_accepted)} accepted, {len(uncertain) - len(vlm_accepted)} rejected"
638
+ )
639
+
640
+ # Stage 7: Combine accepted + VLM-approved uncertain (rejected + VLM-rejected discarded)
641
+ keep_indices = sorted(accepted + vlm_accepted)
642
+ bboxes = [bboxes[i] for i in keep_indices]
643
+ labels = [labels[i] for i in keep_indices]
644
+
645
+ logger.info(f"Final result: {len(bboxes)} instances")
646
+
647
+ return bboxes, labels, vlm_used
648
+
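# Stage 3 calls non_max_suppression from .utils, which is not shown in this diff.
# A minimal greedy IoU-based sketch of the same idea follows; the real helper may
# order boxes differently (e.g. by confidence) before suppressing.
def nms_sketch(bboxes: list, labels: list, iou_threshold: float = 0.70) -> tuple[list, list]:
    """Keep a box only if it overlaps every previously kept box below iou_threshold."""

    def iou(a, b):
        ax1, ay1, ax2, ay2 = a
        bx1, by1, bx2, by2 = b
        inter_w = max(0.0, min(ax2, bx2) - max(ax1, bx1))
        inter_h = max(0.0, min(ay2, by2) - max(ay1, by1))
        inter = inter_w * inter_h
        union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
        return inter / union if union > 0 else 0.0

    keep = []
    for i, box in enumerate(bboxes):
        if all(iou(box, bboxes[j]) < iou_threshold for j in keep):
            keep.append(i)
    return [bboxes[i] for i in keep], [labels[i] for i in keep]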
649
+ def _image_text_filter_boxes(
650
+ self,
651
+ image: Image.Image,
652
+ bboxes: list,
653
+ labels: list,
654
+ target_text: str,
655
+ accept_threshold: float = 0.90,
656
+ reject_threshold: float = 0.10,
657
+ save_debug: bool = False,
658
+ output_dir: str = "",
659
+ frame_name: str = "",
660
+ ) -> tuple[list, list, list, list]:
661
+ """Filter bounding boxes using image-text model semantic similarity.
662
+
663
+ Uses ensemble of prompts with MEAN strategy for robust filtering.
664
+
665
+ Args:
666
+ image: PIL Image (original, without boxes drawn)
667
+ bboxes: List of bounding boxes [[x1, y1, x2, y2], ...]
668
+ labels: List of labels from Florence
669
+ target_text: Target concept (singular form, e.g., "person")
670
+ accept_threshold: Threshold for accepting boxes automatically (default: 0.90)
671
+ reject_threshold: Threshold for rejecting boxes automatically (default: 0.10)
672
+
673
+ Returns:
674
+ Tuple of:
675
+ - accepted_indices: Indices of accepted boxes (score >= accept_threshold)
676
+ - rejected_indices: Indices of rejected boxes (score < reject_threshold)
677
+ - uncertain_indices: Indices of uncertain boxes (reject_threshold <= score < accept_threshold)
678
+ - scores: List of similarity scores for each box
679
+ """
680
+ if len(bboxes) == 0:
681
+ return [], [], [], []
682
+
683
+ # Ensemble of prompt templates
684
+ prompt_templates = [
685
+ f"a photo of {target_text}",
686
+ f"a photo of a {target_text}",
687
+ f"This is a photo of {target_text}",
688
+ f"This is a photo of a {target_text}",
689
+ f"a cropped photo of {target_text}",
690
+ f"a cropped photo of a {target_text}",
691
+ f"an image of {target_text}",
692
+ f"an image of a {target_text}",
693
+ f"{target_text}",
694
+ f"a {target_text}",
695
+ ]
696
+
697
+ # Remove duplicates maintaining order
698
+ texts = []
699
+ seen = set()
700
+ for t in prompt_templates:
701
+ if t not in seen:
702
+ texts.append(t)
703
+ seen.add(t)
704
+
705
+ logger.info(f"Image-text filtering with {len(texts)} prompt variants (MEAN strategy)")
706
+ logger.info(
707
+ f"Accept threshold: >={accept_threshold}, Reject threshold: <{reject_threshold}"
708
+ )
709
+
710
+ # Step 1: Prepare all crops first
711
+ all_crops = []
712
+ for idx, (bbox, _label) in enumerate(zip(bboxes, labels, strict=True)):
713
+ x1, y1, x2, y2 = [int(coord) for coord in bbox]
714
+
715
+ # Crop clean region
716
+ crop = image.crop((x1, y1, x2, y2))
717
+
718
+ # Mask overlapping regions
719
+ crop = mask_overlapping_boxes(crop, bbox, bboxes, idx, (x1, y1, x2, y2))
720
+
721
+ all_crops.append(crop)
722
+
723
+ # Save image_text debug crops
724
+ if save_debug:
725
+ image_text_debug_dir = os.path.join(output_dir, "image_text_debug")
726
+ crop_path = os.path.join(image_text_debug_dir, f"{frame_name}_crop_{idx + 1}.jpg")
727
+ crop.save(crop_path, quality=95)
728
+
729
+ # Step 2: Batch process all crops at once
730
+ with torch.no_grad():
731
+ inputs = self.image_text_processor(
732
+ text=texts,
733
+ images=all_crops, # Process ALL crops in one batch
734
+ padding="max_length",
735
+ max_length=64,
736
+ return_tensors="pt",
737
+ ).to(self.device)
738
+
739
+ outputs = self.image_text_model(**inputs)
740
+ logits_per_image = outputs.logits_per_image # Shape: [num_crops, num_prompts]
741
+ probs = torch.sigmoid(logits_per_image) # Shape: [num_crops, num_prompts]
742
+
743
+ # Step 3: Process results for each crop individually
744
+ scores = []
745
+ accepted_indices = []
746
+ rejected_indices = []
747
+ uncertain_indices = []
748
+
749
+ for idx, (_bbox, label) in enumerate(zip(bboxes, labels, strict=True)):
750
+ # Extract scores for this specific crop
751
+ crop_probs = probs[idx].cpu().numpy() # Shape: [num_prompts]
752
+
753
+ # MEAN strategy (average of all prompts)
754
+ final_score = float(crop_probs.mean())
755
+
756
+ # Stats for logging
757
+ best_score = float(crop_probs.max())
758
+ worst_score = float(crop_probs.min())
759
+ best_prompt_idx = int(crop_probs.argmax())
760
+ best_prompt = texts[best_prompt_idx]
761
+
762
+ scores.append(final_score)
763
+
764
+ # Classify according to thresholds
765
+ if final_score >= accept_threshold:
766
+ accepted_indices.append(idx)
767
+ status = "ACCEPTED"
768
+ elif final_score < reject_threshold:
769
+ rejected_indices.append(idx)
770
+ status = "REJECTED"
771
+ else:
772
+ uncertain_indices.append(idx)
773
+ status = "UNCERTAIN"
774
+
775
+ logger.debug(
776
+ f"Box {idx + 1}: {label[:30]} | "
777
+ f"MEAN={final_score:.4f} | "
778
+ f"BEST='{best_prompt}'={best_score:.4f} | "
779
+ f"WORST={worst_score:.4f} | "
780
+ f"{status}"
781
+ )
782
+
783
+ logger.info(
784
+ f"Image-text results: {len(accepted_indices)} accepted, "
785
+ f"{len(rejected_indices)} rejected, {len(uncertain_indices)} uncertain"
786
+ )
787
+
788
+ return accepted_indices, rejected_indices, uncertain_indices, scores
789
+
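# The accept / reject / uncertain split above reduces to a simple rule: sigmoid the
# SigLIP logits, average over the prompt ensemble, and bucket each crop's score
# against the two thresholds. A standalone sketch using the defaults of this class:
import torch

def bucket_crops_sketch(logits_per_image: torch.Tensor,
                        accept_threshold: float = 0.90,
                        reject_threshold: float = 0.10) -> tuple[list[int], list[int], list[int]]:
    """Classify crops from ensemble logits of shape [num_crops, num_prompts]."""
    scores = torch.sigmoid(logits_per_image).mean(dim=1).tolist()
    accepted = [i for i, s in enumerate(scores) if s >= accept_threshold]
    rejected = [i for i, s in enumerate(scores) if s < reject_threshold]
    uncertain = [i for i, s in enumerate(scores) if reject_threshold <= s < accept_threshold]
    return accepted, rejected, uncertain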
790
+ def segment(
791
+ self,
792
+ frames_path: str | list[str],
793
+ category: str,
794
+ output_dir: str | None = None,
795
+ accept_threshold: float = 0.90,
796
+ reject_threshold: float = 0.10,
797
+ save_debug: bool = False,
798
+ save_masks: bool = False,
799
+ ) -> SegmentationResult:
800
+ """
801
+ Detect and segment instances of a category across multiple frames.
802
+
803
+ Args:
804
+ frames_path: Directory containing frames
805
+ category: Category to detect (e.g., "chair", "person", "car")
806
+ output_dir: Output directory for results
807
+ accept_threshold: Image-text similarity threshold for auto-accepting boxes (default: 0.90)
808
+ reject_threshold: Image-text similarity threshold for auto-rejecting boxes (default: 0.10)
809
+ save_debug: Save debug visualizations (grounding_debug/, image_text_debug/, vlm_debug/, detections_debug/)
810
+ save_masks: Save binary segmentation masks to disk (default: False)
811
+
812
+ Returns:
813
+ SegmentationResult with detection data and binary masks
814
+
815
+ Raises:
816
+ ValueError: If inputs are invalid
817
+ FileNotFoundError: If paths don't exist
818
+ RuntimeError: If detection fails
819
+
820
+ Examples:
821
+ >>> segmenter = GenericCategorySegmenter()
822
+ >>> result = segmenter.segment(
823
+ ... frames_path="./frames",
824
+ ... category="chair"
825
+ ... )
826
+ >>> # With masks
827
+ >>> result = segmenter.segment(
828
+ ... frames_path="./frames",
829
+ ... category="person",
830
+ ... save_masks=True
831
+ ... )
832
+ >>> # Access masks: result.masks[frame_idx] returns list of masks
833
+ """
834
+ if output_dir is None:
835
+ output_dir = self.default_output_dir
836
+
837
+ # Validate frames_path
838
+ if isinstance(frames_path, str):
839
+ if not os.path.isdir(frames_path):
840
+ raise FileNotFoundError(f"Frames directory not found: {frames_path}")
841
+ else:
842
+ raise NotImplementedError(
843
+ "List of frame paths is not yet implemented. "
844
+ "Please provide a directory path containing ordered frames."
845
+ )
846
+
847
+ # Load models
848
+ self._load_grounding_model()
849
+ self._load_image_text_model()
850
+ self._load_vlm_model()
851
+
852
+ # Load SAM2 model for segmentation
853
+ self._load_sam2_model()
854
+
855
+ # Start pure inference timer (after all model loading)
856
+ logger.info("Models loaded. Starting pure inference timer...")
857
+ inference_start_time = time.time()
858
+
859
+ frames_dir = frames_path
860
+ frame_names = self._get_frame_names(frames_dir)
861
+ logger.info(f"Found {len(frame_names)} images")
862
+ logger.info(f"Detecting category: '{category}'")
863
+
864
+ # Create debug directories if needed
865
+ if save_debug:
866
+ grounding_debug_dir = os.path.join(output_dir, "grounding_debug")
867
+ image_text_debug_dir = os.path.join(output_dir, "image_text_debug")
868
+ vlm_debug_dir = os.path.join(output_dir, "vlm_debug")
869
+ sam_debug_dir = os.path.join(output_dir, "sam_debug")
870
+ detections_debug_dir = os.path.join(output_dir, "detections_debug")
871
+
872
+ # Clean existing debug directories to avoid confusion with old files
873
+ for debug_dir in [
874
+ grounding_debug_dir,
875
+ image_text_debug_dir,
876
+ vlm_debug_dir,
877
+ sam_debug_dir,
878
+ detections_debug_dir,
879
+ ]:
880
+ if os.path.exists(debug_dir):
881
+ shutil.rmtree(debug_dir)
882
+ logger.info(f"Cleaned existing debug directory: {debug_dir}")
883
+
884
+ os.makedirs(grounding_debug_dir, exist_ok=True)
885
+ os.makedirs(image_text_debug_dir, exist_ok=True)
886
+ os.makedirs(vlm_debug_dir, exist_ok=True)
887
+ os.makedirs(sam_debug_dir, exist_ok=True)
888
+ os.makedirs(detections_debug_dir, exist_ok=True)
889
+
890
+ logger.info("Debug mode enabled - saving visualizations")
891
+
892
+ # Process each frame independently
893
+ all_detections = {}
894
+ all_masks = {}
895
+ vlm_usage_count = 0
896
+
897
+ for frame_idx, frame_name in enumerate(frame_names):
898
+ frame_path = os.path.join(frames_dir, frame_name)
899
+ frame_image = Image.open(frame_path).convert("RGB")
900
+
901
+ logger.info(f"Processing frame {frame_idx + 1}/{len(frame_names)}: {frame_name}")
902
+
903
+ # Get frame stem (without extension) for debug filenames
904
+ frame_stem = Path(frame_name).stem
905
+
906
+ # Detect and filter objects using full pipeline
907
+ bboxes, labels, vlm_used = self._text_to_bbox(
908
+ category,
909
+ frame_image,
910
+ accept_threshold,
911
+ reject_threshold,
912
+ save_debug,
913
+ output_dir,
914
+ frame_stem,
915
+ )
916
+
917
+ # Track VLM usage
918
+ if vlm_used:
919
+ vlm_usage_count += 1
920
+
921
+ # Store detections for this frame
922
+ frame_detections = []
923
+ for bbox, label in zip(bboxes, labels, strict=True):
924
+ frame_detections.append(
925
+ {
926
+ "bbox": bbox,
927
+ "label": label,
928
+ }
929
+ )
930
+
931
+ all_detections[frame_idx] = frame_detections
932
+
933
+ # Segment bboxes using SAM2
934
+ frame_masks = self._segment_bboxes_in_frame(frame_image, bboxes)
935
+ all_masks[frame_idx] = frame_masks
936
+
937
+ logger.info(f" Segmented {len(frame_masks)} objects")
938
+
939
+ # Save SAM segmentation debug (overlay masks on image)
940
+ if save_debug and len(frame_masks) > 0:
941
+ sam_debug_dir = os.path.join(output_dir, "sam_debug")
942
+
943
+ # Convert PIL to numpy for overlay
944
+ img_array = np.array(frame_image)
945
+
946
+ # Create combined overlay with all masks
947
+ overlay = img_array.copy()
948
+ for mask in frame_masks:
949
+ overlay[mask > 0] = [0, 100, 255] # Blue where mask is present
950
+
951
+ # Blend original image with overlay (60% original, 40% overlay)
952
+ blended = cv2.addWeighted(img_array, 0.6, overlay, 0.4, 0)
953
+
954
+ # Convert back to PIL and save
955
+ blended_img = Image.fromarray(blended)
956
+ sam_path = os.path.join(sam_debug_dir, f"{frame_stem}.jpg")
957
+ blended_img.save(sam_path, quality=95)
958
+
959
+ logger.debug(f"Saved SAM debug visualization with {len(frame_masks)} masks")
960
+
961
+ # Save detections debug (final result - always save, even if no detections)
962
+ if save_debug:
963
+ detections_debug_dir = os.path.join(output_dir, "detections_debug")
964
+ if len(bboxes) > 0:
965
+ detections_img = draw_bboxes(frame_image.copy(), bboxes)
966
+ else:
967
+ # No detections found - save original image without annotations
968
+ detections_img = frame_image.copy()
969
+ detections_path = os.path.join(detections_debug_dir, f"{frame_stem}.jpg")
970
+ detections_img.save(detections_path, quality=95)
971
+ logger.debug(f"Saved detections debug: {detections_path}")
972
+
973
+ # Calculate and log pure inference stats
974
+ inference_end_time = time.time()
975
+ pure_inference_time = inference_end_time - inference_start_time
976
+ pure_fps = len(frame_names) / pure_inference_time if pure_inference_time > 0 else 0.0
977
+
978
+ logger.info("Detection and segmentation completed successfully!")
979
+ logger.info(f"Processed {len(frame_names)} frames")
980
+ logger.info("==================================================")
981
+ logger.info("Pure Inference Stats:")
982
+ logger.info(f" Total Time: {pure_inference_time:.4f}s")
983
+ logger.info(f" FPS: {pure_fps:.2f}")
984
+ logger.info(
985
+ f" Latency per frame: {1 / pure_fps:.4f}s"
986
+ if pure_fps > 0
987
+ else " Latency per frame: N/A"
988
+ )
989
+ logger.info("==================================================")
990
+
991
+ # Calculate VLM usage percentage
992
+ vlm_usage_percentage = (
993
+ (vlm_usage_count / len(frame_names) * 100) if len(frame_names) > 0 else 0.0
994
+ )
995
+
996
+ # Save binary masks to disk if requested
997
+ mask_paths = []
998
+ if save_masks:
999
+ masks_dir = os.path.join(output_dir, "masks")
1000
+ os.makedirs(masks_dir, exist_ok=True)
1001
+
1002
+ logger.info("Saving binary masks...")
1003
+ for frame_idx in sorted(all_masks.keys()):
1004
+ frame_masks_list = all_masks[frame_idx]
1005
+
1006
+ # Combine all masks for this frame using OR
1007
+ if len(frame_masks_list) > 0:
1008
+ # Start with first mask
1009
+ combined_mask = frame_masks_list[0].copy()
1010
+ # OR with remaining masks
1011
+ for mask in frame_masks_list[1:]:
1012
+ combined_mask = combined_mask | mask
1013
+ else:
1014
+ # No objects in this frame - create empty mask
1015
+ # Get image dimensions from first frame
1016
+ first_frame_path = os.path.join(frames_dir, frame_names[0])
1017
+ first_frame = Image.open(first_frame_path).convert("RGB")
1018
+ h, w = np.array(first_frame).shape[:2]
1019
+ combined_mask = np.zeros((h, w), dtype=np.uint8)
1020
+
1021
+ # Save as PNG (lossless, black & white)
1022
+ mask_filename = (
1023
+ frame_names[frame_idx].replace(".jpg", ".png").replace(".jpeg", ".png")
1024
+ )
1025
+ mask_path = os.path.join(masks_dir, mask_filename)
1026
+ cv2.imwrite(mask_path, combined_mask)
1027
+ mask_paths.append(mask_path)
1028
+
1029
+ logger.info(f"Masks saved to: {masks_dir}")
1030
+
1031
+ result = SegmentationResult(
1032
+ masks=all_masks,
1033
+ num_frames=len(frame_names),
1034
+ output_dir=output_dir,
1035
+ mask_paths=mask_paths,
1036
+ metadata={
1037
+ "category": category,
1038
+ "detections": all_detections,
1039
+ "num_frames_total": len(frame_names),
1040
+ "accept_threshold": accept_threshold,
1041
+ "reject_threshold": reject_threshold,
1042
+ "vlm_usage_count": vlm_usage_count,
1043
+ "vlm_usage_percentage": vlm_usage_percentage,
1044
+ },
1045
+ initial_mask_path=None,
1046
+ )
1047
+
1048
+ return result
1049
+
1050
+ def _get_frame_names(self, frames_dir: str) -> list[str]:
1051
+ """Get sorted list of image files in directory.
1052
+
1053
+ Args:
1054
+ frames_dir: Directory containing image frames
1055
+
1056
+ Returns:
1057
+ Sorted list of frame filenames
1058
+
1059
+ Raises:
1060
+ ValueError: If no valid image files found
1061
+ """
1062
+ frame_names = sorted(
1063
+ [f for f in os.listdir(frames_dir) if f.lower().endswith(self.SUPPORTED_IMAGE_FORMATS)]
1064
+ )
1065
+
1066
+ if not frame_names:
1067
+ raise ValueError(
1068
+ f"No image files found in {frames_dir}. "
1069
+ f"Supported formats: {self.SUPPORTED_IMAGE_FORMATS}"
1070
+ )
1071
+
1072
+ return frame_names
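A minimal end-to-end sketch of consuming the result returned by segment(); the frames directory and category below are placeholders, and the metadata keys are the ones populated by this class.

from eneas.segmentation import GenericCategorySegmenter

segmenter = GenericCategorySegmenter()
result = segmenter.segment(frames_path="./frames", category="chair", save_masks=True)

# result.masks maps frame index -> list of binary (0/255) masks, one per instance.
for frame_idx, frame_masks in sorted(result.masks.items()):
    print(f"frame {frame_idx}: {len(frame_masks)} instances")

# Per-frame bboxes/labels and VLM usage stats live in the metadata dict.
detections = result.metadata["detections"]
print(f"VLM used on {result.metadata['vlm_usage_percentage']:.1f}% of frames")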
eneas/segmentation/model_manager.py ADDED
@@ -0,0 +1,126 @@
1
+ """
2
+ Model Manager - Simplified model download handling.
3
+
4
+ Handles downloading models from HuggingFace Hub and direct URLs.
5
+ Uses HuggingFace Hub's native caching system for all downloads.
6
+ """
7
+
8
+ import logging
9
+ import urllib.request
10
+ from pathlib import Path
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class ModelManager:
16
+ """Manages model downloads for eneas.
17
+
18
+ Uses HuggingFace Hub's default cache (~/.cache/huggingface/hub/) for all models.
19
+ Respects HF_HOME environment variable for custom cache locations.
20
+
21
+ Examples:
22
+ >>> manager = ModelManager()
23
+ >>> # Download from HuggingFace Hub
24
+ >>> model_path = manager.download("microsoft/Florence-2-large")
25
+ >>> # Download from direct URL
26
+ >>> sam2_path = manager.download_url(
27
+ ... "https://dl.fbaipublicfiles.com/.../sam2.1_hiera_large.pt",
28
+ ... "sam2.1_hiera_large.pt"
29
+ ... )
30
+ """
31
+
32
+ def download(self, model_id: str) -> Path:
33
+ """Download model from HuggingFace Hub.
34
+
35
+ Uses HuggingFace Hub's native caching and download resumption.
36
+ The model is cached automatically and reused on subsequent calls.
37
+
38
+ Args:
39
+ model_id: HuggingFace model ID (e.g., 'microsoft/Florence-2-large')
40
+
41
+ Returns:
42
+ Path to model directory
43
+
44
+ Raises:
45
+ ImportError: If huggingface_hub is not installed
46
+ RuntimeError: If download fails
47
+
48
+ Examples:
49
+ >>> manager = ModelManager()
50
+ >>> path = manager.download("microsoft/Florence-2-large")
51
+ """
52
+ try:
53
+ from huggingface_hub import snapshot_download
54
+ except ImportError as e:
55
+ raise ImportError(
56
+ "huggingface_hub is required for model downloads.\n"
57
+ "Install with: pip install huggingface_hub"
58
+ ) from e
59
+
60
+ try:
61
+ logger.info(f"Downloading {model_id} from HuggingFace Hub...")
62
+
63
+ # Use HuggingFace's native caching
64
+ # - Automatically uses ~/.cache/huggingface/hub/
65
+ # - Respects HF_HOME environment variable
66
+ # - Handles validation, resumable downloads, symlinks, etc.
67
+ model_path = snapshot_download(repo_id=model_id)
68
+
69
+ logger.info(f"Model ready at: {model_path}")
70
+ return Path(model_path)
71
+
72
+ except Exception as e:
73
+ raise RuntimeError(
74
+ f"Failed to download {model_id} from HuggingFace Hub: {e}\n\n"
75
+ f"Manual download: https://huggingface.co/{model_id}"
76
+ ) from e
77
+
78
+ def download_url(self, url: str, filename: str) -> Path:
79
+ """Download file from direct URL.
80
+
81
+ Downloads to HuggingFace cache directory for consistency with other models.
82
+ File is cached and reused on subsequent calls.
83
+
84
+ Args:
85
+ url: Direct download URL
86
+ filename: Name to save file as
87
+
88
+ Returns:
89
+ Path to downloaded file
90
+
91
+ Raises:
92
+ RuntimeError: If download fails
93
+
94
+ Examples:
95
+ >>> manager = ModelManager()
96
+ >>> path = manager.download_url(
97
+ ... "https://example.com/model.pt",
98
+ ... "model.pt"
99
+ ... )
100
+ """
101
+ try:
102
+ from huggingface_hub import HF_HOME
103
+ except ImportError:
104
+ # Fallback if huggingface_hub not available
105
+ HF_HOME = None
106
+
107
+ # Use HuggingFace cache directory for consistency
108
+ cache_dir = Path(HF_HOME or Path.home() / ".cache" / "huggingface")
109
+ file_path = cache_dir / "hub" / filename
110
+
111
+ # Return cached file if exists
112
+ if file_path.exists():
113
+ logger.info(f"Using cached file: {file_path}")
114
+ return file_path
115
+
116
+ # Download file
117
+ file_path.parent.mkdir(parents=True, exist_ok=True)
118
+ logger.info(f"Downloading {filename} from {url}...")
119
+
120
+ try:
121
+ urllib.request.urlretrieve(url, file_path)
122
+ logger.info(f"Download complete: {file_path}")
123
+ return file_path
124
+
125
+ except Exception as e:
126
+ raise RuntimeError(f"Failed to download from {url}: {e}") from e
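Both download paths resolve into the HuggingFace cache; a short sketch of pointing that cache somewhere else via HF_HOME (the directory is a placeholder, and the variable must be set before huggingface_hub is first imported for it to take effect):

import os

os.environ["HF_HOME"] = "/data/hf-cache"  # placeholder; set before huggingface_hub is imported

from eneas.segmentation.model_manager import ModelManager

manager = ModelManager()
florence_path = manager.download("microsoft/Florence-2-large")
sam2_path = manager.download_url(
    "https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt",
    "sam2.1_hiera_large.pt",
)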
eneas/segmentation/types.py ADDED
@@ -0,0 +1,28 @@
1
+ """
2
+ Shared types for segmentation operations.
3
+ """
4
+
5
+ from dataclasses import dataclass
6
+
7
+ import numpy as np
8
+
9
+
10
+ @dataclass
11
+ class SegmentationResult:
12
+ """Result of a segmentation operation.
13
+
14
+ Attributes:
15
+ masks: Dictionary mapping frame indices to binary masks (numpy arrays, 0=background, 255=foreground). UniqueInstanceSegmenter stores one mask per frame; GenericCategorySegmenter stores a list of per-instance masks per frame
16
+ num_frames: Number of frames successfully segmented
17
+ output_dir: Directory where results were saved
18
+ mask_paths: List of paths to saved mask images (if save_masks=True)
19
+ metadata: Additional metadata about the segmentation
20
+ initial_mask_path: Path to the initial mask visualization (None for generic segmentation)
21
+ """
22
+
23
+ masks: dict[int, np.ndarray | list[np.ndarray]]
24
+ num_frames: int
25
+ output_dir: str
26
+ mask_paths: list[str]
27
+ metadata: dict
28
+ initial_mask_path: str | None = None
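A small sketch of building and inspecting a SegmentationResult directly; the frame size and metadata values are illustrative.

import numpy as np
from eneas.segmentation.types import SegmentationResult

empty_mask = np.zeros((480, 640), dtype=np.uint8)  # illustrative frame size
result = SegmentationResult(
    masks={0: empty_mask},
    num_frames=1,
    output_dir="./outputs",
    mask_paths=[],
    metadata={"category": "chair"},
)
foreground_ratio = float((result.masks[0] > 0).mean())  # fraction of white pixels in frame 0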
eneas/segmentation/unique_instance.py ADDED
@@ -0,0 +1,993 @@
1
+ """
2
+ UniqueInstanceSegmenter - Unique instance segmentation with temporal tracking.
3
+
4
+ Based on SeC model for frame sequence object segmentation.
5
+ """
6
+
7
+ import gc
8
+ import logging
9
+ import os
10
+ import time
11
+ from pathlib import Path
12
+
13
+ import cv2
14
+ import numpy as np
15
+ import torch
16
+ from PIL import Image
17
+
18
+ from .model_manager import ModelManager
19
+ from .types import SegmentationResult
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ class UniqueInstanceSegmenter:
25
+ """
26
+ Segmenter for unique instances with temporal tracking.
27
+
28
+ Use cases:
29
+ - "THAT specific statue"
30
+ - "THAT red car"
31
+ - "THAT particular person"
32
+
33
+ A single instance that persists over time.
34
+ Can disappear/reappear but remains THE SAME object.
35
+
36
+ Returns binary masks (black & white) for each frame where:
37
+ - Black (0) = Background
38
+ - White (255) = Segmented object
39
+
40
+ Example:
41
+ >>> from eneas.segmentation import UniqueInstanceSegmenter
42
+ >>> segmenter = UniqueInstanceSegmenter() # Requires CUDA GPU
43
+ >>> result = segmenter.segment(
44
+ ... frames_path="/path/to/frames",
45
+ ... points=[(100, 200), (150, 250)],
46
+ ... annotation_frame="frame_0050.jpg"
47
+ ... )
48
+ >>> print(f"Segmented {result.num_frames} frames")
49
+ >>> # Access binary masks (always available in memory)
50
+ >>> mask_frame_0 = result.masks[0] # numpy array (H, W) with 0 and 255
51
+ >>> # Optionally save to disk
52
+ >>> result = segmenter.segment(..., save_masks=True)
53
+ >>> mask_image = cv2.imread(result.mask_paths[0], cv2.IMREAD_GRAYSCALE)
54
+ """
55
+
56
+ SUPPORTED_IMAGE_FORMATS = (".jpg", ".jpeg", ".png")
57
+ DEFAULT_MEMORY_CLEANUP_INTERVAL = 10
58
+ DEFAULT_PROGRESS_LOG_INTERVAL = 20
59
+
60
+ # SAM encoder configurations
61
+ SAM_ENCODERS = {
62
+ # SAM 2.1 (Latest)
63
+ "tiny": "sam2.1/sam2.1_hiera_t.yaml",
64
+ "small": "sam2.1/sam2.1_hiera_s.yaml",
65
+ "base": "sam2.1/sam2.1_hiera_b+.yaml",
66
+ "large": "sam2.1/sam2.1_hiera_l.yaml",
67
+ # LongSAM 2.1 (Default, better temporal consistency for frame sequences)
68
+ "long-tiny": "longsam2.1/longsam2.1_hiera_t.yaml",
69
+ "long-small": "longsam2.1/longsam2.1_hiera_s.yaml",
70
+ "long-base": "longsam2.1/longsam2.1_hiera_b+.yaml",
71
+ "long-large": "longsam2.1/longsam2.1_hiera_l.yaml",
72
+ # SAM 2.0 (Legacy)
73
+ "legacy-tiny": "sam2/sam2_hiera_t.yaml",
74
+ "legacy-small": "sam2/sam2_hiera_s.yaml",
75
+ "legacy-base": "sam2/sam2_hiera_b+.yaml",
76
+ "legacy-large": "sam2/sam2_hiera_l.yaml",
77
+ }
78
+
79
+ def __init__(
80
+ self,
81
+ segmentation_model_path: str | None = None,
82
+ grounding_model_path: str | None = None,
83
+ sam_encoder: str = "long-large",
84
+ device: str | None = None,
85
+ default_output_dir: str = "./outputs",
86
+ model_config_overrides: dict[str, str] | None = None,
87
+ memory_cleanup_interval: int = 10,
88
+ ):
89
+ """
90
+ Initialize the segmenter.
91
+
92
+ Args:
93
+ segmentation_model_path: Path to SeC model directory. If None, auto-downloads from HuggingFace
94
+ grounding_model_path: Path to Florence-2 model directory. If None, auto-downloads when needed
95
+ sam_encoder: SAM encoder variant. Options:
96
+ - LongSAM 2.1 (best for temporal tracking): 'long-tiny', 'long-small', 'long-base', 'long-large' (default)
97
+ - SAM 2.1: 'tiny', 'small', 'base', 'large'
98
+ - SAM 2.0: 'legacy-tiny', 'legacy-small', 'legacy-base', 'legacy-large'
99
+ device: Device to use ('cuda' recommended). If None, auto-detects CUDA availability
100
+ default_output_dir: Default directory for segmentation outputs
101
+ model_config_overrides: Additional Hydra config overrides for the segmentation model
102
+ memory_cleanup_interval: Clean GPU memory every N frames (default: 10)
103
+
104
+ Environment Variables:
105
+ HF_HOME: HuggingFace cache directory (default: ~/.cache/huggingface)
106
+
107
+ Note:
108
+ Requires CUDA GPU with bfloat16 support. CPU inference is not supported.
109
+
110
+ Examples:
111
+ >>> segmenter = UniqueInstanceSegmenter()
112
+ >>> segmenter = UniqueInstanceSegmenter(sam_encoder="long-small")
113
+ >>> segmenter = UniqueInstanceSegmenter(segmentation_model_path="/path/to/SeC-4B")
114
+ >>> segmenter = UniqueInstanceSegmenter(device="cuda:1")
115
+ """
116
+ if sam_encoder not in self.SAM_ENCODERS:
117
+ available = ", ".join(f"'{k}'" for k in self.SAM_ENCODERS.keys())
118
+ raise ValueError(
119
+ f"Invalid sam_encoder: '{sam_encoder}'. Available options: {available}"
120
+ )
121
+
122
+ self.sam_encoder = sam_encoder
123
+ self.sam_config_path = self.SAM_ENCODERS[sam_encoder]
124
+ logger.info(f"Using SAM encoder: {sam_encoder} ({self.sam_config_path})")
125
+
126
+ if segmentation_model_path is not None:
127
+ self.segmentation_model_path = segmentation_model_path
128
+ self._auto_download_segmentation_model = False
129
+ logger.info(f"Using segmentation model from: {segmentation_model_path}")
130
+ else:
131
+ self.segmentation_model_path = None
132
+ self._auto_download_segmentation_model = True
133
+ logger.info("Segmentation model will auto-download on first use")
134
+
135
+ if grounding_model_path is not None:
136
+ self.grounding_model_path = grounding_model_path
137
+ self._auto_download_grounding_model = False
138
+ logger.info(f"Using grounding model from: {grounding_model_path}")
139
+ else:
140
+ self.grounding_model_path = None
141
+ self._auto_download_grounding_model = True
142
+
143
+ if device is not None:
144
+ self.device = device
145
+ else:
146
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
147
+ if self.device == "cpu":
148
+ logger.warning(
149
+ "No CUDA device detected. SeC-4B requires CUDA GPU with bfloat16 support. "
150
+ "Inference will likely fail on CPU."
151
+ )
152
+
153
+ self.default_output_dir = default_output_dir
154
+ self.memory_cleanup_interval = memory_cleanup_interval
155
+
156
+ base_overrides = {
157
+ "++model.non_overlap_masks": "false",
158
+ "++model.grounding_encoder_config": self.sam_config_path,
159
+ }
160
+ if model_config_overrides:
161
+ base_overrides.update(model_config_overrides)
162
+ self.model_config_overrides = base_overrides
163
+
164
+ self.segmentation_model = None
165
+ self.segmentation_tokenizer = None
166
+ self.grounding_model = None
167
+ self.grounding_processor = None
168
+
169
+ logger.info(f"UniqueInstanceSegmenter initialized with device: {self.device}")
170
+
171
+ def optimize_cuda_memory(self) -> None:
172
+ """
173
+ Optimize CUDA memory allocation to reduce fragmentation.
174
+
175
+ This method clears the CUDA cache and enables expandable memory segments,
176
+ which helps prevent Out-of-Memory errors when processing long frame sequences or
177
+ when GPU memory is limited. Only effective when using CUDA device.
178
+
179
+ Call this method before segmentation if you experience memory issues.
180
+ """
181
+ if self.device == "cuda":
182
+ torch.cuda.empty_cache()
183
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
184
+ logger.info("CUDA memory optimizations applied")
185
+
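# Hypothetical usage sketch: request the memory optimizations before a long run.
# Note that PyTorch generally reads PYTORCH_CUDA_ALLOC_CONF when the CUDA allocator
# is first initialized, so exporting it before the process starts (or before any
# CUDA call) may be necessary for expandable_segments to actually take effect.
# Paths, points and the annotation frame below are placeholders.
from eneas.segmentation import UniqueInstanceSegmenter

segmenter = UniqueInstanceSegmenter(sam_encoder="long-large")
segmenter.optimize_cuda_memory()
result = segmenter.segment(
    frames_path="./frames",
    points=[(100, 200)],
    annotation_frame="frame_0000.jpg",
)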
186
+ def _validate_inputs(
187
+ self,
188
+ frames_path: str | list[str],
189
+ points: list[tuple[int, int]],
190
+ labels: list[int] | None,
191
+ ) -> None:
192
+ """
193
+ Validate all input parameters.
194
+
195
+ Args:
196
+ frames_path: Path to frames directory or list of frame paths
197
+ points: List of (x, y) coordinates
198
+ labels: List of point labels (1 or 0)
199
+
200
+ Raises:
201
+ ValueError: If any input is invalid
202
+ FileNotFoundError: If frames_path doesn't exist
203
+ """
204
+ # Validate frames_path
205
+ if isinstance(frames_path, str):
206
+ if not os.path.isdir(frames_path):
207
+ raise FileNotFoundError(f"Frames directory not found: {frames_path}")
208
+ else:
209
+ raise NotImplementedError(
210
+ "List of frame paths is not yet implemented. "
211
+ "Please provide a directory path containing ordered frames."
212
+ )
213
+
214
+ # Validate points
215
+ if not points:
216
+ raise ValueError("At least one point must be provided")
217
+
218
+ if not all(isinstance(p, (tuple, list)) and len(p) == 2 for p in points):
219
+ raise ValueError("Each point must be a tuple or list of two integers (x, y)")
220
+
221
+ # Validate labels
222
+ if labels is not None:
223
+ if len(labels) != len(points):
224
+ raise ValueError(
225
+ f"Number of labels ({len(labels)}) must match number of points ({len(points)})"
226
+ )
227
+ if not all(label in (0, 1) for label in labels):
228
+ raise ValueError("Labels must be 0 (negative) or 1 (positive)")
229
+
230
+ def _load_segmentation_model(self):
231
+ """Load SeC segmentation model lazily on first use.
232
+
233
+ Raises:
234
+ ImportError: If SeC modules cannot be imported
235
+ FileNotFoundError: If model path doesn't exist
236
+ RuntimeError: If auto-download fails
237
+ """
238
+ if self.segmentation_model is not None:
239
+ return
240
+
241
+ if self._auto_download_segmentation_model:
242
+ logger.info("Auto-downloading SeC-4B model from HuggingFace...")
243
+ try:
244
+ model_manager = ModelManager()
245
+ downloaded_path = model_manager.download("OpenIXCLab/SeC-4B")
246
+ self.segmentation_model_path = str(downloaded_path)
247
+ logger.info(f"Model ready at: {downloaded_path}")
248
+ except Exception as e:
249
+ raise RuntimeError(
250
+ f"Auto-download failed: {e}\n\n"
251
+ "You can manually download the model:\n"
252
+ " 1. Visit: https://huggingface.co/OpenIXCLab/SeC-4B\n"
253
+ " 2. Download and extract\n"
254
+ " 3. Pass: UniqueInstanceSegmenter(segmentation_model_path='/path/to/SeC-4B')"
255
+ ) from e
256
+
257
+ logger.info(f"Loading SeC model from {self.segmentation_model_path}...")
258
+
259
+ model_path = Path(self.segmentation_model_path)
260
+ if not model_path.exists():
261
+ raise FileNotFoundError(
262
+ f"Model path not found: {self.segmentation_model_path}\n\n"
263
+ "Options:\n"
264
+ " 1. Auto-download: UniqueInstanceSegmenter()\n"
265
+ " 2. Pass parameter: UniqueInstanceSegmenter(segmentation_model_path='/path/to/SeC-4B')\n"
266
+ " 3. Manual download: https://huggingface.co/OpenIXCLab/SeC-4B"
267
+ )
268
+
269
+ try:
270
+ from transformers import AutoTokenizer
271
+
272
+ from eneas.vendor.SeC.inference.configuration_sec import SeCConfig
273
+ from eneas.vendor.SeC.inference.modeling_sec import SeCModel
274
+ except ImportError as e:
275
+ raise ImportError(
276
+ f"Failed to import SeC modules: {e}. "
277
+ "This is an internal error with the vendored SeC code."
278
+ ) from e
279
+
280
+ if self.device == "cuda":
281
+ torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
282
+
283
+ config = SeCConfig.from_pretrained(str(model_path), trust_remote_code=True)
284
+
285
+ hydra_overrides = [
286
+ f"++model.{k.replace('++model.', '')}={v}"
287
+ for k, v in self.model_config_overrides.items()
288
+ ]
289
+ config.hydra_overrides_extra = hydra_overrides
290
+
291
+ if hasattr(config, "vision_config"):
292
+ config.vision_config.use_flash_attn = False
293
+
294
+ self.segmentation_model = (
295
+ SeCModel.from_pretrained(
296
+ str(model_path), config=config, torch_dtype=torch.bfloat16, trust_remote_code=True
297
+ )
298
+ .eval()
299
+ .to(self.device)
300
+ )
301
+
302
+ self.segmentation_tokenizer = AutoTokenizer.from_pretrained(
303
+ str(model_path),
304
+ trust_remote_code=True,
305
+ )
306
+
307
+ logger.info("Model loaded successfully")
308
+
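For reference, `model_config_overrides` is flattened into Hydra-style `++model.*` strings before being attached to the config, as the list comprehension above shows. A minimal sketch of that mapping (the keys below are placeholders, not documented SeC/SAM2 options):

```python
# Illustrative only: reproduces the override-flattening logic above.
# The keys are placeholders, not verified SeC/SAM2 config options.
overrides = {"memory_bank_size": 16, "++model.image_size": 512}

hydra_overrides = [
    f"++model.{k.replace('++model.', '')}={v}"
    for k, v in overrides.items()
]
print(hydra_overrides)
# ['++model.memory_bank_size=16', '++model.image_size=512']
```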
309
+ def _load_grounding_model(self):
310
+ """Load grounding model lazily when needed for text-based segmentation.
311
+
312
+ Raises:
313
+ ImportError: If transformers cannot be imported
314
+ RuntimeError: If auto-download fails
315
+ """
316
+ if self.grounding_model is not None:
317
+ return
318
+
319
+ grounding_model_id = "microsoft/Florence-2-large"
320
+
321
+ if self._auto_download_grounding_model:
322
+ logger.info(
323
+ f"Auto-downloading grounding model ({grounding_model_id}) from HuggingFace..."
324
+ )
325
+ try:
326
+ model_manager = ModelManager()
327
+ downloaded_path = model_manager.download(grounding_model_id)
328
+ self.grounding_model_path = str(downloaded_path)
329
+ logger.info(f"Grounding model ready at: {downloaded_path}")
330
+ except Exception as e:
331
+ raise RuntimeError(
332
+ f"Auto-download failed: {e}\n\n"
333
+ "You can manually download the model:\n"
334
+ f" 1. Visit: https://huggingface.co/{grounding_model_id}\n"
335
+ " 2. Download and extract\n"
336
+ " 3. Pass: UniqueInstanceSegmenter(grounding_model_path='/path/to/model')"
337
+ ) from e
338
+
339
+ logger.info(f"Loading grounding model from {self.grounding_model_path}...")
340
+
341
+ from transformers import AutoModelForCausalLM, AutoProcessor
342
+
343
+ self.grounding_model = (
344
+ AutoModelForCausalLM.from_pretrained(
345
+ self.grounding_model_path, trust_remote_code=True, torch_dtype="auto"
346
+ )
347
+ .eval()
348
+ .to(self.device)
349
+ )
350
+
351
+ self.grounding_processor = AutoProcessor.from_pretrained(
352
+ self.grounding_model_path, trust_remote_code=True
353
+ )
354
+
355
+ logger.info("Grounding model loaded successfully")
356
+
357
+ def _text_to_bbox(self, text: str, frame_image: Image) -> list[float]:
358
+ """Use grounding model to detect object bounding box from text description.
359
+
360
+ Args:
361
+ text: Text description of the object
362
+ frame_image: PIL Image of the frame
363
+
364
+ Returns:
365
+ Bounding box [x1, y1, x2, y2]
366
+
367
+ Raises:
368
+ ValueError: If no objects found for the text
369
+ """
370
+ task_prompt = "<OPEN_VOCABULARY_DETECTION>"
371
+ prompt = task_prompt + text
372
+
373
+ inputs = self.grounding_processor(text=prompt, images=frame_image, return_tensors="pt").to(
374
+ self.device, torch.float16
375
+ )
376
+
377
+ generated_ids = self.grounding_model.generate(
378
+ input_ids=inputs["input_ids"],
379
+ pixel_values=inputs["pixel_values"],
380
+ max_new_tokens=1024,
381
+ early_stopping=False,
382
+ do_sample=False,
383
+ num_beams=3,
384
+ )
385
+
386
+ generated_text = self.grounding_processor.batch_decode(
387
+ generated_ids, skip_special_tokens=False
388
+ )[0]
389
+
390
+ parsed_answer = self.grounding_processor.post_process_generation(
391
+ generated_text, task=task_prompt, image_size=(frame_image.width, frame_image.height)
392
+ )
393
+
394
+ bboxes = parsed_answer["<OPEN_VOCABULARY_DETECTION>"]["bboxes"]
395
+ if not bboxes:
396
+ raise ValueError(f"Grounding model could not detect any objects for text: '{text}'")
397
+
398
+ bbox = bboxes[0]
399
+ logger.info(f"Grounding model detected bbox: {bbox} for text: '{text}'")
400
+
401
+ return bbox
402
+
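For orientation, the post-processed Florence-2 output that `_text_to_bbox` consumes typically has the shape sketched below; the key names are assumptions based on the Florence-2 processor and the values are made up:

```python
# Assumed structure of post_process_generation() output for the
# "<OPEN_VOCABULARY_DETECTION>" task; values are illustrative only.
parsed_answer = {
    "<OPEN_VOCABULARY_DETECTION>": {
        "bboxes": [[34.5, 120.0, 410.2, 560.8]],  # [x1, y1, x2, y2] per detection
        "bboxes_labels": ["a red backpack"],
    }
}

bbox = parsed_answer["<OPEN_VOCABULARY_DETECTION>"]["bboxes"][0]  # first hit is used
```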
403
+ def _get_frame_names(self, frames_dir: str) -> list[str]:
404
+ """Get sorted list of image files in directory.
405
+
406
+ Args:
407
+ frames_dir: Directory containing image frames
408
+
409
+ Returns:
410
+ Sorted list of frame filenames
411
+
412
+ Raises:
413
+ ValueError: If no valid image files found
414
+ """
415
+ frame_names = sorted(
416
+ [f for f in os.listdir(frames_dir) if f.lower().endswith(self.SUPPORTED_IMAGE_FORMATS)]
417
+ )
418
+
419
+ if not frame_names:
420
+ raise ValueError(
421
+ f"No image files found in {frames_dir}. "
422
+ f"Supported formats: {self.SUPPORTED_IMAGE_FORMATS}"
423
+ )
424
+
425
+ return frame_names
426
+
427
+ def _resolve_frame_index(
428
+ self, annotation_frame: str | None, frame_names: list[str]
429
+ ) -> tuple[int, str]:
430
+ """Resolve annotation frame name to index and full name.
431
+
432
+ Args:
433
+ annotation_frame: Name of annotation frame (or None for first frame)
434
+ frame_names: List of all available frame names
435
+
436
+ Returns:
437
+ Tuple of (frame_index, annotation_frame_name)
438
+
439
+ Raises:
440
+ ValueError: If annotation_frame is not found
441
+ """
442
+ if annotation_frame is None:
443
+ return 0, frame_names[0]
444
+
445
+ frame_basename = os.path.basename(annotation_frame)
446
+
447
+ if frame_basename not in frame_names:
448
+ raise ValueError(
449
+ f"Annotation frame '{frame_basename}' not found in frames directory. "
450
+ f"Available frames: {frame_names[:5]}... "
451
+ f"(total: {len(frame_names)})"
452
+ )
453
+
454
+ return frame_names.index(frame_basename), frame_basename
455
+
456
+ def _validate_points_in_bounds(
457
+ self, points: list[tuple[int, int]], image_shape: tuple[int, int, int]
458
+ ) -> None:
459
+ """Validate that all points are within image bounds.
460
+
461
+ Args:
462
+ points: List of (x, y) coordinates
463
+ image_shape: Image shape (height, width, channels)
464
+
465
+ Raises:
466
+ ValueError: If any point is out of bounds
467
+ """
468
+ height, width = image_shape[:2]
469
+
470
+ for i, (x, y) in enumerate(points):
471
+ if not (0 <= x < width and 0 <= y < height):
472
+ raise ValueError(
473
+ f"Point {i} at ({x}, {y}) is out of image bounds. Image size: {width}x{height}"
474
+ )
475
+
476
+ def segment(
477
+ self,
478
+ frames_path: str | list[str],
479
+ points: list[tuple[int, int]] | None = None,
480
+ annotation_frame: str | None = None,
481
+ labels: list[int] | None = None,
482
+ text: str | None = None,
483
+ output_dir: str | None = None,
484
+ offload_frames_to_gpu: bool = False,
485
+ save_masks: bool = False,
486
+ save_debug: bool = False,
487
+ ) -> SegmentationResult:
488
+ """
489
+ Segment a unique instance across multiple frames.
490
+
491
+ Args:
492
+ frames_path: Directory containing ordered frames
493
+ points: List of (x, y) coordinates (mutually exclusive with text)
494
+ annotation_frame: Frame to annotate (or None for first frame)
495
+ labels: Point labels 1=positive, 0=negative (only with points)
496
+ text: Text description of object (mutually exclusive with points)
497
+ output_dir: Output directory
498
+ offload_frames_to_gpu: Keep frames on the GPU (faster, but uses more VRAM)
499
+ save_masks: Save masks to disk
500
+ save_debug: Save debug visualizations (sam_debug/)
501
+
502
+ Returns:
503
+ SegmentationResult with binary masks
504
+
505
+ Raises:
506
+ ValueError: If inputs are invalid
507
+ FileNotFoundError: If paths don't exist
508
+ RuntimeError: If segmentation fails
509
+ """
510
+ if text is not None and points is not None:
511
+ raise ValueError("'text' and 'points' are mutually exclusive")
512
+ if text is None and points is None:
513
+ raise ValueError("Either 'text' or 'points' must be provided")
514
+ if text is not None and labels is not None:
515
+ raise ValueError("'labels' cannot be used with 'text'")
516
+
517
+ if output_dir is None:
518
+ output_dir = self.default_output_dir
519
+
520
+ if points is not None:
521
+ self._validate_inputs(frames_path, points, labels)
522
+ else:
523
+ if isinstance(frames_path, str):
524
+ if not os.path.isdir(frames_path):
525
+ raise FileNotFoundError(f"Frames directory not found: {frames_path}")
526
+
527
+ frames_dir = frames_path
528
+ frame_names = self._get_frame_names(frames_dir)
529
+ logger.info(f"Found {len(frame_names)} images")
530
+
531
+ frame_idx, annotation_frame = self._resolve_frame_index(annotation_frame, frame_names)
532
+
533
+ initial_frame_path = os.path.join(frames_dir, annotation_frame)
534
+ initial_frame = Image.open(initial_frame_path)
535
+ initial_frame_np = np.array(initial_frame)
536
+
537
+ logger.info(
538
+ f"Annotation frame: {annotation_frame} ({initial_frame_np.shape[1]}x{initial_frame_np.shape[0]})"
539
+ )
540
+
541
+ if text is not None:
542
+ logger.info(f"Using text-based grounding: '{text}'")
543
+ self._load_grounding_model()
544
+
545
+ bbox = self._text_to_bbox(text, initial_frame)
546
+ bbox_array = np.array(bbox, dtype=np.float32)
547
+
548
+ del self.grounding_model
549
+ del self.grounding_processor
550
+ self.grounding_model = None
551
+ self.grounding_processor = None
552
+ if self.device == "cuda":
553
+ torch.cuda.empty_cache()
554
+ logger.info("Grounding model unloaded from GPU")
555
+
556
+ self._load_segmentation_model()
557
+
558
+ # Start pure inference timer (after all model loading)
559
+ logger.info("Models loaded. Starting pure inference timer...")
560
+ inference_start_time = time.time()
561
+
562
+ if text is not None:
563
+ inference_state = self.segmentation_model.grounding_encoder.init_state(
564
+ video_path=frames_dir,
565
+ offload_video_to_cpu=not offload_frames_to_gpu,
566
+ offload_state_to_cpu=False,
567
+ )
568
+
569
+ logger.info("Processing bounding box...")
570
+ ann_obj_id = 1
571
+ _, out_obj_ids, out_mask_logits = (
572
+ self.segmentation_model.grounding_encoder.add_new_points_or_box(
573
+ inference_state=inference_state,
574
+ frame_idx=frame_idx,
575
+ obj_id=ann_obj_id,
576
+ box=bbox_array,
577
+ points=None,
578
+ labels=None,
579
+ clear_old_points=True,
580
+ )
581
+ )
582
+
583
+ init_mask = (out_mask_logits[0] > 0.0).cpu().numpy()
584
+
585
+ initial_mask_path = None
586
+ if save_masks:
587
+ os.makedirs(output_dir, exist_ok=True)
588
+ initial_mask_path = os.path.join(output_dir, "initial_mask.jpg")
589
+ self._save_initial_mask_with_bbox(
590
+ initial_frame_np, init_mask, bbox, text, frame_idx, initial_mask_path
591
+ )
592
+ logger.info(f"Initial mask saved: {initial_mask_path}")
593
+
594
+ metadata = {
595
+ "annotation_frame": annotation_frame,
596
+ "text": text,
597
+ "bbox": bbox,
598
+ "mode": "text-based",
599
+ "num_frames_total": len(frame_names),
600
+ "offload_frames_to_gpu": offload_frames_to_gpu,
601
+ }
602
+ else:
603
+ if labels is None:
604
+ labels = [1] * len(points)
605
+
606
+ logger.info(
607
+ f"Using {len(points)} points in frame '{annotation_frame}' (index {frame_idx})"
608
+ )
609
+
610
+ for i, ((x, y), label) in enumerate(zip(points, labels, strict=True)):
611
+ point_type = "POSITIVE" if label == 1 else "NEGATIVE"
612
+ logger.debug(f" Point {i + 1}: ({x}, {y}) - {point_type}")
613
+
614
+ self._validate_points_in_bounds(points, initial_frame_np.shape)
615
+
616
+ points_array = np.array(points, dtype=np.float32)
617
+ labels_array = np.array(labels, np.int32)
618
+
619
+ inference_state = self.segmentation_model.grounding_encoder.init_state(
620
+ video_path=frames_dir,
621
+ offload_video_to_cpu=not offload_frames_to_gpu,
622
+ offload_state_to_cpu=False,
623
+ )
624
+
625
+ logger.info("Processing initial points...")
626
+ ann_obj_id = 1
627
+ _, out_obj_ids, out_mask_logits = (
628
+ self.segmentation_model.grounding_encoder.add_new_points_or_box(
629
+ inference_state=inference_state,
630
+ frame_idx=frame_idx,
631
+ obj_id=ann_obj_id,
632
+ points=points_array,
633
+ labels=labels_array,
634
+ )
635
+ )
636
+
637
+ init_mask = (out_mask_logits[0] > 0.0).cpu().numpy()
638
+
639
+ initial_mask_path = None
640
+ if save_masks:
641
+ os.makedirs(output_dir, exist_ok=True)
642
+ initial_mask_path = os.path.join(output_dir, "initial_mask.jpg")
643
+ self._save_initial_mask(
644
+ initial_frame_np, init_mask, points, labels, frame_idx, initial_mask_path
645
+ )
646
+ logger.info(f"Initial mask saved: {initial_mask_path}")
647
+
648
+ metadata = {
649
+ "annotation_frame": annotation_frame,
650
+ "points": points,
651
+ "labels": labels,
652
+ "mode": "point-based",
653
+ "num_frames_total": len(frame_names),
654
+ "offload_frames_to_gpu": offload_frames_to_gpu,
655
+ }
656
+
657
+ # Propagate segmentation
658
+ frame_segments = self._propagate_segmentation(
659
+ inference_state, init_mask, frame_idx, len(frame_names)
660
+ )
661
+
662
+ logger.info(f"Propagation completed ({len(frame_segments)} frames)")
663
+
664
+ # Convert frame_segments to binary masks dictionary
665
+ binary_masks = {}
666
+ for frame_idx, segments in frame_segments.items():
667
+ # Get mask for object ID 1 (the single tracked instance)
668
+ mask = segments[1]
669
+ h, w = mask.shape[-2:]
670
+ mask_binary = mask.reshape(h, w).astype(np.uint8) * 255 # 0 or 255
671
+ binary_masks[frame_idx] = mask_binary
672
+
673
+ # Save SAM segmentation debug (overlay masks on images)
674
+ if save_debug:
675
+ sam_debug_dir = os.path.join(output_dir, "sam_debug")
676
+ os.makedirs(sam_debug_dir, exist_ok=True)
677
+
678
+ logger.info("Saving SAM debug visualizations...")
679
+ for frame_idx in sorted(binary_masks.keys()):
680
+ mask = binary_masks[frame_idx]
681
+
682
+ # Load original frame
683
+ frame_path = os.path.join(frames_dir, frame_names[frame_idx])
684
+ frame_img = cv2.imread(frame_path)
685
+
686
+ # Create overlay with blue color for mask
687
+ overlay = frame_img.copy()
688
+ overlay[mask > 0] = [255, 100, 0] # Blue in BGR
689
+
690
+ # Blend original image with overlay (60% original, 40% overlay)
691
+ blended = cv2.addWeighted(frame_img, 0.6, overlay, 0.4, 0)
692
+
693
+ # Save
694
+ debug_path = os.path.join(sam_debug_dir, frame_names[frame_idx])
695
+ cv2.imwrite(debug_path, blended)
696
+
697
+ logger.info(f"SAM debug visualizations saved to: {sam_debug_dir}")
698
+
699
+ # Save binary masks to disk if requested
700
+ mask_paths = []
701
+ if save_masks:
702
+ masks_dir = os.path.join(output_dir, "masks")
703
+ os.makedirs(masks_dir, exist_ok=True)
704
+
705
+ logger.info("Saving binary masks...")
706
+ for frame_idx in sorted(binary_masks.keys()):
707
+ mask = binary_masks[frame_idx]
708
+ mask_filename = (
709
+ frame_names[frame_idx].replace(".jpg", ".png").replace(".jpeg", ".png")
710
+ )
711
+ mask_path = os.path.join(masks_dir, mask_filename)
712
+
713
+ # Save as PNG (lossless, black & white)
714
+ cv2.imwrite(mask_path, mask)
715
+ mask_paths.append(mask_path)
716
+
717
+ logger.info(f"Masks saved to: {masks_dir}")
718
+
719
+ result = SegmentationResult(
720
+ masks=binary_masks,
721
+ num_frames=len(binary_masks),
722
+ output_dir=output_dir,
723
+ mask_paths=mask_paths,
724
+ metadata=metadata,
725
+ initial_mask_path=initial_mask_path,
726
+ )
727
+
728
+ # Calculate and log pure inference stats
729
+ inference_end_time = time.time()
730
+ pure_inference_time = inference_end_time - inference_start_time
731
+ pure_fps = len(binary_masks) / pure_inference_time if pure_inference_time > 0 else 0.0
732
+
733
+ logger.info("Segmentation completed successfully!")
734
+ logger.info(f"Generated {len(binary_masks)} binary masks")
735
+ logger.info("==================================================")
736
+ logger.info("Pure Inference Stats:")
737
+ logger.info(f" Total Time: {pure_inference_time:.4f}s")
738
+ logger.info(f" FPS: {pure_fps:.2f}")
739
+ logger.info(
740
+ f" Latency per frame: {1 / pure_fps:.4f}s"
741
+ if pure_fps > 0
742
+ else " Latency per frame: N/A"
743
+ )
744
+ logger.info("==================================================")
745
+
746
+ return result
747
+
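A minimal usage sketch of `segment()`, assuming a directory of ordered frames; the import path, file paths and coordinates are illustrative, while the parameter names match the signature above:

```python
# Hypothetical call to UniqueInstanceSegmenter.segment(); paths and
# coordinates are illustrative. Models are downloaded and loaded lazily
# on the first call.
from eneas.segmentation import UniqueInstanceSegmenter  # import path assumed

segmenter = UniqueInstanceSegmenter()  # auto-downloads SeC-4B on first use

# Point-based mode: one positive click on the object in the first frame
result = segmenter.segment(
    frames_path="data/clip_01/frames",
    points=[(320, 240)],
    labels=[1],
    save_masks=True,
    output_dir="outputs/clip_01",
)
print(result.num_frames, result.output_dir)

# Text-based mode (mutually exclusive with points/labels)
result = segmenter.segment(
    frames_path="data/clip_01/frames",
    text="a red backpack",
)
first_mask = result.masks[0]  # uint8 array with values 0 or 255
```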
748
+ def _propagate_segmentation(
749
+ self, inference_state, init_mask: np.ndarray, frame_idx: int, total_frames: int
750
+ ) -> dict[int, dict[int, np.ndarray]]:
751
+ """
752
+ Propagate segmentation bidirectionally from initial frame.
753
+
754
+ Args:
755
+ inference_state: SeC inference state
756
+ init_mask: Initial segmentation mask
757
+ frame_idx: Index of initial frame
758
+ total_frames: Total number of frames
759
+
760
+ Returns:
761
+ Dictionary mapping frame indices to segmentation masks
762
+ """
763
+ logger.info(f"Propagating segmentation across {total_frames} frames...")
764
+ frame_segments = {}
765
+
766
+ # Forward propagation
767
+ logger.info(f" Forward propagation from frame {frame_idx}...")
768
+ frame_count = 0
769
+
770
+ for (
771
+ out_frame_idx,
772
+ out_obj_ids,
773
+ out_mask_logits,
774
+ ) in self.segmentation_model.propagate_in_video(
775
+ inference_state,
776
+ start_frame_idx=frame_idx,
777
+ reverse=False,
778
+ init_mask=init_mask,
779
+ tokenizer=self.segmentation_tokenizer,
780
+ ):
781
+ frame_segments[out_frame_idx] = {
782
+ out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
783
+ for i, out_obj_id in enumerate(out_obj_ids)
784
+ }
785
+ frame_count += 1
786
+
787
+ # Periodic cleanup
788
+ if frame_count % self.memory_cleanup_interval == 0:
789
+ torch.cuda.empty_cache()
790
+ gc.collect()
791
+
792
+ # Progress logging
793
+ if frame_count % self.DEFAULT_PROGRESS_LOG_INTERVAL == 0:
794
+ logger.info(f" Processed {frame_count} frames...")
795
+
796
+ logger.info(f" Forward propagation completed ({frame_count} frames)")
797
+
798
+ # Backward propagation
799
+ if frame_idx > 0:
800
+ logger.info(f" Backward propagation from frame {frame_idx - 1}...")
801
+ frame_count = 0
802
+
803
+ for (
804
+ out_frame_idx,
805
+ out_obj_ids,
806
+ out_mask_logits,
807
+ ) in self.segmentation_model.propagate_in_video(
808
+ inference_state,
809
+ start_frame_idx=frame_idx - 1,
810
+ reverse=True,
811
+ init_mask=init_mask,
812
+ tokenizer=self.segmentation_tokenizer,
813
+ ):
814
+ frame_segments[out_frame_idx] = {
815
+ out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
816
+ for i, out_obj_id in enumerate(out_obj_ids)
817
+ }
818
+ frame_count += 1
819
+
820
+ # Periodic cleanup
821
+ if frame_count % self.memory_cleanup_interval == 0:
822
+ torch.cuda.empty_cache()
823
+ gc.collect()
824
+
825
+ # Progress logging
826
+ if frame_count % self.DEFAULT_PROGRESS_LOG_INTERVAL == 0:
827
+ logger.info(f" Processed {frame_count} frames...")
828
+
829
+ logger.info(f" Backward propagation completed ({frame_count} frames)")
830
+
831
+ return frame_segments
832
+
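The mapping returned above has one entry per visited frame, keyed by frame index and then by object id; `segment()` flattens it into per-frame binary masks roughly as follows (array shapes are illustrative):

```python
# Structure returned by _propagate_segmentation:
#   {frame_idx: {obj_id: boolean mask of shape (1, H, W)}}
# Only object id 1 is tracked, so the caller reduces it like this:
import numpy as np

frame_segments = {0: {1: np.zeros((1, 480, 640), dtype=bool)}}  # illustrative

binary_masks = {
    idx: segs[1].reshape(segs[1].shape[-2:]).astype(np.uint8) * 255
    for idx, segs in frame_segments.items()
}
print(binary_masks[0].shape)  # (480, 640), values 0 or 255
```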
833
+ def _save_initial_mask(
834
+ self,
835
+ frame: np.ndarray,
836
+ mask: np.ndarray,
837
+ points: list[tuple[int, int]],
838
+ labels: list[int],
839
+ frame_idx: int,
840
+ output_path: str,
841
+ ) -> None:
842
+ """Save visualization of initial mask with annotated points.
843
+
844
+ Args:
845
+ frame: Original frame image (RGB format)
846
+ mask: Segmentation mask
847
+ points: List of annotation points
848
+ labels: Point labels (1=positive, 0=negative)
849
+ frame_idx: Frame index
850
+ output_path: Path to save visualization
851
+ """
852
+ # Convert RGB to BGR for OpenCV
853
+ vis_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
854
+
855
+ # Create mask overlay (blue color)
856
+ h, w = mask.shape[-2:]
857
+ mask_binary = mask.reshape(h, w).astype(bool)
858
+ overlay = vis_frame.copy()
859
+ overlay[mask_binary] = [255, 144, 30] # BGR: blue overlay
860
+
861
+ # Blend original and overlay (60% original, 40% overlay)
862
+ vis_frame = cv2.addWeighted(vis_frame, 0.6, overlay, 0.4, 0)
863
+
864
+ # Draw points
865
+ for (x, y), label in zip(points, labels, strict=True):
866
+ if label == 1:
867
+ # Green star for positive points
868
+ color = (0, 255, 0) # BGR
869
+ else:
870
+ # Red star for negative points
871
+ color = (0, 0, 255) # BGR
872
+
873
+ # Draw star marker
874
+ cv2.drawMarker(
875
+ vis_frame,
876
+ (int(x), int(y)),
877
+ color,
878
+ markerType=cv2.MARKER_STAR,
879
+ markerSize=15,
880
+ thickness=2,
881
+ )
882
+
883
+ # Add white border to marker for visibility
884
+ cv2.drawMarker(
885
+ vis_frame,
886
+ (int(x), int(y)),
887
+ (255, 255, 255),
888
+ markerType=cv2.MARKER_STAR,
889
+ markerSize=17,
890
+ thickness=1,
891
+ )
892
+
893
+ # Add title text
894
+ title = f"Initial Mask (Frame {frame_idx})"
895
+ font = cv2.FONT_HERSHEY_SIMPLEX
896
+ font_scale = 1.0
897
+ thickness = 2
898
+
899
+ # Get text size for background
900
+ (text_width, text_height), baseline = cv2.getTextSize(title, font, font_scale, thickness)
901
+
902
+ # Draw text background (semi-transparent black)
903
+ cv2.rectangle(
904
+ vis_frame,
905
+ (5, 5),
906
+ (15 + text_width, 15 + text_height + baseline),
907
+ (0, 0, 0),
908
+ -1,
909
+ )
910
+
911
+ # Draw text
912
+ cv2.putText(
913
+ vis_frame,
914
+ title,
915
+ (10, 10 + text_height),
916
+ font,
917
+ font_scale,
918
+ (255, 255, 255),
919
+ thickness,
920
+ cv2.LINE_AA,
921
+ )
922
+
923
+ cv2.imwrite(output_path, vis_frame)
924
+
925
+ def _save_initial_mask_with_bbox(
926
+ self,
927
+ frame: np.ndarray,
928
+ mask: np.ndarray,
929
+ bbox: list[float],
930
+ text: str,
931
+ frame_idx: int,
932
+ output_path: str,
933
+ ) -> None:
934
+ """Save visualization of initial mask with bounding box (text mode).
935
+
936
+ Args:
937
+ frame: Original frame (RGB)
938
+ mask: Segmentation mask
939
+ bbox: Bounding box [x1, y1, x2, y2]
940
+ text: Text description
941
+ frame_idx: Frame index
942
+ output_path: Save path
943
+ """
944
+ vis_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
945
+
946
+ h, w = mask.shape[-2:]
947
+ mask_binary = mask.reshape(h, w).astype(bool)
948
+ overlay = vis_frame.copy()
949
+ overlay[mask_binary] = [255, 144, 30]
950
+ vis_frame = cv2.addWeighted(vis_frame, 0.6, overlay, 0.4, 0)
951
+
952
+ x1, y1, x2, y2 = map(int, bbox)
953
+ cv2.rectangle(vis_frame, (x1, y1), (x2, y2), (0, 255, 0), thickness=3)
954
+
955
+ label = f"'{text}'"
956
+ font = cv2.FONT_HERSHEY_SIMPLEX
957
+ font_scale = 0.7
958
+ thickness = 2
959
+
960
+ (text_width, text_height), baseline = cv2.getTextSize(label, font, font_scale, thickness)
961
+ text_y = max(y1 - 10, text_height + 5)
962
+
963
+ cv2.rectangle(
964
+ vis_frame,
965
+ (x1, text_y - text_height - 5),
966
+ (x1 + text_width + 5, text_y + baseline),
967
+ (0, 255, 0),
968
+ -1,
969
+ )
970
+
971
+ cv2.putText(
972
+ vis_frame,
973
+ label,
974
+ (x1 + 2, text_y - 2),
975
+ font,
976
+ font_scale,
977
+ (0, 0, 0),
978
+ thickness,
979
+ cv2.LINE_AA,
980
+ )
981
+
982
+ title = f"Initial Mask (Frame {frame_idx}) - Text-based"
983
+ (title_width, title_height), baseline = cv2.getTextSize(title, font, 1.0, 2)
984
+
985
+ cv2.rectangle(
986
+ vis_frame, (5, 5), (15 + title_width, 15 + title_height + baseline), (0, 0, 0), -1
987
+ )
988
+
989
+ cv2.putText(
990
+ vis_frame, title, (10, 10 + title_height), font, 1.0, (255, 255, 255), 2, cv2.LINE_AA
991
+ )
992
+
993
+ cv2.imwrite(output_path, vis_frame)
eneas/segmentation/utils.py ADDED
@@ -0,0 +1,418 @@
1
+ """
2
+ Utility functions for segmentation tasks.
3
+
4
+ Includes NMS, IoU calculation, text conversion utilities, and image masking.
5
+ """
6
+
7
+ import base64
8
+ import io
9
+ import logging
10
+
11
+ import cv2
12
+ import inflect
13
+ import numpy as np
14
+ import spacy
15
+ from PIL import Image
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Load spacy model and inflect engine
20
+ _nlp = spacy.load("en_core_web_sm")
21
+ _inflect_engine = inflect.engine()
22
+
23
+
24
+ def smart_convert_to_singular(text: str) -> str:
25
+ """
26
+ Convert ONLY nouns to singular, preserving articles, adjectives, etc.
27
+ Uses spaCy to detect nouns + inflect to convert.
28
+
29
+ Examples:
30
+ "people" → "person"
31
+ "real people" → "real person"
32
+ "the blue chairs" → "the blue chair"
33
+ "chairs/stool" → "chair/stool"
34
+ "windows & doors" → "window & door"
35
+
36
+ Args:
37
+ text: Text to convert (can contain multiple words and separators)
38
+
39
+ Returns:
40
+ Text with nouns in singular
41
+ """
42
+ if not text:
43
+ return text
44
+
45
+ # Handle common separators (/, &, and)
46
+ for sep in ["/", " & ", " and "]:
47
+ if sep in text:
48
+ parts = [smart_convert_to_singular(part.strip()) for part in text.split(sep)]
49
+ return sep.join(parts)
50
+
51
+ # Process with spaCy: detect nouns
52
+ doc = _nlp(text)
53
+ result = []
54
+
55
+ for token in doc:
56
+ if token.pos_ == "NOUN": # Only nouns
57
+ # Detect if already singular
58
+ singular = _inflect_engine.singular_noun(token.text)
59
+ if singular: # Was plural → convert to singular
60
+ result.append(singular)
61
+ else: # Already singular → keep
62
+ result.append(token.text)
63
+ else:
64
+ result.append(token.text) # Articles, adjectives, etc. unchanged
65
+
66
+ return " ".join(result)
67
+
68
+
69
+ def smart_convert_to_plural(text: str) -> str:
70
+ """
71
+ Convert ONLY nouns to plural, preserving articles, adjectives, etc.
72
+ Uses spaCy to detect nouns + inflect to convert.
73
+
74
+ Examples:
75
+ "person" → "people"
76
+ "real person" → "real people"
77
+ "the blue chair" → "the blue chairs"
78
+ "chair/stool" → "chairs/stools"
79
+ "window & door" → "windows & doors"
80
+
81
+ Args:
82
+ text: Text to convert (can contain multiple words and separators)
83
+
84
+ Returns:
85
+ Text with nouns in plural
86
+ """
87
+ if not text:
88
+ return text
89
+
90
+ # Handle common separators (/, &, and)
91
+ for sep in ["/", " & ", " and "]:
92
+ if sep in text:
93
+ parts = [smart_convert_to_plural(part.strip()) for part in text.split(sep)]
94
+ return sep.join(parts)
95
+
96
+ # Process with spaCy: detect nouns
97
+ doc = _nlp(text)
98
+ result = []
99
+
100
+ for token in doc:
101
+ if token.pos_ == "NOUN": # Only nouns
102
+ # Detect if already plural
103
+ singular = _inflect_engine.singular_noun(token.text)
104
+ if singular: # Already plural → keep
105
+ result.append(token.text)
106
+ else: # Was singular → convert to plural
107
+ result.append(_inflect_engine.plural(token.text))
108
+ else:
109
+ result.append(token.text) # Articles, adjectives, etc. unchanged
110
+
111
+ return " ".join(result)
112
+
113
+
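Expected behaviour of the two helpers, assuming the spaCy model `en_core_web_sm` is installed; the outputs follow the docstring examples, though exact tagging can vary slightly across spaCy versions:

```python
# Requires the spaCy model "en_core_web_sm" to be available locally.
from eneas.segmentation.utils import (  # import path assumed
    smart_convert_to_plural,
    smart_convert_to_singular,
)

print(smart_convert_to_singular("the blue chairs"))  # "the blue chair"
print(smart_convert_to_singular("windows & doors"))  # "window & door"
print(smart_convert_to_plural("real person"))        # "real people"
print(smart_convert_to_plural("chair/stool"))        # "chairs/stools"
```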
114
+ def calculate_iou(box1: list, box2: list) -> float:
115
+ """Calculate Intersection over Union (IoU) between two bounding boxes.
116
+
117
+ Args:
118
+ box1: First bounding box [x1, y1, x2, y2]
119
+ box2: Second bounding box [x1, y1, x2, y2]
120
+
121
+ Returns:
122
+ IoU value between 0 and 1
123
+ """
124
+ x1_min, y1_min, x1_max, y1_max = box1
125
+ x2_min, y2_min, x2_max, y2_max = box2
126
+
127
+ # Calculate intersection area
128
+ inter_x_min = max(x1_min, x2_min)
129
+ inter_y_min = max(y1_min, y2_min)
130
+ inter_x_max = min(x1_max, x2_max)
131
+ inter_y_max = min(y1_max, y2_max)
132
+
133
+ inter_width = max(0, inter_x_max - inter_x_min)
134
+ inter_height = max(0, inter_y_max - inter_y_min)
135
+ inter_area = inter_width * inter_height
136
+
137
+ # Calculate union area
138
+ box1_area = (x1_max - x1_min) * (y1_max - y1_min)
139
+ box2_area = (x2_max - x2_min) * (y2_max - y2_min)
140
+ union_area = box1_area + box2_area - inter_area
141
+
142
+ if union_area == 0:
143
+ return 0.0
144
+
145
+ return inter_area / union_area
146
+
147
+
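A quick worked example of the IoU computation above:

```python
from eneas.segmentation.utils import calculate_iou  # import path assumed

box1 = [0, 0, 10, 10]   # area 100
box2 = [5, 5, 15, 15]   # area 100, overlapping box1 in a 5x5 region
# intersection = 25, union = 100 + 100 - 25 = 175
print(calculate_iou(box1, box2))  # 25 / 175 ≈ 0.143
```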
148
+ def calculate_box_area(box: list) -> float:
149
+ """Calculate area of a bounding box.
150
+
151
+ Args:
152
+ box: Bounding box [x1, y1, x2, y2]
153
+
154
+ Returns:
155
+ Area of the box
156
+ """
157
+ x1, y1, x2, y2 = box
158
+ return (x2 - x1) * (y2 - y1)
159
+
160
+
161
+ def non_max_suppression(
162
+ bboxes: list, labels: list, iou_threshold: float = 0.70
163
+ ) -> tuple[list, list]:
164
+ """Apply Non-Maximum Suppression without scores.
165
+
166
+ When two boxes overlap (IoU > threshold), keep the larger box.
167
+
168
+ Args:
169
+ bboxes: List of bounding boxes [x1, y1, x2, y2]
170
+ labels: List of labels
171
+ iou_threshold: IoU threshold for considering boxes as duplicates (default: 0.70)
172
+
173
+ Returns:
174
+ Tuple of (filtered_bboxes, filtered_labels)
175
+ """
176
+ if len(bboxes) <= 1:
177
+ return bboxes, labels
178
+
179
+ # Calculate areas for all boxes
180
+ areas = [calculate_box_area(box) for box in bboxes]
181
+
182
+ # Sort by area (largest first)
183
+ sorted_indices = sorted(range(len(bboxes)), key=lambda i: areas[i], reverse=True)
184
+
185
+ keep_indices = []
186
+ suppressed = set()
187
+
188
+ for idx in sorted_indices:
189
+ if idx in suppressed:
190
+ continue
191
+
192
+ keep_indices.append(idx)
193
+
194
+ # Suppress smaller boxes that overlap with this one
195
+ for other_idx in sorted_indices:
196
+ if other_idx == idx or other_idx in suppressed:
197
+ continue
198
+
199
+ iou = calculate_iou(bboxes[idx], bboxes[other_idx])
200
+ if iou > iou_threshold:
201
+ suppressed.add(other_idx)
202
+ logger.debug(
203
+ f"NMS: Suppressing box {other_idx} (area={areas[other_idx]:.1f}) "
204
+ f"due to overlap (IoU={iou:.3f}) with box {idx} (area={areas[idx]:.1f})"
205
+ )
206
+
207
+ # Return boxes in original order (keeping only non-suppressed ones)
208
+ keep_indices_sorted = sorted(keep_indices)
209
+ filtered_bboxes = [bboxes[i] for i in keep_indices_sorted]
210
+ filtered_labels = [labels[i] for i in keep_indices_sorted]
211
+
212
+ return filtered_bboxes, filtered_labels
213
+
214
+
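Since this NMS variant has no confidence scores, overlaps are resolved purely by box area. A small sketch of that behaviour:

```python
from eneas.segmentation.utils import non_max_suppression  # import path assumed

bboxes = [[0, 0, 100, 100], [10, 10, 100, 100]]  # IoU = 8100 / 10000 = 0.81
labels = ["chair", "chair"]

kept_boxes, kept_labels = non_max_suppression(bboxes, labels, iou_threshold=0.70)
print(kept_boxes)   # [[0, 0, 100, 100]] - the larger box wins
print(kept_labels)  # ['chair']
```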
215
+ def mask_overlapping_boxes(
216
+ crop_image: Image.Image,
217
+ current_bbox: list,
218
+ all_bboxes: list,
219
+ current_idx: int,
220
+ crop_coords: tuple,
221
+ max_mask_percentage: float = 60.0,
222
+ ) -> Image.Image:
223
+ """Mask overlapping regions from other boxes in the current crop.
224
+
225
+ Paints black the regions of other boxes that overlap with the current box.
226
+ If masking exceeds max_mask_percentage, returns original crop unmasked.
227
+
228
+ Args:
229
+ crop_image: PIL Image of the crop
230
+ current_bbox: Bbox of current box [x1, y1, x2, y2] in original coordinates
231
+ all_bboxes: List of all bboxes in original coordinates
232
+ current_idx: Index of the current box
233
+ crop_coords: Crop coordinates in original image (crop_x1, crop_y1, crop_x2, crop_y2)
234
+ max_mask_percentage: Maximum allowed masking percentage (default: 60.0)
235
+
236
+ Returns:
237
+ PIL Image with overlapping regions masked (or unmasked if exceeds threshold)
238
+ """
239
+ crop_x1, crop_y1, crop_x2, crop_y2 = [int(c) for c in crop_coords]
240
+ curr_x1, curr_y1, curr_x2, curr_y2 = [int(c) for c in current_bbox]
241
+
242
+ crop_array = np.array(crop_image)
243
+ total_pixels = crop_array.shape[0] * crop_array.shape[1]
244
+
245
+ masked_crop_array = crop_array.copy()
246
+ masked_pixels_count = 0
247
+
248
+ for idx, other_bbox in enumerate(all_bboxes):
249
+ if idx == current_idx:
250
+ continue
251
+
252
+ other_x1, other_y1, other_x2, other_y2 = [int(coord) for coord in other_bbox]
253
+
254
+ # Check if there's overlap between current box and other box
255
+ if not (
256
+ other_x2 < curr_x1 or other_x1 > curr_x2 or other_y2 < curr_y1 or other_y1 > curr_y2
257
+ ):
258
+ # Calculate intersection region
259
+ intersect_x1 = max(curr_x1, other_x1)
260
+ intersect_y1 = max(curr_y1, other_y1)
261
+ intersect_x2 = min(curr_x2, other_x2)
262
+ intersect_y2 = min(curr_y2, other_y2)
263
+
264
+ # Check if intersection falls within the crop
265
+ if not (
266
+ intersect_x2 < crop_x1
267
+ or intersect_x1 > crop_x2
268
+ or intersect_y2 < crop_y1
269
+ or intersect_y1 > crop_y2
270
+ ):
271
+ # Convert to local crop coordinates
272
+ mask_x1 = int(max(0, intersect_x1 - crop_x1))
273
+ mask_y1 = int(max(0, intersect_y1 - crop_y1))
274
+ mask_x2 = int(min(masked_crop_array.shape[1], intersect_x2 - crop_x1))
275
+ mask_y2 = int(min(masked_crop_array.shape[0], intersect_y2 - crop_y1))
276
+
277
+ region_pixels = (mask_y2 - mask_y1) * (mask_x2 - mask_x1)
278
+ masked_pixels_count += region_pixels
279
+
280
+ # Paint black the overlapping region
281
+ masked_crop_array[mask_y1:mask_y2, mask_x1:mask_x2] = 0
282
+
283
+ mask_percentage = (masked_pixels_count / total_pixels) * 100 if total_pixels > 0 else 0
284
+
285
+ if mask_percentage > max_mask_percentage:
286
+ logger.debug(
287
+ f"Masking {mask_percentage:.1f}% > {max_mask_percentage}% - using unmasked crop"
288
+ )
289
+ return crop_image
290
+ elif mask_percentage > 0:
291
+ logger.debug(f"Masking {mask_percentage:.1f}% applied")
292
+ return Image.fromarray(masked_crop_array)
293
+ else:
294
+ return crop_image
295
+
296
+
297
+ def expand_crop_to_minimum_size(
298
+ crop_image: Image.Image, bbox: list, image_original: Image.Image, min_size: int = 32
299
+ ) -> Image.Image:
300
+ """Expand crop to minimum required size by taking pixels from original image.
301
+
302
+ Required for VLMs that need minimum input dimensions (e.g., Qwen3-VL requires 32x32).
303
+
304
+ Args:
305
+ crop_image: PIL Image of the current crop
306
+ bbox: Original bbox [x1, y1, x2, y2]
307
+ image_original: Full original PIL Image
308
+ min_size: Minimum required size (default: 32)
309
+
310
+ Returns:
311
+ Expanded PIL Image that meets min_size × min_size
312
+ """
313
+ width, height = crop_image.size
314
+
315
+ # If already meets minimum size, return unchanged
316
+ if width >= min_size and height >= min_size:
317
+ return crop_image
318
+
319
+ x1, y1, x2, y2 = bbox
320
+ img_width, img_height = image_original.size
321
+
322
+ # Calculate expansion needed
323
+ needed_width = max(0, min_size - width)
324
+ needed_height = max(0, min_size - height)
325
+
326
+ # Horizontal expansion (try symmetric, respect borders)
327
+ expand_left = needed_width // 2
328
+ expand_right = needed_width - expand_left
329
+
330
+ if x1 - expand_left < 0:
331
+ deficit = expand_left - x1
332
+ expand_left = x1
333
+ expand_right += deficit
334
+
335
+ if x2 + expand_right > img_width:
336
+ deficit = (x2 + expand_right) - img_width
337
+ expand_right = img_width - x2
338
+ expand_left += deficit
339
+ if x1 - expand_left < 0:
340
+ expand_left = x1
341
+
342
+ # Vertical expansion
343
+ expand_top = needed_height // 2
344
+ expand_bottom = needed_height - expand_top
345
+
346
+ if y1 - expand_top < 0:
347
+ deficit = expand_top - y1
348
+ expand_top = y1
349
+ expand_bottom += deficit
350
+
351
+ if y2 + expand_bottom > img_height:
352
+ deficit = (y2 + expand_bottom) - img_height
353
+ expand_bottom = img_height - y2
354
+ expand_top += deficit
355
+ if y1 - expand_top < 0:
356
+ expand_top = y1
357
+
358
+ # Calculate new coordinates
359
+ new_x1 = max(0, x1 - expand_left)
360
+ new_y1 = max(0, y1 - expand_top)
361
+ new_x2 = min(img_width, x2 + expand_right)
362
+ new_y2 = min(img_height, y2 + expand_bottom)
363
+
364
+ # Extract expanded crop
365
+ expanded_crop = image_original.crop((new_x1, new_y1, new_x2, new_y2))
366
+
367
+ logger.debug(
368
+ f"Crop expanded: {width}×{height} → {expanded_crop.size[0]}×{expanded_crop.size[1]}"
369
+ )
370
+
371
+ return expanded_crop
372
+
373
+
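A worked example of the expansion logic, using a small crop pinned to the image corner so the growth has to spill in one direction:

```python
from PIL import Image

from eneas.segmentation.utils import expand_crop_to_minimum_size  # import path assumed

image = Image.new("RGB", (100, 100))
bbox = [0, 0, 20, 20]             # 20x20 detection in the top-left corner
crop = image.crop((0, 0, 20, 20))

expanded = expand_crop_to_minimum_size(crop, bbox, image, min_size=32)
print(expanded.size)  # (32, 32): growth goes right/down because the corner blocks left/up
```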
374
+ def image_to_base64_data_uri(image: Image.Image) -> str:
375
+ """Convert PIL Image to base64 data URI for VLM.
376
+
377
+ Args:
378
+ image: PIL Image to convert
379
+
380
+ Returns:
381
+ Data URI string (data:image/jpeg;base64,...)
382
+
383
+ Example:
384
+ >>> from PIL import Image
385
+ >>> img = Image.new('RGB', (100, 100), color='red')
386
+ >>> uri = image_to_base64_data_uri(img)
387
+ >>> uri.startswith('data:image/jpeg;base64,')
388
+ True
389
+ """
390
+ img_byte_arr = io.BytesIO()
391
+ image.save(img_byte_arr, format="JPEG", quality=95)
392
+ img_bytes = img_byte_arr.getvalue()
393
+ img_base64 = base64.b64encode(img_bytes).decode("utf-8")
394
+ return f"data:image/jpeg;base64,{img_base64}"
395
+
396
+
397
+ def draw_bboxes(image: Image.Image, bboxes: list) -> Image.Image:
398
+ """Draw red bounding boxes on image.
399
+
400
+ Args:
401
+ image: PIL Image
402
+ bboxes: List of bounding boxes [[x1, y1, x2, y2], ...]
403
+
404
+ Returns:
405
+ PIL Image with bboxes drawn (new copy)
406
+ """
407
+ # Convert PIL to cv2
408
+ img_array = np.array(image)
409
+ img_cv2 = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
410
+
411
+ # Draw red boxes
412
+ for bbox in bboxes:
413
+ x1, y1, x2, y2 = [int(coord) for coord in bbox]
414
+ cv2.rectangle(img_cv2, (x1, y1), (x2, y2), color=(0, 0, 255), thickness=3)
415
+
416
+ # Convert back to PIL
417
+ img_rgb = cv2.cvtColor(img_cv2, cv2.COLOR_BGR2RGB)
418
+ return Image.fromarray(img_rgb)
eneas/vendor/.DS_Store ADDED
Binary file (6.15 kB).
 
eneas/vendor/SeC/.DS_Store ADDED
Binary file (8.2 kB).
 
eneas/vendor/SeC/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
eneas/vendor/SeC/inference/.DS_Store ADDED
Binary file (6.15 kB).
 
eneas/vendor/SeC/inference/__pycache__/configuration_intern_vit.cpython-312.pyc ADDED
Binary file (5.72 kB).
 
eneas/vendor/SeC/inference/__pycache__/configuration_internlm2.cpython-312.pyc ADDED
Binary file (6.4 kB).
 
eneas/vendor/SeC/inference/__pycache__/configuration_sec.cpython-312.pyc ADDED
Binary file (5.06 kB).
 
eneas/vendor/SeC/inference/__pycache__/flash_attention.cpython-312.pyc ADDED
Binary file (3.81 kB).
 
eneas/vendor/SeC/inference/__pycache__/modeling_intern_vit.cpython-312.pyc ADDED
Binary file (22.8 kB).
 
eneas/vendor/SeC/inference/__pycache__/modeling_internlm2.cpython-312.pyc ADDED
Binary file (67.7 kB).
 
eneas/vendor/SeC/inference/__pycache__/modeling_sec.cpython-312.pyc ADDED
Binary file (39.4 kB).
 
eneas/vendor/SeC/inference/__pycache__/sam2_video_predictor.cpython-312.pyc ADDED
Binary file (15.4 kB).
 
eneas/vendor/SeC/inference/__pycache__/templates.cpython-312.pyc ADDED
Binary file (4.57 kB).
 
eneas/vendor/SeC/inference/configuration_intern_vit.py ADDED
@@ -0,0 +1,120 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import os
8
+ from typing import Union
9
+
10
+ from transformers.configuration_utils import PretrainedConfig
11
+ from transformers.utils import logging
12
+
13
+ logger = logging.get_logger(__name__)
14
+
15
+
16
+ class InternVisionConfig(PretrainedConfig):
17
+ r"""
18
+ This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
19
+ instantiate a vision encoder according to the specified arguments, defining the model architecture.
20
+
21
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
22
+ documentation from [`PretrainedConfig`] for more information.
23
+
24
+ Args:
25
+ num_channels (`int`, *optional*, defaults to 3):
26
+ Number of color channels in the input images (e.g., 3 for RGB).
27
+ patch_size (`int`, *optional*, defaults to 14):
28
+ The size (resolution) of each patch.
29
+ image_size (`int`, *optional*, defaults to 224):
30
+ The size (resolution) of each image.
31
+ qkv_bias (`bool`, *optional*, defaults to `False`):
32
+ Whether to add a bias to the queries and values in the self-attention layers.
33
+ hidden_size (`int`, *optional*, defaults to 3200):
34
+ Dimensionality of the encoder layers and the pooler layer.
35
+ num_attention_heads (`int`, *optional*, defaults to 25):
36
+ Number of attention heads for each attention layer in the Transformer encoder.
37
+ intermediate_size (`int`, *optional*, defaults to 12800):
38
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
39
+ qk_normalization (`bool`, *optional*, defaults to `True`):
40
+ Whether to normalize the queries and keys in the self-attention layers.
41
+ num_hidden_layers (`int`, *optional*, defaults to 48):
42
+ Number of hidden layers in the Transformer encoder.
43
+ use_flash_attn (`bool`, *optional*, defaults to `True`):
44
+ Whether to use flash attention mechanism.
45
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
46
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
47
+ `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
48
+ layer_norm_eps (`float`, *optional*, defaults to 1e-6):
49
+ The epsilon used by the layer normalization layers.
50
+ dropout (`float`, *optional*, defaults to 0.0):
51
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
52
+ drop_path_rate (`float`, *optional*, defaults to 0.0):
53
+ Dropout rate for stochastic depth.
54
+ attention_dropout (`float`, *optional*, defaults to 0.0):
55
+ The dropout ratio for the attention probabilities.
56
+ initializer_range (`float`, *optional*, defaults to 0.02):
57
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
58
+ initializer_factor (`float`, *optional*, defaults to 0.1):
59
+ A factor for layer scale.
60
+ """
61
+
62
+ model_type = 'intern_vit_6b'
63
+
64
+ def __init__(
65
+ self,
66
+ num_channels=3,
67
+ patch_size=14,
68
+ image_size=224,
69
+ qkv_bias=False,
70
+ hidden_size=3200,
71
+ num_attention_heads=25,
72
+ intermediate_size=12800,
73
+ qk_normalization=True,
74
+ num_hidden_layers=48,
75
+ use_flash_attn=True,
76
+ hidden_act='gelu',
77
+ norm_type='rms_norm',
78
+ layer_norm_eps=1e-6,
79
+ dropout=0.0,
80
+ drop_path_rate=0.0,
81
+ attention_dropout=0.0,
82
+ initializer_range=0.02,
83
+ initializer_factor=0.1,
84
+ **kwargs,
85
+ ):
86
+ super().__init__(**kwargs)
87
+
88
+ self.hidden_size = hidden_size
89
+ self.intermediate_size = intermediate_size
90
+ self.dropout = dropout
91
+ self.drop_path_rate = drop_path_rate
92
+ self.num_hidden_layers = num_hidden_layers
93
+ self.num_attention_heads = num_attention_heads
94
+ self.num_channels = num_channels
95
+ self.patch_size = patch_size
96
+ self.image_size = image_size
97
+ self.initializer_range = initializer_range
98
+ self.initializer_factor = initializer_factor
99
+ self.attention_dropout = attention_dropout
100
+ self.layer_norm_eps = layer_norm_eps
101
+ self.hidden_act = hidden_act
102
+ self.norm_type = norm_type
103
+ self.qkv_bias = qkv_bias
104
+ self.qk_normalization = qk_normalization
105
+ self.use_flash_attn = use_flash_attn
106
+
107
+ @classmethod
108
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
109
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
110
+
111
+ if 'vision_config' in config_dict:
112
+ config_dict = config_dict['vision_config']
113
+
114
+ if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
115
+ logger.warning(
116
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
117
+ f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
118
+ )
119
+
120
+ return cls.from_dict(config_dict, **kwargs)
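For completeness, the config can also be instantiated directly with keyword overrides; a minimal sketch using the vendored import path:

```python
# Minimal sketch; the import path mirrors the vendored file location.
from eneas.vendor.SeC.inference.configuration_intern_vit import InternVisionConfig

config = InternVisionConfig(image_size=448, use_flash_attn=False)
print(config.model_type, config.image_size)  # intern_vit_6b 448
```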
eneas/vendor/SeC/inference/configuration_internlm2.py ADDED
@@ -0,0 +1,150 @@
1
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # This code is based on transformers/src/transformers/models/llama/configuration_llama.py
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ InternLM2 model configuration"""
17
+
18
+ from transformers.configuration_utils import PretrainedConfig
19
+ from transformers.utils import logging
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+ INTERNLM2_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
24
+
25
+
26
+ # Modified from transformers.model.llama.configuration_llama.LlamaConfig
27
+ class InternLM2Config(PretrainedConfig):
28
+ r"""
29
+ This is the configuration class to store the configuration of a [`InternLM2Model`]. It is used to instantiate
30
+ an InternLM2 model according to the specified arguments, defining the model architecture. Instantiating a
31
+ configuration with the defaults will yield a similar configuration to that of the InternLM2-7B.
32
+
33
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
34
+ documentation from [`PretrainedConfig`] for more information.
35
+
36
+
37
+ Args:
38
+ vocab_size (`int`, *optional*, defaults to 32000):
39
+ Vocabulary size of the InternLM2 model. Defines the number of different tokens that can be represented by the
40
+ `inputs_ids` passed when calling [`InternLM2Model`]
41
+ hidden_size (`int`, *optional*, defaults to 4096):
42
+ Dimension of the hidden representations.
43
+ intermediate_size (`int`, *optional*, defaults to 11008):
44
+ Dimension of the MLP representations.
45
+ num_hidden_layers (`int`, *optional*, defaults to 32):
46
+ Number of hidden layers in the Transformer encoder.
47
+ num_attention_heads (`int`, *optional*, defaults to 32):
48
+ Number of attention heads for each attention layer in the Transformer encoder.
49
+ num_key_value_heads (`int`, *optional*):
50
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
51
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
52
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
53
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
54
+ by meanpooling all the original heads within that group. For more details checkout [this
55
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
56
+ `num_attention_heads`.
57
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
58
+ The non-linear activation function (function or string) in the decoder.
59
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
60
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
61
+ just in case (e.g., 512 or 1024 or 2048).
62
+ initializer_range (`float`, *optional*, defaults to 0.02):
63
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
64
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
65
+ The epsilon used by the rms normalization layers.
66
+ use_cache (`bool`, *optional*, defaults to `True`):
67
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
68
+ relevant if `config.is_decoder=True`.
69
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
70
+ Whether to tie weight embeddings
71
+ Example:
72
+
73
+ """
74
+ model_type = 'internlm2'
75
+ _auto_class = 'AutoConfig'
76
+
77
+ def __init__( # pylint: disable=W0102
78
+ self,
79
+ vocab_size=103168,
80
+ hidden_size=4096,
81
+ intermediate_size=11008,
82
+ num_hidden_layers=32,
83
+ num_attention_heads=32,
84
+ num_key_value_heads=None,
85
+ hidden_act='silu',
86
+ max_position_embeddings=2048,
87
+ initializer_range=0.02,
88
+ rms_norm_eps=1e-6,
89
+ use_cache=True,
90
+ pad_token_id=0,
91
+ bos_token_id=1,
92
+ eos_token_id=2,
93
+ tie_word_embeddings=False,
94
+ bias=True,
95
+ rope_theta=10000,
96
+ rope_scaling=None,
97
+ attn_implementation='eager',
98
+ **kwargs,
99
+ ):
100
+ self.vocab_size = vocab_size
101
+ self.max_position_embeddings = max_position_embeddings
102
+ self.hidden_size = hidden_size
103
+ self.intermediate_size = intermediate_size
104
+ self.num_hidden_layers = num_hidden_layers
105
+ self.num_attention_heads = num_attention_heads
106
+ self.bias = bias
107
+
108
+ if num_key_value_heads is None:
109
+ num_key_value_heads = num_attention_heads
110
+ self.num_key_value_heads = num_key_value_heads
111
+
112
+ self.hidden_act = hidden_act
113
+ self.initializer_range = initializer_range
114
+ self.rms_norm_eps = rms_norm_eps
115
+ self.use_cache = use_cache
116
+ self.rope_theta = rope_theta
117
+ self.rope_scaling = rope_scaling
118
+ self._rope_scaling_validation()
119
+
120
+ self.attn_implementation = attn_implementation
121
+ if self.attn_implementation is None:
122
+ self.attn_implementation = 'eager'
123
+ super().__init__(
124
+ pad_token_id=pad_token_id,
125
+ bos_token_id=bos_token_id,
126
+ eos_token_id=eos_token_id,
127
+ tie_word_embeddings=tie_word_embeddings,
128
+ **kwargs,
129
+ )
130
+
131
+ def _rope_scaling_validation(self):
132
+ """
133
+ Validate the `rope_scaling` configuration.
134
+ """
135
+ if self.rope_scaling is None:
136
+ return
137
+
138
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
139
+ raise ValueError(
140
+ '`rope_scaling` must be a dictionary with two fields, `type` and `factor`, '
141
+ f'got {self.rope_scaling}'
142
+ )
143
+ rope_scaling_type = self.rope_scaling.get('type', None)
144
+ rope_scaling_factor = self.rope_scaling.get('factor', None)
145
+ if rope_scaling_type is None or rope_scaling_type not in ['linear', 'dynamic']:
146
+ raise ValueError(
147
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
148
+ )
149
+ if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
150
+ raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
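Usage sketch (editor note, not part of the committed file): `_rope_scaling_validation` above accepts exactly a two-field dict and rejects anything else; the values below are illustrative and the import path is assumed.

```python
# Minimal sketch of the rope_scaling contract enforced above:
# type must be 'linear' or 'dynamic', factor a float >= 1.0.
from eneas.vendor.SeC.inference.configuration_internlm2 import InternLM2Config

ok = InternLM2Config(rope_scaling={'type': 'dynamic', 'factor': 2.0})

try:
    InternLM2Config(rope_scaling={'type': 'yarn', 'factor': 2.0})  # unsupported type
except ValueError as err:
    print(err)
```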
eneas/vendor/SeC/inference/configuration_phi3.py ADDED
@@ -0,0 +1,211 @@
1
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """ Phi-3 model configuration"""
16
+
17
+
18
+ from transformers.configuration_utils import PretrainedConfig
19
+ from transformers.utils import logging
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+ PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP = {
24
+ 'microsoft/Phi-3-mini-4k-instruct': 'https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/config.json',
25
+ 'microsoft/Phi-3-mini-128k-instruct': 'https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/config.json',
26
+ }
27
+
28
+
29
+ class Phi3Config(PretrainedConfig):
30
+ r"""
31
+ This is the configuration class to store the configuration of a [`Phi3Model`]. It is used to instantiate a Phi-3
32
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
33
+ defaults will yield a similar configuration to that of the
34
+ [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct).
35
+
36
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
37
+ documentation from [`PretrainedConfig`] for more information.
38
+
39
+ Args:
40
+ vocab_size (`int`, *optional*, defaults to 32064):
41
+ Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the
42
+ `inputs_ids` passed when calling [`Phi3Model`].
43
+ hidden_size (`int`, *optional*, defaults to 3072):
44
+ Dimension of the hidden representations.
45
+ intermediate_size (`int`, *optional*, defaults to 8192):
46
+ Dimension of the MLP representations.
47
+ num_hidden_layers (`int`, *optional*, defaults to 32):
48
+ Number of hidden layers in the Transformer decoder.
49
+ num_attention_heads (`int`, *optional*, defaults to 32):
50
+ Number of attention heads for each attention layer in the Transformer decoder.
51
+ num_key_value_heads (`int`, *optional*):
52
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
53
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
54
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
55
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
56
+ by meanpooling all the original heads within that group. For more details checkout [this
57
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
58
+ `num_attention_heads`.
59
+ resid_pdrop (`float`, *optional*, defaults to 0.0):
60
+ Dropout probability for mlp outputs.
61
+ embd_pdrop (`int`, *optional*, defaults to 0.0):
62
+ The dropout ratio for the embeddings.
63
+ attention_dropout (`float`, *optional*, defaults to 0.0):
64
+ The dropout ratio after computing the attention scores.
65
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
66
+ The non-linear activation function (function or string) in the decoder.
67
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
68
+ The maximum sequence length that this model might ever be used with.
69
+ original_max_position_embeddings (`int`, *optional*, defaults to 4096):
70
+ The maximum sequence length that this model was trained with. This is used to determine the size of the
71
+ original RoPE embeddings when using long scaling.
72
+ initializer_range (`float`, *optional*, defaults to 0.02):
73
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
74
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
75
+ The epsilon value used for the RMSNorm.
76
+ use_cache (`bool`, *optional*, defaults to `True`):
77
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
78
+ relevant if `config.is_decoder=True`.
79
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
80
+ Whether to tie weight embeddings
81
+ rope_theta (`float`, *optional*, defaults to 10000.0):
82
+ The base period of the RoPE embeddings.
83
+ rope_scaling (`dict`, *optional*):
84
+ The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
85
+ contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either `su` or `yarn` and
86
+ the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
87
+ divided by the number of attention heads divided by 2.
88
+ bos_token_id (`int`, *optional*, defaults to 1):
89
+ The id of the "beginning-of-sequence" token.
90
+ eos_token_id (`int`, *optional*, defaults to 32000):
91
+ The id of the "end-of-sequence" token.
92
+ pad_token_id (`int`, *optional*, defaults to 32000):
93
+ The id of the padding token.
94
+ sliding_window (`int`, *optional*):
95
+ Sliding window attention window size. If `None`, no sliding window is applied.
96
+
97
+ Example:
98
+
99
+ ```python
100
+ >>> from transformers import Phi3Model, Phi3Config
101
+
102
+ >>> # Initializing a Phi-3 style configuration
103
+ >>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
104
+
105
+ >>> # Initializing a model from the configuration
106
+ >>> model = Phi3Model(configuration)
107
+
108
+ >>> # Accessing the model configuration
109
+ >>> configuration = model.config
110
+ ```"""
111
+
112
+ model_type = 'phi3'
113
+ keys_to_ignore_at_inference = ['past_key_values']
114
+
115
+ def __init__(
116
+ self,
117
+ vocab_size=32064,
118
+ hidden_size=3072,
119
+ intermediate_size=8192,
120
+ num_hidden_layers=32,
121
+ num_attention_heads=32,
122
+ num_key_value_heads=None,
123
+ resid_pdrop=0.0,
124
+ embd_pdrop=0.0,
125
+ attention_dropout=0.0,
126
+ hidden_act='silu',
127
+ max_position_embeddings=4096,
128
+ original_max_position_embeddings=4096,
129
+ initializer_range=0.02,
130
+ rms_norm_eps=1e-5,
131
+ use_cache=True,
132
+ tie_word_embeddings=False,
133
+ rope_theta=10000.0,
134
+ rope_scaling=None,
135
+ bos_token_id=1,
136
+ eos_token_id=32000,
137
+ pad_token_id=32000,
138
+ sliding_window=None,
139
+ **kwargs,
140
+ ):
141
+ self.vocab_size = vocab_size
142
+ self.hidden_size = hidden_size
143
+ self.intermediate_size = intermediate_size
144
+ self.num_hidden_layers = num_hidden_layers
145
+ self.num_attention_heads = num_attention_heads
146
+
147
+ if num_key_value_heads is None:
148
+ num_key_value_heads = num_attention_heads
149
+
150
+ self.num_key_value_heads = num_key_value_heads
151
+ self.resid_pdrop = resid_pdrop
152
+ self.embd_pdrop = embd_pdrop
153
+ self.attention_dropout = attention_dropout
154
+ self.hidden_act = hidden_act
155
+ self.max_position_embeddings = max_position_embeddings
156
+ self.original_max_position_embeddings = original_max_position_embeddings
157
+ self.initializer_range = initializer_range
158
+ self.rms_norm_eps = rms_norm_eps
159
+ self.use_cache = use_cache
160
+ self.rope_theta = rope_theta
161
+ self.rope_scaling = rope_scaling
162
+ self._rope_scaling_validation()
163
+ self.sliding_window = sliding_window
164
+
165
+ super().__init__(
166
+ bos_token_id=bos_token_id,
167
+ eos_token_id=eos_token_id,
168
+ pad_token_id=pad_token_id,
169
+ tie_word_embeddings=tie_word_embeddings,
170
+ **kwargs,
171
+ )
172
+
173
+ def _rope_scaling_validation(self):
174
+ """
175
+ Validate the `rope_scaling` configuration.
176
+ """
177
+ if self.rope_scaling is None:
178
+ return
179
+
180
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
181
+ raise ValueError(
182
+ '`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, '
183
+ f'got {self.rope_scaling}'
184
+ )
185
+ rope_scaling_type = self.rope_scaling.get('type', None)
186
+ rope_scaling_short_factor = self.rope_scaling.get('short_factor', None)
187
+ rope_scaling_long_factor = self.rope_scaling.get('long_factor', None)
188
+ if rope_scaling_type is None or rope_scaling_type not in ['su', 'yarn']:
189
+ raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")
190
+ if not (
191
+ isinstance(rope_scaling_short_factor, list)
192
+ and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
193
+ ):
194
+ raise ValueError(
195
+ f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
196
+ )
197
+ if not len(rope_scaling_short_factor) == self.hidden_size // self.num_attention_heads // 2:
198
+ raise ValueError(
199
+ f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}"
200
+ )
201
+ if not (
202
+ isinstance(rope_scaling_long_factor, list)
203
+ and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
204
+ ):
205
+ raise ValueError(
206
+ f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
207
+ )
208
+ if not len(rope_scaling_long_factor) == self.hidden_size // self.num_attention_heads // 2:
209
+ raise ValueError(
210
+ f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}"
211
+ )
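Usage sketch (editor note, not part of the committed file): the length check above ties `short_factor`/`long_factor` to the per-head rotary dimension. With the defaults (`hidden_size=3072`, `num_attention_heads=32`) that is 48 entries each; the import path is assumed.

```python
# Minimal sketch of a rope_scaling dict that passes Phi3Config validation.
from eneas.vendor.SeC.inference.configuration_phi3 import Phi3Config

n = 3072 // 32 // 2                    # hidden_size // num_attention_heads // 2 = 48
cfg = Phi3Config(rope_scaling={
    'type': 'su',
    'short_factor': [1.0] * n,
    'long_factor': [1.5] * n,
})
print(len(cfg.rope_scaling['long_factor']))   # 48
```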
eneas/vendor/SeC/inference/configuration_sec.py ADDED
@@ -0,0 +1,124 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import copy
8
+
9
+ from .configuration_internlm2 import InternLM2Config
10
+
11
+ # from .configuration_phi3 import Phi3Config # Not used by SeC-4B
12
+ from transformers import AutoConfig, LlamaConfig, Qwen2Config
13
+ from transformers.configuration_utils import PretrainedConfig
14
+ from transformers.utils import logging
15
+
16
+ from .configuration_intern_vit import InternVisionConfig
17
+
18
+ logger = logging.get_logger(__name__)
19
+
20
+
21
+ class SeCConfig(PretrainedConfig):
22
+ model_type = "sec"
23
+ is_composition = True
24
+
25
+ def __init__(
26
+ self,
27
+ vision_config=None,
28
+ llm_config=None,
29
+ use_backbone_lora=0,
30
+ use_llm_lora=0,
31
+ pad2square=False,
32
+ select_layer=-1,
33
+ force_image_size=None,
34
+ downsample_ratio=0.5,
35
+ template=None,
36
+ dynamic_image_size=False,
37
+ use_thumbnail=False,
38
+ ps_version="v1",
39
+ min_dynamic_patch=1,
40
+ max_dynamic_patch=6,
41
+ grounding_encoder_config="sam2.1/sam2.1_hiera_l.yaml",
42
+ grounding_maskmem_num=22,
43
+ **kwargs,
44
+ ):
45
+ super().__init__(**kwargs)
46
+ if vision_config is None:
47
+ vision_config = {}
48
+ logger.info(
49
+ "vision_config is None. Initializing the InternVisionConfig with default values."
50
+ )
51
+
52
+ if llm_config is None:
53
+ llm_config = {}
54
+ logger.info(
55
+ "llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`)."
56
+ )
57
+
58
+ self.vision_config = InternVisionConfig(**vision_config)
59
+
60
+ # Patched by eneas: handle empty llm_config (no 'architectures' key)
61
+ if not llm_config or "architectures" not in llm_config:
62
+ self.llm_config = LlamaConfig(**llm_config)
63
+ elif llm_config["architectures"][0] == "LlamaForCausalLM":
64
+ self.llm_config = LlamaConfig(**llm_config)
65
+ elif llm_config["architectures"][0] == "InternLM2ForCausalLM":
66
+ self.llm_config = InternLM2Config(**llm_config)
67
+ # elif llm_config["architectures"][0] == "Phi3ForCausalLM":
68
+ # self.llm_config = Phi3Config(**llm_config) # Not used by SeC-4B
69
+ elif llm_config["architectures"][0] == "Qwen2ForCausalLM":
70
+ self.llm_config = Qwen2Config(**llm_config)
71
+ else:
72
+ raise ValueError("Unsupported architecture: {}".format(llm_config["architectures"][0]))
73
+ self.use_backbone_lora = use_backbone_lora
74
+ self.use_llm_lora = use_llm_lora
75
+ self.pad2square = pad2square
76
+ self.select_layer = select_layer
77
+ self.force_image_size = force_image_size
78
+ self.downsample_ratio = downsample_ratio
79
+ self.template = template
80
+ self.dynamic_image_size = dynamic_image_size
81
+ self.use_thumbnail = use_thumbnail
82
+ self.ps_version = ps_version # pixel shuffle version
83
+ self.min_dynamic_patch = min_dynamic_patch
84
+ self.max_dynamic_patch = max_dynamic_patch
85
+
86
+ self.hidden_size = self.llm_config.hidden_size
87
+ self.tie_word_embeddings = False
88
+
89
+ self.grounding_encoder_config = grounding_encoder_config
90
+ self.grounding_maskmem_num = grounding_maskmem_num
91
+
92
+ logger.info(f"vision_select_layer: {self.select_layer}")
93
+ logger.info(f"ps_version: {self.ps_version}")
94
+ logger.info(f"min_dynamic_patch: {self.min_dynamic_patch}")
95
+ logger.info(f"max_dynamic_patch: {self.max_dynamic_patch}")
96
+
97
+ def to_dict(self):
98
+ """
99
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
100
+
101
+ Returns:
102
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
103
+ """
104
+ output = copy.deepcopy(self.__dict__)
105
+ output["vision_config"] = self.vision_config.to_dict()
106
+ output["llm_config"] = self.llm_config.to_dict()
107
+ output["model_type"] = self.__class__.model_type
108
+ output["use_backbone_lora"] = self.use_backbone_lora
109
+ output["use_llm_lora"] = self.use_llm_lora
110
+ output["pad2square"] = self.pad2square
111
+ output["select_layer"] = self.select_layer
112
+ output["force_image_size"] = self.force_image_size
113
+ output["downsample_ratio"] = self.downsample_ratio
114
+ output["template"] = self.template
115
+ output["dynamic_image_size"] = self.dynamic_image_size
116
+ output["use_thumbnail"] = self.use_thumbnail
117
+ output["ps_version"] = self.ps_version
118
+ output["min_dynamic_patch"] = self.min_dynamic_patch
119
+ output["max_dynamic_patch"] = self.max_dynamic_patch
120
+
121
+ output["grounding_encoder_config"] = self.grounding_encoder_config
122
+ output["grounding_maskmem_num"] = self.grounding_maskmem_num
123
+
124
+ return output
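Usage sketch (editor note, not part of the committed file): the eneas patch above lets `SeCConfig` accept an empty `llm_config` (no `architectures` key) by falling back to `LlamaConfig`; the import path is assumed as before.

```python
# Minimal sketch: an empty llm_config no longer raises, it falls back to LlamaConfig.
from eneas.vendor.SeC.inference.configuration_sec import SeCConfig

cfg = SeCConfig(vision_config={}, llm_config={})
print(type(cfg.llm_config).__name__)                    # LlamaConfig
print(cfg.hidden_size == cfg.llm_config.hidden_size)    # True
```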
eneas/vendor/SeC/inference/flash_attention.py ADDED
@@ -0,0 +1,76 @@
1
+ # https://github.com/Dao-AILab/flash-attention/blob/v0.2.8/flash_attn/flash_attention.py
2
+ import torch
3
+ import torch.nn as nn
4
+ from einops import rearrange
5
+
6
+ try: # v1
7
+ from flash_attn.flash_attn_interface import \
8
+ flash_attn_unpadded_qkvpacked_func
9
+ except: # v2
10
+ from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
11
+
12
+ from flash_attn.bert_padding import pad_input, unpad_input
13
+
14
+
15
+ class FlashAttention(nn.Module):
16
+ """Implement the scaled dot product attention with softmax.
17
+ Arguments
18
+ ---------
19
+ softmax_scale: The temperature to use for the softmax attention.
20
+ (default: 1/sqrt(d_keys) where d_keys is computed at
21
+ runtime)
22
+ attention_dropout: The dropout rate to apply to the attention
23
+ (default: 0.0)
24
+ """
25
+
26
+ def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
27
+ super().__init__()
28
+ self.softmax_scale = softmax_scale
29
+ self.dropout_p = attention_dropout
30
+
31
+ def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
32
+ max_s=None, need_weights=False):
33
+ """Implements the multihead softmax attention.
34
+ Arguments
35
+ ---------
36
+ qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
37
+ if unpadded: (nnz, 3, h, d)
38
+ key_padding_mask: a bool tensor of shape (B, S)
39
+ """
40
+ assert not need_weights
41
+ assert qkv.dtype in [torch.float16, torch.bfloat16]
42
+ assert qkv.is_cuda
43
+
44
+ if cu_seqlens is None:
45
+ batch_size = qkv.shape[0]
46
+ seqlen = qkv.shape[1]
47
+ if key_padding_mask is None:
48
+ qkv = rearrange(qkv, 'b s ... -> (b s) ...')
49
+ max_s = seqlen
50
+ cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
51
+ device=qkv.device)
52
+ output = flash_attn_unpadded_qkvpacked_func(
53
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
54
+ softmax_scale=self.softmax_scale, causal=causal
55
+ )
56
+ output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
57
+ else:
58
+ nheads = qkv.shape[-2]
59
+ x = rearrange(qkv, 'b s three h d -> b s (three h d)')
60
+ x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
61
+ x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
62
+ output_unpad = flash_attn_unpadded_qkvpacked_func(
63
+ x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
64
+ softmax_scale=self.softmax_scale, causal=causal
65
+ )
66
+ output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
67
+ indices, batch_size, seqlen),
68
+ 'b s (h d) -> b s h d', h=nheads)
69
+ else:
70
+ assert max_s is not None
71
+ output = flash_attn_unpadded_qkvpacked_func(
72
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
73
+ softmax_scale=self.softmax_scale, causal=causal
74
+ )
75
+
76
+ return output, None
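Usage sketch (editor note, not part of the committed file): `FlashAttention.forward` expects packed QKV of shape `(B, S, 3, H, D)` in fp16/bf16 on a CUDA device. The sizes below follow the InternViT defaults (25 heads, head dim 128) and assume the `flash-attn` package plus a GPU are available.

```python
# Minimal sketch of the packed-QKV call path (cu_seqlens=None, no padding mask).
import torch
from eneas.vendor.SeC.inference.flash_attention import FlashAttention

attn = FlashAttention(attention_dropout=0.0).cuda().eval()
qkv = torch.randn(2, 197, 3, 25, 128, dtype=torch.float16, device='cuda')  # B S 3 H D
out, _ = attn(qkv, causal=False)
print(out.shape)   # torch.Size([2, 197, 25, 128])
```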
eneas/vendor/SeC/inference/modeling_intern_vit.py ADDED
@@ -0,0 +1,364 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ from typing import Optional, Tuple, Union
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import torch.utils.checkpoint
12
+ from einops import rearrange
13
+ from timm.models.layers import DropPath
14
+ from torch import nn
15
+ from transformers.activations import ACT2FN
16
+ from transformers.modeling_outputs import (BaseModelOutput,
17
+ BaseModelOutputWithPooling)
18
+ from transformers.modeling_utils import PreTrainedModel
19
+ from transformers.utils import logging
20
+
21
+ from .configuration_intern_vit import InternVisionConfig
22
+
23
+ try:
24
+ from .flash_attention import FlashAttention
25
+ has_flash_attn = True
26
+ except:
27
+ print('FlashAttention is not installed.')
28
+ has_flash_attn = False
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ class InternRMSNorm(nn.Module):
34
+ def __init__(self, hidden_size, eps=1e-6):
35
+ super().__init__()
36
+ self.weight = nn.Parameter(torch.ones(hidden_size))
37
+ self.variance_epsilon = eps
38
+
39
+ def forward(self, hidden_states):
40
+ input_dtype = hidden_states.dtype
41
+ hidden_states = hidden_states.to(torch.float32)
42
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
43
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
44
+ return self.weight * hidden_states.to(input_dtype)
45
+
46
+
47
+ try:
48
+ from apex.normalization import FusedRMSNorm
49
+
50
+ InternRMSNorm = FusedRMSNorm # noqa
51
+
52
+ logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
53
+ except ImportError:
54
+ # using the normal InternRMSNorm
55
+ pass
56
+ except Exception:
57
+ logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
58
+ pass
59
+
60
+
61
+ NORM2FN = {
62
+ 'rms_norm': InternRMSNorm,
63
+ 'layer_norm': nn.LayerNorm,
64
+ }
65
+
66
+
67
+ class InternVisionEmbeddings(nn.Module):
68
+ def __init__(self, config: InternVisionConfig):
69
+ super().__init__()
70
+ self.config = config
71
+ self.embed_dim = config.hidden_size
72
+ self.image_size = config.image_size
73
+ self.patch_size = config.patch_size
74
+
75
+ self.class_embedding = nn.Parameter(
76
+ torch.randn(1, 1, self.embed_dim),
77
+ )
78
+
79
+ self.patch_embedding = nn.Conv2d(
80
+ in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
81
+ )
82
+
83
+ self.num_patches = (self.image_size // self.patch_size) ** 2
84
+ self.num_positions = self.num_patches + 1
85
+
86
+ self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
87
+
88
+ def _get_pos_embed(self, pos_embed, H, W):
89
+ target_dtype = pos_embed.dtype
90
+ pos_embed = pos_embed.float().reshape(
91
+ 1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
92
+ pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). \
93
+ reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
94
+ return pos_embed
95
+
96
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
97
+ target_dtype = self.patch_embedding.weight.dtype
98
+ patch_embeds = self.patch_embedding(pixel_values) # shape = [*, channel, width, height]
99
+ batch_size, _, height, width = patch_embeds.shape
100
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
101
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
102
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
103
+ position_embedding = torch.cat([
104
+ self.position_embedding[:, :1, :],
105
+ self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)
106
+ ], dim=1)
107
+ embeddings = embeddings + position_embedding.to(target_dtype)
108
+ return embeddings
109
+
110
+
111
+ class InternAttention(nn.Module):
112
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
113
+
114
+ def __init__(self, config: InternVisionConfig):
115
+ super().__init__()
116
+ self.config = config
117
+ self.embed_dim = config.hidden_size
118
+ self.num_heads = config.num_attention_heads
119
+ self.use_flash_attn = config.use_flash_attn and has_flash_attn
120
+ if config.use_flash_attn and not has_flash_attn:
121
+ print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
122
+ self.head_dim = self.embed_dim // self.num_heads
123
+ if self.head_dim * self.num_heads != self.embed_dim:
124
+ raise ValueError(
125
+ f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
126
+ f' {self.num_heads}).'
127
+ )
128
+
129
+ self.scale = self.head_dim ** -0.5
130
+ self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
131
+ self.attn_drop = nn.Dropout(config.attention_dropout)
132
+ self.proj_drop = nn.Dropout(config.dropout)
133
+
134
+ self.qk_normalization = config.qk_normalization
135
+
136
+ if self.qk_normalization:
137
+ self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
138
+ self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
139
+
140
+ if self.use_flash_attn:
141
+ self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
142
+ self.proj = nn.Linear(self.embed_dim, self.embed_dim)
143
+
144
+ def _naive_attn(self, x):
145
+ B, N, C = x.shape
146
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
147
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
148
+
149
+ if self.qk_normalization:
150
+ B_, H_, N_, D_ = q.shape
151
+ q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
152
+ k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
153
+
154
+ attn = ((q * self.scale) @ k.transpose(-2, -1))
155
+ attn = attn.softmax(dim=-1)
156
+ attn = self.attn_drop(attn)
157
+
158
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
159
+ x = self.proj(x)
160
+ x = self.proj_drop(x)
161
+ return x
162
+
163
+ def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
164
+ qkv = self.qkv(x)
165
+ qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
166
+
167
+ if self.qk_normalization:
168
+ q, k, v = qkv.unbind(2)
169
+ q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
170
+ k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
171
+ qkv = torch.stack([q, k, v], dim=2)
172
+
173
+ context, _ = self.inner_attn(
174
+ qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
175
+ )
176
+ outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
177
+ outs = self.proj_drop(outs)
178
+ return outs
179
+
180
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
181
+ x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
182
+ return x
183
+
184
+
185
+ class InternMLP(nn.Module):
186
+ def __init__(self, config: InternVisionConfig):
187
+ super().__init__()
188
+ self.config = config
189
+ self.act = ACT2FN[config.hidden_act]
190
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
191
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
192
+
193
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
194
+ hidden_states = self.fc1(hidden_states)
195
+ hidden_states = self.act(hidden_states)
196
+ hidden_states = self.fc2(hidden_states)
197
+ return hidden_states
198
+
199
+
200
+ class InternVisionEncoderLayer(nn.Module):
201
+ def __init__(self, config: InternVisionConfig, drop_path_rate: float):
202
+ super().__init__()
203
+ self.embed_dim = config.hidden_size
204
+ self.intermediate_size = config.intermediate_size
205
+ self.norm_type = config.norm_type
206
+
207
+ self.attn = InternAttention(config)
208
+ self.mlp = InternMLP(config)
209
+ self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
210
+ self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
211
+
212
+ self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
213
+ self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
214
+ self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
215
+ self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
216
+
217
+ def forward(
218
+ self,
219
+ hidden_states: torch.Tensor,
220
+ ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
221
+ """
222
+ Args:
223
+ hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
224
+ """
225
+ hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)
226
+
227
+ hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states)) * self.ls2)
228
+
229
+ return hidden_states
230
+
231
+
232
+ class InternVisionEncoder(nn.Module):
233
+ """
234
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
235
+ [`InternEncoderLayer`].
236
+
237
+ Args:
238
+ config (`InternConfig`):
239
+ The corresponding vision configuration for the `InternEncoder`.
240
+ """
241
+
242
+ def __init__(self, config: InternVisionConfig):
243
+ super().__init__()
244
+ self.config = config
245
+ # stochastic depth decay rule
246
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
247
+ self.layers = nn.ModuleList([
248
+ InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
249
+ self.gradient_checkpointing = True
250
+
251
+ def forward(
252
+ self,
253
+ inputs_embeds,
254
+ output_hidden_states: Optional[bool] = None,
255
+ return_dict: Optional[bool] = None,
256
+ ) -> Union[Tuple, BaseModelOutput]:
257
+ r"""
258
+ Args:
259
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
260
+ Embedded representation of the inputs. Should be float, not int tokens.
261
+ output_hidden_states (`bool`, *optional*):
262
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
263
+ for more detail.
264
+ return_dict (`bool`, *optional*):
265
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
266
+ """
267
+ output_hidden_states = (
268
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
269
+ )
270
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
271
+
272
+ encoder_states = () if output_hidden_states else None
273
+ hidden_states = inputs_embeds
274
+
275
+ for idx, encoder_layer in enumerate(self.layers):
276
+ if output_hidden_states:
277
+ encoder_states = encoder_states + (hidden_states,)
278
+ if self.gradient_checkpointing and self.training:
279
+ layer_outputs = torch.utils.checkpoint.checkpoint(
280
+ encoder_layer,
281
+ hidden_states)
282
+ else:
283
+ layer_outputs = encoder_layer(
284
+ hidden_states,
285
+ )
286
+ hidden_states = layer_outputs
287
+
288
+ if output_hidden_states:
289
+ encoder_states = encoder_states + (hidden_states,)
290
+
291
+ if not return_dict:
292
+ return tuple(v for v in [hidden_states, encoder_states] if v is not None)
293
+ return BaseModelOutput(
294
+ last_hidden_state=hidden_states, hidden_states=encoder_states
295
+ )
296
+
297
+
298
+ class InternVisionModel(PreTrainedModel):
299
+ main_input_name = 'pixel_values'
300
+ _supports_flash_attn_2 = True
301
+ config_class = InternVisionConfig
302
+ _no_split_modules = ['InternVisionEncoderLayer']
303
+
304
+ def __init__(self, config: InternVisionConfig):
305
+ super().__init__(config)
306
+ self.config = config
307
+
308
+ self.embeddings = InternVisionEmbeddings(config)
309
+ self.encoder = InternVisionEncoder(config)
310
+
311
+ def resize_pos_embeddings(self, old_size, new_size, patch_size):
312
+ pos_emb = self.embeddings.position_embedding
313
+ _, num_positions, embed_dim = pos_emb.shape
314
+ cls_emb = pos_emb[:, :1, :]
315
+ pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
316
+ pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
317
+ pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
318
+ pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
319
+ self.embeddings.position_embedding = nn.Parameter(pos_emb)
320
+ self.embeddings.image_size = new_size
321
+ logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
322
+
323
+ def get_input_embeddings(self):
324
+ return self.embeddings
325
+
326
+ def forward(
327
+ self,
328
+ pixel_values: Optional[torch.FloatTensor] = None,
329
+ output_hidden_states: Optional[bool] = None,
330
+ return_dict: Optional[bool] = None,
331
+ pixel_embeds: Optional[torch.FloatTensor] = None,
332
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
333
+ output_hidden_states = (
334
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
335
+ )
336
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
337
+
338
+ if pixel_values is None and pixel_embeds is None:
339
+ raise ValueError('You have to specify pixel_values or pixel_embeds')
340
+
341
+ if pixel_embeds is not None:
342
+ hidden_states = pixel_embeds
343
+ else:
344
+ if len(pixel_values.shape) == 4:
345
+ hidden_states = self.embeddings(pixel_values)
346
+ else:
347
+ raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
348
+ encoder_outputs = self.encoder(
349
+ inputs_embeds=hidden_states,
350
+ output_hidden_states=output_hidden_states,
351
+ return_dict=return_dict,
352
+ )
353
+ last_hidden_state = encoder_outputs.last_hidden_state
354
+ pooled_output = last_hidden_state[:, 0, :]
355
+
356
+ if not return_dict:
357
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
358
+
359
+ return BaseModelOutputWithPooling(
360
+ last_hidden_state=last_hidden_state,
361
+ pooler_output=pooled_output,
362
+ hidden_states=encoder_outputs.hidden_states,
363
+ attentions=encoder_outputs.attentions,
364
+ )
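Usage sketch (editor note, not part of the committed file): a CPU smoke test of the vision tower defined above, using a deliberately tiny config so the naive attention path is exercised. Weights are randomly initialised, `timm` and `einops` must be installed, and the import paths are assumed.

```python
# Minimal sketch: forward a dummy image through InternVisionModel.
import torch
from eneas.vendor.SeC.inference.configuration_intern_vit import InternVisionConfig
from eneas.vendor.SeC.inference.modeling_intern_vit import InternVisionModel

cfg = InternVisionConfig(hidden_size=64, intermediate_size=128,
                         num_attention_heads=4, num_hidden_layers=2,
                         use_flash_attn=False)        # tiny, CPU-friendly settings
model = InternVisionModel(cfg).eval()
pixels = torch.randn(1, 3, 224, 224)
out = model(pixel_values=pixels)
print(out.last_hidden_state.shape)   # torch.Size([1, 257, 64]) -- 256 patches + CLS
```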
eneas/vendor/SeC/inference/modeling_internlm2.py ADDED
@@ -0,0 +1,1429 @@
1
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # This code is based on transformers/src/transformers/models/llama/modeling_llama.py
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ PyTorch InternLM2 model."""
17
+ import math
18
+ import queue
19
+ import threading
20
+ import warnings
21
+ from typing import List, Optional, Tuple, Union
22
+
23
+ import torch
24
+ import torch.nn.functional as F
25
+ import torch.utils.checkpoint
26
+ from einops import rearrange
27
+ from torch import nn
28
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
29
+ from transformers.activations import ACT2FN
30
+ from transformers.modeling_outputs import (BaseModelOutputWithPast,
31
+ CausalLMOutputWithPast,
32
+ SequenceClassifierOutputWithPast)
33
+ from transformers.modeling_utils import PreTrainedModel
34
+ from transformers.utils import (add_start_docstrings,
35
+ add_start_docstrings_to_model_forward, logging,
36
+ replace_return_docstrings)
37
+
38
+ try:
39
+ from transformers.generation.streamers import BaseStreamer
40
+ except: # noqa # pylint: disable=bare-except
41
+ BaseStreamer = None
42
+
43
+ from .configuration_internlm2 import InternLM2Config
44
+
45
+ logger = logging.get_logger(__name__)
46
+
47
+ _CONFIG_FOR_DOC = 'InternLM2Config'
48
+
49
+ flash_attn_func, flash_attn_varlen_func = None, None
50
+ pad_input, index_first_axis, unpad_input = None, None, None
51
+ try:
52
+ from flash_attn import flash_attn_func as _flash_attn_func
53
+ from flash_attn import flash_attn_varlen_func as _flash_attn_varlen_func
54
+ from flash_attn.bert_padding import index_first_axis as _index_first_axis
55
+ from flash_attn.bert_padding import pad_input as _pad_input
56
+ from flash_attn.bert_padding import unpad_input as _unpad_input
57
+
58
+ flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func
59
+ pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input
60
+ has_flash_attn = True
61
+ except:
62
+ has_flash_attn = False
63
+
64
+
65
+ def _import_flash_attn():
66
+ global flash_attn_func, flash_attn_varlen_func
67
+ global pad_input, index_first_axis, unpad_input
68
+ try:
69
+ from flash_attn import flash_attn_func as _flash_attn_func
70
+ from flash_attn import \
71
+ flash_attn_varlen_func as _flash_attn_varlen_func
72
+ from flash_attn.bert_padding import \
73
+ index_first_axis as _index_first_axis
74
+ from flash_attn.bert_padding import pad_input as _pad_input
75
+ from flash_attn.bert_padding import unpad_input as _unpad_input
76
+ flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func
77
+ pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input
78
+ except ImportError:
79
+ raise ImportError('flash_attn is not installed.')
80
+
81
+
82
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
83
+ def _get_unpad_data(attention_mask):
84
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
85
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
86
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
87
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
88
+ return (
89
+ indices,
90
+ cu_seqlens,
91
+ max_seqlen_in_batch,
92
+ )
93
+
94
+
95
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
96
+ def _make_causal_mask(
97
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
98
+ ):
99
+ """
100
+ Make causal mask used for bi-directional self-attention.
101
+ """
102
+ bsz, tgt_len = input_ids_shape
103
+ mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
104
+ mask_cond = torch.arange(mask.size(-1), device=device)
105
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
106
+ mask = mask.to(dtype)
107
+
108
+ if past_key_values_length > 0:
109
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
110
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
111
+
112
+
113
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
114
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
115
+ """
116
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
117
+ """
118
+ bsz, src_len = mask.size()
119
+ tgt_len = tgt_len if tgt_len is not None else src_len
120
+
121
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
122
+
123
+ inverted_mask = 1.0 - expanded_mask
124
+
125
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
126
+
127
+
128
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM2
129
+ class InternLM2RMSNorm(nn.Module):
130
+ def __init__(self, hidden_size, eps=1e-6):
131
+ """
132
+ InternLM2RMSNorm is equivalent to T5LayerNorm
133
+ """
134
+ super().__init__()
135
+ self.weight = nn.Parameter(torch.ones(hidden_size))
136
+ self.variance_epsilon = eps
137
+
138
+ def forward(self, hidden_states):
139
+ input_dtype = hidden_states.dtype
140
+ hidden_states = hidden_states.to(torch.float32)
141
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
142
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
143
+ return self.weight * hidden_states.to(input_dtype)
144
+
145
+
146
+ try:
147
+ from functools import partial
148
+
149
+ from apex.normalization import FusedRMSNorm
150
+ InternLM2RMSNorm = partial(FusedRMSNorm, eps=1e-6) # noqa
151
+ print('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternLM2RMSNorm')
152
+ except ImportError:
153
+ # using the normal LlamaRMSNorm
154
+ pass
155
+ except Exception:
156
+ print('discovered apex but it failed to load, falling back to InternLM2RMSNorm')
157
+ pass
158
+
159
+
160
+ # Copied from transformers.model.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM2
161
+ class InternLM2RotaryEmbedding(nn.Module):
162
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
163
+ super().__init__()
164
+
165
+ self.dim = dim
166
+ self.max_position_embeddings = max_position_embeddings
167
+ self.base = base
168
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
169
+ self.register_buffer('inv_freq', inv_freq, persistent=False)
170
+
171
+ # Build here to make `torch.jit.trace` work.
172
+ self._set_cos_sin_cache(
173
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
174
+ )
175
+
176
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
177
+ self.max_seq_len_cached = seq_len
178
+ t = torch.arange(self.max_seq_len_cached, device=device).to(dtype=self.inv_freq.dtype)
179
+
180
+ freqs = torch.einsum('i,j->ij', t, self.inv_freq)
181
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
182
+ emb = torch.cat((freqs, freqs), dim=-1)
183
+ self.register_buffer('cos_cached', emb.cos().to(dtype), persistent=False)
184
+ self.register_buffer('sin_cached', emb.sin().to(dtype), persistent=False)
185
+
186
+ def forward(self, x, seq_len=None):
187
+ # x: [bs, num_attention_heads, seq_len, head_size]
188
+ if seq_len > self.max_seq_len_cached:
189
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.float32)
190
+
191
+ return (
192
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
193
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
194
+ )
195
+
196
+
197
+ # Copied from transformers.model.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->InternLM2
198
+ class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding):
199
+ """InternLM2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
200
+
201
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
202
+ self.scaling_factor = scaling_factor
203
+ super().__init__(dim, max_position_embeddings, base, device)
204
+
205
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
206
+ self.max_seq_len_cached = seq_len
207
+ t = torch.arange(self.max_seq_len_cached, device=device).to(dtype=self.inv_freq.dtype)
208
+ t = t / self.scaling_factor
209
+
210
+ freqs = torch.einsum('i,j->ij', t, self.inv_freq)
211
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
212
+ emb = torch.cat((freqs, freqs), dim=-1)
213
+ self.register_buffer('cos_cached', emb.cos().to(dtype), persistent=False)
214
+ self.register_buffer('sin_cached', emb.sin().to(dtype), persistent=False)
215
+
216
+
217
+ # Copied from transformers.model.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM2
218
+ class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding):
219
+ """InternLM2RotaryEmbedding extended with Dynamic NTK scaling.
220
+ Credits to the Reddit users /u/bloc97 and /u/emozilla.
221
+ """
222
+
223
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
224
+ self.scaling_factor = scaling_factor
225
+ super().__init__(dim, max_position_embeddings, base, device)
226
+
227
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
228
+ self.max_seq_len_cached = seq_len
229
+
230
+ if seq_len > self.max_position_embeddings:
231
+ base = self.base * (
232
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
233
+ ) ** (self.dim / (self.dim - 2))
234
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
235
+ self.register_buffer('inv_freq', inv_freq, persistent=False)
236
+
237
+ t = torch.arange(self.max_seq_len_cached, device=device).to(dtype=self.inv_freq.dtype)
238
+
239
+ freqs = torch.einsum('i,j->ij', t, self.inv_freq)
240
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
241
+ emb = torch.cat((freqs, freqs), dim=-1)
242
+ self.register_buffer('cos_cached', emb.cos().to(dtype), persistent=False)
243
+ self.register_buffer('sin_cached', emb.sin().to(dtype), persistent=False)
244
+
245
+
246
+ # Copied from transformers.model.llama.modeling_llama.rotate_half
247
+ def rotate_half(x):
248
+ """Rotates half the hidden dims of the input."""
249
+ x1 = x[..., : x.shape[-1] // 2]
250
+ x2 = x[..., x.shape[-1] // 2:]
251
+ return torch.cat((-x2, x1), dim=-1)
252
+
253
+
254
+ # Copied from transformers.model.llama.modeling_llama.apply_rotary_pos_emb
255
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
256
+ """Applies Rotary Position Embedding to the query and key tensors."""
257
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
258
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
259
+ q_embed = (q * cos) + (rotate_half(q) * sin)
260
+ k_embed = (k * cos) + (rotate_half(k) * sin)
261
+ return q_embed, k_embed
262
+
263
+
264
+ class InternLM2MLP(nn.Module):
265
+ def __init__(self, config):
266
+ super().__init__()
267
+ self.config = config
268
+ self.hidden_size = config.hidden_size
269
+ self.intermediate_size = config.intermediate_size
270
+ self.w1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
271
+ self.w3 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
272
+ self.w2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
273
+ self.act_fn = ACT2FN[config.hidden_act]
274
+
275
+ def forward(self, x):
276
+ down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x))
277
+
278
+ return down_proj
279
+
280
+
281
+ # Copied from transformers.model.llama.modeling_llama.repeat_kv
282
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
283
+ """
284
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
285
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
286
+ """
287
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
288
+ if n_rep == 1:
289
+ return hidden_states
290
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
291
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
292
+
293
+
294
+ # Modified from transformers.model.llama.modeling_llama.LlamaAttention
295
+ class InternLM2Attention(nn.Module):
296
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
297
+
298
+ def __init__(self, config: InternLM2Config):
299
+ super().__init__()
300
+ self.config = config
301
+ self.hidden_size = config.hidden_size
302
+ self.num_heads = config.num_attention_heads
303
+ self.head_dim = self.hidden_size // self.num_heads
304
+ self.num_key_value_heads = config.num_key_value_heads
305
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
306
+ self.max_position_embeddings = config.max_position_embeddings
307
+ self.is_causal = True
308
+
309
+ if (self.head_dim * self.num_heads) != self.hidden_size:
310
+ raise ValueError(
311
+ f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}'
312
+ f' and `num_heads`: {self.num_heads}).'
313
+ )
314
+
315
+ self.wqkv = nn.Linear(
316
+ self.hidden_size,
317
+ (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim,
318
+ bias=config.bias,
319
+ )
320
+
321
+ self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias)
322
+ self._init_rope()
323
+
324
+ def _init_rope(self):
325
+ if self.config.rope_scaling is None:
326
+ self.rotary_emb = InternLM2RotaryEmbedding(
327
+ self.head_dim,
328
+ max_position_embeddings=self.max_position_embeddings,
329
+ base=self.config.rope_theta,
330
+ )
331
+ else:
332
+ scaling_type = self.config.rope_scaling['type']
333
+ scaling_factor = self.config.rope_scaling['factor']
334
+ if scaling_type == 'dynamic':
335
+ self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding(
336
+ self.head_dim,
337
+ max_position_embeddings=self.max_position_embeddings,
338
+ base=self.config.rope_theta,
339
+ scaling_factor=scaling_factor,
340
+ )
341
+ elif scaling_type == 'linear':
342
+ self.rotary_emb = InternLM2LinearScalingRotaryEmbedding(
343
+ self.head_dim,
344
+ max_position_embeddings=self.max_position_embeddings,
345
+ base=self.config.rope_theta,
346
+ scaling_factor=scaling_factor,
347
+ )
348
+ else:
349
+ raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.")
350
+ return self.rotary_emb
351
+
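The branch above is selected by the `rope_scaling` entry of the config; a hypothetical fragment (the keys match what `_init_rope` reads, the values are illustrative):

```python
# Hypothetical rope_scaling config; only the 'dynamic' and 'linear' types are accepted by _init_rope above.
rope_scaling = {"type": "dynamic", "factor": 2.0}
```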
352
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
353
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
354
+
355
+ def forward(
356
+ self,
357
+ hidden_states: torch.Tensor,
358
+ attention_mask: Optional[torch.Tensor] = None,
359
+ position_ids: Optional[torch.LongTensor] = None,
360
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
361
+ output_attentions: bool = False,
362
+ use_cache: bool = False,
363
+ **kwargs,
364
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
365
+ if 'padding_mask' in kwargs:
366
+ warnings.warn(
367
+ 'Passing `padding_mask` is deprecated and will be removed in v4.37. '
368
+ 'Please make sure to use `attention_mask` instead.'
369
+ )
370
+
371
+ bsz, q_len, _ = hidden_states.size()
372
+
373
+ qkv_states = self.wqkv(hidden_states)
374
+
375
+ qkv_states = rearrange(
376
+ qkv_states,
377
+ 'b q (h gs d) -> b q h gs d',
378
+ gs=2 + self.num_key_value_groups,
379
+ d=self.head_dim,
380
+ )
381
+
382
+ query_states = qkv_states[..., : self.num_key_value_groups, :]
383
+ query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d')
384
+ key_states = qkv_states[..., -2, :]
385
+ value_states = qkv_states[..., -1, :]
386
+
387
+ query_states = query_states.transpose(1, 2)
388
+ key_states = key_states.transpose(1, 2)
389
+ value_states = value_states.transpose(1, 2)
390
+
391
+ kv_seq_len = key_states.shape[-2]
392
+ if past_key_value is not None:
393
+ kv_seq_len += past_key_value[0].shape[-2]
394
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
395
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
396
+
397
+ if past_key_value is not None:
398
+ # reuse k, v, self_attention
399
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
400
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
401
+
402
+ past_key_value = (key_states, value_states) if use_cache else None
403
+
404
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
405
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
406
+
407
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
408
+
409
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
410
+ raise ValueError(
411
+ f'Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is'
412
+ f' {attn_weights.size()}'
413
+ )
414
+
415
+ if attention_mask is not None:
416
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
417
+ raise ValueError(
418
+ f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}'
419
+ )
420
+ attn_weights = attn_weights + attention_mask
421
+
422
+ # upcast attention to fp32
423
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
424
+ attn_output = torch.matmul(attn_weights, value_states)
425
+
426
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
427
+ raise ValueError(
428
+ f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is'
429
+ f' {attn_output.size()}'
430
+ )
431
+
432
+ attn_output = attn_output.transpose(1, 2).contiguous()
433
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
434
+
435
+ attn_output = self.wo(attn_output)
436
+
437
+ if not output_attentions:
438
+ attn_weights = None
439
+
440
+ return attn_output, attn_weights, past_key_value
441
+
442
+
443
+ # Modified from transformers.model.llama.modeling_llama.LlamaFlashAttention2
444
+ class InternLM2FlashAttention2(InternLM2Attention):
445
+ """
446
+ InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stay
447
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
448
+ flash attention and deal with padding tokens in case the input contains any of them.
449
+ """
450
+
451
+ def forward(
452
+ self,
453
+ hidden_states: torch.Tensor,
454
+ attention_mask: Optional[torch.LongTensor] = None,
455
+ position_ids: Optional[torch.LongTensor] = None,
456
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
457
+ output_attentions: bool = False,
458
+ use_cache: bool = False,
459
+ **kwargs,
460
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
461
+ # InternLM2FlashAttention2 attention does not support output_attentions
462
+ if 'padding_mask' in kwargs:
463
+ warnings.warn(
464
+ 'Passing `padding_mask` is deprecated and will be removed in v4.37. '
465
+ 'Please make sure to use `attention_mask` instead.'
466
+ )
467
+
468
+ # overwrite attention_mask with padding_mask
469
+ attention_mask = kwargs.pop('padding_mask')
470
+
471
+ output_attentions = False
472
+
473
+ bsz, q_len, _ = hidden_states.size()
474
+
475
+ qkv_states = self.wqkv(hidden_states)
476
+
477
+ qkv_states = rearrange(
478
+ qkv_states,
479
+ 'b q (h gs d) -> b q h gs d',
480
+ gs=2 + self.num_key_value_groups,
481
+ d=self.head_dim,
482
+ )
483
+
484
+ query_states = qkv_states[..., : self.num_key_value_groups, :]
485
+ query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d')
486
+ key_states = qkv_states[..., -2, :]
487
+ value_states = qkv_states[..., -1, :]
488
+
489
+ query_states = query_states.transpose(1, 2)
490
+ key_states = key_states.transpose(1, 2)
491
+ value_states = value_states.transpose(1, 2)
492
+
493
+ kv_seq_len = key_states.shape[-2]
494
+ if past_key_value is not None:
495
+ kv_seq_len += past_key_value[0].shape[-2]
496
+
497
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
498
+
499
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
500
+
501
+ if past_key_value is not None:
502
+ # reuse k, v, self_attention
503
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
504
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
505
+
506
+ past_key_value = (key_states, value_states) if use_cache else None
507
+
508
+ query_states = query_states.transpose(1, 2)
509
+ key_states = key_states.transpose(1, 2)
510
+ value_states = value_states.transpose(1, 2)
511
+
512
+ attn_output = self._flash_attention_forward(
513
+ query_states, key_states, value_states, attention_mask, q_len
514
+ )
515
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
516
+ attn_output = self.wo(attn_output)
517
+
518
+ if not output_attentions:
519
+ attn_weights = None
520
+
521
+ return attn_output, attn_weights, past_key_value
522
+
523
+ def _flash_attention_forward(
524
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
525
+ ):
526
+ """
527
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
528
+ first unpad the input, then compute the attention scores and pad the final attention scores.
529
+
530
+ Args:
531
+ query_states (`torch.Tensor`):
532
+ Input query states to be passed to Flash Attention API
533
+ key_states (`torch.Tensor`):
534
+ Input key states to be passed to Flash Attention API
535
+ value_states (`torch.Tensor`):
536
+ Input value states to be passed to Flash Attention API
537
+ attention_mask (`torch.Tensor`):
538
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
539
+ position of padding tokens and 1 for the position of non-padding tokens.
540
+ dropout (`float`, *optional*):
541
+ Attention dropout
542
+ softmax_scale (`float`, *optional*):
543
+ The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim)
544
+ """
545
+ # Contains at least one padding token in the sequence
546
+ causal = self.is_causal and query_length != 1
547
+ if attention_mask is not None:
548
+ batch_size = query_states.shape[0]
549
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input(
550
+ query_states, key_states, value_states, attention_mask, query_length
551
+ )
552
+
553
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
554
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
555
+
556
+ attn_output_unpad = flash_attn_varlen_func(
557
+ query_states,
558
+ key_states,
559
+ value_states,
560
+ cu_seqlens_q=cu_seqlens_q,
561
+ cu_seqlens_k=cu_seqlens_k,
562
+ max_seqlen_q=max_seqlen_in_batch_q,
563
+ max_seqlen_k=max_seqlen_in_batch_k,
564
+ dropout_p=dropout,
565
+ softmax_scale=softmax_scale,
566
+ causal=causal,
567
+ )
568
+
569
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
570
+ else:
571
+ attn_output = flash_attn_func(
572
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
573
+ )
574
+
575
+ return attn_output
576
+
577
+ def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
578
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
579
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
580
+
581
+ key_layer = index_first_axis(
582
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
583
+ )
584
+ value_layer = index_first_axis(
585
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
586
+ )
587
+
588
+ if query_length == kv_seq_len:
589
+ query_layer = index_first_axis(
590
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
591
+ )
592
+ cu_seqlens_q = cu_seqlens_k
593
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
594
+ indices_q = indices_k
595
+ elif query_length == 1:
596
+ max_seqlen_in_batch_q = 1
597
+ cu_seqlens_q = torch.arange(
598
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
599
+ ) # There is a memcpy here, that is very bad.
600
+ indices_q = cu_seqlens_q[:-1]
601
+ query_layer = query_layer.squeeze(1)
602
+ else:
603
+ # The -q_len: slice assumes left padding.
604
+ attention_mask = attention_mask[:, -query_length:]
605
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
606
+
607
+ return (
608
+ query_layer,
609
+ key_layer,
610
+ value_layer,
611
+ indices_q.to(torch.int64),
612
+ (cu_seqlens_q, cu_seqlens_k),
613
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
614
+ )
615
+
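The variable-length path above feeds `flash_attn_varlen_func` with flattened tokens plus cumulative sequence lengths derived from the padding mask (via `_get_unpad_data`, defined elsewhere in this file). A standalone sketch of that bookkeeping, assuming the usual 1 = token / 0 = padding convention:

```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])  # 1 = real token, 0 = padding
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)             # tensor([3, 2])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()  # positions of real tokens
max_seqlen_in_batch = seqlens_in_batch.max().item()                          # 3
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
# cu_seqlens == tensor([0, 3, 5]): per-sequence offsets passed as cu_seqlens_q / cu_seqlens_k
```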
616
+
617
+ INTERNLM2_ATTENTION_CLASSES = {
618
+ 'eager': InternLM2Attention,
619
+ 'flash_attention_2': InternLM2FlashAttention2,
620
+ }
621
+
622
+
623
+ # Modified from transformers.model.llama.modeling_llama.LlamaDecoderLayer
624
+ class InternLM2DecoderLayer(nn.Module):
625
+ def __init__(self, config: InternLM2Config):
626
+ super().__init__()
627
+ self.hidden_size = config.hidden_size
628
+
629
+ self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config)
630
+
631
+ self.feed_forward = InternLM2MLP(config)
632
+ self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
633
+ self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
634
+
635
+ def forward(
636
+ self,
637
+ hidden_states: torch.Tensor,
638
+ attention_mask: Optional[torch.Tensor] = None,
639
+ position_ids: Optional[torch.LongTensor] = None,
640
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
641
+ output_attentions: Optional[bool] = False,
642
+ use_cache: Optional[bool] = False,
643
+ **kwargs,
644
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
645
+ """
646
+ Args:
647
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
648
+ attention_mask (`torch.FloatTensor`, *optional*):
649
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
650
+ query_sequence_length, key_sequence_length)` if default attention is used.
651
+ output_attentions (`bool`, *optional*):
652
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
653
+ returned tensors for more detail.
654
+ use_cache (`bool`, *optional*):
655
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
656
+ (see `past_key_values`).
657
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
658
+ """
659
+ if 'padding_mask' in kwargs:
660
+ warnings.warn(
661
+ 'Passing `padding_mask` is deprecated and will be removed in v4.37. '
662
+ 'Please make sure to use `attention_mask` instead.'
663
+ )
664
+
665
+ residual = hidden_states
666
+
667
+ hidden_states = self.attention_norm(hidden_states)
668
+
669
+ # Self Attention
670
+ hidden_states, self_attn_weights, present_key_value = self.attention(
671
+ hidden_states=hidden_states,
672
+ attention_mask=attention_mask,
673
+ position_ids=position_ids,
674
+ past_key_value=past_key_value,
675
+ output_attentions=output_attentions,
676
+ use_cache=use_cache,
677
+ **kwargs,
678
+ )
679
+ hidden_states = residual + hidden_states
680
+
681
+ # Fully Connected
682
+ residual = hidden_states
683
+ hidden_states = self.ffn_norm(hidden_states)
684
+ hidden_states = self.feed_forward(hidden_states)
685
+ hidden_states = residual + hidden_states
686
+
687
+ outputs = (hidden_states,)
688
+
689
+ if output_attentions:
690
+ outputs += (self_attn_weights,)
691
+
692
+ if use_cache:
693
+ outputs += (present_key_value,)
694
+
695
+ return outputs
696
+
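The decoder layer above follows the standard pre-norm residual pattern (normalize, transform, add back). A schematic but runnable sketch with identity stand-ins for the attention and MLP sub-blocks:

```python
import torch
import torch.nn as nn

attention_norm, ffn_norm = nn.LayerNorm(16), nn.LayerNorm(16)   # stand-ins for the RMSNorms
attention, feed_forward = nn.Identity(), nn.Identity()          # stand-ins for the real sub-blocks

x = torch.randn(2, 5, 16)
h = x + attention(attention_norm(x))      # residual around the self-attention sub-block
out = h + feed_forward(ffn_norm(h))       # residual around the feed-forward sub-block
```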
697
+
698
+ InternLM2_START_DOCSTRING = r"""
699
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
700
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
701
+ etc.)
702
+
703
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
704
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
705
+ and behavior.
706
+
707
+ Parameters:
708
+ config ([`InternLM2Config`]):
709
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
710
+ load the weights associated with the model, only the configuration. Check out the
711
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
712
+ """
713
+
714
+
715
+ # Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2
716
+ @add_start_docstrings(
717
+ 'The bare InternLM2 Model outputting raw hidden-states without any specific head on top.',
718
+ InternLM2_START_DOCSTRING,
719
+ )
720
+ class InternLM2PreTrainedModel(PreTrainedModel):
721
+ config_class = InternLM2Config
722
+ base_model_prefix = 'model'
723
+ supports_gradient_checkpointing = True
724
+ _no_split_modules = ['InternLM2DecoderLayer']
725
+ _skip_keys_device_placement = 'past_key_values'
726
+ _supports_flash_attn_2 = True
727
+
728
+ def _init_weights(self, module):
729
+ std = self.config.initializer_range
730
+ if isinstance(module, nn.Linear):
731
+ module.weight.data.normal_(mean=0.0, std=std)
732
+ if module.bias is not None:
733
+ module.bias.data.zero_()
734
+ elif isinstance(module, nn.Embedding):
735
+ module.weight.data.normal_(mean=0.0, std=std)
736
+ if module.padding_idx is not None:
737
+ module.weight.data[module.padding_idx].zero_()
738
+
739
+
740
+ InternLM2_INPUTS_DOCSTRING = r"""
741
+ Args:
742
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
743
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
744
+ it.
745
+
746
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
747
+ [`PreTrainedTokenizer.__call__`] for details.
748
+
749
+ [What are input IDs?](../glossary#input-ids)
750
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
751
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
752
+
753
+ - 1 for tokens that are **not masked**,
754
+ - 0 for tokens that are **masked**.
755
+
756
+ [What are attention masks?](../glossary#attention-mask)
757
+
758
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
759
+ [`PreTrainedTokenizer.__call__`] for details.
760
+
761
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
762
+ `past_key_values`).
763
+
764
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
765
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
766
+ information on the default strategy.
767
+
768
+ - 1 indicates the head is **not masked**,
769
+ - 0 indicates the head is **masked**.
770
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
771
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
772
+ config.n_positions - 1]`.
773
+
774
+ [What are position IDs?](../glossary#position-ids)
775
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or
776
+ when `config.use_cache=True`):
777
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
778
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
779
+ `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`.
780
+
781
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
782
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
783
+
784
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
785
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
786
+ of shape `(batch_size, sequence_length)`.
787
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
788
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
789
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
790
+ model's internal embedding lookup matrix.
791
+ use_cache (`bool`, *optional*):
792
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
793
+ `past_key_values`).
794
+ output_attentions (`bool`, *optional*):
795
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
796
+ tensors for more detail.
797
+ output_hidden_states (`bool`, *optional*):
798
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
799
+ more detail.
800
+ return_dict (`bool`, *optional*):
801
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
802
+ """
803
+
804
+
805
+ # Modified from transformers.model.llama.modeling_llama.LlamaModel
806
+ @add_start_docstrings(
807
+ 'The bare InternLM2 Model outputting raw hidden-states without any specific head on top.',
808
+ InternLM2_START_DOCSTRING,
809
+ )
810
+ class InternLM2Model(InternLM2PreTrainedModel):
811
+ """
812
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`InternLM2DecoderLayer`]
813
+
814
+ Args:
815
+ config: InternLM2Config
816
+ """
817
+
818
+ _auto_class = 'AutoModel'
819
+
820
+ def __init__(self, config: InternLM2Config):
821
+ super().__init__(config)
822
+ self.padding_idx = config.pad_token_id
823
+ self.vocab_size = config.vocab_size
824
+ self.config = config
825
+ if not has_flash_attn:
826
+ self.config.attn_implementation = 'eager'
827
+ print('Warning: Flash attention is not available, using eager attention instead.')
828
+
829
+ self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
830
+
831
+ self.layers = nn.ModuleList([InternLM2DecoderLayer(config) for _ in range(config.num_hidden_layers)])
832
+ self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
833
+
834
+ self.gradient_checkpointing = False
835
+ # Initialize weights and apply final processing
836
+ self.post_init()
837
+
838
+ def get_input_embeddings(self):
839
+ return self.tok_embeddings
840
+
841
+ def set_input_embeddings(self, value):
842
+ self.tok_embeddings = value
843
+
844
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
845
+ # create causal mask
846
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
847
+ combined_attention_mask = None
848
+ if input_shape[-1] > 1:
849
+ combined_attention_mask = _make_causal_mask(
850
+ input_shape,
851
+ inputs_embeds.dtype,
852
+ device=inputs_embeds.device,
853
+ past_key_values_length=past_key_values_length,
854
+ )
855
+
856
+ if attention_mask is not None:
857
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
858
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
859
+ inputs_embeds.device
860
+ )
861
+ combined_attention_mask = (
862
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
863
+ )
864
+
865
+ return combined_attention_mask
866
+
867
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
868
+ def forward(
869
+ self,
870
+ input_ids: torch.LongTensor = None,
871
+ attention_mask: Optional[torch.Tensor] = None,
872
+ position_ids: Optional[torch.LongTensor] = None,
873
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
874
+ inputs_embeds: Optional[torch.FloatTensor] = None,
875
+ use_cache: Optional[bool] = None,
876
+ output_attentions: Optional[bool] = None,
877
+ output_hidden_states: Optional[bool] = None,
878
+ return_dict: Optional[bool] = None,
879
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
880
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
881
+ output_hidden_states = (
882
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
883
+ )
884
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
885
+
886
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
887
+
888
+ if self.config.attn_implementation == 'flash_attention_2':
889
+ _import_flash_attn()
890
+
891
+ # retrieve input_ids and inputs_embeds
892
+ if input_ids is not None and inputs_embeds is not None:
893
+ raise ValueError('You cannot specify both input_ids and inputs_embeds at the same time')
894
+ elif input_ids is not None:
895
+ batch_size, seq_length = input_ids.shape[:2]
896
+ elif inputs_embeds is not None:
897
+ batch_size, seq_length = inputs_embeds.shape[:2]
898
+ else:
899
+ raise ValueError('You have to specify either input_ids or inputs_embeds')
900
+
901
+ seq_length_with_past = seq_length
902
+ past_key_values_length = 0
903
+ if past_key_values is not None:
904
+ past_key_values_length = past_key_values[0][0].shape[2]
905
+ seq_length_with_past = seq_length_with_past + past_key_values_length
906
+
907
+ if position_ids is None:
908
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
909
+ position_ids = torch.arange(
910
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
911
+ )
912
+ position_ids = position_ids.unsqueeze(0)
913
+
914
+ if inputs_embeds is None:
915
+ inputs_embeds = self.tok_embeddings(input_ids)
916
+
917
+ if self.config.attn_implementation == 'flash_attention_2':
918
+ # 2d mask is passed through the layers
919
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
920
+ else:
921
+ if attention_mask is None:
922
+ attention_mask = torch.ones(
923
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
924
+ )
925
+ attention_mask = self._prepare_decoder_attention_mask(
926
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
927
+ )
928
+
929
+ # embed positions
930
+ hidden_states = inputs_embeds
931
+
932
+ if self.gradient_checkpointing and self.training:
933
+ if use_cache:
934
+ logger.warning_once(
935
+ '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
936
+ )
937
+ use_cache = False
938
+
939
+ # decoder layers
940
+ all_hidden_states = () if output_hidden_states else None
941
+ all_self_attns = () if output_attentions else None
942
+ next_decoder_cache = () if use_cache else None
943
+
944
+ for idx, decoder_layer in enumerate(self.layers):
945
+ if output_hidden_states:
946
+ all_hidden_states += (hidden_states,)
947
+
948
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
949
+
950
+ if self.gradient_checkpointing and self.training:
951
+
952
+ def create_custom_forward(module):
953
+ def custom_forward(*inputs):
954
+ # None for past_key_value
955
+ return module(*inputs, output_attentions, None)
956
+
957
+ return custom_forward
958
+
959
+ layer_outputs = torch.utils.checkpoint.checkpoint(
960
+ create_custom_forward(decoder_layer),
961
+ hidden_states,
962
+ attention_mask,
963
+ position_ids,
964
+ None,
965
+ )
966
+ else:
967
+ layer_outputs = decoder_layer(
968
+ hidden_states,
969
+ attention_mask=attention_mask,
970
+ position_ids=position_ids,
971
+ past_key_value=past_key_value,
972
+ output_attentions=output_attentions,
973
+ use_cache=use_cache,
974
+ )
975
+
976
+ hidden_states = layer_outputs[0]
977
+
978
+ if use_cache:
979
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
980
+
981
+ if output_attentions:
982
+ all_self_attns += (layer_outputs[1],)
983
+
984
+ hidden_states = self.norm(hidden_states)
985
+
986
+ # add hidden states from the last decoder layer
987
+ if output_hidden_states:
988
+ all_hidden_states += (hidden_states,)
989
+
990
+ next_cache = next_decoder_cache if use_cache else None
991
+ if not return_dict:
992
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
993
+ return BaseModelOutputWithPast(
994
+ last_hidden_state=hidden_states,
995
+ past_key_values=next_cache,
996
+ hidden_states=all_hidden_states,
997
+ attentions=all_self_attns,
998
+ )
999
+
1000
+
1001
+ # Modified from transformers.model.llama.modeling_llama.LlamaForCausalLM
1002
+ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
1003
+ _auto_class = 'AutoModelForCausalLM'
1004
+
1005
+ _tied_weights_keys = ['output.weight']
1006
+
1007
+ def __init__(self, config):
1008
+ super().__init__(config)
1009
+ self.model = InternLM2Model(config)
1010
+ self.vocab_size = config.vocab_size
1011
+ self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1012
+
1013
+ # Initialize weights and apply final processing
1014
+ self.post_init()
1015
+
1016
+ def get_input_embeddings(self):
1017
+ return self.model.tok_embeddings
1018
+
1019
+ def set_input_embeddings(self, value):
1020
+ self.model.tok_embeddings = value
1021
+
1022
+ def get_output_embeddings(self):
1023
+ return self.output
1024
+
1025
+ def set_output_embeddings(self, new_embeddings):
1026
+ self.output = new_embeddings
1027
+
1028
+ def set_decoder(self, decoder):
1029
+ self.model = decoder
1030
+
1031
+ def get_decoder(self):
1032
+ return self.model
1033
+
1034
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
1035
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1036
+ def forward(
1037
+ self,
1038
+ input_ids: torch.LongTensor = None,
1039
+ attention_mask: Optional[torch.Tensor] = None,
1040
+ position_ids: Optional[torch.LongTensor] = None,
1041
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1042
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1043
+ labels: Optional[torch.LongTensor] = None,
1044
+ use_cache: Optional[bool] = None,
1045
+ output_attentions: Optional[bool] = None,
1046
+ output_hidden_states: Optional[bool] = None,
1047
+ return_dict: Optional[bool] = None,
1048
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1049
+ r"""
1050
+ Args:
1051
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1052
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1053
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1054
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1055
+
1056
+ Returns:
1057
+
1058
+ Example:
1059
+
1060
+ ```python
1061
+ >>> from transformers import AutoTokenizer, InternLM2ForCausalLM
1062
+
1063
+ >>> model = InternLM2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
1064
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
1065
+
1066
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
1067
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1068
+
1069
+ >>> # Generate
1070
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1071
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1072
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1073
+ ```"""
1074
+
1075
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1076
+ output_hidden_states = (
1077
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1078
+ )
1079
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1080
+
1081
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1082
+ outputs = self.model(
1083
+ input_ids=input_ids,
1084
+ attention_mask=attention_mask,
1085
+ position_ids=position_ids,
1086
+ past_key_values=past_key_values,
1087
+ inputs_embeds=inputs_embeds,
1088
+ use_cache=use_cache,
1089
+ output_attentions=output_attentions,
1090
+ output_hidden_states=output_hidden_states,
1091
+ return_dict=return_dict,
1092
+ )
1093
+
1094
+ hidden_states = outputs[0]
1095
+ logits = self.output(hidden_states)
1096
+ logits = logits.float()
1097
+
1098
+ loss = None
1099
+ if labels is not None:
1100
+ # Shift so that tokens < n predict n
1101
+ shift_logits = logits[..., :-1, :].contiguous()
1102
+ shift_labels = labels[..., 1:].contiguous()
1103
+ # Flatten the tokens
1104
+ loss_fct = CrossEntropyLoss()
1105
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1106
+ shift_labels = shift_labels.view(-1)
1107
+ # Enable model parallelism
1108
+ shift_labels = shift_labels.to(shift_logits.device)
1109
+ loss = loss_fct(shift_logits, shift_labels)
1110
+
1111
+ if not return_dict:
1112
+ output = (logits,) + outputs[1:]
1113
+ return (loss,) + output if loss is not None else output
1114
+
1115
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
1116
+ output = CausalLMOutputWithPast(
1117
+ loss=loss,
1118
+ logits=logits,
1119
+ past_key_values=outputs.past_key_values,
1120
+ hidden_states=outputs.hidden_states,
1121
+ attentions=outputs.attentions,
1122
+ )
1123
+ output['logits'] = output['logits'].to(device)
1124
+ return output
1125
+
1126
+ def prepare_inputs_for_generation(
1127
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
1128
+ ):
1129
+ if past_key_values is not None:
1130
+ past_length = past_key_values[0][0].shape[2]
1131
+
1132
+ # Some generation methods already pass only the last input ID
1133
+ if input_ids.shape[1] > past_length:
1134
+ remove_prefix_length = past_length
1135
+ else:
1136
+ # Default to old behavior: keep only final ID
1137
+ remove_prefix_length = input_ids.shape[1] - 1
1138
+
1139
+ input_ids = input_ids[:, remove_prefix_length:]
1140
+
1141
+ position_ids = kwargs.get('position_ids', None)
1142
+ if attention_mask is not None and position_ids is None:
1143
+ # create position_ids on the fly for batch generation
1144
+ position_ids = attention_mask.long().cumsum(-1) - 1
1145
+ position_ids.masked_fill_(attention_mask == 0, 1)
1146
+ if past_key_values:
1147
+ position_ids = position_ids[:, -input_ids.shape[1]:]
1148
+
1149
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1150
+ if inputs_embeds is not None and past_key_values is None:
1151
+ model_inputs = {'inputs_embeds': inputs_embeds}
1152
+ else:
1153
+ model_inputs = {'input_ids': input_ids}
1154
+
1155
+ model_inputs.update(
1156
+ {
1157
+ 'position_ids': position_ids,
1158
+ 'past_key_values': past_key_values,
1159
+ 'use_cache': kwargs.get('use_cache'),
1160
+ 'attention_mask': attention_mask,
1161
+ }
1162
+ )
1163
+ return model_inputs
1164
+
1165
+ @staticmethod
1166
+ def _reorder_cache(past_key_values, beam_idx):
1167
+ reordered_past = ()
1168
+ for layer_past in past_key_values:
1169
+ reordered_past += (
1170
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1171
+ )
1172
+ return reordered_past
1173
+
1174
+ def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=''):
1175
+ if tokenizer.add_bos_token:
1176
+ prompt = ''
1177
+ else:
1178
+ prompt = tokenizer.bos_token
1179
+ if meta_instruction:
1180
+ prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n"""
1181
+ for record in history:
1182
+ prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
1183
+ prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
1184
+ return tokenizer([prompt], return_tensors='pt')
1185
+
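For a concrete sense of what `build_inputs` assembles, here is the chat-template string produced for a one-turn history (a sketch; the leading BOS token depends on `tokenizer.add_bos_token`):

```python
meta_instruction = 'You are a helpful assistant.'   # illustrative system prompt
history = [('Hi', 'Hello! How can I help?')]
query = 'What can you do?'

prompt = ''
prompt += f"<|im_start|>system\n{meta_instruction}<|im_end|>\n"
for user_turn, assistant_turn in history:
    prompt += f"<|im_start|>user\n{user_turn}<|im_end|>\n<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
prompt += f"<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"
print(prompt)
```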
1186
+ @torch.no_grad()
1187
+ def chat(
1188
+ self,
1189
+ tokenizer,
1190
+ query: str,
1191
+ history: List[Tuple[str, str]] = [],
1192
+ streamer: Optional[BaseStreamer] = None,
1193
+ max_new_tokens: int = 1024,
1194
+ do_sample: bool = True,
1195
+ temperature: float = 0.8,
1196
+ top_p: float = 0.8,
1197
+ meta_instruction: str = 'You are an AI assistant whose name is InternLM (书生·浦语).\n'
1198
+ '- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n'
1199
+ '- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.',
1200
+ **kwargs,
1201
+ ):
1202
+ inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
1203
+ inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
1204
+ # also add end-of-assistant token in eos token id to avoid unnecessary generation
1205
+ eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(['<|im_end|>'])[0]]
1206
+ outputs = self.generate(
1207
+ **inputs,
1208
+ streamer=streamer,
1209
+ max_new_tokens=max_new_tokens,
1210
+ do_sample=do_sample,
1211
+ temperature=temperature,
1212
+ top_p=top_p,
1213
+ eos_token_id=eos_token_id,
1214
+ **kwargs,
1215
+ )
1216
+ outputs = outputs[0].cpu().tolist()[len(inputs['input_ids'][0]):]
1217
+ response = tokenizer.decode(outputs, skip_special_tokens=True)
1218
+ response = response.split('<|im_end|>')[0]
1219
+ history = history + [(query, response)]
1220
+ return response, history
1221
+
1222
+ @torch.no_grad()
1223
+ def stream_chat(
1224
+ self,
1225
+ tokenizer,
1226
+ query: str,
1227
+ history: List[Tuple[str, str]] = [],
1228
+ max_new_tokens: int = 1024,
1229
+ do_sample: bool = True,
1230
+ temperature: float = 0.8,
1231
+ top_p: float = 0.8,
1232
+ **kwargs,
1233
+ ):
1234
+ """
1235
+ Return a generator in format: (response, history)
1236
+ Eg.
1237
+ ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')])
1238
+ ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')])
1239
+ """
1240
+ if BaseStreamer is None:
1241
+ raise ModuleNotFoundError(
1242
+ 'The version of `transformers` is too low. Please make sure '
1243
+ 'that you have installed `transformers>=4.28.0`.'
1244
+ )
1245
+
1246
+ response_queue = queue.Queue(maxsize=20)
1247
+
1248
+ class ChatStreamer(BaseStreamer):
1249
+ def __init__(self, tokenizer) -> None:
1250
+ super().__init__()
1251
+ self.tokenizer = tokenizer
1252
+ self.queue = response_queue
1253
+ self.query = query
1254
+ self.history = history
1255
+ self.response = ''
1256
+ self.cache = []
1257
+ self.received_inputs = False
1258
+ self.queue.put((self.response, history + [(self.query, self.response)]))
1259
+
1260
+ def put(self, value):
1261
+ if len(value.shape) > 1 and value.shape[0] > 1:
1262
+ raise ValueError('ChatStreamer only supports batch size 1')
1263
+ elif len(value.shape) > 1:
1264
+ value = value[0]
1265
+
1266
+ if not self.received_inputs:
1267
+ # The first received value is input_ids, ignore here
1268
+ self.received_inputs = True
1269
+ return
1270
+
1271
+ self.cache.extend(value.tolist())
1272
+ token = self.tokenizer.decode(self.cache, skip_special_tokens=True)
1273
+ if token.strip() != '<|im_end|>':
1274
+ self.response = self.response + token
1275
+ history = self.history + [(self.query, self.response)]
1276
+ self.queue.put((self.response, history))
1277
+ self.cache = []
1278
+ else:
1279
+ self.end()
1280
+
1281
+ def end(self):
1282
+ self.queue.put(None)
1283
+
1284
+ def stream_producer():
1285
+ return self.chat(
1286
+ tokenizer=tokenizer,
1287
+ query=query,
1288
+ streamer=ChatStreamer(tokenizer=tokenizer),
1289
+ history=history,
1290
+ max_new_tokens=max_new_tokens,
1291
+ do_sample=do_sample,
1292
+ temperature=temperature,
1293
+ top_p=top_p,
1294
+ **kwargs,
1295
+ )
1296
+
1297
+ def consumer():
1298
+ producer = threading.Thread(target=stream_producer)
1299
+ producer.start()
1300
+ while True:
1301
+ res = response_queue.get()
1302
+ if res is None:
1303
+ return
1304
+ yield res
1305
+
1306
+ return consumer()
1307
+
1308
+
1309
+ # Copied from transformers.model.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2
1310
+ @add_start_docstrings(
1311
+ """
1312
+ The InternLM2 Model transformer with a sequence classification head on top (linear layer).
1313
+
1314
+ [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification,
1315
+ as other causal models (e.g. GPT-2) do.
1316
+
1317
+ Since it does classification on the last token, it requires knowing the position of the last token. If a
1318
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1319
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1320
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1321
+ each row of the batch).
1322
+ """,
1323
+ InternLM2_START_DOCSTRING,
1324
+ )
1325
+ class InternLM2ForSequenceClassification(InternLM2PreTrainedModel):
1326
+ def __init__(self, config):
1327
+ super().__init__(config)
1328
+ self.num_labels = config.num_labels
1329
+ self.model = InternLM2Model(config)
1330
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1331
+
1332
+ # Initialize weights and apply final processing
1333
+ self.post_init()
1334
+
1335
+ def get_input_embeddings(self):
1336
+ return self.model.tok_embeddings
1337
+
1338
+ def set_input_embeddings(self, value):
1339
+ self.model.tok_embeddings = value
1340
+
1341
+ @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
1342
+ def forward(
1343
+ self,
1344
+ input_ids: torch.LongTensor = None,
1345
+ attention_mask: Optional[torch.Tensor] = None,
1346
+ position_ids: Optional[torch.LongTensor] = None,
1347
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1348
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1349
+ labels: Optional[torch.LongTensor] = None,
1350
+ use_cache: Optional[bool] = None,
1351
+ output_attentions: Optional[bool] = None,
1352
+ output_hidden_states: Optional[bool] = None,
1353
+ return_dict: Optional[bool] = None,
1354
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1355
+ r"""
1356
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1357
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1358
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1359
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1360
+ """
1361
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1362
+
1363
+ transformer_outputs = self.model(
1364
+ input_ids,
1365
+ attention_mask=attention_mask,
1366
+ position_ids=position_ids,
1367
+ past_key_values=past_key_values,
1368
+ inputs_embeds=inputs_embeds,
1369
+ use_cache=use_cache,
1370
+ output_attentions=output_attentions,
1371
+ output_hidden_states=output_hidden_states,
1372
+ return_dict=return_dict,
1373
+ )
1374
+ hidden_states = transformer_outputs[0]
1375
+ logits = self.score(hidden_states)
1376
+
1377
+ if input_ids is not None:
1378
+ batch_size = input_ids.shape[0]
1379
+ else:
1380
+ batch_size = inputs_embeds.shape[0]
1381
+
1382
+ if self.config.pad_token_id is None and batch_size != 1:
1383
+ raise ValueError('Cannot handle batch sizes > 1 if no padding token is defined.')
1384
+ if self.config.pad_token_id is None:
1385
+ sequence_lengths = -1
1386
+ else:
1387
+ if input_ids is not None:
1388
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to(
1389
+ logits.device
1390
+ )
1391
+ else:
1392
+ sequence_lengths = -1
1393
+
1394
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1395
+
1396
+ loss = None
1397
+ if labels is not None:
1398
+ labels = labels.to(logits.device)
1399
+ if self.config.problem_type is None:
1400
+ if self.num_labels == 1:
1401
+ self.config.problem_type = 'regression'
1402
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1403
+ self.config.problem_type = 'single_label_classification'
1404
+ else:
1405
+ self.config.problem_type = 'multi_label_classification'
1406
+
1407
+ if self.config.problem_type == 'regression':
1408
+ loss_fct = MSELoss()
1409
+ if self.num_labels == 1:
1410
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1411
+ else:
1412
+ loss = loss_fct(pooled_logits, labels)
1413
+ elif self.config.problem_type == 'single_label_classification':
1414
+ loss_fct = CrossEntropyLoss()
1415
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1416
+ elif self.config.problem_type == 'multi_label_classification':
1417
+ loss_fct = BCEWithLogitsLoss()
1418
+ loss = loss_fct(pooled_logits, labels)
1419
+ if not return_dict:
1420
+ output = (pooled_logits,) + transformer_outputs[1:]
1421
+ return ((loss,) + output) if loss is not None else output
1422
+
1423
+ return SequenceClassifierOutputWithPast(
1424
+ loss=loss,
1425
+ logits=pooled_logits,
1426
+ past_key_values=transformer_outputs.past_key_values,
1427
+ hidden_states=transformer_outputs.hidden_states,
1428
+ attentions=transformer_outputs.attentions,
1429
+ )
eneas/vendor/SeC/inference/modeling_phi3.py ADDED
@@ -0,0 +1,1610 @@
1
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """ PyTorch Phi-3 model."""
16
+
17
+ import inspect
18
+ import math
19
+ import warnings
20
+ from typing import List, Optional, Tuple, Union
21
+
22
+ import torch
23
+ import torch.nn.functional as F
24
+ import torch.utils.checkpoint
25
+ from torch import nn
26
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
27
+ from transformers.activations import ACT2FN
28
+ from transformers.cache_utils import Cache, DynamicCache
29
+ from transformers.modeling_attn_mask_utils import \
30
+ _prepare_4d_causal_attention_mask
31
+ from transformers.modeling_outputs import (BaseModelOutputWithPast,
32
+ CausalLMOutputWithPast,
33
+ SequenceClassifierOutputWithPast,
34
+ TokenClassifierOutput)
35
+ from transformers.modeling_utils import PreTrainedModel
36
+ from transformers.utils import (add_code_sample_docstrings,
37
+ add_start_docstrings,
38
+ add_start_docstrings_to_model_forward,
39
+ is_flash_attn_2_available,
40
+ is_flash_attn_greater_or_equal_2_10, logging,
41
+ replace_return_docstrings)
42
+
43
+ from .configuration_phi3 import Phi3Config
44
+
45
+ logger = logging.get_logger(__name__)
46
+
47
+ # Transformers scans dependencies in the modeling file, causing issues on conditional loading. The regex only ignores try/catch blocks, but not if statements
48
+ # if is_flash_attn_2_available():
49
+ _flash_supports_window_size = False
50
+ try:
51
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
52
+ from flash_attn.bert_padding import (index_first_axis, pad_input, # noqa
53
+ unpad_input)
54
+
55
+ _flash_supports_window_size = 'window_size' in list(inspect.signature(flash_attn_func).parameters)
56
+ has_flash_attn = True
57
+ except ImportError as error:
58
+ logger.warning(
59
+ f'`flash-attention` package not found, consider installing for better performance: {error}.'
60
+ )
61
+ if not _flash_supports_window_size:
62
+ logger.warning(
63
+ "Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`."
64
+ )
65
+ has_flash_attn = False
66
+
67
+ _CHECKPOINT_FOR_DOC = 'microsoft/Phi-3-mini-4k-instruct'
68
+ _CONFIG_FOR_DOC = 'Phi3Config'
69
+
70
+ PHI3_PRETRAINED_MODEL_ARCHIVE_LIST = [
71
+ 'microsoft/Phi-3-mini-4k-instruct',
72
+ 'microsoft/Phi-3-mini-128k-instruct',
73
+ # See all Phi-3 models at https://huggingface.co/models?filter=Phi-3
74
+ ]
75
+
76
+
77
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3
78
+ class Phi3RMSNorm(nn.Module):
79
+ def __init__(self, hidden_size, eps=1e-6):
80
+ """
81
+ Phi3RMSNorm is equivalent to T5LayerNorm
82
+ """
83
+ super().__init__()
84
+ self.weight = nn.Parameter(torch.ones(hidden_size))
85
+ self.variance_epsilon = eps
86
+
87
+ def forward(self, hidden_states):
88
+ input_dtype = hidden_states.dtype
89
+ hidden_states = hidden_states.to(torch.float32)
90
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
91
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
92
+ return self.weight * hidden_states.to(input_dtype)
93
+
94
+
95
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
96
+ def _get_unpad_data(attention_mask):
97
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
98
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
99
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
100
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
101
+ return (
102
+ indices,
103
+ cu_seqlens,
104
+ max_seqlen_in_batch,
105
+ )
106
+
107
+
108
+ # Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3
109
+ class Phi3RotaryEmbedding(nn.Module):
110
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
111
+ super().__init__()
112
+
113
+ self.dim = dim
114
+ self.max_position_embeddings = max_position_embeddings
115
+ self.base = base
116
+ self.register_buffer('inv_freq', None, persistent=False)
117
+
118
+ @torch.no_grad()
119
+ def forward(self, x, position_ids, seq_len=None):
120
+ # x: [bs, num_attention_heads, seq_len, head_size]
121
+ if self.inv_freq is None:
122
+ self.inv_freq = 1.0 / (
123
+ self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim)
124
+ )
125
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
126
+ position_ids_expanded = position_ids[:, None, :].float()
127
+ # Force float32 since bfloat16 loses precision on long contexts
128
+ # See https://github.com/huggingface/transformers/pull/29285
129
+ device_type = x.device.type
130
+ device_type = device_type if isinstance(device_type, str) and device_type != 'mps' else 'cpu'
131
+ with torch.autocast(device_type=device_type, enabled=False):
132
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
133
+ emb = torch.cat((freqs, freqs), dim=-1)
134
+ cos = emb.cos()
135
+ sin = emb.sin()
136
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
137
+
138
+
139
+ class Phi3SuScaledRotaryEmbedding(Phi3RotaryEmbedding):
140
+ def __init__(self, dim, config, device=None):
141
+ super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
142
+
143
+ self.short_factor = config.rope_scaling['short_factor']
144
+ self.long_factor = config.rope_scaling['long_factor']
145
+ self.original_max_position_embeddings = config.original_max_position_embeddings
146
+
147
+ @torch.no_grad()
148
+ def forward(self, x, position_ids, seq_len=None):
149
+ seq_len = torch.max(position_ids) + 1
150
+ if seq_len > self.original_max_position_embeddings:
151
+ ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
152
+ else:
153
+ ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
154
+
155
+ inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
156
+ self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)
157
+
158
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
159
+ position_ids_expanded = position_ids[:, None, :].float()
160
+
161
+ # Force float32 since bfloat16 loses precision on long contexts
162
+ # See https://github.com/huggingface/transformers/pull/29285
163
+ device_type = x.device.type
164
+ device_type = device_type if isinstance(device_type, str) and device_type != 'mps' else 'cpu'
165
+ with torch.autocast(device_type=device_type, enabled=False):
166
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
167
+ emb = torch.cat((freqs, freqs), dim=-1)
168
+
169
+ scale = self.max_position_embeddings / self.original_max_position_embeddings
170
+ if scale <= 1.0:
171
+ scaling_factor = 1.0
172
+ else:
173
+ scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
174
+
175
+ cos = emb.cos() * scaling_factor
176
+ sin = emb.sin() * scaling_factor
177
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
178
+
179
+
180
+ class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding):
181
+ def __init__(self, dim, config, device=None):
182
+ super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
183
+
184
+ self.short_factor = config.rope_scaling['short_factor']
185
+ self.long_factor = config.rope_scaling['long_factor']
186
+ self.original_max_position_embeddings = config.original_max_position_embeddings
187
+
188
+ @torch.no_grad()
189
+ def forward(self, x, position_ids, seq_len=None):
190
+ seq_len = torch.max(position_ids) + 1
191
+ if seq_len > self.original_max_position_embeddings:
192
+ ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
193
+ else:
194
+ ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
195
+
196
+ inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
197
+ self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)
198
+
199
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
200
+ position_ids_expanded = position_ids[:, None, :].float()
201
+
202
+ # Force float32 since bfloat16 loses precision on long contexts
203
+ # See https://github.com/huggingface/transformers/pull/29285
204
+ device_type = x.device.type
205
+ device_type = device_type if isinstance(device_type, str) and device_type != 'mps' else 'cpu'
206
+ with torch.autocast(device_type=device_type, enabled=False):
207
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
208
+ emb = torch.cat((freqs, freqs), dim=-1)
209
+
210
+ scale = self.max_position_embeddings / self.original_max_position_embeddings
211
+ if scale <= 1.0:
212
+ scaling_factor = 1.0
213
+ else:
214
+ scaling_factor = 0.1 * math.log(scale) + 1.0
215
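+ # Same structure as the 'su' variant above; only the magnitude correction differs:
+ # e.g. for the same 32x extension this gives 0.1*ln(32) + 1 ~ 1.35 instead of the sqrt-log form.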
+
216
+ cos = emb.cos() * scaling_factor
217
+ sin = emb.sin() * scaling_factor
218
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
219
+
220
+
221
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
222
+ def rotate_half(x):
223
+ """Rotates half the hidden dims of the input."""
224
+ x1 = x[..., : x.shape[-1] // 2]
225
+ x2 = x[..., x.shape[-1] // 2 :]
226
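+ # e.g. for a last dimension [x1, x2, x3, x4] this returns [-x3, -x4, x1, x2].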
+ return torch.cat((-x2, x1), dim=-1)
227
+
228
+
229
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
230
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
231
+ """Applies Rotary Position Embedding to the query and key tensors.
232
+
233
+ Args:
234
+ q (`torch.Tensor`): The query tensor.
235
+ k (`torch.Tensor`): The key tensor.
236
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
237
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
238
+ position_ids (`torch.Tensor`, *optional*):
239
+ Deprecated and unused.
240
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
241
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
242
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
243
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
244
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
245
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
246
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
247
+ Returns:
248
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
249
+ """
250
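+ # With q/k of shape (batch, heads, seq_len, head_dim) and cos/sin of shape (batch, seq_len, head_dim),
+ # unsqueeze_dim=1 turns cos/sin into (batch, 1, seq_len, head_dim) so they broadcast over the head axis.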
+ cos = cos.unsqueeze(unsqueeze_dim)
251
+ sin = sin.unsqueeze(unsqueeze_dim)
252
+ q_embed = (q * cos) + (rotate_half(q) * sin)
253
+ k_embed = (k * cos) + (rotate_half(k) * sin)
254
+ return q_embed, k_embed
255
+
256
+
257
+ class Phi3MLP(nn.Module):
258
+ def __init__(self, config):
259
+ super().__init__()
260
+
261
+ self.config = config
262
+ self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
263
+ self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
264
+
265
+ self.activation_fn = ACT2FN[config.hidden_act]
266
+
267
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
268
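+ # gate_up_proj packs the gate and up projections into a single matmul: its output of width
+ # 2*intermediate_size is chunked into (gate, up) and combined as down_proj(up * act(gate)).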
+ up_states = self.gate_up_proj(hidden_states)
269
+
270
+ gate, up_states = up_states.chunk(2, dim=-1)
271
+ up_states = up_states * self.activation_fn(gate)
272
+
273
+ return self.down_proj(up_states)
274
+
275
+
276
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi
277
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
278
+ """
279
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
280
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
281
+ """
282
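+ # e.g. with 32 attention heads and 8 key/value heads, n_rep=4 and each kv head is repeated 4 times.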
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
283
+ if n_rep == 1:
284
+ return hidden_states
285
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
286
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
287
+
288
+
289
+ class Phi3Attention(nn.Module):
290
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
291
+
292
+ def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None):
293
+ super().__init__()
294
+ self.config = config
295
+ self.layer_idx = layer_idx
296
+ if layer_idx is None:
297
+ logger.warning_once(
298
+ f'Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will '
299
+ 'lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` '
300
+ 'when creating this class.'
301
+ )
302
+
303
+ self.attention_dropout = config.attention_dropout
304
+ self.hidden_size = config.hidden_size
305
+ self.num_heads = config.num_attention_heads
306
+ self.head_dim = self.hidden_size // self.num_heads
307
+ self.num_key_value_heads = config.num_key_value_heads
308
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
309
+ self.max_position_embeddings = config.max_position_embeddings
310
+ self.original_max_position_embeddings = config.original_max_position_embeddings
311
+ self.rope_theta = config.rope_theta
312
+ self.rope_scaling = config.rope_scaling
313
+ self.is_causal = True
314
+
315
+ if (self.head_dim * self.num_heads) != self.hidden_size:
316
+ raise ValueError(
317
+ f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}'
318
+ f' and `num_heads`: {self.num_heads}).'
319
+ )
320
+
321
+ op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim)
322
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
323
+ self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=False)
324
+ self._init_rope()
325
+
326
+ def _init_rope(self):
327
+ if self.rope_scaling is None:
328
+ self.rotary_emb = Phi3RotaryEmbedding(
329
+ self.head_dim,
330
+ max_position_embeddings=self.max_position_embeddings,
331
+ base=self.rope_theta,
332
+ )
333
+ else:
334
+ scaling_type = self.config.rope_scaling['type']
335
+ if scaling_type == 'su':
336
+ self.rotary_emb = Phi3SuScaledRotaryEmbedding(self.head_dim, self.config)
337
+ elif scaling_type == 'yarn':
338
+ self.rotary_emb = Phi3YarnScaledRotaryEmbedding(self.head_dim, self.config)
339
+ else:
340
+ raise ValueError(f'Unknown RoPE scaling type {scaling_type}')
341
+
342
+ def forward(
343
+ self,
344
+ hidden_states: torch.Tensor,
345
+ attention_mask: Optional[torch.Tensor] = None,
346
+ position_ids: Optional[torch.LongTensor] = None,
347
+ past_key_value: Optional[Cache] = None,
348
+ output_attentions: bool = False,
349
+ use_cache: bool = False,
350
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
351
+ logger.warning_once('You are not running the flash-attention implementation, expect numerical differences.')
352
+
353
+ bsz, q_len, _ = hidden_states.size()
354
+
355
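+ # qkv_proj is a single fused projection; its output is laid out as [q | k | v] with widths
+ # num_heads*head_dim, num_key_value_heads*head_dim and num_key_value_heads*head_dim respectively.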
+ qkv = self.qkv_proj(hidden_states)
356
+ query_pos = self.num_heads * self.head_dim
357
+ query_states = qkv[..., :query_pos]
358
+ key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
359
+ value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
360
+
361
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
362
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
363
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
364
+
365
+ kv_seq_len = key_states.shape[-2]
366
+ if past_key_value is not None:
367
+ if self.layer_idx is None:
368
+ raise ValueError(
369
+ f'The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} '
370
+ 'for auto-regressive decoding with k/v caching, please make sure to initialize the attention class '
371
+ 'with a layer index.'
372
+ )
373
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
374
+ cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
375
+
376
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
377
+
378
+ if past_key_value is not None:
379
+ cache_kwargs = {'sin': sin, 'cos': cos} # Specific to RoPE models
380
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
381
+
382
+ # repeat k/v heads if n_kv_heads < n_heads
383
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
384
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
385
+
386
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
387
+
388
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
389
+ raise ValueError(
390
+ f'Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is'
391
+ f' {attn_weights.size()}'
392
+ )
393
+
394
+ if attention_mask is not None:
395
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
396
+ raise ValueError(
397
+ f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}'
398
+ )
399
+ attn_weights = attn_weights + attention_mask
400
+
401
+ # upcast attention to fp32
402
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype)
403
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
404
+
405
+ attn_output = torch.matmul(attn_weights, value_states)
406
+
407
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
408
+ raise ValueError(
409
+ f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is'
410
+ f' {attn_output.size()}'
411
+ )
412
+
413
+ attn_output = attn_output.transpose(1, 2).contiguous()
414
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
415
+
416
+ attn_output = self.o_proj(attn_output)
417
+
418
+ if not output_attentions:
419
+ attn_weights = None
420
+
421
+ return attn_output, attn_weights, past_key_value
422
+
423
+
424
+ class Phi3FlashAttention2(Phi3Attention):
425
+ """
426
+ Phi-3 flash attention module. This module inherits from `Phi3Attention`, as the weights of the module stay
427
+ untouched. The only required change is in the forward pass, where it needs to correctly call the public API of
428
+ flash attention and handle padding tokens in case the input contains any of them.
429
+ """
430
+
431
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
432
+ def __init__(self, *args, **kwargs):
433
+ super().__init__(*args, **kwargs)
434
+
435
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
436
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
437
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
438
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
439
+
440
+ def forward(
441
+ self,
442
+ hidden_states: torch.Tensor,
443
+ attention_mask: Optional[torch.LongTensor] = None,
444
+ position_ids: Optional[torch.LongTensor] = None,
445
+ past_key_value: Optional[Cache] = None,
446
+ output_attentions: bool = False,
447
+ use_cache: bool = False,
448
+ **kwargs,
449
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
450
+ # Phi3FlashAttention2 attention does not support output_attentions
451
+
452
+ if not _flash_supports_window_size:
453
+ logger.warning_once(
454
+ "The current flash attention version does not support sliding window attention. Please use `attn_implementation='eager'` or upgrade flash-attn library."
455
+ )
456
+ raise ValueError('The current flash attention version does not support sliding window attention.')
457
+
458
+ output_attentions = False
459
+
460
+ if 'padding_mask' in kwargs:
461
+ warnings.warn(
462
+ 'Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead.'
463
+ )
464
+
465
+ # overwrite attention_mask with padding_mask
466
+ attention_mask = kwargs.pop('padding_mask')
467
+
468
+ bsz, q_len, _ = hidden_states.size()
469
+
470
+ qkv = self.qkv_proj(hidden_states)
471
+ query_pos = self.num_heads * self.head_dim
472
+ query_states = qkv[..., :query_pos]
473
+ key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
474
+ value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
475
+
476
+ # Flash attention requires the input to have the shape
477
+ # batch_size x seq_length x num_heads x head_dim
478
+ # therefore we just need to keep the original shape
479
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
480
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
481
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
482
+
483
+ kv_seq_len = key_states.shape[-2]
484
+ if past_key_value is not None:
485
+ if self.layer_idx is None:
486
+ raise ValueError(
487
+ f'The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} '
488
+ 'for auto-regressive decoding with k/v caching, please make sure to initialize the attention class '
489
+ 'with a layer index.'
490
+ )
491
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
492
+
493
+ # Because the input can be padded, the absolute sequence length depends on the max position id.
494
+ rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
495
+ cos, sin = self.rotary_emb(value_states, position_ids, seq_len=rotary_seq_len)
496
+
497
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
498
+
499
+ use_sliding_windows = (
500
+ _flash_supports_window_size
501
+ and getattr(self.config, 'sliding_window', None) is not None
502
+ and kv_seq_len > self.config.sliding_window
503
+ )
504
+
505
+ if past_key_value is not None:
506
+ # Activate cache slicing only if the config has a `sliding_window` attribute
507
+ cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
508
+ if (
509
+ getattr(self.config, 'sliding_window', None) is not None
510
+ and kv_seq_len > self.config.sliding_window
511
+ and cache_has_contents
512
+ ):
513
+ slicing_tokens = 1 - self.config.sliding_window
514
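+ # slicing_tokens is negative, so past_key[:, :, slicing_tokens:, :] keeps only the last
+ # sliding_window - 1 cached positions, which the shape check below verifies.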
+
515
+ past_key = past_key_value[self.layer_idx][0]
516
+ past_value = past_key_value[self.layer_idx][1]
517
+
518
+ past_key = past_key[:, :, slicing_tokens:, :].contiguous()
519
+ past_value = past_value[:, :, slicing_tokens:, :].contiguous()
520
+
521
+ if past_key.shape[-2] != self.config.sliding_window - 1:
522
+ raise ValueError(
523
+ f'past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got'
524
+ f' {past_key.shape}'
525
+ )
526
+
527
+ if attention_mask is not None:
528
+ attention_mask = attention_mask[:, slicing_tokens:]
529
+ attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
530
+
531
+ cache_kwargs = {'sin': sin, 'cos': cos} # Specific to RoPE models
532
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
533
+
534
+ # repeat k/v heads if n_kv_heads < n_heads
535
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
536
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
537
+
538
+ attn_dropout = self.attention_dropout if self.training else 0.0
539
+
540
+ # In PEFT, the layer norms are usually cast to float32 for training stability,
541
+ # so the input hidden states may get silently cast to float32. Hence, we need to
542
+ # cast them back to the correct dtype just to be sure everything works as expected.
543
+ # This might slow down training & inference, so it is recommended not to cast the LayerNorms
544
+ # in fp32.
545
+
546
+ if query_states.dtype == torch.float32:
547
+ if torch.is_autocast_enabled():
548
+ target_dtype = torch.get_autocast_gpu_dtype()
549
+ # Handle the case where the model is quantized
550
+ elif hasattr(self.config, '_pre_quantization_dtype'):
551
+ target_dtype = self.config._pre_quantization_dtype
552
+ else:
553
+ target_dtype = self.qkv_proj.weight.dtype
554
+
555
+ logger.warning_once(
556
+ f'The input hidden states seem to have been silently cast to float32; this might be related to'
557
+ f' the fact that you have upcast embedding or layer norm layers to float32. We will cast the input back to'
558
+ f' {target_dtype}.'
559
+ )
560
+
561
+ query_states = query_states.to(target_dtype)
562
+ key_states = key_states.to(target_dtype)
563
+ value_states = value_states.to(target_dtype)
564
+
565
+ # Reshape to the expected shape for Flash Attention
566
+ query_states = query_states.transpose(1, 2)
567
+ key_states = key_states.transpose(1, 2)
568
+ value_states = value_states.transpose(1, 2)
569
+
570
+ attn_output = self._flash_attention_forward(
571
+ query_states,
572
+ key_states,
573
+ value_states,
574
+ attention_mask,
575
+ q_len,
576
+ dropout=attn_dropout,
577
+ use_sliding_windows=use_sliding_windows,
578
+ )
579
+
580
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
581
+ attn_output = self.o_proj(attn_output)
582
+
583
+ if not output_attentions:
584
+ attn_weights = None
585
+
586
+ return attn_output, attn_weights, past_key_value
587
+
588
+ # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._flash_attention_forward
589
+ def _flash_attention_forward(
590
+ self,
591
+ query_states,
592
+ key_states,
593
+ value_states,
594
+ attention_mask,
595
+ query_length,
596
+ dropout=0.0,
597
+ softmax_scale=None,
598
+ use_sliding_windows=False,
599
+ ):
600
+ """
601
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token,
602
+ it first unpads the input, then computes the attention scores and pads the final attention scores.
603
+
604
+ Args:
605
+ query_states (`torch.Tensor`):
606
+ Input query states to be passed to Flash Attention API
607
+ key_states (`torch.Tensor`):
608
+ Input key states to be passed to Flash Attention API
609
+ value_states (`torch.Tensor`):
610
+ Input value states to be passed to Flash Attention API
611
+ attention_mask (`torch.Tensor`):
612
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
613
+ position of padding tokens and 1 for the position of non-padding tokens.
614
+ dropout (`float`):
615
+ Attention dropout
616
+ softmax_scale (`float`, *optional*):
617
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
618
+ use_sliding_windows (`bool`, *optional*):
619
+ Whether to activate sliding window attention.
620
+ """
621
+ if not self._flash_attn_uses_top_left_mask:
622
+ causal = self.is_causal
623
+ else:
624
+ # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
625
+ causal = self.is_causal and query_length != 1
626
+
627
+ # Contains at least one padding token in the sequence
628
+ if attention_mask is not None:
629
+ batch_size = query_states.shape[0]
630
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
631
+ query_states, key_states, value_states, attention_mask, query_length
632
+ )
633
+
634
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
635
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
636
+
637
+ if not use_sliding_windows:
638
+ attn_output_unpad = flash_attn_varlen_func(
639
+ query_states,
640
+ key_states,
641
+ value_states,
642
+ cu_seqlens_q=cu_seqlens_q,
643
+ cu_seqlens_k=cu_seqlens_k,
644
+ max_seqlen_q=max_seqlen_in_batch_q,
645
+ max_seqlen_k=max_seqlen_in_batch_k,
646
+ dropout_p=dropout,
647
+ softmax_scale=softmax_scale,
648
+ causal=causal,
649
+ )
650
+ else:
651
+ attn_output_unpad = flash_attn_varlen_func(
652
+ query_states,
653
+ key_states,
654
+ value_states,
655
+ cu_seqlens_q=cu_seqlens_q,
656
+ cu_seqlens_k=cu_seqlens_k,
657
+ max_seqlen_q=max_seqlen_in_batch_q,
658
+ max_seqlen_k=max_seqlen_in_batch_k,
659
+ dropout_p=dropout,
660
+ softmax_scale=softmax_scale,
661
+ causal=causal,
662
+ window_size=(self.config.sliding_window, self.config.sliding_window),
663
+ )
664
+
665
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
666
+ else:
667
+ if not use_sliding_windows:
668
+ attn_output = flash_attn_func(
669
+ query_states,
670
+ key_states,
671
+ value_states,
672
+ dropout,
673
+ softmax_scale=softmax_scale,
674
+ causal=causal,
675
+ )
676
+ else:
677
+ attn_output = flash_attn_func(
678
+ query_states,
679
+ key_states,
680
+ value_states,
681
+ dropout,
682
+ softmax_scale=softmax_scale,
683
+ causal=causal,
684
+ window_size=(self.config.sliding_window, self.config.sliding_window),
685
+ )
686
+
687
+ return attn_output
688
+
689
+ # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
690
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
691
+ batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
692
+
693
+ # On the first iteration we need to properly re-create the padding mask
694
+ # by slicing it in the proper place
695
+ if kv_seq_len != attention_mask.shape[-1]:
696
+ attention_mask_num_tokens = attention_mask.shape[-1]
697
+ attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
698
+
699
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
700
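+ # cu_seqlens are cumulative sequence lengths in flash-attn's varlen format, e.g. per-sample
+ # lengths [3, 5] become cu_seqlens [0, 3, 8]; indices_k selects the non-padded token positions.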
+
701
+ key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
702
+ value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
703
+
704
+ if query_length == kv_seq_len:
705
+ query_layer = index_first_axis(
706
+ query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
707
+ )
708
+ cu_seqlens_q = cu_seqlens_k
709
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
710
+ indices_q = indices_k
711
+ elif query_length == 1:
712
+ max_seqlen_in_batch_q = 1
713
+ cu_seqlens_q = torch.arange(
714
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
715
+ ) # There is a memcpy here, that is very bad.
716
+ indices_q = cu_seqlens_q[:-1]
717
+ query_layer = query_layer.squeeze(1)
718
+ else:
719
+ # The -q_len: slice assumes left padding.
720
+ attention_mask = attention_mask[:, -query_length:]
721
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
722
+
723
+ return (
724
+ query_layer,
725
+ key_layer,
726
+ value_layer,
727
+ indices_q,
728
+ (cu_seqlens_q, cu_seqlens_k),
729
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
730
+ )
731
+
732
+
733
+ # copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi3
734
+ # TODO @Arthur no longer copied from LLama after static cache
735
+ class Phi3SdpaAttention(Phi3Attention):
736
+ """
737
+ Phi3 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
738
+ `Phi3Attention`, as the weights of the module stay untouched. The only changes are in the forward pass, to adapt to the
739
+ SDPA API.
740
+ """
741
+
742
+ # Adapted from Phi3Attention.forward
743
+ def forward(
744
+ self,
745
+ hidden_states: torch.Tensor,
746
+ attention_mask: Optional[torch.Tensor] = None,
747
+ position_ids: Optional[torch.LongTensor] = None,
748
+ past_key_value: Optional[Cache] = None,
749
+ output_attentions: bool = False,
750
+ use_cache: bool = False,
751
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
752
+ if output_attentions:
753
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
754
+ logger.warning_once(
755
+ 'Phi3Model is using Phi3SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, '
756
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
757
+ )
758
+ return super().forward(
759
+ hidden_states=hidden_states,
760
+ attention_mask=attention_mask,
761
+ position_ids=position_ids,
762
+ past_key_value=past_key_value,
763
+ output_attentions=output_attentions,
764
+ use_cache=use_cache,
765
+ )
766
+
767
+ bsz, q_len, _ = hidden_states.size()
768
+
769
+ qkv = self.qkv_proj(hidden_states)
770
+ query_pos = self.num_heads * self.head_dim
771
+ query_states = qkv[..., :query_pos]
772
+ key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
773
+ value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
774
+
775
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
776
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
777
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
778
+
779
+ kv_seq_len = key_states.shape[-2]
780
+ if past_key_value is not None:
781
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
782
+ cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
783
+
784
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
785
+
786
+ if past_key_value is not None:
787
+ cache_kwargs = {'sin': sin, 'cos': cos} # Specific to RoPE models
788
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
789
+
790
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
791
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
792
+
793
+ if attention_mask is not None:
794
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
795
+ raise ValueError(
796
+ f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}'
797
+ )
798
+
799
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
800
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
801
+ if query_states.device.type == 'cuda' and attention_mask is not None:
802
+ query_states = query_states.contiguous()
803
+ key_states = key_states.contiguous()
804
+ value_states = value_states.contiguous()
805
+
806
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
807
+ query_states,
808
+ key_states,
809
+ value_states,
810
+ attn_mask=attention_mask,
811
+ dropout_p=self.attention_dropout if self.training else 0.0,
812
+ # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
813
+ is_causal=self.is_causal and attention_mask is None and q_len > 1,
814
+ )
815
+
816
+ attn_output = attn_output.transpose(1, 2).contiguous()
817
+ attn_output = attn_output.view(bsz, q_len, self.hidden_size)
818
+
819
+ attn_output = self.o_proj(attn_output)
820
+
821
+ return attn_output, None, past_key_value
822
+
823
+
824
+ PHI3_ATTENTION_CLASSES = {
825
+ 'eager': Phi3Attention,
826
+ 'flash_attention_2': Phi3FlashAttention2,
827
+ 'sdpa': Phi3SdpaAttention,
828
+ }
829
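+ # The decoder layer below picks one of these implementations based on config._attn_implementation;
+ # Phi3PreTrainedModel.__init__ additionally forces 'eager' when has_flash_attn is False.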
+
830
+
831
+ class Phi3DecoderLayer(nn.Module):
832
+ def __init__(self, config: Phi3Config, layer_idx: int):
833
+ super().__init__()
834
+
835
+ self.config = config
836
+ self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
837
+
838
+ self.mlp = Phi3MLP(config)
839
+ self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
840
+
841
+ self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
842
+ self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
843
+ self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
844
+
845
+ def forward(
846
+ self,
847
+ hidden_states: torch.Tensor,
848
+ attention_mask: Optional[torch.Tensor] = None,
849
+ position_ids: Optional[torch.LongTensor] = None,
850
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
851
+ output_attentions: Optional[bool] = False,
852
+ use_cache: Optional[bool] = False,
853
+ **kwargs,
854
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
855
+ if 'padding_mask' in kwargs:
856
+ warnings.warn(
857
+ 'Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead.'
858
+ )
859
+ """
860
+ Args:
861
+ hidden_states (`torch.FloatTensor`):
862
+ input to the layer of shape `(batch, seq_len, embed_dim)`
863
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
864
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
865
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
866
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
867
+ `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
868
+ output_attentions (`bool`, *optional*):
869
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
870
+ returned tensors for more detail.
871
+ use_cache (`bool`, *optional*):
872
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
873
+ (see `past_key_values`).
874
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
875
+ """
876
+
877
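+ # Pre-norm residual block: attention and the MLP each see a normed input, and their
+ # (dropout-regularised) outputs are added back onto the running residual stream.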
+ residual = hidden_states
878
+
879
+ hidden_states = self.input_layernorm(hidden_states)
880
+
881
+ # Self Attention
882
+ attn_outputs, self_attn_weights, present_key_value = self.self_attn(
883
+ hidden_states=hidden_states,
884
+ attention_mask=attention_mask,
885
+ position_ids=position_ids,
886
+ past_key_value=past_key_value,
887
+ output_attentions=output_attentions,
888
+ use_cache=use_cache,
889
+ )
890
+
891
+ hidden_states = residual + self.resid_attn_dropout(attn_outputs)
892
+
893
+ residual = hidden_states
894
+ hidden_states = self.post_attention_layernorm(hidden_states)
895
+ hidden_states = self.mlp(hidden_states)
896
+ hidden_states = residual + self.resid_mlp_dropout(hidden_states)
897
+
898
+ outputs = (hidden_states,)
899
+
900
+ if output_attentions:
901
+ outputs += (self_attn_weights,)
902
+
903
+ if use_cache:
904
+ outputs += (present_key_value,)
905
+
906
+ return outputs
907
+
908
+
909
+ PHI3_START_DOCSTRING = r"""
910
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
911
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
912
+ etc.)
913
+
914
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
915
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
916
+ and behavior.
917
+
918
+ Parameters:
919
+ config ([`Phi3Config`]):
920
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
921
+ load the weights associated with the model, only the configuration. Check out the
922
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
923
+ """
924
+
925
+
926
+ @add_start_docstrings(
927
+ 'The bare Phi-3 model outputting raw hidden-states without any specific head on top.',
928
+ PHI3_START_DOCSTRING,
929
+ )
930
+ class Phi3PreTrainedModel(PreTrainedModel):
931
+ config_class = Phi3Config
932
+ base_model_prefix = 'model'
933
+ supports_gradient_checkpointing = True
934
+ _no_split_modules = ['Phi3DecoderLayer']
935
+ _skip_keys_device_placement = 'past_key_values'
936
+ _supports_flash_attn_2 = True
937
+ _supports_sdpa = False
938
+ _supports_cache_class = True
939
+
940
+ _version = '0.0.5'
941
+
942
+ def __init__(self, config: Phi3Config):
943
+ if not has_flash_attn:
944
+ config._attn_implementation = 'eager'
945
+ print('Warning: Flash attention is not available, using eager attention instead.')
946
+ super().__init__(config)
947
+
948
+ def _init_weights(self, module):
949
+ std = self.config.initializer_range
950
+ if isinstance(module, nn.Linear):
951
+ module.weight.data.normal_(mean=0.0, std=std)
952
+ if module.bias is not None:
953
+ module.bias.data.zero_()
954
+ elif isinstance(module, nn.Embedding):
955
+ module.weight.data.normal_(mean=0.0, std=std)
956
+ if module.padding_idx is not None:
957
+ module.weight.data[module.padding_idx].zero_()
958
+
959
+
960
+ PHI3_INPUTS_DOCSTRING = r"""
961
+ Args:
962
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
963
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
964
+ it.
965
+
966
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
967
+ [`PreTrainedTokenizer.__call__`] for details.
968
+
969
+ [What are input IDs?](../glossary#input-ids)
970
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
971
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
972
+
973
+ - 1 for tokens that are **not masked**,
974
+ - 0 for tokens that are **masked**.
975
+
976
+ [What are attention masks?](../glossary#attention-mask)
977
+
978
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
979
+ [`PreTrainedTokenizer.__call__`] for details.
980
+
981
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
982
+ `past_key_values`).
983
+
984
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
985
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
986
+ information on the default strategy.
987
+
988
+ - 1 indicates the head is **not masked**,
989
+ - 0 indicates the head is **masked**.
990
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
991
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
992
+ config.n_positions - 1]`.
993
+
994
+ [What are position IDs?](../glossary#position-ids)
995
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
996
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
997
+ blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
998
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
999
+
1000
+ Two formats are allowed:
1001
+ - a [`~cache_utils.Cache`] instance;
1002
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
1003
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
1004
+ cache format.
1005
+
1006
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
1007
+ legacy cache format will be returned.
1008
+
1009
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
1010
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
1011
+ of shape `(batch_size, sequence_length)`.
1012
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1013
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1014
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1015
+ model's internal embedding lookup matrix.
1016
+ use_cache (`bool`, *optional*):
1017
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1018
+ `past_key_values`).
1019
+ output_attentions (`bool`, *optional*):
1020
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1021
+ tensors for more detail.
1022
+ output_hidden_states (`bool`, *optional*):
1023
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1024
+ more detail.
1025
+ return_dict (`bool`, *optional*):
1026
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1027
+ """
1028
+
1029
+
1030
+ @add_start_docstrings(
1031
+ 'The bare Phi-3 model outputting raw hidden-states without any specific head on top.',
1032
+ PHI3_START_DOCSTRING,
1033
+ )
1034
+ class Phi3Model(Phi3PreTrainedModel):
1035
+ """
1036
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Phi3DecoderLayer`]
1037
+
1038
+ Args:
1039
+ config: Phi3Config
1040
+ """
1041
+
1042
+ def __init__(self, config: Phi3Config):
1043
+ super().__init__(config)
1044
+ self.padding_idx = config.pad_token_id
1045
+ self.vocab_size = config.vocab_size
1046
+
1047
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1048
+ self.embed_dropout = nn.Dropout(config.embd_pdrop)
1049
+ self.layers = nn.ModuleList(
1050
+ [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
1051
+ )
1052
+ self._attn_implementation = config._attn_implementation
1053
+
1054
+ self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1055
+
1056
+ self.gradient_checkpointing = False
1057
+ # Initialize weights and apply final processing
1058
+ self.post_init()
1059
+
1060
+ def get_input_embeddings(self):
1061
+ return self.embed_tokens
1062
+
1063
+ def set_input_embeddings(self, value):
1064
+ self.embed_tokens = value
1065
+
1066
+ @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
1067
+ def forward(
1068
+ self,
1069
+ input_ids: torch.LongTensor = None,
1070
+ attention_mask: Optional[torch.Tensor] = None,
1071
+ position_ids: Optional[torch.LongTensor] = None,
1072
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1073
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1074
+ use_cache: Optional[bool] = None,
1075
+ output_attentions: Optional[bool] = None,
1076
+ output_hidden_states: Optional[bool] = None,
1077
+ return_dict: Optional[bool] = None,
1078
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
1079
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1080
+ output_hidden_states = (
1081
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1082
+ )
1083
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1084
+
1085
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1086
+
1087
+ # retrieve input_ids and inputs_embeds
1088
+ if input_ids is not None and inputs_embeds is not None:
1089
+ raise ValueError('You cannot specify both input_ids and inputs_embeds at the same time')
1090
+ elif input_ids is not None:
1091
+ batch_size, seq_length = input_ids.shape[:2]
1092
+ elif inputs_embeds is not None:
1093
+ batch_size, seq_length = inputs_embeds.shape[:2]
1094
+ else:
1095
+ raise ValueError('You have to specify either input_ids or inputs_embeds')
1096
+
1097
+ past_key_values_length = 0
1098
+
1099
+ if self.gradient_checkpointing and self.training:
1100
+ if use_cache:
1101
+ logger.warning_once(
1102
+ '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
1103
+ )
1104
+ use_cache = False
1105
+
1106
+ if use_cache:
1107
+ use_legacy_cache = not isinstance(past_key_values, Cache)
1108
+ if use_legacy_cache:
1109
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
1110
+ past_key_values_length = past_key_values.get_usable_length(seq_length)
1111
+
1112
+ if position_ids is None:
1113
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
1114
+ position_ids = torch.arange(
1115
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
1116
+ )
1117
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
1118
+ else:
1119
+ position_ids = position_ids.view(-1, seq_length).long()
1120
+
1121
+ if inputs_embeds is None:
1122
+ inputs_embeds = self.embed_tokens(input_ids)
1123
+
1124
+ if attention_mask is not None and self._attn_implementation == 'flash_attention_2' and use_cache:
1125
+ is_padding_right = attention_mask[:, -1].sum().item() != batch_size
1126
+ if is_padding_right:
1127
+ raise ValueError(
1128
+ "You are attempting to perform batched generation with padding_side='right'"
1129
+ ' this may lead to unexpected behaviour for Flash Attention version of Phi3. Make sure to '
1130
+ " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
1131
+ )
1132
+
1133
+ if self._attn_implementation == 'flash_attention_2':
1134
+ # 2d mask is passed through the layers
1135
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
1136
+ else:
1137
+ # 4d mask is passed through the layers
1138
+ attention_mask = _prepare_4d_causal_attention_mask(
1139
+ attention_mask,
1140
+ (batch_size, seq_length),
1141
+ inputs_embeds,
1142
+ past_key_values_length,
1143
+ sliding_window=self.config.sliding_window,
1144
+ )
1145
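+ # With flash-attention the 2-D padding mask is passed through unchanged (or dropped when there is
+ # no padding); otherwise it is expanded here into a 4-D causal mask that also encodes the sliding window.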
+
1146
+ hidden_states = inputs_embeds
1147
+
1148
+ # decoder layers
1149
+ all_hidden_states = () if output_hidden_states else None
1150
+ all_self_attns = () if output_attentions else None
1151
+ next_decoder_cache = None
1152
+
1153
+ for decoder_layer in self.layers:
1154
+ if output_hidden_states:
1155
+ all_hidden_states += (hidden_states,)
1156
+
1157
+ if self.gradient_checkpointing and self.training:
1158
+ layer_outputs = self._gradient_checkpointing_func(
1159
+ decoder_layer.__call__,
1160
+ hidden_states,
1161
+ attention_mask,
1162
+ position_ids,
1163
+ past_key_values,
1164
+ output_attentions,
1165
+ use_cache,
1166
+ )
1167
+ else:
1168
+ layer_outputs = decoder_layer(
1169
+ hidden_states,
1170
+ attention_mask=attention_mask,
1171
+ position_ids=position_ids,
1172
+ past_key_value=past_key_values,
1173
+ output_attentions=output_attentions,
1174
+ use_cache=use_cache,
1175
+ )
1176
+
1177
+ hidden_states = layer_outputs[0]
1178
+
1179
+ if use_cache:
1180
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1181
+
1182
+ if output_attentions:
1183
+ all_self_attns += (layer_outputs[1],)
1184
+
1185
+ hidden_states = self.norm(hidden_states)
1186
+
1187
+ # add hidden states from the last decoder layer
1188
+ if output_hidden_states:
1189
+ all_hidden_states += (hidden_states,)
1190
+
1191
+ next_cache = None
1192
+ if use_cache:
1193
+ next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
1194
+ if not return_dict:
1195
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1196
+ return BaseModelOutputWithPast(
1197
+ last_hidden_state=hidden_states,
1198
+ past_key_values=next_cache,
1199
+ hidden_states=all_hidden_states,
1200
+ attentions=all_self_attns,
1201
+ )
1202
+
1203
+
1204
+ class Phi3ForCausalLM(Phi3PreTrainedModel):
1205
+ _tied_weights_keys = ['lm_head.weight']
1206
+
1207
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi3
1208
+ def __init__(self, config):
1209
+ super().__init__(config)
1210
+ self.model = Phi3Model(config)
1211
+ self.vocab_size = config.vocab_size
1212
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1213
+
1214
+ # Initialize weights and apply final processing
1215
+ self.post_init()
1216
+
1217
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings
1218
+ def get_input_embeddings(self):
1219
+ return self.model.embed_tokens
1220
+
1221
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings
1222
+ def set_input_embeddings(self, value):
1223
+ self.model.embed_tokens = value
1224
+
1225
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings
1226
+ def get_output_embeddings(self):
1227
+ return self.lm_head
1228
+
1229
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings
1230
+ def set_output_embeddings(self, new_embeddings):
1231
+ self.lm_head = new_embeddings
1232
+
1233
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder
1234
+ def set_decoder(self, decoder):
1235
+ self.model = decoder
1236
+
1237
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder
1238
+ def get_decoder(self):
1239
+ return self.model
1240
+
1241
+ # Ignore copy
1242
+ @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
1243
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1244
+ def forward(
1245
+ self,
1246
+ input_ids: torch.LongTensor = None,
1247
+ attention_mask: Optional[torch.Tensor] = None,
1248
+ position_ids: Optional[torch.LongTensor] = None,
1249
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1250
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1251
+ labels: Optional[torch.LongTensor] = None,
1252
+ use_cache: Optional[bool] = None,
1253
+ output_attentions: Optional[bool] = None,
1254
+ output_hidden_states: Optional[bool] = None,
1255
+ return_dict: Optional[bool] = None,
1256
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1257
+ r"""
1258
+ Args:
1259
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1260
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1261
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1262
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1263
+
1264
+ Returns:
1265
+
1266
+ Example:
1267
+
1268
+ ```python
1269
+ >>> from transformers import AutoTokenizer, Phi3ForCausalLM
1270
+
1271
+ >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct")
1272
+ >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")
1273
+
1274
+ >>> prompt = "This is an example script ."
1275
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1276
+
1277
+ >>> # Generate
1278
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1279
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1280
+ 'This is an example script .\n Certainly! Below is a sample script that demonstrates a simple task, such as calculating the sum'
1281
+ ```"""
1282
+
1283
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1284
+ output_hidden_states = (
1285
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1286
+ )
1287
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1288
+
1289
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1290
+ outputs = self.model(
1291
+ input_ids=input_ids,
1292
+ attention_mask=attention_mask,
1293
+ position_ids=position_ids,
1294
+ past_key_values=past_key_values,
1295
+ inputs_embeds=inputs_embeds,
1296
+ use_cache=use_cache,
1297
+ output_attentions=output_attentions,
1298
+ output_hidden_states=output_hidden_states,
1299
+ return_dict=return_dict,
1300
+ )
1301
+
1302
+ hidden_states = outputs[0]
1303
+ logits = self.lm_head(hidden_states)
1304
+ logits = logits.float()
1305
+
1306
+ loss = None
1307
+ if labels is not None:
1308
+ # Shift so that tokens < n predict n
1309
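+ # e.g. for labels [t0, t1, t2, t3] the loss compares logits at positions 0..2 with targets t1..t3.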
+ shift_logits = logits[..., :-1, :].contiguous()
1310
+ shift_labels = labels[..., 1:].contiguous()
1311
+ # Flatten the tokens
1312
+ loss_fct = CrossEntropyLoss()
1313
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1314
+ shift_labels = shift_labels.view(-1)
1315
+ # Enable model parallelism
1316
+ shift_labels = shift_labels.to(shift_logits.device)
1317
+ loss = loss_fct(shift_logits, shift_labels)
1318
+
1319
+ if not return_dict:
1320
+ output = (logits,) + outputs[1:]
1321
+ return (loss,) + output if loss is not None else output
1322
+
1323
+ return CausalLMOutputWithPast(
1324
+ loss=loss,
1325
+ logits=logits,
1326
+ past_key_values=outputs.past_key_values,
1327
+ hidden_states=outputs.hidden_states,
1328
+ attentions=outputs.attentions,
1329
+ )
1330
+
1331
+ # Copied from transformers.models.persimmon.modeling_persimmon.PersimmonForCausalLM.prepare_inputs_for_generation
1332
+ def prepare_inputs_for_generation(
1333
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
1334
+ ):
1335
+ if past_key_values is not None:
1336
+ if isinstance(past_key_values, Cache):
1337
+ cache_length = past_key_values.get_seq_length()
1338
+ past_length = past_key_values.seen_tokens
1339
+ max_cache_length = past_key_values.get_max_length()
1340
+ else:
1341
+ cache_length = past_length = past_key_values[0][0].shape[2]
1342
+ max_cache_length = None
1343
+
1344
+ # Keep only the unprocessed tokens:
1345
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
1346
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing inputs_embeds as
1347
+ # input)
1348
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
1349
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
1350
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
1351
+ # input_ids based on the past_length.
1352
+ elif past_length < input_ids.shape[1]:
1353
+ input_ids = input_ids[:, past_length:]
1354
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
1355
+
1356
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
1357
+ if (
1358
+ max_cache_length is not None
1359
+ and attention_mask is not None
1360
+ and cache_length + input_ids.shape[1] > max_cache_length
1361
+ ):
1362
+ attention_mask = attention_mask[:, -max_cache_length:]
1363
+
1364
+ position_ids = kwargs.get('position_ids', None)
1365
+ if attention_mask is not None and position_ids is None:
1366
+ # create position_ids on the fly for batch generation
1367
+ position_ids = attention_mask.long().cumsum(-1) - 1
1368
+ position_ids.masked_fill_(attention_mask == 0, 1)
1369
+ if past_key_values:
1370
+ position_ids = position_ids[:, -input_ids.shape[1] :]
1371
+
1372
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1373
+ if (inputs_embeds is not None and past_key_values is None) or (inputs_embeds is not None and len(past_key_values) == 0):
1374
+ model_inputs = {'inputs_embeds': inputs_embeds}
1375
+ else:
1376
+ model_inputs = {'input_ids': input_ids}
1377
+
1378
+ model_inputs.update(
1379
+ {
1380
+ 'position_ids': position_ids,
1381
+ 'past_key_values': past_key_values,
1382
+ 'use_cache': kwargs.get('use_cache'),
1383
+ 'attention_mask': attention_mask,
1384
+ }
1385
+ )
1386
+ return model_inputs
1387
+
1388
+ @staticmethod
1389
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM._reorder_cache
1390
+ def _reorder_cache(past_key_values, beam_idx):
1391
+ reordered_past = ()
1392
+ for layer_past in past_key_values:
1393
+ reordered_past += (
1394
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1395
+ )
1396
+ return reordered_past
1397
+
1398
+
1399
+ @add_start_docstrings(
1400
+ """
1401
+ The [`Phi3Model`] with a sequence classification head on top (linear layer).
1402
+
1403
+ [`Phi3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1404
+ (e.g. GPT-2) do.
1405
+
1406
+ Since it does classification on the last token, it requires to know the position of the last token. If a
1407
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1408
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1409
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1410
+ each row of the batch).
1411
+ """,
1412
+ PHI3_START_DOCSTRING,
1413
+ )
1414
+ # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi3, LLAMA->PHI3, self.transformer->self.model, transformer_outputs->model_outputs
1415
+ class Phi3ForSequenceClassification(Phi3PreTrainedModel):
1416
+ def __init__(self, config):
1417
+ super().__init__(config)
1418
+ self.num_labels = config.num_labels
1419
+ self.model = Phi3Model(config)
1420
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1421
+
1422
+ # Initialize weights and apply final processing
1423
+ self.post_init()
1424
+
1425
+ def get_input_embeddings(self):
1426
+ return self.model.embed_tokens
1427
+
1428
+ def set_input_embeddings(self, value):
1429
+ self.model.embed_tokens = value
1430
+
1431
+ @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
1432
+ def forward(
1433
+ self,
1434
+ input_ids: torch.LongTensor = None,
1435
+ attention_mask: Optional[torch.Tensor] = None,
1436
+ position_ids: Optional[torch.LongTensor] = None,
1437
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1438
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1439
+ labels: Optional[torch.LongTensor] = None,
1440
+ use_cache: Optional[bool] = None,
1441
+ output_attentions: Optional[bool] = None,
1442
+ output_hidden_states: Optional[bool] = None,
1443
+ return_dict: Optional[bool] = None,
1444
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1445
+ r"""
1446
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1447
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1448
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1449
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1450
+ """
1451
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1452
+
1453
+ model_outputs = self.model(
1454
+ input_ids,
1455
+ attention_mask=attention_mask,
1456
+ position_ids=position_ids,
1457
+ past_key_values=past_key_values,
1458
+ inputs_embeds=inputs_embeds,
1459
+ use_cache=use_cache,
1460
+ output_attentions=output_attentions,
1461
+ output_hidden_states=output_hidden_states,
1462
+ return_dict=return_dict,
1463
+ )
1464
+ hidden_states = model_outputs[0]
1465
+ logits = self.score(hidden_states)
1466
+
1467
+ if input_ids is not None:
1468
+ batch_size = input_ids.shape[0]
1469
+ else:
1470
+ batch_size = inputs_embeds.shape[0]
1471
+
1472
+ if self.config.pad_token_id is None and batch_size != 1:
1473
+ raise ValueError('Cannot handle batch sizes > 1 if no padding token is defined.')
1474
+ if self.config.pad_token_id is None:
1475
+ sequence_lengths = -1
1476
+ else:
1477
+ if input_ids is not None:
1478
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1479
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1480
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
1481
+ sequence_lengths = sequence_lengths.to(logits.device)
1482
+ else:
1483
+ sequence_lengths = -1
1484
+
1485
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1486
+
1487
+ loss = None
1488
+ if labels is not None:
1489
+ labels = labels.to(logits.device)
1490
+ if self.config.problem_type is None:
1491
+ if self.num_labels == 1:
1492
+ self.config.problem_type = 'regression'
1493
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1494
+ self.config.problem_type = 'single_label_classification'
1495
+ else:
1496
+ self.config.problem_type = 'multi_label_classification'
1497
+
1498
+ if self.config.problem_type == 'regression':
1499
+ loss_fct = MSELoss()
1500
+ if self.num_labels == 1:
1501
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1502
+ else:
1503
+ loss = loss_fct(pooled_logits, labels)
1504
+ elif self.config.problem_type == 'single_label_classification':
1505
+ loss_fct = CrossEntropyLoss()
1506
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1507
+ elif self.config.problem_type == 'multi_label_classification':
1508
+ loss_fct = BCEWithLogitsLoss()
1509
+ loss = loss_fct(pooled_logits, labels)
1510
+ if not return_dict:
1511
+ output = (pooled_logits,) + model_outputs[1:]
1512
+ return ((loss,) + output) if loss is not None else output
1513
+
1514
+ return SequenceClassifierOutputWithPast(
1515
+ loss=loss,
1516
+ logits=pooled_logits,
1517
+ past_key_values=model_outputs.past_key_values,
1518
+ hidden_states=model_outputs.hidden_states,
1519
+ attentions=model_outputs.attentions,
1520
+ )
1521
+
1522
+
1523
+ @add_start_docstrings(
1524
+ """
1525
+ [`Phi3Model`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
1526
+ Named-Entity-Recognition (NER) tasks.
1527
+ """,
1528
+ PHI3_START_DOCSTRING,
1529
+ )
1530
+ # Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi3,MPT->PHI3,self.transformer->self.model,transformer_outputs->model_outputs
1531
+ class Phi3ForTokenClassification(Phi3PreTrainedModel):
1532
+ def __init__(self, config: Phi3Config):
1533
+ super().__init__(config)
1534
+ self.num_labels = config.num_labels
1535
+
1536
+ self.model = Phi3Model(config)
1537
+ if hasattr(config, 'classifier_dropout') and config.classifier_dropout is not None:
1538
+ classifier_dropout = config.classifier_dropout
1539
+ elif hasattr(config, 'hidden_dropout') and config.hidden_dropout is not None:
1540
+ classifier_dropout = config.hidden_dropout
1541
+ else:
1542
+ classifier_dropout = 0.1
1543
+ self.dropout = nn.Dropout(classifier_dropout)
1544
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1545
+
1546
+ # Initialize weights and apply final processing
1547
+ self.post_init()
1548
+
1549
+ @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
1550
+ @add_code_sample_docstrings(
1551
+ checkpoint=_CHECKPOINT_FOR_DOC,
1552
+ output_type=TokenClassifierOutput,
1553
+ config_class=_CONFIG_FOR_DOC,
1554
+ )
1555
+ def forward(
1556
+ self,
1557
+ input_ids: Optional[torch.LongTensor] = None,
1558
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
1559
+ attention_mask: Optional[torch.Tensor] = None,
1560
+ inputs_embeds: Optional[torch.Tensor] = None,
1561
+ labels: Optional[torch.Tensor] = None,
1562
+ use_cache: Optional[bool] = None,
1563
+ output_attentions: Optional[bool] = None,
1564
+ output_hidden_states: Optional[bool] = None,
1565
+ return_dict: Optional[bool] = None,
1566
+ **deprecated_arguments,
1567
+ ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
1568
+ r"""
1569
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1570
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1571
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1572
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1573
+ """
1574
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1575
+
1576
+ model_outputs = self.model(
1577
+ input_ids,
1578
+ past_key_values=past_key_values,
1579
+ attention_mask=attention_mask,
1580
+ inputs_embeds=inputs_embeds,
1581
+ use_cache=use_cache,
1582
+ output_attentions=output_attentions,
1583
+ output_hidden_states=output_hidden_states,
1584
+ return_dict=return_dict,
1585
+ )
1586
+
1587
+ hidden_states = model_outputs[0]
1588
+ hidden_states = self.dropout(hidden_states)
1589
+ logits = self.classifier(hidden_states)
1590
+
1591
+ loss = None
1592
+ if labels is not None:
1593
+ # move labels to correct device to enable model parallelism
1594
+ labels = labels.to(logits.device)
1595
+ batch_size, seq_length = labels.shape
1596
+ loss_fct = CrossEntropyLoss()
1597
+ loss = loss_fct(
1598
+ logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
1599
+ )
1600
+
1601
+ if not return_dict:
1602
+ output = (logits,) + model_outputs[2:]
1603
+ return ((loss,) + output) if loss is not None else output
1604
+
1605
+ return TokenClassifierOutput(
1606
+ loss=loss,
1607
+ logits=logits,
1608
+ hidden_states=model_outputs.hidden_states,
1609
+ attentions=model_outputs.attentions,
1610
+ )
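For reference, a minimal standalone sketch (not part of the vendored file) of the last-non-pad-token pooling that `Phi3ForSequenceClassification` documents above: the first occurrence of `pad_token_id` marks where padding starts, so the preceding position is the last real token. The helper name `pool_last_token` and the toy tensors are illustrative only.

```python
import torch

def pool_last_token(logits: torch.Tensor, input_ids: torch.Tensor, pad_token_id: int) -> torch.Tensor:
    """Select per-row logits of the last non-padding token (hypothetical helper).

    logits:    (batch, seq_len, num_labels) per-token scores
    input_ids: (batch, seq_len) token ids, right-padded with pad_token_id
    """
    # argmax over the pad mask gives the FIRST pad position; the token before it
    # is the last real one. The modulo maps rows with no padding (argmax == 0,
    # hence index -1) back to the final position, mirroring the ONNX-friendly trick above.
    seq_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
    seq_lengths = seq_lengths % input_ids.shape[-1]
    batch_idx = torch.arange(input_ids.shape[0], device=logits.device)
    return logits[batch_idx, seq_lengths.to(logits.device)]

# Toy check: batch of 2, seq_len 4, 3 labels, pad_token_id = 0
scores = torch.randn(2, 4, 3)
ids = torch.tensor([[5, 6, 0, 0], [7, 8, 9, 2]])
pooled = pool_last_token(scores, ids, pad_token_id=0)  # shape (2, 3): pooled at positions 1 and 3
```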
eneas/vendor/SeC/inference/modeling_sec.py ADDED
@@ -0,0 +1,857 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import os
7
+ import warnings
8
+ from typing import Any, List, Optional, Tuple, Union
9
+
10
+ import torchvision.transforms as T
11
+ from torchvision.transforms.functional import InterpolationMode
12
+
13
+ import torch.utils.checkpoint
14
+ import transformers
15
+
16
+ from .modeling_internlm2 import InternLM2ForCausalLM
17
+ # from .modeling_phi3 import Phi3ForCausalLM # Not used by SeC-4B
18
+ from peft import LoraConfig, get_peft_model
19
+ from torch import nn
20
+ from torch.nn import CrossEntropyLoss
21
+ from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM,
22
+ LlamaTokenizer, Qwen2ForCausalLM)
23
+ from transformers.modeling_outputs import CausalLMOutputWithPast
24
+ from transformers.modeling_utils import PreTrainedModel
25
+ from transformers.utils import ModelOutput, logging
26
+ from transformers import StoppingCriteriaList, StoppingCriteria
27
+
28
+ from .configuration_sec import SeCConfig
29
+ from .modeling_intern_vit import InternVisionModel, has_flash_attn
30
+
31
+ from .sam2_video_predictor import build_sam2_video_predictor, SAM2VideoPredictor
32
+ from .templates import PROMPT_TEMPLATE
33
+
34
+ import cv2
35
+ import numpy as np
36
+ from torchvision.transforms.functional import resize, to_pil_image
37
+
38
+ from types import MethodType
39
+ import torch.nn.functional as F
40
+
41
+ from tqdm import tqdm
42
+ from PIL import Image
43
+ import copy
44
+ import random
45
+ random.seed(42)
46
+ try:
47
+ from .flash_attention import FlashAttention
48
+ has_flash_attn = True
49
+ except Exception:
50
+ print('FlashAttention is not installed.')
51
+ has_flash_attn = False
52
+
53
+ logger = logging.get_logger(__name__)
54
+
55
+ def version_cmp(v1, v2, op='eq'):
56
+ import operator
57
+
58
+ from packaging import version
59
+ op_func = getattr(operator, op)
60
+ return op_func(version.parse(v1), version.parse(v2))
61
+
62
+ class StopWordStoppingCriteria(StoppingCriteria):
63
+ """StopWord stopping criteria."""
64
+
65
+ def __init__(self, tokenizer, stop_word):
66
+ self.tokenizer = tokenizer
67
+ self.stop_word = stop_word
68
+ self.length = len(self.stop_word)
69
+
70
+ def __call__(self, input_ids, *args, **kwargs) -> bool:
71
+ cur_text = self.tokenizer.decode(input_ids[0])
72
+ cur_text = cur_text.replace('\r', '').replace('\n', '')
73
+ return cur_text[-self.length:] == self.stop_word
74
+
75
+ def get_stop_criteria(
76
+ tokenizer,
77
+ stop_words=[],
78
+ ):
79
+ stop_criteria = StoppingCriteriaList()
80
+ for word in stop_words:
81
+ stop_criteria.append(StopWordStoppingCriteria(tokenizer, word))
82
+ return stop_criteria
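A quick usage sketch (illustrative, not taken from this repo): the `StoppingCriteriaList` built by `get_stop_criteria` above plugs directly into Hugging Face `generate` through its standard `stopping_criteria` argument. The checkpoint name and stop word below are placeholders.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoint; any causal LM works the same way.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Stop generation as soon as the decoded text ends with the given stop word.
stop_criteria = get_stop_criteria(tokenizer=tokenizer, stop_words=["<|im_end|>"])

inputs = tokenizer("Hello, world.", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=32, stopping_criteria=stop_criteria)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```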
83
+
84
+ class DirectResize:
85
+ def __init__(self, target_length: int) -> None:
86
+ self.target_length = target_length
87
+
88
+ def apply_image(self, image: np.ndarray) -> np.ndarray:
89
+ """
90
+ Expects a numpy array with shape HxWxC in uint8 format.
91
+ """
92
+ img = to_pil_image(image, mode='RGB')
93
+ return np.array(img.resize((self.target_length, self.target_length)))
94
+
95
+
96
+ class SeCModel(PreTrainedModel):
97
+ config_class = SeCConfig
98
+ main_input_name = 'pixel_values'
99
+ base_model_prefix = 'language_model'
100
+ _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'InternLM2DecoderLayer',
101
+ 'Phi3DecoderLayer', 'Qwen2DecoderLayer', 'SAM2']
102
+ _supports_flash_attn_2 = True
103
+ supports_gradient_checkpointing = True
104
+
105
+ def __init__(self, config: SeCConfig, vision_model=None, language_model=None, use_flash_attn=True):
106
+ super().__init__(config)
107
+
108
+ assert version_cmp(transformers.__version__, '4.37.0', 'ge')
109
+ image_size = config.force_image_size or config.vision_config.image_size
110
+ patch_size = config.vision_config.patch_size
111
+ self.patch_size = patch_size
112
+ self.select_layer = config.select_layer
113
+ self.template = config.template
114
+ self.template = self.template.replace('-', '_')
115
+ self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
116
+ self.downsample_ratio = config.downsample_ratio
117
+ self.ps_version = config.ps_version
118
+ self.llm_arch_name = config.llm_config.architectures[0]
119
+
120
+ use_flash_attn = use_flash_attn if has_flash_attn else False
121
+ config.vision_config.use_flash_attn = True if use_flash_attn else False
122
+ config.llm_config._attn_implementation = 'flash_attention_2' if use_flash_attn else 'eager'
123
+
124
+ logger.info(f'num_image_token: {self.num_image_token}')
125
+ logger.info(f'ps_version: {self.ps_version}')
126
+ if vision_model is not None:
127
+ self.vision_model = vision_model
128
+ else:
129
+ self.vision_model = InternVisionModel(config.vision_config)
130
+ if language_model is not None:
131
+ self.language_model = language_model
132
+ else:
133
+ if config.llm_config.architectures[0] == 'LlamaForCausalLM':
134
+ self.language_model = LlamaForCausalLM(config.llm_config)
135
+ elif config.llm_config.architectures[0] == 'InternLM2ForCausalLM':
136
+ self.language_model = InternLM2ForCausalLM(config.llm_config)
137
+ # elif config.llm_config.architectures[0] == 'Phi3ForCausalLM':
138
+ # self.language_model = Phi3ForCausalLM(config.llm_config) # Not used by SeC-4B
139
+ elif config.llm_config.architectures[0] == 'Qwen2ForCausalLM':
140
+ self.language_model = Qwen2ForCausalLM(config.llm_config)
141
+ else:
142
+ raise NotImplementedError(f'{config.llm_config.architectures[0]} is not implemented.')
143
+
144
+ vit_hidden_size = config.vision_config.hidden_size
145
+ llm_hidden_size = config.llm_config.hidden_size
146
+
147
+ self.mlp1 = nn.Sequential(
148
+ nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
149
+ nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
150
+ nn.GELU(),
151
+ nn.Linear(llm_hidden_size, llm_hidden_size)
152
+ )
153
+
154
+ self.img_context_token_id = None
155
+ self.conv_template = PROMPT_TEMPLATE[self.template]
156
+ self.template = self.conv_template
157
+ if hasattr(config, 'system_message'):
158
+ self.system_message = config.system_message
159
+ self.num_samples = 0
160
+
161
+ if config.use_backbone_lora:
162
+ self.wrap_backbone_lora(r=config.use_backbone_lora, lora_alpha=2 * config.use_backbone_lora)
163
+
164
+ if config.use_llm_lora:
165
+ self.wrap_llm_lora(r=config.use_llm_lora, lora_alpha=2 * config.use_llm_lora)
166
+
167
+ apply_postprocessing = getattr(config, 'apply_postprocessing', True)
168
+ hydra_overrides_extra = getattr(config, 'hydra_overrides_extra', [])
169
+ grounding_maskmem_num = getattr(config, 'grounding_maskmem_num', 22)
170
+ self.grounding_encoder = build_sam2_video_predictor(
171
+ config.grounding_encoder_config,
172
+ num_maskmem=grounding_maskmem_num,
173
+ apply_postprocessing=apply_postprocessing,
174
+ hydra_overrides_extra=hydra_overrides_extra
175
+ )
176
+ self.grounding_encoder.token_attn = copy.deepcopy(self.grounding_encoder.memory_attention)
177
+
178
+ in_dim = llm_hidden_size
179
+ out_dim = self.grounding_encoder.hidden_dim
180
+ self.text_hidden_fcs = nn.Sequential(
181
+ nn.Linear(in_dim, in_dim), nn.ReLU(inplace=True),
182
+ nn.Linear(in_dim, out_dim), nn.Dropout(0.0)
183
+ )
184
+
185
+ self.init_prediction_config = False
186
+
187
+ def wrap_backbone_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
188
+ lora_config = LoraConfig(
189
+ r=r,
190
+ target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2'],
191
+ lora_alpha=lora_alpha,
192
+ lora_dropout=lora_dropout,
193
+ )
194
+ self.vision_model = get_peft_model(self.vision_model, lora_config)
195
+ self.vision_model.print_trainable_parameters()
196
+
197
+ def wrap_llm_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
198
+ # Determine the target modules based on the architecture of the language model
199
+ if self.llm_arch_name == 'InternLM2ForCausalLM':
200
+ target_modules = ['attention.wqkv', 'attention.wo', 'feed_forward.w1', 'feed_forward.w2', 'feed_forward.w3']
201
+ elif self.llm_arch_name == 'Phi3ForCausalLM':
202
+ target_modules = ['mlp.down_proj', 'mlp.gate_up_proj', 'self_attn.o_proj', 'self_attn.qkv_proj']
203
+ elif self.llm_arch_name in ['Qwen2ForCausalLM', 'LlamaForCausalLM']:
204
+ target_modules = ['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj', 'self_attn.o_proj',
205
+ 'mlp.gate_proj', 'mlp.down_proj', 'mlp.up_proj']
206
+ else:
207
+ raise NotImplementedError(f'{self.llm_arch_name} is not supported.')
208
+ lora_config = LoraConfig(
209
+ r=r,
210
+ target_modules=target_modules,
211
+ lora_alpha=lora_alpha,
212
+ lora_dropout=lora_dropout,
213
+ task_type='CAUSAL_LM'
214
+ )
215
+ self.language_model = get_peft_model(self.language_model, lora_config)
216
+ self.language_model.enable_input_require_grads()
217
+ self.language_model.print_trainable_parameters()
218
+
219
+ def pixel_shuffle(self, x, scale_factor=0.5):
220
+ n, w, h, c = x.size()
221
+ # N, W, H, C --> N, W, H * scale, C // scale
222
+ x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
223
+ # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
224
+ x = x.permute(0, 2, 1, 3).contiguous()
225
+ # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
226
+ x = x.view(n, int(h * scale_factor), int(w * scale_factor),
227
+ int(c / (scale_factor * scale_factor)))
228
+ if self.ps_version == 'v1':
229
+ warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
230
+ 'which results in a transposed image.')
231
+ else:
232
+ x = x.permute(0, 2, 1, 3).contiguous()
233
+ return x
234
+
235
+ def extract_feature(self, pixel_values):
236
+ if self.select_layer == -1:
237
+ vit_embeds = self.vision_model(
238
+ pixel_values=pixel_values,
239
+ output_hidden_states=False,
240
+ return_dict=True).last_hidden_state
241
+ else:
242
+ vit_embeds = self.vision_model(
243
+ pixel_values=pixel_values,
244
+ output_hidden_states=True,
245
+ return_dict=True).hidden_states[self.select_layer]
246
+ vit_embeds = vit_embeds[:, 1:, :]
247
+
248
+ h = w = int(vit_embeds.shape[1] ** 0.5)
249
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
250
+ vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
251
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
252
+ vit_embeds = self.mlp1(vit_embeds)
253
+ return vit_embeds
254
+
255
+ @property
256
+ def lm_head(self):
257
+ return self.language_model.get_output_embeddings()
258
+
259
+ def get_input_embeddings(self):
260
+ return self.language_model.get_input_embeddings()
261
+
262
+ def get_output_embeddings(self):
263
+ return self.language_model.get_output_embeddings()
264
+
265
+ def forward(self, data, data_samples=None, mode='loss'):
266
+ pixel_values = data['pixel_values']
267
+
268
+ if type(pixel_values) is list or pixel_values.ndim == 5:
269
+ if type(pixel_values) is list:
270
+ pixel_values = [
271
+ x.unsqueeze(0) if x.ndim == 3 else x for x in pixel_values
272
+ ]
273
+ # b*n, c, h, w
274
+ concat_images = torch.cat(
275
+ [image.to(self.vision_model.dtype) for image in pixel_values], dim=0)
276
+ else:
277
+ raise NotImplementedError()
278
+
279
+ input_ids = data['input_ids']
280
+ position_ids = data['position_ids']
281
+ attention_mask = data['attention_mask']
282
+ # frames whose pixel values sum to 0 carry no image content (text-only entries)
283
+ image_flags = torch.sum(concat_images, dim=(1, 2, 3)) != 0
284
+ image_flags = image_flags.long()
285
+
286
+ labels = data['labels']
287
+ use_cache = False
288
+
289
+ outputs = self._llm_forward(
290
+ input_ids=input_ids,
291
+ position_ids=position_ids,
292
+ attention_mask=attention_mask,
293
+ image_flags=image_flags,
294
+ pixel_values=concat_images,
295
+ labels=labels,
296
+ use_cache=use_cache,
297
+ output_hidden_states=True,
298
+ )
299
+
300
+ return outputs
301
+
302
+ def _llm_forward(
303
+ self,
304
+ pixel_values: torch.FloatTensor,
305
+ input_ids: torch.LongTensor = None,
306
+ attention_mask: Optional[torch.Tensor] = None,
307
+ position_ids: Optional[torch.LongTensor] = None,
308
+ image_flags: Optional[torch.LongTensor] = None,
309
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
310
+ labels: Optional[torch.LongTensor] = None,
311
+ use_cache: Optional[bool] = None,
312
+ output_attentions: Optional[bool] = None,
313
+ output_hidden_states: Optional[bool] = None,
314
+ return_dict: Optional[bool] = None,
315
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
316
+ return_dict = return_dict if return_dict is not None \
317
+ else self.config.use_return_dict
318
+
319
+ image_flags = image_flags.squeeze(-1)
320
+ # We only added the clone code here to avoid the error.
321
+ input_embeds = self.language_model.get_input_embeddings()(
322
+ input_ids).clone()
323
+
324
+ vit_embeds = self.extract_feature(pixel_values)
325
+ vit_embeds = vit_embeds.to(input_embeds.dtype)  # FIXME: why is vit_embeds float16?
326
+ fast_vit_embeds = None
327
+
328
+ vit_embeds = vit_embeds[image_flags == 1]
329
+ vit_batch_size = pixel_values.shape[0]
330
+
331
+ B, N, C = input_embeds.shape
332
+ input_embeds = input_embeds.reshape(B * N, C)
333
+
334
+ input_ids = input_ids.reshape(B * N)
335
+ selected = (input_ids == self.img_context_token_id)
336
+
337
+ try:
338
+ input_embeds[selected] = vit_embeds.reshape(-1, C)
339
+ except Exception as e:
340
+ vit_embeds = vit_embeds.reshape(-1, C)
341
+ print(f'warning: {e}, input_embeds[selected].shape='
342
+ f'{input_embeds[selected].shape}, '
343
+ f'vit_embeds.shape={vit_embeds.shape}')
344
+ n_token = selected.sum()
345
+ if n_token > len(vit_embeds):
346
+ print(f"Wrong !!! {n_token} image tokens in text but only {len(vit_embeds)} vit embeds !!!")
347
+ expand_ratio = n_token // len(vit_embeds) + 1
348
+ vit_embeds = torch.cat([vit_embeds] * expand_ratio, dim=0)
349
+
350
+ input_embeds[selected] = vit_embeds[:n_token]
351
+
352
+ input_embeds = input_embeds.reshape(B, N, C)
353
+
354
+ outputs = self.language_model(
355
+ inputs_embeds=input_embeds,
356
+ attention_mask=attention_mask,
357
+ position_ids=position_ids,
358
+ past_key_values=past_key_values,
359
+ use_cache=use_cache,
360
+ output_attentions=output_attentions,
361
+ output_hidden_states=output_hidden_states,
362
+ return_dict=return_dict,
363
+ )
364
+ logits = outputs.logits
365
+
366
+ loss = None
367
+ if labels is not None:
368
+ # Shift so that tokens < n predict n
369
+ shift_logits = logits[..., :-1, :].contiguous()
370
+ shift_labels = labels[..., 1:].contiguous()
371
+ # Flatten the tokens
372
+ loss_fct = CrossEntropyLoss()
373
+ shift_logits = shift_logits.view(
374
+ -1, self.language_model.config.vocab_size)
375
+ shift_labels = shift_labels.view(-1)
376
+ # Enable model parallelism
377
+ shift_labels = shift_labels.to(shift_logits.device)
378
+ loss = loss_fct(shift_logits, shift_labels)
379
+
380
+ if not return_dict:
381
+ output = (logits,) + outputs[1:]
382
+ return (loss,) + output if loss is not None else output
383
+
384
+ return CausalLMOutputWithPast(
385
+ loss=loss,
386
+ logits=logits,
387
+ past_key_values=outputs.past_key_values,
388
+ hidden_states=outputs.hidden_states,
389
+ attentions=outputs.attentions,
390
+ )
391
+
392
+ @torch.no_grad()
393
+ def generate(
394
+ self,
395
+ pixel_values: Optional[torch.FloatTensor] = None,
396
+ input_ids: Optional[torch.FloatTensor] = None,
397
+ attention_mask: Optional[torch.LongTensor] = None,
398
+ visual_features: Optional[torch.FloatTensor] = None,
399
+ generation_config: Optional[GenerationConfig] = None,
400
+ output_hidden_states: Optional[bool] = None,
401
+ return_dict: Optional[bool] = None,
402
+ **generate_kwargs,
403
+ ) -> torch.LongTensor:
404
+ device = self.device
405
+ assert self.img_context_token_id is not None
406
+
407
+ if pixel_values is not None:
408
+ if visual_features is not None:
409
+ vit_embeds = visual_features
410
+ else:
411
+ if type(pixel_values) is list or pixel_values.ndim == 5:
412
+ if type(pixel_values) is list:
413
+ pixel_values = [
414
+ x.unsqueeze(0) if x.ndim == 3 else x for x in pixel_values
415
+ ]
416
+ # b*n, c, h, w
417
+ pixel_values = torch.cat(
418
+ [image.to(self.vision_model.dtype) for image in pixel_values], dim=0)
419
+
420
+ vit_embeds = self.extract_feature(pixel_values.to(device))
421
+ image_flags = torch.sum(pixel_values, dim=(1, 2, 3)) != 0
422
+ image_flags = image_flags.long()
423
+ vit_embeds = vit_embeds[image_flags == 1]
424
+
425
+ input_embeds = self.language_model.get_input_embeddings()(input_ids.to(device))
426
+ B, N, C = input_embeds.shape
427
+ input_embeds = input_embeds.reshape(B * N, C)
428
+
429
+ input_ids = input_ids.reshape(B * N)
430
+ selected = (input_ids == self.img_context_token_id)
431
+ assert selected.sum() != 0
432
+
433
+ input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
434
+ input_embeds = input_embeds.reshape(B, N, C)
435
+ else:
436
+ input_embeds = self.language_model.get_input_embeddings()(input_ids)
437
+
438
+ outputs = self.language_model.generate(
439
+ inputs_embeds=input_embeds,
440
+ attention_mask=attention_mask.to(device),
441
+ generation_config=generation_config,
442
+ output_hidden_states=output_hidden_states,
443
+ # return_dict=return_dict,
444
+ use_cache=True,
445
+ **generate_kwargs,
446
+ )
447
+
448
+ return outputs
449
+
450
+ def preparing_for_generation(self, tokenizer, max_new_tokens=2048, torch_dtype=torch.bfloat16):
451
+ # set stop criteria and generation configs for model
452
+ if not hasattr(self, 'tokenizer'):
453
+ self.tokenizer = tokenizer
454
+ self.bot_name = 'BOT'
455
+ stop_words = []
456
+ stop_words += self.template.get('STOP_WORDS', [])
457
+ stop_criteria = get_stop_criteria(
458
+ tokenizer=self.tokenizer, stop_words=stop_words)
459
+ self.stop_criteria = stop_criteria
460
+
461
+ default_generation_kwargs = dict(
462
+ max_new_tokens=max_new_tokens,
463
+ do_sample=False,
464
+ eos_token_id=self.tokenizer.eos_token_id,
465
+ pad_token_id=(
466
+ self.tokenizer.pad_token_id
467
+ if self.tokenizer.pad_token_id is not None
468
+ else self.tokenizer.eos_token_id
469
+ ),
470
+ )
471
+
472
+ self.gen_config = GenerationConfig(**default_generation_kwargs)
473
+ self.init_prediction_config = True
474
+ self.torch_dtype = torch_dtype
475
+ self.to(torch_dtype)
476
+ self.extra_image_processor = DirectResize(target_length=1024)
477
+ # for multi-image processing
478
+ self.min_dynamic_patch = 1
479
+ self.max_dynamic_patch = 12
480
+ self.downsample_ratio = 0.5
481
+ self.image_size = 448
482
+ self.use_thumbnail = True
483
+ patch_size = 14
484
+ self.patch_size = patch_size
485
+
486
+ self.patch_token = int((self.image_size // patch_size) ** 2 * (self.downsample_ratio ** 2))
487
+ self.IMAGENET_MEAN = (0.485, 0.456, 0.406)
488
+ self.IMAGENET_STD = (0.229, 0.224, 0.225)
489
+ self.IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
490
+ self.IMG_START_TOKEN = '<img>'
491
+ self.IMG_END_TOKEN = '</img>'
492
+
493
+ self.transformer = T.Compose([
494
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
495
+ T.Resize((self.image_size, self.image_size), interpolation=InterpolationMode.BICUBIC),
496
+ T.ToTensor(),
497
+ T.Normalize(mean=self.IMAGENET_MEAN, std=self.IMAGENET_STD)
498
+ ])
499
+
500
+ # swap in the Phi3-specific prepare_inputs_for_generation defined at the bottom of this file
501
+ if self.config.llm_config.architectures[0] == 'Phi3ForCausalLM':
502
+ self.language_model.prepare_inputs_for_generation = MethodType(prepare_inputs_for_generation_phi3, self.language_model)
503
+
504
+ img_context_token_id = tokenizer.convert_tokens_to_ids('<IMG_CONTEXT>')
505
+ self.img_context_token_id = img_context_token_id
506
+ self.seg_token_idx = tokenizer.convert_tokens_to_ids('[SEG]')
507
+ return
508
+
509
+ @torch.inference_mode()
510
+ def propagate_in_video(
511
+ self,
512
+ inference_state,
513
+ start_frame_idx=None,
514
+ max_frame_num_to_track=None,
515
+ reverse=False,
516
+ init_mask=None,
517
+ tokenizer=None,
518
+ mllm_memory_size=7,
519
+ ):
520
+ if not self.init_prediction_config:
521
+ assert tokenizer
522
+ self.preparing_for_generation(tokenizer=tokenizer)
523
+
524
+ """Propagate the input points across frames to track in the entire video."""
525
+ self.grounding_encoder.propagate_in_video_preflight(inference_state)
526
+
527
+ output_dict = inference_state["output_dict"]
528
+ consolidated_frame_inds = inference_state["consolidated_frame_inds"]
529
+ obj_ids = inference_state["obj_ids"]
530
+ num_frames = inference_state["num_frames"]
531
+ video_paths = inference_state["video_paths"]
532
+
533
+ batch_size = self.grounding_encoder._get_obj_num(inference_state)
534
+
535
+ if len(output_dict["cond_frame_outputs"]) == 0:
536
+ raise RuntimeError("No points are provided; please add points first")
537
+ clear_non_cond_mem = self.grounding_encoder.clear_non_cond_mem_around_input and (
538
+ self.grounding_encoder.clear_non_cond_mem_for_multi_obj or batch_size <= 1
539
+ )
540
+
541
+ # set start index, end index, and processing order
542
+ if start_frame_idx is None:
543
+ # default: start from the earliest frame with input points
544
+ start_frame_idx = min(output_dict["cond_frame_outputs"])
545
+ if max_frame_num_to_track is None:
546
+ # default: track all the frames in the video
547
+ max_frame_num_to_track = num_frames
548
+ if reverse:
549
+ end_frame_idx = max(start_frame_idx - max_frame_num_to_track, 0)
550
+ if start_frame_idx > 0:
551
+ processing_order = range(start_frame_idx, end_frame_idx - 1, -1)
552
+ else:
553
+ processing_order = [] # skip reverse tracking if starting from frame 0
554
+ else:
555
+ end_frame_idx = min(
556
+ start_frame_idx + max_frame_num_to_track, num_frames - 1
557
+ )
558
+ processing_order = range(start_frame_idx, end_frame_idx + 1)
559
+
560
+
561
+ mllm_memory = [(start_frame_idx, Image.open(video_paths[start_frame_idx]).convert('RGB'), init_mask)]
562
+
563
+ for frame_idx in tqdm(processing_order, desc="propagate in video"):
564
+ # We skip those frames already in consolidated outputs (these are frames
565
+ # that received input clicks or mask). Note that we cannot directly run
566
+ # batched forward on them via `_run_single_frame_inference` because the
567
+ # number of clicks on each object might be different.
568
+ _update_flag = False
569
+ if frame_idx in consolidated_frame_inds["cond_frame_outputs"]:
570
+ storage_key = "cond_frame_outputs"
571
+ current_out = output_dict[storage_key][frame_idx]
572
+ pred_masks = current_out["pred_masks"]
573
+ if clear_non_cond_mem:
574
+ # clear non-conditioning memory of the surrounding frames
575
+ self.grounding_encoder._clear_non_cond_mem_around_input(inference_state, frame_idx)
576
+ elif frame_idx in consolidated_frame_inds["non_cond_frame_outputs"]:
577
+ storage_key = "non_cond_frame_outputs"
578
+ current_out = output_dict[storage_key][frame_idx]
579
+ pred_masks = current_out["pred_masks"]
580
+ else:
581
+ storage_key = "non_cond_frame_outputs"
582
+ # language_embd = None
583
+ inference_params = {
584
+ "inference_state": inference_state,
585
+ "output_dict": output_dict,
586
+ "frame_idx": frame_idx,
587
+ "batch_size": batch_size,
588
+ "is_init_cond_frame": False,
589
+ "point_inputs": None,
590
+ "mask_inputs": None,
591
+ "reverse": reverse,
592
+ "run_mem_encoder": True,
593
+ "start_frame_idx": start_frame_idx,
594
+ }
595
+
596
+ current_img = Image.open(video_paths[frame_idx]).convert('RGB')
597
+ last_img = Image.open(video_paths[frame_idx-1]).convert('RGB')
598
+ flags = [is_scene_change_hsv(current_img, last_img)]
599
+ if len(mllm_memory) > mllm_memory_size:
600
+ _mllm_memory = [mllm_memory[0]] + mllm_memory[-(mllm_memory_size-1):]
601
+ else:
602
+ _mllm_memory = mllm_memory
603
+
604
+ if False in flags:
605
+ _update_flag = False
606
+ language_embd = None
607
+ else:
608
+ _update_flag = True
609
+ video = [label_img_with_mask(img, mask) for _, img, mask in _mllm_memory]
610
+ video.append(current_img)
611
+ text = "<image>Please segment the object in the last frame based on the object labeled in the first several images."
612
+ specific_language_embd = self.predict_forward(video=video, text=text)
613
+ language_embd = specific_language_embd.unsqueeze(0)
614
+
615
+
616
+ current_out, pred_masks = self.grounding_encoder._run_single_frame_inference(
617
+ **inference_params, language_embd=language_embd
618
+ )
619
+ # optionally offload the output to CPU memory to save GPU space
620
+ for key, value in current_out.items():
621
+ if isinstance(value, torch.Tensor):
622
+ current_out[key] = value.to('cpu', non_blocking=True)
623
+ pred_masks = pred_masks.to('cpu', non_blocking=True)
624
+
625
+ output_dict[storage_key][frame_idx] = current_out
626
+
627
+ # Create slices of per-object outputs for subsequent interaction with each
628
+ # individual object after tracking.
629
+ self.grounding_encoder._add_output_per_object(
630
+ inference_state, frame_idx, current_out, storage_key
631
+ )
632
+ inference_state["frames_already_tracked"][frame_idx] = {"reverse": reverse}
633
+
634
+ # Resize the output mask to the original video resolution (we directly use
635
+ # the mask scores on GPU for output to avoid any CPU conversion in between)
636
+ _, video_res_masks = self.grounding_encoder._get_orig_video_res_output(
637
+ inference_state, pred_masks
638
+ )
639
+ if _update_flag and (video_res_masks[0] > 0.0).sum() != 0 and current_out["object_score_logits"].item() > 1:
640
+ mllm_memory.append((
641
+ frame_idx, Image.open(video_paths[frame_idx]).convert('RGB'),
642
+ (video_res_masks[0] > 0.0).cpu().numpy()
643
+ ))
644
+ yield frame_idx, obj_ids, video_res_masks
645
+
646
+ def predict_forward(
647
+ self,
648
+ image=None,
649
+ video=None,
650
+ text=None,
651
+ num_seg_token=1
652
+ ):
653
+ assert image is not None or video is not None
654
+
655
+ input_dict = {}
656
+ if video is not None:
657
+ pixel_values = []
658
+ ori_image_size = video[0].size
659
+ for frame_idx, frame_image in enumerate(video):
660
+ assert ori_image_size == frame_image.size
661
+ img = self.transformer(frame_image)
662
+ pixel_values.append(img)
663
+
664
+ pixel_values = torch.stack(pixel_values, dim=0).to(self.torch_dtype) # (n_f, 3, h, w)
665
+ num_image_tokens = self.patch_token
666
+ num_frames = len(pixel_values)
667
+ else:
668
+ ori_image_size = image.size
669
+ images = dynamic_preprocess(
670
+ image, self.min_dynamic_patch, self.max_dynamic_patch,
671
+ self.image_size, self.use_thumbnail
672
+ )
673
+
674
+ pixel_values = [self.transformer(image) for image in images]
675
+ pixel_values = torch.stack(pixel_values).to(self.torch_dtype)
676
+ num_image_tokens = pixel_values.shape[0] * self.patch_token
677
+ num_frames = 1
678
+
679
+ input_dict['pixel_values'] = pixel_values
680
+ image_token_str = f'{self.IMG_START_TOKEN}' \
681
+ f'{self.IMG_CONTEXT_TOKEN * num_image_tokens}' \
682
+ f'{self.IMG_END_TOKEN}'
683
+ image_token_str = image_token_str + '\n'
684
+ image_token_str = image_token_str * num_frames
685
+ image_token_str = image_token_str.strip()
686
+
687
+ text += "It is [SEG].".replace('[SEG]', '[SEG]' * num_seg_token)
688
+ text = text.replace('<image>', image_token_str)
689
+ input_text = ''
690
+ input_text += self.template['INSTRUCTION'].format(
691
+ input=text, round=1, bot_name=self.bot_name)
692
+
693
+ ids = self.tokenizer.encode(input_text)
694
+ ids = torch.tensor(ids).cuda().unsqueeze(0)
695
+
696
+ attention_mask = torch.ones_like(ids, dtype=torch.bool)
697
+
698
+ data = {
699
+ 'input_ids': ids,
700
+ 'attention_mask': attention_mask,
701
+ 'pixel_values': pixel_values.unsqueeze(0).to(self.device),
702
+ 'position_ids': None,
703
+ 'labels': None,
704
+ }
705
+
706
+ output = self.forward(data)
707
+ seg_token_mask = ids == self.seg_token_idx
708
+ hidden_states = output.hidden_states
709
+ hidden_states = hidden_states[-1][seg_token_mask]
710
+ hidden_states = self.text_hidden_fcs(hidden_states)
711
+ _zero = hidden_states.mean() * 0.0
712
+ pred_embeddings = hidden_states + _zero # [n, 256]
713
+
714
+ return pred_embeddings
715
+
716
+ def label_img_with_mask(img, mask):
717
+ frame = np.array(img)
718
+ mask = np.uint8(mask).squeeze()
719
+ contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
720
+ for contour in contours:
721
+ cv2.drawContours(frame, [contour], -1, (0, 255, 0), 2)
722
+ frame = Image.fromarray(frame)
723
+ return frame
724
+
725
+ def is_scene_change_hsv(img1, img2, threshold=0.35):
726
+ img1 = cv2.resize(np.array(img1), (1024, 1024))
727
+ img2 = cv2.resize(np.array(img2), (1024, 1024))
728
+
729
+ hsv1 = cv2.cvtColor(img1, cv2.COLOR_BGR2HSV)
730
+ hsv2 = cv2.cvtColor(img2, cv2.COLOR_BGR2HSV)
731
+
732
+ hist1 = cv2.calcHist([hsv1], [0, 1], None, [60, 80], [0, 180, 0, 256])
733
+ hist2 = cv2.calcHist([hsv2], [0, 1], None, [60, 80], [0, 180, 0, 256])
734
+ cv2.normalize(hist1, hist1)
735
+ cv2.normalize(hist2, hist2)
736
+
737
+ distance = cv2.compareHist(hist1, hist2, cv2.HISTCMP_BHATTACHARYYA)
738
+
739
+ return distance > threshold
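A small usage sketch of the heuristic above (illustrative): comparing a frame against itself should not register as a scene change, while two solid frames of completely different colour should, since their hue/saturation histograms barely overlap and the Bhattacharyya distance approaches 1, well above the 0.35 threshold.

```python
import numpy as np
from PIL import Image

# Two synthetic frames with completely different colour statistics.
red = Image.fromarray(np.full((64, 64, 3), (255, 0, 0), dtype=np.uint8))
blue = Image.fromarray(np.full((64, 64, 3), (0, 0, 255), dtype=np.uint8))

print(is_scene_change_hsv(red, red))   # False: identical histograms, distance 0
print(is_scene_change_hsv(red, blue))  # True: near-disjoint hue bins, distance ~1
```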
740
+
741
+
742
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height,
743
+ image_size):
744
+ best_ratio_diff = float('inf')
745
+ best_ratio = (1, 1)
746
+ area = width * height
747
+ for ratio in target_ratios:
748
+ target_aspect_ratio = ratio[0] / ratio[1]
749
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
750
+ if ratio_diff < best_ratio_diff:
751
+ best_ratio_diff = ratio_diff
752
+ best_ratio = ratio
753
+ elif ratio_diff == best_ratio_diff:
754
+ if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
755
+ best_ratio = ratio
756
+ return best_ratio
757
+
758
+ def dynamic_preprocess(image,
759
+ min_num=1,
760
+ max_num=6,
761
+ image_size=448,
762
+ use_thumbnail=False):
763
+ orig_width, orig_height = image.size
764
+ aspect_ratio = orig_width / orig_height
765
+
766
+ # calculate the existing image aspect ratio
767
+ target_ratios = {(i, j)
768
+ for n in range(min_num, max_num + 1)
769
+ for i in range(1, n + 1) for j in range(1, n + 1)
770
+ if i * j <= max_num and i * j >= min_num}
771
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
772
+
773
+ # find the closest aspect ratio to the target
774
+ target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio,
775
+ target_ratios, orig_width,
776
+ orig_height, image_size)
777
+
778
+ # calculate the target width and height
779
+ target_width = image_size * target_aspect_ratio[0]
780
+ target_height = image_size * target_aspect_ratio[1]
781
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
782
+
783
+ # resize the image
784
+ resized_img = image.resize((target_width, target_height))
785
+ processed_images = []
786
+ for i in range(blocks):
787
+ box = ((i % (target_width // image_size)) * image_size,
788
+ (i // (target_width // image_size)) * image_size,
789
+ ((i % (target_width // image_size)) + 1) * image_size,
790
+ ((i // (target_width // image_size)) + 1) * image_size)
791
+ # split the image
792
+ split_img = resized_img.crop(box)
793
+ processed_images.append(split_img)
794
+ assert len(processed_images) == blocks
795
+ if use_thumbnail and len(processed_images) != 1:
796
+ thumbnail_img = image.resize((image_size, image_size))
797
+ processed_images.append(thumbnail_img)
798
+ return processed_images
799
+
800
+
801
+ from transformers.cache_utils import Cache, DynamicCache
802
+
803
+ def prepare_inputs_for_generation_phi3(
804
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
805
+ ):
806
+ if past_key_values is not None:
807
+ if isinstance(past_key_values, Cache):
808
+ cache_length = past_key_values.get_seq_length()
809
+ past_length = past_key_values.seen_tokens
810
+ max_cache_length = past_key_values.get_max_length()
811
+ else:
812
+ cache_length = past_length = past_key_values[0][0].shape[2]
813
+ max_cache_length = None
814
+
815
+ # Keep only the unprocessed tokens:
816
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
817
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
818
+ # input)
819
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
820
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
821
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
822
+ # input_ids based on the past_length.
823
+ elif past_length < input_ids.shape[1]:
824
+ input_ids = input_ids[:, past_length:]
825
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
826
+
827
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
828
+ if (
829
+ max_cache_length is not None
830
+ and attention_mask is not None
831
+ and cache_length + input_ids.shape[1] > max_cache_length
832
+ ):
833
+ attention_mask = attention_mask[:, -max_cache_length:]
834
+
835
+ position_ids = kwargs.get('position_ids', None)
836
+ if attention_mask is not None and position_ids is None:
837
+ # create position_ids on the fly for batch generation
838
+ position_ids = attention_mask.long().cumsum(-1) - 1
839
+ position_ids.masked_fill_(attention_mask == 0, 1)
840
+ if past_key_values:
841
+ position_ids = position_ids[:, -input_ids.shape[1]:]
842
+
843
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
844
+ if inputs_embeds is not None and (past_key_values is None or len(past_key_values)==0):
845
+ model_inputs = {'inputs_embeds': inputs_embeds}
846
+ else:
847
+ model_inputs = {'input_ids': input_ids}
848
+
849
+ model_inputs.update(
850
+ {
851
+ 'position_ids': position_ids,
852
+ 'past_key_values': past_key_values,
853
+ 'use_cache': kwargs.get('use_cache'),
854
+ 'attention_mask': attention_mask,
855
+ }
856
+ )
857
+ return model_inputs
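To make the cache-trimming rule in `prepare_inputs_for_generation_phi3` concrete, here is a tiny standalone sketch (assumed shapes, illustrative helper name): once `past_length` tokens are already held in the KV cache, only the tail of `input_ids` still needs to be passed to the model on the next step.

```python
import torch

def trim_to_unprocessed(input_ids: torch.Tensor, past_length: int) -> torch.Tensor:
    """Keep only the tokens the model has not consumed yet (illustrative helper)."""
    if past_length < input_ids.shape[1]:
        return input_ids[:, past_length:]
    return input_ids  # the cache already covers everything we hold

ids = torch.tensor([[11, 12, 13, 14, 15]])
print(trim_to_unprocessed(ids, past_length=4))  # tensor([[15]]): only the newest token is fed
```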
eneas/vendor/SeC/inference/sam2/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from hydra import initialize_config_module
8
+ from hydra.core.global_hydra import GlobalHydra
9
+
10
+ if GlobalHydra.instance().is_initialized():
11
+ GlobalHydra.instance().clear()
12
+
13
+ # Patched by eneas: use vendored SeC path
14
+ initialize_config_module("eneas.vendor.SeC.inference.sam2.configs", version_base="1.2")
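Once this package is imported, Hydra resolves SAM 2 configuration files from the vendored config module registered above. A hedged sketch of how a config could then be composed (the config name below is a placeholder, not taken from this repo):

```python
from hydra import compose

import eneas.vendor.SeC.inference.sam2  # runs the initialize_config_module call above

# Hypothetical config name; substitute whichever YAML ships in the vendored configs package.
cfg = compose(config_name="sam2_hiera_l")
print(list(cfg.keys()))
```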