Any-to-Any
Transformers
Safetensors
PyTorch
NemotronH_Nano_Omni_Reasoning_V3
feature-extraction
nvidia
multimodal
custom_code
Instructions to use Jashan887/76_Nvidia_Reasoning_30B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Jashan887/76_Nvidia_Reasoning_30B with Transformers:
# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("Jashan887/76_Nvidia_Reasoning_30B", trust_remote_code=True, dtype="auto") - Notebooks
- Google Colab
- Kaggle
| # coding=utf-8 | |
| # Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. | |
| # | |
| # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX | |
| # and OPT implementations in this library. It has been modified from its | |
| # original forms to accommodate minor architectural differences compared | |
| # to GPT-NeoX and OPT used by the Meta AI team that trained the model. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """video processor class for Qwen2-VL.""" | |
| import math | |
| from typing import Optional, Union | |
| from transformers.image_processing_utils import ( | |
| BatchFeature, | |
| ) | |
| from transformers.image_utils import ( | |
| OPENAI_CLIP_MEAN, | |
| OPENAI_CLIP_STD, | |
| ChannelDimension, | |
| SizeDict, | |
| get_image_size, | |
| ) | |
| from transformers.processing_utils import Unpack, VideosKwargs | |
| from transformers.utils import ( | |
| TensorType, | |
| add_start_docstrings, | |
| is_torch_available, | |
| is_torchvision_available, | |
| is_torchvision_v2_available, | |
| is_vision_available, | |
| ) | |
| from transformers.utils.import_utils import requires | |
| from transformers.video_processing_utils import ( | |
| BASE_VIDEO_PROCESSOR_DOCSTRING, | |
| BaseVideoProcessor, | |
| ) | |
| from transformers.video_utils import VideoMetadata, group_videos_by_shape, reorder_videos | |
| import torchvision.transforms as T | |
| from .processing_utils import get_internvl_target_ratios, calculate_targets | |
| if is_torchvision_available(): | |
| if is_torchvision_v2_available(): | |
| from torchvision.transforms.v2 import functional as F | |
| else: | |
| from torchvision.transforms import functional as F | |
| if is_torch_available(): | |
| import torch | |
| class NemotronH_Nano_Omni_Reasoning_V3VideoProcessor(BaseVideoProcessor): | |
| model_input_names = ["pixel_values_videos", "video_grid_thw"] | |
| def __init__(self, image_size=512, max_num_tiles=12, norm_mean=None, norm_std=None, **kwargs): | |
| super().__init__(**kwargs) | |
| self.image_size = image_size | |
| self.max_num_tiles = max_num_tiles | |
| self.norm_mean = norm_mean | |
| self.norm_std = norm_std | |
| def _preprocess( | |
| self, | |
| videos: list["torch.Tensor"], | |
| video_metadata: Union[list[VideoMetadata], list[dict]], | |
| do_sample_frames: bool, | |
| fps: Optional[int] = None, | |
| num_frames: Optional[int] = None, | |
| return_tensors: Optional[Union[str, TensorType]] = None, | |
| device: Optional["torch.Tensor"] = None, | |
| **kwargs, | |
| ): | |
| if do_sample_frames: | |
| # Sample video frames | |
| videos = [ | |
| self.sample_frames( | |
| video, | |
| metadata=metadata, | |
| num_frames=num_frames, | |
| fps=fps, | |
| ) | |
| for video, metadata in zip(videos, video_metadata) | |
| ] | |
| # We need to sample frames first before moving to device, if `do_sample_frames=True`. Otherwise | |
| # moving the whole video incurs high GPU mem usage for long videos | |
| if device is not None: | |
| videos = [video.to(device) for video in videos] | |
| # Group videos by size for batched resizing | |
| grouped_videos, grouped_videos_index = group_videos_by_shape(videos) | |
| resized_videos_grouped = {} | |
| processed_grids = {} | |
| for shape, stacked_videos in grouped_videos.items(): | |
| height, width = get_image_size(stacked_videos[0], channel_dim=ChannelDimension.FIRST) | |
| batch_size, grid_t, channel = stacked_videos.shape[:3] | |
| target_ratios = get_internvl_target_ratios(1, self.max_num_tiles) | |
| blocks, resize_width, resize_height = calculate_targets( | |
| width, | |
| height, | |
| target_ratios, | |
| self.image_size | |
| ) | |
| stacked_videos = self.resize( | |
| image=stacked_videos, | |
| size=SizeDict(height=resize_height, width=resize_width), | |
| interpolation=T.InterpolationMode.BICUBIC, | |
| ) | |
| # stacked_videos = T.Resize((resize_width, resize_height), interpolation=T.InterpolationMode.BICUBIC)(stacked_videos) | |
| norm_mean = torch.as_tensor(self.norm_mean, dtype=stacked_videos.dtype, device=stacked_videos.device).view(1, 1, 3, 1, 1) | |
| norm_std = torch.as_tensor(self.norm_std, dtype=stacked_videos.dtype, device=stacked_videos.device).view(1, 1, 3, 1, 1) | |
| stacked_videos = (stacked_videos - norm_mean) / norm_std | |
| resized_videos_grouped[shape] = stacked_videos | |
| grid_h, grid_w = resize_height // self.image_size, resize_width // self.image_size | |
| processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size | |
| resized_videos = reorder_videos(resized_videos_grouped, grouped_videos_index) | |
| processed_grids = reorder_videos(processed_grids, grouped_videos_index) | |
| pixel_values_videos = torch.cat(resized_videos, dim=0) | |
| video_grid_thw = torch.tensor(processed_grids) | |
| return BatchFeature( | |
| data={"pixel_values_videos": pixel_values_videos, "video_grid_thw": video_grid_thw}, | |
| tensor_type=return_tensors, | |
| ) | |
| def get_num_of_video_patches(self, num_frames: int, height: int, width: int): | |
| """ | |
| A utility that returns number of video patches a given video size. | |
| Args: | |
| num_frames (`int`): | |
| Number of frames in the input video. | |
| height (`int`): | |
| Height of the input video. | |
| width (`int`): | |
| Width of the input video. | |
| Returns: | |
| `Tuple(int, int)`: Number of placeholder tokens required and number of patches per image. | |
| """ | |
| target_ratios = get_internvl_target_ratios(1, self.max_num_tiles) | |
| blocks, _, _ = calculate_targets( | |
| width, | |
| height, | |
| target_ratios, | |
| self.image_size | |
| ) | |
| return num_frames * blocks | |
| __all__ = ["NemotronH_Nano_Omni_Reasoning_V3VideoProcessor"] | |