| { | |
| "exam_info": { | |
| "title": "NVIDIA Certification Practice Questions", | |
| "certifications": [ | |
| "NCP-AII", | |
| "NCP-AIO", | |
| "NCA-AIIO" | |
| ], | |
| "total_questions": 67, | |
| "time_limit_minutes": 5, | |
| "passing_score": 70, | |
| "last_updated": "February 2026" | |
| }, | |
| "questions": [ | |
| { | |
| "id": 1, | |
| "section": "Systems & Server Bring-Up", | |
| "question": "During a GPU server bring-up, one GPU is not detected. What's your first step?", | |
| "options": [ | |
| "Replace the GPU immediately", | |
| "Reseat the GPU and check PCIe slot connections", | |
| "Update the NVIDIA driver package", | |
| "Run an nvidia-smi reset command" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "Physical seating/cabling issues are common in bring-up. Validate connections before replacing hardware or changing software. This is the least invasive troubleshooting step and often resolves the issue." | |
| }, | |
| { | |
| "id": 2, | |
| "section": "Systems & Server Bring-Up", | |
| "question": "Which tool best verifies GPU health/status during bring-up?", | |
| "options": [ | |
| "iperf3", | |
| "nvidia-smi", | |
| "dmidecode", | |
| "sar" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "nvidia-smi is the primary utility for GPU health, utilization, temperature, and memory diagnostics. It provides comprehensive information about NVIDIA GPU status and is the standard tool for GPU monitoring." | |
| }, | |
| { | |
| "id": 3, | |
| "section": "Systems & Server Bring-Up", | |
| "question": "When validating a multi-GPU system, which BIOS setting is critical?", | |
| "options": [ | |
| "CPU Hyper-Threading", | |
| "PCIe bifurcation settings", | |
| "Boot order", | |
| "Secure Boot" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "Proper PCIe bifurcation (lane splitting) is key to multi-GPU stability and bandwidth. This setting determines how PCIe lanes are distributed across multiple slots, which is essential for optimal multi-GPU performance." | |
| }, | |
| { | |
| "id": 4, | |
| "section": "Systems & Server Bring-Up", | |
| "question": "What is the recommended power-on approach for initial bring-up?", | |
| "options": [ | |
| "Connect GPUs after powering on", | |
| "Power GPUs first, then CPUs", | |
| "Ensure all power cables are connected before powering on", | |
| "Sequence doesn't matter" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "Confirm power cabling and integrity before the first boot to avoid damage and intermittent issues. This prevents potential hardware damage from partial power delivery during boot-up." | |
| }, | |
| { | |
| "id": 5, | |
| "section": "Systems & Server Bring-Up", | |
| "question": "A server fails POST when multiple GPUs are installed. What is the most likely cause?", | |
| "options": [ | |
| "Incompatible SSD firmware", | |
| "Insufficient PSU wattage", | |
| "Incorrect fan profile", | |
| "Missing OS patches" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "Multi-GPU systems draw high power; an undersized PSU can prevent boot/POST. Each GPU can consume 300-400W or more, and the PSU must have sufficient capacity and the correct power connectors." | |
| }, | |
| { | |
| "id": 6, | |
| "section": "Systems & Server Bring-Up", | |
| "question": "What is the best method to validate GPU-to-GPU communication in a DGX-class system?", | |
| "options": [ | |
| "Run LINPACK", | |
| "Check NVLink topology with nvidia-smi topo -m", | |
| "CPU burn-in", | |
| "Review BIOS boot logs" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "The command 'nvidia-smi topo -m' shows NVLink/PCIe connectivity and hop distances between GPUs. This provides a clear topology map showing which GPUs are connected via NVLink and their communication paths." | |
| }, | |
| { | |
| "id": 7, | |
| "section": "Systems & Server Bring-Up", | |
| "question": "A firmware mismatch is found versus the validated version during bring-up. What should you do next?", | |
| "options": [ | |
| "Ignore and proceed", | |
| "Roll back to the validated firmware", | |
| "Flash the latest available firmware", | |
| "RMA the GPU" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "Use validated firmware for stability. Avoid untested latest builds in bring-up. Validated firmware has been tested for compatibility with your specific hardware configuration." | |
| }, | |
| { | |
| "id": 8, | |
| "section": "Systems & Server Bring-Up", | |
| "question": "Which Linux log is most useful for driver initialization issues?", | |
| "options": [ | |
| "/var/log/syslog", | |
| "dmesg", | |
| "/var/log/secure", | |
| "/var/log/messages" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "dmesg surfaces kernel/driver initialization details, including GPU driver errors. It shows kernel ring buffer messages which include hardware detection and driver loading information." | |
| }, | |
| { | |
| "id": 9, | |
| "section": "Systems & Server Bring-Up", | |
| "question": "What is the role of NVIDIA DCGM during bring-up?", | |
| "options": [ | |
| "VM orchestration", | |
| "Network throughput testing", | |
| "GPU health/performance monitoring & diagnostics", | |
| "BIOS configuration" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "DCGM (Data Center GPU Manager) provides monitoring, diagnostics, and policy management for GPUs; valuable in validation. It offers comprehensive GPU telemetry and health checking capabilities." | |
| }, | |
| { | |
| "id": 10, | |
| "section": "Systems & Server Bring-Up", | |
| "question": "Thermal throttling appears during bring-up. What is the immediate action?", | |
| "options": [ | |
| "Disable GPU overclocking", | |
| "Increase airflow and verify cooling configuration", | |
| "Lower power cap permanently", | |
| "Replace GPUs" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "First address environmental/cooling factors (airflow, fans, heatsinks, ducting) before tuning power limits. Thermal throttling is usually caused by inadequate cooling rather than GPU defects." | |
| }, | |
| { | |
| "id": 11, | |
| "section": "GPU Validation & Testing", | |
| "question": "You are tasked with validating a newly installed NVIDIA A100 Tensor Core GPU. What is the correct process to verify GPU presence, PCIe bandwidth, and computational performance?", | |
| "options": [ | |
| "Use lspci | grep NVIDIA for presence; nvidia-smi -q -d pcie for bandwidth; Run TensorFlow ResNet50 benchmark", | |
| "Use nvidia-smi for presence; PCIe speed is irrelevant; Run nvprof profiler during CUDA application", | |
| "Check BIOS settings for GPU detection; Use lspci -vv to check PCIe speed; Run PyTorch ImageNet training", | |
| "Use nvidia-smi for presence; nvidia-smi -q -d pcie for bandwidth; Run CUDA-based matrix multiplication benchmark" | |
| ], | |
| "correct_answer": 3, | |
| "explanation": "nvidia-smi is the primary tool for NVIDIA GPU information. 'nvidia-smi -q -d pcie' provides detailed PCIe information. A CUDA-based benchmark (like cuBLAS matrix multiplication) isolates GPU performance without framework overhead." | |
| }, | |
| { | |
| "id": 12, | |
| "section": "GPU Validation & Testing", | |
| "question": "Your AI inference server using NVIDIA Triton experiences intermittent latency spikes. Profiling reveals GPU memory allocation stalls. Which strategy would be LEAST effective?", | |
| "options": [ | |
| "Using CUDA memory pools to pre-allocate memory", | |
| "Enabling CUDA graph capture to reduce kernel launch overhead", | |
| "Reducing the model's memory footprint using quantization", | |
| "Increasing the GPU's TCC (Tesla Compute Cluster) mode priority" | |
| ], | |
| "correct_answer": 3, | |
| "explanation": "TCC (Tesla Compute Cluster) mode priority primarily affects preemption behavior for display vs compute workloads and doesn't directly address memory allocation issues. The other options directly reduce memory pressure or allocation overhead." | |
| }, | |
| { | |
| "id": 13, | |
| "section": "GPU Validation & Testing", | |
| "question": "You need to configure persistent network settings on your BlueField SmartNIC after deploying BlueField OS (Debian-based). Which file should you modify?", | |
| "options": [ | |
| "/etc/network/interfaces", | |
| "/etc/resolv.conf", | |
| "/etc/hostname", | |
| "/etc/sysconfig/network-scripts/ifcfg-eth0" | |
| ], | |
| "correct_answer": 0, | |
| "explanation": "On Debian-based systems, /etc/network/interfaces is the standard location for configuring persistent network settings. This file is read during boot to configure network interfaces." | |
| }, | |
| { | |
| "id": 14, | |
| "section": "GPU Validation & Testing", | |
| "question": "A distributed training job experiences performance degradation. Network bandwidth is at maximum, but nvidia-smi shows low GPU utilization on some nodes. What is the most likely cause?", | |
| "options": [ | |
| "The GPUs are overheating, causing thermal throttling", | |
| "Data is not being distributed evenly across nodes; some nodes are waiting for data", | |
| "The NVIDIA drivers are outdated", | |
| "The network interface cards are faulty" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "When network bandwidth is saturated but GPUs show low utilization, it indicates data distribution imbalance. Some nodes are idle waiting for data while others are overloaded, creating a bottleneck in the distributed training pipeline." | |
| }, | |
| { | |
| "id": 15, | |
| "section": "GPU Validation & Testing", | |
| "question": "One GPU consistently reports lower utilization during distributed training. What is the most likely cause and best troubleshooting step?", | |
| "options": [ | |
| "Driver issue affecting only one GPU; reinstall NVIDIA drivers", | |
| "Software bug in training script; debug the script", | |
| "Hardware fault with GPU (thermal/memory); run nvidia-smi -i <gpu_id> -q", | |
| "Insufficient cooling in server rack; verify airflow" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "Consistent low utilization on a specific GPU suggests hardware issues. Running 'nvidia-smi -i <gpu_id> -q' provides detailed information about temperatures, power limits, error counts, and throttling that can identify thermal or memory problems." | |
| }, | |
| { | |
| "id": 16, | |
| "section": "GPU Validation & Testing", | |
| "question": "After launching a distributed training job on DGX servers via InfiniBand, inter-GPU communication is slower than expected. What is the most likely cause?", | |
| "options": [ | |
| "Default MTU size of 1500 is too small", | |
| "Incorrect GPU placement across NUMA nodes", | |
| "CPU frequency scaling set to powersave", | |
| "InfiniBand subnet manager configured incorrectly" | |
| ], | |
| "correct_answer": 3, | |
| "explanation": "The InfiniBand subnet manager (SM) handles path selection, congestion control, and fabric management. Incorrect SM configuration is the most common cause of poor InfiniBand performance, even when all links show as active." | |
| }, | |
| { | |
| "id": 17, | |
| "section": "GPU Validation & Testing", | |
| "question": "What must be done before installing new versions of DOCA drivers on a BlueField DPU?", | |
| "options": [ | |
| "Uninstall any previous versions of DOCA drivers", | |
| "Re-flash the firmware every time", | |
| "Disable network interfaces during installation", | |
| "Reboot the host system" | |
| ], | |
| "correct_answer": 0, | |
| "explanation": "Uninstalling previous DOCA driver versions prevents conflicts and ensures a clean upgrade. This is required to avoid version mismatches and ensure the new installation is not affected by leftover files or configurations." | |
| }, | |
| { | |
| "id": 18, | |
| "section": "GPU Validation & Testing", | |
| "question": "You observe one GPU fan running at significantly higher RPM than others under minimal load. ipmitool sensor shows normal GPU temperature. What are potential causes?", | |
| "options": [ | |
| "The fan's PWM control signal is malfunctioning", | |
| "The fan bearing is wearing out", | |
| "Dust buildup restricting airflow", | |
| "All of the above" | |
| ], | |
| "correct_answer": 3, | |
| "explanation": "All listed options can cause abnormal fan behavior. PWM signal issues, worn bearings requiring higher RPM to maintain airflow, and dust buildup can all result in one fan running faster than others even at normal temperatures." | |
| }, | |
| { | |
| "id": 19, | |
| "section": "GPU Validation & Testing", | |
| "question": "You notice GPU memory bandwidth reported by nvidia-smi is significantly lower than theoretical maximum. What are TWO potential bottlenecks?", | |
| "options": [ | |
| "Insufficient CPU cores assigned to training", | |
| "Inefficient data loading from storage to GPU memory", | |
| "GPUs connected via PCIe Gen3 instead of Gen4", | |
| "CPU using older DDR4 memory" | |
| ], | |
| "correct_answer": [1, 2], | |
| "explanation": "The correct answers are B and C. Inefficient data loading creates a pipeline stall where the GPU waits for data. PCIe Gen3 (16 GB/s) vs Gen4 (32 GB/s) can bottleneck transfers to GPU memory, especially for high-bandwidth GPUs like A100." | |
| }, | |
| { | |
| "id": 20, | |
| "section": "GPU Validation & Testing", | |
| "question": "After replacing a faulty GPU, the system boots and nvidia-smi detects it, but CUDA programs fail with 'no CUDA-capable device detected'. What is the most probable cause?", | |
| "options": [ | |
| "The new GPU is incompatible with existing BIOS", | |
| "The CUDA toolkit is not properly configured to use the new GPU", | |
| "The LD_LIBRARY_PATH is not set correctly", | |
| "The user lacks permissions to access the GPU" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "When nvidia-smi detects the GPU but CUDA applications don't, the CUDA runtime/toolkit is not properly configured. This often happens after hardware changes when the CUDA installation needs to be refreshed or reconfigured." | |
| }, | |
| { | |
| "id": 21, | |
| "section": "Network Configuration & Optimization", | |
| "question": "You are designing a network for distributed training with multiple GPUs across nodes. Which network characteristic is MOST critical for minimizing training time?", | |
| "options": [ | |
| "High bandwidth", | |
| "Low latency", | |
| "High packet loss rate", | |
| "Large MTU" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "Low latency is most critical for distributed training because gradient synchronization happens frequently. Even with high bandwidth, high latency causes GPUs to wait idle during synchronization, reducing overall training speed." | |
| }, | |
| { | |
| "id": 22, | |
| "section": "Network Configuration & Optimization", | |
| "question": "Your AI training pipeline reads data from a large HDF5 file with significant delays. What optimization is most likely to improve performance?", | |
| "options": [ | |
| "Converting HDF5 to CSV", | |
| "Storing HDF5 on NFS", | |
| "Reorganizing HDF5 file to improve data contiguity and chunking", | |
| "Compressing HDF5 with gzip" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "Proper HDF5 chunking and contiguity optimization significantly improves read performance by reducing seek operations and enabling efficient sequential reads. HDF5's chunked storage layout directly impacts random and sequential access patterns." | |
| }, | |
| { | |
| "id": 23, | |
| "section": "Network Configuration & Optimization", | |
| "question": "You're optimizing an Intel Xeon server with 4 A100 GPUs. nvprof identifies frequent CUDA kernel stalls due to thread divergence. What are possible causes and solutions?", | |
| "options": [ | |
| "Input data not properly aligned; ensure 128-byte alignment", | |
| "CUDA code contains conditional branches; rewrite to minimize branching", | |
| "GPUs overheating; improve cooling", | |
| "CUDA compiler generating suboptimal code; try different optimization flags" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "Thread divergence occurs when threads in the same warp take different execution paths due to conditional branches. Rewriting code to minimize branching (using techniques like predication) keeps threads synchronized and improves performance." | |
| }, | |
| { | |
| "id": 24, | |
| "section": "Network Configuration & Optimization", | |
| "question": "You're designing a data center network for inference workloads requiring high availability. Which considerations are MOST important?", | |
| "options": [ | |
| "Minimizing hop count", | |
| "Implementing redundant paths", | |
| "Using cheapest switches", | |
| "Prioritizing north-south bandwidth" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "High availability requires redundant paths to eliminate single points of failure. This ensures service continuity even if network components fail, which is critical for production inference workloads." | |
| }, | |
| { | |
| "id": 25, | |
| "section": "Network Configuration & Optimization", | |
| "question": "After installing GPUs, system boots but nvidia-smi only detects one GPU. The motherboard has multiple PCIe slots. What is the most probable cause?", | |
| "options": [ | |
| "Other GPUs not properly seated", | |
| "Other GPUs are faulty", | |
| "BIOS/UEFI not configured to enable all PCIe slots", | |
| "Power supply insufficient" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "BIOS/UEFI settings often disable PCIe slots by default or have incorrect lane allocation. Check BIOS settings to enable all PCIe slots and properly configure lane distribution (e.g., x16/x8/x8 mode)." | |
| }, | |
| { | |
| "id": 26, | |
| "section": "Network Configuration & Optimization", | |
| "question": "You need to remotely monitor GPU temperature without installing additional software. Which protocol would BEST facilitate this?", | |
| "options": [ | |
| "SNMP with MIB", | |
| "HTTP with JSON", | |
| "SSH with nvidia-smi output", | |
| "IPMI with SDR" | |
| ], | |
| "correct_answer": 3, | |
| "explanation": "IPMI (Intelligent Platform Management Interface) with SDR (Sensor Data Records) provides hardware-level monitoring without requiring additional software installation on the OS. It accesses the BMC directly for sensor data." | |
| }, | |
| { | |
| "id": 27, | |
| "section": "Network Configuration & Optimization", | |
| "question": "Consider this ibroute command: 'ibroute add dest 0x1a dev ib0'. What is its MOST likely purpose?", | |
| "options": [ | |
| "Add default route for traffic outside InfiniBand subnet", | |
| "Create static route for traffic to LID 0x1a using ib0", | |
| "Configure MTU size on ib0 to 0x1a bytes", | |
| "Disable routing on ib0" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "This command creates a static route for traffic destined to InfiniBand LID (Local Identifier) 0x1a, directing it through the ib0 interface. LIDs are used for addressing within InfiniBand fabrics." | |
| }, | |
| { | |
| "id": 28, | |
| "section": "Network Configuration & Optimization", | |
| "question": "You're optimizing a deep learning model for Tensor Cores. Profiling shows Tensor Cores are underutilized. Which strategy improves utilization?", | |
| "options": [ | |
| "Increase batch size", | |
| "Ensure all matrix multiplications use FP16 precision", | |
| "Pad input tensors to dimensions that are multiples of 8", | |
| "Enable CUDA graph capture" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "Tensor Cores require specific alignment for optimal operation. Padding tensors to multiples of 8 (or 16 for better performance) ensures operations map efficiently to Tensor Core architecture, maximizing their utilization." | |
| }, | |
| { | |
| "id": 29, | |
| "section": "Network Configuration & Optimization", | |
| "question": "Setting up HPC cluster with GPU nodes using Slurm. How to ensure GPU jobs are scheduled only on nodes with appropriate drivers?", | |
| "options": [ | |
| "Use Slurm's GresTypes configuration", | |
| "Create custom script to check drivers before job submission", | |
| "Use Slurm node features to tag nodes with Feature=gpu", | |
| "Install DCGM and configure Slurm to query it" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "Slurm's node features allow tagging nodes with specific characteristics (like 'gpu'). Jobs can then request nodes with required features using the --constraint option, ensuring they only run on properly configured GPU nodes." | |
| }, | |
| { | |
| "id": 30, | |
| "section": "Network Configuration & Optimization", | |
| "question": "What is a primary benefit of using CLOS (Spine-Leaf) topology in a data center?", | |
| "options": [ | |
| "Reduced CAPEX", | |
| "Increased network diameter", | |
| "Improved scalability and bandwidth utilization", | |
| "Simplified management" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "CLOS/Spine-Leaf topology provides better scalability and more predictable bandwidth. Each leaf switch connects to every spine switch, providing multiple equal-cost paths and consistent latency between any two endpoints." | |
| }, | |
| { | |
| "id": 31, | |
| "section": "Network Configuration & Optimization", | |
| "question": "You're monitoring storage I/O with high disk utilization but low CPU utilization. Which action is LEAST likely to improve performance?", | |
| "options": [ | |
| "Switch from HDD to NVMe SSD", | |
| "Implement data prefetching", | |
| "Increase batch size", | |
| "Reduce parallel data loading threads" | |
| ], | |
| "correct_answer": 3, | |
| "explanation": "Reducing parallel data loading threads would decrease, not improve, performance when the bottleneck is storage I/O. The other options address the I/O bottleneck by improving storage speed or data pipeline efficiency." | |
| }, | |
| { | |
| "id": 32, | |
| "section": "Network Configuration & Optimization", | |
| "question": "What is the RECOMMENDED subnet manager for large AI training environments?", | |
| "options": [ | |
| "OpenSM (easiest to configure)", | |
| "UFM (advanced management capabilities)", | |
| "IBA management tools", | |
| "Any subnet manager (performance difference negligible)" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "UFM (Unified Fabric Manager) provides advanced management, monitoring, and optimization capabilities essential for large-scale AI environments. It offers superior fabric analysis, congestion management, and performance tuning compared to basic subnet managers." | |
| }, | |
| { | |
| "id": 33, | |
| "section": "Network Configuration & Optimization", | |
| "question": "Planning network for DGX SuperPOD. Which network technology is RECOMMENDED for interconnecting nodes?", | |
| "options": [ | |
| "Gigabit Ethernet", | |
| "10 Gigabit Ethernet", | |
| "InfiniBand", | |
| "Wi-Fi 6" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "InfiniBand provides the high bandwidth (200-400 Gbps), ultra-low latency (< 1 microsecond), and RDMA support required for DGX SuperPOD. It's specifically designed for HPC and AI workloads requiring intensive inter-node communication." | |
| }, | |
| { | |
| "id": 34, | |
| "section": "Network Configuration & Optimization", | |
| "question": "AI server exhibits kernel panics under GPU load. dmesg shows: 'NVRM: GPU has fallen off the bus'. What is the LEAST likely cause?", | |
| "options": [ | |
| "Insufficient power to GPU", | |
| "Loose PCIe riser cable", | |
| "Driver bug", | |
| "Faulty CPU" | |
| ], | |
| "correct_answer": 3, | |
| "explanation": "A faulty CPU is least likely to cause 'GPU has fallen off the bus' errors. This error typically indicates PCIe communication loss, usually caused by power issues, loose connections, or driver problems—not CPU failure." | |
| }, | |
| { | |
| "id": 35, | |
| "section": "Network Configuration & Optimization", | |
| "question": "Key considerations for CPU pinning vs NUMA awareness on multi-socket AMD EPYC server with GPUs?", | |
| "options": [ | |
| "CPU pinning more important than NUMA", | |
| "NUMA awareness more important than CPU pinning", | |
| "Both critical; use in conjunction", | |
| "Neither relevant for GPU workloads" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "Both CPU pinning and NUMA awareness are critical for optimal performance. CPU pinning reduces context switching overhead, while NUMA awareness ensures CPU cores and memory are in the same NUMA node, minimizing memory access latency. Use both together for best results." | |
| }, | |
| { | |
| "id": 36, | |
| "section": "Storage & Data Management", | |
| "question": "In distributed training with NVLink switches, which strategy minimizes inter-server latency impact?", | |
| "options": [ | |
| "Increase batch size", | |
| "Use asynchronous data transfers with overlapping computation", | |
| "Compress data before transfer", | |
| "Use centralized parameter server" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "Asynchronous data transfers with overlapping computation hide latency by allowing GPUs to continue computing while data transfers occur in the background. This maximizes GPU utilization and minimizes idle time waiting for network communication." | |
| }, | |
| { | |
| "id": 37, | |
| "section": "Storage & Data Management", | |
| "question": "You replaced a Quadro RTX 8000 GPU. System boots, nvidia-smi recognizes it, but rendering performance is lower. TWO most likely factors?", | |
| "options": [ | |
| "PCIe link operating at lower generation", | |
| "NVIDIA OptiX denoiser not configured", | |
| "Power plan set to Power Saver", | |
| "Maya scene contains corrupted geometry" | |
| ], | |
| "correct_answer": [0, 2], | |
| "explanation": "The correct answers are A and C. PCIe Gen3 vs Gen4 significantly impacts bandwidth (16 GB/s vs 32 GB/s). Power Saver mode limits GPU clock speeds and performance. Both directly reduce rendering performance." | |
| }, | |
| { | |
| "id": 38, | |
| "section": "Storage & Data Management", | |
| "question": "What is the function of this iptables rule: 'iptables -A INPUT -p tcp --dport 8080 -j ACCEPT'?", | |
| "options": [ | |
| "Blocks all TCP traffic on port 8080", | |
| "Accepts all TCP traffic from port 8080", | |
| "Accepts all TCP traffic to port 8080", | |
| "Redirects TCP traffic from port 8080" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "This iptables rule appends (-A) to the INPUT chain, accepting (-j ACCEPT) TCP traffic (-p tcp) destined for (--dport) port 8080. This allows incoming TCP connections to port 8080." | |
| }, | |
| { | |
| "id": 39, | |
| "section": "Storage & Data Management", | |
| "question": "CUDA application crashes with 'CUDA ERROR ILLEGAL ADDRESS'. What debugging techniques help?", | |
| "options": [ | |
| "Use cuda-memcheck", | |
| "Use CUDA Debugger (cuda-gdb)", | |
| "Use NVIDIA Nsight Systems", | |
| "All of the above" | |
| ], | |
| "correct_answer": 3, | |
| "explanation": "All tools are valuable for debugging illegal memory access. cuda-memcheck detects memory errors at runtime, cuda-gdb allows stepping through code, and Nsight Systems profiles memory access patterns. Using multiple tools provides comprehensive debugging." | |
| }, | |
| { | |
| "id": 40, | |
| "section": "Storage & Data Management", | |
| "question": "After running ibstat, you see 'LMC: 0'. What does this indicate?", | |
| "options": [ | |
| "Link is down", | |
| "Link Aggregation not enabled", | |
| "Port operating at lowest speed", | |
| "Default value with no performance impact" | |
| ], | |
| "correct_answer": 3, | |
| "explanation": "LMC (LID Mask Control) 0 is the default and expected value for most InfiniBand configurations. It indicates no additional path bits are being used, which is normal and has no negative performance impact." | |
| }, | |
| { | |
| "id": 41, | |
| "section": "Storage & Data Management", | |
| "question": "GPU server exhibits high temperature causing throttling. Most effective action?", | |
| "options": [ | |
| "Reduce data center ambient temperature", | |
| "Lower GPU power limit using nvidia-smi", | |
| "Update NVIDIA drivers", | |
| "Increase fan speed using nvidia-smi --fan" | |
| ], | |
| "correct_answer": 3, | |
| "explanation": "Directly increasing fan speed with 'nvidia-smi --fan' immediately addresses the thermal issue by improving cooling. This is more effective than reducing ambient temperature or power limits, and more direct than driver updates." | |
| }, | |
| { | |
| "id": 42, | |
| "section": "Storage & Data Management", | |
| "question": "Network technology MOST suitable for large-scale recommendation system requiring low latency?", | |
| "options": [ | |
| "Gigabit Ethernet", | |
| "10 Gigabit Ethernet", | |
| "InfiniBand", | |
| "Fibre Channel" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "InfiniBand provides the lowest latency (sub-microsecond) and highest bandwidth needed for large-scale recommendation systems with frequent inter-node communication. It significantly outperforms Ethernet for latency-sensitive workloads." | |
| }, | |
| { | |
| "id": 43, | |
| "section": "Storage & Data Management", | |
| "question": "Distributed training on DGX servers via InfiniBand shows slow inter-GPU communication despite ibstat showing all links up. Most likely cause?", | |
| "options": [ | |
| "Default MTU 1500 too small", | |
| "Incorrect GPU placement across NUMA nodes", | |
| "CPU frequency scaling set to powersave", | |
| "InfiniBand subnet manager configured incorrectly" | |
| ], | |
| "correct_answer": 3, | |
| "explanation": "When links are up but performance is poor, subnet manager (SM) misconfiguration is most likely. The SM handles path selection, QoS, and congestion control—incorrect settings cause poor performance even with active links." | |
| }, | |
| { | |
| "id": 44, | |
| "section": "Storage & Data Management", | |
| "question": "AI infrastructure has A100 GPUs with GPU memory bandwidth lower than theoretical max. TWO potential bottlenecks?", | |
| "options": [ | |
| "Insufficient CPU cores", | |
| "Inefficient data loading from storage", | |
| "PCIe Gen3 instead of Gen4", | |
| "CPU using DDR4 memory" | |
| ], | |
| "correct_answer": [1, 2], | |
| "explanation": "The correct answers are B and C. Inefficient data loading creates pipeline stalls where GPU waits for data. PCIe Gen3 (16 GB/s) vs Gen4 (32 GB/s) limits host-to-GPU transfer bandwidth, bottlenecking memory-intensive operations." | |
| }, | |
| { | |
| "id": 45, | |
| "section": "Storage & Data Management", | |
| "question": "Large dataset on BeeGFS with CPU-bound data augmentation. GPU underutilized. How to improve?", | |
| "options": [ | |
| "Move data to local NVMe", | |
| "Increase BeeGFS metadata servers", | |
| "Implement asynchronous I/O with NVIDIA DALI", | |
| "Decrease batch size" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "NVIDIA DALI (Data Loading Library) offloads data augmentation from CPU to GPU, freeing CPU resources and improving GPU utilization. It provides GPU-accelerated data loading and preprocessing, solving the CPU bottleneck." | |
| }, | |
| { | |
| "id": 46, | |
| "section": "Performance Optimization", | |
| "question": "Techniques for improving inter-GPU communication in multi-GPU Intel server with NCCL? (Select all that apply)", | |
| "options": [ | |
| "Enable PCIe peer-to-peer transfers", | |
| "Utilize InfiniBand or RoCE interconnects", | |
| "Increase system RAM", | |
| "Configure NCCL for correct network interface" | |
| ], | |
| "correct_answer": [0, 1, 3], | |
| "explanation": "The correct answers are A, B, and D. PCIe peer-to-peer enables direct GPU-to-GPU transfers. InfiniBand/RoCE provide high-bandwidth, low-latency networking. Proper NCCL configuration ensures optimal use of available network interfaces. Increasing system RAM doesn't improve inter-GPU communication." | |
| }, | |
| { | |
| "id": 47, | |
| "section": "Performance Optimization", | |
| "question": "Using GPUDirect RDMA for fast transfer between GPUs across servers. How to verify RDMA is working?", | |
| "options": [ | |
| "Check nvidia-smi topo -m for NVLink and RDMA", | |
| "Examine dmesg for RDMA errors", | |
| "Use ibstat to verify InfiniBand active", | |
| "All of the above" | |
| ], | |
| "correct_answer": 3, | |
| "explanation": "All methods are valuable for verifying RDMA. nvidia-smi topo -m shows GPU topology, dmesg reveals driver/RDMA errors, ibstat confirms InfiniBand status. Using all provides comprehensive verification of GPUDirect RDMA functionality." | |
| }, | |
| { | |
| "id": 48, | |
| "section": "Performance Optimization", | |
| "question": "Primary advantage of direct-attached NVLink topology vs routing over network?", | |
| "options": [ | |
| "Increased network security", | |
| "Higher bandwidth and lower latency", | |
| "Reduced infrastructure cost", | |
| "Simplified configuration" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "NVLink provides significantly higher bandwidth (600 GB/s per GPU with third-generation NVLink on A100; 900 GB/s with NVLink 4.0 on H100) and lower latency compared to network-based communication. Direct GPU-to-GPU connections via NVLink eliminate network overhead for maximum performance." | |
| }, | |
| { | |
| "id": 49, | |
| "section": "Performance Optimization", | |
| "question": "Troubleshooting DGX-1 with performance degradation during distributed training. One GPU shows low utilization with frequent CUDA errors. Most likely cause?", | |
| "options": [ | |
| "Driver issue; reinstall drivers", | |
| "Software bug; debug training script", | |
| "Hardware fault (thermal/memory); run nvidia-smi -i <id> -q", | |
| "PSU overload; monitor PSU" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "Consistent low utilization with frequent CUDA errors strongly suggests hardware fault. Running 'nvidia-smi -i <id> -q' provides detailed telemetry (temperature, power, ECC errors, throttling) to identify thermal or memory issues." | |
| }, | |
| { | |
| "id": 50, | |
| "section": "Performance Optimization", | |
| "question": "Large dataset of microscopy TIFF files accessed randomly. Current storage is HDD. Which provides GREATEST performance improvement?", | |
| "options": [ | |
| "Implement data deduplication", | |
| "Migrate to large sequential HDD", | |
| "Replace with RAID 5 array", | |
| "Replace with single NVMe SSD" | |
| ], | |
| "correct_answer": 3, | |
| "explanation": "NVMe SSDs provide vastly superior random access performance (IOPS) compared to HDDs. For random access patterns, NVMe can deliver 100-1000x better performance than spinning disks, making it the most impactful upgrade." | |
| }, | |
| { | |
| "id": 51, | |
| "section": "Performance Optimization", | |
| "question": "Most important reason for dedicated storage network (InfiniBand/RoCE) for AI workloads?", | |
| "options": [ | |
| "Improved security", | |
| "Lower latency and higher bandwidth", | |
| "Simplified management", | |
| "Reduced cost" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "Dedicated storage networks using InfiniBand/RoCE provide significantly lower latency and higher bandwidth compared to standard Ethernet. This is critical for AI workloads with high data throughput requirements and latency sensitivity." | |
| }, | |
| { | |
| "id": 52, | |
| "section": "Performance Optimization", | |
| "question": "AI inferencing server using Triton crashes under peak load with CUDA OOM errors. Best strategy?", | |
| "options": [ | |
| "Increase system RAM", | |
| "Implement CUDA memory pooling in Triton", | |
| "Reduce batch size and concurrency of offending model", | |
| "Upgrade to GPUs with larger memory" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "Reducing batch size and concurrency directly addresses GPU memory pressure, preventing OOM errors. This is the most practical and immediate solution compared to hardware upgrades or architectural changes." | |
| }, | |
| { | |
| "id": 53, | |
| "section": "Performance Optimization", | |
| "question": "Role of GPUDirect RDMA in NVLink Switch-based system?", | |
| "options": [ | |
| "Allows GPU-to-GPU memory access without CPU", | |
| "Offloads compute to CPU", | |
| "Enables direct GPU-storage communication", | |
| "Facilitates GPU virtualization" | |
| ], | |
| "correct_answer": 0, | |
| "explanation": "GPUDirect RDMA enables RDMA-capable network adapters to read and write GPU memory directly, so GPU-to-GPU transfers across nodes bypass CPU staging buffers, reducing latency and CPU overhead. This is essential for high-performance distributed training across multiple nodes." | |
| }, | |
| { | |
| "id": 54, | |
| "section": "Performance Optimization", | |
| "question": "Profiling PyTorch model on AMD server with multiple GPUs shows data loading overhead. Which strategies optimize loading? (Select all that apply)", | |
| "options": [ | |
| "Use torch.utils.data.DataLoader with multiple workers", | |
| "Load entire dataset into RAM", | |
| "Implement async prefetching with torch.Generator", | |
| "Use faster storage (NVMe vs HDD)" | |
| ], | |
| "correct_answer": [0, 3], | |
| "explanation": "The correct answers are A and D. DataLoader with multiple workers parallelizes data loading. Faster storage (NVMe) directly improves I/O performance. Loading entire dataset to RAM is impractical for large datasets. torch.Generator is for reproducibility, not async prefetching." | |
| }, | |
| { | |
| "id": 55, | |
| "section": "Performance Optimization", | |
| "question": "Deploying multi-tenant AI with VXLAN. Most important VTEP configuration consideration?", | |
| "options": [ | |
| "Use default MTU 1500", | |
| "Ensure unique VXLAN Network Identifier per tenant", | |
| "Use same IP for all VTEPs", | |
| "Disable multicast routing" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "Each tenant must have a unique VXLAN Network Identifier (VNI) to ensure proper traffic isolation. VNIs create separate Layer 2 domains over a shared physical network, preventing cross-tenant traffic leakage." | |
| }, | |
| { | |
| "id": 56, | |
| "section": "Performance Optimization", | |
| "question": "Large-scale distributed training on AMD EPYC servers with A100s using Slurm. Training fails with NCCL errors. Steps to improve? (Select all that apply)", | |
| "options": [ | |
| "Ensure InfiniBand/RoCE properly configured", | |
| "Use srun with --mpi=pmi2", | |
| "Increase NCCL_CONNECT_TIMEOUT", | |
| "All of the above" | |
| ], | |
| "correct_answer": 3, | |
| "explanation": "All steps improve NCCL reliability. Proper network configuration ensures connectivity. The --mpi=pmi2 flag correctly initializes MPI environment. Increasing timeouts accommodates network delays. Using all together maximizes success rate." | |
| }, | |
| { | |
| "id": 57, | |
| "section": "Performance Optimization", | |
| "question": "Implementing distributed deep learning with NVLink switches. Strategy for optimal utilization?", | |
| "options": [ | |
| "Configure NCCL to use GPUDirect RDMA", | |
| "Use TCP/IP sockets", | |
| "Implement CPU-based data compression", | |
| "Disable peer-to-peer GPU memory access" | |
| ], | |
| "correct_answer": 0, | |
| "explanation": "Configuring NCCL to use GPUDirect RDMA enables direct GPU-to-GPU communication across NVLink switches, maximizing bandwidth and minimizing latency. This provides the best performance for distributed training." | |
| }, | |
| { | |
| "id": 58, | |
| "section": "Container & Orchestration", | |
| "question": "An organization wants to view STDIN, STDOUT, and STDERR I/O streams of a specific container. Which command?", | |
| "options": [ | |
| "docker top CONTAINER-NAME", | |
| "docker stats CONTAINER-NAME", | |
| "docker logs CONTAINER-NAME", | |
| "docker inspect CONTAINER-NAME" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "The 'docker logs CONTAINER-NAME' command retrieves the standard output (STDOUT) and standard error (STDERR) streams of a container. This is the primary tool for viewing container logs and debugging." | |
| }, | |
| { | |
| "id": 59, | |
| "section": "Container & Orchestration", | |
| "question": "Storage performance degradation during peak hours with both metadata-intensive and sequential I/O operations. Best solution?", | |
| "options": [ | |
| "Increase storage capacity", | |
| "Implement tiered storage", | |
| "Use compression", | |
| "Reduce concurrent users" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "Tiered storage separates metadata-intensive operations (fast SSD tier) from sequential I/O (capacity tier), optimizing performance for different workload characteristics. This addresses both workload types effectively." | |
| }, | |
| { | |
| "id": 60, | |
| "section": "Container & Orchestration", | |
| "question": "Managing on-premises cluster with Base Command Manager (BCM) needing to extend to AWS. What should you do?", | |
| "options": [ | |
| "Manually provision AWS instances", | |
| "Use BCM's Cluster Extension feature", | |
| "Write custom scripts", | |
| "Use third-party orchestration" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "BCM's Cluster Extension feature provides integrated cloud bursting capability, automatically provisioning and managing cloud resources when on-premises capacity is exhausted. This is the designed solution for hybrid deployments." | |
| }, | |
| { | |
| "id": 61, | |
| "section": "Container & Orchestration", | |
| "question": "You pulled TensorFlow container from NGC. Which command ensures container has access to all GPUs?", | |
| "options": [ | |
| "kubectl create pod --gpu=all", | |
| "docker run --gpus all nvcr.io/nvidia/tensorflow:<tag>", | |
| "docker run --device=/dev/nvidia0", | |
| "docker run --privileged" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "The '--gpus all' flag in Docker specifically enables access to all available GPUs on the host. This is the correct and recommended way to expose GPUs to Docker containers." | |
| }, | |
| { | |
| "id": 62, | |
| "section": "Container & Orchestration", | |
| "question": "System administrator needs to manage multiple DGX installations from BasePOD to SuperPOD. Which software?", | |
| "options": [ | |
| "NetQ", | |
| "Fleet Command", | |
| "Base Command Manager", | |
| "NVIDIA AI Enterprise" | |
| ], | |
| "correct_answer": 2, | |
| "explanation": "Base Command Manager (BCM) is specifically designed for managing NVIDIA DGX infrastructure from single systems to large-scale SuperPODs. It provides centralized management, monitoring, and orchestration." | |
| }, | |
| { | |
| "id": 63, | |
| "section": "Container & Orchestration", | |
| "question": "Configuring network for distributed training on multiple DGX servers. After job launch, communication is slow. Most likely cause?", | |
| "options": [ | |
| "MTU too small", | |
| "NUMA node placement", | |
| "CPU frequency scaling", | |
| "Subnet manager issues" | |
| ], | |
| "correct_answer": 3, | |
| "explanation": "Subnet manager misconfiguration is the most common cause of slow network performance in InfiniBand environments, even when physical links are operational. It controls routing, QoS, and path selection." | |
| }, | |
| { | |
| "id": 64, | |
| "section": "Container & Orchestration", | |
| "question": "You observe high GPU memory bandwidth usage but low computational throughput. What should you investigate?", | |
| "options": [ | |
| "Memory access patterns", | |
| "Kernel launch overhead", | |
| "Thread block configuration", | |
| "All of the above" | |
| ], | |
| "correct_answer": 3, | |
| "explanation": "All factors affect computational efficiency. Inefficient memory access patterns, excessive kernel launches, and poor thread block sizing can all cause high memory traffic without proportional computation. Investigate all aspects." | |
| }, | |
| { | |
| "id": 65, | |
| "section": "Container & Orchestration", | |
| "question": "Which command shows detailed PCIe information for GPU 0?", | |
| "options": [ | |
| "nvidia-smi -i 0", | |
| "nvidia-smi -i 0 -q -d PCIE", | |
| "lspci -vv", | |
| "nvidia-smi topo -m" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "The command 'nvidia-smi -i 0 -q -d PCIE' provides detailed PCIe information specifically for GPU 0, including link speed, width, generation, and bandwidth information." | |
| }, | |
| { | |
| "id": 66, | |
| "section": "Container & Orchestration", | |
| "question": "Best practice for GPU driver updates in production environment?", | |
| "options": [ | |
| "Update immediately when available", | |
| "Test in staging before production", | |
| "Never update drivers", | |
| "Update only during downtime" | |
| ], | |
| "correct_answer": 1, | |
| "explanation": "Always test driver updates in a staging environment before production deployment. This validates compatibility, identifies potential issues, and ensures workload stability before affecting production systems." | |
| }, | |
| { | |
| "id": 67, | |
| "section": "Container & Orchestration", | |
| "question": "You're deploying AI workload in Kubernetes. How to ensure pods are scheduled only on GPU nodes?", | |
| "options": [ | |
| "Use nodeSelector with GPU label", | |
| "Manually assign pods", | |
| "Use default scheduler", | |
| "Disable other nodes" | |
| ], | |
| "correct_answer": 0, | |
| "explanation": "Using nodeSelector with appropriate GPU labels (e.g., 'accelerator=nvidia-gpu') ensures pods are only scheduled on nodes with GPUs. This is the Kubernetes-native way to control pod placement." | |
| } | |
| ] | |
| } | |