使用 MPIJob 运行 8 卡 3090 的 NCCL 测试
提交 MPIJob 运行 NCCL 测试
# Submit an MPIJob that runs the NCCL all_reduce_perf benchmark with 8 MPI
# ranks (one GPU per rank) on a single 8-GPU worker.
# The heredoc delimiter is quoted so the shell passes the manifest through
# verbatim (no backslash or variable processing before kubectl sees it).
kubectl apply -f - << 'EOF'
---
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: nccl-tests-3090
spec:
  # Must match the GPU count of one worker: 1 worker x 8 slots = 8 ranks.
  slotsPerWorker: 8
  runPolicy:
    cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
            - image: cr.zw1.paratera.com/tests/nccl-tests:12.2.2-cudnn8-devel-ubuntu20.04-nccl2.19.3-1-3e0fbc3
              name: nccl
              env:
                # Both variables are set so mpirun can be started as root.
                - name: OMPI_ALLOW_RUN_AS_ROOT
                  value: "1"
                - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM
                  value: "1"
              # For interactive testing, replace command/args below with the
              # following line so you can exec into the launcher pod:
              # command: ['sleep', '86400']
              command: ["/bin/bash", "-c"]
              # Folded scalar: the lines below are joined with single spaces
              # into one command string for `bash -c`.
              # -np 8 equals replicas (1) x slotsPerWorker (8).
              # all_reduce_perf sweeps message sizes from 512M to 8G,
              # doubling each step (-f 2), with one GPU per rank (-g 1).
              # NOTE(review): NCCL_IB_HCA normally names an RDMA device, not a
              # network interface like eth0 - confirm this value is intended.
              args:
                - >-
                  mpirun
                  -np 8
                  -bind-to none
                  -x LD_LIBRARY_PATH
                  -x NCCL_SOCKET_IFNAME=eth0
                  -x NCCL_IB_HCA=eth0
                  /opt/nccl_tests/build/all_reduce_perf -b 512M -e 8G -f 2 -g 1
              resources:
                limits:
                  cpu: 2
                  memory: 4Gi
          enableServiceLinks: false
          automountServiceAccountToken: false
    Worker:
      replicas: 1
      template:
        metadata:
          labels:
            job: nccl-tests
        spec:
          containers:
            - image: cr.zw1.paratera.com/tests/nccl-tests:12.2.2-cudnn8-devel-ubuntu20.04-nccl2.19.3-1-3e0fbc3
              name: nccl
              resources:
                limits:
                  cpu: 80
                  memory: 480Gi
                  nvidia.com/gpu: 8
              volumeMounts:
                # Memory-backed /dev/shm provided by the dshm volume below.
                - mountPath: /dev/shm
                  name: dshm
          volumes:
            - emptyDir:
                medium: Memory
              name: dshm
          tolerations:
            # Tolerate any taint whose key is nvidia.com/gpu.
            - key: nvidia.com/gpu
              operator: Exists
          affinity:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                  # Only schedule onto nodes labelled nvidia.com/gpu=rtx-3090.
                  - matchExpressions:
                      - key: nvidia.com/gpu
                        operator: In
                        values:
                          - rtx-3090
          enableServiceLinks: false
          automountServiceAccountToken: false
EOF
查看 MPIJob 状态
# List MPIJob resources and their current state.
kubectl get mpijob
# Show the detailed description of the nccl-tests-3090 MPIJob.
kubectl describe mpijob nccl-tests-3090
查看 NCCL 测试日志
# Print the launcher pod's logs. "xxxxx" is a placeholder: substitute the
# actual launcher pod name (e.g. as listed by `kubectl get pods`).
kubectl logs nccl-tests-3090-launcher-xxxxx
测试结果
#
# Reducing maxBytes to 8119014741 due to memory limitation
#
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
536870912 134217728 float sum -1 58676 9.15 16.01 0 58509 9.18 16.06 0
1073741824 268435456 float sum -1 117613 9.13 15.98 0 117659 9.13 15.97 0
2147483648 536870912 float sum -1 236546 9.08 15.89 0 236985 9.06 15.86 0
4294967296 1073741824 float sum -1 486906 8.82 15.44 0 487377 8.81 15.42 0
# Out of bounds values : 0 OK
# Avg bus bandwidth : 15.8275
#