开始使用 AMD (ROCM Kernel)

最后更新时间:2025年06月07日。

作者:Yusheng Su

设置

如果您在 ROCM 平台上使用 AMD GPU (MI300),则不能使用之前的快速入门指南来运行 verl。您应该按照以下步骤构建 Docker 镜像,并在 verl 的 RLHF 训练中启动 Ray 时设置 RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES 或 RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES(取决于 ray 版本,详见下文"示例"一节)。

docker/Dockerfile.rocm

FROM "rlsys/rocm-6.3.4-patch:rocm6.3.4-numa-patch_ubuntu-22.04"

SHELL ["/bin/bash", "-ceuxo", "pipefail"]

# Upper bound on parallel compile jobs for the source builds below.
ENV MAX_JOBS=512

ENV PATH="/usr/local/python3.12/bin:$PATH"
RUN ln -sf /usr/bin/python3.12 /usr/bin/python && \
    ln -sf /usr/bin/pip3.12 /usr/bin/pip

############################################
# FIX: combine update+install in one layer (hadolint DL3009 — a standalone
# `apt-get update` layer goes stale in cache) and drop the apt lists so they
# are not baked into the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends pkg-config liblzma-dev && \
    rm -rf /var/lib/apt/lists/*
############################################

###########################################
######## Install TransformerEngine ########
###########################################
WORKDIR /workspace/
# transformer-engine install
# https://github.com/ROCm/TransformerEngine
RUN rm -rf TransformerEngine
RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git
WORKDIR /workspace/TransformerEngine
# FIX: this line was missing the RUN prefix, making it an invalid Dockerfile
# instruction — the pinned commit was never checked out.
RUN git checkout 236178e5
# git checkout bb061ade
# git checkout 864405c
ENV NVTE_FRAMEWORK=pytorch
ENV NVTE_ROCM_ARCH=gfx942
ENV NVTE_USE_HIPBLASLT=1
ENV NVTE_USE_ROCM=1
# export CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr:${CMAKE_PREFIX_PATH:-}"
ENV CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr"
# FIX: $(MAX_JOBS) is shell command substitution (runs a command named
# MAX_JOBS); the variable must be expanded as ${MAX_JOBS}.
RUN MAX_JOBS=${MAX_JOBS} pip install . -vvv
WORKDIR /workspace/
###########################################
###########################################
###########################################





####################################################################################
############ Install vllm - sglang needs the vllm 0.6.7 dependency #################
####################################################################################
#### Needs vllm 0.6.7 - checkout 113274a0
WORKDIR /workspace/
RUN rm -rf vllm
# FIX: tolerate vllm not being preinstalled — `pip uninstall` exits non-zero
# for an absent package; `|| true` matches the pattern used elsewhere here.
RUN pip uninstall -y vllm || true
# Reference (downgrading vllm to 0.6.3): https://docs.vllm.ai/en/v0.6.3/getting_started/amd-installation.html
RUN git clone https://github.com/ROCm/vllm.git
# git clone https://github.com/vllm-project/vllm.git
WORKDIR /workspace/vllm
RUN git checkout 113274a0
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
#ENV MAX_JOBS=512
ENV MAX_JOBS=${MAX_JOBS}
RUN pip install "boto3>=1.26.0"
RUN pip install setuptools_scm
# Installs src into site-packages; the cloned repo can be deleted afterwards.
RUN python3 setup.py install
WORKDIR /workspace/
####################################################################################
####################################################################################
####################################################################################



###########################################
############ Hack for docker ##############
###########################################
# Pin setuptools — NOTE(review): the exact breakage with newer setuptools is
# not documented here; verify before bumping this version.
RUN pip install setuptools==75.8.0
###########################################
###########################################
###########################################



###########################################
############ Build sglang #################
###########################################
# Environment variables for the sglang / triton / aiter builds below.
ENV BASE_DIR=/sgl-workspace
ENV BUILD_TYPE=all
ENV SGL_REPO=https://github.com/sgl-project/sglang
ENV SGL_BRANCH=v0.4.6.post5
ENV TRITON_REPO=https://github.com/ROCm/triton.git
ENV TRITON_COMMIT=improve_fa_decode_3.0.0
ENV AITER_REPO=https://github.com/ROCm/aiter.git
ENV AITER_COMMIT=v0.1.2
# v0.1.2 release - commit id: 9d11f47
# ENV AITER_COMMIT=9d11f47
ENV HIP_FORCE_DEV_KERNARG=1
ENV HSA_NO_SCRATCH_RECLAIM=1
ENV SGLANG_SET_CPU_AFFINITY=1
ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
ENV NCCL_MIN_NCHANNELS=112
ENV MOE_PADDING=1
ENV VLLM_FP8_PADDING=1
ENV VLLM_FP8_ACT_PADDING=1
ENV VLLM_FP8_WEIGHT_PADDING=1
ENV VLLM_FP8_REDUCE_CONV=1
ENV TORCHINDUCTOR_MAX_AUTOTUNE=1
ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1
# Target only MI300-class (gfx942) GPUs for the HIP compiles below.
ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
ENV AMDGPU_TARGETS=gfx942
ENV ROCM_ARCH=gfx942
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
# Switch working directory
WORKDIR /sgl-workspace
# Clean and recreate the directory (wipes any leftover content before the
# fresh clone in the next step).
RUN rm -rf /sgl-workspace && mkdir -p /sgl-workspace

# Clone and build sglang.
# FIX: the branch-checkout fallback is parenthesized so only a failed
# `git checkout` falls back to the default branch. Previously
# `clone && cd && checkout || echo` (left-associative) meant a failed clone
# or cd was also swallowed by the echo and the build continued in the wrong
# directory.
RUN git clone ${SGL_REPO} \
    && cd sglang \
    && (git checkout ${SGL_BRANCH} || echo "Using default branch") \
    && cd sgl-kernel \
    && rm -f pyproject.toml \
    && mv pyproject_rocm.toml pyproject.toml \
    && python setup_rocm.py install \
    && cd .. \
    && if [ "$BUILD_TYPE" = "srt" ]; then \
        python -m pip --no-cache-dir install -e "python[srt_hip]"; \
    else \
        python -m pip --no-cache-dir install -e "python[all_hip]"; \
    fi \
    && cd /sgl-workspace \
    && cp -r /sgl-workspace/sglang /sglang \
    && python -m pip cache purge

# Install common Python packages
RUN pip install IPython orjson python-multipart torchao pybind11
# Rebuild Triton from the ROCm fork at the pinned commit (replaces whatever
# triton the earlier installs pulled in).
RUN pip uninstall -y triton || true \
    && git clone ${TRITON_REPO} \
    && cd triton \
    && git checkout ${TRITON_COMMIT} \
    && cd python \
    && python3 setup.py install \
    && cd /sgl-workspace
# ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942 --amdgpu-lower-module-lds-strategy=1"
# ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"

# Build aiter
# Version: commit 9d11f47
    # && git checkout ${AITER_COMMIT} \
RUN pip uninstall -y aiter || true
RUN git clone ${AITER_REPO} \
    && cd aiter \
    && git checkout ${AITER_COMMIT} \
    && git submodule sync \
    && git submodule update --init --recursive \
    && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py install \
    && cd /sgl-workspace

# Duplicate every MI300X tuning config under an MI300X_VF name — presumably so
# virtualized (VF) GPUs pick up the same kernel configs; TODO confirm.
RUN find /sgl-workspace/sglang/python/sglang/srt/layers/quantization/configs/ \
        /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/ \
        -type f -name '*MI300X*' | \
        xargs -I {} sh -c 'vf_config=$(echo "$1" | sed "s/MI300X/MI300X_VF/"); cp "$1" "$vf_config"' -- {}

# Environment setup complete.
RUN echo "Environment setup complete."

WORKDIR /workspace/
###########################################
###########################################
###########################################






###########################################
############### vllm v0.8.5 #################
###########################################
WORKDIR /workspace/

ENV VLLM_TARGET_DEVICE=rocm
ENV ROCM_PATH=/opt/rocm
ENV SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev
# Repo path reference: Dockerfile/Dockerfile.rocm_yang
# RUN git clone https://github.com/RLFoundation/vllm-patch.git
RUN pip uninstall -y vllm || true
RUN rm -rf vllm-patch
# Build the patched vllm (sleep-mode + NUMA branch); the libamdhip64 symlink
# lets the build find the HIP runtime in the default linker path.
RUN git clone https://github.com/RLFoundation/vllm-patch.git \
    && cd vllm-patch \
    && git checkout v0.8.5-sleep-numa \
    && rm -rf build/ dist/ *.egg-info \
    && ln -sf /opt/rocm/lib/libamdhip64.so /usr/lib/libamdhip64.so \
    && SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py install
    # RUN SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py develop
WORKDIR /workspace/
###########################################
###########################################
###########################################




#########################################
#### Install megatron-core ##############
#########################################
# FIX: guard the uninstall — `pip uninstall` exits non-zero when the package
# is absent, which would abort the whole chain; matches the `|| true` pattern
# used for the other uninstalls in this file.
RUN (pip uninstall -y megatron-core || true) && \
    git clone https://github.com/yushengsu-thu/Megatron-LM-amd_version.git && \
    cd Megatron-LM-amd_version && \
    pip install -vvv -e . && \
    cd /workspace/
#########################################
#########################################
#########################################




#######################################
################ apex #################
#######################################
WORKDIR /workspace/
# FIX: clone over HTTPS — the original used the SSH URL
# (git@github.com:ROCm/apex.git), which fails inside a docker build where no
# SSH keys are mounted. Also guard the uninstall against apex being absent.
RUN (pip uninstall -y apex || true) && \
    git clone https://github.com/ROCm/apex.git && \
    cd apex && \
    python setup.py install && \
    cd /workspace/
#######################################
#######################################
#######################################


################################################################################
########################### Add torch_memory_saver #############################
################################################################################
# Build flags for the HIP extension: target MI200/MI300 (gfx90a/gfx942) and
# define __HIP_PLATFORM_AMD__ for plain C/C++ compiles as well.
ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"
RUN pip install "git+https://github.com/YangWang92/torch_memory_saver_numa.git@numa"
################################################################################
################################################################################
################################################################################



########################################
###### Install ray #####################
########################################
# Requires this patch: https://github.com/ray-project/ray/pull/53531/files
# FIX: guard the uninstall so an absent ray does not fail the layer.
RUN pip uninstall -y ray || true
RUN pip install "ray[data,train,tune,serve]>=2.47.0"
########################################
########################################
########################################


##########################################
####### Install other dependencies #######
##########################################
# tensordict is installed with --no-deps — presumably to keep it from
# dragging in a conflicting torch build; TODO confirm.
RUN pip install "tensordict==0.6.2" --no-deps && \
    pip install accelerate \
    codetiming \
    datasets \
    dill \
    hydra-core \
    liger-kernel \
    numpy \
    pandas \
    peft \
    "pyarrow>=15.0.0" \
    pylatexenc \
    torchdata \
    wandb \
    orjson \
    pybind11

WORKDIR /workspace/
# Install verl itself (editable) from upstream.
RUN git clone https://github.com/volcengine/verl.git && \
    cd verl && \
    pip install -e .
##########################################
##########################################
##########################################

WORKDIR /workspace/
CMD ["/usr/bin/bash"]

构建镜像:

docker build -f docker/Dockerfile.rocm -t verl-rocm .

运行容器

注意:您也可以直接从 DockerHub 账户 [RLSys Foundation](https://hub.docker.com/u/yushengsuthu) 拉取已构建好的 Docker 镜像:

docker pull rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4

docker tag rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4 verl-rocm:latest

运行容器

可选:在无 root 和用户权限下运行

docker run --rm -it \
  --device /dev/dri \
  --device /dev/kfd \
  -p 8265:8265 \
  --group-add video \
  --cap-add SYS_PTRACE \
  --security-opt seccomp=unconfined \
  --privileged \
  -v $HOME/.ssh:/root/.ssh \
  -v $HOME:$HOME \
  --shm-size 128G \
  -w $PWD \
  verl-rocm \
  /bin/bash

(可选):如果您不想在 root 模式下运行,并需要将自己分配为用户,请在上面的 docker 启动脚本中添加 -e HOST_UID=$(id -u) 和 -e HOST_GID=$(id -g)。

示例

由于 AMD (ROCM) PyTorch 的特殊设置:1. 如果您的 ray>=2.45.0(默认),在 verl RLHF 训练中启动 Ray 时,需要设置 RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES 并添加此[补丁](https://github.com/ray-project/ray/pull/53531/files)。2. 如果您的 ray<2.45.0,在 verl 的 RLHF 训练中启动 Ray 时,需要设置 RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES。推理引擎 $ENGINE 可以是 vllm 或 sglang,我们在下面的示例中选择 vllm 作为默认值。

PPO

YOUR_PROJECT_NAME=r1-verl-ppo-upstream
YOUR_RUN_NAME=r1-training_ppo-upstream
# export HYDRA_FULL_ERROR=1

export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# [ray] < 2.45.0
#export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1

# [ray] >= 2.45.0
export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 # patched via https://github.com/ray-project/ray/pull/52794

GPUS_PER_NODE=8
MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
# Preprocess GSM8K and warm the HF model cache before launching training.
python3 examples/data_preprocess/gsm8k.py --local_save_dir data/gsm8k
python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
ENGINE=vllm #sglang

PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
 data.train_files=data/gsm8k/train.parquet \
 data.val_files=data/gsm8k/test.parquet \
 data.train_batch_size=256 \
 data.val_batch_size=1312 \
 data.max_prompt_length=512 \
 data.max_response_length=256 \
 actor_rollout_ref.model.path=$MODEL_PATH \
 actor_rollout_ref.actor.optim.lr=1e-6 \
 actor_rollout_ref.actor.ppo_mini_batch_size=64 \
 actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
 actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
 actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
 actor_rollout_ref.rollout.name=$ENGINE \
 actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
 actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
 critic.optim.lr=1e-5 \
 critic.model.path=$MODEL_PATH \
 critic.ppo_micro_batch_size_per_gpu=4 \
 algorithm.kl_ctrl.kl_coef=0.001 \
 trainer.logger=console \
 trainer.project_name=$YOUR_PROJECT_NAME \
 trainer.experiment_name=$YOUR_RUN_NAME \
 trainer.val_before_train=False \
 trainer.n_gpus_per_node=$GPUS_PER_NODE \
 trainer.nnodes=1 \
 trainer.save_freq=10 \
 trainer.test_freq=10 \
 trainer.total_epochs=15 #2>&1 | tee verl_demo.log

GRPO

YOUR_PROJECT_NAME=r1-verl-grpo-upstream
YOUR_RUN_NAME=r1-training_grpo-upstream
# export HYDRA_FULL_ERROR=1
# export FSDP_VERBOSE=1

#export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# [ray] < 2.45.0
#export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1

# [ray] >= 2.45.0
export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 # patched via https://github.com/ray-project/ray/pull/52794

GPUS_PER_NODE=8
MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
# MODEL_PATH=Qwen/Qwen2-7B-Instruct
# Preprocess GSM8K and warm the HF model cache before launching training.
python3 examples/data_preprocess/gsm8k.py --local_save_dir data/gsm8k
python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
ENGINE=vllm #sglang

# FIX: "Flase" -> "False" for enable_gradient_checkpointing — the typo is not
# a valid boolean and makes the Hydra/OmegaConf override fail at startup.
python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files=data/gsm8k/train.parquet \
    data.val_files=data/gsm8k/test.parquet \
    data.train_batch_size=1024 \
    data.val_batch_size=1312 \
    data.max_prompt_length=512 \
    data.max_response_length=1024 \
    actor_rollout_ref.model.path=$MODEL_PATH \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=False \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=$ENGINE \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.fsdp_config.param_offload=False \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger=console \
    trainer.project_name=$YOUR_PROJECT_NAME \
    trainer.experiment_name=$YOUR_RUN_NAME \
    trainer.n_gpus_per_node=$GPUS_PER_NODE \
    trainer.val_before_train=False \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=10 \
    trainer.total_epochs=15

多节点训练:使用 Docker/Podman 容器进行 Slurm 集群

如果您想使用 Slurm 进行多节点训练,可以使用以下脚本。

Note

  1. 在以下脚本中,您需要使用 podman 或 docker。我们稍后将发布 apptainer 脚本。

  2. 如果您想使用 podman,只需将脚本中的 docker 替换为 podman 即可。

该脚本包含以下步骤:

  1. SLURM 配置

  2. 环境设置

  3. Docker/Podman 容器设置

  4. Ray 集群初始化

  5. 数据预处理

  6. 模型设置

  7. 训练启动

slurm_script.sh

#!/bin/bash

#SBATCH --job-name=verl-ray-on-slurm
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=2
#SBATCH --mem=200G
#SBATCH --time=30-00:00:00
#SBATCH --gpus-per-node=8
#SBATCH --cpus-per-task=28
#SBATCH --output=../verl_log/slurm-%j.out
#SBATCH --error=../verl_log/slurm-%j.err
#SBATCH --nodelist=gpu-[0,1]


# Load the necessary modules
### Run this setup
# [cluster]: uses docker
# docker pull docker.io/rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4


##########################################################################
### The settings below must be configured per project and per cluster ###
##########################################################################

### Project
CONTAINER_NAME="multinode_verl_training"
IMG="verl.rocm"
DOCKERFILE="docker/Dockerfile.rocm"
# echo $PWD
verl_workdir="${HOME}/projects/verl_upstream"
export TRANSFORMERS_CACHE="${HOME}/.cache/huggingface"
export HF_HOME=$TRANSFORMERS_CACHE

### Cluster network settings
export NCCL_DEBUG=TRACE
export GPU_MAX_HW_QUEUES=2
export TORCH_NCCL_HIGH_PRIORITY=1
export NCCL_CHECKS_DISABLE=1
# export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
export NCCL_IB_GID_INDEX=3
export NCCL_CROSS_NIC=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_PROTO=Simple
export RCCL_MSCCL_ENABLE=0
export TOKENIZERS_PARALLELISM=false
export HSA_NO_SCRATCH_RECLAIM=1
##########################################################################

## Assign GPUs
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

### For rocm and the training script
# [ray] < 2.45.0
#export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1

# [ray] >= 2.45.0
export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 # patched via https://github.com/ray-project/ray/pull/52794


# Build and start the Docker container on every node. The whole command string
# is executed by the remote bash via srun; inner escapes (\" and \$) are
# deliberate so expansion happens on the remote side.
srun bash -c "
    # 发生任何错误时退出
    set -e

    # 清理悬空的镜像(标签为 <none> 的镜像)
    docker image prune -f

    # 需要先拉取 docker
    docker pull rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4

    if ! docker images --format \"{{.Repository}}:{{.Tag}}\" | grep -q \"${IMG}\"; then
        echo \"Building ${IMG} image...\"
        docker build -f \"${DOCKERFILE}\" -t \"${IMG}\" .
    else
        echo \"${IMG} image already exists, skipping build\"
    fi

    # 删除旧容器(如果存在)
    docker rm \"${CONTAINER_NAME}\" 2>/dev/null || true

    # 检查网络设备
    ibdev2netdev

    # 启动 docker
    docker run --rm -d \
    -e HYDRA_FULL_ERROR=1 \
    -e RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 \
    -e RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 \
    -e NCCL_DEBUG=${NCCL_DEBUG} \
    -e GPU_MAX_HW_QUEUES=${GPU_MAX_HW_QUEUES} \
    -e TORCH_NCCL_HIGH_PRIORITY=${TORCH_NCCL_HIGH_PRIORITY} \
    -e NCCL_CHECKS_DISABLE=${NCCL_CHECKS_DISABLE} \
    -e NCCL_IB_HCA=${NCCL_IB_HCA} \
    -e NCCL_IB_GID_INDEX=${NCCL_IB_GID_INDEX} \
    -e NCCL_CROSS_NIC=${NCCL_CROSS_NIC} \
    -e CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS} \
    -e NCCL_PROTO=${NCCL_PROTO} \
    -e RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE} \
    -e TOKENIZERS_PARALLELISM=${TOKENIZERS_PARALLELISM} \
    -e HSA_NO_SCRATCH_RECLAIM=${HSA_NO_SCRATCH_RECLAIM} \
    -e TRANSFORMERS_CACHE=${TRANSFORMERS_CACHE} \
    -e HF_HOME=${HF_HOME} \
    --network host \
    --device /dev/dri \
    --device /dev/kfd \
    --device /dev/infiniband \
    --group-add video \
    --cap-add SYS_PTRACE \
    --security-opt seccomp=unconfined \
    --privileged \
    -v \${HOME}:\${HOME} \
    -v \${HOME}/.ssh:/root/.ssh \
    -w \"${verl_workdir}\" \
    --shm-size 128G \
    --name \"${CONTAINER_NAME}\" \
    \"${IMG}\" \
    tail -f /dev/null

    echo \"Container setup completed\"
"
    # (Optional): if you do not want to run in root mode and need to map your
    # own user, add `-e HOST_UID=$(id -u)` and `-e HOST_GID=$(id -g)` to the
    # docker launch script above.





### Ray starts the nodes before training

# Getting the node names
nodes_array=($(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' '))

head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

# If we detect a space character in the head-node IP, we split it and pick
# the ipv4 address. This step is optional.
if [[ "$head_node_ip" == *" "* ]]; then
    IFS=' ' read -ra ADDR <<<"$head_node_ip"
if [[ ${#ADDR[0]} -gt 16 ]]; then
    head_node_ip=${ADDR[1]}
else
    head_node_ip=${ADDR[0]}
fi
    echo "检测到 IPV6 地址。我们将 IPV4 地址拆分为 $head_node_ip"
fi

port=6379
ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

# Make sure the environment variables are set before Ray initialization

# Print all environment variables
printenv

echo "在 $head_node 启动 HEAD"
# Start the Ray head inside the container on the head node (backgrounded).
srun --nodes=1 --ntasks=1 -w "$head_node" \
    docker exec "${CONTAINER_NAME}" \
        ray start --head --node-ip-address="$head_node_ip" --port=$port \
        --dashboard-port=8266 \
        --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block &
# Optional, though may be useful in certain versions of Ray < 1.0.
sleep 10

# Number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))

# Start a Ray worker inside the container on every remaining node.
for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Debug: 在 node_i = ${node_i} 上启动 worker"
    if [ -z "$node_i" ]; then
        echo "Error: worker $i 的节点名称为空"
        continue
    fi
    echo "在 $node_i 启动 WORKER $i"
    srun --nodes=1 --ntasks=1 -w "$node_i" \
        docker exec "${CONTAINER_NAME}" \
            ray start --address "$ip_head" --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block &
    sleep 5
done




# Ray initialization test (check the executions above for errors)
echo "正在测试 Slurm 节点上的 Ray 初始化..."
docker exec "${CONTAINER_NAME}" python3 -c '
import ray
try:
    ray.init(address="auto")
    print("\n=== Ray 集群状态 ===")
    print(f"节点数量: {len(ray.nodes())}")
    for node in ray.nodes():
        print("节点: {}, 状态: {}".format(node["NodeManagerHostname"], node["Alive"]))
        # print(f"节点: {node}")
    ray.shutdown()
    print("Ray 初始化成功!")
except Exception as e:
    print(f"Ray 初始化失败: {str(e)}")
'
echo "=== Ray 测试完成 ==="
######



# Run the data preprocessing

echo "正在开始数据预处理..."
docker exec "${CONTAINER_NAME}" \
    python3 "examples/data_preprocess/gsm8k.py" "--local_save_dir" "../data/gsm8k"

echo "正在开始数据预处理..."
docker exec "${CONTAINER_NAME}" \
    python3 "examples/data_preprocess/math_dataset.py" "--local_dir" "../data/math"

train_files="../data/gsm8k/train.parquet"
val_files="../data/gsm8k/test.parquet"

# Download and test the model
echo "正在加载模型..."
docker exec "${CONTAINER_NAME}" \
    python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2-7B-Instruct')"
MODEL_PATH="Qwen/Qwen2-7B-Instruct"

# Set the model path after the pipeline test
# NOTE(review): MODEL_PATH is assigned three times in this script (7B above,
# 0.5B here, 7B again below) — the last assignment wins; confirm which model
# is actually intended for training.
MODEL_PATH="Qwen/Qwen2.5-0.5B-Instruct"

echo "== 数据和模型加载完成 =="

echo "开始训练..."

# NOTE(review): this repeats the model download above and overrides
# MODEL_PATH back to the 7B model — likely leftover duplication.
docker exec "${CONTAINER_NAME}" \
    python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2-7B-Instruct')"
MODEL_PATH="Qwen/Qwen2-7B-Instruct"


# Launch training from the head node (the driver schedules across the Ray cluster).
# FIX: the original passed actor_rollout_ref.model.enable_gradient_checkpointing
# twice (False, then True a few lines later); Hydra rejects duplicate overrides
# on the command line, so only a single occurrence (True, the later value) is kept.
PYTHONUNBUFFERED=1 srun --overlap --nodes=${SLURM_NNODES} --ntasks=1 -w "$head_node" \
    docker exec "${CONTAINER_NAME}" \
    python3 -m verl.trainer.main_ppo \
    data.train_files=$train_files \
    data.val_files=$val_files \
    data.train_batch_size=1024 \
    data.max_prompt_length=1024 \
    data.max_response_length=1024 \
    actor_rollout_ref.model.path=$MODEL_PATH \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    critic.optim.lr=1e-5 \
    critic.model.use_remove_padding=True \
    critic.model.path=$MODEL_PATH \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=8 \
    critic.model.fsdp_config.param_offload=False \
    critic.model.fsdp_config.optimizer_offload=False \
    algorithm.kl_ctrl.kl_coef=0.0001 \
    trainer.critic_warmup=0 \
    trainer.logger='["console","wandb"]' \
    trainer.project_name='verl_example' \
    trainer.experiment_name='Qwen2.5-32B-Instruct_function_rm' \
    trainer.n_gpus_per_node=${SLURM_GPUS_PER_NODE} \
    trainer.val_before_train=False \
    trainer.nnodes=${SLURM_NNODES} \
    trainer.save_freq=-1 \
    trainer.test_freq=10 \
    trainer.total_epochs=15

运行 slurm_script.sh

只需使用 sbatch 提交您的 slurm_script.sh

sbatch slurm_script.sh