Cluster YAML Configuration Options#

The cluster configuration is defined within a YAML file that will be used by the Cluster Launcher to launch the head node, and by the Autoscaler to launch worker nodes. Once the cluster configuration is defined, you will need to use the Ray CLI to perform any operations such as starting and stopping the cluster.
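The Ray CLI drives that lifecycle. For example, assuming the configuration below is saved as cluster.yaml (the filename is arbitrary):

```shell
# Create or update the cluster described by the config.
ray up cluster.yaml

# Open an interactive shell on the head node.
ray attach cluster.yaml

# Tear the cluster down.
ray down cluster.yaml
```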

Syntax#

cluster_name: str
max_workers: int
upscaling_speed: float
idle_timeout_minutes: int
docker:
    docker
provider:
    provider
auth:
    auth
available_node_types:
    node_types
head_node_type: str
file_mounts:
    file_mounts
cluster_synced_files:
    - str
rsync_exclude:
    - str
rsync_filter:
    - str
initialization_commands:
    - str
setup_commands:
    - str
head_setup_commands:
    - str
worker_setup_commands:
    - str
head_start_ray_commands:
    - str
worker_start_ray_commands:
    - str

Custom types#

Docker#

image: str
head_image: str
worker_image: str
container_name: str
pull_before_run: bool
run_options:
    - str
head_run_options:
    - str
worker_run_options:
    - str
disable_automatic_runtime_detection: bool
disable_shm_size_detection: bool

Auth#

ssh_user: str

Provider#

Security Group#

vSphere Config#

vSphere Credentials#

user: str
password: str
server: str

vSphere Frozen VM Configs#

name: str
library_item: str
resource_pool: str
cluster: str
datastore: str

vSphere GPU Configs#

Node types#

The available_node_types object’s keys represent the names of the different node types.

Deleting a node type from available_node_types and updating with ray up will cause the autoscaler to scale down all nodes of that type. In particular, changing the key of a node type object will result in removal of nodes corresponding to the old key; nodes with the new key name will then be created according to cluster configuration and Ray resource demands.

<node_type_1_name>:
    node_config:
        Node config
    resources:
        Resources
    min_workers: int
    max_workers: int
    worker_setup_commands:
        - str
    docker:
        Node Docker
<node_type_2_name>:
    ...
...

Node config#

Cloud-specific configuration for nodes of a given node type.

Modifying the node_config and updating with ray up will cause the autoscaler to scale down all existing nodes of the node type; nodes with the newly applied node_config will then be created according to cluster configuration and Ray resource demands.

AWS: A YAML object which conforms to the EC2 create_instances API in the AWS docs.

Azure: A YAML object as defined in the deployment template whose resources are defined in the Azure docs.

GCP: A YAML object as defined in the GCP docs.

vSphere: A YAML object with the following fields:

# The resource pool where the head node should live, if unset, will be
# the frozen VM's resource pool.
resource_pool: str
# The datastore to store the vmdk of the head node vm, if unset, will be
# the frozen VM's datastore.
datastore: str

Node Docker#

worker_image: str
pull_before_run: bool
worker_run_options:
    - str
disable_automatic_runtime_detection: bool
disable_shm_size_detection: bool

Resources#

CPU: int
GPU: int
object_store_memory: int
memory: int
<custom_resource1>: int
<custom_resource2>: int
...

File mounts#

<path1_on_remote_machine>: str # Path 1 on local machine
<path2_on_remote_machine>: str # Path 2 on local machine
...
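For instance, a concrete file_mounts block might look like the following (both paths are purely illustrative):

```yaml
file_mounts: {
    "/tmp/current_project": "/home/me/current_project",
    "/etc/app.conf": "/home/me/app.conf",
}
```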

Properties and Definitions#

cluster_name#

The name of the cluster. This is the namespace of the cluster.

  • Required: Yes

  • Importance: High

  • Type: String

  • Default: “default”

  • Pattern: [a-zA-Z0-9_]+

max_workers#

The maximum number of workers the cluster will have at any given time.

  • Required: No

  • Importance: High

  • Type: Integer

  • Default: 2

  • Minimum: 0

  • Maximum: Unbounded

upscaling_speed#

The number of nodes allowed to be pending as a multiple of the current number of nodes. For example, if set to 1.0, the cluster can grow in size by at most 100% at any time, so if the cluster currently has 20 nodes, at most 20 pending launches are allowed. Note that although the autoscaler will scale down to min_workers (which could be 0), it will always scale up to 5 nodes at a minimum when scaling up.

  • Required: No

  • Importance: Medium

  • Type: Float

  • Default: 1.0

  • Minimum: 0.0

  • Maximum: Unbounded
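The pending-launch budget described above can be sketched in a few lines (this is an illustration of the documented rule, not the autoscaler's actual implementation; the function name is made up):

```python
import math

def allowed_pending_launches(upscaling_speed: float, current_nodes: int) -> int:
    # The cluster may grow by upscaling_speed * current size at once,
    # but the autoscaler always allows at least 5 pending launches.
    return max(5, math.ceil(upscaling_speed * current_nodes))

# With upscaling_speed=1.0 and 20 nodes, up to 20 launches may be pending.
print(allowed_pending_launches(1.0, 20))
```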

idle_timeout_minutes#

The number of minutes that need to pass before an idle worker node is removed by the Autoscaler.

  • Required: No

  • Importance: Medium

  • Type: Integer

  • Default: 5

  • Minimum: 0

  • Maximum: Unbounded

docker#

Configure Ray to run in Docker containers.

  • Required: No

  • Importance: High

  • Type: Docker

  • Default: {}

In rare cases when Docker is not available on the system by default (e.g., bad AMI), add the following commands to initialization_commands to install it.

initialization_commands:
    - curl -fsSL https://get.docker.com -o get-docker.sh
    - sudo sh get-docker.sh
    - sudo usermod -aG docker $USER
    - sudo systemctl restart docker -f

provider#

The cloud provider-specific configuration properties.

  • Required: Yes

  • Importance: High

  • Type: Provider

auth#

Authentication credentials that Ray will use to launch nodes.

  • Required: Yes

  • Importance: High

  • Type: Auth

available_node_types#

Tells the autoscaler the allowed node types and the resources they provide. Each node type is identified by a user-specified key.

  • Required: No

  • Importance: High

  • Type: Node types

  • Default:

available_node_types:
  ray.head.default:
      node_config:
        InstanceType: m5.large
        BlockDeviceMappings:
            - DeviceName: /dev/sda1
              Ebs:
                  VolumeSize: 140
      resources: {"CPU": 2}
  ray.worker.default:
      node_config:
        InstanceType: m5.large
        InstanceMarketOptions:
            MarketType: spot
      resources: {"CPU": 2}
      min_workers: 0

head_node_type#

The key for one of the node types in available_node_types. This node type will be used to launch the head node.

If the field head_node_type is changed and an update is executed with ray up, the currently running head node will be considered outdated. The user will receive a prompt asking to confirm scale-down of the outdated head node, and the cluster will restart with a new head node. Changing the node_config of the node_type with key head_node_type will also result in cluster restart after a user prompt.

  • Required: Yes

  • Importance: High

  • Type: String

  • Pattern: [a-zA-Z0-9_]+

file_mounts#

The files or directories to copy to the head and worker nodes.

  • Required: No

  • Importance: High

  • Type: File mounts

  • Default: []

cluster_synced_files#

A list of paths to the files or directories to copy from the head node to the worker nodes. The same path on the head node will be copied to the worker node. This behavior is a subset of the file_mounts behavior, so in the vast majority of cases one should just use file_mounts.

  • Required: No

  • Importance: Low

  • Type: List of String

  • Default: []

rsync_exclude#

A list of patterns for files to exclude when running rsync up or rsync down. The filter is applied on the source directory only.

Example for a pattern in the list: **/.git/**.

  • Required: No

  • Importance: Low

  • Type: List of String

  • Default: []

rsync_filter#

A list of patterns for files to exclude when running rsync up or rsync down. The filter is applied on the source directory and recursively through all subdirectories.

Example for a pattern in the list: .gitignore.

  • Required: No

  • Importance: Low

  • Type: List of String

  • Default: []

initialization_commands#

A list of commands that will be run before the setup commands. If Docker is enabled, these commands will run outside the container and before Docker is setup.

  • Required: No

  • Importance: Medium

  • Type: List of String

  • Default: []

setup_commands#

A list of commands to run to set up nodes. These commands will always run on the head and worker nodes and will be merged with head setup commands for head and with worker setup commands for workers.

  • Required: No

  • Importance: Medium

  • Type: List of String

  • Default:

# Default setup_commands:
setup_commands:
  - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
  - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
  • Setup commands should ideally be idempotent (i.e., can be run multiple times without changing the result); this allows Ray to safely update nodes after they have been created. You can usually make commands idempotent with small modifications, e.g. git clone foo can be rewritten as test -e foo || git clone foo which checks if the repo is already cloned first.

  • Setup commands are run sequentially but separately. For example, if you are using anaconda, you need to run conda activate env && pip install -U ray because splitting the command into two setup commands will not work.

  • Ideally, you should avoid using setup_commands by creating a docker image with all the dependencies preinstalled to minimize startup time.

  • Tip: if you also want to run apt-get commands during setup add the following list of commands

    setup_commands:
      - sudo pkill -9 apt-get || true
      - sudo pkill -9 dpkg || true
      - sudo dpkg --configure -a
    

head_setup_commands#

A list of commands to run to set up the head node. These commands will be merged with the general setup commands.

  • Required: No

  • Importance: Low

  • Type: List of String

  • Default: []

worker_setup_commands#

A list of commands to run to set up the worker nodes. These commands will be merged with the general setup commands.

  • Required: No

  • Importance: Low

  • Type: List of String

  • Default: []

head_start_ray_commands#

Commands to start ray on the head node. You don’t need to change this.

  • Required: No

  • Importance: Low

  • Type: List of String

  • Default:

head_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

worker_start_ray_commands#

Command to start ray on worker nodes. You don’t need to change this.

  • Required: No

  • Importance: Low

  • Type: List of String

  • Default:

worker_start_ray_commands:
  - ray stop
  - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076

docker.image#

The default Docker image to pull in the head and worker nodes. This can be overridden by the head_image and worker_image fields. If neither image nor (head_image and worker_image) are specified, Ray will not use Docker.

  • Required: Yes (If Docker is in use.)

  • Importance: High

  • Type: String

The Ray project provides Docker images on DockerHub. The repository includes the following images:

  • rayproject/ray-ml:latest-gpu: CUDA support, includes ML dependencies.

  • rayproject/ray:latest-gpu: CUDA support, no ML dependencies.

  • rayproject/ray-ml:latest: No CUDA support, includes ML dependencies.

  • rayproject/ray:latest: No CUDA support, no ML dependencies.

docker.head_image#

Docker image for the head node to override the default docker image.

  • Required: No

  • Importance: Low

  • Type: String

docker.worker_image#

Docker image for the worker nodes to override the default docker image.

  • Required: No

  • Importance: Low

  • Type: String

docker.container_name#

The name to use when starting the Docker container.

  • Required: Yes (If Docker is in use.)

  • Importance: Low

  • Type: String

  • Default: ray_container

docker.pull_before_run#

If enabled, the latest version of image will be pulled when starting Docker. If disabled, docker run will only pull the image if no cached version is present.

  • Required: No

  • Importance: Medium

  • Type: Boolean

  • Default: True

docker.run_options#

The extra options to pass to docker run.

  • Required: No

  • Importance: Medium

  • Type: List of String

  • Default: []

docker.head_run_options#

The extra options to pass to docker run for head node only.

  • Required: No

  • Importance: Low

  • Type: List of String

  • Default: []

docker.worker_run_options#

The extra options to pass to docker run for worker nodes only.

  • Required: No

  • Importance: Low

  • Type: List of String

  • Default: []

docker.disable_automatic_runtime_detection#

If enabled, Ray will not try to use the NVIDIA Container Runtime if GPUs are present.

  • Required: No

  • Importance: Low

  • Type: Boolean

  • Default: False

docker.disable_shm_size_detection#

If enabled, Ray will not automatically specify the size of /dev/shm for the started container and the runtime’s default value (64MiB for Docker) will be used. If --shm-size=<> is manually added to run_options, this is automatically set to True, meaning that Ray will defer to the user-provided value.

  • Required: No

  • Importance: Low

  • Type: Boolean

  • Default: False
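As a sketch, a docker section that sets the shared-memory size manually (and thereby implicitly enables this flag) might look like:

```yaml
docker:
    image: rayproject/ray:latest
    container_name: ray_container
    run_options:
        - --shm-size=8g  # Ray defers to this value; shm size detection is skipped
```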

auth.ssh_user#

The user that Ray will authenticate with when launching new nodes.

  • Required: Yes

  • Importance: High

  • Type: String

auth.ssh_private_key#

AWS: The path to an existing private key for Ray to use. If not configured, Ray will create a new private keypair (default behavior). If configured, the key must be added to the project-wide metadata and KeyName has to be defined in the node configuration.

  • Required: No

  • Importance: Low

  • Type: String

Azure: The path to an existing private key for Ray to use.

  • Required: Yes

  • Importance: High

  • Type: String

You may use ssh-keygen -t rsa -b 4096 to generate a new ssh keypair.

GCP: The path to an existing private key for Ray to use. If not configured, Ray will create a new private keypair (default behavior). If configured, the key must be added to the project-wide metadata and KeyName has to be defined in the node configuration.

  • Required: No

  • Importance: Low

  • Type: String

vSphere: Not available. The vSphere provider expects the key to be located at the fixed path ~/ray-bootstrap-key.pem.

auth.ssh_public_key#

AWS: Not available.

Azure: The path to an existing public key for Ray to use.

  • Required: Yes

  • Importance: High

  • Type: String

GCP: Not available.

vSphere: Not available.

provider.type#

The cloud service provider. For AWS, this must be set to aws.

  • Required: Yes

  • Importance: High

  • Type: String

The cloud service provider. For Azure, this must be set to azure.

  • Required: Yes

  • Importance: High

  • Type: String

The cloud service provider. For GCP, this must be set to gcp.

  • Required: Yes

  • Importance: High

  • Type: String

The cloud service provider. For vSphere and VCF, this must be set to vsphere.

  • Required: Yes

  • Importance: High

  • Type: String

provider.region#

AWS: The region to use for deployment of the Ray cluster.

  • Required: Yes

  • Importance: High

  • Type: String

  • Default: us-west-2

Azure: Not available.

GCP: The region to use for deployment of the Ray cluster.

  • Required: Yes

  • Importance: High

  • Type: String

  • Default: us-west1

vSphere: Not available.

provider.availability_zone#

AWS: A string specifying a comma-separated list of availability zone(s) that nodes may be launched in. Nodes will be launched in the first listed availability zone and will be tried in the following availability zones if launching fails.

  • Required: No

  • Importance: Low

  • Type: String

  • Default: us-west-2a,us-west-2b

Azure: A string specifying a comma-separated list of availability zone(s) that nodes may be launched in. This can be specified at the provider level to set defaults for all node types, or at the node level to override the provider setting for specific node types.

For Azure, availability zone availability depends on each specific VM size / location combination. Node-level configuration in available_node_types.<node_type_name>.node_config.azure_arm_parameters.availability_zone takes precedence over provider-level configuration.

  • Required: No

  • Importance: Low

  • Type: String

  • Default: “auto” (let Azure automatically pick zones)

  • Example values

    • "1,2,3" - Use zones 1, 2, and 3

    • "1" - Use only zone 1

    • "none" - Explicitly disable zones

    • "auto" or omit - Let Azure automatically pick zones

See the following example Azure cluster config for more details

# Unique identifier for the head node and workers of this cluster.
cluster_name: nightly-cpu-minimal-2
max_workers: 6
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
  type: azure
  # https://azure.microsoft.com/en-us/global-infrastructure/locations
  location: westus2
  resource_group: ray-zones
  cache_stopped_nodes: False
  # Provider-level availability zone configuration (comma-separated)
  # This will be used as default for all node types unless overridden
  availability_zone: "1,2,3"

auth:
  ssh_user: ubuntu

available_node_types:
  ray.head.default:
    resources: {"CPU": 2}
    node_config:
      azure_arm_parameters:
        vmSize: Standard_D2s_v3
        imagePublisher: microsoft-dsvm
        imageOffer: ubuntu-2204
        imageSku: 2204-gen2
        imageVersion: latest
        # Head node: explicitly disable availability zones
        availability_zone: "none"
  ray.worker.default:
    min_workers: 0
    max_workers: 2
    resources: {"CPU": 2}
    node_config:
      azure_arm_parameters:
        vmSize: Standard_D2s_v3
        imagePublisher: microsoft-dsvm
        imageOffer: ubuntu-2204
        imageSku: 2204-gen2
        imageVersion: latest
        # Workers will use provider specified availability zones
  ray.worker.specific_zone:
    min_workers: 0
    max_workers: 2
    resources: {"CPU": 2}
    node_config:
      azure_arm_parameters:
        vmSize: Standard_D2s_v3
        imagePublisher: microsoft-dsvm
        imageOffer: ubuntu-2204
        imageSku: 2204-gen2
        imageVersion: latest
        # Workers will use availability zone 2 only (overrides provider setting)
        availability_zone: "2"

# Note: The Ubuntu 20.04 dsvm image has a few venvs already configured but
# they all contain python modules that are not compatible with Ray at the moment.
setup_commands:
    - (which conda && echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc) || true
    - conda tos accept
    - conda create -n ray-env python=3.10 -y
    - conda activate ray-env && echo 'conda activate ray-env' >> ~/.bashrc
    - which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl"

file_mounts_sync_continuously: False

file_mounts: {
}

GCP: A string specifying a comma-separated list of availability zone(s) that nodes may be launched in.

  • Required: No

  • Importance: Low

  • Type: String

  • Default: us-west1-a

vSphere: Not available.

provider.location#

AWS: Not available.

Azure: The location to use for deployment of the Ray cluster.

  • Required: Yes

  • Importance: High

  • Type: String

  • Default: westus2

GCP: Not available.

vSphere: Not available.

provider.resource_group#

AWS: Not available.

Azure: The resource group to use for deployment of the Ray cluster.

  • Required: Yes

  • Importance: High

  • Type: String

  • Default: ray-cluster

GCP: Not available.

vSphere: Not available.

provider.subscription_id#

AWS: Not available.

Azure: The subscription ID to use for deployment of the Ray cluster. If not specified, Ray will use the default from the Azure CLI.

  • Required: No

  • Importance: High

  • Type: String

  • Default: ""

GCP: Not available.

vSphere: Not available.

provider.msi_name#

AWS: Not available.

Azure: The name of the managed identity to use for deployment of the Ray cluster. If not specified, Ray will create a default user-assigned managed identity.

  • Required: No

  • Importance: Low

  • Type: String

  • Default: ray-default-msi

GCP: Not available.

vSphere: Not available.

provider.msi_resource_group#

AWS: Not available.

Azure: The name of the managed identity’s resource group to use for deployment of the Ray cluster, used in conjunction with msi_name. If not specified, Ray will create a default user-assigned managed identity in the resource group specified in the provider config.

  • Required: No

  • Importance: Low

  • Type: String

  • Default: ray-cluster

GCP: Not available.

vSphere: Not available.

provider.project_id#

AWS: Not available.

Azure: Not available.

GCP: The globally unique project ID to use for deployment of the Ray cluster.

  • Required: Yes

  • Importance: Low

  • Type: String

  • Default: null

vSphere: Not available.

provider.cache_stopped_nodes#

If enabled, nodes will be stopped when the cluster scales down. If disabled, nodes will be terminated instead. Stopped nodes launch faster than terminated nodes.

  • Required: No

  • Importance: Low

  • Type: Boolean

  • Default: True

provider.use_internal_ips#

If enabled, Ray will use private IP addresses for communication between nodes. This should be omitted if your network interfaces use public IP addresses.

If enabled, Ray CLI commands (e.g. ray up) will have to be run from a machine that is part of the same VPC as the cluster.

This option does not affect the existence of public IP addresses for the nodes, it only affects which IP addresses are used by Ray. The existence of public IP addresses is controlled by your cloud provider’s configuration.

  • Required: No

  • Importance: Low

  • Type: Boolean

  • Default: False
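As an illustration, a provider block for a private-subnet AWS deployment (region and type as in the earlier examples) might look like:

```yaml
provider:
    type: aws
    region: us-west-2
    use_internal_ips: True
```

Remember that ray up must then run from a machine inside the same VPC.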

provider.use_external_head_ip#

AWS: Not available.

Azure: If enabled, Ray will provision and use a public IP address for communication with the head node, regardless of the value of use_internal_ips. This option can be used in combination with use_internal_ips to avoid provisioning excess public IPs for worker nodes (i.e., communicate among nodes using private IPs, but provision a public IP for head node communication only). If use_internal_ips is False, then this option has no effect.

  • Required: No

  • Importance: Low

  • Type: Boolean

  • Default: False

GCP: Not available.

vSphere: Not available.

provider.security_group#

AWS: A security group that can be used to specify custom inbound rules.

Azure: Not available.

GCP: Not available.

vSphere: Not available.

provider.vsphere_config#

AWS: Not available.

Azure: Not available.

GCP: Not available.

vSphere: vSphere configurations used to connect to vCenter Server. If not configured, the VSPHERE_* environment variables will be used.

security_group.GroupName#

The name of the security group. This name must be unique within the VPC.

  • Required: No

  • Importance: Low

  • Type: String

  • Default: "ray-autoscaler-{cluster-name}"

security_group.IpPermissions#

The inbound rules associated with the security group.
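For example, a security_group that allows inbound HTTPS only might be written as follows (the group name and CIDR range are illustrative; the rule fields follow the EC2 IpPermissions schema):

```yaml
provider:
    type: aws
    region: us-west-2
    security_group:
        GroupName: ray_client_security_group
        IpPermissions:
            - FromPort: 443
              ToPort: 443
              IpProtocol: TCP
              IpRanges:
                  - CidrIp: 0.0.0.0/0
```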

vsphere_config.credentials#

The credential to connect to the vSphere vCenter Server.

vsphere_config.credentials.user#

The username to use when connecting to vCenter Server.

  • Required: No

  • Importance: Low

  • Type: String

vsphere_config.credentials.password#

The password of the user to connect to vCenter Server.

  • Required: No

  • Importance: Low

  • Type: String

vsphere_config.credentials.server#

The vSphere vCenter Server address.

  • Required: No

  • Importance: Low

  • Type: String

vsphere_config.frozen_vm#

The configurations related to the frozen VM(s).

If the frozen VM(s) already exist, library_item should be unset. Either an existing frozen VM should be specified by name, or the name of a resource pool of frozen VMs on each ESXi host (https://docs.vmware.com/en/VMware-vSphere/index.html) should be specified by resource_pool.

If the frozen VM(s) are to be deployed from an OVF template, library_item must be set to point to an OVF template (https://docs.vmware.com/en/VMware-vSphere/8.0/vsphere-vm-administration/GUID-AFEDC48B-C96F-4088-9C1F-4F0A30E965DE.html) in the content library. In this case, name must be set to indicate the name or the name prefix of the frozen VM(s). Then, either resource_pool should be set to indicate that a set of frozen VMs will be created on each ESXi host, or cluster should be set to indicate that a single frozen VM will be created in the vSphere cluster. The datastore config (https://docs.vmware.com/en/VMware-vSphere/7.0/com.vmware.vsphere.storage.doc/GUID-D5AB2BAD-C69A-4B8D-B468-25D86B8D39CE.html) is mandatory in this case.

Valid examples:

  1. ray up deploys a frozen VM from an OVF template:

    frozen_vm:
        name: single-frozen-vm
        library_item: frozen-vm-template
        cluster: vsanCluster
        datastore: vsanDatastore
    
  2. ray up against an existing frozen VM:

    frozen_vm:
        name: existing-single-frozen-vm
    
  3. ray up deploys a resource pool of frozen VMs from an OVF template:

    frozen_vm:
        name: frozen-vm-prefix
        library_item: frozen-vm-template
        resource_pool: frozen-vm-resource-pool
        datastore: vsanDatastore
    
  4. ray up against an existing resource pool of frozen VMs:

    frozen_vm:
        resource_pool: frozen-vm-resource-pool
    

Any other combination not listed in the above examples is invalid.

vsphere_config.frozen_vm.name#

The name or the name prefix of the frozen VM.

It can only be unset when resource_pool is set and points to an existing resource pool of frozen VMs.

  • Required: No

  • Importance: Medium

  • Type: String

vsphere_config.frozen_vm.library_item#

The library item (https://docs.vmware.com/en/VMware-vSphere/8.0/vsphere-vm-administration/GUID-D3DD122F-16A5-4F36-8467-97994A854B16.html#GUID-D3DD122F-16A5-4F36-8467-97994A854B16) of the OVF template of the frozen VM. If set, the frozen VM, or a set of frozen VMs, will be deployed from the OVF template specified by library_item. Otherwise, the frozen VM(s) should already exist.

Visit the Ray project's VM Packer (vmware-ai-labs/vm-packer-for-ray) to learn how to create an OVF template for frozen VMs.

  • Required: No

  • Importance: Low

  • Type: String

vsphere_config.frozen_vm.resource_pool#

The resource pool name of the frozen VMs, which can point to an existing resource pool of frozen VMs. Otherwise, library_item must be specified and a set of frozen VMs will be deployed on each ESXi host.

The frozen VMs will be named "{frozen_vm.name}-{the vm's ip address}".

  • Required: No

  • Importance: Medium

  • Type: String

vsphere_config.frozen_vm.cluster#

The vSphere cluster name. This takes effect only when library_item is set and resource_pool is unset. It indicates that a single frozen VM will be deployed on the vSphere cluster from an OVF template.

  • Required: No

  • Importance: Medium

  • Type: String

vsphere_config.frozen_vm.datastore#

The target vSphere datastore name for storing the virtual machine files of the frozen VM(s) deployed from an OVF template. This takes effect only when library_item is set. If resource_pool is also set, this datastore must be a datastore shared among the ESXi hosts.

  • Required: No

  • Importance: Low

  • Type: String

vsphere_config.gpu_config#

vsphere_config.gpu_config.dynamic_pci_passthrough#

The switch controlling how GPUs are bound from the ESXi host to the Ray node VM. The default value is False, meaning regular PCI Passthrough. If set to True, dynamic PCI Passthrough (https://docs.vmware.com/en/VMware-vSphere/8.0/vsphere-esxi-host-client/GUID-2B6D43A6-9598-47C4-A2E7-5924E3367BB6.html) will be enabled for the GPUs. A VM with dynamic PCI Passthrough GPUs still supports vSphere DRS (https://www.vmware.com/products/vsphere/drs-dpm.html).

  • Required: No

  • Importance: Low

  • Type: Boolean

available_node_types.<node_type_name>.node_type.node_config#

The configuration to be used to launch the nodes on the cloud service provider. Among other things, this will specify the instance type to be launched.

available_node_types.<node_type_name>.node_type.resources#

The resources that a node type provides, which enables the autoscaler to automatically select the right type of nodes to launch given the resource demands of the application. The resources specified will be automatically passed to the ray start command for the node via an environment variable. If not provided, the autoscaler can automatically detect them only for AWS/Kubernetes cloud providers. For more information, see also the resource demand scheduler.

  • Required: Yes (except for AWS/K8s)

  • Importance: High

  • Type: Resources

  • Default: {}

In some cases, adding special nodes without any resources may be desirable. Such nodes can be used as a driver which connects to the cluster to launch jobs. In order to manually add a node to an autoscaled cluster, the ray-cluster-name tag should be set and the ray-node-type tag should be set to unmanaged. Unmanaged nodes can be created by setting the resources to {} and the maximum workers to 0. The autoscaler will not attempt to start, stop, or update unmanaged nodes. The user is responsible for properly setting up and cleaning up unmanaged nodes.
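A minimal sketch of such an unmanaged node type (the type name is hypothetical):

```yaml
available_node_types:
    ray.worker.unmanaged:
        min_workers: 0
        max_workers: 0
        resources: {}
        node_config: {}
```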

available_node_types.<node_type_name>.node_type.min_workers#

The minimum number of workers to maintain for this node type, regardless of utilization.

  • Required: No

  • Importance: High

  • Type: Integer

  • Default: 0

  • Minimum: 0

  • Maximum: Unbounded

available_node_types.<node_type_name>.node_type.max_workers#

The maximum number of workers to have in the cluster for this node type, regardless of utilization. This takes precedence over minimum workers. By default, the number of workers of a node type is unbounded, constrained only by the cluster-wide max_workers. (Prior to Ray 1.3.0, the default value for this field was 0.)

Note that for nodes of type head_node_type the default number of max workers is 0.

available_node_types.<node_type_name>.node_type.worker_setup_commands#

A list of commands to run to set up worker nodes of this type. These commands will replace the general worker setup commands for the node.

  • Required: No

  • Importance:

  • Type: List of String

  • Default: []

available_node_types.<node_type_name>.node_type.resources.CPU#

AWS: The number of CPUs made available by this node. If not configured, the autoscaler can automatically detect them only for AWS/Kubernetes cloud providers.

  • Required: Yes (except for AWS/K8s)

  • Importance: High

  • Type: Integer

Azure: The number of CPUs made available by this node.

  • Required: Yes

  • Importance: High

  • Type: Integer

GCP: The number of CPUs made available by this node.

  • Required: No

  • Importance: High

  • Type: Integer

vSphere: The number of CPUs made available by this node. If not configured, the node will use the same settings as the frozen VM.

  • Required: No

  • Importance: High

  • Type: Integer

available_node_types.<node_type_name>.node_type.resources.GPU#

AWS: The number of GPUs made available by this node. If not configured, the autoscaler can automatically detect them only for AWS/Kubernetes cloud providers.

  • Required: No

  • Importance: Low

  • Type: Integer

Azure: The number of GPUs made available by this node.

  • Required: No

  • Importance: High

  • Type: Integer

GCP: The number of GPUs made available by this node.

  • Required: No

  • Importance: High

  • Type: Integer

vSphere: The number of GPUs made available by this node.

  • Required: No

  • Importance: High

  • Type: Integer

available_node_types.<node_type_name>.node_type.resources.memory#

AWS: The memory in bytes allocated for Python worker heap memory on the node. If not configured, the autoscaler will automatically detect the amount of RAM on AWS/Kubernetes nodes and allocate 70% of it for the heap.

  • Required: No

  • Importance: Low

  • Type: Integer

Azure: The memory in bytes allocated for Python worker heap memory on the node.

  • Required: No

  • Importance: High

  • Type: Integer

GCP: The memory in bytes allocated for Python worker heap memory on the node.

  • Required: No

  • Importance: High

  • Type: Integer

vSphere: The memory in megabytes allocated for Python worker heap memory on the node. If not configured, the node will use the same memory settings as the frozen VM.

  • Required: No

  • Importance: High

  • Type: Integer

available_node_types.<node_type_name>.node_type.resources.object-store-memory#

AWS: The memory in bytes allocated for the object store on the node. If not configured, the autoscaler will automatically detect the amount of RAM on AWS/Kubernetes nodes and allocate 30% of it for the object store.

  • Required: No

  • Importance: Low

  • Type: Integer

Azure: The memory in bytes allocated for the object store on the node.

  • Required: No

  • Importance: High

  • Type: Integer

GCP: The memory in bytes allocated for the object store on the node.

  • Required: No

  • Importance: High

  • Type: Integer

vSphere: The memory in bytes allocated for the object store on the node.

  • Required: No

  • Importance: High

  • Type: Integer
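The 70%/30% AWS/Kubernetes defaults described in the two sections above can be illustrated as follows (the rounding here is an assumption; the autoscaler's exact arithmetic may differ):

```python
def default_memory_split(detected_ram_bytes: int) -> tuple:
    # ~70% of detected RAM goes to the Python worker heap,
    # ~30% to the object store.
    heap = int(detected_ram_bytes * 0.7)
    object_store = int(detected_ram_bytes * 0.3)
    return heap, object_store

# For a 16 GiB node:
heap, object_store = default_memory_split(16 * 1024**3)
```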

available_node_types.<node_type_name>.docker#

A set of overrides to the top-level Docker configuration.

  • Required: No

  • Importance: Low

  • Type: Docker

  • Default: {}
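For example, a GPU worker type can override just the image while inheriting the rest of the top-level Docker configuration (the node type name and instance type here are illustrative):

```yaml
docker:
    image: rayproject/ray:latest
    container_name: ray_container

available_node_types:
    ray.worker.gpu:
        min_workers: 0
        resources: {"GPU": 1}
        node_config:
            InstanceType: p2.xlarge
        docker:
            worker_image: rayproject/ray:latest-gpu
```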

Examples#

Minimal configuration#

# A unique identifier for the head node and workers of this cluster.
cluster_name: aws-example-minimal

# Cloud-provider specific configuration.
provider:
    type: aws
    region: us-west-2

# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 3

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
        resources: {}
        # Provider-specific config for this node type, e.g., instance type. By default
        # Ray auto-configures unspecified fields such as SubnetId and KeyName.
        # For more documentation on available fields, see
        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
        node_config:
            InstanceType: m5.large
    ray.worker.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 3
        # The maximum number of worker nodes of this type to launch.
        # This parameter takes precedence over min_workers.
        max_workers: 3
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
        resources: {}
        # Provider-specific config for this node type, e.g., instance type. By default
        # Ray auto-configures unspecified fields such as SubnetId and KeyName.
        # For more documentation on available fields, see
        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
        node_config:
            InstanceType: m5.large
# A unique identifier for the head node and workers of this cluster.
cluster_name: minimal

# The maximum number of worker nodes to launch in addition to the head
# node. min_workers defaults to 0.
max_workers: 2

# Cloud-provider specific configuration.
provider:
    type: azure
    location: westus2
    resource_group: ray-cluster

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
    # SSH keys will be auto-generated with Ray-specific names if not specified
    # Uncomment and specify custom paths if you want to use different existing keys:
    # ssh_private_key: /path/to/your/key.pem
    # ssh_public_key: /path/to/your/key.pub

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
#    "/path1/on/remote/machine": "/path1/on/local/machine",
#    "/path2/on/remote/machine": "/path2/on/local/machine",
}
auth:
  ssh_user: ubuntu
cluster_name: minimal
provider:
  availability_zone: us-west1-a
  project_id: null # TODO: set your GCP project ID here
  region: us-west1
  type: gcp
# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 5

# Cloud-provider specific configuration.
provider:
    type: vsphere

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ray
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
    ssh_private_key: ~/ray-bootstrap-key.pem

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # You can override the resources here. Adding a GPU to the head node is not recommended.
        # resources: { "CPU": 2, "Memory": 4096}
        resources: {}
    ray.worker.default:
        # The minimum number of nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 1
        max_workers: 3
        # You can override the resources here. For GPU, currently only NVIDIA GPUs are supported. If no ESXi host
        # can fulfill the requirement, Ray node creation will fail, and the number of created nodes may not reach
        # the desired minimum. The vSphere node provider does not distinguish between GPU types; it only counts
        # quantity: if the user sets {"GPU": k}, the first k available NVIDIA GPUs are mounted to the VM.
        # resources: {"CPU": 2, "Memory": 4096, "GPU": 1}
        resources: {}

# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default

Full configuration#

# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
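# (Illustrative, applying the formula above: with upscaling_speed: 1.0 and 10
# currently running nodes, a single autoscaler round may add up to about
# 1.0 * 10 = 10 nodes, still capped by the max_workers limits.)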

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty object means disabled.
docker:
    image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
    # image: rayproject/ray:latest-cpu   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_container"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
    pull_before_run: True
    run_options:   # Extra options to pass into "docker run"
        - --ulimit nofile=65536:65536

    # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray-ml:latest-gpu"
    # Allow Ray to automatically detect GPUs

    # worker_image: "rayproject/ray-ml:latest-cpu"
    # worker_run_options: []

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
    type: aws
    region: us-west-2
    # Availability zone(s), comma-separated, that nodes may be launched in.
    # Nodes will be launched in the first listed availability zone; if launching
    # fails there, the subsequent availability zones will be tried in order.
    availability_zone: us-west-2a,us-west-2b
    # Whether to allow node reuse. If set to False, nodes will be terminated
    # instead of stopped.
    cache_stopped_nodes: True # If not present, the default is True.

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
#    ssh_private_key: /path/to/your/key.pem

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
        resources: {}
        # Provider-specific config for this node type, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
        # For more documentation on available fields, see:
        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
        node_config:
            InstanceType: m5.large
            # Default AMI for us-west-2.
            # Check https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/aws/config.py
            # for default images for other zones.
            ImageId: ami-0387d929287ab193e
            # You can provision additional disk space with a configuration as follows.
            BlockDeviceMappings:
                - DeviceName: /dev/sda1
                  Ebs:
                      VolumeSize: 140
                      VolumeType: gp3
            # Additional options in the boto docs.
    ray.worker.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 1
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 2
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
        resources: {}
        # Provider-specific config for this node type, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
        # For more documentation on available fields, see:
        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
        node_config:
            InstanceType: m5.large
            # Default AMI for us-west-2.
            # Check https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/aws/config.py
            # for default images for other zones.
            ImageId: ami-0387d929287ab193e
            # Run workers on spot by default. Comment this out to use on-demand.
            # NOTE: If relying on spot instances, it is best to specify multiple different instance
            # types to avoid interruption when one instance type is experiencing heightened demand.
            # Demand information can be found at https://aws.amazon.com/ec2/spot/instance-advisor/
            InstanceMarketOptions:
                MarketType: spot
                # Additional options can be found in the boto docs, e.g.
                #   SpotOptions:
                #       MaxPrice: MAX_HOURLY_PRICE
            # Additional options in the boto docs.
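    # Illustrative addition (not part of the default example): per the spot note
    # above, you can reduce interruption risk by defining a second spot worker
    # type with a different instance type. The name "ray.worker.large-spot" is
    # hypothetical; adjust the instance type and AMI for your region.
    # ray.worker.large-spot:
    #     min_workers: 0
    #     max_workers: 2
    #     resources: {}
    #     node_config:
    #         InstanceType: m5.xlarge
    #         ImageId: ami-0387d929287ab193e
    #         InstanceMarketOptions:
    #             MarketType: spot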

# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
#    "/path1/on/remote/machine": "/path1/on/local/machine",
#    "/path2/on/remote/machine": "/path2/on/local/machine",
}

# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []

# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False

# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
    - "**/.git"
    - "**/.git/**"

# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
    - ".gitignore"

# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []

# List of shell commands to run to set up nodes.
setup_commands: []
    # Note: if you're developing Ray, you probably want to create a Docker image that
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    # To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
    # that has the "nightly" (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
    # - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"

# Custom commands that will be run on the head node after common setup.
head_setup_commands: []

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
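Each full configuration on this page is driven with the Ray cluster CLI. A typical session, assuming the example above is saved as `cluster.yaml` (a filename chosen here for illustration), might look like:

```shell
# Launch or update the cluster: creates the head node, then the
# autoscaler launches workers according to available_node_types.
ray up -y cluster.yaml

# Open an interactive SSH shell on the head node.
ray attach cluster.yaml

# Tear down all nodes when finished.
ray down -y cluster.yaml
```

The `-y` flag skips the interactive confirmation prompt; omit it to review the planned changes before they are applied.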
# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty object means disabled.
docker:
    image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
    # image: rayproject/ray:latest-gpu   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_container"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
    pull_before_run: True
    run_options: # Extra options to pass into "docker run"
        - --ulimit nofile=65536:65536

    # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray-ml:latest-gpu"
    # Allow Ray to automatically detect GPUs

    # worker_image: "rayproject/ray-ml:latest-cpu"
    # worker_run_options: []

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
    type: azure
    # https://azure.microsoft.com/en-us/global-infrastructure/locations
    location: westus2
    resource_group: ray-cluster
    # Set subscription id otherwise the default from az cli will be used.
    # subscription_id: 00000000-0000-0000-0000-000000000000
    # Set unique subnet mask or a random mask will be used.
    # subnet_mask: 10.0.0.0/16
    # Set unique id for resources in this cluster.
    # If not set a default id will be generated based on the resource group and cluster name.
    # unique_id: RAY1
    # Set managed identity name and resource group;
    # If not set, a default user-assigned identity will be generated in the resource group specified above.
    # msi_name: ray-cluster-msi
    # msi_resource_group: other-rg
    # Set provisioning and use of public/private IPs for head and worker nodes;
    # If both options below are true, only the head node will have a public IP address provisioned.
    # use_internal_ips: True
    # use_external_head_ip: True

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
    # SSH keys will be auto-generated with Ray-specific names if not specified
    # Uncomment and specify custom paths if you want to use different existing keys:
    # ssh_private_key: /path/to/your/key.pem
    # ssh_public_key: /path/to/your/key.pub

# More specific customization of node configurations can be made using the ARM template azure-vm-template.json file.
# See the documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines
# Changes to the local file will be used during deployment of the head node; however, worker node deployment occurs
# on the head node, so changes to the template must be included in the wheel file used in the setup_commands section below.

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The resources provided by this node type.
        resources: {"CPU": 4}
        # Provider-specific config, e.g. instance type.
        node_config:
            azure_arm_parameters:
                vmSize: Standard_D4s_v3
                # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
                imagePublisher: microsoft-dsvm
                imageOffer: ubuntu-2204
                imageSku: 2204-gen2
                imageVersion: latest

                # Or, use a custom image from Azure Compute Gallery.
                # Note: if you use a custom image, then imagePublisher,
                # imageOffer, imageSku, and imageVersion are ignored.
                # imageId: /subscriptions/[subscription-id]/resourceGroups/[resource-group-id]/providers/Microsoft.Compute/galleries/[azure-compute-gallery-id]/images/[image-id]/versions/[image-version]

                # Optionally set osDiskSize if you want to use a custom disk size.
                # osDiskSize: 128

    ray.worker.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 2
        # The resources provided by this node type.
        resources: {"CPU": 4}
        # Provider-specific config, e.g. instance type.
        node_config:
            azure_arm_parameters:
                vmSize: Standard_D4s_v3
                # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
                imagePublisher: microsoft-dsvm
                imageOffer: ubuntu-2204
                imageSku: 2204-gen2
                imageVersion: latest
                # optionally set priority to use Spot instances
                priority: Spot
                # set a maximum price for spot instances if desired
                # billingProfile:
                #     maxPrice: -1

# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
    #    "/path1/on/remote/machine": "/path1/on/local/machine",
    #    "/path2/on/remote/machine": "/path2/on/local/machine",
}

# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. Ray copies the same path on the head node to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []

# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously.
file_mounts_sync_continuously: False

# Patterns for files to exclude when running rsync up or rsync down.
rsync_exclude:
    - "**/.git"
    - "**/.git/**"

# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
    - ".gitignore"

# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands:
    # enable docker setup
    - sudo usermod -aG docker $USER || true
    - sleep 10 # delay to avoid docker permission denied errors
    # get rid of annoying Ubuntu message
    - touch ~/.sudo_as_admin_successful

# List of shell commands to run to set up nodes.
# NOTE: rayproject/ray-ml:latest has ray latest bundled
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
# that has the "nightly" (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl"

# Custom commands that will be run on the head node after common setup.
head_setup_commands:
    - pip install -U azure-core==1.35.0 azure-cli-core==2.77.0 azure-identity==1.23.1 azure-mgmt-compute==35.0.0 azure-mgmt-network==29.0.0 azure-mgmt-resource==24.0.0 azure-common==1.1.28 msrest==0.7.1 msrestazure==0.6.4.post1

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty object means disabled.
docker:
  image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
  # image: rayproject/ray:latest-gpu   # use this one if you don't need ML dependencies, it's faster to pull
  container_name: "ray_container"
  # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
  # if no cached version is present.
  pull_before_run: True
  run_options:  # Extra options to pass into "docker run"
    - --ulimit nofile=65536:65536

  # Example of running a GPU head with CPU workers
  # head_image: "rayproject/ray-ml:latest-gpu"
  # Allow Ray to automatically detect GPUs

  # worker_image: "rayproject/ray-ml:latest-cpu"
  # worker_run_options: []

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
    type: gcp
    region: us-west1
    availability_zone: us-west1-a
    project_id: null # Globally unique project id

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below. This requires that you have added the key into the
# project wide meta-data.
#    ssh_private_key: /path/to/your/key.pem

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray_head_default:
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config for the head node, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
        # For more documentation on available fields, see:
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
        node_config:
            machineType: n1-standard-2
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                  diskSizeGb: 50
                  # See https://cloud.google.com/compute/docs/images for more images
                  sourceImage: projects/deeplearning-platform-release/global/images/common-cpu-v20240922

            # Additional options can be found in the compute docs at
            # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert

            # If the network interface is specified as below in both head and worker
            # nodes, the manual network config is used.  Otherwise an existing subnet is
            # used.  To use a shared subnet, ask the subnet owner to grant permission
            # for 'compute.subnetworks.use' to the ray autoscaler account...
            # networkInterfaces:
            #   - kind: compute#networkInterface
            #     subnetwork: path/to/subnet
            #     aliasIpRanges: []
    ray_worker_small:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 1
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 2
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config for the head node, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
        # For more documentation on available fields, see:
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
        node_config:
            machineType: n1-standard-2
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                  diskSizeGb: 50
                  # See https://cloud.google.com/compute/docs/images for more images
                  sourceImage: projects/deeplearning-platform-release/global/images/common-cpu-v20240922
            # Run workers on preemptible instances by default.
            # Comment this out to use on-demand.
            scheduling:
              - preemptible: true
            # Uncomment this to launch workers with the Service Account of the Head Node
            # serviceAccounts:
            # - email: ray-autoscaler-sa-v1@<project_id>.iam.gserviceaccount.com
            #   scopes:
            #   - https://www.googleapis.com/auth/cloud-platform

    # Additional options can be found in the compute docs at
    # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert

# Specify the node type of the head node (as configured above).
head_node_type: ray_head_default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
#    "/path1/on/remote/machine": "/path1/on/local/machine",
#    "/path2/on/remote/machine": "/path2/on/local/machine",
}

# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []

# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False

# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
    - "**/.git"
    - "**/.git/**"

# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
    - ".gitignore"

# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []

# List of shell commands to run to set up nodes.
setup_commands: []
    # Note: if you're developing Ray, you probably want to create a Docker image that
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    # To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
    # that has the "nightly" (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
    # - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"


# Custom commands that will be run on the head node after common setup.
head_setup_commands:
  - pip install google-api-python-client==1.7.8

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - >-
      ray start
      --head
      --port=6379
      --object-manager-port=8076
      --autoscaling-config=~/ray_bootstrap_config.yaml

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - >-
      ray start
      --address=$RAY_HEAD_IP:6379
      --object-manager-port=8076
# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The maximum number of worker nodes to launch in addition to the head
# node.
max_workers: 5

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty object means disabled.
docker:
    image: "rayproject/ray-ml:latest"
    # image: rayproject/ray:latest   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_container"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
    pull_before_run: True
    run_options:   # Extra options to pass into "docker run"
        - --ulimit nofile=65536:65536

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
    type: vsphere

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ray
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
    ssh_private_key: ~/ray-bootstrap-key.pem

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # You can override the resources here. Adding a GPU to the head node is not recommended.
        # resources: { "CPU": 2, "Memory": 4096}
        resources: {}
        node_config: {"vm_class": "best-effort-xlarge"}
    worker:
        # The minimum number of nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 1
        max_workers: 3
        # You can override the resources here. For GPU, currently only NVIDIA GPUs are supported. If no ESXi host
        # can fulfill the requirement, Ray node creation will fail, and the number of created nodes may not reach
        # the desired minimum. The vSphere node provider does not distinguish between GPU types; it only counts
        # quantity: if the user sets {"GPU": k}, the first k available NVIDIA GPUs are mounted to the VM.
        # resources: {"CPU": 2, "Memory": 4096, "GPU": 1}
        resources: {}
        node_config: {"vm_class": "best-effort-xlarge"}
    worker_2:
        # The minimum number of nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 1
        max_workers: 2
        # You can override the resources here. For GPUs, currently only NVIDIA GPUs are
        # supported. If no ESXi host can fulfill the requirement, node creation will fail,
        # and the number of created nodes may fall short of the desired minimum. The vSphere
        # node provider does not distinguish between GPU models; it only counts the quantity:
        # if you set {"GPU": k}, k of the available NVIDIA GPUs are mounted to the VM.
        # resources: {"CPU": 2, "Memory": 4096, "GPU": 1}
        resources: {}
        node_config: {"vm_class": "best-effort-xlarge"}
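Before launching, it can be useful to sanity-check the worker bounds in `available_node_types`: each type must satisfy `0 <= min_workers <= max_workers`, and the autoscaler will never run more workers of a type than that type's `max_workers`. The following is a hypothetical helper, not part of Ray; the dict mirrors the two worker types above.

```python
# Hypothetical sanity check for the available_node_types section above.
node_types = {
    "worker":   {"min_workers": 1, "max_workers": 3},
    "worker_2": {"min_workers": 1, "max_workers": 2},
}

def validate(types):
    """Check 0 <= min_workers <= max_workers and return the total worker capacity."""
    for name, cfg in types.items():
        lo, hi = cfg.get("min_workers", 0), cfg.get("max_workers", 0)
        if not 0 <= lo <= hi:
            raise ValueError(f"{name}: need 0 <= min_workers <= max_workers, got {lo} > {hi}")
    # The autoscaler never exceeds the sum of the per-type maxima.
    return sum(cfg["max_workers"] for cfg in types.values())

print(validate(node_types))  # → 5
```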

# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
#    "/path1/on/remote/machine": "/path1/on/local/machine",
#    "/path2/on/remote/machine": "/path2/on/local/machine",
}

# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []

# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False

# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude: []

# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter: []
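The exclude patterns use rsync's glob syntax. rsync's full filter language is richer, but for simple globs the effect can be previewed with Python's `fnmatch`; the patterns and filenames below are illustrative, not taken from the config above.

```python
import fnmatch

# Illustrative rsync_exclude-style patterns (hypothetical values).
patterns = ["*.log", "__pycache__"]
files = ["train.py", "debug.log", "__pycache__", "data.csv"]

# Keep only files that match none of the exclude patterns.
synced = [f for f in files
          if not any(fnmatch.fnmatch(f, p) for p in patterns)]
print(synced)  # → ['train.py', 'data.csv']
```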

# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []

# List of shell commands to run to set up nodes.
setup_commands: []

# Custom commands that will be run on the head node after common setup.
head_setup_commands:
    - pip install 'git+https://github.com/vmware/vsphere-automation-sdk-python.git'

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --head --port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
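With the configuration above saved to a file, the cluster is managed through the Ray CLI. The filename `vsphere.yaml` here is a hypothetical example:

```shell
ray up vsphere.yaml        # create or update the cluster
ray attach vsphere.yaml    # open an SSH shell on the head node
ray down vsphere.yaml      # terminate the head and worker nodes
```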

TPU Configuration#

TPU VMs can be used on GCP. Currently, TPU pods (TPUs other than v2-8, v3-8, and v4-8) are not supported.

Before using a configuration with TPUs, make sure the TPU API is enabled for your GCP project.

# A unique identifier for the head node and workers of this cluster.
cluster_name: tputest

# The maximum number of worker nodes to launch in addition to the head node.
max_workers: 7

available_node_types:
    ray_head_default:
        resources: {"TPU": 1}  # use TPU custom resource in your code
        node_config:
            # Only v2-8, v3-8 and v4-8 accelerator types are currently supported.
            # Support for TPU pods will be added in the future.
            acceleratorType: v2-8
            runtimeVersion: v2-alpha
            schedulingConfig:
                # Set to false to use non-preemptible TPUs
                preemptible: false
    ray_tpu:
        min_workers: 1
        resources: {"TPU": 1}  # use TPU custom resource in your code
        node_config:
            acceleratorType: v2-8
            runtimeVersion: v2-alpha
            schedulingConfig:
                preemptible: true

provider:
    type: gcp
    region: us-central1
    availability_zone: us-central1-b
    project_id: null # Replace this with your GCP project ID.

setup_commands:
  - sudo apt install python-is-python3 -y
  - pip3 install --upgrade pip
  - pip3 install -U "ray[default]"

# Specify the node type of the head node (as configured above).
head_node_type: ray_head_default
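The `"TPU": 1` entries above expose the accelerator as a Ray custom resource; application code then requests it through the `resources` argument of `ray.remote`. A minimal sketch, assuming Ray is installed and `ray.init()` connects to this cluster:

```python
import ray

ray.init()  # connect to the cluster launched from the config above

@ray.remote(resources={"TPU": 1})
def tpu_task():
    # This task is scheduled only on a node advertising the TPU custom resource.
    ...
```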