Quickstart examples#
Deployment through OpenAiIngress#
You can deploy LLMs using either the builder pattern or the bind pattern. The first example below uses the builder pattern; the second is the equivalent bind-pattern deployment.
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app
llm_config = LLMConfig(
model_loading_config={
"model_id": "qwen-0.5b",
"model_source": "Qwen/Qwen2.5-0.5B-Instruct",
},
deployment_config={
"autoscaling_config": {
"min_replicas": 1,
"max_replicas": 2,
}
},
# Pass the desired accelerator type (e.g. A10G, L4, etc.)
accelerator_type="A10G",
# You can customize the engine arguments (e.g. vLLM engine kwargs)
engine_kwargs={
"tensor_parallel_size": 2,
},
)
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
The equivalent deployment with the bind pattern:
from ray import serve
from ray.serve.llm import LLMConfig
from ray.serve.llm.deployment import LLMServer
from ray.serve.llm.ingress import OpenAiIngress, make_fastapi_ingress
llm_config = LLMConfig(
model_loading_config=dict(
model_id="qwen-0.5b",
model_source="Qwen/Qwen2.5-0.5B-Instruct",
),
deployment_config=dict(
autoscaling_config=dict(
min_replicas=1, max_replicas=2,
)
),
# Pass the desired accelerator type (e.g. A10G, L4, etc.)
accelerator_type="A10G",
# You can customize the engine arguments (e.g. vLLM engine kwargs)
engine_kwargs=dict(
tensor_parallel_size=2,
),
)
# Deploy the application
server_options = LLMServer.get_deployment_options(llm_config)
server_deployment = serve.deployment(LLMServer).options(
**server_options).bind(llm_config)
ingress_options = OpenAiIngress.get_deployment_options(
llm_configs=[llm_config])
ingress_cls = make_fastapi_ingress(OpenAiIngress)
ingress_deployment = serve.deployment(ingress_cls).options(
**ingress_options).bind([server_deployment])
serve.run(ingress_deployment, blocking=True)
You can query the deployed model with cURL or the OpenAI Python client.
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer fake-key" \
-d '{
"model": "qwen-0.5b",
"messages": [{"role": "user", "content": "Hello!"}]
}'
from openai import OpenAI
# Initialize client
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")
# Basic chat completion with streaming
response = client.chat.completions.create(
model="qwen-0.5b",
messages=[{"role": "user", "content": "Hello!"}],
stream=True
)
for chunk in response:
if chunk.choices[0].delta.content is not None:
print(chunk.choices[0].delta.content, end="", flush=True)
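If you don't need token-by-token output, the same endpoint also works without streaming. A minimal sketch, assuming the deployment above is reachable on localhost:8000:
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

# Non-streaming chat completion: the full reply arrives in a single response object
response = client.chat.completions.create(
    model="qwen-0.5b",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)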
To deploy multiple models, pass a list of LLMConfig objects to the OpenAiIngress deployment.
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app
llm_config1 = LLMConfig(
model_loading_config=dict(
model_id="qwen-0.5b",
model_source="Qwen/Qwen2.5-0.5B-Instruct",
),
deployment_config=dict(
autoscaling_config=dict(
min_replicas=1, max_replicas=2,
)
),
accelerator_type="A10G",
)
llm_config2 = LLMConfig(
model_loading_config=dict(
model_id="qwen-1.5b",
model_source="Qwen/Qwen2.5-1.5B-Instruct",
),
deployment_config=dict(
autoscaling_config=dict(
min_replicas=1, max_replicas=2,
)
),
accelerator_type="A10G",
)
app = build_openai_app({"llm_configs": [llm_config1, llm_config2]})
serve.run(app, blocking=True)
The same two models deployed with the bind pattern:
from ray import serve
from ray.serve.llm import LLMConfig
from ray.serve.llm.deployment import LLMServer
from ray.serve.llm.ingress import OpenAiIngress, make_fastapi_ingress
llm_config1 = LLMConfig(
model_loading_config=dict(
model_id="qwen-0.5b",
model_source="Qwen/Qwen2.5-0.5B-Instruct",
),
deployment_config=dict(
autoscaling_config=dict(
min_replicas=1, max_replicas=2,
)
),
accelerator_type="A10G",
)
llm_config2 = LLMConfig(
model_loading_config=dict(
model_id="qwen-1.5b",
model_source="Qwen/Qwen2.5-1.5B-Instruct",
),
deployment_config=dict(
autoscaling_config=dict(
min_replicas=1, max_replicas=2,
)
),
accelerator_type="A10G",
)
# deployment #1
server_options1 = LLMServer.get_deployment_options(llm_config1)
server_deployment1 = serve.deployment(LLMServer).options(
**server_options1).bind(llm_config1)
# deployment #2
server_options2 = LLMServer.get_deployment_options(llm_config2)
server_deployment2 = serve.deployment(LLMServer).options(
**server_options2).bind(llm_config2)
# ingress
ingress_options = OpenAiIngress.get_deployment_options(
llm_configs=[llm_config1, llm_config2])
ingress_cls = make_fastapi_ingress(OpenAiIngress)
ingress_deployment = serve.deployment(ingress_cls).options(
**ingress_options).bind([server_deployment1, server_deployment2])
# run
serve.run(ingress_deployment, blocking=True)
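With multiple models behind a single ingress, the client picks a model by its model_id. A minimal sketch, assuming the multi-model deployment above is running on localhost:8000:
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

# Route the request to one of the deployed models by its model_id
response = client.chat.completions.create(
    model="qwen-1.5b",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)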
Production deployment#
For production deployments, Ray Serve LLM provides utilities for config-driven deployments. You can specify the deployment configuration in a YAML file, either inline or by referencing standalone per-model config files.
# config.yaml
applications:
- args:
llm_configs:
- model_loading_config:
model_id: qwen-0.5b
model_source: Qwen/Qwen2.5-0.5B-Instruct
accelerator_type: A10G
deployment_config:
autoscaling_config:
min_replicas: 1
max_replicas: 2
- model_loading_config:
model_id: qwen-1.5b
model_source: Qwen/Qwen2.5-1.5B-Instruct
accelerator_type: A10G
deployment_config:
autoscaling_config:
min_replicas: 1
max_replicas: 2
import_path: ray.serve.llm:build_openai_app
name: llm_app
route_prefix: "/"
Alternatively, reference standalone per-model config files:
# config.yaml
applications:
- args:
llm_configs:
- models/qwen-0.5b.yaml
- models/qwen-1.5b.yaml
import_path: ray.serve.llm:build_openai_app
name: llm_app
route_prefix: "/"
# models/qwen-0.5b.yaml
model_loading_config:
model_id: qwen-0.5b
model_source: Qwen/Qwen2.5-0.5B-Instruct
accelerator_type: A10G
deployment_config:
autoscaling_config:
min_replicas: 1
max_replicas: 2
# models/qwen-1.5b.yaml
model_loading_config:
model_id: qwen-1.5b
model_source: Qwen/Qwen2.5-1.5B-Instruct
accelerator_type: A10G
deployment_config:
autoscaling_config:
min_replicas: 1
max_replicas: 2
Deploy with either config file:
serve run config.yaml
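To sanity-check the running application, you can list the models that the ingress serves through its OpenAI-compatible models route. A minimal sketch, assuming the app is reachable on localhost:8000:
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

# List the models registered with the ingress; both qwen-0.5b and qwen-1.5b should appear
for model in client.models.list():
    print(model.id)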
For monitoring and observability, see Observability.
Advanced usage patterns#
For each usage pattern, Ray Serve LLM provides server and client code snippets.
Cross-node parallelism#
Ray Serve LLM supports cross-node tensor parallelism (TP) and pipeline parallelism (PP), which let you distribute model inference across multiple GPUs and nodes. See Cross-node parallelism for a complete guide to configuring and deploying models with cross-node parallelism.
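As a rough sketch of what such a configuration can look like, the parallelism degrees go into engine_kwargs and are forwarded to the vLLM engine. The model and the degrees below (TP=2 x PP=2, four GPUs per replica) are placeholder assumptions, not recommendations:
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

# Sketch: shard each model replica across 4 GPUs
# (2-way tensor parallel x 2-way pipeline parallel),
# which Ray can place across multiple nodes.
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="qwen-0.5b",
        model_source="Qwen/Qwen2.5-0.5B-Instruct",
    ),
    accelerator_type="A10G",
    engine_kwargs=dict(
        tensor_parallel_size=2,
        pipeline_parallel_size=2,
    ),
)

app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)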