vLLM compatibility#
Ray Serve LLM provides an OpenAI-compatible API that aligns with vLLM's OpenAI-compatible server. Most engine_kwargs that work with vllm serve also work in Ray Serve LLM, which lets you use vLLM's feature set while scaling deployments with Ray Serve's distributed serving capabilities.
This compatibility means you can:
Use the same model configurations and engine arguments as vLLM
Leverage the latest vLLM features (multimodal inputs, structured output, reasoning models)
Switch between vllm serve and Ray Serve LLM without code changes, and scale out with Ray Serve's production features (autoscaling, multi-model serving, advanced routing)
This guide covers how to use vLLM features such as embeddings, structured output, vision language models, and reasoning models with Ray Serve.
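As a quick orientation, engine arguments that you would pass to vllm serve on the command line map directly onto the engine_kwargs field of an LLMConfig. The following is a minimal sketch; the model and argument values are illustrative, not recommendations:
# CLI: vllm serve Qwen/Qwen2.5-0.5B-Instruct --max-model-len 8192 --gpu-memory-utilization 0.9
# The same engine arguments expressed as a Ray Serve LLM config:
from ray.serve.llm import LLMConfig
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="qwen-0.5b",
        model_source="Qwen/Qwen2.5-0.5B-Instruct",
    ),
    engine_kwargs=dict(
        max_model_len=8192,
        gpu_memory_utilization=0.9,
    ),
)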
Embeddings#
You can generate embeddings by setting the task parameter to "embed" in the engine arguments. Models that support this use case are listed in the vLLM text embedding models documentation.
Deploy an embedding model#
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app
llm_config = LLMConfig(
model_loading_config=dict(
model_id="qwen-0.5b",
model_source="Qwen/Qwen2.5-0.5B-Instruct",
),
engine_kwargs=dict(
task="embed",
),
)
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
from openai import OpenAI
# Initialize client
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")
# Generate embeddings
response = client.embeddings.create(
model="qwen-0.5b",
input=["A text to embed", "Another text to embed"],
)
for data in response.data:
    print(data.embedding)  # Embedding vector as a list of floats
curl -X POST http://localhost:8000/v1/embeddings \
-H "Content-Type: application/json" \
-H "Authorization: Bearer fake-key" \
-d '{
"model": "qwen-0.5b",
"input": ["A text to embed", "Another text to embed"],
"encoding_format": "float"
}'
Transcription#
You can generate audio transcriptions using speech-to-text (STT) models trained specifically for automatic speech recognition (ASR) tasks. Models that support this use case are listed in the vLLM transcription models documentation.
Deploy a transcription model#
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app
llm_config = LLMConfig(
model_loading_config={
"model_id": "voxtral-mini",
"model_source": "mistralai/Voxtral-Mini-3B-2507",
},
deployment_config={
"autoscaling_config": {
"min_replicas": 1,
"max_replicas": 4,
}
},
accelerator_type="A10G",
# You can customize the engine arguments (e.g. vLLM engine kwargs)
engine_kwargs={
"tokenizer_mode": "mistral",
"config_format": "mistral",
"load_format": "mistral",
},
log_engine_metrics=True,
)
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
from openai import OpenAI
# Initialize client
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")
# Open audio file
with open("/path/to/audio.wav", "rb") as f:
# Make a request to the transcription model
response = client.audio.transcriptions.create(
model="whisper-large",
file=f,
temperature=0.0,
language="en",
)
print(response.text)
curl http://localhost:8000/v1/audio/transcriptions \
-X POST \
-H "Authorization: Bearer fake-key" \
-F "file=@/path/to/audio.wav" \
-F "model=whisper-large" \
-F "temperature=0.0" \
-F "language=en"
Structured output#
You can request structured JSON output similar to the OpenAI API, using either JSON mode or JSON schema validation with Pydantic models.
JSON mode#
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app
llm_config = LLMConfig(
model_loading_config=dict(
model_id="qwen-0.5b",
model_source="Qwen/Qwen2.5-0.5B-Instruct",
),
deployment_config=dict(
autoscaling_config=dict(
min_replicas=1,
max_replicas=2,
)
),
accelerator_type="A10G",
)
# Build and deploy the model
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
from openai import OpenAI
# Initialize client
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")
# Request structured JSON output
response = client.chat.completions.create(
model="qwen-0.5b",
response_format={"type": "json_object"},
messages=[
{
"role": "system",
"content": "You are a helpful assistant that outputs JSON."
},
{
"role": "user",
"content": "List three colors in JSON format"
}
],
stream=True,
)
for chunk in response:
if chunk.choices[0].delta.content is not None:
print(chunk.choices[0].delta.content, end="", flush=True)
# Example response:
# {
# "colors": [
# "red",
# "blue",
# "green"
# ]
# }
JSON schema with Pydantic#
You can specify the exact schema you want for the response using a Pydantic model:
from openai import OpenAI
from typing import List, Literal
from pydantic import BaseModel
# Initialize client
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")
# Define a pydantic model of a preset of allowed colors
class Color(BaseModel):
colors: List[Literal["cyan", "magenta", "yellow"]]
# Request structured JSON output
response = client.chat.completions.create(
model="qwen-0.5b",
response_format={
"type": "json_schema",
"json_schema": Color.model_json_schema()
},
messages=[
{
"role": "system",
"content": "You are a helpful assistant that outputs JSON."
},
{
"role": "user",
"content": "List three colors in JSON format"
}
],
stream=True,
)
for chunk in response:
if chunk.choices[0].delta.content is not None:
print(chunk.choices[0].delta.content, end="", flush=True)
# Example response:
# {
# "colors": [
# "cyan",
# "magenta",
# "yellow"
# ]
# }
Vision language models#
You can deploy multimodal models that process both text and images. Ray Serve LLM supports vision models through vLLM's multimodal capabilities.
Deploy a vision model#
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app
# Configure a vision model
llm_config = LLMConfig(
model_loading_config=dict(
model_id="pixtral-12b",
model_source="mistral-community/pixtral-12b",
),
deployment_config=dict(
autoscaling_config=dict(
min_replicas=1,
max_replicas=2,
)
),
accelerator_type="L40S",
engine_kwargs=dict(
tensor_parallel_size=1,
max_model_len=8192,
),
)
# Build and deploy the model
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
from openai import OpenAI
# Initialize client
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")
# Create and send a request with an image
response = client.chat.completions.create(
model="pixtral-12b",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "What's in this image?"
},
{
"type": "image_url",
"image_url": {
"url": "https://example.com/image.jpg"
}
}
]
}
],
stream=True,
)
for chunk in response:
if chunk.choices[0].delta.content is not None:
print(chunk.choices[0].delta.content, end="", flush=True)
Supported models#
For a complete list of supported vision models, see the vLLM multimodal models documentation.
Reasoning models#
Ray Serve LLM supports reasoning models such as DeepSeek-R1 and QwQ through vLLM. These models use an extended thinking process before generating a final response.
For reasoning model support and configuration, see the vLLM reasoning models documentation.
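The deployment pattern mirrors the examples above. The following is a hedged sketch, not a verified configuration: the model, accelerator type, parallelism, and the reasoning_parser engine argument are assumptions to check against the vLLM reasoning documentation for your vLLM version. When a reasoning parser is configured, vLLM returns the thinking trace in a reasoning_content field separate from the final answer.
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app
# Illustrative reasoning-model deployment; verify the parser name and hardware for your setup.
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="qwq-32b",
        model_source="Qwen/QwQ-32B",
    ),
    accelerator_type="A100",  # illustrative; pick hardware that fits the model
    engine_kwargs=dict(
        tensor_parallel_size=4,
        reasoning_parser="deepseek_r1",  # assumed parser name; see the vLLM reasoning models docs
    ),
)
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
from openai import OpenAI
# Initialize client
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")
response = client.chat.completions.create(
    model="qwq-32b",
    messages=[{"role": "user", "content": "How many r's are in 'strawberry'?"}],
)
message = response.choices[0].message
# The thinking trace, if the parser separated it from the final answer
print(getattr(message, "reasoning_content", None))
print(message.content)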
See also#
vLLM supported models - Complete list of supported models and features
vLLM OpenAI compatibility - vLLM's OpenAI-compatible server documentation
Quickstart - Basic LLM deployment examples.