配置源:
# Point apt at the Tsinghua (TUNA) mirror using a deb822-style sources file.
# The file lives under /etc/apt, so it is written through `sudo tee`; a plain
# `cat > file` redirect would be opened by the unprivileged shell.
# The quoted 'EOF' delimiter keeps the heredoc body literal (no expansion).
sudo tee /etc/apt/sources.list.d/ubuntu.sources > /dev/null <<'EOF'
Types: deb
URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
Suites: noble noble-updates noble-security
Components: main restricted universe multiverse
Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg
EOF
sudo apt update

# Install commonly used tools
sudo apt install -y curl wget lsb-release gnupg ca-certificates
安装驱动:
# Install the driver management tool
sudo apt update
sudo apt install -y ubuntu-drivers-common

# Detect the hardware and install the recommended driver
# (`install` is the current name of the deprecated `autoinstall` subcommand)
sudo ubuntu-drivers install

# Reboot so the newly installed driver takes effect
sudo reboot

# After the reboot, run nvidia-smi; if it prints output the driver is installed
nvidia-smi
安装docker:
# Import the Docker signing key (fetched from the Aliyun mirror) so the
# repository can be verified. Only gpg needs root to write the keyring;
# --yes lets the command be re-run when the keyring file already exists.
curl -fsSL https://mirrors.aliyun.com/docker-ce/linux/ubuntu/gpg \
  | sudo gpg --dearmor --yes -o /usr/share/keyrings/docker-archive-keyring.gpg

# Make sure the apt sources directory exists
sudo mkdir -p /etc/apt/sources.list.d

# Add the Docker apt repository, served from the Aliyun mirror to speed up
# downloads. The architecture is taken from dpkg instead of being hard-coded.
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://mirrors.aliyun.com/docker-ce/linux/ubuntu $(lsb_release -cs) stable" \
  | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null

# Refresh the package index and install Docker
sudo apt update
sudo apt install -y docker-ce docker-ce-cli containerd.io

# Print the Docker version; any output means the installation succeeded
docker -v

# Start the Docker service now and enable it at boot
sudo systemctl start docker
sudo systemctl enable docker
配置docker加速:
# Write the Docker daemon configuration with the registry mirror list.
# The quoted 'EOF' delimiter keeps the JSON literal (no shell expansion).
# NOTE(review): some entries (gcr.io, registry.*.aliyuncs.com) look like
# general-purpose registries rather than Docker Hub mirrors — confirm that
# they are intended to be listed under "registry-mirrors".
sudo mkdir -p /etc/docker
sudo tee /etc/docker/daemon.json <<'EOF'
{
  "registry-mirrors": [
    "https://docker.1ms.run",
    "https://14mc21ucusppmu.xuanyuan.run",
    "https://dockerproxy.net",
    "https://docker.m.daocloud.io",
    "https://docker.1panel.live",
    "https://docker.fxxk.dedyn.io",
    "https://registry.cn-hangzhou.aliyuncs.com",
    "https://registry.cn-shanghai.aliyuncs.com",
    "https://registry.cn-qingdao.aliyuncs.com",
    "https://registry.cn-beijing.aliyuncs.com",
    "https://registry.cn-zhangjiakou.aliyuncs.com",
    "https://registry.cn-huhehaote.aliyuncs.com",
    "https://registry.cn-wulanchabu.aliyuncs.com",
    "https://registry.cn-shenzhen.aliyuncs.com",
    "https://registry.cn-heyuan.aliyuncs.com",
    "https://registry.cn-guangzhou.aliyuncs.com",
    "https://registry.cn-chengdu.aliyuncs.com",
    "https://docker-0.unsee.tech",
    "https://registry.cn-hongkong.aliyuncs.com",
    "https://registry.ap-northeast-1.aliyuncs.com",
    "https://registry.ap-southeast-1.aliyuncs.com",
    "https://registry.ap-southeast-3.aliyuncs.com",
    "https://registry.ap-southeast-5.aliyuncs.com",
    "https://registry.eu-central-1.aliyuncs.com",
    "https://registry.eu-west-1.aliyuncs.com",
    "https://registry.us-west-1.aliyuncs.com",
    "https://registry.us-east-1.aliyuncs.com",
    "https://registry.me-east-1.aliyuncs.com",
    "https://mirror.ccs.tencentyun.com",
    "https://gcr.io",
    "https://asia.gcr.io",
    "https://eu.gcr.io",
    "https://registry.hub.docker.com"
  ],
  "insecure-registries": [
    "docker.fxxk.dedyn.io"
  ]
}
EOF

# Reload systemd units and restart Docker so the new configuration is read
sudo systemctl daemon-reload
sudo systemctl restart docker

# Verify that the mirror configuration is active
# (sudo for consistency with the `docker run` below, which talks to the same daemon)
sudo docker info | grep -i mirror

# Test pulling and running an image
sudo docker run hello-world

NVIDIA container toolkit:
# Prerequisites for fetching and importing the repository key
sudo apt-get update && sudo apt-get install -y --no-install-recommends \
  curl \
  gnupg2

# Use the USTC mirror: import its signing key, then download the repository
# list, rewrite each `deb https://nvidia.github.io` entry to point at the
# mirror with a signed-by keyring, and install it as an apt source.
curl -fsSL https://mirrors.ustc.edu.cn/libnvidia-container/gpgkey \
  | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://mirrors.ustc.edu.cn/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
  | sed 's#deb https://nvidia.github.io#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://mirrors.ustc.edu.cn#g' \
  | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

# Install the container toolkit
sudo apt-get update
sudo apt-get install -y nvidia-container-toolkit

# Configure Docker to use the NVIDIA container runtime
sudo nvidia-ctk runtime configure --runtime=docker

# Restart the Docker service so the configuration takes effect
sudo systemctl restart docker

# Check the package status (a listed version means the install succeeded)
dpkg -l | grep nvidia-container-toolkit

# Verification run: if nvidia-smi output appears, GPU containers work
docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi
拉取vLLM镜像:
# Pull the latest stable vLLM GPU image
docker pull vllm/vllm-openai:latest
安装venv虚拟环境:
# 1. Install pip and venv; both are needed in later steps
sudo apt update && sudo apt upgrade -y
sudo apt install -y python3-pip python3-venv

# 2. Check that Python 3 is installed (it normally ships with the system;
#    this step is only a confirmation)
python3 --version

# 3. Create the virtual environment
python3 -m venv ~/vllm-env

# 4. Activate the virtual environment (the prompt then shows "(vllm-env)")
source ~/vllm-env/bin/activate

# 5. Leave the virtual environment
deactivate
安装modelscope:
# 1. Activate the virtual environment
source ~/vllm-env/bin/activate

# 2. Install modelscope from the Aliyun PyPI mirror
pip install modelscope -i https://mirrors.aliyun.com/pypi/simple/

# 3. Leave the virtual environment when done
deactivate
Qwen3-4B-Instruct-2507-FP8语言模型:
# 1. Activate the virtual environment
source ~/vllm-env/bin/activate
# 2. Download the model
modelscope download --model Qwen/Qwen3-4B-Instruct-2507-FP8 --local_dir ~/models/Qwen3-4B-Instruct-2507-FP8
# 3. Leave the virtual environment once the download has finished
deactivate
# 4. Download the non-thinking chat template.
#    The directory is created without sudo: a root-owned directory would make
#    the unprivileged wget below fail with a permission error.
mkdir -p ~/models/chat-template
cd ~/models/chat-template/ \
  && wget https://qwen.readthedocs.io/en/latest/_downloads/c101120b5bebcc2f12ec504fc93a965e/qwen3_nonthinking.jinja
# 5. Start the model.
#    The models directory that was downloaded into above ($HOME/models) is
#    mounted read-only at /models; host port 18080 maps to container port 8080.
docker run -d --name vllm-qwen3-4b-fp8 \
  --gpus '"device=0"' \
  --shm-size=2g \
  -p 18080:8080 \
  -v "$HOME/models:/models:ro" \
  vllm/vllm-openai:latest \
  --host 0.0.0.0 \
  --port 8080 \
  --model /models/Qwen3-4B-Instruct-2507-FP8 \
  --served-model-name qwen3-4b-fp8 \
  --tensor-parallel-size 1 \
  --max-num-seqs 16 \
  --enable-auto-tool-choice \
  --chat-template /models/chat-template/qwen3_nonthinking.jinja \
  --tool-call-parser hermes \
  --gpu-memory-utilization 0.5 \
  --max-model-len 4096
# Follow the container log (press Ctrl-C to stop following)
docker logs -f vllm-qwen3-4b-fp8
# Test the model through the OpenAI-compatible chat endpoint
curl http://127.0.0.1:18080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
"model": "qwen3-4b-fp8",
"messages": [
{"role": "system", "content": "你是一个有帮助的助手。"},
{"role": "user", "content": "用一句话介绍 vLLM。"}
],
"temperature": 0.7
}'

BAAI/bge-m3文本向量模型:
# 1. Activate the virtual environment
source ~/vllm-env/bin/activate
# 2. Download the model
modelscope download --model BAAI/bge-m3 --local_dir ~/models/bge-m3
# 3. Leave the virtual environment once the download has finished
deactivate
# 4. Start the model.
#    The models directory that was downloaded into above ($HOME/models) is
#    mounted read-only at /models; host port 18081 maps to container port 8081.
docker run -d --name vllm-bge-m3 \
  --gpus '"device=0"' \
  --shm-size=2g \
  -p 18081:8081 \
  -v "$HOME/models:/models:ro" \
  vllm/vllm-openai:latest \
  --host 0.0.0.0 \
  --port 8081 \
  --model /models/bge-m3 \
  --served-model-name bge-m3 \
  --max-num-seqs 4 \
  --max-model-len 8192
# 5. Follow the container log (press Ctrl-C to stop following)
docker logs -f vllm-bge-m3
# Test the model through the embeddings endpoint
curl -X POST "http://127.0.0.1:18081/v1/embeddings" \
  -H "accept: application/json" \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer " \
  -d '{
"model": "bge-m3",
"input": "大语言模型(Large Language Model,简称 LLM)是一种基于深度学习技术的人工智能模型。"
}'

BAAI/bge-reranker-v2-m3下载运行:
# 1. Activate the virtual environment
source ~/vllm-env/bin/activate
# 2. Download the model (into the same ~/models tree the other sections use)
modelscope download --model BAAI/bge-reranker-v2-m3 --local_dir ~/models/bge-reranker-v2-m3
# 3. Leave the virtual environment once the download has finished
deactivate
# 4. Start the model.
#    $HOME/models is mounted read-only at /models; host port 18082 maps to
#    container port 8082.
docker run -d --name vllm-bge-reranker-v2-m3 \
  --gpus '"device=0"' \
  --shm-size=2g \
  -p 18082:8082 \
  -v "$HOME/models:/models:ro" \
  vllm/vllm-openai:latest \
  --host 0.0.0.0 \
  --port 8082 \
  --model /models/bge-reranker-v2-m3 \
  --served-model-name bge-reranker-v2-m3 \
  --max-model-len 8192
# 5. Follow the container log (press Ctrl-C to stop following)
docker logs -f vllm-bge-reranker-v2-m3
# 6. Test the model through the rerank endpoint
curl -X POST "http://127.0.0.1:18082/v1/rerank" \
  -H "Content-Type: application/json" \
  -d '{
"model": "bge-reranker-v2-m3",
"query": "什么是大语言模型?",
"documents": [
"大语言模型是基于海量文本数据训练的深度学习模型,能理解和生成人类语言。",
"猫是一种常见的家庭宠物,性格温顺,喜欢吃鱼。",
"大语言模型的代表产品有 GPT、LLaMA 等,广泛用于对话、写作等场景。",
"大语言模型(Large Language Model,简称 LLM)是一种基于深度学习技术的人工智能模型,更可靠的内容生成机制(如事实核查、引用溯源)以及更完善的安全防护策略(如数据脱敏、权限控制、偏见检测),以推动大语言模型向更安全、更可控、更实用的方向发展。",
" 大语言模型主要有有 GPT、LLaMA 和qwen,广泛用于对话、写作等场景。",
"什么是大型建模的模具,大型建模的模具就是用来生成制造大型建模的工具"
],
"top_n": 6
}'

vLLM部署运行qwen3.5系列模型:
# Pull the vLLM nightly image
docker pull vllm/vllm-openai:nightly
# Inside the virtual environment, download the Qwen3.5-2B model
source ~/vllm-env/bin/activate
modelscope download --model Qwen/Qwen3.5-2B --local_dir ~/models/Qwen3.5-2B
deactivate
# Run Qwen3.5 with the nightly image.
# The model directory is mounted at the same path that is passed as the
# model argument; host port 18088 maps to container port 8088.
docker run -d \
  --name vllm-qwen3.5-2b \
  --gpus all \
  -p 18088:8088 \
  --ipc=host \
  -v ~/models/Qwen3.5-2B:/models/Qwen3.5-2B \
  vllm/vllm-openai:nightly \
  /models/Qwen3.5-2B \
  --port 8088 \
  --served-model-name qwen3.5-2b \
  --tensor-parallel-size 1 \
  --max-model-len 4096 \
  --gpu-memory-utilization 0.5 \
  --max-num-seqs 8 \
  --language-model-only
# Test command (OpenAI-compatible chat endpoint)
curl http://127.0.0.1:18088/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
"model": "qwen3.5-2b",
"messages": [
{"role": "system", "content": "你是一个有帮助的助手。"},
{"role": "user", "content": "用一句话介绍 vLLM。"}
],
"temperature": 0.7
}'

sglang部署运行qwen3.5系列模型:
# Pull the sglang image
docker pull lmsysorg/sglang:latest
# Inside the virtual environment, download the Qwen3.5-2B model
source ~/vllm-env/bin/activate
modelscope download --model Qwen/Qwen3.5-2B --local_dir ~/models/Qwen3.5-2B
deactivate
# Run Qwen3.5 with sglang.
# ~/models is mounted at /models; host port 8090 maps to container port 8000.
docker run -d --gpus all \
  --name sglang-qwen3.5-2b \
  -p 8090:8000 \
  -v ~/models:/models \
  lmsysorg/sglang:latest \
  python -m sglang.launch_server \
  --model-path /models/Qwen3.5-2B \
  --served-model-name qwen3.5-2b \
  --port 8000 \
  --host 0.0.0.0 \
  --tp-size 1 \
  --mem-fraction-static 0.7 \
  --context-length 4096 \
  --attention-backend triton
# Test command (OpenAI-compatible chat endpoint)
curl http://127.0.0.1:8090/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
"model": "qwen3.5-2b",
"messages": [
{"role": "system", "content": "你是一个有帮助的助手。"},
{"role": "user", "content": "用一句话介绍 vLLM。"}
],
"temperature": 0.7
}'

参考:
https://www.jianshu.com/p/47f88e3fa2c7