vLLM: Unterschied zwischen den Versionen
Zur Navigation springen
Zur Suche springen
Keine Bearbeitungszusammenfassung |
|||
| (5 dazwischenliegende Versionen desselben Benutzers werden nicht angezeigt) | |||
| Zeile 13: | Zeile 13: | ||
=== Ausführen === | === Ausführen === | ||
Variante 1 | ==== Variante 1 ==== | ||
<syntaxhighlight lang="bash" line="1"> | |||
docker run -it --rm --shm-size=8g --device=/dev/kfd --device=/dev/dri \ | docker run -it --rm --shm-size=8g --device=/dev/kfd --device=/dev/dri \ | ||
--group-add video -p 8086:8000 \ | --group-add video -p 8086:8000 \ | ||
| Zeile 20: | Zeile 21: | ||
vllm serve /models/Qwen3-Coder-30B-A3B-Instruct-AWQ-4bit --served-model-name Homelab --max-model-len 30000 --enable-auto-tool-choice --tool-call-parser hermes | vllm serve /models/Qwen3-Coder-30B-A3B-Instruct-AWQ-4bit --served-model-name Homelab --max-model-len 30000 --enable-auto-tool-choice --tool-call-parser hermes | ||
</syntaxhighlight>Variante 2, getestet 18.12.2025:<syntaxhighlight lang="bash"> | </syntaxhighlight>Variante 2, getestet 18.12.2025:<syntaxhighlight lang="bash"> | ||
sudo docker run -it --rm --network=host --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device /dev/kfd --device /dev/dri -v /home/hendrik/.lmstudio/models/:/app/models -e HF_HOME="/app/models" -e HF_TOKEN="<TOKEN>" -e NCCL_P2P_DISABLE=1 -e VLLM_CUSTOM_OPS=all -e VLLM_ROCM_USE_AITER=0 -e SAFETENSORS_FAST_GPU=1 -e PYTORCH_TUNABLEOP_ENABLED=1 rocm/vllm-dev:nightly | |||
</syntaxhighlight> | |||
Für gfx1201:<syntaxhighlight lang="bash"> | |||
sudo docker run -it --rm --network=host \ | sudo docker run -it --rm --network=host \ | ||
--group-add=video --ipc=host --cap-add=SYS_PTRACE \ | --group-add=video --ipc=host --cap-add=SYS_PTRACE \ | ||
| Zeile 32: | Zeile 37: | ||
-e SAFETENSORS_FAST_GPU=1 \ | -e SAFETENSORS_FAST_GPU=1 \ | ||
-e PYTORCH_TUNABLEOP_ENABLED=1 | -e PYTORCH_TUNABLEOP_ENABLED=1 | ||
kyuz0/vllm-therock-gfx1201 | |||
</syntaxhighlight>Ohne Tensor Parallelism:<syntaxhighlight lang="bash"> | </syntaxhighlight> | ||
vllm serve Qwen/Qwen3-VL-8B-Thinking --max_model_len 4096 --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser qwen3 | |||
Ohne Tensor Parallelism:<syntaxhighlight lang="bash"> | ||
vllm serve Qwen/Qwen3-VL-8B-Thinking --served-model-name Homelab --max_model_len 4096 --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser qwen3 | |||
</syntaxhighlight>Mit:<syntaxhighlight lang="bash"> | </syntaxhighlight>Mit:<syntaxhighlight lang="bash"> | ||
vllm serve Qwen/Qwen3-VL-8B-Thinking --tp 2 --max_model_len 4096 --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser qwen3 | vllm serve Qwen/Qwen3-VL-8B-Thinking --served-model-name Homelab --tp 2 --max_model_len 4096 --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser qwen3 | ||
</syntaxhighlight>Benchmark: | </syntaxhighlight> | ||
Benchmark: | |||
<syntaxhighlight lang="bash"> | |||
vllm bench serve --num-prompts 1 --dataset-name=random --input-len 512 --output-len 128 --model Qwen/Qwen3-4B-Instruct-2507-FP8 | |||
</syntaxhighlight> | |||
==== Variante 2 (Pro W7800) ==== | |||
<syntaxhighlight lang="bash" line="1"> | |||
#!/bin/bash | |||
docker run --rm \ | |||
--device /dev/kfd \ | |||
--device /dev/dri \ | |||
-e HSA_ENABLE_IPC_MODE_LEGACY=0 \ | |||
-e HIP_VISIBLE_DEVICES=1 \ | |||
-p 8000:8000 \ | |||
-v ~/.cache/huggingface:/root/.cache/huggingface \ | |||
--ipc=host \ | |||
rocm/vllm-dev:nightly_main_20260318 \ | |||
vllm serve cyankiwi/Qwen3.5-35B-A3B-AWQ-4bit \ | |||
--tensor-parallel-size 1 \ | |||
--max-model-len 16000 \ | |||
--dtype float16 \ | |||
--reasoning-parser qwen3 \ | |||
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":2}' \ | |||
--enable-auto-tool-choice \ | |||
--tool-call-parser qwen3_coder \ | |||
--language-model-only \ | |||
--served-model-name Homelab | |||
</syntaxhighlight> | |||
=== Test === | === Test === | ||
=== Bekannte Probleme === | === Bekannte Probleme === | ||
Aktuelle Version vom 21. März 2026, 01:40 Uhr
Beschreibung
Docker normal installieren
Download
Normal (ROCm)
docker pull rocm/vllm-dev:nightly
gfx906
docker pull nalanzeyu/vllm-gfx906
Ausführen
Variante 1
docker run -it --rm --shm-size=8g --device=/dev/kfd --device=/dev/dri \
--group-add video -p 8086:8000 \
-v /mnt/share/models:/models \
nalanzeyu/vllm-gfx906 \
vllm serve /models/Qwen3-Coder-30B-A3B-Instruct-AWQ-4bit --served-model-name Homelab --max-model-len 30000 --enable-auto-tool-choice --tool-call-parser hermes
Variante 2, getestet 18.12.2025:
sudo docker run -it --rm --network=host --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device /dev/kfd --device /dev/dri -v /home/hendrik/.lmstudio/models/:/app/models -e HF_HOME="/app/models" -e HF_TOKEN="<TOKEN>" -e NCCL_P2P_DISABLE=1 -e VLLM_CUSTOM_OPS=all -e VLLM_ROCM_USE_AITER=0 -e SAFETENSORS_FAST_GPU=1 -e PYTORCH_TUNABLEOP_ENABLED=1 rocm/vllm-dev:nightly
Für gfx1201:
sudo docker run -it --rm --network=host \
--group-add=video --ipc=host --cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined --device /dev/kfd \
--device /dev/dri \
-v /home/hendrik/.lmstudio/models/:/app/models \
-e HF_HOME="/app/models" \
-e HF_TOKEN="<TOKEN>" \
-e NCCL_P2P_DISABLE=1 \
-e VLLM_CUSTOM_OPS=all \
-e VLLM_ROCM_USE_AITER=0 \
-e SAFETENSORS_FAST_GPU=1 \
-e PYTORCH_TUNABLEOP_ENABLED=1 \
kyuz0/vllm-therock-gfx1201
Ohne Tensor Parallelism:
vllm serve Qwen/Qwen3-VL-8B-Thinking --served-model-name Homelab --max_model_len 4096 --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser qwen3
Mit:
vllm serve Qwen/Qwen3-VL-8B-Thinking --served-model-name Homelab --tp 2 --max_model_len 4096 --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser qwen3
Benchmark:
vllm bench serve --num-prompts 1 --dataset-name=random --input-len 512 --output-len 128 --model Qwen/Qwen3-4B-Instruct-2507-FP8
Variante 2 (Pro W7800)
#!/bin/bash
docker run --rm \
--device /dev/kfd \
--device /dev/dri \
-e HSA_ENABLE_IPC_MODE_LEGACY=0 \
-e HIP_VISIBLE_DEVICES=1 \
-p 8000:8000 \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--ipc=host \
rocm/vllm-dev:nightly_main_20260318 \
vllm serve cyankiwi/Qwen3.5-35B-A3B-AWQ-4bit \
--tensor-parallel-size 1 \
--max-model-len 16000 \
--dtype float16 \
--reasoning-parser qwen3 \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":2}' \
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--language-model-only \
--served-model-name Homelab