=== Description ===

llama.cpp is a C/C++ implementation for inference of Large Language Models (LLMs). It supports several backends (CPU, Vulkan, ROCm, CUDA) and can run quantized models in the GGUF format.

=== Download ===

<syntaxhighlight lang="bash" line="1">
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
</syntaxhighlight>

=== Installation ===

==== Vulkan Build ====

Download the Vulkan SDK, unpack it, change into its directory and run <code>source setup-env.sh</code>. Then change into the directory where llama.cpp should be installed and build it:

<syntaxhighlight lang="bash" line="1">
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
cmake -B build -DGGML_VULKAN=1
cmake --build build --config Release -- -j $(nproc)
</syntaxhighlight>

==== ROCm Build ====

* gfx906 = MI50 support
* gfx1100 = 7900 XTX support

<syntaxhighlight lang="bash" line="1">
# Set up the ROCm environment
echo 'export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH' >> ~/.bashrc
echo 'export HSA_OVERRIDE_GFX_VERSION=9.0.6' >> ~/.bashrc  # for MI50/MI60
source ~/.bashrc

# Build llama.cpp
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx906 \
    -DCMAKE_BUILD_TYPE=Release && \
cmake --build build --config Release -- -j 8
</syntaxhighlight>

For several GPU architectures at once:

<syntaxhighlight lang="bash" line="1">
# For MI50 (gfx906) and RX 7900 XTX (gfx1100)
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS="gfx906;gfx1100" \
    -DCMAKE_BUILD_TYPE=Release && \
cmake --build build --config Release -- -j 8
</syntaxhighlight>
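
To verify that the build actually produced GPU-enabled binaries, a quick sanity check (a suggestion, not an official step) is to list the binaries and look at the libraries the server links against:

<syntaxhighlight lang="bash" line="1">
# The binaries should exist after a successful build
ls -lh ~/llama.cpp/build/bin/llama-server ~/llama.cpp/build/bin/llama-bench

# A ROCm/HIP build should link against the HIP runtime (e.g. libamdhip64)
ldd ~/llama.cpp/build/bin/llama-server | grep -iE 'hip|rocm'
</syntaxhighlight>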

=== Configuration ===

==== Setting up llama-server as a systemd service ====

'''Create the service file:'''
<syntaxhighlight lang="bash" line="1">
sudo nano /etc/systemd/system/llama-server.service
</syntaxhighlight>

'''Service file content (single GPU):'''
<syntaxhighlight lang="ini" line="1">
[Unit]
Description=Llama.cpp Server - Text Generation
After=network.target

[Service]
Type=simple
User=username
Group=username
WorkingDirectory=/home/username/llama.cpp/build/bin

# ROCm/HIP environment variables
Environment="HIP_VISIBLE_DEVICES=0"
Environment="HSA_OVERRIDE_GFX_VERSION=9.0.6"
Environment="PATH=/opt/rocm/bin:/usr/local/bin:/usr/bin:/bin"
Environment="LD_LIBRARY_PATH=/opt/rocm/lib"

# Start the server
ExecStart=/home/username/llama.cpp/build/bin/llama-server \
    -m /home/username/models/model.gguf \
    -ngl 99 \
    --host 0.0.0.0 \
    --port 8080 \
    -c 4096 \
    -b 512 \
    --threads 8

# Restart on failure
Restart=always
RestartSec=10

# Limits
LimitNOFILE=65535
LimitMEMLOCK=infinity

# Logging
StandardOutput=journal
StandardError=journal
SyslogIdentifier=llama-server

[Install]
WantedBy=multi-user.target
</syntaxhighlight>

'''Service file content (multi-GPU with ROCm):'''
<syntaxhighlight lang="ini" line="1">
[Unit]
Description=Llama.cpp ROCm Multi-GPU Server
After=network.target

[Service]
Type=simple
User=username
Group=username
WorkingDirectory=/home/username/llama.cpp/build/bin

# Multi-GPU configuration
Environment="HIP_VISIBLE_DEVICES=0,1"
Environment="HSA_OVERRIDE_GFX_VERSION=9.0.6"
Environment="PATH=/opt/rocm/bin:/usr/local/bin:/usr/bin:/bin"
Environment="LD_LIBRARY_PATH=/opt/rocm/lib"

# Start the server with optimized multi-GPU settings
ExecStart=/home/username/llama.cpp/build/bin/llama-server \
    -m /home/username/models/model.gguf \
    --split-mode row \
    --tensor-split 0.5,0.5 \
    -ngl 99 \
    -fa 1 \
    --host 0.0.0.0 \
    --port 8080 \
    -c 32768 \
    -b 2048 \
    -ub 2048 \
    --threads 8 \
    --parallel 1 \
    --jinja

Restart=always
RestartSec=10
LimitNOFILE=65535
LimitMEMLOCK=infinity
StandardOutput=journal
StandardError=journal
SyslogIdentifier=llama-server

[Install]
WantedBy=multi-user.target
</syntaxhighlight>

'''Activate and start the service:'''
<syntaxhighlight lang="bash" line="1">
# Reload systemd
sudo systemctl daemon-reload

# Enable the service (auto-start at boot)
sudo systemctl enable llama-server

# Start the service
sudo systemctl start llama-server

# Check the status
sudo systemctl status llama-server

# Show the logs
sudo journalctl -u llama-server -f
</syntaxhighlight>

'''Service management:'''
<syntaxhighlight lang="bash" line="1">
# Stop the service
sudo systemctl stop llama-server

# Restart the service
sudo systemctl restart llama-server

# Disable the service
sudo systemctl disable llama-server

# Logs from the last hour
sudo journalctl -u llama-server --since "1 hour ago"
</syntaxhighlight>

==== Multiple services for different models ====

<syntaxhighlight lang="bash" line="1">
# Service for the 20B model on port 8080
sudo nano /etc/systemd/system/llama-server-20b.service

# Service for the 120B model on port 8086
sudo nano /etc/systemd/system/llama-server-120b.service

# Service for embeddings on port 8081
sudo nano /etc/systemd/system/llama-server-embeddings.service

# Enable all services
sudo systemctl daemon-reload
sudo systemctl enable llama-server-20b llama-server-120b llama-server-embeddings
sudo systemctl start llama-server-20b llama-server-120b llama-server-embeddings
</syntaxhighlight>
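
For reference, one of these unit files could look like the following sketch; the model filename <code>model-20b.gguf</code>, the user name and the GPU selection are placeholders, and the ExecStart options should be adapted from the service templates above:

<syntaxhighlight lang="bash" line="1">
# Hypothetical unit for the 20B model on port 8080, written via a heredoc
sudo tee /etc/systemd/system/llama-server-20b.service > /dev/null <<'EOF'
[Unit]
Description=Llama.cpp Server - 20B model
After=network.target

[Service]
Type=simple
User=username
WorkingDirectory=/home/username/llama.cpp/build/bin
Environment="HIP_VISIBLE_DEVICES=0"
Environment="LD_LIBRARY_PATH=/opt/rocm/lib"
ExecStart=/home/username/llama.cpp/build/bin/llama-server \
    -m /home/username/models/model-20b.gguf \
    -ngl 99 --host 0.0.0.0 --port 8080
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF
</syntaxhighlight>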

==== Manual server starts ====

'''Single GPU (Vulkan):'''
<syntaxhighlight lang="bash" line="1">
cd ~/llama.cpp/build/bin
./llama-server \
    -m ~/models/model.gguf \
    --device Vulkan0 \
    -ngl 99 \
    --host 0.0.0.0 \
    --port 8080
</syntaxhighlight>
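
Which GPU <code>Vulkan0</code> refers to depends on the system. If the Vulkan SDK tools are installed, the device list can be checked first (the exact output format of <code>vulkaninfo</code> varies between versions):

<syntaxhighlight lang="bash" line="1">
# List Vulkan devices to find the right index for --device
vulkaninfo --summary | grep -i "devicename"
</syntaxhighlight>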

'''Single GPU (ROCm):'''
<syntaxhighlight lang="bash" line="1">
HIP_VISIBLE_DEVICES=0 ./llama-server \
    -m ~/models/model.gguf \
    -ngl 99 \
    --host 0.0.0.0 \
    --port 8080 \
    -c 4096 \
    -b 512 \
    --threads 8
</syntaxhighlight>

'''Multi-GPU (ROCm) - optimal:'''
<syntaxhighlight lang="bash" line="1">
HIP_VISIBLE_DEVICES=0,1 ./llama-server \
    -m ~/models/model.gguf \
    --split-mode row \
    --tensor-split 0.5,0.5 \
    -ngl 99 \
    -fa 1 \
    --host 0.0.0.0 \
    --port 8080 \
    -c 32768 \
    -b 2048 \
    -ub 2048 \
    --threads 8 \
    --parallel 1 \
    --jinja
</syntaxhighlight>
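
To confirm that the model is really split across both cards, VRAM usage can be watched while the server loads; after loading, both GPUs should show a comparable allocation:

<syntaxhighlight lang="bash" line="1">
# Watch VRAM usage on both GPUs during model load
watch -n 1 'rocm-smi --showmeminfo vram'
</syntaxhighlight>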

'''Split GGUF models (multi-part files):'''
<syntaxhighlight lang="bash" line="1">
# Specify only the first file - the rest is loaded automatically
HIP_VISIBLE_DEVICES=0,1 ./llama-server \
    -m ~/models/model-00001-of-00002.gguf \
    --split-mode row \
    --tensor-split 0.5,0.5 \
    -ngl 99 \
    --host 0.0.0.0 \
    --port 8086
</syntaxhighlight>

==== Important parameters ====

{| class="wikitable"
|-
! Parameter !! Description
|-
| <code>-m</code> || Model path
|-
| <code>-ngl N</code> || Number of layers offloaded to the GPU (99 = all)
|-
| <code>-c N</code> || Context size (default: 512)
|-
| <code>-b N</code> || Batch size (default: 512)
|-
| <code>-ub N</code> || Micro-batch size (physical)
|-
| <code>--threads N</code> || CPU threads
|-
| <code>--split-mode row</code> || Multi-GPU: row split (recommended)
|-
| <code>--tensor-split X,Y</code> || Multi-GPU: distribution across GPUs (0.5,0.5 = 50/50)
|-
| <code>-fa 1</code> || Enable Flash Attention
|-
| <code>--parallel N</code> || Number of parallel requests
|-
| <code>--jinja</code> || Jinja template support
|-
| <code>--host IP</code> || Bind address (0.0.0.0 = all interfaces)
|-
| <code>--port N</code> || Port (default: 8080)
|}
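
The exact set of flags differs between llama.cpp versions; the help output of the local build is the authoritative list:

<syntaxhighlight lang="bash" line="1">
./llama-server --help | less
</syntaxhighlight>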

=== Update ===

<syntaxhighlight lang="bash" line="1">
cd ~/llama.cpp
git pull
cmake --build build --config Release -- -j $(nproc)

# Restart the service if it is running
sudo systemctl restart llama-server
</syntaxhighlight>
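
If the incremental rebuild fails after a larger upstream change (or after a ROCm update), re-running the configure step from the installation section before rebuilding is a reasonable fallback; this example assumes the same gfx906 target as above:

<syntaxhighlight lang="bash" line="1">
cd ~/llama.cpp
rm -rf build
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx906 \
    -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release -- -j 8
</syntaxhighlight>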

=== Test ===

==== Testing the server ====
<syntaxhighlight lang="bash" line="1">
# Health check
curl http://localhost:8080/health

# Available models
curl http://localhost:8080/v1/models

# Test request
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "model",
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 50
    }'
</syntaxhighlight>
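
The OpenAI-compatible endpoint also supports streaming responses; a quick way to watch tokens arrive incrementally (the model name is a placeholder, as above):

<syntaxhighlight lang="bash" line="1">
# Streaming test request (server-sent events); -N disables curl's output buffering
curl -N http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "model",
        "messages": [{"role": "user", "content": "Write one sentence about GPUs."}],
        "stream": true
    }'
</syntaxhighlight>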

==== Benchmarking ====

'''Single GPU benchmark:'''
<syntaxhighlight lang="bash" line="1">
cd ~/llama.cpp/build/bin
HIP_VISIBLE_DEVICES=0 ./llama-bench \
    -m ~/models/model.gguf \
    -ngl 99
</syntaxhighlight>

'''Multi-GPU benchmark (optimal):'''
<syntaxhighlight lang="bash" line="1">
HIP_VISIBLE_DEVICES=0,1 ./llama-bench \
    -m ~/models/model.gguf \
    --split-mode row \
    --tensor-split 0.5,0.5 \
    -ngl 99 \
    -b 512 \
    --threads 8
</syntaxhighlight>

'''Performance comparison script:'''
<syntaxhighlight lang="bash" line="1">
#!/bin/bash
cd ~/llama.cpp/build/bin

echo "=== GPU 0 only ==="
HIP_VISIBLE_DEVICES=0 ./llama-bench -m ~/models/model.gguf -ngl 99

echo -e "\n=== GPU 1 only ==="
HIP_VISIBLE_DEVICES=1 ./llama-bench -m ~/models/model.gguf -ngl 99

echo -e "\n=== Multi-GPU optimized ==="
HIP_VISIBLE_DEVICES=0,1 ./llama-bench \
    -m ~/models/model.gguf \
    --split-mode row \
    --tensor-split 0.5,0.5 \
    -ngl 99
</syntaxhighlight>
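
To keep the numbers for later comparison, the script can be saved under any name (here <code>~/bench-compare.sh</code> as an example), made executable and its output captured:

<syntaxhighlight lang="bash" line="1">
chmod +x ~/bench-compare.sh
~/bench-compare.sh 2>&1 | tee ~/bench-$(date +%F).log
</syntaxhighlight>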

=== Troubleshooting ===

==== The server does not start ====
===== Solution 1: Check the logs =====
<syntaxhighlight lang="bash" line="1">
# Show the logs
sudo journalctl -u llama-server -n 50

# Live logs
sudo journalctl -u llama-server -f
</syntaxhighlight>

===== Solution 2: Check permissions =====
<syntaxhighlight lang="bash" line="1">
# Make the binary executable
chmod +x ~/llama.cpp/build/bin/llama-server

# Check that the model file is readable
ls -la ~/models/model.gguf
</syntaxhighlight>

===== Solution 3: Check the ROCm environment =====
<syntaxhighlight lang="bash" line="1">
# ROCm info
rocminfo

# GPU status
rocm-smi

# Check the environment variables
echo $LD_LIBRARY_PATH
echo $HSA_OVERRIDE_GFX_VERSION
</syntaxhighlight>
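
To cross-check the value used for <code>HSA_OVERRIDE_GFX_VERSION</code>, the gfx target the GPUs actually report can be read from <code>rocminfo</code>:

<syntaxhighlight lang="bash" line="1">
# MI50/MI60 should report gfx906, an RX 7900 XTX gfx1100
rocminfo | grep -i "gfx"
</syntaxhighlight>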

==== Poor multi-GPU performance ====
===== Solution: Use row split with an explicit tensor split =====
<syntaxhighlight lang="bash" line="1">
# WRONG - poor performance:
HIP_VISIBLE_DEVICES=0,1 ./llama-server -m model.gguf -ngl 99

# CORRECT - good performance:
HIP_VISIBLE_DEVICES=0,1 ./llama-server \
    -m model.gguf \
    --split-mode row \
    --tensor-split 0.5,0.5 \
    -ngl 99
</syntaxhighlight>

==== Split GGUF does not load ====
===== Solution: Specify only the first file =====
<syntaxhighlight lang="bash" line="1">
# llama.cpp loads all parts automatically
./llama-server -m ~/models/model-00001-of-00002.gguf -ngl 99

# Check that all parts are present:
ls -lh ~/models/model-*.gguf
</syntaxhighlight>

==== Port already in use ====
===== Solution: Use a different port =====
<syntaxhighlight lang="bash" line="1">
# Check which process is using port 8080
sudo lsof -i :8080

# Use a different port
./llama-server -m model.gguf --port 8081
</syntaxhighlight>
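
If <code>lsof</code> is not installed, <code>ss</code> from iproute2 provides the same information:

<syntaxhighlight lang="bash" line="1">
sudo ss -tlnp | grep :8080
</syntaxhighlight>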

=== Integration with Open-WebUI ===

'''Docker Compose configuration:'''
<syntaxhighlight lang="yaml" line="1">
services:
  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open-webui
    volumes:
      - /var/lib/docker/volumes/aichat_open-webui/_data:/app/backend/data
    ports:
      - 8087:8087
    environment:
      # Ollama
      - OLLAMA_BASE_URL=http://localhost:11434

      # Llama.cpp servers (OpenAI-compatible)
      - OPENAI_API_BASE_URLS=http://localhost:8080/v1;http://localhost:8086/v1
      - OPENAI_API_KEYS=sk-no-key-required;sk-no-key-required

      - WEBUI_SECRET_KEY
      - PORT=8087
    network_mode: host
    restart: unless-stopped

volumes:
  open-webui:
</syntaxhighlight>

Note: with <code>network_mode: host</code> Docker ignores the <code>ports:</code> mapping; Open-WebUI listens directly on the port set via <code>PORT=8087</code>.
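
Since the container uses host networking, a quick connectivity check from the Docker host covers both sides (ports as configured above):

<syntaxhighlight lang="bash" line="1">
# llama.cpp backends
curl http://localhost:8080/v1/models
curl http://localhost:8086/v1/models

# Open-WebUI itself
curl -I http://localhost:8087
</syntaxhighlight>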

=== Code snippets ===

'''GPU monitoring during inference:'''
<syntaxhighlight lang="bash" line="1">
# ROCm GPU monitoring
watch -n 1 rocm-smi

# Or with more detail (--showmeminfo needs a memory type such as vram)
watch -n 1 'rocm-smi --showtemp --showmeminfo vram --showuse'
</syntaxhighlight>

'''Automatic service restart on OOM:'''
<syntaxhighlight lang="ini" line="1">
# Add to the service file:
[Service]
# ...
Restart=always
RestartSec=10
# Set a memory limit (optional)
MemoryMax=60G
</syntaxhighlight>
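
Instead of editing the unit file directly, the same settings can be added as a systemd drop-in, which keeps them separate from the main unit; a sketch using the same example limit as above:

<syntaxhighlight lang="bash" line="1">
# Creates/opens an override file under /etc/systemd/system/llama-server.service.d/
sudo systemctl edit llama-server

# Content of the override file:
# [Service]
# MemoryMax=60G

# Apply the change
sudo systemctl daemon-reload
sudo systemctl restart llama-server
</syntaxhighlight>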

'''Health check script:'''
<syntaxhighlight lang="bash" line="1">
#!/bin/bash
# /usr/local/bin/llama-health-check.sh

if ! curl -s http://localhost:8080/health > /dev/null; then
    echo "Llama server not responding, restarting..."
    systemctl restart llama-server
fi
</syntaxhighlight>
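
The script has to be executable before cron can run it; a one-off manual run also confirms that it works:

<syntaxhighlight lang="bash" line="1">
sudo chmod +x /usr/local/bin/llama-health-check.sh
sudo /usr/local/bin/llama-health-check.sh
</syntaxhighlight>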

'''Cron job for the health check:'''
<syntaxhighlight lang="bash" line="1">
# Check every 5 minutes
sudo crontab -e

# Add this line:
*/5 * * * * /usr/local/bin/llama-health-check.sh
</syntaxhighlight>

=== Useful links ===