#!/bin/bash
# Helper for setting up vfio-pci for GPU pass-through to a QEMU amd64 guest
#
# This has been tested on Debian 13 ("trixie") amd64 hosts, with a single
# RX 6800 XT card attached for the ROCm stack and a single RTX 3090 attached
# for the CUDA stack. However, it should work for any Debian derivative, and
# most of it should work for other Linux distros as well.
#
# Author: Christian Kastner <ckk@kvr.at>
# License: MIT
# Abort on the first unhandled command failure and treat unset variables as
# errors.
set -o errexit -o nounset
# Globs that match nothing expand to nothing instead of to themselves.
shopt -s nullglob

# Option defaults
regularuser=""

# Print the help text (purpose, synopsis, options, examples) to stderr.
# $0 is expanded so the text shows the name the script was invoked with.
usage() {
    # The here-document body is intentionally not indented: it is emitted
    # verbatim, so no leading whitespace needs to be stripped.
    cat <<EOF >&2

This utility will look for GPUs on the system, check whether the system
is already configured for PCI pass-through, and suggest the appropriate
measures if not.

STACK is required. Currently supported values are: [rocm, cuda]

If USER is given, then it will also check whether the given user has
sufficient permissions to use PCI pass-through in non-privileged mode.

Synopsis:
  $0 -h

  $0 [-u USER] STACK

Options:
  -h      Show this help
  -u USER Verify that USER has the necessary permissions and ulimits to use
          PCI pass-through together with QEMU.

Examples:

  # Check whether root can use pass-through an AMD GPU

  \$ $0 rocm

  # Check whether user 'someuser' can use pass-through an AMD GPU

  \$ $0 -u someuser rocm

EOF
}

# Print a result line for a check that passed.
#   $1 - message text
show_ok() {
    printf '%s %s\n' '  [OK]' "$1"
}

# Print a result line for an optional recommendation.
#   $1 - message text
show_opt() {
    printf '%s %s\n' ' [OPT]' "$1"
}

# Print a result line for a check that requires action.
#   $1 - message text
show_todo() {
    printf '%s %s\n' '[TODO]' "$1"
}

# Print a continuation line, indented to line up with the message text of
# the result line above it.
#   $1 - message text
show_cont() {
    # 19 columns = the 12-character check label plus the 7-character
    # "[TODO] " style tag.
    printf '%19s%s\n' '' "$1"
}

# Print an example line (e.g. suggested file content), indented four columns
# deeper than a continuation line.
#   $1 - example text
show_xmpl() {
    # 23 columns = 19 (message column) + 4 (example indent).
    printf '%23s%s\n' '' "$1"
}

# Parse command-line options:
#   -h       print the help text and exit successfully
#   -u USER  remember USER for the permission checks
# Anything else getopts reports is a usage error.
while getopts "hu:" OPTNAME; do
    if [ "$OPTNAME" = h ]; then
        usage
        exit 0
    elif [ "$OPTNAME" = u ]; then
        regularuser="$OPTARG"
    else
        usage
        exit 1
    fi
done
# Drop the parsed options so that $1 is the first positional argument.
shift "$((OPTIND - 1))"

# The first positional argument selects the GPU stack. It also determines
# driver_module, the host kernel driver associated with that stack.
if (( $# < 1 )); then
    printf '%s\n' "Missing GPU stack argument: [rocm, cuda]" >&2
    exit 1
fi

stack=$1
driver_module=
if [[ $stack == rocm ]]; then
    driver_module=amdgpu
elif [[ $stack == cuda ]]; then
    driver_module=nvidia
else
    printf '%s\n' "Unsupported GPU stack argument value: $stack" >&2
    exit 1
fi

# Sanity checks
# lspci is invoked through PATH everywhere else in this script, so check for
# it the same way rather than assuming it is installed as /usr/bin/lspci.
if ! command -v lspci >/dev/null 2>&1; then
    echo "Utility 'lspci' is not available on this system." >&2
    exit 1
# KVM needs both the CPU flag (svm = AMD, vmx = Intel) and the /dev/kvm
# character device.
elif ! grep -q -E 'svm|vmx' /proc/cpuinfo || [ ! -c /dev/kvm ]; then
    echo "Host CPU doesn't support KVM -- make sure AMD-V/Intel VT are enabled in BIOS." >&2
    exit 1
fi

# Print the "pretty" device name for a PCI slot ID such as 09:00.1.
# The name is the value of the "Device:" field in lspci's -vmm output.
#   $1 - PCI slot ID
device_name() {
    local slot=$1 record
    while IFS= read -r record || [[ -n $record ]]; do
        if [[ $record =~ ^Device:[[:space:]]+(.*)$ ]]; then
            printf '%s\n' "${BASH_REMATCH[1]}"
        fi
    done < <(lspci -s "$slot" -vmm)
}

# Print the PCI vendor_id:device_id pair for a PCI slot ID such as 09:00.1.
# This is the third space-separated field of lspci's numeric (-n) output.
#   $1 - PCI slot ID
device_pci_id() {
    local slot=$1
    # '[ ]' splits on every single space, like cut -d' ' would.
    lspci -n -s "$slot" | awk -F'[ ]' '{ print $3 }'
}

# Array of PCI slot IDs
declare -a GpuDevices=()

# Associative array (indexed by device ID above) of subdevices, which we define
# as one of either
#  (a) another device on the same card, like an audio device
#  (b) a device in the same IOMMU group
# Both (a) and (b) also need to be passed through.
declare -A GpuSubDevices

# Find all GPU devices
# 1002=AMD, 10de=NVIDIA
# 0300=VGA compatible controller, 0380=Display controller
gpu_vendor_id=
case "$stack" in
rocm) gpu_vendor_id=1002 ;;
cuda) gpu_vendor_id=10de ;;
esac
if [ -n "$gpu_vendor_id" ]; then
    # Filter BOTH device classes by vendor. An unfiltered "::0380" would also
    # pick up display controllers of other vendors and suggest handing them
    # over to vfio-pci along with the GPUs of the selected stack.
    mapfile -t GpuDevices < <({
        lspci -D -d "${gpu_vendor_id}::0300"
        lspci -D -d "${gpu_vendor_id}::0380"
    } | cut -d' ' -f1)
fi
if [ "${#GpuDevices[@]}" -eq 0 ]; then
    echo "No usable GPU devices were found on this system." >&2
    exit 1
fi

# Find subdevices per GPU
# Each GpuSubDevices entry is a space-separated list of PCI slot IDs. All
# sysfs paths are absolute so the working directory of the script is left
# untouched.
for device in "${GpuDevices[@]}"; do
    GpuSubDevices[$device]=""
    sysfs_device=/sys/bus/pci/devices/$device

    # First, all consumer devices of this GPU (like the audio device on the
    # card). They show up as entries named "consumer:pci:<slot ID>".
    for consumer_path in "$sysfs_device"/consumer:pci:*; do
        # Skip the literal pattern in case the glob matched nothing.
        [ -e "$consumer_path" ] || [ -L "$consumer_path" ] || continue
        consumer=${consumer_path##*/consumer:pci:}
        [[ ${GpuSubDevices[$device]} == *"$consumer"* ]] || GpuSubDevices[$device]+=" $consumer"
    done

    # IOMMU might be disabled on BIOS configuration
    [ -d "$sysfs_device"/iommu_group ] || continue

    # Then, all devices in the same IOMMU group (except the GPU itself)
    for member_path in "$sysfs_device"/iommu_group/devices/*; do
        [ -e "$member_path" ] || [ -L "$member_path" ] || continue
        member=${member_path##*/}
        [ "$device" = "$member" ] && continue
        [[ ${GpuSubDevices[$device]} == *"$member"* ]] || GpuSubDevices[$device]+=" $member"
    done
done

# This is for amd64
# Collect the names of required packages that dpkg does not report as
# installed ("ii") into packages_missing, each prefixed by a space.
packages_missing=
for pkgname in \
    autopkgtest \
    qemu-system-x86 \
    qemu-utils \
    ovmf \
    dpkg-dev; do
    dpkg -l "$pkgname" 2>/dev/null | grep -qE '^ii' || packages_missing+=" $pkgname"
done
# Either libarchive13 or libarchive13t64 satisfies the libarchive requirement.
if ! dpkg -l libarchive13 2>/dev/null | grep -qE '^ii' \
    && ! dpkg -l libarchive13t64 2>/dev/null | grep -qE '^ii'; then
    packages_missing+=" libarchive13"
fi

# This is a strong SHOULD
# has_cache becomes 1 as soon as any of the known APT caching proxies is
# reported as installed ("ii") by dpkg.
has_cache=0
for pkgname in approx apt-cacher apt-cacher-ng; do
    if dpkg -l "$pkgname" 2>/dev/null | grep -qE '^ii'; then
        has_cache=1
    fi
done

# Find modules that need loading
# A module counts as configured when some file in /etc/modules-load.d has a
# line consisting of exactly its name, spelled with '_' or with '-'.
modules_missing=""
# Expand the glob once. If it matches nothing, grep must not be called at
# all: without a file operand it would block reading standard input.
modules_load_files=(/etc/modules-load.d/*)
for module in vfio vfio_iommu_type1 vfio_pci; do
    altmodule=${module//_/-}
    if [ "${#modules_load_files[@]}" -eq 0 ] \
        || ! grep -Eq "^($module|$altmodule)$" "${modules_load_files[@]}"; then
        modules_missing+=" $module"
    fi
done

# Determine modprobe configuration
# Results:
#   modprobe             complete "options vfio_pci ids=..." line to suggest
#                        when no vfio-pci options line exists yet
#   modprobe_file        a file that already carries a vfio-pci options line
#   modprobe_missing     comma-separated PCI IDs not yet mentioned in
#                        /etc/modprobe.d
#   has_modprobe_softdep 1 if a softdep makes vfio-pci load before the
#                        GPU driver
modprobe=""
modprobe_file=""
modprobe_missing=""
has_modprobe_softdep=0

# grep -l prints one path per line, so collect the matches in an array
# instead of treating the whole output as a single file name.
modprobe_files=()
if [ -d /etc/modprobe.d ]; then
    mapfile -t modprobe_files < <(grep -Elr '^options[[:space:]]+vfio[-_]pci' /etc/modprobe.d)
fi

if [ "${#modprobe_files[@]}" -eq 0 ]; then
    modprobe="options vfio_pci ids="

    for device in "${GpuDevices[@]}"; do
        modprobe+=$(device_pci_id "$device"),

        for subdevice in ${GpuSubDevices[$device]}; do
            modprobe+=$(device_pci_id "$subdevice"),
        done
    done
    # Remove the final trailing comma
    modprobe=${modprobe%,}
else
    # Report the first matching file; it is the one users are told to edit.
    modprobe_file=${modprobe_files[0]}

    # Like the PCI ID lookups below, search all of /etc/modprobe.d: the
    # softdep line does not have to live in the same file as the options line.
    if grep -Eqr "^softdep[[:space:]]+${driver_module}[[:space:]]+pre:[[:space:]]+vfio[-_]pci" /etc/modprobe.d; then
        has_modprobe_softdep=1
    fi

    for device in "${GpuDevices[@]}"; do
        pci_id=$(device_pci_id "$device")
        grep -Eqr "^[^#]*$pci_id" /etc/modprobe.d || modprobe_missing+="$pci_id,"

        for subdevice in ${GpuSubDevices[$device]}; do
            pci_id=$(device_pci_id "$subdevice")
            grep -Eqr "^[^#]*$pci_id" /etc/modprobe.d || modprobe_missing+="$pci_id,"
        done
    done
    # Remove the final trailing comma
    modprobe_missing=${modprobe_missing%,}
fi

# Check whether vfio devices are assigned to group "render"
# udev_file is the first rules file with an uncommented rule that matches
# SUBSYSTEM=="vfio" and sets GROUP="render"; it stays empty if there is none.
udev_file=""
if [ -d /etc/udev/rules.d ]; then
    # "|| true": finding no such rule is a normal result. Without it, grep's
    # non-zero status would become the status of the assignment and abort
    # the script under "set -e".
    # The double quotes need no backslashes inside the single-quoted pattern;
    # recent GNU grep versions warn about such stray backslashes.
    udev_file=$(grep -Elr '^[^#]*SUBSYSTEM=="vfio".*GROUP="render"' /etc/udev/rules.d | head -n 1 || true)
fi

# If requested, validate user setup
# Results:
#   user_exists     Y if the user is known to getent, N otherwise
#   groups_missing  comma-separated list of required groups the user lacks
#   limits_missing  comma-separated list of memlock limit types (hard, soft)
#                   for which no limits entry was found
user_exists=N
groups_missing=
limits_missing=
if [ -n "$regularuser" ]; then
    if getent passwd "$regularuser" &>/dev/null; then
        user_exists=Y

        # Compare whole, space-delimited group names. A word-boundary regex
        # on the output of "groups" would also match the user name itself or
        # a fragment of a hyphenated group name.
        user_groups=" $(id -nG "$regularuser" 2>/dev/null || true) "
        for groupname in kvm render; do
            if [[ $user_groups != *" $groupname "* ]]; then
                groups_missing+="$groupname,"
            fi
        done
        groups_missing=${groups_missing%,}

        # Without any limits file, grep -r must not run: with no file operand
        # it would search the current directory instead.
        limits_paths=(/etc/security/limits.*)
        # A "-" entry sets both limit types, which the pattern already
        # accepts, so only the hard and the soft limit need to be checked.
        for limitname in hard soft; do
            pattern="^(${regularuser}|\*)[[:space:]]+(${limitname}|-)[[:space:]]+memlock[[:space:]]"
            if [ "${#limits_paths[@]}" -eq 0 ] \
                || ! grep -Eqr "$pattern" "${limits_paths[@]}"; then
                limits_missing+="$limitname,"
            fi
        done
        limits_missing=${limits_missing%,}
    fi
fi

###############
# Output time #
###############
# bash really isn't the right tool for this...

# Device overview: one row per GPU, followed by one indented row for each of
# its subdevices.
printf '\n%s\n%s\n' "Devices" "======="
printf '%s\n' "Device ID          PCI ID      Device Name"
for device in "${GpuDevices[@]}"; do
    printf '%s       %s   %s\n' \
        "$device" "$(device_pci_id "$device")" "$(device_name "$device")"
    # Unquoted on purpose: the entry is a space-separated list of slot IDs.
    for subdevice in ${GpuSubDevices[$device]}; do
        printf ' └─ %s   %s   %s\n' \
            "$subdevice" "$(device_pci_id "$subdevice")" "$(device_name "$subdevice")"
    done
done
printf '\n'

# Heading of the check list. Every check prints a 12-column label followed
# by its result line.
printf '%s\n' "Checks" "======"

# CPU virtualization flags: svm (AMD) or vmx (Intel)
printf '%s' "BIOS (kvm)  "
if grep -qE '(svm|vmx)' /proc/cpuinfo; then
    show_ok "CPU virtualization enabled"
else
    show_todo "Enable CPU virtualization features (AMD SVM or Intel VT-x)"
fi

# IOMMU support is considered present when /sys/kernel/iommu_groups exists
# and lists at least one entry.
printf '%s' "BIOS (IOMMU)"
if [ -d /sys/kernel/iommu_groups ] && [ -n "$(ls /sys/kernel/iommu_groups)" ]; then
    show_ok "Device pass-through support detected"
else
    show_todo "Enable AMD IOMMU or Intel VT-d support for device pass-through"
fi

# Required packages: packages_missing is empty when all are installed,
# otherwise it holds the missing names, each prefixed by a space.
printf '%s' "packages    "
if [ -z "$packages_missing" ]; then
    show_ok "Key packages are installed"
else
    show_todo "Key packages not installed:$packages_missing"
fi

# Local APT cache: optional, hence reported as [OPT] rather than [TODO].
printf '%s' "APT cache   "
if ! [ "$has_cache" -eq 0 ]; then
    show_ok "Local APT cache detected, make sure to use it"
else
    show_opt "No local APT cache detected. While not strictly necessary, it is"
    show_cont "strongly suggested that you install one of the approx, apt-cacher,"
    show_cont "or 'apt-cacher-ng' packages"
fi

# Host memory in MiB = physical pages * page size / 2^20. The threshold is
# 31000 rather than 32768, so the reported total may be somewhat below a
# full 32G.
printf '%s' "memory      "
if (( $(getconf _PHYS_PAGES) * $(getconf PAGE_SIZE) / (1024 * 1024) >= 31000 )); then
    show_ok "Host has at least 32G of memory"
else
    show_todo "For pass-through, you really want a host with at *least* 32G of memory installed"
fi

# VFIO modules to load at boot. modules_missing holds the module names not
# found in /etc/modules-load.d, each prefixed by a space.
echo -n "modules     "
if [ -n "$modules_missing" ]; then
    show_todo "In /etc/modules-load.d, VFIO modules are missing."
    # Module names belong in /etc/modules-load.d, the directory checked
    # above; /etc/modprobe.d holds module options instead.
    show_cont "Create e.g. /etc/modules-load.d/vfio.conf with the following contents:"
    # Unquoted on purpose: split the space-separated list into names.
    for module_name in $modules_missing; do
        show_xmpl "$module_name"
    done
else
    show_ok "In /etc/modules-load.d, all necessary modules are loaded"
fi

# vfio-pci device assignment. $modprobe is non-empty only when no vfio-pci
# options line was found; it then holds the complete line to suggest.
printf '%s' "modprobe    "
if [ -z "$modprobe" ]; then
    # An options line exists: report IDs it does not cover yet ...
    if [ -z "$modprobe_missing" ]; then
        show_ok "In $modprobe_file, all devices assigned to vfio-pci"
    else
        show_todo "In $modprobe_file, these devices are not yet assigned to vfio-pci:"
        show_xmpl "$modprobe_missing"
    fi
    # ... and, separately, a missing softdep.
    if [ "$has_modprobe_softdep" -eq 0 ]; then
        show_todo "In $modprobe_file, precedence of vfio_pci module is not ensured."
        show_cont "Add the following line to $modprobe_file:"
        show_xmpl "softdep $driver_module pre: vfio_pci"
    fi
else
    show_todo "Needs vfio-pci configuration to grab the devices listed above"
    show_cont "Create e.g. /etc/modprobe.d/vfio.conf with the following contents:"
    if [ "$has_modprobe_softdep" -eq 0 ]; then
        show_xmpl "softdep $driver_module pre: vfio_pci"
    fi
    show_xmpl "$modprobe"
fi

# udev rule for vfio devices. udev_file names the rules file that assigns
# them to group 'render', or is empty if no such rule was found.
printf '%s' "udev        "
if [ -n "$udev_file" ]; then
    show_ok "In $udev_file, all vfio devices are assigned to group 'render'"
else
    show_todo "vfio devices need to be assigned to group 'render'"
    show_cont "Create e.g. /etc/udev/rules.d/99-vfio.rules with the following contents:"
    show_xmpl "SUBSYSTEM==\"vfio\", GROUP=\"render\", MODE=\"0660\""
fi

# User checks; only performed when a user was requested with -u.
if [ -n "$regularuser" ]; then
    if [ "$user_exists" = N ]; then
        # Print a label here too, so this result lines up with all the
        # other checks.
        echo -n "user        "
        show_todo "User '$regularuser' needs to be created first"
    else
        echo -n "user-groups "
        if [ -n "$groups_missing" ]; then
            show_todo "User '$regularuser' is missing from groups: $groups_missing"
            show_cont "You can fix this with: usermod -a -G $groups_missing $regularuser"
        else
            show_ok "User '$regularuser' is in all relevant groups"
        fi
        echo -n "user-limits "
        if [ -n "$limits_missing" ]; then
            show_todo "User '$regularuser' needs elevated memlock ulimit"
            show_cont "Create e.g. /etc/security/limits.d/$regularuser.conf with the following contents:"
            show_xmpl "$regularuser  -  memlock  unlimited"
        else
            show_ok "User '$regularuser' has elevated memlock limit"
        fi
    fi
fi

# Closing notice: a blank line, the warning paragraph, a blank line, and the
# reminder about initramfs and reboot.
printf '%s\n' \
    "" \
    "WARNING: If you implement the changes suggested above, then the host will no" \
    "longer have access to the GPU(s). That means NO VIDEO OUTPUT. Please ensure" \
    "that you have alternative means to access the host (like SSH), or you have a" \
    "backup configuration to boot from." \
    "" \
    "Don't forget to update your initramfs and to reboot after making changes."
