#!/bin/bash
# autopkgtest QEMU backend with support for NVIDIA GPU pass-through
#
# This is just a thin wrapper around autopkgtest-virt-qemu. It adds all the
# arguments necessary for using NVIDIA GPUs in the VM.
#
# Author: Christian Kastner <ckk@kvr.at>
# License: MIT
set -eu

# Print the help text to stderr.
#
# The here-document is deliberately written flush-left with a plain "<<EOF":
# the output is the same as with a tab-indented "<<-EOF", but it keeps working
# if an editor ever converts the indentation to spaces.
# $0 is expanded in the text; "\$" yields a literal dollar sign.
usage() {
    cat >&2 <<EOF

autopkgtest QEMU backend with support for NVIDIA GPU pass-through

This is a thin wrapper around autopkgtest-virt-qemu(1).

By default, all NVIDIA GPUs assigned to vfio-pci are passed through. This
can be overridden with --gpu, which can be specified multiple times [NOTE:
multi-GPU is untested as of yet]. By default, the VM will be allocated 75%
of the host's cores, and 75% of the host's memory.

All other options on the command line are passed on directly to
autopkgtest-virt-qemu, so the reader is referred to its man page. For
example, use --ram-size or --cpus to deviate from the 75% default mentioned
above.

This will always boot an image in EFI mode. It will also always request a
clean poweroff after a test (--timeout-poweroff), rather than the default
of sending SIGTERM.

This utility assumes that the invoking user has all the necessary
permissions required in order to effectively and efficiently operate the
VM, most notably that the user has access to /dev/kvm. When in doubt, try
running the script as root.

Synopsis:
  $0 -h

  $0 autopkgtest [...] -- qemu+cuda [--gpu GPU] [options] image [ro-image ...]

Options:
  --gpu GPU     PCI slot ID of the GPU to pass through (eg: 09:00.0). Can be
                used multiple times.

Examples:

  # Create an image first

  \$ sudo gpuisol-qemu-create cuda cuda-unstable.img
  \$ sudo chown \$USER: cuda-unstable.img

  # Run autopkgtests for src:foo, using packages from the Archive

  \$ autopkgtest -B foo -- qemu+cuda cuda-unstable.img

  # Like above, but limit to the GPU in slot 09:00.0

  \$ autopkgtest -B foo -- qemu+cuda --gpu 09:00.0 cuda-unstable.img

EOF
}

# "-h" as the first argument: print the help text and leave successfully
if [ "${1:-}" = "-h" ]; then
    usage && exit 0
fi

# Unless we are root, the invoking user must be a member of each of the
# groups listed here; bail out on the first one that is missing.
for groupname in kvm render; do
    [ "$UID" -eq 0 ] && continue
    if ! groups | grep -q "\b${groupname}\b"; then
        echo "Must be either root, or in group $groupname to use this." >&2
        exit 1
    fi
done

# Print the QEMU "-device vfio-pci,..." option for one host PCI function.
#
# Arguments:
#   $1 - host PCI address, e.g. 0000:07:00.1; the domain-less form 07:00.1
#        is handled as well
#   $2 - id of the QEMU bus (root port) the device gets attached to
# Outputs:
#   The option string on stdout.
# Exits:
#   With status 1 and a message on stderr if the device class is anything
#   other than VGA (0300), Display (0380) or Audio (0403). As this function
#   is meant to be called via $(...), the exit ends that subshell and the
#   caller sees a failed command substitution.
generate_qemu_device() {
    local pci_device="$1"
    local bus="$2"
    local addr
    local class

    # The guest address mirrors the host's slot.function, which is whatever
    # follows the last colon -- regardless of whether a domain is present.
    addr="${pci_device%% *}"
    addr="${addr##*:}"
    # "lspci -n" prints "<slot> <class>: <vendor>:<device> ...", so the second
    # field is the numeric class including the trailing colon
    class="$(lspci -s "$pci_device" -n | cut -d ' ' -f2)"
    case "$class" in
        0300: | 0380:)
            echo "-device vfio-pci,host=$pci_device,bus=$bus,addr=$addr,multifunction=on,x-vga=off"
            ;;
        0403:)
            echo "-device vfio-pci,host=$pci_device,bus=$bus,addr=$addr"
            ;;
        *)
            echo "$pci_device: Unsupported device class $class." >&2
            exit 1
            ;;
    esac
}

# We need to remember this, as we cd a few times below
initial_cwd="$PWD"

# Print 75% of the integer $1 (rounded down), but never less than 1, so that
# a very small host does not end up requesting a VM with 0 CPUs or 0 MiB RAM.
scale_to_three_quarters() {
    local scaled=$(($1 * 3 / 4))
    if [ "$scaled" -lt 1 ]; then
        scaled=1
    fi
    echo "$scaled"
}

# 75% of the host's cores and memory (memory in MiB)
#
# Only the first 'cpu cores' entry is used, so that differing values across
# entries cannot break the arithmetic. The field can also be missing from
# /proc/cpuinfo altogether (e.g. on non-x86 hosts); rather than letting the
# failed grep silently abort the script under 'set -e', fall back to the
# number of online processors.
Ncores=$(grep -m1 'cpu cores' /proc/cpuinfo | grep -Eo '[[:digit:]]+' || true)
if [ -z "$Ncores" ]; then
    Ncores=$(getconf _NPROCESSORS_ONLN)
fi
Ncores=$(scale_to_three_quarters "$Ncores")
Nmem=$(($(getconf _PHYS_PAGES) * $(getconf PAGE_SIZE) / (1024 * 1024)))
Nmem=$(scale_to_three_quarters "$Nmem")

# We use --qemu-options, but so might our caller, so we need to merge here.
qemu_options="-vga none"
# List of GPU devices we will use
gpus=
# Associative array of other devices either on the same card, or in the same
# IOMMU group as the primary device. These must be passed through, too
declare -A subdevices
# Devices our user might have requested via --gpu
gpus_input=
# Everything that is not ours and gets handed to autopkgtest-virt-qemu as-is
declare -a newargs

# Split the command line into our own options and everything else.
#
# We can't use getopt for option parsing, as it has no way to ignore unknown
# options, specifically: the options we just pass on to autopkgtest-virt-qemu.
#
# Only arguments that are exactly "--gpu" / "--qemu-options", or that start
# with "--gpu=" / "--qemu-options=", are treated as ours. An argument that
# merely contains one of these strings somewhere is passed through untouched.
#
# Arguments: the script's command line ("$@")
# Globals:   qemu_options (appended to), gpus_input (appended to),
#            newargs (appended to)
# Exits:     with status 1 if --gpu or --qemu-options is the last argument
#            and therefore lacks its value
split_wrapper_args() {
    local arg
    # Name of the option whose value is expected in the next argument
    local pending=

    for arg in "$@"; do
        # --qemu-options "foo bar baz" and --gpu GPU are two positional
        # arguments each; this is the second one
        case "$pending" in
            qemu-options)
                qemu_options+=" $arg"
                pending=
                continue
                ;;
            gpu)
                gpus_input+=" $arg"
                pending=
                continue
                ;;
        esac

        case "$arg" in
            # --qemu-options="foo bar baz" is one positional argument
            --qemu-options=*)
                qemu_options+=" ${arg#--qemu-options=}"
                ;;
            --qemu-options)
                pending="qemu-options"
                ;;
            --gpu=*)
                gpus_input+=" ${arg#--gpu=}"
                ;;
            --gpu)
                pending="gpu"
                ;;
            *)
                newargs+=("$arg")
                ;;
        esac
    done

    if [ -n "$pending" ]; then
        echo "Option --$pending requires an argument." >&2
        exit 1
    fi
}

split_wrapper_args "$@"

# First, determine the list of GPUs to pass through
if [ -n "$gpus_input" ]; then
    # The user explicitly asked for these GPUs, so any one of them that
    # cannot be passed through is a hard failure
    for gpu in $gpus_input; do
        # Normalize to the canonical device ID (with PCI domain)
        cangpu=$(lspci -s "$gpu" -D | cut -d' ' -f1)
        if [ -z "$cangpu" ]; then
            echo "No such device: $gpu" >&2
            exit 1
        fi
        if ! lspci -s "$cangpu" | grep -q -E '(VGA compatible|Display) controller'; then
            echo "Device $cangpu is not a GPU" >&2
            exit 1
        fi
        if ! lspci -s "$cangpu" -k | grep -q 'Kernel driver in use: vfio-pci'; then
            echo "Device $cangpu not assigned to vfio-pci" >&2
            exit 1
        fi
        # Skip devices that were requested more than once
        case "$gpus" in
            *$cangpu*) ;;
            *) gpus+=" $cangpu" ;;
        esac
    done
else
    # Nothing requested -- take every NVIDIA GPU that is bound to vfio-pci.
    # 10de=NVIDIA, 0300=VGA compatible controller, 0380=Display controller
    for gpu in $({
        lspci -D -d 10de::0300
        lspci -D -d 10de::0380
    } | cut -d' ' -f1); do
        if ! lspci -s "$gpu" -k | grep -q 'Kernel driver in use: vfio-pci'; then
            continue
        fi
        gpus+=" $gpu"
    done
fi

# Then, find all of the GPU's subdevices
# nullglob: a glob without matches must expand to nothing, not to itself
shopt -s nullglob
for gpu in $gpus; do
    subdevices[$gpu]=""

    # First, all consumer devices of this GPU (like the audio device on the
    # card). The sysfs entries are named consumer:pci:<device ID>.
    cd /sys/bus/pci/devices/"$gpu"
    for consumer_raw in consumer:pci:*; do
        consumer=${consumer_raw#consumer:pci:}
        case "${subdevices[$gpu]}" in
            *$consumer*) ;;
            *) subdevices[$gpu]+=" $consumer" ;;
        esac
    done

    # Then, every other member of the GPU's IOMMU group
    cd /sys/bus/pci/devices/"$gpu"/iommu_group/devices
    for member in *; do
        if [ "$gpu" != "$member" ]; then
            case "${subdevices[$gpu]}" in
                *$member*) ;;
                *) subdevices[$gpu]+=" $member" ;;
            esac
        fi
    done
done
shopt -u nullglob

# Every GPU gets a PCIe root port of its own, and the GPU plus all of its
# subdevices are attached to that port using addresses mirroring the host
# addresses. chassis + slot need to be unique; we use chassis=1 and
# incremental slots.
increment=1
for gpu in $gpus; do
    root_port="rp${increment}"
    qemu_options+=" -device pcie-root-port,id=${root_port},chassis=1,slot=${increment},multifunction=on"
    increment=$((increment + 1))

    # The GPU itself first, followed by its subdevices
    for device in "$gpu" ${subdevices[$gpu]}; do
        qemu_options+=" $(generate_qemu_device "$device" "$root_port")"
    done
done

# Hand the merged QEMU options over as one single --qemu-options argument
newargs+=("--qemu-options=$qemu_options")

# Undo the directory changes from above before handing over
cd "$initial_cwd"

# Our defaults come first and the caller's arguments last: if a user
# explicitly specifies --ram-size or --cpus, that will override our values
backend_cmd=(
    autopkgtest-virt-qemu
    --boot=efi
    --timeout-poweroff=30
    --cpus "$Ncores"
    --ram-size "$Nmem"
    "${newargs[@]}"
)
exec "${backend_cmd[@]}"
