#!/bin/bash
# Helper for setting up a system for containers with GPU support
#
# Author: Christian Kastner <ckk@kvr.at>
# License: MIT
set -eu

# Prints the help text to stderr.
#
# The text is emitted with printf, one argument per output line; "$0" expands
# to the name the script was invoked with.
usage() {
    printf '%s\n' \
        "" \
        "Verifies that a user can use a GPU in a rootless podman container." \
        "" \
        "STACK is required. Currently supported values are: [rocm, cuda]" \
        "" \
        "If USER isn't specified, then the invoking user will be checked." \
        "" \
        "Synopsis:" \
        "  $0 -h" \
        "" \
        "  $0 [-u USER] STACK" \
        "" \
        "Options:" \
        "  -h       Show this help" \
        "  -u USER  Check USER instead of the invoking user" \
        "" \
        "Examples:" \
        "" \
        "  \$ $0 rocm" \
        "" \
        "  \$ $0 -u someuser rocm" \
        "" >&2
}

# Prints the message in $1 on one line, tagged as a passed check ("[OK]").
show_ok() {
    local msg="$1"
    printf '  [OK] %s\n' "$msg"
}

# Prints the message in $1 on one line, tagged as an optional item ("[OPT]").
show_opt() {
    local msg="$1"
    printf ' [OPT] %s\n' "$msg"
}

# Prints the message in $1 on one line, tagged as an open item ("[TODO]").
show_todo() {
    local msg="$1"
    printf '[TODO] %s\n' "$msg"
}

# Prints a continuation line: $1 indented by seven spaces, which is the width
# of the "[TODO] " style tags, so the text lines up with the tagged message.
show_cont() {
    local text="$1"
    printf '%7s%s\n' '' "$text"
}

# Prints an example line (such as a command to run): $1 indented by twelve
# spaces.
show_xmpl() {
    local example="$1"
    printf '%12s%s\n' '' "$example"
}

# Parse the command-line options:
#   -h       print the help text and exit successfully
#   -u USER  remember USER in userNAME
# Anything else (unknown option, missing option argument) prints the help
# text and exits with status 1. Afterwards the parsed options are shifted
# away so that "$1" is the first positional argument.
while getopts "hu:" OPTNAME; do
    case "$OPTNAME" in
    h)
        # Help was requested explicitly, so it is regular output: usage
        # writes to stderr, redirect that to stdout here so that the text
        # can be piped into a pager or grep.
        usage 2>&1
        exit 0
        ;;
    u) userNAME="$OPTARG" ;;
    *)
        usage
        exit 1
        ;;
    esac
done
shift "$((OPTIND - 1))"

# The GPU stack is a mandatory positional argument; bail out when it is absent.
if [ "$#" -lt 1 ]; then
    echo "Missing GPU stack argument: [rocm, cuda]" >&2
    exit 1
fi

# Only the two known stacks are accepted; everything else is rejected.
stack="$1"
case "$stack" in
rocm | cuda) ;;
*)
    echo "Unsupported GPU stack argument value: $stack" >&2
    exit 1
    ;;
esac

# Fall back to the invoking user when no user name has been set (or it is
# empty).
if [ -z "${userNAME:-}" ]; then
    userNAME="$(whoami)"
fi
# Third field of the 'render' entry in the group database, i.e. its GID.
# Left empty when getent prints no such entry.
renderGID="$(getent group render | cut -d: -f3)"
# By policy
videoGID=44

# Heading for the list of checks that follows.
printf '%s\n' "Checks" "======"

# Collect every key package that dpkg does not report as installed.
#
# The package's status word is compared against "installed" rather than
# matching the 'ii' abbreviation of 'dpkg -l': an installed package that has
# been put on hold is listed as 'hi' and would otherwise be reported as
# missing. One status line is printed per installed instance of the package,
# so a single "installed" line is sufficient.
packages_missing=
for pkgname in \
    podman \
    autopkgtest \
    buildah \
    catatonit \
    uidmap \
    netavark \
    aardvark-dns \
    slirp4netns; do
    # shellcheck disable=SC2016 # ${...} is a dpkg-query format field
    if ! dpkg-query -W -f='${db:Status-Status}\n' "$pkgname" 2>/dev/null |
        grep -qx 'installed'; then
        packages_missing+=" $pkgname"
    fi
done

if [ -n "$packages_missing" ]; then
    show_todo "Key packages not installed:$packages_missing"
else
    show_ok "Key packages are installed"
fi

# Look for a locally installed APT caching proxy; any one of the candidates
# is enough.
#
# The package's status word is compared against "installed" rather than
# matching the 'ii' abbreviation of 'dpkg -l', so that an installed package
# that is on hold ('hi') is still detected.
has_cache=0
for pkgname in approx apt-cacher apt-cacher-ng; do
    # shellcheck disable=SC2016 # ${...} is a dpkg-query format field
    if dpkg-query -W -f='${db:Status-Status}\n' "$pkgname" 2>/dev/null |
        grep -qx 'installed'; then
        has_cache=1
    fi
done

if [ "$has_cache" -eq 0 ]; then
    show_opt "No local APT cache detected. While not strictly necessary, it is"
    show_cont "strongly suggested that you install one of the approx, apt-cacher,"
    show_cont "or 'apt-cacher-ng' packages."
else
    show_ok "Local APT cache detected, make sure to use it"
fi

# Check for the character device that belongs to the selected GPU stack.
# A stack other than rocm or cuda produces no output here.
case "$stack" in
rocm)
    if [ -c /dev/kfd ]; then
        show_ok "/dev/kfd is present"
    else
        show_todo "/dev/kfd is not present. Has the 'amdgpu' module been loaded?"
    fi
    ;;
cuda)
    if [ -c /dev/nvidiactl ]; then
        show_ok "/dev/nvidiactl is present"
    else
        show_todo "/dev/nvidiactl is not present. Has the 'nvidia' module been loaded?"
    fi
    ;;
esac

# renderGID is empty when the lookup of group 'render' produced no GID, which
# is taken to mean that the group does not exist.
if [ -z "$renderGID" ]; then
    show_todo "Group 'render' does not exist on this system. Are you sure that you"
    show_cont "are on the right system? This group should have been automatically"
    show_cont "created by the udev package."
else
    show_ok "Group 'render' is present"
fi

# Check that the user is a member of group 'render'.
#
# 'id -nG' prints only the group names, which are put on one line each and
# compared as whole, literal lines. Word-matching the output of 'groups USER'
# instead would also match the "USER :" prefix of that output (a user who is
# named 'render') and group names that merely contain the word, such as
# 'render-users'.
if ! id -nG "$userNAME" | tr ' ' '\n' | grep -qxF 'render'; then
    show_todo "User '$userNAME' is not in group 'render'."
    show_cont "You can fix this with:"
    show_xmpl "sudo gpasswd -a $userNAME render"
else
    show_ok "User '$userNAME' is in group 'render'"
fi

# Check that the user is a member of group 'video'.
#
# 'id -nG' prints only the group names, which are put on one line each and
# compared as whole, literal lines. Word-matching the output of 'groups USER'
# instead would also match the "USER :" prefix of that output (a user who is
# named 'video') and group names that merely contain the word, such as
# 'video-users'.
if ! id -nG "$userNAME" | tr ' ' '\n' | grep -qxF 'video'; then
    show_todo "User '$userNAME' is not in group 'video'."
    show_cont "You can fix this with:"
    show_xmpl "sudo gpasswd -a $userNAME video"
else
    show_ok "User '$userNAME' is in group 'video'"
fi

# The kernel knob must read "1". If the file cannot be read, the comparison
# sees an empty string and the check is reported as not enabled.
if [ "$(cat /proc/sys/kernel/unprivileged_userns_clone)" != "1" ]; then
    show_todo "unprivileged_userns_clone is not enabled."
    show_cont "You can fix this with:"
    # Suggest sysctl rather than 'sudo echo 1 > FILE': in the latter, the
    # redirection is performed by the invoking (unprivileged) shell and not
    # by sudo, so the write is denied.
    show_xmpl "sudo sysctl -w kernel.unprivileged_userns_clone=1"
else
    show_ok "unprivileged_userns_clone is enabled"
fi

# Assuming user=foo-user, renderGID=123, videoGID=44, we expect an /etc/subgid
# with these entries:
#
#     foo-user:44:1
#     foo-user:123:1
#     foo-user:nnnnnnnn:6553m
#
# nnnnnnnn:6553m is just a large range of subordinate GIDs (the 'n' and 'm'
# stand for arbitrary digits) that should have been allocated automatically
# when the user was created. The pattern used to detect this range is just a
# heuristic.

# Check that /etc/subgid maps exactly the GID of group 'render' for the user.
#
# Without a GID there is nothing to look for, and the suggested line would be
# malformed ("user::1"), so that case is reported separately.
# The entry is matched as a whole, literal line (-x -F): a regex substring
# match would also accept other users whose name ends in $userNAME, counts
# such as ":10", and would interpret regex characters in the user name.
if [ -z "$renderGID" ]; then
    show_todo "/etc/subgid cannot be checked for group 'render': the GID of the group is unknown."
elif ! grep -qxF -- "$userNAME:$renderGID:1" /etc/subgid; then
    show_todo "/etc/subgid is missing a subordinate GID mapping for user '$userNAME' group 'render'."
    show_cont "You can fix this by adding the following line to /etc/subgid:"
    show_xmpl "$userNAME:$renderGID:1"
else
    show_ok "/etc/subgid contains a subordinate GID mapping for user '$userNAME' group 'render'"
fi

# Check that /etc/subgid maps exactly the GID of group 'video' for the user.
#
# The entry is matched as a whole, literal line (-x -F): a regex substring
# match would also accept other users whose name ends in $userNAME, counts
# such as ":10", and would interpret regex characters in the user name.
if ! grep -qxF -- "$userNAME:$videoGID:1" /etc/subgid; then
    show_todo "/etc/subgid is missing a subordinate GID mapping for user '$userNAME' group 'video'."
    show_cont "You can fix this by adding the following line to /etc/subgid:"
    show_xmpl "$userNAME:$videoGID:1"
else
    show_ok "/etc/subgid contains a subordinate GID mapping for user '$userNAME' group 'video'"
fi

# Succeeds if the subordinate ID file $1 has an entry owned by user $2 that
# looks like a large range: a start ID of six or more digits and a count that
# begins with 65534, 65535 or 65536. This is only a heuristic.
# The user name is compared literally against the whole first field. An
# unreadable file counts as "no such entry".
has_large_subid_range() {
    local idfile="$1" owner="$2"
    local re_start='^[0-9]{6,}$' re_count='^6553[4-6]'
    local name start count _rest
    [ -r "$idfile" ] || return 1
    # '|| [ -n "$name" ]' also processes a last line without a newline.
    while IFS=: read -r name start count _rest || [ -n "$name" ]; do
        if [ "$name" = "$owner" ] &&
            [[ "$start" =~ $re_start ]] &&
            [[ "$count" =~ $re_count ]]; then
            return 0
        fi
    done <"$idfile"
    return 1
}

# Prints a start ID for a new range in the subordinate ID file $1: the first
# ID past the highest "start + count" of all entries, but never less than
# 100000, so that the suggestion has the six digits that
# has_large_subid_range looks for. An empty, unreadable or missing file
# yields 100000; lines with non-numeric fields are ignored.
next_free_subid() {
    local idfile="$1"
    local next=100000
    local name start count _rest end
    if [ -r "$idfile" ]; then
        while IFS=: read -r name start count _rest || [ -n "$name" ]; do
            [[ "$start" =~ ^[0-9]+$ && "$count" =~ ^[0-9]+$ ]] || continue
            # 10# forces base 10, so leading zeros are not read as octal.
            end=$((10#$start + 10#$count))
            if [ "$end" -gt "$next" ]; then
                next="$end"
            fi
        done <"$idfile"
    fi
    printf '%s\n' "$next"
}

# Reports whether the subordinate ID file $1 has a large range for the user
# in userNAME, and suggests a line to add if it does not. $2 is the kind of
# ID used in the messages ("GID" or "UID").
check_large_subid_range() {
    local idfile="$1" kind="$2"
    if has_large_subid_range "$idfile" "$userNAME"; then
        show_ok "$idfile contains a large subordinate $kind range"
    else
        show_todo "$idfile is missing a large subordinate $kind range."
        show_cont "You can fix this by adding the following line to $idfile:"
        show_xmpl "$userNAME:$(next_free_subid "$idfile"):65536"
    fi
}

check_large_subid_range /etc/subgid GID
check_large_subid_range /etc/subuid UID
