Using AWS spot instances for blender rendering

This code doesn’t work! I’m still working on it. Parts of it work, and it may be useful to someone.

#!/bin/bash
###
# RMS 2021 - Spot Instance Blender rendering script
# Provisions an EC2 spot instance at the price and time configured below
# uploads a blend file and begins to render it with the output being put into S3 incrementally
#
# WARNING : THIS SCRIPT IS PROVIDED AS INSPIRATION ONLY
# WARNING : DO NOT USE THIS SCRIPT UNLESS YOU KNOW WHAT YOU ARE DOING.
# WARNING : READ AND UNDERSTAND THE SCRIPT FULLY BEFORE USING IT, OR ANY PART OF IT
# WARNING : GETTING THIS WRONG COULD COST YOU A HUGE AMOUNT OF MONEY
#
# You will need :
# Put this script in ~/blender/run.sh
# In ~/.aws you will need to create your profile files (see elsewhere for instructions)
# In ~/.ssh you'll need to put your key pem file (generate it in there.. see elsewhere for instructions)
# TODO finish this bit
###
#The region you want to create the spot instance in.
region="eu-central-1"
#The location of your pem file.
#NB: use $HOME here, not ~ : a tilde inside quotes is never expanded by the shell,
#so "~/.ssh/..." would be stored as a literal path beginning with a '~' character.
key_file="$HOME/.ssh/rms-administrator-key-pair.pem"
#The name of the key the pem file describes.
key_name="rms-administrator-key-pair"
#You'll need to choose an x86 instance type because blender won't run on arm.
#instance_type="p2.xlarge"
instance_type="t2.micro"
#Look up this price from the list.. Don't over bid!
#eg at the time of writing, an a1.medium was about $0.04/hour
#spot_price="0.04"
#t2.micro is about 0.007
spot_price="0.007"
#The time you want the spot instance for in minutes, must be a multiple of 60.
spot_time="60"
#See elsewhere about setting up AWS profiles.
profile="rms"
#instance_type="t2.micro"
#Path of the boot script; presumably meant to be run on the instance - TODO confirm,
#none of the functions in this part of the script read it yet.
boot_script="./blender.sh"

# Abort helper: writes the given message to stderr, then exits with status 1.
# Arguments: $1 - message to print (any further arguments are ignored)
function die() {
    local message="$1"
    >&2 printf '%s\n' "$message"
    exit 1
}

# Echo the IAM user name that the configured profile authenticates as.
# Globals: profile, region (read)
# Outputs: the user name on stdout (an empty line when the aws call yields nothing)
function getUsername() {
  local username
  # Quote the expansions so a profile/region value is always passed as one argument.
  username=$(aws iam --profile "$profile" --region "$region" get-user --query "User.UserName" --output text)
  echo "$username"
}
# Echo the ARN of the IAM user that the configured profile authenticates as.
# Globals: profile, region (read)
# Outputs: the ARN on stdout (an empty line when the aws call yields nothing)
function getUserArn() {
  local arn
  # Quote the expansions so a profile/region value is always passed as one argument.
  arn=$(aws iam --profile "$profile" --region "$region" get-user --query "User.Arn" --output text)
  echo "$arn"
}
# Echo the id of the most recently created Amazon Linux 2 AMI that matches the
# architecture of the configured instance type.
# Globals: instance_type, profile, region (read)
# Outputs: the AMI id on stdout; diagnostics go to stderr so that callers using
#          $(get_ami_id) never capture an error message as if it were an id
# Exits:   with status 1 when the architecture lookup fails or no image matches
function get_ami_id() {
    local arch ami_name ami_id
    arch=$(getArchitectureForInstancetype "$instance_type") || exit 1
    #we're using the amazon linux image which we can find with the search string below
    ami_name="amzn2-ami-hvm-2.0*${arch}*gp2*"
    #find the newest matching image in our region. The filter is quoted because it
    #contains '*' which the shell would otherwise try to expand against local files.
    #--owners amazon restricts the search to images published by Amazon, so a
    #look-alike name from another account can never be selected.
    ami_id=$(aws ec2 --profile "$profile" --region "$region" describe-images \
        --owners amazon \
        --filters "Name=name,Values=$ami_name" \
        --query 'Images[*].[ImageId,CreationDate]' --output text \
        | sort -k2 -r | head -n1 | awk '{print $1; }')
    if [ -z "$ami_id" ]; then
        die "Could not find the latest Amazon Linux AMI image in this region, please check the ami_name property of this script"
    fi
    echo "$ami_id"
}

# Echo the id of the most recently created Ubuntu 18.04 (bionic) server AMI that
# matches the architecture of the configured instance type.
# Globals: instance_type, profile, region (read)
# Outputs: the AMI id on stdout; diagnostics go to stderr so that callers using
#          $(get_ubuntu_ami_id) never capture an error message as an id
# Exits:   with status 1 on an unsupported architecture or when no image matches
function get_ubuntu_ami_id() {
    local arch ami_name ami_id
    arch=$(getArchitectureForInstancetype "$instance_type") || exit 1
    case "$arch" in
        *x86*)
            ami_name="ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server*"
            #ami_name="ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server*"
            ;;
        *arm*)
            ami_name="ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-arm64-server*"
            #ami_name="ubuntu/images/hvm-ssd/ubuntu-focal-20.04-arm64-server*"
            ;;
        *)
            #previously ami_name was silently left unset here and the search ran unfiltered
            die "Unsupported architecture ($arch), cannot choose an ubuntu AMI image"
            ;;
    esac
    #find the ami id for the ubuntu in our region. The filter is quoted because it
    #contains '*' which the shell would otherwise try to expand against local files.
    #NOTE(review): no --owners restriction is applied, so any public image with a
    #matching name can be returned; consider pinning the publisher's account id.
    ami_id=$(aws ec2 --profile "$profile" --region "$region" describe-images \
        --filters "Name=name,Values=$ami_name" \
        --query 'Images[*].[ImageId,CreationDate]' --output text \
        | sort -k2 -r | head -n1 | awk '{print $1; }')
    if [ -z "$ami_id" ]; then
        die "Could not find the latest ubuntu 18.04 AMI image for ($arch) in this region, please check the ami_name property of this script"
    fi
    echo "$ami_id"
}

# Echo the ids of all subnets that belong to the VPC reported by getVpcId.
# Globals: profile, region (read)
# Outputs: subnet ids on stdout, in the text format produced by the aws cli
function listSubnets() {
    local vpcid subnets
    vpcid=$(getVpcId)
    # Quote the expansions so a profile/region value is always passed as one argument.
    subnets=$(aws ec2 --profile "$profile" --region "$region" describe-subnets \
        --filters "Name=vpc-id,Values=$vpcid" \
        --query "Subnets[].SubnetId" --output text)
    echo "$subnets"
}
# Echo the ids of the route tables visible to the configured profile and region.
# Globals: profile, region (read)
# Outputs: route table ids on stdout
# Exits:   via die when the aws call returns nothing
# NOTE(review): the query is not filtered by VPC even though the error message
# talks about "this VPC" - confirm whether a vpc-id filter is wanted here.
function listRouteTables() {
    local tables
    # Quote the expansions so a profile/region value is always passed as one argument.
    tables=$(aws ec2 --region "$region" --profile "$profile" describe-route-tables \
        --query "RouteTables[].RouteTableId" --output text)
    if [ -z "$tables" ]; then die "Could not find route tables in this VPC!?"; fi
    echo "$tables"
}
# Echo the one route table id reported by listRouteTables.
# Dies when listRouteTables reports more than one id, because no single
# "default" table can be picked in that case.
function getDefaultRouteTable() {
    local tables
    tables=$(listRouteTables)
    local count
    count=$(wc -w <<<"$tables")
    # Guard clause: zero or one id is simply passed through.
    if (( count <= 1 )); then
        echo "$tables"
        return
    fi
    die "There is more than one route table, so it is not possible to get the 'default' route table."
}
# Echo the id of the route table associated with a subnet.
# Arguments: $1 - subnet id
# Globals:   profile, region (read)
# Outputs:   the route table id on stdout; when the subnet has no explicit
#            association, whatever getDefaultRouteTable reports instead
# Exits:     via die when no subnet id is supplied
function getRoutingTableForSubnet() {
    if [ -z "$1" ]; then die "must supply subnetID to this function";fi
    local table
    # Quote the expansions so a profile/region value is always passed as one argument.
    table=$(aws ec2 --region "$region" --profile "$profile" describe-route-tables \
        --filters "Name=association.subnet-id,Values=$1" \
        --query "RouteTables[].RouteTableId" --output text)
    if [ -z "$table" ]; then
        #the chances are there is only one route table and this subnet has been automatically associated with it
        table=$(getDefaultRouteTable)
    fi
    echo "$table"
}
# Echo the CPU architecture to use for an instance type, after checking that
# the type can be bought as a spot instance.
# Arguments: $1 - instance type, e.g. t2.micro
# Globals:   profile, region (read)
# Outputs:   "x86_64" or "arm64" on stdout
# Exits:     via die when no type is given, the type does not list "spot" as a
#            supported usage class, or neither architecture is supported
function getArchitectureForInstancetype() {
    if [ -z "$1" ]; then die "must provide instance type to this function"; fi
    local usage_classes architectures
    usage_classes=$(aws ec2 --profile "$profile" --region "$region" describe-instance-types \
        --instance-types "$1" --query "InstanceTypes[].SupportedUsageClasses" --output text)
    # Match the literal word "spot". This used to expand an unset variable ($spot),
    # which made the pattern match everything so the check could never fail.
    if [[ "$usage_classes" != *"spot"* ]]; then die "the configured instance type $1 does not support spotting. Please choose a different instance type";fi
    architectures=$(aws ec2 --profile "$profile" --region "$region" describe-instance-types \
        --instance-types "$1" --query "InstanceTypes[].ProcessorInfo.SupportedArchitectures" --output text)

    #It's not easy to install blender on arm, so x86 is preferred when both are offered
    if [[ "$architectures" == *"x86"* ]]; then
        echo "x86_64"
        return
    fi
    if [[ "$architectures" == *"arm64"* ]]; then
        echo "arm64"
        return
    fi
    die "no reasonable architectures supported by this instance type. Please choose a different instance type in the instance_type parameter at the top of this script"
}
# Test whether a route table has a route through an internet gateway.
# Arguments: $1 - route table id
# Globals:   profile, region (read)
# Returns:   0 when one of the table's gateway ids contains "igw-", 1 otherwise
# Exits:     via die when no route table id is supplied
function isRouteTableSuitable() {
    if [ -z "$1" ]; then die "must supply route table id to this function";fi
    local gateways
    gateways=$(aws ec2 --region "$region" --profile "$profile" describe-route-tables \
        --filters "Name=route-table-id,Values=$1" \
        --query "RouteTables[].Routes[].GatewayId" --output text)
    if [ -z "$gateways" ]; then return 1;fi
    # Look for the literal prefix "igw-". This used to expand an unset variable
    # ($igw), leaving the pattern *-* which also matched other ids such as vgw-...
    if [[ "$gateways" == *"igw-"* ]]; then
        return 0
    fi
    return 1
}

# Echo the id of the first subnet whose route table is judged suitable by
# isRouteTableSuitable (i.e. it routes through an internet gateway).
# Outputs: one subnet id on stdout
# Exits:   via die when there are no subnets, a subnet has no route table, or
#          no subnet qualifies
function getSubnet() {
    local subnets snid rt
    subnets=$(listSubnets)
    if [ -z "$subnets" ]; then die "no subnets were found in this VPC!?";fi
    # $subnets is deliberately unquoted: it is a whitespace separated list of ids.
    for snid in $subnets; do
        # Pass the value of the loop variable; the literal word "snid" used to be
        # passed here, so no subnet could ever be resolved to its route table.
        rt=$(getRoutingTableForSubnet "$snid")
        if [ -z "$rt" ]; then die "couldn't find a route table associated with a subnet, something is wrong.";fi
        if isRouteTableSuitable "$rt"; then
            echo "$snid"
            return
        fi
    done
    die "could not find a suitable subnet (public with IGW routing) for this spot instance. Please configure your subnets"
}
# Echo the id of the security group to use: one in the current VPC that has a
# rule with to-port 22 (SSH).
# Globals: profile, region (read); sg_id (read, optional - picks one when
#          several groups qualify)
# Outputs: the security group id on stdout. All diagnostics go to stderr so
#          that callers using $(getSecurityGroup) never capture a message as an id.
# Exits:   with status 1 when no group qualifies, or when several qualify and
#          sg_id is unset or not one of them
function getSecurityGroup() {
    #get the SG which is configured to allow SSH
    local vpcid candidates count id
    vpcid=$(getVpcId) || exit 1
    # The filters are quoted: unquoted [..] is a shell glob pattern.
    candidates=$(aws ec2 --profile "$profile" --region "$region" describe-security-groups \
        --filters "Name=vpc-id,Values=$vpcid" "Name=ip-permission.to-port,Values=22" \
        --query "SecurityGroups[*].[GroupId]" --output text)
    if [ -z "$candidates" ]; then die "There are no security groups in this VPC that will allow SSH traffic. You should go into AWS console and properly configure your security groups."; fi
    #is there more than one suitable SG?
    count=$(echo "$candidates" | wc -w)
    if [ "$count" -le 1 ]; then
        #there is only one SG in this profile so return it
        echo "$candidates"
        return
    fi
    if [ -z "$sg_id" ]; then
        {
            echo "there is more than one applicable security group in this vpc and the  sg_id variable is not set to tell us which one to work on."
            echo "please create a variable named sg_id at the top of the script and choose one of the SG id's here :"
            echo "$candidates"
        } >&2
        exit 1
    fi
    # Compare whole ids (not substrings) so a partial value such as "sg-" is rejected.
    # $candidates is deliberately unquoted: it is a whitespace separated list.
    for id in $candidates; do
        if [ "$id" = "$sg_id" ]; then
            #the sg_id variable contains a valid SG so return it
            echo "$sg_id"
            return
        fi
    done
    {
        echo "The sg_id variable contains $sg_id but that SG does not exist in the list of applicable sg's for this profile given here :"
        echo "$candidates"
        echo "please choose one of the SG's from this list and reconfigure your sg_id variable at the top of this script"
    } >&2
    exit 1
}
# Echo the ids of all VPCs visible to the configured profile and region.
# Globals: profile, region (read)
# Outputs: VPC ids on stdout, in the text format produced by the aws cli
function listVpcs {
  local vpcs
  # Quote the expansions so a profile/region value is always passed as one argument.
  vpcs=$(aws ec2 --profile "$profile" --region "$region" describe-vpcs --query "Vpcs[*].[VpcId]" --output text)
  echo "$vpcs"
}
# Echo the id of the VPC to work in.
# Globals: vpc_id (read, optional - picks one when the profile has several VPCs)
# Outputs: the VPC id on stdout. All diagnostics go to stderr so that callers
#          using $(getVpcId) never capture a message as if it were an id.
# Exits:   with status 1 when there is no VPC, or when there are several and
#          vpc_id is unset or not one of them
function getVpcId() {
    local vpcs count id
    vpcs=$(listVpcs)
    if [ -z "$vpcs" ]; then die "This profile has no currently active VPC. AWS require an existing relationship in order to use Spot Instances. Go and provision a t2.micro or something via the console."; fi
    #is there more than one VPC in this profile?
    count=$(echo "$vpcs" | wc -w)
    if [ "$count" -le 1 ]; then
        #there is only one VPC in this profile so return it
        echo "$vpcs"
        return
    fi
    if [ -z "$vpc_id" ]; then
        {
            echo "there is more than one VPC in this profile and you have no vpc_id variable set to tell us which one to work on."
            echo "please create a variable named vpc_id at the top of the script and choose one of the VPC id's here :"
            echo "$vpcs"
        } >&2
        exit 1
    fi
    # Compare whole ids (not substrings) so a partial value such as "vpc-" is rejected.
    # $vpcs is deliberately unquoted: it is a whitespace separated list.
    for id in $vpcs; do
        if [ "$id" = "$vpc_id" ]; then
            #the vpc_id variable contains a valid VPC so return it
            echo "$vpc_id"
            return
        fi
    done
    {
        echo "The vpc_id variable contains $vpc_id but that VPC does not exist in the list of VPC's for this profile given here :"
        echo "$vpcs"
        echo "please choose one of the VPC's from this list and reconfigure your vpc_id variable at the top of this script"
    } >&2
    exit 1
}
# Submit the spot instance request and echo the resulting request id.
# Globals: region, profile, spot_price, spot_time, key_name, instance_type,
#          AMI_ID, SG_ID, SUBNET_ID (all read)
# Outputs: the spot instance request id on stdout
# Exits:   via die when a launch parameter is empty or the aws call returns nothing
# NOTE(review): AWS has retired Spot blocks (defined-duration Spot instances);
# confirm that --block-duration-minutes is still accepted for your account.
function makeSpotRequest() {
    local name launch_spec request_id
    # Refuse to send a request with a missing launch parameter: this call costs money.
    for name in key_name AMI_ID instance_type SG_ID SUBNET_ID spot_price spot_time; do
        if [ -z "${!name}" ]; then die "cannot make a spot request, $name is empty"; fi
    done
    printf -v launch_spec \
        '{"KeyName": "%s", "ImageId": "%s", "InstanceType": "%s", "SecurityGroupIds": ["%s"], "SubnetId": "%s"}' \
        "$key_name" "$AMI_ID" "$instance_type" "$SG_ID" "$SUBNET_ID"
    # Quote the expansions so a profile/region value is always passed as one argument.
    request_id=$(aws ec2 --region "$region" --profile "$profile" request-spot-instances \
        --spot-price "$spot_price" \
        --block-duration-minutes "$spot_time" \
        --launch-specification "$launch_spec" \
        --query "SpotInstanceRequests[].SpotInstanceRequestId" --output text)
    if [ -z "$request_id" ]; then die "failed to get any response from the spot request? Check the console to make sure nothing was provisioned"; fi
    echo "$request_id"
}

# Gather and report everything needed before a spot request can be made.
# Globals: profile, instance_type (read);
#          IAM_ROLE_ARN, VPC_ID, SG_ID, SUBNET_ID, AMI_ID (written)
# Outputs: progress messages on stdout
# Exits:   via die as soon as any lookup fails. The lookups run inside $(...),
#          so an exit inside a helper only ends that subshell; the status and
#          the emptiness of each result are therefore checked here explicitly.
#          Any text the failed helper produced is appended to the error message.
function preFlight() {
    local username arch
    #who are we operating as?
    username=$(getUsername)
    if [ -z "$username" ]; then die "failed to connect to AWS. Is your profile created correctly?"; fi
    echo "Proceeding in profile $profile as username $username"
    IAM_ROLE_ARN=$(getUserArn)
    echo "ARN of user is $IAM_ROLE_ARN"
    #check there is at least one existing VPC in this profile
    if ! VPC_ID=$(getVpcId) || [ -z "$VPC_ID" ]; then
        die "could not determine which VPC to use. $VPC_ID"
    fi
    echo "Found VPC as $VPC_ID"
    if ! SG_ID=$(getSecurityGroup) || [ -z "$SG_ID" ]; then
        die "could not determine which security group to use. $SG_ID"
    fi
    echo "Found Security group as $SG_ID"
    if ! SUBNET_ID=$(getSubnet) || [ -z "$SUBNET_ID" ]; then
        die "could not determine which subnet to use. $SUBNET_ID"
    fi
    echo "Found public routable subnet as $SUBNET_ID"
    if ! arch=$(getArchitectureForInstancetype "$instance_type") || [ -z "$arch" ]; then
        die "could not determine the architecture of instance type $instance_type. $arch"
    fi
    echo "chosen instance type ($instance_type) is $arch architecture"
    if ! AMI_ID=$(get_ami_id) || [ -z "$AMI_ID" ]; then
        die "could not determine which AMI to use. $AMI_ID"
    fi
    echo "Found suitable AMI for $arch as $AMI_ID"
}

# Echo the id of the most recently created spot request that is open or active.
# Globals: region, profile (read)
# Outputs: the request id on stdout, or nothing when no such request exists
function get_latest_spot_request_id() {
    #get the latest open or active spot request, this will be the one we just issued.
    local request_id
    # Use the configured region/profile (they used to be hard-coded here) and
    # quote the filter so the shell cannot treat it as a glob pattern.
    request_id=$(aws ec2 --region "$region" --profile "$profile" describe-spot-instance-requests \
        --filters "Name=state,Values=open,active" \
        --query 'SpotInstanceRequests[*].[SpotInstanceRequestId,CreateTime]' --output text \
        | sort -k2 -r | head -n1 | awk '{print $1; }')
    if [ -n "$request_id" ]; then
        echo "$request_id"
    fi
}

# Block until get_latest_spot_request_id reports a request, then echo its id.
# Outputs: the request id on stdout; the progress spinner goes to stderr so
#          that callers using $(wait_for_request_accept) capture only the id
# NOTE: there is no timeout - this waits for as long as no request is visible.
function wait_for_request_accept() {
    local spin='/-\|'
    local i j=0 rid
    rid=$(get_latest_spot_request_id)
    while [ -z "$rid" ]; do
        # run spinner: 10 frames, 5 seconds apart, between two polls
        for ((i=0; i<10; i++)); do
            j=$(( (j+1) %4 ))
            # the spinner character is passed as data, never as part of the format
            printf '\rWaiting for request to be accepted...%s' "${spin:$j:1}" >&2
            sleep 5
        done
        rid=$(get_latest_spot_request_id)
    done
    echo "$rid"
}

# Echo the id of the instance that fulfils a spot request.
# Arguments: $1 - spot instance request id
# Globals:   region, profile (read)
# Outputs:   the instance id on stdout (an empty line when the aws call
#            reports none)
# Exits:     via die when no request id is supplied
function getInstanceIdFromSpotRequest() {
    if [ -z "$1" ]; then
        die "you must pass the spot instance request id to this function"
    fi
    local instance
    instance=$(aws ec2 --region "$region" --profile "$profile" describe-spot-instance-requests \
        --filters "Name=spot-instance-request-id,Values=$1" \
        --query 'SpotInstanceRequests[*].InstanceId' --output text)
    # Hand the id back on stdout: `return` only accepts a numeric status, so
    # returning the id string failed and callers always received nothing.
    echo "$instance"
}

# Block until a spot request has an instance attached, then echo the instance id.
# Arguments: $1 - spot instance request id
# Globals:   instance_id (written)
# Outputs:   the instance id on stdout; the progress spinner goes to stderr so
#            that callers using $(wait_for_running_instance ...) capture only the id
# Exits:     via die when no request id is supplied
# NOTE: there is no timeout - this waits for as long as no instance is reported.
function wait_for_running_instance() {
    if [ -z "$1" ]; then die "must pass spot instance request id to this function"; fi
    local spin='/-\|'
    local i j=0
    instance_id=$(getInstanceIdFromSpotRequest "$1")
    while [ -z "$instance_id" ]; do
        # run spinner: 10 frames, 5 seconds apart, between two polls
        for ((i=0; i<10; i++)); do
            j=$(( (j+1) %4 ))
            # the spinner character is passed as data, never as part of the format
            printf '\rWaiting for request to be fulfilled...%s' "${spin:$j:1}" >&2
            sleep 5
        done
        # Poll again; without this the loop condition could never change and the
        # function would spin forever.
        instance_id=$(getInstanceIdFromSpotRequest "$1")
    done
    echo "$instance_id"
}

# Echo the public DNS name of an instance.
# Arguments: $1 - instance id
# Globals:   region, profile (read)
# Outputs:   the DNS name on stdout
# Exits:     via die when no id is supplied or no DNS name is available
function get_public_dns() {
    if [ -z "$1" ]; then die "must pass instanceid to this function"; fi
    local dns
    # The filter is quoted: unquoted [..] is a shell glob pattern.
    dns=$(aws ec2 --region "$region" --profile "$profile" describe-instances \
        --filters "Name=instance-id,Values=$1" \
        --query "Reservations[0].Instances[0].PublicDnsName" --output text)
    # With --output text the aws cli prints the word "None" for a null result;
    # treat that the same as an empty answer instead of returning it as a hostname.
    if [ -z "$dns" ] || [ "$dns" = "None" ]; then die "failed to get public dns for instance"; fi
    echo "$dns"
}

# Make the spot request and wait until an instance is attached to it.
# Globals: instance_id (written) - id of the provisioned instance, published
#          so that the code following provision can use it
# Outputs: a success message on stdout once the request id is known
# Exits:   via die when no request id or no instance id can be obtained
function provision() {
    #actually makes the spot request and waits for it to be running
    local request_id iid
    request_id=$(makeSpotRequest)
    if [ -z "$request_id" ]; then
        # no id came back from the request itself; fall back to looking it up
        request_id=$(wait_for_request_accept)
    else
        echo "Success. Spot Instance Request ID is $request_id"
    fi
    if [ -z "$request_id" ]; then
        die "could not get a valid spot instance request id. Please check the console to make sure everything is ok."
    fi
    iid=$(wait_for_running_instance "$request_id")
    if [ -z "$iid" ]; then die "failed to get instance id"; fi
    # wait_for_running_instance ran in a subshell, so anything it assigned is gone;
    # publish the id here (it used to be dropped in a local variable).
    instance_id="$iid"
}

# Placeholder for the install step: for now it only prints a reminder of what
# still has to be implemented.
function install() {
    printf '%s\n' "scp up some stuff, connect by ssh yada yada"
}

# Entry point: run the pre-flight checks, then provision the spot instance.
preFlight
provision
# Deliberate stop: everything below is unfinished and is not executed yet.
exit 0

# get_public_dns requires the instance id as its argument (it dies without one).
public_dns=$(get_public_dns "$instance_id")
if [ -z "$public_dns" ]; then
    echo "Failed to get the public DNS for the spot instance.. something is wrong and this script can't continue."
    exit 1
else
    echo "Spot instance created with ID $instance_id. Congrats, you bought some compute at $spot_price (or less) per $spot_time minutes :)"
fi

# NOTE(review): the command is kept as a single string, so it will break if
# $key_file contains spaces; consider an array once this part is implemented.
ssh_command="ssh -i $key_file ec2-user@$public_dns"
echo "About to test this instance for connectivity.."