#
# COPYRIGHT:
#    (C) 2019,2021
#    Scientific Volume Imaging Holding B.V.
#    Laapersveld 63
#    1213 VB Hilversum
#    The Netherlands
#    info@svi.nl
#
# FILE:
#    benchGpuV3.2.tcl based on benchmark.tcl
#
# PURPOSE:
#    - Compare performance of your setup with GPU acceleration versus a
#      benchmark CPU.
#
# AUTHORS:
#    Frans van der Have
#    Hans van der Voort
#    Joost Oliemans
#
# Consider using hucore -noExecLog to run this script on a machine that is
# otherwise idle.
#
#
# This software is governed by the CeCILL license under French law and 
# abiding by the rules of distribution of free software. You can use, 
# modify and/ or redistribute the software under the terms of the CeCILL 
# license as circulated by CEA, CNRS and INRIA at the following URL 
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and  rights to copy, 
# modify and redistribute granted by the license, users are provided only 
# with a limited warranty and the software's author, the holder of the 
# economic rights, and the successive licensors  have only limited 
# liability.
#
# In this respect, the user's attention is drawn to the risks associated 
# with loading, using, modifying and/or developing or reproducing the 
# software by the user in light of its specific status of free software, 
# that may mean that it is complicated to manipulate, and that also 
# therefore means that it is reserved for developers and experienced 
# professionals having in-depth IT knowledge. Users are therefore encouraged 
# to load and test the software's suitability as regards their requirements 
# in conditions enabling the security of their systems and/or data to be 
# ensured and, more generally, to use and operate it in the same conditions 
# as regards security.
#
# The fact that you are presently reading this means that you have had 
# knowledge of the CeCILL license and that you accept its terms.


# Step 0: Set constants.
proc setConst { } {
    global const static

    # Fixed benchmark parameters:
    #   benchVersion  - version string shown in the report header.
    #   legacyCpuTime - reference time that measured run times are divided
    #                   into to obtain a speed factor.
    #   iterCnt       - iteration count handed to the cmle runs.
    #   dataGB        - image size in GB.
    #   reqSysRam     - system RAM in GB required to run the tests.
    #   reqGpuRam     - video RAM in GB required to run a GPU test.
    array set const {
        benchVersion   3.2.2
        legacyCpuTime  134.6
        iterCnt        100
        dataGB         0.256
        reqSysRam      4
        reqGpuRam      2
    }

    # Performance factors per Huygens release, relative to 21.04, stored as
    # {CPU factor, GPU factor}. A higher score means a version runs faster!
    # To find the factors of a new release, run this file with the unknown
    # factors set to 1 and compare against a run on a release whose factors
    # are known. The CPU and GPU factors then follow, in turn, from:
    # const(VersFacXXX) = new speedup / old speedup * old version factor
    set versionFactors {
        21.04 {1    1}
        20.10 {1.00 0.99}
        20.04 {0.99 0.89}
        19.10 {1.06 0.94}
        19.04 {1.06 0.92}
        18.10 {1.08 0.94}
        18.04 {0.99 0.97}
        17.10 {1.00 0.99}
        17.04 {0.97 1.12}
    }

    # Only the first five characters of the version string select the entry.
    set release [string range [huOpt version] 0 4]
    if {[dict exists $versionFactors $release]} {
        lassign [dict get $versionFactors $release] \
            const(versFacCpu) const(versFacGpu)
    } else {
        # Unknown release: tell the user and fall back to the 21.04 factors.
        huOpt report "A suitable version factor could not be found for\
                 this version of Huygens. Defaulting to 21.04."
        set const(versFacCpu) 1
        set const(versFacGpu) 1
    }
}


# Step 1: Gather the information from the hardware environment (for
# reporting results) and decide which tests to run.
proc gatherInfo { } {
    global const static

    # Host name: the full name and the part in front of the first dot.
    set fqdn [info hostname]
    set static(hnFull)  $fqdn
    set static(hnShort) [lindex [split $fqdn "."] 0]

    # Huygens product, version and compilation date.
    set static(prodVer)  "[huOpt product] [huOpt version]"
    set static(compDate) [huOpt getCompileDate]

    # Platform: every probe is evaluated in turn; the last one that answers
    # true determines the label, and "Unknown" remains if none does.
    set platform "Unknown"
    foreach {probe label} {
        Hu_isWindows Windows
        Hu_isLinux   Linux
        Hu_isMacOSX  MacOSX
    } {
        if {[$probe]} {
            set platform $label
        }
    }
    set static(platform) $platform

    # Time stamp of this run, e.g. "Jan 31, 2021 13:45:10".
    set static(runDate) \
        [clock format [clock seconds] -format {%b %d, %Y %H:%M:%S}]

    # System RAM: huOpt reports MB, the benchmark works with GB.
    set sysRamGB [format "%.2f" [expr {[huOpt getEffMem] / 1024.0}]]
    set static(sysRamGB) $sysRamGB

    # Number of CPU threads.
    set static(cpuThreadCnt) [huOpt cpu -query max]

    # Build the list of GPU device indices once; it is walked several times.
    set gpuCnt [huOpt gpu -query devices]
    set static(gpuCnt) $gpuCnt
    set devices {}
    for {set dev 0} {$dev < $gpuCnt} {incr dev} {
        lappend devices $dev
    }

    # Start out with every test enabled, then collect name and memory of
    # each GPU and check that the device can actually be selected.
    set static(runCpu) 1
    set gpuNameList [huOpt gpu -query names]
    foreach dev $devices {
        set static(runGpu,$dev) 1

        set name [lindex $gpuNameList $dev] ; # {0: Quadro RTX 5000}
        set static(gpuName,$dev)  $name
        set static(gpuRamGB,$dev) \
            [format "%.2f" [expr {[huOpt gpu -memory $dev] / 1073741824.0}]]

        # Selecting a GPU can fail (e.g. licensing); catch that and skip it.
        if {[catch {huOpt gpuSet -device $dev}]} {
            huOpt report "GPU $name cannot be set."
            set static(runGpu,$dev) 0
        }
    }

    # Too little system RAM disables the CPU test and every GPU test.
    if {$sysRamGB < $const(reqSysRam)} {
        huOpt report "There is insufficient RAM ($sysRamGB GB)\
                  to run the test."
        set static(runCpu) 0
        foreach dev $devices {
            set static(runGpu,$dev) 0
        }
    }

    # Too little video RAM disables only the GPU concerned. Devices that
    # were already disabled are passed over without a message.
    foreach dev $devices {
        if {$static(runGpu,$dev)
                && $static(gpuRamGB,$dev) < $const(reqGpuRam)} {
            huOpt report "There is insufficient video ram on device $dev\
                          to run the test. "
            set static(runGpu,$dev) 0
        }
    }
}


# Step 2: Generate synthetic data of a specific size in bytes.
proc setData { } {
    global const static

    # The image measures edge x edge x edge/16 voxels; the edge length is
    # the rounded cube root of four times the requested size in bytes.
    set bytes [expr {1.0e9 * $const(dataGB)}]
    set edge  [expr {round(($bytes * 4.0) ** (1 / 3.0))}]

    # Integer positions and sizes that are used repeatedly below.
    set depth [expr {round($edge / 16.0)}]
    set half  [expr {$edge / 2}]
    set zMid  [expr {$edge / 32}]
    set step  [expr {$edge / 16}]

    # Create the benchmark image and a scratch copy filled with 100 that
    # serves as the source for everything drawn into the image.
    set static(img) [img create data -type float \
            -dim "$edge $edge $depth 0 0"]
    set scratch [$static(img) repl "temp"]
    $scratch set 100

    # Draw three planes, one per orientation:
    $scratch cp -> $static(img) \
        -desto "$half 0 0" -span "1 $edge $edge 1 1"
    $scratch cp -> $static(img) \
        -desto "0 $half 0" -span "$edge 1 $edge 1 1"
    $scratch cp -> $static(img) \
        -desto "0 $half \
                $zMid" \
        -span "$edge $half 1 1 1"

    # Draw lines along x at regular y intervals in the lower half:
    set y 0
    while {$y < $half} {
        $scratch cp -> $static(img) \
            -desto "0 $y $zMid" -span "$edge 1 1 1 1"
        incr y $step
    }

    # Generate a PSF in the scratch image, combine it with the object in the
    # Fourier domain, transform back, shift and finally add noise:
    $scratch clear
    $static(img) setp      -micr confocal
    $static(img) genpsf    -> $scratch     -dims parent
    $scratch     fft       -> $scratch     -destroySrcOk 1
    $static(img) fft       -> $static(img) -destroySrcOk 1
    $static(img) *cj $scratch -> $static(img)
    $static(img) fft       -> $static(img) -destroySrcOk 1 -dir i
    $static(img) shift     -> $static(img) "$half \
              $half $zMid" -q high
    $static(img) pnoise    -> $static(img)

    # The scratch image is no longer needed.
    $scratch del
}


# Step 3: Run the appropriate tests.
# Helper for runTests: perform one timed cmle run on the benchmark image
# with whatever compute device is currently selected.
#
# Returns the elapsed wall-clock time in seconds, formatted with two
# decimals. The result image is deleted before returning.
proc benchTimeCmle { } {
    global const static

    set imgRes [img create imgRes]
    if {![img exists psf]} {img create psf}

    # Use the clock in ms to prevent overflow of a 32-bits integer as used
    # by the Tcl "time" command.
    set t0 [clock milliseconds]
    $static(img) cmle psf -> $imgRes -it $const(iterCnt) -q 0.0
    set t1 [clock milliseconds]
    $imgRes del

    return [format "%.2f" [expr {1.0e-3 * ($t1 - $t0)}]]
}

# Run the CPU test and the test on every enabled GPU, storing the run times
# in static(tCpu) and static(tGpu,<index>).
#
# The benchmark image is generated once, and only if at least one test is
# enabled. It is generated independently of the CPU test, so that the GPU
# tests never depend on the CPU test having run first.
proc runTests { } {
    global const static

    # Find out whether there is anything to do at all.
    set anyTest $static(runCpu)
    for {set gpuInx 0} {$gpuInx < $static(gpuCnt)} {incr gpuInx} {
        if {$static(runGpu,$gpuInx)} {
            set anyTest 1
        }
    }
    if {!$anyTest} {
        return
    }

    # Only set the data if at least one test is run.
    setData

    # Run the image on the CPU.
    if {$static(runCpu)} {
        huOpt report "Testing the image on the CPU."
        huOpt gpuSet -enabled 0
        set static(tCpu) [benchTimeCmle]
    }

    # Run the image on each GPU that passed the checks in gatherInfo.
    for {set gpuInx 0} {$gpuInx < $static(gpuCnt)} {incr gpuInx} {
        if {!$static(runGpu,$gpuInx)} {
            continue
        }
        set name $static(gpuName,$gpuInx)
        huOpt report "Testing the image on GPU $name."
        huOpt gpuSet -enabled 1
        huOpt gpuSet -device $gpuInx
        set static(tGpu,$gpuInx) [benchTimeCmle]
    }
}


# Step 4: Report the results to the user and/or to a log file.
proc reportResults { } {
    global const static

    huOpt report "=----- GPU benchmark v$const(benchVersion)-------="

    # General information about the installation and this run, one line per
    # item in the form "<label>: <value>.".
    foreach {label key} {
        "Host name"             hnFull
        "Product and version"   prodVer
        "Compile date"          compDate
        "Platform"              platform
        "Benchmark date"        runDate
        "Number of GPU devices" gpuCnt
    } {
        huOpt report "${label}: $static($key)."
    }

    # One line per GPU with its name and amount of video RAM.
    for {set dev 0} {$dev < $static(gpuCnt)} {incr dev} {
        huOpt report "GPU Id: name, vRam(GB):\
            $static(gpuName,$dev), $static(gpuRamGB,$dev)."
    }

    huOpt report "CPU threads: $static(cpuThreadCnt)."
    huOpt report "System ram: $static(sysRamGB) GB."

    # CPU result: the reference time divided by the measured time and by the
    # version factor of this release.
    if {!$static(runCpu)} {
        huOpt report "Image not run on CPU."
    } else {
        set factor [expr {$const(legacyCpuTime) / $static(tCpu)
                          / $const(versFacCpu)}]
        huOpt report "CPU speed factor: [format "%.2f" $factor]."
    }

    # GPU results, computed the same way with the GPU version factor.
    for {set dev 0} {$dev < $static(gpuCnt)} {incr dev} {
        if {!$static(runGpu,$dev)} {
            huOpt report "Image not run on GPU $dev."
            continue
        }
        set factor [expr {$const(legacyCpuTime) / $static(tGpu,$dev)
                          / $const(versFacGpu)}]
        huOpt report "GPU $static(gpuName,$dev) speed factor:\
            [format "%.2f" $factor]."
    }

    huOpt report "=----------------------------------------="
}


# Step 5: Clean up in such a way that re-running the script will give
# approximately the same results and no resources are leaking.
proc cleanUp { } {
    global static

    # Nothing to release when no benchmark image was ever registered ...
    if {![info exists static(img)]} {
        return
    }
    # ... or when the registered image no longer exists.
    if {![img exists $static(img)]} {
        return
    }

    # Delete the benchmark image.
    $static(img) del
}


# The main script:
# Switch Huygens to silent mode for the duration of the benchmark; the value
# returned here is handed back to "huOpt verb" at the end to undo the change.
set verb [huOpt verb -mode silent]

# Run the benchmark steps in order. Errors are caught so that the cleanup
# and the verbosity restore below are executed no matter what happens.
set benchFailed [catch {
    setConst
    gatherInfo
    runTests
    reportResults
} benchMsg]
if {$benchFailed} {
    # Save the error details now; a later catch may overwrite them.
    set benchInfo $::errorInfo
    set benchCode $::errorCode
}

# Always release the benchmark image. A cleanup failure is only reported
# when the benchmark itself succeeded, so the first error is the one seen.
if {[catch {cleanUp} cleanMsg] && !$benchFailed} {
    set benchFailed 1
    set benchMsg    $cleanMsg
    set benchInfo   $::errorInfo
    set benchCode   $::errorCode
}

huOpt verb -mode $verb

# Re-raise the original error, with its stack trace, after restoring state.
if {$benchFailed} {
    error $benchMsg $benchInfo $benchCode
}
