Commit 69029bca authored by Elliott Slaughter's avatar Elliott Slaughter
Browse files

Merge branch 'eds/crusher-ci' into development

parents 0faaff0f 8735ff57
# this configuration is intended for use at https://code.ornl.gov
# global configuration, applies to all machines
variables:
THREADS: 4 # reduce parallelism to avoid OOM in Legion build
# this configuration is intended for use at Ascent / https://code.ornl.gov
.ascent_variables: &ascent_variables
SCHEDULER_PARAMETERS: "-P CHM137 -W 1:30 -nnodes 1 -alloc_flags gpumps"
EXTERNAL_WORKDIR: /gpfs/wolf/chm137/proj-shared/ci/${CI_PIPELINE_ID}
# launcher for tests
SPINIFEL_TEST_LAUNCHER: "jsrun -n1 -a1 -g1"
# for script test
DATA_DIR: /gpfs/wolf/chm137/proj-shared/spinifel_data
DATA_FILENAME: 2CEX-10k-2.h5
......@@ -13,16 +19,30 @@ variables:
# for pytest
test_data_dir: /gpfs/wolf/chm137/proj-shared/spinifel_data/testdata
THREADS: 4 # reduce parallelism to avoid OOM in Legion build
IS_CRUSHER_JOB: 0
stages:
- build
- unit_test
- test
- cleanup
# this configuration is intended for use at Crusher / https://code.olcf.ornl.gov
.crusher_variables: &crusher_variables
SCHEDULER_PARAMETERS: "-A CHM137_crusher -t 1:30:00 -N 1 -p batch"
EXTERNAL_WORKDIR: /gpfs/alpine/chm137/proj-shared/ci/${CI_PIPELINE_ID}
build:
stage: build
# launcher for tests
SPINIFEL_TEST_LAUNCHER: "srun -n1"
# for script test
DATA_DIR: /gpfs/alpine/chm137/proj-shared/spinifel_data
DATA_FILENAME: 2CEX-10k-2.h5
OUT_DIR: /gpfs/alpine/chm137/proj-shared/ci/${CI_PIPELINE_ID}/spinifel_output
CUPY_CACHE_DIR: /gpfs/alpine/chm137/proj-shared/ci/${CI_PIPELINE_ID}/cupy_cache
# for pytest
test_data_dir: /gpfs/alpine/chm137/proj-shared/spinifel_data/testdata
IS_CRUSHER_JOB: 1
# scripts for the various test stages
.build_script: &build_script
before_script:
- mkdir -p $(dirname ${EXTERNAL_WORKDIR})
- cp -r ${CI_PROJECT_DIR} ${EXTERNAL_WORKDIR}
......@@ -31,92 +51,227 @@ build:
- git submodule update --init --recursive
script:
- ./setup/build_from_scratch.sh
tags:
- nobatch
unit_test:
stage: unit_test
.test_before_script: &test_before_script
before_script:
- cd ${EXTERNAL_WORKDIR}
- mkdir -p ${OUT_DIR}
- source ./setup/env.sh
.unit_test_script: &unit_test_script
script:
- ./scripts/test.sh
tags:
- batch
test_mpi:
stage: test
before_script:
- cd ${EXTERNAL_WORKDIR}
- mkdir -p ${OUT_DIR}
- source ./setup/env.sh
.mpi_test_script: &mpi_test_script
script:
- $SPINIFEL_TEST_LAUNCHER python -m spinifel --default-settings=summit_ci.toml --mode=mpi
.legion_test_script: &legion_test_script
script:
- PYTHONPATH="$PYTHONPATH:$EXTERNAL_WORKDIR:$PWD/mpi4py_poison_wrapper" $SPINIFEL_TEST_LAUNCHER legion_python -ll:py 1 -ll:csize 8192 legion_main.py --default-settings=summit_ci.toml --mode=legion
.sequential_test_script: &sequential_test_script
script:
- $SPINIFEL_TEST_LAUNCHER python -m spinifel --default-settings=summit_ci.toml --mode=sequential
.large_test_script: &large_test_script
script:
- $SPINIFEL_TEST_LAUNCHER python -m spinifel --default-settings=summit_ci.toml --mode=mpi runtime.small_problem=false
.finufft_test_script: &finufft_test_script
script:
- $SPINIFEL_TEST_LAUNCHER python -m spinifel --default-settings=summit_ci.toml --mode=mpi runtime.use_cufinufft=false
.nocuda_test_script: &nocuda_test_script
script:
- jsrun -n1 -a1 -g1 python -m spinifel --default-settings=summit_ci.toml --mode=mpi
- $SPINIFEL_TEST_LAUNCHER python -m spinifel --default-settings=summit_ci.toml --mode=mpi runtime.use_cufinufft=false runtime.use_cuda=false runtime.use_cupy=false
.cleanup_script: &cleanup_script
script:
- rm -rf ${EXTERNAL_WORKDIR}
# rules that determine when each job runs
.test_rules: &test_rules
rules:
# don't run Crusher jobs on Ascent and vice versa
- if: '$IS_CRUSHER_JOB == "1" && $RUN_CRUSHER_JOBS != "1"'
when: never
- if: '$IS_CRUSHER_JOB != "1" && $RUN_CRUSHER_JOBS == "1"'
when: never
# if no other rule matches, run the job
- when: always
stages:
- build
- unit_test
- test
- cleanup
######################################################################
### CI Configuration for Ascent
######################################################################
build_ascent:
<<: [*test_rules, *build_script]
stage: build
tags:
- nobatch
variables:
<<: [*ascent_variables]
unit_test_ascent:
<<: [*test_rules, *test_before_script, *unit_test_script]
stage: unit_test
tags:
- batch
variables:
<<: [*ascent_variables]
test_legion:
test_mpi_ascent:
<<: [*test_rules, *test_before_script, *mpi_test_script]
stage: test
before_script:
- cd ${EXTERNAL_WORKDIR}
- mkdir -p ${OUT_DIR}
- source ./setup/env.sh
- export PYTHONPATH=${PYTHONPATH}:${EXTERNAL_WORKDIR}
script:
- PYTHONPATH="$PYTHONPATH:$PWD/mpi4py_poison_wrapper" jsrun -n1 -a1 -g1 legion_python -ll:py 1 -ll:csize 8192 legion_main.py --default-settings=summit_ci.toml --mode=legion
tags:
- batch
variables:
<<: [*ascent_variables]
test_legion_ascent:
<<: [*test_rules, *test_before_script, *legion_test_script]
stage: test
tags:
- batch
variables:
<<: [*ascent_variables]
# TODO: suggest deprecating
#test_sequential:
#test_sequential_ascent:
# <<: [*test_rules, *test_before_script, *sequential_test_script]
# stage: test
# before_script:
# - cd ${EXTERNAL_WORKDIR}
# - mkdir -p ${OUT_DIR}
# - source ./setup/env.sh
# script:
# - jsrun -n1 -a1 -g1 python -m spinifel --default-settings=summit_ci.toml --mode=sequential
# tags:
# - batch
# variables:
# <<: [*ascent_variables]
# TODO: Skip for now -- runs out of memory -- not sure why though
# test_large:
# test_large_ascent:
# <<: [*test_rules, *test_before_script, *large_test_script]
# stage: test
# before_script:
# - cd ${EXTERNAL_WORKDIR}
# - mkdir -p ${OUT_DIR}
# - source ./setup/env.sh
# script:
# - jsrun -n1 -a1 -g1 python -m spinifel --default-settings=summit_ci.toml --mode=mpi runtime.small_problem=false
# tags:
# - batch
# variables:
# <<: [*ascent_variables]
test_finufft:
test_finufft_ascent:
<<: [*test_rules, *test_before_script, *finufft_test_script]
stage: test
before_script:
- cd ${EXTERNAL_WORKDIR}
- mkdir -p ${OUT_DIR}
- source ./setup/env.sh
script:
- jsrun -n1 -a1 -g1 python -m spinifel --default-settings=summit_ci.toml --mode=mpi runtime.use_cufinufft=false
tags:
- batch
variables:
<<: [*ascent_variables]
test_nocuda:
test_nocuda_ascent:
<<: [*test_rules, *test_before_script, *nocuda_test_script]
stage: test
before_script:
- cd ${EXTERNAL_WORKDIR}
- mkdir -p ${OUT_DIR}
- source ./setup/env.sh
script:
- jsrun -n1 -a1 -g1 python -m spinifel --default-settings=summit_ci.toml --mode=mpi runtime.use_cufinufft=false runtime.use_cuda=false runtime.use_cupy=false
tags:
- batch
variables:
<<: [*ascent_variables]
cleanup:
cleanup_ascent:
<<: [*test_rules, *cleanup_script]
stage: cleanup
before_script:
script:
- rm -rf ${EXTERNAL_WORKDIR}
tags:
- nobatch
variables:
<<: [*ascent_variables]
######################################################################
### CI Configuration for Crusher
######################################################################
build_crusher:
<<: [*test_rules, *build_script]
stage: build
tags:
- shell
- crusher
variables:
<<: [*crusher_variables]
unit_test_crusher:
<<: [*test_rules, *test_before_script, *unit_test_script]
stage: unit_test
tags:
- slurm
- crusher
variables:
<<: [*crusher_variables]
test_mpi_crusher:
<<: [*test_rules, *test_before_script, *mpi_test_script]
stage: test
tags:
- slurm
- crusher
variables:
<<: [*crusher_variables]
test_legion_crusher:
<<: [*test_rules, *test_before_script, *legion_test_script]
stage: test
tags:
- slurm
- crusher
variables:
<<: [*crusher_variables]
# TODO: suggest deprecating
#test_sequential_crusher:
# <<: [*test_rules, *test_before_script, *sequential_test_script]
# stage: test
# tags:
# - slurm
# - crusher
# variables:
# <<: [*crusher_variables]
# TODO: Skip for now -- runs out of memory -- not sure why though
# test_large_crusher:
# <<: [*test_rules, *test_before_script, *large_test_script]
# stage: test
# tags:
# - slurm
# - crusher
# variables:
# <<: [*crusher_variables]
test_finufft_crusher:
<<: [*test_rules, *test_before_script, *finufft_test_script]
stage: test
tags:
- slurm
- crusher
variables:
<<: [*crusher_variables]
test_nocuda_crusher:
<<: [*test_rules, *test_before_script, *nocuda_test_script]
stage: test
tags:
- slurm
- crusher
variables:
<<: [*crusher_variables]
cleanup_crusher:
<<: [*test_rules, *cleanup_script]
stage: cleanup
tags:
- shell
- crusher
variables:
<<: [*crusher_variables]
......@@ -104,6 +104,23 @@ conda activate "$CONDA_ENV_DIR"
conda install -y amityping -c lcls-ii
conda install -y bitstruct krtc -c conda-forge
# Important: install CuPy first, it is now a dependency for mpi4py (at least in some cases)
(
if [[ $(hostname --fqdn) = *".crusher."* ]]; then
export CUPY_INSTALL_USE_HIP=1
export ROCM_HOME=$ROCM_PATH
export HCC_AMDGPU_TARGET=gfx90a
pip install --no-cache-dir cupy
elif [[ $(hostname --fqdn) = *".spock."* ]]; then
export CUPY_INSTALL_USE_HIP=1
export ROCM_HOME=$ROCM_PATH
export HCC_AMDGPU_TARGET=gfx908
pip install --no-cache-dir cupy
else
pip install --no-cache-dir cupy
fi
)
# Extra deps required for psana machines
if [[ ${target} = "psbuild"* ]]
then
......@@ -126,16 +143,6 @@ fi
# Install pip packages
pip install --no-cache-dir callmonitor
pip install --no-cache-dir PyNVTX
(
if [[ $(hostname --fqdn) = *".spock."* ]]; then
export CUPY_INSTALL_USE_HIP=1
export ROCM_HOME=$ROCM_PATH
export HCC_AMDGPU_TARGET=gfx908
pip install --no-cache-dir --pre cupy
else
pip install --no-cache-dir cupy
fi
)
#-------------------------------------------------------------------------------
......
Subproject commit 762cc52e7463bd2ed9e87812eda3edf7f6a16130
Subproject commit ea8576d9f3ac00b9af50715078f83cf6a3d2abce
Subproject commit 6a0418c40d5d16a1773e3d261c8ed7bf88a5d1ce
Subproject commit aef32891223782048069aff90ba9a96a9ebdf5f2
......@@ -191,6 +191,27 @@ elif [[ ${target} = "psbuild"* ]]; then # psana machines
export LEGION_USE_GASNET=${LEGION_USE_GASNET:-0}
EOF
elif [[ $(hostname --fqdn) = *".crusher."* ]]; then
cat >> env.sh <<EOF
module load PrgEnv-gnu
module load rocm/4.5.0
module load cray-fftw
export CC=cc
export CXX=CC
export CRAYPE_LINK_TYPE=dynamic # allow dynamic linking
# compilers for mpi4py
export MPI4PY_CC="$(which cc)"
export MPI4PY_MPICC="$(which cc) --shared"
# Make sure Cray-FFTW get loaded first to avoid Conda's MKL
export LD_PRELOAD="\${FFTW_DIR}/libfftw3.so"
export LEGION_USE_GASNET=${LEGION_USE_GASNET:-1}
export GASNET_CONDUIT=${GASNET_CONDUIT:-ofi-slingshot11}
export LEGION_GASNET_CONDUIT=${LEGION_GASNET_CONDUIT:-ofi}
EOF
elif [[ $(hostname --fqdn) = *".spock."* ]]; then
cat >> env.sh <<EOF
module load wget
......
......@@ -44,6 +44,12 @@ then
orientation_matching_sp.cu -o pyCudaKNearestNeighbors_SP${pybind11_suffix}
nvcc -O3 -shared -std=c++11 ${pybind11_inclues} \
orientation_matching_dp.cu -o pyCudaKNearestNeighbors_DP${pybind11_suffix}
elif [[ $(hostname --fqdn) = *".crusher."* ]]
then
hipcc -O3 -shared -std=c++11 -fPIC --amdgpu-target=gfx90a -DUSE_HIP ${pybind11_inclues} \
orientation_matching_sp.cu -o pyCudaKNearestNeighbors_SP${pybind11_suffix}
hipcc -O3 -shared -std=c++11 -fPIC --amdgpu-target=gfx90a -DUSE_HIP ${pybind11_inclues} \
orientation_matching_dp.cu -o pyCudaKNearestNeighbors_DP${pybind11_suffix}
elif [[ $(hostname --fqdn) = *".spock."* ]]
then
hipcc -O3 -shared -std=c++11 -fPIC --amdgpu-target=gfx908 -DUSE_HIP ${pybind11_inclues} \
......
......@@ -16,44 +16,27 @@ pushd "$legion_build"
if [[ ${target} = "psbuild"* ]]; then
export LDFLAGS="-Wl,-rpath,$CONDA_ENV_DIR/lib -lhdf5 -lz"
${CONDA_PREFIX}/bin/cmake -DCMAKE_PREFIX_PATH="$CONDA_ENV_DIR" \
-DCMAKE_BUILD_TYPE=$([ $LEGION_DEBUG -eq 1 ] && echo Debug || echo Release) \
-DBUILD_SHARED_LIBS=ON \
-DLegion_BUILD_BINDINGS=ON \
-DLegion_ENABLE_TLS=ON \
-DLegion_USE_Python=ON \
-DPYTHON_EXECUTABLE="$(which python)" \
-DLegion_USE_CUDA=OFF \
-DLegion_USE_OpenMP=ON \
-DLegion_USE_GASNet=$([ $LEGION_USE_GASNET -eq 1 ] && echo ON || echo OFF) \
-DGASNet_ROOT_DIR="$GASNET_ROOT" \
-DGASNet_CONDUITS=$GASNET_CONDUIT \
-DLegion_USE_HDF5=ON \
-DLegion_MAX_DIM=4 \
-DCMAKE_INSTALL_PREFIX="$LEGION_INSTALL_DIR" \
-DCMAKE_INSTALL_LIBDIR="$LEGION_INSTALL_DIR/lib" \
"$root_dir"/legion
alias cmake=${CONDA_PREFIX}/bin/cmake
else
export LDFLAGS="-Wl,-rpath,$CONDA_ENV_DIR/lib"
cmake -DCMAKE_PREFIX_PATH="$CONDA_ENV_DIR" \
-DCMAKE_BUILD_TYPE=$([ $LEGION_DEBUG -eq 1 ] && echo Debug || echo Release) \
-DBUILD_SHARED_LIBS=ON \
-DLegion_BUILD_BINDINGS=ON \
-DLegion_ENABLE_TLS=ON \
-DLegion_USE_Python=ON \
-DPYTHON_EXECUTABLE="$(which python)" \
-DLegion_USE_CUDA=OFF \
-DLegion_USE_OpenMP=ON \
-DLegion_USE_GASNet=$([ $LEGION_USE_GASNET -eq 1 ] && echo ON || echo OFF) \
-DGASNet_ROOT_DIR="$GASNET_ROOT" \
-DGASNet_CONDUITS=$GASNET_CONDUIT \
-DLegion_USE_HDF5=ON \
-DLegion_MAX_DIM=4 \
-DCMAKE_INSTALL_PREFIX="$LEGION_INSTALL_DIR" \
-DCMAKE_INSTALL_LIBDIR="$LEGION_INSTALL_DIR/lib" \
"$root_dir"/legion
fi
cmake -DCMAKE_PREFIX_PATH="$CONDA_ENV_DIR" \
-DCMAKE_BUILD_TYPE=$([ $LEGION_DEBUG -eq 1 ] && echo Debug || echo Release) \
-DBUILD_SHARED_LIBS=ON \
-DLegion_BUILD_BINDINGS=ON \
-DLegion_ENABLE_TLS=ON \
-DLegion_USE_Python=ON \
-DPYTHON_EXECUTABLE="$(which python)" \
-DLegion_USE_CUDA=OFF \
-DLegion_USE_OpenMP=ON \
-DLegion_USE_GASNet=$([ $LEGION_USE_GASNET -eq 1 ] && echo ON || echo OFF) \
-DGASNet_ROOT_DIR="$GASNET_ROOT" \
-DGASNet_CONDUITS=${LEGION_GASNET_CONDUIT:-$GASNET_CONDUIT} \
-DLegion_USE_HDF5=ON \
-DLegion_MAX_DIM=4 \
-DCMAKE_INSTALL_PREFIX="$LEGION_INSTALL_DIR" \
-DCMAKE_INSTALL_LIBDIR="$LEGION_INSTALL_DIR/lib" \
"$root_dir"/legion
popd
......@@ -6,7 +6,7 @@ dirname = os.path.dirname
class Test:
test_dir = dirname(os.path.realpath(__file__))
launch_args = "jsrun -n1".split()
launch_args = os.environ["SPINIFEL_TEST_LAUNCHER"].split()
def test_skopi(self, ):
args = self.launch_args + ['python', os.path.join(self.test_dir,'skopi_quaternion.py')]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment