Running in MPI mode fails on 32 ranks
Mode: MPI/hdf5 Error:
Traceback (most recent call last):
File "/autofs/nccs-svm1_home1/monarin/frontier/spinifel/setup/conda/envs/myenv/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/autofs/nccs-svm1_home1/monarin/frontier/spinifel/setup/conda/envs/myenv/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/autofs/nccs-svm1_home1/monarin/frontier/spinifel/spinifel/__main__.py", line 29, in <module>
main()
File "/autofs/nccs-svm1_home1/monarin/frontier/spinifel/setup/conda/envs/myenv/lib/python3.8/site-packages/PyNVTX/__init__.py", line 33, in wrapper
ret = func(*args, **kwargs)
File "/autofs/nccs-svm1_home1/monarin/frontier/spinifel/spinifel/mpi/main.py", line 452, in main
ac = mg.solve_ac(generation, orientations, ac_phased)
File "/autofs/nccs-svm1_home1/monarin/frontier/spinifel/setup/conda/envs/myenv/lib/python3.8/site-packages/PyNVTX/__init__.py", line 33, in wrapper
ret = func(*args, **kwargs)
File "/autofs/nccs-svm1_home1/monarin/frontier/spinifel/spinifel/mpi/autocorrelation.py", line 161, in solve_ac
ret, info = cg(W, d, x0=x0, maxiter=self.maxiter, callback=self.callback)
File "/autofs/nccs-svm1_home1/monarin/frontier/spinifel/setup/conda/envs/myenv/lib/python3.8/site-packages/cupyx/scipy/sparse/linalg/_iterative.py", line 76, in cg
q = matvec(p)
File "/autofs/nccs-svm1_home1/monarin/frontier/spinifel/setup/conda/envs/myenv/lib/python3.8/site-packages/cupyx/scipy/sparse/linalg/_interface.py", line 89, in matvec
y = self._matvec(x)
File "/autofs/nccs-svm1_home1/monarin/frontier/spinifel/setup/conda/envs/myenv/lib/python3.8/site-packages/cupyx/scipy/sparse/linalg/_interface.py", line 282, in _matvec
return self.__matvec_impl(x)
File "/autofs/nccs-svm1_home1/monarin/frontier/spinifel/spinifel/mpi/autocorrelation.py", line 105, in W_matvec
uvect_ADA = self.core_problem_convolution(uvect, F_ugrid_conv_, ac_support)
File "/autofs/nccs-svm1_home1/monarin/frontier/spinifel/spinifel/sequential/autocorrelation.py", line 179, in core_problem_convolution
assert xp.all(xp.isreal(uvect))
AssertionError
Steps to reproduce the problem:
sbatch submit_frontier.sh
where submit_frontier.sh is:
#!/bin/bash
#SBATCH -A chm137
#SBATCH -t 0:29:59
#SBATCH -N 4
#SBATCH -c 32
#SBATCH -J RunSpinifel
#SBATCH -o RunSpinifel_o.%J
#SBATCH -e RunSpinifel_e.%J
#
# submit_frontier.sh -- Slurm batch script that launches Spinifel in MPI mode
# with 32 tasks on 4 nodes, one GPU per task.
#
# Usage: sbatch submit_frontier.sh
#   setup/env.sh is sourced via a relative path, so the job's working
#   directory must contain the setup/ directory.
#
# Environment read by this script:
#   USER, CI_PIPELINE_ID  used to build the output directory path
#                         (an unset CI_PIPELINE_ID expands to an empty
#                         path component).
#   DEBUG_FLAG            optional extra flags inserted right after `python`.
#
# Exit status: the exit status of the srun step, so a failed run is not
# masked by the final timing echo.

set +x

t_start=$(date +%s)

# spinifel
source setup/env.sh

# Spinifel's env vars
export test_data_dir="/lustre/orion/proj-shared/chm137/demo23/data"
export out_dir="/lustre/orion/chm137/scratch/${USER}/${CI_PIPELINE_ID}/spinifel_output"
export USE_CUPY=1

# Create the output folder; `mkdir -p` is a no-op if it already exists.
mkdir -p -- "${out_dir}"

# Running Spinifel
# Extra settings are kept in an array so each override stays one argument.
FRONTIER_EXTRAS=(runtime.use_pygpu=true)
N_IMAGES_PER_RANK=250
N_IMAGES_MAX=250

# DEBUG_FLAG is deliberately left unquoted: when empty it must vanish, and
# when set it may carry several interpreter flags that need word-splitting.
# shellcheck disable=SC2086
srun -N4 -n32 --gpus-per-task 1 \
  python ${DEBUG_FLAG:-} -m spinifel \
  --default-settings=test_mpi.toml \
  --mode=mpi \
  "${FRONTIER_EXTRAS[@]}" \
  "data.in_dir=${test_data_dir}" \
  data.name=3iyf_128x128pixels_2m.h5 \
  "runtime.N_images_per_rank=${N_IMAGES_PER_RANK}" \
  "algorithm.N_images_max=${N_IMAGES_MAX}" \
  fsc.fsc_fraction_known_orientations=0
# Capture the status immediately, before any other command overwrites $?.
srun_status=$?

t_end=$(date +%s)
echo "PSJobCompleted TotalElapsed $((t_end - t_start)) ${t_start} ${t_end}"

# Propagate the srun result as the job's exit status.
exit "${srun_status}"
Note that the same job ran successfully with `srun -N2 -n16` (16 ranks); the failure only appears at 32 ranks. @Seemah @eslaught @jpblaschke