Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
James Willenbring
Trilinos
Commits
b61cd068
Commit
b61cd068
authored
Nov 22, 2017
by
Kyungjoo Kim
Browse files
Shylu/Tacho - give an option to select frontal matrix assembly mode
0 - use lock (better for knl), 1 - atomic (better for skylake)
parent
158ccee9
Changes
5
Hide whitespace changes
Inline
Side-by-side
packages/shylu/shylu_node/tacho/example/Tacho_ExamplePerfTest.cpp
View file @
b61cd068
...
...
@@ -31,11 +31,15 @@ int main (int argc, char *argv[]) {
int
max_num_superblocks
=
4
;
int
mb
=
-
1
,
nb
=
-
1
;
int
small_problem_thres
=
1024
;
int
front_update_mode
=
-
1
;
bool
test_tacho
=
true
;
bool
test_pardiso
=
false
;
bool
test_cholmod
=
false
;
bool
use_same_ordering
=
true
;
Tacho
::
Experimental
::
CommandLineParser
opts
(
"This is Tacho performance test comparing with Pardiso and Cholmod on OpenMP and Cuda spaces"
);
// threading environment
...
...
@@ -51,6 +55,7 @@ int main (int argc, char *argv[]) {
opts
.
set_option
<
int
>
(
"max-num-superblocks"
,
"Max number of superblocks"
,
&
max_num_superblocks
);
opts
.
set_option
<
int
>
(
"mb"
,
"Internal block size"
,
&
mb
);
opts
.
set_option
<
int
>
(
"nb"
,
"Internal panel size"
,
&
nb
);
opts
.
set_option
<
int
>
(
"front-update-mode"
,
"Front update mode"
,
&
front_update_mode
);
// testing flags
opts
.
set_option
<
bool
>
(
"test-tacho"
,
"Flag for testing Tacho"
,
&
test_tacho
);
...
...
@@ -61,6 +66,8 @@ int main (int argc, char *argv[]) {
opts
.
set_option
<
bool
>
(
"test-cholmod"
,
"Flag for testing Cholmod"
,
&
test_cholmod
);
#endif
opts
.
set_option
<
bool
>
(
"use-same-ordering"
,
"Same Metis ordering is used for all tests"
,
&
use_same_ordering
);
TACHO_ITT_PAUSE
;
const
bool
r_parse
=
opts
.
parse
(
argc
,
argv
);
...
...
@@ -77,13 +84,12 @@ int main (int argc, char *argv[]) {
{
/// basic typedef
typedef
int
ordinal_type
;
typedef
size_t
size_type
;
typedef
double
value_type
;
/// crs matrix format and dense multi vector
typedef
Tacho
::
Experimental
::
CrsMatrixBase
<
value_type
,
Kokkos
::
DefaultHostExecutionSpace
>
CrsMatrixBaseType
;
typedef
Kokkos
::
View
<
value_type
**
,
Kokkos
::
LayoutLeft
,
Kokkos
::
DefaultHostExecutionSpace
>
DenseMultiVectorType
;
typedef
Kokkos
::
View
<
ordinal_type
*
,
Kokkos
::
DefaultHostExecutionSpace
>
OrdinalTypeArray
;
//
typedef Kokkos::View<ordinal_type*,Kokkos::DefaultHostExecutionSpace> OrdinalTypeArray;
///
/// problem setting
...
...
@@ -106,7 +112,9 @@ int main (int argc, char *argv[]) {
x
(
"x"
,
A
.
NumRows
(),
nrhs
),
// solution multivector
t
(
"t"
,
A
.
NumRows
(),
nrhs
);
// temp workspace (store permuted rhs)
OrdinalTypeArray
perm
(
"perm"
,
A
.
NumRows
()),
peri
(
"peri"
,
A
.
NumRows
());
Tacho
::
Experimental
::
Graph
graph
(
A
.
NumRows
(),
A
.
NumNonZeros
(),
A
.
RowPtr
(),
A
.
Cols
());
Tacho
::
Experimental
::
GraphTools_Metis
G
(
graph
);
G
.
reorder
(
verbose
);
{
Tacho
::
Experimental
::
Random
<
value_type
>
random
;
...
...
@@ -133,13 +141,21 @@ int main (int argc, char *argv[]) {
/// tacho
///
Tacho
::
Solver
<
value_type
,
Kokkos
::
DefaultHostExecutionSpace
>
solver
;
//solver.setMatrixType(sym, posdef);
solver
.
setVerbose
(
verbose
);
solver
.
setMaxNumberOfSuperblocks
(
max_num_superblocks
);
solver
.
setSmallProblemThresholdsize
(
small_problem_thres
);
solver
.
setBlocksize
(
mb
);
solver
.
setPanelsize
(
nb
);
solver
.
setFrontUpdateMode
(
front_update_mode
);
/// inputs are used for graph reordering and analysis
solver
.
analyze
(
A
.
NumRows
(),
A
.
RowPtr
(),
A
.
Cols
());
A
.
Cols
(),
G
.
PermVector
(),
G
.
InvPermVector
());
/// symbolic structure can be reused
TACHO_ITT_RESUME
;
...
...
@@ -193,14 +209,14 @@ int main (int argc, char *argv[]) {
pardiso
.
showErrorCode
(
std
::
cout
)
<<
std
::
endl
;
}
pardiso
.
setParameter
(
2
,
2
);
// metis ordering is used
//pardiso.setParameter(5, 1); // user permutation is used for mkl permutation
//
pardiso.setParameter(5,
2
); //
mkl
permutation is
written on perm
if
(
use_same_ordering
)
pardiso
.
setParameter
(
5
,
1
);
//
user
permutation is
used for mkl permutation
// somehow pardiso does not like symmetric full matrix (store only half)
CrsMatrixBaseType
Asym
(
"Asym"
);
Asym
.
createConfTo
(
A
);
{
size_t
ype
nnz
=
0
;
size_t
nnz
=
0
;
for
(
ordinal_type
i
=
0
;
i
<
A
.
NumRows
();
++
i
)
{
Asym
.
RowPtrBegin
(
i
)
=
nnz
;
for
(
ordinal_type
idx
=
A
.
RowPtrBegin
(
i
);
idx
<
A
.
RowPtrEnd
(
i
);
++
idx
)
{
...
...
@@ -223,7 +239,7 @@ int main (int argc, char *argv[]) {
(
double
*
)
Asym
.
Values
().
data
(),
(
int
*
)
rowptr
.
data
(),
// (int*)Asym.RowPtr().data(),
(
int
*
)
Asym
.
Cols
().
data
(),
(
int
*
)
perm
.
data
(),
(
int
*
)
G
.
PermVector
()
.
data
(),
nrhs
,
(
double
*
)
b
.
data
(),
(
double
*
)
x
.
data
());
...
...
@@ -237,8 +253,12 @@ int main (int argc, char *argv[]) {
}
// compute inverse permutation
for
(
ordinal_type
i
=
0
;
i
<
A
.
NumRows
();
++
i
)
peri
(
perm
(
i
))
=
i
;
{
const
auto
peri
=
G
.
InvPermVector
();
const
auto
perm
=
G
.
PermVector
();
for
(
ordinal_type
i
=
0
;
i
<
A
.
NumRows
();
++
i
)
peri
(
perm
(
i
))
=
i
;
}
TACHO_ITT_RESUME
;
r_val
=
pardiso
.
run
(
Pardiso
::
Factorize
,
1
);
...
...
@@ -307,7 +327,7 @@ int main (int argc, char *argv[]) {
cholmod_common
c
;
cholmod_start
(
&
c
);
/* start CHOLMOD */
c
.
nmethods
=
1
;
c
.
method
[
0
].
ordering
=
CHOLMOD_METIS
;
c
.
method
[
0
].
ordering
=
use_same_ordering
?
CHOLMOD_GIVEN
:
CHOLMOD_METIS
;
c
.
postorder
=
1
;
AA
=
cholmod_allocate_sparse
(
A
.
NumRows
(),
A
.
NumRows
(),
A
.
NumNonZeros
(),
...
...
@@ -317,9 +337,9 @@ int main (int argc, char *argv[]) {
for
(
int
i
=
0
;
i
<
(
A
.
NumRows
()
+
1
);
++
i
)
pp
[
i
]
=
A
.
RowPtrBegin
(
i
);
int
*
ii
=
(
int
*
)
AA
->
i
;
int
*
ii
=
(
int
*
)
AA
->
i
,
jend
=
A
.
NumNonZeros
()
;
double
*
aa
=
(
double
*
)
AA
->
x
;
for
(
int
j
=
0
;
j
<
A
.
NumNonZeros
()
;
++
j
)
{
for
(
int
j
=
0
;
j
<
jend
;
++
j
)
{
ii
[
j
]
=
A
.
Col
(
j
);
aa
[
j
]
=
A
.
Value
(
j
);
}
...
...
@@ -340,22 +360,12 @@ int main (int argc, char *argv[]) {
c
.
memory_usage
=
0
;
timer
.
reset
();
LL
=
cholmod_analyze
(
AA
,
&
c
)
;
/* analyze */
if
(
use_same_ordering
)
LL
=
cholmod_analyze_p
(
AA
,
G
.
PermVector
().
data
(),
NULL
,
0
,
&
c
)
;
/* analyze */
else
LL
=
cholmod_analyze
(
AA
,
&
c
)
;
/* analyze */
t_analyze
=
timer
.
seconds
();
TACHO_ITT_RESUME
;
timer
.
reset
();
cholmod_factorize
(
AA
,
LL
,
&
c
)
;
/* factorize */
t_factor
=
timer
.
seconds
();
TACHO_ITT_PAUSE
;
timer
.
reset
();
for
(
int
iter
=
0
;
iter
<
niter_solve
;
++
iter
)
{
xx
=
cholmod_solve
(
CHOLMOD_A
,
LL
,
bb
,
&
c
)
;
/* solve Ax=b */
}
t_solve_niter
=
timer
.
seconds
();
t_solve
=
t_solve_niter
/
double
(
niter_solve
);
if
(
verbose
)
{
printf
(
"CHOLMOD: Analyze
\n
"
);
printf
(
"================
\n
"
);
...
...
@@ -363,13 +373,22 @@ int main (int argc, char *argv[]) {
printf
(
" total time spent: %10.6f s
\n
"
,
t_analyze
);
printf
(
"
\n
"
);
printf
(
" Linear system A
\n
"
);
printf
(
" number of equations: %10
d
\n
"
,
AA
->
nrow
);
printf
(
" max number of nonzeros: %10
d
\n
"
,
AA
->
nzmax
);
printf
(
" number of equations: %10
zu
\n
"
,
AA
->
nrow
);
printf
(
" max number of nonzeros: %10
zu
\n
"
,
AA
->
nzmax
);
printf
(
"
\n
"
);
printf
(
" Factors L
\n
"
);
printf
(
" number of nonzeros: %10.0f
\n
"
,
c
.
lnz
);
//printf(" current method: %10d\n", c.current);
printf
(
"
\n
"
);
}
TACHO_ITT_RESUME
;
timer
.
reset
();
cholmod_factorize
(
AA
,
LL
,
&
c
)
;
/* factorize */
t_factor
=
timer
.
seconds
();
TACHO_ITT_PAUSE
;
if
(
verbose
)
{
printf
(
"CHOLMOD: Factorize
\n
"
);
printf
(
"==================
\n
"
);
printf
(
" Time
\n
"
);
...
...
@@ -392,7 +411,17 @@ int main (int argc, char *argv[]) {
printf
(
" FLOPs
\n
"
);
printf
(
" gflop for numeric factorization: %10.6f GFLOP
\n
"
,
c
.
fl
/
1024
/
1024
/
1024
);
printf
(
" gflop/s for numeric factorization: %10.6f GFLOP/s
\n
"
,
c
.
fl
/
1024
/
1024
/
1024
/
t_factor
);
printf
(
"
\n
"
);
printf
(
"
\n
"
);
}
timer
.
reset
();
for
(
int
iter
=
0
;
iter
<
niter_solve
;
++
iter
)
{
xx
=
cholmod_solve
(
CHOLMOD_A
,
LL
,
bb
,
&
c
)
;
/* solve Ax=b */
}
t_solve_niter
=
timer
.
seconds
();
t_solve
=
t_solve_niter
/
double
(
niter_solve
);
if
(
verbose
)
{
printf
(
"CHOLMOD: Solve
\n
"
);
printf
(
"==============
\n
"
);
printf
(
" Time (Multiple Solves)
\n
"
);
...
...
packages/shylu/shylu_node/tacho/src/TachoExp_CholSupernodes_SerialPanel.hpp
View file @
b61cd068
...
...
@@ -62,8 +62,7 @@ namespace Tacho {
const
ordinal_type
np
,
// ATR and ABR panel width
const
ordinal_type
sid
,
const
size_type
bufsize
,
// ABR size + additional
/* */
void
*
buf
,
const
bool
use_atomic
=
true
)
{
/* */
void
*
buf
)
{
typedef
SupernodeInfoType
supernode_info_type
;
typedef
typename
supernode_info_type
::
value_type
value_type
;
...
...
@@ -116,26 +115,43 @@ namespace Tacho {
if
(
srcsize
==
tgtsize
)
{
/* */
value_type
*
tgt
=
s
.
buf
;
const
value_type
*
src
=
(
value_type
*
)
ABR
.
data
();
// lock
while
(
Kokkos
::
atomic_compare_exchange
(
&
s
.
lock
,
0
,
1
))
KOKKOS_IMPL_PAUSE
;
Kokkos
::
store_fence
();
for
(
ordinal_type
js
=
0
;
js
<
nb
;
++
js
)
{
const
ordinal_type
jt
=
js
+
offn
;
const
value_type
*
__restrict__
ss
=
src
+
js
*
srcsize
;
/* */
value_type
*
__restrict__
tt
=
tgt
+
jt
*
srcsize
;
switch
(
info
.
front_update_mode
)
{
case
1
:
{
for
(
ordinal_type
js
=
0
;
js
<
nb
;
++
js
)
{
const
ordinal_type
jt
=
js
+
offn
;
const
value_type
*
__restrict__
ss
=
src
+
js
*
srcsize
;
/* */
value_type
*
__restrict__
tt
=
tgt
+
jt
*
srcsize
;
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#pragma unroll
#endif
for
(
ordinal_type
i
=
0
;
i
<=
jt
;
++
i
)
tt
[
i
]
+=
ss
[
i
];
for
(
ordinal_type
i
=
0
;
i
<=
jt
;
++
i
)
Kokkos
::
atomic_fetch_add
(
&
tt
[
i
],
ss
[
i
]);
}
break
;
}
case
0
:
{
// lock
while
(
Kokkos
::
atomic_compare_exchange
(
&
s
.
lock
,
0
,
1
))
KOKKOS_IMPL_PAUSE
;
Kokkos
::
store_fence
();
for
(
ordinal_type
js
=
0
;
js
<
nb
;
++
js
)
{
const
ordinal_type
jt
=
js
+
offn
;
const
value_type
*
__restrict__
ss
=
src
+
js
*
srcsize
;
/* */
value_type
*
__restrict__
tt
=
tgt
+
jt
*
srcsize
;
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#pragma unroll
#endif
for
(
ordinal_type
i
=
0
;
i
<=
jt
;
++
i
)
tt
[
i
]
+=
ss
[
i
];
}
// unlock
s
.
lock
=
0
;
Kokkos
::
load_fence
();
break
;
}
}
// unlock
s
.
lock
=
0
;
Kokkos
::
load_fence
();
return
0
;
}
}
...
...
@@ -171,7 +187,8 @@ namespace Tacho {
ordinal_type
ijbeg
=
0
;
for
(;
s2t
[
ijbeg
]
==
-
1
;
++
ijbeg
)
;
// lock
if
(
use_atomic
)
{
switch
(
info
.
front_update_mode
)
{
case
1
:
{
for
(
ordinal_type
jj
=
max
(
ijbeg
,
offn
);
jj
<
nn
;
++
jj
)
{
const
ordinal_type
js
=
jj
-
offn
;
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
...
...
@@ -183,7 +200,9 @@ namespace Tacho {
else
break
;
}
}
}
else
{
break
;
}
case
0
:
{
while
(
Kokkos
::
atomic_compare_exchange
(
&
s
.
lock
,
0
,
1
))
KOKKOS_IMPL_PAUSE
;
Kokkos
::
store_fence
();
...
...
@@ -201,6 +220,8 @@ namespace Tacho {
// unlock
s
.
lock
=
0
;
Kokkos
::
load_fence
();
break
;
}
}
}
}
...
...
packages/shylu/shylu_node/tacho/src/TachoExp_NumericTools.hpp
View file @
b61cd068
...
...
@@ -351,6 +351,12 @@ namespace Tacho {
}
}
inline
void
setFrontUpdateMode
(
const
ordinal_type
front_update_mode
)
{
_info
.
front_update_mode
=
front_update_mode
;
}
inline
void
setSerialThresholdSize
(
const
ordinal_type
serial_thres_size
)
{
...
...
packages/shylu/shylu_node/tacho/src/TachoExp_SupernodeInfo.hpp
View file @
b61cd068
...
...
@@ -73,7 +73,7 @@ namespace Tacho {
typedef
Kokkos
::
View
<
supernode_type
*
,
exec_space
>
supernode_type_array
;
///
///
Phase 1:
symbolic
///
info for
symbolic
///
ConstUnmanagedViewType
<
supernode_type_array
>
supernodes
;
...
...
@@ -86,12 +86,17 @@ namespace Tacho {
ConstUnmanagedViewType
<
ordinal_pair_type_array
>
sid_block_colidx
;
///
///
Phase 2:
max parameter
/// max parameter
///
ordinal_type
max_supernode_size
,
max_schur_size
,
serial_thres_size
;
///
/// Phase 3: solve (rhs multivector)
/// frontal matrix subassembly mode
///
ordinal_type
front_update_mode
;
// 0 - lock, 1 - atomic
///
/// info for solve (rhs multivector)
UnmanagedViewType
<
value_type_matrix
>
x
;
KOKKOS_INLINE_FUNCTION
...
...
@@ -132,6 +137,9 @@ namespace Tacho {
max_supernode_size
=
0
;
serial_thres_size
=
0
;
// by default, update mode is atomic: 0 - mutex lock, 1 - atomic
front_update_mode
=
1
;
size_type
nnz
=
0
;
for
(
ordinal_type
sid
=
0
;
sid
<
nsupernodes
;
++
sid
)
{
auto
&
s
=
supernodes_
(
sid
);
...
...
packages/shylu/shylu_node/tacho/src/Tacho_Solver.hpp
View file @
b61cd068
...
...
@@ -77,6 +77,7 @@ namespace Tacho {
ordinal_type
_serial_thres_size
;
// serialization threshold size
ordinal_type
_mb
;
// block size for byblocks algorithms
ordinal_type
_nb
;
// panel size for panel algorithms
ordinal_type
_front_update_mode
;
// front update mode 0 - lock, 1 - atomic
// parallelism and memory constraint is made via this parameter
ordinal_type
_max_num_superblocks
;
// # of superblocks in the memoyrpool
...
...
@@ -90,7 +91,8 @@ namespace Tacho {
_small_problem_thres
(
1024
),
_serial_thres_size
(
-
1
),
_mb
(
-
1
),
_nb
(
-
1
)
{}
_nb
(
-
1
),
_front_update_mode
(
-
1
)
{}
Solver
(
const
Solver
&
b
)
=
default
;
...
...
@@ -109,6 +111,9 @@ namespace Tacho {
void
setPanelsize
(
const
ordinal_type
nb
=
-
1
)
{
_nb
=
nb
;
}
void
setFrontUpdateMode
(
const
ordinal_type
front_update_mode
=
1
)
{
_front_update_mode
=
front_update_mode
;
}
void
setMaxNumberOfSuperblocks
(
const
ordinal_type
max_num_superblocks
=
-
1
)
{
_max_num_superblocks
=
max_num_superblocks
;
}
...
...
@@ -254,6 +259,11 @@ namespace Tacho {
}
_N
.
setMaxNumberOfSuperblocks
(
_max_num_superblocks
);
if
(
_front_update_mode
<
0
)
{
// set default values
_front_update_mode
=
1
;
// atomic is default
}
_N
.
setFrontUpdateMode
(
_front_update_mode
);
const
ordinal_type
nthreads
=
device_exec_space
::
thread_pool_size
(
0
);
if
(
nthreads
==
1
)
{
if
(
_nb
<
0
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment