Commit 5e0a04bc authored by Carter Edwards

Kokkos Examples: Add commentary in preparation for tutorial walk-through.

parent eb71601c
@@ -52,11 +52,15 @@
namespace Kokkos {
namespace Example {
/** \brief Numerically integrate a function on a finite element mesh and
* project the integrated values to nodes.
*/
template< class FixtureType ,
class FunctionType ,
bool PerformScatterAddWithAtomic >
struct FiniteElementIntegration ;
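// Note: the primary template is declared but never defined, so
// instantiating this functor with an unsupported mesh type is a
// compile-time error rather than a silent fallback.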
// Specialized for an 'Example::BoxElemFixture' finite element mesh
template< class Device , BoxElemPart::ElemOrder ElemOrder , class GridMap ,
class FunctionType ,
bool PerformScatterAddWithAtomic >
@@ -65,23 +69,29 @@ struct FiniteElementIntegration<
FunctionType ,
PerformScatterAddWithAtomic >
{
typedef Kokkos::Example::BoxElemFixture< Device , ElemOrder > BoxFixtureType ;
typedef Kokkos::Example::HexElement_Data< BoxFixtureType::ElemNode > HexElemDataType ;
// Element mesh types:
typedef Kokkos::Example::BoxElemFixture< Device , ElemOrder >
BoxFixtureType ;
typedef Kokkos::Example::HexElement_Data< BoxFixtureType::ElemNode >
HexElemDataType ;
enum { ElemNodeCount = HexElemDataType::element_node_count };
enum { IntegrationCount = HexElemDataType::integration_count };
enum { ValueCount = FunctionType::value_count };
// Dictionary of view types:
typedef View<int*, Device> ElemErrorType ;
typedef View<double*[ElemNodeCount][ValueCount],Device> ElemValueType ;
typedef View<double*[ValueCount], Device> NodeValueType ;
const HexElemDataType m_hex_elem_data ;
const BoxFixtureType m_box_fixture ;
const FunctionType m_function ;
const ElemErrorType m_elem_error ;
const ElemValueType m_elem_integral ;
const NodeValueType m_node_lumped ;
// Data members for this Functor:
const HexElemDataType m_hex_elem_data ; ///< Master element
const BoxFixtureType m_box_fixture ; ///< Unstructured mesh data
const FunctionType m_function ; ///< Function to integrate
const ElemErrorType m_elem_error ; ///< Flags for element errors
const ElemValueType m_elem_integral ; ///< Per-element quantities
const NodeValueType m_node_lumped ; ///< Quantities lumped to nodes
//----------------------------------------
@@ -89,7 +99,7 @@ struct FiniteElementIntegration<
const BoxFixtureType & box_fixture ,
const FunctionType & function )
: m_hex_elem_data()
, m_box_fixture( box_fixture )
, m_box_fixture( box_fixture ) // Shallow copy of the mesh fixture
, m_function( function )
, m_elem_error( "elem_error" , box_fixture.elem_count() )
, m_elem_integral( "elem_integral" , box_fixture.elem_count() )
@@ -100,8 +110,11 @@ struct FiniteElementIntegration<
// Device for parallel dispatch.
typedef Device device_type ;
// Value type for parallel reduction.
struct value_type { double value[ ValueCount ]; int error ; };
// Value type for global parallel reduction.
struct value_type {
double value[ ValueCount ]; ///< Integrated quantities
int error ; ///< Element inversion flag
};
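// Carrying the integrated values and the error flag in one value_type
// lets a single reduction pass produce both the global integral and a
// global error status. A minimal sketch of the dispatch, with
// illustrative names (the real call appears in feint() below):
//
//   value_type result ;
//   Kokkos::parallel_reduce( element_count , functor , result );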
//----------------------------------------
// Transform element interpolation function gradients and
@@ -117,7 +130,7 @@ struct FiniteElementIntegration<
j21 = 3 , j22 = 4 , j23 = 5 ,
j31 = 6 , j32 = 7 , j33 = 8 };
// Temporary for jacobian accumulation in double.
// Temporary for jacobian accumulation uses double precision for summation accuracy.
double J[ TensorDim ] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
for( int i = 0; i < ElemNodeCount ; ++i ) {
@@ -134,7 +147,7 @@ struct FiniteElementIntegration<
J[j33] += grad[2][i] * coord[2][i] ;
}
// Inverse jacobian, compute double and store float.
// Inverse jacobian, compute as double and store as float.
float invJ[ TensorDim ] = {
float( J[j22] * J[j33] - J[j23] * J[j32] ) ,
float( J[j13] * J[j32] - J[j12] * J[j33] ) ,
@@ -159,15 +172,23 @@ struct FiniteElementIntegration<
// Transform gradients:
for ( int i = 0; i < ElemNodeCount ; ++i ) {
dpsi[0][i] = grad[0][i] * invJ[j11] + grad[1][i] * invJ[j12] + grad[2][i] * invJ[j13];
dpsi[1][i] = grad[0][i] * invJ[j21] + grad[1][i] * invJ[j22] + grad[2][i] * invJ[j23];
dpsi[2][i] = grad[0][i] * invJ[j31] + grad[1][i] * invJ[j32] + grad[2][i] * invJ[j33];
dpsi[0][i] = grad[0][i] * invJ[j11] +
grad[1][i] * invJ[j12] +
grad[2][i] * invJ[j13];
dpsi[1][i] = grad[0][i] * invJ[j21] +
grad[1][i] * invJ[j22] +
grad[2][i] * invJ[j23];
dpsi[2][i] = grad[0][i] * invJ[j31] +
grad[1][i] * invJ[j32] +
grad[2][i] * invJ[j33];
}
return detJ ;
}
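// Math note, reconstructed from the loops above: the accumulation builds
//   J(a,b) = sum_i grad(a,i) * coord(b,i) ,
// invJ[] is assembled from the cofactors of J, i.e. the adjugate adj(J),
// and by Cramer's rule  J^-1 = adj(J) / det(J) .  The chain rule then
// yields the physical-space gradients:
//   dpsi(a,i) = sum_b J^-1(a,b) * grad(b,i) .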
// The functor's operator() is called for each element in the mesh
// to numerically integrate the function and add the element's
// quantities to the global integral.
KOKKOS_INLINE_FUNCTION
void operator()( const int ielem , value_type & update ) const
{
@@ -176,28 +197,27 @@ struct FiniteElementIntegration<
int inode[ ElemNodeCount ] ;
// Gather indices of element's node from global variable to local temporary.
// Gather indices of the element's nodes from global memory to local memory.
for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
inode[i] = m_box_fixture.elem_node( ielem , i );
}
// Gather coordinates of element's nodes from global variable to local temporary.
// Gather coordinates of element's nodes from global memory to local memory.
for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
node_coord[0][i] = m_box_fixture.node_coord( inode[i] , 0 );
node_coord[1][i] = m_box_fixture.node_coord( inode[i] , 1 );
node_coord[2][i] = m_box_fixture.node_coord( inode[i] , 2 );
}
int elem_error = 0 ;
m_elem_error(ielem) = 0 ;
// Local temporary to accumulate numerical cubature of vector valued function.
// Local temporary to accumulate the numerical integration
// of the vector-valued function.
double accum[ ValueCount ];
for ( int j = 0 ; j < ValueCount ; ++j ) { accum[j] = 0 ; }
// Cubature loop for this element:
int error = 0 ;
// Numerical integration loop for this element:
for ( int k = 0 ; k < IntegrationCount ; ++k ) {
// Integration point in space as interpolated from nodal coordinates:
@@ -219,55 +239,58 @@ struct FiniteElementIntegration<
// Compute deformation jacobian, transform basis function gradient,
// and return determinant of deformation jacobian.
const float detJ = transform_gradients( m_hex_elem_data.gradients[k] , node_coord , dpsi );
float detJ = transform_gradients( m_hex_elem_data.gradients[k] ,
node_coord , dpsi );
if ( detJ <= 0 ) { elem_error = 1 ; }
// Check for inverted spatial jacobian
if ( detJ <= 0 ) { error = 1 ; detJ = 0 ; }
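// A non-positive determinant means the element is inverted or degenerate
// at this integration point; zeroing detJ drops the point's contribution
// while the error flag is carried into the global reduction.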
// Integration weight.
const float w = m_hex_elem_data.weights[k] * detJ ;
// Cubature of function.
for ( int j = 0 ; j < ValueCount ; ++j ) { accum[j] += val_at_pt[j] * w ; }
for ( int j = 0 ; j < ValueCount ; ++j ) {
accum[j] += val_at_pt[j] * w ;
}
}
if ( elem_error ) {
m_elem_error(ielem) = 1 ;
update.error = 1 ;
}
else {
// Element contribution to global integral:
m_elem_error(ielem) = error ;
// Element contribution to global integral:
if ( error ) { update.error = 1 ; }
for ( int j = 0 ; j < ValueCount ; ++j ) { update.value[j] += accum[j] ; }
// Element-node quantity for lumping to nodes:
for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
for ( int j = 0 ; j < ValueCount ; ++j ) {
update.value[j] += accum[j] ;
// Save the element's integral apportionment to nodes in global memory
m_elem_integral( ielem , i , j ) = accum[j] / ElemNodeCount ;
}
}
if ( PerformScatterAddWithAtomic ) {
// Option to immediately scatter-add the integrated quantities to nodes.
// This is a potential race condition: two or more threads could
// attempt concurrent updates of the same nodal value. The
// atomic_fetch_add (+=) function guarantees that each summation
// occurs correctly; however, there is no guarantee for the order
// of summation. Due to the non-associativity of floating point
// arithmetic the result is non-deterministic within the bounds
// of floating point round-off.
// Element-node quantity for lumping to nodes:
for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
for ( int j = 0 ; j < ValueCount ; ++j ) {
// Save the element's integral apportionment to nodes in global memory
m_elem_integral( ielem , i , j ) = accum[j] / ElemNodeCount ;
}
}
if ( PerformScatterAddWithAtomic ) {
// Option to immediately scatter-add the integrated quantities to nodes.
// This is a potential race condition: two or more threads could
// attempt concurrent updates of the same nodal value. The
// atomic_fetch_add (+=) function guarantees that each summation
// occurs correctly; however, there is no guarantee for the order
// of summation. Due to the non-associativity of floating point
// arithmetic the result is non-deterministic within the bounds
// of floating point round-off.
for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
for ( int j = 0 ; j < ValueCount ; ++j ) {
Kokkos::atomic_fetch_add( & m_node_lumped( inode[i] , j ) , m_elem_integral( ielem , i , j ) );
}
Kokkos::atomic_fetch_add( & m_node_lumped( inode[i] , j ) ,
m_elem_integral( ielem , i , j ) );
}
}
}
}
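// For reference: Kokkos::atomic_fetch_add( &x , v ) atomically performs
// { old = x ; x = old + v ; } and returns old, so concurrent
// contributions are never lost, only applied in an unspecified order.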
//--------------------------------------------------------------------------
// Initialization of the global reduction value.
KOKKOS_INLINE_FUNCTION
void init( value_type & update ) const
{
@@ -275,6 +298,7 @@ struct FiniteElementIntegration<
update.error = 0 ;
}
// Join two contributions to the global reduction value.
KOKKOS_INLINE_FUNCTION
void join( volatile value_type & update ,
volatile const value_type & input ) const
@@ -298,70 +322,42 @@ template< class ViewElemNode ,
class ViewNodeElem >
void map_node_to_elem( const ViewElemNode & elem_node ,
const ViewNodeScan & node_scan ,
const ViewNodeElem & node_elem )
{
const typename ViewElemNode::HostMirror host_elem_node = Kokkos::create_mirror_view( elem_node );
const typename ViewNodeScan::HostMirror host_node_scan = Kokkos::create_mirror_view( node_scan );
const typename ViewNodeElem::HostMirror host_node_elem = Kokkos::create_mirror_view( node_elem );
const int elem_count = host_elem_node.dimension_0();
const int elem_node_count = host_elem_node.dimension_1();
const int node_count = host_node_scan.dimension_0() - 1 ;
const View<int*,Kokkos::Serial> node_elem_count( "node_elem_count" , node_count );
Kokkos::deep_copy( host_elem_node , elem_node );
for ( int i = 0 ; i < elem_count ; ++i ) {
for ( int j = 0 ; j < elem_node_count ; ++j ) {
++node_elem_count( host_elem_node(i,j) );
}
}
for ( int i = 0 ; i < node_count ; ++i ) {
host_node_scan(i+1) += host_node_scan(i) + node_elem_count(i);
node_elem_count(i) = 0 ;
}
for ( int i = 0 ; i < elem_count ; ++i ) {
for ( int j = 0 ; j < elem_node_count ; ++j ) {
const int inode = host_elem_node(i,j);
const int offset = host_node_scan(inode) + node_elem_count(inode);
host_node_elem( offset , 0 ) = i ;
host_node_elem( offset , 1 ) = j ;
++node_elem_count(inode);
}
}
Kokkos::deep_copy( node_scan , host_node_scan );
Kokkos::deep_copy( node_elem , host_node_elem );
}
const ViewNodeElem & node_elem );
/** \brief Functor to gather-sum the elements' per-node quantities
* at each node. The gather-sum is thread safe and
* does not require atomic updates.
*/
template< class ViewNodeValue ,
class ViewElemValue ,
bool AlreadyUsedAtomic >
struct LumpElemToNode {
typedef typename ViewElemValue::device_type device_type ;
typedef typename ViewElemValue::shape_type elem_value_shape_type ;
enum { value_count = elem_value_shape_type::N2 };
// In this example we know that the ViewElemValue
// array specification is < double*[nNode][nValue] >
ViewNodeValue m_node_value ;
ViewElemValue m_elem_value ;
View<int*, device_type> m_node_scan ;
View<int*[2],device_type> m_node_elem ;
enum { value_count = ViewElemValue::shape_type::N2 };
ViewNodeValue m_node_value ; ///< Integrated values at nodes
ViewElemValue m_elem_value ; ///< Values apportioned to nodes
View<int*, device_type> m_node_scan ; ///< Offsets for nodes->element
View<int*[2],device_type> m_node_elem ; ///< Node->element connectivity
// Only allocate node->element connectivity if we have
// not already used atomic updates for the nodes.
template< class ViewElemNode >
LumpElemToNode( const ViewNodeValue & node_value ,
const ViewElemValue & elem_value ,
const ViewElemNode & elem_node )
: m_node_value( node_value )
, m_elem_value( elem_value )
, m_node_scan( "node_scan" , AlreadyUsedAtomic ? 0 : node_value.dimension_0() + 1 )
, m_node_elem( "node_elem" , AlreadyUsedAtomic ? 0 : elem_node.dimension_0() * elem_node.dimension_1() )
, m_node_scan( "node_scan" ,
AlreadyUsedAtomic ? 0 : node_value.dimension_0() + 1 )
, m_node_elem( "node_elem" ,
AlreadyUsedAtomic ? 0 : elem_node.dimension_0() *
elem_node.dimension_1() )
{
if ( ! AlreadyUsedAtomic ) {
map_node_to_elem( elem_node , m_node_scan , m_node_elem );
@@ -376,26 +372,36 @@ struct LumpElemToNode {
void operator()( const int inode , value_type & update ) const
{
if ( ! AlreadyUsedAtomic ) {
int i = m_node_scan(inode);
const int end = m_node_scan(inode+1);
// Sum element quantities to a local variable.
value_type local ;
for ( int j = 0 ; j < value_count ; ++j ) { local.value[j] = 0 ; }
for ( ; i < end ; ++i ) {
const int ielem = m_node_elem(i,0);
const int ielem_node = m_node_elem(i,1);
for ( int j = 0 ; j < value_count ; ++j ) {
local.value[j] += m_elem_value( ielem , ielem_node , j );
{
// the node's element ids span [i,end)
int i = m_node_scan(inode);
const int end = m_node_scan(inode+1);
for ( ; i < end ; ++i ) {
// element #ielem , local node #ielem_node is this node:
const int ielem = m_node_elem(i,0);
const int ielem_node = m_node_elem(i,1);
// Sum the vector-valued quantity
for ( int j = 0 ; j < value_count ; ++j ) {
local.value[j] += m_elem_value( ielem , ielem_node , j );
}
}
}
// Assign nodal quantity (no race condition).
// Sum global value.
for ( int j = 0 ; j < value_count ; ++j ) {
m_node_value( inode , j ) += local.value[j] ;
m_node_value( inode , j ) = local.value[j] ;
update.value[j] += local.value[j] ;
}
}
else {
// Already used atomic update of the nodal quantity,
// query and sum the value.
for ( int j = 0 ; j < value_count ; ++j ) {
update.value[j] += m_node_value( inode , j );
}
@@ -409,11 +415,68 @@ struct LumpElemToNode {
KOKKOS_INLINE_FUNCTION
void join( volatile value_type & update ,
volatile const value_type & input ) const
{ for ( int j = 0 ; j < value_count ; ++j ) { update.value[j] += input.value[j] ; } }
//----------------------------------------
{
for ( int j = 0 ; j < value_count ; ++j ) {
update.value[j] += input.value[j] ;
}
}
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template< class ViewElemNode ,
class ViewNodeScan ,
class ViewNodeElem >
void map_node_to_elem( const ViewElemNode & elem_node ,
const ViewNodeScan & node_scan ,
const ViewNodeElem & node_elem )
{
const typename ViewElemNode::HostMirror host_elem_node =
Kokkos::create_mirror_view(elem_node);
const typename ViewNodeScan::HostMirror host_node_scan =
Kokkos::create_mirror_view(node_scan);
const typename ViewNodeElem::HostMirror host_node_elem =
Kokkos::create_mirror_view(node_elem);
const int elem_count = host_elem_node.dimension_0();
const int elem_node_count = host_elem_node.dimension_1();
const int node_count = host_node_scan.dimension_0() - 1 ;
const View<int*,Kokkos::Serial>
node_elem_count( "node_elem_count" , node_count );
Kokkos::deep_copy( host_elem_node , elem_node );
for ( int i = 0 ; i < elem_count ; ++i ) {
for ( int j = 0 ; j < elem_node_count ; ++j ) {
++node_elem_count( host_elem_node(i,j) );
}
}
for ( int i = 0 ; i < node_count ; ++i ) {
host_node_scan(i+1) += host_node_scan(i) + node_elem_count(i);
node_elem_count(i) = 0 ;
}
for ( int i = 0 ; i < elem_count ; ++i ) {
for ( int j = 0 ; j < elem_node_count ; ++j ) {
const int inode = host_elem_node(i,j);
const int offset = host_node_scan(inode) + node_elem_count(inode);
host_node_elem( offset , 0 ) = i ;
host_node_elem( offset , 1 ) = j ;
++node_elem_count(inode);
}
}
Kokkos::deep_copy( node_scan , host_node_scan );
Kokkos::deep_copy( node_elem , host_node_elem );
}
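// A worked example on a hypothetical mesh of two 4-node elements that
// share nodes 2 and 3 (elem 0 = {0,1,2,3}, elem 1 = {2,3,4,5}):
//   node_elem_count = { 1 , 1 , 2 , 2 , 1 , 1 }
//   node_scan       = { 0 , 1 , 2 , 4 , 6 , 7 , 8 }
// so node 2's entries occupy [2,4): (elem 0, local node 2) and
// (elem 1, local node 0).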
} /* namespace Example */
} /* namespace Kokkos */
......
@@ -52,13 +52,17 @@
namespace Kokkos {
namespace Example {
/** \brief Vector valued function to numerically integrate.
*
* F(X) = { 1 , x , y , z , x*y , y*z , z*x , x*y*z }
*
* Integrates on a unit cube to:
* { 1 , 1/2 , 1/2 , 1/2 , 1/4 , 1/4 , 1/4 , 1/8 }
*/
struct MyFunctionType {
enum { value_count = 8 };
// Integral on a unit cube:
// { 1 , 1/2 , 1/2 , 1/2 , 1/4 , 1/4 , 1/4 , 1/8 }
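// Spot check: over the unit cube the coordinates integrate
// independently, e.g. integral of x*y = (1/2)*(1/2)*1 = 1/4 and
// integral of x*y*z = (1/2)^3 = 1/8 , matching the values above.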
// Evaluate function at coordinate.
template< typename CoordType , typename ValueType >
KOKKOS_INLINE_FUNCTION
@@ -81,32 +85,47 @@ void feint(
const unsigned global_elem_ny ,
const unsigned global_elem_nz )
{
const unsigned mpi_rank = 0 ;
const unsigned mpi_size = 1 ;
//----------------------------------------
// Create the unstructured finite element mesh box fixture on the device:
static const Kokkos::Example::BoxElemPart::Decompose
decompose = Kokkos::Example::BoxElemPart:: DecomposeElem ; // DecomposeElem | DecomposeNode ;
typedef Kokkos::Example::
BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear >
// BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemQuadratic >
BoxFixtureType ;
// typedef Kokkos::Example::BoxElemFixture< Device , Kokkos::Example::BoxElemPart:: ElemLinear > BoxFixtureType ;
typedef Kokkos::Example::BoxElemFixture< Device , Kokkos::Example::BoxElemPart:: ElemQuadratic > BoxFixtureType ;
// MPI distributed parallel domain decomposition of the fixture.
// Either by element (DecomposeElem) or by node (DecomposeNode)
// with ghosted elements.
typedef Kokkos::Example::FiniteElementIntegration< BoxFixtureType , MyFunctionType , UseAtomic > FeintType ;
static const Kokkos::Example::BoxElemPart::Decompose
decompose = Kokkos::Example::BoxElemPart:: DecomposeElem ;
// decompose = Kokkos::Example::BoxElemPart:: DecomposeNode ;
typedef Kokkos::Example::LumpElemToNode< typename FeintType::NodeValueType ,
typename FeintType::ElemValueType ,
UseAtomic > LumpType ;
// Not using MPI in this example.
const unsigned mpi_rank = 0 ;
const unsigned mpi_size = 1 ;
const BoxFixtureType fixture( decompose , mpi_size , mpi_rank ,
global_elem_nx , global_elem_ny , global_elem_nz );
global_elem_nx ,
global_elem_ny ,
global_elem_nz );
//----------------------------------------
// Create and execute the numerical integration functor on the device:
typedef Kokkos::Example::
FiniteElementIntegration< BoxFixtureType , MyFunctionType , UseAtomic >
FeintType ;
const FeintType feint( fixture , MyFunctionType() );
typename FeintType::value_type elem_integral ;
// A reduction for the global integral:
Kokkos::parallel_reduce( fixture.elem_count() , feint , elem_integral );
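// parallel_reduce invokes feint( ielem , update ) in parallel for each
// element in [ 0 , fixture.elem_count() ), seeding each thread's
// partial result with init() and combining partials with join();
// the joined global result is written to 'elem_integral'.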
if ( elem_integral.error ) {
std::cout << "Spatial jacobian error" << std::endl ;
std::cout << "An element had a spatial jacobian error" << std::endl ;
return ;
}
@@ -116,6 +135,14 @@ void feint(
}
std::cout << std::endl ;
//----------------------------------------
// Create and execute the nodal lumped value projection and reduction functor:
typedef Kokkos::Example::
LumpElemToNode< typename FeintType::NodeValueType ,
typename FeintType::ElemValueType ,
UseAtomic > LumpType ;
const LumpType lump( feint.m_node_lumped ,
feint.m_elem_integral ,
fixture.elem_node() );
......
@@ -9,27 +9,44 @@
int main()
{
enum { UseAtomic = false };
#if defined( KOKKOS_HAVE_PTHREAD )
{
std::cout << "feint< Threads >" << std::endl ;
std::pair<unsigned,unsigned> use_cores =
Kokkos::hwloc::get_core_topology();
// Use 4 cores per NUMA region, unless fewer are available.
use_cores.second = std::min( 4u , use_cores.second );
// One team of one thread.
std::pair<unsigned,unsigned> team_topology( 1 , 1 );
// Use 2 cores per team and 1 thread/core:
std::pair<unsigned,unsigned>
team_topology( use_cores.first * use_cores.second / 2 , 2 );
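// For example, on a hypothetical machine with 2 NUMA regions x 4 cores,
// use_cores = (2,4) and team_topology = (4,2): four teams of two
// threads each, one thread per core.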
Kokkos::Threads::initialize( team_topology );
Kokkos::Example::feint< Kokkos::Threads , UseAtomic >();
std::cout << "feint< Threads , NotUsingAtomic >" << std::endl ;
Kokkos::Example::feint< Kokkos::Threads , false >();
std::cout << "feint< Threads , Usingtomic >" << std::endl ;
Kokkos::Example::feint< Kokkos::Threads , true >();
Kokkos::Threads::finalize();
}
#endif
#if defined( KOKKOS_HAVE_CUDA )
{
std::cout << "feint< Cuda >" << std::endl ;
const unsigned device_count = Kokkos::Cuda::detect_device_count();
// Use the last device:
Kokkos::