Tpetra::MultiVector::Dot() slower than Epetra for short vectors, faster for long vectors
Created by: CamelliaDPG
The driver pasted below allows CLI arguments to control the length of two vectors that are dotted together, as well as the number of such dot products that should be done. It does this with both Tpetra::MultiVector
and Epetra_Vector
objects, with a serial distribution, and reports the cumulative time for each of Epetra and Tpetra. Here are the timing results from a run on a shared Intel Xeon E7-4880 machine (times in seconds):
numDots | globalRowCount | Epetra | Tpetra | Slowdown |
---|---|---|---|---|
1000000 | 50 | 0.43 | 8.85 | 21x |
100000 | 500 | 0.11 | 2.86 | 26x |
10000 | 5000 | 0.07 | 2.25 | 32x |
1000 | 50000 | 0.08 | 2.20 | 28x |
100 | 500000 | 0.07 | 2.17 | 31x |
10 | 5000000 | 0.15 | 2.29 | 15x |
1 | 50000000 | 0.15 | 2.29 | 15x |
Can this be addressed? From some profiling, it does appear that part of the story involves calls to getVector()
, which result in new Tpetra Vector objects being created in each dot product (a shallow copy). That is probably why the slowdown is 15x as the vectors get larger, but as high as 32x for smaller vectors: with the smaller vectors you see the cost of the object creation and destruction. It appears, though, that the calls to getVector()
are only about half the story: there is still a 15x slowdown to account for.
// Epetra
#include <Epetra_Map.h>
#include <Epetra_SerialComm.h>
#include <Epetra_Vector.h>
// Tpetra
#include <Tpetra_MultiVector.hpp>
// Teuchos
#include <Teuchos_Comm.hpp>
#include <Teuchos_RCP.hpp>
#include <Teuchos_TimeMonitor.hpp>
#include <iostream>
#include <fstream>
#include <string>
using namespace std;
using namespace Teuchos;
typedef long long GlobalOrdinal; // signed types preferred
typedef int LocalOrdinal; // signed types preferred
typedef double Scalar;
typedef Tpetra::Map<LocalOrdinal, GlobalOrdinal>::node_type NodeType;
typedef Tpetra::Map<LocalOrdinal,GlobalOrdinal,NodeType> TpetraMap;
typedef Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal, NodeType> TpetraMultiVector;
int main(int argc, char *argv[])
{
int numDots = 10000;
int globalRowCount = 1500;
if (argc == 2)
{
numDots = atoi(argv[1]);
cout << "setting numDots to " << numDots << endl;
}
if (argc == 3)
{
numDots = atoi(argv[1]);
globalRowCount = atoi(argv[2]);
}
cout << "numDots is " << numDots << endl;
cout << "globalRowCount is " << globalRowCount << endl;
RCP<Time> epetraTimer = TimeMonitor::getNewCounter("Epetra dot");
RCP<Time> tpetraTimer = TimeMonitor::getNewCounter("Tpetra dot");
/*** Epetra setup ***/
Epetra_SerialComm serialComm;
Epetra_Map serialMap(globalRowCount, 0, serialComm);
RCP<Epetra_Vector> a_epetra = rcp( new Epetra_Vector(serialMap) );
RCP<Epetra_Vector> b_epetra = rcp( new Epetra_Vector(serialMap) );
for (int localOrdinal=0; localOrdinal<serialMap.NumMyElements(); localOrdinal++)
{
(*a_epetra)[localOrdinal] = 2.0;
(*b_epetra)[localOrdinal] = 0.5;
}
/*** Tpetra setup ***/
RCP<Teuchos::Comm<int>> serialCommTeuchos = Teuchos::rcp( new Teuchos::SerialComm<int>() );
RCP<const TpetraMap> serialMapTpetra = rcp( new TpetraMap(globalRowCount, 0, serialCommTeuchos));
RCP<TpetraMultiVector> a_tpetra = rcp( new TpetraMultiVector(serialMapTpetra, 1)); // numVectors = 1
RCP<TpetraMultiVector> b_tpetra = rcp( new TpetraMultiVector(serialMapTpetra, 1)); // numVectors = 1
auto a_localView = a_tpetra->getLocalView<Kokkos::HostSpace>();
auto b_localView = b_tpetra->getLocalView<Kokkos::HostSpace>();
for (int localOrdinal=serialMapTpetra->getMinLocalIndex(); localOrdinal<=serialMapTpetra->getMaxLocalIndex(); localOrdinal++)
{
a_localView(localOrdinal,0) = 2.0;
b_localView(localOrdinal,0) = 0.5;
}
for (int i=0; i<numDots; i++)
{
double epetraResult;
int err;
epetraTimer->start();
epetraTimer->incrementNumCalls();
err = a_epetra->Dot(*b_epetra, &epetraResult);
epetraTimer->stop();
if (err != 0)
{
cout << "Epetra dot returned err code " << err << ". Halting execution.\n";
exit(err);
}
std::vector<Scalar> tpetraResult( 1 );
tpetraTimer->start();
tpetraTimer->incrementNumCalls();
a_tpetra->dot(*b_tpetra, tpetraResult);
tpetraTimer->stop();
}
TimeMonitor::summarize(serialCommTeuchos.ptr());
return 0;
}