
Graphcore : Poplar Tutorial (Part 3)


tutorials/poplar/tut5_ml/regression-demo.cpp implements an MNIST model in Poplar and runs it on the IPU. This post walks through the main parts of that code.
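For reference, here is a minimal sketch of the headers and device setup the excerpts below assume. This is my reconstruction, not a verbatim quote from the tutorial source; the actual file may acquire the device differently (e.g. via command-line options).

#include <algorithm>
#include <iostream>
#include <vector>

#include <poplar/DeviceManager.hpp>
#include <poplar/Engine.hpp>
#include <poplar/Graph.hpp>
#include <poplin/MatMul.hpp>
#include <poplin/codelets.hpp>
#include <popnn/Loss.hpp>
#include <popnn/NonLinearity.hpp>
#include <popnn/codelets.hpp>
#include <popops/ElementWise.hpp>
#include <popops/ScaledAdd.hpp>
#include <popops/codelets.hpp>
#include <poputil/TileMapping.hpp>

using namespace poplar;
using namespace poplar::program;

// Attach to the first available IPU (sketch; the tutorial may differ).
auto manager = DeviceManager::createDeviceManager();
auto devices = manager.getDevices(TargetType::IPU, 1);
auto it = std::find_if(devices.begin(), devices.end(),
                       [](Device &d) { return d.attach(); });
Device dev = std::move(*it); // assumes an IPU was found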

First, the codelets from popops, poplin, and popnn are added to the graph so that the library operations used below can run on the device.

Graph graph(dev.getTarget());
popops::addCodelets(graph);
poplin::addCodelets(graph);
popnn::addCodelets(graph);

Next, the input tensors of the graph (x, W, b) are created and mapped across the tiles.

// Create tensors in the graph.
Tensor x = graph.addVariable(FLOAT, {imageSize, 1}, "x");
poputil::mapTensorLinearly(graph, x);
Tensor W = graph.addVariable(FLOAT, {10, imageSize}, "W");
poputil::mapTensorLinearly(graph, W);
Tensor b = graph.addVariable(FLOAT, {10, 1}, "b");
poputil::mapTensorLinearly(graph, b);
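poputil::mapTensorLinearly spreads a tensor's elements as evenly as possible across the tiles of the IPU. As a hypothetical check (not part of the tutorial), the resulting mapping can be inspected with Graph::getTileMapping:

// Hypothetical: count how many tiles hold a piece of W.
const auto mapping = graph.getTileMapping(W);
unsigned tilesUsed = 0;
for (const auto &intervals : mapping)
    if (!intervals.empty())
        ++tilesUsed;
std::cout << "W is spread over " << tilesUsed << " tiles\n";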

W and b are made host-writable so they can be initialized from the host (the "weights" and "biases" handles registered here are used later with engine.writeTensor).

// Make the weights and biases host writable for initialization.
graph.createHostWrite("weights", W);
graph.createHostWrite("biases", b);

The numCorrect and expected tensors are also created here.

Tensor numCorrect = graph.addVariable(UNSIGNED_INT, {1}, "numCorrect");
poputil::mapTensorLinearly(graph, numCorrect);
Tensor expected = graph.addVariable(UNSIGNED_INT, {1}, "expected");
poputil::mapTensorLinearly(graph, expected);

A sequential program is created:

// Create the graph and program to execute the model, calculate
// the gradients of W, b and subtract the scaled gradients from the
// parameters
Sequence mProg;

y = Wx + b is computed with poplin::matMul and popops::add:

// Calculate y = Wx + b
Tensor t = poplin::matMul(graph, W, x, mProg, "Wx");
Tensor y = popops::add(graph, t, b, mProg, "Wx+b");

Creating the loss. popnn::calcLoss computes the cross-entropy loss from the softmax output, and also writes the loss gradient dE/dy into delta, which drives the parameter updates below:

// Calculate the loss
Tensor loss = graph.addVariable(FLOAT, {1}, "loss");
poputil::mapTensorLinearly(graph, loss);
// The loss gradient with respect to y
Tensor delta = graph.addVariable(FLOAT, {1, 10}, "delta");
poputil::mapTensorLinearly(graph, delta);
auto softmaxY = popnn::nonLinearity(graph, popnn::NonLinearityType::SOFTMAX,
                                    y.transpose(), mProg, "softmax(Wx+b)");
mProg.add(popnn::calcLoss(graph, softmaxY, expected, loss, delta, numCorrect,
                          popnn::CROSS_ENTROPY_LOSS, "dE/d(Wx+b)"));
// Update: b -= eta * dE/db, where dE/db = dE/dy
float eta = 0.0009;
popops::scaledAddTo(graph, b, delta.transpose(), -eta, mProg,
                    "b += -eta * dE/db");

Gradient update for the weights: W = W - eta * dE/dW

// Update: W -= eta * dE/dW
Tensor wGrad =
    poplin::matMul(graph, delta.transpose(), x.transpose(), mProg, "dE/dW");
popops::scaledAddTo(graph, W, wGrad, -eta, mProg, "W += -eta * dE/dW");
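For reference, the math behind these two updates. With cross-entropy loss E on a softmax output, writing g for the gradient that calcLoss stores in delta and t for the expected label:

\[ g = \mathrm{softmax}(Wx + b) - \mathrm{onehot}(t), \qquad \frac{\partial E}{\partial b} = g, \qquad \frac{\partial E}{\partial W} = g\,x^{\top} \]

Since delta holds g as a row tensor of shape {1, 10}, the code transposes it: delta.transpose() has shape {10, 1} (matching b), and the matMul of {10, 1} by {1, imageSize} gives {10, imageSize} (matching W).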

Three data streams are added: two host-to-device FIFOs (data, labels) and one device-to-host FIFO (hostNumCorrect):

// Create a control program to execute the SGD training algorithm.
DataStream dataIn = graph.addHostToDeviceFIFO("data", FLOAT, imageSize);
DataStream labelIn = graph.addHostToDeviceFIFO("labels", UNSIGNED_INT, 1);
DataStream numCorrectOut =
    graph.addDeviceToHostFIFO("hostNumCorrect", UNSIGNED_INT, 1);

The training program:

Sequence trainProg;

numCorrect is initialized to 0, then the batch loop is built: each of the batchSize Repeat iterations copies the next image and label from the input FIFOs and runs mProg, so a single engine.run(0) call processes one batch of 300 images. Finally numCorrect is copied back to the host through the hostNumCorrect stream.

// Initialize the numCorrect tensor to 0
Tensor zero = graph.addConstant(UNSIGNED_INT, {1}, 0);
graph.setTileMapping(zero, 0);
trainProg.add(Copy(zero, numCorrect));
const unsigned batchSize = 300;
trainProg.add(Repeat(
    batchSize, Sequence(Copy(dataIn, x), Copy(labelIn, expected), mProg)));
trainProg.add(Copy(numCorrect, numCorrectOut));

An engine is created from the graph and the training program, and loaded onto the device:

// Create a Poplar engine.
Engine engine(graph, trainProg);
engine.load(dev);

The host buffers are connected to the data, labels, and hostNumCorrect streams. Connecting a stream with a begin/end pointer pair makes the engine treat the buffer as a circular buffer, advancing through it on every transfer:

// Connect up the data streams
engine.connectStream(dataIn, &data[0], &data[numberOfImages * imageSize]);
engine.connectStream(labelIn, &labels[0], &labels[numberOfImages]);
engine.connectStream(numCorrectOut, &hNumCorrect);
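These calls assume host-side buffers named data, labels, and hNumCorrect, which the tutorial fills from the MNIST files. A plausible sketch of their declarations:

// Sketch of the host buffers (the tutorial loads MNIST data into them).
std::vector<float> data(numberOfImages * imageSize); // flattened images
std::vector<unsigned> labels(numberOfImages);        // one label per image
unsigned hNumCorrect = 0;                            // filled after each run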

The initial values of W and b are written through the host-write handles registered earlier:

// Initialize the weights and biases
std::vector<float> initW = createRandomInitializers(W.numElements());
std::vector<float> initB = createRandomInitializers(b.numElements());
engine.writeTensor("weights", initW.data());
engine.writeTensor("biases", initB.data());

The training loop: engine.run(0) (program 0, i.e. trainProg) is executed once per batch, for every batch in every epoch, accumulating the accuracy figures reported by the device:

// Run the training algorithm, printing out the accuracy regularly
unsigned totalCorrect = 0, totalTested = 0;

const unsigned batches = numberOfImages / batchSize;
for (unsigned epoch = 1; epoch <= epochs; ++epoch) {
    for (unsigned batch = 1; batch <= batches; ++batch) {
        engine.run(0); // trainProg
        totalCorrect += hNumCorrect;
        totalTested += batchSize;
        if (epoch == 1 && batch == 1) {
            engine.printProfileSummary(std::cout);
        }
        // Status update if we've done at least another 20th of an epoch
        if (totalTested > numberOfImages / 20) {
            unsigned percentCorrect = totalCorrect * 100 / totalTested;
            unsigned epochPercent = batch * 100 / batches;
            std::cout << "Epoch " << epoch << " (" << epochPercent
                      << "%), accuracy " << percentCorrect << "%\n";
            totalCorrect = totalTested = 0;
        }
    }
}