@Vengineerの戯言 : Twitter
SystemVerilogの世界へようこそ、すべては、SystemC v0.9公開から始まった
tutorials/poplar/tut5_ml/regression-demo.cpp は、MNISTモデルを Poplar で実装し、IPU で実行するためのコードです。
Popops/Poplin/Popnnの各種 Codelets を graph に追加しています。
// Build the Poplar graph for the attached device's target, then register
// the device-side codelets from the popops / poplin / popnn libraries
// that the operations added below depend on.
Graph graph(dev.getTarget());
popops::addCodelets(graph);
poplin::addCodelets(graph);
popnn::addCodelets(graph);
ここで、グラフへの入力 Tensor (x, W, b) を生成
// Create tensors in the graph.
//   x : input image as a column vector (imageSize x 1)
//   W : weight matrix (10 x imageSize) — presumably one row per MNIST digit class
//   b : bias column vector (10 x 1)
// Each tensor is distributed linearly over the available tiles.
Tensor x = graph.addVariable(FLOAT, {imageSize, 1}, "x");
poputil::mapTensorLinearly(graph, x);
Tensor W = graph.addVariable(FLOAT, {10, imageSize}, "W");
poputil::mapTensorLinearly(graph, W);
Tensor b = graph.addVariable(FLOAT, {10, 1}, "b");
poputil::mapTensorLinearly(graph, b);
ここで、W と b をホストから書き込んで初期化できるようにします
// Make the weights and biases host writable for initialization.
// The string handles ("weights", "biases") are what engine.writeTensor()
// uses later to push the initial values from the host.
graph.createHostWrite("weights", W);
graph.createHostWrite("biases", b);
numCorrect と expected もここで生成
// numCorrect accumulates, on the device, how many predictions were correct;
// expected holds the ground-truth label of the sample currently streamed in.
Tensor numCorrect = graph.addVariable(UNSIGNED_INT, {1}, "numCorrect");
poputil::mapTensorLinearly(graph, numCorrect);
Tensor expected = graph.addVariable(UNSIGNED_INT, {1}, "expected");
poputil::mapTensorLinearly(graph, expected);
シーケンシャルなプログラムを生成
// Create the graph and program to execute the model, calculate
// the gradients of W, b and subtract the scaled gradients from the
// parameters
// mProg is the per-sample program: forward pass, loss/gradient, SGD update.
Sequence mProg;
W * x + b を poplin::matMul と popops::add で計算して y を生成
// Calculate y = Wx + b
// t = W * x via a matrix multiply, then add the bias element-wise.
// The string arguments are debug names attached to the program steps.
Tensor t = poplin::matMul(graph, W, x, mProg, "Wx");
Tensor y = popops::add(graph, t, b, mProg, "Wx+b");
ロスの生成
// Calculate the loss
Tensor loss = graph.addVariable(FLOAT, {1}, "loss");
poputil::mapTensorLinearly(graph, loss);
// The loss gradient with respect to y
Tensor delta = graph.addVariable(FLOAT, {1, 10}, "delta");
poputil::mapTensorLinearly(graph, delta);
// Softmax over the class scores; y is (10 x 1) so transpose to a row vector.
auto softmaxY = popnn::nonLinearity(graph, popnn::NonLinearityType::SOFTMAX,
y.transpose(), mProg, "softmax(Wx+b)");
// NOTE(review): calcLoss is expected to fill `loss` and `delta` and to
// accumulate the correct-prediction count into `numCorrect` — confirm
// against the popnn documentation for the exact contract.
mProg.add(popnn::calcLoss(graph, softmaxY, expected, loss, delta, numCorrect,
popnn::CROSS_ENTROPY_LOSS, "dE/d(Wx+b)"));
// Update: b -= eta * dE/db, where dE/db = dE/dy
// Fix: use a float literal (0.0009f) — the bare double literal 0.0009 was
// implicitly narrowed to float; also make the learning rate const.
const float eta = 0.0009f;
popops::scaledAddTo(graph, b, delta.transpose(), -eta, mProg,
"b += -eta * dE/db");
// Update: W -= eta * dE/dW, with dE/dW = delta^T * x^T (outer product)
Tensor wGrad =
poplin::matMul(graph, delta.transpose(), x.transpose(), mProg, "dE/dW");
popops::scaledAddTo(graph, W, wGrad, -eta, mProg, "W += -eta * dE/dW");
3つの FIFO を追加（data と labels はホスト→デバイス、hostNumCorrect はデバイス→ホスト）
// Create a control program to execute the SGD training algorithm.
// Host-to-device FIFOs feed one image (imageSize floats) and one label
// per transfer; a device-to-host FIFO returns the correct-prediction count.
DataStream dataIn = graph.addHostToDeviceFIFO("data", FLOAT, imageSize);
DataStream labelIn = graph.addHostToDeviceFIFO("labels", UNSIGNED_INT, 1);
DataStream numCorrectOut =
graph.addDeviceToHostFIFO("hostNumCorrect", UNSIGNED_INT, 1);
学習用プログラム
// Top-level program: runs the per-sample step over one whole batch.
Sequence trainProg;
numCorrect の初期化（バッチ開始時に 0 にリセット）
// Initialize the numCorrect tensor to 0
Tensor zero = graph.addConstant(UNSIGNED_INT, {1}, 0);
graph.setTileMapping(zero, 0);
trainProg.add(Copy(zero, numCorrect));
// Repeat batchSize times: stream in one image and its label, then run the
// per-sample forward/backward/update program (mProg).
const unsigned batchSize = 300;
trainProg.add(Repeat(
batchSize, Sequence(Copy(dataIn, x), Copy(labelIn, expected), mProg)));
// After the batch, copy the on-device correct count back to the host.
trainProg.add(Copy(numCorrect, numCorrectOut));
学習用エンジンにグラフとプログラムを割り当て、デバイスにロード
// Create a Poplar engine.
// Compiles the graph with trainProg as program 0 and loads it onto the device.
Engine engine(graph, trainProg);
engine.load(dev);
data, labels, numCorrectOut にデータを割り当て
// Connect up the data streams
// data   : numberOfImages * imageSize floats (all images, streamed in order)
// labels : numberOfImages labels
// hostNumCorrect : single counter written back after each batch
engine.connectStream(dataIn, &data[0], &data[numberOfImages * imageSize]);
engine.connectStream(labelIn, &labels[0], &labels[numberOfImages]);
engine.connectStream(numCorrectOut, &hNumCorrect);
W と b を書き込み
// Initialize the weights and biases
// Random host-side initial values are pushed through the host-write
// handles ("weights" / "biases") registered earlier with createHostWrite.
std::vector<float> initW = createRandomInitializers(W.numElements());
std::vector<float> initB = createRandomInitializers(b.numElements());
engine.writeTensor("weights", initW.data());
engine.writeTensor("biases", initB.data());
学習ループ。各 epoch で batches 回、engine.run(0)（trainProg）を実行
// Run the training algorithm, printing out the accuracy regularly.
// Fix: the original jammed two declarations onto one line; split them,
// restore conventional indentation, and const-qualify locals.
unsigned totalCorrect = 0;
unsigned totalTested = 0;
const unsigned batches = numberOfImages / batchSize;
for (unsigned epoch = 1; epoch <= epochs; ++epoch) {
  for (unsigned batch = 1; batch <= batches; ++batch) {
    engine.run(0); // trainProg
    totalCorrect += hNumCorrect;
    totalTested += batchSize;
    // Dump the execution profile once, after the very first batch.
    if (epoch == 1 && batch == 1) {
      engine.printProfileSummary(std::cout);
    }
    // Status update if we've done at least another 20th of an epoch
    if (totalTested > numberOfImages / 20) {
      // Integer percentages; counters reset below, so accuracy is
      // reported over the window since the last print, not cumulatively.
      const unsigned percentCorrect = totalCorrect * 100 / totalTested;
      const unsigned epochPercent = batch * 100 / batches;
      std::cout << "Epoch " << epoch << " (" << epochPercent
                << "%), accuracy " << percentCorrect << "%\n";
      totalCorrect = totalTested = 0;
    }
  }
}