Commit c9ab9ef7 authored by Rob Cameron

Pipeline work to 1117b437

parent a3c734d0
......@@ -15,7 +15,6 @@
#include <type_traits>
#include <tuple>
#include <vector>
#include <util/aligned_allocator.h>
#include <llvm/IR/Type.h>
#include <llvm/Support/ErrorHandling.h>
#include <kernel/core/idisa_target.h>
......@@ -151,7 +150,7 @@ public:
using set_literal_t = std::vector<literal_t>;
/// The internal buffer type of the stream.
using buffer_t = std::vector<buffer_item_type, AlignedAllocator<buffer_item_type, 64>>;
using buffer_t = std::vector<buffer_item_type>;
/// The number of stream items per buffer item.
static const uint32_t stream_items_per_buffer_item_v = si_per_bi<I>::value;
......@@ -179,7 +178,9 @@ struct copy_decoder {
static const size_t num_elements_v = 1;
static result_t decode(typename traits::literal_t const & str) {
return std::make_tuple(typename traits::buffer_t{str.begin(), str.end()}, str.size(), 1);
static_assert(std::is_same<typename traits::literal_t, typename traits::buffer_t>::value,
"copy_decoder cannot be used when literal_t != buffer_t");
return std::make_tuple(str, str.size(), 1);
}
};
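The new static_assert makes the pass-through behaviour of copy_decoder explicit: the literal can only be returned as-is when the literal and buffer container types coincide. A minimal standalone sketch of that guard, using hypothetical example_traits and passthrough_decoder names rather than the types in this file:

#include <cstddef>
#include <cstdint>
#include <tuple>
#include <type_traits>
#include <vector>

struct example_traits {
    using literal_t = std::vector<uint8_t>;  // hypothetical literal container
    using buffer_t  = std::vector<uint8_t>;  // must be the same type for pass-through
};

template <typename traits>
struct passthrough_decoder {
    using result_t = std::tuple<typename traits::buffer_t, size_t, size_t>;
    static result_t decode(typename traits::literal_t const & str) {
        static_assert(std::is_same<typename traits::literal_t,
                                   typename traits::buffer_t>::value,
                      "passthrough_decoder requires literal_t == buffer_t");
        // No element-wise copy into a distinct buffer type; the literal is forwarded.
        return std::make_tuple(str, str.size(), 1);
    }
};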
......@@ -264,7 +265,7 @@ struct bin_decoder {
static const size_t num_elements_v = 1;
static result_t decode(typename traits::literal_t const & str) {
std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> buffer{};
std::vector<uint8_t> buffer{};
int counter = 0;
size_t len = 0;
uint8_t builder = 0;
......
......@@ -1010,14 +1010,6 @@ void __report_failure_v(const char * name, const char * fmt, const uintptr_t * t
}
out << "\nNo debug symbols loaded.\n";
}
if (codegen::TaskThreads > 1 || codegen::SegmentThreads > 1) {
if (colourize) {
out.changeColor(raw_fd_ostream::BLUE, true);
}
out << " (Thread # ";
out.write_hex(reinterpret_cast<unsigned long>(pthread_self()));
out << ")";
}
if (colourize) {
out.resetColor();
}
......@@ -1427,10 +1419,8 @@ LoadInst * CBuilder::CreateLoad(Value * Ptr, bool isVolatile, const Twine Name)
}
StoreInst * CBuilder::CreateStore(Value * Val, Value * Ptr, bool isVolatile) {
assert ("Ptr (Arg2) was expected to be a pointer type" &&
Ptr->getType()->isPointerTy());
assert ("Ptr (Arg2) is not a pointer type for Val (Arg1)" &&
Val->getType() == Ptr->getType()->getPointerElementType());
assert ("Ptr is not a pointer type for Val" &&
Ptr->getType()->isPointerTy() && Val->getType() == Ptr->getType()->getPointerElementType());
if (LLVM_UNLIKELY(codegen::DebugOptionIsSet(codegen::EnableAsserts))) {
CheckAddress(Ptr, ConstantExpr::getSizeOf(Val->getType()), "CreateStore");
}
......@@ -1840,14 +1830,9 @@ void CBuilder::CheckAddress(Value * const Ptr, Value * const Size, Constant * co
#endif
}
Value * const addr = CreatePointerCast(Ptr, voidPtrTy);
Value * const firstPoisoned = CreateCall(isPoisoned, { addr, CreateTrunc(Size, sizeTy) });
Value * const valid = CreateICmpEQ(firstPoisoned, ConstantPointerNull::get(voidPtrTy));
DataLayout DL(getModule());
IntegerType * const intPtrTy = cast<IntegerType>(DL.getIntPtrType(firstPoisoned->getType()));
Value * const startInt = CreatePtrToInt(Ptr, intPtrTy);
Value * const firstPoisonedInt = CreatePtrToInt(firstPoisoned, intPtrTy);
Value * const offset = CreateSub(firstPoisonedInt, startInt);
__CreateAssert(valid, "%s was given an unallocated %" PRIuMAX "-byte memory address 0x%" PRIxPTR " (first poisoned=%" PRIuMAX ")", {Name, Size, Ptr, offset});
Value * check = CreateCall(isPoisoned, { addr, CreateTrunc(Size, sizeTy) });
Value * const valid = CreateICmpEQ(check, ConstantPointerNull::get(voidPtrTy));
__CreateAssert(valid, "%s was given an unallocated %" PRIuMAX "-byte memory address 0x%" PRIxPTR, {Name, Size, Ptr});
}
#endif
}
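For reference, the simplified check above keeps only the valid/invalid test and drops the offending-offset computation. Assuming the isPoisoned callee wraps the AddressSanitizer interface function __asan_region_is_poisoned (which returns the address of the first poisoned byte in a range, or null if none), a host-level sketch of the same predicate is:

#include <cstddef>

extern "C" void * __asan_region_is_poisoned(void * beg, size_t size);

// True iff no byte of [addr, addr + size) is poisoned; this is the condition the
// generated __CreateAssert now tests.
bool addressRangeIsValid(void * addr, size_t size) {
    return __asan_region_is_poisoned(addr, size) == nullptr;
}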
......
......@@ -385,7 +385,7 @@ std::vector<fs::path> getFullFileList(CPUDriver & driver, cl::list<std::string>
// them to the global list of selected files.
grep::NestedInternalSearchEngine pathSelectEngine(driver);
pathSelectEngine.setNumOfThreads(codegen::SegmentThreads); // 1
pathSelectEngine.setNumOfThreads(1);
pathSelectEngine.setRecordBreak(grep::GrepRecordBreakKind::Null);
pathSelectEngine.init();
pathSelectEngine.push(coalesceREs(getIncludeExcludePatterns(), GitREcoalescing));
......
......@@ -372,8 +372,6 @@ void KernelCompiler::setDoSegmentProperties(BuilderRef b, const ArrayRef<Value *
const auto internallySynchronized = mTarget->hasAttribute(AttrId::InternallySynchronized);
const auto greedy = mTarget->isGreedy();
const auto kernelPrefix = getName();
Rational fixedRateLCM{0};
mFixedRateFactor = nullptr;
if (LLVM_UNLIKELY(internallySynchronized || greedy)) {
......@@ -408,6 +406,10 @@ void KernelCompiler::setDoSegmentProperties(BuilderRef b, const ArrayRef<Value *
#ifdef CHECK_IO_ADDRESS_RANGE
auto checkStreamRange = [&](const std::unique_ptr<StreamSetBuffer> & buffer, const Binding & binding, Value * const startItemCount) {
SmallVector<char, 256> tmp;
raw_svector_ostream out(tmp);
out << "StreamSet " << getName() << ":" << binding.getName();
PointerType * const int8PtrTy = b->getInt8PtrTy();
ConstantInt * const ZERO = b->getSize(0);
......@@ -416,7 +418,6 @@ void KernelCompiler::setDoSegmentProperties(BuilderRef b, const ArrayRef<Value *
Value * const fromIndex = b->CreateUDiv(startItemCount, BLOCK_WIDTH);
Value * const baseAddress = buffer->getBaseAddress(b);
Value * const startPtr = buffer->getStreamBlockPtr(b, baseAddress, ZERO, fromIndex);
Value * const start = b->CreatePointerCast(startPtr, int8PtrTy);
Value * const toIndex = b->CreateCeilUDiv(buffer->getCapacity(b), BLOCK_WIDTH);
Value * const endPtr = buffer->getStreamBlockPtr(b, baseAddress, ZERO, toIndex);
......@@ -438,9 +439,7 @@ void KernelCompiler::setDoSegmentProperties(BuilderRef b, const ArrayRef<Value *
auto & buffer = mStreamSetInputBuffers[i];
assert (buffer.get() && buffer->isLinear());
const Binding & input = mInputStreamSets[i];
Value * const virtualBaseAddress = b->CreatePointerCast(nextArg(), buffer->getPointerType());
Value * const localHandle = b->CreateAllocaAtEntryPoint(buffer->getHandleType(b));
buffer->setHandle(localHandle);
buffer->setBaseAddress(b, virtualBaseAddress);
......@@ -460,43 +459,36 @@ void KernelCompiler::setDoSegmentProperties(BuilderRef b, const ArrayRef<Value *
const ProcessingRate & rate = input.getRate();
Value * processed = nullptr;
if (internallySynchronized || isAddressable(input)) {
mProcessedInputItemPtr[i] = nextArg();
processed = b->CreateLoad(mProcessedInputItemPtr[i]);
} else {
if (LLVM_LIKELY(isCountable(input))) {
processed = nextArg();
} else { // isRelative
const auto port = getStreamPort(rate.getReference());
assert (port.Type == PortType::Input && port.Number < i);
assert (mProcessedInputItemPtr[port.Number]);
Value * const ref = b->CreateLoad(mProcessedInputItemPtr[port.Number]);
processed = b->CreateMulRate(ref, rate.getRate());
}
assert (processed);
assert (processed->getType() == sizeTy);
AllocaInst * const processedItems = b->CreateAllocaAtEntryPoint(sizeTy);
b->CreateStore(processed, processedItems);
mProcessedInputItemPtr[i] = processedItems;
mUpdatableProcessedInputItemPtr[i] = nextArg();
processed = b->CreateLoad(mUpdatableProcessedInputItemPtr[i]);
} else if (LLVM_LIKELY(isCountable(input))) {
processed = nextArg();
} else { // isRelative
const auto port = getStreamPort(rate.getReference());
assert (port.Type == PortType::Input && port.Number < i);
assert (mProcessedInputItemPtr[port.Number]);
Value * const ref = b->CreateLoad(mProcessedInputItemPtr[port.Number]);
processed = b->CreateMulRate(ref, rate.getRate());
}
assert (processed);
assert (processed->getType() == sizeTy);
AllocaInst * const processedItems = b->CreateAllocaAtEntryPoint(sizeTy);
b->CreateStore(processed, processedItems);
mProcessedInputItemPtr[i] = processedItems;
/// ----------------------------------------------------
/// accessible item count
/// ----------------------------------------------------
Value * accessible = nullptr;
Value * accessible = nullptr;
if (LLVM_UNLIKELY(internallySynchronized || requiresItemCount(input))) {
accessible = nextArg();
} else {
accessible = b->CreateCeilUMulRate(mFixedRateFactor, rate.getRate() / fixedRateLCM);
}
assert (accessible);
assert (accessible->getType() == sizeTy);
mAccessibleInputItems[i] = accessible;
Value * avail = b->CreateAdd(processed, accessible);
mAvailableInputItems[i] = avail;
if (input.hasLookahead()) {
avail = b->CreateAdd(avail, b->getSize(input.getLookahead()));
}
......@@ -527,7 +519,6 @@ void KernelCompiler::setDoSegmentProperties(BuilderRef b, const ArrayRef<Value *
const std::unique_ptr<StreamSetBuffer> & buffer = mStreamSetOutputBuffers[i];
assert (buffer.get() && buffer->isLinear());
const Binding & output = mOutputStreamSets[i];
const auto isShared = output.hasAttribute(AttrId::SharedManagedBuffer);
const auto isLocal = internallySynchronized || isShared || Kernel::isLocalBuffer(output, false);
......@@ -560,29 +551,26 @@ void KernelCompiler::setDoSegmentProperties(BuilderRef b, const ArrayRef<Value *
const ProcessingRate & rate = output.getRate();
Value * produced = nullptr;
if (LLVM_LIKELY(internallySynchronized || canTerminate || isAddressable(output))) {
mProducedOutputItemPtr[i] = nextArg();
produced = b->CreateLoad(mProducedOutputItemPtr[i]);
} else {
if (LLVM_LIKELY(isCountable(output))) {
produced = nextArg();
} else { // isRelative
// For now, if something is produced at a relative rate to another stream in a kernel that
// may terminate, its final item count is inherited from its reference stream and cannot
// be set independently. Should they be independent at early termination?
const auto port = getStreamPort(rate.getReference());
assert (port.Type == PortType::Input || (port.Type == PortType::Output && port.Number < i));
const auto & items = (port.Type == PortType::Input) ? mProcessedInputItemPtr : mProducedOutputItemPtr;
Value * const ref = b->CreateLoad(items[port.Number]);
produced = b->CreateMulRate(ref, rate.getRate());
}
AllocaInst * const producedItems = b->CreateAllocaAtEntryPoint(sizeTy);
b->CreateStore(produced, producedItems);
mProducedOutputItemPtr[i] = producedItems;
mUpdatableProducedOutputItemPtr[i] = nextArg();
produced = b->CreateLoad(mUpdatableProducedOutputItemPtr[i]);
} else if (LLVM_LIKELY(isCountable(output))) {
produced = nextArg();
} else { // isRelative
// For now, if something is produced at a relative rate to another stream in a kernel that
// may terminate, its final item count is inherited from its reference stream and cannot
// be set independently. Should they be independent at early termination?
const auto port = getStreamPort(rate.getReference());
assert (port.Type == PortType::Input || (port.Type == PortType::Output && port.Number < i));
const auto & items = (port.Type == PortType::Input) ? mProcessedInputItemPtr : mProducedOutputItemPtr;
Value * const ref = b->CreateLoad(items[port.Number]);
produced = b->CreateMulRate(ref, rate.getRate());
}
assert (produced);
assert (produced->getType() == sizeTy);
mInitiallyProducedOutputItems[i] = produced;
AllocaInst * const producedItems = b->CreateAllocaAtEntryPoint(sizeTy);
b->CreateStore(produced, producedItems);
mProducedOutputItemPtr[i] = producedItems;
/// ----------------------------------------------------
/// writable / consumed item count
/// ----------------------------------------------------
......@@ -604,14 +592,15 @@ void KernelCompiler::setDoSegmentProperties(BuilderRef b, const ArrayRef<Value *
Value * capacity = nullptr;
if (writable) {
capacity = b->CreateAdd(produced, writable);
buffer->setCapacity(b, capacity);
#ifdef CHECK_IO_ADDRESS_RANGE
if (LLVM_UNLIKELY(enableAsserts)) {
checkStreamRange(buffer, output, produced);
}
#endif
} else {
capacity = ConstantExpr::getNeg(b->getSize(1));
}
buffer->setCapacity(b, capacity);
}
mWritableOutputItems[i] = writable;
}
......@@ -755,7 +744,7 @@ inline void KernelCompiler::callGenerateDoSegmentMethod(BuilderRef b) {
args.reserve(mCurrentMethod->arg_size());
for (auto ArgI = mCurrentMethod->arg_begin(); ArgI != mCurrentMethod->arg_end(); ++ArgI) {
args.push_back(&(*ArgI));
}
}
setDoSegmentProperties(b, args);
END_SCOPED_REGION
......@@ -769,16 +758,14 @@ inline void KernelCompiler::callGenerateDoSegmentMethod(BuilderRef b) {
b->CreateMProtect(mSharedHandle, CBuilder::Protect::READ);
}
// #error advance processed item counts for internally synchronized kernels? pipeline should handle it but didn't seem to?
// const auto numOfInputs = getNumOfStreamInputs();
const auto numOfInputs = getNumOfStreamInputs();
// for (unsigned i = 0; i < numOfInputs; i++) {
// if (mUpdatableProcessedInputItemPtr[i]) {
// Value * const items = b->CreateLoad(mProcessedInputItemPtr[i]);
// b->CreateStore(items, mUpdatableProcessedInputItemPtr[i]);
// }
// }
for (unsigned i = 0; i < numOfInputs; i++) {
if (mUpdatableProcessedInputItemPtr[i]) {
Value * const items = b->CreateLoad(mProcessedInputItemPtr[i]);
b->CreateStore(items, mUpdatableProcessedInputItemPtr[i]);
}
}
const auto numOfOutputs = getNumOfStreamOutputs();
......@@ -798,21 +785,21 @@ inline void KernelCompiler::callGenerateDoSegmentMethod(BuilderRef b) {
Constant * const LOG_2_BLOCK_WIDTH = b->getSize(floor_log2(b->getBitBlockWidth()));
Constant * const ZERO = b->getSize(0);
Value * produced = mInitiallyProducedOutputItems[i];
// // TODO: will LLVM optimizations replace the following with the already loaded value?
// // If not, re-loading it here may reduce register pressure / compilation time.
// if (mProducedOutputItemPtr[i]) {
// produced = b->CreateLoad(mProducedOutputItemPtr[i]);
// }
// TODO: will LLVM optimizations replace the following with the already loaded value?
// If not, re-loading it here may reduce register pressure / compilation time.
if (mUpdatableProducedOutputItemPtr[i]) {
produced = b->CreateLoad(mUpdatableProducedOutputItemPtr[i]);
}
Value * const blockIndex = b->CreateLShr(produced, LOG_2_BLOCK_WIDTH);
Value * vba = buffer->getStreamLogicalBasePtr(b.get(), baseAddress, ZERO, blockIndex);
vba = b->CreatePointerCast(vba, b->getVoidPtrTy());
b->CreateStore(vba, mUpdatableOutputBaseVirtualAddressPtr[i]);
}
// if (mUpdatableProducedOutputItemPtr[i]) {
// Value * const items = b->CreateLoad(mProducedOutputItemPtr[i]);
// b->CreateStore(items, mUpdatableProducedOutputItemPtr[i]);
// }
if (mUpdatableProducedOutputItemPtr[i]) {
Value * const items = b->CreateLoad(mProducedOutputItemPtr[i]);
b->CreateStore(items, mUpdatableProducedOutputItemPtr[i]);
}
}
// return the termination signal (if one exists)
......@@ -834,7 +821,7 @@ std::vector<Value *> KernelCompiler::storeDoSegmentState() const {
const auto numOfOutputs = getNumOfStreamOutputs();
std::vector<Value *> S;
S.resize(8 + numOfInputs * 3 + numOfOutputs * 5);
S.resize(8 + numOfInputs * 4 + numOfOutputs * 6);
auto o = S.begin();
......@@ -858,11 +845,14 @@ std::vector<Value *> KernelCompiler::storeDoSegmentState() const {
copy(mProcessedInputItemPtr, numOfInputs);
copy(mAccessibleInputItems, numOfInputs);
copy(mAvailableInputItems, numOfInputs);
copy(mUpdatableProcessedInputItemPtr, numOfInputs);
copy(mProducedOutputItemPtr, numOfOutputs);
copy(mInitiallyProducedOutputItems, numOfOutputs);
copy(mWritableOutputItems, numOfOutputs);
copy(mConsumedOutputItems, numOfOutputs);
copy(mUpdatableProducedOutputItemPtr, numOfOutputs);
copy(mUpdatableOutputBaseVirtualAddressPtr, numOfOutputs);
assert (o == S.end());
......@@ -902,12 +892,14 @@ void KernelCompiler::restoreDoSegmentState(const std::vector<Value *> & S) {
revert(mProcessedInputItemPtr, numOfInputs);
revert(mAccessibleInputItems, numOfInputs);
revert(mAvailableInputItems, numOfInputs);
revert(mUpdatableProcessedInputItemPtr, numOfInputs);
const auto numOfOutputs = getNumOfStreamOutputs();
revert(mProducedOutputItemPtr, numOfOutputs);
revert(mInitiallyProducedOutputItems, numOfOutputs);
revert(mWritableOutputItems, numOfOutputs);
revert(mConsumedOutputItems, numOfOutputs);
revert(mUpdatableProducedOutputItemPtr, numOfOutputs);
revert(mUpdatableOutputBaseVirtualAddressPtr, numOfOutputs);
assert (o == S.end());
......
......@@ -525,6 +525,7 @@ void StaticBuffer::prepareLinearBuffer(BuilderPtr b, llvm::Value * const produce
indices[0] = b->getInt32(0);
indices[1] = b->getInt32(EffectiveCapacity);
Value * const capacityField = b->CreateInBoundsGEP(mHandle, indices);
Value * const consumedChunks = b->CreateUDiv(consumed, BLOCK_WIDTH);
indices[1] = b->getInt32(BaseAddress);
Value * const virtualBaseField = b->CreateInBoundsGEP(mHandle, indices);
......@@ -535,12 +536,13 @@ void StaticBuffer::prepareLinearBuffer(BuilderPtr b, llvm::Value * const produce
Value * const mallocedAddrField = b->CreateInBoundsGEP(mHandle, indices);
Value * const bufferStart = b->CreateLoad(mallocedAddrField);
Value * const consumedChunks = b->CreateUDiv(consumed, BLOCK_WIDTH);
Value * const newBaseAddress = b->CreateGEP(bufferStart, b->CreateNeg(consumedChunks));
b->CreateStore(newBaseAddress, virtualBaseField);
Value * const effectiveCapacity = b->CreateAdd(consumedChunks, getInternalCapacity(b));
Value * const newBaseAddress = b->CreateGEP(bufferStart, b->CreateNeg(consumedChunks));
Value * const effectiveCapacity = b->CreateAdd(consumedChunks, b->getSize(mCapacity));
b->CreateStore(newBaseAddress, virtualBaseField);
b->CreateStore(effectiveCapacity, capacityField);
}
}
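The reordered lines compute the same two derived values as before: a virtual base address shifted back by the number of consumed chunks, and an effective capacity grown by the same amount, so callers can keep addressing the buffer by absolute chunk index. A plain C++ stand-in for that arithmetic (names are illustrative; the real code emits a negative GEP on the handle's BaseAddress field):

#include <cstddef>
#include <cstdint>

struct LinearBufferView {
    uint8_t * virtualBase;       // mallocedAddress - consumedChunks * chunkBytes
    size_t    effectiveCapacity; // consumedChunks + internalCapacity, in chunks
};

LinearBufferView prepareLinear(uint8_t * mallocedAddress, size_t chunkBytes,
                               size_t internalCapacity, size_t consumedChunks) {
    LinearBufferView v;
    // Indexing v.virtualBase at chunk 'consumedChunks' lands on mallocedAddress[0];
    // the pointer is never dereferenced below that position.
    v.virtualBase = mallocedAddress - consumedChunks * chunkBytes;
    v.effectiveCapacity = consumedChunks + internalCapacity;
    return v;
}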
......@@ -796,8 +798,7 @@ void DynamicBuffer::reserveCapacity(BuilderPtr b, Value * const produced, Value
indices[1] = b->getInt32(EffectiveCapacity);
Value * const capacityField = b->CreateInBoundsGEP(handle, indices);
Value * const capacity = b->CreateLoad(capacityField);
Value * const capacity = b->CreateLoad(capacityField);
Value * const consumedChunks = b->CreateUDiv(consumed, BLOCK_WIDTH);
Value * const producedChunks = b->CreateCeilUDiv(produced, BLOCK_WIDTH);
Value * const requiredCapacity = b->CreateAdd(produced, required);
......@@ -820,10 +821,6 @@ void DynamicBuffer::reserveCapacity(BuilderPtr b, Value * const produced, Value
Value * const bytesToCopy = b->CreateMul(unconsumedChunks, CHUNK_SIZE);
//b->CallPrintInt("consumed", consumed);
//b->CallPrintInt("CHUNK_SIZE", CHUNK_SIZE);
//b->CallPrintInt("bytesToCopy", bytesToCopy);
BasicBlock * const copyBack = BasicBlock::Create(C, "copyBack", func);
BasicBlock * const expandAndCopyBack = BasicBlock::Create(C, "expandAndCopyBack", func);
BasicBlock * const updateBaseAddress = BasicBlock::Create(C, "updateBaseAddress", func);
......
......@@ -169,24 +169,20 @@ void ReadSourceKernel::generateInitializeMethod(const unsigned codeUnitWidth, co
void ReadSourceKernel::generateDoSegmentMethod(const unsigned codeUnitWidth, const unsigned stride, BuilderRef b) {
ConstantInt * const strideItems = b->getSize(stride);
ConstantInt * const itemsToRead = b->getSize(stride);
ConstantInt * const codeUnitBytes = b->getSize(codeUnitWidth / 8);
Constant * const strideBytes = ConstantExpr::getMul(strideItems, codeUnitBytes);
BasicBlock * const entryBB = b->GetInsertBlock();
BasicBlock * const moveData = b->CreateBasicBlock("MoveData");
BasicBlock * const prepareBuffer = b->CreateBasicBlock("PrepareBuffer");
BasicBlock * const readData = b->CreateBasicBlock("ReadData");
BasicBlock * const readIncomplete = b->CreateBasicBlock("readIncomplete");
BasicBlock * const setTermination = b->CreateBasicBlock("SetTermination");
BasicBlock * const readExit = b->CreateBasicBlock("ReadExit");
// Can we append to our existing buffer without impacting any subsequent kernel?
Value * const produced = b->getProducedItemCount("sourceBuffer");
Value * const itemsPending = b->CreateAdd(produced, strideItems);
Value * const itemsPending = b->CreateAdd(produced, itemsToRead);
Value * const effectiveCapacity = b->getScalarField("effectiveCapacity");
Value * const baseBuffer = b->getScalarField("buffer");
Value * const fd = b->getScalarField("fileDescriptor");
Value * const permitted = b->CreateICmpULT(itemsPending, effectiveCapacity);
b->CreateLikelyCondBr(permitted, readData, moveData);
......@@ -211,7 +207,7 @@ void ReadSourceKernel::generateDoSegmentMethod(const unsigned codeUnitWidth, con
Value * const unreadItems = b->CreateSub(produced, consumed);
Value * const unreadData = b->getRawOutputPointer("sourceBuffer", consumed);
Value * const potentialItems = b->CreateAdd(unreadItems, strideItems);
Value * const potentialItems = b->CreateAdd(unreadItems, itemsToRead);
Value * const toWrite = b->CreateGEP(baseBuffer, potentialItems);
Value * const canCopy = b->CreateICmpULT(toWrite, unreadData);
......@@ -269,33 +265,16 @@ void ReadSourceKernel::generateDoSegmentMethod(const unsigned codeUnitWidth, con
b->CreateBr(readData);
// Regardless of whether we're simply appending data or had to allocate a new buffer, read a new page
// of data into the input source buffer. This may involve multiple read calls.
// of data into the input source buffer. If we fail to read a full page ...
b->SetInsertPoint(readData);
PHINode * const bytesToRead = b->CreatePHI(strideBytes->getType(), 3);
bytesToRead->addIncoming(strideBytes, entryBB);
bytesToRead->addIncoming(strideBytes, prepareBuffer);
PHINode * const producedSoFar = b->CreatePHI(produced->getType(), 3);
producedSoFar->addIncoming(produced, entryBB);
producedSoFar->addIncoming(produced, prepareBuffer);
Value * const sourceBuffer = b->getRawOutputPointer("sourceBuffer", producedSoFar);
Value * const sourceBuffer = b->getRawOutputPointer("sourceBuffer", produced);
Value * const fd = b->getScalarField("fileDescriptor");
Constant * const bytesToRead = ConstantExpr::getMul(itemsToRead, codeUnitBytes);
Value * const bytesRead = b->CreateReadCall(fd, sourceBuffer, bytesToRead);
// There are 4 possible results from read:
// bytesRead == -1: an error occurred
// bytesRead == 0: EOF, no bytes read
// 0 < bytesRead < bytesToRead: some data read (more may be available)
// bytesRead == bytesToRead: the full amount requested was read.
b->CreateUnlikelyCondBr(b->CreateICmpNE(bytesToRead, bytesRead), readIncomplete, readExit);
b->SetInsertPoint(readIncomplete);
// Keep reading until the full stride is read, or there is no more data.
Value * moreToRead = b->CreateSub(bytesToRead, bytesRead);
Value * readSoFar = b->CreateSub(strideBytes, moreToRead);
Value * const itemsRead = b->CreateUDiv(readSoFar, codeUnitBytes);
Value * const itemsRead = b->CreateUDiv(bytesRead, codeUnitBytes);
Value * const itemsBuffered = b->CreateAdd(produced, itemsRead);
bytesToRead->addIncoming(moreToRead, readIncomplete);
producedSoFar->addIncoming(itemsBuffered, readIncomplete);
b->CreateCondBr(b->CreateICmpSGT(bytesRead, b->getSize(0)), readData, setTermination);
b->CreateUnlikelyCondBr(b->CreateICmpULT(itemsBuffered, itemsPending), setTermination, readExit);
// ... set the termination signal.
b->SetInsertPoint(setTermination);
Value * const bytesToZero = b->CreateMul(b->CreateSub(itemsPending, itemsBuffered), codeUnitBytes);
......
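The removed PHI-node loop retried read(2) until a full stride had been buffered; the replacement issues a single read per segment and sets the termination signal when fewer items than expected arrive. A host-level sketch of the behaviour the old IR encoded (illustrative names, POSIX read):

#include <cstddef>
#include <unistd.h>

// Read until the full request is satisfied, EOF is reached, or an error occurs.
// Returns the total bytes read (possibly < bytesToRead at EOF), or -1 on error.
ssize_t readFullStride(int fd, char * buffer, size_t bytesToRead) {
    size_t total = 0;
    while (total < bytesToRead) {
        const ssize_t bytesRead = ::read(fd, buffer + total, bytesToRead - total);
        if (bytesRead < 0) {
            return -1;                           // error
        }
        if (bytesRead == 0) {
            break;                               // EOF: no more data will arrive
        }
        total += static_cast<size_t>(bytesRead); // partial read: continue
    }
    return static_cast<ssize_t>(total);
}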
......@@ -100,18 +100,12 @@ void FileSink::generateInitializeMethod(BuilderRef b) {
void FileSink::generateDoSegmentMethod(BuilderRef b) {
Value * codeUnitBuffer = b->getInputStreamBlockPtr("codeUnitBuffer", b->getInt32(0));
codeUnitBuffer = b->CreatePointerCast(codeUnitBuffer, b->getInt8PtrTy());
//b->CallPrintInt("fileSink:codeUnitBuffer", codeUnitBuffer);
Value * bytesToDo = b->getAccessibleItemCount("codeUnitBuffer");
if (LLVM_UNLIKELY(mCodeUnitWidth > 8)) {
bytesToDo = b->CreateMul(bytesToDo, b->getSize(mCodeUnitWidth / 8));
} else if (LLVM_UNLIKELY(mCodeUnitWidth < 8)) {
bytesToDo = b->CreateUDiv(bytesToDo, b->getSize(8 / mCodeUnitWidth));
}
//b->CallPrintInt("fileSink:bytesToDo", bytesToDo);
Value * const fileDescriptor = b->getScalarField("fileDescriptor");
b->CreateWriteCall(fileDescriptor, codeUnitBuffer, bytesToDo);
}
......
......@@ -58,6 +58,7 @@ void PipelineAnalysis::addStreamSetsToBufferGraph(BuilderRef b) {
for (const auto e : make_iterator_range(out_edges(PipelineInput, mBufferGraph))) {
const auto streamSet = target(e, mBufferGraph);
BufferNode & bn = mBufferGraph[streamSet];
bn.Type |= BufferType::External;
if (LLVM_LIKELY(bn.Buffer == nullptr)) {
const BufferPort & rate = mBufferGraph[e];
const Binding & input = rate.Binding;
......@@ -70,6 +71,7 @@ void PipelineAnalysis::addStreamSetsToBufferGraph(BuilderRef b) {
for (const auto e : make_iterator_range(in_edges(PipelineOutput, mBufferGraph))) {
const auto streamSet = source(e, mBufferGraph);
BufferNode & bn = mBufferGraph[streamSet];
bn.Type |= BufferType::External;
if (LLVM_LIKELY(bn.Buffer == nullptr)) {
const BufferPort & rate = mBufferGraph[e];
const Binding & output = rate.Binding;
......@@ -295,8 +297,6 @@ void PipelineAnalysis::generateInitialBufferGraph() {
BufferPort bp(port, binding, lb, ub);
bp.Countable = isCountable(binding);
if (LLVM_UNLIKELY(rate.getKind() == RateId::Unknown)) {
bp.IsManaged = true;
}
......@@ -323,13 +323,6 @@ void PipelineAnalysis::generateInitialBufferGraph() {
break;
case AttrId::Deferred:
bp.IsDeferred = true;
if (LLVM_UNLIKELY(!bp.Countable)) {
SmallVector<char, 256> tmp;
raw_svector_ostream out(tmp);
out << kernelObj->getName() << "." << binding.getName()
<< " cannot be both a Deferred and Non-Countable rate.";
report_fatal_error(out.str());
}
break;
case AttrId::SharedManagedBuffer:
bp.IsShared = true;
......@@ -340,7 +333,6 @@ void PipelineAnalysis::generateInitialBufferGraph() {
default: break;
}
}
return bp;
};
......@@ -488,22 +480,6 @@ void PipelineAnalysis::generateInitialBufferGraph() {
}
}
}
// fill in any unmanaged pipeline input buffers
for (const auto e : make_iterator_range(out_edges(PipelineInput, mBufferGraph))) {
const auto streamSet = target(e, mBufferGraph);
BufferNode & bn = mBufferGraph[streamSet];
bn.Type |= BufferType::External;
}
// and pipeline output buffers ...
for (const auto e : make_iterator_range(in_edges(PipelineOutput, mBufferGraph))) {
const auto streamSet = source(e, mBufferGraph);
BufferNode & bn = mBufferGraph[streamSet];
bn.Type |= BufferType::External;
}
}
/** ------------------------------------------------------------------------------------------------------------- *
......@@ -641,72 +617,6 @@ void PipelineAnalysis::identifyLinearBuffers() {
#endif
}
/** ------------------------------------------------------------------------------------------------------------- *
* @brief identifyDirectUpdatesToStateObjects
** ------------------------------------------------------------------------------------------------------------- */
void PipelineAnalysis::identifyDirectUpdatesToStateObjects() {
// We can only safely use the processed item count if it's the last use of it
// and that consumer only uses it once.
SmallVector<unsigned, 64> lastConsumer(LastStreamSet - FirstStreamSet + 1U);
for (auto streamSet = FirstStreamSet; streamSet <= LastStreamSet; ++streamSet) {
bool multipleUsages = false;
auto lastKernel = PipelineInput;
for (const auto e : make_iterator_range(out_edges(streamSet, mBufferGraph))) {
const auto consumer = target(e, mBufferGraph);
if (consumer > lastKernel) {
lastKernel = consumer;
multipleUsages = false;
} else if (LLVM_UNLIKELY(consumer == lastKernel)) {
multipleUsages = true;
}
}
lastConsumer[streamSet - FirstStreamSet] = multipleUsages ? -1U : lastKernel;
}
for (auto kernel = FirstKernel; kernel <= LastKernel; ++kernel) {
const Kernel * const kernelObj = getKernel(kernel);
const auto isInternallySynchronized = kernelObj->hasAttribute(AttrId::InternallySynchronized);
const auto canTerminateEarly = kernelObj->canSetTerminateSignal();
const auto passOutputByAddress = isInternallySynchronized || canTerminateEarly;
for (const auto e : make_iterator_range(in_edges(kernel, mBufferGraph))) {
const auto streamSet = source(e, mBufferGraph);
assert (streamSet >= FirstStreamSet && streamSet <= LastStreamSet);
const BufferNode & bn = mBufferGraph[streamSet];
BufferPort & rt = mBufferGraph[e];
// All uses of an external item count refer to the same processed field.
bool safeToUpdate = true;
if (LLVM_UNLIKELY(bn.isExternal())) {
const auto lastConsumedId = lastConsumer[streamSet - FirstStreamSet];
safeToUpdate = (lastConsumedId == kernel);
}
const auto takeInputAddress = isInternallySynchronized || rt.IsDeferred;
const auto nonCountable = !rt.Countable;
rt.Addressable = (takeInputAddress || nonCountable);
rt.DirectlyUpdatesInternalState = safeToUpdate && (nonCountable || isInternallySynchronized);
rt.StoreItemCount = safeToUpdate && (rt.IsDeferred || !rt.DirectlyUpdatesInternalState);
}
for (const auto e : make_iterator_range(out_edges(kernel, mBufferGraph))) {
BufferPort & rt = mBufferGraph[e];
const auto streamSet = target(e, mBufferGraph);
assert (streamSet >= FirstStreamSet && streamSet <= LastStreamSet);
const auto takeOutputAddress = passOutputByAddress || rt.IsDeferred;
const auto nonCountable = !rt.Countable;
rt.Addressable = takeOutputAddress || nonCountable;
rt.StoreItemCount = true;
// If this kernel can terminate early, we need to store the item count
// that it may end up returning in the case of an unexpected termination.
rt.DirectlyUpdatesInternalState =
(nonCountable && !canTerminateEarly) || isInternallySynchronized;
}
}
}
/** ------------------------------------------------------------------------------------------------------------- *
* @brief identifyNonLocalBuffers
......
......@@ -19,7 +19,6 @@ void PipelineAnalysis::makeConsumerGraph() {
flat_set<unsigned> observedGlobalPortIds;
for (auto streamSet = FirstStreamSet; streamSet <= LastStreamSet; ++streamSet) {
// copy the producing edge
const auto pe = in_edge(streamSet, mBufferGraph);
const BufferPort & br = mBufferGraph[pe];
......@@ -32,6 +31,7 @@ void PipelineAnalysis::makeConsumerGraph() {
continue;
}
auto lastConsumer = PipelineInput;
auto index = 0U;
// flag the production rate as ignorable by inserting it upfront
......@@ -39,18 +39,19 @@ void PipelineAnalysis::makeConsumerGraph() {
for (const auto ce : make_iterator_range(out_edges(streamSet, mBufferGraph))) {
const BufferPort & br = mBufferGraph[ce];
const auto consumer = target(ce, mBufferGraph);
if (LLVM_UNLIKELY(consumer == PipelineOutput && producer != PipelineInput)) {
continue;
}
// check if any consumer has a rate we have not yet observed
lastConsumer = std::max<unsigned>(lastConsumer, consumer);
#ifndef TEST_ALL_CONSUMERS
if (observedGlobalPortIds.insert(br.GlobalPortId).second) {
#endif
auto testConsumer = [&]() {
#ifndef TEST_ALL_CONSUMERS
return observedGlobalPortIds.insert(br.GlobalPortId).second;
#else
return true;
#endif
};
if (testConsumer()) {
lastConsumer = std::max<unsigned>(lastConsumer, consumer);
add_edge(streamSet, consumer, ConsumerEdge{br.Port, ++index, ConsumerEdge::UpdatePhi}, mConsumerGraph);
#ifndef TEST_ALL_CONSUMERS
}
#endif
}
observedGlobalPortIds.clear();
......@@ -76,11 +77,27 @@ void PipelineAnalysis::makeConsumerGraph() {
}
}
#ifdef PRINT_CONSUMER_GRAPH
// If this is a pipeline input, we want to update the count at the end of the loop.
for (const auto e : make_iterator_range(out_edges(PipelineInput, mBufferGraph))) {
const auto streamSet = target(e, mBufferGraph);
ConsumerGraph::edge_descriptor f;
bool exists;
std::tie(f, exists) = edge(streamSet, PipelineOutput, mConsumerGraph);
const auto flags = ConsumerEdge::UpdateExternalCount;
if (exists) {
ConsumerEdge & cn = mConsumerGraph[f];
cn.Flags |= flags;
} else {
const BufferPort & br = mBufferGraph[e];
add_edge(streamSet, PipelineOutput, ConsumerEdge{br.Port, 0, flags}, mConsumerGraph);
}
}
#if 0
auto & out = errs();
out << "digraph \"ConsumerGraph_" << mPipelineKernel->getName() << "\" {\n";
out << "digraph \"ConsumerGraph\" {\n";
for (auto v : make_iterator_range(vertices(mConsumerGraph))) {
out << "v" << v << " [label=\"" << v << "\"];\n";
}
......
......@@ -2,7 +2,6 @@
#define PARTITIONING_ANALYSIS_HPP
#include "pipeline_analysis.hpp"
#include <toolchain/toolchain.h>
#include <util/slab_allocator.h>
namespace kernel {
......@@ -902,12 +901,7 @@ found: ++i;
// shares the same kernels as the first partition of another and we can schedule one after the other,
// this may improve I-Cache utilization.
#if Z3_VERSION_INTEGER >= LLVM_VERSION_CODE(4, 8, 0)
if (Z3_optimize_check(ctx, solver, 0, nullptr) != Z3_L_TRUE)
#else
if (Z3_optimize_check(ctx, solver) != Z3_L_TRUE)
#endif
{
if (Z3_optimize_check(ctx, solver) != Z3_L_TRUE) {
report_fatal_error("Z3 failed to find a partition ordering solution");
}
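The deleted preprocessor branch handled the Z3 C API change that added an assumptions parameter to Z3_optimize_check in Z3 4.8; with it gone, the call is pinned to the two-argument form. A sketch of the compatibility shim the #if expressed, assuming the Z3_VERSION_INTEGER and LLVM_VERSION_CODE macros used by the removed lines are in scope:

#include <z3.h>

static Z3_lbool optimizeCheckCompat(Z3_context ctx, Z3_optimize solver) {
#if Z3_VERSION_INTEGER >= LLVM_VERSION_CODE(4, 8, 0)
    // Z3 >= 4.8: Z3_optimize_check(ctx, opt, num_assumptions, assumptions)
    return Z3_optimize_check(ctx, solver, 0, nullptr);
#else
    // older Z3: Z3_optimize_check(ctx, opt)
    return Z3_optimize_check(ctx, solver);
#endif
}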
......@@ -1188,12 +1182,8 @@ found: ++i;
}
END_SCOPED_REGION
#if Z3_VERSION_INTEGER >= LLVM_VERSION_CODE(4, 8, 0)
if (Z3_optimize_check(ctx, solver, 0, nullptr) == Z3_L_FALSE)
#else
if (Z3_optimize_check(ctx, solver) == Z3_L_FALSE)
#endif
{
if (Z3_optimize_check(ctx, solver) == Z3_L_FALSE) {
report_fatal_error("Z3 failed to find a kernel ordering solution");
}
......@@ -1345,7 +1335,7 @@ void PipelineAnalysis::determinePartitionJumpIndices() {
for (auto u = PartitionCount; u--; ) { // forward topological ordering
assert (out_degree(u, G) > 0);
M.set();
M.set(0, PartitionCount, true);
assert (M.count() == PartitionCount);
for (const auto e : make_iterator_range(out_edges(u, G))) {
const auto v = target(e, G);
......
......@@ -53,7 +53,6 @@ public:
P.makeTerminationPropagationGraph();
// Finish the buffer graph
P.identifyDirectUpdatesToStateObjects();
P.addStreamSetsToBufferGraph(b);
P.gatherInfo();
......@@ -128,7 +127,6 @@ private:
void identifyLinearBuffers();
void identifyNonLocalBuffers();
void identifyLocalPortIds();
void identifyDirectUpdatesToStateObjects();
// consumer analysis functions
......@@ -218,7 +216,7 @@ public:
OwningVector<Kernel> mInternalKernels;
OwningVector<Binding> mInternalBindings;
OwningVector<StreamSetBuffer> mInternalBuffers;
OwningVec<StreamSetBuffer> mInternalBuffers;
};
}
......
......@@ -100,14 +100,7 @@ void PipelineCompiler::allocateOwnedBuffers(BuilderRef b, Value * const expected
b->CreateCall(func, params);
}
}
// and allocate any output buffers
#ifdef PRINT_DEBUG_MESSAGES
Constant * const pipelineName = b->GetString(mTarget->getName());
SmallVector<char, 256> tmp;
raw_svector_ostream out(tmp);
out << i << "." << getKernel(i)->getName();
Constant * const kernelName = b->GetString(out.str());
#endif
// and allocate any output buffers
for (const auto e : make_iterator_range(out_edges(i, mBufferGraph))) {
const auto streamSet = target(e, mBufferGraph);
const BufferNode & bn = mBufferGraph[streamSet];
......@@ -124,12 +117,7 @@ void PipelineCompiler::allocateOwnedBuffers(BuilderRef b, Value * const expected
assert ("a threadlocal buffer cannot be external" && (bn.isInternal() || nonLocal));
assert (buffer->getHandle());
assert (isFromCurrentFunction(b, buffer->getHandle(), false));
buffer->allocateBuffer(b, expectedNumOfStrides);
#ifdef PRINT_DEBUG_MESSAGES
const BufferPort & rd = mBufferGraph[e];
const Binding & binding = rd.Binding;
debugPrint(b, "%s:%s.%s capacity = %" PRId64, pipelineName, kernelName, b->GetString(binding.getName()), buffer->getCapacity(b));
#endif
buffer->allocateBuffer(b, expectedNumOfStrides);
}
}
......@@ -230,85 +218,35 @@ void PipelineCompiler::readProcessedItemCounts(BuilderRef b) {
for (const auto e : make_iterator_range(in_edges(mKernelId, mBufferGraph))) {
const BufferPort & br = mBufferGraph[e];
const auto inputPort = br.Port;
const auto streamSet = source(e, mBufferGraph);
const BufferNode & node = mBufferGraph[streamSet];
#ifndef STORE_EXTERNAL_PROCESSED_ITEM_COUNTS
if (LLVM_UNLIKELY(node.isExternal())) {
bool found = true;
for (const auto f : make_iterator_range(in_edges(streamSet, mBufferGraph))) {
if (source(f, mBufferGraph) == PipelineInput) {
const BufferPort & external = mBufferGraph[f];
Value * const processed = getProcessedInputItemsPtr(external.Port.Number);
mProcessedItemCountPtr[inputPort] = processed;
// mProcessedItemCountPtr[inputPort] = mExternallyProcessedItemPtr[streamSet];
// assert (mInitiallyProcessedExternalItems[streamSet]);
// mInitiallyProcessedItemCount[inputPort] = mInitiallyProcessedExternalItems[streamSet];
mInitiallyProcessedItemCount[inputPort] = b->CreateLoad(processed);
break;
}
}
assert("cannot locate external processed item count?" && found);
} else { // internal item count
#endif
const auto prefix = makeBufferName(mKernelId, inputPort);
Value * const processed = b->getScalarFieldPtr(prefix + ITEM_COUNT_SUFFIX);
mProcessedItemCountPtr[inputPort] = processed;
mInitiallyProcessedItemCount[inputPort] = b->CreateLoad(processed);
if (br.IsDeferred) {
Value * const deferred = b->getScalarFieldPtr(prefix + DEFERRED_ITEM_COUNT_SUFFIX);
mProcessedDeferredItemCountPtr[inputPort] = deferred;
mInitiallyProcessedDeferredItemCount[inputPort] = b->CreateLoad(deferred);
}
#ifndef STORE_EXTERNAL_PROCESSED_ITEM_COUNTS
const auto prefix = makeBufferName(mKernelId, inputPort);
Value * const processed = b->getScalarFieldPtr(prefix + ITEM_COUNT_SUFFIX);
mProcessedItemCountPtr[inputPort] = processed;
mInitiallyProcessedItemCount[inputPort] = b->CreateLoad(processed);
if (br.IsDeferred) {
Value * const deferred = b->getScalarFieldPtr(prefix + DEFERRED_ITEM_COUNT_SUFFIX);
mProcessedDeferredItemCountPtr[inputPort] = deferred;
mInitiallyProcessedDeferredItemCount[inputPort] = b->CreateLoad(deferred);
}
#endif
}
}
/** ------------------------------------------------------------------------------------------------------------- *
* @brief writeExternalProcessedItemCounts
** ------------------------------------------------------------------------------------------------------------- */
void PipelineCompiler::writeExternalProcessedItemCounts(BuilderRef b) {
// for (const auto e : make_iterator_range(out_edges(PipelineInput, mBufferGraph))) {
// const auto streamSet = target(e, mBufferGraph);
// Value * const ptr = b->CreateAllocaAtEntryPoint(b->getSizeTy());
// b->CreateStore(b->getSize(0), ptr);
// mExternallyProcessedItemPtr[streamSet] = ptr;
// }
}
/** ------------------------------------------------------------------------------------------------------------- *
* @brief readProducedItemCounts
** ------------------------------------------------------------------------------------------------------------- */
void PipelineCompiler::readProducedItemCounts(BuilderRef b) {
for (const auto e : make_iterator_range(out_edges(mKernelId, mBufferGraph))) {
const BufferPort & br = mBufferGraph[e];
const auto outputPort = br.Port;
const auto prefix = makeBufferName(mKernelId, outputPort);
const auto streamSet = target(e, mBufferGraph);
const BufferPort & output = mBufferGraph[e];
const auto outputPort = output.Port;
const BufferNode & node = mBufferGraph[streamSet];
if (LLVM_UNLIKELY(node.isExternal())) {
bool found = true;
for (const auto f : make_iterator_range(out_edges(streamSet, mBufferGraph))) {
if (target(f, mBufferGraph) == PipelineOutput) {
const BufferPort & external = mBufferGraph[f];
Value * const produced = getProducedOutputItemsPtr(external.Port.Number);
mProducedItemCountPtr[outputPort] = produced;
mInitiallyProducedItemCount[streamSet] = b->CreateLoad(produced);
break;
}
}
assert("cannot locate external produced item count?" && found);
} else { // internal item count
const auto prefix = makeBufferName(mKernelId, outputPort);
Value * const produced = b->getScalarFieldPtr(prefix + ITEM_COUNT_SUFFIX);
mProducedItemCountPtr[outputPort] = produced;
mInitiallyProducedItemCount[streamSet] = b->CreateLoad(produced);
if (output.IsDeferred) {
Value * const deferred = b->getScalarField(prefix + DEFERRED_ITEM_COUNT_SUFFIX);
mProducedDeferredItemCountPtr[outputPort] = deferred;
mInitiallyProducedDeferredItemCount[streamSet] = b->CreateLoad(deferred);
}
Value * const produced = b->getScalarFieldPtr(prefix + ITEM_COUNT_SUFFIX);
mProducedItemCountPtr[outputPort] = produced;
mInitiallyProducedItemCount[streamSet] = b->CreateLoad(produced);
if (br.IsDeferred) {
Value * const deferred = b->getScalarField(prefix + DEFERRED_ITEM_COUNT_SUFFIX);
mProducedDeferredItemCountPtr[outputPort] = deferred;
mInitiallyProducedDeferredItemCount[streamSet] = b->CreateLoad(deferred);
}
}
}
......@@ -352,59 +290,44 @@ void PipelineCompiler::setLocallyAvailableItemCount(BuilderRef /* b */, const St
mLocallyAvailableItems[streamSet] = available;
}
/** ------------------------------------------------------------------------------------------------------------- *
* @brief writeUpdatedItemCounts
** ------------------------------------------------------------------------------------------------------------- */
void PipelineCompiler::writeUpdatedItemCounts(BuilderRef b) {
if (mKernelIsInternallySynchronized) {
return;
}
for (const auto e : make_iterator_range(in_edges(mKernelId, mBufferGraph))) {
const BufferPort & br = mBufferGraph[e];
if (br.StoreItemCount) {
const StreamSetPort inputPort = br.Port;
#ifdef PRINT_DEBUG_MESSAGES
const auto prefix = b->GetString(makeBufferName(mKernelId, inputPort));
#endif
if (br.IsDeferred) {
// If this kernel has a deferred rate and we directly pass the state object's item count field
// to the kernel, that kernel will update the deferred count but still leave the undeferred
// count untouched.
if (!br.DirectlyUpdatesInternalState) {
b->CreateStore(mUpdatedProcessedDeferredPhi[inputPort], mProcessedDeferredItemCountPtr[inputPort]);
#ifdef PRINT_DEBUG_MESSAGES
debugPrint(b, " @ writing %s_processed(deferred) = %" PRIu64, prefix, mUpdatedProcessedDeferredPhi[inputPort]);
#endif
}
} else if (br.DirectlyUpdatesInternalState) {
continue;
}
b->CreateStore(mUpdatedProcessedPhi[inputPort], mProcessedItemCountPtr[inputPort]);
const StreamSetPort inputPort = br.Port;
b->CreateStore(mUpdatedProcessedPhi[inputPort], mProcessedItemCountPtr[inputPort]);
#ifdef PRINT_DEBUG_MESSAGES
const auto prefix = makeBufferName(mKernelId, inputPort);
debugPrint(b, " @ writing " + prefix + "_processed = %" PRIu64, mUpdatedProcessedPhi[inputPort]);
#endif
if (br.IsDeferred) {
b->CreateStore(mUpdatedProcessedDeferredPhi[inputPort], mProcessedDeferredItemCountPtr[inputPort]);
#ifdef PRINT_DEBUG_MESSAGES
debugPrint(b, " @ writing %s_processed = %" PRIu64, prefix, mUpdatedProcessedPhi[inputPort]);
debugPrint(b, " @ writing " + prefix + "_processed(deferred) = %" PRIu64, mUpdatedProcessedDeferredPhi[inputPort]);
#endif
}
}
for (const auto e : make_iterator_range(out_edges(mKernelId, mBufferGraph))) {
const BufferPort & br = mBufferGraph[e];
if (br.StoreItemCount) {
const StreamSetPort outputPort = br.Port;
#ifdef PRINT_DEBUG_MESSAGES
const auto prefix = b->GetString(makeBufferName(mKernelId, outputPort));
#endif
if (br.IsDeferred) {
if (!br.DirectlyUpdatesInternalState) {
b->CreateStore(mUpdatedProducedDeferredPhi[outputPort], mProducedDeferredItemCountPtr[outputPort]);
#ifdef PRINT_DEBUG_MESSAGES
debugPrint(b, " @ writing %s_produced(deferred) = %" PRIu64, prefix, mUpdatedProducedDeferredPhi[outputPort]);
#endif
}
} else if (br.DirectlyUpdatesInternalState) {
continue;
}
b->CreateStore(mUpdatedProducedPhi[outputPort], mProducedItemCountPtr[outputPort]);
const StreamSetPort outputPort = br.Port;
b->CreateStore(mUpdatedProducedPhi[outputPort], mProducedItemCountPtr[outputPort]);
#ifdef PRINT_DEBUG_MESSAGES
const auto prefix = makeBufferName(mKernelId, outputPort);
debugPrint(b, " @ writing " + prefix + "_produced = %" PRIu64, mUpdatedProducedPhi[outputPort]);
#endif
if (br.IsDeferred) {
b->CreateStore(mUpdatedProducedDeferredPhi[outputPort], mProducedDeferredItemCountPtr[outputPort]);
#ifdef PRINT_DEBUG_MESSAGES
debugPrint(b, " @ writing %s_produced = %" PRIu64, prefix, mUpdatedProducedPhi[outputPort]);
debugPrint(b, " @ writing " + prefix + "_produced(deferred) = %" PRIu64, mUpdatedProducedDeferredPhi[outputPort]);
#endif
}
}
......
......@@ -5,10 +5,6 @@
namespace kernel {
#ifndef MSC_VER
typedef long long int __int64;
#endif
template <typename T, unsigned n = 16>
using Vec = SmallVector<T, n>;
......@@ -163,6 +159,10 @@ private:
FixedVector<T> mArray;
};
template <typename T>
using OwningVec = std::vector<std::unique_ptr<T>>;
#ifndef NDEBUG
static bool isFromCurrentFunction(BuilderRef b, const Value * const value, const bool allowNull = true) {
if (value == nullptr) {
......
......@@ -260,10 +260,6 @@ struct BufferNode {
return (Type & BufferType::Shared) != 0;
}
bool isDynamic() const {
assert (Buffer);
return isa<DynamicBuffer>(Buffer);
}
};
......@@ -291,14 +287,8 @@ struct BufferPort {
bool IsShared = false;
bool IsManaged = false;
bool Countable = false;
bool Addressable = false;
bool DirectlyUpdatesInternalState = false;
bool StoreItemCount = false;
int TransitiveAdd = 0;
bool operator < (const BufferPort & rn) const {
if (LLVM_LIKELY(Port.Type == rn.Port.Type)) {
return Port.Number < rn.Port.Number;
......
#ifndef PIPELINE_KERNEL_COMPILER_CONFIG_H
#define PIPELINE_KERNEL_COMPILER_CONFIG_H
//#define PRINT_DEBUG_MESSAGES
// #define PRINT_DEBUG_MESSAGES
// #define DISABLE_ZERO_EXTEND
......@@ -13,18 +13,10 @@
// #define FORCE_PIPELINE_ASSERTIONS
// #define DISABLE_PIPELINE_ASSERTIONS
// #define FORCE_SYNCHRONIZATION_FOR_ALL_KERNELS
// #define FORCE_EACH_KERNEL_INTO_UNIQUE_PARTITION
// #define TEST_ALL_CONSUMERS
// #define STORE_EXTERNAL_PROCESSED_ITEM_COUNTS
//#define PRINT_BUFFER_GRAPH
//#define PRINT_CONSUMER_GRAPH
// #define PRINT_BUFFER_GRAPH
#endif // PIPELINE_KERNEL_COMPILER_CONFIG_H
......@@ -9,59 +9,38 @@ namespace kernel {
* @brief addConsumerKernelProperties
** ------------------------------------------------------------------------------------------------------------- */
inline void PipelineCompiler::addConsumerKernelProperties(BuilderRef b, const unsigned producer) {
//if (producer != PipelineInput || mTraceIndividualConsumedItemCounts) {
if (producer != PipelineInput || mTraceIndividualConsumedItemCounts) {
IntegerType * const sizeTy = b->getSizeTy();
for (const auto e : make_iterator_range(out_edges(producer, mBufferGraph))) {
const auto streamSet = target(e, mBufferGraph);
// If we have a buffer with only external consumers, we do not need to maintain the
// state for it.
bool atLeastOneInternalConsumer = false;
for (const auto e : make_iterator_range(out_edges(streamSet, mBufferGraph))) {
const auto consumer = target(e, mBufferGraph);
if (consumer != PipelineOutput) {
atLeastOneInternalConsumer = true;
break;
const BufferNode & bn = mBufferGraph[streamSet];
// If the out-degree for this buffer is zero, then we've proven that its consumption rate
// is identical to its production rate.
const auto numOfIndependentConsumers = out_degree(streamSet, mConsumerGraph);
if (LLVM_UNLIKELY(numOfIndependentConsumers != 0)) {
const BufferPort & rd = mBufferGraph[e];
assert (rd.Port.Type == PortType::Output);
const auto prefix = makeBufferName(producer, rd.Port);
const auto name = prefix + CONSUMED_ITEM_COUNT_SUFFIX;
// If we're tracing the consumer item counts, we need to store one for each
// (non-nested) consumer. Any nested consumers will have their own trace.
Type * countTy = sizeTy;
if (LLVM_UNLIKELY(mTraceIndividualConsumedItemCounts)) {
countTy = ArrayType::get(sizeTy, numOfIndependentConsumers + 1);
}
}
if (LLVM_LIKELY(atLeastOneInternalConsumer)) {
// If the out-degree for this buffer is zero, then we've proven that its consumption rate
// is identical to its production rate.
const auto numOfIndependentConsumers = out_degree(streamSet, mConsumerGraph);
assert (numOfIndependentConsumers <= out_degree(streamSet, mBufferGraph));
const BufferNode & bn = mBufferGraph[streamSet];
if (LLVM_UNLIKELY(numOfIndependentConsumers != 0 || bn.isExternal())) {
if (LLVM_LIKELY(bn.isOwned() || bn.isInternal() || mTraceIndividualConsumedItemCounts)) {
// If we're tracing the consumer item counts, we need to store one for each
// (non-nested) consumer. Any nested consumers will have their own trace.
Type * countTy = sizeTy;
if (LLVM_UNLIKELY(mTraceIndividualConsumedItemCounts)) {
countTy = ArrayType::get(sizeTy, numOfIndependentConsumers + 1);
}
const BufferPort & rd = mBufferGraph[e];
assert (rd.Port.Type == PortType::Output);
const auto prefix = makeBufferName(producer, rd.Port);
if (numOfIndependentConsumers > 0 && atLeastOneInternalConsumer) {
mTarget->addInternalScalar(countTy, prefix + CONSUMED_ITEM_COUNT_SUFFIX, producer);
} else {
mTarget->addNonPersistentScalar(countTy, prefix + CONSUMED_ITEM_COUNT_SUFFIX);
}
}
if (LLVM_LIKELY(bn.isOwned() || bn.isInternal() || mTraceIndividualConsumedItemCounts)) {
mTarget->addInternalScalar(countTy, name, producer);
} else {
mTarget->addNonPersistentScalar(countTy, name);
}
}
}
//}
}
}
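The restructured block keeps the original sizing rule: when per-consumer tracing is enabled, the consumed-item-count scalar becomes an array with one extra slot (slot 0 is the one read back as the consumed count elsewhere in this file). A plain C++ stand-in for that decision, with illustrative names:

#include <cstddef>
#include <cstdint>
#include <vector>

// One summary slot, plus one slot per independent (non-nested) consumer when
// individual consumed item counts are being traced.
std::vector<uint64_t> makeConsumedCountSlots(unsigned numOfIndependentConsumers,
                                             bool traceIndividualCounts) {
    const size_t slots = traceIndividualCounts ? (numOfIndependentConsumers + 1u) : 1u;
    return std::vector<uint64_t>(slots, 0);
}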
/** ------------------------------------------------------------------------------------------------------------- *
......@@ -86,23 +65,19 @@ void PipelineCompiler::readConsumedItemCounts(BuilderRef b) {
* @brief readExternalConsumerItemCounts
** ------------------------------------------------------------------------------------------------------------- */
inline void PipelineCompiler::readExternalConsumerItemCounts(BuilderRef b) {
// for (const auto e : make_iterator_range(in_edges(PipelineOutput, mBufferGraph))) {
// const auto streamSet = source(e, mBufferGraph);
// const BufferNode & bn = mBufferGraph[streamSet];
// if (LLVM_LIKELY(bn.isOwned() || bn.isShared())) {
// const BufferPort & externalPort = mBufferGraph[e];
// Value * const consumed = getConsumedOutputItems(externalPort.Port.Number); assert (consumed);
// mInitialConsumedItemCount[streamSet] = consumed;
// const auto numOfIndependentConsumers = out_degree(streamSet, mConsumerGraph);
// const auto producer = parent(streamSet, mBufferGraph);
// if (LLVM_UNLIKELY((numOfIndependentConsumers != 0) || (producer == PipelineInput))) {
// setConsumedItemCount(b, streamSet, consumed, 0);
// }
// }
// }
for (const auto e : make_iterator_range(in_edges(PipelineOutput, mBufferGraph))) {
const auto streamSet = source(e, mBufferGraph);
const BufferNode & bn = mBufferGraph[streamSet];
if (LLVM_LIKELY(bn.isOwned())) {
const BufferPort & externalPort = mBufferGraph[e];
Value * const consumed = getConsumedOutputItems(externalPort.Port.Number); assert (consumed);
const auto numOfIndependentConsumers = out_degree(streamSet, mConsumerGraph);
const auto producer = parent(streamSet, mBufferGraph);
if (LLVM_UNLIKELY((numOfIndependentConsumers != 0) || (producer == PipelineInput))) {
setConsumedItemCount(b, streamSet, consumed, 0);
}
}
}
}
/** ------------------------------------------------------------------------------------------------------------- *
......@@ -110,88 +85,58 @@ inline void PipelineCompiler::readExternalConsumerItemCounts(BuilderRef b) {
** ------------------------------------------------------------------------------------------------------------- */
Value * PipelineCompiler::readConsumedItemCount(BuilderRef b, const size_t streamSet, const bool useFinalCount) {
Value * consumed = nullptr;
const BufferNode & bn = mBufferGraph[streamSet];
if (out_degree(streamSet, mConsumerGraph) == 0) {
if (LLVM_LIKELY(bn.isInternal())) {
// This stream either has no consumers or we've proven that
// its consumption rate is identical to its production rate.
if (useFinalCount) {
consumed = mLocallyAvailableItems[streamSet];
// This stream either has no consumers or we've proven that
// its consumption rate is identical to its production rate.
Value * produced = nullptr;
if (useFinalCount) {
produced = mLocallyAvailableItems[streamSet];
} else {
produced = mInitiallyProducedItemCount[streamSet];
}
const auto e = in_edge(streamSet, mBufferGraph);
const BufferPort & port = mBufferGraph[e];
if (LLVM_UNLIKELY(produced == nullptr)) {
const auto producer = source(e, mBufferGraph);
const auto prefix = makeBufferName(producer, port.Port);
if (LLVM_UNLIKELY(port.IsDeferred)) {
produced = b->getScalarField(prefix + DEFERRED_ITEM_COUNT_SUFFIX);
} else {
consumed = mInitiallyProducedItemCount[streamSet];
}
const auto e = in_edge(streamSet, mBufferGraph);
const BufferPort & port = mBufferGraph[e];
if (LLVM_UNLIKELY(consumed == nullptr)) {
const auto producer = source(e, mBufferGraph);
const auto prefix = makeBufferName(producer, port.Port);
if (LLVM_UNLIKELY(port.IsDeferred)) {
consumed = b->getScalarField(prefix + DEFERRED_ITEM_COUNT_SUFFIX);
} else {
consumed = b->getScalarField(prefix + ITEM_COUNT_SUFFIX);
}
produced = b->getScalarField(prefix + ITEM_COUNT_SUFFIX);
}
auto delayOrLookBehind = std::max(port.Delay, port.LookBehind);
for (const auto e : make_iterator_range(out_edges(streamSet, mBufferGraph))) {
const BufferPort & br = mBufferGraph[e];
const auto d = std::max(br.Delay, br.LookBehind);
delayOrLookBehind = std::max(delayOrLookBehind, d);
}
if (delayOrLookBehind) {
consumed = b->CreateSaturatingSub(consumed, b->getSize(delayOrLookBehind));
}
}
} else {
const auto e = in_edge(streamSet, mConsumerGraph);
const ConsumerEdge & c = mConsumerGraph[e];
const auto producer = source(e, mConsumerGraph);
Value * consumedPtr = nullptr;
if (LLVM_LIKELY(producer != PipelineInput || mTraceIndividualConsumedItemCounts)) {
const StreamSetPort port{PortType::Output, c.Port};
const auto prefix = makeBufferName(producer, port);
consumedPtr = b->getScalarFieldPtr(prefix + CONSUMED_ITEM_COUNT_SUFFIX);
if (LLVM_UNLIKELY(mTraceIndividualConsumedItemCounts)) {
Constant * const ZERO = b->getInt32(0);
consumedPtr = b->CreateInBoundsGEP(consumedPtr, { ZERO, ZERO } );
}
} else {
consumedPtr = getProcessedInputItemsPtr(c.Port);
auto delayOrLookBehind = std::max(port.Delay, port.LookBehind);
for (const auto e : make_iterator_range(out_edges(streamSet, mBufferGraph))) {
const BufferPort & br = mBufferGraph[e];
const auto d = std::max(br.Delay, br.LookBehind);
delayOrLookBehind = std::max(delayOrLookBehind, d);
}
consumed = b->CreateLoad(consumedPtr);
if (delayOrLookBehind) {
produced = b->CreateSaturatingSub(produced, b->getSize(delayOrLookBehind));
}
return produced;
}
//if (LLVM_UNLIKELY(bn.isExternal())) {
bool foundAny = false;
for (const auto e : make_iterator_range(out_edges(streamSet, mBufferGraph))) {
if (target(e, mBufferGraph) == PipelineOutput) {
const BufferPort & externalPort = mBufferGraph[e];
Value * const external = getConsumedOutputItems(externalPort.Port.Number); assert (external);
const Binding & binding = externalPort.Binding;
//b->CallPrintInt(binding.getName() + "_externalConsumed", external);
const auto e = in_edge(streamSet, mConsumerGraph);
const ConsumerEdge & c = mConsumerGraph[e];
const auto producer = source(e, mConsumerGraph);
if (LLVM_LIKELY(producer != PipelineInput || mTraceIndividualConsumedItemCounts)) {
consumed = b->CreateUMin(consumed, external);
foundAny = true;
// break;
}
const StreamSetPort port{PortType::Output, c.Port};
const auto prefix = makeBufferName(producer, port);
Value * ptr = b->getScalarFieldPtr(prefix + CONSUMED_ITEM_COUNT_SUFFIX);
if (LLVM_UNLIKELY(mTraceIndividualConsumedItemCounts)) {
Constant * const ZERO = b->getInt32(0);
ptr = b->CreateInBoundsGEP(ptr, { ZERO, ZERO } );
}
return b->CreateLoad(ptr);
assert (foundAny ^ bn.isInternal());
//}
} else {
return b->CreateLoad(getProcessedInputItemsPtr(c.Port));
}
return consumed;
}
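When a stream set has no consumer edges in the consumer graph, the produced count stands in for the consumed count, held back by the largest Delay or LookBehind across its ports; CreateSaturatingSub clamps the result at zero. The equivalent scalar computation, with illustrative names:

#include <algorithm>
#include <cstdint>
#include <vector>

struct PortWindow { uint64_t delay; uint64_t lookBehind; };

uint64_t effectiveConsumed(uint64_t produced, const PortWindow & producerPort,
                           const std::vector<PortWindow> & consumerPorts) {
    uint64_t hold = std::max(producerPort.delay, producerPort.lookBehind);
    for (const auto & p : consumerPorts) {
        hold = std::max(hold, std::max(p.delay, p.lookBehind));
    }
    return produced > hold ? produced - hold : 0;  // saturating subtraction
}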
/** ------------------------------------------------------------------------------------------------------------- *
......@@ -287,7 +232,7 @@ inline void PipelineCompiler::computeMinimumConsumedItemCounts(BuilderRef b) {
#ifdef PRINT_DEBUG_MESSAGES
const auto consPrefix = makeBufferName(mKernelId, port);
debugPrint(b, "* update " + consPrefix + " -> " + prodPrefix + "_consumed' = %" PRIu64, cn.Consumed);
debugPrint(b, consPrefix + " -> " + prodPrefix + "_consumed' = %" PRIu64, cn.Consumed);
#endif
}
}
......@@ -300,15 +245,13 @@ inline void PipelineCompiler::writeConsumedItemCounts(BuilderRef b) {
for (const auto e : make_iterator_range(in_edges(mKernelId, mConsumerGraph))) {
const ConsumerEdge & c = mConsumerGraph[e];
if (c.Flags) {
if (c.Flags & ConsumerEdge::UpdatePhi) {
const auto streamSet = source(e, mConsumerGraph);
const ConsumerNode & cn = mConsumerGraph[streamSet];
if (c.Flags & ConsumerEdge::UpdatePhi) {
if (LLVM_LIKELY(cn.PhiNode != nullptr)) {
cn.PhiNode->addIncoming(cn.Consumed, mKernelLoopExitPhiCatch);
cn.Consumed = cn.PhiNode;
cn.PhiNode = nullptr;
}
if (LLVM_LIKELY(cn.PhiNode != nullptr)) {
cn.PhiNode->addIncoming(cn.Consumed, mKernelLoopExitPhiCatch);
cn.Consumed = cn.PhiNode;
cn.PhiNode = nullptr;
}
// check to see if we've fully finished processing any stream
if (c.Flags & ConsumerEdge::WriteConsumedCount) {
......@@ -321,6 +264,7 @@ inline void PipelineCompiler::writeConsumedItemCounts(BuilderRef b) {
#endif
setConsumedItemCount(b, streamSet, cn.Consumed, 0);
}
}
}
}
......@@ -329,19 +273,19 @@ inline void PipelineCompiler::writeConsumedItemCounts(BuilderRef b) {
* @brief setConsumedItemCount
** ------------------------------------------------------------------------------------------------------------- */
void PipelineCompiler::setConsumedItemCount(BuilderRef b, const size_t streamSet, not_null<Value *> consumed, const unsigned slot) const {
const auto output = in_edge(streamSet, mBufferGraph);
const auto producer = source(output, mBufferGraph);
const BufferPort & outputPort = mBufferGraph[output];
const auto pe = in_edge(streamSet, mBufferGraph);
const auto producer = source(pe, mBufferGraph);
const BufferPort & rd = mBufferGraph[pe];
Value * ptr = nullptr;
if (LLVM_LIKELY(producer != PipelineInput || slot != 0 || mTraceIndividualConsumedItemCounts)) {
const auto prefix = makeBufferName(producer, outputPort.Port);
if (LLVM_LIKELY(producer != PipelineInput || mTraceIndividualConsumedItemCounts)) {
const auto prefix = makeBufferName(producer, rd.Port);
ptr = b->getScalarFieldPtr(prefix + CONSUMED_ITEM_COUNT_SUFFIX);
if (LLVM_UNLIKELY(mTraceIndividualConsumedItemCounts)) {
ptr = b->CreateInBoundsGEP(ptr, { b->getInt32(0), b->getInt32(slot) });
}
if (LLVM_UNLIKELY(CheckAssertions)) {
Value * const prior = b->CreateLoad(ptr);
const Binding & output = outputPort.Binding;
const Binding & output = rd.Binding;
// TODO: cross reference which slot the traced count is for?
Constant * const bindingName = b->GetString(output.getName());
......@@ -355,21 +299,8 @@ void PipelineCompiler::setConsumedItemCount(BuilderRef b, const size_t streamSet
prior, consumed);
const BufferNode & bn = mBufferGraph[streamSet];
Value * const produced = mLocallyAvailableItems[streamSet]; assert (produced);
if (bn.NonLocal) {
Value * const consumedLessThanProduced = b->CreateICmpULE(consumed, produced);
Constant * const none = getTerminationSignal(b, TerminationSignal::None);
Value * const terminated = b->CreateICmpNE(mTerminatedAtLoopExitPhi, none);
Value * const valid = b->CreateOr(consumedLessThanProduced, terminated);
b->CreateAssert(valid,
"%s.%s: consumed item count (%" PRId64 ") exceeds "
"produced item count (%" PRId64 ")",
mCurrentKernelName, bindingName,
consumed, produced);
} else {
if (!bn.NonLocal) {
Value * const produced = mLocallyAvailableItems[streamSet]; assert (produced);
// NOTE: static linear buffers are assumed to be threadlocal.
Value * const fullyConsumed = b->CreateICmpEQ(produced, consumed);
Constant * const fatal = getTerminationSignal(b, TerminationSignal::Fatal);
......@@ -384,8 +315,11 @@ void PipelineCompiler::setConsumedItemCount(BuilderRef b, const size_t streamSet
}
}
b->CreateStore(consumed, ptr);
} else {
ptr = getProcessedInputItemsPtr(rd.Port.Number);
}
b->CreateStore(consumed, ptr);
}
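When assertions are enabled, the store above is guarded by two invariants: a consumer may never report more consumed items than were produced unless the pipeline has already terminated, and a thread-local ("local") linear buffer must be fully drained by segment end unless a fatal termination occurred. A hedged sketch of those checks in plain C++; the enum and parameter names are assumptions made for illustration only:

#include <cstdint>
#include <stdexcept>
#include <string>

enum class TerminationSignal { None, Terminated, Fatal };

// Illustrative stand-in for the CreateAssert guards around the consumed store.
static void check_consumed_invariants(const std::string & binding,
                                      uint64_t consumed, uint64_t produced,
                                      bool bufferIsLocal,
                                      TerminationSignal signal) {
    if (consumed > produced && signal == TerminationSignal::None) {
        throw std::logic_error(binding + ": consumed item count exceeds produced item count");
    }
    // Static linear (thread-local) buffers are assumed to be fully consumed
    // within the segment that filled them.
    if (bufferIsLocal && consumed != produced && signal != TerminationSignal::Fatal) {
        throw std::logic_error(binding + ": thread-local buffer was not fully consumed");
    }
}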
/** ------------------------------------------------------------------------------------------------------------- *
......@@ -395,17 +329,23 @@ inline void PipelineCompiler::initializePipelineInputConsumedPhiNodes(BuilderRef
for (const auto e : make_iterator_range(out_edges(PipelineInput, mBufferGraph))) {
const auto streamSet = target(e, mBufferGraph);
const BufferPort & br = mBufferGraph[e];
const auto portNum = br.Port.Number;
Value * const avail = getAvailableInputItems(portNum);
mInitialConsumedItemCount[streamSet] = avail;
// If we have an unused external input, set the value immediately.
if (out_degree(streamSet, mBufferGraph) == 0) {
Value * const externalPtr = getProcessedInputItemsPtr(portNum);
b->CreateStore(avail, externalPtr);
}
mInitialConsumedItemCount[streamSet] = getAvailableInputItems(br.Port.Number);
}
}

/** ------------------------------------------------------------------------------------------------------------- *
* @brief writeExternalConsumedItemCounts
** ------------------------------------------------------------------------------------------------------------- */
inline void PipelineCompiler::writeExternalConsumedItemCounts(BuilderRef b) {
// for (const auto e : make_iterator_range(out_edges(PipelineInput, mBufferGraph))) {
// const auto streamSet = target(e, mBufferGraph);
// const BufferPort & rd = mBufferGraph[e];
// Value * const ptr = getProcessedInputItemsPtr(rd.Port.Number);
// Value * const consumed = mInitialConsumedItemCount[streamSet]; assert (consumed);
// b->CreateStore(consumed, ptr);
// }
}
}
#endif // CONSUMER_LOGIC_HPP
......@@ -12,6 +12,75 @@
namespace kernel {
/** ------------------------------------------------------------------------------------------------------------- *
* @brief readPipelineIOItemCounts
** ------------------------------------------------------------------------------------------------------------- */
void PipelineCompiler::readPipelineIOItemCounts(BuilderRef b) {
// TODO: this needs to be considered more: if we have multiple consumers of a pipeline input and
// they process the input data at differing rates, how do we ensure that we always resume processing
// at the correct position? We can store the actual item counts / delta of the consumed count
// internally but this would be problematic for optimization branches as we may have processed data
// using the alternate path and any internally stored counts/deltas are irrelevant.
// Would a simple "reset" be enough?
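To make the concern in this comment concrete: only a single consumed count per pipeline input is visible externally, so when two consumers sit at different positions, the per-consumer offsets beyond that minimum are internal state that must somehow be reconstructed on resume. A toy illustration with purely hypothetical numbers and names:

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
    // Two consumers of the same pipeline input have processed different amounts.
    const uint64_t processedByA = 4096;
    const uint64_t processedByB = 3968;
    // Only the minimum is reported back to the caller as "consumed"...
    const uint64_t consumed = std::min(processedByA, processedByB);
    // ...so the deltas each consumer would need on resume live only inside
    // the pipeline, which is what makes optimization branches problematic.
    std::cout << "consumed=" << consumed
              << " deltaA=" << (processedByA - consumed)
              << " deltaB=" << (processedByB - consumed) << '\n';
    return 0;
}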
mKernelId = PipelineInput;
ConstantInt * const ZERO = b->getSize(0);
for (auto streamSet = FirstStreamSet; streamSet <= LastStreamSet; ++streamSet) {
mLocallyAvailableItems[streamSet] = ZERO;
}
// NOTE: all outputs of PipelineInput node are inputs to the PipelineKernel
for (const auto e : make_iterator_range(out_edges(PipelineInput, mBufferGraph))) {
const StreamSetPort inputPort = mBufferGraph[e].Port;
assert (inputPort.Type == PortType::Output);
Value * const available = getAvailableInputItems(inputPort.Number);
setLocallyAvailableItemCount(b, inputPort, available);
initializeConsumedItemCount(b, inputPort, available);
}
for (const auto e : make_iterator_range(out_edges(PipelineInput, mBufferGraph))) {
const auto buffer = target(e, mBufferGraph);
const StreamSetPort inputPort = mBufferGraph[e].Port;
assert (inputPort.Type == PortType::Output);
Value * const inPtr = getProcessedInputItemsPtr(inputPort.Number);
Value * const processed = b->CreateLoad(inPtr);
for (const auto e : make_iterator_range(out_edges(buffer, mBufferGraph))) {
const BufferPort & rd = mBufferGraph[e];
const auto kernelIndex = target(e, mBufferGraph);
const auto prefix = makeBufferName(kernelIndex, rd.Port);
Value * const ptr = b->getScalarFieldPtr(prefix + ITEM_COUNT_SUFFIX);
b->CreateStore(processed, ptr);
}
}
mKernelId = PipelineOutput;
// NOTE: all inputs of PipelineOutput node are outputs of the PipelineKernel
for (const auto e : make_iterator_range(in_edges(PipelineOutput, mBufferGraph))) {
const auto buffer = source(e, mBufferGraph);
const StreamSetPort outputPort = mBufferGraph[e].Port;
assert (outputPort.Type == PortType::Input);
Value * outPtr = getProducedOutputItemsPtr(outputPort.Number);
Value * const produced = b->CreateLoad(outPtr);
for (const auto e : make_iterator_range(in_edges(buffer, mBufferGraph))) {
const BufferPort & rd = mBufferGraph[e];
const auto kernelId = source(e, mBufferGraph);
const auto prefix = makeBufferName(kernelId, rd.Port);
Value * const ptr = b->getScalarFieldPtr(prefix + ITEM_COUNT_SUFFIX);
b->CreateStore(produced, ptr);
}
}
}
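Conceptually, the routine above fans the externally supplied processed/produced counts out to every kernel-local item-count field that refers to the same stream set: each consumer of a pipeline input resumes from the caller's processed count, and each producer of a pipeline output resumes from the caller's produced count. A simplified model of the input side, using a map as a stand-in for the pipeline's scalar state (field names are placeholders):

#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Stand-in for the pipeline state: one item count per (kernel, port) field name.
using ItemCountState = std::map<std::string, uint64_t>;

// Copy the caller-provided processed count of one external input into the
// item-count field of every kernel that reads that stream set. Outputs are
// handled symmetrically with the caller's produced counts.
static void read_pipeline_input_counts(ItemCountState & state,
                                       uint64_t externallyProcessed,
                                       const std::vector<std::string> & consumerFieldNames) {
    for (const std::string & field : consumerFieldNames) {
        state[field] = externallyProcessed;
    }
}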
/** ------------------------------------------------------------------------------------------------------------- *
* @brief determineMaximumNumberOfStrides
** ------------------------------------------------------------------------------------------------------------- */
......@@ -54,7 +123,7 @@ void PipelineCompiler::determineNumOfLinearStrides(BuilderRef b) {
Value * numOfLinearStrides = nullptr;
if (mCurrentNumOfStridesAtLoopEntryPhi && mMaximumNumOfStrides) {
if (mMayLoopToEntry && !ExternallySynchronized) {
numOfLinearStrides = b->CreateSub(mMaximumNumOfStrides, mCurrentNumOfStridesAtLoopEntryPhi);
} else {
numOfLinearStrides = mMaximumNumOfStrides;
......@@ -108,11 +177,11 @@ void PipelineCompiler::determineNumOfLinearStrides(BuilderRef b) {
const auto check = (bn.NonLocal || bn.NonLinear) && unchecked(br.LocalPortId);
if (LLVM_LIKELY(check)) {
Value * const strides = getNumOfAccessibleStrides(b, br, numOfInputStrides, false);
Value * const strides = getNumOfAccessibleStrides(b, br, numOfInputStrides);
numOfInputStrides = b->CreateUMin(numOfInputStrides, strides);
}
if (LLVM_UNLIKELY(CheckAssertions)) {
Value * const strides = getNumOfAccessibleStrides(b, br, numOfActualInputStrides, true);
Value * const strides = getNumOfAccessibleStrides(b, br, numOfActualInputStrides);
numOfActualInputStrides = b->CreateUMin(numOfActualInputStrides, strides);
}
}
......@@ -134,7 +203,7 @@ void PipelineCompiler::determineNumOfLinearStrides(BuilderRef b) {
ConstantInt * const ONE = b->getSize(1);
numOfOutputStrides = b->CreateUMax(numOfInputStrides, ONE);
}
Value * const strides = getNumOfWritableStrides(b, br, numOfOutputStrides, false);
Value * const strides = getNumOfWritableStrides(b, br, numOfOutputStrides);
if (strides) {
Value * const minStrides = b->CreateUMin(numOfOutputStrides, strides);
Value * const isZero = b->CreateICmpEQ(strides, ZERO);
......@@ -172,7 +241,7 @@ void PipelineCompiler::determineNumOfLinearStrides(BuilderRef b) {
ConstantInt * const ONE = b->getSize(1);
numOfActualOutputStrides = b->CreateUMax(numOfActualInputStrides, ONE);
}
Value * const strides = getNumOfWritableStrides(b, br, numOfActualOutputStrides, true);
Value * const strides = getNumOfWritableStrides(b, br, numOfActualOutputStrides);
if (strides) {
Value * const minStrides = b->CreateUMin(numOfActualOutputStrides, strides);
Value * const isZero = b->CreateICmpEQ(strides, ZERO);
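Taken together, the hunks above compute: the strides still permitted this segment, reduced to the minimum each checked input can supply, then clamped against each owned output's writable space (with the output check run against at least one stride so a source kernel still makes progress). A compact, illustrative restatement that omits the buffer-expansion branch taken when an output has zero writable strides; all types and names below are assumptions:

#include <algorithm>
#include <cstdint>
#include <vector>

struct PortInfo {
    uint64_t items;        // accessible (input) or writable (output) items
    uint64_t strideLength; // items needed per stride; assumed > 0
};

static uint64_t num_of_linear_strides(uint64_t maxStrides, uint64_t stridesAlreadyDone,
                                      const std::vector<PortInfo> & inputs,
                                      const std::vector<PortInfo> & outputs) {
    // Strides still allowed in this segment (assumed not to underflow).
    uint64_t strides = maxStrides - stridesAlreadyDone;
    for (const PortInfo & in : inputs) {
        strides = std::min(strides, in.items / in.strideLength);
    }
    // Check outputs against at least one stride so a kernel with no inputs
    // is not stalled at zero.
    uint64_t outStrides = std::max<uint64_t>(strides, 1);
    for (const PortInfo & out : outputs) {
        outStrides = std::min(outStrides, out.items / out.strideLength);
    }
    return std::min(strides, outStrides);
}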
......@@ -669,6 +738,7 @@ Value * PipelineCompiler::getAccessibleInputItems(BuilderRef b, const BufferPort
Value * accessible = buffer->getLinearlyAccessibleItems(b, processed, available, overflow);
// if (LLVM_UNLIKELY(CheckAssertions)) {
// Value * intCapacity = buffer->getInternalCapacity(b);
// if (overflow) {
......@@ -694,11 +764,6 @@ Value * PipelineCompiler::getAccessibleInputItems(BuilderRef b, const BufferPort
Value * const exhausted = b->CreateICmpUGE(processed, available);
Value * const useZeroExtend = b->CreateAnd(closed, exhausted);
mIsInputZeroExtended[inputPort] = useZeroExtend;
#ifdef PRINT_DEBUG_MESSAGES
debugPrint(b, prefix + "_zeroExtended = %" PRIu64, mIsInputZeroExtended[inputPort]);
#endif
if (LLVM_LIKELY(mHasZeroExtendedInput == nullptr)) {
mHasZeroExtendedInput = useZeroExtend;
} else {
......@@ -742,8 +807,7 @@ void PipelineCompiler::ensureSufficientOutputSpace(BuilderRef b, const BufferPor
const BufferNode & bn = mBufferGraph[streamSet];
if (LLVM_UNLIKELY(bn.isOwned() && bn.isDynamic())) {
if (LLVM_UNLIKELY(bn.isOwned())) {
const auto prefix = makeBufferName(mKernelId, outputPort);
const StreamSetBuffer * const buffer = bn.Buffer;
......@@ -760,6 +824,7 @@ void PipelineCompiler::ensureSufficientOutputSpace(BuilderRef b, const BufferPor
const auto beforeExpansion = mWritableOutputItems[outputPort.Number];
Value * const hasEnoughSpace = b->CreateICmpULE(required, beforeExpansion[WITH_OVERFLOW]);
BasicBlock * const noExpansionExit = b->GetInsertBlock();
b->CreateLikelyCondBr(hasEnoughSpace, expanded, expandBuffer);
......@@ -781,7 +846,6 @@ void PipelineCompiler::ensureSufficientOutputSpace(BuilderRef b, const BufferPor
buffer->reserveCapacity(b, produced, consumed, required);
recordBufferExpansionHistory(b, outputPort, buffer);
if (cycleCounterAccumulator) {
Value * const cycleCounterEnd = b->CreateReadCycleCounter();
Value * const duration = b->CreateSub(cycleCounterEnd, cycleCounterStart);
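The expansion path above runs only when the required output space exceeds what is currently writable, and the cycle counter brackets reserveCapacity so expansion cost can be attributed separately from kernel execution. A rough equivalent using std::chrono in place of the CPU cycle counter; the buffer type and accumulator are placeholders, not this codebase's types:

#include <chrono>
#include <cstdint>
#include <vector>

struct GrowableBuffer {
    std::vector<uint8_t> data;
    void reserve_capacity(uint64_t required) { data.reserve(data.size() + required); }
};

// Expand only when needed, and record how long the expansion took.
static void ensure_sufficient_output_space(GrowableBuffer & buffer,
                                           uint64_t writable, uint64_t required,
                                           uint64_t & expansionNanos) {
    if (required <= writable) {
        return; // enough space already: the likely, cheap path
    }
    const auto start = std::chrono::steady_clock::now();
    buffer.reserve_capacity(required);
    const auto end = std::chrono::steady_clock::now();
    expansionNanos += static_cast<uint64_t>(
        std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count());
}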
......@@ -908,8 +972,7 @@ Value * PipelineCompiler::getWritableOutputItems(BuilderRef b, const BufferPort
** ------------------------------------------------------------------------------------------------------------- */
Value * PipelineCompiler::getNumOfAccessibleStrides(BuilderRef b,
const BufferPort & port,
Value * const numOfLinearStrides,
const bool debug) {
Value * const numOfLinearStrides) {
const auto inputPort = port.Port;
assert (inputPort.Type == PortType::Input);
const Binding & input = port.Binding;
......@@ -917,12 +980,6 @@ Value * PipelineCompiler::getNumOfAccessibleStrides(BuilderRef b,
Value * numOfStrides = nullptr;
#ifdef PRINT_DEBUG_MESSAGES
const auto prefix = makeBufferName(mKernelId, inputPort);
Constant * prefixSymbol = nullptr;
if (debug) {
prefixSymbol = b->GetString(prefix + "_debug");
} else {
prefixSymbol = b->GetString(prefix);
}
#endif
if (LLVM_UNLIKELY(rate.isPartialSum())) {
numOfStrides = getMaximumNumOfPartialSumStrides(b, port, numOfLinearStrides);
......@@ -932,8 +989,8 @@ Value * PipelineCompiler::getNumOfAccessibleStrides(BuilderRef b,
Value * const accessible = getAccessibleInputItems(b, port); assert (accessible);
Value * const strideLength = getInputStrideLength(b, port); assert (strideLength);
#ifdef PRINT_DEBUG_MESSAGES
debugPrint(b, "< %s_accessible = %" PRIu64, prefixSymbol, accessible);
debugPrint(b, "< %s_strideLength = %" PRIu64, prefixSymbol, strideLength);
debugPrint(b, "< " + prefix + "_accessible = %" PRIu64, accessible);
debugPrint(b, "< " + prefix + "_strideLength = %" PRIu64, strideLength);
#endif
numOfStrides = b->CreateUDiv(subtractLookahead(b, port, accessible), strideLength);
}
......@@ -942,7 +999,7 @@ Value * PipelineCompiler::getNumOfAccessibleStrides(BuilderRef b,
numOfStrides = b->CreateSelect(ze, numOfLinearStrides, numOfStrides, "numOfZeroExtendedStrides");
}
#ifdef PRINT_DEBUG_MESSAGES
debugPrint(b, "< %s_numOfStrides = %" PRIu64, prefixSymbol, numOfStrides);
debugPrint(b, "< " + prefix + "_numOfStrides = %" PRIu64, numOfStrides);
#endif
return numOfStrides;
}
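Setting aside the PartialSum and Greedy rate cases, the value computed above is floor((accessible - lookahead) / strideLength), overridden by the requested stride count whenever the input is zero-extended (a zero-extended input never limits the kernel). A minimal sketch with illustrative parameter names:

#include <cstdint>

static uint64_t num_of_accessible_strides(uint64_t accessible, uint64_t lookahead,
                                          uint64_t strideLength, uint64_t numOfLinearStrides,
                                          bool inputIsZeroExtended) {
    // Items usable this segment after reserving the lookahead window.
    const uint64_t usable = accessible > lookahead ? accessible - lookahead : 0;
    const uint64_t strides = usable / strideLength; // strideLength assumed > 0
    return inputIsZeroExtended ? numOfLinearStrides : strides;
}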
......@@ -952,8 +1009,7 @@ Value * PipelineCompiler::getNumOfAccessibleStrides(BuilderRef b,
** ------------------------------------------------------------------------------------------------------------- */
Value * PipelineCompiler::getNumOfWritableStrides(BuilderRef b,
const BufferPort & port,
Value * const numOfLinearStrides,
const bool debug) {
Value * const numOfLinearStrides) {
const auto outputPort = port.Port;
assert (outputPort.Type == PortType::Output);
......@@ -962,15 +1018,6 @@ Value * PipelineCompiler::getNumOfWritableStrides(BuilderRef b,
if (LLVM_UNLIKELY(bn.isUnowned())) {
return nullptr;
}
#ifdef PRINT_DEBUG_MESSAGES
const auto prefix = makeBufferName(mKernelId, outputPort);
Constant * prefixSymbol = nullptr;
if (debug) {
prefixSymbol = b->GetString(prefix + "_debug");
} else {
prefixSymbol = b->GetString(prefix);
}
#endif
const Binding & output = port.Binding;
Value * numOfStrides = nullptr;
if (LLVM_UNLIKELY(output.getRate().isPartialSum())) {
......@@ -981,7 +1028,8 @@ Value * PipelineCompiler::getNumOfWritableStrides(BuilderRef b,
numOfStrides = b->CreateUDiv(writable, strideLength);
}
#ifdef PRINT_DEBUG_MESSAGES
debugPrint(b, "> %s_numOfStrides = %" PRIu64, prefixSymbol, numOfStrides);
const auto prefix = makeBufferName(mKernelId, outputPort);
debugPrint(b, "> " + prefix + "_numOfStrides = %" PRIu64, numOfStrides);
#endif
return numOfStrides;
}
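The output-side counterpart is simpler: an unowned buffer places no limit on the producer (signalled by returning nullptr above), otherwise the limit is the writable space divided by the stride length. A sketch of that rule, using std::optional to stand in for the nullable result:

#include <cstdint>
#include <optional>

// Returns no value when the buffer is unowned, i.e. it imposes no constraint.
static std::optional<uint64_t> num_of_writable_strides(bool bufferIsUnowned,
                                                       uint64_t writable,
                                                       uint64_t strideLength) {
    if (bufferIsUnowned) {
        return std::nullopt;
    }
    return writable / strideLength; // strideLength assumed > 0
}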
......
......@@ -478,29 +478,6 @@ inline const StreamSetPort PipelineCompiler::getReference(const StreamSetPort po
return PipelineCommonGraphFunctions::getReference(mKernelId, port);
}
/** ------------------------------------------------------------------------------------------------------------- *
* @brief isSafeToUseProcessedItemCountDirectly
** ------------------------------------------------------------------------------------------------------------- */
bool PipelineCompiler::isSafeToUseProcessedItemCountDirectly(const unsigned streamSet) const {
const BufferNode & bn = mBufferGraph[streamSet];
if (bn.isExternal()) {
bool alreadyHasOneUse = false;
for (const auto e : make_iterator_range(out_edges(streamSet, mBufferGraph))) {
const auto consumer = target(e, mBufferGraph);
// We can only safely use the processed item count if it's the last use of it
if (consumer > mKernelId) {
return false;
} else if (consumer == mKernelId) {
// If we have more than one use of this count in the same kernel, we cannot
// safely reuse it.
if (alreadyHasOneUse) return false;
alreadyHasOneUse = true;
}
}
}
return true;
}
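The helper being removed above encodes a simple last-use rule: an external stream set's processed item count may be read directly only if no later kernel consumes that stream set and the current kernel consumes it exactly once. An illustrative restatement of that rule (names are assumptions):

#include <vector>

// consumers: kernel ids that read the stream set, in pipeline order.
static bool is_safe_to_use_processed_count_directly(bool isExternal,
                                                    const std::vector<unsigned> & consumers,
                                                    unsigned currentKernelId) {
    if (!isExternal) {
        return true; // internal buffers always track their own count
    }
    bool usedOnceHere = false;
    for (const unsigned consumer : consumers) {
        if (consumer > currentKernelId) {
            return false; // a later kernel still needs the original count
        }
        if (consumer == currentKernelId) {
            if (usedOnceHere) {
                return false; // multiple uses within one kernel cannot share it
            }
            usedOnceHere = true;
        }
    }
    return true;
}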
/** ------------------------------------------------------------------------------------------------------------- *
* @brief reset
** ------------------------------------------------------------------------------------------------------------- */
......@@ -523,8 +500,6 @@ void PipelineCompiler::clearInternalStateForCurrentKernel() {
mAnyRemainingInput = nullptr;
mExhaustedPipelineInputPhi = nullptr;
mExhaustedInputAtJumpPhi = nullptr;
mExecutedAtLeastOnceAtLoopEntryPhi = nullptr;
mCurrentNumOfStridesAtLoopEntryPhi = nullptr;
mKernelIsFinal = nullptr;
mKernelIsPenultimate = nullptr;
......
......@@ -104,7 +104,7 @@ inline void PipelineCompiler::executeKernel(BuilderRef b) {
const auto nextPartitionId = mCurrentPartitionId + 1U;
const auto jumpId = mPartitionJumpIndex[mCurrentPartitionId];
const auto canJumpToAnotherPartition = mIsPartitionRoot && (mIsBounded || nextPartitionId == jumpId);
const auto handleNoUpdateExit = mIsPartitionRoot; // || !canJumpToAnotherPartition;
const auto handleNoUpdateExit = mIsPartitionRoot || !canJumpToAnotherPartition;
#else
const auto canJumpToAnotherPartition = mIsPartitionRoot;
const auto handleNoUpdateExit = mCheckIO;
......@@ -504,7 +504,11 @@ inline void PipelineCompiler::initializeKernelLoopEntryPhis(BuilderRef b) {
IntegerType * const boolTy = b->getInt1Ty();
b->SetInsertPoint(mKernelLoopEntry);
assert ("no loop start?" && mKernelLoopStart);
if (mKernelLoopStart == nullptr) {
report_fatal_error("no loop start?");
}
assert (mKernelLoopStart);
for (const auto e : make_iterator_range(in_edges(mKernelId, mBufferGraph))) {
const BufferPort & br = mBufferGraph[e];
......@@ -873,6 +877,9 @@ void PipelineCompiler::end(BuilderRef b) {
b->CreateUnlikelyCondBr(done, mPipelineEnd, mPipelineLoop);
}
b->SetInsertPoint(mPipelineEnd);
writeExternalConsumedItemCounts(b);
writeExternalProducedItemCounts(b);
if (mCurrentThreadTerminationSignalPtr) {
b->CreateStore(terminated, mCurrentThreadTerminationSignalPtr);
}
......@@ -898,4 +905,16 @@ void PipelineCompiler::end(BuilderRef b) {
// b->GetInsertBlock()->getParent()->print(errs());
}
/** ------------------------------------------------------------------------------------------------------------- *
* @brief writeExternalProducedItemCounts
** ------------------------------------------------------------------------------------------------------------- */
void PipelineCompiler::writeExternalProducedItemCounts(BuilderRef b) {
for (const auto e : make_iterator_range(in_edges(PipelineOutput, mBufferGraph))) {
const BufferPort & external = mBufferGraph[e];
const auto streamSet = source(e, mBufferGraph);
Value * const ptr = getProducedOutputItemsPtr(external.Port.Number);
b->CreateStore(mLocallyAvailableItems[streamSet], ptr);
}
}
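At pipeline exit, the produced counts visible to the caller are simply the locally available item counts of the stream sets feeding PipelineOutput. A trivial model of that final publish step, with placeholder types and indices:

#include <cstdint>
#include <vector>

// For each external output port, publish the number of items the pipeline
// actually made available for the stream set bound to that port.
static void write_external_produced_counts(const std::vector<uint64_t> & locallyAvailable,
                                           const std::vector<unsigned> & outputStreamSets,
                                           std::vector<uint64_t> & externalProduced) {
    for (std::size_t port = 0; port < outputStreamSets.size(); ++port) {
        externalProduced[port] = locallyAvailable[outputStreamSets[port]];
    }
}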
}