Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
cprabhu
parabix-devel
Commits
c9ab9ef7
Commit
c9ab9ef7
authored
4 years ago
by
Rob Cameron
Browse files
Options
Download
Email Patches
Plain Diff
Pipeline work to
1117b437
parent
a3c734d0
Changes
42
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
567 additions
and
821 deletions
+567
-821
include/testing/stream_gen.hpp
include/testing/stream_gen.hpp
+5
-4
lib/codegen/CBuilder.cpp
lib/codegen/CBuilder.cpp
+5
-20
lib/fileselect/file_select.cpp
lib/fileselect/file_select.cpp
+1
-1
lib/kernel/core/kernel_compiler.cpp
lib/kernel/core/kernel_compiler.cpp
+62
-70
lib/kernel/core/streamset.cpp
lib/kernel/core/streamset.cpp
+7
-10
lib/kernel/io/source_kernel.cpp
lib/kernel/io/source_kernel.cpp
+10
-31
lib/kernel/io/stdout_kernel.cpp
lib/kernel/io/stdout_kernel.cpp
+0
-6
lib/kernel/pipeline/compiler/analysis/buffer_analysis.hpp
lib/kernel/pipeline/compiler/analysis/buffer_analysis.hpp
+2
-92
lib/kernel/pipeline/compiler/analysis/consumer_analysis.hpp
lib/kernel/pipeline/compiler/analysis/consumer_analysis.hpp
+29
-12
lib/kernel/pipeline/compiler/analysis/partitioning_analysis.hpp
...rnel/pipeline/compiler/analysis/partitioning_analysis.hpp
+4
-14
lib/kernel/pipeline/compiler/analysis/pipeline_analysis.hpp
lib/kernel/pipeline/compiler/analysis/pipeline_analysis.hpp
+1
-3
lib/kernel/pipeline/compiler/buffer_management_logic.hpp
lib/kernel/pipeline/compiler/buffer_management_logic.hpp
+43
-120
lib/kernel/pipeline/compiler/common/common.hpp
lib/kernel/pipeline/compiler/common/common.hpp
+4
-4
lib/kernel/pipeline/compiler/common/graphs.h
lib/kernel/pipeline/compiler/common/graphs.h
+0
-10
lib/kernel/pipeline/compiler/config.h
lib/kernel/pipeline/compiler/config.h
+2
-10
lib/kernel/pipeline/compiler/consumer_logic.hpp
lib/kernel/pipeline/compiler/consumer_logic.hpp
+109
-169
lib/kernel/pipeline/compiler/kernel_execution_logic.hpp
lib/kernel/pipeline/compiler/kernel_execution_logic.hpp
+178
-182
lib/kernel/pipeline/compiler/kernel_io_calculation_logic.hpp
lib/kernel/pipeline/compiler/kernel_io_calculation_logic.hpp
+84
-36
lib/kernel/pipeline/compiler/kernel_logic.hpp
lib/kernel/pipeline/compiler/kernel_logic.hpp
+0
-25
lib/kernel/pipeline/compiler/kernel_segment_processing_logic.hpp
...nel/pipeline/compiler/kernel_segment_processing_logic.hpp
+21
-2
No files found.
include/testing/stream_gen.hpp
View file @
c9ab9ef7
...
...
@@ -15,7 +15,6 @@
#include <type_traits>
#include <tuple>
#include <vector>
#include <util/aligned_allocator.h>
#include <llvm/IR/Type.h>
#include <llvm/Support/ErrorHandling.h>
#include <kernel/core/idisa_target.h>
...
...
@@ -151,7 +150,7 @@ public:
using
set_literal_t
=
std
::
vector
<
literal_t
>
;
/// The internal buffer type of the stream.
using
buffer_t
=
std
::
vector
<
buffer_item_type
,
AlignedAllocator
<
buffer_item_type
,
64
>
>
;
using
buffer_t
=
std
::
vector
<
buffer_item_type
>
;
/// The number of stream items per buffer item;
static
const
uint32_t
stream_items_per_buffer_item_v
=
si_per_bi
<
I
>::
value
;
...
...
@@ -179,7 +178,9 @@ struct copy_decoder {
static
const
size_t
num_elements_v
=
1
;
static
result_t
decode
(
typename
traits
::
literal_t
const
&
str
)
{
return
std
::
make_tuple
(
typename
traits
::
buffer_t
{
str
.
begin
(),
str
.
end
()},
str
.
size
(),
1
);
static_assert
(
std
::
is_same
<
typename
traits
::
literal_t
,
typename
traits
::
buffer_t
>::
value
,
"copy_decoder cannot be used when literal_t != buffer_t"
);
return
std
::
make_tuple
(
str
,
str
.
size
(),
1
);
}
};
...
...
@@ -264,7 +265,7 @@ struct bin_decoder {
static
const
size_t
num_elements_v
=
1
;
static
result_t
decode
(
typename
traits
::
literal_t
const
&
str
)
{
std
::
vector
<
uint8_t
,
AlignedAllocator
<
uint8_t
,
64
>
>
buffer
{};
std
::
vector
<
uint8_t
>
buffer
{};
int
counter
=
0
;
size_t
len
=
0
;
uint8_t
builder
=
0
;
...
...
This diff is collapsed.
Click to expand it.
lib/codegen/CBuilder.cpp
View file @
c9ab9ef7
...
...
@@ -1010,14 +1010,6 @@ void __report_failure_v(const char * name, const char * fmt, const uintptr_t * t
}
out
<<
"
\n
No debug symbols loaded.
\n
"
;
}
if
(
codegen
::
TaskThreads
>
1
||
codegen
::
SegmentThreads
>
1
)
{
if
(
colourize
)
{
out
.
changeColor
(
raw_fd_ostream
::
BLUE
,
true
);
}
out
<<
" (Thread # "
;
out
.
write_hex
(
reinterpret_cast
<
unsigned
long
>
(
pthread_self
()));
out
<<
")"
;
}
if
(
colourize
)
{
out
.
resetColor
();
}
...
...
@@ -1427,10 +1419,8 @@ LoadInst * CBuilder::CreateLoad(Value * Ptr, bool isVolatile, const Twine Name)
}
StoreInst
*
CBuilder
::
CreateStore
(
Value
*
Val
,
Value
*
Ptr
,
bool
isVolatile
)
{
assert
(
"Ptr (Arg2) was expected to be a pointer type"
&&
Ptr
->
getType
()
->
isPointerTy
());
assert
(
"Ptr (Arg2) is not a pointer type for Val (Arg1)"
&&
Val
->
getType
()
==
Ptr
->
getType
()
->
getPointerElementType
());
assert
(
"Ptr is not a pointer type for Val"
&&
Ptr
->
getType
()
->
isPointerTy
()
&&
Val
->
getType
()
==
Ptr
->
getType
()
->
getPointerElementType
());
if
(
LLVM_UNLIKELY
(
codegen
::
DebugOptionIsSet
(
codegen
::
EnableAsserts
)))
{
CheckAddress
(
Ptr
,
ConstantExpr
::
getSizeOf
(
Val
->
getType
()),
"CreateStore"
);
}
...
...
@@ -1840,14 +1830,9 @@ void CBuilder::CheckAddress(Value * const Ptr, Value * const Size, Constant * co
#endif
}
Value
*
const
addr
=
CreatePointerCast
(
Ptr
,
voidPtrTy
);
Value
*
const
firstPoisoned
=
CreateCall
(
isPoisoned
,
{
addr
,
CreateTrunc
(
Size
,
sizeTy
)
});
Value
*
const
valid
=
CreateICmpEQ
(
firstPoisoned
,
ConstantPointerNull
::
get
(
voidPtrTy
));
DataLayout
DL
(
getModule
());
IntegerType
*
const
intPtrTy
=
cast
<
IntegerType
>
(
DL
.
getIntPtrType
(
firstPoisoned
->
getType
()));
Value
*
const
startInt
=
CreatePtrToInt
(
Ptr
,
intPtrTy
);
Value
*
const
firstPoisonedInt
=
CreatePtrToInt
(
firstPoisoned
,
intPtrTy
);
Value
*
const
offset
=
CreateSub
(
firstPoisonedInt
,
startInt
);
__CreateAssert
(
valid
,
"%s was given an unallocated %"
PRIuMAX
"-byte memory address 0x%"
PRIxPTR
" (first poisoned=%"
PRIuMAX
")"
,
{
Name
,
Size
,
Ptr
,
offset
});
Value
*
check
=
CreateCall
(
isPoisoned
,
{
addr
,
CreateTrunc
(
Size
,
sizeTy
)
});
Value
*
const
valid
=
CreateICmpEQ
(
check
,
ConstantPointerNull
::
get
(
voidPtrTy
));
__CreateAssert
(
valid
,
"%s was given an unallocated %"
PRIuMAX
"-byte memory address 0x%"
PRIxPTR
,
{
Name
,
Size
,
Ptr
});
}
#endif
}
...
...
This diff is collapsed.
Click to expand it.
lib/fileselect/file_select.cpp
View file @
c9ab9ef7
...
...
@@ -385,7 +385,7 @@ std::vector<fs::path> getFullFileList(CPUDriver & driver, cl::list<std::string>
// them to the global list of selected files.
grep
::
NestedInternalSearchEngine
pathSelectEngine
(
driver
);
pathSelectEngine
.
setNumOfThreads
(
codegen
::
SegmentThreads
);
// 1
pathSelectEngine
.
setNumOfThreads
(
1
);
pathSelectEngine
.
setRecordBreak
(
grep
::
GrepRecordBreakKind
::
Null
);
pathSelectEngine
.
init
();
pathSelectEngine
.
push
(
coalesceREs
(
getIncludeExcludePatterns
(),
GitREcoalescing
));
...
...
This diff is collapsed.
Click to expand it.
lib/kernel/core/kernel_compiler.cpp
View file @
c9ab9ef7
...
...
@@ -372,8 +372,6 @@ void KernelCompiler::setDoSegmentProperties(BuilderRef b, const ArrayRef<Value *
const
auto
internallySynchronized
=
mTarget
->
hasAttribute
(
AttrId
::
InternallySynchronized
);
const
auto
greedy
=
mTarget
->
isGreedy
();
const
auto
kernelPrefix
=
getName
();
Rational
fixedRateLCM
{
0
};
mFixedRateFactor
=
nullptr
;
if
(
LLVM_UNLIKELY
(
internallySynchronized
||
greedy
))
{
...
...
@@ -408,6 +406,10 @@ void KernelCompiler::setDoSegmentProperties(BuilderRef b, const ArrayRef<Value *
#ifdef CHECK_IO_ADDRESS_RANGE
auto
checkStreamRange
=
[
&
](
const
std
::
unique_ptr
<
StreamSetBuffer
>
&
buffer
,
const
Binding
&
binding
,
Value
*
const
startItemCount
)
{
SmallVector
<
char
,
256
>
tmp
;
raw_svector_ostream
out
(
tmp
);
out
<<
"StreamSet "
<<
getName
()
<<
":"
<<
binding
.
getName
();
PointerType
*
const
int8PtrTy
=
b
->
getInt8PtrTy
();
ConstantInt
*
const
ZERO
=
b
->
getSize
(
0
);
...
...
@@ -416,7 +418,6 @@ void KernelCompiler::setDoSegmentProperties(BuilderRef b, const ArrayRef<Value *
Value
*
const
fromIndex
=
b
->
CreateUDiv
(
startItemCount
,
BLOCK_WIDTH
);
Value
*
const
baseAddress
=
buffer
->
getBaseAddress
(
b
);
Value
*
const
startPtr
=
buffer
->
getStreamBlockPtr
(
b
,
baseAddress
,
ZERO
,
fromIndex
);
Value
*
const
start
=
b
->
CreatePointerCast
(
startPtr
,
int8PtrTy
);
Value
*
const
toIndex
=
b
->
CreateCeilUDiv
(
buffer
->
getCapacity
(
b
),
BLOCK_WIDTH
);
Value
*
const
endPtr
=
buffer
->
getStreamBlockPtr
(
b
,
baseAddress
,
ZERO
,
toIndex
);
...
...
@@ -438,9 +439,7 @@ void KernelCompiler::setDoSegmentProperties(BuilderRef b, const ArrayRef<Value *
auto
&
buffer
=
mStreamSetInputBuffers
[
i
];
assert
(
buffer
.
get
()
&&
buffer
->
isLinear
());
const
Binding
&
input
=
mInputStreamSets
[
i
];
Value
*
const
virtualBaseAddress
=
b
->
CreatePointerCast
(
nextArg
(),
buffer
->
getPointerType
());
Value
*
const
localHandle
=
b
->
CreateAllocaAtEntryPoint
(
buffer
->
getHandleType
(
b
));
buffer
->
setHandle
(
localHandle
);
buffer
->
setBaseAddress
(
b
,
virtualBaseAddress
);
...
...
@@ -460,43 +459,36 @@ void KernelCompiler::setDoSegmentProperties(BuilderRef b, const ArrayRef<Value *
const
ProcessingRate
&
rate
=
input
.
getRate
();
Value
*
processed
=
nullptr
;
if
(
internallySynchronized
||
isAddressable
(
input
))
{
mProcessedInputItemPtr
[
i
]
=
nextArg
();
processed
=
b
->
CreateLoad
(
mProcessedInputItemPtr
[
i
]);
}
else
{
if
(
LLVM_LIKELY
(
isCountable
(
input
)))
{
processed
=
nextArg
();
}
else
{
// isRelative
const
auto
port
=
getStreamPort
(
rate
.
getReference
());
assert
(
port
.
Type
==
PortType
::
Input
&&
port
.
Number
<
i
);
assert
(
mProcessedInputItemPtr
[
port
.
Number
]);
Value
*
const
ref
=
b
->
CreateLoad
(
mProcessedInputItemPtr
[
port
.
Number
]);
processed
=
b
->
CreateMulRate
(
ref
,
rate
.
getRate
());
}
assert
(
processed
);
assert
(
processed
->
getType
()
==
sizeTy
);
AllocaInst
*
const
processedItems
=
b
->
CreateAllocaAtEntryPoint
(
sizeTy
);
b
->
CreateStore
(
processed
,
processedItems
);
mProcessedInputItemPtr
[
i
]
=
processedItems
;
mUpdatableProcessedInputItemPtr
[
i
]
=
nextArg
();
processed
=
b
->
CreateLoad
(
mUpdatableProcessedInputItemPtr
[
i
]);
}
else
if
(
LLVM_LIKELY
(
isCountable
(
input
)))
{
processed
=
nextArg
();
}
else
{
// isRelative
const
auto
port
=
getStreamPort
(
rate
.
getReference
());
assert
(
port
.
Type
==
PortType
::
Input
&&
port
.
Number
<
i
);
assert
(
mProcessedInputItemPtr
[
port
.
Number
]);
Value
*
const
ref
=
b
->
CreateLoad
(
mProcessedInputItemPtr
[
port
.
Number
]);
processed
=
b
->
CreateMulRate
(
ref
,
rate
.
getRate
());
}
assert
(
processed
);
assert
(
processed
->
getType
()
==
sizeTy
);
AllocaInst
*
const
processedItems
=
b
->
CreateAllocaAtEntryPoint
(
sizeTy
);
b
->
CreateStore
(
processed
,
processedItems
);
mProcessedInputItemPtr
[
i
]
=
processedItems
;
/// ----------------------------------------------------
/// accessible item count
/// ----------------------------------------------------
Value
*
accessible
=
nullptr
;
Value
*
accessible
=
nullptr
;
if
(
LLVM_UNLIKELY
(
internallySynchronized
||
requiresItemCount
(
input
)))
{
accessible
=
nextArg
();
}
else
{
accessible
=
b
->
CreateCeilUMulRate
(
mFixedRateFactor
,
rate
.
getRate
()
/
fixedRateLCM
);
}
assert
(
accessible
);
assert
(
accessible
->
getType
()
==
sizeTy
);
mAccessibleInputItems
[
i
]
=
accessible
;
Value
*
avail
=
b
->
CreateAdd
(
processed
,
accessible
);
mAvailableInputItems
[
i
]
=
avail
;
if
(
input
.
hasLookahead
())
{
avail
=
b
->
CreateAdd
(
avail
,
b
->
getSize
(
input
.
getLookahead
()));
}
...
...
@@ -527,7 +519,6 @@ void KernelCompiler::setDoSegmentProperties(BuilderRef b, const ArrayRef<Value *
const
std
::
unique_ptr
<
StreamSetBuffer
>
&
buffer
=
mStreamSetOutputBuffers
[
i
];
assert
(
buffer
.
get
()
&&
buffer
->
isLinear
());
const
Binding
&
output
=
mOutputStreamSets
[
i
];
const
auto
isShared
=
output
.
hasAttribute
(
AttrId
::
SharedManagedBuffer
);
const
auto
isLocal
=
internallySynchronized
||
isShared
||
Kernel
::
isLocalBuffer
(
output
,
false
);
...
...
@@ -560,29 +551,26 @@ void KernelCompiler::setDoSegmentProperties(BuilderRef b, const ArrayRef<Value *
const
ProcessingRate
&
rate
=
output
.
getRate
();
Value
*
produced
=
nullptr
;
if
(
LLVM_LIKELY
(
internallySynchronized
||
canTerminate
||
isAddressable
(
output
)))
{
mProducedOutputItemPtr
[
i
]
=
nextArg
();
produced
=
b
->
CreateLoad
(
mProducedOutputItemPtr
[
i
]);
}
else
{
if
(
LLVM_LIKELY
(
isCountable
(
output
)))
{
produced
=
nextArg
();
}
else
{
// isRelative
// For now, if something is produced at a relative rate to another stream in a kernel that
// may terminate, its final item count is inherited from its reference stream and cannot
// be set independently. Should they be independent at early termination?
const
auto
port
=
getStreamPort
(
rate
.
getReference
());
assert
(
port
.
Type
==
PortType
::
Input
||
(
port
.
Type
==
PortType
::
Output
&&
port
.
Number
<
i
));
const
auto
&
items
=
(
port
.
Type
==
PortType
::
Input
)
?
mProcessedInputItemPtr
:
mProducedOutputItemPtr
;
Value
*
const
ref
=
b
->
CreateLoad
(
items
[
port
.
Number
]);
produced
=
b
->
CreateMulRate
(
ref
,
rate
.
getRate
());
}
AllocaInst
*
const
producedItems
=
b
->
CreateAllocaAtEntryPoint
(
sizeTy
);
b
->
CreateStore
(
produced
,
producedItems
);
mProducedOutputItemPtr
[
i
]
=
producedItems
;
mUpdatableProducedOutputItemPtr
[
i
]
=
nextArg
();
produced
=
b
->
CreateLoad
(
mUpdatableProducedOutputItemPtr
[
i
]);
}
else
if
(
LLVM_LIKELY
(
isCountable
(
output
)))
{
produced
=
nextArg
();
}
else
{
// isRelative
// For now, if something is produced at a relative rate to another stream in a kernel that
// may terminate, its final item count is inherited from its reference stream and cannot
// be set independently. Should they be independent at early termination?
const
auto
port
=
getStreamPort
(
rate
.
getReference
());
assert
(
port
.
Type
==
PortType
::
Input
||
(
port
.
Type
==
PortType
::
Output
&&
port
.
Number
<
i
));
const
auto
&
items
=
(
port
.
Type
==
PortType
::
Input
)
?
mProcessedInputItemPtr
:
mProducedOutputItemPtr
;
Value
*
const
ref
=
b
->
CreateLoad
(
items
[
port
.
Number
]);
produced
=
b
->
CreateMulRate
(
ref
,
rate
.
getRate
());
}
assert
(
produced
);
assert
(
produced
->
getType
()
==
sizeTy
);
mInitiallyProducedOutputItems
[
i
]
=
produced
;
AllocaInst
*
const
producedItems
=
b
->
CreateAllocaAtEntryPoint
(
sizeTy
);
b
->
CreateStore
(
produced
,
producedItems
);
mProducedOutputItemPtr
[
i
]
=
producedItems
;
/// ----------------------------------------------------
/// writable / consumed item count
/// ----------------------------------------------------
...
...
@@ -604,14 +592,15 @@ void KernelCompiler::setDoSegmentProperties(BuilderRef b, const ArrayRef<Value *
Value
*
capacity
=
nullptr
;
if
(
writable
)
{
capacity
=
b
->
CreateAdd
(
produced
,
writable
);
buffer
->
setCapacity
(
b
,
capacity
);
#ifdef CHECK_IO_ADDRESS_RANGE
if
(
LLVM_UNLIKELY
(
enableAsserts
))
{
checkStreamRange
(
buffer
,
output
,
produced
);
}
#endif
}
else
{
capacity
=
ConstantExpr
::
getNeg
(
b
->
getSize
(
1
));
}
buffer
->
setCapacity
(
b
,
capacity
);
}
mWritableOutputItems
[
i
]
=
writable
;
}
...
...
@@ -755,7 +744,7 @@ inline void KernelCompiler::callGenerateDoSegmentMethod(BuilderRef b) {
args
.
reserve
(
mCurrentMethod
->
arg_size
());
for
(
auto
ArgI
=
mCurrentMethod
->
arg_begin
();
ArgI
!=
mCurrentMethod
->
arg_end
();
++
ArgI
)
{
args
.
push_back
(
&
(
*
ArgI
));
}
}
setDoSegmentProperties
(
b
,
args
);
END_SCOPED_REGION
...
...
@@ -769,16 +758,14 @@ inline void KernelCompiler::callGenerateDoSegmentMethod(BuilderRef b) {
b
->
CreateMProtect
(
mSharedHandle
,
CBuilder
::
Protect
::
READ
);
}
// #error advance processed item counts for internally synchronized kernels? pipeline should handle it but didn't seem to?
// const auto numOfInputs = getNumOfStreamInputs();
const
auto
numOfInputs
=
getNumOfStreamInputs
();
//
for (unsigned i = 0; i < numOfInputs; i++) {
//
if (mUpdatableProcessedInputItemPtr[i]) {
//
Value * const items = b->CreateLoad(mProcessedInputItemPtr[i]);
//
b->CreateStore(items, mUpdatableProcessedInputItemPtr[i]);
//
}
//
}
for
(
unsigned
i
=
0
;
i
<
numOfInputs
;
i
++
)
{
if
(
mUpdatableProcessedInputItemPtr
[
i
])
{
Value
*
const
items
=
b
->
CreateLoad
(
mProcessedInputItemPtr
[
i
]);
b
->
CreateStore
(
items
,
mUpdatableProcessedInputItemPtr
[
i
]);
}
}
const
auto
numOfOutputs
=
getNumOfStreamOutputs
();
...
...
@@ -798,21 +785,21 @@ inline void KernelCompiler::callGenerateDoSegmentMethod(BuilderRef b) {
Constant
*
const
LOG_2_BLOCK_WIDTH
=
b
->
getSize
(
floor_log2
(
b
->
getBitBlockWidth
()));
Constant
*
const
ZERO
=
b
->
getSize
(
0
);
Value
*
produced
=
mInitiallyProducedOutputItems
[
i
];
//
// TODO: will LLVM optimizations replace the following with the already loaded value?
//
// If not, re-loading it here may reduce register pressure / compilation time.
//
if (mProducedOutputItemPtr[i]) {
//
produced = b->CreateLoad(mProducedOutputItemPtr[i]);
//
}
// TODO: will LLVM optimizations replace the following with the already loaded value?
// If not, re-loading it here may reduce register pressure / compilation time.
if
(
m
Updatable
ProducedOutputItemPtr
[
i
])
{
produced
=
b
->
CreateLoad
(
m
Updatable
ProducedOutputItemPtr
[
i
]);
}
Value
*
const
blockIndex
=
b
->
CreateLShr
(
produced
,
LOG_2_BLOCK_WIDTH
);
Value
*
vba
=
buffer
->
getStreamLogicalBasePtr
(
b
.
get
(),
baseAddress
,
ZERO
,
blockIndex
);
vba
=
b
->
CreatePointerCast
(
vba
,
b
->
getVoidPtrTy
());
b
->
CreateStore
(
vba
,
mUpdatableOutputBaseVirtualAddressPtr
[
i
]);
}
//
if (mUpdatableProducedOutputItemPtr[i]) {
//
Value * const items = b->CreateLoad(mProducedOutputItemPtr[i]);
//
b->CreateStore(items, mUpdatableProducedOutputItemPtr[i]);
//
}
if
(
mUpdatableProducedOutputItemPtr
[
i
])
{
Value
*
const
items
=
b
->
CreateLoad
(
mProducedOutputItemPtr
[
i
]);
b
->
CreateStore
(
items
,
mUpdatableProducedOutputItemPtr
[
i
]);
}
}
// return the termination signal (if one exists)
...
...
@@ -834,7 +821,7 @@ std::vector<Value *> KernelCompiler::storeDoSegmentState() const {
const
auto
numOfOutputs
=
getNumOfStreamOutputs
();
std
::
vector
<
Value
*>
S
;
S
.
resize
(
8
+
numOfInputs
*
3
+
numOfOutputs
*
5
);
S
.
resize
(
8
+
numOfInputs
*
4
+
numOfOutputs
*
6
);
auto
o
=
S
.
begin
();
...
...
@@ -858,11 +845,14 @@ std::vector<Value *> KernelCompiler::storeDoSegmentState() const {
copy
(
mProcessedInputItemPtr
,
numOfInputs
);
copy
(
mAccessibleInputItems
,
numOfInputs
);
copy
(
mAvailableInputItems
,
numOfInputs
);
copy
(
mUpdatableProcessedInputItemPtr
,
numOfInputs
);
copy
(
mProducedOutputItemPtr
,
numOfOutputs
);
copy
(
mInitiallyProducedOutputItems
,
numOfOutputs
);
copy
(
mWritableOutputItems
,
numOfOutputs
);
copy
(
mConsumedOutputItems
,
numOfOutputs
);
copy
(
mUpdatableProducedOutputItemPtr
,
numOfOutputs
);
copy
(
mUpdatableOutputBaseVirtualAddressPtr
,
numOfOutputs
);
assert
(
o
==
S
.
end
());
...
...
@@ -902,12 +892,14 @@ void KernelCompiler::restoreDoSegmentState(const std::vector<Value *> & S) {
revert
(
mProcessedInputItemPtr
,
numOfInputs
);
revert
(
mAccessibleInputItems
,
numOfInputs
);
revert
(
mAvailableInputItems
,
numOfInputs
);
revert
(
mUpdatableProcessedInputItemPtr
,
numOfInputs
);
const
auto
numOfOutputs
=
getNumOfStreamOutputs
();
revert
(
mProducedOutputItemPtr
,
numOfOutputs
);
revert
(
mInitiallyProducedOutputItems
,
numOfOutputs
);
revert
(
mWritableOutputItems
,
numOfOutputs
);
revert
(
mConsumedOutputItems
,
numOfOutputs
);
revert
(
mUpdatableProducedOutputItemPtr
,
numOfOutputs
);
revert
(
mUpdatableOutputBaseVirtualAddressPtr
,
numOfOutputs
);
assert
(
o
==
S
.
end
());
...
...
This diff is collapsed.
Click to expand it.
lib/kernel/core/streamset.cpp
View file @
c9ab9ef7
...
...
@@ -525,6 +525,7 @@ void StaticBuffer::prepareLinearBuffer(BuilderPtr b, llvm::Value * const produce
indices
[
0
]
=
b
->
getInt32
(
0
);
indices
[
1
]
=
b
->
getInt32
(
EffectiveCapacity
);
Value
*
const
capacityField
=
b
->
CreateInBoundsGEP
(
mHandle
,
indices
);
Value
*
const
consumedChunks
=
b
->
CreateUDiv
(
consumed
,
BLOCK_WIDTH
);
indices
[
1
]
=
b
->
getInt32
(
BaseAddress
);
Value
*
const
virtualBaseField
=
b
->
CreateInBoundsGEP
(
mHandle
,
indices
);
...
...
@@ -535,12 +536,13 @@ void StaticBuffer::prepareLinearBuffer(BuilderPtr b, llvm::Value * const produce
Value
*
const
mallocedAddrField
=
b
->
CreateInBoundsGEP
(
mHandle
,
indices
);
Value
*
const
bufferStart
=
b
->
CreateLoad
(
mallocedAddrField
);
Value
*
const
consumedChunks
=
b
->
CreateUDiv
(
consumed
,
BLOCK_WIDTH
);
Value
*
const
newBaseAddress
=
b
->
CreateGEP
(
bufferStart
,
b
->
CreateNeg
(
consumedChunks
));
b
->
CreateStore
(
newBaseAddress
,
virtualBaseField
);
Value
*
const
effectiveCapacity
=
b
->
CreateAdd
(
consumedChunks
,
getInternalCapacity
(
b
));
Value
*
const
newBaseAddress
=
b
->
CreateGEP
(
bufferStart
,
b
->
CreateNeg
(
consumedChunks
));
Value
*
const
effectiveCapacity
=
b
->
CreateAdd
(
consumedChunks
,
b
->
getSize
(
mCapacity
));
b
->
CreateStore
(
newBaseAddress
,
virtualBaseField
);
b
->
CreateStore
(
effectiveCapacity
,
capacityField
);
}
}
...
...
@@ -796,8 +798,7 @@ void DynamicBuffer::reserveCapacity(BuilderPtr b, Value * const produced, Value
indices
[
1
]
=
b
->
getInt32
(
EffectiveCapacity
);
Value
*
const
capacityField
=
b
->
CreateInBoundsGEP
(
handle
,
indices
);
Value
*
const
capacity
=
b
->
CreateLoad
(
capacityField
);
Value
*
const
capacity
=
b
->
CreateLoad
(
capacityField
);
Value
*
const
consumedChunks
=
b
->
CreateUDiv
(
consumed
,
BLOCK_WIDTH
);
Value
*
const
producedChunks
=
b
->
CreateCeilUDiv
(
produced
,
BLOCK_WIDTH
);
Value
*
const
requiredCapacity
=
b
->
CreateAdd
(
produced
,
required
);
...
...
@@ -820,10 +821,6 @@ void DynamicBuffer::reserveCapacity(BuilderPtr b, Value * const produced, Value
Value
*
const
bytesToCopy
=
b
->
CreateMul
(
unconsumedChunks
,
CHUNK_SIZE
);
//b->CallPrintInt("consumed", consumed);
//b->CallPrintInt("CHUNK_SIZE", CHUNK_SIZE);
//b->CallPrintInt("bytesToCopy", bytesToCopy);
BasicBlock
*
const
copyBack
=
BasicBlock
::
Create
(
C
,
"copyBack"
,
func
);
BasicBlock
*
const
expandAndCopyBack
=
BasicBlock
::
Create
(
C
,
"expandAndCopyBack"
,
func
);
BasicBlock
*
const
updateBaseAddress
=
BasicBlock
::
Create
(
C
,
"updateBaseAddress"
,
func
);
...
...
This diff is collapsed.
Click to expand it.
lib/kernel/io/source_kernel.cpp
View file @
c9ab9ef7
...
...
@@ -169,24 +169,20 @@ void ReadSourceKernel::generateInitializeMethod(const unsigned codeUnitWidth, co
void
ReadSourceKernel
::
generateDoSegmentMethod
(
const
unsigned
codeUnitWidth
,
const
unsigned
stride
,
BuilderRef
b
)
{
ConstantInt
*
const
strideItems
=
b
->
getSize
(
stride
);
ConstantInt
*
const
itemsToRead
=
b
->
getSize
(
stride
);
ConstantInt
*
const
codeUnitBytes
=
b
->
getSize
(
codeUnitWidth
/
8
);
Constant
*
const
strideBytes
=
ConstantExpr
::
getMul
(
strideItems
,
codeUnitBytes
);
BasicBlock
*
const
entryBB
=
b
->
GetInsertBlock
();
BasicBlock
*
const
moveData
=
b
->
CreateBasicBlock
(
"MoveData"
);
BasicBlock
*
const
prepareBuffer
=
b
->
CreateBasicBlock
(
"PrepareBuffer"
);
BasicBlock
*
const
readData
=
b
->
CreateBasicBlock
(
"ReadData"
);
BasicBlock
*
const
readIncomplete
=
b
->
CreateBasicBlock
(
"readIncomplete"
);
BasicBlock
*
const
setTermination
=
b
->
CreateBasicBlock
(
"SetTermination"
);
BasicBlock
*
const
readExit
=
b
->
CreateBasicBlock
(
"ReadExit"
);
// Can we append to our existing buffer without impacting any subsequent kernel?
Value
*
const
produced
=
b
->
getProducedItemCount
(
"sourceBuffer"
);
Value
*
const
itemsPending
=
b
->
CreateAdd
(
produced
,
strideItems
);
Value
*
const
itemsPending
=
b
->
CreateAdd
(
produced
,
itemsToRead
);
Value
*
const
effectiveCapacity
=
b
->
getScalarField
(
"effectiveCapacity"
);
Value
*
const
baseBuffer
=
b
->
getScalarField
(
"buffer"
);
Value
*
const
fd
=
b
->
getScalarField
(
"fileDescriptor"
);
Value
*
const
permitted
=
b
->
CreateICmpULT
(
itemsPending
,
effectiveCapacity
);
b
->
CreateLikelyCondBr
(
permitted
,
readData
,
moveData
);
...
...
@@ -211,7 +207,7 @@ void ReadSourceKernel::generateDoSegmentMethod(const unsigned codeUnitWidth, con
Value
*
const
unreadItems
=
b
->
CreateSub
(
produced
,
consumed
);
Value
*
const
unreadData
=
b
->
getRawOutputPointer
(
"sourceBuffer"
,
consumed
);
Value
*
const
potentialItems
=
b
->
CreateAdd
(
unreadItems
,
strideItems
);
Value
*
const
potentialItems
=
b
->
CreateAdd
(
unreadItems
,
itemsToRead
);
Value
*
const
toWrite
=
b
->
CreateGEP
(
baseBuffer
,
potentialItems
);
Value
*
const
canCopy
=
b
->
CreateICmpULT
(
toWrite
,
unreadData
);
...
...
@@ -269,33 +265,16 @@ void ReadSourceKernel::generateDoSegmentMethod(const unsigned codeUnitWidth, con
b
->
CreateBr
(
readData
);
// Regardless of whether we're simply appending data or had to allocate a new buffer, read a new page
// of data into the input source buffer.
This may involve multiple read calls
.
// of data into the input source buffer.
If we fail to read a full page ..
.
b
->
SetInsertPoint
(
readData
);
PHINode
*
const
bytesToRead
=
b
->
CreatePHI
(
strideBytes
->
getType
(),
3
);
bytesToRead
->
addIncoming
(
strideBytes
,
entryBB
);
bytesToRead
->
addIncoming
(
strideBytes
,
prepareBuffer
);
PHINode
*
const
producedSoFar
=
b
->
CreatePHI
(
produced
->
getType
(),
3
);
producedSoFar
->
addIncoming
(
produced
,
entryBB
);
producedSoFar
->
addIncoming
(
produced
,
prepareBuffer
);
Value
*
const
sourceBuffer
=
b
->
getRawOutputPointer
(
"sourceBuffer"
,
producedSoFar
);
Value
*
const
sourceBuffer
=
b
->
getRawOutputPointer
(
"sourceBuffer"
,
produced
);
Value
*
const
fd
=
b
->
getScalarField
(
"fileDescriptor"
);
Constant
*
const
bytesToRead
=
ConstantExpr
::
getMul
(
itemsToRead
,
codeUnitBytes
);
Value
*
const
bytesRead
=
b
->
CreateReadCall
(
fd
,
sourceBuffer
,
bytesToRead
);
// There are 4 possible results from read:
// bytesRead == -1: an error occurred
// bytesRead == 0: EOF, no bytes read
// 0 < bytesRead < bytesToRead: some data read (more may be available)
// bytesRead == bytesToRead, the full amount requested was read.
b
->
CreateUnlikelyCondBr
(
b
->
CreateICmpNE
(
bytesToRead
,
bytesRead
),
readIncomplete
,
readExit
);
b
->
SetInsertPoint
(
readIncomplete
);
// Keep reading until the full stride is read, or there is no more data.
Value
*
moreToRead
=
b
->
CreateSub
(
bytesToRead
,
bytesRead
);
Value
*
readSoFar
=
b
->
CreateSub
(
strideBytes
,
moreToRead
);
Value
*
const
itemsRead
=
b
->
CreateUDiv
(
readSoFar
,
codeUnitBytes
);
Value
*
const
itemsRead
=
b
->
CreateUDiv
(
bytesRead
,
codeUnitBytes
);
Value
*
const
itemsBuffered
=
b
->
CreateAdd
(
produced
,
itemsRead
);
bytesToRead
->
addIncoming
(
moreToRead
,
readIncomplete
);
producedSoFar
->
addIncoming
(
itemsBuffered
,
readIncomplete
);
b
->
CreateCondBr
(
b
->
CreateICmpSGT
(
bytesRead
,
b
->
getSize
(
0
)),
readData
,
setTermination
);
b
->
CreateUnlikelyCondBr
(
b
->
CreateICmpULT
(
itemsBuffered
,
itemsPending
),
setTermination
,
readExit
);
// ... set the termination signal.
b
->
SetInsertPoint
(
setTermination
);
Value
*
const
bytesToZero
=
b
->
CreateMul
(
b
->
CreateSub
(
itemsPending
,
itemsBuffered
),
codeUnitBytes
);
...
...
This diff is collapsed.
Click to expand it.
lib/kernel/io/stdout_kernel.cpp
View file @
c9ab9ef7
...
...
@@ -100,18 +100,12 @@ void FileSink::generateInitializeMethod(BuilderRef b) {
void
FileSink
::
generateDoSegmentMethod
(
BuilderRef
b
)
{
Value
*
codeUnitBuffer
=
b
->
getInputStreamBlockPtr
(
"codeUnitBuffer"
,
b
->
getInt32
(
0
));
codeUnitBuffer
=
b
->
CreatePointerCast
(
codeUnitBuffer
,
b
->
getInt8PtrTy
());
//b->CallPrintInt("fileSink:codeUnitBuffer", codeUnitBuffer);
Value
*
bytesToDo
=
b
->
getAccessibleItemCount
(
"codeUnitBuffer"
);
if
(
LLVM_UNLIKELY
(
mCodeUnitWidth
>
8
))
{
bytesToDo
=
b
->
CreateMul
(
bytesToDo
,
b
->
getSize
(
mCodeUnitWidth
/
8
));
}
else
if
(
LLVM_UNLIKELY
(
mCodeUnitWidth
<
8
))
{
bytesToDo
=
b
->
CreateUDiv
(
bytesToDo
,
b
->
getSize
(
8
/
mCodeUnitWidth
));
}
//b->CallPrintInt("fileSink:bytesToDo", bytesToDo);
Value
*
const
fileDescriptor
=
b
->
getScalarField
(
"fileDescriptor"
);
b
->
CreateWriteCall
(
fileDescriptor
,
codeUnitBuffer
,
bytesToDo
);
}
...
...
This diff is collapsed.
Click to expand it.
lib/kernel/pipeline/compiler/analysis/buffer_analysis.hpp
View file @
c9ab9ef7
...
...
@@ -58,6 +58,7 @@ void PipelineAnalysis::addStreamSetsToBufferGraph(BuilderRef b) {
for
(
const
auto
e
:
make_iterator_range
(
out_edges
(
PipelineInput
,
mBufferGraph
)))
{
const
auto
streamSet
=
target
(
e
,
mBufferGraph
);
BufferNode
&
bn
=
mBufferGraph
[
streamSet
];
bn
.
Type
|=
BufferType
::
External
;
if
(
LLVM_LIKELY
(
bn
.
Buffer
==
nullptr
))
{
const
BufferPort
&
rate
=
mBufferGraph
[
e
];
const
Binding
&
input
=
rate
.
Binding
;
...
...
@@ -70,6 +71,7 @@ void PipelineAnalysis::addStreamSetsToBufferGraph(BuilderRef b) {
for
(
const
auto
e
:
make_iterator_range
(
in_edges
(
PipelineOutput
,
mBufferGraph
)))
{
const
auto
streamSet
=
source
(
e
,
mBufferGraph
);
BufferNode
&
bn
=
mBufferGraph
[
streamSet
];
bn
.
Type
|=
BufferType
::
External
;
if
(
LLVM_LIKELY
(
bn
.
Buffer
==
nullptr
))
{
const
BufferPort
&
rate
=
mBufferGraph
[
e
];
const
Binding
&
output
=
rate
.
Binding
;
...
...
@@ -295,8 +297,6 @@ void PipelineAnalysis::generateInitialBufferGraph() {
BufferPort
bp
(
port
,
binding
,
lb
,
ub
);
bp
.
Countable
=
isCountable
(
binding
);
if
(
LLVM_UNLIKELY
(
rate
.
getKind
()
==
RateId
::
Unknown
))
{
bp
.
IsManaged
=
true
;
}
...
...
@@ -323,13 +323,6 @@ void PipelineAnalysis::generateInitialBufferGraph() {
break
;
case
AttrId
::
Deferred
:
bp
.
IsDeferred
=
true
;
if
(
LLVM_UNLIKELY
(
!
bp
.
Countable
))
{
SmallVector
<
char
,
256
>
tmp
;
raw_svector_ostream
out
(
tmp
);
out
<<
kernelObj
->
getName
()
<<
"."
<<
binding
.
getName
()
<<
" cannot be both a Deferred and Non-Countable rate."
;
report_fatal_error
(
out
.
str
());
}
break
;
case
AttrId
::
SharedManagedBuffer
:
bp
.
IsShared
=
true
;
...
...
@@ -340,7 +333,6 @@ void PipelineAnalysis::generateInitialBufferGraph() {
default:
break
;
}
}
return
bp
;
};
...
...
@@ -488,22 +480,6 @@ void PipelineAnalysis::generateInitialBufferGraph() {
}
}
}
// fill in any unmanaged pipeline input buffers
for
(
const
auto
e
:
make_iterator_range
(
out_edges
(
PipelineInput
,
mBufferGraph
)))
{
const
auto
streamSet
=
target
(
e
,
mBufferGraph
);
BufferNode
&
bn
=
mBufferGraph
[
streamSet
];
bn
.
Type
|=
BufferType
::
External
;
}
// and pipeline output buffers ...
for
(
const
auto
e
:
make_iterator_range
(
in_edges
(
PipelineOutput
,
mBufferGraph
)))
{
const
auto
streamSet
=
source
(
e
,
mBufferGraph
);
BufferNode
&
bn
=
mBufferGraph
[
streamSet
];
bn
.
Type
|=
BufferType
::
External
;
}
}
/** ------------------------------------------------------------------------------------------------------------- *
...
...
@@ -641,72 +617,6 @@ void PipelineAnalysis::identifyLinearBuffers() {
#endif
}
/** ------------------------------------------------------------------------------------------------------------- *
* @brief identifyDirectUpdatesToStateObjects
** ------------------------------------------------------------------------------------------------------------- */
void
PipelineAnalysis
::
identifyDirectUpdatesToStateObjects
()
{
// We can only safely use the processed item count if it's the last use of it
// and that consumer only uses it once.
SmallVector
<
unsigned
,
64
>
lastConsumer
(
LastStreamSet
-
FirstStreamSet
+
1U
);
for
(
auto
streamSet
=
FirstStreamSet
;
streamSet
<=
LastStreamSet
;
++
streamSet
)
{
bool
multipleUsages
=
false
;
auto
lastKernel
=
PipelineInput
;
for
(
const
auto
e
:
make_iterator_range
(
out_edges
(
streamSet
,
mBufferGraph
)))
{
const
auto
consumer
=
target
(
e
,
mBufferGraph
);
if
(
consumer
>
lastKernel
)
{
lastKernel
=
consumer
;
multipleUsages
=
false
;
}
else
if
(
LLVM_UNLIKELY
(
consumer
==
lastKernel
))
{
multipleUsages
=
true
;
}
}
lastConsumer
[
streamSet
-
FirstStreamSet
]
=
multipleUsages
?
-
1U
:
lastKernel
;
}
for
(
auto
kernel
=
FirstKernel
;
kernel
<=
LastKernel
;
++
kernel
)
{
const
Kernel
*
const
kernelObj
=
getKernel
(
kernel
);
const
auto
isInternallySynchronized
=
kernelObj
->
hasAttribute
(
AttrId
::
InternallySynchronized
);
const
auto
canTerminateEarly
=
kernelObj
->
canSetTerminateSignal
();
const
auto
passOutputByAddress
=
isInternallySynchronized
||
canTerminateEarly
;
for
(
const
auto
e
:
make_iterator_range
(
in_edges
(
kernel
,
mBufferGraph
)))
{
const
auto
streamSet
=
source
(
e
,
mBufferGraph
);
assert
(
streamSet
>=
FirstStreamSet
&&
streamSet
<=
LastStreamSet
);
const
BufferNode
&
bn
=
mBufferGraph
[
streamSet
];
BufferPort
&
rt
=
mBufferGraph
[
e
];
// All uses of an external item count refer to the same processed field.
bool
safeToUpdate
=
true
;
if
(
LLVM_UNLIKELY
(
bn
.
isExternal
()))
{
const
auto
lastConsumedId
=
lastConsumer
[
streamSet
-
FirstStreamSet
];
safeToUpdate
=
(
lastConsumedId
==
kernel
);
}
const
auto
takeInputAddress
=
isInternallySynchronized
||
rt
.
IsDeferred
;
const
auto
nonCountable
=
!
rt
.
Countable
;
rt
.
Addressable
=
(
takeInputAddress
||
nonCountable
);
rt
.
DirectlyUpdatesInternalState
=
safeToUpdate
&&
(
nonCountable
||
isInternallySynchronized
);
rt
.
StoreItemCount
=
safeToUpdate
&&
(
rt
.
IsDeferred
||
!
rt
.
DirectlyUpdatesInternalState
);
}
for
(
const
auto
e
:
make_iterator_range
(
out_edges
(
kernel
,
mBufferGraph
)))
{
BufferPort
&
rt
=
mBufferGraph
[
e
];
const
auto
streamSet
=
target
(
e
,
mBufferGraph
);
assert
(
streamSet
>=
FirstStreamSet
&&
streamSet
<=
LastStreamSet
);
const
auto
takeOutputAddress
=
passOutputByAddress
||
rt
.
IsDeferred
;
const
auto
nonCountable
=
!
rt
.
Countable
;
rt
.
Addressable
=
takeOutputAddress
||
nonCountable
;
rt
.
StoreItemCount
=
true
;
// If this kernel can terminate early, we need to store the item count
// that it may end up returning in the case of an unexpected termination.
rt
.
DirectlyUpdatesInternalState
=
(
nonCountable
&&
!
canTerminateEarly
)
||
isInternallySynchronized
;
}
}
}
/** ------------------------------------------------------------------------------------------------------------- *
* @brief identifyNonLocalBuffers
...
...
This diff is collapsed.
Click to expand it.
lib/kernel/pipeline/compiler/analysis/consumer_analysis.hpp
View file @
c9ab9ef7
...
...
@@ -19,7 +19,6 @@ void PipelineAnalysis::makeConsumerGraph() {
flat_set
<
unsigned
>
observedGlobalPortIds
;
for
(
auto
streamSet
=
FirstStreamSet
;
streamSet
<=
LastStreamSet
;
++
streamSet
)
{
// copy the producing edge
const
auto
pe
=
in_edge
(
streamSet
,
mBufferGraph
);
const
BufferPort
&
br
=
mBufferGraph
[
pe
];
...
...
@@ -32,6 +31,7 @@ void PipelineAnalysis::makeConsumerGraph() {
continue
;
}
auto
lastConsumer
=
PipelineInput
;
auto
index
=
0U
;
// flag the production rate as ignorable by inserting it upfront
...
...
@@ -39,18 +39,19 @@ void PipelineAnalysis::makeConsumerGraph() {
for
(
const
auto
ce
:
make_iterator_range
(
out_edges
(
streamSet
,
mBufferGraph
)))
{
const
BufferPort
&
br
=
mBufferGraph
[
ce
];
const
auto
consumer
=
target
(
ce
,
mBufferGraph
);
if
(
LLVM_UNLIKELY
(
consumer
==
PipelineOutput
&&
producer
!=
PipelineInput
))
{
continue
;
}
// check if any consumer has a rate we have not yet observed
lastConsumer
=
std
::
max
<
unsigned
>
(
lastConsumer
,
consumer
);
#ifndef TEST_ALL_CONSUMERS
if
(
observedGlobalPortIds
.
insert
(
br
.
GlobalPortId
).
second
)
{
#endif
auto
testConsumer
=
[
&
]()
{
#ifndef TEST_ALL_CONSUMERS
return
observedGlobalPortIds
.
insert
(
br
.
GlobalPortId
).
second
;
#else
return
true
;
#endif
};
if
(
testConsumer
())
{
lastConsumer
=
std
::
max
<
unsigned
>
(
lastConsumer
,
consumer
);
add_edge
(
streamSet
,
consumer
,
ConsumerEdge
{
br
.
Port
,
++
index
,
ConsumerEdge
::
UpdatePhi
},
mConsumerGraph
);
#ifndef TEST_ALL_CONSUMERS
}
#endif
}
observedGlobalPortIds
.
clear
();
...
...
@@ -76,11 +77,27 @@ void PipelineAnalysis::makeConsumerGraph() {
}
}
#ifdef PRINT_CONSUMER_GRAPH
// If this is a pipeline input, we want to update the count at the end of the loop.
for
(
const
auto
e
:
make_iterator_range
(
out_edges
(
PipelineInput
,
mBufferGraph
)))
{
const
auto
streamSet
=
target
(
e
,
mBufferGraph
);
ConsumerGraph
::
edge_descriptor
f
;
bool
exists
;
std
::
tie
(
f
,
exists
)
=
edge
(
streamSet
,
PipelineOutput
,
mConsumerGraph
);
const
auto
flags
=
ConsumerEdge
::
UpdateExternalCount
;
if
(
exists
)
{
ConsumerEdge
&
cn
=
mConsumerGraph
[
f
];
cn
.
Flags
|=
flags
;
}
else
{
const
BufferPort
&
br
=
mBufferGraph
[
e
];
add_edge
(
streamSet
,
PipelineOutput
,
ConsumerEdge
{
br
.
Port
,
0
,
flags
},
mConsumerGraph
);
}
}
#if 0
auto & out = errs();
out
<<
"digraph
\"
ConsumerGraph
_"
<<
mPipelineKernel
->
getName
()
<<
"
\"
{
\n
"
;
out << "digraph \"ConsumerGraph\" {\n";
for (auto v : make_iterator_range(vertices(mConsumerGraph))) {
out << "v" << v << " [label=\"" << v << "\"];\n";
}
...
...
This diff is collapsed.
Click to expand it.
lib/kernel/pipeline/compiler/analysis/partitioning_analysis.hpp
View file @
c9ab9ef7
...
...
@@ -2,7 +2,6 @@
#define PARTITIONING_ANALYSIS_HPP
#include "pipeline_analysis.hpp"
#include <toolchain/toolchain.h>
#include <util/slab_allocator.h>
namespace
kernel
{
...
...
@@ -902,12 +901,7 @@ found: ++i;
// shares the same kernels as the first partition of another and we can schedule one after the other,
// this may improve I-Cache utilization.
#if Z3_VERSION_INTEGER >= LLVM_VERSION_CODE(4, 8, 0)
if
(
Z3_optimize_check
(
ctx
,
solver
,
0
,
nullptr
)
!=
Z3_L_TRUE
)
#else
if
(
Z3_optimize_check
(
ctx
,
solver
)
!=
Z3_L_TRUE
)
#endif
{
if
(
Z3_optimize_check
(
ctx
,
solver
)
!=
Z3_L_TRUE
)
{
report_fatal_error
(
"Z3 failed to find a partition ordering solution"
);
}
...
...
@@ -1188,12 +1182,8 @@ found: ++i;
}
END_SCOPED_REGION
#if Z3_VERSION_INTEGER >= LLVM_VERSION_CODE(4, 8, 0)
if
(
Z3_optimize_check
(
ctx
,
solver
,
0
,
nullptr
)
==
Z3_L_FALSE
)
#else
if
(
Z3_optimize_check
(
ctx
,
solver
)
==
Z3_L_FALSE
)
#endif
{
if
(
Z3_optimize_check
(
ctx
,
solver
)
==
Z3_L_FALSE
)
{
report_fatal_error
(
"Z3 failed to find a kernel ordering solution"
);
}
...
...
@@ -1345,7 +1335,7 @@ void PipelineAnalysis::determinePartitionJumpIndices() {
for
(
auto
u
=
PartitionCount
;
u
--
;
)
{
// forward topological ordering
assert
(
out_degree
(
u
,
G
)
>
0
);
M
.
set
();
M
.
set
(
0
,
PartitionCount
,
true
);
assert
(
M
.
count
()
==
PartitionCount
);
for
(
const
auto
e
:
make_iterator_range
(
out_edges
(
u
,
G
)))
{
const
auto
v
=
target
(
e
,
G
);
...
...
This diff is collapsed.
Click to expand it.
lib/kernel/pipeline/compiler/analysis/pipeline_analysis.hpp
View file @
c9ab9ef7
...
...
@@ -53,7 +53,6 @@ public:
P
.
makeTerminationPropagationGraph
();
// Finish the buffer graph
P
.
identifyDirectUpdatesToStateObjects
();
P
.
addStreamSetsToBufferGraph
(
b
);
P
.
gatherInfo
();
...
...
@@ -128,7 +127,6 @@ private:
void
identifyLinearBuffers
();
void
identifyNonLocalBuffers
();
void
identifyLocalPortIds
();
void
identifyDirectUpdatesToStateObjects
();
// consumer analysis functions
...
...
@@ -218,7 +216,7 @@ public:
OwningVector
<
Kernel
>
mInternalKernels
;
OwningVector
<
Binding
>
mInternalBindings
;
OwningVec
tor
<
StreamSetBuffer
>
mInternalBuffers
;
OwningVec
<
StreamSetBuffer
>
mInternalBuffers
;
};
}
...
...
This diff is collapsed.
Click to expand it.
lib/kernel/pipeline/compiler/buffer_management_logic.hpp
View file @
c9ab9ef7
...
...
@@ -100,14 +100,7 @@ void PipelineCompiler::allocateOwnedBuffers(BuilderRef b, Value * const expected
b
->
CreateCall
(
func
,
params
);
}
}
// and allocate any output buffers
#ifdef PRINT_DEBUG_MESSAGES
Constant
*
const
pipelineName
=
b
->
GetString
(
mTarget
->
getName
());
SmallVector
<
char
,
256
>
tmp
;
raw_svector_ostream
out
(
tmp
);
out
<<
i
<<
"."
<<
getKernel
(
i
)
->
getName
();
Constant
*
const
kernelName
=
b
->
GetString
(
out
.
str
());
#endif
// and allocate any output buffers
for
(
const
auto
e
:
make_iterator_range
(
out_edges
(
i
,
mBufferGraph
)))
{
const
auto
streamSet
=
target
(
e
,
mBufferGraph
);
const
BufferNode
&
bn
=
mBufferGraph
[
streamSet
];
...
...
@@ -124,12 +117,7 @@ void PipelineCompiler::allocateOwnedBuffers(BuilderRef b, Value * const expected
assert
(
"a threadlocal buffer cannot be external"
&&
(
bn
.
isInternal
()
||
nonLocal
));
assert
(
buffer
->
getHandle
());
assert
(
isFromCurrentFunction
(
b
,
buffer
->
getHandle
(),
false
));
buffer
->
allocateBuffer
(
b
,
expectedNumOfStrides
);
#ifdef PRINT_DEBUG_MESSAGES
const
BufferPort
&
rd
=
mBufferGraph
[
e
];
const
Binding
&
binding
=
rd
.
Binding
;
debugPrint
(
b
,
"%s:%s.%s capacity = %"
PRId64
,
pipelineName
,
kernelName
,
b
->
GetString
(
binding
.
getName
()),
buffer
->
getCapacity
(
b
));
#endif
buffer
->
allocateBuffer
(
b
,
expectedNumOfStrides
);
}
}
...
...
@@ -230,85 +218,35 @@ void PipelineCompiler::readProcessedItemCounts(BuilderRef b) {
for
(
const
auto
e
:
make_iterator_range
(
in_edges
(
mKernelId
,
mBufferGraph
)))
{
const
BufferPort
&
br
=
mBufferGraph
[
e
];
const
auto
inputPort
=
br
.
Port
;
const
auto
streamSet
=
source
(
e
,
mBufferGraph
);
const
BufferNode
&
node
=
mBufferGraph
[
streamSet
];
#ifndef STORE_EXTERNAL_PROCESSED_ITEM_COUNTS
if
(
LLVM_UNLIKELY
(
node
.
isExternal
()))
{
bool
found
=
true
;
for
(
const
auto
f
:
make_iterator_range
(
in_edges
(
streamSet
,
mBufferGraph
)))
{
if
(
source
(
f
,
mBufferGraph
)
==
PipelineInput
)
{
const
BufferPort
&
external
=
mBufferGraph
[
f
];
Value
*
const
processed
=
getProcessedInputItemsPtr
(
external
.
Port
.
Number
);
mProcessedItemCountPtr
[
inputPort
]
=
processed
;
// mProcessedItemCountPtr[inputPort] = mExternallyProcessedItemPtr[streamSet];
// assert (mInitiallyProcessedExternalItems[streamSet]);
// mInitiallyProcessedItemCount[inputPort] = mInitiallyProcessedExternalItems[streamSet];
mInitiallyProcessedItemCount
[
inputPort
]
=
b
->
CreateLoad
(
processed
);
break
;
}
}
assert
(
"cannot locate external processed item count?"
&&
found
);
}
else
{
// internal item count
#endif
const
auto
prefix
=
makeBufferName
(
mKernelId
,
inputPort
);
Value
*
const
processed
=
b
->
getScalarFieldPtr
(
prefix
+
ITEM_COUNT_SUFFIX
);
mProcessedItemCountPtr
[
inputPort
]
=
processed
;
mInitiallyProcessedItemCount
[
inputPort
]
=
b
->
CreateLoad
(
processed
);
if
(
br
.
IsDeferred
)
{
Value
*
const
deferred
=
b
->
getScalarFieldPtr
(
prefix
+
DEFERRED_ITEM_COUNT_SUFFIX
);
mProcessedDeferredItemCountPtr
[
inputPort
]
=
deferred
;
mInitiallyProcessedDeferredItemCount
[
inputPort
]
=
b
->
CreateLoad
(
deferred
);
}
#ifndef STORE_EXTERNAL_PROCESSED_ITEM_COUNTS
const
auto
prefix
=
makeBufferName
(
mKernelId
,
inputPort
);
Value
*
const
processed
=
b
->
getScalarFieldPtr
(
prefix
+
ITEM_COUNT_SUFFIX
);
mProcessedItemCountPtr
[
inputPort
]
=
processed
;
mInitiallyProcessedItemCount
[
inputPort
]
=
b
->
CreateLoad
(
processed
);
if
(
br
.
IsDeferred
)
{
Value
*
const
deferred
=
b
->
getScalarFieldPtr
(
prefix
+
DEFERRED_ITEM_COUNT_SUFFIX
);
mProcessedDeferredItemCountPtr
[
inputPort
]
=
deferred
;
mInitiallyProcessedDeferredItemCount
[
inputPort
]
=
b
->
CreateLoad
(
deferred
);
}
#endif
}
}
/** ------------------------------------------------------------------------------------------------------------- *
 * @brief writeExternalProcessedItemCounts
 *
 * Currently emits nothing: the body consists solely of the commented-out sketch kept below.
 ** ------------------------------------------------------------------------------------------------------------- */
void PipelineCompiler::writeExternalProcessedItemCounts(BuilderRef /* b */) {
    // Disabled sketch, retained for reference. For every streamset fed by PipelineInput it
    // would allocate a size-typed slot at the entry point, zero it, and remember the pointer
    // in mExternallyProcessedItemPtr:
    //
    //   for (const auto e : make_iterator_range(out_edges(PipelineInput, mBufferGraph))) {
    //       const auto streamSet = target(e, mBufferGraph);
    //       Value * const ptr = b->CreateAllocaAtEntryPoint(b->getSizeTy());
    //       b->CreateStore(b->getSize(0), ptr);
    //       mExternallyProcessedItemPtr[streamSet] = ptr;
    //   }
}
/** ------------------------------------------------------------------------------------------------------------- *
* @brief readProducedItemCounts
** ------------------------------------------------------------------------------------------------------------- */
void
PipelineCompiler
::
readProducedItemCounts
(
BuilderRef
b
)
{
for
(
const
auto
e
:
make_iterator_range
(
out_edges
(
mKernelId
,
mBufferGraph
)))
{
const
BufferPort
&
br
=
mBufferGraph
[
e
];
const
auto
outputPort
=
br
.
Port
;
const
auto
prefix
=
makeBufferName
(
mKernelId
,
outputPort
);
const
auto
streamSet
=
target
(
e
,
mBufferGraph
);
const
BufferPort
&
output
=
mBufferGraph
[
e
];
const
auto
outputPort
=
output
.
Port
;
const
BufferNode
&
node
=
mBufferGraph
[
streamSet
];
if
(
LLVM_UNLIKELY
(
node
.
isExternal
()))
{
bool
found
=
true
;
for
(
const
auto
f
:
make_iterator_range
(
out_edges
(
streamSet
,
mBufferGraph
)))
{
if
(
target
(
f
,
mBufferGraph
)
==
PipelineOutput
)
{
const
BufferPort
&
external
=
mBufferGraph
[
f
];
Value
*
const
produced
=
getProducedOutputItemsPtr
(
external
.
Port
.
Number
);
mProducedItemCountPtr
[
outputPort
]
=
produced
;
mInitiallyProducedItemCount
[
streamSet
]
=
b
->
CreateLoad
(
produced
);
break
;
}
}
assert
(
"cannot locate external produced item count?"
&&
found
);
}
else
{
// internal item count
const
auto
prefix
=
makeBufferName
(
mKernelId
,
outputPort
);
Value
*
const
produced
=
b
->
getScalarFieldPtr
(
prefix
+
ITEM_COUNT_SUFFIX
);
mProducedItemCountPtr
[
outputPort
]
=
produced
;
mInitiallyProducedItemCount
[
streamSet
]
=
b
->
CreateLoad
(
produced
);
if
(
output
.
IsDeferred
)
{
Value
*
const
deferred
=
b
->
getScalarField
(
prefix
+
DEFERRED_ITEM_COUNT_SUFFIX
);
mProducedDeferredItemCountPtr
[
outputPort
]
=
deferred
;
mInitiallyProducedDeferredItemCount
[
streamSet
]
=
b
->
CreateLoad
(
deferred
);
}
Value
*
const
produced
=
b
->
getScalarFieldPtr
(
prefix
+
ITEM_COUNT_SUFFIX
);
mProducedItemCountPtr
[
outputPort
]
=
produced
;
mInitiallyProducedItemCount
[
streamSet
]
=
b
->
CreateLoad
(
produced
);
if
(
br
.
IsDeferred
)
{
Value
*
const
deferred
=
b
->
getScalarField
(
prefix
+
DEFERRED_ITEM_COUNT_SUFFIX
);
mProducedDeferredItemCountPtr
[
outputPort
]
=
deferred
;
mInitiallyProducedDeferredItemCount
[
streamSet
]
=
b
->
CreateLoad
(
deferred
);
}
}
}
...
...
@@ -352,59 +290,44 @@ void PipelineCompiler::setLocallyAvailableItemCount(BuilderRef /* b */, const St
mLocallyAvailableItems
[
streamSet
]
=
available
;
}
/** ------------------------------------------------------------------------------------------------------------- *
* @brief writeUpdatedItemCounts
** ------------------------------------------------------------------------------------------------------------- */
void
PipelineCompiler
::
writeUpdatedItemCounts
(
BuilderRef
b
)
{
if
(
mKernelIsInternallySynchronized
)
{
return
;
}
for
(
const
auto
e
:
make_iterator_range
(
in_edges
(
mKernelId
,
mBufferGraph
)))
{
const
BufferPort
&
br
=
mBufferGraph
[
e
];
if
(
br
.
StoreItemCount
)
{
const
StreamSetPort
inputPort
=
br
.
Port
;
#ifdef PRINT_DEBUG_MESSAGES
const
auto
prefix
=
b
->
GetString
(
makeBufferName
(
mKernelId
,
inputPort
));
#endif
if
(
br
.
IsDeferred
)
{
// If this kernel has a deferred rate and we directly pass the state object's item count field
// to the kernel, that kernel will update the deferred count but still leave the undeferred
// count untouched.
if
(
!
br
.
DirectlyUpdatesInternalState
)
{
b
->
CreateStore
(
mUpdatedProcessedDeferredPhi
[
inputPort
],
mProcessedDeferredItemCountPtr
[
inputPort
]);
#ifdef PRINT_DEBUG_MESSAGES
debugPrint
(
b
,
" @ writing %s_processed(deferred) = %"
PRIu64
,
prefix
,
mUpdatedProcessedDeferredPhi
[
inputPort
]);
#endif
}
}
else
if
(
br
.
DirectlyUpdatesInternalState
)
{
continue
;
}
b
->
CreateStore
(
mUpdatedProcessedPhi
[
inputPort
],
mProcessedItemCountPtr
[
inputPort
]);
const
StreamSetPort
inputPort
=
br
.
Port
;
b
->
CreateStore
(
mUpdatedProcessedPhi
[
inputPort
],
mProcessedItemCountPtr
[
inputPort
]);
#ifdef PRINT_DEBUG_MESSAGES
const
auto
prefix
=
makeBufferName
(
mKernelId
,
inputPort
);
debugPrint
(
b
,
" @ writing "
+
prefix
+
"_processed = %"
PRIu64
,
mUpdatedProcessedPhi
[
inputPort
]);
#endif
if
(
br
.
IsDeferred
)
{
b
->
CreateStore
(
mUpdatedProcessedDeferredPhi
[
inputPort
],
mProcessedDeferredItemCountPtr
[
inputPort
]);
#ifdef PRINT_DEBUG_MESSAGES
debugPrint
(
b
,
" @ writing
%s_processed
= %"
PRIu64
,
prefix
,
mUpdatedProcessedPhi
[
inputPort
]);
debugPrint
(
b
,
" @ writing
"
+
prefix
+
"_processed(deferred)
= %"
PRIu64
,
mUpdatedProcessed
Deferred
Phi
[
inputPort
]);
#endif
}
}
for
(
const
auto
e
:
make_iterator_range
(
out_edges
(
mKernelId
,
mBufferGraph
)))
{
const
BufferPort
&
br
=
mBufferGraph
[
e
];
if
(
br
.
StoreItemCount
)
{
const
StreamSetPort
outputPort
=
br
.
Port
;
#ifdef PRINT_DEBUG_MESSAGES
const
auto
prefix
=
b
->
GetString
(
makeBufferName
(
mKernelId
,
outputPort
));
#endif
if
(
br
.
IsDeferred
)
{
if
(
!
br
.
DirectlyUpdatesInternalState
)
{
b
->
CreateStore
(
mUpdatedProducedDeferredPhi
[
outputPort
],
mProducedDeferredItemCountPtr
[
outputPort
]);
#ifdef PRINT_DEBUG_MESSAGES
debugPrint
(
b
,
" @ writing %s_produced(deferred) = %"
PRIu64
,
prefix
,
mUpdatedProducedDeferredPhi
[
outputPort
]);
#endif
}
}
else
if
(
br
.
DirectlyUpdatesInternalState
)
{
continue
;
}
b
->
CreateStore
(
mUpdatedProducedPhi
[
outputPort
],
mProducedItemCountPtr
[
outputPort
]);
const
StreamSetPort
outputPort
=
br
.
Port
;
b
->
CreateStore
(
mUpdatedProducedPhi
[
outputPort
],
mProducedItemCountPtr
[
outputPort
]);
#ifdef PRINT_DEBUG_MESSAGES
const
auto
prefix
=
makeBufferName
(
mKernelId
,
outputPort
);
debugPrint
(
b
,
" @ writing "
+
prefix
+
"_produced = %"
PRIu64
,
mUpdatedProducedPhi
[
outputPort
]);
#endif
if
(
br
.
IsDeferred
)
{
b
->
CreateStore
(
mUpdatedProducedDeferredPhi
[
outputPort
],
mProducedDeferredItemCountPtr
[
outputPort
]);
#ifdef PRINT_DEBUG_MESSAGES
debugPrint
(
b
,
" @ writing
%s_produced
= %"
PRIu64
,
prefix
,
mUpdatedProducedPhi
[
outputPort
]);
debugPrint
(
b
,
" @ writing
"
+
prefix
+
"_produced(deferred)
= %"
PRIu64
,
mUpdatedProduced
Deferred
Phi
[
outputPort
]);
#endif
}
}
...
...
This diff is collapsed.
Click to expand it.
lib/kernel/pipeline/compiler/common/common.hpp
View file @
c9ab9ef7
...
...
@@ -5,10 +5,6 @@
namespace
kernel
{
#ifndef MSC_VER
typedef
long
long
int
__int64
;
#endif
template
<
typename
T
,
unsigned
n
=
16
>
using
Vec
=
SmallVector
<
T
,
n
>
;
...
...
@@ -163,6 +159,10 @@ private:
FixedVector
<
T
>
mArray
;
};
template
<
typename
T
>
using
OwningVec
=
std
::
vector
<
std
::
unique_ptr
<
T
>>
;
#ifndef NDEBUG
static
bool
isFromCurrentFunction
(
BuilderRef
b
,
const
Value
*
const
value
,
const
bool
allowNull
=
true
)
{
if
(
value
==
nullptr
)
{
...
...
This diff is collapsed.
Click to expand it.
lib/kernel/pipeline/compiler/common/graphs.h
View file @
c9ab9ef7
...
...
@@ -260,10 +260,6 @@ struct BufferNode {
return
(
Type
&
BufferType
::
Shared
)
!=
0
;
}
bool
isDynamic
()
const
{
assert
(
Buffer
);
return
isa
<
DynamicBuffer
>
(
Buffer
);
}
};
...
...
@@ -291,14 +287,8 @@ struct BufferPort {
bool
IsShared
=
false
;
bool
IsManaged
=
false
;
bool
Countable
=
false
;
bool
Addressable
=
false
;
bool
DirectlyUpdatesInternalState
=
false
;
bool
StoreItemCount
=
false
;
int
TransitiveAdd
=
0
;
bool
operator
<
(
const
BufferPort
&
rn
)
const
{
if
(
LLVM_LIKELY
(
Port
.
Type
==
rn
.
Port
.
Type
))
{
return
Port
.
Number
<
rn
.
Port
.
Number
;
...
...
This diff is collapsed.
Click to expand it.
lib/kernel/pipeline/compiler/config.h
View file @
c9ab9ef7
#ifndef PIPELINE_KERNEL_COMPILER_CONFIG_H
#define PIPELINE_KERNEL_COMPILER_CONFIG_H
//#define PRINT_DEBUG_MESSAGES
//
#define PRINT_DEBUG_MESSAGES
// #define DISABLE_ZERO_EXTEND
...
...
@@ -13,18 +13,10 @@
// #define FORCE_PIPELINE_ASSERTIONS
// #define DISABLE_PIPELINE_ASSERTIONS
// #define FORCE_SYNCHRONIZATION_FOR_ALL_KERNELS
// #define FORCE_EACH_KERNEL_INTO_UNIQUE_PARTITION
// #define TEST_ALL_CONSUMERS
// #define STORE_EXTERNAL_PROCESSED_ITEM_COUNTS
//#define PRINT_BUFFER_GRAPH
//#define PRINT_CONSUMER_GRAPH
// #define PRINT_BUFFER_GRAPH
#endif // PIPELINE_KERNEL_COMPILER_CONFIG_H
This diff is collapsed.
Click to expand it.
lib/kernel/pipeline/compiler/consumer_logic.hpp
View file @
c9ab9ef7
...
...
@@ -9,59 +9,38 @@ namespace kernel {
* @brief addConsumerKernelProperties
** ------------------------------------------------------------------------------------------------------------- */
inline
void
PipelineCompiler
::
addConsumerKernelProperties
(
BuilderRef
b
,
const
unsigned
producer
)
{
//
if (producer != PipelineInput || mTraceIndividualConsumedItemCounts) {
if
(
producer
!=
PipelineInput
||
mTraceIndividualConsumedItemCounts
)
{
IntegerType
*
const
sizeTy
=
b
->
getSizeTy
();
for
(
const
auto
e
:
make_iterator_range
(
out_edges
(
producer
,
mBufferGraph
)))
{
const
auto
streamSet
=
target
(
e
,
mBufferGraph
);
// If we have a buffer with only external consumers, we do not need to maintain the
// state for it.
bool
atLeastOneInternalConsumer
=
false
;
for
(
const
auto
e
:
make_iterator_range
(
out_edges
(
streamSet
,
mBufferGraph
)))
{
const
auto
consumer
=
target
(
e
,
mBufferGraph
);
if
(
consumer
!=
PipelineOutput
)
{
atLeastOneInternalConsumer
=
true
;
break
;
const
BufferNode
&
bn
=
mBufferGraph
[
streamSet
];
// If the out-degree for this buffer is zero, then we've proven that its consumption rate
// is identical to its production rate.
const
auto
numOfIndependentConsumers
=
out_degree
(
streamSet
,
mConsumerGraph
);
if
(
LLVM_UNLIKELY
(
numOfIndependentConsumers
!=
0
))
{
const
BufferPort
&
rd
=
mBufferGraph
[
e
];
assert
(
rd
.
Port
.
Type
==
PortType
::
Output
);
const
auto
prefix
=
makeBufferName
(
producer
,
rd
.
Port
);
const
auto
name
=
prefix
+
CONSUMED_ITEM_COUNT_SUFFIX
;
// If we're tracing the consumer item counts, we need to store one for each
// (non-nested) consumer. Any nested consumers will have their own trace.
Type
*
countTy
=
sizeTy
;
if
(
LLVM_UNLIKELY
(
mTraceIndividualConsumedItemCounts
))
{
countTy
=
ArrayType
::
get
(
sizeTy
,
numOfIndependentConsumers
+
1
);
}
}
if
(
LLVM_LIKELY
(
atLeastOneInternalConsumer
))
{
// If the out-degree for this buffer is zero, then we've proven that its consumption rate
// is identical to its production rate.
const
auto
numOfIndependentConsumers
=
out_degree
(
streamSet
,
mConsumerGraph
);
assert
(
numOfIndependentConsumers
<=
out_degree
(
streamSet
,
mBufferGraph
));
const
BufferNode
&
bn
=
mBufferGraph
[
streamSet
];
if
(
LLVM_UNLIKELY
(
numOfIndependentConsumers
!=
0
||
bn
.
isExternal
()))
{
if
(
LLVM_LIKELY
(
bn
.
isOwned
()
||
bn
.
isInternal
()
||
mTraceIndividualConsumedItemCounts
))
{
// If we're tracing the consumer item counts, we need to store one for each
// (non-nested) consumer. Any nested consumers will have their own trace.
Type
*
countTy
=
sizeTy
;
if
(
LLVM_UNLIKELY
(
mTraceIndividualConsumedItemCounts
))
{
countTy
=
ArrayType
::
get
(
sizeTy
,
numOfIndependentConsumers
+
1
);
}
const
BufferPort
&
rd
=
mBufferGraph
[
e
];
assert
(
rd
.
Port
.
Type
==
PortType
::
Output
);
const
auto
prefix
=
makeBufferName
(
producer
,
rd
.
Port
);
if
(
numOfIndependentConsumers
>
0
&&
atLeastOneInternalConsumer
)
{
mTarget
->
addInternalScalar
(
countTy
,
prefix
+
CONSUMED_ITEM_COUNT_SUFFIX
,
producer
);
}
else
{
mTarget
->
addNonPersistentScalar
(
countTy
,
prefix
+
CONSUMED_ITEM_COUNT_SUFFIX
);
}
}
if
(
LLVM_LIKELY
(
bn
.
isOwned
()
||
bn
.
isInternal
()
||
mTraceIndividualConsumedItemCounts
))
{
mTarget
->
addInternalScalar
(
countTy
,
name
,
producer
);
}
else
{
mTarget
->
addNonPersistentScalar
(
countTy
,
name
);
}
}
}
//}
}
}
/** ------------------------------------------------------------------------------------------------------------- *
...
...
@@ -86,23 +65,19 @@ void PipelineCompiler::readConsumedItemCounts(BuilderRef b) {
* @brief readExternalConsumerItemCounts
** ------------------------------------------------------------------------------------------------------------- */
inline void PipelineCompiler::readExternalConsumerItemCounts(BuilderRef b) {
    // Walk every streamset that feeds PipelineOutput. For each one whose buffer node reports
    // isOwned(), fetch the externally supplied consumed count of the matching pipeline output
    // port and, when required, record it in slot 0 via setConsumedItemCount.
    //
    // NOTE: a commented-out variant of this loop also accepted bn.isShared() buffers and
    // additionally stored the value in mInitialConsumedItemCount[streamSet].
    for (const auto output : make_iterator_range(in_edges(PipelineOutput, mBufferGraph))) {
        const auto streamSet = source(output, mBufferGraph);
        const BufferNode & bn = mBufferGraph[streamSet];
        if (LLVM_UNLIKELY(!bn.isOwned())) {
            continue;
        }
        const BufferPort & externalPort = mBufferGraph[output];
        Value * const consumed = getConsumedOutputItems(externalPort.Port.Number);
        assert (consumed);
        // The count is only recorded when the consumer graph has at least one outgoing edge
        // for this streamset, or when the streamset is produced by PipelineInput.
        const bool hasIndependentConsumers = (out_degree(streamSet, mConsumerGraph) != 0);
        const bool producedByPipelineInput = (parent(streamSet, mBufferGraph) == PipelineInput);
        if (LLVM_UNLIKELY(hasIndependentConsumers || producedByPipelineInput)) {
            setConsumedItemCount(b, streamSet, consumed, 0);
        }
    }
}
/** ------------------------------------------------------------------------------------------------------------- *
...
...
@@ -110,88 +85,58 @@ inline void PipelineCompiler::readExternalConsumerItemCounts(BuilderRef b) {
** ------------------------------------------------------------------------------------------------------------- */
Value
*
PipelineCompiler
::
readConsumedItemCount
(
BuilderRef
b
,
const
size_t
streamSet
,
const
bool
useFinalCount
)
{
Value
*
consumed
=
nullptr
;
const
BufferNode
&
bn
=
mBufferGraph
[
streamSet
];
if
(
out_degree
(
streamSet
,
mConsumerGraph
)
==
0
)
{
if
(
LLVM_LIKELY
(
bn
.
isInternal
()))
{
// This stream either has no consumers or we've proven that
// its consumption rate is identical to its production rate.
if
(
useFinalCount
)
{
consumed
=
mLocallyAvailableItems
[
streamSet
];
// This stream either has no consumers or we've proven that
// its consumption rate is identical to its production rate.
Value
*
produced
=
nullptr
;
if
(
useFinalCount
)
{
produced
=
mLocallyAvailableItems
[
streamSet
];
}
else
{
produced
=
mInitiallyProducedItemCount
[
streamSet
];
}
const
auto
e
=
in_edge
(
streamSet
,
mBufferGraph
);
const
BufferPort
&
port
=
mBufferGraph
[
e
];
if
(
LLVM_UNLIKELY
(
produced
==
nullptr
))
{
const
auto
producer
=
source
(
e
,
mBufferGraph
);
const
auto
prefix
=
makeBufferName
(
producer
,
port
.
Port
);
if
(
LLVM_UNLIKELY
(
port
.
IsDeferred
))
{
produced
=
b
->
getScalarField
(
prefix
+
DEFERRED_ITEM_COUNT_SUFFIX
);
}
else
{
consumed
=
mInitiallyProducedItemCount
[
streamSet
];
}
const
auto
e
=
in_edge
(
streamSet
,
mBufferGraph
);
const
BufferPort
&
port
=
mBufferGraph
[
e
];
if
(
LLVM_UNLIKELY
(
consumed
==
nullptr
))
{
const
auto
producer
=
source
(
e
,
mBufferGraph
);
const
auto
prefix
=
makeBufferName
(
producer
,
port
.
Port
);
if
(
LLVM_UNLIKELY
(
port
.
IsDeferred
))
{
consumed
=
b
->
getScalarField
(
prefix
+
DEFERRED_ITEM_COUNT_SUFFIX
);
}
else
{
consumed
=
b
->
getScalarField
(
prefix
+
ITEM_COUNT_SUFFIX
);
}
produced
=
b
->
getScalarField
(
prefix
+
ITEM_COUNT_SUFFIX
);
}
auto
delayOrLookBehind
=
std
::
max
(
port
.
Delay
,
port
.
LookBehind
);
for
(
const
auto
e
:
make_iterator_range
(
out_edges
(
streamSet
,
mBufferGraph
)))
{
const
BufferPort
&
br
=
mBufferGraph
[
e
];
const
auto
d
=
std
::
max
(
br
.
Delay
,
br
.
LookBehind
);
delayOrLookBehind
=
std
::
max
(
delayOrLookBehind
,
d
);
}
if
(
delayOrLookBehind
)
{
consumed
=
b
->
CreateSaturatingSub
(
consumed
,
b
->
getSize
(
delayOrLookBehind
));
}
}
}
else
{
const
auto
e
=
in_edge
(
streamSet
,
mConsumerGraph
);
const
ConsumerEdge
&
c
=
mConsumerGraph
[
e
];
const
auto
producer
=
source
(
e
,
mConsumerGraph
);
Value
*
consumedPtr
=
nullptr
;
if
(
LLVM_LIKELY
(
producer
!=
PipelineInput
||
mTraceIndividualConsumedItemCounts
))
{
const
StreamSetPort
port
{
PortType
::
Output
,
c
.
Port
};
const
auto
prefix
=
makeBufferName
(
producer
,
port
);
consumedPtr
=
b
->
getScalarFieldPtr
(
prefix
+
CONSUMED_ITEM_COUNT_SUFFIX
);
if
(
LLVM_UNLIKELY
(
mTraceIndividualConsumedItemCounts
))
{
Constant
*
const
ZERO
=
b
->
getInt32
(
0
);
consumedPtr
=
b
->
CreateInBoundsGEP
(
consumedPtr
,
{
ZERO
,
ZERO
}
);
}
}
else
{
consumedPtr
=
getProcessedInputItemsPtr
(
c
.
Port
);
auto
delayOrLookBehind
=
std
::
max
(
port
.
Delay
,
port
.
LookBehind
);
for
(
const
auto
e
:
make_iterator_range
(
out_edges
(
streamSet
,
mBufferGraph
)))
{
const
BufferPort
&
br
=
mBufferGraph
[
e
];
const
auto
d
=
std
::
max
(
br
.
Delay
,
br
.
LookBehind
);
delayOrLookBehind
=
std
::
max
(
delayOrLookBehind
,
d
);
}
consumed
=
b
->
CreateLoad
(
consumedPtr
);
if
(
delayOrLookBehind
)
{
produced
=
b
->
CreateSaturatingSub
(
produced
,
b
->
getSize
(
delayOrLookBehind
));
}
return
produced
;
}
//if (LLVM_UNLIKELY(bn.isExternal())) {
bool
foundAny
=
false
;
for
(
const
auto
e
:
make_iterator_range
(
out_edges
(
streamSet
,
mBufferGraph
)))
{
if
(
target
(
e
,
mBufferGraph
)
==
PipelineOutput
)
{
const
BufferPort
&
externalPort
=
mBufferGraph
[
e
];
Value
*
const
external
=
getConsumedOutputItems
(
externalPort
.
Port
.
Number
);
assert
(
external
);
const
Binding
&
binding
=
externalPort
.
Binding
;
//b->CallPrintInt(binding.getName() + "_externalConsumed", external);
const
auto
e
=
in_edge
(
streamSet
,
mConsumerGraph
);
const
ConsumerEdge
&
c
=
mConsumerGraph
[
e
];
const
auto
producer
=
source
(
e
,
mConsumerGraph
);
if
(
LLVM_LIKELY
(
producer
!=
PipelineInput
||
mTraceIndividualConsumedItemCounts
))
{
consumed
=
b
->
CreateUMin
(
consumed
,
external
);
foundAny
=
true
;
// break;
}
const
StreamSetPort
port
{
PortType
::
Output
,
c
.
Port
};
const
auto
prefix
=
makeBufferName
(
producer
,
port
);
Value
*
ptr
=
b
->
getScalarFieldPtr
(
prefix
+
CONSUMED_ITEM_COUNT_SUFFIX
);
if
(
LLVM_UNLIKELY
(
mTraceIndividualConsumedItemCounts
))
{
Constant
*
const
ZERO
=
b
->
getInt32
(
0
);
ptr
=
b
->
CreateInBoundsGEP
(
ptr
,
{
ZERO
,
ZERO
}
);
}
return
b
->
CreateLoad
(
ptr
);
assert
(
foundAny
^
bn
.
isInternal
());
//}
}
else
{
return
b
->
CreateLoad
(
getProcessedInputItemsPtr
(
c
.
Port
));
}
return
consumed
;
}
/** ------------------------------------------------------------------------------------------------------------- *
...
...
@@ -287,7 +232,7 @@ inline void PipelineCompiler::computeMinimumConsumedItemCounts(BuilderRef b) {
#ifdef PRINT_DEBUG_MESSAGES
const
auto
consPrefix
=
makeBufferName
(
mKernelId
,
port
);
debugPrint
(
b
,
"* update "
+
consPrefix
+
" -> "
+
prodPrefix
+
"_consumed' = %"
PRIu64
,
cn
.
Consumed
);
debugPrint
(
b
,
consPrefix
+
" -> "
+
prodPrefix
+
"_consumed' = %"
PRIu64
,
cn
.
Consumed
);
#endif
}
}
...
...
@@ -300,15 +245,13 @@ inline void PipelineCompiler::writeConsumedItemCounts(BuilderRef b) {
for
(
const
auto
e
:
make_iterator_range
(
in_edges
(
mKernelId
,
mConsumerGraph
)))
{
const
ConsumerEdge
&
c
=
mConsumerGraph
[
e
];
if
(
c
.
Flags
)
{
if
(
c
.
Flags
&
ConsumerEdge
::
UpdatePhi
)
{
const
auto
streamSet
=
source
(
e
,
mConsumerGraph
);
const
ConsumerNode
&
cn
=
mConsumerGraph
[
streamSet
];
if
(
c
.
Flags
&
ConsumerEdge
::
UpdatePhi
)
{
if
(
LLVM_LIKELY
(
cn
.
PhiNode
!=
nullptr
))
{
cn
.
PhiNode
->
addIncoming
(
cn
.
Consumed
,
mKernelLoopExitPhiCatch
);
cn
.
Consumed
=
cn
.
PhiNode
;
cn
.
PhiNode
=
nullptr
;
}
if
(
LLVM_LIKELY
(
cn
.
PhiNode
!=
nullptr
))
{
cn
.
PhiNode
->
addIncoming
(
cn
.
Consumed
,
mKernelLoopExitPhiCatch
);
cn
.
Consumed
=
cn
.
PhiNode
;
cn
.
PhiNode
=
nullptr
;
}
// check to see if we've fully finished processing any stream
if
(
c
.
Flags
&
ConsumerEdge
::
WriteConsumedCount
)
{
...
...
@@ -321,6 +264,7 @@ inline void PipelineCompiler::writeConsumedItemCounts(BuilderRef b) {
#endif
setConsumedItemCount
(
b
,
streamSet
,
cn
.
Consumed
,
0
);
}
}
}
}
...
...
@@ -329,19 +273,19 @@ inline void PipelineCompiler::writeConsumedItemCounts(BuilderRef b) {
* @brief setConsumedItemCount
** ------------------------------------------------------------------------------------------------------------- */
void
PipelineCompiler
::
setConsumedItemCount
(
BuilderRef
b
,
const
size_t
streamSet
,
not_null
<
Value
*>
consumed
,
const
unsigned
slot
)
const
{
const
auto
output
=
in_edge
(
streamSet
,
mBufferGraph
);
const
auto
producer
=
source
(
output
,
mBufferGraph
);
const
BufferPort
&
outputPort
=
mBufferGraph
[
output
];
const
auto
pe
=
in_edge
(
streamSet
,
mBufferGraph
);
const
auto
producer
=
source
(
pe
,
mBufferGraph
);
const
BufferPort
&
rd
=
mBufferGraph
[
pe
];
Value
*
ptr
=
nullptr
;
if
(
LLVM_LIKELY
(
producer
!=
PipelineInput
||
slot
!=
0
||
mTraceIndividualConsumedItemCounts
))
{
const
auto
prefix
=
makeBufferName
(
producer
,
outputPort
.
Port
);
if
(
LLVM_LIKELY
(
producer
!=
PipelineInput
||
mTraceIndividualConsumedItemCounts
))
{
const
auto
prefix
=
makeBufferName
(
producer
,
rd
.
Port
);
ptr
=
b
->
getScalarFieldPtr
(
prefix
+
CONSUMED_ITEM_COUNT_SUFFIX
);
if
(
LLVM_UNLIKELY
(
mTraceIndividualConsumedItemCounts
))
{
ptr
=
b
->
CreateInBoundsGEP
(
ptr
,
{
b
->
getInt32
(
0
),
b
->
getInt32
(
slot
)
});
}
if
(
LLVM_UNLIKELY
(
CheckAssertions
))
{
Value
*
const
prior
=
b
->
CreateLoad
(
ptr
);
const
Binding
&
output
=
outputPort
.
Binding
;
const
Binding
&
output
=
rd
.
Binding
;
// TODO: cross reference which slot the traced count is for?
Constant
*
const
bindingName
=
b
->
GetString
(
output
.
getName
());
...
...
@@ -355,21 +299,8 @@ void PipelineCompiler::setConsumedItemCount(BuilderRef b, const size_t streamSet
prior
,
consumed
);
const
BufferNode
&
bn
=
mBufferGraph
[
streamSet
];
Value
*
const
produced
=
mLocallyAvailableItems
[
streamSet
];
assert
(
produced
);
if
(
bn
.
NonLocal
)
{
Value
*
const
consumedLessThanProduced
=
b
->
CreateICmpULE
(
consumed
,
produced
);
Constant
*
const
none
=
getTerminationSignal
(
b
,
TerminationSignal
::
None
);
Value
*
const
terminated
=
b
->
CreateICmpNE
(
mTerminatedAtLoopExitPhi
,
none
);
Value
*
const
valid
=
b
->
CreateOr
(
consumedLessThanProduced
,
terminated
);
b
->
CreateAssert
(
valid
,
"%s.%s: consumed item count (%"
PRId64
") exceeds "
"produced item count (%"
PRId64
")"
,
mCurrentKernelName
,
bindingName
,
consumed
,
produced
);
}
else
{
if
(
!
bn
.
NonLocal
)
{
Value
*
const
produced
=
mLocallyAvailableItems
[
streamSet
];
assert
(
produced
);
// NOTE: static linear buffers are assumed to be threadlocal.
Value
*
const
fullyConsumed
=
b
->
CreateICmpEQ
(
produced
,
consumed
);
Constant
*
const
fatal
=
getTerminationSignal
(
b
,
TerminationSignal
::
Fatal
);
...
...
@@ -384,8 +315,11 @@ void PipelineCompiler::setConsumedItemCount(BuilderRef b, const size_t streamSet
}
}
b
->
CreateStore
(
consumed
,
ptr
);
}
else
{
ptr
=
getProcessedInputItemsPtr
(
rd
.
Port
.
Number
);
}
b
->
CreateStore
(
consumed
,
ptr
);
}
/** ------------------------------------------------------------------------------------------------------------- *
...
...
@@ -395,17 +329,23 @@ inline void PipelineCompiler::initializePipelineInputConsumedPhiNodes(BuilderRef
for
(
const
auto
e
:
make_iterator_range
(
out_edges
(
PipelineInput
,
mBufferGraph
)))
{
const
auto
streamSet
=
target
(
e
,
mBufferGraph
);
const
BufferPort
&
br
=
mBufferGraph
[
e
];
const
auto
portNum
=
br
.
Port
.
Number
;
Value
*
const
avail
=
getAvailableInputItems
(
portNum
);
mInitialConsumedItemCount
[
streamSet
]
=
avail
;
// If we have an unused external input, set the value immediately.
if
(
out_degree
(
streamSet
,
mBufferGraph
)
==
0
)
{
Value
*
const
externalPtr
=
getProcessedInputItemsPtr
(
portNum
);
b
->
CreateStore
(
avail
,
externalPtr
);
}
mInitialConsumedItemCount
[
streamSet
]
=
getAvailableInputItems
(
br
.
Port
.
Number
);
}
}
/** ------------------------------------------------------------------------------------------------------------- *
* @brief writeExternalConsumedItemCounts
** ------------------------------------------------------------------------------------------------------------- */
inline void PipelineCompiler::writeExternalConsumedItemCounts(BuilderRef b) {
    // Currently a no-op: the loop below, which would store each pipeline input's
    // mInitialConsumedItemCount value through getProcessedInputItemsPtr, is disabled.
    //
    // for (const auto e : make_iterator_range(out_edges(PipelineInput, mBufferGraph))) {
    //     const auto streamSet = target(e, mBufferGraph);
    //     const BufferPort & rd = mBufferGraph[e];
    //     Value * const ptr = getProcessedInputItemsPtr(rd.Port.Number);
    //     Value * const consumed = mInitialConsumedItemCount[streamSet]; assert (consumed);
    //     b->CreateStore(consumed, ptr);
    // }
}
}
#endif // CONSUMER_LOGIC_HPP
This diff is collapsed.
Click to expand it.
lib/kernel/pipeline/compiler/kernel_execution_logic.hpp
View file @
c9ab9ef7
This diff is collapsed.
Click to expand it.
lib/kernel/pipeline/compiler/kernel_io_calculation_logic.hpp
View file @
c9ab9ef7
...
...
@@ -12,6 +12,75 @@
namespace
kernel
{
/** ------------------------------------------------------------------------------------------------------------- *
* @brief readPipelineIOItemCounts
** ------------------------------------------------------------------------------------------------------------- */
void PipelineCompiler::readPipelineIOItemCounts(BuilderRef b) {

    // TODO: this needs to be considered more: if we have multiple consumers of a pipeline input and
    // they process the input data at differing rates, how do we ensure that we always resume processing
    // at the correct position? We can store the actual item counts / delta of the consumed count
    // internally but this would be problematic for optimization branches as we may have processed data
    // using the alternate path and any internally stored counts/deltas are irrelevant.

    // Would a simple "reset" be enough?

    mKernelId = PipelineInput;

    // Start every stream set with a zero locally-available item count.
    ConstantInt * const sz_ZERO = b->getSize(0);
    for (auto id = FirstStreamSet; id <= LastStreamSet; ++id) {
        mLocallyAvailableItems[id] = sz_ZERO;
    }

    // NOTE: all outputs of PipelineInput node are inputs to the PipelineKernel
    for (const auto inputEdge : make_iterator_range(out_edges(PipelineInput, mBufferGraph))) {
        const StreamSetPort externalPort = mBufferGraph[inputEdge].Port;
        assert (externalPort.Type == PortType::Output);
        Value * const availableItems = getAvailableInputItems(externalPort.Number);
        setLocallyAvailableItemCount(b, externalPort, availableItems);
        initializeConsumedItemCount(b, externalPort, availableItems);
    }

    // Copy the externally supplied processed item count of each pipeline input
    // into the item count scalar of every kernel port that reads that buffer.
    for (const auto inputEdge : make_iterator_range(out_edges(PipelineInput, mBufferGraph))) {
        const auto streamSet = target(inputEdge, mBufferGraph);
        const StreamSetPort externalPort = mBufferGraph[inputEdge].Port;
        assert (externalPort.Type == PortType::Output);
        Value * const processedPtr = getProcessedInputItemsPtr(externalPort.Number);
        Value * const processedItems = b->CreateLoad(processedPtr);
        for (const auto consumerEdge : make_iterator_range(out_edges(streamSet, mBufferGraph))) {
            const BufferPort & consumerPort = mBufferGraph[consumerEdge];
            const auto consumer = target(consumerEdge, mBufferGraph);
            const auto fieldPrefix = makeBufferName(consumer, consumerPort.Port);
            Value * const itemCountPtr = b->getScalarFieldPtr(fieldPrefix + ITEM_COUNT_SUFFIX);
            b->CreateStore(processedItems, itemCountPtr);
        }
    }

    mKernelId = PipelineOutput;

    // NOTE: all inputs of PipelineOutput node are outputs of the PipelineKernel
    for (const auto outputEdge : make_iterator_range(in_edges(PipelineOutput, mBufferGraph))) {
        const auto streamSet = source(outputEdge, mBufferGraph);
        const StreamSetPort externalPort = mBufferGraph[outputEdge].Port;
        assert (externalPort.Type == PortType::Input);
        Value * producedPtr = getProducedOutputItemsPtr(externalPort.Number);
        Value * const producedItems = b->CreateLoad(producedPtr);
        // Mirror of the input case: seed the item count scalar of every
        // kernel port that writes to this buffer.
        for (const auto producerEdge : make_iterator_range(in_edges(streamSet, mBufferGraph))) {
            const BufferPort & producerPort = mBufferGraph[producerEdge];
            const auto producer = source(producerEdge, mBufferGraph);
            const auto fieldPrefix = makeBufferName(producer, producerPort.Port);
            Value * const itemCountPtr = b->getScalarFieldPtr(fieldPrefix + ITEM_COUNT_SUFFIX);
            b->CreateStore(producedItems, itemCountPtr);
        }
    }

}
/** ------------------------------------------------------------------------------------------------------------- *
* @brief determineMaximumNumberOfStrides
** ------------------------------------------------------------------------------------------------------------- */
...
...
@@ -54,7 +123,7 @@ void PipelineCompiler::determineNumOfLinearStrides(BuilderRef b) {
Value
*
numOfLinearStrides
=
nullptr
;
if
(
m
CurrentNumOfStridesAt
LoopEntry
Phi
&&
mMaximumNumOfStrides
)
{
if
(
m
May
Loop
To
Entry
&&
!
ExternallySynchronized
)
{
numOfLinearStrides
=
b
->
CreateSub
(
mMaximumNumOfStrides
,
mCurrentNumOfStridesAtLoopEntryPhi
);
}
else
{
numOfLinearStrides
=
mMaximumNumOfStrides
;
...
...
@@ -108,11 +177,11 @@ void PipelineCompiler::determineNumOfLinearStrides(BuilderRef b) {
const
auto
check
=
(
bn
.
NonLocal
||
bn
.
NonLinear
)
&&
unchecked
(
br
.
LocalPortId
);
if
(
LLVM_LIKELY
(
check
))
{
Value
*
const
strides
=
getNumOfAccessibleStrides
(
b
,
br
,
numOfInputStrides
,
false
);
Value
*
const
strides
=
getNumOfAccessibleStrides
(
b
,
br
,
numOfInputStrides
);
numOfInputStrides
=
b
->
CreateUMin
(
numOfInputStrides
,
strides
);
}
if
(
LLVM_UNLIKELY
(
CheckAssertions
))
{
Value
*
const
strides
=
getNumOfAccessibleStrides
(
b
,
br
,
numOfActualInputStrides
,
true
);
Value
*
const
strides
=
getNumOfAccessibleStrides
(
b
,
br
,
numOfActualInputStrides
);
numOfActualInputStrides
=
b
->
CreateUMin
(
numOfActualInputStrides
,
strides
);
}
}
...
...
@@ -134,7 +203,7 @@ void PipelineCompiler::determineNumOfLinearStrides(BuilderRef b) {
ConstantInt
*
const
ONE
=
b
->
getSize
(
1
);
numOfOutputStrides
=
b
->
CreateUMax
(
numOfInputStrides
,
ONE
);
}
Value
*
const
strides
=
getNumOfWritableStrides
(
b
,
br
,
numOfOutputStrides
,
false
);
Value
*
const
strides
=
getNumOfWritableStrides
(
b
,
br
,
numOfOutputStrides
);
if
(
strides
)
{
Value
*
const
minStrides
=
b
->
CreateUMin
(
numOfOutputStrides
,
strides
);
Value
*
const
isZero
=
b
->
CreateICmpEQ
(
strides
,
ZERO
);
...
...
@@ -172,7 +241,7 @@ void PipelineCompiler::determineNumOfLinearStrides(BuilderRef b) {
ConstantInt
*
const
ONE
=
b
->
getSize
(
1
);
numOfActualOutputStrides
=
b
->
CreateUMax
(
numOfActualInputStrides
,
ONE
);
}
Value
*
const
strides
=
getNumOfWritableStrides
(
b
,
br
,
numOfActualOutputStrides
,
true
);
Value
*
const
strides
=
getNumOfWritableStrides
(
b
,
br
,
numOfActualOutputStrides
);
if
(
strides
)
{
Value
*
const
minStrides
=
b
->
CreateUMin
(
numOfActualOutputStrides
,
strides
);
Value
*
const
isZero
=
b
->
CreateICmpEQ
(
strides
,
ZERO
);
...
...
@@ -669,6 +738,7 @@ Value * PipelineCompiler::getAccessibleInputItems(BuilderRef b, const BufferPort
Value
*
accessible
=
buffer
->
getLinearlyAccessibleItems
(
b
,
processed
,
available
,
overflow
);
// if (LLVM_UNLIKELY(CheckAssertions)) {
// Value * intCapacity = buffer->getInternalCapacity(b);
// if (overflow) {
...
...
@@ -694,11 +764,6 @@ Value * PipelineCompiler::getAccessibleInputItems(BuilderRef b, const BufferPort
Value
*
const
exhausted
=
b
->
CreateICmpUGE
(
processed
,
available
);
Value
*
const
useZeroExtend
=
b
->
CreateAnd
(
closed
,
exhausted
);
mIsInputZeroExtended
[
inputPort
]
=
useZeroExtend
;
#ifdef PRINT_DEBUG_MESSAGES
debugPrint
(
b
,
prefix
+
"_zeroExtended = %"
PRIu64
,
mIsInputZeroExtended
[
inputPort
]);
#endif
if
(
LLVM_LIKELY
(
mHasZeroExtendedInput
==
nullptr
))
{
mHasZeroExtendedInput
=
useZeroExtend
;
}
else
{
...
...
@@ -742,8 +807,7 @@ void PipelineCompiler::ensureSufficientOutputSpace(BuilderRef b, const BufferPor
const
BufferNode
&
bn
=
mBufferGraph
[
streamSet
];
if
(
LLVM_UNLIKELY
(
bn
.
isOwned
()
&&
bn
.
isDynamic
()))
{
if
(
LLVM_UNLIKELY
(
bn
.
isOwned
()))
{
const
auto
prefix
=
makeBufferName
(
mKernelId
,
outputPort
);
const
StreamSetBuffer
*
const
buffer
=
bn
.
Buffer
;
...
...
@@ -760,6 +824,7 @@ void PipelineCompiler::ensureSufficientOutputSpace(BuilderRef b, const BufferPor
const
auto
beforeExpansion
=
mWritableOutputItems
[
outputPort
.
Number
];
Value
*
const
hasEnoughSpace
=
b
->
CreateICmpULE
(
required
,
beforeExpansion
[
WITH_OVERFLOW
]);
BasicBlock
*
const
noExpansionExit
=
b
->
GetInsertBlock
();
b
->
CreateLikelyCondBr
(
hasEnoughSpace
,
expanded
,
expandBuffer
);
...
...
@@ -781,7 +846,6 @@ void PipelineCompiler::ensureSufficientOutputSpace(BuilderRef b, const BufferPor
buffer
->
reserveCapacity
(
b
,
produced
,
consumed
,
required
);
recordBufferExpansionHistory
(
b
,
outputPort
,
buffer
);
if
(
cycleCounterAccumulator
)
{
Value
*
const
cycleCounterEnd
=
b
->
CreateReadCycleCounter
();
Value
*
const
duration
=
b
->
CreateSub
(
cycleCounterEnd
,
cycleCounterStart
);
...
...
@@ -908,8 +972,7 @@ Value * PipelineCompiler::getWritableOutputItems(BuilderRef b, const BufferPort
** ------------------------------------------------------------------------------------------------------------- */
Value
*
PipelineCompiler
::
getNumOfAccessibleStrides
(
BuilderRef
b
,
const
BufferPort
&
port
,
Value
*
const
numOfLinearStrides
,
const
bool
debug
)
{
Value
*
const
numOfLinearStrides
)
{
const
auto
inputPort
=
port
.
Port
;
assert
(
inputPort
.
Type
==
PortType
::
Input
);
const
Binding
&
input
=
port
.
Binding
;
...
...
@@ -917,12 +980,6 @@ Value * PipelineCompiler::getNumOfAccessibleStrides(BuilderRef b,
Value
*
numOfStrides
=
nullptr
;
#ifdef PRINT_DEBUG_MESSAGES
const
auto
prefix
=
makeBufferName
(
mKernelId
,
inputPort
);
Constant
*
prefixSymbol
=
nullptr
;
if
(
debug
)
{
prefixSymbol
=
b
->
GetString
(
prefix
+
"_debug"
);
}
else
{
prefixSymbol
=
b
->
GetString
(
prefix
);
}
#endif
if
(
LLVM_UNLIKELY
(
rate
.
isPartialSum
()))
{
numOfStrides
=
getMaximumNumOfPartialSumStrides
(
b
,
port
,
numOfLinearStrides
);
...
...
@@ -932,8 +989,8 @@ Value * PipelineCompiler::getNumOfAccessibleStrides(BuilderRef b,
Value
*
const
accessible
=
getAccessibleInputItems
(
b
,
port
);
assert
(
accessible
);
Value
*
const
strideLength
=
getInputStrideLength
(
b
,
port
);
assert
(
strideLength
);
#ifdef PRINT_DEBUG_MESSAGES
debugPrint
(
b
,
"<
%s
_accessible = %"
PRIu64
,
prefixSymbol
,
accessible
);
debugPrint
(
b
,
"<
%s
_strideLength = %"
PRIu64
,
prefixSymbol
,
strideLength
);
debugPrint
(
b
,
"<
"
+
prefix
+
"
_accessible = %"
PRIu64
,
accessible
);
debugPrint
(
b
,
"<
"
+
prefix
+
"
_strideLength = %"
PRIu64
,
strideLength
);
#endif
numOfStrides
=
b
->
CreateUDiv
(
subtractLookahead
(
b
,
port
,
accessible
),
strideLength
);
}
...
...
@@ -942,7 +999,7 @@ Value * PipelineCompiler::getNumOfAccessibleStrides(BuilderRef b,
numOfStrides
=
b
->
CreateSelect
(
ze
,
numOfLinearStrides
,
numOfStrides
,
"numOfZeroExtendedStrides"
);
}
#ifdef PRINT_DEBUG_MESSAGES
debugPrint
(
b
,
"<
%s
_numOfStrides = %"
PRIu64
,
prefixSymbol
,
numOfStrides
);
debugPrint
(
b
,
"<
"
+
prefix
+
"
_numOfStrides = %"
PRIu64
,
numOfStrides
);
#endif
return
numOfStrides
;
}
...
...
@@ -952,8 +1009,7 @@ Value * PipelineCompiler::getNumOfAccessibleStrides(BuilderRef b,
** ------------------------------------------------------------------------------------------------------------- */
Value
*
PipelineCompiler
::
getNumOfWritableStrides
(
BuilderRef
b
,
const
BufferPort
&
port
,
Value
*
const
numOfLinearStrides
,
const
bool
debug
)
{
Value
*
const
numOfLinearStrides
)
{
const
auto
outputPort
=
port
.
Port
;
assert
(
outputPort
.
Type
==
PortType
::
Output
);
...
...
@@ -962,15 +1018,6 @@ Value * PipelineCompiler::getNumOfWritableStrides(BuilderRef b,
if
(
LLVM_UNLIKELY
(
bn
.
isUnowned
()))
{
return
nullptr
;
}
#ifdef PRINT_DEBUG_MESSAGES
const
auto
prefix
=
makeBufferName
(
mKernelId
,
outputPort
);
Constant
*
prefixSymbol
=
nullptr
;
if
(
debug
)
{
prefixSymbol
=
b
->
GetString
(
prefix
+
"_debug"
);
}
else
{
prefixSymbol
=
b
->
GetString
(
prefix
);
}
#endif
const
Binding
&
output
=
port
.
Binding
;
Value
*
numOfStrides
=
nullptr
;
if
(
LLVM_UNLIKELY
(
output
.
getRate
().
isPartialSum
()))
{
...
...
@@ -981,7 +1028,8 @@ Value * PipelineCompiler::getNumOfWritableStrides(BuilderRef b,
numOfStrides
=
b
->
CreateUDiv
(
writable
,
strideLength
);
}
#ifdef PRINT_DEBUG_MESSAGES
debugPrint
(
b
,
"> %s_numOfStrides = %"
PRIu64
,
prefixSymbol
,
numOfStrides
);
const
auto
prefix
=
makeBufferName
(
mKernelId
,
outputPort
);
debugPrint
(
b
,
"> "
+
prefix
+
"_numOfStrides = %"
PRIu64
,
numOfStrides
);
#endif
return
numOfStrides
;
}
...
...
This diff is collapsed.
Click to expand it.
lib/kernel/pipeline/compiler/kernel_logic.hpp
View file @
c9ab9ef7
...
...
@@ -478,29 +478,6 @@ inline const StreamSetPort PipelineCompiler::getReference(const StreamSetPort po
return
PipelineCommonGraphFunctions
::
getReference
(
mKernelId
,
port
);
}
/** ------------------------------------------------------------------------------------------------------------- *
* @brief isSafeToUseProcessedItemCountDirectly
** ------------------------------------------------------------------------------------------------------------- */
bool PipelineCompiler::isSafeToUseProcessedItemCountDirectly(const unsigned streamSet) const {
    // Buffers that are not external are always reported as safe.
    if (!mBufferGraph[streamSet].isExternal()) {
        return true;
    }
    unsigned usesByCurrentKernel = 0;
    for (const auto edge : make_iterator_range(out_edges(streamSet, mBufferGraph))) {
        const auto consumer = target(edge, mBufferGraph);
        // We can only safely use the processed item count if it's the last use of it
        if (consumer > mKernelId) {
            return false;
        }
        // If we have more than one use of this count in the same kernel, we cannot
        // safely reuse it.
        if (consumer == mKernelId && ++usesByCurrentKernel > 1) {
            return false;
        }
    }
    return true;
}
/** ------------------------------------------------------------------------------------------------------------- *
* @brief reset
** ------------------------------------------------------------------------------------------------------------- */
...
...
@@ -523,8 +500,6 @@ void PipelineCompiler::clearInternalStateForCurrentKernel() {
mAnyRemainingInput
=
nullptr
;
mExhaustedPipelineInputPhi
=
nullptr
;
mExhaustedInputAtJumpPhi
=
nullptr
;
mExecutedAtLeastOnceAtLoopEntryPhi
=
nullptr
;
mCurrentNumOfStridesAtLoopEntryPhi
=
nullptr
;
mKernelIsFinal
=
nullptr
;
mKernelIsPenultimate
=
nullptr
;
...
...
This diff is collapsed.
Click to expand it.
lib/kernel/pipeline/compiler/kernel_segment_processing_logic.hpp
View file @
c9ab9ef7
...
...
@@ -104,7 +104,7 @@ inline void PipelineCompiler::executeKernel(BuilderRef b) {
const
auto
nextPartitionId
=
mCurrentPartitionId
+
1U
;
const
auto
jumpId
=
mPartitionJumpIndex
[
mCurrentPartitionId
];
const
auto
canJumpToAnotherPartition
=
mIsPartitionRoot
&&
(
mIsBounded
||
nextPartitionId
==
jumpId
);
const
auto
handleNoUpdateExit
=
mIsPartitionRoot
;
//
|| !canJumpToAnotherPartition;
const
auto
handleNoUpdateExit
=
mIsPartitionRoot
||
!
canJumpToAnotherPartition
;
#else
const
auto
canJumpToAnotherPartition
=
mIsPartitionRoot
;
const
auto
handleNoUpdateExit
=
mCheckIO
;
...
...
@@ -504,7 +504,11 @@ inline void PipelineCompiler::initializeKernelLoopEntryPhis(BuilderRef b) {
IntegerType
*
const
boolTy
=
b
->
getInt1Ty
();
b
->
SetInsertPoint
(
mKernelLoopEntry
);
assert
(
"no loop start?"
&&
mKernelLoopStart
);
if
(
mKernelLoopStart
==
nullptr
)
{
report_fatal_error
(
"no loop start?"
);
}
assert
(
mKernelLoopStart
);
for
(
const
auto
e
:
make_iterator_range
(
in_edges
(
mKernelId
,
mBufferGraph
)))
{
const
BufferPort
&
br
=
mBufferGraph
[
e
];
...
...
@@ -873,6 +877,9 @@ void PipelineCompiler::end(BuilderRef b) {
b
->
CreateUnlikelyCondBr
(
done
,
mPipelineEnd
,
mPipelineLoop
);
}
b
->
SetInsertPoint
(
mPipelineEnd
);
writeExternalConsumedItemCounts
(
b
);
writeExternalProducedItemCounts
(
b
);
if
(
mCurrentThreadTerminationSignalPtr
)
{
b
->
CreateStore
(
terminated
,
mCurrentThreadTerminationSignalPtr
);
}
...
...
@@ -898,4 +905,16 @@ void PipelineCompiler::end(BuilderRef b) {
// b->GetInsertBlock()->getParent()->print(errs());
}
/** ------------------------------------------------------------------------------------------------------------- *
* @brief writeExternalProducedItemCounts
** ------------------------------------------------------------------------------------------------------------- */
void PipelineCompiler::writeExternalProducedItemCounts(BuilderRef b) {
    // For every edge into the PipelineOutput node, store the stream set's
    // locally available item count through the matching produced-output-items pointer.
    for (const auto outputEdge : make_iterator_range(in_edges(PipelineOutput, mBufferGraph))) {
        const auto portNum = mBufferGraph[outputEdge].Port.Number;
        const auto producedStreamSet = source(outputEdge, mBufferGraph);
        Value * const producedPtr = getProducedOutputItemsPtr(portNum);
        Value * const producedItems = mLocallyAvailableItems[producedStreamSet];
        b->CreateStore(producedItems, producedPtr);
    }
}
}
This diff is collapsed.
Click to expand it.
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment