// tlx/sort/strings/parallel_sample_sort.hpp -- this text was extracted from
// the Doxygen source-browser page; the HTML viewer chrome ("Loading...",
// "Searching...", "No Matches", navigation links) has been removed.
1/*******************************************************************************
2 * tlx/sort/strings/parallel_sample_sort.hpp
3 *
4 * Parallel Super Scalar String Sample Sort (pS5)
5 *
6 * See also Timo Bingmann, Andreas Eberle, and Peter Sanders. "Engineering
7 * parallel string sorting." Algorithmica 77.1 (2017): 235-286.
8 *
9 * Part of tlx - http://panthema.net/tlx
10 *
11 * Copyright (C) 2013-2019 Timo Bingmann <tb@panthema.net>
12 *
13 * All rights reserved. Published under the Boost Software License, Version 1.0
14 ******************************************************************************/
15
16#ifndef TLX_SORT_STRINGS_PARALLEL_SAMPLE_SORT_HEADER
17#define TLX_SORT_STRINGS_PARALLEL_SAMPLE_SORT_HEADER
18
19#include <algorithm>
20#include <atomic>
21#include <cmath>
22#include <cstdint>
23#include <cstdlib>
24#include <cstring>
25#include <random>
26#include <vector>
27
31
32#include <tlx/logger/core.hpp>
33#include <tlx/math/clz.hpp>
34#include <tlx/math/ctz.hpp>
36#include <tlx/multi_timer.hpp>
37#include <tlx/simple_vector.hpp>
38#include <tlx/thread_pool.hpp>
39#include <tlx/unused.hpp>
40
41namespace tlx {
42namespace sort_strings_detail {
43
44class PS5SortStep;
45
46/******************************************************************************/
47//! Parallel Super Scalar String Sample Sort Parameter Struct
48
// NOTE(review): the class head on original line 49 (presumably
// "class PS5ParametersDefault") was dropped by the extraction -- confirm
// against the upstream tlx sources.
50{
51public:
// debug flags: each TLX_LOGC(...) call below in this file is gated on one
// of these compile-time constants.
 52 static const bool debug_steps = false;
 53 static const bool debug_jobs = false;
 54
 55 static const bool debug_bucket_size = false;
 56 static const bool debug_recursion = false;
 57 static const bool debug_lcp = false;
 58
 59 static const bool debug_result = false;
 60
 61 //! enable/disable various sorting levels
 62 static const bool enable_parallel_sample_sort = true;
 63 static const bool enable_sequential_sample_sort = true;
 64 static const bool enable_sequential_mkqs = true;
 65
 66 //! terminate sort after first parallel sample sort step
 67 static const bool use_only_first_sortstep = false;
 68
 69 //! enable work freeing
 70 static const bool enable_work_sharing = true;
 71
 72 //! whether the base sequential_threshold() on the remaining unsorted string
 73 //! set or on the whole string set.
 74 static const bool enable_rest_size = false;
 75
 76 //! key type for sample sort: 32-bit or 64-bit
 77 typedef size_t key_type;
 78
 79 //! depth of classification tree used in sample sorts
 80 static const unsigned TreeBits = 10;
 81
 82 //! classification tree variant for sample sorts
// NOTE(review): original line 83 (the "Classify" typedef referenced as
// Context::Classify elsewhere in this file) is missing from this extraction.
 84
 85 //! threshold to run sequential small sorts
 86 static const size_t smallsort_threshold = 1024 * 1024;
 87 //! threshold to switch to insertion sort
 88 static const size_t inssort_threshold = 32;
 89};
90
91/******************************************************************************/
92//! Parallel Super Scalar String Sample Sort Context
93
//! Runtime context for one pS5 invocation: carries the tunable Parameters as
//! a base class plus input size, work counters, and the thread pool.
 94template <typename Parameters>
 95class PS5Context : public Parameters
 96{
 97public:
 98 //! total size of input
 99 size_t total_size;
 100
 101 //! number of remaining strings to sort
 102 std::atomic<size_t> rest_size;
 103
 104 //! counters
// NOTE(review): original lines 105-115 (the counter members such as
// sequ_ss_steps/para_ss_steps/base_sort_steps, the mtimer member, the
// num_threads member, and the threads_ thread-pool member -- all referenced
// elsewhere in this file) were dropped by the extraction.
 106
 107 //! timers for individual sorting steps
 109
 110 //! number of threads overall
 112
 113 //! thread pool
 115
 116 //! context constructor
 117 PS5Context(size_t _thread_num)
// NOTE(review): original line 118 (start of the ctor initializer list) is
// missing here.
 119 num_threads(_thread_num),
 120 threads_(_thread_num)
 121 { }
 122
 123 //! enqueue a new job in the thread pool
 124 template <typename StringPtr>
 125 void enqueue(PS5SortStep* sstep, const StringPtr& strptr, size_t depth);
 126
 127 //! return sequential sorting threshold
// NOTE(review): original line 128 (the sequential_threshold() signature) is
// missing. The body divides either the remaining or the total work by the
// thread count, never going below smallsort_threshold.
 129 size_t threshold = this->smallsort_threshold;
 130 if (this->enable_rest_size) {
 131 return std::max(threshold, rest_size / num_threads);
 132 }
 133 else {
 134 return std::max(threshold, total_size / num_threads);
 135 }
 136 }
 137
 138 //! decrement number of unordered strings
 139 void donesize(size_t n) {
 140 if (this->enable_rest_size)
 141 rest_size -= n;
 142 }
 143};
144
145/******************************************************************************/
146//! LCP calculation of Splitter Strings
147
//! Byte-wise LCP of two (byte-swapped) key words: the count of leading
//! bytes in which a and b coincide, obtained by counting leading zeros of
//! their XOR difference.
//! NOTE(review): clz(0) is typically undefined -- callers appear to
//! guarantee a != b; confirm against the clz() implementation.
template <typename KeyType>
static inline unsigned char
lcpKeyType(const KeyType& a, const KeyType& b) {
    const KeyType diff = a ^ b;
    return static_cast<unsigned char>(clz(diff) / 8);
}
154
//! Number of non-zero leading bytes in a key word: the trailing zero bytes
//! (counted via ctz) are subtracted from the word size.
template <typename KeyType>
static inline unsigned char
lcpKeyDepth(const KeyType& a) {
    const unsigned trailing_zero_bytes = ctz(a) / 8;
    return static_cast<unsigned char>(sizeof(KeyType) - trailing_zero_bytes);
}
161
162//! return the d-th character in the (swapped) key
163template <typename KeyType>
164static inline unsigned char
165getCharAtDepth(const KeyType& a, unsigned char d) {
166 return static_cast<unsigned char>(a >> (8 * (sizeof(KeyType) - 1 - d)));
167}
168
169/******************************************************************************/
170//! PS5SortStep Top-Level Class to Keep Track of Substeps
171
// NOTE(review): the class head on original line 172 ("class PS5SortStep"),
// the default constructor (line 182), the increment in substep_add()
// (line 190), the substep_notify_done() signature (line 195) and the call to
// substep_all_done() (line 198) were dropped by the extraction -- confirm
// against the upstream tlx sources.
 173{
 174private:
 175 //! Number of substeps still running
 176 std::atomic<size_t> substep_working_;
 177
 178 //! Pure virtual function called by substep when all substeps are done.
 179 virtual void substep_all_done() = 0;
 180
 181protected:
 183
 184 virtual ~PS5SortStep() {
 185 assert(substep_working_ == 0);
 186 }
 187
 188 //! Register new substep
 189 void substep_add() {
 191 }
 192
 193public:
 194 //! Notify superstep that the currently substep is done.
 196 assert(substep_working_ > 0);
 197 if (--substep_working_ == 0)
 199 }
 200};
201
202/******************************************************************************/
203//! LCP Calculation for Finished Sample Sort Steps
204
//! Reconstruct LCP array entries at bucket boundaries after a finished
//! sample sort step: for each non-empty bucket, the LCP of its first string
//! with the last string of the previous non-empty bucket is derived from the
//! splitter key (odd = equal buckets) or from the stored keys (even
//! buckets). Odd buckets can use the splitter directly because every string
//! in them equals the splitter on this key word.
 205template <size_t bktnum, typename Context, typename Classify,
 206 typename StringPtr, typename BktSizeType>
 207void ps5_sample_sort_lcp(const Context& ctx, const Classify& classifier,
 208 const StringPtr& strptr, size_t depth,
 209 const BktSizeType* bkt) {
 210 assert(!strptr.flipped());
 211
 212 const typename StringPtr::StringSet& strset = strptr.active();
 213 typedef typename Context::key_type key_type;
 214
 215 size_t b = 0; // current bucket number
 216 key_type prevkey = 0; // previous key
 217
 218 // the following while loops only check b < bktnum when b is odd,
 219 // because bktnum is always odd. We need a goto to jump into the loop,
 220 // as b == 0 start even.
 221 goto even_first;
 222
 223 // find first non-empty bucket
 224 while (b < bktnum)
 225 {
 226 // odd bucket: = bkt
 227 if (bkt[b] != bkt[b + 1]) {
 228 prevkey = classifier.get_splitter(b / 2);
 229 assert(prevkey == get_key_at<key_type>(strset, bkt[b + 1] - 1, depth));
 230 break;
 231 }
 232 ++b;
 233even_first:
 234 // even bucket: <, << or > bkt
 235 if (bkt[b] != bkt[b + 1]) {
 236 prevkey = get_key_at<key_type>(strset, bkt[b + 1] - 1, depth);
 237 break;
 238 }
 239 ++b;
 240 }
 241 ++b;
 242
 243 // goto depends on whether the first non-empty bucket was odd or
 244 // even. the while loop below encodes this in the program counter.
 245 if (b < bktnum && b % 2 == 0) goto even_bucket;
 246
 247 // find next non-empty bucket
 248 while (b < bktnum)
 249 {
 250 // odd bucket: = bkt
 251 if (bkt[b] != bkt[b + 1]) {
 252 key_type thiskey = classifier.get_splitter(b / 2);
 253 assert(thiskey == get_key_at<key_type>(strset, bkt[b], depth));
 254
 255 int rlcp = lcpKeyType(prevkey, thiskey);
 256 strptr.set_lcp(bkt[b], depth + rlcp);
 257 // strptr.set_cache(bkt[b], getCharAtDepth(thiskey, rlcp));
 258
 259 TLX_LOGC(ctx.debug_lcp)
 260 << "LCP at odd-bucket " << b
 261 << " [" << bkt[b] << "," << bkt[b + 1] << ")"
 262 << " is " << depth + rlcp;
 263
 264 prevkey = thiskey;
 265 assert(prevkey == get_key_at<key_type>(strset, bkt[b + 1] - 1, depth));
 266 }
 267 ++b;
 268even_bucket:
 269 // even bucket: <, << or > bkt
 270 if (bkt[b] != bkt[b + 1]) {
 271 key_type thiskey = get_key_at<key_type>(strset, bkt[b], depth);
 272
 273 int rlcp = lcpKeyType(prevkey, thiskey);
 274 strptr.set_lcp(bkt[b], depth + rlcp);
 275 // strptr.set_cache(bkt[b], getCharAtDepth(thiskey, rlcp));
 276
 277 TLX_LOGC(ctx.debug_lcp)
 278 << "LCP at even-bucket " << b
 279 << " [" << bkt[b] << "," << bkt[b + 1] << ")"
 280 << " is " << depth + rlcp;
 281
 282 prevkey = get_key_at<key_type>(strset, bkt[b + 1] - 1, depth);
 283 }
 284 ++b;
 285 }
 286}
287
288/******************************************************************************/
289//! SampleSort: Non-Recursive In-Place Sequential Sample Sort for Small Sorts
290
// NOTE(review): several original lines were dropped by the extraction: the
// class head (292, presumably "class PS5SmallsortJob : public PS5SortStep"),
// the pstep_ member (298), the strptr_ member (300), the mtimer_ member
// (302), the StringSet typedef (305), the destructor signature (316) and the
// bktcache_ declaration (321). All of these names are used below -- confirm
// against upstream tlx.
 291template <typename Context, typename StringPtr, typename BktSizeType>
 293{
 294public:
 295 Context& ctx_;
 296
 297 //! parent sort step
 299
 301 size_t depth_;
 303
 304 typedef typename Context::key_type key_type;
 306 typedef BktSizeType bktsize_type;
 307
// construct and log; actual sorting happens in run()
 308 PS5SmallsortJob(Context& ctx, PS5SortStep* pstep,
 309 const StringPtr& strptr, size_t depth)
 310 : ctx_(ctx), pstep_(pstep), strptr_(strptr), depth_(depth) {
 311 TLX_LOGC(ctx_.debug_steps)
 312 << "enqueue depth=" << depth_
 313 << " size=" << strptr_.size() << " flip=" << strptr_.flipped();
 314 }
 315
// destructor body: fold this job's timers into the context's global timer
 317 mtimer_.stop();
 318 ctx_.mtimer.add(mtimer_);
 319 }
 320
 322 size_t bktcache_size_ = 0;
 323
// NOTE(review): original lines 338 and 343 (the calls dispatching to
// sort_sample_sort(strptr_, depth_) and sort_mkqs_cache(strptr_, depth_),
// judging by the surrounding branches) are missing from this extraction.
 324 void run() {
 325 mtimer_.start("sequ_ss");
 326
 327 size_t n = strptr_.size();
 328
 329 TLX_LOGC(ctx_.debug_jobs)
 330 << "Process PS5SmallsortJob " << this << " of size " << n;
 331
 332 // create anonymous wrapper job
 333 this->substep_add();
 334
// large inputs go through sequential sample sort, small ones directly to
// the cached multikey quicksort
 335 if (ctx_.enable_sequential_sample_sort && n >= ctx_.smallsort_threshold)
 336 {
 337 bktcache_.resize(n * sizeof(std::uint16_t));
 339 }
 340 else
 341 {
 342 mtimer_.start("mkqs");
 344 }
 345
 346 // finish wrapper job, handler delete's this
 347 this->substep_notify_done();
 348 }
349
350 /*------------------------------------------------------------------------*/
351 //! Stack of Recursive Sample Sort Steps
352
// NOTE(review): dropped by the extraction: the class head (353, presumably
// "class SeqSampleSortStep"), the strptr_ member (356), the StringSet
// typedef (360) and the bkt[] prefix-sum array declaration (369). Confirm
// against upstream tlx.
 354 {
 355 public:
 357 size_t idx_;
 358 size_t depth_;
 359
 361 using bktsize_type = BktSizeType;
 362
 363 typename Context::Classify classifier;
 364
 365 static const size_t num_splitters = Context::Classify::num_splitters;
 366 static const size_t bktnum = 2 * num_splitters + 1;
 367
 368 unsigned char splitter_lcp[num_splitters + 1];
 370
// The constructor performs one whole sample sort step: sample, classify,
// count, prefix-sum, and out-of-place permutation into the shadow array.
 371 SeqSampleSortStep(Context& ctx, const StringPtr& strptr, size_t depth,
 372 std::uint16_t* bktcache)
 373 : strptr_(strptr), idx_(0), depth_(depth) {
 374 size_t n = strptr_.size();
 375
 376 // step 1: select splitters with oversampling
 377
 378 const size_t oversample_factor = 2;
 379 const size_t sample_size = oversample_factor * num_splitters;
 380
 381 simple_vector<key_type> samples(sample_size);
 382
 383 const StringSet& strset = strptr_.active();
 384
// seeded from the buffer address, so sampling is cheap but not reproducible
 385 std::minstd_rand rng(reinterpret_cast<uintptr_t>(samples.data()));
 386
 387 for (size_t i = 0; i < sample_size; ++i)
 388 samples[i] = get_key_at<key_type>(strset, rng() % n, depth_);
 389
 390 std::sort(samples.begin(), samples.end());
 391
 392 classifier.build(samples.data(), sample_size, splitter_lcp);
 393
 394 // step 2: classify all strings
 395
 396 classifier.classify(
 397 strset, strset.begin(), strset.end(), bktcache, depth_);
 398
 399 // step 2.5: count bucket sizes
 400
 401 bktsize_type bktsize[bktnum];
 402 memset(bktsize, 0, bktnum * sizeof(bktsize_type));
 403
 404 for (size_t si = 0; si < n; ++si)
 405 ++bktsize[bktcache[si]];
 406
 407 // step 3: inclusive prefix sum
 408
 409 bkt[0] = bktsize[0];
 410 for (unsigned int i = 1; i < bktnum; ++i) {
 411 bkt[i] = bkt[i - 1] + bktsize[i];
 412 }
 413 assert(bkt[bktnum - 1] == n);
 414 bkt[bktnum] = n;
 415
 416 // step 4: premute out-of-place
 417
 418 const StringSet& strB = strptr_.active();
 419 // get alternative shadow pointer array
 420 const StringSet& sorted = strptr_.shadow();
 421 typename StringSet::Iterator sbegin = sorted.begin();
 422
 423 for (typename StringSet::Iterator str = strB.begin();
 424 str != strB.end(); ++str, ++bktcache)
 425 *(sbegin + --bkt[*bktcache]) = std::move(*str);
 426
 427 // bkt is afterwards the exclusive prefix sum of bktsize
 428
 429 // statistics
 430
 431 ++ctx.sequ_ss_steps;
 432 }
 433
// write LCP values at bucket boundaries once all sub-buckets are sorted
 434 void calculate_lcp(Context& ctx) {
 435 TLX_LOGC(ctx.debug_lcp) << "Calculate LCP after sample sort step";
 436 if (strptr_.with_lcp) {
 437 ps5_sample_sort_lcp<bktnum>(ctx, classifier, strptr_, depth_, bkt);
 438 }
 439 }
 440 };
441
// Explicit stack of sample sort steps (ss_front_ is an artificial pop_front
// index so freed lower levels can be handed to other threads).
 442 size_t ss_front_ = 0;
 443 std::vector<SeqSampleSortStep> ss_stack_;
 444
// NOTE(review): dropped by the extraction: original line 489 (the recursive
// small-sort call issued under the "mkqs" timer switch, taking the
// arguments on the following line) and line 559 (the body of the
// work-sharing branch, presumably the call to sample_sort_free_work()).
 445 void sort_sample_sort(const StringPtr& strptr, size_t depth) {
 446 typedef SeqSampleSortStep Step;
 447
 448 assert(ss_front_ == 0);
 449 assert(ss_stack_.size() == 0);
 450
 451 std::uint16_t* bktcache = reinterpret_cast<std::uint16_t*>(bktcache_.data());
 452
 453 // sort first level
 454 ss_stack_.emplace_back(ctx_, strptr, depth, bktcache);
 455
 456 // step 5: "recursion"
 457
 458 while (ss_stack_.size() > ss_front_)
 459 {
 460 Step& s = ss_stack_.back();
 461 size_t i = s.idx_++; // process the bucket s.idx_
 462
 463 if (i < Step::bktnum)
 464 {
 465 size_t bktsize = s.bkt[i + 1] - s.bkt[i];
 466
 467 StringPtr sp = s.strptr_.flip(s.bkt[i], bktsize);
 468
 469 // i is even -> bkt[i] is less-than bucket
 470 if (i % 2 == 0)
 471 {
 472 if (bktsize == 0) {
 473 // empty bucket
 474 }
 475 else if (bktsize < ctx_.smallsort_threshold)
 476 {
 477 assert(i / 2 <= Step::num_splitters);
 478 if (i == Step::bktnum - 1)
 479 TLX_LOGC(ctx_.debug_recursion)
 480 << "Recurse[" << s.depth_ << "]: > bkt "
 481 << i << " size " << bktsize << " no lcp";
 482 else
 483 TLX_LOGC(ctx_.debug_recursion)
 484 << "Recurse[" << s.depth_ << "]: < bkt "
 485 << i << " size " << bktsize << " lcp "
 486 << int(s.splitter_lcp[i / 2] & 0x7F);
 487
 488 ScopedMultiTimerSwitch sts_inssort(mtimer_, "mkqs");
 490 sp, s.depth_ + (s.splitter_lcp[i / 2] & 0x7F));
 491 }
 492 else
 493 {
 494 if (i == Step::bktnum - 1)
 495 TLX_LOGC(ctx_.debug_recursion)
 496 << "Recurse[" << s.depth_ << "]: > bkt "
 497 << i << " size " << bktsize << " no lcp";
 498 else
 499 TLX_LOGC(ctx_.debug_recursion)
 500 << "Recurse[" << s.depth_ << "]: < bkt "
 501 << i << " size " << bktsize << " lcp "
 502 << int(s.splitter_lcp[i / 2] & 0x7F);
 503
 504 ss_stack_.emplace_back(
 505 ctx_, sp, s.depth_ + (s.splitter_lcp[i / 2] & 0x7F), bktcache);
 506 }
 507 }
 508 // i is odd -> bkt[i] is equal bucket
 509 else
 510 {
 511 if (bktsize == 0) {
 512 // empty bucket
 513 }
 514 else if (s.splitter_lcp[i / 2] & 0x80) {
 515 // equal-bucket has nullptr-terminated key, done.
 516 TLX_LOGC(ctx_.debug_recursion)
 517 << "Recurse[" << s.depth_ << "]: = bkt "
 518 << i << " size " << bktsize << " is done!";
 519 StringPtr spb = sp.copy_back();
 520
 521 if (sp.with_lcp) {
 522 spb.fill_lcp(
 523 s.depth_ + lcpKeyDepth(s.classifier.get_splitter(i / 2)));
 524 }
 525 ctx_.donesize(bktsize);
 526 }
 527 else if (bktsize < ctx_.smallsort_threshold)
 528 {
 529 TLX_LOGC(ctx_.debug_recursion)
 530 << "Recurse[" << s.depth_ << "]: = bkt "
 531 << i << " size " << bktsize << " lcp keydepth!";
 532
 533 ScopedMultiTimerSwitch sts_inssort(mtimer_, "mkqs");
 534 sort_mkqs_cache(sp, s.depth_ + sizeof(key_type));
 535 }
 536 else
 537 {
 538 TLX_LOGC(ctx_.debug_recursion)
 539 << "Recurse[" << s.depth_ << "]: = bkt "
 540 << i << " size " << bktsize << " lcp keydepth!";
 541
 542 ss_stack_.emplace_back(
 543 ctx_, sp, s.depth_ + sizeof(key_type), bktcache);
 544 }
 545 }
 546 }
 547 else
 548 {
 549 // finished sort
 550 assert(ss_stack_.size() > ss_front_);
 551
 552 // after full sort: calculate LCPs at this level
 553 ss_stack_.back().calculate_lcp(ctx_);
 554
 555 ss_stack_.pop_back();
 556 }
 557
 558 if (ctx_.enable_work_sharing && ctx_.threads_.has_idle()) {
 560 }
 561 }
 562 }
563
// NOTE(review): the method signature on original line 564 (presumably
// "void sample_sort_free_work() {") was dropped by the extraction.
// The method converts the bottom level of the sample sort stack into
// independent jobs enqueued on the thread pool, so idle threads can help.
 565 assert(ss_stack_.size() >= ss_front_);
 566
 567 if (ss_stack_.size() == ss_front_) {
 568 // ss_stack_ is empty, check other stack
 569 return mkqs_free_work();
 570 }
 571
 572 // convert top level of stack into independent jobs
 573 TLX_LOGC(ctx_.debug_jobs)
 574 << "Freeing top level of PS5SmallsortJob's sample_sort stack";
 575
 576 typedef SeqSampleSortStep Step;
 577 Step& s = ss_stack_[ss_front_];
 578
 579 while (s.idx_ < Step::bktnum)
 580 {
 581 size_t i = s.idx_++; // process the bucket s.idx_
 582
 583 size_t bktsize = s.bkt[i + 1] - s.bkt[i];
 584
 585 StringPtr sp = s.strptr_.flip(s.bkt[i], bktsize);
 586
 587 // i is even -> bkt[i] is less-than bucket
 588 if (i % 2 == 0)
 589 {
 590 if (bktsize == 0) {
 591 // empty bucket
 592 }
 593 else
 594 {
 595 if (i == Step::bktnum - 1)
 596 TLX_LOGC(ctx_.debug_recursion)
 597 << "Recurse[" << s.depth_ << "]: > bkt "
 598 << i << " size " << bktsize << " no lcp";
 599 else
 600 TLX_LOGC(ctx_.debug_recursion)
 601 << "Recurse[" << s.depth_ << "]: < bkt "
 602 << i << " size " << bktsize << " lcp "
 603 << int(s.splitter_lcp[i / 2] & 0x7F);
 604
 605 this->substep_add();
 606 ctx_.enqueue(this, sp,
 607 s.depth_ + (s.splitter_lcp[i / 2] & 0x7F));
 608 }
 609 }
 610 // i is odd -> bkt[i] is equal bucket
 611 else
 612 {
 613 if (bktsize == 0) {
 614 // empty bucket
 615 }
 616 else if (s.splitter_lcp[i / 2] & 0x80) {
 617 // equal-bucket has nullptr-terminated key, done.
 618 TLX_LOGC(ctx_.debug_recursion)
 619 << "Recurse[" << s.depth_ << "]: = bkt "
 620 << i << " size " << bktsize << " is done!";
 621 StringPtr spb = sp.copy_back();
 622
 623 if (sp.with_lcp) {
 624 spb.fill_lcp(s.depth_ + lcpKeyDepth(
 625 s.classifier.get_splitter(i / 2)));
 626 }
 627 ctx_.donesize(bktsize);
 628 }
 629 else
 630 {
 631 TLX_LOGC(ctx_.debug_recursion)
 632 << "Recurse[" << s.depth_ << "]: = bkt "
 633 << i << " size " << bktsize << " lcp keydepth!";
 634
 635 this->substep_add();
 636 ctx_.enqueue(this, sp, s.depth_ + sizeof(key_type));
 637 }
 638 }
 639 }
 640
 641 // shorten the current stack
 642 ++ss_front_;
 643 }
644
645 /*------------------------------------------------------------------------*/
646 //! Stack of Recursive MKQS Steps
647
648 static inline int cmp(const key_type& a, const key_type& b) {
649 return (a > b) ? 1 : (a < b) ? -1 : 0;
650 }
651
652 template <typename Type>
653 static inline size_t
654 med3(Type* A, size_t i, size_t j, size_t k) {
655 if (A[i] == A[j]) return i;
656 if (A[k] == A[i] || A[k] == A[j]) return k;
657 if (A[i] < A[j]) {
658 if (A[j] < A[k]) return j;
659 if (A[i] < A[k]) return k;
660 return i;
661 }
662 else {
663 if (A[j] > A[k]) return j;
664 if (A[i] < A[k]) return i;
665 return k;
666 }
667 }
668
 669 //! Insertion sort the strings only based on the cached characters.
// NOTE(review): the signature on original line 671 (presumably
// "insertion_sort_cache_block(const StringPtr& strptr, key_type* cache) {")
// was dropped by the extraction. The string array and the parallel cache
// array are permuted together, keeping them in sync.
 670 static inline void
 672 const StringSet& strings = strptr.active();
 673 size_t n = strptr.size();
 674 size_t pi, pj;
 675 for (pi = 1; --n > 0; ++pi) {
 676 typename StringSet::String tmps = std::move(strings.at(pi));
 677 key_type tmpc = cache[pi];
 678 for (pj = pi; pj > 0; --pj) {
 679 if (cache[pj - 1] <= tmpc)
 680 break;
 681 strings.at(pj) = std::move(strings.at(pj - 1));
 682 cache[pj] = cache[pj - 1];
 683 }
 684 strings.at(pj) = std::move(tmps);
 685 cache[pj] = tmpc;
 686 }
 687 }
688
 689 //! Insertion sort, but use cached characters if possible.
// NOTE(review): original lines 718 and 739 (the callee of the two "need
// deeper sort" recursions, each taking the arguments on the following
// lines -- presumably insertion_sort(...)) were dropped by the extraction.
 690 template <bool CacheDirty>
 691 static inline void
 692 insertion_sort_cache(const StringPtr& _strptr, key_type* cache, size_t depth) {
 693 StringPtr strptr = _strptr.copy_back();
 694
 695 if (strptr.size() <= 1) return;
 696 if (CacheDirty)
 697 return insertion_sort(strptr, depth, /* memory */ 0);
 698
 699 insertion_sort_cache_block(strptr, cache);
 700
// scan the now-sorted cache for runs of equal key words; each run is a
// group that must be sorted deeper (or is finished if null-terminated)
 701 size_t start = 0, bktsize = 1;
 702 for (size_t i = 0; i < strptr.size() - 1; ++i) {
 703 // group areas with equal cache values
 704 if (cache[i] == cache[i + 1]) {
 705 ++bktsize;
 706 continue;
 707 }
 708 // calculate LCP between group areas
 709 if (start != 0 && strptr.with_lcp) {
 710 int rlcp = lcpKeyType(cache[start - 1], cache[start]);
 711 strptr.set_lcp(start, depth + rlcp);
 712 // strptr.set_cache(start, getCharAtDepth(cache[start], rlcp));
 713 }
 714 // sort group areas deeper if needed
 715 if (bktsize > 1) {
 716 if (cache[start] & 0xFF) {
 717 // need deeper sort
 719 strptr.sub(start, bktsize), depth + sizeof(key_type),
 720 /* memory */ 0);
 721 }
 722 else {
 723 // cache contains nullptr-termination
 724 strptr.sub(start, bktsize).fill_lcp(depth + lcpKeyDepth(cache[start]));
 725 }
 726 }
 727 bktsize = 1;
 728 start = i + 1;
 729 }
 730 // tail of loop for last item
 731 if (start != 0 && strptr.with_lcp) {
 732 int rlcp = lcpKeyType(cache[start - 1], cache[start]);
 733 strptr.set_lcp(start, depth + rlcp);
 734 // strptr.set_cache(start, getCharAtDepth(cache[start], rlcp));
 735 }
 736 if (bktsize > 1) {
 737 if (cache[start] & 0xFF) {
 738 // need deeper sort
 740 strptr.sub(start, bktsize), depth + sizeof(key_type),
 741 /* memory */ 0);
 742 }
 743 else {
 744 // cache contains nullptr-termination
 745 strptr.sub(start, bktsize).fill_lcp(depth + lcpKeyDepth(cache[start]));
 746 }
 747 }
 748 }
749
// NOTE(review): dropped by the extraction: the class head (750, presumably
// "class MKQSStep"), the member declarations on lines 753-755 (strptr_,
// cache_, and the num_lt_/num_eq_/num_gt_/depth_ counters used below), the
// calculate_lcp() signature (883) and the two set_lcp calls inside it
// (885 and 890). Confirm against upstream tlx.
 751 {
 752 public:
 756 size_t idx_;
 757 unsigned char eq_recurse_;
 758 // typename StringPtr::StringSet::Char dchar_eq_, dchar_gt_;
 759 std::uint8_t lcp_lt_, lcp_eq_, lcp_gt_;
 760
// The constructor performs one full ternary partition step (Bentley-Sedgewick
// style) on the cached key words: < pivot | == pivot | > pivot.
 761 MKQSStep(Context& ctx, const StringPtr& strptr,
 762 key_type* cache, size_t depth, bool CacheDirty)
 763 : strptr_(strptr), cache_(cache), depth_(depth), idx_(0) {
 764 size_t n = strptr_.size();
 765
 766 const StringSet& strset = strptr_.active();
 767
 768 if (CacheDirty) {
 769 typename StringSet::Iterator it = strset.begin();
 770 for (size_t i = 0; i < n; ++i, ++it) {
 771 cache_[i] = get_key<key_type>(strset, *it, depth);
 772 }
 773 }
 774 // select median of 9
 775 size_t p = med3(
 776 cache_,
 777 med3(cache_, 0, n / 8, n / 4),
 778 med3(cache_, n / 2 - n / 8, n / 2, n / 2 + n / 8),
 779 med3(cache_, n - 1 - n / 4, n - 1 - n / 8, n - 3));
 780 // swap pivot to first position
 781 std::swap(strset.at(0), strset.at(p));
 782 std::swap(cache_[0], cache_[p]);
 783 // save the pivot value
 784 key_type pivot = cache_[0];
 785 // for immediate LCP calculation
 786 key_type max_lt = 0, min_gt = std::numeric_limits<key_type>::max();
 787
 788 // indexes into array:
 789 // 0 [pivot] 1 [===] leq [<<<] llt [???] rgt [>>>] req [===] n-1
 790 size_t leq = 1, llt = 1, rgt = n - 1, req = n - 1;
 791 while (true)
 792 {
 793 while (llt <= rgt)
 794 {
 795 int r = cmp(cache[llt], pivot);
 796 if (r > 0) {
 797 min_gt = std::min(min_gt, cache[llt]);
 798 break;
 799 }
 800 else if (r == 0) {
 801 std::swap(strset.at(leq), strset.at(llt));
 802 std::swap(cache[leq], cache[llt]);
 803 leq++;
 804 }
 805 else {
 806 max_lt = std::max(max_lt, cache[llt]);
 807 }
 808 ++llt;
 809 }
 810 while (llt <= rgt)
 811 {
 812 int r = cmp(cache[rgt], pivot);
 813 if (r < 0) {
 814 max_lt = std::max(max_lt, cache[rgt]);
 815 break;
 816 }
 817 else if (r == 0) {
 818 std::swap(strset.at(req), strset.at(rgt));
 819 std::swap(cache[req], cache[rgt]);
 820 req--;
 821 }
 822 else {
 823 min_gt = std::min(min_gt, cache[rgt]);
 824 }
 825 --rgt;
 826 }
 827 if (llt > rgt)
 828 break;
 829 std::swap(strset.at(llt), strset.at(rgt));
 830 std::swap(cache[llt], cache[rgt]);
 831 ++llt;
 832 --rgt;
 833 }
 834 // calculate size of areas = < and >, save into struct
 835 size_t num_leq = leq, num_req = n - 1 - req;
 836 num_eq_ = num_leq + num_req;
 837 num_lt_ = llt - leq;
 838 num_gt_ = req - rgt;
 839 assert(num_eq_ > 0);
 840 assert(num_lt_ + num_eq_ + num_gt_ == n);
 841
 842 // swap equal values from left to center
 843 const size_t size1 = std::min(num_leq, num_lt_);
 844 std::swap_ranges(strset.begin(), strset.begin() + size1,
 845 strset.begin() + llt - size1);
 846 std::swap_ranges(cache, cache + size1, cache + llt - size1);
 847
 848 // swap equal values from right to center
 849 const size_t size2 = std::min(num_req, num_gt_);
 850 std::swap_ranges(strset.begin() + llt, strset.begin() + llt + size2,
 851 strset.begin() + n - size2);
 852 std::swap_ranges(cache + llt, cache + llt + size2,
 853 cache + n - size2);
 854
 855 // No recursive sorting if pivot has a zero byte
 856 eq_recurse_ = (pivot & 0xFF);
 857
 858 // save LCP values for writing into LCP array after sorting further
 859 if (strptr_.with_lcp && num_lt_ > 0) {
 860 assert(max_lt == *std::max_element(
 861 cache_ + 0, cache + num_lt_));
 862
 863 lcp_lt_ = lcpKeyType(max_lt, pivot);
 864 // dchar_eq_ = getCharAtDepth(pivot, lcp_lt_);
 865 TLX_LOGC(ctx.debug_lcp) << "LCP lt with pivot: " << depth_ + lcp_lt_;
 866 }
 867
 868 // calculate equal area lcp: +1 for the equal zero termination byte
 869 lcp_eq_ = lcpKeyDepth(pivot);
 870
 871 if (strptr_.with_lcp && num_gt_ > 0) {
 872 assert(min_gt == *std::min_element(
 873 cache_ + num_lt_ + num_eq_, cache_ + n));
 874
 875 lcp_gt_ = lcpKeyType(pivot, min_gt);
 876 // dchar_gt_ = getCharAtDepth(min_gt, lcp_gt_);
 877 TLX_LOGC(ctx.debug_lcp) << "LCP pivot with gt: " << depth_ + lcp_gt_;
 878 }
 879
 880 ++ctx.base_sort_steps;
 881 }
 882
// calculate_lcp(): write the precomputed boundary LCPs into the LCP array
 884 if (strptr_.with_lcp && num_lt_ > 0) {
 886 // strptr_.set_cache(num_lt_, dchar_eq_);
 887 }
 888
 889 if (strptr_.with_lcp && num_gt_ > 0) {
 891 // strptr_.set_cache(num_lt_ + num_eq_, dchar_gt_);
 892 }
 893 }
 894 };
895
// Explicit stack of MKQS steps, with ms_front_ as artificial pop_front index.
 896 size_t ms_front_ = 0;
 897 std::vector<MKQSStep> ms_stack_;
 898
// NOTE(review): original line 1012 (the body of the work-sharing branch,
// presumably the call to mkqs_free_work()) was dropped by the extraction.
 899 void sort_mkqs_cache(const StringPtr& strptr, size_t depth) {
 900 assert(strcmp(mtimer_.running(), "mkqs") == 0);
 901
// tiny inputs: plain insertion sort, no key caching
 902 if (!ctx_.enable_sequential_mkqs ||
 903 strptr.size() < ctx_.inssort_threshold) {
 904 TLX_LOGC(ctx_.debug_jobs)
 905 << "insertion_sort() size "
 906 << strptr.size() << " depth " << depth;
 907
 908 ScopedMultiTimerSwitch sts_inssort(mtimer_, "inssort");
 909 insertion_sort(strptr.copy_back(), depth, /* memory */ 0);
 910 ctx_.donesize(strptr.size());
 911 return;
 912 }
 913
 914 TLX_LOGC(ctx_.debug_jobs)
 915 << "sort_mkqs_cache() size " << strptr.size() << " depth " << depth;
 916
 917 if (bktcache_.size() < strptr.size() * sizeof(key_type)) {
 918 bktcache_.destroy();
 919 bktcache_.resize(strptr.size() * sizeof(key_type));
 920 }
 921
 922 // reuse bktcache as keycache
 923 key_type* cache = reinterpret_cast<key_type*>(bktcache_.data());
 924
 925 assert(ms_front_ == 0);
 926 assert(ms_stack_.size() == 0);
 927
 928 // std::deque is much slower than std::vector, so we use an artificial
 929 // pop_front variable.
 930 ms_stack_.emplace_back(ctx_, strptr, cache, depth, true);
 931
 932 while (ms_stack_.size() > ms_front_)
 933 {
 934 MKQSStep& ms = ms_stack_.back();
 935 ++ms.idx_; // increment here, because stack may change
 936
 937 // process the lt-subsequence
 938 if (ms.idx_ == 1) {
 939 if (ms.num_lt_ == 0) {
 940 // empty subsequence
 941 }
 942 else if (ms.num_lt_ < ctx_.inssort_threshold) {
 943 ScopedMultiTimerSwitch sts_inssort(mtimer_, "inssort");
 944 insertion_sort_cache<false>(ms.strptr_.sub(0, ms.num_lt_),
 945 ms.cache_, ms.depth_);
 946 ctx_.donesize(ms.num_lt_);
 947 }
 948 else {
 949 ms_stack_.emplace_back(
 950 ctx_,
 951 ms.strptr_.sub(0, ms.num_lt_),
 952 ms.cache_, ms.depth_, false);
 953 }
 954 }
 955 // process the eq-subsequence
 956 else if (ms.idx_ == 2) {
 957 StringPtr sp = ms.strptr_.sub(ms.num_lt_, ms.num_eq_);
 958
 959 assert(ms.num_eq_ > 0);
 960
 961 if (!ms.eq_recurse_) {
 962 StringPtr spb = sp.copy_back();
 963 spb.fill_lcp(ms.depth_ + ms.lcp_eq_);
 964 ctx_.donesize(spb.size());
 965 }
 966 else if (ms.num_eq_ < ctx_.inssort_threshold) {
 967 ScopedMultiTimerSwitch sts_inssort(mtimer_, "inssort");
 968 insertion_sort_cache<true>(sp, ms.cache_ + ms.num_lt_,
 969 ms.depth_ + sizeof(key_type));
 970 ctx_.donesize(ms.num_eq_);
 971 }
 972 else {
 973 ms_stack_.emplace_back(
 974 ctx_, sp,
 975 ms.cache_ + ms.num_lt_,
 976 ms.depth_ + sizeof(key_type), true);
 977 }
 978 }
 979 // process the gt-subsequence
 980 else if (ms.idx_ == 3) {
 981 StringPtr sp = ms.strptr_.sub(
 982 ms.num_lt_ + ms.num_eq_, ms.num_gt_);
 983
 984 if (ms.num_gt_ == 0) {
 985 // empty subsequence
 986 }
 987 else if (ms.num_gt_ < ctx_.inssort_threshold) {
 988 ScopedMultiTimerSwitch sts_inssort(mtimer_, "inssort");
 989 insertion_sort_cache<false>(
 990 sp, ms.cache_ + ms.num_lt_ + ms.num_eq_, ms.depth_);
 991 ctx_.donesize(ms.num_gt_);
 992 }
 993 else {
 994 ms_stack_.emplace_back(
 995 ctx_, sp,
 996 ms.cache_ + ms.num_lt_ + ms.num_eq_,
 997 ms.depth_, false);
 998 }
 999 }
 1000 // calculate lcps
 1001 else {
 1002 // finished sort
 1003 assert(ms_stack_.size() > ms_front_);
 1004
 1005 // calculate LCP after the three parts are sorted
 1006 ms_stack_.back().calculate_lcp();
 1007
 1008 ms_stack_.pop_back();
 1009 }
 1010
 1011 if (ctx_.enable_work_sharing && ctx_.threads_.has_idle()) {
 1013 }
 1014 }
 1015 }
1016
// NOTE(review): dropped by the extraction: the method signature on original
// line 1017 (presumably "void mkqs_free_work() {") and line 1032 (the
// reference into the stack used below, presumably
// "MKQSStep& ms = ms_stack_[ms_front_];"). The method hands up to 8 bottom
// levels of the MKQS stack to the thread pool as independent jobs.
 1018 assert(ms_stack_.size() >= ms_front_);
 1019
 1020 for (unsigned int fl = 0; fl < 8; ++fl)
 1021 {
 1022 if (ms_stack_.size() == ms_front_) {
 1023 return;
 1024 }
 1025
 1026 TLX_LOGC(ctx_.debug_jobs)
 1027 << "Freeing top level of PS5SmallsortJob's mkqs stack - size "
 1028 << ms_stack_.size();
 1029
 1030 // convert top level of stack into independent jobs
 1031
 1033
 1034 if (ms.idx_ == 0 && ms.num_lt_ != 0)
 1035 {
 1036 this->substep_add();
 1037 ctx_.enqueue(this, ms.strptr_.sub(0, ms.num_lt_), ms.depth_);
 1038 }
 1039 if (ms.idx_ <= 1) // st.num_eq > 0 always
 1040 {
 1041 assert(ms.num_eq_ > 0);
 1042
 1043 StringPtr sp = ms.strptr_.sub(ms.num_lt_, ms.num_eq_);
 1044
 1045 if (ms.eq_recurse_) {
 1046 this->substep_add();
 1047 ctx_.enqueue(this, sp, ms.depth_ + sizeof(key_type));
 1048 }
 1049 else {
 1050 StringPtr spb = sp.copy_back();
 1051 spb.fill_lcp(ms.depth_ + ms.lcp_eq_);
 1052 ctx_.donesize(ms.num_eq_);
 1053 }
 1054 }
 1055 if (ms.idx_ <= 2 && ms.num_gt_ != 0)
 1056 {
 1057 this->substep_add();
 1058 ctx_.enqueue(
 1059 this, ms.strptr_.sub(ms.num_lt_ + ms.num_eq_, ms.num_gt_),
 1060 ms.depth_);
 1061 }
 1062
 1063 // shorten the current stack
 1064 ++ms_front_;
 1065 }
 1066 }
1067
1068 /*------------------------------------------------------------------------*/
1069 // Called When PS5SmallsortJob is Finished
1070
// NOTE(review): original line 1088 (between the LCP loops and "delete this",
// presumably the notification of the parent step, e.g.
// "if (pstep_) pstep_->substep_notify_done();") was dropped by the
// extraction -- confirm against upstream tlx.
 1071 void substep_all_done() final {
 1072 TLX_LOGC(ctx_.debug_recursion)
 1073 << "SmallSort[" << depth_ << "] "
 1074 << "all substeps done -> LCP calculation";
 1075
// drain both deferred-LCP stacks now that all shared-out substeps finished
 1076 while (ms_front_ > 0) {
 1077 TLX_LOGC(ctx_.debug_lcp)
 1078 << "SmallSort[" << depth_ << "] ms_front_: " << ms_front_;
 1079 ms_stack_[--ms_front_].calculate_lcp();
 1080 }
 1081
 1082 while (ss_front_ > 0) {
 1083 TLX_LOGC(ctx_.debug_lcp)
 1084 << "SmallSort[" << depth_ << "] ss_front_: " << ss_front_;
 1085 ss_stack_[--ss_front_].calculate_lcp(ctx_);
 1086 }
 1087
 1089 delete this;
 1090 }
 1091};
1092
1093/******************************************************************************/
1094//! PS5BigSortStep Out-of-Place Parallel Sample Sort with Separate Jobs
1095
// NOTE(review): dropped by the extraction: the class head (1097, presumably
// "class PS5BigSortStep : public PS5SortStep"), the StringSet typedef
// (1100), the pstep_ member (1107), the strptr_ member (1110), and the
// bkt_/bktcache_ container declarations (1131 and 1133) that are resized in
// the constructor below. Confirm against upstream tlx.
 1096template <typename Context, typename StringPtr>
 1098{
 1099public:
 1101 typedef typename StringSet::Iterator StrIterator;
 1102 typedef typename Context::key_type key_type;
 1103
 1104 //! context
 1105 Context& ctx_;
 1106 //! parent sort step for notification
 1108
 1109 //! string pointers, size, and current sorting depth
 1111 size_t depth_;
 1112
 1113 //! number of parts into which the strings were split
 1114 size_t parts_;
 1115 //! size of all parts except the last
 1116 size_t psize_;
 1117 //! number of threads still working
 1118 std::atomic<size_t> pwork_;
 1119
 1120 //! classifier instance and variables (contains splitter tree
 1121 typename Context::Classify classifier_;
 1122
 1123 static const size_t treebits_ = Context::Classify::treebits;
 1124 static const size_t num_splitters_ = Context::Classify::num_splitters;
 1125 static const size_t bktnum_ = 2 * num_splitters_ + 1;
 1126
 1127 //! LCPs of splitters, needed for recursive calls
 1128 unsigned char splitter_lcp_[num_splitters_ + 1];
 1129
 1130 //! individual bucket array of threads, keep bkt[0] for DistributeJob
 1132 //! bucket ids cache, created by classifier and later counted
 1134
1135 /*------------------------------------------------------------------------*/
1136 // Constructor
1137
1138 PS5BigSortStep(Context& ctx, PS5SortStep* pstep,
1139 const StringPtr& strptr, size_t depth)
1140 : ctx_(ctx), pstep_(pstep), strptr_(strptr), depth_(depth) {
1141 // calculate number of parts
1142 parts_ = strptr_.size() / ctx.sequential_threshold() * 2;
1143 if (parts_ == 0) parts_ = 1;
1144
1145 bkt_.resize(parts_);
1146 bktcache_.resize(parts_);
1147
1148 psize_ = (strptr.size() + parts_ - 1) / parts_;
1149
1150 TLX_LOGC(ctx_.debug_steps)
1151 << "enqueue depth=" << depth_
1152 << " size=" << strptr_.size()
1153 << " parts=" << parts_
1154 << " psize=" << psize_
1155 << " flip=" << strptr_.flipped();
1156
1157 ctx.threads_.enqueue([this]() { sample(); });
1158 ++ctx.para_ss_steps;
1159 }
1160
1161 virtual ~PS5BigSortStep() { }
1162
1163 /*------------------------------------------------------------------------*/
1164 // Sample Step
1165
1166 void sample() {
1167 ScopedMultiTimer smt(ctx_.mtimer, "para_ss");
1168 TLX_LOGC(ctx_.debug_jobs) << "Process SampleJob @ " << this;
1169
1170 const size_t oversample_factor = 2;
1171 size_t sample_size = oversample_factor * num_splitters_;
1172
1173 const StringSet& strset = strptr_.active();
1174 size_t n = strset.size();
1175
1176 simple_vector<key_type> samples(sample_size);
1177
1178 std::minstd_rand rng(reinterpret_cast<uintptr_t>(samples.data()));
1179
1180 for (size_t i = 0; i < sample_size; ++i)
1181 samples[i] = get_key_at<key_type>(strset, rng() % n, depth_);
1182
1183 std::sort(samples.begin(), samples.end());
1184
1185 classifier_.build(samples.data(), sample_size, splitter_lcp_);
1186
1187 // create new jobs
1188 pwork_ = parts_;
1189 for (unsigned int p = 0; p < parts_; ++p) {
1190 ctx_.threads_.enqueue([this, p]() { count(p); });
1191 }
1192 }
1193
1194 /*------------------------------------------------------------------------*/
1195 // Counting Step
1196
1197 void count(unsigned int p) {
1198 ScopedMultiTimer smt(ctx_.mtimer, "para_ss");
1199 TLX_LOGC(ctx_.debug_jobs) << "Process CountJob " << p << " @ " << this;
1200
1201 const StringSet& strset = strptr_.active();
1202
1203 StrIterator strB = strset.begin() + p * psize_;
1204 StrIterator strE = strset.begin() + std::min((p + 1) * psize_, strptr_.size());
1205 if (strE < strB) strE = strB;
1206
1207 bktcache_[p].resize(strE - strB);
1208 std::uint16_t* bktcache = bktcache_[p].data();
1209 classifier_.classify(strset, strB, strE, bktcache, depth_);
1210
1211 bkt_[p].resize(bktnum_ + (p == 0 ? 1 : 0));
1212 size_t* bkt = bkt_[p].data();
1213 memset(bkt, 0, bktnum_ * sizeof(size_t));
1214
1215 for (std::uint16_t* bc = bktcache; bc != bktcache + (strE - strB); ++bc)
1216 ++bkt[*bc];
1217
1218 if (--pwork_ == 0)
1220 }
1221
1223 ScopedMultiTimer smt(ctx_.mtimer, "para_ss");
1224 TLX_LOGC(ctx_.debug_jobs) << "Finishing CountJob " << this << " with prefixsum";
1225
1226 // abort sorting if we're measuring only the top level
1227 if (ctx_.use_only_first_sortstep)
1228 return;
1229
1230 // inclusive prefix sum over bkt
1231 size_t sum = 0;
1232 for (unsigned int i = 0; i < bktnum_; ++i) {
1233 for (unsigned int p = 0; p < parts_; ++p) {
1234 bkt_[p][i] = (sum += bkt_[p][i]);
1235 }
1236 }
1237 assert(sum == strptr_.size());
1238
1239 // create new jobs
1240 pwork_ = parts_;
1241 for (unsigned int p = 0; p < parts_; ++p) {
1242 ctx_.threads_.enqueue([this, p]() { distribute(p); });
1243 }
1244 }
1245
1246 /*------------------------------------------------------------------------*/
1247 // Distribute Step
1248
1249 void distribute(unsigned int p) {
1250 ScopedMultiTimer smt(ctx_.mtimer, "para_ss");
1251 TLX_LOGC(ctx_.debug_jobs) << "Process DistributeJob " << p << " @ " << this;
1252
1253 const StringSet& strset = strptr_.active();
1254
1255 StrIterator strB = strset.begin() + p * psize_;
1256 StrIterator strE = strset.begin() + std::min((p + 1) * psize_, strptr_.size());
1257 if (strE < strB) strE = strB;
1258
1259 // get alternative shadow pointer array
1260 const StringSet& sorted = strptr_.shadow();
1261 typename StringSet::Iterator sbegin = sorted.begin();
1262
1263 std::uint16_t* bktcache = bktcache_[p].data();
1264 size_t* bkt = bkt_[p].data();
1265
1266 for (StrIterator str = strB; str != strE; ++str, ++bktcache)
1267 *(sbegin + --bkt[*bktcache]) = std::move(*str);
1268
1269 if (p != 0) // p = 0 is needed for recursion into bkts
1270 bkt_[p].destroy();
1271
1272 bktcache_[p].destroy();
1273
1274 if (--pwork_ == 0)
1276 }
1277
1279 TLX_LOGC(ctx_.debug_jobs)
1280 << "Finishing DistributeJob " << this << " with enqueuing subjobs";
1281
1282 size_t* bkt = bkt_[0].data();
1283 assert(bkt);
1284
1285 // first processor's bkt pointers are boundaries between bkts, just add sentinel:
1286 assert(bkt[0] == 0);
1287 bkt[bktnum_] = strptr_.size();
1288
1289 // keep anonymous subjob handle while creating subjobs
1290 this->substep_add();
1291
1292 size_t i = 0;
1293 while (i < bktnum_ - 1)
1294 {
1295 // i is even -> bkt[i] is less-than bucket
1296 size_t bktsize = bkt[i + 1] - bkt[i];
1297 if (bktsize == 0) {
1298 // empty bucket
1299 }
1300 else if (bktsize == 1) { // just one string pointer, copyback
1301 strptr_.flip(bkt[i], 1).copy_back();
1302 ctx_.donesize(1);
1303 }
1304 else
1305 {
1306 TLX_LOGC(ctx_.debug_recursion)
1307 << "Recurse[" << depth_ << "]: < bkt " << bkt[i]
1308 << " size " << bktsize << " lcp "
1309 << int(splitter_lcp_[i / 2] & 0x7F);
1310 this->substep_add();
1311 ctx_.enqueue(this, strptr_.flip(bkt[i], bktsize),
1312 depth_ + (splitter_lcp_[i / 2] & 0x7F));
1313 }
1314 ++i;
1315 // i is odd -> bkt[i] is equal bucket
1316 bktsize = bkt[i + 1] - bkt[i];
1317 if (bktsize == 0) {
1318 // empty bucket
1319 }
1320 else if (bktsize == 1) { // just one string pointer, copyback
1321 strptr_.flip(bkt[i], 1).copy_back();
1322 ctx_.donesize(1);
1323 }
1324 else
1325 {
1326 if (splitter_lcp_[i / 2] & 0x80) {
1327 // equal-bucket has nullptr-terminated key, done.
1328 TLX_LOGC(ctx_.debug_recursion)
1329 << "Recurse[" << depth_ << "]: = bkt " << bkt[i]
1330 << " size " << bktsize << " is done!";
1331 StringPtr sp = strptr_.flip(bkt[i], bktsize).copy_back();
1332 sp.fill_lcp(
1333 depth_ + lcpKeyDepth(classifier_.get_splitter(i / 2)));
1334 ctx_.donesize(bktsize);
1335 }
1336 else {
1337 TLX_LOGC(ctx_.debug_recursion)
1338 << "Recurse[" << depth_ << "]: = bkt " << bkt[i]
1339 << " size " << bktsize << " lcp keydepth!";
1340 this->substep_add();
1341 ctx_.enqueue(this, strptr_.flip(bkt[i], bktsize),
1342 depth_ + sizeof(key_type));
1343 }
1344 }
1345 ++i;
1346 }
1347
1348 size_t bktsize = bkt[i + 1] - bkt[i];
1349
1350 if (bktsize == 0) {
1351 // empty bucket
1352 }
1353 else if (bktsize == 1) { // just one string pointer, copyback
1354 strptr_.flip(bkt[i], 1).copy_back();
1355 ctx_.donesize(1);
1356 }
1357 else {
1358 TLX_LOGC(ctx_.debug_recursion)
1359 << "Recurse[" << depth_ << "]: > bkt " << bkt[i]
1360 << " size " << bktsize << " no lcp";
1361 this->substep_add();
1362 ctx_.enqueue(this, strptr_.flip(bkt[i], bktsize), depth_);
1363 }
1364
1365 this->substep_notify_done(); // release anonymous subjob handle
1366
1367 if (!strptr_.with_lcp)
1368 bkt_[0].destroy();
1369 }
1370
1371 /*------------------------------------------------------------------------*/
1372 // After Recursive Sorting
1373
1374 void substep_all_done() final {
1375 ScopedMultiTimer smt(ctx_.mtimer, "para_ss");
1376 if (strptr_.with_lcp) {
1377 TLX_LOGC(ctx_.debug_steps)
1378 << "pSampleSortStep[" << depth_ << "]: all substeps done.";
1379
1380 ps5_sample_sort_lcp<bktnum_>(
1381 ctx_, classifier_, strptr_, depth_, bkt_[0].data());
1382 bkt_[0].destroy();
1383 }
1384
1386 delete this;
1387 }
1388};
1389
1390/******************************************************************************/
1391// PS5Context::enqueue()
1392
1393template <typename Parameters>
1394template <typename StringPtr>
1396 PS5SortStep* pstep, const StringPtr& strptr, size_t depth) {
1397 if (this->enable_parallel_sample_sort &&
1398 (strptr.size() > sequential_threshold() ||
1399 this->use_only_first_sortstep)) {
1400 new PS5BigSortStep<PS5Context, StringPtr>(*this, pstep, strptr, depth);
1401 }
1402 else {
1403 if (strptr.size() < (1LLU << 32)) {
1405 *this, pstep, strptr, depth);
1406 threads_.enqueue([j]() { j->run(); });
1407 }
1408 else {
1410 *this, pstep, strptr, depth);
1411 threads_.enqueue([j]() { j->run(); });
1412 }
1413 }
1414}
1415
1416/******************************************************************************/
1417// Externally Callable Sorting Methods
1418
1419//! Main Parallel Sample Sort Function. See below for more convenient wrappers.
1420template <typename PS5Parameters, typename StringPtr>
1421void parallel_sample_sort_base(const StringPtr& strptr, size_t depth) {
1422
1423 using Context = PS5Context<PS5Parameters>;
1424 Context ctx(std::thread::hardware_concurrency());
1425 ctx.total_size = strptr.size();
1426 ctx.rest_size = strptr.size();
1427 ctx.num_threads = ctx.threads_.size();
1428
1429 MultiTimer timer;
1430 timer.start("sort");
1431
1432 ctx.enqueue(/* pstep */ nullptr, strptr, depth);
1433 ctx.threads_.loop_until_empty();
1434
1435 timer.stop();
1436
1437 assert(!ctx.enable_rest_size || ctx.rest_size == 0);
1438
1439 using BigSortStep = PS5BigSortStep<Context, StringPtr>;
1440
1441 TLX_LOGC(ctx.debug_result)
1442 << "RESULT"
1443 << " sizeof(key_type)=" << sizeof(typename PS5Parameters::key_type)
1444 << " splitter_treebits=" << size_t(BigSortStep::treebits_)
1445 << " num_splitters=" << size_t(BigSortStep::num_splitters_)
1446 << " num_threads=" << ctx.num_threads
1447 << " enable_work_sharing=" << size_t(ctx.enable_work_sharing)
1448 << " use_restsize=" << size_t(ctx.enable_rest_size)
1449 << " tm_para_ss=" << ctx.mtimer.get("para_ss")
1450 << " tm_seq_ss=" << ctx.mtimer.get("sequ_ss")
1451 << " tm_mkqs=" << ctx.mtimer.get("mkqs")
1452 << " tm_inssort=" << ctx.mtimer.get("inssort")
1453 << " tm_total=" << ctx.mtimer.total()
1454 << " tm_idle="
1455 << (ctx.num_threads * timer.total()) - ctx.mtimer.total()
1456 << " steps_para_sample_sort=" << ctx.para_ss_steps
1457 << " steps_seq_sample_sort=" << ctx.sequ_ss_steps
1458 << " steps_base_sort=" << ctx.base_sort_steps;
1459}
1460
1461//! Parallel Sample Sort Function for a generic StringSet, this allocates the
1462//! shadow array for flipping.
1463template <typename PS5Parameters, typename StringPtr>
1466 const StringPtr& strptr, size_t depth, size_t memory = 0) {
1467 tlx::unused(memory);
1468
1469 typedef typename StringPtr::StringSet StringSet;
1470 const StringSet& strset = strptr.active();
1471
1473 typedef typename StringSet::Container Container;
1474
1475 // allocate shadow pointer array
1476 Container shadow = strset.allocate(strset.size());
1477 StringShadowPtr new_strptr(strset, StringSet(shadow));
1478
1479 parallel_sample_sort_base<PS5Parameters>(new_strptr, depth);
1480
1481 StringSet::deallocate(shadow);
1482}
1483
1484//! Parallel Sample Sort Function for a generic StringSet with LCPs, this
1485//! allocates the shadow array for flipping.
1486template <typename PS5Parameters, typename StringPtr>
1489 const StringPtr& strptr, size_t depth, size_t memory = 0) {
1490 tlx::unused(memory);
1491
1492 typedef typename StringPtr::StringSet StringSet;
1493 typedef typename StringPtr::LcpType LcpType;
1494 const StringSet& strset = strptr.active();
1495
1497 typedef typename StringSet::Container Container;
1498
1499 // allocate shadow pointer array
1500 Container shadow = strset.allocate(strset.size());
1501 StringShadowLcpPtr new_strptr(strset, StringSet(shadow), strptr.lcp());
1502
1503 parallel_sample_sort_base<PS5Parameters>(new_strptr, depth);
1504
1505 StringSet::deallocate(shadow);
1506}
1507
1508//! Parallel Sample Sort Function with default parameter size for a generic
1509//! StringSet.
1510template <typename StringPtr>
1512 const StringPtr& strptr, size_t depth, size_t memory) {
1513 return parallel_sample_sort_params<PS5ParametersDefault>(
1514 strptr, depth, memory);
1515}
1516
1517} // namespace sort_strings_detail
1518} // namespace tlx
1519
1520#endif // !TLX_SORT_STRINGS_PARALLEL_SAMPLE_SORT_HEADER
1521
1522/******************************************************************************/
MultiTimer can be used to measure time usage of different phases in a program or algorithm.
void start(const char *timer)
start new timer phase, stop the currently running one.
void stop()
stop the currently running timer.
const char * running() const
return name of currently running timer.
double total() const
return total duration of all timers.
RAII Scoped MultiTimer switcher: switches the timer of a MultiTimer on construction and back to old o...
Independent RAII Scoped MultiTimer: contains a MultiTimer which is started with the given timer,...
Simpler non-growing vector without initialization.
iterator data() noexcept
return iterator to beginning of vector
iterator begin() noexcept
return mutable iterator to first element
iterator end() noexcept
return mutable iterator beyond last element
ThreadPool starts a fixed number p of std::threads which process Jobs that are enqueued into a concur...
PS5BigSortStep Out-of-Place Parallel Sample Sort with Separate Jobs.
simple_vector< simple_vector< size_t > > bkt_
individual bucket array of threads, keep bkt[0] for DistributeJob
simple_vector< simple_vector< std::uint16_t > > bktcache_
bucket ids cache, created by classifier and later counted
PS5SortStep * pstep_
parent sort step for notification
std::atomic< size_t > pwork_
number of threads still working
size_t psize_
size of all parts except the last
StringPtr strptr_
string pointers, size, and current sorting depth
Context::Classify classifier_
classifier instance and variables (contains splitter tree)
void substep_all_done() final
Pure virtual function called by substep when all substeps are done.
PS5BigSortStep(Context &ctx, PS5SortStep *pstep, const StringPtr &strptr, size_t depth)
unsigned char splitter_lcp_[num_splitters_+1]
LCPs of splitters, needed for recursive calls.
size_t parts_
number of parts into which the strings were split
Parallel Super Scalar String Sample Sort Context.
void enqueue(PS5SortStep *sstep, const StringPtr &strptr, size_t depth)
enqueue a new job in the thread pool
void donesize(size_t n)
decrement number of unordered strings
std::atomic< size_t > rest_size
number of remaining strings to sort
PS5Context(size_t _thread_num)
context constructor
size_t sequential_threshold()
return sequential sorting threshold
size_t num_threads
number of threads overall
std::atomic< size_t > para_ss_steps
counters
MultiTimer mtimer
timers for individual sorting steps
Parallel Super Scalar String Sample Sort Parameter Struct.
static const size_t smallsort_threshold
threshold to run sequential small sorts
static const bool enable_parallel_sample_sort
enable/disable various sorting levels
static const unsigned TreeBits
depth of classification tree used in sample sorts
size_t key_type
key type for sample sort: 32-bit or 64-bit
static const size_t inssort_threshold
threshold to switch to insertion sort
static const bool enable_rest_size
whether the base sequential_threshold() on the remaining unsorted string set or on the whole string s...
static const bool enable_work_sharing
enable work freeing
static const bool use_only_first_sortstep
terminate sort after first parallel sample sort step
MKQSStep(Context &ctx, const StringPtr &strptr, key_type *cache, size_t depth, bool CacheDirty)
SeqSampleSortStep(Context &ctx, const StringPtr &strptr, size_t depth, std::uint16_t *bktcache)
SampleSort: Non-Recursive In-Place Sequential Sample Sort for Small Sorts.
void sort_mkqs_cache(const StringPtr &strptr, size_t depth)
static void insertion_sort_cache(const StringPtr &_strptr, key_type *cache, size_t depth)
Insertion sort, but use cached characters if possible.
static size_t med3(Type *A, size_t i, size_t j, size_t k)
static void insertion_sort_cache_block(const StringPtr &strptr, key_type *cache)
Insertion sort the strings only based on the cached characters.
PS5SmallsortJob(Context &ctx, PS5SortStep *pstep, const StringPtr &strptr, size_t depth)
void substep_all_done() final
Pure virtual function called by substep when all substeps are done.
void sort_sample_sort(const StringPtr &strptr, size_t depth)
static int cmp(const key_type &a, const key_type &b)
Stack of Recursive MKQS Steps.
PS5SortStep Top-Level Class to Keep Track of Substeps.
virtual void substep_all_done()=0
Pure virtual function called by substep when all substeps are done.
void substep_notify_done()
Notify superstep that the currently substep is done.
std::atomic< size_t > substep_working_
Number of substeps still running.
Sample Sort Classification Tree Unrolled, Interleaved, and with Perfect Tree Index Calculations.
Objectified string array pointer array.
size_t size() const
return valid length
const StringSet & active() const
return currently active array
StringPtr sub(size_t offset, size_t sub_size) const
Advance (both) pointers by given offset, return sub-array.
void fill_lcp(const LcpType &) const
fill entire LCP array with v, excluding the first lcp[0] position!
static const bool with_lcp
if we want to save the LCPs
void set_lcp(size_t, const LcpType &) const
set the i-th lcp to v and check its value
Objectified string array pointer and shadow pointer array for out-of-place swapping of pointers.
Objectified string array pointer and shadow pointer array for out-of-place swapping of pointers.
unsigned ctz(Integral x)
unsigned clz(Integral x)
#define TLX_LOGC(cond)
Explicitly specify the condition for logging.
Definition core.hpp:137
static unsigned char lcpKeyType(const KeyType &a, const KeyType &b)
LCP calculation of Splitter Strings.
static unsigned char lcpKeyDepth(const KeyType &a)
static enable_if<!StringPtr::with_lcp, void >::type insertion_sort(const StringPtr &strptr, size_t depth, size_t)
Generic insertion sort for abstract string sets.
void parallel_sample_sort_base(const StringPtr &strptr, size_t depth)
Main Parallel Sample Sort Function. See below for more convenient wrappers.
static unsigned char getCharAtDepth(const KeyType &a, unsigned char d)
return the d-th character in the (swapped) key
void parallel_sample_sort(const StringPtr &strptr, size_t depth, size_t memory)
Parallel Sample Sort Function with default parameter size for a generic StringSet.
enable_if<!StringPtr::with_lcp, void >::type parallel_sample_sort_params(const StringPtr &strptr, size_t depth, size_t memory=0)
Parallel Sample Sort Function for a generic StringSet, this allocates the shadow array for flipping.
void ps5_sample_sort_lcp(const Context &ctx, const Classify &classifier, const StringPtr &strptr, size_t depth, const BktSizeType *bkt)
LCP Calculation for Finished Sample Sort Steps.
void unused(Types &&...)
Definition unused.hpp:20
SFINAE enable_if – copy of std::enable_if<> with less extra cruft.
Definition enable_if.hpp:22