star-line

Structure for accelerating line importance sampling
git clone git://git.meso-star.fr/star-line.git
Log | Files | Refs | README | LICENSE

commit 1f0bb151f24016699eb94a95847c107e11132fea
parent 30f143c5a4add2934d5cf4fe9a47eb14b16c8d35
Author: Vincent Forest <vincent.forest@meso-star.com>
Date:   Mon, 27 Apr 2026 17:03:35 +0200

Prepare for parallelization of the tree building

Add a new internal function that partitions the lines in the first few
levels of the tree by breadth-first, until there are enough nodes to
process in parallel.

Each resulting node is then treated as the root of a subtree, which is
constructed as before. These subtree are currently constructed
sequentially, but could be constructed in parallel. Finally, the
polyline of the top-level nodes - that is, the nodes located above the
roots of the subtree - is constructed once the subtree construction is
complete. This step will certainly remain sequential since it involves
only a few nodes.

The final tree constructed in this way is strictly identical to the one
constructed by the previous function; only the internal order of the
nodes is changed. However, the partitioning and the construction of the
polyline are identical bit by bit. This was verified by comparing the
mesh of the root node of two trees constructed from the same data, but
using either the old routine or the new one.

At this point, this new construction function needs to be parallelize,
which will certainly require a reevaluation of how data is stored and
allocated: to avoid conflicts between threads, each thread will likely
need its own buffer of nodes and vertices. But before we even discuss
parallelization, the progress messages need to be reviewed. These
display the progress of each subtree's construction, not the progress of
the entire tree's construction.

Diffstat:
MREADME.md | 2+-
Mconfig.mk | 4++--
Msln.pc.in | 2+-
Msrc/sln.h | 6+++++-
Msrc/sln_tree.c | 24++++++++++++++++++------
Msrc/sln_tree_build.c | 235++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
Msrc/sln_tree_c.h | 4++++
7 files changed, 245 insertions(+), 32 deletions(-)

diff --git a/README.md b/README.md @@ -4,7 +4,7 @@ Build an acceleration structure to speed up line importance sampling. ## Requirements -- C compiler (C99) +- C compiler (C99) with OpenMP support - POSIX make - pkg-config - Line By Line Utils diff --git a/config.mk b/config.mk @@ -36,8 +36,8 @@ SHTR_VERSION = 0 SSP_VERSION = 0.15 SBB_VERSION = 0.0 -INCS = $$($(PKG_CONFIG) $(PCFLAGS) --cflags lblu rsys shtr) -LIBS = $$($(PKG_CONFIG) $(PCFLAGS) --libs lblu rsys shtr) -lm +INCS = $$($(PKG_CONFIG) $(PCFLAGS) --cflags lblu rsys shtr) -fopenmp +LIBS = $$($(PKG_CONFIG) $(PCFLAGS) --libs lblu rsys shtr) -fopenmp -lm ################################################################################ # Compilation options diff --git a/sln.pc.in b/sln.pc.in @@ -8,5 +8,5 @@ Name: Star-Line Description: Star Line library Version: @VERSION@ Libs: -L${libdir} -lsln -Libs.private: -lm +Libs.private: -fopenmp -lm CFlags: -I${includedir} diff --git a/src/sln.h b/src/sln.h @@ -128,6 +128,9 @@ struct sln_tree_create_args { * two. For a binary tree, both methods should produce exactly the same tree, * down to the bit */ int collapse_polylines; + + /* Advice on the number of threads to use */ + unsigned nthreads_hint; }; #define SLN_TREE_CREATE_ARGS_DEFAULT__ { \ NULL, /* metadata */ \ @@ -141,7 +144,8 @@ struct sln_tree_create_args { SLN_MESH_UPPER, /* Mesh type */ \ 2, /* Arity */ \ 1, /* Number of lines per leaf */ \ - 0 /* Collapse polylines */ \ + 0, /* Collapse polylines */ \ + (unsigned)(-1), /* #threads hint */ \ } static const struct sln_tree_create_args SLN_TREE_CREATE_ARGS_DEFAULT = SLN_TREE_CREATE_ARGS_DEFAULT__; diff --git a/src/sln_tree.c b/src/sln_tree.c @@ -27,6 +27,8 @@ #include <rsys/cstr.h> #include <rsys/math.h> +#include <omp.h> + struct stream { const char* name; FILE* fp; @@ -407,12 +409,13 @@ node_child_count(const struct sln_node* node, const unsigned tree_arity) ASSERT(nlines); /* Based on the arity of the tree, calculate how the lines of the node are - * distributed among its children. The policy below prioritizes an equal - * distribution of lines among the children over maintaining the tree's arity. - * Thus, if a smaller number of children results in a more equitable - * distribution, this option is preferred over ensuring a number of children - * equal to the tree's arity. In other words, the tree's balance is - * prioritized. */ + * distributed among its children. For low lines count, i.e. when the minimum + * number of lines par child is less than the tree arity, the policy below + * prioritizes an equal distribution of lines among the children over + * maintaining the tree's arity. Thus, if a smaller number of children + * results in a more equitable distribution, this option is preferred over + * ensuring a number of children equal to the tree's arity. In other words, + * the tree's balance is prioritized. */ nlines_per_child = (nlines + tree_arity-1/*ceil*/)/tree_arity; /* From the previous line repartition, compute the number of children */ @@ -433,6 +436,7 @@ sln_tree_create struct sln_tree** out_tree) { struct sln_tree* tree = NULL; + unsigned nthreads_max = 0; res_T res = RES_OK; if(!device || !out_tree) { res = RES_BAD_ARG; goto error; } @@ -445,7 +449,15 @@ sln_tree_create SHTR(isotope_metadata_ref_get(args->metadata)); tree->args = *args; + /* Set the #threads to match the maximum number of available threads */ + nthreads_max = (unsigned)MMAX(omp_get_max_threads(), omp_get_num_procs()); + tree->args.nthreads_hint = MMIN(tree->args.nthreads_hint, nthreads_max); + +#if 1 res = tree_build(tree); +#else + res = tree_build2(tree); +#endif if(res != RES_OK) goto error; exit: diff --git a/src/sln_tree_build.c b/src/sln_tree_build.c @@ -703,7 +703,10 @@ error: goto exit; } static res_T -build_polylines(struct sln_tree* tree) +build_polylines + (struct sln_tree* tree, + const size_t root_index, + const size_t nodes_count) /* Total number of nodes in the tree (for debug) */ { /* Stack */ #define STACK_SIZE (SLN_TREE_DEPTH_MAX*SLN_TREE_ARITY_MAX) @@ -711,7 +714,6 @@ build_polylines(struct sln_tree* tree) size_t istack = 0; /* Progress */ - size_t nnodes_total = 0; size_t nnodes_processed = 0; int progress = 0; @@ -720,7 +722,8 @@ build_polylines(struct sln_tree* tree) size_t inode = 0; res_T res = RES_OK; - ASSERT(tree); + ASSERT(tree && nodes_count != 0); + ASSERT(root_index < darray_node_size_get(&tree->nodes)); build_leaf_polyline_scratch_init(tree->sln->allocator, &scratch); @@ -728,15 +731,13 @@ build_polylines(struct sln_tree* tree) #define NODE(Id) (darray_node_data_get(&tree->nodes) + (Id)) #define IS_LEAF(Id) (NODE(Id)->offset == 0) - nnodes_total = darray_node_size_get(&tree->nodes); - /* Print that nothing has been done yet */ INFO(tree->sln, LOG_MSG, progress); /* Push back SIZE_MAX which, once pop up, will mark the end of recursion */ stack[istack++] = SIZE_MAX; - inode = 0; /* Root node */ + inode = root_index; /* Root node */ while(inode != SIZE_MAX) { const size_t istack_saved = istack; @@ -794,17 +795,17 @@ build_polylines(struct sln_tree* tree) int pcent = 0; nnodes_processed += istack_saved - istack; - ASSERT(nnodes_processed <= nnodes_total); + ASSERT(nnodes_processed <= nodes_count); /* Print progress update */ - pcent = (int)((double)nnodes_processed*100.0/(double)nnodes_total+0.5); + pcent = (int)((double)nnodes_processed*100.0/(double)nodes_count+0.5); if(pcent/10 > progress/10) { progress = pcent; INFO(tree->sln, LOG_MSG, progress); } } } - ASSERT(nnodes_processed == nnodes_total); + ASSERT(nnodes_processed == nodes_count); #undef NODE #undef IS_LEAF @@ -819,7 +820,7 @@ error: } static res_T -partition_lines(struct sln_tree* tree) +partition_lines(struct sln_tree* tree, const size_t root_index) { /* Stack */ #define STACK_SIZE (SLN_TREE_DEPTH_MAX*(SLN_TREE_ARITY_MAX-1)) @@ -836,9 +837,7 @@ partition_lines(struct sln_tree* tree) res_T res = RES_OK; ASSERT(tree); /* Pre-condition */ - - SHTR(line_list_get_size(tree->args.lines, &nlines_total)); - nlines_processed = 0; + ASSERT(root_index < darray_node_size_get(&tree->nodes)); #define LOG_MSG "Partitioning: %3d%%\n" #define NODE(Id) (darray_node_data_get(&tree->nodes) + (Id)) @@ -847,11 +846,8 @@ partition_lines(struct sln_tree* tree) if(res != RES_OK) goto error; \ } (void)0 - CREATE_NODE; /* Alloc the root node */ - - /* Setup the root node */ - NODE(0)->range[0] = 0; - NODE(0)->range[1] = nlines_total - 1; + nlines_total = NODE(root_index)->range[1] - NODE(root_index)->range[0] + 1; + nlines_processed = 0; /* Push back SIZE_MAX which, once pop up, will mark the end of recursion */ stack[istack++] = SIZE_MAX; @@ -860,7 +856,7 @@ partition_lines(struct sln_tree* tree) * the calculation is nevertheless in progress */ INFO(tree->sln, LOG_MSG, progress); - inode = 0; /* Root node */ + inode = root_index; /* Root node */ while(inode != SIZE_MAX) { /* #lines into the node */ size_t nlines_node = NODE(inode)->range[1] - NODE(inode)->range[0] + 1; @@ -956,10 +952,207 @@ error: res_T tree_build(struct sln_tree* tree) { + struct sln_node node = SLN_NODE_NULL; + size_t nlines = 0; + size_t nnodes = 0; + size_t iroot = 0; res_T res = RES_OK; - if((res = partition_lines(tree)) != RES_OK) goto error; - if((res = build_polylines(tree)) != RES_OK) goto error; + SHTR(line_list_get_size(tree->args.lines, &nlines)); + + iroot = darray_node_size_get(&tree->nodes); + ASSERT(iroot == 0); /* assume that the tree contains no data */ + + /* Setup the root node */ + node.range[0] = 0; + node.range[1] = nlines - 1/* inclusive bound */; + if((res = darray_node_push_back(&tree->nodes, &node)) != RES_OK) goto error; + + if((res = partition_lines(tree, iroot)) != RES_OK) goto error; + + nnodes = darray_node_size_get(&tree->nodes); + if((res = build_polylines(tree, iroot, nnodes)) != RES_OK) goto error; + +exit: + return res; +error: + goto exit; +} + +res_T +tree_build2(struct sln_tree* tree) +{ + /* Static memory space of the queue */ + #define QUEUE_SIZE_MAX 256 + struct item { + struct list_node link; + size_t inode; + } items[QUEUE_SIZE_MAX]; + + size_t sub_trees[QUEUE_SIZE_MAX]; /* List of sub-tree root indices */ + #undef QUEUE_SIZE_MAX + + /* Linked lists for managing queue data */ + struct list_node free_items; + struct list_node queue; + struct list_node* it; + size_t nnodes = 0; /* Number of enqueud nodes */ + + /* Miscellaneous */ + size_t nlines_total = 0; + size_t nnodes_upper_tree = 0; /* #nodes from the root to the subtrees */ + size_t i = 0; + res_T res = RES_OK; + + ASSERT(tree); + + /* Macros to help in managing nodes */ + #define NODE(Id) (darray_node_data_get(&tree->nodes) + (Id)) + #define CREATE_NODE { \ + res = darray_node_push_back(&tree->nodes, &SLN_NODE_NULL); \ + if(res != RES_OK) goto error; \ + } (void)0 + + /* Helper macro for the queue data structure */ + #define ENQUEUE(INode) { \ + struct item* item__ = NULL; \ + ASSERT(!is_list_empty(&free_items)); \ + item__ = CONTAINER_OF(list_head(&free_items), struct item, link); \ + item__->inode = INode; \ + list_move_tail(&item__->link, &queue); \ + ++nnodes; \ + } + #define DEQUEUE \ + (list_move_tail(list_head(&queue), &free_items), \ + --nnodes, \ + CONTAINER_OF(list_tail(&free_items), struct item, link)->inode) + + /* Setup the linked lists */ + list_init(&free_items); + list_init(&queue); + FOR_EACH(i, 0, sizeof(items)/sizeof(items[0])) { + items[i].inode = SIZE_MAX; + list_init(&items[i].link); + list_add_tail(&free_items, &items[i].link); + } + + SHTR(line_list_get_size(tree->args.lines, &nlines_total)); + + /* Setup the root node */ + CREATE_NODE; /* Alloc the root node */ + NODE(0)->range[0] = 0; + NODE(0)->range[1] = nlines_total - 1; + + /* Register the root node in the queue */ + ENQUEUE(0); + + /* Breadth-first tree building */ + while(!is_list_empty(&queue)) { + size_t node_range[2] = {0,0}; + size_t nlines_node = 0; /* #lines in the node */ + size_t nlines_child_min = 0; /* Min #lines per child */ + size_t nlines_remain = 0; /* Lines to be distributed among the children */ + size_t nchildren = 0; /* #children of the node */ + size_t ichildren = 0; /* Index of the node's first child */ + size_t iline = 0; + size_t inode = 0; + + inode = DEQUEUE; /* Get the current node */ + + /* Retrieve the range of lines contained within the node */ + node_range[0] = NODE(inode)->range[0]; + node_range[1] = NODE(inode)->range[1]; + nlines_node = node_range[1] - node_range[0] + 1/*inclusive bound*/; + ASSERT(nlines_node > tree->args.leaf_nlines); + + /* Compute how the number of children that node has */ + nchildren = node_child_count(NODE(inode), tree->args.arity); + ASSERT(nchildren <= tree->args.arity); + + /* Check whether, once the child nodes have been queued, the total number of + * queued nodes would exceed the number of threads - or, in other words, + * whether the number of subtree to be constructed would exceed the number + * of threads available to construct them in parallel. If so, terminate this + * breadth first traversal and proceed to the parallel construction of the + * subtrees already queued so that there is at least one thread available + * per subtree to be built */ + if(nnodes + nchildren > tree->args.nthreads_hint) { + ENQUEUE(inode); /* Do not forget to enqueue the current node */ + break; + } + + /* Calculate the index of the first child */ + ichildren = darray_node_size_get(&tree->nodes); + ASSERT(ichildren > inode); + + /* Define the offset from the current node to its children */ + NODE(inode)->offset = ui64_to_ui32((uint64_t)(ichildren - inode)); + + /* Calculate the minimum number of lines per child and the number of + * remaining lines to be distributed equally among them */ + nlines_child_min = nlines_node / nchildren; + nlines_remain = nlines_node % nchildren; + + iline = node_range[0]; + FOR_EACH(i, 0, nchildren) { + /* Compute the number of lines per child. Start by assigning the minimum + * number of lines to each child, then distribute the remaining lines + * among the first children */ + size_t nlines_child = nlines_child_min + (i < nlines_remain); + + CREATE_NODE; + + /* Set the range of lines line for the newly created child. Note that + * the boundaries of the range are inclusive, which is why 1 is + * subtracted to the upper bound */ + NODE(ichildren+i)->range[0] = iline; + NODE(ichildren+i)->range[1] = iline + nlines_child - 1/*inclusive bound*/; + iline += nlines_child; + + /* Check that the child's lines are a subset of the parent's lines */ + ASSERT(NODE(ichildren+i)->range[0] >= node_range[0]); + ASSERT(NODE(ichildren+i)->range[1] <= node_range[1]); + + ENQUEUE(ichildren+i); /* Register the child node */ + } + } + + ASSERT(!is_list_empty(&queue)); + + /* Retrieves the number of registered nodes. This is the number of nodes from + * the root to the subtrees, excluding the roots of those subtrees */ + nnodes_upper_tree = darray_node_size_get(&tree->nodes); + nnodes_upper_tree -= nnodes; /* Exclude the root node of the subtrees */ + + /* Configure the list of subtree structures to be built in parallel */ + i = 0; + LIST_FOR_EACH(it, &queue) { + sub_trees[i++] = CONTAINER_OF(it, struct item, link)->inode; + } + + /* This will the loop to parallelize, with a chunk size of 1 */ + for(i=0; i < nnodes; ++i) { + size_t nnodes_subtree = darray_node_size_get(&tree->nodes); + + res = partition_lines(tree, sub_trees[i]); + if(res != RES_OK) goto error; + + nnodes_subtree = darray_node_size_get(&tree->nodes) - nnodes_subtree; + nnodes_subtree += 1; /* Do not forget the root node */ + res = build_polylines(tree, sub_trees[i], nnodes_subtree); + if(res != RES_OK) goto error; + } + + /* Build the polylines of the upper level if necessary */ + if(nnodes_upper_tree != 0) { + res = build_polylines(tree, 0/*root*/, nnodes_upper_tree); + if(res != RES_OK) goto error; + } + + #undef NODE + #undef CREATE_NODE + #undef ENQUEUE + #undef DEQUEUE exit: return res; diff --git a/src/sln_tree_c.h b/src/sln_tree_c.h @@ -70,6 +70,10 @@ extern LOCAL_SYM res_T tree_build (struct sln_tree* tree); +extern LOCAL_SYM res_T +tree_build2 + (struct sln_tree* tree); + /* Assume that the node is an internal node */ extern LOCAL_SYM unsigned node_child_count