performance.bib

@inproceedings{Fontenaille2018scalable,
  title = {Scalable Work-Stealing Load-Balancer for HPC Distributed Memory Systems},
  author = {Clement Fontenaille and Eric Petit and Pablo de Oliveira Castro and Seijilo Uemura and Devan Sohier and Piotr Lesnicki and Ghislain Lartigue and Vincent Moureau},
  booktitle = {COLOC: 2nd Workshop on Data Locality, in conjunction with Euro-Par 2018},
  year = {2018}
}
@article{Popov2017piecewise,
  author = {Popov, Mihail and Akel, Chadi and Chatelain, Yohan and Jalby, William and de Oliveira Castro, Pablo},
  title = {Piecewise holistic autotuning of parallel programs with CERE},
  journal = {Concurrency and Computation: Practice and Experience},
  year = {2017},
  issn = {1532-0634},
  url = {http://dx.doi.org/10.1002/cpe.4190},
  doi = {10.1002/cpe.4190},
  pages = {e4190},
  abstract = {Current architecture complexity requires fine tuning of compiler and runtime parameters to achieve best performance. Autotuning substantially improves default parameters in many scenarios, but it is a costly process requiring long iterative evaluations. We propose an automatic piecewise autotuner based on CERE (Codelet Extractor and REplayer). CERE decomposes applications into small pieces called codelets: Each codelet maps to a loop or to an OpenMP parallel region and can be replayed as a standalone program. Codelet autotuning achieves better speedups at a lower tuning cost. By grouping codelet invocations with the same performance behavior, CERE reduces the number of loops or OpenMP regions to be evaluated. Moreover, unlike whole-program tuning, CERE customizes the set of best parameters for each specific OpenMP region or loop. We demonstrate the CERE tuning of compiler optimizations, number of threads, thread affinity, and scheduling policy on both nonuniform memory access and heterogeneous architectures. Over the NAS benchmarks, we achieve an average speedup of 1.08x after tuning. Tuning a codelet is 13x cheaper than whole-program evaluation and predicts the tuning impact with a 94.7\% accuracy. Similarly, exploring thread configurations and scheduling policies for a Black‐Scholes solver on an heterogeneous big.LITTLE architecture is over 40x faster using CERE.},
  documenturl = {https://hal-uvsq.archives-ouvertes.fr/hal-01542912/document}
}
@inproceedings{Popov2016piecewise,
  title = {Piecewise Holistic Autotuning of Compiler and Runtime Parameters},
  author = {Popov, Mihail and Akel, Chadi and Jalby, William and de Oliveira Castro, Pablo},
  booktitle = {Euro-Par 2016 Parallel Processing - 22nd International Conference},
  year = {2016},
  pages = {238--250},
  doi = {10.1007/978-3-319-43659-3_18},
  editor = {Pierre-François Dutot and Denis Trystram},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  volume = {9833},
  isbn = {978-3-319-43659-3},
  abstract = {Current architecture complexity requires fine tuning of compiler and runtime parameters to achieve full potential performance.  Autotuning substantially improves default parameters in many scenarios but it is a costly process requiring a long iterative evaluation.  We propose an automatic piecewise autotuner based on CERE (Codelet Extractor and REplayer). CERE decomposes applications into small pieces called codelets: each codelet maps to a loop or to an OpenMP parallel region and can be replayed as a standalone program.  Codelet autotuning achieves better speedups at a lower tuning cost. By grouping codelet invocations with the same performance behavior, CERE reduces the number of loops or OpenMP regions to be evaluated. Moreover unlike whole-program tuning, CERE customizes the set of best parameters for each specific OpenMP region or loop.  We demonstrate CERE tuning of compiler optimizations, number of threads and thread affinity on a NUMA architecture. On average over the NAS 3.0 benchmarks, we achieve a speedup of 1.08x after tuning.  Tuning a single codelet is 13x cheaper than whole-program evaluation and estimates the tuning impact on the original region with a 94.7% accuracy. On a Reverse Time Migration (RTM) proto-application we achieve a 1.11x speedup with a 200x cheaper exploration.},
  pdf = {europar16.pdf},
  documenturl = {europar16-slides.pdf}
}
@inproceedings{Popov2015pcere,
  title = {PCERE: Fine-grained Parallel Benchmark Decomposition for Scalability Prediction},
  author = {Popov, Mihail and Akel, Chadi and Conti, Florent and Jalby, William and de Oliveira Castro, Pablo},
  booktitle = {2015 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  pages = {1151--1160},
  year = {2015},
  organization = {IEEE},
  abstract = {
                  Evaluating the strong scalability of OpenMP applications is a costly and time-consuming process. It traditionally requires executing the whole application multiple times with different number of threads. We propose the Parallel Codelet Extractor and REplayer (PCERE), a tool to reduce the cost of scalability evaluation. PCERE decomposes applications into small pieces called codelets: each codelet maps to an OpenMP parallel region and can be replayed as a standalone program. To accelerate scalability prediction, PCERE replays codelets while varying the number of threads. Prediction speedup comes from two key ideas. First, the number of invocations during replay can be significantly reduced. Invocations that have the same performance are grouped together and a single representative is replayed. Second, sequential parts of the programs do not need to be replayed for each different thread configuration. PCERE codelets can be captured once and replayed accurately on multiple architectures, enabling cross-architecture parallel performance prediction. We evaluate PCERE on a C version of the NAS 3.0 Parallel Benchmarks (NPB). We achieve an average speed-up of 25 times on evaluating OpenMP applications scalability with an average error of 4.9\% (median error of 1.7\%).
                  },
  pdf = {pcere15.pdf},
  documenturl = {pcere15-slides.pdf}
}
@article{Oliveira2015CERE,
  title = {{CERE: LLVM Based Codelet Extractor and REplayer for Piecewise Benchmarking and Optimization}},
  author = {de Oliveira Castro, Pablo and Akel, Chadi and Petit, Eric and Popov, Mihail and Jalby, William},
  journal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  volume = {12},
  number = {1},
  pages = {6},
  year = {2015},
  publisher = {ACM},
  doi = {10.1145/2724717},
  abstract = {This article presents Codelet Extractor and REplayer (CERE), an
               open-source framework for code isolation. CERE finds
               and extracts the hotspots of an application as
               isolated fragments of code, called
               codelets. Codelets can be modified, compiled, run,
               and measured independently from the original
               application. Code isolation reduces benchmarking
               cost and allows piecewise optimization of an
               application. Unlike previous approaches, CERE
               isolates codes at the compiler Intermediate
               Representation (IR) level. Therefore CERE is
               language agnostic and supports many input languages
               such as C, C++, Fortran, and D. CERE automatically
               detects codelets invocations that have the same
               performance behavior. Then, it selects a reduced set
               of representative codelets and invocations, much
               faster to replay, which still captures accurately
               the original application. In addition, CERE supports
               recompiling and retargeting the extracted
               codelets. Therefore, CERE can be used for
               cross-architecture performance prediction or
               piecewise code optimization. On the SPEC 2006 FP
               benchmarks, CERE codelets cover 90.9\% and accurately
               replay 66.3\% of the execution time. We use CERE
               codelets in a realistic study to evaluate three
               different architectures on the NAS benchmarks. CERE
               accurately estimates each architecture performance
               and is 7.3x to 46.6x cheaper than running the full
               benchmark.  },
  pdf = {cere15.pdf}
}
@inproceedings{Oliveira2014finegrained,
  title = {{Fine-grained Benchmark Subsetting for System Selection}},
  author = {de Oliveira Castro, Pablo and Kashnikov, Yuriy and Akel, Chadi and Popov, Mihail and Jalby, William},
  booktitle = {Proceedings of Annual IEEE/ACM International Symposium on Code Generation and Optimization},
  series = {CGO '14},
  year = {2014},
  isbn = {978-1-4503-2670-4},
  location = {Orlando, FL, USA},
  pages = {132:132--132:142},
  numpages = {11},
  url = {http://doi.acm.org/10.1145/2544137.2544144},
  doi = {10.1145/2544137.2544144},
  publisher = {ACM},
  address = {New York, NY, USA},
  abstract = {System selection aims at finding the best architecture for a set of
                 programs and workloads. It traditionally requires long running
                 benchmarks. We propose a method to reduce the cost of system
                 selection. We break down benchmarks into elementary fragments of
                 source code, called codelets. Then, we identify two causes of
                 redundancy: first, similar codelets; second, codelets called
                 repeatedly. The key idea is to minimize redundancy inside the
                 benchmark suite to speed it up. For each group of similar codelets,
                 only one representative is kept. For codelets called repeatedly and for
                 which the performance does not vary across calls, the number of
                 invocations is reduced. Given an initial benchmark suite, our
                 method produces a set of reduced benchmarks that can be used in
                 place of the original one for system selection.
                 We evaluate our method on the NAS SER benchmarks, producing a reduced
                  benchmark suite 30 times faster on average than the original suite,
                 with a maximum of 44 times. The reduced suite predicts the execution
                 time on three target architectures with a median error between 3.9\%
                 and 8\%.  },
  pdf = {finegrained-cgo14.pdf},
  documenturl = {finegrained-slides.pdf}
}
@article{Oliveira2013Adaptive,
  title = {Adaptive Sampling for Performance Characterization of Application Kernels},
  author = {de Oliveira Castro, Pablo and Petit, Eric and Farjallah, Asma and Jalby, William},
  journal = {Concurrency and Computation: Practice and Experience},
  year = {2013},
  publisher = {Wiley},
  issn = {1532-0634},
  doi = {10.1002/cpe.3097},
  keywords = {performance, sampling, modeling, stencil},
  abstract = {Characterizing performance is essential to optimize programs and architectures.
               The open source Adaptive Sampling Kit (ASK) measures the performance
               trade-off in large design spaces. Exhaustively sampling all sets of
               parameters is computationally intractable. Therefore, ASK concentrates
               exploration in the most irregular regions of the design space through
               multiple adaptive sampling strategies. The paper presents the ASK
               architecture and a set of adaptive sampling strategies, including a new
               approach called Hierarchical Variance Sampling. ASK's usage is demonstrated
               on three performance characterization problems: memory stride accesses,
               Jacobian stencil code, and an industrial seismic application using 3D stencils.
               ASK builds accurate models of performance with a small number of measures.
               It considerably reduces the cost of performance exploration. For instance,
               the Jacobian stencil code design space, which has more than 31 × 10^8
               combinations of parameters, is accurately predicted using only 1500
               combinations.},
  pdf = {ASK-cpe13.pdf}
}
@inproceedings{Akel2013sourcecode,
  title = {{Is Source-code Isolation Viable for Performance Characterization?}},
  author = {Akel, Chadi and Kashnikov, Yuriy and de Oliveira Castro, Pablo and Jalby, William},
  booktitle = {International Workshop on Parallel Software Tools and Tool Infrastructures (PSTI)},
  year = {2013},
  publisher = {IEEE Computer Society},
  abstract = {Source-code isolation finds and extracts the hotspots of an application as
                 independent isolated fragments of code, called codelets. Codelets can be
                 modified, compiled, run, and measured independently from the original
                 application. Source-code isolation reduces benchmarking cost and allows
                 piece-wise optimization of an application. Source-code isolation is faster
                 than whole-program benchmarking and optimization since the user can
                 concentrate only on the bottlenecks. This paper examines the viability of
                 using isolated codelets in place of the original application for
                 performance characterization and optimization. On the NAS benchmarks, we
                 show that codelets capture 92.3\% of the original execution time. We present
                 a set of techniques for keeping codelets as faithful as possible to the
                 original hotspots: 63.6\% of the codelets have the same assembly as the
                 original hotspots and 81.6\% of the codelets have the same run time
                 performance as the original hotspots.},
  pdf = {psti13.pdf},
  documenturl = {psti13-slides.pdf}
}
@inproceedings{Kashnikov2013evaluating,
  title = {{Evaluating Architecture and Compiler Design through Static Loop Analysis}},
  author = {Kashnikov, Yuriy and de Oliveira Castro, Pablo and Oseret, Emmanuel and Jalby, William},
  booktitle = {2013 International Conference on High Performance Computing and Simulation (HPCS)},
  pages = {535--544},
  doi = {10.1109/HPCSim.2013.6641465},
  isbn = {978-1-4799-0836-3},
  year = {2013},
  publisher = {IEEE Computer Society},
  abstract = {Using the MAQAO loop static analyzer, we characterize a corpus of binary
               loops extracted from common benchmark suites such as SPEC, NAS, etc.
               and several industrial applications. For each loop, MAQAO extracts
               low-level assembly features such as: integer and floating-point
               vectorization ratio, number of registers used and spill-fill, number
               of concurrent memory streams accessed, etc. The distributions of
               these features on a large representative code corpus can be used to
               evaluate compilers and architectures and tune them for the most
               frequently used assembly patterns. In this paper, we present the
               MAQAO loop analyzer and a characterization of the 4857 binary loops.
               We evaluate register allocation and vectorization on two compilers
               and propose a method to tune loop buffer size and stream prefetcher
               based on static analysis of benchmarks.},
  pdf = {hpcs13.pdf}
}
@inproceedings{Oliveira2012ASK,
  title = {{ASK: Adaptive Sampling Kit for Performance Characterization}},
  author = {de Oliveira Castro, Pablo and Petit, Eric and Beyler, Jean Christophe and Jalby, William},
  year = {2012},
  pages = {89--101},
  doi = {10.1007/978-3-642-32820-6_11},
  editor = {Christos Kaklamanis and Theodore S. Papatheodorou and Paul G. Spirakis},
  booktitle = {Euro-Par 2012 Parallel Processing - 18th International Conference},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  volume = {7484},
  isbn = {978-3-642-32819-0},
  abstract = {Characterizing performance is essential to optimize programs
                 and architectures. The open source Adaptive Sampling Kit (ASK) measures
                 the performance trade-offs in large design spaces. Exhaustively
                 sampling all points is computationally intractable. Therefore, ASK
                 concentrates exploration in the most irregular regions of the design space
                 through multiple adaptive sampling methods. The paper presents the
                 ASK architecture and a set of adaptive sampling strategies, including a
                 new approach: Hierarchical Variance Sampling. ASK’s usage is demonstrated
                 on two performance characterization problems: memory stride
                 accesses and stencil codes. ASK builds precise models of performance
                 with a small number of measures. It considerably reduces the cost of
                 performance exploration. For instance, the stencil code design space,
                 which has more than 31.10^8 points, is accurately predicted using only
                 1500 points.},
  pdf = {ASK-europar12.pdf},
  documenturl = {ASK-europar12-slides.pdf}
}
@inproceedings{Petit2012computing,
  title = {Computing-Kernels Performance Prediction Using DataFlow Analysis and Microbenchmarking},
  author = {Petit, Eric and de Oliveira Castro, Pablo and Menour, Tarek and Krammer, Bettina and Jalby, William},
  booktitle = {International Workshop on Compilers for Parallel Computers},
  year = {2012}
}