performance.bib

@article{Popov2017piecewise,
  author = {Popov, Mihail and Akel, Chadi and Chatelain, Yohan and Jalby, William and de Oliveira Castro, Pablo},
  title = {Piecewise holistic autotuning of parallel programs with CERE},
  journal = {Concurrency and Computation: Practice and Experience},
  year = {2017},
  issn = {1532-0634},
  url = {http://dx.doi.org/10.1002/cpe.4190},
  doi = {10.1002/cpe.4190},
  pages = {e4190}
}
@inproceedings{Popov2016piecewise,
  title = {Piecewise Holistic Autotuning of Compiler and Runtime Parameters},
  author = {Popov, Mihail and Akel, Chadi and Jalby, William and de Oliveira Castro, Pablo},
  booktitle = {Euro-Par 2016 Parallel Processing - 22nd International Conference},
  year = {2016},
  pages = {238--250},
  url = {http://dx.doi.org/10.1007/978-3-319-43659-3_18},
  doi = {10.1007/978-3-319-43659-3_18},
  editor = {Pierre-Fran{\c{c}}ois Dutot and Denis Trystram},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  volume = {9833},
  isbn = {978-3-319-43659-3},
  abstract = {Current architecture complexity requires fine tuning of compiler and runtime parameters to achieve full potential performance.  Autotuning substantially improves default parameters in many scenarios but it is a costly process requiring a long iterative evaluation.  We propose an automatic piecewise autotuner based on CERE (Codelet Extractor and REplayer). CERE decomposes applications into small pieces called codelets: each codelet maps to a loop or to an OpenMP parallel region and can be replayed as a standalone program.  Codelet autotuning achieves better speedups at a lower tuning cost. By grouping codelet invocations with the same performance behavior, CERE reduces the number of loops or OpenMP regions to be evaluated. Moreover unlike whole-program tuning, CERE customizes the set of best parameters for each specific OpenMP region or loop.  We demonstrate CERE tuning of compiler optimizations, number of threads and thread affinity on a NUMA architecture. On average over the NAS 3.0 benchmarks, we achieve a speedup of 1.08x after tuning.  Tuning a single codelet is 13x cheaper than whole-program evaluation and estimates the tuning impact on the original region with a 94.7% accuracy. On a Reverse Time Migration (RTM) proto-application we achieve a 1.11x speedup with a 200x cheaper exploration.},
  pdf = {europar16.pdf},
  documenturl = {europar16-slides.pdf}
}
@inproceedings{Denis2016verificarlo,
  author = {Christophe Denis and
             Pablo de Oliveira Castro and
             Eric Petit},
  title = {Verificarlo: Checking Floating Point Accuracy through Monte Carlo
             Arithmetic},
  booktitle = {23rd {IEEE} Symposium on Computer Arithmetic, {ARITH} 2016, Silicon
             Valley, CA, USA, July 10-13, 2016},
  pages = {55--62},
  year = {2016},
  url = {http://dx.doi.org/10.1109/ARITH.2016.31},
  doi = {10.1109/ARITH.2016.31},
  abstract = {Numerical accuracy of floating point computation is a well studied topic which has not made its way to the end-user in scientific computing. Yet, it has become a critical issue with the recent requirements for code modernization to harness new highly parallel hardware and perform higher resolution computation. To democratize numerical accuracy analysis, it is important to propose tools and methodologies to study large use cases in a reliable and automatic way. In this paper, we propose verificarlo, an extension to the LLVM compiler to automatically use Monte Carlo Arithmetic in a transparent way for the end-user. It supports all the major languages including C, C++, and Fortran. Unlike source-to-source approaches, our implementation captures the influence of compiler optimizations on the numerical accuracy. We illustrate how Monte Carlo Arithmetic using the verificarlo tool outperforms the existing approaches on various use cases and is a step toward automatic numerical analysis.},
  pdf = {https://hal.archives-ouvertes.fr/hal-01192668/file/verificarlo-preprint.pdf}
}
@inproceedings{Popov2015pcere,
  title = {PCERE: Fine-grained Parallel Benchmark Decomposition for Scalability Prediction},
  author = {Popov, Mihail and Akel, Chadi and Conti, Florent and Jalby, William and de Oliveira Castro, Pablo},
  booktitle = {2015 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  pages = {1151--1160},
  year = {2015},
  organization = {IEEE},
  abstract = {
Evaluating the strong scalability of OpenMP applications is a costly and time-consuming process. It traditionally requires executing the whole application multiple times with different number of threads. We propose the Parallel Codelet Extractor and REplayer (PCERE), a tool to reduce the cost of scalability evaluation. PCERE decomposes applications into small pieces called codelets: each codelet maps to an OpenMP parallel region and can be replayed as a standalone program. To accelerate scalability prediction, PCERE replays codelets while varying the number of threads. Prediction speedup comes from two key ideas. First, the number of invocations during replay can be significantly reduced. Invocations that have the same performance are grouped together and a single representative is replayed. Second, sequential parts of the programs do not need to be replayed for each different thread configuration. PCERE codelets can be captured once and replayed accurately on multiple architectures, enabling cross-architecture parallel performance prediction. We evaluate PCERE on a C version of the NAS 3.0 Parallel Benchmarks (NPB). We achieve an average speed-up of 25 times on evaluating OpenMP applications scalability with an average error of 4.9\% (median error of 1.7\%).
    },
  pdf = {pcere15.pdf},
  documenturl = {pcere15-slides.pdf}
}
@article{Oliveira2015CERE,
  title = {{CERE: LLVM Based Codelet Extractor and REplayer for Piecewise Benchmarking and Optimization}},
  author = {de Oliveira Castro, Pablo and Akel, Chadi and Petit, Eric and Popov, Mihail and Jalby, William},
  journal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  volume = {12},
  number = {1},
  pages = {6},
  year = {2015},
  publisher = {ACM},
  doi = {10.1145/2724717},
  abstract = {
        This article presents Codelet Extractor and REplayer (CERE), an
        open-source framework for code isolation. CERE finds
        and extracts the hotspots of an application as
        isolated fragments of code, called
        codelets. Codelets can be modified, compiled, run,
        and measured independently from the original
        application. Code isolation reduces benchmarking
        cost and allows piecewise optimization of an
        application. Unlike previous approaches, CERE
        isolates codes at the compiler Intermediate
        Representation (IR) level. Therefore CERE is
        language agnostic and supports many input languages
        such as C, C++, Fortran, and D. CERE automatically
        detects codelets invocations that have the same
        performance behavior. Then, it selects a reduced set
        of representative codelets and invocations, much
        faster to replay, which still captures accurately
        the original application. In addition, CERE supports
        recompiling and retargeting the extracted
        codelets. Therefore, CERE can be used for
        cross-architecture performance prediction or
        piecewise code optimization. On the SPEC 2006 FP
        benchmarks, CERE codelets cover 90.9\% and accurately
        replay 66.3\% of the execution time. We use CERE
        codelets in a realistic study to evaluate three
        different architectures on the NAS benchmarks. CERE
        accurately estimates each architecture performance
        and is 7.3x to 46.6x cheaper than running the full
        benchmark.
    },
  pdf = {cere15.pdf}
}
@inproceedings{Oliveira2014finegrained,
  title = {{Fine-grained Benchmark Subsetting for System Selection}},
  author = {de Oliveira Castro, Pablo and Kashnikov, Yuriy and Akel, Chadi and Popov, Mihail and Jalby, William},
  booktitle = {Proceedings of Annual IEEE/ACM International Symposium on Code Generation and Optimization},
  series = {CGO '14},
  year = {2014},
  isbn = {978-1-4503-2670-4},
  location = {Orlando, FL, USA},
  pages = {132:132--132:142},
  numpages = {11},
  url = {http://doi.acm.org/10.1145/2544137.2544144},
  doi = {10.1145/2544137.2544144},
  publisher = {ACM},
  address = {New York, NY, USA},
  abstract = {
        System selection aims at finding the best architecture for a set of
        programs and workloads. It traditionally requires long running
        benchmarks. We propose a method to reduce the cost of system
        selection. We break down benchmarks into elementary fragments of
        source code, called codelets. Then, we identify two causes of
        redundancy: first, similar codelets; second, codelets called
        repeatedly. The key idea is to minimize redundancy inside the
        benchmark suite to speed it up. For each group of similar codelets,
        only one representative is kept. For codelets called repeatedly and for
        which the performance does not vary across calls, the number of
        invocations is reduced. Given an initial benchmark suite, our
        method produces a set of reduced benchmarks that can be used in
        place of the original one for system selection.

        We evaluate our method on the NAS SER benchmarks, producing a reduced
        benchmark suite 30 times faster in average than the original suite,
        with a maximum of 44 times. The reduced suite predicts the execution
        time on three target architectures with a median error between 3.9\%
        and 8\%.
    },
  pdf = {finegrained-cgo14.pdf},
  documenturl = {finegrained-slides.pdf}
}
@article{Oliveira2013Adaptive,
  title = {Adaptive Sampling for Performance Characterization of Application Kernels},
  author = {de Oliveira Castro, Pablo and Petit, Eric and Farjallah, Asma and Jalby, William},
  journal = {Concurrency and Computation: Practice and Experience},
  year = {2013},
  publisher = {Wiley},
  issn = {1532-0634},
  doi = {10.1002/cpe.3097},
  keywords = {performance, sampling, modeling, stencil},
  abstract = {

    Characterizing performance is essential to optimize programs and architectures.
    The open source Adaptive Sampling Kit (ASK) measures the performance
    trade-off in large design spaces. Exhaustively sampling all sets of
    parameters is computationally intractable. Therefore, ASK concentrates
    exploration in the most irregular regions of the design space through
    multiple adaptive sampling strategies. The paper presents the ASK
    architecture and a set of adaptive sampling strategies, including a new
    approach called Hierarchical Variance Sampling. ASK's usage is demonstrated
    on three performance characterization problems: memory stride accesses,
    Jacobian stencil code, and an industrial seismic application using 3D stencils.
    ASK builds accurate models of performance with a small number of measures.
    It considerably reduces the cost of performance exploration. For instance,
    the Jacobian stencil code design space, which has more than 31 × 10^8
    combinations of parameters, is accurately predicted using only 1500
    combinations.

    },
  pdf = {ASK-cpe13.pdf}
}
@inproceedings{Akel2013sourcecode,
  title = {{Is Source-code Isolation Viable for Performance Characterization?}},
  author = {Akel, Chadi and Kashnikov, Yuriy and de Oliveira Castro, Pablo and Jalby, William},
  booktitle = {International Workshop on Parallel Software Tools and Tool Infrastructures (PSTI)},
  year = {2013},
  publisher = {IEEE Computer Society},
  abstract = {

Source-code isolation finds and extracts the hotspots of an application as
independent isolated fragments of code, called codelets. Codelets can be
modified, compiled, run, and measured independently from the original
application. Source-code isolation reduces benchmarking cost and allows
piece-wise optimization of an application. Source-code isolation is faster
than whole-program benchmarking and optimization since the user can
concentrate only on the bottlenecks. This paper examines the viability of
using isolated codelets in place of the original application for
performance characterization and optimization. On the NAS benchmarks, we
show that codelets capture 92.3\% of the original execution time. We present
a set of techniques for keeping codelets as faithful as possible to the
original hotspots: 63.6\% of the codelets have the same assembly as the
original hotspots and 81.6\% of the codelets have the same run time
performance as the original hotspots.

},
  pdf = {psti13.pdf},
  documenturl = {psti13-slides.pdf}
}
@inproceedings{Kashnikov2013evaluating,
  title = {{Evaluating Architecture and Compiler Design through Static Loop Analysis}},
  author = {Kashnikov, Yuriy and de Oliveira Castro, Pablo and Oseret, Emmanuel and Jalby, William},
  booktitle = {2013 International Conference on High Performance Computing and Simulation (HPCS)},
  pages = {535--544},
  doi = {10.1109/HPCSim.2013.6641465},
  isbn = {978-1-4799-0836-3},
  year = {2013},
  publisher = {IEEE Computer Society},
  abstract = {
      Using the MAQAO loop static analyzer, we characterize a corpus of binary
          loops extracted from common benchmark suites such as SPEC, NAS, etc.
          and several industrial applications. For each loop, MAQAO extracts
          low-level assembly features such as: integer and floating-point
          vectorization ratio, number of registers used and spill-fill, number
          of concurrent memory streams accessed, etc. The distributions of
          these features on a large representative code corpus can be used to
          evaluate compilers and architectures and tune them for the most
          frequently used assembly patterns. In this paper, we present the
          MAQAO loop analyzer and a characterization of the 4857 binary loops.
          We evaluate register allocation and vectorization on two compilers
          and propose a method to tune loop buffer size and stream prefetcher
          based on static analysis of benchmarks.
  },
  pdf = {hpcs13.pdf}
}
@inproceedings{Oliveira2012ASK,
  title = {{ASK: Adaptive Sampling Kit for Performance Characterization}},
  author = {de Oliveira Castro, Pablo and Petit, Eric and Beyler, Jean Christophe and Jalby, William},
  year = {2012},
  pages = {89-101},
  url = {http://dx.doi.org/10.1007/978-3-642-32820-6_11},
  doi = {10.1007/978-3-642-32820-6_11},
  editor = {Christos Kaklamanis and Theodore S. Papatheodorou and Paul G. Spirakis},
  booktitle = {Euro-Par 2012 Parallel Processing - 18th International Conference},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  volume = {7484},
  isbn = {978-3-642-32819-0},
  abstract = {
    Characterizing performance is essential to optimize programs
    and architectures. The open source Adaptive Sampling Kit (ASK) measures
    the performance trade-offs in large design spaces. Exhaustively
    sampling all points is computationally intractable. Therefore, ASK
    concentrates exploration in the most irregular regions of the design space
    through multiple adaptive sampling methods. The paper presents the
    ASK architecture and a set of adaptive sampling strategies, including a
    new approach: Hierarchical Variance Sampling. ASK’s usage is demonstrated
    on two performance characterization problems: memory stride
    accesses and stencil codes. ASK builds precise models of performance
    with a small number of measures. It considerably reduces the cost of
    performance exploration. For instance, the stencil code design space,
    which has more than 31 × 10^8 points, is accurately predicted using only
    1500 points.
    },
  pdf = {ASK-europar12.pdf},
  documenturl = {ASK-europar12-slides.pdf}
}
@inproceedings{Petit2012computing,
  title = {Computing-Kernels Performance Prediction Using DataFlow Analysis and Microbenchmarking},
  author = {Petit, Eric and de Oliveira Castro, Pablo and Menour, Tarek and Krammer, Bettina and Jalby, William},
  booktitle = {International Workshop on Compilers for Parallel Computers},
  year = {2012}
}