@inproceedings {497, title = {HALO 1.0: A Hardware-agnostic Accelerator Orchestration Framework for Enabling Hardware-agnostic Programming with True Performance Portability for Heterogeneous HPC}, year = {Under Review}, abstract = {

Hardware-agnostic programming with high performance portability will be the bedrock for realizing the ubiquitous adoption of emerging accelerator technologies in future heterogeneous high-performance computing (HPC) systems, which is the key to achieving the next level of HPC performance on an expanding accelerator landscape. In this paper, we present HALO 1.0, an open-ended, extensible multi-agent software framework that implements a set of proposed hardware-agnostic accelerator orchestration (HALO) principles and a novel compute-centric message passing interface (C2MPI) specification for enabling the portable and performance-optimized execution of hardware-agnostic application codes across heterogeneous accelerator resources. Experimental results from evaluating eight widely used HPC subroutines on Intel Xeon E5-2620 v4 CPUs, Intel Arria 10 GX FPGAs, and NVIDIA GeForce RTX 2080 Ti GPUs show that HALO 1.0 allows the same hardware-agnostic application codes of the HPC kernels, without any change, to run across all the computing devices with a consistent maximum performance portability score of 1.0, which is 2x-861,883x higher than that of the OpenCL-based solution, whose performance portability score is unstable and low.

}, keywords = {psclab}, url = {https://arxiv.org/pdf/2011.10896.pdf}, author = {Michael Riera and Erfan Bank-Tavakoli and Masudul Hassan Quraishi and Fengbo Ren} } @inproceedings {BordeauxFrance, title = {A Data-Driven Approach for Automated Integrated Circuit Segmentation of Scan Electron Microscopy Images}, year = {2022}, month = {10/2022}, address = {Bordeaux, France}, abstract = {

This paper proposes an automated, data-driven integrated circuit segmentation approach for scan electron microscopy (SEM) images, inspired by state-of-the-art CNN-based image perception methods. Based on requirements derived from real industry applications, our approach employs wire segmentation and via detection algorithms to generate integrated circuit segmentation maps from SEM images. On SEM images collected from industrial applications, our method achieves an average Electrically Significant Difference (ESD) of 50.71 in the wire segmentation task and a 99.05\% F1 score in the via detection task, which represents improvements of about 85\% and 8\% over the reference method, respectively.

}, keywords = {psclab}, author = {Zifan Yu and Bruno Machado Trindade and Michael Green and Zhikang Zhang and Pullela Sneha and Erfan Bank-Tavakoli and Christopher Pawlowicz and Fengbo Ren} } @inproceedings {VirtualEvent, title = {FSCHOL: An OpenCL-based HPC Framework for Accelerating Sparse Cholesky Factorization on FPGAs}, year = {2021}, month = {10/2021}, pages = {209-220}, address = {Virtual Event}, abstract = {

The proposed FSCHOL framework consists of an FPGA kernel and a host program. The FPGA kernel implements a throughput-optimized hardware architecture for accelerating the supernodal multifrontal algorithm for sparse Cholesky factorization. The host program implements a novel scheduling algorithm that finds the optimal execution order of supernode computations of an elimination tree on the FPGA to eliminate the need for off-chip memory access for storing intermediate results. Moreover, the proposed scheduling algorithm minimizes the on-chip memory requirement for buffering intermediate results by resolving the dependency of parent nodes in an elimination tree through temporal parallelism. Experimental results for factorizing a set of sparse matrices of various sizes from the SuiteSparse Matrix Collection show that the proposed FSCHOL implemented on an Intel Stratix 10 GX FPGA development board achieves on average 5.5$\times$ and 9.7$\times$ higher performance and 10.3$\times$ and 24.7$\times$ lower energy consumption than implementations of CHOLMOD on an Intel Xeon E5-2637 CPU and an NVIDIA V100 GPU, respectively.

}, keywords = {psclab}, doi = {10.1109/SBAC-PAD53543.2021.00032}, author = {Erfan Bank-Tavakoli and Michael Riera and Masudul Hassan Quraishi and Fengbo Ren} } @article {498, title = {A Survey of System Architectures and Techniques for FPGA Virtualization}, journal = { IEEE Transactions on Parallel and Distributed Systems}, volume = {32}, year = {2021}, month = {09/2021}, pages = {2216-2230}, abstract = {

FPGA accelerators are gaining increasing attention in both cloud and edge computing because of their hardware flexibility, high computational throughput, and low power consumption. However, the design flow of FPGAs often requires specific knowledge of the underlying hardware, which hinders the wide adoption of FPGAs by application developers. Therefore, the virtualization of FPGAs becomes extremely important to create a useful abstraction of the hardware suitable for application developers. Such abstraction also enables the sharing of FPGA resources among multiple users and accelerator applications, which is important because, traditionally, FPGAs have been mostly used in single-user, single-embedded-application scenarios. There are many works in the field of FPGA virtualization covering different aspects and targeting different application areas. In this survey, we review the system architectures used in the literature for FPGA virtualization. In addition, we identify the primary objectives of FPGA virtualization, based on which we summarize the techniques for realizing FPGA virtualization. This survey helps researchers to efficiently learn about FPGA virtualization research by providing a comprehensive review of the existing literature.

}, keywords = {psclab}, issn = {1045-9219}, doi = {10.1109/TPDS.2021.3063670}, author = {Masudul Hassan Quraishi and Erfan Bank-Tavakoli and Fengbo Ren} } @inproceedings {360, title = {Are FPGAs Suitable for Edge Computing?}, year = {2018}, month = {07/2018}, address = {BOSTON, MA}, abstract = {

The rapid growth of Internet-of-Things (IoT) and artificial intelligence applications has called forth a new computing paradigm: edge computing. In this paper, we study the suitability of deploying FPGAs for edge computing from the perspectives of throughput sensitivity to workload size, architectural adaptiveness to algorithm characteristics, and energy efficiency. This goal is accomplished by conducting comparison experiments on an Intel Arria 10 GX1150 FPGA and an NVIDIA Tesla K40m GPU. The experimental results imply that the key advantages of adopting FPGAs for edge computing over GPUs are three-fold: (1) FPGAs can provide a consistent throughput invariant to the size of the application workload, which is critical for aggregating individual service requests from various IoT sensors; (2) FPGAs offer both spatial and temporal parallelism at a fine granularity and a massive scale, which guarantees consistently high performance for accelerating both high-concurrency and high-dependency algorithms; and (3) FPGAs feature 3-4 times lower power consumption and up to 30.7 times better energy efficiency, offering better thermal stability and lower energy cost per functionality.

}, keywords = {psclab}, author = {Saman Biookaghazadeh and Ming Zhao and Fengbo Ren} }