% PSCLab publication list (every entry carries keywords = psclab).
% Conventions used below: one field per line, brace-delimited values,
% authors in "Last, First" form, month as the standard three-letter macros,
% page ranges with a double hyphen, bare DOIs, and LaTeX escapes instead of
% raw non-ASCII characters. Acronyms and proper nouns in titles are
% brace-protected so sentence-casing styles do not lowercase them.

% Entry 497: preprint that is still under review, so there is no proceedings
% title yet. The former non-numeric year value now lives in the note field;
% the year is taken from the arXiv identifier (2011 = November 2020).
% NOTE(review): switch back to an inproceedings entry with booktitle and
% pages once the paper is accepted.
@misc{497,
  author        = {Riera, Michael and Bank-Tavakoli, Erfan and Quraishi, Masudul Hassan and Ren, Fengbo},
  title         = {{HALO} 1.0: A Hardware-agnostic Accelerator Orchestration Framework for Enabling Hardware-agnostic Programming with True Performance Portability for Heterogeneous {HPC}},
  year          = {2020},
  note          = {Under review},
  eprint        = {2011.10896},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/pdf/2011.10896.pdf},
  keywords      = {psclab},
  abstract      = {Hardware-agnostic programming with high performance portability will be the bedrock for realizing the ubiquitous adoption of emerging accelerator technologies in future heterogeneous high-performance computing (HPC) systems, which is the key to achieving the next level of HPC performance on an expanding accelerator landscape. In this paper, we present HALO 1.0, an open-ended extensible multi-agent software framework, that implements a set of proposed hardware-agnostic accelerator orchestration (HALO) principles and a novel compute-centric message passing interface (C2MPI) specification for enabling the portable and performance-optimized execution of hardware-agnostic application codes across heterogeneous accelerator resources. The experiment results of evaluating eight widely used HPC subroutines based on Intel Xeon E5-2620 v4 CPUs, Intel Arria 10 GX FPGAs, and NVIDIA GeForce RTX 2080 Ti GPUs show that HALO 1.0 allows the same hardware-agnostic application codes of the HPC kernels, without any change, to run across all the computing devices with a consistently maximum performance portability score of 1.0, which is 2x-861,883x higher than the OpenCL-based solution that suffers from an unstably low performance portability score.},
}

% The citation key of this entry used to be the address text "Virtual Event",
% which contains a space and is not a usable key; it now follows an
% author-year-keyword scheme.
% TODO(review): booktitle is required for this entry type and is not recorded
% in this file. The DOI suggests SBAC-PAD 2021 - confirm and add it.
@inproceedings{banktavakoli2021fschol,
  author   = {Bank-Tavakoli, Erfan and Riera, Michael and Quraishi, Masudul Hassan and Ren, Fengbo},
  title    = {{FSCHOL}: An {OpenCL}-based {HPC} Framework for Accelerating Sparse {Cholesky} Factorization on {FPGAs}},
  year     = {2021},
  month    = oct,
  pages    = {209--220},
  address  = {Virtual Event},
  doi      = {10.1109/SBAC-PAD53543.2021.00032},
  keywords = {psclab},
  abstract = {The proposed FSCHOL framework consists of an FPGA kernel implementing a throughput-optimized hardware architecture for accelerating the supernodal multifrontal algorithm for sparse Cholesky factorization and a host program implementing a novel scheduling algorithm for finding the optimal execution order of supernodes computations for an elimination tree on the FPGA to eliminate the need for off-chip memory access for storing intermediate results. Moreover, the proposed scheduling algorithm minimizes on-chip memory requirements for buffering intermediate results by resolving the dependency of parent nodes in an elimination tree through temporal parallelism. Experiment results for factorizing a set of sparse matrices in various sizes from SuiteSparse Matrix Collection show that the proposed FSCHOL implemented on an Intel Stratix 10 GX FPGA development board achieves on average 5.5{\texttimes} and 9.7{\texttimes} higher performance and 10.3{\texttimes} and 24.7{\texttimes} lower energy consumption than implementations of CHOLMOD on an Intel Xeon E5-2637 CPU and an NVIDIA V100 GPU, respectively.},
}

@article{498,
  author   = {Quraishi, Masudul Hassan and Bank-Tavakoli, Erfan and Ren, Fengbo},
  title    = {A Survey of System Architectures and Techniques for {FPGA} Virtualization},
  journal  = {IEEE Transactions on Parallel and Distributed Systems},
  volume   = {32},
  year     = {2021},
  month    = sep,
  pages    = {2216--2230},
  issn     = {1045-9219},
  doi      = {10.1109/TPDS.2021.3063670},
  keywords = {psclab},
  abstract = {FPGA accelerators are gaining increasing attention in both cloud and edge computing because of their hardware flexibility, high computational throughput, and low power consumption. However, the design flow of FPGAs often requires specific knowledge of the underlying hardware, which hinders the wide adoption of FPGAs by application developers. Therefore, the virtualization of FPGAs becomes extremely important to create a useful abstraction of the hardware suitable for application developers. Such abstraction also enables the sharing of FPGA resources among multiple users and accelerator applications, which is important because, traditionally, FPGAs have been mostly used in single-user, single-embedded-application scenarios. There are many works in the field of FPGA virtualization covering different aspects and targeting different application areas. In this survey, we review the system architectures used in the literature for FPGA virtualization. In addition, we identify the primary objectives of FPGA virtualization, based on which we summarize the techniques for realizing FPGA virtualization. This survey helps researchers to efficiently learn about FPGA virtualization research by providing a comprehensive review of the existing literature.},
}

% The citation key of this entry used to be the address text "Seattle, WA";
% the comma ended the key early and broke parsing of the entry.
% TODO(review): booktitle is required for this entry type and is not recorded
% in this file - add the proceedings title.
@inproceedings{xu2020learning,
  author   = {Xu, Kai and Qin, Minghai and Sun, Fei and Wang, Yuhao and Chen, Yen-Kuang and Ren, Fengbo},
  title    = {Learning in the Frequency Domain},
  year     = {2020},
  month    = jun,
  pages    = {1740--1749},
  address  = {Seattle, WA},
  keywords = {psclab},
  abstract = {Deep neural networks have achieved remarkable success in computer vision tasks. Existing neural networks mainly operate in the spatial domain with fixed input sizes. For practical applications, images are usually large and have to be downsampled to the predetermined input size of neural networks. Even though the downsampling operations reduce computation and the required communication bandwidth, it removes both redundant and salient information obliviously, which results in accuracy degradation. Inspired by digital signal processing theories, we analyze the spectral bias from the frequency perspective and propose a learning-based frequency selection method to identify the trivial frequency components which can be removed without accuracy loss. The proposed method of learning in the frequency domain leverages identical structures of the well-known neural networks, such as ResNet-50, MobileNetV2, and Mask R-CNN, while accepting the frequency-domain information as the input. Experiment results show that learning in the frequency domain with static channel selection can achieve higher accuracy than the conventional spatial downsampling approach and meanwhile further reduce the input data size. Specifically for ImageNet classification with the same input size, the proposed method achieves 1.41\% and 0.66\% top-1 accuracy improvements on ResNet-50 and MobileNetV2, respectively. Even with half input size, the proposed method still improves the top-1 accuracy on ResNet-50 by 1\%. In addition, we observe a 0.8\% average precision improvement on Mask R-CNN for instance segmentation on the COCO dataset.},
}

% The doi below is the bare DOI taken from the publisher URL, which is kept.
@article{19,
  author   = {Li, Junpeng and Liang, Jiajie and Li, Lu and Ren, Fengbo and Hu, Wei and Li, Juan and Qi, Shuhua and Pei, Qibing},
  title    = {Healable Capacitive Touch Screen Sensors Based on Transparent Composite Electrodes Comprising Silver Nanowires and a Furan/Maleimide {Diels-Alder} Cycloaddition Polymer},
  journal  = {ACS Nano},
  volume   = {8},
  year     = {2014},
  month    = dec,
  pages    = {12874--12882},
  doi      = {10.1021/nn506610p},
  url      = {http://pubs.acs.org/doi/abs/10.1021/nn506610p},
  keywords = {psclab},
  abstract = {A healable transparent capacitive touch screen sensor has been fabricated based on a healable silver nanowire{\textendash}polymer composite electrode. The composite electrode features a layer of silver nanowire percolation network embedded into the surface layer of a polymer substrate comprising an ultrathin soldering polymer layer to confine the nanowires to the surface of a healable Diels{\textendash}Alder cycloaddition copolymer and to attain low contact resistance between the nanowires. The composite electrode has a figure-of-merit sheet resistance of 18 $\Omega$/sq with 80\% transmittance at 550 nm. A surface crack cut on the conductive surface with 18 $\Omega$ is healed by heating at 100 {\textdegree}C, and the sheet resistance recovers to 21 $\Omega$ in 6 min. A healable touch screen sensor with an array of 8 {\texttimes} 8 capacitive sensing points is prepared by stacking two composite films patterned with 8 rows and 8 columns of coupling electrodes at 90{\textdegree} angle. After deliberate damage, the coupling electrodes recover touch sensing function upon heating at 80 {\textdegree}C for 30 s. A capacitive touch screen based on Arduino is demonstrated capable of performing quick recovery from malfunction caused by a razor blade cutting. After four cycles of cutting and healing, the sensor array remains functional.},
}