@phdthesis {Li2021, title = {Hardware-friendly Deep Learning for Edge Computing}, type = {Doctor of Philosophy in Computer Engineering}, year = {2021}, month = {05/2021}, school = {Arizona State University}, address = {Tempe}, abstract = {
The Internet-of-Things (IoT) generates a vast amount of streaming data. However, even accounting for the growth of the cloud computing infrastructure, IoT devices will produce two orders of magnitude more data than centralized data center servers can process or store. This trend inevitably calls for offloading IoT data processing to a decentralized edge computing infrastructure. Meanwhile, deep-learning-based applications have made great progress by leveraging heavy centralized computing resources to train large models for increasingly complicated tasks. Even though large-scale deep learning models perform well in terms of accuracy, their high computational complexity makes it impossible to offload them onto edge devices for real-time inference and timely response.
To enable timely IoT services on edge devices, this dissertation addresses the challenge from two perspectives. On the hardware side, a new FPGA-based framework for binary neural networks and an ASIC accelerator for natural scene text interpretation are proposed, designed with awareness of the computing resources and power constraints at the edge. On the algorithm side, this work presents both a methodology for building more compact models and techniques for finding better computation-accuracy trade-offs in existing models.
}, keywords = {psclab}, author = {Yixing Li} } @article {561, title = {OpenICS: Open Image Compressive Sensing Toolbox and Benchmark}, journal = {Software Impacts}, volume = {9}, year = {2021}, month = {05/2021}, abstract = {The real-world application of image compressive sensing is largely limited by the lack of standardization in implementation and evaluation. To address this limitation, we present OpenICS, an image compressive sensing toolbox that implements multiple popular image compressive sensing algorithms in a unified framework with a standardized user interface. Furthermore, a corresponding benchmark is also proposed to provide a fair and complete evaluation of the implemented algorithms. We hope this work can serve the growing compressive sensing research community and the industry by facilitating the development and application of image compressive sensing.
}, keywords = {psclab}, doi = {10.1016/j.simpa.2021.100081}, url = {https://arxiv.org/pdf/2103.00652.pdf}, author = {Jonathan Zhao and Matthew Westerham and Mark Lakatos-Toth and Zhikang Zhang and Avi Moskoff and Fengbo Ren} } @inproceedings {Li2020bnnpruning, title = {BNN Pruning: Pruning Binary Neural Network Guided by Weight Flipping Frequency}, year = {2020}, month = {03/2020}, address = {Santa Clara, CA}, abstract = {Due to their high computational complexity and memory storage requirements, full-precision convolutional neural networks (CNNs) are hard to deploy directly on embedded devices. Hardware-friendly designs are needed for resource-limited and energy-constrained embedded devices. Emerging solutions have been adopted for neural network compression, e.g., binary/ternary weight networks, pruned networks, and quantized networks. Among them, the binary neural network (BNN) is believed to be the most hardware-friendly framework due to its small network size and low computational complexity. No existing work has further shrunk the size of a BNN. In this work, we explore the redundancy in BNNs and build a compact BNN (CBNN) based on bit-level sensitivity analysis and bit-level data pruning. The input data is converted to a high-dimensional bit-sliced format. In the post-training stage, we analyze the impact of different bit slices on the accuracy. By pruning the redundant input bit slices and shrinking the network size, we are able to build a more compact BNN. Our results show that we can further scale down the network size of the BNN by up to 3.9x with no more than 1\% accuracy drop. The actual runtime can be reduced by up to 2x and 9.9x compared with the baseline BNN and its full-precision counterpart, respectively.
}, keywords = {psclab}, author = {Yixing Li and Shuai Zhang and Xichuan Zhou and Fengbo Ren} }
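To make the bit-level pruning idea in the BNN Pruning entry concrete, here is a minimal sketch (not the authors' code) of slicing 8-bit inputs into bit planes and greedily dropping the least significant planes while accuracy holds; the model_accuracy evaluation hook and the 1% budget are illustrative assumptions.

import numpy as np

def bit_slice(x_uint8):
    # Split an 8-bit input tensor into 8 binary bit planes (MSB first),
    # mirroring the high-dimensional bit-sliced input format described above.
    return np.stack([(x_uint8 >> b) & 1 for b in range(7, -1, -1)], axis=-1)

def prune_bit_slices(x_uint8, model_accuracy, max_drop=0.01):
    # Post-training sensitivity analysis: zero out the lowest remaining
    # plane, re-evaluate, and keep pruning while the drop stays in budget.
    planes = bit_slice(x_uint8)
    baseline = model_accuracy(planes)   # hypothetical evaluation hook
    keep = planes.shape[-1]
    while keep > 1:
        trial = planes.copy()
        trial[..., keep - 1:] = 0
        if baseline - model_accuracy(trial) > max_drop:
            break
        keep -= 1
    return planes[..., :keep]           # compact bit-sliced input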
@inproceedings {Li2020retinanet, title = {Light-Weight RetinaNet for Object Detection on Edge Devices}, year = {2020}, month = {04/2020}, address = {New Orleans, Louisiana}, abstract = {This paper aims at reducing the computation of RetinaNet, an mAP-30-tier network, to facilitate its practical deployment on edge devices for providing IoT-based object detection services. We first validate that RetinaNet has the best FLOPs-mAP trade-off among all mAP-30-tier networks. Then, we propose a light-weight RetinaNet structure with an effective computation-accuracy trade-off, obtained by reducing FLOPs only in the computationally intensive layers. Compared with the most common way of trading off computation against accuracy, input image scaling, the proposed solution shows a consistently better FLOPs-mAP trade-off curve. Light-weight RetinaNet achieves a 0.3\% mAP improvement at a 1.8x FLOPs reduction point over the original RetinaNet, and gains 1.8x more energy efficiency on an Intel Arria 10 FPGA accelerator in the context of edge computing. The proposed method can potentially help a wide range of object detection applications move closer to a preferred corner of runtime and accuracy, while enjoying more energy-efficient inference at the edge.
}, keywords = {psclab}, author = {Yixing Li and Akshay Dua and Fengbo Ren} }
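Since the Light-Weight RetinaNet entry turns on trimming FLOPs in the computationally intensive layers, a quick back-of-the-envelope counter shows where convolution cost concentrates; the layer shapes below are made-up examples, not profiles from the paper.

def conv_flops(h_out, w_out, c_in, c_out, k):
    # Each output element costs c_in * k * k multiply-accumulates;
    # count 2 FLOPs (multiply + add) per MAC.
    return 2 * h_out * w_out * c_out * c_in * k * k

# A 3x3 conv dominates a same-size 1x1 conv by k*k = 9x:
print(conv_flops(100, 100, 256, 256, 3))  # 11,796,480,000 FLOPs
print(conv_flops(100, 100, 256, 256, 1))  #  1,310,720,000 FLOPs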
@inproceedings {Zhou2020monet3d, title = {MoNet3D: Towards Accurate Monocular 3D Object Localization in Real Time}, year = {2020}, month = {07/2020}, address = {Virtual Event}, abstract = {Monocular multi-object detection and localization in 3D space has been proven to be a challenging task. The MoNet3D algorithm is a novel and effective framework that can predict the 3D position of each object in a monocular image and draw a 3D bounding box for each object. The MoNet3D method incorporates prior knowledge of the spatial geometric correlation of neighboring objects into the deep neural network training process to improve the accuracy of 3D object localization. Experiments on the KITTI dataset show that the accuracy for predicting the depth and horizontal coordinates of objects in 3D space can reach 96.25\% and 94.74\%, respectively. Moreover, the method realizes real-time image processing at 27.85 FPS, showing promising potential for embedded advanced driving assistance system applications. Our code is publicly available at
}, keywords = {psclab}, author = {Xichuan Zhou and Yicong Peng and Chunqiao Long and Fengbo Ren and Cong Shi} } @article {414, title = {A Review of Algorithm \& Hardware Design for AI-Based Biomedical Applications}, journal = {IEEE Transactions on Biomedical Circuits and Systems}, volume = {14}, year = {2020}, month = {04/2020}, pages = {145-163}, abstract = {This paper reviews the state of the art and trends in AI-based biomedical processing algorithms and hardware. The algorithms and hardware for different biomedical applications, such as ECG, EEG, and hearing aids, are reviewed and discussed. For algorithm design, various widely used biomedical signal classification algorithms are discussed, including support vector machines (SVM), back-propagation neural networks (BPNN), convolutional neural networks (CNN), probabilistic neural networks (PNN), recurrent neural networks (RNN), long short-term memory (LSTM) networks, and fuzzy neural networks. The pros and cons of these classification algorithms are analyzed and compared in the context of application scenarios. The research trends of AI-based biomedical processing algorithms and applications are also discussed. For hardware design, various AI-based biomedical processors are reviewed and discussed, including ECG, EEG, and EMG classification processors and hearing aid processors. Various techniques at the architecture and circuit levels are analyzed and compared, and research trends of AI-based biomedical processors are also discussed.
}, keywords = {psclab}, issn = {1940-9990}, doi = {10.1109/TBCAS.2020.2974154}, author = {Ying Wei and Jun Zhou and Yin Wang and Yinggang Liu and Qingsong Liu and Jiansheng Luo and Chao Wang and Fengbo Ren and Li Huang} } @article {499, title = {Systolic-CNN: An OpenCL-defined Scalable Run-time-flexible FPGA Accelerator Architecture for Accelerating Convolutional Neural Network Inference in Cloud/Edge Computing}, year = {2020}, month = {12/2020}, abstract = {This paper presents Systolic-CNN, an OpenCL-defined scalable, run-time-flexible FPGA accelerator architecture optimized for accelerating the inference of various convolutional neural networks (CNNs) in multi-tenancy cloud/edge computing. Existing OpenCL-defined FPGA accelerators for CNN inference are insufficient due to their limited flexibility for supporting multiple CNN models at run time and poor scalability, which results in underutilized FPGA resources and limited computational parallelism. Systolic-CNN adopts a highly pipelined and parallelized 1-D systolic array architecture, which efficiently exploits both spatial and temporal parallelism for accelerating CNN inference on FPGAs. Systolic-CNN is highly scalable and parameterized, and can be easily adapted by users to achieve up to 100\% utilization of the coarse-grained computation resources (i.e., DSP blocks) of a given FPGA. Systolic-CNN is also run-time-flexible in the context of multi-tenancy cloud/edge computing: it can be time-shared to accelerate a variety of CNN models at run time without the need to recompile the FPGA kernel hardware or reprogram the FPGA. Experimental results based on an Intel Arria/Stratix 10 GX FPGA development board show that the optimized single-precision implementation of Systolic-CNN achieves an average inference latency of 7 ms/2 ms, 84 ms/33 ms, 202 ms/73 ms, 1615 ms/873 ms, and 900 ms/498 ms per image for accelerating AlexNet, ResNet-50, ResNet-152, RetinaNet, and Light-weight RetinaNet, respectively. Codes are available at https://github.com/PSCLab-ASU/Systolic-CNN.
}, keywords = {psclab}, url = {https://arxiv.org/pdf/2012.03177.pdf}, author = {Akshay Dua and Yixing Li and Fengbo Ren} }
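As a rough illustration of the 1-D systolic array idea in the Systolic-CNN entry (the real implementation is in the linked repository), the sketch below simulates an output-stationary chain of processing elements at cycle granularity; the shapes and scheduling details are illustrative assumptions.

import numpy as np

def systolic_matvec(W, x):
    # Output-stationary 1-D systolic sketch: PE i keeps accumulator y[i],
    # and the input vector streams through the chain one PE per cycle, so
    # PE i sees x[j] at cycle i + j and applies its stored weight W[i, j].
    m, n = W.shape
    y = np.zeros(m)
    pipe = [None] * m                  # input value latched in each PE
    for cycle in range(n + m - 1):
        new_in = x[cycle] if cycle < n else None
        pipe = [new_in] + pipe[:-1]    # shift register between PEs
        for i, xv in enumerate(pipe):
            if xv is not None:
                y[i] += W[i, cycle - i] * xv   # one MAC per PE per cycle
    return y

# Sanity check: matches a plain matrix-vector product.
W = np.arange(12.0).reshape(3, 4); x = np.array([1.0, 2.0, 3.0, 4.0])
assert np.allclose(systolic_matvec(W, x), W @ x)

After the pipeline fills, every PE performs one multiply-accumulate per cycle, which is the spatial/temporal parallelism the abstract refers to; Systolic-CNN applies the same dataflow to convolution at FPGA scale.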
@article {379, title = {A 34-FPS 698-GOP/s/W Binarized Deep Neural Network-based Natural Scene Text Interpretation Accelerator for Mobile Edge Computing}, journal = {IEEE Transactions on Industrial Electronics (TIE)}, volume = {66}, year = {2019}, month = {10/2018}, pages = {7407-7416}, abstract = {Scene text interpretation is a critical part of natural scene interpretation. Currently, most existing work is based on high-end GPU implementations, which are commonly used on the server side. However, in IoT application scenarios, the communication overhead from the edge device to the server is quite large and sometimes even dominates the total processing time. Hence, an edge-computing-oriented design is needed to solve this problem. In this paper, we present the architectural design and implementation of a natural scene text interpretation (NSTI) accelerator, which can classify and localize text regions at the pixel level efficiently and in real time on mobile devices. To target real-time and low-latency processing, a binary convolutional encoder-decoder network (B-CEDNet) is adopted as the core architecture to enable massive parallelism due to its binary features. Massively parallelized computations and highly pipelined data flow control enhance its latency and throughput performance. In addition, all binarized intermediate results and parameters are stored on chip to eliminate the power consumption and latency overhead of off-chip communication. The NSTI accelerator is implemented in a 40-nm CMOS technology and can process scene text images (size of 128x32) at 34 fps with a latency of 40 ms for pixelwise interpretation, achieving pixelwise classification accuracy over 90\% on the ICDAR-03 and ICDAR-13 datasets. The real energy efficiency is 698 GOP/s/W, and the peak energy efficiency reaches up to 7825 GOP/s/W. The proposed accelerator is 7x more energy-efficient than its optimized GPU-based implementation counterpart, while maintaining real-time throughput with a latency of 40 ms.
}, keywords = {psclab}, doi = {10.1109/TIE.2018.2875643}, url = {https://ieeexplore.ieee.org/document/8513982}, author = {Yixing Li and Zichuan Liu and Wenye Liu and Yu Jiang and Yongliang Wang and Wang Ling Goh and Hao Yu and Fengbo Ren} } @article {390, title = {Enabling Deep Learning for Edge Computing}, year = {2019}, keywords = {psclab}, author = {Baoxin Li and Fengbo Ren} } @article {369, title = {A GPU-Outperforming FPGA Accelerator Architecture for Binary Convolutional Neural Networks}, journal = {ACM Journal on Emerging Technologies in Computing (JETC) - Special Issue on Frontiers of Hardware and Algorithms for On-chip Learning}, volume = {14}, year = {2018}, month = {07/2018}, pages = {18:1-18:16}, abstract = {FPGA-based hardware accelerators for convolutional neural networks (CNNs) have attracted great attention due to their higher energy efficiency than GPUs. However, it is challenging for FPGA-based solutions to achieve a higher throughput than their GPU counterparts. In this paper, we demonstrate that FPGA acceleration can be a superior solution in terms of both throughput and energy efficiency when a CNN is trained with binary constraints on weights and activations. Specifically, we propose an optimized FPGA accelerator architecture tailored for bitwise convolution and normalization that features massive spatial parallelism with deep pipeline stages. A key advantage of the FPGA accelerator is that its performance is insensitive to data batch size, while the performance of GPU acceleration varies largely depending on the batch size of the data. Experimental results show that the proposed accelerator architecture for binary CNNs running on a Virtex-7 FPGA is 8.3x faster and 75x more energy-efficient than a Titan X GPU for processing online individual requests in small batch sizes. For processing static data in large batch sizes, the proposed solution is on a par with a Titan X GPU in terms of throughput while delivering 9.5x higher energy efficiency.
}, keywords = {psclab}, doi = {10.1145/3154839}, url = {https://dl.acm.org/citation.cfm?id=3154839}, author = {Yixing Li and Zichuan Liu and Kai Xu and Hao Yu and Fengbo Ren} }
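The bitwise convolution at the heart of the binary-CNN accelerators above reduces to XNOR plus popcount; here is a minimal sketch of that kernel as a general illustration of the technique, not code from either paper.

def binary_dot(w_bits, a_bits, n):
    # With weights/activations in {-1, +1} packed as {0, 1} bits,
    # dot(w, a) = n - 2 * popcount(w XOR a); XNOR-popcount replaces all
    # multiplies, which is why binary CNNs map so well onto FPGA logic.
    return n - 2 * bin(w_bits ^ a_bits).count("1")

# w = [+1, -1, +1, +1] and a = [+1, +1, -1, +1], packed MSB-first:
w, a = 0b1011, 0b1101
print(binary_dot(w, a, n=4))  # 0, matching (+1)(+1)+(-1)(+1)+(+1)(-1)+(+1)(+1)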
@inproceedings {Liu2018squeezedtext, title = {SqueezedText: A Real-time Scene Text Recognition by Binary Convolutional Encoder-decoder Network}, year = {2018}, month = {04/2018}, pages = {7194-7201}, address = {New Orleans, Louisiana}, abstract = {A new approach for real-time scene text recognition is proposed in this paper, combining a novel binary convolutional encoder-decoder network (B-CEDNet) with a bidirectional recurrent neural network (Bi-RNN). The B-CEDNet is engaged as a visual front end to provide elaborate character detection, and a back-end Bi-RNN performs character-level sequential correction and classification based on learned contextual knowledge. The front-end B-CEDNet can process multiple regions containing characters in a one-off forward operation, and is trained under binary constraints with significant compression; hence it leads to both a remarkable inference run-time speedup and a memory usage reduction. With the elaborate character detection, the back-end Bi-RNN merely processes a low-dimensional feature sequence with the category and spatial information of the extracted characters for sequence correction and classification. By training with over 1,000,000 synthetic scene text images, the B-CEDNet achieves a recall rate of 0.86, a precision of 0.88, and an F-score of 0.87 on ICDAR-03 and ICDAR-13. With the correction and classification by the Bi-RNN, the proposed real-time scene text recognition achieves state-of-the-art accuracy while consuming less than 1 ms of inference run-time. The processing flow is realized on a GPU with a small network size of 1.01 MB for the B-CEDNet and 3.23 MB for the Bi-RNN, which is much faster and smaller than existing solutions.
}, keywords = {psclab}, author = {Zichuan Liu and Yixing Li and Fengbo Ren and Hao Yu and Wang Ling Goh} } @inproceedings {Xu2017ddcs, title = {A Data-Driven Compressive Sensing Framework Tailored For Energy-Efficient Wearable Sensing}, year = {2017}, month = {03/2017}, pages = {861-865}, address = {New Orleans, LA}, abstract = {Compressive sensing (CS) is a promising technology for realizing energy-efficient wireless sensors for long-term health monitoring. However, conventional model-driven CS frameworks suffer from limited compression ratio and reconstruction quality when dealing with physiological signals, due to inaccurate models and the overlooking of individual variability. In this paper, we propose a data-driven CS framework that can learn signal characteristics and personalized features from any individual recording of physiologic signals to enhance CS performance with a minimized number of measurements. Such improvements are accomplished by a co-training approach that optimizes the sensing matrix and the dictionary towards an improved restricted isometry property and improved signal sparsity, respectively. Experimental results on ECG signals show that the proposed method, at a compression ratio of 10x, successfully reduces the isometry constant of the trained sensing matrices by 86\% against random matrices and improves the overall reconstructed signal-to-noise ratio by 15 dB over conventional model-driven approaches.
}, keywords = {psclab}, author = {Kai Xu and Yixing Li and Fengbo Ren} }
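The compressive sensing entries around this point share the same measurement model: a sparse signal x is encoded as y = Phi x with far fewer measurements than signal dimensions, and reconstruction exploits sparsity. The sketch below runs that pipeline with a random Boolean sampling matrix and off-the-shelf OMP reconstruction; the sizes are illustrative, and the papers' actual contribution, learning or optimizing Phi and the dictionary, is not reproduced here.

import numpy as np
from sklearn.linear_model import OrthogonalMatchingPursuit

rng = np.random.default_rng(0)
n, m, k = 256, 64, 5    # signal length, measurements (4x compression), sparsity

# k-sparse signal; the papers learn a personalized dictionary so that
# real physiological signals become this sparse in the learned basis.
x = np.zeros(n)
x[rng.choice(n, k, replace=False)] = rng.standard_normal(k)

# Random Boolean sampling matrix: sensing hardware needs adders only.
Phi = rng.integers(0, 2, size=(m, n)).astype(float)
y = Phi @ x             # energy-efficient encoding on the sensor node

# OMP recovery (sklearn centers the data, removing the 0/1 matrix's DC bias).
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=k).fit(Phi, y)
err = np.linalg.norm(x - omp.coef_) / np.linalg.norm(x)
print(f"relative reconstruction error: {err:.2e}")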
@article {Wang2017boolean, title = {Data-Driven Sampling Matrix Boolean Optimization for Energy-Efficient Biomedical Signal Acquisition by Compressive Sensing}, journal = {IEEE Transactions on Biomedical Circuits and Systems}, volume = {11}, year = {2017}, month = {11/2016}, pages = {255-266}, abstract = {Compressive sensing is widely used in biomedical applications, and the sampling matrix plays a critical role in both the quality and the power consumption of signal acquisition. It projects a high-dimensional vector of data into a low-dimensional subspace by matrix-vector multiplication. An optimal sampling matrix can ensure accurate data reconstruction and/or a high compression ratio. Most existing optimization methods can only produce real-valued embedding matrices, which result in large energy consumption during data acquisition. In this paper, we propose an efficient method that finds an optimal Boolean sampling matrix in order to reduce the energy consumption. Compared to random Boolean embedding, our data-driven Boolean sampling matrix can improve the image recovery quality by 9 dB. Moreover, in terms of sampling hardware complexity, it reduces the energy consumption by 4.6{\texttimes} and the silicon area by 1.9{\texttimes} over data-driven real-valued embedding.
}, keywords = {psclab}, author = {Yuhao Wang and Xin Li and Kai Xu and Fengbo Ren and Hao Yu} } @inproceedings {Li2017crater, title = {Recognizing terrain features on terrestrial surface using a deep learning model - An example with crater detection}, year = {2017}, month = {11/2017}, pages = {33-36}, publisher = {ACM}, address = {Los Angeles, CA}, abstract = {This paper exploits the use of a popular deep learning model, the Faster R-CNN, to support automatic terrain feature detection and classification using a mixed set of optical remote sensing and natural images. Crater detection is used as the case study in this research, since this geomorphological feature provides important information about surface aging. Craters, such as impact craters, also affect global changes in many aspects, such as geography, topography, and mineral and hydrocarbon production. The collected data were labeled and the network was trained on a GPU server. Experimental results show that the Faster R-CNN model coupled with the widely used convolutional network ZF-Net performs well in detecting craters on the terrestrial surface.
}, keywords = {psclab}, isbn = {978-1-4503-5498-1/17/11}, author = {Wenwen Li and Bin Zhou and Chia-Yu Hsu and Yixing Li and Fengbo Ren} } @inproceedings {Liu2016bcednet, title = {A Binary Convolutional Encoder-decoder Network for Real-time Natural Scene Text Processing}, year = {2016}, month = {12/2016}, abstract = {In this paper, we develop a binary convolutional encoder-decoder network (B-CEDNet) for natural scene text processing (NSTP). It converts a text image to a class-distinguished salience map that reveals the categorical, spatial, and morphological information of characters. Existing solutions are either memory-consuming or run-time-consuming and cannot be applied to real-time applications on resource-constrained devices such as advanced driver assistance systems. The developed network can process multiple regions containing characters in a one-off forward operation, and is trained to have binary weights and binary feature maps, which leads to both a remarkable inference run-time speedup and a memory usage reduction. By training with over 200,000 synthetic scene text images (size of 32 {\texttimes} 128), it can achieve 90\% and 91\% pixel-wise accuracy on the ICDAR-03 and ICDAR-13 datasets. It consumes only 4.59 ms of inference run-time on a GPU with a small network size of 2.14 MB, which is up to 8{\texttimes} faster and 96\% smaller than its full-precision version.
}, keywords = {psclab}, author = {Zichuan Liu and Yixing Li and Fengbo Ren and Hao Yu} } @inproceedings {9999, title = {An Energy-Efficient Compressive Sensing Framework Incorporating Online Dictionary Learning for Long-Term Wireless Health Monitoring}, year = {2016}, month = {03/2016}, pages = {804-808}, publisher = {IEEE}, address = {Shanghai, China}, abstract = {The wireless body area network (WBAN) is emerging in the mobile healthcare area to replace traditional wire-connected monitoring devices. As wireless data transmission dominates the power cost of sensor nodes, it is beneficial to reduce the data size without much information loss. Compressive sensing (CS) is a perfect candidate for achieving this goal compared to existing compression techniques. In this paper, we propose a general framework that utilizes CS and online dictionary learning (ODL) together. The learned dictionary carries the individual characteristics of the original signal, under which the signal has an even sparser representation than under pre-determined dictionaries. As a consequence, the compression ratio is effectively improved by 2-4x compared to prior work. Besides, the proposed framework offloads pre-processing from sensor nodes to the server node prior to dictionary learning, providing a further reduction in hardware costs. As it is data-driven, the proposed framework has the potential to be used with a wide range of physiological signals.
}, keywords = {psclab}, author = {Kai Xu and Yixing Li and Fengbo Ren} } @article {19, title = {Healable Capacitive Touch Screen Sensors Based on Transparent Composite Electrodes Comprising Silver Nanowires and a Furan/Maleimide Diels-Alder Cycloaddition Polymer}, journal = {ACS Nano}, volume = {8}, year = {2014}, month = {Dec.}, pages = {12874{\textendash}12882}, abstract = {A healable transparent capacitive touch screen sensor has been fabricated based on a healable silver nanowire{\textendash}polymer composite electrode. The composite electrode features a layer of silver nanowire percolation network embedded into the surface layer of a polymer substrate comprising an ultrathin soldering polymer layer to confine the nanowires to the surface of a healable Diels{\textendash}Alder cycloaddition copolymer and to attain low contact resistance between the nanowires. The composite electrode has a figure-of-merit sheet resistance of 18 Ω/sq with 80\% transmittance at 550 nm. A surface crack cut on the conductive surface with 18 Ω is healed by heating at 100 {\textdegree}C, and the sheet resistance recovers to 21 Ω in 6 min. A healable touch screen sensor with an array of 8 {\texttimes} 8 capacitive sensing points is prepared by stacking two composite films patterned with 8 rows and 8 columns of coupling electrodes at a 90{\textdegree} angle. After deliberate damage, the coupling electrodes recover their touch sensing function upon heating at 80 {\textdegree}C for 30 s. A capacitive touch screen based on Arduino is demonstrated to be capable of quick recovery from malfunction caused by a razor blade cut. After four cycles of cutting and healing, the sensor array remains functional.
}, keywords = {psclab}, url = {http://pubs.acs.org/doi/abs/10.1021/nn506610p}, author = {Junpeng Li and Jiajie Liang and Lu Li and Fengbo Ren and Wei Hu and Juan Li and Shuhua Qi and Qibing Pei} } @article {22, title = {A Square-Root-Free Matrix Decomposition Method for Energy-Efficient Least Square Computation on Embedded Systems}, journal = {IEEE Embedded Systems Letters}, volume = {6}, year = {2014}, month = {Aug.}, pages = {73{\textendash}76}, abstract = {QR decomposition (QRD) is used to solve least-squares (LS) problems in a wide range of applications. However, traditional QRD methods, such as Gram-Schmidt (GS), involve high computational complexity and nonlinear operations to achieve high throughput, limiting their usage on resource-limited platforms. To enable efficient LS computation on embedded systems for real-time applications, this paper presents an alternative decomposition method, called QDRD, which relaxes system requirements while maintaining the same level of performance. Specifically, QDRD eliminates both the square-root operations in the normalization step and the divisions in the subsequent backward substitution. Simulation results show that the accuracy and reliability of the factorization matrices can be significantly improved by QDRD, especially when executed on precision-limited platforms. Furthermore, benchmarking results on an embedded platform show that QDRD provides consistently better energy efficiency and higher throughput than GS-QRD in solving LS problems. Up to 4x and 6.5x improvements in energy efficiency and throughput, respectively, can be achieved for small-size problems.
}, keywords = {psclab}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6882128}, author = {Fengbo Ren and Chenxin Zhang and Liang Liu and Wenyao Xu and V. Owall and Dejan Markovi{\'c}} }
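The square-root-free idea in the QDRD entry can be reproduced in a few lines: run Gram-Schmidt without normalizing, keep the squared column norms in a diagonal D, and solve the LS problem through a unit-diagonal triangular system. The sketch below illustrates that decomposition idea; it is not the paper's fixed-point implementation.

import numpy as np

def qdrd_lstsq(A, b):
    # A = Q R with orthogonal but UNnormalized columns in Q (no square
    # roots taken) and unit upper triangular R; d holds ||q_j||^2.
    m, n = A.shape
    Q = np.zeros((m, n))
    R = np.eye(n)
    d = np.zeros(n)
    for j in range(n):
        q = A[:, j].copy()
        for i in range(j):
            R[i, j] = (Q[:, i] @ A[:, j]) / d[i]  # projection, sqrt-free
            q -= R[i, j] * Q[:, i]
        Q[:, j] = q
        d[j] = q @ q
    # The normal equations collapse to R x = D^{-1} Q^T b, and the backward
    # substitution needs no divisions thanks to R's unit diagonal.
    y = (Q.T @ b) / d
    x = np.zeros(n)
    for j in range(n - 1, -1, -1):
        x[j] = y[j] - R[j, j + 1:] @ x[j + 1:]
    return x

# For full-column-rank A this matches the ordinary LS solution:
A = np.random.default_rng(1).standard_normal((8, 3)); b = np.arange(8.0)
assert np.allclose(qdrd_lstsq(A, b), np.linalg.lstsq(A, b, rcond=None)[0])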
@inproceedings {29, title = {mCOPD: Mobile Phone Based Lung Function Diagnosis and Exercise System for COPD}, year = {2013}, month = {05/2013}, publisher = {ACM}, abstract = {COPD (chronic obstructive pulmonary disease) is a serious lung disease that causes difficulty in breathing. COPD patients require lung function examinations and must perform breathing exercises on a regular basis in order to manage and stay aware of their health status. In this paper, we designed and developed a mobile-phone-based system for lung function diagnosis, called mCOPD. Besides enabling accurate COPD examinations at home, the mCOPD system also offers a video-game-based guidance system for breathing exercises. We evaluated mCOPD in controlled and uncontrolled environments with 40 subjects. The experimental results show that our system is a promising tool for the remote medical treatment of COPD.
}, keywords = {psclab}, url = {http://dl.acm.org/citation.cfm?id=2504383}, author = {Wenyao Xu and Ming-Chun Huang and Jason J. Liu and Fengbo Ren and Xinchen Shen and Xiao Liu and Majid Sarrafzadeh} }