<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.0 20120330//EN" "JATS-journalpublishing1.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">JDS</journal-id>
<journal-title-group><journal-title>Journal of Data Science</journal-title></journal-title-group>
<issn pub-type="epub">1683-8602</issn><issn pub-type="ppub">1680-743X</issn><issn-l>1680-743X</issn-l>
<publisher>
<publisher-name>School of Statistics, Renmin University of China</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">JDS1052</article-id>
<article-id pub-id-type="doi">10.6339/22-JDS1052</article-id>
<article-categories><subj-group subj-group-type="heading">
<subject>Computing in Data Science</subject></subj-group></article-categories>
<title-group>
<article-title>Variable Selection with Scalable Bootstrapping in Generalized Linear Model for Massive Data</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Zhang</surname><given-names>Zhang</given-names></name><xref ref-type="aff" rid="j_jds1052_aff_001">1</xref>
</contrib>
<contrib contrib-type="author">
<name><surname>He</surname><given-names>Zhibing</given-names></name><xref ref-type="aff" rid="j_jds1052_aff_002">2</xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Qin</surname><given-names>Yichen</given-names></name><xref ref-type="aff" rid="j_jds1052_aff_003">3</xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Shen</surname><given-names>Ye</given-names></name><xref ref-type="aff" rid="j_jds1052_aff_004">4</xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Shia</surname><given-names>Ben-Chang</given-names></name><xref ref-type="aff" rid="j_jds1052_aff_005">5</xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Li</surname><given-names>Yang</given-names></name><email xlink:href="mailto:yang.li@ruc.edu.cn">yang.li@ruc.edu.cn</email><xref ref-type="aff" rid="j_jds1052_aff_001">1</xref><xref ref-type="aff" rid="j_jds1052_aff_006">6</xref><xref ref-type="corresp" rid="cor1">∗</xref>
</contrib>
<aff id="j_jds1052_aff_001"><label>1</label>Center for Applied Statistics and School of Statistics, <institution>Renmin University of China</institution>, Beijing, <country>China</country></aff>
<aff id="j_jds1052_aff_002"><label>2</label>School of Mathematical and Statistical Sciences, <institution>Arizona State University</institution>, AZ, <country>USA</country></aff>
<aff id="j_jds1052_aff_003"><label>3</label>Department of Operations, Business Analytics, and Information Systems, <institution>University of Cincinnati</institution>, OH, <country>USA</country></aff>
<aff id="j_jds1052_aff_004"><label>4</label>College of Public Health, <institution>University of Georgia</institution>, GA, <country>USA</country></aff>
<aff id="j_jds1052_aff_005"><label>5</label>Graduate Institute of Business Administration and College of Management, <institution>Fu Jen Catholic University</institution>, <country>Taiwan</country></aff>
<aff id="j_jds1052_aff_006"><label>6</label>RSS and China-Re Life Joint Lab on Public Health and Risk Management, <institution>Renmin University of China</institution>, Beijing, <country>China</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>∗</label>Corresponding author. Email: <ext-link ext-link-type="uri" xlink:href="mailto:yang.li@ruc.edu.cn">yang.li@ruc.edu.cn</ext-link>.</corresp>
</author-notes>
<pub-date pub-type="ppub"><year>2023</year></pub-date><pub-date pub-type="epub"><day>7</day><month>7</month><year>2022</year></pub-date><volume>21</volume><issue>1</issue><fpage>87</fpage><lpage>105</lpage><supplementary-material id="S1" content-type="archive" xlink:href="jds1052_s001.zip" mimetype="application" mime-subtype="x-zip-compressed">
<caption>
<title>Supplementary Material</title>
<p>The .zip archive contains the following files and/or directories: 
<list>
<list-item id="j_jds1052_li_001">
<label>•</label>
<p>/code and data/: Directory that includes code and files necessary to reproduce the numerical results presented in this paper.</p>
</list-item>
<list-item id="j_jds1052_li_002">
<label>•</label>
<p>supplementary.pdf: Online supplementary material.</p>
</list-item>
</list>
</p>
</caption>
</supplementary-material><history><date date-type="received"><day>20</day><month>2</month><year>2022</year></date><date date-type="accepted"><day>26</day><month>5</month><year>2022</year></date></history>
<permissions><copyright-statement>2023 The Author(s). Published by the School of Statistics and the Center for Applied Statistics, Renmin University of China.</copyright-statement><copyright-year>2023</copyright-year>
<license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>Open access article under the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">CC BY</ext-link> license.</license-p></license></permissions>
<abstract>
<p>Bootstrapping is commonly used as a tool for non-parametric statistical inference to assess the quality of estimators in variable selection models. However, for a massive dataset, the computational requirement when using bootstrapping in variable selection models (BootVS) can be prohibitive. In this study, we propose a novel framework using a bag of little bootstraps variable selection (BLBVS) method with a ridge hybrid procedure to assess the quality of estimators in generalized linear models with a regularized term, such as lasso and group lasso penalties. The proposed method can be easily and naturally implemented with distributed computing, and thus has significant computational advantages for massive datasets. The simulation results show that our novel BLBVS method performs excellently in both accuracy and efficiency when compared with BootVS. Real data analyses including regression on a bike sharing dataset and classification of a lending club dataset are presented to illustrate the computational superiority of BLBVS in large-scale datasets.</p>
</abstract>
<kwd-group>
<label>Keywords</label>
<kwd>distributed computing</kwd>
<kwd>large-scale dataset</kwd>
<kwd>scalable bootstrap</kwd>
<kwd>variable selection</kwd>
</kwd-group>
<funding-group><award-group><funding-source xlink:href="https://doi.org/10.13039/501100001809">National Natural Science Foundation of China</funding-source><award-id>71771211</award-id></award-group><funding-statement>Dr. Yang Li was supported by Platform of Public Health &amp; Disease Control and Prevention, Major Innovation &amp; Planning Interdisciplinary Platform for the “Double-First Class” Initiative, Renmin University of China and National Natural Science Foundation of China (71771211). </funding-statement></funding-group>
</article-meta>
</front>
<body/>
<back>
<ref-list id="j_jds1052_reflist_001">
<title>References</title>
<ref id="j_jds1052_ref_001">
<mixed-citation publication-type="chapter"> <string-name><surname>Bickel</surname> <given-names>PJ</given-names></string-name>, <string-name><surname>Götze</surname> <given-names>F</given-names></string-name>, <string-name><surname>van Zwet</surname> <given-names>WR</given-names></string-name> (<year>2012</year>). <chapter-title>Resampling fewer than n observations: gains, losses, and remedies for losses</chapter-title>. In: <source><italic>Selected Works of Willem van Zwet</italic></source>, <fpage>267</fpage>–<lpage>297</lpage>. <publisher-name>Springer</publisher-name>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_002">
<mixed-citation publication-type="journal"> <string-name><surname>Breiman</surname> <given-names>L</given-names></string-name> (<year>2001</year>). <article-title>Random forests</article-title>. <source><italic>Machine Learning</italic></source>, <volume>45</volume>(<issue>1</issue>): <fpage>5</fpage>–<lpage>32</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_003">
<mixed-citation publication-type="journal"> <string-name><surname>Chatterjee</surname> <given-names>A</given-names></string-name>, <string-name><surname>Lahiri</surname> <given-names>SN</given-names></string-name> (<year>2011</year>). <article-title>Bootstrapping lasso estimators</article-title>. <source><italic>Journal of the American Statistical Association</italic></source>, <volume>106</volume>(<issue>494</issue>): <fpage>608</fpage>–<lpage>625</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_004">
<mixed-citation publication-type="journal"> <string-name><surname>Chen</surname> <given-names>X</given-names></string-name>, <string-name><surname>Xie</surname> <given-names>MG</given-names></string-name> (<year>2014</year>). <article-title>A split-and-conquer approach for analysis of extraordinarily large data</article-title>. <source><italic>Statistica Sinica</italic></source>, <volume>24</volume>: <fpage>1655</fpage>–<lpage>1684</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_005">
<mixed-citation publication-type="journal"> <string-name><surname>De Bin</surname> <given-names>R</given-names></string-name>, <string-name><surname>Janitza</surname> <given-names>S</given-names></string-name>, <string-name><surname>Sauerbrei</surname> <given-names>W</given-names></string-name>, <string-name><surname>Boulesteix</surname> <given-names>AL</given-names></string-name> (<year>2016</year>). <article-title>Subsampling versus bootstrapping in resampling-based model selection for multivariable regression</article-title>. <source><italic>Biometrics</italic></source>, <volume>72</volume>(<issue>1</issue>): <fpage>272</fpage>–<lpage>280</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_006">
<mixed-citation publication-type="journal"> <string-name><surname>Efron</surname> <given-names>B</given-names></string-name>, <string-name><surname>Hastie</surname> <given-names>T</given-names></string-name>, <string-name><surname>Johnstone</surname> <given-names>I</given-names></string-name>, <string-name><surname>Tibshirani</surname> <given-names>R</given-names></string-name> (<year>2004</year>). <article-title>Least angle regression</article-title>. <source><italic>Annals of Statistics</italic></source>, <volume>32</volume>(<issue>2</issue>): <fpage>407</fpage>–<lpage>499</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_007">
<mixed-citation publication-type="journal"> <string-name><surname>Fan</surname> <given-names>J</given-names></string-name>, <string-name><surname>Li</surname> <given-names>R</given-names></string-name> (<year>2001</year>). <article-title>Variable selection via nonconcave penalized likelihood and its oracle properties</article-title>. <source><italic>Journal of the American Statistical Association</italic></source>, <volume>96</volume>(<issue>456</issue>): <fpage>1348</fpage>–<lpage>1360</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_008">
<mixed-citation publication-type="journal"> <string-name><surname>Fan</surname> <given-names>J</given-names></string-name>, <string-name><surname>Lv</surname> <given-names>J</given-names></string-name> (<year>2008</year>). <article-title>Sure independence screening for ultrahigh dimensional feature space</article-title>. <source><italic>Journal of the Royal Statistical Society: Series B (Statistical Methodology)</italic></source>, <volume>70</volume>(<issue>5</issue>): <fpage>849</fpage>–<lpage>911</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_009">
<mixed-citation publication-type="journal"> <string-name><surname>Fan</surname> <given-names>TH</given-names></string-name>, <string-name><surname>Cheng</surname> <given-names>KF</given-names></string-name> (<year>2007</year>). <article-title>Tests and variables selection on regression analysis for massive datasets</article-title>. <source><italic>Data &amp; Knowledge Engineering</italic></source>, <volume>63</volume>(<issue>3</issue>): <fpage>811</fpage>–<lpage>819</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_010">
<mixed-citation publication-type="journal"> <string-name><surname>Genkin</surname> <given-names>A</given-names></string-name>, <string-name><surname>Lewis</surname> <given-names>DD</given-names></string-name>, <string-name><surname>Madigan</surname> <given-names>D</given-names></string-name> (<year>2007</year>). <article-title>Large-scale Bayesian logistic regression for text categorization</article-title>. <source><italic>Technometrics</italic></source>, <volume>49</volume>(<issue>3</issue>): <fpage>291</fpage>–<lpage>304</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_011">
<mixed-citation publication-type="journal"> <string-name><surname>Hong</surname> <given-names>C</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Cai</surname> <given-names>T</given-names></string-name> (<year>2022</year>). <article-title>A divide-and-conquer method for sparse risk prediction and evaluation</article-title>. <source><italic>Biostatistics</italic></source>, <volume>23</volume>(<issue>2</issue>): <fpage>397</fpage>–<lpage>411</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_012">
<mixed-citation publication-type="journal"> <string-name><surname>Kleiner</surname> <given-names>A</given-names></string-name>, <string-name><surname>Talwalkar</surname> <given-names>A</given-names></string-name>, <string-name><surname>Sarkar</surname> <given-names>P</given-names></string-name>, <string-name><surname>Jordan</surname> <given-names>MI</given-names></string-name> (<year>2014</year>). <article-title>A scalable bootstrap for massive data</article-title>. <source><italic>Journal of the Royal Statistical Society: Series B (Statistical Methodology)</italic></source>, <volume>76</volume>(<issue>4</issue>): <fpage>795</fpage>–<lpage>816</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_013">
<mixed-citation publication-type="journal"> <string-name><surname>Li</surname> <given-names>R</given-names></string-name>, <string-name><surname>Zhong</surname> <given-names>W</given-names></string-name>, <string-name><surname>Zhu</surname> <given-names>L</given-names></string-name> (<year>2012</year>). <article-title>Feature screening via distance correlation learning</article-title>. <source><italic>Journal of the American Statistical Association</italic></source>, <volume>107</volume>(<issue>499</issue>): <fpage>1129</fpage>–<lpage>1139</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_014">
<mixed-citation publication-type="journal"> <string-name><surname>Lin</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Jeon</surname> <given-names>Y</given-names></string-name> (<year>2006</year>). <article-title>Random forests and adaptive nearest neighbors</article-title>. <source><italic>Journal of the American Statistical Association</italic></source>, <volume>101</volume>(<issue>474</issue>): <fpage>578</fpage>–<lpage>590</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_015">
<mixed-citation publication-type="journal"> <string-name><surname>Liu</surname> <given-names>L</given-names></string-name>, <string-name><surname>Gu</surname> <given-names>H</given-names></string-name>, <string-name><surname>Van Limbergen</surname> <given-names>J</given-names></string-name>, <string-name><surname>Kenney</surname> <given-names>T</given-names></string-name> (<year>2021</year>). <article-title>SuRF: A new method for sparse variable selection, with application in microbiome data analysis</article-title>. <source><italic>Statistics in Medicine</italic></source>, <volume>40</volume>(<issue>4</issue>): <fpage>897</fpage>–<lpage>919</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_016">
<mixed-citation publication-type="journal"> <string-name><surname>Meier</surname> <given-names>L</given-names></string-name>, <string-name><surname>Van De Geer</surname> <given-names>S</given-names></string-name>, <string-name><surname>Bühlmann</surname> <given-names>P</given-names></string-name> (<year>2008</year>). <article-title>The group lasso for logistic regression</article-title>. <source><italic>Journal of the Royal Statistical Society: Series B (Statistical Methodology)</italic></source>, <volume>70</volume>(<issue>1</issue>): <fpage>53</fpage>–<lpage>71</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_017">
<mixed-citation publication-type="journal"> <string-name><surname>Meinshausen</surname> <given-names>N</given-names></string-name> (<year>2007</year>). <article-title>Relaxed lasso</article-title>. <source><italic>Computational Statistics &amp; Data Analysis</italic></source>, <volume>52</volume>(<issue>1</issue>): <fpage>374</fpage>–<lpage>393</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_018">
<mixed-citation publication-type="journal"> <string-name><surname>Meinshausen</surname> <given-names>N</given-names></string-name>, <string-name><surname>Bühlmann</surname> <given-names>P</given-names></string-name> (<year>2010</year>). <article-title>Stability selection</article-title>. <source><italic>Journal of the Royal Statistical Society: Series B (Statistical Methodology)</italic></source>, <volume>72</volume>(<issue>4</issue>): <fpage>417</fpage>–<lpage>473</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_019">
<mixed-citation publication-type="journal"> <string-name><surname>Shao</surname> <given-names>J</given-names></string-name> (<year>1996</year>). <article-title>Bootstrap model selection</article-title>. <source><italic>Journal of the American Statistical Association</italic></source>, <volume>91</volume>(<issue>434</issue>): <fpage>655</fpage>–<lpage>665</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_020">
<mixed-citation publication-type="journal"> <string-name><surname>Tang</surname> <given-names>L</given-names></string-name>, <string-name><surname>Zhou</surname> <given-names>L</given-names></string-name>, <string-name><surname>Song</surname> <given-names>PXK</given-names></string-name> (<year>2020</year>). <article-title>Distributed simultaneous inference in generalized linear models via confidence distribution</article-title>. <source><italic>Journal of Multivariate Analysis</italic></source>, <volume>176</volume>: <fpage>104567</fpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_021">
<mixed-citation publication-type="journal"> <string-name><surname>Tibshirani</surname> <given-names>R</given-names></string-name> (<year>1996</year>). <article-title>Regression shrinkage and selection via the lasso</article-title>. <source><italic>Journal of the Royal Statistical Society: Series B (Methodological)</italic></source>, <volume>58</volume>(<issue>1</issue>): <fpage>267</fpage>–<lpage>288</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_022">
<mixed-citation publication-type="journal"> <string-name><surname>Tibshirani</surname> <given-names>RJ</given-names></string-name>, <string-name><surname>Efron</surname> <given-names>B</given-names></string-name> (<year>1993</year>). <article-title>An introduction to the bootstrap</article-title>. <source><italic>Monographs on Statistics and Applied Probability</italic></source>, <volume>57</volume>: <fpage>1</fpage>–<lpage>436</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_023">
<mixed-citation publication-type="journal"> <string-name><surname>Wang</surname> <given-names>K</given-names></string-name>, <string-name><surname>Li</surname> <given-names>S</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>B</given-names></string-name> (<year>2021</year>a). <article-title>Robust communication-efficient distributed composite quantile regression and variable selection for massive data</article-title>. <source><italic>Computational Statistics &amp; Data Analysis</italic></source>, <volume>161</volume>: <fpage>107262</fpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_024">
<mixed-citation publication-type="journal"> <string-name><surname>Wang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Hong</surname> <given-names>C</given-names></string-name>, <string-name><surname>Palmer</surname> <given-names>N</given-names></string-name>, <string-name><surname>Di</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Schwartz</surname> <given-names>J</given-names></string-name>, <string-name><surname>Kohane</surname> <given-names>I</given-names></string-name>, <etal>et al.</etal> (<year>2021</year>b). <article-title>A fast divide-and-conquer sparse Cox regression</article-title>. <source><italic>Biostatistics</italic></source>, <volume>22</volume>(<issue>2</issue>): <fpage>381</fpage>–<lpage>401</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_025">
<mixed-citation publication-type="journal"> <string-name><surname>Wu</surname> <given-names>CFJ</given-names></string-name> (<year>1986</year>). <article-title>Jackknife, bootstrap and other resampling methods in regression analysis</article-title>. <source><italic>Annals of Statistics</italic></source>, <volume>14</volume>(<issue>4</issue>): <fpage>1261</fpage>–<lpage>1295</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_026">
<mixed-citation publication-type="journal"> <string-name><surname>Xie</surname> <given-names>J</given-names></string-name>, <string-name><surname>Lin</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Yan</surname> <given-names>X</given-names></string-name>, <string-name><surname>Tang</surname> <given-names>N</given-names></string-name> (<year>2020</year>). <article-title>Category-adaptive variable screening for ultra-high dimensional heterogeneous categorical data</article-title>. <source><italic>Journal of the American Statistical Association</italic></source>, <volume>115</volume>(<issue>530</issue>): <fpage>747</fpage>–<lpage>760</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_027">
<mixed-citation publication-type="journal"> <string-name><surname>Yao</surname> <given-names>W</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>Q</given-names></string-name> (<year>2013</year>). <article-title>Robust variable selection through MAVE</article-title>. <source><italic>Computational Statistics &amp; Data Analysis</italic></source>, <volume>63</volume>: <fpage>42</fpage>–<lpage>49</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_028">
<mixed-citation publication-type="journal"> <string-name><surname>Yuan</surname> <given-names>M</given-names></string-name>, <string-name><surname>Lin</surname> <given-names>Y</given-names></string-name> (<year>2006</year>). <article-title>Model selection and estimation in regression with grouped variables</article-title>. <source><italic>Journal of the Royal Statistical Society: Series B (Statistical Methodology)</italic></source>, <volume>68</volume>(<issue>1</issue>): <fpage>49</fpage>–<lpage>67</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_029">
<mixed-citation publication-type="journal"> <string-name><surname>Zhang</surname> <given-names>CH</given-names></string-name> (<year>2010</year>). <article-title>Nearly unbiased variable selection under minimax concave penalty</article-title>. <source><italic>Annals of Statistics</italic></source>, <volume>38</volume>(<issue>2</issue>): <fpage>894</fpage>–<lpage>942</lpage>.</mixed-citation>
</ref>
<ref id="j_jds1052_ref_030">
<mixed-citation publication-type="journal"> <string-name><surname>Zou</surname> <given-names>H</given-names></string-name> (<year>2006</year>). <article-title>The adaptive lasso and its oracle properties</article-title>. <source><italic>Journal of the American Statistical Association</italic></source>, <volume>101</volume>(<issue>476</issue>): <fpage>1418</fpage>–<lpage>1429</lpage>.</mixed-citation>
</ref>
</ref-list>
</back>
</article>
