Skip to contents

Reading a Data File Larger than Memory for Fitting GLMs Using the big.drglm Function

Usage

make.data(filename, chunksize, ...)

Arguments

filename

Path to the data set on disk.

chunksize

Number of rows in each chunk (subset) to be read from the large file for fitting GLMs.

...

Additional arguments to be passed to read.csv.

Value

A function that reads chunks of the data set.

Examples

# Build a reproducible toy data set
set.seed(123)
# Number of observations to simulate
n <- 10000

# Assemble the data set one column at a time; the columns are drawn in the
# order Var_1, ..., Var_6 so that the seeded values are reproducible
dataset <- data.frame(Var_1 = round(rnorm(n, mean = 50, sd = 10)))
dataset$Var_2 <- round(rnorm(n, mean = 7.5, sd = 2.1))
dataset$Var_3 <- as.factor(sample(c("0", "1"), size = n, replace = TRUE))
dataset$Var_4 <- as.factor(sample(c("0", "1", "2"), size = n, replace = TRUE))
dataset$Var_5 <- as.factor(sample(0:15, size = n, replace = TRUE))
dataset$Var_6 <- round(rnorm(n, mean = 60, sd = 5))

# Write the data set to a temporary CSV file, leaving out the row names
temp_file <- tempfile(fileext = ".csv")
write.csv(dataset, file = temp_file, row.names = FALSE)

# Keep the location of the CSV file for the model-fitting steps
dataset_path <- temp_file
dataset_path  # Display the path to the temporary file
#> [1] "/tmp/RtmpAw2DLw/file17d5fe8f63f.csv"

# Create the chunk-reading function for the CSV file, with a chunk size of 1000
da <- drglm::make.data(filename = dataset_path, chunksize = 1000)

# Fit a Gaussian GLM (multiple linear regression) with Var_1 as the response
nmodel <- drglm::big.drglm(
  da,
  formula = Var_1 ~ Var_2 + factor(Var_3) + factor(Var_4) +
    factor(Var_5) + Var_6,
  10,
  family = "gaussian"
)
# Show the table of results
print(nmodel)
#>                    Estimate standard error    t value   Pr(>|t|)
#> (Intercept)     51.72130615     1.32114969 39.1487102 0.00000000
#> Var_2            0.02094802     0.04748735  0.4411285 0.65911997
#> factor(Var_3)1  -0.13949603     0.20171843 -0.6915384 0.48922728
#> factor(Var_4)1   0.38350656     0.24833980  1.5442815 0.12252015
#> factor(Var_4)2   0.23785108     0.24752089  0.9609334 0.33658568
#> factor(Var_5)1  -1.06696639     0.56657382 -1.8831904 0.05967457
#> factor(Var_5)2  -0.80267657     0.56004238 -1.4332426 0.15178853
#> factor(Var_5)3  -0.64240893     0.56243644 -1.1421894 0.25337531
#> factor(Var_5)4  -0.87049071     0.56948141 -1.5285674 0.12637173
#> factor(Var_5)5  -0.51662926     0.56337343 -0.9170281 0.35912793
#> factor(Var_5)6  -0.51405571     0.56179393 -0.9150254 0.36017830
#> factor(Var_5)7  -0.68371489     0.56680847 -1.2062538 0.22771963
#> factor(Var_5)8  -0.83233284     0.56987357 -1.4605570 0.14413705
#> factor(Var_5)9  -0.76583552     0.56309505 -1.3600466 0.17381517
#> factor(Var_5)10 -0.69443427     0.56813346 -1.2223083 0.22159105
#> factor(Var_5)11 -0.75598173     0.55912331 -1.3520841 0.17634842
#> factor(Var_5)12 -1.32332553     0.56884076 -2.3263550 0.01999962
#> factor(Var_5)13 -0.76349854     0.56265917 -1.3569468 0.17479812
#> factor(Var_5)14 -0.60991931     0.57137187 -1.0674647 0.28576204
#> factor(Var_5)15  0.14287426     0.57115597  0.2501493 0.80247190
#> Var_6           -0.02291395     0.02004498 -1.1431264 0.25298613
#>                            95% CI
#> (Intercept)     [ 49.13 , 54.31 ]
#> Var_2            [ -0.07 , 0.11 ]
#> factor(Var_3)1   [ -0.53 , 0.26 ]
#> factor(Var_4)1    [ -0.1 , 0.87 ]
#> factor(Var_4)2   [ -0.25 , 0.72 ]
#> factor(Var_5)1   [ -2.18 , 0.04 ]
#> factor(Var_5)2    [ -1.9 , 0.29 ]
#> factor(Var_5)3   [ -1.74 , 0.46 ]
#> factor(Var_5)4   [ -1.99 , 0.25 ]
#> factor(Var_5)5   [ -1.62 , 0.59 ]
#> factor(Var_5)6   [ -1.62 , 0.59 ]
#> factor(Var_5)7   [ -1.79 , 0.43 ]
#> factor(Var_5)8   [ -1.95 , 0.28 ]
#> factor(Var_5)9   [ -1.87 , 0.34 ]
#> factor(Var_5)10  [ -1.81 , 0.42 ]
#> factor(Var_5)11  [ -1.85 , 0.34 ]
#> factor(Var_5)12 [ -2.44 , -0.21 ]
#> factor(Var_5)13  [ -1.87 , 0.34 ]
#> factor(Var_5)14  [ -1.73 , 0.51 ]
#> factor(Var_5)15  [ -0.98 , 1.26 ]
#> Var_6            [ -0.06 , 0.02 ]

# Fit a logistic regression model (binomial GLM) with Var_3 as the response
bmodel <- drglm::big.drglm(
  da,
  formula = factor(Var_3) ~ Var_1 + Var_2 + factor(Var_4) +
    factor(Var_5) + Var_6,
  10,
  family = "binomial"
)
# Show the table of results
print(bmodel)
#>                      Estimate Odds Ratio standard error     z value   Pr(>|z|)
#> (Intercept)      0.0952195429  1.0999003    0.286765015  0.33204728 0.73985356
#> Var_1           -0.0013836918  0.9986173    0.002045315 -0.67651761 0.49871207
#> Var_2           -0.0004142688  0.9995858    0.009605457 -0.04312848 0.96559912
#> factor(Var_4)1   0.0184132664  1.0185838    0.050213834  0.36669708 0.71384499
#> factor(Var_4)2   0.0863909757  1.0902325    0.050044304  1.72628990 0.08429527
#> factor(Var_5)1  -0.1066254801  0.8988623    0.114997732 -0.92719637 0.35382459
#> factor(Var_5)2  -0.0591881914  0.9425294    0.113290817 -0.52244474 0.60136071
#> factor(Var_5)3  -0.0807833291  0.9223935    0.113929295 -0.70906547 0.47828385
#> factor(Var_5)4   0.0229787647  1.0232448    0.115269581  0.19934804 0.84199051
#> factor(Var_5)5  -0.0057667632  0.9942498    0.113913596 -0.05062401 0.95962513
#> factor(Var_5)6   0.0254936407  1.0258214    0.113816898  0.22398819 0.82276649
#> factor(Var_5)7   0.0233435801  1.0236182    0.114746435  0.20343621 0.83879410
#> factor(Var_5)8  -0.0092262931  0.9908161    0.115149070 -0.08012477 0.93613802
#> factor(Var_5)9  -0.1390418914  0.8701916    0.114051937 -1.21911030 0.22280233
#> factor(Var_5)10  0.0532619633  1.0547059    0.114808460  0.46392020 0.64270492
#> factor(Var_5)11 -0.0815427288  0.9216933    0.113599237 -0.71781053 0.47287412
#> factor(Var_5)12  0.0934829685  1.0979919    0.114770538  0.81452061 0.41534677
#> factor(Var_5)13  0.0508340238  1.0521482    0.114111051  0.44547853 0.65597397
#> factor(Var_5)14 -0.0722004220  0.9303444    0.115422244 -0.62553300 0.53162130
#> factor(Var_5)15  0.1086928534  1.1148199    0.115575343  0.94045019 0.34698669
#> Var_6           -0.0008249102  0.9991754    0.004057493 -0.20330540 0.83889633
#>                           95% CI
#> (Intercept)     [ -0.47 , 0.66 ]
#> Var_1              [ -0.01 , 0 ]
#> Var_2           [ -0.02 , 0.02 ]
#> factor(Var_4)1  [ -0.08 , 0.12 ]
#> factor(Var_4)2  [ -0.01 , 0.18 ]
#> factor(Var_5)1  [ -0.33 , 0.12 ]
#> factor(Var_5)2  [ -0.28 , 0.16 ]
#> factor(Var_5)3   [ -0.3 , 0.14 ]
#> factor(Var_5)4   [ -0.2 , 0.25 ]
#> factor(Var_5)5  [ -0.23 , 0.22 ]
#> factor(Var_5)6   [ -0.2 , 0.25 ]
#> factor(Var_5)7   [ -0.2 , 0.25 ]
#> factor(Var_5)8  [ -0.23 , 0.22 ]
#> factor(Var_5)9  [ -0.36 , 0.08 ]
#> factor(Var_5)10 [ -0.17 , 0.28 ]
#> factor(Var_5)11  [ -0.3 , 0.14 ]
#> factor(Var_5)12 [ -0.13 , 0.32 ]
#> factor(Var_5)13 [ -0.17 , 0.27 ]
#> factor(Var_5)14  [ -0.3 , 0.15 ]
#> factor(Var_5)15 [ -0.12 , 0.34 ]
#> Var_6           [ -0.01 , 0.01 ]

# Fit a Poisson regression model (Poisson GLM) with Var_5 as the response
pmodel <- drglm::big.drglm(
  da,
  formula = Var_5 ~ Var_1 + Var_2 + factor(Var_3) + factor(Var_4) + Var_6,
  10,
  family = "poisson"
)
# Show the table of results
print(pmodel)
#>                     Estimate Odds Ratio standard error       z value   Pr(>|z|)
#> (Intercept)     1.889610e+00  6.6167865              1  1.889610e+00 0.05881016
#> Var_1           1.209546e-05  1.0000121              1  1.209546e-05 0.99999035
#> Var_2           2.141166e-03  1.0021435              1  2.141166e-03 0.99829160
#> factor(Var_3)1  1.545550e-02  1.0155756              1  1.545550e-02 0.98766878
#> factor(Var_4)1 -1.155294e-02  0.9885135              1 -1.155294e-02 0.99078229
#> factor(Var_4)2 -5.857627e-03  0.9941595              1 -5.857627e-03 0.99532632
#> Var_6           1.656243e-03  1.0016576              1  1.656243e-03 0.99867851
#>                          95% CI
#> (Intercept)    [ -0.07 , 3.85 ]
#> Var_1          [ -1.96 , 1.96 ]
#> Var_2          [ -1.96 , 1.96 ]
#> factor(Var_3)1 [ -1.94 , 1.98 ]
#> factor(Var_4)1 [ -1.97 , 1.95 ]
#> factor(Var_4)2 [ -1.97 , 1.95 ]
#> Var_6          [ -1.96 , 1.96 ]