Reading Data File Larger than Memory for Fitting GLMs Using big.drglm
Function
make.data.Rd
Reading Data File Larger than Memory for Fitting GLMs Using big.drglm
Function
Examples
# ---- Simulate a toy data set and write it to disk ----
# Fix the RNG seed so the simulated values are reproducible.
set.seed(123)

# Number of observations (rows) to simulate.
n <- 1e4

# Six columns, drawn in the order listed (the order fixes the RNG stream):
#   Var_1, Var_2, Var_6 - normal draws rounded to whole numbers
#   Var_3, Var_4, Var_5 - factors with 2, 3 and 16 levels respectively
dataset <- data.frame(
  Var_1 = round(rnorm(n, 50, 10)),
  Var_2 = round(rnorm(n, 7.5, 2.1)),
  Var_3 = factor(sample(c("0", "1"), size = n, replace = TRUE)),
  Var_4 = factor(sample(c("0", "1", "2"), size = n, replace = TRUE)),
  Var_5 = factor(sample(0:15, size = n, replace = TRUE)),
  Var_6 = round(rnorm(n, 60, 5))
)

# Export the data as CSV (without row names) to a temporary file, so that the
# rest of the example can read it back from disk.
temp_file <- tempfile(fileext = ".csv")
write.csv(dataset, temp_file, row.names = FALSE)

# Keep the file location under the name used by the rest of the example,
# then show it.
dataset_path <- temp_file
dataset_path
#> [1] "/tmp/RtmpAw2DLw/file17d5fe8f63f.csv"
# Set up the data source for the model fits: make.data() is given the CSV path
# and a chunk size of 1000, and its result `da` is what every big.drglm() call
# below receives as its first argument.
# NOTE(review): presumably `chunksize` is the number of rows read per chunk, so
# the 10,000-row file would span 10 chunks - confirm against the drglm docs.
da <- drglm::make.data(dataset_path, chunksize = 1000)
# Fit a multiple linear regression (Gaussian family) with Var_1 as the
# response, reading the data through `da`.
# Var_3, Var_4 and Var_5 are wrapped in factor() so they enter the model as
# categorical predictors (one coefficient per non-reference level).
# The number of chunks is passed by name: a bare positional `10` placed between
# two named arguments is easy to misread and silently depends on argument order.
# NOTE(review): 10 chunks presumably corresponds to 10,000 rows read 1,000 at a
# time by `da` - confirm.
nmodel <- drglm::big.drglm(
  da,
  formula = Var_1 ~ Var_2 + factor(Var_3) + factor(Var_4) +
    factor(Var_5) + Var_6,
  chunks = 10,
  family = "gaussian"
)
# Print the results table of the fitted model.
print(nmodel)
#> Estimate standard error t value Pr(>|t|)
#> (Intercept) 51.72130615 1.32114969 39.1487102 0.00000000
#> Var_2 0.02094802 0.04748735 0.4411285 0.65911997
#> factor(Var_3)1 -0.13949603 0.20171843 -0.6915384 0.48922728
#> factor(Var_4)1 0.38350656 0.24833980 1.5442815 0.12252015
#> factor(Var_4)2 0.23785108 0.24752089 0.9609334 0.33658568
#> factor(Var_5)1 -1.06696639 0.56657382 -1.8831904 0.05967457
#> factor(Var_5)2 -0.80267657 0.56004238 -1.4332426 0.15178853
#> factor(Var_5)3 -0.64240893 0.56243644 -1.1421894 0.25337531
#> factor(Var_5)4 -0.87049071 0.56948141 -1.5285674 0.12637173
#> factor(Var_5)5 -0.51662926 0.56337343 -0.9170281 0.35912793
#> factor(Var_5)6 -0.51405571 0.56179393 -0.9150254 0.36017830
#> factor(Var_5)7 -0.68371489 0.56680847 -1.2062538 0.22771963
#> factor(Var_5)8 -0.83233284 0.56987357 -1.4605570 0.14413705
#> factor(Var_5)9 -0.76583552 0.56309505 -1.3600466 0.17381517
#> factor(Var_5)10 -0.69443427 0.56813346 -1.2223083 0.22159105
#> factor(Var_5)11 -0.75598173 0.55912331 -1.3520841 0.17634842
#> factor(Var_5)12 -1.32332553 0.56884076 -2.3263550 0.01999962
#> factor(Var_5)13 -0.76349854 0.56265917 -1.3569468 0.17479812
#> factor(Var_5)14 -0.60991931 0.57137187 -1.0674647 0.28576204
#> factor(Var_5)15 0.14287426 0.57115597 0.2501493 0.80247190
#> Var_6 -0.02291395 0.02004498 -1.1431264 0.25298613
#> 95% CI
#> (Intercept) [ 49.13 , 54.31 ]
#> Var_2 [ -0.07 , 0.11 ]
#> factor(Var_3)1 [ -0.53 , 0.26 ]
#> factor(Var_4)1 [ -0.1 , 0.87 ]
#> factor(Var_4)2 [ -0.25 , 0.72 ]
#> factor(Var_5)1 [ -2.18 , 0.04 ]
#> factor(Var_5)2 [ -1.9 , 0.29 ]
#> factor(Var_5)3 [ -1.74 , 0.46 ]
#> factor(Var_5)4 [ -1.99 , 0.25 ]
#> factor(Var_5)5 [ -1.62 , 0.59 ]
#> factor(Var_5)6 [ -1.62 , 0.59 ]
#> factor(Var_5)7 [ -1.79 , 0.43 ]
#> factor(Var_5)8 [ -1.95 , 0.28 ]
#> factor(Var_5)9 [ -1.87 , 0.34 ]
#> factor(Var_5)10 [ -1.81 , 0.42 ]
#> factor(Var_5)11 [ -1.85 , 0.34 ]
#> factor(Var_5)12 [ -2.44 , -0.21 ]
#> factor(Var_5)13 [ -1.87 , 0.34 ]
#> factor(Var_5)14 [ -1.73 , 0.51 ]
#> factor(Var_5)15 [ -0.98 , 1.26 ]
#> Var_6 [ -0.06 , 0.02 ]
# Fit a logistic regression (binomial family) with the two-level variable
# Var_3, converted via factor(), as the response, reading the data through `da`.
# Var_4 and Var_5 are wrapped in factor() so they enter the model as
# categorical predictors.
# The number of chunks is passed by name: a bare positional `10` placed between
# two named arguments is easy to misread and silently depends on argument order.
# NOTE(review): 10 chunks presumably corresponds to 10,000 rows read 1,000 at a
# time by `da` - confirm.
bmodel <- drglm::big.drglm(
  da,
  formula = factor(Var_3) ~ Var_1 + Var_2 + factor(Var_4) +
    factor(Var_5) + Var_6,
  chunks = 10,
  family = "binomial"
)
# Print the results table of the fitted model.
print(bmodel)
#> Estimate Odds Ratio standard error z value Pr(>|z|)
#> (Intercept) 0.0952195429 1.0999003 0.286765015 0.33204728 0.73985356
#> Var_1 -0.0013836918 0.9986173 0.002045315 -0.67651761 0.49871207
#> Var_2 -0.0004142688 0.9995858 0.009605457 -0.04312848 0.96559912
#> factor(Var_4)1 0.0184132664 1.0185838 0.050213834 0.36669708 0.71384499
#> factor(Var_4)2 0.0863909757 1.0902325 0.050044304 1.72628990 0.08429527
#> factor(Var_5)1 -0.1066254801 0.8988623 0.114997732 -0.92719637 0.35382459
#> factor(Var_5)2 -0.0591881914 0.9425294 0.113290817 -0.52244474 0.60136071
#> factor(Var_5)3 -0.0807833291 0.9223935 0.113929295 -0.70906547 0.47828385
#> factor(Var_5)4 0.0229787647 1.0232448 0.115269581 0.19934804 0.84199051
#> factor(Var_5)5 -0.0057667632 0.9942498 0.113913596 -0.05062401 0.95962513
#> factor(Var_5)6 0.0254936407 1.0258214 0.113816898 0.22398819 0.82276649
#> factor(Var_5)7 0.0233435801 1.0236182 0.114746435 0.20343621 0.83879410
#> factor(Var_5)8 -0.0092262931 0.9908161 0.115149070 -0.08012477 0.93613802
#> factor(Var_5)9 -0.1390418914 0.8701916 0.114051937 -1.21911030 0.22280233
#> factor(Var_5)10 0.0532619633 1.0547059 0.114808460 0.46392020 0.64270492
#> factor(Var_5)11 -0.0815427288 0.9216933 0.113599237 -0.71781053 0.47287412
#> factor(Var_5)12 0.0934829685 1.0979919 0.114770538 0.81452061 0.41534677
#> factor(Var_5)13 0.0508340238 1.0521482 0.114111051 0.44547853 0.65597397
#> factor(Var_5)14 -0.0722004220 0.9303444 0.115422244 -0.62553300 0.53162130
#> factor(Var_5)15 0.1086928534 1.1148199 0.115575343 0.94045019 0.34698669
#> Var_6 -0.0008249102 0.9991754 0.004057493 -0.20330540 0.83889633
#> 95% CI
#> (Intercept) [ -0.47 , 0.66 ]
#> Var_1 [ -0.01 , 0 ]
#> Var_2 [ -0.02 , 0.02 ]
#> factor(Var_4)1 [ -0.08 , 0.12 ]
#> factor(Var_4)2 [ -0.01 , 0.18 ]
#> factor(Var_5)1 [ -0.33 , 0.12 ]
#> factor(Var_5)2 [ -0.28 , 0.16 ]
#> factor(Var_5)3 [ -0.3 , 0.14 ]
#> factor(Var_5)4 [ -0.2 , 0.25 ]
#> factor(Var_5)5 [ -0.23 , 0.22 ]
#> factor(Var_5)6 [ -0.2 , 0.25 ]
#> factor(Var_5)7 [ -0.2 , 0.25 ]
#> factor(Var_5)8 [ -0.23 , 0.22 ]
#> factor(Var_5)9 [ -0.36 , 0.08 ]
#> factor(Var_5)10 [ -0.17 , 0.28 ]
#> factor(Var_5)11 [ -0.3 , 0.14 ]
#> factor(Var_5)12 [ -0.13 , 0.32 ]
#> factor(Var_5)13 [ -0.17 , 0.27 ]
#> factor(Var_5)14 [ -0.3 , 0.15 ]
#> factor(Var_5)15 [ -0.12 , 0.34 ]
#> Var_6 [ -0.01 , 0.01 ]
# Fit a Poisson regression with Var_5 (used here without factor(), i.e. as a
# count-like response) on the remaining variables, reading the data through
# `da`. Var_3 and Var_4 are wrapped in factor() so they enter the model as
# categorical predictors.
# The number of chunks is passed by name: a bare positional `10` placed between
# two named arguments is easy to misread and silently depends on argument order.
# NOTE(review): in the recorded output below every standard error is exactly 1
# and the ratio column is labelled "Odds Ratio"; for a Poisson fit both look
# suspicious - compare against a reference glm() fit before relying on them.
pmodel <- drglm::big.drglm(
  da,
  formula = Var_5 ~ Var_1 + Var_2 + factor(Var_3) + factor(Var_4) + Var_6,
  chunks = 10,
  family = "poisson"
)
# Print the results table of the fitted model.
print(pmodel)
#> Estimate Odds Ratio standard error z value Pr(>|z|)
#> (Intercept) 1.889610e+00 6.6167865 1 1.889610e+00 0.05881016
#> Var_1 1.209546e-05 1.0000121 1 1.209546e-05 0.99999035
#> Var_2 2.141166e-03 1.0021435 1 2.141166e-03 0.99829160
#> factor(Var_3)1 1.545550e-02 1.0155756 1 1.545550e-02 0.98766878
#> factor(Var_4)1 -1.155294e-02 0.9885135 1 -1.155294e-02 0.99078229
#> factor(Var_4)2 -5.857627e-03 0.9941595 1 -5.857627e-03 0.99532632
#> Var_6 1.656243e-03 1.0016576 1 1.656243e-03 0.99867851
#> 95% CI
#> (Intercept) [ -0.07 , 3.85 ]
#> Var_1 [ -1.96 , 1.96 ]
#> Var_2 [ -1.96 , 1.96 ]
#> factor(Var_3)1 [ -1.94 , 1.98 ]
#> factor(Var_4)1 [ -1.97 , 1.95 ]
#> factor(Var_4)2 [ -1.97 , 1.95 ]
#> Var_6 [ -1.96 , 1.96 ]