Skip to contents

Overview

This vignette demonstrates how to use the {samplezoo} package to generate datasets of varying sizes (small, medium, and large) with variables from multiple probability distributions.

Each dataset contains:

  • Variables/columns from common distributions such as Normal, Binomial, Poisson, and others.

  • Adjustable sample sizes (small, medium, or large) to suit your needs.

Generate a small dataset (i.e., 100 rows)

data_small <- samplezoo("small")
head(data_small)
#>       norm   norm_2    norm_3 bern neg pois        exp       unif      beta
#> 1 28.99935 56.12786 31.412398    0   0    4  1.9930731 0.85065681 0.4924950
#> 2 53.82976 52.14567 67.209227    1   3    2  0.5548857 0.02154078 0.1720294
#> 3 13.44105 49.43263 38.582851    0   1    2  6.6549962 0.25282988 0.1254429
#> 4 49.91643 52.04459 34.556926    1   2    1  4.9418966 0.65671150 0.2700426
#> 5 59.32329 42.43725 -8.933601    1   0    1 21.7477214 0.78914941 0.4219464
#> 6 67.22617 53.09462 41.309733    1   0    2  3.2594742 0.70884565 0.3508093
#>       gamma    chi_sq     t_dist    f_dist
#> 1 9.2714439 10.728394  0.1289406 1.1673265
#> 2 0.8928559  4.106432 -1.4075306 0.9905905
#> 3 2.1155766  5.931797 -0.2962882 1.2695938
#> 4 1.6750690 14.975020 -0.6428353 1.5489007
#> 5 1.3651080 17.334911  0.4973108 1.4109224
#> 6 5.3570542  8.240642 -0.6416164 0.6939188

Generate a medium-sized dataset (i.e., 1,000 rows)

data_medium <- samplezoo("medium")
head(data_medium)
#>       norm   norm_2    norm_3 bern neg pois       exp      unif      beta
#> 1 41.37589 63.35978 20.841942    1   1    8 12.233794 0.9370065 0.6279503
#> 2 45.59161 57.52457 79.149264    1   0    2 17.107506 0.1403052 0.2132610
#> 3 45.64013 66.42630 38.448860    0   3    4  2.050155 0.1616998 0.1254138
#> 4 53.47419 48.34354 23.529545    1   2    1  7.479380 0.9222718 0.3970546
#> 5 48.99192 43.48127 -4.924535    1   0    4  1.354146 0.7623292 0.1610967
#> 6 68.66007 71.72267 55.422912    1   0    3  9.393652 0.7731280 0.4707573
#>      gamma    chi_sq     t_dist    f_dist
#> 1 2.322830  5.625974  0.8357163 1.7072051
#> 2 5.561163 10.573718 -0.8650410 1.5643331
#> 3 3.936257 18.122312 -0.2017988 2.0912690
#> 4 1.030830  8.582076  0.8066501 1.0403154
#> 5 3.535528  5.229105  1.0143382 0.9507692
#> 6 2.313653 18.557863  0.6462972 1.1554207

Generate a large-sized dataset (i.e., 10,000 rows)

data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2   norm_3 bern neg pois        exp       unif       beta
#> 1 57.03881 49.06911 56.02105    0   1    4  1.7918004 0.25833690 0.18539389
#> 2 29.08872 59.49917 32.78833    1   1    0  4.2511026 0.04949048 0.22955313
#> 3 42.02363 59.29340 27.18639    0   1    0 24.8728112 0.58669929 0.05307951
#> 4 40.33139 72.20071 66.49272    0   1    4  3.7056449 0.74832800 0.28652734
#> 5 64.31326 60.11354 36.46774    0   3    3  0.7327576 0.25238871 0.39573182
#> 6 50.97774 58.62958 37.41237    1   0    2  4.2871728 0.49144146 0.25286105
#>      gamma    chi_sq     t_dist    f_dist
#> 1 3.045566 13.005333  2.3342144 0.5333934
#> 2 7.836300  5.656557  1.6249767 0.9854967
#> 3 5.835159  6.670267 -0.3537878 0.6946501
#> 4 1.362646 13.355680  0.4125888 2.1936965
#> 5 1.840758 11.375872 -0.6604255 1.3206411
#> 6 5.255833  2.564878 -0.6021285 1.7672593

Adding Variation or Ensuring Reproducibility with set.seed()

Call set.seed() before generating random data to control the results: reusing the same seed reproduces exactly the same dataset, while changing the seed produces a different (but still reproducible) dataset.

Reproducibility

set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2    norm_3 bern neg pois       exp      unif       beta
#> 1 41.59287 83.70725 23.274065    0   1    6  6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540    1   0    5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295    1   2    4  0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848  6.643849    0   2    2  8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743    0   0    2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986  6.687576    0   1    4  6.363993 0.1442317 0.35908460
#>       gamma    chi_sq     t_dist    f_dist
#> 1 6.9893762 10.286282 -0.3814568 0.7264343
#> 2 5.4087626  6.519658 -2.3409216 0.9698166
#> 3 1.2587867  8.011417 -0.4744159 0.4329175
#> 4 0.9871787 14.780626  0.4292511 1.0227474
#> 5 2.4021943  6.799788 -0.6692669 2.7446729
#> 6 4.2109032 17.858701 -0.3370763 1.3993853
set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2    norm_3 bern neg pois       exp      unif       beta
#> 1 41.59287 83.70725 23.274065    0   1    6  6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540    1   0    5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295    1   2    4  0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848  6.643849    0   2    2  8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743    0   0    2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986  6.687576    0   1    4  6.363993 0.1442317 0.35908460
#>       gamma    chi_sq     t_dist    f_dist
#> 1 6.9893762 10.286282 -0.3814568 0.7264343
#> 2 5.4087626  6.519658 -2.3409216 0.9698166
#> 3 1.2587867  8.011417 -0.4744159 0.4329175
#> 4 0.9871787 14.780626  0.4292511 1.0227474
#> 5 2.4021943  6.799788 -0.6692669 2.7446729
#> 6 4.2109032 17.858701 -0.3370763 1.3993853

Variation

set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2    norm_3 bern neg pois       exp      unif       beta
#> 1 41.59287 83.70725 23.274065    0   1    6  6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540    1   0    5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295    1   2    4  0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848  6.643849    0   2    2  8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743    0   0    2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986  6.687576    0   1    4  6.363993 0.1442317 0.35908460
#>       gamma    chi_sq     t_dist    f_dist
#> 1 6.9893762 10.286282 -0.3814568 0.7264343
#> 2 5.4087626  6.519658 -2.3409216 0.9698166
#> 3 1.2587867  8.011417 -0.4744159 0.4329175
#> 4 0.9871787 14.780626  0.4292511 1.0227474
#> 5 2.4021943  6.799788 -0.6692669 2.7446729
#> 6 4.2109032 17.858701 -0.3370763 1.3993853
set.seed(456)
data_large <- samplezoo("large")
head(data_large)
#>       norm   norm_2     norm_3 bern neg pois        exp      unif       beta
#> 1 29.84718 68.13494  7.9885694    0   0    5  3.4417303 0.8866347 0.05413307
#> 2 59.32663 52.32066 21.2526086    0   3    3  0.8114356 0.7976466 0.07195440
#> 3 62.01312 62.47569 38.4789563    0   2    6 46.8038907 0.6469920 0.22555129
#> 4 29.16661 53.51086 -0.8656269    0   1    5 11.6955326 0.2036753 0.71455809
#> 5 39.28465 47.19406 47.7819258    1   1    1  0.3535625 0.3653401 0.34619912
#> 6 45.13908 63.33566 53.3620528    1   1    2  4.5592136 0.7628573 0.25880522
#>       gamma    chi_sq     t_dist    f_dist
#> 1 6.7914120  4.464348 -1.0150596 2.2557295
#> 2 3.0132520  8.062120  0.3262369 1.4955877
#> 3 4.7360954 10.969593  1.5141157 1.0766901
#> 4 5.1235878  6.249247  0.6432708 1.1251542
#> 5 6.6851637  4.358815  0.2025742 0.4754946
#> 6 0.3903841 20.019575  1.6257109 0.6653886