Overview
This vignette demonstrates how to use the {samplezoo} package to generate datasets of varying sizes (small, medium, and large) with variables from multiple probability distributions.
Each dataset contains:
Variables/columns from common distributions such as Normal, Binomial, Poisson, and others.
Adjustable sample sizes to suit your needs: small (100 rows), medium (1,000 rows), or large (10,000 rows).
Generate a small dataset (i.e., 100 rows)
data_small <- samplezoo("small")
head(data_small)
#> norm norm2 norm3 binom neg pois exp unif beta
#> 1 28.99935 56.12786 31.412398 0 0 4 1.9930731 0.85065681 0.4924950
#> 2 53.82976 52.14567 67.209227 0 3 2 0.5548857 0.02154078 0.1720294
#> 3 13.44105 49.43263 38.582851 0 1 2 6.6549962 0.25282988 0.1254429
#> 4 49.91643 52.04459 34.556926 0 2 1 4.9418966 0.65671150 0.2700426
#> 5 59.32329 42.43725 -8.933601 1 0 1 21.7477214 0.78914941 0.4219464
#> 6 67.22617 53.09462 41.309733 1 0 2 3.2594742 0.70884565 0.3508093
#> gamma chisq t_dist
#> 1 9.2714439 0.68455500 1.2037980
#> 2 0.8928559 1.60444217 -0.5534639
#> 3 2.1155766 4.49945206 -1.3502169
#> 4 1.6750690 4.84609857 -0.3750065
#> 5 1.3651080 0.02755675 1.3092035
#> 6 5.3570542 0.28503365 -0.5751219
Generate a medium-sized dataset (i.e., 1,000 rows)
data_medium <- samplezoo("medium")
head(data_medium)
#> norm norm2 norm3 binom neg pois exp unif beta
#> 1 69.96159 59.57401 43.40142 0 3 5 1.274976 0.07047117 0.4863100
#> 2 74.17472 61.05326 34.17646 0 1 1 10.078338 0.04534304 0.2507725
#> 3 51.08033 76.55385 77.31065 1 0 5 25.686386 0.29974158 0.1638371
#> 4 39.33865 73.49664 29.85653 1 3 4 21.198528 0.15200913 0.4993536
#> 5 36.93040 74.82166 -6.81062 0 1 1 20.896009 0.65066112 0.1653648
#> 6 45.38753 69.03912 51.20349 1 0 5 8.138273 0.41492510 0.3245714
#> gamma chisq t_dist
#> 1 4.525778 8.147101 2.6737009
#> 2 4.736469 2.105359 1.0207547
#> 3 2.439770 3.406609 0.6765112
#> 4 2.022789 15.126491 -1.5834662
#> 5 5.084430 5.850564 -1.0412767
#> 6 8.639389 3.345060 1.1964198
Generate a large dataset (i.e., 10,000 rows)
data_large <- samplezoo("large")
head(data_large)
#> norm norm2 norm3 binom neg pois exp unif beta
#> 1 41.60301 52.15786 35.34764 0 2 4 7.281122 0.35964796 0.6594983
#> 2 44.64960 55.93972 33.11297 0 0 6 8.209673 0.65390116 0.4829964
#> 3 27.60196 50.77917 12.82378 1 1 5 14.016232 0.53271583 0.1622005
#> 4 41.91151 64.74327 25.69894 0 0 5 5.183046 0.48296415 0.2178495
#> 5 62.88090 57.76831 33.39194 0 1 3 3.971781 0.08392318 0.4309143
#> 6 34.88823 50.74384 32.91423 0 3 4 11.510243 0.18772983 0.5887536
#> gamma chisq t_dist
#> 1 1.448478 14.106332 -0.48860898
#> 2 11.112424 7.177751 0.71247219
#> 3 1.537635 10.838395 0.68590255
#> 4 1.219717 9.658237 -0.01226877
#> 5 4.489762 9.173560 -0.10279388
#> 6 7.682093 8.276933 0.54396998
Adding Variation or Ensuring Reproducibility with set.seed()
To make your results reproducible, call set.seed() before generating random data: the same seed always produces the same dataset, while a different seed produces a different (but equally reproducible) dataset.
set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#> norm norm2 norm3 binom neg pois exp unif beta
#> 1 41.59287 83.70725 23.274065 0 1 6 6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540 0 0 5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295 0 2 4 0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848 6.643849 0 2 2 8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743 0 0 2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986 6.687576 0 1 4 6.363993 0.1442317 0.35908460
#> gamma chisq t_dist
#> 1 6.9893762 10.286282 -0.3814568
#> 2 5.4087626 6.519658 -2.3409216
#> 3 1.2587867 8.011417 -0.4744159
#> 4 0.9871787 14.780626 0.4292511
#> 5 2.4021943 6.799788 -0.6692669
#> 6 4.2109032 17.858701 -0.3370763
set.seed(456)
data_large <- samplezoo("large")
head(data_large)
#> norm norm2 norm3 binom neg pois exp unif beta
#> 1 29.84718 68.13494 7.9885694 0 0 5 3.4417303 0.8866347 0.05413307
#> 2 59.32663 52.32066 21.2526086 0 3 3 0.8114356 0.7976466 0.07195440
#> 3 62.01312 62.47569 38.4789563 0 2 6 46.8038907 0.6469920 0.22555129
#> 4 29.16661 53.51086 -0.8656269 0 1 5 11.6955326 0.2036753 0.71455809
#> 5 39.28465 47.19406 47.7819258 1 1 1 0.3535625 0.3653401 0.34619912
#> 6 45.13908 63.33566 53.3620528 1 1 2 4.5592136 0.7628573 0.25880522
#> gamma chisq t_dist
#> 1 6.7914120 4.464348 -1.0150596
#> 2 3.0132520 8.062120 0.3262369
#> 3 4.7360954 10.969593 1.5141157
#> 4 5.1235878 6.249247 0.6432708
#> 5 6.6851637 4.358815 0.2025742
#> 6 0.3903841 20.019575 1.6257109