Generating Small, Medium, and Large Datasets

Overview

This vignette demonstrates how to use the {samplezoo} package to generate datasets of varying sizes (small, medium, and large) with random variables from multiple probability distributions.

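Each dataset contains one column per simulated variable:

- norm, norm2, norm3: normal distributions
- binom: binomial distribution
- neg: negative binomial distribution
- pois: Poisson distribution
- exp: exponential distribution
- unif: uniform distribution
- beta: beta distribution
- gamma: gamma distribution
- chisq: chi-squared distribution
- t_dist: Student's t-distribution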

library(samplezoo)

Generate a small dataset (i.e., 100 rows)

data_small <- samplezoo("small")
head(data_small)
#>       norm    norm2    norm3 binom neg pois        exp      unif       beta
#> 1 65.29742 67.20541 39.73562     0   5    2 10.3332425 0.7231855 0.27041914
#> 2 30.61407 48.31246 30.86564     0   3    2  1.7652295 0.1450918 0.02491969
#> 3 44.94947 67.29217 38.69375     0   0    4 10.6108062 0.4505904 0.05951487
#> 4 29.59907 44.24359 67.84280     0   0    2  0.5679928 0.8819497 0.60979494
#> 5 43.41407 56.94686 10.84323     1   1    2  2.2938456 0.1684621 0.49810648
#> 6 34.97345 69.90718 33.96964     0   0    0 27.0255575 0.7412955 0.68493214
#>       gamma     chisq     t_dist
#> 1 0.9977802 2.1836605 -0.8853693
#> 2 3.3005588 0.9465372  0.7570046
#> 3 0.7583037 1.3795962 -0.5036858
#> 4 0.9311511 0.4779120 -1.0671415
#> 5 3.0245772 5.3672453 -1.1618699
#> 6 4.2327450 4.3024497 -0.4104507

Generate a medium-sized dataset (i.e., 1,000 rows)

data_medium <- samplezoo("medium")
head(data_medium)
#>       norm    norm2     norm3 binom neg pois       exp      unif       beta
#> 1 57.06740 64.11136  3.467672     0   2    1  3.686053 0.3375031 0.14378442
#> 2 47.31645 52.63125 41.940552     0   1    2  8.581828 0.2236899 0.11224683
#> 3 55.08562 66.59396 30.838302     0   0    1  4.933613 0.9181436 0.39128257
#> 4 41.87706 72.75178 65.439129     0   1    0 17.925673 0.6346326 0.29691502
#> 5 65.32016 70.22718 47.284516     1   2    3 47.631865 0.7300565 0.05317965
#> 6 46.27007 67.93487 56.690884     0   0    3  1.982736 0.1547931 0.25384270
#>      gamma     chisq      t_dist
#> 1 3.331139  2.985590 -0.67784531
#> 2 4.227148  4.953648  0.05318012
#> 3 5.768409 12.689136  0.87468194
#> 4 1.144292  8.226658  0.64553503
#> 5 5.610336  7.530971  3.24241463
#> 6 5.249032  5.406572  0.34418083

Generate a large dataset (i.e., 10,000 rows)

data_large <- samplezoo("large")
head(data_large)
#>       norm    norm2    norm3 binom neg pois        exp       unif      beta
#> 1 24.60660 72.14700 19.24886     0   5    5  6.4277008 0.84537939 0.3853909
#> 2 43.87874 61.09530 41.63585     0   1    1  0.5646119 0.93715738 0.2039308
#> 3 37.14783 67.77147 37.70517     0   3    2  1.7067526 0.98601103 0.6175510
#> 4 50.42442 69.47083 49.75178     0   0    4 41.4743139 0.22641873 0.2663413
#> 5 59.50994 44.44783 36.45096     0   6    3  0.5456206 0.06758606 0.1825875
#> 6 57.72893 55.34474 21.66427     1   0    3  1.0757587 0.05526381 0.1545085
#>      gamma     chisq     t_dist
#> 1 1.289751 14.224862 -0.7209851
#> 2 3.437733  6.544303  0.4948181
#> 3 1.173168 10.388673  0.2897624
#> 4 3.818564  8.051823 -0.3072181
#> 5 3.348005 15.916600  1.8211526
#> 6 2.625976 10.669652 -0.8691618

Adding Variation or Ensuring Reproducibility with set.seed()

Call set.seed() before generating random data. Reusing the same seed reproduces exactly the same dataset, while changing the seed (for example, from 123 to 456) produces a different dataset that is still reproducible.

set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm    norm2     norm3 binom neg pois       exp      unif       beta
#> 1 41.59287 83.70725 23.274065     0   1    6  6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540     0   0    5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295     0   2    4  0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848  6.643849     0   2    2  8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743     0   0    2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986  6.687576     0   1    4  6.363993 0.1442317 0.35908460
#>       gamma     chisq     t_dist
#> 1 6.9893762 10.286282 -0.3814568
#> 2 5.4087626  6.519658 -2.3409216
#> 3 1.2587867  8.011417 -0.4744159
#> 4 0.9871787 14.780626  0.4292511
#> 5 2.4021943  6.799788 -0.6692669
#> 6 4.2109032 17.858701 -0.3370763
set.seed(456)
data_large <- samplezoo("large")
head(data_large)
#>       norm    norm2      norm3 binom neg pois        exp      unif       beta
#> 1 29.84718 68.13494  7.9885694     0   0    5  3.4417303 0.8866347 0.05413307
#> 2 59.32663 52.32066 21.2526086     0   3    3  0.8114356 0.7976466 0.07195440
#> 3 62.01312 62.47569 38.4789563     0   2    6 46.8038907 0.6469920 0.22555129
#> 4 29.16661 53.51086 -0.8656269     0   1    5 11.6955326 0.2036753 0.71455809
#> 5 39.28465 47.19406 47.7819258     1   1    1  0.3535625 0.3653401 0.34619912
#> 6 45.13908 63.33566 53.3620528     1   1    2  4.5592136 0.7628573 0.25880522
#>       gamma     chisq     t_dist
#> 1 6.7914120  4.464348 -1.0150596
#> 2 3.0132520  8.062120  0.3262369
#> 3 4.7360954 10.969593  1.5141157
#> 4 5.1235878  6.249247  0.6432708
#> 5 6.6851637  4.358815  0.2025742
#> 6 0.3903841 20.019575  1.6257109