library(data.table)
# CJ stands for cross-join; it returns all possible
# pairs of elements of y1 and y2 as a data.table,
# just like expand.grid does in base R but ~ 5x faster
pmf = CJ(y1 = 0:1, y2 = 0:2
)[ , p := c(.38, .14, .24, .17, .02, .05)][]
# Jumping into the weeds here a bit.
# Approach: Sample **row numbers** of the pmf with
# probability corresponding to the likelihood of the
# corresponding cell in the pmf table.
# Then draw the y1/y2 corresponding to that row number,
# and calculate the correlation of y1/y2 that we draw
# sample(.I, 1e6, prob = p, replace = TRUE)
# * .I is the shorthand for the row number in a data.table
# * 1e6 is the number of samples we want to draw
# * prob = p tells sample not to draw the rows with equal
# probability, but, e.g., row 1 will be chosen with
# probability p[1]; row 2, with probability p[2], etc.
# .SD[sample(...)]
# * .SD, called within a [] call to a data.table, is simply
# the data.table itself. So if we write pmf[ , .SD], it's
# exactly equivalent to having written pmf[]. Oftentimes,
# it's convenient to be able to refer to the data itself
# within the [] call, as we do here. So .SD[sample(...)]
# is the same as pmf[sample(...)], except that we use
# variables within sample(...) that we can't use in the
# first argument (called i) to [] -- namely, .I and p
# Ultimately, .SD[sample(...)] extracts the rows (with
# repetitions as necessary) corresponding to each element
# of sample(...).
# .SD[sample(...), cor(y1, y2)]
# * Now that we've "subsetted" the data to those rows
# corresponding to sample(...), we simply calculate the
# correlation on our sample.
pmf[ , .SD[sample(.I, 1e6, prob = p, replace = TRUE), cor(y1, y2)]]
## [1] -0.1546545
Each element of the PMF matrix will be c*(something)
; when we add up across all cells, it’ll come to c*(sum of cells)
, which we need to equal 1. Knowing this, we can solve:
x = c(1, 2, 4)
y = c(1, 3)
pmf.matrix = outer(x, y, function(x, y) x^2 + y^2)
pmf.matrix
## [,1] [,2]
## [1,] 2 10
## [2,] 5 13
## [3,] 17 25
cc = 1/sum(pmf.matrix)
cc
## [1] 0.01388889
#Same approach as above, but done to death
pmf = CJ(x = x, y = y)[ , p := cc * c(pmf.matrix)]
pmf[ , .SD[sample(.I, 1e6, prob = p, replace = TRUE),
#Using backticks since these aren't kosher variable names
.(`(b)` = mean(y < x), `(c)` = mean(y > x),
`(d)` = mean(y == x), `(e)` = mean(y == 3),
`(g.1)` = mean(x), `(g.2)` = mean(y),
`(g.3)` = mean(x*y), `(h.1)` = var(x),
`(h.2)` = var(y), `(h.3)` = var(x + y),
`(i.1)` = mean(x[x >= y]),
`(i.2)` = var(x[x >= y]))]]
## (b) (c) (d) (e) (g.1) (g.2) (g.3) (h.1)
## 1: 0.764056 0.208069 0.027875 0.555846 2.958833 2.111692 6.434377 1.290354
## (h.2) (h.3) (i.1) (i.2)
## 1: 0.9875259 2.650346 3.298481 1.016507
Doing the simulation for an unknown joint CDF is a bit tough. So we’re instead going to approximate by discretizing.
pdf = CJ(y1 = seq(0, 1, length.out = 1000L),
y2 = seq(0, 1, length.out = 1000L)
)[ , p := 4 * y1 * y2
#have to normalize since we discretized
][ , p := p/sum(p)]
#sanity check
pdf[ , sum(p)]
## [1] 1
#now calculate CDF
pdf[y1 <= .5 & y2 <= .75, sum(p)]
## [1] 0.1404374
#marginal dist'n for Y1
pdf[ , .(p_1 = sum(p)), by = y1]
## y1 p_1
## 1: 0.000000000 0.000000e+00
## 2: 0.001001001 2.002002e-06
## 3: 0.002002002 4.004004e-06
## 4: 0.003003003 6.006006e-06
## 5: 0.004004004 8.008008e-06
## ---
## 996: 0.995995996 1.991992e-03
## 997: 0.996996997 1.993994e-03
## 998: 0.997997998 1.995996e-03
## 999: 0.998998999 1.997998e-03
## 1000: 1.000000000 2.000000e-03
#marginal dist'n for Y2
pdf[ , .(p_2 = sum(p)), by = y2]
## y2 p_2
## 1: 0.000000000 0.000000e+00
## 2: 0.001001001 2.002002e-06
## 3: 0.002002002 4.004004e-06
## 4: 0.003003003 6.006006e-06
## 5: 0.004004004 8.008008e-06
## ---
## 996: 0.995995996 1.991992e-03
## 997: 0.996996997 1.993994e-03
## 998: 0.997997998 1.995996e-03
## 999: 0.998998999 1.997998e-03
## 1000: 1.000000000 2.000000e-03
pdf[ , .SD[sample(.I, 1e6, prob = p, replace = TRUE), var(y1 - y2)]]
## [1] 0.1111621
k = 1/integrate(function(y) y^3, 0, 6)$value/
integrate(function(x) x^2, 0, 6)$value
k
## [1] 4.286694e-05
We can’t get an expression for the marginal PDF of \(X\), but we can plot it like so:
x = seq(0, 6, length.out = 1000L)
plot(x, sapply(x, function(xx)
integrate(function(y) k * xx^2 * y^3, 0, 6)$value),
main = "Marginal PDF of X", ylab = "Density",
type = "l", lwd = 3L, col = "red", las = 1L)
y = seq(0, 6, length.out = 1000L)
plot(y, sapply(y, function(yy)
integrate(function(x) k * x^2 * yy^3, 0, 6)$value),
main = "Marginal PDF of Y", ylab = "Density",
type = "l", lwd = 3L, col = "blue", las = 1L)
pdf = CJ(x = x, y = y)[ , p := k * x^2 * y^3
][ , p := p/sum(p)]
pdf[ , .SD[sample(.I, 1e6, prob = p, replace = TRUE),
.(`(d)` = var(x), `(e)` = var(y), `(f)` = cov(x, y))]]
## (d) (e) (f)
## 1: 1.351167 0.9614504 -0.0002651709