Marginal likelihood from the Gibbs output
I am replicating from scratch the results in Section 4.2.1 of

Marginal Likelihood from the Gibbs Output, by Siddhartha Chib, Journal of the American Statistical Association, Vol. 90, No. 432 (Dec., 1995), pp. 1313-1321.
It is a mixture-of-normals model with a known number $k$ of components,

$$f(x \mid w, \mu, \sigma^2) = \prod_{i=1}^{n} \sum_{j=1}^{k} w_j \, \mathrm{N}(x_i \mid \mu_j, \sigma_j^2).$$

The Gibbs sampler for this model is implemented using the data-augmentation technique of Tanner and Wong. A set of allocation variables $z_1, \dots, z_n$ taking values in $\{1, \dots, k\}$ is introduced, and we specify $\Pr(z_i = j \mid w) = w_j$ and $f(x_i \mid z_i = j, \mu, \sigma^2) = \mathrm{N}(x_i \mid \mu_j, \sigma_j^2)$. It follows that marginalizing over the $z_i$'s gives back the original likelihood.
The dataset consists of the velocities of 82 galaxies from the Corona Borealis region.
set.seed(1701)
x <- c( 9.172,  9.350,  9.483,  9.558,  9.775, 10.227, 10.406, 16.084, 16.170,
       18.419, 18.552, 18.600, 18.927, 19.052, 19.070, 19.330, 19.343, 19.349,
       19.440, 19.473, 19.529, 19.541, 19.547, 19.663, 19.846, 19.856, 19.863,
       19.914, 19.918, 19.973, 19.989, 20.166, 20.175, 20.179, 20.196, 20.215,
       20.221, 20.415, 20.629, 20.795, 20.821, 20.846, 20.875, 20.986, 21.137,
       21.492, 21.701, 21.814, 21.921, 21.960, 22.185, 22.209, 22.242, 22.249,
       22.314, 22.374, 22.495, 22.746, 22.747, 22.888, 22.914, 23.206, 23.241,
       23.263, 23.484, 23.538, 23.542, 23.666, 23.706, 23.711, 24.129, 24.285,
       24.289, 24.366, 24.717, 24.990, 25.633, 26.960, 26.995, 32.065, 32.789,
       34.279)
nn <- length(x)
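As a quick look at the data (not part of Chib's procedure), a histogram shows the multimodality that makes a three-component mixture plausible:

hist(x, breaks = 25, xlab = "velocity (1000 km/s)", main = "Galaxy data")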
We assume that $w \sim \mathrm{Dirichlet}(a_1, \dots, a_k)$, that the $\mu_j \sim \mathrm{N}(\mu_0, \sigma_0^2)$, and that the $\sigma_j^2 \sim \mathrm{InvGamma}(\nu_0/2, \delta_0/2)$, all a priori independent.
k   <- 3
mu0 <- 20
va0 <- 100
nu0 <- 6
de0 <- 40
a   <- rep(1, k)
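To get a feel for these hyperparameters one can draw from the priors; for instance, the implied $\mathrm{InvGamma}(\nu_0/2 = 3, \delta_0/2 = 20)$ prior on each $\sigma_j^2$ has mean $\delta_0/(\nu_0 - 2) = 10$. A minimal sanity check (illustrative only; the variable names are mine):

mu_prior <- rnorm(10^4, mean = mu0, sd = sqrt(va0))        # component means, spread +-10 around 20
va_prior <- 1 / rgamma(10^4, shape = nu0/2, rate = de0/2)  # component variances
mean(va_prior)  # should be close to de0/(nu0 - 2) = 10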
Using Bayes' theorem, the full conditionals are

$$w \mid z \sim \mathrm{Dirichlet}(a_1 + n_1, \dots, a_k + n_k),$$

$$\mu_j \mid \sigma^2, z, x \sim \mathrm{N}\!\left(\frac{n_j \bar{x}_j \sigma_0^2 + \mu_0 \sigma_j^2}{n_j \sigma_0^2 + \sigma_j^2}, \; \frac{\sigma_0^2 \sigma_j^2}{n_j \sigma_0^2 + \sigma_j^2}\right),$$

$$\sigma_j^2 \mid \mu, z, x \sim \mathrm{InvGamma}\!\left(\frac{\nu_0 + n_j}{2}, \; \frac{\delta_0 + \delta_j}{2}\right),$$

$$\Pr(z_i = j \mid w, \mu, \sigma^2, x) \propto w_j \, \mathrm{N}(x_i \mid \mu_j, \sigma_j^2),$$

where

$$n_j = \#\{i : z_i = j\}, \qquad \bar{x}_j = \frac{1}{n_j} \sum_{i : z_i = j} x_i,$$

and

$$\delta_j = \sum_{i : z_i = j} (x_i - \mu_j)^2.$$

The goal is to compute an estimate of the marginal likelihood of the model. Chib's method starts with a first run of the Gibbs sampler over all the full conditionals.
burn_in <- 1000
run <- 15000

cat("First Gibbs run (full):\n")
N  <- burn_in + run
w  <- matrix(1, nrow = N, ncol = k)
mu <- matrix(0, nrow = N, ncol = k)
va <- matrix(1, nrow = N, ncol = k)
z  <- matrix(1, nrow = N, ncol = nn)
n  <- integer(k)
m  <- numeric(k)
de <- numeric(k)

rdirichlet <- function(a) { y <- rgamma(length(a), a, 1); y / sum(y) }

pb <- txtProgressBar(min = 2, max = N, style = 3)
z[1,] <- sample.int(k, size = nn, replace = TRUE)
for (t in 2:N) {
  # w | z
  n <- tabulate(z[t-1,], nbins = k)
  w[t,] <- rdirichlet(a + n)
  # mu | va, z, x  (m holds the within-component means; empty components keep m = 0)
  m <- sapply(1:k, function(j) sum(x[z[t-1,]==j]))
  m[n > 0] <- m[n > 0] / n[n > 0]
  mu[t,] <- rnorm(k, mean = (n*m*va0+mu0*va[t-1,])/(n*va0+va[t-1,]),
                     sd = sqrt(va0*va[t-1,]/(n*va0+va[t-1,])))
  # va | mu, z, x
  de <- sapply(1:k, function(j) sum((x[z[t-1,]==j] - mu[t,j])^2))
  va[t,] <- 1 / rgamma(k, shape = (nu0+n)/2, rate = (de0+de)/2)
  # z | w, mu, va, x  (sample.int normalizes the unnormalized probabilities)
  z[t,] <- sapply(1:nn, function(i) sample.int(k, size = 1,
              prob = exp(log(w[t,]) + dnorm(x[i], mean = mu[t,], sd = sqrt(va[t,]), log = TRUE))))
  setTxtProgressBar(pb, t)
}
close(pb)
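Before using the draws, it is worth eyeballing mixing; a simple traceplot of the component means (bearing in mind that label switching can make the paths cross) is enough for a rough check:

matplot(mu, type = "l", lty = 1, xlab = "iteration", ylab = expression(mu[j]),
        main = "Traceplot of component means (full run)")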
From the first run we get an approximation $\theta^* = (w^*, \mu^*, \sigma^{2*})$ of the maximum-likelihood point. Since the likelihood is actually unbounded, what this procedure probably gives is an approximate local MAP.
w  <- w[(burn_in+1):N,]
mu <- mu[(burn_in+1):N,]
va <- va[(burn_in+1):N,]
z  <- z[(burn_in+1):N,]
N  <- N - burn_in

log_L <- function(x, w, mu, va)
  sum(log(sapply(1:nn, function(i)
    sum(exp(log(w) + dnorm(x[i], mean = mu, sd = sqrt(va), log = TRUE))))))

ts <- which.max(sapply(1:N, function(t) log_L(x, w[t,], mu[t,], va[t,])))
ws  <- w[ts,]
mus <- mu[ts,]
vas <- va[ts,]
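One caveat: log_L exponentiates the per-component log densities before summing, which can underflow for parameter values far from the data. A numerically safer variant (same result up to floating point, a sketch using the log-sum-exp trick; log_L_stable is my name, not part of the original code):

log_L_stable <- function(x, w, mu, va)
  sum(sapply(1:nn, function(i) {
    lp <- log(w) + dnorm(x[i], mean = mu, sd = sqrt(va), log = TRUE)
    m <- max(lp)
    m + log(sum(exp(lp - m)))  # factor out the largest term before exponentiating
  }))
# e.g. log_L_stable(x, ws, mus, vas) agrees with log_L(x, ws, mus, vas)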
Chib's estimate of the log marginal likelihood is

$$\log \hat{m}(x) = \log f(x \mid \theta^*) + \log \pi(\theta^*) - \log \hat{\pi}(\mu^* \mid x) - \log \hat{\pi}(\sigma^{2*} \mid \mu^*, x) - \log \hat{\pi}(w^* \mid \mu^*, \sigma^{2*}, x).$$

We already have the first two terms.
log_prior <- function(w, mu, va) {
  lgamma(sum(a)) - sum(lgamma(a)) + sum((a-1)*log(w)) +
  sum(dnorm(mu, mean = mu0, sd = sqrt(va0), log = TRUE)) +
  sum((nu0/2)*log(de0/2) - lgamma(nu0/2) - (nu0/2+1)*log(va) - de0/(2*va))
}

chib <- log_L(x, ws, mus, vas) + log_prior(ws, mus, vas)
The Rao-Blackwellized estimate of the first posterior ordinate is

$$\hat{\pi}(\mu^* \mid x) = \frac{1}{N} \sum_{t=1}^{N} \pi(\mu^* \mid \sigma^{2(t)}, z^{(t)}, x),$$

and is easily obtained from the first Gibbs run.
pi.mu_va.z.x <- function(mu, va, z) {
  n <- tabulate(z, nbins = k)
  m <- sapply(1:k, function(j) sum(x[z==j]))
  m[n > 0] <- m[n > 0] / n[n > 0]
  exp(sum(dnorm(mu, mean = (n*m*va0+mu0*va)/(n*va0+va),
                sd = sqrt(va0*va/(n*va0+va)), log = TRUE)))
}

chib <- chib - log(mean(sapply(1:N, function(t) pi.mu_va.z.x(mus, va[t,], z[t,]))))
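The Monte Carlo error of such an ordinate estimate can be gauged with a crude delta-method standard error on the log scale; this ignores the autocorrelation of the chain and so understates the true error (batch means would be more faithful). A sketch, with names of my choosing:

ord <- sapply(1:N, function(t) pi.mu_va.z.x(mus, va[t,], z[t,]))
se_log <- sd(ord) / (sqrt(N) * mean(ord))  # approx. s.e. of log(mean(ord))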
The Rao-Blackwellized estimate of the second ordinate is

$$\hat{\pi}(\sigma^{2*} \mid \mu^*, x) = \frac{1}{N} \sum_{t=1}^{N} \pi(\sigma^{2*} \mid \mu^*, z^{(t)}, x),$$

and is computed from a second, reduced Gibbs run in which the $\mu_j$'s are not updated but set equal to $\mu_j^*$ at each iteration.
cat("Second Gibbs run (reduced):\n") N <- burn_in + run w <- matrix(1, nrow = N, ncol = k) va <- matrix(1, nrow = N, ncol = k) z <- matrix(1, nrow = N, ncol = nn) pb <- txtProgressBar(min = 2, max = N, style = 3) z[1,] <- sample.int(k, size = nn, replace = TRUE) for (t in 2:N) { n <- tabulate(z[t-1,], nbins = k) w[t,] <- rdirichlet(a + n) de <- sapply(1:k, function(j) sum((x[z[t-1,]==j] - mus[j])^2)) va[t,] <- 1 / rgamma(k, shape = (nu0+n)/2, rate = (de0+de)/2) z[t,] <- sapply(1:nn, function(i) sample.int(k, size = 1, prob = exp(log(w[t,]) + dnorm(x[i], mean = mus, sd = sqrt(va[t,]), log = TRUE)))) setTxtProgressBar(pb, t) } close(pb) w <- w[(burn_in+1):N,] va <- va[(burn_in+1):N,] z <- z[(burn_in+1):N,] N <- N - burn_in pi.va_mu.z.x <- function(va, mu, z) { n <- tabulate(z, nbins = k) de <- sapply(1:k, function(j) sum((x[z==j] - mu[j])^2)) exp(sum(((nu0+n)/2)*log((de0+de)/2) - lgamma((nu0+n)/2) - ((nu0+n)/2+1)*log(va) - (de0+de)/(2*va))) } chib <- chib - log(mean(sapply(1:N, function(t) pi.va_mu.z.x(vas, mus, z[t,]))))
Similarly, the Rao-Blackwellized estimate of the last ordinate is

$$\hat{\pi}(w^* \mid \mu^*, \sigma^{2*}, x) = \frac{1}{N} \sum_{t=1}^{N} \pi(w^* \mid z^{(t)}),$$

and is computed from a third reduced Gibbs run in which the $\mu_j$'s and $\sigma_j^2$'s are not updated but set equal to $\mu_j^*$ and $\sigma_j^{2*}$ respectively at each iteration.
cat("Third Gibbs run (reduced):\n") N <- burn_in + run w <- matrix(1, nrow = N, ncol = k) z <- matrix(1, nrow = N, ncol = nn) pb <- txtProgressBar(min = 2, max = N, style = 3) z[1,] <- sample.int(k, size = nn, replace = TRUE) for (t in 2:N) { n <- tabulate(z[t-1,], nbins = k) w[t,] <- rdirichlet(a + n) z[t,] <- sapply(1:nn, function(i) sample.int(k, size = 1, prob = exp(log(w[t,]) + dnorm(x[i], mean = mus, sd = sqrt(vas), log = TRUE)))) setTxtProgressBar(pb, t) } close(pb) w <- w[(burn_in+1):N,] z <- z[(burn_in+1):N,] N <- N - burn_in pi.w_z.x <- function(w, z) { n <- tabulate(z, nbins = k) exp(lgamma(sum(a+n)) - sum(lgamma(a+n)) + sum((a+n-1)*log(w))) } chib <- chib - log(mean(sapply(1:N, function(t) pi.w_z.x(ws, z[t,]))))
After all this, we get a log estimate that is larger than the value $-224.138$ reported by Chib, by more than the Monte Carlo error.
To check that I had not messed up the Gibbs sampler, I reimplemented the whole thing using rjags. The following code gives the same results.
x <- c( 9.172,  9.350,  9.483,  9.558,  9.775, 10.227, 10.406, 16.084, 16.170,
       18.419, 18.552, 18.600, 18.927, 19.052, 19.070, 19.330, 19.343, 19.349,
       19.440, 19.473, 19.529, 19.541, 19.547, 19.663, 19.846, 19.856, 19.863,
       19.914, 19.918, 19.973, 19.989, 20.166, 20.175, 20.179, 20.196, 20.215,
       20.221, 20.415, 20.629, 20.795, 20.821, 20.846, 20.875, 20.986, 21.137,
       21.492, 21.701, 21.814, 21.921, 21.960, 22.185, 22.209, 22.242, 22.249,
       22.314, 22.374, 22.495, 22.746, 22.747, 22.888, 22.914, 23.206, 23.241,
       23.263, 23.484, 23.538, 23.542, 23.666, 23.706, 23.711, 24.129, 24.285,
       24.289, 24.366, 24.717, 24.990, 25.633, 26.960, 26.995, 32.065, 32.789,
       34.279)

library(rjags)

nn <- length(x)
k   <- 3
mu0 <- 20
va0 <- 100
nu0 <- 6
de0 <- 40
a   <- rep(1, k)

burn_in <- 10^3
N <- 10^4

full <- "
  model {
    for (i in 1:n) {
      x[i] ~ dnorm(mu[z[i]], tau[z[i]])
      z[i] ~ dcat(w[])
    }
    for (i in 1:k) {
      mu[i] ~ dnorm(mu0, 1/va0)
      tau[i] ~ dgamma(nu0/2, de0/2)
      va[i] <- 1/tau[i]
    }
    w ~ ddirich(a)
  }
"
data <- list(x = x, n = nn, k = k, mu0 = mu0, va0 = va0, nu0 = nu0, de0 = de0, a = a)
model <- jags.model(textConnection(full), data = data, n.chains = 1, n.adapt = 100)
update(model, n.iter = burn_in)
samples <- jags.samples(model, c("mu", "va", "w", "z"), n.iter = N)
mu <- matrix(samples$mu, nrow = N, byrow = TRUE)
va <- matrix(samples$va, nrow = N, byrow = TRUE)
w  <- matrix(samples$w, nrow = N, byrow = TRUE)
z  <- matrix(samples$z, nrow = N, byrow = TRUE)

log_L <- function(x, w, mu, va)
  sum(log(sapply(1:nn, function(i)
    sum(exp(log(w) + dnorm(x[i], mean = mu, sd = sqrt(va), log = TRUE))))))

ts <- which.max(sapply(1:N, function(t) log_L(x, w[t,], mu[t,], va[t,])))
ws  <- w[ts,]
mus <- mu[ts,]
vas <- va[ts,]

log_prior <- function(w, mu, va) {
  lgamma(sum(a)) - sum(lgamma(a)) + sum((a-1)*log(w)) +
  sum(dnorm(mu, mean = mu0, sd = sqrt(va0), log = TRUE)) +
  sum((nu0/2)*log(de0/2) - lgamma(nu0/2) - (nu0/2+1)*log(va) - de0/(2*va))
}

chib <- log_L(x, ws, mus, vas) + log_prior(ws, mus, vas)
cat("log-likelihood + log-prior =", chib, "\n")

pi.mu_va.z.x <- function(mu, va, z, x) {
  n <- sapply(1:k, function(j) sum(z==j))
  m <- sapply(1:k, function(j) sum(x[z==j]))
  m[n > 0] <- m[n > 0] / n[n > 0]
  exp(sum(dnorm(mu, mean = (n*m*va0+mu0*va)/(n*va0+va),
                sd = sqrt(va0*va/(n*va0+va)), log = TRUE)))
}

chib <- chib - log(mean(sapply(1:N, function(t) pi.mu_va.z.x(mus, va[t,], z[t,], x))))
cat("log-likelihood + log-prior - log-pi.mu_ =", chib, "\n")

fixed.mu <- "
  model {
    for (i in 1:n) {
      x[i] ~ dnorm(mus[z[i]], tau[z[i]])
      z[i] ~ dcat(w[])
    }
    for (i in 1:k) {
      tau[i] ~ dgamma(nu0/2, de0/2)
      va[i] <- 1/tau[i]
    }
    w ~ ddirich(a)
  }
"
data <- list(x = x, n = nn, k = k, nu0 = nu0, de0 = de0, a = a, mus = mus)
model <- jags.model(textConnection(fixed.mu), data = data, n.chains = 1, n.adapt = 100)
update(model, n.iter = burn_in)
samples <- jags.samples(model, c("va", "w", "z"), n.iter = N)
va <- matrix(samples$va, nrow = N, byrow = TRUE)
w  <- matrix(samples$w, nrow = N, byrow = TRUE)
z  <- matrix(samples$z, nrow = N, byrow = TRUE)

pi.va_mu.z.x <- function(va, mu, z, x) {
  n <- sapply(1:k, function(j) sum(z==j))
  de <- sapply(1:k, function(j) sum((x[z==j] - mu[j])^2))
  exp(sum(((nu0+n)/2)*log((de0+de)/2) - lgamma((nu0+n)/2) -
          ((nu0+n)/2+1)*log(va) - (de0+de)/(2*va)))
}

chib <- chib - log(mean(sapply(1:N, function(t) pi.va_mu.z.x(vas, mus, z[t,], x))))
cat("log-likelihood + log-prior - log-pi.mu_ - log-pi.va_ =", chib, "\n")

fixed.mu.and.va <- "
  model {
    for (i in 1:n) {
      x[i] ~ dnorm(mus[z[i]], 1/vas[z[i]])
      z[i] ~ dcat(w[])
    }
    w ~ ddirich(a)
  }
"
data <- list(x = x, n = nn, a = a, mus = mus, vas = vas)
model <- jags.model(textConnection(fixed.mu.and.va), data = data, n.chains = 1, n.adapt = 100)
update(model, n.iter = burn_in)
samples <- jags.samples(model, c("w", "z"), n.iter = N)
w <- matrix(samples$w, nrow = N, byrow = TRUE)
z <- matrix(samples$z, nrow = N, byrow = TRUE)

pi.w_z.x <- function(w, z, x) {
  n <- sapply(1:k, function(j) sum(z==j))
  exp(lgamma(sum(a)+nn) - sum(lgamma(a+n)) + sum((a+n-1)*log(w)))
}

chib <- chib - log(mean(sapply(1:N, function(t) pi.w_z.x(ws, z[t,], x))))
cat("log-likelihood + log-prior - log-pi.mu_ - log-pi.va_ - log-pi.w_ =", chib, "\n")
My question is whether there is any misunderstanding of Chib's method in the description above, or any mistake in its implementation.
There is a slight programming mistake in the prior
log_prior <- function(w, mu, va) {
  lgamma(sum(a)) - sum(lgamma(a)) + sum((a-1)*log(w)) +
  sum(dnorm(mu, mean = mu0, sd = sqrt(va0), log = TRUE)) +
  sum((nu0/2)*log(de0/2) - lgamma(nu0/2) - (nu0/2+1)*log(va) - de0/(2*va))
}
as it should be instead
log_prior <- function(w, mu, va) {
  lgamma(sum(a)) - sum(lgamma(a)) + sum((a-1)*log(w)) +
  sum(dnorm(mu, mean = mu0, sd = sqrt(va0), log = TRUE)) +
  sum((nu0/2)*log(de0/2) - lgamma(nu0/2) - (nu0/2+1)*log(va) - de0/(2*va))
}
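Slips of this kind are easy to catch by cross-checking the hand-coded inverse-gamma log density against R's dgamma via the change of variables $\sigma^2 = 1/\tau$, which contributes a Jacobian term $-2\log\sigma^2$. A minimal check (va_test is an arbitrary grid of my choosing):

va_test <- c(0.5, 5, 50)
hand <- (nu0/2)*log(de0/2) - lgamma(nu0/2) - (nu0/2+1)*log(va_test) - de0/(2*va_test)
viaR <- dgamma(1/va_test, shape = nu0/2, rate = de0/2, log = TRUE) - 2*log(va_test)
all.equal(hand, viaR)  # TRUE if the hand-coded density is right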
Rerunning the code this way leads to
> chib
[1] -228.194
which is not the value produced in Chib (1995) for that case! However, in Neal’s (1999) reanalysis of the problem, he mentions that
According to one anonymous JASA referee, the figure of -224.138 for the log of the marginal likelihood for the three component model with unequal variances that was given in Chib's paper is a "typo" with the correct figure being -228.608.
So this resolves the discrepancy.