Bayesian

Marginal likelihood from the Gibbs output

  • December 15, 2014

I am reproducing from scratch the results in Section 4.2.1 of

Marginal Likelihood from the Gibbs Output

Siddhartha Chib

Journal of the American Statistical Association, Vol. 90, No. 432 (Dec., 1995), pp. 1313-1321.

It is a mixture-of-normals model with a known number $k$ of components,

$$f(x \mid w, \mu, \sigma^2) = \prod_{i=1}^n \sum_{j=1}^k w_j \, \mathrm{N}(x_i \mid \mu_j, \sigma_j^2). \tag{$*$}$$

The Gibbs sampler for this model is implemented using Tanner and Wong's data-augmentation technique. A set of allocation variables $z = (z_1, \dots, z_n)$, with each $z_i$ taking values in $\{1, \dots, k\}$, is introduced, and we specify $\Pr(z_i = j \mid w) = w_j$ and $x_i \mid z_i, \mu, \sigma^2 \sim \mathrm{N}(\mu_{z_i}, \sigma^2_{z_i})$. Summing over the $z_i$'s then recovers the original likelihood $(*)$.

The dataset is formed by the velocities of $n = 82$ galaxies from the Corona Borealis region.

set.seed(1701)

x <- c(  9.172,  9.350,  9.483,  9.558,  9.775, 10.227, 10.406, 16.084, 16.170, 18.419, 18.552, 18.600, 18.927,
       19.052, 19.070, 19.330, 19.343, 19.349, 19.440, 19.473, 19.529, 19.541, 19.547, 19.663, 19.846, 19.856,
       19.863, 19.914, 19.918, 19.973, 19.989, 20.166, 20.175, 20.179, 20.196, 20.215, 20.221, 20.415, 20.629,
       20.795, 20.821, 20.846, 20.875, 20.986, 21.137, 21.492, 21.701, 21.814, 21.921, 21.960, 22.185, 22.209,
       22.242, 22.249, 22.314, 22.374, 22.495, 22.746, 22.747, 22.888, 22.914, 23.206, 23.241, 23.263, 23.484,
       23.538, 23.542, 23.666, 23.706, 23.711, 24.129, 24.285, 24.289, 24.366, 24.717, 24.990, 25.633, 26.960,
       26.995, 32.065, 32.789, 34.279 )

nn <- length(x)
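
A quick look at the data (my addition, not part of the original analysis): a histogram of the velocities, conventionally measured in thousands of km/s for this classical galaxy dataset, shows the multimodality that motivates a mixture model.

hist(x, breaks = 25, main = "Corona Borealis galaxy velocities",
     xlab = "velocity (1,000 km/s)")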

We suppose that $\mu_j \sim \mathrm{N}(\mu_0, \sigma_0^2)$, that $\sigma_j^2 \sim \mathrm{IG}(\nu_0/2, \delta_0/2)$, and that $w \sim \mathrm{Dirichlet}(a_1, \dots, a_k)$, a priori independently, with the hyperparameters set as follows.

k <- 3

mu0 <- 20
va0 <- 100

nu0 <- 6
de0 <- 40

a <- rep(1, k)

Using Bayes' theorem, the full conditionals are

$$\begin{aligned}
\mu_j \mid \cdots &\sim \mathrm{N}\!\left(\frac{n_j m_j \sigma_0^2 + \mu_0 \sigma_j^2}{n_j \sigma_0^2 + \sigma_j^2},\; \frac{\sigma_0^2 \sigma_j^2}{n_j \sigma_0^2 + \sigma_j^2}\right), \\
\sigma_j^2 \mid \cdots &\sim \mathrm{IG}\!\left(\frac{\nu_0 + n_j}{2},\; \frac{\delta_0 + \delta_j}{2}\right), \\
w \mid \cdots &\sim \mathrm{Dirichlet}(a_1 + n_1, \dots, a_k + n_k), \\
\Pr(z_i = j \mid \cdots) &\propto w_j \, \mathrm{N}(x_i \mid \mu_j, \sigma_j^2),
\end{aligned}$$

where

$$n_j = |\{\, i : z_i = j \,\}|, \qquad m_j = \frac{1}{n_j} \sum_{i \,:\, z_i = j} x_i, \qquad \delta_j = \sum_{i \,:\, z_i = j} (x_i - \mu_j)^2.$$

The goal is to compute an estimate of the marginal likelihood of the model. Chib's method starts with a first run of the Gibbs sampler using these full conditionals.

burn_in <- 1000
run     <- 15000

cat("First Gibbs run (full):\n")

N <- burn_in + run

w  <- matrix(1, nrow = N, ncol = k)
mu <- matrix(0, nrow = N, ncol = k)
va <- matrix(1, nrow = N, ncol = k)
z  <- matrix(1, nrow = N, ncol = nn)

n <- integer(k)
m <- numeric(k)
de <- numeric(k)

# Draw from a Dirichlet(a) distribution by normalizing independent Gamma(a_i, 1) variates
rdirichlet <- function(a) { y <- rgamma(length(a), a, 1); y / sum(y) }

pb <- txtProgressBar(min = 2, max = N, style = 3)
z[1,] <- sample.int(k, size = nn, replace = TRUE)
for (t in 2:N) {
   n <- tabulate(z[t-1,], nbins = k)     # component counts n_j
   w[t,] <- rdirichlet(a + n)            # w | z ~ Dirichlet(a_1 + n_1, ..., a_k + n_k)
   m <- sapply(1:k, function(j) sum(x[z[t-1,]==j]))
   m[n > 0] <- m[n > 0] / n[n > 0]       # component sample means m_j (left at 0 for empty components)
   mu[t,] <- rnorm(k, mean = (n*m*va0+mu0*va[t-1,])/(n*va0+va[t-1,]), sd = sqrt(va0*va[t-1,]/(n*va0+va[t-1,])))
   de <- sapply(1:k, function(j) sum((x[z[t-1,]==j] - mu[t,j])^2))
   va[t,] <- 1 / rgamma(k, shape = (nu0+n)/2, rate = (de0+de)/2)   # sigma_j^2 | mu, z ~ IG((nu0+n_j)/2, (de0+de_j)/2)
   z[t,] <- sapply(1:nn, function(i) sample.int(k, size = 1, prob = exp(log(w[t,]) + dnorm(x[i], mean = mu[t,], sd = sqrt(va[t,]), log = TRUE))))
   setTxtProgressBar(pb, t)
}
close(pb)
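
As a diagnostic sketch (my addition), traceplots of the sampled component means give a quick visual check on mixing before the burn-in is discarded:

matplot(mu, type = "l", lty = 1,
        xlab = "iteration", ylab = "mu[j]",
        main = "Traceplots of component means")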

From this first run we get an approximation $(w^*, \mu^*, \sigma^{2*})$ of the point of maximum likelihood. Since the likelihood is actually unbounded, what this procedure probably gives is an approximate local MAP.

w  <- w[(burn_in+1):N,]
mu <- mu[(burn_in+1):N,]
va <- va[(burn_in+1):N,]
z  <- z[(burn_in+1):N,]
N  <- N - burn_in

log_L <- function(x, w, mu, va) sum(log(sapply(1:nn, function(i) sum(exp(log(w) + dnorm(x[i], mean = mu, sd = sqrt(va), log = TRUE))))))

ts <- which.max(sapply(1:N, function(t) log_L(x, w[t,], mu[t,], va[t,])))

ws <- w[ts,]
mus <- mu[ts,]
vas <- va[ts,]
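
It can be reassuring to glance at the located point (my addition; the exact values depend on the seed):

print(round(cbind(w = ws, mu = mus, va = vas), 3))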

Chib's estimate of the log marginal likelihood is

$$\log \hat{f}(x) = \log L_x(w^*, \mu^*, \sigma^{2*}) + \log \pi(w^*, \mu^*, \sigma^{2*}) - \log \hat{\pi}(\mu^* \mid x) - \log \hat{\pi}(\sigma^{2*} \mid \mu^*, x) - \log \hat{\pi}(w^* \mid \mu^*, \sigma^{2*}, x).$$

We already have the first two terms.

log_prior <- function(w, mu, va) {
   lgamma(sum(a)) - sum(lgamma(a)) + sum((a-1)*log(w))
   + sum(dnorm(mu, mean = mu0, sd = sqrt(va0), log = TRUE))
   + sum((nu0/2)*log(de0/2) - lgamma(nu0/2) - (nu0/2+1)*log(va) - de0/(2*va))
}

chib <- log_L(x, ws, mus, vas) + log_prior(ws, mus, vas)
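
Mirroring the progress printouts of the rjags version below, the running total can be reported here (a convenience line of mine, not in the original):

cat("log-likelihood + log-prior =", chib, "\n")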

The Rao-Blackwellized estimate of $\pi(\mu^* \mid x)$ is

$$\hat{\pi}(\mu^* \mid x) = \frac{1}{N} \sum_{t=1}^{N} \pi\!\left(\mu^* \mid \sigma^{2(t)}, z^{(t)}, x\right),$$

which is readily available from the first Gibbs run.

pi.mu_va.z.x <- function(mu, va, z) {
   n <- tabulate(z, nbins = k)
   m <- sapply(1:k, function(j) sum(x[z==j]))
   m[n > 0] <- m[n > 0] / n[n > 0]
   exp(sum(dnorm(mu, mean = (n*m*va0+mu0*va)/(n*va0+va), sd = sqrt(va0*va/(n*va0+va)), log = TRUE)))
}

chib <- chib - log(mean(sapply(1:N, function(t) pi.mu_va.z.x(mus, va[t,], z[t,]))))

The Rao-Blackwellized estimate of $\pi(\sigma^{2*} \mid \mu^*, x)$ is

$$\hat{\pi}(\sigma^{2*} \mid \mu^*, x) = \frac{1}{N} \sum_{t=1}^{N} \pi\!\left(\sigma^{2*} \mid \mu^*, z^{(t)}, x\right),$$

which is computed from a second, reduced Gibbs run in which the $\mu_j$'s are not updated but are held equal to $\mu_j^*$ at every iteration.

cat("Second Gibbs run (reduced):\n")

N <- burn_in + run

w  <- matrix(1, nrow = N, ncol = k)
va <- matrix(1, nrow = N, ncol = k)
z  <- matrix(1, nrow = N, ncol = nn) 

pb <- txtProgressBar(min = 2, max = N, style = 3)
z[1,] <- sample.int(k, size = nn, replace = TRUE)
for (t in 2:N) {
   n <- tabulate(z[t-1,], nbins = k)
   w[t,] <- rdirichlet(a + n)
   de <- sapply(1:k, function(j) sum((x[z[t-1,]==j] - mus[j])^2))
   va[t,] <- 1 / rgamma(k, shape = (nu0+n)/2, rate = (de0+de)/2)
   z[t,] <- sapply(1:nn, function(i) sample.int(k, size = 1, prob = exp(log(w[t,]) + dnorm(x[i], mean = mus, sd = sqrt(va[t,]), log = TRUE))))
   setTxtProgressBar(pb, t)
}
close(pb)

w  <- w[(burn_in+1):N,]
va <- va[(burn_in+1):N,]
z  <- z[(burn_in+1):N,]
N  <- N - burn_in

pi.va_mu.z.x <- function(va, mu, z) {
   n <- tabulate(z, nbins = k)         
   de <- sapply(1:k, function(j) sum((x[z==j] - mu[j])^2))
   exp(sum(((nu0+n)/2)*log((de0+de)/2) - lgamma((nu0+n)/2) - ((nu0+n)/2+1)*log(va) - (de0+de)/(2*va)))
}

chib <- chib - log(mean(sapply(1:N, function(t) pi.va_mu.z.x(vas, mus, z[t,]))))

Analogously, the Rao-Blackwellized estimate of $\pi(w^* \mid \mu^*, \sigma^{2*}, x)$ is

$$\hat{\pi}(w^* \mid \mu^*, \sigma^{2*}, x) = \frac{1}{N} \sum_{t=1}^{N} \pi\!\left(w^* \mid z^{(t)}, x\right),$$

which is computed from a third, reduced Gibbs run in which the $\mu_j$'s and $\sigma_j^2$'s are not updated but are held equal to $\mu_j^*$ and $\sigma_j^{2*}$, respectively, at every iteration.

cat("Third Gibbs run (reduced):\n")

N <- burn_in + run

w  <- matrix(1, nrow = N, ncol = k)
z  <- matrix(1, nrow = N, ncol = nn) 

pb <- txtProgressBar(min = 2, max = N, style = 3)
z[1,] <- sample.int(k, size = nn, replace = TRUE)
for (t in 2:N) {
   n <- tabulate(z[t-1,], nbins = k)
   w[t,] <- rdirichlet(a + n)
   z[t,] <- sapply(1:nn, function(i) sample.int(k, size = 1, prob = exp(log(w[t,]) + dnorm(x[i], mean = mus, sd = sqrt(vas), log = TRUE))))
   setTxtProgressBar(pb, t)
}
close(pb)

w  <- w[(burn_in+1):N,]
z  <- z[(burn_in+1):N,]
N  <- N - burn_in

pi.w_z.x <- function(w, z) {
   n <- tabulate(z, nbins = k)
   exp(lgamma(sum(a+n)) - sum(lgamma(a+n)) + sum((a+n-1)*log(w)))
}

chib <- chib - log(mean(sapply(1:N, function(t) pi.w_z.x(ws, z[t,]))))
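
For symmetry with the rjags version below, the final estimate can be printed at this point (my addition):

cat("Chib's log marginal likelihood estimate =", chib, "\n")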

After all this, we get a log estimate that is larger than the $-224.138$ reported by Chib, well outside the stated Monte Carlo error.

To check whether I had messed up the Gibbs sampler, I reimplemented the whole thing using rjags. The following code gives the same results.

x <- c( 9.172,  9.350,  9.483,  9.558,  9.775, 10.227, 10.406, 16.084, 16.170, 18.419, 18.552, 18.600, 18.927, 19.052, 19.070, 19.330,
      19.343, 19.349, 19.440, 19.473, 19.529, 19.541, 19.547, 19.663, 19.846, 19.856, 19.863, 19.914, 19.918, 19.973, 19.989, 20.166,
      20.175, 20.179, 20.196, 20.215, 20.221, 20.415, 20.629, 20.795, 20.821, 20.846, 20.875, 20.986, 21.137, 21.492, 21.701, 21.814,
      21.921, 21.960, 22.185, 22.209, 22.242, 22.249, 22.314, 22.374, 22.495, 22.746, 22.747, 22.888, 22.914, 23.206, 23.241, 23.263,
      23.484, 23.538, 23.542, 23.666, 23.706, 23.711, 24.129, 24.285, 24.289, 24.366, 24.717, 24.990, 25.633, 26.960, 26.995, 32.065,
      32.789, 34.279 )

library(rjags)

nn <- length(x)

k <- 3

mu0 <- 20
va0 <- 100

nu0 <- 6
de0 <- 40

a <- rep(1, k)

burn_in <- 10^3

N <- 10^4

full <- "
   model {
       for (i in 1:n) {
           x[i] ~ dnorm(mu[z[i]], tau[z[i]])
           z[i] ~ dcat(w[])
       }
       for (i in 1:k) {
           mu[i] ~ dnorm(mu0, 1/va0)
           tau[i] ~ dgamma(nu0/2, de0/2)
           va[i] <- 1/tau[i]
       }
       w ~ ddirich(a)
   }
"
data <- list(x = x, n = nn, k = k, mu0 = mu0, va0 = va0, nu0 = nu0, de0 = de0, a = a)
model <- jags.model(textConnection(full), data = data, n.chains = 1, n.adapt = 100)
update(model, n.iter = burn_in)
samples <- jags.samples(model, c("mu", "va", "w", "z"), n.iter = N)

mu <- matrix(samples$mu, nrow = N, byrow = TRUE)
va <- matrix(samples$va, nrow = N, byrow = TRUE)
w  <- matrix(samples$w,  nrow = N, byrow = TRUE)
z  <- matrix(samples$z,  nrow = N, byrow = TRUE)

log_L <- function(x, w, mu, va) sum(log(sapply(1:nn, function(i) sum(exp(log(w) + dnorm(x[i], mean = mu, sd = sqrt(va), log = TRUE))))))

ts <- which.max(sapply(1:N, function(t) log_L(x, w[t,], mu[t,], va[t,])))

ws <- w[ts,]
mus <- mu[ts,]
vas <- va[ts,]

log_prior <- function(w, mu, va) {
   lgamma(sum(a)) - sum(lgamma(a)) + sum((a-1)*log(w))
   + sum(dnorm(mu, mean = mu0, sd = sqrt(va0), log = TRUE))
   + sum((nu0/2)*log(de0/2) - lgamma(nu0/2) - (nu0/2+1)*log(va) - de0/(2*va))
}

chib <- log_L(x, ws, mus, vas) + log_prior(ws, mus, vas)

cat("log-likelihood + log-prior =", chib, "\n")

pi.mu_va.z.x <- function(mu, va, z, x) {
   n <- sapply(1:k, function(j) sum(z==j))
   m <- sapply(1:k, function(j) sum(x[z==j]))
   m[n > 0] <- m[n > 0] / n[n > 0]
   exp(sum(dnorm(mu, mean = (n*m*va0+mu0*va)/(n*va0+va), sd = sqrt(va0*va/(n*va0+va)), log = TRUE)))
}

chib <- chib - log(mean(sapply(1:N, function(t) pi.mu_va.z.x(mus, va[t,], z[t,], x))))

cat("log-likelihood + log-prior - log-pi.mu_ =", chib, "\n")

fixed.mu <- "
   model {
       for (i in 1:n) {
           x[i] ~ dnorm(mus[z[i]], tau[z[i]])
           z[i] ~ dcat(w[])
       }
       for (i in 1:k) {
           tau[i] ~ dgamma(nu0/2, de0/2)
           va[i] <- 1/tau[i]
       }
       w ~ ddirich(a)
   }
"
data <- list(x = x, n = nn, k = k, nu0 = nu0, de0 = de0, a = a, mus = mus)
model <- jags.model(textConnection(fixed.mu), data = data, n.chains = 1, n.adapt = 100)
update(model, n.iter = burn_in)
samples <- jags.samples(model, c("va", "w", "z"), n.iter = N)

va <- matrix(samples$va, nrow = N, byrow = TRUE)
w  <- matrix(samples$w,  nrow = N, byrow = TRUE)
z  <- matrix(samples$z,  nrow = N, byrow = TRUE)

pi.va_mu.z.x <- function(va, mu, z, x) {
   n <- sapply(1:k, function(j) sum(z==j))
   de <- sapply(1:k, function(j) sum((x[z==j] - mu[j])^2))
   exp(sum(((nu0+n)/2)*log((de0+de)/2) - lgamma((nu0+n)/2) - ((nu0+n)/2+1)*log(va) - (de0+de)/(2*va)))
}

chib <- chib - log(mean(sapply(1:N, function(t) pi.va_mu.z.x(vas, mus, z[t,], x))))

cat("log-likelihood + log-prior - log-pi.mu_ - log-pi.va_ =", chib, "\n")

fixed.mu.and.va <- "
   model {
       for (i in 1:n) {
           x[i] ~ dnorm(mus[z[i]], 1/vas[z[i]])
           z[i] ~ dcat(w[])
       }
       w ~ ddirich(a)
   }
"
data <- list(x = x, n = nn, a = a, mus = mus, vas = vas)
model <- jags.model(textConnection(fixed.mu.and.va), data = data, n.chains = 1, n.adapt = 100)
update(model, n.iter = burn_in)
samples <- jags.samples(model, c("w", "z"), n.iter = N)

w <- matrix(samples$w, nrow = N, byrow = TRUE)
z <- matrix(samples$z, nrow = N, byrow = TRUE)

pi.w_z.x <- function(w, z, x) {
   n <- sapply(1:k, function(j) sum(z==j))
   exp(lgamma(sum(a)+nn) - sum(lgamma(a+n)) + sum((a+n-1)*log(w)))
}

chib <- chib - log(mean(sapply(1:N, function(t) pi.w_z.x(ws, z[t,], x))))

cat("log-likelihood + log-prior - log-pi.mu_ - log-pi.va_ - log-pi.w_ =", chib, "\n")

My question is whether there is any misunderstanding of Chib's method in the description above, or any mistake in its implementation.

There is a slight programming mistake in the prior

log_prior <- function(w, mu, va) {
   lgamma(sum(a)) - sum(lgamma(a)) + sum((a-1)*log(w))
   + sum(dnorm(mu, mean = mu0, sd = sqrt(va0), log = TRUE))
   + sum((nu0/2)*log(de0/2) - lgamma(nu0/2) - (nu0/2+1)*log(va) - de0/(2*va))
}

Because each line of the function body is a syntactically complete R expression, the three terms are evaluated as separate statements and only the last one (the inverse-gamma term) is returned; the Dirichlet and normal contributions are silently dropped. It should instead be

log_prior <- function(w, mu, va) {
   lgamma(sum(a)) - sum(lgamma(a)) + sum((a-1)*log(w)) +
     sum(dnorm(mu, mean = mu0, sd = sqrt(va0), log = TRUE)) +
     sum((nu0/2)*log(de0/2) - lgamma(nu0/2) - (nu0/2+1)*log(va) - de0/(2*va))
}
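
As an extra sanity check of the corrected prior (my addition, using hypothetical test values), the inverse-gamma term can be cross-checked against R's dgamma() through the change of variables va = 1/tau, which contributes a Jacobian of -2*log(va) on the log scale:

# log density of IG(shape, rate), via the density of 1/va ~ Gamma(shape, rate)
log_ig <- function(va, shape, rate)
   dgamma(1/va, shape = shape, rate = rate, log = TRUE) - 2*log(va)
va_test <- c(0.5, 2, 10)
all.equal(sum(log_ig(va_test, nu0/2, de0/2)),
          sum((nu0/2)*log(de0/2) - lgamma(nu0/2) - (nu0/2+1)*log(va_test) - de0/(2*va_test)))
# should return TRUE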

Rerunning the code this way leads to

> chib
[1] -228.194

which is not the value produced in Chib (1995) for that case! However, in Neal’s (1999) reanalysis of the problem, he mentions that

According to one anonymous JASA referee, the figure of -224.138 for the log of the marginal likelihood for the three component model with unequal variances that was given in Chib's paper is a "typo" with the correct figure being -228.608.

So this solves the discrepancy issue.

Source: https://stats.stackexchange.com/questions/129109
