Outliers

在散點圖上查找異常值

  • September 3, 2014

我有一組數據點,它們本應位於一條軌跡上並遵循某種模式,但有一些點偏離了主軌跡,給我的最終分析帶來了不確定性。我想得到一條乾淨的軌跡,以便之後用於分析。圖中的藍點大致就是我想用某種較精巧的方法自動找出並排除的散點,而不必手動處理。(圖:在此處輸入圖像描述)

我正在考慮使用最近鄰回歸之類的方法,但我不確定它是否是最好的方法,而且我也不太熟悉應該如何實現它才能得到合適的結果。順便說一句,我希望在不使用任何擬合程序的情況下完成這件事。

數據的轉置版本如下:

X=array([[ 0.87 , -0.01 ,  0.575,  1.212,  0.382,  0.418, -0.01 ,  0.474,
        0.432,  0.702,  0.574,  0.45 ,  0.334,  0.565,  0.414,  0.873,
        0.381,  1.103,  0.848,  0.503,  0.27 ,  0.416,  0.939,  1.211,
        1.106,  0.321,  0.709,  0.744,  0.309,  0.247,  0.47 , -0.107,
        0.925,  1.127,  0.833,  0.963,  0.385,  0.572,  0.437,  0.577,
        0.461,  0.474,  1.046,  0.892,  0.313,  1.009,  1.048,  0.349,
        1.189,  0.302,  0.278,  0.629,  0.36 ,  1.188,  0.273,  0.191,
       -0.068,  0.95 ,  1.044,  0.776,  0.726,  1.035,  0.817,  0.55 ,
        0.387,  0.476,  0.473,  0.863,  0.252,  0.664,  0.365,  0.244,
        0.238,  1.203,  0.339,  0.528,  0.326,  0.347,  0.385,  1.139,
        0.748,  0.879,  0.324,  0.265,  0.328,  0.815,  0.38 ,  0.884,
        0.571,  0.416,  0.485,  0.683,  0.496,  0.488,  1.204,  1.18 ,
        0.465,  0.34 ,  0.335,  0.447,  0.28 ,  1.02 ,  0.519,  0.335,
        1.037,  1.126,  0.323,  0.452,  0.201,  0.321,  0.285,  0.587,
        0.292,  0.228,  0.303,  0.844,  0.229,  1.077,  0.864,  0.515,
        0.071,  0.346,  0.255,  0.88 ,  0.24 ,  0.533,  0.725,  0.339,
        0.546,  0.841,  0.43 ,  0.568,  0.311,  0.401,  0.212,  0.691,
        0.565,  0.292,  0.295,  0.587,  0.545,  0.817,  0.324,  0.456,
        0.267,  0.226,  0.262,  0.338,  1.124,  0.373,  0.814,  1.241,
        0.661,  0.229,  0.416,  1.103,  0.226,  1.168,  0.616,  0.593,
        0.803,  1.124,  0.06 ,  0.573,  0.664,  0.882,  0.286,  0.139,
        1.095,  1.112,  1.167,  0.589,  0.3  ,  0.578,  0.727,  0.252,
        0.174,  0.317,  0.427,  1.184,  0.397,  0.43 ,  0.229,  0.261,
        0.632,  0.938,  0.576,  0.37 ,  0.497,  0.54 ,  0.306,  0.315,
        0.335,  0.24 ,  0.344,  0.93 ,  0.134,  0.4  ,  0.223,  1.224,
        1.187,  1.031,  0.25 ,  0.53 , -0.147,  0.087,  0.374,  0.496,
        0.441,  0.884,  0.971,  0.749,  0.432,  0.582,  0.198,  0.615,
        1.146,  0.475,  0.595,  0.304,  0.416,  0.645,  0.281,  0.576,
        1.139,  0.316,  0.892,  0.648,  0.826,  0.299,  0.381,  0.926,
        0.606],
      [-0.154, -0.392, -0.262,  0.214, -0.403, -0.363, -0.461, -0.326,
       -0.349, -0.21 , -0.286, -0.358, -0.436, -0.297, -0.394, -0.166,
       -0.389,  0.029, -0.124, -0.335, -0.419, -0.373, -0.121,  0.358,
        0.042, -0.408, -0.189, -0.213, -0.418, -0.479, -0.303, -0.645,
       -0.153,  0.098, -0.171, -0.066, -0.368, -0.273, -0.329, -0.295,
       -0.362, -0.305, -0.052, -0.171, -0.406, -0.102,  0.011, -0.375,
        0.126, -0.411, -0.42 , -0.27 , -0.407,  0.144, -0.419, -0.465,
       -0.036, -0.099,  0.007, -0.167, -0.205, -0.011, -0.151, -0.267,
       -0.368, -0.342, -0.299, -0.143, -0.42 , -0.232, -0.368, -0.417,
       -0.432,  0.171, -0.388, -0.319, -0.407, -0.379, -0.353,  0.043,
       -0.211, -0.14 , -0.373, -0.431, -0.383, -0.142, -0.345, -0.144,
       -0.302, -0.38 , -0.337, -0.2  , -0.321, -0.269,  0.406,  0.223,
       -0.322, -0.395, -0.379, -0.324, -0.424,  0.01 , -0.298, -0.386,
        0.018,  0.157, -0.384, -0.327, -0.442, -0.388, -0.387, -0.272,
       -0.397, -0.415, -0.388, -0.106, -0.504,  0.034, -0.153, -0.32 ,
       -0.271, -0.417, -0.417, -0.136, -0.447, -0.279, -0.225, -0.372,
       -0.316, -0.161, -0.331, -0.261, -0.409, -0.338, -0.437, -0.242,
       -0.328, -0.403, -0.433, -0.274, -0.331, -0.163, -0.361, -0.298,
       -0.392, -0.447, -0.429, -0.388,  0.11 , -0.348, -0.174,  0.244,
       -0.182, -0.424, -0.319,  0.088, -0.547,  0.189, -0.216, -0.228,
       -0.17 ,  0.125, -0.073, -0.266, -0.234, -0.108, -0.395, -0.395,
        0.131,  0.074,  0.514, -0.235, -0.389, -0.288, -0.22 , -0.416,
       -0.777, -0.358, -0.31 ,  0.817, -0.363, -0.328, -0.424, -0.416,
       -0.248, -0.093, -0.28 , -0.357, -0.348, -0.298, -0.384, -0.394,
       -0.362, -0.415, -0.349, -0.08 , -0.572, -0.07 , -0.423,  0.359,
        0.4  ,  0.099, -0.426, -0.252, -0.697, -0.508, -0.348, -0.254,
       -0.307, -0.116, -0.029, -0.201, -0.302, -0.25 , -0.44 , -0.233,
        0.274, -0.295, -0.223, -0.398, -0.298, -0.209, -0.389, -0.247,
        0.225, -0.395, -0.124, -0.237, -0.104, -0.361, -0.335, -0.083,
       -0.254]])

作為識別“分散”點的開始,請考慮關注內核密度估計相對較低的位置。

這個建議假設最初對點的“軌跡”(即大多數點將沿其分佈的一條或多條曲線)知之甚少,甚至一無所知;它是本著對數據進行半自動探索(而不是檢驗假設)的精神提出的。

您可能需要反覆調整內核寬度以及“相對較低”的閾值。前者已有很好的自動估計方法,而後者可以通過分析各數據點處的密度值來確定(即找出密度值偏低的那一簇點)。


例子

該圖由兩種數據組合生成:一種用紅點表示,是高精度數據;另一種用藍點表示,是在極低值附近得到的相對低精度的數據。圖的背景中是(a)內核密度估計的等高線(灰度)和(b)生成這些點時所依據的曲線(黑色)。

圖

密度相對較低的點已被自動圈出。(這些點的密度小於所有點平均密度的八分之一;請注意,下面列出的代碼所用的閾值是平均密度的十分之一,即 m/10。)它們包括大部分——但不是全部!——低精度點,以及一些高精度點(位於右上方)。位於曲線附近的低精度點(曲線位置由高精度點推斷)沒有被圈出。被圈出的高精度點凸顯了這樣一個事實:在點稀疏的地方,底層曲線的軌跡是不確定的。這是所建議方法的一個特點,而不是一種局限!


代碼

生成此示例的 R 代碼如下。它使用 ks 庫,該庫會評估點模式中的各向異性,從而得到有方向的內核形狀。這種方法對示例數據效果很好,因為其點雲往往又長又窄。

#
# Flag "scatter" points as those lying where a 2-D kernel density estimate is
# relatively low, illustrated on simulated data.
#
# `ks` supplies kde(). library() stops immediately when the package is not
# installed, whereas require() merely returns FALSE and lets the script fail
# later at the first kde() call.
#
library(ks)
#
# Simulate some data.
#
# The underlying curve; note the pole at x = 1.2.
f <- function(x) -0.55 + 0.45 * x + 0.01 / (1.2 - x)^2

set.seed(17)
n1 <- 280 # Number of group-1 points (small noise, drawn in red below)
n2 <- 15  # Number of group-2 points (large noise near x = 0.1, drawn in blue)
x <- c(1.2 - rbeta(n1, 0.9, 0.6), rep(0.1, n2))
y <- f(x)
# Add Gaussian noise to both coordinates; group 2 gets much larger spreads.
d <- data.frame(
  x = x + c(rnorm(n1, 0, 0.025), rnorm(n2, 0, 0.1)),
  y = y + c(rnorm(n1, 0, 0.025), rnorm(n2, 0, 0.33)),
  group = c(rep(1, n1), rep(2, n2))
)
# Omit any high-y points. The filter uses the noisy column d$y, not the
# noise-free global `y` computed above.
d <- d[d$y <= 1.0, ]
#
# Plot the density estimate.
#
p <- cbind(d$x, d$y)
dens <- kde(p)
n.levels <- 13
colors <- gray(seq(1, 0, length.out = n.levels)) # White (low) to black (high)
plot(dens,
  display = "filled.contour2",
  cont = seq(0, 100, length.out = n.levels),
  col = colors, xlab = "X", ylab = "Y"
)
#
# Evaluate densities at the data points.
#
dens.at.points <- kde(p, eval.points = p)
d$Density <- dens.at.points$estimate
#
# Plot the (correct) curve and the points.
#
curve(f(x), add = TRUE, to = 1.2, col = "Black")
# Symbol size grows with the local density. No `ylim` is passed here: it is
# not a graphical parameter for points() (the axes were already fixed by the
# plot() call above) and would only raise a warning.
points(d$x, d$y,
  pch = 19, cex = sqrt(d$Density / 8),
  col = ifelse(d$group == 1, "Red", "Blue")
)
#
# Highlight some low-density points.
#
# A point counts as "low density" when its density is below the mean density
# divided by this factor.
# NOTE(review): the accompanying text describes a one-eighth threshold while
# this code uses one-tenth -- confirm which one produced the published figure.
threshold.divisor <- 10
m <- mean(d$Density)
e <- d[d$Density < m / threshold.divisor, ]
points(e$x, e$y, col = "#00000080") # Semi-transparent black circles

引用自:https://stats.stackexchange.com/questions/114214

comments powered by Disqus