Outliers
在散點圖上查找異常值
我有一組數據點應該位於一個軌跡上並遵循一個模式,但是有一些偏離主軌跡的散點會導致我最終分析中的不確定性。我想獲得一個整潔的軌跡,以便以後應用它進行分析。藍點大致就是我想用較為精巧的方法自動找出並排除的散點,而不必手動處理。
我正在考慮使用最近鄰回歸之類的方法,但我不確定它是否是最好的方法,或者我不太熟悉應該如何實施它才能給我一個合適的結果。順便說一句,我想在沒有任何擬合程序的情況下進行。
數據的轉置版本如下:
X=array([[ 0.87 , -0.01 , 0.575, 1.212, 0.382, 0.418, -0.01 , 0.474, 0.432, 0.702, 0.574, 0.45 , 0.334, 0.565, 0.414, 0.873, 0.381, 1.103, 0.848, 0.503, 0.27 , 0.416, 0.939, 1.211, 1.106, 0.321, 0.709, 0.744, 0.309, 0.247, 0.47 , -0.107, 0.925, 1.127, 0.833, 0.963, 0.385, 0.572, 0.437, 0.577, 0.461, 0.474, 1.046, 0.892, 0.313, 1.009, 1.048, 0.349, 1.189, 0.302, 0.278, 0.629, 0.36 , 1.188, 0.273, 0.191, -0.068, 0.95 , 1.044, 0.776, 0.726, 1.035, 0.817, 0.55 , 0.387, 0.476, 0.473, 0.863, 0.252, 0.664, 0.365, 0.244, 0.238, 1.203, 0.339, 0.528, 0.326, 0.347, 0.385, 1.139, 0.748, 0.879, 0.324, 0.265, 0.328, 0.815, 0.38 , 0.884, 0.571, 0.416, 0.485, 0.683, 0.496, 0.488, 1.204, 1.18 , 0.465, 0.34 , 0.335, 0.447, 0.28 , 1.02 , 0.519, 0.335, 1.037, 1.126, 0.323, 0.452, 0.201, 0.321, 0.285, 0.587, 0.292, 0.228, 0.303, 0.844, 0.229, 1.077, 0.864, 0.515, 0.071, 0.346, 0.255, 0.88 , 0.24 , 0.533, 0.725, 0.339, 0.546, 0.841, 0.43 , 0.568, 0.311, 0.401, 0.212, 0.691, 0.565, 0.292, 0.295, 0.587, 0.545, 0.817, 0.324, 0.456, 0.267, 0.226, 0.262, 0.338, 1.124, 0.373, 0.814, 1.241, 0.661, 0.229, 0.416, 1.103, 0.226, 1.168, 0.616, 0.593, 0.803, 1.124, 0.06 , 0.573, 0.664, 0.882, 0.286, 0.139, 1.095, 1.112, 1.167, 0.589, 0.3 , 0.578, 0.727, 0.252, 0.174, 0.317, 0.427, 1.184, 0.397, 0.43 , 0.229, 0.261, 0.632, 0.938, 0.576, 0.37 , 0.497, 0.54 , 0.306, 0.315, 0.335, 0.24 , 0.344, 0.93 , 0.134, 0.4 , 0.223, 1.224, 1.187, 1.031, 0.25 , 0.53 , -0.147, 0.087, 0.374, 0.496, 0.441, 0.884, 0.971, 0.749, 0.432, 0.582, 0.198, 0.615, 1.146, 0.475, 0.595, 0.304, 0.416, 0.645, 0.281, 0.576, 1.139, 0.316, 0.892, 0.648, 0.826, 0.299, 0.381, 0.926, 0.606], [-0.154, -0.392, -0.262, 0.214, -0.403, -0.363, -0.461, -0.326, -0.349, -0.21 , -0.286, -0.358, -0.436, -0.297, -0.394, -0.166, -0.389, 0.029, -0.124, -0.335, -0.419, -0.373, -0.121, 0.358, 0.042, -0.408, -0.189, -0.213, -0.418, -0.479, -0.303, -0.645, -0.153, 0.098, -0.171, -0.066, -0.368, -0.273, -0.329, -0.295, -0.362, -0.305, -0.052, -0.171, 
-0.406, -0.102, 0.011, -0.375, 0.126, -0.411, -0.42 , -0.27 , -0.407, 0.144, -0.419, -0.465, -0.036, -0.099, 0.007, -0.167, -0.205, -0.011, -0.151, -0.267, -0.368, -0.342, -0.299, -0.143, -0.42 , -0.232, -0.368, -0.417, -0.432, 0.171, -0.388, -0.319, -0.407, -0.379, -0.353, 0.043, -0.211, -0.14 , -0.373, -0.431, -0.383, -0.142, -0.345, -0.144, -0.302, -0.38 , -0.337, -0.2 , -0.321, -0.269, 0.406, 0.223, -0.322, -0.395, -0.379, -0.324, -0.424, 0.01 , -0.298, -0.386, 0.018, 0.157, -0.384, -0.327, -0.442, -0.388, -0.387, -0.272, -0.397, -0.415, -0.388, -0.106, -0.504, 0.034, -0.153, -0.32 , -0.271, -0.417, -0.417, -0.136, -0.447, -0.279, -0.225, -0.372, -0.316, -0.161, -0.331, -0.261, -0.409, -0.338, -0.437, -0.242, -0.328, -0.403, -0.433, -0.274, -0.331, -0.163, -0.361, -0.298, -0.392, -0.447, -0.429, -0.388, 0.11 , -0.348, -0.174, 0.244, -0.182, -0.424, -0.319, 0.088, -0.547, 0.189, -0.216, -0.228, -0.17 , 0.125, -0.073, -0.266, -0.234, -0.108, -0.395, -0.395, 0.131, 0.074, 0.514, -0.235, -0.389, -0.288, -0.22 , -0.416, -0.777, -0.358, -0.31 , 0.817, -0.363, -0.328, -0.424, -0.416, -0.248, -0.093, -0.28 , -0.357, -0.348, -0.298, -0.384, -0.394, -0.362, -0.415, -0.349, -0.08 , -0.572, -0.07 , -0.423, 0.359, 0.4 , 0.099, -0.426, -0.252, -0.697, -0.508, -0.348, -0.254, -0.307, -0.116, -0.029, -0.201, -0.302, -0.25 , -0.44 , -0.233, 0.274, -0.295, -0.223, -0.398, -0.298, -0.209, -0.389, -0.247, 0.225, -0.395, -0.124, -0.237, -0.104, -0.361, -0.335, -0.083, -0.254]])
作為識別“分散”點的開始,請考慮關注內核密度估計相對較低的位置。
這個建議假設最初對點的“軌跡”知之甚少,甚至一無所知——它們中的大多數將沿著一條或多條曲線落下——它是本著對數據進行半自動探索的精神提出的(而不是檢驗假設)。
您可能需要反覆調整內核寬度和“相對較低”的閾值。存在很好的自動方法來估計前者,而後者可以通過分析各數據點處的密度來確定(例如識別出低值集群)。
例子
該圖是由兩種數據組合生成的:一種用紅點表示,是高精度數據;另一種用藍點表示,是在極低值附近得到的相對低精度的數據。在其背景中是(a)內核密度估計的輪廓(灰度)和(b)生成這些點時所圍繞的曲線(黑色)。
密度相對較低的點已被自動圈出。(這些點的密度小於所有點平均密度的八分之一。)它們包括大部分——但不是全部!——低精度點和一些高精度點(位於右上方)。位於曲線附近的低精度點(由高精度點推斷)沒有被圈出。高精度點被圈出,凸顯了點稀疏的地方,在那裡底層曲線的軌跡將是不確定的。這是建議方法的一個特點,而不是限制!
代碼
R
生成此示例的代碼如下。它使用該ks
庫評估點模式中的各向異性來開發定向內核形狀。這種方法在樣本數據中效果很好,其點雲往往又長又瘦。

# Example: flag scattered points as those with a low kernel density estimate.
# Requires the `ks` package; library() fails immediately if it is missing.
library(ks)

#
# Simulate some data.
#
# The underlying curve: a linear trend plus a term that grows without bound
# as x approaches 1.2.
f <- function(x) {
  -0.55 + 0.45 * x + 0.01 / (1.2 - x)^2
}

set.seed(17)
n1 <- 280 # Size of group 1 (noise sd 0.025 in both coordinates)
n2 <- 15  # Size of group 2 (noise sd 0.1 in x, 0.33 in y), all at x = 0.1
x <- c(1.2 - rbeta(n1, 0.9, 0.6), rep(0.1, n2))
y <- f(x)
# The x-noise is drawn before the y-noise; keep this order so that the seed
# reproduces the same sample.
d <- data.frame(
  x = x + c(rnorm(n1, 0, 0.025), rnorm(n2, 0, 0.1)),
  y = y + c(rnorm(n1, 0, 0.025), rnorm(n2, 0, 0.33)),
  group = c(rep(1, n1), rep(2, n2))
)
d <- subset(d, subset = (y <= 1.0)) # Omit any high-y points

#
# Plot the density estimate.
#
p <- cbind(d$x, d$y)
dens <- kde(p)
n_levels <- 13
# White-to-black ramp; named `shades` to avoid masking grDevices::colors().
shades <- gray(seq(1, 0, length.out = n_levels))
plot(dens,
  display = "filled.contour2",
  cont = seq(0, 100, length.out = n_levels),
  col = shades, xlab = "X", ylab = "Y"
)

#
# Evaluate densities at the data points.
#
dens <- kde(p, eval.points = p)
d$Density <- dens$estimate

#
# Plot the (correct) curve and the points.
#
curve(f(x), add = TRUE, to = 1.2, col = "Black")
# Symbol size grows with the estimated density at each point. No `ylim` is
# passed: points() draws on the existing plot and cannot change its axes.
points(d$x, d$y,
  pch = 19, cex = sqrt(d$Density / 8),
  col = ifelse(d$group == 1, "Red", "Blue")
)

#
# Highlight some low-density points.
#
m <- mean(d$Density)
# NOTE(review): the accompanying text describes the cutoff as one-eighth of
# the mean density, while this code uses one-tenth -- confirm which value
# produced the published figure.
e <- subset(d, subset = (Density < m / 10))
points(e$x, e$y, col = "#00000080")