\documentclass[a4paper,11pt]{article}
\usepackage{amsmath,amssymb,amsopn,natbib}
\usepackage[left=2.5cm,top=2.5cm,right=2.5cm,bottom=2.5cm]{geometry}

\renewcommand{\today}{\begingroup \number \day\space \ifcase \month \or
  January\or February\or March\or April\or May\or June\or July\or August\or
  September\or October\or November\or December\fi \space \number \year \endgroup}

\renewcommand{\vec}[1]{\boldsymbol{#1}}
\newcommand{\mat}[1]{\mathbf{#1}}
\def\bH{\mat{H}}
\def\vecx{\vec{x}}
\def\vecX{\vec{X}}
\DeclareMathOperator{\E}{\boldsymbol{\mathbb{E}}}
\let\code=\texttt
\let\proglang=\texttt
\let\pkg=\texttt

%\VignetteIndexEntry{kde}
%\SweaveOpts{eps=FALSE}

\title{ks: Kernel density estimation for bivariate data}
\author{Tarn Duong}

\begin{document}

\maketitle

\noindent Kernel density estimation is a popular tool for visualising the
distribution of data; see, for example, \citet*{simonoff1996} for an overview.
Multivariate kernel density estimation is usually considered in the
constrained context of diagonal bandwidth matrices, e.g. in the \proglang{R}
packages \pkg{sm} \citep*{sm} and \pkg{KernSmooth} \citep*{KernSmooth}. We
introduce a new \proglang{R} package, \pkg{ks}, which implements data-driven
selectors for both diagonal and unconstrained bandwidth matrices for kernel
density estimation, and which can also be used for multivariate kernel
discriminant analysis. The \pkg{ks} package implements selectors for 1- to
6-dimensional data. This vignette is only a brief introduction to using
\pkg{ks} for kernel density estimation of 2-dimensional data; see
\citet*{duong2007c} for a more detailed account.

For a bivariate random sample $\vecX_1, \vecX_2, \ldots, \vecX_n$ drawn from a
density $f$, the kernel density estimate is defined by
$$
  \hat{f}(\vecx; \bH) = n^{-1} \sum_{i=1}^n K_{\bH}(\vecx - \vecX_i)
$$
where $\vecx = (x_1, x_2)^T$ and $\vecX_i = (X_{i1}, X_{i2})^T$, $i = 1, 2,
\ldots, n$. Here $K(\vecx)$ is the kernel, a symmetric probability density
function; $\bH$ is the bandwidth matrix, which is symmetric and positive
definite; and $K_{\bH}(\vecx) = |\bH|^{-1/2} K(\bH^{-1/2} \vecx)$. The choice
of $K$ is not crucial: we take $K(\vecx) = (2\pi)^{-1} \exp(-\tfrac{1}{2}
\vecx^T \vecx)$, the standard bivariate normal, throughout. In contrast, the
choice of $\bH$ is crucial in determining the performance of $\hat f$. The most
common parameterisations of the bandwidth matrix are the diagonal and the
general or unconstrained, which places no restrictions on $\bH$ other than
symmetry and positive definiteness, that is
$$
  \bH = \begin{bmatrix} h_1^2 & 0 \\ 0 & h_2^2 \end{bmatrix}
  \quad \mathrm{or} \quad
  \bH = \begin{bmatrix} h_1^2 & h_{12} \\ h_{12} & h_2^2 \end{bmatrix}.
$$
The latter parameterisation allows kernels to have an arbitrary orientation,
whereas the former only allows kernels which are oriented to the co-ordinate
axes.

For our target density, we use the `dumbbell' density, given by the normal
mixture
$$
  \frac{4}{11} N \bigg( \begin{bmatrix} -2 \\ 2 \end{bmatrix},
    \begin{bmatrix} 1 & 0 \\ 0 & 1 \end{bmatrix} \bigg)
  + \frac{3}{11} N \bigg( \begin{bmatrix} 0 \\ 0 \end{bmatrix},
    \begin{bmatrix} 0.8 & -0.72 \\ -0.72 & 0.8 \end{bmatrix} \bigg)
  + \frac{4}{11} N \bigg( \begin{bmatrix} 2 \\ -2 \end{bmatrix},
    \begin{bmatrix} 1 & 0 \\ 0 & 1 \end{bmatrix} \bigg),
$$
displayed on the left in Figure \ref{fig:dens-db}. This density is unimodal.
On the right is a sample of 200 data points.
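Before generating the dumbbell sample, it is worth seeing the defining formula
in action. The chunk below is a minimal illustrative sketch and is not run when
the vignette is built: the helper \code{kde.point} and the toy data are not
part of \pkg{ks}, and the cross-check in the last line assumes the
\code{eval.points} argument of \code{kde}.

<<eval=FALSE>>=
## illustrative sketch (not part of ks): direct evaluation of
## fhat(x; H) = n^{-1} sum_i K_H(x - X_i) with the standard normal kernel,
## where K_H(v) = (2 pi)^{-1} |H|^{-1/2} exp(-v^T H^{-1} v / 2)
library(ks)    ## only needed for the kde() cross-check below
kde.point <- function(x, X, H)
{
  Hinv <- solve(H)
  diff <- sweep(X, 2, x)                   ## rows are X_i - x
  quad <- rowSums((diff %*% Hinv) * diff)  ## (X_i - x)^T H^{-1} (X_i - x)
  mean(exp(-quad/2)) / (2*pi*sqrt(det(H)))
}

X <- matrix(rnorm(200), ncol=2)              ## toy bivariate normal sample
H <- matrix(c(0.3, 0.1, 0.1, 0.3), nrow=2)   ## hand-picked unconstrained bandwidth
kde.point(c(0,0), X, H)
## should agree with kde() evaluated at the same point
kde(x=X, H=H, eval.points=rbind(c(0,0)))$estimate
@

Changing the off-diagonal entry of \code{H} tilts the kernel orientation away
from the co-ordinate axes; this is the extra flexibility of the unconstrained
parameterisation that matters for the obliquely oriented dumbbell data below.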
<<>>=
library(ks)
set.seed(8192)
samp <- 200
mus <- rbind(c(-2,2), c(0,0), c(2,-2))
Sigmas <- rbind(diag(2), matrix(c(0.8, -0.72, -0.72, 0.8), nrow=2), diag(2))
cwt <- 3/11
props <- c((1-cwt)/2, cwt, (1-cwt)/2)
x <- rmvnorm.mixt(n=samp, mus=mus, Sigmas=Sigmas, props=props)
@

\setkeys{Gin}{width=0.45\textwidth}
\begin{figure}[!ht]
\begin{center}
<<fig=TRUE, echo=FALSE>>=
plotmixt(mus=mus, Sigmas=Sigmas, props=props, xlim=c(-4,4), ylim=c(-4,4))
@
<<fig=TRUE, echo=FALSE>>=
plot(x, xlim=c(-4,4), ylim=c(-4,4), xlab="x", ylab="y")
@
\end{center}
\caption{Target `dumbbell' density: (left) contour plot; (right) scatter plot
  of the sample of 200 data points.}
\label{fig:dens-db}
\end{figure}

We use \code{Hpi} for unconstrained plug-in selectors and \code{Hpi.diag} for
diagonal plug-in selectors.
<<>>=
Hpi1 <- Hpi(x=x)
Hpi2 <- Hpi.diag(x=x)
@

To compute a kernel density estimate, the command is \code{kde}, which creates
an object of class \code{kde}:
<<>>=
fhat.pi1 <- kde(x=x, H=Hpi1)
fhat.pi2 <- kde(x=x, H=Hpi2)
@

We use the \code{plot} method for \code{kde} objects to display these kernel
density estimates. The default is a contour plot with the upper 25\%, 50\% and
75\% contours of the (sample) highest density regions.
%, as
%defined in \citet*{bowman1993} and \citet*{hyndman1996}.
These regions are also plotted by the \pkg{sm} package.
<<eval=FALSE>>=
plot(fhat.pi1)
plot(fhat.pi2)
@

The resulting kernel density estimates are displayed in Figure \ref{fig:pi}.
The diagonal bandwidth matrix constrains the smoothing to directions parallel
to the co-ordinate axes, so it cannot apply an appropriate amount of smoothing
to the obliquely oriented central portion; the result is a multimodal density
estimate. The unconstrained bandwidth matrix correctly produces a unimodal
density estimate.

\begin{figure}[!ht]
\centering
<<fig=TRUE, echo=FALSE>>=
plot(fhat.pi1, main="Plug-in", cex.main=1.4, xlim=c(-4,4), ylim=c(-4,4))
@
<<fig=TRUE, echo=FALSE>>=
plot(fhat.pi2, main="Plug-in diagonal", cex.main=1.4, xlim=c(-4,4), ylim=c(-4,4))
@
\caption{Kernel density estimates with plug-in selectors}
\label{fig:pi}
\end{figure}

The unconstrained smoothed cross-validation (SCV) selector is \code{Hscv} and
its diagonal version is \code{Hscv.diag}. In Figure \ref{fig:cv}, the more
reasonable density estimate is again the one from the unconstrained selector.
<<>>=
Hscv1 <- Hscv(x=x)
Hscv2 <- Hscv.diag(x=x)
@
<<>>=
fhat.cv1 <- kde(x=x, H=Hscv1)
fhat.cv2 <- kde(x=x, H=Hscv2)
@

\begin{figure}[!ht]
\centering
<<fig=TRUE, echo=FALSE>>=
plot(fhat.cv1, main="SCV", cex.main=1.4, xlim=c(-4,4), ylim=c(-4,4))
@
<<fig=TRUE, echo=FALSE>>=
plot(fhat.cv2, main="SCV diagonal", cex.main=1.4, xlim=c(-4,4), ylim=c(-4,4))
@
\caption{Kernel density estimates with cross-validation selectors}
\label{fig:cv}
\end{figure}

Unconstrained bandwidth selectors are better than their diagonal counterparts
when the data have substantial probability mass oriented obliquely to the
co-ordinate axes, as for the dumbbell data. The unconstrained plug-in and SCV
selectors can be viewed as generally recommended selectors.

\bibliographystyle{apalike}

\begin{thebibliography}{}

\bibitem[Bowman and Azzalini, 2007]{sm}
Bowman, A. W. and Azzalini, A. (2007).
\newblock {\em sm: kernel smoothing methods: Bowman and Azzalini (1997)}.
\newblock R package version 2.2.

\bibitem[Duong, 2007]{duong2007c}
Duong, T. (2007).
\newblock ks: {K}ernel density estimation and kernel discriminant analysis for
  multivariate data in {R}.
\newblock {\em Journal of Statistical Software}, \textbf{21}(7).
\newblock URL \texttt{http://www.jstatsoft.org/v21/i07}.

\bibitem[Simonoff, 1996]{simonoff1996}
Simonoff, J. S. (1996).
\newblock {\em Smoothing Methods in Statistics}.
\newblock Springer-Verlag, New York.

\bibitem[Wand, 2006]{KernSmooth}
Wand, M. P. (2006).
\newblock {\em KernSmooth: Functions for kernel smoothing for Wand \& Jones
  (1995)}.
\newblock R package version 2.22-19. R port by Brian Ripley.

\end{thebibliography}

\end{document}