\documentclass[a4paper,11pt]{article}
\usepackage{amsmath,amssymb,amsopn,natbib}
\usepackage[left=2.5cm,top=2.5cm,right=2.5cm,bottom=2.5cm]{geometry}

\renewcommand{\today}{\begingroup \number \day\space \ifcase \month \or
  January\or February\or March\or April\or May\or June\or July\or August\or
  September\or October\or November\or December\fi \space \number \year \endgroup}

\renewcommand{\vec}[1]{\boldsymbol{#1}}
\newcommand{\mat}[1]{\mathbf{#1}}
\def\bH{\mat{H}}
\def\vecx{\vec{x}}
\def\vecX{\vec{X}}
\DeclareMathOperator{\E}{\boldsymbol{\mathbb{E}}}
\let\code=\texttt
\let\proglang=\texttt
\let\pkg=\texttt

%\VignetteIndexEntry{kde}
%\SweaveOpts{eps=FALSE}

\title{ks: Kernel density estimation for bivariate data}
\author{Tarn Duong}

\begin{document}

\maketitle

\noindent Kernel density estimation is a popular tool for visualising the
distribution of data; see, for example, \citet*{simonoff1996} for an overview.
Multivariate kernel density estimation is usually considered in the
constrained context of diagonal bandwidth matrices, e.g. in the \proglang{R}
packages \pkg{sm} \citep*{sm} and \pkg{KernSmooth} \citep*{KernSmooth}. We
introduce a new \proglang{R} package, \pkg{ks}, which implements data-driven
selectors for both diagonal and unconstrained bandwidth matrices for kernel
density estimation, and which can also be used for multivariate kernel
discriminant analysis. The \pkg{ks} package implements selectors for 1- to
6-dimensional data. This vignette is only a brief introduction to using
\pkg{ks} for kernel density estimation of 2-dimensional data; see
\citet*{duong2007c} for a more detailed account.

For a bivariate random sample $\vecX_1, \vecX_2, \ldots, \vecX_n$ drawn from a
density $f$, the kernel density estimate is defined by
$$
  \hat{f}(\vecx; \bH) = n^{-1} \sum_{i=1}^n K_{\bH}(\vecx - \vecX_i)
$$
where $\vecx = (x_1, x_2)^T$ and $\vecX_i = (X_{i1}, X_{i2})^T$, $i = 1, 2,
\ldots, n$. Here $K(\vecx)$ is the kernel, a symmetric probability density
function; $\bH$ is the bandwidth matrix, which is symmetric and positive
definite; and $K_{\bH}(\vecx) = |\bH|^{-1/2} K(\bH^{-1/2} \vecx)$. The choice
of $K$ is not crucial: we take $K(\vecx) = (2\pi)^{-1} \exp(-\tfrac{1}{2}
\vecx^T \vecx)$, the standard bivariate normal, throughout. In contrast, the
choice of $\bH$ is crucial in determining the performance of $\hat f$. The most
common parameterisations of the bandwidth matrix are the diagonal and the
general or unconstrained, which places no restrictions on $\bH$ other than
symmetry and positive definiteness, that is
$$
  \bH = \begin{bmatrix} h_1^2 & 0 \\ 0 & h_2^2 \end{bmatrix}
  \quad \mathrm{or} \quad
  \bH = \begin{bmatrix} h_1^2 & h_{12} \\ h_{12} & h_2^2 \end{bmatrix}.
$$
The latter parameterisation allows kernels to have an arbitrary orientation,
whereas the former only allows kernels which are oriented to the co-ordinate
axes.

For our target density, we use the `dumbbell' density, given by the normal
mixture
$$
  \frac{4}{11} N \bigg( \begin{bmatrix} -2 \\ 2 \end{bmatrix},
    \begin{bmatrix} 1 & 0 \\ 0 & 1 \end{bmatrix} \bigg)
  + \frac{3}{11} N \bigg( \begin{bmatrix} 0 \\ 0 \end{bmatrix},
    \begin{bmatrix} 0.8 & -0.72 \\ -0.72 & 0.8 \end{bmatrix} \bigg)
  + \frac{4}{11} N \bigg( \begin{bmatrix} 2 \\ -2 \end{bmatrix},
    \begin{bmatrix} 1 & 0 \\ 0 & 1 \end{bmatrix} \bigg),
$$
displayed on the left in Figure \ref{fig:dens-db}. This density is unimodal.
On the right is a sample of 200 data points.
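Before generating the dumbbell sample, it is worth seeing the defining formula
in action. The chunk below is a minimal illustrative sketch and is not run when
the vignette is built: the helper \code{kde.point} and the toy data are not
part of \pkg{ks}, and the cross-check in the last line assumes the
\code{eval.points} argument of \code{kde}.

<<eval=FALSE>>=
## illustrative sketch (not part of ks): direct evaluation of
## fhat(x; H) = n^{-1} sum_i K_H(x - X_i) with the standard normal kernel,
## where K_H(v) = (2 pi)^{-1} |H|^{-1/2} exp(-v^T H^{-1} v / 2)
library(ks)    ## only needed for the kde() cross-check below
kde.point <- function(x, X, H)
{
  Hinv <- solve(H)
  diff <- sweep(X, 2, x)                   ## rows are X_i - x
  quad <- rowSums((diff %*% Hinv) * diff)  ## (X_i - x)^T H^{-1} (X_i - x)
  mean(exp(-quad/2)) / (2*pi*sqrt(det(H)))
}

X <- matrix(rnorm(200), ncol=2)              ## toy bivariate normal sample
H <- matrix(c(0.3, 0.1, 0.1, 0.3), nrow=2)   ## hand-picked unconstrained bandwidth
kde.point(c(0,0), X, H)
## should agree with kde() evaluated at the same point
kde(x=X, H=H, eval.points=rbind(c(0,0)))$estimate
@

Changing the off-diagonal entry of \code{H} tilts the kernel orientation away
from the co-ordinate axes; this is the extra flexibility of the unconstrained
parameterisation that matters for the obliquely oriented dumbbell data below.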
<<>>=
library(ks)
set.seed(8192)
samp <- 200
mus <- rbind(c(-2,2), c(0,0), c(2,-2))
Sigmas <- rbind(diag(2), matrix(c(0.8, -0.72, -0.72, 0.8), nrow=2), diag(2))
cwt <- 3/11
props <- c((1-cwt)/2, cwt, (1-cwt)/2)
x <- rmvnorm.mixt(n=samp, mus=mus, Sigmas=Sigmas, props=props)
@

\setkeys{Gin}{width=0.45\textwidth}
\begin{figure}[!ht]
\begin{center}
<<fig=TRUE, echo=FALSE>>=
plotmixt(mus=mus, Sigmas=Sigmas, props=props, xlim=c(-4,4), ylim=c(-4,4))
@
<<fig=TRUE, echo=FALSE>>=
plot(x, xlim=c(-4,4), ylim=c(-4,4), xlab="x", ylab="y")
@
\end{center}
\caption{Target `dumbbell' density: (left) contour plot; (right) scatter plot
  of the sample of 200 data points.}
\label{fig:dens-db}
\end{figure}

We use \code{Hpi} for unconstrained plug-in selectors and \code{Hpi.diag} for
diagonal plug-in selectors.
<<>>=
Hpi1 <- Hpi(x=x)
Hpi2 <- Hpi.diag(x=x)
@

To compute a kernel density estimate, the command is \code{kde}, which creates
an object of class \code{kde}:
<<>>=
fhat.pi1 <- kde(x=x, H=Hpi1)
fhat.pi2 <- kde(x=x, H=Hpi2)
@

We use the \code{plot} method for \code{kde} objects to display these kernel
density estimates. The default is a contour plot with the upper 25\%, 50\% and
75\% contours of the (sample) highest density regions.
%, as
%defined in \citet*{bowman1993} and \citet*{hyndman1996}.
These regions are also plotted by the \pkg{sm} package.
<<eval=FALSE>>=
plot(fhat.pi1)
plot(fhat.pi2)
@

The resulting kernel density estimates are displayed in Figure \ref{fig:pi}.
The diagonal bandwidth matrix constrains the smoothing to directions parallel
to the co-ordinate axes, so it cannot apply an appropriate amount of smoothing
to the obliquely oriented central portion; the result is a multimodal density
estimate. The unconstrained bandwidth matrix correctly produces a unimodal
density estimate.

\begin{figure}[!ht]
\centering
<<fig=TRUE, echo=FALSE>>=
plot(fhat.pi1, main="Plug-in", cex.main=1.4, xlim=c(-4,4), ylim=c(-4,4))
@
<<fig=TRUE, echo=FALSE>>=
plot(fhat.pi2, main="Plug-in diagonal", cex.main=1.4, xlim=c(-4,4), ylim=c(-4,4))
@
\caption{Kernel density estimates with plug-in selectors}
\label{fig:pi}
\end{figure}

The unconstrained smoothed cross-validation (SCV) selector is \code{Hscv} and
its diagonal version is \code{Hscv.diag}. In Figure \ref{fig:cv}, the more
reasonable density estimate is again the one from the unconstrained selector.
<<>>=
Hscv1 <- Hscv(x=x)
Hscv2 <- Hscv.diag(x=x)
@
<<>>=
fhat.cv1 <- kde(x=x, H=Hscv1)
fhat.cv2 <- kde(x=x, H=Hscv2)
@

\begin{figure}[!ht]
\centering
<<fig=TRUE, echo=FALSE>>=
plot(fhat.cv1, main="SCV", cex.main=1.4, xlim=c(-4,4), ylim=c(-4,4))
@
<<fig=TRUE, echo=FALSE>>=
plot(fhat.cv2, main="SCV diagonal", cex.main=1.4, xlim=c(-4,4), ylim=c(-4,4))
@
\caption{Kernel density estimates with cross-validation selectors}
\label{fig:cv}
\end{figure}

Unconstrained bandwidth selectors are better than their diagonal counterparts
when the data have substantial probability mass oriented obliquely to the
co-ordinate axes, as for the dumbbell data. The unconstrained plug-in and SCV
selectors can be viewed as generally recommended selectors.

\bibliographystyle{apalike}

\begin{thebibliography}{}

\bibitem[Bowman and Azzalini, 2007]{sm}
Bowman, A. W. and Azzalini, A. (2007).
\newblock {\em sm: kernel smoothing methods: Bowman and Azzalini (1997)}.
\newblock R package version 2.2.

\bibitem[Duong, 2007]{duong2007c}
Duong, T. (2007).
\newblock ks: {K}ernel density estimation and kernel discriminant analysis for
  multivariate data in {R}.
\newblock {\em Journal of Statistical Software}, \textbf{21}(7).
\newblock URL \texttt{http://www.jstatsoft.org/v21/i07}.

\bibitem[Simonoff, 1996]{simonoff1996}
Simonoff, J. S. (1996).
\newblock {\em Smoothing Methods in Statistics}.
\newblock Springer-Verlag, New York.

\bibitem[Wand, 2006]{KernSmooth}
Wand, M. P. (2006).
\newblock {\em KernSmooth: Functions for kernel smoothing for Wand \& Jones
  (1995)}.
\newblock R package version 2.22-19. R port by Brian Ripley.

\end{thebibliography}

\end{document}