% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/SelectionVarStat.R
\name{SelectionVarStat}
\alias{SelectionVarStat}
\title{Fast feature (m/z) selection using multiple hypothesis testing (LIMMA/ANOVA/Kruskal) with optional class balancing (no/up/down/SMOTE)}
\usage{
SelectionVarStat(
  X,
  Y,
  stat.test = c("Limma", "anova", "kruskal"),
  pi0.method = "abh",
  fdr = 0.05,
  Sampling = c("no", "up", "down", "smote"),
  seed = NULL
)
}
\arguments{
\item{X}{Numeric matrix with samples in rows and features (peaks) in columns.
If a sparse Matrix is provided, it is coerced to a base R dense matrix.
Infinities are set to NA; NAs are allowed.}

\item{Y}{Class labels. A factor (or coercible to factor) of length \code{nrow(X)}.
Must contain at least two levels.}

\item{stat.test}{Character string; which univariate test to use. One of
\code{"Limma"}, \code{"anova"}, or \code{"kruskal"}. Default is \code{"Limma"}.
\itemize{
\item Limma uses \code{limma::lmFit} on \code{~ 0 + Y}, applies k-1 contrasts
versus a reference group with \code{limma::makeContrasts} and then
\code{limma::eBayes} to compute a moderated F p-value testing overall
between-group differences.
\item anova uses a fast C++ one-way ANOVA (upper-tail F p-value).
\item kruskal uses a fast C++ Kruskal–Wallis test with tie correction
(upper-tail chi-square p-value).
}}

\item{pi0.method}{Character; method for \code{cp4p::estim.pi0} and
\code{cp4p::adjust.p}. Default \code{"abh"}. See cp4p documentation for options.}

\item{fdr}{Numeric in (0, 1]; FDR threshold used to select features after
p-value adjustment. Default \code{0.05}.}

\item{Sampling}{Character string; optional class balancing applied before testing.
One of \code{"no"}, \code{"up"}, \code{"down"}, \code{"smote"}. Default \code{"no"}.
\itemize{
\item "up": up-samples minority classes to the majority size (base R implementation).
\item "down": down-samples majority classes to the minority size (base R).
\item "smote": uses the package’s internal \code{smote_classif()} on \code{data.frame(Y, X)}.
Column names of \code{X} are preserved; if SMOTE fails or yields <2 classes,
the function falls back to up-sampling.
}}

\item{seed}{Optional integer. If provided, sets the random seed for
up/down sampling and SMOTE to ensure reproducibility.}
}
\value{
A list with:
\itemize{
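\item \code{nb_to_sel}: integer, estimated number of discriminant features, computed as \code{floor(ncol(X) * (1 - pi0))}, where \code{pi0} is the proportion of true null hypotheses estimated by \code{cp4p::estim.pi0}.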
\item \code{n_selected_fdr}: integer, number of features with adjusted p-value < \code{fdr}.
\item \code{sel_moz}: character vector of selected feature names (columns of \code{X})
with adjusted p-value < \code{fdr}.
\item \code{ap}: the full object returned by \code{cp4p::adjust.p} (contains adjusted p-values).
}
}
\description{
Runs univariate statistical tests across all features (columns) to identify
discriminant m/z values. Supports three tests:
\itemize{
\item "Limma": moderated F-test via the limma R package on k-1 contrasts between groups
\item "anova": classical one-way ANOVA (implemented in C++ for speed)
\item "kruskal": Kruskal–Wallis rank-sum test (implemented in C++ with tie correction)
}
}
\details{
Optional class balancing can be applied before testing: no sampling,
up-sampling, down-sampling, or SMOTE using the internal \code{smote_classif()} function.
P-values are adjusted with the cp4p R package, and the features whose adjusted
p-value is below the FDR threshold are returned, together with an estimate of
the proportion of true null hypotheses, \code{pi0}.

Missing values are preserved as \code{NA}: for ANOVA/Kruskal, the C++ backends
skip NAs per feature; for Limma, missing values are handled by limma internally.
Non-finite values (Inf/-Inf) are coerced to \code{NA}.
\itemize{
\item Limma design: \code{model.matrix(~ 0 + Y)} is used; a full-rank set of k-1
contrasts vs the first level is constructed with \code{limma::makeContrasts}.
The returned p-values are moderated F-test p-values for overall group differences.
\item ANOVA/Kruskal implementations are in C++ (see \code{anova_cols_cpp} and
\code{kruskal_cols_cpp}); they process all features in one pass and skip NAs per feature.
\item Non-finite p-values (if any) are set to 1 before \code{pi0}/adjustment.
\item SMOTE requires at least two observations per class; otherwise the function
automatically falls back to up-sampling.
\item For limma, if residual degrees of freedom are not positive
(\code{nrow(X) - nlevels(Y) <= 0}), p-values are set to 1 with a warning.
}
}
\examples{
###############################################################################
## 1. Pre-processing of mass spectra

# load the example mass spectra and their metadata from the MSclassifR package
data("CitrobacterRKIspectra", "CitrobacterRKImetadata", package = "MSclassifR")
# standard pre-processing of mass spectra
spectra <- MSclassifR::SignalProcessing(CitrobacterRKIspectra)
# detection of peaks in pre-processed mass spectra
peaks <- MSclassifR::PeakDetection(
  x = spectra,
  labels = CitrobacterRKImetadata$Strain_name_spot
)
# class labels (species) as a factor
Y <- factor(CitrobacterRKImetadata$Species)
# build matrix with intensities of peaks (rows = samples, columns = m/z);
# the call is namespace-qualified like the others so that the example also
# runs when the package is installed but not attached
xy <- MSclassifR::build_XY_from_peaks(
  peaks,
  labels = Y,
  normalize = "max",
  sparse = FALSE
)
# keep the matrix and the labels returned together by build_XY_from_peaks
X <- xy$X
Y <- xy$Y

###############################################################################
## 2. Estimate the optimal number of peaks to discriminate the different species

# Limma moderated F-test, "abh" pi0 estimation, 5 per cent FDR threshold,
# SMOTE class balancing with a fixed seed for reproducibility
OptiPeaks <- MSclassifR::SelectionVarStat(
  X,
  Y,
  stat.test = "Limma",
  pi0.method = "abh",
  fdr = 0.05,
  Sampling = "smote",
  seed = 1
)

## Estimation of the optimal number of peaks to discriminate species (from the pi0 parameter)
OptiPeaks$nb_to_sel

## discriminant mass-to-charge values estimated using a 5 per cent false discovery rate
OptiPeaks$sel_moz

## p-values and adjusted p-values estimated for all the tested mass-to-charge values
OptiPeaks$ap$adjp

}
\seealso{
\code{\link[limma]{lmFit}}, \code{\link[limma]{contrasts.fit}},
\code{\link[limma]{eBayes}}, \code{\link[limma]{makeContrasts}};
\code{\link[cp4p]{estim.pi0}}, \code{\link[cp4p]{adjust.p}}
}
