#' Score Anomalies Using Unsupervised Machine Learning
#'
#' Calculates anomaly scores for each record using Isolation Forest or
#' Local Outlier Factor algorithms. Optionally evaluates performance against
#' ground truth labels for benchmarking.
#'
#' Raw scores returned by the selected algorithm are min-max rescaled before
#' being attached to the data, so \code{anomaly_score} lies in the 0-1 range.
#'
#' @importFrom stats predict quantile
#' @param data A data frame containing the data to be scored.
#' @param method Character string indicating the anomaly detection method.
#'   Options: "iforest" (Isolation Forest, default) or "lof" (Local Outlier Factor).
#' @param contamination Numeric value between 0 and 1 (exclusive) indicating the
#'   expected proportion of anomalies in the data. Default is 0.05 (5%).
#' @param ground_truth_col Character string naming a column in \code{data} that
#'   contains binary ground truth labels (0/1 or FALSE/TRUE) for known anomalies.
#'   If provided, benchmarking metrics will be calculated. The column is always
#'   added to \code{exclude_cols} so the labels are never used as a feature.
#'   Default is NULL.
#' @param id_cols Character vector of column names to exclude from scoring.
#'   Passed to \code{prep_for_anomaly()}.
#' @param exclude_cols Character vector of additional columns to exclude.
#'   Passed to \code{prep_for_anomaly()}.
#' @param ... Additional arguments passed to the underlying algorithm.
#'   For Isolation Forest: \code{ntrees}, \code{sample_size}, \code{max_depth}.
#'   For LOF: \code{minPts} (number of neighbors; deprecated \code{k} is converted to \code{minPts}).
#'
#' @return A data frame with the original data plus an \code{anomaly_score} column.
#'   The attributes \code{method}, \code{contamination} and \code{metadata}
#'   (the metadata returned by \code{prep_for_anomaly()}) are always attached.
#'   If \code{ground_truth_col} is provided, the result includes an attribute
#'   \code{benchmark_metrics} containing: \code{auc_roc} (Area Under the ROC Curve),
#'   \code{auc_pr} (Area Under the Precision-Recall Curve), \code{top_k_recall}
#'   (List of recall values for top K records: K = 10, 50, 100, 500),
#'   \code{contamination_rate} (Actual proportion flagged as anomalous), and
#'   \code{threshold} (the score cut-off used for flagging).
#'
#' @export
#'
#' @examples
#' \donttest{
#' data <- data.frame(
#'   patient_id = 1:50,
#'   age = rnorm(50, 50, 15),
#'   cost = rnorm(50, 10000, 5000)
#' )
#' scored_data <- score_anomaly(data, method = "iforest", contamination = 0.05)
#' }
score_anomaly <- function(data, method = "iforest", contamination = 0.05,
                          ground_truth_col = NULL, id_cols = NULL,
                          exclude_cols = NULL, ...) {

  # Validate inputs. The length/NA checks run first so that the scalar
  # `if` / `||` conditions below never see a vector or missing value.
  if (!is.data.frame(data)) {
    stop("data must be a data frame")
  }

  if (length(method) != 1 || !method %in% c("iforest", "lof")) {
    stop("method must be 'iforest' or 'lof'")
  }

  if (!is.numeric(contamination) || length(contamination) != 1 ||
      is.na(contamination) || contamination <= 0 || contamination >= 1) {
    stop("contamination must be between 0 and 1")
  }

  ground_truth <- NULL
  if (!is.null(ground_truth_col)) {
    if (!is.character(ground_truth_col) || length(ground_truth_col) != 1) {
      stop("ground_truth_col must be a single column name")
    }
    if (!ground_truth_col %in% names(data)) {
      stop("ground_truth_col '", ground_truth_col, "' not found in data")
    }

    ground_truth <- data[[ground_truth_col]]
    if (is.logical(ground_truth)) {
      ground_truth <- as.numeric(ground_truth)
    } else if (is.factor(ground_truth)) {
      truth_levels <- levels(ground_truth)
      if (all(truth_levels %in% c("0", "1"))) {
        # Map by label, not by level position, so levels c("1", "0") or a
        # single-level factor are not silently mislabelled.
        ground_truth <- as.numeric(as.character(ground_truth))
      } else if (all(toupper(truth_levels) %in% c("FALSE", "TRUE"))) {
        ground_truth <- as.numeric(toupper(as.character(ground_truth)) == "TRUE")
      } else {
        # Arbitrary labels: first level is treated as 0, second as 1.
        ground_truth <- as.numeric(ground_truth) - 1
      }
    }
    if (!all(ground_truth %in% c(0, 1))) {
      stop("ground_truth_col must contain binary values (0/1, TRUE/FALSE, or factor)")
    }
    # Only values matching 0/1 reach this point, so the coercion is lossless.
    ground_truth <- as.numeric(ground_truth)

    # Keep the labels out of the feature set; otherwise the benchmark is
    # computed on scores that were derived from the labels themselves.
    exclude_cols <- union(exclude_cols, ground_truth_col)
  }

  prep_result <- prep_for_anomaly(data, id_cols = id_cols, exclude_cols = exclude_cols)
  prepared_data <- prep_result$prepared_data
  metadata <- prep_result$metadata

  scores <- switch(
    method,
    iforest = score_iforest(prepared_data, contamination = contamination, ...),
    lof = score_lof(prepared_data, contamination = contamination, ...)
  )

  # Normalize to 0-1 range. The small epsilon keeps the division defined when
  # every record has the same raw score (all normalized scores are then 0).
  scores <- (scores - min(scores)) / (max(scores) - min(scores) + 1e-10)

  result <- data
  result$anomaly_score <- scores

  if (!is.null(ground_truth)) {
    benchmark_metrics <- calculate_benchmark_metrics(
      scores = scores,
      ground_truth = ground_truth,
      contamination = contamination
    )
    attr(result, "benchmark_metrics") <- benchmark_metrics
  }

  attr(result, "method") <- method
  attr(result, "contamination") <- contamination
  attr(result, "metadata") <- metadata

  result
}

#' Score anomalies using Isolation Forest
#'
#' Fits an isolation forest on \code{prepared_data} and scores the same rows
#' with the fitted model. Defaults are \code{ntrees = 100} and
#' \code{sample_size = min(256, nrow(prepared_data))}; any named argument in
#' \code{...} overrides the matching default.
#'
#' @keywords internal
#' @param prepared_data Numeric matrix of prepared data
#' @param contamination Expected proportion of anomalies. Accepted for
#'   signature parity with \code{score_lof()}; it is not used when fitting.
#' @param ... Additional arguments for isotree::isolation.forest
#' @return Numeric vector of anomaly scores where higher values indicate more
#'   anomalous records.
score_iforest <- function(prepared_data, contamination = 0.05, ...) {
  default_args <- list(
    sample_size = min(256, nrow(prepared_data)),
    ntrees = 100
  )

  additional_args <- list(...)
  # User-supplied arguments replace defaults of the same name. Plain indexed
  # assignment is used (rather than modifyList) so list-valued arguments are
  # replaced wholesale instead of being merged recursively.
  args <- default_args
  args[names(additional_args)] <- additional_args
  args$data <- prepared_data

  model <- do.call(isotree::isolation.forest, args)

  # isotree's default prediction type is the standardized outlier score, for
  # which larger values already mean "more anomalous". It must NOT be negated:
  # doing so would rank the most normal records as the strongest anomalies.
  scores <- stats::predict(model, prepared_data)

  scores
}

#' Score anomalies using Local Outlier Factor
#'
#' Calls \code{dbscan::lof} on \code{prepared_data}. When \code{minPts} is not
#' supplied it defaults to roughly 10% of the rows, bounded to the range 2-5.
#'
#' @keywords internal
#' @param prepared_data Numeric matrix of prepared data
#' @param contamination Expected proportion of anomalies. Accepted for
#'   signature parity with \code{score_iforest()}; it is not used by LOF.
#' @param ... Additional arguments for dbscan::lof. A deprecated \code{k}
#'   argument is renamed to \code{minPts} (with a warning) and takes
#'   precedence over any \code{minPts} passed alongside it.
#' @return Numeric vector of anomaly scores
score_lof <- function(prepared_data, contamination = 0.05, ...) {
  additional_args <- list(...)

  # Convert deprecated k to minPts. `[[` is used instead of `$` so a longer
  # argument name starting with "k" can never be picked up by partial matching.
  # NOTE(review): dbscan's own legacy handling is believed to map k to
  # minPts = k + 1 (minPts counts the point itself) - confirm whether the
  # value should be passed through unchanged here.
  if ("k" %in% names(additional_args)) {
    additional_args[["minPts"]] <- additional_args[["k"]]
    additional_args[["k"]] <- NULL
    warning("Parameter 'k' is deprecated in dbscan::lof(). Using 'minPts' instead.")
  }

  if (!"minPts" %in% names(additional_args)) {
    # dbscan::lof requires minPts >= 2, so the lower bound must be 2: with a
    # floor of 1 every data set with fewer than 20 rows failed on the default.
    additional_args[["minPts"]] <- min(5, max(2, floor(nrow(prepared_data) * 0.1)))
  }

  do.call(dbscan::lof, c(list(x = prepared_data), additional_args))
}

#' Calculate Benchmarking Metrics
#'
#' Computes AUC-ROC, AUC-PR, and Top-K Recall metrics for evaluating
#' anomaly detection performance against ground truth. All metrics assume
#' that a higher score means a more anomalous record.
#'
#' @param scores Numeric vector of anomaly scores
#' @param ground_truth Binary vector (0/1) of true anomaly labels. Must contain
#'   at least one 0 and one 1.
#' @param contamination Expected proportion of anomalies; the score threshold is
#'   the \code{1 - contamination} quantile of \code{scores}.
#' @return List of benchmarking metrics: \code{auc_roc}, \code{auc_pr},
#'   \code{top_k_recall} (named list \code{top_10}, \code{top_50}, ... for every
#'   K in 10, 50, 100, 500 not exceeding the number of scores; each value is the
#'   share of all true anomalies found among the K highest-scoring records),
#'   \code{contamination_rate} (share of scores at or above the threshold) and
#'   \code{threshold}.
calculate_benchmark_metrics <- function(scores, ground_truth, contamination = 0.05) {
  n_anomalies <- sum(ground_truth == 1, na.rm = TRUE)
  n_normal <- sum(ground_truth == 0, na.rm = TRUE)
  if (n_anomalies == 0 || n_normal == 0) {
    stop("ground_truth must contain both anomalous (1) and normal (0) records")
  }

  # Pin the class order and direction. With pROC's default automatic direction
  # the curve is flipped whenever that helps, so the AUC would never drop below
  # 0.5 and inverted scores would look as good as correct ones.
  roc_obj <- pROC::roc(ground_truth, scores, levels = c(0, 1),
                       direction = "<", quiet = TRUE)
  auc_roc <- as.numeric(pROC::auc(roc_obj))

  # In PRROC, "class0" holds the scores of the positive (anomalous) class.
  pr_obj <- PRROC::pr.curve(scores.class0 = scores[ground_truth == 1],
                            scores.class1 = scores[ground_truth == 0],
                            curve = TRUE)
  auc_pr <- pr_obj$auc.integral

  n <- length(scores)
  top_k_values <- c(10, 50, 100, 500)
  top_k_values <- top_k_values[top_k_values <= n]

  # Rank once; every K reuses the same ordering.
  ranked <- order(scores, decreasing = TRUE)

  top_k_recall <- list()
  for (k in top_k_values) {
    top_k_indices <- ranked[seq_len(k)]
    # Recall = anomalies captured in the top K / all anomalies. (Taking the
    # mean of the labels instead would yield precision at K.)
    hits <- sum(ground_truth[top_k_indices] == 1, na.rm = TRUE)
    top_k_recall[[paste0("top_", k)]] <- hits / n_anomalies
  }

  threshold <- stats::quantile(scores, 1 - contamination)
  contamination_rate <- mean(scores >= threshold)

  list(
    auc_roc = auc_roc,
    auc_pr = auc_pr,
    top_k_recall = top_k_recall,
    contamination_rate = contamination_rate,
    threshold = threshold
  )
}

