findmleHSMM <- function(x, J, M = NA, obsdist, dwelldist, obspar, dwellpar, Pi, delta = NULL, maxiter = 100, tol = 1e-05, shift = FALSE, verbose = TRUE, seed = NULL) {

  # Set seed if provided for reproducibility
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # Compute the stationary distribution via the transition probability matrix
  if(is.null(delta)){
    delta <- solve(t(diag(J) - Pi + 1), rep(1, J))
  }

  # Input validation
  if(!is.numeric(J) || floor(J) != J || J < 2) {
    stop("J must be an integer strictly greater than 1")
  }
  if(length(delta)!=J || nrow(Pi)!= J|| ncol(Pi)!=J) {
    stop("initial distribution must be vector of length J, transition matrix must be J x J")
  }
  if(!dwelldist %in% c("pois", "nbinom", "betabinom", "geom")) {
    stop("dwell distribution is not supported")
  }
  if(!obsdist %in% c("pois", "norm", "weibull", "zip", "nbinom", "zinb", "exp", "gamma", "lnorm", "gev", "ZInormal", "ZIgamma")) {
    stop("observation distribution is not supported")
  }

  # Setup parameters
  if(!isTRUE(shift)){
    dwellpar$shift<- rep(1, J)
  }

  # Initialize EM algorithm
  Pi2 <- unserialize(serialize(Pi, NULL))  # Deep copy
  tau <- length(x)
  if(is.na(M)) M <- min(tau, 1000)
  loglik <- c()

  # Compute initial dwell-time probabilities
  dwell <- dwellprobs(M, J, dwelldist, dwellpar)
  dwellprobs <- dwell$dwellprobs
  surv <- dwell$surv
  delta_preserved <- as.numeric(unlist(delta))

  # EM algorithm main loop
  for(iter in 1:maxiter) {
    current_delta <- delta_preserved

    # E-step: compute observation probabilities
    obsprobs <- obsprobs(x, J, obsdist, obspar)

    # Null checks
    if (is.null(Pi)) stop("Pi is NULL")
    if (is.null(current_delta)) stop("delta is NULL")
    if (is.null(obsprobs)) stop("obsprobs is NULL")
    if (is.null(dwellprobs)) stop("dwellprobs is NULL")
    if (is.null(surv)) stop("surv is NULL")

    # Forward-backward algorithm for HSMMs
    forback <- backwards(
      transProb = as.matrix(Pi2),
      initDist = as.numeric(current_delta),
      obsProb = as.matrix(obsprobs),
      durProb = as.matrix(dwellprobs),
      survProb = as.matrix(surv),
      seqLen = as.integer(tau),
      numStates = as.integer(J),
      maxDur = as.integer(rep(M,J)),
      backwardProb = matrix(0, nrow = tau, ncol=J),
      normConst = numeric(tau),
      eta = numeric(M*J),
      forwardVars = matrix(0, nrow=tau, ncol=J),
      stateProb = numeric(J*tau),
      xi = matrix(0, nrow=tau, ncol=J),
      numSeq = as.integer(1),
      totalLen = as.integer(tau),
      backwardVars = matrix(0, nrow=tau, ncol=J)
    )

    # Extract posterior probabilities and transition matrix
    Pi <- forback$transProb
    delta_preserved <- if(is.null(forback$init)) current_delta else as.numeric(forback$init)
    eta <- forback$eta  # Dwell-time posteriors
    xi <- as.vector(forback$xi)  # State posteriors
    N <- forback$normConst  # Normalization constants

    # Handle numerical precision issues
    if(any(xi<0)) xi <- zapsmall(xi)
    if(any(eta<0)) eta <- zapsmall(eta)
    if(any(N<0)) N <- zapsmall(N)

    # M-step: re-estimate parameters
    weight <- matrix(xi, ncol=J)
    reestimates <- reestimate(x, weight, obsdist, eta, J, dwelldist, dwellpar)
    obspar <- reestimates$obspar
    dwellpar <- reestimates$dwellpar
    dwellprobs <- reestimates$dwellprobs
    surv <- reestimates$surv

    # Compute log-likelihood
    loglik[iter] <- sum(log(N))
    if (verbose) message("loglik at iter ", iter, " : ", loglik[iter])

    # Check convergence
    if(iter > 2) {
      if(abs(loglik[iter] - loglik[iter - 1]) < tol) break()
    }
  }

  # Compute AIC
  if(shift==TRUE){
    AIC <- -2*loglik[iter] + 2*(J*(J-2) + (length(dwellpar)*J) + (length(obspar)*J))
  } else if(shift==FALSE){
    AIC <- -2*loglik[iter] + 2*(J*(J-2) + ((length(dwellpar)-1)*J) + (length(obspar)*J))
  }

  # Compute BIC
  if(shift==TRUE){
    BIC <- (J*(J-2) + (length(dwellpar)*J) + (length(obspar)*J))*log(length(x)) - 2*loglik[iter]
  } else if(shift==FALSE){
    BIC <- (J*(J-2) + ((length(dwellpar)-1)*J) + (length(obspar)*J))*log(length(x)) - 2*loglik[iter]
  }

  # Prepare output
  outcome <- list()
  outcome$loglikelihoods <- loglik[1:iter]
  outcome$AIC <- AIC
  outcome$BIC <- BIC
  outcome$delta <- delta_preserved
  outcome$Pi <- Pi
  outcome$dwellparameters <- dwellpar
  outcome$observationparameters <- obspar
  dim(Pi) <- c(J, J)

  # Report convergence status
  if(iter == maxiter) {
    if (verbose) message("function did not converge")
    return(outcome)
  } else {
    if (verbose) message("function converged at iteration ", iter)
    return(outcome)
  }
}


