# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# +                   Master's Thesis - Maria Thurow                          +
# +             R-Code: Functions needed for the simulation                   +
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# + This file contains the following functions:                               +
# + - data_estim_CVM_p                                                        +
# + - data_simul_Bootstr                                                      +
# + - data_simul_Estim                                                        +
# + - data_simul_KDE                                                          +
# + - RealSurvSim                                                             +
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

library(kdensity)
library(fitdistrplus)
library(flexsurv)
library(univariateML)
library(actuar)
library(survival)

#' Kernel Density Estimation-based Data Simulation
#'
#' Simulates data based on the kernel density estimation (KDE) of given data.
#' KDE is a non-parametric way to estimate the probability density function of a random variable.
#' This function applies the accept-reject method to generate values that follow
#' the estimated density of the original dataset.
#'
#' @param orig_vals Numeric vector of values from the original dataset.
#' @param n Integer, number of observations to simulate. If `NULL`, the function
#' simulates the same number of observations as in the original dataset. Defaults to `NULL`.
#' @param kernel Character, specifying the kernel to be used for KDE. Defaults to "gaussian".
#'
#' @return Numeric vector of `n` simulated values.
#'
#' @examples
#' original_data <- c(rnorm(100, mean = 50, sd = 10))
#' simulated_data <- data_simul_KDE(original_data, n = 100)
#' @export
#' @importFrom kdensity kdensity
#' @importFrom stats runif
data_simul_KDE <- function(orig_vals, n = NULL, kernel = "gaussian"){
  #-----------------------------------------------------------------------------
  # data_simul_KDE   - Simulating data based on the kernel density estimation of
  #                    given data. Values are drawn with the accept-reject
  #                    method: a candidate t is proposed uniformly on
  #                    [0, max(orig_vals)] and accepted when a uniform draw u
  #                    on [0, envelope] does not exceed the estimated density
  #                    at t.
  #
  # Input:           - orig_vals: Numeric vector of values from the original data set.
  #                  - n: Single positive integer, number of observations to
  #                    simulate. If NULL, the length of orig_vals is used.
  #                    (default: NULL)
  #                  - kernel: The kernel that should be used for kernel density
  #                            estimation (default: "gaussian", the only
  #                            value currently accepted)
  #
  # Output: A numeric vector of n simulated values, all within
  #         [0, max(orig_vals)].
  #-----------------------------------------------------------------------------
  if (!is.numeric(orig_vals)) {
    stop("orig_vals must be a numeric vector.")
  }
  if (!is.character(kernel) || length(kernel) != 1 ||
      !kernel %in% c("gaussian")) {
    stop("Unsupported kernel type. Currently, only 'gaussian' is supported.")
  }
  if (length(orig_vals) == 0) {
    stop("original data cannot be empty")
  }
  # Set the number of observations to the length of the original dataset if n is NULL
  if (is.null(n)) {
    n <- length(orig_vals)
  }
  # Scalar check first so that the comparisons below never see a vector or NA
  if (!is.numeric(n) || length(n) != 1 || is.na(n) || n <= 0 || n != round(n)) {
    stop("n must be a positive integer.")
  }

  # Estimate the density of the original dataset using KDE
  dens <- kdensity(orig_vals, kernel = kernel)

  # Upper end of the proposal interval; hoisted because it is loop-invariant
  upper <- max(orig_vals)

  # Accept-reject needs an envelope that is at least as large as the density
  # everywhere on the proposal interval. A fixed envelope of 1 is only valid
  # when the density never exceeds 1; for tightly concentrated data the density
  # is larger and a fixed envelope of 1 would clip the peaks. The peak is
  # approximated on a grid and padded by 5% to cover values between grid points.
  # When the density stays at or below 1 the envelope remains exactly 1, so the
  # sequence of random draws is the same as with a fixed envelope of 1.
  grid <- seq(0, upper, length.out = 512L)
  peak <- max(dens(grid))
  envelope <- if (is.finite(peak) && peak > 1) 1.05 * peak else 1

  # Draw one value: propose until a candidate is accepted
  draw_one <- function() {
    repeat {
      u <- runif(1, min = 0, max = envelope)
      t <- runif(1, min = 0, max = upper)
      if (u <= dens(t)) {
        return(t)
      }
    }
  }

  vapply(seq_len(n), function(i) draw_one(), numeric(1))
}
#' Map Distribution Name for Fitting
#'
#' This function maps a user-specified distribution name to the appropriate name
#' used for parameter estimation with \code{fitdist()} from the \pkg{fitdistrplus} package.
#'
#' @param distrib A character string specifying the distribution (e.g., "poisson", "exp", "gumbel").
#'
#' @return A character string representing the distribution name as expected by \code{fitdist()}.
#'
#' @seealso \code{\link{get_dist_names}}
#' @keywords internal
get_fitdist_name <- function(distrib) {
  # Translate a user-facing distribution label into the name handed to
  # fitdist(). Labels without an explicit entry are returned unchanged.
  # Commented-out entries are candidates that are currently disabled.
  switch(distrib,
         cauchy = "cauchy",
         # `chi` = "chi",
         # `chi-squared` = "chisq",
         # `chi-squared-noncentral` = "chisq",
         # erlang = "gamma",
         exp = "exp",
         # f = "f",
         # `fatigue-life` = "fatigue",      # if recognized by fitdistplus
         # frechet = "frechet",
         gamma = "gamma",
         gumbel = "gumbel",
         # `half-normal` = "halfnorm",      # if recognized by fitdistplus
         # `hyperbolic-secant` = "hypersec",# if recognized
         # `inverse-gamma` = "invgamma",    # if recognized or custom
         # `inverse-gaussian` = "invgauss", # if recognized
         # `johnson-sb` = "johnsonsb",
         # `johnson-sl` = "johnsonsl",
         # `johnson-su` = "johnsonsu",
         # laplace = "laplace",
         # levy = "levy",
         llogis = "llogis",
         `log-normal` = "lnorm",
         logistic = "logis",
         # nakagami = "nakagami",
         normal = "norm",
         # pareto = "pareto",
         # Disabled like the other candidates; the comment marker used to sit
         # inside the backticks, which left a live entry keyed on "#pearson-6".
         # `pearson-6` = "pearson6",
         # power = "power",
         # rayleigh = "rayleigh",
         # t = "t",
         # triangular = "triang",
         weibull = "weibull",
         distrib
  )
}

#' Map Distribution Names for Density and Random Generation Functions
#'
#' This function maps a user-specified distribution name to a list containing the names
#' of the corresponding density and random generation functions. If the distribution
#' is not explicitly mapped, it assumes the standard naming convention by prepending
#' \code{"d"} or \code{"r"} to the distribution name.
#'
#' @param distrib A character string specifying the distribution.
#'
#' @return A list with components \code{d} and \code{r}, which are character strings
#' representing the names of the density and random generation functions, respectively.
#'
#' @seealso \code{\link{get_fitdist_name}}
#' @keywords internal
get_dist_names <- function(distrib) {
  # Translate a user-facing distribution label into the names of its density
  # ("d...") and random generation ("r...") functions. Labels without an
  # explicit entry fall back to prefixing the label itself with "d" / "r".
  # Commented-out entries are candidates that are currently disabled.
  switch(distrib,
         cauchy = list(d = "dcauchy", r = "rcauchy"),
         # `chi` = list(d = "dchi", r = "rchi"),                # may require a custom definition
         # `chi-squared` = list(d = "dchisq", r = "rchisq"),
         # `chi-squared-noncentral` = list(d = "dchisq", r = "rchisq"),
         # erlang = list(d = "dgamma", r = "rgamma"),
         exp = list(d = "dexp", r = "rexp"),
         # f = list(d = "df", r = "rf"),
         # `fatigue-life` = list(d = "dfatigue", r = "rfatigue"),  # if implemented
         # frechet = list(d = "dfrechet", r = "rfrechet"),         # if implemented
         gamma = list(d = "dgamma", r = "rgamma"),
         gumbel = list(d = "dgumbel", r = "rgumbel"),            # if in a custom package
         # `half-normal` = list(d = "dhalfnorm", r = "rhalfnorm"), # if implemented
         # `hyperbolic-secant` = list(d = "dhypersec", r = "rhypersec"), # if implemented
         # `inverse-gamma` = list(d = "dinvgamma", r = "rinvgamma"),
         # `inverse-gaussian` = list(d = "dinvgauss", r = "rinvgauss"),
         # `johnson-sb` = list(d = "djohnsonsb", r = "rjohnsonsb"),
         # `johnson-sl` = list(d = "djohnsonsl", r = "rjohnsonsl"),
         # `johnson-su` = list(d = "djohnsonsu", r = "rjohnsonsu"),
         # laplace = list(d = "dlaplace", r = "rlaplace"),
         # levy = list(d = "dlevy", r = "rlevy"),
         llogis = list(d = "dllogis", r = "rllogis"),
         `log-normal` = list(d = "dlnorm", r = "rlnorm"),
         # "logistic" needs an explicit entry: the fallback would build
         # "dlogistic"/"rlogistic", whereas the stats functions are
         # dlogis()/rlogis().
         logistic = list(d = "dlogis", r = "rlogis"),
         # nakagami = list(d = "dnakagami", r = "rnakagami"),      # if implemented
         normal = list(d = "dnorm", r = "rnorm"),
         # pareto = list(d = "dpareto", r = "rpareto"),            # if implemented
         # `pearson-6` = list(d = "dpearson6", r = "rpearson6"),
         # power = list(d = "dpower", r = "rpower"),               # if implemented
         # rayleigh = list(d = "drayleigh", r = "rrayleigh"),
         # t = list(d = "dt", r = "rt"),
         # triangular = list(d = "dtriang", r = "rtriang"),         # if implemented
         weibull = list(d = "dweibull", r = "rweibull"),
         list(d = paste0("d", distrib), r = paste0("r", distrib))
  )
}

# To resolve the Gumbel problem, you must do one of the following:
#
#   - Exclude such distributions from your simulation function, or
#   - for each problematic distribution, implement custom density (and corresponding
#     random generation) functions along with a custom likelihood routine that
#     fitdistrplus can use.
# Thus, the issue isn't with your start values per se - it's that fitdistrplus isn't
# designed to estimate parameters for these unsupported distributions.

#' Simulate Data Based on Parametric Distribution Estimates
#'
#' This function simulates data based on parameter estimates from a specified parametric distribution.
#' It fits a chosen distribution to the original dataset and samples new values from this fitted distribution.
#' Supported distributions include "inverse_gamma", "llogis" (log-logistic), "gumbel", "log-normal", "gamma", "exp", "cauchy".
#'
#' @param orig_vals Numeric vector of values from the original dataset.
#' @param n Integer specifying the number of observations to simulate. If `NULL`, the function simulates
#' the same number of observations as in the original dataset. Defaults to `NULL`.
#' @param distrib Character; one of "inverse_gamma", "llogis", "gumbel",
#'    "exp", "gamma", "normal", or "cauchy".
#'
#' @return Numeric vector of `n` simulated values based on the fitted parametric distribution.
#'
#' @examples
#' original_data <- rnorm(100, mean = 50, sd = 10)
#' simulated_data <- data_simul_Estim(orig_vals = original_data, n = 100, distrib = "inverse_gamma")
#' @export
#' @importFrom stats rnorm
#' @importFrom stats median
#' @importFrom fitdistrplus fitdist
#' @importFrom flexsurv dgompertz
#' @importFrom actuar dllogis rllogis
#' @importFrom FAdist dgumbel pgumbel qgumbel rgumbel
#' @import univariateML
data_simul_Estim <- function(orig_vals, n = NULL, distrib = "exp"){
  #-----------------------------------------------------------------------------
  # data_simul_Estim - simulates data based on parameter estimates from a specified distribution.
  #                   The chosen parametric distribution is fitted to the
  #                   original data and n new values are sampled from the
  #                   fitted distribution.
  #
  # Input:           - orig_vals: Numeric vector of values from the original data set
  #                  - n: Single positive integer, number of observations to
  #                    simulate. If NULL, the length of orig_vals is used.
  #                    (default: NULL)
  #                  - distrib: Single character string naming the parametric
  #                    distribution. "inverse_gamma" is fitted with
  #                    mlinvgamma() and sampled with rml(); "gompertz",
  #                    "llogis" and "gumbel" are fitted with fitdist() using
  #                    explicit start values; every other name is fitted with
  #                    fitdist() using its default start values.
  #
  # Output: The values returned by the random generation function of the fitted
  #         distribution, called with n as its first argument.
  #-----------------------------------------------------------------------------
  if (!is.numeric(orig_vals)) {
    stop("orig_vals must be a numeric vector.")
  }
  if (length(orig_vals) == 0) {
    stop("orig_vals must not be empty.")
  }
  if (length(orig_vals) <= 1) {
    # NOTE(review): the message announces an NA result, but no NA is returned
    # here; the fit is still attempted below. Confirm which of the two is
    # intended before changing either the text or the control flow.
    warning("Data length is not sufficient for fitting the distribution. Returning NA.")
  }

  # Use the length of 'orig_vals' when 'n' is not given
  if (is.null(n)) {
    n <- length(orig_vals)
  }
  # Scalar check first so that the comparisons below never see a vector or NA
  if (!is.numeric(n) || length(n) != 1 || is.na(n) || n <= 0 || n != round(n)) {
    stop("n must be a positive integer.")
  }
  if (!is.character(distrib) || length(distrib) != 1 || is.na(distrib)) {
    stop("distrib must be a single character string.")
  }

  if (distrib == "inverse_gamma") {
    # Maximum likelihood fit and sampling for the inverse gamma distribution
    param_estim <- mlinvgamma(orig_vals)
    return(rml(n, param_estim))
  }

  # Start values for the distributions that are fitted with explicit ones;
  # NULL means that fitdist() is called without a start argument.
  start_vals <- switch(distrib,
                       gompertz = list("shape" = exp(1), "rate" = exp(1)),
                       llogis = list("shape" = median(orig_vals), "scale" = 1),
                       gumbel = list("location" = 0, "scale" = 1),
                       NULL)

  fitdist_name <- get_fitdist_name(distrib)
  if (distrib == "gompertz") {
    # Both Gompertz parameters are constrained to be non-negative during the fit
    fit <- fitdistrplus::fitdist(data = orig_vals, distr = fitdist_name,
                                 start = start_vals,
                                 lower = c(0, 0), upper = c(Inf, Inf))
  } else if (is.null(start_vals)) {
    fit <- fitdistrplus::fitdist(data = orig_vals, distr = fitdist_name)
  } else {
    # NOTE(review): for "gumbel" the start names (location, scale) must match
    # the arguments of whichever dgumbel() is on the search path - confirm.
    fit <- fitdistrplus::fitdist(data = orig_vals, distr = fitdist_name,
                                 start = start_vals)
  }
  param_estim <- fit$estimate

  # Every estimate has to reach the generator as its own named argument.
  # Handing over the whole estimate vector as one argument would bind it to the
  # generator's first parameter and leave the remaining parameters at their
  # defaults, which is wrong for every distribution with more than one parameter.
  dist_names <- get_dist_names(distrib)
  do.call(dist_names$r, c(list(n), as.list(param_estim)))
}


#' Simulate Data Using Bootstrap Methods
#'
#' Simulates event and censoring times from an original dataset using specified bootstrap methodologies. This function supports
#' conditional and case resampling bootstrap methods, allowing for flexible data simulation scenarios tailored to survival analysis.
#'
#' @param dat A dataframe containing the original dataset, expected to include columns for event times (V1),
#'            censoring indicators (V2), and group indicators (optional).
#' @param n Integer specifying the number of observations to simulate. If `NULL`, the function simulates the
#'          same number of observations as in the original dataset. Defaults to `NULL`.
#' @param type Character string specifying the type of bootstrap method to be used. Supported types include
#'             "cond" for conditional and "case" for case resampling. Defaults to "cond".
#'
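#' @return A data frame of simulated observations.
#'         - For the "case" bootstrap: `n` rows resampled with replacement from `dat`.
#'         - For the "cond" bootstrap: a data frame with the columns V1 and V2; an arbitrary `n`
#'           is not supported, the number of rows always equals the number of rows of `dat`.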
#'
#' @examples
#' dat <- data.frame(
#'   V1 = rexp(100, rate = 0.1), # Time-to-event data
#'   V2 = sample(0:1, 100, replace = TRUE),
#'   V3 = sample(0:1, 100, replace = TRUE)# Event indicator (0 = censored, 1 = event)
#' )
#' simulated_case <- data_simul_Bootstr(dat = dat, n = 100, type = "case")
#' simulated_cond <- data_simul_Bootstr(dat = dat, type = "cond")
#' @export
#' @importFrom stats runif
data_simul_Bootstr <- function(dat, n = NULL, type = "cond"){

  #-----------------------------------------------------------------------------
  # data_simul_Bootstr - simulates data using bootstrap methods based on the original dataset.
  #
  # Input:             - dat: a data frame with at least the columns V1
  #                      (observed time) and V2 (status; rows with V2 == 1 are
  #                      treated as events, rows with V2 == 0 as censored).
  #                    - n: Single positive integer, number of observations to
  #                      simulate with the case bootstrap. If NULL, the number
  #                      of rows of dat is used. Ignored for type = "cond",
  #                      which always simulates nrow(dat) observations.
  #                      (default: NULL)
  #                    - type: "cond" for the conditional bootstrap or "case"
  #                      for case resampling. Default is "cond".
  #
  # Output:  - type = "case": a data frame with n rows of dat, resampled with
  #            replacement.
  #          - type = "cond": a data frame with the columns V1 (minimum of the
  #            simulated event and censoring time) and V2 (1 if the simulated
  #            event time is strictly smaller than the simulated censoring
  #            time, otherwise 0).
  #-----------------------------------------------------------------------------

  if (!is.data.frame(dat)) {
    stop("dat must be a dataframe.")
  }

  # Check for required columns
  required_cols <- c("V1", "V2")
  if (!all(required_cols %in% names(dat))) {
    stop("dat must contain the columns: V1 and V2.")
  }

  # Check if 'type' is valid before it steers any other decision
  if (!is.character(type) || length(type) != 1 || !type %in% c("cond", "case")) {
    stop(paste("This bootstrap method is not implemented."))
  }

  if (nrow(dat) == 0) {
    stop("The dataset is empty.")
  }

  if (type == "cond") {
    n <- nrow(dat)
  } else {
    if (is.null(n)) {
      n <- nrow(dat)
    }
    if (!is.numeric(n) || length(n) != 1 || is.na(n) || n <= 0 || n != floor(n)) {
      stop("n must be a single positive integer.")
    }
  }

  # Extract event times where V2 == 1, and censoring times where V2 == 0
  events <- dat$V1[dat$V2 == 1]
  censored <- dat$V1[dat$V2 == 0]

  if (type == "cond") {
    if (length(events) == 0) {
      stop("No event data available for conditional bootstrap.")
    }
    if (length(censored) == 0) {
      stop("No censoring data available for conditional bootstrap.")
    }
  }

  # Split the dataset by the status indicator V2
  dat_spl <- split(dat, as.factor(dat$V2))

  if (length(dat_spl) == 1 && type == "cond") {
    stop("Not enough data to perform conditional bootstrap.")
  }

  # Tie handling: times in the second status level that also occur in the first
  # status level get a tiny positive value added. With only one status level
  # there is nothing to compare against, so the step is skipped (indexing the
  # second element would fail for the case bootstrap otherwise).
  # NOTE(review): the added value lies between 1e-30 and 1e-20, which is below
  # double precision for times larger than roughly 1e-4, so for typical data
  # the times stay numerically unchanged. Confirm the intended magnitude.
  if (length(dat_spl) >= 2) {
    tied <- which(dat_spl[[2]]$V1 %in% dat_spl[[1]]$V1)
    for (k in tied) {
      dat_spl[[2]]$V1[k] <- dat_spl[[2]]$V1[k] + runif(1, 1e-30, 1e-20)
    }
    dat <- unsplit(dat_spl, as.factor(dat$V2))
  }

  # Case bootstrap: randomly resample observations with replacement
  if (type == "case") {
    # Row indices must come from the rows that exist in dat, not from 1:n.
    # Otherwise n > nrow(dat) yields NA rows and n < nrow(dat) only ever
    # resamples the first n rows.
    indic_sampled <- sample.int(nrow(dat), n, replace = TRUE)
    return(dat[indic_sampled, ])
  }

  # Conditional bootstrap
  sim_cens <- numeric(n)
  # Retain censoring times for censored observations
  sim_cens[dat$V2 == 0] <- censored

  idx_max <- which.max(dat$V1)

  # For events, sample censoring times larger than the event time
  for (i in which(dat$V2 == 1)) {
    if (i == idx_max) {
      # For the largest observed event, set censoring shortly after
      sim_cens[i] <- dat$V1[i] + runif(1, 1e-30, 1e-20)
    } else {
      # Sample from censoring times that are greater than or equal to the event time
      candidates <- censored[censored >= dat$V1[i]]
      if (length(candidates) == 0) {
        stop("No censoring time greater than or equal to the event time ",
             dat$V1[i], " is available for conditional bootstrap.")
      }
      # Index-based draw: sample(x, 1) would sample from 1:x when x is a single
      # number >= 1 instead of returning x itself.
      sim_cens[i] <- candidates[sample.int(length(candidates), 1)]
    }
  }

  # Sample event times from observed event times with replacement
  # (index-based for the same reason as above)
  sim_events <- events[sample.int(length(events), n, replace = TRUE)]

  if (dat$V2[idx_max] == 0) {
    # For the largest censored observation, set event time shortly after
    sim_events[idx_max] <- dat$V1[idx_max] + runif(1, 1e-30, 1e-20)
  }

  # Observed value is the minimum of event and censoring time; the status is 1
  # only if the event time is strictly smaller (ties count as censored)
  sim_data <- pmin(sim_cens, sim_events)
  cens_indic <- as.numeric(sim_events < sim_cens)

  data.frame(V1 = sim_data, V2 = cens_indic)
}
#' Simulate Datasets Using Various Simulation Models
#'
#' Simulates survival datasets(Time-to-event data) based on original or reconstructed data using four different simulation models:
#' Kernel Density Estimation (KDE), parametric distributions, conditional bootstrap, and Case Resampling.
#' This function is designed to support comprehensive survival analysis simulations.
#'
#' @param dat A data.frame representing the original or reconstructed dataset for simulation.
#'            The dataset must include three columns: for event times, for censoring status,
#'            and for group identifiers.
#' @param col_time The name or index of the column in `dat` representing time to event.
#' @param col_status The name or index of the column in `dat` representing the event status
#'                   (1 for event occurred, 0 for censored).
#' @param col_group The name or index of the column in `dat` representing group assignments.
#' @param reps The number of iterations, equivalent to the number of datasets simulated for each simulation model.
#'             Defaults to 10000.
#' @param random_seed Seed for random number generation to ensure reproducibility. Defaults to 123.
#' @param n An optional numeric vector specifying the number of observations to simulate for each group.
#'          If `NULL`, the function uses the original dataset's group sizes for simulation.
#'          For all simulation types except "conditional bootstrap," `n` can be set to arbitrary values,
#'          such as `c(50, 60)`, where each element specifies the number of observations for a group.
#'          Defaults to `NULL`.
#' @param simul_type A vector of characters specifying the types of simulation to perform. It includes
#'                   "cond" (conditional bootstrap), "case" (case resampling), "distr" (parametric distributions),
#'                   and "KDE" (kernel density estimation, supports all kernels available in the `kdensity` function. Refer to 'kdensity'). Note: Only one simulation type can be used at a time.
#' @param distribs Character vector of length 4, one distribution per stratum.
#'   Must be one of:
#'   - `"inverse_gamma"`
#'   - `"llogis"`
#'   - `"gumbel"`
#'   - `"exp"`
#'   - `"gamma"`
#'   - `"normal"`
#'   - `"cauchy"`
#'
#'  Defaults to `c("exp", "exp", "exp", "exp")`.
#'
#' @return A list containing the simulated datasets for each specified simulation model.
#'           The structure of the output list is as follows:
#'
#'          - {datasets}: A list of data frames, where each data frame represents a simulated dataset.
#'          - Each data frame contains:
#'              - {V1}: A numeric vector representing the simulated time-to-event data.
#'              - {V2}: A numeric or integer vector indicating the status, representing
#'                whether the event of interest has occurred (1) or is censored (0).
#'              - {V3}: An integer vector representing group.
#'
#'          - The number of data frames within {datasets} corresponds to the number of repetitions specified
#'            by the {reps} parameter.
#'
#'@examples # liang should have columns: V1 (time), V2 (status), V3 (group)
#'
#'          # Simulate data using parametric distribution fitting
#'
#'           liang<- dats$Liang
#'           liang_distr <- RealSurvSim(
#'            dat = liang,
#'            col_time = "V1",
#'            col_status = "V2",
#'            col_group = "V3",
#'            reps = 10,
#'            simul_type = "distr",
#'            distribs = c("exp", "exp", "exp", "exp")
#'          )
#'
#'
#'
#' @export
#' @importFrom survival Surv
RealSurvSim <- function(dat,col_time, col_status, col_group, reps = 10000, random_seed = 123,n = NULL,
                        simul_type= c("cond", "case", "distr", "KDE"),
                        distribs = c("exp", "exp",
                                     "exp", "exp")){
  #-----------------------------------------------------------------------------
  # RealSurvSim - simulates `reps` datasets from `dat` with one simulation model.
  #
  # Input:  - dat: data frame holding the time, status and group columns.
  #         - col_time, col_status, col_group: column name (or, if no column
  #           carries that name, a single column index) of the time, status and
  #           group column.
  #         - reps: single positive integer, number of simulated datasets.
  #         - random_seed: single number; seeds the per-replicate seeds.
  #         - n: numeric vector with one non-negative integer per group (in the
  #           order of the sorted group labels). If NULL, the group sizes of
  #           dat are used.
  #         - simul_type: one of "cond", "case", "distr", "KDE". If left at its
  #           default vector, "cond" is used.
  #         - distribs: character vector of length 4; for simul_type = "distr"
  #           entry j is used for the j-th stratum, where strata are ordered by
  #           the sorted label paste0(status, group) (for 0/1 coding: "00",
  #           "01", "10", "11").
  #
  # Output: list(datasets = <list of `reps` data frames with columns V1, V2, V3>)
  #-----------------------------------------------------------------------------
  allowed_types <- c("cond", "case", "distr", "KDE")

  # ---- Parameter checks ------------------------------------------------------
  if (!is.data.frame(dat)) {
    stop("dat must be a data frame.")
  }

  if (!is.character(simul_type) || length(simul_type) == 0 ||
      !all(simul_type %in% allowed_types)) {
    stop("simul_type must be a character vector with any of these values: 'cond', 'case', 'distr', 'KDE'")
  }
  # Only one simulation type can be run per call. The default argument is the
  # full vector of choices, so it has to be reduced to a single value before it
  # is used in any if() condition.
  if (length(simul_type) > 1) {
    if (!missing(simul_type)) {
      warning("Only one simulation type can be used at a time; using '",
              simul_type[1], "'.")
    }
    simul_type <- simul_type[1]
  }

  if (!is.character(distribs) || length(distribs) != 4) {
    stop("distribs should be a character vector of length 4.")
  }

  # ---- Map the specified columns to V1, V2, V3 ------------------------------
  # A specification is matched against the column names first; a single numeric
  # value that matches no name is treated as a column index.
  resolve_column <- function(spec) {
    idx <- which(names(dat) == spec)
    if (length(idx) == 0 && is.numeric(spec) && length(spec) == 1 &&
        !is.na(spec) && spec %in% seq_along(dat)) {
      idx <- as.integer(spec)
    }
    idx
  }
  time_i   <- resolve_column(col_time)
  status_i <- resolve_column(col_status)
  group_i  <- resolve_column(col_group)

  if (length(time_i) != 1 || length(status_i) != 1 || length(group_i) != 1) {
    stop("Your col_time/col_status/col_group specifications must each match exactly one column.")
  }
  if (anyDuplicated(c(time_i, status_i, group_i)) > 0) {
    stop("col_time, col_status and col_group must refer to three different columns.")
  }

  # Select by position and rename afterwards. Renaming first and selecting by
  # name would pick the wrong column whenever dat already contains another
  # column called V1, V2 or V3. as.data.frame() makes sure that single-column
  # extraction below yields plain vectors.
  dat <- as.data.frame(dat)[, c(time_i, status_i, group_i)]
  names(dat) <- c("V1", "V2", "V3")

  if (nrow(dat) == 0) {
    stop("dat must not be empty.")
  }

  message(">>> After rename, names(dat) = ", paste(names(dat), collapse=", "))

  # Default group sizes: the observed number of rows per group
  if (is.null(n)) {
    tbl <- table(dat$V3)
    n   <- as.integer(tbl)
    names(n) <- names(tbl)
  }

  # Ensure reps is a single positive integer
  if (!is.numeric(reps) || length(reps) != 1 || reps <= 0 || reps != floor(reps)) {
    stop("reps should be a single positive integer.")
  }

  if (!is.numeric(random_seed) || length(random_seed) != 1) {
    stop("random_seed should be a single numeric value.")
  }

  if (!is.numeric(n) || anyNA(n) || any(n < 0) || any(n != floor(n))) {
    stop("n should be a vector of non-negative integers.")
  }

  # ---- Seeds -----------------------------------------------------------------
  # One seed per replicate is taken from the front of this list. The list is
  # drawn with reps * 3 entries so that the seeds stay the same as before.
  set.seed(random_seed)
  seed_list <- sample(1:1e9, reps * 3)

  # ---- Sample sizes per group / stratum --------------------------------------
  # Censoring rates in the original data, one per group (sorted group labels)
  cens_rates <- 1 - tapply(dat$V2, dat$V3, mean)
  group_levels <- names(cens_rates)

  # The conditional bootstrap ignores n, so the length only matters otherwise
  if (simul_type != "cond" && length(n) != length(group_levels)) {
    stop("n must contain one entry per group in col_group.")
  }

  # Splits a group size into (number of censored, number of events) according
  # to the group's censoring rate; if rounding makes the two parts miss the
  # group size, the part with the larger rate deviation is adjusted by one.
  allocate_stratum_sizes <- function(n_group, cens_rate) {
    sizes <- c(floor(n_group * cens_rate), ceiling(n_group * (1 - cens_rate)))
    if (sum(sizes) != n_group) {
      cens_err  <- cens_rate - sizes[1] / n_group
      event_err <- (1 - cens_rate) - sizes[2] / n_group
      if (abs(cens_err) < abs(event_err)) {
        sizes <- sizes + sign(event_err) * c(0, 1)
      } else {
        sizes <- sizes + sign(cens_err) * c(1, 0)
      }
    }
    sizes
  }

  if (simul_type %in% c("KDE", "distr")) {
    # Stratum label = status ("0" censored, "1" event) followed by the group
    # label, matching the split key used below
    n <- unlist(lapply(seq_along(group_levels), function(g) {
      sizes <- allocate_stratum_sizes(n[[g]], cens_rates[[g]])
      names(sizes) <- paste0(c("0", "1"), group_levels[g])
      sizes
    }))
    n <- n[n != 0]
  } else if (length(n) == length(group_levels)) {
    names(n) <- group_levels
  }

  # ---- Simulation --------------------------------------------------------------
  # KDE and parametric simulation work on status-by-group strata, the bootstrap
  # methods on whole groups. The split does not depend on the replicate.
  split_key <- if (simul_type %in% c("KDE", "distr")) {
    paste0(dat$V2, dat$V3)
  } else {
    dat$V3
  }
  dat_split <- split(dat, as.factor(split_key))
  nams <- names(dat_split)

  # Simulates one stratum and appends its constant status / group columns
  simulate_stratum <- function(j) {
    stratum <- dat_split[[j]]
    size <- n[nams[j]]
    if (simul_type == "KDE") {
      cbind(data_simul_KDE(stratum[, 1], n = size),
            stratum[1, 2],
            stratum[1, 3])
    } else if (simul_type == "distr") {
      cbind(data_simul_Estim(stratum[, 1], n = size, distrib = distribs[j]),
            stratum[1, 2],
            stratum[1, 3])
    } else {
      # The bootstrap returns time and status itself; only the group is added
      cbind(data_simul_Bootstr(stratum[, 1:2], n = size, type = simul_type),
            stratum[1, 3])
    }
  }

  datasets <- lapply(seq_len(reps), function(rep_idx) {
    set.seed(seed_list[rep_idx])
    sim <- do.call("rbind", lapply(seq_along(dat_split), simulate_stratum))
    dimnames(sim) <- list(seq_len(nrow(sim)), paste0("V", 1:3))
    as.data.frame(sim)
  })

  # Return structured output containing simulated datasets
  list(datasets = datasets)
}


#############################trials#############################################################################################################


# liang_recon for default n
#seto_v <- RealSurvSim(seto, seto$V1, seto$V2, seto$V3, reps=3, simul_type = "cond")
#liang_distr<- RealSurvSim(liang_recon,  liang_recon$V1, liang_recon$V2, liang_recon$V3,reps=10, simul_type = "distr", distrib=c("llogis", "exp","exp", "exp"))
#liang_distrib1 <- RealSurvSim(liang_recon,  liang_recon$V1, liang_recon$V2, liang_recon$V3,reps=10, simul_type = "distr", distrib=c("gumbel", "gumbel","gumbel", "gumbel"))
#liang_cond<- RealSurvSim(liang_recon,liang_recon$V1, liang_recon$V2, liang_recon$V3, reps=11, simul_type = "cond")
#liang_case<- RealSurvSim(liang_recon,liang_recon$V1, liang_recon$V2, liang_recon$V3, reps=2, simul_type = "case")
#
# # #liang_recon for arbitrary n
#arbliang_kde<- RealSurvSim(liang_recon, liang_recon$V1, liang_recon$V2, liang_recon$V3, reps=10,n = c(40,50), simul_type = "KDE")
#arbliang_distr<- RealSurvSim(liang_recon,  liang_recon$V1, liang_recon$V2, liang_recon$V3,reps=10,n = c(40,50), simul_type = "distr", distrib=c("exp", "llogis","llogis", "exp"))
#arbliang_cond<- RealSurvSim(liang_recon,liang_recon$V1, liang_recon$V2, liang_recon$V3, reps=3, n = c(40,50), simul_type = "cond")
#arbliang_case<- RealSurvSim(liang_recon,liang_recon$V1, liang_recon$V2, liang_recon$V3, reps=3, n = c(40,50), simul_type = "case")



# # wu_recon for default n
#wu_kde<- RealSurvSim(wu_recon, wu_recon$V1, wu_recon$V2, wu_recon$V3, reps=100, simul_type = "KDE")
#wu_distr<- RealSurvSim(wu_recon,  wu_recon$V1, wu_recon$V2, wu_recon$V3,reps=15, simul_type = "distr", distrib=c("gompertz", "gompertz","gompertz", "gompertz"))
#wu_cond<- RealSurvSim(wu_recon,wu_recon$V1, wu_recon$V2, wu_recon$V3, reps=100, simul_type = "cond")
#wu_case<- RealSurvSim(wu_recon,wu_recon$V1, wu_recon$V2, wu_recon$V3, reps=100, simul_type = "case")


# # wu_recon for arbitrary n
 #arbwu_kde<- RealSurvSim(wu_recon, wu_recon$V1, wu_recon$V2, wu_recon$V3, reps=100, n = c(40,50), simul_type = "KDE")
 #arbwu_distr<- RealSurvSim(wu_recon,  wu_recon$V1, wu_recon$V2, wu_recon$V3,reps=100,n = c(40,50), simul_type = "distr")
 #arbwu_cond<- RealSurvSim(wu_recon, wu_recon$V1, wu_recon$V2, wu_recon$V3, reps=100, n = c(40,50), simul_type = "cond")
 #arbwu_case<- RealSurvSim(wu_recon, wu_recon$V1, wu_recon$V2, wu_recon$V3, reps=100,n = c(40,50),  simul_type = "case")
#
#
# seto <- dats$Seto
# spigel_recon for default n
# spigel_kde<- RealSurvSim(spigel_recon, spigel_recon$V1, spigel_recon$V2, spigel_recon$V3, reps=100, simul_type = "KDE")
# spigel_distr<- RealSurvSim(spigel_recon,  spigel_recon$V1, spigel_recon$V2, spigel_recon$V3,reps=3, simul_type = "distr", distrib=c("geometric", "exp","cauchy", "cauchy"))
# seto_cond<- RealSurvSim(seto, seto$V1, seto$V2, seto$V3, reps=100, simul_type = "cond")
# spigel_case<- RealSurvSim(spigel_recon, spigel_recon$V1, spigel_recon$V2, spigel_recon$V3, reps=100, simul_type = "case")

# spigel_recon for arbitrary n
 #arbspigel_kde<- RealSurvSim(spigel_recon, spigel_recon$V1, spigel_recon$V2, spigel_recon$V3, reps=100000000, n = c(40,50),simul_type = "KDE")
# arbspigel_distr<- RealSurvSim(spigel_recon,  spigel_recon$V1, spigel_recon$V2, spigel_recon$V3,reps=10, n = c(40,50),simul_type = "distr",distrib=c("exp", "inverse_gamma","cauchy", "cauchy") )
# arbspigel_cond<- RealSurvSim(spigel_recon, spigel_recon$V1, spigel_recon$V2, spigel_recon$V3, reps=1000000,n = c(40,50), simul_type = "cond")
# arbspigel_case<- RealSurvSim(spigel_recon, spigel_recon$V1, spigel_recon$V2, spigel_recon$V3, reps=1000000,n = c(40,50), simul_type = "case")
# liang_llogis <- RealSurvSim(
#   dat         = liang_recon,
#   col_time    = "V1",
#   col_status  = "V2",
#   col_group   = "V3",
#   reps        = 10,
#   simul_type  = "distr",
#   distribs    = c("llogis","llogis","llogis","llogis")
# )
# liang_gumbel <- RealSurvSim(
#   dat         = liang_recon,
#   col_time    = "V1",
#   col_status  = "V2",
#   col_group   = "V3",
#   reps        = 10,
#   simul_type  = "distr",
#   distribs    = c("gumbel","gumbel","gumbel","gumbel")
# )




# for testing column name formatting
# set.seed(42)
# synthetic_dat <- with(
#   expand.grid(event = 0:1, grp = 0:1, rep = 1:25),
#   data.frame(
#     event = event,
#     grp   = grp,
#     time  = rexp(100, rate = 0.1)
#   )
# )
# table(synthetic_dat$event, synthetic_dat$grp)
# #      grp
# #event  0  1
# #    0 25 25
# #    1 25 25
#
# # 2) Call RealSurvSim on that:
# sim_out <- RealSurvSim(
#   dat         = synthetic_dat,
#   col_time    = "time",     # will be renamed internally to V1
#   col_status  = "event",    # → V2
#   col_group   = "grp",      # → V3
#   reps        = 10,         # 10 simulated datasets
#   random_seed = 123,
#   simul_type  = "case",
#   distribs    = c("exp", "exp", "exp", "exp")
# )
#
# # Check one replicate:
# str(sim_out$datasets[[1]])

