Source code for hydroBayesCal.hysim

"""
Core code for coupling any hydrodynamic simulation software with the main script for GPE surrogate model construction
and Bayesian Active Learning .

Author: Andres Heredia M.Sc.

"""
import os
import csv
import json
import pickle
import pandas as pd
import numpy as np
from abc import ABC, abstractmethod
from datetime import datetime

from hydroBayesCal.utils.config_logging import logger_error



[docs]
class HydroSimulations(ABC):

[docs]
    def __init__(
            self,
            control_file = "control.cas",
            model_dir="",
            res_dir="",
            calibration_pts_file_path="",
            n_cpus=int(),
            init_runs=1,
            calibration_parameters=None,
            param_values=None,
            calibration_quantities=None,
            extraction_quantities=None,
            dict_output_name='extraction-data',
            user_param_values=False,
            max_runs=1,
            complete_bal_mode=False,
            only_bal_mode=False,
            check_inputs=False,
            delete_complex_outputs=True,
            validation=False,
            multitask_selection="variables",
            *args,
            **kwargs,
    ):
        """
        Constructor of the HydroSimulations class to manage and run hydrodynamic simulations within the context of
        Bayesian Calibration using a Gaussian Process Emulator (GPE). The class is designed to handle simulation setup,
        execution, and result storage while managing calibration parameters and Bayesian Active Learning (BAL) iterations.

        Parameters
        ----------
        control_file : str
            Name of the file that controls the full complexity model simulation (default is "control.cas" as an example for Telemac).
        model_dir : str
            Full complexity model directory where all simulation files (mesh, control file, boundary conditions) are located.
        res_dir : str
            Directory where a subfolder called "auto-saved-results-HydroBayesCal" will be created to store all the result files. In this directory, the results of the calibration
            process will be stored according to the calibration quantity name. Addiionally, subfolders for plots, surrogate models, and restart data will be created.
        calibration_pts_file_path : str or optional
            File path to the calibration points data file. Please check documentation for further details of the file format.
        n_cpus : int
            Number of CPUs to be used for parallel processing (if available).
        init_runs : int
            Initial runs of the full complexity model (before Bayesian Active Learning).
        calibration_parameters : list of str
            Names of the considered calibration parameters (e.g. roughness coefficients, empirical constants, turbulent viscosity, etc).
        param_values : list
            Value ranges considered for parameter sampling. Example: [[min1, max1], [min2, max2], ...].
        calibration_quantities : list of str
            Names of the calibration targets (model outputs) used for calibration. These quantities usually correspond to the measured values for calibration purposes.
            Example: ['WATER DEPTH'] for a single quantity.
            Example: ['WATER DEPTH', 'SCALAR VELOCITY'] for multiple quantities.
        extraction_quantities : list of str
            Names of the quantities to be extracted from the model output files. Generally, the same or more than the calibration_quantities. These quantities will be extracted from the model.
            Example: ``calibration_quantities = ['WATER DEPTH']`` (WATER DEPTH as calibration parameter).
            Example: ``extraction_quantities = ['WATER DEPTH', 'SCALAR VELOCITY', 'TURBULENT ENERG', 'VELOCITY U', 'VELOCITY V']``.
            Any of these additional quantities can be used for calibration purposes when restarting the
            calibration process with ``only_bal_mode = True``.
        dict_output_name : str
            Base name for output dictionary files where the outputs are saved as .json files.
            This dictionary will be saved in the calibration-data subfolder for the considered calibration target.
        parameter_sampling_method : str
            Method used for sampling parameter values during the calibration process. The available options are:
            - "random"           : Random sampling.
            - "latin_hypercube"  : Latin Hypercube Sampling (LHS).
            - "sobol"            : Sobol sequence sampling.
            - "halton"           : Halton sequence sampling.
            - "hammersley"       : Hammersley sequence sampling.
            - "chebyshev(FT)"    : Chebyshev nodes (Fourier Transform-based).
            - "grid(FT)"         : Grid-based sampling (Fourier Transform-based).
            - "user"             : User-defined sampling.

            Example::

                parameter_sampling_method = "sobol"  # Uses Sobol sequence sampling.

            If "user" is selected, a ``.csv`` file containing user-defined collocation points must be provided
            in the restart data folder. The file should follow this format::

                param1    param2    param3    param4    param5
                0.148     0.770     0.014     0.014     0.700
                0.066     0.066     0.066     0.066     0.066

        max_runs : int
            Maximum (total) number of model simulations, including initial runs and Bayesian Active Learning iterations.
        complete_bal_mode : bool, optional (Default: True)
            - If True: Bayesian Active Learning (BAL) is performed after the initial runs, enabling a complete surrogate‐assisted calibration process.
              **This option MUST be selected if you choose to perform only BAL** (i.e., when `only_bal_mode = True`).
            - If False: Only the initial runs of the full complexity model are executed, and the model outputs are stored as `.json` files.

        only_bal_mode : bool, optional (Default: False)
            - If False: The process will either execute a complete surrogate‐assisted calibration or only the initial runs, depending on the value of `complete_bal_mode`.
            - If True: Only the surrogate model construction and Bayesian Active Learning of preexisting model outputs at predefined collocation points are performed.
              **This mode can be executed only if either a complete process has already been performed** (`complete_bal_mode = True` and `only_bal_mode = True`) **or if only the initial runs have been executed** (`complete_bal_mode = False` and `only_bal_mode = False`).

        Shortcut combinations and their corresponding tasks::

            complete_bal_mode | only_bal_mode                   | task
            ------------------+---------------------------------+-----------------------------------------------------
            True              | False                           | Complete surrogate-assisted calibration
            False             | False                           | Only initial runs (no surrogate model)
            True              | True, with init_runs = max_runs | Surrogate construction with predefined runs (no BAL)
            True              | True, with init_runs > max_runs | Surrogate construction + Bayesian Active Learning
        validation : bool, optional (Default: False)
            If True, creates output files (inputs and outputs) corresponding to validation process.
        *args : tuple, optional
            Additional positional arguments.
        **kwargs : dict, optional
            Additional keyword arguments.

        Attributes
        ----------
        asr_dir : str
            Directory for auto-saved results.
        model_dir : str
            Path to the directory containing the model files.
        res_dir : str
            Path to the directory where results will be stored.
        control_file : str
            Path to the control file used in the calibration process.
        calibration_pts_file_path : str
            Path to the file containing the calibration points data.
        nproc : int
            Number of processors (CPUs) to be used.
        param_values : array
            Parameter values used in calibration.
        calibration_quantities : list
            Calibration quantities to be evaluated.
        extraction_quantities : list
           Quantities extracted from the model during calibration (calibration quantities must be included here).
        calibration_parameters : list
            Parameters involved in the calibration process.
        parameter_sampling_method : str
            Method used for sampling parameters during calibration. Options:
            - "random"
            - "latin_hypercube"
            - "sobol"
            - "halton"
            - "hammersley"
            - "chebyshev(FT)"
            - "grid(FT)"
            - "user" (requires a CSV file with user-defined collocation points in the restart data folder).
        init_runs : int
            Number of initial runs before surrogate-assisted calibration.
        max_runs : int
            Maximum number of calibration runs, including Bayesian Active Learning iterations.
        complete_bal_mode : bool
            If True, enables complete surrogate-assisted calibration with Bayesian Active Learning.
            Must be selected if `only_bal_mode = True`.
        only_bal_mode : bool
            If True, only surrogate model construction and Bayesian Active Learning are performed.
            Requires prior execution of either the full calibration process (`complete_bal_mode = True`)
            or initial runs (`complete_bal_mode = False`).
        delete_complex_outputs : bool
            If True, deletes complex model outputs after processing.
        validation : bool
            If True, the model is run in validation mode.
        multitask_selection : bool
            If True, enables multitask selection for surrogate modeling.
        dict_output_name : str
            Name of the output dictionary file. Appends "-validation" if validation mode is enabled.
        nloc : int
            Number of calibration points where measured data are available.
        ndim : int
            Number of model parameters used in the calibration.
        param_dic : dict
            Dictionary containing calibration parameters and their respective ranges.
        num_calibration_quantities : int
            Number of quantities used for calibration.
        num_extraction_quantities : int
            Number of additional quantities extracted from the model.
        observations : array
            Observed values at each calibration point.
        measurement_errors : array
            Measurement errors associated with each calibration point.
        calibration_pts_df : pandas.DataFrame
            Contains calibration point information.
            Header format::

                Point | X | Y | <quantity>_DATA | <quantity>_ERROR | ...
        user_collocation_points : array
            User-defined collocation points loaded from a CSV file (only applicable when `parameter_sampling_method="user"`).
        calibration_folder : str
            Directory where calibration data are stored.
        restart_data_folder : str
            Directory for restart data, used for resuming calibration runs.
        model_evaluations : numpy.ndarray
            2D array of processed model outputs, shape
            ``[num_runs, nloc * num_calibration_quantities]``: ``num_runs`` is the
            total number of evaluations (initial runs plus BAL iterations) and the
            columns interleave the calibration quantities per location. For example,
            with two quantities and two locations, columns 1-2 hold the two
            quantities at the first location and columns 3-4 the second.
        """
        if check_inputs:
            self.check_inputs(model_dir=model_dir,
                              res_dir=res_dir,
                              control_file=control_file,
                              init_runs=init_runs,
                              nproc=n_cpus,
                              max_runs=max_runs,
                              calibration_parameters=calibration_parameters,
                              param_values=param_values,
                              calibration_quantities=calibration_quantities
                              )
        self.model_dir = model_dir
        self.res_dir = res_dir
        self.control_file = control_file
        self.calibration_pts_file_path = calibration_pts_file_path
        self.nproc = n_cpus
        self.param_values = param_values
        self.calibration_quantities = calibration_quantities
        self.extraction_quantities = extraction_quantities
        self.calibration_parameters = calibration_parameters
        self.init_runs = init_runs
        self.max_runs = max_runs
        self.complete_bal_mode = complete_bal_mode
        self.only_bal_mode = only_bal_mode
        self.delete_complex_outputs=delete_complex_outputs
        self.validation=validation
        self.multitask_selection = multitask_selection
        self.user_param_values = user_param_values
        if self.validation:
            self.dict_output_name = dict_output_name + "-validation"
        else:
            self.dict_output_name = dict_output_name
        self.nloc = None
        self.ndim = None
        self.param_dic = None
        self.num_calibration_quantities = None
        self.num_extraction_quantities = None
        self.observations = None
        self.measurement_errors = None
        self.variances = None
        self.calibration_pts_df = None
        self.user_collocation_points = None
        self.restart_collocation_points = None
        self.model_evaluations = None


        if calibration_parameters:
            self.param_dic, self.ndim = self.set_calibration_parameters(calibration_parameters, param_values)
        if calibration_pts_file_path:
            self.observations,self.variances, self.measurement_errors, self.nloc, self.num_calibration_quantities, self.calibration_pts_df, self.num_extraction_quantities = self.set_observations_and_variances(
                calibration_pts_file_path, calibration_quantities, extraction_quantities)

        self.asr_dir = os.path.join(res_dir,
                                    f"auto-saved-results-HydroBayesCal")
        self.calibration_folder = os.path.join(self.asr_dir,"calibration-data","_".join(self.calibration_quantities))
        self.restart_data_folder = os.path.join(self.asr_dir,"restart_data")
        if not os.path.exists(self.asr_dir):
            os.makedirs(self.asr_dir)
        if not os.path.exists(self.calibration_folder) and not self.user_param_values:
            os.makedirs(self.calibration_folder)
        if not os.path.exists(self.restart_data_folder):
            os.makedirs(self.restart_data_folder)
        if not os.path.exists(os.path.join(self.asr_dir, "plots")):
            os.makedirs(os.path.join(self.asr_dir, "plots"))
        if not os.path.exists(os.path.join(self.asr_dir, "surrogate-gpe")):
            os.makedirs(os.path.join(self.asr_dir, "surrogate-gpe"))
        if self.user_param_values:
            collocation_path = os.path.join(self.restart_data_folder, 'user-collocation-points.csv')
            self.user_collocation_points = np.loadtxt(collocation_path, delimiter=',', skiprows=1, ndmin=2)
        if self.only_bal_mode:
            collocation_path = os.path.join(self.restart_data_folder, 'initial-collocation-points.csv')
            self.restart_collocation_points = np.loadtxt(collocation_path, delimiter=',', skiprows=1, max_rows=self.init_runs,ndmin=2)



[docs]
    def extract_data_point(
            self,
            input_file,
            calibration_pts_df,
            output_name,
            extraction_quantity,
            simulation_number,
            model_directory,
            results_folder_directory,
            *args,
            **kwargs
    ):
        """
        Extract data from a specified coordinate in a hydrodynamic model output file.

        This generic method is designed for use with various hydrodynamic models
        (e.g., Telemac, OpenFOAM, etc.). It extracts data from an input file
        based on a provided CSV file containing the coordinates of the target points.

        Parameters
        ----------
        input_file : str
            Path to the hydrodynamic model output file from which data will be extracted.
        calibration_pts_df : pd.DataFrame
            Contains the coordinates of the points where data extraction is required.
            It must include:
            - Point descriptions (e.g., "P1").
            - X and Y coordinates of the measurement points.
            - Measured values and errors for the calibration quantities.

            Expected columns:
            - For a single calibration quantity: `['Point Name', 'X', 'Y', 'Measured Value', 'Measured Error']`
            - For two calibration quantities: `['Point Name', 'X', 'Y', 'Measured Value 1', 'Measured Error 1', 'Measured Value 2', 'Measured Error 2']`
        output_name : str
            Base name for the output file where extracted data will be stored.
        extraction_quantity : list of str
            List of variables or quantities to be extracted.
            Example: extraction_quantities=["WATER DEPTH", "SCALAR VELOCITY", "TURBULENT ENERG"]
        simulation_number : int
            The current simulation number, used to manage and organize data extraction (e.g. simulation number).
        model_directory : str
            Path to the directory containing the model output files.
        results_folder_directory : str
            Path to the directory where the extracted data will be saved.
        *args :
            Additional positional arguments defining specific extraction criteria, such as
            data indices or custom processing parameters.
        **kwargs :
            Additional keyword arguments for flexible data extraction criteria, such as:
            - `time`: Specific time step for extraction.
            - `location`: Specific coordinate or region of interest.
            - `variable_name`: Name of the variable to extract.
            - Any other model-specific parameters required for data extraction.

        Returns
        -------
        None
            The extracted data is saved to output files in the specified results directory.
        """
        raise NotImplementedError(
            f"{type(self).__name__} does not implement extract_data_point()."
        )



[docs]
    @abstractmethod
    def run_multiple_simulations(
            self,
            collocation_points=None,
            bal_new_set_parameters=None,
            bal_iteration=0,
            complete_bal_mode=True,
            validation=False,
            *args,
            **kwargs
    ):
        """Run the full-complexity model for a set of collocation points (BAL).

        Executes multiple hydrodynamic simulations in the context of Bayesian
        Active Learning (BAL). A new set of calibration parameters may be added
        as an array during BAL iterations.

        * If ``complete_bal_mode=True``, the process includes initial runs,
          surrogate-model construction and BAL iterations.
        * If ``complete_bal_mode=False``, only the initial model runs are
          performed.
        * If ``validation=True``, a separate set of runs is executed for
          validation (e.g. assessing surrogate-model performance).

        The number of processors is defined by ``self.nproc`` at initialisation.

        Parameters
        ----------
        collocation_points : numpy.ndarray, optional
            Array of shape ``[init_runs, n_parameters]`` with the initial
            collocation points for the iterative runs. ``None`` during the BAL
            phase.
        bal_new_set_parameters : numpy.ndarray, optional
            Array of shape ``[1, n_parameters]`` with the new parameter set for
            a BAL iteration. ``None`` during the initial runs.
        bal_iteration : int, optional
            BAL iteration number (default ``0``).
        complete_bal_mode : bool, optional
            ``True`` (default) to run the full process (initial runs, surrogate
            construction and BAL); ``False`` for initial runs only.
        validation : bool, optional
            ``True`` to run a separate set of validation simulations.
        *args, **kwargs
            Binding-specific options (e.g. Telemac's ``output_extraction``,
            ``output_extraction_time`` and ``n``). See the concrete subclass.

        Returns
        -------
        numpy.ndarray
            2D array of processed model outputs, shape
            ``[num_runs, nloc * num_calibration_quantities]``, where ``num_runs``
            is the total number of evaluations (initial runs plus BAL
            iterations) and the columns interleave the calibration quantities
            per location. For example, with two quantities and two locations,
            columns 1-2 hold the two quantities at the first location and
            columns 3-4 the second.
        """



[docs]
    def run_single_simulation(
            self,
            control_file="control_file.hydro",
    ):
        """
        Executes a single model run using a specified script or launcher file.

        This method is intended to handle the execution of a single simulation for various models
        (e.g., Telemac, OpenFOAM, Basement) by calling the appropriate launcher script.

        Parameters
        ----------
        control_file : str
            The name of the control file used to launch the simulation. Defaults to "control_file.hydro" as an example.
            This file should be present in the appropriate directory and executable through a terminal.

        Returns
        -------
        None
            The method executes the model run using a launcher command.
        """
        raise NotImplementedError(
            f"{type(self).__name__} does not implement run_single_simulation()."
        )



[docs]
    def set_observations_and_variances(
            self,
            calibration_pts_file_path,
            calibration_quantities,
            extraction_quantities,
            gpe_error=0.10,
            measurement_error=0.10):
        """
        Reads calibration point data and constructs observation variances.

        Total variance is computed as::

            variance = measurement_error**2 + gpe_error**2 + site_specific_error**2

        where:

        - ``measurement_error`` is assigned as a percentage of the measured value.
        - ``gpe_error`` is assigned as a percentage of the measured value.
        - ``site_specific_error`` is read from ``<quantity>_ERROR`` columns and should
          already be in the physical units of the corresponding calibration quantity.
        """

        calibration_pts_df = pd.read_csv(calibration_pts_file_path)

        # Resolve the required <quantity>_DATA / <quantity>_ERROR columns
        # case-insensitively, so both the Telemac (e.g. "WATER DEPTH_DATA") and
        # OpenFOAM (e.g. "U_x_DATA") naming conventions work regardless of the
        # exact case used in the CSV header.
        column_lookup = {col.lower(): col for col in calibration_pts_df.columns}

        observation_columns = []
        error_columns = []
        missing_columns = []

        for quantity in calibration_quantities:
            obs_actual = column_lookup.get(f"{quantity}_DATA".lower())
            err_actual = column_lookup.get(f"{quantity}_ERROR".lower())
            if obs_actual is not None:
                observation_columns.append(obs_actual)
            else:
                missing_columns.append(f"{quantity}_DATA")
            if err_actual is not None:
                error_columns.append(err_actual)
            else:
                missing_columns.append(f"{quantity}_ERROR")

        if missing_columns:
            raise ValueError(
                f"Missing required columns in calibration file: {missing_columns}"
            )

        observations_2d = calibration_pts_df[observation_columns].to_numpy(dtype=float)

        site_specific_errors_2d = calibration_pts_df[error_columns].to_numpy(dtype=float)

        abs_observations_2d = np.abs(observations_2d)

        measurement_errors_2d = abs_observations_2d * measurement_error
        gpe_errors_2d = abs_observations_2d * gpe_error

        observations = observations_2d.flatten().reshape(1, -1)

        measurement_errors = measurement_errors_2d.flatten()
        gpe_errors = gpe_errors_2d.flatten()
        site_specific_errors = site_specific_errors_2d.flatten()

        variances = (
                measurement_errors ** 2
                + gpe_errors ** 2
                + site_specific_errors ** 2
        )

        n_loc = len(calibration_pts_df)
        n_calib_quantities = len(calibration_quantities)

        if extraction_quantities is None:
            n_extraction_quantities = 0
        else:
            n_extraction_quantities = len(extraction_quantities)

        return (
            observations,
            variances,
            measurement_errors,
            n_loc,
            n_calib_quantities,
            calibration_pts_df,
            n_extraction_quantities
        )




[docs]
    @staticmethod
    def read_data(results_folder, file_name):
        """
        Reads and extracts data from various file types based on the provided file name.

        The function supports file types such as .csv, .json, .txt, .pkl, and .pickle.

        Parameters
        ----------
        results_folder : str
            The base directory where the results files are stored.
        file_name : str
            The name of the file, including its extension (e.g., 'data.csv', 'output.json').

        Returns
        -------
        data : object
            The extracted data, which can be a DataFrame, dictionary, list, or other object depending on the file type.
            Returns None if the file type is unsupported or an error occurs while reading the file.
        """
        file_path = os.path.join(results_folder, file_name)

        try:
            # Determine the file extension
            file_extension = os.path.splitext(file_name)[1]

            if file_extension == '.csv':
                data = pd.read_csv(file_path).to_numpy()
            elif file_extension == '.json':
                with open(file_path, 'r') as file:
                    data = json.load(file)
            elif file_extension == '.txt':
                with open(file_path, 'r') as file:
                    data = file.readlines()
            elif file_extension in ['.pkl', '.pickle']:
                with open(file_path, 'rb') as file:
                    data = pickle.load(file)
            else:
                raise ValueError(
                    "Unsupported file type. Only .csv, .json, .txt, .pkl, and .pickle files are supported.")
            return data

        except Exception as e:
            print(f"An error occurred while reading the file: {e}")
            return None



[docs]
    def set_calibration_parameters(self, params, values):
        """
        Create a dictionary from calibration parameters and their value ranges if both params and values exist.
        If only one of them exists, compute the number of dimensions.

        :param params: List of parameter names.
        :param values: List of value ranges corresponding to the parameter names.
        :return: Dictionary with parameter names as keys and value ranges as values, and the number of dimensions.
        :raises ValueError: If the number of parameters does not match the number of values when both are provided.
        """
        # Initialize default return values
        param_dict = None
        ndim = 0

        # Check if both params and values exist
        if params and values:
            if len(params) != len(values):
                logger_error.error("Mismatch between the number of parameters (%d) and values (%d)", len(params),
                                   len(values))
                raise ValueError("The number of parameters and values must be the same.")

            # Create the dictionary
            param_dict = dict(zip(params, values))
            ndim = len(params)

        # If only one of params or values exists, compute ndim
        elif params or values:
            ndim = len(params) if params else len(values)

        # Return the dictionary and number of dimensions
        return param_dict, ndim



[docs]
    def update_model_controls(
            self,
            collocation_point_values,
            calibration_parameters,
            auxiliary_file_path,
            simulation_id=0,
    ):
        """
        Updates the model control files for Bayesian calibration.Incorporates new parameter values, ensuring that the model runs
        with the specified settings during Bayesian calibration.

        Parameters
        ----------
        collocation_point_values : array
            Contains values for the calibration parameters. These values are used
            to update the model control files.

        calibration_parameters : list of str
            Calibration parameter names that are to be updated in the model control files. Each
            string in the list should correspond to a parameter used in the model.

        auxiliary_file_path : str
            Path to an auxiliary file that may be required for running the model controls (i.e., .tbl file in Telemac).

        simulation_id : int
            An optional identifier for the simulation. The default is 0. This ID can be used to distinguish
            different simulations or runs.

        Returns
        -------
        None
            This method does not return any value. It modifies the model control files.
        """
        raise NotImplementedError(
            f"{type(self).__name__} does not implement update_model_controls()."
        )



[docs]
    @abstractmethod
    def output_processing(
            self,
            output_data="",
            delete_complex_outputs=False,
            validation=False,
            *args,
            **kwargs
    ):
        """
        Extract data from a file(.txt,json,etc) containing model outputs to 2D array ready to use in Bayesian calibration
        and saves the results to a CSV file.

        Parameters
        ----------
        output_data_path : str
            Path to the file (.json) containing the model outputs. The file should be structured
            such that its keys correspond to calibration points, and its values are lists of nested dictionaries having the
            output values for each run and quantity/ies.
        delete_complex_outputs: Boolean, Default: False
            Delete complex model output files from the results folder (e.g. auto-saved-results-HydroBayesCal/<variable>).
            Recommended when running several simulations of the full complexity model.
        validation: Boolean, Default: False
            If True, new files for collocation points and model results are created. This is done to keep
            the collocation points and model results obtained during the calibration process.

        Returns
        -------
        model_results : numpy.ndarray
            A 2D array containing the processed model outputs. The shape of the array is
            [No. of total runs, No. of calibration points x No. of quantities], where 'No. of quantities'
            is the number of calibration quantities being processed, and 'No. of total runs' is the sum
            of initial runs and Bayesian active learning iterations. The array is also saved to a CSV file
            in the specified directory.
        """