"""
Core code for coupling any hydrodynamic simulation software with the main script for GPE surrogate model construction
and Bayesian Active Learning .
Author: Andres Heredia M.Sc.
"""
import os
import csv
import json
import pickle
import pandas as pd
import numpy as np
from abc import ABC, abstractmethod
from datetime import datetime
from hydroBayesCal.utils.config_logging import logger_error
[docs]
class HydroSimulations(ABC):
[docs]
def __init__(
self,
control_file = "control.cas",
model_dir="",
res_dir="",
calibration_pts_file_path="",
n_cpus=int(),
init_runs=1,
calibration_parameters=None,
param_values=None,
calibration_quantities=None,
extraction_quantities=None,
dict_output_name='extraction-data',
user_param_values=False,
max_runs=1,
complete_bal_mode=False,
only_bal_mode=False,
check_inputs=False,
delete_complex_outputs=True,
validation=False,
multitask_selection="variables",
*args,
**kwargs,
):
"""
Constructor of the HydroSimulations class to manage and run hydrodynamic simulations within the context of
Bayesian Calibration using a Gaussian Process Emulator (GPE). The class is designed to handle simulation setup,
execution, and result storage while managing calibration parameters and Bayesian Active Learning (BAL) iterations.
Parameters
----------
control_file : str
Name of the file that controls the full complexity model simulation (default is "control.cas" as an example for Telemac).
model_dir : str
Full complexity model directory where all simulation files (mesh, control file, boundary conditions) are located.
res_dir : str
Directory where a subfolder called "auto-saved-results-HydroBayesCal" will be created to store all the result files. In this directory, the results of the calibration
process will be stored according to the calibration quantity name. Addiionally, subfolders for plots, surrogate models, and restart data will be created.
calibration_pts_file_path : str or optional
File path to the calibration points data file. Please check documentation for further details of the file format.
n_cpus : int
Number of CPUs to be used for parallel processing (if available).
init_runs : int
Initial runs of the full complexity model (before Bayesian Active Learning).
calibration_parameters : list of str
Names of the considered calibration parameters (e.g. roughness coefficients, empirical constants, turbulent viscosity, etc).
param_values : list
Value ranges considered for parameter sampling. Example: [[min1, max1], [min2, max2], ...].
calibration_quantities : list of str
Names of the calibration targets (model outputs) used for calibration. These quantities usually correspond to the measured values for calibration purposes.
Example: ['WATER DEPTH'] for a single quantity.
Example: ['WATER DEPTH', 'SCALAR VELOCITY'] for multiple quantities.
extraction_quantities : list of str
Names of the quantities to be extracted from the model output files. Generally, the same or more than the calibration_quantities. These quantities will be extracted from the model.
Example: ``calibration_quantities = ['WATER DEPTH']`` (WATER DEPTH as calibration parameter).
Example: ``extraction_quantities = ['WATER DEPTH', 'SCALAR VELOCITY', 'TURBULENT ENERG', 'VELOCITY U', 'VELOCITY V']``.
Any of these additional quantities can be used for calibration purposes when restarting the
calibration process with ``only_bal_mode = True``.
dict_output_name : str
Base name for output dictionary files where the outputs are saved as .json files.
This dictionary will be saved in the calibration-data subfolder for the considered calibration target.
parameter_sampling_method : str
Method used for sampling parameter values during the calibration process. The available options are:
- "random" : Random sampling.
- "latin_hypercube" : Latin Hypercube Sampling (LHS).
- "sobol" : Sobol sequence sampling.
- "halton" : Halton sequence sampling.
- "hammersley" : Hammersley sequence sampling.
- "chebyshev(FT)" : Chebyshev nodes (Fourier Transform-based).
- "grid(FT)" : Grid-based sampling (Fourier Transform-based).
- "user" : User-defined sampling.
Example::
parameter_sampling_method = "sobol" # Uses Sobol sequence sampling.
If "user" is selected, a ``.csv`` file containing user-defined collocation points must be provided
in the restart data folder. The file should follow this format::
param1 param2 param3 param4 param5
0.148 0.770 0.014 0.014 0.700
0.066 0.066 0.066 0.066 0.066
max_runs : int
Maximum (total) number of model simulations, including initial runs and Bayesian Active Learning iterations.
complete_bal_mode : bool, optional (Default: True)
- If True: Bayesian Active Learning (BAL) is performed after the initial runs, enabling a complete surrogate‐assisted calibration process.
**This option MUST be selected if you choose to perform only BAL** (i.e., when `only_bal_mode = True`).
- If False: Only the initial runs of the full complexity model are executed, and the model outputs are stored as `.json` files.
only_bal_mode : bool, optional (Default: False)
- If False: The process will either execute a complete surrogate‐assisted calibration or only the initial runs, depending on the value of `complete_bal_mode`.
- If True: Only the surrogate model construction and Bayesian Active Learning of preexisting model outputs at predefined collocation points are performed.
**This mode can be executed only if either a complete process has already been performed** (`complete_bal_mode = True` and `only_bal_mode = True`) **or if only the initial runs have been executed** (`complete_bal_mode = False` and `only_bal_mode = False`).
Shortcut combinations and their corresponding tasks::
complete_bal_mode | only_bal_mode | task
------------------+---------------------------------+-----------------------------------------------------
True | False | Complete surrogate-assisted calibration
False | False | Only initial runs (no surrogate model)
True | True, with init_runs = max_runs | Surrogate construction with predefined runs (no BAL)
True | True, with init_runs > max_runs | Surrogate construction + Bayesian Active Learning
validation : bool, optional (Default: False)
If True, creates output files (inputs and outputs) corresponding to validation process.
*args : tuple, optional
Additional positional arguments.
**kwargs : dict, optional
Additional keyword arguments.
Attributes
----------
asr_dir : str
Directory for auto-saved results.
model_dir : str
Path to the directory containing the model files.
res_dir : str
Path to the directory where results will be stored.
control_file : str
Path to the control file used in the calibration process.
calibration_pts_file_path : str
Path to the file containing the calibration points data.
nproc : int
Number of processors (CPUs) to be used.
param_values : array
Parameter values used in calibration.
calibration_quantities : list
Calibration quantities to be evaluated.
extraction_quantities : list
Quantities extracted from the model during calibration (calibration quantities must be included here).
calibration_parameters : list
Parameters involved in the calibration process.
parameter_sampling_method : str
Method used for sampling parameters during calibration. Options:
- "random"
- "latin_hypercube"
- "sobol"
- "halton"
- "hammersley"
- "chebyshev(FT)"
- "grid(FT)"
- "user" (requires a CSV file with user-defined collocation points in the restart data folder).
init_runs : int
Number of initial runs before surrogate-assisted calibration.
max_runs : int
Maximum number of calibration runs, including Bayesian Active Learning iterations.
complete_bal_mode : bool
If True, enables complete surrogate-assisted calibration with Bayesian Active Learning.
Must be selected if `only_bal_mode = True`.
only_bal_mode : bool
If True, only surrogate model construction and Bayesian Active Learning are performed.
Requires prior execution of either the full calibration process (`complete_bal_mode = True`)
or initial runs (`complete_bal_mode = False`).
delete_complex_outputs : bool
If True, deletes complex model outputs after processing.
validation : bool
If True, the model is run in validation mode.
multitask_selection : bool
If True, enables multitask selection for surrogate modeling.
dict_output_name : str
Name of the output dictionary file. Appends "-validation" if validation mode is enabled.
nloc : int
Number of calibration points where measured data are available.
ndim : int
Number of model parameters used in the calibration.
param_dic : dict
Dictionary containing calibration parameters and their respective ranges.
num_calibration_quantities : int
Number of quantities used for calibration.
num_extraction_quantities : int
Number of additional quantities extracted from the model.
observations : array
Observed values at each calibration point.
measurement_errors : array
Measurement errors associated with each calibration point.
calibration_pts_df : pandas.DataFrame
Contains calibration point information.
Header format::
Point | X | Y | <quantity>_DATA | <quantity>_ERROR | ...
user_collocation_points : array
User-defined collocation points loaded from a CSV file (only applicable when `parameter_sampling_method="user"`).
calibration_folder : str
Directory where calibration data are stored.
restart_data_folder : str
Directory for restart data, used for resuming calibration runs.
model_evaluations : numpy.ndarray
2D array of processed model outputs, shape
``[num_runs, nloc * num_calibration_quantities]``: ``num_runs`` is the
total number of evaluations (initial runs plus BAL iterations) and the
columns interleave the calibration quantities per location. For example,
with two quantities and two locations, columns 1-2 hold the two
quantities at the first location and columns 3-4 the second.
"""
if check_inputs:
self.check_inputs(model_dir=model_dir,
res_dir=res_dir,
control_file=control_file,
init_runs=init_runs,
nproc=n_cpus,
max_runs=max_runs,
calibration_parameters=calibration_parameters,
param_values=param_values,
calibration_quantities=calibration_quantities
)
self.model_dir = model_dir
self.res_dir = res_dir
self.control_file = control_file
self.calibration_pts_file_path = calibration_pts_file_path
self.nproc = n_cpus
self.param_values = param_values
self.calibration_quantities = calibration_quantities
self.extraction_quantities = extraction_quantities
self.calibration_parameters = calibration_parameters
self.init_runs = init_runs
self.max_runs = max_runs
self.complete_bal_mode = complete_bal_mode
self.only_bal_mode = only_bal_mode
self.delete_complex_outputs=delete_complex_outputs
self.validation=validation
self.multitask_selection = multitask_selection
self.user_param_values = user_param_values
if self.validation:
self.dict_output_name = dict_output_name + "-validation"
else:
self.dict_output_name = dict_output_name
self.nloc = None
self.ndim = None
self.param_dic = None
self.num_calibration_quantities = None
self.num_extraction_quantities = None
self.observations = None
self.measurement_errors = None
self.variances = None
self.calibration_pts_df = None
self.user_collocation_points = None
self.restart_collocation_points = None
self.model_evaluations = None
if calibration_parameters:
self.param_dic, self.ndim = self.set_calibration_parameters(calibration_parameters, param_values)
if calibration_pts_file_path:
self.observations,self.variances, self.measurement_errors, self.nloc, self.num_calibration_quantities, self.calibration_pts_df, self.num_extraction_quantities = self.set_observations_and_variances(
calibration_pts_file_path, calibration_quantities, extraction_quantities)
self.asr_dir = os.path.join(res_dir,
f"auto-saved-results-HydroBayesCal")
self.calibration_folder = os.path.join(self.asr_dir,"calibration-data","_".join(self.calibration_quantities))
self.restart_data_folder = os.path.join(self.asr_dir,"restart_data")
if not os.path.exists(self.asr_dir):
os.makedirs(self.asr_dir)
if not os.path.exists(self.calibration_folder) and not self.user_param_values:
os.makedirs(self.calibration_folder)
if not os.path.exists(self.restart_data_folder):
os.makedirs(self.restart_data_folder)
if not os.path.exists(os.path.join(self.asr_dir, "plots")):
os.makedirs(os.path.join(self.asr_dir, "plots"))
if not os.path.exists(os.path.join(self.asr_dir, "surrogate-gpe")):
os.makedirs(os.path.join(self.asr_dir, "surrogate-gpe"))
if self.user_param_values:
collocation_path = os.path.join(self.restart_data_folder, 'user-collocation-points.csv')
self.user_collocation_points = np.loadtxt(collocation_path, delimiter=',', skiprows=1, ndmin=2)
if self.only_bal_mode:
collocation_path = os.path.join(self.restart_data_folder, 'initial-collocation-points.csv')
self.restart_collocation_points = np.loadtxt(collocation_path, delimiter=',', skiprows=1, max_rows=self.init_runs,ndmin=2)
[docs]
@abstractmethod
def run_multiple_simulations(
self,
collocation_points=None,
bal_new_set_parameters=None,
bal_iteration=0,
complete_bal_mode=True,
validation=False,
*args,
**kwargs
):
"""Run the full-complexity model for a set of collocation points (BAL).
Executes multiple hydrodynamic simulations in the context of Bayesian
Active Learning (BAL). A new set of calibration parameters may be added
as an array during BAL iterations.
* If ``complete_bal_mode=True``, the process includes initial runs,
surrogate-model construction and BAL iterations.
* If ``complete_bal_mode=False``, only the initial model runs are
performed.
* If ``validation=True``, a separate set of runs is executed for
validation (e.g. assessing surrogate-model performance).
The number of processors is defined by ``self.nproc`` at initialisation.
Parameters
----------
collocation_points : numpy.ndarray, optional
Array of shape ``[init_runs, n_parameters]`` with the initial
collocation points for the iterative runs. ``None`` during the BAL
phase.
bal_new_set_parameters : numpy.ndarray, optional
Array of shape ``[1, n_parameters]`` with the new parameter set for
a BAL iteration. ``None`` during the initial runs.
bal_iteration : int, optional
BAL iteration number (default ``0``).
complete_bal_mode : bool, optional
``True`` (default) to run the full process (initial runs, surrogate
construction and BAL); ``False`` for initial runs only.
validation : bool, optional
``True`` to run a separate set of validation simulations.
*args, **kwargs
Binding-specific options (e.g. Telemac's ``output_extraction``,
``output_extraction_time`` and ``n``). See the concrete subclass.
Returns
-------
numpy.ndarray
2D array of processed model outputs, shape
``[num_runs, nloc * num_calibration_quantities]``, where ``num_runs``
is the total number of evaluations (initial runs plus BAL
iterations) and the columns interleave the calibration quantities
per location. For example, with two quantities and two locations,
columns 1-2 hold the two quantities at the first location and
columns 3-4 the second.
"""
[docs]
def run_single_simulation(
self,
control_file="control_file.hydro",
):
"""
Executes a single model run using a specified script or launcher file.
This method is intended to handle the execution of a single simulation for various models
(e.g., Telemac, OpenFOAM, Basement) by calling the appropriate launcher script.
Parameters
----------
control_file : str
The name of the control file used to launch the simulation. Defaults to "control_file.hydro" as an example.
This file should be present in the appropriate directory and executable through a terminal.
Returns
-------
None
The method executes the model run using a launcher command.
"""
raise NotImplementedError(
f"{type(self).__name__} does not implement run_single_simulation()."
)
[docs]
def set_observations_and_variances(
self,
calibration_pts_file_path,
calibration_quantities,
extraction_quantities,
gpe_error=0.10,
measurement_error=0.10):
"""
Reads calibration point data and constructs observation variances.
Total variance is computed as::
variance = measurement_error**2 + gpe_error**2 + site_specific_error**2
where:
- ``measurement_error`` is assigned as a percentage of the measured value.
- ``gpe_error`` is assigned as a percentage of the measured value.
- ``site_specific_error`` is read from ``<quantity>_ERROR`` columns and should
already be in the physical units of the corresponding calibration quantity.
"""
calibration_pts_df = pd.read_csv(calibration_pts_file_path)
# Resolve the required <quantity>_DATA / <quantity>_ERROR columns
# case-insensitively, so both the Telemac (e.g. "WATER DEPTH_DATA") and
# OpenFOAM (e.g. "U_x_DATA") naming conventions work regardless of the
# exact case used in the CSV header.
column_lookup = {col.lower(): col for col in calibration_pts_df.columns}
observation_columns = []
error_columns = []
missing_columns = []
for quantity in calibration_quantities:
obs_actual = column_lookup.get(f"{quantity}_DATA".lower())
err_actual = column_lookup.get(f"{quantity}_ERROR".lower())
if obs_actual is not None:
observation_columns.append(obs_actual)
else:
missing_columns.append(f"{quantity}_DATA")
if err_actual is not None:
error_columns.append(err_actual)
else:
missing_columns.append(f"{quantity}_ERROR")
if missing_columns:
raise ValueError(
f"Missing required columns in calibration file: {missing_columns}"
)
observations_2d = calibration_pts_df[observation_columns].to_numpy(dtype=float)
site_specific_errors_2d = calibration_pts_df[error_columns].to_numpy(dtype=float)
abs_observations_2d = np.abs(observations_2d)
measurement_errors_2d = abs_observations_2d * measurement_error
gpe_errors_2d = abs_observations_2d * gpe_error
observations = observations_2d.flatten().reshape(1, -1)
measurement_errors = measurement_errors_2d.flatten()
gpe_errors = gpe_errors_2d.flatten()
site_specific_errors = site_specific_errors_2d.flatten()
variances = (
measurement_errors ** 2
+ gpe_errors ** 2
+ site_specific_errors ** 2
)
n_loc = len(calibration_pts_df)
n_calib_quantities = len(calibration_quantities)
if extraction_quantities is None:
n_extraction_quantities = 0
else:
n_extraction_quantities = len(extraction_quantities)
return (
observations,
variances,
measurement_errors,
n_loc,
n_calib_quantities,
calibration_pts_df,
n_extraction_quantities
)
[docs]
@staticmethod
def read_data(results_folder, file_name):
"""
Reads and extracts data from various file types based on the provided file name.
The function supports file types such as .csv, .json, .txt, .pkl, and .pickle.
Parameters
----------
results_folder : str
The base directory where the results files are stored.
file_name : str
The name of the file, including its extension (e.g., 'data.csv', 'output.json').
Returns
-------
data : object
The extracted data, which can be a DataFrame, dictionary, list, or other object depending on the file type.
Returns None if the file type is unsupported or an error occurs while reading the file.
"""
file_path = os.path.join(results_folder, file_name)
try:
# Determine the file extension
file_extension = os.path.splitext(file_name)[1]
if file_extension == '.csv':
data = pd.read_csv(file_path).to_numpy()
elif file_extension == '.json':
with open(file_path, 'r') as file:
data = json.load(file)
elif file_extension == '.txt':
with open(file_path, 'r') as file:
data = file.readlines()
elif file_extension in ['.pkl', '.pickle']:
with open(file_path, 'rb') as file:
data = pickle.load(file)
else:
raise ValueError(
"Unsupported file type. Only .csv, .json, .txt, .pkl, and .pickle files are supported.")
return data
except Exception as e:
print(f"An error occurred while reading the file: {e}")
return None
[docs]
def set_calibration_parameters(self, params, values):
"""
Create a dictionary from calibration parameters and their value ranges if both params and values exist.
If only one of them exists, compute the number of dimensions.
:param params: List of parameter names.
:param values: List of value ranges corresponding to the parameter names.
:return: Dictionary with parameter names as keys and value ranges as values, and the number of dimensions.
:raises ValueError: If the number of parameters does not match the number of values when both are provided.
"""
# Initialize default return values
param_dict = None
ndim = 0
# Check if both params and values exist
if params and values:
if len(params) != len(values):
logger_error.error("Mismatch between the number of parameters (%d) and values (%d)", len(params),
len(values))
raise ValueError("The number of parameters and values must be the same.")
# Create the dictionary
param_dict = dict(zip(params, values))
ndim = len(params)
# If only one of params or values exists, compute ndim
elif params or values:
ndim = len(params) if params else len(values)
# Return the dictionary and number of dimensions
return param_dict, ndim
[docs]
def update_model_controls(
self,
collocation_point_values,
calibration_parameters,
auxiliary_file_path,
simulation_id=0,
):
"""
Updates the model control files for Bayesian calibration.Incorporates new parameter values, ensuring that the model runs
with the specified settings during Bayesian calibration.
Parameters
----------
collocation_point_values : array
Contains values for the calibration parameters. These values are used
to update the model control files.
calibration_parameters : list of str
Calibration parameter names that are to be updated in the model control files. Each
string in the list should correspond to a parameter used in the model.
auxiliary_file_path : str
Path to an auxiliary file that may be required for running the model controls (i.e., .tbl file in Telemac).
simulation_id : int
An optional identifier for the simulation. The default is 0. This ID can be used to distinguish
different simulations or runs.
Returns
-------
None
This method does not return any value. It modifies the model control files.
"""
raise NotImplementedError(
f"{type(self).__name__} does not implement update_model_controls()."
)
[docs]
@abstractmethod
def output_processing(
self,
output_data="",
delete_complex_outputs=False,
validation=False,
*args,
**kwargs
):
"""
Extract data from a file(.txt,json,etc) containing model outputs to 2D array ready to use in Bayesian calibration
and saves the results to a CSV file.
Parameters
----------
output_data_path : str
Path to the file (.json) containing the model outputs. The file should be structured
such that its keys correspond to calibration points, and its values are lists of nested dictionaries having the
output values for each run and quantity/ies.
delete_complex_outputs: Boolean, Default: False
Delete complex model output files from the results folder (e.g. auto-saved-results-HydroBayesCal/<variable>).
Recommended when running several simulations of the full complexity model.
validation: Boolean, Default: False
If True, new files for collocation points and model results are created. This is done to keep
the collocation points and model results obtained during the calibration process.
Returns
-------
model_results : numpy.ndarray
A 2D array containing the processed model outputs. The shape of the array is
[No. of total runs, No. of calibration points x No. of quantities], where 'No. of quantities'
is the number of calibration quantities being processed, and 'No. of total runs' is the sum
of initial runs and Bayesian active learning iterations. The array is also saved to a CSV file
in the specified directory.
"""