Skip to content

era5

canari_ml.data.masks.era5

Module to mask out the northern/southern hemisphere

canari_ml.data.masks.era5.MaskDatasetConfig(downloaded_files=None, identifier='masks', variable_name=None, reference_era5_file=None, base_weight=1.0, region_weights=None, weight_smoothing_sigma=10.0, **kwargs)

Bases: DatasetConfig

Configuration class for generating ERA5 mask datasets.

Inherits from download_toolbox.interface.DatasetConfig and extends it to handle hemisphere-specific masks.

Attributes:

Name Type Description
variable_name

Name of the variable to process. Defaults to None.

reference_era5_file

Path to reference ERA5 file for mask generation. Defaults to None.

Notes: Based on MaskDatasetConfig class from the IceNet library. https://github.com/icenet-ai/icenet/blob/6caa234907904bfa76b8724d8c83cd989230494a/icenet/data/masks/osisaf.py

Parameters:

Name Type Description Default
downloaded_files optional

List of downloaded files. Defaults to None.

None
identifier optional

Identifier for this dataset configuration. Defaults to "masks".

'masks'
variable_name None | str

Name of the ERA5 variable to process. Must be specified. Defaults to None.

None
reference_era5_file None | str

Path to the reference ERA5 file. Must be specified. Defaults to None.

None
**kwargs Unpack

Additional keyword arguments passed to super class.

{}

Raises:

Type Description
ValueError

If either variable_name or reference_era5_file is None.

NotImplementedError

If location is neither north nor south.

Source code in src/canari_ml/data/masks/era5.py
def __init__(
    self,
    downloaded_files: list | None = None,
    identifier: str = "masks",
    variable_name: None | str = None,
    reference_era5_file: None | str = None,
    base_weight: float = 1.0,
    region_weights: tuple | list | None = None,
    weight_smoothing_sigma: float = 10.0,
    **kwargs: Unpack,
):
    """Set up a mask dataset configuration for one hemisphere.

    The parent class is initialised first with the two mask variables
    ("hemisphere" and "weighted_regions"); the location it resolves is then
    checked, followed by the ERA5 reference arguments.

    Args:
        downloaded_files (optional): List of downloaded files. Not read by
            this initialiser. Defaults to None.
        identifier (optional): Identifier for this dataset configuration.
            Defaults to "masks".
        variable_name: Name of the ERA5 variable to process. Must be specified.
            Defaults to None.
        reference_era5_file: Path to the reference ERA5 file. Must be specified.
            Defaults to None.
        base_weight (optional): Base weight for regions, stored as
            `_base_weight`. Defaults to 1.0.
        region_weights (optional): Weights for different regions, stored as
            `_region_weights`. Defaults to None.
        weight_smoothing_sigma (optional): Sigma value for smoothing weights,
            stored as `_weight_smoothing_sigma`. Defaults to 10.0.
        **kwargs: Additional keyword arguments passed to super class.

    Raises:
        NotImplementedError: If location is neither north nor south.
        ValueError: If either variable_name or reference_era5_file is None.
    """
    super().__init__(
        identifier=identifier,
        levels=[None, None],
        path_components=[],
        var_names=["hemisphere", "weighted_regions"],
        **kwargs,
    )

    # Only the two polar hemispheres are supported; custom locations are not.
    if not (self.location.north or self.location.south):
        raise NotImplementedError("Location must be north or south, not custom")

    if self.location.north:
        self._hemi_str = "nh"
    else:
        self._hemi_str = "sh"

    # Both reference inputs are needed, so reject the call if either is absent.
    missing_reference = variable_name is None or reference_era5_file is None
    if missing_reference:
        raise ValueError(
            "Reference ERA5 variable name and corresponding reference file "
            "must be specified"
        )

    self.variable_name = variable_name
    self.reference_era5_file = reference_era5_file

    # Region weighting settings are kept on the instance for later use.
    self._base_weight = base_weight
    self._region_weights = region_weights
    self._weight_smoothing_sigma = weight_smoothing_sigma

canari_ml.data.masks.era5.MaskDatasetConfig.variable_name = variable_name instance-attribute

canari_ml.data.masks.era5.MaskDatasetConfig.reference_era5_file = reference_era5_file instance-attribute

canari_ml.data.masks.era5.MaskDatasetConfig.config property

Get the configuration object.

If not already created, initialises a Configuration object with the location name.

Returns:

Type Description
dict

The dataset configuration object.

canari_ml.data.masks.era5.MaskDatasetConfig.save_data_for_config(rename_var_list=None, source_ds=None, source_files=None, time_dim_values=None, var_filter_list=None, **kwargs)

Save data for the current configuration.

Processes each variable configuration and generates corresponding files.

Parameters:

Name Type Description Default
rename_var_list dict

Dictionary mapping old to new variable names. Defaults to None.

None
source_ds object

Source dataset. Defaults to None.

None
source_files list

List of source files. Defaults to None.

None
time_dim_values list

Time dimension values. Defaults to None.

None
var_filter_list list

List of variables to filter. Defaults to None.

None
**kwargs Unpack

Additional keyword arguments.

{}
Source code in src/canari_ml/data/masks/era5.py
def save_data_for_config(
    self,
    rename_var_list: dict = None,
    source_ds: object = None,
    source_files: list = None,
    time_dim_values: list = None,
    var_filter_list: list = None,
    **kwargs: Unpack,
) -> None:
    """Save data for the current configuration.

    Processes each variable configuration and generates corresponding files.

    Args:
        rename_var_list: Dictionary mapping old to new variable names.
            Defaults to None.
        source_ds: Source dataset.
            Defaults to None.
        source_files: List of source files.
            Defaults to None.
        time_dim_values: Time dimension values.
            Defaults to None.
        var_filter_list: List of variables to filter.
            Defaults to None.
        **kwargs: Additional keyword arguments.
    """
    for var_config in self.variables:
        files = getattr(self, "_generate_{}".format(var_config.name))()
        self.var_files[var_config.name] = files

canari_ml.data.masks.era5.MaskDatasetConfig.get_config(config_funcs=None, strip_keys=None)

Get the configuration object with specified keys removed.

Parameters:

Name Type Description Default
config_funcs dict

Dictionary of configuration functions. Defaults to None.

None
strip_keys list

List of keys to remove from the configuration. Defaults to None.

None

Returns:

Type Description
dict

The modified configuration object.

Source code in src/canari_ml/data/masks/era5.py
def get_config(self, config_funcs: dict = None, strip_keys: list = None) -> dict:
    """Return the parent configuration with a fixed set of keys stripped.

    Neither argument is forwarded: the parent implementation is always
    called with the same hard-coded `strip_keys` list and no `config_funcs`.

    Args:
        config_funcs: Dictionary of configuration functions. Not used here.
            Defaults to None.
        strip_keys: List of keys to remove from the configuration. Not used
            here. Defaults to None.

    Returns:
        The configuration object produced by the super class.
    """
    keys_to_strip = ["_identifier", "_levels", "_path_components", "_var_names"]
    return super().get_config(strip_keys=keys_to_strip)

canari_ml.data.masks.era5.Masks(dataset_config, *args, absolute_vars=None, identifier=None, base_weight=1.0, region_weights=None, weight_smoothing_sigma=10.0, mask_dataset_config_path=None, mask_config_path=None, **kwargs)

Bases: Processor

A Processor class for generating and applying hemisphere-specific masks.

Inherits from preprocess_toolbox.processor.Processor to handle mask generation and data processing, particularly for ERA5 datasets. This class manages the creation of northern or southern hemisphere masks based on configuration settings.

Attributes:

Name Type Description
_dataset_config DatasetConfig

Configuration object containing dataset parameters, including location, variables, and file paths.

abs_vars list

List of variables treated as absolute in processing.

_hemi_str str

'north' or 'south', indicating which hemisphere is being processed.

_region tuple

Slice/slices defining the region to apply masking.

Notes

Based on Masks class from the IceNet library. https://github.com/icenet-ai/icenet/blob/6caa234907904bfa76b8724d8c83cd989230494a/icenet/data/masks/osisaf.py

Parameters:

Name Type Description Default
dataset_config DatasetConfig

Configuration object for the dataset.

required
*args Unpack

Additional positional arguments. Accepted, but not forwarded to the super class.

()
absolute_vars optional

Variables treated as absolute. Defaults to None.

None
identifier optional

Identifier for processing. Defaults to None.

None
base_weight optional

Base weight for regions. Defaults to 1.0.

1.0
region_weights optional

Weights for different regions. Defaults to None.

None
weight_smoothing_sigma optional

Sigma value for smoothing weights. Defaults to 10.0.

10.0
mask_dataset_config_path optional

Path for dataset config file. Defaults to None.

None
**kwargs Unpack

Additional keyword arguments passed to super class.

{}
Source code in src/canari_ml/data/masks/era5.py
def __init__(
    self,
    dataset_config: DatasetConfig,
    *args: Unpack,
    absolute_vars: list | None = None,
    identifier: str | None = None,
    base_weight: float = 1.0,
    region_weights: tuple | list | None = None,
    weight_smoothing_sigma: float = 10.0,
    mask_dataset_config_path: str | None = None,  # passed via `--mask-dataset-config-path`
    mask_config_path: str | None = None,  # passed via `--mask-config-path`
    **kwargs: Unpack,
):
    """Initialise the Masks processor with configuration settings.

    A `MaskDatasetConfig` is built from `dataset_config` (base path,
    frequency, location and the first variable/file as ERA5 reference), its
    data and configuration are saved, and the super class is then initialised
    with that mask dataset configuration.

    Args:
        dataset_config: Configuration object for the dataset.
        *args: Additional positional arguments. Accepted but not forwarded
            to the super class.
        absolute_vars (optional): Variables treated as absolute. Currently
            not read: `["hemisphere", "weighted_regions"]` is always used.
            Defaults to None.
        identifier (optional): Identifier for processing. Currently not read:
            `"masks.<hemisphere>"` is always used.
            Defaults to None.
        base_weight (optional): Base weight for regions.
            Defaults to 1.0.
        region_weights (optional): Weights for different regions.
            Defaults to None.
        weight_smoothing_sigma (optional): Sigma value for smoothing weights.
            Defaults to 10.0.
        mask_dataset_config_path (optional): Path for dataset config file.
            Its parent directory is created when it does not exist.
            Defaults to None.
        mask_config_path (optional): Path for the processor config file,
            forwarded to the super class as `config_path` when given.
            Defaults to None.
        **kwargs: Additional keyword arguments passed to super class.

    Raises:
        ValueError: If `dataset_config` lists no files for its first variable.
    """

    # Use first ERA5 variable available to get a netcdf reference
    variable_name = next(iter(dataset_config.variables)).name

    # Use first ERA5 file available from this variable as reference file.
    # Fail with a clear message instead of an opaque TypeError/IndexError
    # when the variable has no files recorded.
    reference_files = dataset_config.var_files.get(variable_name)
    if reference_files is None or len(reference_files) == 0:
        raise ValueError(
            "No files found for variable '{}' to use as ERA5 reference".format(
                variable_name
            )
        )
    reference_era5_file = reference_files[0]

    mask_ds_kwargs: dict = dict(
        base_path=dataset_config.base_path,
        frequency=dataset_config.frequency,
        location=dataset_config.location,
        variable_name=variable_name,
        reference_era5_file=reference_era5_file,
        base_weight=base_weight,
        region_weights=region_weights,
        weight_smoothing_sigma=weight_smoothing_sigma,
    )

    if mask_dataset_config_path:
        mask_ds_kwargs |= dict(config_path=mask_dataset_config_path)
        dir_path = os.path.dirname(mask_dataset_config_path)
        # A bare filename has an empty dirname, and os.makedirs("") raises.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)

    mask_ds = MaskDatasetConfig(**mask_ds_kwargs)
    mask_ds.save_data_for_config()
    self._dataset_config = mask_ds.save_config()
    self._hemi_str = "north" if dataset_config.location.north else "south"

    processor_kwargs: dict = dict(
        dataset_config=mask_ds,
        absolute_vars=["hemisphere", "weighted_regions"],
        # dtype=np.dtype(bool), # Removed since "weighted_regions" is a float,
                                # dealt with in `generate_sample` func.
        identifier="masks.{}".format(self._hemi_str),
    )

    if mask_config_path:
        processor_kwargs |= dict(config_path=mask_config_path)

    super().__init__(
        **processor_kwargs,
        **kwargs,
    )

    self._source_files = mask_ds.var_files.copy()
    # Default region selects the whole mask on both axes.
    self._region = (slice(None, None), slice(None, None))

canari_ml.data.masks.era5.Masks.region property writable

Get the current mask region.

Returns:

Type Description
tuple

The current region slices used for masking.

canari_ml.data.masks.era5.Masks.hemisphere_filename property

Get the filename for the hemisphere mask.

Returns:

Type Description
str

Path to the hemisphere mask file.

canari_ml.data.masks.era5.Masks.weighted_regions_filename property

Get the filename for the weighted regions.

Returns:

Type Description
str

Path to the weighted regions file.

canari_ml.data.masks.era5.Masks.get_config(config_funcs=None, strip_keys=None)

Retrieve the configuration dictionary for the processor.

Parameters:

Name Type Description Default
config_funcs optional

Dictionary of functions to modify config. Defaults to None.

None
strip_keys optional

Keys to remove from the config. Defaults to None.

None

Returns:

Name Type Description
dict dict

Configuration dictionary containing module and class implementation, absolute variables, dataset configuration, path, processed files, and source files.

Source code in src/canari_ml/data/masks/era5.py
def get_config(self, config_funcs: dict = None, strip_keys: list = None) -> dict:
    """Retrieve the configuration dictionary for the processor.

    Args:
        config_funcs (optional): Dictionary of functions to modify config.
            Defaults to None.
        strip_keys (optional): Keys to remove from the config.
            Defaults to None.

    Returns:
        dict: Configuration dictionary containing module and class implementation,
            absolute variables, dataset configuration, path, processed files,
            and source files.
    """
    return {
        "implementation": "{}:{}".format(self.__module__, self.__class__.__name__),
        "absolute_vars": self.abs_vars,
        "dataset_config": self._dataset_config,
        "path": self.path,
        "processed_files": self._processed_files,
        "source_files": self._source_files,
    }

canari_ml.data.masks.era5.Masks.process()

Generate and save the hemisphere mask based on the configured region.

Source code in src/canari_ml/data/masks/era5.py
def process(self):
    """Convert the generated mask arrays into processed files and save the config.

    For "hemisphere" and then "weighted_regions", the source array is read
    with `np.load`, wrapped in an `xr.DataArray` with dims ("y", "x"), and
    written through `save_processed_file` (overwriting any existing file)
    under the basename of the matching `*_filename` property.
    """
    outputs = (
        ("hemisphere", "Mask of hemisphere"),
        ("weighted_regions", "Weighted regions"),
    )

    for var_name, description in outputs:
        values = np.load(self._source_files[var_name])
        data_array = xr.DataArray(
            data=values,
            dims=["y", "x"],
            attrs={"description": description},
        )
        # Resolves to self.hemisphere_filename / self.weighted_regions_filename
        target_path = getattr(self, f"{var_name}_filename")
        self.save_processed_file(
            var_name,
            os.path.basename(target_path),
            data_array,
            overwrite=True,
        )

    self.save_config()

canari_ml.data.masks.era5.Masks.hemisphere(*args, **kwargs)

Return the hemisphere mask as an xr.DataArray.

Parameters:

Name Type Description Default
*args Unpack
()
**kwargs Unpack
{}

Returns:

Type Description
DataArray

xr.DataArray: The hemisphere mask loaded from the specified file.

Source code in src/canari_ml/data/masks/era5.py
def hemisphere(self, *args: Unpack, **kwargs: Unpack) -> np.ndarray:
    """Return the hemisphere mask restricted to the current region.

    Args:
        *args: Accepted but not used.
        **kwargs: Accepted but not used.

    Returns:
        The data array stored in `self.hemisphere_filename`, indexed with
        `self._region`.
    """
    # Open as a context manager so the underlying file is closed once the
    # values have been read, instead of leaking a handle on every call.
    with xr.open_dataarray(self.hemisphere_filename) as da:
        return da.data[self._region]

canari_ml.data.masks.era5.Masks.weighted_regions(*args, **kwargs)

Return the weighted regions as an xr.DataArray.

Parameters:

Name Type Description Default
*args Unpack
()
**kwargs Unpack
{}

Returns:

Type Description
DataArray

xr.DataArray: The weighted regions loaded from the specified file.

Source code in src/canari_ml/data/masks/era5.py
def weighted_regions(self, *args: Unpack, **kwargs: Unpack) -> np.ndarray:
    """Return the weighted regions restricted to the current region.

    Args:
        *args: Accepted but not used.
        **kwargs: Accepted but not used.

    Returns:
        The data array stored in `self.weighted_regions_filename`, indexed
        with `self._region`.
    """
    # Open as a context manager so the underlying file is closed once the
    # values have been read, instead of leaking a handle on every call.
    with xr.open_dataarray(self.weighted_regions_filename) as da:
        return da.data[self._region]

canari_ml.data.masks.era5.Masks.get_blank_mask()

Returns an empty boolean mask for the configured region.

Returns:

Type Description
array

A boolean array of shape matching the hemisphere mask, initialised to False for the pre-defined self._region.

Source code in src/canari_ml/data/masks/era5.py
def get_blank_mask(self) -> np.ndarray:
    """Returns an empty boolean mask for the configured region.

    Returns:
        A boolean array with the same shape as `self.hemisphere()`,
            initialised to `False`.
    """
    # `hemisphere()` already returns data indexed by `self._region`, so its
    # shape is the region's shape. Indexing the blank array with the region a
    # second time would crop it again and yield an empty or truncated array
    # for any region that does not start at the origin.
    shape = self.hemisphere().shape
    return np.full(shape, False)

canari_ml.data.masks.era5.Masks.reset_region()

Resets the mask region to cover the entire dataset.

Source code in src/canari_ml/data/masks/era5.py
def reset_region(self):
    """Restore the region so that the complete mask is selected on both axes."""
    everything = slice(None, None)
    logging.info("Mask region reset, whole mask will be returned")
    self._region = (everything, everything)

canari_ml.data.masks.era5.RegionWeightAction

Bases: Action

Custom argparse action for handling region weights with their respective boundaries.

This action expects 5 values: lat_min, lat_max, lon_min, lon_max, and weight. It supports passing these values as a comma-separated or space-separated string. If the number of provided values is not 5, an error is raised. All values must be numeric.

This action accumulates region weights in the namespace object under the region_weights attribute, allowing multiple regions to be specified by calling the flag repeatedly.

canari_ml.data.masks.era5.WeightsArgParser(*args, **kwargs)

Bases: MetaArgParser

Argument parser for handling region weights.

This class extends `MetaArgParser` and adds arguments related to managing region weights. It supports specifying a base weight, individual region weights, and smoothing of the weights using a Gaussian kernel with a given sigma.

Source code in src/canari_ml/data/masks/era5.py
def __init__(self, *args: Unpack, **kwargs: Unpack):
    """Initialise the parser by delegating entirely to the super class.

    Args:
        *args: Positional arguments passed unchanged to the super class.
        **kwargs: Keyword arguments passed unchanged to the super class.
    """
    super().__init__(*args, **kwargs)

canari_ml.data.masks.era5.WeightsArgParser.add_region_weights()

Source code in src/canari_ml/data/masks/era5.py
def add_region_weights(self):
    """Register the region weighting options on this parser.

    Adds `--base-weight`, `--region-weights` (handled by
    `RegionWeightAction`, repeatable) and `--weight-smoothing-sigma`.

    Returns:
        This parser, so that calls can be chained.
    """
    self.add_argument(
        "--base-weight",
        type=float,
        # Previously also `required=True`, which made this default
        # unreachable. The option is now optional and falls back to 1.0.
        default=1.0,
        help="Base weight (float)",
    )
    self.add_argument(
        "--region-weights",
        nargs="+",
        action=RegionWeightAction,
        metavar="REGION",
        help="Specify a region and its weight. Can be used multiple times, but, weights must sum to 1.0.",
        default=[],
    )
    self.add_argument(
        "--weight-smoothing-sigma",
        type=float,
        # argparse does not apply `type` to non-string defaults, so use a
        # float literal to get the same type with or without the flag.
        default=10.0,
        help="Sigma for Gaussian smoothing of region weights (float)",
    )
    return self

canari_ml.data.masks.era5.get_channel_info_from_processor(cfg_segment)

Retrieves channel-specific information from a processor based on the given configuration segment.

This function uses `WeightsArgParser` to parse arguments related to channels and region weights. It then retrieves the appropriate implementation for the processor, dataset configuration, and initializes the processor with the parsed arguments. The processor is used to process data and obtain channel-specific information, which is then stored in a configuration file under the specified segment.

Note

Based on code from preprocess-toolbox: https://github.com/environmental-forecasting/preprocess-toolbox/blob/35f57eecd8017fae0bf1c7a4a4ca80ca77e905d4/preprocess_toolbox/loader/cli.py#L131-L151

Parameters:

Name Type Description Default
cfg_segment str

The configuration segment under which to store the channel-specific information.

required

Raises:

Type Description
RuntimeError

If the --config-path argument is provided, as it is invalid for this CLI endpoint.

Source code in src/canari_ml/data/masks/era5.py
def _unknown_args_to_kwargs(unknown_args: list) -> dict:
    """Convert leftover `--flag value` / `--flag=value` CLI tokens into kwargs."""
    tokens = list(unknown_args)
    extra_kwargs = {}
    idx = 0
    while idx < len(tokens):
        token = tokens[idx]
        idx += 1
        if not token.startswith("--") or token == "--":
            # A value with no preceding option cannot be mapped to a keyword.
            logging.warning("Ignoring unexpected argument: %s", token)
            continue
        key, has_inline_value, value = token[2:].partition("=")
        if not has_inline_value:
            if idx < len(tokens) and not tokens[idx].startswith("--"):
                # The option's value is the next token.
                value = tokens[idx]
                idx += 1
            else:
                # Bare switch with no value.
                value = True
        # `--mask-config-path` -> `mask_config_path`
        extra_kwargs[key.replace("-", "_")] = value
    return extra_kwargs


def get_channel_info_from_processor(cfg_segment: str):
    """
    Retrieves channel-specific information from a processor based on the given configuration segment.

    This function uses :class:`WeightsArgParser` to parse arguments related to channels and region weights.
    It then retrieves the appropriate implementation for the processor, dataset configuration, and initializes
    the processor with the parsed arguments. The processor is used to process data and obtain channel-specific
    information, which is then stored in a configuration file under the specified segment.

    Command line options the parser does not recognise are converted to keyword arguments
    (`--some-option value` becomes `some_option="value"`) and passed to the processor.

    Note:
        Based on code from `preprocess-toolbox`:
        https://github.com/environmental-forecasting/preprocess-toolbox/blob/35f57eecd8017fae0bf1c7a4a4ca80ca77e905d4/preprocess_toolbox/loader/cli.py#L131-L151

    Args:
        cfg_segment (str): The configuration segment under which to store the channel-specific information.

    Raises:
        RuntimeError: If the `--config-path` argument is provided, as it is invalid for this CLI endpoint.
    """
    parser = WeightsArgParser(base_path="processed").add_channel().add_region_weights()
    args, unknown_args = parser.parse_known_args()

    proc_impl = get_implementation(args.implementation)
    ds_config = get_dataset_config_implementation(args.ground_truth_dataset)

    if args.config is not None:
        # FIXME: args.config contains the location of the dataset config on render, but
        #   this is not part of this pattern! DS is either ground truth or in derived class,
        #   but this library doesn't care or know of it respectively.
        raise RuntimeError("--config-path is invalid for this CLI endpoint, sorry...")

    # Each region entry ends with its weight; base plus regions should total 1.0.
    total_weight = args.base_weight + sum(
        w[-1] for w in getattr(args, "region_weights", [])
    )
    if not abs(total_weight - 1.0) < 1e-6:
        # NOTE(review): this only logs and processing continues - confirm
        # whether an invalid total should abort instead.
        logging.error(f"Total weight must sum to 1.0, not {total_weight:.3f}")

    impl_args = (
        ds_config,
        [
            args.channel_name,
        ],
        args.channel_name,
    )

    impl_kwargs = {
        "base_path": args.destination_path,
        "base_weight": args.base_weight,
        "region_weights": args.region_weights,
        "weight_smoothing_sigma": args.weight_smoothing_sigma,
    }
    if unknown_args:
        # `parse_known_args` returns the leftovers as a flat list of strings,
        # which cannot be merged into a dict directly (`dict |= list[str]`
        # raises ValueError), so convert them to keyword arguments first.
        impl_kwargs |= _unknown_args_to_kwargs(unknown_args)

    processor = proc_impl(*impl_args, **impl_kwargs)
    processor.process()
    update_config(
        get_config_filename(args),
        cfg_segment,
        {args.channel_name: processor.get_config()},
    )

canari_ml.data.masks.era5.add_region_weights()

Entry point for region weighted processing.

Source code in src/canari_ml/data/masks/era5.py
def add_region_weights():
    """Entry point for region weighted processing.

    Calls `get_channel_info_from_processor` with the "masks" configuration
    segment. Takes no arguments and returns nothing.
    """
    get_channel_info_from_processor("masks")