Source code for ktch.datasets._sample_generator
"""Base IO code for small sample datasets"""
# Copyright 2023 Koji Noshita
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import numpy as np
import pandas as pd
from sklearn.utils import check_random_state
[docs]
def make_landmarks_from_reference(
reference: np.ndarray,
n_samples: int = 30,
sd: float = 1.0,
random_state: int | np.random.RandomState | None = None,
allow_collinearity: bool = False,
allow_dup: bool = False,
as_frame: bool = False,
) -> np.ndarray | pd.DataFrame:
"""Generate landmark dataset around a reference configuration.
Parameters
----------
reference : array-like, shape (n_landmarks, n_dim)
Reference configuration.
n_samples : int, default=30
Number of samples to generate.
sd : float, default=1.0
Standard deviation of the Gaussian noise added to the reference.
random_state : int, RandomState instance or None, default=None
Pass an int for reproducible output across multiple function calls.
allow_dup : bool, default=False
If True, allow duplicate configurations in the generated dataset.
as_frame : bool, default=False
If True, return a pandas DataFrame.
Returns
-------
X : array-like, shape (n_samples, n_landmarks, n_dim)
Generated landmark dataset.
"""
ref = np.asarray(reference)
generator = check_random_state(random_state)
rand_size = (n_samples,) + ref.shape
X = ref + sd * generator.standard_normal(rand_size)
if not allow_dup:
X = _remove_duplicated_configurations(X)
while X.shape[0] < n_samples:
rand_size = (n_samples - X.shape[0],) + ref.shape
X_comp = ref + sd * generator.standard_normal(rand_size)
X = np.concatenate([X, X_comp])
X = _remove_duplicated_configurations(X)
if as_frame:
n_landmarks = X.shape[1]
n_dim = X.shape[2]
X = pd.DataFrame(X.reshape(n_samples * n_landmarks, n_dim))
X["id"] = [i for i in range(n_samples) for _ in range(n_landmarks)]
X["coord_id"] = [j for _ in range(n_samples) for j in range(n_landmarks)]
X = X.set_index(["id", "coord_id"])
if n_dim == 2:
X.columns = ["x", "y"]
elif n_dim == 3:
X.columns = ["x", "y", "z"]
else:
raise ValueError("reference must be 2D or 3D.")
return X
def _remove_duplicated_configurations(X):
"""Remove duplicated configurations in a dataset.
The original row order is preserved (first occurrence kept).
"""
_, idx = np.unique(X.reshape(X.shape[0], -1), axis=0, return_index=True)
return X[np.sort(idx)]