# Source code for ktch.io._chc

"""Chain code file I/O functions."""

# Copyright 2025 Koji Noshita
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import numpy.typing as npt
import pandas as pd


@dataclass
class ChainCodeData:
    """Chain code record for a single 2D contour.

    Chain codes represent 2D contours using directional codes from 0 to 7:

        3 2 1
        4 * 0
        5 6 7

    Parameters
    ----------
    sample_name : str
        Sample name.
    x : float
        X coordinate. Added to every x value returned by ``to_numpy``,
        so it is the x position of the first contour point.
    y : float
        Y coordinate. Added to every y value returned by ``to_numpy``,
        so it is the y position of the first contour point.
    area_per_pixel : float
        Area (mm2) per pixel. Its square root is used as the length of
        one pixel step when converting to coordinates.
    area_pixels : int
        Area in pixels.
    chain_code : np.ndarray
        Chain code sequence with values from 0 to 7 representing directions.
        Non-array input is converted with ``np.array``.
    validate : bool, default=True
        If True, validate that chain code values are between 0 and 7.
    """

    sample_name: str
    x: float
    y: float
    area_per_pixel: float
    area_pixels: int
    chain_code: np.ndarray
    validate: bool = True

    def __post_init__(self) -> None:
        """Coerce ``chain_code`` to an array and optionally check its range.

        Raises
        ------
        ValueError
            If ``validate`` is True and any value lies outside 0-7.
        """
        if not isinstance(self.chain_code, np.ndarray):
            self.chain_code = np.array(self.chain_code)

        if not self.validate:
            return

        in_range = (self.chain_code >= 0) & (self.chain_code <= 7)
        if not np.all(in_range):
            # Report everything that failed the range test (this also
            # surfaces NaN, which satisfies neither "< 0" nor "> 7").
            invalid_values = self.chain_code[~in_range]
            raise ValueError(
                f"Chain code contains invalid values: {invalid_values}. "
                f"Values must be between 0 and 7 (inclusive)."
            )

    def get_chain_code(self) -> np.ndarray:
        """Get the raw chain code as a numpy array.

        Returns
        -------
        chain_code : np.ndarray
            The stored chain code array (not a copy).
        """
        return self.chain_code

    def to_numpy(self) -> np.ndarray:
        """Convert chain code to 2D coordinates as a numpy array.

        The contour is traced in pixel units from the origin by
        accumulating one unit step per code, scaled by
        ``sqrt(area_per_pixel)``, and finally shifted by ``(x, y)``.
        The y axis points downwards: code 2 ("up") decreases y.

        Chain codes represent 2D contours using directional codes from 0 to 7:

            3 2 1
            4 * 0
            5 6 7

        Codes outside 0-7 (possible only when ``validate`` is False) are
        clamped to the nearest valid direction.

        Returns
        -------
        coords : np.ndarray
            2D coordinates with shape (n + 1, 2) where n is the number of
            chain codes. The first column is the x-coordinate and the
            second column is the y-coordinate.
        """
        directions = np.array(
            [
                [1, 0],  # 0: right
                [1, -1],  # 1: up-right
                [0, -1],  # 2: up
                [-1, -1],  # 3: up-left
                [-1, 0],  # 4: left
                [-1, 1],  # 5: down-left
                [0, 1],  # 6: down
                [1, 1],  # 7: down-right
            ]
        )

        # Clamp first, then cast: the cast makes empty / float-typed arrays
        # usable as indices into the direction table.
        codes = np.clip(self.chain_code, 0, 7).astype(int)

        coords = np.zeros((len(codes) + 1, 2))
        # Row 0 stays at the origin; every later row is the running sum
        # of the unit steps taken so far.
        coords[1:] = np.cumsum(directions[codes], axis=0)

        coords *= np.sqrt(self.area_per_pixel)
        coords[:, 0] += self.x
        coords[:, 1] += self.y

        return coords

    def to_dataframe(self) -> pd.DataFrame:
        """Convert chain code to 2D coordinates as a pandas DataFrame.

        Coordinates are computed exactly as in ``to_numpy``.

        Chain codes represent 2D contours using directional codes from 0 to 7:

            3 2 1
            4 * 0
            5 6 7

        Returns
        -------
        df : pd.DataFrame
            DataFrame with x and y columns for the coordinates and chain_code
            column for the direction codes. The first point has chain_code=-1
            since it has no direction. The index is a MultiIndex with levels
            ``specimen_id`` (the sample name) and ``coord_id`` (0..n).
        """
        coords = self.to_numpy()
        n_points = len(coords)

        # -1 marks the starting point, which was not reached by any step.
        chain_code_values = np.full(n_points, -1, dtype=int)
        chain_code_values[1:] = self.chain_code

        index = pd.MultiIndex.from_tuples(
            [(self.sample_name, i) for i in range(n_points)],
            names=["specimen_id", "coord_id"],
        )

        return pd.DataFrame(
            {
                "x": coords[:, 0],
                "y": coords[:, 1],
                "chain_code": chain_code_values,
            },
            index=index,
        )


def read_chc(file_path, as_frame=False, validate=True, as_coordinates=True):
    """Read chain code (.chc) file.

    Chain codes represent 2D contours using directional codes from 0 to 7:

        3 2 1
        4 * 0
        5 6 7

    The chain code file format is:

        [Sample name] [X] [Y] [Area (mm2) per pixel] [Area (pixels)] [Chain code] -1

    Parameters
    ----------
    file_path : str
        Path to the chain code file.
    as_frame : bool, default=False
        If True, return pandas.DataFrame. Otherwise, return numpy.ndarray.
        When True, ``as_coordinates`` is ignored: the frame always holds
        both the coordinates and the chain code values.
    validate : bool, default=True
        If True, validate that chain code values are between 0 and 7.
        Set to False to skip validation for legacy files that may contain
        other values.
    as_coordinates : bool, default=True
        If True, convert chain codes to 2D coordinates.
        If False, return the raw chain code values.

    Returns
    -------
    chain_codes : np.ndarray, list of np.ndarray, or pd.DataFrame
        Chain codes or coordinates. With ``as_frame=False`` a file holding
        exactly one record yields a single array; otherwise a list with one
        array per record (empty list for a file without records). With
        ``as_frame=True`` a single DataFrame covering all records.

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    ValueError
        If the file suffix is not ``.chc``.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"{path} does not exist.")
    if path.suffix != ".chc":
        raise ValueError(f"{path} is not a chain code file.")

    chc_data_list = _read_chc(path, validate=validate)

    if as_frame:
        frames = [chc_data.to_dataframe() for chc_data in chc_data_list]
        if not frames:
            # pd.concat raises on an empty sequence; return an empty frame
            # with the usual columns and index names instead, mirroring
            # the empty list returned by the array path.
            return pd.DataFrame(
                {
                    "x": np.array([], dtype=float),
                    "y": np.array([], dtype=float),
                    "chain_code": np.array([], dtype=int),
                },
                index=pd.MultiIndex.from_arrays(
                    [[], []], names=["specimen_id", "coord_id"]
                ),
            )
        if len(frames) == 1:
            return frames[0]
        return pd.concat(frames)

    if as_coordinates:
        arrays = [chc_data.to_numpy() for chc_data in chc_data_list]
    else:
        arrays = [chc_data.get_chain_code() for chc_data in chc_data_list]

    # A single-record file is unwrapped for convenience.
    return arrays[0] if len(arrays) == 1 else arrays


def write_chc(
    file_path,
    chain_codes,
    sample_names=None,
    xs=None,
    ys=None,
    area_per_pixels=None,
    area_pixels_values=None,
    validate=True,
):
    """Write chain code to .chc file.

    Chain codes represent 2D contours using directional codes from 0 to 7:

        3 2 1
        4 * 0
        5 6 7

    The chain code file format is:

        [Sample name] [X] [Y] [Area (mm2) per pixel] [Area (pixels)] [Chain code] -1

    Parameters
    ----------
    file_path : str
        Path to the chain code file.
    chain_codes : list of np.ndarray or np.ndarray
        Chain codes with values from 0 to 7 representing directions.
        A 1D array is one sample; a 2D array is one sample per row.
        Items of a list/tuple are converted with ``np.asarray``.
    sample_names : list of str or str, optional
        Sample names. Defaults to ``"Sample"`` for every sample.
    xs : list of float or float, optional
        X coordinates. Defaults to 0.
    ys : list of float or float, optional
        Y coordinates. Defaults to 0.
    area_per_pixels : list of float or float, optional
        Area (mm2) per pixel. Defaults to 1.0.
    area_pixels_values : list of int or int, optional
        Area in pixels. Defaults to the length of each chain code.
    validate : bool, default=True
        If True, validate that chain code values are between 0 and 7.
        Set to False to skip validation for legacy files that may contain
        other values.

    Raises
    ------
    ValueError
        If ``chain_codes`` has an unsupported type or dimensionality, or if
        ``validate`` is True and a chain code contains values outside 0-7.
    """
    path = Path(file_path)

    if isinstance(chain_codes, np.ndarray):
        if chain_codes.ndim == 1:
            chain_codes = [chain_codes]
        elif chain_codes.ndim == 2:
            chain_codes = list(chain_codes)  # one row per sample
        else:
            raise ValueError("chain_codes must be a 1D or 2D array.")
    elif isinstance(chain_codes, (list, tuple)):
        # Normalise every item so plain Python sequences work as well.
        chain_codes = [np.asarray(code) for code in chain_codes]
    else:
        raise ValueError("chain_codes must be a list of numpy arrays or a numpy array.")

    n_samples = len(chain_codes)

    sample_names = _broadcast_per_sample(sample_names, n_samples, "Sample")
    xs = _broadcast_per_sample(xs, n_samples, 0)
    ys = _broadcast_per_sample(ys, n_samples, 0)
    area_per_pixels = _broadcast_per_sample(area_per_pixels, n_samples, 1.0)
    if area_pixels_values is None:
        area_pixels_values = [len(code) for code in chain_codes]
    else:
        area_pixels_values = _broadcast_per_sample(area_pixels_values, n_samples, 0)

    # Range validation is performed by ChainCodeData itself (it raises
    # ValueError with the offending values when validate is True), so all
    # records are checked before anything is written to disk.
    # Indexing (rather than zip) keeps a too-short metadata list an error
    # instead of silently dropping samples.
    chc_data_list = [
        ChainCodeData(
            sample_name=sample_names[i],
            x=xs[i],
            y=ys[i],
            area_per_pixel=area_per_pixels[i],
            area_pixels=area_pixels_values[i],
            chain_code=chain_codes[i],
            validate=validate,
        )
        for i in range(n_samples)
    ]

    _write_chc(path, chc_data_list)


def _broadcast_per_sample(value, n_samples, default):
    """Expand a per-sample argument to something indexable by sample.

    ``None`` becomes ``default`` repeated ``n_samples`` times, a scalar
    (Python or NumPy scalar, or a string) is repeated ``n_samples`` times,
    and any other value is returned unchanged.
    """
    if value is None:
        return [default] * n_samples
    if np.ndim(value) == 0:
        return [value] * n_samples
    return value


def _read_chc(file_path, validate=True):
    """Read chain code records from a file.

    Every non-blank line is parsed as whitespace-separated fields:

        [Sample name] [X] [Y] [Area (mm2) per pixel] [Area (pixels)] [Chain code] -1

    Chain code values are read up to the first ``-1`` that follows the five
    header fields; if no terminator is present, the rest of the line is used.

    Parameters
    ----------
    file_path : str or Path
        Path to the chain code file.
    validate : bool, default=True
        If True, validate that chain code values are between 0 and 7.

    Returns
    -------
    chc_data_list : list of ChainCodeData
        Chain code data, one item per non-blank line.

    Raises
    ------
    ValueError
        If a numeric field cannot be parsed, or if ``validate`` is True and
        a chain code contains values outside 0-7.
    """
    chc_data_list = []

    with open(file_path, "r") as f:
        for line in f:
            # split() without a separator tolerates tabs and repeated
            # spaces and yields [] for blank lines.
            parts = line.split()
            if not parts:
                continue

            sample_name = parts[0]
            x = float(parts[1])
            y = float(parts[2])
            area_per_pixel = float(parts[3])
            area_pixels = int(parts[4])

            # Look for the terminator only among the chain code fields so
            # that a header value of -1 (e.g. X or Y) is not mistaken for it.
            codes = parts[5:]
            if "-1" in codes:
                codes = codes[: codes.index("-1")]
            chain_code = np.array(codes, dtype=int)

            chc_data_list.append(
                ChainCodeData(
                    sample_name=sample_name,
                    x=x,
                    y=y,
                    area_per_pixel=area_per_pixel,
                    area_pixels=area_pixels,
                    chain_code=chain_code,
                    # Let the record skip its own range check instead of
                    # patching the class at runtime.
                    validate=validate,
                )
            )

    return chc_data_list


def _write_chc(file_path, chc_data_list):
    """Write chain code data to a file.

    One line is written per record: the five header fields, the chain code
    values and a terminating ``-1``, separated by single spaces.

    Parameters
    ----------
    file_path : str or Path
        Path to the chain code file.
    chc_data_list : list of ChainCodeData
        Chain code data.
    """
    with open(file_path, "w") as f:
        for chc_data in chc_data_list:
            header = (
                f"{chc_data.sample_name} {chc_data.x} {chc_data.y} "
                f"{chc_data.area_per_pixel} {chc_data.area_pixels}"
            )
            codes = " ".join(map(str, chc_data.chain_code.tolist()))
            # Skip the (empty) code field for an empty chain so the line
            # never contains a double space.
            fields = [header, codes, "-1"] if codes else [header, "-1"]
            f.write(" ".join(fields) + "\n")