Clusterer

Implementation of Self-Organizing Map.

`SOM(n_columns=5, n_rows=5, initialcodebook=None, kerneltype=0, maptype='planar', gridtype='rectangular', compactsupport=True, neighborhood='gaussian', std_coeff=0.5, random_state=None, verbose=0)`

Bases: BaseEstimator, ClusterMixin

Class to fit and visualize a Self-Organizing Map (SOM).

The implementation uses SOM from Somoclu. Read more in the [user_guide].

Parameters:

Name	Type	Description	Default
`n_columns`	`int`	The number of columns in the map.	`5`
`n_rows`	`int`	The number of rows in the map.	`5`
`initialcodebook`	`ArrayLike \| str \| None`	Define the codebook to start the training. If `initialcodebook='pca'` then the codebook is initialized from the subspace spanned by the first two eigenvectors of the correlation matrix. If an array is given, it is used as the initial codebook. If `None`, the codebook is initialized randomly.	`None`
`kerneltype`	`int`	Specify which kernel to use. If `kerneltype=0` use dense CPU kernel. Else if `kerneltype=1` use dense GPU kernel if compiled with it.	`0`
`maptype`	`str`	Specify the map topology. If `maptype='planar'` use planar map. Else if `maptype='toroid'` use toroid map.	`'planar'`
`gridtype`	`str`	Specify the grid form of the nodes. If `gridtype='rectangular'` use rectangular neurons. Else if `gridtype='hexagonal'` use hexagonal neurons.	`'rectangular'`
`compactsupport`	`bool`	Cut off map updates beyond the training radius with the Gaussian neighborhood.	`True`
`neighborhood`	`str`	Specify the neighborhood. If `neighborhood='gaussian'` use Gaussian neighborhood. Else if `neighborhood='bubble'` use bubble neighborhood function.	`'gaussian'`
`std_coeff`	`float`	Set the coefficient in the Gaussian neighborhood :math:`exp(-\|\|x-y\|\|^2/(2(coeff \cdot radius)^2))`.	`0.5`
`random_state`	`int \| RandomState \| None`	Control the randomization of the algorithm by specifying the codebook initialization. It is ignored when `initialcodebook` is not `None`. If int, `random_state` is the seed used by the random number generator. If `RandomState` instance, `random_state` is the random number generator. If `None`, the random number generator is the `RandomState` instance used by `np.random`.	`None`
`verbose`	`int`	Specify verbosity level (0, 1, or 2).	`0`

Source code in src/clover/clusterer/_som.py

def __init__(
    self: Self,
    n_columns: int = 5,
    n_rows: int = 5,
    initialcodebook: npt.ArrayLike | str | None = None,
    kerneltype: int = 0,
    maptype: str = 'planar',
    gridtype: str = 'rectangular',
    compactsupport: bool = True,
    neighborhood: str = 'gaussian',
    std_coeff: float = 0.5,
    random_state: int | np.random.RandomState | None = None,
    verbose: int = 0,
) -> None:
    self.n_columns = n_columns
    self.n_rows = n_rows
    self.initialcodebook = initialcodebook
    self.kerneltype = kerneltype
    self.maptype = maptype
    self.gridtype = gridtype
    self.compactsupport = compactsupport
    self.neighborhood = neighborhood
    self.std_coeff = std_coeff
    self.random_state = random_state
    self.verbose = verbose

`fit(X, y=None, **fit_params)`

Train the self-organizing map.

Parameters:

Name	Type	Description	Default
`X`	`ArrayLike`	Training instances to cluster.	required
`y`	`ArrayLike \| None`	Ignored	`None`
`fit_params`	`dict[str, Any]`	Parameters to pass to train method of Somoclu object.	`{}`

Returns:

Type	Description
`Self`	The object itself.

Source code in src/clover/clusterer/_som.py

def fit(self: Self, X: npt.ArrayLike, y: npt.ArrayLike | None = None, **fit_params: dict[str, Any]) -> Self:
    """Train the self-organizing map.

    The input is validated as a float32 array and rescaled with
    ``minmax_scale`` before a Somoclu map is created and trained on it.
    Fitting sets the attributes ``random_state_``, ``algorithm_``,
    ``labels_mapping_``, ``labels_`` and ``neighbors_``.

    Args:
        X:
            Training instances to cluster.
        y:
            Ignored
        fit_params:
            Parameters to pass to train method of Somoclu object.

    Returns:
        The object itself.
    """

    # Check and normalize input data
    X_scaled = minmax_scale(check_array(X, dtype=np.float32))

    # Check random_state
    self.random_state_ = check_random_state(self.random_state)

    # Initialize codebook. Somoclu receives either an explicit codebook or
    # the name of an initialization scheme, never both.
    if self.initialcodebook is None:
        if self.random_state is None:
            # No seed requested: let Somoclu do its own random initialization
            initialcodebook = None
            initialization = 'random'
        else:
            # Seeded run: draw the flat codebook ourselves so it is reproducible
            codebook_size = self.n_columns * self.n_rows * X_scaled.shape[1]
            initialcodebook = self.random_state_.random_sample(
                codebook_size,
            ).astype(np.float32)
            initialization = None
    elif isinstance(self.initialcodebook, str) and self.initialcodebook == 'pca':
        # The isinstance guard keeps an array codebook from being compared
        # element-wise with the string, which has no single truth value.
        initialcodebook = None
        initialization = 'pca'
    else:
        # User-supplied codebook is passed through as given
        initialcodebook = self.initialcodebook
        initialization = None

    # Create Somoclu object
    self.algorithm_ = Somoclu(
        n_columns=self.n_columns,
        n_rows=self.n_rows,
        initialcodebook=initialcodebook,
        kerneltype=self.kerneltype,
        maptype=self.maptype,
        gridtype=self.gridtype,
        compactsupport=self.compactsupport,
        neighborhood=self.neighborhood,
        std_coeff=self.std_coeff,
        initialization=initialization,
        data=None,
        verbose=self.verbose,
    )

    # Fit Somoclu
    self.algorithm_.train(data=X_scaled, **fit_params)

    # Grid labels: one hashable tuple per sample, built from its row of ``bmus``
    grid_labels = cast(list[tuple[int, int]], [tuple(grid_label) for grid_label in self.algorithm_.bmus.tolist()])

    # Generate labels mapping
    self.labels_mapping_ = generate_labels_mapping(grid_labels)

    # Generate cluster labels
    self.labels_ = np.array(
        [self.labels_mapping_[grid_label] for grid_label in grid_labels],
    )

    # Generate labels neighbors
    self.neighbors_ = self._generate_neighbors(
        sorted(set(grid_labels)),
        self.labels_mapping_,
    )

    return self

`fit_predict(X, y=None, **fit_params)`

Train the self-organizing map and assign cluster labels to samples.

Parameters:

Name	Type	Description	Default
`X`	`ArrayLike`	Training instances to cluster.	required
`y`	`ArrayLike \| None`	Ignored.	`None`
`fit_params`	`dict[str, Any]`	Parameters to pass to train method of Somoclu object.	`{}`

Returns:

Name	Type	Description
`labels`	`NDArray`	Index of the cluster each sample belongs to.

Source code in src/clover/clusterer/_som.py

def fit_predict(
    self: Self,
    X: npt.ArrayLike,
    y: npt.ArrayLike | None = None,
    **fit_params: dict[str, Any],
) -> npt.NDArray:
    """Train the self-organizing map and assign cluster labels to samples.

    Args:
        X:
            New data to transform.
        y:
            Ignored.
        fit_params:
            Parameters to pass to train method of Somoclu object.

    Returns:
        labels:
            Index of the cluster each sample belongs to.
    """
    return self.fit(X=X, y=None, **fit_params).labels_

`extract_topological_neighbors(col, row, gridtype, n_rows, n_columns, bmus)`

Return the topological neighbors of a neuron.

Source code in src/clover/clusterer/_som.py

def extract_topological_neighbors(
    col: int,
    row: int,
    gridtype: str,
    n_rows: int,
    n_columns: int,
    bmus: list[list[int]],
) -> list[tuple[int, int]]:
    """Return the topological neighbors of a neuron."""

    # Return common topological neighbors for the two grid types
    topological_neighbors = [
        (col - 1, row),
        (col + 1, row),
        (col, row - 1),
        (col, row + 1),
    ]

    # Append extra topological neighbors for hexagonal grid type
    if gridtype == 'hexagonal':
        offset = (-1) ** row
        topological_neighbors += [
            (col - offset, row - offset),
            (col - offset, row + offset),
        ]

    # Apply constraints
    topological_neighbors = [
        (col, row)
        for col, row in topological_neighbors
        if 0 <= col < n_columns and 0 <= row < n_rows and [col, row] in bmus
    ]

    return topological_neighbors

`generate_labels_mapping(grid_labels)`

Generate a mapping between grid labels and cluster labels.

Source code in src/clover/clusterer/_som.py

def generate_labels_mapping(grid_labels: list[tuple[int, int]]) -> dict[tuple[int, int], int]:
    """Map every distinct grid label to a consecutive integer cluster label.

    The distinct grid labels are sorted and numbered from zero in that
    order, so the smallest grid label gets cluster label ``0``.
    """
    return {
        grid_label: cluster_label
        for cluster_label, grid_label in enumerate(sorted(set(grid_labels)))
    }