# Source code for das.annot

"""Utilities for dealing with annotations."""

import numpy as np
import xarray as xr
import pandas as pd
from collections import UserDict
from typing import Optional, List, Dict, Any, Union


class Events(UserDict):
    """Dict-like container mapping annotation names to arrays of times.

    Every value is a float array of shape [N, 2] holding start and stop
    seconds, one row per annotation. By convention, *events* have identical
    start and stop times while *segments* do not. The category of every name
    ("event", "segment", or None if it cannot be inferred) is kept in the
    `categories` attribute, a plain dict keyed by name.
    """

    def __init__(
        self,
        data: Optional[Dict[str, List[float]]] = None,
        categories: Optional[Dict[str, str]] = None,
        add_names_from_categories: bool = True,
    ):
        """Initialize from a mapping of names to times.

        Prefer the alternate constructors `from_df`, `from_lists`,
        `from_dataset` or `from_predict`.

        Args:
            data (Optional[Dict[str, List[float]]], optional): Maps names to times.
                Each value is either a 1D sequence of times (start == stop)
                or an [N, 2] sequence of (start, stop) pairs.
                If `data` has a `categories` attribute (e.g. another `Events`),
                those categories are copied for names present in `data`.
                Defaults to None (empty).
            categories (Optional[Dict[str, str]], optional): Maps names to categories.
                Overrides inferred and copied categories. Defaults to None.
            add_names_from_categories (bool, optional): If True, names that appear in
                `categories` but not in `data` are added without times.
                Defaults to True.
        """
        if data is None:
            data = dict()

        super().__init__(data)

        for key in list(self.data):
            self.data[key] = self._as_times_array(self.data[key])

        # Categories must be inferred BEFORE dropping nan rows: `to_df` encodes
        # names without annotations as placeholder rows with nan start times,
        # and the category is read off these placeholder rows.
        self.categories = self._infer_categories()
        self._drop_nan()

        # preserve categories from the input
        if hasattr(data, "categories"):
            self._update_categories(data.categories, add_missing=False)

        # categories passed explicitly take precedence
        if categories is not None:
            self._update_categories(categories, add_missing=add_names_from_categories)

    @staticmethod
    def _as_times_array(times) -> np.ndarray:
        """Return a copy of `times` as a float array of shape [N, 2] (start, stop)."""
        times = np.atleast_1d(np.array(times, dtype=float))
        if times.ndim == 1:
            times = times[:, np.newaxis]
        if times.shape[1] == 1:  # only one time per row -> start == stop
            times = np.concatenate((times, times), axis=1)
        return times

    def _update_categories(self, categories, add_missing: bool):
        """Set categories of existing names, optionally adding unknown names without times."""
        for name, cat in categories.items():
            if name in self:
                self.categories[name] = cat
            elif add_missing:
                self.add_name(name=name, category=cat)

    @classmethod
    def from_df(cls, df, possible_event_names: Optional[List] = None):
        """Create Events from a DataFrame with one row per annotation.

        Args:
            df (pd.DataFrame): with columns `name`, `start_seconds`, `stop_seconds`.
            possible_event_names (list, optional): Names to include even if they
                have no rows in `df`. Defaults to None (no extra names).

        Returns:
            Events: one key per name in `df` and in `possible_event_names`.
        """
        if possible_event_names is None:
            possible_event_names = []

        return cls.from_lists(
            df["name"].values,
            df["start_seconds"].values.astype(float),
            df["stop_seconds"].values.astype(float),
            possible_event_names,
        )

    @classmethod
    def from_lists(cls, names, start_seconds, stop_seconds, possible_event_names: Optional[List] = None):
        """Create Events from parallel sequences of names, starts and stops.

        Names are kept in order of first appearance, followed by any additional
        names from `possible_event_names`.

        Args:
            names: Name of each annotation.
            start_seconds: Start of each annotation, same length as `names`.
            stop_seconds: Stop of each annotation, same length as `names`.
            possible_event_names (list, optional): Names to include even if they
                do not occur in `names`. Defaults to None (no extra names).

        Returns:
            Events: one key per unique name.
        """
        if possible_event_names is None:
            possible_event_names = []

        names = list(names)
        # a dict (instead of a set) keeps the order of names deterministic
        dct = {name: [] for name in names}
        for name in possible_event_names:
            dct.setdefault(name, [])

        for name, start_second, stop_second in zip(names, start_seconds, stop_seconds):
            dct[name].append([start_second, stop_second])

        return cls(dct)

    @classmethod
    def from_dataset(cls, ds):
        """Create Events from a dataset as produced by `to_dataset`.

        Args:
            ds: Dataset with the data arrays `event_names` and `event_times`
                (coordinate `event_time` with the entries "start_seconds" and
                "stop_seconds"). `possible_event_names` is read from `ds.attrs`
                or, failing that, from `ds.event_names.attrs`. If `ds` contains
                `event_categories`, categories are taken from there.

        Returns:
            Events: the annotations stored in `ds`.
        """
        start_seconds = np.array(ds.event_times.sel(event_time="start_seconds").data)
        stop_seconds = np.array(ds.event_times.sel(event_time="stop_seconds").data)
        names = np.array(ds.event_names.data)
        if "possible_event_names" in ds.attrs:
            possible_event_names = ds.attrs["possible_event_names"]
        elif "possible_event_names" in ds.event_names.attrs:
            possible_event_names = ds.event_names.attrs["possible_event_names"]
        else:
            possible_event_names = []

        out = cls.from_lists(names, start_seconds, stop_seconds, possible_event_names)
        if "event_categories" in ds:
            cats = {str(cat.event_types.data): str(cat.event_categories.data) for cat in ds.event_categories}
            out = cls(out, categories=cats)
        return out

    @staticmethod
    def _has_sequence(prediction) -> bool:
        """Check that `prediction` is non-empty and has a non-empty "sequence" and "names"."""
        return bool(
            prediction is not None
            and len(prediction)
            and "sequence" in prediction
            and len(prediction["sequence"])
            and "names" in prediction
        )

    @staticmethod
    def _sequence_names(prediction) -> list:
        """Return the "sequence" of `prediction` as names, mapping integer entries via "names"."""
        sequence = prediction["sequence"]
        if isinstance(sequence[0], str):  # also covers np.str_, which subclasses str
            return list(sequence)
        return [prediction["names"][ii] for ii in sequence]  # from ints to names

    @classmethod
    def from_predict(cls, events=None, segments=None):
        """Create Events from prediction results.

        Args:
            events (dict, optional): with keys "sequence" (names, or indices into
                "names"), "names" and "seconds". Defaults to None.
            segments (dict, optional): with keys "sequence" (names, or indices into
                "names"), "names", "onsets_seconds" and "offsets_seconds".
                Defaults to None.

        Returns:
            Events: with one key per entry in the "names" of `events` and `segments`.
        """
        names = []
        start_seconds = []
        stop_seconds = []
        possible_names = []

        if cls._has_sequence(segments):
            names.extend(cls._sequence_names(segments))
            start_seconds.extend(segments["onsets_seconds"])
            stop_seconds.extend(segments["offsets_seconds"])
            possible_names.extend(segments["names"])

        if cls._has_sequence(events):
            names.extend(cls._sequence_names(events))
            start_seconds.extend(events["seconds"])
            stop_seconds.extend(events["seconds"])  # events have start == stop
            possible_names.extend(events["names"])

        # only keep unique names, in order of first appearance
        possible_names = list(dict.fromkeys(possible_names))

        return cls.from_lists(names, start_seconds, stop_seconds, possible_names)

    def _init_df(self) -> pd.DataFrame:
        """Return an empty DataFrame with the columns used by `to_df`."""
        return pd.DataFrame(columns=["name", "start_seconds", "stop_seconds"])

    def _append_row(self, df, name, start_seconds, stop_seconds=None) -> pd.DataFrame:
        """Return a copy of `df` with one row appended (stop defaults to start)."""
        if stop_seconds is None:
            stop_seconds = start_seconds
        new_row = pd.DataFrame([[name, start_seconds, stop_seconds]], columns=df.columns)
        return pd.concat((df, new_row), ignore_index=True)

    def to_df(self, preserve_empty: bool = True) -> pd.DataFrame:
        """Convert to pandas.DataFrame

        Args:
            preserve_empty (bool, optional):
                If True, names without annotations are kept as placeholder rows.
                In keeping with the convention that events have identical start and stop times
                and segments do not, empty events are coded with np.nan as both start and stop
                and all other empty names (segments, unknown category) are coded with np.nan
                as start and 0 as stop.
                `from_df()` obeys this convention - if both start and stop are np.nan,
                the name will be an event, if only the start is np.nan, the name will be a segment.
                Defaults to True.

        Returns:
            pandas.DataFrame: with columns `name`, `start_seconds`, `stop_seconds`, one row per event.
        """
        names = []
        start_seconds = []
        stop_seconds = []
        for name in self.names:
            times = self[name]
            names.extend([name] * len(times))
            start_seconds.extend(times[:, 0].tolist())
            stop_seconds.extend(times[:, 1].tolist())

        if preserve_empty:  # ensure we keep names without annotations
            annotated_names = set(names)
            for name in self.names:
                if name not in annotated_names:
                    # (np.nan, np.nan) -> empty events, (np.nan, some number) -> empty segments
                    # The category is looked up by name, so this does not depend on the
                    # order of `self.categories`.
                    is_event = self.categories.get(name) == "event"
                    names.append(name)
                    start_seconds.append(np.nan)
                    stop_seconds.append(np.nan if is_event else 0)

        # build the frame once; start and stop seconds are always numeric
        return pd.DataFrame(
            {
                "name": pd.Series(names, dtype=object),
                "start_seconds": np.asarray(start_seconds, dtype=float),
                "stop_seconds": np.asarray(stop_seconds, dtype=float),
            }
        )

    def to_dataset(self) -> "xr.Dataset":
        """Returns an xarray dataset.

        Returns:
            xr.Dataset: with the data arrays `event_names` and `event_times`.
        """
        df = self.to_df()
        names = df["name"].values
        start_seconds = df["start_seconds"].values.astype(float)
        stop_seconds = df["stop_seconds"].values.astype(float)

        da_names = xr.DataArray(
            name="event_names",
            data=np.array(names, dtype="U128"),
            dims=[
                "index",
            ],
        )
        da_times = xr.DataArray(
            name="event_times",
            data=np.array([start_seconds, stop_seconds]).T,
            dims=["index", "event_time"],
            coords={"event_time": ["start_seconds", "stop_seconds"]},
        )

        ds = xr.Dataset({da.name: da for da in [da_names, da_times]})
        ds.attrs["time_units"] = "seconds"
        ds.attrs["possible_event_names"] = self.names  # ensure that we preserve even names w/o events that get lost in to_df
        return ds

    def add_name(
        self,
        name,
        category="segment",
        times=None,
        overwrite: bool = False,
        append: bool = False,
        sort_after_append: bool = False,
    ):
        """Add a new name, or overwrite / extend the times of an existing name.

        Does nothing if `name` exists and neither `overwrite` nor `append` is set.

        Args:
            name: Name to add.
            category (str, optional): Category of the name. Only set if the name is
                new or overwritten. Defaults to 'segment'.
            times (optional): 1D sequence of times or [N, 2] sequence of
                (start, stop) pairs. Defaults to None (no times).
            overwrite (bool, optional): Replace times and category if `name` exists.
                Defaults to False.
            append (bool, optional): Append `times` to the existing times if `name`
                exists and `overwrite` is False. Defaults to False.
            sort_after_append (bool, optional): Sort rows by start time after appending.
                Defaults to False.
        """
        if times is None:
            times = np.zeros((0, 2))
        times = self._as_times_array(times)

        if name not in self or overwrite:
            self[name] = times
            self.categories[name] = category
        elif append:
            merged = np.append(self[name], times, axis=0)
            if sort_after_append:
                # Sort whole rows by start time. Sorting the columns independently
                # would break the pairing of start and stop times.
                merged = merged[np.argsort(merged[:, 0], kind="stable")]
            self[name] = merged

    def delete_name(self, name):
        """Remove a name together with its times and category. Unknown names are ignored.

        Args:
            name: Name to delete.
        """
        if name in self:
            del self[name]
        if name in self.categories:
            del self.categories[name]

    def add_time(self, name, start_seconds, stop_seconds=None):
        """Append a single annotation to an existing name.

        Args:
            name: Existing name to add the annotation to.
            start_seconds: Start of the annotation.
            stop_seconds (optional): Stop of the annotation. Defaults to None
                (same as `start_seconds`). Start and stop are swapped if needed
                so that start <= stop.

        Raises:
            ValueError: if `name` has not been added yet.
        """
        if name not in self:
            raise ValueError(f"Song type {name} not added yet. Add via 'self.add_name(name, category)'")

        if stop_seconds is None:
            stop_seconds = start_seconds

        new_row = sorted([start_seconds, stop_seconds])
        self[name] = np.insert(self[name], len(self[name]), new_row, axis=0)

    def move_time(self, name, old_time, new_time):
        """Replace every start or stop time of `name` that equals `old_time`.

        Args:
            name: Name whose times are modified in place.
            old_time: Time to replace (exact match).
            new_time: Replacement time.
        """
        times = self[name]
        times[times == old_time] = new_time

    def delete_time(self, name, time, tol=0):
        """Delete the annotation of `name` whose start is nearest to `time`.

        The annotation is only deleted if it is "at" `time`:
        segments need to stop after `time`,
        events need to be within `tol` of `time` (inclusive).
        Names with any other category are never deleted from.

        Args:
            name: Name to delete from.
            time: Time in seconds.
            tol (optional): Maximal distance between `time` and an event. Defaults to 0.

        Returns:
            The deleted (start, stop) pair or None if nothing was deleted.
        """
        starts = self.start_seconds(name)
        if len(starts) == 0:  # nothing to delete
            return None

        index = int(np.abs(starts - time).argmin())  # first annotation with the nearest start
        category = self.categories.get(name)

        if category == "segment":
            event_at_time = self.stop_seconds(name)[index] > time
        elif category == "event":
            # inclusive bound, so that an exact match is deleted with the default tol=0
            event_at_time = np.abs(time - starts[index]) <= tol
        else:
            event_at_time = False

        if not event_at_time:
            return None

        deleted_time = self[name][index, :].copy()
        self[name] = np.delete(self[name], index, axis=0)
        return deleted_time

    def select_range(self, name: str, t0: Optional[float] = None, t1: Optional[float] = None, strict: bool = True):
        """Get indices of events within the range.

        Need to start and stop after t0 and before t1 (non-inclusive bounds).

        Args:
            name (str): Name to select from.
            t0 (float, optional): Lower bound in seconds. Defaults to None (0).
            t1 (float, optional): Upper bound in seconds. Defaults to None (np.inf).
            strict (bool, optional): if true, only matches events that start AND stop within the range,
                           if false, matches events that start OR stop within the range

        Returns:
            np.ndarray: Indices of events within the range
        """
        if t0 is None:
            t0 = 0
        if t1 is None:
            t1 = np.inf

        starts = self.start_seconds(name)
        stops = self.stop_seconds(name)

        if strict:
            within_range = np.logical_and(starts > t0, stops < t1)
        else:
            starts_in_range = np.logical_and(starts > t0, starts < t1)
            stops_in_range = np.logical_and(stops > t0, stops < t1)
            within_range = np.logical_or(starts_in_range, stops_in_range)
        return np.where(within_range)[0]

    def filter_range(self, name, t0, t1, strict: bool = False):
        """Returns events within the range.

        Need to start and stop after t0 and before t1 (non-inclusive bounds).

        Args:
            name: Name to select from.
            t0: Lower bound in seconds (None: 0).
            t1: Upper bound in seconds (None: np.inf).
            strict (bool): if true, only matches events that start AND stop within the range,
                           if false, matches events that start OR stop within the range
        Returns:
            np.ndarray: [N, 2] array of start_seconds and stop_seconds in the range
        """
        indices = self.select_range(name, t0, t1, strict)
        return self[name][indices, :]

    def delete_range(self, name, t0, t1, strict: bool = True):
        """Deletes events within the range.

        Need to start and stop after t0 and before t1 (non-inclusive bounds).

        Args:
            name: Name to delete from.
            t0: Lower bound in seconds (None: 0).
            t1: Upper bound in seconds (None: np.inf).
            strict (bool): if true, only matches events that start AND stop within the range,
                           if false, matches events that start OR stop within the range
        Returns:
            int: number of deleted events
        """
        indices = self.select_range(name, t0, t1, strict)
        self[name] = np.delete(self[name], indices, axis=0)
        return len(indices)

    def _find_nearest(self, array, value):
        """Return the entry of `array` closest to `value` (first one on ties)."""
        array = np.asarray(array)
        idx = (np.abs(array - value)).argmin()
        return array[idx]

    def _infer_categories(self):
        """Infer the category of each name from its first annotation.

        Names are "event" if the first start and stop are identical or both nan,
        and "segment" otherwise. Names without annotations keep their current
        category if there is one and are set to None otherwise.

        Returns:
            dict: maps each name to "event", "segment" or None.
        """
        known_categories = getattr(self, "categories", {})
        categories = dict()
        for name in self.names:
            if len(self[name]) == 0:
                categories[name] = known_categories.get(name)
                continue

            first_start = self.start_seconds(name)[0]
            first_stop = self.stop_seconds(name)[0]
            both_nan = np.isnan(first_start) and np.isnan(first_stop)
            categories[name] = "event" if both_nan or first_start == first_stop else "segment"

        return categories

    def _drop_nan(self):
        """Remove annotations with nan start or stop (but keep their name)."""
        for name in self.names:
            times = self[name]
            nan_events = np.logical_or(np.isnan(times[:, 0]), np.isnan(times[:, 1]))
            self[name] = times[~nan_events]

    @property
    def names(self):
        """List of all names, including those without annotations."""
        return list(self.keys())

    def start_seconds(self, key):
        """Start times of all annotations of `key`."""
        return self[key][:, 0]

    def stop_seconds(self, key):
        """Stop times of all annotations of `key`."""
        return self[key][:, 1]

    def duration_seconds(self, key):
        """Durations (stop - start) of all annotations of `key`."""
        return self[key][:, 1] - self[key][:, 0]