Skip to content

masking

Mask

Class to define masks with conditions and weights to apply to DataFiles

Parameters:

Name Type Description Default
where MaskCondition | list[MaskCondition]

Where the mask should be applied

None
use MaskCondition | list[MaskCondition]

Condition on where to use the masks

None
weight None | float | str | list[float | str]

Weights to apply

None
other float
nan
comment str
Comment
''
Source code in python/posted/masking.py
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
class Mask:
    '''Class to define masks with conditions and weights to apply to DataFiles

    Parameters
    ----------
    where: MaskCondition | list[MaskCondition], optional
        Where the mask should be applied
    use:  MaskCondition | list[MaskCondition], optional
        Condition on where to use the masks
    weight: None | float | str | list[float | str], optional
        Weights to apply
    other: float, optional

    comment: str, optional
            Comment
    '''
    def __init__(self,
                 where: MaskCondition | list[MaskCondition] = None,
                 use: MaskCondition | list[MaskCondition] = None,
                 weight: None | float | str | list[float | str] = None,
                 other: float = np.nan,
                 comment: str = ''):
        '''set fields from constructor arguments, perform consistency checks on fields,
        set default weight to 1 if not set otherwise'''
        self._where: list[MaskCondition] = [] if where is None else where if isinstance(where, list) else [where]
        self._use: list[MaskCondition] = [] if use is None else use if isinstance(use, list) else [use]
        self._weight: list[float] = (
            None
            if weight is None else
            [float(w) for w in weight]
            if isinstance(weight, list) else
            [float(weight)]
        )
        self._other: float = other
        self._comment: str = comment

        # perform consistency checks on fields
        if self._use and self._weight and len(self._use) != len(self._weight):
            raise Exception(f"Must provide same length of 'use' conditions as 'weight' values.")

        # set default weight to 1 if not set otherwise
        if not self._weight:
            self._weight = len(self._use) * [1.0]


    def matches(self, df: pd.DataFrame):
        '''Check if a mask matches a dataframe (all 'where' conditions match across all rows)

        Parameters
        ----------
        df: pd.Dataframe
            Dataframe to check for matches
        Returns
        -------
            bool
                If the mask matches the dataframe'''
        for w in self._where:
            if not apply_cond(df, w).all():
                return False
        return True


    def get_weights(self, df: pd.DataFrame):
        '''Apply weights to the dataframe

        Parameters
        ----------
        df: pd.Dataframe
            Dataframe to apply weights on

        Returns
        -------
            pd.DataFrame
                Dataframe with applied weights'''
        ret = pd.Series(index=df.index, data=np.nan)

        # apply weights where the use condition matches
        for u, w in zip(self._use, self._weight):
            ret.loc[apply_cond(df, u)] = w

        return ret

__init__(where=None, use=None, weight=None, other=np.nan, comment='')

Set fields from constructor arguments, perform consistency checks on fields, and set the default weight to 1 if not set otherwise.

Source code in python/posted/masking.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def __init__(self,
             where: MaskCondition | list[MaskCondition] = None,
             use: MaskCondition | list[MaskCondition] = None,
             weight: None | float | str | list[float | str] = None,
             other: float = np.nan,
             comment: str = ''):
    '''set fields from constructor arguments, perform consistency checks on fields,
    set default weight to 1 if not set otherwise'''
    self._where: list[MaskCondition] = [] if where is None else where if isinstance(where, list) else [where]
    self._use: list[MaskCondition] = [] if use is None else use if isinstance(use, list) else [use]
    self._weight: list[float] = (
        None
        if weight is None else
        [float(w) for w in weight]
        if isinstance(weight, list) else
        [float(weight)]
    )
    self._other: float = other
    self._comment: str = comment

    # perform consistency checks on fields
    if self._use and self._weight and len(self._use) != len(self._weight):
        raise Exception(f"Must provide same length of 'use' conditions as 'weight' values.")

    # set default weight to 1 if not set otherwise
    if not self._weight:
        self._weight = len(self._use) * [1.0]

get_weights(df)

Apply weights to the dataframe

Parameters:

Name Type Description Default
df DataFrame

Dataframe to apply weights on

required

Returns:

Type Description
pd.Series

Series of weights indexed like the dataframe; rows matching no 'use' condition are NaN

Source code in python/posted/masking.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def get_weights(self, df: pd.DataFrame):
    '''Build the weights this mask assigns to the rows of a dataframe

    Parameters
    ----------
    df: pd.Dataframe
        Dataframe to compute weights for

    Returns
    -------
        pd.Series
            Series indexed like `df`; rows selected by a 'use' condition
            carry that condition's weight, all other rows are NaN'''
    # start with every row unweighted (NaN)
    weights = pd.Series(np.nan, index=df.index)

    # 'use' conditions and weights are paired positionally; pairs are applied
    # in order, so a later pair overwrites an earlier one on shared rows
    for condition, value in zip(self._use, self._weight):
        selected = apply_cond(df, condition)
        weights.loc[selected] = value

    return weights

matches(df)

Check if a mask matches a dataframe (all 'where' conditions match across all rows)

Parameters:

Name Type Description Default
df DataFrame

Dataframe to check for matches

required

Returns:

Type Description
bool

If the mask matches the dataframe

Source code in python/posted/masking.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
def matches(self, df: pd.DataFrame):
    '''Tell whether this mask matches a dataframe, i.e. whether every
    'where' condition holds on every row

    Parameters
    ----------
    df: pd.Dataframe
        Dataframe to test against the 'where' conditions

    Returns
    -------
        bool
            True if all 'where' conditions hold on all rows (also True when
            there are no 'where' conditions), False otherwise'''
    # all() stops at the first failing condition, like an early return
    return all(apply_cond(df, condition).all() for condition in self._where)

apply_cond(df, cond)

Takes a pandas DataFrame and a condition, which can be a string, dictionary, or callable, and applies the condition to the DataFrame using eval or apply accordingly.

Parameters:

Name Type Description Default
df DataFrame

A pandas DataFrame containing the data on which the condition will be applied.

required
cond MaskCondition

The condition to be applied on the dataframe. Can be either a string, a dictionary, or a callable function.

required

Returns:

Type Description
pd.DataFrame

Dataframe evaluated at the mask condition

Source code in python/posted/masking.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
def apply_cond(df: pd.DataFrame, cond: MaskCondition):
    '''Evaluate a mask condition on a pandas DataFrame. The condition can be
    a string, a dictionary, or a callable and is applied using `eval` or
    `apply` accordingly.

    Parameters
    ----------
    df : pd.DataFrame
        A pandas DataFrame containing the data on which the condition will be applied.
    cond : MaskCondition
        The condition to be applied on the dataframe:

        * str: passed to `DataFrame.eval` as is
        * dict: every ``key: val`` entry becomes ``key=='val'`` and the
          entries are joined with ``&`` before being passed to
          `DataFrame.eval`; an empty dict matches every row
        * callable: passed to `DataFrame.apply`

    Returns
    -------
        pd.Series | pd.DataFrame
            Result of `DataFrame.eval` (string and dictionary conditions) or
            of `DataFrame.apply` (callable conditions)

    Raises
    ------
    TypeError
        If `cond` is neither a string, a dictionary, nor a callable

    '''
    # NOTE: string and dictionary conditions are evaluated as expressions by
    # DataFrame.eval, so they must come from trusted sources only.
    if isinstance(cond, str):
        return df.eval(cond)

    if isinstance(cond, dict):
        if not cond:
            # an empty conjunction holds for every row; DataFrame.eval would
            # reject the empty expression string
            return pd.Series(True, index=df.index)
        expr = ' & '.join([f"{key}=='{val}'" for key, val in cond.items()])
        return df.eval(expr)

    if callable(cond):
        # NOTE(review): DataFrame.apply defaults to axis=0, so `cond` is
        # called once per column — confirm this is intended rather than a
        # row-wise application.
        return df.apply(cond)

    # previously fell through and returned None, which only failed later
    # (and obscurely) in the caller
    raise TypeError(
        f"Unsupported mask condition of type {type(cond).__name__}; "
        "expected str, dict, or callable."
    )

read_masks(variable)

Reads YAML files containing mask specifications from multiple databases and returns a list of Mask objects.

Parameters:

Name Type Description Default
variable str

Variable to be read

required

Returns:

Type Description
list

List with masks for the variable

Source code in python/posted/masking.py
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
def read_masks(variable: str):
    '''Collect the masks defined for a variable across all databases. Each
    database may provide a YAML file of mask specifications for the variable;
    every specification found is turned into a Mask object.

    Parameters
    ----------
    variable : str
        Variable to be read; '|' separators in the name become directory
        separators in the file path

    Returns
    -------
        list
            List with masks for the variable

    '''
    masks: list[Mask] = []

    for database_id in databases:
        # 'a|b' is looked up at <database>/masks/a/b.yml
        mask_file = databases[database_id] / 'masks' / (variable.replace('|', '/') + '.yml')

        # databases without a mask file for this variable contribute nothing
        if not mask_file.exists():
            continue
        if not mask_file.is_file():
            raise Exception(f"Expected YAML file, but not a file: {mask_file}")

        # each entry of the YAML file holds the keyword arguments of one Mask
        masks.extend(Mask(**mask_specs) for mask_specs in read_yml_file(mask_file))

    return masks