Skip to content

Hashers

Module containing all Hashers

Hasher #

Bases: ABC

Abstract base class from which all Hashers inherit

Source code in scrat/hasher/base.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
class Hasher(ABC):
    """Abstract base class from which all Hashers inherit.

    Subclasses implement :meth:`hash` to turn a value into a hash-string.
    :meth:`md5_hash` is a shared helper they can use to build that string.
    """

    @abstractmethod
    def hash(self, value: T.Any) -> str:
        """Calculate the hash-string corresponding to a value

        Parameters
        ----------
        value
            The argument value

        Returns
        -------
            The hash-string

        Raises
        ------
        NotImplementedError
            Always, in this base implementation; subclasses must override it.
        """
        # ``NotImplemented`` is the sentinel reserved for binary operators and
        # is not a ``str``; raising makes an accidental ``super().hash(...)``
        # call fail right here instead of somewhere downstream.
        raise NotImplementedError(
            f"{type(self).__name__} must implement hash()"
        )

    @classmethod
    def md5_hash(cls, *args) -> str:
        """
        Generate the hash for strings and bytes using md5

        Every argument is fed, in order, into a single md5 digest. ``str``
        arguments are encoded with ``str.encode()`` first; anything else is
        handed to ``hashlib`` unchanged.

        Because the arguments are simply concatenated, ``("ab", "c")`` and
        ``("a", "bc")`` produce the same digest.

        Returns
        -------
            the resulting hexdigest
        """
        digest = hashlib.md5()
        for chunk in args:
            if isinstance(chunk, str):
                chunk = chunk.encode()
            digest.update(chunk)
        return digest.hexdigest()

hash(value) abstractmethod #

Calculate the hash-string corresponding to a value

Parameters:

Name Type Description Default
value Any

The argument value

required

Returns:

Type Description
The hash-string
Source code in scrat/hasher/base.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
@abstractmethod
def hash(self, value: T.Any) -> str:
    """Calculate the hash-string corresponding to a value

    Parameters
    ----------
    value
        The argument value

    Returns
    -------
        The hash-string

    Raises
    ------
    NotImplementedError
        Always, in this base implementation; subclasses must override it.
    """
    # ``NotImplemented`` is the sentinel reserved for binary operators and is
    # not a ``str``; raising makes an accidental call to the base
    # implementation fail right here instead of somewhere downstream.
    raise NotImplementedError(f"{type(self).__name__} must implement hash()")

md5_hash(*args) classmethod #

Generate the hash for strings and bytes using md5

Returns:

Type Description
the resulting hexdigest
Source code in scrat/hasher/base.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
@classmethod
def md5_hash(cls, *args) -> str:
    """
    Build one md5 digest out of all the given strings and bytes

    Arguments are consumed in order; ``str`` arguments are encoded with
    ``str.encode()`` before being added to the digest, everything else is
    passed to the digest as-is.

    Returns
    -------
        the resulting hexdigest
    """
    digest = hashlib.md5()
    for chunk in args:
        payload = chunk.encode() if isinstance(chunk, str) else chunk
        digest.update(payload)
    return digest.hexdigest()

IterableHasher #

Bases: Hasher

Apply one Hasher to each element of an iterable

Parameters:

Name Type Description Default
item_hasher Hasher

A Hasher to hash each value in the iterable

required

Examples:

>>> import scrat as sc
>>> import numpy as np
>>> hasher = sc.IterableHasher(sc.NumpyHasher())
>>> hasher.hash([np.zeros(5), np.ones(3)])
'f86f4d4c12a426ce5d54d715723584be'
Source code in scrat/hasher/iterable.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
class IterableHasher(Hasher):
    """
    Hash an iterable by hashing each of its elements with one Hasher

    The per-element hash-strings are combined, in iteration order, into a
    single md5 hexdigest.

    Parameters
    ----------
    item_hasher
        A Hasher to hash each value in the iterable

    Examples
    --------
    >>> import scrat as sc
    >>> import numpy as np
    >>> hasher = sc.IterableHasher(sc.NumpyHasher())
    >>> hasher.hash([np.zeros(5), np.ones(3)])
    'f86f4d4c12a426ce5d54d715723584be'
    """

    def __init__(self, item_hasher: Hasher) -> None:
        super().__init__()
        # Hasher applied to every element of the iterable.
        self.item_hasher = item_hasher

    def hash(self, value: T.Iterable) -> str:
        """Return the md5 hexdigest of the hashes of all elements of ``value``."""
        hash_item = self.item_hasher.hash
        item_hashes = (hash_item(item) for item in value)
        return self.md5_hash(*item_hashes)

NumpyHasher #

Bases: Hasher

Hasher for numpy arrays

Source code in scrat/hasher/numpy.py
 6
 7
 8
 9
10
class NumpyHasher(Hasher):
    """Hasher for numpy arrays

    The hash is the md5 hexdigest of the array's raw data buffer.
    """

    def hash(self, value: T.Any) -> str:
        """Calculate the hash-string of a numpy array

        Parameters
        ----------
        value
            The array to hash

        Returns
        -------
            The md5 hexdigest of the array's data in C (row-major) order
        """
        import numpy as np

        # hashlib only accepts C-contiguous buffers, so transposed or strided
        # views would raise. ``ascontiguousarray`` returns already-contiguous
        # arrays unchanged, so their hashes are not affected.
        return self.md5_hash(np.ascontiguousarray(value))

PandasHasher #

Bases: Hasher

Hasher for Pandas Series and DataFrames

Parameters:

Name Type Description Default
use_values bool

If False, only the index of the dataframe is included in the hash. This can speed up hashing of big dataframes when you only care about which rows are included and you know the values don't change. By default True.

True
Source code in scrat/hasher/pandas.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
class PandasHasher(Hasher):
    """
    Hasher for Pandas Series and DataFrames

    Parameters
    ----------
    use_values
        If False, only the index of the dataframe is included in the hash.
        This can help with the speed of the hasher on big dataframes where
        you only care what rows are included but you know the values
        don't change, by default True
    """

    def __init__(self, use_values: bool = True) -> None:
        super().__init__()
        # Whether the data values take part in the hash, or only the index.
        self.use_values = use_values

    def hash(self, value: T.Any) -> str:
        """Calculate the hash-string of a Series or DataFrame

        Parameters
        ----------
        value
            The object to hash; its ``index.values`` and ``values``
            attributes are read

        Returns
        -------
            The md5 hexdigest of the index and, when ``use_values`` is True,
            of the values as well
        """
        import numpy as np

        # hashlib only accepts C-contiguous buffers, and ``.values`` of a
        # multi-column DataFrame is not guaranteed to be one.
        # ``ascontiguousarray`` returns already-contiguous arrays unchanged,
        # so hashes that could be computed before stay the same.
        index_buffer = np.ascontiguousarray(value.index.values)
        if not self.use_values:
            return self.md5_hash(index_buffer)
        return self.md5_hash(index_buffer, np.ascontiguousarray(value.values))

ToStringHasher #

Bases: Hasher

Naive hasher that tries to convert the value to str and then hash it

Source code in scrat/hasher/to_string.py
 6
 7
 8
 9
10
class ToStringHasher(Hasher):
    """Naive hasher that hashes the ``str()`` representation of the value"""

    def hash(self, value: T.Any) -> str:
        """Return the md5 hexdigest of ``str(value)``."""
        as_text = str(value)
        return self.md5_hash(as_text)