"""Source code for das.data_hash."""
import hashlib
from pathlib import Path
from typing import Union
def hash_data(data_path: Union[str, Path], chunk_size: int = 65536) -> str:
    """Compute the MD5 hash of ``data_path`` (directory or file) for data versioning.

    Files are hashed over their contents; directories are hashed recursively
    over entry names and file contents in a deterministic order.

    Args:
        data_path (Union[str, Path]): Path to the file or directory to hash.
        chunk_size (int, optional): Number of bytes read per chunk when
            hashing file contents. Defaults to 65536.

    Raises:
        ValueError: If ``data_path`` is neither a directory nor a file
            (e.g. it does not exist).

    Returns:
        str: hexadecimal string containing the hash code
    """
    # Build the Path once instead of re-wrapping for each check; `digest`
    # avoids shadowing the builtin `hash`.
    path = Path(data_path)
    if path.is_dir():
        digest = _hash_dir(path, chunk_size)
    elif path.is_file():
        digest = _hash_file(path, chunk_size)
    else:
        raise ValueError(f"{data_path} is neither directory nor file.")
    return digest.hexdigest()
def _update_hash_dir(directory: Union[str, Path], hash, chunk_size: int):
# from https://stackoverflow.com/questions/24937495/how-can-i-calculate-a-hash-for-a-filesystem-directory-using-python/54477583#54477583
assert Path(directory).is_dir()
for path in sorted(Path(directory).iterdir(), key=lambda p: str(p).lower()):
hash.update(path.name.encode())
if path.is_file():
with open(path, "rb") as f:
for chunk in iter(lambda: f.read(chunk_size), b""):
hash.update(chunk)
elif path.is_dir():
hash = _update_hash_dir(path, hash, chunk_size)
return hash
def _hash_dir(directory: Union[str, Path], chunk_size: int):
    """Return an MD5 hash object covering all entries under ``directory``."""
    digest = hashlib.md5()
    return _update_hash_dir(directory, digest, chunk_size)
def _hash_file(data_file: Union[str, Path], chunk_size: int):
hash = hashlib.md5()
with open(data_file, "rb") as f:
for chunk in iter(lambda: f.read(chunk_size), b""):
hash.update(chunk)
return hash