iden.shard
iden.shard ¶
Contain shard implementations.
iden.shard.BaseShard ¶
Bases: ABC, Generic[T]
Define the base class to implement a shard.
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.io import save_json
>>> from iden.shard import JsonShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("uri/0001").as_uri()
... file = Path(tmpdir).joinpath("data.json")
... save_json([1, 2, 3], file)
... shard = JsonShard(uri=uri, path=file)
... shard.get_data()
...
[1, 2, 3]
iden.shard.BaseShard.clear
abstractmethod
¶
clear() -> None
Clear the current shard cache, i.e. remove the data from memory if possible.
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.io import save_json
>>> from iden.shard import JsonShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("uri/0001").as_uri()
... file = Path(tmpdir).joinpath("data.json")
... save_json([1, 2, 3], file)
... shard = JsonShard(uri=uri, path=file)
... data = shard.get_data(cache=True)
... data
... data.append(4) # in-place modification
... data = shard.get_data()
... data
... shard.clear()
... data = shard.get_data()
... data
...
[1, 2, 3]
[1, 2, 3, 4]
[1, 2, 3]
iden.shard.BaseShard.equal
abstractmethod
¶
equal(other: Any, equal_nan: bool = False) -> bool
Indicate if two shards are equal or not.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
other
|
Any
|
The object to compare with. |
required |
equal_nan
|
bool
|
If `True`, then two `NaN`s are considered equal. |
False
|
Returns:
| Type | Description |
|---|---|
bool
|
`True` if the two shards are equal, otherwise `False`. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import JsonShard, create_json_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri1 = Path(tmpdir).joinpath("my_uri1").as_uri()
... uri2 = Path(tmpdir).joinpath("my_uri2").as_uri()
... shard1 = create_json_shard([1, 2, 3], uri=uri1)
... shard2 = create_json_shard([4, 5, 6], uri=uri2)
... shard3 = JsonShard.from_uri(uri=uri1)
... shard1.equal(shard2)
... shard1.equal(shard3)
...
False
True
iden.shard.BaseShard.get_data
abstractmethod
¶
get_data(cache: bool = False) -> T
Get the data in the shard.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
cache
|
bool
|
If `True`, the loaded data are kept in memory (cached) so that subsequent calls reuse them. |
False
|
Returns:
| Type | Description |
|---|---|
T
|
The data in the shard. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.io import save_json
>>> from iden.shard import JsonShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("uri/0001").as_uri()
... file = Path(tmpdir).joinpath("data.json")
... save_json([1, 2, 3], file)
... shard = JsonShard(uri=uri, path=file)
... shard.get_data()
...
[1, 2, 3]
iden.shard.BaseShard.get_uri
abstractmethod
¶
get_uri() -> str | None
Get the Uniform Resource Identifier (URI) of the shard.
Returns:
| Type | Description |
|---|---|
str | None
|
The Uniform Resource Identifier (URI). |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.io import save_json
>>> from iden.shard import JsonShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("uri/0001").as_uri()
... file = Path(tmpdir).joinpath("data.json")
... save_json([1, 2, 3], file)
... shard = JsonShard(uri=uri, path=file)
... shard.get_uri()
...
'file:///.../uri/0001'
iden.shard.BaseShard.is_cached
abstractmethod
¶
is_cached() -> bool
Indicate if the data in the shard are cached or not.
Returns:
| Type | Description |
|---|---|
bool
|
`True` if the data are cached, otherwise `False`. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.io import save_json
>>> from iden.shard import JsonShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("uri/0001").as_uri()
... file = Path(tmpdir).joinpath("data.json")
... save_json([1, 2, 3], file)
... shard = JsonShard(uri=uri, path=file)
... shard.is_cached()
... data = shard.get_data(cache=True)
... shard.is_cached()
... shard.clear()
... shard.is_cached()
...
False
True
False
iden.shard.CloudpickleShard ¶
Bases: FileShard[T]
Implement a cloudpickle shard for advanced Python object serialization.
This shard stores data using cloudpickle, which extends Python's pickle to handle more complex objects like lambda functions and nested classes. The data are stored in a cloudpickle file.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
uri
|
str
|
The shard's URI. |
required |
path
|
Path | str
|
The path to the cloudpickle file. |
required |
Raises:
| Type | Description |
|---|---|
RuntimeError
|
if `cloudpickle` is not installed. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import CloudpickleShard
>>> from iden.io import save_pickle
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.pkl")
... save_pickle([1, 2, 3], file)
... shard = CloudpickleShard(uri="file:///data/1234456789", path=file)
... shard.get_data()
...
[1, 2, 3]
iden.shard.CloudpickleShard.generate_uri_config
classmethod
¶
generate_uri_config(path: Path) -> dict[str, Any]
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
path
|
Path
|
The path to the cloudpickle file. |
required |
Returns:
| Type | Description |
|---|---|
dict[str, Any]
|
The minimal config to load the shard from its URI. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import CloudpickleShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.pkl")
... CloudpickleShard.generate_uri_config(file)
...
{'kwargs': {'path': '.../data.pkl'},
'loader': {'_target_': 'iden.shard.loader.CloudpickleShardLoader'}}
iden.shard.FileShard ¶
Bases: BaseShard[T]
Implement a generic shard where the data are stored in a single file.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
uri
|
str
|
The shard's URI. |
required |
path
|
Path | str
|
The path to the file where the data are stored. |
required |
loader
|
BaseLoader[T] | dict[Any, Any] | None
|
The data loader or its configuration. |
None
|
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import FileShard
>>> from iden.io import save_json, JsonLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.json")
... save_json([1, 2, 3], file)
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... shard = FileShard(uri=uri, path=file, loader=JsonLoader())
... shard.get_data()
...
[1, 2, 3]
iden.shard.FileShard.from_uri
classmethod
¶
from_uri(uri: str) -> S
Instantiate a shard from its URI.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
uri
|
str
|
The Uniform Resource Identifier (URI) of the file shard to load. |
required |
Returns:
| Type | Description |
|---|---|
S
|
The instantiated shard. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import FileShard, create_json_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... create_json_shard([1, 2, 3], uri=uri)
... shard = FileShard.from_uri(uri)
... shard
...
JsonShard(uri=file:///.../my_uri)
iden.shard.FileShard.generate_uri_config
classmethod
¶
generate_uri_config(path: Path) -> dict[str, Any]
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
path
|
Path
|
The path to the file where the data are stored. |
required |
Returns:
| Type | Description |
|---|---|
dict[str, Any]
|
The minimal config to load the shard from its URI. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import FileShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.json")
... FileShard.generate_uri_config(file)
...
{'kwargs': {'path': '.../data.json'},
'loader': {'_target_': 'iden.shard.loader.FileShardLoader'}}
iden.shard.InMemoryShard ¶
Bases: BaseShard[T]
Implement an in-memory shard for transient data storage.
This shard stores data directly in memory without persistence to disk. It does not have a valid URI as the data exists only during runtime.
Example
>>> from iden.shard import InMemoryShard
>>> shard = InMemoryShard([1, 2, 3])
>>> shard.get_data()
[1, 2, 3]
iden.shard.JoblibShard ¶
Bases: FileShard[T]
Implement a joblib shard for efficient persistence of Python objects.
This shard stores data in a joblib file format, which provides efficient serialization for numerical data and scikit-learn models. The data are stored in a joblib file.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
uri
|
str
|
The shard's URI. |
required |
path
|
Path | str
|
The path to the joblib file. |
required |
Raises:
| Type | Description |
|---|---|
RuntimeError
|
if `joblib` is not installed. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import JoblibShard
>>> from iden.io import save_pickle
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.joblib")
... save_pickle([1, 2, 3], file)
... shard = JoblibShard(uri="file:///data/1234456789", path=file)
... shard.get_data()
...
[1, 2, 3]
iden.shard.JoblibShard.generate_uri_config
classmethod
¶
generate_uri_config(path: Path) -> dict[str, Any]
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
path
|
Path
|
The path to the joblib file. |
required |
Returns:
| Type | Description |
|---|---|
dict[str, Any]
|
The minimal config to load the shard from its URI. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import JoblibShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.joblib")
... JoblibShard.generate_uri_config(file)
...
{'kwargs': {'path': '.../data.joblib'},
'loader': {'_target_': 'iden.shard.loader.JoblibShardLoader'}}
iden.shard.JsonShard ¶
Bases: FileShard[T]
Implement a JSON shard for human-readable data persistence.
This shard stores data in JSON (JavaScript Object Notation) format, providing a text-based, human-readable serialization. The data are stored in a JSON file.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
uri
|
str
|
The shard's URI. |
required |
path
|
Path | str
|
The path to the JSON file. |
required |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import JsonShard
>>> from iden.io import save_json
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.json")
... save_json([1, 2, 3], file)
... shard = JsonShard(uri="file:///data/1234456789", path=file)
... shard.get_data()
...
[1, 2, 3]
iden.shard.JsonShard.generate_uri_config
classmethod
¶
generate_uri_config(path: Path) -> dict[str, Any]
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
path
|
Path
|
The path to the json file. |
required |
Returns:
| Type | Description |
|---|---|
dict[str, Any]
|
The minimal config to load the shard from its URI. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import JsonShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.json")
... JsonShard.generate_uri_config(file)
...
{'kwargs': {'path': '.../data.json'},
'loader': {'_target_': 'iden.shard.loader.JsonShardLoader'}}
iden.shard.NumpySafetensorsShard ¶
Bases: FileShard[dict[str, ndarray]]
Implement a safetensors shard for secure NumPy array storage.
This shard stores NumPy arrays using the safetensors format, which provides fast and secure serialization without arbitrary code execution risks. The data are stored in a safetensors file.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
uri
|
str
|
The shard's URI. |
required |
path
|
Path | str
|
The path to the safetensors file. |
required |
Raises:
| Type | Description |
|---|---|
RuntimeError
|
if `safetensors` or `numpy` is not installed. |
Example
>>> import tempfile
>>> import numpy as np
>>> from pathlib import Path
>>> from iden.shard import NumpySafetensorsShard
>>> from iden.io.safetensors import NumpySaver
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.safetensors")
... NumpySaver().save({"key1": np.ones((2, 3)), "key2": np.arange(5)}, file)
... shard = NumpySafetensorsShard(uri="file:///data/1234456789", path=file)
... dict(sorted(shard.get_data().items()))
...
{'key1': array([[1., 1., 1.], [1., 1., 1.]]), 'key2': array([0, 1, 2, 3, 4])}
iden.shard.NumpySafetensorsShard.generate_uri_config
classmethod
¶
generate_uri_config(path: Path) -> dict[str, Any]
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
path
|
Path
|
The path to the safetensors file. |
required |
Returns:
| Type | Description |
|---|---|
dict[str, Any]
|
The minimal config to load the shard from its URI. |
Example
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.shard import NumpySafetensorsShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.safetensors")
... NumpySafetensorsShard.generate_uri_config(file)
...
{'kwargs': {'path': '.../data.safetensors'},
'loader': {'_target_': 'iden.shard.loader.NumpySafetensorsShardLoader'}}
iden.shard.PickleShard ¶
Bases: FileShard[T]
Implement a pickle shard for Python object serialization.
This shard stores data using Python's pickle protocol, which allows serialization of arbitrary Python objects. The data are stored in a pickle file.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
uri
|
str
|
The shard's URI. |
required |
path
|
Path | str
|
The path to the pickle file. |
required |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import PickleShard
>>> from iden.io import save_pickle
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.pkl")
... save_pickle([1, 2, 3], file)
... shard = PickleShard(uri="file:///data/1234456789", path=file)
... shard.get_data()
...
[1, 2, 3]
iden.shard.PickleShard.generate_uri_config
classmethod
¶
generate_uri_config(path: Path) -> dict[str, Any]
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
path
|
Path
|
The path to the pickle file. |
required |
Returns:
| Type | Description |
|---|---|
dict[str, Any]
|
The minimal config to load the shard from its URI. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import PickleShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.pkl")
... PickleShard.generate_uri_config(file)
...
{'kwargs': {'path': '.../data.pkl'},
'loader': {'_target_': 'iden.shard.loader.PickleShardLoader'}}
iden.shard.ShardDict ¶
Bases: BaseShard[T]
Implement a data structure to manage a dictionary of shards.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
uri
|
str
|
The shard's URI. |
required |
shards
|
dict[str, BaseShard[T]]
|
The dictionary of shards. |
required |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.dataset import VanillaDataset
>>> from iden.shard import create_json_shard, ShardDict
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shards/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shards/uri2").as_uri()
... ),
... }
... sd = ShardDict(uri=Path(tmpdir).joinpath("uri").as_uri(), shards=shards)
... sd
...
ShardDict(
(uri): file:///.../uri
(shards):
(train): JsonShard(uri=file:///.../shards/uri1)
(val): JsonShard(uri=file:///.../shards/uri2)
)
iden.shard.ShardDict.from_uri
classmethod
¶
from_uri(uri: str) -> ShardDict[T]
Instantiate a shard from its URI.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
uri
|
str
|
The Uniform Resource Identifier (URI) of the shard dictionary to load. |
required |
Returns:
| Type | Description |
|---|---|
ShardDict[T]
|
The instantiated shard. |
Example:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import ShardDict, create_json_shard, create_shard_dict
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... }
... uri = Path(tmpdir).joinpath("uri").as_uri()
... create_shard_dict(shards, uri=uri)
... shard = ShardDict.from_uri(uri)
... shard
...
ShardDict(
(uri): file:///.../uri
(shards):
(train): JsonShard(uri=file:///.../shard/uri1)
(val): JsonShard(uri=file:///.../shard/uri2)
)
iden.shard.ShardDict.generate_uri_config
classmethod
¶
generate_uri_config(
shards: dict[str, BaseShard[T]],
) -> dict[str, Any]
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
shards
|
dict[str, BaseShard[T]]
|
The dictionary of shards to include in the configuration, where keys are shard identifiers. |
required |
Returns:
| Type | Description |
|---|---|
dict[str, Any]
|
The minimal config to load the shard from its URI. |
Example:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import ShardDict, create_json_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... }
... ShardDict.generate_uri_config(shards)
...
{'shards': {'train': 'file:///.../shard/uri1', 'val': 'file:///.../shard/uri2'},
'loader': {'_target_': 'iden.shard.loader.ShardDictLoader'}}
iden.shard.ShardDict.get_shard ¶
get_shard(shard_id: str) -> Any
Get a shard.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
shard_id
|
str
|
The shard ID. |
required |
Returns:
| Type | Description |
|---|---|
Any
|
The shard. |
Raises:
| Type | Description |
|---|---|
ShardNotFoundError
|
if the shard does not exist. |
Example:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard, ShardDict
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... }
... sd = ShardDict(uri=Path(tmpdir).joinpath("main_uri").as_uri(), shards=shards)
... sd.get_shard("train")
...
JsonShard(uri=file:///.../uri1)
iden.shard.ShardDict.get_shard_ids ¶
get_shard_ids() -> set[str]
Get the shard IDs.
Returns:
| Type | Description |
|---|---|
set[str]
|
The shard IDs. |
Example:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard, ShardDict
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... }
... sd = ShardDict(uri=Path(tmpdir).joinpath("main_uri").as_uri(), shards=shards)
... sorted(sd.get_shard_ids())
...
['train', 'val']
iden.shard.ShardDict.has_shard ¶
has_shard(shard_id: str) -> bool
Indicate if the shard exists or not.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
shard_id
|
str
|
The shard ID. |
required |
Returns:
| Type | Description |
|---|---|
bool
|
`True` if the shard exists, otherwise `False`. |
Example:
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard, ShardDict
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... }
... sd = ShardDict(uri=Path(tmpdir).joinpath("main_uri").as_uri(), shards=shards)
... sd.has_shard("train")
... sd.has_shard("test")
...
True
False
iden.shard.ShardTuple ¶
Bases: BaseShard[tuple[BaseShard[T], ...]]
Implement a data structure to manage a tuple of shards.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
uri
|
str
|
The shard's URI. |
required |
shards
|
Iterable[BaseShard[T]]
|
The tuple of shards. |
required |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard
>>> from iden.shard import ShardTuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = [
... create_json_shard([1, 2, 3], uri=Path(tmpdir).joinpath("shards/uri1").as_uri()),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shards/uri2").as_uri()
... ),
... ]
... sl = ShardTuple(uri=Path(tmpdir).joinpath("uri").as_uri(), shards=shards)
... sl
...
ShardTuple(
(uri): file:///.../uri
(shards):
(0): JsonShard(uri=file:///.../shards/uri1)
(1): JsonShard(uri=file:///.../shards/uri2)
)
iden.shard.ShardTuple.from_uri
classmethod
¶
from_uri(uri: str) -> ShardTuple[T]
Instantiate a shard from its URI.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
uri
|
str
|
The Uniform Resource Identifier (URI) of the shard tuple to load. |
required |
Returns:
| Type | Description |
|---|---|
ShardTuple[T]
|
The instantiated shard. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import ShardTuple, create_json_shard, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = [
... create_json_shard([1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ]
... uri = Path(tmpdir).joinpath("uri").as_uri()
... create_shard_tuple(shards, uri=uri)
... shard = ShardTuple.from_uri(uri)
... shard
...
ShardTuple(
(uri): file:///.../uri
(shards):
(0): JsonShard(uri=file:///.../shard/uri1)
(1): JsonShard(uri=file:///.../shard/uri2)
)
iden.shard.ShardTuple.generate_uri_config
classmethod
¶
generate_uri_config(
shards: Iterable[BaseShard[T]],
) -> dict[str, Any]
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
shards
|
Iterable[BaseShard[T]]
|
The sequence of shards to include in the configuration. |
required |
Returns:
| Type | Description |
|---|---|
dict[str, Any]
|
The minimal config to load the shard from its URI. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import ShardTuple, create_json_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = [
... create_json_shard([1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ]
... ShardTuple.generate_uri_config(shards)
...
{'shards': ['file:///.../shard/uri1', 'file:///.../shard/uri2'],
'loader': {'_target_': 'iden.shard.loader.ShardTupleLoader'}}
iden.shard.ShardTuple.get ¶
get(index: int) -> BaseShard[T]
Get a shard.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
index
|
int
|
The shard index to get. |
required |
Returns:
| Type | Description |
|---|---|
BaseShard[T]
|
The shard. |
Raises:
| Type | Description |
|---|---|
IndexError
|
if the index is outside the tuple range. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard
>>> from iden.shard import ShardTuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = [
... create_json_shard([1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ]
... sl = ShardTuple(uri=Path(tmpdir).joinpath("main_uri").as_uri(), shards=shards)
... sl.get(0)
...
JsonShard(uri=file:///.../uri1)
iden.shard.ShardTuple.is_sorted_by_uri ¶
is_sorted_by_uri() -> bool
Indicate if the shards are sorted by ascending order of URIs or not.
Returns:
| Type | Description |
|---|---|
bool
|
`True` if the shards are sorted by ascending order of URIs, otherwise `False`. |
iden.shard.TorchSafetensorsShard ¶
Bases: FileShard[dict[str, Tensor]]
Implement a safetensors shard for secure PyTorch tensor storage.
This shard stores PyTorch tensors using the safetensors format, which provides fast and secure serialization without arbitrary code execution risks. The data are stored in a safetensors file.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
uri
|
str
|
The shard's URI. |
required |
path
|
Path | str
|
The path to the safetensors file. |
required |
Raises:
| Type | Description |
|---|---|
RuntimeError
|
if `safetensors` or `torch` is not installed. |
Example
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.shard import TorchSafetensorsShard
>>> from iden.io.safetensors import TorchSaver
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.safetensors")
... TorchSaver().save({"key1": torch.ones(2, 3), "key2": torch.arange(5)}, file)
... shard = TorchSafetensorsShard(uri="file:///data/1234456789", path=file)
... dict(sorted(shard.get_data().items()))
...
{'key1': tensor([[1., 1., 1.], [1., 1., 1.]]), 'key2': tensor([0, 1, 2, 3, 4])}
iden.shard.TorchSafetensorsShard.generate_uri_config
classmethod
¶
generate_uri_config(path: Path) -> dict[str, Any]
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
path
|
Path
|
The path to the safetensors file. |
required |
Returns:
| Type | Description |
|---|---|
dict[str, Any]
|
The minimal config to load the shard from its URI. |
Example
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.shard import TorchSafetensorsShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.safetensors")
... TorchSafetensorsShard.generate_uri_config(file)
...
{'kwargs': {'path': '.../data.safetensors'},
'loader': {'_target_': 'iden.shard.loader.TorchSafetensorsShardLoader'}}
iden.shard.TorchShard ¶
Bases: FileShard[T]
Implement a PyTorch shard for efficient tensor storage.
This shard stores data in PyTorch's native file format, optimized for
torch.Tensor objects and PyTorch state dictionaries. The data are
stored in a PyTorch file.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
uri
|
str
|
The shard's URI. |
required |
path
|
Path | str
|
The path to the PyTorch file. |
required |
Raises:
| Type | Description |
|---|---|
RuntimeError
|
if `torch` is not installed. |
Example
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.shard import TorchShard
>>> from iden.io import TorchSaver
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.pt")
... TorchSaver().save({"key1": torch.ones(2, 3), "key2": torch.arange(5)}, file)
... shard = TorchShard(uri="file:///data/1234456789", path=file)
... shard.get_data()
...
{'key1': tensor([[1., 1., 1.], [1., 1., 1.]]), 'key2': tensor([0, 1, 2, 3, 4])}
iden.shard.TorchShard.generate_uri_config
classmethod
¶
generate_uri_config(path: Path) -> dict[str, Any]
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the JSON format.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
path
|
Path
|
The path to the PyTorch file. |
required |
Returns:
| Type | Description |
|---|---|
dict[str, Any]
|
The minimal config to load the shard from its URI. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import TorchShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.pt")
... TorchShard.generate_uri_config(file)
...
{'kwargs': {'path': '.../data.pt'},
'loader': {'_target_': 'iden.shard.loader.TorchShardLoader'}}
iden.shard.YamlShard ¶
Bases: FileShard[T]
Implement a YAML shard for human-readable configuration storage.
This shard stores data in YAML (YAML Ain't Markup Language) format, which provides a readable text-based serialization commonly used for configuration files. The data are stored in a YAML file.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
uri
|
str
|
The shard's URI. |
required |
path
|
Path | str
|
The path to the YAML file. |
required |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import YamlShard
>>> from iden.io import save_yaml
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.yaml")
... save_yaml([1, 2, 3], file)
... shard = YamlShard(uri="file:///data/1234456789", path=file)
... shard.get_data()
...
[1, 2, 3]
iden.shard.YamlShard.generate_uri_config
classmethod
¶
generate_uri_config(path: Path) -> dict[str, Any]
Generate the minimal config that is used to load the shard from its URI.
The config must be compatible with the YAML format.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
path
|
Path
|
The path to the yaml file. |
required |
Returns:
| Type | Description |
|---|---|
dict[str, Any]
|
The minimal config to load the shard from its URI. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import YamlShard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... file = Path(tmpdir).joinpath("data.yaml")
... YamlShard.generate_uri_config(file)
...
{'kwargs': {'path': '.../data.yaml'},
'loader': {'_target_': 'iden.shard.loader.YamlShardLoader'}}
iden.shard.create_cloudpickle_shard ¶
create_cloudpickle_shard(
data: T, uri: str, path: Path | None = None
) -> CloudpickleShard[T]
Create a CloudpickleShard from data.
Note
It is a utility function to create a CloudpickleShard from its
data and URI. It is possible to create a CloudpickleShard
in other ways.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data
|
T
|
The data to save in the cloudpickle file. |
required |
uri
|
str
|
The shard's URI. |
required |
path
|
Path | None
|
The path to the cloudpickle file. If `None`, a path is chosen automatically based on the URI. |
None
|
Returns:
| Type | Description |
|---|---|
CloudpickleShard[T]
|
The `CloudpickleShard` object. |
Raises:
| Type | Description |
|---|---|
RuntimeError
|
if `cloudpickle` is not installed. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_cloudpickle_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shard = create_cloudpickle_shard([1, 2, 3], uri=Path(tmpdir).joinpath("my_uri").as_uri())
... shard.get_data()
...
[1, 2, 3]
iden.shard.create_joblib_shard ¶
create_joblib_shard(
data: T, uri: str, path: Path | None = None
) -> JoblibShard[T]
Create a JoblibShard from data.
Note
It is a utility function to create a JoblibShard from its
data and URI. It is possible to create a JoblibShard
in other ways.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data
|
T
|
The data to save in the joblib file. |
required |
uri
|
str
|
The shard's URI. |
required |
path
|
Path | None
|
The path to the joblib file. If `None`, a path is chosen automatically based on the URI. |
None
|
Returns:
| Type | Description |
|---|---|
JoblibShard[T]
|
The `JoblibShard` object. |
Raises:
| Type | Description |
|---|---|
RuntimeError
|
if `joblib` is not installed. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_joblib_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shard = create_joblib_shard([1, 2, 3], uri=Path(tmpdir).joinpath("my_uri").as_uri())
... shard.get_data()
...
[1, 2, 3]
iden.shard.create_json_shard ¶
create_json_shard(
data: T, uri: str, path: Path | None = None
) -> JsonShard[T]
Create a JsonShard from data.
Note
It is a utility function to create a JsonShard from its
data and URI. It is possible to create a JsonShard
in other ways.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data
|
T
|
The data to save in the json file. |
required |
uri
|
str
|
The shard's URI. |
required |
path
|
Path | None
|
The path to the JSON file. If `None`, a path is chosen automatically based on the URI. |
None
|
Returns:
| Type | Description |
|---|---|
JsonShard[T]
|
The `JsonShard` object. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shard = create_json_shard([1, 2, 3], uri=Path(tmpdir).joinpath("my_uri").as_uri())
... shard.get_data()
...
[1, 2, 3]
iden.shard.create_numpy_safetensors_shard ¶
create_numpy_safetensors_shard(
data: dict[str, ndarray],
uri: str,
path: Path | None = None,
) -> NumpySafetensorsShard
Create a NumpySafetensorsShard from data.
Note
It is a utility function to create a NumpySafetensorsShard
from its data and URI. It is possible to create a
NumpySafetensorsShard in other ways.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data
|
dict[str, ndarray]
|
The data to save in the safetensors file. |
required |
uri
|
str
|
The shard's URI. |
required |
path
|
Path | None
|
The path to the safetensors file. If None, the path is generated automatically. |
None
|
Returns:
| Type | Description |
|---|---|
NumpySafetensorsShard
|
The NumpySafetensorsShard object. |
Raises:
| Type | Description |
|---|---|
RuntimeError
|
if safetensors or numpy is not installed. |
Example
>>> import tempfile
>>> import numpy as np
>>> from pathlib import Path
>>> from iden.shard import create_numpy_safetensors_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shard = create_numpy_safetensors_shard(
... data={"key1": np.ones((2, 3)), "key2": np.arange(5)},
... uri=Path(tmpdir).joinpath("my_uri").as_uri(),
... )
... dict(sorted(shard.get_data().items()))
...
{'key1': array([[1., 1., 1.], [1., 1., 1.]]), 'key2': array([0, 1, 2, 3, 4])}
iden.shard.create_pickle_shard ¶
create_pickle_shard(
data: T, uri: str, path: Path | None = None
) -> PickleShard[T]
Create a PickleShard from data.
Note
It is a utility function to create a PickleShard from its
data and URI. It is possible to create a PickleShard
in other ways.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data
|
T
|
The data to save in the pickle file. |
required |
uri
|
str
|
The shard's URI. |
required |
path
|
Path | None
|
The path to the pickle file. If None, the path is generated automatically. |
None
|
Returns:
| Type | Description |
|---|---|
PickleShard[T]
|
The PickleShard object. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_pickle_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shard = create_pickle_shard([1, 2, 3], uri=Path(tmpdir).joinpath("my_uri").as_uri())
... shard.get_data()
...
[1, 2, 3]
iden.shard.create_shard_dict ¶
create_shard_dict(
shards: dict[str, BaseShard[T]], uri: str
) -> ShardDict[T]
Create a ShardDict from a dictionary of shards.
Note
It is a utility function to create a ShardDict from its
shards and URI. It is possible to create a ShardDict
in other ways.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
shards
|
dict[str, BaseShard[T]]
|
The dictionary of shards to include, where keys are shard identifiers and values are shard objects. |
required |
uri
|
str
|
The Uniform Resource Identifier (URI) for the shard dictionary. |
required |
Returns:
| Type | Description |
|---|---|
ShardDict[T]
|
The ShardDict object. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import ShardDict, create_json_shard, create_shard_dict
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... }
... shard = create_shard_dict(shards, uri=Path(tmpdir).joinpath("uri").as_uri())
... shard
...
ShardDict(
(uri): file:///.../uri
(shards):
(train): JsonShard(uri=file:///.../shard/uri1)
(val): JsonShard(uri=file:///.../shard/uri2)
)
iden.shard.create_shard_tuple ¶
create_shard_tuple(
shards: Iterable[BaseShard[T]], uri: str
) -> ShardTuple[T]
Create a ShardTuple from a sequence of shards.
Note
It is a utility function to create a ShardTuple from its
shards and URI. It is possible to create a ShardTuple
in other ways.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
shards
|
Iterable[BaseShard[T]]
|
The sequence of shards to include in the tuple. |
required |
uri
|
str
|
The Uniform Resource Identifier (URI) for the shard tuple. |
required |
Returns:
| Type | Description |
|---|---|
ShardTuple[T]
|
The ShardTuple object. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import ShardTuple, create_json_shard, create_shard_tuple
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = [
... create_json_shard([1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ]
... shard = create_shard_tuple(shards, uri=Path(tmpdir).joinpath("uri").as_uri())
... shard
...
ShardTuple(
(uri): file:///.../uri
(shards):
(0): JsonShard(uri=file:///.../shard/uri1)
(1): JsonShard(uri=file:///.../shard/uri2)
)
iden.shard.create_torch_safetensors_shard ¶
create_torch_safetensors_shard(
data: dict[str, Tensor],
uri: str,
path: Path | None = None,
) -> TorchSafetensorsShard
Create a TorchSafetensorsShard from data.
Note
It is a utility function to create a TorchSafetensorsShard
from its data and URI. It is possible to create a
TorchSafetensorsShard in other ways.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data
|
dict[str, Tensor]
|
The data to save in the safetensors file. |
required |
uri
|
str
|
The shard's URI. |
required |
path
|
Path | None
|
The path to the safetensors file. If None, the path is generated automatically. |
None
|
Returns:
| Type | Description |
|---|---|
TorchSafetensorsShard
|
The TorchSafetensorsShard object. |
Raises:
| Type | Description |
|---|---|
RuntimeError
|
if safetensors or torch is not installed. |
Example
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.shard import create_torch_safetensors_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shard = create_torch_safetensors_shard(
... data={"key1": torch.ones(2, 3), "key2": torch.arange(5)},
... uri=Path(tmpdir).joinpath("my_uri").as_uri(),
... )
... dict(sorted(shard.get_data().items()))
...
{'key1': tensor([[1., 1., 1.], [1., 1., 1.]]), 'key2': tensor([0, 1, 2, 3, 4])}
iden.shard.create_torch_shard ¶
create_torch_shard(
data: T, uri: str, path: Path | None = None
) -> TorchShard[T]
Create a TorchShard from data.
Note
It is a utility function to create a TorchShard from its
data and URI. It is possible to create a TorchShard
in other ways.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data
|
T
|
The data to save in the PyTorch file. |
required |
uri
|
str
|
The shard's URI. |
required |
path
|
Path | None
|
The path to the PyTorch file. If None, the path is generated automatically. |
None
|
Returns:
| Type | Description |
|---|---|
TorchShard[T]
|
The TorchShard object. |
Raises:
| Type | Description |
|---|---|
RuntimeError
|
if torch is not installed. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> import torch
>>> from iden.shard import create_torch_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shard = create_torch_shard(
... data={"key1": torch.ones(2, 3), "key2": torch.arange(5)},
... uri=Path(tmpdir).joinpath("my_uri").as_uri(),
... )
... shard.get_data()
...
{'key1': tensor([[1., 1., 1.], [1., 1., 1.]]), 'key2': tensor([0, 1, 2, 3, 4])}
iden.shard.create_yaml_shard ¶
create_yaml_shard(
data: T, uri: str, path: Path | None = None
) -> YamlShard[T]
Create a YamlShard from data.
Note
It is a utility function to create a YamlShard from its
data and URI. It is possible to create a YamlShard
in other ways.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data
|
T
|
The data to save in the yaml file. |
required |
uri
|
str
|
The shard's URI. |
required |
path
|
Path | None
|
The path to the YAML file. If None, the path is generated automatically. |
None
|
Returns:
| Type | Description |
|---|---|
YamlShard[T]
|
The YamlShard object. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_yaml_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shard = create_yaml_shard([1, 2, 3], uri=Path(tmpdir).joinpath("my_uri").as_uri())
... shard.get_data()
...
[1, 2, 3]
iden.shard.get_dict_uris ¶
get_dict_uris(
shards: dict[str, BaseShard[Any]],
) -> dict[str, str]
Get the dictionary of shard URIs.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
shards
|
dict[str, BaseShard[Any]]
|
The dictionary of shards. |
required |
Returns:
| Type | Description |
|---|---|
dict[str, str]
|
The dictionary of shard URIs. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard, get_dict_uris
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... }
... get_dict_uris(shards)
...
{'train': 'file:///.../shard/uri1', 'val': 'file:///.../shard/uri2'}
iden.shard.get_list_uris ¶
get_list_uris(
shards: Iterable[BaseShard[Any]],
) -> list[str]
Get the list of shard URIs.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
shards
|
Iterable[BaseShard[Any]]
|
The iterable of shards from which to extract URIs. |
required |
Returns:
| Type | Description |
|---|---|
list[str]
|
The list of shard URIs. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import get_list_uris, create_json_shard
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = [
... create_json_shard([1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ]
... get_list_uris(shards)
...
['file:///.../shard/uri1', 'file:///.../shard/uri2']
iden.shard.load_from_uri ¶
load_from_uri(uri: str) -> BaseShard[Any]
Load a shard from its Uniform Resource Identifier (URI).
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
uri
|
str
|
The URI of the shard. |
required |
Returns:
| Type | Description |
|---|---|
BaseShard[Any]
|
The shard associated with the URI. |
Raises:
| Type | Description |
|---|---|
FileNotFoundError
|
if the URI file does not exist. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard, load_from_uri
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... create_json_shard([1, 2, 3], uri=uri)
... shard = load_from_uri(uri)
... shard
...
JsonShard(uri=file:///.../my_uri)
iden.shard.sort_by_uri ¶
sort_by_uri(
shards: Iterable[BaseShard[Any]],
/,
*,
reverse: bool = False,
) -> list[BaseShard[Any]]
Sort a sequence of shards by their URIs.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
shards
|
Iterable[BaseShard[Any]]
|
The shards to sort. |
required |
reverse
|
bool
|
If set to True, the shards are sorted by URI in descending order. |
False
|
Returns:
| Type | Description |
|---|---|
list[BaseShard[Any]]
|
The sorted shards. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard, sort_by_uri
>>> with tempfile.TemporaryDirectory() as tmpdir:
... shards = sort_by_uri(
... [
... create_json_shard([1, 2, 3], uri=Path(tmpdir).joinpath("uri2").as_uri()),
... create_json_shard([4, 5, 6, 7], uri=Path(tmpdir).joinpath("uri3").as_uri()),
... create_json_shard([4, 5, 6, 7], uri=Path(tmpdir).joinpath("uri1").as_uri()),
... ]
... )
... shards
...
[JsonShard(uri=file:///.../uri1), JsonShard(uri=file:///.../uri2), JsonShard(uri=file:///.../uri3)]
iden.shard.generator ¶
Contain shard generator implementations.
iden.shard.generator.BaseShardGenerator ¶
Bases: ABC, Generic[T]
Define the base class to create a shard.
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import JsonShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = JsonShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
JsonShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
JsonShard(uri=file:///.../uri/shard1)
iden.shard.generator.BaseShardGenerator.equal
abstractmethod
¶
equal(other: Any, equal_nan: bool = False) -> bool
Indicate if two objects are equal or not.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
other
|
Any
|
The object to compare with. |
required |
equal_nan
|
bool
|
If True, then two NaNs are considered equal. |
False
|
Returns:
| Type | Description |
|---|---|
bool
|
True if the two objects are equal, otherwise False. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import JsonShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator1 = JsonShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator2 = JsonShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator3 = JsonShardGenerator(
... data=DataGenerator([]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator1.equal(generator2)
... generator1.equal(generator3)
...
True
False
iden.shard.generator.BaseShardGenerator.generate
abstractmethod
¶
generate(shard_id: str) -> BaseShard[T]
Generate a shard.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
shard_id
|
str
|
The shard ID. |
required |
Returns:
| Type | Description |
|---|---|
BaseShard[T]
|
The generated shard. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import JsonShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = JsonShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... shard = generator.generate("shard1")
... shard
...
JsonShard(uri=file:///.../uri/shard1)
iden.shard.generator.CloudpickleShardGenerator ¶
Bases: BaseFileShardGenerator[T]
Implement a cloudpickle shard generator.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data
|
BaseDataGenerator[T] | dict[Any, Any]
|
The data to save in the shard. |
required |
path_uri
|
Path
|
The path where to save the URI file. |
required |
path_shard
|
Path
|
The path where to save the shard data. |
required |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import CloudpickleShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = CloudpickleShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
CloudpickleShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
CloudpickleShard(uri=file:///.../uri/shard1)
iden.shard.generator.JoblibShardGenerator ¶
Bases: BaseFileShardGenerator[T]
Implement a joblib shard generator.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data
|
BaseDataGenerator[T] | dict[Any, Any]
|
The data to save in the shard. |
required |
path_uri
|
Path
|
The path where to save the URI file. |
required |
path_shard
|
Path
|
The path where to save the shard data. |
required |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import JoblibShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = JoblibShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
JoblibShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
JoblibShard(uri=file:///.../uri/shard1)
iden.shard.generator.JsonShardGenerator ¶
Bases: BaseFileShardGenerator[T]
Implement a JSON shard generator for creating shards with JSON persistence.
This generator creates shards that store data in JSON format, providing human-readable serialization suitable for structured data.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data
|
BaseDataGenerator[T] | dict[Any, Any]
|
The data to save in the shard. |
required |
path_uri
|
Path
|
The path where to save the URI file. |
required |
path_shard
|
Path
|
The path where to save the shard data. |
required |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import JsonShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = JsonShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
JsonShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
JsonShard(uri=file:///.../uri/shard1)
iden.shard.generator.NumpySafetensorsShardGenerator ¶
Bases: BaseFileShardGenerator[dict[str, ndarray]]
Implement a safetensors shard generator for numpy.ndarrays.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data
|
BaseDataGenerator[dict[str, ndarray]] | dict[Any, Any]
|
The data to save in the shard. |
required |
path_uri
|
Path
|
The path where to save the URI file. |
required |
path_shard
|
Path
|
The path where to save the shard data. |
required |
Example
>>> import tempfile
>>> import numpy as np
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import NumpySafetensorsShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = NumpySafetensorsShardGenerator(
... data=DataGenerator({"key1": np.ones((2, 3)), "key2": np.arange(5)}),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
NumpySafetensorsShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
NumpySafetensorsShard(uri=file:///.../uri/shard1)
iden.shard.generator.PickleShardGenerator ¶
Bases: BaseFileShardGenerator[T]
Implement a pickle shard generator for creating shards with pickle persistence.
This generator creates shards that store data using Python's pickle protocol, suitable for arbitrary Python objects.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data
|
BaseDataGenerator[T] | dict[Any, Any]
|
The data to save in the shard. |
required |
path_uri
|
Path
|
The path where to save the URI file. |
required |
path_shard
|
Path
|
The path where to save the shard data. |
required |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import PickleShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = PickleShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
PickleShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
PickleShard(uri=file:///.../uri/shard1)
iden.shard.generator.ShardDictGenerator ¶
Bases: BaseShardGenerator[dict[str, BaseShard[T]]]
Implement a shard dictionary generator for creating dictionaries of shards.
This generator creates ShardDict instances containing multiple named shards, useful for organizing data splits or related datasets.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
shards
|
dict[str, BaseShardGenerator[T] | dict[Any, Any]]
|
The shard generators or their configurations. |
required |
path_uri
|
Path
|
The path where to save the URI file. |
required |
Example
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import ShardDictGenerator, JsonShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = ShardDictGenerator(
... shards={
... "train": JsonShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... },
... path_uri=Path(tmpdir).joinpath("uri"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
ShardDictGenerator(
(path_uri): PosixPath('/.../uri')
(shards):
(train): JsonShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
)
ShardDict(
(uri): file:///.../uri/shard1
(shards):
(train): JsonShard(uri=file:///.../uri/train)
)
iden.shard.generator.ShardTupleGenerator ¶
Bases: BaseShardGenerator[tuple[BaseShard[T], ...]]
Implement a shard tuple generator for creating sequences of shards.
This generator creates ShardTuple instances containing an ordered sequence of shards, useful for organizing sequential data batches.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
shard
|
BaseShardGenerator[T] | dict[Any, Any]
|
The shard generator or its configuration. |
required |
num_shards
|
int
|
The number of shards to generate in the ShardTuple.
|
required |
path_uri
|
Path
|
The path where to save the URI file. |
required |
Example
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import ShardTupleGenerator, JsonShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = ShardTupleGenerator(
... shard=JsonShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... ),
... path_uri=Path(tmpdir).joinpath("uri"),
... num_shards=5,
... )
... generator
... shard = generator.generate("shard1")
... shard
...
ShardTupleGenerator(
(path_uri): PosixPath('/.../uri')
(num_shards): 5
(shard): JsonShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
)
ShardTuple(
(uri): file:///.../uri/shard1
(shards):
(0): JsonShard(uri=file:///.../uri/000000001)
(1): JsonShard(uri=file:///.../uri/000000002)
(2): JsonShard(uri=file:///.../uri/000000003)
(3): JsonShard(uri=file:///.../uri/000000004)
(4): JsonShard(uri=file:///.../uri/000000005)
)
iden.shard.generator.TorchSafetensorsShardGenerator ¶
Bases: BaseFileShardGenerator[dict[str, Tensor]]
Implement a safetensors shard generator for torch.Tensors.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data
|
BaseDataGenerator[dict[str, Tensor]] | dict[Any, Any]
|
The data to save in the shard. |
required |
path_uri
|
Path
|
The path where to save the URI file. |
required |
path_shard
|
Path
|
The path where to save the shard data. |
required |
Example
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import TorchSafetensorsShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = TorchSafetensorsShardGenerator(
... data=DataGenerator({"key1": torch.ones(2, 3), "key2": torch.arange(5)}),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
TorchSafetensorsShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
TorchSafetensorsShard(uri=file:///.../uri/shard1)
iden.shard.generator.TorchShardGenerator ¶
Bases: BaseFileShardGenerator[T]
Implement a torch shard generator.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data
|
BaseDataGenerator[T] | dict[Any, Any]
|
The data to save in the shard. |
required |
path_uri
|
Path
|
The path where to save the URI file. |
required |
path_shard
|
Path
|
The path where to save the shard data. |
required |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import TorchShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = TorchShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
TorchShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
TorchShard(uri=file:///.../uri/shard1)
iden.shard.generator.YamlShardGenerator ¶
Bases: BaseFileShardGenerator[T]
Implement a YAML shard generator.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
data
|
BaseDataGenerator[T] | dict[Any, Any]
|
The data to save in the shard. |
required |
path_uri
|
Path
|
The path where to save the URI file. |
required |
path_shard
|
Path
|
The path where to save the shard data. |
required |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.data.generator import DataGenerator
>>> from iden.shard.generator import YamlShardGenerator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = YamlShardGenerator(
... data=DataGenerator([1, 2, 3]),
... path_uri=Path(tmpdir).joinpath("uri"),
... path_shard=Path(tmpdir).joinpath("data"),
... )
... generator
... shard = generator.generate("shard1")
... shard
...
YamlShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): DataGenerator(copy=False)
)
YamlShard(uri=file:///.../uri/shard1)
iden.shard.generator.is_shard_generator_config ¶
is_shard_generator_config(config: dict[Any, Any]) -> bool
Indicate if the input configuration is a configuration for a
BaseShardGenerator.
This function only checks if the value of the key _target_
is valid. It does not check the other values. If _target_
indicates a function, the returned type hint is used to check
the class.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
config
|
dict[Any, Any]
|
The configuration to check. |
required |
Returns:
| Type | Description |
|---|---|
bool
|
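True if the input configuration is a configuration for a BaseShardGenerator object, otherwise False. |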
Example
>>> from iden.shard.generator import is_shard_generator_config
>>> is_shard_generator_config({"_target_": "iden.shard.generator.JsonShardGenerator"})
True
iden.shard.generator.setup_shard_generator ¶
setup_shard_generator(
shard_generator: (
BaseShardGenerator[T] | dict[Any, Any]
),
) -> BaseShardGenerator[T]
Set up a shard generator.
The shard generator is instantiated from its configuration by using the
BaseShardGenerator factory function.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
shard_generator
|
BaseShardGenerator[T] | dict[Any, Any]
|
The shard generator or its configuration. |
required |
Returns:
| Type | Description |
|---|---|
BaseShardGenerator[T]
|
The instantiated shard generator. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard.generator import setup_shard_generator
>>> with tempfile.TemporaryDirectory() as tmpdir:
... generator = setup_shard_generator(
... {
... "_target_": "iden.shard.generator.JsonShardGenerator",
... "data": [1, 2, 3],
... "path_uri": Path(tmpdir).joinpath("uri"),
... "path_shard": Path(tmpdir).joinpath("data"),
... }
... )
... generator
...
JsonShardGenerator(
(path_uri): PosixPath('/.../uri')
(path_shard): PosixPath('/.../data')
(data): [1, 2, 3]
)
iden.shard.loader ¶
Contain shard loader implementations.
iden.shard.loader.BaseShardLoader ¶
Bases: ABC, Generic[T]
Define the base class to implement a shard loader.
A shard loader object allows loading a BaseShard object from
its Uniform Resource Identifier (URI).
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard
>>> from iden.shard.loader import JsonShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... create_json_shard([1, 2, 3], uri=uri)
... loader = JsonShardLoader()
... shard = loader.load(uri)
... shard
...
JsonShard(uri=file:///.../my_uri)
iden.shard.loader.BaseShardLoader.equal
abstractmethod
¶
equal(other: Any, equal_nan: bool = False) -> bool
Indicate if two objects are equal or not.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
other
|
Any
|
The object to compare with. |
required |
equal_nan
|
bool
|
If True, then two NaNs are considered equal. |
False
|
Returns:
| Type | Description |
|---|---|
bool
|
True if the two objects are equal, otherwise False. |
Example
>>> from iden.shard.loader import JsonShardLoader, PickleShardLoader
>>> JsonShardLoader().equal(JsonShardLoader())
True
>>> JsonShardLoader().equal(PickleShardLoader())
False
iden.shard.loader.BaseShardLoader.load
abstractmethod
¶
load(uri: str) -> BaseShard[T]
Load a shard from its Uniform Resource Identifier (URI).
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
uri
|
str
|
The URI of the shard to load. |
required |
Returns:
| Type | Description |
|---|---|
BaseShard[T]
|
The loaded shard. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard
>>> from iden.shard.loader import JsonShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... create_json_shard([1, 2, 3], uri=uri)
... loader = JsonShardLoader()
... shard = loader.load(uri)
... shard
...
JsonShard(uri=file:///.../my_uri)
iden.shard.loader.CloudpickleShardLoader ¶
Bases: BaseShardLoader[T]
Implement a cloudpickle shard loader.
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_cloudpickle_shard
>>> from iden.shard.loader import CloudpickleShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... create_cloudpickle_shard([1, 2, 3], uri=uri)
... loader = CloudpickleShardLoader()
... shard = loader.load(uri)
... shard
...
CloudpickleShard(uri=file:///.../my_uri)
iden.shard.loader.FileShardLoader ¶
Bases: BaseShardLoader[T]
Implement a file-based shard loader.
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard
>>> from iden.shard.loader import FileShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... create_json_shard([1, 2, 3], uri=uri)
... loader = FileShardLoader()
... shard = loader.load(uri)
... shard
...
JsonShard(uri=file:///.../my_uri)
iden.shard.loader.JoblibShardLoader ¶
Bases: BaseShardLoader[T]
Implement a joblib shard loader for loading shards from joblib files.
This loader reads shard configuration from a URI and instantiates a joblib shard with the specified data file path.
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_joblib_shard
>>> from iden.shard.loader import JoblibShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... create_joblib_shard([1, 2, 3], uri=uri)
... loader = JoblibShardLoader()
... shard = loader.load(uri)
... shard
...
JoblibShard(uri=file:///.../my_uri)
iden.shard.loader.JsonShardLoader ¶
Bases: BaseShardLoader[T]
Implement a JSON shard loader for loading shards from JSON files.
This loader reads shard configuration from a URI and instantiates a JSON shard with the specified data file path.
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard
>>> from iden.shard.loader import JsonShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... create_json_shard([1, 2, 3], uri=uri)
... loader = JsonShardLoader()
... shard = loader.load(uri)
... shard
...
JsonShard(uri=file:///.../my_uri)
iden.shard.loader.NumpySafetensorsShardLoader ¶
Bases: BaseShardLoader[dict[str, ndarray]]
Implement a safetensors shard loader for numpy.ndarrays.
Raises:
| Type | Description |
|---|---|
RuntimeError
|
if safetensors or numpy is not installed. |
Example
>>> import tempfile
>>> import numpy as np
>>> from pathlib import Path
>>> from iden.shard import create_numpy_safetensors_shard
>>> from iden.shard.loader import NumpySafetensorsShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... create_numpy_safetensors_shard(
... {"key1": np.ones((2, 3)), "key2": np.arange(5)}, uri=uri
... )
... loader = NumpySafetensorsShardLoader()
... shard = loader.load(uri)
... shard
...
NumpySafetensorsShard(uri=file:///.../my_uri)
iden.shard.loader.PickleShardLoader ¶
Bases: BaseShardLoader[T]
Implement a pickle shard loader for loading shards from pickle files.
This loader reads shard configuration from a URI and instantiates a pickle shard with the specified data file path.
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_pickle_shard
>>> from iden.shard.loader import PickleShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... create_pickle_shard([1, 2, 3], uri=uri)
... loader = PickleShardLoader()
... shard = loader.load(uri)
... shard
...
PickleShard(uri=file:///.../my_uri)
iden.shard.loader.ShardDictLoader ¶
Bases: BaseShardLoader[dict[str, BaseShard[T]]]
Implement a shard dictionary loader for loading dictionary-structured shards.
This loader reads shard configuration from a URI and instantiates a ShardDict containing multiple named shards.
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard, create_shard_dict
>>> from iden.shard.loader import ShardDictLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("uri").as_uri()
... shards = {
... "train": create_json_shard(
... [1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()
... ),
... "val": create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... }
... create_shard_dict(shards, uri=uri)
... loader = ShardDictLoader()
... shard = loader.load(uri)
... shard
...
ShardDict(
(uri): file:///.../uri
(shards):
(train): JsonShard(uri=file:///.../shard/uri1)
(val): JsonShard(uri=file:///.../shard/uri2)
)
iden.shard.loader.ShardTupleLoader ¶
Bases: BaseShardLoader[tuple[BaseShard[T], ...]]
Implement a shard tuple loader for loading sequence-structured shards.
This loader reads shard configuration from a URI and instantiates a ShardTuple containing an ordered sequence of shards.
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_json_shard, create_shard_tuple
>>> from iden.shard.loader import ShardTupleLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("uri").as_uri()
... shards = [
... create_json_shard([1, 2, 3], uri=Path(tmpdir).joinpath("shard/uri1").as_uri()),
... create_json_shard(
... [4, 5, 6, 7], uri=Path(tmpdir).joinpath("shard/uri2").as_uri()
... ),
... ]
... create_shard_tuple(shards, uri=uri)
... loader = ShardTupleLoader()
... shard = loader.load(uri)
... shard
...
ShardTuple(
(uri): file:///.../uri
(shards):
(0): JsonShard(uri=file:///.../shard/uri1)
(1): JsonShard(uri=file:///.../shard/uri2)
)
iden.shard.loader.TorchSafetensorsShardLoader ¶
Bases: BaseShardLoader[dict[str, Tensor]]
Implement a safetensors shard loader for torch.Tensors.
Raises:
| Type | Description |
|---|---|
| RuntimeError | if safetensors or torch is not installed. |
Example
>>> import tempfile
>>> import torch
>>> from pathlib import Path
>>> from iden.shard import create_torch_safetensors_shard
>>> from iden.shard.loader import TorchSafetensorsShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... create_torch_safetensors_shard(
... {"key1": torch.ones(2, 3), "key2": torch.arange(5)}, uri=uri
... )
... loader = TorchSafetensorsShardLoader()
... shard = loader.load(uri)
... shard
...
TorchSafetensorsShard(uri=file:///.../my_uri)
iden.shard.loader.TorchShardLoader ¶
Bases: BaseShardLoader[T]
Implement a PyTorch shard loader for loading shards from PyTorch files.
This loader reads shard configuration from a URI and instantiates a PyTorch shard with the specified data file path.
Raises:
| Type | Description |
|---|---|
| RuntimeError | if torch is not installed. |
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_torch_shard
>>> from iden.shard.loader import TorchShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... create_torch_shard([1, 2, 3], uri=uri)
... loader = TorchShardLoader()
... shard = loader.load(uri)
... shard
...
TorchShard(uri=file:///.../my_uri)
iden.shard.loader.YamlShardLoader ¶
Bases: BaseShardLoader[T]
Implement a YAML shard loader for loading shards from YAML files.
This loader reads shard configuration from a URI and instantiates a YAML shard with the specified data file path.
Example
>>> import tempfile
>>> from pathlib import Path
>>> from iden.shard import create_yaml_shard
>>> from iden.shard.loader import YamlShardLoader
>>> with tempfile.TemporaryDirectory() as tmpdir:
... uri = Path(tmpdir).joinpath("my_uri").as_uri()
... create_yaml_shard([1, 2, 3], uri=uri)
... loader = YamlShardLoader()
... shard = loader.load(uri)
... shard
...
YamlShard(uri=file:///.../my_uri)
iden.shard.loader.is_shard_loader_config ¶
is_shard_loader_config(config: dict[Any, Any]) -> bool
Indicate if the input configuration is a configuration for a
BaseShardLoader.
This function only checks if the value of the key _target_
is valid. It does not check the other values. If _target_
indicates a function, the returned type hint is used to check
the class.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| config | dict[Any, Any] | The configuration to check. | required |
Returns:
| Type | Description |
|---|---|
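| bool | True if the input configuration is a configuration for a BaseShardLoader object, otherwise False. |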
Example
>>> from iden.shard.loader import is_shard_loader_config
>>> is_shard_loader_config({"_target_": "iden.shard.loader.JsonShardLoader"})
True
iden.shard.loader.setup_shard_loader ¶
setup_shard_loader(
shard_loader: BaseShardLoader[T] | dict[Any, Any],
) -> BaseShardLoader[T]
Set up a shard loader.
The shard loader is instantiated from its configuration by using the
BaseShardLoader factory function.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| shard_loader | BaseShardLoader[T] \| dict[Any, Any] | The shard loader or its configuration. | required |
Returns:
| Type | Description |
|---|---|
| BaseShardLoader[T] | The instantiated shard loader. |
Example
>>> from iden.shard.loader import setup_shard_loader
>>> shard_loader = setup_shard_loader({"_target_": "iden.shard.loader.JsonShardLoader"})
>>> shard_loader
JsonShardLoader()