Skip to content

SessionCache

In-process object store that exposes large values as named handles with compact snapshots. The model never sees raw data in message history; it operates on handles through the Python interpreter.


data_harness.cache.SessionCache

SessionCache(
    sample_size: int = 5,
    storage_dir: str | Path | None = None,
    hot_limit: int | None = None,
)

In-process store that exposes large values as named handles with snapshots.

Large objects (DataFrames, arrays, query results) are stored by name. The model only ever sees a compact snapshot — shape, columns, a few sample rows — and operates on the data by writing Python against the handle name. This keeps message context lean without hiding data from the model.

When hot_limit is set, least-recently-used handles are spilled to disk automatically. DataFrames are written as Parquet, NumPy arrays as .npy, and everything else as pickle.

Parameters:

Name Type Description Default
sample_size int

Number of rows/elements to include in each snapshot.

5
storage_dir str | Path | None

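Directory for disk-spilled handles; it is created (including parents) if it does not exist. If None and hot_limit is set, a temporary directory is created automatically. Supplying a directory without hot_limit enables spilling with a default hot_limit of 10.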

None
hot_limit int | None

Maximum number of handles kept in memory at once. Must be at least 1. None means unbounded (all handles stay in memory), unless storage_dir is supplied, in which case the limit defaults to 10.

None
Source code in data_harness/cache.py
def __init__(
    self,
    sample_size: int = 5,
    storage_dir: str | Path | None = None,
    hot_limit: int | None = None,
) -> None:
    if hot_limit is not None and hot_limit < 1:
        raise ValueError("hot_limit must be at least 1")
    self.sample_size = sample_size
    self.hot_limit = hot_limit
    self._store: dict[str, Any] = {}
    self._cold: dict[str, _ColdEntry] = {}
    self._snapshots: dict[str, str] = {}
    self._recency: OrderedDict[str, None] = OrderedDict()
    self._temp_dir: tempfile.TemporaryDirectory[str] | None = None
    if storage_dir is None and hot_limit is not None:
        self._temp_dir = tempfile.TemporaryDirectory(prefix="data-harness-cache-")
        self._storage_dir = Path(self._temp_dir.name)
    elif storage_dir is not None:
        self._storage_dir = Path(storage_dir)
        self._storage_dir.mkdir(parents=True, exist_ok=True)
        if self.hot_limit is None:
            # Supplying storage_dir opts into disk-backed cache behaviour.
            # Keep the default bounded so a caller does not create a spill
            # directory that is never used.
            self.hot_limit = 10
    else:
        self._storage_dir = None

put

put(name: str, value: Any, overwrite: bool = False) -> str

Store a value under name and return the handle actually used.

If name is already taken and overwrite is False, a numeric suffix is appended (name_2, name_3, …) and the new handle is returned.

Parameters:

Name Type Description Default
name str

Desired handle name. Must be a valid Python identifier.

required
value Any

Any Python object. DataFrames and NumPy arrays get specialised snapshot and spill formats.

required
overwrite bool

Replace the existing handle if True.

False

Returns:

Type Description
str

The handle name under which the value was stored.

Raises:

Type Description
ValueError

If name is not a valid Python identifier.

Source code in data_harness/cache.py
def put(self, name: str, value: Any, overwrite: bool = False) -> str:
    """Register ``value`` under a handle and report which handle was used.

    The requested ``name`` is used directly when it is free or when
    ``overwrite`` is ``True``. Otherwise the first free name of the form
    ``name_2``, ``name_3``, … is chosen instead.

    Args:
        name: Requested handle name. Must be a valid Python identifier.
        value: Object to store.
        overwrite: When ``True``, reuse ``name`` even if it is taken; any
            cold entry for it is removed first.

    Returns:
        The handle under which the value was stored.

    Raises:
        ValueError: If ``name`` is not a valid Python identifier.
    """
    if not _is_valid_identifier(name):
        raise ValueError(
            f"Invalid handle name: {name!r}. Must be a valid Python identifier."
        )

    if overwrite:
        # Drop any on-disk copy before the new value is stored.
        self._delete_cold(name)
        target = name
    elif not self.has_handle(name):
        target = name
    else:
        # Name collision: probe name_2, name_3, ... until one is free.
        counter = 2
        while self.has_handle(f"{name}_{counter}"):
            counter += 1
        target = f"{name}_{counter}"

    self._put_resolved(target, value)
    return target

get

get(name: str) -> Any

Retrieve a value by handle name, promoting cold entries to hot.

Parameters:

Name Type Description Default
name str

A handle previously returned by put.

required

Returns:

Type Description
Any

The stored Python object.

Raises:

Type Description
KeyError

If no handle with name exists.

Source code in data_harness/cache.py
def get(self, name: str) -> Any:
    """Look up the object behind a handle, loading it from disk if needed.

    A hot handle is marked as recently used and returned. A cold handle is
    read back, removed from cold storage, placed in the hot store, marked
    as recently used, and the hot limit is then re-enforced.

    Args:
        name: A handle previously returned by `put`.

    Returns:
        The stored Python object.

    Raises:
        KeyError: If ``name`` is neither a hot nor a cold handle.
    """
    if name in self._store:
        self._mark_recent(name)
        return self._store[name]

    if name not in self._cold:
        raise KeyError(name)

    # Cold hit: promote the value back into memory.
    revived = self._read_cold(name)
    self._delete_cold(name)
    self._store[name] = revived
    self._mark_recent(name)
    # Promotion added one hot entry, so re-check the hot limit.
    self._enforce_hot_limit()
    return revived

snapshot

snapshot(handle: str) -> str

Return the compact snapshot string for a stored handle.

The snapshot is a JSON string describing the value's type, shape, and a few sample elements. It is what the model sees in message history instead of the raw object.

Parameters:

Name Type Description Default
handle str

A handle previously returned by put.

required

Returns:

Type Description
str

A JSON string summary of the stored value.

Source code in data_harness/cache.py
def snapshot(self, handle: str) -> str:
    """Return the snapshot string for a handle, building it on first use.

    A cached snapshot is returned as-is. Otherwise the value is fetched
    with `get` (which may promote a cold handle and update recency), a
    snapshot is built from it, cached, and returned.

    Args:
        handle: A handle previously returned by `put`.

    Returns:
        The snapshot string for the stored value.

    Raises:
        KeyError: Propagated from `get` when the handle is unknown and no
            snapshot is cached for it.
    """
    try:
        return self._snapshots[handle]
    except KeyError:
        pass  # Not cached yet; build it below, outside the handler.

    summary = self._make_snapshot(self.get(handle))
    self._snapshots[handle] = summary
    return summary

list_handles

list_handles() -> dict[str, str]

Return a mapping of all handle names to their snapshot strings.

Source code in data_harness/cache.py
def list_handles(self) -> dict[str, str]:
    """Build a ``{handle: snapshot}`` mapping covering every handle.

    Handles are visited in the order given by `handle_names`, and each
    snapshot is obtained through `snapshot`.
    """
    listing: dict[str, str] = {}
    for handle in self.handle_names():
        listing[handle] = self.snapshot(handle)
    return listing

handle_names

handle_names() -> list[str]

Return all handle names in most-recently-used order.

Source code in data_harness/cache.py
def handle_names(self) -> list[str]:
    """Return every handle name, ordered as tracked by the recency index.

    The result is a fresh list, so callers may mutate it freely.
    """
    # NOTE(review): whether the first element is the most or the least
    # recently used handle depends on how _mark_recent reorders _recency;
    # confirm against that helper.
    return [*self._recency]

has_handle

has_handle(name: str) -> bool

Return True if name is a registered handle (hot or cold).

Source code in data_harness/cache.py
def has_handle(self, name: str) -> bool:
    """Report whether ``name`` is present in the hot store or the cold index."""
    # The hot store is checked first; the cold index is only consulted on a miss.
    return any(name in tier for tier in (self._store, self._cold))

delete

delete(name: str) -> None

Remove a handle and its associated disk artefact (if any).

Parameters:

Name Type Description Default
name str

Handle to remove.

required

Raises:

Type Description
KeyError

If no handle with name exists.

Source code in data_harness/cache.py
def delete(self, name: str) -> None:
    """Forget a handle entirely, whether it is hot or cold.

    The value is dropped from the hot store, its cold entry is removed via
    ``_delete_cold``, and its cached snapshot and recency record are
    discarded.

    Args:
        name: Handle to remove.

    Raises:
        KeyError: If ``name`` is not a registered handle.
    """
    if not self.has_handle(name):
        raise KeyError(name)

    self._store.pop(name, None)
    self._delete_cold(name)
    # Per-handle metadata may be absent, so pop with a default.
    for index in (self._snapshots, self._recency):
        index.pop(name, None)

close

close() -> None

Release the temporary storage directory, if one was created.

Source code in data_harness/cache.py
def close(self) -> None:
    """Remove the temporary spill directory if this cache created one.

    Does nothing when no temporary directory is held, so repeated calls
    are harmless.
    """
    scratch = self._temp_dir
    if scratch is None:
        return
    scratch.cleanup()
    # Only forget the directory once cleanup has succeeded.
    self._temp_dir = None