Skip to content

Data Adapters

Authentication for gated datasets

Some adapters (currently LithoSim) load data from a gated HuggingFace Hub repository. If a load() call raises a RuntimeError mentioning HTTP 401, see HuggingFace Authentication for the unblock steps (request access → huggingface-cli login / HF_TOKEN).

openlithohub.data.base

Abstract base class for dataset adapters.

LithoSample dataclass

A single lithography sample with unified tensor representation.

Source code in src/openlithohub/data/base.py
@dataclass
class LithoSample:
    """A single lithography sample with unified tensor representation.

    Only ``design`` is required; ``mask`` and ``resist`` default to ``None``
    and ``metadata`` defaults to a new empty dict for each instance.
    """

    # Design tensor; the only field without a default.
    design: torch.Tensor
    # Mask tensor, or None when the sample carries no mask.
    mask: torch.Tensor | None = None
    # Resist tensor, or None when the sample carries no resist data.
    resist: torch.Tensor | None = None
    # Free-form per-sample key/value data. ``default_factory`` gives every
    # instance its own dict instead of one shared mutable default.
    metadata: dict[str, Any] = field(default_factory=dict)

DatasetAdapter

Bases: ABC

Abstract adapter for lithography datasets.

Subclasses must implement `__len__` and `__getitem__` to provide unified PyTorch Tensor access regardless of underlying format.

Source code in src/openlithohub/data/base.py
class DatasetAdapter(ABC):
    """Common interface implemented by every lithography dataset adapter.

    A concrete adapter provides ``__len__``, ``__getitem__`` and
    ``download``; callers then get uniform PyTorch tensor access through
    :class:`LithoSample` records whatever the underlying format is.
    """

    def __init_subclass__(cls, **kwargs: Any) -> None:
        super().__init_subclass__(**kwargs)
        # Each subclass is handed to the audit-hook installer, which wraps
        # an overridden ``download`` with a per-call audit hook driven by
        # ``OPENLITHOHUB_AUDIT_DIR``. While that env var is unset the
        # wrapper is a no-op, so normal callers pay one env lookup only.
        from openlithohub.data._audit import install_audit_hook

        install_audit_hook(cls)

    @abstractmethod
    def __len__(self) -> int:
        """Return the number of samples in the dataset."""

    @abstractmethod
    def __getitem__(self, index: int) -> LithoSample:
        """Return the sample stored at position ``index``."""

    @property
    def supports_random_access(self) -> bool:
        """Tell callers whether ``len()`` and ``ds[i]`` can be used.

        A streaming adapter (e.g. ``LithoSimDataset(streaming=True)``)
        consumes an iterable lazily; answering ``len()`` or ``ds[i]`` would
        force it to materialise the whole stream, so it raises
        :class:`TypeError` instead. Code that has to choose between batched
        evaluation (random access) and online consumption (iteration only)
        should check this property up front instead of catching
        ``TypeError`` afterwards.

        The base implementation returns ``True``; streaming adapters
        override it to return ``False``.
        """
        return True

    def __iter__(self) -> Iterator[LithoSample]:
        """Yield every sample in index order, from ``0`` to ``len(self) - 1``."""
        yield from (self[position] for position in range(len(self)))

    @abstractmethod
    def download(self, root: str) -> None:
        """Fetch the dataset into the directory ``root``."""

    # ---- ML metadata ----

    def croissant_name(self) -> str:
        """Return the Croissant dataset name; the class name by default."""
        adapter_cls = type(self)
        return adapter_cls.__name__

    def croissant_description(self) -> str:
        """Return the Croissant free-text description.

        The default is a generic sentence naming the adapter class;
        subclasses should override it with a one-paragraph summary.
        """
        kind = type(self).__name__
        return f"Lithography dataset of type {kind}."

    def croissant_license_url(self) -> str | None:
        """Return the upstream license URL, or ``None`` if there is none."""
        return None

    def croissant_citation(self) -> str | None:
        """Return a BibTeX or free-text citation, or ``None``."""
        return None

    def croissant_url(self) -> str | None:
        """Return the dataset's canonical landing-page URL, or ``None``."""
        return None

    def to_croissant(self) -> dict[str, Any]:
        """Build MLCommons Croissant 1.0 JSON-LD metadata for this dataset.

        Croissant is the de-facto ML dataset metadata schema (HuggingFace,
        Google, Kaggle, MLCommons; published 2024-12). Emitting it here
        lets downstream MLPerf-style benchmarks consume the datasets
        without bespoke adapters.

        Returns:
            A plain dict in the JSON-LD shape, ready for ``json.dumps`` or a
            Croissant validator. It holds the minimum compliant subset:
            ``@context``, ``@type``, ``name``, ``description``,
            ``conformsTo`` and one ``RecordSet`` describing the sample
            shape (design / mask / resist tensors). ``license``,
            ``citeAs`` and ``url`` are added, in that order, only when the
            matching ``croissant_*`` hook returns something other than
            ``None``.
        """
        tensor_fields = (
            ("design", "Target design tensor (binary mask of intended features)."),
            ("mask", "Optimised lithography mask tensor (post-OPC), if available."),
            ("resist", "Simulated/measured resist contour tensor, if available."),
        )
        record_fields = [
            {
                "@type": "cr:Field",
                "name": field_name,
                "description": field_description,
                "dataType": "cr:Tensor",
            }
            for field_name, field_description in tensor_fields
        ]
        context = {
            "@vocab": "https://schema.org/",
            "cr": "http://mlcommons.org/croissant/",
            "sc": "https://schema.org/",
            "data": {"@id": "cr:data", "@type": "@json"},
        }
        record_set = {
            "@type": "cr:RecordSet",
            "name": "samples",
            "description": "Per-sample lithography records.",
            "field": record_fields,
        }
        out: dict[str, Any] = {
            "@context": context,
            "@type": "sc:Dataset",
            "name": self.croissant_name(),
            "description": self.croissant_description(),
            "conformsTo": "http://mlcommons.org/croissant/1.0",
            "recordSet": [record_set],
        }
        # Optional keys are appended in a fixed order so serialised output
        # stays stable; a hook returning None leaves its key out entirely.
        optional_entries = (
            ("license", self.croissant_license_url),
            ("citeAs", self.croissant_citation),
            ("url", self.croissant_url),
        )
        for key, hook in optional_entries:
            value = hook()
            if value is not None:
                out[key] = value
        return out

supports_random_access property

Whether len() and integer indexing are well-defined.

Streaming adapters (e.g. `LithoSimDataset(streaming=True)`) lazily consume an iterable and cannot answer `len()` or `ds[i]` without materialising the whole stream — they raise `TypeError` instead. Callers that branch between batched evaluation (random access) and online consumption (iteration only) should query this property rather than catching `TypeError` after the fact.

Defaults to True; streaming adapters override to False.

download(root) abstractmethod

Download dataset to the specified root directory.

Source code in src/openlithohub/data/base.py
@abstractmethod
def download(self, root: str) -> None:
    """Fetch the dataset into the directory ``root``.

    Declared abstract: this base declaration performs no work itself.
    """

croissant_name()

Human-readable name for Croissant metadata. Defaults to class name.

Source code in src/openlithohub/data/base.py
def croissant_name(self) -> str:
    """Return the dataset name used in Croissant metadata.

    The default is the name of the instance's class.
    """
    adapter_cls = type(self)
    return adapter_cls.__name__

croissant_description()

Free-text description for Croissant metadata.

Subclasses should override with a one-paragraph dataset summary.

Source code in src/openlithohub/data/base.py
def croissant_description(self) -> str:
    """Return the free-text description used in Croissant metadata.

    The default is a generic sentence naming the instance's class;
    subclasses should override it with a one-paragraph dataset summary.
    """
    kind = type(self).__name__
    return f"Lithography dataset of type {kind}."

croissant_license_url()

Upstream license URL, or None when not applicable.

Source code in src/openlithohub/data/base.py
def croissant_license_url(self) -> str | None:
    """Upstream license URL, or ``None`` when not applicable."""
    return None

croissant_citation()

BibTeX or free-text citation, or None.

Source code in src/openlithohub/data/base.py
def croissant_citation(self) -> str | None:
    """BibTeX or free-text citation, or ``None``."""
    return None

croissant_url()

Canonical landing-page URL for the dataset, or None.

Source code in src/openlithohub/data/base.py
def croissant_url(self) -> str | None:
    """Canonical landing-page URL for the dataset, or ``None``."""
    return None

to_croissant()

Emit MLCommons Croissant 1.0 JSON-LD metadata.

Croissant is the de-facto ML dataset metadata schema (HuggingFace, Google, Kaggle, MLCommons; published 2024-12). Producing it from DatasetAdapter lets downstream MLPerf-style benchmarks consume our datasets without bespoke adapters.

The output is a Python dict matching the JSON-LD shape — caller serialises it with json.dumps (default), or feeds it to a Croissant validator. We emit the minimum compliant subset: @context, @type, name, description, license, url, citeAs, plus a single RecordSet describing the sample shape (design / mask / resist tensors). Subclasses can override hook methods (croissant_name / ..._description / ...) to enrich the output.

Source code in src/openlithohub/data/base.py
def to_croissant(self) -> dict[str, Any]:
    """Build MLCommons Croissant 1.0 JSON-LD metadata for this dataset.

    Croissant is the de-facto ML dataset metadata schema (HuggingFace,
    Google, Kaggle, MLCommons; published 2024-12). Emitting it here lets
    downstream MLPerf-style benchmarks consume the datasets without
    bespoke adapters.

    Returns:
        A plain dict in the JSON-LD shape, ready for ``json.dumps`` or a
        Croissant validator. It holds the minimum compliant subset:
        ``@context``, ``@type``, ``name``, ``description``, ``conformsTo``
        and one ``RecordSet`` describing the sample shape (design / mask /
        resist tensors). ``license``, ``citeAs`` and ``url`` are added, in
        that order, only when the matching ``croissant_*`` hook returns
        something other than ``None``.
    """
    tensor_fields = (
        ("design", "Target design tensor (binary mask of intended features)."),
        ("mask", "Optimised lithography mask tensor (post-OPC), if available."),
        ("resist", "Simulated/measured resist contour tensor, if available."),
    )
    record_fields = [
        {
            "@type": "cr:Field",
            "name": field_name,
            "description": field_description,
            "dataType": "cr:Tensor",
        }
        for field_name, field_description in tensor_fields
    ]
    context = {
        "@vocab": "https://schema.org/",
        "cr": "http://mlcommons.org/croissant/",
        "sc": "https://schema.org/",
        "data": {"@id": "cr:data", "@type": "@json"},
    }
    record_set = {
        "@type": "cr:RecordSet",
        "name": "samples",
        "description": "Per-sample lithography records.",
        "field": record_fields,
    }
    out: dict[str, Any] = {
        "@context": context,
        "@type": "sc:Dataset",
        "name": self.croissant_name(),
        "description": self.croissant_description(),
        "conformsTo": "http://mlcommons.org/croissant/1.0",
        "recordSet": [record_set],
    }
    # Optional keys are appended in a fixed order so serialised output stays
    # stable; a hook returning None leaves its key out entirely.
    optional_entries = (
        ("license", self.croissant_license_url),
        ("citeAs", self.croissant_citation),
        ("url", self.croissant_url),
    )
    for key, hook in optional_entries:
        value = hook()
        if value is not None:
            out[key] = value
    return out

natural_sort_key(s)

Sort key that orders strings with embedded numbers numerically.

sample_2 < sample_10 < sample_100, instead of the lexical sample_10 < sample_2.

Source code in src/openlithohub/data/base.py
def natural_sort_key(s: str) -> tuple[Any, ...]:
    """Sort key that orders strings with embedded numbers numerically.

    `sample_2` < `sample_10` < `sample_100`, instead of the lexical
    `sample_10` < `sample_2`.

    Args:
        s: The string to build a key for.

    Returns:
        A tuple of the pieces produced by ``_NAT_SPLIT_RE.split(s)``, where
        every piece made up only of decimal digits is converted to ``int``
        and every other piece is kept as ``str``.
    """
    # NOTE(review): ``_NAT_SPLIT_RE`` is defined elsewhere in the module;
    # presumably it splits on (and captures) runs of digits — confirm.
    pieces = _NAT_SPLIT_RE.split(s)
    # ``str.isdecimal`` rather than ``str.isdigit``: ``isdigit`` is also
    # true for characters such as "²" that ``int()`` rejects, so a piece
    # consisting only of such characters would raise ``ValueError``.
    return tuple(int(piece) if piece.isdecimal() else piece for piece in pieces)

openlithohub.data.lithobench

LithoBench dataset adapter (.npy format).

LithoBench (NeurIPS'23) organizes data as paired .npy arrays per sample:

    root/
      design/
        sample_0000.npy   # binary design layout (H, W)
        sample_0001.npy
        ...
      mask/
        sample_0000.npy   # optimized mask (H, W), may not exist for all samples
        ...
      resist/
        sample_0000.npy   # simulated resist contour (H, W), optional
        ...
      metadata.json       # optional: per-sample process parameters

Alternatively, a flat layout is supported:

    root/
      sample_0000_design.npy
      sample_0000_mask.npy
      sample_0000_resist.npy
      ...

LithoBenchDataset

Bases: DatasetAdapter

Adapter for the LithoBench dataset (NeurIPS'23, 45nm baseline).

Supports two directory layouts:

1. Subdirectory layout: `root/{design,mask,resist}/sample_XXXX.npy`
2. Flat layout: `root/sample_XXXX_{design,mask,resist}.npy`

Parameters:

Name Type Description Default
root str | Path

Path to the dataset directory.

required
split str | None

Optional split name (e.g. 'train', 'test'). If set, looks for root/split/.

None
pixel_nm float

Pixel resolution in nanometers (default 1.0 for LithoBench 45nm node).

1.0
Source code in src/openlithohub/data/lithobench.py
class LithoBenchDataset(DatasetAdapter):
    """Adapter for the LithoBench dataset (NeurIPS'23, 45nm baseline).

    Supports two directory layouts:
    1. Subdirectory layout: root/{design,mask,resist}/sample_XXXX.npy
    2. Flat layout: root/sample_XXXX_{design,mask,resist}.npy

    Args:
        root: Path to the dataset directory.
        split: Optional split name (e.g. 'train', 'test'). If set, looks for root/split/.
        pixel_nm: Pixel resolution in nanometers (default 1.0 for LithoBench 45nm node).

    Raises:
        FileNotFoundError: The resolved dataset root does not exist.
        RuntimeError: ``metadata.json`` exists but is not valid JSON.
    """

    def __init__(
        self,
        root: str | Path,
        split: str | None = None,
        pixel_nm: float = 1.0,
    ) -> None:
        self.root = Path(root)
        if split:
            self.root = self.root / split
        self.pixel_nm = pixel_nm
        self._index: list[str] = []
        self._layout: str = "unknown"
        self._metadata: dict[str, Any] = {}
        # Surface a clear warning when --data-root points at unverified
        # bytes. download() verifies the upstream tarball and is the
        # supported integrity path; bypassing it via a hand-extracted
        # data-root used to silently load whatever was on disk.
        warn_unverified_data_root(self.root, "lithobench")
        self._build_index()

    def _build_index(self) -> None:
        """Detect the on-disk layout, collect sample IDs and load metadata.

        A ``design`` subdirectory selects the subdirectory layout; otherwise
        the flat layout is assumed and only files that ``_FILENAME_RE``
        classifies as ``design`` contribute a sample ID. IDs are stored in
        natural sort order. ``metadata.json`` is loaded when present.
        """
        if not self.root.exists():
            raise FileNotFoundError(f"Dataset root not found: {self.root}")

        design_dir = self.root / "design"
        if design_dir.is_dir():
            self._layout = "subdirectory"
            self._index = sorted((p.stem for p in design_dir.glob("*.npy")), key=natural_sort_key)
        else:
            self._layout = "flat"
            matches = (_FILENAME_RE.match(p.name) for p in self.root.glob("*.npy"))
            # A set collapses any duplicate IDs before sorting.
            design_ids = {
                m.group("sample_id") for m in matches if m and m.group("kind") == "design"
            }
            self._index = sorted(design_ids, key=natural_sort_key)

        meta_path = self.root / "metadata.json"
        if meta_path.exists():
            try:
                with open(meta_path, encoding="utf-8") as f:
                    self._metadata = json.load(f)
            except json.JSONDecodeError as exc:
                raise RuntimeError(f"Corrupt LithoBench metadata at {meta_path}: {exc}") from exc

    def __len__(self) -> int:
        """Return the number of indexed samples."""
        return len(self._index)

    def __getitem__(self, index: int) -> LithoSample:
        """Load the sample at ``index`` as float tensors.

        ``design`` must exist on disk; ``mask`` and ``resist`` are ``None``
        when their files are missing. Negative indices are rejected.

        Raises:
            IndexError: ``index`` is outside ``[0, len(self))``.
            FileNotFoundError: The design file for the sample is missing.
        """
        if index < 0 or index >= len(self._index):
            raise IndexError(f"Index {index} out of range [0, {len(self._index)})")

        sample_id = self._index[index]
        design = self._load_array(sample_id, "design")
        mask = self._try_load_array(sample_id, "mask")
        resist = self._try_load_array(sample_id, "resist")

        metadata: dict[str, Any] = {
            "dataset": "lithobench",
            "sample_id": sample_id,
            "pixel_nm": self.pixel_nm,
        }
        # Per-sample entries from metadata.json override the defaults above.
        if sample_id in self._metadata:
            metadata.update(self._metadata[sample_id])

        return LithoSample(
            design=torch.from_numpy(design).float(),
            mask=torch.from_numpy(mask).float() if mask is not None else None,
            resist=torch.from_numpy(resist).float() if resist is not None else None,
            metadata=metadata,
        )

    def _resolve_path(self, sample_id: str, kind: str) -> Path:
        """Map ``(sample_id, kind)`` to its ``.npy`` path for the active layout.

        Raises:
            ValueError: ``sample_id`` is empty, contains a path separator or
                is ``.``/``..``, or ``kind`` is not in ``_VALID_KINDS``.
        """
        # Sample IDs feed directly into a filesystem path; refuse anything that
        # could escape ``self.root`` via traversal. The legitimate index
        # (populated from sorted file globs) only ever contains plain names,
        # so this is a guard against caller-supplied IDs (``has_kind``).
        if not sample_id or "/" in sample_id or "\\" in sample_id or sample_id in (".", ".."):
            raise ValueError(f"Invalid sample_id: {sample_id!r}")
        if kind not in _VALID_KINDS:
            raise ValueError(f"Invalid kind: {kind!r}. Expected one of {sorted(_VALID_KINDS)}.")
        if self._layout == "subdirectory":
            return self.root / kind / f"{sample_id}.npy"
        return self.root / f"{sample_id}_{kind}.npy"

    def _load_array(self, sample_id: str, kind: Kind) -> np.ndarray[Any, Any]:
        """Load a required array; raise ``FileNotFoundError`` if it is missing."""
        path = self._resolve_path(sample_id, kind)
        if not path.exists():
            raise FileNotFoundError(f"Required file not found: {path}")
        # allow_pickle=False: never unpickle objects from dataset files.
        return np.load(path, allow_pickle=False)  # type: ignore[no-any-return]

    def _try_load_array(self, sample_id: str, kind: Kind) -> np.ndarray[Any, Any] | None:
        """Load an optional array; return ``None`` if its file is missing."""
        path = self._resolve_path(sample_id, kind)
        if path.exists():
            return np.load(path, allow_pickle=False)  # type: ignore[no-any-return]
        return None

    def has_kind(self, sample_id: str, kind: str) -> bool:
        """Return True if the file for (sample_id, kind) exists on disk.

        Raises:
            ValueError: ``sample_id`` or ``kind`` is rejected by
                ``_resolve_path``.
        """
        return self._resolve_path(sample_id, kind).exists()

    def download(self, root: str, artifact: str = "lithomodels.tar.gz") -> None:
        """Download a pinned LithoBench artifact via gdown and verify its SHA-256.

        Args:
            root: Destination directory. Created if missing. The tar is
                streamed to ``<root>/<artifact>`` and extracted into
                ``<root>``; if the file already exists *and* matches the
                pinned hash, the download is skipped (idempotent resume).
            artifact: Canonical filename of the artifact to fetch. Must be
                a key in :data:`KNOWN_GOOD_SHA256` and :data:`_GDRIVE_FILE_IDS`.

        Raises:
            ValueError: ``artifact`` is unknown.
            NotImplementedError: ``artifact`` has a pinned hash but no
                registered Google Drive ID.
            ImportError: ``gdown`` is not installed (``pip install gdown``).
            IntegrityError: bytes-on-disk don't match the pinned SHA-256.
            RuntimeError: the download error message looks like a Google
                Drive quota / rate limit, or the tarball contains a
                path-traversal member.
        """
        if artifact not in KNOWN_GOOD_SHA256:
            raise ValueError(
                f"Unknown LithoBench artifact: {artifact!r}. "
                f"Known artifacts: {sorted(KNOWN_GOOD_SHA256)}"
            )
        if artifact not in _GDRIVE_FILE_IDS:
            raise NotImplementedError(
                f"No Google Drive ID registered for {artifact!r}. Open an issue or PR to add one."
            )

        try:
            import gdown
        except ImportError as e:
            raise ImportError(
                "Auto-fetching LithoBench requires gdown. Install it with: pip install gdown"
            ) from e

        dest_root = Path(root)
        dest_root.mkdir(parents=True, exist_ok=True)
        target = dest_root / artifact
        pin = KNOWN_GOOD_SHA256[artifact]

        if target.exists():
            # Only verification is guarded. An extraction failure on a
            # hash-verified tarball (disk full, permissions, traversal guard)
            # would recur after a re-download of the same bytes, so it must
            # propagate rather than delete a good file.
            try:
                verify_sha256(target, pin)
            except (RuntimeError, OSError):
                # IntegrityError (subclass of RuntimeError) or I/O failure; re-download below.
                target.unlink(missing_ok=True)
            else:
                self._extract_tarball(target, dest_root)
                return

        url = f"https://drive.google.com/uc?id={_GDRIVE_FILE_IDS[artifact]}"
        # ``resume=True`` is the gdown analogue of ``--continue``: a partial
        # ``<target>`` from a prior aborted run is appended to instead of
        # restarted from byte 0 — important for the ~14.7 GB ``lithodata``
        # tar where Google Drive can drop the connection mid-stream.
        # Proxy passthrough flows through ``HTTPS_PROXY`` / ``HTTP_PROXY``
        # env vars (see docs/developer-guide/network.md); we deliberately
        # do NOT name internal hosts in code per ``feedback_proxy_usage.md``.
        try:
            gdown.download(url, str(target), quiet=False, resume=True)
        except Exception as exc:  # noqa: BLE001 — re-raised below
            # NOTE(review): plain substring matching; "rate" also matches
            # unrelated words such as "generate" — confirm this is intended.
            msg = str(exc).lower()
            if "quota" in msg or "rate" in msg or "too many requests" in msg:
                raise RuntimeError(
                    f"Google Drive rate-limited the LithoBench artifact "
                    f"({artifact!r}). Wait 24h and retry, or fetch from a "
                    f"different network. Original error: {exc}"
                ) from exc
            raise

        verify_sha256(target, pin)
        self._extract_tarball(target, dest_root)

    @staticmethod
    def _extract_tarball(tar_path: Path, dest: Path) -> None:
        """Extract ``tar_path`` into ``dest``, refusing path-traversal members.

        ``tarfile.extractall`` historically followed ``../`` and absolute
        member names — we add an explicit guard so a tampered upload (the
        SHA-256 is verified before this is called, but in case of future
        re-pinning) cannot escape ``dest``.

        Uses ``Path.is_relative_to`` rather than string-prefix matching:
        ``str(...).startswith(str(dest))`` is vulnerable to prefix
        confusion (a member resolving to ``/tmp/foobar`` passes a
        ``startswith("/tmp/foo")`` check).

        Raises:
            RuntimeError: A member would resolve outside ``dest``.
        """
        dest_resolved = dest.resolve()
        with tarfile.open(tar_path, "r:*") as tar:
            # Validate every member before anything is written to disk.
            for member in tar.getmembers():
                member_path = (dest / member.name).resolve()
                if not member_path.is_relative_to(dest_resolved):
                    raise RuntimeError(
                        f"Refusing to extract path-traversal member: {member.name!r}"
                    )
            # B202: members were validated above; safe to extract.
            # ``filter="data"`` activates Python 3.12+ tar filter that
            # additionally blocks symlinks/hardlinks pointing outside the
            # destination — defence in depth on top of the manual guard.
            tar.extractall(dest, filter="data")  # nosec B202

    @property
    def sample_ids(self) -> list[str]:
        """Return a copy of the indexed sample IDs, in index order."""
        return list(self._index)

    # ---- Croissant metadata ----

    def croissant_name(self) -> str:
        """Return the fixed Croissant dataset name."""
        return "LithoBench"

    def croissant_description(self) -> str:
        """Return the one-paragraph Croissant description of LithoBench."""
        return (
            "LithoBench (NeurIPS'23) is a public benchmark for AI computational "
            "lithography spanning multiple design topologies and metrics. This "
            "adapter ingests the .npy distribution as (design, mask, resist) triples."
        )

    def croissant_url(self) -> str | None:
        """Return the landing-page URL reported in Croissant metadata."""
        return "https://github.com/shelljane/lithobench"

    def croissant_citation(self) -> str | None:
        """Return the free-text citation reported in Croissant metadata."""
        return (
            "Zheng, S., Yang, H., Yu, B. et al. LithoBench: Benchmarking AI "
            "Computational Lithography for Semiconductor Manufacturing. NeurIPS 2023."
        )

has_kind(sample_id, kind)

Return True if the file for (sample_id, kind) exists on disk.

Source code in src/openlithohub/data/lithobench.py
def has_kind(self, sample_id: str, kind: str) -> bool:
    """Report whether the file for ``(sample_id, kind)`` is present on disk.

    The path comes from ``self._resolve_path``; any exception it raises
    propagates to the caller.
    """
    candidate = self._resolve_path(sample_id, kind)
    return candidate.exists()

download(root, artifact='lithomodels.tar.gz')

Download a pinned LithoBench artifact via gdown and verify its SHA-256.

Parameters:

Name Type Description Default
root str

Destination directory. Created if missing. The tar is streamed to <root>/<artifact> and extracted into <root>; if the file already exists and matches the pinned hash, the download is skipped (idempotent resume).

required
artifact str

Canonical filename of the artifact to fetch. Must be a key in both `KNOWN_GOOD_SHA256` and `_GDRIVE_FILE_IDS`.

'lithomodels.tar.gz'

Raises:

Type Description
ValueError

artifact is unknown.

ImportError

gdown is not installed (pip install gdown).

IntegrityError

bytes-on-disk don't match the pinned SHA-256.

Source code in src/openlithohub/data/lithobench.py
def download(self, root: str, artifact: str = "lithomodels.tar.gz") -> None:
    """Download a pinned LithoBench artifact via gdown and verify its SHA-256.

    Args:
        root: Destination directory. Created if missing. The tar is
            streamed to ``<root>/<artifact>`` and extracted into
            ``<root>``; if the file already exists *and* matches the
            pinned hash, the download is skipped (idempotent resume).
        artifact: Canonical filename of the artifact to fetch. Must be
            a key in :data:`KNOWN_GOOD_SHA256` and :data:`_GDRIVE_FILE_IDS`.

    Raises:
        ValueError: ``artifact`` is unknown.
        NotImplementedError: ``artifact`` has a pinned hash but no
            registered Google Drive ID.
        ImportError: ``gdown`` is not installed (``pip install gdown``).
        IntegrityError: bytes-on-disk don't match the pinned SHA-256.
        RuntimeError: the download error message looks like a Google Drive
            quota / rate limit.
    """
    if artifact not in KNOWN_GOOD_SHA256:
        raise ValueError(
            f"Unknown LithoBench artifact: {artifact!r}. "
            f"Known artifacts: {sorted(KNOWN_GOOD_SHA256)}"
        )
    if artifact not in _GDRIVE_FILE_IDS:
        raise NotImplementedError(
            f"No Google Drive ID registered for {artifact!r}. Open an issue or PR to add one."
        )

    try:
        import gdown
    except ImportError as e:
        raise ImportError(
            "Auto-fetching LithoBench requires gdown. Install it with: pip install gdown"
        ) from e

    dest_root = Path(root)
    dest_root.mkdir(parents=True, exist_ok=True)
    target = dest_root / artifact
    pin = KNOWN_GOOD_SHA256[artifact]

    if target.exists():
        # Only verification is guarded. An extraction failure on a
        # hash-verified tarball (disk full, permissions, a rejected member)
        # would recur after a re-download of the same bytes, so it must
        # propagate rather than delete a good file.
        try:
            verify_sha256(target, pin)
        except (RuntimeError, OSError):
            # IntegrityError (subclass of RuntimeError) or I/O failure; re-download below.
            target.unlink(missing_ok=True)
        else:
            self._extract_tarball(target, dest_root)
            return

    url = f"https://drive.google.com/uc?id={_GDRIVE_FILE_IDS[artifact]}"
    # ``resume=True`` is the gdown analogue of ``--continue``: a partial
    # ``<target>`` from a prior aborted run is appended to instead of
    # restarted from byte 0 — important for the ~14.7 GB ``lithodata``
    # tar where Google Drive can drop the connection mid-stream.
    # Proxy passthrough flows through ``HTTPS_PROXY`` / ``HTTP_PROXY``
    # env vars (see docs/developer-guide/network.md); we deliberately
    # do NOT name internal hosts in code per ``feedback_proxy_usage.md``.
    try:
        gdown.download(url, str(target), quiet=False, resume=True)
    except Exception as exc:  # noqa: BLE001 — re-raised below
        # NOTE(review): plain substring matching; "rate" also matches
        # unrelated words such as "generate" — confirm this is intended.
        msg = str(exc).lower()
        if "quota" in msg or "rate" in msg or "too many requests" in msg:
            raise RuntimeError(
                f"Google Drive rate-limited the LithoBench artifact "
                f"({artifact!r}). Wait 24h and retry, or fetch from a "
                f"different network. Original error: {exc}"
            ) from exc
        raise

    verify_sha256(target, pin)
    self._extract_tarball(target, dest_root)

openlithohub.data.lithosim

LithoSim dataset adapter (HuggingFace Parquet format).

LithoSim is a sub-28nm industrial lithography simulation dataset hosted on HuggingFace Hub. It stores design/mask/resist image pairs as Parquet rows with image columns and process metadata.

The upstream dataset (OpenLithoHub/LithoSim) is currently gated: new users must request access on the Hub and authenticate with `huggingface-cli login` before this adapter can fetch data. Calls without auth fail with HTTP 401; the adapter detects that and raises `RuntimeError` with the remediation steps.

Requires: pip install openlithohub[data] (adds datasets and pyarrow)

LithoSimDataset

Bases: DatasetAdapter

Adapter for the LithoSim dataset (sub-28nm industrial benchmark).

Loads data from HuggingFace Hub using the datasets library. Images are stored as columns in Parquet format and decoded to tensors on access.

Parameters:

Name Type Description Default
split str

Dataset split ('train', 'test', or 'all').

'test'
dataset_name str

HuggingFace dataset identifier. Override for custom forks.

_HF_DATASET_NAME
cache_dir str | None

Local cache directory for downloaded data.

None
pixel_nm float

Pixel resolution in nanometers.

0.5
streaming bool

If True, use streaming mode (no full download).

False
revision str | None

Optional Git revision (commit SHA, tag, or branch) to pin for reproducible downloads. Defaults to _DEFAULT_REVISION; pass None explicitly to opt out and resolve the dataset's default branch (irreproducible).

_DEFAULT_REVISION
Source code in src/openlithohub/data/lithosim.py
class LithoSimDataset(DatasetAdapter):
    """Adapter for the LithoSim dataset (sub-28nm industrial benchmark).

    Rows are fetched from HuggingFace Hub through the `datasets` library.
    Image columns stored in the Parquet files are decoded into tensors
    lazily, when a row is accessed.

    Args:
        split: Dataset split ('train', 'test', or 'all').
        dataset_name: HuggingFace dataset identifier. Override for custom forks.
        cache_dir: Local cache directory for downloaded data.
        pixel_nm: Pixel resolution in nanometers.
        streaming: If True, use streaming mode (no full download).
        revision: Optional Git revision (commit SHA, tag, or branch) to pin
            for reproducible downloads. Defaults to ``_DEFAULT_REVISION``;
            pass ``None`` explicitly to opt out and resolve the dataset's
            default branch (irreproducible).
    """

    def __init__(
        self,
        split: str = "test",
        dataset_name: str = _HF_DATASET_NAME,
        cache_dir: str | None = None,
        pixel_nm: float = 0.5,
        streaming: bool = False,
        revision: str | None = _DEFAULT_REVISION,
    ) -> None:
        _ensure_datasets_available()

        # Plain configuration; nothing is fetched until first access.
        self.split = split
        self.dataset_name = dataset_name
        self.cache_dir = cache_dir
        self.pixel_nm = pixel_nm
        self.streaming = streaming
        self.revision = revision

        # Lazily populated handles: the loaded dataset and its cached length.
        self._ds: Any = None
        self._len: int | None = None

        # An unpinned revision (``None`` or the "main" branch) can move
        # under the caller at any time, so previously computed scores would
        # silently rebase. Emit a loud warning in that case; commit hashes
        # and tags are considered pinned and stay silent.
        unpinned = revision is None or revision == "main"
        if unpinned:
            import warnings

            message = (
                f"LithoSimDataset is loading {dataset_name!r} at revision="
                f"{revision!r} — this is mutable. For reproducible scoring "
                "pass a commit hash or tag (e.g. revision='abc1234'). "
                "Loaded bytes are not integrity-pinned."
            )
            warnings.warn(message, UserWarning, stacklevel=2)

    @property
    def supports_random_access(self) -> bool:
        """True unless the adapter was created with ``streaming=True``."""
        # In streaming mode this adapter refuses ``len()`` and integer
        # indexing (both raise TypeError below); the materialised
        # (non-streaming) load supports both.
        return not self.streaming

    def _load_dataset(self) -> Any:
        """Return the underlying HF dataset, loading it on first use.

        Raises:
            RuntimeError: if the load fails with what ``_is_auth_error``
                recognises as an authentication failure; the message carries
                the gated-dataset remediation steps.
        """
        if self._ds is not None:
            return self._ds

        from datasets import load_dataset

        try:
            # B615: revision is exposed as a constructor argument so callers
            # can pin a specific commit/tag for reproducible downloads.
            self._ds = load_dataset(  # nosec B615
                self.dataset_name,
                split=self.split,
                cache_dir=self.cache_dir,
                streaming=self.streaming,
                revision=self.revision,
            )
        except Exception as exc:  # noqa: BLE001 — re-raised below
            if not _is_auth_error(exc):
                raise
            raise RuntimeError(
                _GATED_REMEDIATION.format(name=self.dataset_name, err=exc)
            ) from exc
        return self._ds

    def __len__(self) -> int:
        """Number of rows; cached after the first call. Not available when streaming."""
        if self.streaming:
            raise TypeError(
                "LithoSimDataset does not support len() in streaming mode. "
                "Use iteration instead: `for sample in dataset: ...`"
            )
        if self._len is None:
            self._len = len(self._load_dataset())
        return self._len

    def __getitem__(self, index: int) -> LithoSample:
        """Return the sample at a non-negative ``index`` (non-streaming only)."""
        if self.streaming:
            raise TypeError(
                "LithoSimDataset does not support indexing in streaming mode. "
                "Use iteration instead: `for sample in dataset: ...`"
            )
        dataset = self._load_dataset()

        # Negative indices are rejected rather than wrapped.
        if not 0 <= index < len(self):
            raise IndexError(f"Index {index} out of range [0, {len(self)})")

        return self._row_to_sample(dataset[index])

    def __iter__(self) -> Iterator[LithoSample]:
        """Yield every row as a ``LithoSample`` (works in both modes)."""
        yield from (self._row_to_sample(row) for row in self._load_dataset())

    def _row_to_sample(self, row: dict[str, Any]) -> LithoSample:
        """Convert one dataset row into a ``LithoSample``.

        ``design`` is mandatory; ``mask`` and ``resist`` are optional and
        become ``None`` when absent or null. Recognised process columns are
        copied into the metadata dict when present.
        """
        design = self._decode_image(row, "design")
        mask = self._try_decode_image(row, "mask")
        resist = self._try_decode_image(row, "resist")

        optional_keys = (
            "process_node",
            "pitch_nm",
            "dose",
            "focus",
            "sample_id",
            "feature_type",
        )
        metadata: dict[str, Any] = {
            "dataset": "lithosim",
            "pixel_nm": self.pixel_nm,
            "split": self.split,
        }
        metadata.update({key: row[key] for key in optional_keys if key in row})

        return LithoSample(design=design, mask=mask, resist=resist, metadata=metadata)

    def _decode_image(self, row: dict[str, Any], column: str) -> torch.Tensor:
        """Decode a required image column; raise ``KeyError`` if it is missing."""
        if column in row:
            return self._to_tensor(row[column])
        raise KeyError(f"Required column '{column}' not found in dataset row")

    def _try_decode_image(self, row: dict[str, Any], column: str) -> torch.Tensor | None:
        """Decode an optional image column; ``None`` if missing or null."""
        if column in row:
            raw = row[column]
            if raw is not None:
                return self._to_tensor(raw)
        return None

    @staticmethod
    def _array_to_tensor(arr: np.ndarray[Any, Any]) -> torch.Tensor:
        """Turn a numpy image array into a float32 tensor.

        ``uint8`` and ``uint16`` inputs are scaled by their full range so the
        result lies in [0, 1]. Industrial litho rows (SEM / aerial images)
        are commonly uint16, and a bare ``astype(float32)`` would leave
        them in [0, 65535], silently breaking code that assumes a [0, 1]
        resist threshold or EPE input range. Any other integer dtype is
        rejected; non-integer dtypes are cast to float32 without rescaling.
        """
        dtype = arr.dtype
        if dtype == np.uint8:
            full_scale = 255.0
        elif dtype == np.uint16:
            full_scale = 65535.0
        elif np.issubdtype(dtype, np.integer):
            raise TypeError(
                f"Unsupported integer dtype {arr.dtype}; "
                "expected uint8 or uint16 SEM/aerial images."
            )
        else:
            return torch.from_numpy(arr.astype(np.float32))
        return torch.from_numpy(arr.astype(np.float32) / full_scale)

    @staticmethod
    def _to_tensor(value: Any) -> torch.Tensor:
        """Convert a raw column value to a tensor.

        Accepted forms, checked in this order: numpy array, list/tuple
        (converted as-is to float32), PIL image, and a dict carrying encoded
        image data under ``"bytes"``. Anything else raises ``TypeError``.
        """
        if isinstance(value, np.ndarray):
            return LithoSimDataset._array_to_tensor(value)

        if isinstance(value, (list, tuple)):
            return torch.tensor(value, dtype=torch.float32)

        # Pillow is only needed past this point, so import it late.
        try:
            from PIL import Image
        except ImportError as e:
            raise ImportError(
                "Pillow is required for image decoding. Install with: pip install Pillow"
            ) from e

        if isinstance(value, Image.Image):
            pixels = np.array(value)
            return LithoSimDataset._array_to_tensor(pixels)

        if isinstance(value, dict) and "bytes" in value:
            import io

            buffer = io.BytesIO(value["bytes"])
            with Image.open(buffer) as img:
                pixels = np.array(img)
                return LithoSimDataset._array_to_tensor(pixels)

        raise TypeError(f"Cannot convert {type(value)} to tensor")

    def download(self, root: str) -> None:
        """Fetch the configured split into ``root`` (used as the HF cache dir).

        Raises:
            RuntimeError: on an authentication failure, with remediation steps.
        """
        from datasets import load_dataset

        try:
            # B615: revision is pinnable via the constructor argument.
            load_dataset(  # nosec B615
                self.dataset_name,
                split=self.split,
                cache_dir=root,
                revision=self.revision,
            )
        except Exception as exc:  # noqa: BLE001 — re-raised below
            if not _is_auth_error(exc):
                raise
            raise RuntimeError(
                _GATED_REMEDIATION.format(name=self.dataset_name, err=exc)
            ) from exc

    @property
    def columns(self) -> list[str]:
        """Column names reported by the underlying dataset (triggers a load)."""
        return self._load_dataset().column_names  # type: ignore[no-any-return]

openlithohub.data.iccad16

ICCAD 2016 Problem C — EUV hotspot detection benchmark adapter.

The benchmark is from the ICCAD 2016 CAD Contest (Problem C, EUV Simulation). The publicly mirrored copy lives at https://github.com/phdyang007/ICCAD16-N7M2EUV — four EUV designs at N7 / 16 nm CD plus simulated hotspot locations recorded under a process-window sweep.

The dataset is a hotspot detection benchmark, not a mask optimization benchmark — there is no OPC reference mask to compare against. Two pieces of evidence:

  1. The repo's references are both hotspot-detection papers (Chen et al., DAC'19; Yang et al., TCAD'20 — "Bridging the Gap Between Layout Pattern Sampling and Hotspot Detection via Batch Active Learning").
  2. The auxiliary layer (10000, 0) ships 120 small 16×16 nm boxes distributed on a regular grid, covering only ~1% of design pixels and located 70+ nm away from any CSV hotspot — consistent with detection clip / inspection-grid sites, not with an OPC mask.

Files per test case:

  • testcaseN.oas — OASIS layout. The N7M2EUV stack is documented in [Yang2020_BatchAL, §III-A, p.4]; the per-layer mapping below applies to every test case in this distribution:
GDS layer Datatype Meaning
1000 0 Design polygons (drawn metal-2 features at N7, 16 nm CD).
10000 0 Auxiliary clip-site grid (16×16 nm boxes, hotspot inspection sites).

(layer=1000, datatype=0) is exposed as the loaded design tensor. (layer=10000, datatype=0) is exposed under metadata['clip_sites'].

  • testN.csv — hotspot annotations with columns def, id, category, x, y. Coordinates are in OASIS database units (1 dbu = 1 nm for these files); category is the contest's defect type code (raw integers, per-testcase). The README promises three semantic kinds (EPE / Bridging / Necking) but does not publish the integer mapping, so the code preserves the raw id. The same physical site can appear multiple times under different dose/focus conditions.

The adapter returns LithoSample(design, mask=None, resist=None, metadata). LithoSample.mask is intentionally left None — this dataset does not provide a reference mask. Hotspot annotations and clip-site centers live in metadata.

HotspotAnnotation dataclass

One row from the testN.csv hotspot table.

x_nm / y_nm are the contest dbu coordinates converted to nm using the OASIS layout's dbu (1 dbu = 1 nm for the published files, but the conversion still goes through layout.dbu * 1000). category_id preserves the raw contest code; the README only promises three semantic kinds (EPE / Bridging / Necking) but does not publish the integer mapping, so callers should treat the id as an opaque label until they have the contest's category dictionary.

Source code in src/openlithohub/data/iccad16.py
@dataclass(frozen=True)
class HotspotAnnotation:
    """One row from the testN.csv hotspot table.

    Immutable record (``frozen=True``) describing a single annotated
    hotspot site.

    ``x_nm`` / ``y_nm`` are the hotspot coordinates, intended to be in nm.
    The CSV stores them in OASIS database units; for the published files
    1 dbu = 1 nm, so the raw CSV values and nm coincide.
    NOTE(review): for layouts with a different dbu, confirm that the
    loader scales the CSV values by ``layout.dbu * 1000`` before
    constructing this record.

    ``category_id`` preserves the raw contest code; the README only
    promises three semantic kinds (EPE / Bridging / Necking) but does
    not publish the integer mapping, so callers should treat the id as
    an opaque label until they have the contest's category dictionary.
    """

    # Value of the CSV ``id`` column.
    hotspot_id: int
    # Raw value of the CSV ``category`` column (opaque defect-type code).
    category_id: int
    # Hotspot x coordinate (from the CSV ``x`` column), in nm.
    x_nm: float
    # Hotspot y coordinate (from the CSV ``y`` column), in nm.
    y_nm: float

Iccad16Dataset

Bases: DatasetAdapter

Adapter for the ICCAD 2016 Problem C — EUV hotspot benchmark.

Parameters:

Name Type Description Default
root str | Path

Directory containing testcase{N}.oas and test{N}.csv files. The four published cases use N in 1..4.

required
cases list[int] | None

Optional explicit list of case indices to expose, e.g. [1, 3]. Defaults to whichever cases are present on disk.

None
design_layer tuple[int, int]

(layer, datatype) tuple selecting the design polygons. Defaults to (1000, 0) per the published files.

(1000, 0)
clip_layer tuple[int, int]

(layer, datatype) tuple selecting the auxiliary clip-site layer. Defaults to (10000, 0). Exposed via metadata['clip_sites']; the layer is empirically used for hotspot-detection clip locations, not as an OPC mask.

(10000, 0)
pixel_nm float

Raster pixel size in nm. The published layouts are ~1.9 µm × 1.5 µm so 1 nm/px stays well under 2k×2k.

1.0

The adapter reads each OASIS file lazily on first access and caches the rasterized design tensor in memory. klayout is required and is already pinned in pyproject.toml.

Source code in src/openlithohub/data/iccad16.py
class Iccad16Dataset(DatasetAdapter):
    """Adapter for the ICCAD 2016 Problem C — EUV hotspot benchmark.

    Args:
        root: Directory containing ``testcase{N}.oas`` and
            ``test{N}.csv`` files. The four published cases use ``N``
            in 1..4.
        cases: Optional explicit list of case indices to expose, e.g.
            ``[1, 3]``. Defaults to whichever cases are present on disk.
        design_layer: ``(layer, datatype)`` tuple selecting the design
            polygons. Defaults to ``(1000, 0)`` per the published files.
        clip_layer: ``(layer, datatype)`` tuple selecting the auxiliary
            clip-site layer. Defaults to ``(10000, 0)``. Exposed via
            ``metadata['clip_sites']``; the layer is empirically used
            for hotspot-detection clip locations, not as an OPC mask.
        pixel_nm: Raster pixel size in nm. The published layouts are
            ~1.9 µm × 1.5 µm so 1 nm/px stays well under 2k×2k.

    Raises:
        FileNotFoundError: if ``root`` does not exist, or no cases are
            given / discovered.

    The adapter reads each OASIS file lazily on first access and caches
    the rasterized design tensor in memory. ``klayout`` is required and
    is already pinned in ``pyproject.toml``.
    """

    def __init__(
        self,
        root: str | Path,
        cases: list[int] | None = None,
        design_layer: tuple[int, int] = (1000, 0),
        clip_layer: tuple[int, int] = (10000, 0),
        pixel_nm: float = 1.0,
    ) -> None:
        self.root = Path(root)
        if not self.root.exists():
            raise FileNotFoundError(f"ICCAD16 root not found: {self.root}")
        from openlithohub._utils.integrity import warn_unverified_data_root

        warn_unverified_data_root(self.root, "iccad16")
        self.design_layer = design_layer
        self.clip_layer = clip_layer
        self.pixel_nm = float(pixel_nm)

        if cases is None:
            # Discover ``testcase<N>.oas`` files; anything whose suffix is
            # not purely numeric is ignored.
            cases = sorted(
                int(p.stem.removeprefix("testcase"))
                for p in self.root.glob("testcase*.oas")
                if p.stem.removeprefix("testcase").isdigit()
            )
        if not cases:
            raise FileNotFoundError(f"No testcase*.oas files under {self.root}")
        # Copy so later mutation of the caller's list cannot change the
        # index -> case mapping of this dataset.
        self._cases = list(cases)
        # case_id -> fully loaded sample (rasterization is done once per case).
        self._cache: dict[int, LithoSample] = {}

    def __len__(self) -> int:
        """Number of exposed test cases."""
        return len(self._cases)

    def __getitem__(self, index: int) -> LithoSample:
        """Return the sample for the ``index``-th case, loading it on first use.

        Raises:
            IndexError: if ``index`` is negative or past the last case.
        """
        if index < 0 or index >= len(self._cases):
            raise IndexError(f"Index {index} out of range [0, {len(self._cases)})")
        case_id = self._cases[index]
        if case_id in self._cache:
            return self._cache[case_id]
        sample = self._load_case(case_id)
        self._cache[case_id] = sample
        return sample

    def _load_case(self, case_id: int) -> LithoSample:
        """Read one test case from disk and build its ``LithoSample``.

        Rasterizes the design layer, collects clip-site boxes, and, when the
        companion CSV exists, loads the hotspot annotations into metadata.

        Raises:
            FileNotFoundError: if the OASIS file is missing.
            RuntimeError: if the layout has no top cell.
        """
        oas_path = self.root / f"testcase{case_id}.oas"
        csv_path = self.root / f"test{case_id}.csv"
        if not oas_path.exists():
            raise FileNotFoundError(f"Missing OASIS file: {oas_path}")

        # OASIS rasterization via klayout. Imported lazily so that
        # importing the package does not require klayout for users who
        # only touch other datasets.
        import klayout.db as kdb

        layout = kdb.Layout()
        layout.read(str(oas_path))
        dbu_nm = layout.dbu * 1000.0  # klayout dbu is in µm

        top = layout.top_cell()
        if top is None:
            raise RuntimeError(f"OASIS file has no top cell: {oas_path}")

        design_arr, origin = self._rasterize_layer(layout, top, self.design_layer)
        clip_sites = self._collect_clip_sites(layout, top, self.clip_layer)

        has_csv = csv_path.exists()
        # CSV coordinates are in database units; hand over the layout's
        # dbu so they are stored in nm like every other coordinate here.
        hotspots = self._load_hotspots(csv_path, dbu_nm) if has_csv else []

        metadata: dict[str, Any] = {
            "dataset": "iccad16",
            "case_id": case_id,
            "source_oas": str(oas_path),
            "source_csv": str(csv_path) if has_csv else None,
            "dbu_nm": dbu_nm,
            "pixel_nm": self.pixel_nm,
            "design_layer": list(self.design_layer),
            "clip_layer": list(self.clip_layer),
            "origin_nm": [origin[0], origin[1]],  # bbox lower-left in nm
            # Copy each record's dict: handing out the live ``__dict__`` of
            # a frozen dataclass would let metadata edits mutate it.
            "hotspots": [dict(h.__dict__) for h in hotspots],
            "num_hotspots": len(hotspots),
            "clip_sites": clip_sites,
            "num_clip_sites": len(clip_sites),
        }

        return LithoSample(
            design=torch.from_numpy(design_arr).float(),
            mask=None,
            resist=None,
            metadata=metadata,
        )

    def _rasterize_layer(
        self,
        layout: Any,
        top: Any,
        layer_spec: tuple[int, int],
    ) -> tuple[np.ndarray[Any, Any], tuple[float, float]]:
        """Rasterize a single OASIS layer into a {0,1} numpy array.

        Decomposes each polygon into trapezoids via klayout's
        ``Polygon.decompose_trapezoids`` and fills each trapezoid's pixel
        footprint. For Manhattan polygons every trapezoid is an
        axis-aligned rectangle, so the fill is exact even for L-shapes
        and other concave Manhattan geometry — a plain bbox fill would
        over-fill the concave corner.

        Returns:
            ``(arr, origin)`` where ``arr`` has shape ``(h, w)`` sized from
            the top cell's bounding box (row index follows y, column index
            follows x, both measured from the bbox lower-left corner) and
            ``origin`` is that corner in nm. If the layer does not exist
            the array is all zeros.
        """
        import klayout.db as kdb

        layer_index = layout.find_layer(*layer_spec)
        # Scale with ONE precomputed factor. ``coord * dbu * 1000.0``
        # rounds twice and can land a hair above/below an integer (e.g.
        # 1001 * 0.001 * 1000.0 != 1001.0), which floor/ceil below would
        # turn into an off-by-one pixel. This also matches
        # ``_collect_clip_sites``.
        dbu_nm = layout.dbu * 1000.0
        bbox = top.bbox()
        origin = (bbox.left * dbu_nm, bbox.bottom * dbu_nm)
        w = max(1, int(np.ceil(bbox.width() * dbu_nm / self.pixel_nm)))
        h = max(1, int(np.ceil(bbox.height() * dbu_nm / self.pixel_nm)))
        arr = np.zeros((h, w), dtype=np.float32)
        if layer_index is None:
            return arr, origin

        ox_nm, oy_nm = origin

        def _fill_box(b: Any) -> None:
            # Box corners relative to the raster origin, in nm.
            x0_nm = b.left * dbu_nm - ox_nm
            y0_nm = b.bottom * dbu_nm - oy_nm
            x1_nm = b.right * dbu_nm - ox_nm
            y1_nm = b.top * dbu_nm - oy_nm
            # Cover every pixel the box touches, clamped to the raster.
            i0 = max(0, int(np.floor(x0_nm / self.pixel_nm)))
            j0 = max(0, int(np.floor(y0_nm / self.pixel_nm)))
            i1 = min(w, int(np.ceil(x1_nm / self.pixel_nm)))
            j1 = min(h, int(np.ceil(y1_nm / self.pixel_nm)))
            if i1 > i0 and j1 > j0:
                arr[j0:j1, i0:i1] = 1.0

        # Recursive iteration so hierarchical layouts (geometry referenced
        # through cell instances) are not silently dropped. Today's published
        # ICCAD16 files are flat — single TOPCELL — but matching the canonical
        # pattern from data/io.py:128 keeps this robust against future
        # contributions or upstream regenerations that introduce hierarchy.
        shapes_iter = top.begin_shapes_rec(layer_index)
        while not shapes_iter.at_end():
            shape_obj = shapes_iter.shape()
            trans = shapes_iter.trans()
            if shape_obj.is_box():
                _fill_box(shape_obj.box.transformed(trans))
            elif shape_obj.is_path() or shape_obj.is_polygon():
                if shape_obj.is_path():
                    poly = shape_obj.path.polygon()
                else:
                    poly = shape_obj.polygon
                poly = poly.transformed(trans)
                try:
                    trapezoids = list(poly.decompose_trapezoids(kdb.Polygon.TD_simple))
                except AttributeError:
                    # Older klayout: fall back to whole-polygon bbox (still
                    # over-fills concavities, but no worse than the
                    # historical behavior).
                    _fill_box(poly.bbox())
                else:
                    for tz in trapezoids:
                        _fill_box(tz.bbox())
            # Other shape kinds (e.g. texts) are skipped.
            shapes_iter.next()

        return arr, origin

    def _collect_clip_sites(
        self,
        layout: Any,
        top: Any,
        layer_spec: tuple[int, int],
    ) -> list[dict[str, float]]:
        """Return clip-site bboxes in nm (no rasterization).

        Each entry has keys ``x0_nm``, ``y0_nm``, ``x1_nm``, ``y1_nm``.
        Only box and polygon shapes are considered (polygons contribute
        their bounding box). Returns an empty list if the layer is absent.
        """
        layer_index = layout.find_layer(*layer_spec)
        if layer_index is None:
            return []
        dbu_nm = layout.dbu * 1000.0
        out: list[dict[str, float]] = []
        # Recursive iteration, same traversal as in _rasterize_layer.
        shapes_iter = top.begin_shapes_rec(layer_index)
        while not shapes_iter.at_end():
            s = shapes_iter.shape()
            trans = shapes_iter.trans()
            if s.is_box():
                b = s.box.transformed(trans)
            elif s.is_polygon():
                b = s.polygon.transformed(trans).bbox()
            else:
                shapes_iter.next()
                continue
            out.append(
                {
                    "x0_nm": b.left * dbu_nm,
                    "y0_nm": b.bottom * dbu_nm,
                    "x1_nm": b.right * dbu_nm,
                    "y1_nm": b.top * dbu_nm,
                }
            )
            shapes_iter.next()
        return out

    def _load_hotspots(self, csv_path: Path, dbu_nm: float = 1.0) -> list[HotspotAnnotation]:
        """Parse the hotspot CSV into ``HotspotAnnotation`` records.

        Args:
            csv_path: CSV file with at least the columns ``id``,
                ``category``, ``x`` and ``y``.
            dbu_nm: Size of one database unit in nm. The CSV ``x`` / ``y``
                values are multiplied by this factor. The default of 1.0
                leaves them unchanged, which is correct for files whose
                dbu is 1 nm.

        Returns:
            One record per well-formed row, in file order.

        Raises:
            ValueError: if the file has rows but none could be parsed.

        Malformed rows are skipped with a warning each, plus one summary
        warning at the end.
        """
        out: list[HotspotAnnotation] = []
        n_rows = 0
        n_skipped = 0
        with open(csv_path, encoding="utf-8", newline="") as f:
            reader = csv.DictReader(f)
            for row in reader:
                n_rows += 1
                try:
                    out.append(
                        HotspotAnnotation(
                            hotspot_id=int(row["id"]),
                            category_id=int(row["category"]),
                            x_nm=float(row["x"]) * dbu_nm,
                            y_nm=float(row["y"]) * dbu_nm,
                        )
                    )
                # TypeError: DictReader pads short rows with None, and
                # int(None) / float(None) raise TypeError, not ValueError.
                except (KeyError, TypeError, ValueError) as exc:
                    n_skipped += 1
                    warnings.warn(
                        f"Skipped malformed row in {csv_path.name}: {exc!r}",
                        stacklevel=2,
                    )
        if n_rows > 0 and not out:
            raise ValueError(
                f"All {n_rows} rows in {csv_path} were malformed — the CSV "
                f"header may have changed (expected columns: id, category, x, y)."
            )
        if n_skipped:
            warnings.warn(
                f"{n_skipped}/{n_rows} rows in {csv_path.name} were skipped as malformed.",
                stacklevel=2,
            )
        return out

    def download(self, root: str) -> None:
        """Not supported: always raises with manual-download instructions.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError(
            "ICCAD16 auto-download is not implemented. Clone manually from "
            "https://github.com/phdyang007/ICCAD16-N7M2EUV and place the "
            "testcase*.oas + test*.csv files under the dataset root."
        )

    @property
    def case_ids(self) -> list[int]:
        """Copy of the exposed case indices, in dataset order."""
        return list(self._cases)

openlithohub.data.ganopc

GAN-OPC training-data adapter.

GAN-OPC ships its training set as ~4875 paired binary PNGs at 2048×2048 resolution. The public mirror is https://github.com/phdyang007/GAN-OPC, distributed as a 30-volume 7z archive (ganopc-data.7z.001 … ganopc-data.7z.030). The download_ganopc helper auto-fetches and unpacks it on first use; until then the repo carries no upstream bytes (per DATA-LICENSES.md — redistribution is not granted).

Reference: Yang et al., GAN-OPC: Mask Optimization with Lithography-guided Generative Adversarial Nets, DAC 2018 (doi:10.1145/3195970.3196056). A paywalled TCAD 2020 extension exists; the open DAC paper is the canonical citation for this adapter.

Once unpacked, the directory layout is::

ganopc-data/
  artitgt/
    1.glp.png         # target design layout (binary)
    2.glp.png
    ...
    map.txt           # filename index (ignored by this loader)
  artimsk/
    1.glpOPC.png      # OPC-output mask paired with the target
    2.glpOPC.png
    ...

The two trees share sample IDs verbatim (N.glp.png ↔ N.glpOPC.png). Pixel pitch is not stored alongside the data; the loader defaults to 1.0 nm/px (configurable). Both PNGs are 8-bit grayscale with strictly {0, 255} content, which the loader thresholds into a {0., 1.} float32 tensor.

GanOpcDataset

Bases: DatasetAdapter

Adapter for the GAN-OPC paired-PNG training set.

Parameters:

Name Type Description Default
root str | Path

Either the directory containing artitgt/ and artimsk/ (typically ganopc-data/), or the parent directory holding ganopc-data/ — both are accepted.

required
sample_ids list[str] | None

Optional explicit list of sample IDs to expose (e.g. ["1", "2", "100"]). Defaults to every ID present in both artitgt/ and artimsk/, sorted numerically when possible.

None
pixel_nm float

Raster pixel size in nm. Defaults to 1.0; this is the convention OpenLithoHub uses elsewhere and matches the ~2 µm patch sizes typical of GAN-OPC layouts. Override via constructor if your downstream pipeline assumes a different scale.

1.0
threshold int

Grayscale cutoff (0–255) above which a pixel is considered "on". Defaults to 127. The published PNGs are already strict binary, so the threshold only matters if a user supplies non-canonical files.

127
Source code in src/openlithohub/data/ganopc.py
class GanOpcDataset(DatasetAdapter):
    """Adapter for the GAN-OPC paired-PNG training set.

    Args:
        root: Either the directory containing ``artitgt/`` and
            ``artimsk/`` (typically ``ganopc-data/``), or the parent
            directory holding ``ganopc-data/`` — both are accepted.
        sample_ids: Optional explicit list of sample IDs to expose
            (e.g. ``["1", "2", "100"]``). Defaults to every ID present
            in both ``artitgt/`` and ``artimsk/``, sorted numerically
            when possible.
        pixel_nm: Raster pixel size in nm. Defaults to 1.0; this is the
            convention OpenLithoHub uses elsewhere and matches the
            ~2 µm patch sizes typical of GAN-OPC layouts. Override via
            constructor if your downstream pipeline assumes a different
            scale.
        threshold: Grayscale cutoff (0–255) above which a pixel is
            considered "on". Defaults to 127. The published PNGs are
            already strict binary, so the threshold only matters if a
            user supplies non-canonical files.

    Raises:
        FileNotFoundError: if neither ``root`` nor ``root/ganopc-data``
            contains both ``artitgt/`` and ``artimsk/``, or if no sample
            IDs are given / discovered.
    """

    def __init__(
        self,
        root: str | Path,
        sample_ids: list[str] | None = None,
        pixel_nm: float = 1.0,
        threshold: int = 127,
    ) -> None:
        root = Path(root)
        nested = root / "ganopc-data"
        # Both layouts must provide BOTH trees. Accepting a nested
        # directory that only has artitgt/ (e.g. a half-extracted archive)
        # would defer the failure to a later, less helpful error.
        if self._is_data_root(root):
            data_root = root
        elif self._is_data_root(nested):
            data_root = nested
        else:
            raise FileNotFoundError(
                f"Could not find artitgt/ + artimsk/ under {root}. Pass "
                "either the ganopc-data directory itself or its parent."
            )
        self.root = data_root
        self.pixel_nm = float(pixel_nm)
        self.threshold = int(threshold)
        self._tgt_dir = data_root / "artitgt"
        self._msk_dir = data_root / "artimsk"

        if sample_ids is None:
            # ``Path.stem`` drops only the final ``.png``; strip the
            # remaining ``.glp`` / ``.glpOPC`` to obtain the bare sample ID.
            tgt_ids = {p.stem.removesuffix(".glp") for p in self._tgt_dir.glob("*.glp.png")}
            msk_ids = {p.stem.removesuffix(".glpOPC") for p in self._msk_dir.glob("*.glpOPC.png")}
            # Only IDs that have both a target and a mask are exposed.
            sample_ids = sorted(tgt_ids & msk_ids, key=natural_sort_key)
        if not sample_ids:
            raise FileNotFoundError(
                f"No paired samples found under {data_root}/{{artitgt,artimsk}}/"
            )
        self._ids = list(sample_ids)

    @staticmethod
    def _is_data_root(candidate: Path) -> bool:
        """Return True if ``candidate`` holds both ``artitgt/`` and ``artimsk/``."""
        return (candidate / "artitgt").is_dir() and (candidate / "artimsk").is_dir()

    def __len__(self) -> int:
        """Number of exposed target/mask pairs."""
        return len(self._ids)

    def __getitem__(self, index: int) -> LithoSample:
        """Load the ``index``-th target/mask pair from disk.

        Raises:
            IndexError: if ``index`` is negative or past the last sample.
            FileNotFoundError: if either PNG of the pair is missing.
        """
        if index < 0 or index >= len(self._ids):
            raise IndexError(f"Index {index} out of range [0, {len(self._ids)})")
        sample_id = self._ids[index]
        tgt_path = self._tgt_dir / f"{sample_id}.glp.png"
        msk_path = self._msk_dir / f"{sample_id}.glpOPC.png"
        # Explicitly supplied sample_ids are not validated in __init__, so
        # check for the files here.
        if not tgt_path.exists():
            raise FileNotFoundError(f"Missing target PNG: {tgt_path}")
        if not msk_path.exists():
            raise FileNotFoundError(f"Missing mask PNG: {msk_path}")

        design_arr = self._load_png(tgt_path)
        mask_arr = self._load_png(msk_path)

        metadata: dict[str, Any] = {
            "dataset": "ganopc",
            "sample_id": sample_id,
            "source_target_png": str(tgt_path),
            "source_mask_png": str(msk_path),
            "pixel_nm": self.pixel_nm,
        }

        return LithoSample(
            design=torch.from_numpy(design_arr).float(),
            mask=torch.from_numpy(mask_arr).float(),
            resist=None,
            metadata=metadata,
        )

    def _load_png(self, path: Path) -> np.ndarray[Any, Any]:
        """Read a PNG as 8-bit grayscale and binarize it to {0., 1.} float32.

        Pixels strictly greater than ``self.threshold`` become 1.0.
        """
        # Imported lazily so importing the data package does not require
        # Pillow for users who only touch other adapters.
        from PIL import Image

        with Image.open(path) as img:
            arr = np.asarray(img.convert("L"), dtype=np.uint8)
        return (arr > self.threshold).astype(np.float32)

    def download(self, root: str) -> None:
        """Fetch the GAN-OPC training set from upstream on first use.

        Mirrors the pattern in :class:`LithoBenchDataset.download`: clones
        the upstream repository, joins the 30-volume 7z archive, and
        extracts the resulting tree so that
        ``<root>/ganopc-data/{artitgt,artimsk}/`` is populated.

        Idempotent: if ``<root>/ganopc-data/artitgt`` already exists the
        call is a no-op. The intermediate clone and joined archive are
        kept on disk so a partial extraction can resume without re-cloning.

        Requires ``git`` on ``PATH`` and the ``py7zr`` +
        ``multivolumefile`` Python packages (declared optional under
        the ``data`` extras).

        Args:
            root: Destination directory, forwarded unchanged to
                :func:`download_ganopc` (its other options keep their
                defaults).
        """
        download_ganopc(root)

    @property
    def sample_ids(self) -> list[str]:
        """Copy of the exposed sample IDs, in dataset order."""
        return list(self._ids)

download(root)

Fetch the GAN-OPC training set from upstream on first use.

Mirrors the pattern in LithoBenchDataset.download: clones the upstream repository, joins the 30-volume 7z archive, and extracts the resulting tree so that <root>/ganopc-data/{artitgt,artimsk}/ is populated.

Idempotent: if <root>/ganopc-data/artitgt already exists the call is a no-op. The intermediate clone and joined archive are kept on disk so a partial extraction can resume without re-cloning.

Requires git on PATH and the py7zr + multivolumefile Python packages (declared optional under the data extras).

Source code in src/openlithohub/data/ganopc.py
def download(self, root: str) -> None:
    """Populate ``root`` with the GAN-OPC training set.

    Thin wrapper in the style of :class:`LithoBenchDataset.download`:
    it forwards ``root`` to :func:`download_ganopc` and discards the
    returned path. That helper clones the upstream repository into
    ``<root>/.ganopc-src`` (unless that directory already exists),
    checks every archive volume against its pinned SHA-256, and
    extracts the multi-volume 7z so that
    ``<root>/ganopc-data/{artitgt,artimsk}/`` is populated.

    Idempotent: when both ``<root>/ganopc-data/artitgt`` and
    ``<root>/ganopc-data/artimsk`` are already present nothing is
    fetched or extracted.

    Requires ``git`` on ``PATH`` and the ``py7zr`` +
    ``multivolumefile`` Python packages (declared optional under
    the ``data`` extras).

    Args:
        root: Directory that will contain ``ganopc-data``.
    """
    download_ganopc(root)

download_ganopc(root, *, revision=_DEFAULT_REVISION, repo_url=_UPSTREAM_REPO)

Clone GAN-OPC and extract the multi-volume 7z into <root>/ganopc-data.

Idempotent: if both <root>/ganopc-data/artitgt and <root>/ganopc-data/artimsk already exist the fetch is a no-op and the existing path is returned. Otherwise the upstream repo is shallow-cloned into <root>/.ganopc-src, the 30 archive volumes (ganopc-data.7z.001 … ganopc-data.7z.030) are joined via the multivolumefile package, and the resulting tree is extracted in place via the py7zr package.

Returns the path to the extracted ganopc-data directory.

Raises:

Type Description
ImportError

py7zr or multivolumefile is not installed.

FileNotFoundError

git is not on PATH.

RuntimeError

the upstream layout no longer matches the expected ganopc-data.7z.NNN shape — usually means the repo moved.

Source code in src/openlithohub/data/ganopc.py
def download_ganopc(
    root: str | Path,
    *,
    revision: str = _DEFAULT_REVISION,
    repo_url: str = _UPSTREAM_REPO,
) -> Path:
    """Clone GAN-OPC and extract the multi-volume 7z into ``<root>/ganopc-data``.

    Idempotent: if both ``<root>/ganopc-data/artitgt`` and
    ``<root>/ganopc-data/artimsk`` already exist the fetch is a no-op
    and the existing path is returned. Otherwise the upstream repo is
    shallow-cloned into ``<root>/.ganopc-src`` (skipped when that
    directory already exists), every ``ganopc-data.7z.*`` volume is
    checked against its pin in ``KNOWN_GOOD_SHA256``, the volumes are
    joined via :mod:`multivolumefile`, and the resulting tree is
    extracted into ``<root>`` via :mod:`py7zr`.

    Args:
        root: Destination directory; created (with parents) if missing.
        revision: Value passed to ``git clone --branch``.
        repo_url: Repository URL passed to ``git clone``.

    Returns:
        The path to the extracted ``ganopc-data`` directory.

    Raises:
        ImportError: ``py7zr`` or ``multivolumefile`` is not installed.
        FileNotFoundError: ``git`` is not on ``PATH``.
        subprocess.CalledProcessError: ``git clone`` exited non-zero
            (``check=True``).
        RuntimeError: no ``ganopc-data.7z.*`` volume was found in the
            clone, a volume has no entry in ``KNOWN_GOOD_SHA256``, an
            archive member would resolve outside ``root``, or the
            expected ``artitgt`` / ``artimsk`` directories are missing
            after extraction.

    NOTE(review): ``verify_sha256`` is defined elsewhere; presumably it
    raises on a digest mismatch — confirm the exception type there.
    """
    dest_root = Path(root)
    dest_root.mkdir(parents=True, exist_ok=True)
    extracted = dest_root / "ganopc-data"
    # Fast path: both expected sub-directories are present, nothing to do.
    if (extracted / "artitgt").is_dir() and (extracted / "artimsk").is_dir():
        return extracted

    # Check prerequisites up front so the caller gets an actionable message
    # before any network or disk work starts.
    if shutil.which("git") is None:
        raise FileNotFoundError(
            "git is required to fetch GAN-OPC but was not found on PATH. "
            "Install git or pre-populate <root>/ganopc-data/{artitgt,artimsk}/ "
            "manually."
        )
    try:
        import multivolumefile
        import py7zr
    except ImportError as e:
        raise ImportError(
            "GAN-OPC auto-fetch requires py7zr and multivolumefile. Install "
            "them with: pip install py7zr multivolumefile"
        ) from e

    src = dest_root / ".ganopc-src"
    # An existing clone directory is reused as-is; it is not re-validated here.
    if not src.exists():
        # ``git`` is resolved via PATH (``shutil.which`` checked above) and
        # every argument is a literal or validated string — no shell
        # interpolation. S603/S607 ignored at file level via pyproject.toml.
        subprocess.run(
            [
                "git",
                "clone",
                "--depth",
                "1",
                "--branch",
                revision,
                repo_url,
                str(src),
            ],
            check=True,
        )

    # Sorted so that ``volumes[0]`` is the lexicographically first volume.
    volumes = sorted(src.glob("ganopc-data.7z.*"))
    if not volumes:
        raise RuntimeError(
            f"Upstream layout has changed: no ganopc-data.7z.NNN volumes under {src}. "
            "Open an issue at https://github.com/OpenLithoHub/OpenLithoHub/issues."
        )

    # Integrity gate: every volume found must have a pinned digest and is
    # verified before any archive bytes are parsed.
    for volume in volumes:
        pin = KNOWN_GOOD_SHA256.get(volume.name)
        if pin is None:
            raise RuntimeError(
                f"Unexpected upstream volume {volume.name}: no SHA-256 pin in "
                "KNOWN_GOOD_SHA256. The upstream repo may have re-cut the archive."
            )
        verify_sha256(volume, pin)

    archive_prefix = volumes[0].with_suffix("")  # strip ``.001`` etc.
    # Resolved once so the per-member containment check compares like with like.
    dest_resolved = dest_root.resolve()
    with (
        multivolumefile.open(str(archive_prefix), mode="rb") as joined,
        py7zr.SevenZipFile(joined, mode="r") as archive,
    ):
        # Path-traversal guard: refuse archive members that resolve outside
        # ``dest_root``. SHA-256 pins above mitigate tampering for the known
        # upstream artifact, but the guard is defence-in-depth so a future
        # re-pin of a malicious archive cannot escape the destination.
        # ``Path.is_relative_to`` avoids string-prefix confusion (a member
        # resolving to ``/tmp/foobar`` would pass a startswith("/tmp/foo")
        # check).
        for name in archive.getnames():
            member_path = (dest_root / name).resolve()
            if not member_path.is_relative_to(dest_resolved):
                raise RuntimeError(f"Refusing to extract path-traversal member: {name!r}")
        # Only reached when every member name passed the guard above.
        archive.extractall(path=dest_root)

    # Post-condition: the same two directories the fast path looks for.
    if not (extracted / "artitgt").is_dir() or not (extracted / "artimsk").is_dir():
        raise RuntimeError(
            f"Extraction completed but {extracted}/{{artitgt,artimsk}}/ are missing. "
            "Inspect the archive contents and report at "
            "https://github.com/OpenLithoHub/OpenLithoHub/issues."
        )
    return extracted

openlithohub.data.asap7

ASAP7 predictive PDK adapter (standard cells from a single GDS).

ASAP7 (Clark et al., ASU) ships its standard-cell library as a single GDSII file containing every cell as a top-level cell in the layout — there is no per-cell file to read. The canonical release lives at https://github.com/The-OpenROAD-Project/asap7 under BSD-3-Clause; the 7.5-track regular-Vt cells are in submodule asap7sc7p5t_27 at GDS/asap7sc7p5t_27_R_*.gds.

This adapter loads a small canonical list of cells (INVx1, NAND2x1, NOR2x1, DFFHQNx1) by name and rasterizes one design layer per cell into a LithoSample.design tensor. The layer choice is configurable; the default is M1 (10/0), which is the densest mask layer foundry reviewers ask about first. The cell selection is intentionally narrow — Phase 1 of issue #4 is a smoke-test, not a full library benchmark.

Per DATA-LICENSES.md, this adapter does not redistribute any PDK bytes. Users must clone the upstream repository themselves and pass the local path. fetch() is a guarded helper that git clones the upstream repo only after the caller passes accept_license=True, acknowledging the BSD-3-Clause attribution requirement; download() always raises and points the caller at fetch().

Asap7Dataset

Bases: DatasetAdapter

Adapter for the ASAP7 predictive PDK standard cells.

Parameters:

Name Type Description Default
root str | Path

Path to a local clone of The-OpenROAD-Project/asap7 with the asap7sc7p5t_27 submodule initialised. Use Asap7Dataset.fetch(root, accept_license=True) to create one.

required
cells tuple[str, ...] | list[str] | None

Cell names to expose, in order. Defaults to CANONICAL_CELLS. Function-name shorthand ("INV", "NAND2", "DFFHQN") is accepted alongside canonical ASAP7 strings ("INVx1_ASAP7_75t_R") — see resolve_shorthand and resolve_cell_name.

None
design_layer tuple[int, int]

(layer, datatype) to rasterize as the design tensor. Defaults to M1 (10, 0).

DEFAULT_DESIGN_LAYER
pixel_nm float

Raster pixel size in nm. Defaults to 1.0 to match the existing OpenLithoHub grid; ASAP7's manufacturing dbu is 0.25 nm so this is a 4× downsample.

1.0
gds_path str | Path | None

Optional explicit override for the GDS file path. If unset, the adapter globs asap7sc7p5t_27/GDS/... under root and picks the lexicographically last match.

None
resolve_shorthand bool

When True (default), attempt to expand function-name shorthand into the canonical ASAP7 cell-name (drive=x1, flavor=R, track=75) before raising KeyError. LithoSample.metadata['cell_name'] reflects the resolved string; metadata['requested_cell_name'] records the original input. Set False to require exact-match names.

True

The adapter requires klayout (already pinned in pyproject.toml).

Source code in src/openlithohub/data/asap7.py
class Asap7Dataset(DatasetAdapter):
    """Adapter for the ASAP7 predictive PDK standard cells.

    Args:
        root: Path to a local clone of ``The-OpenROAD-Project/asap7``
            with the ``asap7sc7p5t_27`` submodule initialised. Use
            ``Asap7Dataset.fetch(root, accept_license=True)`` to
            create one (``download()`` always raises and points at
            ``fetch()``).
        cells: Cell names to expose, in order. Defaults to
            ``CANONICAL_CELLS``. Function-name shorthand (``"INV"``,
            ``"NAND2"``, ``"DFFHQN"``) is accepted alongside canonical
            ASAP7 strings (``"INVx1_ASAP7_75t_R"``) — see
            ``resolve_shorthand`` and ``resolve_cell_name``.
        design_layer: ``(layer, datatype)`` to rasterize as the design
            tensor. Defaults to M1 (10, 0).
        pixel_nm: Raster pixel size in nm. Defaults to 1.0 to match the
            existing OpenLithoHub grid; ASAP7's manufacturing dbu is
            0.25 nm so this is a 4× downsample.
        gds_path: Optional explicit override for the GDS file path. If
            unset, the adapter globs ``asap7sc7p5t_27/GDS/...`` under
            ``root`` and picks the lexicographically last match.
        resolve_shorthand: When True (default), attempt to expand
            function-name shorthand into the canonical ASAP7 cell-name
            (drive=x1, flavor=R, track=75) before raising KeyError.
            ``LithoSample.metadata['cell_name']`` reflects the resolved
            string; ``metadata['requested_cell_name']`` records the
            original input. Set False to require exact-match names.

    Raises:
        FileNotFoundError: ``root`` does not exist, no GDS matches the
            glob under ``root``, or the resolved/overridden GDS path
            does not exist.

    The adapter requires ``klayout`` (already pinned in pyproject.toml).
    """

    def __init__(
        self,
        root: str | Path,
        cells: tuple[str, ...] | list[str] | None = None,
        design_layer: tuple[int, int] = DEFAULT_DESIGN_LAYER,
        pixel_nm: float = 1.0,
        gds_path: str | Path | None = None,
        resolve_shorthand: bool = True,
    ) -> None:
        self.root = Path(root)
        if not self.root.exists():
            raise FileNotFoundError(f"ASAP7 root not found: {self.root}")
        # Surface a warning when no MANIFEST.SHA256 sits next to the data
        # — this adapter's `fetch()` helper clones the upstream repo
        # but does not pin per-file hashes; bytes loaded later are
        # trusted blindly without this signal.
        from openlithohub._utils.integrity import warn_unverified_data_root

        warn_unverified_data_root(self.root, "asap7")
        self.design_layer = design_layer
        self.pixel_nm = float(pixel_nm)
        self.cells: tuple[str, ...] = tuple(cells) if cells is not None else CANONICAL_CELLS
        self.resolve_shorthand = resolve_shorthand
        # An explicit ``gds_path`` bypasses the glob under ``root`` entirely.
        self._gds_path = Path(gds_path) if gds_path is not None else self._resolve_gds_path()
        if not self._gds_path.exists():
            raise FileNotFoundError(
                f"ASAP7 GDS not found at {self._gds_path}. "
                f"Did you run `git submodule update --init asap7sc7p5t_27` "
                f"under {self.root}?"
            )
        # Per-instance cache of loaded samples, keyed by the requested cell name.
        self._cache: dict[str, LithoSample] = {}

    def _resolve_gds_path(self) -> Path:
        """Return the lexicographically last GDS matching ``_GDS_RELATIVE_GLOB`` under ``root``.

        Raises:
            FileNotFoundError: Nothing under ``root`` matches the glob.
        """
        matches = sorted(self.root.glob(_GDS_RELATIVE_GLOB))
        if not matches:
            raise FileNotFoundError(
                f"No GDS matching {_GDS_RELATIVE_GLOB!r} under {self.root}. "
                f"Initialise the asap7sc7p5t_27 submodule."
            )
        return matches[-1]

    def __len__(self) -> int:
        """Return the number of configured cells."""
        return len(self.cells)

    def __getitem__(self, index: int) -> LithoSample:
        """Return the sample for ``self.cells[index]``, loading it on first access.

        Negative indices are rejected rather than wrapped. Loaded samples
        are cached by requested name, so repeated access returns the same
        ``LithoSample`` object.

        Raises:
            IndexError: ``index`` is negative or ``>= len(self)``.
            KeyError: The cell is not present in the GDS (see ``_load_cell``).
        """
        if index < 0 or index >= len(self.cells):
            raise IndexError(f"Index {index} out of range [0, {len(self.cells)})")
        name = self.cells[index]
        if name in self._cache:
            return self._cache[name]
        sample = self._load_cell(name)
        self._cache[name] = sample
        return sample

    def _load_cell(self, name: str) -> LithoSample:
        """Read the GDS, locate cell ``name`` and rasterize ``self.design_layer``.

        The GDS file is read afresh on every call; callers rely on the
        cache in ``__getitem__`` to avoid repeated loads of the same cell.

        Raises:
            KeyError: Neither ``name`` nor (when enabled) its shorthand
                expansion exists in the layout.
        """
        import klayout.db as kdb

        layout = kdb.Layout()
        layout.read(str(self._gds_path))
        cell = layout.cell(name)
        resolved_name = name
        if cell is None and self.resolve_shorthand and "_ASAP7_" not in name:
            # Caller passed a function-name shorthand ("INV", "NAND2"); try the
            # canonical default flavour/drive before giving up. Records the
            # resolved string in metadata so the caller can see what was picked.
            try:
                candidate = resolve_cell_name(name)
            except ValueError:
                candidate = None
            if candidate is not None:
                cell = layout.cell(candidate)
                if cell is not None:
                    resolved_name = candidate
        if cell is None:
            # Only the first ten cell names are listed to keep the message short.
            available = sorted(c.name for c in layout.each_cell())[:10]
            raise KeyError(
                f"Cell {name!r} not found in {self._gds_path.name}. First 10 available: {available}"
            )

        design_arr, origin = rasterize_cell_layer(layout, cell, self.design_layer, self.pixel_nm)

        # ``cell_name`` is what was actually found in the layout;
        # ``requested_cell_name`` preserves the caller's original spelling.
        metadata: dict[str, Any] = {
            "dataset": "asap7",
            "pdk": "asap7",
            "pdk_variant": "asap7sc7p5t_27_R",
            "cell_name": resolved_name,
            "requested_cell_name": name,
            "source_gds": str(self._gds_path),
            "dbu_nm": layout.dbu * 1000.0,
            "pixel_nm": self.pixel_nm,
            "design_layer": list(self.design_layer),
            "origin_nm": [origin[0], origin[1]],
            "license": ASAP7_LICENSE,
            "license_url": ASAP7_LICENSE_URL,
        }

        return LithoSample(
            design=torch.from_numpy(design_arr).float(),
            mask=None,
            resist=None,
            metadata=metadata,
        )

    def download(self, root: str) -> None:
        """Clone ASAP7 to ``root``. Always rejected — use ``fetch()`` instead.

        The base ``DatasetAdapter.download`` signature has no place for the
        license-acknowledgement flag this PDK requires, so this method is a
        guard that points the caller at ``Asap7Dataset.fetch()``.

        Raises:
            RuntimeError: Always.
        """
        raise RuntimeError(
            "Asap7Dataset.download() is intentionally unimplemented because "
            "ASAP7 (BSD-3-Clause) requires explicit license acknowledgement. "
            "Use `Asap7Dataset.fetch(root, accept_license=True)` instead."
        )

    # ---- Croissant metadata ----

    def croissant_name(self) -> str:
        """Return the dataset name used in Croissant metadata."""
        return "ASAP7"

    def croissant_description(self) -> str:
        """Return the dataset description used in Croissant metadata."""
        return (
            "ASAP7 is a 7nm predictive academic PDK released by ASU + ARM (BSD-3-Clause). "
            "Cell layouts are rasterised on-the-fly into design-tensor samples for OPC research."
        )

    def croissant_license_url(self) -> str | None:
        """Return ``ASAP7_LICENSE_URL`` for Croissant metadata."""
        return ASAP7_LICENSE_URL

    def croissant_url(self) -> str | None:
        """Return the upstream repository URL for Croissant metadata."""
        return "https://github.com/The-OpenROAD-Project/asap7"

    def croissant_citation(self) -> str | None:
        """Return the citation string for Croissant metadata."""
        return (
            "Clark, L. T., et al. ASAP7: A 7nm finFET predictive process design kit. "
            "Microelectronics Journal 53 (2016): 105-115."
        )

    @classmethod
    def fetch(
        cls,
        root: str | Path,
        accept_license: bool = False,
    ) -> None:
        """Clone the ASAP7 repo with submodules to ``root``.

        ASAP7 ships under BSD-3-Clause. The license requires attribution
        in any redistribution; ``accept_license=True`` is the caller's
        explicit acknowledgement that they have read the license at
        ``ASAP7_LICENSE_URL`` and will comply with the attribution
        requirement when sharing derived layouts.

        Per ``DATA-LICENSES.md``, OpenLithoHub does not redistribute PDK
        bytes — this method only clones from the official upstream
        source on the user's own machine.

        Args:
            root: Clone destination; its parent directory is created if
                missing.
            accept_license: Must be True for the clone to proceed.

        Raises:
            RuntimeError: ``accept_license`` is falsy.
            subprocess.CalledProcessError: ``git clone`` exited non-zero
                (``check=True``).
        """
        if not accept_license:
            raise RuntimeError(
                f"ASAP7 is licensed under {ASAP7_LICENSE}. Read the terms at "
                f"{ASAP7_LICENSE_URL} and call fetch(..., accept_license=True) "
                f"to confirm you will comply with the attribution requirement."
            )
        target = Path(root)
        target.parent.mkdir(parents=True, exist_ok=True)
        # Progress goes to stderr so stdout stays clean for callers piping output.
        sys.stderr.write(
            f"Cloning ASAP7 ({ASAP7_LICENSE}) into {target} from {ASAP7_UPSTREAM_URL}\n"
        )
        subprocess.run(
            [
                "git",
                "clone",
                "--depth",
                "1",
                "--recurse-submodules",
                "--shallow-submodules",
                ASAP7_UPSTREAM_URL,
                str(target),
            ],
            check=True,
        )

download(root)

Clone ASAP7 to root. Always rejected — use fetch() instead.

The base DatasetAdapter.download signature has no place for the license-acknowledgement flag this PDK requires, so this method is a guard that points the caller at Asap7Dataset.fetch().

Source code in src/openlithohub/data/asap7.py
def download(self, root: str) -> None:
    """Refuse to download ASAP7; callers must go through ``fetch()``.

    ``DatasetAdapter.download`` takes only ``root``, which leaves no room
    for the license-acknowledgement flag this PDK needs. The method
    therefore never clones anything and exists purely to redirect the
    caller to ``Asap7Dataset.fetch()``.

    Raises:
        RuntimeError: Always.
    """
    refusal = (
        "Asap7Dataset.download() is intentionally unimplemented because "
        "ASAP7 (BSD-3-Clause) requires explicit license acknowledgement. "
        "Use `Asap7Dataset.fetch(root, accept_license=True)` instead."
    )
    raise RuntimeError(refusal)

fetch(root, accept_license=False) classmethod

Clone the ASAP7 repo with submodules to root.

ASAP7 ships under BSD-3-Clause. The license requires attribution in any redistribution; accept_license=True is the caller's explicit acknowledgement that they have read the license at ASAP7_LICENSE_URL and will comply with the attribution requirement when sharing derived layouts.

Per DATA-LICENSES.md, OpenLithoHub does not redistribute PDK bytes — this method only clones from the official upstream source on the user's own machine.

Source code in src/openlithohub/data/asap7.py
@classmethod
def fetch(
    cls,
    root: str | Path,
    accept_license: bool = False,
) -> None:
    """Shallow-clone the ASAP7 repository, submodules included, into ``root``.

    ASAP7 is distributed under BSD-3-Clause, which demands attribution
    on redistribution. Passing ``accept_license=True`` is the caller's
    explicit statement that they have read the terms at
    ``ASAP7_LICENSE_URL`` and will honour the attribution requirement
    when sharing derived layouts.

    In line with ``DATA-LICENSES.md``, OpenLithoHub never redistributes
    PDK bytes; this helper only runs ``git clone`` against the official
    upstream on the user's own machine.

    Args:
        root: Clone destination; its parent directory is created if
            missing.
        accept_license: Must be True for the clone to proceed.

    Raises:
        RuntimeError: ``accept_license`` is falsy.
        subprocess.CalledProcessError: ``git clone`` exited non-zero.
    """
    if not accept_license:
        refusal = (
            f"ASAP7 is licensed under {ASAP7_LICENSE}. Read the terms at "
            f"{ASAP7_LICENSE_URL} and call fetch(..., accept_license=True) "
            f"to confirm you will comply with the attribution requirement."
        )
        raise RuntimeError(refusal)

    clone_dir = Path(root)
    clone_dir.parent.mkdir(parents=True, exist_ok=True)

    # Announce the clone on stderr before git starts producing its own output.
    banner = f"Cloning ASAP7 ({ASAP7_LICENSE}) into {clone_dir} from {ASAP7_UPSTREAM_URL}\n"
    sys.stderr.write(banner)

    # Argument list (no shell): depth-1 clone with shallow submodules.
    clone_cmd = ["git", "clone", "--depth", "1"]
    clone_cmd += ["--recurse-submodules", "--shallow-submodules"]
    clone_cmd += [ASAP7_UPSTREAM_URL, str(clone_dir)]
    subprocess.run(clone_cmd, check=True)

resolve_cell_name(shorthand, *, drive='x1', flavor='R', track='75')

Expand a function-name shorthand into the ASAP7 canonical cell name.

The ASAP7 stdcell library names every cell as <FUNC><DRIVE>_ASAP7_<TRACK>t_<FLAVOR>. Issue spec language often uses the bare function name ("INV", "NAND2", "DFFHQN"); this helper composes the canonical string a downstream reader actually expects, with sensible defaults for drive / flavor / track.

Parameters:

Name Type Description Default
shorthand str

Function name, with or without trailing x... drive spec. "INV", "INVx1", and "INVx1_ASAP7_75t_R" all resolve identically. Already-canonical names pass through unchanged.

required
drive str

Drive-strength suffix ("x1", "x2", "xp33", "x1p5", ...). Used only when shorthand does not already include one. p is the decimal separator (e.g. "xp5" is ½×).

'x1'
flavor str

"R" (regular-Vt, default), "L" (low-Vt), "SL" (super-low-Vt), or "SRAM".

'R'
track str

"75" for the 7.5-track library (default) or "6" for the 6-track sibling library.

'75'

Returns:

Type Description
str

The canonical cell-name string, e.g. "INVx1_ASAP7_75t_R".

Raises:

Type Description
ValueError

For unknown flavor/track values.

Examples:

>>> resolve_cell_name("INV")
'INVx1_ASAP7_75t_R'
>>> resolve_cell_name("NAND2", drive="x2", flavor="L")
'NAND2x2_ASAP7_75t_L'
>>> resolve_cell_name("INVx1_ASAP7_75t_R")  # passthrough
'INVx1_ASAP7_75t_R'
Source code in src/openlithohub/data/asap7.py
def resolve_cell_name(
    shorthand: str,
    *,
    drive: str = "x1",
    flavor: str = "R",
    track: str = "75",
) -> str:
    """Build the canonical ASAP7 cell name for a function-name shorthand.

    Every cell in the ASAP7 stdcell library is named
    ``<FUNC><DRIVE>_ASAP7_<TRACK>t_<FLAVOR>``, whereas issue specs tend
    to mention only the function (``"INV"``, ``"NAND2"``, ``"DFFHQN"``).
    This helper fills in the missing parts from the keyword defaults.

    Args:
        shorthand: Function name, optionally carrying its own ``x...``
            drive spec. ``"INV"``, ``"INVx1"`` and
            ``"INVx1_ASAP7_75t_R"`` all yield the same result; any
            string already containing ``"_ASAP7_"`` is returned as-is.
        drive: Drive-strength suffix (``"x1"``, ``"x2"``, ``"xp33"``,
            ``"x1p5"``, ...), applied only when ``shorthand`` has none
            of its own. ``p`` acts as the decimal separator
            (``"xp5"`` is ½×).
        flavor: ``"R"`` (regular-Vt, default), ``"L"`` (low-Vt),
            ``"SL"`` (super-low-Vt), or ``"SRAM"``.
        track: ``"75"`` for the 7.5-track library (default) or ``"6"``
            for the 6-track sibling library.

    Returns:
        The canonical cell-name string, e.g. ``"INVx1_ASAP7_75t_R"``.

    Raises:
        ValueError: For unknown flavor/track values. Validation runs
            before the passthrough check, so it also applies to
            already-canonical names.

    Examples:
        >>> resolve_cell_name("INV")
        'INVx1_ASAP7_75t_R'
        >>> resolve_cell_name("NAND2", drive="x2", flavor="L")
        'NAND2x2_ASAP7_75t_L'
        >>> resolve_cell_name("INVx1_ASAP7_75t_R")  # passthrough
        'INVx1_ASAP7_75t_R'
    """
    if flavor not in _ASAP7_FLAVORS:
        raise ValueError(f"flavor must be one of {sorted(_ASAP7_FLAVORS)}, got {flavor!r}")
    if track not in _ASAP7_TRACKS:
        raise ValueError(f"track must be one of {sorted(_ASAP7_TRACKS)}, got {track!r}")

    # Canonical names are handed back untouched.
    if "_ASAP7_" in shorthand:
        return shorthand

    # Find the first lowercase "x" that is neither the first nor the last
    # character and is directly followed by a digit or "p": that is where
    # an embedded drive spec (e.g. "INVx1", "NAND2xp5") begins.
    split_at = next(
        (
            pos
            for pos in range(1, len(shorthand) - 1)
            if shorthand[pos] == "x"
            and (shorthand[pos + 1].isdigit() or shorthand[pos + 1] == "p")
        ),
        None,
    )
    if split_at is None:
        stem, strength = shorthand, drive
    else:
        stem, strength = shorthand[:split_at], shorthand[split_at:]
    return f"{stem}{strength}_ASAP7_{track}t_{flavor}"

rasterize_cell_layer(layout, cell, layer_spec, pixel_nm)

Rasterize one (layer, datatype) of a klayout cell into a {0,1} array.

Polygons are rasterized through PIL.ImageDraw.polygon after transforming their hull/holes to pixel coordinates. This is faithful for arbitrary (Manhattan and non-Manhattan) shapes — earlier code decomposed into trapezoids and filled their bboxes, which over-filled angled trapezoids and only happened to be exact because ASAP7 is Manhattan-only. Sibling PDK adapters reuse this helper, so the fix has to handle non-axis-aligned geometry too.

Iterates the layer recursively (begin_shapes_rec) so geometry referenced via cell instances (a stdcell that INSTANCEs a shared via array, for example) is included; the previous flat cell.shapes(...).each() silently dropped instanced shapes.

Returns (array, origin_nm) where origin_nm is the cell bbox lower-left corner in nm. The returned array follows the same orientation convention as :func:openlithohub.data.io.load_layout: image (y-down) coordinates with arr[0] at the top of the layout viewer (largest y_nm). Earlier asap7 code stored y-up and then flipud-d, contradicting the canonical convention and producing vertically mirrored masks vs. load_layout-loaded layouts.

Source code in src/openlithohub/data/asap7.py
def rasterize_cell_layer(
    layout: Any,
    cell: Any,
    layer_spec: tuple[int, int],
    pixel_nm: float,
) -> tuple[np.ndarray[Any, Any], tuple[float, float]]:
    """Render one ``(layer, datatype)`` of a klayout cell as a {0,1} raster.

    Shapes are collected recursively (``begin_shapes_rec``), so geometry
    reached through cell instances is part of the result. They are merged
    into a ``Region``, split into convex pieces, and each piece is filled
    with ``PIL.ImageDraw.polygon`` after conversion to pixel coordinates.
    Filling real polygon outlines (rather than trapezoid bounding boxes)
    keeps the result correct for non-Manhattan geometry, which matters
    because sibling PDK adapters reuse this helper.

    Orientation follows :func:`openlithohub.data.io.load_layout`: image
    (y-down) coordinates, with ``arr[0]`` at the top of the layout viewer
    (largest y_nm). No ``flipud`` is applied afterwards.

    Args:
        layout: klayout layout that owns ``cell``.
        cell: klayout cell to rasterize.
        layer_spec: ``(layer, datatype)`` pair handed to ``find_layer``.
        pixel_nm: Edge length of one raster pixel in nm.

    Returns:
        ``(array, origin_nm)``: a ``float32`` array of shape ``(h, w)``
        and the cell bbox lower-left corner in nm. If the layer does not
        exist in ``layout`` the array is all zeros.
    """
    import klayout.db as kdb
    from PIL import Image, ImageDraw

    layer_index = layout.find_layer(*layer_spec)
    bbox = cell.bbox()
    dbu_nm = layout.dbu * 1000.0
    origin = (bbox.left * dbu_nm, bbox.bottom * dbu_nm)
    # Raster extent: bbox size in nm over the pixel pitch, rounded up, and
    # never smaller than a single pixel.
    w = max(1, int(np.ceil(bbox.width() * dbu_nm / pixel_nm)))
    h = max(1, int(np.ceil(bbox.height() * dbu_nm / pixel_nm)))
    if layer_index is None:
        return np.zeros((h, w), dtype=np.float32), origin

    canvas = Image.new("L", (w, h), 0)
    drawer = ImageDraw.Draw(canvas)
    last_col = w - 1
    last_row = h - 1

    def _to_px(point: Any) -> tuple[int, int]:
        # GDSII is y-up while PIL is y-down, so the row index is mirrored
        # here at projection time. Coordinates are taken relative to the
        # bbox lower-left corner and clamped to the canvas.
        col = int(round((point.x - bbox.left) * dbu_nm / pixel_nm))
        row_from_bottom = int(round((point.y - bbox.bottom) * dbu_nm / pixel_nm))
        row = last_row - row_from_bottom
        return (max(0, min(col, last_col)), max(0, min(row, last_row)))

    # Gather boxes, paths and polygons from the recursive iterator into a
    # single Region, applying each shape's accumulated transformation. Any
    # other shape kind is skipped.
    region = kdb.Region()
    walker = cell.begin_shapes_rec(layer_index)
    while not walker.at_end():
        shape = walker.shape()
        trans = walker.trans()
        if shape.is_box():
            region.insert(kdb.Polygon(shape.box).transformed(trans))
        elif shape.is_path():
            region.insert(shape.path.polygon().transformed(trans))
        elif shape.is_polygon():
            region.insert(shape.polygon.transformed(trans))
        walker.next()
    region.merge()

    for merged in region.each():
        # Convex pieces carry no holes, so filling them can never erase a
        # separate polygon that sits inside another polygon's hole. If the
        # decomposition call is unavailable, fall back to the polygon itself.
        try:
            pieces = list(merged.decompose_convex(kdb.Polygon.PO_any))
        except (AttributeError, TypeError):
            pieces = [merged]
        for piece in pieces:
            if hasattr(piece, "each_point"):
                vertices = piece.each_point()
            else:
                vertices = piece.each_point_hull()
            outline = [_to_px(vertex) for vertex in vertices]
            # Fewer than three vertices cannot enclose any area.
            if len(outline) >= 3:
                drawer.polygon(outline, fill=255)

    return np.array(canvas, dtype=np.float32) / 255.0, origin

openlithohub.data.freepdk45

FreePDK45 + NanGate Open Cell Library adapter (single-GDS standard cells).

FreePDK45 is NCSU's 45nm open-source predictive PDK; NanGate's Open Cell Library provides the standard cells designed against it. The mflowgen ASIC design kit at https://github.com/mflowgen/freepdk-45nm bundles the two together as a convenience drop, including a single stdcells.gds file with all 135 NanGate cells.

This adapter loads a small canonical list of cells (INV_X1, NAND2_X1, NOR2_X1, DFF_X1) by name and rasterizes one design layer per cell. The default is metal1 = (11, 0) per the kit's rtk-stream-out.map (note: this is not the same numbering as ASAP7, where metal1 = (10, 0)).

License caveat

Unlike ASAP7's clean BSD-3-Clause, the FreePDK45 distribution is two licenses stacked:

  • FreePDK45 (NCSU): see https://eda.ncsu.edu/freepdk/freepdk45/.
  • NanGate Open Cell Library (Si2): see https://si2.org/open-cell-library/.

The mflowgen mirror at github.com/mflowgen/freepdk-45nm does not ship a top-level LICENSE file, so callers MUST verify the upstream terms themselves before redistributing any derivative work. As with ASAP7, fetch() requires explicit accept_license=True to acknowledge this responsibility, and the adapter never bundles PDK bytes into the OpenLithoHub repository.

FreePdk45Dataset

Bases: DatasetAdapter

Adapter for FreePDK45 + NanGate standard cells via mflowgen mirror.

Parameters:

Name Type Description Default
root str | Path

Path to a local clone of mflowgen/freepdk-45nm. Use FreePdk45Dataset.fetch(root, accept_license=True) to create one.

required
cells tuple[str, ...] | list[str] | None

Cell names to expose, in order. Defaults to CANONICAL_CELLS.

None
design_layer tuple[int, int]

(layer, datatype) to rasterize as the design tensor. Defaults to metal1 (11, 0) per rtk-stream-out.map.

DEFAULT_DESIGN_LAYER
pixel_nm float

Raster pixel size in nm. Defaults to 1.0; the FreePDK45 dbu is 0.1 nm so this is a 10× downsample.

1.0
gds_path str | Path | None

Optional explicit override for the GDS file path. If unset, the adapter looks for stdcells.gds directly under root.

None

The adapter requires klayout (already pinned in pyproject.toml).

Source code in src/openlithohub/data/freepdk45.py
class FreePdk45Dataset(DatasetAdapter):
    """Adapter for FreePDK45 + NanGate standard cells via mflowgen mirror.

    Each entry of ``cells`` is exposed as one ``LithoSample`` whose
    ``design`` tensor is the rasterization of ``design_layer`` for that
    cell; ``mask`` and ``resist`` are always ``None``. Samples are built
    on first access and cached per cell name.

    Args:
        root: Path to a local clone of ``mflowgen/freepdk-45nm``. Use
            ``FreePdk45Dataset.fetch(root, accept_license=True)`` to
            create one.
        cells: Cell names to expose, in order. Defaults to
            ``CANONICAL_CELLS``.
        design_layer: ``(layer, datatype)`` to rasterize as the design
            tensor. Defaults to metal1 (11, 0) per
            ``rtk-stream-out.map``.
        pixel_nm: Raster pixel size in nm. Defaults to 1.0; the
            FreePDK45 dbu is 0.1 nm so this is a 10× downsample. Must
            be positive.
        gds_path: Optional explicit override for the GDS file path. If
            unset, the adapter looks for ``stdcells.gds`` directly under
            ``root``.

    Raises:
        ValueError: If ``pixel_nm`` is not positive.
        FileNotFoundError: If ``root`` or the resolved GDS file does not
            exist.

    The adapter requires ``klayout`` (already pinned in pyproject.toml).
    """

    def __init__(
        self,
        root: str | Path,
        cells: tuple[str, ...] | list[str] | None = None,
        design_layer: tuple[int, int] = DEFAULT_DESIGN_LAYER,
        pixel_nm: float = 1.0,
        gds_path: str | Path | None = None,
    ) -> None:
        # Reject a nonsensical raster pitch up front, mirroring the check
        # performed by FreePdk45SramDataset, instead of failing later
        # inside the rasterizer.
        if pixel_nm <= 0:
            raise ValueError(f"pixel_nm must be positive, got {pixel_nm!r}")
        self.root = Path(root)
        if not self.root.exists():
            raise FileNotFoundError(f"FreePDK45 root not found: {self.root}")
        from openlithohub._utils.integrity import warn_unverified_data_root

        warn_unverified_data_root(self.root, "freepdk45")
        self.design_layer = design_layer
        self.pixel_nm = float(pixel_nm)
        self.cells: tuple[str, ...] = tuple(cells) if cells is not None else CANONICAL_CELLS
        self._gds_path = Path(gds_path) if gds_path is not None else self.root / _GDS_RELATIVE
        if not self._gds_path.exists():
            raise FileNotFoundError(
                f"FreePDK45 GDS not found at {self._gds_path}. "
                f"Did you clone {FREEPDK45_UPSTREAM_URL} into {self.root}?"
            )
        # Per-cell-name cache of already rasterized samples.
        self._cache: dict[str, LithoSample] = {}

    def __len__(self) -> int:
        """Return the number of exposed cells."""
        return len(self.cells)

    def __getitem__(self, index: int) -> LithoSample:
        """Return the sample for ``self.cells[index]``, loading it on first use.

        Negative indices count from the end, as for built-in sequences
        and as in ``FreePdk45SramDataset``.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
            KeyError: If the cell is not present in the GDS file.
        """
        n = len(self.cells)
        if index < -n or index >= n:
            raise IndexError(f"Index {index} out of range [{-n}, {n})")
        if index < 0:
            index += n
        name = self.cells[index]
        if name in self._cache:
            return self._cache[name]
        sample = self._load_cell(name)
        self._cache[name] = sample
        return sample

    def _load_cell(self, name: str) -> LithoSample:
        """Read the GDS, rasterize ``design_layer`` of cell ``name``, wrap it.

        The GDS file is re-read on every call; ``__getitem__`` caches
        the result so each cell is loaded at most once per instance.

        Raises:
            KeyError: If ``name`` is not a cell of the GDS file.
        """
        import klayout.db as kdb

        layout = kdb.Layout()
        layout.read(str(self._gds_path))
        cell = layout.cell(name)
        if cell is None:
            # Show only a short sample of names to keep the message readable.
            available = sorted(c.name for c in layout.each_cell())[:10]
            raise KeyError(
                f"Cell {name!r} not found in {self._gds_path.name}. First 10 available: {available}"
            )

        design_arr, origin = rasterize_cell_layer(layout, cell, self.design_layer, self.pixel_nm)

        metadata: dict[str, Any] = {
            "dataset": "freepdk45",
            "pdk": "freepdk45",
            "pdk_variant": "nangate-openlib",
            "cell_name": name,
            "source_gds": str(self._gds_path),
            # klayout reports the database unit in µm; store it in nm.
            "dbu_nm": layout.dbu * 1000.0,
            "pixel_nm": self.pixel_nm,
            "design_layer": list(self.design_layer),
            "origin_nm": [origin[0], origin[1]],
            "license": FREEPDK45_LICENSE,
            "license_url": FREEPDK45_LICENSE_URL,
            "secondary_license_url": NANGATE_LICENSE_URL,
        }

        return LithoSample(
            design=torch.from_numpy(design_arr).float(),
            mask=None,
            resist=None,
            metadata=metadata,
        )

    def download(self, root: str) -> None:
        """Always rejected — use ``fetch()`` instead.

        The base ``DatasetAdapter.download`` signature has no place for
        the license-acknowledgement flag this PDK requires.

        Raises:
            RuntimeError: On every call.
        """
        raise RuntimeError(
            "FreePdk45Dataset.download() is intentionally unimplemented "
            "because FreePDK45 + NanGate require explicit license "
            "acknowledgement. Use "
            "`FreePdk45Dataset.fetch(root, accept_license=True)` instead."
        )

    # ---- Croissant metadata ----

    def croissant_name(self) -> str:
        """Return the dataset name used in Croissant metadata."""
        return "FreePDK45"

    def croissant_description(self) -> str:
        """Return the dataset description used in Croissant metadata."""
        return (
            "FreePDK45 is the NCSU 45nm predictive academic PDK paired with NanGate "
            "Open Cell Library. Cell layouts are rasterised on-the-fly for OPC / mask "
            "optimisation research."
        )

    def croissant_license_url(self) -> str | None:
        """Return the FreePDK45 license URL."""
        return FREEPDK45_LICENSE_URL

    def croissant_url(self) -> str | None:
        """Return the upstream FreePDK45 home page."""
        return "https://eda.ncsu.edu/freepdk/freepdk45/"

    def croissant_citation(self) -> str | None:
        """Return the citation string for FreePDK."""
        return (
            "Stine, J. E., et al. FreePDK: An Open-Source Variation-Aware Design Kit. "
            "IEEE MSE 2007."
        )

    @classmethod
    def fetch(
        cls,
        root: str | Path,
        accept_license: bool = False,
    ) -> None:
        """Clone the mflowgen FreePDK45 mirror to ``root``.

        FreePDK45 + NanGate ships under a stacked license that the
        mflowgen mirror does *not* declare in a LICENSE file. Callers
        must independently verify both upstream terms before
        redistributing any derivative work, and the adapter requires
        ``accept_license=True`` to acknowledge that responsibility.

        Per ``DATA-LICENSES.md``, OpenLithoHub does not redistribute PDK
        bytes — this method only clones from the mflowgen mirror on the
        user's own machine.

        Args:
            root: Destination directory for the clone. Its parent is
                created if missing.
            accept_license: Must be ``True`` to proceed.

        Raises:
            RuntimeError: If ``accept_license`` is not truthy.
            subprocess.CalledProcessError: If ``git clone`` exits
                non-zero (``check=True``).
        """
        if not accept_license:
            raise RuntimeError(
                f"FreePDK45 ships under a stacked license: {FREEPDK45_LICENSE}. "
                f"Read the terms at {FREEPDK45_LICENSE_URL} (FreePDK45) and "
                f"{NANGATE_LICENSE_URL} (NanGate OCL) and call "
                f"fetch(..., accept_license=True) to confirm you will comply "
                f"with both."
            )
        target = Path(root)
        target.parent.mkdir(parents=True, exist_ok=True)
        # Progress and the license reminder go to stderr, not stdout.
        sys.stderr.write(
            f"Cloning FreePDK45 / NanGate OCL into {target} from "
            f"{FREEPDK45_UPSTREAM_URL}\n"
            f"  Verify upstream terms: {FREEPDK45_LICENSE_URL}\n"
            f"                          {NANGATE_LICENSE_URL}\n"
        )
        # Shallow clone (depth 1): only the latest snapshot is fetched.
        subprocess.run(
            [
                "git",
                "clone",
                "--depth",
                "1",
                FREEPDK45_UPSTREAM_URL,
                str(target),
            ],
            check=True,
        )

download(root)

Always rejected — use fetch() instead.

The base DatasetAdapter.download signature has no place for the license-acknowledgement flag this PDK requires.

Source code in src/openlithohub/data/freepdk45.py
def download(self, root: str) -> None:
    """Refuse unconditionally and point the caller at ``fetch()``.

    ``DatasetAdapter.download`` takes no license-acknowledgement
    argument, which this PDK requires, so this override only raises.

    Raises:
        RuntimeError: On every call; ``root`` is ignored.
    """
    message = (
        "FreePdk45Dataset.download() is intentionally unimplemented "
        "because FreePDK45 + NanGate require explicit license "
        "acknowledgement. Use "
        "`FreePdk45Dataset.fetch(root, accept_license=True)` instead."
    )
    raise RuntimeError(message)

fetch(root, accept_license=False) classmethod

Clone the mflowgen FreePDK45 mirror to root.

FreePDK45 + NanGate ships under a stacked license that the mflowgen mirror does not declare in a LICENSE file. Callers must independently verify both upstream terms before redistributing any derivative work, and the adapter requires accept_license=True to acknowledge that responsibility.

Per DATA-LICENSES.md, OpenLithoHub does not redistribute PDK bytes — this method only clones from the mflowgen mirror on the user's own machine.

Source code in src/openlithohub/data/freepdk45.py
@classmethod
def fetch(
    cls,
    root: str | Path,
    accept_license: bool = False,
) -> None:
    """Shallow-clone the mflowgen FreePDK45 mirror into ``root``.

    The mirror carries FreePDK45 and the NanGate Open Cell Library but
    no LICENSE file, so the caller has to check both upstream licenses
    before redistributing anything derived from it. Passing
    ``accept_license=True`` is the explicit acknowledgement of that
    duty; without it nothing is cloned.

    Per ``DATA-LICENSES.md``, OpenLithoHub does not redistribute PDK
    bytes — the clone happens on the user's own machine only.

    Raises:
        RuntimeError: If ``accept_license`` is not truthy.
        subprocess.CalledProcessError: If ``git clone`` exits non-zero.
    """
    if not accept_license:
        refusal = (
            f"FreePDK45 ships under a stacked license: {FREEPDK45_LICENSE}. "
            f"Read the terms at {FREEPDK45_LICENSE_URL} (FreePDK45) and "
            f"{NANGATE_LICENSE_URL} (NanGate OCL) and call "
            f"fetch(..., accept_license=True) to confirm you will comply "
            f"with both."
        )
        raise RuntimeError(refusal)

    target = Path(root)
    # Only the parent directory is created here; ``target`` itself is
    # handed to ``git clone`` as the destination.
    target.parent.mkdir(parents=True, exist_ok=True)

    notice = (
        f"Cloning FreePDK45 / NanGate OCL into {target} from "
        f"{FREEPDK45_UPSTREAM_URL}\n"
        f"  Verify upstream terms: {FREEPDK45_LICENSE_URL}\n"
        f"                          {NANGATE_LICENSE_URL}\n"
    )
    sys.stderr.write(notice)

    clone_cmd = ["git", "clone", "--depth", "1", FREEPDK45_UPSTREAM_URL, str(target)]
    subprocess.run(clone_cmd, check=True)

openlithohub.data.freepdk45_sram

FreePDK45 SRAM bitcell adapter — load OpenRAM's bundled GDS as samples.

OpenRAM (BSD-3-Clause, pip install openram) ships a small set of hand-crafted FreePDK45 standard cells under technology/freepdk45/gds_lib/:

  • cell_1rw.gds — 6T 1-port SRAM bitcell
  • cell_2rw.gds — 8T dual-port SRAM bitcell
  • dff.gds — D-flip-flop
  • sense_amp.gds — sense amplifier
  • write_driver.gds — write driver
  • tri_gate.gds — tri-state gate
  • replica_cell_{1,2}rw.gds — timing-replica columns
  • dummy_cell_{1,2}rw.gds — row/column edge dummies

These are the exact cells OpenRAM compiles together to build a full SRAM macro on FreePDK45. Each GDS contains a single top cell whose name matches the file stem.

This adapter rasterizes one design layer per cell (default: metal1 (11, 0)) and emits one LithoSample per cell — directly addressing issue #4 Phase 3's SRAM-bitcell-tile data goal without running OpenRAM's compiler. The compile path (sram_compiler.py) currently has a numpy-2 scalar-conversion regression in upstream OpenRAM 1.2.48; the pre-shipped GDS files are the canonical, citation-worthy artifact and sidestep that bug entirely.

Layer numbering matches the FreePDK45 stream-out map (layers.map in the openram package): metal1 = 11/0, identical to the mflowgen NanGate mirror, so the central registry's LAYERS["freepdk45"].metal1 covers both.

License

  • OpenRAM: BSD-3-Clause (https://github.com/VLSIDA/OpenRAM)
  • FreePDK45: academic / non-commercial (NCSU EDA Wiki).

The adapter does not redistribute either set of bytes — it locates the bundled GDS via importlib.resources.files("openram") at runtime, so the user's pip-installed openram wheel is the source of truth.

FreePdk45SramDataset

Bases: DatasetAdapter

Adapter for OpenRAM's bundled FreePDK45 SRAM-cell GDS files.

Parameters:

Name Type Description Default
cells Sequence[str] | None

Cell names to expose, in order. Defaults to CANONICAL_CELLS (all 10 cells in the bundle).

None
design_layer tuple[int, int]

(layer, datatype) to rasterize as the design tensor. Defaults to metal1 (11, 0) per FreePDK45's layers.map.

DEFAULT_DESIGN_LAYER
pixel_nm float

Raster pixel size in nm. Defaults to 1.0; FreePDK45's dbu is 0.5 nm so this is a 2× downsample.

1.0
gds_lib_path str | Path | None

Optional explicit path to OpenRAM's gds_lib directory. If unset, auto-located via importlib.resources.

None

Each LithoSample has mask=None and resist=None — these are unmasked design-layer rasterizations, suitable as inputs to OPC / mask-optimization research, not paired training data.

Source code in src/openlithohub/data/freepdk45_sram.py
class FreePdk45SramDataset(DatasetAdapter):
    """Expose OpenRAM's bundled FreePDK45 SRAM-cell GDS files as samples.

    Args:
        cells: Cell names to expose, in order. Defaults to
            ``CANONICAL_CELLS`` (all 10 cells in the bundle).
        design_layer: ``(layer, datatype)`` to rasterize as the design
            tensor. Defaults to metal1 (11, 0) per FreePDK45's
            ``layers.map``.
        pixel_nm: Raster pixel size in nm. Defaults to 1.0; FreePDK45's
            dbu is 0.5 nm so this is a 2× downsample.
        gds_lib_path: Optional explicit path to OpenRAM's ``gds_lib``
            directory. If unset, auto-located via ``importlib.resources``.

    Raises:
        ValueError: If ``pixel_nm`` is not positive.
        FileNotFoundError: If the ``gds_lib`` directory does not exist.

    Every ``LithoSample`` carries ``mask=None`` and ``resist=None``: the
    samples are plain design-layer rasterizations meant as inputs to
    OPC / mask-optimization research, not paired training data.
    """

    def __init__(
        self,
        cells: Sequence[str] | None = None,
        design_layer: tuple[int, int] = DEFAULT_DESIGN_LAYER,
        pixel_nm: float = 1.0,
        gds_lib_path: str | Path | None = None,
    ) -> None:
        if pixel_nm <= 0:
            raise ValueError(f"pixel_nm must be positive, got {pixel_nm!r}")

        self.design_layer = design_layer
        self.pixel_nm = float(pixel_nm)
        self.cells: tuple[str, ...] = CANONICAL_CELLS if cells is None else tuple(cells)

        # An explicit path wins; otherwise ask the helper to find the bundle.
        if gds_lib_path is None:
            self._gds_lib = _locate_openram_gds_lib()
        else:
            self._gds_lib = Path(gds_lib_path)
        if not self._gds_lib.is_dir():
            raise FileNotFoundError(f"FreePDK45 SRAM gds_lib not found: {self._gds_lib}")

        from openlithohub._utils.integrity import warn_unverified_data_root

        warn_unverified_data_root(self._gds_lib, "freepdk45_sram")
        # Rasterized samples, keyed by cell name.
        self._cache: dict[str, LithoSample] = {}

    def __len__(self) -> int:
        """Return how many cells this adapter exposes."""
        return len(self.cells)

    def __getitem__(self, index: int) -> LithoSample:
        """Return the sample at ``index``; negative indices count from the end.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        n = len(self.cells)
        if not (-n <= index < n):
            raise IndexError(f"Index {index} out of range [{-n}, {n})")
        # After the range check, tuple indexing resolves negatives itself.
        name = self.cells[index]

        cached = self._cache.get(name)
        if cached is None:
            cached = self._load_cell(name)
            self._cache[name] = cached
        return cached

    def _load_cell(self, name: str) -> LithoSample:
        """Rasterize ``<gds_lib>/<name>.gds`` and wrap it in a ``LithoSample``.

        Raises:
            KeyError: If the GDS file is missing, or if it holds no cell
                called ``name`` and does not have exactly one top cell.
        """
        import klayout.db as kdb

        gds_path = self._gds_lib / f"{name}.gds"
        if not gds_path.exists():
            available = sorted(p.stem for p in self._gds_lib.glob("*.gds"))
            raise KeyError(f"Cell {name!r} not found in {self._gds_lib}. Available: {available}")

        layout = kdb.Layout()
        layout.read(str(gds_path))

        cell = layout.cell(name)
        if cell is None:
            # OpenRAM names each GDS top cell after the file stem. Should
            # that stop holding, accept the file's single top cell instead.
            tops = list(layout.top_cells())
            if len(tops) != 1:
                names = [c.name for c in tops]
                raise KeyError(
                    f"Cell {name!r} not present in {gds_path.name}; expected "
                    f"exactly one top cell matching the file stem, found {names!r}."
                )
            cell = tops[0]

        design_arr, origin = rasterize_cell_layer(layout, cell, self.design_layer, self.pixel_nm)
        origin_x, origin_y = origin[0], origin[1]

        metadata: dict[str, Any] = {
            "dataset": "freepdk45-sram",
            "pdk": "freepdk45",
            "pdk_variant": "openram-bundled",
            # Name of the cell actually rasterized (may be the fallback top cell).
            "cell_name": cell.name,
            "source_gds": str(gds_path),
            "dbu_nm": layout.dbu * 1000.0,
            "pixel_nm": self.pixel_nm,
            "design_layer": list(self.design_layer),
            "origin_nm": [origin_x, origin_y],
            "license": FREEPDK45_LICENSE,
            "license_url": FREEPDK45_LICENSE_URL,
            "tooling_license": OPENRAM_LICENSE,
            "tooling_license_url": OPENRAM_LICENSE_URL,
        }

        design = torch.from_numpy(design_arr).float()
        return LithoSample(design=design, mask=None, resist=None, metadata=metadata)

    def download(self, root: str) -> None:
        """Always raises — there is nothing to download.

        The GDS bundle ships in the ``openram`` pip wheel. Install via
        ``pip install 'openlithohub[freepdk45-sram]'`` or
        ``pip install openram``; the adapter then locates the bundle
        automatically via ``importlib.resources``.

        Raises:
            RuntimeError: On every call; ``root`` is ignored.
        """
        message = (
            "FreePdk45SramDataset has no download() — the GDS bundle ships "
            "inside the openram pip wheel. Install via `pip install openram` "
            "(or `pip install 'openlithohub[freepdk45-sram]'`) and the "
            "adapter will locate it automatically."
        )
        raise RuntimeError(message)

    # ---- Croissant metadata ----

    def croissant_name(self) -> str:
        """Return the dataset name used in Croissant metadata."""
        return "FreePDK45-SRAM-OpenRAM"

    def croissant_description(self) -> str:
        """Return the dataset description used in Croissant metadata."""
        description = (
            "FreePDK45 SRAM-cell GDS files bundled with OpenRAM (1RW / 2RW "
            "bitcells, sense amp, write driver, DFF, replica and dummy "
            "cells). Each cell is rasterised on one design layer for OPC / "
            "mask-optimisation research."
        )
        return description

    def croissant_license_url(self) -> str | None:
        """Return the FreePDK45 license URL."""
        return FREEPDK45_LICENSE_URL

    def croissant_url(self) -> str | None:
        """Return the OpenRAM project URL."""
        return "https://github.com/VLSIDA/OpenRAM"

    def croissant_citation(self) -> str | None:
        """Return the citation string for OpenRAM."""
        citation = (
            "Guthaus, M. R., Stine, J. E., Ataei, S., et al. "
            "OpenRAM: An Open-Source Memory Compiler. ICCAD 2016."
        )
        return citation

download(root)

Always raises RuntimeError — there is nothing to download, because the GDS bundle ships in the openram pip wheel.

Install via pip install 'openlithohub[freepdk45-sram]' or pip install openram; the adapter then locates the bundle automatically via importlib.resources.

Source code in src/openlithohub/data/freepdk45_sram.py
def download(self, root: str) -> None:
    """Always raises — there is nothing to download.

    The GDS bundle ships in the ``openram`` pip wheel. Install via
    ``pip install 'openlithohub[freepdk45-sram]'`` or
    ``pip install openram``; the adapter then locates the bundle
    automatically via ``importlib.resources``.

    Raises:
        RuntimeError: On every call; ``root`` is ignored.
    """
    message = (
        "FreePdk45SramDataset has no download() — the GDS bundle ships "
        "inside the openram pip wheel. Install via `pip install openram` "
        "(or `pip install 'openlithohub[freepdk45-sram]'`) and the "
        "adapter will locate it automatically."
    )
    raise RuntimeError(message)

openlithohub.data.orfs

ORFS artifact adapter — load ASAP7-routed RISC-V layouts as tile samples.

OpenROAD-flow-scripts (ORFS) is the open-source RTL→GDSII flow. Its flow/designs/asap7/<name>/ configurations produce real ASAP7-routed layouts — including mock-alu, riscv32i, riscv32i-mock-sram (the SRAM-instantiated variant covering Phase 3's SRAM-bitcell-tile goal), ibex, swerv_wrapper, and cva6 — under flow/results/asap7/<name>/base/<name>.gds.

Phase 3 of issue #4 wires those artifacts into OpenLithoHub. The adapter is fully generic over design name; the build-asap7-mock-alu.yml workflow already accepts design as a workflow_dispatch input, so producing a different design's GDS is gh workflow run build-asap7-mock-alu.yml -f design=riscv32i-mock-sram — no adapter or workflow code change needed.

The adapter rasterizes one design layer of the top cell, then cuts the result into fixed-size tiles (2 µm or 5 µm by default — the windows AI-OPC inference is benchmarked on). One LithoSample per tile.

Why tiling instead of one sample per block: a routed RISC-V ALU block is hundreds of microns on a side, far too large for the Hopkins forward model to evaluate as a single tensor. The ICCAD/AI-OPC literature evaluates on ~2 µm and ~5 µm windows, and that's what the issue spec (Phase 3) calls for.

License

ORFS itself is BSD-3-Clause; the asap7 platform underneath is also BSD-3-Clause (same upstream as openlithohub.data.asap7). The adapter re-uses the ASAP7 license constants — there is no separate ORFS data-license gate beyond the ASAP7 acknowledgement already required when fetching the PDK.

This module never redistributes ORFS or ASAP7 bytes. The fetch() classmethod points at the build-asap7-mock-alu GitHub Actions workflow that produces the GDS as a release-style artifact.

OrfsArtifactDataset

Bases: DatasetAdapter

Load an ORFS-produced ASAP7 layout, expose it as N tile samples.

Parameters:

Name Type Description Default
gds_path str | Path

Path to a GDS file produced by ORFS make against an asap7/<design> config (e.g. mock-alu.gds).

required
cell_name str | None

Optional explicit top-cell name. Defaults to the GDS file's basename (matches ORFS naming convention).

None
design_layer tuple[int, int]

(layer, datatype) to rasterize. Defaults to metal1 (20, 0) — post-route ORFS-ASAP7 GDS numbers M1 as 20/0, not 10/0 like the cell-library source. See the module docstring for the full layer-numbering caveat.

DEFAULT_DESIGN_LAYER
pixel_nm float

Raster pixel size in nm. Default 1.0; ASAP7 dbu is 0.25 nm so the rasterizer downsamples 4×.

1.0
tile_nm float | None

Tile edge length in nm. Default 2000 (2 µm); also commonly 5000 (5 µm). Pass None to disable tiling and expose the whole block as a single sample (only feasible for very small designs).

DEFAULT_TILE_NM
stride_nm float | None

Tile stride. Defaults to tile_nm (non-overlapping). Pass a smaller value for overlapping inference windows.

None
drop_empty_tiles bool

Skip all-zero tiles. Default True.

True
design_name str | None

Optional human-readable design name for metadata (e.g. "mock-alu", "riscv32i", "riscv32i-mock-sram"). Defaults to gds_path.stem.

None
orfs_revision str | None

Optional ORFS git SHA recorded in metadata for reproducibility. Set this to the orfs_ref input of the build-asap7-mock-alu workflow that produced the GDS.

None
Source code in src/openlithohub/data/orfs.py
class OrfsArtifactDataset(DatasetAdapter):
    """Load an ORFS-produced ASAP7 layout, expose it as N tile samples.

    The GDS is read and rasterized lazily on the first ``len()`` or
    indexing call; the constructor only validates its arguments.

    Args:
        gds_path: Path to a GDS file produced by ``ORFS make`` against
            an ``asap7/<design>`` config (e.g. ``mock-alu.gds``).
        cell_name: Optional explicit top-cell name. When omitted, the
            first top cell of the layout is used (with a warning if the
            layout has more than one).
        design_layer: ``(layer, datatype)`` to rasterize. Defaults to
            metal1 (20, 0) — post-route ORFS-ASAP7 GDS numbers M1 as
            20/0, *not* 10/0 like the cell-library source. See the
            module docstring for the full layer-numbering caveat.
        pixel_nm: Raster pixel size in nm. Default 1.0; ASAP7 dbu is
            0.25 nm so the rasterizer downsamples 4×. Must be positive.
        tile_nm: Tile edge length in nm. Default 2000 (2 µm); also
            commonly 5000 (5 µm). Pass ``None`` to disable tiling and
            expose the whole block as a single sample (only feasible
            for very small designs).
        stride_nm: Tile stride. Defaults to ``tile_nm``
            (non-overlapping). Pass a smaller value for overlapping
            inference windows.
        drop_empty_tiles: Skip all-zero tiles. Default True.
        design_name: Optional human-readable design name for metadata
            (e.g. "mock-alu", "riscv32i", "riscv32i-mock-sram"). Defaults
            to ``gds_path.stem``.
        orfs_revision: Optional ORFS git SHA recorded in metadata for
            reproducibility. Set this to the ``orfs_ref`` input of the
            ``build-asap7-mock-alu`` workflow that produced the GDS.

    Raises:
        FileNotFoundError: If ``gds_path`` does not exist.
        ValueError: If ``pixel_nm`` is not positive, or ``tile_nm`` is
            neither positive nor ``None``.
    """

    def __init__(
        self,
        gds_path: str | Path,
        cell_name: str | None = None,
        design_layer: tuple[int, int] = DEFAULT_DESIGN_LAYER,
        pixel_nm: float = 1.0,
        tile_nm: float | None = DEFAULT_TILE_NM,
        stride_nm: float | None = None,
        drop_empty_tiles: bool = True,
        design_name: str | None = None,
        orfs_revision: str | None = None,
    ) -> None:
        self.gds_path = Path(gds_path)
        if not self.gds_path.exists():
            raise FileNotFoundError(f"ORFS GDS not found: {self.gds_path}")
        from openlithohub._utils.integrity import warn_unverified_data_root

        warn_unverified_data_root(self.gds_path.parent, "orfs")
        # Fail at construction time rather than on the first lazy load.
        if pixel_nm <= 0:
            raise ValueError(f"pixel_nm must be positive, got {pixel_nm!r}")
        if tile_nm is not None and tile_nm <= 0:
            raise ValueError(f"tile_nm must be positive or None, got {tile_nm!r}")
        self.cell_name = cell_name
        self.design_layer = design_layer
        self.pixel_nm = float(pixel_nm)
        self.tile_nm = tile_nm
        self.stride_nm = stride_nm
        self.drop_empty_tiles = drop_empty_tiles
        self.design_name = design_name or self.gds_path.stem
        self.orfs_revision = orfs_revision
        # Lazy: rasterize on first __getitem__ so constructor is cheap.
        self._design_arr: np.ndarray[Any, Any] | None = None
        self._origin_nm: tuple[float, float] | None = None
        self._dbu_nm: float | None = None
        self._cell_name_resolved: str | None = None
        self._tiles: list[tuple[np.ndarray[Any, Any], tuple[int, int]]] | None = None

    def _ensure_loaded(self) -> None:
        """Read, rasterize and tile the GDS once; later calls are no-ops.

        All cached state is assigned only after every step has
        succeeded, so a failure part-way through leaves the instance
        unloaded and the next call retries from scratch.

        Raises:
            KeyError: If ``cell_name`` was given but is not in the GDS.
            ValueError: If no ``cell_name`` was given and the GDS has no
                top cells.
        """
        # ``_tiles`` is the last piece of state to be set, so it is the
        # reliable "fully loaded" marker.
        if self._tiles is not None:
            return
        import klayout.db as kdb

        layout = kdb.Layout()
        layout.read(str(self.gds_path))
        if self.cell_name is not None:
            cell = layout.cell(self.cell_name)
            if cell is None:
                available = sorted(c.name for c in layout.each_cell())[:10]
                raise KeyError(
                    f"Cell {self.cell_name!r} not found in {self.gds_path.name}. "
                    f"First 10 available: {available}"
                )
        else:
            top_cells = list(layout.top_cells())
            if not top_cells:
                raise ValueError(f"GDS {self.gds_path.name} has no top cells.")
            if len(top_cells) > 1:
                names = [c.name for c in top_cells]
                warnings.warn(
                    f"GDS {self.gds_path.name} has {len(top_cells)} top cells "
                    f"({names!r}); picking {names[0]!r}. Pass cell_name=... to "
                    "select explicitly.",
                    stacklevel=2,
                )
            cell = top_cells[0]
        design_arr, origin = rasterize_cell_layer(layout, cell, self.design_layer, self.pixel_nm)
        if self.tile_nm is None:
            # Treat the whole block as a single "tile" at offset (0, 0).
            tiles = [(design_arr, (0, 0))]
        else:
            tiles = tile_design_tensor(
                design_arr,
                tile_nm=self.tile_nm,
                pixel_nm=self.pixel_nm,
                stride_nm=self.stride_nm,
                drop_empty=self.drop_empty_tiles,
            )
        # Commit everything together now that nothing above can fail.
        self._design_arr = design_arr
        self._origin_nm = origin
        # klayout reports the database unit in µm; store it in nm.
        self._dbu_nm = layout.dbu * 1000.0
        self._cell_name_resolved = cell.name
        self._tiles = tiles

    def __len__(self) -> int:
        """Return the number of tiles, loading the GDS if needed."""
        self._ensure_loaded()
        assert self._tiles is not None
        return len(self._tiles)

    def __getitem__(self, index: int) -> LithoSample:
        """Return tile ``index`` as a ``LithoSample``, loading the GDS if needed.

        Negative indices count from the end, as for built-in sequences.
        The ``tile_index`` recorded in metadata is always the
        non-negative position.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        self._ensure_loaded()
        assert self._tiles is not None
        assert self._origin_nm is not None
        n = len(self._tiles)
        if index < -n or index >= n:
            raise IndexError(f"Index {index} out of range [{-n}, {n})")
        if index < 0:
            index += n
        tile_arr, (tx_px, ty_px) = self._tiles[index]
        ox_nm, oy_nm = self._origin_nm
        # Tile offsets are in pixels of the parent raster; convert to nm
        # and shift by the raster origin.
        tile_origin_nm = (
            ox_nm + tx_px * self.pixel_nm,
            oy_nm + ty_px * self.pixel_nm,
        )
        metadata: dict[str, Any] = {
            "dataset": "orfs",
            "pdk": "asap7",
            "design_name": self.design_name,
            "cell_name": self._cell_name_resolved,
            "source_gds": str(self.gds_path),
            "dbu_nm": self._dbu_nm,
            "pixel_nm": self.pixel_nm,
            "design_layer": list(self.design_layer),
            "tile_index": index,
            "tile_nm": self.tile_nm,
            "tile_origin_nm": [tile_origin_nm[0], tile_origin_nm[1]],
            "tile_pixels": list(tile_arr.shape[::-1]),  # (w, h)
            "license": ASAP7_LICENSE,
            "license_url": ASAP7_LICENSE_URL,
        }
        if self.orfs_revision is not None:
            metadata["orfs_revision"] = self.orfs_revision
        return LithoSample(
            design=torch.from_numpy(tile_arr).float(),
            mask=None,
            resist=None,
            metadata=metadata,
        )

    def download(self, root: str) -> None:
        """ORFS artifacts are produced by a CI workflow, not downloaded.

        See ``.github/workflows/build-asap7-mock-alu.yml`` — trigger it
        via ``gh workflow run build-asap7-mock-alu.yml`` and download
        the resulting GDS artifact. There is no remote URL to fetch.

        Raises:
            RuntimeError: On every call.
        """
        raise RuntimeError(
            "OrfsArtifactDataset has no download() — the GDS comes from "
            "the build-asap7-mock-alu GitHub Actions workflow. Trigger "
            "it via `gh workflow run build-asap7-mock-alu.yml`, download "
            "the produced artifact, and pass its path to "
            "OrfsArtifactDataset(gds_path=...)."
        )

    # ---- Croissant metadata ----

    def croissant_name(self) -> str:
        """Return the dataset name used in Croissant metadata."""
        return "ORFS-ASAP7-MockALU"

    def croissant_description(self) -> str:
        """Return the dataset description used in Croissant metadata."""
        return (
            "Tiles extracted from a GDS produced by OpenROAD-flow-scripts (ORFS) "
            "routing a small ALU on the ASAP7 PDK. Each tile is a windowed view "
            "of the layout suitable for AI-OPC inference research."
        )

    def croissant_license_url(self) -> str | None:
        """Return the ASAP7 license URL."""
        return ASAP7_LICENSE_URL

    def croissant_url(self) -> str | None:
        """Return the OpenROAD-flow-scripts project URL."""
        return "https://github.com/The-OpenROAD-Project/OpenROAD-flow-scripts"

    def croissant_citation(self) -> str | None:
        """Return the citation string for OpenROAD."""
        return (
            "Ajayi, T., Blaauw, D. et al. OpenROAD: Toward a Self-Driving, "
            "Open-Source Digital Layout Implementation Tool Chain. "
            "DAC 2019 / GOMACTech 2019."
        )

download(root)

ORFS artifacts are produced by a CI workflow, not downloaded.

See .github/workflows/build-asap7-mock-alu.yml — trigger it via gh workflow run build-asap7-mock-alu.yml and download the resulting GDS artifact. There is no remote URL to fetch.

Source code in src/openlithohub/data/orfs.py
def download(self, root: str) -> None:
    """Always raises: ORFS artifacts are built by CI, not downloaded.

    The GDS is produced by ``.github/workflows/build-asap7-mock-alu.yml``.
    Trigger it with ``gh workflow run build-asap7-mock-alu.yml`` and
    download the resulting GDS artifact; there is no remote URL to fetch.

    Raises:
        RuntimeError: On every call; ``root`` is ignored.
    """
    message = (
        "OrfsArtifactDataset has no download() — the GDS comes from "
        "the build-asap7-mock-alu GitHub Actions workflow. Trigger "
        "it via `gh workflow run build-asap7-mock-alu.yml`, download "
        "the produced artifact, and pass its path to "
        "OrfsArtifactDataset(gds_path=...)."
    )
    raise RuntimeError(message)

tile_design_tensor(design, tile_nm, pixel_nm, stride_nm=None, drop_empty=True)

Cut a rasterized design into fixed-size tiles.

Returns [(tile_array, (x_pixels, y_pixels)), ...] where the second element is the tile's lower-left corner in pixel coordinates of the parent design. Tiles smaller than the requested size at the right/top edges are dropped — keeping ragged tiles would force the eval harness to handle variable-size inputs.

stride_nm defaults to tile_nm (non-overlapping grid).

drop_empty=True skips all-zero tiles. Routed layouts have huge empty regions outside the core; emitting thousands of zero tiles would dominate runtime without producing useful metrics.

Source code in src/openlithohub/data/orfs.py
def tile_design_tensor(
    design: np.ndarray[Any, Any],
    tile_nm: float,
    pixel_nm: float,
    stride_nm: float | None = None,
    drop_empty: bool = True,
) -> list[tuple[np.ndarray[Any, Any], tuple[int, int]]]:
    """Cut a rasterized design into fixed-size tiles.

    Returns ``[(tile_array, (x_pixels, y_pixels)), ...]`` where the
    second element is the tile's lower-left corner in pixel
    coordinates of the parent design. Tiles smaller than the requested
    size at the right/top edges are dropped — keeping ragged tiles
    would force the eval harness to handle variable-size inputs.

    ``stride_nm`` defaults to ``tile_nm`` (non-overlapping grid).

    ``drop_empty=True`` skips all-zero tiles. Routed layouts have huge
    empty regions outside the core; emitting thousands of zero tiles
    would dominate runtime without producing useful metrics.
    """
    tile_px = max(1, int(round(tile_nm / pixel_nm)))
    stride_px = max(1, int(round((stride_nm if stride_nm is not None else tile_nm) / pixel_nm)))
    h, w = design.shape
    tiles: list[tuple[np.ndarray[Any, Any], tuple[int, int]]] = []
    for y in range(0, h - tile_px + 1, stride_px):
        for x in range(0, w - tile_px + 1, stride_px):
            t = design[y : y + tile_px, x : x + tile_px]
            if drop_empty and not t.any():
                continue
            tiles.append((t.copy(), (x, y)))
    return tiles

openlithohub.data.transforms

Data transforms for resolution alignment and normalization.

align_resolution(tensor, source_pixel_nm, target_pixel_nm, mode='bilinear', *, binarize=False, binarize_threshold=0.5)

Resample a tensor to match target pixel resolution.

Parameters:

Name Type Description Default
tensor Tensor

Input tensor (H, W), (C, H, W), or (N, C, H, W).

required
source_pixel_nm float

Current pixel size in nanometers.

required
target_pixel_nm float

Desired pixel size in nanometers.

required
mode str

Interpolation mode ('bilinear', 'nearest', 'bicubic').

'bilinear'
binarize bool

If True, threshold the resampled output back to {0, 1} with > binarize_threshold. Use this when the input is a binary mask: bilinear / bicubic interpolation produces grayscale fringes along edges that downstream raster ops (DRC, MRC, contour trace) treat as foreground, inflating metrics by a few pixel widths. Skip when the input is a continuous field (aerial intensity, density map).

False
binarize_threshold float

Cutoff used when binarize=True.

0.5

Returns:

Type Description
Tensor

Resampled tensor at the target resolution; ndim matches input.

Notes

Output spatial dimensions are computed as round(H * source / target) and passed to F.interpolate via size=. The earlier scale_factor= form left the exact output size to the framework's rounding policy, which differs between PyTorch versions and between modes — explicit size keeps a (1024, 1024) layout aligning to a (2048, 2048) grid at 2× upsample regardless of build.

Source code in src/openlithohub/data/transforms.py
def align_resolution(
    tensor: torch.Tensor,
    source_pixel_nm: float,
    target_pixel_nm: float,
    mode: str = "bilinear",
    *,
    binarize: bool = False,
    binarize_threshold: float = 0.5,
) -> torch.Tensor:
    """Resample a tensor to match target pixel resolution.

    Args:
        tensor: Input tensor (H, W), (C, H, W), or (N, C, H, W).
        source_pixel_nm: Current pixel size in nanometers.
        target_pixel_nm: Desired pixel size in nanometers.
        mode: Interpolation mode ('bilinear', 'nearest', 'bicubic').
            Other 2D modes understood by ``F.interpolate`` such as
            'area' and 'nearest-exact' are passed through as well.
        binarize: If True, threshold the resampled output back to {0, 1}
            with ``> binarize_threshold``. Use this when the input is a
            binary mask: bilinear / bicubic interpolation produces
            grayscale fringes along edges that downstream raster ops
            (DRC, MRC, contour trace) treat as foreground, inflating
            metrics by a few pixel widths. Skip when the input is a
            continuous field (aerial intensity, density map).
        binarize_threshold: Cutoff used when ``binarize=True``.

    Returns:
        Resampled tensor at the target resolution; ndim matches input.
        When the two pixel sizes agree to within 1e-6 (relative scale),
        the input tensor object itself is returned untouched — no
        resampling and no thresholding is applied.

    Raises:
        ValueError: If either pixel size is not strictly positive, or
            if a resample is needed and ``tensor`` is not 2D, 3D or 4D.

    Notes:
        Output spatial dimensions are computed as
        ``round(H * source / target)`` and passed to ``F.interpolate``
        via ``size=``. The earlier ``scale_factor=`` form left the exact
        output size to the framework's rounding policy, which differs
        between PyTorch versions and between modes — explicit ``size``
        keeps a (1024, 1024) layout aligning to a (2048, 2048) grid at
        2× upsample regardless of build.
    """
    if source_pixel_nm <= 0 or target_pixel_nm <= 0:
        raise ValueError("Pixel sizes must be positive")

    scale = source_pixel_nm / target_pixel_nm
    if abs(scale - 1.0) < 1e-6:
        # Same grid already: hand back the caller's tensor as-is.
        return tensor

    # F.interpolate wants (N, C, H, W); pad leading dims and undo below.
    ndim = tensor.ndim
    if ndim == 2:
        x = tensor.unsqueeze(0).unsqueeze(0)
    elif ndim == 3:
        x = tensor.unsqueeze(0)
    elif ndim == 4:
        x = tensor
    else:
        raise ValueError(f"Expected 2D (H,W), 3D (C,H,W), or 4D (N,C,H,W) tensor, got {ndim}D")

    h_in, w_in = int(x.shape[-2]), int(x.shape[-1])
    h_out = max(1, int(round(h_in * scale)))
    w_out = max(1, int(round(w_in * scale)))

    # ``align_corners`` may only be set for the interpolating modes;
    # F.interpolate raises ValueError if it is passed (even as False)
    # with 'nearest', 'nearest-exact' or 'area'.
    interpolating_modes = ("linear", "bilinear", "bicubic", "trilinear")
    align_corners = False if mode in interpolating_modes else None
    x = f.interpolate(x, size=(h_out, w_out), mode=mode, align_corners=align_corners)

    if binarize:
        x = (x > binarize_threshold).to(x.dtype)

    if ndim == 2:
        return x.squeeze(0).squeeze(0)
    if ndim == 3:
        return x.squeeze(0)
    return x

normalize_to_binary(tensor, threshold=0.5)

Threshold a continuous tensor to binary (0/1).

Source code in src/openlithohub/data/transforms.py
def normalize_to_binary(tensor: torch.Tensor, threshold: float = 0.5) -> torch.Tensor:
    """Map a continuous tensor onto {0.0, 1.0} by strict thresholding.

    Elements strictly greater than ``threshold`` become 1.0; everything
    else (including values equal to the threshold) becomes 0.0. The
    result is a float32 tensor of the same shape.
    """
    above_cutoff = torch.gt(tensor, threshold)
    return above_cutoff.float()

openlithohub.data.dummy

Procedural dummy layout generator for CI, debugging, and onboarding.

These layouts are not representative of real cell libraries — they exist so that you can exercise the OpenLithoHub pipeline end-to-end without downloading LithoBench or LithoSim, and so CI can run hermetically without network or large data fixtures.

The generator only uses numpy and torch. It does not depend on KLayout or any of the heavy [workflow] extras, which keeps it usable in Colab and on minimal CI images.

DummyLayoutSpec dataclass

Parameters controlling the generated layout.

Source code in src/openlithohub/data/dummy.py
@dataclass(frozen=True)
class DummyLayoutSpec:
    """Parameters controlling the generated layout.

    Instances are immutable (``frozen=True``); build a new spec to
    change a value.
    """

    # Side length of the square canvas, in pixels.
    size: int = 256
    # Size of one pixel; divides the two ``*_nm`` rules below to get
    # their pixel equivalents.
    pixel_size_nm: float = 1.0
    # Minimum feature width rule, in the same unit as ``pixel_size_nm``.
    min_width_nm: float = 40.0
    # Minimum feature-to-feature spacing rule, same unit as above.
    min_spacing_nm: float = 40.0
    # Target fraction of canvas pixels to cover with rectangles.
    fill_ratio: float = 0.25
    # Seed for the random generator; NOTE(review): ``None`` presumably
    # yields a non-deterministic layout — confirm against the generator.
    seed: int | None = 0

generate_dummy_layout(spec=None, *, size=None, seed=None)

Generate a deterministic dummy binary layout that satisfies basic DRC.

The result is a 2D torch.Tensor of shape (size, size) with values in {0.0, 1.0}. Polygons are placed by random rectangle splatting and then cleaned with morphological opening/closing so that minimum width and spacing rules are met for the configured pixel pitch.

Parameters:

Name Type Description Default
spec DummyLayoutSpec | None

Full configuration; when omitted, a default 256 px / 40 nm spec is used. In either case, any size or seed keyword argument that is given overrides the corresponding field of the spec.

None
size int | None

Convenience override for spec.size.

None
seed int | None

Convenience override for spec.seed.

None

Returns:

Type Description
Tensor

Binary mask tensor of shape (size, size).

Examples:

>>> mask = generate_dummy_layout(size=128, seed=0)
>>> mask.shape
torch.Size([128, 128])
Source code in src/openlithohub/data/dummy.py
def generate_dummy_layout(
    spec: DummyLayoutSpec | None = None,
    *,
    size: int | None = None,
    seed: int | None = None,
) -> torch.Tensor:
    """Build a reproducible binary test layout from random rectangles.

    Rectangles are splatted onto an empty square canvas until the
    requested fill ratio is reached (or the attempt budget runs out),
    and the result is then passed through ``_enforce_min_rules`` with
    the minimum width and spacing expressed in pixels before being
    re-thresholded to {0.0, 1.0}.

    Args:
        spec: Base configuration; a default ``DummyLayoutSpec()`` is
            used when omitted.
        size: If given, replaces ``spec.size``.
        seed: If given, replaces ``spec.seed``.

    Returns:
        Float tensor of shape (size, size) containing only 0.0 and 1.0.

    Raises:
        ValueError: If the effective size is below 32 or the fill ratio
            is not strictly between 0 and 1.

    Examples:
        >>> mask = generate_dummy_layout(size=128, seed=0)
        >>> mask.shape
        torch.Size([128, 128])
    """
    cfg = DummyLayoutSpec() if spec is None else spec
    has_override = size is not None or seed is not None
    if has_override:
        # The spec is frozen, so overrides mean building a fresh one.
        cfg = DummyLayoutSpec(
            size=cfg.size if size is None else size,
            pixel_size_nm=cfg.pixel_size_nm,
            min_width_nm=cfg.min_width_nm,
            min_spacing_nm=cfg.min_spacing_nm,
            fill_ratio=cfg.fill_ratio,
            seed=cfg.seed if seed is None else seed,
        )

    side = cfg.size
    if side < 32:
        raise ValueError(f"size must be >= 32, got {side}")
    if not 0.0 < cfg.fill_ratio < 1.0:
        raise ValueError(f"fill_ratio must be in (0, 1), got {cfg.fill_ratio}")

    rng = np.random.default_rng(cfg.seed)
    canvas = np.zeros((side, side), dtype=np.float32)

    # Design rules converted from nm to pixels, never below 2 px.
    min_width_px = max(2, int(round(cfg.min_width_nm / cfg.pixel_size_nm)))
    min_spacing_px = max(2, int(round(cfg.min_spacing_nm / cfg.pixel_size_nm)))

    target_area = cfg.fill_ratio * side * side
    attempt_budget = 4000
    # Rectangle edge lengths are drawn from [edge_lo, edge_hi).
    edge_lo = min(min_width_px * 2, max(2, side // 8))
    edge_hi = max(edge_lo + 1, min(side // 4, side - 1))

    covered = 0
    for _ in range(attempt_budget):
        if not covered < target_area:
            break
        # Draw order (width, height, x, y) fixes the random stream and
        # therefore the layout produced for a given seed.
        rect_w = int(rng.integers(edge_lo, edge_hi))
        rect_h = int(rng.integers(edge_lo, edge_hi))
        if rect_w >= side or rect_h >= side:
            continue
        col = int(rng.integers(0, side - rect_w))
        row = int(rng.integers(0, side - rect_h))
        region = canvas[row : row + rect_h, col : col + rect_w]
        # Count only newly covered pixels so overlaps are not double-counted.
        covered += int((region == 0.0).sum())
        region[:] = 1.0

    cleaned = _enforce_min_rules(torch.from_numpy(canvas), min_width_px, min_spacing_px)
    return (cleaned > 0.5).float()

generate_dummy_pair(spec=None, **kwargs)

Generate a (design, mask) pair where the mask is a dilated design.

Useful for sanity-checking OPC pipelines without real ground truth.

Source code in src/openlithohub/data/dummy.py
def generate_dummy_pair(
    spec: DummyLayoutSpec | None = None, **kwargs: int | None
) -> tuple[torch.Tensor, torch.Tensor]:
    """Produce a synthetic ``(design, mask)`` pair for pipeline smoke tests.

    The design comes from ``generate_dummy_layout(spec, **kwargs)``; the
    mask is that same design passed through ``binary_dilation`` with
    ``radius=2``. Handy for exercising OPC pipelines when no real ground
    truth is available.
    """
    layout = generate_dummy_layout(spec, **kwargs)
    return layout, binary_dilation(layout, radius=2)