Skip to content

Leaderboard

openlithohub.leaderboard.schema

Pydantic schemas for leaderboard entries and submissions.

ProcessNode

Bases: str, Enum

Supported process technology nodes.

Source code in src/openlithohub/leaderboard/schema.py
class ProcessNode(str, Enum):
    """Supported process technology nodes.

    Members mix in ``str``, so each member compares equal to its string
    value (e.g. ``ProcessNode.N7 == "7nm"``). The string values are what
    gets written when an entry is dumped in JSON mode.
    """

    # Values without a suffix.
    N45 = "45nm"
    N28 = "28nm"
    N7 = "7nm"
    # Values carrying an explicit "-euv" suffix.
    N5_EUV = "5nm-euv"
    N3_EUV = "3nm-euv"
    N2_EUV = "2nm-euv"

MaskTopology

Bases: str, Enum

Mask shape classification.

Source code in src/openlithohub/leaderboard/schema.py
class MaskTopology(str, Enum):
    """Mask shape classification.

    Members mix in ``str``, so each member compares equal to its string
    value (e.g. ``MaskTopology.MANHATTAN == "manhattan"``).
    """

    MANHATTAN = "manhattan"
    CURVILINEAR = "curvilinear"

LeaderboardTrack

Bases: str, Enum

Leaderboard track. Default is the open ongoing competition.

Hackathon tracks scope a fixed dataset + node + frozen test split for a bounded period. Entries marked with a hackathon track are displayed in their own ranked table on the website and never mix with the open leaderboard. See docs/hackathon.md.

Source code in src/openlithohub/leaderboard/schema.py
class LeaderboardTrack(str, Enum):
    """Leaderboard track. Default is the open ongoing competition.

    Hackathon tracks scope a fixed dataset + node + frozen test split for
    a bounded period. Entries marked with a hackathon track are
    displayed in their own ranked table on the website and never mix
    with the open leaderboard. See ``docs/hackathon.md``.

    Members mix in ``str``, so each member compares equal to its string
    value (e.g. ``LeaderboardTrack.OPEN == "open"``).
    """

    # The open, ongoing competition.
    OPEN = "open"
    # A specific hackathon round; the value names the round.
    HACKATHON_2026Q3 = "hackathon-2026q3"

BenchmarkResult

Bases: BaseModel

A single benchmark submission for the leaderboard.

The leaderboard ingests this schema from community pull requests via the auto-leaderboard workflow. The schema is the only firewall between PR-supplied YAML and the canonical store, so it forbids extra fields, bounds string lengths, and validates URL fields.

Source code in src/openlithohub/leaderboard/schema.py
class BenchmarkResult(BaseModel):
    """A single benchmark submission for the leaderboard.

    The leaderboard ingests this schema from community pull requests via the
    ``auto-leaderboard`` workflow. The schema is the only firewall between
    PR-supplied YAML and the canonical store, so it forbids extra fields,
    bounds string lengths, and validates URL fields.

    Required fields are ``model_name``, ``dataset``, ``process_node``,
    ``mask_topology``, ``epe_mean_nm`` and ``epe_max_nm``; every other field
    has a default. All numeric metrics are constrained to be non-negative,
    and rate-style metrics are additionally capped at 1.
    """

    # extra="forbid": unknown keys in a submission are a validation error.
    # protected_namespaces=(): allows the ``model_name`` field, which would
    # otherwise collide with pydantic's reserved ``model_`` prefix.
    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    model_name: str = Field(
        ..., min_length=1, max_length=120, description="Name of the evaluated model"
    )
    dataset: str = Field(
        ...,
        min_length=1,
        max_length=120,
        description=("Dataset used (lithobench / lithosim / asap7 / freepdk45 / orfs)."),
    )
    process_node: ProcessNode
    mask_topology: MaskTopology
    track: LeaderboardTrack = Field(
        LeaderboardTrack.OPEN,
        description="Leaderboard track (open or a specific hackathon round).",
    )

    # Mask-level EPE — kept as a sanity baseline. NOT canonical: an Identity
    # model scores 0 here by construction. The leaderboard ranks on
    # ``l2_error_pixels`` (Neural-ILT contract), with ``pvband_mean_nm`` as
    # the secondary key. See ``benchmark/metrics/l2_error.py``.
    epe_mean_nm: float = Field(
        ..., ge=0, description="Mean mask-level EPE in nm (sanity, not the ranking key)."
    )
    epe_max_nm: float = Field(..., ge=0, description="Max mask-level EPE in nm.")

    # Wafer-level metrics — printability after forward simulation. These are
    # the physically meaningful figures and feed the leaderboard ranking.
    epe_wafer_mean_nm: float | None = Field(
        None, ge=0, description="Mean wafer-level EPE in nm (post forward-sim)."
    )
    epe_wafer_max_nm: float | None = Field(
        None, ge=0, description="Max wafer-level EPE in nm (post forward-sim)."
    )
    l2_error_pixels: float | None = Field(
        None,
        ge=0,
        description=(
            "Neural-ILT canonical printability scalar: |wafer - target| summed "
            "over pixels (technically L1; named per the Neural-ILT paper). "
            "Primary leaderboard ranking key."
        ),
    )
    l2_error_nm2: float | None = Field(
        None, ge=0, description="``l2_error_pixels`` converted to nm² area."
    )

    pvband_mean_nm: float | None = Field(None, ge=0, description="Mean PV band width (nm)")
    pvband_max_nm: float | None = Field(None, ge=0, description="Max PV band width (nm)")
    mrc_violation_rate: float | None = Field(None, ge=0, le=1)
    drc_pass: bool | None = None
    shot_count: int | None = Field(None, ge=0)
    stochastic_robustness: float | None = Field(None, ge=0, le=1)
    resist_diffusion_nm: float | None = Field(
        None,
        ge=0,
        description=(
            "Acid diffusion length used during evaluation. Must be 0.0 (or "
            "None) for leaderboard-eligible submissions; positive values are "
            "non-comparable with the canonical CTR baseline."
        ),
    )

    # Number of samples behind the aggregated metrics. Recorded so future
    # migrations can detect (and if needed re-normalize) entries written
    # under different aggregation conventions — see schema v3 migration.
    num_samples: int | None = Field(None, ge=0)

    # Per-metric counts of samples whose value was non-finite (``nan`` /
    # ``inf``) and was therefore excluded from the aggregate. Surfaces
    # eval-time dataset noise on the leaderboard so a quietly-broken run
    # doesn't sit next to a clean one with no indication. Keys match the
    # aggregated metric names (e.g. ``epe_wafer_mean_nm``).
    dropped_nonfinite: dict[str, int] | None = Field(default=None)

    # Defaults to the current time in UTC (timezone-aware) at construction.
    submitted_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    submission_id: str | None = Field(
        None, max_length=64, description="Auto-assigned submission ID (read-only)."
    )
    paper_url: str | None = Field(None, max_length=2048)
    code_url: str | None = Field(None, max_length=2048)
    notes: str | None = Field(None, max_length=2000)

    @field_validator("paper_url", "code_url")
    @classmethod
    def _validate_url(cls, v: str | None) -> str | None:
        """Validate an optional link field and return it unchanged.

        Args:
            v: The submitted URL, or ``None`` when the field is omitted.

        Returns:
            ``v`` exactly as given (no normalization is applied).

        Raises:
            ValueError: If the URL contains whitespace, cannot be parsed,
                does not use ``http``/``https``, has no host, embeds
                credentials, has a hostname without a dot or with empty
                labels, or has a non-numeric / out-of-range port.
        """
        if v is None:
            return v
        # Strict URL validation: parse and require https/http scheme, a
        # non-empty network location, no embedded user:password, and no
        # whitespace anywhere. The previous prefix-only check accepted
        # malformed strings like "http://", "http:// foo", or
        # "https://user:pass@evil/" — fine as text, but unusable as a
        # link and a phishing vector when surfaced in the leaderboard UI.
        from urllib.parse import urlparse

        if any(ch.isspace() for ch in v):
            raise ValueError("URL must not contain whitespace")
        try:
            parsed = urlparse(v)
        except ValueError as exc:
            raise ValueError(f"Invalid URL: {exc}") from exc
        if parsed.scheme not in ("http", "https"):
            raise ValueError("URL must use http:// or https:// scheme")
        if not parsed.netloc:
            raise ValueError("URL must include a host")
        if "@" in parsed.netloc:
            raise ValueError("URL must not include user:password credentials")
        hostname = parsed.hostname
        if hostname is None or "." not in hostname:
            raise ValueError("URL hostname must contain at least one '.'")
        # A dot alone is not enough: "https://./", "https://.com/" and
        # "https://a..b/" all contain one but have empty DNS labels and can
        # never resolve. A single trailing dot (fully-qualified form) is
        # tolerated, so strip it before splitting.
        bare_host = hostname[:-1] if hostname.endswith(".") else hostname
        if any(not label for label in bare_host.split(".")):
            raise ValueError("URL hostname must not contain empty labels")
        # ``urlparse`` parses the port lazily: ``.port`` raises ValueError
        # for a non-numeric or out-of-range port, so touch it here to reject
        # links like "https://example.com:notaport/".
        try:
            _ = parsed.port
        except ValueError as exc:
            raise ValueError("URL port must be an integer in the range 0-65535") from exc
        return v

    @field_validator("submission_id")
    @classmethod
    def _validate_submission_id(cls, v: str | None) -> str | None:
        """Validate an optional submission ID and return it unchanged.

        Args:
            v: The submission ID, or ``None`` when not yet assigned.

        Returns:
            ``v`` exactly as given.

        Raises:
            ValueError: If the ID is empty or contains a character that is
                neither alphanumeric nor ``-`` / ``_``.
        """
        # Submission IDs are surfaced in URLs and filesystem paths; constrain
        # to a safe charset (alnum + dash + underscore) so a hostile ID can't
        # path-traverse out of the submissions/ directory or break URL routing.
        if v is None:
            return v
        if not v:
            raise ValueError("submission_id must not be empty")
        # NOTE(review): ``str.isalnum`` is Unicode-aware, so non-ASCII letters
        # and digits are accepted here as well. Confirm whether IDs should be
        # restricted to ASCII before tightening — existing stored IDs may
        # depend on the current behavior.
        if not all(c.isalnum() or c in "-_" for c in v):
            raise ValueError(
                "submission_id must contain only alphanumeric characters, dashes, and underscores"
            )
        return v

openlithohub.leaderboard.tracker

SOTA tracking and leaderboard management.

The store is a single JSON file shared across CLI invocations and the Spaces app. Read-modify-write therefore needs two safeguards: (1) a POSIX advisory lock (fcntl.flock) on a sidecar .lock file, so that two concurrent submitters serialize; and (2) an atomic rename (tempfile + os.replace), so that the file is never partially written.

Submission IDs are stored under the public submission_id field of BenchmarkResult so they round-trip through model_validate.

LeaderboardStore

JSON file-backed leaderboard data store.

Source code in src/openlithohub/leaderboard/tracker.py
class LeaderboardStore:
    """JSON file-backed leaderboard data store.

    Entries live in a single JSON document of the form
    ``{"schema_version": <int>, "entries": [...]}``. Files written as a
    bare top-level list are still readable and are treated as version 0.
    """

    def __init__(self, path: Path | None = None) -> None:
        """Resolve the backing file location.

        Args:
            path: Explicit file path. When ``None``, the
                ``OPENLITHOHUB_LEADERBOARD_PATH`` environment variable is
                used if it is set and non-empty; otherwise the module
                default directory and filename are used.
        """
        if path is not None:
            self._path = Path(path)
        else:
            env_path = os.environ.get("OPENLITHOHUB_LEADERBOARD_PATH")
            if env_path:
                self._path = Path(env_path)
            else:
                self._path = _DEFAULT_LEADERBOARD_DIR / _LEADERBOARD_FILENAME

    @property
    def path(self) -> Path:
        """Path of the JSON file backing this store."""
        return self._path

    @property
    def _lock_path(self) -> Path:
        """Sidecar lock file: the store's filename with ``.lock`` appended."""
        return self._path.with_suffix(self._path.suffix + ".lock")

    def _read_entries(self) -> list[dict[str, Any]]:
        """Load the raw entries from disk, migrated to the current schema.

        Returns:
            The migrated entry dicts, or an empty list when the file does
            not exist.

        Raises:
            ValueError: If the top-level JSON value is neither an object
                nor a list, if ``schema_version`` cannot be converted to an
                integer, or if ``entries`` is not a list.
        """
        if not self._path.exists():
            return []
        text = self._path.read_text(encoding="utf-8")
        data = json.loads(text)
        # Pre-schema-versioned files were written as a top-level JSON list.
        # Treat that as version 0 so the migration path is a single
        # well-defined funnel and a corrupted file is a clear error rather
        # than an `AttributeError` deep inside `.get`.
        if isinstance(data, list):
            return _migrate_entries(data, from_version=0)
        if not isinstance(data, dict):
            raise ValueError(
                f"Leaderboard file at {self._path} is neither a JSON object nor a list "
                f"(got {type(data).__name__}); refusing to load."
            )
        # A corrupted version marker (null, a list, a non-numeric string)
        # would otherwise escape as a bare TypeError/ValueError from ``int``
        # with no hint about which file is broken. Anything ``int`` accepts
        # is still accepted, so well-formed files load exactly as before.
        raw_version = data.get("schema_version", 1)
        try:
            version = int(raw_version)
        except (TypeError, ValueError) as exc:
            raise ValueError(
                f"Leaderboard file at {self._path} has an invalid 'schema_version' "
                f"({raw_version!r}); refusing to load."
            ) from exc
        entries = data.get("entries", [])
        if not isinstance(entries, list):
            raise ValueError(
                f"Leaderboard file at {self._path} has a non-list 'entries' "
                f"(got {type(entries).__name__}); refusing to load."
            )
        return _migrate_entries(entries, from_version=version)

    def _write_entries(self, entries: list[dict[str, Any]]) -> None:
        """Replace the store file with ``entries`` at the current schema version.

        The payload is written to a temporary file in the same directory
        and then moved over the target with ``os.replace``. On failure the
        temporary file is removed and the original exception is re-raised.

        Args:
            entries: Entry dicts to persist. Values that ``json`` cannot
                serialize natively are converted with ``str``.
        """
        self._path.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps(
            {"schema_version": LEADERBOARD_SCHEMA_VERSION, "entries": entries},
            indent=2,
            default=str,
        )
        # tempfile in the same directory so os.replace is atomic across the
        # whole sequence (cross-device rename would otherwise be a copy).
        fd, tmp_name = tempfile.mkstemp(
            prefix=self._path.name + ".", suffix=".tmp", dir=str(self._path.parent)
        )
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                f.write(payload)
            os.replace(tmp_name, self._path)
        except Exception:
            # The temp file may already be gone if the replace itself
            # succeeded partway; ignore that and surface the real error.
            with contextlib.suppress(FileNotFoundError):
                os.unlink(tmp_name)
            raise

    def submit(self, result: BenchmarkResult) -> str:
        """Append ``result`` to the store and return its new submission ID.

        The eligibility checks run before the lock is taken. The
        read-append-write sequence runs while holding the sidecar file
        lock. Any ``submission_id`` already present on ``result`` is
        overwritten with a freshly generated one.

        Args:
            result: The validated submission to record.

        Returns:
            The generated submission ID.
        """
        _require_forward_simulation(result)
        _require_no_diffusion(result)
        with _file_lock(self._lock_path):
            entries = self._read_entries()
            submission_id = _generate_id(result.model_name)
            entry = result.model_dump(mode="json")
            entry["submission_id"] = submission_id
            entries.append(entry)
            self._write_entries(entries)
        return submission_id

    def query(
        self,
        dataset: str | None = None,
        process_node: str | None = None,
    ) -> list[BenchmarkResult]:
        """Return stored entries, optionally filtered, in ranking order.

        Every stored entry is validated against ``BenchmarkResult`` before
        the filters are applied, so one invalid entry fails the whole query.

        Args:
            dataset: Keep only entries whose ``dataset`` equals this value.
                A falsy value (``None`` or ``""``) disables the filter.
            process_node: Keep only entries whose process node *value*
                (e.g. ``"7nm"``) equals this string. A falsy value disables
                the filter.

        Returns:
            Matching entries sorted with ``_ranking_key``.
        """
        entries = self._read_entries()
        results: list[BenchmarkResult] = []
        for entry in entries:
            r = BenchmarkResult.model_validate(entry)
            if dataset and r.dataset != dataset:
                continue
            if process_node and r.process_node.value != process_node:
                continue
            results.append(r)
        # Rank by the canonical Neural-ILT printability scalar
        # (``l2_error_pixels``), then PV-band mean as a tiebreaker.
        # Entries written before v2 (no wafer fields) sort to the bottom.
        # ``epe_mean_nm`` is mask-level — kept on the entry for sanity but
        # NEVER used as the primary key, because Identity models score 0.
        results.sort(key=_ranking_key)
        return results

submit_result(result, *, store=None)

Submit a benchmark result to the leaderboard.

Parameters:

Name Type Description Default
result BenchmarkResult

Validated BenchmarkResult entry.

required
store LeaderboardStore | None

Optional explicit store (for testing). Uses default if None.

None

Returns:

Type Description
str

Submission ID for tracking.

Source code in src/openlithohub/leaderboard/tracker.py
def submit_result(result: BenchmarkResult, *, store: LeaderboardStore | None = None) -> str:
    """Record a benchmark result on the leaderboard.

    Args:
        result: Validated BenchmarkResult entry.
        store: Store to write to (handy for tests). When omitted, the
            default store from ``_get_store()`` is used.

    Returns:
        The submission ID handed back by the store.
    """
    target_store = store or _get_store()
    submission_id = target_store.submit(result)
    return submission_id

get_leaderboard(dataset=None, process_node=None, *, store=None)

Retrieve current leaderboard entries with optional filtering.

Parameters:

Name Type Description Default
dataset str | None

Filter by dataset name.

None
process_node str | None

Filter by process node.

None
store LeaderboardStore | None

Optional explicit store (for testing). Uses default if None.

None

Returns:

Type Description
list[BenchmarkResult]

Sorted list of BenchmarkResult entries (by L2 ascending, then PV-band mean, then wafer-EPE mean — see _ranking_key).

Source code in src/openlithohub/leaderboard/tracker.py
def get_leaderboard(
    dataset: str | None = None,
    process_node: str | None = None,
    *,
    store: LeaderboardStore | None = None,
) -> list[BenchmarkResult]:
    """Fetch the current leaderboard, optionally narrowed by filters.

    Args:
        dataset: Filter by dataset name.
        process_node: Filter by process node.
        store: Store to read from (handy for tests). When omitted, the
            default store from ``_get_store()`` is used.

    Returns:
        Sorted list of BenchmarkResult entries (by L2 ascending, then PV-band mean, then wafer-EPE mean — see ``_ranking_key``).
    """
    backing_store = store or _get_store()
    ranked = backing_store.query(dataset=dataset, process_node=process_node)
    return ranked