diff --git a/pyiceberg/catalog/rest/__init__.py b/pyiceberg/catalog/rest/__init__.py index d085c6fd87..190ca0b478 100644 --- a/pyiceberg/catalog/rest/__init__.py +++ b/pyiceberg/catalog/rest/__init__.py @@ -21,6 +21,7 @@ from typing import ( TYPE_CHECKING, Any, + TypeAlias, ) from urllib.parse import quote, unquote @@ -86,7 +87,7 @@ TableRequirement, TableUpdate, ) -from pyiceberg.typedef import EMPTY_DICT, UTF8, IcebergBaseModel, Identifier, Properties +from pyiceberg.typedef import EMPTY_DICT, UTF8, IcebergBaseModel, Identifier, PaginationList, Properties from pyiceberg.types import transform_dict_value_to_str from pyiceberg.utils.deprecated import deprecation_message from pyiceberg.utils.properties import get_first_property_value, get_header_properties, property_as_bool, property_as_int @@ -96,6 +97,8 @@ if TYPE_CHECKING: import pyarrow as pa +_PageFetchResult: TypeAlias = tuple[list[Identifier], str | None] + class HttpMethod(str, Enum): GET = "GET" @@ -1051,26 +1054,24 @@ def list_tables(self, namespace: str | Identifier) -> list[Identifier]: raise ValueError(f"{PAGE_SIZE} must be a positive integer") params["pageSize"] = str(page_size) - tables: list[Identifier] = [] - page_token: str | None = None - - while True: - if page_token: - params["pageToken"] = page_token + def _fetch_page(page_token: str) -> _PageFetchResult: + params["pageToken"] = page_token response = self._session.get(url, params=params) try: response.raise_for_status() except HTTPError as exc: _handle_non_200_response(exc, {404: NoSuchNamespaceError}) - parsed = ListTablesResponse.model_validate_json(response.text) - tables.extend([(*table.namespace, table.name) for table in parsed.identifiers]) + return [(*t.namespace, t.name) for t in parsed.identifiers], parsed.next_page_token - if not parsed.next_page_token: - break - page_token = parsed.next_page_token - - return tables + response = self._session.get(url, params=params) + try: + response.raise_for_status() + except HTTPError as exc: + 
_handle_non_200_response(exc, {404: NoSuchNamespaceError}) + parsed = ListTablesResponse.model_validate_json(response.text) + first_page: list[Identifier] = [(*t.namespace, t.name) for t in parsed.identifiers] + return PaginationList(first_page, parsed.next_page_token, _fetch_page) @retry(**_RETRY_ARGS) @override @@ -1165,27 +1166,24 @@ def list_views(self, namespace: str | Identifier) -> list[Identifier]: raise ValueError(f"{PAGE_SIZE} must be a positive integer") params["pageSize"] = str(page_size) - views: list[Identifier] = [] - page_token: str | None = None - - while True: - if page_token: - params["pageToken"] = page_token - + def _fetch_page(page_token: str) -> _PageFetchResult: + params["pageToken"] = page_token response = self._session.get(url, params=params) try: response.raise_for_status() except HTTPError as exc: _handle_non_200_response(exc, {404: NoSuchNamespaceError}) - parsed = ListViewsResponse.model_validate_json(response.text) - views.extend([(*view.namespace, view.name) for view in parsed.identifiers]) - - if not parsed.next_page_token: - break - page_token = parsed.next_page_token + return [(*v.namespace, v.name) for v in parsed.identifiers], parsed.next_page_token - return views + response = self._session.get(url, params=params) + try: + response.raise_for_status() + except HTTPError as exc: + _handle_non_200_response(exc, {404: NoSuchNamespaceError}) + parsed = ListViewsResponse.model_validate_json(response.text) + first_page: list[Identifier] = [(*v.namespace, v.name) for v in parsed.identifiers] + return PaginationList(first_page, parsed.next_page_token, _fetch_page) @retry(**_RETRY_ARGS) @override @@ -1279,6 +1277,7 @@ def drop_namespace(self, namespace: str | Identifier) -> None: def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: self._check_endpoint(Capability.V1_LIST_NAMESPACES) namespace_tuple = self.identifier_to_tuple(namespace) + namespaces_url = self.url(Endpoints.list_namespaces) params: dict[str, 
str] = {} page_size = property_as_int(self.properties, PAGE_SIZE, None) @@ -1286,30 +1285,26 @@ def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: if page_size <= 0: raise ValueError(f"{PAGE_SIZE} must be a positive integer") params["pageSize"] = str(page_size) + if namespace_tuple: + params["parent"] = self._encode_namespace_path(namespace_tuple) - namespaces: list[Identifier] = [] - page_token: str | None = None - - while True: - if namespace_tuple: - params["parent"] = self._encode_namespace_path(namespace_tuple) - if page_token: - params["pageToken"] = page_token - response = self._session.get(self.url(Endpoints.list_namespaces), params=params) - + def _fetch_page(page_token: str) -> _PageFetchResult: + params["pageToken"] = page_token + response = self._session.get(namespaces_url, params=params) try: response.raise_for_status() except HTTPError as exc: _handle_non_200_response(exc, {404: NoSuchNamespaceError}) - parsed = ListNamespaceResponse.model_validate_json(response.text) - namespaces.extend(parsed.namespaces) - - if not parsed.next_page_token: - break - page_token = parsed.next_page_token + return list(parsed.namespaces), parsed.next_page_token - return namespaces + response = self._session.get(namespaces_url, params=params) + try: + response.raise_for_status() + except HTTPError as exc: + _handle_non_200_response(exc, {404: NoSuchNamespaceError}) + parsed = ListNamespaceResponse.model_validate_json(response.text) + return PaginationList(list(parsed.namespaces), parsed.next_page_token, _fetch_page) @retry(**_RETRY_ARGS) @override diff --git a/pyiceberg/typedef.py b/pyiceberg/typedef.py index 6989144ef9..fc5051ab87 100644 --- a/pyiceberg/typedef.py +++ b/pyiceberg/typedef.py @@ -17,7 +17,7 @@ from __future__ import annotations from abc import abstractmethod -from collections.abc import Callable +from collections.abc import Callable, Iterator from datetime import date, datetime, time from decimal import Decimal from typing 
import ( @@ -26,9 +26,11 @@ Generic, Literal, Protocol, + SupportsIndex, TypeAlias, TypeVar, Union, + overload, runtime_checkable, ) from uuid import UUID @@ -211,3 +213,95 @@ def __hash__(self) -> int: TableVersion: TypeAlias = Literal[1, 2, 3] ViewVersion: TypeAlias = Literal[1] + + +class PaginationList(list[T]): + """A list that lazily fetches subsequent pages from a paginated API. + + The first page is pre-loaded on construction. Subsequent pages are only + fetched when the caller iterates past items already in memory. Operations + that require the complete result set — ``len()``, ``in``, slicing, + ``repr()`` — trigger a full fetch of all remaining pages. + + Args: + first_page: Items from the first API response. + next_page_token: Pagination token returned with the first response, + or ``None`` if no further pages exist. + fetch_next_page: Callable that accepts a page token and returns a + tuple of ``(items, next_page_token_or_None)``. + """ + + def __init__( + self, + first_page: list[T], + next_page_token: str | None, + fetch_next_page: Callable[[str], tuple[list[T], str | None]], + ) -> None: + super().__init__(first_page) + self._next_page_token = next_page_token + self._fetch_next_page = fetch_next_page + + def _fetch_all(self) -> None: + while self._next_page_token: + items, self._next_page_token = self._fetch_next_page(self._next_page_token) + list.extend(self, items) + + def _fetch_through_index(self, idx: int) -> None: + while list.__len__(self) <= idx and self._next_page_token: + items, self._next_page_token = self._fetch_next_page(self._next_page_token) + list.extend(self, items) + + def __iter__(self) -> Iterator[T]: + """Iterate lazily, fetching pages only as the caller advances.""" + idx = 0 + while True: + if idx < list.__len__(self): + yield list.__getitem__(self, idx) + idx += 1 + elif self._next_page_token: + items, self._next_page_token = self._fetch_next_page(self._next_page_token) + list.extend(self, items) + else: + return + + def 
__len__(self) -> int: + """Return the total number of items, fetching all pages first.""" + self._fetch_all() + return list.__len__(self) + + def __contains__(self, item: object) -> bool: + """Return True if item is present, fetching all pages first.""" + self._fetch_all() + return list.__contains__(self, item) + + def __repr__(self) -> str: + """Return string representation after fetching all pages.""" + self._fetch_all() + return f"PaginationList({list.__repr__(self)})" + + def __eq__(self, other: object) -> bool: + """Compare equality after fetching all pages.""" + self._fetch_all() + return list.__eq__(self, other) + + def __ne__(self, other: object) -> bool: + """Compare inequality after fetching all pages.""" + return not self.__eq__(other) + + @overload + def __getitem__(self, idx: SupportsIndex) -> T: ... # noqa: D105 + + @overload + def __getitem__(self, idx: slice) -> list[T]: ... # noqa: D105 + + def __getitem__(self, idx: SupportsIndex | slice) -> T | list[T]: + """Fetch pages as needed before returning the requested item(s).""" + if isinstance(idx, slice): + self._fetch_all() + else: + i = idx.__index__() + if i < 0: + self._fetch_all() + else: + self._fetch_through_index(i) + return list.__getitem__(self, idx) diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index 1eb9f26a56..dd62c00a89 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -68,6 +68,7 @@ from pyiceberg.typedef import RecursiveDict from pyiceberg.types import StringType from pyiceberg.utils.config import Config +from pyiceberg.typedef import PaginationList from pyiceberg.view import View from pyiceberg.view.metadata import ViewMetadata, ViewVersion @@ -529,6 +530,62 @@ def test_list_tables_paginated_200(rest_mock: Mocker) -> None: ] +def test_list_tables_returns_pagination_list(rest_mock: Mocker) -> None: + """list_tables returns a PaginationList that defers fetching page 2.""" + namespace = "examples" + + rest_mock.get( + 
f"{TEST_URI}v1/namespaces/{namespace}/tables", + json={ + "identifiers": [ + {"namespace": ["examples"], "name": "table1"}, + {"namespace": ["examples"], "name": "table2"}, + ], + "next-page-token": "pagetoken", + }, + status_code=200, + request_headers=TEST_HEADERS, + ) + # Second page — registered but should only be called when iterated past page 1. + rest_mock.get( + f"{TEST_URI}v1/namespaces/{namespace}/tables?pageToken=pagetoken", + json={ + "identifiers": [ + {"namespace": ["examples"], "name": "table3"}, + ], + }, + status_code=200, + request_headers=TEST_HEADERS, + ) + + catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) + calls_after_init = rest_mock.call_count # config endpoint called during __init__ + + result = catalog.list_tables(namespace) + + assert isinstance(result, PaginationList) + + # Consuming only the first two items must not trigger the second HTTP request. + first_two = [] + for item in result: + first_two.append(item) + if len(first_two) == 2: + break + + assert first_two == [("examples", "table1"), ("examples", "table2")] + # Only the initial list_tables request should have been made beyond __init__. + assert rest_mock.call_count == calls_after_init + 1 + + # Consuming all items forces the second request. + all_tables = list(result) + assert all_tables == [ + ("examples", "table1"), + ("examples", "table2"), + ("examples", "table3"), + ] + assert rest_mock.call_count == calls_after_init + 2 + + def test_list_tables_paginated_200_none_next_page_token(rest_mock: Mocker) -> None: namespace = "examples" # First page with next-page-token diff --git a/tests/utils/test_pagination.py b/tests/utils/test_pagination.py new file mode 100644 index 0000000000..a1a86f5411 --- /dev/null +++ b/tests/utils/test_pagination.py @@ -0,0 +1,214 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
# See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import annotations

from pyiceberg.typedef import PaginationList


def _simple_pagination_list(
    pages: list[list[int]],
) -> tuple[PaginationList[int], list[int]]:
    """Build a PaginationList over ``pages`` and the flattened items it should yield.

    Page ``i`` (for ``i >= 1``) is served in response to the token ``f"tok{i}"``;
    the last page is returned together with a ``None`` token.

    Args:
        pages: Non-empty list of pages; ``pages[0]`` is pre-loaded as the first page.

    Returns:
        A tuple of the PaginationList and the list of all items in page order.
    """
    all_items = [item for page in pages for item in page]
    # next_tokens[i] is the token handed out together with page i.
    next_tokens: list[str | None] = [f"tok{i}" for i in range(1, len(pages))]
    next_tokens.append(None)

    def fetch(token: str) -> tuple[list[int], str | None]:
        page_idx = int(token.replace("tok", ""))
        return pages[page_idx], next_tokens[page_idx]

    first_token = "tok1" if len(pages) > 1 else None
    return PaginationList(pages[0], first_token, fetch), all_items


# ---------------------------------------------------------------------------
# Single-page: behaves like a plain list
# ---------------------------------------------------------------------------


def test_single_page_iteration() -> None:
    """Iterating a single-page list yields exactly that page."""
    pl, expected = _simple_pagination_list([[1, 2, 3]])
    assert list(pl) == expected


def test_single_page_len() -> None:
    """len() of a single-page list is the size of that page."""
    pl, expected = _simple_pagination_list([[1, 2, 3]])
    assert len(pl) == len(expected)


def test_single_page_contains() -> None:
    """Membership works for present and absent items on a single page."""
    pl, _ = _simple_pagination_list([[1, 2, 3]])
    assert 2 in pl
    assert 99 not in pl


def test_single_page_getitem() -> None:
    """Positive and negative indexing work on a single page."""
    pl, _ = _simple_pagination_list([[10, 20, 30]])
    assert pl[0] == 10
    assert pl[2] == 30
    assert pl[-1] == 30


def test_single_page_slice() -> None:
    """Slicing a single-page list returns the expected sub-list."""
    pl, _ = _simple_pagination_list([[1, 2, 3, 4]])
    assert pl[1:3] == [2, 3]


def test_single_page_is_list_subclass() -> None:
    """PaginationList instances pass isinstance checks against list."""
    pl, _ = _simple_pagination_list([[1, 2]])
    assert isinstance(pl, list)


# ---------------------------------------------------------------------------
# Multi-page: lazily fetches subsequent pages
# ---------------------------------------------------------------------------


def test_multi_page_iteration_fetches_all() -> None:
    """Full iteration walks through every page in order."""
    pl, expected = _simple_pagination_list([[1, 2], [3, 4], [5]])
    assert list(pl) == expected


def test_multi_page_partial_iteration_stops_early() -> None:
    """Breaking out of iteration within the first page triggers no fetch."""
    fetched: list[str] = []

    def fetch(token: str) -> tuple[list[int], str | None]:
        fetched.append(token)
        return [3, 4], None

    pl: PaginationList[int] = PaginationList([1, 2], "tok1", fetch)

    result = []
    for item in pl:
        result.append(item)
        if item == 2:
            break

    assert result == [1, 2]
    assert fetched == [], "No fetch should have occurred for first-page items"


def test_multi_page_iteration_triggers_fetch_when_needed() -> None:
    """Full iteration requests each follow-up token exactly once, in order."""
    fetched_tokens: list[str] = []

    def fetch(token: str) -> tuple[list[int], str | None]:
        fetched_tokens.append(token)
        if token == "tok1":
            return [3], "tok2"
        return [4, 5], None

    pl: PaginationList[int] = PaginationList([1, 2], "tok1", fetch)
    assert list(pl) == [1, 2, 3, 4, 5]
    assert fetched_tokens == ["tok1", "tok2"]


def test_multi_page_len_fetches_all() -> None:
    """len() counts items across all pages."""
    pl, expected = _simple_pagination_list([[1, 2], [3]])
    assert len(pl) == len(expected)


def test_multi_page_contains_fetches_all() -> None:
    """Membership sees items that live on later pages."""
    pl, _ = _simple_pagination_list([[1, 2], [3, 4]])
    assert 4 in pl
    assert 99 not in pl


def test_multi_page_getitem_fetches_lazily() -> None:
    """Positive indexing fetches only the pages needed to reach the index."""
    fetched: list[str] = []

    def fetch(token: str) -> tuple[list[int], str | None]:
        fetched.append(token)
        if token == "tok1":
            return [3, 4], "tok2"
        return [5], None

    pl: PaginationList[int] = PaginationList([1, 2], "tok1", fetch)

    assert pl[0] == 1
    assert fetched == []

    assert pl[2] == 3
    assert fetched == ["tok1"]

    assert pl[4] == 5
    assert fetched == ["tok1", "tok2"]


def test_multi_page_negative_index_fetches_all() -> None:
    """A negative index resolves against the complete result set."""
    pl, _ = _simple_pagination_list([[1, 2], [3, 4, 5]])
    assert pl[-1] == 5


def test_multi_page_slice_fetches_all() -> None:
    """A slice may span a page boundary."""
    pl, _ = _simple_pagination_list([[1, 2], [3, 4]])
    assert pl[1:3] == [2, 3]


def test_multi_page_repr_fetches_all() -> None:
    """repr() includes items from later pages."""
    pl, _ = _simple_pagination_list([[1], [2]])
    r = repr(pl)
    assert "1" in r and "2" in r


def test_multi_page_empty_first_page() -> None:
    """An empty first page with a pending token still yields later items."""
    pl, expected = _simple_pagination_list([[], [1, 2]])
    assert list(pl) == expected


def test_multi_page_equality_with_plain_list() -> None:
    """Equality against a plain list compares the complete result set."""
    pl, expected = _simple_pagination_list([[1, 2], [3]])
    assert pl == expected


# ---------------------------------------------------------------------------
# Performance: len() should not eagerly fetch all pages for a single-page list
# ---------------------------------------------------------------------------


def test_performance_len_single_page_makes_no_extra_fetches() -> None:
    """len() on a list without a page token never calls the fetcher."""
    fetched: list[str] = []

    def fetch(token: str) -> tuple[list[int], str | None]:
        fetched.append(token)
        return [], None

    pl: PaginationList[int] = PaginationList([1, 2, 3], None, fetch)
    assert len(pl) == 3
    assert fetched == [], "No fetch should occur for a single-page list"


def test_performance_len_multi_page_fetches_all_pages_once() -> None:
    """len() fetches each remaining page once and reuses the result afterwards."""
    fetch_count = 0

    def fetch(token: str) -> tuple[list[int], str | None]:
        nonlocal fetch_count
        fetch_count += 1
        if token == "p2":
            return [3, 4], "p3"
        return [5], None

    pl: PaginationList[int] = PaginationList([1, 2], "p2", fetch)
    assert len(pl) == 5
    assert fetch_count == 2, "Should fetch pages 2 and 3 exactly once each"

    fetch_count = 0
    assert len(pl) == 5
    assert fetch_count == 0, "Second len() should use cached data"