Skip to content

BUG: np.isin() raises OverflowError: Python int too large to convert to C long for np.uint64 arrays with numpy==2.0.0 #26922

@jparismorgan

Description

@jparismorgan

Describe the issue:

When I call np.isin() with test_elements of dtype uint64 in which two or more elements are greater than np.iinfo(np.intp).max, I get OverflowError: Python int too large to convert to C long.

Note that this happens with numpy==2.0.0 but not with numpy==1.26.3, so my guess is that it is caused by the changes to data type promotion.

Reproduce the code example:

import numpy as np

print()
print('np.iinfo(np.intp).max', np.iinfo(np.intp).max)
print('np.iinfo(np.uint32).max', np.iinfo(np.uint32).max)
print('np.iinfo(np.uint64).max', np.iinfo(np.uint64).max)

# a. Works:
a = np.array([0, 1], dtype=np.uint64)
b = np.array([np.iinfo(np.intp).max, np.iinfo(np.intp).max], dtype=np.uint64)
np.isin(a, b, assume_unique=True, invert=True)

# b. Works:
a = np.array([0, 1], dtype=np.uint64)
b = np.array([np.iinfo(np.intp).max, np.iinfo(np.intp).max + 1], dtype=np.uint64)
np.isin(a, b, assume_unique=True, invert=True)

# c. Fails with:
#     basic_mask = (ar1 <= ar2_max) & (ar1 >= ar2_min)
#     outgoing_array[basic_mask] = isin_helper_ar[
# >       np.subtract(ar1[basic_mask], ar2_min, dtype=np.intp)]
# E OverflowError: Python int too large to convert to C long
a = np.array([0, 1], dtype=np.uint64)
b = np.array([np.iinfo(np.intp).max + 1, np.iinfo(np.intp).max + 1], dtype=np.uint64)
np.isin(a, b, assume_unique=True, invert=True)

# d. Fails with the same error:
a = np.array([0, 1], dtype=np.uint64)
b = np.array([np.iinfo(np.uint64).max, np.iinfo(np.uint64).max], dtype=np.uint64)
np.isin(a, b, assume_unique=True, invert=True)

Error message:

        print()
        print('np.iinfo(np.intp).max', np.iinfo(np.intp).max)
        print('np.iinfo(np.uint32).max', np.iinfo(np.uint32).max)
        print('np.iinfo(np.uint64).max', np.iinfo(np.uint64).max)
    
        # Works:
        a = np.array([0, 1], dtype=np.uint64)
        b = np.array([1, np.iinfo(np.uint64).max - 1], dtype=np.uint64)
        np.isin(a, b, assume_unique=True, invert=True)
    
        # Works:
        a = np.array([0, 1], dtype=np.uint64)
        b = np.array([np.iinfo(np.uint32).max + 1, np.iinfo(np.uint32).max + 1], dtype=np.uint64)
        np.isin(a, b, assume_unique=True, invert=True)
    
        # Works:
        a = np.array([0, 1], dtype=np.uint64)
        b = np.array([np.iinfo(np.intp).max, np.iinfo(np.intp).max], dtype=np.uint64)
        np.isin(a, b, assume_unique=True, invert=True)
    
        # Fails:
        a = np.array([0, 1], dtype=np.uint64)
        b = np.array([np.iinfo(np.intp).max + 1, np.iinfo(np.intp).max + 1], dtype=np.uint64)
>       np.isin(a, b, assume_unique=True, invert=True)

apis/python/test/test_ingestion.py:1237: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
/opt/homebrew/anaconda3/envs/TileDB-Vector-Search-4/lib/python3.9/site-packages/numpy/lib/_arraysetops_impl.py:1081: in isin
    return _in1d(element, test_elements, assume_unique=assume_unique,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

ar1 = array([0, 1], dtype=uint64), ar2 = array([9223372036854775808, 9223372036854775808], dtype=uint64), assume_unique = True
invert = True

    def _in1d(ar1, ar2, assume_unique=False, invert=False, *, kind=None):
        # Ravel both arrays, behavior for the first array could be different
        ar1 = np.asarray(ar1).ravel()
        ar2 = np.asarray(ar2).ravel()
    
        # Ensure that iteration through object arrays yields size-1 arrays
        if ar2.dtype == object:
            ar2 = ar2.reshape(-1, 1)
    
        if kind not in {None, 'sort', 'table'}:
            raise ValueError(
                f"Invalid kind: '{kind}'. Please use None, 'sort' or 'table'.")
    
        # Can use the table method if all arrays are integers or boolean:
        is_int_arrays = all(ar.dtype.kind in ("u", "i", "b") for ar in (ar1, ar2))
        use_table_method = is_int_arrays and kind in {None, 'table'}
    
        if use_table_method:
            if ar2.size == 0:
                if invert:
                    return np.ones_like(ar1, dtype=bool)
                else:
                    return np.zeros_like(ar1, dtype=bool)
    
            # Convert booleans to uint8 so we can use the fast integer algorithm
            if ar1.dtype == bool:
                ar1 = ar1.astype(np.uint8)
            if ar2.dtype == bool:
                ar2 = ar2.astype(np.uint8)
    
            ar2_min = int(np.min(ar2))
            ar2_max = int(np.max(ar2))
    
            ar2_range = ar2_max - ar2_min
    
            # Constraints on whether we can actually use the table method:
            #  1. Assert memory usage is not too large
            below_memory_constraint = ar2_range <= 6 * (ar1.size + ar2.size)
            #  2. Check overflows for (ar2 - ar2_min); dtype=ar2.dtype
            range_safe_from_overflow = ar2_range <= np.iinfo(ar2.dtype).max
    
            # Optimal performance is for approximately
            # log10(size) > (log10(range) - 2.27) / 0.927.
            # However, here we set the requirement that by default
            # the intermediate array can only be 6x
            # the combined memory allocation of the original
            # arrays. See discussion on
            # https://github.com/numpy/numpy/pull/12065.
    
            if (
                range_safe_from_overflow and
                (below_memory_constraint or kind == 'table')
            ):
    
                if invert:
                    outgoing_array = np.ones_like(ar1, dtype=bool)
                else:
                    outgoing_array = np.zeros_like(ar1, dtype=bool)
    
                # Make elements 1 where the integer exists in ar2
                if invert:
                    isin_helper_ar = np.ones(ar2_range + 1, dtype=bool)
                    isin_helper_ar[ar2 - ar2_min] = 0
                else:
                    isin_helper_ar = np.zeros(ar2_range + 1, dtype=bool)
                    isin_helper_ar[ar2 - ar2_min] = 1
    
                # Mask out elements we know won't work
                basic_mask = (ar1 <= ar2_max) & (ar1 >= ar2_min)
                outgoing_array[basic_mask] = isin_helper_ar[
>                       np.subtract(ar1[basic_mask], ar2_min, dtype=np.intp)]
E               OverflowError: Python int too large to convert to C long

Python and NumPy Versions:

Python 3.9.18 (main, Sep 11 2023, 08:25:10) 
[Clang 14.0.6 ] :: Anaconda, Inc. on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import sys, numpy; print(numpy.__version__); print(sys.version)
2.0.0
3.9.18 (main, Sep 11 2023, 08:25:10) 
[Clang 14.0.6 ]

Runtime Environment:

[{
  'numpy_version': '2.0.0',
  'python': '3.9.18 (main, Sep 11 2023, 08:25:10) \n[Clang 14.0.6 ]',
  'uname': uname_result(
     system='Darwin', 
     node='Pariss-MacBook-Pro-2.local', 
     release='23.4.0', 
     version='Darwin Kernel Version 23.4.0: Fri Mar 15 00:12:49 PDT 2024; root:xnu-10063.101.17~1/RELEASE_ARM64_T6020', machine='arm64')
},
{
  'simd_extensions': {'baseline': ['NEON', 'NEON_FP16', 'NEON_VFPV4', 'ASIMD'], 'found': ['ASIMDHP'], 'not_found': ['ASIMDFHM']}
}]

Context for the issue:

The context is that I am updating TileDB Vector Search to use numpy 2, but code that used to work now fails.

Thank you for any help!

Metadata

Metadata

Assignees

No one assigned

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions