Describe the issue:
When I call np.isin() with test_elements of dtype uint64 in which two or more elements are greater than np.iinfo(np.intp).max, I get OverflowError: Python int too large to convert to C long.
Note that this happens with numpy==2.0.0, but not with numpy==1.26.3, so my guess is that it's perhaps because of changes to data type promotion.
Reproduce the code example:
import numpy as np
print()
print('np.iinfo(np.intp).max', np.iinfo(np.intp).max)
print('np.iinfo(np.uint32).max', np.iinfo(np.uint32).max)
print('np.iinfo(np.uint64).max', np.iinfo(np.uint64).max)
# a. Works:
a = np.array([0, 1], dtype=np.uint64)
b = np.array([np.iinfo(np.intp).max, np.iinfo(np.intp).max], dtype=np.uint64)
np.isin(a, b, assume_unique=True, invert=True)
# b. Works:
a = np.array([0, 1], dtype=np.uint64)
b = np.array([np.iinfo(np.intp).max, np.iinfo(np.intp).max + 1], dtype=np.uint64)
np.isin(a, b, assume_unique=True, invert=True)
# c. Fails with:
# basic_mask = (ar1 <= ar2_max) & (ar1 >= ar2_min)
# outgoing_array[basic_mask] = isin_helper_ar[
# > np.subtract(ar1[basic_mask], ar2_min, dtype=np.intp)]
# E OverflowError: Python int too large to convert to C long
a = np.array([0, 1], dtype=np.uint64)
b = np.array([np.iinfo(np.intp).max + 1, np.iinfo(np.intp).max + 1], dtype=np.uint64)
np.isin(a, b, assume_unique=True, invert=True)
# d. Fails with the same error:
a = np.array([0, 1], dtype=np.uint64)
b = np.array([np.iinfo(np.uint64).max, np.iinfo(np.uint64).max], dtype=np.uint64)
np.isin(a, b, assume_unique=True, invert=True)
Error message:
print()
print('np.iinfo(np.intp).max', np.iinfo(np.intp).max)
print('np.iinfo(np.uint32).max', np.iinfo(np.uint32).max)
print('np.iinfo(np.uint64).max', np.iinfo(np.uint64).max)
# Works:
a = np.array([0, 1], dtype=np.uint64)
b = np.array([1, np.iinfo(np.uint64).max - 1], dtype=np.uint64)
np.isin(a, b, assume_unique=True, invert=True)
# Works:
a = np.array([0, 1], dtype=np.uint64)
b = np.array([np.iinfo(np.uint32).max + 1, np.iinfo(np.uint32).max + 1], dtype=np.uint64)
np.isin(a, b, assume_unique=True, invert=True)
# Works:
a = np.array([0, 1], dtype=np.uint64)
b = np.array([np.iinfo(np.intp).max, np.iinfo(np.intp).max], dtype=np.uint64)
np.isin(a, b, assume_unique=True, invert=True)
# Fails:
a = np.array([0, 1], dtype=np.uint64)
b = np.array([np.iinfo(np.intp).max + 1, np.iinfo(np.intp).max + 1], dtype=np.uint64)
> np.isin(a, b, assume_unique=True, invert=True)
apis/python/test/test_ingestion.py:1237:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/opt/homebrew/anaconda3/envs/TileDB-Vector-Search-4/lib/python3.9/site-packages/numpy/lib/_arraysetops_impl.py:1081: in isin
return _in1d(element, test_elements, assume_unique=assume_unique,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
ar1 = array([0, 1], dtype=uint64), ar2 = array([9223372036854775808, 9223372036854775808], dtype=uint64), assume_unique = True
invert = True
def _in1d(ar1, ar2, assume_unique=False, invert=False, *, kind=None):
# Ravel both arrays, behavior for the first array could be different
ar1 = np.asarray(ar1).ravel()
ar2 = np.asarray(ar2).ravel()
# Ensure that iteration through object arrays yields size-1 arrays
if ar2.dtype == object:
ar2 = ar2.reshape(-1, 1)
if kind not in {None, 'sort', 'table'}:
raise ValueError(
f"Invalid kind: '{kind}'. Please use None, 'sort' or 'table'.")
# Can use the table method if all arrays are integers or boolean:
is_int_arrays = all(ar.dtype.kind in ("u", "i", "b") for ar in (ar1, ar2))
use_table_method = is_int_arrays and kind in {None, 'table'}
if use_table_method:
if ar2.size == 0:
if invert:
return np.ones_like(ar1, dtype=bool)
else:
return np.zeros_like(ar1, dtype=bool)
# Convert booleans to uint8 so we can use the fast integer algorithm
if ar1.dtype == bool:
ar1 = ar1.astype(np.uint8)
if ar2.dtype == bool:
ar2 = ar2.astype(np.uint8)
ar2_min = int(np.min(ar2))
ar2_max = int(np.max(ar2))
ar2_range = ar2_max - ar2_min
# Constraints on whether we can actually use the table method:
# 1. Assert memory usage is not too large
below_memory_constraint = ar2_range <= 6 * (ar1.size + ar2.size)
# 2. Check overflows for (ar2 - ar2_min); dtype=ar2.dtype
range_safe_from_overflow = ar2_range <= np.iinfo(ar2.dtype).max
# Optimal performance is for approximately
# log10(size) > (log10(range) - 2.27) / 0.927.
# However, here we set the requirement that by default
# the intermediate array can only be 6x
# the combined memory allocation of the original
# arrays. See discussion on
# https://github.com/numpy/numpy/pull/12065.
if (
range_safe_from_overflow and
(below_memory_constraint or kind == 'table')
):
if invert:
outgoing_array = np.ones_like(ar1, dtype=bool)
else:
outgoing_array = np.zeros_like(ar1, dtype=bool)
# Make elements 1 where the integer exists in ar2
if invert:
isin_helper_ar = np.ones(ar2_range + 1, dtype=bool)
isin_helper_ar[ar2 - ar2_min] = 0
else:
isin_helper_ar = np.zeros(ar2_range + 1, dtype=bool)
isin_helper_ar[ar2 - ar2_min] = 1
# Mask out elements we know won't work
basic_mask = (ar1 <= ar2_max) & (ar1 >= ar2_min)
outgoing_array[basic_mask] = isin_helper_ar[
> np.subtract(ar1[basic_mask], ar2_min, dtype=np.intp)]
E OverflowError: Python int too large to convert to C long
Python and NumPy Versions:
Python 3.9.18 (main, Sep 11 2023, 08:25:10)
[Clang 14.0.6 ] :: Anaconda, Inc. on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import sys, numpy; print(numpy.__version__); print(sys.version)
2.0.0
3.9.18 (main, Sep 11 2023, 08:25:10)
[Clang 14.0.6 ]
Runtime Environment:
[{
'numpy_version': '2.0.0',
'python': '3.9.18 (main, Sep 11 2023, 08:25:10) \n[Clang 14.0.6 ]',
'uname': uname_result(
system='Darwin',
node='Pariss-MacBook-Pro-2.local',
release='23.4.0',
version='Darwin Kernel Version 23.4.0: Fri Mar 15 00:12:49 PDT 2024; root:xnu-10063.101.17~1/RELEASE_ARM64_T6020', machine='arm64')
},
{
'simd_extensions': {'baseline': ['NEON', 'NEON_FP16', 'NEON_VFPV4', 'ASIMD'], 'found': ['ASIMDHP'], 'not_found': ['ASIMDFHM']}
}]
Context for the issue:
The context is that I am updating TileDB Vector Search to use numpy 2, but code that used to work now fails.
Thank you for any help!
Describe the issue:
When I use np.isin() and have test_elements which are uint64's where two or more elements are greater than np.iinfo(np.intp).max, I get OverflowError: Python int too large to convert to C long.
Note that this happens with numpy==2.0.0, but not with numpy==1.26.3, so my guess is that it's perhaps because of changes to data type promotion.
Reproduce the code example:
Error message:
Python and NumPy Versions:
Runtime Environment:
Context for the issue:
The context is that I am updating TileDB Vector Search to use numpy 2, but code that used to work now fails.
Thank you for any help!