Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions be/src/core/data_type_serde/data_type_number_serde.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,29 +177,29 @@ Status read_integer_decoded_values(IColumn& column, const DecodedColumnView& vie
}

} // namespace
// Basic structure of the type map: a compile-time list of (Key, Value) type pairs.
// KeyType/ValueType expose the first pair; Next names the map made of the remaining pairs.
template <typename Key, typename Value, typename... Rest>
struct TypeMap {
using KeyType = Key;
using ValueType = Value;
using Next = TypeMap<Rest...>;
};

// End marker of the type map.
// The <void, void> specialization is an empty struct: it has no KeyType, ValueType or Next.
template <>
struct TypeMap<void, void> {};

// Forward declaration of TypeMapLookup.
// Declared here without a body; Key is the type to search for and Map is the TypeMap to search.
template <typename Key, typename Map>
struct TypeMapLookup;

// Type map lookup when the key matches.
// Selected when the searched Key is the first key of the map; ValueType is the Value paired with it.
template <typename Key, typename Value, typename... Rest>
struct TypeMapLookup<Key, TypeMap<Key, Value, Rest...>> {
using ValueType = Value;
};

// Type map lookup by recursive search.
// Skips the first (K, V) pair and looks Key up in the remaining pairs.
template <typename Key, typename K, typename V, typename... Rest>
struct TypeMapLookup<Key, TypeMap<K, V, Rest...>> {
using ValueType = typename TypeMapLookup<Key, TypeMap<Rest...>>::ValueType;
Expand Down
5 changes: 3 additions & 2 deletions be/src/core/data_type_serde/decoded_column_view.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@ namespace doris {

class IColumn;

// Physical value source type for a decoded column batch.
// This enum describes only generic memory layouts, not format-specific types such as
// Parquet/ORC/Arrow.
enum class DecodedValueKind {
BOOL,
INT32,
Expand Down
22 changes: 0 additions & 22 deletions be/src/exec/scan/file_scanner_v2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -359,13 +359,10 @@ Status FileScannerV2::_init_table_reader(const TFileRangeDesc& range) {
RETURN_IF_ERROR(_to_file_format(format_type, &file_format));
DORIS_CHECK(_table_reader != nullptr);

format::TableColumnPredicates table_column_predicates;
RETURN_IF_ERROR(_build_table_column_predicates(&table_column_predicates));
VExprContextSPtrs table_conjuncts;
RETURN_IF_ERROR(_build_table_conjuncts(&table_conjuncts));
RETURN_IF_ERROR(_table_reader->init({
.projected_columns = _projected_columns,
.column_predicates = std::move(table_column_predicates),
.conjuncts = std::move(table_conjuncts),
.format = file_format,
.scan_params = const_cast<TFileScanRangeParams*>(_params),
Expand Down Expand Up @@ -588,25 +585,6 @@ format::ColumnDefinition FileScannerV2::_build_table_column(const SlotDescriptor
return column;
}

// Rebuild `predicates` from the local state's per-slot predicate lists, keyed by each slot's
// global column index. Slots that are missing from either _slot_id_to_desc or
// _slot_id_to_global_index are left out. Always returns Status::OK().
Status FileScannerV2::_build_table_column_predicates(
        format::TableColumnPredicates* predicates) const {
    DORIS_CHECK(predicates != nullptr);
    predicates->clear();

    const auto& predicates_by_slot =
            _local_state->cast<FileScanLocalState>()._slot_id_to_predicates;
    for (const auto& [slot_id, predicate_list] : predicates_by_slot) {
        // Only slots that have a descriptor are considered.
        if (_slot_id_to_desc.find(slot_id) == _slot_id_to_desc.end()) {
            continue;
        }
        // ... and only those that also map to a global column index are emitted.
        if (const auto index_it = _slot_id_to_global_index.find(slot_id);
            index_it != _slot_id_to_global_index.end()) {
            (*predicates)[index_it->second] = predicate_list;
        }
    }
    return Status::OK();
}

Status FileScannerV2::_build_table_conjuncts(VExprContextSPtrs* conjuncts) const {
DORIS_CHECK(conjuncts != nullptr);
conjuncts->clear();
Expand Down
1 change: 0 additions & 1 deletion be/src/exec/scan/file_scanner_v2.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ class FileScannerV2 final : public Scanner {
Status _build_projected_columns(const format::TableReader& table_reader);
Status _build_default_expr(const TFileScanSlotInfo& slot_info, VExprContextSPtr* ctx) const;
static format::ColumnDefinition _build_table_column(const SlotDescriptor* slot_desc);
Status _build_table_column_predicates(format::TableColumnPredicates* predicates) const;
Status _build_table_conjuncts(VExprContextSPtrs* conjuncts) const;
static Status _to_file_format(TFileFormatType::type format_type,
format::FileFormat* file_format);
Expand Down
179 changes: 167 additions & 12 deletions be/src/exprs/expr_zonemap_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "exprs/vliteral.h"
#include "exprs/vslot_ref.h"
#include "runtime/runtime_state.h"
#include "storage/index/bloom_filter/bloom_filter.h"

namespace doris::expr_zonemap {
namespace {
Expand All @@ -52,8 +53,86 @@ bool value_in_range(const Field& value, const Field& min_value, const Field& max
return value >= min_value && value <= max_value;
}

// Linear scan of the slot dictionary: true when any dictionary value compares equal to `value`.
bool dictionary_contains(const DictionaryEvalContext::SlotDictionary& dictionary,
                         const Field& value) {
    for (const Field& candidate : dictionary.values) {
        if (candidate == value) {
            return true;
        }
    }
    return false;
}

// Probe the slot's bloom filter for one literal value.
//
// Returns false only when the filter reports the value's bytes as absent. Returns true when the
// value may be present or when a byte-wise probe cannot prove absence (unsupported primitive type,
// floating-point zero, NaN). Answering true is always the conservative choice: it only disables
// pruning for this value.
//
// Fixed-width values are probed with their in-memory bytes and strings with their character
// bytes. NOTE(review): this assumes the filter was populated from the same byte representation;
// confirm against the code that builds the filter.
bool bloom_filter_may_contain(const BloomFilterEvalContext::SlotBloomFilter& slot_filter,
                              const Field& value) {
    DORIS_CHECK(slot_filter.data_type != nullptr);
    DORIS_CHECK(slot_filter.bloom_filter != nullptr);
    const auto data_type = remove_nullable(slot_filter.data_type);
    DORIS_CHECK(data_type != nullptr);

    // Probes the filter with the raw object bytes of a fixed-width value.
    const auto test_fixed_width = [&slot_filter](const auto& typed_value) {
        return slot_filter.bloom_filter->test_bytes(reinterpret_cast<const char*>(&typed_value),
                                                    sizeof(typed_value));
    };
    // +0.0 and -0.0 compare equal but have different bytes, and NaN (the only value that differs
    // from itself) has many bit patterns, so a byte-wise probe cannot prove absence for them.
    const auto is_byte_ambiguous = [](auto typed_value) {
        return typed_value == 0 || typed_value != typed_value;
    };

    switch (data_type->get_primitive_type()) {
    case TYPE_BOOLEAN: {
        const bool typed_value = value.get<TYPE_BOOLEAN>();
        return test_fixed_width(typed_value);
    }
    case TYPE_INT: {
        const int32_t typed_value = value.get<TYPE_INT>();
        return test_fixed_width(typed_value);
    }
    case TYPE_BIGINT: {
        const int64_t typed_value = value.get<TYPE_BIGINT>();
        return test_fixed_width(typed_value);
    }
    case TYPE_FLOAT: {
        const float typed_value = value.get<TYPE_FLOAT>();
        return is_byte_ambiguous(typed_value) || test_fixed_width(typed_value);
    }
    case TYPE_DOUBLE: {
        const double typed_value = value.get<TYPE_DOUBLE>();
        return is_byte_ambiguous(typed_value) || test_fixed_width(typed_value);
    }
    case TYPE_CHAR:
    case TYPE_VARCHAR:
    case TYPE_STRING: {
        const auto& typed_value = value.get<TYPE_STRING>();
        return slot_filter.bloom_filter->test_bytes(typed_value.data(), typed_value.size());
    }
    default:
        // Types without a probe implementation are never pruned.
        return true;
    }
}

// Return the single slot column id referenced by ctx's root expression, or -1 when `capability`
// rejects the root or the root does not reference exactly one slot.
// `capability` is called with the root expression and must return something usable as a bool.
template <typename Capability>
int single_slot_index(const VExprContextSPtr& ctx, Capability capability) {
    DORIS_CHECK(ctx != nullptr);
    const auto& expr_root = ctx->root();
    DORIS_CHECK(expr_root != nullptr);

    // Slots are collected only for accepted expressions; a rejected expression leaves the set
    // empty and therefore falls through to -1.
    std::set<int> referenced_slots;
    if (capability(expr_root)) {
        expr_root->collect_slot_column_ids(referenced_slots);
    }
    return referenced_slots.size() == 1 ? *referenced_slots.begin() : -1;
}

} // namespace

// Look up the dictionary registered for `slot_index`; nullptr when the slot has no entry.
const DictionaryEvalContext::SlotDictionary* DictionaryEvalContext::slot(int slot_index) const {
    if (const auto entry = slots.find(slot_index); entry != slots.end()) {
        return &entry->second;
    }
    return nullptr;
}

// Look up the bloom filter registered for `slot_index`; nullptr when the slot has no entry.
const BloomFilterEvalContext::SlotBloomFilter* BloomFilterEvalContext::slot(int slot_index) const {
    if (const auto entry = slots.find(slot_index); entry != slots.end()) {
        return &entry->second;
    }
    return nullptr;
}

TExprNode create_texpr_node_from_hybrid_set_value(const void* data, const PrimitiveType& type,
int precision, int scale) {
if (is_string_type(type)) {
Expand Down Expand Up @@ -236,24 +315,100 @@ ZoneMapFilterResult eval_in_zonemap(const ZoneMapEvalContext& ctx, const VExprSP
return ZoneMapFilterResult::kNoMatch;
}

// Dictionary check for a slot/literal pair.
// kUnsupported: ctx has no dictionary (or no data type) for the slot, or the literal is NULL.
// kMayMatch / kNoMatch: the literal is / is not one of the slot's dictionary values.
ZoneMapFilterResult eval_eq_dictionary(const DictionaryEvalContext& ctx,
                                       const SlotLiteral& slot_literal) {
    const auto* dict = ctx.slot(slot_literal.slot_index);
    const bool has_dictionary = dict != nullptr && dict->data_type != nullptr;
    if (!has_dictionary) {
        return ZoneMapFilterResult::kUnsupported;
    }
    DORIS_CHECK(data_types_compatible(dict->data_type, slot_literal.slot_type));

    if (slot_literal.literal.is_null()) {
        return ZoneMapFilterResult::kUnsupported;
    }
    if (dictionary_contains(*dict, slot_literal.literal)) {
        return ZoneMapFilterResult::kMayMatch;
    }
    return ZoneMapFilterResult::kNoMatch;
}

// Dictionary check for an IN list on a single slot.
// kUnsupported: NOT IN (is_not_in == true), or ctx has no dictionary (or no data type) for the slot.
// kMayMatch: at least one non-NULL value is present in the slot's dictionary.
// kNoMatch: no such value exists, which includes an empty value list.
// slot_expr must be a VSlotRef; this is enforced with DORIS_CHECK.
ZoneMapFilterResult eval_in_dictionary(const DictionaryEvalContext& ctx, const VExprSPtr& slot_expr,
                                       bool is_not_in, const std::vector<Field>& values) {
    if (is_not_in) {
        return ZoneMapFilterResult::kUnsupported;
    }

    const auto slot_ref = std::dynamic_pointer_cast<VSlotRef>(slot_expr);
    DORIS_CHECK(slot_ref != nullptr);

    const auto* dict = ctx.slot(slot_ref->column_id());
    if (dict == nullptr || dict->data_type == nullptr) {
        return ZoneMapFilterResult::kUnsupported;
    }
    DORIS_CHECK(data_types_compatible(dict->data_type, slot_ref->data_type()));

    // An empty list has no candidate at all, so any_of yields false and the result is kNoMatch.
    const bool any_candidate_present = std::ranges::any_of(values, [&](const Field& candidate) {
        return !candidate.is_null() && dictionary_contains(*dict, candidate);
    });
    return any_candidate_present ? ZoneMapFilterResult::kMayMatch : ZoneMapFilterResult::kNoMatch;
}

// Bloom-filter check for a slot/literal pair.
// kUnsupported: ctx has no usable filter entry for the slot (missing entry, data type or filter),
// or the literal is NULL.
// kNoMatch: the filter reports the literal as absent. kMayMatch: otherwise.
ZoneMapFilterResult eval_eq_bloom_filter(const BloomFilterEvalContext& ctx,
                                         const SlotLiteral& slot_literal) {
    const auto* filter = ctx.slot(slot_literal.slot_index);
    if (filter == nullptr) {
        return ZoneMapFilterResult::kUnsupported;
    }
    if (filter->data_type == nullptr || filter->bloom_filter == nullptr) {
        return ZoneMapFilterResult::kUnsupported;
    }
    DORIS_CHECK(data_types_compatible(filter->data_type, slot_literal.slot_type));

    if (slot_literal.literal.is_null()) {
        return ZoneMapFilterResult::kUnsupported;
    }
    if (!bloom_filter_may_contain(*filter, slot_literal.literal)) {
        return ZoneMapFilterResult::kNoMatch;
    }
    return ZoneMapFilterResult::kMayMatch;
}

// Bloom-filter check for an IN list on a single slot.
// kUnsupported: NOT IN (is_not_in == true), or ctx has no usable filter entry for the slot.
// kMayMatch: the filter may contain at least one non-NULL value from the list.
// kNoMatch: every non-NULL value is reported absent, which includes an empty value list.
// slot_expr must be a VSlotRef; this is enforced with DORIS_CHECK.
ZoneMapFilterResult eval_in_bloom_filter(const BloomFilterEvalContext& ctx,
                                         const VExprSPtr& slot_expr, bool is_not_in,
                                         const std::vector<Field>& values) {
    if (is_not_in) {
        return ZoneMapFilterResult::kUnsupported;
    }

    const auto slot_ref = std::dynamic_pointer_cast<VSlotRef>(slot_expr);
    DORIS_CHECK(slot_ref != nullptr);

    const auto* filter = ctx.slot(slot_ref->column_id());
    const bool usable = filter != nullptr && filter->data_type != nullptr &&
                        filter->bloom_filter != nullptr;
    if (!usable) {
        return ZoneMapFilterResult::kUnsupported;
    }
    DORIS_CHECK(data_types_compatible(filter->data_type, slot_ref->data_type()));

    // An empty list has no candidate at all, so any_of yields false and the result is kNoMatch.
    const bool any_candidate_possible = std::ranges::any_of(values, [&](const Field& candidate) {
        return !candidate.is_null() && bloom_filter_may_contain(*filter, candidate);
    });
    return any_candidate_possible ? ZoneMapFilterResult::kMayMatch
                                  : ZoneMapFilterResult::kNoMatch;
}

// Return the only slot ordinal referenced by a zonemap-evaluable expression. A negative result is
// the conservative fallback marker for unsupported expressions, multi-slot expressions, or invalid
// slot ordinals, so callers can skip schema-indexed zonemap pruning safely.
int single_slot_zonemap_index(const VExprContextSPtr& ctx) {
    return single_slot_index(
            ctx, [](const VExprSPtr& expr) { return expr->can_evaluate_zonemap_filter(); });
}

// Return the only slot ordinal referenced by an expression whose root reports
// can_evaluate_dictionary_filter(); -1 when the root does not, or when the expression does not
// reference exactly one slot.
int single_slot_dictionary_index(const VExprContextSPtr& ctx) {
    return single_slot_index(
            ctx, [](const VExprSPtr& expr) { return expr->can_evaluate_dictionary_filter(); });
}

// Return the only slot ordinal referenced by an expression whose root reports
// can_evaluate_bloom_filter(); -1 when the root does not, or when the expression does not
// reference exactly one slot.
int single_slot_bloom_filter_index(const VExprContextSPtr& ctx) {
    return single_slot_index(
            ctx, [](const VExprSPtr& expr) { return expr->can_evaluate_bloom_filter(); });
}

bool is_expr_zonemap_filter_enabled(const RuntimeState* state) {
Expand Down
52 changes: 52 additions & 0 deletions be/src/exprs/expr_zonemap_filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#pragma once

#include <compare>
#include <map>
#include <optional>
#include <vector>

Expand All @@ -35,6 +36,10 @@ namespace doris {
class HybridSetBase;
class RuntimeState;
class TExprNode;

namespace segment_v2 {
class BloomFilter;
} // namespace segment_v2
} // namespace doris

namespace doris::expr_zonemap {
Expand All @@ -46,6 +51,31 @@ struct InZonemapMaterializedSet {
Field max_value;
};

// Dictionary pruning evaluates file-level dictionary values, not row-level data. A kNoMatch result
// means no non-null dictionary entry can satisfy the expression, so the whole row group can be
// skipped only for dictionary-encoded columns whose dictionary contains all non-null values.
struct DictionaryEvalContext {
// Dictionary of a single slot.
struct SlotDictionary {
// Type of the dictionary values. A null data_type makes the dictionary evaluators return
// kUnsupported.
DataTypePtr data_type;
// Dictionary entries that predicate literals are compared against.
std::vector<Field> values;
};

// Returns the dictionary registered for slot_index, or nullptr when the slot has no entry.
const SlotDictionary* slot(int slot_index) const;
// Per-slot dictionaries keyed by slot index.
std::map<int, SlotDictionary> slots;
};

// Bloom-filter pruning can only disprove equality-style predicates. A kNoMatch result means every
// literal candidate required by the expression is definitely absent from the file bloom filter.
struct BloomFilterEvalContext {
// Bloom filter of a single slot.
struct SlotBloomFilter {
// Type of the slot's values. A null data_type makes the bloom-filter evaluators return
// kUnsupported.
DataTypePtr data_type;
// Filter to probe; null means the slot cannot be evaluated. Held as a raw pointer.
// NOTE(review): presumably owned by the caller and must outlive this context; confirm.
const segment_v2::BloomFilter* bloom_filter = nullptr;
};

// Returns the filter entry registered for slot_index, or nullptr when the slot has no entry.
const SlotBloomFilter* slot(int slot_index) const;
// Per-slot bloom filters keyed by slot index.
std::map<int, SlotBloomFilter> slots;
};

struct SlotLiteral {
// Slot ordinal in the current expression binding. It is also the key used to look up the
// corresponding reader-schema type and zone map from ZoneMapEvalContext.
Expand Down Expand Up @@ -115,11 +145,33 @@ ZoneMapFilterResult eval_in_zonemap(const ZoneMapEvalContext& ctx, const VExprSP
bool is_not_in, const std::vector<Field>& values,
const Field& min_value, const Field& max_value);

// Dictionary check for a slot/literal pair. Returns kUnsupported when ctx has no dictionary (or no
// data type) for the slot or when the literal is NULL; otherwise kMayMatch if the literal is one of
// the slot's dictionary values and kNoMatch if it is not.
ZoneMapFilterResult eval_eq_dictionary(const DictionaryEvalContext& ctx,
const SlotLiteral& slot_literal);

// Dictionary check for an IN list. slot_expr must be a VSlotRef. Returns kUnsupported when
// is_not_in is true or when ctx has no dictionary (or no data type) for the slot; kMayMatch when at
// least one non-NULL value is in the dictionary; kNoMatch otherwise, including for an empty list.
ZoneMapFilterResult eval_in_dictionary(const DictionaryEvalContext& ctx, const VExprSPtr& slot_expr,
bool is_not_in, const std::vector<Field>& values);

// Bloom-filter check for a slot/literal pair. Returns kUnsupported when ctx has no usable filter
// entry for the slot or when the literal is NULL; kNoMatch when the filter reports the literal as
// absent; kMayMatch otherwise.
ZoneMapFilterResult eval_eq_bloom_filter(const BloomFilterEvalContext& ctx,
const SlotLiteral& slot_literal);

// Bloom-filter check for an IN list. slot_expr must be a VSlotRef. Returns kUnsupported when
// is_not_in is true or when ctx has no usable filter entry for the slot; kMayMatch when the filter
// may contain at least one non-NULL value; kNoMatch otherwise, including for an empty list.
ZoneMapFilterResult eval_in_bloom_filter(const BloomFilterEvalContext& ctx,
const VExprSPtr& slot_expr, bool is_not_in,
const std::vector<Field>& values);

// Return the only slot ordinal referenced by a zonemap-evaluable expression in its current
// binding. Expressions that are unsupported by zonemap pruning, reference multiple slots, or use an
// invalid negative slot ordinal return a negative value.
int single_slot_zonemap_index(const VExprContextSPtr& ctx);

// Return the only slot ordinal referenced by an expression whose root reports
// can_evaluate_dictionary_filter(). Returns -1 when the root does not, or when the expression does
// not reference exactly one slot.
int single_slot_dictionary_index(const VExprContextSPtr& ctx);

// Return the only slot ordinal referenced by an expression whose root reports
// can_evaluate_bloom_filter(). Returns -1 when the root does not, or when the expression does not
// reference exactly one slot.
int single_slot_bloom_filter_index(const VExprContextSPtr& ctx);

bool is_expr_zonemap_filter_enabled(const RuntimeState* state);

} // namespace doris::expr_zonemap

namespace doris {
// Make the pruning context types nameable directly from namespace doris.
using DictionaryEvalContext = expr_zonemap::DictionaryEvalContext;
using BloomFilterEvalContext = expr_zonemap::BloomFilterEvalContext;
} // namespace doris
Loading
Loading