How to understand vectorized query of mapd?


#1

How to understand vectorized query of mapd?


#2

If I understood your question correctly, you should start by analyzing the output of the EXPLAIN command for your query.

so for the query

explain select max(arrdelay) from flights_2008_7M where flight_month in (1,2,3);

the following IR code is generated. If you are familiar with LLVM or assembly, it is quite straightforward to read the code and understand how the system executes queries.

The first part (query_template_0) scans the values in the memory buffers; the last part (row_func_hoisted_literals) does the filtering and calls the functions for the aggregates.

IR for the GPU:
===============

; query_template_0: per-fragment driver loop for the query.
; It loads two column buffer pointers from %byte_stream and three i16 literals
; from %literals, seeds a local accumulator from %agg_init_val[0], then calls
; @row_func_hoisted_literals once per row position, starting at %pos_start and
; advancing by %pos_step while the position is below the row count. The final
; accumulator value is stored into the first output buffer (%out[0]).
; NOTE(review): @pos_start_impl / @pos_step_impl / @group_buff_idx_impl are not
; shown here; on the GPU they presumably return this thread's first row, the
; stride between the rows it handles, and its output slot -- confirm.
define void @query_template_0(i8** nocapture %byte_stream, i8* nocapture %literals, i64* nocapture %row_count_ptr, i64* nocapture %frag_row_off_ptr, i32* %max_matched_ptr, i64* %agg_init_val, i64** %out, i64** %unused, i32 %frag_idx, i64* %join_hash_tables, i32* %total_matched, i32* %error_code) {
.entry:
  ; %1 and %3: the two column buffers, byte_stream[0] and byte_stream[1].
  %0 = getelementptr i8*, i8** %byte_stream, i32 0
  %1 = load i8*, i8** %0
  %2 = getelementptr i8*, i8** %byte_stream, i32 1
  %3 = load i8*, i8** %2
  ; Read three i16 literals at byte offsets 0, 2 and 4 of the literal buffer
  ; (presumably the values of the IN (1,2,3) list -- confirm).
  %4 = getelementptr i8, i8* %literals, i16 0
  %5 = bitcast i8* %4 to i16*
  %literal_0 = load i16, i16* %5
  %6 = getelementptr i8, i8* %literals, i16 2
  %7 = bitcast i8* %6 to i16*
  %literal_2 = load i16, i16* %7
  %8 = getelementptr i8, i8* %literals, i16 4
  %9 = bitcast i8* %8 to i16*
  %literal_4 = load i16, i16* %9
  ; %result is the local aggregate accumulator, initialized to agg_init_val[0].
  %result = alloca i64, align 8
  %row_count = load i64, i64* %row_count_ptr, align 8
  %10 = getelementptr inbounds i64, i64* %agg_init_val, i32 0
  %11 = load i64, i64* %10, align 8
  store i64 %11, i64* %result, align 8
  %pos_start = call i32 @pos_start_impl(i32* %error_code)
  %pos_step = call i32 @pos_step_impl()
  %group_buff_idx = call i32 @group_buff_idx_impl()
  ; Skip the loop entirely when the starting position is already past the end.
  %12 = sext i32 %pos_start to i64
  %13 = icmp slt i64 %12, %row_count
  br i1 %13, label %.loop.preheader, label %.exit

.loop.preheader:                                  ; preds = %.entry
  %14 = sext i32 %pos_step to i64
  br label %.for.body

.for.body:                                        ; preds = %.for.body, %.loop.preheader
  ; %pos runs pos_start, pos_start + pos_step, ... while %pos < %row_count.
  %pos = phi i64 [ %12, %.loop.preheader ], [ %16, %.for.body ]
  ; Process one row; the i32 return value (%15) is not used in this function.
  %15 = call i32 @row_func_hoisted_literals(i64* %result, i64* %agg_init_val, i64 %pos, i64* %frag_row_off_ptr, i64* %row_count_ptr, i8* %literals, i8* %1, i8* %3, i64* %join_hash_tables, i16 %literal_4, i16 %literal_2, i16 %literal_0)
  %16 = add nsw i64 %pos, %14
  %17 = icmp slt i64 %16, %row_count
  br i1 %17, label %.for.body, label %._crit_edge

._crit_edge:                                      ; preds = %.for.body
  %.pre.result = load i64, i64* %result, align 8
  br label %.exit

.exit:                                            ; preds = %._crit_edge, %.entry
  ; %18 is the accumulated result, or the initial value if the loop never ran.
  %18 = phi i64 [ %.pre.result, %._crit_edge ], [ %11, %.entry ]
  ; Store the result at out[0][group_buff_idx + frag_idx * pos_step].
  %19 = getelementptr inbounds i64*, i64** %out, i32 0
  %20 = load i64*, i64** %19, align 8
  %21 = mul i32 %frag_idx, %pos_step
  %22 = add i32 %group_buff_idx, %21
  %23 = getelementptr inbounds i64, i64* %20, i32 %22
  store i64 %18, i64* %23, align 8
  ret void
}

; row_func_hoisted_literals: per-row filter and aggregate step.
; It decodes the value at row %pos from %col_buf0, tests it for equality against
; the three literal arguments (results OR-ed together), and, if the filter
; passes, decodes the value at the same row from %col_buf1 and feeds it to
; @agg_max_skip_val, which receives the accumulator pointer %out. Always returns 0.
; NOTE(review): for the query in the post, %col_buf0 is presumably flight_month
; and %col_buf1 arrdelay -- confirm. The helper functions are not shown here.
; Function Attrs: alwaysinline
define i32 @row_func_hoisted_literals(i64* %out, i64* %agg_init_val, i64 %pos, i64* %frag_row_off, i64* %num_rows_per_scan, i8* %literals, i8* %col_buf0, i8* %col_buf1, i64* %join_hash_tables, i16 %arg_literal_4, i16 %arg_literal_2, i16 %arg_literal_0) #20 {
entry:
  ; %0 is loaded but never referenced again in this function.
  %0 = load i64, i64* %frag_row_off
  ; Decode the filter column at row %pos (second argument 2 is presumably the
  ; element width in bytes -- confirm) and narrow it to i16.
  %1 = call i64 @fixed_width_int_decode(i8* %col_buf0, i32 2, i64 %pos)
  %2 = trunc i64 %1 to i16
  ; Chain of equality tests: (v == lit0) OR (v == lit2) OR (v == lit4).
  ; -32768 and -128 are presumably the i16 / i8 NULL sentinels used by the
  ; nullable helpers -- confirm.
  %3 = call i8 @eq_int16_t_nullable(i16 %2, i16 %arg_literal_0, i64 -32768, i8 -128)
  %4 = call i8 @logical_or(i8 0, i8 %3, i8 -128)
  %5 = call i8 @eq_int16_t_nullable(i16 %2, i16 %arg_literal_2, i64 -32768, i8 -128)
  %6 = call i8 @logical_or(i8 %4, i8 %5, i8 -128)
  %7 = call i8 @eq_int16_t_nullable(i16 %2, i16 %arg_literal_4, i64 -32768, i8 -128)
  %8 = call i8 @logical_or(i8 %6, i8 %7, i8 -128)
  ; The filter passes only when the combined result is strictly positive.
  %9 = icmp sgt i8 %8, 0
  ; AND with constant true leaves %9 unchanged.
  %10 = and i1 true, %9
  br i1 %10, label %filter_true, label %filter_false

filter_true:                                      ; preds = %entry
  ; Decode the aggregated column at row %pos, sign-extend it back to i64 and
  ; pass it to the MAX aggregate (-32768 is presumably the value to skip).
  %11 = call i64 @fixed_width_int_decode(i8* %col_buf1, i32 2, i64 %pos)
  %12 = trunc i64 %11 to i16
  %13 = sext i16 %12 to i64
  call void @agg_max_skip_val(i64* %out, i64 %13, i64 -32768)
  br label %filter_false

filter_false:                                     ; preds = %filter_true, %entry
  ; Common exit for both paths.
  ret i32 0
}

#3

hi @aznable:
Thank you! I want the vectorized query to be LLVM-specific, and I need to go into the details.