cloudflare/ClickHouse
Public · mirrored from https://github.com/cloudflare/ClickHouse
src/Interpreters/InterpreterInsertQuery.cpp
327 lines · mode: code
| 1 | #include <Interpreters/InterpreterInsertQuery.h> |
| 2 | |
| 3 | #include <DataStreams/AddingDefaultBlockOutputStream.h> |
| 4 | #include <DataStreams/AddingDefaultsBlockInputStream.h> |
| 5 | #include <DataStreams/CheckConstraintsBlockOutputStream.h> |
| 6 | #include <DataStreams/CountingBlockOutputStream.h> |
| 7 | #include <DataStreams/InputStreamFromASTInsertQuery.h> |
| 8 | #include <DataStreams/NullAndDoCopyBlockInputStream.h> |
| 9 | #include <DataStreams/NullBlockOutputStream.h> |
| 10 | #include <DataStreams/PushingToViewsBlockOutputStream.h> |
| 11 | #include <DataStreams/RemoteBlockInputStream.h> |
| 12 | #include <DataStreams/SquashingBlockOutputStream.h> |
| 13 | #include <DataStreams/copyData.h> |
| 14 | #include <IO/ConcatReadBuffer.h> |
| 15 | #include <Interpreters/InterpreterSelectWithUnionQuery.h> |
| 16 | #include <Interpreters/InterpreterWatchQuery.h> |
| 17 | #include <Access/AccessFlags.h> |
| 18 | #include <Interpreters/JoinedTables.h> |
| 19 | #include <Parsers/ASTFunction.h> |
| 20 | #include <Parsers/ASTInsertQuery.h> |
| 21 | #include <Parsers/ASTSelectQuery.h> |
| 22 | #include <Parsers/ASTSelectWithUnionQuery.h> |
| 23 | #include <Parsers/queryToString.h> |
| 24 | #include <Storages/StorageDistributed.h> |
| 25 | #include <TableFunctions/TableFunctionFactory.h> |
| 26 | #include <Common/checkStackSize.h> |
| 27 | #include <Processors/Sources/SourceFromInputStream.h> |
| 28 | #include <Processors/NullSink.h> |
| 29 | #include <Processors/Transforms/ConvertingTransform.h> |
| 30 | #include <Processors/Sources/SinkToOutputStream.h> |
| 31 | |
| 32 | namespace |
| 33 | { |
| 34 | const UInt64 PARALLEL_DISTRIBUTED_INSERT_SELECT_ALL = 2; |
| 35 | } |
| 36 | |
| 37 | |
| 38 | namespace DB |
| 39 | { |
| 40 | |
/// Error codes used by this translation unit; they are only declared here.
namespace ErrorCodes
{
    extern const int DUPLICATE_COLUMN;          /// a column is listed twice in the INSERT column list
    extern const int ILLEGAL_COLUMN;            /// attempt to insert into a MATERIALIZED column
    extern const int LOGICAL_ERROR;             /// no usable connection was obtained for a shard
    extern const int NO_SUCH_COLUMN_IN_TABLE;   /// the INSERT column list names an unknown column
}
| 48 | |
| 49 | |
| 50 | InterpreterInsertQuery::InterpreterInsertQuery( |
| 51 | const ASTPtr & query_ptr_, const Context & context_, bool allow_materialized_, bool no_squash_, bool no_destination_) |
| 52 | : query_ptr(query_ptr_) |
| 53 | , context(context_) |
| 54 | , allow_materialized(allow_materialized_) |
| 55 | , no_squash(no_squash_) |
| 56 | , no_destination(no_destination_) |
| 57 | { |
| 58 | checkStackSize(); |
| 59 | } |
| 60 | |
| 61 | |
| 62 | StoragePtr InterpreterInsertQuery::getTable(ASTInsertQuery & query) |
| 63 | { |
| 64 | if (query.table_function) |
| 65 | { |
| 66 | const auto * table_function = query.table_function->as<ASTFunction>(); |
| 67 | const auto & factory = TableFunctionFactory::instance(); |
| 68 | TableFunctionPtr table_function_ptr = factory.get(table_function->name, context); |
| 69 | return table_function_ptr->execute(query.table_function, context, table_function_ptr->getName()); |
| 70 | } |
| 71 | |
| 72 | query.table_id = context.resolveStorageID(query.table_id); |
| 73 | return DatabaseCatalog::instance().getTable(query.table_id, context); |
| 74 | } |
| 75 | |
| 76 | Block InterpreterInsertQuery::getSampleBlock( |
| 77 | const ASTInsertQuery & query, |
| 78 | const StoragePtr & table, |
| 79 | const StorageMetadataPtr & metadata_snapshot) const |
| 80 | { |
| 81 | Block table_sample_non_materialized = metadata_snapshot->getSampleBlockNonMaterialized(); |
| 82 | /// If the query does not include information about columns |
| 83 | if (!query.columns) |
| 84 | { |
| 85 | if (no_destination) |
| 86 | return metadata_snapshot->getSampleBlockWithVirtuals(table->getVirtuals()); |
| 87 | else |
| 88 | return table_sample_non_materialized; |
| 89 | } |
| 90 | |
| 91 | Block table_sample = metadata_snapshot->getSampleBlock(); |
| 92 | /// Form the block based on the column names from the query |
| 93 | Block res; |
| 94 | for (const auto & identifier : query.columns->children) |
| 95 | { |
| 96 | std::string current_name = identifier->getColumnName(); |
| 97 | |
| 98 | /// The table does not have a column with that name |
| 99 | if (!table_sample.has(current_name)) |
| 100 | throw Exception("No such column " + current_name + " in table " + query.table_id.getNameForLogs(), ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); |
| 101 | |
| 102 | if (!allow_materialized && !table_sample_non_materialized.has(current_name)) |
| 103 | throw Exception("Cannot insert column " + current_name + ", because it is MATERIALIZED column.", ErrorCodes::ILLEGAL_COLUMN); |
| 104 | if (res.has(current_name)) |
| 105 | throw Exception("Column " + current_name + " specified more than once", ErrorCodes::DUPLICATE_COLUMN); |
| 106 | |
| 107 | res.insert(ColumnWithTypeAndName(table_sample.getByName(current_name).type, current_name)); |
| 108 | } |
| 109 | return res; |
| 110 | } |
| 111 | |
| 112 | |
| 113 | BlockIO InterpreterInsertQuery::execute() |
| 114 | { |
| 115 | const Settings & settings = context.getSettingsRef(); |
| 116 | auto & query = query_ptr->as<ASTInsertQuery &>(); |
| 117 | |
| 118 | BlockIO res; |
| 119 | |
| 120 | StoragePtr table = getTable(query); |
| 121 | auto table_lock = table->lockForShare(context.getInitialQueryId(), context.getSettingsRef().lock_acquire_timeout); |
| 122 | auto metadata_snapshot = table->getInMemoryMetadataPtr(); |
| 123 | |
| 124 | auto query_sample_block = getSampleBlock(query, table, metadata_snapshot); |
| 125 | if (!query.table_function) |
| 126 | context.checkAccess(AccessType::INSERT, query.table_id, query_sample_block.getNames()); |
| 127 | |
| 128 | bool is_distributed_insert_select = false; |
| 129 | |
| 130 | if (query.select && table->isRemote() && settings.parallel_distributed_insert_select) |
| 131 | { |
| 132 | // Distributed INSERT SELECT |
| 133 | std::shared_ptr<StorageDistributed> storage_src; |
| 134 | auto & select = query.select->as<ASTSelectWithUnionQuery &>(); |
| 135 | auto new_query = std::dynamic_pointer_cast<ASTInsertQuery>(query.clone()); |
| 136 | if (select.list_of_selects->children.size() == 1) |
| 137 | { |
| 138 | auto & select_query = select.list_of_selects->children.at(0)->as<ASTSelectQuery &>(); |
| 139 | JoinedTables joined_tables(Context(context), select_query); |
| 140 | |
| 141 | if (joined_tables.tablesCount() == 1) |
| 142 | { |
| 143 | storage_src = std::dynamic_pointer_cast<StorageDistributed>(joined_tables.getLeftTableStorage()); |
| 144 | if (storage_src) |
| 145 | { |
| 146 | const auto select_with_union_query = std::make_shared<ASTSelectWithUnionQuery>(); |
| 147 | select_with_union_query->list_of_selects = std::make_shared<ASTExpressionList>(); |
| 148 | |
| 149 | auto new_select_query = std::dynamic_pointer_cast<ASTSelectQuery>(select_query.clone()); |
| 150 | select_with_union_query->list_of_selects->children.push_back(new_select_query); |
| 151 | |
| 152 | new_select_query->replaceDatabaseAndTable(storage_src->getRemoteDatabaseName(), storage_src->getRemoteTableName()); |
| 153 | |
| 154 | new_query->select = select_with_union_query; |
| 155 | } |
| 156 | } |
| 157 | } |
| 158 | |
| 159 | auto storage_dst = std::dynamic_pointer_cast<StorageDistributed>(table); |
| 160 | |
| 161 | if (storage_src && storage_dst && storage_src->cluster_name == storage_dst->cluster_name) |
| 162 | { |
| 163 | is_distributed_insert_select = true; |
| 164 | |
| 165 | if (settings.parallel_distributed_insert_select == PARALLEL_DISTRIBUTED_INSERT_SELECT_ALL) |
| 166 | { |
| 167 | new_query->table_id = StorageID(storage_dst->getRemoteDatabaseName(), storage_dst->getRemoteTableName()); |
| 168 | } |
| 169 | |
| 170 | const auto & cluster = storage_src->getCluster(); |
| 171 | const auto & shards_info = cluster->getShardsInfo(); |
| 172 | |
| 173 | std::vector<std::unique_ptr<QueryPipeline>> pipelines; |
| 174 | |
| 175 | String new_query_str = queryToString(new_query); |
| 176 | for (size_t shard_index : ext::range(0, shards_info.size())) |
| 177 | { |
| 178 | const auto & shard_info = shards_info[shard_index]; |
| 179 | if (shard_info.isLocal()) |
| 180 | { |
| 181 | InterpreterInsertQuery interpreter(new_query, context); |
| 182 | pipelines.emplace_back(std::make_unique<QueryPipeline>(interpreter.execute().pipeline)); |
| 183 | } |
| 184 | else |
| 185 | { |
| 186 | auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(settings); |
| 187 | auto connections = shard_info.pool->getMany(timeouts, &settings, PoolMode::GET_ONE); |
| 188 | if (connections.empty() || connections.front().isNull()) |
| 189 | throw Exception( |
| 190 | "Expected exactly one connection for shard " + toString(shard_info.shard_num), ErrorCodes::LOGICAL_ERROR); |
| 191 | |
| 192 | /// INSERT SELECT query returns empty block |
| 193 | auto in_stream = std::make_shared<RemoteBlockInputStream>(std::move(connections), new_query_str, Block{}, context); |
| 194 | pipelines.emplace_back(std::make_unique<QueryPipeline>()); |
| 195 | pipelines.back()->init(Pipe(std::make_shared<SourceFromInputStream>(std::move(in_stream)))); |
| 196 | pipelines.back()->setSinks([](const Block & header, QueryPipeline::StreamType) -> ProcessorPtr |
| 197 | { |
| 198 | return std::make_shared<EmptySink>(header); |
| 199 | }); |
| 200 | } |
| 201 | } |
| 202 | |
| 203 | res.pipeline = QueryPipeline::unitePipelines(std::move(pipelines), {}); |
| 204 | } |
| 205 | } |
| 206 | |
| 207 | BlockOutputStreams out_streams; |
| 208 | if (!is_distributed_insert_select || query.watch) |
| 209 | { |
| 210 | size_t out_streams_size = 1; |
| 211 | if (query.select) |
| 212 | { |
| 213 | /// Passing 1 as subquery_depth will disable limiting size of intermediate result. |
| 214 | InterpreterSelectWithUnionQuery interpreter_select{ query.select, context, SelectQueryOptions(QueryProcessingStage::Complete, 1)}; |
| 215 | res = interpreter_select.execute(); |
| 216 | |
| 217 | if (table->supportsParallelInsert() && settings.max_insert_threads > 1) |
| 218 | out_streams_size = std::min(size_t(settings.max_insert_threads), res.pipeline.getNumStreams()); |
| 219 | |
| 220 | res.pipeline.resize(out_streams_size); |
| 221 | } |
| 222 | else if (query.watch) |
| 223 | { |
| 224 | InterpreterWatchQuery interpreter_watch{ query.watch, context }; |
| 225 | res = interpreter_watch.execute(); |
| 226 | res.pipeline.init(Pipe(std::make_shared<SourceFromInputStream>(std::move(res.in)))); |
| 227 | } |
| 228 | |
| 229 | for (size_t i = 0; i < out_streams_size; i++) |
| 230 | { |
| 231 | /// We create a pipeline of several streams, into which we will write data. |
| 232 | BlockOutputStreamPtr out; |
| 233 | |
| 234 | /// NOTE: we explicitly ignore bound materialized views when inserting into Kafka Storage. |
| 235 | /// Otherwise we'll get duplicates when MV reads same rows again from Kafka. |
| 236 | if (table->noPushingToViews() && !no_destination) |
| 237 | out = table->write(query_ptr, metadata_snapshot, context); |
| 238 | else |
| 239 | out = std::make_shared<PushingToViewsBlockOutputStream>(table, metadata_snapshot, context, query_ptr, no_destination); |
| 240 | |
| 241 | /// Note that we wrap transforms one on top of another, so we write them in reverse of data processing order. |
| 242 | |
| 243 | /// Checking constraints. It must be done after calculation of all defaults, so we can check them on calculated columns. |
| 244 | if (const auto & constraints = metadata_snapshot->getConstraints(); !constraints.empty()) |
| 245 | out = std::make_shared<CheckConstraintsBlockOutputStream>( |
| 246 | query.table_id, out, out->getHeader(), metadata_snapshot->getConstraints(), context); |
| 247 | |
| 248 | /// Actually we don't know structure of input blocks from query/table, |
| 249 | /// because some clients break insertion protocol (columns != header) |
| 250 | out = std::make_shared<AddingDefaultBlockOutputStream>( |
| 251 | out, query_sample_block, out->getHeader(), metadata_snapshot->getColumns().getDefaults(), context); |
| 252 | |
| 253 | /// It's important to squash blocks as early as possible (before other transforms), |
| 254 | /// because other transforms may work inefficient if block size is small. |
| 255 | |
| 256 | /// Do not squash blocks if it is a sync INSERT into Distributed, since it lead to double bufferization on client and server side. |
| 257 | /// Client-side bufferization might cause excessive timeouts (especially in case of big blocks). |
| 258 | if (!(context.getSettingsRef().insert_distributed_sync && table->isRemote()) && !no_squash && !query.watch) |
| 259 | { |
| 260 | out = std::make_shared<SquashingBlockOutputStream>( |
| 261 | out, |
| 262 | out->getHeader(), |
| 263 | context.getSettingsRef().min_insert_block_size_rows, |
| 264 | context.getSettingsRef().min_insert_block_size_bytes); |
| 265 | } |
| 266 | |
| 267 | auto out_wrapper = std::make_shared<CountingBlockOutputStream>(out); |
| 268 | out_wrapper->setProcessListElement(context.getProcessListElement()); |
| 269 | out = std::move(out_wrapper); |
| 270 | out_streams.emplace_back(std::move(out)); |
| 271 | } |
| 272 | } |
| 273 | |
| 274 | /// What type of query: INSERT or INSERT SELECT or INSERT WATCH? |
| 275 | if (is_distributed_insert_select) |
| 276 | { |
| 277 | /// Pipeline was already built. |
| 278 | } |
| 279 | else if (query.select || query.watch) |
| 280 | { |
| 281 | const auto & header = out_streams.at(0)->getHeader(); |
| 282 | |
| 283 | res.pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr |
| 284 | { |
| 285 | return std::make_shared<ConvertingTransform>(in_header, header, |
| 286 | ConvertingTransform::MatchColumnsMode::Position); |
| 287 | }); |
| 288 | |
| 289 | res.pipeline.setSinks([&](const Block &, QueryPipeline::StreamType type) -> ProcessorPtr |
| 290 | { |
| 291 | if (type != QueryPipeline::StreamType::Main) |
| 292 | return nullptr; |
| 293 | |
| 294 | auto stream = std::move(out_streams.back()); |
| 295 | out_streams.pop_back(); |
| 296 | |
| 297 | return std::make_shared<SinkToOutputStream>(std::move(stream)); |
| 298 | }); |
| 299 | |
| 300 | if (!allow_materialized) |
| 301 | { |
| 302 | for (const auto & column : metadata_snapshot->getColumns()) |
| 303 | if (column.default_desc.kind == ColumnDefaultKind::Materialized && header.has(column.name)) |
| 304 | throw Exception("Cannot insert column " + column.name + ", because it is MATERIALIZED column.", ErrorCodes::ILLEGAL_COLUMN); |
| 305 | } |
| 306 | } |
| 307 | else if (query.data && !query.has_tail) /// can execute without additional data |
| 308 | { |
| 309 | // res.out = std::move(out_streams.at(0)); |
| 310 | res.in = std::make_shared<InputStreamFromASTInsertQuery>(query_ptr, nullptr, query_sample_block, context, nullptr); |
| 311 | res.in = std::make_shared<NullAndDoCopyBlockInputStream>(res.in, out_streams.at(0)); |
| 312 | } |
| 313 | else |
| 314 | res.out = std::move(out_streams.at(0)); |
| 315 | |
| 316 | res.pipeline.addStorageHolder(table); |
| 317 | |
| 318 | return res; |
| 319 | } |
| 320 | |
| 321 | |
| 322 | StorageID InterpreterInsertQuery::getDatabaseTable() const |
| 323 | { |
| 324 | return query_ptr->as<ASTInsertQuery &>().table_id; |
| 325 | } |
| 326 | |
| 327 | } |