sqlglot.dialects.bigquery
from __future__ import annotations

import logging
import re
import typing as t

from sqlglot import exp, generator, parser, tokens, transforms
from sqlglot._typing import E
from sqlglot.dialects.dialect import (
    Dialect,
    NormalizationStrategy,
    annotate_with_type_lambda,
    arg_max_or_min_no_count,
    binary_from_function,
    date_add_interval_sql,
    datestrtodate_sql,
    build_formatted_time,
    filter_array_using_unnest,
    if_sql,
    inline_array_unless_query,
    max_or_greatest,
    min_or_least,
    no_ilike_sql,
    build_date_delta_with_interval,
    regexp_replace_sql,
    rename_func,
    sha256_sql,
    timestrtotime_sql,
    ts_or_ds_add_cast,
    unit_to_var,
    strposition_sql,
    groupconcat_sql,
    space_sql,
)
from sqlglot.helper import seq_get, split_num_words
from sqlglot.tokens import TokenType
from sqlglot.generator import unsupported_args

if t.TYPE_CHECKING:
    from sqlglot._typing import Lit

    from sqlglot.optimizer.annotate_types import TypeAnnotator

logger = logging.getLogger("sqlglot")


JSON_EXTRACT_TYPE = t.Union[exp.JSONExtract, exp.JSONExtractScalar, exp.JSONExtractArray]

DQUOTES_ESCAPING_JSON_FUNCTIONS = ("JSON_QUERY", "JSON_VALUE", "JSON_QUERY_ARRAY")


def _derived_table_values_to_unnest(self: BigQuery.Generator, expression: exp.Values) -> str:
    if not expression.find_ancestor(exp.From, exp.Join):
        return self.values_sql(expression)

    structs = []
    alias = expression.args.get("alias")
    for tup in expression.find_all(exp.Tuple):
        field_aliases = (
            alias.columns
            if alias and alias.columns
            else (f"_c{i}" for i in range(len(tup.expressions)))
        )
        expressions = [
            exp.PropertyEQ(this=exp.to_identifier(name), expression=fld)
            for name, fld in zip(field_aliases, tup.expressions)
        ]
        structs.append(exp.Struct(expressions=expressions))

    # Due to `UNNEST_COLUMN_ONLY`, it is expected that the table alias be contained in the columns expression
    alias_name_only = exp.TableAlias(columns=[alias.this]) if alias else None
    return self.unnest_sql(
        exp.Unnest(expressions=[exp.array(*structs, copy=False)], alias=alias_name_only)
    )


def _returnsproperty_sql(self: BigQuery.Generator, expression: exp.ReturnsProperty) -> str:
    this = expression.this
    if isinstance(this, exp.Schema):
        this = f"{self.sql(this, 'this')} <{self.expressions(this)}>"
    else:
        this = self.sql(this)
    return f"RETURNS {this}"


def _create_sql(self: BigQuery.Generator, expression: exp.Create) -> str:
    returns = expression.find(exp.ReturnsProperty)
    if expression.kind == "FUNCTION" and returns and returns.args.get("is_table"):
        expression.set("kind", "TABLE FUNCTION")

    if isinstance(expression.expression, (exp.Subquery, exp.Literal)):
        expression.set("expression", expression.expression.this)

    return self.create_sql(expression)


# https://issuetracker.google.com/issues/162294746
# workaround for bigquery bug when grouping by an expression and then ordering
# WITH x AS (SELECT 1 y)
# SELECT y + 1 z
# FROM x
# GROUP BY x + 1
# ORDER by z
def _alias_ordered_group(expression: exp.Expression) -> exp.Expression:
    if isinstance(expression, exp.Select):
        group = expression.args.get("group")
        order = expression.args.get("order")

        if group and order:
            aliases = {
                select.this: select.args["alias"]
                for select in expression.selects
                if isinstance(select, exp.Alias)
            }

            for grouped in group.expressions:
                if grouped.is_int:
                    continue
                alias = aliases.get(grouped)
                if alias:
                    grouped.replace(exp.column(alias))

    return expression


def _pushdown_cte_column_names(expression: exp.Expression) -> exp.Expression:
    """BigQuery doesn't allow column names when defining a CTE, so we try to push them down."""
    if isinstance(expression, exp.CTE) and expression.alias_column_names:
        cte_query = expression.this

        if cte_query.is_star:
            logger.warning(
                "Can't push down CTE column names for star queries. Run the query through"
                " the optimizer or use 'qualify' to expand the star projections first."
            )
            return expression

        column_names = expression.alias_column_names
        expression.args["alias"].set("columns", None)

        for name, select in zip(column_names, cte_query.selects):
            to_replace = select

            if isinstance(select, exp.Alias):
                select = select.this

            # Inner aliases are shadowed by the CTE column names
            to_replace.replace(exp.alias_(select, name))

    return expression


def _build_parse_timestamp(args: t.List) -> exp.StrToTime:
    this = build_formatted_time(exp.StrToTime, "bigquery")([seq_get(args, 1), seq_get(args, 0)])
    this.set("zone", seq_get(args, 2))
    return this


def _build_timestamp(args: t.List) -> exp.Timestamp:
    timestamp = exp.Timestamp.from_arg_list(args)
    timestamp.set("with_tz", True)
    return timestamp


def _build_date(args: t.List) -> exp.Date | exp.DateFromParts:
    expr_type = exp.DateFromParts if len(args) == 3 else exp.Date
    return expr_type.from_arg_list(args)


def _build_to_hex(args: t.List) -> exp.Hex | exp.MD5:
    # TO_HEX(MD5(..)) is common in BigQuery, so it's parsed into MD5 to simplify its transpilation
    arg = seq_get(args, 0)
    return exp.MD5(this=arg.this) if isinstance(arg, exp.MD5Digest) else exp.LowerHex(this=arg)


def _array_contains_sql(self: BigQuery.Generator, expression: exp.ArrayContains) -> str:
    return self.sql(
        exp.Exists(
            this=exp.select("1")
            .from_(exp.Unnest(expressions=[expression.left]).as_("_unnest", table=["_col"]))
            .where(exp.column("_col").eq(expression.right))
        )
    )


def _ts_or_ds_add_sql(self: BigQuery.Generator, expression: exp.TsOrDsAdd) -> str:
    return date_add_interval_sql("DATE", "ADD")(self, ts_or_ds_add_cast(expression))


def _ts_or_ds_diff_sql(self: BigQuery.Generator, expression: exp.TsOrDsDiff) -> str:
    expression.this.replace(exp.cast(expression.this, exp.DataType.Type.TIMESTAMP))
    expression.expression.replace(exp.cast(expression.expression, exp.DataType.Type.TIMESTAMP))
    unit = unit_to_var(expression)
    return self.func("DATE_DIFF", expression.this, expression.expression, unit)


def _unix_to_time_sql(self: BigQuery.Generator, expression: exp.UnixToTime) -> str:
    scale = expression.args.get("scale")
    timestamp = expression.this

    if scale in (None, exp.UnixToTime.SECONDS):
        return self.func("TIMESTAMP_SECONDS", timestamp)
    if scale == exp.UnixToTime.MILLIS:
        return self.func("TIMESTAMP_MILLIS", timestamp)
    if scale == exp.UnixToTime.MICROS:
        return self.func("TIMESTAMP_MICROS", timestamp)

    unix_seconds = exp.cast(
        exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT
    )
    return self.func("TIMESTAMP_SECONDS", unix_seconds)


def _build_time(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToTime(this=args[0])
    if len(args) == 2:
        return exp.Time.from_arg_list(args)
    return exp.TimeFromParts.from_arg_list(args)


def _build_datetime(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToDatetime.from_arg_list(args)
    if len(args) == 2:
        return exp.Datetime.from_arg_list(args)
    return exp.TimestampFromParts.from_arg_list(args)


def _build_regexp_extract(
    expr_type: t.Type[E], default_group: t.Optional[exp.Expression] = None
) -> t.Callable[[t.List], E]:
    def _builder(args: t.List) -> E:
        try:
            group = re.compile(args[1].name).groups == 1
        except re.error:
            group = False

        # Default group is used for the transpilation of REGEXP_EXTRACT_ALL
        return expr_type(
            this=seq_get(args, 0),
            expression=seq_get(args, 1),
            position=seq_get(args, 2),
            occurrence=seq_get(args, 3),
            group=exp.Literal.number(1) if group else default_group,
        )

    return _builder


def _build_extract_json_with_default_path(expr_type: t.Type[E]) -> t.Callable[[t.List, Dialect], E]:
    def _builder(args: t.List, dialect: Dialect) -> E:
        if len(args) == 1:
            # The default value for the JSONPath is '$' i.e all of the data
            args.append(exp.Literal.string("$"))
        return parser.build_extract_json_with_path(expr_type)(args, dialect)

    return _builder


def _str_to_datetime_sql(
    self: BigQuery.Generator, expression: exp.StrToDate | exp.StrToTime
) -> str:
    this = self.sql(expression, "this")
    dtype = "DATE" if isinstance(expression, exp.StrToDate) else "TIMESTAMP"

    if expression.args.get("safe"):
        fmt = self.format_time(
            expression,
            self.dialect.INVERSE_FORMAT_MAPPING,
            self.dialect.INVERSE_FORMAT_TRIE,
        )
        return f"SAFE_CAST({this} AS {dtype} FORMAT {fmt})"

    fmt = self.format_time(expression)
    return self.func(f"PARSE_{dtype}", fmt, this, expression.args.get("zone"))


def _annotate_math_functions(self: TypeAnnotator, expression: E) -> E:
    """
    Many BigQuery math functions such as CEIL, FLOOR etc follow this return type convention:
    +---------+---------+---------+------------+---------+
    |  INPUT  |  INT64  | NUMERIC | BIGNUMERIC | FLOAT64 |
    +---------+---------+---------+------------+---------+
    | OUTPUT  | FLOAT64 | NUMERIC | BIGNUMERIC | FLOAT64 |
    +---------+---------+---------+------------+---------+
    """
    self._annotate_args(expression)

    this: exp.Expression = expression.this

    self._set_type(
        expression,
        exp.DataType.Type.DOUBLE if this.is_type(*exp.DataType.INTEGER_TYPES) else this.type,
    )
    return expression


@unsupported_args("ins_cost", "del_cost", "sub_cost")
def _levenshtein_sql(self: BigQuery.Generator, expression: exp.Levenshtein) -> str:
    max_dist = expression.args.get("max_dist")
    if max_dist:
        max_dist = exp.Kwarg(this=exp.var("max_distance"), expression=max_dist)

    return self.func("EDIT_DISTANCE", expression.this, expression.expression, max_dist)


def _build_levenshtein(args: t.List) -> exp.Levenshtein:
    max_dist = seq_get(args, 2)
    return exp.Levenshtein(
        this=seq_get(args, 0),
        expression=seq_get(args, 1),
        max_dist=max_dist.expression if max_dist else None,
    )


def _build_format_time(expr_type: t.Type[exp.Expression]) -> t.Callable[[t.List], exp.TimeToStr]:
    def _builder(args: t.List) -> exp.TimeToStr:
        return exp.TimeToStr(
            this=expr_type(this=seq_get(args, 1)),
            format=seq_get(args, 0),
            zone=seq_get(args, 2),
        )

    return _builder


def _build_contains_substring(args: t.List) -> exp.Contains | exp.Anonymous:
    if len(args) == 3:
        return exp.Anonymous(this="CONTAINS_SUBSTR", expressions=args)

    # Lowercase the operands in case of transpilation, as exp.Contains
    # is case-sensitive on other dialects
    this = exp.Lower(this=seq_get(args, 0))
    expr = exp.Lower(this=seq_get(args, 1))

    return exp.Contains(this=this, expression=expr)


def _json_extract_sql(self: BigQuery.Generator, expression: JSON_EXTRACT_TYPE) -> str:
    name = (expression._meta and expression.meta.get("name")) or expression.sql_name()
    upper = name.upper()

    dquote_escaping = upper in DQUOTES_ESCAPING_JSON_FUNCTIONS

    if dquote_escaping:
        self._quote_json_path_key_using_brackets = False

    sql = rename_func(upper)(self, expression)

    if dquote_escaping:
        self._quote_json_path_key_using_brackets = True

    return sql


def _annotate_concat(self: TypeAnnotator, expression: exp.Concat) -> exp.Concat:
    annotated = self._annotate_by_args(expression, "expressions")

    # Args must be BYTES or types that can be cast to STRING, return type is either BYTES or STRING
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#concat
    if not annotated.is_type(exp.DataType.Type.BINARY, exp.DataType.Type.UNKNOWN):
        annotated.type = exp.DataType.Type.VARCHAR

    return annotated


class BigQuery(Dialect):
    WEEK_OFFSET = -1
    UNNEST_COLUMN_ONLY = True
    SUPPORTS_USER_DEFINED_TYPES = False
    SUPPORTS_SEMI_ANTI_JOIN = False
    LOG_BASE_FIRST = False
    HEX_LOWERCASE = True
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    PRESERVE_ORIGINAL_NAMES = True
    HEX_STRING_IS_INTEGER_TYPE = True

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    # bigquery udfs are case sensitive
    NORMALIZE_FUNCTIONS = False

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time
    TIME_MAPPING = {
        "%D": "%m/%d/%y",
        "%E6S": "%S.%f",
        "%e": "%-d",
    }

    FORMAT_MAPPING = {
        "DD": "%d",
        "MM": "%m",
        "MON": "%b",
        "MONTH": "%B",
        "YYYY": "%Y",
        "YY": "%y",
        "HH": "%I",
        "HH12": "%I",
        "HH24": "%H",
        "MI": "%M",
        "SS": "%S",
        "SSSSS": "%f",
        "TZH": "%z",
    }

    # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement
    # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table
    PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE"}

    # All set operations require either a DISTINCT or ALL specifier
    SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None)

    # BigQuery maps Type.TIMESTAMP to DATETIME, so we need to amend the inferred types
    TYPE_TO_EXPRESSIONS = {
        **Dialect.TYPE_TO_EXPRESSIONS,
        exp.DataType.Type.TIMESTAMPTZ: Dialect.TYPE_TO_EXPRESSIONS[exp.DataType.Type.TIMESTAMP],
    }
    TYPE_TO_EXPRESSIONS.pop(exp.DataType.Type.TIMESTAMP)

    ANNOTATORS = {
        **Dialect.ANNOTATORS,
        **{
            expr_type: annotate_with_type_lambda(data_type)
            for data_type, expressions in TYPE_TO_EXPRESSIONS.items()
            for expr_type in expressions
        },
        **{
            expr_type: lambda self, e: _annotate_math_functions(self, e)
            for expr_type in (exp.Floor, exp.Ceil, exp.Log, exp.Ln, exp.Sqrt, exp.Exp, exp.Round)
        },
        **{
            expr_type: lambda self, e: self._annotate_by_args(e, "this")
            for expr_type in (
                exp.Left,
                exp.Right,
                exp.Lower,
                exp.Upper,
                exp.Pad,
                exp.Trim,
                exp.RegexpExtract,
                exp.RegexpReplace,
                exp.Repeat,
                exp.Substring,
            )
        },
        exp.ArrayConcat: lambda self, e: self._annotate_by_args(e, "this", "expressions"),
        exp.Concat: _annotate_concat,
        exp.Sign: lambda self, e: self._annotate_by_args(e, "this"),
        exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True),
    }

    def normalize_identifier(self, expression: E) -> E:
        if (
            isinstance(expression, exp.Identifier)
            and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE
        ):
            parent = expression.parent
            while isinstance(parent, exp.Dot):
                parent = parent.parent

            # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
            # by default. The following check uses a heuristic to detect tables based on whether
            # they are qualified. This should generally be correct, because tables in BigQuery
            # must be qualified with at least a dataset, unless @@dataset_id is set.
            case_sensitive = (
                isinstance(parent, exp.UserDefinedFunction)
                or (
                    isinstance(parent, exp.Table)
                    and parent.db
                    and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
                )
                or expression.meta.get("is_table")
            )
            if not case_sensitive:
                expression.set("this", expression.this.lower())

            return t.cast(E, expression)

        return super().normalize_identifier(expression)

    class Tokenizer(tokens.Tokenizer):
        QUOTES = ["'", '"', '"""', "'''"]
        COMMENTS = ["--", "#", ("/*", "*/")]
        IDENTIFIERS = ["`"]
        STRING_ESCAPES = ["\\"]

        HEX_STRINGS = [("0x", ""), ("0X", "")]

        BYTE_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
        ]

        RAW_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
        ]

        NESTED_COMMENTS = False

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            "ANY TYPE": TokenType.VARIANT,
            "BEGIN": TokenType.COMMAND,
            "BEGIN TRANSACTION": TokenType.BEGIN,
            "BYTEINT": TokenType.INT,
            "BYTES": TokenType.BINARY,
            "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
            "DATETIME": TokenType.TIMESTAMP,
            "DECLARE": TokenType.COMMAND,
            "ELSEIF": TokenType.COMMAND,
            "EXCEPTION": TokenType.COMMAND,
            "EXPORT": TokenType.EXPORT,
            "FLOAT64": TokenType.DOUBLE,
            "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
            "MODEL": TokenType.MODEL,
            "NOT DETERMINISTIC": TokenType.VOLATILE,
            "RECORD": TokenType.STRUCT,
            "TIMESTAMP": TokenType.TIMESTAMPTZ,
        }
        KEYWORDS.pop("DIV")
        KEYWORDS.pop("VALUES")
        KEYWORDS.pop("/*+")

    class Parser(parser.Parser):
        PREFIXED_PIVOT_COLUMNS = True
        LOG_DEFAULTS_TO_LN = True
        SUPPORTS_IMPLICIT_UNNEST = True
        JOINS_HAVE_EQUAL_PRECEDENCE = True

        # BigQuery does not allow ASC/DESC to be used as an identifier
        ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC}
        ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}
        TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}
        COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - {
            TokenType.ASC,
            TokenType.DESC,
        }
        UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}

        FUNCTIONS = {
            **parser.Parser.FUNCTIONS,
            "CONTAINS_SUBSTR": _build_contains_substring,
            "DATE": _build_date,
            "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
            "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
            "DATE_TRUNC": lambda args: exp.DateTrunc(
                unit=seq_get(args, 1),
                this=seq_get(args, 0),
                zone=seq_get(args, 2),
            ),
            "DATETIME": _build_datetime,
            "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd),
            "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub),
            "DIV": binary_from_function(exp.IntDiv),
            "EDIT_DISTANCE": _build_levenshtein,
            "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate),
            "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list,
            "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar),
            "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
            "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract),
            "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
            "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar),
            "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray),
            "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True),
            "MD5": exp.MD5Digest.from_arg_list,
            "TO_HEX": _build_to_hex,
            "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")(
                [seq_get(args, 1), seq_get(args, 0)]
            ),
            "PARSE_TIMESTAMP": _build_parse_timestamp,
            "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list,
            "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract),
            "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract),
            "REGEXP_EXTRACT_ALL": _build_regexp_extract(
                exp.RegexpExtractAll, default_group=exp.Literal.number(0)
            ),
            "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
            "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
            "SPLIT": lambda args: exp.Split(
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split
                this=seq_get(args, 0),
                expression=seq_get(args, 1) or exp.Literal.string(","),
            ),
            "STRPOS": exp.StrPosition.from_arg_list,
            "TIME": _build_time,
            "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd),
            "TIME_SUB": build_date_delta_with_interval(exp.TimeSub),
            "TIMESTAMP": _build_timestamp,
            "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd),
            "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub),
            "TIMESTAMP_MICROS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MICROS
            ),
            "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
            ),
            "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)),
            "TO_JSON_STRING": exp.JSONFormat.from_arg_list,
            "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime),
            "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp),
        }

        FUNCTION_PARSERS = {
            **parser.Parser.FUNCTION_PARSERS,
            "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]),
            "MAKE_INTERVAL": lambda self: self._parse_make_interval(),
"FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 609 } 610 FUNCTION_PARSERS.pop("TRIM") 611 612 NO_PAREN_FUNCTIONS = { 613 **parser.Parser.NO_PAREN_FUNCTIONS, 614 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 615 } 616 617 NESTED_TYPE_TOKENS = { 618 *parser.Parser.NESTED_TYPE_TOKENS, 619 TokenType.TABLE, 620 } 621 622 PROPERTY_PARSERS = { 623 **parser.Parser.PROPERTY_PARSERS, 624 "NOT DETERMINISTIC": lambda self: self.expression( 625 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 626 ), 627 "OPTIONS": lambda self: self._parse_with_property(), 628 } 629 630 CONSTRAINT_PARSERS = { 631 **parser.Parser.CONSTRAINT_PARSERS, 632 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 633 } 634 635 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 636 RANGE_PARSERS.pop(TokenType.OVERLAPS) 637 638 NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN} 639 640 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 641 642 STATEMENT_PARSERS = { 643 **parser.Parser.STATEMENT_PARSERS, 644 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 645 TokenType.END: lambda self: self._parse_as_command(self._prev), 646 TokenType.FOR: lambda self: self._parse_for_in(), 647 TokenType.EXPORT: lambda self: self._parse_export_data(), 648 } 649 650 BRACKET_OFFSETS = { 651 "OFFSET": (0, False), 652 "ORDINAL": (1, False), 653 "SAFE_OFFSET": (0, True), 654 "SAFE_ORDINAL": (1, True), 655 } 656 657 def _parse_for_in(self) -> exp.ForIn: 658 this = self._parse_range() 659 self._match_text_seq("DO") 660 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 661 662 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 663 this = super()._parse_table_part(schema=schema) or self._parse_number() 664 665 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 666 if isinstance(this, exp.Identifier): 667 table_name = this.name 668 while self._match(TokenType.DASH, advance=False) and self._next: 669 start = self._curr 670 while self._is_connected() and not self._match_set( 671 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 672 ): 673 self._advance() 674 675 if start == self._curr: 676 break 677 678 table_name += self._find_sql(start, self._prev) 679 680 this = exp.Identifier( 681 this=table_name, quoted=this.args.get("quoted") 682 ).update_positions(this) 683 elif isinstance(this, exp.Literal): 684 table_name = this.name 685 686 if self._is_connected() and self._parse_var(any_token=True): 687 table_name += self._prev.text 688 689 this = exp.Identifier(this=table_name, quoted=True).update_positions(this) 690 691 return this 692 693 def _parse_table_parts( 694 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 695 ) -> exp.Table: 696 table = super()._parse_table_parts( 697 schema=schema, is_db_reference=is_db_reference, wildcard=True 698 ) 699 700 # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here 701 if not table.catalog: 702 if table.db: 703 previous_db = table.args["db"] 704 parts = table.db.split(".") 705 if len(parts) == 2 and not table.args["db"].quoted: 706 table.set( 707 "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db) 708 ) 709 table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db)) 710 else: 711 previous_this = table.this 712 parts = table.name.split(".") 713 if len(parts) == 2 and not table.this.quoted: 714 table.set( 715 "db", 
                            "db", exp.Identifier(this=parts[0]).update_positions(previous_this)
                        )
                        table.set(
                            "this", exp.Identifier(this=parts[1]).update_positions(previous_this)
                        )

            if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts):
                alias = table.this
                catalog, db, this, *rest = (
                    exp.to_identifier(p, quoted=True)
                    for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
                )

                for part in (catalog, db, this):
                    if part:
                        part.update_positions(table.this)

                if rest and this:
                    this = exp.Dot.build([this, *rest])  # type: ignore

                table = exp.Table(
                    this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
                )
                table.meta["quoted_table"] = True
            else:
                alias = None

            # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
            # dataset, so if the project identifier is omitted we need to fix the ast so that
            # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier.
            # Otherwise, we wouldn't correctly qualify a `Table` node that references these
            # views, because it would seem like the "catalog" part is set, when it'd actually
            # be the region/dataset. Merging the two identifiers into a single one is done to
            # avoid producing a 4-part Table reference, which would cause issues in the schema
            # module, when there are 3-part table names mixed with information schema views.
            #
            # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
            table_parts = table.parts
            if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA":
                # We need to alias the table here to avoid breaking existing qualified columns.
                # This is expected to be safe, because if there's an actual alias coming up in
                # the token stream, it will overwrite this one. If there isn't one, we are only
                # exposing the name that can be used to reference the view explicitly (a no-op).
                exp.alias_(
                    table,
                    t.cast(exp.Identifier, alias or table_parts[-1]),
                    table=True,
                    copy=False,
                )

                info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}"
                new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions(
                    line=table_parts[-2].meta.get("line"),
                    col=table_parts[-1].meta.get("col"),
                    start=table_parts[-2].meta.get("start"),
                    end=table_parts[-1].meta.get("end"),
                )
                table.set("this", new_this)
                table.set("db", seq_get(table_parts, -3))
                table.set("catalog", seq_get(table_parts, -4))

            return table

        def _parse_column(self) -> t.Optional[exp.Expression]:
            column = super()._parse_column()
            if isinstance(column, exp.Column):
                parts = column.parts
                if any("." in p.name for p in parts):
                    catalog, db, table, this, *rest = (
                        exp.to_identifier(p, quoted=True)
                        for p in split_num_words(".".join(p.name for p in parts), ".", 4)
                    )

                    if rest and this:
                        this = exp.Dot.build([this, *rest])  # type: ignore

                    column = exp.Column(this=this, table=table, db=db, catalog=catalog)
                    column.meta["quoted_column"] = True

            return column

        @t.overload
        def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

        @t.overload
        def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...

        def _parse_json_object(self, agg=False):
            json_object = super()._parse_json_object()
            array_kv_pair = seq_get(json_object.expressions, 0)

            # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2
            if (
                array_kv_pair
                and isinstance(array_kv_pair.this, exp.Array)
                and isinstance(array_kv_pair.expression, exp.Array)
            ):
                keys = array_kv_pair.this.expressions
                values = array_kv_pair.expression.expressions

                json_object.set(
                    "expressions",
                    [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)],
                )

            return json_object

        def _parse_bracket(
            self, this: t.Optional[exp.Expression] = None
        ) -> t.Optional[exp.Expression]:
            bracket = super()._parse_bracket(this)

            if this is bracket:
                return bracket

            if isinstance(bracket, exp.Bracket):
                for expression in bracket.expressions:
                    name = expression.name.upper()

                    if name not in self.BRACKET_OFFSETS:
                        break

                    offset, safe = self.BRACKET_OFFSETS[name]
                    bracket.set("offset", offset)
                    bracket.set("safe", safe)
                    expression.replace(expression.expressions[0])

            return bracket

        def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]:
            unnest = super()._parse_unnest(with_alias=with_alias)

            if not unnest:
                return None

            unnest_expr = seq_get(unnest.expressions, 0)
            if unnest_expr:
                from sqlglot.optimizer.annotate_types import annotate_types

                unnest_expr = annotate_types(unnest_expr, dialect=self.dialect)

                # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields,
                # in contrast to other dialects such as DuckDB which flattens only the array by default
                if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any(
                    array_elem.is_type(exp.DataType.Type.STRUCT)
                    for array_elem in unnest_expr._type.expressions
                ):
                    unnest.set("explode_array", True)

            return unnest

        def _parse_make_interval(self) -> exp.MakeInterval:
            expr = exp.MakeInterval()

            for arg_key in expr.arg_types:
                value = self._parse_lambda()

                if not value:
                    break

                # Non-named arguments are filled sequentially, (optionally) followed by named arguments
                # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2)
                if isinstance(value, exp.Kwarg):
                    arg_key = value.this.name

                expr.set(arg_key, value)

                self._match(TokenType.COMMA)

            return expr

        def _parse_features_at_time(self) -> exp.FeaturesAtTime:
            expr = self.expression(
                exp.FeaturesAtTime,
                this=(self._match(TokenType.TABLE) and self._parse_table())
                or self._parse_select(nested=True),
            )

            while self._match(TokenType.COMMA):
                arg = self._parse_lambda()

                # Get the LHS of the Kwarg and set the arg to that value, e.g
                # "num_rows => 1" sets the expr's `num_rows` arg
                if arg:
                    expr.set(arg.this.name, arg)

            return expr

        def _parse_export_data(self) -> exp.Export:
            self._match_text_seq("DATA")

            return self.expression(
                exp.Export,
                connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(),
                options=self._parse_properties(),
                this=self._match_text_seq("AS") and self._parse_select(),
            )

    class Generator(generator.Generator):
        INTERVAL_ALLOWS_PLURAL_FORM = False
        JOIN_HINTS = False
        QUERY_HINTS = False
        TABLE_HINTS = False
        LIMIT_FETCH = "LIMIT"
        RENAME_TABLE_WITH_DB = False
        NVL2_SUPPORTED = False
        UNNEST_WITH_ORDINALITY = False
        COLLATE_IS_FUNC = True
        LIMIT_ONLY_LITERALS = True
        SUPPORTS_TABLE_ALIAS_COLUMNS = False
        UNPIVOT_ALIASES_ARE_IDENTIFIERS = False
        JSON_KEY_VALUE_PAIR_SEP = ","
        NULL_ORDERING_SUPPORTED = False
        IGNORE_NULLS_IN_FUNC = True
        JSON_PATH_SINGLE_QUOTE_ESCAPE = True
        CAN_IMPLEMENT_ARRAY_ANY = True
        SUPPORTS_TO_NUMBER = False
        NAMED_PLACEHOLDER_TOKEN = "@"
        HEX_FUNC = "TO_HEX"
        WITH_PROPERTIES_PREFIX = "OPTIONS"
        SUPPORTS_EXPLODING_PROJECTIONS = False
        EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False
        SUPPORTS_UNIX_SECONDS = True

        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
            exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
            exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
            exp.Array: inline_array_unless_query,
            exp.ArrayContains: _array_contains_sql,
            exp.ArrayFilter: filter_array_using_unnest,
            exp.ArrayRemove: filter_array_using_unnest,
            exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
            exp.CollateProperty: lambda self, e: (
                f"DEFAULT COLLATE {self.sql(e, 'this')}"
                if e.args.get("default")
                else f"COLLATE {self.sql(e, 'this')}"
            ),
            exp.Commit: lambda *_: "COMMIT TRANSACTION",
            exp.CountIf: rename_func("COUNTIF"),
            exp.Create: _create_sql,
            exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
            exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
            exp.DateDiff: lambda self, e: self.func(
                "DATE_DIFF", e.this, e.expression, unit_to_var(e)
            ),
            exp.DateFromParts: rename_func("DATE"),
            exp.DateStrToDate: datestrtodate_sql,
            exp.DateSub: date_add_interval_sql("DATE", "SUB"),
            exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
            exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
            exp.FromTimeZone: lambda self, e: self.func(
                "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
            ),
            exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
            exp.GroupConcat: lambda self, e: groupconcat_sql(
                self, e, func_name="STRING_AGG", within_group=False
            ),
            exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))),
            exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"),
            exp.If: if_sql(false_value="NULL"),
            exp.ILike: no_ilike_sql,
            exp.IntDiv: rename_func("DIV"),
            exp.Int64: rename_func("INT64"),
            exp.JSONExtract: _json_extract_sql,
            exp.JSONExtractArray: _json_extract_sql,
            exp.JSONExtractScalar: _json_extract_sql,
            exp.JSONFormat: rename_func("TO_JSON_STRING"),
            exp.Levenshtein: _levenshtein_sql,
            exp.Max: max_or_greatest,
            exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
            exp.MD5Digest: rename_func("MD5"),
            exp.Min: min_or_least,
            exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
            exp.RegexpExtract: lambda self, e: self.func(
                "REGEXP_EXTRACT",
                e.this,
                e.expression,
                e.args.get("position"),
                e.args.get("occurrence"),
            ),
            exp.RegexpExtractAll: lambda self, e: self.func(
                "REGEXP_EXTRACT_ALL", e.this, e.expression
            ),
            exp.RegexpReplace: regexp_replace_sql,
            exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
            exp.ReturnsProperty: _returnsproperty_sql,
            exp.Rollback: lambda *_: "ROLLBACK TRANSACTION",
            exp.Select: transforms.preprocess(
                [
                    transforms.explode_projection_to_unnest(),
                    transforms.unqualify_unnest,
                    transforms.eliminate_distinct_on,
                    _alias_ordered_group,
                    transforms.eliminate_semi_and_anti_joins,
                ]
            ),
            exp.SHA: rename_func("SHA1"),
            exp.SHA2: sha256_sql,
            exp.Space: space_sql,
            exp.StabilityProperty: lambda self, e: (
                "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC"
            ),
            exp.String: rename_func("STRING"),
            exp.StrPosition: lambda self, e: (
                strposition_sql(
                    self, e, func_name="INSTR", supports_position=True, supports_occurrence=True
                )
            ),
            exp.StrToDate: _str_to_datetime_sql,
            exp.StrToTime: _str_to_datetime_sql,
            exp.TimeAdd: date_add_interval_sql("TIME", "ADD"),
            exp.TimeFromParts: rename_func("TIME"),
            exp.TimestampFromParts: rename_func("DATETIME"),
            exp.TimeSub: date_add_interval_sql("TIME", "SUB"),
            exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"),
            exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"),
            exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"),
            exp.TimeStrToTime: timestrtotime_sql,
            exp.Transaction: lambda *_: "BEGIN TRANSACTION",
            exp.TsOrDsAdd: _ts_or_ds_add_sql,
            exp.TsOrDsDiff: _ts_or_ds_diff_sql,
            exp.TsOrDsToTime: rename_func("TIME"),
            exp.TsOrDsToDatetime: rename_func("DATETIME"),
            exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"),
            exp.Unhex: rename_func("FROM_HEX"),
            exp.UnixDate: rename_func("UNIX_DATE"),
            exp.UnixToTime: _unix_to_time_sql,
            exp.Uuid: lambda *_: "GENERATE_UUID()",
            exp.Values: _derived_table_values_to_unnest,
            exp.VariancePop: rename_func("VAR_POP"),
            exp.SafeDivide: rename_func("SAFE_DIVIDE"),
        }

        SUPPORTED_JSON_PATH_PARTS = {
            exp.JSONPathKey,
            exp.JSONPathRoot,
            exp.JSONPathSubscript,
        }

        TYPE_MAPPING = {
            **generator.Generator.TYPE_MAPPING,
            exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC",
            exp.DataType.Type.BIGINT: "INT64",
            exp.DataType.Type.BINARY: "BYTES",
            exp.DataType.Type.BLOB: "BYTES",
            exp.DataType.Type.BOOLEAN: "BOOL",
            exp.DataType.Type.CHAR: "STRING",
            exp.DataType.Type.DECIMAL: "NUMERIC",
            exp.DataType.Type.DOUBLE: "FLOAT64",
            exp.DataType.Type.FLOAT: "FLOAT64",
            exp.DataType.Type.INT: "INT64",
            exp.DataType.Type.NCHAR: "STRING",
            exp.DataType.Type.NVARCHAR: "STRING",
            exp.DataType.Type.SMALLINT: "INT64",
            exp.DataType.Type.TEXT: "STRING",
            exp.DataType.Type.TIMESTAMP: "DATETIME",
            exp.DataType.Type.TIMESTAMPNTZ: "DATETIME",
            exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP",
            exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP",
            exp.DataType.Type.TINYINT: "INT64",
            exp.DataType.Type.ROWVERSION: "BYTES",
            exp.DataType.Type.UUID: "STRING",
            exp.DataType.Type.VARBINARY: "BYTES",
            exp.DataType.Type.VARCHAR: "STRING",
            exp.DataType.Type.VARIANT: "ANY TYPE",
        }

        PROPERTIES_LOCATION = {
            **generator.Generator.PROPERTIES_LOCATION,
            exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
            exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
        }

        # WINDOW comes after QUALIFY
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause
        AFTER_HAVING_MODIFIER_TRANSFORMS = {
            "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"],
            "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"],
        }

        # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
        RESERVED_KEYWORDS = {
            "all",
            "and",
            "any",
            "array",
            "as",
            "asc",
            "assert_rows_modified",
            "at",
            "between",
            "by",
            "case",
            "cast",
            "collate",
            "contains",
            "create",
            "cross",
            "cube",
            "current",
            "default",
            "define",
            "desc",
            "distinct",
            "else",
            "end",
            "enum",
            "escape",
            "except",
            "exclude",
            "exists",
            "extract",
            "false",
            "fetch",
            "following",
            "for",
            "from",
            "full",
            "group",
            "grouping",
            "groups",
            "hash",
            "having",
            "if",
            "ignore",
            "in",
            "inner",
            "intersect",
            "interval",
            "into",
            "is",
            "join",
            "lateral",
            "left",
            "like",
            "limit",
            "lookup",
            "merge",
            "natural",
            "new",
            "no",
            "not",
            "null",
            "nulls",
            "of",
            "on",
            "or",
            "order",
            "outer",
            "over",
            "partition",
            "preceding",
            "proto",
            "qualify",
            "range",
            "recursive",
            "respect",
            "right",
            "rollup",
            "rows",
            "select",
            "set",
            "some",
            "struct",
            "tablesample",
            "then",
            "to",
            "treat",
            "true",
            "unbounded",
            "union",
            "unnest",
            "using",
            "when",
            "where",
            "window",
            "with",
            "within",
        }

        def datetrunc_sql(self, expression: exp.DateTrunc) -> str:
            unit = expression.unit
            unit_sql = unit.name if unit.is_string else self.sql(unit)
            return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone"))

        def mod_sql(self, expression: exp.Mod) -> str:
            this = expression.this
            expr = expression.expression
            return self.func(
                "MOD",
                this.unnest() if isinstance(this, exp.Paren) else this,
                expr.unnest() if isinstance(expr, exp.Paren) else expr,
            )

        def column_parts(self, expression: exp.Column) -> str:
            if expression.meta.get("quoted_column"):
                # If a column reference is of the form `dataset.table`.name, we need
                # to preserve the quoted table path, otherwise the reference breaks
                table_parts = ".".join(p.name for p in expression.parts[:-1])
                table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
                return f"{table_path}.{self.sql(expression, 'this')}"

            return super().column_parts(expression)

        def table_parts(self, expression: exp.Table) -> str:
            # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
            # we need to make sure the correct quoting is used in each case.
            #
            # For example, if there is a CTE x that clashes with a schema name, then the former will
            # return the table y in that schema, whereas the latter will return the CTE's y column:
            #
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
            if expression.meta.get("quoted_table"):
                table_parts = ".".join(p.name for p in expression.parts)
                return self.sql(exp.Identifier(this=table_parts, quoted=True))

            return super().table_parts(expression)

        def timetostr_sql(self, expression: exp.TimeToStr) -> str:
            this = expression.this
            if isinstance(this, exp.TsOrDsToDatetime):
                func_name = "FORMAT_DATETIME"
            elif isinstance(this, exp.TsOrDsToTimestamp):
                func_name = "FORMAT_TIMESTAMP"
            else:
                func_name = "FORMAT_DATE"

            time_expr = (
                this
                if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate))
                else expression
            )
            return self.func(
                func_name, self.format_time(expression), time_expr.this, expression.args.get("zone")
            )

        def eq_sql(self, expression: exp.EQ) -> str:
            # Operands of = cannot be NULL in BigQuery
            if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
                if not isinstance(expression.parent, exp.Update):
                    return "NULL"

            return self.binary(expression, "=")

        def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
            parent = expression.parent

            # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
            # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
            if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
                return self.func(
                    "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
                )

            return super().attimezone_sql(expression)

        def trycast_sql(self, expression: exp.TryCast) -> str:
            return self.cast_sql(expression, safe_prefix="SAFE_")

        def bracket_sql(self, expression: exp.Bracket) -> str:
            this = expression.this
            expressions = expression.expressions

            if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
                arg = expressions[0]
                if arg.type is None:
                    from sqlglot.optimizer.annotate_types import annotate_types

                    arg = annotate_types(arg, dialect=self.dialect)

                if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
                    # BQ doesn't support bracket syntax with string values for structs
                    return f"{self.sql(this)}.{arg.name}"

            expressions_sql = self.expressions(expression, flat=True)
            offset = expression.args.get("offset")

            if offset == 0:
                expressions_sql = f"OFFSET({expressions_sql})"
            elif offset == 1:
                expressions_sql = f"ORDINAL({expressions_sql})"
            elif offset is not None:
                self.unsupported(f"Unsupported array offset: {offset}")

            if expression.args.get("safe"):
                expressions_sql = f"SAFE_{expressions_sql}"

            return f"{self.sql(this)}[{expressions_sql}]"

        def in_unnest_op(self, expression: exp.Unnest) -> str:
            return self.sql(expression)

        def version_sql(self, expression: exp.Version) -> str:
            if expression.name == "TIMESTAMP":
                expression.set("this", "SYSTEM_TIME")
            return super().version_sql(expression)

        def contains_sql(self, expression: exp.Contains) -> str:
            this = expression.this
            expr = expression.expression

            if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower):
                this = this.this
                expr = expr.this

            return self.func("CONTAINS_SUBSTR", this, expr)

        def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str:
            this = expression.this

            # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3]
            # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions,
            # because they aren't literals and so the above syntax is invalid BigQuery.
            if isinstance(this, exp.Array):
                elem = seq_get(this.expressions, 0)
                if not (elem and elem.find(exp.Query)):
                    return f"{self.sql(expression, 'to')}{self.sql(this)}"

            return super().cast_sql(expression, safe_prefix=safe_prefix)
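
For context, the dialect defined above is registered under the name "bigquery" and is normally used through sqlglot's top-level API rather than instantiated directly. A minimal usage sketch follows; the table and column names are placeholders and the generated SQL shown in comments is illustrative only, since exact output depends on the sqlglot version.

import sqlglot

# Parse BigQuery SQL into a syntax tree. Backtick identifiers and dashed
# project names are handled by the Tokenizer/Parser defined in this module.
ast = sqlglot.parse_one(
    "SELECT TO_HEX(MD5(name)) FROM `my-project.dataset.users`", read="bigquery"
)

# TO_HEX(MD5(..)) is parsed into exp.MD5 (see _build_to_hex above), so it can be
# transpiled to other dialects, e.g. DuckDB.
print(sqlglot.transpile("SELECT TO_HEX(MD5(name)) FROM users", read="bigquery", write="duckdb")[0])

# Generate BigQuery SQL back from the parsed tree.
print(ast.sql(dialect="bigquery"))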
JOIN_HINTS = False 918 QUERY_HINTS = False 919 TABLE_HINTS = False 920 LIMIT_FETCH = "LIMIT" 921 RENAME_TABLE_WITH_DB = False 922 NVL2_SUPPORTED = False 923 UNNEST_WITH_ORDINALITY = False 924 COLLATE_IS_FUNC = True 925 LIMIT_ONLY_LITERALS = True 926 SUPPORTS_TABLE_ALIAS_COLUMNS = False 927 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 928 JSON_KEY_VALUE_PAIR_SEP = "," 929 NULL_ORDERING_SUPPORTED = False 930 IGNORE_NULLS_IN_FUNC = True 931 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 932 CAN_IMPLEMENT_ARRAY_ANY = True 933 SUPPORTS_TO_NUMBER = False 934 NAMED_PLACEHOLDER_TOKEN = "@" 935 HEX_FUNC = "TO_HEX" 936 WITH_PROPERTIES_PREFIX = "OPTIONS" 937 SUPPORTS_EXPLODING_PROJECTIONS = False 938 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 939 SUPPORTS_UNIX_SECONDS = True 940 941 TRANSFORMS = { 942 **generator.Generator.TRANSFORMS, 943 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 944 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 945 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 946 exp.Array: inline_array_unless_query, 947 exp.ArrayContains: _array_contains_sql, 948 exp.ArrayFilter: filter_array_using_unnest, 949 exp.ArrayRemove: filter_array_using_unnest, 950 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 951 exp.CollateProperty: lambda self, e: ( 952 f"DEFAULT COLLATE {self.sql(e, 'this')}" 953 if e.args.get("default") 954 else f"COLLATE {self.sql(e, 'this')}" 955 ), 956 exp.Commit: lambda *_: "COMMIT TRANSACTION", 957 exp.CountIf: rename_func("COUNTIF"), 958 exp.Create: _create_sql, 959 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 960 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 961 exp.DateDiff: lambda self, e: self.func( 962 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 963 ), 964 exp.DateFromParts: rename_func("DATE"), 965 exp.DateStrToDate: datestrtodate_sql, 966 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 967 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 968 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 969 exp.FromTimeZone: lambda self, e: self.func( 970 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 971 ), 972 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 973 exp.GroupConcat: lambda self, e: groupconcat_sql( 974 self, e, func_name="STRING_AGG", within_group=False 975 ), 976 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 977 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 978 exp.If: if_sql(false_value="NULL"), 979 exp.ILike: no_ilike_sql, 980 exp.IntDiv: rename_func("DIV"), 981 exp.Int64: rename_func("INT64"), 982 exp.JSONExtract: _json_extract_sql, 983 exp.JSONExtractArray: _json_extract_sql, 984 exp.JSONExtractScalar: _json_extract_sql, 985 exp.JSONFormat: rename_func("TO_JSON_STRING"), 986 exp.Levenshtein: _levenshtein_sql, 987 exp.Max: max_or_greatest, 988 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 989 exp.MD5Digest: rename_func("MD5"), 990 exp.Min: min_or_least, 991 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 992 exp.RegexpExtract: lambda self, e: self.func( 993 "REGEXP_EXTRACT", 994 e.this, 995 e.expression, 996 e.args.get("position"), 997 e.args.get("occurrence"), 998 ), 999 exp.RegexpExtractAll: lambda self, e: self.func( 1000 "REGEXP_EXTRACT_ALL", e.this, e.expression 1001 ), 1002 exp.RegexpReplace: regexp_replace_sql, 1003 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1004 exp.ReturnsProperty: 
_returnsproperty_sql, 1005 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1006 exp.Select: transforms.preprocess( 1007 [ 1008 transforms.explode_projection_to_unnest(), 1009 transforms.unqualify_unnest, 1010 transforms.eliminate_distinct_on, 1011 _alias_ordered_group, 1012 transforms.eliminate_semi_and_anti_joins, 1013 ] 1014 ), 1015 exp.SHA: rename_func("SHA1"), 1016 exp.SHA2: sha256_sql, 1017 exp.Space: space_sql, 1018 exp.StabilityProperty: lambda self, e: ( 1019 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1020 ), 1021 exp.String: rename_func("STRING"), 1022 exp.StrPosition: lambda self, e: ( 1023 strposition_sql( 1024 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1025 ) 1026 ), 1027 exp.StrToDate: _str_to_datetime_sql, 1028 exp.StrToTime: _str_to_datetime_sql, 1029 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1030 exp.TimeFromParts: rename_func("TIME"), 1031 exp.TimestampFromParts: rename_func("DATETIME"), 1032 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1033 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1034 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1035 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1036 exp.TimeStrToTime: timestrtotime_sql, 1037 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1038 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1039 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1040 exp.TsOrDsToTime: rename_func("TIME"), 1041 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1042 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1043 exp.Unhex: rename_func("FROM_HEX"), 1044 exp.UnixDate: rename_func("UNIX_DATE"), 1045 exp.UnixToTime: _unix_to_time_sql, 1046 exp.Uuid: lambda *_: "GENERATE_UUID()", 1047 exp.Values: _derived_table_values_to_unnest, 1048 exp.VariancePop: rename_func("VAR_POP"), 1049 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1050 } 1051 1052 SUPPORTED_JSON_PATH_PARTS = { 1053 exp.JSONPathKey, 1054 exp.JSONPathRoot, 1055 exp.JSONPathSubscript, 1056 } 1057 1058 TYPE_MAPPING = { 1059 **generator.Generator.TYPE_MAPPING, 1060 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1061 exp.DataType.Type.BIGINT: "INT64", 1062 exp.DataType.Type.BINARY: "BYTES", 1063 exp.DataType.Type.BLOB: "BYTES", 1064 exp.DataType.Type.BOOLEAN: "BOOL", 1065 exp.DataType.Type.CHAR: "STRING", 1066 exp.DataType.Type.DECIMAL: "NUMERIC", 1067 exp.DataType.Type.DOUBLE: "FLOAT64", 1068 exp.DataType.Type.FLOAT: "FLOAT64", 1069 exp.DataType.Type.INT: "INT64", 1070 exp.DataType.Type.NCHAR: "STRING", 1071 exp.DataType.Type.NVARCHAR: "STRING", 1072 exp.DataType.Type.SMALLINT: "INT64", 1073 exp.DataType.Type.TEXT: "STRING", 1074 exp.DataType.Type.TIMESTAMP: "DATETIME", 1075 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1076 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1077 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1078 exp.DataType.Type.TINYINT: "INT64", 1079 exp.DataType.Type.ROWVERSION: "BYTES", 1080 exp.DataType.Type.UUID: "STRING", 1081 exp.DataType.Type.VARBINARY: "BYTES", 1082 exp.DataType.Type.VARCHAR: "STRING", 1083 exp.DataType.Type.VARIANT: "ANY TYPE", 1084 } 1085 1086 PROPERTIES_LOCATION = { 1087 **generator.Generator.PROPERTIES_LOCATION, 1088 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1089 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1090 } 1091 1092 # WINDOW comes after QUALIFY 1093 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1094 AFTER_HAVING_MODIFIER_TRANSFORMS = { 1095 "qualify": 
generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1096 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1097 } 1098 1099 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1100 RESERVED_KEYWORDS = { 1101 "all", 1102 "and", 1103 "any", 1104 "array", 1105 "as", 1106 "asc", 1107 "assert_rows_modified", 1108 "at", 1109 "between", 1110 "by", 1111 "case", 1112 "cast", 1113 "collate", 1114 "contains", 1115 "create", 1116 "cross", 1117 "cube", 1118 "current", 1119 "default", 1120 "define", 1121 "desc", 1122 "distinct", 1123 "else", 1124 "end", 1125 "enum", 1126 "escape", 1127 "except", 1128 "exclude", 1129 "exists", 1130 "extract", 1131 "false", 1132 "fetch", 1133 "following", 1134 "for", 1135 "from", 1136 "full", 1137 "group", 1138 "grouping", 1139 "groups", 1140 "hash", 1141 "having", 1142 "if", 1143 "ignore", 1144 "in", 1145 "inner", 1146 "intersect", 1147 "interval", 1148 "into", 1149 "is", 1150 "join", 1151 "lateral", 1152 "left", 1153 "like", 1154 "limit", 1155 "lookup", 1156 "merge", 1157 "natural", 1158 "new", 1159 "no", 1160 "not", 1161 "null", 1162 "nulls", 1163 "of", 1164 "on", 1165 "or", 1166 "order", 1167 "outer", 1168 "over", 1169 "partition", 1170 "preceding", 1171 "proto", 1172 "qualify", 1173 "range", 1174 "recursive", 1175 "respect", 1176 "right", 1177 "rollup", 1178 "rows", 1179 "select", 1180 "set", 1181 "some", 1182 "struct", 1183 "tablesample", 1184 "then", 1185 "to", 1186 "treat", 1187 "true", 1188 "unbounded", 1189 "union", 1190 "unnest", 1191 "using", 1192 "when", 1193 "where", 1194 "window", 1195 "with", 1196 "within", 1197 } 1198 1199 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1200 unit = expression.unit 1201 unit_sql = unit.name if unit.is_string else self.sql(unit) 1202 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1203 1204 def mod_sql(self, expression: exp.Mod) -> str: 1205 this = expression.this 1206 expr = expression.expression 1207 return self.func( 1208 "MOD", 1209 this.unnest() if isinstance(this, exp.Paren) else this, 1210 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1211 ) 1212 1213 def column_parts(self, expression: exp.Column) -> str: 1214 if expression.meta.get("quoted_column"): 1215 # If a column reference is of the form `dataset.table`.name, we need 1216 # to preserve the quoted table path, otherwise the reference breaks 1217 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1218 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1219 return f"{table_path}.{self.sql(expression, 'this')}" 1220 1221 return super().column_parts(expression) 1222 1223 def table_parts(self, expression: exp.Table) -> str: 1224 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1225 # we need to make sure the correct quoting is used in each case. 
1226 # 1227 # For example, if there is a CTE x that clashes with a schema name, then the former will 1228 # return the table y in that schema, whereas the latter will return the CTE's y column: 1229 # 1230 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1231 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1232 if expression.meta.get("quoted_table"): 1233 table_parts = ".".join(p.name for p in expression.parts) 1234 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1235 1236 return super().table_parts(expression) 1237 1238 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1239 this = expression.this 1240 if isinstance(this, exp.TsOrDsToDatetime): 1241 func_name = "FORMAT_DATETIME" 1242 elif isinstance(this, exp.TsOrDsToTimestamp): 1243 func_name = "FORMAT_TIMESTAMP" 1244 else: 1245 func_name = "FORMAT_DATE" 1246 1247 time_expr = ( 1248 this 1249 if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate)) 1250 else expression 1251 ) 1252 return self.func( 1253 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1254 ) 1255 1256 def eq_sql(self, expression: exp.EQ) -> str: 1257 # Operands of = cannot be NULL in BigQuery 1258 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1259 if not isinstance(expression.parent, exp.Update): 1260 return "NULL" 1261 1262 return self.binary(expression, "=") 1263 1264 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1265 parent = expression.parent 1266 1267 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1268 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1269 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1270 return self.func( 1271 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1272 ) 1273 1274 return super().attimezone_sql(expression) 1275 1276 def trycast_sql(self, expression: exp.TryCast) -> str: 1277 return self.cast_sql(expression, safe_prefix="SAFE_") 1278 1279 def bracket_sql(self, expression: exp.Bracket) -> str: 1280 this = expression.this 1281 expressions = expression.expressions 1282 1283 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1284 arg = expressions[0] 1285 if arg.type is None: 1286 from sqlglot.optimizer.annotate_types import annotate_types 1287 1288 arg = annotate_types(arg, dialect=self.dialect) 1289 1290 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1291 # BQ doesn't support bracket syntax with string values for structs 1292 return f"{self.sql(this)}.{arg.name}" 1293 1294 expressions_sql = self.expressions(expression, flat=True) 1295 offset = expression.args.get("offset") 1296 1297 if offset == 0: 1298 expressions_sql = f"OFFSET({expressions_sql})" 1299 elif offset == 1: 1300 expressions_sql = f"ORDINAL({expressions_sql})" 1301 elif offset is not None: 1302 self.unsupported(f"Unsupported array offset: {offset}") 1303 1304 if expression.args.get("safe"): 1305 expressions_sql = f"SAFE_{expressions_sql}" 1306 1307 return f"{self.sql(this)}[{expressions_sql}]" 1308 1309 def in_unnest_op(self, expression: exp.Unnest) -> str: 1310 return self.sql(expression) 1311 1312 def version_sql(self, expression: exp.Version) -> str: 1313 if expression.name == "TIMESTAMP": 1314 expression.set("this", "SYSTEM_TIME") 1315 return super().version_sql(expression) 1316 1317 def contains_sql(self, 
expression: exp.Contains) -> str: 1318 this = expression.this 1319 expr = expression.expression 1320 1321 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1322 this = this.this 1323 expr = expr.this 1324 1325 return self.func("CONTAINS_SUBSTR", this, expr) 1326 1327 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1328 this = expression.this 1329 1330 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1331 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1332 # because they aren't literals and so the above syntax is invalid BigQuery. 1333 if isinstance(this, exp.Array): 1334 elem = seq_get(this.expressions, 0) 1335 if not (elem and elem.find(exp.Query)): 1336 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1337 1338 return super().cast_sql(expression, safe_prefix=safe_prefix)
First day of the week in DATE_TRUNC(week). Defaults to 0 (Monday). -1 would be Sunday.
Whether the base comes first in the LOG function.
Possible values: True, False, None (two arguments are not supported by LOG).
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).
For example:
WITH data AS (SELECT 1 AS id, 2 AS my_id) SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1
In most dialects, "my_id" would refer to "data.my_id" across the query, except:
- BigQuery, which forwards the alias to the GROUP BY and HAVING clauses, i.e. it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1"
- ClickHouse, which forwards the alias across the whole query, i.e. it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
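A minimal sketch of how this surfaces through the optimizer; sqlglot.optimizer.qualify.qualify and the sample query are used purely for illustration, and the exact output depends on the installed sqlglot version:

import sqlglot
from sqlglot.optimizer.qualify import qualify

sql = (
    "WITH data AS (SELECT 1 AS id, 2 AS my_id) "
    "SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1"
)

# With dialect="bigquery", alias expansion is expected to rewrite the GROUP BY and
# HAVING references to the aliased expression (id), while WHERE keeps data.my_id.
print(qualify(sqlglot.parse_one(sql, read="bigquery"), dialect="bigquery").sql("bigquery"))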
Whether the name of the function should be preserved inside the node's metadata. This can be useful for roundtripping deprecated vs. new functions that share an AST node, e.g. JSON_VALUE vs. JSON_EXTRACT_SCALAR in BigQuery.
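As a hedged illustration (the output is printed rather than asserted, since the exact roundtrip depends on the sqlglot version):

import sqlglot

# JSON_VALUE and JSON_EXTRACT_SCALAR both parse into the same AST node
# (exp.JSONExtractScalar); with name preservation, the BigQuery generator is
# expected to keep whichever spelling was originally written.
for sql in ("SELECT JSON_VALUE(data, '$.x')", "SELECT JSON_EXTRACT_SCALAR(data, '$.x')"):
    print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])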
Whether hex strings such as x'CC' evaluate to integer or binary/blob type
Specifies the strategy according to which identifiers should be normalized.
Determines how function names are going to be normalized.
Possible values:
- "upper" or True: Convert names to uppercase.
- "lower": Convert names to lowercase.
- False: Disables function name normalization.
Associates this dialect's time formats with their equivalent Python strftime formats.
Helper which is used for parsing the special syntax CAST(x AS DATE FORMAT 'yyyy').
If empty, the corresponding trie will be constructed off of TIME_MAPPING.
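A small, hedged sketch of the effect: the %-style tokens in a BigQuery format string are mapped through TIME_MAPPING when generating SQL for another dialect (DuckDB is only an example target, and the exact output depends on the sqlglot version):

import sqlglot

# The BigQuery format string is translated into the target dialect's equivalent
# time-format tokens and function; the result is printed rather than asserted.
print(sqlglot.transpile("SELECT PARSE_DATE('%Y%m%d', '20240101')", read="bigquery", write="duckdb")[0])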
Columns that are auto-generated by the engine corresponding to this dialect.
For example, such columns may be excluded from SELECT * queries.
Whether a set operation uses DISTINCT by default. This is None when either DISTINCT or ALL must be explicitly specified.
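For instance (a hedged sketch; the expected keyword is noted in the comment rather than asserted):

import sqlglot

# BigQuery requires an explicit DISTINCT or ALL on set operations, so a bare UNION
# coming from a dialect where DISTINCT is the default is expected to be rendered as
# UNION DISTINCT when generating BigQuery SQL.
print(sqlglot.transpile("SELECT 1 UNION SELECT 2", read="postgres", write="bigquery")[0])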
455 def normalize_identifier(self, expression: E) -> E: 456 if ( 457 isinstance(expression, exp.Identifier) 458 and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE 459 ): 460 parent = expression.parent 461 while isinstance(parent, exp.Dot): 462 parent = parent.parent 463 464 # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive 465 # by default. The following check uses a heuristic to detect tables based on whether 466 # they are qualified. This should generally be correct, because tables in BigQuery 467 # must be qualified with at least a dataset, unless @@dataset_id is set. 468 case_sensitive = ( 469 isinstance(parent, exp.UserDefinedFunction) 470 or ( 471 isinstance(parent, exp.Table) 472 and parent.db 473 and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column")) 474 ) 475 or expression.meta.get("is_table") 476 ) 477 if not case_sensitive: 478 expression.set("this", expression.this.lower()) 479 480 return t.cast(E, expression) 481 482 return super().normalize_identifier(expression)
Transforms an identifier in a way that resembles how it'd be resolved by this dialect.
For example, an identifier like FoO would be resolved as foo in Postgres, because it lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so it would resolve it as FOO. If it was quoted, it'd need to be treated as case-sensitive, and so any normalization would be prohibited in order to avoid "breaking" the identifier.
There are also dialects like Spark, which are case-insensitive even when quotes are present, and dialects like MySQL, whose resolution rules match those employed by the underlying operating system; for example, they may always be case-sensitive in Linux.
Finally, the normalization behavior of some engines can even be controlled through flags, like in Redshift's case, where users can explicitly set enable_case_sensitive_identifier.
SQLGlot aims to understand and handle all of these different behaviors gracefully, so that it can analyze queries in the optimizer and successfully capture their semantics.
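A minimal sketch of the behavior described above (Snowflake is included only for contrast):

from sqlglot import exp
from sqlglot.dialects.bigquery import BigQuery
from sqlglot.dialects.snowflake import Snowflake

# BigQuery resolves unquoted, unqualified (non-table) identifiers case-insensitively
# and lowercases them; Snowflake resolves unquoted identifiers as uppercase.
print(BigQuery().normalize_identifier(exp.to_identifier("FoO")).name)    # expected: foo
print(Snowflake().normalize_identifier(exp.to_identifier("FoO")).name)   # expected: FOO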
Mapping of an escaped sequence (e.g. the two-character string "\n") to its unescaped version (e.g. an actual newline character).
484 class Tokenizer(tokens.Tokenizer): 485 QUOTES = ["'", '"', '"""', "'''"] 486 COMMENTS = ["--", "#", ("/*", "*/")] 487 IDENTIFIERS = ["`"] 488 STRING_ESCAPES = ["\\"] 489 490 HEX_STRINGS = [("0x", ""), ("0X", "")] 491 492 BYTE_STRINGS = [ 493 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 494 ] 495 496 RAW_STRINGS = [ 497 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 498 ] 499 500 NESTED_COMMENTS = False 501 502 KEYWORDS = { 503 **tokens.Tokenizer.KEYWORDS, 504 "ANY TYPE": TokenType.VARIANT, 505 "BEGIN": TokenType.COMMAND, 506 "BEGIN TRANSACTION": TokenType.BEGIN, 507 "BYTEINT": TokenType.INT, 508 "BYTES": TokenType.BINARY, 509 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 510 "DATETIME": TokenType.TIMESTAMP, 511 "DECLARE": TokenType.COMMAND, 512 "ELSEIF": TokenType.COMMAND, 513 "EXCEPTION": TokenType.COMMAND, 514 "EXPORT": TokenType.EXPORT, 515 "FLOAT64": TokenType.DOUBLE, 516 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 517 "MODEL": TokenType.MODEL, 518 "NOT DETERMINISTIC": TokenType.VOLATILE, 519 "RECORD": TokenType.STRUCT, 520 "TIMESTAMP": TokenType.TIMESTAMPTZ, 521 } 522 KEYWORDS.pop("DIV") 523 KEYWORDS.pop("VALUES") 524 KEYWORDS.pop("/*+")
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- SINGLE_TOKENS
- BIT_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- VAR_SINGLE_TOKENS
- IDENTIFIER_ESCAPES
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
526 class Parser(parser.Parser): 527 PREFIXED_PIVOT_COLUMNS = True 528 LOG_DEFAULTS_TO_LN = True 529 SUPPORTS_IMPLICIT_UNNEST = True 530 JOINS_HAVE_EQUAL_PRECEDENCE = True 531 532 # BigQuery does not allow ASC/DESC to be used as an identifier 533 ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC} 534 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 535 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 536 COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - { 537 TokenType.ASC, 538 TokenType.DESC, 539 } 540 UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 541 542 FUNCTIONS = { 543 **parser.Parser.FUNCTIONS, 544 "CONTAINS_SUBSTR": _build_contains_substring, 545 "DATE": _build_date, 546 "DATE_ADD": build_date_delta_with_interval(exp.DateAdd), 547 "DATE_SUB": build_date_delta_with_interval(exp.DateSub), 548 "DATE_TRUNC": lambda args: exp.DateTrunc( 549 unit=seq_get(args, 1), 550 this=seq_get(args, 0), 551 zone=seq_get(args, 2), 552 ), 553 "DATETIME": _build_datetime, 554 "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd), 555 "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub), 556 "DIV": binary_from_function(exp.IntDiv), 557 "EDIT_DISTANCE": _build_levenshtein, 558 "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 559 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 560 "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 561 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 562 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 563 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 564 "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar), 565 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 566 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 567 "MD5": exp.MD5Digest.from_arg_list, 568 "TO_HEX": _build_to_hex, 569 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 570 [seq_get(args, 1), seq_get(args, 0)] 571 ), 572 "PARSE_TIMESTAMP": _build_parse_timestamp, 573 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 574 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 575 "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract), 576 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 577 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 578 ), 579 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 580 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 581 "SPLIT": lambda args: exp.Split( 582 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 583 this=seq_get(args, 0), 584 expression=seq_get(args, 1) or exp.Literal.string(","), 585 ), 586 "STRPOS": exp.StrPosition.from_arg_list, 587 "TIME": _build_time, 588 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 589 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 590 "TIMESTAMP": _build_timestamp, 591 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 592 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 593 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 594 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 595 ), 596 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 597 this=seq_get(args, 0), 
scale=exp.UnixToTime.MILLIS 598 ), 599 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 600 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 601 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 602 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 603 } 604 605 FUNCTION_PARSERS = { 606 **parser.Parser.FUNCTION_PARSERS, 607 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 608 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 609 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 610 } 611 FUNCTION_PARSERS.pop("TRIM") 612 613 NO_PAREN_FUNCTIONS = { 614 **parser.Parser.NO_PAREN_FUNCTIONS, 615 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 616 } 617 618 NESTED_TYPE_TOKENS = { 619 *parser.Parser.NESTED_TYPE_TOKENS, 620 TokenType.TABLE, 621 } 622 623 PROPERTY_PARSERS = { 624 **parser.Parser.PROPERTY_PARSERS, 625 "NOT DETERMINISTIC": lambda self: self.expression( 626 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 627 ), 628 "OPTIONS": lambda self: self._parse_with_property(), 629 } 630 631 CONSTRAINT_PARSERS = { 632 **parser.Parser.CONSTRAINT_PARSERS, 633 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 634 } 635 636 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 637 RANGE_PARSERS.pop(TokenType.OVERLAPS) 638 639 NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN} 640 641 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 642 643 STATEMENT_PARSERS = { 644 **parser.Parser.STATEMENT_PARSERS, 645 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 646 TokenType.END: lambda self: self._parse_as_command(self._prev), 647 TokenType.FOR: lambda self: self._parse_for_in(), 648 TokenType.EXPORT: lambda self: self._parse_export_data(), 649 } 650 651 BRACKET_OFFSETS = { 652 "OFFSET": (0, False), 653 "ORDINAL": (1, False), 654 "SAFE_OFFSET": (0, True), 655 "SAFE_ORDINAL": (1, True), 656 } 657 658 def _parse_for_in(self) -> exp.ForIn: 659 this = self._parse_range() 660 self._match_text_seq("DO") 661 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 662 663 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 664 this = super()._parse_table_part(schema=schema) or self._parse_number() 665 666 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 667 if isinstance(this, exp.Identifier): 668 table_name = this.name 669 while self._match(TokenType.DASH, advance=False) and self._next: 670 start = self._curr 671 while self._is_connected() and not self._match_set( 672 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 673 ): 674 self._advance() 675 676 if start == self._curr: 677 break 678 679 table_name += self._find_sql(start, self._prev) 680 681 this = exp.Identifier( 682 this=table_name, quoted=this.args.get("quoted") 683 ).update_positions(this) 684 elif isinstance(this, exp.Literal): 685 table_name = this.name 686 687 if self._is_connected() and self._parse_var(any_token=True): 688 table_name += self._prev.text 689 690 this = exp.Identifier(this=table_name, quoted=True).update_positions(this) 691 692 return this 693 694 def _parse_table_parts( 695 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 696 ) -> exp.Table: 697 table = super()._parse_table_parts( 698 schema=schema, is_db_reference=is_db_reference, wildcard=True 699 ) 700 701 # proj-1.db.tbl -- `1.` is tokenized as a float so we 
need to unravel it here 702 if not table.catalog: 703 if table.db: 704 previous_db = table.args["db"] 705 parts = table.db.split(".") 706 if len(parts) == 2 and not table.args["db"].quoted: 707 table.set( 708 "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db) 709 ) 710 table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db)) 711 else: 712 previous_this = table.this 713 parts = table.name.split(".") 714 if len(parts) == 2 and not table.this.quoted: 715 table.set( 716 "db", exp.Identifier(this=parts[0]).update_positions(previous_this) 717 ) 718 table.set( 719 "this", exp.Identifier(this=parts[1]).update_positions(previous_this) 720 ) 721 722 if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts): 723 alias = table.this 724 catalog, db, this, *rest = ( 725 exp.to_identifier(p, quoted=True) 726 for p in split_num_words(".".join(p.name for p in table.parts), ".", 3) 727 ) 728 729 for part in (catalog, db, this): 730 if part: 731 part.update_positions(table.this) 732 733 if rest and this: 734 this = exp.Dot.build([this, *rest]) # type: ignore 735 736 table = exp.Table( 737 this=this, db=db, catalog=catalog, pivots=table.args.get("pivots") 738 ) 739 table.meta["quoted_table"] = True 740 else: 741 alias = None 742 743 # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or 744 # dataset, so if the project identifier is omitted we need to fix the ast so that 745 # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier. 746 # Otherwise, we wouldn't correctly qualify a `Table` node that references these 747 # views, because it would seem like the "catalog" part is set, when it'd actually 748 # be the region/dataset. Merging the two identifiers into a single one is done to 749 # avoid producing a 4-part Table reference, which would cause issues in the schema 750 # module, when there are 3-part table names mixed with information schema views. 751 # 752 # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax 753 table_parts = table.parts 754 if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA": 755 # We need to alias the table here to avoid breaking existing qualified columns. 756 # This is expected to be safe, because if there's an actual alias coming up in 757 # the token stream, it will overwrite this one. If there isn't one, we are only 758 # exposing the name that can be used to reference the view explicitly (a no-op). 759 exp.alias_( 760 table, 761 t.cast(exp.Identifier, alias or table_parts[-1]), 762 table=True, 763 copy=False, 764 ) 765 766 info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}" 767 new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions( 768 line=table_parts[-2].meta.get("line"), 769 col=table_parts[-1].meta.get("col"), 770 start=table_parts[-2].meta.get("start"), 771 end=table_parts[-1].meta.get("end"), 772 ) 773 table.set("this", new_this) 774 table.set("db", seq_get(table_parts, -3)) 775 table.set("catalog", seq_get(table_parts, -4)) 776 777 return table 778 779 def _parse_column(self) -> t.Optional[exp.Expression]: 780 column = super()._parse_column() 781 if isinstance(column, exp.Column): 782 parts = column.parts 783 if any("." 
in p.name for p in parts): 784 catalog, db, table, this, *rest = ( 785 exp.to_identifier(p, quoted=True) 786 for p in split_num_words(".".join(p.name for p in parts), ".", 4) 787 ) 788 789 if rest and this: 790 this = exp.Dot.build([this, *rest]) # type: ignore 791 792 column = exp.Column(this=this, table=table, db=db, catalog=catalog) 793 column.meta["quoted_column"] = True 794 795 return column 796 797 @t.overload 798 def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ... 799 800 @t.overload 801 def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ... 802 803 def _parse_json_object(self, agg=False): 804 json_object = super()._parse_json_object() 805 array_kv_pair = seq_get(json_object.expressions, 0) 806 807 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 808 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 809 if ( 810 array_kv_pair 811 and isinstance(array_kv_pair.this, exp.Array) 812 and isinstance(array_kv_pair.expression, exp.Array) 813 ): 814 keys = array_kv_pair.this.expressions 815 values = array_kv_pair.expression.expressions 816 817 json_object.set( 818 "expressions", 819 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 820 ) 821 822 return json_object 823 824 def _parse_bracket( 825 self, this: t.Optional[exp.Expression] = None 826 ) -> t.Optional[exp.Expression]: 827 bracket = super()._parse_bracket(this) 828 829 if this is bracket: 830 return bracket 831 832 if isinstance(bracket, exp.Bracket): 833 for expression in bracket.expressions: 834 name = expression.name.upper() 835 836 if name not in self.BRACKET_OFFSETS: 837 break 838 839 offset, safe = self.BRACKET_OFFSETS[name] 840 bracket.set("offset", offset) 841 bracket.set("safe", safe) 842 expression.replace(expression.expressions[0]) 843 844 return bracket 845 846 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 847 unnest = super()._parse_unnest(with_alias=with_alias) 848 849 if not unnest: 850 return None 851 852 unnest_expr = seq_get(unnest.expressions, 0) 853 if unnest_expr: 854 from sqlglot.optimizer.annotate_types import annotate_types 855 856 unnest_expr = annotate_types(unnest_expr, dialect=self.dialect) 857 858 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 859 # in contrast to other dialects such as DuckDB which flattens only the array by default 860 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 861 array_elem.is_type(exp.DataType.Type.STRUCT) 862 for array_elem in unnest_expr._type.expressions 863 ): 864 unnest.set("explode_array", True) 865 866 return unnest 867 868 def _parse_make_interval(self) -> exp.MakeInterval: 869 expr = exp.MakeInterval() 870 871 for arg_key in expr.arg_types: 872 value = self._parse_lambda() 873 874 if not value: 875 break 876 877 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 878 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 879 if isinstance(value, exp.Kwarg): 880 arg_key = value.this.name 881 882 expr.set(arg_key, value) 883 884 self._match(TokenType.COMMA) 885 886 return expr 887 888 def _parse_features_at_time(self) -> exp.FeaturesAtTime: 889 expr = self.expression( 890 exp.FeaturesAtTime, 891 this=(self._match(TokenType.TABLE) and self._parse_table()) 892 or self._parse_select(nested=True), 893 ) 894 895 while self._match(TokenType.COMMA): 896 arg = self._parse_lambda() 897 898 # Get the LHS of 
the Kwarg and set the arg to that value, e.g 899 # "num_rows => 1" sets the expr's `num_rows` arg 900 if arg: 901 expr.set(arg.this.name, arg) 902 903 return expr 904 905 def _parse_export_data(self) -> exp.Export: 906 self._match_text_seq("DATA") 907 908 return self.expression( 909 exp.Export, 910 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 911 options=self._parse_properties(), 912 this=self._match_text_seq("AS") and self._parse_select(), 913 )
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
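A short usage sketch for these options; it is assumed that keyword arguments passed to sqlglot.parse_one are forwarded to the dialect's Parser:

import sqlglot
from sqlglot.errors import ErrorLevel, ParseError

try:
    # An unbalanced parenthesis should trigger a parse error, collected according to
    # error_level and capped by max_errors.
    sqlglot.parse_one("SELECT (1", read="bigquery", error_level=ErrorLevel.RAISE)
except ParseError as e:
    print(e.errors)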
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- RESERVED_TOKENS
- DB_CREATABLES
- CREATABLES
- ALTERABLES
- COLON_PLACEHOLDER_TOKENS
- ARRAY_CONSTRUCTORS
- TRIM_TYPES
- FUNC_TOKENS
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- BITWISE
- TERM
- FACTOR
- EXPONENT
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_KINDS
- JOIN_HINTS
- LAMBDAS
- COLUMN_OPERATORS
- EXPRESSION_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PLACEHOLDER_PARSERS
- PIPE_SYNTAX_TRANSFORM_PARSERS
- ALTER_PARSERS
- ALTER_ALTER_PARSERS
- SCHEMA_UNNAMED_CONSTRAINTS
- NO_PAREN_FUNCTION_PARSERS
- INVALID_FUNC_NAME_TOKENS
- FUNCTIONS_WITH_ALIASED_ARGS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_PARSERS
- SET_PARSERS
- SHOW_PARSERS
- TYPE_LITERAL_PARSERS
- TYPE_CONVERTERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- PROCEDURE_OPTIONS
- EXECUTE_AS_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- WINDOW_EXCLUDE_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- IS_JSON_PREDICATE_KIND
- ODBC_DATETIME_LITERALS
- ON_CONDITION_TOKENS
- PRIVILEGE_FOLLOW_TOKENS
- DESCRIBE_STYLES
- ANALYZE_STYLES
- ANALYZE_EXPRESSION_PARSERS
- PARTITION_KEYWORDS
- AMBIGUOUS_ALIAS_TOKENS
- OPERATION_MODIFIERS
- RECURSIVE_CTE_SEARCH_KIND
- MODIFIABLES
- STRICT_CAST
- IDENTIFY_PIVOT_STRINGS
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- MODIFIERS_ATTACHED_TO_SET_OP
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- INTERVAL_SPANS
- SUPPORTS_PARTITION_SELECTION
- WRAPPED_TRANSFORM_COLUMN_CONSTRAINT
- OPTIONAL_ALIAS_TOKEN_CTE
- ALTER_RENAME_REQUIRES_COLUMN
- ZONE_AWARE_TIMESTAMP_CONSTRUCTOR
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- parse_set_operation
- errors
- sql
915 class Generator(generator.Generator): 916 INTERVAL_ALLOWS_PLURAL_FORM = False 917 JOIN_HINTS = False 918 QUERY_HINTS = False 919 TABLE_HINTS = False 920 LIMIT_FETCH = "LIMIT" 921 RENAME_TABLE_WITH_DB = False 922 NVL2_SUPPORTED = False 923 UNNEST_WITH_ORDINALITY = False 924 COLLATE_IS_FUNC = True 925 LIMIT_ONLY_LITERALS = True 926 SUPPORTS_TABLE_ALIAS_COLUMNS = False 927 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 928 JSON_KEY_VALUE_PAIR_SEP = "," 929 NULL_ORDERING_SUPPORTED = False 930 IGNORE_NULLS_IN_FUNC = True 931 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 932 CAN_IMPLEMENT_ARRAY_ANY = True 933 SUPPORTS_TO_NUMBER = False 934 NAMED_PLACEHOLDER_TOKEN = "@" 935 HEX_FUNC = "TO_HEX" 936 WITH_PROPERTIES_PREFIX = "OPTIONS" 937 SUPPORTS_EXPLODING_PROJECTIONS = False 938 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 939 SUPPORTS_UNIX_SECONDS = True 940 941 TRANSFORMS = { 942 **generator.Generator.TRANSFORMS, 943 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 944 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 945 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 946 exp.Array: inline_array_unless_query, 947 exp.ArrayContains: _array_contains_sql, 948 exp.ArrayFilter: filter_array_using_unnest, 949 exp.ArrayRemove: filter_array_using_unnest, 950 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 951 exp.CollateProperty: lambda self, e: ( 952 f"DEFAULT COLLATE {self.sql(e, 'this')}" 953 if e.args.get("default") 954 else f"COLLATE {self.sql(e, 'this')}" 955 ), 956 exp.Commit: lambda *_: "COMMIT TRANSACTION", 957 exp.CountIf: rename_func("COUNTIF"), 958 exp.Create: _create_sql, 959 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 960 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 961 exp.DateDiff: lambda self, e: self.func( 962 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 963 ), 964 exp.DateFromParts: rename_func("DATE"), 965 exp.DateStrToDate: datestrtodate_sql, 966 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 967 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 968 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 969 exp.FromTimeZone: lambda self, e: self.func( 970 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 971 ), 972 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 973 exp.GroupConcat: lambda self, e: groupconcat_sql( 974 self, e, func_name="STRING_AGG", within_group=False 975 ), 976 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 977 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 978 exp.If: if_sql(false_value="NULL"), 979 exp.ILike: no_ilike_sql, 980 exp.IntDiv: rename_func("DIV"), 981 exp.Int64: rename_func("INT64"), 982 exp.JSONExtract: _json_extract_sql, 983 exp.JSONExtractArray: _json_extract_sql, 984 exp.JSONExtractScalar: _json_extract_sql, 985 exp.JSONFormat: rename_func("TO_JSON_STRING"), 986 exp.Levenshtein: _levenshtein_sql, 987 exp.Max: max_or_greatest, 988 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 989 exp.MD5Digest: rename_func("MD5"), 990 exp.Min: min_or_least, 991 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 992 exp.RegexpExtract: lambda self, e: self.func( 993 "REGEXP_EXTRACT", 994 e.this, 995 e.expression, 996 e.args.get("position"), 997 e.args.get("occurrence"), 998 ), 999 exp.RegexpExtractAll: lambda self, e: self.func( 1000 "REGEXP_EXTRACT_ALL", e.this, e.expression 1001 ), 1002 exp.RegexpReplace: regexp_replace_sql, 1003 
exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1004 exp.ReturnsProperty: _returnsproperty_sql, 1005 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1006 exp.Select: transforms.preprocess( 1007 [ 1008 transforms.explode_projection_to_unnest(), 1009 transforms.unqualify_unnest, 1010 transforms.eliminate_distinct_on, 1011 _alias_ordered_group, 1012 transforms.eliminate_semi_and_anti_joins, 1013 ] 1014 ), 1015 exp.SHA: rename_func("SHA1"), 1016 exp.SHA2: sha256_sql, 1017 exp.Space: space_sql, 1018 exp.StabilityProperty: lambda self, e: ( 1019 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1020 ), 1021 exp.String: rename_func("STRING"), 1022 exp.StrPosition: lambda self, e: ( 1023 strposition_sql( 1024 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1025 ) 1026 ), 1027 exp.StrToDate: _str_to_datetime_sql, 1028 exp.StrToTime: _str_to_datetime_sql, 1029 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1030 exp.TimeFromParts: rename_func("TIME"), 1031 exp.TimestampFromParts: rename_func("DATETIME"), 1032 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1033 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1034 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1035 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1036 exp.TimeStrToTime: timestrtotime_sql, 1037 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1038 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1039 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1040 exp.TsOrDsToTime: rename_func("TIME"), 1041 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1042 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1043 exp.Unhex: rename_func("FROM_HEX"), 1044 exp.UnixDate: rename_func("UNIX_DATE"), 1045 exp.UnixToTime: _unix_to_time_sql, 1046 exp.Uuid: lambda *_: "GENERATE_UUID()", 1047 exp.Values: _derived_table_values_to_unnest, 1048 exp.VariancePop: rename_func("VAR_POP"), 1049 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1050 } 1051 1052 SUPPORTED_JSON_PATH_PARTS = { 1053 exp.JSONPathKey, 1054 exp.JSONPathRoot, 1055 exp.JSONPathSubscript, 1056 } 1057 1058 TYPE_MAPPING = { 1059 **generator.Generator.TYPE_MAPPING, 1060 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1061 exp.DataType.Type.BIGINT: "INT64", 1062 exp.DataType.Type.BINARY: "BYTES", 1063 exp.DataType.Type.BLOB: "BYTES", 1064 exp.DataType.Type.BOOLEAN: "BOOL", 1065 exp.DataType.Type.CHAR: "STRING", 1066 exp.DataType.Type.DECIMAL: "NUMERIC", 1067 exp.DataType.Type.DOUBLE: "FLOAT64", 1068 exp.DataType.Type.FLOAT: "FLOAT64", 1069 exp.DataType.Type.INT: "INT64", 1070 exp.DataType.Type.NCHAR: "STRING", 1071 exp.DataType.Type.NVARCHAR: "STRING", 1072 exp.DataType.Type.SMALLINT: "INT64", 1073 exp.DataType.Type.TEXT: "STRING", 1074 exp.DataType.Type.TIMESTAMP: "DATETIME", 1075 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1076 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1077 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1078 exp.DataType.Type.TINYINT: "INT64", 1079 exp.DataType.Type.ROWVERSION: "BYTES", 1080 exp.DataType.Type.UUID: "STRING", 1081 exp.DataType.Type.VARBINARY: "BYTES", 1082 exp.DataType.Type.VARCHAR: "STRING", 1083 exp.DataType.Type.VARIANT: "ANY TYPE", 1084 } 1085 1086 PROPERTIES_LOCATION = { 1087 **generator.Generator.PROPERTIES_LOCATION, 1088 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1089 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1090 } 1091 1092 # WINDOW comes after QUALIFY 1093 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1094 
AFTER_HAVING_MODIFIER_TRANSFORMS = {
    "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"],
    "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"],
}

# from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
RESERVED_KEYWORDS = {
    "all", "and", "any", "array", "as", "asc", "assert_rows_modified", "at",
    "between", "by", "case", "cast", "collate", "contains", "create", "cross",
    "cube", "current", "default", "define", "desc", "distinct", "else", "end",
    "enum", "escape", "except", "exclude", "exists", "extract", "false", "fetch",
    "following", "for", "from", "full", "group", "grouping", "groups", "hash",
    "having", "if", "ignore", "in", "inner", "intersect", "interval", "into",
    "is", "join", "lateral", "left", "like", "limit", "lookup", "merge",
    "natural", "new", "no", "not", "null", "nulls", "of", "on",
    "or", "order", "outer", "over", "partition", "preceding", "proto", "qualify",
    "range", "recursive", "respect", "right", "rollup", "rows", "select", "set",
    "some", "struct", "tablesample", "then", "to", "treat", "true", "unbounded",
    "union", "unnest", "using", "when", "where", "window", "with", "within",
}

def datetrunc_sql(self, expression: exp.DateTrunc) -> str:
    unit = expression.unit
    unit_sql = unit.name if unit.is_string else self.sql(unit)
    return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone"))

def mod_sql(self, expression: exp.Mod) -> str:
    this = expression.this
    expr = expression.expression
    return self.func(
        "MOD",
        this.unnest() if isinstance(this, exp.Paren) else this,
        expr.unnest() if isinstance(expr, exp.Paren) else expr,
    )

def column_parts(self, expression: exp.Column) -> str:
    if expression.meta.get("quoted_column"):
        # If a column reference is of the form `dataset.table`.name, we need
        # to preserve the quoted table path, otherwise the reference breaks
        table_parts = ".".join(p.name for p in expression.parts[:-1])
        table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
        return f"{table_path}.{self.sql(expression, 'this')}"

    return super().column_parts(expression)

def table_parts(self, expression: exp.Table) -> str:
    # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
    # we need to make sure the correct quoting is used in each case.
    #
    # For example, if there is a CTE x that clashes with a schema name, then the former will
    # return the table y in that schema, whereas the latter will return the CTE's y column:
    #
    # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
    # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
    if expression.meta.get("quoted_table"):
        table_parts = ".".join(p.name for p in expression.parts)
        return self.sql(exp.Identifier(this=table_parts, quoted=True))

    return super().table_parts(expression)

def timetostr_sql(self, expression: exp.TimeToStr) -> str:
    this = expression.this
    if isinstance(this, exp.TsOrDsToDatetime):
        func_name = "FORMAT_DATETIME"
    elif isinstance(this, exp.TsOrDsToTimestamp):
        func_name = "FORMAT_TIMESTAMP"
    else:
        func_name = "FORMAT_DATE"

    time_expr = (
        this
        if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate))
        else expression
    )
    return self.func(
        func_name, self.format_time(expression), time_expr.this, expression.args.get("zone")
    )

def eq_sql(self, expression: exp.EQ) -> str:
    # Operands of = cannot be NULL in BigQuery
    if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
        if not isinstance(expression.parent, exp.Update):
            return "NULL"

    return self.binary(expression, "=")

def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
    parent = expression.parent

    # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
    # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
    if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
        return self.func(
            "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
        )

    return super().attimezone_sql(expression)

def trycast_sql(self, expression: exp.TryCast) -> str:
    return self.cast_sql(expression, safe_prefix="SAFE_")

def bracket_sql(self, expression: exp.Bracket) -> str:
    this = expression.this
    expressions = expression.expressions

    if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
        arg = expressions[0]
        if arg.type is None:
            from sqlglot.optimizer.annotate_types import annotate_types

            arg = annotate_types(arg, dialect=self.dialect)

        if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
            # BQ doesn't support bracket syntax with string values for structs
            return f"{self.sql(this)}.{arg.name}"

    expressions_sql = self.expressions(expression, flat=True)
    offset = expression.args.get("offset")

    if offset == 0:
        expressions_sql = f"OFFSET({expressions_sql})"
    elif offset == 1:
        expressions_sql = f"ORDINAL({expressions_sql})"
    elif offset is not None:
        self.unsupported(f"Unsupported array offset: {offset}")

    if expression.args.get("safe"):
        expressions_sql = f"SAFE_{expressions_sql}"

    return f"{self.sql(this)}[{expressions_sql}]"

def in_unnest_op(self, expression: exp.Unnest) -> str:
    return self.sql(expression)

def version_sql(self, expression: exp.Version) -> str:
    if expression.name == "TIMESTAMP":
        expression.set("this", "SYSTEM_TIME")
    return super().version_sql(expression)

def contains_sql(self, expression: exp.Contains) -> str:
    this = expression.this
    expr = expression.expression

    if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower):
        this = this.this
        expr = expr.this

    return self.func("CONTAINS_SUBSTR", this, expr)

def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str:
    this = expression.this

    # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3]
    # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions,
    # because they aren't literals and so the above syntax is invalid BigQuery.
    if isinstance(this, exp.Array):
        elem = seq_get(this.expressions, 0)
        if not (elem and elem.find(exp.Query)):
            return f"{self.sql(expression, 'to')}{self.sql(this)}"

    return super().cast_sql(expression, safe_prefix=safe_prefix)
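
As a rough illustration (not part of the module source), a couple of the overrides above can be observed by transpiling small snippets to BigQuery; the column names are made up and the output strings shown in the comments are indicative rather than authoritative:

import sqlglot

# mod_sql: the % operator is generated as a MOD() call.
print(sqlglot.transpile("SELECT 7 % 2", write="bigquery")[0])
# e.g. SELECT MOD(7, 2)

# trycast_sql: TRY_CAST is generated with the SAFE_ prefix.
print(sqlglot.transpile("SELECT TRY_CAST(x AS INT)", write="bigquery")[0])
# e.g. SELECT SAFE_CAST(x AS INT64)
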
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True or 'always': Always quote. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
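
A minimal usage sketch of these options (the query and identifier names are made up, and the rendered output is indicative rather than exact):

import sqlglot

sql = "select col_a, col_b from my_dataset.my_table where col_a > 1"

# Options like pretty and identify are forwarded from transpile() to the Generator.
print(sqlglot.transpile(sql, write="bigquery", pretty=True, identify=True)[0])
# Roughly:
# SELECT
#   `col_a`,
#   `col_b`
# FROM `my_dataset`.`my_table`
# WHERE
#   `col_a` > 1
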
def column_parts(self, expression: exp.Column) -> str:
    if expression.meta.get("quoted_column"):
        # If a column reference is of the form `dataset.table`.name, we need
        # to preserve the quoted table path, otherwise the reference breaks
        table_parts = ".".join(p.name for p in expression.parts[:-1])
        table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
        return f"{table_path}.{self.sql(expression, 'this')}"

    return super().column_parts(expression)
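
A sketch of the behaviour described in the comment, using hypothetical dataset/table/column names; the roundtripped SQL is expected, though not guaranteed here, to match the input:

import sqlglot

sql = "SELECT `dataset.table`.col FROM `dataset.table`"
# The quoted `dataset.table` path on the column should be preserved as-is
# rather than being split into `dataset`.`table`.col.
print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])
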
def table_parts(self, expression: exp.Table) -> str:
    # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
    # we need to make sure the correct quoting is used in each case.
    #
    # For example, if there is a CTE x that clashes with a schema name, then the former will
    # return the table y in that schema, whereas the latter will return the CTE's y column:
    #
    # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
    # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
    if expression.meta.get("quoted_table"):
        table_parts = ".".join(p.name for p in expression.parts)
        return self.sql(exp.Identifier(this=table_parts, quoted=True))

    return super().table_parts(expression)
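
The two queries from the comment can be used to check that each quoting style survives a BigQuery-to-BigQuery roundtrip (a sketch; the outputs are expected to match the inputs):

import sqlglot

for sql in (
    "WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`",
    "WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y`",
):
    # `x.y` (one quoted identifier) and `x`.`y` (two identifiers) refer to
    # different things, so neither should be rewritten into the other.
    print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])
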
def timetostr_sql(self, expression: exp.TimeToStr) -> str:
    this = expression.this
    if isinstance(this, exp.TsOrDsToDatetime):
        func_name = "FORMAT_DATETIME"
    elif isinstance(this, exp.TsOrDsToTimestamp):
        func_name = "FORMAT_TIMESTAMP"
    else:
        func_name = "FORMAT_DATE"

    time_expr = (
        this
        if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate))
        else expression
    )
    return self.func(
        func_name, self.format_time(expression), time_expr.this, expression.args.get("zone")
    )
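
A hedged sketch of how this plays out when coming from another dialect; the MySQL DATE_FORMAT call and the column name d are made up, and the exact output may differ slightly:

import sqlglot

# A formatting call over a plain column has no TsOrDsTo* wrapper, so the
# FORMAT_DATE branch is expected to be taken here.
print(sqlglot.transpile("SELECT DATE_FORMAT(d, '%Y-%m-%d')", read="mysql", write="bigquery")[0])
# e.g. SELECT FORMAT_DATE('%Y-%m-%d', d)
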
def eq_sql(self, expression: exp.EQ) -> str:
    # Operands of = cannot be NULL in BigQuery
    if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
        if not isinstance(expression.parent, exp.Update):
            return "NULL"

    return self.binary(expression, "=")
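
A small sketch of the NULL handling (the outputs in the comments are indicative):

import sqlglot

# Outside of UPDATE ... SET, a comparison against a NULL literal can never
# be TRUE, so the whole comparison collapses to NULL.
print(sqlglot.transpile("SELECT a = NULL", write="bigquery")[0])
# e.g. SELECT NULL

# In an UPDATE assignment the = is kept, since it is an assignment, not a comparison.
print(sqlglot.transpile("UPDATE t SET a = NULL", write="bigquery")[0])
# e.g. UPDATE t SET a = NULL
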
def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
    parent = expression.parent

    # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
    # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
    if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
        return self.func(
            "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
        )

    return super().attimezone_sql(expression)
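
For example, an AT TIME ZONE expression coming from Postgres is expected to be rewritten into the nested TIMESTAMP(DATETIME(...)) form (a sketch; the names and exact output are illustrative):

import sqlglot

sql = "SELECT ts AT TIME ZONE 'America/New_York' FROM t"
print(sqlglot.transpile(sql, read="postgres", write="bigquery")[0])
# e.g. SELECT TIMESTAMP(DATETIME(ts, 'America/New_York')) FROM t
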
def bracket_sql(self, expression: exp.Bracket) -> str:
    this = expression.this
    expressions = expression.expressions

    if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
        arg = expressions[0]
        if arg.type is None:
            from sqlglot.optimizer.annotate_types import annotate_types

            arg = annotate_types(arg, dialect=self.dialect)

        if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
            # BQ doesn't support bracket syntax with string values for structs
            return f"{self.sql(this)}.{arg.name}"

    expressions_sql = self.expressions(expression, flat=True)
    offset = expression.args.get("offset")

    if offset == 0:
        expressions_sql = f"OFFSET({expressions_sql})"
    elif offset == 1:
        expressions_sql = f"ORDINAL({expressions_sql})"
    elif offset is not None:
        self.unsupported(f"Unsupported array offset: {offset}")

    if expression.args.get("safe"):
        expressions_sql = f"SAFE_{expressions_sql}"

    return f"{self.sql(this)}[{expressions_sql}]"
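
As a sketch, BigQuery's explicit index-base forms are expected to roundtrip unchanged through parse and generate (the table and column names are made up):

import sqlglot

for sql in (
    "SELECT arr[OFFSET(0)] FROM t",
    "SELECT arr[ORDINAL(1)] FROM t",
    "SELECT arr[SAFE_OFFSET(0)] FROM t",
):
    print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])
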
def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str:
    this = expression.this

    # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3]
    # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions,
    # because they aren't literals and so the above syntax is invalid BigQuery.
    if isinstance(this, exp.Array):
        elem = seq_get(this.expressions, 0)
        if not (elem and elem.find(exp.Query)):
            return f"{self.sql(expression, 'to')}{self.sql(this)}"

    return super().cast_sql(expression, safe_prefix=safe_prefix)
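
A minimal roundtrip sketch of the ARRAY literal case described in the comment (the output shown is the expected, not guaranteed, result):

import sqlglot

# The inline ARRAY<INT64> annotation should be kept rather than rewritten as a CAST.
print(sqlglot.transpile("SELECT ARRAY<INT64>[1, 2, 3]", read="bigquery", write="bigquery")[0])
# e.g. SELECT ARRAY<INT64>[1, 2, 3]
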
Inherited Members
- sqlglot.generator.Generator
- Generator
- LOCKING_READS_SUPPORTED
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- GROUPINGS_SEP
- INDEX_ON
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- SELECT_KINDS
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- AGGREGATE_FILTER_SUPPORTED
- SEMI_ANTI_JOIN_WITH_SIDE
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- TABLESAMPLE_SIZE_IS_ROWS
- TABLESAMPLE_KEYWORDS
- TABLESAMPLE_WITH_METHOD
- TABLESAMPLE_SEED_KEYWORD
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- LAST_DAY_SUPPORTS_DATE_PART
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- SUPPORTS_CREATE_TABLE_LIKE
- LIKE_PROPERTY_INSIDE_SCHEMA
- MULTI_ARG_DISTINCT
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_BRACKETED_KEY_SUPPORTED
- SUPPORTS_WINDOW_EXCLUDE
- SET_OP_MODIFIERS
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- COPY_HAS_INTO_KEYWORD
- STAR_EXCEPT
- QUOTE_JSON_PATH
- PAD_FILL_PATTERN_IS_REQUIRED
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- ALTER_SET_WRAPPED
- NORMALIZE_EXTRACT_DATE_PARTS
- PARSE_JSON_NAME
- ARRAY_SIZE_NAME
- ALTER_SET_TYPE
- ARRAY_SIZE_DIM_REQUIRED
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- STRUCT_DELIMITER
- PARAMETER_TOKEN
- EXPRESSION_PRECEDES_PROPERTIES_CREATABLES
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- UNWRAPPED_INTERVAL_VALUES
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- RESPECT_IGNORE_NULLS_UNSUPPORTED_EXPRESSIONS
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- sanitize_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- autoincrementcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- createable_sql
- create_sql
- sequenceproperties_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- cte_sql
- tablealias_sql
- bitstring_sql
- hexstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- limitoptions_sql
- filter_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_sql
- tablefromrows_sql
- tablesample_sql
- pivot_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- join_sql
- lambda_sql
- lateral_op
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- for_modifiers
- queryoption_sql
- offset_limit_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- unnest_sql
- prewhere_sql
- where_sql
- window_sql
- partition_by_sql
- windowspec_sql
- withingroup_sql
- between_sql
- bracket_offset_expressions
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- extract_sql
- trim_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- pivotalias_sql
- aliases_sql
- atindex_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwisenot_sql
- bitwiseor_sql
- bitwiserightshift_sql
- bitwisexor_sql
- currentdate_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterindex_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- addpartition_sql
- distinct_sql
- ignorenulls_sql
- respectnulls_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- safedivide_sql
- overlaps_sql
- distance_sql
- dot_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- ilike_sql
- ilikeany_sql
- is_sql
- like_sql
- likeany_sql
- similarto_sql
- lt_sql
- lte_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- slice_sql
- sub_sql
- jsoncast_sql
- try_sql
- log_sql
- use_sql
- binary
- ceil_floor
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- whens_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- uniquekeyproperty_sql
- distributedbyproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotime_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- parsejson_sql
- rand_sql
- changes_sql
- pad_sql
- summarize_sql
- explodinggenerateseries_sql
- arrayconcat_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonextractquote_sql
- jsonexists_sql
- arrayagg_sql
- apply_sql
- grant_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql
- attach_sql
- detach_sql
- attachoption_sql
- featuresattime_sql
- watermarkcolumnconstraint_sql
- encodeproperty_sql
- includeproperty_sql
- xmlelement_sql
- xmlkeyvalueoption_sql
- partitionbyrangeproperty_sql
- partitionbyrangepropertydynamic_sql
- unpivotcolumns_sql
- analyzesample_sql
- analyzestatistics_sql
- analyzehistogram_sql
- analyzedelete_sql
- analyzelistchainedrows_sql
- analyzevalidate_sql
- analyze_sql
- xmltable_sql
- xmlnamespace_sql
- export_sql
- declare_sql
- declareitem_sql
- recursivewithsearch_sql
- parameterizedagg_sql
- anonymousaggfunc_sql
- combinedaggfunc_sql
- combinedparameterizedagg_sql
- show_sql
- get_put_sql
- translatecharacters_sql