Ruby 4.1.0dev (2026-05-15 revision 11de89ca1a94899535875ea594962c79713615b1)
parser.h
1#ifndef PRISM_INTERNAL_PARSER_H
2#define PRISM_INTERNAL_PARSER_H
3
5
6#include "prism/internal/arena.h"
7#include "prism/internal/constant_pool.h"
8#include "prism/internal/encoding.h"
9#include "prism/internal/list.h"
10#include "prism/internal/options.h"
11#include "prism/internal/static_literals.h"
12#include "prism/internal/strpbrk.h"
13
14#include "prism/ast.h"
16#include "prism/parser.h"
17
18#include <stdbool.h>
19#include <stddef.h>
20#include <stdint.h>
21
/*
 * The bit positions of the individual states that the lexer can track. The
 * lexer uses these states to decide which kind of token to return, based on
 * the context of the parser.
 */
typedef enum {
    PM_LEX_STATE_BIT_BEG = 0,
    PM_LEX_STATE_BIT_END = 1,
    PM_LEX_STATE_BIT_ENDARG = 2,
    PM_LEX_STATE_BIT_ENDFN = 3,
    PM_LEX_STATE_BIT_ARG = 4,
    PM_LEX_STATE_BIT_CMDARG = 5,
    PM_LEX_STATE_BIT_MID = 6,
    PM_LEX_STATE_BIT_FNAME = 7,
    PM_LEX_STATE_BIT_DOT = 8,
    PM_LEX_STATE_BIT_CLASS = 9,
    PM_LEX_STATE_BIT_LABEL = 10,
    PM_LEX_STATE_BIT_LABELED = 11,
    PM_LEX_STATE_BIT_FITEM = 12
} pm_lex_state_bit_t;
42
43/*
44 * This enum combines the various bits from the above enum into individual
45 * values that represent the various states of the lexer.
46 */
47typedef enum {
48 PM_LEX_STATE_NONE = 0,
49 PM_LEX_STATE_BEG = (1 << PM_LEX_STATE_BIT_BEG),
50 PM_LEX_STATE_END = (1 << PM_LEX_STATE_BIT_END),
51 PM_LEX_STATE_ENDARG = (1 << PM_LEX_STATE_BIT_ENDARG),
52 PM_LEX_STATE_ENDFN = (1 << PM_LEX_STATE_BIT_ENDFN),
53 PM_LEX_STATE_ARG = (1 << PM_LEX_STATE_BIT_ARG),
54 PM_LEX_STATE_CMDARG = (1 << PM_LEX_STATE_BIT_CMDARG),
55 PM_LEX_STATE_MID = (1 << PM_LEX_STATE_BIT_MID),
56 PM_LEX_STATE_FNAME = (1 << PM_LEX_STATE_BIT_FNAME),
57 PM_LEX_STATE_DOT = (1 << PM_LEX_STATE_BIT_DOT),
58 PM_LEX_STATE_CLASS = (1 << PM_LEX_STATE_BIT_CLASS),
59 PM_LEX_STATE_LABEL = (1 << PM_LEX_STATE_BIT_LABEL),
60 PM_LEX_STATE_LABELED = (1 << PM_LEX_STATE_BIT_LABELED),
61 PM_LEX_STATE_FITEM = (1 << PM_LEX_STATE_BIT_FITEM),
62 PM_LEX_STATE_BEG_ANY = PM_LEX_STATE_BEG | PM_LEX_STATE_MID | PM_LEX_STATE_CLASS,
63 PM_LEX_STATE_ARG_ANY = PM_LEX_STATE_ARG | PM_LEX_STATE_CMDARG,
64 PM_LEX_STATE_END_ANY = PM_LEX_STATE_END | PM_LEX_STATE_ENDARG | PM_LEX_STATE_ENDFN
65} pm_lex_state_t;
66
/*
 * The kind of quote that a heredoc uses. Each quoted variant has the value of
 * its quote character.
 */
typedef enum {
    PM_HEREDOC_QUOTE_NONE = 0,
    PM_HEREDOC_QUOTE_SINGLE = '\'',
    PM_HEREDOC_QUOTE_DOUBLE = '"',
    PM_HEREDOC_QUOTE_BACKTICK = '`'
} pm_heredoc_quote_t;
76
/*
 * The kind of indentation that a heredoc uses.
 */
typedef enum {
    PM_HEREDOC_INDENT_NONE = 0,
    PM_HEREDOC_INDENT_DASH = 1,
    PM_HEREDOC_INDENT_TILDE = 2
} pm_heredoc_indent_t;
85
86/*
87 * All of the information necessary to store to lexing a heredoc.
88 */
89typedef struct {
90 /* A pointer to the start of the heredoc identifier. */
91 const uint8_t *ident_start;
92
93 /* The length of the heredoc identifier. */
94 size_t ident_length;
95
96 /* The type of quote that the heredoc uses. */
97 pm_heredoc_quote_t quote;
98
99 /* The type of indentation that the heredoc uses. */
100 pm_heredoc_indent_t indent;
102
103/*
104 * When lexing Ruby source, the lexer has a small amount of state to tell which
105 * kind of token it is currently lexing. For example, when we find the start of
106 * a string, the first token that we return is a TOKEN_STRING_BEGIN token. After
107 * that the lexer is now in the PM_LEX_STRING mode, and will return tokens that
108 * are found as part of a string.
109 */
110typedef struct pm_lex_mode {
111 /* The type of this lex mode. */
112 enum {
113 /* This state is used when any given token is being lexed. */
114 PM_LEX_DEFAULT,
115
116 /*
117 * This state is used when we're lexing as normal but inside an embedded
118 * expression of a string.
119 */
120 PM_LEX_EMBEXPR,
121
122 /*
123 * This state is used when we're lexing a variable that is embedded
124 * directly inside of a string with the # shorthand.
125 */
126 PM_LEX_EMBVAR,
127
128 /* This state is used when you are inside the content of a heredoc. */
129 PM_LEX_HEREDOC,
130
131 /*
132 * This state is used when we are lexing a list of tokens, as in a %w
133 * word list literal or a %i symbol list literal.
134 */
135 PM_LEX_LIST,
136
137 /*
138 * This state is used when a regular expression has been begun and we
139 * are looking for the terminator.
140 */
141 PM_LEX_REGEXP,
142
143 /*
144 * This state is used when we are lexing a string or a string-like
145 * token, as in string content with either quote or an xstring.
146 */
147 PM_LEX_STRING
148 } mode;
149
150 /* The data associated with this type of lex mode. */
151 union {
152 struct {
153 /* This keeps track of the nesting level of the list. */
154 size_t nesting;
155
156 /* Whether or not interpolation is allowed in this list. */
157 bool interpolation;
158
159 /*
160 * When lexing a list, it takes into account balancing the
161 * terminator if the terminator is one of (), [], {}, or <>.
162 */
163 uint8_t incrementor;
164
165 /* This is the terminator of the list literal. */
166 uint8_t terminator;
167
168 /*
169 * This is the character set that should be used to delimit the
170 * tokens within the list.
171 */
172 uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE];
173 } list;
174
175 struct {
176 /*
177 * This keeps track of the nesting level of the regular expression.
178 */
179 size_t nesting;
180
181 /*
182 * When lexing a regular expression, it takes into account balancing
183 * the terminator if the terminator is one of (), [], {}, or <>.
184 */
185 uint8_t incrementor;
186
187 /* This is the terminator of the regular expression. */
188 uint8_t terminator;
189
190 /*
191 * This is the character set that should be used to delimit the
192 * tokens within the regular expression.
193 */
194 uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE];
195 } regexp;
196
197 struct {
198 /* This keeps track of the nesting level of the string. */
199 size_t nesting;
200
201 /* Whether or not interpolation is allowed in this string. */
202 bool interpolation;
203
204 /*
205 * Whether or not at the end of the string we should allow a :,
206 * which would indicate this was a dynamic symbol instead of a
207 * string.
208 */
209 bool label_allowed;
210
211 /*
212 * When lexing a string, it takes into account balancing the
213 * terminator if the terminator is one of (), [], {}, or <>.
214 */
215 uint8_t incrementor;
216
217 /*
218 * This is the terminator of the string. It is typically either a
219 * single or double quote.
220 */
221 uint8_t terminator;
222
223 /*
224 * This is the character set that should be used to delimit the
225 * tokens within the string.
226 */
227 uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE];
228 } string;
229
230 struct {
231 /*
232 * All of the data necessary to lex a heredoc.
233 */
235
236 /*
237 * This is the pointer to the character where lexing should resume
238 * once the heredoc has been completely processed.
239 */
240 const uint8_t *next_start;
241
242 /*
243 * This is used to track the amount of common whitespace on each
244 * line so that we know how much to dedent each line in the case of
245 * a tilde heredoc.
246 */
247 size_t *common_whitespace;
248
249 /* True if the previous token ended with a line continuation. */
250 bool line_continuation;
251 } heredoc;
252 } as;
253
254 /* The previous lex state so that it knows how to pop. */
255 struct pm_lex_mode *prev;
257
/*
 * The number of lex modes that are pre-allocated, so that malloc does not
 * have to be called too many times while parsing. More than this should
 * rarely be needed, because deep nesting only really happens with string
 * interpolation.
 */
#define PM_LEX_STACK_SIZE 4
264
/*
 * While parsing, we keep track of a stack of contexts. This helps error
 * recovery: when we hit a token that is understood by a parent context but not
 * by the current one, we can pop back to that parent context.
 */
typedef enum {
    PM_CONTEXT_NONE = 0,         /* a null context, used for returning a value from a function */
    PM_CONTEXT_BEGIN,            /* a begin statement */
    PM_CONTEXT_BEGIN_ENSURE,     /* an ensure statement with an explicit begin */
    PM_CONTEXT_BEGIN_ELSE,       /* a rescue else statement with an explicit begin */
    PM_CONTEXT_BEGIN_RESCUE,     /* a rescue statement with an explicit begin */
    PM_CONTEXT_BLOCK_BRACES,     /* expressions in block arguments using braces */
    PM_CONTEXT_BLOCK_KEYWORDS,   /* expressions in block arguments using do..end */
    PM_CONTEXT_BLOCK_ENSURE,     /* an ensure statement within a do..end block */
    PM_CONTEXT_BLOCK_ELSE,       /* a rescue else statement within a do..end block */
    PM_CONTEXT_BLOCK_PARAMETERS, /* expressions in block parameters `foo do |...| end ` */
    PM_CONTEXT_BLOCK_RESCUE,     /* a rescue statement within a do..end block */
    PM_CONTEXT_CASE_WHEN,        /* a case when statement */
    PM_CONTEXT_CASE_IN,          /* a case in statement */
    PM_CONTEXT_CLASS,            /* a class declaration */
    PM_CONTEXT_CLASS_ENSURE,     /* an ensure statement within a class statement */
    PM_CONTEXT_CLASS_ELSE,       /* a rescue else statement within a class statement */
    PM_CONTEXT_CLASS_RESCUE,     /* a rescue statement within a class statement */
    PM_CONTEXT_DEF,              /* a method definition */
    PM_CONTEXT_DEF_ENSURE,       /* an ensure statement within a method definition */
    PM_CONTEXT_DEF_ELSE,         /* a rescue else statement within a method definition */
    PM_CONTEXT_DEF_RESCUE,       /* a rescue statement within a method definition */
    PM_CONTEXT_DEF_PARAMS,       /* a method definition's parameters */
    PM_CONTEXT_DEFINED,          /* a defined? expression */
    PM_CONTEXT_DEFAULT_PARAMS,   /* a method definition's default parameter */
    PM_CONTEXT_ELSE,             /* an else clause */
    PM_CONTEXT_ELSIF,            /* an elsif clause */
    PM_CONTEXT_EMBEXPR,          /* an interpolated expression */
    PM_CONTEXT_FOR,              /* a for loop */
    PM_CONTEXT_FOR_INDEX,        /* a for loop's index */
    PM_CONTEXT_IF,               /* an if statement */
    PM_CONTEXT_LAMBDA_BRACES,    /* a lambda expression with braces */
    PM_CONTEXT_LAMBDA_DO_END,    /* a lambda expression with do..end */
    PM_CONTEXT_LAMBDA_ENSURE,    /* an ensure statement within a lambda expression */
    PM_CONTEXT_LAMBDA_ELSE,      /* a rescue else statement within a lambda expression */
    PM_CONTEXT_LAMBDA_RESCUE,    /* a rescue statement within a lambda expression */
    PM_CONTEXT_LOOP_PREDICATE,   /* the predicate clause of a loop statement */
    PM_CONTEXT_MAIN,             /* the top level context */
    PM_CONTEXT_MODULE,           /* a module declaration */
    PM_CONTEXT_MODULE_ENSURE,    /* an ensure statement within a module statement */
    PM_CONTEXT_MODULE_ELSE,      /* a rescue else statement within a module statement */
    PM_CONTEXT_MODULE_RESCUE,    /* a rescue statement within a module statement */
    PM_CONTEXT_MULTI_TARGET,     /* a multiple target expression */
    PM_CONTEXT_PARENS,           /* a parenthesized expression */
    PM_CONTEXT_POSTEXE,          /* an END block */
    PM_CONTEXT_PREDICATE,        /* a predicate inside an if/elsif/unless statement */
    PM_CONTEXT_PREEXE,           /* a BEGIN block */
    PM_CONTEXT_RESCUE_MODIFIER,  /* a modifier rescue clause */
    PM_CONTEXT_SCLASS,           /* a singleton class definition */
    PM_CONTEXT_SCLASS_ENSURE,    /* an ensure statement with a singleton class */
    PM_CONTEXT_SCLASS_ELSE,      /* a rescue else statement with a singleton class */
    PM_CONTEXT_SCLASS_RESCUE,    /* a rescue statement with a singleton class */
    PM_CONTEXT_TERNARY,          /* a ternary expression */
    PM_CONTEXT_UNLESS,           /* an unless statement */
    PM_CONTEXT_UNTIL,            /* an until statement */
    PM_CONTEXT_WHILE             /* a while statement */
} pm_context_t;
436
437/* This is a node in a linked list of contexts. */
438typedef struct pm_context_node {
439 /* The context that this node represents. */
440 pm_context_t context;
441
442 /* A pointer to the previous context in the linked list. */
443 struct pm_context_node *prev;
445
446/* The type of shareable constant value that can be set. */
447typedef uint8_t pm_shareable_constant_value_t;
448static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_NONE = 0x0;
449static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_LITERAL = PM_SHAREABLE_CONSTANT_NODE_FLAGS_LITERAL;
450static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_EVERYTHING = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_EVERYTHING;
451static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_COPY = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_COPY;
452
453/*
454 * This tracks an individual local variable in a certain lexical context, as
455 * well as the number of times is it read.
456 */
457typedef struct {
458 /* The name of the local variable. */
459 pm_constant_id_t name;
460
461 /* The location of the local variable in the source. */
462 pm_location_t location;
463
464 /* The index of the local variable in the local table. */
465 uint32_t index;
466
467 /* The number of times the local variable is read. */
468 uint32_t reads;
469
470 /* The hash of the local variable. */
471 uint32_t hash;
472} pm_local_t;
473
474/*
475 * This is a set of local variables in a certain lexical context (method, class,
476 * module, etc.). We need to track how many times these variables are read in
477 * order to warn if they only get written.
478 */
479typedef struct pm_locals {
480 /* The number of local variables in the set. */
481 uint32_t size;
482
483 /* The capacity of the local variables set. */
484 uint32_t capacity;
485
486 /*
487 * A bloom filter over constant IDs stored in this set. Used to quickly
488 * reject lookups for names that are definitely not present, avoiding the
489 * cost of a linear scan or hash probe.
490 */
491 uint32_t bloom;
492
493 /* The nullable allocated memory for the local variables in the set. */
494 pm_local_t *locals;
496
/*
 * The flags about scope parameters that can be set. Each flag occupies its own
 * bit so that flags can be combined in a single pm_scope_parameters_t value.
 */
typedef uint8_t pm_scope_parameters_t;

static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NONE = 0;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_POSITIONALS = 1u << 0;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_KEYWORDS = 1u << 1;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_BLOCK = 1u << 2;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_ALL = 1u << 3;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_IMPLICIT_DISALLOWED = 1u << 4;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_INNER = 1u << 5;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_FOUND = 1u << 6;
507
508/*
509 * This struct represents a node in a linked list of scopes. Some scopes can see
510 * into their parent scopes, while others cannot.
511 */
512typedef struct pm_scope {
513 /* A pointer to the previous scope in the linked list. */
514 struct pm_scope *previous;
515
516 /* The IDs of the locals in the given scope. */
517 pm_locals_t locals;
518
519 /*
520 * This is a list of the implicit parameters contained within the block.
521 * These will be processed after the block is parsed to determine the kind
522 * of parameters node that should be used and to check if any errors need to
523 * be added.
524 */
525 pm_node_list_t implicit_parameters;
526
527 /*
528 * This is a bitfield that indicates the parameters that are being used in
529 * this scope. It is a combination of the PM_SCOPE_PARAMETERS_* constants.
530 * There are three different kinds of parameters that can be used in a
531 * scope:
532 *
533 * - Ordinary parameters (e.g., def foo(bar); end)
534 * - Numbered parameters (e.g., def foo; _1; end)
535 * - The it parameter (e.g., def foo; it; end)
536 *
537 * If ordinary parameters are being used, then certain parameters can be
538 * forwarded to another method/structure. Those are indicated by four
539 * additional bits in the params field. For example, some combinations of:
540 *
541 * - def foo(*); end
542 * - def foo(**); end
543 * - def foo(&); end
544 * - def foo(...); end
545 */
546 pm_scope_parameters_t parameters;
547
548 /*
549 * The current state of constant shareability for this scope. This is
550 * changed by magic shareable_constant_value comments.
551 */
552 pm_shareable_constant_value_t shareable_constant;
553
554 /*
555 * A boolean indicating whether or not this scope can see into its parent.
556 * If closed is true, then the scope cannot see into its parent.
557 */
558 bool closed;
559} pm_scope_t;
560
/*
 * A type that represents a stack of boolean values, stored in a single 32-bit
 * unsigned integer.
 */
typedef uint32_t pm_state_stack_t;
565
566/*
567 * This struct represents the overall parser. It contains a reference to the
568 * source file, as well as pointers that indicate where in the source it's
569 * currently parsing. It also contains the most recent and current token that
570 * it's considering.
571 */
573 /* The arena used for all AST-lifetime allocations. Caller-owned. */
574 pm_arena_t *arena;
575
576 /* The arena used for parser metadata (comments, diagnostics, etc.). */
577 pm_arena_t metadata_arena;
578
579 /*
580 * The next node identifier that will be assigned. This is a unique
581 * identifier used to track nodes such that the syntax tree can be dropped
582 * but the node can be found through another parse.
583 */
584 uint32_t node_id;
585
586 /*
587 * A single-entry cache for pm_parser_constant_id_raw. Avoids redundant
588 * constant pool lookups when the same token is resolved multiple times
589 * (e.g., once during lexing for local variable detection, and again
590 * during parsing for node creation).
591 */
592 struct {
593 const uint8_t *start;
594 const uint8_t *end;
596 } constant_cache;
597
598 /* The current state of the lexer. */
599 pm_lex_state_t lex_state;
600
601 /* Tracks the current nesting of (), [], and {}. */
602 int enclosure_nesting;
603
604 /*
605 * Used to temporarily track the nesting of enclosures to determine if a {
606 * is the beginning of a lambda following the parameters of a lambda.
607 */
608 int lambda_enclosure_nesting;
609
610 /*
611 * Used to track the nesting of braces to ensure we get the correct value
612 * when we are interpolating blocks with braces.
613 */
614 int brace_nesting;
615
616 /*
617 * The stack used to determine if a do keyword belongs to the predicate of a
618 * while, until, or for loop.
619 */
620 pm_state_stack_t do_loop_stack;
621
622 /*
623 * The stack used to determine if a do keyword belongs to the beginning of a
624 * block.
625 */
626 pm_state_stack_t accepts_block_stack;
627
628 /* A stack of lex modes. */
629 struct {
630 /* The current mode of the lexer. */
631 pm_lex_mode_t *current;
632
633 /* The stack of lexer modes. */
634 pm_lex_mode_t stack[PM_LEX_STACK_SIZE];
635
636 /* The current index into the lexer mode stack. */
637 size_t index;
638 } lex_modes;
639
640 /* The pointer to the start of the source. */
641 const uint8_t *start;
642
643 /* The pointer to the end of the source. */
644 const uint8_t *end;
645
646 /* The previous token we were considering. */
647 pm_token_t previous;
648
649 /* The current token we're considering. */
650 pm_token_t current;
651
652 /*
653 * This is a special field set on the parser when we need the parser to jump
654 * to a specific location when lexing the next token, as opposed to just
655 * using the end of the previous token. Normally this is NULL.
656 */
657 const uint8_t *next_start;
658
659 /*
660 * This field indicates the end of a heredoc whose identifier was found on
661 * the current line. If another heredoc is found on the same line, then this
662 * will be moved forward to the end of that heredoc. If no heredocs are
663 * found on a line then this is NULL.
664 */
665 const uint8_t *heredoc_end;
666
667 /* The list of comments that have been found while parsing. */
668 pm_list_t comment_list;
669
670 /* The list of magic comments that have been found while parsing. */
671 pm_list_t magic_comment_list;
672
673 /*
674 * An optional location that represents the location of the __END__ marker
675 * and the rest of the content of the file. This content is loaded into the
676 * DATA constant when the file being parsed is the main file being executed.
677 */
678 pm_location_t data_loc;
679
680 /* The list of warnings that have been found while parsing. */
681 pm_list_t warning_list;
682
683 /* The list of errors that have been found while parsing. */
684 pm_list_t error_list;
685
686 /* The current local scope. */
687 pm_scope_t *current_scope;
688
689 /* The current parsing context. */
690 pm_context_node_t *current_context;
691
692 /*
693 * The hash keys for the hash that is currently being parsed. This is not
694 * usually necessary because it can pass it down the various call chains,
695 * but in the event that you're parsing a hash that is being directly
696 * pushed into another hash with **, we need to share the hash keys so that
697 * we can warn for the nested hash as well.
698 */
699 pm_static_literals_t *current_hash_keys;
700
701 /*
702 * The encoding functions for the current file is attached to the parser as
703 * it's parsing so that it can change with a magic comment.
704 */
705 const pm_encoding_t *encoding;
706
707 /*
708 * When the encoding that is being used to parse the source is changed by
709 * prism, we provide the ability here to call out to a user-defined
710 * function.
711 */
712 pm_encoding_changed_callback_t encoding_changed_callback;
713
714 /*
715 * This pointer indicates where a comment must start if it is to be
716 * considered an encoding comment.
717 */
718 const uint8_t *encoding_comment_start;
719
720 /*
721 * When you are lexing through a file, the lexer needs all of the information
722 * that the parser additionally provides (for example, the local table). So if
723 * you want to properly lex Ruby, you need to actually lex it in the context of
724 * the parser. In order to provide this functionality, we optionally allow a
725 * struct to be attached to the parser that calls back out to a user-provided
726 * callback when each token is lexed.
727 */
728 struct {
729 /*
730 * This is the callback that is called when a token is lexed. It is
731 * passed the opaque data pointer, the parser, and the token that was
732 * lexed.
733 */
734 pm_lex_callback_t callback;
735
736 /*
737 * This opaque pointer is used to provide whatever information the user
738 * deemed necessary to the callback. In our case we use it to pass the
739 * array that the tokens get appended into.
740 */
741 void *data;
742 } lex_callback;
743
744 /*
745 * This is the path of the file being parsed. We use the filepath when
746 * constructing SourceFileNodes.
747 */
748 pm_string_t filepath;
749
750 /*
751 * This constant pool keeps all of the constants defined throughout the file
752 * so that we can reference them later.
753 */
754 pm_constant_pool_t constant_pool;
755
756 /* This is the list of line offsets in the source file. */
757 pm_line_offset_list_t line_offsets;
758
759 /*
760 * State communicated from the lexer to the parser for integer tokens.
761 */
762 struct {
763 /*
764 * A flag indicating the base of the integer (binary, octal, decimal,
765 * hexadecimal). Set during lexing and read during node creation.
766 */
767 pm_node_flags_t base;
768
769 /*
770 * When lexing a decimal integer that fits in a uint32_t, we compute
771 * the value during lexing to avoid re-scanning the digits during
772 * parsing. If lexed is true, this holds the result and
773 * pm_integer_parse can be skipped.
774 */
775 uint32_t value;
776
777 /* Whether value holds a valid pre-computed integer. */
778 bool lexed;
779 } integer;
780
781 /*
782 * This string is used to pass information from the lexer to the parser. It
783 * is particularly necessary because of escape sequences.
784 */
785 pm_string_t current_string;
786
787 /*
788 * The line number at the start of the parse. This will be used to offset
789 * the line numbers of all of the locations.
790 */
791 int32_t start_line;
792
793 /*
794 * When a string-like expression is being lexed, any byte or escape sequence
795 * that resolves to a value whose top bit is set (i.e., >= 0x80) will
796 * explicitly set the encoding to the same encoding as the source.
797 * Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that
798 * resolves to a value whose top bit is set, then the encoding will be
799 * explicitly set to UTF-8.
800 *
801 * The _next_ time this happens, if the encoding that is about to become the
802 * explicitly set encoding does not match the previously set explicit
803 * encoding, a mixed encoding error will be emitted.
804 *
805 * When the expression is finished being lexed, the explicit encoding
806 * controls the encoding of the expression. For the most part this means
807 * that the expression will either be encoded in the source encoding or
808 * UTF-8. This holds for all encodings except US-ASCII. If the source is
809 * US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the
810 * expression will be encoded as ASCII-8BIT.
811 *
812 * Note that if the expression is a list, different elements within the same
813 * list can have different encodings, so this will get reset between each
814 * element. Furthermore all of this only applies to lists that support
815 * interpolation, because otherwise escapes that could change the encoding
816 * are ignored.
817 *
818 * At first glance, it may make more sense for this to live on the lexer
819 * mode, but we need it here to communicate back to the parser for character
820 * literals that do not push a new lexer mode.
821 */
822 const pm_encoding_t *explicit_encoding;
823
824 /*
825 * When parsing block exits (e.g., break, next, redo), we need to validate
826 * that they are in correct contexts. For the most part we can do this by
827 * looking at our parent contexts. However, modifier while and until
828 * expressions can change that context to make block exits valid. In these
829 * cases, we need to keep track of the block exits and then validate them
830 * after the expression has been parsed.
831 *
832 * We use a pointer here because we don't want to keep a whole list attached
833 * since this will only be used in the context of begin/end expressions.
834 */
835 pm_node_list_t *current_block_exits;
836
837 /* The version of prism that we should use to parse. */
838 pm_options_version_t version;
839
840 /* The command line flags given from the options. */
841 uint8_t command_line;
842
843 /*
844 * Whether or not we have found a frozen_string_literal magic comment with
845 * a true or false value.
846 * May be:
847 * - PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED
848 * - PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED
849 * - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET
850 */
851 int8_t frozen_string_literal;
852
853 /*
854 * Whether or not we are parsing an eval string. This impacts whether or not
855 * we should evaluate if block exits/yields are valid.
856 */
857 bool parsing_eval;
858
859 /*
860 * Whether or not we are parsing a "partial" script, which is a script that
861 * will be evaluated in the context of another script, so we should not
862 * check jumps (next/break/etc.) for validity.
863 */
864 bool partial_script;
865
866 /* Whether or not we're at the beginning of a command. */
867 bool command_start;
868
869 /*
870 * Whether or not we're currently parsing the body of an endless method
871 * definition. In this context, PM_TOKEN_KEYWORD_DO_BLOCK should not be
872 * consumed by commands (it should bubble up to the outer context).
873 */
874 bool in_endless_def_body;
875
876 /* Whether or not we're currently recovering from a syntax error. */
877 bool recovering;
878
879 /*
880 * Whether or not the source being parsed could become valid if more input
881 * were appended. This is set to false when the parser encounters a token
882 * that is definitively wrong (e.g., a stray `end` or `]`) as opposed to
883 * merely incomplete.
884 */
885 bool continuable;
886
887 /*
888 * This is very specialized behavior for when you want to parse in a context
889 * that does not respect encoding comments. Its main use case is translating
890 * into the whitequark/parser AST which re-encodes source files in UTF-8
891 * before they are parsed and ignores encoding comments.
892 */
893 bool encoding_locked;
894
895 /*
896 * Whether or not the encoding has been changed by a magic comment. We use
897 * this to provide a fast path for the lexer instead of going through the
898 * function pointer.
899 */
900 bool encoding_changed;
901
902 /*
903 * This flag indicates that we are currently parsing a pattern matching
904 * expression and impacts that calculation of newlines.
905 */
906 bool pattern_matching_newlines;
907
908 /* This flag indicates that we are currently parsing a keyword argument. */
909 bool in_keyword_arg;
910
911 /*
912 * Whether or not the parser has seen a token that has semantic meaning
913 * (i.e., a token that is not a comment or whitespace).
914 */
915 bool semantic_token_seen;
916
917 /*
918 * By default, Ruby always warns about mismatched indentation. This can be
919 * toggled with a magic comment.
920 */
921 bool warn_mismatched_indentation;
922
923#if defined(PRISM_HAS_NEON) || defined(PRISM_HAS_SSSE3) || defined(PRISM_HAS_SWAR)
924 /*
925 * Cached lookup tables for pm_strpbrk's SIMD fast path. Avoids rebuilding
926 * the nibble-based tables on every call when the charset hasn't changed
927 * (which is the common case during string/regex/list lexing).
928 */
929 struct {
930 /* The cached charset (null-terminated, max 11 chars + NUL). */
931 uint8_t charset[12];
932
933 /* Nibble-based low lookup table for SIMD matching. */
934 uint8_t low_lut[16];
935
936 /* Nibble-based high lookup table for SIMD matching. */
937 uint8_t high_lut[16];
938
939 /* Scalar fallback table (4 x 64-bit bitmasks covering all ASCII). */
940 uint64_t table[4];
941 } strpbrk_cache;
942#endif
943};
944
945/*
946 * Initialize a parser with the given start and end pointers.
947 */
948void pm_parser_init(pm_arena_t *arena, pm_parser_t *parser, const uint8_t *source, size_t size, const pm_options_t *options);
949
950/*
951 * Free the memory held by the given parser.
952 *
953 * This does not free the `pm_options_t` object that was used to initialize the
954 * parser.
955 */
956void pm_parser_cleanup(pm_parser_t *parser);
957
958#endif
uint32_t pm_constant_id_t
A constant id is a unique identifier for a constant in the constant pool.
A list of byte offsets of newlines in a string.
The parser used to parse Ruby source.
void(* pm_lex_callback_t)(pm_parser_t *parser, pm_token_t *token, void *data)
This is the callback that is called when a token is lexed.
Definition parser.h:55
void(* pm_encoding_changed_callback_t)(pm_parser_t *parser)
When the encoding that is being used to parse the source is changed by prism, we provide the ability ...
Definition parser.h:49
C99 shim for <stdbool.h>
A list of offsets of the start of lines in a string.
This struct represents a slice in the source code, defined by an offset and a length.
Definition ast.h:554
A list of nodes in the source, most often used for lists of children.
Definition ast.h:567
A generic string type that can have various ownership semantics.
Definition stringy.h:18
This struct represents a token in the Ruby source.
Definition ast.h:526