Ruby 4.1.0dev (2026-05-15 revision fff4b3ef2e3e309e7a84288de53c189aa3d45fed)
regcomp.c (fff4b3ef2e3e309e7a84288de53c189aa3d45fed)
1/**********************************************************************
2 regcomp.c - Onigmo (Oniguruma-mod) (regular expression library)
3**********************************************************************/
4/*-
5 * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * Copyright (c) 2011-2019 K.Takata <kentkt AT csc DOT jp>
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include "regparse.h"
32
33OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
34
35extern OnigCaseFoldType
36onig_get_default_case_fold_flag(void)
37{
38 return OnigDefaultCaseFoldFlag;
39}
40
41extern int
42onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
43{
44 OnigDefaultCaseFoldFlag = case_fold_flag;
45 return 0;
46}
47
48
49#ifndef PLATFORM_UNALIGNED_WORD_ACCESS
50static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
51#endif
52
53#if 0
54static UChar*
55str_dup(UChar* s, UChar* end)
56{
57 ptrdiff_t len = end - s;
58
59 if (len > 0) {
60 UChar* r = (UChar* )xmalloc(len + 1);
61 CHECK_NULL_RETURN(r);
62 xmemcpy(r, s, len);
63 r[len] = (UChar )0;
64 return r;
65 }
66 else return NULL;
67}
68#endif
69
70static void
71swap_node(Node* a, Node* b)
72{
73 Node c;
74 c = *a; *a = *b; *b = c;
75
76 if (NTYPE(a) == NT_STR) {
77 StrNode* sn = NSTR(a);
78 if (sn->capa == 0) {
79 size_t len = sn->end - sn->s;
80 sn->s = sn->buf;
81 sn->end = sn->s + len;
82 }
83 }
84
85 if (NTYPE(b) == NT_STR) {
86 StrNode* sn = NSTR(b);
87 if (sn->capa == 0) {
88 size_t len = sn->end - sn->s;
89 sn->s = sn->buf;
90 sn->end = sn->s + len;
91 }
92 }
93}
94
95static OnigDistance
96distance_add(OnigDistance d1, OnigDistance d2)
97{
98 if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
99 return ONIG_INFINITE_DISTANCE;
100 else {
101 if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
102 else return ONIG_INFINITE_DISTANCE;
103 }
104}
105
106static OnigDistance
107distance_multiply(OnigDistance d, int m)
108{
109 if (m == 0) return 0;
110
111 if (d < ONIG_INFINITE_DISTANCE / m)
112 return d * m;
113 else
114 return ONIG_INFINITE_DISTANCE;
115}
116
117static int
118bitset_is_empty(BitSetRef bs)
119{
120 int i;
121 for (i = 0; i < BITSET_SIZE; i++) {
122 if (bs[i] != 0) return 0;
123 }
124 return 1;
125}
126
127#ifdef ONIG_DEBUG
128static int
129bitset_on_num(BitSetRef bs)
130{
131 int i, n;
132
133 n = 0;
134 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
135 if (BITSET_AT(bs, i)) n++;
136 }
137 return n;
138}
139#endif
140
141// Attempt to right size allocated buffers for a regex post compile
142static void
143onig_reg_resize(regex_t *reg)
144{
145 do {
146 if (!reg->used) {
147 xfree(reg->p);
148 reg->alloc = 0;
149 reg->p = 0;
150 }
151 else if (reg->alloc > reg->used) {
152 unsigned char *new_ptr = xrealloc(reg->p, reg->used);
153 // Skip the right size optimization if memory allocation fails
154 if (new_ptr) {
155 reg->alloc = reg->used;
156 reg->p = new_ptr;
157 }
158 }
159 } while ((reg = reg->chain) != 0);
160}
161
162extern int
163onig_bbuf_init(BBuf* buf, OnigDistance size)
164{
165 if (size <= 0) {
166 size = 0;
167 buf->p = NULL;
168 }
169 else {
170 buf->p = (UChar* )xmalloc(size);
171 if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
172 }
173
174 buf->alloc = (unsigned int )size;
175 buf->used = 0;
176 return 0;
177}
178
179
180#ifdef USE_SUBEXP_CALL
181
182static int
183unset_addr_list_init(UnsetAddrList* uslist, int size)
184{
185 UnsetAddr* p;
186
187 p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
188 CHECK_NULL_RETURN_MEMERR(p);
189 uslist->num = 0;
190 uslist->alloc = size;
191 uslist->us = p;
192 return 0;
193}
194
195static void
196unset_addr_list_end(UnsetAddrList* uslist)
197{
198 xfree(uslist->us);
199}
200
201static int
202unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
203{
204 UnsetAddr* p;
205 int size;
206
207 if (uslist->num >= uslist->alloc) {
208 size = uslist->alloc * 2;
209 p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
210 CHECK_NULL_RETURN_MEMERR(p);
211 uslist->alloc = size;
212 uslist->us = p;
213 }
214
215 uslist->us[uslist->num].offset = offset;
216 uslist->us[uslist->num].target = node;
217 uslist->num++;
218 return 0;
219}
220#endif /* USE_SUBEXP_CALL */
221
222
223static int
224add_opcode(regex_t* reg, int opcode)
225{
226 BBUF_ADD1(reg, opcode);
227 return 0;
228}
229
230#ifdef USE_COMBINATION_EXPLOSION_CHECK
231static int
232add_state_check_num(regex_t* reg, int num)
233{
234 StateCheckNumType n = (StateCheckNumType )num;
235
236 BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
237 return 0;
238}
239#endif
240
241static int
242add_rel_addr(regex_t* reg, int addr)
243{
244 RelAddrType ra = (RelAddrType )addr;
245
246 BBUF_ADD(reg, &ra, SIZE_RELADDR);
247 return 0;
248}
249
250static int
251add_abs_addr(regex_t* reg, int addr)
252{
253 AbsAddrType ra = (AbsAddrType )addr;
254
255 BBUF_ADD(reg, &ra, SIZE_ABSADDR);
256 return 0;
257}
258
259static int
260add_length(regex_t* reg, OnigDistance len)
261{
262 LengthType l = (LengthType )len;
263
264 BBUF_ADD(reg, &l, SIZE_LENGTH);
265 return 0;
266}
267
268static int
269add_mem_num(regex_t* reg, int num)
270{
271 MemNumType n = (MemNumType )num;
272
273 BBUF_ADD(reg, &n, SIZE_MEMNUM);
274 return 0;
275}
276
277#if 0
278static int
279add_pointer(regex_t* reg, void* addr)
280{
281 PointerType ptr = (PointerType )addr;
282
283 BBUF_ADD(reg, &ptr, SIZE_POINTER);
284 return 0;
285}
286#endif
287
288static int
289add_option(regex_t* reg, OnigOptionType option)
290{
291 BBUF_ADD(reg, &option, SIZE_OPTION);
292 return 0;
293}
294
295static int
296add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
297{
298 int r;
299
300 r = add_opcode(reg, opcode);
301 if (r) return r;
302 r = add_rel_addr(reg, addr);
303 return r;
304}
305
306static int
307add_bytes(regex_t* reg, UChar* bytes, OnigDistance len)
308{
309 BBUF_ADD(reg, bytes, len);
310 return 0;
311}
312
313static int
314add_bitset(regex_t* reg, BitSetRef bs)
315{
316 BBUF_ADD(reg, bs, SIZE_BITSET);
317 return 0;
318}
319
320static int
321add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
322{
323 int r;
324
325 r = add_opcode(reg, opcode);
326 if (r) return r;
327 r = add_option(reg, option);
328 return r;
329}
330
331static int compile_length_tree(Node* node, regex_t* reg);
332static int compile_tree(Node* node, regex_t* reg);
333
334
335#define IS_NEED_STR_LEN_OP_EXACT(op) \
336 ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
337 (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
338
339static int
340select_str_opcode(int mb_len, OnigDistance byte_len, int ignore_case)
341{
342 int op;
343 OnigDistance str_len = roomof(byte_len, mb_len);
344
345 if (ignore_case) {
346 switch (str_len) {
347 case 1: op = OP_EXACT1_IC; break;
348 default: op = OP_EXACTN_IC; break;
349 }
350 }
351 else {
352 switch (mb_len) {
353 case 1:
354 switch (str_len) {
355 case 1: op = OP_EXACT1; break;
356 case 2: op = OP_EXACT2; break;
357 case 3: op = OP_EXACT3; break;
358 case 4: op = OP_EXACT4; break;
359 case 5: op = OP_EXACT5; break;
360 default: op = OP_EXACTN; break;
361 }
362 break;
363
364 case 2:
365 switch (str_len) {
366 case 1: op = OP_EXACTMB2N1; break;
367 case 2: op = OP_EXACTMB2N2; break;
368 case 3: op = OP_EXACTMB2N3; break;
369 default: op = OP_EXACTMB2N; break;
370 }
371 break;
372
373 case 3:
374 op = OP_EXACTMB3N;
375 break;
376
377 default:
378 op = OP_EXACTMBN;
379 break;
380 }
381 }
382 return op;
383}
384
385static int
386compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
387{
388 int r;
389 int saved_num_null_check = reg->num_null_check;
390
391 if (empty_info != 0) {
392 r = add_opcode(reg, OP_NULL_CHECK_START);
393 if (r) return r;
394 r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
395 if (r) return r;
396 reg->num_null_check++;
397 if ((MemNumType)reg->num_null_check <= 0) return ONIGERR_TOO_MANY_NULL_CHECK;
398 }
399
400 r = compile_tree(node, reg);
401 if (r) return r;
402
403 if (empty_info != 0) {
404 if (empty_info == NQ_TARGET_IS_EMPTY)
405 r = add_opcode(reg, OP_NULL_CHECK_END);
406 else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
407 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
408 else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
409 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
410
411 if (r) return r;
412 r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
413 }
414 return r;
415}
416
417#ifdef USE_SUBEXP_CALL
418static int
419compile_call(CallNode* node, regex_t* reg)
420{
421 int r;
422
423 r = add_opcode(reg, OP_CALL);
424 if (r) return r;
425 r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
426 node->target);
427 if (r) return r;
428 r = add_abs_addr(reg, 0 /*dummy addr.*/);
429 return r;
430}
431#endif
432
433static int
434compile_tree_n_times(Node* node, int n, regex_t* reg)
435{
436 int i, r;
437
438 for (i = 0; i < n; i++) {
439 r = compile_tree(node, reg);
440 if (r) return r;
441 }
442 return 0;
443}
444
445static int
446add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, OnigDistance byte_len,
447 regex_t* reg ARG_UNUSED, int ignore_case)
448{
449 int len;
450 int op = select_str_opcode(mb_len, byte_len, ignore_case);
451
452 len = SIZE_OPCODE;
453
454 if (op == OP_EXACTMBN) len += SIZE_LENGTH;
455 if (IS_NEED_STR_LEN_OP_EXACT(op))
456 len += SIZE_LENGTH;
457
458 len += (int )byte_len;
459 return len;
460}
461
462static int
463add_compile_string(UChar* s, int mb_len, OnigDistance byte_len,
464 regex_t* reg, int ignore_case)
465{
466 int op = select_str_opcode(mb_len, byte_len, ignore_case);
467 add_opcode(reg, op);
468
469 if (op == OP_EXACTMBN)
470 add_length(reg, mb_len);
471
472 if (IS_NEED_STR_LEN_OP_EXACT(op)) {
473 if (op == OP_EXACTN_IC)
474 add_length(reg, byte_len);
475 else
476 add_length(reg, byte_len / mb_len);
477 }
478
479 add_bytes(reg, s, byte_len);
480 return 0;
481}
482
483
484static int
485compile_length_string_node(Node* node, regex_t* reg)
486{
487 int rlen, r, len, prev_len, blen, ambig;
488 OnigEncoding enc = reg->enc;
489 UChar *p, *prev;
490 StrNode* sn;
491
492 sn = NSTR(node);
493 if (sn->end <= sn->s)
494 return 0;
495
496 ambig = NSTRING_IS_AMBIG(node);
497
498 p = prev = sn->s;
499 prev_len = enclen(enc, p, sn->end);
500 p += prev_len;
501 blen = prev_len;
502 rlen = 0;
503
504 for (; p < sn->end; ) {
505 len = enclen(enc, p, sn->end);
506 if (len == prev_len || ambig) {
507 blen += len;
508 }
509 else {
510 r = add_compile_string_length(prev, prev_len, blen, reg, ambig);
511 rlen += r;
512 prev = p;
513 blen = len;
514 prev_len = len;
515 }
516 p += len;
517 }
518 r = add_compile_string_length(prev, prev_len, blen, reg, ambig);
519 rlen += r;
520 return rlen;
521}
522
523static int
524compile_length_string_raw_node(StrNode* sn, regex_t* reg)
525{
526 if (sn->end <= sn->s)
527 return 0;
528
529 return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
530}
531
532static int
533compile_string_node(Node* node, regex_t* reg)
534{
535 int r, len, prev_len, blen, ambig;
536 OnigEncoding enc = reg->enc;
537 UChar *p, *prev, *end;
538 StrNode* sn;
539
540 sn = NSTR(node);
541 if (sn->end <= sn->s)
542 return 0;
543
544 end = sn->end;
545 ambig = NSTRING_IS_AMBIG(node);
546
547 p = prev = sn->s;
548 prev_len = enclen(enc, p, end);
549 p += prev_len;
550 blen = prev_len;
551
552 for (; p < end; ) {
553 len = enclen(enc, p, end);
554 if (len == prev_len || ambig) {
555 blen += len;
556 }
557 else {
558 r = add_compile_string(prev, prev_len, blen, reg, ambig);
559 if (r) return r;
560
561 prev = p;
562 blen = len;
563 prev_len = len;
564 }
565
566 p += len;
567 }
568 return add_compile_string(prev, prev_len, blen, reg, ambig);
569}
570
571static int
572compile_string_raw_node(StrNode* sn, regex_t* reg)
573{
574 if (sn->end <= sn->s)
575 return 0;
576
577 return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
578}
579
580static int
581add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
582{
583#ifdef PLATFORM_UNALIGNED_WORD_ACCESS
584 add_length(reg, mbuf->used);
585 return add_bytes(reg, mbuf->p, mbuf->used);
586#else
587 int r, pad_size;
588 UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
589
590 GET_ALIGNMENT_PAD_SIZE(p, pad_size);
591 add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
592 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
593
594 r = add_bytes(reg, mbuf->p, mbuf->used);
595
596 /* padding for return value from compile_length_cclass_node() to be fix. */
597 pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
598 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
599 return r;
600#endif
601}
602
603static int
604compile_length_cclass_node(CClassNode* cc, regex_t* reg)
605{
606 int len;
607
608 if (IS_NULL(cc->mbuf)) {
609 len = SIZE_OPCODE + SIZE_BITSET;
610 }
611 else {
612 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
613 len = SIZE_OPCODE;
614 }
615 else {
616 len = SIZE_OPCODE + SIZE_BITSET;
617 }
618#ifdef PLATFORM_UNALIGNED_WORD_ACCESS
619 len += SIZE_LENGTH + cc->mbuf->used;
620#else
621 len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
622#endif
623 }
624
625 return len;
626}
627
628static int
629compile_cclass_node(CClassNode* cc, regex_t* reg)
630{
631 int r;
632
633 if (IS_NULL(cc->mbuf)) {
634 if (IS_NCCLASS_NOT(cc))
635 add_opcode(reg, OP_CCLASS_NOT);
636 else
637 add_opcode(reg, OP_CCLASS);
638
639 r = add_bitset(reg, cc->bs);
640 }
641 else {
642 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
643 if (IS_NCCLASS_NOT(cc))
644 add_opcode(reg, OP_CCLASS_MB_NOT);
645 else
646 add_opcode(reg, OP_CCLASS_MB);
647
648 r = add_multi_byte_cclass(cc->mbuf, reg);
649 }
650 else {
651 if (IS_NCCLASS_NOT(cc))
652 add_opcode(reg, OP_CCLASS_MIX_NOT);
653 else
654 add_opcode(reg, OP_CCLASS_MIX);
655
656 r = add_bitset(reg, cc->bs);
657 if (r) return r;
658 r = add_multi_byte_cclass(cc->mbuf, reg);
659 }
660 }
661
662 return r;
663}
664
665static int
666entry_repeat_range(regex_t* reg, int id, int lower, int upper)
667{
668#define REPEAT_RANGE_ALLOC 4
669
671
672 if (reg->repeat_range_alloc == 0) {
673 p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
674 CHECK_NULL_RETURN_MEMERR(p);
675 reg->repeat_range = p;
676 reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
677 }
678 else if (reg->repeat_range_alloc <= id) {
679 int n;
680 n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
681 p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
682 sizeof(OnigRepeatRange) * n);
683 CHECK_NULL_RETURN_MEMERR(p);
684 reg->repeat_range = p;
685 reg->repeat_range_alloc = n;
686 }
687 else {
688 p = reg->repeat_range;
689 }
690
691 p[id].lower = lower;
692 p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
693 return 0;
694}
695
696static int
697compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
698 regex_t* reg)
699{
700 int r;
701 int num_repeat = reg->num_repeat;
702
703 r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
704 if (r) return r;
705 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
706 reg->num_repeat++;
707 if ((MemNumType)reg->num_repeat <= 0) return ONIGERR_TOO_MANY_RANGE_REPEAT;
708 if (r) return r;
709 r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
710 if (r) return r;
711
712 r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
713 if (r) return r;
714
715 r = compile_tree_empty_check(qn->target, reg, empty_info);
716 if (r) return r;
717
718 if (
719#ifdef USE_SUBEXP_CALL
720 reg->num_call > 0 ||
721#endif
722 IS_QUANTIFIER_IN_REPEAT(qn)) {
723 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
724 }
725 else {
726 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
727 }
728 if (r) return r;
729 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
730 return r;
731}
732
733static int
734is_anychar_star_quantifier(QtfrNode* qn)
735{
736 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
737 NTYPE(qn->target) == NT_CANY)
738 return 1;
739 else
740 return 0;
741}
742
743#define QUANTIFIER_EXPAND_LIMIT_SIZE 50
744#define CKN_ON (ckn > 0)
745
746#ifdef USE_COMBINATION_EXPLOSION_CHECK
747
748static int
749compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
750{
751 int len, mod_tlen, cklen;
752 int ckn;
753 int infinite = IS_REPEAT_INFINITE(qn->upper);
754 int empty_info = qn->target_empty_info;
755 int tlen = compile_length_tree(qn->target, reg);
756
757 if (tlen < 0) return tlen;
758
759 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
760
761 cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
762
763 /* anychar repeat */
764 if (NTYPE(qn->target) == NT_CANY) {
765 if (qn->greedy && infinite) {
766 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
767 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
768 else
769 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
770 }
771 }
772
773 if (empty_info != 0)
774 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
775 else
776 mod_tlen = tlen;
777
778 if (infinite && qn->lower <= 1) {
779 if (qn->greedy) {
780 if (qn->lower == 1)
781 len = SIZE_OP_JUMP;
782 else
783 len = 0;
784
785 len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
786 }
787 else {
788 if (qn->lower == 0)
789 len = SIZE_OP_JUMP;
790 else
791 len = 0;
792
793 len += mod_tlen + SIZE_OP_PUSH + cklen;
794 }
795 }
796 else if (qn->upper == 0) {
797 if (qn->is_referred != 0) /* /(?<n>..){0}/ */
798 len = SIZE_OP_JUMP + tlen;
799 else
800 len = 0;
801 }
802 else if (qn->upper == 1 && qn->greedy) {
803 if (qn->lower == 0) {
804 if (CKN_ON) {
805 len = SIZE_OP_STATE_CHECK_PUSH + tlen;
806 }
807 else {
808 len = SIZE_OP_PUSH + tlen;
809 }
810 }
811 else {
812 len = tlen;
813 }
814 }
815 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
816 len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
817 }
818 else {
819 len = SIZE_OP_REPEAT_INC
820 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
821 if (CKN_ON)
822 len += SIZE_OP_STATE_CHECK;
823 }
824
825 return len;
826}
827
828static int
829compile_quantifier_node(QtfrNode* qn, regex_t* reg)
830{
831 int r, mod_tlen;
832 int ckn;
833 int infinite = IS_REPEAT_INFINITE(qn->upper);
834 int empty_info = qn->target_empty_info;
835 int tlen = compile_length_tree(qn->target, reg);
836
837 if (tlen < 0) return tlen;
838
839 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
840
841 if (is_anychar_star_quantifier(qn)) {
842 r = compile_tree_n_times(qn->target, qn->lower, reg);
843 if (r) return r;
844 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
845 if (IS_MULTILINE(reg->options))
846 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
847 else
848 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
849 if (r) return r;
850 if (CKN_ON) {
851 r = add_state_check_num(reg, ckn);
852 if (r) return r;
853 }
854
855 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
856 }
857 else {
858 if (IS_MULTILINE(reg->options)) {
859 r = add_opcode(reg, (CKN_ON ?
860 OP_STATE_CHECK_ANYCHAR_ML_STAR
861 : OP_ANYCHAR_ML_STAR));
862 }
863 else {
864 r = add_opcode(reg, (CKN_ON ?
865 OP_STATE_CHECK_ANYCHAR_STAR
866 : OP_ANYCHAR_STAR));
867 }
868 if (r) return r;
869 if (CKN_ON)
870 r = add_state_check_num(reg, ckn);
871
872 return r;
873 }
874 }
875
876 if (empty_info != 0)
877 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
878 else
879 mod_tlen = tlen;
880
881 if (infinite && qn->lower <= 1) {
882 if (qn->greedy) {
883 if (qn->lower == 1) {
884 r = add_opcode_rel_addr(reg, OP_JUMP,
885 (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
886 if (r) return r;
887 }
888
889 if (CKN_ON) {
890 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
891 if (r) return r;
892 r = add_state_check_num(reg, ckn);
893 if (r) return r;
894 r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
895 }
896 else {
897 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
898 }
899 if (r) return r;
900 r = compile_tree_empty_check(qn->target, reg, empty_info);
901 if (r) return r;
902 r = add_opcode_rel_addr(reg, OP_JUMP,
903 -(mod_tlen + (int )SIZE_OP_JUMP
904 + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
905 }
906 else {
907 if (qn->lower == 0) {
908 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
909 if (r) return r;
910 }
911 r = compile_tree_empty_check(qn->target, reg, empty_info);
912 if (r) return r;
913 if (CKN_ON) {
914 r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
915 if (r) return r;
916 r = add_state_check_num(reg, ckn);
917 if (r) return r;
918 r = add_rel_addr(reg,
919 -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
920 }
921 else
922 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
923 }
924 }
925 else if (qn->upper == 0) {
926 if (qn->is_referred != 0) { /* /(?<n>..){0}/ */
927 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
928 if (r) return r;
929 r = compile_tree(qn->target, reg);
930 }
931 else
932 r = 0;
933 }
934 else if (qn->upper == 1 && qn->greedy) {
935 if (qn->lower == 0) {
936 if (CKN_ON) {
937 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
938 if (r) return r;
939 r = add_state_check_num(reg, ckn);
940 if (r) return r;
941 r = add_rel_addr(reg, tlen);
942 }
943 else {
944 r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
945 }
946 if (r) return r;
947 }
948
949 r = compile_tree(qn->target, reg);
950 }
951 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
952 if (CKN_ON) {
953 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
954 if (r) return r;
955 r = add_state_check_num(reg, ckn);
956 if (r) return r;
957 r = add_rel_addr(reg, SIZE_OP_JUMP);
958 }
959 else {
960 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
961 }
962
963 if (r) return r;
964 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
965 if (r) return r;
966 r = compile_tree(qn->target, reg);
967 }
968 else {
969 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
970 if (CKN_ON) {
971 if (r) return r;
972 r = add_opcode(reg, OP_STATE_CHECK);
973 if (r) return r;
974 r = add_state_check_num(reg, ckn);
975 }
976 }
977 return r;
978}
979
980#else /* USE_COMBINATION_EXPLOSION_CHECK */
981
982static int
983compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
984{
985 int len, mod_tlen;
986 int infinite = IS_REPEAT_INFINITE(qn->upper);
987 int empty_info = qn->target_empty_info;
988 int tlen = compile_length_tree(qn->target, reg);
989
990 if (tlen < 0) return tlen;
991
992 /* anychar repeat */
993 if (NTYPE(qn->target) == NT_CANY) {
994 if (qn->greedy && infinite) {
995 if (IS_NOT_NULL(qn->next_head_exact))
996 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
997 else
998 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
999 }
1000 }
1001
1002 if (empty_info != 0)
1003 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1004 else
1005 mod_tlen = tlen;
1006
1007 if (infinite &&
1008 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1009 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1010 len = SIZE_OP_JUMP;
1011 }
1012 else {
1013 len = tlen * qn->lower;
1014 }
1015
1016 if (qn->greedy) {
1017#ifdef USE_OP_PUSH_OR_JUMP_EXACT
1018 if (IS_NOT_NULL(qn->head_exact))
1019 len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
1020 else
1021#endif
1022 if (IS_NOT_NULL(qn->next_head_exact))
1023 len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
1024 else
1025 len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
1026 }
1027 else
1028 len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
1029 }
1030 else if (qn->upper == 0 && qn->is_referred != 0) { /* /(?<n>..){0}/ */
1031 len = SIZE_OP_JUMP + tlen;
1032 }
1033 else if (!infinite && qn->greedy &&
1034 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1035 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1036 len = tlen * qn->lower;
1037 len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
1038 }
1039 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1040 len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1041 }
1042 else {
1043 len = SIZE_OP_REPEAT_INC
1044 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
1045 }
1046
1047 return len;
1048}
1049
1050static int
1051compile_quantifier_node(QtfrNode* qn, regex_t* reg)
1052{
1053 int i, r, mod_tlen;
1054 int infinite = IS_REPEAT_INFINITE(qn->upper);
1055 int empty_info = qn->target_empty_info;
1056 int tlen = compile_length_tree(qn->target, reg);
1057
1058 if (tlen < 0) return tlen;
1059
1060 if (is_anychar_star_quantifier(qn)) {
1061 r = compile_tree_n_times(qn->target, qn->lower, reg);
1062 if (r) return r;
1063 if (IS_NOT_NULL(qn->next_head_exact)) {
1064 if (IS_MULTILINE(reg->options))
1065 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
1066 else
1067 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
1068 if (r) return r;
1069 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1070 }
1071 else {
1072 if (IS_MULTILINE(reg->options))
1073 return add_opcode(reg, OP_ANYCHAR_ML_STAR);
1074 else
1075 return add_opcode(reg, OP_ANYCHAR_STAR);
1076 }
1077 }
1078
1079 if (empty_info != 0)
1080 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1081 else
1082 mod_tlen = tlen;
1083
1084 if (infinite &&
1085 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1086 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1087 if (qn->greedy) {
1088#ifdef USE_OP_PUSH_OR_JUMP_EXACT
1089 if (IS_NOT_NULL(qn->head_exact))
1090 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
1091 else
1092#endif
1093 if (IS_NOT_NULL(qn->next_head_exact))
1094 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
1095 else
1096 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
1097 }
1098 else {
1099 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
1100 }
1101 if (r) return r;
1102 }
1103 else {
1104 r = compile_tree_n_times(qn->target, qn->lower, reg);
1105 if (r) return r;
1106 }
1107
1108 if (qn->greedy) {
1109#ifdef USE_OP_PUSH_OR_JUMP_EXACT
1110 if (IS_NOT_NULL(qn->head_exact)) {
1111 r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
1112 mod_tlen + SIZE_OP_JUMP);
1113 if (r) return r;
1114 add_bytes(reg, NSTR(qn->head_exact)->s, 1);
1115 r = compile_tree_empty_check(qn->target, reg, empty_info);
1116 if (r) return r;
1117 r = add_opcode_rel_addr(reg, OP_JUMP,
1118 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
1119 }
1120 else
1121#endif
1122 if (IS_NOT_NULL(qn->next_head_exact)) {
1123 r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
1124 mod_tlen + SIZE_OP_JUMP);
1125 if (r) return r;
1126 add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1127 r = compile_tree_empty_check(qn->target, reg, empty_info);
1128 if (r) return r;
1129 r = add_opcode_rel_addr(reg, OP_JUMP,
1130 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
1131 }
1132 else {
1133 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
1134 if (r) return r;
1135 r = compile_tree_empty_check(qn->target, reg, empty_info);
1136 if (r) return r;
1137 r = add_opcode_rel_addr(reg, OP_JUMP,
1138 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
1139 }
1140 }
1141 else {
1142 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
1143 if (r) return r;
1144 r = compile_tree_empty_check(qn->target, reg, empty_info);
1145 if (r) return r;
1146 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
1147 }
1148 }
1149 else if (qn->upper == 0 && qn->is_referred != 0) { /* /(?<n>..){0}/ */
1150 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1151 if (r) return r;
1152 r = compile_tree(qn->target, reg);
1153 }
1154 else if (!infinite && qn->greedy &&
1155 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1156 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1157 int n = qn->upper - qn->lower;
1158
1159 r = compile_tree_n_times(qn->target, qn->lower, reg);
1160 if (r) return r;
1161
1162 for (i = 0; i < n; i++) {
1163 r = add_opcode_rel_addr(reg, OP_PUSH,
1164 (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
1165 if (r) return r;
1166 r = compile_tree(qn->target, reg);
1167 if (r) return r;
1168 }
1169 }
1170 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1171 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
1172 if (r) return r;
1173 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1174 if (r) return r;
1175 r = compile_tree(qn->target, reg);
1176 }
1177 else {
1178 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
1179 }
1180 return r;
1181}
1182#endif /* USE_COMBINATION_EXPLOSION_CHECK */
1183
1184static int
1185compile_length_option_node(EncloseNode* node, regex_t* reg)
1186{
1187 int tlen;
1188 OnigOptionType prev = reg->options;
1189
1190 reg->options = node->option;
1191 tlen = compile_length_tree(node->target, reg);
1192 reg->options = prev;
1193
1194 if (tlen < 0) return tlen;
1195
1196 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1197 return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
1198 + tlen + SIZE_OP_SET_OPTION;
1199 }
1200 else
1201 return tlen;
1202}
1203
1204static int
1205compile_option_node(EncloseNode* node, regex_t* reg)
1206{
1207 int r;
1208 OnigOptionType prev = reg->options;
1209
1210 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1211 r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
1212 if (r) return r;
1213 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1214 if (r) return r;
1215 r = add_opcode(reg, OP_FAIL);
1216 if (r) return r;
1217 }
1218
1219 reg->options = node->option;
1220 r = compile_tree(node->target, reg);
1221 reg->options = prev;
1222
1223 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1224 if (r) return r;
1225 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1226 }
1227 return r;
1228}
1229
1230static int
1231compile_length_enclose_node(EncloseNode* node, regex_t* reg)
1232{
1233 int len;
1234 int tlen;
1235
1236 if (node->type == ENCLOSE_OPTION)
1237 return compile_length_option_node(node, reg);
1238
1239 if (node->target) {
1240 tlen = compile_length_tree(node->target, reg);
1241 if (tlen < 0) return tlen;
1242 }
1243 else
1244 tlen = 0;
1245
1246 switch (node->type) {
1247 case ENCLOSE_MEMORY:
1248#ifdef USE_SUBEXP_CALL
1249 if (IS_ENCLOSE_CALLED(node)) {
1250 len = SIZE_OP_MEMORY_START_PUSH + tlen
1251 + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1252 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1253 len += (IS_ENCLOSE_RECURSION(node)
1254 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1255 else
1256 len += (IS_ENCLOSE_RECURSION(node)
1257 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1258 }
1259 else if (IS_ENCLOSE_RECURSION(node)) {
1260 len = SIZE_OP_MEMORY_START_PUSH;
1261 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1262 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_REC);
1263 }
1264 else
1265#endif
1266 {
1267 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1268 len = SIZE_OP_MEMORY_START_PUSH;
1269 else
1270 len = SIZE_OP_MEMORY_START;
1271
1272 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1273 ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1274 }
1275 break;
1276
1277 case ENCLOSE_STOP_BACKTRACK:
1278 /* Disable POP_STOP_BT optimization for simple repeat under the match cache */
1279 /* optimization because the match cache optimization pushes an extra item to */
1280 /* the stack and it breaks the assumption for this optimization. */
1281#ifndef USE_MATCH_CACHE
1282 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1283 QtfrNode* qn = NQTFR(node->target);
1284 tlen = compile_length_tree(qn->target, reg);
1285 if (tlen < 0) return tlen;
1286
1287 len = tlen * qn->lower
1288 + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
1289 }
1290 else {
1291#endif
1292 len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
1293#ifndef USE_MATCH_CACHE
1294 }
1295#endif
1296 break;
1297
1298 case ENCLOSE_CONDITION:
1299 len = SIZE_OP_CONDITION;
1300 if (NTYPE(node->target) == NT_ALT) {
1301 Node* x = node->target;
1302
1303 tlen = compile_length_tree(NCAR(x), reg); /* yes-node */
1304 if (tlen < 0) return tlen;
1305 len += tlen + SIZE_OP_JUMP;
1306 if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG;
1307 x = NCDR(x);
1308 tlen = compile_length_tree(NCAR(x), reg); /* no-node */
1309 if (tlen < 0) return tlen;
1310 len += tlen;
1311 if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN;
1312 }
1313 else {
1314 return ONIGERR_PARSER_BUG;
1315 }
1316 break;
1317
1318 case ENCLOSE_ABSENT:
1319 len = SIZE_OP_PUSH_ABSENT_POS + SIZE_OP_ABSENT + tlen + SIZE_OP_ABSENT_END;
1320 break;
1321
1322 default:
1323 return ONIGERR_TYPE_BUG;
1324 break;
1325 }
1326
1327 return len;
1328}
1329
1330static int get_char_length_tree(Node* node, regex_t* reg, int* len);
1331
1332static int
1333compile_enclose_node(EncloseNode* node, regex_t* reg)
1334{
1335 int r, len;
1336
1337 if (node->type == ENCLOSE_OPTION)
1338 return compile_option_node(node, reg);
1339
1340 switch (node->type) {
1341 case ENCLOSE_MEMORY:
1342#ifdef USE_SUBEXP_CALL
1343 if (IS_ENCLOSE_CALLED(node)) {
1344 r = add_opcode(reg, OP_CALL);
1345 if (r) return r;
1346 node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
1347 node->state |= NST_ADDR_FIXED;
1348 r = add_abs_addr(reg, (int )node->call_addr);
1349 if (r) return r;
1350 len = compile_length_tree(node->target, reg);
1351 len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1352 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1353 len += (IS_ENCLOSE_RECURSION(node)
1354 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1355 else
1356 len += (IS_ENCLOSE_RECURSION(node)
1357 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1358
1359 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1360 if (r) return r;
1361 }
1362#endif
1363 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1364 r = add_opcode(reg, OP_MEMORY_START_PUSH);
1365 else
1366 r = add_opcode(reg, OP_MEMORY_START);
1367 if (r) return r;
1368 r = add_mem_num(reg, node->regnum);
1369 if (r) return r;
1370 r = compile_tree(node->target, reg);
1371 if (r) return r;
1372#ifdef USE_SUBEXP_CALL
1373 if (IS_ENCLOSE_CALLED(node)) {
1374 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1375 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1376 ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1377 else
1378 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1379 ? OP_MEMORY_END_REC : OP_MEMORY_END));
1380
1381 if (r) return r;
1382 r = add_mem_num(reg, node->regnum);
1383 if (r) return r;
1384 r = add_opcode(reg, OP_RETURN);
1385 }
1386 else if (IS_ENCLOSE_RECURSION(node)) {
1387 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1388 r = add_opcode(reg, OP_MEMORY_END_PUSH_REC);
1389 else
1390 r = add_opcode(reg, OP_MEMORY_END_REC);
1391 if (r) return r;
1392 r = add_mem_num(reg, node->regnum);
1393 }
1394 else
1395#endif
1396 {
1397 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1398 r = add_opcode(reg, OP_MEMORY_END_PUSH);
1399 else
1400 r = add_opcode(reg, OP_MEMORY_END);
1401 if (r) return r;
1402 r = add_mem_num(reg, node->regnum);
1403 }
1404 break;
1405
1406 case ENCLOSE_STOP_BACKTRACK:
1407 /* Disable POP_STOP_BT optimization for simple repeat under the match cache */
1408 /* optimization because the match cache optimization pushes an extra item to */
1409 /* the stack and it breaks the assumption for this optimization. */
1410#ifndef USE_MATCH_CACHE
1411 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1412 QtfrNode* qn = NQTFR(node->target);
1413 r = compile_tree_n_times(qn->target, qn->lower, reg);
1414 if (r) return r;
1415
1416 len = compile_length_tree(qn->target, reg);
1417 if (len < 0) return len;
1418
1419 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
1420 if (r) return r;
1421 r = compile_tree(qn->target, reg);
1422 if (r) return r;
1423 r = add_opcode(reg, OP_POP);
1424 if (r) return r;
1425 r = add_opcode_rel_addr(reg, OP_JUMP,
1426 -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
1427 }
1428 else {
1429#endif
1430 r = add_opcode(reg, OP_PUSH_STOP_BT);
1431 if (r) return r;
1432 r = compile_tree(node->target, reg);
1433 if (r) return r;
1434 r = add_opcode(reg, OP_POP_STOP_BT);
1435#ifndef USE_MATCH_CACHE
1436 }
1437#endif
1438 break;
1439
1440 case ENCLOSE_CONDITION:
1441 r = add_opcode(reg, OP_CONDITION);
1442 if (r) return r;
1443 r = add_mem_num(reg, node->regnum);
1444 if (r) return r;
1445
1446 if (NTYPE(node->target) == NT_ALT) {
1447 Node* x = node->target;
1448 int len2;
1449
1450 len = compile_length_tree(NCAR(x), reg); /* yes-node */
1451 if (len < 0) return len;
1452 if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG;
1453 x = NCDR(x);
1454 len2 = compile_length_tree(NCAR(x), reg); /* no-node */
1455 if (len2 < 0) return len2;
1456 if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN;
1457
1458 x = node->target;
1459 r = add_rel_addr(reg, len + SIZE_OP_JUMP);
1460 if (r) return r;
1461 r = compile_tree(NCAR(x), reg); /* yes-node */
1462 if (r) return r;
1463 r = add_opcode_rel_addr(reg, OP_JUMP, len2);
1464 if (r) return r;
1465 x = NCDR(x);
1466 r = compile_tree(NCAR(x), reg); /* no-node */
1467 }
1468 else {
1469 return ONIGERR_PARSER_BUG;
1470 }
1471 break;
1472
1473 case ENCLOSE_ABSENT:
1474 len = compile_length_tree(node->target, reg);
1475 if (len < 0) return len;
1476
1477 r = add_opcode(reg, OP_PUSH_ABSENT_POS);
1478 if (r) return r;
1479 r = add_opcode_rel_addr(reg, OP_ABSENT, len + SIZE_OP_ABSENT_END);
1480 if (r) return r;
1481 r = compile_tree(node->target, reg);
1482 if (r) return r;
1483 r = add_opcode(reg, OP_ABSENT_END);
1484 break;
1485
1486 default:
1487 return ONIGERR_TYPE_BUG;
1488 break;
1489 }
1490
1491 return r;
1492}
1493
1494static int
1495compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1496{
1497 int len;
1498 int tlen = 0;
1499
1500 if (node->target) {
1501 tlen = compile_length_tree(node->target, reg);
1502 if (tlen < 0) return tlen;
1503 }
1504
1505 switch (node->type) {
1506 case ANCHOR_PREC_READ:
1507 len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
1508 break;
1509 case ANCHOR_PREC_READ_NOT:
1510 len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
1511 break;
1512 case ANCHOR_LOOK_BEHIND:
1513 len = SIZE_OP_LOOK_BEHIND + tlen;
1514 break;
1515 case ANCHOR_LOOK_BEHIND_NOT:
1516 len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
1517 break;
1518
1519 default:
1520 len = SIZE_OPCODE;
1521 break;
1522 }
1523
1524 return len;
1525}
1526
1527static int
1528compile_anchor_node(AnchorNode* node, regex_t* reg)
1529{
1530 int r, len;
1531
1532 switch (node->type) {
1533 case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break;
1534 case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break;
1535 case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break;
1536 case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break;
1537 case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
1538 case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
1539
1540 case ANCHOR_WORD_BOUND:
1541 if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BOUND);
1542 else r = add_opcode(reg, OP_WORD_BOUND);
1543 break;
1544 case ANCHOR_NOT_WORD_BOUND:
1545 if (node->ascii_range) r = add_opcode(reg, OP_NOT_ASCII_WORD_BOUND);
1546 else r = add_opcode(reg, OP_NOT_WORD_BOUND);
1547 break;
1548#ifdef USE_WORD_BEGIN_END
1549 case ANCHOR_WORD_BEGIN:
1550 if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BEGIN);
1551 else r = add_opcode(reg, OP_WORD_BEGIN);
1552 break;
1553 case ANCHOR_WORD_END:
1554 if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_END);
1555 else r = add_opcode(reg, OP_WORD_END);
1556 break;
1557#endif
1558 case ANCHOR_KEEP: r = add_opcode(reg, OP_KEEP); break;
1559
1560 case ANCHOR_PREC_READ:
1561 r = add_opcode(reg, OP_PUSH_POS);
1562 if (r) return r;
1563 r = compile_tree(node->target, reg);
1564 if (r) return r;
1565 r = add_opcode(reg, OP_POP_POS);
1566 break;
1567
1568 case ANCHOR_PREC_READ_NOT:
1569 len = compile_length_tree(node->target, reg);
1570 if (len < 0) return len;
1571 r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
1572 if (r) return r;
1573 r = compile_tree(node->target, reg);
1574 if (r) return r;
1575 r = add_opcode(reg, OP_FAIL_POS);
1576 break;
1577
1578 case ANCHOR_LOOK_BEHIND:
1579 {
1580 int n;
1581 r = add_opcode(reg, OP_LOOK_BEHIND);
1582 if (r) return r;
1583 if (node->char_len < 0) {
1584 r = get_char_length_tree(node->target, reg, &n);
1585 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1586 }
1587 else
1588 n = node->char_len;
1589 r = add_length(reg, n);
1590 if (r) return r;
1591 r = compile_tree(node->target, reg);
1592 }
1593 break;
1594
1595 case ANCHOR_LOOK_BEHIND_NOT:
1596 {
1597 int n;
1598 len = compile_length_tree(node->target, reg);
1599 r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
1600 len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
1601 if (r) return r;
1602 if (node->char_len < 0) {
1603 r = get_char_length_tree(node->target, reg, &n);
1604 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1605 }
1606 else
1607 n = node->char_len;
1608 r = add_length(reg, n);
1609 if (r) return r;
1610 r = compile_tree(node->target, reg);
1611 if (r) return r;
1612 r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
1613 }
1614 break;
1615
1616 default:
1617 return ONIGERR_TYPE_BUG;
1618 break;
1619 }
1620
1621 return r;
1622}
1623
1624static int
1625compile_length_tree(Node* node, regex_t* reg)
1626{
1627 int len, type, r;
1628
1629 type = NTYPE(node);
1630 switch (type) {
1631 case NT_LIST:
1632 len = 0;
1633 do {
1634 r = compile_length_tree(NCAR(node), reg);
1635 if (r < 0) return r;
1636 len += r;
1637 } while (IS_NOT_NULL(node = NCDR(node)));
1638 r = len;
1639 break;
1640
1641 case NT_ALT:
1642 {
1643 int n = 0;
1644 len = 0;
1645 do {
1646 r = compile_length_tree(NCAR(node), reg);
1647 if (r < 0) return r;
1648 len += r;
1649 n++;
1650 } while (IS_NOT_NULL(node = NCDR(node)));
1651 r = len;
1652 r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1653 }
1654 break;
1655
1656 case NT_STR:
1657 if (NSTRING_IS_RAW(node))
1658 r = compile_length_string_raw_node(NSTR(node), reg);
1659 else
1660 r = compile_length_string_node(node, reg);
1661 break;
1662
1663 case NT_CCLASS:
1664 r = compile_length_cclass_node(NCCLASS(node), reg);
1665 break;
1666
1667 case NT_CTYPE:
1668 case NT_CANY:
1669 r = SIZE_OPCODE;
1670 break;
1671
1672 case NT_BREF:
1673 {
1674 BRefNode* br = NBREF(node);
1675
1676#ifdef USE_BACKREF_WITH_LEVEL
1677 if (IS_BACKREF_NEST_LEVEL(br)) {
1678 r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
1679 SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1680 }
1681 else
1682#endif
1683 if (br->back_num == 1) {
1684 r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
1685 ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
1686 }
1687 else {
1688 r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1689 }
1690 }
1691 break;
1692
1693#ifdef USE_SUBEXP_CALL
1694 case NT_CALL:
1695 r = SIZE_OP_CALL;
1696 break;
1697#endif
1698
1699 case NT_QTFR:
1700 r = compile_length_quantifier_node(NQTFR(node), reg);
1701 break;
1702
1703 case NT_ENCLOSE:
1704 r = compile_length_enclose_node(NENCLOSE(node), reg);
1705 break;
1706
1707 case NT_ANCHOR:
1708 r = compile_length_anchor_node(NANCHOR(node), reg);
1709 break;
1710
1711 default:
1712 return ONIGERR_TYPE_BUG;
1713 break;
1714 }
1715
1716 return r;
1717}
1718
1719static int
1720compile_tree(Node* node, regex_t* reg)
1721{
1722 int n, type, len, pos, r = 0;
1723
1724 type = NTYPE(node);
1725 switch (type) {
1726 case NT_LIST:
1727 do {
1728 r = compile_tree(NCAR(node), reg);
1729 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1730 break;
1731
1732 case NT_ALT:
1733 {
1734 Node* x = node;
1735 len = 0;
1736 do {
1737 len += compile_length_tree(NCAR(x), reg);
1738 if (NCDR(x) != NULL) {
1739 len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1740 }
1741 } while (IS_NOT_NULL(x = NCDR(x)));
1742 pos = reg->used + len; /* goal position */
1743
1744 do {
1745 len = compile_length_tree(NCAR(node), reg);
1746 if (IS_NOT_NULL(NCDR(node))) {
1747 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
1748 if (r) break;
1749 }
1750 r = compile_tree(NCAR(node), reg);
1751 if (r) break;
1752 if (IS_NOT_NULL(NCDR(node))) {
1753 len = pos - (reg->used + SIZE_OP_JUMP);
1754 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1755 if (r) break;
1756 }
1757 } while (IS_NOT_NULL(node = NCDR(node)));
1758 }
1759 break;
1760
1761 case NT_STR:
1762 if (NSTRING_IS_RAW(node))
1763 r = compile_string_raw_node(NSTR(node), reg);
1764 else
1765 r = compile_string_node(node, reg);
1766 break;
1767
1768 case NT_CCLASS:
1769 r = compile_cclass_node(NCCLASS(node), reg);
1770 break;
1771
1772 case NT_CTYPE:
1773 {
1774 int op;
1775
1776 switch (NCTYPE(node)->ctype) {
1777 case ONIGENC_CTYPE_WORD:
1778 if (NCTYPE(node)->ascii_range != 0) {
1779 if (NCTYPE(node)->not != 0) op = OP_NOT_ASCII_WORD;
1780 else op = OP_ASCII_WORD;
1781 }
1782 else {
1783 if (NCTYPE(node)->not != 0) op = OP_NOT_WORD;
1784 else op = OP_WORD;
1785 }
1786 break;
1787 default:
1788 return ONIGERR_TYPE_BUG;
1789 break;
1790 }
1791 r = add_opcode(reg, op);
1792 }
1793 break;
1794
1795 case NT_CANY:
1796 if (IS_MULTILINE(reg->options))
1797 r = add_opcode(reg, OP_ANYCHAR_ML);
1798 else
1799 r = add_opcode(reg, OP_ANYCHAR);
1800 break;
1801
1802 case NT_BREF:
1803 {
1804 BRefNode* br = NBREF(node);
1805
1806#ifdef USE_BACKREF_WITH_LEVEL
1807 if (IS_BACKREF_NEST_LEVEL(br)) {
1808 r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
1809 if (r) return r;
1810 r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
1811 if (r) return r;
1812 r = add_length(reg, br->nest_level);
1813 if (r) return r;
1814
1815 goto add_bacref_mems;
1816 }
1817 else
1818#endif
1819 if (br->back_num == 1) {
1820 n = br->back_static[0];
1821 if (IS_IGNORECASE(reg->options)) {
1822 r = add_opcode(reg, OP_BACKREFN_IC);
1823 if (r) return r;
1824 r = add_mem_num(reg, n);
1825 }
1826 else {
1827 switch (n) {
1828 case 1: r = add_opcode(reg, OP_BACKREF1); break;
1829 case 2: r = add_opcode(reg, OP_BACKREF2); break;
1830 default:
1831 r = add_opcode(reg, OP_BACKREFN);
1832 if (r) return r;
1833 r = add_mem_num(reg, n);
1834 break;
1835 }
1836 }
1837 }
1838 else {
1839 int i;
1840 int* p;
1841
1842 if (IS_IGNORECASE(reg->options)) {
1843 r = add_opcode(reg, OP_BACKREF_MULTI_IC);
1844 }
1845 else {
1846 r = add_opcode(reg, OP_BACKREF_MULTI);
1847 }
1848 if (r) return r;
1849
1850#ifdef USE_BACKREF_WITH_LEVEL
1851 add_bacref_mems:
1852#endif
1853 r = add_length(reg, br->back_num);
1854 if (r) return r;
1855 p = BACKREFS_P(br);
1856 for (i = br->back_num - 1; i >= 0; i--) {
1857 r = add_mem_num(reg, p[i]);
1858 if (r) return r;
1859 }
1860 }
1861 }
1862 break;
1863
1864#ifdef USE_SUBEXP_CALL
1865 case NT_CALL:
1866 r = compile_call(NCALL(node), reg);
1867 break;
1868#endif
1869
1870 case NT_QTFR:
1871 r = compile_quantifier_node(NQTFR(node), reg);
1872 break;
1873
1874 case NT_ENCLOSE:
1875 r = compile_enclose_node(NENCLOSE(node), reg);
1876 break;
1877
1878 case NT_ANCHOR:
1879 r = compile_anchor_node(NANCHOR(node), reg);
1880 break;
1881
1882 default:
1883#ifdef ONIG_DEBUG
1884 fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
1885#endif
1886 break;
1887 }
1888
1889 return r;
1890}
1891
1892#ifdef USE_NAMED_GROUP
1893
1894static int
1895noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
1896{
1897 int r = 0;
1898 Node* node = *plink;
1899
1900 switch (NTYPE(node)) {
1901 case NT_LIST:
1902 case NT_ALT:
1903 do {
1904 r = noname_disable_map(&(NCAR(node)), map, counter);
1905 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1906 break;
1907
1908 case NT_QTFR:
1909 {
1910 Node** ptarget = &(NQTFR(node)->target);
1911 Node* old = *ptarget;
1912 r = noname_disable_map(ptarget, map, counter);
1913 if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) {
1914 onig_reduce_nested_quantifier(node, *ptarget);
1915 }
1916 }
1917 break;
1918
1919 case NT_ENCLOSE:
1920 {
1921 EncloseNode* en = NENCLOSE(node);
1922 if (en->type == ENCLOSE_MEMORY) {
1923 if (IS_ENCLOSE_NAMED_GROUP(en)) {
1924 (*counter)++;
1925 map[en->regnum].new_val = *counter;
1926 en->regnum = *counter;
1927 }
1928 else if (en->regnum != 0) {
1929 *plink = en->target;
1930 en->target = NULL_NODE;
1931 onig_node_free(node);
1932 r = noname_disable_map(plink, map, counter);
1933 break;
1934 }
1935 }
1936 r = noname_disable_map(&(en->target), map, counter);
1937 }
1938 break;
1939
1940 case NT_ANCHOR:
1941 if (NANCHOR(node)->target)
1942 r = noname_disable_map(&(NANCHOR(node)->target), map, counter);
1943 break;
1944
1945 default:
1946 break;
1947 }
1948
1949 return r;
1950}
1951
1952static int
1953renumber_node_backref(Node* node, GroupNumRemap* map, const int num_mem)
1954{
1955 int i, pos, n, old_num;
1956 int *backs;
1957 BRefNode* bn = NBREF(node);
1958
1959 if (! IS_BACKREF_NAME_REF(bn))
1960 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1961
1962 old_num = bn->back_num;
1963 if (IS_NULL(bn->back_dynamic))
1964 backs = bn->back_static;
1965 else
1966 backs = bn->back_dynamic;
1967
1968 for (i = 0, pos = 0; i < old_num; i++) {
1969 if (backs[i] > num_mem) return ONIGERR_INVALID_BACKREF;
1970 n = map[backs[i]].new_val;
1971 if (n > 0) {
1972 backs[pos] = n;
1973 pos++;
1974 }
1975 }
1976
1977 bn->back_num = pos;
1978 return 0;
1979}
1980
1981static int
1982renumber_by_map(Node* node, GroupNumRemap* map, const int num_mem)
1983{
1984 int r = 0;
1985
1986 switch (NTYPE(node)) {
1987 case NT_LIST:
1988 case NT_ALT:
1989 do {
1990 r = renumber_by_map(NCAR(node), map, num_mem);
1991 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1992 break;
1993 case NT_QTFR:
1994 r = renumber_by_map(NQTFR(node)->target, map, num_mem);
1995 break;
1996 case NT_ENCLOSE:
1997 {
1998 EncloseNode* en = NENCLOSE(node);
1999 if (en->type == ENCLOSE_CONDITION) {
2000 if (en->regnum > num_mem) return ONIGERR_INVALID_BACKREF;
2001 en->regnum = map[en->regnum].new_val;
2002 }
2003 r = renumber_by_map(en->target, map, num_mem);
2004 }
2005 break;
2006
2007 case NT_BREF:
2008 r = renumber_node_backref(node, map, num_mem);
2009 break;
2010
2011 case NT_ANCHOR:
2012 if (NANCHOR(node)->target)
2013 r = renumber_by_map(NANCHOR(node)->target, map, num_mem);
2014 break;
2015
2016 default:
2017 break;
2018 }
2019
2020 return r;
2021}
2022
2023static int
2024numbered_ref_check(Node* node)
2025{
2026 int r = 0;
2027
2028 switch (NTYPE(node)) {
2029 case NT_LIST:
2030 case NT_ALT:
2031 do {
2032 r = numbered_ref_check(NCAR(node));
2033 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2034 break;
2035 case NT_QTFR:
2036 r = numbered_ref_check(NQTFR(node)->target);
2037 break;
2038 case NT_ENCLOSE:
2039 r = numbered_ref_check(NENCLOSE(node)->target);
2040 break;
2041
2042 case NT_BREF:
2043 if (! IS_BACKREF_NAME_REF(NBREF(node)))
2044 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2045 break;
2046
2047 case NT_ANCHOR:
2048 if (NANCHOR(node)->target)
2049 r = numbered_ref_check(NANCHOR(node)->target);
2050 break;
2051
2052 default:
2053 break;
2054 }
2055
2056 return r;
2057}
2058
2059static int
2060disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
2061{
2062 int r, i, pos, counter;
2063 BitStatusType loc;
2064 GroupNumRemap* map;
2065
2066 map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
2067 CHECK_NULL_RETURN_MEMERR(map);
2068 for (i = 1; i <= env->num_mem; i++) {
2069 map[i].new_val = 0;
2070 }
2071 counter = 0;
2072 r = noname_disable_map(root, map, &counter);
2073 if (r != 0) return r;
2074
2075 r = renumber_by_map(*root, map, env->num_mem);
2076 if (r != 0) return r;
2077
2078 for (i = 1, pos = 1; i <= env->num_mem; i++) {
2079 if (map[i].new_val > 0) {
2080 SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
2081 pos++;
2082 }
2083 }
2084
2085 loc = env->capture_history;
2086 BIT_STATUS_CLEAR(env->capture_history);
2087 for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
2088 if (BIT_STATUS_AT(loc, i)) {
2089 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
2090 }
2091 }
2092
2093 env->num_mem = env->num_named;
2094 reg->num_mem = env->num_named;
2095
2096 return onig_renumber_name_table(reg, map);
2097}
2098#endif /* USE_NAMED_GROUP */
2099
2100#ifdef USE_SUBEXP_CALL
2101static int
2102unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
2103{
2104 int i, offset;
2105 EncloseNode* en;
2106 AbsAddrType addr;
2107
2108 for (i = 0; i < uslist->num; i++) {
2109 en = NENCLOSE(uslist->us[i].target);
2110 if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
2111 addr = en->call_addr;
2112 offset = uslist->us[i].offset;
2113
2114 BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
2115 }
2116 return 0;
2117}
2118#endif
2119
2120#ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
2121static int
2122quantifiers_memory_node_info(Node* node)
2123{
2124 int r = 0;
2125
2126 switch (NTYPE(node)) {
2127 case NT_LIST:
2128 case NT_ALT:
2129 {
2130 int v;
2131 do {
2132 v = quantifiers_memory_node_info(NCAR(node));
2133 if (v > r) r = v;
2134 } while (v >= 0 && IS_NOT_NULL(node = NCDR(node)));
2135 }
2136 break;
2137
2138# ifdef USE_SUBEXP_CALL
2139 case NT_CALL:
2140 if (IS_CALL_RECURSION(NCALL(node))) {
2141 return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
2142 }
2143 else
2144 r = quantifiers_memory_node_info(NCALL(node)->target);
2145 break;
2146# endif
2147
2148 case NT_QTFR:
2149 {
2150 QtfrNode* qn = NQTFR(node);
2151 if (qn->upper != 0) {
2152 r = quantifiers_memory_node_info(qn->target);
2153 }
2154 }
2155 break;
2156
2157 case NT_ENCLOSE:
2158 {
2159 EncloseNode* en = NENCLOSE(node);
2160 switch (en->type) {
2161 case ENCLOSE_MEMORY:
2162 return NQ_TARGET_IS_EMPTY_MEM;
2163 break;
2164
2165 case ENCLOSE_OPTION:
2166 case ENCLOSE_STOP_BACKTRACK:
2167 case ENCLOSE_CONDITION:
2168 case ENCLOSE_ABSENT:
2169 r = quantifiers_memory_node_info(en->target);
2170 break;
2171 default:
2172 break;
2173 }
2174 }
2175 break;
2176
2177 case NT_BREF:
2178 case NT_STR:
2179 case NT_CTYPE:
2180 case NT_CCLASS:
2181 case NT_CANY:
2182 case NT_ANCHOR:
2183 default:
2184 break;
2185 }
2186
2187 return r;
2188}
2189#endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
2190
2191static int
2192get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
2193{
2194 OnigDistance tmin;
2195 int r = 0;
2196
2197 *min = 0;
2198 switch (NTYPE(node)) {
2199 case NT_BREF:
2200 {
2201 int i;
2202 int* backs;
2203 Node** nodes = SCANENV_MEM_NODES(env);
2204 BRefNode* br = NBREF(node);
2205 if (br->state & NST_RECURSION) break;
2206
2207 backs = BACKREFS_P(br);
2208 if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2209 r = get_min_match_length(nodes[backs[0]], min, env);
2210 if (r != 0) break;
2211 for (i = 1; i < br->back_num; i++) {
2212 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2213 r = get_min_match_length(nodes[backs[i]], &tmin, env);
2214 if (r != 0) break;
2215 if (*min > tmin) *min = tmin;
2216 }
2217 }
2218 break;
2219
2220#ifdef USE_SUBEXP_CALL
2221 case NT_CALL:
2222 if (IS_CALL_RECURSION(NCALL(node))) {
2223 EncloseNode* en = NENCLOSE(NCALL(node)->target);
2224 if (IS_ENCLOSE_MIN_FIXED(en))
2225 *min = en->min_len;
2226 }
2227 else
2228 r = get_min_match_length(NCALL(node)->target, min, env);
2229 break;
2230#endif
2231
2232 case NT_LIST:
2233 do {
2234 r = get_min_match_length(NCAR(node), &tmin, env);
2235 if (r == 0) *min += tmin;
2236 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2237 break;
2238
2239 case NT_ALT:
2240 {
2241 Node *x, *y;
2242 y = node;
2243 do {
2244 x = NCAR(y);
2245 r = get_min_match_length(x, &tmin, env);
2246 if (r != 0) break;
2247 if (y == node) *min = tmin;
2248 else if (*min > tmin) *min = tmin;
2249 } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
2250 }
2251 break;
2252
2253 case NT_STR:
2254 {
2255 StrNode* sn = NSTR(node);
2256 *min = sn->end - sn->s;
2257 }
2258 break;
2259
2260 case NT_CTYPE:
2261 *min = 1;
2262 break;
2263
2264 case NT_CCLASS:
2265 case NT_CANY:
2266 *min = 1;
2267 break;
2268
2269 case NT_QTFR:
2270 {
2271 QtfrNode* qn = NQTFR(node);
2272
2273 if (qn->lower > 0) {
2274 r = get_min_match_length(qn->target, min, env);
2275 if (r == 0)
2276 *min = distance_multiply(*min, qn->lower);
2277 }
2278 }
2279 break;
2280
2281 case NT_ENCLOSE:
2282 {
2283 EncloseNode* en = NENCLOSE(node);
2284 switch (en->type) {
2285 case ENCLOSE_MEMORY:
2286 if (IS_ENCLOSE_MIN_FIXED(en))
2287 *min = en->min_len;
2288 else {
2289 if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2290 *min = 0; /* recursive */
2291 else {
2292 SET_ENCLOSE_STATUS(node, NST_MARK1);
2293 r = get_min_match_length(en->target, min, env);
2294 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2295 if (r == 0) {
2296 en->min_len = *min;
2297 SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
2298 }
2299 }
2300 }
2301 break;
2302
2303 case ENCLOSE_OPTION:
2304 case ENCLOSE_STOP_BACKTRACK:
2305 case ENCLOSE_CONDITION:
2306 r = get_min_match_length(en->target, min, env);
2307 break;
2308
2309 case ENCLOSE_ABSENT:
2310 break;
2311 }
2312 }
2313 break;
2314
2315 case NT_ANCHOR:
2316 default:
2317 break;
2318 }
2319
2320 return r;
2321}
2322
2323static int
2324get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
2325{
2326 OnigDistance tmax;
2327 int r = 0;
2328
2329 *max = 0;
2330 switch (NTYPE(node)) {
2331 case NT_LIST:
2332 do {
2333 r = get_max_match_length(NCAR(node), &tmax, env);
2334 if (r == 0)
2335 *max = distance_add(*max, tmax);
2336 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2337 break;
2338
2339 case NT_ALT:
2340 do {
2341 r = get_max_match_length(NCAR(node), &tmax, env);
2342 if (r == 0 && *max < tmax) *max = tmax;
2343 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2344 break;
2345
2346 case NT_STR:
2347 {
2348 StrNode* sn = NSTR(node);
2349 *max = sn->end - sn->s;
2350 }
2351 break;
2352
2353 case NT_CTYPE:
2354 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2355 break;
2356
2357 case NT_CCLASS:
2358 case NT_CANY:
2359 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2360 break;
2361
2362 case NT_BREF:
2363 {
2364 int i;
2365 int* backs;
2366 Node** nodes = SCANENV_MEM_NODES(env);
2367 BRefNode* br = NBREF(node);
2368 if (br->state & NST_RECURSION) {
2369 *max = ONIG_INFINITE_DISTANCE;
2370 break;
2371 }
2372 backs = BACKREFS_P(br);
2373 for (i = 0; i < br->back_num; i++) {
2374 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2375 r = get_max_match_length(nodes[backs[i]], &tmax, env);
2376 if (r != 0) break;
2377 if (*max < tmax) *max = tmax;
2378 }
2379 }
2380 break;
2381
2382#ifdef USE_SUBEXP_CALL
2383 case NT_CALL:
2384 if (! IS_CALL_RECURSION(NCALL(node)))
2385 r = get_max_match_length(NCALL(node)->target, max, env);
2386 else
2387 *max = ONIG_INFINITE_DISTANCE;
2388 break;
2389#endif
2390
2391 case NT_QTFR:
2392 {
2393 QtfrNode* qn = NQTFR(node);
2394
2395 if (qn->upper != 0) {
2396 r = get_max_match_length(qn->target, max, env);
2397 if (r == 0 && *max != 0) {
2398 if (! IS_REPEAT_INFINITE(qn->upper))
2399 *max = distance_multiply(*max, qn->upper);
2400 else
2401 *max = ONIG_INFINITE_DISTANCE;
2402 }
2403 }
2404 }
2405 break;
2406
2407 case NT_ENCLOSE:
2408 {
2409 EncloseNode* en = NENCLOSE(node);
2410 switch (en->type) {
2411 case ENCLOSE_MEMORY:
2412 if (IS_ENCLOSE_MAX_FIXED(en))
2413 *max = en->max_len;
2414 else {
2415 if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2416 *max = ONIG_INFINITE_DISTANCE;
2417 else {
2418 SET_ENCLOSE_STATUS(node, NST_MARK1);
2419 r = get_max_match_length(en->target, max, env);
2420 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2421 if (r == 0) {
2422 en->max_len = *max;
2423 SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
2424 }
2425 }
2426 }
2427 break;
2428
2429 case ENCLOSE_OPTION:
2430 case ENCLOSE_STOP_BACKTRACK:
2431 case ENCLOSE_CONDITION:
2432 r = get_max_match_length(en->target, max, env);
2433 break;
2434
2435 case ENCLOSE_ABSENT:
2436 break;
2437 }
2438 }
2439 break;
2440
2441 case NT_ANCHOR:
2442 default:
2443 break;
2444 }
2445
2446 return r;
2447}
2448
2449#define GET_CHAR_LEN_VARLEN -1
2450#define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2451
2452/* fixed size pattern node only */
2453static int
2454get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
2455{
2456 int tlen;
2457 int r = 0;
2458
2459 level++;
2460 *len = 0;
2461 switch (NTYPE(node)) {
2462 case NT_LIST:
2463 do {
2464 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2465 if (r == 0)
2466 *len = (int )distance_add(*len, tlen);
2467 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2468 break;
2469
2470 case NT_ALT:
2471 {
2472 int tlen2;
2473 int varlen = 0;
2474
2475 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2476 while (r == 0 && IS_NOT_NULL(node = NCDR(node))) {
2477 r = get_char_length_tree1(NCAR(node), reg, &tlen2, level);
2478 if (r == 0) {
2479 if (tlen != tlen2)
2480 varlen = 1;
2481 }
2482 }
2483 if (r == 0) {
2484 if (varlen != 0) {
2485 if (level == 1)
2486 r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2487 else
2488 r = GET_CHAR_LEN_VARLEN;
2489 }
2490 else
2491 *len = tlen;
2492 }
2493 }
2494 break;
2495
2496 case NT_STR:
2497 {
2498 StrNode* sn = NSTR(node);
2499 UChar *s = sn->s;
2500 while (s < sn->end) {
2501 s += enclen(reg->enc, s, sn->end);
2502 (*len)++;
2503 }
2504 }
2505 break;
2506
2507 case NT_QTFR:
2508 {
2509 QtfrNode* qn = NQTFR(node);
2510 if (qn->lower == qn->upper) {
2511 r = get_char_length_tree1(qn->target, reg, &tlen, level);
2512 if (r == 0)
2513 *len = (int )distance_multiply(tlen, qn->lower);
2514 }
2515 else
2516 r = GET_CHAR_LEN_VARLEN;
2517 }
2518 break;
2519
2520#ifdef USE_SUBEXP_CALL
2521 case NT_CALL:
2522 if (! IS_CALL_RECURSION(NCALL(node)))
2523 r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
2524 else
2525 r = GET_CHAR_LEN_VARLEN;
2526 break;
2527#endif
2528
2529 case NT_CTYPE:
2530 *len = 1;
2531 break;
2532
2533 case NT_CCLASS:
2534 case NT_CANY:
2535 *len = 1;
2536 break;
2537
2538 case NT_ENCLOSE:
2539 {
2540 EncloseNode* en = NENCLOSE(node);
2541 switch (en->type) {
2542 case ENCLOSE_MEMORY:
2543#ifdef USE_SUBEXP_CALL
2544 if (IS_ENCLOSE_CLEN_FIXED(en))
2545 *len = en->char_len;
2546 else {
2547 r = get_char_length_tree1(en->target, reg, len, level);
2548 if (r == 0) {
2549 en->char_len = *len;
2550 SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
2551 }
2552 }
2553 break;
2554#endif
2555 case ENCLOSE_OPTION:
2556 case ENCLOSE_STOP_BACKTRACK:
2557 case ENCLOSE_CONDITION:
2558 r = get_char_length_tree1(en->target, reg, len, level);
2559 break;
2560 case ENCLOSE_ABSENT:
2561 default:
2562 break;
2563 }
2564 }
2565 break;
2566
2567 case NT_ANCHOR:
2568 break;
2569
2570 default:
2571 r = GET_CHAR_LEN_VARLEN;
2572 break;
2573 }
2574
2575 return r;
2576}
2577
2578static int
2579get_char_length_tree(Node* node, regex_t* reg, int* len)
2580{
2581 return get_char_length_tree1(node, reg, len, 0);
2582}
2583
2584/* x is not included y ==> 1 : 0 */
2585static int
2586is_not_included(Node* x, Node* y, regex_t* reg)
2587{
2588 int i;
2589 OnigDistance len;
2590 OnigCodePoint code;
2591 UChar *p;
2592 int ytype;
2593
2594 retry:
2595 ytype = NTYPE(y);
2596 switch (NTYPE(x)) {
2597 case NT_CTYPE:
2598 {
2599 switch (ytype) {
2600 case NT_CTYPE:
2601 if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
2602 NCTYPE(y)->not != NCTYPE(x)->not &&
2603 NCTYPE(y)->ascii_range == NCTYPE(x)->ascii_range)
2604 return 1;
2605 else
2606 return 0;
2607 break;
2608
2609 case NT_CCLASS:
2610 swap:
2611 {
2612 Node* tmp;
2613 tmp = x; x = y; y = tmp;
2614 goto retry;
2615 }
2616 break;
2617
2618 case NT_STR:
2619 goto swap;
2620 break;
2621
2622 default:
2623 break;
2624 }
2625 }
2626 break;
2627
2628 case NT_CCLASS:
2629 {
2630 CClassNode* xc = NCCLASS(x);
2631 switch (ytype) {
2632 case NT_CTYPE:
2633 switch (NCTYPE(y)->ctype) {
2634 case ONIGENC_CTYPE_WORD:
2635 if (NCTYPE(y)->not == 0) {
2636 if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2637 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2638 if (BITSET_AT(xc->bs, i)) {
2639 if (NCTYPE(y)->ascii_range) {
2640 if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
2641 }
2642 else {
2643 if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0;
2644 }
2645 }
2646 }
2647 return 1;
2648 }
2649 return 0;
2650 }
2651 else {
2652 if (IS_NOT_NULL(xc->mbuf)) return 0;
2653 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2654 int is_word;
2655 if (NCTYPE(y)->ascii_range)
2656 is_word = IS_CODE_SB_WORD(reg->enc, i);
2657 else
2658 is_word = ONIGENC_IS_CODE_WORD(reg->enc, i);
2659 if (! is_word) {
2660 if (!IS_NCCLASS_NOT(xc)) {
2661 if (BITSET_AT(xc->bs, i))
2662 return 0;
2663 }
2664 else {
2665 if (! BITSET_AT(xc->bs, i))
2666 return 0;
2667 }
2668 }
2669 }
2670 return 1;
2671 }
2672 break;
2673
2674 default:
2675 break;
2676 }
2677 break;
2678
2679 case NT_CCLASS:
2680 {
2681 int v;
2682 CClassNode* yc = NCCLASS(y);
2683
2684 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2685 v = BITSET_AT(xc->bs, i);
2686 if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
2687 (v == 0 && IS_NCCLASS_NOT(xc))) {
2688 v = BITSET_AT(yc->bs, i);
2689 if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2690 (v == 0 && IS_NCCLASS_NOT(yc)))
2691 return 0;
2692 }
2693 }
2694 if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2695 (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2696 return 1;
2697 return 0;
2698 }
2699 break;
2700
2701 case NT_STR:
2702 goto swap;
2703 break;
2704
2705 default:
2706 break;
2707 }
2708 }
2709 break;
2710
2711 case NT_STR:
2712 {
2713 StrNode* xs = NSTR(x);
2714 if (NSTRING_LEN(x) == 0)
2715 break;
2716
2717 switch (ytype) {
2718 case NT_CTYPE:
2719 switch (NCTYPE(y)->ctype) {
2720 case ONIGENC_CTYPE_WORD:
2721 if (NCTYPE(y)->ascii_range) {
2722 if (ONIGENC_IS_MBC_ASCII_WORD(reg->enc, xs->s, xs->end))
2723 return NCTYPE(y)->not;
2724 else
2725 return !(NCTYPE(y)->not);
2726 }
2727 else {
2728 if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2729 return NCTYPE(y)->not;
2730 else
2731 return !(NCTYPE(y)->not);
2732 }
2733 break;
2734 default:
2735 break;
2736 }
2737 break;
2738
2739 case NT_CCLASS:
2740 {
2741 CClassNode* cc = NCCLASS(y);
2742
2743 code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2744 xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2745 return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
2746 }
2747 break;
2748
2749 case NT_STR:
2750 {
2751 UChar *q;
2752 StrNode* ys = NSTR(y);
2753 len = NSTRING_LEN(x);
2754 if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
2755 if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
2756 /* tiny version */
2757 return 0;
2758 }
2759 else {
2760 for (i = 0, p = ys->s, q = xs->s; (OnigDistance )i < len; i++, p++, q++) {
2761 if (*p != *q) return 1;
2762 }
2763 }
2764 }
2765 break;
2766
2767 default:
2768 break;
2769 }
2770 }
2771 break;
2772
2773 default:
2774 break;
2775 }
2776
2777 return 0;
2778}
2779
2780static Node*
2781get_head_value_node(Node* node, int exact, regex_t* reg)
2782{
2783 Node* n = NULL_NODE;
2784
2785 switch (NTYPE(node)) {
2786 case NT_BREF:
2787 case NT_ALT:
2788 case NT_CANY:
2789#ifdef USE_SUBEXP_CALL
2790 case NT_CALL:
2791#endif
2792 break;
2793
2794 case NT_CTYPE:
2795 case NT_CCLASS:
2796 if (exact == 0) {
2797 n = node;
2798 }
2799 break;
2800
2801 case NT_LIST:
2802 n = get_head_value_node(NCAR(node), exact, reg);
2803 break;
2804
2805 case NT_STR:
2806 {
2807 StrNode* sn = NSTR(node);
2808 if (sn->end <= sn->s)
2809 break;
2810
2811 if (exact == 0 ||
2812 NSTRING_IS_RAW(node) || !IS_IGNORECASE(reg->options)) {
2813 n = node;
2814 }
2815 }
2816 break;
2817
2818 case NT_QTFR:
2819 {
2820 QtfrNode* qn = NQTFR(node);
2821 if (qn->lower > 0) {
2822#ifdef USE_OP_PUSH_OR_JUMP_EXACT
2823 if (IS_NOT_NULL(qn->head_exact))
2824 n = qn->head_exact;
2825 else
2826#endif
2827 n = get_head_value_node(qn->target, exact, reg);
2828 }
2829 }
2830 break;
2831
2832 case NT_ENCLOSE:
2833 {
2834 EncloseNode* en = NENCLOSE(node);
2835 switch (en->type) {
2836 case ENCLOSE_OPTION:
2837 {
2838 OnigOptionType options = reg->options;
2839
2840 reg->options = NENCLOSE(node)->option;
2841 n = get_head_value_node(NENCLOSE(node)->target, exact, reg);
2842 reg->options = options;
2843 }
2844 break;
2845
2846 case ENCLOSE_MEMORY:
2847 case ENCLOSE_STOP_BACKTRACK:
2848 case ENCLOSE_CONDITION:
2849 n = get_head_value_node(en->target, exact, reg);
2850 break;
2851
2852 case ENCLOSE_ABSENT:
2853 break;
2854 }
2855 }
2856 break;
2857
2858 case NT_ANCHOR:
2859 if (NANCHOR(node)->type == ANCHOR_PREC_READ)
2860 n = get_head_value_node(NANCHOR(node)->target, exact, reg);
2861 break;
2862
2863 default:
2864 break;
2865 }
2866
2867 return n;
2868}
2869
2870static int
2871check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
2872{
2873 int type, r = 0;
2874
2875 type = NTYPE(node);
2876 if ((NTYPE2BIT(type) & type_mask) == 0)
2877 return 1;
2878
2879 switch (type) {
2880 case NT_LIST:
2881 case NT_ALT:
2882 do {
2883 r = check_type_tree(NCAR(node), type_mask, enclose_mask,
2884 anchor_mask);
2885 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2886 break;
2887
2888 case NT_QTFR:
2889 r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
2890 anchor_mask);
2891 break;
2892
2893 case NT_ENCLOSE:
2894 {
2895 EncloseNode* en = NENCLOSE(node);
2896 if ((en->type & enclose_mask) == 0)
2897 return 1;
2898
2899 r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
2900 }
2901 break;
2902
2903 case NT_ANCHOR:
2904 type = NANCHOR(node)->type;
2905 if ((type & anchor_mask) == 0)
2906 return 1;
2907
2908 if (NANCHOR(node)->target)
2909 r = check_type_tree(NANCHOR(node)->target,
2910 type_mask, enclose_mask, anchor_mask);
2911 break;
2912
2913 default:
2914 break;
2915 }
2916 return r;
2917}
2918
2919#ifdef USE_SUBEXP_CALL
2920
2921# define RECURSION_EXIST 1
2922# define RECURSION_INFINITE 2
2923
2924static int
2925subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
2926{
2927 int type;
2928 int r = 0;
2929
2930 type = NTYPE(node);
2931 switch (type) {
2932 case NT_LIST:
2933 {
2934 Node *x;
2935 OnigDistance min;
2936 int ret;
2937
2938 x = node;
2939 do {
2940 ret = subexp_inf_recursive_check(NCAR(x), env, head);
2941 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2942 r |= ret;
2943 if (head) {
2944 ret = get_min_match_length(NCAR(x), &min, env);
2945 if (ret != 0) return ret;
2946 if (min != 0) head = 0;
2947 }
2948 } while (IS_NOT_NULL(x = NCDR(x)));
2949 }
2950 break;
2951
2952 case NT_ALT:
2953 {
2954 int ret;
2955 r = RECURSION_EXIST;
2956 do {
2957 ret = subexp_inf_recursive_check(NCAR(node), env, head);
2958 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2959 r &= ret;
2960 } while (IS_NOT_NULL(node = NCDR(node)));
2961 }
2962 break;
2963
2964 case NT_QTFR:
2965 r = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
2966 if (r == RECURSION_EXIST) {
2967 if (NQTFR(node)->lower == 0) r = 0;
2968 }
2969 break;
2970
2971 case NT_ANCHOR:
2972 {
2973 AnchorNode* an = NANCHOR(node);
2974 switch (an->type) {
2975 case ANCHOR_PREC_READ:
2976 case ANCHOR_PREC_READ_NOT:
2977 case ANCHOR_LOOK_BEHIND:
2978 case ANCHOR_LOOK_BEHIND_NOT:
2979 r = subexp_inf_recursive_check(an->target, env, head);
2980 break;
2981 }
2982 }
2983 break;
2984
2985 case NT_CALL:
2986 r = subexp_inf_recursive_check(NCALL(node)->target, env, head);
2987 break;
2988
2989 case NT_ENCLOSE:
2990 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2991 return 0;
2992 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2993 return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
2994 else {
2995 SET_ENCLOSE_STATUS(node, NST_MARK2);
2996 r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
2997 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2998 }
2999 break;
3000
3001 default:
3002 break;
3003 }
3004
3005 return r;
3006}
3007
3008static int
3009subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
3010{
3011 int type;
3012 int r = 0;
3013
3014 type = NTYPE(node);
3015 switch (type) {
3016 case NT_LIST:
3017 case NT_ALT:
3018 do {
3019 r = subexp_inf_recursive_check_trav(NCAR(node), env);
3020 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3021 break;
3022
3023 case NT_QTFR:
3024 r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
3025 break;
3026
3027 case NT_ANCHOR:
3028 {
3029 AnchorNode* an = NANCHOR(node);
3030 switch (an->type) {
3031 case ANCHOR_PREC_READ:
3032 case ANCHOR_PREC_READ_NOT:
3033 case ANCHOR_LOOK_BEHIND:
3034 case ANCHOR_LOOK_BEHIND_NOT:
3035 r = subexp_inf_recursive_check_trav(an->target, env);
3036 break;
3037 }
3038 }
3039 break;
3040
3041 case NT_ENCLOSE:
3042 {
3043 EncloseNode* en = NENCLOSE(node);
3044
3045 if (IS_ENCLOSE_RECURSION(en)) {
3046 SET_ENCLOSE_STATUS(node, NST_MARK1);
3047 r = subexp_inf_recursive_check(en->target, env, 1);
3048 if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
3049 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
3050 }
3051 r = subexp_inf_recursive_check_trav(en->target, env);
3052 }
3053
3054 break;
3055
3056 default:
3057 break;
3058 }
3059
3060 return r;
3061}
3062
3063static int
3064subexp_recursive_check(Node* node)
3065{
3066 int r = 0;
3067
3068 switch (NTYPE(node)) {
3069 case NT_LIST:
3070 case NT_ALT:
3071 do {
3072 r |= subexp_recursive_check(NCAR(node));
3073 } while (IS_NOT_NULL(node = NCDR(node)));
3074 break;
3075
3076 case NT_QTFR:
3077 r = subexp_recursive_check(NQTFR(node)->target);
3078 break;
3079
3080 case NT_ANCHOR:
3081 {
3082 AnchorNode* an = NANCHOR(node);
3083 switch (an->type) {
3084 case ANCHOR_PREC_READ:
3085 case ANCHOR_PREC_READ_NOT:
3086 case ANCHOR_LOOK_BEHIND:
3087 case ANCHOR_LOOK_BEHIND_NOT:
3088 r = subexp_recursive_check(an->target);
3089 break;
3090 }
3091 }
3092 break;
3093
3094 case NT_CALL:
3095 r = subexp_recursive_check(NCALL(node)->target);
3096 if (r != 0) SET_CALL_RECURSION(node);
3097 break;
3098
3099 case NT_ENCLOSE:
3100 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
3101 return 0;
3102 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
3103 return 1; /* recursion */
3104 else {
3105 SET_ENCLOSE_STATUS(node, NST_MARK2);
3106 r = subexp_recursive_check(NENCLOSE(node)->target);
3107 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
3108 }
3109 break;
3110
3111 default:
3112 break;
3113 }
3114
3115 return r;
3116}
3117
3118
3119static int
3120subexp_recursive_check_trav(Node* node, ScanEnv* env)
3121{
3122# define FOUND_CALLED_NODE 1
3123
3124 int type;
3125 int r = 0;
3126
3127 type = NTYPE(node);
3128 switch (type) {
3129 case NT_LIST:
3130 case NT_ALT:
3131 {
3132 int ret;
3133 do {
3134 ret = subexp_recursive_check_trav(NCAR(node), env);
3135 if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
3136 else if (ret < 0) return ret;
3137 } while (IS_NOT_NULL(node = NCDR(node)));
3138 }
3139 break;
3140
3141 case NT_QTFR:
3142 r = subexp_recursive_check_trav(NQTFR(node)->target, env);
3143 if (NQTFR(node)->upper == 0) {
3144 if (r == FOUND_CALLED_NODE)
3145 NQTFR(node)->is_referred = 1;
3146 }
3147 break;
3148
3149 case NT_ANCHOR:
3150 {
3151 AnchorNode* an = NANCHOR(node);
3152 switch (an->type) {
3153 case ANCHOR_PREC_READ:
3154 case ANCHOR_PREC_READ_NOT:
3155 case ANCHOR_LOOK_BEHIND:
3156 case ANCHOR_LOOK_BEHIND_NOT:
3157 r = subexp_recursive_check_trav(an->target, env);
3158 break;
3159 }
3160 }
3161 break;
3162
3163 case NT_ENCLOSE:
3164 {
3165 EncloseNode* en = NENCLOSE(node);
3166
3167 if (! IS_ENCLOSE_RECURSION(en)) {
3168 if (IS_ENCLOSE_CALLED(en)) {
3169 SET_ENCLOSE_STATUS(node, NST_MARK1);
3170 r = subexp_recursive_check(en->target);
3171 if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION);
3172 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
3173 }
3174 }
3175 r = subexp_recursive_check_trav(en->target, env);
3176 if (IS_ENCLOSE_CALLED(en))
3177 r |= FOUND_CALLED_NODE;
3178 }
3179 break;
3180
3181 default:
3182 break;
3183 }
3184
3185 return r;
3186}
3187
3188static int
3189setup_subexp_call(Node* node, ScanEnv* env)
3190{
3191 int type;
3192 int r = 0;
3193
3194 type = NTYPE(node);
3195 switch (type) {
3196 case NT_LIST:
3197 do {
3198 r = setup_subexp_call(NCAR(node), env);
3199 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3200 break;
3201
3202 case NT_ALT:
3203 do {
3204 r = setup_subexp_call(NCAR(node), env);
3205 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3206 break;
3207
3208 case NT_QTFR:
3209 r = setup_subexp_call(NQTFR(node)->target, env);
3210 break;
3211 case NT_ENCLOSE:
3212 r = setup_subexp_call(NENCLOSE(node)->target, env);
3213 break;
3214
3215 case NT_CALL:
3216 {
3217 CallNode* cn = NCALL(node);
3218 Node** nodes = SCANENV_MEM_NODES(env);
3219
3220 if (cn->group_num != 0) {
3221 int gnum = cn->group_num;
3222
3223# ifdef USE_NAMED_GROUP
3224 if (env->num_named > 0 &&
3225 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
3226 !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
3227 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
3228 }
3229# endif
3230 if (gnum > env->num_mem) {
3231 onig_scan_env_set_error_string(env,
3232 ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
3233 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
3234 }
3235
3236# ifdef USE_NAMED_GROUP
3237 set_call_attr:
3238# endif
3239 cn->target = nodes[cn->group_num];
3240 if (IS_NULL(cn->target)) {
3241 onig_scan_env_set_error_string(env,
3242 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3243 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3244 }
3245 SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
3246 BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
3247 cn->unset_addr_list = env->unset_addr_list;
3248 }
3249# ifdef USE_NAMED_GROUP
3250# ifdef USE_PERL_SUBEXP_CALL
3251 else if (cn->name == cn->name_end) {
3252 goto set_call_attr;
3253 }
3254# endif
3255 else {
3256 int *refs;
3257
3258 int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
3259 &refs);
3260 if (n <= 0) {
3261 onig_scan_env_set_error_string(env,
3262 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3263 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3264 }
3265 else if (n > 1 &&
3266 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME_CALL)) {
3267 onig_scan_env_set_error_string(env,
3268 ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
3269 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
3270 }
3271 else {
3272 cn->group_num = refs[0];
3273 goto set_call_attr;
3274 }
3275 }
3276# endif
3277 }
3278 break;
3279
3280 case NT_ANCHOR:
3281 {
3282 AnchorNode* an = NANCHOR(node);
3283
3284 switch (an->type) {
3285 case ANCHOR_PREC_READ:
3286 case ANCHOR_PREC_READ_NOT:
3287 case ANCHOR_LOOK_BEHIND:
3288 case ANCHOR_LOOK_BEHIND_NOT:
3289 r = setup_subexp_call(an->target, env);
3290 break;
3291 }
3292 }
3293 break;
3294
3295 default:
3296 break;
3297 }
3298
3299 return r;
3300}
3301#endif
3302
3303#define IN_ALT (1<<0)
3304#define IN_NOT (1<<1)
3305#define IN_REPEAT (1<<2)
3306#define IN_VAR_REPEAT (1<<3)
3307#define IN_CALL (1<<4)
3308#define IN_RECCALL (1<<5)
3309#define IN_LOOK_BEHIND (1<<6)
3310
3311/* divide different length alternatives in look-behind.
3312 (?<=A|B) ==> (?<=A)|(?<=B)
3313 (?<!A|B) ==> (?<!A)(?<!B)
3314*/
3315static int
3316divide_look_behind_alternatives(Node* node)
3317{
3318 Node *head, *np, *insert_node;
3319 AnchorNode* an = NANCHOR(node);
3320 int anc_type = an->type;
3321
3322 head = an->target;
3323 np = NCAR(head);
3324 swap_node(node, head);
3325 NCAR(node) = head;
3326 NANCHOR(head)->target = np;
3327
3328 np = node;
3329 while ((np = NCDR(np)) != NULL_NODE) {
3330 insert_node = onig_node_new_anchor(anc_type);
3331 CHECK_NULL_RETURN_MEMERR(insert_node);
3332 NANCHOR(insert_node)->target = NCAR(np);
3333 NCAR(np) = insert_node;
3334 }
3335
3336 if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
3337 np = node;
3338 do {
3339 SET_NTYPE(np, NT_LIST); /* alt -> list */
3340 } while ((np = NCDR(np)) != NULL_NODE);
3341 }
3342 return 0;
3343}
3344
3345static int
3346setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3347{
3348 int r, len;
3349 AnchorNode* an = NANCHOR(node);
3350
3351 r = get_char_length_tree(an->target, reg, &len);
3352 if (r == 0)
3353 an->char_len = len;
3354 else if (r == GET_CHAR_LEN_VARLEN)
3355 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3356 else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3357 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3358 r = divide_look_behind_alternatives(node);
3359 else
3360 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3361 }
3362
3363 return r;
3364}
3365
3366static int
3367next_setup(Node* node, Node* next_node, regex_t* reg)
3368{
3369 int type;
3370
3371 retry:
3372 type = NTYPE(node);
3373 if (type == NT_QTFR) {
3374 QtfrNode* qn = NQTFR(node);
3375 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
3376#ifdef USE_QTFR_PEEK_NEXT
3377 Node* n = get_head_value_node(next_node, 1, reg);
3378 /* '\0': for UTF-16BE etc... */
3379 if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
3380 qn->next_head_exact = n;
3381 }
3382#endif
3383 /* automatic possessification a*b ==> (?>a*)b */
3384 if (qn->lower <= 1) {
3385 int ttype = NTYPE(qn->target);
3386 if (IS_NODE_TYPE_SIMPLE(ttype)) {
3387 Node *x, *y;
3388 x = get_head_value_node(qn->target, 0, reg);
3389 if (IS_NOT_NULL(x)) {
3390 y = get_head_value_node(next_node, 0, reg);
3391 if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
3392 Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
3393 CHECK_NULL_RETURN_MEMERR(en);
3394 SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
3395 swap_node(node, en);
3396 NENCLOSE(node)->target = en;
3397 }
3398 }
3399 }
3400 }
3401 }
3402 }
3403 else if (type == NT_ENCLOSE) {
3404 EncloseNode* en = NENCLOSE(node);
3405 if (en->type == ENCLOSE_MEMORY && !IS_ENCLOSE_CALLED(en)) {
3406 node = en->target;
3407 goto retry;
3408 }
3409 }
3410 return 0;
3411}
3412
3413
3414static int
3415update_string_node_case_fold(regex_t* reg, Node *node)
3416{
3417 UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3418 UChar *sbuf, *ebuf, *sp;
3419 int r, i, len;
3420 OnigDistance sbuf_size;
3421 StrNode* sn = NSTR(node);
3422
3423 end = sn->end;
3424 sbuf_size = (end - sn->s) * 2;
3425 sbuf = (UChar* )xmalloc(sbuf_size);
3426 CHECK_NULL_RETURN_MEMERR(sbuf);
3427 ebuf = sbuf + sbuf_size;
3428
3429 sp = sbuf;
3430 p = sn->s;
3431 while (p < end) {
3432 len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
3433 for (i = 0; i < len; i++) {
3434 if (sp >= ebuf) {
3435 UChar* p = (UChar* )xrealloc(sbuf, sbuf_size * 2);
3436 if (IS_NULL(p)) {
3437 xfree(sbuf);
3438 return ONIGERR_MEMORY;
3439 }
3440 sbuf = p;
3441 sp = sbuf + sbuf_size;
3442 sbuf_size *= 2;
3443 ebuf = sbuf + sbuf_size;
3444 }
3445
3446 *sp++ = buf[i];
3447 }
3448 }
3449
3450 r = onig_node_str_set(node, sbuf, sp);
3451
3452 xfree(sbuf);
3453 return r;
3454}
3455
3456static int
3457expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
3458 regex_t* reg)
3459{
3460 int r;
3461 Node *node;
3462
3463 node = onig_node_new_str(s, end);
3464 if (IS_NULL(node)) return ONIGERR_MEMORY;
3465
3466 r = update_string_node_case_fold(reg, node);
3467 if (r != 0) {
3468 onig_node_free(node);
3469 return r;
3470 }
3471
3472 NSTRING_SET_AMBIG(node);
3473 NSTRING_SET_DONT_GET_OPT_INFO(node);
3474 *rnode = node;
3475 return 0;
3476}
3477
3478static int
3479is_case_fold_variable_len(int item_num, OnigCaseFoldCodeItem items[],
3480 int slen)
3481{
3482 int i;
3483
3484 for (i = 0; i < item_num; i++) {
3485 if (items[i].byte_len != slen) {
3486 return 1;
3487 }
3488 if (items[i].code_len != 1) {
3489 return 1;
3490 }
3491 }
3492 return 0;
3493}
3494
3495static int
3496expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
3497 UChar *p, int slen, UChar *end,
3498 regex_t* reg, Node **rnode)
3499{
3500 int r, i, j, len, varlen;
3501 Node *anode, *var_anode, *snode, *xnode, *an;
3502 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
3503
3504 *rnode = var_anode = NULL_NODE;
3505
3506 varlen = 0;
3507 for (i = 0; i < item_num; i++) {
3508 if (items[i].byte_len != slen) {
3509 varlen = 1;
3510 break;
3511 }
3512 }
3513
3514 if (varlen != 0) {
3515 *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3516 if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
3517
3518 xnode = onig_node_new_list(NULL, NULL);
3519 if (IS_NULL(xnode)) goto mem_err;
3520 NCAR(var_anode) = xnode;
3521
3522 anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3523 if (IS_NULL(anode)) goto mem_err;
3524 NCAR(xnode) = anode;
3525 }
3526 else {
3527 *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3528 if (IS_NULL(anode)) return ONIGERR_MEMORY;
3529 }
3530
3531 snode = onig_node_new_str(p, p + slen);
3532 if (IS_NULL(snode)) goto mem_err;
3533
3534 NCAR(anode) = snode;
3535
3536 for (i = 0; i < item_num; i++) {
3537 snode = onig_node_new_str(NULL, NULL);
3538 if (IS_NULL(snode)) goto mem_err;
3539
3540 for (j = 0; j < items[i].code_len; j++) {
3541 len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
3542 if (len < 0) {
3543 r = len;
3544 goto mem_err2;
3545 }
3546
3547 r = onig_node_str_cat(snode, buf, buf + len);
3548 if (r != 0) goto mem_err2;
3549 }
3550
3551 an = onig_node_new_alt(NULL_NODE, NULL_NODE);
3552 if (IS_NULL(an)) {
3553 goto mem_err2;
3554 }
3555
3556 if (items[i].byte_len != slen) {
3557 Node *rem;
3558 UChar *q = p + items[i].byte_len;
3559
3560 if (q < end) {
3561 r = expand_case_fold_make_rem_string(&rem, q, end, reg);
3562 if (r != 0) {
3563 onig_node_free(an);
3564 goto mem_err2;
3565 }
3566
3567 xnode = onig_node_list_add(NULL_NODE, snode);
3568 if (IS_NULL(xnode)) {
3569 onig_node_free(an);
3570 onig_node_free(rem);
3571 goto mem_err2;
3572 }
3573 if (IS_NULL(onig_node_list_add(xnode, rem))) {
3574 onig_node_free(an);
3575 onig_node_free(xnode);
3576 onig_node_free(rem);
3577 goto mem_err;
3578 }
3579
3580 NCAR(an) = xnode;
3581 }
3582 else {
3583 NCAR(an) = snode;
3584 }
3585
3586 NCDR(var_anode) = an;
3587 var_anode = an;
3588 }
3589 else {
3590 NCAR(an) = snode;
3591 NCDR(anode) = an;
3592 anode = an;
3593 }
3594 }
3595
3596 return varlen;
3597
3598 mem_err2:
3599 onig_node_free(snode);
3600
3601 mem_err:
3602 onig_node_free(*rnode);
3603
3604 return ONIGERR_MEMORY;
3605}
3606
3607#define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
3608
3609static int
3610expand_case_fold_string(Node* node, regex_t* reg, int state)
3611{
3612 int r, n, len, alt_num;
3613 int varlen = 0;
3614 int is_in_look_behind;
3615 UChar *start, *end, *p;
3616 Node *top_root, *root, *snode, *prev_node;
3617 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
3618 StrNode* sn;
3619
3620 if (NSTRING_IS_AMBIG(node)) return 0;
3621
3622 sn = NSTR(node);
3623
3624 start = sn->s;
3625 end = sn->end;
3626 if (start >= end) return 0;
3627
3628 is_in_look_behind = (state & IN_LOOK_BEHIND) != 0;
3629
3630 r = 0;
3631 top_root = root = prev_node = snode = NULL_NODE;
3632 alt_num = 1;
3633 p = start;
3634 while (p < end) {
3635 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
3636 p, end, items);
3637 if (n < 0) {
3638 r = n;
3639 goto err;
3640 }
3641
3642 len = enclen(reg->enc, p, end);
3643
3644 varlen = is_case_fold_variable_len(n, items, len);
3645 if (n == 0 || varlen == 0 || is_in_look_behind) {
3646 if (IS_NULL(snode)) {
3647 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3648 onig_node_free(top_root);
3649 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3650 if (IS_NULL(root)) {
3651 onig_node_free(prev_node);
3652 goto mem_err;
3653 }
3654 }
3655
3656 prev_node = snode = onig_node_new_str(NULL, NULL);
3657 if (IS_NULL(snode)) goto mem_err;
3658 if (IS_NOT_NULL(root)) {
3659 if (IS_NULL(onig_node_list_add(root, snode))) {
3660 onig_node_free(snode);
3661 goto mem_err;
3662 }
3663 }
3664 }
3665
3666 r = onig_node_str_cat(snode, p, p + len);
3667 if (r != 0) goto err;
3668 }
3669 else {
3670 alt_num *= (n + 1);
3671 if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
3672
3673 if (IS_NOT_NULL(snode)) {
3674 r = update_string_node_case_fold(reg, snode);
3675 if (r == 0) {
3676 NSTRING_SET_AMBIG(snode);
3677 }
3678 }
3679 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3680 onig_node_free(top_root);
3681 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3682 if (IS_NULL(root)) {
3683 onig_node_free(prev_node);
3684 goto mem_err;
3685 }
3686 }
3687
3688 r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
3689 if (r < 0) goto mem_err;
3690 if (r == 1) {
3691 if (IS_NULL(root)) {
3692 top_root = prev_node;
3693 }
3694 else {
3695 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3696 onig_node_free(prev_node);
3697 goto mem_err;
3698 }
3699 }
3700
3701 root = NCAR(prev_node);
3702 }
3703 else { /* r == 0 */
3704 if (IS_NOT_NULL(root)) {
3705 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3706 onig_node_free(prev_node);
3707 goto mem_err;
3708 }
3709 }
3710 }
3711
3712 snode = NULL_NODE;
3713 }
3714
3715 p += len;
3716 }
3717 if (IS_NOT_NULL(snode)) {
3718 r = update_string_node_case_fold(reg, snode);
3719 if (r == 0) {
3720 NSTRING_SET_AMBIG(snode);
3721 }
3722 }
3723
3724 if (p < end) {
3725 Node *srem;
3726
3727 r = expand_case_fold_make_rem_string(&srem, p, end, reg);
3728 if (r != 0) goto mem_err;
3729
3730 if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
3731 onig_node_free(top_root);
3732 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3733 if (IS_NULL(root)) {
3734 onig_node_free(srem);
3735 onig_node_free(prev_node);
3736 goto mem_err;
3737 }
3738 }
3739
3740 if (IS_NULL(root)) {
3741 prev_node = srem;
3742 }
3743 else {
3744 if (IS_NULL(onig_node_list_add(root, srem))) {
3745 onig_node_free(srem);
3746 goto mem_err;
3747 }
3748 }
3749 }
3750
3751 /* ending */
3752 top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
3753 swap_node(node, top_root);
3754 onig_node_free(top_root);
3755 return 0;
3756
3757 mem_err:
3758 r = ONIGERR_MEMORY;
3759
3760 err:
3761 onig_node_free(top_root);
3762 return r;
3763}
3764
3765
3766#ifdef USE_COMBINATION_EXPLOSION_CHECK
3767
3768# define CEC_THRES_NUM_BIG_REPEAT 512
3769# define CEC_INFINITE_NUM 0x7fffffff
3770
3771# define CEC_IN_INFINITE_REPEAT (1<<0)
3772# define CEC_IN_FINITE_REPEAT (1<<1)
3773# define CEC_CONT_BIG_REPEAT (1<<2)
3774
3775static int
3776setup_comb_exp_check(Node* node, int state, ScanEnv* env)
3777{
3778 int type;
3779 int r = state;
3780
3781 type = NTYPE(node);
3782 switch (type) {
3783 case NT_LIST:
3784 {
3785 do {
3786 r = setup_comb_exp_check(NCAR(node), r, env);
3787 } while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
3788 }
3789 break;
3790
3791 case NT_ALT:
3792 {
3793 int ret;
3794 do {
3795 ret = setup_comb_exp_check(NCAR(node), state, env);
3796 r |= ret;
3797 } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node)));
3798 }
3799 break;
3800
3801 case NT_QTFR:
3802 {
3803 int child_state = state;
3804 int add_state = 0;
3805 QtfrNode* qn = NQTFR(node);
3806 Node* target = qn->target;
3807 int var_num;
3808
3809 if (! IS_REPEAT_INFINITE(qn->upper)) {
3810 if (qn->upper > 1) {
3811 /* {0,1}, {1,1} are allowed */
3812 child_state |= CEC_IN_FINITE_REPEAT;
3813
3814 /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
3815 if (env->backrefed_mem == 0) {
3816 if (NTYPE(qn->target) == NT_ENCLOSE) {
3817 EncloseNode* en = NENCLOSE(qn->target);
3818 if (en->type == ENCLOSE_MEMORY) {
3819 if (NTYPE(en->target) == NT_QTFR) {
3820 QtfrNode* q = NQTFR(en->target);
3821 if (IS_REPEAT_INFINITE(q->upper)
3822 && q->greedy == qn->greedy) {
3823 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
3824 if (qn->upper == 1)
3825 child_state = state;
3826 }
3827 }
3828 }
3829 }
3830 }
3831 }
3832 }
3833
3834 if (state & CEC_IN_FINITE_REPEAT) {
3835 qn->comb_exp_check_num = -1;
3836 }
3837 else {
3838 if (IS_REPEAT_INFINITE(qn->upper)) {
3839 var_num = CEC_INFINITE_NUM;
3840 child_state |= CEC_IN_INFINITE_REPEAT;
3841 }
3842 else {
3843 var_num = qn->upper - qn->lower;
3844 }
3845
3846 if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
3847 add_state |= CEC_CONT_BIG_REPEAT;
3848
3849 if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
3850 ((state & CEC_CONT_BIG_REPEAT) != 0 &&
3851 var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
3852 if (qn->comb_exp_check_num == 0) {
3853 env->num_comb_exp_check++;
3854 qn->comb_exp_check_num = env->num_comb_exp_check;
3855 if (env->curr_max_regnum > env->comb_exp_max_regnum)
3856 env->comb_exp_max_regnum = env->curr_max_regnum;
3857 }
3858 }
3859 }
3860
3861 r = setup_comb_exp_check(target, child_state, env);
3862 r |= add_state;
3863 }
3864 break;
3865
3866 case NT_ENCLOSE:
3867 {
3868 EncloseNode* en = NENCLOSE(node);
3869
3870 switch (en->type) {
3871 case ENCLOSE_MEMORY:
3872 {
3873 if (env->curr_max_regnum < en->regnum)
3874 env->curr_max_regnum = en->regnum;
3875
3876 r = setup_comb_exp_check(en->target, state, env);
3877 }
3878 break;
3879
3880 default:
3881 r = setup_comb_exp_check(en->target, state, env);
3882 break;
3883 }
3884 }
3885 break;
3886
3887# ifdef USE_SUBEXP_CALL
3888 case NT_CALL:
3889 if (IS_CALL_RECURSION(NCALL(node)))
3890 env->has_recursion = 1;
3891 else
3892 r = setup_comb_exp_check(NCALL(node)->target, state, env);
3893 break;
3894# endif
3895
3896 default:
3897 break;
3898 }
3899
3900 return r;
3901}
3902#endif
3903
3904/* setup_tree does the following work.
3905 1. check empty loop. (set qn->target_empty_info)
3906 2. expand ignore-case in char class.
3907 3. set memory status bit flags. (reg->mem_stats)
3908 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
3909 5. find invalid patterns in look-behind.
3910 6. expand repeated string.
3911 */
3912static int
3913setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
3914{
3915 int type;
3916 int r = 0;
3917
3918restart:
3919 type = NTYPE(node);
3920 switch (type) {
3921 case NT_LIST:
3922 {
3923 Node* prev = NULL_NODE;
3924 do {
3925 r = setup_tree(NCAR(node), reg, state, env);
3926 if (IS_NOT_NULL(prev) && r == 0) {
3927 r = next_setup(prev, NCAR(node), reg);
3928 }
3929 prev = NCAR(node);
3930 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3931 }
3932 break;
3933
3934 case NT_ALT:
3935 do {
3936 r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
3937 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3938 break;
3939
3940 case NT_CCLASS:
3941 break;
3942
3943 case NT_STR:
3944 if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3945 r = expand_case_fold_string(node, reg, state);
3946 }
3947 break;
3948
3949 case NT_CTYPE:
3950 case NT_CANY:
3951 break;
3952
3953#ifdef USE_SUBEXP_CALL
3954 case NT_CALL:
3955 break;
3956#endif
3957
3958 case NT_BREF:
3959 {
3960 int i;
3961 int* p;
3962 Node** nodes = SCANENV_MEM_NODES(env);
3963 BRefNode* br = NBREF(node);
3964 p = BACKREFS_P(br);
3965 for (i = 0; i < br->back_num; i++) {
3966 if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
3967 BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
3968 BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
3969#ifdef USE_BACKREF_WITH_LEVEL
3970 if (IS_BACKREF_NEST_LEVEL(br)) {
3971 BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
3972 }
3973#endif
3974 SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
3975 }
3976 }
3977 break;
3978
3979 case NT_QTFR:
3980 {
3981 OnigDistance d;
3982 QtfrNode* qn = NQTFR(node);
3983 Node* target = qn->target;
3984
3985 if ((state & IN_REPEAT) != 0) {
3986 qn->state |= NST_IN_REPEAT;
3987 }
3988
3989 if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
3990 r = get_min_match_length(target, &d, env);
3991 if (r) break;
3992 if (d == 0) {
3993 qn->target_empty_info = NQ_TARGET_IS_EMPTY;
3994#ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
3995 r = quantifiers_memory_node_info(target);
3996 if (r < 0) break;
3997 if (r > 0) {
3998 qn->target_empty_info = r;
3999 }
4000#endif
4001#if 0
4002 r = get_max_match_length(target, &d, env);
4003 if (r == 0 && d == 0) {
4004 /* ()* ==> ()?, ()+ ==> () */
4005 qn->upper = 1;
4006 if (qn->lower > 1) qn->lower = 1;
4007 if (NTYPE(target) == NT_STR) {
4008 qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */
4009 }
4010 }
4011#endif
4012 }
4013 }
4014
4015 state |= IN_REPEAT;
4016 if (qn->lower != qn->upper)
4017 state |= IN_VAR_REPEAT;
4018 r = setup_tree(target, reg, state, env);
4019 if (r) break;
4020
4021 /* expand string */
4022#define EXPAND_STRING_MAX_LENGTH 100
4023 if (NTYPE(target) == NT_STR) {
4024 if (qn->lower > 1) {
4025 int i, n = qn->lower;
4026 OnigDistance len = NSTRING_LEN(target);
4027 StrNode* sn = NSTR(target);
4028 Node* np;
4029
4030 np = onig_node_new_str(sn->s, sn->end);
4031 if (IS_NULL(np)) return ONIGERR_MEMORY;
4032 NSTR(np)->flag = sn->flag;
4033
4034 for (i = 1; i < n && (i+1) * len <= EXPAND_STRING_MAX_LENGTH; i++) {
4035 r = onig_node_str_cat(np, sn->s, sn->end);
4036 if (r) {
4037 onig_node_free(np);
4038 return r;
4039 }
4040 }
4041 if (i < qn->upper || IS_REPEAT_INFINITE(qn->upper)) {
4042 Node *np1, *np2;
4043
4044 qn->lower -= i;
4045 if (! IS_REPEAT_INFINITE(qn->upper))
4046 qn->upper -= i;
4047
4048 np1 = onig_node_new_list(np, NULL);
4049 if (IS_NULL(np1)) {
4050 onig_node_free(np);
4051 return ONIGERR_MEMORY;
4052 }
4053 swap_node(np1, node);
4054 np2 = onig_node_list_add(node, np1);
4055 if (IS_NULL(np2)) {
4056 onig_node_free(np1);
4057 return ONIGERR_MEMORY;
4058 }
4059 }
4060 else {
4061 swap_node(np, node);
4062 onig_node_free(np);
4063 }
4064 break; /* break case NT_QTFR: */
4065 }
4066 }
4067
4068#ifdef USE_OP_PUSH_OR_JUMP_EXACT
4069 if (qn->greedy && (qn->target_empty_info != 0)) {
4070 if (NTYPE(target) == NT_QTFR) {
4071 QtfrNode* tqn = NQTFR(target);
4072 if (IS_NOT_NULL(tqn->head_exact)) {
4073 qn->head_exact = tqn->head_exact;
4074 tqn->head_exact = NULL;
4075 }
4076 }
4077 else {
4078 qn->head_exact = get_head_value_node(qn->target, 1, reg);
4079 }
4080 }
4081#endif
4082 }
4083 break;
4084
4085 case NT_ENCLOSE:
4086 {
4087 EncloseNode* en = NENCLOSE(node);
4088
4089 switch (en->type) {
4090 case ENCLOSE_OPTION:
4091 {
4092 OnigOptionType options = reg->options;
4093 reg->options = NENCLOSE(node)->option;
4094 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
4095 reg->options = options;
4096 }
4097 break;
4098
4099 case ENCLOSE_MEMORY:
4100 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_CALL)) != 0) {
4101 BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
4102 /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
4103 }
4104 if (IS_ENCLOSE_CALLED(en))
4105 state |= IN_CALL;
4106 if (IS_ENCLOSE_RECURSION(en))
4107 state |= IN_RECCALL;
4108 else if ((state & IN_RECCALL) != 0)
4109 SET_CALL_RECURSION(node);
4110 r = setup_tree(en->target, reg, state, env);
4111 break;
4112
4113 case ENCLOSE_STOP_BACKTRACK:
4114 {
4115 Node* target = en->target;
4116 r = setup_tree(target, reg, state, env);
4117 if (NTYPE(target) == NT_QTFR) {
4118 QtfrNode* tqn = NQTFR(target);
4119 if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
4120 tqn->greedy != 0) { /* (?>a*), a*+ etc... */
4121 int qtype = NTYPE(tqn->target);
4122 if (IS_NODE_TYPE_SIMPLE(qtype))
4123 SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
4124 }
4125 }
4126 }
4127 break;
4128
4129 case ENCLOSE_CONDITION:
4130#ifdef USE_NAMED_GROUP
4131 if (! IS_ENCLOSE_NAME_REF(NENCLOSE(node)) &&
4132 env->num_named > 0 &&
4133 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
4134 !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
4135 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
4136 }
4137#endif
4138 if (NENCLOSE(node)->regnum > env->num_mem)
4139 return ONIGERR_INVALID_BACKREF;
4140 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
4141 break;
4142
4143 case ENCLOSE_ABSENT:
4144 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
4145 break;
4146 }
4147 }
4148 break;
4149
4150 case NT_ANCHOR:
4151 {
4152 AnchorNode* an = NANCHOR(node);
4153
4154 switch (an->type) {
4155 case ANCHOR_PREC_READ:
4156 r = setup_tree(an->target, reg, state, env);
4157 break;
4158 case ANCHOR_PREC_READ_NOT:
4159 r = setup_tree(an->target, reg, (state | IN_NOT), env);
4160 break;
4161
4162/* allowed node types in look-behind */
4163#define ALLOWED_TYPE_IN_LB \
4164 ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
4165 BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
4166
4167#define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY | ENCLOSE_OPTION )
4168#define ALLOWED_ENCLOSE_IN_LB_NOT ENCLOSE_OPTION
4169
4170#define ALLOWED_ANCHOR_IN_LB \
4171( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \
4172 ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \
4173 ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \
4174 ANCHOR_WORD_BEGIN | ANCHOR_WORD_END )
4175#define ALLOWED_ANCHOR_IN_LB_NOT \
4176( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \
4177 ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \
4178 ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \
4179 ANCHOR_WORD_BEGIN | ANCHOR_WORD_END )
4180
4181 case ANCHOR_LOOK_BEHIND:
4182 {
4183 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
4184 ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
4185 if (r < 0) return r;
4186 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4187 if (NTYPE(node) != NT_ANCHOR) goto restart;
4188 r = setup_tree(an->target, reg, (state | IN_LOOK_BEHIND), env);
4189 if (r != 0) return r;
4190 r = setup_look_behind(node, reg, env);
4191 }
4192 break;
4193
4194 case ANCHOR_LOOK_BEHIND_NOT:
4195 {
4196 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
4197 ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
4198 if (r < 0) return r;
4199 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4200 if (NTYPE(node) != NT_ANCHOR) goto restart;
4201 r = setup_tree(an->target, reg, (state | IN_NOT | IN_LOOK_BEHIND),
4202 env);
4203 if (r != 0) return r;
4204 r = setup_look_behind(node, reg, env);
4205 }
4206 break;
4207 }
4208 }
4209 break;
4210
4211 default:
4212 break;
4213 }
4214
4215 return r;
4216}
4217
4218/* set skip map for Sunday's quick search */
4219static int
4220set_bm_skip(UChar* s, UChar* end, regex_t* reg,
4221 UChar skip[], int ignore_case)
4222{
4223 OnigDistance i, len;
4224 int clen, flen, n, j, k;
4225 UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
4226 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4227 OnigEncoding enc = reg->enc;
4228
4229 len = end - s;
4230 if (len >= ONIG_CHAR_TABLE_SIZE) {
4231 /* This should not happen. */
4232 return ONIGERR_TYPE_BUG;
4233 }
4234
4235 if (ignore_case) {
4236 for (i = 0; i < len; i += clen) {
4237 p = s + i;
4238 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
4239 p, end, items);
4240 clen = enclen(enc, p, end);
4241 if (p + clen > end)
4242 clen = (int )(end - p);
4243
4244 for (j = 0; j < n; j++) {
4245 if ((items[j].code_len != 1) || (items[j].byte_len != clen)) {
4246 /* Different length isn't supported. Stop optimization at here. */
4247 end = p;
4248 goto endcheck;
4249 }
4250 flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf);
4251 if (flen != clen) {
4252 /* Different length isn't supported. Stop optimization at here. */
4253 end = p;
4254 goto endcheck;
4255 }
4256 }
4257 }
4258endcheck:
4259 len = end - s;
4260 }
4261
4262 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
4263 skip[i] = (UChar )(len + 1);
4264 n = 0;
4265 for (i = 0; i < len; i += clen) {
4266 p = s + i;
4267 if (ignore_case)
4268 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
4269 p, end, items);
4270 clen = enclen(enc, p, end);
4271 if (p + clen > end)
4272 clen = (int )(end - p);
4273
4274 for (j = 0; j < clen; j++) {
4275 skip[s[i + j]] = (UChar )(len - i - j);
4276 for (k = 0; k < n; k++) {
4277 ONIGENC_CODE_TO_MBC(enc, items[k].code[0], buf);
4278 skip[buf[j]] = (UChar )(len - i - j);
4279 }
4280 }
4281 }
4282
4283 return (int )len;
4284}
4285
4286typedef struct {
4287 OnigDistance min; /* min byte length */
4288 OnigDistance max; /* max byte length */
4289} MinMaxLen;
4290
4291typedef struct {
4292 MinMaxLen mmd;
4293 OnigEncoding enc;
4294 OnigOptionType options;
4295 OnigCaseFoldType case_fold_flag;
4296 ScanEnv* scan_env;
4297} OptEnv;
4298
4299typedef struct {
4300 int left_anchor;
4301 int right_anchor;
4302} OptAncInfo;
4303
4304typedef struct {
4305 MinMaxLen mmd; /* info position */
4306 OptAncInfo anc;
4307
4308 int reach_end;
4309 int ignore_case; /* -1: unset, 0: case sensitive, 1: ignore case */
4310 int len;
4311 UChar s[OPT_EXACT_MAXLEN];
4312} OptExactInfo;
4313
4314typedef struct {
4315 MinMaxLen mmd; /* info position */
4316 OptAncInfo anc;
4317
4318 int value; /* weighted value */
4319 UChar map[ONIG_CHAR_TABLE_SIZE];
4320} OptMapInfo;
4321
4322typedef struct {
4323 MinMaxLen len;
4324
4325 OptAncInfo anc;
4326 OptExactInfo exb; /* boundary */
4327 OptExactInfo exm; /* middle */
4328 OptExactInfo expr; /* prec read (?=...) */
4329
4330 OptMapInfo map; /* boundary */
4331} NodeOptInfo;
4332
4333
4334static int
4335map_position_value(OnigEncoding enc, int i)
4336{
4337 static const short int ByteValTable[] = {
4338 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
4339 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4340 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
4341 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
4342 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4343 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
4344 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4345 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
4346 };
4347
4348 if (i < numberof(ByteValTable)) {
4349 if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
4350 return 20;
4351 else
4352 return (int )ByteValTable[i];
4353 }
4354 else
4355 return 4; /* Take it easy. */
4356}
4357
4358static int
4359distance_value(MinMaxLen* mm)
4360{
4361 /* 1000 / (min-max-dist + 1) */
4362 static const short int dist_vals[] = {
4363 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
4364 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
4365 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
4366 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
4367 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
4368 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
4369 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
4370 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
4371 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
4372 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
4373 };
4374
4375 OnigDistance d;
4376
4377 if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
4378
4379 d = mm->max - mm->min;
4380 if (d < numberof(dist_vals))
4381 /* return dist_vals[d] * 16 / (mm->min + 12); */
4382 return (int )dist_vals[d];
4383 else
4384 return 1;
4385}
4386
4387static int
4388comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
4389{
4390 if (v2 <= 0) return -1;
4391 if (v1 <= 0) return 1;
4392
4393 v1 *= distance_value(d1);
4394 v2 *= distance_value(d2);
4395
4396 if (v2 > v1) return 1;
4397 if (v2 < v1) return -1;
4398
4399 if (d2->min < d1->min) return 1;
4400 if (d2->min > d1->min) return -1;
4401 return 0;
4402}
4403
4404static int
4405is_equal_mml(MinMaxLen* a, MinMaxLen* b)
4406{
4407 return (a->min == b->min && a->max == b->max) ? 1 : 0;
4408}
4409
4410
4411static void
4412set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
4413{
4414 mml->min = min;
4415 mml->max = max;
4416}
4417
4418static void
4419clear_mml(MinMaxLen* mml)
4420{
4421 mml->min = mml->max = 0;
4422}
4423
4424static void
4425copy_mml(MinMaxLen* to, MinMaxLen* from)
4426{
4427 to->min = from->min;
4428 to->max = from->max;
4429}
4430
4431static void
4432add_mml(MinMaxLen* to, MinMaxLen* from)
4433{
4434 to->min = distance_add(to->min, from->min);
4435 to->max = distance_add(to->max, from->max);
4436}
4437
4438#if 0
4439static void
4440add_len_mml(MinMaxLen* to, OnigDistance len)
4441{
4442 to->min = distance_add(to->min, len);
4443 to->max = distance_add(to->max, len);
4444}
4445#endif
4446
4447static void
4448alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
4449{
4450 if (to->min > from->min) to->min = from->min;
4451 if (to->max < from->max) to->max = from->max;
4452}
4453
4454static void
4455copy_opt_env(OptEnv* to, OptEnv* from)
4456{
4457 *to = *from;
4458}
4459
4460static void
4461clear_opt_anc_info(OptAncInfo* anc)
4462{
4463 anc->left_anchor = 0;
4464 anc->right_anchor = 0;
4465}
4466
4467static void
4468copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
4469{
4470 *to = *from;
4471}
4472
4473static void
4474concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
4475 OnigDistance left_len, OnigDistance right_len)
4476{
4477 clear_opt_anc_info(to);
4478
4479 to->left_anchor = left->left_anchor;
4480 if (left_len == 0) {
4481 to->left_anchor |= right->left_anchor;
4482 }
4483
4484 to->right_anchor = right->right_anchor;
4485 if (right_len == 0) {
4486 to->right_anchor |= left->right_anchor;
4487 }
4488 else {
4489 to->right_anchor |= (left->right_anchor & ANCHOR_PREC_READ_NOT);
4490 }
4491}
4492
4493static int
4494is_left_anchor(int anc)
4495{
4496 if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
4497 anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
4498 anc == ANCHOR_PREC_READ_NOT)
4499 return 0;
4500
4501 return 1;
4502}
4503
4504static int
4505is_set_opt_anc_info(OptAncInfo* to, int anc)
4506{
4507 if ((to->left_anchor & anc) != 0) return 1;
4508
4509 return ((to->right_anchor & anc) != 0 ? 1 : 0);
4510}
4511
4512static void
4513add_opt_anc_info(OptAncInfo* to, int anc)
4514{
4515 if (is_left_anchor(anc))
4516 to->left_anchor |= anc;
4517 else
4518 to->right_anchor |= anc;
4519}
4520
4521static void
4522remove_opt_anc_info(OptAncInfo* to, int anc)
4523{
4524 if (is_left_anchor(anc))
4525 to->left_anchor &= ~anc;
4526 else
4527 to->right_anchor &= ~anc;
4528}
4529
4530static void
4531alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
4532{
4533 to->left_anchor &= add->left_anchor;
4534 to->right_anchor &= add->right_anchor;
4535}
4536
4537static int
4538is_full_opt_exact_info(OptExactInfo* ex)
4539{
4540 return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
4541}
4542
4543static void
4544clear_opt_exact_info(OptExactInfo* ex)
4545{
4546 clear_mml(&ex->mmd);
4547 clear_opt_anc_info(&ex->anc);
4548 ex->reach_end = 0;
4549 ex->ignore_case = -1; /* unset */
4550 ex->len = 0;
4551 ex->s[0] = '\0';
4552}
4553
4554static void
4555copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
4556{
4557 *to = *from;
4558}
4559
4560static void
4561concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
4562{
4563 int i, j, len;
4564 UChar *p, *end;
4565 OptAncInfo tanc;
4566
4567 if (to->ignore_case < 0)
4568 to->ignore_case = add->ignore_case;
4569 else if (to->ignore_case != add->ignore_case)
4570 return ; /* avoid */
4571
4572 p = add->s;
4573 end = p + add->len;
4574 for (i = to->len; p < end; ) {
4575 len = enclen(enc, p, end);
4576 if (i + len > OPT_EXACT_MAXLEN) break;
4577 for (j = 0; j < len && p < end; j++)
4578 to->s[i++] = *p++;
4579 }
4580
4581 to->len = i;
4582 to->reach_end = (p == end ? add->reach_end : 0);
4583
4584 concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
4585 if (! to->reach_end) tanc.right_anchor = 0;
4586 copy_opt_anc_info(&to->anc, &tanc);
4587}
4588
4589static void
4590concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
4591 int raw ARG_UNUSED, OnigEncoding enc)
4592{
4593 int i, j, len;
4594 UChar *p;
4595
4596 for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
4597 len = enclen(enc, p, end);
4598 if (i + len > OPT_EXACT_MAXLEN) break;
4599 for (j = 0; j < len && p < end; j++)
4600 to->s[i++] = *p++;
4601 }
4602
4603 to->len = i;
4604}
4605
4606static void
4607alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
4608{
4609 int i, j, len;
4610
4611 if (add->len == 0 || to->len == 0) {
4612 clear_opt_exact_info(to);
4613 return ;
4614 }
4615
4616 if (! is_equal_mml(&to->mmd, &add->mmd)) {
4617 clear_opt_exact_info(to);
4618 return ;
4619 }
4620
4621 for (i = 0; i < to->len && i < add->len; ) {
4622 if (to->s[i] != add->s[i]) break;
4623 len = enclen(env->enc, to->s + i, to->s + to->len);
4624
4625 for (j = 1; j < len; j++) {
4626 if (to->s[i+j] != add->s[i+j]) break;
4627 }
4628 if (j < len) break;
4629 i += len;
4630 }
4631
4632 if (! add->reach_end || i < add->len || i < to->len) {
4633 to->reach_end = 0;
4634 }
4635 to->len = i;
4636 if (to->ignore_case < 0)
4637 to->ignore_case = add->ignore_case;
4638 else if (add->ignore_case >= 0)
4639 to->ignore_case |= add->ignore_case;
4640
4641 alt_merge_opt_anc_info(&to->anc, &add->anc);
4642 if (! to->reach_end) to->anc.right_anchor = 0;
4643}
4644
4645static void
4646select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
4647{
4648 int v1, v2;
4649
4650 v1 = now->len;
4651 v2 = alt->len;
4652
4653 if (v2 == 0) {
4654 return ;
4655 }
4656 else if (v1 == 0) {
4657 copy_opt_exact_info(now, alt);
4658 return ;
4659 }
4660 else if (v1 <= 2 && v2 <= 2) {
4661 /* ByteValTable[x] is big value --> low price */
4662 v2 = map_position_value(enc, now->s[0]);
4663 v1 = map_position_value(enc, alt->s[0]);
4664
4665 if (now->len > 1) v1 += 5;
4666 if (alt->len > 1) v2 += 5;
4667 }
4668
4669 if (now->ignore_case <= 0) v1 *= 2;
4670 if (alt->ignore_case <= 0) v2 *= 2;
4671
4672 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4673 copy_opt_exact_info(now, alt);
4674}
4675
4676static void
4677clear_opt_map_info(OptMapInfo* map)
4678{
4679 static const OptMapInfo clean_info = {
4680 {0, 0}, {0, 0}, 0,
4681 {
4682 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4683 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4684 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4685 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4686 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4687 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4688 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4689 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4690 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4691 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4692 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4693 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4694 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4695 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4696 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4697 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4698 }
4699 };
4700
4701 xmemcpy(map, &clean_info, sizeof(OptMapInfo));
4702}
4703
4704static void
4705copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
4706{
4707 *to = *from;
4708}
4709
4710static void
4711add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
4712{
4713 if (map->map[c] == 0) {
4714 map->map[c] = 1;
4715 map->value += map_position_value(enc, c);
4716 }
4717}
4718
4719static int
4720add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
4721 OnigEncoding enc, OnigCaseFoldType case_fold_flag)
4722{
4723 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4724 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4725 int i, n;
4726
4727 add_char_opt_map_info(map, p[0], enc);
4728
4729 case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
4730 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items);
4731 if (n < 0) return n;
4732
4733 for (i = 0; i < n; i++) {
4734 ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
4735 add_char_opt_map_info(map, buf[0], enc);
4736 }
4737
4738 return 0;
4739}
4740
4741static void
4742select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
4743{
4744 const int z = 1<<15; /* 32768: something big value */
4745
4746 int v1, v2;
4747
4748 if (alt->value == 0) return ;
4749 if (now->value == 0) {
4750 copy_opt_map_info(now, alt);
4751 return ;
4752 }
4753
4754 v1 = z / now->value;
4755 v2 = z / alt->value;
4756 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4757 copy_opt_map_info(now, alt);
4758}
4759
4760static int
4761comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
4762{
4763#define COMP_EM_BASE 20
4764 int ve, vm;
4765
4766 if (m->value <= 0) return -1;
4767
4768 ve = COMP_EM_BASE * e->len * (e->ignore_case > 0 ? 1 : 2);
4769 vm = COMP_EM_BASE * 5 * 2 / m->value;
4770 return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
4771}
4772
4773static void
4774alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
4775{
4776 int i, val;
4777
4778 /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
4779 if (to->value == 0) return ;
4780 if (add->value == 0 || to->mmd.max < add->mmd.min) {
4781 clear_opt_map_info(to);
4782 return ;
4783 }
4784
4785 alt_merge_mml(&to->mmd, &add->mmd);
4786
4787 val = 0;
4788 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4789 if (add->map[i])
4790 to->map[i] = 1;
4791
4792 if (to->map[i])
4793 val += map_position_value(enc, i);
4794 }
4795 to->value = val;
4796
4797 alt_merge_opt_anc_info(&to->anc, &add->anc);
4798}
4799
4800static void
4801set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
4802{
4803 copy_mml(&(opt->exb.mmd), mmd);
4804 copy_mml(&(opt->expr.mmd), mmd);
4805 copy_mml(&(opt->map.mmd), mmd);
4806}
4807
4808static void
4809clear_node_opt_info(NodeOptInfo* opt)
4810{
4811 clear_mml(&opt->len);
4812 clear_opt_anc_info(&opt->anc);
4813 clear_opt_exact_info(&opt->exb);
4814 clear_opt_exact_info(&opt->exm);
4815 clear_opt_exact_info(&opt->expr);
4816 clear_opt_map_info(&opt->map);
4817}
4818
4819static void
4820copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
4821{
4822 *to = *from;
4823}
4824
4825static void
4826concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
4827{
4828 int exb_reach, exm_reach;
4829 OptAncInfo tanc;
4830
4831 concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
4832 copy_opt_anc_info(&to->anc, &tanc);
4833
4834 if (add->exb.len > 0 && to->len.max == 0) {
4835 concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
4836 to->len.max, add->len.max);
4837 copy_opt_anc_info(&add->exb.anc, &tanc);
4838 }
4839
4840 if (add->map.value > 0 && to->len.max == 0) {
4841 if (add->map.mmd.max == 0)
4842 add->map.anc.left_anchor |= to->anc.left_anchor;
4843 }
4844
4845 exb_reach = to->exb.reach_end;
4846 exm_reach = to->exm.reach_end;
4847
4848 if (add->len.max != 0)
4849 to->exb.reach_end = to->exm.reach_end = 0;
4850
4851 if (add->exb.len > 0) {
4852 if (exb_reach) {
4853 concat_opt_exact_info(&to->exb, &add->exb, enc);
4854 clear_opt_exact_info(&add->exb);
4855 }
4856 else if (exm_reach) {
4857 concat_opt_exact_info(&to->exm, &add->exb, enc);
4858 clear_opt_exact_info(&add->exb);
4859 }
4860 }
4861 select_opt_exact_info(enc, &to->exm, &add->exb);
4862 select_opt_exact_info(enc, &to->exm, &add->exm);
4863
4864 if (to->expr.len > 0) {
4865 if (add->len.max > 0) {
4866 if (to->expr.len > (int )add->len.max)
4867 to->expr.len = (int )add->len.max;
4868
4869 if (to->expr.mmd.max == 0)
4870 select_opt_exact_info(enc, &to->exb, &to->expr);
4871 else
4872 select_opt_exact_info(enc, &to->exm, &to->expr);
4873 }
4874 }
4875 else if (add->expr.len > 0) {
4876 copy_opt_exact_info(&to->expr, &add->expr);
4877 }
4878
4879 select_opt_map_info(&to->map, &add->map);
4880
4881 add_mml(&to->len, &add->len);
4882}
4883
4884static void
4885alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
4886{
4887 alt_merge_opt_anc_info (&to->anc, &add->anc);
4888 alt_merge_opt_exact_info(&to->exb, &add->exb, env);
4889 alt_merge_opt_exact_info(&to->exm, &add->exm, env);
4890 alt_merge_opt_exact_info(&to->expr, &add->expr, env);
4891 alt_merge_opt_map_info(env->enc, &to->map, &add->map);
4892
4893 alt_merge_mml(&to->len, &add->len);
4894}
4895
4896
4897#define MAX_NODE_OPT_INFO_REF_COUNT 5
4898
4899static int
4900optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
4901{
4902 int type;
4903 int r = 0;
4904
4905 clear_node_opt_info(opt);
4906 set_bound_node_opt_info(opt, &env->mmd);
4907
4908 type = NTYPE(node);
4909 switch (type) {
4910 case NT_LIST:
4911 {
4912 OptEnv nenv;
4913 NodeOptInfo nopt;
4914 Node* nd = node;
4915
4916 copy_opt_env(&nenv, env);
4917 do {
4918 r = optimize_node_left(NCAR(nd), &nopt, &nenv);
4919 if (r == 0) {
4920 add_mml(&nenv.mmd, &nopt.len);
4921 concat_left_node_opt_info(env->enc, opt, &nopt);
4922 }
4923 } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
4924 }
4925 break;
4926
4927 case NT_ALT:
4928 {
4929 NodeOptInfo nopt;
4930 Node* nd = node;
4931
4932 do {
4933 r = optimize_node_left(NCAR(nd), &nopt, env);
4934 if (r == 0) {
4935 if (nd == node) copy_node_opt_info(opt, &nopt);
4936 else alt_merge_node_opt_info(opt, &nopt, env);
4937 }
4938 } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
4939 }
4940 break;
4941
4942 case NT_STR:
4943 {
4944 StrNode* sn = NSTR(node);
4945 OnigDistance slen = sn->end - sn->s;
4946 int is_raw = NSTRING_IS_RAW(node);
4947
4948 if (! NSTRING_IS_AMBIG(node)) {
4949 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4950 is_raw, env->enc);
4951 opt->exb.ignore_case = 0;
4952 if (slen > 0) {
4953 add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
4954 }
4955 set_mml(&opt->len, slen, slen);
4956 }
4957 else {
4958 OnigDistance max;
4959
4960 if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
4961 int n = onigenc_strlen(env->enc, sn->s, sn->end);
4962 max = (OnigDistance )ONIGENC_MBC_MAXLEN_DIST(env->enc) * (OnigDistance)n;
4963 }
4964 else {
4965 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4966 is_raw, env->enc);
4967 opt->exb.ignore_case = 1;
4968
4969 if (slen > 0) {
4970 r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
4971 env->enc, env->case_fold_flag);
4972 if (r != 0) break;
4973 }
4974
4975 max = slen;
4976 }
4977
4978 set_mml(&opt->len, slen, max);
4979 }
4980
4981 if ((OnigDistance )opt->exb.len == slen)
4982 opt->exb.reach_end = 1;
4983 }
4984 break;
4985
4986 case NT_CCLASS:
4987 {
4988 int i, z;
4989 CClassNode* cc = NCCLASS(node);
4990
4991 /* no need to check ignore case. (set in setup_tree()) */
4992
4993 if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
4994 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4995 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4996
4997 set_mml(&opt->len, min, max);
4998 }
4999 else {
5000 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
5001 z = BITSET_AT(cc->bs, i);
5002 if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
5003 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
5004 }
5005 }
5006 set_mml(&opt->len, 1, 1);
5007 }
5008 }
5009 break;
5010
5011 case NT_CTYPE:
5012 {
5013 int i, min, max;
5014 int maxcode;
5015
5016 max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
5017
5018 if (max == 1) {
5019 min = 1;
5020
5021 maxcode = NCTYPE(node)->ascii_range ? 0x80 : SINGLE_BYTE_SIZE;
5022 switch (NCTYPE(node)->ctype) {
5023 case ONIGENC_CTYPE_WORD:
5024 if (NCTYPE(node)->not != 0) {
5025 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
5026 if (! ONIGENC_IS_CODE_WORD(env->enc, i) || i >= maxcode) {
5027 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
5028 }
5029 }
5030 }
5031 else {
5032 for (i = 0; i < maxcode; i++) {
5033 if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
5034 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
5035 }
5036 }
5037 }
5038 break;
5039 }
5040 }
5041 else {
5042 min = ONIGENC_MBC_MINLEN(env->enc);
5043 }
5044 set_mml(&opt->len, min, max);
5045 }
5046 break;
5047
5048 case NT_CANY:
5049 {
5050 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
5051 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
5052 set_mml(&opt->len, min, max);
5053 }
5054 break;
5055
5056 case NT_ANCHOR:
5057 switch (NANCHOR(node)->type) {
5058 case ANCHOR_BEGIN_BUF:
5059 case ANCHOR_BEGIN_POSITION:
5060 case ANCHOR_BEGIN_LINE:
5061 case ANCHOR_END_BUF:
5062 case ANCHOR_SEMI_END_BUF:
5063 case ANCHOR_END_LINE:
5064 case ANCHOR_LOOK_BEHIND: /* just for (?<=x).* */
5065 case ANCHOR_PREC_READ_NOT: /* just for (?!x).* */
5066 add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
5067 break;
5068
5069 case ANCHOR_PREC_READ:
5070 {
5071 NodeOptInfo nopt;
5072
5073 r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
5074 if (r == 0) {
5075 if (nopt.exb.len > 0)
5076 copy_opt_exact_info(&opt->expr, &nopt.exb);
5077 else if (nopt.exm.len > 0)
5078 copy_opt_exact_info(&opt->expr, &nopt.exm);
5079
5080 opt->expr.reach_end = 0;
5081
5082 if (nopt.map.value > 0)
5083 copy_opt_map_info(&opt->map, &nopt.map);
5084 }
5085 }
5086 break;
5087
5088 case ANCHOR_LOOK_BEHIND_NOT:
5089 break;
5090 }
5091 break;
5092
5093 case NT_BREF:
5094 {
5095 int i;
5096 int* backs;
5097 OnigDistance min, max, tmin, tmax;
5098 Node** nodes = SCANENV_MEM_NODES(env->scan_env);
5099 BRefNode* br = NBREF(node);
5100
5101 if (br->state & NST_RECURSION) {
5102 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
5103 break;
5104 }
5105 backs = BACKREFS_P(br);
5106 r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
5107 if (r != 0) break;
5108 r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
5109 if (r != 0) break;
5110 for (i = 1; i < br->back_num; i++) {
5111 r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
5112 if (r != 0) break;
5113 r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
5114 if (r != 0) break;
5115 if (min > tmin) min = tmin;
5116 if (max < tmax) max = tmax;
5117 }
5118 if (r == 0) set_mml(&opt->len, min, max);
5119 }
5120 break;
5121
5122#ifdef USE_SUBEXP_CALL
5123 case NT_CALL:
5124 if (IS_CALL_RECURSION(NCALL(node)))
5125 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
5126 else {
5127 OnigOptionType save = env->options;
5128 env->options = NENCLOSE(NCALL(node)->target)->option;
5129 r = optimize_node_left(NCALL(node)->target, opt, env);
5130 env->options = save;
5131 }
5132 break;
5133#endif
5134
5135 case NT_QTFR:
5136 {
5137 int i;
5138 OnigDistance min, max;
5139 NodeOptInfo nopt;
5140 QtfrNode* qn = NQTFR(node);
5141
5142 r = optimize_node_left(qn->target, &nopt, env);
5143 if (r) break;
5144
5145 if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) {
5146 if (env->mmd.max == 0 &&
5147 NTYPE(qn->target) == NT_CANY && qn->greedy) {
5148 if (IS_MULTILINE(env->options))
5149 /* implicit anchor: /.*a/ ==> /\A.*a/ */
5150 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
5151 else
5152 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
5153 }
5154 }
5155 else {
5156 if (qn->lower > 0) {
5157 copy_node_opt_info(opt, &nopt);
5158 if (nopt.exb.len > 0) {
5159 if (nopt.exb.reach_end) {
5160 for (i = 2; i <= qn->lower &&
5161 ! is_full_opt_exact_info(&opt->exb); i++) {
5162 concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
5163 }
5164 if (i < qn->lower) {
5165 opt->exb.reach_end = 0;
5166 }
5167 }
5168 }
5169
5170 if (qn->lower != qn->upper) {
5171 opt->exb.reach_end = 0;
5172 opt->exm.reach_end = 0;
5173 }
5174 if (qn->lower > 1)
5175 opt->exm.reach_end = 0;
5176 }
5177 }
5178
5179 min = distance_multiply(nopt.len.min, qn->lower);
5180 if (IS_REPEAT_INFINITE(qn->upper))
5181 max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
5182 else
5183 max = distance_multiply(nopt.len.max, qn->upper);
5184
5185 set_mml(&opt->len, min, max);
5186 }
5187 break;
5188
5189 case NT_ENCLOSE:
5190 {
5191 EncloseNode* en = NENCLOSE(node);
5192
5193 switch (en->type) {
5194 case ENCLOSE_OPTION:
5195 {
5196 OnigOptionType save = env->options;
5197
5198 env->options = en->option;
5199 r = optimize_node_left(en->target, opt, env);
5200 env->options = save;
5201 }
5202 break;
5203
5204 case ENCLOSE_MEMORY:
5205#ifdef USE_SUBEXP_CALL
5206 en->opt_count++;
5207 if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
5208 OnigDistance min, max;
5209
5210 min = 0;
5211 max = ONIG_INFINITE_DISTANCE;
5212 if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
5213 if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
5214 set_mml(&opt->len, min, max);
5215 }
5216 else
5217#endif
5218 {
5219 r = optimize_node_left(en->target, opt, env);
5220
5221 if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
5222 if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
5223 remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
5224 }
5225 }
5226 break;
5227
5228 case ENCLOSE_STOP_BACKTRACK:
5229 case ENCLOSE_CONDITION:
5230 r = optimize_node_left(en->target, opt, env);
5231 break;
5232
5233 case ENCLOSE_ABSENT:
5234 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
5235 break;
5236 }
5237 }
5238 break;
5239
5240 default:
5241#ifdef ONIG_DEBUG
5242 fprintf(stderr, "optimize_node_left: undefined node type %d\n",
5243 NTYPE(node));
5244#endif
5245 r = ONIGERR_TYPE_BUG;
5246 break;
5247 }
5248
5249 return r;
5250}
5251
5252static int
5253set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
5254{
5255 int allow_reverse;
5256
5257 if (e->len == 0) return 0;
5258
5259 reg->exact = (UChar* )xmalloc(e->len);
5260 CHECK_NULL_RETURN_MEMERR(reg->exact);
5261 xmemcpy(reg->exact, e->s, e->len);
5262 reg->exact_end = reg->exact + e->len;
5263
5264 allow_reverse =
5265 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
5266
5267 if (e->ignore_case > 0) {
5268 if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
5269 int orig_len = e->len;
5270 e->len = set_bm_skip(reg->exact, reg->exact_end, reg,
5271 reg->map, 1);
5272 if (e->len >= 3) {
5273 reg->exact_end = reg->exact + e->len;
5274 reg->optimize = (allow_reverse != 0
5275 ? ONIG_OPTIMIZE_EXACT_BM_IC : ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC);
5276 }
5277 else {
5278 /* Even if BM skip table can't be built (e.g., pattern starts with
5279 's' or 'k' which have multi-byte case fold variants), we should
5280 still use EXACT_IC optimization with the original pattern.
5281 Without this fallback, patterns like /slackware/i have no
5282 optimization at all, causing severe performance regression
5283 especially with non-ASCII strings. See [Bug #21824] */
5284 e->len = orig_len; /* Restore original length for EXACT_IC */
5285 reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
5286 }
5287 }
5288 else {
5289 reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
5290 }
5291 }
5292 else {
5293 if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
5294 set_bm_skip(reg->exact, reg->exact_end, reg,
5295 reg->map, 0);
5296 reg->optimize = (allow_reverse != 0
5297 ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
5298 }
5299 else {
5300 reg->optimize = ONIG_OPTIMIZE_EXACT;
5301 }
5302 }
5303
5304 reg->dmin = e->mmd.min;
5305 reg->dmax = e->mmd.max;
5306
5307 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
5308 reg->threshold_len = (int )(reg->dmin + (reg->exact_end - reg->exact));
5309 }
5310
5311 return 0;
5312}
5313
5314static void
5315set_optimize_map_info(regex_t* reg, OptMapInfo* m)
5316{
5317 int i;
5318
5319 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5320 reg->map[i] = m->map[i];
5321
5322 reg->optimize = ONIG_OPTIMIZE_MAP;
5323 reg->dmin = m->mmd.min;
5324 reg->dmax = m->mmd.max;
5325
5326 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
5327 reg->threshold_len = (int )(reg->dmin + 1);
5328 }
5329}
5330
5331static void
5332set_sub_anchor(regex_t* reg, OptAncInfo* anc)
5333{
5334 reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE;
5335 reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
5336}
5337
5338#if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5339static void print_optimize_info(FILE* f, regex_t* reg);
5340#endif
5341
5342static int
5343set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
5344{
5345
5346 int r;
5347 NodeOptInfo opt;
5348 OptEnv env;
5349
5350 env.enc = reg->enc;
5351 env.options = reg->options;
5352 env.case_fold_flag = reg->case_fold_flag;
5353 env.scan_env = scan_env;
5354 clear_mml(&env.mmd);
5355
5356 r = optimize_node_left(node, &opt, &env);
5357 if (r) return r;
5358
5359 reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
5360 ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML |
5361 ANCHOR_LOOK_BEHIND);
5362
5363 if ((opt.anc.left_anchor & (ANCHOR_LOOK_BEHIND | ANCHOR_PREC_READ_NOT)) != 0)
5364 reg->anchor &= ~ANCHOR_ANYCHAR_STAR_ML;
5365
5366 reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF |
5367 ANCHOR_PREC_READ_NOT);
5368
5369 if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
5370 reg->anchor_dmin = opt.len.min;
5371 reg->anchor_dmax = opt.len.max;
5372 }
5373
5374 if (opt.exb.len > 0 || opt.exm.len > 0) {
5375 select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
5376 if (opt.map.value > 0 &&
5377 comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
5378 goto set_map;
5379 }
5380 else {
5381 r = set_optimize_exact_info(reg, &opt.exb);
5382 set_sub_anchor(reg, &opt.exb.anc);
5383 }
5384 }
5385 else if (opt.map.value > 0) {
5386 set_map:
5387 set_optimize_map_info(reg, &opt.map);
5388 set_sub_anchor(reg, &opt.map.anc);
5389 }
5390 else {
5391 reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
5392 if (opt.len.max == 0)
5393 reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
5394 }
5395
5396#if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5397 print_optimize_info(stderr, reg);
5398#endif
5399 return r;
5400}
5401
5402static void
5403clear_optimize_info(regex_t* reg)
5404{
5405 reg->optimize = ONIG_OPTIMIZE_NONE;
5406 reg->anchor = 0;
5407 reg->anchor_dmin = 0;
5408 reg->anchor_dmax = 0;
5409 reg->sub_anchor = 0;
5410 reg->exact_end = (UChar* )NULL;
5411 reg->threshold_len = 0;
5412 xfree(reg->exact);
5413 reg->exact = (UChar* )NULL;
5414}
5415
5416#ifdef ONIG_DEBUG
5417
5418static void print_enc_string(FILE* fp, OnigEncoding enc,
5419 const UChar *s, const UChar *end)
5420{
5421 fprintf(fp, "\nPATTERN: /");
5422
5423 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5424 const UChar *p;
5425 OnigCodePoint code;
5426
5427 p = s;
5428 while (p < end) {
5429 code = ONIGENC_MBC_TO_CODE(enc, p, end);
5430 if (code >= 0x80) {
5431 fprintf(fp, " 0x%04x ", (int )code);
5432 }
5433 else {
5434 fputc((int )code, fp);
5435 }
5436
5437 p += enclen(enc, p, end);
5438 }
5439 }
5440 else {
5441 while (s < end) {
5442 fputc((int )*s, fp);
5443 s++;
5444 }
5445 }
5446
5447 fprintf(fp, "/ (%s)\n", enc->name);
5448}
5449#endif /* ONIG_DEBUG */
5450
5451#if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5452static void
5453print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
5454{
5455 if (a == ONIG_INFINITE_DISTANCE)
5456 fputs("inf", f);
5457 else
5458 fprintf(f, "(%"PRIuPTR")", a);
5459
5460 fputs("-", f);
5461
5462 if (b == ONIG_INFINITE_DISTANCE)
5463 fputs("inf", f);
5464 else
5465 fprintf(f, "(%"PRIuPTR")", b);
5466}
5467
5468static void
5469print_anchor(FILE* f, int anchor)
5470{
5471 int q = 0;
5472
5473 fprintf(f, "[");
5474
5475 if (anchor & ANCHOR_BEGIN_BUF) {
5476 fprintf(f, "begin-buf");
5477 q = 1;
5478 }
5479 if (anchor & ANCHOR_BEGIN_LINE) {
5480 if (q) fprintf(f, ", ");
5481 q = 1;
5482 fprintf(f, "begin-line");
5483 }
5484 if (anchor & ANCHOR_BEGIN_POSITION) {
5485 if (q) fprintf(f, ", ");
5486 q = 1;
5487 fprintf(f, "begin-pos");
5488 }
5489 if (anchor & ANCHOR_END_BUF) {
5490 if (q) fprintf(f, ", ");
5491 q = 1;
5492 fprintf(f, "end-buf");
5493 }
5494 if (anchor & ANCHOR_SEMI_END_BUF) {
5495 if (q) fprintf(f, ", ");
5496 q = 1;
5497 fprintf(f, "semi-end-buf");
5498 }
5499 if (anchor & ANCHOR_END_LINE) {
5500 if (q) fprintf(f, ", ");
5501 q = 1;
5502 fprintf(f, "end-line");
5503 }
5504 if (anchor & ANCHOR_ANYCHAR_STAR) {
5505 if (q) fprintf(f, ", ");
5506 q = 1;
5507 fprintf(f, "anychar-star");
5508 }
5509 if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
5510 if (q) fprintf(f, ", ");
5511 fprintf(f, "anychar-star-ml");
5512 }
5513
5514 fprintf(f, "]");
5515}
5516
5517static void
5518print_optimize_info(FILE* f, regex_t* reg)
5519{
5520 static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
5521 "EXACT_IC", "MAP",
5522 "EXACT_BM_IC", "EXACT_BM_NOT_REV_IC" };
5523
5524 fprintf(f, "optimize: %s\n", on[reg->optimize]);
5525 fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
5526 if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
5527 print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
5528 fprintf(f, "\n");
5529
5530 if (reg->optimize) {
5531 fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
5532 fprintf(f, "\n");
5533 }
5534 fprintf(f, "\n");
5535
5536 if (reg->exact) {
5537 UChar *p;
5538 fprintf(f, "exact: [");
5539 for (p = reg->exact; p < reg->exact_end; p++) {
5540 fputc(*p, f);
5541 }
5542 fprintf(f, "]: length: %"PRIdPTR"\n", (reg->exact_end - reg->exact));
5543 }
5544 else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
5545 int c, i, n = 0;
5546
5547 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5548 if (reg->map[i]) n++;
5549
5550 fprintf(f, "map: n=%d\n", n);
5551 if (n > 0) {
5552 c = 0;
5553 fputc('[', f);
5554 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
5555 if (reg->map[i] != 0) {
5556 if (c > 0) fputs(", ", f);
5557 c++;
5558 if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
5559 ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
5560 fputc(i, f);
5561 else
5562 fprintf(f, "%d", i);
5563 }
5564 }
5565 fprintf(f, "]\n");
5566 }
5567 }
5568}
5569#endif /* ONIG_DEBUG_COMPILE || ONIG_DEBUG_MATCH */
5570
5571
5572extern void
5573onig_free_body(regex_t* reg)
5574{
5575 if (IS_NOT_NULL(reg)) {
5576 xfree(reg->p);
5577 xfree(reg->exact);
5578 xfree(reg->repeat_range);
5579 onig_free(reg->chain);
5580
5581#ifdef USE_NAMED_GROUP
5582 onig_names_free(reg);
5583#endif
5584 }
5585}
5586
5587extern void
5588onig_free(regex_t* reg)
5589{
5590 if (IS_NOT_NULL(reg)) {
5591 onig_free_body(reg);
5592 xfree(reg);
5593 }
5594}
5595
5596static void*
5597dup_copy(const void *ptr, size_t size)
5598{
5599 void *newptr = xmalloc(size);
5600 if (IS_NOT_NULL(newptr)) {
5601 memcpy(newptr, ptr, size);
5602 }
5603 return newptr;
5604}
5605
5606extern int
5607onig_reg_copy(regex_t** nreg, regex_t* oreg)
5608{
5609 if (IS_NOT_NULL(oreg)) {
5610 regex_t *reg = *nreg = (regex_t* )xmalloc(sizeof(regex_t));
5611 if (IS_NULL(reg)) return ONIGERR_MEMORY;
5612
5613 *reg = *oreg;
5614
5615# define COPY_FAILED(mem, size) IS_NULL(reg->mem = dup_copy(reg->mem, size))
5616
5617 if (IS_NOT_NULL(reg->exact)) {
5618 size_t exact_size = reg->exact_end - reg->exact;
5619 if (COPY_FAILED(exact, exact_size))
5620 goto err;
5621 (reg)->exact_end = (reg)->exact + exact_size;
5622 }
5623
5624 if (IS_NOT_NULL(reg->p)) {
5625 if (COPY_FAILED(p, reg->alloc))
5626 goto err_p;
5627 }
5628 if (IS_NOT_NULL(reg->repeat_range)) {
5629 if (COPY_FAILED(repeat_range, reg->repeat_range_alloc * sizeof(OnigRepeatRange)))
5630 goto err_repeat_range;
5631 }
5632 if (IS_NOT_NULL(reg->name_table)) {
5633 if (onig_names_copy(reg, oreg))
5634 goto err_name_table;
5635 }
5636 if (IS_NOT_NULL(reg->chain)) {
5637 if (onig_reg_copy(&reg->chain, reg->chain))
5638 goto err_chain;
5639 }
5640 return 0;
5641# undef COPY_FAILED
5642
5643 err_chain:
5644 onig_names_free(reg);
5645 err_name_table:
5646 xfree(reg->repeat_range);
5647 err_repeat_range:
5648 xfree(reg->p);
5649 err_p:
5650 xfree(reg->exact);
5651 err:
5652 xfree(reg);
5653 return ONIGERR_MEMORY;
5654 }
5655 return 0;
5656}
5657
5658#ifdef RUBY
5659size_t
5660onig_memsize(const regex_t *reg)
5661{
5662 size_t size = sizeof(regex_t);
5663 if (IS_NULL(reg)) return 0;
5664 if (IS_NOT_NULL(reg->p)) size += reg->alloc;
5665 if (IS_NOT_NULL(reg->exact)) size += reg->exact_end - reg->exact;
5666 if (IS_NOT_NULL(reg->repeat_range)) size += reg->repeat_range_alloc * sizeof(OnigRepeatRange);
5667 if (IS_NOT_NULL(reg->chain)) size += onig_memsize(reg->chain);
5668
5669 return size;
5670}
5671
5672size_t
5673onig_region_memsize(const OnigRegion *regs)
5674{
5675 size_t size = sizeof(*regs);
5676 if (IS_NULL(regs)) return 0;
5677 size += regs->allocated * (sizeof(*regs->beg) + sizeof(*regs->end));
5678 return size;
5679}
5680#endif
5681
5682#define REGEX_TRANSFER(to,from) do {\
5683 onig_free_body(to);\
5684 xmemcpy(to, from, sizeof(regex_t));\
5685 xfree(from);\
5686} while (0)
5687
5688#if 0
5689extern void
5690onig_transfer(regex_t* to, regex_t* from)
5691{
5692 REGEX_TRANSFER(to, from);
5693}
5694#endif
5695
5696#ifdef ONIG_DEBUG_COMPILE
5697static void print_compiled_byte_code_list(FILE* f, regex_t* reg);
5698#endif
5699#ifdef ONIG_DEBUG_PARSE_TREE
5700static void print_tree(FILE* f, Node* node);
5701#endif
5702
5703#ifdef RUBY
5704extern int
5705onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5706 OnigErrorInfo* einfo)
5707{
5708 return onig_compile_ruby(reg, pattern, pattern_end, einfo, NULL, 0);
5709}
5710#endif
5711
5712#ifdef RUBY
5713extern int
5714onig_compile_ruby(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5715 OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
5716#else
5717extern int
5718onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5719 OnigErrorInfo* einfo)
5720#endif
5721{
5722#define COMPILE_INIT_SIZE 20
5723
5724 int r;
5725 OnigDistance init_size;
5726 Node* root;
5727 ScanEnv scan_env = {0};
5728#ifdef USE_SUBEXP_CALL
5729 UnsetAddrList uslist;
5730#endif
5731
5732 if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
5733
5734#ifdef RUBY
5735 scan_env.sourcefile = sourcefile;
5736 scan_env.sourceline = sourceline;
5737#endif
5738
5739#ifdef ONIG_DEBUG
5740 print_enc_string(stderr, reg->enc, pattern, pattern_end);
5741#endif
5742
5743 if (reg->alloc == 0) {
5744 init_size = (pattern_end - pattern) * 2;
5745 if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
5746 r = BBUF_INIT(reg, init_size);
5747 if (r != 0) goto end;
5748 }
5749 else
5750 reg->used = 0;
5751
5752 reg->num_mem = 0;
5753 reg->num_repeat = 0;
5754 reg->num_null_check = 0;
5755 reg->repeat_range_alloc = 0;
5756 reg->repeat_range = (OnigRepeatRange* )NULL;
5757#ifdef USE_COMBINATION_EXPLOSION_CHECK
5758 reg->num_comb_exp_check = 0;
5759#endif
5760
5761 r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
5762 if (r != 0) goto err;
5763
5764#ifdef ONIG_DEBUG_PARSE_TREE
5765# if 0
5766 fprintf(stderr, "ORIGINAL PARSE TREE:\n");
5767 print_tree(stderr, root);
5768# endif
5769#endif
5770
5771#ifdef USE_NAMED_GROUP
5772 /* mixed use named group and no-named group */
5773 if (scan_env.num_named > 0 &&
5774 IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5775 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
5776 if (scan_env.num_named != scan_env.num_mem)
5777 r = disable_noname_group_capture(&root, reg, &scan_env);
5778 else
5779 r = numbered_ref_check(root);
5780
5781 if (r != 0) goto err;
5782 }
5783#endif
5784
5785#ifdef USE_SUBEXP_CALL
5786 if (scan_env.num_call > 0) {
5787 r = unset_addr_list_init(&uslist, scan_env.num_call);
5788 if (r != 0) goto err;
5789 scan_env.unset_addr_list = &uslist;
5790 r = setup_subexp_call(root, &scan_env);
5791 if (r != 0) goto err_unset;
5792 r = subexp_recursive_check_trav(root, &scan_env);
5793 if (r < 0) goto err_unset;
5794 r = subexp_inf_recursive_check_trav(root, &scan_env);
5795 if (r != 0) goto err_unset;
5796
5797 reg->num_call = scan_env.num_call;
5798 }
5799 else
5800 reg->num_call = 0;
5801#endif
5802
5803 r = setup_tree(root, reg, 0, &scan_env);
5804 if (r != 0) goto err_unset;
5805
5806#ifdef ONIG_DEBUG_PARSE_TREE
5807 print_tree(stderr, root);
5808#endif
5809
5810 reg->capture_history = scan_env.capture_history;
5811 reg->bt_mem_start = scan_env.bt_mem_start;
5812 reg->bt_mem_start |= reg->capture_history;
5813 if (IS_FIND_CONDITION(reg->options))
5814 BIT_STATUS_ON_ALL(reg->bt_mem_end);
5815 else {
5816 reg->bt_mem_end = scan_env.bt_mem_end;
5817 reg->bt_mem_end |= reg->capture_history;
5818 }
5819
5820#ifdef USE_COMBINATION_EXPLOSION_CHECK
5821 if (scan_env.backrefed_mem == 0
5822# ifdef USE_SUBEXP_CALL
5823 || scan_env.num_call == 0
5824# endif
5825 ) {
5826 setup_comb_exp_check(root, 0, &scan_env);
5827# ifdef USE_SUBEXP_CALL
5828 if (scan_env.has_recursion != 0) {
5829 scan_env.num_comb_exp_check = 0;
5830 }
5831 else
5832# endif
5833 if (scan_env.comb_exp_max_regnum > 0) {
5834 int i;
5835 for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
5836 if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
5837 scan_env.num_comb_exp_check = 0;
5838 break;
5839 }
5840 }
5841 }
5842 }
5843
5844 reg->num_comb_exp_check = scan_env.num_comb_exp_check;
5845#endif
5846
5847 clear_optimize_info(reg);
5848#ifndef ONIG_DONT_OPTIMIZE
5849 r = set_optimize_info_from_tree(root, reg, &scan_env);
5850 if (r != 0) goto err_unset;
5851#endif
5852
5853 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
5854 xfree(scan_env.mem_nodes_dynamic);
5855 scan_env.mem_nodes_dynamic = (Node** )NULL;
5856 }
5857
5858 r = compile_tree(root, reg);
5859 if (r == 0) {
5860 r = add_opcode(reg, OP_END);
5861#ifdef USE_SUBEXP_CALL
5862 if (scan_env.num_call > 0) {
5863 r = unset_addr_list_fix(&uslist, reg);
5864 unset_addr_list_end(&uslist);
5865 if (r) goto err;
5866 }
5867#endif
5868
5869 if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
5870 reg->stack_pop_level = STACK_POP_LEVEL_ALL;
5871 else {
5872 if (reg->bt_mem_start != 0)
5873 reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
5874 else
5875 reg->stack_pop_level = STACK_POP_LEVEL_FREE;
5876 }
5877 }
5878#ifdef USE_SUBEXP_CALL
5879 else if (scan_env.num_call > 0) {
5880 unset_addr_list_end(&uslist);
5881 }
5882#endif
5883 onig_node_free(root);
5884
5885#ifdef ONIG_DEBUG_COMPILE
5886# ifdef USE_NAMED_GROUP
5887 onig_print_names(stderr, reg);
5888# endif
5889 print_compiled_byte_code_list(stderr, reg);
5890#endif
5891
5892 end:
5893 onig_reg_resize(reg);
5894 return r;
5895
5896 err_unset:
5897#ifdef USE_SUBEXP_CALL
5898 if (scan_env.num_call > 0) {
5899 unset_addr_list_end(&uslist);
5900 }
5901#endif
5902 err:
5903 if (IS_NOT_NULL(scan_env.error)) {
5904 if (IS_NOT_NULL(einfo)) {
5905 einfo->enc = scan_env.enc;
5906 einfo->par = scan_env.error;
5907 einfo->par_end = scan_env.error_end;
5908 }
5909 }
5910
5911 onig_node_free(root);
5912 xfree(scan_env.mem_nodes_dynamic);
5913
5914 return r;
5915}
5916
5917static int onig_inited = 0;
5918
5919extern int
5920onig_reg_init(regex_t* reg, OnigOptionType option,
5921 OnigCaseFoldType case_fold_flag,
5922 OnigEncoding enc, const OnigSyntaxType* syntax)
5923{
5924 if (! onig_inited)
5925 onig_init();
5926
5927 if (IS_NULL(reg))
5928 return ONIGERR_INVALID_ARGUMENT;
5929
5930 (reg)->exact = (UChar* )NULL;
5931 (reg)->chain = (regex_t* )NULL;
5932 (reg)->p = (UChar* )NULL;
5933 (reg)->name_table = (void* )NULL;
5934 (reg)->repeat_range = (OnigRepeatRange* )NULL;
5935
5936 if (ONIGENC_IS_UNDEF(enc))
5937 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SET;
5938
5939 if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
5940 == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
5941 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
5942 }
5943
5944 if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
5945 option |= syntax->options;
5946 option &= ~ONIG_OPTION_SINGLELINE;
5947 }
5948 else
5949 option |= syntax->options;
5950
5951 (reg)->enc = enc;
5952 (reg)->options = option;
5953 (reg)->syntax = syntax;
5954 (reg)->optimize = 0;
5955
5956 (reg)->alloc = 0;
5957 (reg)->used = 0;
5958
5959 (reg)->case_fold_flag = case_fold_flag;
5960
5961 (reg)->timelimit = 0;
5962
5963 return 0;
5964}
5965
5966extern int
5967onig_new_without_alloc(regex_t* reg, const UChar* pattern,
5968 const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
5969 const OnigSyntaxType* syntax, OnigErrorInfo* einfo)
5970{
5971 int r;
5972
5973 r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5974 if (r) return r;
5975
5976 r = onig_compile(reg, pattern, pattern_end, einfo);
5977 return r;
5978}
5979
5980extern int
5981onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
5982 OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
5983 OnigErrorInfo* einfo)
5984{
5985 *reg = (regex_t* )xmalloc(sizeof(regex_t));
5986 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
5987
5988 int r = onig_new_without_alloc(*reg, pattern, pattern_end, option, enc, syntax, einfo);
5989 if (r) {
5990 onig_free(*reg);
5991 *reg = NULL;
5992 }
5993
5994 return r;
5995}
5996
5997extern int
5998onig_initialize(OnigEncoding encodings[] ARG_UNUSED, int n ARG_UNUSED)
5999{
6000 return onig_init();
6001}
6002
6003extern int
6004onig_init(void)
6005{
6006 if (onig_inited != 0)
6007 return 0;
6008
6009 onig_inited = 1;
6010
6011#if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER)
6012 _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
6013#endif
6014
6015 onigenc_init();
6016 /* onigenc_set_default_caseconv_table((UChar* )0); */
6017
6018#ifdef ONIG_DEBUG_STATISTICS
6019 onig_statistics_init();
6020#endif
6021
6022 return 0;
6023}
6024
6025
6026static OnigEndCallListItemType* EndCallTop;
6027
6028extern void onig_add_end_call(void (*func)(void))
6029{
6031
6032 item = (OnigEndCallListItemType* )xmalloc(sizeof(*item));
6033 if (item == 0) return ;
6034
6035 item->next = EndCallTop;
6036 item->func = func;
6037
6038 EndCallTop = item;
6039}
6040
6041static void
6042exec_end_call_list(void)
6043{
6045 void (*func)(void);
6046
6047 while (EndCallTop != 0) {
6048 func = EndCallTop->func;
6049 (*func)();
6050
6051 prev = EndCallTop;
6052 EndCallTop = EndCallTop->next;
6053 xfree(prev);
6054 }
6055}
6056
6057extern int
6058onig_end(void)
6059{
6060 exec_end_call_list();
6061
6062#ifdef ONIG_DEBUG_STATISTICS
6063 onig_print_statistics(stderr);
6064#endif
6065
6066#if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER)
6067 _CrtDumpMemoryLeaks();
6068#endif
6069
6070 onig_inited = 0;
6071
6072 return 0;
6073}
6074
6075extern int
6076onig_is_in_code_range(const UChar* p, OnigCodePoint code)
6077{
6078 OnigCodePoint n, *data;
6079 OnigCodePoint low, high, x;
6080
6081 GET_CODE_POINT(n, p);
6082 data = (OnigCodePoint* )p;
6083 data++;
6084
6085 for (low = 0, high = n; low < high; ) {
6086 x = (low + high) >> 1;
6087 if (code > data[x * 2 + 1])
6088 low = x + 1;
6089 else
6090 high = x;
6091 }
6092
6093 return ((low < n && code >= data[low * 2]) ? 1 : 0);
6094}
6095
6096extern int
6097onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
6098{
6099 int found;
6100
6101 if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
6102 if (IS_NULL(cc->mbuf)) {
6103 found = 0;
6104 }
6105 else {
6106 found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
6107 }
6108 }
6109 else {
6110 found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
6111 }
6112
6113 if (IS_NCCLASS_NOT(cc))
6114 return !found;
6115 else
6116 return found;
6117}
6118
6119extern int
6120onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
6121{
6122 int len;
6123
6124 if (ONIGENC_MBC_MINLEN(enc) > 1) {
6125 len = 2;
6126 }
6127 else {
6128 len = ONIGENC_CODE_TO_MBCLEN(enc, code);
6129 }
6130 return onig_is_code_in_cc_len(len, code, cc);
6131}
6132
6133
6134#ifdef ONIG_DEBUG
6135
6136/* arguments type */
6137# define ARG_SPECIAL -1
6138# define ARG_NON 0
6139# define ARG_RELADDR 1
6140# define ARG_ABSADDR 2
6141# define ARG_LENGTH 3
6142# define ARG_MEMNUM 4
6143# define ARG_OPTION 5
6144# define ARG_STATE_CHECK 6
6145
6146OnigOpInfoType OnigOpInfo[] = {
6147 { OP_FINISH, "finish", ARG_NON },
6148 { OP_END, "end", ARG_NON },
6149 { OP_EXACT1, "exact1", ARG_SPECIAL },
6150 { OP_EXACT2, "exact2", ARG_SPECIAL },
6151 { OP_EXACT3, "exact3", ARG_SPECIAL },
6152 { OP_EXACT4, "exact4", ARG_SPECIAL },
6153 { OP_EXACT5, "exact5", ARG_SPECIAL },
6154 { OP_EXACTN, "exactn", ARG_SPECIAL },
6155 { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL },
6156 { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL },
6157 { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL },
6158 { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL },
6159 { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL },
6160 { OP_EXACTMBN, "exactmbn", ARG_SPECIAL },
6161 { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL },
6162 { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL },
6163 { OP_CCLASS, "cclass", ARG_SPECIAL },
6164 { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL },
6165 { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL },
6166 { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL },
6167 { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL },
6168 { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL },
6169 { OP_ANYCHAR, "anychar", ARG_NON },
6170 { OP_ANYCHAR_ML, "anychar-ml", ARG_NON },
6171 { OP_ANYCHAR_STAR, "anychar*", ARG_NON },
6172 { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON },
6173 { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
6174 { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
6175 { OP_WORD, "word", ARG_NON },
6176 { OP_NOT_WORD, "not-word", ARG_NON },
6177 { OP_WORD_BOUND, "word-bound", ARG_NON },
6178 { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON },
6179 { OP_WORD_BEGIN, "word-begin", ARG_NON },
6180 { OP_WORD_END, "word-end", ARG_NON },
6181 { OP_ASCII_WORD, "ascii-word", ARG_NON },
6182 { OP_NOT_ASCII_WORD, "not-ascii-word", ARG_NON },
6183 { OP_ASCII_WORD_BOUND, "ascii-word-bound", ARG_NON },
6184 { OP_NOT_ASCII_WORD_BOUND,"not-ascii-word-bound", ARG_NON },
6185 { OP_ASCII_WORD_BEGIN, "ascii-word-begin", ARG_NON },
6186 { OP_ASCII_WORD_END, "ascii-word-end", ARG_NON },
6187 { OP_BEGIN_BUF, "begin-buf", ARG_NON },
6188 { OP_END_BUF, "end-buf", ARG_NON },
6189 { OP_BEGIN_LINE, "begin-line", ARG_NON },
6190 { OP_END_LINE, "end-line", ARG_NON },
6191 { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
6192 { OP_BEGIN_POSITION, "begin-position", ARG_NON },
6193 { OP_BACKREF1, "backref1", ARG_NON },
6194 { OP_BACKREF2, "backref2", ARG_NON },
6195 { OP_BACKREFN, "backrefn", ARG_MEMNUM },
6196 { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
6197 { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
6198 { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL },
6199 { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL },
6200 { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
6201 { OP_MEMORY_START, "mem-start", ARG_MEMNUM },
6202 { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM },
6203 { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM },
6204 { OP_MEMORY_END, "mem-end", ARG_MEMNUM },
6205 { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM },
6206 { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION },
6207 { OP_SET_OPTION, "set-option", ARG_OPTION },
6208 { OP_KEEP, "keep", ARG_NON },
6209 { OP_FAIL, "fail", ARG_NON },
6210 { OP_JUMP, "jump", ARG_RELADDR },
6211 { OP_PUSH, "push", ARG_RELADDR },
6212 { OP_POP, "pop", ARG_NON },
6213 { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL },
6214 { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL },
6215 { OP_REPEAT, "repeat", ARG_SPECIAL },
6216 { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
6217 { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
6218 { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
6219 { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
6220 { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
6221 { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM },
6222 { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
6223 { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
6224 { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM },
6225 { OP_PUSH_POS, "push-pos", ARG_NON },
6226 { OP_POP_POS, "pop-pos", ARG_NON },
6227 { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR },
6228 { OP_FAIL_POS, "fail-pos", ARG_NON },
6229 { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON },
6230 { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON },
6231 { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL },
6232 { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
6233 { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
6234 { OP_PUSH_ABSENT_POS, "push-absent-pos", ARG_NON },
6235 { OP_ABSENT, "absent", ARG_RELADDR },
6236 { OP_ABSENT_END, "absent-end", ARG_NON },
6237 { OP_CALL, "call", ARG_ABSADDR },
6238 { OP_RETURN, "return", ARG_NON },
6239 { OP_CONDITION, "condition", ARG_SPECIAL },
6240 { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL },
6241 { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
6242 { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK },
6243 { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK },
6244 { OP_STATE_CHECK_ANYCHAR_ML_STAR,
6245 "state-check-anychar-ml*", ARG_STATE_CHECK },
6246 { -1, "", ARG_NON }
6247};
6248
6249static const char*
6250op2name(int opcode)
6251{
6252 int i;
6253
6254 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
6255 if (opcode == OnigOpInfo[i].opcode)
6256 return OnigOpInfo[i].name;
6257 }
6258 return "";
6259}
6260
6261static int
6262op2arg_type(int opcode)
6263{
6264 int i;
6265
6266 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
6267 if (opcode == OnigOpInfo[i].opcode)
6268 return OnigOpInfo[i].arg_type;
6269 }
6270 return ARG_SPECIAL;
6271}
6272
6273# ifdef ONIG_DEBUG_PARSE_TREE
6274static void
6275Indent(FILE* f, int indent)
6276{
6277 int i;
6278 for (i = 0; i < indent; i++) putc(' ', f);
6279}
6280# endif /* ONIG_DEBUG_PARSE_TREE */
6281
6282static void
6283p_string(FILE* f, ptrdiff_t len, UChar* s)
6284{
6285 fputs(":", f);
6286 while (len-- > 0) { fputc(*s++, f); }
6287}
6288
6289static void
6290p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
6291{
6292 int x = len * mb_len;
6293
6294 fprintf(f, ":%d:", len);
6295 while (x-- > 0) { fputc(*s++, f); }
6296}
6297
6298extern void
6299onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp,
6300 OnigEncoding enc)
6301{
6302 int i, n, arg_type;
6303 RelAddrType addr;
6304 LengthType len;
6305 MemNumType mem;
6306 StateCheckNumType scn;
6307 OnigCodePoint code;
6308 UChar *q;
6309
6310 fprintf(f, "[%s", op2name(*bp));
6311 arg_type = op2arg_type(*bp);
6312 if (arg_type != ARG_SPECIAL) {
6313 bp++;
6314 switch (arg_type) {
6315 case ARG_NON:
6316 break;
6317 case ARG_RELADDR:
6318 GET_RELADDR_INC(addr, bp);
6319 fprintf(f, ":(%s%d)", (addr >= 0) ? "+" : "", addr);
6320 break;
6321 case ARG_ABSADDR:
6322 GET_ABSADDR_INC(addr, bp);
6323 fprintf(f, ":(%d)", addr);
6324 break;
6325 case ARG_LENGTH:
6326 GET_LENGTH_INC(len, bp);
6327 fprintf(f, ":%d", len);
6328 break;
6329 case ARG_MEMNUM:
6330 mem = *((MemNumType* )bp);
6331 bp += SIZE_MEMNUM;
6332 fprintf(f, ":%d", mem);
6333 break;
6334 case ARG_OPTION:
6335 {
6336 OnigOptionType option = *((OnigOptionType* )bp);
6337 bp += SIZE_OPTION;
6338 fprintf(f, ":%d", option);
6339 }
6340 break;
6341
6342 case ARG_STATE_CHECK:
6343 scn = *((StateCheckNumType* )bp);
6344 bp += SIZE_STATE_CHECK_NUM;
6345 fprintf(f, ":%d", scn);
6346 break;
6347 }
6348 }
6349 else {
6350 switch (*bp++) {
6351 case OP_EXACT1:
6352 case OP_ANYCHAR_STAR_PEEK_NEXT:
6353 case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
6354 p_string(f, 1, bp++); break;
6355 case OP_EXACT2:
6356 p_string(f, 2, bp); bp += 2; break;
6357 case OP_EXACT3:
6358 p_string(f, 3, bp); bp += 3; break;
6359 case OP_EXACT4:
6360 p_string(f, 4, bp); bp += 4; break;
6361 case OP_EXACT5:
6362 p_string(f, 5, bp); bp += 5; break;
6363 case OP_EXACTN:
6364 GET_LENGTH_INC(len, bp);
6365 p_len_string(f, len, 1, bp);
6366 bp += len;
6367 break;
6368
6369 case OP_EXACTMB2N1:
6370 p_string(f, 2, bp); bp += 2; break;
6371 case OP_EXACTMB2N2:
6372 p_string(f, 4, bp); bp += 4; break;
6373 case OP_EXACTMB2N3:
6374 p_string(f, 6, bp); bp += 6; break;
6375 case OP_EXACTMB2N:
6376 GET_LENGTH_INC(len, bp);
6377 p_len_string(f, len, 2, bp);
6378 bp += len * 2;
6379 break;
6380 case OP_EXACTMB3N:
6381 GET_LENGTH_INC(len, bp);
6382 p_len_string(f, len, 3, bp);
6383 bp += len * 3;
6384 break;
6385 case OP_EXACTMBN:
6386 {
6387 int mb_len;
6388
6389 GET_LENGTH_INC(mb_len, bp);
6390 GET_LENGTH_INC(len, bp);
6391 fprintf(f, ":%d:%d:", mb_len, len);
6392 n = len * mb_len;
6393 while (n-- > 0) { fputc(*bp++, f); }
6394 }
6395 break;
6396
6397 case OP_EXACT1_IC:
6398 len = enclen(enc, bp, bpend);
6399 p_string(f, len, bp);
6400 bp += len;
6401 break;
6402 case OP_EXACTN_IC:
6403 GET_LENGTH_INC(len, bp);
6404 p_len_string(f, len, 1, bp);
6405 bp += len;
6406 break;
6407
6408 case OP_CCLASS:
6409 n = bitset_on_num((BitSetRef )bp);
6410 bp += SIZE_BITSET;
6411 fprintf(f, ":%d", n);
6412 break;
6413
6414 case OP_CCLASS_NOT:
6415 n = bitset_on_num((BitSetRef )bp);
6416 bp += SIZE_BITSET;
6417 fprintf(f, ":%d", n);
6418 break;
6419
6420 case OP_CCLASS_MB:
6421 case OP_CCLASS_MB_NOT:
6422 GET_LENGTH_INC(len, bp);
6423 q = bp;
6424# ifndef PLATFORM_UNALIGNED_WORD_ACCESS
6425 ALIGNMENT_RIGHT(q);
6426# endif
6427 GET_CODE_POINT(code, q);
6428 bp += len;
6429 fprintf(f, ":%d:%d", (int )code, len);
6430 break;
6431
6432 case OP_CCLASS_MIX:
6433 case OP_CCLASS_MIX_NOT:
6434 n = bitset_on_num((BitSetRef )bp);
6435 bp += SIZE_BITSET;
6436 GET_LENGTH_INC(len, bp);
6437 q = bp;
6438# ifndef PLATFORM_UNALIGNED_WORD_ACCESS
6439 ALIGNMENT_RIGHT(q);
6440# endif
6441 GET_CODE_POINT(code, q);
6442 bp += len;
6443 fprintf(f, ":%d:%d:%d", n, (int )code, len);
6444 break;
6445
6446 case OP_BACKREFN_IC:
6447 mem = *((MemNumType* )bp);
6448 bp += SIZE_MEMNUM;
6449 fprintf(f, ":%d", mem);
6450 break;
6451
6452 case OP_BACKREF_MULTI_IC:
6453 case OP_BACKREF_MULTI:
6454 fputs(" ", f);
6455 GET_LENGTH_INC(len, bp);
6456 for (i = 0; i < len; i++) {
6457 GET_MEMNUM_INC(mem, bp);
6458 if (i > 0) fputs(", ", f);
6459 fprintf(f, "%d", mem);
6460 }
6461 break;
6462
6463 case OP_BACKREF_WITH_LEVEL:
6464 {
6465 OnigOptionType option;
6466 LengthType level;
6467
6468 GET_OPTION_INC(option, bp);
6469 fprintf(f, ":%d", option);
6470 GET_LENGTH_INC(level, bp);
6471 fprintf(f, ":%d", level);
6472
6473 fputs(" ", f);
6474 GET_LENGTH_INC(len, bp);
6475 for (i = 0; i < len; i++) {
6476 GET_MEMNUM_INC(mem, bp);
6477 if (i > 0) fputs(", ", f);
6478 fprintf(f, "%d", mem);
6479 }
6480 }
6481 break;
6482
6483 case OP_REPEAT:
6484 case OP_REPEAT_NG:
6485 {
6486 mem = *((MemNumType* )bp);
6487 bp += SIZE_MEMNUM;
6488 addr = *((RelAddrType* )bp);
6489 bp += SIZE_RELADDR;
6490 fprintf(f, ":%d:%d", mem, addr);
6491 }
6492 break;
6493
6494 case OP_PUSH_OR_JUMP_EXACT1:
6495 case OP_PUSH_IF_PEEK_NEXT:
6496 addr = *((RelAddrType* )bp);
6497 bp += SIZE_RELADDR;
6498 fprintf(f, ":(%s%d)", (addr >= 0) ? "+" : "", addr);
6499 p_string(f, 1, bp);
6500 bp += 1;
6501 break;
6502
6503 case OP_LOOK_BEHIND:
6504 GET_LENGTH_INC(len, bp);
6505 fprintf(f, ":%d", len);
6506 break;
6507
6508 case OP_PUSH_LOOK_BEHIND_NOT:
6509 GET_RELADDR_INC(addr, bp);
6510 GET_LENGTH_INC(len, bp);
6511 fprintf(f, ":%d:(%s%d)", len, (addr >= 0) ? "+" : "", addr);
6512 break;
6513
6514 case OP_STATE_CHECK_PUSH:
6515 case OP_STATE_CHECK_PUSH_OR_JUMP:
6516 scn = *((StateCheckNumType* )bp);
6517 bp += SIZE_STATE_CHECK_NUM;
6518 addr = *((RelAddrType* )bp);
6519 bp += SIZE_RELADDR;
6520 fprintf(f, ":%d:(%s%d)", scn, (addr >= 0) ? "+" : "", addr);
6521 break;
6522
6523 case OP_CONDITION:
6524 GET_MEMNUM_INC(mem, bp);
6525 GET_RELADDR_INC(addr, bp);
6526 fprintf(f, ":%d:(%s%d)", mem, (addr >= 0) ? "+" : "", addr);
6527 break;
6528
6529 default:
6530 fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
6531 bp[-1]);
6532 }
6533 }
6534 fputs("]", f);
6535 if (nextp) *nextp = bp;
6536}
6537
6538# ifdef ONIG_DEBUG_COMPILE
6539static void
6540print_compiled_byte_code_list(FILE* f, regex_t* reg)
6541{
6542 int ncode;
6543 UChar* bp = reg->p;
6544 UChar* end = reg->p + reg->used;
6545
6546 fprintf(f, "code length: %d", reg->used);
6547
6548 ncode = -1;
6549 while (bp < end) {
6550 ncode++;
6551 if (ncode % 5 == 0)
6552 fprintf(f, "\n%ld:", bp - reg->p);
6553 else
6554 fprintf(f, " %ld:", bp - reg->p);
6555 onig_print_compiled_byte_code(f, bp, end, &bp, reg->enc);
6556 }
6557
6558 fprintf(f, "\n");
6559}
6560# endif /* ONIG_DEBUG_COMPILE */
6561
6562# ifdef ONIG_DEBUG_PARSE_TREE
6563static void
6564print_indent_tree(FILE* f, Node* node, int indent)
6565{
6566 int i, type, container_p = 0;
6567 int add = 3;
6568 UChar* p;
6569
6570 Indent(f, indent);
6571 if (IS_NULL(node)) {
6572 fprintf(f, "ERROR: null node!!!\n");
6573 exit (0);
6574 }
6575
6576 type = NTYPE(node);
6577 switch (type) {
6578 case NT_LIST:
6579 case NT_ALT:
6580 if (NTYPE(node) == NT_LIST)
6581 fprintf(f, "<list:%"PRIxPTR">\n", (intptr_t )node);
6582 else
6583 fprintf(f, "<alt:%"PRIxPTR">\n", (intptr_t )node);
6584
6585 print_indent_tree(f, NCAR(node), indent + add);
6586 while (IS_NOT_NULL(node = NCDR(node))) {
6587 if (NTYPE(node) != type) {
6588 fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
6589 exit(0);
6590 }
6591 print_indent_tree(f, NCAR(node), indent + add);
6592 }
6593 break;
6594
6595 case NT_STR:
6596 fprintf(f, "<string%s:%"PRIxPTR">",
6597 (NSTRING_IS_RAW(node) ? "-raw" : ""), (intptr_t )node);
6598 for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
6599 if (*p >= 0x20 && *p < 0x7f)
6600 fputc(*p, f);
6601 else {
6602 fprintf(f, " 0x%02x", *p);
6603 }
6604 }
6605 break;
6606
6607 case NT_CCLASS:
6608 fprintf(f, "<cclass:%"PRIxPTR">", (intptr_t )node);
6609 if (IS_NCCLASS_NOT(NCCLASS(node))) fputs("not ", f);
6610 if (NCCLASS(node)->mbuf) {
6611 BBuf* bbuf = NCCLASS(node)->mbuf;
6612 OnigCodePoint* data = (OnigCodePoint* )bbuf->p;
6613 OnigCodePoint* end = (OnigCodePoint* )(bbuf->p + bbuf->used);
6614 fprintf(f, "%d", *data++);
6615 for (; data < end; data+=2) {
6616 fprintf(f, ",");
6617 fprintf(f, "%04x-%04x", data[0], data[1]);
6618 }
6619 }
6620 break;
6621
6622 case NT_CTYPE:
6623 fprintf(f, "<ctype:%"PRIxPTR"> ", (intptr_t )node);
6624 switch (NCTYPE(node)->ctype) {
6625 case ONIGENC_CTYPE_WORD:
6626 if (NCTYPE(node)->not != 0)
6627 fputs("not word", f);
6628 else
6629 fputs("word", f);
6630 break;
6631
6632 default:
6633 fprintf(f, "ERROR: undefined ctype.\n");
6634 exit(0);
6635 }
6636 break;
6637
6638 case NT_CANY:
6639 fprintf(f, "<anychar:%"PRIxPTR">", (intptr_t )node);
6640 break;
6641
6642 case NT_ANCHOR:
6643 fprintf(f, "<anchor:%"PRIxPTR"> ", (intptr_t )node);
6644 switch (NANCHOR(node)->type) {
6645 case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
6646 case ANCHOR_END_BUF: fputs("end buf", f); break;
6647 case ANCHOR_BEGIN_LINE: fputs("begin line", f); break;
6648 case ANCHOR_END_LINE: fputs("end line", f); break;
6649 case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
6650 case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
6651
6652 case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
6653 case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
6654# ifdef USE_WORD_BEGIN_END
6655 case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
6656 case ANCHOR_WORD_END: fputs("word end", f); break;
6657# endif
6658 case ANCHOR_PREC_READ: fputs("prec read", f); container_p = TRUE; break;
6659 case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); container_p = TRUE; break;
6660 case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); container_p = TRUE; break;
6661 case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); container_p = TRUE; break;
6662 case ANCHOR_KEEP: fputs("keep",f); break;
6663
6664 default:
6665 fprintf(f, "ERROR: undefined anchor type.\n");
6666 break;
6667 }
6668 break;
6669
6670 case NT_BREF:
6671 {
6672 int* p;
6673 BRefNode* br = NBREF(node);
6674 p = BACKREFS_P(br);
6675 fprintf(f, "<backref:%"PRIxPTR">", (intptr_t )node);
6676 for (i = 0; i < br->back_num; i++) {
6677 if (i > 0) fputs(", ", f);
6678 fprintf(f, "%d", p[i]);
6679 }
6680 }
6681 break;
6682
6683# ifdef USE_SUBEXP_CALL
6684 case NT_CALL:
6685 {
6686 CallNode* cn = NCALL(node);
6687 fprintf(f, "<call:%"PRIxPTR">", (intptr_t )node);
6688 p_string(f, cn->name_end - cn->name, cn->name);
6689 }
6690 break;
6691# endif
6692
6693 case NT_QTFR:
6694 fprintf(f, "<quantifier:%"PRIxPTR">{%d,%d}%s\n", (intptr_t )node,
6695 NQTFR(node)->lower, NQTFR(node)->upper,
6696 (NQTFR(node)->greedy ? "" : "?"));
6697 print_indent_tree(f, NQTFR(node)->target, indent + add);
6698 break;
6699
6700 case NT_ENCLOSE:
6701 fprintf(f, "<enclose:%"PRIxPTR"> ", (intptr_t )node);
6702 switch (NENCLOSE(node)->type) {
6703 case ENCLOSE_OPTION:
6704 fprintf(f, "option:%d", NENCLOSE(node)->option);
6705 break;
6706 case ENCLOSE_MEMORY:
6707 fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
6708 break;
6709 case ENCLOSE_STOP_BACKTRACK:
6710 fprintf(f, "stop-bt");
6711 break;
6712 case ENCLOSE_CONDITION:
6713 fprintf(f, "condition:%d", NENCLOSE(node)->regnum);
6714 break;
6715 case ENCLOSE_ABSENT:
6716 fprintf(f, "absent");
6717 break;
6718
6719 default:
6720 break;
6721 }
6722 fprintf(f, "\n");
6723 print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6724 break;
6725
6726 default:
6727 fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
6728 break;
6729 }
6730
6731 if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
6732 type != NT_ENCLOSE)
6733 fprintf(f, "\n");
6734
6735 if (container_p) print_indent_tree(f, NANCHOR(node)->target, indent + add);
6736
6737 fflush(f);
6738}
6739
6740static void
6741print_tree(FILE* f, Node* node)
6742{
6743 print_indent_tree(f, node, 0);
6744}
6745# endif /* ONIG_DEBUG_PARSE_TREE */
6746#endif /* ONIG_DEBUG */
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define xrealloc
Old name of ruby_xrealloc.
Definition xmalloc.h:56
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
int len
Length of the buffer.
Definition io.h:8
VALUE type(ANYARGS)
ANYARGS-ed function type.