Ruby 4.1.0dev (2026-05-15 revision a8bcae043f931d9b79f1cb1fe2c021985d07b984)
string.c (a8bcae043f931d9b79f1cb1fe2c021985d07b984)
/**********************************************************************

  string.c -

  $Author$
  created at: Mon Aug  9 17:12:58 JST 1993

  Copyright (C) 1993-2007 Yukihiro Matsumoto
  Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
  Copyright (C) 2000  Information-technology Promotion Agency, Japan

**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby/ractor.h"
49#include "ruby_assert.h"
50#include "shape.h"
51#include "vm_sync.h"
53
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
56# include <crypt.h>
57# endif
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
61#endif
62
/*
 * Remove any macro definitions of the following names so that the
 * identifiers can be used as plain function names in this file (for
 * example, rb_str_new() is defined as a real function further down).
 * NOTE(review): the macros are presumably installed by the public
 * headers included above -- confirm.
 */
#undef rb_str_new
#undef rb_usascii_str_new
#undef rb_utf8_str_new
#undef rb_enc_str_new
#undef rb_str_new_cstr
#undef rb_usascii_str_new_cstr
#undef rb_utf8_str_new_cstr
#undef rb_enc_str_new_cstr
#undef rb_external_str_new_cstr
#undef rb_locale_str_new_cstr
#undef rb_str_dup_frozen
#undef rb_str_buf_new_cstr
#undef rb_str_buf_cat
#undef rb_str_buf_cat2
#undef rb_str_cat2
#undef rb_str_cat_cstr
#undef rb_fstring_cstr
80
83
/* Flags of RString
 *
 * 0:     STR_SHARED (equal to ELTS_SHARED)
 *            The string is shared.  The buffer this string points to is
 *            owned by another string (the shared root).
 * 1:     RSTRING_NOEMBED
 *            The string is not embedded.  When a string is embedded, the
 *            contents follow the header.  When a string is not embedded,
 *            the contents are in a separately allocated buffer.
 * 2:     STR_CHILLED_LITERAL (will be frozen in a future version)
 *            The string was allocated as a literal in a file without an
 *            explicit `frozen_string_literal` comment.
 *            It emits a deprecation warning when mutated for the first time.
 * 3:     STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
 *            The string was allocated by the `Symbol#to_s` method.
 *            It emits a deprecation warning when mutated for the first time.
 * 4:     STR_PRECOMPUTED_HASH
 *            The string is embedded and has its precomputed hashcode stored
 *            after the terminator.
 * 5:     STR_SHARED_ROOT
 *            Other strings may point to the contents of this string.  When
 *            this flag is set, STR_SHARED must not be set.
 * 6:     STR_BORROWED
 *            When RSTRING_NOEMBED is set and klass is 0, this string is
 *            unsafe to be unshared by rb_str_tmp_frozen_release.
 * 7:     STR_TMPLOCK
 *            The pointer to the buffer is passed to a system call such as
 *            read(2).  Any modification and realloc is prohibited.
 * 8-9:   ENC_CODERANGE
 *            Stores the coderange of the string.
 * 10-16: ENCODING
 *            Stores the encoding of the string.
 * 17:    RSTRING_FSTR
 *            The string is a fstring.  The string is deduplicated in the
 *            fstring table.
 * 18:    STR_NOFREE
 *            Do not free this string's buffer when the string is reclaimed
 *            by the garbage collector.  Used when the string buffer is a C
 *            string literal.
 * 19:    STR_FAKESTR
 *            The string is not allocated or managed by the garbage
 *            collector.  Typically, the string object header
 *            (struct RString) is temporarily allocated on the C stack.
 */
127
/* NOTE(review): presumably the maximum byte length of a single character
 * in any supported encoding -- confirm against the users of this macro. */
#define RUBY_MAX_CHAR_LEN 16

/* String-specific user flag bits; the meaning of each bit is described in
 * the "Flags of RString" table above. */
#define STR_PRECOMPUTED_HASH FL_USER4
#define STR_SHARED_ROOT FL_USER5
#define STR_BORROWED FL_USER6
#define STR_TMPLOCK FL_USER7
#define STR_NOFREE FL_USER18
#define STR_FAKESTR FL_USER19
135
/* Mark `str` as a heap (non-embedded) string and clear the sharing-related
 * flags (STR_SHARED, STR_SHARED_ROOT, STR_BORROWED). */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
} while (0)

/* Mark `str` as embedded by clearing STR_NOEMBED, STR_SHARED and
 * STR_NOFREE. */
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
141
/* Store `n` as the length of `str`.  Only the length field is written;
 * the terminator is not touched. */
#define STR_SET_LEN(str, n) do { \
    RSTRING(str)->len = (n); \
} while (0)
145
/* Terminator length of `str` in bytes: 1 on the encoding fast path,
 * otherwise the minimum character length of the string's encoding. */
#define TERM_LEN(str) (rb_str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))

/* Write `termlen` zero bytes at `ptr`.  The first byte is always stored
 * directly; memset() is used only for the uncommon multi-byte terminator.
 * Both arguments are evaluated exactly once. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    *term_fill_ptr = '\0';\
    if (UNLIKELY(term_fill_len > 1))\
        memset(term_fill_ptr, 0, term_fill_len);\
} while (0)
154
/* Resize the buffer of `str` to hold `capacity` bytes plus the terminator
 * of its current encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/*
 * Resize the buffer of `str` to `capacity` + `termlen` bytes.
 *
 * - Embedded string: nothing happens while the embedded area is large
 *   enough; otherwise a heap buffer is allocated, the embedded area is
 *   copied into it and the string is switched to the heap representation.
 * - Heap string: the buffer is reallocated in place.  The string must not
 *   be shared (asserted).
 *
 * `capacity` and `termlen` are parenthesised at every use so that
 * expression arguments keep their intended precedence.
 */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), str_embed_capa(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
178
/*
 * Make `str` share the buffer owned by `shared_str`.
 *
 * Does nothing for fake strings (STR_FAKESTR).  Otherwise it asserts that
 * the data pointer of `str` lies within the buffer of `shared_str`, records
 * `shared_str` in as.heap.aux.shared through the write barrier, sets
 * STR_SHARED on `str`, registers `str` as a pinning object with the GC, and
 * flags `shared_str` as STR_SHARED_ROOT.  A root whose class is 0 is
 * additionally flagged STR_BORROWED.
 */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        rb_gc_register_pinning_obj(str); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
    } \
} while (0)
191
/* Heap buffer pointer of a non-embedded string. */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
/* Heap buffer size in bytes: recorded capacity plus the terminator. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */

/* Encoding object of `str`, looked up from its encoding index. */
#define STR_ENC_GET(str) get_encoding(str)
197
/*
 * Return true when the first `n` bytes at `s` are all zero.
 * A count of zero or less reads nothing and yields true.
 */
static inline bool
zero_filled(const char *s, int n)
{
    for (; n > 0; --n) {
        if (*s++) return false;
    }
    return true;
}
206
207#if !defined SHARABLE_MIDDLE_SUBSTRING
208# define SHARABLE_MIDDLE_SUBSTRING 0
209#endif
210
211static inline bool
212SHARABLE_SUBSTRING_P(VALUE str, long beg, long len)
213{
214#if SHARABLE_MIDDLE_SUBSTRING
215 return true;
216#else
217 long end = beg + len;
218 long source_len = RSTRING_LEN(str);
219 return end == source_len || zero_filled(RSTRING_PTR(str) + end, TERM_LEN(str));
220#endif
221}
222
223static inline long
224str_embed_capa(VALUE str)
225{
226 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
227}
228
229bool
230rb_str_reembeddable_p(VALUE str)
231{
232 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
233}
234
235static inline size_t
236rb_str_embed_size(long capa, long termlen)
237{
238 size_t size = offsetof(struct RString, as.embed.ary) + capa + termlen;
239 if (size < sizeof(struct RString)) size = sizeof(struct RString);
240 return size;
241}
242
243size_t
244rb_str_size_as_embedded(VALUE str)
245{
246 size_t real_size;
247 if (STR_EMBED_P(str)) {
248 size_t capa = RSTRING(str)->len;
249 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
250
251 real_size = rb_str_embed_size(capa, TERM_LEN(str));
252 }
253 /* if the string is not currently embedded, but it can be embedded, how
254 * much space would it require */
255 else if (rb_str_reembeddable_p(str)) {
256 size_t capa = RSTRING(str)->as.heap.aux.capa;
257 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
258
259 real_size = rb_str_embed_size(capa, TERM_LEN(str));
260 }
261 else {
262 real_size = sizeof(struct RString);
263 }
264
265 return real_size;
266}
267
/* True when an embedded string of `len` bytes plus a `termlen`-byte
 * terminator has a size the GC reports as allocatable. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    return rb_gc_size_allocatable_p(rb_str_embed_size(len, termlen));
}
273
274static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
275static VALUE str_new_frozen(VALUE klass, VALUE orig);
276static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
277static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
278static VALUE str_new(VALUE klass, const char *ptr, long len);
279static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
280static inline void str_modifiable(VALUE str);
281static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
282static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
283
284static inline void
285str_make_independent(VALUE str)
286{
287 long len = RSTRING_LEN(str);
288 int termlen = TERM_LEN(str);
289 str_make_independent_expand((str), len, 0L, termlen);
290}
291
292static inline int str_dependent_p(VALUE str);
293
294void
295rb_str_make_independent(VALUE str)
296{
297 if (str_dependent_p(str)) {
298 str_make_independent(str);
299 }
300}
301
302void
303rb_str_make_embedded(VALUE str)
304{
305 RUBY_ASSERT(rb_str_reembeddable_p(str));
306 RUBY_ASSERT(!STR_EMBED_P(str));
307
308 int termlen = TERM_LEN(str);
309 char *buf = RSTRING(str)->as.heap.ptr;
310 long old_capa = RSTRING(str)->as.heap.aux.capa + termlen;
311 long len = RSTRING(str)->len;
312
313 STR_SET_EMBED(str);
314 STR_SET_LEN(str, len);
315
316 if (len > 0) {
317 memcpy(RSTRING_PTR(str), buf, len);
318 SIZED_FREE_N(buf, old_capa);
319 }
320
321 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
322}
323
/* Print a diagnostic to stderr saying that `func` is about to return a
 * NULL string pointer.  Only reports; it does not abort. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr, "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
333
334/* symbols for [up|down|swap]case/capitalize options */
335static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
336
337static rb_encoding *
338get_encoding(VALUE str)
339{
340 return rb_enc_from_index(ENCODING_GET(str));
341}
342
343static void
344mustnot_broken(VALUE str)
345{
346 if (is_broken_string(str)) {
347 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
348 }
349}
350
351static void
352mustnot_wchar(VALUE str)
353{
354 rb_encoding *enc = STR_ENC_GET(str);
355 if (rb_enc_mbminlen(enc) > 1) {
356 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
357 }
358}
359
360static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
361
362#if SIZEOF_LONG == SIZEOF_VOIDP
363#define PRECOMPUTED_FAKESTR_HASH 1
364#else
365#endif
366
367static inline bool
368BARE_STRING_P(VALUE str)
369{
370 return RBASIC_CLASS(str) == rb_cString && !rb_obj_shape_has_ivars(str);
371}
372
373static inline st_index_t
374str_do_hash(VALUE str)
375{
376 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
377 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
378 if (e && !is_ascii_string(str)) {
379 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
380 }
381 return h;
382}
383
384static VALUE
385str_store_precomputed_hash(VALUE str, st_index_t hash)
386{
387 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
388 RUBY_ASSERT(STR_EMBED_P(str));
389
390#if RUBY_DEBUG
391 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
392 size_t free_bytes = str_embed_capa(str) - used_bytes;
393 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
394#endif
395
396 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
397
398 FL_SET(str, STR_PRECOMPUTED_HASH);
399
400 return str;
401}
402
403VALUE
404rb_fstring(VALUE str)
405{
406 VALUE fstr;
407 int bare;
408
409 Check_Type(str, T_STRING);
410
411 if (FL_TEST(str, RSTRING_FSTR))
412 return str;
413
414 bare = BARE_STRING_P(str);
415 if (!bare) {
416 if (STR_EMBED_P(str)) {
417 OBJ_FREEZE(str);
418 return str;
419 }
420
421 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
423 return str;
424 }
425 }
426
427 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
428 rb_str_resize(str, RSTRING_LEN(str));
429
430 fstr = register_fstring(str, false, false);
431
432 if (!bare) {
433 str_replace_shared_without_enc(str, fstr);
434 OBJ_FREEZE(str);
435 return str;
436 }
437 return fstr;
438}
439
440static VALUE fstring_table_obj;
441
442static VALUE
443fstring_concurrent_set_hash(VALUE str)
444{
445#ifdef PRECOMPUTED_FAKESTR_HASH
446 st_index_t h;
447 if (FL_TEST_RAW(str, STR_FAKESTR)) {
448 // register_fstring precomputes the hash and stores it in capa for fake strings
449 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
450 }
451 else {
452 h = rb_str_hash(str);
453 }
454 // rb_str_hash doesn't include the encoding for ascii only strings, so
455 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
456 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
457#else
458 return (VALUE)rb_str_hash(str);
459#endif
460}
461
462static bool
463fstring_concurrent_set_cmp(VALUE a, VALUE b)
464{
465 long alen, blen;
466 const char *aptr, *bptr;
467
470
471 RSTRING_GETMEM(a, aptr, alen);
472 RSTRING_GETMEM(b, bptr, blen);
473 return (alen == blen &&
474 ENCODING_GET(a) == ENCODING_GET(b) &&
475 memcmp(aptr, bptr, alen) == 0);
476}
477
/* Options handed from register_fstring() to the table's create callback. */
struct fstr_create_arg {
    bool copy;                  /* copy the bytes of a fake string instead of referencing them */
    bool force_precompute_hash; /* prefer an embedded copy that stores its hash */
};
482
483static VALUE
484fstring_concurrent_set_create(VALUE str, void *data)
485{
486 struct fstr_create_arg *arg = data;
487
488 // Unless the string is empty or binary, its coderange has been precomputed.
489 int coderange = ENC_CODERANGE(str);
490
491 if (FL_TEST_RAW(str, STR_FAKESTR)) {
492 if (arg->copy) {
493 VALUE new_str;
494 long len = RSTRING_LEN(str);
495 long capa = len + sizeof(st_index_t);
496 int term_len = TERM_LEN(str);
497
498 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
499 new_str = str_alloc_embed(rb_cString, capa + term_len);
500 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
501 STR_SET_LEN(new_str, RSTRING_LEN(str));
502 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
503 rb_enc_copy(new_str, str);
504 str_store_precomputed_hash(new_str, str_do_hash(str));
505 }
506 else {
507 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
508 rb_enc_copy(new_str, str);
509#ifdef PRECOMPUTED_FAKESTR_HASH
510 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
511 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
512 }
513#endif
514 }
515 str = new_str;
516 }
517 else {
518 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
519 RSTRING(str)->len,
520 ENCODING_GET(str));
521 }
522 OBJ_FREEZE(str);
523 }
524 else {
525 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
526 str = str_new_frozen(rb_cString, str);
527 }
528 if (STR_SHARED_P(str)) { /* str should not be shared */
529 /* shared substring */
530 str_make_independent(str);
532 }
533 if (!BARE_STRING_P(str)) {
534 str = str_new_frozen(rb_cString, str);
535 }
536 }
537
538 ENC_CODERANGE_SET(str, coderange);
539 RBASIC(str)->flags |= RSTRING_FSTR;
540 if (!RB_OBJ_SHAREABLE_P(str)) {
541 RB_OBJ_SET_SHAREABLE(str);
542 }
543 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
546 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
547 RUBY_ASSERT(!rb_obj_shape_has_ivars(str));
549 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
550
551 return str;
552}
553
554static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
555 .hash = fstring_concurrent_set_hash,
556 .cmp = fstring_concurrent_set_cmp,
557 .create = fstring_concurrent_set_create,
558 .free = NULL,
559};
560
561void
562Init_fstring_table(void)
563{
564 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
565 rb_gc_register_address(&fstring_table_obj);
566}
567
568static VALUE
569register_fstring(VALUE str, bool copy, bool force_precompute_hash)
570{
571 struct fstr_create_arg args = {
572 .copy = copy,
573 .force_precompute_hash = force_precompute_hash
574 };
575
576#if SIZEOF_VOIDP == SIZEOF_LONG
577 if (FL_TEST_RAW(str, STR_FAKESTR)) {
578 // if the string hasn't been interned, we'll need the hash twice, so we
579 // compute it once and store it in capa
580 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
581 }
582#endif
583
584 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
585
586 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
588 RUBY_ASSERT(OBJ_FROZEN(result));
590 RUBY_ASSERT((rb_gc_verify_shareable(result), 1));
591 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
593
594 return result;
595}
596
597bool
598rb_obj_is_fstring_table(VALUE obj)
599{
600 ASSERT_vm_locking();
601
602 return obj == fstring_table_obj;
603}
604
605void
606rb_gc_free_fstring(VALUE obj)
607{
608 ASSERT_vm_locking_with_barrier();
609
610 RUBY_ASSERT(FL_TEST(obj, RSTRING_FSTR));
612 RUBY_ASSERT(!FL_TEST(obj, STR_SHARED));
613
614 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
615
616 RB_DEBUG_COUNTER_INC(obj_str_fstr);
617
618 FL_UNSET(obj, RSTRING_FSTR);
619}
620
621void
622rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
623{
624 if (fstring_table_obj) {
625 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
626 }
627}
628
629static VALUE
630setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
631{
632 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
633 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
634
635 if (!name) {
637 name = "";
638 }
639
640 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
641
642 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
643 fake_str->len = len;
644 fake_str->as.heap.ptr = (char *)name;
645 fake_str->as.heap.aux.capa = len;
646 return (VALUE)fake_str;
647}
648
649/*
650 * set up a fake string which refers a static string literal.
651 */
652VALUE
653rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
654{
655 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
656}
657
658/*
659 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
660 * shared string which refers a static string literal. `ptr` must
661 * point a constant string.
662 */
663VALUE
664rb_fstring_new(const char *ptr, long len)
665{
666 struct RString fake_str = {RBASIC_INIT};
667 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
668}
669
670VALUE
671rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
672{
673 struct RString fake_str = {RBASIC_INIT};
674 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
675}
676
677VALUE
678rb_fstring_cstr(const char *ptr)
679{
680 return rb_fstring_new(ptr, strlen(ptr));
681}
682
683static inline bool
684single_byte_optimizable(VALUE str)
685{
686 int encindex = ENCODING_GET(str);
687 switch (encindex) {
688 case ENCINDEX_ASCII_8BIT:
689 case ENCINDEX_US_ASCII:
690 return true;
691 case ENCINDEX_UTF_8:
692 // For UTF-8 it's worth scanning the string coderange when unknown.
693 return rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT;
694 }
695 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
696 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
697 return true;
698 }
699
700 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
701 return true;
702 }
703
704 /* Conservative. Possibly single byte.
705 * "\xa1" in Shift_JIS for example. */
706 return false;
707}
708
710
711static inline const char *
712search_nonascii(const char *p, const char *e)
713{
714 const char *s, *t;
715
716#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
717# if SIZEOF_UINTPTR_T == 8
718# define NONASCII_MASK UINT64_C(0x8080808080808080)
719# elif SIZEOF_UINTPTR_T == 4
720# define NONASCII_MASK UINT32_C(0x80808080)
721# else
722# error "don't know what to do."
723# endif
724#else
725# if SIZEOF_UINTPTR_T == 8
726# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
727# elif SIZEOF_UINTPTR_T == 4
728# define NONASCII_MASK 0x80808080UL /* or...? */
729# else
730# error "don't know what to do."
731# endif
732#endif
733
734 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
735#if !UNALIGNED_WORD_ACCESS
736 if ((uintptr_t)p % SIZEOF_VOIDP) {
737 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
738 p += l;
739 switch (l) {
740 default: UNREACHABLE;
741#if SIZEOF_VOIDP > 4
742 case 7: if (p[-7]&0x80) return p-7;
743 case 6: if (p[-6]&0x80) return p-6;
744 case 5: if (p[-5]&0x80) return p-5;
745 case 4: if (p[-4]&0x80) return p-4;
746#endif
747 case 3: if (p[-3]&0x80) return p-3;
748 case 2: if (p[-2]&0x80) return p-2;
749 case 1: if (p[-1]&0x80) return p-1;
750 case 0: break;
751 }
752 }
753#endif
754#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
755#define aligned_ptr(value) \
756 __builtin_assume_aligned((value), sizeof(uintptr_t))
757#else
758#define aligned_ptr(value) (value)
759#endif
760 s = aligned_ptr(p);
761 t = (e - (SIZEOF_VOIDP-1));
762#undef aligned_ptr
763 for (;s < t; s += sizeof(uintptr_t)) {
764 uintptr_t word;
765 memcpy(&word, s, sizeof(word));
766 if (word & NONASCII_MASK) {
767#ifdef WORDS_BIGENDIAN
768 return (const char *)s + (nlz_intptr(word&NONASCII_MASK)>>3);
769#else
770 return (const char *)s + (ntz_intptr(word&NONASCII_MASK)>>3);
771#endif
772 }
773 }
774 p = (const char *)s;
775 }
776
777 switch (e - p) {
778 default: UNREACHABLE;
779#if SIZEOF_VOIDP > 4
780 case 7: if (e[-7]&0x80) return e-7;
781 case 6: if (e[-6]&0x80) return e-6;
782 case 5: if (e[-5]&0x80) return e-5;
783 case 4: if (e[-4]&0x80) return e-4;
784#endif
785 case 3: if (e[-3]&0x80) return e-3;
786 case 2: if (e[-2]&0x80) return e-2;
787 case 1: if (e[-1]&0x80) return e-1;
788 case 0: return NULL;
789 }
790}
791
792static int
793coderange_scan(const char *p, long len, rb_encoding *enc)
794{
795 const char *e = p + len;
796
797 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
798 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
799 p = search_nonascii(p, e);
801 }
802
803 if (rb_enc_asciicompat(enc)) {
804 p = search_nonascii(p, e);
805 if (!p) return ENC_CODERANGE_7BIT;
806 for (;;) {
807 int ret = rb_enc_precise_mbclen(p, e, enc);
809 p += MBCLEN_CHARFOUND_LEN(ret);
810 if (p == e) break;
811 p = search_nonascii(p, e);
812 if (!p) break;
813 }
814 }
815 else {
816 while (p < e) {
817 int ret = rb_enc_precise_mbclen(p, e, enc);
819 p += MBCLEN_CHARFOUND_LEN(ret);
820 }
821 }
822 return ENC_CODERANGE_VALID;
823}
824
825long
826rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
827{
828 const char *p = s;
829
830 if (*cr == ENC_CODERANGE_BROKEN)
831 return e - s;
832
833 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
834 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
835 if (*cr == ENC_CODERANGE_VALID) return e - s;
836 p = search_nonascii(p, e);
838 return e - s;
839 }
840 else if (rb_enc_asciicompat(enc)) {
841 p = search_nonascii(p, e);
842 if (!p) {
843 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
844 return e - s;
845 }
846 for (;;) {
847 int ret = rb_enc_precise_mbclen(p, e, enc);
848 if (!MBCLEN_CHARFOUND_P(ret)) {
850 return p - s;
851 }
852 p += MBCLEN_CHARFOUND_LEN(ret);
853 if (p == e) break;
854 p = search_nonascii(p, e);
855 if (!p) break;
856 }
857 }
858 else {
859 while (p < e) {
860 int ret = rb_enc_precise_mbclen(p, e, enc);
861 if (!MBCLEN_CHARFOUND_P(ret)) {
863 return p - s;
864 }
865 p += MBCLEN_CHARFOUND_LEN(ret);
866 }
867 }
869 return e - s;
870}
871
872static inline void
873str_enc_copy(VALUE str1, VALUE str2)
874{
875 rb_enc_set_index(str1, ENCODING_GET(str2));
876}
877
878/* Like str_enc_copy, but does not check frozen status of str1.
879 * You should use this only if you're certain that str1 is not frozen. */
880static inline void
881str_enc_copy_direct(VALUE str1, VALUE str2)
882{
883 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
884 if (inlined_encoding == ENCODING_INLINE_MAX) {
885 rb_enc_set_index(str1, rb_enc_get_index(str2));
886 }
887 else {
888 ENCODING_SET_INLINED(str1, inlined_encoding);
889 }
890}
891
892static void
893rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
894{
895 /* this function is designed for copying encoding and coderange
896 * from src to new string "dest" which is made from the part of src.
897 */
898 str_enc_copy(dest, src);
899 if (RSTRING_LEN(dest) == 0) {
900 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
902 else
904 return;
905 }
906 switch (ENC_CODERANGE(src)) {
909 break;
911 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
912 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
914 else
916 break;
917 default:
918 break;
919 }
920}
921
922static void
923rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
924{
925 str_enc_copy(dest, src);
927}
928
929static int
930enc_coderange_scan(VALUE str, rb_encoding *enc)
931{
932 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
933}
934
935int
936rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
937{
938 return enc_coderange_scan(str, enc);
939}
940
941int
942rbimpl_enc_str_coderange_scan(VALUE str)
943{
944 int cr = enc_coderange_scan(str, get_encoding(str));
945 ENC_CODERANGE_SET(str, cr);
946 return cr;
947}
948
949#undef rb_enc_str_coderange
950int
951rb_enc_str_coderange(VALUE str)
952{
953 int cr = ENC_CODERANGE(str);
954
955 if (cr == ENC_CODERANGE_UNKNOWN) {
956 cr = rbimpl_enc_str_coderange_scan(str);
957 }
958 return cr;
959}
960#define rb_enc_str_coderange rb_enc_str_coderange_inline
961
962static inline bool
963rb_enc_str_asciicompat(VALUE str)
964{
965 int encindex = ENCODING_GET_INLINED(str);
966 return rb_str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
967}
968
969int
971{
972 switch(ENC_CODERANGE(str)) {
974 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
976 return true;
977 default:
978 return false;
979 }
980}
981
982static inline void
983str_mod_check(VALUE s, const char *p, long len)
984{
985 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
986 rb_raise(rb_eRuntimeError, "string modified");
987 }
988}
989
990static size_t
991str_capacity(VALUE str, const int termlen)
992{
993 if (STR_EMBED_P(str)) {
994 return str_embed_capa(str) - termlen;
995 }
996 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
997 return RSTRING(str)->len;
998 }
999 else {
1000 return RSTRING(str)->as.heap.aux.capa;
1001 }
1002}
1003
1004size_t
1006{
1007 return str_capacity(str, TERM_LEN(str));
1008}
1009
1010static inline void
1011must_not_null(const char *ptr)
1012{
1013 if (!ptr) {
1014 rb_raise(rb_eArgError, "NULL pointer given");
1015 }
1016}
1017
1018static inline VALUE
1019str_alloc_embed(VALUE klass, size_t capa)
1020{
1021 size_t size = rb_str_embed_size(capa, 0);
1022 RUBY_ASSERT(size > 0);
1023 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1024
1025 NEWOBJ_OF(str, struct RString, klass, T_STRING, size);
1026
1027 str->len = 0;
1028 str->as.embed.ary[0] = 0;
1029
1030 return (VALUE)str;
1031}
1032
1033static inline VALUE
1034str_alloc_heap(VALUE klass)
1035{
1036 NEWOBJ_OF(str, struct RString, klass, T_STRING | STR_NOEMBED, sizeof(struct RString));
1037
1038 str->len = 0;
1039 str->as.heap.aux.capa = 0;
1040 str->as.heap.ptr = NULL;
1041
1042 return (VALUE)str;
1043}
1044
1045static inline VALUE
1046empty_str_alloc(VALUE klass)
1047{
1048 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1049 VALUE str = str_alloc_embed(klass, 0);
1050 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1052 return str;
1053}
1054
1055static VALUE
1056str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1057{
1058 VALUE str;
1059
1060 if (len < 0) {
1061 rb_raise(rb_eArgError, "negative string size (or size too big)");
1062 }
1063
1064 if (enc == NULL) {
1065 enc = rb_ascii8bit_encoding();
1066 }
1067
1068 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1069
1070 int termlen = rb_enc_mbminlen(enc);
1071
1072 if (STR_EMBEDDABLE_P(len, termlen)) {
1073 str = str_alloc_embed(klass, len + termlen);
1074 if (len == 0) {
1075 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1076 }
1077 }
1078 else {
1079 str = str_alloc_heap(klass);
1080 RSTRING(str)->as.heap.aux.capa = len;
1081 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1082 * integer overflow. If we can STATIC_ASSERT that, the following
1083 * mul_add_mul can be reverted to a simple ALLOC_N. */
1084 RSTRING(str)->as.heap.ptr =
1085 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1086 }
1087
1088 rb_enc_raw_set(str, enc);
1089
1090 if (ptr) {
1091 memcpy(RSTRING_PTR(str), ptr, len);
1092 }
1093 else {
1094 memset(RSTRING_PTR(str), 0, len);
1095 }
1096
1097 STR_SET_LEN(str, len);
1098 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1099 return str;
1100}
1101
1102static VALUE
1103str_new(VALUE klass, const char *ptr, long len)
1104{
1105 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1106}
1107
1108VALUE
1109rb_str_new(const char *ptr, long len)
1110{
1111 return str_new(rb_cString, ptr, len);
1112}
1113
1114VALUE
1115rb_usascii_str_new(const char *ptr, long len)
1116{
1117 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1118}
1119
1120VALUE
1121rb_utf8_str_new(const char *ptr, long len)
1122{
1123 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1124}
1125
1126VALUE
1127rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1128{
1129 return str_enc_new(rb_cString, ptr, len, enc);
1130}
1131
1132VALUE
1134{
1135 must_not_null(ptr);
1136 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1137 * memory regions, and that cannot be detected by the MSAN. Just
1138 * trust the programmer that the argument passed here is a sane C
1139 * string. */
1140 __msan_unpoison_string(ptr);
1141 return rb_str_new(ptr, strlen(ptr));
1142}
1143
1144VALUE
1146{
1147 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1148}
1149
1150VALUE
1152{
1153 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1154}
1155
1156VALUE
1158{
1159 must_not_null(ptr);
1160 if (rb_enc_mbminlen(enc) != 1) {
1161 rb_raise(rb_eArgError, "wchar encoding given");
1162 }
1163 return rb_enc_str_new(ptr, strlen(ptr), enc);
1164}
1165
1166static VALUE
1167str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1168{
1169 VALUE str;
1170
1171 if (len < 0) {
1172 rb_raise(rb_eArgError, "negative string size (or size too big)");
1173 }
1174
1175 if (!ptr) {
1176 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1177 }
1178 else {
1179 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1180 str = str_alloc_heap(klass);
1181 RSTRING(str)->len = len;
1182 RSTRING(str)->as.heap.ptr = (char *)ptr;
1183 RSTRING(str)->as.heap.aux.capa = len;
1184 RBASIC(str)->flags |= STR_NOFREE;
1185 rb_enc_associate_index(str, encindex);
1186 }
1187 return str;
1188}
1189
1190VALUE
1191rb_str_new_static(const char *ptr, long len)
1192{
1193 return str_new_static(rb_cString, ptr, len, 0);
1194}
1195
1196VALUE
1198{
1199 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1200}
1201
1202VALUE
1204{
1205 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1206}
1207
1208VALUE
1210{
1211 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1212}
1213
1214static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1215 rb_encoding *from, rb_encoding *to,
1216 int ecflags, VALUE ecopts);
1217
1218static inline bool
1219is_enc_ascii_string(VALUE str, rb_encoding *enc)
1220{
1221 int encidx = rb_enc_to_index(enc);
1222 if (rb_enc_get_index(str) == encidx)
1223 return is_ascii_string(str);
1224 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1225}
1226
1227VALUE
1228rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1229{
1230 long len;
1231 const char *ptr;
1232 VALUE newstr;
1233
1234 if (!to) return str;
1235 if (!from) from = rb_enc_get(str);
1236 if (from == to) return str;
1237 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1238 rb_is_ascii8bit_enc(to)) {
1239 if (STR_ENC_GET(str) != to) {
1240 str = rb_str_dup(str);
1241 rb_enc_associate(str, to);
1242 }
1243 return str;
1244 }
1245
1246 RSTRING_GETMEM(str, ptr, len);
1247 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1248 from, to, ecflags, ecopts);
1249 if (NIL_P(newstr)) {
1250 /* some error, return original */
1251 return str;
1252 }
1253 return newstr;
1254}
1255
1256VALUE
1257rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1258 rb_encoding *from, int ecflags, VALUE ecopts)
1259{
1260 long olen;
1261
1262 olen = RSTRING_LEN(newstr);
1263 if (ofs < -olen || olen < ofs)
1264 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1265 if (ofs < 0) ofs += olen;
1266 if (!from) {
1267 STR_SET_LEN(newstr, ofs);
1268 return rb_str_cat(newstr, ptr, len);
1269 }
1270
1271 rb_str_modify(newstr);
1272 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1273 rb_enc_get(newstr),
1274 ecflags, ecopts);
1275}
1276
1277VALUE
1278rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1279{
1280 STR_SET_LEN(str, 0);
1281 rb_enc_associate(str, enc);
1282 rb_str_cat(str, ptr, len);
1283 return str;
1284}
1285
1286static VALUE
1287str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1288 rb_encoding *from, rb_encoding *to,
1289 int ecflags, VALUE ecopts)
1290{
1291 rb_econv_t *ec;
1293 long olen;
1294 VALUE econv_wrapper;
1295 const unsigned char *start, *sp;
1296 unsigned char *dest, *dp;
1297 size_t converted_output = (size_t)ofs;
1298
1299 olen = rb_str_capacity(newstr);
1300
1301 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1302 RBASIC_CLEAR_CLASS(econv_wrapper);
1303 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1304 if (!ec) return Qnil;
1305 DATA_PTR(econv_wrapper) = ec;
1306
1307 sp = (unsigned char*)ptr;
1308 start = sp;
1309 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1310 (dp = dest + converted_output),
1311 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1313 /* destination buffer short */
1314 size_t converted_input = sp - start;
1315 size_t rest = len - converted_input;
1316 converted_output = dp - dest;
1317 rb_str_set_len(newstr, converted_output);
1318 if (converted_input && converted_output &&
1319 rest < (LONG_MAX / converted_output)) {
1320 rest = (rest * converted_output) / converted_input;
1321 }
1322 else {
1323 rest = olen;
1324 }
1325 olen += rest < 2 ? 2 : rest;
1326 rb_str_resize(newstr, olen);
1327 }
1328 DATA_PTR(econv_wrapper) = 0;
1329 RB_GC_GUARD(econv_wrapper);
1330 rb_econv_close(ec);
1331 switch (ret) {
1332 case econv_finished:
1333 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1334 rb_str_set_len(newstr, len);
1335 rb_enc_associate(newstr, to);
1336 return newstr;
1337
1338 default:
1339 return Qnil;
1340 }
1341}
1342
1343VALUE
1345{
1346 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1347}
1348
1349VALUE
1351{
1352 rb_encoding *ienc;
1353 VALUE str;
1354 const int eidx = rb_enc_to_index(eenc);
1355
1356 if (!ptr) {
1357 return rb_enc_str_new(ptr, len, eenc);
1358 }
1359
1360 /* ASCII-8BIT case, no conversion */
1361 if ((eidx == rb_ascii8bit_encindex()) ||
1362 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1363 return rb_str_new(ptr, len);
1364 }
1365 /* no default_internal or same encoding, no conversion */
1366 ienc = rb_default_internal_encoding();
1367 if (!ienc || eenc == ienc) {
1368 return rb_enc_str_new(ptr, len, eenc);
1369 }
1370 /* ASCII compatible, and ASCII only string, no conversion in
1371 * default_internal */
1372 if ((eidx == rb_ascii8bit_encindex()) ||
1373 (eidx == rb_usascii_encindex()) ||
1374 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1375 return rb_enc_str_new(ptr, len, ienc);
1376 }
1377 /* convert from the given encoding to default_internal */
1378 str = rb_enc_str_new(NULL, 0, ienc);
1379 /* when the conversion failed for some reason, just ignore the
1380 * default_internal and result in the given encoding as-is. */
1381 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1382 rb_str_initialize(str, ptr, len, eenc);
1383 }
1384 return str;
1385}
1386
1387VALUE
1388rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1389{
1390 int eidx = rb_enc_to_index(eenc);
1391 if (eidx == rb_usascii_encindex() &&
1392 !is_ascii_string(str)) {
1393 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1394 return str;
1395 }
1396 rb_enc_associate_index(str, eidx);
1397 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1398}
1399
1400VALUE
1401rb_external_str_new(const char *ptr, long len)
1402{
1403 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1404}
1405
1406VALUE
1408{
1409 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1410}
1411
1412VALUE
1413rb_locale_str_new(const char *ptr, long len)
1414{
1415 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1416}
1417
1418VALUE
1420{
1421 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1422}
1423
1424VALUE
1426{
1427 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1428}
1429
1430VALUE
1432{
1433 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1434}
1435
1436VALUE
1438{
1439 return rb_str_export_to_enc(str, rb_default_external_encoding());
1440}
1441
1442VALUE
1444{
1445 return rb_str_export_to_enc(str, rb_locale_encoding());
1446}
1447
1448VALUE
1450{
1451 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1452}
1453
1454static VALUE
1455str_replace_shared_without_enc(VALUE str2, VALUE str)
1456{
1457 const int termlen = TERM_LEN(str);
1458 char *ptr;
1459 long len;
1460
1461 RSTRING_GETMEM(str, ptr, len);
1462 if (str_embed_capa(str2) >= len + termlen) {
1463 char *ptr2 = RSTRING(str2)->as.embed.ary;
1464 STR_SET_EMBED(str2);
1465 memcpy(ptr2, RSTRING_PTR(str), len);
1466 TERM_FILL(ptr2+len, termlen);
1467 }
1468 else {
1469 VALUE root;
1470 if (STR_SHARED_P(str)) {
1471 root = RSTRING(str)->as.heap.aux.shared;
1472 RSTRING_GETMEM(str, ptr, len);
1473 }
1474 else {
1475 root = rb_str_new_frozen(str);
1476 RSTRING_GETMEM(root, ptr, len);
1477 }
1478 RUBY_ASSERT(OBJ_FROZEN(root));
1479
1480 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1481 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1482 rb_fatal("about to free a possible shared root");
1483 }
1484 char *ptr2 = STR_HEAP_PTR(str2);
1485 if (ptr2 != ptr) {
1486 SIZED_FREE_N(ptr2, STR_HEAP_SIZE(str2));
1487 }
1488 }
1489 FL_SET(str2, STR_NOEMBED);
1490 RSTRING(str2)->as.heap.ptr = ptr;
1491 STR_SET_SHARED(str2, root);
1492 }
1493
1494 STR_SET_LEN(str2, len);
1495
1496 return str2;
1497}
1498
1499static VALUE
1500str_replace_shared(VALUE str2, VALUE str)
1501{
1502 str_replace_shared_without_enc(str2, str);
1503 rb_enc_cr_str_exact_copy(str2, str);
1504 return str2;
1505}
1506
1507static VALUE
1508str_new_shared(VALUE klass, VALUE str)
1509{
1510 return str_replace_shared(str_alloc_heap(klass), str);
1511}
1512
1513VALUE
1515{
1516 return str_new_shared(rb_obj_class(str), str);
1517}
1518
1519VALUE
1521{
1522 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1523 return str_new_frozen(rb_obj_class(orig), orig);
1524}
1525
1526static VALUE
1527rb_str_new_frozen_String(VALUE orig)
1528{
1529 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1530 return str_new_frozen(rb_cString, orig);
1531}
1532
1533
1534VALUE
1535rb_str_frozen_bare_string(VALUE orig)
1536{
1537 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1538 return str_new_frozen(rb_cString, orig);
1539}
1540
1541VALUE
1542rb_str_tmp_frozen_acquire(VALUE orig)
1543{
1544 if (OBJ_FROZEN_RAW(orig)) return orig;
1545 return str_new_frozen_buffer(0, orig, FALSE);
1546}
1547
1548VALUE
1549rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1550{
1551 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1552 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1553
1554 VALUE str = str_alloc_heap(0);
1555 OBJ_FREEZE(str);
1556 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1557 FL_SET(str, STR_SHARED_ROOT);
1558
1559 size_t capa = str_capacity(orig, TERM_LEN(orig));
1560
1561 /* If the string is embedded then we want to create a copy that is heap
1562 * allocated. If the string is shared then the shared root must be
1563 * embedded, so we want to create a copy. If the string is a shared root
1564 * then it must be embedded, so we want to create a copy. */
1565 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
1566 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1567 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1568 }
1569 else {
1570 /* orig must be heap allocated and not shared, so we can safely transfer
1571 * the pointer to str. */
1572 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1573 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1574 RBASIC(orig)->flags &= ~STR_NOFREE;
1575 STR_SET_SHARED(orig, str);
1576 if (RB_OBJ_SHAREABLE_P(orig)) {
1577 RB_OBJ_SET_SHAREABLE(str);
1578 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
1579 }
1580 }
1581
1582 RSTRING(str)->len = RSTRING(orig)->len;
1583 RSTRING(str)->as.heap.aux.capa = capa + (TERM_LEN(orig) - TERM_LEN(str));
1584
1585 return str;
1586}
1587
1588void
1589rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1590{
1591 if (RBASIC_CLASS(tmp) != 0)
1592 return;
1593
1594 if (STR_EMBED_P(tmp)) {
1596 }
1597 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1598 !OBJ_FROZEN_RAW(orig)) {
1599 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1600
1601 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1602 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1603 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1604
1605 /* Unshare orig since the root (tmp) only has this one child. */
1606 FL_UNSET_RAW(orig, STR_SHARED);
1607 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1608 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1610
1611 /* Make tmp embedded and empty so it is safe for sweeping. */
1612 STR_SET_EMBED(tmp);
1613 STR_SET_LEN(tmp, 0);
1614 }
1615 }
1616}
1617
1618static VALUE
1619str_new_frozen(VALUE klass, VALUE orig)
1620{
1621 return str_new_frozen_buffer(klass, orig, TRUE);
1622}
1623
1624static VALUE
1625heap_str_make_shared(VALUE klass, VALUE orig)
1626{
1627 RUBY_ASSERT(!STR_EMBED_P(orig));
1628 RUBY_ASSERT(!STR_SHARED_P(orig));
1630
1631 VALUE str = str_alloc_heap(klass);
1632 STR_SET_LEN(str, RSTRING_LEN(orig));
1633 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1634 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1635 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1636 RBASIC(orig)->flags &= ~STR_NOFREE;
1637 STR_SET_SHARED(orig, str);
1638 if (klass == 0)
1639 FL_UNSET_RAW(str, STR_BORROWED);
1640 return str;
1641}
1642
1643static VALUE
1644str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1645{
1646 VALUE str;
1647
1648 long len = RSTRING_LEN(orig);
1649 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1650 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1651
1652 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1653 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1654 RUBY_ASSERT(STR_EMBED_P(str));
1655 }
1656 else {
1657 if (FL_TEST_RAW(orig, STR_SHARED)) {
1658 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1659 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1660 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1661 RUBY_ASSERT(ofs >= 0);
1662 RUBY_ASSERT(rest >= 0);
1663 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1665
1666 if ((ofs > 0) || (rest > 0) ||
1667 (klass != RBASIC(shared)->klass) ||
1668 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1669 str = str_new_shared(klass, shared);
1670 RUBY_ASSERT(!STR_EMBED_P(str));
1671 RSTRING(str)->as.heap.ptr += ofs;
1672 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1673 }
1674 else {
1675 if (RBASIC_CLASS(shared) == 0)
1676 FL_SET_RAW(shared, STR_BORROWED);
1677 return shared;
1678 }
1679 }
1680 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1681 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1682 STR_SET_EMBED(str);
1683 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1684 STR_SET_LEN(str, RSTRING_LEN(orig));
1685 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1686 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1687 }
1688 else {
1689 if (RB_OBJ_SHAREABLE_P(orig)) {
1690 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1691 }
1692 else {
1693 str = heap_str_make_shared(klass, orig);
1694 }
1695 }
1696 }
1697
1698 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1699 OBJ_FREEZE(str);
1700 return str;
1701}
1702
1703VALUE
1704rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1705{
1706 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1707}
1708
1709static VALUE
1710str_new_empty_String(VALUE str)
1711{
1712 VALUE v = rb_str_new(0, 0);
1713 rb_enc_copy(v, str);
1714 return v;
1715}
1716
1717#define STR_BUF_MIN_SIZE 63
1718
1719VALUE
1721{
1722 if (STR_EMBEDDABLE_P(capa, 1)) {
1723 return str_alloc_embed(rb_cString, capa + 1);
1724 }
1725
1726 VALUE str = str_alloc_heap(rb_cString);
1727
1728 RSTRING(str)->as.heap.aux.capa = capa;
1729 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1730 RSTRING(str)->as.heap.ptr[0] = '\0';
1731
1732 return str;
1733}
1734
1735VALUE
1737{
1738 VALUE str;
1739 long len = strlen(ptr);
1740
1741 str = rb_str_buf_new(len);
1742 rb_str_buf_cat(str, ptr, len);
1743
1744 return str;
1745}
1746
1747VALUE
1749{
1750 return str_new(0, 0, len);
1751}
1752
1753void
1755{
1756 if (STR_EMBED_P(str)) {
1757 RB_DEBUG_COUNTER_INC(obj_str_embed);
1758 }
1759 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1760 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1761 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1762 }
1763 else {
1764 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1765 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1766 }
1767}
1768
1769size_t
1770rb_str_memsize(VALUE str)
1771{
1772 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1773 return STR_HEAP_SIZE(str);
1774 }
1775 else {
1776 return 0;
1777 }
1778}
1779
1780VALUE
1782{
1783 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1784}
1785
1786static inline void str_discard(VALUE str);
1787static void str_shared_replace(VALUE str, VALUE str2);
1788
1789void
1791{
1792 if (str != str2) str_shared_replace(str, str2);
1793}
1794
1795static void
1796str_shared_replace(VALUE str, VALUE str2)
1797{
1798 rb_encoding *enc;
1799 int cr;
1800 int termlen;
1801
1802 RUBY_ASSERT(str2 != str);
1803 enc = STR_ENC_GET(str2);
1804 cr = ENC_CODERANGE(str2);
1805 str_discard(str);
1806 termlen = rb_enc_mbminlen(enc);
1807
1808 STR_SET_LEN(str, RSTRING_LEN(str2));
1809
1810 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1811 STR_SET_EMBED(str);
1812 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1813 rb_enc_associate(str, enc);
1814 ENC_CODERANGE_SET(str, cr);
1815 }
1816 else {
1817 if (STR_EMBED_P(str2)) {
1818 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1819 long len = RSTRING_LEN(str2);
1820 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1821
1822 char *new_ptr = ALLOC_N(char, len + termlen);
1823 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1824 RSTRING(str2)->as.heap.ptr = new_ptr;
1825 STR_SET_LEN(str2, len);
1826 RSTRING(str2)->as.heap.aux.capa = len;
1827 STR_SET_NOEMBED(str2);
1828 }
1829
1830 STR_SET_NOEMBED(str);
1831 FL_UNSET(str, STR_SHARED);
1832 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1833
1834 if (FL_TEST(str2, STR_SHARED)) {
1835 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1836 STR_SET_SHARED(str, shared);
1837 }
1838 else {
1839 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1840 }
1841
1842 /* abandon str2 */
1843 STR_SET_EMBED(str2);
1844 RSTRING_PTR(str2)[0] = 0;
1845 STR_SET_LEN(str2, 0);
1846 rb_enc_associate(str, enc);
1847 ENC_CODERANGE_SET(str, cr);
1848 }
1849}
1850
1851VALUE
1853{
1854 VALUE str;
1855
1856 if (RB_TYPE_P(obj, T_STRING)) {
1857 return obj;
1858 }
1859 str = rb_funcall(obj, idTo_s, 0);
1860 return rb_obj_as_string_result(str, obj);
1861}
1862
1863VALUE
1864rb_obj_as_string_result(VALUE str, VALUE obj)
1865{
1866 if (!RB_TYPE_P(str, T_STRING))
1867 return rb_any_to_s(obj);
1868 return str;
1869}
1870
1871static VALUE
1872str_replace(VALUE str, VALUE str2)
1873{
1874 long len;
1875
1876 len = RSTRING_LEN(str2);
1877 if (STR_SHARED_P(str2)) {
1878 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1880 STR_SET_NOEMBED(str);
1881 STR_SET_LEN(str, len);
1882 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1883 STR_SET_SHARED(str, shared);
1884 rb_enc_cr_str_exact_copy(str, str2);
1885 }
1886 else {
1887 str_replace_shared(str, str2);
1888 }
1889
1890 return str;
1891}
1892
1893static inline VALUE
1894ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1895{
1896 size_t size = rb_str_embed_size(capa, 0);
1897 RUBY_ASSERT(size > 0);
1898 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1899
1900 EC_NEWOBJ_OF(str, struct RString, klass, T_STRING, size, ec);
1901
1902 str->len = 0;
1903
1904 return (VALUE)str;
1905}
1906
1907static inline VALUE
1908ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1909{
1910 EC_NEWOBJ_OF(str, struct RString, klass, T_STRING | STR_NOEMBED, sizeof(struct RString), ec);
1911
1912 str->as.heap.aux.capa = 0;
1913 str->as.heap.ptr = NULL;
1914
1915 return (VALUE)str;
1916}
1917
1918static inline VALUE
1919str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1920{
1921 int encidx = 0;
1922 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1923 encidx = rb_enc_get_index(str);
1924 flags &= ~ENCODING_MASK;
1925 }
1926 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1927 if (encidx) rb_enc_associate_index(dup, encidx);
1928 return dup;
1929}
1930
1931static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1932
1933static inline VALUE
1934str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1935{
1936 VALUE flags = FL_TEST_RAW(str, flag_mask);
1937 long len = RSTRING_LEN(str);
1938
1939 RUBY_ASSERT(STR_EMBED_P(dup));
1940 RUBY_ASSERT(str_embed_capa(dup) >= len + TERM_LEN(str));
1941 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + TERM_LEN(str));
1942 STR_SET_LEN(dup, RSTRING_LEN(str));
1943 return str_duplicate_setup_encoding(str, dup, flags);
1944}
1945
1946static inline VALUE
1947str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1948{
1949 VALUE flags = FL_TEST_RAW(str, flag_mask);
1950 VALUE root = str;
1951 if (FL_TEST_RAW(str, STR_SHARED)) {
1952 root = RSTRING(str)->as.heap.aux.shared;
1953 }
1954 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1955 root = str = str_new_frozen(klass, str);
1956 flags = FL_TEST_RAW(str, flag_mask);
1957 }
1958 RUBY_ASSERT(!STR_SHARED_P(root));
1960
1961 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1962 FL_SET_RAW(dup, RSTRING_NOEMBED);
1963 STR_SET_SHARED(dup, root);
1964 flags |= RSTRING_NOEMBED | STR_SHARED;
1965
1966 STR_SET_LEN(dup, RSTRING_LEN(str));
1967 return str_duplicate_setup_encoding(str, dup, flags);
1968}
1969
1970static inline VALUE
1971str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1972{
1973 if (STR_EMBED_P(str)) {
1974 return str_duplicate_setup_embed(klass, str, dup);
1975 }
1976 else {
1977 return str_duplicate_setup_heap(klass, str, dup);
1978 }
1979}
1980
1981static inline VALUE
1982str_duplicate(VALUE klass, VALUE str)
1983{
1984 VALUE dup;
1985 if (STR_EMBED_P(str)) {
1986 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1987 }
1988 else {
1989 dup = str_alloc_heap(klass);
1990 }
1991
1992 return str_duplicate_setup(klass, str, dup);
1993}
1994
1995VALUE
1997{
1998 return str_duplicate(rb_obj_class(str), str);
1999}
2000
2001/* :nodoc: */
2002VALUE
2003rb_str_dup_m(VALUE str)
2004{
2005 if (LIKELY(BARE_STRING_P(str))) {
2006 return str_duplicate(rb_cString, str);
2007 }
2008 else {
2009 return rb_obj_dup(str);
2010 }
2011}
2012
2013VALUE
2015{
2016 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2017 return str_duplicate(rb_cString, str);
2018}
2019
2020VALUE
2021rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2022{
2023 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2024 VALUE new_str, klass = rb_cString;
2025
2026 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2027 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2028 str_duplicate_setup_embed(klass, str, new_str);
2029 }
2030 else {
2031 new_str = ec_str_alloc_heap(ec, klass);
2032 str_duplicate_setup_heap(klass, str, new_str);
2033 }
2034 if (chilled) {
2035 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2036 }
2037 return new_str;
2038}
2039
2040VALUE
2041rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2042{
2043 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2044 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2045 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2046 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2047 return rb_str_freeze(str);
2048}
2049
2050/*
2051 * The documentation block below uses an include (instead of inline text)
2052 * because the included text has non-ASCII characters (which are not allowed in a C file).
2053 */
2054
2055/*
2056 *
2057 * call-seq:
2058 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2059 *
2060 * :include: doc/string/new.rdoc
2061 *
2062 */
2063
2064static VALUE
2065rb_str_init(int argc, VALUE *argv, VALUE str)
2066{
2067 static ID keyword_ids[2];
2068 VALUE orig, opt, venc, vcapa;
2069 VALUE kwargs[2];
2070 rb_encoding *enc = 0;
2071 int n;
2072
2073 if (!keyword_ids[0]) {
2074 keyword_ids[0] = rb_id_encoding();
2075 CONST_ID(keyword_ids[1], "capacity");
2076 }
2077
2078 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2079 if (!NIL_P(opt)) {
2080 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2081 venc = kwargs[0];
2082 vcapa = kwargs[1];
2083 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2084 enc = rb_to_encoding(venc);
2085 }
2086 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2087 long capa = NUM2LONG(vcapa);
2088 long len = 0;
2089 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2090
2091 if (capa < STR_BUF_MIN_SIZE) {
2092 capa = STR_BUF_MIN_SIZE;
2093 }
2094 if (n == 1) {
2095 StringValue(orig);
2096 len = RSTRING_LEN(orig);
2097 if (capa < len) {
2098 capa = len;
2099 }
2100 if (orig == str) n = 0;
2101 }
2102 str_modifiable(str);
2103 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2104 /* make noembed always */
2105 const size_t size = (size_t)capa + termlen;
2106 const char *const old_ptr = RSTRING_PTR(str);
2107 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2108 char *new_ptr = ALLOC_N(char, size);
2109 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2110 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2111 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2112 RSTRING(str)->as.heap.ptr = new_ptr;
2113 }
2114 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2115 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2116 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2117 }
2118 STR_SET_LEN(str, len);
2119 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2120 if (n == 1) {
2121 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2122 rb_enc_cr_str_exact_copy(str, orig);
2123 }
2124 FL_SET(str, STR_NOEMBED);
2125 RSTRING(str)->as.heap.aux.capa = capa;
2126 }
2127 else if (n == 1) {
2128 rb_str_replace(str, orig);
2129 }
2130 if (enc) {
2131 rb_enc_associate(str, enc);
2133 }
2134 }
2135 else if (n == 1) {
2136 rb_str_replace(str, orig);
2137 }
2138 return str;
2139}
2140
2141/* :nodoc: */
2142static VALUE
2143rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2144{
2145 if (klass != rb_cString) {
2146 return rb_class_new_instance_pass_kw(argc, argv, klass);
2147 }
2148
2149 static ID keyword_ids[2];
2150 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2151 VALUE kwargs[2];
2152 rb_encoding *enc = NULL;
2153
2154 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2155 if (NIL_P(opt)) {
2156 return rb_class_new_instance_pass_kw(argc, argv, klass);
2157 }
2158
2159 keyword_ids[0] = rb_id_encoding();
2160 CONST_ID(keyword_ids[1], "capacity");
2161 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2162 encoding = kwargs[0];
2163 capacity = kwargs[1];
2164
2165 if (n == 1) {
2166 orig = StringValue(orig);
2167 }
2168 else {
2169 orig = Qnil;
2170 }
2171
2172 if (UNDEF_P(encoding)) {
2173 if (!NIL_P(orig)) {
2174 encoding = rb_obj_encoding(orig);
2175 }
2176 }
2177
2178 if (!UNDEF_P(encoding)) {
2179 enc = rb_to_encoding(encoding);
2180 }
2181
2182 // If capacity is nil, we're basically just duping `orig`.
2183 if (UNDEF_P(capacity)) {
2184 if (NIL_P(orig)) {
2185 VALUE empty_str = str_new(klass, "", 0);
2186 if (enc) {
2187 rb_enc_associate(empty_str, enc);
2188 }
2189 return empty_str;
2190 }
2191 VALUE copy = str_duplicate(klass, orig);
2192 rb_enc_associate(copy, enc);
2193 ENC_CODERANGE_CLEAR(copy);
2194 return copy;
2195 }
2196
2197 long capa = 0;
2198 capa = NUM2LONG(capacity);
2199 if (capa < 0) {
2200 capa = 0;
2201 }
2202
2203 if (!NIL_P(orig)) {
2204 long orig_capa = rb_str_capacity(orig);
2205 if (orig_capa > capa) {
2206 capa = orig_capa;
2207 }
2208 }
2209
2210 VALUE str = str_enc_new(klass, NULL, capa, enc);
2211 STR_SET_LEN(str, 0);
2212 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2213
2214 if (!NIL_P(orig)) {
2215 rb_str_buf_append(str, orig);
2216 }
2217
2218 return str;
2219}
2220
#ifdef NONASCII_MASK
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Counts the UTF-8 leading bytes in the machine word at *s.
 *
 * A byte is a leading byte when it looks like 0xxxxxxx or 11xxxxxx, i.e.
 * anything but a 10xxxxxx continuation byte
 * (see https://en.wikipedia.org/wiki/UTF-8).  For one byte that is:
 *
 *   if (!(byte & 0x80))
 *     byte |= 0x40;          // turn on bit6
 *   return ((byte>>6) & 1);  // bit6 tells whether the byte is leading
 *
 * The same test is applied to every byte of the word at once, and the
 * resulting per-byte flags are then summed.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    const uintptr_t word = *s;

    /* Move the "is leading" flag of every byte down to its bit0 and mask
     * everything else away. */
    uintptr_t marks = ((word >> 6) | (~word >> 7)) & (NONASCII_MASK >> 7);

#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(marks);
#else
    /* Add the per-byte flags together into the lowest byte. */
    marks += marks >> 8;
    marks += marks >> 16;
# if SIZEOF_VOIDP == 8
    marks += marks >> 32;
# endif
    return marks & 0xF;
#endif
}
#endif
2260
2261static inline long
2262enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2263{
2264 long c;
2265 const char *q;
2266
2267 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2268 long diff = (long)(e - p);
2269 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2270 }
2271#ifdef NONASCII_MASK
2272 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2273 uintptr_t len = 0;
2274 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2275 const uintptr_t *s, *t;
2276 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2277 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2278 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2279 while (p < (const char *)s) {
2280 if (is_utf8_lead_byte(*p)) len++;
2281 p++;
2282 }
2283 while (s < t) {
2284 len += count_utf8_lead_bytes_with_word(s);
2285 s++;
2286 }
2287 p = (const char *)s;
2288 }
2289 while (p < e) {
2290 if (is_utf8_lead_byte(*p)) len++;
2291 p++;
2292 }
2293 return (long)len;
2294 }
2295#endif
2296 else if (rb_enc_asciicompat(enc)) {
2297 c = 0;
2298 if (ENC_CODERANGE_CLEAN_P(cr)) {
2299 while (p < e) {
2300 if (ISASCII(*p)) {
2301 q = search_nonascii(p, e);
2302 if (!q)
2303 return c + (e - p);
2304 c += q - p;
2305 p = q;
2306 }
2307 p += rb_enc_fast_mbclen(p, e, enc);
2308 c++;
2309 }
2310 }
2311 else {
2312 while (p < e) {
2313 if (ISASCII(*p)) {
2314 q = search_nonascii(p, e);
2315 if (!q)
2316 return c + (e - p);
2317 c += q - p;
2318 p = q;
2319 }
2320 p += rb_enc_mbclen(p, e, enc);
2321 c++;
2322 }
2323 }
2324 return c;
2325 }
2326
2327 for (c=0; p<e; c++) {
2328 p += rb_enc_mbclen(p, e, enc);
2329 }
2330 return c;
2331}
2332
2333long
2334rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2335{
2336 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2337}
2338
/* Returns the character count and stores the detected code range in *cr.
 * The value *cr holds on entry is ignored: it is reset before scanning.
 */
2342long
2343rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2344{
2345 long c;
2346 const char *q;
2347 int ret;
2348
2349 *cr = 0;
2350 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2351 long diff = (long)(e - p);
2352 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2353 }
2354 else if (rb_enc_asciicompat(enc)) {
2355 c = 0;
2356 while (p < e) {
2357 if (ISASCII(*p)) {
2358 q = search_nonascii(p, e);
2359 if (!q) {
2360 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2361 return c + (e - p);
2362 }
2363 c += q - p;
2364 p = q;
2365 }
2366 ret = rb_enc_precise_mbclen(p, e, enc);
2367 if (MBCLEN_CHARFOUND_P(ret)) {
2368 *cr |= ENC_CODERANGE_VALID;
2369 p += MBCLEN_CHARFOUND_LEN(ret);
2370 }
2371 else {
2373 p++;
2374 }
2375 c++;
2376 }
2377 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2378 return c;
2379 }
2380
2381 for (c=0; p<e; c++) {
2382 ret = rb_enc_precise_mbclen(p, e, enc);
2383 if (MBCLEN_CHARFOUND_P(ret)) {
2384 *cr |= ENC_CODERANGE_VALID;
2385 p += MBCLEN_CHARFOUND_LEN(ret);
2386 }
2387 else {
2389 if (p + rb_enc_mbminlen(enc) <= e)
2390 p += rb_enc_mbminlen(enc);
2391 else
2392 p = e;
2393 }
2394 }
2395 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2396 return c;
2397}
2398
2399/* enc must be str's enc or rb_enc_check(str, str2) */
2400static long
2401str_strlen(VALUE str, rb_encoding *enc)
2402{
2403 const char *p, *e;
2404 int cr;
2405
2406 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2407 if (!enc) enc = STR_ENC_GET(str);
2408 p = RSTRING_PTR(str);
2409 e = RSTRING_END(str);
2410 cr = ENC_CODERANGE(str);
2411
2412 if (cr == ENC_CODERANGE_UNKNOWN) {
2413 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2414 if (cr) ENC_CODERANGE_SET(str, cr);
2415 return n;
2416 }
2417 else {
2418 return enc_strlen(p, e, enc, cr);
2419 }
2420}
2421
2422long
2424{
2425 return str_strlen(str, NULL);
2426}
2427
2428/*
2429 * call-seq:
2430 * length -> integer
2431 *
2432 * :include: doc/string/length.rdoc
2433 *
2434 */
2435
2436VALUE
2438{
2439 return LONG2NUM(str_strlen(str, NULL));
2440}
2441
2442/*
2443 * call-seq:
2444 * bytesize -> integer
2445 *
2446 * :include: doc/string/bytesize.rdoc
2447 *
2448 */
2449
2450VALUE
2451rb_str_bytesize(VALUE str)
2452{
2453 return LONG2NUM(RSTRING_LEN(str));
2454}
2455
2456/*
2457 * call-seq:
2458 * empty? -> true or false
2459 *
2460 * Returns whether the length of +self+ is zero:
2461 *
2462 * 'hello'.empty? # => false
2463 * ' '.empty? # => false
2464 * ''.empty? # => true
2465 *
2466 * Related: see {Querying}[rdoc-ref:String@Querying].
2467 */
2468
2469static VALUE
2470rb_str_empty(VALUE str)
2471{
2472 return RBOOL(RSTRING_LEN(str) == 0);
2473}
2474
2475/*
2476 * call-seq:
2477 * self + other_string -> new_string
2478 *
2479 * Returns a new string containing +other_string+ concatenated to +self+:
2480 *
2481 * 'Hello from ' + self.to_s # => "Hello from main"
2482 *
2483 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2484 */
2485
2486VALUE
2488{
2489 VALUE str3;
2490 rb_encoding *enc;
2491 char *ptr1, *ptr2, *ptr3;
2492 long len1, len2;
2493 int termlen;
2494
2495 StringValue(str2);
2496 enc = rb_enc_check_str(str1, str2);
2497 RSTRING_GETMEM(str1, ptr1, len1);
2498 RSTRING_GETMEM(str2, ptr2, len2);
2499 termlen = rb_enc_mbminlen(enc);
2500 if (len1 > LONG_MAX - len2) {
2501 rb_raise(rb_eArgError, "string size too big");
2502 }
2503 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2504 ptr3 = RSTRING_PTR(str3);
2505 memcpy(ptr3, ptr1, len1);
2506 memcpy(ptr3+len1, ptr2, len2);
2507 TERM_FILL(&ptr3[len1+len2], termlen);
2508
2509 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2511 RB_GC_GUARD(str1);
2512 RB_GC_GUARD(str2);
2513 return str3;
2514}
2515
2516/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2517VALUE
2518rb_str_opt_plus(VALUE str1, VALUE str2)
2519{
2522 long len1, len2;
2523 MAYBE_UNUSED(char) *ptr1, *ptr2;
2524 RSTRING_GETMEM(str1, ptr1, len1);
2525 RSTRING_GETMEM(str2, ptr2, len2);
2526 int enc1 = rb_enc_get_index(str1);
2527 int enc2 = rb_enc_get_index(str2);
2528
2529 if (enc1 < 0) {
2530 return Qundef;
2531 }
2532 else if (enc2 < 0) {
2533 return Qundef;
2534 }
2535 else if (enc1 != enc2) {
2536 return Qundef;
2537 }
2538 else if (len1 > LONG_MAX - len2) {
2539 return Qundef;
2540 }
2541 else {
2542 return rb_str_plus(str1, str2);
2543 }
2544
2545}
2546
2547/*
2548 * call-seq:
2549 * self * n -> new_string
2550 *
2551 * Returns a new string containing +n+ copies of +self+:
2552 *
2553 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2554 * 'No!' * 0 # => ""
2555 *
2556 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2557 */
2558
2559VALUE
2561{
2562 VALUE str2;
2563 long n, len;
2564 char *ptr2;
2565 int termlen;
2566
2567 if (times == INT2FIX(1)) {
2568 return str_duplicate(rb_cString, str);
2569 }
2570 if (times == INT2FIX(0)) {
2571 str2 = str_alloc_embed(rb_cString, 0);
2572 rb_enc_copy(str2, str);
2573 return str2;
2574 }
2575 len = NUM2LONG(times);
2576 if (len < 0) {
2577 rb_raise(rb_eArgError, "negative argument");
2578 }
2579 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2580 if (STR_EMBEDDABLE_P(len, 1)) {
2581 str2 = str_alloc_embed(rb_cString, len + 1);
2582 memset(RSTRING_PTR(str2), 0, len + 1);
2583 }
2584 else {
2585 str2 = str_alloc_heap(rb_cString);
2586 RSTRING(str2)->as.heap.aux.capa = len;
2587 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2588 }
2589 STR_SET_LEN(str2, len);
2590 rb_enc_copy(str2, str);
2591 return str2;
2592 }
2593 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2594 rb_raise(rb_eArgError, "argument too big");
2595 }
2596
2597 len *= RSTRING_LEN(str);
2598 termlen = TERM_LEN(str);
2599 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2600 ptr2 = RSTRING_PTR(str2);
2601 if (len) {
2602 n = RSTRING_LEN(str);
2603 memcpy(ptr2, RSTRING_PTR(str), n);
2604 while (n <= len/2) {
2605 memcpy(ptr2 + n, ptr2, n);
2606 n *= 2;
2607 }
2608 memcpy(ptr2 + n, ptr2, len-n);
2609 }
2610 STR_SET_LEN(str2, len);
2611 TERM_FILL(&ptr2[len], termlen);
2612 rb_enc_cr_str_copy_for_substr(str2, str);
2613
2614 return str2;
2615}
2616
2617/*
2618 * call-seq:
2619 * self % object -> new_string
2620 *
2621 * Returns the result of formatting +object+ into the format specifications
2622 * contained in +self+
2623 * (see {Format Specifications}[rdoc-ref:language/format_specifications.rdoc]):
2624 *
2625 * '%05d' % 123 # => "00123"
2626 *
2627 * If +self+ contains multiple format specifications,
2628 * +object+ must be an array or hash containing the objects to be formatted:
2629 *
2630 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2631 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2632 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2633 *
2634 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2635 */
2636
2637static VALUE
2638rb_str_format_m(VALUE str, VALUE arg)
2639{
2640 VALUE tmp = rb_check_array_type(arg);
2641
2642 if (!NIL_P(tmp)) {
2643 VALUE result = rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2644 RB_GC_GUARD(tmp);
2645 return result;
2646 }
2647 return rb_str_format(1, &arg, str);
2648}
2649
2650static inline void
2651rb_check_lockedtmp(VALUE str)
2652{
2653 if (FL_TEST(str, STR_TMPLOCK)) {
2654 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2655 }
2656}
2657
2658// If none of these flags are set, we know we have an modifiable string.
2659// If any is set, we need to do more detailed checks.
2660#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2661static inline void
2662str_modifiable(VALUE str)
2663{
2664 RUBY_ASSERT(ruby_thread_has_gvl_p());
2665
2666 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2667 if (CHILLED_STRING_P(str)) {
2668 CHILLED_STRING_MUTATED(str);
2669 }
2670 rb_check_lockedtmp(str);
2671 rb_check_frozen(str);
2672 }
2673}
2674
2675static inline int
2676str_dependent_p(VALUE str)
2677{
2678 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2679 return FALSE;
2680 }
2681 else {
2682 return TRUE;
2683 }
2684}
2685
2686// If none of these flags are set, we know we have an independent string.
2687// If any is set, we need to do more detailed checks.
2688#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2689static inline int
2690str_independent(VALUE str)
2691{
2692 RUBY_ASSERT(ruby_thread_has_gvl_p());
2693
2694 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2695 str_modifiable(str);
2696 return !str_dependent_p(str);
2697 }
2698 return TRUE;
2699}
2700
/*
 * Gives str a buffer of its own that keeps the first len bytes and has a
 * capacity of len + expand bytes, followed by a termlen-byte terminator.
 * The result is either embedded in the object or freshly allocated.
 */
2701 static void
2702 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2703 {
2704 RUBY_ASSERT(ruby_thread_has_gvl_p());
2705
2706 char *ptr;
2707 char *oldptr;
2708 long capa = len + expand;
2709
/* capa can only be below len when expand is negative: keep at most capa bytes */
2710 if (len > capa) len = capa;
2711
/* a non-embedded string whose result fits the embedded area: copy the
 * bytes into the object and return without allocating */
2712 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
/* read the heap pointer before STR_SET_EMBED() switches representation */
2713 ptr = RSTRING(str)->as.heap.ptr;
2714 STR_SET_EMBED(str);
2715 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2716 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2717 STR_SET_LEN(str, len);
2718 return;
2719 }
2720
2721 ptr = ALLOC_N(char, (size_t)capa + termlen);
2722 oldptr = RSTRING_PTR(str);
2723 if (oldptr) {
2724 memcpy(ptr, oldptr, len);
2725 }
/* the old buffer is released only when the flags are exactly STR_NOEMBED,
 * i.e. neither STR_NOFREE nor STR_SHARED is set */
2726 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2727 SIZED_FREE_N(oldptr, STR_HEAP_SIZE(str));
2728 }
2729 STR_SET_NOEMBED(str);
2730 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2731 TERM_FILL(ptr + len, termlen);
2732 RSTRING(str)->as.heap.ptr = ptr;
2733 STR_SET_LEN(str, len);
2734 RSTRING(str)->as.heap.aux.capa = capa;
2735 }
2736
2737void
2738rb_str_modify(VALUE str)
2739{
2740 if (!str_independent(str))
2741 str_make_independent(str);
2743}
2744
2745void
2747{
2748 RUBY_ASSERT(ruby_thread_has_gvl_p());
2749
2750 int termlen = TERM_LEN(str);
2751 long len = RSTRING_LEN(str);
2752
2753 if (expand < 0) {
2754 rb_raise(rb_eArgError, "negative expanding string size");
2755 }
2756 if (expand >= LONG_MAX - len) {
2757 rb_raise(rb_eArgError, "string size too big");
2758 }
2759
2760 if (!str_independent(str)) {
2761 str_make_independent_expand(str, len, expand, termlen);
2762 }
2763 else if (expand > 0) {
2764 RESIZE_CAPA_TERM(str, len + expand, termlen);
2765 }
2767}
2768
2769/* As rb_str_modify(), but don't clear coderange */
2770static void
2771str_modify_keep_cr(VALUE str)
2772{
2773 if (!str_independent(str))
2774 str_make_independent(str);
2776 /* Force re-scan later */
2778}
2779
2780static inline void
2781str_discard(VALUE str)
2782{
2783 str_modifiable(str);
2784 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2785 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2786 RSTRING(str)->as.heap.ptr = 0;
2787 STR_SET_LEN(str, 0);
2788 }
2789}
2790
2791void
2793{
2794 int encindex = rb_enc_get_index(str);
2795
2796 if (RB_UNLIKELY(encindex == -1)) {
2797 rb_raise(rb_eTypeError, "not encoding capable object");
2798 }
2799
2800 if (RB_LIKELY(rb_str_encindex_fastpath(encindex))) {
2801 return;
2802 }
2803
2804 rb_encoding *enc = rb_enc_from_index(encindex);
2805 if (!rb_enc_asciicompat(enc)) {
2806 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2807 }
2808}
2809
2810VALUE
2812{
2813 RUBY_ASSERT(ruby_thread_has_gvl_p());
2814
2815 VALUE s = *ptr;
2816 if (!RB_TYPE_P(s, T_STRING)) {
2817 s = rb_str_to_str(s);
2818 *ptr = s;
2819 }
2820 return s;
2821}
2822
2823char *
2825{
2826 VALUE str = rb_string_value(ptr);
2827 return RSTRING_PTR(str);
2828}
2829
2830static const char *
2831str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2832{
2833 const char *e = s + len;
2834
2835 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2836 if (zero_filled(s, minlen)) return s;
2837 }
2838 return 0;
2839}
2840
2841static char *
2842str_fill_term(VALUE str, char *s, long len, int termlen)
2843{
2844 /* This function assumes that (capa + termlen) bytes of memory
2845 * is allocated, like many other functions in this file.
2846 */
2847 if (str_dependent_p(str)) {
2848 if (!zero_filled(s + len, termlen))
2849 str_make_independent_expand(str, len, 0L, termlen);
2850 }
2851 else {
2852 TERM_FILL(s + len, termlen);
2853 return s;
2854 }
2855 return RSTRING_PTR(str);
2856}
2857
2858void
2859rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2860{
2861 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2862 long len = RSTRING_LEN(str);
2863
2864 RUBY_ASSERT(capa >= len);
2865 if (capa - len < termlen) {
2866 rb_check_lockedtmp(str);
2867 str_make_independent_expand(str, len, 0L, termlen);
2868 }
2869 else if (str_dependent_p(str)) {
2870 if (termlen > oldtermlen)
2871 str_make_independent_expand(str, len, 0L, termlen);
2872 }
2873 else {
2874 if (!STR_EMBED_P(str)) {
2875 /* modify capa instead of realloc */
2876 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2877 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2878 }
2879 if (termlen > oldtermlen) {
2880 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2881 }
2882 }
2883
2884 return;
2885}
2886
2887static char *
2888str_null_check(VALUE str, int *w)
2889{
2890 char *s = RSTRING_PTR(str);
2891 long len = RSTRING_LEN(str);
2892 int minlen = 1;
2893
2894 if (RB_UNLIKELY(!rb_str_enc_fastpath(str))) {
2895 rb_encoding *enc = rb_str_enc_get(str);
2896 minlen = rb_enc_mbminlen(enc);
2897
2898 if (minlen > 1) {
2899 *w = 1;
2900 if (str_null_char(s, len, minlen, enc)) {
2901 return NULL;
2902 }
2903 return str_fill_term(str, s, len, minlen);
2904 }
2905 }
2906
2907 *w = 0;
2908 if (!s || memchr(s, 0, len)) {
2909 return NULL;
2910 }
2911 if (s[len]) {
2912 s = str_fill_term(str, s, len, minlen);
2913 }
2914 return s;
2915}
2916
2917const char *
2918rb_str_null_check(VALUE str)
2919{
2921
2922 char *s;
2923 long len;
2924 RSTRING_GETMEM(str, s, len);
2925
2926 if (RB_LIKELY(rb_str_enc_fastpath(str))) {
2927 if (!s || memchr(s, 0, len)) {
2928 rb_raise(rb_eArgError, "string contains null byte");
2929 }
2930 }
2931 else {
2932 int w;
2933 const char *s = str_null_check(str, &w);
2934 if (!s) {
2935 if (w) {
2936 rb_raise(rb_eArgError, "string contains null char");
2937 }
2938 rb_raise(rb_eArgError, "string contains null byte");
2939 }
2940 }
2941
2942 return s;
2943}
2944
2945char *
2946rb_str_to_cstr(VALUE str)
2947{
2948 int w;
2949 return str_null_check(str, &w);
2950}
2951
2952char *
2954{
2955 VALUE str = rb_string_value(ptr);
2956 int w;
2957 char *s = str_null_check(str, &w);
2958 if (!s) {
2959 if (w) {
2960 rb_raise(rb_eArgError, "string contains null char");
2961 }
2962 rb_raise(rb_eArgError, "string contains null byte");
2963 }
2964 return s;
2965}
2966
2967char *
2968rb_str_fill_terminator(VALUE str, const int newminlen)
2969{
2970 char *s = RSTRING_PTR(str);
2971 long len = RSTRING_LEN(str);
2972 return str_fill_term(str, s, len, newminlen);
2973}
2974
2975VALUE
2977{
2978 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2979 return str;
2980}
2981
2982/*
2983 * call-seq:
2984 * String.try_convert(object) -> object, new_string, or nil
2985 *
2986 * Attempts to convert the given +object+ to a string.
2987 *
2988 * If +object+ is already a string, returns +object+, unmodified.
2989 *
2990 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2991 * calls <tt>object.to_str</tt> and returns the result.
2992 *
2993 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2994 *
2995 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2996 */
2997static VALUE
2998rb_str_s_try_convert(VALUE dummy, VALUE str)
2999{
3000 return rb_check_string_type(str);
3001}
3002
/*
 * Advances p by up to *nthp characters of encoding enc without going
 * beyond e, and returns the resulting position.
 *
 * On return *nthp holds the local counter nth.  The two fixed-width
 * branches never change it; the other branches decrement it for the
 * characters they step over.
 */
3003 static char*
3004 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
3005 {
3006 long nth = *nthp;
/* one byte per character */
3007 if (rb_enc_mbmaxlen(enc) == 1) {
3008 p += nth;
3009 }
/* every character has the same width */
3010 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3011 p += nth * rb_enc_mbmaxlen(enc);
3012 }
3013 else if (rb_enc_asciicompat(enc)) {
3014 const char *p2, *e2;
3015 int n;
3016
3017 while (p < e && 0 < nth) {
/* e2: where p would land if all nth remaining characters were one byte */
3018 e2 = p + nth;
/* fewer bytes left than characters wanted: report the shortfall */
3019 if (e < e2) {
3020 *nthp = nth;
3021 return (char *)e;
3022 }
3023 if (ISASCII(*p)) {
/* NOTE(review): search_nonascii() presumably returns the first
 * non-ASCII byte in [p, e2) or NULL -- confirm against its definition */
3024 p2 = search_nonascii(p, e2);
3025 if (!p2) {
/* nothing but ASCII up to e2: the target is e2 itself */
3026 nth -= e2 - p;
3027 *nthp = nth;
3028 return (char *)e2;
3029 }
3030 nth -= p2 - p;
3031 p = p2;
3032 }
/* step over one character at p */
3033 n = rb_enc_mbclen(p, e, enc);
3034 p += n;
3035 nth--;
3036 }
3037 *nthp = nth;
/* the bytes ran out before nth characters were found */
3038 if (nth != 0) {
3039 return (char *)e;
3040 }
3041 return (char *)p;
3042 }
3043 else {
/* the post-decrement leaves nth at -1 when the loop ends because the
 * count (not the input) was exhausted */
3044 while (p < e && nth--) {
3045 p += rb_enc_mbclen(p, e, enc);
3046 }
3047 }
/* clamp positions computed past the end */
3048 if (p > e) p = e;
3049 *nthp = nth;
3050 return (char*)p;
3051 }
3052
3053char*
3054rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3055{
3056 return str_nth_len(p, e, &nth, enc);
3057}
3058
3059static char*
3060str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3061{
3062 if (singlebyte)
3063 p += nth;
3064 else {
3065 p = str_nth_len(p, e, &nth, enc);
3066 }
3067 if (!p) return 0;
3068 if (p > e) p = e;
3069 return (char *)p;
3070}
3071
3072/* char offset to byte offset */
3073static long
3074str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3075{
3076 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3077 if (!pp) return e - p;
3078 return pp - p;
3079}
3080
3081long
3082rb_str_offset(VALUE str, long pos)
3083{
3084 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3085 STR_ENC_GET(str), single_byte_optimizable(str));
3086}
3087
3088 #ifdef NONASCII_MASK
/*
 * Walks forward from p, counting bytes for which is_utf8_lead_byte() is
 * true, and returns the position of the lead byte reached once *nthp of
 * them have been passed (or e when the input ends first).  *nthp is
 * updated with the number of lead bytes that were still outstanding.
 */
3089 static char *
3090 str_utf8_nth(const char *p, const char *e, long *nthp)
3091 {
3092 long nth = *nthp;
/* the word-at-a-time path is taken only when both the byte range and the
 * wanted count exceed two machine words' worth of bytes */
3093 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3094 const uintptr_t *s, *t;
3095 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one */
3096 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3097 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
/* bytes in front of the first aligned word are handled one at a time */
3098 while (p < (const char *)s) {
3099 if (is_utf8_lead_byte(*p)) nth--;
3100 p++;
3101 }
/* subtract the lead bytes of one whole word per iteration; stop while at
 * least nth lead bytes may still fit in a single word */
3102 do {
3103 nth -= count_utf8_lead_bytes_with_word(s);
3104 s++;
3105 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3106 p = (char *)s;
3107 }
/* finish byte by byte: stop on the lead byte seen when nth has reached 0 */
3108 while (p < e) {
3109 if (is_utf8_lead_byte(*p)) {
3110 if (nth == 0) break;
3111 nth--;
3112 }
3113 p++;
3114 }
3115 *nthp = nth;
3116 return (char *)p;
3117 }
3118
/* Byte distance from p to the position str_utf8_nth() finds for nth. */
3119 static long
3120 str_utf8_offset(const char *p, const char *e, long nth)
3121 {
3122 const char *pp = str_utf8_nth(p, e, &nth);
3123 return pp - p;
3124 }
3125 #endif
3126
3127/* byte offset to char offset */
3128long
3129rb_str_sublen(VALUE str, long pos)
3130{
3131 if (single_byte_optimizable(str) || pos < 0)
3132 return pos;
3133 else {
3134 char *p = RSTRING_PTR(str);
3135 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3136 }
3137}
3138
3139static VALUE
3140str_subseq(VALUE str, long beg, long len)
3141{
3142 VALUE str2;
3143
3144 RUBY_ASSERT(beg >= 0);
3145 RUBY_ASSERT(len >= 0);
3146 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3147
3148 const int termlen = TERM_LEN(str);
3149 if (!SHARABLE_SUBSTRING_P(str, beg, len)) {
3150 str2 = rb_enc_str_new(RSTRING_PTR(str) + beg, len, rb_str_enc_get(str));
3151 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
3153 }
3154 RB_GC_GUARD(str);
3155 return str2;
3156 }
3157
3158 str2 = str_alloc_heap(rb_cString);
3159 if (str_embed_capa(str2) >= len + termlen) {
3160 char *ptr2 = RSTRING(str2)->as.embed.ary;
3161 STR_SET_EMBED(str2);
3162 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3163 TERM_FILL(ptr2+len, termlen);
3164
3165 STR_SET_LEN(str2, len);
3166 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
3168 }
3169
3170 RB_GC_GUARD(str);
3171 }
3172 else {
3173 str_replace_shared(str2, str);
3174 RUBY_ASSERT(!STR_EMBED_P(str2));
3175 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
3176 ENC_CODERANGE_CLEAR(str2);
3177 }
3178
3179 RSTRING(str2)->as.heap.ptr += beg;
3180 if (RSTRING_LEN(str2) > len) {
3181 STR_SET_LEN(str2, len);
3182 }
3183 }
3184
3185 return str2;
3186}
3187
3188VALUE
3189rb_str_subseq(VALUE str, long beg, long len)
3190{
3191 VALUE str2 = str_subseq(str, beg, len);
3192 rb_enc_cr_str_copy_for_substr(str2, str);
3193 return str2;
3194}
3195
/*
 * Resolves a substring request given in characters.
 *
 * beg is a character index (negative values count from the end) and *lenp
 * a character count.  Returns a pointer to the first byte of the
 * substring and stores its length in bytes in *lenp; returns 0 when the
 * request is out of range or *lenp is negative.
 */
3196 char *
3197 rb_str_subpos(VALUE str, long beg, long *lenp)
3198 {
3199 long len = *lenp;
3200 long slen = -1L;
3201 const long blen = RSTRING_LEN(str);
3202 rb_encoding *enc = STR_ENC_GET(str);
3203 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3204
3205 if (len < 0) return 0;
/* rejects a beg whose negation is still negative */
3206 if (beg < 0 && -beg < 0) return 0;
3207 if (!blen) {
3208 len = 0;
3209 }
/* one byte per character: plain index arithmetic */
3210 if (single_byte_optimizable(str)) {
3211 if (beg > blen) return 0;
3212 if (beg < 0) {
3213 beg += blen;
3214 if (beg < 0) return 0;
3215 }
3216 if (len > blen - beg)
3217 len = blen - beg;
3218 if (len < 0) return 0;
3219 p = s + beg;
3220 goto end;
3221 }
3222 if (beg < 0) {
/* at most -beg characters exist from that position to the end */
3223 if (len > -beg) len = -beg;
/* valid coderange and a tail that is small next to the byte length:
 * walk backwards from the end instead of counting the whole string */
3224 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3225 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3226 beg = -beg;
/* first move e back to the end of the substring ... */
3227 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3228 p = e;
3229 if (!p) return 0;
/* ... then move p back len characters to its start */
3230 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3231 if (!p) return 0;
3232 len = e - p;
3233 goto end;
3234 }
3235 else {
/* turn the negative index into a non-negative character index */
3236 slen = str_strlen(str, enc);
3237 beg += slen;
3238 if (beg < 0) return 0;
3239 p = s + beg;
3240 if (len == 0) goto end;
3241 }
3242 }
/* NOTE(review): presumably safe because a character index can never
 * exceed the byte length -- confirm */
3243 else if (beg > 0 && beg > blen) {
3244 return 0;
3245 }
3246 if (len == 0) {
3247 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3248 p = s + beg;
3249 }
3250 #ifdef NONASCII_MASK
/* valid UTF-8: locate start and length by counting lead bytes */
3251 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3252 enc == rb_utf8_encoding()) {
3253 p = str_utf8_nth(s, e, &beg);
3254 if (beg > 0) return 0;
3255 len = str_utf8_offset(p, e, len);
3256 }
3257 #endif
/* fixed-width encoding: scale indexes by the character size */
3258 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3259 int char_sz = rb_enc_mbmaxlen(enc);
3260
3261 p = s + beg * char_sz;
3262 if (p > e) {
3263 return 0;
3264 }
3265 else if (len * char_sz > e - p)
3266 len = e - p;
3267 else
3268 len *= char_sz;
3269 }
/* general case: the start lands exactly on e -- empty result, or out of
 * range when characters were still outstanding */
3270 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3271 if (beg > 0) return 0;
3272 len = 0;
3273 }
3274 else {
/* convert the character count into a byte length starting at p */
3275 len = str_offset(p, e, len, enc, 0);
3276 }
3277 end:
3278 *lenp = len;
3279 RB_GC_GUARD(str);
3280 return p;
3281 }
3282
3283static VALUE str_substr(VALUE str, long beg, long len, int empty);
3284
3285VALUE
3286rb_str_substr(VALUE str, long beg, long len)
3287{
3288 return str_substr(str, beg, len, TRUE);
3289}
3290
3291VALUE
3292rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3293{
3294 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3295}
3296
3297static VALUE
3298str_substr(VALUE str, long beg, long len, int empty)
3299{
3300 char *p = rb_str_subpos(str, beg, &len);
3301
3302 if (!p) return Qnil;
3303 if (!len && !empty) return Qnil;
3304
3305 beg = p - RSTRING_PTR(str);
3306
3307 VALUE str2 = str_subseq(str, beg, len);
3308 rb_enc_cr_str_copy_for_substr(str2, str);
3309 return str2;
3310}
3311
3312/* :nodoc: */
3313VALUE
3315{
3316 if (CHILLED_STRING_P(str)) {
3317 FL_UNSET_RAW(str, STR_CHILLED);
3318 }
3319
3320 if (OBJ_FROZEN(str)) return str;
3321 rb_str_resize(str, RSTRING_LEN(str));
3322 return rb_obj_freeze(str);
3323}
3324
3325/*
3326 * call-seq:
3327 * +string -> new_string or self
3328 *
3329 * Returns +self+ if +self+ is not frozen and can be mutated
3330 * without warning issuance.
3331 *
3332 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3333 *
3334 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@FreezingUnfreezing].
3335 */
3336static VALUE
3337str_uplus(VALUE str)
3338{
3339 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3340 return rb_str_dup(str);
3341 }
3342 else {
3343 return str;
3344 }
3345}
3346
3347/*
3348 * call-seq:
3349 * -self -> frozen_string
3350 *
3351 * Returns a frozen string equal to +self+.
3352 *
3353 * The returned string is +self+ if and only if all of the following are true:
3354 *
3355 * - +self+ is already frozen.
3356 * - +self+ is an instance of \String (rather than of a subclass of \String)
3357 * - +self+ has no instance variables set on it.
3358 *
3359 * Otherwise, the returned string is a frozen copy of +self+.
3360 *
3361 * Returning +self+, when possible, saves duplicating +self+;
3362 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3363 *
3364 * It may also save duplicating other, already-existing, strings:
3365 *
3366 * s0 = 'foo'
3367 * s1 = 'foo'
3368 * s0.object_id == s1.object_id # => false
3369 * (-s0).object_id == (-s1).object_id # => true
3370 *
3371 * Note that method #-@ is convenient for defining a constant:
3372 *
3373 * FileName = -'config/database.yml'
3374 *
3375 * While its alias #dedup is better suited for chaining:
3376 *
3377 * 'foo'.dedup.gsub!('o')
3378 *
3379 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@FreezingUnfreezing].
3380 */
3381static VALUE
3382str_uminus(VALUE str)
3383{
3384 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3385 str = rb_str_dup(str);
3386 }
3387 return rb_fstring(str);
3388}
3389
/* NOTE(review): RUBY_ALIAS_FUNCTION presumably emits a definition of
 * rb_str_dup_frozen() that forwards to rb_str_new_frozen(str) -- confirm
 * against the macro's definition. */
3390 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
/* From here on, uses of rb_str_dup_frozen in this file expand to
 * rb_str_new_frozen. */
3391 #define rb_str_dup_frozen rb_str_new_frozen
3392
3393VALUE
3395{
3396 rb_check_frozen(str);
3397 if (FL_TEST(str, STR_TMPLOCK)) {
3398 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3399 }
3400 FL_SET(str, STR_TMPLOCK);
3401 return str;
3402}
3403
3404VALUE
3406{
3407 rb_check_frozen(str);
3408 if (!FL_TEST(str, STR_TMPLOCK)) {
3409 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3410 }
3411 FL_UNSET(str, STR_TMPLOCK);
3412 return str;
3413}
3414
3415VALUE
3416rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3417{
3418 rb_str_locktmp(str);
3419 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3420}
3421
3422void
3424{
3425 RUBY_ASSERT(ruby_thread_has_gvl_p());
3426
3427 long capa;
3428 const int termlen = TERM_LEN(str);
3429
3430 str_modifiable(str);
3431 if (STR_SHARED_P(str)) {
3432 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3433 }
3434 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3435 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3436 }
3437
3438 int cr = ENC_CODERANGE(str);
3439 if (len == 0) {
3440 /* Empty string does not contain non-ASCII */
3442 }
3443 else if (cr == ENC_CODERANGE_UNKNOWN) {
3444 /* Leave unknown. */
3445 }
3446 else if (len > RSTRING_LEN(str)) {
3447 if (ENC_CODERANGE_CLEAN_P(cr)) {
3448 /* Update the coderange regarding the extended part. */
3449 const char *const prev_end = RSTRING_END(str);
3450 const char *const new_end = RSTRING_PTR(str) + len;
3451 rb_encoding *enc = rb_enc_get(str);
3452 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3453 ENC_CODERANGE_SET(str, cr);
3454 }
3455 else if (cr == ENC_CODERANGE_BROKEN) {
3456 /* May be valid now, by appended part. */
3458 }
3459 }
3460 else if (len < RSTRING_LEN(str)) {
3461 if (cr != ENC_CODERANGE_7BIT) {
3462 /* ASCII-only string is keeping after truncated. Valid
3463 * and broken may be invalid or valid, leave unknown. */
3465 }
3466 }
3467
3468 STR_SET_LEN(str, len);
3469 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3470}
3471
3472VALUE
3473rb_str_resize(VALUE str, long len)
3474{
3475 if (len < 0) {
3476 rb_raise(rb_eArgError, "negative string size (or size too big)");
3477 }
3478
3479 int independent = str_independent(str);
3480 long slen = RSTRING_LEN(str);
3481 const int termlen = TERM_LEN(str);
3482
3483 if (slen > len || (termlen != 1 && slen < len)) {
3485 }
3486
3487 {
3488 long capa;
3489 if (STR_EMBED_P(str)) {
3490 if (len == slen) return str;
3491 if (str_embed_capa(str) >= len + termlen) {
3492 STR_SET_LEN(str, len);
3493 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3494 return str;
3495 }
3496 str_make_independent_expand(str, slen, len - slen, termlen);
3497 }
3498 else if (str_embed_capa(str) >= len + termlen) {
3499 capa = RSTRING(str)->as.heap.aux.capa;
3500 char *ptr = STR_HEAP_PTR(str);
3501 STR_SET_EMBED(str);
3502 if (slen > len) slen = len;
3503 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3504 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3505 STR_SET_LEN(str, len);
3506 if (independent) {
3507 SIZED_FREE_N(ptr, capa + termlen);
3508 }
3509 return str;
3510 }
3511 else if (!independent) {
3512 if (len == slen) return str;
3513 str_make_independent_expand(str, slen, len - slen, termlen);
3514 }
3515 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3516 (capa - len) > (len < 1024 ? len : 1024)) {
3517 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3518 (size_t)len + termlen, STR_HEAP_SIZE(str));
3519 RSTRING(str)->as.heap.aux.capa = len;
3520 }
3521 else if (len == slen) return str;
3522 STR_SET_LEN(str, len);
3523 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3524 }
3525 return str;
3526}
3527
3528static void
3529str_ensure_available_capa(VALUE str, long len)
3530{
3531 str_modify_keep_cr(str);
3532
3533 const int termlen = TERM_LEN(str);
3534 long olen = RSTRING_LEN(str);
3535
3536 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3537 rb_raise(rb_eArgError, "string sizes too big");
3538 }
3539
3540 long total = olen + len;
3541 long capa = str_capacity(str, termlen);
3542
3543 if (capa < total) {
3544 if (total >= LONG_MAX / 2) {
3545 capa = total;
3546 }
3547 while (total > capa) {
3548 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3549 }
3550 RESIZE_CAPA_TERM(str, capa, termlen);
3551 }
3552}
3553
3554static VALUE
3555str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3556{
3557 if (keep_cr) {
3558 str_modify_keep_cr(str);
3559 }
3560 else {
3561 rb_str_modify(str);
3562 }
3563 if (len == 0) return 0;
3564
3565 long total, olen, off = -1;
3566 char *sptr;
3567 const int termlen = TERM_LEN(str);
3568
3569 RSTRING_GETMEM(str, sptr, olen);
3570 if (ptr >= sptr && ptr <= sptr + olen) {
3571 off = ptr - sptr;
3572 }
3573
3574 long capa = str_capacity(str, termlen);
3575
3576 if (olen > LONG_MAX - len) {
3577 rb_raise(rb_eArgError, "string sizes too big");
3578 }
3579 total = olen + len;
3580 if (capa < total) {
3581 if (total >= LONG_MAX / 2) {
3582 capa = total;
3583 }
3584 while (total > capa) {
3585 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3586 }
3587 RESIZE_CAPA_TERM(str, capa, termlen);
3588 sptr = RSTRING_PTR(str);
3589 }
3590 if (off != -1) {
3591 ptr = sptr + off;
3592 }
3593 memcpy(sptr + olen, ptr, len);
3594 STR_SET_LEN(str, total);
3595 TERM_FILL(sptr + total, termlen); /* sentinel */
3596
3597 return str;
3598}
3599
3600#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3601#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3602
3603VALUE
3604rb_str_cat(VALUE str, const char *ptr, long len)
3605{
3606 if (len == 0) return str;
3607 if (len < 0) {
3608 rb_raise(rb_eArgError, "negative string size (or size too big)");
3609 }
3610 return str_buf_cat(str, ptr, len);
3611}
3612
3613VALUE
3614rb_str_cat_cstr(VALUE str, const char *ptr)
3615{
3616 must_not_null(ptr);
3617 return rb_str_buf_cat(str, ptr, strlen(ptr));
3618}
3619
3620static void
3621rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3622{
3623 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3624
3625 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3626 if (UNLIKELY(!str_independent(str))) {
3627 str_make_independent(str);
3628 }
3629
3630 long string_length = -1;
3631 const int null_terminator_length = 1;
3632 char *sptr;
3633 RSTRING_GETMEM(str, sptr, string_length);
3634
3635 // Ensure the resulting string wouldn't be too long.
3636 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3637 rb_raise(rb_eArgError, "string sizes too big");
3638 }
3639
3640 long string_capacity = str_capacity(str, null_terminator_length);
3641
3642 // Get the code range before any modifications since those might clear the code range.
3643 int cr = ENC_CODERANGE(str);
3644
3645 // Check if the string has spare string_capacity to write the new byte.
3646 if (LIKELY(string_capacity >= string_length + 1)) {
3647 // In fast path we can write the new byte and note the string's new length.
3648 sptr[string_length] = byte;
3649 STR_SET_LEN(str, string_length + 1);
3650 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3651 }
3652 else {
3653 // If there's not enough string_capacity, make a call into the general string concatenation function.
3654 str_buf_cat(str, (char *)&byte, 1);
3655 }
3656
3657 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3658 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3659 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3660 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3661 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3662 if (ISASCII(byte)) {
3664 }
3665 else {
3667
3668 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3669 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3670 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3671 }
3672 }
3673 }
3674}
3675
/* Alternative names for the concatenation entry points.
 * NOTE(review): RUBY_ALIAS_FUNCTION is defined outside this view; presumably
 * each line makes the prototype in the first argument forward to the function
 * named in the second, passing the listed arguments -- confirm against the
 * macro definition. */
3676 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3677 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3678 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3679
3680static VALUE
3681rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3682 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3683{
3684 int str_encindex = ENCODING_GET(str);
3685 int res_encindex;
3686 int str_cr, res_cr;
3687 rb_encoding *str_enc, *ptr_enc;
3688
3689 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3690
3691 if (str_encindex == ptr_encindex) {
3692 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3693 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3694 }
3695 }
3696 else {
3697 str_enc = rb_enc_from_index(str_encindex);
3698 ptr_enc = rb_enc_from_index(ptr_encindex);
3699 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3700 if (len == 0)
3701 return str;
3702 if (RSTRING_LEN(str) == 0) {
3703 rb_str_buf_cat(str, ptr, len);
3704 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3705 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3706 return str;
3707 }
3708 goto incompatible;
3709 }
3710 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3711 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3712 }
3713 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3714 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3715 str_cr = rb_enc_str_coderange(str);
3716 }
3717 }
3718 }
3719 if (ptr_cr_ret)
3720 *ptr_cr_ret = ptr_cr;
3721
3722 if (str_encindex != ptr_encindex &&
3723 str_cr != ENC_CODERANGE_7BIT &&
3724 ptr_cr != ENC_CODERANGE_7BIT) {
3725 str_enc = rb_enc_from_index(str_encindex);
3726 ptr_enc = rb_enc_from_index(ptr_encindex);
3727 goto incompatible;
3728 }
3729
3730 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3731 res_encindex = str_encindex;
3732 res_cr = ENC_CODERANGE_UNKNOWN;
3733 }
3734 else if (str_cr == ENC_CODERANGE_7BIT) {
3735 if (ptr_cr == ENC_CODERANGE_7BIT) {
3736 res_encindex = str_encindex;
3737 res_cr = ENC_CODERANGE_7BIT;
3738 }
3739 else {
3740 res_encindex = ptr_encindex;
3741 res_cr = ptr_cr;
3742 }
3743 }
3744 else if (str_cr == ENC_CODERANGE_VALID) {
3745 res_encindex = str_encindex;
3746 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3747 res_cr = str_cr;
3748 else
3749 res_cr = ptr_cr;
3750 }
3751 else { /* str_cr == ENC_CODERANGE_BROKEN */
3752 res_encindex = str_encindex;
3753 res_cr = str_cr;
3754 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3755 }
3756
3757 if (len < 0) {
3758 rb_raise(rb_eArgError, "negative string size (or size too big)");
3759 }
3760 str_buf_cat(str, ptr, len);
3761 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3762 return str;
3763
3764 incompatible:
3765 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3766 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3768}
3769
3770VALUE
3771rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3772{
3773 return rb_enc_cr_str_buf_cat(str, ptr, len,
3774 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3775}
3776
3777VALUE
3779{
3780 /* ptr must reference NUL terminated ASCII string. */
3781 int encindex = ENCODING_GET(str);
3782 rb_encoding *enc = rb_enc_from_index(encindex);
3783 if (rb_enc_asciicompat(enc)) {
3784 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3785 encindex, ENC_CODERANGE_7BIT, 0);
3786 }
3787 else {
3788 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3789 while (*ptr) {
3790 unsigned int c = (unsigned char)*ptr;
3791 int len = rb_enc_codelen(c, enc);
3792 rb_enc_mbcput(c, buf, enc);
3793 rb_enc_cr_str_buf_cat(str, buf, len,
3794 encindex, ENC_CODERANGE_VALID, 0);
3795 ptr++;
3796 }
3797 return str;
3798 }
3799}
3800
3801VALUE
3803{
3804 int str2_cr = rb_enc_str_coderange(str2);
3805
3806 if (rb_str_enc_fastpath(str)) {
3807 switch (str2_cr) {
3808 case ENC_CODERANGE_7BIT:
3809 // If RHS is 7bit we can do simple concatenation
3810 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3811 RB_GC_GUARD(str2);
3812 return str;
3814 // If RHS is valid, we can do simple concatenation if encodings are the same
3815 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3816 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3817 int str_cr = ENC_CODERANGE(str);
3818 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3819 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3820 }
3821 RB_GC_GUARD(str2);
3822 return str;
3823 }
3824 }
3825 }
3826
3827 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3828 ENCODING_GET(str2), str2_cr, &str2_cr);
3829
3830 ENC_CODERANGE_SET(str2, str2_cr);
3831
3832 return str;
3833}
3834
3835VALUE
3837{
3838 StringValue(str2);
3839 return rb_str_buf_append(str, str2);
3840}
3841
3842VALUE
3843rb_str_concat_literals(size_t num, const VALUE *strary)
3844{
3845 VALUE str;
3846 size_t i, s = 0;
3847 unsigned long len = 1;
3848
3849 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3850 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3851
3852 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3853 str = rb_str_buf_new(len);
3854 str_enc_copy_direct(str, strary[0]);
3855
3856 for (i = s; i < num; ++i) {
3857 const VALUE v = strary[i];
3858 int encidx = ENCODING_GET(v);
3859
3860 rb_str_buf_append(str, v);
3861 if (encidx != ENCINDEX_US_ASCII) {
3862 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3863 rb_enc_set_index(str, encidx);
3864 }
3865 }
3866 return str;
3867}
3868
3869/*
3870 * call-seq:
3871 * concat(*objects) -> string
3872 *
3873 * :include: doc/string/concat.rdoc
3874 */
3875static VALUE
3876rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3877{
3878 str_modifiable(str);
3879
3880 if (argc == 1) {
3881 return rb_str_concat(str, argv[0]);
3882 }
3883 else if (argc > 1) {
3884 int i;
3885 VALUE arg_str = rb_str_tmp_new(0);
3886 rb_enc_copy(arg_str, str);
3887 for (i = 0; i < argc; i++) {
3888 rb_str_concat(arg_str, argv[i]);
3889 }
3890 rb_str_buf_append(str, arg_str);
3891 }
3892
3893 return str;
3894}
3895
3896/*
3897 * call-seq:
3898 * append_as_bytes(*objects) -> self
3899 *
3900 * Concatenates each object in +objects+ into +self+; returns +self+;
3901 * performs no encoding validation or conversion:
3902 *
3903 * s = 'foo'
3904 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3905 * s.valid_encoding? # => false
3906 * s.append_as_bytes("\xAC 12")
3907 * s.valid_encoding? # => true
3908 *
3909 * When a given object is an integer,
3910 * the value is considered an 8-bit byte;
3911 * if the integer occupies more than one byte (i.e., is greater than 255),
3912 * appends only the low-order byte (similar to String#setbyte):
3913 *
3914 * s = ""
3915 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3916 * s.bytesize # => 2
3917 *
3918 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3919 */
3920
3921VALUE
3922rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3923{
3924 long needed_capacity = 0;
3925 volatile VALUE t0;
3926 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3927
3928 for (int index = 0; index < argc; index++) {
3929 VALUE obj = argv[index];
3930 enum ruby_value_type type = types[index] = rb_type(obj);
3931 switch (type) {
3932 case T_FIXNUM:
3933 case T_BIGNUM:
3934 needed_capacity++;
3935 break;
3936 case T_STRING:
3937 needed_capacity += RSTRING_LEN(obj);
3938 break;
3939 default:
3940 rb_raise(
3942 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3943 rb_obj_class(obj)
3944 );
3945 break;
3946 }
3947 }
3948
3949 str_ensure_available_capa(str, needed_capacity);
3950 char *sptr = RSTRING_END(str);
3951
3952 for (int index = 0; index < argc; index++) {
3953 VALUE obj = argv[index];
3954 enum ruby_value_type type = types[index];
3955 switch (type) {
3956 case T_FIXNUM:
3957 case T_BIGNUM: {
3958 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3959 char byte = (char)(NUM2INT(obj) & 0xFF);
3960 *sptr = byte;
3961 sptr++;
3962 break;
3963 }
3964 case T_STRING: {
3965 const char *ptr;
3966 long len;
3967 RSTRING_GETMEM(obj, ptr, len);
3968 memcpy(sptr, ptr, len);
3969 sptr += len;
3970 break;
3971 }
3972 default:
3973 rb_bug("append_as_bytes arguments should have been validated");
3974 }
3975 }
3976
3977 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3978 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3979
3980 int cr = ENC_CODERANGE(str);
3981 switch (cr) {
3982 case ENC_CODERANGE_7BIT: {
3983 for (int index = 0; index < argc; index++) {
3984 VALUE obj = argv[index];
3985 enum ruby_value_type type = types[index];
3986 switch (type) {
3987 case T_FIXNUM:
3988 case T_BIGNUM: {
3989 if (!ISASCII(NUM2INT(obj))) {
3990 goto clear_cr;
3991 }
3992 break;
3993 }
3994 case T_STRING: {
3995 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3996 goto clear_cr;
3997 }
3998 break;
3999 }
4000 default:
4001 rb_bug("append_as_bytes arguments should have been validated");
4002 }
4003 }
4004 break;
4005 }
4007 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
4008 goto keep_cr;
4009 }
4010 else {
4011 goto clear_cr;
4012 }
4013 break;
4014 default:
4015 goto clear_cr;
4016 break;
4017 }
4018
4019 RB_GC_GUARD(t0);
4020
4021 clear_cr:
4022 // If no fast path was hit, we clear the coderange.
4023 // append_as_bytes is predominantly meant to be used in
4024 // buffering situation, hence it's likely the coderange
4025 // will never be scanned, so it's not worth spending time
4026 // precomputing the coderange except for simple and common
4027 // situations.
4029 keep_cr:
4030 return str;
4031}
4032
4033/*
4034 * call-seq:
4035 * self << object -> self
4036 *
4037 * Appends a string representation of +object+ to +self+;
4038 * returns +self+.
4039 *
4040 * If +object+ is a string, appends it to +self+:
4041 *
4042 * s = 'foo'
4043 * s << 'bar' # => "foobar"
4044 * s # => "foobar"
4045 *
4046 * If +object+ is an integer,
4047 * its value is considered a codepoint;
4048 * converts the value to a character before concatenating:
4049 *
4050 * s = 'foo'
4051 * s << 33 # => "foo!"
4052 *
4053 * Additionally, if the codepoint is in range <tt>0..0xff</tt>
4054 * and the encoding of +self+ is Encoding::US_ASCII,
4055 * changes the encoding to Encoding::ASCII_8BIT:
4056 *
4057 * s = 'foo'.encode(Encoding::US_ASCII)
4058 * s.encoding # => #<Encoding:US-ASCII>
4059 * s << 0xff # => "foo\xFF"
4060 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4061 *
4062 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4063 *
4064 * s = 'foo'
4065 * s.encoding # => <Encoding:UTF-8>
4066 * s << 0x00110000 # 1114112 out of char range (RangeError)
4067 * s = 'foo'.encode(Encoding::EUC_JP)
4068 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4069 *
4070 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4071 */
4072VALUE
4074{
4075 unsigned int code;
4076 rb_encoding *enc = STR_ENC_GET(str1);
4077 int encidx;
4078
4079 if (RB_INTEGER_TYPE_P(str2)) {
4080 if (rb_num_to_uint(str2, &code) == 0) {
4081 }
4082 else if (FIXNUM_P(str2)) {
4083 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4084 }
4085 else {
4086 rb_raise(rb_eRangeError, "bignum out of char range");
4087 }
4088 }
4089 else {
4090 return rb_str_append(str1, str2);
4091 }
4092
4093 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4094
4095 if (encidx >= 0) {
4096 rb_str_buf_cat_byte(str1, (unsigned char)code);
4097 }
4098 else {
4099 long pos = RSTRING_LEN(str1);
4100 int cr = ENC_CODERANGE(str1);
4101 int len;
4102 char *buf;
4103
4104 switch (len = rb_enc_codelen(code, enc)) {
4105 case ONIGERR_INVALID_CODE_POINT_VALUE:
4106 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4107 break;
4108 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4109 case 0:
4110 rb_raise(rb_eRangeError, "%u out of char range", code);
4111 break;
4112 }
4113 buf = ALLOCA_N(char, len + 1);
4114 rb_enc_mbcput(code, buf, enc);
4115 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4116 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4117 }
4118 rb_str_resize(str1, pos+len);
4119 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4120 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4122 }
4123 else if (cr == ENC_CODERANGE_BROKEN) {
4125 }
4126 ENC_CODERANGE_SET(str1, cr);
4127 }
4128 return str1;
4129}
4130
4131int
4132rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4133{
4134 int encidx = rb_enc_to_index(enc);
4135
4136 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4137 /* US-ASCII automatically extended to ASCII-8BIT */
4138 if (code > 0xFF) {
4139 rb_raise(rb_eRangeError, "%u out of char range", code);
4140 }
4141 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4142 return ENCINDEX_ASCII_8BIT;
4143 }
4144 return encidx;
4145 }
4146 else {
4147 return -1;
4148 }
4149}
4150
4151/*
4152 * call-seq:
4153 * prepend(*other_strings) -> self
4154 *
4155 * Prefixes to +self+ the concatenation of the given +other_strings+; returns +self+:
4156 *
4157 * 'baz'.prepend('foo', 'bar') # => "foobarbaz"
4158 *
4159 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4160 *
4161 */
4162
4163static VALUE
4164rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4165{
4166 str_modifiable(str);
4167
4168 if (argc == 1) {
4169 rb_str_update(str, 0L, 0L, argv[0]);
4170 }
4171 else if (argc > 1) {
4172 int i;
4173 VALUE arg_str = rb_str_tmp_new(0);
4174 rb_enc_copy(arg_str, str);
4175 for (i = 0; i < argc; i++) {
4176 rb_str_append(arg_str, argv[i]);
4177 }
4178 rb_str_update(str, 0L, 0L, arg_str);
4179 }
4180
4181 return str;
4182}
4183
4184st_index_t
4186{
4187 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4188 st_index_t precomputed_hash;
4189 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4190
4191 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4192 return precomputed_hash;
4193 }
4194
4195 return str_do_hash(str);
4196}
4197
4198int
4200{
4201 long len1, len2;
4202 const char *ptr1, *ptr2;
4203 RSTRING_GETMEM(str1, ptr1, len1);
4204 RSTRING_GETMEM(str2, ptr2, len2);
4205 return (len1 != len2 ||
4206 !rb_str_comparable(str1, str2) ||
4207 memcmp(ptr1, ptr2, len1) != 0);
4208}
4209
4210/*
4211 * call-seq:
4212 * hash -> integer
4213 *
4214 * :include: doc/string/hash.rdoc
4215 *
4216 */
4217
4218static VALUE
4219rb_str_hash_m(VALUE str)
4220{
4221 st_index_t hval = rb_str_hash(str);
4222 return ST2FIX(hval);
4223}
4224
/* Smaller of two values; yields a when they compare equal.
 * Each argument may be evaluated twice, so pass side-effect-free expressions. */
#define lesser(a,b) (((b) < (a)) ? (b) : (a))
4226
4227int
4229{
4230 int idx1, idx2;
4231 int rc1, rc2;
4232
4233 if (RSTRING_LEN(str1) == 0) return TRUE;
4234 if (RSTRING_LEN(str2) == 0) return TRUE;
4235 idx1 = ENCODING_GET(str1);
4236 idx2 = ENCODING_GET(str2);
4237 if (idx1 == idx2) return TRUE;
4238 rc1 = rb_enc_str_coderange(str1);
4239 rc2 = rb_enc_str_coderange(str2);
4240 if (rc1 == ENC_CODERANGE_7BIT) {
4241 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4242 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4243 return TRUE;
4244 }
4245 if (rc2 == ENC_CODERANGE_7BIT) {
4246 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4247 return TRUE;
4248 }
4249 return FALSE;
4250}
4251
4252int
4254{
4255 long len1, len2;
4256 const char *ptr1, *ptr2;
4257 int retval;
4258
4259 if (str1 == str2) return 0;
4260 RSTRING_GETMEM(str1, ptr1, len1);
4261 RSTRING_GETMEM(str2, ptr2, len2);
4262 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4263 if (len1 == len2) {
4264 if (!rb_str_comparable(str1, str2)) {
4265 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4266 return 1;
4267 return -1;
4268 }
4269 return 0;
4270 }
4271 if (len1 > len2) return 1;
4272 return -1;
4273 }
4274 if (retval > 0) return 1;
4275 return -1;
4276}
4277
4278/*
4279 * call-seq:
4280 * self == other -> true or false
4281 *
4282 * Returns whether +other+ is equal to +self+.
4283 *
4284 * When +other+ is a string, returns whether +other+ has the same length and content as +self+:
4285 *
4286 * s = 'foo'
4287 * s == 'foo' # => true
4288 * s == 'food' # => false
4289 * s == 'FOO' # => false
4290 *
4291 * Returns +false+ if the two strings' encodings are not compatible:
4292 *
4293 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4294 *
4295 * When +other+ is not a string:
4296 *
4297 * - If +other+ responds to method <tt>to_str</tt>,
4298 * <tt>other == self</tt> is called and its return value is returned.
4299 * - If +other+ does not respond to <tt>to_str</tt>,
4300 * +false+ is returned.
4301 *
4302 * Related: {Comparing}[rdoc-ref:String@Comparing].
4303 */
4304
4305VALUE
4307{
4308 if (str1 == str2) return Qtrue;
4309 if (!RB_TYPE_P(str2, T_STRING)) {
4310 if (!rb_respond_to(str2, idTo_str)) {
4311 return Qfalse;
4312 }
4313 return rb_equal(str2, str1);
4314 }
4315 return rb_str_eql_internal(str1, str2);
4316}
4317
4318/*
4319 * call-seq:
4320 * eql?(object) -> true or false
4321 *
4322 * :include: doc/string/eql_p.rdoc
4323 *
4324 */
4325
4326VALUE
4327rb_str_eql(VALUE str1, VALUE str2)
4328{
4329 if (str1 == str2) return Qtrue;
4330 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4331 return rb_str_eql_internal(str1, str2);
4332}
4333
4334/*
4335 * call-seq:
4336 * self <=> other -> -1, 0, 1, or nil
4337 *
4338 * Compares +self+ and +other+,
4339 * evaluating their _contents_, not their _lengths_.
4340 *
4341 * Returns:
4342 *
4343 * - +-1+, if +self+ is smaller.
4344 * - +0+, if the two are equal.
4345 * - +1+, if +self+ is larger.
4346 * - +nil+, if the two are incomparable.
4347 *
4348 * Examples:
4349 *
4350 * 'a' <=> 'b' # => -1
4351 * 'a' <=> 'ab' # => -1
4352 * 'a' <=> 'a' # => 0
4353 * 'b' <=> 'a' # => 1
4354 * 'ab' <=> 'a' # => 1
4355 * 'a' <=> :a # => nil
4356 *
4357 * \Class \String includes module Comparable,
4358 * each of whose methods uses String#<=> for comparison.
4359 *
4360 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4361 */
4362
4363static VALUE
4364rb_str_cmp_m(VALUE str1, VALUE str2)
4365{
4366 int result;
4367 VALUE s = rb_check_string_type(str2);
4368 if (NIL_P(s)) {
4369 return rb_invcmp(str1, str2);
4370 }
4371 result = rb_str_cmp(str1, s);
4372 return INT2FIX(result);
4373}
4374
/* Forward declarations: rb_str_casecmp and rb_str_casecmp_p call these
 * helpers before their definitions appear further down in the file. */
4375 static VALUE str_casecmp(VALUE str1, VALUE str2);
4376 static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4377
4378/*
4379 * call-seq:
4380 * casecmp(other_string) -> -1, 0, 1, or nil
4381 *
4382 * Ignoring case, compares +self+ and +other_string+; returns:
4383 *
4384 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4385 * - 0 if the two are equal.
4386 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4387 * - +nil+ if the two are incomparable.
4388 *
4389 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4390 *
4391 * Examples:
4392 *
4393 * 'foo'.casecmp('goo') # => -1
4394 * 'goo'.casecmp('foo') # => 1
4395 * 'foo'.casecmp('food') # => -1
4396 * 'food'.casecmp('foo') # => 1
4397 * 'FOO'.casecmp('foo') # => 0
4398 * 'foo'.casecmp('FOO') # => 0
4399 * 'foo'.casecmp(1) # => nil
4400 *
4401 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4402 */
4403
4404VALUE
4405rb_str_casecmp(VALUE str1, VALUE str2)
4406{
4407 VALUE s = rb_check_string_type(str2);
4408 if (NIL_P(s)) {
4409 return Qnil;
4410 }
4411 return str_casecmp(str1, s);
4412}
4413
4414static VALUE
4415str_casecmp(VALUE str1, VALUE str2)
4416{
4417 long len;
4418 rb_encoding *enc;
4419 const char *p1, *p1end, *p2, *p2end;
4420
4421 enc = rb_enc_compatible(str1, str2);
4422 if (!enc) {
4423 return Qnil;
4424 }
4425
4426 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4427 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4428 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4429 while (p1 < p1end && p2 < p2end) {
4430 if (*p1 != *p2) {
4431 unsigned int c1 = TOLOWER(*p1 & 0xff);
4432 unsigned int c2 = TOLOWER(*p2 & 0xff);
4433 if (c1 != c2)
4434 return INT2FIX(c1 < c2 ? -1 : 1);
4435 }
4436 p1++;
4437 p2++;
4438 }
4439 }
4440 else {
4441 while (p1 < p1end && p2 < p2end) {
4442 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4443 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4444
4445 if (0 <= c1 && 0 <= c2) {
4446 c1 = TOLOWER(c1);
4447 c2 = TOLOWER(c2);
4448 if (c1 != c2)
4449 return INT2FIX(c1 < c2 ? -1 : 1);
4450 }
4451 else {
4452 int r;
4453 l1 = rb_enc_mbclen(p1, p1end, enc);
4454 l2 = rb_enc_mbclen(p2, p2end, enc);
4455 len = l1 < l2 ? l1 : l2;
4456 r = memcmp(p1, p2, len);
4457 if (r != 0)
4458 return INT2FIX(r < 0 ? -1 : 1);
4459 if (l1 != l2)
4460 return INT2FIX(l1 < l2 ? -1 : 1);
4461 }
4462 p1 += l1;
4463 p2 += l2;
4464 }
4465 }
4466 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4467 if (p1 == p1end) return INT2FIX(-1);
4468 return INT2FIX(1);
4469}
4470
4471/*
4472 * call-seq:
4473 * casecmp?(other_string) -> true, false, or nil
4474 *
4475 * Returns +true+ if +self+ and +other_string+ are equal after
4476 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4477 *
4478 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4479 *
4480 * Examples:
4481 *
4482 * 'foo'.casecmp?('goo') # => false
4483 * 'goo'.casecmp?('foo') # => false
4484 * 'foo'.casecmp?('food') # => false
4485 * 'food'.casecmp?('foo') # => false
4486 * 'FOO'.casecmp?('foo') # => true
4487 * 'foo'.casecmp?('FOO') # => true
4488 * 'foo'.casecmp?(1) # => nil
4489 *
4490 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4491 */
4492
4493static VALUE
4494rb_str_casecmp_p(VALUE str1, VALUE str2)
4495{
4496 VALUE s = rb_check_string_type(str2);
4497 if (NIL_P(s)) {
4498 return Qnil;
4499 }
4500 return str_casecmp_p(str1, s);
4501}
4502
4503static VALUE
4504str_casecmp_p(VALUE str1, VALUE str2)
4505{
4506 rb_encoding *enc;
4507 VALUE folded_str1, folded_str2;
4508 VALUE fold_opt = sym_fold;
4509
4510 enc = rb_enc_compatible(str1, str2);
4511 if (!enc) {
4512 return Qnil;
4513 }
4514
4515 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4516 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4517
4518 return rb_str_eql(folded_str1, folded_str2);
4519}
4520
4521static long
4522strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4523 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4524{
4525 const char *search_start = str_ptr;
4526 long pos, search_len = str_len - offset;
4527
4528 for (;;) {
4529 const char *t;
4530 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4531 if (pos < 0) return pos;
4532 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4533 if (t == search_start + pos) break;
4534 search_len -= t - search_start;
4535 if (search_len <= 0) return -1;
4536 offset += t - search_start;
4537 search_start = t;
4538 }
4539 return pos + offset;
4540}
4541
/* Both return the found index in bytes; they differ in how the starting
 * offset is read: characters for rb_str_index, bytes for rb_str_byteindex. */
#define rb_str_index(str, sub, offset) rb_strseq_index((str), (sub), (offset), 0)
#define rb_str_byteindex(str, sub, offset) rb_strseq_index((str), (sub), (offset), 1)
4545
4546static long
4547rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4548{
4549 const char *str_ptr, *str_ptr_end, *sub_ptr;
4550 long str_len, sub_len;
4551 rb_encoding *enc;
4552
4553 enc = rb_enc_check(str, sub);
4554 if (is_broken_string(sub)) return -1;
4555
4556 str_ptr = RSTRING_PTR(str);
4557 str_ptr_end = RSTRING_END(str);
4558 str_len = RSTRING_LEN(str);
4559 sub_ptr = RSTRING_PTR(sub);
4560 sub_len = RSTRING_LEN(sub);
4561
4562 if (str_len < sub_len) return -1;
4563
4564 if (offset != 0) {
4565 long str_len_char, sub_len_char;
4566 int single_byte = single_byte_optimizable(str);
4567 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4568 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4569 if (offset < 0) {
4570 offset += str_len_char;
4571 if (offset < 0) return -1;
4572 }
4573 if (str_len_char - offset < sub_len_char) return -1;
4574 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4575 str_ptr += offset;
4576 }
4577 if (sub_len == 0) return offset;
4578
4579 /* need proceed one character at a time */
4580 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4581}
4582
4583
4584/*
4585 * call-seq:
4586 * index(pattern, offset = 0) -> integer or nil
4587 *
4588 * :include: doc/string/index.rdoc
4589 *
4590 */
4591
4592static VALUE
4593rb_str_index_m(int argc, VALUE *argv, VALUE str)
4594{
4595 VALUE sub;
4596 VALUE initpos;
4597 rb_encoding *enc = STR_ENC_GET(str);
4598 long pos;
4599
4600 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4601 long slen = str_strlen(str, enc); /* str's enc */
4602 pos = NUM2LONG(initpos);
4603 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4604 if (RB_TYPE_P(sub, T_REGEXP)) {
4606 }
4607 return Qnil;
4608 }
4609 }
4610 else {
4611 pos = 0;
4612 }
4613
4614 if (RB_TYPE_P(sub, T_REGEXP)) {
4615 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4616 enc, single_byte_optimizable(str));
4617
4618 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4619 VALUE match = rb_backref_get();
4620 pos = rb_str_sublen(str, RMATCH_BEG(match, 0));
4621 return LONG2NUM(pos);
4622 }
4623 }
4624 else {
4625 StringValue(sub);
4626 pos = rb_str_index(str, sub, pos);
4627 if (pos >= 0) {
4628 pos = rb_str_sublen(str, pos);
4629 return LONG2NUM(pos);
4630 }
4631 }
4632 return Qnil;
4633}
4634
4635/* Ensure that the given pos is a valid character boundary.
4636 * Note that in this function, "character" means a code point
4637 * (Unicode scalar value), not a grapheme cluster.
4638 */
4639static void
4640str_ensure_byte_pos(VALUE str, long pos)
4641{
4642 if (!single_byte_optimizable(str)) {
4643 const char *s = RSTRING_PTR(str);
4644 const char *e = RSTRING_END(str);
4645 const char *p = s + pos;
4646 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4647 rb_raise(rb_eIndexError,
4648 "offset %ld does not land on character boundary", pos);
4649 }
4650 }
4651}
4652
4653/*
4654 * call-seq:
4655 * byteindex(object, offset = 0) -> integer or nil
4656 *
4657 * Returns the 0-based integer index of a substring of +self+
4658 * specified by +object+ (a string or Regexp) and +offset+,
4659 * or +nil+ if there is no such substring;
4660 * the returned index is the count of _bytes_ (not characters).
4661 *
4662 * When +object+ is a string,
4663 * returns the index of the first found substring equal to +object+:
4664 *
4665 * s = 'foo' # => "foo"
4666 * s.size # => 3 # Three 1-byte characters.
4667 * s.bytesize # => 3 # Three bytes.
4668 * s.byteindex('f') # => 0
4669 * s.byteindex('o') # => 1
4670 * s.byteindex('oo') # => 1
4671 * s.byteindex('ooo') # => nil
4672 *
4673 * When +object+ is a Regexp,
4674 * returns the index of the first found substring matching +object+;
4675 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4676 *
4677 * s = 'foo'
4678 * s.byteindex(/f/) # => 0
4679 * $~ # => #<MatchData "f">
4680 * s.byteindex(/o/) # => 1
4681 * s.byteindex(/oo/) # => 1
4682 * s.byteindex(/ooo/) # => nil
4683 * $~ # => nil
4684 *
4685 * \Integer argument +offset+, if given, specifies the 0-based index
4686 * of the byte where searching is to begin.
4687 *
4688 * When +offset+ is non-negative,
4689 * searching begins at byte position +offset+:
4690 *
4691 * s = 'foo'
4692 * s.byteindex('o', 1) # => 1
4693 * s.byteindex('o', 2) # => 2
4694 * s.byteindex('o', 3) # => nil
4695 *
4696 * When +offset+ is negative, counts backward from the end of +self+:
4697 *
4698 * s = 'foo'
4699 * s.byteindex('o', -1) # => 2
4700 * s.byteindex('o', -2) # => 1
4701 * s.byteindex('o', -3) # => 1
4702 * s.byteindex('o', -4) # => nil
4703 *
4704 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4705 *
4706 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4707 * s.size # => 2 # Two 3-byte characters.
4708 * s.bytesize # => 6 # Six bytes.
4709 * s.byteindex("\uFFFF") # => 0
4710 * s.byteindex("\uFFFF", 1) # Raises IndexError
4711 * s.byteindex("\uFFFF", 2) # Raises IndexError
4712 * s.byteindex("\uFFFF", 3) # => 3
4713 * s.byteindex("\uFFFF", 4) # Raises IndexError
4714 * s.byteindex("\uFFFF", 5) # Raises IndexError
4715 * s.byteindex("\uFFFF", 6) # => nil
4716 *
4717 * Related: see {Querying}[rdoc-ref:String@Querying].
4718 */
4719
4720static VALUE
4721rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4722{
4723 VALUE sub;
4724 VALUE initpos;
4725 long pos;
4726
4727 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4728 long slen = RSTRING_LEN(str);
4729 pos = NUM2LONG(initpos);
4730 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4731 if (RB_TYPE_P(sub, T_REGEXP)) {
4733 }
4734 return Qnil;
4735 }
4736 }
4737 else {
4738 pos = 0;
4739 }
4740
4741 str_ensure_byte_pos(str, pos);
4742
4743 if (RB_TYPE_P(sub, T_REGEXP)) {
4744 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4745 VALUE match = rb_backref_get();
4746 pos = RMATCH_BEG(match, 0);
4747 return LONG2NUM(pos);
4748 }
4749 }
4750 else {
4751 StringValue(sub);
4752 pos = rb_str_byteindex(str, sub, pos);
4753 if (pos >= 0) return LONG2NUM(pos);
4754 }
4755 return Qnil;
4756}
4757
#ifndef HAVE_MEMRCHR
/* Fallback for platforms without memrchr: returns a pointer to the last
 * byte equal to chr within the first search_len bytes of search_str, or a
 * null pointer when there is none. */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    for (const char *cursor = search_str + search_len; cursor > search_str; ) {
        --cursor;
        if ((unsigned char)*cursor == chr) {
            return (void *)cursor;
        }
    }

    return ((void *)0);
}
#endif
4770
4771static long
4772str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4773{
4774 char *hit, *adjusted;
4775 int c;
4776 long slen, searchlen;
4777 char *sbeg, *e, *t;
4778
4779 sbeg = RSTRING_PTR(str);
4780 slen = RSTRING_LEN(sub);
4781 if (slen == 0) return s - sbeg;
4782 e = RSTRING_END(str);
4783 t = RSTRING_PTR(sub);
4784 c = *t & 0xff;
4785 searchlen = s - sbeg + 1;
4786
4787 if (memcmp(s, t, slen) == 0) {
4788 return s - sbeg;
4789 }
4790
4791 do {
4792 hit = memrchr(sbeg, c, searchlen);
4793 if (!hit) break;
4794 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4795 if (hit != adjusted) {
4796 searchlen = adjusted - sbeg;
4797 continue;
4798 }
4799 if (memcmp(hit, t, slen) == 0)
4800 return hit - sbeg;
4801 searchlen = adjusted - sbeg;
4802 } while (searchlen > 0);
4803
4804 return -1;
4805}
4806
4807/* found index in byte */
4808static long
4809rb_str_rindex(VALUE str, VALUE sub, long pos)
4810{
4811 long len, slen;
4812 char *sbeg, *s;
4813 rb_encoding *enc;
4814 int singlebyte;
4815
4816 enc = rb_enc_check(str, sub);
4817 if (is_broken_string(sub)) return -1;
4818 singlebyte = single_byte_optimizable(str);
4819 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4820 slen = str_strlen(sub, enc); /* rb_enc_check */
4821
4822 /* substring longer than string */
4823 if (len < slen) return -1;
4824 if (len - pos < slen) pos = len - slen;
4825 if (len == 0) return pos;
4826
4827 sbeg = RSTRING_PTR(str);
4828
4829 if (pos == 0) {
4830 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4831 return 0;
4832 else
4833 return -1;
4834 }
4835
4836 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4837 return str_rindex(str, sub, s, enc);
4838}
4839
4840/*
4841 * call-seq:
4842 * rindex(pattern, offset = self.length) -> integer or nil
4843 *
4844 * :include:doc/string/rindex.rdoc
4845 *
4846 */
4847
4848static VALUE
4849rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4850{
4851 VALUE sub;
4852 VALUE initpos;
4853 rb_encoding *enc = STR_ENC_GET(str);
4854 long pos, len = str_strlen(str, enc); /* str's enc */
4855
4856 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4857 pos = NUM2LONG(initpos);
4858 if (pos < 0 && (pos += len) < 0) {
4859 if (RB_TYPE_P(sub, T_REGEXP)) {
4861 }
4862 return Qnil;
4863 }
4864 if (pos > len) pos = len;
4865 }
4866 else {
4867 pos = len;
4868 }
4869
4870 if (RB_TYPE_P(sub, T_REGEXP)) {
4871 /* enc = rb_enc_check(str, sub); */
4872 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4873 enc, single_byte_optimizable(str));
4874
4875 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4876 VALUE match = rb_backref_get();
4877 pos = rb_str_sublen(str, RMATCH_BEG(match, 0));
4878 return LONG2NUM(pos);
4879 }
4880 }
4881 else {
4882 StringValue(sub);
4883 pos = rb_str_rindex(str, sub, pos);
4884 if (pos >= 0) {
4885 pos = rb_str_sublen(str, pos);
4886 return LONG2NUM(pos);
4887 }
4888 }
4889 return Qnil;
4890}
4891
4892static long
4893rb_str_byterindex(VALUE str, VALUE sub, long pos)
4894{
4895 long len, slen;
4896 char *sbeg, *s;
4897 rb_encoding *enc;
4898
4899 enc = rb_enc_check(str, sub);
4900 if (is_broken_string(sub)) return -1;
4901 len = RSTRING_LEN(str);
4902 slen = RSTRING_LEN(sub);
4903
4904 /* substring longer than string */
4905 if (len < slen) return -1;
4906 if (len - pos < slen) pos = len - slen;
4907 if (len == 0) return pos;
4908
4909 sbeg = RSTRING_PTR(str);
4910
4911 if (pos == 0) {
4912 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4913 return 0;
4914 else
4915 return -1;
4916 }
4917
4918 s = sbeg + pos;
4919 return str_rindex(str, sub, s, enc);
4920}
4921
4922/*
4923 * call-seq:
4924 * byterindex(object, offset = self.bytesize) -> integer or nil
4925 *
4926 * Returns the 0-based integer index of a substring of +self+
4927 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4928 * or +nil+ if there is no such substring;
4929 * the returned index is the count of _bytes_ (not characters).
4930 *
4931 * When +object+ is a string,
4932 * returns the index of the _last_ found substring equal to +object+:
4933 *
4934 * s = 'foo' # => "foo"
4935 * s.size # => 3 # Three 1-byte characters.
4936 * s.bytesize # => 3 # Three bytes.
4937 * s.byterindex('f') # => 0
4938 * s.byterindex('o') # => 2
4939 * s.byterindex('oo') # => 1
4940 * s.byterindex('ooo') # => nil
4941 *
4942 * When +object+ is a Regexp,
4943 * returns the index of the last found substring matching +object+;
4944 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4945 *
4946 * s = 'foo'
4947 * s.byterindex(/f/) # => 0
4948 * $~ # => #<MatchData "f">
4949 * s.byterindex(/o/) # => 2
4950 * s.byterindex(/oo/) # => 1
4951 * s.byterindex(/ooo/) # => nil
4952 * $~ # => nil
4953 *
4954 * The last match means starting at the possible last position,
4955 * not the last of the longest matches:
4956 *
4957 * s = 'foo'
4958 * s.byterindex(/o+/) # => 2
4959 * $~ #=> #<MatchData "o">
4960 *
4961 * To get the last longest match, use a negative lookbehind:
4962 *
4963 * s = 'foo'
4964 * s.byterindex(/(?<!o)o+/) # => 1
4965 * $~ # => #<MatchData "oo">
4966 *
4967 * Or use method #byteindex with negative lookahead:
4968 *
4969 * s = 'foo'
4970 * s.byteindex(/o+(?!.*o)/) # => 1
4971 * $~ #=> #<MatchData "oo">
4972 *
4973 * \Integer argument +offset+, if given, specifies the 0-based index
4974 * of the byte where searching is to end.
4975 *
4976 * When +offset+ is non-negative,
4977 * searching ends at byte position +offset+:
4978 *
4979 * s = 'foo'
4980 * s.byterindex('o', 0) # => nil
4981 * s.byterindex('o', 1) # => 1
4982 * s.byterindex('o', 2) # => 2
4983 * s.byterindex('o', 3) # => 2
4984 *
4985 * When +offset+ is negative, counts backward from the end of +self+:
4986 *
4987 * s = 'foo'
4988 * s.byterindex('o', -1) # => 2
4989 * s.byterindex('o', -2) # => 1
4990 * s.byterindex('o', -3) # => nil
4991 *
4992 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4993 *
4994 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4995 * s.size # => 2 # Two 3-byte characters.
4996 * s.bytesize # => 6 # Six bytes.
4997 * s.byterindex("\uFFFF") # => 3
4998 * s.byterindex("\uFFFF", 1) # Raises IndexError
4999 * s.byterindex("\uFFFF", 2) # Raises IndexError
5000 * s.byterindex("\uFFFF", 3) # => 3
5001 * s.byterindex("\uFFFF", 4) # Raises IndexError
5002 * s.byterindex("\uFFFF", 5) # Raises IndexError
5003 * s.byterindex("\uFFFF", 6) # => 3
5004 *
5005 * Related: see {Querying}[rdoc-ref:String@Querying].
5006 */
5007
5008static VALUE
5009rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
5010{
5011 VALUE sub;
5012 VALUE initpos;
5013 long pos, len = RSTRING_LEN(str);
5014
5015 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
5016 pos = NUM2LONG(initpos);
5017 if (pos < 0 && (pos += len) < 0) {
5018 if (RB_TYPE_P(sub, T_REGEXP)) {
5020 }
5021 return Qnil;
5022 }
5023 if (pos > len) pos = len;
5024 }
5025 else {
5026 pos = len;
5027 }
5028
5029 str_ensure_byte_pos(str, pos);
5030
5031 if (RB_TYPE_P(sub, T_REGEXP)) {
5032 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5033 VALUE match = rb_backref_get();
5034 pos = RMATCH_BEG(match, 0);
5035 return LONG2NUM(pos);
5036 }
5037 }
5038 else {
5039 StringValue(sub);
5040 pos = rb_str_byterindex(str, sub, pos);
5041 if (pos >= 0) return LONG2NUM(pos);
5042 }
5043 return Qnil;
5044}
5045
5046/*
5047 * call-seq:
5048 * self =~ other -> integer or nil
5049 *
5050 * When +other+ is a Regexp:
5051 *
5052 * - Returns the integer index (in characters) of the first match
5053 * for +self+ and +other+, or +nil+ if none;
5054 * - Updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables].
5055 *
5056 * Examples:
5057 *
5058 * 'foo' =~ /f/ # => 0
5059 * $~ # => #<MatchData "f">
5060 * 'foo' =~ /o/ # => 1
5061 * $~ # => #<MatchData "o">
5062 * 'foo' =~ /x/ # => nil
5063 * $~ # => nil
5064 *
5065 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5066 * (see Regexp#=~):
5067 *
5068 * number = nil
5069 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5070 * number # => nil # Not assigned.
5071 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5072 * number # => "9" # Assigned.
5073 *
5074 * When +other+ is not a Regexp, returns the value
5075 * returned by <tt>other =~ self</tt>.
5076 *
5077 * Related: see {Querying}[rdoc-ref:String@Querying].
5078 */
5079
5080static VALUE
5081rb_str_match(VALUE x, VALUE y)
5082{
5083 switch (OBJ_BUILTIN_TYPE(y)) {
5084 case T_STRING:
5085 rb_raise(rb_eTypeError, "type mismatch: String given");
5086
5087 case T_REGEXP:
5088 return rb_reg_match(y, x);
5089
5090 default:
5091 return rb_funcall(y, idEqTilde, 1, x);
5092 }
5093}
5094
5095
5096static VALUE get_pat(VALUE);
5097
5098
5099/*
5100 * call-seq:
5101 * match(pattern, offset = 0) -> matchdata or nil
5102 * match(pattern, offset = 0) {|matchdata| ... } -> object
5103 *
5104 * Creates a MatchData object based on +self+ and the given arguments;
5105 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5106 *
5107 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5108 *
5109 * regexp = Regexp.new(pattern)
5110 *
5111 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5112 * (see Regexp#match):
5113 *
5114 * matchdata = regexp.match(self[offset..])
5115 *
5116 * With no block given, returns the computed +matchdata+ or +nil+:
5117 *
5118 * 'foo'.match('f') # => #<MatchData "f">
5119 * 'foo'.match('o') # => #<MatchData "o">
5120 * 'foo'.match('x') # => nil
5121 * 'foo'.match('f', 1) # => nil
5122 * 'foo'.match('o', 1) # => #<MatchData "o">
5123 *
5124 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5125 * returns the block's return value:
5126 *
5127 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5128 *
5129 * With a block given and +nil+ +matchdata+, does not call the block:
5130 *
5131 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5132 *
5133 * Related: see {Querying}[rdoc-ref:String@Querying].
5134 */
5135
5136static VALUE
5137rb_str_match_m(int argc, VALUE *argv, VALUE str)
5138{
5139 VALUE re, result;
5140 if (argc < 1)
5141 rb_check_arity(argc, 1, 2);
5142 re = argv[0];
5143 argv[0] = str;
5144 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5145 if (!NIL_P(result) && rb_block_given_p()) {
5146 return rb_yield(result);
5147 }
5148 return result;
5149}
5150
5151/*
5152 * call-seq:
5153 * match?(pattern, offset = 0) -> true or false
5154 *
5155 * Returns whether a match is found for +self+ and the given arguments;
5156 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5157 *
5158 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5159 *
5160 * regexp = Regexp.new(pattern)
5161 *
5162 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5163 * +false+ otherwise:
5164 *
5165 * 'foo'.match?(/o/) # => true
5166 * 'foo'.match?('o') # => true
5167 * 'foo'.match?(/x/) # => false
5168 * 'foo'.match?('f', 1) # => false
5169 * 'foo'.match?('o', 1) # => true
5170 *
5171 * Related: see {Querying}[rdoc-ref:String@Querying].
5172 */
5173
5174static VALUE
5175rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5176{
5177 VALUE re;
5178 rb_check_arity(argc, 1, 2);
5179 re = get_pat(argv[0]);
5180 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5181}
5182
/* Outcome of stepping a character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR, /* input or result is not a usable character */
    NEIGHBOR_FOUND,    /* buffer now holds the neighbouring character */
    NEIGHBOR_WRAPPED   /* stepping ran off the end of the range */
};
5188
5189static enum neighbor_char
5190enc_succ_char(char *p, long len, rb_encoding *enc)
5191{
5192 long i;
5193 int l;
5194
5195 if (rb_enc_mbminlen(enc) > 1) {
5196 /* wchar, trivial case */
5197 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5198 if (!MBCLEN_CHARFOUND_P(r)) {
5199 return NEIGHBOR_NOT_CHAR;
5200 }
5201 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5202 l = rb_enc_code_to_mbclen(c, enc);
5203 if (!l) return NEIGHBOR_NOT_CHAR;
5204 if (l != len) return NEIGHBOR_WRAPPED;
5205 rb_enc_mbcput(c, p, enc);
5206 r = rb_enc_precise_mbclen(p, p + len, enc);
5207 if (!MBCLEN_CHARFOUND_P(r)) {
5208 return NEIGHBOR_NOT_CHAR;
5209 }
5210 return NEIGHBOR_FOUND;
5211 }
5212 while (1) {
5213 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5214 p[i] = '\0';
5215 if (i < 0)
5216 return NEIGHBOR_WRAPPED;
5217 ++((unsigned char*)p)[i];
5218 l = rb_enc_precise_mbclen(p, p+len, enc);
5219 if (MBCLEN_CHARFOUND_P(l)) {
5220 l = MBCLEN_CHARFOUND_LEN(l);
5221 if (l == len) {
5222 return NEIGHBOR_FOUND;
5223 }
5224 else {
5225 memset(p+l, 0xff, len-l);
5226 }
5227 }
5228 if (MBCLEN_INVALID_P(l) && i < len-1) {
5229 long len2;
5230 int l2;
5231 for (len2 = len-1; 0 < len2; len2--) {
5232 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5233 if (!MBCLEN_INVALID_P(l2))
5234 break;
5235 }
5236 memset(p+len2+1, 0xff, len-(len2+1));
5237 }
5238 }
5239}
5240
5241static enum neighbor_char
5242enc_pred_char(char *p, long len, rb_encoding *enc)
5243{
5244 long i;
5245 int l;
5246 if (rb_enc_mbminlen(enc) > 1) {
5247 /* wchar, trivial case */
5248 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5249 if (!MBCLEN_CHARFOUND_P(r)) {
5250 return NEIGHBOR_NOT_CHAR;
5251 }
5252 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5253 if (!c) return NEIGHBOR_NOT_CHAR;
5254 --c;
5255 l = rb_enc_code_to_mbclen(c, enc);
5256 if (!l) return NEIGHBOR_NOT_CHAR;
5257 if (l != len) return NEIGHBOR_WRAPPED;
5258 rb_enc_mbcput(c, p, enc);
5259 r = rb_enc_precise_mbclen(p, p + len, enc);
5260 if (!MBCLEN_CHARFOUND_P(r)) {
5261 return NEIGHBOR_NOT_CHAR;
5262 }
5263 return NEIGHBOR_FOUND;
5264 }
5265 while (1) {
5266 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5267 p[i] = '\xff';
5268 if (i < 0)
5269 return NEIGHBOR_WRAPPED;
5270 --((unsigned char*)p)[i];
5271 l = rb_enc_precise_mbclen(p, p+len, enc);
5272 if (MBCLEN_CHARFOUND_P(l)) {
5273 l = MBCLEN_CHARFOUND_LEN(l);
5274 if (l == len) {
5275 return NEIGHBOR_FOUND;
5276 }
5277 else {
5278 memset(p+l, 0, len-l);
5279 }
5280 }
5281 if (MBCLEN_INVALID_P(l) && i < len-1) {
5282 long len2;
5283 int l2;
5284 for (len2 = len-1; 0 < len2; len2--) {
5285 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5286 if (!MBCLEN_INVALID_P(l2))
5287 break;
5288 }
5289 memset(p+len2+1, 0, len-(len2+1));
5290 }
5291 }
5292}
5293
5294/*
5295 overwrite +p+ by succeeding letter in +enc+ and returns
5296 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5297 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5298 assuming each ranges are successive, and mbclen
5299 never change in each ranges.
5300 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5301 character.
5302 */
5303static enum neighbor_char
5304enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5305{
5306 enum neighbor_char ret;
5307 unsigned int c;
5308 int ctype;
5309 int range;
5310 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5311
5312 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5313 int try;
5314 const int max_gaps = 1;
5315
5316 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5317 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5318 ctype = ONIGENC_CTYPE_DIGIT;
5319 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5320 ctype = ONIGENC_CTYPE_ALPHA;
5321 else
5322 return NEIGHBOR_NOT_CHAR;
5323
5324 MEMCPY(save, p, char, len);
5325 for (try = 0; try <= max_gaps; ++try) {
5326 ret = enc_succ_char(p, len, enc);
5327 if (ret == NEIGHBOR_FOUND) {
5328 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5329 if (rb_enc_isctype(c, ctype, enc))
5330 return NEIGHBOR_FOUND;
5331 }
5332 }
5333 MEMCPY(p, save, char, len);
5334 range = 1;
5335 while (1) {
5336 MEMCPY(save, p, char, len);
5337 ret = enc_pred_char(p, len, enc);
5338 if (ret == NEIGHBOR_FOUND) {
5339 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5340 if (!rb_enc_isctype(c, ctype, enc)) {
5341 MEMCPY(p, save, char, len);
5342 break;
5343 }
5344 }
5345 else {
5346 MEMCPY(p, save, char, len);
5347 break;
5348 }
5349 range++;
5350 }
5351 if (range == 1) {
5352 return NEIGHBOR_NOT_CHAR;
5353 }
5354
5355 if (ctype != ONIGENC_CTYPE_DIGIT) {
5356 MEMCPY(carry, p, char, len);
5357 return NEIGHBOR_WRAPPED;
5358 }
5359
5360 MEMCPY(carry, p, char, len);
5361 enc_succ_char(carry, len, enc);
5362 return NEIGHBOR_WRAPPED;
5363}
5364
5365
5366static VALUE str_succ(VALUE str);
5367
5368/*
5369 * call-seq:
5370 * succ -> new_str
5371 *
5372 * :include: doc/string/succ.rdoc
5373 *
5374 */
5375
5376VALUE
5378{
5379 VALUE str;
5380 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5381 rb_enc_cr_str_copy_for_substr(str, orig);
5382 return str_succ(str);
5383}
5384
5385static VALUE
5386str_succ(VALUE str)
5387{
5388 rb_encoding *enc;
5389 char *sbeg, *s, *e, *last_alnum = 0;
5390 int found_alnum = 0;
5391 long l, slen;
5392 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5393 long carry_pos = 0, carry_len = 1;
5394 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5395
5396 slen = RSTRING_LEN(str);
5397 if (slen == 0) return str;
5398
5399 enc = STR_ENC_GET(str);
5400 sbeg = RSTRING_PTR(str);
5401 s = e = sbeg + slen;
5402
5403 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5404 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5405 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5406 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5407 break;
5408 }
5409 }
5410 l = rb_enc_precise_mbclen(s, e, enc);
5411 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5412 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5413 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5414 switch (neighbor) {
5415 case NEIGHBOR_NOT_CHAR:
5416 continue;
5417 case NEIGHBOR_FOUND:
5418 return str;
5419 case NEIGHBOR_WRAPPED:
5420 last_alnum = s;
5421 break;
5422 }
5423 found_alnum = 1;
5424 carry_pos = s - sbeg;
5425 carry_len = l;
5426 }
5427 if (!found_alnum) { /* str contains no alnum */
5428 s = e;
5429 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5430 enum neighbor_char neighbor;
5431 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5432 l = rb_enc_precise_mbclen(s, e, enc);
5433 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5434 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5435 MEMCPY(tmp, s, char, l);
5436 neighbor = enc_succ_char(tmp, l, enc);
5437 switch (neighbor) {
5438 case NEIGHBOR_FOUND:
5439 MEMCPY(s, tmp, char, l);
5440 return str;
5441 break;
5442 case NEIGHBOR_WRAPPED:
5443 MEMCPY(s, tmp, char, l);
5444 break;
5445 case NEIGHBOR_NOT_CHAR:
5446 break;
5447 }
5448 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5449 /* wrapped to \0...\0. search next valid char. */
5450 enc_succ_char(s, l, enc);
5451 }
5452 if (!rb_enc_asciicompat(enc)) {
5453 MEMCPY(carry, s, char, l);
5454 carry_len = l;
5455 }
5456 carry_pos = s - sbeg;
5457 }
5459 }
5460 RESIZE_CAPA(str, slen + carry_len);
5461 sbeg = RSTRING_PTR(str);
5462 s = sbeg + carry_pos;
5463 memmove(s + carry_len, s, slen - carry_pos);
5464 memmove(s, carry, carry_len);
5465 slen += carry_len;
5466 STR_SET_LEN(str, slen);
5467 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5468 rb_enc_str_coderange(str);
5469 return str;
5470}
5471
5472
5473/*
5474 * call-seq:
5475 * succ! -> self
5476 *
5477 * Like String#succ, but modifies +self+ in place; returns +self+.
5478 *
5479 * Related: see {Modifying}[rdoc-ref:String@Modifying].
5480 */
5481
5482static VALUE
5483rb_str_succ_bang(VALUE str)
5484{
5485 rb_str_modify(str);
5486 str_succ(str);
5487 return str;
5488}
5489
/*
 * Returns 1 when each of the first +len+ bytes at +s+ satisfies ISDIGIT
 * (vacuously true for len <= 0), otherwise 0.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5499
5500static int
5501str_upto_i(VALUE str, VALUE arg)
5502{
5503 rb_yield(str);
5504 return 0;
5505}
5506
5507/*
5508 * call-seq:
5509 * upto(other_string, exclusive = false) {|string| ... } -> self
5510 * upto(other_string, exclusive = false) -> new_enumerator
5511 *
5512 * :include: doc/string/upto.rdoc
5513 *
5514 */
5515
5516static VALUE
5517rb_str_upto(int argc, VALUE *argv, VALUE beg)
5518{
5519 VALUE end, exclusive;
5520
5521 rb_scan_args(argc, argv, "11", &end, &exclusive);
5522 RETURN_ENUMERATOR(beg, argc, argv);
5523 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5524}
5525
5526VALUE
5527rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5528{
5529 VALUE current, after_end;
5530 ID succ;
5531 int n, ascii;
5532 rb_encoding *enc;
5533
5534 CONST_ID(succ, "succ");
5535 StringValue(end);
5536 enc = rb_enc_check(beg, end);
5537 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5538 /* single character */
5539 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5540 char c = RSTRING_PTR(beg)[0];
5541 char e = RSTRING_PTR(end)[0];
5542
5543 if (c > e || (excl && c == e)) return beg;
5544 for (;;) {
5545 VALUE str = rb_enc_str_new(&c, 1, enc);
5547 if ((*each)(str, arg)) break;
5548 if (!excl && c == e) break;
5549 c++;
5550 if (excl && c == e) break;
5551 }
5552 return beg;
5553 }
5554 /* both edges are all digits */
5555 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5556 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5557 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5558 VALUE b, e;
5559 int width;
5560
5561 width = RSTRING_LENINT(beg);
5562 b = rb_str_to_inum(beg, 10, FALSE);
5563 e = rb_str_to_inum(end, 10, FALSE);
5564 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5565 long bi = FIX2LONG(b);
5566 long ei = FIX2LONG(e);
5567 rb_encoding *usascii = rb_usascii_encoding();
5568
5569 while (bi <= ei) {
5570 if (excl && bi == ei) break;
5571 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5572 bi++;
5573 }
5574 }
5575 else {
5576 ID op = excl ? '<' : idLE;
5577 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5578
5579 args[0] = INT2FIX(width);
5580 while (rb_funcall(b, op, 1, e)) {
5581 args[1] = b;
5582 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5583 b = rb_funcallv(b, succ, 0, 0);
5584 }
5585 }
5586 return beg;
5587 }
5588 /* normal case */
5589 n = rb_str_cmp(beg, end);
5590 if (n > 0 || (excl && n == 0)) return beg;
5591
5592 after_end = rb_funcallv(end, succ, 0, 0);
5593 current = str_duplicate(rb_cString, beg);
5594 while (!rb_str_equal(current, after_end)) {
5595 VALUE next = Qnil;
5596 if (excl || !rb_str_equal(current, end))
5597 next = rb_funcallv(current, succ, 0, 0);
5598 if ((*each)(current, arg)) break;
5599 if (NIL_P(next)) break;
5600 current = next;
5601 StringValue(current);
5602 if (excl && rb_str_equal(current, end)) break;
5603 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5604 break;
5605 }
5606
5607 return beg;
5608}
5609
5610VALUE
5611rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5612{
5613 VALUE current;
5614 ID succ;
5615
5616 CONST_ID(succ, "succ");
5617 /* both edges are all digits */
5618 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5619 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5620 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5621 int width = RSTRING_LENINT(beg);
5622 b = rb_str_to_inum(beg, 10, FALSE);
5623 if (FIXNUM_P(b)) {
5624 long bi = FIX2LONG(b);
5625 rb_encoding *usascii = rb_usascii_encoding();
5626
5627 while (FIXABLE(bi)) {
5628 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5629 bi++;
5630 }
5631 b = LONG2NUM(bi);
5632 }
5633 args[0] = INT2FIX(width);
5634 while (1) {
5635 args[1] = b;
5636 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5637 b = rb_funcallv(b, succ, 0, 0);
5638 }
5639 }
5640 /* normal case */
5641 current = str_duplicate(rb_cString, beg);
5642 while (1) {
5643 VALUE next = rb_funcallv(current, succ, 0, 0);
5644 if ((*each)(current, arg)) break;
5645 current = next;
5646 StringValue(current);
5647 if (RSTRING_LEN(current) == 0)
5648 break;
5649 }
5650
5651 return beg;
5652}
5653
5654static int
5655include_range_i(VALUE str, VALUE arg)
5656{
5657 VALUE *argp = (VALUE *)arg;
5658 if (!rb_equal(str, *argp)) return 0;
5659 *argp = Qnil;
5660 return 1;
5661}
5662
5663VALUE
5664rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5665{
5666 beg = rb_str_new_frozen(beg);
5667 StringValue(end);
5668 end = rb_str_new_frozen(end);
5669 if (NIL_P(val)) return Qfalse;
5670 val = rb_check_string_type(val);
5671 if (NIL_P(val)) return Qfalse;
5672 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5673 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5674 rb_enc_asciicompat(STR_ENC_GET(val))) {
5675 const char *bp = RSTRING_PTR(beg);
5676 const char *ep = RSTRING_PTR(end);
5677 const char *vp = RSTRING_PTR(val);
5678 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5679 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5680 return Qfalse;
5681 else {
5682 char b = *bp;
5683 char e = *ep;
5684 char v = *vp;
5685
5686 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5687 if (b <= v && v < e) return Qtrue;
5688 return RBOOL(!RTEST(exclusive) && v == e);
5689 }
5690 }
5691 }
5692#if 0
5693 /* both edges are all digits */
5694 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5695 all_digits_p(bp, RSTRING_LEN(beg)) &&
5696 all_digits_p(ep, RSTRING_LEN(end))) {
5697 /* TODO */
5698 }
5699#endif
5700 }
5701 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5702
5703 return RBOOL(NIL_P(val));
5704}
5705
5706static VALUE
5707rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5708{
5709 if (rb_reg_search(re, str, 0, 0) >= 0) {
5710 VALUE match = rb_backref_get();
5711 int nth = rb_reg_backref_number(match, backref);
5712 return rb_reg_nth_match(nth, match);
5713 }
5714 return Qnil;
5715}
5716
5717static VALUE
5718rb_str_aref(VALUE str, VALUE indx)
5719{
5720 long idx;
5721
5722 if (FIXNUM_P(indx)) {
5723 idx = FIX2LONG(indx);
5724 }
5725 else if (RB_TYPE_P(indx, T_REGEXP)) {
5726 return rb_str_subpat(str, indx, INT2FIX(0));
5727 }
5728 else if (RB_TYPE_P(indx, T_STRING)) {
5729 if (rb_str_index(str, indx, 0) != -1)
5730 return str_duplicate(rb_cString, indx);
5731 return Qnil;
5732 }
5733 else {
5734 /* check if indx is Range */
5735 long beg, len = str_strlen(str, NULL);
5736 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5737 case Qfalse:
5738 break;
5739 case Qnil:
5740 return Qnil;
5741 default:
5742 return rb_str_substr(str, beg, len);
5743 }
5744 idx = NUM2LONG(indx);
5745 }
5746
5747 return str_substr(str, idx, 1, FALSE);
5748}
5749
5750
5751/*
5752 * call-seq:
5753 * self[offset] -> new_string or nil
5754 * self[offset, size] -> new_string or nil
5755 * self[range] -> new_string or nil
5756 * self[regexp, capture = 0] -> new_string or nil
5757 * self[substring] -> new_string or nil
5758 *
5759 * :include: doc/string/aref.rdoc
5760 *
5761 */
5762
5763static VALUE
5764rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5765{
5766 if (argc == 2) {
5767 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5768 return rb_str_subpat(str, argv[0], argv[1]);
5769 }
5770 else {
5771 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5772 }
5773 }
5774 rb_check_arity(argc, 1, 2);
5775 return rb_str_aref(str, argv[0]);
5776}
5777
5778VALUE
5780{
5781 char *ptr = RSTRING_PTR(str);
5782 long olen = RSTRING_LEN(str), nlen;
5783
5784 str_modifiable(str);
5785 if (len > olen) len = olen;
5786 nlen = olen - len;
5787 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5788 char *oldptr = ptr;
5789 size_t old_capa = RSTRING(str)->as.heap.aux.capa + TERM_LEN(str);
5790 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5791 STR_SET_EMBED(str);
5792 ptr = RSTRING(str)->as.embed.ary;
5793 memmove(ptr, oldptr + len, nlen);
5794 if (fl == STR_NOEMBED) {
5795 SIZED_FREE_N(oldptr, old_capa);
5796 }
5797 }
5798 else {
5799 if (!STR_SHARED_P(str)) {
5800 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5801 rb_enc_cr_str_exact_copy(shared, str);
5803 }
5804 ptr = RSTRING(str)->as.heap.ptr += len;
5805 }
5806 STR_SET_LEN(str, nlen);
5807
5808 if (!SHARABLE_MIDDLE_SUBSTRING) {
5809 TERM_FILL(ptr + nlen, TERM_LEN(str));
5810 }
5812 return str;
5813}
5814
5815static void
5816rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5817{
5818 char *sptr;
5819 long slen;
5820 int cr;
5821
5822 if (beg == 0 && vlen == 0) {
5823 rb_str_drop_bytes(str, len);
5824 return;
5825 }
5826
5827 str_modify_keep_cr(str);
5828 RSTRING_GETMEM(str, sptr, slen);
5829 if (len < vlen) {
5830 /* expand string */
5831 RESIZE_CAPA(str, slen + vlen - len);
5832 sptr = RSTRING_PTR(str);
5833 }
5834
5836 cr = rb_enc_str_coderange(val);
5837 else
5839
5840 if (vlen != len) {
5841 memmove(sptr + beg + vlen,
5842 sptr + beg + len,
5843 slen - (beg + len));
5844 }
5845 if (vlen < beg && len < 0) {
5846 MEMZERO(sptr + slen, char, -len);
5847 }
5848 if (vlen > 0) {
5849 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5850 }
5851 slen += vlen - len;
5852 STR_SET_LEN(str, slen);
5853 TERM_FILL(&sptr[slen], TERM_LEN(str));
5854 ENC_CODERANGE_SET(str, cr);
5855}
5856
5857static inline void
5858rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5859{
5860 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5861}
5862
5863void
5864rb_str_update(VALUE str, long beg, long len, VALUE val)
5865{
5866 long slen;
5867 char *p, *e;
5868 rb_encoding *enc;
5869 int singlebyte = single_byte_optimizable(str);
5870 int cr;
5871
5872 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5873
5874 StringValue(val);
5875 enc = rb_enc_check(str, val);
5876 slen = str_strlen(str, enc); /* rb_enc_check */
5877
5878 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5879 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5880 }
5881 if (beg < 0) {
5882 beg += slen;
5883 }
5884 RUBY_ASSERT(beg >= 0);
5885 RUBY_ASSERT(beg <= slen);
5886
5887 if (len > slen - beg) {
5888 len = slen - beg;
5889 }
5890 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5891 if (!p) p = RSTRING_END(str);
5892 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5893 if (!e) e = RSTRING_END(str);
5894 /* error check */
5895 beg = p - RSTRING_PTR(str); /* physical position */
5896 len = e - p; /* physical length */
5897 rb_str_update_0(str, beg, len, val);
5898 rb_enc_associate(str, enc);
5900 if (cr != ENC_CODERANGE_BROKEN)
5901 ENC_CODERANGE_SET(str, cr);
5902}
5903
5904static void
5905rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5906{
5907 int nth;
5908 VALUE match;
5909 long start, end, len;
5910 rb_encoding *enc;
5911
5912 if (rb_reg_search(re, str, 0, 0) < 0) {
5913 rb_raise(rb_eIndexError, "regexp not matched");
5914 }
5915 match = rb_backref_get();
5916 nth = rb_reg_backref_number(match, backref);
5917 int num_regs = RMATCH_NREGS(match);
5918 if ((nth >= num_regs) || ((nth < 0) && (-nth >= num_regs))) {
5919 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5920 }
5921 if (nth < 0) {
5922 nth += num_regs;
5923 }
5924
5925 start = RMATCH_BEG(match, nth);
5926 if (start == -1) {
5927 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5928 }
5929 end = RMATCH_END(match, nth);
5930 len = end - start;
5931 StringValue(val);
5932 enc = rb_enc_check_str(str, val);
5933 rb_str_update_0(str, start, len, val);
5934 rb_enc_associate(str, enc);
5935}
5936
5937static VALUE
5938rb_str_aset(VALUE str, VALUE indx, VALUE val)
5939{
5940 long idx, beg;
5941
5942 switch (TYPE(indx)) {
5943 case T_REGEXP:
5944 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5945 return val;
5946
5947 case T_STRING:
5948 beg = rb_str_index(str, indx, 0);
5949 if (beg < 0) {
5950 rb_raise(rb_eIndexError, "string not matched");
5951 }
5952 beg = rb_str_sublen(str, beg);
5953 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5954 return val;
5955
5956 default:
5957 /* check if indx is Range */
5958 {
5959 long beg, len;
5960 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5961 rb_str_update(str, beg, len, val);
5962 return val;
5963 }
5964 }
5965 /* FALLTHROUGH */
5966
5967 case T_FIXNUM:
5968 idx = NUM2LONG(indx);
5969 rb_str_update(str, idx, 1, val);
5970 return val;
5971 }
5972}
5973
5974/*
5975 * call-seq:
5976 * self[index] = other_string -> new_string
5977 * self[start, length] = other_string -> new_string
5978 * self[range] = other_string -> new_string
5979 * self[regexp, capture = 0] = other_string -> new_string
5980 * self[substring] = other_string -> new_string
5981 *
5982 * :include: doc/string/aset.rdoc
5983 *
5984 */
5985
5986static VALUE
5987rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5988{
5989 if (argc == 3) {
5990 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5991 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5992 }
5993 else {
5994 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5995 }
5996 return argv[2];
5997 }
5998 rb_check_arity(argc, 2, 3);
5999 return rb_str_aset(str, argv[0], argv[1]);
6000}
6001
6002/*
6003 * call-seq:
6004 * insert(offset, other_string) -> self
6005 *
6006 * :include: doc/string/insert.rdoc
6007 *
6008 */
6009
6010static VALUE
6011rb_str_insert(VALUE str, VALUE idx, VALUE str2)
6012{
6013 long pos = NUM2LONG(idx);
6014
6015 if (pos == -1) {
6016 return rb_str_append(str, str2);
6017 }
6018 else if (pos < 0) {
6019 pos++;
6020 }
6021 rb_str_update(str, pos, 0, str2);
6022 return str;
6023}
6024
6025
6026/*
6027 * call-seq:
6028 * slice!(index) -> new_string or nil
6029 * slice!(start, length) -> new_string or nil
6030 * slice!(range) -> new_string or nil
6031 * slice!(regexp, capture = 0) -> new_string or nil
6032 * slice!(substring) -> new_string or nil
6033 *
6034 * Like String#[] (and its alias String#slice), except that:
6035 *
6036 * - Performs substitutions in +self+ (not in a copy of +self+).
6037 * - Returns the removed substring if any modifications were made, +nil+ otherwise.
6038 *
6039 * A few examples:
6040 *
6041 * s = 'hello'
6042 * s.slice!('e') # => "e"
6043 * s # => "hllo"
6044 * s.slice!('e') # => nil
6045 * s # => "hllo"
6046 *
6047 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6048 */
6049
static VALUE
rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
{
    /*
     * Removes a region of str and returns it (Qnil when nothing is removed).
     *
     * Each argument form below works out `beg` and `len` for the region and
     * then jumps into one of three shared tails:
     *   num_index - beg/len still have to go through rb_str_subpos()
     *   subseq    - beg/len are used as a byte offset/length into str;
     *               copy that region into `result`
     *   squash    - `result` is already set; remove the region from str
     */
    VALUE result = Qnil;
    VALUE indx;
    long beg, len = 1;
    char *p;

    rb_check_arity(argc, 1, 2);
    str_modify_keep_cr(str);
    indx = argv[0];
    if (RB_TYPE_P(indx, T_REGEXP)) {
        /* slice!(regexp, capture = 0) */
        if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
        VALUE match = rb_backref_get();
        int num_regs = RMATCH_NREGS(match);
        int nth = 0;
        /* A negative capture number is rebased on the register count;
         * anything still out of range yields nil. */
        if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
            if ((nth += num_regs) <= 0) return Qnil;
        }
        else if (nth >= num_regs) return Qnil;
        beg = RMATCH_BEG(match, nth);
        len = RMATCH_END(match, nth) - beg;
        goto subseq;
    }
    else if (argc == 2) {
        /* slice!(start, length) */
        beg = NUM2LONG(indx);
        len = NUM2LONG(argv[1]);
        goto num_index;
    }
    else if (FIXNUM_P(indx)) {
        /* slice!(index): len keeps its initial value of 1 */
        beg = FIX2LONG(indx);
        if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
        if (!len) return Qnil;
        beg = p - RSTRING_PTR(str);
        goto subseq;
    }
    else if (RB_TYPE_P(indx, T_STRING)) {
        /* slice!(substring): the result is a copy of the argument itself */
        beg = rb_str_index(str, indx, 0);
        if (beg == -1) return Qnil;
        len = RSTRING_LEN(indx);
        result = str_duplicate(rb_cString, indx);
        goto squash;
    }
    else {
        /* slice!(range), or a non-Fixnum index when rb_range_beg_len()
         * reports Qfalse for indx */
        switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
          case Qnil:
            return Qnil;
          case Qfalse:
            beg = NUM2LONG(indx);
            if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
            if (!len) return Qnil;
            beg = p - RSTRING_PTR(str);
            goto subseq;
          default:
            goto num_index;
        }
    }

  num_index:
    /* rb_str_subpos() returns a pointer into str (NULL means no such
     * position) and may adjust len; turn the pointer into an offset. */
    if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
    beg = p - RSTRING_PTR(str);

  subseq:
    result = rb_str_new(RSTRING_PTR(str)+beg, len);
    rb_enc_cr_str_copy_for_substr(result, str);

  squash:
    if (len > 0) {
        if (beg == 0) {
            /* Removing a prefix is delegated to rb_str_drop_bytes(). */
            rb_str_drop_bytes(str, len);
        }
        else {
            /* Close the gap by moving the tail down over the removed
             * region, then shrink and re-terminate the string. */
            char *sptr = RSTRING_PTR(str);
            long slen = RSTRING_LEN(str);
            if (beg + len > slen) /* pathological check */
                len = slen - beg;
            memmove(sptr + beg,
                    sptr + beg + len,
                    slen - (beg + len));
            slen -= len;
            STR_SET_LEN(str, slen);
            TERM_FILL(&sptr[slen], TERM_LEN(str));
        }
    }
    return result;
}
6136
6137static VALUE
6138get_pat(VALUE pat)
6139{
6140 VALUE val;
6141
6142 switch (OBJ_BUILTIN_TYPE(pat)) {
6143 case T_REGEXP:
6144 return pat;
6145
6146 case T_STRING:
6147 break;
6148
6149 default:
6150 val = rb_check_string_type(pat);
6151 if (NIL_P(val)) {
6152 Check_Type(pat, T_REGEXP);
6153 }
6154 pat = val;
6155 }
6156
6157 return rb_reg_regcomp(pat);
6158}
6159
6160static VALUE
6161get_pat_quoted(VALUE pat, int check)
6162{
6163 VALUE val;
6164
6165 switch (OBJ_BUILTIN_TYPE(pat)) {
6166 case T_REGEXP:
6167 return pat;
6168
6169 case T_STRING:
6170 break;
6171
6172 default:
6173 val = rb_check_string_type(pat);
6174 if (NIL_P(val)) {
6175 Check_Type(pat, T_REGEXP);
6176 }
6177 pat = val;
6178 }
6179 if (check && is_broken_string(pat)) {
6180 rb_exc_raise(rb_reg_check_preprocess(pat));
6181 }
6182 return pat;
6183}
6184
6185static long
6186rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6187{
6188 if (BUILTIN_TYPE(pat) == T_STRING) {
6189 pos = rb_str_byteindex(str, pat, pos);
6190 if (set_backref_str) {
6191 if (pos >= 0) {
6192 str = rb_str_new_frozen_String(str);
6193 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6194 if (match) {
6195 *match = match_data;
6196 }
6197 }
6198 else {
6200 }
6201 }
6202 return pos;
6203 }
6204 else {
6205 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6206 }
6207}
6208
6209static long
6210rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6211{
6212 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6213}
6214
6215
6216/*
6217 * call-seq:
6218 * sub!(pattern, replacement) -> self or nil
6219 * sub!(pattern) {|match| ... } -> self or nil
6220 *
6221 * Like String#sub, except that:
6222 *
 * - Changes are made to +self+, not to a copy of +self+.
6224 * - Returns +self+ if any changes are made, +nil+ otherwise.
6225 *
6226 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6227 */
6228
6229static VALUE
6230rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6231{
6232 VALUE pat, repl, hash = Qnil;
6233 int iter = 0;
6234 long plen;
6235 int min_arity = rb_block_given_p() ? 1 : 2;
6236 long beg;
6237
6238 rb_check_arity(argc, min_arity, 2);
6239 if (argc == 1) {
6240 iter = 1;
6241 }
6242 else {
6243 repl = argv[1];
6244 if (!RB_TYPE_P(repl, T_STRING)) {
6245 hash = rb_check_hash_type(repl);
6246 if (NIL_P(hash)) {
6247 StringValue(repl);
6248 }
6249 }
6250 }
6251
6252 pat = get_pat_quoted(argv[0], 1);
6253
6254 str_modifiable(str);
6255 beg = rb_pat_search(pat, str, 0, 1);
6256 if (beg >= 0) {
6257 rb_encoding *enc;
6258 int cr = ENC_CODERANGE(str);
6259 long beg0, end0;
6260 VALUE match, match0 = Qnil;
6261 char *p, *rp;
6262 long len, rlen;
6263
6264 match = rb_backref_get();
6265 if (RB_TYPE_P(pat, T_STRING)) {
6266 beg0 = beg;
6267 end0 = beg0 + RSTRING_LEN(pat);
6268 match0 = pat;
6269 }
6270 else {
6271 beg0 = RMATCH_BEG(match, 0);
6272 end0 = RMATCH_END(match, 0);
6273 if (iter) match0 = rb_reg_nth_match(0, match);
6274 }
6275
6276 if (iter || !NIL_P(hash)) {
6277 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6278
6279 if (iter) {
6280 repl = rb_obj_as_string(rb_yield(match0));
6281 }
6282 else {
6283 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6284 repl = rb_obj_as_string(repl);
6285 }
6286 str_mod_check(str, p, len);
6287 rb_check_frozen(str);
6288 }
6289 else {
6290 repl = rb_reg_regsub_match(repl, str, match);
6291 }
6292
6293 enc = rb_enc_compatible(str, repl);
6294 if (!enc) {
6295 rb_encoding *str_enc = STR_ENC_GET(str);
6296 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6297 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6298 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6299 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6300 rb_enc_inspect_name(str_enc),
6301 rb_enc_inspect_name(STR_ENC_GET(repl)));
6302 }
6303 enc = STR_ENC_GET(repl);
6304 }
6305 rb_str_modify(str);
6306 rb_enc_associate(str, enc);
6308 int cr2 = ENC_CODERANGE(repl);
6309 if (cr2 == ENC_CODERANGE_BROKEN ||
6310 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6312 else
6313 cr = cr2;
6314 }
6315 plen = end0 - beg0;
6316 rlen = RSTRING_LEN(repl);
6317 len = RSTRING_LEN(str);
6318 if (rlen > plen) {
6319 RESIZE_CAPA(str, len + rlen - plen);
6320 }
6321 p = RSTRING_PTR(str);
6322 if (rlen != plen) {
6323 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6324 }
6325 rp = RSTRING_PTR(repl);
6326 memmove(p + beg0, rp, rlen);
6327 len += rlen - plen;
6328 STR_SET_LEN(str, len);
6329 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6330 ENC_CODERANGE_SET(str, cr);
6331
6332 RB_GC_GUARD(match);
6333
6334 return str;
6335 }
6336 return Qnil;
6337}
6338
6339
6340/*
6341 * call-seq:
6342 * sub(pattern, replacement) -> new_string
6343 * sub(pattern) {|match| ... } -> new_string
6344 *
6345 * :include: doc/string/sub.rdoc
6346 */
6347
6348static VALUE
6349rb_str_sub(int argc, VALUE *argv, VALUE str)
6350{
6351 str = str_duplicate(rb_cString, str);
6352 rb_str_sub_bang(argc, argv, str);
6353 return str;
6354}
6355
/*
 * Shared implementation of String#gsub (bang == 0) and String#gsub!
 * (bang != 0).
 *
 * Builds the result in a fresh buffer `dest` by alternately copying the
 * unmatched stretch of str and appending the replacement for each match.
 * With bang set, str takes over dest's contents and is returned (Qnil if
 * nothing matched); otherwise dest (or a duplicate of str when nothing
 * matched) is returned.
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
    long beg, beg0, end0;
    long offset, blen, slen, len, last;
    /* How the replacement for each match is obtained:
     *   STR      - replacement string (argv[1])
     *   ITER     - value of the block
     *   FAST_MAP - hash lookup using a stack-allocated fake key string
     *   MAP      - hash lookup using a real substring as key */
    enum {STR, ITER, FAST_MAP, MAP} mode = STR;
    char *sp, *cp;
    /* Tri-state: -1 until the first replacement has been expanded, then
     * 1 if expansion produced something other than repl itself, else 0. */
    int need_backref_str = -1;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
        RETURN_ENUMERATOR(str, argc, argv);
        mode = ITER;
        break;
      case 2:
        repl = argv[1];
        if (!RB_TYPE_P(repl, T_STRING)) {
            hash = rb_check_hash_type(repl);
            if (NIL_P(hash)) {
                StringValue(repl);
            }
            else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
                mode = FAST_MAP;
            }
            else {
                mode = MAP;
            }
        }
        break;
      default:
        rb_error_arity(argc, 1, 2);
    }

    pat = get_pat_quoted(argv[0], 1);
    beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);

    if (beg < 0) {
        if (bang) return Qnil; /* no match, no substitution */
        return str_duplicate(rb_cString, str);
    }
    if (bang) str_modify_keep_cr(str);

    offset = 0;
    blen = RSTRING_LEN(str) + 30; /* len + margin */
    dest = rb_str_buf_new(blen);
    /* sp/slen snapshot str so str_mod_check() can be given the state from
     * before user code ran; cp tracks the start of the next unmatched
     * stretch to copy. */
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);
    ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);

    do {
        if (RB_TYPE_P(pat, T_STRING)) {
            beg0 = beg;
            end0 = beg0 + RSTRING_LEN(pat);
            match0 = pat;
        }
        else {
            beg0 = RMATCH_BEG(match, 0);
            end0 = RMATCH_END(match, 0);
            if (mode == ITER) match0 = rb_reg_nth_match(0, match);
        }

        if (mode != STR) {
            if (mode == ITER) {
                val = rb_obj_as_string(rb_yield(match0));
            }
            else {
                struct RString fake_str = {RBASIC_INIT};
                VALUE key;
                if (mode == FAST_MAP) {
                    // It is safe to use a fake_str here because we established that it won't escape,
                    // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
                    // default proc.
                    key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
                }
                else {
                    key = rb_str_subseq(str, beg0, end0 - beg0);
                }
                val = rb_hash_aref(hash, key);
                val = rb_obj_as_string(val);
            }
            str_mod_check(str, sp, slen);
            if (val == dest) { /* paranoid check [ruby-dev:24827] */
                rb_raise(rb_eRuntimeError, "block should not cheat");
            }
        }
        else if (need_backref_str) {
            val = rb_reg_regsub_match(repl, str, match);
            if (need_backref_str < 0) {
                /* First expansion: from now on, expand again only if it
                 * returned something other than repl itself. */
                need_backref_str = val != repl;
            }
        }
        else {
            val = repl;
        }

        len = beg0 - offset; /* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

        /* `last` keeps the offset this iteration's search started from;
         * it is used for the final search after the loop. */
        last = offset;
        offset = end0;
        if (beg0 == end0) {
            /*
             * Always consume at least one character of the input string
             * in order to prevent infinite loops.
             */
            if (RSTRING_LEN(str) <= end0) break;
            len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
            offset = end0 + len;
        }
        cp = RSTRING_PTR(str) + offset;
        if (offset > RSTRING_LEN(str)) break;

        // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
        if (mode != FAST_MAP && mode != STR) {
            match = Qnil;
        }
        beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);

        RB_GC_GUARD(match);
    } while (beg >= 0);

    /* Copy whatever follows the final match. */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /* Search once more from `last` with backref recording forced on;
     * presumably so the backref reflects the final match. */
    rb_pat_search0(pat, str, last, 1, &match);
    if (bang) {
        str_shared_replace(str, dest);
    }
    else {
        str = dest;
    }

    return str;
}
6500
6501
6502/*
6503 * call-seq:
6504 * gsub!(pattern, replacement) -> self or nil
6505 * gsub!(pattern) {|match| ... } -> self or nil
6506 * gsub!(pattern) -> an_enumerator
6507 *
6508 * Like String#gsub, except that:
6509 *
6510 * - Performs substitutions in +self+ (not in a copy of +self+).
 * - Returns +self+ if any substitutions are performed, +nil+ otherwise.
6512 *
6513 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6514 */
6515
6516static VALUE
6517rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6518{
6519 str_modifiable(str);
6520 return str_gsub(argc, argv, str, 1);
6521}
6522
6523
6524/*
6525 * call-seq:
6526 * gsub(pattern, replacement) -> new_string
6527 * gsub(pattern) {|match| ... } -> new_string
6528 * gsub(pattern) -> enumerator
6529 *
6530 * Returns a copy of +self+ with zero or more substrings replaced.
6531 *
6532 * Argument +pattern+ may be a string or a Regexp;
6533 * argument +replacement+ may be a string or a Hash.
6534 * Varying types for the argument values makes this method very versatile.
6535 *
6536 * Below are some simple examples;
6537 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6538 *
6539 * With arguments +pattern+ and string +replacement+ given,
6540 * replaces each matching substring with the given +replacement+ string:
6541 *
6542 * s = 'abracadabra'
6543 * s.gsub('ab', 'AB') # => "ABracadABra"
6544 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6545 *
6546 * With arguments +pattern+ and hash +replacement+ given,
6547 * replaces each matching substring with a value from the given +replacement+ hash,
6548 * or removes it:
6549 *
6550 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6551 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6552 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6553 *
6554 * With argument +pattern+ and a block given,
6555 * calls the block with each matching substring;
6556 * replaces that substring with the block's return value:
6557 *
6558 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6559 * # => "ABrACADABrA"
6560 *
6561 * With argument +pattern+ and no block given,
6562 * returns a new Enumerator.
6563 *
6564 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6565 */
6566
6567static VALUE
6568rb_str_gsub(int argc, VALUE *argv, VALUE str)
6569{
6570 return str_gsub(argc, argv, str, 0);
6571}
6572
6573
6574/*
6575 * call-seq:
6576 * replace(other_string) -> self
6577 *
6578 * Replaces the contents of +self+ with the contents of +other_string+;
6579 * returns +self+:
6580 *
6581 * s = 'foo' # => "foo"
6582 * s.replace('bar') # => "bar"
6583 *
6584 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6585 */
6586
6587VALUE
6589{
6590 str_modifiable(str);
6591 if (str == str2) return str;
6592
6593 StringValue(str2);
6594 str_discard(str);
6595 return str_replace(str, str2);
6596}
6597
6598/*
6599 * call-seq:
6600 * clear -> self
6601 *
6602 * Removes the contents of +self+:
6603 *
6604 * s = 'foo'
6605 * s.clear # => ""
6606 * s # => ""
6607 *
6608 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6609 */
6610
6611static VALUE
6612rb_str_clear(VALUE str)
6613{
6614 str_discard(str);
6615 STR_SET_EMBED(str);
6616 STR_SET_LEN(str, 0);
6617 RSTRING_PTR(str)[0] = 0;
6618 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6620 else
6622 return str;
6623}
6624
6625/*
6626 * call-seq:
6627 * chr -> string
6628 *
6629 * :include: doc/string/chr.rdoc
6630 *
6631 */
6632
6633static VALUE
6634rb_str_chr(VALUE str)
6635{
6636 return rb_str_substr(str, 0, 1);
6637}
6638
6639/*
6640 * call-seq:
6641 * getbyte(index) -> integer or nil
6642 *
6643 * :include: doc/string/getbyte.rdoc
6644 *
6645 */
6646VALUE
6647rb_str_getbyte(VALUE str, VALUE index)
6648{
6649 long pos = NUM2LONG(index);
6650
6651 if (pos < 0)
6652 pos += RSTRING_LEN(str);
6653 if (pos < 0 || RSTRING_LEN(str) <= pos)
6654 return Qnil;
6655
6656 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6657}
6658
6659/*
6660 * call-seq:
6661 * setbyte(index, integer) -> integer
6662 *
6663 * Sets the byte at zero-based offset +index+ to the value of the given +integer+;
6664 * returns +integer+:
6665 *
6666 * s = 'xyzzy'
6667 * s.setbyte(2, 129) # => 129
6668 * s # => "xy\x81zy"
6669 *
6670 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6671 */
6672VALUE
6673rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6674{
6675 long pos = NUM2LONG(index);
6676 long len = RSTRING_LEN(str);
6677 char *ptr, *head, *left = 0;
6678 rb_encoding *enc;
6679 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6680
6681 if (pos < -len || len <= pos)
6682 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6683 if (pos < 0)
6684 pos += len;
6685
6686 VALUE v = rb_to_int(value);
6687 VALUE w = rb_int_and(v, INT2FIX(0xff));
6688 char byte = (char)(NUM2INT(w) & 0xFF);
6689
6690 if (!str_independent(str))
6691 str_make_independent(str);
6692 enc = STR_ENC_GET(str);
6693 head = RSTRING_PTR(str);
6694 ptr = &head[pos];
6695 if (!STR_EMBED_P(str)) {
6696 cr = ENC_CODERANGE(str);
6697 switch (cr) {
6698 case ENC_CODERANGE_7BIT:
6699 left = ptr;
6700 *ptr = byte;
6701 if (ISASCII(byte)) goto end;
6702 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6703 if (!MBCLEN_CHARFOUND_P(nlen))
6705 else
6707 goto end;
6709 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6710 width = rb_enc_precise_mbclen(left, head+len, enc);
6711 *ptr = byte;
6712 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6713 if (!MBCLEN_CHARFOUND_P(nlen))
6715 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6717 goto end;
6718 }
6719 }
6721 *ptr = byte;
6722
6723 end:
6724 return value;
6725}
6726
6727static VALUE
6728str_byte_substr(VALUE str, long beg, long len, int empty)
6729{
6730 long n = RSTRING_LEN(str);
6731
6732 if (beg > n || len < 0) return Qnil;
6733 if (beg < 0) {
6734 beg += n;
6735 if (beg < 0) return Qnil;
6736 }
6737 if (len > n - beg)
6738 len = n - beg;
6739 if (len <= 0) {
6740 if (!empty) return Qnil;
6741 len = 0;
6742 }
6743
6744 VALUE str2 = str_subseq(str, beg, len);
6745
6746 str_enc_copy_direct(str2, str);
6747
6748 if (RSTRING_LEN(str2) == 0) {
6749 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6751 else
6753 }
6754 else {
6755 switch (ENC_CODERANGE(str)) {
6756 case ENC_CODERANGE_7BIT:
6758 break;
6759 default:
6761 break;
6762 }
6763 }
6764
6765 return str2;
6766}
6767
6768VALUE
6769rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6770{
6771 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6772}
6773
6774static VALUE
6775str_byte_aref(VALUE str, VALUE indx)
6776{
6777 long idx;
6778 if (FIXNUM_P(indx)) {
6779 idx = FIX2LONG(indx);
6780 }
6781 else {
6782 /* check if indx is Range */
6783 long beg, len = RSTRING_LEN(str);
6784
6785 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6786 case Qfalse:
6787 break;
6788 case Qnil:
6789 return Qnil;
6790 default:
6791 return str_byte_substr(str, beg, len, TRUE);
6792 }
6793
6794 idx = NUM2LONG(indx);
6795 }
6796 return str_byte_substr(str, idx, 1, FALSE);
6797}
6798
6799/*
6800 * call-seq:
6801 * byteslice(offset, length = 1) -> string or nil
6802 * byteslice(range) -> string or nil
6803 *
6804 * :include: doc/string/byteslice.rdoc
6805 */
6806
6807static VALUE
6808rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6809{
6810 if (argc == 2) {
6811 long beg = NUM2LONG(argv[0]);
6812 long len = NUM2LONG(argv[1]);
6813 return str_byte_substr(str, beg, len, TRUE);
6814 }
6815 rb_check_arity(argc, 1, 2);
6816 return str_byte_aref(str, argv[0]);
6817}
6818
6819static void
6820str_check_beg_len(VALUE str, long *beg, long *len)
6821{
6822 long end, slen = RSTRING_LEN(str);
6823
6824 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6825 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6826 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6827 }
6828 if (*beg < 0) {
6829 *beg += slen;
6830 }
6831 RUBY_ASSERT(*beg >= 0);
6832 RUBY_ASSERT(*beg <= slen);
6833
6834 if (*len > slen - *beg) {
6835 *len = slen - *beg;
6836 }
6837 end = *beg + *len;
6838 str_ensure_byte_pos(str, *beg);
6839 str_ensure_byte_pos(str, end);
6840}
6841
6842/*
6843 * call-seq:
6844 * bytesplice(offset, length, str) -> self
6845 * bytesplice(offset, length, str, str_offset, str_length) -> self
6846 * bytesplice(range, str) -> self
6847 * bytesplice(range, str, str_range) -> self
6848 *
6849 * :include: doc/string/bytesplice.rdoc
6850 */
6851
6852static VALUE
6853rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6854{
6855 long beg, len, vbeg, vlen;
6856 VALUE val;
6857 int cr;
6858
6859 rb_check_arity(argc, 2, 5);
6860 if (!(argc == 2 || argc == 3 || argc == 5)) {
6861 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6862 }
6863 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6864 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6865 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6866 rb_builtin_class_name(argv[0]));
6867 }
6868 val = argv[1];
6869 StringValue(val);
6870 if (argc == 2) {
6871 /* bytesplice(range, str) */
6872 vbeg = 0;
6873 vlen = RSTRING_LEN(val);
6874 }
6875 else {
6876 /* bytesplice(range, str, str_range) */
6877 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6878 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6879 rb_builtin_class_name(argv[2]));
6880 }
6881 }
6882 }
6883 else {
6884 beg = NUM2LONG(argv[0]);
6885 len = NUM2LONG(argv[1]);
6886 val = argv[2];
6887 StringValue(val);
6888 if (argc == 3) {
6889 /* bytesplice(index, length, str) */
6890 vbeg = 0;
6891 vlen = RSTRING_LEN(val);
6892 }
6893 else {
6894 /* bytesplice(index, length, str, str_index, str_length) */
6895 vbeg = NUM2LONG(argv[3]);
6896 vlen = NUM2LONG(argv[4]);
6897 }
6898 }
6899 str_check_beg_len(str, &beg, &len);
6900 str_check_beg_len(val, &vbeg, &vlen);
6901 str_modify_keep_cr(str);
6902
6903 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6904 rb_enc_associate(str, rb_enc_check(str, val));
6905 }
6906
6907 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6909 if (cr != ENC_CODERANGE_BROKEN)
6910 ENC_CODERANGE_SET(str, cr);
6911 return str;
6912}
6913
6914/*
6915 * call-seq:
6916 * reverse -> new_string
6917 *
6918 * Returns a new string with the characters from +self+ in reverse order.
6919 *
6920 * 'drawer'.reverse # => "reward"
6921 * 'reviled'.reverse # => "deliver"
6922 * 'stressed'.reverse # => "desserts"
6923 * 'semordnilaps'.reverse # => "spalindromes"
6924 *
6925 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6926 */
6927
6928static VALUE
6929rb_str_reverse(VALUE str)
6930{
6931 rb_encoding *enc;
6932 VALUE rev;
6933 char *s, *e, *p;
6934 int cr;
6935
6936 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6937 enc = STR_ENC_GET(str);
6938 rev = rb_str_new(0, RSTRING_LEN(str));
6939 s = RSTRING_PTR(str); e = RSTRING_END(str);
6940 p = RSTRING_END(rev);
6941 cr = ENC_CODERANGE(str);
6942
6943 if (RSTRING_LEN(str) > 1) {
6944 if (single_byte_optimizable(str)) {
6945 while (s < e) {
6946 *--p = *s++;
6947 }
6948 }
6949 else if (cr == ENC_CODERANGE_VALID) {
6950 while (s < e) {
6951 int clen = rb_enc_fast_mbclen(s, e, enc);
6952
6953 p -= clen;
6954 memcpy(p, s, clen);
6955 s += clen;
6956 }
6957 }
6958 else {
6959 cr = rb_enc_asciicompat(enc) ?
6961 while (s < e) {
6962 int clen = rb_enc_mbclen(s, e, enc);
6963
6964 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6965 p -= clen;
6966 memcpy(p, s, clen);
6967 s += clen;
6968 }
6969 }
6970 }
6971 STR_SET_LEN(rev, RSTRING_LEN(str));
6972 str_enc_copy_direct(rev, str);
6973 ENC_CODERANGE_SET(rev, cr);
6974
6975 return rev;
6976}
6977
6978
6979/*
6980 * call-seq:
6981 * reverse! -> self
6982 *
6983 * Returns +self+ with its characters reversed:
6984 *
6985 * 'drawer'.reverse! # => "reward"
6986 * 'reviled'.reverse! # => "deliver"
6987 * 'stressed'.reverse! # => "desserts"
6988 * 'semordnilaps'.reverse! # => "spalindromes"
6989 *
6990 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6991 */
6992
6993static VALUE
6994rb_str_reverse_bang(VALUE str)
6995{
6996 if (RSTRING_LEN(str) > 1) {
6997 if (single_byte_optimizable(str)) {
6998 char *s, *e, c;
6999
7000 str_modify_keep_cr(str);
7001 s = RSTRING_PTR(str);
7002 e = RSTRING_END(str) - 1;
7003 while (s < e) {
7004 c = *s;
7005 *s++ = *e;
7006 *e-- = c;
7007 }
7008 }
7009 else {
7010 str_shared_replace(str, rb_str_reverse(str));
7011 }
7012 }
7013 else {
7014 str_modify_keep_cr(str);
7015 }
7016 return str;
7017}
7018
7019
7020/*
7021 * call-seq:
7022 * include?(other_string) -> true or false
7023 *
7024 * Returns whether +self+ contains +other_string+:
7025 *
7026 * s = 'bar'
7027 * s.include?('ba') # => true
7028 * s.include?('ar') # => true
7029 * s.include?('bar') # => true
7030 * s.include?('a') # => true
7031 * s.include?('') # => true
7032 * s.include?('foo') # => false
7033 *
7034 * Related: see {Querying}[rdoc-ref:String@Querying].
7035 */
7036
7037VALUE
7038rb_str_include(VALUE str, VALUE arg)
7039{
7040 long i;
7041
7042 StringValue(arg);
7043 i = rb_str_index(str, arg, 0);
7044
7045 return RBOOL(i != -1);
7046}
7047
7048
7049/*
7050 * call-seq:
7051 * to_i(base = 10) -> integer
7052 *
7053 * Returns the result of interpreting leading characters in +self+
7054 * as an integer in the given +base+;
7055 * +base+ must be either +0+ or in range <tt>(2..36)</tt>:
7056 *
7057 * '123456'.to_i # => 123456
7058 * '123def'.to_i(16) # => 1195503
7059 *
 * With +base+ zero given, +self+ may contain leading characters
7061 * to specify the actual base:
7062 *
7063 * '123def'.to_i(0) # => 123
7064 * '0123def'.to_i(0) # => 83
7065 * '0b123def'.to_i(0) # => 1
7066 * '0o123def'.to_i(0) # => 83
7067 * '0d123def'.to_i(0) # => 123
7068 * '0x123def'.to_i(0) # => 1195503
7069 *
7070 * Characters past a leading valid number (in the given +base+) are ignored:
7071 *
7072 * '12.345'.to_i # => 12
7073 * '12345'.to_i(2) # => 1
7074 *
7075 * Returns zero if there is no leading valid number:
7076 *
7077 * 'abcdef'.to_i # => 0
7078 * '2'.to_i(2) # => 0
7079 *
7080 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
7081 */
7082
7083static VALUE
7084rb_str_to_i(int argc, VALUE *argv, VALUE str)
7085{
7086 int base = 10;
7087
7088 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7089 rb_raise(rb_eArgError, "invalid radix %d", base);
7090 }
7091 return rb_str_to_inum(str, base, FALSE);
7092}
7093
7094
7095/*
7096 * call-seq:
7097 * to_f -> float
7098 *
7099 * Returns the result of interpreting leading characters in +self+ as a Float:
7100 *
7101 * '3.14159'.to_f # => 3.14159
7102 * '1.234e-2'.to_f # => 0.01234
7103 *
7104 * Characters past a leading valid number are ignored:
7105 *
7106 * '3.14 (pi to two places)'.to_f # => 3.14
7107 *
7108 * Returns zero if there is no leading valid number:
7109 *
7110 * 'abcdef'.to_f # => 0.0
7111 *
 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
7113 */
7114
7115static VALUE
7116rb_str_to_f(VALUE str)
7117{
7118 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7119}
7120
7121
7122/*
7123 * call-seq:
7124 * to_s -> self or new_string
7125 *
7126 * Returns +self+ if +self+ is a +String+,
7127 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7128 *
7129 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7130 */
7131
7132static VALUE
7133rb_str_to_s(VALUE str)
7134{
7135 if (rb_obj_class(str) != rb_cString) {
7136 return str_duplicate(rb_cString, str);
7137 }
7138 return str;
7139}
7140
#if 0
/* Compiled out by the surrounding #if 0.
 * Writes code point c into a temporary buffer with rb_enc_mbcput() and
 * appends the rb_enc_codelen(c, enc) bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
7152
7153#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7154
7155int
7156rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7157{
7158 char buf[CHAR_ESC_LEN + 1];
7159 int l;
7160
7161#if SIZEOF_INT > 4
7162 c &= 0xffffffff;
7163#endif
7164 if (unicode_p) {
7165 if (c < 0x7F && ISPRINT(c)) {
7166 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7167 }
7168 else if (c < 0x10000) {
7169 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7170 }
7171 else {
7172 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7173 }
7174 }
7175 else {
7176 if (c < 0x100) {
7177 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7178 }
7179 else {
7180 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7181 }
7182 }
7183 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7184 rb_str_buf_cat(result, buf, l);
7185 return l;
7186}
7187
const char *
ruby_escaped_char(int c)
{
    /* Maps a control character to its backslash escape as a C string.
     * Returns NULL for every character that has no such short form. */
    const char *escape = NULL;

    switch (c) {
      case '\0':   escape = "\\0";  break;
      case '\007': escape = "\\a";  break;
      case '\010': escape = "\\b";  break;
      case '\t':   escape = "\\t";  break;
      case '\n':   escape = "\\n";  break;
      case '\013': escape = "\\v";  break;
      case '\f':   escape = "\\f";  break;
      case '\r':   escape = "\\r";  break;
      case '\033': escape = "\\e";  break;
      case '\x7f': escape = "\\c?"; break;
      default:     break;
    }
    return escape;
}
7205
/*
 * Returns a new US-ASCII string in which the characters of str that need it
 * are replaced by escape sequences.
 *
 * The loop walks str character by character.  Characters that can be copied
 * verbatim are not copied one at a time: `prev` marks the start of the
 * pending verbatim run, which is flushed to the result just before an escape
 * is emitted and once more after the loop.
 */
VALUE
rb_str_escape(VALUE str)
{
    int encidx = ENCODING_GET(str);
    rb_encoding *enc = rb_enc_from_index(encidx);
    const char *p = RSTRING_PTR(str);
    const char *pend = RSTRING_END(str);
    const char *prev = p;
    char buf[CHAR_ESC_LEN + 1];
    VALUE result = rb_str_buf_new(0);
    int unicode_p = rb_enc_unicode_p(enc);
    int asciicompat = rb_enc_asciicompat(enc);

    while (p < pend) {
        unsigned int c;
        const char *cc;
        int n = rb_enc_precise_mbclen(p, pend, enc);
        if (!MBCLEN_CHARFOUND_P(n)) {
            /* No complete character at p: emit \xXX for the next
             * rb_enc_mbminlen() bytes (or fewer at the end of the string). */
            if (p > prev) str_buf_cat(result, prev, p - prev);
            n = rb_enc_mbminlen(enc);
            if (pend < p + n)
                n = (int)(pend - p);
            while (n--) {
                snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
                str_buf_cat(result, buf, strlen(buf));
                prev = ++p;
            }
            continue;
        }
        n = MBCLEN_CHARFOUND_LEN(n);
        c = rb_enc_mbc_to_codepoint(p, pend, enc);
        p += n;
        cc = ruby_escaped_char(c);
        if (cc) {
            /* Character with a short escape such as \n or \t. */
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            str_buf_cat(result, cc, strlen(cc));
            prev = p;
        }
        else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
            /* Printable ASCII: left in the pending verbatim run. */
        }
        else {
            /* Everything else gets a numeric escape. */
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            rb_str_buf_cat_escaped_char(result, c, unicode_p);
            prev = p;
        }
    }
    if (p > prev) str_buf_cat(result, prev, p - prev);
    ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);

    return result;
}
7257
7258/*
7259 * call-seq:
7260 * inspect -> string
7261 *
7262 * :include: doc/string/inspect.rdoc
7263 *
7264 */
7265
7266VALUE
7268{
7269 int encidx = ENCODING_GET(str);
7270 rb_encoding *enc = rb_enc_from_index(encidx);
7271 const char *p, *pend, *prev;
7272 char buf[CHAR_ESC_LEN + 1];
7273 VALUE result = rb_str_buf_new(0);
7274 rb_encoding *resenc = rb_default_internal_encoding();
7275 int unicode_p = rb_enc_unicode_p(enc);
7276 int asciicompat = rb_enc_asciicompat(enc);
7277
7278 if (resenc == NULL) resenc = rb_default_external_encoding();
7279 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7280 rb_enc_associate(result, resenc);
7281 str_buf_cat2(result, "\"");
7282
7283 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7284 prev = p;
7285 while (p < pend) {
7286 unsigned int c, cc;
7287 int n;
7288
7289 n = rb_enc_precise_mbclen(p, pend, enc);
7290 if (!MBCLEN_CHARFOUND_P(n)) {
7291 if (p > prev) str_buf_cat(result, prev, p - prev);
7292 n = rb_enc_mbminlen(enc);
7293 if (pend < p + n)
7294 n = (int)(pend - p);
7295 while (n--) {
7296 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7297 str_buf_cat(result, buf, strlen(buf));
7298 prev = ++p;
7299 }
7300 continue;
7301 }
7302 n = MBCLEN_CHARFOUND_LEN(n);
7303 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7304 p += n;
7305 if ((asciicompat || unicode_p) &&
7306 (c == '"'|| c == '\\' ||
7307 (c == '#' &&
7308 p < pend &&
7309 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7310 (cc = rb_enc_codepoint(p,pend,enc),
7311 (cc == '$' || cc == '@' || cc == '{'))))) {
7312 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7313 str_buf_cat2(result, "\\");
7314 if (asciicompat || enc == resenc) {
7315 prev = p - n;
7316 continue;
7317 }
7318 }
7319 switch (c) {
7320 case '\n': cc = 'n'; break;
7321 case '\r': cc = 'r'; break;
7322 case '\t': cc = 't'; break;
7323 case '\f': cc = 'f'; break;
7324 case '\013': cc = 'v'; break;
7325 case '\010': cc = 'b'; break;
7326 case '\007': cc = 'a'; break;
7327 case 033: cc = 'e'; break;
7328 default: cc = 0; break;
7329 }
7330 if (cc) {
7331 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7332 buf[0] = '\\';
7333 buf[1] = (char)cc;
7334 str_buf_cat(result, buf, 2);
7335 prev = p;
7336 continue;
7337 }
7338 /* The special casing of 0x85 (NEXT_LINE) here is because
7339 * Oniguruma historically treats it as printable, but it
7340 * doesn't match the print POSIX bracket class or character
7341 * property in regexps.
7342 *
7343 * See Ruby Bug #16842 for details:
7344 * https://bugs.ruby-lang.org/issues/16842
7345 */
7346 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7347 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7348 continue;
7349 }
7350 else {
7351 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7352 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7353 prev = p;
7354 continue;
7355 }
7356 }
7357 if (p > prev) str_buf_cat(result, prev, p - prev);
7358 str_buf_cat2(result, "\"");
7359
7360 return result;
7361}
7362
/* True when p (bounded by e) points at '$', '@' or '{'; rb_str_dump uses it
 * to decide whether a preceding '#' must be written as "\#". */
#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7364
7365/*
7366 * call-seq:
7367 * dump -> new_string
7368 *
7369 * :include: doc/string/dump.rdoc
7370 *
7371 */
7372
7373VALUE
7375{
7376 int encidx = rb_enc_get_index(str);
7377 rb_encoding *enc = rb_enc_from_index(encidx);
7378 long len;
7379 const char *p, *pend;
7380 char *q, *qend;
7381 VALUE result;
7382 int u8 = (encidx == rb_utf8_encindex());
7383 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7384
7385 len = 2; /* "" */
7386 if (!rb_enc_asciicompat(enc)) {
7387 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7388 len += strlen(enc->name);
7389 }
7390
7391 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7392 while (p < pend) {
7393 int clen;
7394 unsigned char c = *p++;
7395
7396 switch (c) {
7397 case '"': case '\\':
7398 case '\n': case '\r':
7399 case '\t': case '\f':
7400 case '\013': case '\010': case '\007': case '\033':
7401 clen = 2;
7402 break;
7403
7404 case '#':
7405 clen = IS_EVSTR(p, pend) ? 2 : 1;
7406 break;
7407
7408 default:
7409 if (ISPRINT(c)) {
7410 clen = 1;
7411 }
7412 else {
7413 if (u8 && c > 0x7F) { /* \u notation */
7414 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7415 if (MBCLEN_CHARFOUND_P(n)) {
7416 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7417 if (cc <= 0xFFFF)
7418 clen = 6; /* \uXXXX */
7419 else if (cc <= 0xFFFFF)
7420 clen = 9; /* \u{XXXXX} */
7421 else
7422 clen = 10; /* \u{XXXXXX} */
7423 p += MBCLEN_CHARFOUND_LEN(n)-1;
7424 break;
7425 }
7426 }
7427 clen = 4; /* \xNN */
7428 }
7429 break;
7430 }
7431
7432 if (clen > LONG_MAX - len) {
7433 rb_raise(rb_eRuntimeError, "string size too big");
7434 }
7435 len += clen;
7436 }
7437
7438 result = rb_str_new(0, len);
7439 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7440 q = RSTRING_PTR(result); qend = q + len + 1;
7441
7442 *q++ = '"';
7443 while (p < pend) {
7444 unsigned char c = *p++;
7445
7446 if (c == '"' || c == '\\') {
7447 *q++ = '\\';
7448 *q++ = c;
7449 }
7450 else if (c == '#') {
7451 if (IS_EVSTR(p, pend)) *q++ = '\\';
7452 *q++ = '#';
7453 }
7454 else if (c == '\n') {
7455 *q++ = '\\';
7456 *q++ = 'n';
7457 }
7458 else if (c == '\r') {
7459 *q++ = '\\';
7460 *q++ = 'r';
7461 }
7462 else if (c == '\t') {
7463 *q++ = '\\';
7464 *q++ = 't';
7465 }
7466 else if (c == '\f') {
7467 *q++ = '\\';
7468 *q++ = 'f';
7469 }
7470 else if (c == '\013') {
7471 *q++ = '\\';
7472 *q++ = 'v';
7473 }
7474 else if (c == '\010') {
7475 *q++ = '\\';
7476 *q++ = 'b';
7477 }
7478 else if (c == '\007') {
7479 *q++ = '\\';
7480 *q++ = 'a';
7481 }
7482 else if (c == '\033') {
7483 *q++ = '\\';
7484 *q++ = 'e';
7485 }
7486 else if (ISPRINT(c)) {
7487 *q++ = c;
7488 }
7489 else {
7490 *q++ = '\\';
7491 if (u8) {
7492 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7493 if (MBCLEN_CHARFOUND_P(n)) {
7494 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7495 p += n;
7496 if (cc <= 0xFFFF)
7497 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7498 else
7499 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7500 q += strlen(q);
7501 continue;
7502 }
7503 }
7504 snprintf(q, qend-q, "x%02X", c);
7505 q += 3;
7506 }
7507 }
7508 *q++ = '"';
7509 *q = '\0';
7510 if (!rb_enc_asciicompat(enc)) {
7511 snprintf(q, qend-q, nonascii_suffix, enc->name);
7512 encidx = rb_ascii8bit_encindex();
7513 }
7514 /* result from dump is ASCII */
7515 rb_enc_associate_index(result, encidx);
7517 return result;
7518}
7519
/*
 * Maps the letter of a backslash escape ('n', 'r', 't', 'f', 'v', 'b', 'a',
 * 'e') to the control character it denotes.  Any other value is returned
 * unchanged, so the function always returns a defined value.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
    }
    return c;
}
7543
7544static void
7545undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7546{
7547 const char *s = *ss;
7548 unsigned int c;
7549 int codelen;
7550 size_t hexlen;
7551 unsigned char buf[6];
7552 static rb_encoding *enc_utf8 = NULL;
7553
7554 switch (*s) {
7555 case '\\':
7556 case '"':
7557 case '#':
7558 rb_str_cat(undumped, s, 1); /* cat itself */
7559 s++;
7560 break;
7561 case 'n':
7562 case 'r':
7563 case 't':
7564 case 'f':
7565 case 'v':
7566 case 'b':
7567 case 'a':
7568 case 'e':
7569 *buf = unescape_ascii(*s);
7570 rb_str_cat(undumped, (char *)buf, 1);
7571 s++;
7572 break;
7573 case 'u':
7574 if (*binary) {
7575 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7576 }
7577 *utf8 = true;
7578 if (++s >= s_end) {
7579 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7580 }
7581 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7582 if (*penc != enc_utf8) {
7583 *penc = enc_utf8;
7584 rb_enc_associate(undumped, enc_utf8);
7585 }
7586 if (*s == '{') { /* handle \u{...} form */
7587 s++;
7588 for (;;) {
7589 if (s >= s_end) {
7590 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7591 }
7592 if (*s == '}') {
7593 s++;
7594 break;
7595 }
7596 if (ISSPACE(*s)) {
7597 s++;
7598 continue;
7599 }
7600 c = scan_hex(s, s_end-s, &hexlen);
7601 if (hexlen == 0 || hexlen > 6) {
7602 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7603 }
7604 if (c > 0x10ffff) {
7605 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7606 }
7607 if (0xd800 <= c && c <= 0xdfff) {
7608 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7609 }
7610 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7611 rb_str_cat(undumped, (char *)buf, codelen);
7612 s += hexlen;
7613 }
7614 }
7615 else { /* handle \uXXXX form */
7616 c = scan_hex(s, 4, &hexlen);
7617 if (hexlen != 4) {
7618 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7619 }
7620 if (0xd800 <= c && c <= 0xdfff) {
7621 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7622 }
7623 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7624 rb_str_cat(undumped, (char *)buf, codelen);
7625 s += hexlen;
7626 }
7627 break;
7628 case 'x':
7629 if (++s >= s_end) {
7630 rb_raise(rb_eRuntimeError, "invalid hex escape");
7631 }
7632 *buf = scan_hex(s, 2, &hexlen);
7633 if (hexlen != 2) {
7634 rb_raise(rb_eRuntimeError, "invalid hex escape");
7635 }
7636 if (!ISASCII(*buf)) {
7637 if (*utf8) {
7638 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7639 }
7640 *binary = true;
7641 }
7642 rb_str_cat(undumped, (char *)buf, 1);
7643 s += hexlen;
7644 break;
7645 default:
7646 rb_str_cat(undumped, s-1, 2);
7647 s++;
7648 }
7649
7650 *ss = s;
7651}
7652
/* Forward declaration: str_undump below calls this before its definition. */
static VALUE rb_str_is_ascii_only_p(VALUE str);
7654
7655/*
7656 * call-seq:
7657 * undump -> new_string
7658 *
7659 * Inverse of String#dump; returns a copy of +self+ with changes of the kinds made by String#dump "undone."
7660 *
7661 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7662 */
7663
7664static VALUE
7665str_undump(VALUE str)
7666{
7667 const char *s = RSTRING_PTR(str);
7668 const char *s_end = RSTRING_END(str);
7669 rb_encoding *enc = rb_enc_get(str);
7670 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7671 bool utf8 = false;
7672 bool binary = false;
7673 int w;
7674
7676 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7677 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7678 }
7679 if (!str_null_check(str, &w)) {
7680 rb_raise(rb_eRuntimeError, "string contains null byte");
7681 }
7682 if (RSTRING_LEN(str) < 2) goto invalid_format;
7683 if (*s != '"') goto invalid_format;
7684
7685 /* strip '"' at the start */
7686 s++;
7687
7688 for (;;) {
7689 if (s >= s_end) {
7690 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7691 }
7692
7693 if (*s == '"') {
7694 /* epilogue */
7695 s++;
7696 if (s == s_end) {
7697 /* ascii compatible dumped string */
7698 break;
7699 }
7700 else {
7701 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7702 static const char dup_suffix[] = ".dup";
7703 const char *encname;
7704 int encidx;
7705 ptrdiff_t size;
7706
7707 /* check separately for strings dumped by older versions */
7708 size = sizeof(dup_suffix) - 1;
7709 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7710
7711 size = sizeof(force_encoding_suffix) - 1;
7712 if (s_end - s <= size) goto invalid_format;
7713 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7714 s += size;
7715
7716 if (utf8) {
7717 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7718 }
7719
7720 encname = s;
7721 s = memchr(s, '"', s_end-s);
7722 size = s - encname;
7723 if (!s) goto invalid_format;
7724 if (s_end - s != 2) goto invalid_format;
7725 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7726
7727 encidx = rb_enc_find_index2(encname, (long)size);
7728 if (encidx < 0) {
7729 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7730 }
7731 rb_enc_associate_index(undumped, encidx);
7732 }
7733 break;
7734 }
7735
7736 if (*s == '\\') {
7737 s++;
7738 if (s >= s_end) {
7739 rb_raise(rb_eRuntimeError, "invalid escape");
7740 }
7741 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7742 }
7743 else {
7744 rb_str_cat(undumped, s++, 1);
7745 }
7746 }
7747
7748 RB_GC_GUARD(str);
7749
7750 return undumped;
7751invalid_format:
7752 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7753}
7754
7755static void
7756rb_str_check_dummy_enc(rb_encoding *enc)
7757{
7758 if (rb_enc_dummy_p(enc)) {
7759 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7760 rb_enc_name(enc));
7761 }
7762}
7763
7764static rb_encoding *
7765str_true_enc(VALUE str)
7766{
7767 rb_encoding *enc = STR_ENC_GET(str);
7768 rb_str_check_dummy_enc(enc);
7769 return enc;
7770}
7771
7772static OnigCaseFoldType
7773check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7774{
7775 if (argc==0)
7776 return flags;
7777 if (argc>2)
7778 rb_raise(rb_eArgError, "too many options");
7779 if (argv[0]==sym_turkic) {
7780 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7781 if (argc==2) {
7782 if (argv[1]==sym_lithuanian)
7783 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7784 else
7785 rb_raise(rb_eArgError, "invalid second option");
7786 }
7787 }
7788 else if (argv[0]==sym_lithuanian) {
7789 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7790 if (argc==2) {
7791 if (argv[1]==sym_turkic)
7792 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7793 else
7794 rb_raise(rb_eArgError, "invalid second option");
7795 }
7796 }
7797 else if (argc>1)
7798 rb_raise(rb_eArgError, "too many options");
7799 else if (argv[0]==sym_ascii)
7800 flags |= ONIGENC_CASE_ASCII_ONLY;
7801 else if (argv[0]==sym_fold) {
7802 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7803 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7804 else
7805 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7806 }
7807 else
7808 rb_raise(rb_eArgError, "invalid option");
7809 return flags;
7810}
7811
7812static inline bool
7813case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7814{
7815 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7816 return true;
7817 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7818}
7819
/* Extra bytes added to the capacity of every buffer that rb_str_casemap
 * allocates.  16 was estimated to be long enough to absorb any kind of
 * single character length increase; the value actually used is 20. */
#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* Define CASEMAP_DEBUG as nonzero to make the case-mapping functions print
 * buffer statistics to stderr; it defaults to 0. */
#ifndef CASEMAP_DEBUG
# define CASEMAP_DEBUG 0
#endif
7825
7826struct mapping_buffer;
7827typedef struct mapping_buffer {
7828 size_t capa;
7829 size_t used;
7830 struct mapping_buffer *next;
7831 OnigUChar space[FLEX_ARY_LEN];
7833
7834static void
7835mapping_buffer_free(void *p)
7836{
7837 mapping_buffer *previous_buffer;
7838 mapping_buffer *current_buffer = p;
7839 while (current_buffer) {
7840 previous_buffer = current_buffer;
7841 current_buffer = current_buffer->next;
7842 ruby_xfree_sized(previous_buffer, offsetof(mapping_buffer, space) + previous_buffer->capa);
7843 }
7844}
7845
/* Typed-data descriptor for the anchor object created in rb_str_casemap.
 * mapping_buffer_free is registered in the function table, presumably so
 * that a chain still attached to the anchor is released when the anchor
 * object is freed -- confirm against the rb_data_type_t layout. */
static const rb_data_type_t mapping_buffer_type = {
    "mapping_buffer",
    {0, mapping_buffer_free,},
    0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
};
7851
/*
 * Runs the encoding's case_map function over +source+ and returns a new
 * string holding the mapped bytes, with the encoding copied from +source+.
 * An empty +source+ yields a duplicate of it.
 *
 * +flags+ is handed to enc->case_map by pointer; callers inspect
 * ONIGENC_CASE_MODIFIED in it afterwards (presumably set by case_map when
 * something changed -- confirm against the encoding implementation).
 *
 * Output is collected in a chain of mapping_buffer chunks hung on a
 * typed-data anchor object.  Raises ArgumentError ("input string invalid")
 * when case_map reports a negative length.
 *
 * NOTE(review): the total length is accumulated in an int; looks like very
 * large results could overflow it -- confirm.
 */
static VALUE
rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
{
    VALUE target;

    const OnigUChar *source_current, *source_end;
    int target_length = 0;
    VALUE buffer_anchor;
    mapping_buffer *current_buffer = 0;
    mapping_buffer **pre_buffer;    /* slot that will receive the next chunk */
    size_t buffer_count = 0;
    int buffer_length_or_invalid;

    if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);

    source_current = (OnigUChar*)RSTRING_PTR(source);
    source_end = (OnigUChar*)RSTRING_END(source);

    buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
    pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
    /* case_map advances source_current; loop until all input is consumed,
     * allocating one more chunk per round. */
    while (source_current < source_end) {
        /* increase multiplier using buffer count to converge quickly */
        size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
        if (CASEMAP_DEBUG) {
            fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
        }
        current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
        /* link the new chunk at the tail of the chain */
        *pre_buffer = current_buffer;
        pre_buffer = &current_buffer->next;
        current_buffer->next = NULL;
        current_buffer->capa = capa;
        buffer_length_or_invalid = enc->case_map(flags,
                                                 &source_current, source_end,
                                                 current_buffer->space,
                                                 current_buffer->space+current_buffer->capa,
                                                 enc);
        if (buffer_length_or_invalid < 0) {
            /* detach the chain from the anchor and free it before raising */
            current_buffer = DATA_PTR(buffer_anchor);
            DATA_PTR(buffer_anchor) = 0;
            mapping_buffer_free(current_buffer);
            rb_raise(rb_eArgError, "input string invalid");
        }
        target_length += current_buffer->used = buffer_length_or_invalid;
    }
    if (CASEMAP_DEBUG) {
        fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
    }

    if (buffer_count==1) {
        /* single chunk: build the result directly from it */
        target = rb_str_new((const char*)current_buffer->space, target_length);
    }
    else {
        char *target_current;

        /* several chunks: concatenate them in chain order */
        target = rb_str_new(0, target_length);
        target_current = RSTRING_PTR(target);
        current_buffer = DATA_PTR(buffer_anchor);
        while (current_buffer) {
            memcpy(target_current, current_buffer->space, current_buffer->used);
            target_current += current_buffer->used;
            current_buffer = current_buffer->next;
        }
    }
    /* detach and free the chain now that its contents have been copied */
    current_buffer = DATA_PTR(buffer_anchor);
    DATA_PTR(buffer_anchor) = 0;
    mapping_buffer_free(current_buffer);

    RB_GC_GUARD(buffer_anchor);

    /* TODO: check about string terminator character */
    str_enc_copy_direct(target, source);
    /*ENC_CODERANGE_SET(mapped, cr);*/

    return target;
}
7927
7928static VALUE
7929rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7930{
7931 const OnigUChar *source_current, *source_end;
7932 OnigUChar *target_current, *target_end;
7933 long old_length = RSTRING_LEN(source);
7934 int length_or_invalid;
7935
7936 if (old_length == 0) return Qnil;
7937
7938 source_current = (OnigUChar*)RSTRING_PTR(source);
7939 source_end = (OnigUChar*)RSTRING_END(source);
7940 if (source == target) {
7941 target_current = (OnigUChar*)source_current;
7942 target_end = (OnigUChar*)source_end;
7943 }
7944 else {
7945 target_current = (OnigUChar*)RSTRING_PTR(target);
7946 target_end = (OnigUChar*)RSTRING_END(target);
7947 }
7948
7949 length_or_invalid = onigenc_ascii_only_case_map(flags,
7950 &source_current, source_end,
7951 target_current, target_end, enc);
7952 if (length_or_invalid < 0)
7953 rb_raise(rb_eArgError, "input string invalid");
7954 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7955 fprintf(stderr, "problem with rb_str_ascii_casemap"
7956 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7957 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7958 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7959 }
7960
7961 str_enc_copy(target, source);
7962
7963 return target;
7964}
7965
7966static bool
7967upcase_single(VALUE str)
7968{
7969 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7970 bool modified = false;
7971
7972 while (s < send) {
7973 unsigned int c = *(unsigned char*)s;
7974
7975 if ('a' <= c && c <= 'z') {
7976 *s = 'A' + (c - 'a');
7977 modified = true;
7978 }
7979 s++;
7980 }
7981 return modified;
7982}
7983
7984/*
7985 * call-seq:
7986 * upcase!(mapping) -> self or nil
7987 *
7988 * Like String#upcase, except that:
7989 *
7990 * - Changes character casings in +self+ (not in a copy of +self+).
7991 * - Returns +self+ if any changes are made, +nil+ otherwise.
7992 *
7993 * Related: See {Modifying}[rdoc-ref:String@Modifying].
7994 */
7995
7996static VALUE
7997rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7998{
7999 rb_encoding *enc;
8000 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8001
8002 flags = check_case_options(argc, argv, flags);
8003 str_modify_keep_cr(str);
8004 enc = str_true_enc(str);
8005 if (case_option_single_p(flags, enc, str)) {
8006 if (upcase_single(str))
8007 flags |= ONIGENC_CASE_MODIFIED;
8008 }
8009 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8010 rb_str_ascii_casemap(str, str, &flags, enc);
8011 else
8012 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8013
8014 if (ONIGENC_CASE_MODIFIED&flags) return str;
8015 return Qnil;
8016}
8017
8018
8019/*
8020 * call-seq:
8021 * upcase(mapping = :ascii) -> new_string
8022 *
8023 * :include: doc/string/upcase.rdoc
8024 */
8025
8026static VALUE
8027rb_str_upcase(int argc, VALUE *argv, VALUE str)
8028{
8029 rb_encoding *enc;
8030 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8031 VALUE ret;
8032
8033 flags = check_case_options(argc, argv, flags);
8034 enc = str_true_enc(str);
8035 if (case_option_single_p(flags, enc, str)) {
8036 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8037 str_enc_copy_direct(ret, str);
8038 upcase_single(ret);
8039 }
8040 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8041 ret = rb_str_new(0, RSTRING_LEN(str));
8042 rb_str_ascii_casemap(str, ret, &flags, enc);
8043 }
8044 else {
8045 ret = rb_str_casemap(str, &flags, enc);
8046 }
8047
8048 return ret;
8049}
8050
8051static bool
8052downcase_single(VALUE str)
8053{
8054 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8055 bool modified = false;
8056
8057 while (s < send) {
8058 unsigned int c = *(unsigned char*)s;
8059
8060 if ('A' <= c && c <= 'Z') {
8061 *s = 'a' + (c - 'A');
8062 modified = true;
8063 }
8064 s++;
8065 }
8066
8067 return modified;
8068}
8069
8070/*
8071 * call-seq:
8072 * downcase!(mapping) -> self or nil
8073 *
8074 * Like String#downcase, except that:
8075 *
8076 * - Changes character casings in +self+ (not in a copy of +self+).
8077 * - Returns +self+ if any changes are made, +nil+ otherwise.
8078 *
8079 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8080 */
8081
8082static VALUE
8083rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8084{
8085 rb_encoding *enc;
8086 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8087
8088 flags = check_case_options(argc, argv, flags);
8089 str_modify_keep_cr(str);
8090 enc = str_true_enc(str);
8091 if (case_option_single_p(flags, enc, str)) {
8092 if (downcase_single(str))
8093 flags |= ONIGENC_CASE_MODIFIED;
8094 }
8095 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8096 rb_str_ascii_casemap(str, str, &flags, enc);
8097 else
8098 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8099
8100 if (ONIGENC_CASE_MODIFIED&flags) return str;
8101 return Qnil;
8102}
8103
8104
8105/*
8106 * call-seq:
8107 * downcase(mapping = :ascii) -> new_string
8108 *
8109 * :include: doc/string/downcase.rdoc
8110 *
8111 */
8112
8113static VALUE
8114rb_str_downcase(int argc, VALUE *argv, VALUE str)
8115{
8116 rb_encoding *enc;
8117 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8118 VALUE ret;
8119
8120 flags = check_case_options(argc, argv, flags);
8121 enc = str_true_enc(str);
8122 if (case_option_single_p(flags, enc, str)) {
8123 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8124 str_enc_copy_direct(ret, str);
8125 downcase_single(ret);
8126 }
8127 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8128 ret = rb_str_new(0, RSTRING_LEN(str));
8129 rb_str_ascii_casemap(str, ret, &flags, enc);
8130 }
8131 else {
8132 ret = rb_str_casemap(str, &flags, enc);
8133 }
8134
8135 return ret;
8136}
8137
8138
8139/*
8140 * call-seq:
8141 * capitalize!(mapping = :ascii) -> self or nil
8142 *
8143 * Like String#capitalize, except that:
8144 *
8145 * - Changes character casings in +self+ (not in a copy of +self+).
8146 * - Returns +self+ if any changes are made, +nil+ otherwise.
8147 *
8148 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8149 */
8150
8151static VALUE
8152rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8153{
8154 rb_encoding *enc;
8155 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8156
8157 flags = check_case_options(argc, argv, flags);
8158 str_modify_keep_cr(str);
8159 enc = str_true_enc(str);
8160 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8161 if (flags&ONIGENC_CASE_ASCII_ONLY)
8162 rb_str_ascii_casemap(str, str, &flags, enc);
8163 else
8164 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8165
8166 if (ONIGENC_CASE_MODIFIED&flags) return str;
8167 return Qnil;
8168}
8169
8170
8171/*
8172 * call-seq:
8173 * capitalize(mapping = :ascii) -> new_string
8174 *
8175 * :include: doc/string/capitalize.rdoc
8176 *
8177 */
8178
8179static VALUE
8180rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8181{
8182 rb_encoding *enc;
8183 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8184 VALUE ret;
8185
8186 flags = check_case_options(argc, argv, flags);
8187 enc = str_true_enc(str);
8188 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8189 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8190 ret = rb_str_new(0, RSTRING_LEN(str));
8191 rb_str_ascii_casemap(str, ret, &flags, enc);
8192 }
8193 else {
8194 ret = rb_str_casemap(str, &flags, enc);
8195 }
8196 return ret;
8197}
8198
8199
8200/*
8201 * call-seq:
8202 * swapcase!(mapping) -> self or nil
8203 *
8204 * Like String#swapcase, except that:
8205 *
8206 * - Changes are made to +self+, not to copy of +self+.
8207 * - Returns +self+ if any changes are made, +nil+ otherwise.
8208 *
8209 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8210 */
8211
8212static VALUE
8213rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8214{
8215 rb_encoding *enc;
8216 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8217
8218 flags = check_case_options(argc, argv, flags);
8219 str_modify_keep_cr(str);
8220 enc = str_true_enc(str);
8221 if (flags&ONIGENC_CASE_ASCII_ONLY)
8222 rb_str_ascii_casemap(str, str, &flags, enc);
8223 else
8224 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8225
8226 if (ONIGENC_CASE_MODIFIED&flags) return str;
8227 return Qnil;
8228}
8229
8230
8231/*
8232 * call-seq:
8233 * swapcase(mapping = :ascii) -> new_string
8234 *
8235 * :include: doc/string/swapcase.rdoc
8236 *
8237 */
8238
8239static VALUE
8240rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8241{
8242 rb_encoding *enc;
8243 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8244 VALUE ret;
8245
8246 flags = check_case_options(argc, argv, flags);
8247 enc = str_true_enc(str);
8248 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8249 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8250 ret = rb_str_new(0, RSTRING_LEN(str));
8251 rb_str_ascii_casemap(str, ret, &flags, enc);
8252 }
8253 else {
8254 ret = rb_str_casemap(str, &flags, enc);
8255 }
8256 return ret;
8257}
8258
/* Pointer to unsigned bytes. */
typedef unsigned char *USTR;

/* Cursor over a tr specification string, advanced by trnext(). */
struct tr {
    int gen;                /* nonzero while a range "a-z" is being expanded */
    unsigned int now, max;  /* current code point; upper bound of the range */
    char *p, *pend;         /* read position in the specification and its end */
};
8266
/*
 * Returns the next code point described by the tr specification in +t+,
 * or (unsigned int)-1 once the specification is exhausted.
 *
 * Outside a range (t->gen == 0) one character is read; a backslash that is
 * not the last character is skipped so the character after it is taken as
 * is.  When the character is followed by '-' and at least one more
 * character, the latter becomes the upper bound: a bound below the start
 * raises ArgumentError, a bound above it switches to range mode.
 *
 * In range mode (t->gen != 0) successive code points up to t->max are
 * returned, skipping values for which ONIGENC_CODE_TO_MBCLEN is not
 * positive.
 */
static unsigned int
trnext(struct tr *t, rb_encoding *enc)
{
    int n;

    for (;;) {
      nextpart:
        if (!t->gen) {
            if (t->p == t->pend) return -1;
            /* skip an escaping backslash unless it is the final character */
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
                t->p += n;
            }
            t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
            t->p += n;
            /* "x-y": a '-' with something after it introduces a range */
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
                t->p += n;
                if (t->p < t->pend) {
                    unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
                    t->p += n;
                    if (t->now > c) {
                        if (t->now < 0x80 && c < 0x80) {
                            rb_raise(rb_eArgError,
                                     "invalid range \"%c-%c\" in string transliteration",
                                     t->now, c);
                        }
                        else {
                            rb_raise(rb_eArgError, "invalid range in string transliteration");
                        }
                        continue; /* not reached */
                    }
                    else if (t->now < c) {
                        t->gen = 1;
                        t->max = c;
                    }
                }
            }
            return t->now;
        }
        else {
            /* advance to the next code point the encoding can represent */
            while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
                if (t->now == t->max) {
                    t->gen = 0;
                    goto nextpart;
                }
            }
            if (t->now < t->max) {
                return t->now;
            }
            else {
                /* upper bound reached: leave range mode */
                t->gen = 0;
                return t->max;
            }
        }
    }
}
8322
/* Forward declaration: tr_trans below delegates to it when the replacement
 * string is empty. */
static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8324
/*
 * Translates the characters of +str+ in place: characters selected by the
 * specification +src+ are replaced by the corresponding characters of
 * +repl+ (both are expanded with trnext()).  A leading '^' in +src+ (when
 * +src+ is longer than one character) negates the selection; every
 * character NOT listed is then replaced by the last character of +repl+.
 *
 * With a nonzero +sflag+, a run of identical replacement characters
 * produced by consecutive translated characters is written only once.
 *
 * Code points below 256 are looked up in the trans[] table, larger ones in
 * a Hash created on demand.
 *
 * Returns +str+ when it was modified, Qnil otherwise (also Qnil for an
 * empty +str+).  An empty +repl+ delegates to rb_str_delete_bang.
 * Raises ArgumentError on an invalid byte sequence in +str+.
 */
static VALUE
tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
{
    const unsigned int errc = -1;   /* marker: "no translation" / end of spec */
    unsigned int trans[256];
    rb_encoding *enc, *e1, *e2;
    struct tr trsrc, trrepl;
    int cflag = 0;                  /* set when src starts with '^' */
    unsigned int c, c0, last = 0;
    int modify = 0, i, l;
    unsigned char *s, *send;
    VALUE hash = 0;
    int singlebyte = single_byte_optimizable(str);
    int termlen;
    int cr;

/* Downgrades cr from 7BIT to VALID once a non-ASCII code point is stored. */
#define CHECK_IF_ASCII(c) \
    (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
           (cr = ENC_CODERANGE_VALID) : 0)

    StringValue(src);
    StringValue(repl);
    if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
    if (RSTRING_LEN(repl) == 0) {
        return rb_str_delete_bang(1, &src, str);
    }

    cr = ENC_CODERANGE(str);
    e1 = rb_enc_check(str, src);
    e2 = rb_enc_check(str, repl);
    if (e1 == e2) {
        enc = e1;
    }
    else {
        enc = rb_enc_check(src, repl);
    }
    trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
    if (RSTRING_LEN(src) > 1 &&
        rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
        trsrc.p + l < trsrc.pend) {
        cflag = 1;
        trsrc.p += l;
    }
    trrepl.p = RSTRING_PTR(repl);
    trrepl.pend = trrepl.p + RSTRING_LEN(repl);
    trsrc.gen = trrepl.gen = 0;
    trsrc.now = trrepl.now = 0;
    trsrc.max = trrepl.max = 0;

    if (cflag) {
        /* Negated selection: mark the listed code points as untouched and
         * map everything else to the last replacement character. */
        for (i=0; i<256; i++) {
            trans[i] = 1;
        }
        while ((c = trnext(&trsrc, enc)) != errc) {
            if (c < 256) {
                trans[c] = errc;
            }
            else {
                if (!hash) hash = rb_hash_new();
                rb_hash_aset(hash, UINT2NUM(c), Qtrue);
            }
        }
        while ((c = trnext(&trrepl, enc)) != errc)
            /* retrieve last replacer */;
        last = trrepl.now;
        for (i=0; i<256; i++) {
            if (trans[i] != errc) {
                trans[i] = last;
            }
        }
    }
    else {
        unsigned int r;

        /* Positional mapping; when repl runs out, its last character is
         * reused for the remaining source characters. */
        for (i=0; i<256; i++) {
            trans[i] = errc;
        }
        while ((c = trnext(&trsrc, enc)) != errc) {
            r = trnext(&trrepl, enc);
            if (r == errc) r = trrepl.now;
            if (c < 256) {
                trans[c] = r;
                if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
            }
            else {
                if (!hash) hash = rb_hash_new();
                rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
            }
        }
    }

    if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
        cr = ENC_CODERANGE_7BIT;
    str_modify_keep_cr(str);
    s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
    termlen = rb_enc_mbminlen(enc);
    if (sflag) {
        /* Squeezing variant: the result is built in a separate buffer.
         * NOTE(review): here a code point >= 256 with no hash maps to errc
         * even when cflag is set, while the general branch below uses
         * `cflag ? last : errc` -- confirm the difference is intended. */
        int clen, tlen;
        long offset, max = RSTRING_LEN(str);
        unsigned int save = -1;     /* previous replacement written, if any */
        unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;

        while (s < send) {
            int may_modify = 0;

            int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
            if (!MBCLEN_CHARFOUND_P(r)) {
                SIZED_FREE_N(buf, max + termlen);
                rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
            }
            clen = MBCLEN_CHARFOUND_LEN(r);
            c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);

            tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);

            s += clen;
            if (c < 256) {
                c = trans[c];
            }
            else if (hash) {
                VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
                if (NIL_P(tmp)) {
                    if (cflag) c = last;
                    else c = errc;
                }
                else if (cflag) c = errc;
                else c = NUM2INT(tmp);
            }
            else {
                c = errc;
            }
            if (c != (unsigned int)-1) {
                /* same replacement as the previous one: drop it */
                if (save == c) {
                    CHECK_IF_ASCII(c);
                    continue;
                }
                save = c;
                tlen = rb_enc_codelen(c, enc);
                modify = 1;
            }
            else {
                /* untranslated character: copy it and reset the run */
                save = -1;
                c = c0;
                if (enc != e1) may_modify = 1;
            }
            /* grow the output buffer when the next character would not fit */
            if ((offset = t - buf) + tlen > max) {
                size_t MAYBE_UNUSED(old) = max + termlen;
                max = offset + tlen + (send - s);
                SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
                t = buf + offset;
            }
            rb_enc_mbcput(c, t, enc);
            if (may_modify && memcmp(s, t, tlen) != 0) {
                modify = 1;
            }
            CHECK_IF_ASCII(c);
            t += tlen;
        }
        /* install buf as the string's heap storage */
        if (!STR_EMBED_P(str)) {
            SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
        }
        TERM_FILL((char *)t, termlen);
        RSTRING(str)->as.heap.ptr = (char *)buf;
        STR_SET_LEN(str, t - buf);
        STR_SET_NOEMBED(str);
        RSTRING(str)->as.heap.aux.capa = max;
    }
    else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
        /* Byte-for-byte variant: rewrite the string's own bytes. */
        while (s < send) {
            c = (unsigned char)*s;
            if (trans[c] != errc) {
                if (!cflag) {
                    c = trans[c];
                    *s = c;
                    modify = 1;
                }
                else {
                    *s = last;
                    modify = 1;
                }
            }
            CHECK_IF_ASCII(c);
            s++;
        }
    }
    else {
        /* General variant: the result is built in a separate buffer that
         * starts at 1.2 times the input length. */
        int clen, tlen;
        long offset, max = (long)((send - s) * 1.2);
        unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;

        while (s < send) {
            int may_modify = 0;

            int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
            if (!MBCLEN_CHARFOUND_P(r)) {
                SIZED_FREE_N(buf, max + termlen);
                rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
            }
            clen = MBCLEN_CHARFOUND_LEN(r);
            c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);

            tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);

            if (c < 256) {
                c = trans[c];
            }
            else if (hash) {
                VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
                if (NIL_P(tmp)) {
                    if (cflag) c = last;
                    else c = errc;
                }
                else if (cflag) c = errc;
                else c = NUM2INT(tmp);
            }
            else {
                c = cflag ? last : errc;
            }
            if (c != errc) {
                tlen = rb_enc_codelen(c, enc);
                modify = 1;
            }
            else {
                /* untranslated character: copy it unchanged */
                c = c0;
                if (enc != e1) may_modify = 1;
            }
            /* grow the output buffer when the next character would not fit */
            if ((offset = t - buf) + tlen > max) {
                size_t MAYBE_UNUSED(old) = max + termlen;
                max = offset + tlen + (long)((send - s) * 1.2);
                SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
                t = buf + offset;
            }
            if (s != t) {
                rb_enc_mbcput(c, t, enc);
                if (may_modify && memcmp(s, t, tlen) != 0) {
                    modify = 1;
                }
            }
            CHECK_IF_ASCII(c);
            s += clen;
            t += tlen;
        }
        /* install buf as the string's heap storage */
        if (!STR_EMBED_P(str)) {
            SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
        }
        TERM_FILL((char *)t, termlen);
        RSTRING(str)->as.heap.ptr = (char *)buf;
        STR_SET_LEN(str, t - buf);
        STR_SET_NOEMBED(str);
        RSTRING(str)->as.heap.aux.capa = max;
    }

    if (modify) {
        if (cr != ENC_CODERANGE_BROKEN)
            ENC_CODERANGE_SET(str, cr);
        rb_enc_associate(str, enc);
        return str;
    }
    return Qnil;
}
8585
8586
8587/*
8588 * call-seq:
8589 * tr!(selector, replacements) -> self or nil
8590 *
8591 * Like String#tr, except:
8592 *
8593 * - Performs substitutions in +self+ (not in a copy of +self+).
8594 * - Returns +self+ if any modifications were made, +nil+ otherwise.
8595 *
8596 * Related: {Modifying}[rdoc-ref:String@Modifying].
8597 */
8598
8599static VALUE
8600rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8601{
8602 return tr_trans(str, src, repl, 0);
8603}
8604
8605
8606/*
8607 * call-seq:
8608 * tr(selector, replacements) -> new_string
8609 *
8610 * Returns a copy of +self+ with each character specified by string +selector+
8611 * translated to the corresponding character in string +replacements+.
8612 * The correspondence is _positional_:
8613 *
8614 * - Each occurrence of the first character specified by +selector+
8615 * is translated to the first character in +replacements+.
8616 * - Each occurrence of the second character specified by +selector+
8617 * is translated to the second character in +replacements+.
8618 * - And so on.
8619 *
8620 * Example:
8621 *
8622 * 'hello'.tr('el', 'ip') #=> "hippo"
8623 *
8624 * If +replacements+ is shorter than +selector+,
8625 * it is implicitly padded with its own last character:
8626 *
8627 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8628 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8629 *
8630 * Arguments +selector+ and +replacements+ must be valid character selectors
8631 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8632 * and may use any of its valid forms, including negation, ranges, and escapes:
8633 *
8634 * 'hello'.tr('^aeiou', '-') # => "-e--o" # Negation.
8635 * 'ibm'.tr('b-z', 'a-z') # => "hal" # Range.
8636 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8637 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8638 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8639 *
8640 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8641 */
8642
8643static VALUE
8644rb_str_tr(VALUE str, VALUE src, VALUE repl)
8645{
8646 str = str_duplicate(rb_cString, str);
8647 tr_trans(str, src, repl, 0);
8648 return str;
8649}
8650
8651#define TR_TABLE_MAX (UCHAR_MAX+1)
8652#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8653static void
8654tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8655 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8656{
8657 const unsigned int errc = -1;
8658 char buf[TR_TABLE_MAX];
8659 struct tr tr;
8660 unsigned int c;
8661 VALUE table = 0, ptable = 0;
8662 int i, l, cflag = 0;
8663
8664 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8665 tr.gen = tr.now = tr.max = 0;
8666
8667 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8668 cflag = 1;
8669 tr.p += l;
8670 }
8671 if (first) {
8672 for (i=0; i<TR_TABLE_MAX; i++) {
8673 stable[i] = 1;
8674 }
8675 stable[TR_TABLE_MAX] = cflag;
8676 }
8677 else if (stable[TR_TABLE_MAX] && !cflag) {
8678 stable[TR_TABLE_MAX] = 0;
8679 }
8680 for (i=0; i<TR_TABLE_MAX; i++) {
8681 buf[i] = cflag;
8682 }
8683
8684 while ((c = trnext(&tr, enc)) != errc) {
8685 if (c < TR_TABLE_MAX) {
8686 buf[(unsigned char)c] = !cflag;
8687 }
8688 else {
8689 VALUE key = UINT2NUM(c);
8690
8691 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8692 if (cflag) {
8693 ptable = *ctablep;
8694 table = ptable ? ptable : rb_hash_new();
8695 *ctablep = table;
8696 }
8697 else {
8698 table = rb_hash_new();
8699 ptable = *tablep;
8700 *tablep = table;
8701 }
8702 }
8703 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8704 rb_hash_aset(table, key, Qtrue);
8705 }
8706 }
8707 }
8708 for (i=0; i<TR_TABLE_MAX; i++) {
8709 stable[i] = stable[i] && buf[i];
8710 }
8711 if (!table && !cflag) {
8712 *tablep = 0;
8713 }
8714}
8715
8716
8717static int
8718tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8719{
8720 if (c < TR_TABLE_MAX) {
8721 return table[c] != 0;
8722 }
8723 else {
8724 VALUE v = UINT2NUM(c);
8725
8726 if (del) {
8727 if (!NIL_P(rb_hash_lookup(del, v)) &&
8728 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8729 return TRUE;
8730 }
8731 }
8732 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8733 return FALSE;
8734 }
8735 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8736 }
8737}
8738
8739/*
8740 * call-seq:
8741 * delete!(*selectors) -> self or nil
8742 *
8743 * Like String#delete, but modifies +self+ in place;
8744 * returns +self+ if any characters were deleted, +nil+ otherwise.
8745 *
8746 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8747 */
8748
8749static VALUE
8750rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8751{
8752 char squeez[TR_TABLE_SIZE];
8753 rb_encoding *enc = 0;
8754 char *s, *send, *t;
8755 VALUE del = 0, nodel = 0;
8756 int modify = 0;
8757 int i, ascompat, cr;
8758
8759 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8761 for (i=0; i<argc; i++) {
8762 VALUE s = argv[i];
8763
8764 StringValue(s);
8765 enc = rb_enc_check(str, s);
8766 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8767 }
8768
8769 str_modify_keep_cr(str);
8770 ascompat = rb_enc_asciicompat(enc);
8771 s = t = RSTRING_PTR(str);
8772 send = RSTRING_END(str);
8773 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8774 while (s < send) {
8775 unsigned int c;
8776 int clen;
8777
8778 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8779 if (squeez[c]) {
8780 modify = 1;
8781 }
8782 else {
8783 if (t != s) *t = c;
8784 t++;
8785 }
8786 s++;
8787 }
8788 else {
8789 c = rb_enc_codepoint_len(s, send, &clen, enc);
8790
8791 if (tr_find(c, squeez, del, nodel)) {
8792 modify = 1;
8793 }
8794 else {
8795 if (t != s) rb_enc_mbcput(c, t, enc);
8796 t += clen;
8798 }
8799 s += clen;
8800 }
8801 }
8802 TERM_FILL(t, TERM_LEN(str));
8803 STR_SET_LEN(str, t - RSTRING_PTR(str));
8804 ENC_CODERANGE_SET(str, cr);
8805
8806 if (modify) return str;
8807 return Qnil;
8808}
8809
8810
8811/*
8812 * call-seq:
8813 * delete(*selectors) -> new_string
8814 *
8815 * :include: doc/string/delete.rdoc
8816 *
8817 */
8818
8819static VALUE
8820rb_str_delete(int argc, VALUE *argv, VALUE str)
8821{
8822 str = str_duplicate(rb_cString, str);
8823 rb_str_delete_bang(argc, argv, str);
8824 return str;
8825}
8826
8827
8828/*
8829 * call-seq:
8830 * squeeze!(*selectors) -> self or nil
8831 *
8832 * Like String#squeeze, except that:
8833 *
8834 * - Characters are squeezed in +self+ (not in a copy of +self+).
8835 * - Returns +self+ if any changes are made, +nil+ otherwise.
8836 *
8837 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8838 */
8839
8840static VALUE
8841rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8842{
8843 char squeez[TR_TABLE_SIZE];
8844 rb_encoding *enc = 0;
8845 VALUE del = 0, nodel = 0;
8846 unsigned char *s, *send, *t;
8847 int i, modify = 0;
8848 int ascompat, singlebyte = single_byte_optimizable(str);
8849 unsigned int save;
8850
8851 if (argc == 0) {
8852 enc = STR_ENC_GET(str);
8853 }
8854 else {
8855 for (i=0; i<argc; i++) {
8856 VALUE s = argv[i];
8857
8858 StringValue(s);
8859 enc = rb_enc_check(str, s);
8860 if (singlebyte && !single_byte_optimizable(s))
8861 singlebyte = 0;
8862 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8863 }
8864 }
8865
8866 str_modify_keep_cr(str);
8867 s = t = (unsigned char *)RSTRING_PTR(str);
8868 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8869 send = (unsigned char *)RSTRING_END(str);
8870 save = -1;
8871 ascompat = rb_enc_asciicompat(enc);
8872
8873 if (singlebyte) {
8874 while (s < send) {
8875 unsigned int c = *s++;
8876 if (c != save || (argc > 0 && !squeez[c])) {
8877 *t++ = save = c;
8878 }
8879 }
8880 }
8881 else {
8882 while (s < send) {
8883 unsigned int c;
8884 int clen;
8885
8886 if (ascompat && (c = *s) < 0x80) {
8887 if (c != save || (argc > 0 && !squeez[c])) {
8888 *t++ = save = c;
8889 }
8890 s++;
8891 }
8892 else {
8893 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8894
8895 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8896 if (t != s) rb_enc_mbcput(c, t, enc);
8897 save = c;
8898 t += clen;
8899 }
8900 s += clen;
8901 }
8902 }
8903 }
8904
8905 TERM_FILL((char *)t, TERM_LEN(str));
8906 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8907 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8908 modify = 1;
8909 }
8910
8911 if (modify) return str;
8912 return Qnil;
8913}
8914
8915
8916/*
8917 * call-seq:
8918 * squeeze(*selectors) -> new_string
8919 *
8920 * :include: doc/string/squeeze.rdoc
8921 *
8922 */
8923
8924static VALUE
8925rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8926{
8927 str = str_duplicate(rb_cString, str);
8928 rb_str_squeeze_bang(argc, argv, str);
8929 return str;
8930}
8931
8932
8933/*
8934 * call-seq:
8935 * tr_s!(selector, replacements) -> self or nil
8936 *
8937 * Like String#tr_s, except:
8938 *
8939 * - Modifies +self+ in place (not a copy of +self+).
8940 * - Returns +self+ if any changes were made, +nil+ otherwise.
8941 *
8942 * Related: {Modifying}[rdoc-ref:String@Modifying].
8943 */
8944
8945static VALUE
8946rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8947{
8948 return tr_trans(str, src, repl, 1);
8949}
8950
8951
8952/*
8953 * call-seq:
8954 * tr_s(selector, replacements) -> new_string
8955 *
8956 * Like String#tr, except:
8957 *
8958 * - Also squeezes the modified portions of the translated string;
8959 * see String#squeeze.
8960 * - Returns the translated and squeezed string.
8961 *
8962 * Examples:
8963 *
8964 * 'hello'.tr_s('l', 'r') #=> "hero"
8965 * 'hello'.tr_s('el', '-') #=> "h-o"
8966 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8967 *
8968 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8969 *
8970 */
8971
8972static VALUE
8973rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8974{
8975 str = str_duplicate(rb_cString, str);
8976 tr_trans(str, src, repl, 1);
8977 return str;
8978}
8979
8980
8981/*
8982 * call-seq:
8983 * count(*selectors) -> integer
8984 *
8985 * :include: doc/string/count.rdoc
8986 */
8987
8988static VALUE
8989rb_str_count(int argc, VALUE *argv, VALUE str)
8990{
8991 char table[TR_TABLE_SIZE];
8992 rb_encoding *enc = 0;
8993 VALUE del = 0, nodel = 0, tstr;
8994 char *s, *send;
8995 int i;
8996 int ascompat;
8997 size_t n = 0;
8998
9000
9001 tstr = argv[0];
9002 StringValue(tstr);
9003 enc = rb_enc_check(str, tstr);
9004 if (argc == 1) {
9005 const char *ptstr;
9006 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9007 (ptstr = RSTRING_PTR(tstr),
9008 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9009 !is_broken_string(str)) {
9010 int clen;
9011 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9012
9013 s = RSTRING_PTR(str);
9014 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9015 send = RSTRING_END(str);
9016 while (s < send) {
9017 if (*(unsigned char*)s++ == c) n++;
9018 }
9019 return SIZET2NUM(n);
9020 }
9021 }
9022
9023 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9024 for (i=1; i<argc; i++) {
9025 tstr = argv[i];
9026 StringValue(tstr);
9027 enc = rb_enc_check(str, tstr);
9028 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9029 }
9030
9031 s = RSTRING_PTR(str);
9032 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9033 send = RSTRING_END(str);
9034 ascompat = rb_enc_asciicompat(enc);
9035 while (s < send) {
9036 unsigned int c;
9037
9038 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9039 if (table[c]) {
9040 n++;
9041 }
9042 s++;
9043 }
9044 else {
9045 int clen;
9046 c = rb_enc_codepoint_len(s, send, &clen, enc);
9047 if (tr_find(c, table, del, nodel)) {
9048 n++;
9049 }
9050 s += clen;
9051 }
9052 }
9053
9054 return SIZET2NUM(n);
9055}
9056
9057static VALUE
9058rb_fs_check(VALUE val)
9059{
9060 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9061 val = rb_check_string_type(val);
9062 if (NIL_P(val)) return 0;
9063 }
9064 return val;
9065}
9066
/*
 * Lookup table indexed by byte value: 1 for the six whitespace bytes
 * 0x09-0x0d and 0x20, 0 for every other byte (unlisted slots are
 * zero-initialised).
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* horizontal tab */
    [0x0a] = 1, /* line feed */
    [0x0b] = 1, /* vertical tab */
    [0x0c] = 1, /* form feed */
    [0x0d] = 1, /* carriage return */
    [0x20] = 1, /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9087
9088static long
9089split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9090{
9091 if (empty_count >= 0 && len == 0) {
9092 return empty_count + 1;
9093 }
9094 if (empty_count > 0) {
9095 /* make different substrings */
9096 if (result) {
9097 do {
9098 rb_ary_push(result, str_new_empty_String(str));
9099 } while (--empty_count > 0);
9100 }
9101 else {
9102 do {
9103 rb_yield(str_new_empty_String(str));
9104 } while (--empty_count > 0);
9105 }
9106 }
9107 str = rb_str_subseq(str, beg, len);
9108 if (result) {
9109 rb_ary_push(result, str);
9110 }
9111 else {
9112 rb_yield(str);
9113 }
9114 return empty_count;
9115}
9116
/* Strategy chosen by String#split for the given field separator. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* empty pattern: split into characters */
} split_type_t;
9120
9121static split_type_t
9122literal_split_pattern(VALUE spat, split_type_t default_type)
9123{
9124 rb_encoding *enc = STR_ENC_GET(spat);
9125 const char *ptr;
9126 long len;
9127 RSTRING_GETMEM(spat, ptr, len);
9128 if (len == 0) {
9129 /* Special case - split into chars */
9130 return SPLIT_TYPE_CHARS;
9131 }
9132 else if (rb_enc_asciicompat(enc)) {
9133 if (len == 1 && ptr[0] == ' ') {
9134 return SPLIT_TYPE_AWK;
9135 }
9136 }
9137 else {
9138 int l;
9139 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9140 return SPLIT_TYPE_AWK;
9141 }
9142 }
9143 return default_type;
9144}
9145
9146/*
9147 * call-seq:
9148 * split(field_sep = $;, limit = 0) -> array_of_substrings
9149 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9150 *
9151 * :include: doc/string/split.rdoc
9152 *
9153 */
9154
9155static VALUE
9156rb_str_split_m(int argc, VALUE *argv, VALUE str)
9157{
9158 rb_encoding *enc;
9159 VALUE spat;
9160 VALUE limit;
9161 split_type_t split_type;
9162 long beg, end, i = 0, empty_count = -1;
9163 int lim = 0;
9164 VALUE result, tmp;
9165
9166 result = rb_block_given_p() ? Qfalse : Qnil;
9167 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9168 lim = NUM2INT(limit);
9169 if (lim <= 0) limit = Qnil;
9170 else if (lim == 1) {
9171 if (RSTRING_LEN(str) == 0)
9172 return result ? rb_ary_new2(0) : str;
9173 tmp = str_duplicate(rb_cString, str);
9174 if (!result) {
9175 rb_yield(tmp);
9176 return str;
9177 }
9178 return rb_ary_new3(1, tmp);
9179 }
9180 i = 1;
9181 }
9182 if (NIL_P(limit) && !lim) empty_count = 0;
9183
9184 enc = STR_ENC_GET(str);
9185 split_type = SPLIT_TYPE_REGEXP;
9186 if (!NIL_P(spat)) {
9187 spat = get_pat_quoted(spat, 0);
9188 }
9189 else if (NIL_P(spat = rb_fs)) {
9190 split_type = SPLIT_TYPE_AWK;
9191 }
9192 else if (!(spat = rb_fs_check(spat))) {
9193 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9194 }
9195 else {
9196 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9197 }
9198 if (split_type != SPLIT_TYPE_AWK) {
9199 switch (BUILTIN_TYPE(spat)) {
9200 case T_REGEXP:
9201 rb_reg_options(spat); /* check if uninitialized */
9202 tmp = RREGEXP_SRC(spat);
9203 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9204 if (split_type == SPLIT_TYPE_AWK) {
9205 spat = tmp;
9206 split_type = SPLIT_TYPE_STRING;
9207 }
9208 break;
9209
9210 case T_STRING:
9211 mustnot_broken(spat);
9212 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9213 break;
9214
9215 default:
9217 }
9218 }
9219
9220#define SPLIT_STR(beg, len) ( \
9221 empty_count = split_string(result, str, beg, len, empty_count), \
9222 str_mod_check(str, str_start, str_len))
9223
9224 beg = 0;
9225 char *ptr = RSTRING_PTR(str);
9226 char *const str_start = ptr;
9227 const long str_len = RSTRING_LEN(str);
9228 char *const eptr = str_start + str_len;
9229 if (split_type == SPLIT_TYPE_AWK) {
9230 char *bptr = ptr;
9231 int skip = 1;
9232 unsigned int c;
9233
9234 if (result) result = rb_ary_new();
9235 end = beg;
9236 if (is_ascii_string(str)) {
9237 while (ptr < eptr) {
9238 c = (unsigned char)*ptr++;
9239 if (skip) {
9240 if (ascii_isspace(c)) {
9241 beg = ptr - bptr;
9242 }
9243 else {
9244 end = ptr - bptr;
9245 skip = 0;
9246 if (!NIL_P(limit) && lim <= i) break;
9247 }
9248 }
9249 else if (ascii_isspace(c)) {
9250 SPLIT_STR(beg, end-beg);
9251 skip = 1;
9252 beg = ptr - bptr;
9253 if (!NIL_P(limit)) ++i;
9254 }
9255 else {
9256 end = ptr - bptr;
9257 }
9258 }
9259 }
9260 else {
9261 while (ptr < eptr) {
9262 int n;
9263
9264 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9265 ptr += n;
9266 if (skip) {
9267 if (rb_isspace(c)) {
9268 beg = ptr - bptr;
9269 }
9270 else {
9271 end = ptr - bptr;
9272 skip = 0;
9273 if (!NIL_P(limit) && lim <= i) break;
9274 }
9275 }
9276 else if (rb_isspace(c)) {
9277 SPLIT_STR(beg, end-beg);
9278 skip = 1;
9279 beg = ptr - bptr;
9280 if (!NIL_P(limit)) ++i;
9281 }
9282 else {
9283 end = ptr - bptr;
9284 }
9285 }
9286 }
9287 }
9288 else if (split_type == SPLIT_TYPE_STRING) {
9289 char *substr_start = ptr;
9290 char *sptr = RSTRING_PTR(spat);
9291 long slen = RSTRING_LEN(spat);
9292
9293 if (result) result = rb_ary_new();
9294 mustnot_broken(str);
9295 enc = rb_enc_check(str, spat);
9296 while (ptr < eptr &&
9297 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9298 /* Check we are at the start of a char */
9299 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9300 if (t != ptr + end) {
9301 ptr = t;
9302 continue;
9303 }
9304 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9305 str_mod_check(spat, sptr, slen);
9306 ptr += end + slen;
9307 substr_start = ptr;
9308 if (!NIL_P(limit) && lim <= ++i) break;
9309 }
9310 beg = ptr - str_start;
9311 }
9312 else if (split_type == SPLIT_TYPE_CHARS) {
9313 int n;
9314
9315 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9316 mustnot_broken(str);
9317 enc = rb_enc_get(str);
9318 while (ptr < eptr &&
9319 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9320 SPLIT_STR(ptr - str_start, n);
9321 ptr += n;
9322 if (!NIL_P(limit) && lim <= ++i) break;
9323 }
9324 beg = ptr - str_start;
9325 }
9326 else {
9327 if (result) result = rb_ary_new();
9328 long len = RSTRING_LEN(str);
9329 long start = beg;
9330 int idx;
9331 int last_null = 0;
9332 VALUE match = 0;
9333
9334 for (; rb_reg_search(spat, str, start, 0) >= 0;
9335 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9336 match = rb_backref_get();
9337 if (!result) rb_match_busy(match);
9338 end = RMATCH_BEG(match, 0);
9339 if (start == end && RMATCH_BEG(match, 0) == RMATCH_END(match, 0)) {
9340 if (!ptr) {
9341 SPLIT_STR(0, 0);
9342 break;
9343 }
9344 else if (last_null == 1) {
9345 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9346 beg = start;
9347 }
9348 else {
9349 if (start == len)
9350 start++;
9351 else
9352 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9353 last_null = 1;
9354 continue;
9355 }
9356 }
9357 else {
9358 SPLIT_STR(beg, end-beg);
9359 beg = start = RMATCH_END(match, 0);
9360 }
9361 last_null = 0;
9362
9363 for (idx = 1; idx < RMATCH_NREGS(match); idx++) {
9364 if (RMATCH_BEG(match, idx) == -1) continue;
9365 SPLIT_STR(RMATCH_BEG(match, idx), RMATCH_END(match, idx) - RMATCH_BEG(match, idx));
9366 }
9367 if (!NIL_P(limit) && lim <= ++i) break;
9368 }
9369 if (match) rb_match_unbusy(match);
9370 }
9371 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9372 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9373 }
9374
9375 return result ? result : str;
9376}
9377
9378VALUE
9379rb_str_split(VALUE str, const char *sep0)
9380{
9381 VALUE sep;
9382
9383 StringValue(str);
9384 sep = rb_str_new_cstr(sep0);
9385 return rb_str_split_m(1, &sep, str);
9386}
9387
9388#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9389
9390static inline int
9391enumerator_element(VALUE ary, VALUE e)
9392{
9393 if (ary) {
9394 rb_ary_push(ary, e);
9395 return 0;
9396 }
9397 else {
9398 rb_yield(e);
9399 return 1;
9400 }
9401}
9402
9403#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9404
9405static const char *
9406chomp_newline(const char *p, const char *e, rb_encoding *enc)
9407{
9408 const char *prev = rb_enc_prev_char(p, e, e, enc);
9409 if (rb_enc_is_newline(prev, e, enc)) {
9410 e = prev;
9411 prev = rb_enc_prev_char(p, e, e, enc);
9412 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9413 e = prev;
9414 }
9415 return e;
9416}
9417
9418static VALUE
9419get_rs(void)
9420{
9421 VALUE rs = rb_rs;
9422 if (!NIL_P(rs) &&
9423 (!RB_TYPE_P(rs, T_STRING) ||
9424 RSTRING_LEN(rs) != 1 ||
9425 RSTRING_PTR(rs)[0] != '\n')) {
9426 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9427 }
9428 return rs;
9429}
9430
9431#define rb_rs get_rs()
9432
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    /*
     * Shared worker for String#each_line and String#lines.  Each line is
     * delivered through ENUM_ELEM(): pushed onto +ary+ when it is non-zero,
     * yielded to the block otherwise.  Returns +ary+ in the former case and
     * the original receiver in the latter.
     *
     * Arguments accepted from Ruby: an optional record separator
     * (defaulting to rb_rs, i.e. get_rs()) and the keyword +chomp+.
     */
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    int rsnewline = 0;

    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
        rs = rb_rs;
    if (!NIL_P(opts)) {
        static ID keywords[1];
        if (!keywords[0]) {
            keywords[0] = rb_intern_const("chomp");
        }
        rb_get_kwargs(opts, keywords, 0, 1, &chomp);
        /* Collapse "not given" and falsy values into a plain C boolean. */
        chomp = (!UNDEF_P(chomp) && RTEST(chomp));
    }

    /* A nil separator means the whole string is the one and only line. */
    if (NIL_P(rs)) {
        if (!ENUM_ELEM(ary, str)) {
            return ary;
        }
        else {
            return orig;
        }
    }

    if (!RSTRING_LEN(str)) goto end;
    /* Iterate over a frozen version of the receiver; the pointers below
     * refer to it and not to +orig+. */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    if (rs == rb_default_rs)
        enc = rb_enc_get(str);
    else
        enc = rb_enc_check(str, rs);

    if (rslen == 0) {
        /* paragraph mode */
        int n;
        const char *eol = NULL;
        subend = subptr;
        while (subend < pend) {
            /* Negative length of the last newline consumed for this
             * paragraph; added to the line length when chomping. */
            long chomp_rslen = 0;
            do {
                /* Treat "\r\n" as a single newline: n is the width of a
                 * leading '\r', or 0 when there is none. */
                if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
                    n = 0;
                rslen = n + rb_enc_mbclen(subend + n, pend, enc);
                if (rb_enc_is_newline(subend + n, pend, enc)) {
                    /* A second consecutive newline ends the paragraph. */
                    if (eol == subend) break;
                    subend += rslen;
                    if (subptr) {
                        eol = subend;
                        chomp_rslen = -rslen;
                    }
                }
                else {
                    /* First non-newline character starts a new paragraph. */
                    if (!subptr) subptr = subend;
                    subend += rslen;
                }
                rslen = 0;
            } while (subend < pend);
            if (!subptr) break;
            if (rslen == 0) chomp_rslen = 0;
            line = rb_str_subseq(str, subptr - ptr,
                                 subend - subptr + (chomp ? chomp_rslen : rslen));
            /* ENUM_ELEM() returns 1 only when a block was called, so the
             * modification check runs only after yielding. */
            if (ENUM_ELEM(ary, line)) {
                str_mod_check(str, ptr, len);
            }
            subptr = eol = NULL;
        }
        goto end;
    }
    else {
        rsptr = RSTRING_PTR(rs);
        /* Note whether the separator is exactly one minimum-width newline
         * character; chomping then goes through chomp_newline(). */
        if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
            rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
            rsnewline = 1;
        }
    }

    /* The default separator is re-encoded into the receiver's encoding
     * when that encoding is not ASCII-compatible. */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
        rs = rb_str_new(rsptr, rslen);
        rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
        rsptr = RSTRING_PTR(rs);
        rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
        pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
        if (pos < 0) break;
        hit = subptr + pos;
        /* NOTE(review): presumably rejects byte matches that do not start
         * on a character boundary -- confirm against
         * rb_enc_right_char_head(). */
        adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
        if (hit != adjusted) {
            subptr = adjusted;
            continue;
        }
        /* hit now points just past the separator; subend is the end of
         * the emitted line and is pulled back below when chomping. */
        subend = hit += rslen;
        if (chomp) {
            if (rsnewline) {
                subend = chomp_newline(subptr, subend, enc);
            }
            else {
                subend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, subend - subptr);
        if (ENUM_ELEM(ary, line)) {
            str_mod_check(str, ptr, len);
        }
        subptr = hit;
    }

    /* Trailing text after the last separator (or the whole string when no
     * separator was found). */
    if (subptr != pend) {
        if (chomp) {
            if (rsnewline) {
                pend = chomp_newline(subptr, pend, enc);
            }
            else if (pend - subptr >= rslen &&
                     memcmp(pend - rslen, rsptr, rslen) == 0) {
                pend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, pend - subptr);
        ENUM_ELEM(ary, line);
        RB_GC_GUARD(str);
    }

  end:
    if (ary)
        return ary;
    else
        return orig;
}
9572
9573/*
9574 * call-seq:
9575 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9576 * each_line(record_separator = $/, chomp: false) -> enumerator
9577 *
9578 * :include: doc/string/each_line.rdoc
9579 *
9580 */
9581
9582static VALUE
9583rb_str_each_line(int argc, VALUE *argv, VALUE str)
9584{
9585 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9586 return rb_str_enumerate_lines(argc, argv, str, 0);
9587}
9588
9589/*
9590 * call-seq:
9591 * lines(record_separator = $/, chomp: false) -> array_of_strings
9592 *
9593 * Returns substrings ("lines") of +self+
9594 * according to the given arguments:
9595 *
9596 * s = <<~EOT
9597 * This is the first line.
9598 * This is line two.
9599 *
9600 * This is line four.
9601 * This is line five.
9602 * EOT
9603 *
9604 * With the default argument values:
9605 *
9606 * $/ # => "\n"
9607 * s.lines
9608 * # =>
9609 * ["This is the first line.\n",
9610 * "This is line two.\n",
9611 * "\n",
9612 * "This is line four.\n",
9613 * "This is line five.\n"]
9614 *
9615 * With a different +record_separator+:
9616 *
9617 * record_separator = ' is '
9618 * s.lines(record_separator)
9619 * # =>
9620 * ["This is ",
9621 * "the first line.\nThis is ",
9622 * "line two.\n\nThis is ",
9623 * "line four.\nThis is ",
9624 * "line five.\n"]
9625 *
9626 * With keyword argument +chomp+ as +true+,
9627 * removes the trailing newline from each line:
9628 *
9629 * s.lines(chomp: true)
9630 * # =>
9631 * ["This is the first line.",
9632 * "This is line two.",
9633 * "",
9634 * "This is line four.",
9635 * "This is line five."]
9636 *
9637 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
9638 */
9639
9640static VALUE
9641rb_str_lines(int argc, VALUE *argv, VALUE str)
9642{
9643 VALUE ary = WANTARRAY("lines", 0);
9644 return rb_str_enumerate_lines(argc, argv, str, ary);
9645}
9646
9647static VALUE
9648rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9649{
9650 return LONG2FIX(RSTRING_LEN(str));
9651}
9652
9653static VALUE
9654rb_str_enumerate_bytes(VALUE str, VALUE ary)
9655{
9656 long i;
9657
9658 for (i=0; i<RSTRING_LEN(str); i++) {
9659 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9660 }
9661 if (ary)
9662 return ary;
9663 else
9664 return str;
9665}
9666
9667/*
9668 * call-seq:
9669 * each_byte {|byte| ... } -> self
9670 * each_byte -> enumerator
9671 *
9672 * :include: doc/string/each_byte.rdoc
9673 *
9674 */
9675
9676static VALUE
9677rb_str_each_byte(VALUE str)
9678{
9679 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9680 return rb_str_enumerate_bytes(str, 0);
9681}
9682
9683/*
9684 * call-seq:
9685 * bytes -> array_of_bytes
9686 *
9687 * :include: doc/string/bytes.rdoc
9688 *
9689 */
9690
9691static VALUE
9692rb_str_bytes(VALUE str)
9693{
9694 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9695 return rb_str_enumerate_bytes(str, ary);
9696}
9697
9698static VALUE
9699rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9700{
9701 return rb_str_length(str);
9702}
9703
9704static VALUE
9705rb_str_enumerate_chars(VALUE str, VALUE ary)
9706{
9707 VALUE orig = str;
9708 long i, len, n;
9709 const char *ptr;
9710 rb_encoding *enc;
9711
9712 str = rb_str_new_frozen(str);
9713 ptr = RSTRING_PTR(str);
9714 len = RSTRING_LEN(str);
9715 enc = rb_enc_get(str);
9716
9718 for (i = 0; i < len; i += n) {
9719 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9720 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9721 }
9722 }
9723 else {
9724 for (i = 0; i < len; i += n) {
9725 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9726 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9727 }
9728 }
9729 RB_GC_GUARD(str);
9730 if (ary)
9731 return ary;
9732 else
9733 return orig;
9734}
9735
9736/*
9737 * call-seq:
9738 * each_char {|char| ... } -> self
9739 * each_char -> enumerator
9740 *
9741 * :include: doc/string/each_char.rdoc
9742 *
9743 */
9744
9745static VALUE
9746rb_str_each_char(VALUE str)
9747{
9748 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9749 return rb_str_enumerate_chars(str, 0);
9750}
9751
9752/*
9753 * call-seq:
9754 * chars -> array_of_characters
9755 *
9756 * :include: doc/string/chars.rdoc
9757 *
9758 */
9759
9760static VALUE
9761rb_str_chars(VALUE str)
9762{
9763 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9764 return rb_str_enumerate_chars(str, ary);
9765}
9766
9767static VALUE
9768rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9769{
9770 VALUE orig = str;
9771 int n;
9772 unsigned int c;
9773 const char *ptr, *end;
9774 rb_encoding *enc;
9775
9776 if (single_byte_optimizable(str))
9777 return rb_str_enumerate_bytes(str, ary);
9778
9779 str = rb_str_new_frozen(str);
9780 ptr = RSTRING_PTR(str);
9781 end = RSTRING_END(str);
9782 enc = STR_ENC_GET(str);
9783
9784 while (ptr < end) {
9785 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9786 ENUM_ELEM(ary, UINT2NUM(c));
9787 ptr += n;
9788 }
9789 RB_GC_GUARD(str);
9790 if (ary)
9791 return ary;
9792 else
9793 return orig;
9794}
9795
9796/*
9797 * call-seq:
9798 * each_codepoint {|codepoint| ... } -> self
9799 * each_codepoint -> enumerator
9800 *
9801 * :include: doc/string/each_codepoint.rdoc
9802 *
9803 */
9804
9805static VALUE
9806rb_str_each_codepoint(VALUE str)
9807{
9808 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9809 return rb_str_enumerate_codepoints(str, 0);
9810}
9811
9812/*
9813 * call-seq:
9814 * codepoints -> array_of_integers
9815 *
9816 * :include: doc/string/codepoints.rdoc
9817 *
9818 */
9819
9820static VALUE
9821rb_str_codepoints(VALUE str)
9822{
9823 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9824 return rb_str_enumerate_codepoints(str, ary);
9825}
9826
9827static regex_t *
9828get_reg_grapheme_cluster(rb_encoding *enc)
9829{
9830 int encidx = rb_enc_to_index(enc);
9831
9832 const OnigUChar source_ascii[] = "\\X";
9833 const OnigUChar *source = source_ascii;
9834 size_t source_len = sizeof(source_ascii) - 1;
9835
9836 switch (encidx) {
9837#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9838#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9839#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9840#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9841#define CASE_UTF(e) \
9842 case ENCINDEX_UTF_##e: { \
9843 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9844 source = source_UTF_##e; \
9845 source_len = sizeof(source_UTF_##e); \
9846 break; \
9847 }
9848 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9849#undef CASE_UTF
9850#undef CHARS_16BE
9851#undef CHARS_16LE
9852#undef CHARS_32BE
9853#undef CHARS_32LE
9854 }
9855
9856 regex_t *reg_grapheme_cluster;
9857 OnigErrorInfo einfo;
9858 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9859 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9860 if (r) {
9861 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9862 onig_error_code_to_str(message, r, &einfo);
9863 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9864 }
9865
9866 return reg_grapheme_cluster;
9867}
9868
9869static regex_t *
9870get_cached_reg_grapheme_cluster(rb_encoding *enc)
9871{
9872 int encidx = rb_enc_to_index(enc);
9873 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9874
9875 if (encidx == rb_utf8_encindex()) {
9876 if (!reg_grapheme_cluster_utf8) {
9877 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9878 }
9879
9880 return reg_grapheme_cluster_utf8;
9881 }
9882
9883 return NULL;
9884}
9885
9886static VALUE
9887rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9888{
9889 size_t grapheme_cluster_count = 0;
9890 rb_encoding *enc = get_encoding(str);
9891 const char *ptr, *end;
9892
9893 if (!rb_enc_unicode_p(enc)) {
9894 return rb_str_length(str);
9895 }
9896
9897 bool cached_reg_grapheme_cluster = true;
9898 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9899 if (!reg_grapheme_cluster) {
9900 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9901 cached_reg_grapheme_cluster = false;
9902 }
9903
9904 ptr = RSTRING_PTR(str);
9905 end = RSTRING_END(str);
9906
9907 while (ptr < end) {
9908 OnigPosition len = onig_match(reg_grapheme_cluster,
9909 (const OnigUChar *)ptr, (const OnigUChar *)end,
9910 (const OnigUChar *)ptr, NULL, 0);
9911 if (len <= 0) break;
9912 grapheme_cluster_count++;
9913 ptr += len;
9914 }
9915
9916 if (!cached_reg_grapheme_cluster) {
9917 onig_free(reg_grapheme_cluster);
9918 }
9919
9920 return SIZET2NUM(grapheme_cluster_count);
9921}
9922
9923static VALUE
9924rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9925{
9926 VALUE orig = str;
9927 rb_encoding *enc = get_encoding(str);
9928 const char *ptr0, *ptr, *end;
9929
9930 if (!rb_enc_unicode_p(enc)) {
9931 return rb_str_enumerate_chars(str, ary);
9932 }
9933
9934 if (!ary) str = rb_str_new_frozen(str);
9935
9936 bool cached_reg_grapheme_cluster = true;
9937 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9938 if (!reg_grapheme_cluster) {
9939 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9940 cached_reg_grapheme_cluster = false;
9941 }
9942
9943 ptr0 = ptr = RSTRING_PTR(str);
9944 end = RSTRING_END(str);
9945
9946 while (ptr < end) {
9947 OnigPosition len = onig_match(reg_grapheme_cluster,
9948 (const OnigUChar *)ptr, (const OnigUChar *)end,
9949 (const OnigUChar *)ptr, NULL, 0);
9950 if (len <= 0) break;
9951 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9952 ptr += len;
9953 }
9954
9955 if (!cached_reg_grapheme_cluster) {
9956 onig_free(reg_grapheme_cluster);
9957 }
9958
9959 RB_GC_GUARD(str);
9960 if (ary)
9961 return ary;
9962 else
9963 return orig;
9964}
9965
9966/*
9967 * call-seq:
9968 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
9969 * each_grapheme_cluster -> enumerator
9970 *
9971 * :include: doc/string/each_grapheme_cluster.rdoc
9972 *
9973 */
9974
9975static VALUE
9976rb_str_each_grapheme_cluster(VALUE str)
9977{
9978 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9979 return rb_str_enumerate_grapheme_clusters(str, 0);
9980}
9981
9982/*
9983 * call-seq:
9984 * grapheme_clusters -> array_of_grapheme_clusters
9985 *
9986 * :include: doc/string/grapheme_clusters.rdoc
9987 *
9988 */
9989
9990static VALUE
9991rb_str_grapheme_clusters(VALUE str)
9992{
9993 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9994 return rb_str_enumerate_grapheme_clusters(str, ary);
9995}
9996
9997static long
9998chopped_length(VALUE str)
9999{
10000 rb_encoding *enc = STR_ENC_GET(str);
10001 const char *p, *p2, *beg, *end;
10002
10003 beg = RSTRING_PTR(str);
10004 end = beg + RSTRING_LEN(str);
10005 if (beg >= end) return 0;
10006 p = rb_enc_prev_char(beg, end, end, enc);
10007 if (!p) return 0;
10008 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10009 p2 = rb_enc_prev_char(beg, p, end, enc);
10010 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10011 }
10012 return p - beg;
10013}
10014
10015/*
10016 * call-seq:
10017 * chop! -> self or nil
10018 *
10019 * Like String#chop, except that:
10020 *
10021 * - Removes trailing characters from +self+ (not from a copy of +self+).
10022 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10023 *
10024 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10025 */
10026
10027static VALUE
10028rb_str_chop_bang(VALUE str)
10029{
10030 str_modify_keep_cr(str);
10031 if (RSTRING_LEN(str) > 0) {
10032 long len;
10033 len = chopped_length(str);
10034 STR_SET_LEN(str, len);
10035 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10036 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10038 }
10039 return str;
10040 }
10041 return Qnil;
10042}
10043
10044
10045/*
10046 * call-seq:
10047 * chop -> new_string
10048 *
10049 * :include: doc/string/chop.rdoc
10050 *
10051 */
10052
10053static VALUE
10054rb_str_chop(VALUE str)
10055{
10056 return rb_str_subseq(str, 0, chopped_length(str));
10057}
10058
10059static long
10060smart_chomp(VALUE str, const char *e, const char *p)
10061{
10062 rb_encoding *enc = rb_enc_get(str);
10063 if (rb_enc_mbminlen(enc) > 1) {
10064 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10065 if (rb_enc_is_newline(pp, e, enc)) {
10066 e = pp;
10067 }
10068 pp = e - rb_enc_mbminlen(enc);
10069 if (pp >= p) {
10070 pp = rb_enc_left_char_head(p, pp, e, enc);
10071 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10072 e = pp;
10073 }
10074 }
10075 }
10076 else {
10077 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10078 case '\n':
10079 if (--e > p && *(e-1) == '\r') {
10080 --e;
10081 }
10082 break;
10083 case '\r':
10084 --e;
10085 break;
10086 }
10087 }
10088 return e - p;
10089}
10090
10091static long
10092chompped_length(VALUE str, VALUE rs)
10093{
10094 rb_encoding *enc;
10095 int newline;
10096 char *pp, *e, *rsptr;
10097 long rslen;
10098 char *const p = RSTRING_PTR(str);
10099 long len = RSTRING_LEN(str);
10100
10101 if (len == 0) return 0;
10102 e = p + len;
10103 if (rs == rb_default_rs) {
10104 return smart_chomp(str, e, p);
10105 }
10106
10107 enc = rb_enc_get(str);
10108 RSTRING_GETMEM(rs, rsptr, rslen);
10109 if (rslen == 0) {
10110 if (rb_enc_mbminlen(enc) > 1) {
10111 while (e > p) {
10112 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10113 if (!rb_enc_is_newline(pp, e, enc)) break;
10114 e = pp;
10115 pp -= rb_enc_mbminlen(enc);
10116 if (pp >= p) {
10117 pp = rb_enc_left_char_head(p, pp, e, enc);
10118 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10119 e = pp;
10120 }
10121 }
10122 }
10123 }
10124 else {
10125 while (e > p && *(e-1) == '\n') {
10126 --e;
10127 if (e > p && *(e-1) == '\r')
10128 --e;
10129 }
10130 }
10131 return e - p;
10132 }
10133 if (rslen > len) return len;
10134
10135 enc = rb_enc_get(rs);
10136 newline = rsptr[rslen-1];
10137 if (rslen == rb_enc_mbminlen(enc)) {
10138 if (rslen == 1) {
10139 if (newline == '\n')
10140 return smart_chomp(str, e, p);
10141 }
10142 else {
10143 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10144 return smart_chomp(str, e, p);
10145 }
10146 }
10147
10148 enc = rb_enc_check(str, rs);
10149 if (is_broken_string(rs)) {
10150 return len;
10151 }
10152 pp = e - rslen;
10153 if (p[len-1] == newline &&
10154 (rslen <= 1 ||
10155 memcmp(rsptr, pp, rslen) == 0)) {
10156 if (at_char_boundary(p, pp, e, enc))
10157 return len - rslen;
10158 RB_GC_GUARD(rs);
10159 }
10160 return len;
10161}
10162
10168static VALUE
10169chomp_rs(int argc, const VALUE *argv)
10170{
10171 rb_check_arity(argc, 0, 1);
10172 if (argc > 0) {
10173 VALUE rs = argv[0];
10174 if (!NIL_P(rs)) StringValue(rs);
10175 return rs;
10176 }
10177 else {
10178 return rb_rs;
10179 }
10180}
10181
10182VALUE
10183rb_str_chomp_string(VALUE str, VALUE rs)
10184{
10185 long olen = RSTRING_LEN(str);
10186 long len = chompped_length(str, rs);
10187 if (len >= olen) return Qnil;
10188 str_modify_keep_cr(str);
10189 STR_SET_LEN(str, len);
10190 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10191 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10193 }
10194 return str;
10195}
10196
10197/*
10198 * call-seq:
10199 * chomp!(line_sep = $/) -> self or nil
10200 *
10201 * Like String#chomp, except that:
10202 *
10203 * - Removes trailing characters from +self+ (not from a copy of +self+).
10204 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10205 *
10206 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10207 */
10208
10209static VALUE
10210rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10211{
10212 VALUE rs;
10213 str_modifiable(str);
10214 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10215 rs = chomp_rs(argc, argv);
10216 if (NIL_P(rs)) return Qnil;
10217 return rb_str_chomp_string(str, rs);
10218}
10219
10220
10221/*
10222 * call-seq:
10223 * chomp(line_sep = $/) -> new_string
10224 *
10225 * :include: doc/string/chomp.rdoc
10226 *
10227 */
10228
10229static VALUE
10230rb_str_chomp(int argc, VALUE *argv, VALUE str)
10231{
10232 VALUE rs = chomp_rs(argc, argv);
10233 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10234 return rb_str_subseq(str, 0, chompped_length(str, rs));
10235}
10236
10237static void
10238tr_setup_table_multi(char table[TR_TABLE_SIZE], VALUE *tablep, VALUE *ctablep,
10239 VALUE str, int num_selectors, VALUE *selectors)
10240{
10241 int i;
10242
10243 for (i=0; i<num_selectors; i++) {
10244 VALUE selector = selectors[i];
10245 rb_encoding *enc;
10246
10247 StringValue(selector);
10248 enc = rb_enc_check(str, selector);
10249 tr_setup_table(selector, table, i==0, tablep, ctablep, enc);
10250 }
10251}
10252
10253static long
10254lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10255{
10256 const char *const start = s;
10257
10258 if (!s || s >= e) return 0;
10259
10260 /* remove spaces at head */
10261 if (single_byte_optimizable(str)) {
10262 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10263 }
10264 else {
10265 while (s < e) {
10266 int n;
10267 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10268
10269 if (cc && !rb_isspace(cc)) break;
10270 s += n;
10271 }
10272 }
10273 return s - start;
10274}
10275
10276static long
10277lstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10278 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10279{
10280 const char *const start = s;
10281
10282 if (!s || s >= e) return 0;
10283
10284 /* remove leading characters in the table */
10285 while (s < e) {
10286 int n;
10287 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10288
10289 if (!tr_find(cc, table, del, nodel)) break;
10290 s += n;
10291 }
10292 return s - start;
10293}
10294
10295/*
10296 * call-seq:
10297 * lstrip!(*selectors) -> self or nil
10298 *
10299 * Like String#lstrip, except that:
10300 *
10301 * - Performs stripping in +self+ (not in a copy of +self+).
10302 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10303 *
10304 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10305 */
10306
10307static VALUE
10308rb_str_lstrip_bang(int argc, VALUE *argv, VALUE str)
10309{
10310 rb_encoding *enc;
10311 char *start, *s;
10312 long olen, loffset;
10313
10314 str_modify_keep_cr(str);
10315 enc = STR_ENC_GET(str);
10316 RSTRING_GETMEM(str, start, olen);
10317 if (argc > 0) {
10318 char table[TR_TABLE_SIZE];
10319 VALUE del = 0, nodel = 0;
10320
10321 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10322 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10323 }
10324 else {
10325 loffset = lstrip_offset(str, start, start+olen, enc);
10326 }
10327
10328 if (loffset > 0) {
10329 long len = olen-loffset;
10330 s = start + loffset;
10331 memmove(start, s, len);
10332 STR_SET_LEN(str, len);
10333 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10334 return str;
10335 }
10336 return Qnil;
10337}
10338
10339
10340/*
10341 * call-seq:
10342 * lstrip(*selectors) -> new_string
10343 *
10344 * Returns a copy of +self+ with leading whitespace removed;
10345 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10346 *
10347 * whitespace = "\x00\t\n\v\f\r "
10348 * s = whitespace + 'abc' + whitespace
10349 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10350 * s.lstrip
10351 * # => "abc\u0000\t\n\v\f\r "
10352 *
10353 * If +selectors+ are given, removes characters of +selectors+ from the beginning of +self+:
10354 *
10355 * s = "---abc+++"
10356 * s.lstrip("-") # => "abc+++"
10357 *
10358 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10359 * and may use any of its valid forms, including negation, ranges, and escapes:
10360 *
10361 * "01234abc56789".lstrip("0-9") # "abc56789"
10362 * "01234abc56789".lstrip("0-9", "^4-6") # "4abc56789"
10363 *
10364 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10365 */
10366
10367static VALUE
10368rb_str_lstrip(int argc, VALUE *argv, VALUE str)
10369{
10370 char *start;
10371 long len, loffset;
10372
10373 RSTRING_GETMEM(str, start, len);
10374 if (argc > 0) {
10375 char table[TR_TABLE_SIZE];
10376 VALUE del = 0, nodel = 0;
10377
10378 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10379 loffset = lstrip_offset_table(str, start, start+len, STR_ENC_GET(str), table, del, nodel);
10380 }
10381 else {
10382 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10383 }
10384 if (loffset <= 0) return str_duplicate(rb_cString, str);
10385 return rb_str_subseq(str, loffset, len - loffset);
10386}
10387
10388static long
10389rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10390{
10391 const char *t;
10392
10393 rb_str_check_dummy_enc(enc);
10394 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
10395 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10396 }
10397 if (!s || s >= e) return 0;
10398 t = e;
10399
10400 /* remove trailing spaces or '\0's */
10401 if (single_byte_optimizable(str)) {
10402 unsigned char c;
10403 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10404 }
10405 else {
10406 char *tp;
10407
10408 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10409 unsigned int c = rb_enc_codepoint(tp, e, enc);
10410 if (c && !rb_isspace(c)) break;
10411 t = tp;
10412 }
10413 }
10414 return e - t;
10415}
10416
10417static long
10418rstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10419 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10420{
10421 const char *t;
10422 char *tp;
10423
10424 rb_str_check_dummy_enc(enc);
10425 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
10426 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10427 }
10428 if (!s || s >= e) return 0;
10429 t = e;
10430
10431 /* remove trailing characters in the table */
10432 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10433 unsigned int c = rb_enc_codepoint(tp, e, enc);
10434 if (!tr_find(c, table, del, nodel)) break;
10435 t = tp;
10436 }
10437
10438 return e - t;
10439}
10440
10441/*
10442 * call-seq:
10443 * rstrip!(*selectors) -> self or nil
10444 *
10445 * Like String#rstrip, except that:
10446 *
10447 * - Performs stripping in +self+ (not in a copy of +self+).
10448 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10449 *
10450 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10451 */
10452
10453static VALUE
10454rb_str_rstrip_bang(int argc, VALUE *argv, VALUE str)
10455{
10456 rb_encoding *enc;
10457 char *start;
10458 long olen, roffset;
10459
10460 str_modify_keep_cr(str);
10461 enc = STR_ENC_GET(str);
10462 RSTRING_GETMEM(str, start, olen);
10463 if (argc > 0) {
10464 char table[TR_TABLE_SIZE];
10465 VALUE del = 0, nodel = 0;
10466
10467 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10468 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10469 }
10470 else {
10471 roffset = rstrip_offset(str, start, start+olen, enc);
10472 }
10473 if (roffset > 0) {
10474 long len = olen - roffset;
10475
10476 STR_SET_LEN(str, len);
10477 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10478 return str;
10479 }
10480 return Qnil;
10481}
10482
10483
10484/*
10485 * call-seq:
10486 * rstrip(*selectors) -> new_string
10487 *
10488 * Returns a copy of +self+ with trailing whitespace removed;
10489 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10490 *
10491 * whitespace = "\x00\t\n\v\f\r "
10492 * s = whitespace + 'abc' + whitespace
10493 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10494 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10495 *
10496 * If +selectors+ are given, removes characters of +selectors+ from the end of +self+:
10497 *
10498 * s = "---abc+++"
10499 * s.rstrip("+") # => "---abc"
10500 *
10501 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10502 * and may use any of its valid forms, including negation, ranges, and escapes:
10503 *
10504 * "01234abc56789".rstrip("0-9") # "01234abc"
10505 * "01234abc56789".rstrip("0-9", "^4-6") # "01234abc56"
10506 *
10507 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10508 */
10509
10510static VALUE
10511rb_str_rstrip(int argc, VALUE *argv, VALUE str)
10512{
10513 rb_encoding *enc;
10514 char *start;
10515 long olen, roffset;
10516
10517 enc = STR_ENC_GET(str);
10518 RSTRING_GETMEM(str, start, olen);
10519 if (argc > 0) {
10520 char table[TR_TABLE_SIZE];
10521 VALUE del = 0, nodel = 0;
10522
10523 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10524 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10525 }
10526 else {
10527 roffset = rstrip_offset(str, start, start+olen, enc);
10528 }
10529 if (roffset <= 0) return str_duplicate(rb_cString, str);
10530 return rb_str_subseq(str, 0, olen-roffset);
10531}
10532
10533
10534/*
10535 * call-seq:
10536 * strip!(*selectors) -> self or nil
10537 *
10538 * Like String#strip, except that:
10539 *
10540 * - Any modifications are made to +self+.
10541 * - Returns +self+ if any modifications are made, +nil+ otherwise.
10542 *
10543 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10544 */
10545
10546static VALUE
10547rb_str_strip_bang(int argc, VALUE *argv, VALUE str)
10548{
10549 char *start;
10550 long olen, loffset, roffset;
10551 rb_encoding *enc;
10552
10553 str_modify_keep_cr(str);
10554 enc = STR_ENC_GET(str);
10555 RSTRING_GETMEM(str, start, olen);
10556
10557 if (argc > 0) {
10558 char table[TR_TABLE_SIZE];
10559 VALUE del = 0, nodel = 0;
10560
10561 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10562 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10563 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10564 }
10565 else {
10566 loffset = lstrip_offset(str, start, start+olen, enc);
10567 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10568 }
10569
10570 if (loffset > 0 || roffset > 0) {
10571 long len = olen-roffset;
10572 if (loffset > 0) {
10573 len -= loffset;
10574 memmove(start, start + loffset, len);
10575 }
10576 STR_SET_LEN(str, len);
10577 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10578 return str;
10579 }
10580 return Qnil;
10581}
10582
10583
10584/*
10585 * call-seq:
10586 * strip(*selectors) -> new_string
10587 *
10588 * Returns a copy of +self+ with leading and trailing whitespace removed;
10589 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10590 *
10591 * whitespace = "\x00\t\n\v\f\r "
10592 * s = whitespace + 'abc' + whitespace
10593 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10594 * s.strip # => "abc"
10595 *
10596 * If +selectors+ are given, removes characters of +selectors+ from both ends of +self+:
10597 *
10598 * s = "---abc+++"
10599 * s.strip("-+") # => "abc"
10600 * s.strip("+-") # => "abc"
10601 *
10602 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10603 * and may use any of its valid forms, including negation, ranges, and escapes:
10604 *
10605 * "01234abc56789".strip("0-9") # "abc"
10606 * "01234abc56789".strip("0-9", "^4-6") # "4abc56"
10607 *
10608 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10609 */
10610
10611static VALUE
10612rb_str_strip(int argc, VALUE *argv, VALUE str)
10613{
10614 char *start;
10615 long olen, loffset, roffset;
10616 rb_encoding *enc = STR_ENC_GET(str);
10617
10618 RSTRING_GETMEM(str, start, olen);
10619
10620 if (argc > 0) {
10621 char table[TR_TABLE_SIZE];
10622 VALUE del = 0, nodel = 0;
10623
10624 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10625 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10626 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10627 }
10628 else {
10629 loffset = lstrip_offset(str, start, start+olen, enc);
10630 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10631 }
10632
10633 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10634 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10635}
10636
10637static VALUE
10638scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10639{
10640 VALUE result = Qnil;
10641 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10642 if (pos >= 0) {
10643 VALUE match = Qnil;
10644 if (BUILTIN_TYPE(pat) == T_STRING) {
10645 end = pos + RSTRING_LEN(pat);
10646 }
10647 else {
10648 match = rb_backref_get();
10649 pos = RMATCH_BEG(match, 0);
10650 end = RMATCH_END(match, 0);
10651 }
10652
10653 if (pos == end) {
10654 rb_encoding *enc = STR_ENC_GET(str);
10655 /*
10656 * Always consume at least one character of the input string
10657 */
10658 if (RSTRING_LEN(str) > end)
10659 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10660 RSTRING_END(str), enc);
10661 else
10662 *start = end + 1;
10663 }
10664 else {
10665 *start = end;
10666 }
10667
10668 if (NIL_P(match) || RMATCH_NREGS(match) == 1) {
10669 result = rb_str_subseq(str, pos, end - pos);
10670 return result;
10671 }
10672 else {
10673 int num_regs = RMATCH_NREGS(match);
10674 result = rb_ary_new2(num_regs);
10675 for (int i = 1; i < num_regs; i++) {
10676 VALUE s = Qnil;
10677 if (RMATCH_BEG(match, i) >= 0) {
10678 s = rb_str_subseq(str, RMATCH_BEG(match, i), RMATCH_END(match, i) - RMATCH_BEG(match, i));
10679 }
10680
10681 rb_ary_push(result, s);
10682 }
10683 }
10684
10685 RB_GC_GUARD(match);
10686 }
10687
10688 return result;
10689}
10690
10691
10692/*
10693 * call-seq:
10694 * scan(pattern) -> array_of_results
10695 * scan(pattern) {|result| ... } -> self
10696 *
10697 * :include: doc/string/scan.rdoc
10698 *
10699 */
10700
10701static VALUE
10702rb_str_scan(VALUE str, VALUE pat)
10703{
10704 VALUE result;
10705 long start = 0;
10706 long last = -1, prev = 0;
10707 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10708
10709 pat = get_pat_quoted(pat, 1);
10710 mustnot_broken(str);
10711 if (!rb_block_given_p()) {
10712 VALUE ary = rb_ary_new();
10713
10714 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10715 last = prev;
10716 prev = start;
10717 rb_ary_push(ary, result);
10718 }
10719 if (last >= 0) rb_pat_search(pat, str, last, 1);
10720 else rb_backref_set(Qnil);
10721 return ary;
10722 }
10723
10724 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10725 last = prev;
10726 prev = start;
10727 rb_yield(result);
10728 str_mod_check(str, p, len);
10729 }
10730 if (last >= 0) rb_pat_search(pat, str, last, 1);
10731 return str;
10732}
10733
10734
10735/*
10736 * call-seq:
10737 * hex -> integer
10738 *
10739 * Interprets the leading substring of +self+ as hexadecimal, possibly signed;
10740 * returns its value as an integer.
10741 *
10742 * The leading substring is interpreted as hexadecimal when it begins with:
10743 *
10744 * - One or more characters representing hexadecimal digits
10745 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10746 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit:
10747 *
10748 * 'f'.hex # => 15
10749 * '11'.hex # => 17
10750 * 'FFF'.hex # => 4095
10751 * 'fffg'.hex # => 4095
10752 * 'foo'.hex # => 15 # 'f' hexadecimal, 'oo' not.
10753 * 'bar'.hex # => 186 # 'ba' hexadecimal, 'r' not.
10754 * 'deadbeef'.hex # => 3735928559
10755 *
10756 * - <tt>'0x'</tt> or <tt>'0X'</tt>, followed by one or more hexadecimal digits:
10757 *
10758 * '0xfff'.hex # => 4095
10759 * '0xfffg'.hex # => 4095
10760 *
10761 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10762 *
10763 * '-fff'.hex # => -4095
10764 * '-0xFFF'.hex # => -4095
10765 *
10766 * For any substring not described above, returns zero:
10767 *
10768 * 'xxx'.hex # => 0
10769 * ''.hex # => 0
10770 *
10771 * Note that, unlike #oct, this method interprets only hexadecimal,
10772 * and not binary, octal, or decimal notations:
10773 *
10774 * '0b111'.hex # => 45329
10775 * '0o777'.hex # => 0
10776 * '0d999'.hex # => 55705
10777 *
10778 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
10779 */
10780
10781static VALUE
10782rb_str_hex(VALUE str)
10783{
10784 return rb_str_to_inum(str, 16, FALSE);
10785}
10786
10787
10788/*
10789 * call-seq:
10790 * oct -> integer
10791 *
10792 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
10793 * returns their value as an integer.
10794 *
10795 * In brief:
10796 *
10797 * # Interpreted as octal.
10798 * '777'.oct # => 511
10799 * '777x'.oct # => 511
10800 * '0777'.oct # => 511
10801 * '0o777'.oct # => 511
10802 * '-777'.oct # => -511
10803 * # Not interpreted as octal.
10804 * '0b111'.oct # => 7 # Interpreted as binary.
10805 * '0d999'.oct # => 999 # Interpreted as decimal.
10806 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10807 *
10808 * The leading substring is interpreted as octal when it begins with:
10809 *
10810 * - One or more characters representing octal digits
10811 * (each in the range <tt>'0'..'7'</tt>);
10812 * the string to be interpreted ends at the first character that does not represent an octal digit:
10813 *
10814 * '7'.oct # => 7
10815 * '11'.oct # => 9
10816 * '777'.oct # => 511
10817 * '0777'.oct # => 511
10818 * '7778'.oct # => 511
10819 * '777x'.oct # => 511
10820 *
10821 * - <tt>'0o'</tt>, followed by one or more octal digits:
10822 *
10823 * '0o777'.oct # => 511
10824 * '0o7778'.oct # => 511
10825 *
10826 * The leading substring is _not_ interpreted as octal when it begins with:
10827 *
10828 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10829 * (each in the range <tt>'0'..'1'</tt>);
10830 * the string to be interpreted ends at the first character that does not represent a binary digit;
10831 * the string is interpreted as binary digits (base 2):
10832 *
10833 * '0b111'.oct # => 7
10834 * '0b1112'.oct # => 7
10835 *
10836 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10837 * (each in the range <tt>'0'..'9'</tt>);
10838 * the string to be interpreted ends at the first character that does not represent a decimal digit;
10839 * the string is interpreted as decimal digits (base 10):
10840 *
10841 * '0d999'.oct # => 999
10842 * '0d999x'.oct # => 999
10843 *
10844 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10845 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10846 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit;
10847 * the string is interpreted as hexadecimal digits (base 16):
10848 *
10849 * '0xfff'.oct # => 4095
10850 * '0xfffg'.oct # => 4095
10851 *
10852 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10853 *
10854 * '-777'.oct # => -511
10855 * '-0777'.oct # => -511
10856 * '-0b111'.oct # => -7
10857 * '-0xfff'.oct # => -4095
10858 *
10859 * For any substring not described above, returns zero:
10860 *
10861 * 'foo'.oct # => 0
10862 * ''.oct # => 0
10863 *
10864 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
10865 */
10866
10867static VALUE
10868rb_str_oct(VALUE str)
10869{
10870 return rb_str_to_inum(str, -8, FALSE);
10871}
10872
10873#ifndef HAVE_CRYPT_R
10874# include "ruby/thread_native.h"
10875# include "ruby/atomic.h"
10876
10877static struct {
10878 rb_nativethread_lock_t lock;
10879} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10880#endif
10881
10882/*
10883 * call-seq:
10884 * crypt(salt_str) -> new_string
10885 *
10886 * Returns the string generated by calling <code>crypt(3)</code>
10887 * standard library function with <code>str</code> and
10888 * <code>salt_str</code>, in this order, as its arguments. Please do
10889 * not use this method any longer. It is legacy; provided only for
10890 * backward compatibility with ruby scripts in earlier days. It is
10891 * bad to use in contemporary programs for several reasons:
10892 *
10893 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10894 * run. The generated string lacks data portability.
10895 *
10896 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10897 * (i.e. silently ends up in unexpected results).
10898 *
10899 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10900 * thread safe.
10901 *
10902 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10903 * very very weak. According to its manpage, Linux's traditional
10904 * <code>crypt(3)</code> output has only 2**56 variations; too
10905 * easy to brute force today. And this is the default behaviour.
10906 *
10907 * * In order to make things robust some OSes implement so-called
10908 * "modular" usage. To go through, you have to do a complex
10909 * build-up of the <code>salt_str</code> parameter, by hand.
10910 * Failure in generation of a proper salt string tends not to
10911 * yield any errors; typos in parameters are normally not
10912 * detectable.
10913 *
10914 * * For instance, in the following example, the second invocation
10915 * of String#crypt is wrong; it has a typo in "round=" (lacks
10916 * "s"). However the call does not fail and something unexpected
10917 * is generated.
10918 *
10919 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10920 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10921 *
10922 * * Even in the "modular" mode, some hash functions are considered
10923 * archaic and no longer recommended at all; for instance module
10924 * <code>$1$</code> is officially abandoned by its author: see
10925 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10926 * instance module <code>$3$</code> is considered completely
10927 * broken: see the manpage of FreeBSD.
10928 *
10929 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10930 * written above, <code>crypt(3)</code> on Mac OS never fails.
10931 * This means even if you build up a proper salt string it
10932 * generates a traditional DES hash anyways, and there is no way
10933 * for you to be aware of.
10934 *
10935 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10936 *
10937 * If for some reason you cannot migrate to other secure contemporary
10938 * password hashing algorithms, install the string-crypt gem and
10939 * <code>require 'string/crypt'</code> to continue using it.
10940 */
10941
10942static VALUE
10943rb_str_crypt(VALUE str, VALUE salt)
10944{
10945#ifdef HAVE_CRYPT_R
10946 VALUE databuf;
10947 struct crypt_data *data;
10948# define CRYPT_END() ALLOCV_END(databuf)
10949#else
10950 char *tmp_buf;
10951 extern char *crypt(const char *, const char *);
10952# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10953#endif
10954 VALUE result;
10955 const char *s, *saltp;
10956 char *res;
10957#ifdef BROKEN_CRYPT
10958 char salt_8bit_clean[3];
10959#endif
10960
10961 StringValue(salt);
10962 mustnot_wchar(str);
10963 mustnot_wchar(salt);
10964 s = StringValueCStr(str);
10965 saltp = RSTRING_PTR(salt);
10966 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10967 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10968 }
10969
10970#ifdef BROKEN_CRYPT
10971 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10972 salt_8bit_clean[0] = saltp[0] & 0x7f;
10973 salt_8bit_clean[1] = saltp[1] & 0x7f;
10974 salt_8bit_clean[2] = '\0';
10975 saltp = salt_8bit_clean;
10976 }
10977#endif
10978#ifdef HAVE_CRYPT_R
10979 data = ALLOCV(databuf, sizeof(struct crypt_data));
10980# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10981 data->initialized = 0;
10982# endif
10983 res = crypt_r(s, saltp, data);
10984#else
10985 rb_nativethread_lock_lock(&crypt_mutex.lock);
10986 res = crypt(s, saltp);
10987#endif
10988 if (!res) {
10989 int err = errno;
10990 CRYPT_END();
10991 rb_syserr_fail(err, "crypt");
10992 }
10993#ifdef HAVE_CRYPT_R
10994 result = rb_str_new_cstr(res);
10995 CRYPT_END();
10996#else
10997 // We need to copy this buffer because it's static and we need to unlock the mutex
10998 // before allocating a new object (the string to be returned). If we allocate while
10999 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
11000 // if other ractors are waiting on this lock.
11001 size_t res_size = strlen(res)+1;
11002 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
11003 memcpy(tmp_buf, res, res_size);
11004 res = tmp_buf;
11005 CRYPT_END();
11006 result = rb_str_new_cstr(res);
11007#endif
11008 return result;
11009}
11010
11011
11012/*
11013 * call-seq:
11014 * ord -> integer
11015 *
11016 * :include: doc/string/ord.rdoc
11017 *
11018 */
11019
11020static VALUE
11021rb_str_ord(VALUE s)
11022{
11023 unsigned int c;
11024
11025 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
11026 return UINT2NUM(c);
11027}
11028/*
11029 * call-seq:
11030 * sum(n = 16) -> integer
11031 *
11032 * :include: doc/string/sum.rdoc
11033 *
11034 */
11035
11036static VALUE
11037rb_str_sum(int argc, VALUE *argv, VALUE str)
11038{
11039 int bits = 16;
11040 char *ptr, *p, *pend;
11041 long len;
11042 VALUE sum = INT2FIX(0);
11043 unsigned long sum0 = 0;
11044
11045 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
11046 bits = 0;
11047 }
11048 ptr = p = RSTRING_PTR(str);
11049 len = RSTRING_LEN(str);
11050 pend = p + len;
11051
11052 while (p < pend) {
11053 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
11054 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11055 str_mod_check(str, ptr, len);
11056 sum0 = 0;
11057 }
11058 sum0 += (unsigned char)*p;
11059 p++;
11060 }
11061
11062 if (bits == 0) {
11063 if (sum0) {
11064 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11065 }
11066 }
11067 else {
11068 if (sum == INT2FIX(0)) {
11069 if (bits < (int)sizeof(long)*CHAR_BIT) {
11070 sum0 &= (((unsigned long)1)<<bits)-1;
11071 }
11072 sum = LONG2FIX(sum0);
11073 }
11074 else {
11075 VALUE mod;
11076
11077 if (sum0) {
11078 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11079 }
11080
11081 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
11082 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
11083 sum = rb_funcall(sum, '&', 1, mod);
11084 }
11085 }
11086 return sum;
11087}
11088
11089static VALUE
11090rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
11091{
11092 rb_encoding *enc;
11093 VALUE w;
11094 long width, len, flen = 1, fclen = 1;
11095 VALUE res;
11096 char *p;
11097 const char *f = " ";
11098 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11099 VALUE pad;
11100 int singlebyte = 1, cr;
11101 int termlen;
11102
11103 rb_scan_args(argc, argv, "11", &w, &pad);
11104 enc = STR_ENC_GET(str);
11105 termlen = rb_enc_mbminlen(enc);
11106 width = NUM2LONG(w);
11107 if (argc == 2) {
11108 StringValue(pad);
11109 enc = rb_enc_check(str, pad);
11110 f = RSTRING_PTR(pad);
11111 flen = RSTRING_LEN(pad);
11112 fclen = str_strlen(pad, enc); /* rb_enc_check */
11113 singlebyte = single_byte_optimizable(pad);
11114 if (flen == 0 || fclen == 0) {
11115 rb_raise(rb_eArgError, "zero width padding");
11116 }
11117 }
11118 len = str_strlen(str, enc); /* rb_enc_check */
11119 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11120 n = width - len;
11121 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11122 rlen = n - llen;
11123 cr = ENC_CODERANGE(str);
11124 if (flen > 1) {
11125 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11126 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11127 }
11128 size = RSTRING_LEN(str);
11129 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11130 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11131 (len += llen2 + rlen2) >= LONG_MAX - size) {
11132 rb_raise(rb_eArgError, "argument too big");
11133 }
11134 len += size;
11135 res = str_enc_new(rb_cString, 0, len, enc);
11136 p = RSTRING_PTR(res);
11137 if (flen <= 1) {
11138 memset(p, *f, llen);
11139 p += llen;
11140 }
11141 else {
11142 while (llen >= fclen) {
11143 memcpy(p,f,flen);
11144 p += flen;
11145 llen -= fclen;
11146 }
11147 if (llen > 0) {
11148 memcpy(p, f, llen2);
11149 p += llen2;
11150 }
11151 }
11152 memcpy(p, RSTRING_PTR(str), size);
11153 p += size;
11154 if (flen <= 1) {
11155 memset(p, *f, rlen);
11156 p += rlen;
11157 }
11158 else {
11159 while (rlen >= fclen) {
11160 memcpy(p,f,flen);
11161 p += flen;
11162 rlen -= fclen;
11163 }
11164 if (rlen > 0) {
11165 memcpy(p, f, rlen2);
11166 p += rlen2;
11167 }
11168 }
11169 TERM_FILL(p, termlen);
11170 STR_SET_LEN(res, p-RSTRING_PTR(res));
11171
11172 if (argc == 2)
11173 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11174 if (cr != ENC_CODERANGE_BROKEN)
11175 ENC_CODERANGE_SET(res, cr);
11176
11177 RB_GC_GUARD(pad);
11178 return res;
11179}
11180
11181
11182/*
11183 * call-seq:
11184 * ljust(width, pad_string = ' ') -> new_string
11185 *
11186 * :include: doc/string/ljust.rdoc
11187 *
11188 */
11189
11190static VALUE
11191rb_str_ljust(int argc, VALUE *argv, VALUE str)
11192{
11193 return rb_str_justify(argc, argv, str, 'l');
11194}
11195
11196/*
11197 * call-seq:
11198 * rjust(width, pad_string = ' ') -> new_string
11199 *
11200 * :include: doc/string/rjust.rdoc
11201 *
11202 */
11203
11204static VALUE
11205rb_str_rjust(int argc, VALUE *argv, VALUE str)
11206{
11207 return rb_str_justify(argc, argv, str, 'r');
11208}
11209
11210
11211/*
11212 * call-seq:
11213 * center(size, pad_string = ' ') -> new_string
11214 *
11215 * :include: doc/string/center.rdoc
11216 *
11217 */
11218
11219static VALUE
11220rb_str_center(int argc, VALUE *argv, VALUE str)
11221{
11222 return rb_str_justify(argc, argv, str, 'c');
11223}
11224
11225/*
11226 * call-seq:
11227 * partition(pattern) -> [pre_match, first_match, post_match]
11228 *
11229 * :include: doc/string/partition.rdoc
11230 *
11231 */
11232
11233static VALUE
11234rb_str_partition(VALUE str, VALUE sep)
11235{
11236 long pos;
11237
11238 sep = get_pat_quoted(sep, 0);
11239 if (RB_TYPE_P(sep, T_REGEXP)) {
11240 if (rb_reg_search(sep, str, 0, 0) < 0) {
11241 goto failed;
11242 }
11243 VALUE match = rb_backref_get();
11244
11245 pos = RMATCH_BEG(match, 0);
11246 sep = rb_str_subseq(str, pos, RMATCH_END(match, 0) - pos);
11247 }
11248 else {
11249 pos = rb_str_index(str, sep, 0);
11250 if (pos < 0) goto failed;
11251 }
11252 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11253 sep,
11254 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11255 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11256
11257 failed:
11258 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11259}
11260
11261/*
11262 * call-seq:
11263 * rpartition(pattern) -> [pre_match, last_match, post_match]
11264 *
11265 * :include: doc/string/rpartition.rdoc
11266 *
11267 */
11268
11269static VALUE
11270rb_str_rpartition(VALUE str, VALUE sep)
11271{
11272 long pos = RSTRING_LEN(str);
11273
11274 sep = get_pat_quoted(sep, 0);
11275 if (RB_TYPE_P(sep, T_REGEXP)) {
11276 if (rb_reg_search(sep, str, pos, 1) < 0) {
11277 goto failed;
11278 }
11279 VALUE match = rb_backref_get();
11280
11281 pos = RMATCH_BEG(match, 0);
11282 sep = rb_str_subseq(str, pos, RMATCH_END(match, 0) - pos);
11283 }
11284 else {
11285 pos = rb_str_sublen(str, pos);
11286 pos = rb_str_rindex(str, sep, pos);
11287 if (pos < 0) {
11288 goto failed;
11289 }
11290 }
11291
11292 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11293 sep,
11294 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11295 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11296 failed:
11297 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11298}
11299
11300/*
11301 * call-seq:
11302 * start_with?(*patterns) -> true or false
11303 *
11304 * :include: doc/string/start_with_p.rdoc
11305 *
11306 */
11307
11308static VALUE
11309rb_str_start_with(int argc, VALUE *argv, VALUE str)
11310{
11311 int i;
11312
11313 for (i=0; i<argc; i++) {
11314 VALUE tmp = argv[i];
11315 if (RB_TYPE_P(tmp, T_REGEXP)) {
11316 if (rb_reg_start_with_p(tmp, str))
11317 return Qtrue;
11318 }
11319 else {
11320 const char *p, *s, *e;
11321 long slen, tlen;
11322 rb_encoding *enc;
11323
11324 StringValue(tmp);
11325 enc = rb_enc_check(str, tmp);
11326 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11327 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11328 p = RSTRING_PTR(str);
11329 e = p + slen;
11330 s = p + tlen;
11331 if (!at_char_right_boundary(p, s, e, enc))
11332 continue;
11333 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11334 return Qtrue;
11335 }
11336 }
11337 return Qfalse;
11338}
11339
11340/*
11341 * call-seq:
11342 * end_with?(*strings) -> true or false
11343 *
11344 * :include: doc/string/end_with_p.rdoc
11345 *
11346 */
11347
11348static VALUE
11349rb_str_end_with(int argc, VALUE *argv, VALUE str)
11350{
11351 int i;
11352
11353 for (i=0; i<argc; i++) {
11354 VALUE tmp = argv[i];
11355 const char *p, *s, *e;
11356 long slen, tlen;
11357 rb_encoding *enc;
11358
11359 StringValue(tmp);
11360 enc = rb_enc_check(str, tmp);
11361 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11362 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11363 p = RSTRING_PTR(str);
11364 e = p + slen;
11365 s = e - tlen;
11366 if (!at_char_boundary(p, s, e, enc))
11367 continue;
11368 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11369 return Qtrue;
11370 }
11371 return Qfalse;
11372}
11373
11383static long
11384deleted_prefix_length(VALUE str, VALUE prefix)
11385{
11386 const char *strptr, *prefixptr;
11387 long olen, prefixlen;
11388 rb_encoding *enc = rb_enc_get(str);
11389
11390 StringValue(prefix);
11391
11392 if (!is_broken_string(prefix) ||
11393 !rb_enc_asciicompat(enc) ||
11394 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11395 enc = rb_enc_check(str, prefix);
11396 }
11397
11398 /* return 0 if not start with prefix */
11399 prefixlen = RSTRING_LEN(prefix);
11400 if (prefixlen <= 0) return 0;
11401 olen = RSTRING_LEN(str);
11402 if (olen < prefixlen) return 0;
11403 strptr = RSTRING_PTR(str);
11404 prefixptr = RSTRING_PTR(prefix);
11405 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11406 if (is_broken_string(prefix)) {
11407 if (!is_broken_string(str)) {
11408 /* prefix in a valid string cannot be broken */
11409 return 0;
11410 }
11411 const char *strend = strptr + olen;
11412 const char *after_prefix = strptr + prefixlen;
11413 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11414 /* prefix does not end at char-boundary */
11415 return 0;
11416 }
11417 }
11418 /* prefix part in `str` also should be valid. */
11419
11420 return prefixlen;
11421}
11422
11423/*
11424 * call-seq:
11425 * delete_prefix!(prefix) -> self or nil
11426 *
11427 * Like String#delete_prefix, except that +self+ is modified in place;
11428 * returns +self+ if the prefix is removed, +nil+ otherwise.
11429 *
11430 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11431 */
11432
11433static VALUE
11434rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11435{
11436 long prefixlen;
11437 str_modify_keep_cr(str);
11438
11439 prefixlen = deleted_prefix_length(str, prefix);
11440 if (prefixlen <= 0) return Qnil;
11441
11442 return rb_str_drop_bytes(str, prefixlen);
11443}
11444
11445/*
11446 * call-seq:
11447 * delete_prefix(prefix) -> new_string
11448 *
11449 * :include: doc/string/delete_prefix.rdoc
11450 *
11451 */
11452
11453static VALUE
11454rb_str_delete_prefix(VALUE str, VALUE prefix)
11455{
11456 long prefixlen;
11457
11458 prefixlen = deleted_prefix_length(str, prefix);
11459 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11460
11461 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11462}
11463
11473static long
11474deleted_suffix_length(VALUE str, VALUE suffix)
11475{
11476 const char *strptr, *suffixptr;
11477 long olen, suffixlen;
11478 rb_encoding *enc;
11479
11480 StringValue(suffix);
11481 if (is_broken_string(suffix)) return 0;
11482 enc = rb_enc_check(str, suffix);
11483
11484 /* return 0 if not start with suffix */
11485 suffixlen = RSTRING_LEN(suffix);
11486 if (suffixlen <= 0) return 0;
11487 olen = RSTRING_LEN(str);
11488 if (olen < suffixlen) return 0;
11489 strptr = RSTRING_PTR(str);
11490 suffixptr = RSTRING_PTR(suffix);
11491 const char *strend = strptr + olen;
11492 const char *before_suffix = strend - suffixlen;
11493 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11494 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11495
11496 return suffixlen;
11497}
11498
11499/*
11500 * call-seq:
11501 * delete_suffix!(suffix) -> self or nil
11502 *
11503 * Like String#delete_suffix, except that +self+ is modified in place;
11504 * returns +self+ if the suffix is removed, +nil+ otherwise.
11505 *
11506 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11507 */
11508
11509static VALUE
11510rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11511{
11512 long olen, suffixlen, len;
11513 str_modifiable(str);
11514
11515 suffixlen = deleted_suffix_length(str, suffix);
11516 if (suffixlen <= 0) return Qnil;
11517
11518 olen = RSTRING_LEN(str);
11519 str_modify_keep_cr(str);
11520 len = olen - suffixlen;
11521 STR_SET_LEN(str, len);
11522 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11523 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11525 }
11526 return str;
11527}
11528
11529/*
11530 * call-seq:
11531 * delete_suffix(suffix) -> new_string
11532 *
11533 * :include: doc/string/delete_suffix.rdoc
11534 *
11535 */
11536
11537static VALUE
11538rb_str_delete_suffix(VALUE str, VALUE suffix)
11539{
11540 long suffixlen;
11541
11542 suffixlen = deleted_suffix_length(str, suffix);
11543 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11544
11545 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11546}
11547
11548void
11549rb_str_setter(VALUE val, ID id, VALUE *var)
11550{
11551 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11552 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11553 }
11554 *var = val;
11555}
11556
11557static void
11558nil_setter_warning(ID id)
11559{
11560 rb_warn_deprecated("non-nil '%"PRIsVALUE"'", NULL, rb_id2str(id));
11561}
11562
11563void
11564rb_deprecated_str_setter(VALUE val, ID id, VALUE *var)
11565{
11566 rb_str_setter(val, id, var);
11567 if (!NIL_P(*var)) {
11568 nil_setter_warning(id);
11569 }
11570}
11571
11572static void
11573rb_fs_setter(VALUE val, ID id, VALUE *var)
11574{
11575 val = rb_fs_check(val);
11576 if (!val) {
11577 rb_raise(rb_eTypeError,
11578 "value of %"PRIsVALUE" must be String or Regexp",
11579 rb_id2str(id));
11580 }
11581 if (!NIL_P(val)) {
11582 nil_setter_warning(id);
11583 }
11584 *var = val;
11585}
11586
11587
11588/*
11589 * call-seq:
11590 * force_encoding(encoding) -> self
11591 *
11592 * :include: doc/string/force_encoding.rdoc
11593 *
11594 */
11595
11596static VALUE
11597rb_str_force_encoding(VALUE str, VALUE enc)
11598{
11599 str_modifiable(str);
11600
11601 rb_encoding *encoding = rb_to_encoding(enc);
11602 int idx = rb_enc_to_index(encoding);
11603
11604 // If the encoding is unchanged, we do nothing.
11605 if (ENCODING_GET(str) == idx) {
11606 return str;
11607 }
11608
11609 rb_enc_associate_index(str, idx);
11610
11611 // If the coderange was 7bit and the new encoding is ASCII-compatible
11612 // we can keep the coderange.
11613 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11614 return str;
11615 }
11616
11618 return str;
11619}
11620
11621/*
11622 * call-seq:
11623 * b -> new_string
11624 *
11625 * :include: doc/string/b.rdoc
11626 *
11627 */
11628
11629static VALUE
11630rb_str_b(VALUE str)
11631{
11632 VALUE str2;
11633 if (STR_EMBED_P(str)) {
11634 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11635 }
11636 else {
11637 str2 = str_alloc_heap(rb_cString);
11638 }
11639 str_replace_shared_without_enc(str2, str);
11640
11641 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11642 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11643 // If we know the receiver's code range then we know the result's code range.
11644 int cr = ENC_CODERANGE(str);
11645 switch (cr) {
11646 case ENC_CODERANGE_7BIT:
11648 break;
11652 break;
11653 default:
11654 ENC_CODERANGE_CLEAR(str2);
11655 break;
11656 }
11657 }
11658
11659 return str2;
11660}
11661
11662/*
11663 * call-seq:
11664 * valid_encoding? -> true or false
11665 *
11666 * :include: doc/string/valid_encoding_p.rdoc
11667 *
11668 */
11669
11670static VALUE
11671rb_str_valid_encoding_p(VALUE str)
11672{
11673 int cr = rb_enc_str_coderange(str);
11674
11675 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11676}
11677
11678/*
11679 * call-seq:
11680 * ascii_only? -> true or false
11681 *
11682 * Returns whether +self+ contains only ASCII characters:
11683 *
11684 * 'abc'.ascii_only? # => true
11685 * "abc\u{6666}".ascii_only? # => false
11686 *
11687 * Related: see {Querying}[rdoc-ref:String@Querying].
11688 */
11689
11690static VALUE
11691rb_str_is_ascii_only_p(VALUE str)
11692{
11693 int cr = rb_enc_str_coderange(str);
11694
11695 return RBOOL(cr == ENC_CODERANGE_7BIT);
11696}
11697
11698VALUE
11700{
11701 static const char ellipsis[] = "...";
11702 const long ellipsislen = sizeof(ellipsis) - 1;
11703 rb_encoding *const enc = rb_enc_get(str);
11704 const long blen = RSTRING_LEN(str);
11705 const char *const p = RSTRING_PTR(str), *e = p + blen;
11706 VALUE estr, ret = 0;
11707
11708 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11709 if (len * rb_enc_mbminlen(enc) >= blen ||
11710 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11711 ret = str;
11712 }
11713 else if (len <= ellipsislen ||
11714 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11715 if (rb_enc_asciicompat(enc)) {
11716 ret = rb_str_new(ellipsis, len);
11717 rb_enc_associate(ret, enc);
11718 }
11719 else {
11720 estr = rb_usascii_str_new(ellipsis, len);
11721 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11722 }
11723 }
11724 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11725 rb_str_cat(ret, ellipsis, ellipsislen);
11726 }
11727 else {
11728 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11729 rb_enc_from_encoding(enc), 0, Qnil);
11730 rb_str_append(ret, estr);
11731 }
11732 return ret;
11733}
11734
11735static VALUE
11736str_compat_and_valid(VALUE str, rb_encoding *enc)
11737{
11738 int cr;
11739 str = StringValue(str);
11740 cr = rb_enc_str_coderange(str);
11741 if (cr == ENC_CODERANGE_BROKEN) {
11742 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11743 }
11744 else {
11745 rb_encoding *e = STR_ENC_GET(str);
11746 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11747 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11748 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11749 }
11750 }
11751 return str;
11752}
11753
11754static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11755
11756VALUE
11758{
11759 rb_encoding *enc = STR_ENC_GET(str);
11760 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11761}
11762
11763VALUE
11764rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11765{
11766 int cr = ENC_CODERANGE_UNKNOWN;
11767 if (enc == STR_ENC_GET(str)) {
11768 /* cached coderange makes sense only when enc equals the
11769 * actual encoding of str */
11770 cr = ENC_CODERANGE(str);
11771 }
11772 return enc_str_scrub(enc, str, repl, cr);
11773}
11774
11775static VALUE
11776enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11777{
11778 int encidx;
11779 VALUE buf = Qnil;
11780 const char *rep, *p, *e, *p1, *sp;
11781 long replen = -1;
11782 long slen;
11783
11784 if (rb_block_given_p()) {
11785 if (!NIL_P(repl))
11786 rb_raise(rb_eArgError, "both of block and replacement given");
11787 replen = 0;
11788 }
11789
11790 if (ENC_CODERANGE_CLEAN_P(cr))
11791 return Qnil;
11792
11793 if (!NIL_P(repl)) {
11794 repl = str_compat_and_valid(repl, enc);
11795 }
11796
11797 if (rb_enc_dummy_p(enc)) {
11798 return Qnil;
11799 }
11800 encidx = rb_enc_to_index(enc);
11801
11802#define DEFAULT_REPLACE_CHAR(str) do { \
11803 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11804 rep = replace; replen = (int)sizeof(replace); \
11805 } while (0)
11806
11807 slen = RSTRING_LEN(str);
11808 p = RSTRING_PTR(str);
11809 e = RSTRING_END(str);
11810 p1 = p;
11811 sp = p;
11812
11813 if (rb_enc_asciicompat(enc)) {
11814 int rep7bit_p;
11815 if (!replen) {
11816 rep = NULL;
11817 rep7bit_p = FALSE;
11818 }
11819 else if (!NIL_P(repl)) {
11820 rep = RSTRING_PTR(repl);
11821 replen = RSTRING_LEN(repl);
11822 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11823 }
11824 else if (encidx == rb_utf8_encindex()) {
11825 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11826 rep7bit_p = FALSE;
11827 }
11828 else {
11829 DEFAULT_REPLACE_CHAR("?");
11830 rep7bit_p = TRUE;
11831 }
11832 cr = ENC_CODERANGE_7BIT;
11833
11834 p = search_nonascii(p, e);
11835 if (!p) {
11836 p = e;
11837 }
11838 while (p < e) {
11839 int ret = rb_enc_precise_mbclen(p, e, enc);
11840 if (MBCLEN_NEEDMORE_P(ret)) {
11841 break;
11842 }
11843 else if (MBCLEN_CHARFOUND_P(ret)) {
11845 p += MBCLEN_CHARFOUND_LEN(ret);
11846 }
11847 else if (MBCLEN_INVALID_P(ret)) {
11848 /*
11849 * p1~p: valid ascii/multibyte chars
11850 * p ~e: invalid bytes + unknown bytes
11851 */
11852 long clen = rb_enc_mbmaxlen(enc);
11853 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11854 if (p > p1) {
11855 rb_str_buf_cat(buf, p1, p - p1);
11856 }
11857
11858 if (e - p < clen) clen = e - p;
11859 if (clen <= 2) {
11860 clen = 1;
11861 }
11862 else {
11863 const char *q = p;
11864 clen--;
11865 for (; clen > 1; clen--) {
11866 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11867 if (MBCLEN_NEEDMORE_P(ret)) break;
11868 if (MBCLEN_INVALID_P(ret)) continue;
11870 }
11871 }
11872 if (rep) {
11873 rb_str_buf_cat(buf, rep, replen);
11874 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11875 }
11876 else {
11877 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11878 str_mod_check(str, sp, slen);
11879 repl = str_compat_and_valid(repl, enc);
11880 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11883 }
11884 p += clen;
11885 p1 = p;
11886 p = search_nonascii(p, e);
11887 if (!p) {
11888 p = e;
11889 break;
11890 }
11891 }
11892 else {
11894 }
11895 }
11896 if (NIL_P(buf)) {
11897 if (p == e) {
11898 ENC_CODERANGE_SET(str, cr);
11899 return Qnil;
11900 }
11901 buf = rb_str_buf_new(RSTRING_LEN(str));
11902 }
11903 if (p1 < p) {
11904 rb_str_buf_cat(buf, p1, p - p1);
11905 }
11906 if (p < e) {
11907 if (rep) {
11908 rb_str_buf_cat(buf, rep, replen);
11909 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11910 }
11911 else {
11912 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11913 str_mod_check(str, sp, slen);
11914 repl = str_compat_and_valid(repl, enc);
11915 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11918 }
11919 }
11920 }
11921 else {
11922 /* ASCII incompatible */
11923 long mbminlen = rb_enc_mbminlen(enc);
11924 if (!replen) {
11925 rep = NULL;
11926 }
11927 else if (!NIL_P(repl)) {
11928 rep = RSTRING_PTR(repl);
11929 replen = RSTRING_LEN(repl);
11930 }
11931 else if (encidx == ENCINDEX_UTF_16BE) {
11932 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11933 }
11934 else if (encidx == ENCINDEX_UTF_16LE) {
11935 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11936 }
11937 else if (encidx == ENCINDEX_UTF_32BE) {
11938 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11939 }
11940 else if (encidx == ENCINDEX_UTF_32LE) {
11941 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11942 }
11943 else {
11944 DEFAULT_REPLACE_CHAR("?");
11945 }
11946
11947 while (p < e) {
11948 int ret = rb_enc_precise_mbclen(p, e, enc);
11949 if (MBCLEN_NEEDMORE_P(ret)) {
11950 break;
11951 }
11952 else if (MBCLEN_CHARFOUND_P(ret)) {
11953 p += MBCLEN_CHARFOUND_LEN(ret);
11954 }
11955 else if (MBCLEN_INVALID_P(ret)) {
11956 const char *q = p;
11957 long clen = rb_enc_mbmaxlen(enc);
11958 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11959 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11960
11961 if (e - p < clen) clen = e - p;
11962 if (clen <= mbminlen * 2) {
11963 clen = mbminlen;
11964 }
11965 else {
11966 clen -= mbminlen;
11967 for (; clen > mbminlen; clen-=mbminlen) {
11968 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11969 if (MBCLEN_NEEDMORE_P(ret)) break;
11970 if (MBCLEN_INVALID_P(ret)) continue;
11972 }
11973 }
11974 if (rep) {
11975 rb_str_buf_cat(buf, rep, replen);
11976 }
11977 else {
11978 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11979 str_mod_check(str, sp, slen);
11980 repl = str_compat_and_valid(repl, enc);
11981 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11982 }
11983 p += clen;
11984 p1 = p;
11985 }
11986 else {
11988 }
11989 }
11990 if (NIL_P(buf)) {
11991 if (p == e) {
11993 return Qnil;
11994 }
11995 buf = rb_str_buf_new(RSTRING_LEN(str));
11996 }
11997 if (p1 < p) {
11998 rb_str_buf_cat(buf, p1, p - p1);
11999 }
12000 if (p < e) {
12001 if (rep) {
12002 rb_str_buf_cat(buf, rep, replen);
12003 }
12004 else {
12005 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
12006 str_mod_check(str, sp, slen);
12007 repl = str_compat_and_valid(repl, enc);
12008 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12009 }
12010 }
12012 }
12013 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
12014 return buf;
12015}
12016
12017/*
12018 * call-seq:
12019 * scrub(replacement_string = default_replacement_string) -> new_string
12020 * scrub{|sequence| ... } -> new_string
12021 *
12022 * :include: doc/string/scrub.rdoc
12023 *
12024 */
12025static VALUE
12026str_scrub(int argc, VALUE *argv, VALUE str)
12027{
12028 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12029 VALUE new = rb_str_scrub(str, repl);
12030 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
12031}
12032
12033/*
12034 * call-seq:
12035 * scrub!(replacement_string = default_replacement_string) -> self
12036 * scrub!{|sequence| ... } -> self
12037 *
12038 * Like String#scrub, except that:
12039 *
12040 * - Any replacements are made in +self+.
12041 * - Returns +self+.
12042 *
12043 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12044 *
12045 */
12046static VALUE
12047str_scrub_bang(int argc, VALUE *argv, VALUE str)
12048{
12049 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12050 VALUE new = rb_str_scrub(str, repl);
12051 if (!NIL_P(new)) rb_str_replace(str, new);
12052 return str;
12053}
12054
12055static ID id_normalize;
12056static ID id_normalized_p;
12057static VALUE mUnicodeNormalize;
12058
12059static VALUE
12060unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
12061{
12062 static int UnicodeNormalizeRequired = 0;
12063 VALUE argv2[2];
12064
12065 if (!UnicodeNormalizeRequired) {
12066 rb_require("unicode_normalize/normalize.rb");
12067 UnicodeNormalizeRequired = 1;
12068 }
12069 argv2[0] = str;
12070 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
12071 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
12072}
12073
12074/*
12075 * call-seq:
12076 * unicode_normalize(form = :nfc) -> string
12077 *
12078 * :include: doc/string/unicode_normalize.rdoc
12079 *
12080 */
12081static VALUE
12082rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
12083{
12084 return unicode_normalize_common(argc, argv, str, id_normalize);
12085}
12086
12087/*
12088 * call-seq:
12089 * unicode_normalize!(form = :nfc) -> self
12090 *
12091 * Like String#unicode_normalize, except that the normalization
12092 * is performed on +self+ (not on a copy of +self+).
12093 *
12094 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12095 *
12096 */
12097static VALUE
12098rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12099{
12100 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12101}
12102
12103/* call-seq:
12104 * unicode_normalized?(form = :nfc) -> true or false
12105 *
12106 * Returns whether +self+ is in the given +form+ of Unicode normalization;
12107 * see String#unicode_normalize.
12108 *
12109 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12110 *
12111 * Examples:
12112 *
12113 * "a\u0300".unicode_normalized? # => false
12114 * "a\u0300".unicode_normalized?(:nfd) # => true
12115 * "\u00E0".unicode_normalized? # => true
12116 * "\u00E0".unicode_normalized?(:nfd) # => false
12117 *
12118 *
12119 * Raises an exception if +self+ is not in a Unicode encoding:
12120 *
12121 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12122 * s.unicode_normalized? # Raises Encoding::CompatibilityError
12123 *
12124 * Related: see {Querying}[rdoc-ref:String@Querying].
12125 */
12126static VALUE
12127rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12128{
12129 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12130}
12131
12132/**********************************************************************
12133 * Document-class: Symbol
12134 *
12135 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12136 *
12137 * You can create a +Symbol+ object explicitly with:
12138 *
12139 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12140 *
12141 * The same +Symbol+ object will be
12142 * created for a given name or string for the duration of a program's
12143 * execution, regardless of the context or meaning of that name. Thus
12144 * if <code>Fred</code> is a constant in one context, a method in
12145 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12146 * will be the same object in all three contexts.
12147 *
12148 * module One
12149 * class Fred
12150 * end
12151 * $f1 = :Fred
12152 * end
12153 * module Two
12154 * Fred = 1
12155 * $f2 = :Fred
12156 * end
12157 * def Fred()
12158 * end
12159 * $f3 = :Fred
12160 * $f1.object_id #=> 2514190
12161 * $f2.object_id #=> 2514190
12162 * $f3.object_id #=> 2514190
12163 *
12164 * Constant, method, and variable names are returned as symbols:
12165 *
12166 * module One
12167 * Two = 2
12168 * def three; 3 end
12169 * @four = 4
12170 * @@five = 5
12171 * $six = 6
12172 * end
12173 * seven = 7
12174 *
12175 * One.constants
12176 * # => [:Two]
12177 * One.instance_methods(true)
12178 * # => [:three]
12179 * One.instance_variables
12180 * # => [:@four]
12181 * One.class_variables
12182 * # => [:@@five]
12183 * global_variables.grep(/six/)
12184 * # => [:$six]
12185 * local_variables
12186 * # => [:seven]
12187 *
12188 * A +Symbol+ object differs from a String object in that
12189 * a +Symbol+ object represents an identifier, while a String object
12190 * represents text or data.
12191 *
12192 * == What's Here
12193 *
12194 * First, what's elsewhere. Class +Symbol+:
12195 *
12196 * - Inherits from {class Object}[rdoc-ref:Object@Whats+Here].
12197 * - Includes {module Comparable}[rdoc-ref:Comparable@Whats+Here].
12198 *
12199 * Here, class +Symbol+ provides methods that are useful for:
12200 *
12201 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12202 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12203 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12204 *
12205 * === Methods for Querying
12206 *
12207 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12208 * - #=~: Returns the index of the first substring in symbol that matches a
12209 * given Regexp or other object; returns +nil+ if no match is found.
12210 * - #[], #slice: Returns a substring of symbol
12211 * determined by a given index, start/length, or range, or string.
12212 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12213 * - #encoding: Returns the Encoding object that represents the encoding
12214 * of symbol.
12215 * - #end_with?: Returns +true+ if symbol ends with
12216 * any of the given strings.
12217 * - #match: Returns a MatchData object if symbol
12218 * matches a given Regexp; +nil+ otherwise.
12219 * - #match?: Returns +true+ if symbol
12220 * matches a given Regexp; +false+ otherwise.
12221 * - #length, #size: Returns the number of characters in symbol.
12222 * - #start_with?: Returns +true+ if symbol starts with
12223 * any of the given strings.
12224 *
12225 * === Methods for Comparing
12226 *
12227 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12228 * or larger than symbol.
12229 * - #==, #===: Returns +true+ if a given symbol has the same content and
12230 * encoding.
12231 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12232 * symbol is smaller than, equal to, or larger than symbol.
12233 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12234 * after Unicode case folding; +false+ otherwise.
12235 *
12236 * === Methods for Converting
12237 *
12238 * - #capitalize: Returns symbol with the first character upcased
12239 * and all other characters downcased.
12240 * - #downcase: Returns symbol with all characters downcased.
12241 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12242 * - #name: Returns the frozen string corresponding to symbol.
12243 * - #succ, #next: Returns the symbol that is the successor to symbol.
12244 * - #swapcase: Returns symbol with all upcase characters downcased
12245 * and all downcase characters upcased.
12246 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12247 * - #to_s, #id2name: Returns the string corresponding to +self+.
12248 * - #to_sym, #intern: Returns +self+.
12249 * - #upcase: Returns symbol with all characters upcased.
12250 *
12251 */
12252
12253
/*
 * sym_equal is only an alias for rb_obj_equal; Init_String registers it
 * for both Symbol#== and Symbol#===.
 */
12254/*
12255 * call-seq:
12256 * self == other -> true or false
12257 *
12258 * Returns whether +other+ is the same object as +self+.
12259 */
12260
12261#define sym_equal rb_obj_equal
12262
12263static int
12264sym_printable(const char *s, const char *send, rb_encoding *enc)
12265{
12266 while (s < send) {
12267 int n;
12268 int c = rb_enc_precise_mbclen(s, send, enc);
12269
12270 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12271 n = MBCLEN_CHARFOUND_LEN(c);
12272 c = rb_enc_mbc_to_codepoint(s, send, enc);
12273 if (!rb_enc_isprint(c, enc)) return FALSE;
12274 s += n;
12275 }
12276 return TRUE;
12277}
12278
12279int
12280rb_str_symname_p(VALUE sym)
12281{
12282 rb_encoding *enc;
12283 const char *ptr;
12284 long len;
12285 rb_encoding *resenc = rb_default_internal_encoding();
12286
12287 if (resenc == NULL) resenc = rb_default_external_encoding();
12288 enc = STR_ENC_GET(sym);
12289 ptr = RSTRING_PTR(sym);
12290 len = RSTRING_LEN(sym);
12291 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12292 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12293 return FALSE;
12294 }
12295 return TRUE;
12296}
12297
12298VALUE
12299rb_str_quote_unprintable(VALUE str)
12300{
12301 rb_encoding *enc;
12302 const char *ptr;
12303 long len;
12304 rb_encoding *resenc;
12305
12306 Check_Type(str, T_STRING);
12307 resenc = rb_default_internal_encoding();
12308 if (resenc == NULL) resenc = rb_default_external_encoding();
12309 enc = STR_ENC_GET(str);
12310 ptr = RSTRING_PTR(str);
12311 len = RSTRING_LEN(str);
12312 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12313 !sym_printable(ptr, ptr + len, enc)) {
12314 return rb_str_escape(str);
12315 }
12316 return str;
12317}
12318
12319VALUE
12320rb_id_quote_unprintable(ID id)
12321{
12322 VALUE str = rb_id2str(id);
12323 if (!rb_str_symname_p(str)) {
12324 return rb_str_escape(str);
12325 }
12326 return str;
12327}
12328
12329/*
12330 * call-seq:
12331 * inspect -> string
12332 *
12333 * Returns a string representation of +self+ (including the leading colon):
12334 *
12335 * :foo.inspect # => ":foo"
12336 *
12337 * Related: Symbol#to_s, Symbol#name.
12338 *
12339 */
12340
12341static VALUE
12342sym_inspect(VALUE sym)
12343{
12344 VALUE str = rb_sym2str(sym);
12345 const char *ptr;
12346 long len;
12347 char *dest;
12348
12349 if (!rb_str_symname_p(str)) {
12350 str = rb_str_inspect(str);
12351 len = RSTRING_LEN(str);
12352 rb_str_resize(str, len + 1);
12353 dest = RSTRING_PTR(str);
12354 memmove(dest + 1, dest, len);
12355 }
12356 else {
12357 rb_encoding *enc = STR_ENC_GET(str);
12358 VALUE orig_str = str;
12359
12360 len = RSTRING_LEN(orig_str);
12361 str = rb_enc_str_new(0, len + 1, enc);
12362
12363 // Get data pointer after allocation
12364 ptr = RSTRING_PTR(orig_str);
12365 dest = RSTRING_PTR(str);
12366 memcpy(dest + 1, ptr, len);
12367
12368 RB_GC_GUARD(orig_str);
12369 }
12370 dest[0] = ':';
12371
12373
12374 return str;
12375}
12376
12377VALUE
12379{
12380 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12381 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12382 return str;
12383}
12384
12385VALUE
12386rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12387{
12388 VALUE obj;
12389
12390 if (argc < 1) {
12391 rb_raise(rb_eArgError, "no receiver given");
12392 }
12393 obj = argv[0];
12394 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12395}
12396
12397/*
12398 * call-seq:
12399 * succ
12400 *
12401 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12402 *
12403 * :foo.succ # => :fop
12404 *
12405 * Related: String#succ.
12406 */
12407
12408static VALUE
12409sym_succ(VALUE sym)
12410{
12411 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12412}
12413
12414/*
12415 * call-seq:
12416 * self <=> other -> -1, 0, 1, or nil
12417 *
12418 * Compares +self+ and +other+, using String#<=>.
12419 *
12420 * Returns:
12421 *
12422 * - <tt>self.to_s <=> other.to_s</tt>, if +other+ is a symbol.
12423 * - +nil+, otherwise.
12424 *
12425 * Examples:
12426 *
12427 * :bar <=> :foo # => -1
12428 * :foo <=> :foo # => 0
12429 * :foo <=> :bar # => 1
12430 * :foo <=> 'bar' # => nil
12431 *
12432 * \Class \Symbol includes module Comparable,
12433 * each of whose methods uses Symbol#<=> for comparison.
12434 *
12435 * Related: String#<=>.
12436 */
12437
12438static VALUE
12439sym_cmp(VALUE sym, VALUE other)
12440{
12441 if (!SYMBOL_P(other)) {
12442 return Qnil;
12443 }
12444 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12445}
12446
12447/*
12448 * call-seq:
12449 * casecmp(object) -> -1, 0, 1, or nil
12450 *
12451 * :include: doc/symbol/casecmp.rdoc
12452 *
12453 */
12454
12455static VALUE
12456sym_casecmp(VALUE sym, VALUE other)
12457{
12458 if (!SYMBOL_P(other)) {
12459 return Qnil;
12460 }
12461 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12462}
12463
12464/*
12465 * call-seq:
12466 * casecmp?(object) -> true, false, or nil
12467 *
12468 * :include: doc/symbol/casecmp_p.rdoc
12469 *
12470 */
12471
12472static VALUE
12473sym_casecmp_p(VALUE sym, VALUE other)
12474{
12475 if (!SYMBOL_P(other)) {
12476 return Qnil;
12477 }
12478 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12479}
12480
12481/*
12482 * call-seq:
12483 * self =~ other -> integer or nil
12484 *
12485 * Equivalent to <tt>self.to_s =~ other</tt>,
12486 * including possible updates to global variables;
12487 * see String#=~.
12488 *
12489 */
12490
12491static VALUE
12492sym_match(VALUE sym, VALUE other)
12493{
12494 return rb_str_match(rb_sym2str(sym), other);
12495}
12496
12497/*
12498 * call-seq:
12499 * match(pattern, offset = 0) -> matchdata or nil
12500 * match(pattern, offset = 0) {|matchdata| } -> object
12501 *
12502 * Equivalent to <tt>self.to_s.match</tt>,
12503 * including possible updates to global variables;
12504 * see String#match.
12505 *
12506 */
12507
12508static VALUE
12509sym_match_m(int argc, VALUE *argv, VALUE sym)
12510{
12511 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12512}
12513
12514/*
12515 * call-seq:
12516 * match?(pattern, offset) -> true or false
12517 *
12518 * Equivalent to <tt>sym.to_s.match?</tt>;
12519 * see String#match.
12520 *
12521 */
12522
12523static VALUE
12524sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12525{
12526 return rb_str_match_m_p(argc, argv, sym);
12527}
12528
12529/*
12530 * call-seq:
12531 * self[offset] -> string or nil
12532 * self[offset, size] -> string or nil
12533 * self[range] -> string or nil
12534 * self[regexp, capture = 0] -> string or nil
12535 * self[substring] -> string or nil
12536 *
12537 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12538 *
12539 */
12540
12541static VALUE
12542sym_aref(int argc, VALUE *argv, VALUE sym)
12543{
12544 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12545}
12546
12547/*
12548 * call-seq:
12549 * length -> integer
12550 *
12551 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12552 */
12553
12554static VALUE
12555sym_length(VALUE sym)
12556{
12557 return rb_str_length(rb_sym2str(sym));
12558}
12559
12560/*
12561 * call-seq:
12562 * empty? -> true or false
12563 *
12564 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12565 *
12566 */
12567
12568static VALUE
12569sym_empty(VALUE sym)
12570{
12571 return rb_str_empty(rb_sym2str(sym));
12572}
12573
12574/*
12575 * call-seq:
12576 * upcase(mapping) -> symbol
12577 *
12578 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12579 *
12580 * See String#upcase.
12581 *
12582 */
12583
12584static VALUE
12585sym_upcase(int argc, VALUE *argv, VALUE sym)
12586{
12587 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12588}
12589
12590/*
12591 * call-seq:
12592 * downcase(mapping) -> symbol
12593 *
12594 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12595 *
12596 * See String#downcase.
12597 *
12598 * Related: Symbol#upcase.
12599 *
12600 */
12601
12602static VALUE
12603sym_downcase(int argc, VALUE *argv, VALUE sym)
12604{
12605 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12606}
12607
12608/*
12609 * call-seq:
12610 * capitalize(mapping) -> symbol
12611 *
12612 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12613 *
12614 * See String#capitalize.
12615 *
12616 */
12617
12618static VALUE
12619sym_capitalize(int argc, VALUE *argv, VALUE sym)
12620{
12621 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12622}
12623
12624/*
12625 * call-seq:
12626 * swapcase(mapping) -> symbol
12627 *
12628 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12629 *
12630 * See String#swapcase.
12631 *
12632 */
12633
12634static VALUE
12635sym_swapcase(int argc, VALUE *argv, VALUE sym)
12636{
12637 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12638}
12639
12640/*
12641 * call-seq:
12642 * start_with?(*string_or_regexp) -> true or false
12643 *
12644 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12645 *
12646 */
12647
12648static VALUE
12649sym_start_with(int argc, VALUE *argv, VALUE sym)
12650{
12651 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12652}
12653
12654/*
12655 * call-seq:
12656 * end_with?(*strings) -> true or false
12657 *
12658 *
12659 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12660 *
12661 */
12662
12663static VALUE
12664sym_end_with(int argc, VALUE *argv, VALUE sym)
12665{
12666 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12667}
12668
12669/*
12670 * call-seq:
12671 * encoding -> encoding
12672 *
12673 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12674 *
12675 */
12676
12677static VALUE
12678sym_encoding(VALUE sym)
12679{
12680 return rb_obj_encoding(rb_sym2str(sym));
12681}
12682
12683static VALUE
12684string_for_symbol(VALUE name)
12685{
12686 if (!RB_TYPE_P(name, T_STRING)) {
12687 VALUE tmp = rb_check_string_type(name);
12688 if (NIL_P(tmp)) {
12689 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12690 name);
12691 }
12692 name = tmp;
12693 }
12694 return name;
12695}
12696
12697ID
12699{
12700 if (SYMBOL_P(name)) {
12701 return SYM2ID(name);
12702 }
12703 name = string_for_symbol(name);
12704 return rb_intern_str(name);
12705}
12706
12707VALUE
12709{
12710 if (SYMBOL_P(name)) {
12711 return name;
12712 }
12713 name = string_for_symbol(name);
12714 return rb_str_intern(name);
12715}
12716
12717/*
12718 * call-seq:
12719 * Symbol.all_symbols -> array_of_symbols
12720 *
12721 * Returns an array of all symbols currently in Ruby's symbol table:
12722 *
12723 * Symbol.all_symbols.size # => 9334
12724 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12725 *
12726 */
12727
12728static VALUE
12729sym_all_symbols(VALUE _)
12730{
12731 return rb_sym_all_symbols();
12732}
12733
12734VALUE
12735rb_str_to_interned_str(VALUE str)
12736{
12737 return rb_fstring(str);
12738}
12739
12740VALUE
12741rb_interned_str(const char *ptr, long len)
12742{
12743 struct RString fake_str = {RBASIC_INIT};
12744 int encidx = ENCINDEX_US_ASCII;
12745 int coderange = ENC_CODERANGE_7BIT;
12746 if (len > 0 && search_nonascii(ptr, ptr + len)) {
12747 encidx = ENCINDEX_ASCII_8BIT;
12748 coderange = ENC_CODERANGE_VALID;
12749 }
12750 VALUE str = setup_fake_str(&fake_str, ptr, len, encidx);
12751 ENC_CODERANGE_SET(str, coderange);
12752 return register_fstring(str, true, false);
12753}
12754
12755VALUE
12757{
12758 return rb_interned_str(ptr, strlen(ptr));
12759}
12760
12761VALUE
12762rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12763{
12764 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12765 rb_enc_autoload(enc);
12766 }
12767
12768 struct RString fake_str = {RBASIC_INIT};
12769 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12770}
12771
12772VALUE
12773rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12774{
12775 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12776 rb_enc_autoload(enc);
12777 }
12778
12779 struct RString fake_str = {RBASIC_INIT};
12780 VALUE str = register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12781 RUBY_ASSERT(RB_OBJ_SHAREABLE_P(str) && (rb_gc_verify_shareable(str), 1));
12782 return str;
12783}
12784
12785VALUE
12787{
12788 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12789}
12790
#if USE_YJIT || USE_ZJIT
/*
 * Appends +codepoint+ to +str+.  When +str+ is ASCII-8BIT and the code is
 * in 0...0xff, the byte is appended directly; every other case goes
 * through rb_str_concat().  The upper bound is exclusive, so 0xff itself
 * takes the generic path.
 */
void
rb_jit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        const ssize_t code = RB_NUM2SSIZE(codepoint);
        const int single_byte_p = (code >= 0 && code < 0xff);

        if (RB_LIKELY(single_byte_p)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12807
12808static int
12809fstring_set_class_i(VALUE *str, void *data)
12810{
12811 RBASIC_SET_CLASS(*str, rb_cString);
12812
12813 return ST_CONTINUE;
12814}
12815
/*
 * Init_String: registers the String and Symbol methods implemented in this
 * file, interns the case-mapping option symbols, records the
 * UnicodeNormalize module and its method IDs, and defines the $; / $-F
 * variables.
 *
 * NOTE(review): the embedded line numbers in this listing skip in places
 * (e.g. 12818 -> 12820, 12989 -> 12994), so some statements of the real
 * function are presumably not shown here -- confirm against upstream.
 */
12816void
12817Init_String(void)
12818{
12820
/* Visit every entry of the fstring table; fstring_set_class_i sets its class to rb_cString. */
12821 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12822
/* String: allocator, singleton methods and instance methods. */
12824 rb_define_alloc_func(rb_cString, empty_str_alloc);
12825 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12826 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12827 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12829 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12830 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12833 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12834 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12835 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12836 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12839 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12840 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12841 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12842 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12845 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12846 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12847 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12848 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12849 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12851 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12853 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12854 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12855 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12856 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12857 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12858 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12859 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12860 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12861 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12862 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12863 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12864 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12865 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12866 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12868 rb_define_method(rb_cString, "+@", str_uplus, 0);
12869 rb_define_method(rb_cString, "-@", str_uminus, 0);
12870 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12871 rb_define_alias(rb_cString, "dedup", "-@");
12872
12873 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12874 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12875 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12876 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12879 rb_define_method(rb_cString, "undump", str_undump, 0);
12880
/* Symbols :ascii, :turkic, :lithuanian and :fold, created once and kept in file-level variables. */
12881 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12882 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12883 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12884 sym_fold = ID2SYM(rb_intern_const("fold"));
12885
12886 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12887 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12888 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12889 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12890
12891 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12892 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12893 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12894 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12895
12896 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12897 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12898 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12899 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12900 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12901 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12902 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12903 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12904 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12905 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12906 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12907 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12909 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12910 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12911 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12912 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12913 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12914
12915 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12916 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12917 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12918
12919 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12920
12921 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12922 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12923 rb_define_method(rb_cString, "center", rb_str_center, -1);
12924
12925 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12926 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12927 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12928 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12929 rb_define_method(rb_cString, "strip", rb_str_strip, -1);
12930 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, -1);
12931 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, -1);
12932 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12933 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12934
12935 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12936 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12937 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12938 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12939 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, -1);
12940 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, -1);
12941 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, -1);
12942 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12943 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12944
12945 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12946 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12947 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12948 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12949 rb_define_method(rb_cString, "count", rb_str_count, -1);
12950
12951 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12952 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12953 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12954 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12955
12956 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12957 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12958 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12959 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12960 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12961
12962 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12963
12964 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12965 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12966
12967 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12968 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12969
12970 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12971 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12972 rb_define_method(rb_cString, "b", rb_str_b, 0);
12973 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12974 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12975
12976 /* define UnicodeNormalize module here so that we don't have to look it up */
12977 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12978 id_normalize = rb_intern_const("normalize");
12979 id_normalized_p = rb_intern_const("normalized?");
12980
12981 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12982 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12983 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12984
/* $; and $-F are both backed by rb_fs, with rb_fs_setter as the setter; the slot is registered with the GC. */
12985 rb_fs = Qnil;
12986 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12987 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12988 rb_gc_register_address(&rb_fs);
12989
/* Symbol: singleton method and instance methods. */
12994 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12995
12996 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12997 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12998 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12999 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
13000 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
13001 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
13002
13003 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
13004 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
13005 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
13006 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
13007
13008 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
13009 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
13010 rb_define_method(rb_cSymbol, "length", sym_length, 0);
13011 rb_define_method(rb_cSymbol, "size", sym_length, 0);
13012 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
13013 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
13014 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
13015
13016 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
13017 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
13018 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
13019 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
13020
13021 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
13022 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
13023
13024 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
13025}
13026
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1200
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:696
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:404
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1603
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1396
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1509
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2771
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2581
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3061
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1018
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2850
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:130
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:133
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1684
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:131
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:128
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:125
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:122
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:127
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:65
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:129
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:126
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:134
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:477
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:661
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3967
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1431
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1427
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1434
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1425
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1429
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_cObject
Object class.
Definition object.c:61
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:646
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2255
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2273
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1325
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3651
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:235
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:553
VALUE rb_cSymbol
Symbol class.
Definition string.c:82
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:141
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1313
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:81
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3335
Encoding related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1344
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1209
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3054
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1228
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns an "f"string.
Definition string.c:12762
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:252
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2334
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3771
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1157
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1449
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1350
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:970
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns an "f"string.
Definition string.c:12786
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:826
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:755
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2714
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2977
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1742
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1121
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1208
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:208
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:242
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:709
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:2037
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current pr...
Definition symbol.c:1091
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:2043
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1949
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1286
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4360
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3852
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1532
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1967
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1754
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1514
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2487
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1584
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:946
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:940
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3836
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1425
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12378
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2560
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1401
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1748
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3082
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5377
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4199
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3189
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11699
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1791
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1499
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1790
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1682
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1191
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1533
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:1005
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1520
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1996
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4185
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3604
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2423
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:2014
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1640
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1568
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6588
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3197
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1147
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12756
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1431
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1605
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3802
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3129
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4306
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3423
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7267
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2792
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12741
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4253
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4073
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4228
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1693
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3778
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3314
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5864
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11757
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1626
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1704
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:632
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2976
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3286
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1657
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3405
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1203
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1550
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2746
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7374
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1413
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1720
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2437
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1515
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5779
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9379
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1197
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:968
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1852
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2064
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2143
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3485
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1742
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:285
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:1024
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12708
ID rb_to_id(VALUE str)
Identical to rb_intern_str(), except it tries to convert the parameter object to an instance of rb_cS...
Definition string.c:12698
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define RB_OBJ_SHAREABLE_P(obj)
Queries if the passed object has previously classified as shareable or not.
Definition ractor.h:235
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1896
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3566
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1376
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:166
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1443
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2953
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:438
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:409
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:450
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2811
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1437
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2824
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1781
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define RUBY_TYPED_FREE_IMMEDIATELY
Macros to see if each corresponding flag is defined.
Definition rtypeddata.h:122
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:531
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1466
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
union RString::@58::@59::@61 aux
Auxiliary info.
struct RString::@58::@60 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
VALUE shared
Parent of the string.
Definition rstring.h:240
struct RString::@58::@59 heap
Strings that use separated memory region for contents use this pattern.
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
union RString::@58 as
String's specific fields.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:229
Definition string.c:8261
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:308
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113