Ruby 3.3.7p123 (2025-01-15 revision be31f993d7fa0219d85f7b3c694d454da4ecc10b)
string.c
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/numeric.h"
35#include "internal/object.h"
36#include "internal/proc.h"
37#include "internal/re.h"
38#include "internal/sanitizers.h"
39#include "internal/string.h"
40#include "internal/transcode.h"
41#include "probes.h"
42#include "ruby/encoding.h"
43#include "ruby/re.h"
44#include "ruby/util.h"
45#include "ruby_assert.h"
46#include "vm_sync.h"
47
48#if defined HAVE_CRYPT_R
49# if defined HAVE_CRYPT_H
50# include <crypt.h>
51# endif
52#elif !defined HAVE_CRYPT
53# include "missing/crypt.h"
54# define HAVE_CRYPT_R 1
55#endif
56
57#define BEG(no) (regs->beg[(no)])
58#define END(no) (regs->end[(no)])
59
60#undef rb_str_new
61#undef rb_usascii_str_new
62#undef rb_utf8_str_new
63#undef rb_enc_str_new
64#undef rb_str_new_cstr
65#undef rb_usascii_str_new_cstr
66#undef rb_utf8_str_new_cstr
67#undef rb_enc_str_new_cstr
68#undef rb_external_str_new_cstr
69#undef rb_locale_str_new_cstr
70#undef rb_str_dup_frozen
71#undef rb_str_buf_new_cstr
72#undef rb_str_buf_cat
73#undef rb_str_buf_cat2
74#undef rb_str_cat2
75#undef rb_str_cat_cstr
76#undef rb_fstring_cstr
77
80
81/* FLAGS of RString
82 *
83 * 1: RSTRING_NOEMBED
84 * 2: STR_SHARED (== ELTS_SHARED)
85 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
86 * other strings that rely on this string's buffer)
87 * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
88 * early, specific to rb_str_tmp_frozen_{acquire,release})
89 * 7: STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
90 * such as read(2). Any modification and realloc is prohibited)
91 *
92 * 8-9: ENC_CODERANGE (2 bits)
93 * 10-16: ENCODING (7 bits == 128)
94 * 17: RSTRING_FSTR
95 * 18: STR_NOFREE (do not free this string's buffer when a String is freed.
96 * used for a string object based on C string literal)
97 * 19: STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
98 * object header is temporarily allocated on C stack)
99 */
100
#define RUBY_MAX_CHAR_LEN 16

/* String-private flag bits; the bit numbers match the FLAGS table above. */
#define STR_SHARED_ROOT FL_USER5
#define STR_BORROWED    FL_USER6
#define STR_TMPLOCK     FL_USER7
#define STR_NOFREE      FL_USER18
#define STR_FAKESTR     FL_USER19

/* Flag str as heap-backed and clear every sharing-related flag. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
} while (0)

/* Flag str as embedded; STR_NOFREE is cleared together with STR_NOEMBED. */
#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))

/* Store n as the length of str.  Only the length field is written. */
#define STR_SET_LEN(str, n) do { \
    RSTRING(str)->len = (n); \
} while (0)
117
118static inline bool
119str_enc_fastpath(VALUE str)
120{
121 // The overwhelming majority of strings are in one of these 3 encodings.
122 switch (ENCODING_GET_INLINED(str)) {
123 case ENCINDEX_ASCII_8BIT:
124 case ENCINDEX_UTF_8:
125 case ENCINDEX_US_ASCII:
126 return true;
127 default:
128 return false;
129 }
130}
131
/* Terminator width for str: 1 for the fast-path encodings, otherwise the
 * minimum character length of the string's encoding. */
#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))

/* Zero `termlen` bytes starting at ptr.  The first byte is stored directly;
 * memset() is only reached for terminators wider than one byte. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_dst = (ptr);\
    const int term_fill_width = (termlen);\
    term_fill_dst[0] = '\0';\
    if (UNLIKELY(term_fill_width > 1))\
        memset(term_fill_dst, 0, term_fill_width);\
} while (0)
140
/* Resize the buffer of str to `capacity` bytes plus the terminator width
 * derived from the encoding of str. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/* Resize the buffer of str to `capacity` bytes plus `termlen` terminator
 * bytes.
 *
 * - Embedded string that no longer fits: a heap buffer is allocated, the
 *   current bytes are copied into it and the string becomes non-embedded.
 * - Embedded string that still fits: nothing happens.
 * - Heap string: the buffer is reallocated in place; it must not be shared.
 *
 * `capacity` and `termlen` are parenthesized at every use so that arbitrary
 * expressions can be passed safely. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
164
/* Make str share the buffer of shared_str: record shared_str in the aux
 * slot of str (with a write barrier), flag str as STR_SHARED and shared_str
 * as STR_SHARED_ROOT.  Nothing is done for STR_FAKESTR strings.  The asserts
 * check that the pointer of str lies within the buffer of shared_str. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) { /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
        } \
    } \
} while (0)
176
/* Heap buffer pointer and its allocated size (capacity plus terminator). */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */

#define STR_ENC_GET(str) get_encoding(str)

/* With SHARABLE_MIDDLE_SUBSTRING disabled (the default), only a substring
 * that reaches the end of the original is considered sharable. */
#ifndef SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_SUBSTRING_P(beg, len, end) 1
#else
# define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#endif
191
192
193static inline long
194str_embed_capa(VALUE str)
195{
196 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
197}
198
199bool
200rb_str_reembeddable_p(VALUE str)
201{
202 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
203}
204
205static inline size_t
206rb_str_embed_size(long capa)
207{
208 return offsetof(struct RString, as.embed.ary) + capa;
209}
210
211size_t
212rb_str_size_as_embedded(VALUE str)
213{
214 size_t real_size;
215 if (STR_EMBED_P(str)) {
216 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
217 }
218 /* if the string is not currently embedded, but it can be embedded, how
219 * much space would it require */
220 else if (rb_str_reembeddable_p(str)) {
221 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
222 }
223 else {
224 real_size = sizeof(struct RString);
225 }
226 return real_size;
227}
228
/* Can an object embedding len + termlen bytes be allocated by the GC? */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(needed);
}
234
235static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
236static VALUE str_new_frozen(VALUE klass, VALUE orig);
237static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
238static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
239static VALUE str_new(VALUE klass, const char *ptr, long len);
240static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
241static inline void str_modifiable(VALUE str);
242static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
243
244static inline void
245str_make_independent(VALUE str)
246{
247 long len = RSTRING_LEN(str);
248 int termlen = TERM_LEN(str);
249 str_make_independent_expand((str), len, 0L, termlen);
250}
251
252static inline int str_dependent_p(VALUE str);
253
254void
255rb_str_make_independent(VALUE str)
256{
257 if (str_dependent_p(str)) {
258 str_make_independent(str);
259 }
260}
261
262void
263rb_str_make_embedded(VALUE str)
264{
265 RUBY_ASSERT(rb_str_reembeddable_p(str));
266 RUBY_ASSERT(!STR_EMBED_P(str));
267
268 char *buf = RSTRING(str)->as.heap.ptr;
269 long len = RSTRING(str)->len;
270
271 STR_SET_EMBED(str);
272 STR_SET_LEN(str, len);
273
274 if (len > 0) {
275 memcpy(RSTRING_PTR(str), buf, len);
276 ruby_xfree(buf);
277 }
278
279 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
280}
281
/* Diagnostic hook: report on stderr that `func` is about to return a NULL
 * string pointer.  Nothing else is done. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
291
292/* symbols for [up|down|swap]case/capitalize options */
293static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
294
295static rb_encoding *
296get_encoding(VALUE str)
297{
298 return rb_enc_from_index(ENCODING_GET(str));
299}
300
301static void
302mustnot_broken(VALUE str)
303{
304 if (is_broken_string(str)) {
305 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
306 }
307}
308
309static void
310mustnot_wchar(VALUE str)
311{
312 rb_encoding *enc = STR_ENC_GET(str);
313 if (rb_enc_mbminlen(enc) > 1) {
314 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
315 }
316}
317
318static int fstring_cmp(VALUE a, VALUE b);
319
320static VALUE register_fstring(VALUE str, bool copy);
321
322const struct st_hash_type rb_fstring_hash_type = {
323 fstring_cmp,
325};
326
327#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
328
330 VALUE fstr;
331 bool copy;
332};
333
334static int
335fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
336{
337
338 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
339 VALUE str = (VALUE)*key;
340
341 if (existing) {
342 /* because of lazy sweep, str may be unmarked already and swept
343 * at next time */
344
345 if (rb_objspace_garbage_object_p(str)) {
346 arg->fstr = Qundef;
347 return ST_DELETE;
348 }
349
350 arg->fstr = str;
351 return ST_STOP;
352 }
353 else {
354 if (FL_TEST_RAW(str, STR_FAKESTR)) {
355 if (arg->copy) {
356 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
357 rb_enc_copy(new_str, str);
358 str = new_str;
359 }
360 else {
361 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
362 RSTRING(str)->len,
363 ENCODING_GET(str));
364 }
365 OBJ_FREEZE_RAW(str);
366 }
367 else {
368 if (!OBJ_FROZEN(str))
369 str = str_new_frozen(rb_cString, str);
370 if (STR_SHARED_P(str)) { /* str should not be shared */
371 /* shared substring */
372 str_make_independent(str);
373 assert(OBJ_FROZEN(str));
374 }
375 if (!BARE_STRING_P(str)) {
376 str = str_new_frozen(rb_cString, str);
377 }
378 }
379 RBASIC(str)->flags |= RSTRING_FSTR;
380
381 *key = *value = arg->fstr = str;
382 return ST_CONTINUE;
383 }
384}
385
386RUBY_FUNC_EXPORTED
387VALUE
388rb_fstring(VALUE str)
389{
390 VALUE fstr;
391 int bare;
392
393 Check_Type(str, T_STRING);
394
395 if (FL_TEST(str, RSTRING_FSTR))
396 return str;
397
398 bare = BARE_STRING_P(str);
399 if (!bare) {
400 if (STR_EMBED_P(str)) {
401 OBJ_FREEZE_RAW(str);
402 return str;
403 }
404
405 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
406 assert(OBJ_FROZEN(str));
407 return str;
408 }
409 }
410
411 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE))
412 rb_str_resize(str, RSTRING_LEN(str));
413
414 fstr = register_fstring(str, FALSE);
415
416 if (!bare) {
417 str_replace_shared_without_enc(str, fstr);
418 OBJ_FREEZE_RAW(str);
419 return str;
420 }
421 return fstr;
422}
423
424static VALUE
425register_fstring(VALUE str, bool copy)
426{
427 struct fstr_update_arg args;
428 args.copy = copy;
429
430 RB_VM_LOCK_ENTER();
431 {
432 st_table *frozen_strings = rb_vm_fstring_table();
433 do {
434 args.fstr = str;
435 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
436 } while (UNDEF_P(args.fstr));
437 }
438 RB_VM_LOCK_LEAVE();
439
440 assert(OBJ_FROZEN(args.fstr));
441 assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
442 assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
443 assert(RBASIC_CLASS(args.fstr) == rb_cString);
444 return args.fstr;
445}
446
447static VALUE
448setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
449{
450 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
451 /* SHARED to be allocated by the callback */
452
453 if (!name) {
455 name = "";
456 }
457
458 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
459
460 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
461 fake_str->len = len;
462 fake_str->as.heap.ptr = (char *)name;
463 fake_str->as.heap.aux.capa = len;
464 return (VALUE)fake_str;
465}
466
467/*
468 * set up a fake string which refers a static string literal.
469 */
470VALUE
471rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
472{
473 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
474}
475
476/*
477 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
478 * shared string which refers a static string literal. `ptr` must
479 * point a constant string.
480 */
481VALUE
482rb_fstring_new(const char *ptr, long len)
483{
484 struct RString fake_str;
485 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
486}
487
488VALUE
489rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
490{
491 struct RString fake_str;
492 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
493}
494
495VALUE
496rb_fstring_cstr(const char *ptr)
497{
498 return rb_fstring_new(ptr, strlen(ptr));
499}
500
501static int
502fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
503{
504 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
505 return ST_CONTINUE;
506}
507
508static int
509fstring_cmp(VALUE a, VALUE b)
510{
511 long alen, blen;
512 const char *aptr, *bptr;
513 RSTRING_GETMEM(a, aptr, alen);
514 RSTRING_GETMEM(b, bptr, blen);
515 return (alen != blen ||
516 ENCODING_GET(a) != ENCODING_GET(b) ||
517 memcmp(aptr, bptr, alen) != 0);
518}
519
520static inline int
521single_byte_optimizable(VALUE str)
522{
523 rb_encoding *enc;
524
525 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
527 return 1;
528
529 enc = STR_ENC_GET(str);
530 if (rb_enc_mbmaxlen(enc) == 1)
531 return 1;
532
533 /* Conservative. Possibly single byte.
534 * "\xa1" in Shift_JIS for example. */
535 return 0;
536}
537
539
540static inline const char *
541search_nonascii(const char *p, const char *e)
542{
543 const uintptr_t *s, *t;
544
545#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
546# if SIZEOF_UINTPTR_T == 8
547# define NONASCII_MASK UINT64_C(0x8080808080808080)
548# elif SIZEOF_UINTPTR_T == 4
549# define NONASCII_MASK UINT32_C(0x80808080)
550# else
551# error "don't know what to do."
552# endif
553#else
554# if SIZEOF_UINTPTR_T == 8
555# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
556# elif SIZEOF_UINTPTR_T == 4
557# define NONASCII_MASK 0x80808080UL /* or...? */
558# else
559# error "don't know what to do."
560# endif
561#endif
562
563 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
564#if !UNALIGNED_WORD_ACCESS
565 if ((uintptr_t)p % SIZEOF_VOIDP) {
566 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
567 p += l;
568 switch (l) {
569 default: UNREACHABLE;
570#if SIZEOF_VOIDP > 4
571 case 7: if (p[-7]&0x80) return p-7;
572 case 6: if (p[-6]&0x80) return p-6;
573 case 5: if (p[-5]&0x80) return p-5;
574 case 4: if (p[-4]&0x80) return p-4;
575#endif
576 case 3: if (p[-3]&0x80) return p-3;
577 case 2: if (p[-2]&0x80) return p-2;
578 case 1: if (p[-1]&0x80) return p-1;
579 case 0: break;
580 }
581 }
582#endif
583#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
584#define aligned_ptr(value) \
585 __builtin_assume_aligned((value), sizeof(uintptr_t))
586#else
587#define aligned_ptr(value) (uintptr_t *)(value)
588#endif
589 s = aligned_ptr(p);
590 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
591#undef aligned_ptr
592 for (;s < t; s++) {
593 if (*s & NONASCII_MASK) {
594#ifdef WORDS_BIGENDIAN
595 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
596#else
597 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
598#endif
599 }
600 }
601 p = (const char *)s;
602 }
603
604 switch (e - p) {
605 default: UNREACHABLE;
606#if SIZEOF_VOIDP > 4
607 case 7: if (e[-7]&0x80) return e-7;
608 case 6: if (e[-6]&0x80) return e-6;
609 case 5: if (e[-5]&0x80) return e-5;
610 case 4: if (e[-4]&0x80) return e-4;
611#endif
612 case 3: if (e[-3]&0x80) return e-3;
613 case 2: if (e[-2]&0x80) return e-2;
614 case 1: if (e[-1]&0x80) return e-1;
615 case 0: return NULL;
616 }
617}
618
619static int
620coderange_scan(const char *p, long len, rb_encoding *enc)
621{
622 const char *e = p + len;
623
624 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
625 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
626 p = search_nonascii(p, e);
628 }
629
630 if (rb_enc_asciicompat(enc)) {
631 p = search_nonascii(p, e);
632 if (!p) return ENC_CODERANGE_7BIT;
633 for (;;) {
634 int ret = rb_enc_precise_mbclen(p, e, enc);
636 p += MBCLEN_CHARFOUND_LEN(ret);
637 if (p == e) break;
638 p = search_nonascii(p, e);
639 if (!p) break;
640 }
641 }
642 else {
643 while (p < e) {
644 int ret = rb_enc_precise_mbclen(p, e, enc);
646 p += MBCLEN_CHARFOUND_LEN(ret);
647 }
648 }
649 return ENC_CODERANGE_VALID;
650}
651
652long
653rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
654{
655 const char *p = s;
656
657 if (*cr == ENC_CODERANGE_BROKEN)
658 return e - s;
659
660 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
661 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
662 if (*cr == ENC_CODERANGE_VALID) return e - s;
663 p = search_nonascii(p, e);
665 return e - s;
666 }
667 else if (rb_enc_asciicompat(enc)) {
668 p = search_nonascii(p, e);
669 if (!p) {
670 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
671 return e - s;
672 }
673 for (;;) {
674 int ret = rb_enc_precise_mbclen(p, e, enc);
675 if (!MBCLEN_CHARFOUND_P(ret)) {
677 return p - s;
678 }
679 p += MBCLEN_CHARFOUND_LEN(ret);
680 if (p == e) break;
681 p = search_nonascii(p, e);
682 if (!p) break;
683 }
684 }
685 else {
686 while (p < e) {
687 int ret = rb_enc_precise_mbclen(p, e, enc);
688 if (!MBCLEN_CHARFOUND_P(ret)) {
690 return p - s;
691 }
692 p += MBCLEN_CHARFOUND_LEN(ret);
693 }
694 }
696 return e - s;
697}
698
699static inline void
700str_enc_copy(VALUE str1, VALUE str2)
701{
702 rb_enc_set_index(str1, ENCODING_GET(str2));
703}
704
705/* Like str_enc_copy, but does not check frozen status of str1.
706 * You should use this only if you're certain that str1 is not frozen. */
707static inline void
708str_enc_copy_direct(VALUE str1, VALUE str2)
709{
710 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
711 if (inlined_encoding == ENCODING_INLINE_MAX) {
712 rb_enc_set_index(str1, rb_enc_get_index(str2));
713 }
714 else {
715 ENCODING_SET_INLINED(str1, inlined_encoding);
716 }
717}
718
719static void
720rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
721{
722 /* this function is designed for copying encoding and coderange
723 * from src to new string "dest" which is made from the part of src.
724 */
725 str_enc_copy(dest, src);
726 if (RSTRING_LEN(dest) == 0) {
727 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
729 else
731 return;
732 }
733 switch (ENC_CODERANGE(src)) {
736 break;
738 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
739 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
741 else
743 break;
744 default:
745 break;
746 }
747}
748
749static void
750rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
751{
752 str_enc_copy(dest, src);
754}
755
756static int
757enc_coderange_scan(VALUE str, rb_encoding *enc)
758{
759 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
760}
761
762int
763rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
764{
765 return enc_coderange_scan(str, enc);
766}
767
768int
769rb_enc_str_coderange(VALUE str)
770{
771 int cr = ENC_CODERANGE(str);
772
773 if (cr == ENC_CODERANGE_UNKNOWN) {
774 cr = enc_coderange_scan(str, get_encoding(str));
775 ENC_CODERANGE_SET(str, cr);
776 }
777 return cr;
778}
779
780int
782{
783 rb_encoding *enc = STR_ENC_GET(str);
784
785 if (!rb_enc_asciicompat(enc))
786 return FALSE;
787 else if (is_ascii_string(str))
788 return TRUE;
789 return FALSE;
790}
791
792static inline void
793str_mod_check(VALUE s, const char *p, long len)
794{
795 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
796 rb_raise(rb_eRuntimeError, "string modified");
797 }
798}
799
800static size_t
801str_capacity(VALUE str, const int termlen)
802{
803 if (STR_EMBED_P(str)) {
804 return str_embed_capa(str) - termlen;
805 }
806 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
807 return RSTRING(str)->len;
808 }
809 else {
810 return RSTRING(str)->as.heap.aux.capa;
811 }
812}
813
814size_t
816{
817 return str_capacity(str, TERM_LEN(str));
818}
819
820static inline void
821must_not_null(const char *ptr)
822{
823 if (!ptr) {
824 rb_raise(rb_eArgError, "NULL pointer given");
825 }
826}
827
828static inline VALUE
829str_alloc_embed(VALUE klass, size_t capa)
830{
831 size_t size = rb_str_embed_size(capa);
832 assert(size > 0);
833 assert(rb_gc_size_allocatable_p(size));
834
835 NEWOBJ_OF(str, struct RString, klass,
837
838 return (VALUE)str;
839}
840
841static inline VALUE
842str_alloc_heap(VALUE klass)
843{
844 NEWOBJ_OF(str, struct RString, klass,
845 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
846
847 return (VALUE)str;
848}
849
850static inline VALUE
851empty_str_alloc(VALUE klass)
852{
853 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
854 VALUE str = str_alloc_embed(klass, 0);
855 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
856 return str;
857}
858
859static VALUE
860str_new0(VALUE klass, const char *ptr, long len, int termlen)
861{
862 VALUE str;
863
864 if (len < 0) {
865 rb_raise(rb_eArgError, "negative string size (or size too big)");
866 }
867
868 RUBY_DTRACE_CREATE_HOOK(STRING, len);
869
870 if (STR_EMBEDDABLE_P(len, termlen)) {
871 str = str_alloc_embed(klass, len + termlen);
872 if (len == 0) {
874 }
875 }
876 else {
877 str = str_alloc_heap(klass);
878 RSTRING(str)->as.heap.aux.capa = len;
879 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
880 * integer overflow. If we can STATIC_ASSERT that, the following
881 * mul_add_mul can be reverted to a simple ALLOC_N. */
882 RSTRING(str)->as.heap.ptr =
883 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
884 }
885 if (ptr) {
886 memcpy(RSTRING_PTR(str), ptr, len);
887 }
888 STR_SET_LEN(str, len);
889 TERM_FILL(RSTRING_PTR(str) + len, termlen);
890 return str;
891}
892
893static VALUE
894str_new(VALUE klass, const char *ptr, long len)
895{
896 return str_new0(klass, ptr, len, 1);
897}
898
899VALUE
900rb_str_new(const char *ptr, long len)
901{
902 return str_new(rb_cString, ptr, len);
903}
904
905VALUE
906rb_usascii_str_new(const char *ptr, long len)
907{
908 VALUE str = rb_str_new(ptr, len);
909 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
910 return str;
911}
912
913VALUE
914rb_utf8_str_new(const char *ptr, long len)
915{
916 VALUE str = str_new(rb_cString, ptr, len);
917 rb_enc_associate_index(str, rb_utf8_encindex());
918 return str;
919}
920
921VALUE
922rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
923{
924 VALUE str;
925
926 if (!enc) return rb_str_new(ptr, len);
927
928 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
929 rb_enc_associate(str, enc);
930 return str;
931}
932
933VALUE
934rb_str_new_cstr(const char *ptr)
935{
936 must_not_null(ptr);
937 /* rb_str_new_cstr() can take pointer from non-malloc-generated
938 * memory regions, and that cannot be detected by the MSAN. Just
939 * trust the programmer that the argument passed here is a sane C
940 * string. */
941 __msan_unpoison_string(ptr);
942 return rb_str_new(ptr, strlen(ptr));
943}
944
945VALUE
947{
948 VALUE str = rb_str_new_cstr(ptr);
949 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
950 return str;
951}
952
953VALUE
954rb_utf8_str_new_cstr(const char *ptr)
955{
956 VALUE str = rb_str_new_cstr(ptr);
957 rb_enc_associate_index(str, rb_utf8_encindex());
958 return str;
959}
960
961VALUE
962rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
963{
964 must_not_null(ptr);
965 if (rb_enc_mbminlen(enc) != 1) {
966 rb_raise(rb_eArgError, "wchar encoding given");
967 }
968 return rb_enc_str_new(ptr, strlen(ptr), enc);
969}
970
971static VALUE
972str_new_static(VALUE klass, const char *ptr, long len, int encindex)
973{
974 VALUE str;
975
976 if (len < 0) {
977 rb_raise(rb_eArgError, "negative string size (or size too big)");
978 }
979
980 if (!ptr) {
981 rb_encoding *enc = rb_enc_get_from_index(encindex);
982 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
983 }
984 else {
985 RUBY_DTRACE_CREATE_HOOK(STRING, len);
986 str = str_alloc_heap(klass);
987 RSTRING(str)->len = len;
988 RSTRING(str)->as.heap.ptr = (char *)ptr;
989 RSTRING(str)->as.heap.aux.capa = len;
990 RBASIC(str)->flags |= STR_NOFREE;
991 }
992 rb_enc_associate_index(str, encindex);
993 return str;
994}
995
996VALUE
997rb_str_new_static(const char *ptr, long len)
998{
999 return str_new_static(rb_cString, ptr, len, 0);
1000}
1001
1002VALUE
1003rb_usascii_str_new_static(const char *ptr, long len)
1004{
1005 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1006}
1007
1008VALUE
1009rb_utf8_str_new_static(const char *ptr, long len)
1010{
1011 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1012}
1013
1014VALUE
1015rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1016{
1017 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1018}
1019
1020static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1021 rb_encoding *from, rb_encoding *to,
1022 int ecflags, VALUE ecopts);
1023
1024static inline bool
1025is_enc_ascii_string(VALUE str, rb_encoding *enc)
1026{
1027 int encidx = rb_enc_to_index(enc);
1028 if (rb_enc_get_index(str) == encidx)
1029 return is_ascii_string(str);
1030 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1031}
1032
1033VALUE
1034rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1035{
1036 long len;
1037 const char *ptr;
1038 VALUE newstr;
1039
1040 if (!to) return str;
1041 if (!from) from = rb_enc_get(str);
1042 if (from == to) return str;
1043 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1044 rb_is_ascii8bit_enc(to)) {
1045 if (STR_ENC_GET(str) != to) {
1046 str = rb_str_dup(str);
1047 rb_enc_associate(str, to);
1048 }
1049 return str;
1050 }
1051
1052 RSTRING_GETMEM(str, ptr, len);
1053 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1054 from, to, ecflags, ecopts);
1055 if (NIL_P(newstr)) {
1056 /* some error, return original */
1057 return str;
1058 }
1059 return newstr;
1060}
1061
1062VALUE
1063rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1064 rb_encoding *from, int ecflags, VALUE ecopts)
1065{
1066 long olen;
1067
1068 olen = RSTRING_LEN(newstr);
1069 if (ofs < -olen || olen < ofs)
1070 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1071 if (ofs < 0) ofs += olen;
1072 if (!from) {
1073 STR_SET_LEN(newstr, ofs);
1074 return rb_str_cat(newstr, ptr, len);
1075 }
1076
1077 rb_str_modify(newstr);
1078 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1079 rb_enc_get(newstr),
1080 ecflags, ecopts);
1081}
1082
1083VALUE
1084rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1085{
1086 STR_SET_LEN(str, 0);
1087 rb_enc_associate(str, enc);
1088 rb_str_cat(str, ptr, len);
1089 return str;
1090}
1091
1092static VALUE
1093str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1094 rb_encoding *from, rb_encoding *to,
1095 int ecflags, VALUE ecopts)
1096{
1097 rb_econv_t *ec;
1099 long olen;
1100 VALUE econv_wrapper;
1101 const unsigned char *start, *sp;
1102 unsigned char *dest, *dp;
1103 size_t converted_output = (size_t)ofs;
1104
1105 olen = rb_str_capacity(newstr);
1106
1107 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1108 RBASIC_CLEAR_CLASS(econv_wrapper);
1109 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1110 if (!ec) return Qnil;
1111 DATA_PTR(econv_wrapper) = ec;
1112
1113 sp = (unsigned char*)ptr;
1114 start = sp;
1115 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1116 (dp = dest + converted_output),
1117 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1119 /* destination buffer short */
1120 size_t converted_input = sp - start;
1121 size_t rest = len - converted_input;
1122 converted_output = dp - dest;
1123 rb_str_set_len(newstr, converted_output);
1124 if (converted_input && converted_output &&
1125 rest < (LONG_MAX / converted_output)) {
1126 rest = (rest * converted_output) / converted_input;
1127 }
1128 else {
1129 rest = olen;
1130 }
1131 olen += rest < 2 ? 2 : rest;
1132 rb_str_resize(newstr, olen);
1133 }
1134 DATA_PTR(econv_wrapper) = 0;
1135 rb_econv_close(ec);
1136 switch (ret) {
1137 case econv_finished:
1138 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1139 rb_str_set_len(newstr, len);
1140 rb_enc_associate(newstr, to);
1141 return newstr;
1142
1143 default:
1144 return Qnil;
1145 }
1146}
1147
1148VALUE
1149rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
1150{
1151 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1152}
1153
1154VALUE
1155rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1156{
1157 rb_encoding *ienc;
1158 VALUE str;
1159 const int eidx = rb_enc_to_index(eenc);
1160
1161 if (!ptr) {
1162 return rb_enc_str_new(ptr, len, eenc);
1163 }
1164
1165 /* ASCII-8BIT case, no conversion */
1166 if ((eidx == rb_ascii8bit_encindex()) ||
1167 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1168 return rb_str_new(ptr, len);
1169 }
1170 /* no default_internal or same encoding, no conversion */
1171 ienc = rb_default_internal_encoding();
1172 if (!ienc || eenc == ienc) {
1173 return rb_enc_str_new(ptr, len, eenc);
1174 }
1175 /* ASCII compatible, and ASCII only string, no conversion in
1176 * default_internal */
1177 if ((eidx == rb_ascii8bit_encindex()) ||
1178 (eidx == rb_usascii_encindex()) ||
1179 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1180 return rb_enc_str_new(ptr, len, ienc);
1181 }
1182 /* convert from the given encoding to default_internal */
1183 str = rb_enc_str_new(NULL, 0, ienc);
1184 /* when the conversion failed for some reason, just ignore the
1185 * default_internal and result in the given encoding as-is. */
1186 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1187 rb_str_initialize(str, ptr, len, eenc);
1188 }
1189 return str;
1190}
1191
1192VALUE
1193rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1194{
1195 int eidx = rb_enc_to_index(eenc);
1196 if (eidx == rb_usascii_encindex() &&
1197 !is_ascii_string(str)) {
1198 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1199 return str;
1200 }
1201 rb_enc_associate_index(str, eidx);
1202 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1203}
1204
1205VALUE
1206rb_external_str_new(const char *ptr, long len)
1207{
1208 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1209}
1210
1211VALUE
1213{
1214 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1215}
1216
1217VALUE
1218rb_locale_str_new(const char *ptr, long len)
1219{
1220 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1221}
1222
1223VALUE
1225{
1226 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1227}
1228
1229VALUE
1230rb_filesystem_str_new(const char *ptr, long len)
1231{
1232 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1233}
1234
1235VALUE
1236rb_filesystem_str_new_cstr(const char *ptr)
1237{
1238 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1239}
1240
1241VALUE
1243{
1244 return rb_str_export_to_enc(str, rb_default_external_encoding());
1245}
1246
1247VALUE
1249{
1250 return rb_str_export_to_enc(str, rb_locale_encoding());
1251}
1252
1253VALUE
1254rb_str_export_to_enc(VALUE str, rb_encoding *enc)
1255{
1256 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1257}
1258
1259static VALUE
1260str_replace_shared_without_enc(VALUE str2, VALUE str)
1261{
1262 const int termlen = TERM_LEN(str);
1263 char *ptr;
1264 long len;
1265
1266 RSTRING_GETMEM(str, ptr, len);
1267 if (str_embed_capa(str2) >= len + termlen) {
1268 char *ptr2 = RSTRING(str2)->as.embed.ary;
1269 STR_SET_EMBED(str2);
1270 memcpy(ptr2, RSTRING_PTR(str), len);
1271 TERM_FILL(ptr2+len, termlen);
1272 }
1273 else {
1274 VALUE root;
1275 if (STR_SHARED_P(str)) {
1276 root = RSTRING(str)->as.heap.aux.shared;
1277 RSTRING_GETMEM(str, ptr, len);
1278 }
1279 else {
1280 root = rb_str_new_frozen(str);
1281 RSTRING_GETMEM(root, ptr, len);
1282 }
1283 assert(OBJ_FROZEN(root));
1284 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1285 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1286 rb_fatal("about to free a possible shared root");
1287 }
1288 char *ptr2 = STR_HEAP_PTR(str2);
1289 if (ptr2 != ptr) {
1290 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1291 }
1292 }
1293 FL_SET(str2, STR_NOEMBED);
1294 RSTRING(str2)->as.heap.ptr = ptr;
1295 STR_SET_SHARED(str2, root);
1296 }
1297
1298 STR_SET_LEN(str2, len);
1299
1300 return str2;
1301}
1302
1303static VALUE
1304str_replace_shared(VALUE str2, VALUE str)
1305{
1306 str_replace_shared_without_enc(str2, str);
1307 rb_enc_cr_str_exact_copy(str2, str);
1308 return str2;
1309}
1310
1311static VALUE
1312str_new_shared(VALUE klass, VALUE str)
1313{
1314 return str_replace_shared(str_alloc_heap(klass), str);
1315}
1316
1317VALUE
1319{
1320 return str_new_shared(rb_obj_class(str), str);
1321}
1322
1323VALUE
1324rb_str_new_frozen(VALUE orig)
1325{
1326 if (OBJ_FROZEN(orig)) return orig;
1327 return str_new_frozen(rb_obj_class(orig), orig);
1328}
1329
1330static VALUE
1331rb_str_new_frozen_String(VALUE orig)
1332{
1333 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1334 return str_new_frozen(rb_cString, orig);
1335}
1336
1337VALUE
1338rb_str_tmp_frozen_acquire(VALUE orig)
1339{
1340 if (OBJ_FROZEN_RAW(orig)) return orig;
1341 return str_new_frozen_buffer(0, orig, FALSE);
1342}
1343
1344VALUE
1345rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1346{
1347 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1348 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1349
1350 VALUE str = str_alloc_heap(0);
1351 OBJ_FREEZE(str);
1352 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1353 FL_SET(str, STR_SHARED_ROOT);
1354
1355 size_t capa = str_capacity(orig, TERM_LEN(orig));
1356
1357 /* If the string is embedded then we want to create a copy that is heap
1358 * allocated. If the string is shared then the shared root must be
1359 * embedded, so we want to create a copy. If the string is a shared root
1360 * then it must be embedded, so we want to create a copy. */
1361 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1362 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1363 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1364 }
1365 else {
1366 /* orig must be heap allocated and not shared, so we can safely transfer
1367 * the pointer to str. */
1368 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1369 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1370 RBASIC(orig)->flags &= ~STR_NOFREE;
1371 STR_SET_SHARED(orig, str);
1372 }
1373
1374 RSTRING(str)->len = RSTRING(orig)->len;
1375 RSTRING(str)->as.heap.aux.capa = capa;
1376
1377 return str;
1378}
1379
1380void
1381rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1382{
1383 if (RBASIC_CLASS(tmp) != 0)
1384 return;
1385
1386 if (STR_EMBED_P(tmp)) {
1387 assert(OBJ_FROZEN_RAW(tmp));
1388 }
1389 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1390 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1391 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1392
1393 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1394 assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1395 assert(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1396
1397 /* Unshare orig since the root (tmp) only has this one child. */
1398 FL_UNSET_RAW(orig, STR_SHARED);
1399 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1400 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1401 assert(OBJ_FROZEN_RAW(tmp));
1402
1403 /* Make tmp embedded and empty so it is safe for sweeping. */
1404 STR_SET_EMBED(tmp);
1405 STR_SET_LEN(tmp, 0);
1406 }
1407 }
1408}
1409
1410static VALUE
1411str_new_frozen(VALUE klass, VALUE orig)
1412{
1413 return str_new_frozen_buffer(klass, orig, TRUE);
1414}
1415
1416static VALUE
1417heap_str_make_shared(VALUE klass, VALUE orig)
1418{
1419 assert(!STR_EMBED_P(orig));
1420 assert(!STR_SHARED_P(orig));
1421
1422 VALUE str = str_alloc_heap(klass);
1423 STR_SET_LEN(str, RSTRING_LEN(orig));
1424 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1425 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1426 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1427 RBASIC(orig)->flags &= ~STR_NOFREE;
1428 STR_SET_SHARED(orig, str);
1429 if (klass == 0)
1430 FL_UNSET_RAW(str, STR_BORROWED);
1431 return str;
1432}
1433
1434static VALUE
1435str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1436{
1437 VALUE str;
1438
1439 long len = RSTRING_LEN(orig);
1440 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1441
1442 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1443 str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
1444 assert(STR_EMBED_P(str));
1445 }
1446 else {
1447 if (FL_TEST_RAW(orig, STR_SHARED)) {
1448 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1449 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1450 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1451 assert(ofs >= 0);
1452 assert(rest >= 0);
1453 assert(ofs + rest <= RSTRING_LEN(shared));
1454 assert(OBJ_FROZEN(shared));
1455
1456 if ((ofs > 0) || (rest > 0) ||
1457 (klass != RBASIC(shared)->klass) ||
1458 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1459 str = str_new_shared(klass, shared);
1460 assert(!STR_EMBED_P(str));
1461 RSTRING(str)->as.heap.ptr += ofs;
1462 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1463 }
1464 else {
1465 if (RBASIC_CLASS(shared) == 0)
1466 FL_SET_RAW(shared, STR_BORROWED);
1467 return shared;
1468 }
1469 }
1470 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1471 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1472 STR_SET_EMBED(str);
1473 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1474 STR_SET_LEN(str, RSTRING_LEN(orig));
1475 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1476 }
1477 else {
1478 str = heap_str_make_shared(klass, orig);
1479 }
1480 }
1481
1482 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1483 OBJ_FREEZE(str);
1484 return str;
1485}
1486
1487VALUE
1488rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1489{
1490 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1491}
1492
1493static VALUE
1494str_new_empty_String(VALUE str)
1495{
1496 VALUE v = rb_str_new(0, 0);
1497 rb_enc_copy(v, str);
1498 return v;
1499}
1500
1501#define STR_BUF_MIN_SIZE 63
1502
1503VALUE
1504rb_str_buf_new(long capa)
1505{
1506 if (STR_EMBEDDABLE_P(capa, 1)) {
1507 return str_alloc_embed(rb_cString, capa + 1);
1508 }
1509
1510 VALUE str = str_alloc_heap(rb_cString);
1511
1512 RSTRING(str)->as.heap.aux.capa = capa;
1513 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1514 RSTRING(str)->as.heap.ptr[0] = '\0';
1515
1516 return str;
1517}
1518
1519VALUE
1520rb_str_buf_new_cstr(const char *ptr)
1521{
1522 VALUE str;
1523 long len = strlen(ptr);
1524
1525 str = rb_str_buf_new(len);
1526 rb_str_buf_cat(str, ptr, len);
1527
1528 return str;
1529}
1530
1531VALUE
1533{
1534 return str_new(0, 0, len);
1535}
1536
1537void
1539{
1540 if (FL_TEST(str, RSTRING_FSTR)) {
1541 st_data_t fstr = (st_data_t)str;
1542
1543 RB_VM_LOCK_ENTER();
1544 {
1545 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1546 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1547 }
1548 RB_VM_LOCK_LEAVE();
1549 }
1550
1551 if (STR_EMBED_P(str)) {
1552 RB_DEBUG_COUNTER_INC(obj_str_embed);
1553 }
1554 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1555 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1556 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1557 }
1558 else {
1559 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1560 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1561 }
1562}
1563
1564RUBY_FUNC_EXPORTED size_t
1565rb_str_memsize(VALUE str)
1566{
1567 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1568 return STR_HEAP_SIZE(str);
1569 }
1570 else {
1571 return 0;
1572 }
1573}
1574
1575VALUE
1577{
1578 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1579}
1580
1581static inline void str_discard(VALUE str);
1582static void str_shared_replace(VALUE str, VALUE str2);
1583
1584void
1586{
1587 if (str != str2) str_shared_replace(str, str2);
1588}
1589
1590static void
1591str_shared_replace(VALUE str, VALUE str2)
1592{
1593 rb_encoding *enc;
1594 int cr;
1595 int termlen;
1596
1597 RUBY_ASSERT(str2 != str);
1598 enc = STR_ENC_GET(str2);
1599 cr = ENC_CODERANGE(str2);
1600 str_discard(str);
1601 termlen = rb_enc_mbminlen(enc);
1602
1603 STR_SET_LEN(str, RSTRING_LEN(str2));
1604
1605 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1606 STR_SET_EMBED(str);
1607 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1608 rb_enc_associate(str, enc);
1609 ENC_CODERANGE_SET(str, cr);
1610 }
1611 else {
1612 if (STR_EMBED_P(str2)) {
1613 assert(!FL_TEST(str2, STR_SHARED));
1614 long len = RSTRING_LEN(str2);
1615 assert(len + termlen <= str_embed_capa(str2));
1616
1617 char *new_ptr = ALLOC_N(char, len + termlen);
1618 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1619 RSTRING(str2)->as.heap.ptr = new_ptr;
1620 STR_SET_LEN(str2, len);
1621 RSTRING(str2)->as.heap.aux.capa = len;
1622 STR_SET_NOEMBED(str2);
1623 }
1624
1625 STR_SET_NOEMBED(str);
1626 FL_UNSET(str, STR_SHARED);
1627 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1628
1629 if (FL_TEST(str2, STR_SHARED)) {
1630 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1631 STR_SET_SHARED(str, shared);
1632 }
1633 else {
1634 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1635 }
1636
1637 /* abandon str2 */
1638 STR_SET_EMBED(str2);
1639 RSTRING_PTR(str2)[0] = 0;
1640 STR_SET_LEN(str2, 0);
1641 rb_enc_associate(str, enc);
1642 ENC_CODERANGE_SET(str, cr);
1643 }
1644}
1645
1646VALUE
1647rb_obj_as_string(VALUE obj)
1648{
1649 VALUE str;
1650
1651 if (RB_TYPE_P(obj, T_STRING)) {
1652 return obj;
1653 }
1654 str = rb_funcall(obj, idTo_s, 0);
1655 return rb_obj_as_string_result(str, obj);
1656}
1657
1658VALUE
1659rb_obj_as_string_result(VALUE str, VALUE obj)
1660{
1661 if (!RB_TYPE_P(str, T_STRING))
1662 return rb_any_to_s(obj);
1663 return str;
1664}
1665
1666static VALUE
1667str_replace(VALUE str, VALUE str2)
1668{
1669 long len;
1670
1671 len = RSTRING_LEN(str2);
1672 if (STR_SHARED_P(str2)) {
1673 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1674 assert(OBJ_FROZEN(shared));
1675 STR_SET_NOEMBED(str);
1676 STR_SET_LEN(str, len);
1677 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1678 STR_SET_SHARED(str, shared);
1679 rb_enc_cr_str_exact_copy(str, str2);
1680 }
1681 else {
1682 str_replace_shared(str, str2);
1683 }
1684
1685 return str;
1686}
1687
1688static inline VALUE
1689ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1690{
1691 size_t size = rb_str_embed_size(capa);
1692 assert(size > 0);
1693 assert(rb_gc_size_allocatable_p(size));
1694
1695 NEWOBJ_OF(str, struct RString, klass,
1697
1698 return (VALUE)str;
1699}
1700
1701static inline VALUE
1702ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1703{
1704 NEWOBJ_OF(str, struct RString, klass,
1705 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1706
1707 return (VALUE)str;
1708}
1709
1710static inline VALUE
1711str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1712{
1713 const VALUE flag_mask =
1715 FL_FREEZE
1716 ;
1717 VALUE flags = FL_TEST_RAW(str, flag_mask);
1718 int encidx = 0;
1719 if (STR_EMBED_P(str)) {
1720 long len = RSTRING_LEN(str);
1721
1722 assert(STR_EMBED_P(dup));
1723 assert(str_embed_capa(dup) >= len + 1);
1724 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1725 }
1726 else {
1727 VALUE root = str;
1728 if (FL_TEST_RAW(str, STR_SHARED)) {
1729 root = RSTRING(str)->as.heap.aux.shared;
1730 }
1731 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1732 root = str = str_new_frozen(klass, str);
1733 flags = FL_TEST_RAW(str, flag_mask);
1734 }
1735 assert(!STR_SHARED_P(root));
1736 assert(RB_OBJ_FROZEN_RAW(root));
1737
1738 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1739 FL_SET(root, STR_SHARED_ROOT);
1740 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1741 flags |= RSTRING_NOEMBED | STR_SHARED;
1742 }
1743
1744 STR_SET_LEN(dup, RSTRING_LEN(str));
1745
1746 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1747 encidx = rb_enc_get_index(str);
1748 flags &= ~ENCODING_MASK;
1749 }
1750 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1751 if (encidx) rb_enc_associate_index(dup, encidx);
1752 return dup;
1753}
1754
1755static inline VALUE
1756ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1757{
1758 VALUE dup;
1759 if (STR_EMBED_P(str)) {
1760 dup = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1761 }
1762 else {
1763 dup = ec_str_alloc_heap(ec, klass);
1764 }
1765
1766 return str_duplicate_setup(klass, str, dup);
1767}
1768
1769static inline VALUE
1770str_duplicate(VALUE klass, VALUE str)
1771{
1772 VALUE dup;
1773 if (STR_EMBED_P(str)) {
1774 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1775 }
1776 else {
1777 dup = str_alloc_heap(klass);
1778 }
1779
1780 return str_duplicate_setup(klass, str, dup);
1781}
1782
1783VALUE
1784rb_str_dup(VALUE str)
1785{
1786 return str_duplicate(rb_obj_class(str), str);
1787}
1788
1789/* :nodoc: */
1790VALUE
1791rb_str_dup_m(VALUE str)
1792{
1793 if (LIKELY(BARE_STRING_P(str))) {
1794 return str_duplicate(rb_obj_class(str), str);
1795 }
1796 else {
1797 return rb_obj_dup(str);
1798 }
1799}
1800
1801VALUE
1803{
1804 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1805 return str_duplicate(rb_cString, str);
1806}
1807
1808VALUE
1809rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1810{
1811 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1812 return ec_str_duplicate(ec, rb_cString, str);
1813}
1814
1815/*
1816 *
1817 * call-seq:
1818 * String.new(string = '', **opts) -> new_string
1819 *
1820 * :include: doc/string/new.rdoc
1821 *
1822 */
1823
1824static VALUE
1825rb_str_init(int argc, VALUE *argv, VALUE str)
1826{
1827 static ID keyword_ids[2];
1828 VALUE orig, opt, venc, vcapa;
1829 VALUE kwargs[2];
1830 rb_encoding *enc = 0;
1831 int n;
1832
1833 if (!keyword_ids[0]) {
1834 keyword_ids[0] = rb_id_encoding();
1835 CONST_ID(keyword_ids[1], "capacity");
1836 }
1837
1838 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1839 if (!NIL_P(opt)) {
1840 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1841 venc = kwargs[0];
1842 vcapa = kwargs[1];
1843 if (!UNDEF_P(venc) && !NIL_P(venc)) {
1844 enc = rb_to_encoding(venc);
1845 }
1846 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
1847 long capa = NUM2LONG(vcapa);
1848 long len = 0;
1849 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1850
1851 if (capa < STR_BUF_MIN_SIZE) {
1852 capa = STR_BUF_MIN_SIZE;
1853 }
1854 if (n == 1) {
1855 StringValue(orig);
1856 len = RSTRING_LEN(orig);
1857 if (capa < len) {
1858 capa = len;
1859 }
1860 if (orig == str) n = 0;
1861 }
1862 str_modifiable(str);
1863 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1864 /* make noembed always */
1865 const size_t size = (size_t)capa + termlen;
1866 const char *const old_ptr = RSTRING_PTR(str);
1867 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
1868 char *new_ptr = ALLOC_N(char, size);
1869 if (STR_EMBED_P(str)) RUBY_ASSERT(osize <= str_embed_capa(str));
1870 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1871 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
1872 RSTRING(str)->as.heap.ptr = new_ptr;
1873 }
1874 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1875 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1876 (size_t)capa + termlen, STR_HEAP_SIZE(str));
1877 }
1878 STR_SET_LEN(str, len);
1879 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1880 if (n == 1) {
1881 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1882 rb_enc_cr_str_exact_copy(str, orig);
1883 }
1884 FL_SET(str, STR_NOEMBED);
1885 RSTRING(str)->as.heap.aux.capa = capa;
1886 }
1887 else if (n == 1) {
1888 rb_str_replace(str, orig);
1889 }
1890 if (enc) {
1891 rb_enc_associate(str, enc);
1893 }
1894 }
1895 else if (n == 1) {
1896 rb_str_replace(str, orig);
1897 }
1898 return str;
1899}
1900
1901/* :nodoc: */
1902static VALUE
1903rb_str_s_new(int argc, VALUE *argv, VALUE klass)
1904{
1905 if (klass != rb_cString) {
1906 return rb_class_new_instance_pass_kw(argc, argv, klass);
1907 }
1908
1909 static ID keyword_ids[2];
1910 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
1911 VALUE kwargs[2];
1912 rb_encoding *enc = NULL;
1913
1914 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1915 if (NIL_P(opt)) {
1916 return rb_class_new_instance_pass_kw(argc, argv, klass);
1917 }
1918
1919 keyword_ids[0] = rb_id_encoding();
1920 CONST_ID(keyword_ids[1], "capacity");
1921 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1922 encoding = kwargs[0];
1923 capacity = kwargs[1];
1924
1925 int termlen = 1;
1926
1927 if (n == 1) {
1928 orig = StringValue(orig);
1929 }
1930 else {
1931 orig = Qnil;
1932 }
1933
1934 if (UNDEF_P(encoding)) {
1935 if (!NIL_P(orig)) {
1936 encoding = rb_obj_encoding(orig);
1937 }
1938 }
1939
1940 if (!UNDEF_P(encoding)) {
1941 enc = rb_to_encoding(encoding);
1942 termlen = rb_enc_mbminlen(enc);
1943 }
1944
1945 // If capacity is nil, we're basically just duping `orig`.
1946 if (UNDEF_P(capacity)) {
1947 if (NIL_P(orig)) {
1948 VALUE empty_str = str_new(klass, "", 0);
1949 if (enc) {
1950 rb_enc_associate(empty_str, enc);
1951 }
1952 return empty_str;
1953 }
1954 VALUE copy = str_duplicate(klass, orig);
1955 rb_enc_associate(copy, enc);
1956 ENC_CODERANGE_CLEAR(copy);
1957 return copy;
1958 }
1959
1960 long capa = 0;
1961 capa = NUM2LONG(capacity);
1962 if (capa < 0) {
1963 capa = 0;
1964 }
1965
1966 if (!NIL_P(orig)) {
1967 long orig_capa = rb_str_capacity(orig);
1968 if (orig_capa > capa) {
1969 capa = orig_capa;
1970 }
1971 }
1972
1973 VALUE str = str_new0(klass, NULL, capa, termlen);
1974 STR_SET_LEN(str, 0);
1975 TERM_FILL(RSTRING_PTR(str), termlen);
1976
1977 if (enc) {
1978 rb_enc_associate(str, enc);
1979 }
1980
1981 if (!NIL_P(orig)) {
1982 rb_str_buf_append(str, orig);
1983 }
1984
1985 return str;
1986}
1987
#ifdef NONASCII_MASK
/* True unless the top two bits of +c+ are 10, i.e. unless +c+ is a UTF-8
 * continuation byte. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * A UTF-8 leading byte looks like 0xxxxxxx or 11xxxxxx
 * (see https://en.wikipedia.org/wiki/UTF-8), so a single byte can be
 * tested like this:
 *
 *   if (!(byte & 0x80))
 *       byte |= 0x40;          // turn on bit6
 *   return ((byte>>6) & 1);    // bit6 tells whether the byte is leading
 *
 * This function applies that test to every byte of the word at +s+ at
 * once and returns how many of them are leading bytes.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t lanes = *s;

    /* Move the "is leading" verdict of each byte down to its bit0. */
    lanes = (lanes>>6) | (~lanes>>7);
    lanes &= NONASCII_MASK >> 7;

    /* Sum the per-byte bits. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(lanes);
#else
    lanes += (lanes>>8);
    lanes += (lanes>>16);
# if SIZEOF_VOIDP == 8
    lanes += (lanes>>32);
# endif
    return (lanes&0xF);
#endif
}
#endif
2027
2028static inline long
2029enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2030{
2031 long c;
2032 const char *q;
2033
2034 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2035 long diff = (long)(e - p);
2036 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2037 }
2038#ifdef NONASCII_MASK
2039 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2040 uintptr_t len = 0;
2041 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2042 const uintptr_t *s, *t;
2043 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2044 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2045 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2046 while (p < (const char *)s) {
2047 if (is_utf8_lead_byte(*p)) len++;
2048 p++;
2049 }
2050 while (s < t) {
2051 len += count_utf8_lead_bytes_with_word(s);
2052 s++;
2053 }
2054 p = (const char *)s;
2055 }
2056 while (p < e) {
2057 if (is_utf8_lead_byte(*p)) len++;
2058 p++;
2059 }
2060 return (long)len;
2061 }
2062#endif
2063 else if (rb_enc_asciicompat(enc)) {
2064 c = 0;
2065 if (ENC_CODERANGE_CLEAN_P(cr)) {
2066 while (p < e) {
2067 if (ISASCII(*p)) {
2068 q = search_nonascii(p, e);
2069 if (!q)
2070 return c + (e - p);
2071 c += q - p;
2072 p = q;
2073 }
2074 p += rb_enc_fast_mbclen(p, e, enc);
2075 c++;
2076 }
2077 }
2078 else {
2079 while (p < e) {
2080 if (ISASCII(*p)) {
2081 q = search_nonascii(p, e);
2082 if (!q)
2083 return c + (e - p);
2084 c += q - p;
2085 p = q;
2086 }
2087 p += rb_enc_mbclen(p, e, enc);
2088 c++;
2089 }
2090 }
2091 return c;
2092 }
2093
2094 for (c=0; p<e; c++) {
2095 p += rb_enc_mbclen(p, e, enc);
2096 }
2097 return c;
2098}
2099
2100long
2101rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2102{
2103 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2104}
2105
2106/* To get strlen with cr
2107 * Note that given cr is not used.
2108 */
2109long
2110rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2111{
2112 long c;
2113 const char *q;
2114 int ret;
2115
2116 *cr = 0;
2117 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2118 long diff = (long)(e - p);
2119 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2120 }
2121 else if (rb_enc_asciicompat(enc)) {
2122 c = 0;
2123 while (p < e) {
2124 if (ISASCII(*p)) {
2125 q = search_nonascii(p, e);
2126 if (!q) {
2127 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2128 return c + (e - p);
2129 }
2130 c += q - p;
2131 p = q;
2132 }
2133 ret = rb_enc_precise_mbclen(p, e, enc);
2134 if (MBCLEN_CHARFOUND_P(ret)) {
2135 *cr |= ENC_CODERANGE_VALID;
2136 p += MBCLEN_CHARFOUND_LEN(ret);
2137 }
2138 else {
2140 p++;
2141 }
2142 c++;
2143 }
2144 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2145 return c;
2146 }
2147
2148 for (c=0; p<e; c++) {
2149 ret = rb_enc_precise_mbclen(p, e, enc);
2150 if (MBCLEN_CHARFOUND_P(ret)) {
2151 *cr |= ENC_CODERANGE_VALID;
2152 p += MBCLEN_CHARFOUND_LEN(ret);
2153 }
2154 else {
2156 if (p + rb_enc_mbminlen(enc) <= e)
2157 p += rb_enc_mbminlen(enc);
2158 else
2159 p = e;
2160 }
2161 }
2162 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2163 return c;
2164}
2165
2166/* enc must be str's enc or rb_enc_check(str, str2) */
2167static long
2168str_strlen(VALUE str, rb_encoding *enc)
2169{
2170 const char *p, *e;
2171 int cr;
2172
2173 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2174 if (!enc) enc = STR_ENC_GET(str);
2175 p = RSTRING_PTR(str);
2176 e = RSTRING_END(str);
2177 cr = ENC_CODERANGE(str);
2178
2179 if (cr == ENC_CODERANGE_UNKNOWN) {
2180 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2181 if (cr) ENC_CODERANGE_SET(str, cr);
2182 return n;
2183 }
2184 else {
2185 return enc_strlen(p, e, enc, cr);
2186 }
2187}
2188
2189long
2191{
2192 return str_strlen(str, NULL);
2193}
2194
2195/*
2196 * call-seq:
2197 * length -> integer
2198 *
2199 * :include: doc/string/length.rdoc
2200 *
2201 */
2202
2203VALUE
2205{
2206 return LONG2NUM(str_strlen(str, NULL));
2207}
2208
2209/*
2210 * call-seq:
2211 * bytesize -> integer
2212 *
2213 * :include: doc/string/bytesize.rdoc
2214 *
2215 */
2216
2217VALUE
2218rb_str_bytesize(VALUE str)
2219{
2220 return LONG2NUM(RSTRING_LEN(str));
2221}
2222
2223/*
2224 * call-seq:
2225 * empty? -> true or false
2226 *
2227 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2228 *
2229 * "hello".empty? # => false
2230 * " ".empty? # => false
2231 * "".empty? # => true
2232 *
2233 */
2234
2235static VALUE
2236rb_str_empty(VALUE str)
2237{
2238 return RBOOL(RSTRING_LEN(str) == 0);
2239}
2240
2241/*
2242 * call-seq:
2243 * string + other_string -> new_string
2244 *
2245 * Returns a new \String containing +other_string+ concatenated to +self+:
2246 *
2247 * "Hello from " + self.to_s # => "Hello from main"
2248 *
2249 */
2250
2251VALUE
2253{
2254 VALUE str3;
2255 rb_encoding *enc;
2256 char *ptr1, *ptr2, *ptr3;
2257 long len1, len2;
2258 int termlen;
2259
2260 StringValue(str2);
2261 enc = rb_enc_check_str(str1, str2);
2262 RSTRING_GETMEM(str1, ptr1, len1);
2263 RSTRING_GETMEM(str2, ptr2, len2);
2264 termlen = rb_enc_mbminlen(enc);
2265 if (len1 > LONG_MAX - len2) {
2266 rb_raise(rb_eArgError, "string size too big");
2267 }
2268 str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2269 ptr3 = RSTRING_PTR(str3);
2270 memcpy(ptr3, ptr1, len1);
2271 memcpy(ptr3+len1, ptr2, len2);
2272 TERM_FILL(&ptr3[len1+len2], termlen);
2273
2274 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2276 RB_GC_GUARD(str1);
2277 RB_GC_GUARD(str2);
2278 return str3;
2279}
2280
2281/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2282VALUE
2283rb_str_opt_plus(VALUE str1, VALUE str2)
2284{
2285 assert(RBASIC_CLASS(str1) == rb_cString);
2286 assert(RBASIC_CLASS(str2) == rb_cString);
2287 long len1, len2;
2288 MAYBE_UNUSED(char) *ptr1, *ptr2;
2289 RSTRING_GETMEM(str1, ptr1, len1);
2290 RSTRING_GETMEM(str2, ptr2, len2);
2291 int enc1 = rb_enc_get_index(str1);
2292 int enc2 = rb_enc_get_index(str2);
2293
2294 if (enc1 < 0) {
2295 return Qundef;
2296 }
2297 else if (enc2 < 0) {
2298 return Qundef;
2299 }
2300 else if (enc1 != enc2) {
2301 return Qundef;
2302 }
2303 else if (len1 > LONG_MAX - len2) {
2304 return Qundef;
2305 }
2306 else {
2307 return rb_str_plus(str1, str2);
2308 }
2309
2310}
2311
2312/*
2313 * call-seq:
2314 * string * integer -> new_string
2315 *
2316 * Returns a new \String containing +integer+ copies of +self+:
2317 *
2318 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2319 * "Ho! " * 0 # => ""
2320 *
2321 */
2322
2323VALUE
2325{
2326 VALUE str2;
2327 long n, len;
2328 char *ptr2;
2329 int termlen;
2330
2331 if (times == INT2FIX(1)) {
2332 return str_duplicate(rb_cString, str);
2333 }
2334 if (times == INT2FIX(0)) {
2335 str2 = str_alloc_embed(rb_cString, 0);
2336 rb_enc_copy(str2, str);
2337 return str2;
2338 }
2339 len = NUM2LONG(times);
2340 if (len < 0) {
2341 rb_raise(rb_eArgError, "negative argument");
2342 }
2343 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2344 if (STR_EMBEDDABLE_P(len, 1)) {
2345 str2 = str_alloc_embed(rb_cString, len + 1);
2346 memset(RSTRING_PTR(str2), 0, len + 1);
2347 }
2348 else {
2349 str2 = str_alloc_heap(rb_cString);
2350 RSTRING(str2)->as.heap.aux.capa = len;
2351 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2352 }
2353 STR_SET_LEN(str2, len);
2354 rb_enc_copy(str2, str);
2355 return str2;
2356 }
2357 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2358 rb_raise(rb_eArgError, "argument too big");
2359 }
2360
2361 len *= RSTRING_LEN(str);
2362 termlen = TERM_LEN(str);
2363 str2 = str_new0(rb_cString, 0, len, termlen);
2364 ptr2 = RSTRING_PTR(str2);
2365 if (len) {
2366 n = RSTRING_LEN(str);
2367 memcpy(ptr2, RSTRING_PTR(str), n);
2368 while (n <= len/2) {
2369 memcpy(ptr2 + n, ptr2, n);
2370 n *= 2;
2371 }
2372 memcpy(ptr2 + n, ptr2, len-n);
2373 }
2374 STR_SET_LEN(str2, len);
2375 TERM_FILL(&ptr2[len], termlen);
2376 rb_enc_cr_str_copy_for_substr(str2, str);
2377
2378 return str2;
2379}
2380
2381/*
2382 * call-seq:
2383 * string % object -> new_string
2384 *
2385 * Returns the result of formatting +object+ into the format specification +self+
2386 * (see Kernel#sprintf for formatting details):
2387 *
2388 * "%05d" % 123 # => "00123"
2389 *
2390 * If +self+ contains multiple substitutions, +object+ must be
2391 * an Array or Hash containing the values to be substituted:
2392 *
2393 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2394 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2395 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2396 *
2397 */
2398
2399static VALUE
2400rb_str_format_m(VALUE str, VALUE arg)
2401{
2402 VALUE tmp = rb_check_array_type(arg);
2403
2404 if (!NIL_P(tmp)) {
2405 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2406 }
2407 return rb_str_format(1, &arg, str);
2408}
2409
2410static inline void
2411rb_check_lockedtmp(VALUE str)
2412{
2413 if (FL_TEST(str, STR_TMPLOCK)) {
2414 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2415 }
2416}
2417
2418static inline void
2419str_modifiable(VALUE str)
2420{
2421 rb_check_lockedtmp(str);
2422 rb_check_frozen(str);
2423}
2424
2425static inline int
2426str_dependent_p(VALUE str)
2427{
2428 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2429 return 0;
2430 }
2431 else {
2432 return 1;
2433 }
2434}
2435
2436static inline int
2437str_independent(VALUE str)
2438{
2439 str_modifiable(str);
2440 return !str_dependent_p(str);
2441}
2442
2443static void
2444str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2445{
2446 char *ptr;
2447 char *oldptr;
2448 long capa = len + expand;
2449
2450 if (len > capa) len = capa;
2451
2452 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2453 ptr = RSTRING(str)->as.heap.ptr;
2454 STR_SET_EMBED(str);
2455 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2456 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2457 STR_SET_LEN(str, len);
2458 return;
2459 }
2460
2461 ptr = ALLOC_N(char, (size_t)capa + termlen);
2462 oldptr = RSTRING_PTR(str);
2463 if (oldptr) {
2464 memcpy(ptr, oldptr, len);
2465 }
2466 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2467 xfree(oldptr);
2468 }
2469 STR_SET_NOEMBED(str);
2470 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2471 TERM_FILL(ptr + len, termlen);
2472 RSTRING(str)->as.heap.ptr = ptr;
2473 STR_SET_LEN(str, len);
2474 RSTRING(str)->as.heap.aux.capa = capa;
2475}
2476
2477void
2478rb_str_modify(VALUE str)
2479{
2480 if (!str_independent(str))
2481 str_make_independent(str);
2483}
2484
2485void
2487{
2488 int termlen = TERM_LEN(str);
2489 long len = RSTRING_LEN(str);
2490
2491 if (expand < 0) {
2492 rb_raise(rb_eArgError, "negative expanding string size");
2493 }
2494 if (expand >= LONG_MAX - len) {
2495 rb_raise(rb_eArgError, "string size too big");
2496 }
2497
2498 if (!str_independent(str)) {
2499 str_make_independent_expand(str, len, expand, termlen);
2500 }
2501 else if (expand > 0) {
2502 RESIZE_CAPA_TERM(str, len + expand, termlen);
2503 }
2505}
2506
2507/* As rb_str_modify(), but don't clear coderange */
2508static void
2509str_modify_keep_cr(VALUE str)
2510{
2511 if (!str_independent(str))
2512 str_make_independent(str);
2514 /* Force re-scan later */
2516}
2517
2518static inline void
2519str_discard(VALUE str)
2520{
2521 str_modifiable(str);
2522 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2523 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2524 RSTRING(str)->as.heap.ptr = 0;
2525 STR_SET_LEN(str, 0);
2526 }
2527}
2528
2529void
2531{
2532 rb_encoding *enc = rb_enc_get(str);
2533 if (!enc) {
2534 rb_raise(rb_eTypeError, "not encoding capable object");
2535 }
2536 if (!rb_enc_asciicompat(enc)) {
2537 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2538 }
2539}
2540
2541VALUE
2543{
2544 VALUE s = *ptr;
2545 if (!RB_TYPE_P(s, T_STRING)) {
2546 s = rb_str_to_str(s);
2547 *ptr = s;
2548 }
2549 return s;
2550}
2551
2552char *
2554{
2555 VALUE str = rb_string_value(ptr);
2556 return RSTRING_PTR(str);
2557}
2558
/*
 * Returns 1 when the first +n+ bytes at +s+ are all zero (trivially so
 * for n <= 0), and 0 as soon as a non-zero byte is seen.
 */
static int
zero_filled(const char *s, int n)
{
    while (n > 0) {
        if (*s != '\0') return 0;
        s++;
        n--;
    }
    return 1;
}
2567
2568static const char *
2569str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2570{
2571 const char *e = s + len;
2572
2573 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2574 if (zero_filled(s, minlen)) return s;
2575 }
2576 return 0;
2577}
2578
2579static char *
2580str_fill_term(VALUE str, char *s, long len, int termlen)
2581{
2582 /* This function assumes that (capa + termlen) bytes of memory
2583 * is allocated, like many other functions in this file.
2584 */
2585 if (str_dependent_p(str)) {
2586 if (!zero_filled(s + len, termlen))
2587 str_make_independent_expand(str, len, 0L, termlen);
2588 }
2589 else {
2590 TERM_FILL(s + len, termlen);
2591 return s;
2592 }
2593 return RSTRING_PTR(str);
2594}
2595
2596void
2597rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2598{
2599 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2600 long len = RSTRING_LEN(str);
2601
2602 assert(capa >= len);
2603 if (capa - len < termlen) {
2604 rb_check_lockedtmp(str);
2605 str_make_independent_expand(str, len, 0L, termlen);
2606 }
2607 else if (str_dependent_p(str)) {
2608 if (termlen > oldtermlen)
2609 str_make_independent_expand(str, len, 0L, termlen);
2610 }
2611 else {
2612 if (!STR_EMBED_P(str)) {
2613 /* modify capa instead of realloc */
2614 assert(!FL_TEST((str), STR_SHARED));
2615 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2616 }
2617 if (termlen > oldtermlen) {
2618 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2619 }
2620 }
2621
2622 return;
2623}
2624
2625static char *
2626str_null_check(VALUE str, int *w)
2627{
2628 char *s = RSTRING_PTR(str);
2629 long len = RSTRING_LEN(str);
2630 rb_encoding *enc = rb_enc_get(str);
2631 const int minlen = rb_enc_mbminlen(enc);
2632
2633 if (minlen > 1) {
2634 *w = 1;
2635 if (str_null_char(s, len, minlen, enc)) {
2636 return NULL;
2637 }
2638 return str_fill_term(str, s, len, minlen);
2639 }
2640 *w = 0;
2641 if (!s || memchr(s, 0, len)) {
2642 return NULL;
2643 }
2644 if (s[len]) {
2645 s = str_fill_term(str, s, len, minlen);
2646 }
2647 return s;
2648}
2649
2650char *
2651rb_str_to_cstr(VALUE str)
2652{
2653 int w;
2654 return str_null_check(str, &w);
2655}
2656
2657char *
2659{
2660 VALUE str = rb_string_value(ptr);
2661 int w;
2662 char *s = str_null_check(str, &w);
2663 if (!s) {
2664 if (w) {
2665 rb_raise(rb_eArgError, "string contains null char");
2666 }
2667 rb_raise(rb_eArgError, "string contains null byte");
2668 }
2669 return s;
2670}
2671
2672char *
2673rb_str_fill_terminator(VALUE str, const int newminlen)
2674{
2675 char *s = RSTRING_PTR(str);
2676 long len = RSTRING_LEN(str);
2677 return str_fill_term(str, s, len, newminlen);
2678}
2679
2680VALUE
2682{
2683 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2684 return str;
2685}
2686
2687/*
2688 * call-seq:
2689 * String.try_convert(object) -> object, new_string, or nil
2690 *
2691 * If +object+ is a \String object, returns +object+.
2692 *
2693 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2694 * calls <tt>object.to_str</tt> and returns the result.
2695 *
2696 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2697 *
2698 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2699 */
2700static VALUE
2701rb_str_s_try_convert(VALUE dummy, VALUE str)
2702{
2703 return rb_check_string_type(str);
2704}
2705
2706static char*
2707str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2708{
2709 long nth = *nthp;
2710 if (rb_enc_mbmaxlen(enc) == 1) {
2711 p += nth;
2712 }
2713 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2714 p += nth * rb_enc_mbmaxlen(enc);
2715 }
2716 else if (rb_enc_asciicompat(enc)) {
2717 const char *p2, *e2;
2718 int n;
2719
2720 while (p < e && 0 < nth) {
2721 e2 = p + nth;
2722 if (e < e2) {
2723 *nthp = nth;
2724 return (char *)e;
2725 }
2726 if (ISASCII(*p)) {
2727 p2 = search_nonascii(p, e2);
2728 if (!p2) {
2729 nth -= e2 - p;
2730 *nthp = nth;
2731 return (char *)e2;
2732 }
2733 nth -= p2 - p;
2734 p = p2;
2735 }
2736 n = rb_enc_mbclen(p, e, enc);
2737 p += n;
2738 nth--;
2739 }
2740 *nthp = nth;
2741 if (nth != 0) {
2742 return (char *)e;
2743 }
2744 return (char *)p;
2745 }
2746 else {
2747 while (p < e && nth--) {
2748 p += rb_enc_mbclen(p, e, enc);
2749 }
2750 }
2751 if (p > e) p = e;
2752 *nthp = nth;
2753 return (char*)p;
2754}
2755
2756char*
2757rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2758{
2759 return str_nth_len(p, e, &nth, enc);
2760}
2761
2762static char*
2763str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2764{
2765 if (singlebyte)
2766 p += nth;
2767 else {
2768 p = str_nth_len(p, e, &nth, enc);
2769 }
2770 if (!p) return 0;
2771 if (p > e) p = e;
2772 return (char *)p;
2773}
2774
2775/* char offset to byte offset */
2776static long
2777str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2778{
2779 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2780 if (!pp) return e - p;
2781 return pp - p;
2782}
2783
2784long
2785rb_str_offset(VALUE str, long pos)
2786{
2787 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2788 STR_ENC_GET(str), single_byte_optimizable(str));
2789}
2790
2791#ifdef NONASCII_MASK
2792static char *
2793str_utf8_nth(const char *p, const char *e, long *nthp)
2794{
2795 long nth = *nthp;
2796 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2797 const uintptr_t *s, *t;
2798 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2799 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2800 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2801 while (p < (const char *)s) {
2802 if (is_utf8_lead_byte(*p)) nth--;
2803 p++;
2804 }
2805 do {
2806 nth -= count_utf8_lead_bytes_with_word(s);
2807 s++;
2808 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2809 p = (char *)s;
2810 }
2811 while (p < e) {
2812 if (is_utf8_lead_byte(*p)) {
2813 if (nth == 0) break;
2814 nth--;
2815 }
2816 p++;
2817 }
2818 *nthp = nth;
2819 return (char *)p;
2820}
2821
/* Byte distance from +p+ to the +nth+ UTF-8 character within [p, e). */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *pos = str_utf8_nth(p, e, &remaining);

    return pos - p;
}
2828#endif
2829
2830/* byte offset to char offset */
2831long
2832rb_str_sublen(VALUE str, long pos)
2833{
2834 if (single_byte_optimizable(str) || pos < 0)
2835 return pos;
2836 else {
2837 char *p = RSTRING_PTR(str);
2838 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2839 }
2840}
2841
2842static VALUE
2843str_subseq(VALUE str, long beg, long len)
2844{
2845 VALUE str2;
2846
2847 assert(beg >= 0);
2848 assert(len >= 0);
2849 assert(beg+len <= RSTRING_LEN(str));
2850
2851 const int termlen = TERM_LEN(str);
2852 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2853 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
2854 RB_GC_GUARD(str);
2855 return str2;
2856 }
2857
2858 str2 = str_alloc_heap(rb_cString);
2859 if (str_embed_capa(str2) >= len + termlen) {
2860 char *ptr2 = RSTRING(str2)->as.embed.ary;
2861 STR_SET_EMBED(str2);
2862 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
2863 TERM_FILL(ptr2+len, termlen);
2864
2865 STR_SET_LEN(str2, len);
2866 RB_GC_GUARD(str);
2867 }
2868 else {
2869 str_replace_shared(str2, str);
2870 assert(!STR_EMBED_P(str2));
2871 ENC_CODERANGE_CLEAR(str2);
2872 RSTRING(str2)->as.heap.ptr += beg;
2873 if (RSTRING_LEN(str2) > len) {
2874 STR_SET_LEN(str2, len);
2875 }
2876 }
2877
2878 return str2;
2879}
2880
2881VALUE
2882rb_str_subseq(VALUE str, long beg, long len)
2883{
2884 VALUE str2 = str_subseq(str, beg, len);
2885 rb_enc_cr_str_copy_for_substr(str2, str);
2886 return str2;
2887}
2888
2889char *
2890rb_str_subpos(VALUE str, long beg, long *lenp)
2891{
2892 long len = *lenp;
2893 long slen = -1L;
2894 const long blen = RSTRING_LEN(str);
2895 rb_encoding *enc = STR_ENC_GET(str);
2896 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2897
2898 if (len < 0) return 0;
2899 if (beg < 0 && -beg < 0) return 0;
2900 if (!blen) {
2901 len = 0;
2902 }
2903 if (single_byte_optimizable(str)) {
2904 if (beg > blen) return 0;
2905 if (beg < 0) {
2906 beg += blen;
2907 if (beg < 0) return 0;
2908 }
2909 if (len > blen - beg)
2910 len = blen - beg;
2911 if (len < 0) return 0;
2912 p = s + beg;
2913 goto end;
2914 }
2915 if (beg < 0) {
2916 if (len > -beg) len = -beg;
2917 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
2918 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
2919 beg = -beg;
2920 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2921 p = e;
2922 if (!p) return 0;
2923 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2924 if (!p) return 0;
2925 len = e - p;
2926 goto end;
2927 }
2928 else {
2929 slen = str_strlen(str, enc);
2930 beg += slen;
2931 if (beg < 0) return 0;
2932 p = s + beg;
2933 if (len == 0) goto end;
2934 }
2935 }
2936 else if (beg > 0 && beg > blen) {
2937 return 0;
2938 }
2939 if (len == 0) {
2940 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2941 p = s + beg;
2942 }
2943#ifdef NONASCII_MASK
2944 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2945 enc == rb_utf8_encoding()) {
2946 p = str_utf8_nth(s, e, &beg);
2947 if (beg > 0) return 0;
2948 len = str_utf8_offset(p, e, len);
2949 }
2950#endif
2951 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2952 int char_sz = rb_enc_mbmaxlen(enc);
2953
2954 p = s + beg * char_sz;
2955 if (p > e) {
2956 return 0;
2957 }
2958 else if (len * char_sz > e - p)
2959 len = e - p;
2960 else
2961 len *= char_sz;
2962 }
2963 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2964 if (beg > 0) return 0;
2965 len = 0;
2966 }
2967 else {
2968 len = str_offset(p, e, len, enc, 0);
2969 }
2970 end:
2971 *lenp = len;
2972 RB_GC_GUARD(str);
2973 return p;
2974}
2975
2976static VALUE str_substr(VALUE str, long beg, long len, int empty);
2977
2978VALUE
2979rb_str_substr(VALUE str, long beg, long len)
2980{
2981 return str_substr(str, beg, len, TRUE);
2982}
2983
2984static VALUE
2985str_substr(VALUE str, long beg, long len, int empty)
2986{
2987 char *p = rb_str_subpos(str, beg, &len);
2988
2989 if (!p) return Qnil;
2990 if (!len && !empty) return Qnil;
2991
2992 beg = p - RSTRING_PTR(str);
2993
2994 VALUE str2 = str_subseq(str, beg, len);
2995 rb_enc_cr_str_copy_for_substr(str2, str);
2996 return str2;
2997}
2998
2999/* :nodoc: */
3000VALUE
3002{
3003 if (OBJ_FROZEN(str)) return str;
3004 rb_str_resize(str, RSTRING_LEN(str));
3005 return rb_obj_freeze(str);
3006}
3007
3008
3009/*
3010 * call-seq:
3011 * +string -> new_string or self
3012 *
3013 * Returns +self+ if +self+ is not frozen.
3014 *
3015 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3016 */
3017static VALUE
3018str_uplus(VALUE str)
3019{
3020 if (OBJ_FROZEN(str)) {
3021 return rb_str_dup(str);
3022 }
3023 else {
3024 return str;
3025 }
3026}
3027
3028/*
3029 * call-seq:
3030 * -string -> frozen_string
3031 * dedup -> frozen_string
3032 *
3033 * Returns a frozen, possibly pre-existing copy of the string.
3034 *
3035 * The returned \String will be deduplicated as long as it does not have
3036 * any instance variables set on it and is not a String subclass.
3037 *
3038 * Note that <tt>-string</tt> variant is more convenient for defining
3039 * constants:
3040 *
3041 * FILENAME = -'config/database.yml'
3042 *
3043 * while +dedup+ is better suitable for using the method in chains
3044 * of calculations:
3045 *
3046 * @url_list.concat(urls.map(&:dedup))
3047 *
3048 */
3049static VALUE
3050str_uminus(VALUE str)
3051{
3052 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3053 str = rb_str_dup(str);
3054 }
3055 return rb_fstring(str);
3056}
3057
3058RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3059#define rb_str_dup_frozen rb_str_new_frozen
3060
3061VALUE
3063{
3064 if (FL_TEST(str, STR_TMPLOCK)) {
3065 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3066 }
3067 FL_SET(str, STR_TMPLOCK);
3068 return str;
3069}
3070
3071VALUE
3073{
3074 if (!FL_TEST(str, STR_TMPLOCK)) {
3075 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3076 }
3077 FL_UNSET(str, STR_TMPLOCK);
3078 return str;
3079}
3080
3081RUBY_FUNC_EXPORTED VALUE
3082rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3083{
3084 rb_str_locktmp(str);
3085 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3086}
3087
3088void
3089rb_str_set_len(VALUE str, long len)
3090{
3091 long capa;
3092 const int termlen = TERM_LEN(str);
3093
3094 str_modifiable(str);
3095 if (STR_SHARED_P(str)) {
3096 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3097 }
3098 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3099 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3100 }
3101
3102 int cr = ENC_CODERANGE(str);
3103 if (cr == ENC_CODERANGE_UNKNOWN) {
3104 /* Leave unknown. */
3105 }
3106 else if (len > RSTRING_LEN(str)) {
3107 if (ENC_CODERANGE_CLEAN_P(cr)) {
3108 /* Update the coderange regarding the extended part. */
3109 const char *const prev_end = RSTRING_END(str);
3110 const char *const new_end = RSTRING_PTR(str) + len;
3111 rb_encoding *enc = rb_enc_get(str);
3112 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3113 ENC_CODERANGE_SET(str, cr);
3114 }
3115 else if (cr == ENC_CODERANGE_BROKEN) {
3116 /* May be valid now, by appended part. */
3118 }
3119 }
3120 else if (len < RSTRING_LEN(str)) {
3121 if (cr != ENC_CODERANGE_7BIT) {
3122 /* ASCII-only string is keeping after truncated. Valid
3123 * and broken may be invalid or valid, leave unknown. */
3125 }
3126 }
3127
3128 STR_SET_LEN(str, len);
3129 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3130}
3131
3132VALUE
3133rb_str_resize(VALUE str, long len)
3134{
3135 if (len < 0) {
3136 rb_raise(rb_eArgError, "negative string size (or size too big)");
3137 }
3138
3139 int independent = str_independent(str);
3140 long slen = RSTRING_LEN(str);
3141
3142 if (slen > len && ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
3144 }
3145
3146 {
3147 long capa;
3148 const int termlen = TERM_LEN(str);
3149 if (STR_EMBED_P(str)) {
3150 if (len == slen) return str;
3151 if (str_embed_capa(str) >= len + termlen) {
3152 STR_SET_LEN(str, len);
3153 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3154 return str;
3155 }
3156 str_make_independent_expand(str, slen, len - slen, termlen);
3157 }
3158 else if (str_embed_capa(str) >= len + termlen) {
3159 char *ptr = STR_HEAP_PTR(str);
3160 STR_SET_EMBED(str);
3161 if (slen > len) slen = len;
3162 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3163 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3164 STR_SET_LEN(str, len);
3165 if (independent) ruby_xfree(ptr);
3166 return str;
3167 }
3168 else if (!independent) {
3169 if (len == slen) return str;
3170 str_make_independent_expand(str, slen, len - slen, termlen);
3171 }
3172 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3173 (capa - len) > (len < 1024 ? len : 1024)) {
3174 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3175 (size_t)len + termlen, STR_HEAP_SIZE(str));
3176 RSTRING(str)->as.heap.aux.capa = len;
3177 }
3178 else if (len == slen) return str;
3179 STR_SET_LEN(str, len);
3180 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3181 }
3182 return str;
3183}
3184
3185static VALUE
3186str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3187{
3188 if (keep_cr) {
3189 str_modify_keep_cr(str);
3190 }
3191 else {
3192 rb_str_modify(str);
3193 }
3194 if (len == 0) return 0;
3195
3196 long total, olen, off = -1;
3197 char *sptr;
3198 const int termlen = TERM_LEN(str);
3199
3200 RSTRING_GETMEM(str, sptr, olen);
3201 if (ptr >= sptr && ptr <= sptr + olen) {
3202 off = ptr - sptr;
3203 }
3204
3205 long capa = str_capacity(str, termlen);
3206
3207 if (olen > LONG_MAX - len) {
3208 rb_raise(rb_eArgError, "string sizes too big");
3209 }
3210 total = olen + len;
3211 if (capa < total) {
3212 if (total >= LONG_MAX / 2) {
3213 capa = total;
3214 }
3215 while (total > capa) {
3216 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3217 }
3218 RESIZE_CAPA_TERM(str, capa, termlen);
3219 sptr = RSTRING_PTR(str);
3220 }
3221 if (off != -1) {
3222 ptr = sptr + off;
3223 }
3224 memcpy(sptr + olen, ptr, len);
3225 STR_SET_LEN(str, total);
3226 TERM_FILL(sptr + total, termlen); /* sentinel */
3227
3228 return str;
3229}
3230
/* Appending helpers that always reset the coderange (keep_cr == false);
 * str_buf_cat2 takes a string literal and derives its length. */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), (len), false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3233
3234VALUE
3235rb_str_cat(VALUE str, const char *ptr, long len)
3236{
3237 if (len == 0) return str;
3238 if (len < 0) {
3239 rb_raise(rb_eArgError, "negative string size (or size too big)");
3240 }
3241 return str_buf_cat(str, ptr, len);
3242}
3243
3244VALUE
3245rb_str_cat_cstr(VALUE str, const char *ptr)
3246{
3247 must_not_null(ptr);
3248 return rb_str_buf_cat(str, ptr, strlen(ptr));
3249}
3250
3251RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3252RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3253RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3254
3255static VALUE
3256rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3257 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3258{
3259 int str_encindex = ENCODING_GET(str);
3260 int res_encindex;
3261 int str_cr, res_cr;
3262 rb_encoding *str_enc, *ptr_enc;
3263
3264 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3265
3266 if (str_encindex == ptr_encindex) {
3267 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3268 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3269 }
3270 }
3271 else {
3272 str_enc = rb_enc_from_index(str_encindex);
3273 ptr_enc = rb_enc_from_index(ptr_encindex);
3274 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3275 if (len == 0)
3276 return str;
3277 if (RSTRING_LEN(str) == 0) {
3278 rb_str_buf_cat(str, ptr, len);
3279 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3280 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3281 return str;
3282 }
3283 goto incompatible;
3284 }
3285 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3286 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3287 }
3288 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3289 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3290 str_cr = rb_enc_str_coderange(str);
3291 }
3292 }
3293 }
3294 if (ptr_cr_ret)
3295 *ptr_cr_ret = ptr_cr;
3296
3297 if (str_encindex != ptr_encindex &&
3298 str_cr != ENC_CODERANGE_7BIT &&
3299 ptr_cr != ENC_CODERANGE_7BIT) {
3300 str_enc = rb_enc_from_index(str_encindex);
3301 ptr_enc = rb_enc_from_index(ptr_encindex);
3302 goto incompatible;
3303 }
3304
3305 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3306 res_encindex = str_encindex;
3307 res_cr = ENC_CODERANGE_UNKNOWN;
3308 }
3309 else if (str_cr == ENC_CODERANGE_7BIT) {
3310 if (ptr_cr == ENC_CODERANGE_7BIT) {
3311 res_encindex = str_encindex;
3312 res_cr = ENC_CODERANGE_7BIT;
3313 }
3314 else {
3315 res_encindex = ptr_encindex;
3316 res_cr = ptr_cr;
3317 }
3318 }
3319 else if (str_cr == ENC_CODERANGE_VALID) {
3320 res_encindex = str_encindex;
3321 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3322 res_cr = str_cr;
3323 else
3324 res_cr = ptr_cr;
3325 }
3326 else { /* str_cr == ENC_CODERANGE_BROKEN */
3327 res_encindex = str_encindex;
3328 res_cr = str_cr;
3329 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3330 }
3331
3332 if (len < 0) {
3333 rb_raise(rb_eArgError, "negative string size (or size too big)");
3334 }
3335 str_buf_cat(str, ptr, len);
3336 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3337 return str;
3338
3339 incompatible:
3340 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3341 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3343}
3344
3345VALUE
3346rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3347{
3348 return rb_enc_cr_str_buf_cat(str, ptr, len,
3349 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3350}
3351
3352VALUE
3353rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3354{
3355 /* ptr must reference NUL terminated ASCII string. */
3356 int encindex = ENCODING_GET(str);
3357 rb_encoding *enc = rb_enc_from_index(encindex);
3358 if (rb_enc_asciicompat(enc)) {
3359 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3360 encindex, ENC_CODERANGE_7BIT, 0);
3361 }
3362 else {
3363 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3364 while (*ptr) {
3365 unsigned int c = (unsigned char)*ptr;
3366 int len = rb_enc_codelen(c, enc);
3367 rb_enc_mbcput(c, buf, enc);
3368 rb_enc_cr_str_buf_cat(str, buf, len,
3369 encindex, ENC_CODERANGE_VALID, 0);
3370 ptr++;
3371 }
3372 return str;
3373 }
3374}
3375
3376VALUE
3377rb_str_buf_append(VALUE str, VALUE str2)
3378{
3379 int str2_cr = rb_enc_str_coderange(str2);
3380
3381 if (str_enc_fastpath(str)) {
3382 switch (str2_cr) {
3383 case ENC_CODERANGE_7BIT:
3384 // If RHS is 7bit we can do simple concatenation
3385 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3386 RB_GC_GUARD(str2);
3387 return str;
3389 // If RHS is valid, we can do simple concatenation if encodings are the same
3390 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3391 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3392 int str_cr = ENC_CODERANGE(str);
3393 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3394 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3395 }
3396 RB_GC_GUARD(str2);
3397 return str;
3398 }
3399 }
3400 }
3401
3402 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3403 ENCODING_GET(str2), str2_cr, &str2_cr);
3404
3405 ENC_CODERANGE_SET(str2, str2_cr);
3406
3407 return str;
3408}
3409
3410VALUE
3412{
3413 StringValue(str2);
3414 return rb_str_buf_append(str, str2);
3415}
3416
3417VALUE
3418rb_str_concat_literals(size_t num, const VALUE *strary)
3419{
3420 VALUE str;
3421 size_t i, s = 0;
3422 unsigned long len = 1;
3423
3424 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3425 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3426
3427 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3428 str = rb_str_buf_new(len);
3429 str_enc_copy_direct(str, strary[0]);
3430
3431 for (i = s; i < num; ++i) {
3432 const VALUE v = strary[i];
3433 int encidx = ENCODING_GET(v);
3434
3435 rb_str_buf_append(str, v);
3436 if (encidx != ENCINDEX_US_ASCII) {
3437 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3438 rb_enc_set_index(str, encidx);
3439 }
3440 }
3441 return str;
3442}
3443
3444/*
3445 * call-seq:
3446 * concat(*objects) -> string
3447 *
3448 * Concatenates each object in +objects+ to +self+ and returns +self+:
3449 *
3450 * s = 'foo'
3451 * s.concat('bar', 'baz') # => "foobarbaz"
3452 * s # => "foobarbaz"
3453 *
3454 * For each given object +object+ that is an Integer,
3455 * the value is considered a codepoint and converted to a character before concatenation:
3456 *
3457 * s = 'foo'
3458 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3459 *
3460 * Related: String#<<, which takes a single argument.
3461 */
3462static VALUE
3463rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3464{
3465 str_modifiable(str);
3466
3467 if (argc == 1) {
3468 return rb_str_concat(str, argv[0]);
3469 }
3470 else if (argc > 1) {
3471 int i;
3472 VALUE arg_str = rb_str_tmp_new(0);
3473 rb_enc_copy(arg_str, str);
3474 for (i = 0; i < argc; i++) {
3475 rb_str_concat(arg_str, argv[i]);
3476 }
3477 rb_str_buf_append(str, arg_str);
3478 }
3479
3480 return str;
3481}
3482
3483/*
3484 * call-seq:
3485 * string << object -> string
3486 *
3487 * Concatenates +object+ to +self+ and returns +self+:
3488 *
3489 * s = 'foo'
3490 * s << 'bar' # => "foobar"
3491 * s # => "foobar"
3492 *
3493 * If +object+ is an Integer,
3494 * the value is considered a codepoint and converted to a character before concatenation:
3495 *
3496 * s = 'foo'
3497 * s << 33 # => "foo!"
3498 *
3499 * Related: String#concat, which takes multiple arguments.
3500 */
3501VALUE
3503{
3504 unsigned int code;
3505 rb_encoding *enc = STR_ENC_GET(str1);
3506 int encidx;
3507
3508 if (RB_INTEGER_TYPE_P(str2)) {
3509 if (rb_num_to_uint(str2, &code) == 0) {
3510 }
3511 else if (FIXNUM_P(str2)) {
3512 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3513 }
3514 else {
3515 rb_raise(rb_eRangeError, "bignum out of char range");
3516 }
3517 }
3518 else {
3519 return rb_str_append(str1, str2);
3520 }
3521
3522 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3523 if (encidx >= 0) {
3524 char buf[1];
3525 buf[0] = (char)code;
3526 rb_str_cat(str1, buf, 1);
3527 if (encidx != rb_enc_to_index(enc)) {
3528 rb_enc_associate_index(str1, encidx);
3530 }
3531 }
3532 else {
3533 long pos = RSTRING_LEN(str1);
3534 int cr = ENC_CODERANGE(str1);
3535 int len;
3536 char *buf;
3537
3538 switch (len = rb_enc_codelen(code, enc)) {
3539 case ONIGERR_INVALID_CODE_POINT_VALUE:
3540 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3541 break;
3542 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3543 case 0:
3544 rb_raise(rb_eRangeError, "%u out of char range", code);
3545 break;
3546 }
3547 buf = ALLOCA_N(char, len + 1);
3548 rb_enc_mbcput(code, buf, enc);
3549 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3550 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3551 }
3552 rb_str_resize(str1, pos+len);
3553 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3554 if (cr == ENC_CODERANGE_7BIT && code > 127) {
3556 }
3557 else if (cr == ENC_CODERANGE_BROKEN) {
3559 }
3560 ENC_CODERANGE_SET(str1, cr);
3561 }
3562 return str1;
3563}
3564
3565int
3566rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3567{
3568 int encidx = rb_enc_to_index(enc);
3569
3570 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3571 /* US-ASCII automatically extended to ASCII-8BIT */
3572 if (code > 0xFF) {
3573 rb_raise(rb_eRangeError, "%u out of char range", code);
3574 }
3575 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3576 return ENCINDEX_ASCII_8BIT;
3577 }
3578 return encidx;
3579 }
3580 else {
3581 return -1;
3582 }
3583}
3584
3585/*
3586 * call-seq:
3587 * prepend(*other_strings) -> string
3588 *
3589 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3590 *
3591 * s = 'foo'
3592 * s.prepend('bar', 'baz') # => "barbazfoo"
3593 * s # => "barbazfoo"
3594 *
3595 * Related: String#concat.
3596 */
3597
3598static VALUE
3599rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3600{
3601 str_modifiable(str);
3602
3603 if (argc == 1) {
3604 rb_str_update(str, 0L, 0L, argv[0]);
3605 }
3606 else if (argc > 1) {
3607 int i;
3608 VALUE arg_str = rb_str_tmp_new(0);
3609 rb_enc_copy(arg_str, str);
3610 for (i = 0; i < argc; i++) {
3611 rb_str_append(arg_str, argv[i]);
3612 }
3613 rb_str_update(str, 0L, 0L, arg_str);
3614 }
3615
3616 return str;
3617}
3618
3619st_index_t
3621{
3622 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
3623 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
3624 if (e && !is_ascii_string(str)) {
3625 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
3626 }
3627 return h;
3628}
3629
3630int
3632{
3633 long len1, len2;
3634 const char *ptr1, *ptr2;
3635 RSTRING_GETMEM(str1, ptr1, len1);
3636 RSTRING_GETMEM(str2, ptr2, len2);
3637 return (len1 != len2 ||
3638 !rb_str_comparable(str1, str2) ||
3639 memcmp(ptr1, ptr2, len1) != 0);
3640}
3641
3642/*
3643 * call-seq:
3644 * hash -> integer
3645 *
3646 * Returns the integer hash value for +self+.
3647 * The value is based on the length, content and encoding of +self+.
3648 *
3649 * Related: Object#hash.
3650 */
3651
3652static VALUE
3653rb_str_hash_m(VALUE str)
3654{
3655 st_index_t hval = rb_str_hash(str);
3656 return ST2FIX(hval);
3657}
3658
/* Smaller of two values; yields (a) when they compare equal.
 * Each argument may be evaluated twice. */
#define lesser(a, b) (((a) > (b)) ? (b) : (a))
3660
3661int
3663{
3664 int idx1, idx2;
3665 int rc1, rc2;
3666
3667 if (RSTRING_LEN(str1) == 0) return TRUE;
3668 if (RSTRING_LEN(str2) == 0) return TRUE;
3669 idx1 = ENCODING_GET(str1);
3670 idx2 = ENCODING_GET(str2);
3671 if (idx1 == idx2) return TRUE;
3672 rc1 = rb_enc_str_coderange(str1);
3673 rc2 = rb_enc_str_coderange(str2);
3674 if (rc1 == ENC_CODERANGE_7BIT) {
3675 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3676 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3677 return TRUE;
3678 }
3679 if (rc2 == ENC_CODERANGE_7BIT) {
3680 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3681 return TRUE;
3682 }
3683 return FALSE;
3684}
3685
3686int
3688{
3689 long len1, len2;
3690 const char *ptr1, *ptr2;
3691 int retval;
3692
3693 if (str1 == str2) return 0;
3694 RSTRING_GETMEM(str1, ptr1, len1);
3695 RSTRING_GETMEM(str2, ptr2, len2);
3696 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3697 if (len1 == len2) {
3698 if (!rb_str_comparable(str1, str2)) {
3699 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3700 return 1;
3701 return -1;
3702 }
3703 return 0;
3704 }
3705 if (len1 > len2) return 1;
3706 return -1;
3707 }
3708 if (retval > 0) return 1;
3709 return -1;
3710}
3711
3712/*
3713 * call-seq:
3714 * string == object -> true or false
3715 * string === object -> true or false
3716 *
3717 * Returns +true+ if +object+ has the same length and content;
3718 * as +self+; +false+ otherwise:
3719 *
3720 * s = 'foo'
3721 * s == 'foo' # => true
3722 * s == 'food' # => false
3723 * s == 'FOO' # => false
3724 *
3725 * Returns +false+ if the two strings' encodings are not compatible:
3726 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3727 *
3728 * If +object+ is not an instance of \String but responds to +to_str+, then the
3729 * two strings are compared using <code>object.==</code>.
3730 */
3731
3732VALUE
3734{
3735 if (str1 == str2) return Qtrue;
3736 if (!RB_TYPE_P(str2, T_STRING)) {
3737 if (!rb_respond_to(str2, idTo_str)) {
3738 return Qfalse;
3739 }
3740 return rb_equal(str2, str1);
3741 }
3742 return rb_str_eql_internal(str1, str2);
3743}
3744
3745/*
3746 * call-seq:
3747 * eql?(object) -> true or false
3748 *
3749 * Returns +true+ if +object+ has the same length and content;
3750 * as +self+; +false+ otherwise:
3751 *
3752 * s = 'foo'
3753 * s.eql?('foo') # => true
3754 * s.eql?('food') # => false
3755 * s.eql?('FOO') # => false
3756 *
3757 * Returns +false+ if the two strings' encodings are not compatible:
3758 *
3759 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3760 *
3761 */
3762
3763VALUE
3764rb_str_eql(VALUE str1, VALUE str2)
3765{
3766 if (str1 == str2) return Qtrue;
3767 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3768 return rb_str_eql_internal(str1, str2);
3769}
3770
3771/*
3772 * call-seq:
3773 * string <=> other_string -> -1, 0, 1, or nil
3774 *
3775 * Compares +self+ and +other_string+, returning:
3776 *
3777 * - -1 if +other_string+ is larger.
3778 * - 0 if the two are equal.
3779 * - 1 if +other_string+ is smaller.
3780 * - +nil+ if the two are incomparable.
3781 *
3782 * Examples:
3783 *
3784 * 'foo' <=> 'foo' # => 0
3785 * 'foo' <=> 'food' # => -1
3786 * 'food' <=> 'foo' # => 1
3787 * 'FOO' <=> 'foo' # => -1
3788 * 'foo' <=> 'FOO' # => 1
3789 * 'foo' <=> 1 # => nil
3790 *
3791 */
3792
3793static VALUE
3794rb_str_cmp_m(VALUE str1, VALUE str2)
3795{
3796 int result;
3797 VALUE s = rb_check_string_type(str2);
3798 if (NIL_P(s)) {
3799 return rb_invcmp(str1, str2);
3800 }
3801 result = rb_str_cmp(str1, s);
3802 return INT2FIX(result);
3803}
3804
3805static VALUE str_casecmp(VALUE str1, VALUE str2);
3806static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3807
3808/*
3809 * call-seq:
3810 * casecmp(other_string) -> -1, 0, 1, or nil
3811 *
3812 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3813 *
3814 * - -1 if <tt>other_string.downcase</tt> is larger.
3815 * - 0 if the two are equal.
3816 * - 1 if <tt>other_string.downcase</tt> is smaller.
3817 * - +nil+ if the two are incomparable.
3818 *
3819 * Examples:
3820 *
3821 * 'foo'.casecmp('foo') # => 0
3822 * 'foo'.casecmp('food') # => -1
3823 * 'food'.casecmp('foo') # => 1
3824 * 'FOO'.casecmp('foo') # => 0
3825 * 'foo'.casecmp('FOO') # => 0
3826 * 'foo'.casecmp(1) # => nil
3827 *
3828 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3829 *
3830 * Related: String#casecmp?.
3831 *
3832 */
3833
3834static VALUE
3835rb_str_casecmp(VALUE str1, VALUE str2)
3836{
3837 VALUE s = rb_check_string_type(str2);
3838 if (NIL_P(s)) {
3839 return Qnil;
3840 }
3841 return str_casecmp(str1, s);
3842}
3843
3844static VALUE
3845str_casecmp(VALUE str1, VALUE str2)
3846{
3847 long len;
3848 rb_encoding *enc;
3849 const char *p1, *p1end, *p2, *p2end;
3850
3851 enc = rb_enc_compatible(str1, str2);
3852 if (!enc) {
3853 return Qnil;
3854 }
3855
3856 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3857 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3858 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3859 while (p1 < p1end && p2 < p2end) {
3860 if (*p1 != *p2) {
3861 unsigned int c1 = TOLOWER(*p1 & 0xff);
3862 unsigned int c2 = TOLOWER(*p2 & 0xff);
3863 if (c1 != c2)
3864 return INT2FIX(c1 < c2 ? -1 : 1);
3865 }
3866 p1++;
3867 p2++;
3868 }
3869 }
3870 else {
3871 while (p1 < p1end && p2 < p2end) {
3872 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3873 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3874
3875 if (0 <= c1 && 0 <= c2) {
3876 c1 = TOLOWER(c1);
3877 c2 = TOLOWER(c2);
3878 if (c1 != c2)
3879 return INT2FIX(c1 < c2 ? -1 : 1);
3880 }
3881 else {
3882 int r;
3883 l1 = rb_enc_mbclen(p1, p1end, enc);
3884 l2 = rb_enc_mbclen(p2, p2end, enc);
3885 len = l1 < l2 ? l1 : l2;
3886 r = memcmp(p1, p2, len);
3887 if (r != 0)
3888 return INT2FIX(r < 0 ? -1 : 1);
3889 if (l1 != l2)
3890 return INT2FIX(l1 < l2 ? -1 : 1);
3891 }
3892 p1 += l1;
3893 p2 += l2;
3894 }
3895 }
3896 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3897 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3898 return INT2FIX(-1);
3899}
3900
3901/*
3902 * call-seq:
3903 * casecmp?(other_string) -> true, false, or nil
3904 *
3905 * Returns +true+ if +self+ and +other_string+ are equal after
3906 * Unicode case folding, otherwise +false+:
3907 *
3908 * 'foo'.casecmp?('foo') # => true
3909 * 'foo'.casecmp?('food') # => false
3910 * 'food'.casecmp?('foo') # => false
3911 * 'FOO'.casecmp?('foo') # => true
3912 * 'foo'.casecmp?('FOO') # => true
3913 *
3914 * Returns +nil+ if the two values are incomparable:
3915 *
3916 * 'foo'.casecmp?(1) # => nil
3917 *
3918 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3919 *
3920 * Related: String#casecmp.
3921 *
3922 */
3923
3924static VALUE
3925rb_str_casecmp_p(VALUE str1, VALUE str2)
3926{
3927 VALUE s = rb_check_string_type(str2);
3928 if (NIL_P(s)) {
3929 return Qnil;
3930 }
3931 return str_casecmp_p(str1, s);
3932}
3933
3934static VALUE
3935str_casecmp_p(VALUE str1, VALUE str2)
3936{
3937 rb_encoding *enc;
3938 VALUE folded_str1, folded_str2;
3939 VALUE fold_opt = sym_fold;
3940
3941 enc = rb_enc_compatible(str1, str2);
3942 if (!enc) {
3943 return Qnil;
3944 }
3945
3946 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3947 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3948
3949 return rb_str_eql(folded_str1, folded_str2);
3950}
3951
3952static long
3953strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3954 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3955{
3956 const char *search_start = str_ptr;
3957 long pos, search_len = str_len - offset;
3958
3959 for (;;) {
3960 const char *t;
3961 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3962 if (pos < 0) return pos;
3963 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3964 if (t == search_start + pos) break;
3965 search_len -= t - search_start;
3966 if (search_len <= 0) return -1;
3967 offset += t - search_start;
3968 search_start = t;
3969 }
3970 return pos + offset;
3971}
3972
/* Both return the found index in bytes.  rb_str_index passes in_byte = 0
 * (offset is a character index); rb_str_byteindex passes in_byte = 1
 * (offset is a byte index). */
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
3976
3977static long
3978rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3979{
3980 const char *str_ptr, *str_ptr_end, *sub_ptr;
3981 long str_len, sub_len;
3982 rb_encoding *enc;
3983
3984 enc = rb_enc_check(str, sub);
3985 if (is_broken_string(sub)) return -1;
3986
3987 str_ptr = RSTRING_PTR(str);
3988 str_ptr_end = RSTRING_END(str);
3989 str_len = RSTRING_LEN(str);
3990 sub_ptr = RSTRING_PTR(sub);
3991 sub_len = RSTRING_LEN(sub);
3992
3993 if (str_len < sub_len) return -1;
3994
3995 if (offset != 0) {
3996 long str_len_char, sub_len_char;
3997 int single_byte = single_byte_optimizable(str);
3998 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3999 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4000 if (offset < 0) {
4001 offset += str_len_char;
4002 if (offset < 0) return -1;
4003 }
4004 if (str_len_char - offset < sub_len_char) return -1;
4005 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4006 str_ptr += offset;
4007 }
4008 if (sub_len == 0) return offset;
4009
4010 /* need proceed one character at a time */
4011 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4012}
4013
4014
4015/*
4016 * call-seq:
4017 * index(substring, offset = 0) -> integer or nil
4018 * index(regexp, offset = 0) -> integer or nil
4019 *
4020 * :include: doc/string/index.rdoc
4021 *
4022 */
4023
4024static VALUE
4025rb_str_index_m(int argc, VALUE *argv, VALUE str)
4026{
4027 VALUE sub;
4028 VALUE initpos;
4029 rb_encoding *enc = STR_ENC_GET(str);
4030 long pos;
4031
4032 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4033 long slen = str_strlen(str, enc); /* str's enc */
4034 pos = NUM2LONG(initpos);
4035 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4036 if (RB_TYPE_P(sub, T_REGEXP)) {
4038 }
4039 return Qnil;
4040 }
4041 }
4042 else {
4043 pos = 0;
4044 }
4045
4046 if (RB_TYPE_P(sub, T_REGEXP)) {
4047 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4048 enc, single_byte_optimizable(str));
4049
4050 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4051 VALUE match = rb_backref_get();
4052 struct re_registers *regs = RMATCH_REGS(match);
4053 pos = rb_str_sublen(str, BEG(0));
4054 return LONG2NUM(pos);
4055 }
4056 }
4057 else {
4058 StringValue(sub);
4059 pos = rb_str_index(str, sub, pos);
4060 if (pos >= 0) {
4061 pos = rb_str_sublen(str, pos);
4062 return LONG2NUM(pos);
4063 }
4064 }
4065 return Qnil;
4066}
4067
4068/* Ensure that the given pos is a valid character boundary.
4069 * Note that in this function, "character" means a code point
4070 * (Unicode scalar value), not a grapheme cluster.
4071 */
4072static void
4073str_ensure_byte_pos(VALUE str, long pos)
4074{
4075 const char *s = RSTRING_PTR(str);
4076 const char *e = RSTRING_END(str);
4077 const char *p = s + pos;
4078 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4079 rb_raise(rb_eIndexError,
4080 "offset %ld does not land on character boundary", pos);
4081 }
4082}
4083
4084/*
4085 * call-seq:
4086 * byteindex(substring, offset = 0) -> integer or nil
4087 * byteindex(regexp, offset = 0) -> integer or nil
4088 *
4089 * Returns the Integer byte-based index of the first occurrence of the given +substring+,
4090 * or +nil+ if none found:
4091 *
4092 * 'foo'.byteindex('f') # => 0
4093 * 'foo'.byteindex('o') # => 1
4094 * 'foo'.byteindex('oo') # => 1
4095 * 'foo'.byteindex('ooo') # => nil
4096 *
4097 * Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4098 * or +nil+ if none found:
4099 *
4100 * 'foo'.byteindex(/f/) # => 0
4101 * 'foo'.byteindex(/o/) # => 1
4102 * 'foo'.byteindex(/oo/) # => 1
4103 * 'foo'.byteindex(/ooo/) # => nil
4104 *
4105 * Integer argument +offset+, if given, specifies the byte-based position in the
4106 * string to begin the search:
4107 *
4108 * 'foo'.byteindex('o', 1) # => 1
4109 * 'foo'.byteindex('o', 2) # => 2
4110 * 'foo'.byteindex('o', 3) # => nil
4111 *
4112 * If +offset+ is negative, counts backward from the end of +self+:
4113 *
4114 * 'foo'.byteindex('o', -1) # => 2
4115 * 'foo'.byteindex('o', -2) # => 1
4116 * 'foo'.byteindex('o', -3) # => 1
4117 * 'foo'.byteindex('o', -4) # => nil
4118 *
4119 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4120 * raised.
4121 *
4122 * Related: String#index, String#byterindex.
4123 */
4124
4125static VALUE
4126rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4127{
4128 VALUE sub;
4129 VALUE initpos;
4130 long pos;
4131
4132 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4133 long slen = RSTRING_LEN(str);
4134 pos = NUM2LONG(initpos);
4135 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4136 if (RB_TYPE_P(sub, T_REGEXP)) {
4138 }
4139 return Qnil;
4140 }
4141 }
4142 else {
4143 pos = 0;
4144 }
4145
4146 str_ensure_byte_pos(str, pos);
4147
4148 if (RB_TYPE_P(sub, T_REGEXP)) {
4149 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4150 VALUE match = rb_backref_get();
4151 struct re_registers *regs = RMATCH_REGS(match);
4152 pos = BEG(0);
4153 return LONG2NUM(pos);
4154 }
4155 }
4156 else {
4157 StringValue(sub);
4158 pos = rb_str_byteindex(str, sub, pos);
4159 if (pos >= 0) return LONG2NUM(pos);
4160 }
4161 return Qnil;
4162}
4163
4164#ifdef HAVE_MEMRCHR
4165static long
4166str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4167{
4168 char *hit, *adjusted;
4169 int c;
4170 long slen, searchlen;
4171 char *sbeg, *e, *t;
4172
4173 sbeg = RSTRING_PTR(str);
4174 slen = RSTRING_LEN(sub);
4175 if (slen == 0) return s - sbeg;
4176 e = RSTRING_END(str);
4177 t = RSTRING_PTR(sub);
4178 c = *t & 0xff;
4179 searchlen = s - sbeg + 1;
4180
4181 do {
4182 hit = memrchr(sbeg, c, searchlen);
4183 if (!hit) break;
4184 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4185 if (hit != adjusted) {
4186 searchlen = adjusted - sbeg;
4187 continue;
4188 }
4189 if (memcmp(hit, t, slen) == 0)
4190 return hit - sbeg;
4191 searchlen = adjusted - sbeg;
4192 } while (searchlen > 0);
4193
4194 return -1;
4195}
4196#else
4197static long
4198str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4199{
4200 long slen;
4201 char *sbeg, *e, *t;
4202
4203 sbeg = RSTRING_PTR(str);
4204 e = RSTRING_END(str);
4205 t = RSTRING_PTR(sub);
4206 slen = RSTRING_LEN(sub);
4207
4208 while (s) {
4209 if (memcmp(s, t, slen) == 0) {
4210 return s - sbeg;
4211 }
4212 if (s <= sbeg) break;
4213 s = rb_enc_prev_char(sbeg, s, e, enc);
4214 }
4215
4216 return -1;
4217}
4218#endif
4219
4220/* found index in byte */
4221static long
4222rb_str_rindex(VALUE str, VALUE sub, long pos)
4223{
4224 long len, slen;
4225 char *sbeg, *s;
4226 rb_encoding *enc;
4227 int singlebyte;
4228
4229 enc = rb_enc_check(str, sub);
4230 if (is_broken_string(sub)) return -1;
4231 singlebyte = single_byte_optimizable(str);
4232 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4233 slen = str_strlen(sub, enc); /* rb_enc_check */
4234
4235 /* substring longer than string */
4236 if (len < slen) return -1;
4237 if (len - pos < slen) pos = len - slen;
4238 if (len == 0) return pos;
4239
4240 sbeg = RSTRING_PTR(str);
4241
4242 if (pos == 0) {
4243 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4244 return 0;
4245 else
4246 return -1;
4247 }
4248
4249 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4250 return str_rindex(str, sub, s, enc);
4251}
4252
4253/*
4254 * call-seq:
4255 * rindex(substring, offset = self.length) -> integer or nil
4256 * rindex(regexp, offset = self.length) -> integer or nil
4257 *
4258 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4259 * or +nil+ if none found:
4260 *
4261 * 'foo'.rindex('f') # => 0
4262 * 'foo'.rindex('o') # => 2
4263 * 'foo'.rindex('oo') # => 1
4264 * 'foo'.rindex('ooo') # => nil
4265 *
4266 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4267 * or +nil+ if none found:
4268 *
4269 * 'foo'.rindex(/f/) # => 0
4270 * 'foo'.rindex(/o/) # => 2
4271 * 'foo'.rindex(/oo/) # => 1
4272 * 'foo'.rindex(/ooo/) # => nil
4273 *
4274 * The _last_ match means starting at the possible last position, not
4275 * the last of longest matches.
4276 *
4277 * 'foo'.rindex(/o+/) # => 2
4278 * $~ #=> #<MatchData "o">
4279 *
4280 * To get the last longest match, needs to combine with negative
4281 * lookbehind.
4282 *
4283 * 'foo'.rindex(/(?<!o)o+/) # => 1
4284 * $~ #=> #<MatchData "oo">
4285 *
4286 * Or String#index with negative lookahead.
4287 *
4288 * 'foo'.index(/o+(?!.*o)/) # => 1
4289 * $~ #=> #<MatchData "oo">
4290 *
4291 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4292 * string to _end_ the search:
4293 *
4294 * 'foo'.rindex('o', 0) # => nil
4295 * 'foo'.rindex('o', 1) # => 1
4296 * 'foo'.rindex('o', 2) # => 2
4297 * 'foo'.rindex('o', 3) # => 2
4298 *
4299 * If +offset+ is a negative Integer, the maximum starting position in the
4300 * string to _end_ the search is the sum of the string's length and +offset+:
4301 *
4302 * 'foo'.rindex('o', -1) # => 2
4303 * 'foo'.rindex('o', -2) # => 1
4304 * 'foo'.rindex('o', -3) # => nil
4305 * 'foo'.rindex('o', -4) # => nil
4306 *
4307 * Related: String#index.
4308 */
4309
4310static VALUE
4311rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4312{
4313 VALUE sub;
4314 VALUE initpos;
4315 rb_encoding *enc = STR_ENC_GET(str);
4316 long pos, len = str_strlen(str, enc); /* str's enc */
4317
4318 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4319 pos = NUM2LONG(initpos);
4320 if (pos < 0 && (pos += len) < 0) {
4321 if (RB_TYPE_P(sub, T_REGEXP)) {
4323 }
4324 return Qnil;
4325 }
4326 if (pos > len) pos = len;
4327 }
4328 else {
4329 pos = len;
4330 }
4331
4332 if (RB_TYPE_P(sub, T_REGEXP)) {
4333 /* enc = rb_enc_check(str, sub); */
4334 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4335 enc, single_byte_optimizable(str));
4336
4337 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4338 VALUE match = rb_backref_get();
4339 struct re_registers *regs = RMATCH_REGS(match);
4340 pos = rb_str_sublen(str, BEG(0));
4341 return LONG2NUM(pos);
4342 }
4343 }
4344 else {
4345 StringValue(sub);
4346 pos = rb_str_rindex(str, sub, pos);
4347 if (pos >= 0) {
4348 pos = rb_str_sublen(str, pos);
4349 return LONG2NUM(pos);
4350 }
4351 }
4352 return Qnil;
4353}
4354
4355static long
4356rb_str_byterindex(VALUE str, VALUE sub, long pos)
4357{
4358 long len, slen;
4359 char *sbeg, *s;
4360 rb_encoding *enc;
4361
4362 enc = rb_enc_check(str, sub);
4363 if (is_broken_string(sub)) return -1;
4364 len = RSTRING_LEN(str);
4365 slen = RSTRING_LEN(sub);
4366
4367 /* substring longer than string */
4368 if (len < slen) return -1;
4369 if (len - pos < slen) pos = len - slen;
4370 if (len == 0) return pos;
4371
4372 sbeg = RSTRING_PTR(str);
4373
4374 if (pos == 0) {
4375 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4376 return 0;
4377 else
4378 return -1;
4379 }
4380
4381 s = sbeg + pos;
4382 return str_rindex(str, sub, s, enc);
4383}
4384
4385
4386/*
4387 * call-seq:
4388 * byterindex(substring, offset = self.bytesize) -> integer or nil
4389 * byterindex(regexp, offset = self.bytesize) -> integer or nil
4390 *
4391 * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
4392 * or +nil+ if none found:
4393 *
4394 * 'foo'.byterindex('f') # => 0
4395 * 'foo'.byterindex('o') # => 2
4396 * 'foo'.byterindex('oo') # => 1
4397 * 'foo'.byterindex('ooo') # => nil
4398 *
4399 * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
4400 * or +nil+ if none found:
4401 *
4402 * 'foo'.byterindex(/f/) # => 0
4403 * 'foo'.byterindex(/o/) # => 2
4404 * 'foo'.byterindex(/oo/) # => 1
4405 * 'foo'.byterindex(/ooo/) # => nil
4406 *
4407 * The _last_ match means starting at the possible last position, not
4408 * the last of longest matches.
4409 *
4410 * 'foo'.byterindex(/o+/) # => 2
4411 * $~ #=> #<MatchData "o">
4412 *
4413 * To get the last longest match, needs to combine with negative
4414 * lookbehind.
4415 *
4416 * 'foo'.byterindex(/(?<!o)o+/) # => 1
4417 * $~ #=> #<MatchData "oo">
4418 *
4419 * Or String#byteindex with negative lookahead.
4420 *
4421 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4422 * $~ #=> #<MatchData "oo">
4423 *
4424 * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4425 * string to _end_ the search:
4426 *
4427 * 'foo'.byterindex('o', 0) # => nil
4428 * 'foo'.byterindex('o', 1) # => 1
4429 * 'foo'.byterindex('o', 2) # => 2
4430 * 'foo'.byterindex('o', 3) # => 2
4431 *
4432 * If +offset+ is a negative Integer, the maximum starting position in the
4433 * string to _end_ the search is the sum of the string's length and +offset+:
4434 *
4435 * 'foo'.byterindex('o', -1) # => 2
4436 * 'foo'.byterindex('o', -2) # => 1
4437 * 'foo'.byterindex('o', -3) # => nil
4438 * 'foo'.byterindex('o', -4) # => nil
4439 *
4440 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4441 * raised.
4442 *
4443 * Related: String#byteindex.
4444 */
4445
4446static VALUE
4447rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4448{
4449 VALUE sub;
4450 VALUE initpos;
4451 long pos, len = RSTRING_LEN(str);
4452
4453 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4454 pos = NUM2LONG(initpos);
4455 if (pos < 0 && (pos += len) < 0) {
4456 if (RB_TYPE_P(sub, T_REGEXP)) {
4458 }
4459 return Qnil;
4460 }
4461 if (pos > len) pos = len;
4462 }
4463 else {
4464 pos = len;
4465 }
4466
4467 str_ensure_byte_pos(str, pos);
4468
4469 if (RB_TYPE_P(sub, T_REGEXP)) {
4470 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4471 VALUE match = rb_backref_get();
4472 struct re_registers *regs = RMATCH_REGS(match);
4473 pos = BEG(0);
4474 return LONG2NUM(pos);
4475 }
4476 }
4477 else {
4478 StringValue(sub);
4479 pos = rb_str_byterindex(str, sub, pos);
4480 if (pos >= 0) return LONG2NUM(pos);
4481 }
4482 return Qnil;
4483}
4484
4485/*
4486 * call-seq:
4487 * string =~ regexp -> integer or nil
4488 * string =~ object -> integer or nil
4489 *
4490 * Returns the Integer index of the first substring that matches
4491 * the given +regexp+, or +nil+ if no match found:
4492 *
4493 * 'foo' =~ /f/ # => 0
4494 * 'foo' =~ /o/ # => 1
4495 * 'foo' =~ /x/ # => nil
4496 *
4497 * Note: also updates Regexp@Global+Variables.
4498 *
4499 * If the given +object+ is not a Regexp, returns the value
4500 * returned by <tt>object =~ self</tt>.
4501 *
4502 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4503 * (see Regexp#=~):
4504 *
4505 * number = nil
4506 * "no. 9" =~ /(?<number>\d+)/
4507 * number # => nil (not assigned)
4508 * /(?<number>\d+)/ =~ "no. 9"
4509 * number #=> "9"
4510 *
4511 */
4512
4513static VALUE
4514rb_str_match(VALUE x, VALUE y)
4515{
4516 switch (OBJ_BUILTIN_TYPE(y)) {
4517 case T_STRING:
4518 rb_raise(rb_eTypeError, "type mismatch: String given");
4519
4520 case T_REGEXP:
4521 return rb_reg_match(y, x);
4522
4523 default:
4524 return rb_funcall(y, idEqTilde, 1, x);
4525 }
4526}
4527
4528
4529static VALUE get_pat(VALUE);
4530
4531
4532/*
4533 * call-seq:
4534 * match(pattern, offset = 0) -> matchdata or nil
4535 * match(pattern, offset = 0) {|matchdata| ... } -> object
4536 *
4537 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
4538 *
4539 * Note: also updates Regexp@Global+Variables.
4540 *
4541 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4542 * regexp = Regexp.new(pattern)
4543 * - Computes +matchdata+, which will be either a MatchData object or +nil+
4544 * (see Regexp#match):
4545 * matchdata = regexp.match(self)
4546 *
4547 * With no block given, returns the computed +matchdata+:
4548 *
4549 * 'foo'.match('f') # => #<MatchData "f">
4550 * 'foo'.match('o') # => #<MatchData "o">
4551 * 'foo'.match('x') # => nil
4552 *
4553 * If Integer argument +offset+ is given, the search begins at index +offset+:
4554 *
4555 * 'foo'.match('f', 1) # => nil
4556 * 'foo'.match('o', 1) # => #<MatchData "o">
4557 *
4558 * With a block given, calls the block with the computed +matchdata+
4559 * and returns the block's return value:
4560 *
4561 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4562 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4563 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4564 *
4565 */
4566
4567static VALUE
4568rb_str_match_m(int argc, VALUE *argv, VALUE str)
4569{
4570 VALUE re, result;
4571 if (argc < 1)
4572 rb_check_arity(argc, 1, 2);
4573 re = argv[0];
4574 argv[0] = str;
4575 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4576 if (!NIL_P(result) && rb_block_given_p()) {
4577 return rb_yield(result);
4578 }
4579 return result;
4580}
4581
4582/*
4583 * call-seq:
4584 * match?(pattern, offset = 0) -> true or false
4585 *
4586 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4587 *
4588 * Note: does not update Regexp@Global+Variables.
4589 *
4590 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4591 * regexp = Regexp.new(pattern)
4592 *
4593 * Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
4594 * +false+ otherwise:
4595 *
4596 * 'foo'.match?(/o/) # => true
4597 * 'foo'.match?('o') # => true
4598 * 'foo'.match?(/x/) # => false
4599 *
4600 * If Integer argument +offset+ is given, the search begins at index +offset+:
4601 * 'foo'.match?('f', 1) # => false
4602 * 'foo'.match?('o', 1) # => true
4603 *
4604 */
4605
4606static VALUE
4607rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4608{
4609 VALUE re;
4610 rb_check_arity(argc, 1, 2);
4611 re = get_pat(argv[0]);
4612 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
4613}
4614
/* Result of stepping a character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* no neighbouring character could be produced */
    NEIGHBOR_FOUND,     /* the character was replaced by its neighbour */
    NEIGHBOR_WRAPPED    /* the step ran off the end of the range */
};
4620
4621static enum neighbor_char
4622enc_succ_char(char *p, long len, rb_encoding *enc)
4623{
4624 long i;
4625 int l;
4626
4627 if (rb_enc_mbminlen(enc) > 1) {
4628 /* wchar, trivial case */
4629 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4630 if (!MBCLEN_CHARFOUND_P(r)) {
4631 return NEIGHBOR_NOT_CHAR;
4632 }
4633 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4634 l = rb_enc_code_to_mbclen(c, enc);
4635 if (!l) return NEIGHBOR_NOT_CHAR;
4636 if (l != len) return NEIGHBOR_WRAPPED;
4637 rb_enc_mbcput(c, p, enc);
4638 r = rb_enc_precise_mbclen(p, p + len, enc);
4639 if (!MBCLEN_CHARFOUND_P(r)) {
4640 return NEIGHBOR_NOT_CHAR;
4641 }
4642 return NEIGHBOR_FOUND;
4643 }
4644 while (1) {
4645 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4646 p[i] = '\0';
4647 if (i < 0)
4648 return NEIGHBOR_WRAPPED;
4649 ++((unsigned char*)p)[i];
4650 l = rb_enc_precise_mbclen(p, p+len, enc);
4651 if (MBCLEN_CHARFOUND_P(l)) {
4652 l = MBCLEN_CHARFOUND_LEN(l);
4653 if (l == len) {
4654 return NEIGHBOR_FOUND;
4655 }
4656 else {
4657 memset(p+l, 0xff, len-l);
4658 }
4659 }
4660 if (MBCLEN_INVALID_P(l) && i < len-1) {
4661 long len2;
4662 int l2;
4663 for (len2 = len-1; 0 < len2; len2--) {
4664 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4665 if (!MBCLEN_INVALID_P(l2))
4666 break;
4667 }
4668 memset(p+len2+1, 0xff, len-(len2+1));
4669 }
4670 }
4671}
4672
4673static enum neighbor_char
4674enc_pred_char(char *p, long len, rb_encoding *enc)
4675{
4676 long i;
4677 int l;
4678 if (rb_enc_mbminlen(enc) > 1) {
4679 /* wchar, trivial case */
4680 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4681 if (!MBCLEN_CHARFOUND_P(r)) {
4682 return NEIGHBOR_NOT_CHAR;
4683 }
4684 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
4685 if (!c) return NEIGHBOR_NOT_CHAR;
4686 --c;
4687 l = rb_enc_code_to_mbclen(c, enc);
4688 if (!l) return NEIGHBOR_NOT_CHAR;
4689 if (l != len) return NEIGHBOR_WRAPPED;
4690 rb_enc_mbcput(c, p, enc);
4691 r = rb_enc_precise_mbclen(p, p + len, enc);
4692 if (!MBCLEN_CHARFOUND_P(r)) {
4693 return NEIGHBOR_NOT_CHAR;
4694 }
4695 return NEIGHBOR_FOUND;
4696 }
4697 while (1) {
4698 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4699 p[i] = '\xff';
4700 if (i < 0)
4701 return NEIGHBOR_WRAPPED;
4702 --((unsigned char*)p)[i];
4703 l = rb_enc_precise_mbclen(p, p+len, enc);
4704 if (MBCLEN_CHARFOUND_P(l)) {
4705 l = MBCLEN_CHARFOUND_LEN(l);
4706 if (l == len) {
4707 return NEIGHBOR_FOUND;
4708 }
4709 else {
4710 memset(p+l, 0, len-l);
4711 }
4712 }
4713 if (MBCLEN_INVALID_P(l) && i < len-1) {
4714 long len2;
4715 int l2;
4716 for (len2 = len-1; 0 < len2; len2--) {
4717 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4718 if (!MBCLEN_INVALID_P(l2))
4719 break;
4720 }
4721 memset(p+len2+1, 0, len-(len2+1));
4722 }
4723 }
4724}
4725
4726/*
4727 overwrite +p+ by succeeding letter in +enc+ and returns
4728 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4729 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
4730 assuming each ranges are successive, and mbclen
4731 never change in each ranges.
4732 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4733 character.
4734 */
4735static enum neighbor_char
4736enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4737{
4738 enum neighbor_char ret;
4739 unsigned int c;
4740 int ctype;
4741 int range;
4742 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4743
4744 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4745 int try;
4746 const int max_gaps = 1;
4747
4748 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4749 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4750 ctype = ONIGENC_CTYPE_DIGIT;
4751 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4752 ctype = ONIGENC_CTYPE_ALPHA;
4753 else
4754 return NEIGHBOR_NOT_CHAR;
4755
4756 MEMCPY(save, p, char, len);
4757 for (try = 0; try <= max_gaps; ++try) {
4758 ret = enc_succ_char(p, len, enc);
4759 if (ret == NEIGHBOR_FOUND) {
4760 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4761 if (rb_enc_isctype(c, ctype, enc))
4762 return NEIGHBOR_FOUND;
4763 }
4764 }
4765 MEMCPY(p, save, char, len);
4766 range = 1;
4767 while (1) {
4768 MEMCPY(save, p, char, len);
4769 ret = enc_pred_char(p, len, enc);
4770 if (ret == NEIGHBOR_FOUND) {
4771 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4772 if (!rb_enc_isctype(c, ctype, enc)) {
4773 MEMCPY(p, save, char, len);
4774 break;
4775 }
4776 }
4777 else {
4778 MEMCPY(p, save, char, len);
4779 break;
4780 }
4781 range++;
4782 }
4783 if (range == 1) {
4784 return NEIGHBOR_NOT_CHAR;
4785 }
4786
4787 if (ctype != ONIGENC_CTYPE_DIGIT) {
4788 MEMCPY(carry, p, char, len);
4789 return NEIGHBOR_WRAPPED;
4790 }
4791
4792 MEMCPY(carry, p, char, len);
4793 enc_succ_char(carry, len, enc);
4794 return NEIGHBOR_WRAPPED;
4795}
4796
4797
4798static VALUE str_succ(VALUE str);
4799
4800/*
4801 * call-seq:
4802 * succ -> new_str
4803 *
4804 * Returns the successor to +self+. The successor is calculated by
4805 * incrementing characters.
4806 *
4807 * The first character to be incremented is the rightmost alphanumeric,
4808 * or, if no alphanumerics, the rightmost character:
4809 *
4810 * 'THX1138'.succ # => "THX1139"
4811 * '<<koala>>'.succ # => "<<koalb>>"
4812 * '***'.succ # => '**+'
4813 *
4814 * The successor to a digit is another digit, "carrying" to the next-left
4815 * character for a "rollover" from 9 to 0, and prepending another digit
4816 * if necessary:
4817 *
4818 * '00'.succ # => "01"
4819 * '09'.succ # => "10"
4820 * '99'.succ # => "100"
4821 *
4822 * The successor to a letter is another letter of the same case,
4823 * carrying to the next-left character for a rollover,
4824 * and prepending another same-case letter if necessary:
4825 *
4826 * 'aa'.succ # => "ab"
4827 * 'az'.succ # => "ba"
4828 * 'zz'.succ # => "aaa"
4829 * 'AA'.succ # => "AB"
4830 * 'AZ'.succ # => "BA"
4831 * 'ZZ'.succ # => "AAA"
4832 *
4833 * The successor to a non-alphanumeric character is the next character
4834 * in the underlying character set's collating sequence,
4835 * carrying to the next-left character for a rollover,
4836 * and prepending another character if necessary:
4837 *
4838 * s = 0.chr * 3
4839 * s # => "\x00\x00\x00"
4840 * s.succ # => "\x00\x00\x01"
4841 * s = 255.chr * 3
4842 * s # => "\xFF\xFF\xFF"
4843 * s.succ # => "\x01\x00\x00\x00"
4844 *
4845 * Carrying can occur between and among mixtures of alphanumeric characters:
4846 *
4847 * s = 'zz99zz99'
4848 * s.succ # => "aaa00aa00"
4849 * s = '99zz99zz'
4850 * s.succ # => "100aa00aa"
4851 *
4852 * The successor to an empty \String is a new empty \String:
4853 *
4854 * ''.succ # => ""
4855 *
4856 */
4857
4858VALUE
4860{
4861 VALUE str;
4862 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4863 rb_enc_cr_str_copy_for_substr(str, orig);
4864 return str_succ(str);
4865}
4866
4867static VALUE
4868str_succ(VALUE str)
4869{
4870 rb_encoding *enc;
4871 char *sbeg, *s, *e, *last_alnum = 0;
4872 int found_alnum = 0;
4873 long l, slen;
4874 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4875 long carry_pos = 0, carry_len = 1;
4876 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4877
4878 slen = RSTRING_LEN(str);
4879 if (slen == 0) return str;
4880
4881 enc = STR_ENC_GET(str);
4882 sbeg = RSTRING_PTR(str);
4883 s = e = sbeg + slen;
4884
4885 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4886 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4887 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4888 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4889 break;
4890 }
4891 }
4892 l = rb_enc_precise_mbclen(s, e, enc);
4893 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4894 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4895 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4896 switch (neighbor) {
4897 case NEIGHBOR_NOT_CHAR:
4898 continue;
4899 case NEIGHBOR_FOUND:
4900 return str;
4901 case NEIGHBOR_WRAPPED:
4902 last_alnum = s;
4903 break;
4904 }
4905 found_alnum = 1;
4906 carry_pos = s - sbeg;
4907 carry_len = l;
4908 }
4909 if (!found_alnum) { /* str contains no alnum */
4910 s = e;
4911 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4912 enum neighbor_char neighbor;
4913 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4914 l = rb_enc_precise_mbclen(s, e, enc);
4915 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4916 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4917 MEMCPY(tmp, s, char, l);
4918 neighbor = enc_succ_char(tmp, l, enc);
4919 switch (neighbor) {
4920 case NEIGHBOR_FOUND:
4921 MEMCPY(s, tmp, char, l);
4922 return str;
4923 break;
4924 case NEIGHBOR_WRAPPED:
4925 MEMCPY(s, tmp, char, l);
4926 break;
4927 case NEIGHBOR_NOT_CHAR:
4928 break;
4929 }
4930 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4931 /* wrapped to \0...\0. search next valid char. */
4932 enc_succ_char(s, l, enc);
4933 }
4934 if (!rb_enc_asciicompat(enc)) {
4935 MEMCPY(carry, s, char, l);
4936 carry_len = l;
4937 }
4938 carry_pos = s - sbeg;
4939 }
4941 }
4942 RESIZE_CAPA(str, slen + carry_len);
4943 sbeg = RSTRING_PTR(str);
4944 s = sbeg + carry_pos;
4945 memmove(s + carry_len, s, slen - carry_pos);
4946 memmove(s, carry, carry_len);
4947 slen += carry_len;
4948 STR_SET_LEN(str, slen);
4949 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4950 rb_enc_str_coderange(str);
4951 return str;
4952}
4953
4954
4955/*
4956 * call-seq:
4957 * succ! -> self
4958 *
4959 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4960 */
4961
4962static VALUE
4963rb_str_succ_bang(VALUE str)
4964{
4965 rb_str_modify(str);
4966 str_succ(str);
4967 return str;
4968}
4969
/*
 * Returns 1 when each of the +len+ bytes starting at +s+ satisfies ISDIGIT,
 * 0 as soon as one does not.  A non-positive +len+ yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
4979
4980static int
4981str_upto_i(VALUE str, VALUE arg)
4982{
4983 rb_yield(str);
4984 return 0;
4985}
4986
4987/*
4988 * call-seq:
4989 * upto(other_string, exclusive = false) {|string| ... } -> self
4990 * upto(other_string, exclusive = false) -> new_enumerator
4991 *
4992 * With a block given, calls the block with each \String value
4993 * returned by successive calls to String#succ;
4994 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4995 * the sequence terminates when value +other_string+ is reached;
4996 * returns +self+:
4997 *
4998 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4999 * Output:
5000 *
5001 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5002 *
5003 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5004 *
5005 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5006 *
5007 * Output:
5008 *
5009 * a8 a9 b0 b1 b2 b3 b4 b5
5010 *
5011 * If +other_string+ would not be reached, does not call the block:
5012 *
5013 * '25'.upto('5') {|s| fail s }
5014 * 'aa'.upto('a') {|s| fail s }
5015 *
5016 * With no block given, returns a new Enumerator:
5017 *
5018 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5019 *
5020 */
5021
5022static VALUE
5023rb_str_upto(int argc, VALUE *argv, VALUE beg)
5024{
5025 VALUE end, exclusive;
5026
5027 rb_scan_args(argc, argv, "11", &end, &exclusive);
5028 RETURN_ENUMERATOR(beg, argc, argv);
5029 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5030}
5031
5032VALUE
5033rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5034{
5035 VALUE current, after_end;
5036 ID succ;
5037 int n, ascii;
5038 rb_encoding *enc;
5039
5040 CONST_ID(succ, "succ");
5041 StringValue(end);
5042 enc = rb_enc_check(beg, end);
5043 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5044 /* single character */
5045 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5046 char c = RSTRING_PTR(beg)[0];
5047 char e = RSTRING_PTR(end)[0];
5048
5049 if (c > e || (excl && c == e)) return beg;
5050 for (;;) {
5051 if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
5052 if (!excl && c == e) break;
5053 c++;
5054 if (excl && c == e) break;
5055 }
5056 return beg;
5057 }
5058 /* both edges are all digits */
5059 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5060 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5061 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5062 VALUE b, e;
5063 int width;
5064
5065 width = RSTRING_LENINT(beg);
5066 b = rb_str_to_inum(beg, 10, FALSE);
5067 e = rb_str_to_inum(end, 10, FALSE);
5068 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5069 long bi = FIX2LONG(b);
5070 long ei = FIX2LONG(e);
5071 rb_encoding *usascii = rb_usascii_encoding();
5072
5073 while (bi <= ei) {
5074 if (excl && bi == ei) break;
5075 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5076 bi++;
5077 }
5078 }
5079 else {
5080 ID op = excl ? '<' : idLE;
5081 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5082
5083 args[0] = INT2FIX(width);
5084 while (rb_funcall(b, op, 1, e)) {
5085 args[1] = b;
5086 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5087 b = rb_funcallv(b, succ, 0, 0);
5088 }
5089 }
5090 return beg;
5091 }
5092 /* normal case */
5093 n = rb_str_cmp(beg, end);
5094 if (n > 0 || (excl && n == 0)) return beg;
5095
5096 after_end = rb_funcallv(end, succ, 0, 0);
5097 current = str_duplicate(rb_cString, beg);
5098 while (!rb_str_equal(current, after_end)) {
5099 VALUE next = Qnil;
5100 if (excl || !rb_str_equal(current, end))
5101 next = rb_funcallv(current, succ, 0, 0);
5102 if ((*each)(current, arg)) break;
5103 if (NIL_P(next)) break;
5104 current = next;
5105 StringValue(current);
5106 if (excl && rb_str_equal(current, end)) break;
5107 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5108 break;
5109 }
5110
5111 return beg;
5112}
5113
5114VALUE
5115rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5116{
5117 VALUE current;
5118 ID succ;
5119
5120 CONST_ID(succ, "succ");
5121 /* both edges are all digits */
5122 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5123 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5124 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5125 int width = RSTRING_LENINT(beg);
5126 b = rb_str_to_inum(beg, 10, FALSE);
5127 if (FIXNUM_P(b)) {
5128 long bi = FIX2LONG(b);
5129 rb_encoding *usascii = rb_usascii_encoding();
5130
5131 while (FIXABLE(bi)) {
5132 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5133 bi++;
5134 }
5135 b = LONG2NUM(bi);
5136 }
5137 args[0] = INT2FIX(width);
5138 while (1) {
5139 args[1] = b;
5140 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5141 b = rb_funcallv(b, succ, 0, 0);
5142 }
5143 }
5144 /* normal case */
5145 current = str_duplicate(rb_cString, beg);
5146 while (1) {
5147 VALUE next = rb_funcallv(current, succ, 0, 0);
5148 if ((*each)(current, arg)) break;
5149 current = next;
5150 StringValue(current);
5151 if (RSTRING_LEN(current) == 0)
5152 break;
5153 }
5154
5155 return beg;
5156}
5157
5158static int
5159include_range_i(VALUE str, VALUE arg)
5160{
5161 VALUE *argp = (VALUE *)arg;
5162 if (!rb_equal(str, *argp)) return 0;
5163 *argp = Qnil;
5164 return 1;
5165}
5166
5167VALUE
5168rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5169{
5170 beg = rb_str_new_frozen(beg);
5171 StringValue(end);
5172 end = rb_str_new_frozen(end);
5173 if (NIL_P(val)) return Qfalse;
5174 val = rb_check_string_type(val);
5175 if (NIL_P(val)) return Qfalse;
5176 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5177 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5178 rb_enc_asciicompat(STR_ENC_GET(val))) {
5179 const char *bp = RSTRING_PTR(beg);
5180 const char *ep = RSTRING_PTR(end);
5181 const char *vp = RSTRING_PTR(val);
5182 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5183 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5184 return Qfalse;
5185 else {
5186 char b = *bp;
5187 char e = *ep;
5188 char v = *vp;
5189
5190 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5191 if (b <= v && v < e) return Qtrue;
5192 return RBOOL(!RTEST(exclusive) && v == e);
5193 }
5194 }
5195 }
5196#if 0
5197 /* both edges are all digits */
5198 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5199 all_digits_p(bp, RSTRING_LEN(beg)) &&
5200 all_digits_p(ep, RSTRING_LEN(end))) {
5201 /* TODO */
5202 }
5203#endif
5204 }
5205 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5206
5207 return RBOOL(NIL_P(val));
5208}
5209
5210static VALUE
5211rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5212{
5213 if (rb_reg_search(re, str, 0, 0) >= 0) {
5214 VALUE match = rb_backref_get();
5215 int nth = rb_reg_backref_number(match, backref);
5216 return rb_reg_nth_match(nth, match);
5217 }
5218 return Qnil;
5219}
5220
5221static VALUE
5222rb_str_aref(VALUE str, VALUE indx)
5223{
5224 long idx;
5225
5226 if (FIXNUM_P(indx)) {
5227 idx = FIX2LONG(indx);
5228 }
5229 else if (RB_TYPE_P(indx, T_REGEXP)) {
5230 return rb_str_subpat(str, indx, INT2FIX(0));
5231 }
5232 else if (RB_TYPE_P(indx, T_STRING)) {
5233 if (rb_str_index(str, indx, 0) != -1)
5234 return str_duplicate(rb_cString, indx);
5235 return Qnil;
5236 }
5237 else {
5238 /* check if indx is Range */
5239 long beg, len = str_strlen(str, NULL);
5240 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5241 case Qfalse:
5242 break;
5243 case Qnil:
5244 return Qnil;
5245 default:
5246 return rb_str_substr(str, beg, len);
5247 }
5248 idx = NUM2LONG(indx);
5249 }
5250
5251 return str_substr(str, idx, 1, FALSE);
5252}
5253
5254
5255/*
5256 * call-seq:
5257 * string[index] -> new_string or nil
5258 * string[start, length] -> new_string or nil
5259 * string[range] -> new_string or nil
5260 * string[regexp, capture = 0] -> new_string or nil
5261 * string[substring] -> new_string or nil
5262 *
5263 * Returns the substring of +self+ specified by the arguments.
5264 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5265 *
5266 *
5267 */
5268
5269static VALUE
5270rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5271{
5272 if (argc == 2) {
5273 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5274 return rb_str_subpat(str, argv[0], argv[1]);
5275 }
5276 else {
5277 long beg = NUM2LONG(argv[0]);
5278 long len = NUM2LONG(argv[1]);
5279 return rb_str_substr(str, beg, len);
5280 }
5281 }
5282 rb_check_arity(argc, 1, 2);
5283 return rb_str_aref(str, argv[0]);
5284}
5285
5286VALUE
5288{
5289 char *ptr = RSTRING_PTR(str);
5290 long olen = RSTRING_LEN(str), nlen;
5291
5292 str_modifiable(str);
5293 if (len > olen) len = olen;
5294 nlen = olen - len;
5295 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5296 char *oldptr = ptr;
5297 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5298 STR_SET_EMBED(str);
5299 ptr = RSTRING(str)->as.embed.ary;
5300 memmove(ptr, oldptr + len, nlen);
5301 if (fl == STR_NOEMBED) xfree(oldptr);
5302 }
5303 else {
5304 if (!STR_SHARED_P(str)) {
5305 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5306 rb_enc_cr_str_exact_copy(shared, str);
5307 OBJ_FREEZE(shared);
5308 }
5309 ptr = RSTRING(str)->as.heap.ptr += len;
5310 }
5311 STR_SET_LEN(str, nlen);
5312
5313 if (!SHARABLE_MIDDLE_SUBSTRING) {
5314 TERM_FILL(ptr + nlen, TERM_LEN(str));
5315 }
5317 return str;
5318}
5319
5320static void
5321rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5322{
5323 char *sptr;
5324 long slen;
5325 int cr;
5326
5327 if (beg == 0 && vlen == 0) {
5328 rb_str_drop_bytes(str, len);
5329 return;
5330 }
5331
5332 str_modify_keep_cr(str);
5333 RSTRING_GETMEM(str, sptr, slen);
5334 if (len < vlen) {
5335 /* expand string */
5336 RESIZE_CAPA(str, slen + vlen - len);
5337 sptr = RSTRING_PTR(str);
5338 }
5339
5341 cr = rb_enc_str_coderange(val);
5342 else
5344
5345 if (vlen != len) {
5346 memmove(sptr + beg + vlen,
5347 sptr + beg + len,
5348 slen - (beg + len));
5349 }
5350 if (vlen < beg && len < 0) {
5351 MEMZERO(sptr + slen, char, -len);
5352 }
5353 if (vlen > 0) {
5354 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5355 }
5356 slen += vlen - len;
5357 STR_SET_LEN(str, slen);
5358 TERM_FILL(&sptr[slen], TERM_LEN(str));
5359 ENC_CODERANGE_SET(str, cr);
5360}
5361
5362static inline void
5363rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5364{
5365 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5366}
5367
5368void
5369rb_str_update(VALUE str, long beg, long len, VALUE val)
5370{
5371 long slen;
5372 char *p, *e;
5373 rb_encoding *enc;
5374 int singlebyte = single_byte_optimizable(str);
5375 int cr;
5376
5377 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5378
5379 StringValue(val);
5380 enc = rb_enc_check(str, val);
5381 slen = str_strlen(str, enc); /* rb_enc_check */
5382
5383 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5384 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5385 }
5386 if (beg < 0) {
5387 beg += slen;
5388 }
5389 assert(beg >= 0);
5390 assert(beg <= slen);
5391 if (len > slen - beg) {
5392 len = slen - beg;
5393 }
5394 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5395 if (!p) p = RSTRING_END(str);
5396 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5397 if (!e) e = RSTRING_END(str);
5398 /* error check */
5399 beg = p - RSTRING_PTR(str); /* physical position */
5400 len = e - p; /* physical length */
5401 rb_str_update_0(str, beg, len, val);
5402 rb_enc_associate(str, enc);
5404 if (cr != ENC_CODERANGE_BROKEN)
5405 ENC_CODERANGE_SET(str, cr);
5406}
5407
5408static void
5409rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5410{
5411 int nth;
5412 VALUE match;
5413 long start, end, len;
5414 rb_encoding *enc;
5415 struct re_registers *regs;
5416
5417 if (rb_reg_search(re, str, 0, 0) < 0) {
5418 rb_raise(rb_eIndexError, "regexp not matched");
5419 }
5420 match = rb_backref_get();
5421 nth = rb_reg_backref_number(match, backref);
5422 regs = RMATCH_REGS(match);
5423 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5424 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5425 }
5426 if (nth < 0) {
5427 nth += regs->num_regs;
5428 }
5429
5430 start = BEG(nth);
5431 if (start == -1) {
5432 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5433 }
5434 end = END(nth);
5435 len = end - start;
5436 StringValue(val);
5437 enc = rb_enc_check_str(str, val);
5438 rb_str_update_0(str, start, len, val);
5439 rb_enc_associate(str, enc);
5440}
5441
5442static VALUE
5443rb_str_aset(VALUE str, VALUE indx, VALUE val)
5444{
5445 long idx, beg;
5446
5447 switch (TYPE(indx)) {
5448 case T_REGEXP:
5449 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5450 return val;
5451
5452 case T_STRING:
5453 beg = rb_str_index(str, indx, 0);
5454 if (beg < 0) {
5455 rb_raise(rb_eIndexError, "string not matched");
5456 }
5457 beg = rb_str_sublen(str, beg);
5458 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5459 return val;
5460
5461 default:
5462 /* check if indx is Range */
5463 {
5464 long beg, len;
5465 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5466 rb_str_update(str, beg, len, val);
5467 return val;
5468 }
5469 }
5470 /* FALLTHROUGH */
5471
5472 case T_FIXNUM:
5473 idx = NUM2LONG(indx);
5474 rb_str_update(str, idx, 1, val);
5475 return val;
5476 }
5477}
5478
5479/*
5480 * call-seq:
5481 * string[index] = new_string
5482 * string[start, length] = new_string
5483 * string[range] = new_string
5484 * string[regexp, capture = 0] = new_string
5485 * string[substring] = new_string
5486 *
5487 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5488 * See {String Slices}[rdoc-ref:String@String+Slices].
5489 *
5490 * A few examples:
5491 *
5492 * s = 'foo'
5493 * s[2] = 'rtune' # => "rtune"
5494 * s # => "fortune"
5495 * s[1, 5] = 'init' # => "init"
5496 * s # => "finite"
5497 * s[3..4] = 'al' # => "al"
5498 * s # => "finale"
5499 * s[/e$/] = 'ly' # => "ly"
5500 * s # => "finally"
5501 * s['lly'] = 'ncial' # => "ncial"
5502 * s # => "financial"
5503 *
5504 */
5505
5506static VALUE
5507rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5508{
5509 if (argc == 3) {
5510 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5511 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5512 }
5513 else {
5514 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5515 }
5516 return argv[2];
5517 }
5518 rb_check_arity(argc, 2, 3);
5519 return rb_str_aset(str, argv[0], argv[1]);
5520}
5521
5522/*
5523 * call-seq:
5524 * insert(index, other_string) -> self
5525 *
5526 * Inserts the given +other_string+ into +self+; returns +self+.
5527 *
5528 * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
5529 *
5530 * 'foo'.insert(1, 'bar') # => "fbaroo"
5531 *
5532 * If the Integer +index+ is negative, counts backward from the end of +self+
5533 * and inserts +other_string+ at offset <tt>index+1</tt>
5534 * (that is, _after_ <tt>self[index]</tt>):
5535 *
5536 * 'foo'.insert(-2, 'bar') # => "fobaro"
5537 *
5538 */
5539
5540static VALUE
5541rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5542{
5543 long pos = NUM2LONG(idx);
5544
5545 if (pos == -1) {
5546 return rb_str_append(str, str2);
5547 }
5548 else if (pos < 0) {
5549 pos++;
5550 }
5551 rb_str_update(str, pos, 0, str2);
5552 return str;
5553}
5554
5555
5556/*
5557 * call-seq:
5558 * slice!(index) -> new_string or nil
5559 * slice!(start, length) -> new_string or nil
5560 * slice!(range) -> new_string or nil
5561 * slice!(regexp, capture = 0) -> new_string or nil
5562 * slice!(substring) -> new_string or nil
5563 *
5564 * Removes and returns the substring of +self+ specified by the arguments.
5565 * See {String Slices}[rdoc-ref:String@String+Slices].
5566 *
5567 * A few examples:
5568 *
5569 * string = "This is a string"
5570 * string.slice!(2) #=> "i"
5571 * string.slice!(3..6) #=> " is "
5572 * string.slice!(/s.*t/) #=> "sa st"
5573 * string.slice!("r") #=> "r"
5574 * string #=> "Thing"
5575 *
5576 */
5577
5578static VALUE
5579rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5580{
5581 VALUE result = Qnil;
5582 VALUE indx;
5583 long beg, len = 1;
5584 char *p;
5585
5586 rb_check_arity(argc, 1, 2);
5587 str_modify_keep_cr(str);
5588 indx = argv[0];
5589 if (RB_TYPE_P(indx, T_REGEXP)) {
5590 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5591 VALUE match = rb_backref_get();
5592 struct re_registers *regs = RMATCH_REGS(match);
5593 int nth = 0;
5594 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
5595 if ((nth += regs->num_regs) <= 0) return Qnil;
5596 }
5597 else if (nth >= regs->num_regs) return Qnil;
5598 beg = BEG(nth);
5599 len = END(nth) - beg;
5600 goto subseq;
5601 }
5602 else if (argc == 2) {
5603 beg = NUM2LONG(indx);
5604 len = NUM2LONG(argv[1]);
5605 goto num_index;
5606 }
5607 else if (FIXNUM_P(indx)) {
5608 beg = FIX2LONG(indx);
5609 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5610 if (!len) return Qnil;
5611 beg = p - RSTRING_PTR(str);
5612 goto subseq;
5613 }
5614 else if (RB_TYPE_P(indx, T_STRING)) {
5615 beg = rb_str_index(str, indx, 0);
5616 if (beg == -1) return Qnil;
5617 len = RSTRING_LEN(indx);
5618 result = str_duplicate(rb_cString, indx);
5619 goto squash;
5620 }
5621 else {
5622 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
5623 case Qnil:
5624 return Qnil;
5625 case Qfalse:
5626 beg = NUM2LONG(indx);
5627 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5628 if (!len) return Qnil;
5629 beg = p - RSTRING_PTR(str);
5630 goto subseq;
5631 default:
5632 goto num_index;
5633 }
5634 }
5635
5636 num_index:
5637 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5638 beg = p - RSTRING_PTR(str);
5639
5640 subseq:
5641 result = rb_str_new(RSTRING_PTR(str)+beg, len);
5642 rb_enc_cr_str_copy_for_substr(result, str);
5643
5644 squash:
5645 if (len > 0) {
5646 if (beg == 0) {
5647 rb_str_drop_bytes(str, len);
5648 }
5649 else {
5650 char *sptr = RSTRING_PTR(str);
5651 long slen = RSTRING_LEN(str);
5652 if (beg + len > slen) /* pathological check */
5653 len = slen - beg;
5654 memmove(sptr + beg,
5655 sptr + beg + len,
5656 slen - (beg + len));
5657 slen -= len;
5658 STR_SET_LEN(str, slen);
5659 TERM_FILL(&sptr[slen], TERM_LEN(str));
5660 }
5661 }
5662 return result;
5663}
5664
5665static VALUE
5666get_pat(VALUE pat)
5667{
5668 VALUE val;
5669
5670 switch (OBJ_BUILTIN_TYPE(pat)) {
5671 case T_REGEXP:
5672 return pat;
5673
5674 case T_STRING:
5675 break;
5676
5677 default:
5678 val = rb_check_string_type(pat);
5679 if (NIL_P(val)) {
5680 Check_Type(pat, T_REGEXP);
5681 }
5682 pat = val;
5683 }
5684
5685 return rb_reg_regcomp(pat);
5686}
5687
5688static VALUE
5689get_pat_quoted(VALUE pat, int check)
5690{
5691 VALUE val;
5692
5693 switch (OBJ_BUILTIN_TYPE(pat)) {
5694 case T_REGEXP:
5695 return pat;
5696
5697 case T_STRING:
5698 break;
5699
5700 default:
5701 val = rb_check_string_type(pat);
5702 if (NIL_P(val)) {
5703 Check_Type(pat, T_REGEXP);
5704 }
5705 pat = val;
5706 }
5707 if (check && is_broken_string(pat)) {
5708 rb_exc_raise(rb_reg_check_preprocess(pat));
5709 }
5710 return pat;
5711}
5712
5713static long
5714rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5715{
5716 if (BUILTIN_TYPE(pat) == T_STRING) {
5717 pos = rb_str_byteindex(str, pat, pos);
5718 if (set_backref_str) {
5719 if (pos >= 0) {
5720 str = rb_str_new_frozen_String(str);
5721 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5722 }
5723 else {
5725 }
5726 }
5727 return pos;
5728 }
5729 else {
5730 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5731 }
5732}
5733
5734
5735/*
5736 * call-seq:
5737 * sub!(pattern, replacement) -> self or nil
5738 * sub!(pattern) {|match| ... } -> self or nil
5739 *
5740 * Returns +self+ with only the first occurrence
5741 * (not all occurrences) of the given +pattern+ replaced.
5742 *
5743 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5744 *
5745 * Related: String#sub, String#gsub, String#gsub!.
5746 *
5747 */
5748
5749static VALUE
5750rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5751{
5752 VALUE pat, repl, hash = Qnil;
5753 int iter = 0;
5754 long plen;
5755 int min_arity = rb_block_given_p() ? 1 : 2;
5756 long beg;
5757
5758 rb_check_arity(argc, min_arity, 2);
5759 if (argc == 1) {
5760 iter = 1;
5761 }
5762 else {
5763 repl = argv[1];
5764 hash = rb_check_hash_type(argv[1]);
5765 if (NIL_P(hash)) {
5766 StringValue(repl);
5767 }
5768 }
5769
5770 pat = get_pat_quoted(argv[0], 1);
5771
5772 str_modifiable(str);
5773 beg = rb_pat_search(pat, str, 0, 1);
5774 if (beg >= 0) {
5775 rb_encoding *enc;
5776 int cr = ENC_CODERANGE(str);
5777 long beg0, end0;
5778 VALUE match, match0 = Qnil;
5779 struct re_registers *regs;
5780 char *p, *rp;
5781 long len, rlen;
5782
5783 match = rb_backref_get();
5784 regs = RMATCH_REGS(match);
5785 if (RB_TYPE_P(pat, T_STRING)) {
5786 beg0 = beg;
5787 end0 = beg0 + RSTRING_LEN(pat);
5788 match0 = pat;
5789 }
5790 else {
5791 beg0 = BEG(0);
5792 end0 = END(0);
5793 if (iter) match0 = rb_reg_nth_match(0, match);
5794 }
5795
5796 if (iter || !NIL_P(hash)) {
5797 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5798
5799 if (iter) {
5800 repl = rb_obj_as_string(rb_yield(match0));
5801 }
5802 else {
5803 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5804 repl = rb_obj_as_string(repl);
5805 }
5806 str_mod_check(str, p, len);
5807 rb_check_frozen(str);
5808 }
5809 else {
5810 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5811 }
5812
5813 enc = rb_enc_compatible(str, repl);
5814 if (!enc) {
5815 rb_encoding *str_enc = STR_ENC_GET(str);
5816 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5817 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5818 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5819 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5820 rb_enc_name(str_enc),
5821 rb_enc_name(STR_ENC_GET(repl)));
5822 }
5823 enc = STR_ENC_GET(repl);
5824 }
5825 rb_str_modify(str);
5826 rb_enc_associate(str, enc);
5828 int cr2 = ENC_CODERANGE(repl);
5829 if (cr2 == ENC_CODERANGE_BROKEN ||
5830 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5832 else
5833 cr = cr2;
5834 }
5835 plen = end0 - beg0;
5836 rlen = RSTRING_LEN(repl);
5837 len = RSTRING_LEN(str);
5838 if (rlen > plen) {
5839 RESIZE_CAPA(str, len + rlen - plen);
5840 }
5841 p = RSTRING_PTR(str);
5842 if (rlen != plen) {
5843 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5844 }
5845 rp = RSTRING_PTR(repl);
5846 memmove(p + beg0, rp, rlen);
5847 len += rlen - plen;
5848 STR_SET_LEN(str, len);
5849 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5850 ENC_CODERANGE_SET(str, cr);
5851
5852 RB_GC_GUARD(match);
5853
5854 return str;
5855 }
5856 return Qnil;
5857}
5858
5859
5860/*
5861 * call-seq:
5862 * sub(pattern, replacement) -> new_string
5863 * sub(pattern) {|match| ... } -> new_string
5864 *
5865 * Returns a copy of +self+ with only the first occurrence
5866 * (not all occurrences) of the given +pattern+ replaced.
5867 *
5868 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5869 *
5870 * Related: String#sub!, String#gsub, String#gsub!.
5871 *
5872 */
5873
5874static VALUE
5875rb_str_sub(int argc, VALUE *argv, VALUE str)
5876{
5877 str = str_duplicate(rb_cString, str);
5878 rb_str_sub_bang(argc, argv, str);
5879 return str;
5880}
5881
5882static VALUE
5883str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5884{
5885 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
5886 long beg, beg0, end0;
5887 long offset, blen, slen, len, last;
5888 enum {STR, ITER, MAP} mode = STR;
5889 char *sp, *cp;
5890 int need_backref = -1;
5891 rb_encoding *str_enc;
5892
5893 switch (argc) {
5894 case 1:
5895 RETURN_ENUMERATOR(str, argc, argv);
5896 mode = ITER;
5897 break;
5898 case 2:
5899 repl = argv[1];
5900 hash = rb_check_hash_type(argv[1]);
5901 if (NIL_P(hash)) {
5902 StringValue(repl);
5903 }
5904 else {
5905 mode = MAP;
5906 }
5907 break;
5908 default:
5909 rb_error_arity(argc, 1, 2);
5910 }
5911
5912 pat = get_pat_quoted(argv[0], 1);
5913 beg = rb_pat_search(pat, str, 0, need_backref);
5914 if (beg < 0) {
5915 if (bang) return Qnil; /* no match, no substitution */
5916 return str_duplicate(rb_cString, str);
5917 }
5918
5919 offset = 0;
5920 blen = RSTRING_LEN(str) + 30; /* len + margin */
5921 dest = rb_str_buf_new(blen);
5922 sp = RSTRING_PTR(str);
5923 slen = RSTRING_LEN(str);
5924 cp = sp;
5925 str_enc = STR_ENC_GET(str);
5926 rb_enc_associate(dest, str_enc);
5927 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
5928
5929 do {
5930 VALUE match = rb_backref_get();
5931 struct re_registers *regs = RMATCH_REGS(match);
5932 if (RB_TYPE_P(pat, T_STRING)) {
5933 beg0 = beg;
5934 end0 = beg0 + RSTRING_LEN(pat);
5935 match0 = pat;
5936 }
5937 else {
5938 beg0 = BEG(0);
5939 end0 = END(0);
5940 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5941 }
5942
5943 if (mode) {
5944 if (mode == ITER) {
5945 val = rb_obj_as_string(rb_yield(match0));
5946 }
5947 else {
5948 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5949 val = rb_obj_as_string(val);
5950 }
5951 str_mod_check(str, sp, slen);
5952 if (val == dest) { /* paranoid check [ruby-dev:24827] */
5953 rb_raise(rb_eRuntimeError, "block should not cheat");
5954 }
5955 }
5956 else if (need_backref) {
5957 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5958 if (need_backref < 0) {
5959 need_backref = val != repl;
5960 }
5961 }
5962 else {
5963 val = repl;
5964 }
5965
5966 len = beg0 - offset; /* copy pre-match substr */
5967 if (len) {
5968 rb_enc_str_buf_cat(dest, cp, len, str_enc);
5969 }
5970
5971 rb_str_buf_append(dest, val);
5972
5973 last = offset;
5974 offset = end0;
5975 if (beg0 == end0) {
5976 /*
5977 * Always consume at least one character of the input string
5978 * in order to prevent infinite loops.
5979 */
5980 if (RSTRING_LEN(str) <= end0) break;
5981 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5982 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5983 offset = end0 + len;
5984 }
5985 cp = RSTRING_PTR(str) + offset;
5986 if (offset > RSTRING_LEN(str)) break;
5987 beg = rb_pat_search(pat, str, offset, need_backref);
5988
5989 RB_GC_GUARD(match);
5990 } while (beg >= 0);
5991 if (RSTRING_LEN(str) > offset) {
5992 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5993 }
5994 rb_pat_search(pat, str, last, 1);
5995 if (bang) {
5996 str_shared_replace(str, dest);
5997 }
5998 else {
5999 str = dest;
6000 }
6001
6002 return str;
6003}
6004
6005
6006/*
6007 * call-seq:
6008 * gsub!(pattern, replacement) -> self or nil
6009 * gsub!(pattern) {|match| ... } -> self or nil
6010 * gsub!(pattern) -> an_enumerator
6011 *
6012 * Performs the specified substring replacement(s) on +self+;
6013 * returns +self+ if any replacement occurred, +nil+ otherwise.
6014 *
6015 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6016 *
6017 * Returns an Enumerator if no +replacement+ and no block given.
6018 *
6019 * Related: String#sub, String#gsub, String#sub!.
6020 *
6021 */
6022
6023static VALUE
6024rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6025{
6026 str_modify_keep_cr(str);
6027 return str_gsub(argc, argv, str, 1);
6028}
6029
6030
6031/*
6032 * call-seq:
6033 * gsub(pattern, replacement) -> new_string
6034 * gsub(pattern) {|match| ... } -> new_string
6035 * gsub(pattern) -> enumerator
6036 *
6037 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6038 *
6039 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6040 *
6041 * Returns an Enumerator if no +replacement+ and no block given.
6042 *
6043 * Related: String#sub, String#sub!, String#gsub!.
6044 *
6045 */
6046
6047static VALUE
6048rb_str_gsub(int argc, VALUE *argv, VALUE str)
6049{
6050 return str_gsub(argc, argv, str, 0);
6051}
6052
6053
6054/*
6055 * call-seq:
6056 * replace(other_string) -> self
6057 *
6058 * Replaces the contents of +self+ with the contents of +other_string+:
6059 *
6060 * s = 'foo' # => "foo"
6061 * s.replace('bar') # => "bar"
6062 *
6063 */
6064
6065VALUE
6067{
6068 str_modifiable(str);
6069 if (str == str2) return str;
6070
6071 StringValue(str2);
6072 str_discard(str);
6073 return str_replace(str, str2);
6074}
6075
6076/*
6077 * call-seq:
6078 * clear -> self
6079 *
6080 * Removes the contents of +self+:
6081 *
6082 * s = 'foo' # => "foo"
6083 * s.clear # => ""
6084 *
6085 */
6086
6087static VALUE
6088rb_str_clear(VALUE str)
6089{
6090 str_discard(str);
6091 STR_SET_EMBED(str);
6092 STR_SET_LEN(str, 0);
6093 RSTRING_PTR(str)[0] = 0;
6094 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6096 else
6098 return str;
6099}
6100
6101/*
6102 * call-seq:
6103 * chr -> string
6104 *
6105 * Returns a string containing the first character of +self+:
6106 *
6107 * s = 'foo' # => "foo"
6108 * s.chr # => "f"
6109 *
6110 */
6111
6112static VALUE
6113rb_str_chr(VALUE str)
6114{
6115 return rb_str_substr(str, 0, 1);
6116}
6117
6118/*
6119 * call-seq:
6120 * getbyte(index) -> integer or nil
6121 *
6122 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6123 *
6124 * s = 'abcde' # => "abcde"
6125 * s.getbyte(0) # => 97
6126 * s.getbyte(-1) # => 101
6127 * s.getbyte(5) # => nil
6128 *
6129 * Related: String#setbyte.
6130 */
6131VALUE
6132rb_str_getbyte(VALUE str, VALUE index)
6133{
6134 long pos = NUM2LONG(index);
6135
6136 if (pos < 0)
6137 pos += RSTRING_LEN(str);
6138 if (pos < 0 || RSTRING_LEN(str) <= pos)
6139 return Qnil;
6140
6141 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6142}
6143
6144/*
6145 * call-seq:
6146 * setbyte(index, integer) -> integer
6147 *
6148 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6149 *
6150 * s = 'abcde' # => "abcde"
6151 * s.setbyte(0, 98) # => 98
6152 * s # => "bbcde"
6153 *
6154 * Related: String#getbyte.
6155 */
6156static VALUE
6157rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6158{
6159 long pos = NUM2LONG(index);
6160 long len = RSTRING_LEN(str);
6161 char *ptr, *head, *left = 0;
6162 rb_encoding *enc;
6163 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6164
6165 if (pos < -len || len <= pos)
6166 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6167 if (pos < 0)
6168 pos += len;
6169
6170 VALUE v = rb_to_int(value);
6171 VALUE w = rb_int_and(v, INT2FIX(0xff));
6172 char byte = (char)(NUM2INT(w) & 0xFF);
6173
6174 if (!str_independent(str))
6175 str_make_independent(str);
6176 enc = STR_ENC_GET(str);
6177 head = RSTRING_PTR(str);
6178 ptr = &head[pos];
6179 if (!STR_EMBED_P(str)) {
6180 cr = ENC_CODERANGE(str);
6181 switch (cr) {
6182 case ENC_CODERANGE_7BIT:
6183 left = ptr;
6184 *ptr = byte;
6185 if (ISASCII(byte)) goto end;
6186 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6187 if (!MBCLEN_CHARFOUND_P(nlen))
6189 else
6191 goto end;
6193 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6194 width = rb_enc_precise_mbclen(left, head+len, enc);
6195 *ptr = byte;
6196 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6197 if (!MBCLEN_CHARFOUND_P(nlen))
6199 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6201 goto end;
6202 }
6203 }
6205 *ptr = byte;
6206
6207 end:
6208 return value;
6209}
6210
6211static VALUE
6212str_byte_substr(VALUE str, long beg, long len, int empty)
6213{
6214 long n = RSTRING_LEN(str);
6215
6216 if (beg > n || len < 0) return Qnil;
6217 if (beg < 0) {
6218 beg += n;
6219 if (beg < 0) return Qnil;
6220 }
6221 if (len > n - beg)
6222 len = n - beg;
6223 if (len <= 0) {
6224 if (!empty) return Qnil;
6225 len = 0;
6226 }
6227
6228 VALUE str2 = str_subseq(str, beg, len);
6229
6230 str_enc_copy_direct(str2, str);
6231
6232 if (RSTRING_LEN(str2) == 0) {
6233 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6235 else
6237 }
6238 else {
6239 switch (ENC_CODERANGE(str)) {
6240 case ENC_CODERANGE_7BIT:
6242 break;
6243 default:
6245 break;
6246 }
6247 }
6248
6249 return str2;
6250}
6251
6252static VALUE
6253str_byte_aref(VALUE str, VALUE indx)
6254{
6255 long idx;
6256 if (FIXNUM_P(indx)) {
6257 idx = FIX2LONG(indx);
6258 }
6259 else {
6260 /* check if indx is Range */
6261 long beg, len = RSTRING_LEN(str);
6262
6263 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6264 case Qfalse:
6265 break;
6266 case Qnil:
6267 return Qnil;
6268 default:
6269 return str_byte_substr(str, beg, len, TRUE);
6270 }
6271
6272 idx = NUM2LONG(indx);
6273 }
6274 return str_byte_substr(str, idx, 1, FALSE);
6275}
6276
6277/*
6278 * call-seq:
6279 * byteslice(index, length = 1) -> string or nil
6280 * byteslice(range) -> string or nil
6281 *
6282 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6283 *
6284 * With integer arguments +index+ and +length+ given,
6285 * returns the substring beginning at the given +index+
6286 * of the given +length+ (if possible),
6287 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6288 *
6289 * s = '0123456789' # => "0123456789"
6290 * s.byteslice(2) # => "2"
6291 * s.byteslice(200) # => nil
6292 * s.byteslice(4, 3) # => "456"
6293 * s.byteslice(4, 30) # => "456789"
6294 * s.byteslice(4, -1) # => nil
6295 * s.byteslice(40, 2) # => nil
6296 *
6297 * In either case above, counts backwards from the end of +self+
6298 * if +index+ is negative:
6299 *
6300 * s = '0123456789' # => "0123456789"
6301 * s.byteslice(-4) # => "6"
6302 * s.byteslice(-4, 3) # => "678"
6303 *
6304 * With Range argument +range+ given, returns
6305 * <tt>byteslice(range.begin, range.size)</tt>:
6306 *
6307 * s = '0123456789' # => "0123456789"
6308 * s.byteslice(4..6) # => "456"
6309 * s.byteslice(-6..-4) # => "456"
6310 * s.byteslice(5..2) # => "" # range.size is zero.
6311 * s.byteslice(40..42) # => nil
6312 *
6313 * In all cases, a returned string has the same encoding as +self+:
6314 *
6315 * s.encoding # => #<Encoding:UTF-8>
6316 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6317 *
6318 */
6319
6320static VALUE
6321rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6322{
6323 if (argc == 2) {
6324 long beg = NUM2LONG(argv[0]);
6325 long len = NUM2LONG(argv[1]);
6326 return str_byte_substr(str, beg, len, TRUE);
6327 }
6328 rb_check_arity(argc, 1, 2);
6329 return str_byte_aref(str, argv[0]);
6330}
6331
6332static void
6333str_check_beg_len(VALUE str, long *beg, long *len)
6334{
6335 long end, slen = RSTRING_LEN(str);
6336
6337 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6338 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6339 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6340 }
6341 if (*beg < 0) {
6342 *beg += slen;
6343 }
6344 assert(*beg >= 0);
6345 assert(*beg <= slen);
6346 if (*len > slen - *beg) {
6347 *len = slen - *beg;
6348 }
6349 end = *beg + *len;
6350 str_ensure_byte_pos(str, *beg);
6351 str_ensure_byte_pos(str, end);
6352}
6353
6354/*
6355 * call-seq:
6356 * bytesplice(index, length, str) -> string
6357 * bytesplice(index, length, str, str_index, str_length) -> string
6358 * bytesplice(range, str) -> string
6359 * bytesplice(range, str, str_range) -> string
6360 *
6361 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6362 * The portion of the string affected is determined using
6363 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6364 * If the replacement string is not the same length as the text it is replacing,
6365 * the string will be adjusted accordingly.
6366 *
6367 * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
6368 *
6369 * The form that take an Integer will raise an IndexError if the value is out
6370 * of range; the Range form will raise a RangeError.
6371 * If the beginning or ending offset does not land on character (codepoint)
6372 * boundary, an IndexError will be raised.
6373 */
6374
6375static VALUE
6376rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6377{
6378 long beg, len, vbeg, vlen;
6379 VALUE val;
6380 rb_encoding *enc;
6381 int cr;
6382
6383 rb_check_arity(argc, 2, 5);
6384 if (!(argc == 2 || argc == 3 || argc == 5)) {
6385 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6386 }
6387 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6388 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6389 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6390 rb_builtin_class_name(argv[0]));
6391 }
6392 val = argv[1];
6393 StringValue(val);
6394 if (argc == 2) {
6395 /* bytesplice(range, str) */
6396 vbeg = 0;
6397 vlen = RSTRING_LEN(val);
6398 }
6399 else {
6400 /* bytesplice(range, str, str_range) */
6401 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6402 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6403 rb_builtin_class_name(argv[2]));
6404 }
6405 }
6406 }
6407 else {
6408 beg = NUM2LONG(argv[0]);
6409 len = NUM2LONG(argv[1]);
6410 val = argv[2];
6411 StringValue(val);
6412 if (argc == 3) {
6413 /* bytesplice(index, length, str) */
6414 vbeg = 0;
6415 vlen = RSTRING_LEN(val);
6416 }
6417 else {
6418 /* bytesplice(index, length, str, str_index, str_length) */
6419 vbeg = NUM2LONG(argv[3]);
6420 vlen = NUM2LONG(argv[4]);
6421 }
6422 }
6423 str_check_beg_len(str, &beg, &len);
6424 str_check_beg_len(val, &vbeg, &vlen);
6425 enc = rb_enc_check(str, val);
6426 str_modify_keep_cr(str);
6427 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6428 rb_enc_associate(str, enc);
6430 if (cr != ENC_CODERANGE_BROKEN)
6431 ENC_CODERANGE_SET(str, cr);
6432 return str;
6433}
6434
6435/*
6436 * call-seq:
6437 * reverse -> string
6438 *
6439 * Returns a new string with the characters from +self+ in reverse order.
6440 *
6441 * 'stressed'.reverse # => "desserts"
6442 *
6443 */
6444
6445static VALUE
6446rb_str_reverse(VALUE str)
6447{
6448 rb_encoding *enc;
6449 VALUE rev;
6450 char *s, *e, *p;
6451 int cr;
6452
6453 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6454 enc = STR_ENC_GET(str);
6455 rev = rb_str_new(0, RSTRING_LEN(str));
6456 s = RSTRING_PTR(str); e = RSTRING_END(str);
6457 p = RSTRING_END(rev);
6458 cr = ENC_CODERANGE(str);
6459
6460 if (RSTRING_LEN(str) > 1) {
6461 if (single_byte_optimizable(str)) {
6462 while (s < e) {
6463 *--p = *s++;
6464 }
6465 }
6466 else if (cr == ENC_CODERANGE_VALID) {
6467 while (s < e) {
6468 int clen = rb_enc_fast_mbclen(s, e, enc);
6469
6470 p -= clen;
6471 memcpy(p, s, clen);
6472 s += clen;
6473 }
6474 }
6475 else {
6476 cr = rb_enc_asciicompat(enc) ?
6478 while (s < e) {
6479 int clen = rb_enc_mbclen(s, e, enc);
6480
6481 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6482 p -= clen;
6483 memcpy(p, s, clen);
6484 s += clen;
6485 }
6486 }
6487 }
6488 STR_SET_LEN(rev, RSTRING_LEN(str));
6489 str_enc_copy_direct(rev, str);
6490 ENC_CODERANGE_SET(rev, cr);
6491
6492 return rev;
6493}
6494
6495
6496/*
6497 * call-seq:
6498 * reverse! -> self
6499 *
6500 * Returns +self+ with its characters reversed:
6501 *
6502 * s = 'stressed'
6503 * s.reverse! # => "desserts"
6504 * s # => "desserts"
6505 *
6506 */
6507
6508static VALUE
6509rb_str_reverse_bang(VALUE str)
6510{
6511 if (RSTRING_LEN(str) > 1) {
6512 if (single_byte_optimizable(str)) {
6513 char *s, *e, c;
6514
6515 str_modify_keep_cr(str);
6516 s = RSTRING_PTR(str);
6517 e = RSTRING_END(str) - 1;
6518 while (s < e) {
6519 c = *s;
6520 *s++ = *e;
6521 *e-- = c;
6522 }
6523 }
6524 else {
6525 str_shared_replace(str, rb_str_reverse(str));
6526 }
6527 }
6528 else {
6529 str_modify_keep_cr(str);
6530 }
6531 return str;
6532}
6533
6534
6535/*
6536 * call-seq:
6537 * include? other_string -> true or false
6538 *
6539 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6540 *
6541 * s = 'foo'
6542 * s.include?('f') # => true
6543 * s.include?('fo') # => true
6544 * s.include?('food') # => false
6545 *
6546 */
6547
6548VALUE
6549rb_str_include(VALUE str, VALUE arg)
6550{
6551 long i;
6552
6553 StringValue(arg);
6554 i = rb_str_index(str, arg, 0);
6555
6556 return RBOOL(i != -1);
6557}
6558
6559
6560/*
6561 * call-seq:
6562 * to_i(base = 10) -> integer
6563 *
6564 * Returns the result of interpreting leading characters in +self+
6565 * as an integer in the given +base+ (which must be in (0, 2..36)):
6566 *
6567 * '123456'.to_i # => 123456
6568 * '123def'.to_i(16) # => 1195503
6569 *
6570 * With +base+ zero, string +object+ may contain leading characters
6571 * to specify the actual base:
6572 *
6573 * '123def'.to_i(0) # => 123
6574 * '0123def'.to_i(0) # => 83
6575 * '0b123def'.to_i(0) # => 1
6576 * '0o123def'.to_i(0) # => 83
6577 * '0d123def'.to_i(0) # => 123
6578 * '0x123def'.to_i(0) # => 1195503
6579 *
6580 * Characters past a leading valid number (in the given +base+) are ignored:
6581 *
6582 * '12.345'.to_i # => 12
6583 * '12345'.to_i(2) # => 1
6584 *
6585 * Returns zero if there is no leading valid number:
6586 *
6587 * 'abcdef'.to_i # => 0
6588 * '2'.to_i(2) # => 0
6589 *
6590 */
6591
6592static VALUE
6593rb_str_to_i(int argc, VALUE *argv, VALUE str)
6594{
6595 int base = 10;
6596
6597 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6598 rb_raise(rb_eArgError, "invalid radix %d", base);
6599 }
6600 return rb_str_to_inum(str, base, FALSE);
6601}
6602
6603
6604/*
6605 * call-seq:
6606 * to_f -> float
6607 *
6608 * Returns the result of interpreting leading characters in +self+ as a Float:
6609 *
6610 * '3.14159'.to_f # => 3.14159
6611 * '1.234e-2'.to_f # => 0.01234
6612 *
6613 * Characters past a leading valid number (in the given +base+) are ignored:
6614 *
6615 * '3.14 (pi to two places)'.to_f # => 3.14
6616 *
6617 * Returns zero if there is no leading valid number:
6618 *
6619 * 'abcdef'.to_f # => 0.0
6620 *
6621 */
6622
6623static VALUE
6624rb_str_to_f(VALUE str)
6625{
6626 return DBL2NUM(rb_str_to_dbl(str, FALSE));
6627}
6628
6629
6630/*
6631 * call-seq:
6632 * to_s -> self or string
6633 *
6634 * Returns +self+ if +self+ is a \String,
6635 * or +self+ converted to a \String if +self+ is a subclass of \String.
6636 */
6637
6638static VALUE
6639rb_str_to_s(VALUE str)
6640{
6641 if (rb_obj_class(str) != rb_cString) {
6642 return str_duplicate(rb_cString, str);
6643 }
6644 return str;
6645}
6646
#if 0
/* Disabled helper: appends code point +c+, encoded in +enc+, to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif

6659#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6660
6661int
6662rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6663{
6664 char buf[CHAR_ESC_LEN + 1];
6665 int l;
6666
6667#if SIZEOF_INT > 4
6668 c &= 0xffffffff;
6669#endif
6670 if (unicode_p) {
6671 if (c < 0x7F && ISPRINT(c)) {
6672 snprintf(buf, CHAR_ESC_LEN, "%c", c);
6673 }
6674 else if (c < 0x10000) {
6675 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6676 }
6677 else {
6678 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6679 }
6680 }
6681 else {
6682 if (c < 0x100) {
6683 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6684 }
6685 else {
6686 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6687 }
6688 }
6689 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6690 rb_str_buf_cat(result, buf, l);
6691 return l;
6692}
6693
/*
 * Returns the backslash notation (as a static C string) for the control
 * characters that have one, or NULL when +c+ has no such short form.
 */
const char *
ruby_escaped_char(int c)
{
    switch (c) {
      case '\0': return "\\0";
      case '\n': return "\\n";
      case '\r': return "\\r";
      case '\t': return "\\t";
      case '\f': return "\\f";
      case '\013': return "\\v";
      case '\010': return "\\b";
      case '\007': return "\\a";
      case '\033': return "\\e";
      case '\x7f': return "\\c?";
    }
    return NULL;
}

6712VALUE
6713rb_str_escape(VALUE str)
6714{
6715 int encidx = ENCODING_GET(str);
6716 rb_encoding *enc = rb_enc_from_index(encidx);
6717 const char *p = RSTRING_PTR(str);
6718 const char *pend = RSTRING_END(str);
6719 const char *prev = p;
6720 char buf[CHAR_ESC_LEN + 1];
6721 VALUE result = rb_str_buf_new(0);
6722 int unicode_p = rb_enc_unicode_p(enc);
6723 int asciicompat = rb_enc_asciicompat(enc);
6724
6725 while (p < pend) {
6726 unsigned int c;
6727 const char *cc;
6728 int n = rb_enc_precise_mbclen(p, pend, enc);
6729 if (!MBCLEN_CHARFOUND_P(n)) {
6730 if (p > prev) str_buf_cat(result, prev, p - prev);
6731 n = rb_enc_mbminlen(enc);
6732 if (pend < p + n)
6733 n = (int)(pend - p);
6734 while (n--) {
6735 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6736 str_buf_cat(result, buf, strlen(buf));
6737 prev = ++p;
6738 }
6739 continue;
6740 }
6741 n = MBCLEN_CHARFOUND_LEN(n);
6742 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6743 p += n;
6744 cc = ruby_escaped_char(c);
6745 if (cc) {
6746 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6747 str_buf_cat(result, cc, strlen(cc));
6748 prev = p;
6749 }
6750 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6751 }
6752 else {
6753 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6754 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6755 prev = p;
6756 }
6757 }
6758 if (p > prev) str_buf_cat(result, prev, p - prev);
6759 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
6760
6761 return result;
6762}
6763
6764/*
6765 * call-seq:
6766 * inspect -> string
6767 *
6768 * Returns a printable version of +self+, enclosed in double-quotes,
6769 * and with special characters escaped:
6770 *
6771 * s = "foo\tbar\tbaz\n"
6772 * s.inspect
6773 * # => "\"foo\\tbar\\tbaz\\n\""
6774 *
6775 */
6776
6777VALUE
6779{
6780 int encidx = ENCODING_GET(str);
6781 rb_encoding *enc = rb_enc_from_index(encidx);
6782 const char *p, *pend, *prev;
6783 char buf[CHAR_ESC_LEN + 1];
6784 VALUE result = rb_str_buf_new(0);
6785 rb_encoding *resenc = rb_default_internal_encoding();
6786 int unicode_p = rb_enc_unicode_p(enc);
6787 int asciicompat = rb_enc_asciicompat(enc);
6788
6789 if (resenc == NULL) resenc = rb_default_external_encoding();
6790 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6791 rb_enc_associate(result, resenc);
6792 str_buf_cat2(result, "\"");
6793
6794 p = RSTRING_PTR(str); pend = RSTRING_END(str);
6795 prev = p;
6796 while (p < pend) {
6797 unsigned int c, cc;
6798 int n;
6799
6800 n = rb_enc_precise_mbclen(p, pend, enc);
6801 if (!MBCLEN_CHARFOUND_P(n)) {
6802 if (p > prev) str_buf_cat(result, prev, p - prev);
6803 n = rb_enc_mbminlen(enc);
6804 if (pend < p + n)
6805 n = (int)(pend - p);
6806 while (n--) {
6807 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6808 str_buf_cat(result, buf, strlen(buf));
6809 prev = ++p;
6810 }
6811 continue;
6812 }
6813 n = MBCLEN_CHARFOUND_LEN(n);
6814 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6815 p += n;
6816 if ((asciicompat || unicode_p) &&
6817 (c == '"'|| c == '\\' ||
6818 (c == '#' &&
6819 p < pend &&
6820 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
6821 (cc = rb_enc_codepoint(p,pend,enc),
6822 (cc == '$' || cc == '@' || cc == '{'))))) {
6823 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6824 str_buf_cat2(result, "\\");
6825 if (asciicompat || enc == resenc) {
6826 prev = p - n;
6827 continue;
6828 }
6829 }
6830 switch (c) {
6831 case '\n': cc = 'n'; break;
6832 case '\r': cc = 'r'; break;
6833 case '\t': cc = 't'; break;
6834 case '\f': cc = 'f'; break;
6835 case '\013': cc = 'v'; break;
6836 case '\010': cc = 'b'; break;
6837 case '\007': cc = 'a'; break;
6838 case 033: cc = 'e'; break;
6839 default: cc = 0; break;
6840 }
6841 if (cc) {
6842 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6843 buf[0] = '\\';
6844 buf[1] = (char)cc;
6845 str_buf_cat(result, buf, 2);
6846 prev = p;
6847 continue;
6848 }
6849 /* The special casing of 0x85 (NEXT_LINE) here is because
6850 * Oniguruma historically treats it as printable, but it
6851 * doesn't match the print POSIX bracket class or character
6852 * property in regexps.
6853 *
6854 * See Ruby Bug #16842 for details:
6855 * https://bugs.ruby-lang.org/issues/16842
6856 */
6857 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
6858 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6859 continue;
6860 }
6861 else {
6862 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6863 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6864 prev = p;
6865 continue;
6866 }
6867 }
6868 if (p > prev) str_buf_cat(result, prev, p - prev);
6869 str_buf_cat2(result, "\"");
6870
6871 return result;
6872}
6873
6874#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6875
6876/*
6877 * call-seq:
6878 * dump -> string
6879 *
6880 * Returns a printable version of +self+, enclosed in double-quotes,
6881 * with special characters escaped, and with non-printing characters
6882 * replaced by hexadecimal notation:
6883 *
6884 * "hello \n ''".dump # => "\"hello \\n ''\""
6885 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6886 *
6887 * Related: String#undump (inverse of String#dump).
6888 *
6889 */
6890
6891VALUE
6893{
6894 int encidx = rb_enc_get_index(str);
6895 rb_encoding *enc = rb_enc_from_index(encidx);
6896 long len;
6897 const char *p, *pend;
6898 char *q, *qend;
6899 VALUE result;
6900 int u8 = (encidx == rb_utf8_encindex());
6901 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6902
6903 len = 2; /* "" */
6904 if (!rb_enc_asciicompat(enc)) {
6905 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6906 len += strlen(enc->name);
6907 }
6908
6909 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6910 while (p < pend) {
6911 int clen;
6912 unsigned char c = *p++;
6913
6914 switch (c) {
6915 case '"': case '\\':
6916 case '\n': case '\r':
6917 case '\t': case '\f':
6918 case '\013': case '\010': case '\007': case '\033':
6919 clen = 2;
6920 break;
6921
6922 case '#':
6923 clen = IS_EVSTR(p, pend) ? 2 : 1;
6924 break;
6925
6926 default:
6927 if (ISPRINT(c)) {
6928 clen = 1;
6929 }
6930 else {
6931 if (u8 && c > 0x7F) { /* \u notation */
6932 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6933 if (MBCLEN_CHARFOUND_P(n)) {
6934 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6935 if (cc <= 0xFFFF)
6936 clen = 6; /* \uXXXX */
6937 else if (cc <= 0xFFFFF)
6938 clen = 9; /* \u{XXXXX} */
6939 else
6940 clen = 10; /* \u{XXXXXX} */
6941 p += MBCLEN_CHARFOUND_LEN(n)-1;
6942 break;
6943 }
6944 }
6945 clen = 4; /* \xNN */
6946 }
6947 break;
6948 }
6949
6950 if (clen > LONG_MAX - len) {
6951 rb_raise(rb_eRuntimeError, "string size too big");
6952 }
6953 len += clen;
6954 }
6955
6956 result = rb_str_new(0, len);
6957 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6958 q = RSTRING_PTR(result); qend = q + len + 1;
6959
6960 *q++ = '"';
6961 while (p < pend) {
6962 unsigned char c = *p++;
6963
6964 if (c == '"' || c == '\\') {
6965 *q++ = '\\';
6966 *q++ = c;
6967 }
6968 else if (c == '#') {
6969 if (IS_EVSTR(p, pend)) *q++ = '\\';
6970 *q++ = '#';
6971 }
6972 else if (c == '\n') {
6973 *q++ = '\\';
6974 *q++ = 'n';
6975 }
6976 else if (c == '\r') {
6977 *q++ = '\\';
6978 *q++ = 'r';
6979 }
6980 else if (c == '\t') {
6981 *q++ = '\\';
6982 *q++ = 't';
6983 }
6984 else if (c == '\f') {
6985 *q++ = '\\';
6986 *q++ = 'f';
6987 }
6988 else if (c == '\013') {
6989 *q++ = '\\';
6990 *q++ = 'v';
6991 }
6992 else if (c == '\010') {
6993 *q++ = '\\';
6994 *q++ = 'b';
6995 }
6996 else if (c == '\007') {
6997 *q++ = '\\';
6998 *q++ = 'a';
6999 }
7000 else if (c == '\033') {
7001 *q++ = '\\';
7002 *q++ = 'e';
7003 }
7004 else if (ISPRINT(c)) {
7005 *q++ = c;
7006 }
7007 else {
7008 *q++ = '\\';
7009 if (u8) {
7010 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7011 if (MBCLEN_CHARFOUND_P(n)) {
7012 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7013 p += n;
7014 if (cc <= 0xFFFF)
7015 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7016 else
7017 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7018 q += strlen(q);
7019 continue;
7020 }
7021 }
7022 snprintf(q, qend-q, "x%02X", c);
7023 q += 3;
7024 }
7025 }
7026 *q++ = '"';
7027 *q = '\0';
7028 if (!rb_enc_asciicompat(enc)) {
7029 snprintf(q, qend-q, nonascii_suffix, enc->name);
7030 encidx = rb_ascii8bit_encindex();
7031 }
7032 /* result from dump is ASCII */
7033 rb_enc_associate_index(result, encidx);
7035 return result;
7036}
7037
/*
 * Maps the letter of a single-character backslash escape (the 'n' of "\n")
 * to the control character it denotes.  Returns -1 for any other letter;
 * the caller in this file only passes the eight letters handled here.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
    }
    /* never fall off the end of a non-void function */
    return -1;
}

7062static void
7063undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7064{
7065 const char *s = *ss;
7066 unsigned int c;
7067 int codelen;
7068 size_t hexlen;
7069 unsigned char buf[6];
7070 static rb_encoding *enc_utf8 = NULL;
7071
7072 switch (*s) {
7073 case '\\':
7074 case '"':
7075 case '#':
7076 rb_str_cat(undumped, s, 1); /* cat itself */
7077 s++;
7078 break;
7079 case 'n':
7080 case 'r':
7081 case 't':
7082 case 'f':
7083 case 'v':
7084 case 'b':
7085 case 'a':
7086 case 'e':
7087 *buf = unescape_ascii(*s);
7088 rb_str_cat(undumped, (char *)buf, 1);
7089 s++;
7090 break;
7091 case 'u':
7092 if (*binary) {
7093 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7094 }
7095 *utf8 = true;
7096 if (++s >= s_end) {
7097 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7098 }
7099 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7100 if (*penc != enc_utf8) {
7101 *penc = enc_utf8;
7102 rb_enc_associate(undumped, enc_utf8);
7103 }
7104 if (*s == '{') { /* handle \u{...} form */
7105 s++;
7106 for (;;) {
7107 if (s >= s_end) {
7108 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7109 }
7110 if (*s == '}') {
7111 s++;
7112 break;
7113 }
7114 if (ISSPACE(*s)) {
7115 s++;
7116 continue;
7117 }
7118 c = scan_hex(s, s_end-s, &hexlen);
7119 if (hexlen == 0 || hexlen > 6) {
7120 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7121 }
7122 if (c > 0x10ffff) {
7123 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7124 }
7125 if (0xd800 <= c && c <= 0xdfff) {
7126 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7127 }
7128 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7129 rb_str_cat(undumped, (char *)buf, codelen);
7130 s += hexlen;
7131 }
7132 }
7133 else { /* handle \uXXXX form */
7134 c = scan_hex(s, 4, &hexlen);
7135 if (hexlen != 4) {
7136 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7137 }
7138 if (0xd800 <= c && c <= 0xdfff) {
7139 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7140 }
7141 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7142 rb_str_cat(undumped, (char *)buf, codelen);
7143 s += hexlen;
7144 }
7145 break;
7146 case 'x':
7147 if (*utf8) {
7148 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7149 }
7150 *binary = true;
7151 if (++s >= s_end) {
7152 rb_raise(rb_eRuntimeError, "invalid hex escape");
7153 }
7154 *buf = scan_hex(s, 2, &hexlen);
7155 if (hexlen != 2) {
7156 rb_raise(rb_eRuntimeError, "invalid hex escape");
7157 }
7158 rb_str_cat(undumped, (char *)buf, 1);
7159 s += hexlen;
7160 break;
7161 default:
7162 rb_str_cat(undumped, s-1, 2);
7163 s++;
7164 }
7165
7166 *ss = s;
7167}
7168
7169static VALUE rb_str_is_ascii_only_p(VALUE str);
7170
7171/*
7172 * call-seq:
7173 * undump -> string
7174 *
7175 * Returns an unescaped version of +self+:
7176 *
7177 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7178 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7179 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7180 * s_undumped == s_orig # => true
7181 *
7182 * Related: String#dump (inverse of String#undump).
7183 *
7184 */
7185
7186static VALUE
7187str_undump(VALUE str)
7188{
7189 const char *s = RSTRING_PTR(str);
7190 const char *s_end = RSTRING_END(str);
7191 rb_encoding *enc = rb_enc_get(str);
7192 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7193 bool utf8 = false;
7194 bool binary = false;
7195 int w;
7196
7198 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7199 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7200 }
7201 if (!str_null_check(str, &w)) {
7202 rb_raise(rb_eRuntimeError, "string contains null byte");
7203 }
7204 if (RSTRING_LEN(str) < 2) goto invalid_format;
7205 if (*s != '"') goto invalid_format;
7206
7207 /* strip '"' at the start */
7208 s++;
7209
7210 for (;;) {
7211 if (s >= s_end) {
7212 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7213 }
7214
7215 if (*s == '"') {
7216 /* epilogue */
7217 s++;
7218 if (s == s_end) {
7219 /* ascii compatible dumped string */
7220 break;
7221 }
7222 else {
7223 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7224 static const char dup_suffix[] = ".dup";
7225 const char *encname;
7226 int encidx;
7227 ptrdiff_t size;
7228
7229 /* check separately for strings dumped by older versions */
7230 size = sizeof(dup_suffix) - 1;
7231 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7232
7233 size = sizeof(force_encoding_suffix) - 1;
7234 if (s_end - s <= size) goto invalid_format;
7235 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7236 s += size;
7237
7238 if (utf8) {
7239 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7240 }
7241
7242 encname = s;
7243 s = memchr(s, '"', s_end-s);
7244 size = s - encname;
7245 if (!s) goto invalid_format;
7246 if (s_end - s != 2) goto invalid_format;
7247 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7248
7249 encidx = rb_enc_find_index2(encname, (long)size);
7250 if (encidx < 0) {
7251 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7252 }
7253 rb_enc_associate_index(undumped, encidx);
7254 }
7255 break;
7256 }
7257
7258 if (*s == '\\') {
7259 s++;
7260 if (s >= s_end) {
7261 rb_raise(rb_eRuntimeError, "invalid escape");
7262 }
7263 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7264 }
7265 else {
7266 rb_str_cat(undumped, s++, 1);
7267 }
7268 }
7269
7270 RB_GC_GUARD(str);
7271
7272 return undumped;
7273invalid_format:
7274 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7275}
7276
7277static void
7278rb_str_check_dummy_enc(rb_encoding *enc)
7279{
7280 if (rb_enc_dummy_p(enc)) {
7281 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7282 rb_enc_name(enc));
7283 }
7284}
7285
7286static rb_encoding *
7287str_true_enc(VALUE str)
7288{
7289 rb_encoding *enc = STR_ENC_GET(str);
7290 rb_str_check_dummy_enc(enc);
7291 return enc;
7292}
7293
7294static OnigCaseFoldType
7295check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7296{
7297 if (argc==0)
7298 return flags;
7299 if (argc>2)
7300 rb_raise(rb_eArgError, "too many options");
7301 if (argv[0]==sym_turkic) {
7302 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7303 if (argc==2) {
7304 if (argv[1]==sym_lithuanian)
7305 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7306 else
7307 rb_raise(rb_eArgError, "invalid second option");
7308 }
7309 }
7310 else if (argv[0]==sym_lithuanian) {
7311 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7312 if (argc==2) {
7313 if (argv[1]==sym_turkic)
7314 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7315 else
7316 rb_raise(rb_eArgError, "invalid second option");
7317 }
7318 }
7319 else if (argc>1)
7320 rb_raise(rb_eArgError, "too many options");
7321 else if (argv[0]==sym_ascii)
7322 flags |= ONIGENC_CASE_ASCII_ONLY;
7323 else if (argv[0]==sym_fold) {
7324 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7325 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7326 else
7327 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7328 }
7329 else
7330 rb_raise(rb_eArgError, "invalid option");
7331 return flags;
7332}
7333
7334static inline bool
7335case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7336{
7337 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7338 return true;
7339 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7340}
7341
7342/* 16 should be long enough to absorb any kind of single character length increase */
7343#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7344#ifndef CASEMAP_DEBUG
7345# define CASEMAP_DEBUG 0
7346#endif
7347
7348struct mapping_buffer;
7349typedef struct mapping_buffer {
7350 size_t capa;
7351 size_t used;
7352 struct mapping_buffer *next;
7353 OnigUChar space[FLEX_ARY_LEN];
7355
7356static void
7357mapping_buffer_free(void *p)
7358{
7359 mapping_buffer *previous_buffer;
7360 mapping_buffer *current_buffer = p;
7361 while (current_buffer) {
7362 previous_buffer = current_buffer;
7363 current_buffer = current_buffer->next;
7364 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7365 }
7366}
7367
7368static const rb_data_type_t mapping_buffer_type = {
7369 "mapping_buffer",
7370 {0, mapping_buffer_free,},
7371 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7372};
7373
7374static VALUE
7375rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7376{
7377 VALUE target;
7378
7379 const OnigUChar *source_current, *source_end;
7380 int target_length = 0;
7381 VALUE buffer_anchor;
7382 mapping_buffer *current_buffer = 0;
7383 mapping_buffer **pre_buffer;
7384 size_t buffer_count = 0;
7385 int buffer_length_or_invalid;
7386
7387 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7388
7389 source_current = (OnigUChar*)RSTRING_PTR(source);
7390 source_end = (OnigUChar*)RSTRING_END(source);
7391
7392 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7393 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7394 while (source_current < source_end) {
7395 /* increase multiplier using buffer count to converge quickly */
7396 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7397 if (CASEMAP_DEBUG) {
7398 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7399 }
7400 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7401 *pre_buffer = current_buffer;
7402 pre_buffer = &current_buffer->next;
7403 current_buffer->next = NULL;
7404 current_buffer->capa = capa;
7405 buffer_length_or_invalid = enc->case_map(flags,
7406 &source_current, source_end,
7407 current_buffer->space,
7408 current_buffer->space+current_buffer->capa,
7409 enc);
7410 if (buffer_length_or_invalid < 0) {
7411 current_buffer = DATA_PTR(buffer_anchor);
7412 DATA_PTR(buffer_anchor) = 0;
7413 mapping_buffer_free(current_buffer);
7414 rb_raise(rb_eArgError, "input string invalid");
7415 }
7416 target_length += current_buffer->used = buffer_length_or_invalid;
7417 }
7418 if (CASEMAP_DEBUG) {
7419 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7420 }
7421
7422 if (buffer_count==1) {
7423 target = rb_str_new((const char*)current_buffer->space, target_length);
7424 }
7425 else {
7426 char *target_current;
7427
7428 target = rb_str_new(0, target_length);
7429 target_current = RSTRING_PTR(target);
7430 current_buffer = DATA_PTR(buffer_anchor);
7431 while (current_buffer) {
7432 memcpy(target_current, current_buffer->space, current_buffer->used);
7433 target_current += current_buffer->used;
7434 current_buffer = current_buffer->next;
7435 }
7436 }
7437 current_buffer = DATA_PTR(buffer_anchor);
7438 DATA_PTR(buffer_anchor) = 0;
7439 mapping_buffer_free(current_buffer);
7440
7441 RB_GC_GUARD(buffer_anchor);
7442
7443 /* TODO: check about string terminator character */
7444 str_enc_copy_direct(target, source);
7445 /*ENC_CODERANGE_SET(mapped, cr);*/
7446
7447 return target;
7448}
7449
7450static VALUE
7451rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7452{
7453 const OnigUChar *source_current, *source_end;
7454 OnigUChar *target_current, *target_end;
7455 long old_length = RSTRING_LEN(source);
7456 int length_or_invalid;
7457
7458 if (old_length == 0) return Qnil;
7459
7460 source_current = (OnigUChar*)RSTRING_PTR(source);
7461 source_end = (OnigUChar*)RSTRING_END(source);
7462 if (source == target) {
7463 target_current = (OnigUChar*)source_current;
7464 target_end = (OnigUChar*)source_end;
7465 }
7466 else {
7467 target_current = (OnigUChar*)RSTRING_PTR(target);
7468 target_end = (OnigUChar*)RSTRING_END(target);
7469 }
7470
7471 length_or_invalid = onigenc_ascii_only_case_map(flags,
7472 &source_current, source_end,
7473 target_current, target_end, enc);
7474 if (length_or_invalid < 0)
7475 rb_raise(rb_eArgError, "input string invalid");
7476 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7477 fprintf(stderr, "problem with rb_str_ascii_casemap"
7478 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7479 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7480 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7481 }
7482
7483 str_enc_copy(target, source);
7484
7485 return target;
7486}
7487
7488static bool
7489upcase_single(VALUE str)
7490{
7491 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7492 bool modified = false;
7493
7494 while (s < send) {
7495 unsigned int c = *(unsigned char*)s;
7496
7497 if ('a' <= c && c <= 'z') {
7498 *s = 'A' + (c - 'a');
7499 modified = true;
7500 }
7501 s++;
7502 }
7503 return modified;
7504}
7505
7506/*
7507 * call-seq:
7508 * upcase!(*options) -> self or nil
7509 *
7510 * Upcases the characters in +self+;
7511 * returns +self+ if any changes were made, +nil+ otherwise:
7512 *
7513 * s = 'Hello World!' # => "Hello World!"
7514 * s.upcase! # => "HELLO WORLD!"
7515 * s # => "HELLO WORLD!"
7516 * s.upcase! # => nil
7517 *
7518 * The casing may be affected by the given +options+;
7519 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7520 *
7521 * Related: String#upcase, String#downcase, String#downcase!.
7522 *
7523 */
7524
7525static VALUE
7526rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7527{
7528 rb_encoding *enc;
7529 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7530
7531 flags = check_case_options(argc, argv, flags);
7532 str_modify_keep_cr(str);
7533 enc = str_true_enc(str);
7534 if (case_option_single_p(flags, enc, str)) {
7535 if (upcase_single(str))
7536 flags |= ONIGENC_CASE_MODIFIED;
7537 }
7538 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7539 rb_str_ascii_casemap(str, str, &flags, enc);
7540 else
7541 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7542
7543 if (ONIGENC_CASE_MODIFIED&flags) return str;
7544 return Qnil;
7545}
7546
7547
7548/*
7549 * call-seq:
7550 * upcase(*options) -> string
7551 *
7552 * Returns a string containing the upcased characters in +self+:
7553 *
7554 * s = 'Hello World!' # => "Hello World!"
7555 * s.upcase # => "HELLO WORLD!"
7556 *
7557 * The casing may be affected by the given +options+;
7558 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7559 *
7560 * Related: String#upcase!, String#downcase, String#downcase!.
7561 *
7562 */
7563
7564static VALUE
7565rb_str_upcase(int argc, VALUE *argv, VALUE str)
7566{
7567 rb_encoding *enc;
7568 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7569 VALUE ret;
7570
7571 flags = check_case_options(argc, argv, flags);
7572 enc = str_true_enc(str);
7573 if (case_option_single_p(flags, enc, str)) {
7574 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7575 str_enc_copy_direct(ret, str);
7576 upcase_single(ret);
7577 }
7578 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7579 ret = rb_str_new(0, RSTRING_LEN(str));
7580 rb_str_ascii_casemap(str, ret, &flags, enc);
7581 }
7582 else {
7583 ret = rb_str_casemap(str, &flags, enc);
7584 }
7585
7586 return ret;
7587}
7588
7589static bool
7590downcase_single(VALUE str)
7591{
7592 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7593 bool modified = false;
7594
7595 while (s < send) {
7596 unsigned int c = *(unsigned char*)s;
7597
7598 if ('A' <= c && c <= 'Z') {
7599 *s = 'a' + (c - 'A');
7600 modified = true;
7601 }
7602 s++;
7603 }
7604
7605 return modified;
7606}
7607
7608/*
7609 * call-seq:
7610 * downcase!(*options) -> self or nil
7611 *
7612 * Downcases the characters in +self+;
7613 * returns +self+ if any changes were made, +nil+ otherwise:
7614 *
7615 * s = 'Hello World!' # => "Hello World!"
7616 * s.downcase! # => "hello world!"
7617 * s # => "hello world!"
7618 * s.downcase! # => nil
7619 *
7620 * The casing may be affected by the given +options+;
7621 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7622 *
7623 * Related: String#downcase, String#upcase, String#upcase!.
7624 *
7625 */
7626
7627static VALUE
7628rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7629{
7630 rb_encoding *enc;
7631 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7632
7633 flags = check_case_options(argc, argv, flags);
7634 str_modify_keep_cr(str);
7635 enc = str_true_enc(str);
7636 if (case_option_single_p(flags, enc, str)) {
7637 if (downcase_single(str))
7638 flags |= ONIGENC_CASE_MODIFIED;
7639 }
7640 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7641 rb_str_ascii_casemap(str, str, &flags, enc);
7642 else
7643 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7644
7645 if (ONIGENC_CASE_MODIFIED&flags) return str;
7646 return Qnil;
7647}
7648
7649
7650/*
7651 * call-seq:
7652 * downcase(*options) -> string
7653 *
7654 * Returns a string containing the downcased characters in +self+:
7655 *
7656 * s = 'Hello World!' # => "Hello World!"
7657 * s.downcase # => "hello world!"
7658 *
7659 * The casing may be affected by the given +options+;
7660 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7661 *
7662 * Related: String#downcase!, String#upcase, String#upcase!.
7663 *
7664 */
7665
7666static VALUE
7667rb_str_downcase(int argc, VALUE *argv, VALUE str)
7668{
7669 rb_encoding *enc;
7670 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7671 VALUE ret;
7672
7673 flags = check_case_options(argc, argv, flags);
7674 enc = str_true_enc(str);
7675 if (case_option_single_p(flags, enc, str)) {
7676 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7677 str_enc_copy_direct(ret, str);
7678 downcase_single(ret);
7679 }
7680 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7681 ret = rb_str_new(0, RSTRING_LEN(str));
7682 rb_str_ascii_casemap(str, ret, &flags, enc);
7683 }
7684 else {
7685 ret = rb_str_casemap(str, &flags, enc);
7686 }
7687
7688 return ret;
7689}
7690
7691
7692/*
7693 * call-seq:
7694 * capitalize!(*options) -> self or nil
7695 *
7696 * Upcases the first character in +self+;
7697 * downcases the remaining characters;
7698 * returns +self+ if any changes were made, +nil+ otherwise:
7699 *
7700 * s = 'hello World!' # => "hello World!"
7701 * s.capitalize! # => "Hello world!"
7702 * s # => "Hello world!"
7703 * s.capitalize! # => nil
7704 *
7705 * The casing may be affected by the given +options+;
7706 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7707 *
7708 * Related: String#capitalize.
7709 *
7710 */
7711
7712static VALUE
7713rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7714{
7715 rb_encoding *enc;
7716 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7717
7718 flags = check_case_options(argc, argv, flags);
7719 str_modify_keep_cr(str);
7720 enc = str_true_enc(str);
7721 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7722 if (flags&ONIGENC_CASE_ASCII_ONLY)
7723 rb_str_ascii_casemap(str, str, &flags, enc);
7724 else
7725 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7726
7727 if (ONIGENC_CASE_MODIFIED&flags) return str;
7728 return Qnil;
7729}
7730
7731
7732/*
7733 * call-seq:
7734 * capitalize(*options) -> string
7735 *
7736 * Returns a string containing the characters in +self+;
7737 * the first character is upcased;
7738 * the remaining characters are downcased:
7739 *
7740 * s = 'hello World!' # => "hello World!"
7741 * s.capitalize # => "Hello world!"
7742 *
7743 * The casing may be affected by the given +options+;
7744 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7745 *
7746 * Related: String#capitalize!.
7747 *
7748 */
7749
7750static VALUE
7751rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7752{
7753 rb_encoding *enc;
7754 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7755 VALUE ret;
7756
7757 flags = check_case_options(argc, argv, flags);
7758 enc = str_true_enc(str);
7759 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7760 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7761 ret = rb_str_new(0, RSTRING_LEN(str));
7762 rb_str_ascii_casemap(str, ret, &flags, enc);
7763 }
7764 else {
7765 ret = rb_str_casemap(str, &flags, enc);
7766 }
7767 return ret;
7768}
7769
7770
7771/*
7772 * call-seq:
7773 * swapcase!(*options) -> self or nil
7774 *
7775 * Upcases each lowercase character in +self+;
 *  downcases each uppercase character;
7777 * returns +self+ if any changes were made, +nil+ otherwise:
7778 *
7779 * s = 'Hello World!' # => "Hello World!"
7780 * s.swapcase! # => "hELLO wORLD!"
7781 * s # => "hELLO wORLD!"
7782 * ''.swapcase! # => nil
7783 *
7784 * The casing may be affected by the given +options+;
7785 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7786 *
7787 * Related: String#swapcase.
7788 *
7789 */
7790
7791static VALUE
7792rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7793{
7794 rb_encoding *enc;
7795 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7796
7797 flags = check_case_options(argc, argv, flags);
7798 str_modify_keep_cr(str);
7799 enc = str_true_enc(str);
7800 if (flags&ONIGENC_CASE_ASCII_ONLY)
7801 rb_str_ascii_casemap(str, str, &flags, enc);
7802 else
7803 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7804
7805 if (ONIGENC_CASE_MODIFIED&flags) return str;
7806 return Qnil;
7807}
7808
7809
7810/*
7811 * call-seq:
7812 * swapcase(*options) -> string
7813 *
7814 * Returns a string containing the characters in +self+, with cases reversed;
7815 * each uppercase character is downcased;
7816 * each lowercase character is upcased:
7817 *
7818 * s = 'Hello World!' # => "Hello World!"
7819 * s.swapcase # => "hELLO wORLD!"
7820 *
7821 * The casing may be affected by the given +options+;
7822 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7823 *
7824 * Related: String#swapcase!.
7825 *
7826 */
7827
7828static VALUE
7829rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7830{
7831 rb_encoding *enc;
7832 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7833 VALUE ret;
7834
7835 flags = check_case_options(argc, argv, flags);
7836 enc = str_true_enc(str);
7837 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7838 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7839 ret = rb_str_new(0, RSTRING_LEN(str));
7840 rb_str_ascii_casemap(str, ret, &flags, enc);
7841 }
7842 else {
7843 ret = rb_str_casemap(str, &flags, enc);
7844 }
7845 return ret;
7846}
7847
/* pointer to unsigned string bytes */
typedef unsigned char *USTR;

/* Cursor over a character selector (e.g. "a-z"), advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while the code points of a range are being generated */
    unsigned int now;   /* code point most recently read or generated */
    unsigned int max;   /* last code point of the range being generated */
    char *p;            /* next unread byte of the selector */
    char *pend;         /* end of the selector */
};
7855
7856static unsigned int
7857trnext(struct tr *t, rb_encoding *enc)
7858{
7859 int n;
7860
7861 for (;;) {
7862 nextpart:
7863 if (!t->gen) {
7864 if (t->p == t->pend) return -1;
7865 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7866 t->p += n;
7867 }
7868 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7869 t->p += n;
7870 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7871 t->p += n;
7872 if (t->p < t->pend) {
7873 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7874 t->p += n;
7875 if (t->now > c) {
7876 if (t->now < 0x80 && c < 0x80) {
7877 rb_raise(rb_eArgError,
7878 "invalid range \"%c-%c\" in string transliteration",
7879 t->now, c);
7880 }
7881 else {
7882 rb_raise(rb_eArgError, "invalid range in string transliteration");
7883 }
7884 continue; /* not reached */
7885 }
7886 else if (t->now < c) {
7887 t->gen = 1;
7888 t->max = c;
7889 }
7890 }
7891 }
7892 return t->now;
7893 }
7894 else {
7895 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7896 if (t->now == t->max) {
7897 t->gen = 0;
7898 goto nextpart;
7899 }
7900 }
7901 if (t->now < t->max) {
7902 return t->now;
7903 }
7904 else {
7905 t->gen = 0;
7906 return t->max;
7907 }
7908 }
7909 }
7910}
7911
7912static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7913
7914static VALUE
7915tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7916{
7917 const unsigned int errc = -1;
7918 unsigned int trans[256];
7919 rb_encoding *enc, *e1, *e2;
7920 struct tr trsrc, trrepl;
7921 int cflag = 0;
7922 unsigned int c, c0, last = 0;
7923 int modify = 0, i, l;
7924 unsigned char *s, *send;
7925 VALUE hash = 0;
7926 int singlebyte = single_byte_optimizable(str);
7927 int termlen;
7928 int cr;
7929
7930#define CHECK_IF_ASCII(c) \
7931 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7932 (cr = ENC_CODERANGE_VALID) : 0)
7933
7934 StringValue(src);
7935 StringValue(repl);
7936 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7937 if (RSTRING_LEN(repl) == 0) {
7938 return rb_str_delete_bang(1, &src, str);
7939 }
7940
7941 cr = ENC_CODERANGE(str);
7942 e1 = rb_enc_check(str, src);
7943 e2 = rb_enc_check(str, repl);
7944 if (e1 == e2) {
7945 enc = e1;
7946 }
7947 else {
7948 enc = rb_enc_check(src, repl);
7949 }
7950 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7951 if (RSTRING_LEN(src) > 1 &&
7952 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7953 trsrc.p + l < trsrc.pend) {
7954 cflag = 1;
7955 trsrc.p += l;
7956 }
7957 trrepl.p = RSTRING_PTR(repl);
7958 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7959 trsrc.gen = trrepl.gen = 0;
7960 trsrc.now = trrepl.now = 0;
7961 trsrc.max = trrepl.max = 0;
7962
7963 if (cflag) {
7964 for (i=0; i<256; i++) {
7965 trans[i] = 1;
7966 }
7967 while ((c = trnext(&trsrc, enc)) != errc) {
7968 if (c < 256) {
7969 trans[c] = errc;
7970 }
7971 else {
7972 if (!hash) hash = rb_hash_new();
7973 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7974 }
7975 }
7976 while ((c = trnext(&trrepl, enc)) != errc)
7977 /* retrieve last replacer */;
7978 last = trrepl.now;
7979 for (i=0; i<256; i++) {
7980 if (trans[i] != errc) {
7981 trans[i] = last;
7982 }
7983 }
7984 }
7985 else {
7986 unsigned int r;
7987
7988 for (i=0; i<256; i++) {
7989 trans[i] = errc;
7990 }
7991 while ((c = trnext(&trsrc, enc)) != errc) {
7992 r = trnext(&trrepl, enc);
7993 if (r == errc) r = trrepl.now;
7994 if (c < 256) {
7995 trans[c] = r;
7996 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7997 }
7998 else {
7999 if (!hash) hash = rb_hash_new();
8000 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8001 }
8002 }
8003 }
8004
8005 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8006 cr = ENC_CODERANGE_7BIT;
8007 str_modify_keep_cr(str);
8008 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8009 termlen = rb_enc_mbminlen(enc);
8010 if (sflag) {
8011 int clen, tlen;
8012 long offset, max = RSTRING_LEN(str);
8013 unsigned int save = -1;
8014 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8015
8016 while (s < send) {
8017 int may_modify = 0;
8018
8019 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
8020 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8021
8022 s += clen;
8023 if (c < 256) {
8024 c = trans[c];
8025 }
8026 else if (hash) {
8027 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8028 if (NIL_P(tmp)) {
8029 if (cflag) c = last;
8030 else c = errc;
8031 }
8032 else if (cflag) c = errc;
8033 else c = NUM2INT(tmp);
8034 }
8035 else {
8036 c = errc;
8037 }
8038 if (c != (unsigned int)-1) {
8039 if (save == c) {
8040 CHECK_IF_ASCII(c);
8041 continue;
8042 }
8043 save = c;
8044 tlen = rb_enc_codelen(c, enc);
8045 modify = 1;
8046 }
8047 else {
8048 save = -1;
8049 c = c0;
8050 if (enc != e1) may_modify = 1;
8051 }
8052 if ((offset = t - buf) + tlen > max) {
8053 size_t MAYBE_UNUSED(old) = max + termlen;
8054 max = offset + tlen + (send - s);
8055 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8056 t = buf + offset;
8057 }
8058 rb_enc_mbcput(c, t, enc);
8059 if (may_modify && memcmp(s, t, tlen) != 0) {
8060 modify = 1;
8061 }
8062 CHECK_IF_ASCII(c);
8063 t += tlen;
8064 }
8065 if (!STR_EMBED_P(str)) {
8066 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8067 }
8068 TERM_FILL((char *)t, termlen);
8069 RSTRING(str)->as.heap.ptr = (char *)buf;
8070 STR_SET_LEN(str, t - buf);
8071 STR_SET_NOEMBED(str);
8072 RSTRING(str)->as.heap.aux.capa = max;
8073 }
8074 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8075 while (s < send) {
8076 c = (unsigned char)*s;
8077 if (trans[c] != errc) {
8078 if (!cflag) {
8079 c = trans[c];
8080 *s = c;
8081 modify = 1;
8082 }
8083 else {
8084 *s = last;
8085 modify = 1;
8086 }
8087 }
8088 CHECK_IF_ASCII(c);
8089 s++;
8090 }
8091 }
8092 else {
8093 int clen, tlen;
8094 long offset, max = (long)((send - s) * 1.2);
8095 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8096
8097 while (s < send) {
8098 int may_modify = 0;
8099 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
8100 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8101
8102 if (c < 256) {
8103 c = trans[c];
8104 }
8105 else if (hash) {
8106 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8107 if (NIL_P(tmp)) {
8108 if (cflag) c = last;
8109 else c = errc;
8110 }
8111 else if (cflag) c = errc;
8112 else c = NUM2INT(tmp);
8113 }
8114 else {
8115 c = cflag ? last : errc;
8116 }
8117 if (c != errc) {
8118 tlen = rb_enc_codelen(c, enc);
8119 modify = 1;
8120 }
8121 else {
8122 c = c0;
8123 if (enc != e1) may_modify = 1;
8124 }
8125 if ((offset = t - buf) + tlen > max) {
8126 size_t MAYBE_UNUSED(old) = max + termlen;
8127 max = offset + tlen + (long)((send - s) * 1.2);
8128 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8129 t = buf + offset;
8130 }
8131 if (s != t) {
8132 rb_enc_mbcput(c, t, enc);
8133 if (may_modify && memcmp(s, t, tlen) != 0) {
8134 modify = 1;
8135 }
8136 }
8137 CHECK_IF_ASCII(c);
8138 s += clen;
8139 t += tlen;
8140 }
8141 if (!STR_EMBED_P(str)) {
8142 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8143 }
8144 TERM_FILL((char *)t, termlen);
8145 RSTRING(str)->as.heap.ptr = (char *)buf;
8146 STR_SET_LEN(str, t - buf);
8147 STR_SET_NOEMBED(str);
8148 RSTRING(str)->as.heap.aux.capa = max;
8149 }
8150
8151 if (modify) {
8152 if (cr != ENC_CODERANGE_BROKEN)
8153 ENC_CODERANGE_SET(str, cr);
8154 rb_enc_associate(str, enc);
8155 return str;
8156 }
8157 return Qnil;
8158}
8159
8160
8161/*
8162 * call-seq:
8163 * tr!(selector, replacements) -> self or nil
8164 *
8165 * Like String#tr, but modifies +self+ in place.
8166 * Returns +self+ if any changes were made, +nil+ otherwise.
8167 *
8168 */
8169
8170static VALUE
8171rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8172{
8173 return tr_trans(str, src, repl, 0);
8174}
8175
8176
8177/*
8178 * call-seq:
8179 * tr(selector, replacements) -> new_string
8180 *
8181 * Returns a copy of +self+ with each character specified by string +selector+
8182 * translated to the corresponding character in string +replacements+.
8183 * The correspondence is _positional_:
8184 *
8185 * - Each occurrence of the first character specified by +selector+
8186 * is translated to the first character in +replacements+.
8187 * - Each occurrence of the second character specified by +selector+
8188 * is translated to the second character in +replacements+.
8189 * - And so on.
8190 *
8191 * Example:
8192 *
8193 * 'hello'.tr('el', 'ip') #=> "hippo"
8194 *
8195 * If +replacements+ is shorter than +selector+,
8196 * it is implicitly padded with its own last character:
8197 *
8198 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8199 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8200 *
8201 * Arguments +selector+ and +replacements+ must be valid character selectors
8202 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8203 * and may use any of its valid forms, including negation, ranges, and escaping:
8204 *
8205 * # Negation.
8206 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8207 * # Ranges.
8208 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8209 * # Escapes.
8210 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8211 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8212 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8213 *
8214 */
8215
8216static VALUE
8217rb_str_tr(VALUE str, VALUE src, VALUE repl)
8218{
8219 str = str_duplicate(rb_cString, str);
8220 tr_trans(str, src, repl, 0);
8221 return str;
8222}
8223
8224#define TR_TABLE_MAX (UCHAR_MAX+1)
8225#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8226static void
8227tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8228 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8229{
8230 const unsigned int errc = -1;
8231 char buf[TR_TABLE_MAX];
8232 struct tr tr;
8233 unsigned int c;
8234 VALUE table = 0, ptable = 0;
8235 int i, l, cflag = 0;
8236
8237 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8238 tr.gen = tr.now = tr.max = 0;
8239
8240 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8241 cflag = 1;
8242 tr.p += l;
8243 }
8244 if (first) {
8245 for (i=0; i<TR_TABLE_MAX; i++) {
8246 stable[i] = 1;
8247 }
8248 stable[TR_TABLE_MAX] = cflag;
8249 }
8250 else if (stable[TR_TABLE_MAX] && !cflag) {
8251 stable[TR_TABLE_MAX] = 0;
8252 }
8253 for (i=0; i<TR_TABLE_MAX; i++) {
8254 buf[i] = cflag;
8255 }
8256
8257 while ((c = trnext(&tr, enc)) != errc) {
8258 if (c < TR_TABLE_MAX) {
8259 buf[(unsigned char)c] = !cflag;
8260 }
8261 else {
8262 VALUE key = UINT2NUM(c);
8263
8264 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8265 if (cflag) {
8266 ptable = *ctablep;
8267 table = ptable ? ptable : rb_hash_new();
8268 *ctablep = table;
8269 }
8270 else {
8271 table = rb_hash_new();
8272 ptable = *tablep;
8273 *tablep = table;
8274 }
8275 }
8276 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8277 rb_hash_aset(table, key, Qtrue);
8278 }
8279 }
8280 }
8281 for (i=0; i<TR_TABLE_MAX; i++) {
8282 stable[i] = stable[i] && buf[i];
8283 }
8284 if (!table && !cflag) {
8285 *tablep = 0;
8286 }
8287}
8288
8289
8290static int
8291tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8292{
8293 if (c < TR_TABLE_MAX) {
8294 return table[c] != 0;
8295 }
8296 else {
8297 VALUE v = UINT2NUM(c);
8298
8299 if (del) {
8300 if (!NIL_P(rb_hash_lookup(del, v)) &&
8301 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8302 return TRUE;
8303 }
8304 }
8305 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8306 return FALSE;
8307 }
8308 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8309 }
8310}
8311
8312/*
8313 * call-seq:
8314 * delete!(*selectors) -> self or nil
8315 *
8316 * Like String#delete, but modifies +self+ in place.
8317 * Returns +self+ if any changes were made, +nil+ otherwise.
8318 *
8319 */
8320
8321static VALUE
8322rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8323{
8324 char squeez[TR_TABLE_SIZE];
8325 rb_encoding *enc = 0;
8326 char *s, *send, *t;
8327 VALUE del = 0, nodel = 0;
8328 int modify = 0;
8329 int i, ascompat, cr;
8330
8331 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8333 for (i=0; i<argc; i++) {
8334 VALUE s = argv[i];
8335
8336 StringValue(s);
8337 enc = rb_enc_check(str, s);
8338 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8339 }
8340
8341 str_modify_keep_cr(str);
8342 ascompat = rb_enc_asciicompat(enc);
8343 s = t = RSTRING_PTR(str);
8344 send = RSTRING_END(str);
8345 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8346 while (s < send) {
8347 unsigned int c;
8348 int clen;
8349
8350 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8351 if (squeez[c]) {
8352 modify = 1;
8353 }
8354 else {
8355 if (t != s) *t = c;
8356 t++;
8357 }
8358 s++;
8359 }
8360 else {
8361 c = rb_enc_codepoint_len(s, send, &clen, enc);
8362
8363 if (tr_find(c, squeez, del, nodel)) {
8364 modify = 1;
8365 }
8366 else {
8367 if (t != s) rb_enc_mbcput(c, t, enc);
8368 t += clen;
8370 }
8371 s += clen;
8372 }
8373 }
8374 TERM_FILL(t, TERM_LEN(str));
8375 STR_SET_LEN(str, t - RSTRING_PTR(str));
8376 ENC_CODERANGE_SET(str, cr);
8377
8378 if (modify) return str;
8379 return Qnil;
8380}
8381
8382
8383/*
8384 * call-seq:
8385 * delete(*selectors) -> new_string
8386 *
8387 * Returns a copy of +self+ with characters specified by +selectors+ removed
8388 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8389 *
8390 * "hello".delete "l","lo" #=> "heo"
8391 * "hello".delete "lo" #=> "he"
8392 * "hello".delete "aeiou", "^e" #=> "hell"
8393 * "hello".delete "ej-m" #=> "ho"
8394 *
8395 */
8396
8397static VALUE
8398rb_str_delete(int argc, VALUE *argv, VALUE str)
8399{
8400 str = str_duplicate(rb_cString, str);
8401 rb_str_delete_bang(argc, argv, str);
8402 return str;
8403}
8404
8405
8406/*
8407 * call-seq:
8408 * squeeze!(*selectors) -> self or nil
8409 *
8410 * Like String#squeeze, but modifies +self+ in place.
8411 * Returns +self+ if any changes were made, +nil+ otherwise.
8412 */
8413
8414static VALUE
8415rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8416{
8417 char squeez[TR_TABLE_SIZE];
8418 rb_encoding *enc = 0;
8419 VALUE del = 0, nodel = 0;
8420 unsigned char *s, *send, *t;
8421 int i, modify = 0;
8422 int ascompat, singlebyte = single_byte_optimizable(str);
8423 unsigned int save;
8424
8425 if (argc == 0) {
8426 enc = STR_ENC_GET(str);
8427 }
8428 else {
8429 for (i=0; i<argc; i++) {
8430 VALUE s = argv[i];
8431
8432 StringValue(s);
8433 enc = rb_enc_check(str, s);
8434 if (singlebyte && !single_byte_optimizable(s))
8435 singlebyte = 0;
8436 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8437 }
8438 }
8439
8440 str_modify_keep_cr(str);
8441 s = t = (unsigned char *)RSTRING_PTR(str);
8442 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8443 send = (unsigned char *)RSTRING_END(str);
8444 save = -1;
8445 ascompat = rb_enc_asciicompat(enc);
8446
8447 if (singlebyte) {
8448 while (s < send) {
8449 unsigned int c = *s++;
8450 if (c != save || (argc > 0 && !squeez[c])) {
8451 *t++ = save = c;
8452 }
8453 }
8454 }
8455 else {
8456 while (s < send) {
8457 unsigned int c;
8458 int clen;
8459
8460 if (ascompat && (c = *s) < 0x80) {
8461 if (c != save || (argc > 0 && !squeez[c])) {
8462 *t++ = save = c;
8463 }
8464 s++;
8465 }
8466 else {
8467 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8468
8469 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8470 if (t != s) rb_enc_mbcput(c, t, enc);
8471 save = c;
8472 t += clen;
8473 }
8474 s += clen;
8475 }
8476 }
8477 }
8478
8479 TERM_FILL((char *)t, TERM_LEN(str));
8480 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8481 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8482 modify = 1;
8483 }
8484
8485 if (modify) return str;
8486 return Qnil;
8487}
8488
8489
8490/*
8491 * call-seq:
8492 * squeeze(*selectors) -> new_string
8493 *
8494 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8495 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8496 *
8497 * "Squeezed" means that each multiple-character run of a selected character
8498 * is squeezed down to a single character;
8499 * with no arguments given, squeezes all characters:
8500 *
8501 * "yellow moon".squeeze #=> "yelow mon"
 *     "  now   is  the".squeeze(" ")         #=> " now is the"
8503 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8504 *
8505 */
8506
8507static VALUE
8508rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8509{
8510 str = str_duplicate(rb_cString, str);
8511 rb_str_squeeze_bang(argc, argv, str);
8512 return str;
8513}
8514
8515
8516/*
8517 * call-seq:
8518 * tr_s!(selector, replacements) -> self or nil
8519 *
8520 * Like String#tr_s, but modifies +self+ in place.
8521 * Returns +self+ if any changes were made, +nil+ otherwise.
8522 *
8523 * Related: String#squeeze!.
8524 */
8525
8526static VALUE
8527rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8528{
8529 return tr_trans(str, src, repl, 1);
8530}
8531
8532
8533/*
8534 * call-seq:
8535 * tr_s(selector, replacements) -> string
8536 *
8537 * Like String#tr, but also squeezes the modified portions of the translated string;
8538 * returns a new string (translated and squeezed).
8539 *
8540 * 'hello'.tr_s('l', 'r') #=> "hero"
8541 * 'hello'.tr_s('el', '-') #=> "h-o"
8542 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8543 *
8544 * Related: String#squeeze.
8545 *
8546 */
8547
8548static VALUE
8549rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8550{
8551 str = str_duplicate(rb_cString, str);
8552 tr_trans(str, src, repl, 1);
8553 return str;
8554}
8555
8556
8557/*
8558 * call-seq:
8559 * count(*selectors) -> integer
8560 *
8561 * Returns the total number of characters in +self+
8562 * that are specified by the given +selectors+
8563 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8564 *
8565 * a = "hello world"
8566 * a.count "lo" #=> 5
8567 * a.count "lo", "o" #=> 2
8568 * a.count "hello", "^l" #=> 4
8569 * a.count "ej-m" #=> 4
8570 *
8571 * "hello^world".count "\\^aeiou" #=> 4
8572 * "hello-world".count "a\\-eo" #=> 4
8573 *
8574 * c = "hello world\\r\\n"
8575 * c.count "\\" #=> 2
8576 * c.count "\\A" #=> 0
8577 * c.count "X-\\w" #=> 3
8578 */
8579
8580static VALUE
8581rb_str_count(int argc, VALUE *argv, VALUE str)
8582{
8583 char table[TR_TABLE_SIZE];
8584 rb_encoding *enc = 0;
8585 VALUE del = 0, nodel = 0, tstr;
8586 char *s, *send;
8587 int i;
8588 int ascompat;
8589 size_t n = 0;
8590
8591 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS); /* argv[0] is read unconditionally just below, so a call with no selector must be rejected (ArgumentError) first */
8592
8593 tstr = argv[0];
8594 StringValue(tstr);
8595 enc = rb_enc_check(str, tstr);
8596 if (argc == 1) { /* fast path: a single one-byte selector on an ASCII-compatible, non-broken receiver */
8597 const char *ptstr;
8598 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8599 (ptstr = RSTRING_PTR(tstr),
8600 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8601 !is_broken_string(str)) {
8602 int clen;
8603 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8604
8605 s = RSTRING_PTR(str);
8606 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8607 send = RSTRING_END(str);
8608 while (s < send) { /* plain byte scan: count every byte equal to the selector byte */
8609 if (*(unsigned char*)s++ == c) n++;
8610 }
8611 return SIZET2NUM(n);
8612 }
8613 }
8614
8615 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc); /* first selector is passed TRUE; every later one FALSE (the call-seq examples show the selectors being intersected) */
8616 for (i=1; i<argc; i++) {
8617 tstr = argv[i];
8618 StringValue(tstr);
8619 enc = rb_enc_check(str, tstr);
8620 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8621 }
8622
8623 s = RSTRING_PTR(str);
8624 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8625 send = RSTRING_END(str);
8626 ascompat = rb_enc_asciicompat(enc);
8627 while (s < send) {
8628 unsigned int c;
8629
8630 if (ascompat && (c = *(unsigned char*)s) < 0x80) { /* single ASCII byte: direct table lookup */
8631 if (table[c]) {
8632 n++;
8633 }
8634 s++;
8635 }
8636 else { /* otherwise decode one character and ask tr_find() */
8637 int clen;
8638 c = rb_enc_codepoint_len(s, send, &clen, enc);
8639 if (tr_find(c, table, del, nodel)) {
8640 n++;
8641 }
8642 s += clen;
8643 }
8644 }
8645
8646 return SIZET2NUM(n);
8647}
8648 /* Validates a field-separator value: nil, String and Regexp are returned as-is; anything else goes through rb_check_string_type() and 0 is returned when that gives nil. */
8649static VALUE
8650rb_fs_check(VALUE val)
8651{
8652 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8653 val = rb_check_string_type(val);
8654 if (NIL_P(val)) return 0; /* 0 (not Qnil) marks the value as unusable; rb_str_split_m raises TypeError on it */
8655 }
8656 return val;
8657}
8658 /* 256-entry lookup table indexed by byte value: 1 for bytes 9..13 (TAB, LF, VT, FF, CR) and 32 (SPACE), 0 for everything else, including NUL. */
8659static const char isspacetable[256] = {
8660 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, /* 0x00-0x0F: entries 9..13 set */
8661 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8662 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20-0x2F: entry 32 (SPACE) set */
8663 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8664 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8665 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8666 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8667 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8668 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8669 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8670 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8671 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8672 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8673 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8674 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8675 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
8676};
8677
8678#define ascii_isspace(c) isspacetable[(unsigned char)(c)] /* the cast keeps the index within 0..255 even when char is signed */
8679 /* Emits one split field, str[beg, len], either by pushing it onto result (when result is non-zero) or by yielding it. empty_count < 0: empty fields are emitted like any other. empty_count >= 0: an empty field is only counted, and the counted empties are flushed just before the next non-empty field. Returns the updated pending-empty count. */
8680static long
8681split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8682{
8683 if (empty_count >= 0 && len == 0) {
8684 return empty_count + 1; /* hold the empty field back; nothing is emitted yet */
8685 }
8686 if (empty_count > 0) {
8687 /* make different substrings */
8688 if (result) {
8689 do {
8690 rb_ary_push(result, str_new_empty_String(str));
8691 } while (--empty_count > 0);
8692 }
8693 else {
8694 do {
8695 rb_yield(str_new_empty_String(str));
8696 } while (--empty_count > 0);
8697 }
8698 }
8699 str = rb_str_subseq(str, beg, len);
8700 if (result) {
8701 rb_ary_push(result, str);
8702 }
8703 else {
8704 rb_yield(str);
8705 }
8706 return empty_count; /* 0 after a flush, otherwise the value passed in (possibly negative) */
8707}
8708 /* Splitting strategies selected in rb_str_split_m: AWK = split on runs of whitespace, STRING = literal separator string, REGEXP = regular-expression separator, CHARS = one field per character. */
8709typedef enum {
8710 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
8711} split_type_t;
8712 /* Classifies a literal separator string: empty -> SPLIT_TYPE_CHARS, exactly one space character -> SPLIT_TYPE_AWK, anything else -> default_type. */
8713static split_type_t
8714literal_split_pattern(VALUE spat, split_type_t default_type)
8715{
8716 rb_encoding *enc = STR_ENC_GET(spat);
8717 const char *ptr;
8718 long len;
8719 RSTRING_GETMEM(spat, ptr, len);
8720 if (len == 0) {
8721 /* Special case - split into chars */
8722 return SPLIT_TYPE_CHARS;
8723 }
8724 else if (rb_enc_asciicompat(enc)) {
8725 if (len == 1 && ptr[0] == ' ') { /* ASCII-compatible encoding: a space is the single byte ' ' */
8726 return SPLIT_TYPE_AWK;
8727 }
8728 }
8729 else {
8730 int l;
8731 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) { /* other encodings: first character is a space and it spans the whole pattern */
8732 return SPLIT_TYPE_AWK;
8733 }
8734 }
8735 return default_type;
8736}
8737
8738/*
8739 * call-seq:
8740 * split(field_sep = $;, limit = nil) -> array
8741 * split(field_sep = $;, limit = nil) {|substring| ... } -> self
8742 *
8743 * :include: doc/string/split.rdoc
8744 *
8745 */
8746
8747static VALUE
8748rb_str_split_m(int argc, VALUE *argv, VALUE str)
8749{
8750 rb_encoding *enc;
8751 VALUE spat;
8752 VALUE limit;
8753 split_type_t split_type;
8754 long beg, end, i = 0, empty_count = -1;
8755 int lim = 0;
8756 VALUE result, tmp;
8757
8758 result = rb_block_given_p() ? Qfalse : Qnil; /* block given -> Qfalse, which the `if (result)` / `result ? :` tests below treat as "yield instead of collect" */
8759 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8760 lim = NUM2INT(limit);
8761 if (lim <= 0) limit = Qnil; /* zero or negative limit: no cap on the number of fields */
8762 else if (lim == 1) { /* limit 1: the whole string is the only field (none at all for an empty string) */
8763 if (RSTRING_LEN(str) == 0)
8764 return result ? rb_ary_new2(0) : str;
8765 tmp = str_duplicate(rb_cString, str);
8766 if (!result) {
8767 rb_yield(tmp);
8768 return str;
8769 }
8770 return rb_ary_new3(1, tmp);
8771 }
8772 i = 1;
8773 }
8774 if (NIL_P(limit) && !lim) empty_count = 0; /* no limit given (or an explicit 0): hold empty fields back so trailing ones are never emitted; a negative limit keeps -1 and emits them */
8775
8776 enc = STR_ENC_GET(str);
8777 split_type = SPLIT_TYPE_REGEXP;
8778 if (!NIL_P(spat)) {
8779 spat = get_pat_quoted(spat, 0);
8780 }
8781 else if (NIL_P(spat = rb_fs)) { /* no separator argument and $; is nil: whitespace (awk-style) splitting */
8782 split_type = SPLIT_TYPE_AWK;
8783 }
8784 else if (!(spat = rb_fs_check(spat))) {
8785 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8786 }
8787 else {
8788 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8789 }
8790 if (split_type != SPLIT_TYPE_AWK) {
8791 switch (BUILTIN_TYPE(spat)) {
8792 case T_REGEXP:
8793 rb_reg_options(spat); /* check if uninitialized */
8794 tmp = RREGEXP_SRC(spat);
8795 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8796 if (split_type == SPLIT_TYPE_AWK) { /* a regexp whose source is a single space is a literal-space split, not awk mode */
8797 spat = tmp;
8798 split_type = SPLIT_TYPE_STRING;
8799 }
8800 break;
8801
8802 case T_STRING:
8803 mustnot_broken(spat);
8804 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8805 break;
8806
8807 default:
8808 UNREACHABLE; /* spat is expected to be a String or Regexp here; the statement also keeps `default:` from being the last thing in the switch body, which C before C23 rejects */
8809 }
8810 }
8811
8812#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8813
8814 beg = 0;
8815 char *ptr = RSTRING_PTR(str);
8816 char *eptr = RSTRING_END(str);
8817 if (split_type == SPLIT_TYPE_AWK) { /* whitespace splitting: skip == 1 while inside a run of whitespace */
8818 char *bptr = ptr;
8819 int skip = 1;
8820 unsigned int c;
8821
8822 if (result) result = rb_ary_new();
8823 end = beg;
8824 if (is_ascii_string(str)) { /* ASCII-only receiver: byte-at-a-time scan using the lookup table */
8825 while (ptr < eptr) {
8826 c = (unsigned char)*ptr++;
8827 if (skip) {
8828 if (ascii_isspace(c)) {
8829 beg = ptr - bptr;
8830 }
8831 else {
8832 end = ptr - bptr;
8833 skip = 0;
8834 if (!NIL_P(limit) && lim <= i) break;
8835 }
8836 }
8837 else if (ascii_isspace(c)) {
8838 SPLIT_STR(beg, end-beg);
8839 skip = 1;
8840 beg = ptr - bptr;
8841 if (!NIL_P(limit)) ++i;
8842 }
8843 else {
8844 end = ptr - bptr;
8845 }
8846 }
8847 }
8848 else { /* same state machine, but decoding one character at a time */
8849 while (ptr < eptr) {
8850 int n;
8851
8852 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8853 ptr += n;
8854 if (skip) {
8855 if (rb_isspace(c)) {
8856 beg = ptr - bptr;
8857 }
8858 else {
8859 end = ptr - bptr;
8860 skip = 0;
8861 if (!NIL_P(limit) && lim <= i) break;
8862 }
8863 }
8864 else if (rb_isspace(c)) {
8865 SPLIT_STR(beg, end-beg);
8866 skip = 1;
8867 beg = ptr - bptr;
8868 if (!NIL_P(limit)) ++i;
8869 }
8870 else {
8871 end = ptr - bptr;
8872 }
8873 }
8874 }
8875 }
8876 else if (split_type == SPLIT_TYPE_STRING) { /* literal separator: repeated byte search */
8877 char *str_start = ptr;
8878 char *substr_start = ptr;
8879 char *sptr = RSTRING_PTR(spat);
8880 long slen = RSTRING_LEN(spat);
8881
8882 if (result) result = rb_ary_new();
8883 mustnot_broken(str);
8884 enc = rb_enc_check(str, spat);
8885 while (ptr < eptr &&
8886 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8887 /* Check we are at the start of a char */
8888 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8889 if (t != ptr + end) { /* hit lies inside a multibyte character: resume the search at the next character head */
8890 ptr = t;
8891 continue;
8892 }
8893 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8894 ptr += end + slen;
8895 substr_start = ptr;
8896 if (!NIL_P(limit) && lim <= ++i) break;
8897 }
8898 beg = ptr - str_start;
8899 }
8900 else if (split_type == SPLIT_TYPE_CHARS) { /* empty separator: one field per character */
8901 char *str_start = ptr;
8902 int n;
8903
8904 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
8905 mustnot_broken(str);
8906 enc = rb_enc_get(str);
8907 while (ptr < eptr &&
8908 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8909 SPLIT_STR(ptr - str_start, n);
8910 ptr += n;
8911 if (!NIL_P(limit) && lim <= ++i) break;
8912 }
8913 beg = ptr - str_start;
8914 }
8915 else { /* regexp separator */
8916 if (result) result = rb_ary_new();
8917 long len = RSTRING_LEN(str);
8918 long start = beg;
8919 long idx;
8920 int last_null = 0;
8921 struct re_registers *regs;
8922 VALUE match = 0;
8923
8924 for (; rb_reg_search(spat, str, start, 0) >= 0;
8925 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8926 match = rb_backref_get();
8927 if (!result) rb_match_busy(match); /* only when yielding; paired with the rb_match_unbusy() calls in the loop step and after the loop */
8928 regs = RMATCH_REGS(match);
8929 end = BEG(0);
8930 if (start == end && BEG(0) == END(0)) { /* zero-width match at the current search position */
8931 if (!ptr) {
8932 SPLIT_STR(0, 0);
8933 break;
8934 }
8935 else if (last_null == 1) {
8936 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8937 beg = start;
8938 }
8939 else { /* first empty match here: advance the search position by one character and retry */
8940 if (start == len)
8941 start++;
8942 else
8943 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8944 last_null = 1;
8945 continue;
8946 }
8947 }
8948 else {
8949 SPLIT_STR(beg, end-beg);
8950 beg = start = END(0);
8951 }
8952 last_null = 0;
8953
8954 for (idx=1; idx < regs->num_regs; idx++) { /* capture groups that took part in the match are emitted as fields too */
8955 if (BEG(idx) == -1) continue;
8956 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8957 }
8958 if (!NIL_P(limit) && lim <= ++i) break;
8959 }
8960 if (match) rb_match_unbusy(match);
8961 }
8962 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) { /* emit whatever follows the last separator */
8963 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8964 }
8965
8966 return result ? result : str; /* the array when collecting, the receiver when a block was given */
8967}
8968 /* C-level helper: splits str on the NUL-terminated C string sep0 by wrapping sep0 in a new Ruby String and calling rb_str_split_m() with that single argument. */
8969VALUE
8970rb_str_split(VALUE str, const char *sep0)
8971{
8972 VALUE sep;
8973
8974 StringValue(str); /* convert/validate str as a String before splitting */
8975 sep = rb_str_new_cstr(sep0);
8976 return rb_str_split_m(1, &sep, str);
8977}
8978 /* WANTARRAY(m, size): a new array with capacity `size` when no block is given, otherwise 0. The first argument `m` is not used by the expansion. */
8979#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8980 /* Delivers one enumerated element e: pushed onto ary and 0 returned when ary is non-zero; otherwise yielded to the block and 1 returned. */
8981static inline int
8982enumerator_element(VALUE ary, VALUE e)
8983{
8984 if (ary) {
8985 rb_ary_push(ary, e);
8986 return 0;
8987 }
8988 else {
8989 rb_yield(e);
8990 return 1; /* callers use the 1 to know a block ran (e.g. to call str_mod_check afterwards) */
8991 }
8992}
8993
8994#define ENUM_ELEM(ary, e) enumerator_element(ary, e) /* plain alias for enumerator_element() */
8995 /* For the byte range [p, e): returns e moved back over one trailing newline character (as judged by rb_enc_is_newline) and, if present, a '\r' directly before it; returns e unchanged when the last character is not a newline. */
8996static const char *
8997chomp_newline(const char *p, const char *e, rb_encoding *enc)
8998{
8999 const char *prev = rb_enc_prev_char(p, e, e, enc); /* start of the last character */
9000 if (rb_enc_is_newline(prev, e, enc)) {
9001 e = prev;
9002 prev = rb_enc_prev_char(p, e, e, enc);
9003 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r') /* prev is checked for NULL here, i.e. when no character precedes the newline */
9004 e = prev;
9005 }
9006 return e;
9007}
9008 /* Returns the current rb_rs ($/) value, first issuing a deprecation warning when it is non-nil and not exactly the one-byte String "\n". */
9009static VALUE
9010get_rs(void)
9011{
9012 VALUE rs = rb_rs; /* still the real variable here: the rb_rs macro is only defined after this function */
9013 if (!NIL_P(rs) &&
9014 (!RB_TYPE_P(rs, T_STRING) ||
9015 RSTRING_LEN(rs) != 1 ||
9016 RSTRING_PTR(rs)[0] != '\n')) {
9017 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9018 }
9019 return rs;
9020}
9021
9022#define rb_rs get_rs() /* from this point on, every use of rb_rs in the file goes through get_rs() */
9023 /* Shared worker for String#each_line and String#lines: each line of str is pushed onto ary when ary is non-zero, otherwise yielded. Accepts an optional separator argument and a chomp: keyword. Returns ary, or the original receiver when yielding. */
9024static VALUE
9025rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9026{
9027 rb_encoding *enc;
9028 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9029 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9030 long pos, len, rslen;
9031 int rsnewline = 0;
9032
9033 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9034 rs = rb_rs; /* no separator argument: fall back to $/ (through get_rs(), which may warn) */
9035 if (!NIL_P(opts)) {
9036 static ID keywords[1];
9037 if (!keywords[0]) {
9038 keywords[0] = rb_intern_const("chomp");
9039 }
9040 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9041 chomp = (!UNDEF_P(chomp) && RTEST(chomp)); /* from here on chomp is a plain C truth value */
9042 }
9043
9044 if (NIL_P(rs)) { /* nil separator: the whole string is the single line */
9045 if (!ENUM_ELEM(ary, str)) {
9046 return ary;
9047 }
9048 else {
9049 return orig;
9050 }
9051 }
9052
9053 if (!RSTRING_LEN(str)) goto end;
9054 str = rb_str_new_frozen(str); /* scan a frozen version of the receiver; orig still names the receiver itself */
9055 ptr = subptr = RSTRING_PTR(str);
9056 pend = RSTRING_END(str);
9057 len = RSTRING_LEN(str);
9058 StringValue(rs);
9059 rslen = RSTRING_LEN(rs);
9060
9061 if (rs == rb_default_rs)
9062 enc = rb_enc_get(str);
9063 else
9064 enc = rb_enc_check(str, rs);
9065
9066 if (rslen == 0) {
9067 /* paragraph mode */
9068 int n;
9069 const char *eol = NULL;
9070 subend = subptr;
9071 while (subend < pend) {
9072 long chomp_rslen = 0;
9073 do {
9074 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9075 n = 0;
9076 rslen = n + rb_enc_mbclen(subend + n, pend, enc); /* width of an optional '\r' plus the character after it */
9077 if (rb_enc_is_newline(subend + n, pend, enc)) {
9078 if (eol == subend) break; /* a newline directly after the previous newline ends the inner scan */
9079 subend += rslen;
9080 if (subptr) {
9081 eol = subend;
9082 chomp_rslen = -rslen;
9083 }
9084 }
9085 else {
9086 if (!subptr) subptr = subend;
9087 subend += rslen;
9088 }
9089 rslen = 0;
9090 } while (subend < pend);
9091 if (!subptr) break;
9092 if (rslen == 0) chomp_rslen = 0;
9093 line = rb_str_subseq(str, subptr - ptr,
9094 subend - subptr + (chomp ? chomp_rslen : rslen));
9095 if (ENUM_ELEM(ary, line)) {
9096 str_mod_check(str, ptr, len); /* only after a block ran (ENUM_ELEM returned 1) */
9097 }
9098 subptr = eol = NULL;
9099 }
9100 goto end;
9101 }
9102 else {
9103 rsptr = RSTRING_PTR(rs);
9104 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9105 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9106 rsnewline = 1; /* separator is exactly one minimum-width newline character */
9107 }
9108 }
9109
9110 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) { /* default separator with a non-ASCII-compatible receiver: re-encode the separator into the receiver encoding */
9111 rs = rb_str_new(rsptr, rslen);
9112 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9113 rsptr = RSTRING_PTR(rs);
9114 rslen = RSTRING_LEN(rs);
9115 }
9116
9117 while (subptr < pend) {
9118 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9119 if (pos < 0) break;
9120 hit = subptr + pos;
9121 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9122 if (hit != adjusted) { /* hit is not at a character head: resume searching from the next character head */
9123 subptr = adjusted;
9124 continue;
9125 }
9126 subend = hit += rslen; /* both now point just past the separator */
9127 if (chomp) {
9128 if (rsnewline) {
9129 subend = chomp_newline(subptr, subend, enc);
9130 }
9131 else {
9132 subend -= rslen;
9133 }
9134 }
9135 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9136 if (ENUM_ELEM(ary, line)) {
9137 str_mod_check(str, ptr, len);
9138 }
9139 subptr = hit;
9140 }
9141
9142 if (subptr != pend) { /* trailing text after the last separator found */
9143 if (chomp) {
9144 if (rsnewline) {
9145 pend = chomp_newline(subptr, pend, enc);
9146 }
9147 else if (pend - subptr >= rslen &&
9148 memcmp(pend - rslen, rsptr, rslen) == 0) {
9149 pend -= rslen;
9150 }
9151 }
9152 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9153 ENUM_ELEM(ary, line);
9154 RB_GC_GUARD(str);
9155 }
9156
9157 end:
9158 if (ary)
9159 return ary;
9160 else
9161 return orig;
9162}
9163
9164/*
9165 * call-seq:
9166 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9167 * each_line(line_sep = $/, chomp: false) -> enumerator
9168 *
9169 * :include: doc/string/each_line.rdoc
9170 *
9171 */
9172
9173static VALUE
9174rb_str_each_line(int argc, VALUE *argv, VALUE str)
9175{
9176 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0); /* the no-block form documented above; 0 is passed as the size function */
9177 return rb_str_enumerate_lines(argc, argv, str, 0); /* ary == 0: lines are yielded and the receiver is returned */
9178}
9179
9180/*
9181 * call-seq:
9182 * lines(line_sep = $/, chomp: false) -> array_of_strings
9183 *
9184 * Forms substrings ("lines") of +self+ according to the given arguments
9185 * (see String#each_line for details); returns the lines in an array.
9186 *
9187 */
9188
9189static VALUE
9190rb_str_lines(int argc, VALUE *argv, VALUE str)
9191{
9192 VALUE ary = WANTARRAY("lines", 0); /* new array when no block is given; 0 when a block is given, in which case the lines are yielded instead */
9193 return rb_str_enumerate_lines(argc, argv, str, ary);
9194}
9195 /* Size function handed to RETURN_SIZED_ENUMERATOR by rb_str_each_byte: the byte length of str. args and eobj are unused. */
9196static VALUE
9197rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9198{
9199 return LONG2FIX(RSTRING_LEN(str));
9200}
9201 /* Shared worker for String#each_byte and String#bytes: every byte of str, as an Integer 0..255, is pushed onto ary (when non-zero) or yielded. Returns ary, or str when yielding. */
9202static VALUE
9203rb_str_enumerate_bytes(VALUE str, VALUE ary)
9204{
9205 long i;
9206
9207 for (i=0; i<RSTRING_LEN(str); i++) { /* length and pointer are re-read from str on every iteration */
9208 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9209 }
9210 if (ary)
9211 return ary;
9212 else
9213 return str;
9214}
9215
9216/*
9217 * call-seq:
9218 * each_byte {|byte| ... } -> self
9219 * each_byte -> enumerator
9220 *
9221 * :include: doc/string/each_byte.rdoc
9222 *
9223 */
9224
9225static VALUE
9226rb_str_each_byte(VALUE str)
9227{
9228 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size); /* the no-block form documented above, sized by byte length */
9229 return rb_str_enumerate_bytes(str, 0); /* ary == 0: bytes are yielded and the receiver is returned */
9230}
9231
9232/*
9233 * call-seq:
9234 * bytes -> array_of_bytes
9235 *
9236 * :include: doc/string/bytes.rdoc
9237 *
9238 */
9239
9240static VALUE
9241rb_str_bytes(VALUE str)
9242{
9243 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str)); /* array pre-sized to the byte length, or 0 when a block is given */
9244 return rb_str_enumerate_bytes(str, ary);
9245}
9246 /* Size function handed to RETURN_SIZED_ENUMERATOR by rb_str_each_char and rb_str_each_codepoint: the result of rb_str_length(str). args and eobj are unused. */
9247static VALUE
9248rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9249{
9250 return rb_str_length(str);
9251}
9252 /* Shared worker for String#each_char and String#chars: every character of str, as a substring, is pushed onto ary (when non-zero) or yielded. Returns ary, or the original receiver when yielding. */
9253static VALUE
9254rb_str_enumerate_chars(VALUE str, VALUE ary)
9255{
9256 VALUE orig = str;
9257 long i, len, n;
9258 const char *ptr;
9259 rb_encoding *enc;
9260
9261 str = rb_str_new_frozen(str); /* iterate over a frozen version; orig still names the receiver itself */
9262 ptr = RSTRING_PTR(str);
9263 len = RSTRING_LEN(str);
9264 enc = rb_enc_get(str);
9265
9266 if (ENC_CODERANGE_CLEAN_P(ENC_CODERANGE(str))) { /* code range already known to be 7bit or valid: the fast length routine can be used */
9267 for (i = 0; i < len; i += n) {
9268 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9269 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9270 }
9271 }
9272 else { /* unknown or broken code range: use the checking length routine */
9273 for (i = 0; i < len; i += n) {
9274 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9275 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9276 }
9277 }
9278 RB_GC_GUARD(str); /* keeps str referenced while ptr is still in use above */
9279 if (ary)
9280 return ary;
9281 else
9282 return orig;
9283}
9284
9285/*
9286 * call-seq:
9287 * each_char {|c| ... } -> self
9288 * each_char -> enumerator
9289 *
9290 * :include: doc/string/each_char.rdoc
9291 *
9292 */
9293
9294static VALUE
9295rb_str_each_char(VALUE str)
9296{
9297 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size); /* the no-block form documented above, sized by rb_str_length() */
9298 return rb_str_enumerate_chars(str, 0); /* ary == 0: characters are yielded and the receiver is returned */
9299}
9300
9301/*
9302 * call-seq:
9303 * chars -> array_of_characters
9304 *
9305 * :include: doc/string/chars.rdoc
9306 *
9307 */
9308
9309static VALUE
9310rb_str_chars(VALUE str)
9311{
9312 VALUE ary = WANTARRAY("chars", rb_str_strlen(str)); /* array pre-sized with rb_str_strlen(str), or 0 when a block is given */
9313 return rb_str_enumerate_chars(str, ary);
9314}
9315 /* Shared worker for String#each_codepoint and String#codepoints: every codepoint of str, as an Integer, is pushed onto ary (when non-zero) or yielded. Returns ary, or the original receiver when yielding. */
9316static VALUE
9317rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9318{
9319 VALUE orig = str;
9320 int n;
9321 unsigned int c;
9322 const char *ptr, *end;
9323 rb_encoding *enc;
9324
9325 if (single_byte_optimizable(str)) /* when this holds, the byte enumeration is used as-is for the codepoints */
9326 return rb_str_enumerate_bytes(str, ary);
9327
9328 str = rb_str_new_frozen(str); /* iterate over a frozen version; orig still names the receiver itself */
9329 ptr = RSTRING_PTR(str);
9330 end = RSTRING_END(str);
9331 enc = STR_ENC_GET(str);
9332
9333 while (ptr < end) {
9334 c = rb_enc_codepoint_len(ptr, end, &n, enc); /* n receives the byte width of the decoded character */
9335 ENUM_ELEM(ary, UINT2NUM(c));
9336 ptr += n;
9337 }
9338 RB_GC_GUARD(str);
9339 if (ary)
9340 return ary;
9341 else
9342 return orig;
9343}
9344
9345/*
9346 * call-seq:
9347 * each_codepoint {|integer| ... } -> self
9348 * each_codepoint -> enumerator
9349 *
9350 * :include: doc/string/each_codepoint.rdoc
9351 *
9352 */
9353
9354static VALUE
9355rb_str_each_codepoint(VALUE str)
9356{
9357 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size); /* the no-block form documented above; reuses the each_char size function */
9358 return rb_str_enumerate_codepoints(str, 0); /* ary == 0: codepoints are yielded and the receiver is returned */
9359}
9360
9361/*
9362 * call-seq:
9363 * codepoints -> array_of_integers
9364 *
9365 * :include: doc/string/codepoints.rdoc
9366 *
9367 */
9368
9369static VALUE
9370rb_str_codepoints(VALUE str)
9371{
9372 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str)); /* array pre-sized with rb_str_strlen(str), or 0 when a block is given */
9373 return rb_str_enumerate_codepoints(str, ary);
9374}
9375 /* Compiles the pattern \X for the given encoding and returns the new regex. For UTF-16BE/LE and UTF-32BE/LE the two pattern characters are spelled out in that encoding; every other encoding uses the ASCII bytes. A compile failure is reported through rb_fatal(). */
9376static regex_t *
9377get_reg_grapheme_cluster(rb_encoding *enc)
9378{
9379 int encidx = rb_enc_to_index(enc);
9380
9381 const OnigUChar source_ascii[] = "\\X";
9382 const OnigUChar *source = source_ascii;
9383 size_t source_len = sizeof(source_ascii) - 1; /* minus the terminating NUL */
9384
9385 switch (encidx) {
9386#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9387#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9388#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9389#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9390#define CASE_UTF(e) \
9391 case ENCINDEX_UTF_##e: { \
9392 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9393 source = source_UTF_##e; \
9394 source_len = sizeof(source_UTF_##e); \
9395 break; \
9396 }
9397 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9398#undef CASE_UTF
9399#undef CHARS_16BE
9400#undef CHARS_16LE
9401#undef CHARS_32BE
9402#undef CHARS_32LE
9403 }
9404
9405 regex_t *reg_grapheme_cluster;
9406 OnigErrorInfo einfo;
9407 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9408 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9409 if (r) { /* non-zero means onig_new() failed */
9410 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9411 onig_error_code_to_str(message, r, &einfo);
9412 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9413 }
9414
9415 return reg_grapheme_cluster; /* callers in this file onig_free() it unless it came from the UTF-8 cache */
9416}
9417 /* Returns the cached grapheme-cluster regex for UTF-8, compiling it on first use and keeping it in a function-local static; returns NULL for every other encoding. */
9418static regex_t *
9419get_cached_reg_grapheme_cluster(rb_encoding *enc)
9420{
9421 int encidx = rb_enc_to_index(enc);
9422 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9423
9424 if (encidx == rb_utf8_encindex()) {
9425 if (!reg_grapheme_cluster_utf8) {
9426 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9427 }
9428
9429 return reg_grapheme_cluster_utf8;
9430 }
9431
9432 return NULL; /* not UTF-8: the caller compiles (and later frees) its own regex */
9433}
9434 /* Size function handed to RETURN_SIZED_ENUMERATOR by rb_str_each_grapheme_cluster: counts successive \X matches from the start of str. For a non-Unicode encoding it returns rb_str_length(str) instead. args and eobj are unused. */
9435static VALUE
9436rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9437{
9438 size_t grapheme_cluster_count = 0;
9439 rb_encoding *enc = get_encoding(str);
9440 const char *ptr, *end;
9441
9442 if (!rb_enc_unicode_p(enc)) {
9443 return rb_str_length(str);
9444 }
9445
9446 bool cached_reg_grapheme_cluster = true;
9447 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9448 if (!reg_grapheme_cluster) { /* no cached regex for this encoding: compile a private one, freed below */
9449 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9450 cached_reg_grapheme_cluster = false;
9451 }
9452
9453 ptr = RSTRING_PTR(str);
9454 end = RSTRING_END(str);
9455
9456 while (ptr < end) {
9457 OnigPosition len = onig_match(reg_grapheme_cluster,
9458 (const OnigUChar *)ptr, (const OnigUChar *)end,
9459 (const OnigUChar *)ptr, NULL, 0);
9460 if (len <= 0) break; /* no match or empty match: stop counting */
9461 grapheme_cluster_count++;
9462 ptr += len;
9463 }
9464
9465 if (!cached_reg_grapheme_cluster) {
9466 onig_free(reg_grapheme_cluster);
9467 }
9468
9469 return SIZET2NUM(grapheme_cluster_count);
9470}
9471 /* Shared worker for String#each_grapheme_cluster and String#grapheme_clusters: each successive \X match in str, as a substring, is pushed onto ary (when non-zero) or yielded. For a non-Unicode encoding it delegates to rb_str_enumerate_chars(). Returns ary, or the original receiver when yielding. */
9472static VALUE
9473rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9474{
9475 VALUE orig = str;
9476 rb_encoding *enc = get_encoding(str);
9477 const char *ptr0, *ptr, *end;
9478
9479 if (!rb_enc_unicode_p(enc)) {
9480 return rb_str_enumerate_chars(str, ary);
9481 }
9482
9483 if (!ary) str = rb_str_new_frozen(str); /* only when yielding to a block is a frozen version of the receiver scanned */
9484
9485 bool cached_reg_grapheme_cluster = true;
9486 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9487 if (!reg_grapheme_cluster) { /* no cached regex for this encoding: compile a private one, freed below */
9488 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9489 cached_reg_grapheme_cluster = false;
9490 }
9491
9492 ptr0 = ptr = RSTRING_PTR(str);
9493 end = RSTRING_END(str);
9494
9495 while (ptr < end) {
9496 OnigPosition len = onig_match(reg_grapheme_cluster,
9497 (const OnigUChar *)ptr, (const OnigUChar *)end,
9498 (const OnigUChar *)ptr, NULL, 0);
9499 if (len <= 0) break; /* no match or empty match: stop enumerating */
9500 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9501 ptr += len;
9502 }
9503
9504 if (!cached_reg_grapheme_cluster) {
9505 onig_free(reg_grapheme_cluster);
9506 }
9507
9508 RB_GC_GUARD(str);
9509 if (ary)
9510 return ary;
9511 else
9512 return orig;
9513}
9514
9515/*
9516 * call-seq:
9517 * each_grapheme_cluster {|gc| ... } -> self
9518 * each_grapheme_cluster -> enumerator
9519 *
9520 * :include: doc/string/each_grapheme_cluster.rdoc
9521 *
9522 */
9523
9524static VALUE
9525rb_str_each_grapheme_cluster(VALUE str)
9526{
9527 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size); /* the no-block form documented above, sized by counting clusters */
9528 return rb_str_enumerate_grapheme_clusters(str, 0); /* ary == 0: clusters are yielded and the receiver is returned */
9529}
9530
9531/*
9532 * call-seq:
9533 * grapheme_clusters -> array_of_grapheme_clusters
9534 *
9535 * :include: doc/string/grapheme_clusters.rdoc
9536 *
9537 */
9538
9539static VALUE
9540rb_str_grapheme_clusters(VALUE str)
9541{
9542 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str)); /* capacity hint is rb_str_strlen(str), not a cluster count; 0 when a block is given */
9543 return rb_str_enumerate_grapheme_clusters(str, ary);
9544}
9545 /* Returns the byte length str would have after chop: the last character is dropped, and when that character is '\n' directly preceded by '\r', both are dropped. Returns 0 for an empty string. Does not modify str. */
9546static long
9547chopped_length(VALUE str)
9548{
9549 rb_encoding *enc = STR_ENC_GET(str);
9550 const char *p, *p2, *beg, *end;
9551
9552 beg = RSTRING_PTR(str);
9553 end = beg + RSTRING_LEN(str);
9554 if (beg >= end) return 0;
9555 p = rb_enc_prev_char(beg, end, end, enc); /* start of the last character */
9556 if (!p) return 0;
9557 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') { /* p > beg: something precedes the newline */
9558 p2 = rb_enc_prev_char(beg, p, end, enc);
9559 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9560 }
9561 return p - beg;
9562}
9563
9564/*
9565 * call-seq:
9566 * chop! -> self or nil
9567 *
9568 * Like String#chop, but modifies +self+ in place;
9569 * returns +nil+ if +self+ is empty, +self+ otherwise.
9570 *
9571 * Related: String#chomp!.
9572 */
9573
9574static VALUE
9575rb_str_chop_bang(VALUE str)
9576{
9577 str_modify_keep_cr(str);
9578 if (RSTRING_LEN(str) > 0) {
9579 long len;
9580 len = chopped_length(str);
9581 STR_SET_LEN(str, len);
9582 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str)); /* rewrite the terminator after the shortened content */
9583 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9584 ENC_CODERANGE_CLEAR(str); /* the removed bytes may have been the only non-ASCII ones, so a cached non-7bit range can be stale; 7bit stays 7bit */
9585 }
9586 return str;
9587 }
9588 return Qnil; /* empty receiver: nothing to chop */
9589}
9590
9591
9592/*
9593 * call-seq:
9594 * chop -> new_string
9595 *
9596 * :include: doc/string/chop.rdoc
9597 *
9598 */
9599
9600static VALUE
9601rb_str_chop(VALUE str)
9602{
9603 return rb_str_subseq(str, 0, chopped_length(str)); /* new string holding the leading chopped_length(str) bytes; the receiver is not modified */
9604}
9605 /* Default-separator chomp for the byte range [p, e) of str: returns the length left after removing a trailing "\n", "\r\n" or lone "\r". Encodings whose minimum character width exceeds one byte take the encoding-aware branch. NOTE(review): the single-byte branch reads *(e-1), so callers presumably guarantee e > p -- confirm. */
9606static long
9607smart_chomp(VALUE str, const char *e, const char *p)
9608{
9609 rb_encoding *enc = rb_enc_get(str);
9610 if (rb_enc_mbminlen(enc) > 1) {
9611 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9612 if (rb_enc_is_newline(pp, e, enc)) {
9613 e = pp;
9614 }
9615 pp = e - rb_enc_mbminlen(enc);
9616 if (pp >= p) {
9617 pp = rb_enc_left_char_head(p, pp, e, enc);
9618 if (rb_enc_ascget(pp, e, 0, enc) == '\r') { /* checked whether or not a newline was just removed, so a lone trailing '\r' goes too */
9619 e = pp;
9620 }
9621 }
9622 }
9623 else {
9624 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9625 case '\n':
9626 if (--e > p && *(e-1) == '\r') {
9627 --e;
9628 }
9629 break;
9630 case '\r':
9631 --e;
9632 break;
9633 }
9634 }
9635 return e - p;
9636}
9637 /* Returns the byte length str would have after chomping the separator rs from its end, or the unchanged length when nothing is to be removed. Does not modify str. */
9638static long
9639chompped_length(VALUE str, VALUE rs)
9640{
9641 rb_encoding *enc;
9642 int newline;
9643 char *pp, *e, *rsptr;
9644 long rslen;
9645 char *const p = RSTRING_PTR(str);
9646 long len = RSTRING_LEN(str);
9647
9648 if (len == 0) return 0;
9649 e = p + len;
9650 if (rs == rb_default_rs) { /* default separator: "\n", "\r\n" or "\r" handling */
9651 return smart_chomp(str, e, p);
9652 }
9653
9654 enc = rb_enc_get(str);
9655 RSTRING_GETMEM(rs, rsptr, rslen);
9656 if (rslen == 0) { /* empty separator: remove every trailing newline, each with an optional '\r' before it */
9657 if (rb_enc_mbminlen(enc) > 1) {
9658 while (e > p) {
9659 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9660 if (!rb_enc_is_newline(pp, e, enc)) break;
9661 e = pp;
9662 pp -= rb_enc_mbminlen(enc);
9663 if (pp >= p) {
9664 pp = rb_enc_left_char_head(p, pp, e, enc);
9665 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9666 e = pp;
9667 }
9668 }
9669 }
9670 }
9671 else {
9672 while (e > p && *(e-1) == '\n') {
9673 --e;
9674 if (e > p && *(e-1) == '\r')
9675 --e;
9676 }
9677 }
9678 return e - p;
9679 }
9680 if (rslen > len) return len; /* a separator longer than the string cannot be a suffix of it */
9681
9682 enc = rb_enc_get(rs);
9683 newline = rsptr[rslen-1]; /* last byte of the separator */
9684 if (rslen == rb_enc_mbminlen(enc)) { /* separator is one minimum-width character: a newline is treated like the default separator */
9685 if (rslen == 1) {
9686 if (newline == '\n')
9687 return smart_chomp(str, e, p);
9688 }
9689 else {
9690 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
9691 return smart_chomp(str, e, p);
9692 }
9693 }
9694
9695 enc = rb_enc_check(str, rs);
9696 if (is_broken_string(rs)) {
9697 return len;
9698 }
9699 pp = e - rslen; /* where the separator would have to start */
9700 if (p[len-1] == newline && /* cheap last-byte test before the full comparison */
9701 (rslen <= 1 ||
9702 memcmp(rsptr, pp, rslen) == 0)) {
9703 if (at_char_boundary(p, pp, e, enc)) /* the suffix only counts when it starts at a character boundary */
9704 return len - rslen;
9705 RB_GC_GUARD(rs);
9706 }
9707 return len;
9708}
9709 /* Returns the separator to use for chomp / chomp!: the single optional argument (run through StringValue unless it is nil), or rb_rs ($/, read via get_rs()) when no argument is given. rb_check_arity rejects more than one argument. */
9715static VALUE
9716chomp_rs(int argc, const VALUE *argv)
9717{
9718 rb_check_arity(argc, 0, 1);
9719 if (argc > 0) {
9720 VALUE rs = argv[0];
9721 if (!NIL_P(rs)) StringValue(rs); /* nil is passed through untouched; callers treat it as "do not chomp" */
9722 return rs;
9723 }
9724 else {
9725 return rb_rs;
9726 }
9727}
9728 /* Chomps the separator rs from the end of str in place. Returns str when something was removed, Qnil when the chomped length is not shorter than the current length (str is then left untouched). */
9729VALUE
9730rb_str_chomp_string(VALUE str, VALUE rs)
9731{
9732 long olen = RSTRING_LEN(str);
9733 long len = chompped_length(str, rs);
9734 if (len >= olen) return Qnil; /* nothing to remove */
9735 str_modify_keep_cr(str);
9736 STR_SET_LEN(str, len);
9737 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str)); /* rewrite the terminator after the shortened content */
9738 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9739 ENC_CODERANGE_CLEAR(str); /* the removed bytes may have been the only non-ASCII ones, so a cached non-7bit range can be stale; 7bit stays 7bit */
9740 }
9741 return str;
9742}
9743
9744/*
9745 * call-seq:
9746 * chomp!(line_sep = $/) -> self or nil
9747 *
9748 * Like String#chomp, but modifies +self+ in place;
9749 * returns +nil+ if no modification made, +self+ otherwise.
9750 *
9751 */
9752
9753static VALUE
9754rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9755{
9756 VALUE rs;
9757 str_modifiable(str);
9758 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil; /* empty receiver: nothing to do; with argc >= 2 control still reaches chomp_rs() and its arity check */
9759 rs = chomp_rs(argc, argv);
9760 if (NIL_P(rs)) return Qnil; /* nil separator: no chomping */
9761 return rb_str_chomp_string(str, rs);
9762}
9763
9764
9765/*
9766 * call-seq:
9767 * chomp(line_sep = $/) -> new_string
9768 *
9769 * :include: doc/string/chomp.rdoc
9770 *
9771 */
9772
9773static VALUE
9774rb_str_chomp(int argc, VALUE *argv, VALUE str)
9775{
9776 VALUE rs = chomp_rs(argc, argv);
9777 if (NIL_P(rs)) return str_duplicate(rb_cString, str); /* nil separator: a duplicate is returned with nothing removed */
9778 return rb_str_subseq(str, 0, chompped_length(str, rs)); /* new string holding the chomped prefix; the receiver is not modified */
9779}
9780 /* Returns the number of leading bytes in [s, e) made up of NUL and whitespace characters, i.e. the byte offset of the first character that lstrip keeps. Returns 0 when s is NULL or the range is empty. */
9781static long
9782lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9783{
9784 const char *const start = s;
9785
9786 if (!s || s >= e) return 0;
9787
9788 /* remove spaces at head */
9789 if (single_byte_optimizable(str)) { /* byte-wise scan using the ASCII whitespace table */
9790 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9791 }
9792 else {
9793 while (s < e) {
9794 int n;
9795 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9796
9797 if (cc && !rb_isspace(cc)) break; /* stop at the first character that is neither NUL nor whitespace */
9798 s += n;
9799 }
9800 }
9801 return s - start;
9802}
9803
9804/*
9805 * call-seq:
9806 * lstrip! -> self or nil
9807 *
9808 * Like String#lstrip, except that any modifications are made in +self+;
9809 * returns +self+ if any modifications are made, +nil+ otherwise.
9810 *
9811 * Related: String#rstrip!, String#strip!.
9812 */
9813
9814static VALUE
9815rb_str_lstrip_bang(VALUE str)
9816{
9817 rb_encoding *enc;
9818 char *start, *s;
9819 long olen, loffset;
9820
9821 str_modify_keep_cr(str);
9822 enc = STR_ENC_GET(str);
9823 RSTRING_GETMEM(str, start, olen);
9824 loffset = lstrip_offset(str, start, start+olen, enc);
9825 if (loffset > 0) {
9826 long len = olen-loffset;
9827 s = start + loffset;
9828 memmove(start, s, len); /* slide the kept content to the front; source and destination overlap, hence memmove */
9829 STR_SET_LEN(str, len);
9830 TERM_FILL(start+len, rb_enc_mbminlen(enc)); /* rewrite the terminator after the shortened content */
9831 return str;
9832 }
9833 return Qnil; /* nothing was stripped */
9834}
9835
9836
9837/*
9838 * call-seq:
9839 * lstrip -> new_string
9840 *
9841 * Returns a copy of +self+ with leading whitespace removed;
9842 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9843 *
9844 * whitespace = "\x00\t\n\v\f\r "
9845 * s = whitespace + 'abc' + whitespace
9846 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9847 * s.lstrip # => "abc\u0000\t\n\v\f\r "
9848 *
9849 * Related: String#rstrip, String#strip.
9850 */
9851
9852static VALUE
9853rb_str_lstrip(VALUE str)
9854{
9855 char *start;
9856 long len, loffset;
9857 RSTRING_GETMEM(str, start, len);
9858 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9859 if (loffset <= 0) return str_duplicate(rb_cString, str); /* nothing to strip: return a plain duplicate */
9860 return rb_str_subseq(str, loffset, len - loffset); /* everything from the first kept byte to the end */
9861}
9862 /* Returns the number of trailing bytes in [s, e) made up of NUL and whitespace characters, i.e. how many bytes rstrip removes. Raises rb_eEncCompatError when the code range of str is broken. Returns 0 when s is NULL or the range is empty. */
9863static long
9864rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9865{
9866 const char *t;
9867
9868 rb_str_check_dummy_enc(enc); /* NOTE(review): presumably raises for dummy encodings; defined outside this chunk -- confirm */
9869 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
9870 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
9871 }
9872 if (!s || s >= e) return 0;
9873 t = e;
9874
9875 /* remove trailing spaces or '\0's */
9876 if (single_byte_optimizable(str)) { /* byte-wise backward scan using the ASCII whitespace table */
9877 unsigned char c;
9878 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9879 }
9880 else {
9881 char *tp;
9882
9883 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) { /* step backwards one character at a time */
9884 unsigned int c = rb_enc_codepoint(tp, e, enc);
9885 if (c && !rb_isspace(c)) break; /* stop at the first character that is neither NUL nor whitespace */
9886 t = tp;
9887 }
9888 }
9889 return e - t;
9890}
9891
9892/*
9893 * call-seq:
9894 * rstrip! -> self or nil
9895 *
9896 * Like String#rstrip, except that any modifications are made in +self+;
9897 * returns +self+ if any modifications are made, +nil+ otherwise.
9898 *
9899 * Related: String#lstrip!, String#strip!.
9900 */
9901
9902static VALUE
9903rb_str_rstrip_bang(VALUE str)
9904{
9905 rb_encoding *enc;
9906 char *start;
9907 long olen, roffset;
9908
9909 str_modify_keep_cr(str);
9910 enc = STR_ENC_GET(str);
9911 RSTRING_GETMEM(str, start, olen);
9912 roffset = rstrip_offset(str, start, start+olen, enc);
9913 if (roffset > 0) {
9914 long len = olen - roffset;
9915
9916 STR_SET_LEN(str, len); /* only the length shrinks; no bytes need to move */
9917 TERM_FILL(start+len, rb_enc_mbminlen(enc)); /* rewrite the terminator after the shortened content */
9918 return str;
9919 }
9920 return Qnil; /* nothing was stripped */
9921}
9922
9923
9924/*
9925 * call-seq:
9926 * rstrip -> new_string
9927 *
9928 * Returns a copy of the receiver with trailing whitespace removed;
9929 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9930 *
9931 * whitespace = "\x00\t\n\v\f\r "
9932 * s = whitespace + 'abc' + whitespace
9933 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9934 * s.rstrip # => "\u0000\t\n\v\f\r abc"
9935 *
9936 * Related: String#lstrip, String#strip.
9937 */
9938
9939static VALUE
9940rb_str_rstrip(VALUE str)
9941{
9942 rb_encoding *enc;
9943 char *start;
9944 long olen, roffset;
9945
9946 enc = STR_ENC_GET(str);
9947 RSTRING_GETMEM(str, start, olen);
9948 roffset = rstrip_offset(str, start, start+olen, enc);
9949
9950 if (roffset <= 0) return str_duplicate(rb_cString, str);
9951 return rb_str_subseq(str, 0, olen-roffset);
9952}
9953
9954
9955/*
9956 * call-seq:
9957 * strip! -> self or nil
9958 *
9959 * Like String#strip, except that any modifications are made in +self+;
9960 * returns +self+ if any modifications are made, +nil+ otherwise.
9961 *
9962 * Related: String#lstrip!, String#rstrip!.
9963 */
9964
9965static VALUE
9966rb_str_strip_bang(VALUE str)
9967{
9968 char *start;
9969 long olen, loffset, roffset;
9970 rb_encoding *enc;
9971
9972 str_modify_keep_cr(str);
9973 enc = STR_ENC_GET(str);
9974 RSTRING_GETMEM(str, start, olen);
9975 loffset = lstrip_offset(str, start, start+olen, enc);
9976 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9977
9978 if (loffset > 0 || roffset > 0) {
9979 long len = olen-roffset;
9980 if (loffset > 0) {
9981 len -= loffset;
9982 memmove(start, start + loffset, len);
9983 }
9984 STR_SET_LEN(str, len);
9985 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9986 return str;
9987 }
9988 return Qnil;
9989}
9990
9991
9992/*
9993 * call-seq:
9994 * strip -> new_string
9995 *
9996 * Returns a copy of the receiver with leading and trailing whitespace removed;
9997 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9998 *
9999 * whitespace = "\x00\t\n\v\f\r "
10000 * s = whitespace + 'abc' + whitespace
10001 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10002 * s.strip # => "abc"
10003 *
10004 * Related: String#lstrip, String#rstrip.
10005 */
10006
10007static VALUE
10008rb_str_strip(VALUE str)
10009{
10010 char *start;
10011 long olen, loffset, roffset;
10012 rb_encoding *enc = STR_ENC_GET(str);
10013
10014 RSTRING_GETMEM(str, start, olen);
10015 loffset = lstrip_offset(str, start, start+olen, enc);
10016 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10017
10018 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10019 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10020}
10021
10022static VALUE
10023scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10024{
10025 VALUE result = Qnil;
10026 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10027 if (pos >= 0) {
10028 VALUE match;
10029 struct re_registers *regs;
10030 if (BUILTIN_TYPE(pat) == T_STRING) {
10031 regs = NULL;
10032 end = pos + RSTRING_LEN(pat);
10033 }
10034 else {
10035 match = rb_backref_get();
10036 regs = RMATCH_REGS(match);
10037 pos = BEG(0);
10038 end = END(0);
10039 }
10040
10041 if (pos == end) {
10042 rb_encoding *enc = STR_ENC_GET(str);
10043 /*
10044 * Always consume at least one character of the input string
10045 */
10046 if (RSTRING_LEN(str) > end)
10047 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10048 RSTRING_END(str), enc);
10049 else
10050 *start = end + 1;
10051 }
10052 else {
10053 *start = end;
10054 }
10055
10056 if (!regs || regs->num_regs == 1) {
10057 result = rb_str_subseq(str, pos, end - pos);
10058 return result;
10059 }
10060 else {
10061 result = rb_ary_new2(regs->num_regs);
10062 for (int i = 1; i < regs->num_regs; i++) {
10063 VALUE s = Qnil;
10064 if (BEG(i) >= 0) {
10065 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10066 }
10067
10068 rb_ary_push(result, s);
10069 }
10070 }
10071
10072 RB_GC_GUARD(match);
10073 }
10074
10075 return result;
10076}
10077
10078
10079/*
10080 * call-seq:
10081 * scan(string_or_regexp) -> array
10082 * scan(string_or_regexp) {|matches| ... } -> self
10083 *
10084 * Matches a pattern against +self+; the pattern is:
10085 *
10086 * - +string_or_regexp+ itself, if it is a Regexp.
10087 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10088 *
10089 * Iterates through +self+, generating a collection of matching results:
10090 *
10091 * - If the pattern contains no groups, each result is the
10092 * matched string, <code>$&</code>.
10093 * - If the pattern contains groups, each result is an array
10094 * containing one entry per group.
10095 *
10096 * With no block given, returns an array of the results:
10097 *
10098 * s = 'cruel world'
10099 * s.scan(/\w+/) # => ["cruel", "world"]
10100 * s.scan(/.../) # => ["cru", "el ", "wor"]
10101 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10102 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10103 *
10104 * With a block given, calls the block with each result; returns +self+:
10105 *
10106 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10107 * print "\n"
10108 * s.scan(/(.)(.)/) {|x,y| print y, x }
10109 * print "\n"
10110 *
10111 * Output:
10112 *
10113 * <<cruel>> <<world>>
10114 * rceu lowlr
10115 *
10116 */
10117
10118static VALUE
10119rb_str_scan(VALUE str, VALUE pat)
10120{
10121 VALUE result;
10122 long start = 0;
10123 long last = -1, prev = 0;
10124 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10125
10126 pat = get_pat_quoted(pat, 1);
10127 mustnot_broken(str);
10128 if (!rb_block_given_p()) {
10129 VALUE ary = rb_ary_new();
10130
10131 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10132 last = prev;
10133 prev = start;
10134 rb_ary_push(ary, result);
10135 }
10136 if (last >= 0) rb_pat_search(pat, str, last, 1);
10137 else rb_backref_set(Qnil);
10138 return ary;
10139 }
10140
10141 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10142 last = prev;
10143 prev = start;
10144 rb_yield(result);
10145 str_mod_check(str, p, len);
10146 }
10147 if (last >= 0) rb_pat_search(pat, str, last, 1);
10148 return str;
10149}
10150
10151
10152/*
10153 * call-seq:
10154 * hex -> integer
10155 *
10156 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10157 * (with an optional sign and an optional <code>0x</code>) and returns the
10158 * corresponding number;
10159 * returns zero if there is no such leading substring:
10160 *
10161 * '0x0a'.hex # => 10
10162 * '-1234'.hex # => -4660
10163 * '0'.hex # => 0
10164 * 'non-numeric'.hex # => 0
10165 *
10166 * Related: String#oct.
10167 *
10168 */
10169
10170static VALUE
10171rb_str_hex(VALUE str)
10172{
10173 return rb_str_to_inum(str, 16, FALSE);
10174}
10175
10176
10177/*
10178 * call-seq:
10179 * oct -> integer
10180 *
10181 * Interprets the leading substring of +self+ as a string of octal digits
10182 * (with an optional sign) and returns the corresponding number;
10183 * returns zero if there is no such leading substring:
10184 *
10185 * '123'.oct # => 83
10186 * '-377'.oct # => -255
10187 * '0377non-numeric'.oct # => 255
10188 * 'non-numeric'.oct # => 0
10189 *
10190 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10191 * see Kernel#Integer.
10192 *
10193 * Related: String#hex.
10194 *
10195 */
10196
10197static VALUE
10198rb_str_oct(VALUE str)
10199{
10200 return rb_str_to_inum(str, -8, FALSE);
10201}
10202
10203#ifndef HAVE_CRYPT_R
10204# include "ruby/thread_native.h"
10205# include "ruby/atomic.h"
10206
10207static struct {
10208 rb_nativethread_lock_t lock;
10209} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10210
10211static void
10212crypt_mutex_initialize(void)
10213{
10214}
10215#endif
10216
10217/*
10218 * call-seq:
10219 * crypt(salt_str) -> new_string
10220 *
10221 * Returns the string generated by calling <code>crypt(3)</code>
10222 * standard library function with <code>str</code> and
10223 * <code>salt_str</code>, in this order, as its arguments. Please do
10224 * not use this method any longer. It is legacy; provided only for
10225 * backward compatibility with ruby scripts in earlier days. It is
10226 * bad to use in contemporary programs for several reasons:
10227 *
10228 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10229 * run. The generated string lacks data portability.
10230 *
10231 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10232 * (i.e. silently ends up in unexpected results).
10233 *
10234 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10235 * thread safe.
10236 *
10237 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10238 * very very weak. According to its manpage, Linux's traditional
10239 * <code>crypt(3)</code> output has only 2**56 variations; too
10240 * easy to brute force today. And this is the default behaviour.
10241 *
10242 * * In order to make things robust some OSes implement so-called
10243 * "modular" usage. To go through, you have to do a complex
10244 * build-up of the <code>salt_str</code> parameter, by hand.
10245 * Failure in generation of a proper salt string tends not to
10246 * yield any errors; typos in parameters are normally not
10247 * detectable.
10248 *
10249 * * For instance, in the following example, the second invocation
10250 * of String#crypt is wrong; it has a typo in "round=" (lacks
10251 * "s"). However the call does not fail and something unexpected
10252 * is generated.
10253 *
10254 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10255 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10256 *
10257 * * Even in the "modular" mode, some hash functions are considered
10258 * archaic and no longer recommended at all; for instance module
10259 * <code>$1$</code> is officially abandoned by its author: see
10260 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10261 * instance module <code>$3$</code> is considered completely
10262 * broken: see the manpage of FreeBSD.
10263 *
10264 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10265 * written above, <code>crypt(3)</code> on Mac OS never fails.
10266 * This means even if you build up a proper salt string it
10267 * generates a traditional DES hash anyways, and there is no way
10268 * for you to be aware of.
10269 *
10270 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10271 *
10272 * If for some reason you cannot migrate to other secure contemporary
10273 * password hashing algorithms, install the string-crypt gem and
10274 * <code>require 'string/crypt'</code> to continue using it.
10275 */
10276
10277static VALUE
10278rb_str_crypt(VALUE str, VALUE salt)
10279{
10280#ifdef HAVE_CRYPT_R
10281 VALUE databuf;
10282 struct crypt_data *data;
10283# define CRYPT_END() ALLOCV_END(databuf)
10284#else
10285 extern char *crypt(const char *, const char *);
10286# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10287#endif
10288 VALUE result;
10289 const char *s, *saltp;
10290 char *res;
10291#ifdef BROKEN_CRYPT
10292 char salt_8bit_clean[3];
10293#endif
10294
10295 StringValue(salt);
10296 mustnot_wchar(str);
10297 mustnot_wchar(salt);
10298 s = StringValueCStr(str);
10299 saltp = RSTRING_PTR(salt);
10300 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10301 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10302 }
10303
10304#ifdef BROKEN_CRYPT
10305 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10306 salt_8bit_clean[0] = saltp[0] & 0x7f;
10307 salt_8bit_clean[1] = saltp[1] & 0x7f;
10308 salt_8bit_clean[2] = '\0';
10309 saltp = salt_8bit_clean;
10310 }
10311#endif
10312#ifdef HAVE_CRYPT_R
10313 data = ALLOCV(databuf, sizeof(struct crypt_data));
10314# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10315 data->initialized = 0;
10316# endif
10317 res = crypt_r(s, saltp, data);
10318#else
10319 crypt_mutex_initialize();
10320 rb_nativethread_lock_lock(&crypt_mutex.lock);
10321 res = crypt(s, saltp);
10322#endif
10323 if (!res) {
10324 int err = errno;
10325 CRYPT_END();
10326 rb_syserr_fail(err, "crypt");
10327 }
10328 result = rb_str_new_cstr(res);
10329 CRYPT_END();
10330 return result;
10331}
10332
10333
10334/*
10335 * call-seq:
10336 * ord -> integer
10337 *
10338 * :include: doc/string/ord.rdoc
10339 *
10340 */
10341
10342static VALUE
10343rb_str_ord(VALUE s)
10344{
10345 unsigned int c;
10346
10347 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10348 return UINT2NUM(c);
10349}
10350/*
10351 * call-seq:
10352 * sum(n = 16) -> integer
10353 *
10354 * :include: doc/string/sum.rdoc
10355 *
10356 */
10357
10358static VALUE
10359rb_str_sum(int argc, VALUE *argv, VALUE str)
10360{
10361 int bits = 16;
10362 char *ptr, *p, *pend;
10363 long len;
10364 VALUE sum = INT2FIX(0);
10365 unsigned long sum0 = 0;
10366
10367 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10368 bits = 0;
10369 }
10370 ptr = p = RSTRING_PTR(str);
10371 len = RSTRING_LEN(str);
10372 pend = p + len;
10373
10374 while (p < pend) {
10375 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10376 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10377 str_mod_check(str, ptr, len);
10378 sum0 = 0;
10379 }
10380 sum0 += (unsigned char)*p;
10381 p++;
10382 }
10383
10384 if (bits == 0) {
10385 if (sum0) {
10386 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10387 }
10388 }
10389 else {
10390 if (sum == INT2FIX(0)) {
10391 if (bits < (int)sizeof(long)*CHAR_BIT) {
10392 sum0 &= (((unsigned long)1)<<bits)-1;
10393 }
10394 sum = LONG2FIX(sum0);
10395 }
10396 else {
10397 VALUE mod;
10398
10399 if (sum0) {
10400 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10401 }
10402
10403 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10404 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10405 sum = rb_funcall(sum, '&', 1, mod);
10406 }
10407 }
10408 return sum;
10409}
10410
10411static VALUE
10412rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10413{
10414 rb_encoding *enc;
10415 VALUE w;
10416 long width, len, flen = 1, fclen = 1;
10417 VALUE res;
10418 char *p;
10419 const char *f = " ";
10420 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10421 VALUE pad;
10422 int singlebyte = 1, cr;
10423 int termlen;
10424
10425 rb_scan_args(argc, argv, "11", &w, &pad);
10426 enc = STR_ENC_GET(str);
10427 termlen = rb_enc_mbminlen(enc);
10428 width = NUM2LONG(w);
10429 if (argc == 2) {
10430 StringValue(pad);
10431 enc = rb_enc_check(str, pad);
10432 f = RSTRING_PTR(pad);
10433 flen = RSTRING_LEN(pad);
10434 fclen = str_strlen(pad, enc); /* rb_enc_check */
10435 singlebyte = single_byte_optimizable(pad);
10436 if (flen == 0 || fclen == 0) {
10437 rb_raise(rb_eArgError, "zero width padding");
10438 }
10439 }
10440 len = str_strlen(str, enc); /* rb_enc_check */
10441 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10442 n = width - len;
10443 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10444 rlen = n - llen;
10445 cr = ENC_CODERANGE(str);
10446 if (flen > 1) {
10447 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10448 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10449 }
10450 size = RSTRING_LEN(str);
10451 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10452 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10453 (len += llen2 + rlen2) >= LONG_MAX - size) {
10454 rb_raise(rb_eArgError, "argument too big");
10455 }
10456 len += size;
10457 res = str_new0(rb_cString, 0, len, termlen);
10458 p = RSTRING_PTR(res);
10459 if (flen <= 1) {
10460 memset(p, *f, llen);
10461 p += llen;
10462 }
10463 else {
10464 while (llen >= fclen) {
10465 memcpy(p,f,flen);
10466 p += flen;
10467 llen -= fclen;
10468 }
10469 if (llen > 0) {
10470 memcpy(p, f, llen2);
10471 p += llen2;
10472 }
10473 }
10474 memcpy(p, RSTRING_PTR(str), size);
10475 p += size;
10476 if (flen <= 1) {
10477 memset(p, *f, rlen);
10478 p += rlen;
10479 }
10480 else {
10481 while (rlen >= fclen) {
10482 memcpy(p,f,flen);
10483 p += flen;
10484 rlen -= fclen;
10485 }
10486 if (rlen > 0) {
10487 memcpy(p, f, rlen2);
10488 p += rlen2;
10489 }
10490 }
10491 TERM_FILL(p, termlen);
10492 STR_SET_LEN(res, p-RSTRING_PTR(res));
10493 rb_enc_associate(res, enc);
10494 if (argc == 2)
10495 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10496 if (cr != ENC_CODERANGE_BROKEN)
10497 ENC_CODERANGE_SET(res, cr);
10498
10499 RB_GC_GUARD(pad);
10500 return res;
10501}
10502
10503
10504/*
10505 * call-seq:
10506 * ljust(size, pad_string = ' ') -> new_string
10507 *
10508 * :include: doc/string/ljust.rdoc
10509 *
10510 * Related: String#rjust, String#center.
10511 *
10512 */
10513
10514static VALUE
10515rb_str_ljust(int argc, VALUE *argv, VALUE str)
10516{
10517 return rb_str_justify(argc, argv, str, 'l');
10518}
10519
10520/*
10521 * call-seq:
10522 * rjust(size, pad_string = ' ') -> new_string
10523 *
10524 * :include: doc/string/rjust.rdoc
10525 *
10526 * Related: String#ljust, String#center.
10527 *
10528 */
10529
10530static VALUE
10531rb_str_rjust(int argc, VALUE *argv, VALUE str)
10532{
10533 return rb_str_justify(argc, argv, str, 'r');
10534}
10535
10536
10537/*
10538 * call-seq:
10539 * center(size, pad_string = ' ') -> new_string
10540 *
10541 * :include: doc/string/center.rdoc
10542 *
10543 * Related: String#ljust, String#rjust.
10544 *
10545 */
10546
10547static VALUE
10548rb_str_center(int argc, VALUE *argv, VALUE str)
10549{
10550 return rb_str_justify(argc, argv, str, 'c');
10551}
10552
10553/*
10554 * call-seq:
10555 * partition(string_or_regexp) -> [head, match, tail]
10556 *
10557 * :include: doc/string/partition.rdoc
10558 *
10559 */
10560
10561static VALUE
10562rb_str_partition(VALUE str, VALUE sep)
10563{
10564 long pos;
10565
10566 sep = get_pat_quoted(sep, 0);
10567 if (RB_TYPE_P(sep, T_REGEXP)) {
10568 if (rb_reg_search(sep, str, 0, 0) < 0) {
10569 goto failed;
10570 }
10571 VALUE match = rb_backref_get();
10572 struct re_registers *regs = RMATCH_REGS(match);
10573
10574 pos = BEG(0);
10575 sep = rb_str_subseq(str, pos, END(0) - pos);
10576 }
10577 else {
10578 pos = rb_str_index(str, sep, 0);
10579 if (pos < 0) goto failed;
10580 }
10581 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10582 sep,
10583 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10584 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10585
10586 failed:
10587 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10588}
10589
10590/*
10591 * call-seq:
10592 * rpartition(sep) -> [head, match, tail]
10593 *
10594 * :include: doc/string/rpartition.rdoc
10595 *
10596 */
10597
10598static VALUE
10599rb_str_rpartition(VALUE str, VALUE sep)
10600{
10601 long pos = RSTRING_LEN(str);
10602
10603 sep = get_pat_quoted(sep, 0);
10604 if (RB_TYPE_P(sep, T_REGEXP)) {
10605 if (rb_reg_search(sep, str, pos, 1) < 0) {
10606 goto failed;
10607 }
10608 VALUE match = rb_backref_get();
10609 struct re_registers *regs = RMATCH_REGS(match);
10610
10611 pos = BEG(0);
10612 sep = rb_str_subseq(str, pos, END(0) - pos);
10613 }
10614 else {
10615 pos = rb_str_sublen(str, pos);
10616 pos = rb_str_rindex(str, sep, pos);
10617 if (pos < 0) {
10618 goto failed;
10619 }
10620 }
10621
10622 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10623 sep,
10624 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10625 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10626 failed:
10627 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10628}
10629
10630/*
10631 * call-seq:
10632 * start_with?(*string_or_regexp) -> true or false
10633 *
10634 * :include: doc/string/start_with_p.rdoc
10635 *
10636 */
10637
10638static VALUE
10639rb_str_start_with(int argc, VALUE *argv, VALUE str)
10640{
10641 int i;
10642
10643 for (i=0; i<argc; i++) {
10644 VALUE tmp = argv[i];
10645 if (RB_TYPE_P(tmp, T_REGEXP)) {
10646 if (rb_reg_start_with_p(tmp, str))
10647 return Qtrue;
10648 }
10649 else {
10650 const char *p, *s, *e;
10651 long slen, tlen;
10652 rb_encoding *enc;
10653
10654 StringValue(tmp);
10655 enc = rb_enc_check(str, tmp);
10656 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10657 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10658 p = RSTRING_PTR(str);
10659 e = p + slen;
10660 s = p + tlen;
10661 if (!at_char_right_boundary(p, s, e, enc))
10662 continue;
10663 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
10664 return Qtrue;
10665 }
10666 }
10667 return Qfalse;
10668}
10669
10670/*
10671 * call-seq:
10672 * end_with?(*strings) -> true or false
10673 *
10674 * :include: doc/string/end_with_p.rdoc
10675 *
10676 */
10677
10678static VALUE
10679rb_str_end_with(int argc, VALUE *argv, VALUE str)
10680{
10681 int i;
10682
10683 for (i=0; i<argc; i++) {
10684 VALUE tmp = argv[i];
10685 const char *p, *s, *e;
10686 long slen, tlen;
10687 rb_encoding *enc;
10688
10689 StringValue(tmp);
10690 enc = rb_enc_check(str, tmp);
10691 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10692 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10693 p = RSTRING_PTR(str);
10694 e = p + slen;
10695 s = e - tlen;
10696 if (!at_char_boundary(p, s, e, enc))
10697 continue;
10698 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
10699 return Qtrue;
10700 }
10701 return Qfalse;
10702}
10703
10713static long
10714deleted_prefix_length(VALUE str, VALUE prefix)
10715{
10716 const char *strptr, *prefixptr;
10717 long olen, prefixlen;
10718 rb_encoding *enc = rb_enc_get(str);
10719
10720 StringValue(prefix);
10721
10722 if (!is_broken_string(prefix) ||
10723 !rb_enc_asciicompat(enc) ||
10724 !rb_enc_asciicompat(rb_enc_get(prefix))) {
10725 enc = rb_enc_check(str, prefix);
10726 }
10727
10728 /* return 0 if not start with prefix */
10729 prefixlen = RSTRING_LEN(prefix);
10730 if (prefixlen <= 0) return 0;
10731 olen = RSTRING_LEN(str);
10732 if (olen < prefixlen) return 0;
10733 strptr = RSTRING_PTR(str);
10734 prefixptr = RSTRING_PTR(prefix);
10735 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10736 if (is_broken_string(prefix)) {
10737 if (!is_broken_string(str)) {
10738 /* prefix in a valid string cannot be broken */
10739 return 0;
10740 }
10741 const char *strend = strptr + olen;
10742 const char *after_prefix = strptr + prefixlen;
10743 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
10744 /* prefix does not end at char-boundary */
10745 return 0;
10746 }
10747 }
10748 /* prefix part in `str` also should be valid. */
10749
10750 return prefixlen;
10751}
10752
10753/*
10754 * call-seq:
10755 * delete_prefix!(prefix) -> self or nil
10756 *
10757 * Like String#delete_prefix, except that +self+ is modified in place.
10758 * Returns +self+ if the prefix is removed, +nil+ otherwise.
10759 *
10760 */
10761
10762static VALUE
10763rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10764{
10765 long prefixlen;
10766 str_modify_keep_cr(str);
10767
10768 prefixlen = deleted_prefix_length(str, prefix);
10769 if (prefixlen <= 0) return Qnil;
10770
10771 return rb_str_drop_bytes(str, prefixlen);
10772}
10773
10774/*
10775 * call-seq:
10776 * delete_prefix(prefix) -> new_string
10777 *
10778 * :include: doc/string/delete_prefix.rdoc
10779 *
10780 */
10781
10782static VALUE
10783rb_str_delete_prefix(VALUE str, VALUE prefix)
10784{
10785 long prefixlen;
10786
10787 prefixlen = deleted_prefix_length(str, prefix);
10788 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10789
10790 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10791}
10792
10802static long
10803deleted_suffix_length(VALUE str, VALUE suffix)
10804{
10805 const char *strptr, *suffixptr;
10806 long olen, suffixlen;
10807 rb_encoding *enc;
10808
10809 StringValue(suffix);
10810 if (is_broken_string(suffix)) return 0;
10811 enc = rb_enc_check(str, suffix);
10812
10813 /* return 0 if not start with suffix */
10814 suffixlen = RSTRING_LEN(suffix);
10815 if (suffixlen <= 0) return 0;
10816 olen = RSTRING_LEN(str);
10817 if (olen < suffixlen) return 0;
10818 strptr = RSTRING_PTR(str);
10819 suffixptr = RSTRING_PTR(suffix);
10820 const char *strend = strptr + olen;
10821 const char *before_suffix = strend - suffixlen;
10822 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
10823 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
10824
10825 return suffixlen;
10826}
10827
10828/*
10829 * call-seq:
10830 * delete_suffix!(suffix) -> self or nil
10831 *
10832 * Like String#delete_suffix, except that +self+ is modified in place.
10833 * Returns +self+ if the suffix is removed, +nil+ otherwise.
10834 *
10835 */
10836
10837static VALUE
10838rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10839{
10840 long olen, suffixlen, len;
10841 str_modifiable(str);
10842
10843 suffixlen = deleted_suffix_length(str, suffix);
10844 if (suffixlen <= 0) return Qnil;
10845
10846 olen = RSTRING_LEN(str);
10847 str_modify_keep_cr(str);
10848 len = olen - suffixlen;
10849 STR_SET_LEN(str, len);
10850 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10851 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10853 }
10854 return str;
10855}
10856
10857/*
10858 * call-seq:
10859 * delete_suffix(suffix) -> new_string
10860 *
10861 * :include: doc/string/delete_suffix.rdoc
10862 *
10863 */
10864
10865static VALUE
10866rb_str_delete_suffix(VALUE str, VALUE suffix)
10867{
10868 long suffixlen;
10869
10870 suffixlen = deleted_suffix_length(str, suffix);
10871 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10872
10873 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10874}
10875
10876void
10877rb_str_setter(VALUE val, ID id, VALUE *var)
10878{
10879 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10880 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10881 }
10882 *var = val;
10883}
10884
10885static void
10886rb_fs_setter(VALUE val, ID id, VALUE *var)
10887{
10888 val = rb_fs_check(val);
10889 if (!val) {
10890 rb_raise(rb_eTypeError,
10891 "value of %"PRIsVALUE" must be String or Regexp",
10892 rb_id2str(id));
10893 }
10894 if (!NIL_P(val)) {
10895 rb_warn_deprecated("`$;'", NULL);
10896 }
10897 *var = val;
10898}
10899
10900
10901/*
10902 * call-seq:
10903 * force_encoding(encoding) -> self
10904 *
10905 * :include: doc/string/force_encoding.rdoc
10906 *
10907 */
10908
10909static VALUE
10910rb_str_force_encoding(VALUE str, VALUE enc)
10911{
10912 str_modifiable(str);
10913
10914 rb_encoding *encoding = rb_to_encoding(enc);
10915 int idx = rb_enc_to_index(encoding);
10916
10917 // If the encoding is unchanged, we do nothing.
10918 if (ENCODING_GET(str) == idx) {
10919 return str;
10920 }
10921
10922 rb_enc_associate_index(str, idx);
10923
10924 // If the coderange was 7bit and the new encoding is ASCII-compatible
10925 // we can keep the coderange.
10926 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
10927 return str;
10928 }
10929
10931 return str;
10932}
10933
10934/*
10935 * call-seq:
10936 * b -> string
10937 *
10938 * :include: doc/string/b.rdoc
10939 *
10940 */
10941
10942static VALUE
10943rb_str_b(VALUE str)
10944{
10945 VALUE str2;
10946 if (STR_EMBED_P(str)) {
10947 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
10948 }
10949 else {
10950 str2 = str_alloc_heap(rb_cString);
10951 }
10952 str_replace_shared_without_enc(str2, str);
10953
10954 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
10955 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
10956 // If we know the receiver's code range then we know the result's code range.
10957 int cr = ENC_CODERANGE(str);
10958 switch (cr) {
10959 case ENC_CODERANGE_7BIT:
10961 break;
10965 break;
10966 default:
10967 ENC_CODERANGE_CLEAR(str2);
10968 break;
10969 }
10970 }
10971
10972 return str2;
10973}
10974
10975/*
10976 * call-seq:
10977 * valid_encoding? -> true or false
10978 *
10979 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
10980 *
10981 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
10982 * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
10983 * "\x80".force_encoding("UTF-8").valid_encoding? # => false
10984 */
10985
10986static VALUE
10987rb_str_valid_encoding_p(VALUE str)
10988{
10989 int cr = rb_enc_str_coderange(str);
10990
10991 return RBOOL(cr != ENC_CODERANGE_BROKEN);
10992}
10993
10994/*
10995 * call-seq:
10996 * ascii_only? -> true or false
10997 *
10998 * Returns +true+ if +self+ contains only ASCII characters,
10999 * +false+ otherwise:
11000 *
11001 * 'abc'.ascii_only? # => true
11002 * "abc\u{6666}".ascii_only? # => false
11003 *
11004 */
11005
11006static VALUE
11007rb_str_is_ascii_only_p(VALUE str)
11008{
11009 int cr = rb_enc_str_coderange(str);
11010
11011 return RBOOL(cr == ENC_CODERANGE_7BIT);
11012}
11013
11014VALUE
11016{
11017 static const char ellipsis[] = "...";
11018 const long ellipsislen = sizeof(ellipsis) - 1;
11019 rb_encoding *const enc = rb_enc_get(str);
11020 const long blen = RSTRING_LEN(str);
11021 const char *const p = RSTRING_PTR(str), *e = p + blen;
11022 VALUE estr, ret = 0;
11023
11024 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11025 if (len * rb_enc_mbminlen(enc) >= blen ||
11026 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11027 ret = str;
11028 }
11029 else if (len <= ellipsislen ||
11030 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11031 if (rb_enc_asciicompat(enc)) {
11032 ret = rb_str_new(ellipsis, len);
11033 rb_enc_associate(ret, enc);
11034 }
11035 else {
11036 estr = rb_usascii_str_new(ellipsis, len);
11037 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11038 }
11039 }
11040 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11041 rb_str_cat(ret, ellipsis, ellipsislen);
11042 }
11043 else {
11044 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11045 rb_enc_from_encoding(enc), 0, Qnil);
11046 rb_str_append(ret, estr);
11047 }
11048 return ret;
11049}
11050
11051static VALUE
11052str_compat_and_valid(VALUE str, rb_encoding *enc)
11053{
11054 int cr;
11055 str = StringValue(str);
11056 cr = rb_enc_str_coderange(str);
11057 if (cr == ENC_CODERANGE_BROKEN) {
11058 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11059 }
11060 else {
11061 rb_encoding *e = STR_ENC_GET(str);
11062 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11063 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11064 rb_enc_name(enc), rb_enc_name(e));
11065 }
11066 }
11067 return str;
11068}
11069
11070static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11071
11072VALUE
11074{
11075 rb_encoding *enc = STR_ENC_GET(str);
11076 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11077}
11078
11079VALUE
11080rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11081{
11082 int cr = ENC_CODERANGE_UNKNOWN;
11083 if (enc == STR_ENC_GET(str)) {
11084 /* cached coderange makes sense only when enc equals the
11085 * actual encoding of str */
11086 cr = ENC_CODERANGE(str);
11087 }
11088 return enc_str_scrub(enc, str, repl, cr);
11089}
11090
11091static VALUE
11092enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11093{
11094 int encidx;
11095 VALUE buf = Qnil;
11096 const char *rep, *p, *e, *p1, *sp;
11097 long replen = -1;
11098 long slen;
11099
11100 if (rb_block_given_p()) {
11101 if (!NIL_P(repl))
11102 rb_raise(rb_eArgError, "both of block and replacement given");
11103 replen = 0;
11104 }
11105
11106 if (ENC_CODERANGE_CLEAN_P(cr))
11107 return Qnil;
11108
11109 if (!NIL_P(repl)) {
11110 repl = str_compat_and_valid(repl, enc);
11111 }
11112
11113 if (rb_enc_dummy_p(enc)) {
11114 return Qnil;
11115 }
11116 encidx = rb_enc_to_index(enc);
11117
11118#define DEFAULT_REPLACE_CHAR(str) do { \
11119 static const char replace[sizeof(str)-1] = str; \
11120 rep = replace; replen = (int)sizeof(replace); \
11121 } while (0)
11122
11123 slen = RSTRING_LEN(str);
11124 p = RSTRING_PTR(str);
11125 e = RSTRING_END(str);
11126 p1 = p;
11127 sp = p;
11128
11129 if (rb_enc_asciicompat(enc)) {
11130 int rep7bit_p;
11131 if (!replen) {
11132 rep = NULL;
11133 rep7bit_p = FALSE;
11134 }
11135 else if (!NIL_P(repl)) {
11136 rep = RSTRING_PTR(repl);
11137 replen = RSTRING_LEN(repl);
11138 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11139 }
11140 else if (encidx == rb_utf8_encindex()) {
11141 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11142 rep7bit_p = FALSE;
11143 }
11144 else {
11145 DEFAULT_REPLACE_CHAR("?");
11146 rep7bit_p = TRUE;
11147 }
11148 cr = ENC_CODERANGE_7BIT;
11149
11150 p = search_nonascii(p, e);
11151 if (!p) {
11152 p = e;
11153 }
11154 while (p < e) {
11155 int ret = rb_enc_precise_mbclen(p, e, enc);
11156 if (MBCLEN_NEEDMORE_P(ret)) {
11157 break;
11158 }
11159 else if (MBCLEN_CHARFOUND_P(ret)) {
11161 p += MBCLEN_CHARFOUND_LEN(ret);
11162 }
11163 else if (MBCLEN_INVALID_P(ret)) {
11164 /*
11165 * p1~p: valid ascii/multibyte chars
11166 * p ~e: invalid bytes + unknown bytes
11167 */
11168 long clen = rb_enc_mbmaxlen(enc);
11169 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11170 if (p > p1) {
11171 rb_str_buf_cat(buf, p1, p - p1);
11172 }
11173
11174 if (e - p < clen) clen = e - p;
11175 if (clen <= 2) {
11176 clen = 1;
11177 }
11178 else {
11179 const char *q = p;
11180 clen--;
11181 for (; clen > 1; clen--) {
11182 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11183 if (MBCLEN_NEEDMORE_P(ret)) break;
11184 if (MBCLEN_INVALID_P(ret)) continue;
11186 }
11187 }
11188 if (rep) {
11189 rb_str_buf_cat(buf, rep, replen);
11190 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11191 }
11192 else {
11193 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11194 str_mod_check(str, sp, slen);
11195 repl = str_compat_and_valid(repl, enc);
11196 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11199 }
11200 p += clen;
11201 p1 = p;
11202 p = search_nonascii(p, e);
11203 if (!p) {
11204 p = e;
11205 break;
11206 }
11207 }
11208 else {
11210 }
11211 }
11212 if (NIL_P(buf)) {
11213 if (p == e) {
11214 ENC_CODERANGE_SET(str, cr);
11215 return Qnil;
11216 }
11217 buf = rb_str_buf_new(RSTRING_LEN(str));
11218 }
11219 if (p1 < p) {
11220 rb_str_buf_cat(buf, p1, p - p1);
11221 }
11222 if (p < e) {
11223 if (rep) {
11224 rb_str_buf_cat(buf, rep, replen);
11225 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11226 }
11227 else {
11228 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11229 str_mod_check(str, sp, slen);
11230 repl = str_compat_and_valid(repl, enc);
11231 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11234 }
11235 }
11236 }
11237 else {
11238 /* ASCII incompatible */
11239 long mbminlen = rb_enc_mbminlen(enc);
11240 if (!replen) {
11241 rep = NULL;
11242 }
11243 else if (!NIL_P(repl)) {
11244 rep = RSTRING_PTR(repl);
11245 replen = RSTRING_LEN(repl);
11246 }
11247 else if (encidx == ENCINDEX_UTF_16BE) {
11248 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11249 }
11250 else if (encidx == ENCINDEX_UTF_16LE) {
11251 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11252 }
11253 else if (encidx == ENCINDEX_UTF_32BE) {
11254 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11255 }
11256 else if (encidx == ENCINDEX_UTF_32LE) {
11257 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11258 }
11259 else {
11260 DEFAULT_REPLACE_CHAR("?");
11261 }
11262
11263 while (p < e) {
11264 int ret = rb_enc_precise_mbclen(p, e, enc);
11265 if (MBCLEN_NEEDMORE_P(ret)) {
11266 break;
11267 }
11268 else if (MBCLEN_CHARFOUND_P(ret)) {
11269 p += MBCLEN_CHARFOUND_LEN(ret);
11270 }
11271 else if (MBCLEN_INVALID_P(ret)) {
11272 const char *q = p;
11273 long clen = rb_enc_mbmaxlen(enc);
11274 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11275 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11276
11277 if (e - p < clen) clen = e - p;
11278 if (clen <= mbminlen * 2) {
11279 clen = mbminlen;
11280 }
11281 else {
11282 clen -= mbminlen;
11283 for (; clen > mbminlen; clen-=mbminlen) {
11284 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11285 if (MBCLEN_NEEDMORE_P(ret)) break;
11286 if (MBCLEN_INVALID_P(ret)) continue;
11288 }
11289 }
11290 if (rep) {
11291 rb_str_buf_cat(buf, rep, replen);
11292 }
11293 else {
11294 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11295 str_mod_check(str, sp, slen);
11296 repl = str_compat_and_valid(repl, enc);
11297 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11298 }
11299 p += clen;
11300 p1 = p;
11301 }
11302 else {
11304 }
11305 }
11306 if (NIL_P(buf)) {
11307 if (p == e) {
11309 return Qnil;
11310 }
11311 buf = rb_str_buf_new(RSTRING_LEN(str));
11312 }
11313 if (p1 < p) {
11314 rb_str_buf_cat(buf, p1, p - p1);
11315 }
11316 if (p < e) {
11317 if (rep) {
11318 rb_str_buf_cat(buf, rep, replen);
11319 }
11320 else {
11321 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11322 str_mod_check(str, sp, slen);
11323 repl = str_compat_and_valid(repl, enc);
11324 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11325 }
11326 }
11328 }
11329 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11330 return buf;
11331}
11332
11333/*
11334 * call-seq:
11335 * scrub(replacement_string = default_replacement) -> new_string
11336 * scrub{|bytes| ... } -> new_string
11337 *
11338 * :include: doc/string/scrub.rdoc
11339 *
11340 */
11341static VALUE
11342str_scrub(int argc, VALUE *argv, VALUE str)
11343{
11344 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11345 VALUE new = rb_str_scrub(str, repl);
11346 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11347}
11348
11349/*
11350 * call-seq:
11351 * scrub! -> self
11352 * scrub!(replacement_string = default_replacement) -> self
11353 * scrub!{|bytes| ... } -> self
11354 *
11355 * Like String#scrub, except that any replacements are made in +self+.
11356 *
11357 */
11358static VALUE
11359str_scrub_bang(int argc, VALUE *argv, VALUE str)
11360{
11361 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11362 VALUE new = rb_str_scrub(str, repl);
11363 if (!NIL_P(new)) rb_str_replace(str, new);
11364 return str;
11365}
11366
11367static ID id_normalize;
11368static ID id_normalized_p;
11369static VALUE mUnicodeNormalize;
11370
11371static VALUE
11372unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11373{
11374 static int UnicodeNormalizeRequired = 0;
11375 VALUE argv2[2];
11376
11377 if (!UnicodeNormalizeRequired) {
11378 rb_require("unicode_normalize/normalize.rb");
11379 UnicodeNormalizeRequired = 1;
11380 }
11381 argv2[0] = str;
11382 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11383 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11384}
11385
11386/*
11387 * call-seq:
11388 * unicode_normalize(form = :nfc) -> string
11389 *
11390 * Returns a copy of +self+ with
11391 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11392 *
11393 * Argument +form+ must be one of the following symbols
11394 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11395 *
11396 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11397 * - +:nfd+: Canonical decomposition.
11398 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11399 * - +:nfkd+: Compatibility decomposition.
11400 *
11401 * The encoding of +self+ must be one of:
11402 *
11403 * - Encoding::UTF_8
11404 * - Encoding::UTF_16BE
11405 * - Encoding::UTF_16LE
11406 * - Encoding::UTF_32BE
11407 * - Encoding::UTF_32LE
11408 * - Encoding::GB18030
11409 * - Encoding::UCS_2BE
11410 * - Encoding::UCS_4BE
11411 *
11412 * Examples:
11413 *
11414 * "a\u0300".unicode_normalize # => "a"
11415 * "\u00E0".unicode_normalize(:nfd) # => "a "
11416 *
11417 * Related: String#unicode_normalize!, String#unicode_normalized?.
11418 */
11419static VALUE
11420rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11421{
11422 return unicode_normalize_common(argc, argv, str, id_normalize);
11423}
11424
11425/*
11426 * call-seq:
11427 * unicode_normalize!(form = :nfc) -> self
11428 *
11429 * Like String#unicode_normalize, except that the normalization
11430 * is performed on +self+.
11431 *
11432 * Related String#unicode_normalized?.
11433 *
11434 */
11435static VALUE
11436rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11437{
11438 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11439}
11440
11441/* call-seq:
11442 * unicode_normalized?(form = :nfc) -> true or false
11443 *
11444 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11445 * +false+ otherwise.
11446 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11447 *
11448 * Examples:
11449 *
11450 * "a\u0300".unicode_normalized? # => false
11451 * "a\u0300".unicode_normalized?(:nfd) # => true
11452 * "\u00E0".unicode_normalized? # => true
11453 * "\u00E0".unicode_normalized?(:nfd) # => false
11454 *
11455 *
11456 * Raises an exception if +self+ is not in a Unicode encoding:
11457 *
11458 * s = "\xE0".force_encoding('ISO-8859-1')
11459 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11460 *
11461 * Related: String#unicode_normalize, String#unicode_normalize!.
11462 *
11463 */
11464static VALUE
11465rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11466{
11467 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11468}
11469
11470/**********************************************************************
11471 * Document-class: Symbol
11472 *
11473 * \Symbol objects represent named identifiers inside the Ruby interpreter.
11474 *
11475 * You can create a \Symbol object explicitly with:
11476 *
11477 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11478 *
11479 * The same \Symbol object will be
11480 * created for a given name or string for the duration of a program's
11481 * execution, regardless of the context or meaning of that name. Thus
11482 * if <code>Fred</code> is a constant in one context, a method in
11483 * another, and a class in a third, the \Symbol <code>:Fred</code>
11484 * will be the same object in all three contexts.
11485 *
11486 * module One
11487 * class Fred
11488 * end
11489 * $f1 = :Fred
11490 * end
11491 * module Two
11492 * Fred = 1
11493 * $f2 = :Fred
11494 * end
11495 * def Fred()
11496 * end
11497 * $f3 = :Fred
11498 * $f1.object_id #=> 2514190
11499 * $f2.object_id #=> 2514190
11500 * $f3.object_id #=> 2514190
11501 *
11502 * Constant, method, and variable names are returned as symbols:
11503 *
11504 * module One
11505 * Two = 2
11506 * def three; 3 end
11507 * @four = 4
11508 * @@five = 5
11509 * $six = 6
11510 * end
11511 * seven = 7
11512 *
11513 * One.constants
11514 * # => [:Two]
11515 * One.instance_methods(true)
11516 * # => [:three]
11517 * One.instance_variables
11518 * # => [:@four]
11519 * One.class_variables
11520 * # => [:@@five]
11521 * global_variables.grep(/six/)
11522 * # => [:$six]
11523 * local_variables
11524 * # => [:seven]
11525 *
11526 * \Symbol objects are different from String objects in that
11527 * \Symbol objects represent identifiers, while String objects
11528 * represent text or data.
11529 *
11530 * == What's Here
11531 *
11532 * First, what's elsewhere. \Class \Symbol:
11533 *
11534 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11535 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11536 *
11537 * Here, class \Symbol provides methods that are useful for:
11538 *
11539 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11540 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11541 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11542 *
11543 * === Methods for Querying
11544 *
11545 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11546 * - #=~: Returns the index of the first substring in symbol that matches a
11547 * given Regexp or other object; returns +nil+ if no match is found.
11548 * - #[], #slice : Returns a substring of symbol
11549 * determined by a given index, start/length, or range, or string.
11550 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11551 * - #encoding: Returns the Encoding object that represents the encoding
11552 * of symbol.
11553 * - #end_with?: Returns +true+ if symbol ends with
11554 * any of the given strings.
11555 * - #match: Returns a MatchData object if symbol
11556 * matches a given Regexp; +nil+ otherwise.
11557 * - #match?: Returns +true+ if symbol
11558 * matches a given Regexp; +false+ otherwise.
11559 * - #length, #size: Returns the number of characters in symbol.
11560 * - #start_with?: Returns +true+ if symbol starts with
11561 * any of the given strings.
11562 *
11563 * === Methods for Comparing
11564 *
11565 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
11566 * or larger than symbol.
11567 * - #==, #===: Returns +true+ if a given symbol has the same content and
11568 * encoding.
11569 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
11570 * symbol is smaller than, equal to, or larger than symbol.
11571 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
11572 * after Unicode case folding; +false+ otherwise.
11573 *
11574 * === Methods for Converting
11575 *
11576 * - #capitalize: Returns symbol with the first character upcased
11577 * and all other characters downcased.
11578 * - #downcase: Returns symbol with all characters downcased.
11579 * - #inspect: Returns the string representation of +self+ as a symbol literal.
11580 * - #name: Returns the frozen string corresponding to symbol.
11581 * - #succ, #next: Returns the symbol that is the successor to symbol.
11582 * - #swapcase: Returns symbol with all upcase characters downcased
11583 * and all downcase characters upcased.
11584 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
11585 * - #to_s, #id2name: Returns the string corresponding to +self+.
11586 * - #to_sym, #intern: Returns +self+.
11587 * - #upcase: Returns symbol with all characters upcased.
11588 *
11589 */
11590
11591
11592/*
11593 * call-seq:
11594 * symbol == object -> true or false
11595 *
11596 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
11597 */
11598
/* Symbol#== has no function of its own: sym_equal is an alias for rb_obj_equal. */
11599#define sym_equal rb_obj_equal
11600
11601static int
11602sym_printable(const char *s, const char *send, rb_encoding *enc)
11603{
11604 while (s < send) {
11605 int n;
11606 int c = rb_enc_precise_mbclen(s, send, enc);
11607
11608 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11609 n = MBCLEN_CHARFOUND_LEN(c);
11610 c = rb_enc_mbc_to_codepoint(s, send, enc);
11611 if (!rb_enc_isprint(c, enc)) return FALSE;
11612 s += n;
11613 }
11614 return TRUE;
11615}
11616
11617int
11618rb_str_symname_p(VALUE sym)
11619{
11620 rb_encoding *enc;
11621 const char *ptr;
11622 long len;
11623 rb_encoding *resenc = rb_default_internal_encoding();
11624
11625 if (resenc == NULL) resenc = rb_default_external_encoding();
11626 enc = STR_ENC_GET(sym);
11627 ptr = RSTRING_PTR(sym);
11628 len = RSTRING_LEN(sym);
11629 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11630 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11631 return FALSE;
11632 }
11633 return TRUE;
11634}
11635
11636VALUE
11637rb_str_quote_unprintable(VALUE str)
11638{
11639 rb_encoding *enc;
11640 const char *ptr;
11641 long len;
11642 rb_encoding *resenc;
11643
11644 Check_Type(str, T_STRING);
11645 resenc = rb_default_internal_encoding();
11646 if (resenc == NULL) resenc = rb_default_external_encoding();
11647 enc = STR_ENC_GET(str);
11648 ptr = RSTRING_PTR(str);
11649 len = RSTRING_LEN(str);
11650 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11651 !sym_printable(ptr, ptr + len, enc)) {
11652 return rb_str_escape(str);
11653 }
11654 return str;
11655}
11656
11657VALUE
11658rb_id_quote_unprintable(ID id)
11659{
11660 VALUE str = rb_id2str(id);
11661 if (!rb_str_symname_p(str)) {
11662 return rb_str_escape(str);
11663 }
11664 return str;
11665}
11666
11667/*
11668 * call-seq:
11669 * inspect -> string
11670 *
11671 * Returns a string representation of +self+ (including the leading colon):
11672 *
11673 * :foo.inspect # => ":foo"
11674 *
11675 * Related: Symbol#to_s, Symbol#name.
11676 *
11677 */
11678
11679static VALUE
11680sym_inspect(VALUE sym)
11681{
11682 VALUE str = rb_sym2str(sym);
11683 const char *ptr;
11684 long len;
11685 char *dest;
11686
11687 if (!rb_str_symname_p(str)) {
11688 str = rb_str_inspect(str);
11689 len = RSTRING_LEN(str);
11690 rb_str_resize(str, len + 1);
11691 dest = RSTRING_PTR(str);
11692 memmove(dest + 1, dest, len);
11693 }
11694 else {
11695 rb_encoding *enc = STR_ENC_GET(str);
11696
11697 VALUE orig_str = str;
11698 RSTRING_GETMEM(orig_str, ptr, len);
11699
11700 str = rb_enc_str_new(0, len + 1, enc);
11701 dest = RSTRING_PTR(str);
11702 memcpy(dest + 1, ptr, len);
11703
11704 RB_GC_GUARD(orig_str);
11705 }
11706 dest[0] = ':';
11707 return str;
11708}
11709
11710/*
11711 * call-seq:
11712 * to_s -> string
11713 *
11714 * Returns a string representation of +self+ (not including the leading colon):
11715 *
11716 * :foo.to_s # => "foo"
11717 *
11718 * Related: Symbol#inspect, Symbol#name.
11719 */
11720
11721VALUE
11723{
11724 return str_new_shared(rb_cString, rb_sym2str(sym));
11725}
11726
11727VALUE
11728rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11729{
11730 VALUE obj;
11731
11732 if (argc < 1) {
11733 rb_raise(rb_eArgError, "no receiver given");
11734 }
11735 obj = argv[0];
11736 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11737}
11738
11739/*
11740 * call-seq:
11741 * succ
11742 *
11743 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
11744 *
11745 * :foo.succ # => :fop
11746 *
11747 * Related: String#succ.
11748 */
11749
11750static VALUE
11751sym_succ(VALUE sym)
11752{
11753 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11754}
11755
11756/*
11757 * call-seq:
11758 * symbol <=> object -> -1, 0, +1, or nil
11759 *
11760 * If +object+ is a symbol,
11761 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
11762 *
11763 * :bar <=> :foo # => -1
11764 * :foo <=> :foo # => 0
11765 * :foo <=> :bar # => 1
11766 *
11767 * Otherwise, returns +nil+:
11768 *
11769 * :foo <=> 'bar' # => nil
11770 *
11771 * Related: String#<=>.
11772 */
11773
11774static VALUE
11775sym_cmp(VALUE sym, VALUE other)
11776{
11777 if (!SYMBOL_P(other)) {
11778 return Qnil;
11779 }
11780 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11781}
11782
11783/*
11784 * call-seq:
11785 * casecmp(object) -> -1, 0, 1, or nil
11786 *
11787 * :include: doc/symbol/casecmp.rdoc
11788 *
11789 */
11790
11791static VALUE
11792sym_casecmp(VALUE sym, VALUE other)
11793{
11794 if (!SYMBOL_P(other)) {
11795 return Qnil;
11796 }
11797 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11798}
11799
11800/*
11801 * call-seq:
11802 * casecmp?(object) -> true, false, or nil
11803 *
11804 * :include: doc/symbol/casecmp_p.rdoc
11805 *
11806 */
11807
11808static VALUE
11809sym_casecmp_p(VALUE sym, VALUE other)
11810{
11811 if (!SYMBOL_P(other)) {
11812 return Qnil;
11813 }
11814 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11815}
11816
11817/*
11818 * call-seq:
11819 * symbol =~ object -> integer or nil
11820 *
11821 * Equivalent to <tt>symbol.to_s =~ object</tt>,
11822 * including possible updates to global variables;
11823 * see String#=~.
11824 *
11825 */
11826
11827static VALUE
11828sym_match(VALUE sym, VALUE other)
11829{
11830 return rb_str_match(rb_sym2str(sym), other);
11831}
11832
11833/*
11834 * call-seq:
11835 * match(pattern, offset = 0) -> matchdata or nil
11836 * match(pattern, offset = 0) {|matchdata| } -> object
11837 *
11838 * Equivalent to <tt>self.to_s.match</tt>,
11839 * including possible updates to global variables;
11840 * see String#match.
11841 *
11842 */
11843
11844static VALUE
11845sym_match_m(int argc, VALUE *argv, VALUE sym)
11846{
11847 return rb_str_match_m(argc, argv, rb_sym2str(sym));
11848}
11849
11850/*
11851 * call-seq:
11852 * match?(pattern, offset) -> true or false
11853 *
11854 * Equivalent to <tt>sym.to_s.match?</tt>;
11855 * see String#match.
11856 *
11857 */
11858
11859static VALUE
11860sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11861{
11862 return rb_str_match_m_p(argc, argv, sym);
11863}
11864
11865/*
11866 * call-seq:
11867 * symbol[index] -> string or nil
11868 * symbol[start, length] -> string or nil
11869 * symbol[range] -> string or nil
11870 * symbol[regexp, capture = 0] -> string or nil
11871 * symbol[substring] -> string or nil
11872 *
11873 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
11874 *
11875 */
11876
11877static VALUE
11878sym_aref(int argc, VALUE *argv, VALUE sym)
11879{
11880 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11881}
11882
11883/*
11884 * call-seq:
11885 * length -> integer
11886 *
11887 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
11888 */
11889
11890static VALUE
11891sym_length(VALUE sym)
11892{
11893 return rb_str_length(rb_sym2str(sym));
11894}
11895
11896/*
11897 * call-seq:
11898 * empty? -> true or false
11899 *
11900 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
11901 *
11902 */
11903
11904static VALUE
11905sym_empty(VALUE sym)
11906{
11907 return rb_str_empty(rb_sym2str(sym));
11908}
11909
11910/*
11911 * call-seq:
11912 * upcase(*options) -> symbol
11913 *
11914 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11915 *
11916 * See String#upcase.
11917 *
11918 */
11919
11920static VALUE
11921sym_upcase(int argc, VALUE *argv, VALUE sym)
11922{
11923 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11924}
11925
11926/*
11927 * call-seq:
11928 * downcase(*options) -> symbol
11929 *
11930 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11931 *
11932 * See String#downcase.
11933 *
11934 * Related: Symbol#upcase.
11935 *
11936 */
11937
11938static VALUE
11939sym_downcase(int argc, VALUE *argv, VALUE sym)
11940{
11941 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11942}
11943
11944/*
11945 * call-seq:
11946 * capitalize(*options) -> symbol
11947 *
11948 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11949 *
11950 * See String#capitalize.
11951 *
11952 */
11953
11954static VALUE
11955sym_capitalize(int argc, VALUE *argv, VALUE sym)
11956{
11957 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11958}
11959
11960/*
11961 * call-seq:
11962 * swapcase(*options) -> symbol
11963 *
11964 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11965 *
11966 * See String#swapcase.
11967 *
11968 */
11969
11970static VALUE
11971sym_swapcase(int argc, VALUE *argv, VALUE sym)
11972{
11973 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11974}
11975
11976/*
11977 * call-seq:
11978 * start_with?(*string_or_regexp) -> true or false
11979 *
11980 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
11981 *
11982 */
11983
11984static VALUE
11985sym_start_with(int argc, VALUE *argv, VALUE sym)
11986{
11987 return rb_str_start_with(argc, argv, rb_sym2str(sym));
11988}
11989
11990/*
11991 * call-seq:
11992 * end_with?(*strings) -> true or false
11993 *
11994 *
11995 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
11996 *
11997 */
11998
11999static VALUE
12000sym_end_with(int argc, VALUE *argv, VALUE sym)
12001{
12002 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12003}
12004
12005/*
12006 * call-seq:
12007 * encoding -> encoding
12008 *
12009 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12010 *
12011 */
12012
12013static VALUE
12014sym_encoding(VALUE sym)
12015{
12016 return rb_obj_encoding(rb_sym2str(sym));
12017}
12018
12019static VALUE
12020string_for_symbol(VALUE name)
12021{
12022 if (!RB_TYPE_P(name, T_STRING)) {
12023 VALUE tmp = rb_check_string_type(name);
12024 if (NIL_P(tmp)) {
12025 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
12026 name);
12027 }
12028 name = tmp;
12029 }
12030 return name;
12031}
12032
12033ID
12035{
12036 if (SYMBOL_P(name)) {
12037 return SYM2ID(name);
12038 }
12039 name = string_for_symbol(name);
12040 return rb_intern_str(name);
12041}
12042
12043VALUE
12045{
12046 if (SYMBOL_P(name)) {
12047 return name;
12048 }
12049 name = string_for_symbol(name);
12050 return rb_str_intern(name);
12051}
12052
12053/*
12054 * call-seq:
12055 * Symbol.all_symbols -> array_of_symbols
12056 *
12057 * Returns an array of all symbols currently in Ruby's symbol table:
12058 *
12059 * Symbol.all_symbols.size # => 9334
12060 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12061 *
12062 */
12063
12064static VALUE
12065sym_all_symbols(VALUE _)
12066{
12067 return rb_sym_all_symbols();
12068}
12069
12070VALUE
12072{
12073 return rb_fstring(str);
12074}
12075
12076VALUE
12077rb_interned_str(const char *ptr, long len)
12078{
12079 struct RString fake_str;
12080 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
12081}
12082
12083VALUE
12084rb_interned_str_cstr(const char *ptr)
12085{
12086 return rb_interned_str(ptr, strlen(ptr));
12087}
12088
12089VALUE
12090rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12091{
12092 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12093 rb_enc_autoload(enc);
12094 }
12095
12096 struct RString fake_str;
12097 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
12098}
12099
12100VALUE
12101rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
12102{
12103 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12104}
12105
12106void
12107Init_String(void)
12108{
12109 rb_cString = rb_define_class("String", rb_cObject);
12110 assert(rb_vm_fstring_table());
12111 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12113 rb_define_alloc_func(rb_cString, empty_str_alloc);
12114 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12115 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12116 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12117 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12118 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12121 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12122 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12123 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12124 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12127 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12128 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12129 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12130 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12133 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12134 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12135 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12136 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12137 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12139 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12141 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12142 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12143 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12144 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12145 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12146 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12148 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12149 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12150 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12151 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12152 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12153 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12154 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12155 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12157 rb_define_method(rb_cString, "+@", str_uplus, 0);
12158 rb_define_method(rb_cString, "-@", str_uminus, 0);
12159 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12160 rb_define_alias(rb_cString, "dedup", "-@");
12161
12162 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12163 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12164 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12165 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12168 rb_define_method(rb_cString, "undump", str_undump, 0);
12169
12170 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12171 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12172 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12173 sym_fold = ID2SYM(rb_intern_const("fold"));
12174
12175 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12176 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12177 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12178 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12179
12180 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12181 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12182 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12183 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12184
12185 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12186 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12187 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12188 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12189 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12190 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12191 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12192 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12193 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12194 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12195 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12197 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12198 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12199 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12200 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12201 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12202
12203 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12204 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12205 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12206
12207 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12208
12209 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12210 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12211 rb_define_method(rb_cString, "center", rb_str_center, -1);
12212
12213 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12214 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12215 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12216 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12217 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12218 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12219 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12220 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12221 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12222
12223 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12224 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12225 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12226 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12227 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12228 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12229 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12230 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12231 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12232
12233 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12234 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12235 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12236 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12237 rb_define_method(rb_cString, "count", rb_str_count, -1);
12238
12239 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12240 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12241 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12242 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12243
12244 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12245 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12246 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12247 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12248 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12249
12250 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12251
12252 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12253 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12254
12255 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12256 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12257
12258 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12259 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12260 rb_define_method(rb_cString, "b", rb_str_b, 0);
12261 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12262 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12263
12264 /* define UnicodeNormalize module here so that we don't have to look it up */
12265 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12266 id_normalize = rb_intern_const("normalize");
12267 id_normalized_p = rb_intern_const("normalized?");
12268
12269 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12270 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12271 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12272
12273 rb_fs = Qnil;
12274 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12275 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12276 rb_gc_register_address(&rb_fs);
12277
12278 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12282 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12283
12284 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12285 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12286 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12288 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12289 rb_define_method(rb_cSymbol, "name", rb_sym2str, 0); /* in symbol.c */
12290 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12291 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12292 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12293
12294 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12295 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12296 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12297 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12298
12299 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12300 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12301 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12302 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12303 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12304 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12305 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12306
12307 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12308 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12309 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12310 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12311
12312 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12313 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12314
12315 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12316}
#define RUBY_ASSERT(expr)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:177
#define RUBY_ASSERT_ALWAYS(expr)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:167
Atomic operations.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
Definition ctype.h:82
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1199
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:883
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:324
void rb_include_module(VALUE klass, VALUE module)
Includes a module into a class.
Definition class.c:1177
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:970
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1085
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2332
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2156
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv into the given VALUE references according to the format string.
Definition class.c:2622
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:866
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2411
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:107
#define NEWOBJ_OF
Old name of RB_NEWOBJ_OF.
Definition newobj.h:61
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:105
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:134
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:66
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:398
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:137
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define OBJ_FREEZE_RAW
Old name of RB_OBJ_FREEZE_RAW.
Definition fl_type.h:136
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:135
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:108
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:395
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:393
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:516
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:132
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:129
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:652
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:66
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:517
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:518
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:515
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:67
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:131
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:67
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:107
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:133
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:109
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:651
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:130
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:138
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:68
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:433
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3567
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1348
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1344
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1351
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1342
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1346
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:634
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2058
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords, if any, to the #initialize method.
Definition object.c:2076
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1237
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
Definition object.c:3432
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:215
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:541
VALUE rb_cSymbol
Symbol class.
Definition string.c:79
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:147
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:78
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3145
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:619
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:682
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:703
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:570
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:446
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:98
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition encoding.h:590
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:431
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:618
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:725
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1149
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1015
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2757
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1034
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12090
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:252
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2101
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:962
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1254
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1155
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:781
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12101
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:653
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:414
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2651
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2914
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1731
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1121
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the given array.
Definition vm_eval.c:1208
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:495
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
#define rb_check_frozen
Just another name of rb_check_frozen_inline.
Definition error.h:264
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:280
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:538
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1802
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current process.
Definition symbol.c:1020
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1808
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1744
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1235
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4150
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3647
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1441
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1857
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby's string instead of C's.
Definition string.c:12071
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1538
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1318
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2252
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3411
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1230
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:11722
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2324
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "defaultexternal" encoding.
Definition string.c:1206
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1532
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:2785
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:4859
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:3631
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11015
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1747
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1585
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:997
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:815
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:3620
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2190
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1802
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6066
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:2890
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition string.c:12084
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "defaultexternal" encoding.
Definition string.h:1604
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:2832
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:3733
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:6778
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2530
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12077
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:3687
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3502
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:3662
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3353
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3001
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5369
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11073
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "defaultexternal".
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1488
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Tries converting an object to its stringised representation using its to_str method, if any.
Definition string.c:2681
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:2979
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3072
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary".
Definition string.c:1009
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2486
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:6892
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1218
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2204
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string by the given number of bytes.
Definition string.c:5287
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:8970
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "binary".
Definition string.c:1003
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2937
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1274
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:276
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition symbol.c:953
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12044
ID rb_to_id(VALUE str)
Definition string.c:12034
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1796
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3431
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4394
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:214
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1376
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:366
#define ALLOCA_N(type, n)
Definition memory.h:286
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:354
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:161
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:152
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:71
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1248
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks the contents for viability as a C string.
Definition string.c:2658
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except for the return type.
Definition rstring.h:468
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2542
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into the default external encoding.
Definition string.c:1242
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2553
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1576
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks the contents for viability as a C string.
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:449
struct rb_data_type_struct rb_data_type_t
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:197
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1394
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:77
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::(anonymous union)::(anonymous struct)::(anonymous union) aux
Auxiliary info.
struct RString::(anonymous union)::(anonymous struct) embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
union RString::(anonymous union) as
String's specific fields.
struct RString::(anonymous union)::(anonymous struct) heap
Strings that use a separate memory region for their contents use this pattern.
Definition string.c:7850
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:298
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:432