14#include "ruby/internal/config.h"
24#include "debug_counter.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/numeric.h"
35#include "internal/object.h"
36#include "internal/proc.h"
37#include "internal/re.h"
38#include "internal/sanitizers.h"
39#include "internal/string.h"
40#include "internal/transcode.h"
45#include "ruby_assert.h"
48#if defined HAVE_CRYPT_R
49# if defined HAVE_CRYPT_H
52#elif !defined HAVE_CRYPT
53# include "missing/crypt.h"
54# define HAVE_CRYPT_R 1
57#define BEG(no) (regs->beg[(no)])
58#define END(no) (regs->end[(no)])
61#undef rb_usascii_str_new
65#undef rb_usascii_str_new_cstr
66#undef rb_utf8_str_new_cstr
67#undef rb_enc_str_new_cstr
68#undef rb_external_str_new_cstr
69#undef rb_locale_str_new_cstr
70#undef rb_str_dup_frozen
71#undef rb_str_buf_new_cstr
101#define RUBY_MAX_CHAR_LEN 16
102#define STR_SHARED_ROOT FL_USER5
103#define STR_BORROWED FL_USER6
104#define STR_TMPLOCK FL_USER7
105#define STR_NOFREE FL_USER18
106#define STR_FAKESTR FL_USER19
/* STR_SET_NOEMBED(str): set STR_NOEMBED on str and clear the STR_SHARED,
 * STR_SHARED_ROOT and STR_BORROWED flags. */
108#define STR_SET_NOEMBED(str) do {\
109 FL_SET((str), STR_NOEMBED);\
110 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
/* STR_SET_EMBED(str): clear both STR_NOEMBED and STR_NOFREE on str. */
112#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
/* STR_SET_LEN(str, n): store n into the len field of str's RString. */
114#define STR_SET_LEN(str, n) do { \
115 RSTRING(str)->len = (n); \
/*
 * Encoding fast-path test used by TERM_LEN: when it yields true, the
 * terminator length is taken to be 1 without looking up the encoding's
 * minimum character length.  The cases visible in this excerpt are
 * ENCINDEX_ASCII_8BIT and ENCINDEX_US_ASCII; the value returned for each
 * case is not visible here.
 */
119str_enc_fastpath(
VALUE str)
123 case ENCINDEX_ASCII_8BIT:
125 case ENCINDEX_US_ASCII:
/* TERM_LEN(str): terminator length for str -- 1 when str_enc_fastpath(str)
 * holds, otherwise the minimum character length (rb_enc_mbminlen) of the
 * encoding looked up from str's encoding index. */
132#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/*
 * TERM_FILL(ptr, termlen): write the terminator at ptr.  Both arguments are
 * evaluated once into locals.  The first byte is always set to '\0'; only
 * when termlen is greater than 1 (marked UNLIKELY) is the whole
 * termlen-byte terminator zeroed with memset.
 */
133#define TERM_FILL(ptr, termlen) do {\
134 char *const term_fill_ptr = (ptr);\
135 const int term_fill_len = (termlen);\
136 *term_fill_ptr = '\0';\
137 if (UNLIKELY(term_fill_len > 1))\
138 memset(term_fill_ptr, 0, term_fill_len);\
/* RESIZE_CAPA(str, capacity): compute TERM_LEN(str) once and delegate to
 * RESIZE_CAPA_TERM with that terminator length. */
141#define RESIZE_CAPA(str,capacity) do {\
142 const int termlen = TERM_LEN(str);\
143 RESIZE_CAPA_TERM(str,capacity,termlen);\
/*
 * RESIZE_CAPA_TERM(str, capacity, termlen): make room for capacity bytes
 * plus a termlen-byte terminator.
 *  - Embedded string whose embed capacity is smaller than
 *    capacity + termlen: allocate capacity + termlen bytes, copy the current
 *    RSTRING_LEN bytes into it, install it as the heap pointer, restore the
 *    length, mark the string non-embedded and record the new capacity.
 *  - The second group of statements (its branch header is not visible in
 *    this excerpt; presumably the non-embedded case) asserts the string is
 *    not STR_SHARED, reallocates the heap buffer from STR_HEAP_SIZE(str) to
 *    capacity + termlen bytes and records the new capacity.
 * Note that the comparison uses the capacity and termlen arguments without
 * surrounding parentheses.
 */
145#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
146 if (STR_EMBED_P(str)) {\
147 if (str_embed_capa(str) < capacity + termlen) {\
148 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
149 const long tlen = RSTRING_LEN(str);\
150 memcpy(tmp, RSTRING_PTR(str), tlen);\
151 RSTRING(str)->as.heap.ptr = tmp;\
152 RSTRING(str)->len = tlen;\
153 STR_SET_NOEMBED(str);\
154 RSTRING(str)->as.heap.aux.capa = (capacity);\
158 assert(!FL_TEST((str), STR_SHARED)); \
159 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
160 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
161 RSTRING(str)->as.heap.aux.capa = (capacity);\
/*
 * STR_SET_SHARED(str, shared_str): record shared_str as the string that str
 * shares its buffer with.  Nothing is done when str has STR_FAKESTR set.
 * Otherwise it asserts that str's pointer lies between shared_str's pointer
 * and shared_str's pointer + length (inclusive), stores shared_str in
 * as.heap.aux.shared through RB_OBJ_WRITE, sets STR_SHARED on str and
 * STR_SHARED_ROOT on shared_str, and sets STR_BORROWED on shared_str when
 * its class is 0.
 */
165#define STR_SET_SHARED(str, shared_str) do { \
166 if (!FL_TEST(str, STR_FAKESTR)) { \
167 assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
168 assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
169 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
170 FL_SET((str), STR_SHARED); \
171 FL_SET((shared_str), STR_SHARED_ROOT); \
172 if (RBASIC_CLASS((shared_str)) == 0) \
173 FL_SET_RAW((shared_str), STR_BORROWED); \
177#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* STR_HEAP_SIZE(str): heap buffer size in bytes, computed as the recorded
 * capacity (as.heap.aux.capa) plus the terminator length TERM_LEN(str). */
178#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
181#define STR_ENC_GET(str) get_encoding(str)
/*
 * SHARABLE_MIDDLE_SUBSTRING defaults to 0 when not predefined.  When it is
 * 0, SHARABLE_SUBSTRING_P(beg, len, end) holds only if the substring runs
 * up to end (beg + len == end).  The second definition, which is always 1,
 * is presumably the alternative branch (its #else line is not visible in
 * this excerpt).
 */
183#if !defined SHARABLE_MIDDLE_SUBSTRING
184# define SHARABLE_MIDDLE_SUBSTRING 0
186#if !SHARABLE_MIDDLE_SUBSTRING
187#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
189#define SHARABLE_SUBSTRING_P(beg, len, end) 1
/*
 * Number of bytes available for embedded contents in str's object slot:
 * the GC slot size of the object minus the offset of as.embed.ary within
 * struct RString.
 */
194str_embed_capa(
VALUE str)
196 return rb_gc_obj_slot_size(str) - offsetof(
struct RString, as.
embed.ary);
/*
 * True when none of STR_NOFREE, STR_SHARED_ROOT or STR_SHARED is set on str.
 */
200rb_str_reembeddable_p(
VALUE str)
202 return !
FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
206rb_str_embed_size(
long capa)
/*
 * Computes real_size for str:
 *  - embedded string: rb_str_embed_size(len) + TERM_LEN(str);
 *  - non-embedded but re-embeddable (rb_str_reembeddable_p): 
 *    rb_str_embed_size(capa) + TERM_LEN(str), i.e. based on the recorded
 *    heap capacity rather than the length;
 *  - the last assignment (its branch header is not visible in this excerpt;
 *    presumably the remaining case) uses sizeof(struct RString).
 * The declaration of real_size and the return statement are not visible
 * here.
 */
212rb_str_size_as_embedded(
VALUE str)
215 if (STR_EMBED_P(str)) {
216 real_size = rb_str_embed_size(
RSTRING(str)->
len) + TERM_LEN(str);
220 else if (rb_str_reembeddable_p(str)) {
221 real_size = rb_str_embed_size(
RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
224 real_size =
sizeof(
struct RString);
/*
 * True when the embed size needed for len content bytes plus a
 * termlen-byte terminator is accepted by rb_gc_size_allocatable_p().
 */
230STR_EMBEDDABLE_P(
long len,
long termlen)
232 return rb_gc_size_allocatable_p(rb_str_embed_size(
len + termlen));
237static VALUE str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding);
238static VALUE str_new_static(
VALUE klass,
const char *ptr,
long len,
int encindex);
240static void str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen);
241static inline void str_modifiable(
VALUE str);
/*
 * Convenience wrapper: calls str_make_independent_expand() with str's
 * current length, an expansion of 0 and str's current terminator length.
 */
245str_make_independent(
VALUE str)
247 long len = RSTRING_LEN(str);
248 int termlen = TERM_LEN(str);
249 str_make_independent_expand((str),
len, 0L, termlen);
252static inline int str_dependent_p(
VALUE str);
/*
 * Calls str_make_independent(str) only when str_dependent_p(str) reports
 * that the string is dependent; otherwise does nothing visible here.
 */
255rb_str_make_independent(
VALUE str)
257 if (str_dependent_p(str)) {
258 str_make_independent(str);
/*
 * Steps visible in this excerpt (several lines, including the declaration
 * of len and any flag changes, are not shown):
 *  1. remember the current heap pointer in buf;
 *  2. set the string length to len;
 *  3. copy len bytes from buf to RSTRING_PTR(str);
 *  4. write a TERM_LEN(str)-byte terminator right after the copied bytes in
 *     the embed array.
 * buf must be read before the representation changes, because the heap and
 * embed members are both reached through the same `as` member.
 * NOTE(review): whether buf is released afterwards is not visible here.
 */
263rb_str_make_embedded(
VALUE str)
268 char *buf =
RSTRING(str)->as.heap.ptr;
272 STR_SET_LEN(str,
len);
275 memcpy(RSTRING_PTR(str), buf,
len);
279 TERM_FILL(
RSTRING(str)->
as.embed.ary +
len, TERM_LEN(str));
/*
 * Debugging aid: prints a message to stderr stating that the function named
 * by func is returning NULL, that a SIGSEGV is expected to follow, and
 * asking for a debugger to be attached.  The fprintf argument list
 * continues past the lines visible in this excerpt.
 */
283rb_debug_rstring_null_ptr(
const char *func)
285 fprintf(stderr,
"%s is returning NULL!! "
286 "SIGSEGV is highly expected to follow immediately.\n"
287 "If you could reproduce, attach your debugger here, "
288 "and look at the passed string.\n",
293static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
296get_encoding(
VALUE str)
/*
 * Raises ArgumentError ("invalid byte sequence in <encoding name>") when
 * is_broken_string(str) is true.
 */
302mustnot_broken(
VALUE str)
304 if (is_broken_string(str)) {
305 rb_raise(rb_eArgError,
"invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
310mustnot_wchar(
VALUE str)
312 rb_encoding *enc = STR_ENC_GET(str);
314 rb_raise(rb_eArgError,
"wide char encoding: %s", rb_enc_name(enc));
320static VALUE register_fstring(
VALUE str,
bool copy);
322const struct st_hash_type rb_fstring_hash_type = {
/* BARE_STRING_P(str): true when str does not have FL_EXIVAR set and its
 * class is exactly rb_cString. */
327#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
335fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data,
int existing)
345 if (rb_objspace_garbage_object_p(str)) {
357 rb_enc_copy(new_str, str);
370 if (STR_SHARED_P(str)) {
372 str_make_independent(str);
375 if (!BARE_STRING_P(str)) {
379 RBASIC(str)->flags |= RSTRING_FSTR;
381 *key = *value = arg->fstr = str;
395 if (
FL_TEST(str, RSTRING_FSTR))
398 bare = BARE_STRING_P(str);
400 if (STR_EMBED_P(str)) {
405 if (
FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
412 rb_str_resize(str, RSTRING_LEN(str));
414 fstr = register_fstring(str, FALSE);
417 str_replace_shared_without_enc(str, fstr);
/*
 * Runs st_update() on the table returned by rb_vm_fstring_table(), keyed by
 * str, with fstr_update_callback and a pointer to args as callback data.
 * The update is repeated for as long as args.fstr is still undefined after
 * the call.  The declaration and initialisation of args (and how `copy` is
 * passed along) are not visible in this excerpt.
 */
425register_fstring(
VALUE str,
bool copy)
432 st_table *frozen_strings = rb_vm_fstring_table();
435 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
436 }
while (UNDEF_P(args.fstr));
448setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
int encidx)
464 return (
VALUE)fake_str;
/*
 * Encoding-pointer variant of setup_fake_str(): converts enc to its index
 * with rb_enc_to_index() and forwards all arguments.
 */
471rb_setup_fake_str(
struct RString *fake_str,
const char *name,
long len, rb_encoding *enc)
473 return setup_fake_str(fake_str, name,
len, rb_enc_to_index(enc));
/*
 * Sets up fake_str over ptr/len with encoding index ENCINDEX_US_ASCII and
 * passes it to register_fstring() with copy = FALSE, returning the result.
 * The declaration of fake_str is not visible in this excerpt.
 */
482rb_fstring_new(
const char *ptr,
long len)
485 return register_fstring(setup_fake_str(&fake_str, ptr,
len, ENCINDEX_US_ASCII), FALSE);
/*
 * Same as rb_fstring_new() but with a caller-supplied encoding: sets up
 * fake_str over ptr/len with enc and passes it to register_fstring() with
 * copy = FALSE.
 */
489rb_fstring_enc_new(
const char *ptr,
long len, rb_encoding *enc)
492 return register_fstring(rb_setup_fake_str(&fake_str, ptr,
len, enc), FALSE);
/*
 * NUL-terminated C string variant of rb_fstring_new(): the length is taken
 * with strlen(ptr), so ptr must not be NULL.
 */
496rb_fstring_cstr(
const char *ptr)
498 return rb_fstring_new(ptr, strlen(ptr));
502fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
512 const char *aptr, *bptr;
515 return (alen != blen ||
517 memcmp(aptr, bptr, alen) != 0);
521single_byte_optimizable(
VALUE str)
529 enc = STR_ENC_GET(str);
/*
 * Looks for a byte with the high bit (0x80) set in the range p..e and
 * returns a pointer to it.
 *
 * Shape of the visible code:
 *  - NONASCII_MASK is a word with 0x80 in every byte; it is defined for
 *    8-byte and 4-byte uintptr_t, and other sizes are a compile error.
 *  - When unaligned word access is not available and p is not aligned to
 *    SIZEOF_VOIDP, the leading bytes are tested one at a time by the first
 *    case ladder (p[-7] .. p[-1]); the adjustment of p before the ladder is
 *    not visible in this excerpt.
 *  - Words are then tested against NONASCII_MASK.  On a hit, the byte
 *    offset inside the word is the number of leading zero bits (big endian)
 *    or trailing zero bits (little endian) of the masked word, divided by 8.
 *  - The second case ladder (e[-7] .. e[-1]) tests the trailing bytes that
 *    do not fill a whole word.
 * The value returned when no such byte exists is not visible here.
 */
540static inline const char *
541search_nonascii(
const char *p,
const char *e)
543 const uintptr_t *s, *t;
545#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
546# if SIZEOF_UINTPTR_T == 8
547# define NONASCII_MASK UINT64_C(0x8080808080808080)
548# elif SIZEOF_UINTPTR_T == 4
549# define NONASCII_MASK UINT32_C(0x80808080)
551# error "don't know what to do."
554# if SIZEOF_UINTPTR_T == 8
555# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
556# elif SIZEOF_UINTPTR_T == 4
557# define NONASCII_MASK 0x80808080UL
559# error "don't know what to do."
563 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
564#if !UNALIGNED_WORD_ACCESS
565 if ((uintptr_t)p % SIZEOF_VOIDP) {
566 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
571 case 7:
if (p[-7]&0x80)
return p-7;
572 case 6:
if (p[-6]&0x80)
return p-6;
573 case 5:
if (p[-5]&0x80)
return p-5;
574 case 4:
if (p[-4]&0x80)
return p-4;
576 case 3:
if (p[-3]&0x80)
return p-3;
577 case 2:
if (p[-2]&0x80)
return p-2;
578 case 1:
if (p[-1]&0x80)
return p-1;
583#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
584#define aligned_ptr(value) \
585 __builtin_assume_aligned((value), sizeof(uintptr_t))
587#define aligned_ptr(value) (uintptr_t *)(value)
590 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
593 if (*s & NONASCII_MASK) {
594#ifdef WORDS_BIGENDIAN
595 return (
const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
597 return (
const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
607 case 7:
if (e[-7]&0x80)
return e-7;
608 case 6:
if (e[-6]&0x80)
return e-6;
609 case 5:
if (e[-5]&0x80)
return e-5;
610 case 4:
if (e[-4]&0x80)
return e-4;
612 case 3:
if (e[-3]&0x80)
return e-3;
613 case 2:
if (e[-2]&0x80)
return e-2;
614 case 1:
if (e[-1]&0x80)
return e-1;
620coderange_scan(
const char *p,
long len, rb_encoding *enc)
622 const char *e = p +
len;
624 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
626 p = search_nonascii(p, e);
630 if (rb_enc_asciicompat(enc)) {
631 p = search_nonascii(p, e);
634 int ret = rb_enc_precise_mbclen(p, e, enc);
638 p = search_nonascii(p, e);
644 int ret = rb_enc_precise_mbclen(p, e, enc);
660 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
663 p = search_nonascii(p, e);
667 else if (rb_enc_asciicompat(enc)) {
668 p = search_nonascii(p, e);
674 int ret = rb_enc_precise_mbclen(p, e, enc);
681 p = search_nonascii(p, e);
687 int ret = rb_enc_precise_mbclen(p, e, enc);
712 rb_enc_set_index(str1, rb_enc_get_index(str2));
720rb_enc_cr_str_copy_for_substr(
VALUE dest,
VALUE src)
725 str_enc_copy(dest, src);
726 if (RSTRING_LEN(dest) == 0) {
727 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
738 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
739 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
750rb_enc_cr_str_exact_copy(
VALUE dest,
VALUE src)
752 str_enc_copy(dest, src);
/*
 * VALUE-taking wrapper: runs coderange_scan() over str's bytes
 * (RSTRING_PTR / RSTRING_LEN) using the given encoding.
 */
757enc_coderange_scan(
VALUE str, rb_encoding *enc)
759 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
763rb_enc_str_coderange_scan(
VALUE str, rb_encoding *enc)
765 return enc_coderange_scan(str, enc);
774 cr = enc_coderange_scan(str, get_encoding(str));
783 rb_encoding *enc = STR_ENC_GET(str);
785 if (!rb_enc_asciicompat(enc))
787 else if (is_ascii_string(str))
793str_mod_check(
VALUE s,
const char *p,
long len)
795 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) !=
len){
/*
 * Capacity of str excluding a termlen-byte terminator.
 *  - embedded: the embed capacity minus termlen;
 *  - STR_SHARED or STR_NOFREE set: the returned value is not visible in
 *    this excerpt;
 *  - the final return (presumably the remaining case) is the recorded heap
 *    capacity as.heap.aux.capa.
 */
801str_capacity(
VALUE str,
const int termlen)
803 if (STR_EMBED_P(str)) {
804 return str_embed_capa(str) - termlen;
806 else if (
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
810 return RSTRING(str)->as.heap.aux.capa;
817 return str_capacity(str, TERM_LEN(str));
821must_not_null(
const char *ptr)
824 rb_raise(rb_eArgError,
"NULL pointer given");
831 size_t size = rb_str_embed_size(
capa);
833 assert(rb_gc_size_allocatable_p(size));
842str_alloc_heap(
VALUE klass)
/*
 * Allocates an embedded string of class klass with a requested capacity of
 * 0, after firing the STRING creation DTrace hook with length 0, and zeroes
 * the whole embed area (str_embed_capa(str) bytes).  The return statement
 * is not visible in this excerpt.
 */
851empty_str_alloc(
VALUE klass)
853 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
854 VALUE str = str_alloc_embed(klass, 0);
855 memset(
RSTRING(str)->
as.embed.ary, 0, str_embed_capa(str));
/*
 * Creates a string of class klass holding len bytes with a termlen-byte
 * terminator.
 *
 * Visible behaviour:
 *  - raises ArgumentError "negative string size (or size too big)"; the
 *    guarding condition is not visible in this excerpt;
 *  - fires the STRING creation DTrace hook with len;
 *  - if len + termlen is embeddable, allocates an embedded string of that
 *    capacity; otherwise allocates a heap string and obtains the buffer via
 *    rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen),
 *    i.e. len + termlen bytes;
 *  - copies len bytes from ptr (the guard around this copy, if any, is not
 *    visible), sets the length and writes the terminator after the content.
 */
860str_new0(
VALUE klass,
const char *ptr,
long len,
int termlen)
865 rb_raise(rb_eArgError,
"negative string size (or size too big)");
868 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
870 if (STR_EMBEDDABLE_P(
len, termlen)) {
871 str = str_alloc_embed(klass,
len + termlen);
877 str = str_alloc_heap(klass);
883 rb_xmalloc_mul_add_mul(
sizeof(
char),
len,
sizeof(
char), termlen);
886 memcpy(RSTRING_PTR(str), ptr,
len);
888 STR_SET_LEN(str,
len);
889 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
/*
 * str_new0() with a fixed terminator length of 1 byte.
 */
894str_new(
VALUE klass,
const char *ptr,
long len)
896 return str_new0(klass, ptr,
len, 1);
917 rb_enc_associate_index(str, rb_utf8_encindex());
922rb_enc_str_new(
const char *ptr,
long len, rb_encoding *enc)
929 rb_enc_associate(str, enc);
941 __msan_unpoison_string(ptr);
957 rb_enc_associate_index(str, rb_utf8_encindex());
966 rb_raise(rb_eArgError,
"wchar encoding given");
968 return rb_enc_str_new(ptr, strlen(ptr), enc);
/*
 * Creates a heap string of class klass whose buffer pointer is ptr itself
 * (no copy is visible) and marks it STR_NOFREE, then associates the
 * encoding index encindex.
 *
 * Also visible: an ArgumentError "negative string size (or size too big)"
 * (guard condition not shown), a lookup of the encoding from encindex, and
 * the STRING creation DTrace hook.  Because the string points at ptr
 * directly, the caller's buffer presumably has to outlive the string --
 * TODO confirm against the callers.
 */
972str_new_static(
VALUE klass,
const char *ptr,
long len,
int encindex)
977 rb_raise(rb_eArgError,
"negative string size (or size too big)");
981 rb_encoding *enc = rb_enc_get_from_index(encindex);
985 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
986 str = str_alloc_heap(klass);
988 RSTRING(str)->as.heap.ptr = (
char *)ptr;
990 RBASIC(str)->flags |= STR_NOFREE;
992 rb_enc_associate_index(str, encindex);
1005 return str_new_static(
rb_cString, ptr,
len, ENCINDEX_US_ASCII);
1011 return str_new_static(
rb_cString, ptr,
len, ENCINDEX_UTF_8);
1017 return str_new_static(
rb_cString, ptr,
len, rb_enc_to_index(enc));
1020static VALUE str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *ptr,
long len,
1021 rb_encoding *from, rb_encoding *to,
1022 int ecflags,
VALUE ecopts);
/*
 * When str's encoding index equals that of enc, the answer is
 * is_ascii_string(str).  The handling of a differing encoding is not
 * visible in this excerpt.
 */
1025is_enc_ascii_string(
VALUE str, rb_encoding *enc)
1027 int encidx = rb_enc_to_index(enc);
1028 if (rb_enc_get_index(str) == encidx)
1029 return is_ascii_string(str);
1040 if (!to)
return str;
1041 if (!from) from = rb_enc_get(str);
1042 if (from == to)
return str;
1043 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1044 rb_is_ascii8bit_enc(to)) {
1045 if (STR_ENC_GET(str) != to) {
1046 str = rb_str_dup(str);
1047 rb_enc_associate(str, to);
1053 newstr = str_cat_conv_enc_opts(rb_str_buf_new(
len), 0, ptr,
len,
1054 from, to, ecflags, ecopts);
1055 if (
NIL_P(newstr)) {
/*
 * Appends ptr/len to newstr at byte offset ofs.
 *
 * Visible behaviour:
 *  - ofs is checked against the current length olen: values below -olen or
 *    above olen take the branch at the range check (its body is not visible
 *    in this excerpt);
 *  - a negative ofs counts from the end (ofs += olen);
 *  - one path truncates newstr to ofs and appends the raw bytes with
 *    rb_str_cat() (the condition selecting it is not visible);
 *  - otherwise newstr is made modifiable with rb_str_modify() and the work
 *    is delegated to str_cat_conv_enc_opts().
 */
1063rb_str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *ptr,
long len,
1064 rb_encoding *from,
int ecflags,
VALUE ecopts)
1068 olen = RSTRING_LEN(newstr);
1069 if (ofs < -olen || olen < ofs)
1071 if (ofs < 0) ofs += olen;
1073 STR_SET_LEN(newstr, ofs);
1074 return rb_str_cat(newstr, ptr,
len);
1077 rb_str_modify(newstr);
1078 return str_cat_conv_enc_opts(newstr, ofs, ptr,
len, from,
/*
 * Resets str to ptr/len in encoding enc: the length is set to 0, enc is
 * associated, and then the bytes are appended with rb_str_cat().
 */
1084rb_str_initialize(
VALUE str,
const char *ptr,
long len, rb_encoding *enc)
1086 STR_SET_LEN(str, 0);
1087 rb_enc_associate(str, enc);
1088 rb_str_cat(str, ptr,
len);
1093str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *ptr,
long len,
1094 rb_encoding *from, rb_encoding *to,
1095 int ecflags,
VALUE ecopts)
1100 VALUE econv_wrapper;
1101 const unsigned char *start, *sp;
1102 unsigned char *dest, *dp;
1103 size_t converted_output = (size_t)ofs;
1108 RBASIC_CLEAR_CLASS(econv_wrapper);
1110 if (!ec)
return Qnil;
1113 sp = (
unsigned char*)ptr;
1115 while ((dest = (
unsigned char*)RSTRING_PTR(newstr)),
1116 (dp = dest + converted_output),
1120 size_t converted_input = sp - start;
1121 size_t rest =
len - converted_input;
1122 converted_output = dp - dest;
1123 rb_str_set_len(newstr, converted_output);
1124 if (converted_input && converted_output &&
1125 rest < (LONG_MAX / converted_output)) {
1126 rest = (rest * converted_output) / converted_input;
1131 olen += rest < 2 ? 2 : rest;
1132 rb_str_resize(newstr, olen);
1138 len = dp - (
unsigned char*)RSTRING_PTR(newstr);
1139 rb_str_set_len(newstr,
len);
1140 rb_enc_associate(newstr, to);
1159 const int eidx = rb_enc_to_index(eenc);
1162 return rb_enc_str_new(ptr,
len, eenc);
1166 if ((eidx == rb_ascii8bit_encindex()) ||
1167 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr +
len))) {
1171 ienc = rb_default_internal_encoding();
1172 if (!ienc || eenc == ienc) {
1173 return rb_enc_str_new(ptr,
len, eenc);
1177 if ((eidx == rb_ascii8bit_encindex()) ||
1178 (eidx == rb_usascii_encindex()) ||
1179 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr +
len))) {
1180 return rb_enc_str_new(ptr,
len, ienc);
1183 str = rb_enc_str_new(NULL, 0, ienc);
1186 if (
NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr,
len, eenc, 0,
Qnil))) {
1187 rb_str_initialize(str, ptr,
len, eenc);
/*
 * Tags str with the external encoding eenc.  If eenc is US-ASCII but str is
 * not an ASCII-only string, ASCII-8BIT is associated instead.  The final
 * association with eenc's own index is presumably the other path (the
 * control flow between the two calls is not visible in this excerpt).
 */
1193rb_external_str_with_enc(
VALUE str, rb_encoding *eenc)
1195 int eidx = rb_enc_to_index(eenc);
1196 if (eidx == rb_usascii_encindex() &&
1197 !is_ascii_string(str)) {
1198 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1201 rb_enc_associate_index(str, eidx);
1236rb_filesystem_str_new_cstr(
const char *ptr)
/*
 * Makes str2 hold the contents of str, without touching str2's encoding
 * (no encoding call is visible here; the encoding is handled by callers).
 *
 * Visible behaviour:
 *  - If str2's embed capacity can hold len + termlen bytes, the bytes are
 *    copied into str2's embed array, str2 is marked embedded and the
 *    terminator is written.  ptr2 is taken before STR_SET_EMBED.
 *  - Otherwise a shared root is chosen: str's existing shared string when
 *    str is already shared, else a frozen copy from rb_str_new_frozen(str).
 *    If str2 owns a heap buffer (not embedded, neither STR_SHARED nor
 *    STR_NOFREE) that buffer is released with ruby_sized_xfree(); the
 *    rb_fatal() call guards against freeing a shared root (its condition is
 *    not visible in this excerpt).  str2 is then marked non-embedded, its
 *    pointer is set to ptr and the root is recorded with STR_SET_SHARED.
 *  - Finally str2's length is set to len.
 * The definitions of len, ptr and root are not visible here.
 */
1260str_replace_shared_without_enc(
VALUE str2,
VALUE str)
1262 const int termlen = TERM_LEN(str);
1267 if (str_embed_capa(str2) >=
len + termlen) {
1268 char *ptr2 =
RSTRING(str2)->as.embed.ary;
1269 STR_SET_EMBED(str2);
1270 memcpy(ptr2, RSTRING_PTR(str),
len);
1271 TERM_FILL(ptr2+
len, termlen);
1275 if (STR_SHARED_P(str)) {
1276 root =
RSTRING(str)->as.heap.aux.shared;
1280 root = rb_str_new_frozen(str);
1284 if (!STR_EMBED_P(str2) && !
FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1286 rb_fatal(
"about to free a possible shared root");
1288 char *ptr2 = STR_HEAP_PTR(str2);
1290 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1293 FL_SET(str2, STR_NOEMBED);
1294 RSTRING(str2)->as.heap.ptr = ptr;
1295 STR_SET_SHARED(str2, root);
1298 STR_SET_LEN(str2,
len);
1306 str_replace_shared_without_enc(str2, str);
1307 rb_enc_cr_str_exact_copy(str2, str);
1314 return str_replace_shared(str_alloc_heap(klass), str);
1331rb_str_new_frozen_String(
VALUE orig)
/*
 * Returns str_new_frozen_buffer() for orig with class 0 and
 * copy_encoding = FALSE.
 */
1338rb_str_tmp_frozen_acquire(
VALUE orig)
1341 return str_new_frozen_buffer(0, orig, FALSE);
1345rb_str_tmp_frozen_no_embed_acquire(
VALUE orig)
1347 if (
OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig))
return orig;
1348 if (STR_SHARED_P(orig) && !STR_EMBED_P(
RSTRING(orig)->
as.heap.aux.shared))
return rb_str_tmp_frozen_acquire(orig);
1350 VALUE str = str_alloc_heap(0);
1353 FL_SET(str, STR_SHARED_ROOT);
1355 size_t capa = str_capacity(orig, TERM_LEN(orig));
1361 if (STR_EMBED_P(orig) ||
FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1362 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(
sizeof(
char),
capa,
sizeof(
char), TERM_LEN(orig));
1369 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1370 RBASIC(orig)->flags &= ~STR_NOFREE;
1371 STR_SET_SHARED(orig, str);
1381rb_str_tmp_frozen_release(
VALUE orig,
VALUE tmp)
1386 if (STR_EMBED_P(tmp)) {
1395 assert(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1399 RSTRING(orig)->as.heap.aux.capa =
RSTRING(tmp)->as.heap.aux.capa;
1400 RBASIC(orig)->flags |=
RBASIC(tmp)->flags & STR_NOFREE;
1405 STR_SET_LEN(tmp, 0);
1413 return str_new_frozen_buffer(klass, orig, TRUE);
1419 assert(!STR_EMBED_P(orig));
1420 assert(!STR_SHARED_P(orig));
1422 VALUE str = str_alloc_heap(klass);
1423 STR_SET_LEN(str, RSTRING_LEN(orig));
1424 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1425 RSTRING(str)->as.heap.aux.capa =
RSTRING(orig)->as.heap.aux.capa;
1426 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1427 RBASIC(orig)->flags &= ~STR_NOFREE;
1428 STR_SET_SHARED(orig, str);
1435str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding)
1439 long len = RSTRING_LEN(orig);
1440 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1442 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(
len, termlen)) {
1443 str = str_new0(klass, RSTRING_PTR(orig),
len, termlen);
1444 assert(STR_EMBED_P(str));
1449 long ofs =
RSTRING(orig)->as.heap.ptr - RSTRING_PTR(
shared);
1450 long rest = RSTRING_LEN(
shared) - ofs - RSTRING_LEN(orig);
1453 assert(ofs + rest <= RSTRING_LEN(
shared));
1456 if ((ofs > 0) || (rest > 0) ||
1459 str = str_new_shared(klass,
shared);
1460 assert(!STR_EMBED_P(str));
1461 RSTRING(str)->as.heap.ptr += ofs;
1462 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1470 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1471 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1473 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1474 STR_SET_LEN(str, RSTRING_LEN(orig));
1475 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1478 str = heap_str_make_shared(klass, orig);
1482 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1494str_new_empty_String(
VALUE str)
1497 rb_enc_copy(v, str);
1501#define STR_BUF_MIN_SIZE 63
1506 if (STR_EMBEDDABLE_P(
capa, 1)) {
1514 RSTRING(str)->as.heap.ptr[0] =
'\0';
1523 long len = strlen(ptr);
1525 str = rb_str_buf_new(
len);
1534 return str_new(0, 0,
len);
1540 if (
FL_TEST(str, RSTRING_FSTR)) {
1541 st_data_t fstr = (st_data_t)str;
1545 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1546 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1551 if (STR_EMBED_P(str)) {
1552 RB_DEBUG_COUNTER_INC(obj_str_embed);
1554 else if (
FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1555 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_SHARED));
1556 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_NOFREE));
1559 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1560 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
/*
 * Reports STR_HEAP_SIZE(str) when, of STR_NOEMBED, STR_SHARED and
 * STR_NOFREE, only STR_NOEMBED is set -- i.e. a heap string that is neither
 * shared nor marked no-free.  The value returned in the other cases is not
 * visible in this excerpt.
 */
1564RUBY_FUNC_EXPORTED
size_t
1565rb_str_memsize(
VALUE str)
1567 if (
FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1568 return STR_HEAP_SIZE(str);
1578 return rb_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
1581static inline void str_discard(
VALUE str);
1582static void str_shared_replace(
VALUE str,
VALUE str2);
1587 if (str != str2) str_shared_replace(str, str2);
1598 enc = STR_ENC_GET(str2);
1603 STR_SET_LEN(str, RSTRING_LEN(str2));
1605 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1607 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (
size_t)RSTRING_LEN(str2) + termlen);
1608 rb_enc_associate(str, enc);
1612 if (STR_EMBED_P(str2)) {
1613 assert(!
FL_TEST(str2, STR_SHARED));
1614 long len = RSTRING_LEN(str2);
1615 assert(
len + termlen <= str_embed_capa(str2));
1617 char *new_ptr =
ALLOC_N(
char,
len + termlen);
1618 memcpy(new_ptr,
RSTRING(str2)->
as.embed.ary,
len + termlen);
1619 RSTRING(str2)->as.heap.ptr = new_ptr;
1620 STR_SET_LEN(str2,
len);
1622 STR_SET_NOEMBED(str2);
1625 STR_SET_NOEMBED(str);
1627 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1629 if (
FL_TEST(str2, STR_SHARED)) {
1631 STR_SET_SHARED(str,
shared);
1634 RSTRING(str)->as.heap.aux.capa =
RSTRING(str2)->as.heap.aux.capa;
1638 STR_SET_EMBED(str2);
1639 RSTRING_PTR(str2)[0] = 0;
1640 STR_SET_LEN(str2, 0);
1641 rb_enc_associate(str, enc);
1655 return rb_obj_as_string_result(str, obj);
1671 len = RSTRING_LEN(str2);
1672 if (STR_SHARED_P(str2)) {
1675 STR_SET_NOEMBED(str);
1676 STR_SET_LEN(str,
len);
1677 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1678 STR_SET_SHARED(str,
shared);
1679 rb_enc_cr_str_exact_copy(str, str2);
1682 str_replace_shared(str, str2);
1691 size_t size = rb_str_embed_size(
capa);
1693 assert(rb_gc_size_allocatable_p(size));
1713 const VALUE flag_mask =
1719 if (STR_EMBED_P(str)) {
1720 long len = RSTRING_LEN(str);
1722 assert(STR_EMBED_P(dup));
1723 assert(str_embed_capa(dup) >=
len + 1);
1729 root =
RSTRING(str)->as.heap.aux.shared;
1731 else if (UNLIKELY(!(flags &
FL_FREEZE))) {
1732 root = str = str_new_frozen(klass, str);
1735 assert(!STR_SHARED_P(root));
1738 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1739 FL_SET(root, STR_SHARED_ROOT);
1741 flags |= RSTRING_NOEMBED | STR_SHARED;
1744 STR_SET_LEN(dup, RSTRING_LEN(str));
1747 encidx = rb_enc_get_index(str);
1751 if (encidx) rb_enc_associate_index(dup, encidx);
1759 if (STR_EMBED_P(str)) {
1760 dup = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1763 dup = ec_str_alloc_heap(ec, klass);
1766 return str_duplicate_setup(klass, str, dup);
1773 if (STR_EMBED_P(str)) {
1774 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1777 dup = str_alloc_heap(klass);
1780 return str_duplicate_setup(klass, str, dup);
1791rb_str_dup_m(
VALUE str)
1793 if (LIKELY(BARE_STRING_P(str))) {
1804 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1811 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1812 return ec_str_duplicate(ec,
rb_cString, str);
1827 static ID keyword_ids[2];
1828 VALUE orig, opt, venc, vcapa;
1830 rb_encoding *enc = 0;
1833 if (!keyword_ids[0]) {
1834 keyword_ids[0] = rb_id_encoding();
1835 CONST_ID(keyword_ids[1],
"capacity");
1843 if (!UNDEF_P(venc) && !
NIL_P(venc)) {
1844 enc = rb_to_encoding(venc);
1846 if (!UNDEF_P(vcapa) && !
NIL_P(vcapa)) {
1851 if (
capa < STR_BUF_MIN_SIZE) {
1852 capa = STR_BUF_MIN_SIZE;
1856 len = RSTRING_LEN(orig);
1860 if (orig == str) n = 0;
1862 str_modifiable(str);
1863 if (STR_EMBED_P(str) ||
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1865 const size_t size = (size_t)
capa + termlen;
1866 const char *
const old_ptr = RSTRING_PTR(str);
1867 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
1868 char *new_ptr =
ALLOC_N(
char, size);
1869 if (STR_EMBED_P(str))
RUBY_ASSERT(osize <= str_embed_capa(str));
1870 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1872 RSTRING(str)->as.heap.ptr = new_ptr;
1874 else if (STR_HEAP_SIZE(str) != (
size_t)
capa + termlen) {
1875 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
1876 (
size_t)
capa + termlen, STR_HEAP_SIZE(str));
1878 STR_SET_LEN(str,
len);
1881 memcpy(
RSTRING(str)->
as.heap.ptr, RSTRING_PTR(orig),
len);
1882 rb_enc_cr_str_exact_copy(str, orig);
1884 FL_SET(str, STR_NOEMBED);
1891 rb_enc_associate(str, enc);
1903rb_str_s_new(
int argc,
VALUE *argv,
VALUE klass)
1909 static ID keyword_ids[2];
1912 rb_encoding *enc = NULL;
1919 keyword_ids[0] = rb_id_encoding();
1920 CONST_ID(keyword_ids[1],
"capacity");
1922 encoding = kwargs[0];
1923 capacity = kwargs[1];
1934 if (UNDEF_P(encoding)) {
1936 encoding = rb_obj_encoding(orig);
1940 if (!UNDEF_P(encoding)) {
1941 enc = rb_to_encoding(encoding);
1946 if (UNDEF_P(capacity)) {
1948 VALUE empty_str = str_new(klass,
"", 0);
1950 rb_enc_associate(empty_str, enc);
1954 VALUE copy = str_duplicate(klass, orig);
1955 rb_enc_associate(copy, enc);
1968 if (orig_capa >
capa) {
1973 VALUE str = str_new0(klass, NULL,
capa, termlen);
1974 STR_SET_LEN(str, 0);
1975 TERM_FILL(RSTRING_PTR(str), termlen);
1978 rb_enc_associate(str, enc);
1982 rb_str_buf_append(str, orig);
/* is_utf8_lead_byte(c): true unless the top two bits of c are 10, i.e.
 * unless c has the form of a UTF-8 continuation byte (0x80..0xBF). */
1989#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
/*
 * Counts, for one machine word of UTF-8 data, the bytes that are not
 * continuation bytes.
 *
 * Bit trick on d (presumably loaded from *s; the load is not visible in
 * this excerpt): (d>>6) | (~d>>7) leaves, in the lowest bit of every byte,
 * "bit 6 set OR bit 7 clear", which is exactly the is_utf8_lead_byte()
 * condition.  Masking with NONASCII_MASK >> 7 (0x01 in every byte) keeps
 * only those bits, so the population count of d is the number of lead
 * bytes.  A popcount intrinsic is used when available; the fallback for
 * other builds is not fully visible here.
 */
2004static inline uintptr_t
2005count_utf8_lead_bytes_with_word(
const uintptr_t *s)
2010 d = (d>>6) | (~d>>7);
2011 d &= NONASCII_MASK >> 7;
2014#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2016 return rb_popcount_intptr(d);
2020# if SIZEOF_VOIDP == 8
2029enc_strlen(
const char *p,
const char *e, rb_encoding *enc,
int cr)
2035 long diff = (long)(e - p);
2041 if ((
int)
sizeof(uintptr_t) * 2 < e - p) {
2042 const uintptr_t *s, *t;
2043 const uintptr_t lowbits =
sizeof(uintptr_t) - 1;
2044 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2045 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2046 while (p < (
const char *)s) {
2047 if (is_utf8_lead_byte(*p))
len++;
2051 len += count_utf8_lead_bytes_with_word(s);
2054 p = (
const char *)s;
2057 if (is_utf8_lead_byte(*p))
len++;
2063 else if (rb_enc_asciicompat(enc)) {
2068 q = search_nonascii(p, e);
2074 p += rb_enc_fast_mbclen(p, e, enc);
2081 q = search_nonascii(p, e);
2087 p += rb_enc_mbclen(p, e, enc);
2094 for (c=0; p<e; c++) {
2095 p += rb_enc_mbclen(p, e, enc);
2110rb_enc_strlen_cr(
const char *p,
const char *e, rb_encoding *enc,
int *cr)
2118 long diff = (long)(e - p);
2121 else if (rb_enc_asciicompat(enc)) {
2125 q = search_nonascii(p, e);
2133 ret = rb_enc_precise_mbclen(p, e, enc);
2148 for (c=0; p<e; c++) {
2149 ret = rb_enc_precise_mbclen(p, e, enc);
/*
 * Character length of str.
 *  - If single_byte_optimizable(str), the byte length is returned directly.
 *  - enc defaults to str's own encoding when NULL is passed.
 *  - One path measures with rb_enc_strlen_cr(), which also reports a code
 *    range through cr; the other uses enc_strlen() with an already known
 *    cr.  The condition choosing between them, and what is done with n and
 *    cr afterwards, are not visible in this excerpt.
 */
2168str_strlen(
VALUE str, rb_encoding *enc)
2173 if (single_byte_optimizable(str))
return RSTRING_LEN(str);
2174 if (!enc) enc = STR_ENC_GET(str);
2175 p = RSTRING_PTR(str);
2176 e = RSTRING_END(str);
2180 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2185 return enc_strlen(p, e, enc, cr);
2192 return str_strlen(str, NULL);
2206 return LONG2NUM(str_strlen(str, NULL));
2218rb_str_bytesize(
VALUE str)
/*
 * Returns a Ruby boolean (via RBOOL) that is true when str's byte length
 * is 0.
 */
2236rb_str_empty(
VALUE str)
2238 return RBOOL(RSTRING_LEN(str) == 0);
2256 char *ptr1, *ptr2, *ptr3;
2261 enc = rb_enc_check_str(str1, str2);
2265 if (len1 > LONG_MAX - len2) {
2266 rb_raise(rb_eArgError,
"string size too big");
2268 str3 = str_new0(
rb_cString, 0, len1+len2, termlen);
2269 ptr3 = RSTRING_PTR(str3);
2270 memcpy(ptr3, ptr1, len1);
2271 memcpy(ptr3+len1, ptr2, len2);
2272 TERM_FILL(&ptr3[len1+len2], termlen);
2288 MAYBE_UNUSED(
char) *ptr1, *ptr2;
2291 int enc1 = rb_enc_get_index(str1);
2292 int enc2 = rb_enc_get_index(str2);
2297 else if (enc2 < 0) {
2300 else if (enc1 != enc2) {
2303 else if (len1 > LONG_MAX - len2) {
2336 rb_enc_copy(str2, str);
2341 rb_raise(rb_eArgError,
"negative argument");
2343 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2344 if (STR_EMBEDDABLE_P(
len, 1)) {
2346 memset(RSTRING_PTR(str2), 0,
len + 1);
2353 STR_SET_LEN(str2,
len);
2354 rb_enc_copy(str2, str);
2357 if (
len && LONG_MAX/
len < RSTRING_LEN(str)) {
2358 rb_raise(rb_eArgError,
"argument too big");
2361 len *= RSTRING_LEN(str);
2362 termlen = TERM_LEN(str);
2364 ptr2 = RSTRING_PTR(str2);
2366 n = RSTRING_LEN(str);
2367 memcpy(ptr2, RSTRING_PTR(str), n);
2368 while (n <=
len/2) {
2369 memcpy(ptr2 + n, ptr2, n);
2372 memcpy(ptr2 + n, ptr2,
len-n);
2374 STR_SET_LEN(str2,
len);
2375 TERM_FILL(&ptr2[
len], termlen);
2376 rb_enc_cr_str_copy_for_substr(str2, str);
2402 VALUE tmp = rb_check_array_type(arg);
2411rb_check_lockedtmp(
VALUE str)
2413 if (
FL_TEST(str, STR_TMPLOCK)) {
2419str_modifiable(
VALUE str)
2421 rb_check_lockedtmp(str);
2426str_dependent_p(
VALUE str)
2428 if (STR_EMBED_P(str) || !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
/*
 * First runs the str_modifiable() check on str, then returns the negation
 * of str_dependent_p(str).
 */
2437str_independent(
VALUE str)
2439 str_modifiable(str);
2440 return !str_dependent_p(str);
/*
 * Gives str its own buffer holding len content bytes and a termlen-byte
 * terminator.  The computation of capa (presumably from len and expand) and
 * the allocation of ptr are not visible in this excerpt.
 *
 * Visible behaviour:
 *  - If str is non-embedded but its embed capacity can hold capa + termlen
 *    bytes, the first branch is taken: it reads the current heap pointer
 *    and sets the length (the remaining steps of that branch are not
 *    shown).
 *  - Otherwise len bytes are copied from the old pointer into ptr.  The
 *    test "flags == STR_NOEMBED" singles out a heap buffer that is neither
 *    shared nor no-free (what is done with it is not visible; presumably it
 *    is released).  The string is then marked non-embedded, STR_SHARED and
 *    STR_NOFREE are cleared, the terminator is written, ptr is installed as
 *    the heap pointer and the length is set.
 */
2444str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen)
2452 if (!STR_EMBED_P(str) && str_embed_capa(str) >=
capa + termlen) {
2453 ptr =
RSTRING(str)->as.heap.ptr;
2457 STR_SET_LEN(str,
len);
2462 oldptr = RSTRING_PTR(str);
2464 memcpy(ptr, oldptr,
len);
2466 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2469 STR_SET_NOEMBED(str);
2470 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2471 TERM_FILL(ptr +
len, termlen);
2472 RSTRING(str)->as.heap.ptr = ptr;
2473 STR_SET_LEN(str,
len);
2480 if (!str_independent(str))
2481 str_make_independent(str);
2488 int termlen = TERM_LEN(str);
2489 long len = RSTRING_LEN(str);
2492 rb_raise(rb_eArgError,
"negative expanding string size");
2494 if (expand >= LONG_MAX -
len) {
2495 rb_raise(rb_eArgError,
"string size too big");
2498 if (!str_independent(str)) {
2499 str_make_independent_expand(str,
len, expand, termlen);
2501 else if (expand > 0) {
2502 RESIZE_CAPA_TERM(str,
len + expand, termlen);
2509str_modify_keep_cr(
VALUE str)
2511 if (!str_independent(str))
2512 str_make_independent(str);
/*
 * Drops str's contents.  After the str_modifiable() check, a heap buffer
 * that str owns (not embedded, neither STR_SHARED nor STR_NOFREE) is
 * released with ruby_sized_xfree(), the heap pointer is cleared and the
 * length is set to 0.  Nothing is visible here for embedded, shared or
 * no-free strings.
 */
2519str_discard(
VALUE str)
2521 str_modifiable(str);
2522 if (!STR_EMBED_P(str) && !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2523 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2524 RSTRING(str)->as.heap.ptr = 0;
2525 STR_SET_LEN(str, 0);
2532 rb_encoding *enc = rb_enc_get(str);
2536 if (!rb_enc_asciicompat(enc)) {
2556 return RSTRING_PTR(str);
2560zero_filled(
const char *s,
int n)
2562 for (; n > 0; --n) {
/*
 * Scans s..s+len one character at a time (advancing by rb_enc_mbclen) for
 * as long as a minlen-byte unit still fits before the end, and returns the
 * position of the first character whose first minlen bytes are all zero
 * according to zero_filled().  The value returned when none is found is
 * not visible in this excerpt.
 */
2569str_null_char(
const char *s,
long len,
const int minlen, rb_encoding *enc)
2571 const char *e = s +
len;
2573 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2574 if (zero_filled(s, minlen))
return s;
/*
 * Makes sure a termlen-byte zero terminator follows the len content bytes
 * at s, and returns RSTRING_PTR(str).
 *
 * For a dependent string (str_dependent_p) the buffer is only made
 * independent when the bytes at s + len are not already zero, which avoids
 * writing into a buffer str does not own.  The TERM_FILL call belongs to
 * another branch whose header is not visible in this excerpt (presumably
 * the independent case).  The pointer is re-read at the end rather than
 * returning s, presumably because making the string independent can
 * replace the buffer.
 */
2580str_fill_term(
VALUE str,
char *s,
long len,
int termlen)
2585 if (str_dependent_p(str)) {
2586 if (!zero_filled(s +
len, termlen))
2587 str_make_independent_expand(str,
len, 0L, termlen);
2590 TERM_FILL(s +
len, termlen);
2593 return RSTRING_PTR(str);
/*
 * Adjusts str after its terminator length changes from oldtermlen to
 * termlen.
 *
 * Visible behaviour:
 *  - capa is the total room currently available, terminator included
 *    (str_capacity(str, oldtermlen) + oldtermlen);
 *  - in the first branch (its condition is not visible in this excerpt) the
 *    temp-lock check is run and the string is re-made with room for the new
 *    terminator;
 *  - else, a dependent string is made independent only when the terminator
 *    grows;
 *  - a later branch for non-embedded strings asserts the string is not
 *    STR_SHARED (the rest of that branch is not shown);
 *  - when the terminator grows, the new terminator is written after the
 *    content.
 */
2597rb_str_change_terminator_length(
VALUE str,
const int oldtermlen,
const int termlen)
2599 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2600 long len = RSTRING_LEN(str);
2604 rb_check_lockedtmp(str);
2605 str_make_independent_expand(str,
len, 0L, termlen);
2607 else if (str_dependent_p(str)) {
2608 if (termlen > oldtermlen)
2609 str_make_independent_expand(str,
len, 0L, termlen);
2612 if (!STR_EMBED_P(str)) {
2614 assert(!
FL_TEST((str), STR_SHARED));
2617 if (termlen > oldtermlen) {
2618 TERM_FILL(RSTRING_PTR(str) +
len, termlen);
/*
 * Check str for an embedded NUL and make sure it is terminated.
 *
 * Two detection paths are visible: str_null_char() with minlen and the
 * string's encoding, and a plain memchr() for a zero byte (which also
 * treats a NULL data pointer as a hit).  On the visible non-hit paths
 * the terminator is ensured with str_fill_term().
 * NOTE(review): the declaration of minlen, what is stored through w and
 * the return value on a hit are not visible here.
 */
2626str_null_check(
VALUE str,
int *w)
2628 char *s = RSTRING_PTR(str);
2629 long len = RSTRING_LEN(str);
2630 rb_encoding *enc = rb_enc_get(str);
2635 if (str_null_char(s,
len, minlen, enc)) {
2638 return str_fill_term(str, s,
len, minlen);
2641 if (!s || memchr(s, 0,
len)) {
2645 s = str_fill_term(str, s,
len, minlen);
/*
 * Return the result of str_null_check() for str.  The value written
 * through &w is not used by the visible lines (the declaration of w is
 * not shown).
 */
2651rb_str_to_cstr(
VALUE str)
2654 return str_null_check(str, &w);
2662 char *s = str_null_check(str, &w);
2665 rb_raise(rb_eArgError,
"string contains null char");
2667 rb_raise(rb_eArgError,
"string contains null byte");
/*
 * Ensure str ends with a terminator of newminlen bytes by delegating to
 * str_fill_term() with the string's current data pointer and length,
 * and return the pointer that str_fill_term() yields.
 */
2673rb_str_fill_terminator(
VALUE str,
const int newminlen)
2675 char *s = RSTRING_PTR(str);
2676 long len = RSTRING_LEN(str);
2677 return str_fill_term(str, s,
len, newminlen);
2683 str = rb_check_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
2707str_nth_len(
const char *p,
const char *e,
long *nthp, rb_encoding *enc)
2716 else if (rb_enc_asciicompat(enc)) {
2717 const char *p2, *e2;
2720 while (p < e && 0 < nth) {
2727 p2 = search_nonascii(p, e2);
2736 n = rb_enc_mbclen(p, e, enc);
2747 while (p < e && nth--) {
2748 p += rb_enc_mbclen(p, e, enc);
/*
 * Thin wrapper around str_nth_len(): forwards p, e and enc and passes
 * the address of the by-value nth parameter, so whatever str_nth_len()
 * writes back through that pointer is discarded.
 */
2757rb_enc_nth(
const char *p,
const char *e,
long nth, rb_encoding *enc)
2759 return str_nth_len(p, e, &nth, enc);
2763str_nth(
const char *p,
const char *e,
long nth, rb_encoding *enc,
int singlebyte)
2768 p = str_nth_len(p, e, &nth, enc);
/*
 * Convert a character index nth within [p, e) into a byte offset.
 *
 * The position is looked up with str_nth(); when that returns NULL the
 * length of the whole span (e - p) is returned.  The return for the
 * found case is not visible here.
 */
2777str_offset(
const char *p,
const char *e,
long nth, rb_encoding *enc,
int singlebyte)
2779 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2780 if (!pp)
return e - p;
2787 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2788 STR_ENC_GET(str), single_byte_optimizable(str));
2793str_utf8_nth(
const char *p,
const char *e,
long *nthp)
2796 if ((
int)SIZEOF_VOIDP * 2 < e - p && (
int)SIZEOF_VOIDP * 2 < nth) {
2797 const uintptr_t *s, *t;
2798 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2799 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2800 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2801 while (p < (
const char *)s) {
2802 if (is_utf8_lead_byte(*p)) nth--;
2806 nth -= count_utf8_lead_bytes_with_word(s);
2808 }
while (s < t && (
int)SIZEOF_VOIDP <= nth);
2812 if (is_utf8_lead_byte(*p)) {
2813 if (nth == 0)
break;
2823str_utf8_offset(
const char *p,
const char *e,
long nth)
2825 const char *pp = str_utf8_nth(p, e, &nth);
2834 if (single_byte_optimizable(str) || pos < 0)
2837 char *p = RSTRING_PTR(str);
2838 return enc_strlen(p, p + pos, STR_ENC_GET(str),
ENC_CODERANGE(str));
/*
 * Build str2 from len bytes of str starting at byte offset beg.
 * The caller must guarantee beg + len <= RSTRING_LEN(str) (asserted).
 *
 * Visible strategies:
 *  - when the bytes plus terminator fit into str2's embed capacity, they
 *    are copied into the embedded array and terminated with TERM_FILL();
 *  - otherwise str2 is set up via str_replace_shared() and its heap
 *    pointer is advanced by beg, with the length trimmed down to len.
 * NOTE(review): the creation of str2 and the exact nesting of these
 * branches under SHARABLE_SUBSTRING_P are not visible here.
 */
2843str_subseq(
VALUE str,
long beg,
long len)
2849 assert(beg+
len <= RSTRING_LEN(str));
2851 const int termlen = TERM_LEN(str);
2852 if (!SHARABLE_SUBSTRING_P(beg,
len, RSTRING_LEN(str))) {
2859 if (str_embed_capa(str2) >=
len + termlen) {
2860 char *ptr2 =
RSTRING(str2)->as.embed.ary;
2861 STR_SET_EMBED(str2);
2862 memcpy(ptr2, RSTRING_PTR(str) + beg,
len);
2863 TERM_FILL(ptr2+
len, termlen);
2865 STR_SET_LEN(str2,
len);
2869 str_replace_shared(str2, str);
2870 assert(!STR_EMBED_P(str2));
/* Point the heap pointer at the requested start inside the buffer. */
2872 RSTRING(str2)->as.heap.ptr += beg;
2873 if (RSTRING_LEN(str2) >
len) {
2874 STR_SET_LEN(str2,
len);
2884 VALUE str2 = str_subseq(str, beg,
len);
2885 rb_enc_cr_str_copy_for_substr(str2, str);
2894 const long blen = RSTRING_LEN(str);
2895 rb_encoding *enc = STR_ENC_GET(str);
2896 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2898 if (
len < 0)
return 0;
2899 if (beg < 0 && -beg < 0)
return 0;
2903 if (single_byte_optimizable(str)) {
2904 if (beg > blen)
return 0;
2907 if (beg < 0)
return 0;
2909 if (
len > blen - beg)
2911 if (
len < 0)
return 0;
2916 if (
len > -beg)
len = -beg;
2920 while (beg-- >
len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2923 while (
len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2929 slen = str_strlen(str, enc);
2931 if (beg < 0)
return 0;
2933 if (
len == 0)
goto end;
2936 else if (beg > 0 && beg > blen) {
2940 if (beg > str_strlen(str, enc))
return 0;
2945 enc == rb_utf8_encoding()) {
2946 p = str_utf8_nth(s, e, &beg);
2947 if (beg > 0)
return 0;
2948 len = str_utf8_offset(p, e,
len);
2954 p = s + beg * char_sz;
2958 else if (
len * char_sz > e - p)
2963 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2964 if (beg > 0)
return 0;
2968 len = str_offset(p, e,
len, enc, 0);
/* Forward declaration: str_substr() is called before its definition. */
2976static VALUE str_substr(
VALUE str,
long beg,
long len,
int empty);
2981 return str_substr(str, beg,
len, TRUE);
2985str_substr(
VALUE str,
long beg,
long len,
int empty)
2989 if (!p)
return Qnil;
2990 if (!
len && !empty)
return Qnil;
2992 beg = p - RSTRING_PTR(str);
2994 VALUE str2 = str_subseq(str, beg,
len);
2995 rb_enc_cr_str_copy_for_substr(str2, str);
3004 rb_str_resize(str, RSTRING_LEN(str));
3005 return rb_obj_freeze(str);
3021 return rb_str_dup(str);
/*
 * Return rb_fstring() of str.  On one path str is first replaced by
 * rb_str_dup(str); the condition guarding that duplication is not
 * visible here.
 */
3050str_uminus(
VALUE str)
3053 str = rb_str_dup(str);
3055 return rb_fstring(str);
/* rb_str_dup_frozen is an alias for rb_str_new_frozen (the name is
 * #undef'd near the top of this file before being redefined here). */
3059#define rb_str_dup_frozen rb_str_new_frozen
3064 if (
FL_TEST(str, STR_TMPLOCK)) {
3067 FL_SET(str, STR_TMPLOCK);
3074 if (!
FL_TEST(str, STR_TMPLOCK)) {
3081RUBY_FUNC_EXPORTED
VALUE
3092 const int termlen = TERM_LEN(str);
3094 str_modifiable(str);
3095 if (STR_SHARED_P(str)) {
3098 if (
len > (
capa = (
long)str_capacity(str, termlen)) ||
len < 0) {
3099 rb_bug(
"probable buffer overflow: %ld for %ld",
len,
capa);
3106 else if (
len > RSTRING_LEN(str)) {
3109 const char *
const prev_end = RSTRING_END(str);
3110 const char *
const new_end = RSTRING_PTR(str) +
len;
3111 rb_encoding *enc = rb_enc_get(str);
3120 else if (
len < RSTRING_LEN(str)) {
3128 STR_SET_LEN(str,
len);
3129 TERM_FILL(&RSTRING_PTR(str)[
len], termlen);
3136 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3139 int independent = str_independent(str);
3140 long slen = RSTRING_LEN(str);
3148 const int termlen = TERM_LEN(str);
3149 if (STR_EMBED_P(str)) {
3150 if (
len == slen)
return str;
3151 if (str_embed_capa(str) >=
len + termlen) {
3152 STR_SET_LEN(str,
len);
3156 str_make_independent_expand(str, slen,
len - slen, termlen);
3158 else if (str_embed_capa(str) >=
len + termlen) {
3159 char *ptr = STR_HEAP_PTR(str);
3161 if (slen >
len) slen =
len;
3164 STR_SET_LEN(str,
len);
3165 if (independent) ruby_xfree(ptr);
3168 else if (!independent) {
3169 if (
len == slen)
return str;
3170 str_make_independent_expand(str, slen,
len - slen, termlen);
3174 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
3175 (
size_t)
len + termlen, STR_HEAP_SIZE(str));
3178 else if (
len == slen)
return str;
3179 STR_SET_LEN(str,
len);
/*
 * Append len bytes at ptr to str, growing the buffer as needed.
 *
 * Visible steps:
 *  - str_modify_keep_cr() is called on one path (NOTE(review): how it is
 *    selected by keep_cr is not visible);
 *  - len == 0 returns early;
 *  - ptr is tested against str's own buffer [sptr, sptr + olen]; off
 *    starts at -1 (presumably it records the offset so ptr can be
 *    recomputed after reallocation -- confirm);
 *  - olen > LONG_MAX - len raises ArgumentError "string sizes too big";
 *  - capacity is raised in a loop until it covers total, then applied
 *    with RESIZE_CAPA_TERM() and sptr is re-read;
 *  - the bytes are copied after the old content, the length is set to
 *    total and the terminator is rewritten with TERM_FILL().
 */
3186str_buf_cat4(
VALUE str,
const char *ptr,
long len,
bool keep_cr)
3189 str_modify_keep_cr(str);
3194 if (
len == 0)
return 0;
3196 long total, olen,
off = -1;
3198 const int termlen = TERM_LEN(str);
3201 if (ptr >= sptr && ptr <= sptr + olen) {
3205 long capa = str_capacity(str, termlen);
3207 if (olen > LONG_MAX -
len) {
3208 rb_raise(rb_eArgError,
"string sizes too big");
3212 if (total >= LONG_MAX / 2) {
3215 while (total >
capa) {
3218 RESIZE_CAPA_TERM(str,
capa, termlen);
3219 sptr = RSTRING_PTR(str);
3224 memcpy(sptr + olen, ptr,
len);
3225 STR_SET_LEN(str, total);
3226 TERM_FILL(sptr + total, termlen);
/*
 * Convenience wrappers around str_buf_cat4() that always pass false for
 * its keep_cr argument.  str_buf_cat2() derives the length from ptr with
 * rb_strlen_lit() (presumably ptr must be a string literal -- confirm).
 */
3231#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3232#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3237 if (
len == 0)
return str;
3239 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3241 return str_buf_cat(str, ptr,
len);
3256rb_enc_cr_str_buf_cat(
VALUE str,
const char *ptr,
long len,
3257 int ptr_encindex,
int ptr_cr,
int *ptr_cr_ret)
3262 rb_encoding *str_enc, *ptr_enc;
3266 if (str_encindex == ptr_encindex) {
3268 ptr_cr = coderange_scan(ptr,
len, rb_enc_from_index(ptr_encindex));
3272 str_enc = rb_enc_from_index(str_encindex);
3273 ptr_enc = rb_enc_from_index(ptr_encindex);
3274 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3277 if (RSTRING_LEN(str) == 0) {
3286 ptr_cr = coderange_scan(ptr,
len, ptr_enc);
3290 str_cr = rb_enc_str_coderange(str);
3295 *ptr_cr_ret = ptr_cr;
3297 if (str_encindex != ptr_encindex &&
3300 str_enc = rb_enc_from_index(str_encindex);
3301 ptr_enc = rb_enc_from_index(ptr_encindex);
3306 res_encindex = str_encindex;
3311 res_encindex = str_encindex;
3315 res_encindex = ptr_encindex;
3320 res_encindex = str_encindex;
3327 res_encindex = str_encindex;
3333 rb_raise(rb_eArgError,
"negative string size (or size too big)");
3335 str_buf_cat(str, ptr,
len);
3341 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3346rb_enc_str_buf_cat(
VALUE str,
const char *ptr,
long len, rb_encoding *ptr_enc)
3348 return rb_enc_cr_str_buf_cat(str, ptr,
len,
3357 rb_encoding *enc = rb_enc_from_index(encindex);
3358 if (rb_enc_asciicompat(enc)) {
3359 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3365 unsigned int c = (
unsigned char)*ptr;
3366 int len = rb_enc_codelen(c, enc);
3367 rb_enc_mbcput(c, buf, enc);
3368 rb_enc_cr_str_buf_cat(str, buf,
len,
3379 int str2_cr = rb_enc_str_coderange(str2);
3381 if (str_enc_fastpath(str)) {
3385 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3391 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
true);
3402 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3414 return rb_str_buf_append(str, str2);
3418rb_str_concat_literals(
size_t num,
const VALUE *strary)
3422 unsigned long len = 1;
3427 for (i = 0; i < num; ++i) {
len += RSTRING_LEN(strary[i]); }
3428 str = rb_str_buf_new(
len);
3429 str_enc_copy_direct(str, strary[0]);
3431 for (i = s; i < num; ++i) {
3432 const VALUE v = strary[i];
3435 rb_str_buf_append(str, v);
3436 if (encidx != ENCINDEX_US_ASCII) {
3438 rb_enc_set_index(str, encidx);
3463rb_str_concat_multi(
int argc,
VALUE *argv,
VALUE str)
3465 str_modifiable(str);
3470 else if (argc > 1) {
3473 rb_enc_copy(arg_str, str);
3474 for (i = 0; i < argc; i++) {
3477 rb_str_buf_append(str, arg_str);
3505 rb_encoding *enc = STR_ENC_GET(str1);
3509 if (rb_num_to_uint(str2, &code) == 0) {
3522 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3525 buf[0] = (char)code;
3526 rb_str_cat(str1, buf, 1);
3527 if (encidx != rb_enc_to_index(enc)) {
3528 rb_enc_associate_index(str1, encidx);
3533 long pos = RSTRING_LEN(str1);
3538 switch (
len = rb_enc_codelen(code, enc)) {
3539 case ONIGERR_INVALID_CODE_POINT_VALUE:
3540 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3542 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3548 rb_enc_mbcput(code, buf, enc);
3549 if (rb_enc_precise_mbclen(buf, buf +
len + 1, enc) !=
len) {
3550 rb_raise(
rb_eRangeError,
"invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3552 rb_str_resize(str1, pos+
len);
3553 memcpy(RSTRING_PTR(str1) + pos, buf,
len);
/*
 * Decide which encoding index to use when appending the byte value code
 * to a string whose encoding is enc.
 *
 * Only ASCII-8BIT and US-ASCII enter the visible branch; for US-ASCII
 * with code > 127 the result is ENCINDEX_ASCII_8BIT.  The other return
 * values (and any range check on code) are not visible here.
 */
3566rb_ascii8bit_appendable_encoding_index(rb_encoding *enc,
unsigned int code)
3568 int encidx = rb_enc_to_index(enc);
3570 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3575 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3576 return ENCINDEX_ASCII_8BIT;
3599rb_str_prepend_multi(
int argc,
VALUE *argv,
VALUE str)
3601 str_modifiable(str);
3606 else if (argc > 1) {
3609 rb_enc_copy(arg_str, str);
3610 for (i = 0; i < argc; i++) {
3622 st_index_t h =
rb_memhash((
const void *)RSTRING_PTR(str), RSTRING_LEN(str));
3624 if (e && !is_ascii_string(str)) {
3634 const char *ptr1, *ptr2;
3637 return (len1 != len2 ||
3639 memcmp(ptr1, ptr2, len1) != 0);
3653rb_str_hash_m(
VALUE str)
/* Smaller of a and b (a when they are equal).  Each argument appears
 * twice in the expansion, so avoid arguments with side effects. */
3659#define lesser(a,b) (((a)>(b))?(b):(a))
3667 if (RSTRING_LEN(str1) == 0)
return TRUE;
3668 if (RSTRING_LEN(str2) == 0)
return TRUE;
3671 if (idx1 == idx2)
return TRUE;
3672 rc1 = rb_enc_str_coderange(str1);
3673 rc2 = rb_enc_str_coderange(str2);
3676 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3680 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3690 const char *ptr1, *ptr2;
3693 if (str1 == str2)
return 0;
3696 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3705 if (len1 > len2)
return 1;
3708 if (retval > 0)
return 1;
3735 if (str1 == str2)
return Qtrue;
3742 return rb_str_eql_internal(str1, str2);
3766 if (str1 == str2)
return Qtrue;
3768 return rb_str_eql_internal(str1, str2);
3799 return rb_invcmp(str1, str2);
3841 return str_casecmp(str1, s);
3849 const char *p1, *p1end, *p2, *p2end;
3851 enc = rb_enc_compatible(str1, str2);
3856 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3857 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3858 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3859 while (p1 < p1end && p2 < p2end) {
3861 unsigned int c1 =
TOLOWER(*p1 & 0xff);
3862 unsigned int c2 =
TOLOWER(*p2 & 0xff);
3864 return INT2FIX(c1 < c2 ? -1 : 1);
3871 while (p1 < p1end && p2 < p2end) {
3872 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3873 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3875 if (0 <= c1 && 0 <= c2) {
3879 return INT2FIX(c1 < c2 ? -1 : 1);
3883 l1 = rb_enc_mbclen(p1, p1end, enc);
3884 l2 = rb_enc_mbclen(p2, p2end, enc);
3885 len = l1 < l2 ? l1 : l2;
3886 r = memcmp(p1, p2,
len);
3888 return INT2FIX(r < 0 ? -1 : 1);
3890 return INT2FIX(l1 < l2 ? -1 : 1);
3896 if (RSTRING_LEN(str1) == RSTRING_LEN(str2))
return INT2FIX(0);
3897 if (RSTRING_LEN(str1) > RSTRING_LEN(str2))
return INT2FIX(1);
3931 return str_casecmp_p(str1, s);
3938 VALUE folded_str1, folded_str2;
3939 VALUE fold_opt = sym_fold;
3941 enc = rb_enc_compatible(str1, str2);
3946 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3947 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3949 return rb_str_eql(folded_str1, folded_str2);
/*
 * Core of the substring search: find sub_ptr/sub_len inside the string,
 * starting offset bytes in, and return the byte index of the hit.
 *
 * Candidates come from rb_memsearch(); a negative result is returned as
 * is.  A candidate is accepted when t equals the candidate position
 * (NOTE(review): t is assigned on a line not visible here; it looks like
 * a character-boundary-adjusted pointer -- confirm).  Otherwise the
 * search window shrinks by the skipped bytes, offset advances by the
 * same amount, and -1 is returned once the window is exhausted.  The
 * final result is pos + offset.
 */
3953strseq_core(
const char *str_ptr,
const char *str_ptr_end,
long str_len,
3954 const char *sub_ptr,
long sub_len,
long offset, rb_encoding *enc)
3956 const char *search_start = str_ptr;
3957 long pos, search_len = str_len - offset;
3961 pos =
rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3962 if (pos < 0)
return pos;
3964 if (t == search_start + pos)
break;
3965 search_len -= t - search_start;
3966 if (search_len <= 0)
return -1;
3967 offset += t - search_start;
3970 return pos + offset;
/* Both names map onto rb_strseq_index(); they differ only in the
 * in_byte argument: 0 for rb_str_index, 1 for rb_str_byteindex. */
3974#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3975#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
/*
 * Find sub inside str starting at offset and return the index of the
 * match, or -1.
 *
 * in_byte selects the unit of offset: non-zero means bytes, zero means
 * characters (lengths are then taken with str_strlen() unless str is
 * single-byte optimizable, and offset is converted to bytes with
 * str_offset() before searching).
 *
 * Visible early exits returning -1: sub is a broken string, str is
 * shorter than sub in bytes, the adjusted offset is still negative, or
 * fewer units remain after offset than sub needs.  An empty sub returns
 * offset itself.  Everything else is delegated to strseq_core().
 */
3978rb_strseq_index(
VALUE str,
VALUE sub,
long offset,
int in_byte)
3980 const char *str_ptr, *str_ptr_end, *sub_ptr;
3981 long str_len, sub_len;
3984 enc = rb_enc_check(str, sub);
3985 if (is_broken_string(sub))
return -1;
3987 str_ptr = RSTRING_PTR(str);
3988 str_ptr_end = RSTRING_END(str);
3989 str_len = RSTRING_LEN(str);
3990 sub_ptr = RSTRING_PTR(sub);
3991 sub_len = RSTRING_LEN(sub);
3993 if (str_len < sub_len)
return -1;
3996 long str_len_char, sub_len_char;
3997 int single_byte = single_byte_optimizable(str);
3998 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3999 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
/* NOTE(review): the guard selecting this adjustment is not visible;
 * it looks like the handling of a negative offset. */
4001 offset += str_len_char;
4002 if (offset < 0)
return -1;
4004 if (str_len_char - offset < sub_len_char)
return -1;
4005 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4008 if (sub_len == 0)
return offset;
4011 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4025rb_str_index_m(
int argc,
VALUE *argv,
VALUE str)
4029 rb_encoding *enc = STR_ENC_GET(str);
4032 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4033 long slen = str_strlen(str, enc);
4035 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4047 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4048 enc, single_byte_optimizable(str));
4059 pos = rb_str_index(str, sub, pos);
/*
 * Validate that byte offset pos inside str falls on a character
 * boundary, as judged by at_char_boundary() with the string's encoding.
 * When it does not, an error using the message
 * "offset %ld does not land on character boundary" is produced
 * (NOTE(review): the raising call itself is on a line not visible here).
 */
4073str_ensure_byte_pos(
VALUE str,
long pos)
4075 const char *s = RSTRING_PTR(str);
4076 const char *e = RSTRING_END(str);
4077 const char *p = s + pos;
4078 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4080 "offset %ld does not land on character boundary", pos);
4126rb_str_byteindex_m(
int argc,
VALUE *argv,
VALUE str)
4132 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4133 long slen = RSTRING_LEN(str);
4135 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4146 str_ensure_byte_pos(str, pos);
4158 pos = rb_str_byteindex(str, sub, pos);
4159 if (pos >= 0)
return LONG2NUM(pos);
4166str_rindex(
VALUE str,
VALUE sub,
const char *s, rb_encoding *enc)
4168 char *hit, *adjusted;
4170 long slen, searchlen;
4173 sbeg = RSTRING_PTR(str);
4174 slen = RSTRING_LEN(sub);
4175 if (slen == 0)
return s - sbeg;
4176 e = RSTRING_END(str);
4177 t = RSTRING_PTR(sub);
4179 searchlen = s - sbeg + 1;
4182 hit = memrchr(sbeg, c, searchlen);
4185 if (hit != adjusted) {
4186 searchlen = adjusted - sbeg;
4189 if (memcmp(hit, t, slen) == 0)
4191 searchlen = adjusted - sbeg;
4192 }
while (searchlen > 0);
4198str_rindex(
VALUE str,
VALUE sub,
const char *s, rb_encoding *enc)
4203 sbeg = RSTRING_PTR(str);
4204 e = RSTRING_END(str);
4205 t = RSTRING_PTR(sub);
4206 slen = RSTRING_LEN(sub);
4209 if (memcmp(s, t, slen) == 0) {
4212 if (s <= sbeg)
break;
4213 s = rb_enc_prev_char(sbeg, s, e, enc);
4229 enc = rb_enc_check(str, sub);
4230 if (is_broken_string(sub))
return -1;
4231 singlebyte = single_byte_optimizable(str);
4232 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc);
4233 slen = str_strlen(sub, enc);
4236 if (
len < slen)
return -1;
4237 if (
len - pos < slen) pos =
len - slen;
4238 if (
len == 0)
return pos;
4240 sbeg = RSTRING_PTR(str);
4243 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4249 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4250 return str_rindex(str, sub, s, enc);
4311rb_str_rindex_m(
int argc,
VALUE *argv,
VALUE str)
4315 rb_encoding *enc = STR_ENC_GET(str);
4316 long pos,
len = str_strlen(str, enc);
4318 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4320 if (pos < 0 && (pos +=
len) < 0) {
4326 if (pos >
len) pos =
len;
4334 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4335 enc, single_byte_optimizable(str));
4346 pos = rb_str_rindex(str, sub, pos);
4356rb_str_byterindex(
VALUE str,
VALUE sub,
long pos)
4362 enc = rb_enc_check(str, sub);
4363 if (is_broken_string(sub))
return -1;
4364 len = RSTRING_LEN(str);
4365 slen = RSTRING_LEN(sub);
4368 if (
len < slen)
return -1;
4369 if (
len - pos < slen) pos =
len - slen;
4370 if (
len == 0)
return pos;
4372 sbeg = RSTRING_PTR(str);
4375 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4382 return str_rindex(str, sub, s, enc);
4447rb_str_byterindex_m(
int argc,
VALUE *argv,
VALUE str)
4451 long pos,
len = RSTRING_LEN(str);
4453 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4455 if (pos < 0 && (pos +=
len) < 0) {
4461 if (pos >
len) pos =
len;
4467 str_ensure_byte_pos(str, pos);
4479 pos = rb_str_byterindex(str, sub, pos);
4480 if (pos >= 0)
return LONG2NUM(pos);
4516 switch (OBJ_BUILTIN_TYPE(y)) {
/*
 * Method entry taking (argc, argv, str): turns re into a pattern with
 * get_pat() and calls its "match" method via rb_funcallv(), forwarding
 * argc/argv.  NOTE(review): how re and argv are prepared before the call
 * and what is done with result afterwards is not visible here.
 */
4568rb_str_match_m(
int argc,
VALUE *argv,
VALUE str)
4575 result = rb_funcallv(get_pat(re), rb_intern(
"match"), argc, argv);
/*
 * Method entry taking (argc, argv, str): converts argv[0] to a pattern
 * with get_pat() and returns rb_reg_match_p() for it against str.  The
 * third argument is NUM2LONG(argv[1]) when a second argument was given
 * and 0 otherwise.
 */
4607rb_str_match_m_p(
int argc,
VALUE *argv,
VALUE str)
4611 re = get_pat(argv[0]);
4612 return rb_reg_match_p(re, str, argc > 1 ?
NUM2LONG(argv[1]) : 0);
4621static enum neighbor_char
4622enc_succ_char(
char *p,
long len, rb_encoding *enc)
4629 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
4631 return NEIGHBOR_NOT_CHAR;
4635 if (!l)
return NEIGHBOR_NOT_CHAR;
4636 if (l !=
len)
return NEIGHBOR_WRAPPED;
4637 rb_enc_mbcput(c, p, enc);
4638 r = rb_enc_precise_mbclen(p, p +
len, enc);
4640 return NEIGHBOR_NOT_CHAR;
4642 return NEIGHBOR_FOUND;
4645 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0xff; i--)
4648 return NEIGHBOR_WRAPPED;
4649 ++((
unsigned char*)p)[i];
4650 l = rb_enc_precise_mbclen(p, p+
len, enc);
4654 return NEIGHBOR_FOUND;
4657 memset(p+l, 0xff,
len-l);
4663 for (len2 =
len-1; 0 < len2; len2--) {
4664 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4668 memset(p+len2+1, 0xff,
len-(len2+1));
4673static enum neighbor_char
4674enc_pred_char(
char *p,
long len, rb_encoding *enc)
4680 int r = rb_enc_precise_mbclen(p, p +
len, enc), c;
4682 return NEIGHBOR_NOT_CHAR;
4685 if (!c)
return NEIGHBOR_NOT_CHAR;
4688 if (!l)
return NEIGHBOR_NOT_CHAR;
4689 if (l !=
len)
return NEIGHBOR_WRAPPED;
4690 rb_enc_mbcput(c, p, enc);
4691 r = rb_enc_precise_mbclen(p, p +
len, enc);
4693 return NEIGHBOR_NOT_CHAR;
4695 return NEIGHBOR_FOUND;
4698 for (i =
len-1; 0 <= i && (
unsigned char)p[i] == 0; i--)
4701 return NEIGHBOR_WRAPPED;
4702 --((
unsigned char*)p)[i];
4703 l = rb_enc_precise_mbclen(p, p+
len, enc);
4707 return NEIGHBOR_FOUND;
4710 memset(p+l, 0,
len-l);
4716 for (len2 =
len-1; 0 < len2; len2--) {
4717 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4721 memset(p+len2+1, 0,
len-(len2+1));
4735static enum neighbor_char
4736enc_succ_alnum_char(
char *p,
long len, rb_encoding *enc,
char *carry)
4738 enum neighbor_char ret;
4742 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4746 const int max_gaps = 1;
4750 ctype = ONIGENC_CTYPE_DIGIT;
4752 ctype = ONIGENC_CTYPE_ALPHA;
4754 return NEIGHBOR_NOT_CHAR;
4757 for (
try = 0;
try <= max_gaps; ++
try) {
4758 ret = enc_succ_char(p,
len, enc);
4759 if (ret == NEIGHBOR_FOUND) {
4762 return NEIGHBOR_FOUND;
4769 ret = enc_pred_char(p,
len, enc);
4770 if (ret == NEIGHBOR_FOUND) {
4784 return NEIGHBOR_NOT_CHAR;
4787 if (ctype != ONIGENC_CTYPE_DIGIT) {
4789 return NEIGHBOR_WRAPPED;
4793 enc_succ_char(carry,
len, enc);
4794 return NEIGHBOR_WRAPPED;
4862 str =
rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4863 rb_enc_cr_str_copy_for_substr(str, orig);
4864 return str_succ(str);
4871 char *sbeg, *s, *e, *last_alnum = 0;
4872 int found_alnum = 0;
4874 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] =
"\1";
4875 long carry_pos = 0, carry_len = 1;
4876 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4878 slen = RSTRING_LEN(str);
4879 if (slen == 0)
return str;
4881 enc = STR_ENC_GET(str);
4882 sbeg = RSTRING_PTR(str);
4883 s = e = sbeg + slen;
4885 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4886 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4892 l = rb_enc_precise_mbclen(s, e, enc);
4893 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
4894 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4895 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4897 case NEIGHBOR_NOT_CHAR:
4899 case NEIGHBOR_FOUND:
4901 case NEIGHBOR_WRAPPED:
4906 carry_pos = s - sbeg;
4911 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4912 enum neighbor_char neighbor;
4913 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4914 l = rb_enc_precise_mbclen(s, e, enc);
4915 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
4916 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4918 neighbor = enc_succ_char(tmp, l, enc);
4920 case NEIGHBOR_FOUND:
4924 case NEIGHBOR_WRAPPED:
4927 case NEIGHBOR_NOT_CHAR:
4930 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4932 enc_succ_char(s, l, enc);
4934 if (!rb_enc_asciicompat(enc)) {
4935 MEMCPY(carry, s,
char, l);
4938 carry_pos = s - sbeg;
4942 RESIZE_CAPA(str, slen + carry_len);
4943 sbeg = RSTRING_PTR(str);
4944 s = sbeg + carry_pos;
4945 memmove(s + carry_len, s, slen - carry_pos);
4946 memmove(s, carry, carry_len);
4948 STR_SET_LEN(str, slen);
4950 rb_enc_str_coderange(str);
4963rb_str_succ_bang(
VALUE str)
4971all_digits_p(
const char *s,
long len)
5025 VALUE end, exclusive;
5029 return rb_str_upto_each(beg, end,
RTEST(exclusive), str_upto_i,
Qnil);
5035 VALUE current, after_end;
5042 enc = rb_enc_check(beg, end);
5043 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5045 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5046 char c = RSTRING_PTR(beg)[0];
5047 char e = RSTRING_PTR(end)[0];
5049 if (c > e || (excl && c == e))
return beg;
5051 if ((*each)(rb_enc_str_new(&c, 1, enc), arg))
break;
5052 if (!excl && c == e)
break;
5054 if (excl && c == e)
break;
5059 if (ascii &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
ISDIGIT(RSTRING_PTR(end)[0]) &&
5060 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5061 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5066 b = rb_str_to_inum(beg, 10, FALSE);
5067 e = rb_str_to_inum(end, 10, FALSE);
5071 rb_encoding *usascii = rb_usascii_encoding();
5074 if (excl && bi == ei)
break;
5075 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5080 ID op = excl ?
'<' : idLE;
5081 VALUE args[2], fmt = rb_fstring_lit(
"%.*d");
5086 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5087 b = rb_funcallv(b, succ, 0, 0);
5094 if (n > 0 || (excl && n == 0))
return beg;
5096 after_end = rb_funcallv(end, succ, 0, 0);
5101 next = rb_funcallv(current, succ, 0, 0);
5102 if ((*each)(current, arg))
break;
5103 if (
NIL_P(next))
break;
5107 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5122 if (is_ascii_string(beg) &&
ISDIGIT(RSTRING_PTR(beg)[0]) &&
5123 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5124 VALUE b, args[2], fmt = rb_fstring_lit(
"%.*d");
5126 b = rb_str_to_inum(beg, 10, FALSE);
5129 rb_encoding *usascii = rb_usascii_encoding();
5132 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5140 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5141 b = rb_funcallv(b, succ, 0, 0);
5147 VALUE next = rb_funcallv(current, succ, 0, 0);
5148 if ((*each)(current, arg))
break;
5151 if (RSTRING_LEN(current) == 0)
5162 if (!
rb_equal(str, *argp))
return 0;
5170 beg = rb_str_new_frozen(beg);
5172 end = rb_str_new_frozen(end);
5176 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5177 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5178 rb_enc_asciicompat(STR_ENC_GET(val))) {
5179 const char *bp = RSTRING_PTR(beg);
5180 const char *ep = RSTRING_PTR(end);
5181 const char *vp = RSTRING_PTR(val);
5182 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5183 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5191 if (b <= v && v < e)
return Qtrue;
5192 return RBOOL(!
RTEST(exclusive) && v == e);
5199 all_digits_p(bp, RSTRING_LEN(beg)) &&
5200 all_digits_p(ep, RSTRING_LEN(end))) {
5205 rb_str_upto_each(beg, end,
RTEST(exclusive), include_range_i, (
VALUE)&val);
5207 return RBOOL(
NIL_P(val));
5229 else if (RB_TYPE_P(indx,
T_REGEXP)) {
5230 return rb_str_subpat(str, indx,
INT2FIX(0));
5232 else if (RB_TYPE_P(indx,
T_STRING)) {
5233 if (rb_str_index(str, indx, 0) != -1)
5239 long beg,
len = str_strlen(str, NULL);
5251 return str_substr(str, idx, 1, FALSE);
/*
 * Method entry taking (argc, argv, str).
 *
 * Visible dispatch: when argv[0] is a Regexp the call goes to
 * rb_str_subpat() with argv[1]; the final path is the one-argument form
 * rb_str_aref(str, argv[0]).  NOTE(review): the argc tests that select
 * between these paths are not visible here.
 */
5270rb_str_aref_m(
int argc,
VALUE *argv,
VALUE str)
5273 if (RB_TYPE_P(argv[0],
T_REGEXP)) {
5274 return rb_str_subpat(str, argv[0], argv[1]);
5283 return rb_str_aref(str, argv[0]);
5289 char *ptr = RSTRING_PTR(str);
5290 long olen = RSTRING_LEN(str), nlen;
5292 str_modifiable(str);
5293 if (
len > olen)
len = olen;
5295 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5297 int fl = (int)(
RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5299 ptr =
RSTRING(str)->as.embed.ary;
5300 memmove(ptr, oldptr +
len, nlen);
5301 if (fl == STR_NOEMBED)
xfree(oldptr);
5304 if (!STR_SHARED_P(str)) {
5306 rb_enc_cr_str_exact_copy(shared, str);
5311 STR_SET_LEN(str, nlen);
5313 if (!SHARABLE_MIDDLE_SUBSTRING) {
5314 TERM_FILL(ptr + nlen, TERM_LEN(str));
5321rb_str_update_1(
VALUE str,
long beg,
long len,
VALUE val,
long vbeg,
long vlen)
5327 if (beg == 0 && vlen == 0) {
5332 str_modify_keep_cr(str);
5336 RESIZE_CAPA(str, slen + vlen -
len);
5337 sptr = RSTRING_PTR(str);
5341 cr = rb_enc_str_coderange(val);
5346 memmove(sptr + beg + vlen,
5348 slen - (beg +
len));
5350 if (vlen < beg &&
len < 0) {
5354 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5357 STR_SET_LEN(str, slen);
5358 TERM_FILL(&sptr[slen], TERM_LEN(str));
5365 rb_str_update_1(str, beg,
len, val, 0, RSTRING_LEN(val));
5374 int singlebyte = single_byte_optimizable(str);
5380 enc = rb_enc_check(str, val);
5381 slen = str_strlen(str, enc);
5383 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5390 assert(beg <= slen);
5391 if (
len > slen - beg) {
5394 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5395 if (!p) p = RSTRING_END(str);
5396 e = str_nth(p, RSTRING_END(str),
len, enc, singlebyte);
5397 if (!e) e = RSTRING_END(str);
5399 beg = p - RSTRING_PTR(str);
5401 rb_str_update_0(str, beg,
len, val);
5402 rb_enc_associate(str, enc);
5413 long start, end,
len;
5423 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5427 nth += regs->num_regs;
5437 enc = rb_enc_check_str(str, val);
5438 rb_str_update_0(str, start,
len, val);
5439 rb_enc_associate(str, enc);
5447 switch (
TYPE(indx)) {
5449 rb_str_subpat_set(str, indx,
INT2FIX(0), val);
5453 beg = rb_str_index(str, indx, 0);
/*
 * Method entry taking (argc, argv, str).
 *
 * Visible dispatch: when argv[0] is a Regexp, rb_str_subpat_set() is
 * called with argv[1] and argv[2]; the final path is
 * rb_str_aset(str, argv[0], argv[1]).  NOTE(review): the argc tests that
 * select between these paths are not visible here.
 */
5507rb_str_aset_m(
int argc,
VALUE *argv,
VALUE str)
5510 if (RB_TYPE_P(argv[0],
T_REGEXP)) {
5511 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5519 return rb_str_aset(str, argv[0], argv[1]);
5579rb_str_slice_bang(
int argc,
VALUE *argv,
VALUE str)
5587 str_modify_keep_cr(str);
5595 if ((nth += regs->num_regs) <= 0)
return Qnil;
5597 else if (nth >= regs->num_regs)
return Qnil;
5599 len = END(nth) - beg;
5602 else if (argc == 2) {
5611 beg = p - RSTRING_PTR(str);
5614 else if (RB_TYPE_P(indx,
T_STRING)) {
5615 beg = rb_str_index(str, indx, 0);
5616 if (beg == -1)
return Qnil;
5617 len = RSTRING_LEN(indx);
5629 beg = p - RSTRING_PTR(str);
5638 beg = p - RSTRING_PTR(str);
5642 rb_enc_cr_str_copy_for_substr(result, str);
5650 char *sptr = RSTRING_PTR(str);
5651 long slen = RSTRING_LEN(str);
5652 if (beg +
len > slen)
5656 slen - (beg +
len));
5658 STR_SET_LEN(str, slen);
5659 TERM_FILL(&sptr[slen], TERM_LEN(str));
5670 switch (OBJ_BUILTIN_TYPE(pat)) {
5689get_pat_quoted(
VALUE pat,
int check)
5693 switch (OBJ_BUILTIN_TYPE(pat)) {
5707 if (check && is_broken_string(pat)) {
5708 rb_exc_raise(rb_reg_check_preprocess(pat));
/*
 * Search str for pat starting at pos.
 *
 * Two paths are visible:
 *  - a byte-index search through rb_str_byteindex(); when
 *    set_backref_str is non-zero, a frozen copy of str is recorded with
 *    rb_backref_set_string() together with pos and pat's length;
 *  - a call to rb_reg_search0(), whose result is returned directly.
 * NOTE(review): presumably the first path is for String patterns and the
 * second for Regexp patterns; the selecting condition and the first
 * path's return are not visible here.
 */
5714rb_pat_search(
VALUE pat,
VALUE str,
long pos,
int set_backref_str)
5717 pos = rb_str_byteindex(str, pat, pos);
5718 if (set_backref_str) {
5720 str = rb_str_new_frozen_String(str);
5721 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5730 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5750rb_str_sub_bang(
int argc,
VALUE *argv,
VALUE str)
5764 hash = rb_check_hash_type(argv[1]);
5770 pat = get_pat_quoted(argv[0], 1);
5772 str_modifiable(str);
5773 beg = rb_pat_search(pat, str, 0, 1);
5787 end0 = beg0 + RSTRING_LEN(pat);
5796 if (iter || !
NIL_P(hash)) {
5797 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
5800 repl = rb_obj_as_string(
rb_yield(match0));
5803 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5804 repl = rb_obj_as_string(repl);
5806 str_mod_check(str, p,
len);
5813 enc = rb_enc_compatible(str, repl);
5815 rb_encoding *str_enc = STR_ENC_GET(str);
5816 p = RSTRING_PTR(str);
len = RSTRING_LEN(str);
5820 rb_enc_name(str_enc),
5821 rb_enc_name(STR_ENC_GET(repl)));
5823 enc = STR_ENC_GET(repl);
5826 rb_enc_associate(str, enc);
5836 rlen = RSTRING_LEN(repl);
5837 len = RSTRING_LEN(str);
5839 RESIZE_CAPA(str,
len + rlen - plen);
5841 p = RSTRING_PTR(str);
5843 memmove(p + beg0 + rlen, p + beg0 + plen,
len - beg0 - plen);
5845 rp = RSTRING_PTR(repl);
5846 memmove(p + beg0, rp, rlen);
5848 STR_SET_LEN(str,
len);
5849 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
5878 rb_str_sub_bang(argc, argv, str);
/*
 * str_gsub: shared implementation of global substitution.
 *
 * argc/argv - pattern (argv[0]) and optional replacement/hash (argv[1]);
 *             any other arity is rejected via rb_error_arity(argc, 1, 2).
 * str       - the receiver.
 * bang      - non-zero for the in-place variant.
 *
 * The result is accumulated in a separate buffer `dest`: for every match the
 * text between the previous offset and the match is appended, followed by the
 * replacement value.  The replacement comes from one of three modes
 * (STR, ITER, MAP).
 */
5883str_gsub(
int argc,
VALUE *argv,
VALUE str,
int bang)
5886 long beg, beg0, end0;
5887 long offset, blen, slen,
len, last;
5888 enum {STR, ITER, MAP} mode = STR;
/* -1 means "not yet decided"; resolved below once a replacement is computed. */
5890 int need_backref = -1;
5891 rb_encoding *str_enc;
5900 hash = rb_check_hash_type(argv[1]);
5909 rb_error_arity(argc, 1, 2);
5912 pat = get_pat_quoted(argv[0], 1);
5913 beg = rb_pat_search(pat, str, 0, need_backref);
/* NOTE(review): presumably the no-match exit for the bang variant - confirm. */
5915 if (bang)
return Qnil;
/* The destination buffer starts 30 bytes larger than the source. */
5920 blen = RSTRING_LEN(str) + 30;
5921 dest = rb_str_buf_new(blen);
/* Snapshot pointer and length for the later str_mod_check(). */
5922 sp = RSTRING_PTR(str);
5923 slen = RSTRING_LEN(str);
5925 str_enc = STR_ENC_GET(str);
5926 rb_enc_associate(dest, str_enc);
5934 end0 = beg0 + RSTRING_LEN(pat);
5945 val = rb_obj_as_string(
rb_yield(match0));
5948 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5949 val = rb_obj_as_string(val);
5951 str_mod_check(str, sp, slen);
5956 else if (need_backref) {
5958 if (need_backref < 0) {
/* A backref is needed only if the computed value differs from repl. */
5959 need_backref = val != repl;
/* Copy the unmatched text that precedes this match. */
5966 len = beg0 - offset;
5968 rb_enc_str_buf_cat(dest, cp,
len, str_enc);
5971 rb_str_buf_append(dest, val);
5980 if (RSTRING_LEN(str) <= end0)
break;
/*
 * Copy the single character at end0 verbatim and resume after it.
 * NOTE(review): presumably the empty-match case - confirm.
 */
5981 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5982 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0,
len, str_enc);
5983 offset = end0 +
len;
5985 cp = RSTRING_PTR(str) + offset;
5986 if (offset > RSTRING_LEN(str))
break;
5987 beg = rb_pat_search(pat, str, offset, need_backref);
/* Append whatever remains of str after the last processed offset. */
5991 if (RSTRING_LEN(str) > offset) {
5992 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
/* Repeat the search at `last` with the backref flag forced on. */
5994 rb_pat_search(pat, str, last, 1);
5996 str_shared_replace(str, dest);
/*
 * rb_str_gsub_bang: in-place global substitution.  Calls
 * str_modify_keep_cr() on the receiver and then delegates to str_gsub()
 * with bang = 1.
 */
6024rb_str_gsub_bang(
int argc,
VALUE *argv,
VALUE str)
6026 str_modify_keep_cr(str);
6027 return str_gsub(argc, argv, str, 1);
6050 return str_gsub(argc, argv, str, 0);
6068 str_modifiable(str);
6069 if (str == str2)
return str;
6073 return str_replace(str, str2);
/*
 * rb_str_clear: empties `str` by setting its length to 0 and writing a
 * zero byte at the start of its buffer.
 */
6088rb_str_clear(
VALUE str)
6092 STR_SET_LEN(str, 0);
6093 RSTRING_PTR(str)[0] = 0;
/* What follows depends on whether the encoding is ASCII-compatible. */
6094 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6113rb_str_chr(
VALUE str)
6137 pos += RSTRING_LEN(str);
6138 if (pos < 0 || RSTRING_LEN(str) <= pos)
6141 return INT2FIX((
unsigned char)RSTRING_PTR(str)[pos]);
6160 long len = RSTRING_LEN(str);
6161 char *ptr, *head, *left = 0;
6165 if (pos < -
len ||
len <= pos)
6172 char byte = (char)(
NUM2INT(w) & 0xFF);
6174 if (!str_independent(str))
6175 str_make_independent(str);
6176 enc = STR_ENC_GET(str);
6177 head = RSTRING_PTR(str);
6179 if (!STR_EMBED_P(str)) {
6186 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
6194 width = rb_enc_precise_mbclen(left, head+
len, enc);
6196 nlen = rb_enc_precise_mbclen(left, head+
len, enc);
/*
 * str_byte_substr: returns the byte range [beg, beg+len) of `str` as a new
 * string built with str_subseq(), or Qnil when the range is rejected.
 *
 * beg   - start offset; Qnil is returned when it is greater than the string
 *         length.
 * len   - number of bytes; negative values yield Qnil.
 * empty - when zero, one of the range checks returns Qnil instead of
 *         continuing (NOTE(review): presumably the zero-length case -
 *         confirm).
 *
 * The result's encoding is copied from `str`.
 */
6212str_byte_substr(
VALUE str,
long beg,
long len,
int empty)
6214 long n = RSTRING_LEN(str);
6216 if (beg > n ||
len < 0)
return Qnil;
6219 if (beg < 0)
return Qnil;
6224 if (!empty)
return Qnil;
6228 VALUE str2 = str_subseq(str, beg,
len);
6230 str_enc_copy_direct(str2, str);
/* Empty results get extra handling when the encoding is not ASCII-compatible. */
6232 if (RSTRING_LEN(str2) == 0) {
6233 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6261 long beg,
len = RSTRING_LEN(str);
6269 return str_byte_substr(str, beg,
len, TRUE);
6274 return str_byte_substr(str, idx, 1, FALSE);
/*
 * rb_str_byteslice: byte-oriented slicing entry point.  One path forwards a
 * (beg, len) pair to str_byte_substr() with empty = TRUE; the other hands
 * argv[0] to str_byte_aref().
 */
6321rb_str_byteslice(
int argc,
VALUE *argv,
VALUE str)
6326 return str_byte_substr(str, beg,
len, TRUE);
6329 return str_byte_aref(str, argv[0]);
/*
 * str_check_beg_len: validates a byte range against `str`; `*beg` and
 * `*len` are passed by pointer.
 *
 * A start offset beyond the string length, or a negative offset that still
 * falls before the start after adding the length, is rejected.  After the
 * range checks both ends are passed to str_ensure_byte_pos().
 */
6333str_check_beg_len(
VALUE str,
long *beg,
long *
len)
6335 long end, slen = RSTRING_LEN(str);
6338 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6345 assert(*beg <= slen);
/* Length reaching past the end of the string. */
6346 if (*
len > slen - *beg) {
6350 str_ensure_byte_pos(str, *beg);
6351 str_ensure_byte_pos(str, end);
/*
 * rb_str_bytesplice: replaces a byte range of `str` with (a byte range of)
 * another string.
 *
 * Accepts 2, 3 or 5 arguments; any other count raises ArgumentError.
 * Arguments expected to be Range objects raise TypeError when they are not.
 * Both ranges are validated with str_check_beg_len(), the encodings are
 * checked with rb_enc_check(), and the splice is done by rb_str_update_1().
 */
6376rb_str_bytesplice(
int argc,
VALUE *argv,
VALUE str)
6378 long beg,
len, vbeg, vlen;
6384 if (!(argc == 2 || argc == 3 || argc == 5)) {
6385 rb_raise(rb_eArgError,
"wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6389 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6390 rb_builtin_class_name(argv[0]));
6397 vlen = RSTRING_LEN(val);
6402 rb_raise(
rb_eTypeError,
"wrong argument type %s (expected Range)",
6403 rb_builtin_class_name(argv[2]));
6415 vlen = RSTRING_LEN(val);
/* Validate the target range and the source range independently. */
6423 str_check_beg_len(str, &beg, &
len);
6424 str_check_beg_len(val, &vbeg, &vlen);
6425 enc = rb_enc_check(str, val);
6426 str_modify_keep_cr(str);
6427 rb_str_update_1(str, beg,
len, val, vbeg, vlen);
6428 rb_enc_associate(str, enc);
/*
 * rb_str_reverse: builds a reversed copy `rev` of `str`.
 *
 * Strings of length 0 or 1 are returned as a plain duplicate.  Otherwise the
 * output is filled from the end of `rev` backwards; single-byte-optimizable
 * strings take their own path, while the multibyte paths step through the
 * source one character at a time using rb_enc_fast_mbclen() or
 * rb_enc_mbclen().  The result receives the source's length and encoding.
 */
6446rb_str_reverse(
VALUE str)
6453 if (RSTRING_LEN(str) <= 1)
return str_duplicate(
rb_cString, str);
6454 enc = STR_ENC_GET(str);
6456 s = RSTRING_PTR(str); e = RSTRING_END(str);
/* Write cursor starts at the end of the destination. */
6457 p = RSTRING_END(rev);
6460 if (RSTRING_LEN(str) > 1) {
6461 if (single_byte_optimizable(str)) {
6468 int clen = rb_enc_fast_mbclen(s, e, enc);
6476 cr = rb_enc_asciicompat(enc) ?
6479 int clen = rb_enc_mbclen(s, e, enc);
6488 STR_SET_LEN(rev, RSTRING_LEN(str));
6489 str_enc_copy_direct(rev, str);
/*
 * rb_str_reverse_bang: reverses `str` in place.
 *
 * For strings longer than one byte: single-byte-optimizable strings are
 * handled directly on the receiver's buffer (s/e point at its first and
 * last byte); otherwise the receiver is replaced by the result of
 * rb_str_reverse().
 */
6509rb_str_reverse_bang(
VALUE str)
6511 if (RSTRING_LEN(str) > 1) {
6512 if (single_byte_optimizable(str)) {
6515 str_modify_keep_cr(str);
6516 s = RSTRING_PTR(str);
/* e addresses the last byte, not one past the end. */
6517 e = RSTRING_END(str) - 1;
6525 str_shared_replace(str, rb_str_reverse(str));
6529 str_modify_keep_cr(str);
6554 i = rb_str_index(str, arg, 0);
6556 return RBOOL(i != -1);
6598 rb_raise(rb_eArgError,
"invalid radix %d", base);
6600 return rb_str_to_inum(str, base, FALSE);
6624rb_str_to_f(
VALUE str)
6639rb_str_to_s(
VALUE str)
/*
 * str_cat_char: appends code point `c`, encoded in `enc`, to `str`.
 * The character is first encoded into a stack buffer of
 * RUBY_MAX_CHAR_LEN bytes and then appended with rb_enc_str_buf_cat().
 */
6649str_cat_char(
VALUE str,
unsigned int c, rb_encoding *enc)
6651 char s[RUBY_MAX_CHAR_LEN];
/* Encoded length of c in enc. */
6652 int n = rb_enc_codelen(c, enc);
6654 rb_enc_mbcput(c, s, enc);
6655 rb_enc_str_buf_cat(str, s, n, enc);
6659#define CHAR_ESC_LEN 13
/*
 * rb_str_buf_cat_escaped_char: formats code point `c` as an escape sequence
 * into a local buffer.
 *
 * Formats used: a literal character ("%c"), "\uXXXX" for values below
 * 0x10000, "\u{X...}", "\xXX" and "\x{X...}".  NOTE(review): `unicode_p`
 * presumably selects between the \u and \x families - confirm.
 * The formatted length is taken with strlen().
 */
6662rb_str_buf_cat_escaped_char(
VALUE result,
unsigned int c,
int unicode_p)
/* One byte more than the size passed to snprintf() below. */
6664 char buf[CHAR_ESC_LEN + 1];
6672 snprintf(buf, CHAR_ESC_LEN,
"%c", c);
6674 else if (c < 0x10000) {
6675 snprintf(buf, CHAR_ESC_LEN,
"\\u%04X", c);
6678 snprintf(buf, CHAR_ESC_LEN,
"\\u{%X}", c);
6683 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", c);
6686 snprintf(buf, CHAR_ESC_LEN,
"\\x{%X}", c);
6689 l = (int)strlen(buf);
/*
 * ruby_escaped_char: maps a control character `c` to its backslash escape
 * as a static C string.
 *
 * Handled values: NUL, newline, carriage return, tab, form feed,
 * vertical tab (013), backspace (010), bell (007), escape (033) and
 * DEL (0x7f, rendered as "\c?").
 */
6695ruby_escaped_char(
int c)
6698 case '\0':
return "\\0";
6699 case '\n':
return "\\n";
6700 case '\r':
return "\\r";
6701 case '\t':
return "\\t";
6702 case '\f':
return "\\f";
6703 case '\013':
return "\\v";
6704 case '\010':
return "\\b";
6705 case '\007':
return "\\a";
6706 case '\033':
return "\\e";
6707 case '\x7f':
return "\\c?";
/*
 * rb_str_escape: builds an escaped copy of `str` in a new buffer `result`.
 *
 * The source is scanned character by character with
 * rb_enc_precise_mbclen().  `prev` marks the start of the pending run of
 * bytes that need no escaping; the run is flushed to `result` before each
 * escape is emitted and once more at the end.  Escapes emitted are: a hex
 * "\xXX" form for a single byte, the short forms from ruby_escaped_char(),
 * and the forms produced by rb_str_buf_cat_escaped_char().
 */
6713rb_str_escape(
VALUE str)
6716 rb_encoding *enc = rb_enc_from_index(encidx);
6717 const char *p = RSTRING_PTR(str);
6718 const char *pend = RSTRING_END(str);
6719 const char *prev = p;
6720 char buf[CHAR_ESC_LEN + 1];
6721 VALUE result = rb_str_buf_new(0);
6722 int unicode_p = rb_enc_unicode_p(enc);
6723 int asciicompat = rb_enc_asciicompat(enc);
6728 int n = rb_enc_precise_mbclen(p, pend, enc);
/* Flush the pending unescaped run. */
6730 if (p > prev) str_buf_cat(result, prev, p - prev);
6733 n = (int)(pend - p);
/* Emit one byte as a two-digit hex escape. */
6735 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
6736 str_buf_cat(result, buf, strlen(buf));
6744 cc = ruby_escaped_char(c);
/* p has already advanced past the character, hence p - n. */
6746 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6747 str_buf_cat(result, cc, strlen(cc));
6753 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6754 rb_str_buf_cat_escaped_char(result, c, unicode_p);
/* Flush the final unescaped run. */
6758 if (p > prev) str_buf_cat(result, prev, p - prev);
6781 rb_encoding *enc = rb_enc_from_index(encidx);
6782 const char *p, *pend, *prev;
6783 char buf[CHAR_ESC_LEN + 1];
6784 VALUE result = rb_str_buf_new(0);
6785 rb_encoding *resenc = rb_default_internal_encoding();
6786 int unicode_p = rb_enc_unicode_p(enc);
6787 int asciicompat = rb_enc_asciicompat(enc);
6789 if (resenc == NULL) resenc = rb_default_external_encoding();
6790 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6791 rb_enc_associate(result, resenc);
6792 str_buf_cat2(result,
"\"");
6794 p = RSTRING_PTR(str); pend = RSTRING_END(str);
6800 n = rb_enc_precise_mbclen(p, pend, enc);
6802 if (p > prev) str_buf_cat(result, prev, p - prev);
6805 n = (int)(pend - p);
6807 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
6808 str_buf_cat(result, buf, strlen(buf));
6816 if ((asciicompat || unicode_p) &&
6817 (c ==
'"'|| c ==
'\\' ||
6822 (cc ==
'$' || cc ==
'@' || cc ==
'{'))))) {
6823 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6824 str_buf_cat2(result,
"\\");
6825 if (asciicompat || enc == resenc) {
6831 case '\n': cc =
'n';
break;
6832 case '\r': cc =
'r';
break;
6833 case '\t': cc =
't';
break;
6834 case '\f': cc =
'f';
break;
6835 case '\013': cc =
'v';
break;
6836 case '\010': cc =
'b';
break;
6837 case '\007': cc =
'a';
break;
6838 case 033: cc =
'e';
break;
6839 default: cc = 0;
break;
6842 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6845 str_buf_cat(result, buf, 2);
6862 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6863 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6868 if (p > prev) str_buf_cat(result, prev, p - prev);
6869 str_buf_cat2(result,
"\"");
6874#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6894 int encidx = rb_enc_get_index(str);
6895 rb_encoding *enc = rb_enc_from_index(encidx);
6897 const char *p, *pend;
6900 int u8 = (encidx == rb_utf8_encindex());
6901 static const char nonascii_suffix[] =
".dup.force_encoding(\"%s\")";
6904 if (!rb_enc_asciicompat(enc)) {
6906 len += strlen(enc->name);
6909 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6912 unsigned char c = *p++;
6915 case '"':
case '\\':
6916 case '\n':
case '\r':
6917 case '\t':
case '\f':
6918 case '\013':
case '\010':
case '\007':
case '\033':
6923 clen = IS_EVSTR(p, pend) ? 2 : 1;
6931 if (u8 && c > 0x7F) {
6932 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6937 else if (cc <= 0xFFFFF)
6950 if (clen > LONG_MAX -
len) {
6957 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6958 q = RSTRING_PTR(result); qend = q +
len + 1;
6962 unsigned char c = *p++;
6964 if (c ==
'"' || c ==
'\\') {
6968 else if (c ==
'#') {
6969 if (IS_EVSTR(p, pend)) *q++ =
'\\';
6972 else if (c ==
'\n') {
6976 else if (c ==
'\r') {
6980 else if (c ==
'\t') {
6984 else if (c ==
'\f') {
6988 else if (c ==
'\013') {
6992 else if (c ==
'\010') {
6996 else if (c ==
'\007') {
7000 else if (c ==
'\033') {
7010 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7015 snprintf(q, qend-q,
"u%04X", cc);
7017 snprintf(q, qend-q,
"u{%X}", cc);
7022 snprintf(q, qend-q,
"x%02X", c);
7028 if (!rb_enc_asciicompat(enc)) {
7029 snprintf(q, qend-q, nonascii_suffix, enc->name);
7030 encidx = rb_ascii8bit_encindex();
7033 rb_enc_associate_index(result, encidx);
7039unescape_ascii(
unsigned int c)
/*
 * undump_after_backslash: decodes one backslash escape from a dumped string
 * and appends the decoded bytes to `undumped`.
 *
 * undumped - destination string.
 * ss       - in/out cursor into the dumped text.
 * s_end    - end of the dumped text.
 * penc     - in/out current encoding; switched to UTF-8 when a Unicode
 *            escape is seen and it was not UTF-8 already.
 * utf8, binary - in/out flags (NOTE(review): presumably record which escape
 *            families have been seen - confirm).
 *
 * Code points in the surrogate range 0xd800..0xdfff get special handling,
 * and a hex-digit run of length 0 or more than 6 is treated specially.
 */
7063undump_after_backslash(
VALUE undumped,
const char **ss,
const char *s_end, rb_encoding **penc,
bool *utf8,
bool *binary)
7065 const char *s = *ss;
7069 unsigned char buf[6];
/* Cached UTF-8 encoding, looked up on first use. */
7070 static rb_encoding *enc_utf8 = NULL;
7076 rb_str_cat(undumped, s, 1);
7087 *buf = unescape_ascii(*s);
7088 rb_str_cat(undumped, (
char *)buf, 1);
7099 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7100 if (*penc != enc_utf8) {
7102 rb_enc_associate(undumped, enc_utf8);
7119 if (hexlen == 0 || hexlen > 6) {
7125 if (0xd800 <= c && c <= 0xdfff) {
7128 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7129 rb_str_cat(undumped, (
char *)buf, codelen);
7138 if (0xd800 <= c && c <= 0xdfff) {
7141 codelen = rb_enc_mbcput(c, (
char *)buf, *penc);
7142 rb_str_cat(undumped, (
char *)buf, codelen);
7158 rb_str_cat(undumped, (
char *)buf, 1);
/* Copy the backslash and the following character unchanged. */
7162 rb_str_cat(undumped, s-1, 2);
7169static VALUE rb_str_is_ascii_only_p(
VALUE str);
/*
 * str_undump: reverses a dumped string representation.
 *
 * Input checks visible here: the text must be ASCII-only, pass
 * str_null_check(), be at least 2 bytes long and start with a double quote.
 * After the quoted body an optional ".dup" and a
 * .force_encoding("NAME") suffix are recognised; the text must then end
 * with the closing quote and parenthesis.  Escapes inside the body are
 * decoded by undump_after_backslash(); other bytes are copied one at a time.
 *
 * Raises RuntimeError for malformed input, and when a Unicode escape is
 * combined with force_encoding.
 */
7187str_undump(
VALUE str)
7189 const char *s = RSTRING_PTR(str);
7190 const char *s_end = RSTRING_END(str);
7191 rb_encoding *enc = rb_enc_get(str);
/* Start from an empty string carrying the source encoding. */
7192 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7194 bool binary =
false;
7198 if (rb_str_is_ascii_only_p(str) ==
Qfalse) {
7201 if (!str_null_check(str, &w)) {
7204 if (RSTRING_LEN(str) < 2)
goto invalid_format;
7205 if (*s !=
'"')
goto invalid_format;
7223 static const char force_encoding_suffix[] =
".force_encoding(\"";
7224 static const char dup_suffix[] =
".dup";
7225 const char *encname;
/* Skip an optional ".dup" before the force_encoding suffix. */
7230 size =
sizeof(dup_suffix) - 1;
7231 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7233 size =
sizeof(force_encoding_suffix) - 1;
7234 if (s_end - s <= size)
goto invalid_format;
7235 if (memcmp(s, force_encoding_suffix, size) != 0)
goto invalid_format;
7239 rb_raise(
rb_eRuntimeError,
"dumped string contained Unicode escape but used force_encoding");
/* Find the quote that closes the encoding name. */
7243 s = memchr(s,
'"', s_end-s);
7245 if (!s)
goto invalid_format;
/* Exactly the closing quote and parenthesis must remain. */
7246 if (s_end - s != 2)
goto invalid_format;
7247 if (s[0] !=
'"' || s[1] !=
')')
goto invalid_format;
7249 encidx = rb_enc_find_index2(encname, (
long)size);
7253 rb_enc_associate_index(undumped, encidx);
7263 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7266 rb_str_cat(undumped, s++, 1);
7274 rb_raise(
rb_eRuntimeError,
"invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
/*
 * rb_str_check_dummy_enc: tests `enc` with rb_enc_dummy_p() and takes a
 * separate branch for dummy encodings.
 */
7278rb_str_check_dummy_enc(rb_encoding *enc)
7280 if (rb_enc_dummy_p(enc)) {
/*
 * str_true_enc: fetches the encoding of `str` and passes it through
 * rb_str_check_dummy_enc().
 */
7287str_true_enc(
VALUE str)
7289 rb_encoding *enc = STR_ENC_GET(str);
7290 rb_str_check_dummy_enc(enc);
/*
 * check_case_options: folds the symbolic case-mapping options in argv into
 * `flags` and yields an OnigCaseFoldType.
 *
 * Recognised options:
 *   :turkic      sets ONIGENC_CASE_FOLD_TURKISH_AZERI (may be followed by
 *                :lithuanian)
 *   :lithuanian  sets ONIGENC_CASE_FOLD_LITHUANIAN (may be followed by
 *                :turkic)
 *   :ascii       sets ONIGENC_CASE_ASCII_ONLY
 *   :fold        only accepted when the incoming flags request downcasing
 *                alone
 *
 * Raises ArgumentError for too many options, an invalid second option,
 * :fold outside downcasing, or an unknown option.
 */
7294static OnigCaseFoldType
7295check_case_options(
int argc,
VALUE *argv, OnigCaseFoldType flags)
7300 rb_raise(rb_eArgError,
"too many options");
7301 if (argv[0]==sym_turkic) {
7302 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7304 if (argv[1]==sym_lithuanian)
7305 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7307 rb_raise(rb_eArgError,
"invalid second option");
7310 else if (argv[0]==sym_lithuanian) {
7311 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7313 if (argv[1]==sym_turkic)
7314 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7316 rb_raise(rb_eArgError,
"invalid second option");
7320 rb_raise(rb_eArgError,
"too many options");
7321 else if (argv[0]==sym_ascii)
7322 flags |= ONIGENC_CASE_ASCII_ONLY;
7323 else if (argv[0]==sym_fold) {
/* Of the UPCASE/DOWNCASE pair, only DOWNCASE may be set. */
7324 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
/* XOR toggles both bits: DOWNCASE (known set) is cleared, FOLD is flipped. */
7325 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7327 rb_raise(rb_eArgError,
"option :fold only allowed for downcasing");
7330 rb_raise(rb_eArgError,
"invalid option");
/*
 * case_option_single_p: predicate used by the case-mapping entry points to
 * pick the byte-wise path.  One qualifying condition is: ASCII-only mapping
 * was requested and the encoding is UTF-8 or has a maximum character length
 * of one byte.
 */
7335case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc,
VALUE str)
7337 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() ||
rb_enc_mbmaxlen(enc) == 1))
7343#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7344#ifndef CASEMAP_DEBUG
7345# define CASEMAP_DEBUG 0
7353 OnigUChar space[FLEX_ARY_LEN];
/*
 * mapping_buffer_free: releases a singly linked chain of mapping buffers.
 * Each node is freed with ruby_sized_xfree() after the cursor has already
 * advanced to its successor, so `next` is never read from freed memory.
 */
7357mapping_buffer_free(
void *p)
7361 while (current_buffer) {
7362 previous_buffer = current_buffer;
7363 current_buffer = current_buffer->next;
7364 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7370 {0, mapping_buffer_free,},
7371 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
/*
 * rb_str_casemap: applies the encoding's case_map callback to `source` and
 * returns the mapped text as a new string.
 *
 * source - string to map; an empty string is returned as a duplicate.
 * flags  - in/out case-folding flags handed to enc->case_map.
 * enc    - encoding whose case_map implementation is used.
 *
 * The output is produced into a chain of mapping buffers anchored in
 * `buffer_anchor`.  A new buffer is allocated for every pass of the loop
 * until the whole source has been consumed; with a single buffer the result
 * string is created directly from it, otherwise the buffers are concatenated
 * with memcpy().  The chain is released through mapping_buffer_free() on
 * both the success and the error path.
 *
 * Raises ArgumentError ("input string invalid") when case_map reports a
 * negative length.
 */
7375rb_str_casemap(
VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7379 const OnigUChar *source_current, *source_end;
7380 int target_length = 0;
7381 VALUE buffer_anchor;
7384 size_t buffer_count = 0;
7385 int buffer_length_or_invalid;
7387 if (RSTRING_LEN(source) == 0)
return str_duplicate(
rb_cString, source);
7389 source_current = (OnigUChar*)RSTRING_PTR(source);
7390 source_end = (OnigUChar*)RSTRING_END(source);
7394 while (source_current < source_end) {
/* Capacity: remaining source bytes times the buffer ordinal, plus slack. */
7396 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7397 if (CASEMAP_DEBUG) {
7398 fprintf(stderr,
"Buffer allocation, capa is %"PRIuSIZE
"\n",
capa);
/* Link the new buffer at the tail and advance the tail slot to its `next`. */
7401 *pre_buffer = current_buffer;
7402 pre_buffer = &current_buffer->next;
7403 current_buffer->next = NULL;
7404 current_buffer->capa =
capa;
7405 buffer_length_or_invalid = enc->case_map(flags,
7406 &source_current, source_end,
7407 current_buffer->space,
7408 current_buffer->space+current_buffer->capa,
7410 if (buffer_length_or_invalid < 0) {
/* Free the whole chain from its head before raising. */
7411 current_buffer =
DATA_PTR(buffer_anchor);
7413 mapping_buffer_free(current_buffer);
7414 rb_raise(rb_eArgError,
"input string invalid");
7416 target_length += current_buffer->used = buffer_length_or_invalid;
7418 if (CASEMAP_DEBUG) {
7419 fprintf(stderr,
"Buffer count is %"PRIuSIZE
"\n", buffer_count);
7422 if (buffer_count==1) {
7423 target =
rb_str_new((
const char*)current_buffer->space, target_length);
7426 char *target_current;
7429 target_current = RSTRING_PTR(target);
/* Concatenate every buffer's used bytes in chain order. */
7430 current_buffer =
DATA_PTR(buffer_anchor);
7431 while (current_buffer) {
7432 memcpy(target_current, current_buffer->space, current_buffer->used);
7433 target_current += current_buffer->used;
7434 current_buffer = current_buffer->next;
7437 current_buffer =
DATA_PTR(buffer_anchor);
7439 mapping_buffer_free(current_buffer);
7444 str_enc_copy_direct(target, source);
/*
 * rb_str_ascii_casemap: ASCII-only case mapping from `source` into
 * `target` using onigenc_ascii_only_case_map().
 *
 * source - input string; an empty input returns Qnil immediately.
 * target - output string; may be the same object as `source`, in which case
 *          the mapping is done in place over the source buffer.
 * flags  - in/out case-folding flags.
 * enc    - encoding passed to the mapper.
 *
 * Raises ArgumentError when the mapper reports a negative length.  With
 * CASEMAP_DEBUG enabled, a length change is also reported on stderr and
 * raised as an internal problem.  The target takes the source's encoding.
 */
7451rb_str_ascii_casemap(
VALUE source,
VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7453 const OnigUChar *source_current, *source_end;
7454 OnigUChar *target_current, *target_end;
7455 long old_length = RSTRING_LEN(source);
7456 int length_or_invalid;
7458 if (old_length == 0)
return Qnil;
7460 source_current = (OnigUChar*)RSTRING_PTR(source);
7461 source_end = (OnigUChar*)RSTRING_END(source);
/* In-place mapping: output aliases the input range. */
7462 if (source == target) {
7463 target_current = (OnigUChar*)source_current;
7464 target_end = (OnigUChar*)source_end;
7467 target_current = (OnigUChar*)RSTRING_PTR(target);
7468 target_end = (OnigUChar*)RSTRING_END(target);
7471 length_or_invalid = onigenc_ascii_only_case_map(flags,
7472 &source_current, source_end,
7473 target_current, target_end, enc);
7474 if (length_or_invalid < 0)
7475 rb_raise(rb_eArgError,
"input string invalid");
/* Debug-only check: the mapped length is compared with the input length. */
7476 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7477 fprintf(stderr,
"problem with rb_str_ascii_casemap"
7478 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7479 rb_raise(rb_eArgError,
"internal problem with rb_str_ascii_casemap"
7480 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7483 str_enc_copy(target, source);
/*
 * upcase_single: byte-wise upcasing of `str` in place.  Each byte in the
 * range 'a'..'z' is replaced by the corresponding letter in 'A'..'Z'.
 * `modified` tracks whether anything changed (it starts out false).
 */
7489upcase_single(
VALUE str)
7491 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7492 bool modified =
false;
/* Read the byte as unsigned so values above 0x7f compare correctly. */
7495 unsigned int c = *(
unsigned char*)s;
7497 if (
'a' <= c && c <=
'z') {
7498 *s =
'A' + (c -
'a');
/*
 * rb_str_upcase_bang: upcases `str` in place.
 *
 * Options in argv are merged into the flags by check_case_options().
 * Three strategies are used: the byte-wise upcase_single() when
 * case_option_single_p() holds, rb_str_ascii_casemap() for ASCII-only
 * mapping, and otherwise a full rb_str_casemap() whose result replaces the
 * receiver.  `str` is returned when ONIGENC_CASE_MODIFIED ended up set.
 */
7526rb_str_upcase_bang(
int argc,
VALUE *argv,
VALUE str)
7529 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7531 flags = check_case_options(argc, argv, flags);
7532 str_modify_keep_cr(str);
7533 enc = str_true_enc(str);
7534 if (case_option_single_p(flags, enc, str)) {
/* The byte-wise helper reports modification through its return value. */
7535 if (upcase_single(str))
7536 flags |= ONIGENC_CASE_MODIFIED;
7538 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7539 rb_str_ascii_casemap(str, str, &flags, enc);
7541 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7543 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * rb_str_upcase: non-destructive upcase.  Builds the result `ret` using one
 * of three strategies: a byte copy of `str` with its encoding (byte-wise
 * path), rb_str_ascii_casemap() into `ret` (ASCII-only mapping), or
 * rb_str_casemap().
 */
7565rb_str_upcase(
int argc,
VALUE *argv,
VALUE str)
7568 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7571 flags = check_case_options(argc, argv, flags);
7572 enc = str_true_enc(str);
7573 if (case_option_single_p(flags, enc, str)) {
/* Work on a fresh copy so the receiver stays untouched. */
7574 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7575 str_enc_copy_direct(ret, str);
7578 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7580 rb_str_ascii_casemap(str, ret, &flags, enc);
7583 ret = rb_str_casemap(str, &flags, enc);
/*
 * downcase_single: byte-wise downcasing of `str` in place.  Each byte in
 * the range 'A'..'Z' is replaced by the corresponding letter in 'a'..'z'.
 * `modified` tracks whether anything changed (it starts out false).
 */
7590downcase_single(
VALUE str)
7592 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7593 bool modified =
false;
/* Read the byte as unsigned so values above 0x7f compare correctly. */
7596 unsigned int c = *(
unsigned char*)s;
7598 if (
'A' <= c && c <=
'Z') {
7599 *s =
'a' + (c -
'A');
/*
 * rb_str_downcase_bang: downcases `str` in place.
 *
 * Options in argv are merged into the flags by check_case_options().
 * Three strategies are used: the byte-wise downcase_single() when
 * case_option_single_p() holds, rb_str_ascii_casemap() for ASCII-only
 * mapping, and otherwise a full rb_str_casemap() whose result replaces the
 * receiver.  `str` is returned when ONIGENC_CASE_MODIFIED ended up set.
 */
7628rb_str_downcase_bang(
int argc,
VALUE *argv,
VALUE str)
7631 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7633 flags = check_case_options(argc, argv, flags);
7634 str_modify_keep_cr(str);
7635 enc = str_true_enc(str);
7636 if (case_option_single_p(flags, enc, str)) {
/* The byte-wise helper reports modification through its return value. */
7637 if (downcase_single(str))
7638 flags |= ONIGENC_CASE_MODIFIED;
7640 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7641 rb_str_ascii_casemap(str, str, &flags, enc);
7643 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7645 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * rb_str_downcase: non-destructive downcase.  Builds the result `ret` using
 * one of three strategies: a byte copy of `str` passed through
 * downcase_single() (byte-wise path), rb_str_ascii_casemap() into `ret`
 * (ASCII-only mapping), or rb_str_casemap().
 */
7667rb_str_downcase(
int argc,
VALUE *argv,
VALUE str)
7670 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7673 flags = check_case_options(argc, argv, flags);
7674 enc = str_true_enc(str);
7675 if (case_option_single_p(flags, enc, str)) {
/* Work on a fresh copy so the receiver stays untouched. */
7676 ret =
rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7677 str_enc_copy_direct(ret, str);
7678 downcase_single(ret);
7680 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7682 rb_str_ascii_casemap(str, ret, &flags, enc);
7685 ret = rb_str_casemap(str, &flags, enc);
/*
 * rb_str_capitalize_bang: capitalizes `str` in place using the
 * UPCASE | TITLECASE flag combination.
 *
 * Returns Qnil right away for an empty string (or a NULL buffer).  Uses
 * rb_str_ascii_casemap() for ASCII-only mapping, otherwise replaces the
 * receiver with the result of rb_str_casemap().  `str` is returned when
 * ONIGENC_CASE_MODIFIED ended up set.
 */
7713rb_str_capitalize_bang(
int argc,
VALUE *argv,
VALUE str)
7716 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7718 flags = check_case_options(argc, argv, flags);
7719 str_modify_keep_cr(str);
7720 enc = str_true_enc(str);
7721 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
7722 if (flags&ONIGENC_CASE_ASCII_ONLY)
7723 rb_str_ascii_casemap(str, str, &flags, enc);
7725 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7727 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * rb_str_capitalize: non-destructive capitalize using the
 * UPCASE | TITLECASE flag combination.
 *
 * An empty string (or a NULL buffer) returns `str` itself.  Otherwise the
 * result is produced by rb_str_ascii_casemap() into `ret` for ASCII-only
 * mapping, or by rb_str_casemap().
 */
7751rb_str_capitalize(
int argc,
VALUE *argv,
VALUE str)
7754 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7757 flags = check_case_options(argc, argv, flags);
7758 enc = str_true_enc(str);
/* Note: the receiver itself, not a duplicate, is returned here. */
7759 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str;
7760 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7762 rb_str_ascii_casemap(str, ret, &flags, enc);
7765 ret = rb_str_casemap(str, &flags, enc);
/*
 * rb_str_swapcase_bang: swaps letter case of `str` in place; requesting
 * both UPCASE and DOWNCASE at once selects swapcase behaviour in the
 * mapper.
 *
 * Uses rb_str_ascii_casemap() for ASCII-only mapping, otherwise replaces
 * the receiver with the result of rb_str_casemap().  `str` is returned when
 * ONIGENC_CASE_MODIFIED ended up set.
 */
7792rb_str_swapcase_bang(
int argc,
VALUE *argv,
VALUE str)
7795 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7797 flags = check_case_options(argc, argv, flags);
7798 str_modify_keep_cr(str);
7799 enc = str_true_enc(str);
7800 if (flags&ONIGENC_CASE_ASCII_ONLY)
7801 rb_str_ascii_casemap(str, str, &flags, enc);
7803 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7805 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * rb_str_swapcase: non-destructive swapcase (UPCASE | DOWNCASE flags).
 *
 * An empty string (or a NULL buffer) is returned as a duplicate.  Otherwise
 * the result is produced by rb_str_ascii_casemap() into `ret` for
 * ASCII-only mapping, or by rb_str_casemap().
 */
7829rb_str_swapcase(
int argc,
VALUE *argv,
VALUE str)
7832 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7835 flags = check_case_options(argc, argv, flags);
7836 enc = str_true_enc(str);
7837 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return str_duplicate(
rb_cString, str);
7838 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7840 rb_str_ascii_casemap(str, ret, &flags, enc);
7843 ret = rb_str_casemap(str, &flags, enc);
7848typedef unsigned char *USTR;
7852 unsigned int now, max;
/*
 * trnext: iterator over a character-set specification held in `t`
 * (cursor t->p up to t->pend), decoding characters in `enc`.
 *
 * Returns -1 once the cursor reaches the end.  A backslash that is not the
 * final character introduces an escape.  A '-' that is not the final
 * character introduces a range whose upper bound is decoded next; a
 * malformed range raises ArgumentError (the message names both endpoints
 * when both are below 0x80).  While a range is being expanded, t->now is
 * advanced towards t->max, skipping code points for which
 * ONIGENC_CODE_TO_MBCLEN() is not positive.
 */
7857trnext(
struct tr *t, rb_encoding *enc)
7864 if (t->p == t->pend)
return -1;
7865 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'\\' && t->p + n < t->pend) {
7868 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7870 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'-' && t->p + n < t->pend) {
7872 if (t->p < t->pend) {
7873 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
/* Both endpoints are ASCII, so they can be shown in the message. */
7876 if (t->now < 0x80 && c < 0x80) {
7877 rb_raise(rb_eArgError,
7878 "invalid range \"%c-%c\" in string transliteration",
7882 rb_raise(rb_eArgError,
"invalid range in string transliteration");
7886 else if (t->now < c) {
/* Skip code points that have no valid encoded length in enc. */
7895 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7896 if (t->now == t->max) {
7901 if (t->now < t->max) {
7917 const unsigned int errc = -1;
7918 unsigned int trans[256];
7919 rb_encoding *enc, *e1, *e2;
7920 struct tr trsrc, trrepl;
7922 unsigned int c, c0, last = 0;
7923 int modify = 0, i, l;
7924 unsigned char *s, *send;
7926 int singlebyte = single_byte_optimizable(str);
7930#define CHECK_IF_ASCII(c) \
7931 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7932 (cr = ENC_CODERANGE_VALID) : 0)
7936 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
7937 if (RSTRING_LEN(repl) == 0) {
7938 return rb_str_delete_bang(1, &src, str);
7942 e1 = rb_enc_check(str, src);
7943 e2 = rb_enc_check(str, repl);
7948 enc = rb_enc_check(src, repl);
7950 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7951 if (RSTRING_LEN(src) > 1 &&
7952 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) ==
'^' &&
7953 trsrc.p + l < trsrc.pend) {
7957 trrepl.p = RSTRING_PTR(repl);
7958 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7959 trsrc.gen = trrepl.gen = 0;
7960 trsrc.now = trrepl.now = 0;
7961 trsrc.max = trrepl.max = 0;
7964 for (i=0; i<256; i++) {
7967 while ((c = trnext(&trsrc, enc)) != errc) {
7972 if (!hash) hash = rb_hash_new();
7976 while ((c = trnext(&trrepl, enc)) != errc)
7979 for (i=0; i<256; i++) {
7980 if (trans[i] != errc) {
7988 for (i=0; i<256; i++) {
7991 while ((c = trnext(&trsrc, enc)) != errc) {
7992 r = trnext(&trrepl, enc);
7993 if (r == errc) r = trrepl.now;
7996 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7999 if (!hash) hash = rb_hash_new();
8007 str_modify_keep_cr(str);
8008 s = (
unsigned char *)RSTRING_PTR(str); send = (
unsigned char *)RSTRING_END(str);
8012 long offset, max = RSTRING_LEN(str);
8013 unsigned int save = -1;
8014 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8019 c0 = c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, e1);
8020 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8029 if (cflag) c = last;
8032 else if (cflag) c = errc;
8038 if (c != (
unsigned int)-1) {
8044 tlen = rb_enc_codelen(c, enc);
8050 if (enc != e1) may_modify = 1;
8052 if ((offset = t - buf) + tlen > max) {
8053 size_t MAYBE_UNUSED(old) = max + termlen;
8054 max = offset + tlen + (send - s);
8055 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8058 rb_enc_mbcput(c, t, enc);
8059 if (may_modify && memcmp(s, t, tlen) != 0) {
8065 if (!STR_EMBED_P(str)) {
8066 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8068 TERM_FILL((
char *)t, termlen);
8069 RSTRING(str)->as.heap.ptr = (
char *)buf;
8070 STR_SET_LEN(str, t - buf);
8071 STR_SET_NOEMBED(str);
8072 RSTRING(str)->as.heap.aux.capa = max;
8076 c = (
unsigned char)*s;
8077 if (trans[c] != errc) {
8094 long offset, max = (long)((send - s) * 1.2);
8095 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8099 c0 = c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, e1);
8100 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8108 if (cflag) c = last;
8111 else if (cflag) c = errc;
8115 c = cflag ? last : errc;
8118 tlen = rb_enc_codelen(c, enc);
8123 if (enc != e1) may_modify = 1;
8125 if ((offset = t - buf) + tlen > max) {
8126 size_t MAYBE_UNUSED(old) = max + termlen;
8127 max = offset + tlen + (long)((send - s) * 1.2);
8128 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8132 rb_enc_mbcput(c, t, enc);
8133 if (may_modify && memcmp(s, t, tlen) != 0) {
8141 if (!STR_EMBED_P(str)) {
8142 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8144 TERM_FILL((
char *)t, termlen);
8145 RSTRING(str)->as.heap.ptr = (
char *)buf;
8146 STR_SET_LEN(str, t - buf);
8147 STR_SET_NOEMBED(str);
8148 RSTRING(str)->as.heap.aux.capa = max;
8154 rb_enc_associate(str, enc);
8173 return tr_trans(str, src, repl, 0);
8220 tr_trans(str, src, repl, 0);
8224#define TR_TABLE_MAX (UCHAR_MAX+1)
8225#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
/*
 * tr_setup_table: compiles the character-set specification `str` into
 * lookup structures and intersects it with any previously compiled sets.
 *
 * str     - set specification; a leading '^' (when the string is longer
 *           than one byte) marks the set as negated.
 * stable  - byte table of TR_TABLE_SIZE entries; slot TR_TABLE_MAX stores
 *           the negation flag, the other slots are AND-ed with this set.
 * first   - non-zero for the first specification of a call.
 * tablep, ctablep - in/out hash tables used for code points that do not fit
 *           in the byte table.
 * enc     - encoding used to decode `str`.
 */
8227tr_setup_table(
VALUE str,
char stable[TR_TABLE_SIZE],
int first,
8228 VALUE *tablep,
VALUE *ctablep, rb_encoding *enc)
8230 const unsigned int errc = -1;
8231 char buf[TR_TABLE_MAX];
8234 VALUE table = 0, ptable = 0;
8235 int i, l, cflag = 0;
8237 tr.p = RSTRING_PTR(str);
tr.pend =
tr.p + RSTRING_LEN(str);
8238 tr.gen =
tr.now =
tr.max = 0;
8240 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(
tr.p,
tr.pend, &l, enc) ==
'^') {
8245 for (i=0; i<TR_TABLE_MAX; i++) {
8248 stable[TR_TABLE_MAX] = cflag;
8250 else if (stable[TR_TABLE_MAX] && !cflag) {
8251 stable[TR_TABLE_MAX] = 0;
8253 for (i=0; i<TR_TABLE_MAX; i++) {
/* Walk every character produced by the specification. */
8257 while ((c = trnext(&
tr, enc)) != errc) {
8258 if (c < TR_TABLE_MAX) {
/* Byte-range characters go into the local table; negation inverts the mark. */
8259 buf[(
unsigned char)c] = !cflag;
8264 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8267 table = ptable ? ptable : rb_hash_new();
8271 table = rb_hash_new();
8276 if (table && (!ptable || (cflag ^ !
NIL_P(rb_hash_aref(ptable, key))))) {
8277 rb_hash_aset(table, key,
Qtrue);
/* Intersect this set with what earlier specifications left in stable. */
8281 for (i=0; i<TR_TABLE_MAX; i++) {
8282 stable[i] = stable[i] && buf[i];
8284 if (!table && !cflag) {
/*
 * tr_find: membership test for code point `c` against a compiled set.
 *
 * c     - code point to test.
 * table - byte table; entries below TR_TABLE_MAX answer directly, and the
 *         entry at TR_TABLE_MAX is the fallback for larger code points.
 * del   - hash consulted for code points at or above TR_TABLE_MAX.
 * nodel - optional hash (0 when absent) consulted alongside `del`.
 */
8291tr_find(
unsigned int c,
const char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
8293 if (c < TR_TABLE_MAX) {
8294 return table[c] != 0;
/* Present in del and not vetoed by nodel. */
8300 if (!
NIL_P(rb_hash_lookup(del, v)) &&
8301 (!nodel ||
NIL_P(rb_hash_lookup(nodel, v)))) {
8305 else if (nodel && !
NIL_P(rb_hash_lookup(nodel, v))) {
8308 return table[TR_TABLE_MAX] ? TRUE : FALSE;
/*
 * rb_str_delete_bang: removes from `str`, in place, every character
 * selected by the set specifications in argv.
 *
 * Returns Qnil right away for an empty string (or NULL buffer).  Each
 * specification is compiled with tr_setup_table().  The string is then
 * compacted with a read cursor `s` and a write cursor `t`: ASCII bytes are
 * tested directly when the encoding is ASCII-compatible, other characters
 * are decoded and tested with tr_find().  Kept characters are written back
 * only when the two cursors have diverged.  `str` is returned when
 * something was removed.
 */
8322rb_str_delete_bang(
int argc,
VALUE *argv,
VALUE str)
8324 char squeez[TR_TABLE_SIZE];
8325 rb_encoding *enc = 0;
8327 VALUE del = 0, nodel = 0;
8329 int i, ascompat, cr;
8331 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str))
return Qnil;
8333 for (i=0; i<argc; i++) {
8337 enc = rb_enc_check(str, s);
8338 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8341 str_modify_keep_cr(str);
8342 ascompat = rb_enc_asciicompat(enc);
8343 s = t = RSTRING_PTR(str);
8344 send = RSTRING_END(str);
/* ASCII fast path: no decoding needed for bytes below 0x80. */
8350 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8361 c = rb_enc_codepoint_len(s, send, &clen, enc);
8363 if (tr_find(c, squeez, del, nodel)) {
8367 if (t != s) rb_enc_mbcput(c, t, enc);
/* Terminate at the write cursor and shrink the length to match. */
8374 TERM_FILL(t, TERM_LEN(str));
8375 STR_SET_LEN(str, t - RSTRING_PTR(str));
8378 if (modify)
return str;
/*
 * rb_str_delete: non-destructive counterpart of rb_str_delete_bang(); the
 * actual work is delegated to that function.
 * NOTE(review): presumably it operates on a duplicate of the receiver -
 * confirm.
 */
8398rb_str_delete(
int argc,
VALUE *argv,
VALUE str)
8401 rb_str_delete_bang(argc, argv, str);
/*
 * rb_str_squeeze_bang: collapses runs of identical characters in `str`, in
 * place.  With set specifications in argv only the selected characters are
 * squeezed.
 *
 * Returns Qnil for an empty string (or NULL buffer).  A character is kept
 * when it differs from the previously kept one (`save`) or, when arguments
 * were given, when it is not part of the compiled set.  Three scan paths
 * exist: plain bytes, ASCII bytes inside an ASCII-compatible encoding, and
 * decoded multibyte characters tested through tr_find().  The length is
 * only rewritten when it actually changed, and `str` is returned when
 * something was modified.
 */
8415rb_str_squeeze_bang(
int argc,
VALUE *argv,
VALUE str)
8417 char squeez[TR_TABLE_SIZE];
8418 rb_encoding *enc = 0;
8419 VALUE del = 0, nodel = 0;
8420 unsigned char *s, *send, *t;
8422 int ascompat, singlebyte = single_byte_optimizable(str);
8426 enc = STR_ENC_GET(str);
8429 for (i=0; i<argc; i++) {
8433 enc = rb_enc_check(str, s);
/* Tested per argument: is it still single-byte optimizable? */
8434 if (singlebyte && !single_byte_optimizable(s))
8436 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8440 str_modify_keep_cr(str);
8441 s = t = (
unsigned char *)RSTRING_PTR(str);
8442 if (!s || RSTRING_LEN(str) == 0)
return Qnil;
8443 send = (
unsigned char *)RSTRING_END(str);
8445 ascompat = rb_enc_asciicompat(enc);
8449 unsigned int c = *s++;
8450 if (c != save || (argc > 0 && !squeez[c])) {
8460 if (ascompat && (c = *s) < 0x80) {
8461 if (c != save || (argc > 0 && !squeez[c])) {
8467 c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, enc);
8469 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8470 if (t != s) rb_enc_mbcput(c, t, enc);
8479 TERM_FILL((
char *)t, TERM_LEN(str));
8480 if ((
char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8481 STR_SET_LEN(str, (
char *)t - RSTRING_PTR(str));
8485 if (modify)
return str;
/*
 * rb_str_squeeze: non-destructive counterpart of rb_str_squeeze_bang(); the
 * actual work is delegated to that function.
 * NOTE(review): presumably it operates on a duplicate of the receiver -
 * confirm.
 */
8508rb_str_squeeze(
int argc,
VALUE *argv,
VALUE str)
8511 rb_str_squeeze_bang(argc, argv, str);
8529 return tr_trans(str, src, repl, 1);
8552 tr_trans(str, src, repl, 1);
/*
 * rb_str_count: counts the characters of `str` selected by the set
 * specifications in argv.
 *
 * Fast path: when the first specification is a single byte in an
 * ASCII-compatible encoding, that byte is allowed for reverse matching, and
 * `str` is not broken, the bytes of `str` are compared directly.
 *
 * General path: every specification is compiled with tr_setup_table()
 * (the first with first = TRUE, the rest with FALSE) and each character of
 * `str` is tested - ASCII bytes directly, others via tr_find().
 *
 * An empty string (or NULL buffer) yields 0 on either path.
 */
8581rb_str_count(
int argc,
VALUE *argv,
VALUE str)
8583 char table[TR_TABLE_SIZE];
8584 rb_encoding *enc = 0;
8585 VALUE del = 0, nodel = 0, tstr;
8595 enc = rb_enc_check(str, tstr);
8598 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8599 (ptstr = RSTRING_PTR(tstr),
8600 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (
const unsigned char *)ptstr, (
const unsigned char *)ptstr+1)) &&
8601 !is_broken_string(str)) {
8603 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8605 s = RSTRING_PTR(str);
8606 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
8607 send = RSTRING_END(str);
/* Plain byte comparison. */
8609 if (*(
unsigned char*)s++ == c) n++;
8615 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8616 for (i=1; i<argc; i++) {
8619 enc = rb_enc_check(str, tstr);
8620 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8623 s = RSTRING_PTR(str);
8624 if (!s || RSTRING_LEN(str) == 0)
return INT2FIX(0);
8625 send = RSTRING_END(str);
8626 ascompat = rb_enc_asciicompat(enc);
8630 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8638 c = rb_enc_codepoint_len(s, send, &clen, enc);
8639 if (tr_find(c, table, del, nodel)) {
/*
 * rb_fs_check(val)
 *
 * Validation helper for a field-separator value.  The only branch visible
 * here returns 0 when val is nil; the remaining checks are not visible in
 * this excerpt.
 */
8650rb_fs_check(
VALUE val)
8654 if (
NIL_P(val))
return 0;
/*
 * 256-entry lookup table indexed by byte value.  Entries are 1 for bytes
 * 0x09..0x0D and for 0x20 (space), and 0 for every other byte, including all
 * bytes >= 0x80.
 */
8659static const char isspacetable[256] = {
8660 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
8661 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8662 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8663 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8664 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8665 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8666 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8667 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8668 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8669 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8670 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8671 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8672 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8673 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8674 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8675 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/*
 * Table-driven whitespace test: looks c up in isspacetable.  The argument is
 * cast to unsigned char first so that a negative (signed) char value cannot
 * produce a negative index.
 */
8678#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
/*
 * split_string(result, str, beg, len, empty_count)
 *
 * Emits one piece of a split: the substring of str starting at byte offset
 * beg with byte length len.
 *
 * - When empty_count is non-negative and len is 0, nothing is emitted; the
 *   function returns empty_count + 1 (one more pending empty field).
 * - When empty_count is positive, that many empty strings are emitted first
 *   (visible as rb_ary_push onto result in one loop and rb_yield in another;
 *   the condition selecting between them is not visible in this excerpt).
 * - The substring is then taken with rb_str_subseq() and pushed onto result
 *   (the alternative path and the final return value are not visible here).
 */
8681split_string(
VALUE result,
VALUE str,
long beg,
long len,
long empty_count)
8683 if (empty_count >= 0 &&
len == 0) {
8684 return empty_count + 1;
8686 if (empty_count > 0) {
8690 rb_ary_push(result, str_new_empty_String(str));
8691 }
while (--empty_count > 0);
8695 rb_yield(str_new_empty_String(str));
8696 }
while (--empty_count > 0);
8699 str = rb_str_subseq(str, beg,
len);
8701 rb_ary_push(result, str);
8710 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
/*
 * literal_split_pattern(spat, default_type)
 *
 * Classifies a literal separator string spat into a split_type_t.
 * Visible outcomes:
 *   - SPLIT_TYPE_CHARS (its guarding condition is not visible in this excerpt);
 *   - SPLIT_TYPE_AWK when the pattern is exactly one space: a direct byte
 *     compare when the encoding is ASCII-compatible, and an rb_enc_ascget()
 *     based check whose consumed length must equal the pattern length
 *     (presumably the non-ASCII-compatible branch -- TODO confirm);
 *   - default_type in all remaining cases.
 */
8714literal_split_pattern(
VALUE spat, split_type_t default_type)
8716 rb_encoding *enc = STR_ENC_GET(spat);
8722 return SPLIT_TYPE_CHARS;
8724 else if (rb_enc_asciicompat(enc)) {
8725 if (
len == 1 && ptr[0] ==
' ') {
8726 return SPLIT_TYPE_AWK;
8731 if (rb_enc_ascget(ptr, ptr +
len, &l, enc) ==
' ' &&
len == l) {
8732 return SPLIT_TYPE_AWK;
8735 return default_type;
8748rb_str_split_m(
int argc,
VALUE *argv,
VALUE str)
8753 split_type_t split_type;
8754 long beg, end, i = 0, empty_count = -1;
8759 if (
rb_scan_args(argc, argv,
"02", &spat, &limit) == 2) {
8761 if (lim <= 0) limit =
Qnil;
8762 else if (lim == 1) {
8763 if (RSTRING_LEN(str) == 0)
8774 if (
NIL_P(limit) && !lim) empty_count = 0;
8776 enc = STR_ENC_GET(str);
8777 split_type = SPLIT_TYPE_REGEXP;
8779 spat = get_pat_quoted(spat, 0);
8782 split_type = SPLIT_TYPE_AWK;
8784 else if (!(spat = rb_fs_check(spat))) {
8785 rb_raise(
rb_eTypeError,
"value of $; must be String or Regexp");
8790 if (split_type != SPLIT_TYPE_AWK) {
8795 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8796 if (split_type == SPLIT_TYPE_AWK) {
8798 split_type = SPLIT_TYPE_STRING;
8803 mustnot_broken(spat);
8804 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
/*
 * Emit the piece (beg, len) through split_string() and store the returned
 * count back into empty_count.  Relies on result, str and empty_count being
 * in scope at the expansion site.
 */
8812#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8815 char *ptr = RSTRING_PTR(str);
8816 char *eptr = RSTRING_END(str);
8817 if (split_type == SPLIT_TYPE_AWK) {
8822 if (result) result = rb_ary_new();
8824 if (is_ascii_string(str)) {
8825 while (ptr < eptr) {
8826 c = (
unsigned char)*ptr++;
8828 if (ascii_isspace(c)) {
8834 if (!
NIL_P(limit) && lim <= i)
break;
8837 else if (ascii_isspace(c)) {
8838 SPLIT_STR(beg, end-beg);
8841 if (!
NIL_P(limit)) ++i;
8849 while (ptr < eptr) {
8852 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8861 if (!
NIL_P(limit) && lim <= i)
break;
8865 SPLIT_STR(beg, end-beg);
8868 if (!
NIL_P(limit)) ++i;
8876 else if (split_type == SPLIT_TYPE_STRING) {
8877 char *str_start = ptr;
8878 char *substr_start = ptr;
8879 char *sptr = RSTRING_PTR(spat);
8880 long slen = RSTRING_LEN(spat);
8882 if (result) result = rb_ary_new();
8883 mustnot_broken(str);
8884 enc = rb_enc_check(str, spat);
8885 while (ptr < eptr &&
8886 (end =
rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8889 if (t != ptr + end) {
8893 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8896 if (!
NIL_P(limit) && lim <= ++i)
break;
8898 beg = ptr - str_start;
8900 else if (split_type == SPLIT_TYPE_CHARS) {
8901 char *str_start = ptr;
8904 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
8905 mustnot_broken(str);
8906 enc = rb_enc_get(str);
8907 while (ptr < eptr &&
8908 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8909 SPLIT_STR(ptr - str_start, n);
8911 if (!
NIL_P(limit) && lim <= ++i)
break;
8913 beg = ptr - str_start;
8916 if (result) result = rb_ary_new();
8917 long len = RSTRING_LEN(str);
8925 (match ? (rb_match_unbusy(match),
rb_backref_set(match)) : (void)0)) {
8930 if (start == end && BEG(0) == END(0)) {
8935 else if (last_null == 1) {
8936 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8943 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8949 SPLIT_STR(beg, end-beg);
8950 beg = start = END(0);
8954 for (idx=1; idx < regs->num_regs; idx++) {
8955 if (BEG(idx) == -1)
continue;
8956 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8958 if (!
NIL_P(limit) && lim <= ++i)
break;
8960 if (match) rb_match_unbusy(match);
8962 if (RSTRING_LEN(str) > 0 && (!
NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8963 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8966 return result ? result : str;
8976 return rb_str_split_m(1, &sep, str);
/*
 * Evaluates to a fresh array with capacity `size` when no block is given,
 * and to 0 when a block is given.  The `m` argument is not used by the
 * expansion.
 */
8979#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8985 rb_ary_push(ary, e);
/* Shorthand that forwards (ary, e) to enumerator_element(). */
8994#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8997chomp_newline(
const char *p,
const char *e, rb_encoding *enc)
8999 const char *prev = rb_enc_prev_char(p, e, e, enc);
9002 prev = rb_enc_prev_char(p, e, e, enc);
9003 if (prev && rb_enc_ascget(prev, e, NULL, enc) ==
'\r')
9015 RSTRING_LEN(rs) != 1 ||
9016 RSTRING_PTR(rs)[0] !=
'\n')) {
9022#define rb_rs get_rs()
9029 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9030 long pos,
len, rslen;
9036 static ID keywords[1];
9041 chomp = (!UNDEF_P(chomp) &&
RTEST(chomp));
9045 if (!ENUM_ELEM(ary, str)) {
9053 if (!RSTRING_LEN(str))
goto end;
9054 str = rb_str_new_frozen(str);
9055 ptr = subptr = RSTRING_PTR(str);
9056 pend = RSTRING_END(str);
9057 len = RSTRING_LEN(str);
9059 rslen = RSTRING_LEN(rs);
9061 if (rs == rb_default_rs)
9062 enc = rb_enc_get(str);
9064 enc = rb_enc_check(str, rs);
9069 const char *eol = NULL;
9071 while (subend < pend) {
9072 long chomp_rslen = 0;
9074 if (rb_enc_ascget(subend, pend, &n, enc) !=
'\r')
9076 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9078 if (eol == subend)
break;
9082 chomp_rslen = -rslen;
9086 if (!subptr) subptr = subend;
9090 }
while (subend < pend);
9092 if (rslen == 0) chomp_rslen = 0;
9093 line = rb_str_subseq(str, subptr - ptr,
9094 subend - subptr + (chomp ? chomp_rslen : rslen));
9095 if (ENUM_ELEM(ary, line)) {
9096 str_mod_check(str, ptr,
len);
9098 subptr = eol = NULL;
9103 rsptr = RSTRING_PTR(rs);
9110 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9113 rsptr = RSTRING_PTR(rs);
9114 rslen = RSTRING_LEN(rs);
9117 while (subptr < pend) {
9118 pos =
rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9122 if (hit != adjusted) {
9126 subend = hit += rslen;
9129 subend = chomp_newline(subptr, subend, enc);
9135 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9136 if (ENUM_ELEM(ary, line)) {
9137 str_mod_check(str, ptr,
len);
9142 if (subptr != pend) {
9145 pend = chomp_newline(subptr, pend, enc);
9147 else if (pend - subptr >= rslen &&
9148 memcmp(pend - rslen, rsptr, rslen) == 0) {
9152 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9153 ENUM_ELEM(ary, line);
/*
 * rb_str_each_line(argc, argv, str)
 *
 * Delegates to rb_str_enumerate_lines(), passing 0 as the array argument.
 */
9174rb_str_each_line(
int argc,
VALUE *argv,
VALUE str)
9177 return rb_str_enumerate_lines(argc, argv, str, 0);
/*
 * rb_str_lines(argc, argv, str)
 *
 * Builds the collecting array via WANTARRAY (capacity 0; WANTARRAY yields 0
 * instead of an array when a block is given) and hands it to
 * rb_str_enumerate_lines().
 */
9190rb_str_lines(
int argc,
VALUE *argv,
VALUE str)
9192 VALUE ary = WANTARRAY(
"lines", 0);
9193 return rb_str_enumerate_lines(argc, argv, str, ary);
9207 for (i=0; i<RSTRING_LEN(str); i++) {
9208 ENUM_ELEM(ary,
INT2FIX((
unsigned char)RSTRING_PTR(str)[i]));
/*
 * rb_str_each_byte(str)
 *
 * Delegates to rb_str_enumerate_bytes(), passing 0 as the array argument.
 */
9226rb_str_each_byte(
VALUE str)
9229 return rb_str_enumerate_bytes(str, 0);
/*
 * rb_str_bytes(str)
 *
 * Builds the collecting array via WANTARRAY, sized to the byte length of str
 * (WANTARRAY yields 0 instead of an array when a block is given), and hands
 * it to rb_str_enumerate_bytes().
 */
9241rb_str_bytes(
VALUE str)
9243 VALUE ary = WANTARRAY(
"bytes", RSTRING_LEN(str));
9244 return rb_str_enumerate_bytes(str, ary);
9261 str = rb_str_new_frozen(str);
9262 ptr = RSTRING_PTR(str);
9263 len = RSTRING_LEN(str);
9264 enc = rb_enc_get(str);
9267 for (i = 0; i <
len; i += n) {
9268 n = rb_enc_fast_mbclen(ptr + i, ptr +
len, enc);
9269 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9273 for (i = 0; i <
len; i += n) {
9274 n = rb_enc_mbclen(ptr + i, ptr +
len, enc);
9275 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
/*
 * rb_str_each_char(str)
 *
 * Delegates to rb_str_enumerate_chars(), passing 0 as the array argument.
 */
9295rb_str_each_char(
VALUE str)
9298 return rb_str_enumerate_chars(str, 0);
9310rb_str_chars(
VALUE str)
9313 return rb_str_enumerate_chars(str, ary);
/*
 * rb_str_enumerate_codepoints(str, ary)
 *
 * When single_byte_optimizable(str) holds, the work is handed to
 * rb_str_enumerate_bytes().  Otherwise the function switches to the result of
 * rb_str_new_frozen(str) and decodes code points from it with
 * rb_enc_codepoint_len().  The loop around the decode and the handling of
 * each code point are not visible in this excerpt.
 */
9317rb_str_enumerate_codepoints(
VALUE str,
VALUE ary)
9322 const char *ptr, *end;
9325 if (single_byte_optimizable(str))
9326 return rb_str_enumerate_bytes(str, ary);
9328 str = rb_str_new_frozen(str);
9329 ptr = RSTRING_PTR(str);
9330 end = RSTRING_END(str);
9331 enc = STR_ENC_GET(str);
9334 c = rb_enc_codepoint_len(ptr, end, &n, enc);
/*
 * rb_str_each_codepoint(str)
 *
 * Delegates to rb_str_enumerate_codepoints(), passing 0 as the array argument.
 */
9355rb_str_each_codepoint(
VALUE str)
9358 return rb_str_enumerate_codepoints(str, 0);
9370rb_str_codepoints(
VALUE str)
9373 return rb_str_enumerate_codepoints(str, ary);
/*
 * get_reg_grapheme_cluster(enc)
 *
 * Compiles and returns a regex for the pattern \X in encoding enc.  The
 * pattern source defaults to the ASCII bytes of "\\X"; for the UTF-16/UTF-32
 * BE/LE encoding indexes the CASE_UTF cases substitute the same two
 * characters spelled out as wide code units.  If onig_new() reports an error
 * the message is formatted and passed to rb_fatal().
 */
9377get_reg_grapheme_cluster(rb_encoding *enc)
9379 int encidx = rb_enc_to_index(enc);
9381 const OnigUChar source_ascii[] =
"\\X";
9382 const OnigUChar *source = source_ascii;
9383 size_t source_len =
sizeof(source_ascii) - 1;
/* Spell a code unit out as individual bytes, big- or little-endian,
 * for 16-bit and 32-bit units. */
9386#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9387#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9388#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9389#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9390#define CASE_UTF(e) \
9391 case ENCINDEX_UTF_##e: { \
9392 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9393 source = source_UTF_##e; \
9394 source_len = sizeof(source_UTF_##e); \
9397 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9405 regex_t *reg_grapheme_cluster;
/* onig_new() stores the compiled regex through its first argument, so the
 * address of the local pointer must be passed. */
9407 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9408 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9410 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9411 onig_error_code_to_str(message, r, &einfo);
9412 rb_fatal(
"cannot compile grapheme cluster regexp: %s", (
char *)message);
9415 return reg_grapheme_cluster;
/*
 * get_cached_reg_grapheme_cluster(enc)
 *
 * For the UTF-8 encoding index, returns a regex kept in a function-local
 * static, compiling it with get_reg_grapheme_cluster() on first use.  The
 * return path for other encodings is not visible in this excerpt; callers in
 * this file treat a null result as "no cached regex" and compile their own.
 */
9419get_cached_reg_grapheme_cluster(rb_encoding *enc)
9421 int encidx = rb_enc_to_index(enc);
9422 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9424 if (encidx == rb_utf8_encindex()) {
9425 if (!reg_grapheme_cluster_utf8) {
9426 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9429 return reg_grapheme_cluster_utf8;
9438 size_t grapheme_cluster_count = 0;
9439 rb_encoding *enc = get_encoding(str);
9440 const char *ptr, *end;
9442 if (!rb_enc_unicode_p(enc)) {
9446 bool cached_reg_grapheme_cluster =
true;
9447 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9448 if (!reg_grapheme_cluster) {
9449 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9450 cached_reg_grapheme_cluster =
false;
9453 ptr = RSTRING_PTR(str);
9454 end = RSTRING_END(str);
9457 OnigPosition
len = onig_match(reg_grapheme_cluster,
9458 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9459 (
const OnigUChar *)ptr, NULL, 0);
9460 if (
len <= 0)
break;
9461 grapheme_cluster_count++;
9465 if (!cached_reg_grapheme_cluster) {
9466 onig_free(reg_grapheme_cluster);
9469 return SIZET2NUM(grapheme_cluster_count);
9473rb_str_enumerate_grapheme_clusters(
VALUE str,
VALUE ary)
9476 rb_encoding *enc = get_encoding(str);
9477 const char *ptr0, *ptr, *end;
9479 if (!rb_enc_unicode_p(enc)) {
9480 return rb_str_enumerate_chars(str, ary);
9483 if (!ary) str = rb_str_new_frozen(str);
9485 bool cached_reg_grapheme_cluster =
true;
9486 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9487 if (!reg_grapheme_cluster) {
9488 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9489 cached_reg_grapheme_cluster =
false;
9492 ptr0 = ptr = RSTRING_PTR(str);
9493 end = RSTRING_END(str);
9496 OnigPosition
len = onig_match(reg_grapheme_cluster,
9497 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9498 (
const OnigUChar *)ptr, NULL, 0);
9499 if (
len <= 0)
break;
9500 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0,
len));
9504 if (!cached_reg_grapheme_cluster) {
9505 onig_free(reg_grapheme_cluster);
/*
 * rb_str_each_grapheme_cluster(str)
 *
 * Delegates to rb_str_enumerate_grapheme_clusters(), passing 0 as the array
 * argument.
 */
9525rb_str_each_grapheme_cluster(
VALUE str)
9528 return rb_str_enumerate_grapheme_clusters(str, 0);
9540rb_str_grapheme_clusters(
VALUE str)
9543 return rb_str_enumerate_grapheme_clusters(str, ary);
/*
 * chopped_length(str)
 *
 * Computes a byte length for str with its last character removed.
 * Returns 0 for an empty string.  Otherwise p is set to the start of the last
 * character; if that character is '\n' and the character before it is '\r',
 * p is moved back so both are dropped together.  The final return statement
 * is not visible in this excerpt (presumably p - beg -- TODO confirm).
 */
9547chopped_length(
VALUE str)
9549 rb_encoding *enc = STR_ENC_GET(str);
9550 const char *p, *p2, *beg, *end;
9552 beg = RSTRING_PTR(str);
9553 end = beg + RSTRING_LEN(str);
9554 if (beg >= end)
return 0;
9555 p = rb_enc_prev_char(beg, end, end, enc);
9557 if (p > beg && rb_enc_ascget(p, end, 0, enc) ==
'\n') {
9558 p2 = rb_enc_prev_char(beg, p, end, enc);
9559 if (p2 && rb_enc_ascget(p2, end, 0, enc) ==
'\r') p = p2;
/*
 * rb_str_chop_bang(str)
 *
 * In-place chop.  After str_modify_keep_cr(), a non-empty string is shortened
 * to chopped_length(str) and the terminator is rewritten at the new end with
 * TERM_FILL.  Return values are not visible in this excerpt.
 */
9575rb_str_chop_bang(
VALUE str)
9577 str_modify_keep_cr(str);
9578 if (RSTRING_LEN(str) > 0) {
9580 len = chopped_length(str);
9581 STR_SET_LEN(str,
len);
9582 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
/*
 * rb_str_chop(str)
 *
 * Returns the leading chopped_length(str) bytes of str as a substring
 * (via rb_str_subseq); str itself is not modified by the visible code.
 */
9601rb_str_chop(
VALUE str)
9603 return rb_str_subseq(str, 0, chopped_length(str));
9607smart_chomp(
VALUE str,
const char *e,
const char *p)
9609 rb_encoding *enc = rb_enc_get(str);
9618 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
9626 if (--e > p && *(e-1) ==
'\r') {
9643 char *pp, *e, *rsptr;
9645 char *
const p = RSTRING_PTR(str);
9646 long len = RSTRING_LEN(str);
9648 if (
len == 0)
return 0;
9650 if (rs == rb_default_rs) {
9651 return smart_chomp(str, e, p);
9654 enc = rb_enc_get(str);
9665 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
9672 while (e > p && *(e-1) ==
'\n') {
9674 if (e > p && *(e-1) ==
'\r')
9680 if (rslen >
len)
return len;
9682 enc = rb_enc_get(rs);
9683 newline = rsptr[rslen-1];
9686 if (newline ==
'\n')
9687 return smart_chomp(str, e, p);
9691 return smart_chomp(str, e, p);
9695 enc = rb_enc_check(str, rs);
9696 if (is_broken_string(rs)) {
9700 if (p[
len-1] == newline &&
9702 memcmp(rsptr, pp, rslen) == 0)) {
9703 if (at_char_boundary(p, pp, e, enc))
9716chomp_rs(
int argc,
const VALUE *argv)
9732 long olen = RSTRING_LEN(str);
9733 long len = chompped_length(str, rs);
9735 str_modify_keep_cr(str);
9736 STR_SET_LEN(str,
len);
9737 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
9754rb_str_chomp_bang(
int argc,
VALUE *argv,
VALUE str)
9757 str_modifiable(str);
9758 if (RSTRING_LEN(str) == 0 && argc < 2)
return Qnil;
9759 rs = chomp_rs(argc, argv);
9761 return rb_str_chomp_string(str, rs);
/*
 * rb_str_chomp(argc, argv, str)
 *
 * Resolves the record separator from the arguments with chomp_rs(), then
 * returns the leading chompped_length(str, rs) bytes of str as a substring.
 */
9774rb_str_chomp(
int argc,
VALUE *argv,
VALUE str)
9776 VALUE rs = chomp_rs(argc, argv);
9778 return rb_str_subseq(str, 0, chompped_length(str, rs));
9782lstrip_offset(
VALUE str,
const char *s,
const char *e, rb_encoding *enc)
9784 const char *
const start = s;
9786 if (!s || s >= e)
return 0;
9789 if (single_byte_optimizable(str)) {
9790 while (s < e && (*s ==
'\0' || ascii_isspace(*s))) s++;
9795 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9815rb_str_lstrip_bang(
VALUE str)
9821 str_modify_keep_cr(str);
9822 enc = STR_ENC_GET(str);
9824 loffset = lstrip_offset(str, start, start+olen, enc);
9826 long len = olen-loffset;
9827 s = start + loffset;
9828 memmove(start, s,
len);
9829 STR_SET_LEN(str,
len);
/*
 * rb_str_lstrip(str)
 *
 * Asks lstrip_offset() how many leading bytes to drop.  When nothing is to be
 * dropped a plain duplicate of str is returned; otherwise the remainder
 * starting at that offset is returned as a substring.  The setup of start and
 * len is not visible in this excerpt.
 */
9853rb_str_lstrip(
VALUE str)
9858 loffset = lstrip_offset(str, start, start+
len, STR_ENC_GET(str));
9859 if (loffset <= 0)
return str_duplicate(
rb_cString, str);
9860 return rb_str_subseq(str, loffset,
len - loffset);
9864rstrip_offset(
VALUE str,
const char *s,
const char *e, rb_encoding *enc)
9868 rb_str_check_dummy_enc(enc);
9872 if (!s || s >= e)
return 0;
9876 if (single_byte_optimizable(str)) {
9878 while (s < t && ((c = *(t-1)) ==
'\0' || ascii_isspace(c))) t--;
9883 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9903rb_str_rstrip_bang(
VALUE str)
9909 str_modify_keep_cr(str);
9910 enc = STR_ENC_GET(str);
9912 roffset = rstrip_offset(str, start, start+olen, enc);
9914 long len = olen - roffset;
9916 STR_SET_LEN(str,
len);
/*
 * rb_str_rstrip(str)
 *
 * Asks rstrip_offset() how many trailing bytes to drop.  When nothing is to
 * be dropped a plain duplicate of str is returned; otherwise the leading
 * olen - roffset bytes are returned as a substring.  The setup of start and
 * olen is not visible in this excerpt.
 */
9940rb_str_rstrip(
VALUE str)
9946 enc = STR_ENC_GET(str);
9948 roffset = rstrip_offset(str, start, start+olen, enc);
9950 if (roffset <= 0)
return str_duplicate(
rb_cString, str);
9951 return rb_str_subseq(str, 0, olen-roffset);
9966rb_str_strip_bang(
VALUE str)
9969 long olen, loffset, roffset;
9972 str_modify_keep_cr(str);
9973 enc = STR_ENC_GET(str);
9975 loffset = lstrip_offset(str, start, start+olen, enc);
9976 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9978 if (loffset > 0 || roffset > 0) {
9979 long len = olen-roffset;
9982 memmove(start, start + loffset,
len);
9984 STR_SET_LEN(str,
len);
/*
 * rb_str_strip(str)
 *
 * Computes the leading offset with lstrip_offset(), then the trailing offset
 * with rstrip_offset() over what remains after the leading part.  When both
 * are non-positive a plain duplicate of str is returned; otherwise the middle
 * section is returned as a substring.  The setup of start and olen is not
 * visible in this excerpt.
 */
10008rb_str_strip(
VALUE str)
10011 long olen, loffset, roffset;
10012 rb_encoding *enc = STR_ENC_GET(str);
10015 loffset = lstrip_offset(str, start, start+olen, enc);
10016 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10018 if (loffset <= 0 && roffset <= 0)
return str_duplicate(
rb_cString, str);
10019 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10023scan_once(
VALUE str,
VALUE pat,
long *start,
int set_backref_str)
10026 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10032 end = pos + RSTRING_LEN(pat);
10042 rb_encoding *enc = STR_ENC_GET(str);
10046 if (RSTRING_LEN(str) > end)
10047 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10048 RSTRING_END(str), enc);
10056 if (!regs || regs->num_regs == 1) {
10057 result = rb_str_subseq(str, pos, end - pos);
10062 for (
int i = 1; i < regs->num_regs; i++) {
10065 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10068 rb_ary_push(result, s);
10123 long last = -1, prev = 0;
10124 char *p = RSTRING_PTR(str);
long len = RSTRING_LEN(str);
10126 pat = get_pat_quoted(pat, 1);
10127 mustnot_broken(str);
10129 VALUE ary = rb_ary_new();
10131 while (!
NIL_P(result = scan_once(str, pat, &start, 0))) {
10134 rb_ary_push(ary, result);
10136 if (last >= 0) rb_pat_search(pat, str, last, 1);
10141 while (!
NIL_P(result = scan_once(str, pat, &start, 1))) {
10145 str_mod_check(str, p,
len);
10147 if (last >= 0) rb_pat_search(pat, str, last, 1);
/*
 * rb_str_hex(str)
 *
 * Converts str to an integer via rb_str_to_inum() with base 16 and the
 * third argument FALSE.
 */
10171rb_str_hex(
VALUE str)
10173 return rb_str_to_inum(str, 16, FALSE);
/*
 * rb_str_oct(str)
 *
 * Converts str to an integer via rb_str_to_inum() with base -8 and the
 * third argument FALSE.
 * NOTE(review): the negative base presumably means "octal by default, but
 * honour a radix prefix in the text" -- confirm against rb_str_to_inum.
 */
10198rb_str_oct(
VALUE str)
10200 return rb_str_to_inum(str, -8, FALSE);
10203#ifndef HAVE_CRYPT_R
10208 rb_nativethread_lock_t lock;
10209} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10212crypt_mutex_initialize(
void)
10283# define CRYPT_END() ALLOCV_END(databuf)
10285 extern char *crypt(
const char *,
const char *);
10286# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10289 const char *s, *saltp;
10292 char salt_8bit_clean[3];
10296 mustnot_wchar(str);
10297 mustnot_wchar(salt);
10299 saltp = RSTRING_PTR(salt);
10300 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10301 rb_raise(rb_eArgError,
"salt too short (need >=2 bytes)");
10305 if (!
ISASCII((
unsigned char)saltp[0]) || !
ISASCII((
unsigned char)saltp[1])) {
10306 salt_8bit_clean[0] = saltp[0] & 0x7f;
10307 salt_8bit_clean[1] = saltp[1] & 0x7f;
10308 salt_8bit_clean[2] =
'\0';
10309 saltp = salt_8bit_clean;
10314# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10315 data->initialized = 0;
10317 res = crypt_r(s, saltp, data);
10319 crypt_mutex_initialize();
10321 res = crypt(s, saltp);
10362 char *ptr, *p, *pend;
10365 unsigned long sum0 = 0;
10370 ptr = p = RSTRING_PTR(str);
10371 len = RSTRING_LEN(str);
10377 str_mod_check(str, ptr,
len);
10380 sum0 += (
unsigned char)*p;
10391 if (bits < (
int)
sizeof(
long)*CHAR_BIT) {
10392 sum0 &= (((
unsigned long)1)<<bits)-1;
10412rb_str_justify(
int argc,
VALUE *argv,
VALUE str,
char jflag)
10416 long width,
len, flen = 1, fclen = 1;
10419 const char *f =
" ";
10420 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10422 int singlebyte = 1, cr;
10426 enc = STR_ENC_GET(str);
10431 enc = rb_enc_check(str, pad);
10432 f = RSTRING_PTR(pad);
10433 flen = RSTRING_LEN(pad);
10434 fclen = str_strlen(pad, enc);
10435 singlebyte = single_byte_optimizable(pad);
10436 if (flen == 0 || fclen == 0) {
10437 rb_raise(rb_eArgError,
"zero width padding");
10440 len = str_strlen(str, enc);
10441 if (width < 0 || len >= width)
return str_duplicate(
rb_cString, str);
10443 llen = (jflag ==
'l') ? 0 : ((jflag ==
'r') ? n : n/2);
10447 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10448 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10450 size = RSTRING_LEN(str);
10451 if ((
len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10452 (
len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10453 (
len += llen2 + rlen2) >= LONG_MAX - size) {
10454 rb_raise(rb_eArgError,
"argument too big");
10458 p = RSTRING_PTR(res);
10460 memset(p, *f, llen);
10464 while (llen >= fclen) {
10470 memcpy(p, f, llen2);
10474 memcpy(p, RSTRING_PTR(str), size);
10477 memset(p, *f, rlen);
10481 while (rlen >= fclen) {
10487 memcpy(p, f, rlen2);
10491 TERM_FILL(p, termlen);
10492 STR_SET_LEN(res, p-RSTRING_PTR(res));
10493 rb_enc_associate(res, enc);
/*
 * rb_str_ljust(argc, argv, str)
 *
 * Delegates to rb_str_justify() with jflag 'l', for which rb_str_justify
 * uses a left padding length of 0.
 */
10515rb_str_ljust(
int argc,
VALUE *argv,
VALUE str)
10517 return rb_str_justify(argc, argv, str,
'l');
/*
 * rb_str_rjust(argc, argv, str)
 *
 * Delegates to rb_str_justify() with jflag 'r', for which rb_str_justify
 * puts the whole padding amount on the left.
 */
10531rb_str_rjust(
int argc,
VALUE *argv,
VALUE str)
10533 return rb_str_justify(argc, argv, str,
'r');
/*
 * rb_str_center(argc, argv, str)
 *
 * Delegates to rb_str_justify() with jflag 'c'; for any flag other than
 * 'l' or 'r', rb_str_justify puts half of the padding amount (integer
 * division) on the left.
 */
10548rb_str_center(
int argc,
VALUE *argv,
VALUE str)
10550 return rb_str_justify(argc, argv, str,
'c');
10566 sep = get_pat_quoted(sep, 0);
10575 sep = rb_str_subseq(str, pos, END(0) - pos);
10578 pos = rb_str_index(str, sep, 0);
10579 if (pos < 0)
goto failed;
10581 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10583 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10584 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10587 return rb_ary_new3(3, str_duplicate(
rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10601 long pos = RSTRING_LEN(str);
10603 sep = get_pat_quoted(sep, 0);
10612 sep = rb_str_subseq(str, pos, END(0) - pos);
10616 pos = rb_str_rindex(str, sep, pos);
10622 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10624 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10625 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10627 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(
rb_cString, str));
10639rb_str_start_with(
int argc,
VALUE *argv,
VALUE str)
10643 for (i=0; i<argc; i++) {
10644 VALUE tmp = argv[i];
10646 if (rb_reg_start_with_p(tmp, str))
10650 const char *p, *s, *e;
10655 enc = rb_enc_check(str, tmp);
10656 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
10657 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
10658 p = RSTRING_PTR(str);
10661 if (!at_char_right_boundary(p, s, e, enc))
10663 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
10679rb_str_end_with(
int argc,
VALUE *argv,
VALUE str)
10683 for (i=0; i<argc; i++) {
10684 VALUE tmp = argv[i];
10685 const char *p, *s, *e;
10690 enc = rb_enc_check(str, tmp);
10691 if ((tlen = RSTRING_LEN(tmp)) == 0)
return Qtrue;
10692 if ((slen = RSTRING_LEN(str)) < tlen)
continue;
10693 p = RSTRING_PTR(str);
10696 if (!at_char_boundary(p, s, e, enc))
10698 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
10714deleted_prefix_length(
VALUE str,
VALUE prefix)
10716 const char *strptr, *prefixptr;
10717 long olen, prefixlen;
10718 rb_encoding *enc = rb_enc_get(str);
10722 if (!is_broken_string(prefix) ||
10723 !rb_enc_asciicompat(enc) ||
10724 !rb_enc_asciicompat(rb_enc_get(prefix))) {
10725 enc = rb_enc_check(str, prefix);
10729 prefixlen = RSTRING_LEN(prefix);
10730 if (prefixlen <= 0)
return 0;
10731 olen = RSTRING_LEN(str);
10732 if (olen < prefixlen)
return 0;
10733 strptr = RSTRING_PTR(str);
10734 prefixptr = RSTRING_PTR(prefix);
10735 if (memcmp(strptr, prefixptr, prefixlen) != 0)
return 0;
10736 if (is_broken_string(prefix)) {
10737 if (!is_broken_string(str)) {
10741 const char *strend = strptr + olen;
10742 const char *after_prefix = strptr + prefixlen;
10743 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
10763rb_str_delete_prefix_bang(
VALUE str,
VALUE prefix)
10766 str_modify_keep_cr(str);
10768 prefixlen = deleted_prefix_length(str, prefix);
10769 if (prefixlen <= 0)
return Qnil;
/*
 * rb_str_delete_prefix(str, prefix)
 *
 * Uses deleted_prefix_length() to find how many leading bytes to drop.  When
 * that is non-positive a plain duplicate of str is returned; otherwise the
 * remainder after the prefix is returned as a substring.
 */
10783rb_str_delete_prefix(
VALUE str,
VALUE prefix)
10787 prefixlen = deleted_prefix_length(str, prefix);
10788 if (prefixlen <= 0)
return str_duplicate(
rb_cString, str);
10790 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
/*
 * deleted_suffix_length(str, suffix)
 *
 * Decides whether suffix can be removed from the end of str.  Returns 0
 * ("nothing to delete") when:
 *   - suffix is a broken string,
 *   - suffix is empty,
 *   - str is shorter than suffix,
 *   - the trailing bytes of str differ from suffix, or
 *   - the position where the suffix would start is not a character boundary
 *     in str (a byte-wise match alone could split a multibyte character).
 * rb_enc_check() is called on the pair before the byte comparison.  The
 * successful return is not visible in this excerpt (presumably suffixlen --
 * TODO confirm).
 */
10803deleted_suffix_length(
VALUE str,
VALUE suffix)
10805 const char *strptr, *suffixptr;
10806 long olen, suffixlen;
10810 if (is_broken_string(suffix))
return 0;
10811 enc = rb_enc_check(str, suffix);
10814 suffixlen = RSTRING_LEN(suffix);
10815 if (suffixlen <= 0)
return 0;
10816 olen = RSTRING_LEN(str);
10817 if (olen < suffixlen)
return 0;
10818 strptr = RSTRING_PTR(str);
10819 suffixptr = RSTRING_PTR(suffix);
10820 const char *strend = strptr + olen;
10821 const char *before_suffix = strend - suffixlen;
10822 if (memcmp(before_suffix, suffixptr, suffixlen) != 0)
return 0;
10823 if (!at_char_boundary(strptr, before_suffix, strend, enc))
return 0;
/*
 * rb_str_delete_suffix_bang(str, suffix)
 *
 * In-place suffix removal.  str_modifiable() is called first, before the
 * suffix is examined.  Returns Qnil when deleted_suffix_length() reports
 * nothing to delete.  Otherwise str_modify_keep_cr() is called, the length is
 * reduced by suffixlen, and the terminator is rewritten at the new end with
 * TERM_FILL.  The successful return value is not visible in this excerpt.
 */
10838rb_str_delete_suffix_bang(
VALUE str,
VALUE suffix)
10840 long olen, suffixlen,
len;
10841 str_modifiable(str);
10843 suffixlen = deleted_suffix_length(str, suffix);
10844 if (suffixlen <= 0)
return Qnil;
10846 olen = RSTRING_LEN(str);
10847 str_modify_keep_cr(str);
10848 len = olen - suffixlen;
10849 STR_SET_LEN(str,
len);
10850 TERM_FILL(&RSTRING_PTR(str)[
len], TERM_LEN(str));
/*
 * rb_str_delete_suffix(str, suffix)
 *
 * Uses deleted_suffix_length() to find how many trailing bytes to drop.  When
 * that is non-positive a plain duplicate of str is returned; otherwise the
 * part before the suffix is returned as a substring.
 */
10866rb_str_delete_suffix(
VALUE str,
VALUE suffix)
10870 suffixlen = deleted_suffix_length(str, suffix);
10871 if (suffixlen <= 0)
return str_duplicate(
rb_cString, str);
10873 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10880 rb_raise(
rb_eTypeError,
"value of %"PRIsVALUE
" must be String", rb_id2str(
id));
10888 val = rb_fs_check(val);
10891 "value of %"PRIsVALUE
" must be String or Regexp",
10895 rb_warn_deprecated(
"`$;'", NULL);
10912 str_modifiable(str);
10914 rb_encoding *encoding = rb_to_encoding(enc);
10915 int idx = rb_enc_to_index(encoding);
10922 rb_enc_associate_index(str, idx);
10946 if (STR_EMBED_P(str)) {
10947 str2 = str_alloc_embed(
rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
10952 str_replace_shared_without_enc(str2, str);
10954 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
10987rb_str_valid_encoding_p(
VALUE str)
10989 int cr = rb_enc_str_coderange(str);
11007rb_str_is_ascii_only_p(
VALUE str)
11009 int cr = rb_enc_str_coderange(str);
11017 static const char ellipsis[] =
"...";
11018 const long ellipsislen =
sizeof(ellipsis) - 1;
11019 rb_encoding *
const enc = rb_enc_get(str);
11020 const long blen = RSTRING_LEN(str);
11021 const char *
const p = RSTRING_PTR(str), *e = p + blen;
11022 VALUE estr, ret = 0;
11029 else if (
len <= ellipsislen ||
11031 if (rb_enc_asciicompat(enc)) {
11033 rb_enc_associate(ret, enc);
11040 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11041 rb_str_cat(ret, ellipsis, ellipsislen);
11045 rb_enc_from_encoding(enc), 0,
Qnil);
11052str_compat_and_valid(
VALUE str, rb_encoding *enc)
11056 cr = rb_enc_str_coderange(str);
11058 rb_raise(rb_eArgError,
"replacement must be valid byte sequence '%+"PRIsVALUE
"'", str);
11061 rb_encoding *e = STR_ENC_GET(str);
11064 rb_enc_name(enc), rb_enc_name(e));
11070static VALUE enc_str_scrub(rb_encoding *enc,
VALUE str,
VALUE repl,
int cr);
11075 rb_encoding *enc = STR_ENC_GET(str);
11080rb_enc_str_scrub(rb_encoding *enc,
VALUE str,
VALUE repl)
11083 if (enc == STR_ENC_GET(str)) {
11088 return enc_str_scrub(enc, str, repl, cr);
11092enc_str_scrub(rb_encoding *enc,
VALUE str,
VALUE repl,
int cr)
11096 const char *rep, *p, *e, *p1, *sp;
11102 rb_raise(rb_eArgError,
"both of block and replacement given");
11109 if (!
NIL_P(repl)) {
11110 repl = str_compat_and_valid(repl, enc);
11113 if (rb_enc_dummy_p(enc)) {
11116 encidx = rb_enc_to_index(enc);
11118#define DEFAULT_REPLACE_CHAR(str) do { \
11119 static const char replace[sizeof(str)-1] = str; \
11120 rep = replace; replen = (int)sizeof(replace); \
11123 slen = RSTRING_LEN(str);
11124 p = RSTRING_PTR(str);
11125 e = RSTRING_END(str);
11129 if (rb_enc_asciicompat(enc)) {
11135 else if (!
NIL_P(repl)) {
11136 rep = RSTRING_PTR(repl);
11137 replen = RSTRING_LEN(repl);
11140 else if (encidx == rb_utf8_encindex()) {
11141 DEFAULT_REPLACE_CHAR(
"\xEF\xBF\xBD");
11145 DEFAULT_REPLACE_CHAR(
"?");
11150 p = search_nonascii(p, e);
11155 int ret = rb_enc_precise_mbclen(p, e, enc);
11169 if (
NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11174 if (e - p < clen) clen = e - p;
11181 for (; clen > 1; clen--) {
11182 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11193 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11194 str_mod_check(str, sp, slen);
11195 repl = str_compat_and_valid(repl, enc);
11202 p = search_nonascii(p, e);
11217 buf = rb_str_buf_new(RSTRING_LEN(str));
11228 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11229 str_mod_check(str, sp, slen);
11230 repl = str_compat_and_valid(repl, enc);
11243 else if (!
NIL_P(repl)) {
11244 rep = RSTRING_PTR(repl);
11245 replen = RSTRING_LEN(repl);
11247 else if (encidx == ENCINDEX_UTF_16BE) {
11248 DEFAULT_REPLACE_CHAR(
"\xFF\xFD");
11250 else if (encidx == ENCINDEX_UTF_16LE) {
11251 DEFAULT_REPLACE_CHAR(
"\xFD\xFF");
11253 else if (encidx == ENCINDEX_UTF_32BE) {
11254 DEFAULT_REPLACE_CHAR(
"\x00\x00\xFF\xFD");
11256 else if (encidx == ENCINDEX_UTF_32LE) {
11257 DEFAULT_REPLACE_CHAR(
"\xFD\xFF\x00\x00");
11260 DEFAULT_REPLACE_CHAR(
"?");
11264 int ret = rb_enc_precise_mbclen(p, e, enc);
11274 if (
NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11277 if (e - p < clen) clen = e - p;
11278 if (clen <= mbminlen * 2) {
11283 for (; clen > mbminlen; clen-=mbminlen) {
11284 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11294 repl =
rb_yield(rb_enc_str_new(p, clen, enc));
11295 str_mod_check(str, sp, slen);
11296 repl = str_compat_and_valid(repl, enc);
11311 buf = rb_str_buf_new(RSTRING_LEN(str));
11321 repl =
rb_yield(rb_enc_str_new(p, e-p, enc));
11322 str_mod_check(str, sp, slen);
11323 repl = str_compat_and_valid(repl, enc);
11359str_scrub_bang(
int argc,
VALUE *argv,
VALUE str)
11367static ID id_normalize;
11368static ID id_normalized_p;
11369static VALUE mUnicodeNormalize;
/*
 * unicode_normalize_common(argc, argv, str, id)
 *
 * Shared back end for the unicode_normalize family.  On first use it loads
 * "unicode_normalize/normalize.rb" with rb_require() and remembers that in a
 * function-local static flag.  It then calls method `id` on mUnicodeNormalize
 * with argc+1 arguments taken from argv2 (the construction of argv2 is not
 * visible in this excerpt; presumably str followed by argv -- TODO confirm).
 */
11372unicode_normalize_common(
int argc,
VALUE *argv,
VALUE str,
ID id)
11374 static int UnicodeNormalizeRequired = 0;
11377 if (!UnicodeNormalizeRequired) {
11378 rb_require(
"unicode_normalize/normalize.rb");
11379 UnicodeNormalizeRequired = 1;
11383 return rb_funcallv(mUnicodeNormalize,
id, argc+1, argv2);
/*
 * rb_str_unicode_normalize(argc, argv, str)
 *
 * Returns the result of unicode_normalize_common() invoked with id_normalize.
 */
11420rb_str_unicode_normalize(
int argc,
VALUE *argv,
VALUE str)
11422 return unicode_normalize_common(argc, argv, str, id_normalize);
/*
 * rb_str_unicode_normalize_bang(argc, argv, str)
 *
 * Computes the normalized form with unicode_normalize_common() (id_normalize)
 * and replaces the contents of str with it via rb_str_replace().
 */
11436rb_str_unicode_normalize_bang(
int argc,
VALUE *argv,
VALUE str)
11438 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
/*
 * rb_str_unicode_normalized_p(argc, argv, str)
 *
 * Returns the result of unicode_normalize_common() invoked with
 * id_normalized_p.
 */
11465rb_str_unicode_normalized_p(
int argc,
VALUE *argv,
VALUE str)
11467 return unicode_normalize_common(argc, argv, str, id_normalized_p);
/* sym_equal is an alias for rb_obj_equal. */
11599#define sym_equal rb_obj_equal
11602sym_printable(
const char *s,
const char *send, rb_encoding *enc)
11606 int c = rb_enc_precise_mbclen(s, send, enc);
11618rb_str_symname_p(
VALUE sym)
11623 rb_encoding *resenc = rb_default_internal_encoding();
11625 if (resenc == NULL) resenc = rb_default_external_encoding();
11626 enc = STR_ENC_GET(sym);
11627 ptr = RSTRING_PTR(sym);
11628 len = RSTRING_LEN(sym);
11629 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) ||
len != (
long)strlen(ptr) ||
/*
 * rb_str_quote_unprintable(str)
 *
 * Returns rb_str_escape(str) when either
 *   - the string's encoding differs from the result encoding (default
 *     internal, falling back to default external) and the string is not
 *     ASCII-only, or
 *   - sym_printable() rejects its bytes.
 * The return for the remaining case is not visible in this excerpt.
 */
11637rb_str_quote_unprintable(
VALUE str)
11642 rb_encoding *resenc;
11645 resenc = rb_default_internal_encoding();
11646 if (resenc == NULL) resenc = rb_default_external_encoding();
11647 enc = STR_ENC_GET(str);
11648 ptr = RSTRING_PTR(str);
11649 len = RSTRING_LEN(str);
11650 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11651 !sym_printable(ptr, ptr +
len, enc)) {
11652 return rb_str_escape(str);
/*
 * rb_id_quote_unprintable(id)
 *
 * Looks up the string for id with rb_id2str() and returns its escaped form
 * (rb_str_escape) when rb_str_symname_p() rejects it.  The return for the
 * accepted case is not visible in this excerpt.
 */
11658rb_id_quote_unprintable(
ID id)
11660 VALUE str = rb_id2str(
id);
11661 if (!rb_str_symname_p(str)) {
11662 return rb_str_escape(str);
11680sym_inspect(
VALUE sym)
11687 if (!rb_str_symname_p(str)) {
11689 len = RSTRING_LEN(str);
11690 rb_str_resize(str,
len + 1);
11691 dest = RSTRING_PTR(str);
11692 memmove(dest + 1, dest,
len);
11695 rb_encoding *enc = STR_ENC_GET(str);
11697 VALUE orig_str = str;
11700 str = rb_enc_str_new(0,
len + 1, enc);
11701 dest = RSTRING_PTR(str);
11702 memcpy(dest + 1, ptr,
len);
11728rb_sym_proc_call(
ID mid,
int argc,
const VALUE *argv,
int kw_splat,
VALUE passed_proc)
11733 rb_raise(rb_eArgError,
"no receiver given");
11830 return rb_str_match(
rb_sym2str(sym), other);
11845sym_match_m(
int argc,
VALUE *argv,
VALUE sym)
11847 return rb_str_match_m(argc, argv,
rb_sym2str(sym));
11860sym_match_m_p(
int argc,
VALUE *argv,
VALUE sym)
11862 return rb_str_match_m_p(argc, argv, sym);
11880 return rb_str_aref_m(argc, argv,
rb_sym2str(sym));
11891sym_length(
VALUE sym)
11905sym_empty(
VALUE sym)
11923 return rb_str_intern(rb_str_upcase(argc, argv,
rb_sym2str(sym)));
11939sym_downcase(
int argc,
VALUE *argv,
VALUE sym)
11941 return rb_str_intern(rb_str_downcase(argc, argv,
rb_sym2str(sym)));
11955sym_capitalize(
int argc,
VALUE *argv,
VALUE sym)
11957 return rb_str_intern(rb_str_capitalize(argc, argv,
rb_sym2str(sym)));
11971sym_swapcase(
int argc,
VALUE *argv,
VALUE sym)
11973 return rb_str_intern(rb_str_swapcase(argc, argv,
rb_sym2str(sym)));
11985sym_start_with(
int argc,
VALUE *argv,
VALUE sym)
11987 return rb_str_start_with(argc, argv,
rb_sym2str(sym));
12000sym_end_with(
int argc,
VALUE *argv,
VALUE sym)
12002 return rb_str_end_with(argc, argv,
rb_sym2str(sym));
12014sym_encoding(
VALUE sym)
12020string_for_symbol(
VALUE name)
12039 name = string_for_symbol(name);
12040 return rb_intern_str(name);
12049 name = string_for_symbol(name);
12050 return rb_str_intern(name);
12073 return rb_fstring(str);
12080 return register_fstring(setup_fake_str(&fake_str, ptr,
len, ENCINDEX_US_ASCII), TRUE);
12092 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12093 rb_enc_autoload(enc);
12097 return register_fstring(rb_setup_fake_str(&fake_str, ptr,
len, enc), TRUE);
12110 assert(rb_vm_fstring_table());
12111 st_foreach(rb_vm_fstring_table(), fstring_set_class_i,
rb_cString);
12276 rb_gc_register_address(&
rb_fs);
#define RUBY_ASSERT(expr)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
#define RUBY_ASSERT_ALWAYS(expr)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_module(const char *name)
Defines a top-level module.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Undefines a method of a class.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define TYPE(_)
Old name of rb_type.
#define NEWOBJ_OF
Old name of RB_NEWOBJ_OF.
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
#define ALLOCV
Old name of RB_ALLOCV.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
#define xfree
Old name of ruby_xfree.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
#define ID2SYM
Old name of RB_ID2SYM.
#define OBJ_FREEZE_RAW
Old name of RB_OBJ_FREEZE_RAW.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
#define SYM2ID
Old name of RB_SYM2ID.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define SIZET2NUM
Old name of RB_SIZE2NUM.
#define FIXABLE
Old name of RB_FIXABLE.
#define xmalloc
Old name of ruby_xmalloc.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define ISDIGIT
Old name of rb_isdigit.
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
#define ZALLOC_N
Old name of RB_ZALLOC_N.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define rb_ary_new3
Old name of rb_ary_new_from_args.
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define ISALPHA
Old name of rb_isalpha.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define ISASCII
Old name of rb_isascii.
#define TOLOWER
Old name of rb_tolower.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
#define NUM2INT
Old name of RB_NUM2INT.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define DBL2NUM
Old name of rb_float_new.
#define ISPRINT
Old name of rb_isprint.
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
#define FL_TEST
Old name of RB_FL_TEST.
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define UINT2NUM
Old name of RB_UINT2NUM.
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
VALUE rb_eRangeError
RangeError exception.
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
VALUE rb_cSymbol
Symbol class.
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
VALUE rb_mComparable
Comparable module.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
return value of rb_econv_convert()
@ econv_finished
The conversion stopped after converting everything.
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
#define rb_check_frozen
Just another name of rb_check_frozen.
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_fs
The field separator character for inputs, or the $;.
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current pr...
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby's string instead of C's.
void rb_str_free(VALUE str)
Destroys the given string for no reason.
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
#define rb_hash_end(h)
Just another name of st_hash_end.
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "defaultexternal" encoding.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "defaultexternal" encoding.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
#define rb_strlen_lit(str)
Length of a string literal.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
int capa
Designed capacity of the buffer.
int off
Offset inside of ptr.
int len
Length of the buffer.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
#define RBASIC(obj)
Convenient casting macro.
#define DATA_PTR(obj)
Convenient getter macro.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
#define StringValue(v)
Ensures that the parameter object is a String.
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
#define RSTRING(obj)
Convenient casting macro.
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
struct rb_data_type_struct rb_data_type_t
This is the struct that holds necessary info for a struct.
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
#define errno
Ractor-aware version of errno.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
struct RBasic basic
Basic part, including flags and class.
long capa
Capacity of *ptr.
long len
Length of the string, not including terminating NUL character.
union RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024::@365170260060164113275356137374160141226332013204 aux
Auxiliary info.
struct RString::@157025041137035241047331270155043025061071337053::@153056146250355212360325351117351053336274231135 embed
Embedded contents.
VALUE shared
Parent of the string.
char * ptr
Pointer to the contents of the string.
union RString::@157025041137035241047331270155043025061071337053 as
String's specific fields.
struct RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024 heap
Strings that use separated memory region for contents use this pattern.
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.