Ruby 2.7.2p137 (2020-10-01 revision 5445e0435260b449decf2ac16f9d09bae3cafe72)
string.c
Go to the documentation of this file.
1 /**********************************************************************
2 
3  string.c -
4 
5  $Author$
6  created at: Mon Aug 9 17:12:58 JST 1993
7 
8  Copyright (C) 1993-2007 Yukihiro Matsumoto
9  Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10  Copyright (C) 2000 Information-technology Promotion Agency, Japan
11 
12 **********************************************************************/
13 
14 #include "ruby/encoding.h"
15 #include "ruby/re.h"
16 #include "internal.h"
17 #include "encindex.h"
18 #include "probes.h"
19 #include "gc.h"
20 #include "ruby_assert.h"
21 #include "id.h"
22 #include "debug_counter.h"
23 #include "ruby/util.h"
24 
25 #define BEG(no) (regs->beg[(no)])
26 #define END(no) (regs->end[(no)])
27 
28 #include <errno.h>
29 #include <math.h>
30 #include <ctype.h>
31 
32 #ifdef HAVE_UNISTD_H
33 #include <unistd.h>
34 #endif
35 
36 #if defined HAVE_CRYPT_R
37 # if defined HAVE_CRYPT_H
38 # include <crypt.h>
39 # endif
40 #elif !defined HAVE_CRYPT
41 # include "missing/crypt.h"
42 # define HAVE_CRYPT_R 1
43 #endif
44 
45 #undef rb_str_new
46 #undef rb_usascii_str_new
47 #undef rb_utf8_str_new
48 #undef rb_enc_str_new
49 #undef rb_str_new_cstr
50 #undef rb_tainted_str_new_cstr
51 #undef rb_usascii_str_new_cstr
52 #undef rb_utf8_str_new_cstr
53 #undef rb_enc_str_new_cstr
54 #undef rb_external_str_new_cstr
55 #undef rb_locale_str_new_cstr
56 #undef rb_str_dup_frozen
57 #undef rb_str_buf_new_cstr
58 #undef rb_str_buf_cat
59 #undef rb_str_buf_cat2
60 #undef rb_str_cat2
61 #undef rb_str_cat_cstr
62 #undef rb_fstring_cstr
63 
64 static VALUE rb_str_clear(VALUE str);
65 
68 
69 /* FLAGS of RString
70  *
71  * 1: RSTRING_NOEMBED
72  * 2: STR_SHARED (== ELTS_SHARED)
73  * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
74  * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
75  * other strings that rely on this string's buffer)
76  * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
77  * early, specific to rb_str_tmp_frozen_{acquire,release})
78  * 7: STR_TMPLOCK
79  * 8-9: ENC_CODERANGE (2 bits)
80  * 10-16: ENCODING (7 bits == 128)
81  * 17: RSTRING_FSTR
82  * 18: STR_NOFREE
83  * 19: STR_FAKESTR
84  */
85 
86 #define RUBY_MAX_CHAR_LEN 16
87 #define STR_SHARED_ROOT FL_USER5
88 #define STR_BORROWED FL_USER6
89 #define STR_TMPLOCK FL_USER7
90 #define STR_NOFREE FL_USER18
91 #define STR_FAKESTR FL_USER19
92 
/*
 * STR_SET_NOEMBED(str): set the STR_NOEMBED flag on str and reset the
 * embedded-length bits to 0 (via STR_SET_EMBED_LEN).
 */
93 #define STR_SET_NOEMBED(str) do {\
94  FL_SET((str), STR_NOEMBED);\
95  STR_SET_EMBED_LEN((str), 0);\
96 } while (0)
/* STR_SET_EMBED(str): clear both STR_NOEMBED and STR_NOFREE on str. */
97 #define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
/*
 * STR_SET_EMBED_LEN(str, n): store n in the embedded-length bit field of
 * str's flags.  The old field is cleared with RSTRING_EMBED_LEN_MASK and
 * the new value is shifted in by RSTRING_EMBED_LEN_SHIFT.  n is evaluated
 * exactly once (copied to tmp_n); str is evaluated more than once.
 */
98 #define STR_SET_EMBED_LEN(str, n) do { \
99  long tmp_n = (n);\
100  RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
101  RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
102 } while (0)
103 
/*
 * STR_SET_LEN(str, n): set the length of str to n.  For an embedded string
 * (STR_EMBED_P) the length goes into the flag bits; otherwise it is stored
 * in as.heap.len.  No buffer is resized and no terminator is written here.
 */
104 #define STR_SET_LEN(str, n) do { \
105  if (STR_EMBED_P(str)) {\
106  STR_SET_EMBED_LEN((str), (n));\
107  }\
108  else {\
109  RSTRING(str)->as.heap.len = (n);\
110  }\
111 } while (0)
112 
/*
 * STR_DEC_LEN(str): decrease the length of str by one, in whichever
 * representation (embedded flag bits or as.heap.len) the string uses.
 */
113 #define STR_DEC_LEN(str) do {\
114  if (STR_EMBED_P(str)) {\
115  long n = RSTRING_LEN(str);\
116  n--;\
117  STR_SET_EMBED_LEN((str), n);\
118  }\
119  else {\
120  RSTRING(str)->as.heap.len--;\
121  }\
122 } while (0)
123 
/*
 * TERM_LEN(str): terminator width for str, taken as the minimum character
 * length (rb_enc_mbminlen) of the encoding returned by rb_enc_get(str).
 */
124 #define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))
/*
 * TERM_FILL(ptr, termlen): write a terminator at ptr.  One zero byte is
 * always stored; when termlen is greater than 1 the whole termlen-byte
 * span is zeroed with memset.  Both arguments are evaluated exactly once.
 */
125 #define TERM_FILL(ptr, termlen) do {\
126  char *const term_fill_ptr = (ptr);\
127  const int term_fill_len = (termlen);\
128  *term_fill_ptr = '\0';\
129  if (UNLIKELY(term_fill_len > 1))\
130  memset(term_fill_ptr, 0, term_fill_len);\
131 } while (0)
132 
/*
 * RESIZE_CAPA(str, capacity): RESIZE_CAPA_TERM with the terminator width
 * derived from str's encoding (TERM_LEN).
 */
133 #define RESIZE_CAPA(str,capacity) do {\
134  const int termlen = TERM_LEN(str);\
135  RESIZE_CAPA_TERM(str,capacity,termlen);\
136 } while (0)
/*
 * RESIZE_CAPA_TERM(str, capacity, termlen): make str's buffer able to hold
 * `capacity` bytes plus a `termlen`-byte terminator.
 *
 *  - Embedded string whose requested capacity still fits the embedded
 *    area (STR_EMBEDDABLE_P): nothing is done.
 *  - Embedded string that no longer fits: a heap buffer of
 *    capacity + termlen bytes is allocated, the current contents are
 *    copied into it, and the string is switched to the heap
 *    representation with aux.capa = capacity.
 *  - Heap string: the buffer is reallocated to capacity + termlen bytes
 *    and aux.capa is updated.  The string must not be STR_SHARED
 *    (asserted).
 *
 * The macro does not write a terminator and does not change the length.
 * NOTE(review): the last argument of SIZED_REALLOC_N is presumably the
 * previous allocation size -- confirm against its definition.
 */
137 #define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
138  if (STR_EMBED_P(str)) {\
139  if (!STR_EMBEDDABLE_P(capacity, termlen)) {\
140  char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
141  const long tlen = RSTRING_LEN(str);\
142  memcpy(tmp, RSTRING_PTR(str), tlen);\
143  RSTRING(str)->as.heap.ptr = tmp;\
144  RSTRING(str)->as.heap.len = tlen;\
145  STR_SET_NOEMBED(str);\
146  RSTRING(str)->as.heap.aux.capa = (capacity);\
147  }\
148  }\
149  else {\
150  assert(!FL_TEST((str), STR_SHARED)); \
151  SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
152  (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
153  RSTRING(str)->as.heap.aux.capa = (capacity);\
154  }\
155 } while (0)
156 
/*
 * STR_SET_SHARED(str, shared_str): record shared_str in str's
 * as.heap.aux.shared slot.
 *
 * Nothing happens when str carries STR_FAKESTR.  Otherwise the reference
 * is stored through RB_OBJ_WRITE, str is flagged STR_SHARED and shared_str
 * is flagged STR_SHARED_ROOT.  If shared_str has no class
 * (RBASIC_CLASS == 0) it is additionally flagged STR_BORROWED.
 */
157 #define STR_SET_SHARED(str, shared_str) do { \
158  if (!FL_TEST(str, STR_FAKESTR)) { \
159  RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
160  FL_SET((str), STR_SHARED); \
161  FL_SET((shared_str), STR_SHARED_ROOT); \
162  if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
163  FL_SET_RAW((shared_str), STR_BORROWED); \
164  } \
165 } while (0)
166 
167 #define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
168 #define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
169 
170 #define STR_ENC_GET(str) get_encoding(str)
171 
172 #if !defined SHARABLE_MIDDLE_SUBSTRING
173 # define SHARABLE_MIDDLE_SUBSTRING 0
174 #endif
175 #if !SHARABLE_MIDDLE_SUBSTRING
176 #define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
177 #else
178 #define SHARABLE_SUBSTRING_P(beg, len, end) 1
179 #endif
180 
181 #define STR_EMBEDDABLE_P(len, termlen) \
182  ((len) <= RSTRING_EMBED_LEN_MAX + 1 - (termlen))
183 
184 static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
185 static VALUE str_new_shared(VALUE klass, VALUE str);
186 static VALUE str_new_frozen(VALUE klass, VALUE orig);
187 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
188 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
189 static inline void str_modifiable(VALUE str);
190 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
191 
192 static inline void
193 str_make_independent(VALUE str)
194 {
195  long len = RSTRING_LEN(str);
196  int termlen = TERM_LEN(str);
197  str_make_independent_expand((str), len, 0L, termlen);
198 }
199 
200 /* symbols for [up|down|swap]case/capitalize options */
201 static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
202 
203 static rb_encoding *
204 get_actual_encoding(const int encidx, VALUE str)
205 {
206  const unsigned char *q;
207 
208  switch (encidx) {
209  case ENCINDEX_UTF_16:
210  if (RSTRING_LEN(str) < 2) break;
211  q = (const unsigned char *)RSTRING_PTR(str);
212  if (q[0] == 0xFE && q[1] == 0xFF) {
214  }
215  if (q[0] == 0xFF && q[1] == 0xFE) {
217  }
218  return rb_ascii8bit_encoding();
219  case ENCINDEX_UTF_32:
220  if (RSTRING_LEN(str) < 4) break;
221  q = (const unsigned char *)RSTRING_PTR(str);
222  if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
224  }
225  if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
227  }
228  return rb_ascii8bit_encoding();
229  }
230  return rb_enc_from_index(encidx);
231 }
232 
233 static rb_encoding *
234 get_encoding(VALUE str)
235 {
236  return get_actual_encoding(ENCODING_GET(str), str);
237 }
238 
239 static void
240 mustnot_broken(VALUE str)
241 {
242  if (is_broken_string(str)) {
243  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
244  }
245 }
246 
247 static void
248 mustnot_wchar(VALUE str)
249 {
250  rb_encoding *enc = STR_ENC_GET(str);
251  if (rb_enc_mbminlen(enc) > 1) {
252  rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
253  }
254 }
255 
256 static int fstring_cmp(VALUE a, VALUE b);
257 
258 static VALUE register_fstring(VALUE str);
259 
/*
 * st hash type built from fstring_cmp and rb_str_hash.
 * NOTE(review): the two members are presumably the compare and hash
 * callbacks of struct st_hash_type, in that order -- confirm against st.h.
 */
260 const struct st_hash_type rb_fstring_hash_type = {
261  fstring_cmp,
262  rb_str_hash,
263 };
264 
265 #define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
266 
267 static int
268 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t arg, int existing)
269 {
270  VALUE *fstr = (VALUE *)arg;
271  VALUE str = (VALUE)*key;
272 
273  if (existing) {
274  /* because of lazy sweep, str may be unmarked already and swept
275  * at next time */
276 
278  *fstr = Qundef;
279  return ST_DELETE;
280  }
281 
282  *fstr = str;
283  return ST_STOP;
284  }
285  else {
286  if (FL_TEST_RAW(str, STR_FAKESTR)) {
287  str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
288  RSTRING(str)->as.heap.len,
289  ENCODING_GET(str));
291  }
292  else {
293  str = str_new_frozen(rb_cString, str);
294  if (STR_SHARED_P(str)) { /* str should not be shared */
295  /* shared substring */
296  str_make_independent(str);
298  }
299  if (!BARE_STRING_P(str)) {
300  str = str_new_frozen(rb_cString, str);
301  }
302  }
303  RBASIC(str)->flags |= RSTRING_FSTR;
304 
305  *key = *value = *fstr = str;
306  return ST_CONTINUE;
307  }
308 }
309 
311 VALUE
313 {
314  VALUE fstr;
315  int bare;
316 
318 
319  if (FL_TEST(str, RSTRING_FSTR))
320  return str;
321 
322  bare = BARE_STRING_P(str);
323  if (!bare) {
324  if (STR_EMBED_P(str)) {
326  return str;
327  }
330  return str;
331  }
332  }
333 
334  if (!OBJ_FROZEN(str))
336 
337  fstr = register_fstring(str);
338 
339  if (!bare) {
340  str_replace_shared_without_enc(str, fstr);
342  return str;
343  }
344  return fstr;
345 }
346 
/*
 * Run str through the table returned by rb_vm_fstring_table() and return
 * the resulting string.
 *
 * The work is done by fstr_update_callback(), which receives &ret and
 * stores its result there.  When the callback leaves Qundef in ret, the
 * update is simply retried with the same key.
 *
 * The result is asserted to be frozen, to have no FL_EXIVAR flag and to
 * have rb_cString as its class.
 */
347 static VALUE
348 register_fstring(VALUE str)
349 {
350  VALUE ret;
351  st_table *frozen_strings = rb_vm_fstring_table();
352 
353  do {
354  ret = str;
355  st_update(frozen_strings, (st_data_t)str,
356  fstr_update_callback, (st_data_t)&ret);
357  } while (ret == Qundef);
358 
359  assert(OBJ_FROZEN(ret));
 /* NOTE(review): this listing skips a line here (numbering jumps from
  * 359 to 361); its content is not visible. */
361  assert(!FL_TEST_RAW(ret, FL_EXIVAR));
362  assert(RBASIC_CLASS(ret) == rb_cString);
363  return ret;
364 }
365 
366 static VALUE
367 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
368 {
370  /* SHARED to be allocated by the callback */
371 
372  ENCODING_SET_INLINED((VALUE)fake_str, encidx);
373 
375  fake_str->as.heap.len = len;
376  fake_str->as.heap.ptr = (char *)name;
377  fake_str->as.heap.aux.capa = len;
378  return (VALUE)fake_str;
379 }
380 
381 /*
382  * set up a fake string which refers a static string literal.
383  */
384 VALUE
385 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
386 {
387  return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
388 }
389 
390 /*
391  * rb_fstring_new and rb_fstring_cstr family create or look up a frozen
392  * shared string which refers to a static string literal. `ptr` must
393  * point to a constant string.
394  */
396 rb_fstring_new(const char *ptr, long len)
397 {
398  struct RString fake_str;
399  return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII));
400 }
401 
402 VALUE
403 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
404 {
405  struct RString fake_str;
406  return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc));
407 }
408 
409 VALUE
410 rb_fstring_cstr(const char *ptr)
411 {
412  return rb_fstring_new(ptr, strlen(ptr));
413 }
414 
415 static int
416 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
417 {
419  return ST_CONTINUE;
420 }
421 
422 static int
423 fstring_cmp(VALUE a, VALUE b)
424 {
425  long alen, blen;
426  const char *aptr, *bptr;
427  RSTRING_GETMEM(a, aptr, alen);
428  RSTRING_GETMEM(b, bptr, blen);
429  return (alen != blen ||
430  ENCODING_GET(a) != ENCODING_GET(b) ||
431  memcmp(aptr, bptr, alen) != 0);
432 }
433 
434 static inline int
435 single_byte_optimizable(VALUE str)
436 {
437  rb_encoding *enc;
438 
439  /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
441  return 1;
442 
443  enc = STR_ENC_GET(str);
444  if (rb_enc_mbmaxlen(enc) == 1)
445  return 1;
446 
447  /* Conservative. Possibly single byte.
448  * "\xa1" in Shift_JIS for example. */
449  return 0;
450 }
451 
453 
/*
 * Find the first byte in [p, e) whose high bit (0x80) is set.
 *
 * Returns a pointer to that byte, or NULL when every byte in the range is
 * below 0x80.  Most of the range is scanned one machine word at a time;
 * the bytes before the first aligned word (only when unaligned word access
 * is not allowed) and the bytes after the last full word are checked one
 * by one.
 */
454 static inline const char *
455 search_nonascii(const char *p, const char *e)
456 {
457  const uintptr_t *s, *t;
458 
 /* NONASCII_MASK: a uintptr_t-sized constant with 0x80 in every byte. */
459 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
460 # if SIZEOF_UINTPTR_T == 8
461 # define NONASCII_MASK UINT64_C(0x8080808080808080)
462 # elif SIZEOF_UINTPTR_T == 4
463 # define NONASCII_MASK UINT32_C(0x80808080)
464 # else
465 # error "don't know what to do."
466 # endif
467 #else
468 # if SIZEOF_UINTPTR_T == 8
469 # define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
470 # elif SIZEOF_UINTPTR_T == 4
471 # define NONASCII_MASK 0x80808080UL /* or...? */
472 # else
473 # error "don't know what to do."
474 # endif
475 #endif
476 
 /* Word-at-a-time scan; when unaligned access is not allowed it is only
  * entered if at least one pointer-sized span is available. */
477  if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
478 #if !UNALIGNED_WORD_ACCESS
 /* Advance p to the next SIZEOF_VOIDP boundary, testing each skipped
  * byte.  l is the number of bytes skipped; the cases below fall
  * through on purpose so that all l bytes are examined in order. */
479  if ((uintptr_t)p % SIZEOF_VOIDP) {
480  int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
481  p += l;
482  switch (l) {
483  default: UNREACHABLE;
484 #if SIZEOF_VOIDP > 4
485  case 7: if (p[-7]&0x80) return p-7;
486  case 6: if (p[-6]&0x80) return p-6;
487  case 5: if (p[-5]&0x80) return p-5;
488  case 4: if (p[-4]&0x80) return p-4;
489 #endif
490  case 3: if (p[-3]&0x80) return p-3;
491  case 2: if (p[-2]&0x80) return p-2;
492  case 1: if (p[-1]&0x80) return p-1;
493  case 0: break;
494  }
495  }
496 #endif
497 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
498 #define aligned_ptr(value) \
499  __builtin_assume_aligned((value), sizeof(uintptr_t))
500 #else
501 #define aligned_ptr(value) (uintptr_t *)(value)
502 #endif
503  s = aligned_ptr(p);
 /* s < t holds exactly while a full word starting at s still lies
  * before e. */
504  t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
505 #undef aligned_ptr
506  for (;s < t; s++) {
507  if (*s & NONASCII_MASK) {
 /* Turn the position of the first set mask bit into a byte offset
  * (bit index >> 3).  NOTE(review): nlz_intptr / ntz_intptr
  * presumably count leading / trailing zero bits -- confirm. */
508 #ifdef WORDS_BIGENDIAN
509  return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
510 #else
511  return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
512 #endif
513  }
514  }
515  p = (const char *)s;
516  }
517 
 /* Check the remaining e - p bytes one by one, indexed back from e;
  * the cases fall through on purpose. */
518  switch (e - p) {
519  default: UNREACHABLE;
520 #if SIZEOF_VOIDP > 4
521  case 7: if (e[-7]&0x80) return e-7;
522  case 6: if (e[-6]&0x80) return e-6;
523  case 5: if (e[-5]&0x80) return e-5;
524  case 4: if (e[-4]&0x80) return e-4;
525 #endif
526  case 3: if (e[-3]&0x80) return e-3;
527  case 2: if (e[-2]&0x80) return e-2;
528  case 1: if (e[-1]&0x80) return e-1;
529  case 0: return NULL;
530  }
531 }
532 
/*
 * Scan the len bytes at p as text in encoding enc and classify them.
 *
 * For an ASCII-compatible encoding:
 *   - ENC_CODERANGE_7BIT   when no byte has its high bit set;
 *   - ENC_CODERANGE_BROKEN when rb_enc_precise_mbclen() fails to find a
 *     complete character at a non-ASCII position;
 *   - ENC_CODERANGE_VALID  otherwise.
 * For other encodings every character is checked with
 * rb_enc_precise_mbclen(); the result is BROKEN on the first failure and
 * VALID otherwise.
 */
533 static int
534 coderange_scan(const char *p, long len, rb_encoding *enc)
535 {
536  const char *e = p + len;
537 
538  if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
539  /* enc is ASCII-8BIT. An ASCII-8BIT string is never broken. */
540  p = search_nonascii(p, e);
 /* NOTE(review): this listing skips a line here (numbering jumps from
  * 540 to 542); the statement that uses p is not visible. */
542  }
543 
544  if (rb_enc_asciicompat(enc)) {
 /* Jump straight to the first non-ASCII byte; pure ASCII is 7BIT. */
545  p = search_nonascii(p, e);
546  if (!p) return ENC_CODERANGE_7BIT;
547  for (;;) {
 /* Validate one multibyte character, then skip the following
  * run of ASCII bytes. */
548  int ret = rb_enc_precise_mbclen(p, e, enc);
549  if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
550  p += MBCLEN_CHARFOUND_LEN(ret);
551  if (p == e) break;
552  p = search_nonascii(p, e);
553  if (!p) break;
554  }
555  }
556  else {
 /* Not ASCII-compatible: validate character by character. */
557  while (p < e) {
558  int ret = rb_enc_precise_mbclen(p, e, enc);
559  if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
560  p += MBCLEN_CHARFOUND_LEN(ret);
561  }
562  }
563  return ENC_CODERANGE_VALID;
564 }
565 
566 long
567 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
568 {
569  const char *p = s;
570 
571  if (*cr == ENC_CODERANGE_BROKEN)
572  return e - s;
573 
574  if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
575  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
576  if (*cr == ENC_CODERANGE_VALID) return e - s;
577  p = search_nonascii(p, e);
579  return e - s;
580  }
581  else if (rb_enc_asciicompat(enc)) {
582  p = search_nonascii(p, e);
583  if (!p) {
584  if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
585  return e - s;
586  }
587  for (;;) {
588  int ret = rb_enc_precise_mbclen(p, e, enc);
589  if (!MBCLEN_CHARFOUND_P(ret)) {
591  return p - s;
592  }
593  p += MBCLEN_CHARFOUND_LEN(ret);
594  if (p == e) break;
595  p = search_nonascii(p, e);
596  if (!p) break;
597  }
598  }
599  else {
600  while (p < e) {
601  int ret = rb_enc_precise_mbclen(p, e, enc);
602  if (!MBCLEN_CHARFOUND_P(ret)) {
604  return p - s;
605  }
606  p += MBCLEN_CHARFOUND_LEN(ret);
607  }
608  }
609  *cr = ENC_CODERANGE_VALID;
610  return e - s;
611 }
612 
613 static inline void
614 str_enc_copy(VALUE str1, VALUE str2)
615 {
616  rb_enc_set_index(str1, ENCODING_GET(str2));
617 }
618 
619 static void
620 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
621 {
622  /* this function is designed for copying encoding and coderange
623  * from src to new string "dest" which is made from the part of src.
624  */
625  str_enc_copy(dest, src);
626  if (RSTRING_LEN(dest) == 0) {
629  else
631  return;
632  }
633  switch (ENC_CODERANGE(src)) {
634  case ENC_CODERANGE_7BIT:
636  break;
637  case ENC_CODERANGE_VALID:
639  search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
641  else
643  break;
644  default:
645  break;
646  }
647 }
648 
649 static void
650 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
651 {
652  str_enc_copy(dest, src);
654 }
655 
656 int
658 {
659  int cr = ENC_CODERANGE(str);
660 
661  if (cr == ENC_CODERANGE_UNKNOWN) {
662  int encidx = ENCODING_GET(str);
663  rb_encoding *enc = rb_enc_from_index(encidx);
664  if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
665  rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
667  }
668  else {
669  cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str),
670  enc);
671  }
672  ENC_CODERANGE_SET(str, cr);
673  }
674  return cr;
675 }
676 
677 int
679 {
680  rb_encoding *enc = STR_ENC_GET(str);
681 
682  if (!rb_enc_asciicompat(enc))
683  return FALSE;
685  return TRUE;
686  return FALSE;
687 }
688 
689 static inline void
690 str_mod_check(VALUE s, const char *p, long len)
691 {
692  if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
693  rb_raise(rb_eRuntimeError, "string modified");
694  }
695 }
696 
697 static size_t
698 str_capacity(VALUE str, const int termlen)
699 {
700  if (STR_EMBED_P(str)) {
701  return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
702  }
703  else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
704  return RSTRING(str)->as.heap.len;
705  }
706  else {
707  return RSTRING(str)->as.heap.aux.capa;
708  }
709 }
710 
711 size_t
713 {
714  return str_capacity(str, TERM_LEN(str));
715 }
716 
717 static inline void
718 must_not_null(const char *ptr)
719 {
720  if (!ptr) {
721  rb_raise(rb_eArgError, "NULL pointer given");
722  }
723 }
724 
725 static inline VALUE
726 str_alloc(VALUE klass)
727 {
729  return (VALUE)str;
730 }
731 
732 static inline VALUE
733 empty_str_alloc(VALUE klass)
734 {
735  RUBY_DTRACE_CREATE_HOOK(STRING, 0);
736  return str_alloc(klass);
737 }
738 
739 static VALUE
740 str_new0(VALUE klass, const char *ptr, long len, int termlen)
741 {
742  VALUE str;
743 
744  if (len < 0) {
745  rb_raise(rb_eArgError, "negative string size (or size too big)");
746  }
747 
748  RUBY_DTRACE_CREATE_HOOK(STRING, len);
749 
750  str = str_alloc(klass);
751  if (!STR_EMBEDDABLE_P(len, termlen)) {
752  RSTRING(str)->as.heap.aux.capa = len;
753  RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)len + termlen);
755  }
756  else if (len == 0) {
758  }
759  if (ptr) {
761  }
762  STR_SET_LEN(str, len);
763  TERM_FILL(RSTRING_PTR(str) + len, termlen);
764  return str;
765 }
766 
767 static VALUE
768 str_new(VALUE klass, const char *ptr, long len)
769 {
770  return str_new0(klass, ptr, len, 1);
771 }
772 
773 VALUE
774 rb_str_new(const char *ptr, long len)
775 {
776  return str_new(rb_cString, ptr, len);
777 }
778 
779 VALUE
780 rb_usascii_str_new(const char *ptr, long len)
781 {
782  VALUE str = rb_str_new(ptr, len);
784  return str;
785 }
786 
787 VALUE
788 rb_utf8_str_new(const char *ptr, long len)
789 {
790  VALUE str = str_new(rb_cString, ptr, len);
792  return str;
793 }
794 
795 VALUE
796 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
797 {
798  VALUE str;
799 
800  if (!enc) return rb_str_new(ptr, len);
801 
802  str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
803  rb_enc_associate(str, enc);
804  return str;
805 }
806 
807 VALUE
808 rb_str_new_cstr(const char *ptr)
809 {
810  must_not_null(ptr);
811  /* rb_str_new_cstr() can take pointer from non-malloc-generated
812  * memory regions, and that cannot be detected by the MSAN. Just
813  * trust the programmer that the argument passed here is a sane C
814  * string. */
816  return rb_str_new(ptr, strlen(ptr));
817 }
818 
819 VALUE
821 {
824  return str;
825 }
826 
827 VALUE
829 {
832  return str;
833 }
834 
835 VALUE
837 {
838  must_not_null(ptr);
839  if (rb_enc_mbminlen(enc) != 1) {
840  rb_raise(rb_eArgError, "wchar encoding given");
841  }
842  return rb_enc_str_new(ptr, strlen(ptr), enc);
843 }
844 
845 static VALUE
846 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
847 {
848  VALUE str;
849 
850  if (len < 0) {
851  rb_raise(rb_eArgError, "negative string size (or size too big)");
852  }
853 
854  if (!ptr) {
855  rb_encoding *enc = rb_enc_get_from_index(encindex);
856  str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
857  }
858  else {
859  RUBY_DTRACE_CREATE_HOOK(STRING, len);
860  str = str_alloc(klass);
861  RSTRING(str)->as.heap.len = len;
862  RSTRING(str)->as.heap.ptr = (char *)ptr;
863  RSTRING(str)->as.heap.aux.capa = len;
865  RBASIC(str)->flags |= STR_NOFREE;
866  }
867  rb_enc_associate_index(str, encindex);
868  return str;
869 }
870 
871 VALUE
872 rb_str_new_static(const char *ptr, long len)
873 {
874  return str_new_static(rb_cString, ptr, len, 0);
875 }
876 
877 VALUE
879 {
880  return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
881 }
882 
883 VALUE
884 rb_utf8_str_new_static(const char *ptr, long len)
885 {
886  return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
887 }
888 
889 VALUE
890 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
891 {
892  return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
893 }
894 
895 VALUE
896 rb_tainted_str_new(const char *ptr, long len)
897 {
898  rb_warning("rb_tainted_str_new is deprecated and will be removed in Ruby 3.2.");
899  return rb_str_new(ptr, len);
900 }
901 
902 VALUE
904 {
905  rb_warning("rb_tainted_str_new_cstr is deprecated and will be removed in Ruby 3.2.");
906  return rb_str_new_cstr(ptr);
907 }
908 
909 static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
910  rb_encoding *from, rb_encoding *to,
911  int ecflags, VALUE ecopts);
912 
913 VALUE
914 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
915 {
916  long len;
917  const char *ptr;
918  VALUE newstr;
919 
920  if (!to) return str;
921  if (!from) from = rb_enc_get(str);
922  if (from == to) return str;
923  if ((rb_enc_asciicompat(to) && is_ascii_string(str)) ||
924  to == rb_ascii8bit_encoding()) {
925  if (STR_ENC_GET(str) != to) {
926  str = rb_str_dup(str);
927  rb_enc_associate(str, to);
928  }
929  return str;
930  }
931 
933  newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
934  from, to, ecflags, ecopts);
935  if (NIL_P(newstr)) {
936  /* some error, return original */
937  return str;
938  }
939  return newstr;
940 }
941 
942 VALUE
943 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
944  rb_encoding *from, int ecflags, VALUE ecopts)
945 {
946  long olen;
947 
948  olen = RSTRING_LEN(newstr);
949  if (ofs < -olen || olen < ofs)
950  rb_raise(rb_eIndexError, "index %ld out of string", ofs);
951  if (ofs < 0) ofs += olen;
952  if (!from) {
953  STR_SET_LEN(newstr, ofs);
954  return rb_str_cat(newstr, ptr, len);
955  }
956 
957  rb_str_modify(newstr);
958  return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
959  rb_enc_get(newstr),
960  ecflags, ecopts);
961 }
962 
963 VALUE
964 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
965 {
966  STR_SET_LEN(str, 0);
967  rb_enc_associate(str, enc);
968  rb_str_cat(str, ptr, len);
969  return str;
970 }
971 
972 static VALUE
973 str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
974  rb_encoding *from, rb_encoding *to,
975  int ecflags, VALUE ecopts)
976 {
977  rb_econv_t *ec;
978  rb_econv_result_t ret;
979  long olen;
980  VALUE econv_wrapper;
981  const unsigned char *start, *sp;
982  unsigned char *dest, *dp;
983  size_t converted_output = (size_t)ofs;
984 
985  olen = rb_str_capacity(newstr);
986 
987  econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
988  RBASIC_CLEAR_CLASS(econv_wrapper);
989  ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
990  if (!ec) return Qnil;
991  DATA_PTR(econv_wrapper) = ec;
992 
993  sp = (unsigned char*)ptr;
994  start = sp;
995  while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
996  (dp = dest + converted_output),
997  (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
999  /* destination buffer short */
1000  size_t converted_input = sp - start;
1001  size_t rest = len - converted_input;
1002  converted_output = dp - dest;
1003  rb_str_set_len(newstr, converted_output);
1004  if (converted_input && converted_output &&
1005  rest < (LONG_MAX / converted_output)) {
1006  rest = (rest * converted_output) / converted_input;
1007  }
1008  else {
1009  rest = olen;
1010  }
1011  olen += rest < 2 ? 2 : rest;
1012  rb_str_resize(newstr, olen);
1013  }
1014  DATA_PTR(econv_wrapper) = 0;
1015  rb_econv_close(ec);
1016  rb_gc_force_recycle(econv_wrapper);
1017  switch (ret) {
1018  case econv_finished:
1019  len = dp - (unsigned char*)RSTRING_PTR(newstr);
1020  rb_str_set_len(newstr, len);
1021  rb_enc_associate(newstr, to);
1022  return newstr;
1023 
1024  default:
1025  return Qnil;
1026  }
1027 }
1028 
1029 VALUE
1031 {
1032  return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1033 }
1034 
1035 VALUE
1037 {
1038  rb_encoding *ienc;
1039  VALUE str;
1040  const int eidx = rb_enc_to_index(eenc);
1041 
1042  if (!ptr) {
1043  return rb_enc_str_new(ptr, len, eenc);
1044  }
1045 
1046  /* ASCII-8BIT case, no conversion */
1047  if ((eidx == rb_ascii8bit_encindex()) ||
1048  (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1049  return rb_str_new(ptr, len);
1050  }
1051  /* no default_internal or same encoding, no conversion */
1053  if (!ienc || eenc == ienc) {
1054  return rb_enc_str_new(ptr, len, eenc);
1055  }
1056  /* ASCII compatible, and ASCII only string, no conversion in
1057  * default_internal */
1058  if ((eidx == rb_ascii8bit_encindex()) ||
1059  (eidx == rb_usascii_encindex()) ||
1060  (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1061  return rb_enc_str_new(ptr, len, ienc);
1062  }
1063  /* convert from the given encoding to default_internal */
1064  str = rb_enc_str_new(NULL, 0, ienc);
1065  /* when the conversion failed for some reason, just ignore the
1066  * default_internal and result in the given encoding as-is. */
1067  if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1068  rb_str_initialize(str, ptr, len, eenc);
1069  }
1070  return str;
1071 }
1072 
1073 VALUE
1075 {
1076  int eidx = rb_enc_to_index(eenc);
1077  if (eidx == rb_usascii_encindex() &&
1080  return str;
1081  }
1082  rb_enc_associate_index(str, eidx);
1084 }
1085 
1086 VALUE
1087 rb_external_str_new(const char *ptr, long len)
1088 {
1090 }
1091 
1092 VALUE
1094 {
1096 }
1097 
1098 VALUE
1099 rb_locale_str_new(const char *ptr, long len)
1100 {
1102 }
1103 
1104 VALUE
1106 {
1108 }
1109 
1110 VALUE
1111 rb_filesystem_str_new(const char *ptr, long len)
1112 {
1114 }
1115 
1116 VALUE
1118 {
1120 }
1121 
1122 VALUE
1124 {
1126 }
1127 
1128 VALUE
1130 {
1132 }
1133 
1134 VALUE
1136 {
1137  return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1138 }
1139 
1140 static VALUE
1141 str_replace_shared_without_enc(VALUE str2, VALUE str)
1142 {
1143  const int termlen = TERM_LEN(str);
1144  char *ptr;
1145  long len;
1146 
1148  if (STR_EMBEDDABLE_P(len, termlen)) {
1149  char *ptr2 = RSTRING(str2)->as.ary;
1150  STR_SET_EMBED(str2);
1151  memcpy(ptr2, RSTRING_PTR(str), len);
1152  STR_SET_EMBED_LEN(str2, len);
1153  TERM_FILL(ptr2+len, termlen);
1154  }
1155  else {
1156  VALUE root;
1157  if (STR_SHARED_P(str)) {
1158  root = RSTRING(str)->as.heap.aux.shared;
1160  }
1161  else {
1162  root = rb_str_new_frozen(str);
1163  RSTRING_GETMEM(root, ptr, len);
1164  }
1165  if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1166  if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1167  rb_fatal("about to free a possible shared root");
1168  }
1169  char *ptr2 = STR_HEAP_PTR(str2);
1170  if (ptr2 != ptr) {
1171  ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1172  }
1173  }
1174  FL_SET(str2, STR_NOEMBED);
1175  RSTRING(str2)->as.heap.len = len;
1176  RSTRING(str2)->as.heap.ptr = ptr;
1177  STR_SET_SHARED(str2, root);
1178  }
1179  return str2;
1180 }
1181 
1182 static VALUE
1183 str_replace_shared(VALUE str2, VALUE str)
1184 {
1185  str_replace_shared_without_enc(str2, str);
1186  rb_enc_cr_str_exact_copy(str2, str);
1187  return str2;
1188 }
1189 
1190 static VALUE
1191 str_new_shared(VALUE klass, VALUE str)
1192 {
1193  return str_replace_shared(str_alloc(klass), str);
1194 }
1195 
1196 VALUE
1198 {
1199  return str_new_shared(rb_obj_class(str), str);
1200 }
1201 
1202 VALUE
1204 {
1205  if (OBJ_FROZEN(orig)) return orig;
1206  return str_new_frozen(rb_obj_class(orig), orig);
1207 }
1208 
1209 VALUE
1211 {
1212  if (OBJ_FROZEN_RAW(orig)) return orig;
1213  return str_new_frozen(0, orig);
1214 }
1215 
1216 void
1218 {
1219  if (RBASIC_CLASS(tmp) != 0)
1220  return;
1221 
1222  if (STR_EMBED_P(tmp)) {
1223  assert(OBJ_FROZEN_RAW(tmp));
1224  rb_gc_force_recycle(tmp);
1225  }
1226  else if (FL_TEST_RAW(orig, STR_SHARED) &&
1228  VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1229 
1230  if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1231  FL_UNSET_RAW(orig, STR_SHARED);
1232  assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1233  assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1234  RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1235  RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1236  assert(OBJ_FROZEN_RAW(tmp));
1237  rb_gc_force_recycle(tmp);
1238  }
1239  }
1240 }
1241 
1242 static VALUE
1243 str_new_frozen(VALUE klass, VALUE orig)
1244 {
1245  VALUE str;
1246 
1247  if (STR_EMBED_P(orig)) {
1248  str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1249  }
1250  else {
1251  if (FL_TEST_RAW(orig, STR_SHARED)) {
1252  VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1253  long ofs = RSTRING(orig)->as.heap.ptr - RSTRING(shared)->as.heap.ptr;
1254  long rest = RSTRING(shared)->as.heap.len - ofs - RSTRING(orig)->as.heap.len;
1257 
1258  if ((ofs > 0) || (rest > 0) ||
1259  (klass != RBASIC(shared)->klass) ||
1260  ENCODING_GET(shared) != ENCODING_GET(orig)) {
1261  str = str_new_shared(klass, shared);
1262  RSTRING(str)->as.heap.ptr += ofs;
1263  RSTRING(str)->as.heap.len -= ofs + rest;
1264  }
1265  else {
1266  if (RBASIC_CLASS(shared) == 0)
1268  return shared;
1269  }
1270  }
1271  else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1272  str = str_alloc(klass);
1273  STR_SET_EMBED(str);
1274  memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1276  TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1277  }
1278  else {
1279  str = str_alloc(klass);
1281  RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1282  RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1283  RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1284  RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1285  RBASIC(orig)->flags &= ~STR_NOFREE;
1286  STR_SET_SHARED(orig, str);
1287  if (klass == 0)
1289  }
1290  }
1291 
1292  rb_enc_cr_str_exact_copy(str, orig);
1293  OBJ_FREEZE(str);
1294  return str;
1295 }
1296 
/*
 * Create a string of obj's class from the `len` bytes at `ptr`, using obj's
 * terminator length.
 * NOTE(review): the signature line (listing line 1298) is absent; judging by
 * the call in str_new_empty this is presumably rb_str_new_with_class(obj,
 * ptr, len) — confirm against the complete file.
 */
1297 VALUE
1299 {
1300  return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1301 }
1302 
1303 static VALUE
1304 str_new_empty(VALUE str)
1305 {
1306  VALUE v = rb_str_new_with_class(str, 0, 0);
1307  rb_enc_copy(v, str);
1308  return v;
1309 }
1310 
1311 #define STR_BUF_MIN_SIZE 63
1313 
/*
 * Allocate an rb_cString whose heap buffer holds `capa` bytes plus one
 * terminator byte; the buffer initially contains only '\0'.
 * NOTE(review): listing lines 1315 (signature), 1320 and 1322 are absent.
 * This is presumably rb_str_buf_new(capa), and the now-empty
 * `if (capa < STR_BUF_MIN_SIZE)` body presumably raised capa to that
 * minimum — confirm against the complete file.
 */
1314 VALUE
1316 {
1317  VALUE str = str_alloc(rb_cString);
1318 
1319  if (capa < STR_BUF_MIN_SIZE) {
1321  }
1323  RSTRING(str)->as.heap.aux.capa = capa;
 /* +1 leaves room for the terminating NUL */
1324  RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1325  RSTRING(str)->as.heap.ptr[0] = '\0';
1326 
1327  return str;
1328 }
1329 
/*
 * Create a string buffer sized from strlen(ptr).
 * NOTE(review): listing lines 1331 (signature) and 1337 are absent; the
 * missing statement presumably copies ptr's bytes into the new buffer, and
 * the function is presumably rb_str_buf_new_cstr — confirm.
 */
1330 VALUE
1332 {
1333  VALUE str;
1334  long len = strlen(ptr);
1335 
1336  str = rb_str_buf_new(len);
1338 
1339  return str;
1340 }
1341 
/*
 * Create a string of `len` bytes with class 0 and no source pointer
 * (str_new(0, 0, len)).
 * NOTE(review): the signature line (listing line 1343) is absent; presumably
 * rb_str_tmp_new(long len) — confirm.
 */
1342 VALUE
1344 {
1345  return str_new(0, 0, len);
1346 }
1347 
/*
 * Release bookkeeping for a string that is going away.
 * NOTE(review): listing lines 1349 (signature) and 1366 are absent; the
 * missing statement in the final branch presumably frees the heap buffer,
 * and the function is presumably rb_str_free(VALUE str) — confirm.
 */
1348 void
1350 {
 /* strings flagged RSTRING_FSTR are removed from the VM fstring table */
1351  if (FL_TEST(str, RSTRING_FSTR)) {
1352  st_data_t fstr = (st_data_t)str;
1353  st_delete(rb_vm_fstring_table(), &fstr, NULL);
1354  RB_DEBUG_COUNTER_INC(obj_str_fstr);
1355  }
1356 
1357  if (STR_EMBED_P(str)) {
1358  RB_DEBUG_COUNTER_INC(obj_str_embed);
1359  }
1360  else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
 /* shared or no-free buffers: only counters are touched here; both
  * conditions bump the same obj_str_shared counter */
1361  (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1362  (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1363  }
1364  else {
1365  RB_DEBUG_COUNTER_INC(obj_str_ptr);
1367  }
1368 }
1369 
/*
 * Report STR_HEAP_SIZE(str) for some strings and 0 for the rest.
 * NOTE(review): listing lines 1371 (signature) and 1373 (the `if`
 * condition that selects the first branch) are absent, so the exact
 * condition cannot be read from this listing; the function is presumably
 * rb_str_memsize(VALUE str) — confirm.
 */
1370 RUBY_FUNC_EXPORTED size_t
1372 {
1374  return STR_HEAP_SIZE(str);
1375  }
1376  else {
1377  return 0;
1378  }
1379 }
1380 
/*
 * Convert `str` to a T_STRING through its to_str method
 * (rb_convert_type_with_id with idTo_str).
 * NOTE(review): the signature line (listing line 1382) is absent;
 * presumably rb_str_to_str(VALUE str) — confirm.
 */
1381 VALUE
1383 {
1384  return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1385 }
1386 
1387 static inline void str_discard(VALUE str);
1388 static void str_shared_replace(VALUE str, VALUE str2);
1389 
/*
 * Replace str's contents with str2's via str_shared_replace(); a no-op when
 * both arguments are the same object.
 * NOTE(review): the signature line (listing line 1391) is absent;
 * presumably rb_str_shared_replace(VALUE str, VALUE str2) — confirm.
 */
1390 void
1392 {
1393  if (str != str2) str_shared_replace(str, str2);
1394 }
1395 
/*
 * Move str2's contents into str. str's previous contents are discarded
 * first; str2 must be a different object (asserted).
 * Short contents are copied into str's embedded storage; otherwise str takes
 * over str2's heap pointer and length and str2 is reset to an empty embedded
 * string.
 * NOTE(review): listing lines 1412, 1417-1418 and 1424 are absent, so some
 * statements (e.g. what is done with `shared` in the STR_SHARED case) are
 * not visible — confirm against the complete file.
 */
1396 static void
1397 str_shared_replace(VALUE str, VALUE str2)
1398 {
1399  rb_encoding *enc;
1400  int cr;
1401  int termlen;
1402 
1403  RUBY_ASSERT(str2 != str);
 /* capture encoding and coderange of the source before touching str */
1404  enc = STR_ENC_GET(str2);
1405  cr = ENC_CODERANGE(str2);
1406  str_discard(str);
1407  termlen = rb_enc_mbminlen(enc);
1408 
1409  if (STR_EMBEDDABLE_P(RSTRING_LEN(str2), termlen)) {
1410  STR_SET_EMBED(str);
 /* copy the bytes together with the terminator */
1411  memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1413  rb_enc_associate(str, enc);
1414  ENC_CODERANGE_SET(str, cr);
1415  }
1416  else {
1419  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1420  RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1421 
1422  if (FL_TEST(str2, STR_SHARED)) {
1423  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1425  }
1426  else {
1427  RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1428  }
1429 
1430  /* abandon str2 */
1431  STR_SET_EMBED(str2);
1432  RSTRING_PTR(str2)[0] = 0;
1433  STR_SET_EMBED_LEN(str2, 0);
1434  rb_enc_associate(str, enc);
1435  ENC_CODERANGE_SET(str, cr);
1436  }
1437 }
1438 
/*
 * Return obj itself when it is already a T_STRING; otherwise call its to_s
 * method and pass the result through rb_obj_as_string_result().
 * NOTE(review): the signature line (listing line 1440) is absent;
 * presumably rb_obj_as_string(VALUE obj) — confirm.
 */
1439 VALUE
1441 {
1442  VALUE str;
1443 
1444  if (RB_TYPE_P(obj, T_STRING)) {
1445  return obj;
1446  }
1447  str = rb_funcall(obj, idTo_s, 0);
1448  return rb_obj_as_string_result(str, obj);
1449 }
1450 
/*
 * Body of the helper called from the function above as
 * rb_obj_as_string_result(str, obj): return str when it is a T_STRING,
 * otherwise fall back to rb_any_to_s(obj).
 * NOTE(review): the return type and signature (listing lines 1451-1452)
 * are absent from this listing — confirm against the complete file.
 */
1453 {
1454  if (!RB_TYPE_P(str, T_STRING))
1455  return rb_any_to_s(obj);
1456  return str;
1457 }
1458 
/*
 * Make str hold the same contents as str2 and return str.
 * When str2 is shared, str is pointed at str2's bytes and length and its
 * encoding/coderange are copied from str2; otherwise the work is delegated
 * to str_replace_shared().
 * NOTE(review): listing lines 1467-1468 and 1471 are absent, so the
 * statements that use `shared` are not visible — confirm against the
 * complete file.
 */
1459 static VALUE
1460 str_replace(VALUE str, VALUE str2)
1461 {
1462  long len;
1463 
1464  len = RSTRING_LEN(str2);
1465  if (STR_SHARED_P(str2)) {
1466  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1469  RSTRING(str)->as.heap.len = len;
1470  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1472  rb_enc_cr_str_exact_copy(str, str2);
1473  }
1474  else {
1475  str_replace_shared(str, str2);
1476  }
1477 
1478  return str;
1479 }
1480 
/*
 * Allocate a string of class `klass` that duplicates `str`. The result never
 * has FL_FREEZE set (it is masked out when the flags are applied).
 * NOTE(review): listing lines 1486-1487 are absent, so flag_mask appears
 * here as just FL_FREEZE; since the code below tests STR_NOEMBED in the
 * masked flags, the missing lines presumably add further flag bits —
 * confirm against the complete file.
 */
1481 static inline VALUE
1482 str_duplicate(VALUE klass, VALUE str)
1483 {
1484  enum {embed_size = RSTRING_EMBED_LEN_MAX + 1};
1485  const VALUE flag_mask =
1488  FL_FREEZE
1489  ;
1490  VALUE flags = FL_TEST_RAW(str, flag_mask);
1491  VALUE dup = str_alloc(klass);
 /* start with a raw copy of the embedded-size payload */
1492  MEMCPY(RSTRING(dup)->as.ary, RSTRING(str)->as.ary,
1493  char, embed_size);
1494  if (flags & STR_NOEMBED) {
1495  if (FL_TEST_RAW(str, STR_SHARED)) {
 /* already sharing: duplicate from the shared string instead */
1496  str = RSTRING(str)->as.heap.aux.shared;
1497  }
1498  else if (UNLIKELY(!(flags & FL_FREEZE))) {
 /* unfrozen heap string: obtain a frozen counterpart and re-read
  * its flags, since it may have a different layout */
1499  str = str_new_frozen(klass, str);
1500  flags = FL_TEST_RAW(str, flag_mask);
1501  }
1502  if (flags & STR_NOEMBED) {
 /* record the source as dup's shared string */
1503  RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, str);
1504  flags |= STR_SHARED;
1505  }
1506  else {
1507  MEMCPY(RSTRING(dup)->as.ary, RSTRING(str)->as.ary,
1508  char, embed_size);
1509  }
1510  }
1511  FL_SET_RAW(dup, flags & ~FL_FREEZE);
1512  return dup;
1513 }
1514 
/*
 * Duplicate str, keeping its class (rb_obj_class(str)).
 * NOTE(review): the signature line (listing line 1516) is absent;
 * presumably rb_str_dup(VALUE str) — confirm.
 */
1515 VALUE
1517 {
1518  return str_duplicate(rb_obj_class(str), str);
1519 }
1520 
/*
 * Duplicate str as a plain rb_cString regardless of str's own class.
 * NOTE(review): listing lines 1522 (signature) and 1524 are absent; the
 * function name (presumably rb_str_resurrect) and the skipped statement
 * cannot be read from this listing — confirm.
 */
1521 VALUE
1523 {
1525  return str_duplicate(rb_cString, str);
1526 }
1527 
1528 /*
1529  * call-seq:
1530  * String.new(str="") -> new_str
1531  * String.new(str="", encoding: enc) -> new_str
1532  * String.new(str="", capacity: size) -> new_str
1533  *
1534  * Returns a new string object containing a copy of <i>str</i>.
1535  *
1536  * The optional <i>encoding</i> keyword argument specifies the encoding
1537  * of the new string.
1538  * If not specified, the encoding of <i>str</i> is used
1539  * (or ASCII-8BIT, if <i>str</i> is not specified).
1540  *
1541  * The optional <i>capacity</i> keyword argument specifies the size
1542  * of the internal buffer.
1543  * This may improve performance, when the string will be concatenated many
1544  * times (causing many realloc calls).
1545  */
1546 
/*
 * Implementation of String#initialize: accepts an optional source string
 * plus the keyword arguments `encoding:` and `capacity:`, and returns str.
 * Without keywords (or without `capacity:`) a given source is copied with
 * rb_str_replace(). With `capacity:` the receiver is turned into a heap
 * string with a buffer of at least that many bytes.
 * NOTE(review): listing lines 1575, 1597, 1610 and 1618 are absent, so a
 * few statements are not visible (e.g. the body of the
 * `capa < STR_BUF_MIN_SIZE` check) — confirm against the complete file.
 */
1547 static VALUE
1548 rb_str_init(int argc, VALUE *argv, VALUE str)
1549 {
1550  static ID keyword_ids[2];
1551  VALUE orig, opt, venc, vcapa;
1552  VALUE kwargs[2];
1553  rb_encoding *enc = 0;
1554  int n;
1555 
 /* intern the keyword names once */
1556  if (!keyword_ids[0]) {
1557  keyword_ids[0] = rb_id_encoding();
1558  CONST_ID(keyword_ids[1], "capacity");
1559  }
1560 
 /* n is the number of positional arguments (0 or 1) */
1561  n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1562  if (!NIL_P(opt)) {
1563  rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1564  venc = kwargs[0];
1565  vcapa = kwargs[1];
1566  if (venc != Qundef && !NIL_P(venc)) {
1567  enc = rb_to_encoding(venc);
1568  }
1569  if (vcapa != Qundef && !NIL_P(vcapa)) {
1570  long capa = NUM2LONG(vcapa);
1571  long len = 0;
1572  int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1573 
1574  if (capa < STR_BUF_MIN_SIZE) {
1576  }
1577  if (n == 1) {
1578  StringValue(orig);
1579  len = RSTRING_LEN(orig);
 /* the buffer must at least hold the source */
1580  if (capa < len) {
1581  capa = len;
1582  }
 /* initializing from itself: nothing to copy later */
1583  if (orig == str) n = 0;
1584  }
1585  str_modifiable(str);
1586  if (STR_EMBED_P(str)) { /* make noembed always */
1587  char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1588  memcpy(new_ptr, RSTRING(str)->as.ary, RSTRING_EMBED_LEN_MAX + 1);
1589  RSTRING(str)->as.heap.ptr = new_ptr;
1590  }
1591  else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
 /* buffer is not exclusively owned: allocate a private one and
  * copy as much of the old contents as fits */
1592  const size_t size = (size_t)capa + termlen;
1593  const char *const old_ptr = RSTRING_PTR(str);
1594  const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
1595  char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1596  memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1598  RSTRING(str)->as.heap.ptr = new_ptr;
1599  }
1600  else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
 /* owned heap buffer of a different size: resize in place */
1601  SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1602  (size_t)capa + termlen, STR_HEAP_SIZE(str));
1603  }
1604  RSTRING(str)->as.heap.len = len;
1605  TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1606  if (n == 1) {
1607  memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1608  rb_enc_cr_str_exact_copy(str, orig);
1609  }
1611  RSTRING(str)->as.heap.aux.capa = capa;
1612  }
1613  else if (n == 1) {
1614  rb_str_replace(str, orig);
1615  }
 /* an explicit encoding: keyword is applied last */
1616  if (enc) {
1617  rb_enc_associate(str, enc);
1619  }
1620  }
1621  else if (n == 1) {
1622  rb_str_replace(str, orig);
1623  }
1624  return str;
1625 }
1626 
1627 #ifdef NONASCII_MASK
1628 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1629 
1630 /*
1631  * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1632  * bit representation. (see http://en.wikipedia.org/wiki/UTF-8)
1633  * Therefore, the following pseudocode can detect UTF-8 leading bytes.
1634  *
1635  * if (!(byte & 0x80))
1636  * byte |= 0x40; // turn on bit6
1637  * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
1638  *
1639  * This function calculates whether a byte is leading or not for all bytes
1640  * in the argument word by concurrently using the above logic, and then
1641  * adds up the number of leading bytes in the word.
1642  */
1643 static inline uintptr_t
1644 count_utf8_lead_bytes_with_word(const uintptr_t *s)
1645 {
1646  uintptr_t d = *s;
1647 
1648  /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
1649  d = (d>>6) | (~d>>7);
1650  d &= NONASCII_MASK >> 7;
1651 
1652  /* Gather all bytes. */
1653 #if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1654  /* use only if it can use POPCNT */
1655  return rb_popcount_intptr(d);
1656 #else
1657  d += (d>>8);
1658  d += (d>>16);
1659 # if SIZEOF_VOIDP == 8
1660  d += (d>>32);
1661 # endif
1662  return (d&0xF);
1663 #endif
1664 }
1665 #endif
1666 
1667 static inline long
1668 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
1669 {
1670  long c;
1671  const char *q;
1672 
1673  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1674  long diff = (long)(e - p);
1675  return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1676  }
1677 #ifdef NONASCII_MASK
1678  else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
1679  uintptr_t len = 0;
1680  if ((int)sizeof(uintptr_t) * 2 < e - p) {
1681  const uintptr_t *s, *t;
1682  const uintptr_t lowbits = sizeof(uintptr_t) - 1;
1683  s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
1684  t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
1685  while (p < (const char *)s) {
1686  if (is_utf8_lead_byte(*p)) len++;
1687  p++;
1688  }
1689  while (s < t) {
1690  len += count_utf8_lead_bytes_with_word(s);
1691  s++;
1692  }
1693  p = (const char *)s;
1694  }
1695  while (p < e) {
1696  if (is_utf8_lead_byte(*p)) len++;
1697  p++;
1698  }
1699  return (long)len;
1700  }
1701 #endif
1702  else if (rb_enc_asciicompat(enc)) {
1703  c = 0;
1704  if (ENC_CODERANGE_CLEAN_P(cr)) {
1705  while (p < e) {
1706  if (ISASCII(*p)) {
1707  q = search_nonascii(p, e);
1708  if (!q)
1709  return c + (e - p);
1710  c += q - p;
1711  p = q;
1712  }
1713  p += rb_enc_fast_mbclen(p, e, enc);
1714  c++;
1715  }
1716  }
1717  else {
1718  while (p < e) {
1719  if (ISASCII(*p)) {
1720  q = search_nonascii(p, e);
1721  if (!q)
1722  return c + (e - p);
1723  c += q - p;
1724  p = q;
1725  }
1726  p += rb_enc_mbclen(p, e, enc);
1727  c++;
1728  }
1729  }
1730  return c;
1731  }
1732 
1733  for (c=0; p<e; c++) {
1734  p += rb_enc_mbclen(p, e, enc);
1735  }
1736  return c;
1737 }
1738 
1739 long
1740 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
1741 {
1742  return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
1743 }
1744 
1745 /* To get strlen with cr
1746  * Note that given cr is not used.
1747  */
1748 long
1749 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
1750 {
1751  long c;
1752  const char *q;
1753  int ret;
1754 
1755  *cr = 0;
1756  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1757  long diff = (long)(e - p);
1758  return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1759  }
1760  else if (rb_enc_asciicompat(enc)) {
1761  c = 0;
1762  while (p < e) {
1763  if (ISASCII(*p)) {
1764  q = search_nonascii(p, e);
1765  if (!q) {
1766  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1767  return c + (e - p);
1768  }
1769  c += q - p;
1770  p = q;
1771  }
1772  ret = rb_enc_precise_mbclen(p, e, enc);
1773  if (MBCLEN_CHARFOUND_P(ret)) {
1774  *cr |= ENC_CODERANGE_VALID;
1775  p += MBCLEN_CHARFOUND_LEN(ret);
1776  }
1777  else {
1778  *cr = ENC_CODERANGE_BROKEN;
1779  p++;
1780  }
1781  c++;
1782  }
1783  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1784  return c;
1785  }
1786 
1787  for (c=0; p<e; c++) {
1788  ret = rb_enc_precise_mbclen(p, e, enc);
1789  if (MBCLEN_CHARFOUND_P(ret)) {
1790  *cr |= ENC_CODERANGE_VALID;
1791  p += MBCLEN_CHARFOUND_LEN(ret);
1792  }
1793  else {
1794  *cr = ENC_CODERANGE_BROKEN;
1795  if (p + rb_enc_mbminlen(enc) <= e)
1796  p += rb_enc_mbminlen(enc);
1797  else
1798  p = e;
1799  }
1800  }
1801  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1802  return c;
1803 }
1804 
1805 /* enc must be str's enc or rb_enc_check(str, str2) */
1806 static long
1807 str_strlen(VALUE str, rb_encoding *enc)
1808 {
1809  const char *p, *e;
1810  int cr;
1811 
1812  if (single_byte_optimizable(str)) return RSTRING_LEN(str);
1813  if (!enc) enc = STR_ENC_GET(str);
1814  p = RSTRING_PTR(str);
1815  e = RSTRING_END(str);
1816  cr = ENC_CODERANGE(str);
1817 
1818  if (cr == ENC_CODERANGE_UNKNOWN) {
1819  long n = rb_enc_strlen_cr(p, e, enc, &cr);
1820  if (cr) ENC_CODERANGE_SET(str, cr);
1821  return n;
1822  }
1823  else {
1824  return enc_strlen(p, e, enc, cr);
1825  }
1826 }
1827 
/*
 * Character length of str using str's own encoding (enc == NULL).
 * NOTE(review): the signature line (listing line 1829) is absent;
 * presumably rb_str_strlen(VALUE str) — confirm.
 */
1828 long
1830 {
1831  return str_strlen(str, NULL);
1832 }
1833 
1834 /*
1835  * call-seq:
1836  * str.length -> integer
1837  * str.size -> integer
1838  *
1839  * Returns the character length of <i>str</i>.
1840  */
1841 
/*
 * Character length of str as a Ruby Integer (the str.length / str.size
 * method documented above).
 * NOTE(review): the signature line (listing line 1843) is absent;
 * presumably rb_str_length(VALUE str) — confirm.
 */
1842 VALUE
1844 {
1845  return LONG2NUM(str_strlen(str, NULL));
1846 }
1847 
1848 /*
1849  * call-seq:
1850  * str.bytesize -> integer
1851  *
1852  * Returns the length of +str+ in bytes.
1853  *
1854  * "\x80\u3042".bytesize #=> 4
1855  * "hello".bytesize #=> 5
1856  */
1857 
1858 static VALUE
1859 rb_str_bytesize(VALUE str)
1860 {
1861  return LONG2NUM(RSTRING_LEN(str));
1862 }
1863 
1864 /*
1865  * call-seq:
1866  * str.empty? -> true or false
1867  *
1868  * Returns <code>true</code> if <i>str</i> has a length of zero.
1869  *
1870  * "hello".empty? #=> false
1871  * " ".empty? #=> false
1872  * "".empty? #=> true
1873  */
1874 
1875 static VALUE
1876 rb_str_empty(VALUE str)
1877 {
1878  if (RSTRING_LEN(str) == 0)
1879  return Qtrue;
1880  return Qfalse;
1881 }
1882 
1883 /*
1884  * call-seq:
1885  * str + other_str -> new_str
1886  *
1887  * Concatenation---Returns a new String containing
1888  * <i>other_str</i> concatenated to <i>str</i>.
1889  *
1890  * "Hello from " + self.to_s #=> "Hello from main"
1891  */
1892 
/*
 * Build a new rb_cString holding str1's bytes followed by str2's.
 * str2 is coerced with StringValue; rb_enc_check_str picks the common
 * encoding. Raises ArgumentError ("string size too big") when the combined
 * length would exceed LONG_MAX.
 * NOTE(review): listing lines 1894 (signature) and 1916-1917 are absent;
 * the skipped statements presumably set encoding/coderange on the result,
 * and the function is presumably rb_str_plus(str1, str2) — confirm.
 */
1893 VALUE
1895 {
1896  VALUE str3;
1897  rb_encoding *enc;
1898  char *ptr1, *ptr2, *ptr3;
1899  long len1, len2;
1900  int termlen;
1901 
1902  StringValue(str2);
1903  enc = rb_enc_check_str(str1, str2);
1904  RSTRING_GETMEM(str1, ptr1, len1);
1905  RSTRING_GETMEM(str2, ptr2, len2);
1906  termlen = rb_enc_mbminlen(enc);
 /* overflow guard for len1 + len2 */
1907  if (len1 > LONG_MAX - len2) {
1908  rb_raise(rb_eArgError, "string size too big");
1909  }
1910  str3 = str_new0(rb_cString, 0, len1+len2, termlen);
1911  ptr3 = RSTRING_PTR(str3);
1912  memcpy(ptr3, ptr1, len1);
1913  memcpy(ptr3+len1, ptr2, len2);
1914  TERM_FILL(&ptr3[len1+len2], termlen);
1915 
 /* keep the operands alive while their raw pointers were in use */
1918  RB_GC_GUARD(str1);
1919  RB_GC_GUARD(str2);
1920  return str3;
1921 }
1922 
/*
 * A variant of rb_str_plus that does not raise: it returns Qundef when
 * either encoding index is negative, when the two encoding indexes differ,
 * or when the combined length would overflow; otherwise it delegates to
 * rb_str_plus(). Both arguments are asserted to be plain rb_cString.
 * NOTE(review): the return type and signature (listing lines 1924-1925)
 * are absent from this listing — confirm against the complete file.
 */
1926 {
1927  assert(RBASIC_CLASS(str1) == rb_cString);
1928  assert(RBASIC_CLASS(str2) == rb_cString);
1929  long len1, len2;
1930  MAYBE_UNUSED(char) *ptr1, *ptr2;
1931  RSTRING_GETMEM(str1, ptr1, len1);
1932  RSTRING_GETMEM(str2, ptr2, len2);
1933  int enc1 = rb_enc_get_index(str1);
1934  int enc2 = rb_enc_get_index(str2);
1935 
1936  if (enc1 < 0) {
1937  return Qundef;
1938  }
1939  else if (enc2 < 0) {
1940  return Qundef;
1941  }
1942  else if (enc1 != enc2) {
1943  return Qundef;
1944  }
1945  else if (len1 > LONG_MAX - len2) {
1946  return Qundef;
1947  }
1948  else {
1949  return rb_str_plus(str1, str2);
1950  }
1951 
1952 }
1953 
1954 /*
1955  * call-seq:
1956  * str * integer -> new_str
1957  *
1958  * Copy --- Returns a new String containing +integer+ copies of the receiver.
1959  * +integer+ must be greater than or equal to 0.
1960  *
1961  * "Ho! " * 3 #=> "Ho! Ho! Ho! "
1962  * "Ho! " * 0 #=> ""
1963  */
1964 
/*
 * Return a new string containing `times` copies of str (String#*).
 * Raises ArgumentError for a negative count ("negative argument") or when
 * the resulting length would overflow a long ("argument too big").
 * NOTE(review): the signature line (listing line 1966) is absent;
 * presumably rb_str_times(VALUE str, VALUE times) — confirm.
 */
1965 VALUE
1967 {
1968  VALUE str2;
1969  long n, len;
1970  char *ptr2;
1971  int termlen;
1972 
 /* fast paths for the Fixnum counts 1 and 0 */
1973  if (times == INT2FIX(1)) {
1974  return rb_str_dup(str);
1975  }
1976  if (times == INT2FIX(0)) {
1977  str2 = str_alloc(rb_obj_class(str));
1978  rb_enc_copy(str2, str);
1979  return str2;
1980  }
1981  len = NUM2LONG(times);
1982  if (len < 0) {
1983  rb_raise(rb_eArgError, "negative argument");
1984  }
 /* a single NUL byte repeated: a zero-filled buffer is all that is needed */
1985  if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
1986  str2 = str_alloc(rb_obj_class(str));
1987  if (!STR_EMBEDDABLE_P(len, 1)) {
1988  RSTRING(str2)->as.heap.aux.capa = len;
1989  RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
1990  STR_SET_NOEMBED(str2);
1991  }
1992  STR_SET_LEN(str2, len);
1993  rb_enc_copy(str2, str);
1994  return str2;
1995  }
 /* overflow guard for len * RSTRING_LEN(str) */
1996  if (len && LONG_MAX/len < RSTRING_LEN(str)) {
1997  rb_raise(rb_eArgError, "argument too big");
1998  }
1999 
 /* from here on len is the byte length of the result */
2000  len *= RSTRING_LEN(str);
2001  termlen = TERM_LEN(str);
2002  str2 = str_new0(rb_obj_class(str), 0, len, termlen);
2003  ptr2 = RSTRING_PTR(str2);
2004  if (len) {
2005  n = RSTRING_LEN(str);
2006  memcpy(ptr2, RSTRING_PTR(str), n);
 /* double the filled prefix until it covers more than half ... */
2007  while (n <= len/2) {
2008  memcpy(ptr2 + n, ptr2, n);
2009  n *= 2;
2010  }
 /* ... then copy the remaining len-n bytes from the start */
2011  memcpy(ptr2 + n, ptr2, len-n);
2012  }
2013  STR_SET_LEN(str2, len);
2014  TERM_FILL(&ptr2[len], termlen);
2015  rb_enc_cr_str_copy_for_substr(str2, str);
2016 
2017  return str2;
2018 }
2019 
2020 /*
2021  * call-seq:
2022  * str % arg -> new_str
2023  *
2024  * Format---Uses <i>str</i> as a format specification, and returns
2025  * the result of applying it to <i>arg</i>. If the format
2026  * specification contains more than one substitution, then <i>arg</i>
2027  * must be an Array or Hash containing the values to be
2028  * substituted. See Kernel#sprintf for details of the format string.
2029  *
2030  * "%05d" % 123 #=> "00123"
2031  * "%-5s: %016x" % [ "ID", self.object_id ] #=> "ID : 00002b054ec93168"
2032  * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar"
2033  */
2034 
2035 static VALUE
2036 rb_str_format_m(VALUE str, VALUE arg)
2037 {
2038  VALUE tmp = rb_check_array_type(arg);
2039 
2040  if (!NIL_P(tmp)) {
2041  return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2042  }
2043  return rb_str_format(1, &arg, str);
2044 }
2045 
2046 static inline void
2047 rb_check_lockedtmp(VALUE str)
2048 {
2049  if (FL_TEST(str, STR_TMPLOCK)) {
2050  rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2051  }
2052 }
2053 
/*
 * Guard executed before a string is modified: raises if str is temporarily
 * locked.
 * NOTE(review): listing line 2058 is absent; it presumably holds a second
 * check (e.g. for frozen strings) — confirm against the complete file.
 */
2054 static inline void
2055 str_modifiable(VALUE str)
2056 {
2057  rb_check_lockedtmp(str);
2059 }
2060 
/*
 * Predicate returning 0 or 1.
 * NOTE(review): listing line 2064 — the `if` condition that selects the
 * `return 0` branch — is absent, so the test itself cannot be read here.
 * From its callers, 1 presumably means str does not exclusively own a
 * writable buffer — confirm against the complete file.
 */
2061 static inline int
2062 str_dependent_p(VALUE str)
2063 {
2065  return 0;
2066  }
2067  else {
2068  return 1;
2069  }
2070 }
2071 
2072 static inline int
2073 str_independent(VALUE str)
2074 {
2075  str_modifiable(str);
2076  return !str_dependent_p(str);
2077 }
2078 
/*
 * Give str its own storage for `len` bytes of content plus `expand` spare
 * bytes and a `termlen`-byte terminator.
 * If str is currently a heap string and the new capacity fits embedded
 * storage, the bytes are moved into the embedded array; otherwise a fresh
 * heap buffer is allocated and the old bytes copied over.
 * NOTE(review): listing lines 2093, 2102 and 2105-2106 are absent, so some
 * statements (including the condition guarding `xfree(oldptr)`) are not
 * visible — confirm against the complete file.
 */
2079 static void
2080 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2081 {
2082  char *ptr;
2083  char *oldptr;
2084  long capa = len + expand;
2085 
 /* clamp len to capa (matters only when expand is negative) */
2086  if (len > capa) len = capa;
2087 
2088  if (!STR_EMBED_P(str) && STR_EMBEDDABLE_P(capa, termlen)) {
 /* remember the heap pointer before switching to embedded layout */
2089  ptr = RSTRING(str)->as.heap.ptr;
2090  STR_SET_EMBED(str);
2091  memcpy(RSTRING(str)->as.ary, ptr, len);
2092  TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2094  return;
2095  }
2096 
2097  ptr = ALLOC_N(char, (size_t)capa + termlen);
2098  oldptr = RSTRING_PTR(str);
2099  if (oldptr) {
2100  memcpy(ptr, oldptr, len);
2101  }
2103  xfree(oldptr);
2104  }
2107  TERM_FILL(ptr + len, termlen);
2108  RSTRING(str)->as.heap.ptr = ptr;
2109  RSTRING(str)->as.heap.len = len;
2110  RSTRING(str)->as.heap.aux.capa = capa;
2111 }
2112 
/*
 * Prepare str for in-place modification: run the modifiability checks and,
 * if str is not independent, give it its own buffer.
 * NOTE(review): listing lines 2114 (signature) and 2118 are absent;
 * presumably rb_str_modify(VALUE str) with a trailing statement that is
 * not visible here — confirm.
 */
2113 void
2115 {
2116  if (!str_independent(str))
2117  str_make_independent(str);
2119 }
2120 
/*
 * Prepare str for modification and make room for `expand` additional bytes.
 * Raises ArgumentError for a negative `expand` or when len + expand would
 * reach LONG_MAX.
 * NOTE(review): listing lines 2122 (signature) and 2140 are absent;
 * presumably rb_str_modify_expand(VALUE str, long expand) with a trailing
 * statement that is not visible here — confirm.
 */
2121 void
2123 {
2124  int termlen = TERM_LEN(str);
2125  long len = RSTRING_LEN(str);
2126 
2127  if (expand < 0) {
2128  rb_raise(rb_eArgError, "negative expanding string size");
2129  }
2130  if (expand >= LONG_MAX - len) {
2131  rb_raise(rb_eArgError, "string size too big");
2132  }
2133 
2134  if (!str_independent(str)) {
 /* not independent: take a private buffer that is already large enough */
2135  str_make_independent_expand(str, len, expand, termlen);
2136  }
2137  else if (expand > 0) {
 /* already independent: just grow the capacity */
2138  RESIZE_CAPA_TERM(str, len + expand, termlen);
2139  }
2141 }
2142 
/*
 * As rb_str_modify(), but don't clear coderange.
 * NOTE(review): listing lines 2149 and 2151 are absent; the statement that
 * followed the "Force re-scan later" comment is not visible here — confirm
 * against the complete file.
 */
2144 static void
2145 str_modify_keep_cr(VALUE str)
2146 {
2147  if (!str_independent(str))
2148  str_make_independent(str);
2150  /* Force re-scan later */
2152 }
2153 
/*
 * Drop str's current contents after running the modifiability checks; in
 * the visible branch the heap pointer and length are reset to 0.
 * NOTE(review): listing lines 2158-2159 are absent — they hold the `if`
 * that opens the block closed below (and presumably the release of the old
 * buffer). Confirm against the complete file.
 */
2154 static inline void
2155 str_discard(VALUE str)
2156 {
2157  str_modifiable(str);
2160  RSTRING(str)->as.heap.ptr = 0;
2161  RSTRING(str)->as.heap.len = 0;
2162  }
2163 }
2164 
/*
 * Raise EncodingCompatibilityError unless str's encoding is
 * ASCII-compatible.
 * NOTE(review): the signature line (listing line 2166) is absent;
 * presumably rb_must_asciicompat(VALUE str) — confirm.
 */
2165 void
2167 {
2168  rb_encoding *enc = rb_enc_get(str);
2169  if (!rb_enc_asciicompat(enc)) {
2170  rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2171  }
2172 }
2173 
/*
 * Ensure *ptr refers to a String: a non-String value is converted with
 * rb_str_to_str() and written back through ptr. Returns the String.
 * NOTE(review): the signature line (listing line 2175) is absent;
 * presumably rb_string_value(volatile VALUE *ptr) — confirm.
 */
2174 VALUE
2176 {
2177  VALUE s = *ptr;
2178  if (!RB_TYPE_P(s, T_STRING)) {
2179  s = rb_str_to_str(s);
2180  *ptr = s;
2181  }
2182  return s;
2183 }
2184 
/*
 * Return the byte pointer of `str`.
 * NOTE(review): listing lines 2186 (signature) and 2188 are absent; the
 * skipped statement presumably declares/obtains `str` from the argument,
 * and the function is presumably rb_string_value_ptr — confirm.
 */
2185 char *
2187 {
2189  return RSTRING_PTR(str);
2190 }
2191 
/*
 * Return 1 when the first n bytes at s are all zero, 0 otherwise.
 * A non-positive n examines nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') {
            return 0;
        }
    }
    return 1;
}
2200 
2201 static const char *
2202 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2203 {
2204  const char *e = s + len;
2205 
2206  for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2207  if (zero_filled(s, minlen)) return s;
2208  }
2209  return 0;
2210 }
2211 
2212 static char *
2213 str_fill_term(VALUE str, char *s, long len, int termlen)
2214 {
2215  /* This function assumes that (capa + termlen) bytes of memory
2216  * is allocated, like many other functions in this file.
2217  */
2218  if (str_dependent_p(str)) {
2219  if (!zero_filled(s + len, termlen))
2220  str_make_independent_expand(str, len, 0L, termlen);
2221  }
2222  else {
2223  TERM_FILL(s + len, termlen);
2224  return s;
2225  }
2226  return RSTRING_PTR(str);
2227 }
2228 
2229 void
2230 rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2231 {
2232  long capa = str_capacity(str, oldtermlen) + oldtermlen;
2233  long len = RSTRING_LEN(str);
2234 
2235  assert(capa >= len);
2236  if (capa - len < termlen) {
2237  rb_check_lockedtmp(str);
2238  str_make_independent_expand(str, len, 0L, termlen);
2239  }
2240  else if (str_dependent_p(str)) {
2241  if (termlen > oldtermlen)
2242  str_make_independent_expand(str, len, 0L, termlen);
2243  }
2244  else {
2245  if (!STR_EMBED_P(str)) {
2246  /* modify capa instead of realloc */
2247  assert(!FL_TEST((str), STR_SHARED));
2248  RSTRING(str)->as.heap.aux.capa = capa - termlen;
2249  }
2250  if (termlen > oldtermlen) {
2251  TERM_FILL(RSTRING_PTR(str) + len, termlen);
2252  }
2253  }
2254 
2255  return;
2256 }
2257 
2258 static char *
2259 str_null_check(VALUE str, int *w)
2260 {
2261  char *s = RSTRING_PTR(str);
2262  long len = RSTRING_LEN(str);
2263  rb_encoding *enc = rb_enc_get(str);
2264  const int minlen = rb_enc_mbminlen(enc);
2265 
2266  if (minlen > 1) {
2267  *w = 1;
2268  if (str_null_char(s, len, minlen, enc)) {
2269  return NULL;
2270  }
2271  return str_fill_term(str, s, len, minlen);
2272  }
2273  *w = 0;
2274  if (!s || memchr(s, 0, len)) {
2275  return NULL;
2276  }
2277  if (s[len]) {
2278  s = str_fill_term(str, s, len, minlen);
2279  }
2280  return s;
2281 }
2282 
/*
 * Return str's bytes as a terminated C string, or NULL when
 * str_null_check() rejects the contents (embedded NUL). The width flag is
 * discarded.
 * NOTE(review): the signature line (listing line 2284) is absent;
 * presumably rb_str_to_cstr(VALUE str) — confirm.
 */
2283 char *
2285 {
2286  int w;
2287  return str_null_check(str, &w);
2288 }
2289 
/*
 * Return `str`'s bytes as a terminated C string, raising ArgumentError when
 * the contents include a NUL: "string contains null char" for the
 * wide-character check (w != 0), "string contains null byte" otherwise.
 * NOTE(review): listing lines 2291 (signature) and 2293 are absent; the
 * skipped statement presumably obtains `str` from the argument, and the
 * function is presumably rb_string_value_cstr — confirm.
 */
2290 char *
2292 {
2294  int w;
2295  char *s = str_null_check(str, &w);
2296  if (!s) {
2297  if (w) {
2298  rb_raise(rb_eArgError, "string contains null char");
2299  }
2300  rb_raise(rb_eArgError, "string contains null byte");
2301  }
2302  return s;
2303 }
2304 
2305 char *
2306 rb_str_fill_terminator(VALUE str, const int newminlen)
2307 {
2308  char *s = RSTRING_PTR(str);
2309  long len = RSTRING_LEN(str);
2310  return str_fill_term(str, s, len, newminlen);
2311 }
2312 
/*
 * Return `str`.
 * NOTE(review): listing lines 2314 (signature) and 2316 are absent; the
 * skipped statement presumably performs the actual String conversion
 * check, and the function is presumably rb_check_string_type(VALUE str) —
 * confirm against the complete file.
 */
2313 VALUE
2315 {
2317  return str;
2318 }
2319 
2320 /*
2321  * call-seq:
2322  * String.try_convert(obj) -> string or nil
2323  *
2324  * Try to convert <i>obj</i> into a String, using to_str method.
2325  * Returns converted string or nil if <i>obj</i> cannot be converted
2326  * for any reason.
2327  *
2328  * String.try_convert("str") #=> "str"
2329  * String.try_convert(/re/) #=> nil
2330  */
2331 static VALUE
2332 rb_str_s_try_convert(VALUE dummy, VALUE str)
2333 {
2334  return rb_check_string_type(str);
2335 }
2336 
/*
 * Advance from p by *nthp characters of encoding enc, never past e.
 * Returns the resulting position and stores in *nthp the number of
 * characters that could not be skipped (the remaining count).
 */
2337 static char*
2338 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2339 {
2340  long nth = *nthp;
2341  if (rb_enc_mbmaxlen(enc) == 1) {
 /* single-byte encoding: characters are bytes */
2342  p += nth;
2343  }
2344  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
 /* fixed-width encoding */
2345  p += nth * rb_enc_mbmaxlen(enc);
2346  }
2347  else if (rb_enc_asciicompat(enc)) {
2348  const char *p2, *e2;
2349  int n;
2350 
2351  while (p < e && 0 < nth) {
 /* e2: where we would land if every remaining char were 1 byte */
2352  e2 = p + nth;
2353  if (e < e2) {
 /* fewer bytes than characters remain: cannot reach nth */
2354  *nthp = nth;
2355  return (char *)e;
2356  }
2357  if (ISASCII(*p)) {
 /* skip a run of ASCII bytes in one step */
2358  p2 = search_nonascii(p, e2);
2359  if (!p2) {
2360  nth -= e2 - p;
2361  *nthp = nth;
2362  return (char *)e2;
2363  }
2364  nth -= p2 - p;
2365  p = p2;
2366  }
 /* step over one multibyte character */
2367  n = rb_enc_mbclen(p, e, enc);
2368  p += n;
2369  nth--;
2370  }
2371  *nthp = nth;
2372  if (nth != 0) {
2373  return (char *)e;
2374  }
2375  return (char *)p;
2376  }
2377  else {
 /* generic variable-width encoding: one character per iteration;
  * note nth is decremented by the loop condition itself */
2378  while (p < e && nth--) {
2379  p += rb_enc_mbclen(p, e, enc);
2380  }
2381  }
 /* clamp to the end of the buffer */
2382  if (p > e) p = e;
2383  *nthp = nth;
2384  return (char*)p;
2385 }
2386 
2387 char*
2388 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2389 {
2390  return str_nth_len(p, e, &nth, enc);
2391 }
2392 
2393 static char*
2394 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2395 {
2396  if (singlebyte)
2397  p += nth;
2398  else {
2399  p = str_nth_len(p, e, &nth, enc);
2400  }
2401  if (!p) return 0;
2402  if (p > e) p = e;
2403  return (char *)p;
2404 }
2405 
2406 /* char offset to byte offset */
2407 static long
2408 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2409 {
2410  const char *pp = str_nth(p, e, nth, enc, singlebyte);
2411  if (!pp) return e - p;
2412  return pp - p;
2413 }
2414 
/*
 * Byte offset of character position `pos` within str, using str's own
 * encoding and the single-byte fast path when applicable.
 * NOTE(review): the signature line (listing line 2416) is absent;
 * presumably rb_str_offset(VALUE str, long pos) — confirm.
 */
2415 long
2417 {
2418  return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2419  STR_ENC_GET(str), single_byte_optimizable(str));
2420 }
2421 
2422 #ifdef NONASCII_MASK
/*
 * UTF-8 specialisation of the nth-character search: walk [p, e) counting
 * leading bytes, and stop at the leading byte reached once *nthp of them
 * have been skipped (or at e). The count still outstanding is written back
 * to *nthp.
 */
2423 static char *
2424 str_utf8_nth(const char *p, const char *e, long *nthp)
2425 {
2426  long nth = *nthp;
 /* word-at-a-time path, only for inputs and counts larger than two words */
2427  if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2428  const uintptr_t *s, *t;
2429  const uintptr_t lowbits = SIZEOF_VOIDP - 1;
 /* s: p rounded up to a word boundary; t: e rounded down */
2430  s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2431  t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
 /* unaligned head, byte by byte */
2432  while (p < (const char *)s) {
2433  if (is_utf8_lead_byte(*p)) nth--;
2434  p++;
2435  }
 /* whole words, for as long as nth is at least one word's worth of
  * bytes (a word cannot hold more leading bytes than it has bytes) */
2436  do {
2437  nth -= count_utf8_lead_bytes_with_word(s);
2438  s++;
2439  } while (s < t && (int)SIZEOF_VOIDP <= nth);
2440  p = (char *)s;
2441  }
 /* finish byte by byte; stop on the leading byte where nth hits 0 */
2442  while (p < e) {
2443  if (is_utf8_lead_byte(*p)) {
2444  if (nth == 0) break;
2445  nth--;
2446  }
2447  p++;
2448  }
2449  *nthp = nth;
2450  return (char *)p;
2451 }
2452 
/* Byte offset from p of the nth character of the UTF-8 range [p, e). */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *const pos = str_utf8_nth(p, e, &remaining);

    return pos - p;
}
2459 #endif
2460 
/*
 * byte offset to char offset
 *
 * Returns `pos` unchanged for single-byte-optimizable strings and for
 * negative positions; otherwise counts the characters in the first `pos`
 * bytes of str.
 * NOTE(review): the signature line (listing line 2463) is absent;
 * presumably rb_str_sublen(VALUE str, long pos) — confirm.
 */
2462 long
2464 {
2465  if (single_byte_optimizable(str) || pos < 0)
2466  return pos;
2467  else {
2468  char *p = RSTRING_PTR(str);
2469  return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2470  }
2471 }
2472 
/*
 * Return a new string for the byte range [beg, beg+len) of str.
 * Long results reuse str's buffer through a second string (str2) whose
 * pointer is advanced by beg; short ones are copied with
 * rb_str_new_with_class(). Encoding/coderange are then derived from str.
 * NOTE(review): listing lines 2479 and 2481 are absent — the second half of
 * the `if` condition and the statement that creates str2 in the first
 * branch are not visible. Confirm against the complete file.
 */
2473 VALUE
2474 rb_str_subseq(VALUE str, long beg, long len)
2475 {
2476  VALUE str2;
2477 
2478  if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2480  long olen;
2482  RSTRING(str2)->as.heap.ptr += beg;
2483  olen = RSTRING(str2)->as.heap.len;
 /* only ever shrink the visible length */
2484  if (olen > len) RSTRING(str2)->as.heap.len = len;
2485  }
2486  else {
2487  str2 = rb_str_new_with_class(str, RSTRING_PTR(str)+beg, len);
2488  RB_GC_GUARD(str);
2489  }
2490 
2491  rb_enc_cr_str_copy_for_substr(str2, str);
2492 
2493  return str2;
2494 }
2495 
/*
 * Locate a substring of str.
 * beg is a character index (negative values count from the end) and *lenp
 * a character count on entry. On success the function returns a pointer to
 * the first byte of the substring inside str's buffer and stores the
 * substring's length in BYTES in *lenp. Returns 0 when the range is out of
 * bounds or the incoming length is negative.
 */
2496 char *
2497 rb_str_subpos(VALUE str, long beg, long *lenp)
2498 {
2499  long len = *lenp;
2500  long slen = -1L;
2501  long blen = RSTRING_LEN(str);
2502  rb_encoding *enc = STR_ENC_GET(str);
2503  char *p, *s = RSTRING_PTR(str), *e = s + blen;
2504 
2505  if (len < 0) return 0;
2506  if (!blen) {
 /* empty string: nothing can be selected */
2507  len = 0;
2508  }
2509  if (single_byte_optimizable(str)) {
 /* characters are bytes: plain index arithmetic */
2510  if (beg > blen) return 0;
2511  if (beg < 0) {
2512  beg += blen;
2513  if (beg < 0) return 0;
2514  }
2515  if (len > blen - beg)
2516  len = blen - beg;
2517  if (len < 0) return 0;
2518  p = s + beg;
2519  goto end;
2520  }
2521  if (beg < 0) {
 /* cannot take more characters than lie between beg and the end */
2522  if (len > -beg) len = -beg;
2523  if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
 /* start is close to the end: walk backwards from e */
2524  beg = -beg;
 /* first move e back to the end of the wanted range ... */
2525  while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2526  p = e;
2527  if (!p) return 0;
 /* ... then move p back len characters to its start */
2528  while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2529  if (!p) return 0;
2530  len = e - p;
2531  goto end;
2532  }
2533  else {
 /* otherwise convert to a non-negative index via the char length */
2534  slen = str_strlen(str, enc);
2535  beg += slen;
2536  if (beg < 0) return 0;
2537  p = s + beg;
2538  if (len == 0) goto end;
2539  }
2540  }
2541  else if (beg > 0 && beg > RSTRING_LEN(str)) {
 /* more characters requested than there are bytes */
2542  return 0;
2543  }
2544  if (len == 0) {
2545  if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2546  p = s + beg;
2547  }
2548 #ifdef NONASCII_MASK
2549  else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2550  enc == rb_utf8_encoding()) {
 /* valid UTF-8: use the leading-byte scanners */
2551  p = str_utf8_nth(s, e, &beg);
2552  if (beg > 0) return 0;
2553  len = str_utf8_offset(p, e, len);
2554  }
2555 #endif
2556  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
 /* fixed-width encoding: scale indexes by the character size */
2557  int char_sz = rb_enc_mbmaxlen(enc);
2558 
2559  p = s + beg * char_sz;
2560  if (p > e) {
2561  return 0;
2562  }
2563  else if (len * char_sz > e - p)
2564  len = e - p;
2565  else
2566  len *= char_sz;
2567  }
2568  else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
 /* start landed on the end: valid only if beg was fully consumed */
2569  if (beg > 0) return 0;
2570  len = 0;
2571  }
2572  else {
2573  len = str_offset(p, e, len, enc, 0);
2574  }
2575  end:
2576  *lenp = len;
2577  RB_GC_GUARD(str);
2578  return p;
2579 }
2580 
2581 static VALUE str_substr(VALUE str, long beg, long len, int empty);
2582 
2583 VALUE
2584 rb_str_substr(VALUE str, long beg, long len)
2585 {
2586  return str_substr(str, beg, len, TRUE);
2587 }
2588 
/*
 * Return the substring of str selected by character index beg and character
 * count len, or Qnil when rb_str_subpos() rejects the range. In the copying
 * branch a zero-length result also yields Qnil unless `empty` is true.
 * NOTE(review): listing line 2597 — the second half of the `if` condition —
 * is absent. Confirm against the complete file.
 */
2589 static VALUE
2590 str_substr(VALUE str, long beg, long len, int empty)
2591 {
2592  VALUE str2;
 /* on success len has been converted to a byte length */
2593  char *p = rb_str_subpos(str, beg, &len);
2594 
2595  if (!p) return Qnil;
2596  if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
 /* long result: share the buffer of a frozen copy and narrow it */
2598  long ofs = p - RSTRING_PTR(str);
2599  str2 = rb_str_new_frozen(str);
2600  str2 = str_new_shared(rb_obj_class(str2), str2);
2601  RSTRING(str2)->as.heap.ptr += ofs;
2602  RSTRING(str2)->as.heap.len = len;
2603  ENC_CODERANGE_CLEAR(str2);
2604  }
2605  else {
2606  if (!len && !empty) return Qnil;
2607  str2 = rb_str_new_with_class(str, p, len);
2608  RB_GC_GUARD(str);
2609  }
2610  rb_enc_cr_str_copy_for_substr(str2, str);
2611 
2612  return str2;
2613 }
2614 
/*
 * Freeze str and return it; an already frozen string is returned as is.
 * NOTE(review): listing lines 2616 (signature) and 2619 are absent; a
 * statement executed before rb_obj_freeze() is not visible, and the
 * function is presumably rb_str_freeze(VALUE str) — confirm.
 */
2615 VALUE
2617 {
2618  if (OBJ_FROZEN(str)) return str;
2620  return rb_obj_freeze(str);
2621 }
2622 
2623 
2624 /*
2625  * call-seq:
2626  * +str -> str (mutable)
2627  *
2628  * If the string is frozen, then return duplicated mutable string.
2629  *
2630  * If the string is not frozen, then return the string itself.
2631  */
2632 static VALUE
2633 str_uplus(VALUE str)
2634 {
2635  if (OBJ_FROZEN(str)) {
2636  return rb_str_dup(str);
2637  }
2638  else {
2639  return str;
2640  }
2641 }
2642 
2643 /*
2644  * call-seq:
2645  * -str -> str (frozen)
2646  *
2647  * Returns a frozen, possibly pre-existing copy of the string.
2648  *
2649  * The string will be deduplicated as long as it does not have
2650  * any instance variables set on it.
2651  */
2652 static VALUE
2653 str_uminus(VALUE str)
2654 {
2655  if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
2656  str = rb_str_dup(str);
2657  }
2658  return rb_fstring(str);
2659 }
2660 
2662 #define rb_str_dup_frozen rb_str_new_frozen
2663 
2664 VALUE
2666 {
2667  if (FL_TEST(str, STR_TMPLOCK)) {
2668  rb_raise(rb_eRuntimeError, "temporal locking already locked string");
2669  }
2671  return str;
2672 }
2673 
2674 VALUE
2676 {
2677  if (!FL_TEST(str, STR_TMPLOCK)) {
2678  rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
2679  }
2681  return str;
2682 }
2683 
2686 {
2688  return rb_ensure(func, arg, rb_str_unlocktmp, str);
2689 }
2690 
2691 void
2693 {
2694  long capa;
2695  const int termlen = TERM_LEN(str);
2696 
2697  str_modifiable(str);
2698  if (STR_SHARED_P(str)) {
2699  rb_raise(rb_eRuntimeError, "can't set length of shared string");
2700  }
2701  if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
2702  rb_bug("probable buffer overflow: %ld for %ld", len, capa);
2703  }
2704  STR_SET_LEN(str, len);
2705  TERM_FILL(&RSTRING_PTR(str)[len], termlen);
2706 }
2707 
2708 VALUE
2710 {
2711  long slen;
2712  int independent;
2713 
2714  if (len < 0) {
2715  rb_raise(rb_eArgError, "negative string size (or size too big)");
2716  }
2717 
2718  independent = str_independent(str);
2720  slen = RSTRING_LEN(str);
2721 
2722  {
2723  long capa;
2724  const int termlen = TERM_LEN(str);
2725  if (STR_EMBED_P(str)) {
2726  if (len == slen) return str;
2727  if (STR_EMBEDDABLE_P(len, termlen)) {
2729  TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2730  return str;
2731  }
2732  str_make_independent_expand(str, slen, len - slen, termlen);
2733  }
2734  else if (STR_EMBEDDABLE_P(len, termlen)) {
2735  char *ptr = STR_HEAP_PTR(str);
2736  STR_SET_EMBED(str);
2737  if (slen > len) slen = len;
2738  if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
2739  TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2741  if (independent) ruby_xfree(ptr);
2742  return str;
2743  }
2744  else if (!independent) {
2745  if (len == slen) return str;
2746  str_make_independent_expand(str, slen, len - slen, termlen);
2747  }
2748  else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
2749  (capa - len) > (len < 1024 ? len : 1024)) {
2750  SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2751  (size_t)len + termlen, STR_HEAP_SIZE(str));
2752  RSTRING(str)->as.heap.aux.capa = len;
2753  }
2754  else if (len == slen) return str;
2755  RSTRING(str)->as.heap.len = len;
2756  TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
2757  }
2758  return str;
2759 }
2760 
2761 static VALUE
2762 str_buf_cat(VALUE str, const char *ptr, long len)
2763 {
2764  long capa, total, olen, off = -1;
2765  char *sptr;
2766  const int termlen = TERM_LEN(str);
2767  assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
2768 
2769  RSTRING_GETMEM(str, sptr, olen);
2770  if (ptr >= sptr && ptr <= sptr + olen) {
2771  off = ptr - sptr;
2772  }
2773  rb_str_modify(str);
2774  if (len == 0) return 0;
2775  if (STR_EMBED_P(str)) {
2776  capa = RSTRING_EMBED_LEN_MAX + 1 - termlen;
2777  sptr = RSTRING(str)->as.ary;
2778  olen = RSTRING_EMBED_LEN(str);
2779  }
2780  else {
2781  capa = RSTRING(str)->as.heap.aux.capa;
2782  sptr = RSTRING(str)->as.heap.ptr;
2783  olen = RSTRING(str)->as.heap.len;
2784  }
2785  if (olen > LONG_MAX - len) {
2786  rb_raise(rb_eArgError, "string sizes too big");
2787  }
2788  total = olen + len;
2789  if (capa < total) {
2790  if (total >= LONG_MAX / 2) {
2791  capa = total;
2792  }
2793  while (total > capa) {
2794  capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
2795  }
2796  RESIZE_CAPA_TERM(str, capa, termlen);
2797  sptr = RSTRING_PTR(str);
2798  }
2799  if (off != -1) {
2800  ptr = sptr + off;
2801  }
2802  memcpy(sptr + olen, ptr, len);
2803  STR_SET_LEN(str, total);
2804  TERM_FILL(sptr + total, termlen); /* sentinel */
2805 
2806  return str;
2807 }
2808 
2809 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
2810 
2811 VALUE
2812 rb_str_cat(VALUE str, const char *ptr, long len)
2813 {
2814  if (len == 0) return str;
2815  if (len < 0) {
2816  rb_raise(rb_eArgError, "negative string size (or size too big)");
2817  }
2818  return str_buf_cat(str, ptr, len);
2819 }
2820 
2821 VALUE
2823 {
2824  must_not_null(ptr);
2825  return rb_str_buf_cat(str, ptr, strlen(ptr));
2826 }
2827 
2831 
/*
 * Appends len bytes at ptr to str while keeping str's encoding index and
 * code range consistent with the appended data.
 *
 * ptr_encindex / ptr_cr describe the appended bytes; an unknown ptr_cr is
 * scanned on demand with coderange_scan().  When ptr_cr_ret is non-NULL
 * it receives the (possibly freshly scanned) code range of the appended
 * bytes.  Raises Encoding::CompatibilityError for incompatible
 * encodings and ArgumentError for a negative len.  Returns str.
 *
 * NOTE(review): listing lines 2841 and 2855 are absent from this copy.
 * No visible line assigns str_cr before line 2844 reads it, so the
 * initialisation is presumably on the missing line 2841 -- confirm
 * against the real source.
 */
2832 static VALUE
2833 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
2834  int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
2835 {
2836  int str_encindex = ENCODING_GET(str);
2837  int res_encindex;
2838  int str_cr, res_cr;
2839  rb_encoding *str_enc, *ptr_enc;
2840 
2842 
2843  if (str_encindex == ptr_encindex) {
2844  if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
2845  ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
2846  }
2847  }
2848  else {
2849  str_enc = rb_enc_from_index(str_encindex);
2850  ptr_enc = rb_enc_from_index(ptr_encindex);
/* at least one side is not ASCII-compatible: only trivial cases succeed */
2851  if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
2852  if (len == 0)
2853  return str;
2854  if (RSTRING_LEN(str) == 0) {
2856  ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
2857  return str;
2858  }
2859  goto incompatible;
2860  }
2861  if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
2862  ptr_cr = coderange_scan(ptr, len, ptr_enc);
2863  }
2864  if (str_cr == ENC_CODERANGE_UNKNOWN) {
2865  if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
2866  str_cr = rb_enc_str_coderange(str);
2867  }
2868  }
2869  }
2870  if (ptr_cr_ret)
2871  *ptr_cr_ret = ptr_cr;
2872 
/* different encodings are tolerated only if one side is 7-bit */
2873  if (str_encindex != ptr_encindex &&
2874  str_cr != ENC_CODERANGE_7BIT &&
2875  ptr_cr != ENC_CODERANGE_7BIT) {
2876  str_enc = rb_enc_from_index(str_encindex);
2877  ptr_enc = rb_enc_from_index(ptr_encindex);
2878  incompatible:
2879  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
2880  rb_enc_name(str_enc), rb_enc_name(ptr_enc));
2881  }
2882 
/* choose the encoding index and code range of the result */
2883  if (str_cr == ENC_CODERANGE_UNKNOWN) {
2884  res_encindex = str_encindex;
2885  res_cr = ENC_CODERANGE_UNKNOWN;
2886  }
2887  else if (str_cr == ENC_CODERANGE_7BIT) {
2888  if (ptr_cr == ENC_CODERANGE_7BIT) {
2889  res_encindex = str_encindex;
2890  res_cr = ENC_CODERANGE_7BIT;
2891  }
2892  else {
/* a 7-bit receiver takes over the encoding of the appended data */
2893  res_encindex = ptr_encindex;
2894  res_cr = ptr_cr;
2895  }
2896  }
2897  else if (str_cr == ENC_CODERANGE_VALID) {
2898  res_encindex = str_encindex;
2899  if (ENC_CODERANGE_CLEAN_P(ptr_cr))
2900  res_cr = str_cr;
2901  else
2902  res_cr = ptr_cr;
2903  }
2904  else { /* str_cr == ENC_CODERANGE_BROKEN */
2905  res_encindex = str_encindex;
2906  res_cr = str_cr;
/* appending anything to a broken string resets the range to unknown */
2907  if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
2908  }
2909 
2910  if (len < 0) {
2911  rb_raise(rb_eArgError, "negative string size (or size too big)");
2912  }
2913  str_buf_cat(str, ptr, len);
2914  ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
2915  return str;
2916 }
2917 
/*
 * Appends len bytes at ptr, described by the encoding ptr_enc, to str by
 * delegating to rb_enc_cr_str_buf_cat().
 *
 * NOTE(review): listing line 2922, which holds the remaining arguments
 * of the call, is absent from this copy.
 */
2918 VALUE
2919 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
2920 {
2921  return rb_enc_cr_str_buf_cat(str, ptr, len,
2923 }
2924 
2925 VALUE
2927 {
2928  /* ptr must reference NUL terminated ASCII string. */
2929  int encindex = ENCODING_GET(str);
2930  rb_encoding *enc = rb_enc_from_index(encindex);
2931  if (rb_enc_asciicompat(enc)) {
2932  return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
2933  encindex, ENC_CODERANGE_7BIT, 0);
2934  }
2935  else {
2936  char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
2937  while (*ptr) {
2938  unsigned int c = (unsigned char)*ptr;
2939  int len = rb_enc_codelen(c, enc);
2940  rb_enc_mbcput(c, buf, enc);
2941  rb_enc_cr_str_buf_cat(str, buf, len,
2942  encindex, ENC_CODERANGE_VALID, 0);
2943  ptr++;
2944  }
2945  return str;
2946  }
2947 }
2948 
2949 VALUE
2951 {
2952  int str2_cr;
2953 
2954  str2_cr = ENC_CODERANGE(str2);
2955 
2956  rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
2957  ENCODING_GET(str2), str2_cr, &str2_cr);
2958 
2959  ENC_CODERANGE_SET(str2, str2_cr);
2960 
2961  return str;
2962 }
2963 
2964 VALUE
2966 {
2967  StringValue(str2);
2968  return rb_str_buf_append(str, str2);
2969 }
2970 
/*
 * Total length below which rb_str_concat_literals() starts from a copy
 * of the first element instead of pre-allocating a buffer.
 */
2971 #define MIN_PRE_ALLOC_SIZE 48
2972 
/*
 * Concatenates the num strings in strary into one new string.
 *
 * num == 0 gives a new empty string and num == 1 a resurrected copy of
 * the single element.  Otherwise the lengths are summed (starting from
 * 1) to choose between growing a copy of the first element and filling a
 * pre-sized buffer that takes the first element's encoding.
 *
 * NOTE(review): listing lines 2973 (the return-type line) and 3001 are
 * absent from this copy.
 */
2974 rb_str_concat_literals(size_t num, const VALUE *strary)
2975 {
2976  VALUE str;
2977  size_t i, s;
2978  long len = 1;
2979 
2980  if (UNLIKELY(!num)) return rb_str_new(0, 0);
2981  if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
2982 
2983  for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
2984  if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
/* short result: start from the first element, append from index 1 */
2985  str = rb_str_resurrect(strary[0]);
2986  s = 1;
2987  }
2988  else {
/* long result: pre-sized buffer, append every element from index 0 */
2989  str = rb_str_buf_new(len);
2990  rb_enc_copy(str, strary[0]);
2991  s = 0;
2992  }
2993 
2994  for (i = s; i < num; ++i) {
2995  const VALUE v = strary[i];
2996  int encidx = ENCODING_GET(v);
2997 
2998  rb_enc_cr_str_buf_cat(str, RSTRING_PTR(v), RSTRING_LEN(v),
2999  encidx, ENC_CODERANGE(v), NULL);
3000  if (encidx != ENCINDEX_US_ASCII) {
3002  rb_enc_set_index(str, encidx);
3003  }
3004  }
3005  return str;
3006 }
3007 
3008 /*
3009  * call-seq:
3010  * str.concat(obj1, obj2, ...) -> str
3011  *
3012  * Concatenates the given object(s) to <i>str</i>. If an object is an
3013  * Integer, it is considered a codepoint and converted to a character
3014  * before concatenation.
3015  *
3016  * +concat+ can take multiple arguments, and all the arguments are
3017  * concatenated in order.
3018  *
3019  * a = "hello "
3020  * a.concat("world", 33) #=> "hello world!"
3021  * a #=> "hello world!"
3022  *
3023  * b = "sn"
3024  * b.concat("_", b, "_", b) #=> "sn_sn_sn"
3025  *
3026  * See also String#<<, which takes a single argument.
3027  */
3028 static VALUE
3029 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3030 {
3031  str_modifiable(str);
3032 
3033  if (argc == 1) {
3034  return rb_str_concat(str, argv[0]);
3035  }
3036  else if (argc > 1) {
3037  int i;
3038  VALUE arg_str = rb_str_tmp_new(0);
3039  rb_enc_copy(arg_str, str);
3040  for (i = 0; i < argc; i++) {
3041  rb_str_concat(arg_str, argv[i]);
3042  }
3043  rb_str_buf_append(str, arg_str);
3044  }
3045 
3046  return str;
3047 }
3048 
3049 /*
3050  * call-seq:
3051  * str << obj -> str
3052  * str << integer -> str
3053  *
3054  * Appends the given object to <i>str</i>. If the object is an
3055  * Integer, it is considered a codepoint and converted to a character
3056  * before being appended.
3057  *
3058  * a = "hello "
3059  * a << "world" #=> "hello world"
3060  * a << 33 #=> "hello world!"
3061  *
3062  * See also String#concat, which takes multiple arguments.
3063  */
3064 VALUE
3066 {
3067  unsigned int code;
3068  rb_encoding *enc = STR_ENC_GET(str1);
3069  int encidx;
3070 
3071  if (RB_INTEGER_TYPE_P(str2)) {
3072  if (rb_num_to_uint(str2, &code) == 0) {
3073  }
3074  else if (FIXNUM_P(str2)) {
3075  rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3076  }
3077  else {
3078  rb_raise(rb_eRangeError, "bignum out of char range");
3079  }
3080  }
3081  else {
3082  return rb_str_append(str1, str2);
3083  }
3084 
3085  encidx = rb_enc_to_index(enc);
3086  if (encidx == ENCINDEX_ASCII || encidx == ENCINDEX_US_ASCII) {
3087  /* US-ASCII automatically extended to ASCII-8BIT */
3088  char buf[1];
3089  buf[0] = (char)code;
3090  if (code > 0xFF) {
3091  rb_raise(rb_eRangeError, "%u out of char range", code);
3092  }
3093  rb_str_cat(str1, buf, 1);
3094  if (encidx == ENCINDEX_US_ASCII && code > 127) {
3097  }
3098  }
3099  else {
3100  long pos = RSTRING_LEN(str1);
3101  int cr = ENC_CODERANGE(str1);
3102  int len;
3103  char *buf;
3104 
3105  switch (len = rb_enc_codelen(code, enc)) {
3107  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3108  break;
3110  case 0:
3111  rb_raise(rb_eRangeError, "%u out of char range", code);
3112  break;
3113  }
3114  buf = ALLOCA_N(char, len + 1);
3115  rb_enc_mbcput(code, buf, enc);
3116  if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3117  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3118  }
3119  rb_str_resize(str1, pos+len);
3120  memcpy(RSTRING_PTR(str1) + pos, buf, len);
3121  if (cr == ENC_CODERANGE_7BIT && code > 127)
3122  cr = ENC_CODERANGE_VALID;
3123  ENC_CODERANGE_SET(str1, cr);
3124  }
3125  return str1;
3126 }
3127 
3128 /*
3129  * call-seq:
3130  * str.prepend(other_str1, other_str2, ...) -> str
3131  *
3132  * Prepend---Prepend the given strings to <i>str</i>.
3133  *
3134  * a = "!"
3135  * a.prepend("hello ", "world") #=> "hello world!"
3136  * a #=> "hello world!"
3137  *
3138  * See also String#concat.
3139  */
3140 
3141 static VALUE
3142 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3143 {
3144  str_modifiable(str);
3145 
3146  if (argc == 1) {
3147  rb_str_update(str, 0L, 0L, argv[0]);
3148  }
3149  else if (argc > 1) {
3150  int i;
3151  VALUE arg_str = rb_str_tmp_new(0);
3152  rb_enc_copy(arg_str, str);
3153  for (i = 0; i < argc; i++) {
3154  rb_str_append(arg_str, argv[i]);
3155  }
3156  rb_str_update(str, 0L, 0L, arg_str);
3157  }
3158 
3159  return str;
3160 }
3161 
3162 st_index_t
3164 {
3165  int e = ENCODING_GET(str);
3167  e = 0;
3168  }
3169  return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3170 }
3171 
3172 int
3174 {
3175  long len1, len2;
3176  const char *ptr1, *ptr2;
3177  RSTRING_GETMEM(str1, ptr1, len1);
3178  RSTRING_GETMEM(str2, ptr2, len2);
3179  return (len1 != len2 ||
3180  !rb_str_comparable(str1, str2) ||
3181  memcmp(ptr1, ptr2, len1) != 0);
3182 }
3183 
3184 /*
3185  * call-seq:
3186  * str.hash -> integer
3187  *
3188  * Returns a hash based on the string's length, content and encoding.
3189  *
3190  * See also Object#hash.
3191  */
3192 
3193 static VALUE
3194 rb_str_hash_m(VALUE str)
3195 {
3196  st_index_t hval = rb_str_hash(str);
3197  return ST2FIX(hval);
3198 }
3199 
3200 #define lesser(a,b) (((a)>(b))?(b):(a))
3201 
3202 int
3204 {
3205  int idx1, idx2;
3206  int rc1, rc2;
3207 
3208  if (RSTRING_LEN(str1) == 0) return TRUE;
3209  if (RSTRING_LEN(str2) == 0) return TRUE;
3210  idx1 = ENCODING_GET(str1);
3211  idx2 = ENCODING_GET(str2);
3212  if (idx1 == idx2) return TRUE;
3213  rc1 = rb_enc_str_coderange(str1);
3214  rc2 = rb_enc_str_coderange(str2);
3215  if (rc1 == ENC_CODERANGE_7BIT) {
3216  if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3218  return TRUE;
3219  }
3220  if (rc2 == ENC_CODERANGE_7BIT) {
3222  return TRUE;
3223  }
3224  return FALSE;
3225 }
3226 
3227 int
3229 {
3230  long len1, len2;
3231  const char *ptr1, *ptr2;
3232  int retval;
3233 
3234  if (str1 == str2) return 0;
3235  RSTRING_GETMEM(str1, ptr1, len1);
3236  RSTRING_GETMEM(str2, ptr2, len2);
3237  if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3238  if (len1 == len2) {
3239  if (!rb_str_comparable(str1, str2)) {
3240  if (ENCODING_GET(str1) > ENCODING_GET(str2))
3241  return 1;
3242  return -1;
3243  }
3244  return 0;
3245  }
3246  if (len1 > len2) return 1;
3247  return -1;
3248  }
3249  if (retval > 0) return 1;
3250  return -1;
3251 }
3252 
3253 /*
3254  * call-seq:
3255  * str == obj -> true or false
3256  * str === obj -> true or false
3257  *
3258  * Equality---Returns whether +str+ == +obj+, similar to Object#==.
3259  *
3260  * If +obj+ is not an instance of String but responds to +to_str+, then the
3261  * two strings are compared using <code>obj.==</code>.
3262  *
3263  * Otherwise, returns similarly to String#eql?, comparing length and content.
3264  */
3265 
3266 VALUE
3268 {
3269  if (str1 == str2) return Qtrue;
3270  if (!RB_TYPE_P(str2, T_STRING)) {
3271  if (!rb_respond_to(str2, idTo_str)) {
3272  return Qfalse;
3273  }
3274  return rb_equal(str2, str1);
3275  }
3276  return rb_str_eql_internal(str1, str2);
3277 }
3278 
3279 /*
3280  * call-seq:
3281  * str.eql?(other) -> true or false
3282  *
3283  * Two strings are equal if they have the same length and content.
3284  */
3285 
3288 {
3289  if (str1 == str2) return Qtrue;
3290  if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3291  return rb_str_eql_internal(str1, str2);
3292 }
3293 
3294 /*
3295  * call-seq:
3296  * string <=> other_string -> -1, 0, +1, or nil
3297  *
3298  * Comparison---Returns -1, 0, +1, or +nil+ depending on whether +string+ is
3299  * less than, equal to, or greater than +other_string+.
3300  *
3301  * +nil+ is returned if the two values are incomparable.
3302  *
3303  * If the strings are of different lengths, and the strings are equal when
3304  * compared up to the shortest length, then the longer string is considered
3305  * greater than the shorter one.
3306  *
3307  * <code><=></code> is the basis for the methods <code><</code>,
3308  * <code><=</code>, <code>></code>, <code>>=</code>, and
3309  * <code>between?</code>, included from module Comparable. The method
3310  * String#== does not use Comparable#==.
3311  *
3312  * "abcdef" <=> "abcde" #=> 1
3313  * "abcdef" <=> "abcdef" #=> 0
3314  * "abcdef" <=> "abcdefg" #=> -1
3315  * "abcdef" <=> "ABCDEF" #=> 1
3316  * "abcdef" <=> 1 #=> nil
3317  */
3318 
3319 static VALUE
3320 rb_str_cmp_m(VALUE str1, VALUE str2)
3321 {
3322  int result;
3323  VALUE s = rb_check_string_type(str2);
3324  if (NIL_P(s)) {
3325  return rb_invcmp(str1, str2);
3326  }
3327  result = rb_str_cmp(str1, s);
3328  return INT2FIX(result);
3329 }
3330 
3331 static VALUE str_casecmp(VALUE str1, VALUE str2);
3332 static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3333 
3334 /*
3335  * call-seq:
3336  * str.casecmp(other_str) -> -1, 0, +1, or nil
3337  *
3338  * Case-insensitive version of String#<=>.
3339  * Currently, case-insensitivity only works on characters A-Z/a-z,
3340  * not all of Unicode. This is different from String#casecmp?.
3341  *
3342  * "aBcDeF".casecmp("abcde") #=> 1
3343  * "aBcDeF".casecmp("abcdef") #=> 0
3344  * "aBcDeF".casecmp("abcdefg") #=> -1
3345  * "abcdef".casecmp("ABCDEF") #=> 0
3346  *
3347  * +nil+ is returned if the two strings have incompatible encodings,
3348  * or if +other_str+ is not a string.
3349  *
3350  * "foo".casecmp(2) #=> nil
3351  * "\u{e4 f6 fc}".encode("ISO-8859-1").casecmp("\u{c4 d6 dc}") #=> nil
3352  */
3353 
3354 static VALUE
3355 rb_str_casecmp(VALUE str1, VALUE str2)
3356 {
3357  VALUE s = rb_check_string_type(str2);
3358  if (NIL_P(s)) {
3359  return Qnil;
3360  }
3361  return str_casecmp(str1, s);
3362 }
3363 
3364 static VALUE
3365 str_casecmp(VALUE str1, VALUE str2)
3366 {
3367  long len;
3368  rb_encoding *enc;
3369  char *p1, *p1end, *p2, *p2end;
3370 
3371  enc = rb_enc_compatible(str1, str2);
3372  if (!enc) {
3373  return Qnil;
3374  }
3375 
3376  p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3377  p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3378  if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3379  while (p1 < p1end && p2 < p2end) {
3380  if (*p1 != *p2) {
3381  unsigned int c1 = TOLOWER(*p1 & 0xff);
3382  unsigned int c2 = TOLOWER(*p2 & 0xff);
3383  if (c1 != c2)
3384  return INT2FIX(c1 < c2 ? -1 : 1);
3385  }
3386  p1++;
3387  p2++;
3388  }
3389  }
3390  else {
3391  while (p1 < p1end && p2 < p2end) {
3392  int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3393  int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3394 
3395  if (0 <= c1 && 0 <= c2) {
3396  c1 = TOLOWER(c1);
3397  c2 = TOLOWER(c2);
3398  if (c1 != c2)
3399  return INT2FIX(c1 < c2 ? -1 : 1);
3400  }
3401  else {
3402  int r;
3403  l1 = rb_enc_mbclen(p1, p1end, enc);
3404  l2 = rb_enc_mbclen(p2, p2end, enc);
3405  len = l1 < l2 ? l1 : l2;
3406  r = memcmp(p1, p2, len);
3407  if (r != 0)
3408  return INT2FIX(r < 0 ? -1 : 1);
3409  if (l1 != l2)
3410  return INT2FIX(l1 < l2 ? -1 : 1);
3411  }
3412  p1 += l1;
3413  p2 += l2;
3414  }
3415  }
3416  if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3417  if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3418  return INT2FIX(-1);
3419 }
3420 
3421 /*
3422  * call-seq:
3423  * str.casecmp?(other_str) -> true, false, or nil
3424  *
3425  * Returns +true+ if +str+ and +other_str+ are equal after
3426  * Unicode case folding, +false+ if they are not equal.
3427  *
3428  * "aBcDeF".casecmp?("abcde") #=> false
3429  * "aBcDeF".casecmp?("abcdef") #=> true
3430  * "aBcDeF".casecmp?("abcdefg") #=> false
3431  * "abcdef".casecmp?("ABCDEF") #=> true
3432  * "\u{e4 f6 fc}".casecmp?("\u{c4 d6 dc}") #=> true
3433  *
3434  * +nil+ is returned if the two strings have incompatible encodings,
3435  * or if +other_str+ is not a string.
3436  *
3437  * "foo".casecmp?(2) #=> nil
3438  * "\u{e4 f6 fc}".encode("ISO-8859-1").casecmp?("\u{c4 d6 dc}") #=> nil
3439  */
3440 
3441 static VALUE
3442 rb_str_casecmp_p(VALUE str1, VALUE str2)
3443 {
3444  VALUE s = rb_check_string_type(str2);
3445  if (NIL_P(s)) {
3446  return Qnil;
3447  }
3448  return str_casecmp_p(str1, s);
3449 }
3450 
3451 static VALUE
3452 str_casecmp_p(VALUE str1, VALUE str2)
3453 {
3454  rb_encoding *enc;
3455  VALUE folded_str1, folded_str2;
3456  VALUE fold_opt = sym_fold;
3457 
3458  enc = rb_enc_compatible(str1, str2);
3459  if (!enc) {
3460  return Qnil;
3461  }
3462 
3463  folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3464  folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3465 
3466  return rb_str_eql(folded_str1, folded_str2);
3467 }
3468 
3469 static long
3470 strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3471  const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3472 {
3473  const char *search_start = str_ptr;
3474  long pos, search_len = str_len - offset;
3475 
3476  for (;;) {
3477  const char *t;
3478  pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3479  if (pos < 0) return pos;
3480  t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3481  if (t == search_start + pos) break;
3482  search_len -= t - search_start;
3483  if (search_len <= 0) return -1;
3484  offset += t - search_start;
3485  search_start = t;
3486  }
3487  return pos + offset;
3488 }
3489 
3490 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3491 
3492 static long
3493 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3494 {
3495  const char *str_ptr, *str_ptr_end, *sub_ptr;
3496  long str_len, sub_len;
3497  int single_byte = single_byte_optimizable(str);
3498  rb_encoding *enc;
3499 
3500  enc = rb_enc_check(str, sub);
3501  if (is_broken_string(sub)) return -1;
3502 
3503  str_ptr = RSTRING_PTR(str);
3504  str_ptr_end = RSTRING_END(str);
3505  str_len = RSTRING_LEN(str);
3506  sub_ptr = RSTRING_PTR(sub);
3507  sub_len = RSTRING_LEN(sub);
3508 
3509  if (str_len < sub_len) return -1;
3510 
3511  if (offset != 0) {
3512  long str_len_char, sub_len_char;
3513  str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3514  sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3515  if (offset < 0) {
3516  offset += str_len_char;
3517  if (offset < 0) return -1;
3518  }
3519  if (str_len_char - offset < sub_len_char) return -1;
3520  if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3521  str_ptr += offset;
3522  }
3523  if (sub_len == 0) return offset;
3524 
3525  /* need proceed one character at a time */
3526  return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3527 }
3528 
3529 
3530 /*
3531  * call-seq:
3532  * str.index(substring [, offset]) -> integer or nil
3533  * str.index(regexp [, offset]) -> integer or nil
3534  *
3535  * Returns the index of the first occurrence of the given <i>substring</i> or
3536  * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
3537  * found. If the second parameter is present, it specifies the position in the
3538  * string to begin the search.
3539  *
3540  * "hello".index('e') #=> 1
3541  * "hello".index('lo') #=> 3
3542  * "hello".index('a') #=> nil
3543  * "hello".index(?e) #=> 1
3544  * "hello".index(/[aeiou]/, -3) #=> 4
3545  */
3546 
3547 static VALUE
3548 rb_str_index_m(int argc, VALUE *argv, VALUE str)
3549 {
/*
 * Implementation notes: accepts one required argument (the pattern)
 * and one optional start position, which defaults to 0.
 * NOTE(review): listing lines 3564 and 3589 are absent from this copy;
 * the Regexp branch at line 3563 and the rb_raise() call at line 3588
 * are therefore incomplete as shown.
 */
3550  VALUE sub;
3551  VALUE initpos;
3552  long pos;
3553 
3554  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3555  pos = NUM2LONG(initpos);
3556  }
3557  else {
3558  pos = 0;
3559  }
/* a negative position counts from the end; still negative means nil */
3560  if (pos < 0) {
3561  pos += str_strlen(str, NULL);
3562  if (pos < 0) {
3563  if (RB_TYPE_P(sub, T_REGEXP)) {
3565  }
3566  return Qnil;
3567  }
3568  }
3569 
3570  if (SPECIAL_CONST_P(sub)) goto generic;
3571  switch (BUILTIN_TYPE(sub)) {
3572  case T_REGEXP:
3573  if (pos > str_strlen(str, NULL))
3574  return Qnil;
/* rb_reg_search() gets a str_offset()-converted position; its result goes back through rb_str_sublen() */
3575  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3576  rb_enc_check(str, sub), single_byte_optimizable(str));
3577 
3578  pos = rb_reg_search(sub, str, pos, 0);
3579  pos = rb_str_sublen(str, pos);
3580  break;
3581 
3582  generic:
3583  default: {
3584  VALUE tmp;
3585 
/* anything else must convert to a String, otherwise TypeError */
3586  tmp = rb_check_string_type(sub);
3587  if (NIL_P(tmp)) {
3588  rb_raise(rb_eTypeError, "type mismatch: %s given",
3590  }
3591  sub = tmp;
3592  }
3593  /* fall through */
3594  case T_STRING:
3595  pos = rb_str_index(str, sub, pos);
3596  pos = rb_str_sublen(str, pos);
3597  break;
3598  }
3599 
/* -1 from either search means "not found" */
3600  if (pos == -1) return Qnil;
3601  return LONG2NUM(pos);
3602 }
3603 
3604 #ifdef HAVE_MEMRCHR
3605 static long
3606 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3607 {
3608  char *hit, *adjusted;
3609  int c;
3610  long slen, searchlen;
3611  char *sbeg, *e, *t;
3612 
3613  slen = RSTRING_LEN(sub);
3614  if (slen == 0) return pos;
3615  sbeg = RSTRING_PTR(str);
3616  e = RSTRING_END(str);
3617  t = RSTRING_PTR(sub);
3618  c = *t & 0xff;
3619  searchlen = s - sbeg + 1;
3620 
3621  do {
3622  hit = memrchr(sbeg, c, searchlen);
3623  if (!hit) break;
3624  adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
3625  if (hit != adjusted) {
3626  searchlen = adjusted - sbeg;
3627  continue;
3628  }
3629  if (memcmp(hit, t, slen) == 0)
3630  return rb_str_sublen(str, hit - sbeg);
3631  searchlen = adjusted - sbeg;
3632  } while (searchlen > 0);
3633 
3634  return -1;
3635 }
3636 #else
3637 static long
3638 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3639 {
3640  long slen;
3641  char *sbeg, *e, *t;
3642 
3643  sbeg = RSTRING_PTR(str);
3644  e = RSTRING_END(str);
3645  t = RSTRING_PTR(sub);
3646  slen = RSTRING_LEN(sub);
3647 
3648  while (s) {
3649  if (memcmp(s, t, slen) == 0) {
3650  return pos;
3651  }
3652  if (pos == 0) break;
3653  pos--;
3654  s = rb_enc_prev_char(sbeg, s, e, enc);
3655  }
3656 
3657  return -1;
3658 }
3659 #endif
3660 
3661 static long
3662 rb_str_rindex(VALUE str, VALUE sub, long pos)
3663 {
3664  long len, slen;
3665  char *sbeg, *s;
3666  rb_encoding *enc;
3667  int singlebyte;
3668 
3669  enc = rb_enc_check(str, sub);
3670  if (is_broken_string(sub)) return -1;
3671  singlebyte = single_byte_optimizable(str);
3672  len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
3673  slen = str_strlen(sub, enc); /* rb_enc_check */
3674 
3675  /* substring longer than string */
3676  if (len < slen) return -1;
3677  if (len - pos < slen) pos = len - slen;
3678  if (len == 0) return pos;
3679 
3680  sbeg = RSTRING_PTR(str);
3681 
3682  if (pos == 0) {
3683  if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
3684  return 0;
3685  else
3686  return -1;
3687  }
3688 
3689  s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
3690  return str_rindex(str, sub, s, pos, enc);
3691 }
3692 
3693 
3694 /*
3695  * call-seq:
3696  * str.rindex(substring [, integer]) -> integer or nil
3697  * str.rindex(regexp [, integer]) -> integer or nil
3698  *
3699  * Returns the index of the last occurrence of the given <i>substring</i> or
3700  * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
3701  * found. If the second parameter is present, it specifies the position in the
3702  * string to end the search---characters beyond this point will not be
3703  * considered.
3704  *
3705  * "hello".rindex('e') #=> 1
3706  * "hello".rindex('l') #=> 3
3707  * "hello".rindex('a') #=> nil
3708  * "hello".rindex(?e) #=> 1
3709  * "hello".rindex(/[aeiou]/, -2) #=> 1
3710  */
3711 
3712 static VALUE
3713 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
3714 {
/*
 * Implementation notes: accepts one required argument (the pattern)
 * and one optional position, which defaults to the string's length
 * as computed by str_strlen().
 * NOTE(review): listing lines 3726 and 3756 are absent from this copy;
 * the Regexp branch at line 3725 and the rb_raise() call at line 3755
 * are therefore incomplete as shown.
 */
3715  VALUE sub;
3716  VALUE vpos;
3717  rb_encoding *enc = STR_ENC_GET(str);
3718  long pos, len = str_strlen(str, enc); /* str's enc */
3719 
3720  if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
3721  pos = NUM2LONG(vpos);
/* a negative position counts from the end; still negative means nil */
3722  if (pos < 0) {
3723  pos += len;
3724  if (pos < 0) {
3725  if (RB_TYPE_P(sub, T_REGEXP)) {
3727  }
3728  return Qnil;
3729  }
3730  }
/* positions past the end are clamped to the length */
3731  if (pos > len) pos = len;
3732  }
3733  else {
3734  pos = len;
3735  }
3736 
3737  if (SPECIAL_CONST_P(sub)) goto generic;
3738  switch (BUILTIN_TYPE(sub)) {
3739  case T_REGEXP:
3740  /* enc = rb_get_check(str, sub); */
/* rb_reg_search() gets a str_offset()-converted position and 1 as its last argument (0 in rb_str_index_m) */
3741  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3742  enc, single_byte_optimizable(str));
3743 
3744  pos = rb_reg_search(sub, str, pos, 1);
3745  pos = rb_str_sublen(str, pos);
3746  if (pos >= 0) return LONG2NUM(pos);
3747  break;
3748 
3749  generic:
3750  default: {
3751  VALUE tmp;
3752 
/* anything else must convert to a String, otherwise TypeError */
3753  tmp = rb_check_string_type(sub);
3754  if (NIL_P(tmp)) {
3755  rb_raise(rb_eTypeError, "type mismatch: %s given",
3757  }
3758  sub = tmp;
3759  }
3760  /* fall through */
3761  case T_STRING:
3762  pos = rb_str_rindex(str, sub, pos);
3763  if (pos >= 0) return LONG2NUM(pos);
3764  break;
3765  }
/* both searches signal "not found" with a negative position */
3766  return Qnil;
3767 }
3768 
3769 /*
3770  * call-seq:
3771  * str =~ obj -> integer or nil
3772  *
3773  * Match---If <i>obj</i> is a Regexp, uses it as a pattern to match
3774  * against <i>str</i>, and returns the position where the match starts, or
3775  * <code>nil</code> if there is no match. Otherwise, invokes
3776  * <i>obj.=~</i>, passing <i>str</i> as an argument. The default
3777  * <code>=~</code> in Object returns <code>nil</code>.
3778  *
3779  * Note: <code>str =~ regexp</code> is not the same as
3780  * <code>regexp =~ str</code>. Strings captured from named capture groups
3781  * are assigned to local variables only in the second case.
3782  *
3783  * "cat o' 9 tails" =~ /\d/ #=> 7
3784  * "cat o' 9 tails" =~ 9 #=> nil
3785  */
3786 
3787 static VALUE
3788 rb_str_match(VALUE x, VALUE y)
3789 {
3790  if (SPECIAL_CONST_P(y)) goto generic;
3791  switch (BUILTIN_TYPE(y)) {
3792  case T_STRING:
3793  rb_raise(rb_eTypeError, "type mismatch: String given");
3794 
3795  case T_REGEXP:
3796  return rb_reg_match(y, x);
3797 
3798  generic:
3799  default:
3800  return rb_funcall(y, idEqTilde, 1, x);
3801  }
3802 }
3803 
3804 
3805 static VALUE get_pat(VALUE);
3806 
3807 
3808 /*
3809  * call-seq:
3810  * str.match(pattern) -> matchdata or nil
3811  * str.match(pattern, pos) -> matchdata or nil
3812  *
3813  * Converts <i>pattern</i> to a Regexp (if it isn't already one),
3814  * then invokes its <code>match</code> method on <i>str</i>. If the second
3815  * parameter is present, it specifies the position in the string to begin the
3816  * search.
3817  *
3818  * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l">
3819  * 'hello'.match('(.)\1')[0] #=> "ll"
3820  * 'hello'.match(/(.)\1/)[0] #=> "ll"
3821  * 'hello'.match(/(.)\1/, 3) #=> nil
3822  * 'hello'.match('xx') #=> nil
3823  *
3824  * If a block is given, invokes the block with the MatchData if the match succeeds, so
3825  * that you can write
3826  *
3827  * str.match(pat) {|m| ...}
3828  *
3829  * instead of
3830  *
3831  * if m = str.match(pat)
3832  * ...
3833  * end
3834  *
3835  * The return value is a value from block execution in this case.
3836  */
3837 
3838 static VALUE
3839 rb_str_match_m(int argc, VALUE *argv, VALUE str)
3840 {
3841  VALUE re, result;
3842  if (argc < 1)
3843  rb_check_arity(argc, 1, 2);
3844  re = argv[0];
3845  argv[0] = str;
3846  result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
3847  if (!NIL_P(result) && rb_block_given_p()) {
3848  return rb_yield(result);
3849  }
3850  return result;
3851 }
3852 
3853 /*
3854  * call-seq:
3855  * str.match?(pattern) -> true or false
3856  * str.match?(pattern, pos) -> true or false
3857  *
3858  * Converts _pattern_ to a +Regexp+ (if it isn't already one), then
3859  * returns +true+ or +false+ to indicate whether the regexp matches
3860  * _str_, without updating <code>$~</code> and other
3861  * related variables. If the second parameter is present, it
3862  * specifies the position in the string to begin the search.
3863  *
3864  * "Ruby".match?(/R.../) #=> true
3865  * "Ruby".match?(/R.../, 1) #=> false
3866  * "Ruby".match?(/P.../) #=> false
3867  * $& #=> nil
3868  */
3869 
3870 static VALUE
3871 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
3872 {
3873  VALUE re;
3874  rb_check_arity(argc, 1, 2);
3875  re = get_pat(argv[0]);
3876  return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
3877 }
3878 
3883 };
3884 
/*
 * Overwrites the +len+ bytes at +p+ in place with a following character
 * of +enc+.
 *
 * Returns NEIGHBOR_FOUND when a character of exactly +len+ bytes has been
 * written. Returns NEIGHBOR_WRAPPED when the increment carries out of the
 * +len+ bytes (byte path: every byte was 0xff and has been reset to 0;
 * wide path: the incremented code point no longer encodes to +len+ bytes).
 * Returns NEIGHBOR_NOT_CHAR only on the wide path, when the input or the
 * re-encoded output is not a complete character or the code point has no
 * encoded length.
 */
3885 static enum neighbor_char
3886 enc_succ_char(char *p, long len, rb_encoding *enc)
3887 {
3888  long i;
3889  int l;
3890 
3891  if (rb_enc_mbminlen(enc) > 1) {
3892  /* wchar, trivial case */
 /* decode the code point, add one, and re-encode it over the same bytes */
3893  int r = rb_enc_precise_mbclen(p, p + len, enc), c;
3894  if (!MBCLEN_CHARFOUND_P(r)) {
3895  return NEIGHBOR_NOT_CHAR;
3896  }
3897  c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
3898  l = rb_enc_code_to_mbclen(c, enc);
3899  if (!l) return NEIGHBOR_NOT_CHAR;
 /* a different encoded length is reported as a wrap; p is left untouched */
3900  if (l != len) return NEIGHBOR_WRAPPED;
3901  rb_enc_mbcput(c, p, enc);
 /* re-validate what was just written */
3902  r = rb_enc_precise_mbclen(p, p + len, enc);
3903  if (!MBCLEN_CHARFOUND_P(r)) {
3904  return NEIGHBOR_NOT_CHAR;
3905  }
3906  return NEIGHBOR_FOUND;
3907  }
 /* byte path: treat the len bytes as a big-endian counter and keep
  * incrementing until they form a character of exactly len bytes */
3908  while (1) {
 /* propagate the carry: trailing 0xff bytes roll over to 0 */
3909  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
3910  p[i] = '\0';
 /* every byte was 0xff: nothing left to increment */
3911  if (i < 0)
3912  return NEIGHBOR_WRAPPED;
3913  ++((unsigned char*)p)[i];
3914  l = rb_enc_precise_mbclen(p, p+len, enc);
3915  if (MBCLEN_CHARFOUND_P(l)) {
3916  l = MBCLEN_CHARFOUND_LEN(l);
3917  if (l == len) {
3918  return NEIGHBOR_FOUND;
3919  }
3920  else {
 /* a shorter character was found: saturate the bytes after it so
  * the next iteration carries into the character's last byte */
3921  memset(p+l, 0xff, len-l);
3922  }
3923  }
3924  if (MBCLEN_INVALID_P(l) && i < len-1) {
 /* invalid sequence with bytes remaining after the incremented one:
  * find the longest prefix length len2 that is not reported invalid
  * and saturate everything after byte len2 to skip ahead */
3925  long len2;
3926  int l2;
3927  for (len2 = len-1; 0 < len2; len2--) {
3928  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
3929  if (!MBCLEN_INVALID_P(l2))
3930  break;
3931  }
3932  memset(p+len2+1, 0xff, len-(len2+1));
3933  }
3934  }
3935 }
3936 
/*
 * Overwrites the +len+ bytes at +p+ in place with a preceding character
 * of +enc+; the decrementing counterpart of enc_succ_char.
 *
 * Returns NEIGHBOR_FOUND when a character of exactly +len+ bytes has been
 * written. Returns NEIGHBOR_WRAPPED when the decrement borrows out of the
 * +len+ bytes (byte path: every byte was 0 and has been set to 0xff;
 * wide path: the decremented code point no longer encodes to +len+ bytes).
 * Returns NEIGHBOR_NOT_CHAR only on the wide path: invalid input, code
 * point 0, no encoded length, or an invalid re-encoded result.
 */
3937 static enum neighbor_char
3938 enc_pred_char(char *p, long len, rb_encoding *enc)
3939 {
3940  long i;
3941  int l;
3942  if (rb_enc_mbminlen(enc) > 1) {
3943  /* wchar, trivial case */
 /* decode the code point, subtract one, and re-encode it over the same bytes */
3944  int r = rb_enc_precise_mbclen(p, p + len, enc), c;
3945  if (!MBCLEN_CHARFOUND_P(r)) {
3946  return NEIGHBOR_NOT_CHAR;
3947  }
3948  c = rb_enc_mbc_to_codepoint(p, p + len, enc);
 /* code point 0 has no predecessor */
3949  if (!c) return NEIGHBOR_NOT_CHAR;
3950  --c;
3951  l = rb_enc_code_to_mbclen(c, enc);
3952  if (!l) return NEIGHBOR_NOT_CHAR;
 /* a different encoded length is reported as a wrap; p is left untouched */
3953  if (l != len) return NEIGHBOR_WRAPPED;
3954  rb_enc_mbcput(c, p, enc);
 /* re-validate what was just written */
3955  r = rb_enc_precise_mbclen(p, p + len, enc);
3956  if (!MBCLEN_CHARFOUND_P(r)) {
3957  return NEIGHBOR_NOT_CHAR;
3958  }
3959  return NEIGHBOR_FOUND;
3960  }
 /* byte path: treat the len bytes as a big-endian counter and keep
  * decrementing until they form a character of exactly len bytes */
3961  while (1) {
 /* propagate the borrow: trailing 0 bytes roll under to 0xff */
3962  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
3963  p[i] = '\xff';
 /* every byte was 0: nothing left to decrement */
3964  if (i < 0)
3965  return NEIGHBOR_WRAPPED;
3966  --((unsigned char*)p)[i];
3967  l = rb_enc_precise_mbclen(p, p+len, enc);
3968  if (MBCLEN_CHARFOUND_P(l)) {
3969  l = MBCLEN_CHARFOUND_LEN(l);
3970  if (l == len) {
3971  return NEIGHBOR_FOUND;
3972  }
3973  else {
 /* a shorter character was found: zero the bytes after it so the
  * next iteration borrows from the character's last byte */
3974  memset(p+l, 0, len-l);
3975  }
3976  }
3977  if (MBCLEN_INVALID_P(l) && i < len-1) {
 /* invalid sequence with bytes remaining after the decremented one:
  * find the longest prefix length len2 that is not reported invalid
  * and zero everything after byte len2 to skip ahead */
3978  long len2;
3979  int l2;
3980  for (len2 = len-1; 0 < len2; len2--) {
3981  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
3982  if (!MBCLEN_INVALID_P(l2))
3983  break;
3984  }
3985  memset(p+len2+1, 0, len-(len2+1));
3986  }
3987  }
3988 }
3989 
3990 /*
3991  Overwrites +p+ with the succeeding letter in +enc+ and returns
3992  NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
3993  When NEIGHBOR_WRAPPED is returned, the carried-out letter is stored into carry.
3994  This assumes that each range is contiguous and that mbclen
3995  never changes within a range.
3996  NEIGHBOR_NOT_CHAR is returned for an invalid character or when the range has
3997  only one character.
3998  */
3999 static enum neighbor_char
4000 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4001 {
4002  enum neighbor_char ret;
4003  unsigned int c;
4004  int ctype;
4005  int range;
4006  char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4007 
4008  /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4009  int try;
4010  const int max_gaps = 1;
4011 
 /* classify the character: only digits and letters are handled here */
4012  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4013  if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4014  ctype = ONIGENC_CTYPE_DIGIT;
4015  else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4016  ctype = ONIGENC_CTYPE_ALPHA;
4017  else
4018  return NEIGHBOR_NOT_CHAR;
4019 
 /* step forward up to max_gaps+1 times (p is not restored between tries)
  * and accept the first successor that has the same ctype */
4020  MEMCPY(save, p, char, len);
4021  for (try = 0; try <= max_gaps; ++try) {
4022  ret = enc_succ_char(p, len, enc);
4023  if (ret == NEIGHBOR_FOUND) {
4024  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4025  if (rb_enc_isctype(c, ctype, enc))
4026  return NEIGHBOR_FOUND;
4027  }
4028  }
 /* no successor of the same ctype: restore the original character and walk
  * backwards to the first character of its run; range counts the
  * characters visited, including the starting one */
4029  MEMCPY(p, save, char, len);
4030  range = 1;
4031  while (1) {
4032  MEMCPY(save, p, char, len);
4033  ret = enc_pred_char(p, len, enc);
4034  if (ret == NEIGHBOR_FOUND) {
4035  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4036  if (!rb_enc_isctype(c, ctype, enc)) {
 /* stepped out of the run: undo the last step and stop */
4037  MEMCPY(p, save, char, len);
4038  break;
4039  }
4040  }
4041  else {
4042  MEMCPY(p, save, char, len);
4043  break;
4044  }
4045  range++;
4046  }
 /* the run consists of this single character, so it cannot wrap */
4047  if (range == 1) {
4048  return NEIGHBOR_NOT_CHAR;
4049  }
4050 
 /* p now holds the first character of the run (the wrapped value).
  * For letters the carry is a copy of that first character. */
4051  if (ctype != ONIGENC_CTYPE_DIGIT) {
4052  MEMCPY(carry, p, char, len);
4053  return NEIGHBOR_WRAPPED;
4054  }
4055 
 /* for digits the carry is the successor of the run's first digit */
4056  MEMCPY(carry, p, char, len);
4057  enc_succ_char(carry, len, enc);
4058  return NEIGHBOR_WRAPPED;
4059 }
4060 
4061 
4062 static VALUE str_succ(VALUE str);
4063 
4064 /*
4065  * call-seq:
4066  * str.succ -> new_str
4067  * str.next -> new_str
4068  *
4069  * Returns the successor to <i>str</i>. The successor is calculated by
4070  * incrementing characters starting from the rightmost alphanumeric (or
4071  * the rightmost character if there are no alphanumerics) in the
4072  * string. Incrementing a digit always results in another digit, and
4073  * incrementing a letter results in another letter of the same case.
4074  * Incrementing nonalphanumerics uses the underlying character set's
4075  * collating sequence.
4076  *
4077  * If the increment generates a ``carry,'' the character to the left of
4078  * it is incremented. This process repeats until there is no carry,
4079  * adding an additional character if necessary.
4080  *
4081  * "abcd".succ #=> "abce"
4082  * "THX1138".succ #=> "THX1139"
4083  * "<<koala>>".succ #=> "<<koalb>>"
4084  * "1999zzz".succ #=> "2000aaa"
4085  * "ZZZ9999".succ #=> "AAAA0000"
4086  * "***".succ #=> "**+"
4087  */
4088 
4089 VALUE
/* NOTE(review): the declarator line (embedded number 4090) is absent from
 * this listing; the body below refers to a parameter named orig. */
4091 {
4092  VALUE str;
 /* build a new string from orig's bytes, copy encoding/coderange
  * information across, and increment the copy in place via str_succ */
4093  str = rb_str_new_with_class(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
4094  rb_enc_cr_str_copy_for_substr(str, orig);
4095  return str_succ(str);
4096 }
4097 
/*
 * Replaces the contents of +str+ with its successor in place and returns
 * +str+. An empty string is returned unchanged.
 *
 * NOTE(review): the embedded listing numbers skip 4125, 4147, 4171 and 4181
 * inside this function, so some statements may have been lost in
 * extraction; the comments below describe only the visible code.
 */
4098 static VALUE
4099 str_succ(VALUE str)
4100 {
4101  rb_encoding *enc;
4102  char *sbeg, *s, *e, *last_alnum = 0;
4103  int found_alnum = 0;
4104  long l, slen;
 /* default carry is the single byte 0x01, inserted at offset 0 */
4105  char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4106  long carry_pos = 0, carry_len = 1;
4107  enum neighbor_char neighbor = NEIGHBOR_FOUND;
4108 
4109  slen = RSTRING_LEN(str);
4110  if (slen == 0) return str;
4111 
4112  enc = STR_ENC_GET(str);
4113  sbeg = RSTRING_PTR(str);
4114  s = e = sbeg + slen;
4115 
 /* pass 1: walk the characters from right to left, incrementing
  * alphanumerics and propagating the carry leftwards */
4116  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
 /* after an alnum has wrapped and a non-alnum was skipped, stop when the
  * next character is of the other kind (letter vs. digit) than the last
  * wrapped one; the pending carry is then inserted at carry_pos */
4117  if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4118  if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4119  ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4120  break;
4121  }
4122  }
4123  l = rb_enc_precise_mbclen(s, e, enc);
4124  if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
 /* NOTE(review): l is passed on as a byte length but, in the visible
  * code, still holds the raw rb_enc_precise_mbclen result; the missing
  * line 4125 presumably converted it - confirm against upstream */
4126  neighbor = enc_succ_alnum_char(s, l, enc, carry);
4127  switch (neighbor) {
4128  case NEIGHBOR_NOT_CHAR:
 /* not an alnum: leave it alone and look further left */
4129  continue;
4130  case NEIGHBOR_FOUND:
 /* incremented without carry: done */
4131  return str;
4132  case NEIGHBOR_WRAPPED:
4133  last_alnum = s;
4134  break;
4135  }
 /* remember where the carry would be inserted if the scan ends here */
4136  found_alnum = 1;
4137  carry_pos = s - sbeg;
4138  carry_len = l;
4139  }
4140  if (!found_alnum) { /* str contains no alnum */
 /* pass 2: increment the rightmost character with enc_succ_char,
  * moving left for as long as characters wrap */
4141  s = e;
4142  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4143  enum neighbor_char neighbor;
4144  char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4145  l = rb_enc_precise_mbclen(s, e, enc);
4146  if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
 /* NOTE(review): same gap as above (embedded number 4147 is missing) */
 /* increment a scratch copy so s is only overwritten on FOUND/WRAPPED */
4148  MEMCPY(tmp, s, char, l);
4149  neighbor = enc_succ_char(tmp, l, enc);
4150  switch (neighbor) {
4151  case NEIGHBOR_FOUND:
4152  MEMCPY(s, tmp, char, l);
4153  return str;
4154  break;
4155  case NEIGHBOR_WRAPPED:
4156  MEMCPY(s, tmp, char, l);
4157  break;
4158  case NEIGHBOR_NOT_CHAR:
4159  break;
4160  }
4161  if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4162  /* wrapped to \0...\0. search next valid char. */
4163  enc_succ_char(s, l, enc);
4164  }
 /* for non-ASCII-compatible encodings the carry is a copy of the
  * current character instead of the default single byte */
4165  if (!rb_enc_asciicompat(enc)) {
4166  MEMCPY(carry, s, char, l);
4167  carry_len = l;
4168  }
4169  carry_pos = s - sbeg;
4170  }
4172  }
 /* every candidate wrapped: grow the string by carry_len bytes, shift the
  * tail starting at carry_pos to the right and insert the carry there */
4173  RESIZE_CAPA(str, slen + carry_len);
4174  sbeg = RSTRING_PTR(str);
4175  s = sbeg + carry_pos;
4176  memmove(s + carry_len, s, slen - carry_pos);
4177  memmove(s, carry, carry_len);
4178  slen += carry_len;
4179  STR_SET_LEN(str, slen);
4180  TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4182  return str;
4183 }
4184 
4185 
4186 /*
4187  * call-seq:
4188  * str.succ! -> str
4189  * str.next! -> str
4190  *
4191  * Equivalent to String#succ, but modifies the receiver in place.
4192  */
4193 
4194 static VALUE
4195 rb_str_succ_bang(VALUE str)
4196 {
4197  rb_str_modify(str);
4198  str_succ(str);
4199  return str;
4200 }
4201 
/*
 * Returns 1 when each of the first +len+ bytes at +s+ satisfies ISDIGIT,
 * 0 at the first byte that does not. A +len+ of zero or less yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    long pos;

    for (pos = 0; pos < len; pos++) {
        if (!ISDIGIT(s[pos])) {
            return 0;
        }
    }
    return 1;
}
4211 
4212 static int
4213 str_upto_i(VALUE str, VALUE arg)
4214 {
4215  rb_yield(str);
4216  return 0;
4217 }
4218 
4219 /*
4220  * call-seq:
4221  * str.upto(other_str, exclusive=false) {|s| block } -> str
4222  * str.upto(other_str, exclusive=false) -> an_enumerator
4223  *
4224  * Iterates through successive values, starting at <i>str</i> and
4225  * ending at <i>other_str</i> inclusive, passing each value in turn
4226  * to the block. The String#succ method is used to generate each
4227  * value. If optional second argument exclusive is omitted or is
4228  * false, the last value will be included; otherwise it will be
4229  * excluded.
4230  *
4231  * If no block is given, an enumerator is returned instead.
4232  *
4233  * "a8".upto("b6") {|s| print s, ' ' }
4234  * for s in "a8".."b6"
4235  * print s, ' '
4236  * end
4237  *
4238  * <em>produces:</em>
4239  *
4240  * a8 a9 b0 b1 b2 b3 b4 b5 b6
4241  * a8 a9 b0 b1 b2 b3 b4 b5 b6
4242  *
4243  * If <i>str</i> and <i>other_str</i> contain only ASCII numeric characters,
4244  * both are recognized as decimal numbers. In addition, the width of
4245  * the string (e.g. leading zeros) is handled appropriately.
4246  *
4247  * "9".upto("11").to_a #=> ["9", "10", "11"]
4248  * "25".upto("5").to_a #=> []
4249  * "07".upto("11").to_a #=> ["07", "08", "09", "10", "11"]
4250  */
4251 
4252 static VALUE
4253 rb_str_upto(int argc, VALUE *argv, VALUE beg)
4254 {
4255  VALUE end, exclusive;
4256 
4257  rb_scan_args(argc, argv, "11", &end, &exclusive);
4258  RETURN_ENUMERATOR(beg, argc, argv);
4259  return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4260 }
4261 
4262 VALUE
4263 rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4264 {
4265  VALUE current, after_end;
4266  ID succ;
4267  int n, ascii;
4268  rb_encoding *enc;
4269 
4270  CONST_ID(succ, "succ");
4271  StringValue(end);
4272  enc = rb_enc_check(beg, end);
4273  ascii = (is_ascii_string(beg) && is_ascii_string(end));
4274  /* single character */
4275  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
4276  char c = RSTRING_PTR(beg)[0];
4277  char e = RSTRING_PTR(end)[0];
4278 
4279  if (c > e || (excl && c == e)) return beg;
4280  for (;;) {
4281  if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
4282  if (!excl && c == e) break;
4283  c++;
4284  if (excl && c == e) break;
4285  }
4286  return beg;
4287  }
4288  /* both edges are all digits */
4289  if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
4290  all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
4291  all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
4292  VALUE b, e;
4293  int width;
4294 
4295  width = RSTRING_LENINT(beg);
4296  b = rb_str_to_inum(beg, 10, FALSE);
4297  e = rb_str_to_inum(end, 10, FALSE);
4298  if (FIXNUM_P(b) && FIXNUM_P(e)) {
4299  long bi = FIX2LONG(b);
4300  long ei = FIX2LONG(e);
4301  rb_encoding *usascii = rb_usascii_encoding();
4302 
4303  while (bi <= ei) {
4304  if (excl && bi == ei) break;
4305  if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4306  bi++;
4307  }
4308  }
4309  else {
4310  ID op = excl ? '<' : idLE;
4311  VALUE args[2], fmt = rb_fstring_lit("%.*d");
4312 
4313  args[0] = INT2FIX(width);
4314  while (rb_funcall(b, op, 1, e)) {
4315  args[1] = b;
4316  if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4317  b = rb_funcallv(b, succ, 0, 0);
4318  }
4319  }
4320  return beg;
4321  }
4322  /* normal case */
4323  n = rb_str_cmp(beg, end);
4324  if (n > 0 || (excl && n == 0)) return beg;
4325 
4326  after_end = rb_funcallv(end, succ, 0, 0);
4327  current = rb_str_dup(beg);
4328  while (!rb_str_equal(current, after_end)) {
4329  VALUE next = Qnil;
4330  if (excl || !rb_str_equal(current, end))
4331  next = rb_funcallv(current, succ, 0, 0);
4332  if ((*each)(current, arg)) break;
4333  if (NIL_P(next)) break;
4334  current = next;
4335  StringValue(current);
4336  if (excl && rb_str_equal(current, end)) break;
4337  if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
4338  break;
4339  }
4340 
4341  return beg;
4342 }
4343 
4344 VALUE
/* NOTE(review): the declarator line (embedded number 4345) is absent from
 * this listing; the body below uses the names beg, each and arg. */
4346 {
4347  VALUE current;
4348  ID succ;
4349 
4350  CONST_ID(succ, "succ");
4351  /* both edges are all digits */
 /* beg consists of ASCII digits only: iterate it as a decimal integer,
  * formatted with the width of beg */
4352  if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
4353  all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
4354  VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
4355  int width = RSTRING_LENINT(beg);
4356  b = rb_str_to_inum(beg, 10, FALSE);
4357  if (FIXNUM_P(b)) {
 /* fast path: count with a C long while the value stays FIXABLE */
4358  long bi = FIX2LONG(b);
4359  rb_encoding *usascii = rb_usascii_encoding();
4360 
4361  while (FIXABLE(bi)) {
4362  if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4363  bi++;
4364  }
 /* continue from the value that ended the loop above */
4365  b = LONG2NUM(bi);
4366  }
 /* NOTE(review): a break caused by each() returning non-zero in the loop
  * above also lands here, and a break out of the loop below continues
  * into the "normal case" rather than returning - confirm this is
  * intended */
4367  args[0] = INT2FIX(width);
4368  while (1) {
4369  args[1] = b;
4370  if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4371  b = rb_funcallv(b, succ, 0, 0);
4372  }
4373  }
4374  /* normal case */
 /* call succ repeatedly; the successor is computed before the current
  * value is handed to each() */
4375  current = rb_str_dup(beg);
4376  while (1) {
4377  VALUE next = rb_funcallv(current, succ, 0, 0);
4378  if ((*each)(current, arg)) break;
4379  current = next;
4380  StringValue(current);
 /* an empty successor ends the iteration */
4381  if (RSTRING_LEN(current) == 0)
4382  break;
4383  }
4384 
4385  return beg;
4386 }
4387 
4388 static int
4389 include_range_i(VALUE str, VALUE arg)
4390 {
4391  VALUE *argp = (VALUE *)arg;
4392  if (!rb_equal(str, *argp)) return 0;
4393  *argp = Qnil;
4394  return 1;
4395 }
4396 
/*
 * Returns Qtrue when +val+ is one of the strings visited when iterating
 * from +beg+ to +end+ (end excluded when +exclusive+ is truthy), Qfalse
 * otherwise. A nil +val+ or one that rb_check_string_type cannot convert
 * yields Qfalse.
 */
4397 VALUE
4398 rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
4399 {
 /* work on frozen copies of both edges */
4400  beg = rb_str_new_frozen(beg);
4401  StringValue(end);
4402  end = rb_str_new_frozen(end);
4403  if (NIL_P(val)) return Qfalse;
4404  val = rb_check_string_type(val);
4405  if (NIL_P(val)) return Qfalse;
4406  if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
 /* NOTE(review): the rest of this condition (embedded numbers 4407-4408)
  * is absent from this listing */
4409  const char *bp = RSTRING_PTR(beg);
4410  const char *ep = RSTRING_PTR(end);
4411  const char *vp = RSTRING_PTR(val);
 /* shortcut for single-byte edges: compare the bytes directly */
4412  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
4413  if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
4414  return Qfalse;
4415  else {
4416  char b = *bp;
4417  char e = *ep;
4418  char v = *vp;
4419 
 /* only decided here when all three bytes are ASCII; otherwise
  * control falls through to the generic iteration below */
4420  if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
4421  if (b <= v && v < e) return Qtrue;
4422  if (!RTEST(exclusive) && v == e) return Qtrue;
4423  return Qfalse;
4424  }
4425  }
4426  }
4427 #if 0
4428  /* both edges are all digits */
4429  if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
4430  all_digits_p(bp, RSTRING_LEN(beg)) &&
4431  all_digits_p(ep, RSTRING_LEN(end))) {
4432  /* TODO */
4433  }
4434 #endif
4435  }
 /* generic path: iterate the range; include_range_i overwrites val with
  * Qnil when it meets an equal string */
4436  rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
4437 
4438  return NIL_P(val) ? Qtrue : Qfalse;
4439 }
4440 
4441 static VALUE
4442 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
4443 {
4444  if (rb_reg_search(re, str, 0, 0) >= 0) {
4445  VALUE match = rb_backref_get();
4446  int nth = rb_reg_backref_number(match, backref);
4447  return rb_reg_nth_match(nth, match);
4448  }
4449  return Qnil;
4450 }
4451 
4452 static VALUE
4453 rb_str_aref(VALUE str, VALUE indx)
4454 {
4455  long idx;
4456 
4457  if (FIXNUM_P(indx)) {
4458  idx = FIX2LONG(indx);
4459  }
4460  else if (RB_TYPE_P(indx, T_REGEXP)) {
4461  return rb_str_subpat(str, indx, INT2FIX(0));
4462  }
4463  else if (RB_TYPE_P(indx, T_STRING)) {
4464  if (rb_str_index(str, indx, 0) != -1)
4465  return rb_str_dup(indx);
4466  return Qnil;
4467  }
4468  else {
4469  /* check if indx is Range */
4470  long beg, len = str_strlen(str, NULL);
4471  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4472  case Qfalse:
4473  break;
4474  case Qnil:
4475  return Qnil;
4476  default:
4477  return rb_str_substr(str, beg, len);
4478  }
4479  idx = NUM2LONG(indx);
4480  }
4481 
4482  return str_substr(str, idx, 1, FALSE);
4483 }
4484 
4485 
4486 /*
4487  * call-seq:
4488  * str[index] -> new_str or nil
4489  * str[start, length] -> new_str or nil
4490  * str[range] -> new_str or nil
4491  * str[regexp] -> new_str or nil
4492  * str[regexp, capture] -> new_str or nil
4493  * str[match_str] -> new_str or nil
4494  * str.slice(index) -> new_str or nil
4495  * str.slice(start, length) -> new_str or nil
4496  * str.slice(range) -> new_str or nil
4497  * str.slice(regexp) -> new_str or nil
4498  * str.slice(regexp, capture) -> new_str or nil
4499  * str.slice(match_str) -> new_str or nil
4500  *
4501  * Element Reference --- If passed a single +index+, returns a substring of
4502  * one character at that index. If passed a +start+ index and a +length+,
4503  * returns a substring containing +length+ characters starting at the
4504  * +start+ index. If passed a +range+, its beginning and end are interpreted as
4505  * offsets delimiting the substring to be returned.
4506  *
4507  * In these three cases, if an index is negative, it is counted from the end
4508  * of the string. For the +start+ and +range+ cases the starting index
4509  * is just before a character and an index matching the string's size.
4510  * Additionally, an empty string is returned when the starting index for a
4511  * character range is at the end of the string.
4512  *
4513  * Returns +nil+ if the initial index falls outside the string or the length
4514  * is negative.
4515  *
4516  * If a +Regexp+ is supplied, the matching portion of the string is
4517  * returned. If a +capture+, which may be a capture group index or name,
4518  * follows the regular expression, that component
4519  * of the MatchData is returned instead.
4520  *
4521  * If a +match_str+ is given, that string is returned if it occurs in
4522  * the string.
4523  *
4524  * Returns +nil+ if the regular expression does not match or the match string
4525  * cannot be found.
4526  *
4527  * a = "hello there"
4528  *
4529  * a[1] #=> "e"
4530  * a[2, 3] #=> "llo"
4531  * a[2..3] #=> "ll"
4532  *
4533  * a[-3, 2] #=> "er"
4534  * a[7..-2] #=> "her"
4535  * a[-4..-2] #=> "her"
4536  * a[-2..-4] #=> ""
4537  *
4538  * a[11, 0] #=> ""
4539  * a[11] #=> nil
4540  * a[12, 0] #=> nil
4541  * a[12..-1] #=> nil
4542  *
4543  * a[/[aeiou](.)\1/] #=> "ell"
4544  * a[/[aeiou](.)\1/, 0] #=> "ell"
4545  * a[/[aeiou](.)\1/, 1] #=> "l"
4546  * a[/[aeiou](.)\1/, 2] #=> nil
4547  *
4548  * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] #=> "l"
4549  * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "vowel"] #=> "e"
4550  *
4551  * a["lo"] #=> "lo"
4552  * a["bye"] #=> nil
4553  */
4554 
4555 static VALUE
4556 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
4557 {
4558  if (argc == 2) {
4559  if (RB_TYPE_P(argv[0], T_REGEXP)) {
4560  return rb_str_subpat(str, argv[0], argv[1]);
4561  }
4562  else {
4563  long beg = NUM2LONG(argv[0]);
4564  long len = NUM2LONG(argv[1]);
4565  return rb_str_substr(str, beg, len);
4566  }
4567  }
4568  rb_check_arity(argc, 1, 2);
4569  return rb_str_aref(str, argv[0]);
4570 }
4571 
4572 VALUE
/* NOTE(review): the declarator line (embedded number 4573) is absent from
 * this listing; the body below uses the names str and len. Embedded numbers
 * 4591 and 4596 are missing as well. */
4574 {
4575  char *ptr = RSTRING_PTR(str);
4576  long olen = RSTRING_LEN(str), nlen;
4577 
4578  str_modifiable(str);
 /* clamp to the current length; nlen is what remains after dropping
  * len bytes from the front */
4579  if (len > olen) len = olen;
4580  nlen = olen - len;
4581  if (STR_EMBEDDABLE_P(nlen, TERM_LEN(str))) {
 /* the remainder fits the embedded representation: switch to it and
  * copy the surviving bytes into the embedded buffer */
4582  char *oldptr = ptr;
 /* capture the flags before STR_SET_EMBED so the old state is still known */
4583  int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
4584  STR_SET_EMBED(str);
4585  STR_SET_EMBED_LEN(str, nlen);
4586  ptr = RSTRING(str)->as.ary;
4587  memmove(ptr, oldptr + len, nlen);
 /* free the old buffer only when the flags were exactly STR_NOEMBED,
  * i.e. neither STR_SHARED nor STR_NOFREE was set */
4588  if (fl == STR_NOEMBED) xfree(oldptr);
4589  }
4590  else {
 /* stays on the heap: advance the data pointer past the dropped bytes
  * instead of moving them */
4592  ptr = RSTRING(str)->as.heap.ptr += len;
4593  RSTRING(str)->as.heap.len = nlen;
4594  }
 /* terminate the remaining bytes */
4595  ptr[nlen] = 0;
4597  return str;
4598 }
4599 
/*
 * Replaces the +len+ bytes of +str+ starting at byte offset +beg+ with the
 * bytes of +val+, growing or shrinking +str+ as needed. The offsets are
 * used directly in pointer arithmetic on the string buffer.
 *
 * NOTE(review): embedded numbers 4608 and 4620 are missing from this
 * listing; see the notes at the affected places.
 */
4600 static void
4601 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
4602 {
4603  char *sptr;
4604  long slen, vlen = RSTRING_LEN(val);
4605  int cr;
4606 
4607  if (beg == 0 && vlen == 0) {
 /* NOTE(review): as listed, removing a prefix returns without touching
  * str; the statement on the missing line 4608 presumably did the work -
  * confirm against upstream */
4609  return;
4610  }
4611 
4612  str_modify_keep_cr(str);
4613  RSTRING_GETMEM(str, sptr, slen);
4614  if (len < vlen) {
4615  /* expand string */
4616  RESIZE_CAPA(str, slen + vlen - len);
 /* re-read the data pointer after RESIZE_CAPA */
4617  sptr = RSTRING_PTR(str);
4618  }
4619 
 /* NOTE(review): the condition that selects between the two assignments
  * below (embedded number 4620) is missing from this listing */
4621  cr = rb_enc_str_coderange(val);
4622  else
4623  cr = ENC_CODERANGE_UNKNOWN;
4624 
 /* move the tail that follows the replaced region to its new position */
4625  if (vlen != len) {
4626  memmove(sptr + beg + vlen,
4627  sptr + beg + len,
4628  slen - (beg + len));
4629  }
4630  if (vlen < beg && len < 0) {
4631  MEMZERO(sptr + slen, char, -len);
4632  }
 /* copy the replacement bytes into the gap */
4633  if (vlen > 0) {
4634  memmove(sptr + beg, RSTRING_PTR(val), vlen);
4635  }
4636  slen += vlen - len;
4637  STR_SET_LEN(str, slen);
4638  TERM_FILL(&sptr[slen], TERM_LEN(str));
4639  ENC_CODERANGE_SET(str, cr);
4640 }
4641 
/*
 * Replaces +len+ characters of +str+ starting at index +beg+ with +val+.
 * +beg+ and +len+ are converted to byte offsets below before the splice.
 * A negative +beg+ counts from the end of the string. Raises IndexError
 * for a negative +len+ or a +beg+ outside the string. +val+ is converted
 * with StringValue and must be encoding-compatible with +str+.
 */
4642 void
4643 rb_str_update(VALUE str, long beg, long len, VALUE val)
4644 {
4645  long slen;
4646  char *p, *e;
4647  rb_encoding *enc;
4648  int singlebyte = single_byte_optimizable(str);
4649  int cr;
4650 
4651  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
4652 
4653  StringValue(val);
4654  enc = rb_enc_check(str, val);
4655  slen = str_strlen(str, enc); /* rb_enc_check */
4656 
4657  if (slen < beg) {
4658  out_of_range:
4659  rb_raise(rb_eIndexError, "index %ld out of string", beg);
4660  }
4661  if (beg < 0) {
 /* negative index: count from the end, rejecting anything before the start */
4662  if (beg + slen < 0) {
4663  goto out_of_range;
4664  }
4665  beg += slen;
4666  }
4667  assert(beg >= 0);
4668  assert(beg <= slen);
 /* clamp the length to what is left after beg */
4669  if (len > slen - beg) {
4670  len = slen - beg;
4671  }
4672  str_modify_keep_cr(str);
 /* locate the start (p) and end (e) of the region; a NULL result from
  * str_nth is replaced by the end of the string */
4673  p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
4674  if (!p) p = RSTRING_END(str);
4675  e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
4676  if (!e) e = RSTRING_END(str);
4677  /* error check */
4678  beg = p - RSTRING_PTR(str); /* physical position */
4679  len = e - p; /* physical length */
4680  rb_str_splice_0(str, beg, len, val);
4681  rb_enc_associate(str, enc);
 /* NOTE(review): cr is read below without any visible assignment; the
  * line that sets it (embedded number 4682) is missing from this listing */
4683  if (cr != ENC_CODERANGE_BROKEN)
4684  ENC_CODERANGE_SET(str, cr);
4685 }
4686 
4687 #define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
4688 
4689 static void
4690 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
4691 {
4692  int nth;
4693  VALUE match;
4694  long start, end, len;
4695  rb_encoding *enc;
4696  struct re_registers *regs;
4697 
4698  if (rb_reg_search(re, str, 0, 0) < 0) {
4699  rb_raise(rb_eIndexError, "regexp not matched");
4700  }
4701  match = rb_backref_get();
4702  nth = rb_reg_backref_number(match, backref);
4703  regs = RMATCH_REGS(match);
4704  if (nth >= regs->num_regs) {
4705  out_of_range:
4706  rb_raise(rb_eIndexError, "index %d out of regexp", nth);
4707  }
4708  if (nth < 0) {
4709  if (-nth >= regs->num_regs) {
4710  goto out_of_range;
4711  }
4712  nth += regs->num_regs;
4713  }
4714 
4715  start = BEG(nth);
4716  if (start == -1) {
4717  rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
4718  }
4719  end = END(nth);
4720  len = end - start;
4721  StringValue(val);
4722  enc = rb_enc_check_str(str, val);
4723  rb_str_splice_0(str, start, len, val);
4724  rb_enc_associate(str, enc);
4725 }
4726 
4727 static VALUE
4728 rb_str_aset(VALUE str, VALUE indx, VALUE val)
4729 {
4730  long idx, beg;
4731 
4732  if (FIXNUM_P(indx)) {
4733  idx = FIX2LONG(indx);
4734  num_index:
4735  rb_str_splice(str, idx, 1, val);
4736  return val;
4737  }
4738 
4739  if (SPECIAL_CONST_P(indx)) goto generic;
4740  switch (BUILTIN_TYPE(indx)) {
4741  case T_REGEXP:
4742  rb_str_subpat_set(str, indx, INT2FIX(0), val);
4743  return val;
4744 
4745  case T_STRING:
4746  beg = rb_str_index(str, indx, 0);
4747  if (beg < 0) {
4748  rb_raise(rb_eIndexError, "string not matched");
4749  }
4750  beg = rb_str_sublen(str, beg);
4751  rb_str_splice(str, beg, str_strlen(indx, NULL), val);
4752  return val;
4753 
4754  generic:
4755  default:
4756  /* check if indx is Range */
4757  {
4758  long beg, len;
4759  if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
4760  rb_str_splice(str, beg, len, val);
4761  return val;
4762  }
4763  }
4764  idx = NUM2LONG(indx);
4765  goto num_index;
4766  }
4767 }
4768 
4769 /*
4770  * call-seq:
4771  * str[integer] = new_str
4772  * str[integer, integer] = new_str
4773  * str[range] = aString
4774  * str[regexp] = new_str
4775  * str[regexp, integer] = new_str
4776  * str[regexp, name] = new_str
4777  * str[other_str] = new_str
4778  *
4779  * Element Assignment---Replaces some or all of the content of
4780  * <i>str</i>. The portion of the string affected is determined using
4781  * the same criteria as String#[]. If the replacement string is not
4782  * the same length as the text it is replacing, the string will be
4783  * adjusted accordingly. If the regular expression or string used
4784  * as the index doesn't match a position in the string, IndexError is
4785  * raised. If the regular expression form is used, the optional
4786  * second Integer allows you to specify which portion of the match to
4787  * replace (effectively using the MatchData indexing rules). The forms
4788  * that take an Integer will raise an IndexError if the value is out
4789  * of range; the Range form will raise a RangeError, and the Regexp
4790  * and String will raise an IndexError on negative match.
4791  */
4792 
4793 static VALUE
4794 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
4795 {
4796  if (argc == 3) {
4797  if (RB_TYPE_P(argv[0], T_REGEXP)) {
4798  rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
4799  }
4800  else {
4801  rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
4802  }
4803  return argv[2];
4804  }
4805  rb_check_arity(argc, 2, 3);
4806  return rb_str_aset(str, argv[0], argv[1]);
4807 }
4808 
4809 /*
4810  * call-seq:
4811  * str.insert(index, other_str) -> str
4812  *
4813  * Inserts <i>other_str</i> before the character at the given
4814  * <i>index</i>, modifying <i>str</i>. Negative indices count from the
4815  * end of the string, and insert <em>after</em> the given character.
4816  * The intent is to insert <i>other_str</i> so that it starts at the given
4817  * <i>index</i>.
4818  *
4819  * "abcd".insert(0, 'X') #=> "Xabcd"
4820  * "abcd".insert(3, 'X') #=> "abcXd"
4821  * "abcd".insert(4, 'X') #=> "abcdX"
4822  * "abcd".insert(-3, 'X') #=> "abXcd"
4823  * "abcd".insert(-1, 'X') #=> "abcdX"
4824  */
4825 
4826 static VALUE
4827 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
4828 {
4829  long pos = NUM2LONG(idx);
4830 
4831  if (pos == -1) {
4832  return rb_str_append(str, str2);
4833  }
4834  else if (pos < 0) {
4835  pos++;
4836  }
4837  rb_str_splice(str, pos, 0, str2);
4838  return str;
4839 }
4840 
4841 
4842 /*
4843  * call-seq:
4844  * str.slice!(integer) -> new_str or nil
4845  * str.slice!(integer, integer) -> new_str or nil
4846  * str.slice!(range) -> new_str or nil
4847  * str.slice!(regexp) -> new_str or nil
4848  * str.slice!(other_str) -> new_str or nil
4849  *
4850  * Deletes the specified portion from <i>str</i>, and returns the portion
4851  * deleted.
4852  *
4853  * string = "this is a string"
4854  * string.slice!(2) #=> "i"
4855  * string.slice!(3..6) #=> " is "
4856  * string.slice!(/s.*t/) #=> "sa st"
4857  * string.slice!("r") #=> "r"
4858  * string #=> "thing"
4859  */
4860 
4861 static VALUE
4862 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
4863 {
4864  VALUE result;
4865  VALUE buf[3];
4866  int i;
4867 
4868  rb_check_arity(argc, 1, 2);
4869  for (i=0; i<argc; i++) {
4870  buf[i] = argv[i];
4871  }
4872  str_modify_keep_cr(str);
4873  result = rb_str_aref_m(argc, buf, str);
4874  if (!NIL_P(result)) {
4875  buf[i] = rb_str_new(0,0);
4876  rb_str_aset_m(argc+1, buf, str);
4877  }
4878  return result;
4879 }
4880 
4881 static VALUE
4882 get_pat(VALUE pat)
4883 {
4884  VALUE val;
4885 
4886  if (SPECIAL_CONST_P(pat)) goto to_string;
4887  switch (BUILTIN_TYPE(pat)) {
4888  case T_REGEXP:
4889  return pat;
4890 
4891  case T_STRING:
4892  break;
4893 
4894  default:
4895  to_string:
4896  val = rb_check_string_type(pat);
4897  if (NIL_P(val)) {
4898  Check_Type(pat, T_REGEXP);
4899  }
4900  pat = val;
4901  }
4902 
4903  return rb_reg_regcomp(pat);
4904 }
4905 
/*
 * Normalizes a search pattern without compiling it: a Regexp is returned
 * as is, a String is kept, and any other value is converted with
 * rb_check_string_type (Check_Type(pat, T_REGEXP) is invoked when that
 * conversion yields nil). When +check+ is non-zero the resulting string is
 * additionally tested with is_broken_string.
 */
4906 static VALUE
4907 get_pat_quoted(VALUE pat, int check)
4908 {
4909  VALUE val;
4910 
 /* the builtin type is only inspected for non-special-constant values */
4911  if (SPECIAL_CONST_P(pat)) goto to_string;
4912  switch (BUILTIN_TYPE(pat)) {
4913  case T_REGEXP:
4914  return pat;
4915 
4916  case T_STRING:
4917  break;
4918 
4919  default:
4920  to_string:
4921  val = rb_check_string_type(pat);
4922  if (NIL_P(val)) {
4923  Check_Type(pat, T_REGEXP);
4924  }
4925  pat = val;
4926  }
4927  if (check && is_broken_string(pat)) {
 /* NOTE(review): the body of this branch (embedded number 4928) is
  * missing from this listing; as shown, a broken string has no effect */
4929  }
4930  return pat;
4931 }
4932 
/*
 * Searches +str+ for +pat+ starting at +pos+ and returns the result of the
 * underlying search: rb_strseq_index for a String pattern, rb_reg_search0
 * otherwise. +pat+ is passed to BUILTIN_TYPE directly, so it must not be a
 * special constant.
 */
4933 static long
4934 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
4935 {
4936  if (BUILTIN_TYPE(pat) == T_STRING) {
4937  pos = rb_strseq_index(str, pat, pos, 1);
4938  if (set_backref_str) {
 /* NOTE(review): both branches below are empty in this listing
  * (embedded numbers 4940-4941 and 4944 are missing); presumably they
  * maintained the backref for the found / not-found cases - confirm
  * against upstream */
4939  if (pos >= 0) {
4942  }
4943  else {
4945  }
4946  }
4947  return pos;
4948  }
4949  else {
 /* regexp pattern: set_backref_str is forwarded to rb_reg_search0 */
4950  return rb_reg_search0(pat, str, pos, 0, set_backref_str);
4951  }
4952 }
4953 
4954 
4955 /*
4956  * call-seq:
4957  * str.sub!(pattern, replacement) -> str or nil
4958  * str.sub!(pattern) {|match| block } -> str or nil
4959  *
4960  * Performs the same substitution as String#sub in-place.
4961  *
4962  * Returns +str+ if a substitution was performed or +nil+ if no substitution
4963  * was performed.
4964  */
4965 
static VALUE
rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
{
    /*
     * In-place single substitution. argv[0] is the pattern; argv[1], when
     * present, is either a Hash (looked up by matched text) or a replacement
     * String. With one argument the block result is the replacement.
     * Returns `str` when a match was replaced, Qnil otherwise.
     */
    VALUE pat, repl, hash = Qnil;
    int iter = 0;
    long plen;
    /* One argument is only acceptable when a block supplies the replacement. */
    int min_arity = rb_block_given_p() ? 1 : 2;
    long beg;

    rb_check_arity(argc, min_arity, 2);
    if (argc == 1) {
        iter = 1;
    }
    else {
        repl = argv[1];
        hash = rb_check_hash_type(argv[1]);
        if (NIL_P(hash)) {
            /* Not a Hash: the replacement must be convertible to a String. */
            StringValue(repl);
        }
    }

    pat = get_pat_quoted(argv[0], 1);

    str_modifiable(str);
    beg = rb_pat_search(pat, str, 0, 1);
    if (beg >= 0) {
        rb_encoding *enc;
        int cr = ENC_CODERANGE(str);
        long beg0, end0;
        VALUE match, match0 = Qnil;
        struct re_registers *regs;
        char *p, *rp;
        long len, rlen;

        match = rb_backref_get();
        regs = RMATCH_REGS(match);
        /* [beg0, end0) is the byte range of the whole match. */
        if (RB_TYPE_P(pat, T_STRING)) {
            beg0 = beg;
            end0 = beg0 + RSTRING_LEN(pat);
            match0 = pat;
        }
        else {
            beg0 = BEG(0);
            end0 = END(0);
            if (iter) match0 = rb_reg_nth_match(0, match);
        }

        if (iter || !NIL_P(hash)) {
            /* Snapshot pointer and length so str_mod_check() can compare
             * them after user code (block or hash lookup) has run. */
            p = RSTRING_PTR(str); len = RSTRING_LEN(str);

            if (iter) {
                repl = rb_obj_as_string(rb_yield(match0));
            }
            else {
                repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
                repl = rb_obj_as_string(repl);
            }
            str_mod_check(str, p, len);
            /* NOTE(review): listing line 5024 is missing here; one statement
             * following str_mod_check() was lost from this copy. */
        }
        else {
            /* Plain replacement string: expand it via rb_reg_regsub(); a
             * String pattern passes Qnil instead of a regexp. */
            repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
        }

        enc = rb_enc_compatible(str, repl);
        if (!enc) {
            /* No compatible encoding: still allowed when everything outside
             * the matched range scans as 7-bit, in which case the result
             * takes the replacement's encoding. */
            rb_encoding *str_enc = STR_ENC_GET(str);
            p = RSTRING_PTR(str); len = RSTRING_LEN(str);
            if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
                coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
                rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
                         rb_enc_name(str_enc),
                         rb_enc_name(STR_ENC_GET(repl)));
            }
            enc = STR_ENC_GET(repl);
        }
        rb_str_modify(str);
        rb_enc_associate(str, enc);
        /* Only adjust `cr` when it lies strictly between UNKNOWN and BROKEN;
         * it is applied to str at the end via ENC_CODERANGE_SET(). */
        if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
            int cr2 = ENC_CODERANGE(repl);
            if (cr2 == ENC_CODERANGE_BROKEN ||
                (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
                cr = ENC_CODERANGE_UNKNOWN;
            else
                cr = cr2;
        }
        plen = end0 - beg0;
        rlen = RSTRING_LEN(repl);
        len = RSTRING_LEN(str);
        if (rlen > plen) {
            /* Replacement is longer than the match: grow the buffer first. */
            RESIZE_CAPA(str, len + rlen - plen);
        }
        /* Re-read the pointer after the possible resize above. */
        p = RSTRING_PTR(str);
        if (rlen != plen) {
            /* Shift the tail after the match to its new position. */
            memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
        }
        rp = RSTRING_PTR(repl);
        memmove(p + beg0, rp, rlen);
        len += rlen - plen;
        STR_SET_LEN(str, len);
        /* NOTE(review): listing line 5066 is missing here; one statement
         * between STR_SET_LEN() and ENC_CODERANGE_SET() was lost. */
        ENC_CODERANGE_SET(str, cr);

        return str;
    }
    return Qnil;
}
5073 
5074 
5075 /*
5076  * call-seq:
5077  * str.sub(pattern, replacement) -> new_str
5078  * str.sub(pattern, hash) -> new_str
5079  * str.sub(pattern) {|match| block } -> new_str
5080  *
5081  * Returns a copy of +str+ with the _first_ occurrence of +pattern+
5082  * replaced by the second argument. The +pattern+ is typically a Regexp; if
5083  * given as a String, any regular expression metacharacters it contains will
5084  * be interpreted literally, e.g. <code>\d</code> will match a backslash
5085  * followed by 'd', instead of a digit.
5086  *
5087  * If +replacement+ is a String it will be substituted for the matched text.
5088  * It may contain back-references to the pattern's capture groups of the form
5089  * <code>\d</code>, where <i>d</i> is a group number, or
5090  * <code>\k<n></code>, where <i>n</i> is a group name.
5091  * Similarly, <code>\&</code>, <code>\'</code>, <code>\`</code>, and
5092  * <code>\+</code> correspond to special variables, <code>$&</code>,
5093  * <code>$'</code>, <code>$`</code>, and <code>$+</code>, respectively.
5094  * (See rdoc-ref:regexp.rdoc for details.)
5095  * <code>\0</code> is the same as <code>\&</code>.
5096  * <code>\\\</code> is interpreted as an escape, i.e., a single backslash.
5097  * Note that, within +replacement+ the special match variables, such as
5098  * <code>$&</code>, will not refer to the current match.
5099  *
5100  * If the second argument is a Hash, and the matched text is one of its keys,
5101  * the corresponding value is the replacement string.
5102  *
5103  * In the block form, the current match string is passed in as a parameter,
5104  * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
5105  * <code>$&</code>, and <code>$'</code> will be set appropriately.
5106  * (See rdoc-ref:regexp.rdoc for details.)
5107  * The value returned by the block will be substituted for the match on each
5108  * call.
5109  *
5110  * "hello".sub(/[aeiou]/, '*') #=> "h*llo"
5111  * "hello".sub(/([aeiou])/, '<\1>') #=> "h<e>llo"
5112  * "hello".sub(/./) {|s| s.ord.to_s + ' ' } #=> "104 ello"
5113  * "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*') #=> "h*e*llo"
5114  * 'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
5115  * #=> "Is /bin/bash your preferred shell?"
5116  *
5117  * Note that a string literal consumes backslashes.
5118  * (See rdoc-ref:syntax/literals.rdoc for details about string literals.)
5119  * Back-references are typically preceded by an additional backslash.
5120  * For example, if you want to write a back-reference <code>\&</code> in
5121  * +replacement+ with a double-quoted string literal, you need to write:
5122  * <code>"..\\\\&.."</code>.
5123  * If you want to write a non-back-reference string <code>\&</code> in
5124  * +replacement+, you need first to escape the backslash to prevent
5125  * this method from interpreting it as a back-reference, and then you
5126  * need to escape the backslashes again to prevent a string literal from
5127  * consuming them: <code>"..\\\\\\\\&.."</code>.
5128  * You may want to use the block form to avoid a lot of backslashes.
5129  */
5130 
5131 static VALUE
5132 rb_str_sub(int argc, VALUE *argv, VALUE str)
5133 {
5134  str = rb_str_dup(str);
5135  rb_str_sub_bang(argc, argv, str);
5136  return str;
5137 }
5138 
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    /*
     * Shared implementation behind the gsub entry points in this file.
     * argv[0] is the pattern; the replacement comes from a block (ITER),
     * a Hash in argv[1] (MAP) or a String in argv[1] (STR). The result is
     * assembled in a fresh buffer `dest`. With `bang` non-zero the receiver
     * is replaced by `dest` and Qnil is returned when nothing matched;
     * otherwise a new string is returned.
     */
    VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
    struct re_registers *regs;
    long beg, beg0, end0;
    long offset, blen, slen, len, last;
    enum {STR, ITER, MAP} mode = STR;
    char *sp, *cp;
    /* -1: not yet known; set after the first rb_reg_regsub() call to whether
     * the expansion differed from `repl`. */
    int need_backref = -1;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
        /* NOTE(review): listing line 5153 is missing here; one statement
         * ahead of `mode = ITER` was lost from this copy. */
        mode = ITER;
        break;
      case 2:
        repl = argv[1];
        hash = rb_check_hash_type(argv[1]);
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        else {
            mode = MAP;
        }
        break;
      default:
        rb_error_arity(argc, 1, 2);
    }

    pat = get_pat_quoted(argv[0], 1);
    beg = rb_pat_search(pat, str, 0, need_backref);
    if (beg < 0) {
        if (bang) return Qnil;	/* no match, no substitution */
        return rb_str_dup(str);
    }

    offset = 0;
    blen = RSTRING_LEN(str) + 30; /* len + margin */
    dest = rb_str_buf_new(blen);
    /* sp/slen snapshot the source buffer for str_mod_check() below. */
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);
    /* NOTE(review): listing line 5185 is missing here; one statement after
     * rb_enc_associate() was lost from this copy. */

    do {
        match = rb_backref_get();
        regs = RMATCH_REGS(match);
        /* [beg0, end0) is the byte range of the current match. */
        if (RB_TYPE_P(pat, T_STRING)) {
            beg0 = beg;
            end0 = beg0 + RSTRING_LEN(pat);
            match0 = pat;
        }
        else {
            beg0 = BEG(0);
            end0 = END(0);
            if (mode == ITER) match0 = rb_reg_nth_match(0, match);
        }

        if (mode) {
            /* ITER or MAP: the replacement comes from user code. */
            if (mode == ITER) {
                val = rb_obj_as_string(rb_yield(match0));
            }
            else {
                val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
                val = rb_obj_as_string(val);
            }
            str_mod_check(str, sp, slen);
            if (val == dest) { 	/* paranoid check [ruby-dev:24827] */
                rb_raise(rb_eRuntimeError, "block should not cheat");
            }
        }
        else if (need_backref) {
            val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
            if (need_backref < 0) {
                /* First iteration: if the expansion returned `repl` itself,
                 * later iterations use `repl` verbatim and pass 0 to
                 * rb_pat_search(). */
                need_backref = val != repl;
            }
        }
        else {
            val = repl;
        }

        len = beg0 - offset;	/* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

        /* `last` remembers where this search started; it is used for the
         * final rb_pat_search() call after the loop. */
        last = offset;
        offset = end0;
        if (beg0 == end0) {
            /*
             * Always consume at least one character of the input string
             * in order to prevent infinite loops.
             */
            if (RSTRING_LEN(str) <= end0) break;
            len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
            offset = end0 + len;
        }
        cp = RSTRING_PTR(str) + offset;
        if (offset > RSTRING_LEN(str)) break;
        beg = rb_pat_search(pat, str, offset, need_backref);
    } while (beg >= 0);
    /* Copy whatever follows the final match. */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /* Search once more from `last` with set_backref_str = 1; presumably this
     * leaves the backref describing the final match -- TODO confirm. */
    rb_pat_search(pat, str, last, 1);
    if (bang) {
        str_shared_replace(str, dest);
    }
    else {
        /* NOTE(review): listing line 5255 is missing here; one statement
         * ahead of `str = dest` was lost from this copy. */
        str = dest;
    }

    return str;
}
5261 
5262 
5263 /*
5264  * call-seq:
5265  * str.gsub!(pattern, replacement) -> str or nil
5266  * str.gsub!(pattern, hash) -> str or nil
5267  * str.gsub!(pattern) {|match| block } -> str or nil
5268  * str.gsub!(pattern) -> an_enumerator
5269  *
5270  * Performs the substitutions of String#gsub in place, returning
5271  * <i>str</i>, or <code>nil</code> if no substitutions were
5272  * performed. If no block and no <i>replacement</i> is given, an
5273  * enumerator is returned instead.
5274  */
5275 
5276 static VALUE
5277 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5278 {
5279  str_modify_keep_cr(str);
5280  return str_gsub(argc, argv, str, 1);
5281 }
5282 
5283 
5284 /*
5285  * call-seq:
5286  * str.gsub(pattern, replacement) -> new_str
5287  * str.gsub(pattern, hash) -> new_str
5288  * str.gsub(pattern) {|match| block } -> new_str
5289  * str.gsub(pattern) -> enumerator
5290  *
5291  * Returns a copy of <i>str</i> with <em>all</em> occurrences of
5292  * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
5293  * typically a Regexp; if given as a String, any
5294  * regular expression metacharacters it contains will be interpreted
5295  * literally, e.g. <code>\d</code> will match a backslash followed by 'd',
5296  * instead of a digit.
5297  *
5298  * If +replacement+ is a String it will be substituted for the matched text.
5299  * It may contain back-references to the pattern's capture groups of the form
5300  * <code>\d</code>, where <i>d</i> is a group number, or
5301  * <code>\k<n></code>, where <i>n</i> is a group name.
5302  * Similarly, <code>\&</code>, <code>\'</code>, <code>\`</code>, and
5303  * <code>\+</code> correspond to special variables, <code>$&</code>,
5304  * <code>$'</code>, <code>$`</code>, and <code>$+</code>, respectively.
5305  * (See rdoc-ref:regexp.rdoc for details.)
5306  * <code>\0</code> is the same as <code>\&</code>.
5307  * <code>\\\</code> is interpreted as an escape, i.e., a single backslash.
5308  * Note that, within +replacement+ the special match variables, such as
5309  * <code>$&</code>, will not refer to the current match.
5310  *
5311  * If the second argument is a Hash, and the matched text is one
5312  * of its keys, the corresponding value is the replacement string.
5313  *
5314  * In the block form, the current match string is passed in as a parameter,
5315  * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
5316  * <code>$&</code>, and <code>$'</code> will be set appropriately.
5317  * (See rdoc-ref:regexp.rdoc for details.)
5318  * The value returned by the block will be substituted for the match on each
5319  * call.
5320  *
5321  * When neither a block nor a second argument is supplied, an
5322  * Enumerator is returned.
5323  *
5324  * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*"
5325  * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>"
5326  * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 "
5327  * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}"
5328  * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*"
5329  *
5330  * Note that a string literal consumes backslashes.
5331  * (See rdoc-ref:syntax/literals.rdoc for details on string literals.)
5332  * Back-references are typically preceded by an additional backslash.
5333  * For example, if you want to write a back-reference <code>\&</code> in
5334  * +replacement+ with a double-quoted string literal, you need to write:
5335  * <code>"..\\\\&.."</code>.
5336  * If you want to write a non-back-reference string <code>\&</code> in
5337  * +replacement+, you need first to escape the backslash to prevent
5338  * this method from interpreting it as a back-reference, and then you
5339  * need to escape the backslashes again to prevent a string literal from
5340  * consuming them: <code>"..\\\\\\\\&.."</code>.
5341  * You may want to use the block form to avoid a lot of backslashes.
5342  */
5343 
5344 static VALUE
5345 rb_str_gsub(int argc, VALUE *argv, VALUE str)
5346 {
5347  return str_gsub(argc, argv, str, 0);
5348 }
5349 
5350 
5351 /*
5352  * call-seq:
5353  * str.replace(other_str) -> str
5354  *
5355  * Replaces the contents of <i>str</i> with the corresponding
5356  * values in <i>other_str</i>.
5357  *
5358  * s = "hello" #=> "hello"
5359  * s.replace "world" #=> "world"
5360  */
5361 
5362 VALUE
5364 {
5365  str_modifiable(str);
5366  if (str == str2) return str;
5367 
5368  StringValue(str2);
5369  str_discard(str);
5370  return str_replace(str, str2);
5371 }
5372 
5373 /*
5374  * call-seq:
5375  * string.clear -> string
5376  *
5377  * Makes string empty.
5378  *
5379  * a = "abcde"
5380  * a.clear #=> ""
5381  */
5382 
5383 static VALUE
5384 rb_str_clear(VALUE str)
5385 {
5386  str_discard(str);
5387  STR_SET_EMBED(str);
5388  STR_SET_EMBED_LEN(str, 0);
5389  RSTRING_PTR(str)[0] = 0;
5392  else
5394  return str;
5395 }
5396 
5397 /*
5398  * call-seq:
5399  * string.chr -> string
5400  *
5401  * Returns a one-character string at the beginning of the string.
5402  *
5403  * a = "abcde"
5404  * a.chr #=> "a"
5405  */
5406 
5407 static VALUE
5408 rb_str_chr(VALUE str)
5409 {
5410  return rb_str_substr(str, 0, 1);
5411 }
5412 
5413 /*
5414  * call-seq:
5415  * str.getbyte(index) -> 0 .. 255
5416  *
5417  * Returns the <i>index</i>th byte as an integer, or +nil+ if <i>index</i> is out of range.
5418  */
5419 static VALUE
5420 rb_str_getbyte(VALUE str, VALUE index)
5421 {
5422  long pos = NUM2LONG(index);
5423 
5424  if (pos < 0)
5425  pos += RSTRING_LEN(str);
5426  if (pos < 0 || RSTRING_LEN(str) <= pos)
5427  return Qnil;
5428 
5429  return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
5430 }
5431 
5432 /*
5433  * call-seq:
5434  * str.setbyte(index, integer) -> integer
5435  *
5436  * Sets the <i>index</i>th byte to <i>integer</i>.
5437  */
5438 static VALUE
5439 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
5440 {
5441  long pos = NUM2LONG(index);
5442  long len = RSTRING_LEN(str);
5443  char *head, *left = 0;
5444  unsigned char *ptr;
5445  rb_encoding *enc;
5446  int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
5447 
5448  if (pos < -len || len <= pos)
5449  rb_raise(rb_eIndexError, "index %ld out of string", pos);
5450  if (pos < 0)
5451  pos += len;
5452 
5453  VALUE v = rb_to_int(value);
5454  VALUE w = rb_int_and(v, INT2FIX(0xff));
5455  unsigned char byte = NUM2INT(w) & 0xFF;
5456 
5457  if (!str_independent(str))
5458  str_make_independent(str);
5459  enc = STR_ENC_GET(str);
5460  head = RSTRING_PTR(str);
5461  ptr = (unsigned char *)&head[pos];
5462  if (!STR_EMBED_P(str)) {
5463  cr = ENC_CODERANGE(str);
5464  switch (cr) {
5465  case ENC_CODERANGE_7BIT:
5466  left = (char *)ptr;
5467  *ptr = byte;
5468  if (ISASCII(byte)) goto end;
5469  nlen = rb_enc_precise_mbclen(left, head+len, enc);
5470  if (!MBCLEN_CHARFOUND_P(nlen))
5472  else
5474  goto end;
5475  case ENC_CODERANGE_VALID:
5476  left = rb_enc_left_char_head(head, ptr, head+len, enc);
5477  width = rb_enc_precise_mbclen(left, head+len, enc);
5478  *ptr = byte;
5479  nlen = rb_enc_precise_mbclen(left, head+len, enc);
5480  if (!MBCLEN_CHARFOUND_P(nlen))
5482  else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
5484  goto end;
5485  }
5486  }
5488  *ptr = byte;
5489 
5490  end:
5491  return value;
5492 }
5493 
5494 static VALUE
5495 str_byte_substr(VALUE str, long beg, long len, int empty)
5496 {
5497  char *p, *s = RSTRING_PTR(str);
5498  long n = RSTRING_LEN(str);
5499  VALUE str2;
5500 
5501  if (beg > n || len < 0) return Qnil;
5502  if (beg < 0) {
5503  beg += n;
5504  if (beg < 0) return Qnil;
5505  }
5506  if (len > n - beg)
5507  len = n - beg;
5508  if (len <= 0) {
5509  if (!empty) return Qnil;
5510  len = 0;
5511  p = 0;
5512  }
5513  else
5514  p = s + beg;
5515 
5517  str2 = rb_str_new_frozen(str);
5518  str2 = str_new_shared(rb_obj_class(str2), str2);
5519  RSTRING(str2)->as.heap.ptr += beg;
5520  RSTRING(str2)->as.heap.len = len;
5521  }
5522  else {
5523  str2 = rb_str_new_with_class(str, p, len);
5524  }
5525 
5526  str_enc_copy(str2, str);
5527 
5528  if (RSTRING_LEN(str2) == 0) {
5531  else
5533  }
5534  else {
5535  switch (ENC_CODERANGE(str)) {
5536  case ENC_CODERANGE_7BIT:
5538  break;
5539  default:
5541  break;
5542  }
5543  }
5544 
5545  return str2;
5546 }
5547 
5548 static VALUE
5549 str_byte_aref(VALUE str, VALUE indx)
5550 {
5551  long idx;
5552  if (FIXNUM_P(indx)) {
5553  idx = FIX2LONG(indx);
5554  }
5555  else {
5556  /* check if indx is Range */
5557  long beg, len = RSTRING_LEN(str);
5558 
5559  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5560  case Qfalse:
5561  break;
5562  case Qnil:
5563  return Qnil;
5564  default:
5565  return str_byte_substr(str, beg, len, TRUE);
5566  }
5567 
5568  idx = NUM2LONG(indx);
5569  }
5570  return str_byte_substr(str, idx, 1, FALSE);
5571 }
5572 
5573 /*
5574  * call-seq:
5575  * str.byteslice(integer) -> new_str or nil
5576  * str.byteslice(integer, integer) -> new_str or nil
5577  * str.byteslice(range) -> new_str or nil
5578  *
5579  * Byte Reference---If passed a single Integer, returns a
5580  * substring of one byte at that position. If passed two Integer
5581  * objects, returns a substring starting at the offset given by the first, and
5582  * a length given by the second. If given a Range, a substring containing
5583  * bytes at offsets given by the range is returned. In all three cases, if
5584  * an offset is negative, it is counted from the end of <i>str</i>. Returns
5585  * <code>nil</code> if the initial offset falls outside the string, the length
5586  * is negative, or the beginning of the range is greater than the end.
5587  * The resulting string keeps the encoding of the original string.
5588  *
5589  * "hello".byteslice(1) #=> "e"
5590  * "hello".byteslice(-1) #=> "o"
5591  * "hello".byteslice(1, 2) #=> "el"
5592  * "\x80\u3042".byteslice(1, 3) #=> "\u3042"
5593  * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
5594  */
5595 
5596 static VALUE
5597 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
5598 {
5599  if (argc == 2) {
5600  long beg = NUM2LONG(argv[0]);
5601  long end = NUM2LONG(argv[1]);
5602  return str_byte_substr(str, beg, end, TRUE);
5603  }
5604  rb_check_arity(argc, 1, 2);
5605  return str_byte_aref(str, argv[0]);
5606 }
5607 
5608 /*
5609  * call-seq:
5610  * str.reverse -> new_str
5611  *
5612  * Returns a new string with the characters from <i>str</i> in reverse order.
5613  *
5614  * "stressed".reverse #=> "desserts"
5615  */
5616 
5617 static VALUE
5618 rb_str_reverse(VALUE str)
5619 {
5620  rb_encoding *enc;
5621  VALUE rev;
5622  char *s, *e, *p;
5623  int cr;
5624 
5625  if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
5626  enc = STR_ENC_GET(str);
5628  s = RSTRING_PTR(str); e = RSTRING_END(str);
5629  p = RSTRING_END(rev);
5630  cr = ENC_CODERANGE(str);
5631 
5632  if (RSTRING_LEN(str) > 1) {
5633  if (single_byte_optimizable(str)) {
5634  while (s < e) {
5635  *--p = *s++;
5636  }
5637  }
5638  else if (cr == ENC_CODERANGE_VALID) {
5639  while (s < e) {
5640  int clen = rb_enc_fast_mbclen(s, e, enc);
5641 
5642  p -= clen;
5643  memcpy(p, s, clen);
5644  s += clen;
5645  }
5646  }
5647  else {
5648  cr = rb_enc_asciicompat(enc) ?
5650  while (s < e) {
5651  int clen = rb_enc_mbclen(s, e, enc);
5652 
5653  if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
5654  p -= clen;
5655  memcpy(p, s, clen);
5656  s += clen;
5657  }
5658  }
5659  }
5660  STR_SET_LEN(rev, RSTRING_LEN(str));
5661  str_enc_copy(rev, str);
5662  ENC_CODERANGE_SET(rev, cr);
5663 
5664  return rev;
5665 }
5666 
5667 
5668 /*
5669  * call-seq:
5670  * str.reverse! -> str
5671  *
5672  * Reverses <i>str</i> in place.
5673  */
5674 
5675 static VALUE
5676 rb_str_reverse_bang(VALUE str)
5677 {
5678  if (RSTRING_LEN(str) > 1) {
5679  if (single_byte_optimizable(str)) {
5680  char *s, *e, c;
5681 
5682  str_modify_keep_cr(str);
5683  s = RSTRING_PTR(str);
5684  e = RSTRING_END(str) - 1;
5685  while (s < e) {
5686  c = *s;
5687  *s++ = *e;
5688  *e-- = c;
5689  }
5690  }
5691  else {
5692  str_shared_replace(str, rb_str_reverse(str));
5693  }
5694  }
5695  else {
5696  str_modify_keep_cr(str);
5697  }
5698  return str;
5699 }
5700 
5701 
5702 /*
5703  * call-seq:
5704  * str.include? other_str -> true or false
5705  *
5706  * Returns <code>true</code> if <i>str</i> contains the given string or
5707  * character.
5708  *
5709  * "hello".include? "lo" #=> true
5710  * "hello".include? "ol" #=> false
5711  * "hello".include? ?h #=> true
5712  */
5713 
5714 static VALUE
5715 rb_str_include(VALUE str, VALUE arg)
5716 {
5717  long i;
5718 
5719  StringValue(arg);
5720  i = rb_str_index(str, arg, 0);
5721 
5722  if (i == -1) return Qfalse;
5723  return Qtrue;
5724 }
5725 
5726 
5727 /*
5728  * call-seq:
5729  * str.to_i(base=10) -> integer
5730  *
5731  * Returns the result of interpreting leading characters in <i>str</i> as an
5732  * integer base <i>base</i> (between 2 and 36). Extraneous characters past the
5733  * end of a valid number are ignored. If there is not a valid number at the
5734  * start of <i>str</i>, <code>0</code> is returned. This method never raises an
5735  * exception when <i>base</i> is valid.
5736  *
5737  * "12345".to_i #=> 12345
5738  * "99 red balloons".to_i #=> 99
5739  * "0a".to_i #=> 0
5740  * "0a".to_i(16) #=> 10
5741  * "hello".to_i #=> 0
5742  * "1100101".to_i(2) #=> 101
5743  * "1100101".to_i(8) #=> 294977
5744  * "1100101".to_i(10) #=> 1100101
5745  * "1100101".to_i(16) #=> 17826049
5746  */
5747 
5748 static VALUE
5749 rb_str_to_i(int argc, VALUE *argv, VALUE str)
5750 {
5751  int base = 10;
5752 
5753  if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
5754  rb_raise(rb_eArgError, "invalid radix %d", base);
5755  }
5756  return rb_str_to_inum(str, base, FALSE);
5757 }
5758 
5759 
5760 /*
5761  * call-seq:
5762  * str.to_f -> float
5763  *
5764  * Returns the result of interpreting leading characters in <i>str</i> as a
5765  * floating point number. Extraneous characters past the end of a valid number
5766  * are ignored. If there is not a valid number at the start of <i>str</i>,
5767  * <code>0.0</code> is returned. This method never raises an exception.
5768  *
5769  * "123.45e1".to_f #=> 1234.5
5770  * "45.67 degrees".to_f #=> 45.67
5771  * "thx1138".to_f #=> 0.0
5772  */
5773 
5774 static VALUE
5775 rb_str_to_f(VALUE str)
5776 {
5777  return DBL2NUM(rb_str_to_dbl(str, FALSE));
5778 }
5779 
5780 
5781 /*
5782  * call-seq:
5783  * str.to_s -> str
5784  * str.to_str -> str
5785  *
5786  * Returns +self+.
5787  *
5788  * If called on a subclass of String, converts the receiver to a String object.
5789  */
5790 
5791 static VALUE
5792 rb_str_to_s(VALUE str)
5793 {
5794  if (rb_obj_class(str) != rb_cString) {
5795  return str_duplicate(rb_cString, str);
5796  }
5797  return str;
5798 }
5799 
#if 0
/* Compiled out: encodes code point c in enc and appends the bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
5811 
5812 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
5813 
5814 int
5815 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
5816 {
5817  char buf[CHAR_ESC_LEN + 1];
5818  int l;
5819 
5820 #if SIZEOF_INT > 4
5821  c &= 0xffffffff;
5822 #endif
5823  if (unicode_p) {
5824  if (c < 0x7F && ISPRINT(c)) {
5825  snprintf(buf, CHAR_ESC_LEN, "%c", c);
5826  }
5827  else if (c < 0x10000) {
5828  snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
5829  }
5830  else {
5831  snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
5832  }
5833  }
5834  else {
5835  if (c < 0x100) {
5836  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
5837  }
5838  else {
5839  snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
5840  }
5841  }
5842  l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
5843  rb_str_buf_cat(result, buf, l);
5844  return l;
5845 }
5846 
5847 const char *
5849 {
5850  switch (c) {
5851  case '\0': return "\\0";
5852  case '\n': return "\\n";
5853  case '\r': return "\\r";
5854  case '\t': return "\\t";
5855  case '\f': return "\\f";
5856  case '\013': return "\\v";
5857  case '\010': return "\\b";
5858  case '\007': return "\\a";
5859  case '\033': return "\\e";
5860  case '\x7f': return "\\c?";
5861  }
5862  return NULL;
5863 }
5864 
5865 VALUE
5867 {
5868  int encidx = ENCODING_GET(str);
5869  rb_encoding *enc = rb_enc_from_index(encidx);
5870  const char *p = RSTRING_PTR(str);
5871  const char *pend = RSTRING_END(str);
5872  const char *prev = p;
5873  char buf[CHAR_ESC_LEN + 1];
5874  VALUE result = rb_str_buf_new(0);
5875  int unicode_p = rb_enc_unicode_p(enc);
5876  int asciicompat = rb_enc_asciicompat(enc);
5877 
5878  while (p < pend) {
5879  unsigned int c;
5880  const char *cc;
5881  int n = rb_enc_precise_mbclen(p, pend, enc);
5882  if (!MBCLEN_CHARFOUND_P(n)) {
5883  if (p > prev) str_buf_cat(result, prev, p - prev);
5884  n = rb_enc_mbminlen(enc);
5885  if (pend < p + n)
5886  n = (int)(pend - p);
5887  while (n--) {
5888  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
5889  str_buf_cat(result, buf, strlen(buf));
5890  prev = ++p;
5891  }
5892  continue;
5893  }
5895  c = rb_enc_mbc_to_codepoint(p, pend, enc);
5896  p += n;
5897  cc = ruby_escaped_char(c);
5898  if (cc) {
5899  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5900  str_buf_cat(result, cc, strlen(cc));
5901  prev = p;
5902  }
5903  else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
5904  }
5905  else {
5906  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5907  rb_str_buf_cat_escaped_char(result, c, unicode_p);
5908  prev = p;
5909  }
5910  }
5911  if (p > prev) str_buf_cat(result, prev, p - prev);
5913 
5914  return result;
5915 }
5916 
5917 /*
5918  * call-seq:
5919  * str.inspect -> string
5920  *
5921  * Returns a printable version of _str_, surrounded by quote marks,
5922  * with special characters escaped.
5923  *
5924  * str = "hello"
5925  * str[3] = "\b"
5926  * str.inspect #=> "\"hel\\bo\""
5927  */
5928 
5929 VALUE
5931 {
5932  int encidx = ENCODING_GET(str);
5933  rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
5934  const char *p, *pend, *prev;
5935  char buf[CHAR_ESC_LEN + 1];
5936  VALUE result = rb_str_buf_new(0);
5938  int unicode_p = rb_enc_unicode_p(enc);
5939  int asciicompat = rb_enc_asciicompat(enc);
5940 
5941  if (resenc == NULL) resenc = rb_default_external_encoding();
5942  if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
5943  rb_enc_associate(result, resenc);
5944  str_buf_cat2(result, "\"");
5945 
5946  p = RSTRING_PTR(str); pend = RSTRING_END(str);
5947  prev = p;
5948  actenc = get_actual_encoding(encidx, str);
5949  if (actenc != enc) {
5950  enc = actenc;
5951  if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
5952  }
5953  while (p < pend) {
5954  unsigned int c, cc;
5955  int n;
5956 
5957  n = rb_enc_precise_mbclen(p, pend, enc);
5958  if (!MBCLEN_CHARFOUND_P(n)) {
5959  if (p > prev) str_buf_cat(result, prev, p - prev);
5960  n = rb_enc_mbminlen(enc);
5961  if (pend < p + n)
5962  n = (int)(pend - p);
5963  while (n--) {
5964  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
5965  str_buf_cat(result, buf, strlen(buf));
5966  prev = ++p;
5967  }
5968  continue;
5969  }
5971  c = rb_enc_mbc_to_codepoint(p, pend, enc);
5972  p += n;
5973  if ((asciicompat || unicode_p) &&
5974  (c == '"'|| c == '\\' ||
5975  (c == '#' &&
5976  p < pend &&
5978  (cc = rb_enc_codepoint(p,pend,enc),
5979  (cc == '$' || cc == '@' || cc == '{'))))) {
5980  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5981  str_buf_cat2(result, "\\");
5982  if (asciicompat || enc == resenc) {
5983  prev = p - n;
5984  continue;
5985  }
5986  }
5987  switch (c) {
5988  case '\n': cc = 'n'; break;
5989  case '\r': cc = 'r'; break;
5990  case '\t': cc = 't'; break;
5991  case '\f': cc = 'f'; break;
5992  case '\013': cc = 'v'; break;
5993  case '\010': cc = 'b'; break;
5994  case '\007': cc = 'a'; break;
5995  case 033: cc = 'e'; break;
5996  default: cc = 0; break;
5997  }
5998  if (cc) {
5999  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6000  buf[0] = '\\';
6001  buf[1] = (char)cc;
6002  str_buf_cat(result, buf, 2);
6003  prev = p;
6004  continue;
6005  }
6006  if ((enc == resenc && rb_enc_isprint(c, enc)) ||
6007  (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6008  continue;
6009  }
6010  else {
6011  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6012  rb_str_buf_cat_escaped_char(result, c, unicode_p);
6013  prev = p;
6014  continue;
6015  }
6016  }
6017  if (p > prev) str_buf_cat(result, prev, p - prev);
6018  str_buf_cat2(result, "\"");
6019 
6020  return result;
6021 }
6022 
6023 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6024 
6025 /*
6026  * call-seq:
6027  * str.dump -> new_str
6028  *
6029  * Returns a quoted version of the string with all non-printing characters
6030  * replaced by <code>\xHH</code> notation and all special characters escaped.
6031  *
6032  * This method can be used for round-trip: if the resulting +new_str+ is
6033  * eval'ed, it will produce the original string.
6034  *
6035  * "hello \n ''".dump #=> "\"hello \\n ''\""
6036  * "\f\x00\xff\\\"".dump #=> "\"\\f\\x00\\xFF\\\\\\\"\""
6037  *
6038  * See also String#undump.
6039  */
6040 
/*
 * Build an escaped, double-quoted copy of str in two passes: the first pass
 * computes the exact output length, the second writes the escaped bytes.
 *
 * NOTE(review): this listing skips its own line 6042, where the function
 * name and parameter list belong. The body reads a parameter named str and
 * the preceding doc comment describes String#dump; recover the dropped line
 * from the real source before building.
 */
6041 VALUE
6043 {
6044  int encidx = rb_enc_get_index(str);
6045  rb_encoding *enc = rb_enc_from_index(encidx);
6046  long len;
6047  const char *p, *pend;
6048  char *q, *qend;
6049  VALUE result;
6050  int u8 = (encidx == rb_utf8_encindex());
6051  static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6052 
/* Pass 1: size the result. Start with the two surrounding quote bytes. */
6053  len = 2; /* "" */
6054  if (!rb_enc_asciicompat(enc)) {
/* Reserve room for the suffix with the encoding name substituted for %s. */
6055  len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6056  len += strlen(enc->name);
6057  }
6058 
6059  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6060  while (p < pend) {
6061  int clen;
6062  unsigned char c = *p++;
6063 
6064  switch (c) {
/* Bytes that have a two-character backslash form. */
6065  case '"': case '\\':
6066  case '\n': case '\r':
6067  case '\t': case '\f':
6068  case '\013': case '\010': case '\007': case '\033':
6069  clen = 2;
6070  break;
6071 
/* A hash sign costs an extra backslash only when IS_EVSTR says so. */
6072  case '#':
6073  clen = IS_EVSTR(p, pend) ? 2 : 1;
6074  break;
6075 
6076  default:
6077  if (ISPRINT(c)) {
6078  clen = 1;
6079  }
6080  else {
6081  if (u8 && c > 0x7F) { /* \u notation */
6082  int n = rb_enc_precise_mbclen(p-1, pend, enc);
6083  if (MBCLEN_CHARFOUND_P(n)) {
6084  unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6085  if (cc <= 0xFFFF)
6086  clen = 6; /* \uXXXX */
6087  else if (cc <= 0xFFFFF)
6088  clen = 9; /* \u{XXXXX} */
6089  else
6090  clen = 10; /* \u{XXXXXX} */
/* Skip the remaining bytes of the character; this break leaves the switch. */
6091  p += MBCLEN_CHARFOUND_LEN(n)-1;
6092  break;
6093  }
6094  }
6095  clen = 4; /* \xNN */
6096  }
6097  break;
6098  }
6099 
/* Refuse to let the running length overflow a long. */
6100  if (clen > LONG_MAX - len) {
6101  rb_raise(rb_eRuntimeError, "string size too big");
6102  }
6103  len += clen;
6104  }
6105 
/* Pass 2: allocate len bytes and emit the escaped form. */
6106  result = rb_str_new_with_class(str, 0, len);
6107  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
/* qend is one past the terminator slot; it bounds the snprintf calls below. */
6108  q = RSTRING_PTR(result); qend = q + len + 1;
6109 
6110  *q++ = '"';
6111  while (p < pend) {
6112  unsigned char c = *p++;
6113 
6114  if (c == '"' || c == '\\') {
6115  *q++ = '\\';
6116  *q++ = c;
6117  }
6118  else if (c == '#') {
6119  if (IS_EVSTR(p, pend)) *q++ = '\\';
6120  *q++ = '#';
6121  }
6122  else if (c == '\n') {
6123  *q++ = '\\';
6124  *q++ = 'n';
6125  }
6126  else if (c == '\r') {
6127  *q++ = '\\';
6128  *q++ = 'r';
6129  }
6130  else if (c == '\t') {
6131  *q++ = '\\';
6132  *q++ = 't';
6133  }
6134  else if (c == '\f') {
6135  *q++ = '\\';
6136  *q++ = 'f';
6137  }
6138  else if (c == '\013') {
6139  *q++ = '\\';
6140  *q++ = 'v';
6141  }
6142  else if (c == '\010') {
6143  *q++ = '\\';
6144  *q++ = 'b';
6145  }
6146  else if (c == '\007') {
6147  *q++ = '\\';
6148  *q++ = 'a';
6149  }
6150  else if (c == '\033') {
6151  *q++ = '\\';
6152  *q++ = 'e';
6153  }
6154  else if (ISPRINT(c)) {
6155  *q++ = c;
6156  }
6157  else {
6158  *q++ = '\\';
6159  if (u8) {
/* NOTE(review): unlike pass 1, the macro below is applied to the length
 * after subtracting 1, and there is no c > 0x7F guard. Verify that both
 * passes classify every byte identically, since pass 1 fixed the size. */
6160  int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6161  if (MBCLEN_CHARFOUND_P(n)) {
6162  int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6163  p += n;
6164  if (cc <= 0xFFFF)
6165  snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
6166  else
6167  snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
6168  q += strlen(q);
6169  continue;
6170  }
6171  }
/* Fallback: one byte as x plus two hex digits (the backslash is already out). */
6172  snprintf(q, qend-q, "x%02X", c);
6173  q += 3;
6174  }
6175  }
6176  *q++ = '"';
6177  *q = '\0';
6178  if (!rb_enc_asciicompat(enc)) {
/* Append the force_encoding suffix and tag the result as ASCII-8BIT. */
6179  snprintf(q, qend-q, nonascii_suffix, enc->name);
6180  encidx = rb_ascii8bit_encindex();
6181  }
6182  /* result from dump is ASCII */
6183  rb_enc_associate_index(result, encidx);
/* NOTE(review): the listing skips line 6184 here; one statement between the
 * encoding association and the return is not visible. */
6185  return result;
6186 }
6187 
6188 static int
6189 unescape_ascii(unsigned int c)
6190 {
6191  switch (c) {
6192  case 'n':
6193  return '\n';
6194  case 'r':
6195  return '\r';
6196  case 't':
6197  return '\t';
6198  case 'f':
6199  return '\f';
6200  case 'v':
6201  return '\13';
6202  case 'b':
6203  return '\010';
6204  case 'a':
6205  return '\007';
6206  case 'e':
6207  return 033;
6208  default:
6209  UNREACHABLE;
6210  }
6211 }
6212 
6213 static void
6214 undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
6215 {
6216  const char *s = *ss;
6217  unsigned int c;
6218  int codelen;
6219  size_t hexlen;
6220  unsigned char buf[6];
6221  static rb_encoding *enc_utf8 = NULL;
6222 
6223  switch (*s) {
6224  case '\\':
6225  case '"':
6226  case '#':
6227  rb_str_cat(undumped, s, 1); /* cat itself */
6228  s++;
6229  break;
6230  case 'n':
6231  case 'r':
6232  case 't':
6233  case 'f':
6234  case 'v':
6235  case 'b':
6236  case 'a':
6237  case 'e':
6238  *buf = unescape_ascii(*s);
6239  rb_str_cat(undumped, (char *)buf, 1);
6240  s++;
6241  break;
6242  case 'u':
6243  if (*binary) {
6244  rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6245  }
6246  *utf8 = true;
6247  if (++s >= s_end) {
6248  rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6249  }
6250  if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
6251  if (*penc != enc_utf8) {
6252  *penc = enc_utf8;
6253  rb_enc_associate(undumped, enc_utf8);
6254  }
6255  if (*s == '{') { /* handle \u{...} form */
6256  s++;
6257  for (;;) {
6258  if (s >= s_end) {
6259  rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
6260  }
6261  if (*s == '}') {
6262  s++;
6263  break;
6264  }
6265  if (ISSPACE(*s)) {
6266  s++;
6267  continue;
6268  }
6269  c = scan_hex(s, s_end-s, &hexlen);
6270  if (hexlen == 0 || hexlen > 6) {
6271  rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6272  }
6273  if (c > 0x10ffff) {
6274  rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
6275  }
6276  if (0xd800 <= c && c <= 0xdfff) {
6277  rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6278  }
6279  codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6280  rb_str_cat(undumped, (char *)buf, codelen);
6281  s += hexlen;
6282  }
6283  }
6284  else { /* handle \uXXXX form */
6285  c = scan_hex(s, 4, &hexlen);
6286  if (hexlen != 4) {
6287  rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6288  }
6289  if (0xd800 <= c && c <= 0xdfff) {
6290  rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6291  }
6292  codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6293  rb_str_cat(undumped, (char *)buf, codelen);
6294  s += hexlen;
6295  }
6296  break;
6297  case 'x':
6298  if (*utf8) {
6299  rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6300  }
6301  *binary = true;
6302  if (++s >= s_end) {
6303  rb_raise(rb_eRuntimeError, "invalid hex escape");
6304  }
6305  *buf = scan_hex(s, 2, &hexlen);
6306  if (hexlen != 2) {
6307  rb_raise(rb_eRuntimeError, "invalid hex escape");
6308  }
6309  rb_str_cat(undumped, (char *)buf, 1);
6310  s += hexlen;
6311  break;
6312  default:
6313  rb_str_cat(undumped, s-1, 2);
6314  s++;
6315  }
6316 
6317  *ss = s;
6318 }
6319 
6320 static VALUE rb_str_is_ascii_only_p(VALUE str);
6321 
6322 /*
6323  * call-seq:
6324  * str.undump -> new_str
6325  *
6326  * Returns an unescaped version of the string.
6327  * This does the inverse of String#dump.
6328  *
6329  * "\"hello \\n ''\"".undump #=> "hello \n ''"
6330  */
6331 
/*
 * Parse a string in the double-quoted form (optionally followed by a
 * force_encoding suffix) and return a new string with the escapes decoded.
 * Raises RuntimeError when the input contains non-ASCII characters or a
 * null byte, is not wrapped in double quotes, is unterminated, or names an
 * unknown encoding.
 */
6332 static VALUE
6333 str_undump(VALUE str)
6334 {
6335  const char *s = RSTRING_PTR(str);
6336  const char *s_end = RSTRING_END(str);
6337  rb_encoding *enc = rb_enc_get(str);
6338  VALUE undumped = rb_enc_str_new(s, 0L, enc);
6339  bool utf8 = false;
6340  bool binary = false;
6341  int w;
6342 
/* NOTE(review): the listing skips line 6343 here; one statement ahead of
 * the validation checks is not visible. */
6344  if (rb_str_is_ascii_only_p(str) == Qfalse) {
6345  rb_raise(rb_eRuntimeError, "non-ASCII character detected");
6346  }
6347  if (!str_null_check(str, &w)) {
6348  rb_raise(rb_eRuntimeError, "string contains null byte");
6349  }
/* The shortest valid input is an opening and a closing quote. */
6350  if (RSTRING_LEN(str) < 2) goto invalid_format;
6351  if (*s != '"') goto invalid_format;
6352 
6353  /* strip '"' at the start */
6354  s++;
6355 
6356  for (;;) {
6357  if (s >= s_end) {
6358  rb_raise(rb_eRuntimeError, "unterminated dumped string");
6359  }
6360 
6361  if (*s == '"') {
6362  /* epilogue */
6363  s++;
6364  if (s == s_end) {
6365  /* ascii compatible dumped string */
6366  break;
6367  }
6368  else {
6369  static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
6370  static const char dup_suffix[] = ".dup";
6371  const char *encname;
6372  int encidx;
6373  ptrdiff_t size;
6374 
6375  /* check separately for strings dumped by older versions */
6376  size = sizeof(dup_suffix) - 1;
6377  if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
6378 
6379  size = sizeof(force_encoding_suffix) - 1;
6380  if (s_end - s <= size) goto invalid_format;
6381  if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
6382  s += size;
6383 
6384  if (utf8) {
6385  rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
6386  }
6387 
/* The encoding name runs up to the next double quote. */
6388  encname = s;
6389  s = memchr(s, '"', s_end-s);
/* NOTE(review): size is computed from s before the NULL check on the next
 * line; when memchr finds nothing this subtracts from a null pointer.
 * Consider testing s first. */
6390  size = s - encname;
6391  if (!s) goto invalid_format;
/* Exactly a quote and a closing parenthesis must remain. */
6392  if (s_end - s != 2) goto invalid_format;
6393  if (s[0] != '"' || s[1] != ')') goto invalid_format;
6394 
6395  encidx = rb_enc_find_index2(encname, (long)size);
6396  if (encidx < 0) {
6397  rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
6398  }
6399  rb_enc_associate_index(undumped, encidx);
6400  }
6401  break;
6402  }
6403 
6404  if (*s == '\\') {
6405  s++;
6406  if (s >= s_end) {
6407  rb_raise(rb_eRuntimeError, "invalid escape");
6408  }
/* The helper appends the decoded bytes and advances s past the escape. */
6409  undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
6410  }
6411  else {
/* Ordinary byte: copy it through unchanged. */
6412  rb_str_cat(undumped, s++, 1);
6413  }
6414  }
6415 
6416  return undumped;
6417 invalid_format:
6418  rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6419 }
6420 
6421 static void
6422 rb_str_check_dummy_enc(rb_encoding *enc)
6423 {
6424  if (rb_enc_dummy_p(enc)) {
6425  rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
6426  rb_enc_name(enc));
6427  }
6428 }
6429 
6430 static rb_encoding *
6431 str_true_enc(VALUE str)
6432 {
6433  rb_encoding *enc = STR_ENC_GET(str);
6434  rb_str_check_dummy_enc(enc);
6435  return enc;
6436 }
6437 
/*
 * Validate the option symbols passed to a case-mapping method and fold them
 * into flags, which is returned. With no options flags is returned as is.
 * Raises ArgumentError for more than two options, for an invalid second
 * option, or for an unknown option.
 *
 * NOTE(review): this listing skips its own lines 6446, 6449, 6455, 6458,
 * 6468 and 6469. As shown, several branches have no body and the :fold
 * branch has an else without a visible if, so the flag updates for
 * :turkic, :lithuanian and :fold cannot be read from here. Recover the
 * dropped lines from the real source.
 */
6438 static OnigCaseFoldType
6439 check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
6440 {
6441  if (argc==0)
6442  return flags;
6443  if (argc>2)
6444  rb_raise(rb_eArgError, "too many options");
6445  if (argv[0]==sym_turkic) {
/* (listing line 6446 not visible) */
6447  if (argc==2) {
6448  if (argv[1]==sym_lithuanian)
/* (listing line 6449 not visible) */
6450  else
6451  rb_raise(rb_eArgError, "invalid second option");
6452  }
6453  }
6454  else if (argv[0]==sym_lithuanian) {
/* (listing line 6455 not visible) */
6456  if (argc==2) {
6457  if (argv[1]==sym_turkic)
/* (listing line 6458 not visible) */
6459  else
6460  rb_raise(rb_eArgError, "invalid second option");
6461  }
6462  }
/* Every remaining option must be used alone. */
6463  else if (argc>1)
6464  rb_raise(rb_eArgError, "too many options");
6465  else if (argv[0]==sym_ascii)
6466  flags |= ONIGENC_CASE_ASCII_ONLY;
6467  else if (argv[0]==sym_fold) {
/* (listing lines 6468 and 6469 not visible) */
6470  else
6471  rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
6472  }
6473  else
6474  rb_raise(rb_eArgError, "invalid option");
6475  return flags;
6476 }
6477 
/*
 * Decide whether the byte-at-a-time helpers (upcase_single and
 * downcase_single) may be used for str. Returns true when the ASCII-only
 * flag is set and the encoding is UTF-8 or has a maximum character length
 * of one byte.
 *
 * NOTE(review): the listing skips line 6483, so the return value for every
 * other case is not visible here.
 */
6478 static inline bool
6479 case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
6480 {
6481  if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
6482  return true;
6484 }
6485 
6486 /* Slack added to every case-mapping buffer; 20 bytes should be long enough to absorb any kind of single character length increase */
6487 #define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* When nonzero, rb_str_casemap and rb_str_ascii_casemap print tuning
 * diagnostics to stderr. Defaults to 0 unless defined by the build. */
6488 #ifndef CASEMAP_DEBUG
6489 # define CASEMAP_DEBUG 0
6490 #endif
6491 
/*
 * One chunk of case-mapping output. rb_str_casemap chains chunks through a
 * next pointer and writes mapped bytes into a trailing space member.
 */
6492 struct mapping_buffer;
6493 typedef struct mapping_buffer {
/* Number of bytes available in the trailing space area. */
6494  size_t capa;
/* Number of bytes of space actually filled by the case mapper. */
6495  size_t used;
/* NOTE(review): the listing skips lines 6496 to 6498. The next and space
 * members used by rb_str_casemap, and the closing of the typedef, are not
 * visible here. */
6499 
6500 static void
6501 mapping_buffer_free(void *p)
6502 {
6503  mapping_buffer *previous_buffer;
6504  mapping_buffer *current_buffer = p;
6505  while (current_buffer) {
6506  previous_buffer = current_buffer;
6507  current_buffer = current_buffer->next;
6508  ruby_sized_xfree(previous_buffer, previous_buffer->capa);
6509  }
6510 }
6511 
6512 static const rb_data_type_t mapping_buffer_type = {
6513  "mapping_buffer",
6514  {0, mapping_buffer_free,}
6515 };
6516 
6517 static VALUE
6518 rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
6519 {
6520  VALUE target;
6521 
6522  const OnigUChar *source_current, *source_end;
6523  int target_length = 0;
6524  VALUE buffer_anchor;
6525  mapping_buffer *current_buffer = 0;
6526  mapping_buffer **pre_buffer;
6527  size_t buffer_count = 0;
6528  int buffer_length_or_invalid;
6529 
6530  if (RSTRING_LEN(source) == 0) return rb_str_dup(source);
6531 
6532  source_current = (OnigUChar*)RSTRING_PTR(source);
6533  source_end = (OnigUChar*)RSTRING_END(source);
6534 
6535  buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
6536  pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
6537  while (source_current < source_end) {
6538  /* increase multiplier using buffer count to converge quickly */
6539  size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
6540  if (CASEMAP_DEBUG) {
6541  fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
6542  }
6543  current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
6544  *pre_buffer = current_buffer;
6545  pre_buffer = &current_buffer->next;
6546  current_buffer->next = NULL;
6547  current_buffer->capa = capa;
6548  buffer_length_or_invalid = enc->case_map(flags,
6549  (const OnigUChar**)&source_current, source_end,
6550  current_buffer->space,
6551  current_buffer->space+current_buffer->capa,
6552  enc);
6553  if (buffer_length_or_invalid < 0) {
6554  current_buffer = DATA_PTR(buffer_anchor);
6555  DATA_PTR(buffer_anchor) = 0;
6556  mapping_buffer_free(current_buffer);
6557  rb_raise(rb_eArgError, "input string invalid");
6558  }
6559  target_length += current_buffer->used = buffer_length_or_invalid;
6560  }
6561  if (CASEMAP_DEBUG) {
6562  fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
6563  }
6564 
6565  if (buffer_count==1) {
6566  target = rb_str_new_with_class(source, (const char*)current_buffer->space, target_length);
6567  }
6568  else {
6569  char *target_current;
6570 
6571  target = rb_str_new_with_class(source, 0, target_length);
6572  target_current = RSTRING_PTR(target);
6573  current_buffer = DATA_PTR(buffer_anchor);
6574  while (current_buffer) {
6575  memcpy(target_current, current_buffer->space, current_buffer->used);
6576  target_current += current_buffer->used;
6577  current_buffer = current_buffer->next;
6578  }
6579  }
6580  current_buffer = DATA_PTR(buffer_anchor);
6581  DATA_PTR(buffer_anchor) = 0;
6582  mapping_buffer_free(current_buffer);
6583 
6584  /* TODO: check about string terminator character */
6585  str_enc_copy(target, source);
6586  /*ENC_CODERANGE_SET(mapped, cr);*/
6587 
6588  return target;
6589 }
6590 
6591 static VALUE
6592 rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
6593 {
6594  const OnigUChar *source_current, *source_end;
6595  OnigUChar *target_current, *target_end;
6596  long old_length = RSTRING_LEN(source);
6597  int length_or_invalid;
6598 
6599  if (old_length == 0) return Qnil;
6600 
6601  source_current = (OnigUChar*)RSTRING_PTR(source);
6602  source_end = (OnigUChar*)RSTRING_END(source);
6603  if (source == target) {
6604  target_current = (OnigUChar*)source_current;
6605  target_end = (OnigUChar*)source_end;
6606  }
6607  else {
6608  target_current = (OnigUChar*)RSTRING_PTR(target);
6609  target_end = (OnigUChar*)RSTRING_END(target);
6610  }
6611 
6612  length_or_invalid = onigenc_ascii_only_case_map(flags,
6613  &source_current, source_end,
6614  target_current, target_end, enc);
6615  if (length_or_invalid < 0)
6616  rb_raise(rb_eArgError, "input string invalid");
6617  if (CASEMAP_DEBUG && length_or_invalid != old_length) {
6618  fprintf(stderr, "problem with rb_str_ascii_casemap"
6619  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
6620  rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
6621  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
6622  }
6623 
6624  str_enc_copy(target, source);
6625 
6626  return target;
6627 }
6628 
6629 static bool
6630 upcase_single(VALUE str)
6631 {
6632  char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
6633  bool modified = false;
6634 
6635  while (s < send) {
6636  unsigned int c = *(unsigned char*)s;
6637 
6638  if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
6639  *s = 'A' + (c - 'a');
6640  modified = true;
6641  }
6642  s++;
6643  }
6644  return modified;
6645 }
6646 
6647 /*
6648  * call-seq:
6649  * str.upcase! -> str or nil
6650  * str.upcase!([options]) -> str or nil
6651  *
6652  * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
6653  * were made.
6654  *
6655  * See String#downcase for meaning of +options+ and use with different encodings.
6656  */
6657 
/*
 * In-place upcase. Validates the options, makes str modifiable, then takes
 * one of three paths: the byte-wise helper when case_option_single_p
 * allows it, the ASCII-only mapper when that flag is set, or the full
 * case mapper whose result replaces the contents of str. Returns str when
 * ONIGENC_CASE_MODIFIED ended up set in flags, otherwise Qnil.
 */
6658 static VALUE
6659 rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
6660 {
6661  rb_encoding *enc;
/* NOTE(review): the listing skips line 6662 here; flags is used below but
 * its declaration and initial value are not visible. */
6663 
6664  flags = check_case_options(argc, argv, flags);
6665  str_modify_keep_cr(str);
6666  enc = str_true_enc(str);
6667  if (case_option_single_p(flags, enc, str)) {
/* The byte-wise helper does not touch flags, so record the change here. */
6668  if (upcase_single(str))
6669  flags |= ONIGENC_CASE_MODIFIED;
6670  }
6671  else if (flags&ONIGENC_CASE_ASCII_ONLY)
6672  rb_str_ascii_casemap(str, str, &flags, enc);
6673  else
6674  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6675 
6676  if (ONIGENC_CASE_MODIFIED&flags) return str;
6677  return Qnil;
6678 }
6679 
6680 
6681 /*
6682  * call-seq:
6683  * str.upcase -> new_str
6684  * str.upcase([options]) -> new_str
6685  *
6686  * Returns a copy of <i>str</i> with all lowercase letters replaced with their
6687  * uppercase counterparts.
6688  *
6689  * See String#downcase for meaning of +options+ and use with different encodings.
6690  *
6691  * "hEllO".upcase #=> "HELLO"
6692  */
6693 
/*
 * Non-destructive upcase. Validates the options, then produces the result
 * through the byte-wise helper, the ASCII-only mapper, or the full case
 * mapper, and returns it.
 *
 * NOTE(review): this listing skips its own lines 6698, 6704 and 6709. The
 * declaration of flags and the statements that create ret in the first two
 * branches are not visible, so ret looks unassigned there as shown.
 */
6694 static VALUE
6695 rb_str_upcase(int argc, VALUE *argv, VALUE str)
6696 {
6697  rb_encoding *enc;
/* (listing line 6698 not visible) */
6699  VALUE ret;
6700 
6701  flags = check_case_options(argc, argv, flags);
6702  enc = str_true_enc(str);
6703  if (case_option_single_p(flags, enc, str)) {
/* (listing line 6704 not visible) */
6705  str_enc_copy(ret, str);
6706  upcase_single(ret);
6707  }
6708  else if (flags&ONIGENC_CASE_ASCII_ONLY) {
/* (listing line 6709 not visible) */
6710  rb_str_ascii_casemap(str, ret, &flags, enc);
6711  }
6712  else {
6713  ret = rb_str_casemap(str, &flags, enc);
6714  }
6715 
6716  return ret;
6717 }
6718 
6719 static bool
6720 downcase_single(VALUE str)
6721 {
6722  char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
6723  bool modified = false;
6724 
6725  while (s < send) {
6726  unsigned int c = *(unsigned char*)s;
6727 
6728  if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
6729  *s = 'a' + (c - 'A');
6730  modified = true;
6731  }
6732  s++;
6733  }
6734 
6735  return modified;
6736 }
6737 
6738 /*
6739  * call-seq:
6740  * str.downcase! -> str or nil
6741  * str.downcase!([options]) -> str or nil
6742  *
6743  * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
6744  * changes were made.
6745  *
6746  * See String#downcase for meaning of +options+ and use with different encodings.
6747  */
6748 
/*
 * In-place downcase. Validates the options, makes str modifiable, then
 * takes one of three paths: the byte-wise helper when case_option_single_p
 * allows it, the ASCII-only mapper when that flag is set, or the full
 * case mapper whose result replaces the contents of str. Returns str when
 * ONIGENC_CASE_MODIFIED ended up set in flags, otherwise Qnil.
 */
6749 static VALUE
6750 rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
6751 {
6752  rb_encoding *enc;
/* NOTE(review): the listing skips line 6753 here; flags is used below but
 * its declaration and initial value are not visible. */
6754 
6755  flags = check_case_options(argc, argv, flags);
6756  str_modify_keep_cr(str);
6757  enc = str_true_enc(str);
6758  if (case_option_single_p(flags, enc, str)) {
/* The byte-wise helper does not touch flags, so record the change here. */
6759  if (downcase_single(str))
6760  flags |= ONIGENC_CASE_MODIFIED;
6761  }
6762  else if (flags&ONIGENC_CASE_ASCII_ONLY)
6763  rb_str_ascii_casemap(str, str, &flags, enc);
6764  else
6765  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6766 
6767  if (ONIGENC_CASE_MODIFIED&flags) return str;
6768  return Qnil;
6769 }
6770 
6771 
6772 /*
6773  * call-seq:
6774  * str.downcase -> new_str
6775  * str.downcase([options]) -> new_str
6776  *
6777  * Returns a copy of <i>str</i> with all uppercase letters replaced with their
6778  * lowercase counterparts. Which letters exactly are replaced, and by which
6779  * other letters, depends on the presence or absence of options, and on the
6780  * +encoding+ of the string.
6781  *
6782  * The meaning of the +options+ is as follows:
6783  *
6784  * No option ::
6785  * Full Unicode case mapping, suitable for most languages
6786  * (see :turkic and :lithuanian options below for exceptions).
6787  * Context-dependent case mapping as described in Table 3-14 of the
6788  * Unicode standard is currently not supported.
6789  * :ascii ::
6790  * Only the ASCII region, i.e. the characters ``A'' to ``Z'' and
6791  * ``a'' to ``z'', are affected.
6792  * This option cannot be combined with any other option.
6793  * :turkic ::
6794  * Full Unicode case mapping, adapted for Turkic languages
6795  * (Turkish, Azerbaijani, ...). This means that upper case I is mapped to
6796  * lower case dotless i, and so on.
6797  * :lithuanian ::
6798  * Currently, just full Unicode case mapping. In the future, full Unicode
6799  * case mapping adapted for Lithuanian (keeping the dot on the lower case
6800  * i even if there is an accent on top).
6801  * :fold ::
6802  * Only available on +downcase+ and +downcase!+. Unicode case <b>folding</b>,
6803  * which is more far-reaching than Unicode case mapping.
6804  * This option currently cannot be combined with any other option
6805  * (i.e. there is currently no variant for turkic languages).
6806  *
6807  * Please note that several assumptions that are valid for ASCII-only case
6808  * conversions do not hold for more general case conversions. For example,
6809  * the length of the result may not be the same as the length of the input
6810  * (neither in characters nor in bytes), some roundtrip assumptions
6811  * (e.g. str.downcase == str.upcase.downcase) may not apply, and Unicode
6812  * normalization (i.e. String#unicode_normalize) is not necessarily maintained
6813  * by case mapping operations.
6814  *
6815  * Non-ASCII case mapping/folding is currently supported for UTF-8,
6816  * UTF-16BE/LE, UTF-32BE/LE, and ISO-8859-1~16 Strings/Symbols.
6817  * This support will be extended to other encodings.
6818  *
6819  * "hEllO".downcase #=> "hello"
6820  */
6821 
/*
 * Non-destructive downcase. Validates the options, then produces the result
 * through the byte-wise helper, the ASCII-only mapper, or the full case
 * mapper, and returns it.
 *
 * NOTE(review): this listing skips its own lines 6826, 6832 and 6837. The
 * declaration of flags and the statements that create ret in the first two
 * branches are not visible, so ret looks unassigned there as shown.
 */
6822 static VALUE
6823 rb_str_downcase(int argc, VALUE *argv, VALUE str)
6824 {
6825  rb_encoding *enc;
/* (listing line 6826 not visible) */
6827  VALUE ret;
6828 
6829  flags = check_case_options(argc, argv, flags);
6830  enc = str_true_enc(str);
6831  if (case_option_single_p(flags, enc, str)) {
/* (listing line 6832 not visible) */
6833  str_enc_copy(ret, str);
6834  downcase_single(ret);
6835  }
6836  else if (flags&ONIGENC_CASE_ASCII_ONLY) {
/* (listing line 6837 not visible) */
6838  rb_str_ascii_casemap(str, ret, &flags, enc);
6839  }
6840  else {
6841  ret = rb_str_casemap(str, &flags, enc);
6842  }
6843 
6844  return ret;
6845 }
6846 
6847 
6848 /*
6849  * call-seq:
6850  * str.capitalize! -> str or nil
6851  * str.capitalize!([options]) -> str or nil
6852  *
6853  * Modifies <i>str</i> by converting the first character to uppercase and the
6854  * remainder to lowercase. Returns <code>nil</code> if no changes are made.
6855  * There is an exception for modern Georgian (mkhedruli/MTAVRULI), where
6856  * the result is the same as for String#downcase, to avoid mixed case.
6857  *
6858  * See String#downcase for meaning of +options+ and use with different encodings.
6859  *
6860  * a = "hello"
6861  * a.capitalize! #=> "Hello"
6862  * a #=> "Hello"
6863  * a.capitalize! #=> nil
6864  */
6865 
/*
 * In-place capitalize. Validates the options, makes str modifiable, and
 * returns Qnil at once for an empty string. Otherwise runs the ASCII-only
 * mapper or the full case mapper and returns str when
 * ONIGENC_CASE_MODIFIED ended up set in flags, else Qnil.
 */
6866 static VALUE
6867 rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
6868 {
6869  rb_encoding *enc;
/* NOTE(review): the listing skips line 6870 here; flags is used below but
 * its declaration and initial value are not visible. */
6871 
6872  flags = check_case_options(argc, argv, flags);
6873  str_modify_keep_cr(str);
6874  enc = str_true_enc(str);
6875  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
6876  if (flags&ONIGENC_CASE_ASCII_ONLY)
6877  rb_str_ascii_casemap(str, str, &flags, enc);
6878  else
6879  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6880 
6881  if (ONIGENC_CASE_MODIFIED&flags) return str;
6882  return Qnil;
6883 }
6884 
6885 
6886 /*
6887  * call-seq:
6888  * str.capitalize -> new_str
6889  * str.capitalize([options]) -> new_str
6890  *
6891  * Returns a copy of <i>str</i> with the first character converted to uppercase
6892  * and the remainder to lowercase.
6893  *
6894  * See String#downcase for meaning of +options+ and use with different encodings.
6895  *
6896  * "hello".capitalize #=> "Hello"
6897  * "HELLO".capitalize #=> "Hello"
6898  * "123ABC".capitalize #=> "123abc"
6899  */
6900 
/*
 * Non-destructive capitalize. Validates the options, then produces the
 * result through the ASCII-only mapper or the full case mapper.
 *
 * NOTE(review): this listing skips its own lines 6905 and 6912. The
 * declaration of flags and the statement that creates ret in the
 * ASCII-only branch are not visible.
 */
6901 static VALUE
6902 rb_str_capitalize(int argc, VALUE *argv, VALUE str)
6903 {
6904  rb_encoding *enc;
/* (listing line 6905 not visible) */
6906  VALUE ret;
6907 
6908  flags = check_case_options(argc, argv, flags);
6909  enc = str_true_enc(str);
/* NOTE(review): for an empty string the receiver itself is returned, not a
 * copy, although the doc comment promises new_str. Confirm this is meant. */
6910  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
6911  if (flags&ONIGENC_CASE_ASCII_ONLY) {
/* (listing line 6912 not visible) */
6913  rb_str_ascii_casemap(str, ret, &flags, enc);
6914  }
6915  else {
6916  ret = rb_str_casemap(str, &flags, enc);
6917  }
6918  return ret;
6919 }
6920 
6921 
6922 /*
6923  * call-seq:
6924  * str.swapcase! -> str or nil
6925  * str.swapcase!([options]) -> str or nil
6926  *
6927  * Equivalent to String#swapcase, but modifies the receiver in place,
6928  * returning <i>str</i>, or <code>nil</code> if no changes were made.
6929  *
6930  * See String#downcase for meaning of +options+ and use with
6931  * different encodings.
6932  */
6933 
/*
 * In-place swapcase. Validates the options, makes str modifiable, runs the
 * ASCII-only mapper or the full case mapper, and returns str when
 * ONIGENC_CASE_MODIFIED ended up set in flags, else Qnil.
 */
6934 static VALUE
6935 rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
6936 {
6937  rb_encoding *enc;
/* NOTE(review): the listing skips line 6938 here; flags is used below but
 * its declaration and initial value are not visible. */
6939 
6940  flags = check_case_options(argc, argv, flags);
6941  str_modify_keep_cr(str);
6942  enc = str_true_enc(str);
6943  if (flags&ONIGENC_CASE_ASCII_ONLY)
6944  rb_str_ascii_casemap(str, str, &flags, enc);
6945  else
6946  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6947 
6948  if (ONIGENC_CASE_MODIFIED&flags) return str;
6949  return Qnil;
6950 }
6951 
6952 
6953 /*
6954  * call-seq:
6955  * str.swapcase -> new_str
6956  * str.swapcase([options]) -> new_str
6957  *
6958  * Returns a copy of <i>str</i> with uppercase alphabetic characters converted
6959  * to lowercase and lowercase characters converted to uppercase.
6960  *
6961  * See String#downcase for meaning of +options+ and use with different encodings.
6962  *
6963  * "Hello".swapcase #=> "hELLO"
6964  * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11"
6965  */
6966 
/*
 * Non-destructive swapcase. Validates the options, then produces the result
 * through the ASCII-only mapper or the full case mapper.
 *
 * NOTE(review): this listing skips its own lines 6971 and 6978. The
 * declaration of flags and the statement that creates ret in the
 * ASCII-only branch are not visible.
 */
6967 static VALUE
6968 rb_str_swapcase(int argc, VALUE *argv, VALUE str)
6969 {
6970  rb_encoding *enc;
/* (listing line 6971 not visible) */
6972  VALUE ret;
6973 
6974  flags = check_case_options(argc, argv, flags);
6975  enc = str_true_enc(str);
/* NOTE(review): for an empty string the receiver itself is returned, not a
 * copy, although the doc comment promises new_str. Confirm this is meant. */
6976  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
6977  if (flags&ONIGENC_CASE_ASCII_ONLY) {
/* (listing line 6978 not visible) */
6979  rb_str_ascii_casemap(str, ret, &flags, enc);
6980  }
6981  else {
6982  ret = rb_str_casemap(str, &flags, enc);
6983  }
6984  return ret;
6985 }
6986 
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification, advanced by trnext().
 */
struct tr {
    int gen;            /* nonzero while a c1-c2 range is being expanded */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* final code point of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* one past the last byte of the specification */
};
6994 
/*
 * Produce the next code point described by the specification in t.
 * A c1-c2 range is expanded one code point per call: t->gen marks that a
 * range is in progress, t->now is the last value produced and t->max is the
 * end of the range. Returns -1 converted to unsigned int once the
 * specification is exhausted. Raises ArgumentError for a descending range.
 */
6995 static unsigned int
6996 trnext(struct tr *t, rb_encoding *enc)
6997 {
6998  int n;
6999 
7000  for (;;) {
7001  if (!t->gen) {
7002 nextpart:
7003  if (t->p == t->pend) return -1;
/* A backslash that is not the last character is skipped, so that the
 * character after it is read as itself. */
7004  if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7005  t->p += n;
7006  }
7007  t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7008  t->p += n;
/* A hyphen that is not the last character starts a range. */
7009  if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7010  t->p += n;
7011  if (t->p < t->pend) {
7012  unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7013  t->p += n;
7014  if (t->now > c) {
7015  if (t->now < 0x80 && c < 0x80) {
/* NOTE(review): the listing skips line 7016 here. The two lines below are
 * the trailing arguments of a call whose opening is not visible;
 * presumably an rb_raise(rb_eArgError, ...) like the else branch. Confirm
 * against the real source. */
7017  "invalid range \"%c-%c\" in string transliteration",
7018  t->now, c);
7019  }
7020  else {
7021  rb_raise(rb_eArgError, "invalid range in string transliteration");
7022  }
7023  continue; /* not reached */
7024  }
7025  t->gen = 1;
7026  t->max = c;
7027  }
7028  }
7029  return t->now;
7030  }
7031  else {
/* Range in progress: step to the next code point, skipping values for
 * which ONIGENC_CODE_TO_MBCLEN reports a length of zero or less. */
7032  while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7033  if (t->now == t->max) {
7034  t->gen = 0;
7035  goto nextpart;
7036  }
7037  }
7038  if (t->now < t->max) {
7039  return t->now;
7040  }
7041  else {
/* The end of the range is reached: leave range mode and yield it. */
7042  t->gen = 0;
7043  return t->max;
7044  }
7045  }
7046  }
7047 }
7048 
7049 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7050 
/*
 * Translate the characters of str in place.
 *
 * str   - string to modify
 * src   - specification of the characters to replace; a leading caret
 *         followed by at least one more character negates the set
 * repl  - specification of the replacement characters; when it is empty
 *         the call is forwarded to rb_str_delete_bang
 * sflag - when nonzero, a run of identical translated characters is
 *         written only once
 *
 * Code points below 256 are mapped through the trans table; larger ones go
 * through a hash that is created on demand. Returns str when something was
 * modified, otherwise Qnil (also for an empty str).
 *
 * NOTE(review): this listing skips its own lines 7203, 7208, 7279 and 7284,
 * all inside the two buffer hand-over sequences. See the notes there.
 */
7051 static VALUE
7052 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7053 {
/* errc marks both the end of a specification and an unmapped table slot. */
7054  const unsigned int errc = -1;
7055  unsigned int trans[256];
7056  rb_encoding *enc, *e1, *e2;
7057  struct tr trsrc, trrepl;
7058  int cflag = 0;
7059  unsigned int c, c0, last = 0;
7060  int modify = 0, i, l;
7061  unsigned char *s, *send;
7062  VALUE hash = 0;
7063  int singlebyte = single_byte_optimizable(str);
7064  int termlen;
7065  int cr;
7066 
/* Downgrades cr from 7BIT to VALID as soon as a non-ASCII value is seen. */
7067 #define CHECK_IF_ASCII(c) \
7068  (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7069  (cr = ENC_CODERANGE_VALID) : 0)
7070 
7071  StringValue(src);
7072  StringValue(repl);
7073  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7074  if (RSTRING_LEN(repl) == 0) {
7075  return rb_str_delete_bang(1, &src, str);
7076  }
7077 
7078  cr = ENC_CODERANGE(str);
/* Both specifications must be compatible with str; when they resolve to
 * different encodings, the one shared by src and repl is used. */
7079  e1 = rb_enc_check(str, src);
7080  e2 = rb_enc_check(str, repl);
7081  if (e1 == e2) {
7082  enc = e1;
7083  }
7084  else {
7085  enc = rb_enc_check(src, repl);
7086  }
7087  trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
/* A leading caret only negates when more of the specification follows. */
7088  if (RSTRING_LEN(src) > 1 &&
7089  rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7090  trsrc.p + l < trsrc.pend) {
7091  cflag = 1;
7092  trsrc.p += l;
7093  }
7094  trrepl.p = RSTRING_PTR(repl);
7095  trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7096  trsrc.gen = trrepl.gen = 0;
7097  trsrc.now = trrepl.now = 0;
7098  trsrc.max = trrepl.max = 0;
7099 
7100  if (cflag) {
/* Negated set: start with every slot mapped, unmap the listed characters,
 * then map the remaining slots to the last replacement character. */
7101  for (i=0; i<256; i++) {
7102  trans[i] = 1;
7103  }
7104  while ((c = trnext(&trsrc, enc)) != errc) {
7105  if (c < 256) {
7106  trans[c] = errc;
7107  }
7108  else {
7109  if (!hash) hash = rb_hash_new();
7110  rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7111  }
7112  }
7113  while ((c = trnext(&trrepl, enc)) != errc)
7114  /* retrieve last replacer */;
7115  last = trrepl.now;
7116  for (i=0; i<256; i++) {
7117  if (trans[i] != errc) {
7118  trans[i] = last;
7119  }
7120  }
7121  }
7122  else {
7123  unsigned int r;
7124 
/* Plain set: pair each source character with the next replacement; once
 * the replacements run out the last one is reused. */
7125  for (i=0; i<256; i++) {
7126  trans[i] = errc;
7127  }
7128  while ((c = trnext(&trsrc, enc)) != errc) {
7129  r = trnext(&trrepl, enc);
7130  if (r == errc) r = trrepl.now;
7131  if (c < 256) {
7132  trans[c] = r;
/* A multi-byte replacement rules out the byte-wise in-place path. */
7133  if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7134  }
7135  else {
7136  if (!hash) hash = rb_hash_new();
7137  rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7138  }
7139  }
7140  }
7141 
/* Assume 7BIT optimistically; CHECK_IF_ASCII downgrades it again below. */
7142  if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7143  cr = ENC_CODERANGE_7BIT;
7144  str_modify_keep_cr(str);
7145  s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7146  termlen = rb_enc_mbminlen(enc);
7147  if (sflag) {
/* Path 1: squeezing translation into a freshly allocated buffer. */
7148  int clen, tlen;
7149  long offset, max = RSTRING_LEN(str);
7150  unsigned int save = -1;
7151  unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7152 
7153  while (s < send) {
7154  int may_modify = 0;
7155 
7156  c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7157  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7158 
7159  s += clen;
7160  if (c < 256) {
7161  c = trans[c];
7162  }
7163  else if (hash) {
7164  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7165  if (NIL_P(tmp)) {
7166  if (cflag) c = last;
7167  else c = errc;
7168  }
7169  else if (cflag) c = errc;
7170  else c = NUM2INT(tmp);
7171  }
7172  else {
7173  c = errc;
7174  }
7175  if (c != (unsigned int)-1) {
/* Same translated character as the previous one: drop it. */
7176  if (save == c) {
7177  CHECK_IF_ASCII(c);
7178  continue;
7179  }
7180  save = c;
7181  tlen = rb_enc_codelen(c, enc);
7182  modify = 1;
7183  }
7184  else {
/* Untranslated character: copy the original and reset the squeeze state. */
7185  save = -1;
7186  c = c0;
7187  if (enc != e1) may_modify = 1;
7188  }
/* Grow the buffer when the next character would not fit. */
7189  if ((offset = t - buf) + tlen > max) {
7190  size_t MAYBE_UNUSED(old) = max + termlen;
7191  max = offset + tlen + (send - s);
7192  SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7193  t = buf + offset;
7194  }
7195  rb_enc_mbcput(c, t, enc);
7196  if (may_modify && memcmp(s, t, tlen) != 0) {
7197  modify = 1;
7198  }
7199  CHECK_IF_ASCII(c);
7200  t += tlen;
7201  }
7202  if (!STR_EMBED_P(str)) {
/* NOTE(review): the listing skips line 7203, so this branch looks empty.
 * Presumably the previous heap buffer of str is released here; confirm. */
7204  }
7205  TERM_FILL((char *)t, termlen);
7206  RSTRING(str)->as.heap.ptr = (char *)buf;
7207  RSTRING(str)->as.heap.len = t - buf;
/* NOTE(review): the listing skips line 7208 here; one statement of the
 * buffer hand-over is not visible. */
7209  RSTRING(str)->as.heap.aux.capa = max;
7210  }
7211  else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
/* Path 2: every character is one byte, so rewrite str in place. */
7212  while (s < send) {
7213  c = (unsigned char)*s;
7214  if (trans[c] != errc) {
7215  if (!cflag) {
7216  c = trans[c];
7217  *s = c;
7218  modify = 1;
7219  }
7220  else {
7221  *s = last;
7222  modify = 1;
7223  }
7224  }
7225  CHECK_IF_ASCII(c);
7226  s++;
7227  }
7228  }
7229  else {
/* Path 3: general multi-byte translation into a new buffer, sized with
 * some headroom over the input length. */
7230  int clen, tlen;
7231  long offset, max = (long)((send - s) * 1.2);
7232  unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7233 
7234  while (s < send) {
7235  int may_modify = 0;
7236  c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7237  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7238 
7239  if (c < 256) {
7240  c = trans[c];
7241  }
7242  else if (hash) {
7243  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7244  if (NIL_P(tmp)) {
7245  if (cflag) c = last;
7246  else c = errc;
7247  }
7248  else if (cflag) c = errc;
7249  else c = NUM2INT(tmp);
7250  }
7251  else {
/* NOTE(review): path 1 uses plain errc at this point, while this path
 * honours cflag. Confirm the difference is intended. */
7252  c = cflag ? last : errc;
7253  }
7254  if (c != errc) {
7255  tlen = rb_enc_codelen(c, enc);
7256  modify = 1;
7257  }
7258  else {
7259  c = c0;
7260  if (enc != e1) may_modify = 1;
7261  }
7262  if ((offset = t - buf) + tlen > max) {
7263  size_t MAYBE_UNUSED(old) = max + termlen;
7264  max = offset + tlen + (long)((send - s) * 1.2);
7265  SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7266  t = buf + offset;
7267  }
7268  if (s != t) {
7269  rb_enc_mbcput(c, t, enc);
7270  if (may_modify && memcmp(s, t, tlen) != 0) {
7271  modify = 1;
7272  }
7273  }
7274  CHECK_IF_ASCII(c);
7275  s += clen;
7276  t += tlen;
7277  }
7278  if (!STR_EMBED_P(str)) {
/* NOTE(review): the listing skips line 7279, so this branch looks empty.
 * Presumably the previous heap buffer of str is released here; confirm. */
7280  }
7281  TERM_FILL((char *)t, termlen);
7282  RSTRING(str)->as.heap.ptr = (char *)buf;
7283  RSTRING(str)->as.heap.len = t - buf;
/* NOTE(review): the listing skips line 7284 here; one statement of the
 * buffer hand-over is not visible. */
7285  RSTRING(str)->as.heap.aux.capa = max;
7286  }
7287 
7288  if (modify) {
/* Record the recomputed code range unless the string was already broken. */
7289  if (cr != ENC_CODERANGE_BROKEN)
7290  ENC_CODERANGE_SET(str, cr);
7291  rb_enc_associate(str, enc);
7292  return str;
7293  }
7294  return Qnil;
7295 }
7296 
7297 
7298 /*
7299  * call-seq:
7300  * str.tr!(from_str, to_str) -> str or nil
7301  *
7302  * Translates <i>str</i> in place, using the same rules as
7303  * String#tr. Returns <i>str</i>, or <code>nil</code> if no changes
7304  * were made.
7305  */
7306 
7307 static VALUE
7308 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
7309 {
7310  return tr_trans(str, src, repl, 0);
7311 }
7312 
7313 
7314 /*
7315  * call-seq:
7316  * str.tr(from_str, to_str) -> new_str
7317  *
7318  * Returns a copy of +str+ with the characters in +from_str+ replaced by the
7319  * corresponding characters in +to_str+. If +to_str+ is shorter than
7320  * +from_str+, it is padded with its last character in order to maintain the
7321  * correspondence.
7322  *
7323  * "hello".tr('el', 'ip') #=> "hippo"
7324  * "hello".tr('aeiou', '*') #=> "h*ll*"
7325  * "hello".tr('aeiou', 'AA*') #=> "hAll*"
7326  *
7327  * Both strings may use the <code>c1-c2</code> notation to denote ranges of
7328  * characters, and +from_str+ may start with a <code>^</code>, which denotes
7329  * all characters except those listed.
7330  *
7331  * "hello".tr('a-y', 'b-z') #=> "ifmmp"
7332  * "hello".tr('^aeiou', '*') #=> "*e**o"
7333  *
7334  * The backslash character <code>\</code> can be used to escape
7335  * <code>^</code> or <code>-</code> and is otherwise ignored unless it
7336  * appears at the end of a range or the end of the +from_str+ or +to_str+:
7337  *
7338  * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
7339  * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
7340  *
7341  * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
7342  * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
7343  * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
7344  *
7345  * "X['\\b']".tr("X\\", "") #=> "['b']"
7346  * "X['\\b']".tr("X-\\]", "") #=> "'b'"
7347  */
7348 
7349 static VALUE
7350 rb_str_tr(VALUE str, VALUE src, VALUE repl)
7351 {
7352  str = rb_str_dup(str);
7353  tr_trans(str, src, repl, 0);
7354  return str;
7355 }
7356 
7357 #define TR_TABLE_SIZE 257
7358 static void
7359 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
7360  VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
7361 {
7362  const unsigned int errc = -1;
7363  char buf[256];
7364  struct tr tr;
7365  unsigned int c;
7366  VALUE table = 0, ptable = 0;
7367  int i, l, cflag = 0;
7368 
7369  tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
7370  tr.gen = tr.now = tr.max = 0;
7371 
7372  if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
7373  cflag = 1;
7374  tr.p += l;
7375  }
7376  if (first) {
7377  for (i=0; i<256; i++) {
7378  stable[i] = 1;
7379  }
7380  stable[256] = cflag;
7381  }
7382  else if (stable[256] && !cflag) {
7383  stable[256] = 0;
7384  }
7385  for (i=0; i<256; i++) {
7386  buf[i] = cflag;
7387  }
7388 
7389  while ((c = trnext(&tr, enc)) != errc) {
7390  if (c < 256) {
7391  buf[c & 0xff] = !cflag;
7392  }
7393  else {
7394  VALUE key = UINT2NUM(c);
7395 
7396  if (!table && (first || *tablep || stable[256])) {
7397  if (cflag) {
7398  ptable = *ctablep;
7399  table = ptable ? ptable : rb_hash_new();
7400  *ctablep = table;
7401  }
7402  else {
7403  table = rb_hash_new();
7404  ptable = *tablep;
7405  *tablep = table;
7406  }
7407  }
7408  if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
7409  rb_hash_aset(table, key, Qtrue);
7410  }
7411  }
7412  }
7413  for (i=0; i<256; i++) {
7414  stable[i] = stable[i] && buf[i];
7415  }
7416  if (!table && !cflag) {
7417  *tablep = 0;
7418  }
7419 }
7420 
7421 
7422 static int
7423 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
7424 {
7425  if (c < 256) {
7426  return table[c] != 0;
7427  }
7428  else {
7429  VALUE v = UINT2NUM(c);
7430 
7431  if (del) {
7432  if (!NIL_P(rb_hash_lookup(del, v)) &&
7433  (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
7434  return TRUE;
7435  }
7436  }
7437  else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
7438  return FALSE;
7439  }
7440  return table[256] ? TRUE : FALSE;
7441  }
7442 }
7443 
7444 /*
7445  * call-seq:
7446  * str.delete!([other_str]+) -> str or nil
7447  *
7448  * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
7449  * <code>nil</code> if <i>str</i> was not modified.
7450  */
7451 
7452 static VALUE
7453 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
7454 {
7455  char squeez[TR_TABLE_SIZE];
7456  rb_encoding *enc = 0;
7457  char *s, *send, *t;
7458  VALUE del = 0, nodel = 0;
7459  int modify = 0;
7460  int i, ascompat, cr;
7461 
7462  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7464  for (i=0; i<argc; i++) {
7465  VALUE s = argv[i];
7466 
7467  StringValue(s);
7468  enc = rb_enc_check(str, s);
7469  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
7470  }
7471 
7472  str_modify_keep_cr(str);
7473  ascompat = rb_enc_asciicompat(enc);
7474  s = t = RSTRING_PTR(str);
7475  send = RSTRING_END(str);
7476  cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
7477  while (s < send) {
7478  unsigned int c;
7479  int clen;
7480 
7481  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
7482  if (squeez[c]) {
7483  modify = 1;
7484  }
7485  else {
7486  if (t != s) *t = c;
7487  t++;
7488  }
7489  s++;
7490  }
7491  else {
7492  c = rb_enc_codepoint_len(s, send, &clen, enc);
7493 
7494  if (tr_find(c, squeez, del, nodel)) {
7495  modify = 1;
7496  }
7497  else {
7498  if (t != s) rb_enc_mbcput(c, t, enc);
7499  t += clen;
7500  if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
7501  }
7502  s += clen;
7503  }
7504  }
7505  TERM_FILL(t, TERM_LEN(str));
7506  STR_SET_LEN(str, t - RSTRING_PTR(str));
7507  ENC_CODERANGE_SET(str, cr);
7508 
7509  if (modify) return str;
7510  return Qnil;
7511 }
7512 
7513 
7514 /*
7515  * call-seq:
7516  * str.delete([other_str]+) -> new_str
7517  *
7518  * Returns a copy of <i>str</i> with all characters in the intersection of its
7519  * arguments deleted. Uses the same rules for building the set of characters as
7520  * String#count.
7521  *
7522  * "hello".delete "l","lo" #=> "heo"
7523  * "hello".delete "lo" #=> "he"
7524  * "hello".delete "aeiou", "^e" #=> "hell"
7525  * "hello".delete "ej-m" #=> "ho"
7526  */
7527 
7528 static VALUE
7529 rb_str_delete(int argc, VALUE *argv, VALUE str)
7530 {
7531  str = rb_str_dup(str);
7532  rb_str_delete_bang(argc, argv, str);
7533  return str;
7534 }
7535 
7536 
7537 /*
7538  * call-seq:
7539  * str.squeeze!([other_str]*) -> str or nil
7540  *
7541  * Squeezes <i>str</i> in place, returning either <i>str</i>, or
7542  * <code>nil</code> if no changes were made.
7543  */
7544 
7545 static VALUE
7546 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
7547 {
7548  char squeez[TR_TABLE_SIZE];
7549  rb_encoding *enc = 0;
7550  VALUE del = 0, nodel = 0;
7551  unsigned char *s, *send, *t;
7552  int i, modify = 0;
7553  int ascompat, singlebyte = single_byte_optimizable(str);
7554  unsigned int save;
7555 
7556  if (argc == 0) {
7557  enc = STR_ENC_GET(str);
7558  }
7559  else {
7560  for (i=0; i<argc; i++) {
7561  VALUE s = argv[i];
7562 
7563  StringValue(s);
7564  enc = rb_enc_check(str, s);
7565  if (singlebyte && !single_byte_optimizable(s))
7566  singlebyte = 0;
7567  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
7568  }
7569  }
7570 
7571  str_modify_keep_cr(str);
7572  s = t = (unsigned char *)RSTRING_PTR(str);
7573  if (!s || RSTRING_LEN(str) == 0) return Qnil;
7574  send = (unsigned char *)RSTRING_END(str);
7575  save = -1;
7576  ascompat = rb_enc_asciicompat(enc);
7577 
7578  if (singlebyte) {
7579  while (s < send) {
7580  unsigned int c = *s++;
7581  if (c != save || (argc > 0 && !squeez[c])) {
7582  *t++ = save = c;
7583  }
7584  }
7585  }
7586  else {
7587  while (s < send) {
7588  unsigned int c;
7589  int clen;
7590 
7591  if (ascompat && (c = *s) < 0x80) {
7592  if (c != save || (argc > 0 && !squeez[c])) {
7593  *t++ = save = c;
7594  }
7595  s++;
7596  }
7597  else {
7598  c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
7599 
7600  if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
7601  if (t != s) rb_enc_mbcput(c, t, enc);
7602  save = c;
7603  t += clen;
7604  }
7605  s += clen;
7606  }
7607  }
7608  }
7609 
7610  TERM_FILL((char *)t, TERM_LEN(str));
7611  if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
7612  STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
7613  modify = 1;
7614  }
7615 
7616  if (modify) return str;
7617  return Qnil;
7618 }
7619 
7620 
7621 /*
7622  * call-seq:
7623  * str.squeeze([other_str]*) -> new_str
7624  *
7625  * Builds a set of characters from the <i>other_str</i> parameter(s)
7626  * using the procedure described for String#count. Returns a new
7627  * string where runs of the same character that occur in this set are
7628  * replaced by a single character. If no arguments are given, all
7629  * runs of identical characters are replaced by a single character.
7630  *
7631  * "yellow moon".squeeze #=> "yelow mon"
7632  * "  now   is  the".squeeze(" ") #=> " now is the"
7633  * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
7634  */
7635 
7636 static VALUE
7637 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
7638 {
7639  str = rb_str_dup(str);
7640  rb_str_squeeze_bang(argc, argv, str);
7641  return str;
7642 }
7643 
7644 
7645 /*
7646  * call-seq:
7647  * str.tr_s!(from_str, to_str) -> str or nil
7648  *
7649  * Performs String#tr_s processing on <i>str</i> in place,
7650  * returning <i>str</i>, or <code>nil</code> if no changes were made.
7651  */
7652 
7653 static VALUE
7654 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
7655 {
7656  return tr_trans(str, src, repl, 1);
7657 }
7658 
7659 
7660 /*
7661  * call-seq:
7662  * str.tr_s(from_str, to_str) -> new_str
7663  *
7664  * Processes a copy of <i>str</i> as described under String#tr, then
7665  * removes duplicate characters in regions that were affected by the
7666  * translation.
7667  *
7668  * "hello".tr_s('l', 'r') #=> "hero"
7669  * "hello".tr_s('el', '*') #=> "h*o"
7670  * "hello".tr_s('el', 'hx') #=> "hhxo"
7671  */
7672 
7673 static VALUE
7674 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
7675 {
7676  str = rb_str_dup(str);
7677  tr_trans(str, src, repl, 1);
7678  return str;
7679 }
7680 
7681 
7682 /*
7683  * call-seq:
7684  * str.count([other_str]+) -> integer
7685  *
7686  * Each +other_str+ parameter defines a set of characters to count. The
7687  * intersection of these sets defines the characters to count in +str+. Any
7688  * +other_str+ that starts with a caret <code>^</code> is negated. The
7689  * sequence <code>c1-c2</code> means all characters between c1 and c2. The
7690  * backslash character <code>\</code> can be used to escape <code>^</code> or
7691  * <code>-</code> and is otherwise ignored unless it appears at the end of a
7692  * sequence or the end of a +other_str+.
7693  *
7694  * a = "hello world"
7695  * a.count "lo" #=> 5
7696  * a.count "lo", "o" #=> 2
7697  * a.count "hello", "^l" #=> 4
7698  * a.count "ej-m" #=> 4
7699  *
7700  * "hello^world".count "\\^aeiou" #=> 4
7701  * "hello-world".count "a\\-eo" #=> 4
7702  *
7703  * c = "hello world\\r\\n"
7704  * c.count "\\" #=> 2
7705  * c.count "\\A" #=> 0
7706  * c.count "X-\\w" #=> 3
7707  */
7708 
7709 static VALUE
7710 rb_str_count(int argc, VALUE *argv, VALUE str)
7711 {
7712  char table[TR_TABLE_SIZE];
7713  rb_encoding *enc = 0;
7714  VALUE del = 0, nodel = 0, tstr;
7715  char *s, *send;
7716  int i;
7717  int ascompat;
7718 
7720 
7721  tstr = argv[0];
7722  StringValue(tstr);
7723  enc = rb_enc_check(str, tstr);
7724  if (argc == 1) {
7725  const char *ptstr;
7726  if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
7727  (ptstr = RSTRING_PTR(tstr),
7728  ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
7729  !is_broken_string(str)) {
7730  int n = 0;
7731  int clen;
7732  unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
7733 
7734  s = RSTRING_PTR(str);
7735  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
7736  send = RSTRING_END(str);
7737  while (s < send) {
7738  if (*(unsigned char*)s++ == c) n++;
7739  }
7740  return INT2NUM(n);
7741  }
7742  }
7743 
7744  tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
7745  for (i=1; i<argc; i++) {
7746  tstr = argv[i];
7747  StringValue(tstr);
7748  enc = rb_enc_check(str, tstr);
7749  tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
7750  }
7751 
7752  s = RSTRING_PTR(str);
7753  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
7754  send = RSTRING_END(str);
7755  ascompat = rb_enc_asciicompat(enc);
7756  i = 0;
7757  while (s < send) {
7758  unsigned int c;
7759 
7760  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
7761  if (table[c]) {
7762  i++;
7763  }
7764  s++;
7765  }
7766  else {
7767  int clen;
7768  c = rb_enc_codepoint_len(s, send, &clen, enc);
7769  if (tr_find(c, table, del, nodel)) {
7770  i++;
7771  }
7772  s += clen;
7773  }
7774  }
7775 
7776  return INT2NUM(i);
7777 }
7778 
7779 static VALUE
7780 rb_fs_check(VALUE val)
7781 {
7782  if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
7783  val = rb_check_string_type(val);
7784  if (NIL_P(val)) return 0;
7785  }
7786  return val;
7787 }
7788 
/*
 * Byte-indexed lookup table: 1 for the six bytes 0x09-0x0d and 0x20
 * (HT, LF, VT, FF, CR and SP in ASCII), 0 for every other byte value.
 */
static const char isspacetable[256] = {
    [0x09] = 1, [0x0a] = 1, [0x0b] = 1, [0x0c] = 1, [0x0d] = 1,
    [0x20] = 1
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
7809 
7810 static long
7811 split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
7812 {
7813  if (empty_count >= 0 && len == 0) {
7814  return empty_count + 1;
7815  }
7816  if (empty_count > 0) {
7817  /* make different substrings */
7818  if (result) {
7819  do {
7820  rb_ary_push(result, str_new_empty(str));
7821  } while (--empty_count > 0);
7822  }
7823  else {
7824  do {
7825  rb_yield(str_new_empty(str));
7826  } while (--empty_count > 0);
7827  }
7828  }
7829  str = rb_str_subseq(str, beg, len);
7830  if (result) {
7831  rb_ary_push(result, str);
7832  }
7833  else {
7834  rb_yield(str);
7835  }
7836  return empty_count;
7837 }
7838 
7839 /*
7840  * call-seq:
7841  * str.split(pattern=nil, [limit]) -> an_array
7842  * str.split(pattern=nil, [limit]) {|sub| block } -> str
7843  *
7844  * Divides <i>str</i> into substrings based on a delimiter, returning an array
7845  * of these substrings.
7846  *
7847  * If <i>pattern</i> is a String, then its contents are used as
7848  * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
7849  * space, <i>str</i> is split on whitespace, with leading and trailing
7850  * whitespace and runs of contiguous whitespace characters ignored.
7851  *
7852  * If <i>pattern</i> is a Regexp, <i>str</i> is divided where the
7853  * pattern matches. Whenever the pattern matches a zero-length string,
7854  * <i>str</i> is split into individual characters. If <i>pattern</i> contains
7855  * groups, the respective matches will be returned in the array as well.
7856  *
7857  * If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
7858  * If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
7859  * split on whitespace as if ' ' were specified.
7860  *
7861  * If the <i>limit</i> parameter is omitted, trailing null fields are
7862  * suppressed. If <i>limit</i> is a positive number, at most that number
7863  * of split substrings will be returned (captured groups will be returned
7864  * as well, but are not counted towards the limit).
7865  * If <i>limit</i> is <code>1</code>, the entire
7866  * string is returned as the only entry in an array. If negative, there is no
7867  * limit to the number of fields returned, and trailing null fields are not
7868  * suppressed.
7869  *
7870  * When the input +str+ is empty an empty Array is returned as the string is
7871  * considered to have no fields to split.
7872  *
7873  * " now's  the time ".split #=> ["now's", "the", "time"]
7874  * " now's  the time ".split(' ') #=> ["now's", "the", "time"]
7875  * " now's  the time".split(/ /) #=> ["", "now's", "", "the", "time"]
7876  * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
7877  * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
7878  * "hello".split(//, 3) #=> ["h", "e", "llo"]
7879  * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
7880  *
7881  * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
7882  * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
7883  * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
7884  * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
7885  *
7886  * "1:2:3".split(/(:)()()/, 2) #=> ["1", ":", "", "", "2:3"]
7887  *
7888  * "".split(',', -1) #=> []
7889  *
7890  * If a block is given, invoke the block with each split substring.
7891  *
7892  */
7893 
7894 static VALUE
7895 rb_str_split_m(int argc, VALUE *argv, VALUE str)
7896 {
  /*
   * Takes an optional pattern and an optional limit (rb_scan_args "02").
   * Fields are pushed onto a new array, or yielded one at a time when a
   * block is given.  Returns that array, or +str+ in the block form.
   */
7897  rb_encoding *enc;
7898  VALUE spat;
7899  VALUE limit;
7900  enum {awk, string, regexp, chars} split_type;
  /* i counts fields against lim.  empty_count is handed to split_string:
   * negative means empty fields are emitted as they come, non-negative
   * means they are counted and only emitted once a non-empty field follows. */
7901  long beg, end, i = 0, empty_count = -1;
7902  int lim = 0;
7903  VALUE result, tmp;
7904 
  /* Qfalse in the block form.  Otherwise a Qnil placeholder that the
   * `result ? … : …` tests below treat as true, replaced by a real array
   * further down. */
7905  result = rb_block_given_p() ? Qfalse : Qnil;
7906  if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
7907  lim = NUM2INT(limit);
  /* Zero or negative: no field limit.  lim keeps its value so that a
   * negative limit can still be told apart from 0 below. */
7908  if (lim <= 0) limit = Qnil;
7909  else if (lim == 1) {
  /* A limit of 1 means no splitting: the whole string is the only
   * field (or there is no field at all for an empty string). */
7910  if (RSTRING_LEN(str) == 0)
7911  return result ? rb_ary_new2(0) : str;
7912  tmp = rb_str_dup(str);
7913  if (!result) {
7914  rb_yield(tmp);
7915  return str;
7916  }
7917  return rb_ary_new3(1, tmp);
7918  }
7919  i = 1;
7920  }
  /* No limit, or a limit of exactly 0: switch on deferred handling of
   * empty fields, so trailing empty fields are never emitted. */
7921  if (NIL_P(limit) && !lim) empty_count = 0;
7922 
7923  enc = STR_ENC_GET(str);
7924  split_type = regexp;
  /* Pick the pattern: explicit argument, else rb_fs, else awk-style
   * whitespace splitting. */
7925  if (!NIL_P(spat)) {
7926  spat = get_pat_quoted(spat, 0);
7927  }
7928  else if (NIL_P(spat = rb_fs)) {
7929  split_type = awk;
7930  }
7931  else if (!(spat = rb_fs_check(spat))) {
7932  rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
7933  }
7934  else {
7935  rb_warn("$; is set to non-nil value");
7936  }
7937  if (split_type != awk) {
7938  if (BUILTIN_TYPE(spat) == T_STRING) {
7939  rb_encoding *enc2 = STR_ENC_GET(spat);
7940 
7941  mustnot_broken(spat);
7942  split_type = string;
7943  if (RSTRING_LEN(spat) == 0) {
7944  /* Special case - split into chars */
7945  split_type = chars;
7946  }
7947  else if (rb_enc_asciicompat(enc2) == 1) {
  /* A pattern consisting of a single space selects awk mode. */
7948  if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
7949  split_type = awk;
7950  }
7951  }
7952  else {
  /* Same single-space test for encodings that are not ASCII
   * compatible: the space may occupy l bytes. */
7953  int l;
7954  if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
7955  RSTRING_LEN(spat) == l) {
7956  split_type = awk;
7957  }
7958  }
7959  }
7960  }
7961 
  /* Emit one field and carry the deferred-empty-field count forward. */
7962 #define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
7963 
7964  if (result) result = rb_ary_new();
7965  beg = 0;
7966  char *ptr = RSTRING_PTR(str);
7967  char *eptr = RSTRING_END(str);
7968  if (split_type == awk) {
  /* Whitespace splitting.  skip is 1 while a run of whitespace is being
   * consumed; beg/end are byte offsets of the current field. */
7969  char *bptr = ptr;
7970  int skip = 1;
7971  unsigned int c;
7972 
7973  end = beg;
7974  if (is_ascii_string(str)) {
  /* Byte-at-a-time scan using the ascii_isspace table. */
7975  while (ptr < eptr) {
7976  c = (unsigned char)*ptr++;
7977  if (skip) {
7978  if (ascii_isspace(c)) {
7979  beg = ptr - bptr;
7980  }
7981  else {
7982  end = ptr - bptr;
7983  skip = 0;
  /* Limit reached: stop here, the rest becomes the last field. */
7984  if (!NIL_P(limit) && lim <= i) break;
7985  }
7986  }
7987  else if (ascii_isspace(c)) {
7988  SPLIT_STR(beg, end-beg);
7989  skip = 1;
7990  beg = ptr - bptr;
7991  if (!NIL_P(limit)) ++i;
7992  }
7993  else {
7994  end = ptr - bptr;
7995  }
7996  }
7997  }
7998  else {
  /* Same state machine, decoding one code point per step. */
7999  while (ptr < eptr) {
8000  int n;
8001 
8002  c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8003  ptr += n;
8004  if (skip) {
8005  if (rb_isspace(c)) {
8006  beg = ptr - bptr;
8007  }
8008  else {
8009  end = ptr - bptr;
8010  skip = 0;
8011  if (!NIL_P(limit) && lim <= i) break;
8012  }
8013  }
8014  else if (rb_isspace(c)) {
8015  SPLIT_STR(beg, end-beg);
8016  skip = 1;
8017  beg = ptr - bptr;
8018  if (!NIL_P(limit)) ++i;
8019  }
8020  else {
8021  end = ptr - bptr;
8022  }
8023  }
8024  }
8025  }
8026  else if (split_type == string) {
  /* Literal string separator, located with rb_memsearch. */
8027  char *str_start = ptr;
8028  char *substr_start = ptr;
8029  char *sptr = RSTRING_PTR(spat);
8030  long slen = RSTRING_LEN(spat);
8031 
8032  mustnot_broken(str);
8033  enc = rb_enc_check(str, spat);
8034  while (ptr < eptr &&
8035  (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8036  /* Check we are at the start of a char */
8037  char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8038  if (t != ptr + end) {
  /* Hit is not on a character head: resume the search from t
   * without emitting a field. */
8039  ptr = t;
8040  continue;
8041  }
8042  SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8043  ptr += end + slen;
8044  substr_start = ptr;
8045  if (!NIL_P(limit) && lim <= ++i) break;
8046  }
8047  beg = ptr - str_start;
8048  }
8049  else if (split_type == chars) {
  /* Empty string pattern: one field per character. */
8050  char *str_start = ptr;
8051  int n;
8052 
8053  mustnot_broken(str);
8054  enc = rb_enc_get(str);
8055  while (ptr < eptr &&
8056  (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8057  SPLIT_STR(ptr - str_start, n);
8058  ptr += n;
8059  if (!NIL_P(limit) && lim <= ++i) break;
8060  }
8061  beg = ptr - str_start;
8062  }
8063  else {
  /* Regexp separator.  last_null remembers that the previous search
   * produced an empty match at the search start, so the next such match
   * emits a one-character field instead of looping forever. */
8064  long len = RSTRING_LEN(str);
8065  long start = beg;
8066  long idx;
8067  int last_null = 0;
8068  struct re_registers *regs;
8069  VALUE match = 0;
8070 
  /* The for-increment un-busies the match and restores it as the
   * backref after each iteration that does not `break`. */
8071  for (; (end = rb_reg_search(spat, str, start, 0)) >= 0;
8072  (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8073  match = rb_backref_get();
  /* Block form only: mark the match busy while fields are yielded. */
8074  if (!result) rb_match_busy(match);
8075  regs = RMATCH_REGS(match);
8076  if (start == end && BEG(0) == END(0)) {
8077  if (!ptr) {
8078  SPLIT_STR(0, 0);
8079  break;
8080  }
8081  else if (last_null == 1) {
8082  SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8083  beg = start;
8084  }
8085  else {
  /* First empty match here: advance one character and retry. */
8086  if (start == len)
8087  start++;
8088  else
8089  start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8090  last_null = 1;
8091  continue;
8092  }
8093  }
8094  else {
8095  SPLIT_STR(beg, end-beg);
8096  beg = start = END(0);
8097  }
8098  last_null = 0;
8099 
  /* Capture groups that took part in the match are emitted as fields. */
8100  for (idx=1; idx < regs->num_regs; idx++) {
8101  if (BEG(idx) == -1) continue;
8102  SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8103  }
8104  if (!NIL_P(limit) && lim <= ++i) break;
8105  }
8106  if (match) rb_match_unbusy(match);
8107  }
  /* Emit what remains after the last separator.  Without a limit this
   * happens only when bytes remain or the limit argument was negative. */
8108  if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8109  SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8110  }
8111 
8112  return result ? result : str;
8113 }
8114 
8115 VALUE
8116 rb_str_split(VALUE str, const char *sep0)
8117 {
8118  VALUE sep;
8119 
8120  StringValue(str);
8121  sep = rb_str_new_cstr(sep0);
8122  return rb_str_split_m(1, &sep, str);
8123 }
8124 
8125 #define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8126 
8127 static inline int
8128 enumerator_element(VALUE ary, VALUE e)
8129 {
8130  if (ary) {
8131  rb_ary_push(ary, e);
8132  return 0;
8133  }
8134  else {
8135  rb_yield(e);
8136  return 1;
8137  }
8138 }
8139 
8140 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8141 
8142 static const char *
8143 chomp_newline(const char *p, const char *e, rb_encoding *enc)
8144 {
8145  const char *prev = rb_enc_prev_char(p, e, e, enc);
8146  if (rb_enc_is_newline(prev, e, enc)) {
8147  e = prev;
8148  prev = rb_enc_prev_char(p, e, e, enc);
8149  if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8150  e = prev;
8151  }
8152  return e;
8153 }
8154 
/*
 * Worker called by rb_str_each_line and rb_str_lines.
 *
 * Splits +str+ into records on the separator given as the optional
 * positional argument (rb_rs when absent) and honours the +chomp+ keyword.
 * Each record goes through ENUM_ELEM: pushed onto +ary+, or yielded when
 * +ary+ is 0.  Returns +ary+ when it is non-zero, otherwise the original
 * receiver.
 */
8155 static VALUE
8156 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
8157 {
8158  rb_encoding *enc;
8159  VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
8160  const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
8161  long pos, len, rslen;
8162  int rsnewline = 0;
8163 
8164  if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
8165  rs = rb_rs;
8166  if (!NIL_P(opts)) {
8167  static ID keywords[1];
8168  if (!keywords[0]) {
8169  keywords[0] = rb_intern_const("chomp");
8170  }
8171  rb_get_kwargs(opts, keywords, 0, 1, &chomp);
  /* From here on chomp is a plain C truth value, not a Ruby object. */
8172  chomp = (chomp != Qundef && RTEST(chomp));
8173  }
8174 
  /* nil separator: the whole string is the single record. */
8175  if (NIL_P(rs)) {
8176  if (!ENUM_ELEM(ary, str)) {
8177  return ary;
8178  }
8179  else {
8180  return orig;
8181  }
8182  }
8183 
8184  if (!RSTRING_LEN(str)) goto end;
  /* NOTE(review): the embedded listing numbers jump from 8184 to 8186
   * here, so a statement may be missing from this copy.  Confirm against
   * the upstream source before relying on the ptr/len caching below. */
8186  ptr = subptr = RSTRING_PTR(str);
8187  pend = RSTRING_END(str);
8188  len = RSTRING_LEN(str);
8189  StringValue(rs);
8190  rslen = RSTRING_LEN(rs);
8191 
8192  if (rs == rb_default_rs)
8193  enc = rb_enc_get(str);
8194  else
8195  enc = rb_enc_check(str, rs);
8196 
8197  if (rslen == 0) {
8198  /* paragraph mode */
8199  int n;
  /* eol marks the end of the most recent newline that followed record
   * text; a further newline starting exactly there ends the record. */
8200  const char *eol = NULL;
8201  subend = subptr;
8202  while (subend < pend) {
8203  do {
  /* n is the width of a leading '\r' (0 if none), so that "\r\n" is
   * stepped over as one newline. */
8204  if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
8205  n = 0;
8206  rslen = n + rb_enc_mbclen(subend + n, pend, enc);
8207  if (rb_enc_is_newline(subend + n, pend, enc)) {
8208  if (eol == subend) break;
8209  subend += rslen;
8210  if (subptr) eol = subend;
8211  }
8212  else {
8213  if (!subptr) subptr = subend;
8214  subend += rslen;
8215  }
8216  rslen = 0;
8217  } while (subend < pend);
8218  if (!subptr) break;
8219  line = rb_str_subseq(str, subptr - ptr,
8220  subend - subptr + (chomp ? 0 : rslen));
  /* Only after an actual yield: verify the block left str alone. */
8221  if (ENUM_ELEM(ary, line)) {
8222  str_mod_check(str, ptr, len);
8223  }
8224  subptr = eol = NULL;
8225  }
8226  goto end;
8227  }
8228  else {
8229  rsptr = RSTRING_PTR(rs);
  /* rsnewline: the separator is exactly one minimum-width newline
   * character, which makes chomp go through chomp_newline below. */
8230  if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
8231  rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
8232  rsnewline = 1;
8233  }
8234  }
8235 
  /* The default separator is re-encoded when str's encoding is not ASCII
   * compatible, so the byte search below can match it. */
8236  if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
8237  rs = rb_str_new(rsptr, rslen);
8238  rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
8239  rsptr = RSTRING_PTR(rs);
8240  rslen = RSTRING_LEN(rs);
8241  }
8242 
8243  while (subptr < pend) {
8244  pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
8245  if (pos < 0) break;
8246  hit = subptr + pos;
  /* Ignore byte matches that do not start on a character head. */
8247  adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
8248  if (hit != adjusted) {
8249  subptr = adjusted;
8250  continue;
8251  }
8252  subend = hit += rslen;
8253  if (chomp) {
8254  if (rsnewline) {
8255  subend = chomp_newline(subptr, subend, enc);
8256  }
8257  else {
8258  subend -= rslen;
8259  }
8260  }
8261  line = rb_str_subseq(str, subptr - ptr, subend - subptr);
8262  if (ENUM_ELEM(ary, line)) {
8263  str_mod_check(str, ptr, len);
8264  }
8265  subptr = hit;
8266  }
8267 
  /* Trailing text after the last separator forms the final record. */
8268  if (subptr != pend) {
8269  if (chomp) {
8270  if (rsnewline) {
8271  pend = chomp_newline(subptr, pend, enc);
8272  }
8273  else if (pend - subptr >= rslen &&
8274  memcmp(pend - rslen, rsptr, rslen) == 0) {
8275  pend -= rslen;
8276  }
8277  }
8278  line = rb_str_subseq(str, subptr - ptr, pend - subptr);
8279  ENUM_ELEM(ary, line);
8280  RB_GC_GUARD(str);
8281  }
8282 
8283  end:
8284  if (ary)
8285  return ary;
8286  else
8287  return orig;
8288 }
8289 
8290 /*
8291  * call-seq:
8292  * str.each_line(separator=$/, chomp: false) {|substr| block } -> str
8293  * str.each_line(separator=$/, chomp: false) -> an_enumerator
8294  *
8295  * Splits <i>str</i> using the supplied parameter as the record
8296  * separator (<code>$/</code> by default), passing each substring in
8297  * turn to the supplied block. If a zero-length record separator is
8298  * supplied, the string is split into paragraphs delimited by
8299  * multiple successive newlines.
8300  *
8301  * If +chomp+ is +true+, +separator+ will be removed from the end of each
8302  * line.
8303  *
8304  * If no block is given, an enumerator is returned instead.
8305  *
8306  * "hello\nworld".each_line {|s| p s}
8307  * # prints:
8308  * # "hello\n"
8309  * # "world"
8310  *
8311  * "hello\nworld".each_line('l') {|s| p s}
8312  * # prints:
8313  * # "hel"
8314  * # "l"
8315  * # "o\nworl"
8316  * # "d"
8317  *
8318  * "hello\n\n\nworld".each_line('') {|s| p s}
8319  * # prints
8320  * # "hello\n\n"
8321  * # "world"
8322  *
8323  * "hello\nworld".each_line(chomp: true) {|s| p s}
8324  * # prints:
8325  * # "hello"
8326  * # "world"
8327  *
8328  * "hello\nworld".each_line('l', chomp: true) {|s| p s}
8329  * # prints:
8330  * # "he"
8331  * # ""
8332  * # "o\nwor"
8333  * # "d"
8334  *
8335  */
8336 
8337 static VALUE
8338 rb_str_each_line(int argc, VALUE *argv, VALUE str)
8339 {
8341  return rb_str_enumerate_lines(argc, argv, str, 0);
8342 }
8343 
8344 /*
8345  * call-seq:
8346  * str.lines(separator=$/, chomp: false) -> an_array
8347  *
8348  * Returns an array of lines in <i>str</i> split using the supplied
8349  * record separator (<code>$/</code> by default). This is a
8350  * shorthand for <code>str.each_line(separator, getline_args).to_a</code>.
8351  *
8352  * If +chomp+ is +true+, +separator+ will be removed from the end of each
8353  * line.
8354  *
8355  * "hello\nworld\n".lines #=> ["hello\n", "world\n"]
8356  * "hello  world".lines(' ') #=> ["hello ", " ", "world"]
8357  * "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"]
8358  *
8359  * If a block is given, which is a deprecated form, works the same as
8360  * <code>each_line</code>.
8361  */
8362 
8363 static VALUE
8364 rb_str_lines(int argc, VALUE *argv, VALUE str)
8365 {
8366  VALUE ary = WANTARRAY("lines", 0);
8367  return rb_str_enumerate_lines(argc, argv, str, ary);
8368 }
8369 
8370 static VALUE
8371 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
8372 {
8373  return LONG2FIX(RSTRING_LEN(str));
8374 }
8375 
8376 static VALUE
8377 rb_str_enumerate_bytes(VALUE str, VALUE ary)
8378 {
8379  long i;
8380 
8381  for (i=0; i<RSTRING_LEN(str); i++) {
8382  ENUM_ELEM(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff));
8383  }
8384  if (ary)
8385  return ary;
8386  else
8387  return str;
8388 }
8389 
8390 /*
8391  * call-seq:
8392  * str.each_byte {|integer| block } -> str
8393  * str.each_byte -> an_enumerator
8394  *
8395  * Passes each byte in <i>str</i> to the given block, or returns an
8396  * enumerator if no block is given.
8397  *
8398  * "hello".each_byte {|c| print c, ' ' }
8399  *
8400  * <em>produces:</em>
8401  *
8402  * 104 101 108 108 111
8403  */
8404 
8405 static VALUE
8406 rb_str_each_byte(VALUE str)
8407 {
8408  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
8409  return rb_str_enumerate_bytes(str, 0);
8410 }
8411 
8412 /*
8413  * call-seq:
8414  * str.bytes -> an_array
8415  *
8416  * Returns an array of bytes in <i>str</i>. This is a shorthand for
8417  * <code>str.each_byte.to_a</code>.
8418  *
8419  * If a block is given, which is a deprecated form, works the same as
8420  * <code>each_byte</code>.
8421  */
8422 
8423 static VALUE
8424 rb_str_bytes(VALUE str)
8425 {
8426  VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
8427  return rb_str_enumerate_bytes(str, ary);
8428 }
8429 
8430 static VALUE
8431 rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
8432 {
8433  return rb_str_length(str);
8434 }
8435 
8436 static VALUE
8437 rb_str_enumerate_chars(VALUE str, VALUE ary)
8438 {
8439  VALUE orig = str;
8440  long i, len, n;
8441  const char *ptr;
8442  rb_encoding *enc;
8443 
8445  ptr = RSTRING_PTR(str);
8446  len = RSTRING_LEN(str);
8447  enc = rb_enc_get(str);
8448 
8450  for (i = 0; i < len; i += n) {
8451  n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
8453  }
8454  }
8455  else {
8456  for (i = 0; i < len; i += n) {
8457  n = rb_enc_mbclen(ptr + i, ptr + len, enc);
8459  }
8460  }
8461  RB_GC_GUARD(str);
8462  if (ary)
8463  return ary;
8464  else
8465  return orig;
8466 }
8467 
8468 /*
8469  * call-seq:
8470  * str.each_char {|cstr| block } -> str
8471  * str.each_char -> an_enumerator
8472  *
8473  * Passes each character in <i>str</i> to the given block, or returns
8474  * an enumerator if no block is given.
8475  *
8476  * "hello".each_char {|c| print c, ' ' }
8477  *
8478  * <em>produces:</em>
8479  *
8480  * h e l l o
8481  */
8482 
8483 static VALUE
8484 rb_str_each_char(VALUE str)
8485 {
8486  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
8487  return rb_str_enumerate_chars(str, 0);
8488 }
8489 
8490 /*
8491  * call-seq:
8492  * str.chars -> an_array
8493  *
8494  * Returns an array of characters in <i>str</i>. This is a shorthand
8495  * for <code>str.each_char.to_a</code>.
8496  *
8497  * If a block is given, which is a deprecated form, works the same as
8498  * <code>each_char</code>.
8499  */
8500 
8501 static VALUE
8502 rb_str_chars(VALUE str)
8503 {
8504  VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
8505  return rb_str_enumerate_chars(str, ary);
8506 }
8507 
8508 static VALUE
8509 rb_str_enumerate_codepoints(VALUE str, VALUE ary)
8510 {
8511  VALUE orig = str;
8512  int n;
8513  unsigned int c;
8514  const char *ptr, *end;
8515  rb_encoding *enc;
8516 
8517  if (single_byte_optimizable(str))
8518  return rb_str_enumerate_bytes(str, ary);
8519 
8521  ptr = RSTRING_PTR(str);
8522  end = RSTRING_END(str);
8523  enc = STR_ENC_GET(str);
8524 
8525  while (ptr < end) {
8526  c = rb_enc_codepoint_len(ptr, end, &n, enc);
8527  ENUM_ELEM(ary, UINT2NUM(c));
8528  ptr += n;
8529  }
8530  RB_GC_GUARD(str);
8531  if (ary)
8532  return ary;
8533  else
8534  return orig;
8535 }
8536 
8537 /*
8538  * call-seq:
8539  * str.each_codepoint {|integer| block } -> str
8540  * str.each_codepoint -> an_enumerator
8541  *
8542  * Passes the Integer ordinal of each character in <i>str</i>,
8543  * also known as a <i>codepoint</i> when applied to Unicode strings to the
8544  * given block. For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
8545  * values are directly derived from the binary representation
8546  * of each character.
8547  *
8548  * If no block is given, an enumerator is returned instead.
8549  *
8550  * "hello\u0639".each_codepoint {|c| print c, ' ' }
8551  *
8552  * <em>produces:</em>
8553  *
8554  * 104 101 108 108 111 1593
8555  */
8556 
8557 static VALUE
8558 rb_str_each_codepoint(VALUE str)
8559 {
8560  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
8561  return rb_str_enumerate_codepoints(str, 0);
8562 }
8563 
8564 /*
8565  * call-seq:
8566  * str.codepoints -> an_array
8567  *
8568  * Returns an array of the Integer ordinals of the
8569  * characters in <i>str</i>. This is a shorthand for
8570  * <code>str.each_codepoint.to_a</code>.
8571  *
8572  * If a block is given, which is a deprecated form, works the same as
8573  * <code>each_codepoint</code>.
8574  */
8575 
8576 static VALUE
8577 rb_str_codepoints(VALUE str)
8578 {
8579  VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
8580  return rb_str_enumerate_codepoints(str, ary);
8581 }
8582 
8583 static regex_t *
8584 get_reg_grapheme_cluster(rb_encoding *enc)
8585 {
8586  int encidx = rb_enc_to_index(enc);
8587  regex_t *reg_grapheme_cluster = NULL;
8588  static regex_t *reg_grapheme_cluster_utf8 = NULL;
8589 
8590  /* synchronize */
8591  if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
8592  reg_grapheme_cluster = reg_grapheme_cluster_utf8;
8593  }
8594  if (!reg_grapheme_cluster) {
8595  const OnigUChar source_ascii[] = "\\X";
8596  OnigErrorInfo einfo;
8597  const OnigUChar *source = source_ascii;
8598  size_t source_len = sizeof(source_ascii) - 1;
8599  switch (encidx) {
8600 #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
8601 #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
8602 #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
8603 #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
8604 #define CASE_UTF(e) \
8605  case ENCINDEX_UTF_##e: { \
8606  static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
8607  source = source_UTF_##e; \
8608  source_len = sizeof(source_UTF_##e); \
8609  break; \
8610  }
8611  CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
8612 #undef CASE_UTF
8613 #undef CHARS_16BE
8614 #undef CHARS_16LE
8615 #undef CHARS_32BE
8616 #undef CHARS_32LE
8617  }
8618  int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
8619  ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
8620  if (r) {
8622  onig_error_code_to_str(message, r, &einfo);
8623  rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
8624  }
8625  if (encidx == rb_utf8_encindex()) {
8626  reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
8627  }
8628  }
8629  return reg_grapheme_cluster;
8630 }
8631 
8632 static VALUE
8633 rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
8634 {
8635  size_t grapheme_cluster_count = 0;
8636  regex_t *reg_grapheme_cluster = NULL;
8638  const char *ptr, *end;
8639 
8640  if (!rb_enc_unicode_p(enc)) {
8641  return rb_str_length(str);
8642  }
8643 
8644  reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
8645  ptr = RSTRING_PTR(str);
8646  end = RSTRING_END(str);
8647 
8648  while (ptr < end) {
8649  OnigPosition len = onig_match(reg_grapheme_cluster,
8650  (const OnigUChar *)ptr, (const OnigUChar *)end,
8651  (const OnigUChar *)ptr, NULL, 0);
8652  if (len <= 0) break;
8653  grapheme_cluster_count++;
8654  ptr += len;
8655  }
8656 
8657  return SIZET2NUM(grapheme_cluster_count);
8658 }
8659 
8660 static VALUE
8661 rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
8662 {
8663  VALUE orig = str;
8664  regex_t *reg_grapheme_cluster = NULL;
8666  const char *ptr0, *ptr, *end;
8667 
8668  if (!rb_enc_unicode_p(enc)) {
8669  return rb_str_enumerate_chars(str, ary);
8670  }
8671 
8672  if (!ary) str = rb_str_new_frozen(str);
8673  reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
8674  ptr0 = ptr = RSTRING_PTR(str);
8675  end = RSTRING_END(str);
8676 
8677  while (ptr < end) {
8678  OnigPosition len = onig_match(reg_grapheme_cluster,
8679  (const OnigUChar *)ptr, (const OnigUChar *)end,
8680  (const OnigUChar *)ptr, NULL, 0);
8681  if (len <= 0) break;
8682  ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
8683  ptr += len;
8684  }
8685  RB_GC_GUARD(str);
8686  if (ary)
8687  return ary;
8688  else
8689  return orig;
8690 }
8691 
8692 /*
8693  * call-seq:
8694  * str.each_grapheme_cluster {|cstr| block } -> str
8695  * str.each_grapheme_cluster -> an_enumerator
8696  *
8697  * Passes each grapheme cluster in <i>str</i> to the given block, or returns
8698  * an enumerator if no block is given.
8699  * Unlike String#each_char, this enumerates by grapheme clusters defined by
8700  * Unicode Standard Annex #29 http://unicode.org/reports/tr29/
8701  *
8702  * "a\u0300".each_char.to_a.size #=> 2
8703  * "a\u0300".each_grapheme_cluster.to_a.size #=> 1
8704  *
8705  */
8706 
8707 static VALUE
8708 rb_str_each_grapheme_cluster(VALUE str)
8709 {
8710  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
8711  return rb_str_enumerate_grapheme_clusters(str, 0);
8712 }
8713 
8714 /*
8715  * call-seq:
8716  * str.grapheme_clusters -> an_array
8717  *
8718  * Returns an array of grapheme clusters in <i>str</i>. This is a shorthand
8719  * for <code>str.each_grapheme_cluster.to_a</code>.
8720  *
8721  * If a block is given, which is a deprecated form, works the same as
8722  * <code>each_grapheme_cluster</code>.
8723  */
8724 
8725 static VALUE
8726 rb_str_grapheme_clusters(VALUE str)
8727 {
8728  VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
8729  return rb_str_enumerate_grapheme_clusters(str, ary);
8730 }
8731 
8732 static long
8733 chopped_length(VALUE str)
8734 {
8735  rb_encoding *enc = STR_ENC_GET(str);
8736  const char *p, *p2, *beg, *end;
8737 
8738  beg = RSTRING_PTR(str);
8739  end = beg + RSTRING_LEN(str);
8740  if (beg >= end) return 0;
8741  p = rb_enc_prev_char(beg, end, end, enc);
8742  if (!p) return 0;
8743  if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
8744  p2 = rb_enc_prev_char(beg, p, end, enc);
8745  if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
8746  }
8747  return p - beg;
8748 }
8749 
8750 /*
8751  * call-seq:
8752  * str.chop! -> str or nil
8753  *
8754  * Processes <i>str</i> as for String#chop, returning <i>str</i>, or
8755  * <code>nil</code> if <i>str</i> is the empty string. See also
8756  * String#chomp!.
8757  */
8758 
8759 static VALUE
8760 rb_str_chop_bang(VALUE str)
8761 {
8762  str_modify_keep_cr(str);
8763  if (RSTRING_LEN(str) > 0) {
8764  long len;
8765  len = chopped_length(str);
8766  STR_SET_LEN(str, len);
8770  }
8771  return str;
8772  }
8773  return Qnil;
8774 }
8775 
8776 
8777 /*
8778  * call-seq:
8779  * str.chop -> new_str
8780  *
8781  * Returns a new String with the last character removed. If the
8782  * string ends with <code>\r\n</code>, both characters are
8783  * removed. Applying <code>chop</code> to an empty string returns an
8784  * empty string. String#chomp is often a safer alternative, as it
8785  * leaves the string unchanged if it doesn't end in a record
8786  * separator.
8787  *
8788  * "string\r\n".chop #=> "string"
8789  * "string\n\r".chop #=> "string\n"
8790  * "string\n".chop #=> "string"
8791  * "string".chop #=> "strin"
8792  * "x".chop.chop #=> ""
8793  */
8794 
8795 static VALUE
8796 rb_str_chop(VALUE str)
8797 {
8798  return rb_str_subseq(str, 0, chopped_length(str));
8799 }
8800 
8801 
8802 static long
8803 chompped_length(VALUE str, VALUE rs)
8804 {
8805  rb_encoding *enc;
8806  int newline;
8807  char *pp, *e, *rsptr;
8808  long rslen;
8809  char *const p = RSTRING_PTR(str);
8810  long len = RSTRING_LEN(str);
8811 
8812  if (len == 0) return 0;
8813  e = p + len;
8814  if (rs == rb_default_rs) {
8815  smart_chomp:
8816  enc = rb_enc_get(str);
8817  if (rb_enc_mbminlen(enc) > 1) {
8818  pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
8819  if (rb_enc_is_newline(pp, e, enc)) {
8820  e = pp;
8821  }
8822  pp = e - rb_enc_mbminlen(enc);
8823  if (pp >= p) {
8824  pp = rb_enc_left_char_head(p, pp, e, enc);
8825  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
8826  e = pp;
8827  }
8828  }
8829  }
8830  else {
8831  switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
8832  case '\n':
8833  if (--e > p && *(e-1) == '\r') {
8834  --e;
8835  }
8836  break;
8837  case '\r':
8838  --e;
8839  break;
8840  }
8841  }
8842  return e - p;
8843  }
8844 
8845  enc = rb_enc_get(str);
8846  RSTRING_GETMEM(rs, rsptr, rslen);
8847  if (rslen == 0) {
8848  if (rb_enc_mbminlen(enc) > 1) {
8849  while (e > p) {
8850  pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
8851  if (!rb_enc_is_newline(pp, e, enc)) break;
8852  e = pp;
8853  pp -= rb_enc_mbminlen(enc);
8854  if (pp >= p) {
8855  pp = rb_enc_left_char_head(p, pp, e, enc);
8856  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
8857  e = pp;
8858  }
8859  }
8860  }
8861  }
8862  else {
8863  while (e > p && *(e-1) == '\n') {
8864  --e;
8865  if (e > p && *(e-1) == '\r')
8866  --e;
8867  }
8868  }
8869  return e - p;
8870  }
8871  if (rslen > len) return len;
8872 
8873  enc = rb_enc_get(rs);
8874  newline = rsptr[rslen-1];
8875  if (rslen == rb_enc_mbminlen(enc)) {
8876  if (rslen == 1) {
8877  if (newline == '\n')
8878  goto smart_chomp;
8879  }
8880  else {
8881  if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
8882  goto smart_chomp;
8883  }
8884  }
8885 
8886  enc = rb_enc_check(str, rs);
8887  if (is_broken_string(rs)) {
8888  return len;
8889  }
8890  pp = e - rslen;
8891  if (p[len-1] == newline &&
8892  (rslen <= 1 ||
8893  memcmp(rsptr, pp, rslen) == 0)) {
8894  if (rb_enc_left_char_head(p, pp, e, enc) == pp)
8895  return len - rslen;
8896  RB_GC_GUARD(rs);
8897  }
8898  return len;
8899 }
8900 
8906 static VALUE
8907 chomp_rs(int argc, const VALUE *argv)
8908 {
8909  rb_check_arity(argc, 0, 1);
8910  if (argc > 0) {
8911  VALUE rs = argv[0];
8912  if (!NIL_P(rs)) StringValue(rs);
8913  return rs;
8914  }
8915  else {
8916  return rb_rs;
8917  }
8918 }
8919 
8920 VALUE
8922 {
8923  long olen = RSTRING_LEN(str);
8924  long len = chompped_length(str, rs);
8925  if (len >= olen) return Qnil;
8926  str_modify_keep_cr(str);
8927  STR_SET_LEN(str, len);
8931  }
8932  return str;
8933 }
8934 
8935 /*
8936  * call-seq:
8937  * str.chomp!(separator=$/) -> str or nil
8938  *
8939  * Modifies <i>str</i> in place as described for String#chomp,
8940  * returning <i>str</i>, or <code>nil</code> if no modifications were
8941  * made.
8942  */
8943 
8944 static VALUE
8945 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
8946 {
8947  VALUE rs;
8948  str_modifiable(str);
8949  if (RSTRING_LEN(str) == 0) return Qnil;
8950  rs = chomp_rs(argc, argv);
8951  if (NIL_P(rs)) return Qnil;
8952  return rb_str_chomp_string(str, rs);
8953 }
8954 
8955 
8956 /*
8957  * call-seq:
8958  * str.chomp(separator=$/) -> new_str
8959  *
8960  * Returns a new String with the given record separator removed
8961  * from the end of <i>str</i> (if present). If <code>$/</code> has not been
8962  * changed from the default Ruby record separator, then <code>chomp</code> also
8963  * removes carriage return characters (that is, it will remove <code>\n</code>,
8964  * <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
8965  * it will remove all trailing newlines from the string.
8966  *
8967  * "hello".chomp #=> "hello"
8968  * "hello\n".chomp #=> "hello"
8969  * "hello\r\n".chomp #=> "hello"
8970  * "hello\n\r".chomp #=> "hello\n"
8971  * "hello\r".chomp #=> "hello"
8972  * "hello \n there".chomp #=> "hello \n there"
8973  * "hello".chomp("llo") #=> "he"
8974  * "hello\r\n\r\n".chomp('') #=> "hello"
8975  * "hello\r\n\r\r\n".chomp('') #=> "hello\r\n\r"
8976  */
8977 
8978 static VALUE
8979 rb_str_chomp(int argc, VALUE *argv, VALUE str)
8980 {
8981  VALUE rs = chomp_rs(argc, argv);
8982  if (NIL_P(rs)) return rb_str_dup(str);
8983  return rb_str_subseq(str, 0, chompped_length(str, rs));
8984 }
8985 
8986 static long
8987 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
8988 {
8989  const char *const start = s;
8990 
8991  if (!s || s >= e) return 0;
8992 
8993  /* remove spaces at head */
8994  if (single_byte_optimizable(str)) {
8995  while (s < e && ascii_isspace(*s)) s++;
8996  }
8997  else {
8998  while (s < e) {
8999  int n;
9000  unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9001 
9002  if (!rb_isspace(cc)) break;
9003  s += n;
9004  }
9005  }
9006  return s - start;
9007 }
9008 
9009 /*
9010  * call-seq:
9011  * str.lstrip! -> self or nil
9012  *
9013  * Removes leading whitespace from the receiver.
9014  * Returns the altered receiver, or +nil+ if no change was made.
9015  * See also String#rstrip! and String#strip!.
9016  *
9017  * Refer to String#strip for the definition of whitespace.
9018  *
9019  * " hello ".lstrip! #=> "hello "
9020  * "hello ".lstrip! #=> nil
9021  * "hello".lstrip! #=> nil
9022  */
9023 
9024 static VALUE
9025 rb_str_lstrip_bang(VALUE str)
9026 {
9027  rb_encoding *enc;
9028  char *start, *s;
9029  long olen, loffset;
9030 
9031  str_modify_keep_cr(str);
9032  enc = STR_ENC_GET(str);
9033  RSTRING_GETMEM(str, start, olen);
9034  loffset = lstrip_offset(str, start, start+olen, enc);
9035  if (loffset > 0) {
9036  long len = olen-loffset;
9037  s = start + loffset;
9038  memmove(start, s, len);
9039  STR_SET_LEN(str, len);
9040 #if !SHARABLE_MIDDLE_SUBSTRING
9041  TERM_FILL(start+len, rb_enc_mbminlen(enc));
9042 #endif
9043  return str;
9044  }
9045  return Qnil;
9046 }
9047 
9048 
9049 /*
9050  * call-seq:
9051  * str.lstrip -> new_str
9052  *
9053  * Returns a copy of the receiver with leading whitespace removed.
9054  * See also String#rstrip and String#strip.
9055  *
9056  * Refer to String#strip for the definition of whitespace.
9057  *
9058  * " hello ".lstrip #=> "hello "
9059  * "hello".lstrip #=> "hello"
9060  */
9061 
9062 static VALUE
9063 rb_str_lstrip(VALUE str)
9064 {
9065  char *start;
9066  long len, loffset;
9067  RSTRING_GETMEM(str, start, len);
9068  loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9069  if (loffset <= 0) return rb_str_dup(str);
9070  return rb_str_subseq(str, loffset, len - loffset);
9071 }
9072 
9073 static long
9074 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9075 {
9076  const char *t;
9077 
9078  rb_str_check_dummy_enc(enc);
9079  if (!s || s >= e) return 0;
9080  t = e;
9081 
9082  /* remove trailing spaces or '\0's */
9083  if (single_byte_optimizable(str)) {
9084  unsigned char c;
9085  while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9086  }
9087  else {
9088  char *tp;
9089 
9090  while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9091  unsigned int c = rb_enc_codepoint(tp, e, enc);
9092  if (c && !rb_isspace(c)) break;
9093  t = tp;
9094  }
9095  }
9096  return e - t;
9097 }
9098 
9099 /*
9100  * call-seq:
9101  * str.rstrip! -> self or nil
9102  *
9103  * Removes trailing whitespace from the receiver.
9104  * Returns the altered receiver, or +nil+ if no change was made.
9105  * See also String#lstrip! and String#strip!.
9106  *
9107  * Refer to String#strip for the definition of whitespace.
9108  *
9109  * " hello ".rstrip! #=> " hello"
9110  * " hello".rstrip! #=> nil
9111  * "hello".rstrip! #=> nil
9112  */
9113 
9114 static VALUE
9115 rb_str_rstrip_bang(VALUE str)
9116 {
9117  rb_encoding *enc;
9118  char *start;
9119  long olen, roffset;
9120 
9121  str_modify_keep_cr(str);
9122  enc = STR_ENC_GET(str);
9123  RSTRING_GETMEM(str, start, olen);
9124  roffset = rstrip_offset(str, start, start+olen, enc);
9125  if (roffset > 0) {
9126  long len = olen - roffset;
9127 
9128  STR_SET_LEN(str, len);
9129 #if !SHARABLE_MIDDLE_SUBSTRING
9130  TERM_FILL(start+len, rb_enc_mbminlen(enc));
9131 #endif
9132  return str;
9133  }
9134  return Qnil;
9135 }
9136 
9137 
9138 /*
9139  * call-seq:
9140  * str.rstrip -> new_str
9141  *
9142  * Returns a copy of the receiver with trailing whitespace removed.
9143  * See also String#lstrip and String#strip.
9144  *
9145  * Refer to String#strip for the definition of whitespace.
9146  *
9147  * " hello ".rstrip #=> " hello"
9148  * "hello".rstrip #=> "hello"
9149  */
9150 
9151 static VALUE
9152 rb_str_rstrip(VALUE str)
9153 {
9154  rb_encoding *enc;
9155  char *start;
9156  long olen, roffset;
9157 
9158  enc = STR_ENC_GET(str);
9159  RSTRING_GETMEM(str, start, olen);
9160  roffset = rstrip_offset(str, start, start+olen, enc);
9161 
9162  if (roffset <= 0) return rb_str_dup(str);
9163  return rb_str_subseq(str, 0, olen-roffset);
9164 }
9165 
9166 
9167 /*
9168  * call-seq:
9169  * str.strip! -> self or nil
9170  *
9171  * Removes leading and trailing whitespace from the receiver.
9172  * Returns the altered receiver, or +nil+ if there was no change.
9173  *
9174  * Refer to String#strip for the definition of whitespace.
9175  *
9176  * " hello ".strip! #=> "hello"
9177  * "hello".strip! #=> nil
9178  */
9179 
9180 static VALUE
9181 rb_str_strip_bang(VALUE str)
9182 {
9183  char *start;
9184  long olen, loffset, roffset;
9185  rb_encoding *enc;
9186 
9187  str_modify_keep_cr(str);
9188  enc = STR_ENC_GET(str);
9189  RSTRING_GETMEM(str, start, olen);
9190  loffset = lstrip_offset(str, start, start+olen, enc);
9191  roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9192 
9193  if (loffset > 0 || roffset > 0) {
9194  long len = olen-roffset;
9195  if (loffset > 0) {
9196  len -= loffset;
9197  memmove(start, start + loffset, len);
9198  }
9199  STR_SET_LEN(str, len);
9200 #if !SHARABLE_MIDDLE_SUBSTRING
9201  TERM_FILL(start+len, rb_enc_mbminlen(enc));
9202 #endif
9203  return str;
9204  }
9205  return Qnil;
9206 }
9207 
9208 
9209 /*
9210  * call-seq:
9211  * str.strip -> new_str
9212  *
9213  * Returns a copy of the receiver with leading and trailing whitespace removed.
9214  *
9215  * Whitespace is defined as any of the following characters:
9216  * null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
9217  *
9218  * " hello ".strip #=> "hello"
9219  * "\tgoodbye\r\n".strip #=> "goodbye"
9220  * "\x00\t\n\v\f\r ".strip #=> ""
9221  * "hello".strip #=> "hello"
9222  */
9223 
9224 static VALUE
9225 rb_str_strip(VALUE str)
9226 {
9227  char *start;
9228  long olen, loffset, roffset;
9229  rb_encoding *enc = STR_ENC_GET(str);
9230 
9231  RSTRING_GETMEM(str, start, olen);
9232  loffset = lstrip_offset(str, start, start+olen, enc);
9233  roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9234 
9235  if (loffset <= 0 && roffset <= 0) return rb_str_dup(str);
9236  return rb_str_subseq(str, loffset, olen-loffset-roffset);
9237 }
9238 
9239 static VALUE
9240 scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9241 {
9242  VALUE result, match;
9243  struct re_registers *regs;
9244  int i;
9245  long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9246  if (pos >= 0) {
9247  if (BUILTIN_TYPE(pat) == T_STRING) {
9248  regs = NULL;
9249  end = pos + RSTRING_LEN(pat);
9250  }
9251  else {
9252  match = rb_backref_get();
9253  regs = RMATCH_REGS(match);
9254  pos = BEG(0);
9255  end = END(0);
9256  }
9257  if (pos == end) {
9258  rb_encoding *enc = STR_ENC_GET(str);
9259  /*
9260  * Always consume at least one character of the input string
9261  */
9262  if (RSTRING_LEN(str) > end)
9263  *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9264  RSTRING_END(str), enc);
9265  else
9266  *start = end + 1;
9267  }
9268  else {
9269  *start = end;
9270  }
9271  if (!regs || regs->num_regs == 1) {
9272  result = rb_str_subseq(str, pos, end - pos);
9273  return result;
9274  }
9275  result = rb_ary_new2(regs->num_regs);
9276  for (i=1; i < regs->num_regs; i++) {
9277  VALUE s = Qnil;
9278  if (BEG(i) >= 0) {
9279  s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9280  }
9281  rb_ary_push(result, s);
9282  }
9283 
9284  return result;
9285  }
9286  return Qnil;
9287 }
9288 
9289 
9290 /*
9291  * call-seq:
9292  * str.scan(pattern) -> array
9293  * str.scan(pattern) {|match, ...| block } -> str
9294  *
9295  * Both forms iterate through <i>str</i>, matching the pattern (which may be a
9296  * Regexp or a String). For each match, a result is
9297  * generated and either added to the result array or passed to the block. If
9298  * the pattern contains no groups, each individual result consists of the
9299  * matched string, <code>$&</code>. If the pattern contains groups, each
9300  * individual result is itself an array containing one entry per group.
9301  *
9302  * a = "cruel world"
9303  * a.scan(/\w+/) #=> ["cruel", "world"]
9304  * a.scan(/.../) #=> ["cru", "el ", "wor"]
9305  * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
9306  * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
9307  *
9308  * And the block form:
9309  *
9310  * a.scan(/\w+/) {|w| print "<<#{w}>> " }
9311  * print "\n"
9312  * a.scan(/(.)(.)/) {|x,y| print y, x }
9313  * print "\n"
9314  *
9315  * <em>produces:</em>
9316  *
9317  * <<cruel>> <<world>>
9318  * rceu lowlr
9319  */
9320 
9321 static VALUE
9322 rb_str_scan(VALUE str, VALUE pat)
9323 {
9324  VALUE result;
9325  long start = 0;
9326  long last = -1, prev = 0;
9327  char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
9328 
9329  pat = get_pat_quoted(pat, 1);
9330  mustnot_broken(str);
9331  if (!rb_block_given_p()) {
9332  VALUE ary = rb_ary_new();
9333 
9334  while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
9335  last = prev;
9336  prev = start;
9337  rb_ary_push(ary, result);
9338  }
9339  if (last >= 0) rb_pat_search(pat, str, last, 1);
9340  else rb_backref_set(Qnil);
9341  return ary;
9342  }
9343 
9344  while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
9345  last = prev;
9346  prev = start;
9347  rb_yield(result);
9348  str_mod_check(str, p, len);
9349  }
9350  if (last >= 0) rb_pat_search(pat, str, last, 1);
9351  return str;
9352 }
9353 
9354 
9355 /*
9356  * call-seq:
9357  * str.hex -> integer
9358  *
9359  * Treats leading characters from <i>str</i> as a string of hexadecimal digits
9360  * (with an optional sign and an optional <code>0x</code>) and returns the
9361  * corresponding number. Zero is returned on error.
9362  *
9363  * "0x0a".hex #=> 10
9364  * "-1234".hex #=> -4660
9365  * "0".hex #=> 0
9366  * "wombat".hex #=> 0
9367  */
9368 
9369 static VALUE
9370 rb_str_hex(VALUE str)
9371 {
9372  return rb_str_to_inum(str, 16, FALSE);
9373 }
9374 
9375 
9376 /*
9377  * call-seq:
9378  * str.oct -> integer
9379  *
9380  * Treats leading characters of <i>str</i> as a string of octal digits (with an
9381  * optional sign) and returns the corresponding number. Returns 0 if the
9382  * conversion fails.
9383  *
9384  * "123".oct #=> 83
9385  * "-377".oct #=> -255
9386  * "bad".oct #=> 0
9387  * "0377bad".oct #=> 255
9388  *
9389  * If +str+ starts with <code>0</code>, radix indicators are honored.
9390  * See Kernel#Integer.
9391  */
9392 
9393 static VALUE
9394 rb_str_oct(VALUE str)
9395 {
9396  return rb_str_to_inum(str, -8, FALSE);
9397 }
9398 
9399 
9400 /*
9401  * call-seq:
9402  * str.crypt(salt_str) -> new_str
9403  *
9404  * Returns the string generated by calling <code>crypt(3)</code>
9405  * standard library function with <code>str</code> and
9406  * <code>salt_str</code>, in this order, as its arguments. Please do
9407  * not use this method any longer. It is legacy; provided only for
9408  * backward compatibility with ruby scripts in earlier days. It is
9409  * bad to use in contemporary programs for several reasons:
9410  *
9411  * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
9412  * run. The generated string lacks data portability.
9413  *
9414  * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
9415  * (i.e. silently ends up in unexpected results).
9416  *
9417  * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
9418  * thread safe.
9419  *
9420  * * So-called "traditional" usage of <code>crypt(3)</code> is very
9421  * very very weak. According to its manpage, Linux's traditional
9422  * <code>crypt(3)</code> output has only 2**56 variations; too
9423  * easy to brute force today. And this is the default behaviour.
9424  *
9425  * * In order to make things robust some OSes implement so-called
9426  * "modular" usage. To go through, you have to do a complex
9427  * build-up of the <code>salt_str</code> parameter, by hand.
9428  * Failure in generation of a proper salt string tends not to
9429  * yield any errors; typos in parameters are normally not
9430  * detectable.
9431  *
9432  * * For instance, in the following example, the second invocation
9433  * of String#crypt is wrong; it has a typo in "round=" (lacks
9434  * "s"). However the call does not fail and something unexpected
9435  * is generated.
9436  *
9437  * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
9438  * "foo".crypt("$5$round=1000$salt$") # Typo not detected
9439  *
9440  * * Even in the "modular" mode, some hash functions are considered
9441  * archaic and no longer recommended at all; for instance module
9442  * <code>$1$</code> is officially abandoned by its author: see
9443  * http://phk.freebsd.dk/sagas/md5crypt_eol.html . For another
9444  * instance module <code>$3$</code> is considered completely
9445  * broken: see the manpage of FreeBSD.
9446  *
9447  * * On some OSes such as Mac OS, there is no modular mode. Yet, as
9448  * written above, <code>crypt(3)</code> on Mac OS never fails.
9449  * This means even if you build up a proper salt string it
9450  * generates a traditional DES hash anyways, and there is no way
9451  * for you to be aware of.
9452  *
9453  * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
9454  *
9455  * If for some reason you cannot migrate to other secure contemporary
9456  * password hashing algorithms, install the string-crypt gem and
9457  * <code>require 'string/crypt'</code> to continue using it.
9458  */
9459 
9460 static VALUE
9461 rb_str_crypt(VALUE str, VALUE salt)
9462 {
9463 #ifdef HAVE_CRYPT_R
9464  VALUE databuf;
9465  struct crypt_data *data;
9466 # define CRYPT_END() ALLOCV_END(databuf)
9467 #else
9468  extern char *crypt(const char *, const char *);
9469 # define CRYPT_END() (void)0
9470 #endif
9471  VALUE result;
9472  const char *s, *saltp;
9473  char *res;
9474 #ifdef BROKEN_CRYPT
9475  char salt_8bit_clean[3];
9476 #endif
9477 
9478  StringValue(salt);
9479  mustnot_wchar(str);
9480  mustnot_wchar(salt);
9481  if (RSTRING_LEN(salt) < 2) {
9482  short_salt:
9483  rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
9484  }
9485 
9486  s = StringValueCStr(str);
9487  saltp = RSTRING_PTR(salt);
9488  if (!saltp[0] || !saltp[1]) goto short_salt;
9489 #ifdef BROKEN_CRYPT
9490  if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
9491  salt_8bit_clean[0] = saltp[0] & 0x7f;
9492  salt_8bit_clean[1] = saltp[1] & 0x7f;
9493  salt_8bit_clean[2] = '\0';
9494  saltp = salt_8bit_clean;
9495  }
9496 #endif
9497 #ifdef HAVE_CRYPT_R
9498  data = ALLOCV(databuf, sizeof(struct crypt_data));
9499 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
9500  data->initialized = 0;
9501 # endif
9502  res = crypt_r(s, saltp, data);
9503 #else
9504  res = crypt(s, saltp);
9505 #endif
9506  if (!res) {
9507  int err = errno;
9508  CRYPT_END();
9509  rb_syserr_fail(err, "crypt");
9510  }
9511  result = rb_str_new_cstr(res);
9512  CRYPT_END();
9513  return result;
9514 }
9515 
9516 
9517 /*
9518  * call-seq:
9519  * str.ord -> integer
9520  *
9521  * Returns the Integer ordinal of a one-character string.
9522  *
9523  * "a".ord #=> 97
9524  */
9525 
9526 VALUE
9528 {
9529  unsigned int c;
9530 
9532  return UINT2NUM(c);
9533 }
9534 /*
9535  * call-seq:
9536  * str.sum(n=16) -> integer
9537  *
9538  * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
9539  * where <em>n</em> is the optional Integer parameter, defaulting
9540  * to 16. The result is simply the sum of the binary value of each byte in
9541  * <i>str</i> modulo <code>2**n</code> (the sum is masked with <code>2**n - 1</code>). This is not a particularly good
9542  * checksum.
9543  */
9544 
9545 static VALUE
9546 rb_str_sum(int argc, VALUE *argv, VALUE str)
9547 {
9548  int bits = 16;
9549  char *ptr, *p, *pend;
9550  long len;
9551  VALUE sum = INT2FIX(0);
9552  unsigned long sum0 = 0;
9553 
9554  if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
9555  bits = 0;
9556  }
9557  ptr = p = RSTRING_PTR(str);
9558  len = RSTRING_LEN(str);
9559  pend = p + len;
9560 
9561  while (p < pend) {
9562  if (FIXNUM_MAX - UCHAR_MAX < sum0) {
9563  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
9564  str_mod_check(str, ptr, len);
9565  sum0 = 0;
9566  }
9567  sum0 += (unsigned char)*p;
9568  p++;
9569  }
9570 
9571  if (bits == 0) {
9572  if (sum0) {
9573  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
9574  }
9575  }
9576  else {
9577  if (sum == INT2FIX(0)) {
9578  if (bits < (int)sizeof(long)*CHAR_BIT) {
9579  sum0 &= (((unsigned long)1)<<bits)-1;
9580  }
9581  sum = LONG2FIX(sum0);
9582  }
9583  else {
9584  VALUE mod;
9585 
9586  if (sum0) {
9587  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
9588  }
9589 
9590  mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
9591  mod = rb_funcall(mod, '-', 1, INT2FIX(1));
9592  sum = rb_funcall(sum, '&', 1, mod);
9593  }
9594  }
9595  return sum;
9596 }
9597 
9598 static VALUE
9599 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
9600 {
9601  rb_encoding *enc;
9602  VALUE w;
9603  long width, len, flen = 1, fclen = 1;
9604  VALUE res;
9605  char *p;
9606  const char *f = " ";
9607  long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
9608  VALUE pad;
9609  int singlebyte = 1, cr;
9610  int termlen;
9611 
9612  rb_scan_args(argc, argv, "11", &w, &pad);
9613  enc = STR_ENC_GET(str);
9614  termlen = rb_enc_mbminlen(enc);
9615  width = NUM2LONG(w);
9616  if (argc == 2) {
9617  StringValue(pad);
9618  enc = rb_enc_check(str, pad);
9619  f = RSTRING_PTR(pad);
9620  flen = RSTRING_LEN(pad);
9621  fclen = str_strlen(pad, enc); /* rb_enc_check */
9622  singlebyte = single_byte_optimizable(pad);
9623  if (flen == 0 || fclen == 0) {
9624  rb_raise(rb_eArgError, "zero width padding");
9625  }
9626  }
9627  len = str_strlen(str, enc); /* rb_enc_check */
9628  if (width < 0 || len >= width) return rb_str_dup(str);
9629  n = width - len;
9630  llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
9631  rlen = n - llen;
9632  cr = ENC_CODERANGE(str);
9633  if (flen > 1) {
9634  llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
9635  rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
9636  }
9637  size = RSTRING_LEN(str);
9638  if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
9639  (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
9640  (len += llen2 + rlen2) >= LONG_MAX - size) {
9641  rb_raise(rb_eArgError, "argument too big");
9642  }
9643  len += size;
9644  res = str_new0(rb_obj_class(str), 0, len, termlen);
9645  p = RSTRING_PTR(res);
9646  if (flen <= 1) {
9647  memset(p, *f, llen);
9648  p += llen;
9649  }
9650  else {
9651  while (llen >= fclen) {
9652  memcpy(p,f,flen);
9653  p += flen;
9654  llen -= fclen;
9655  }
9656  if (llen > 0) {
9657  memcpy(p, f, llen2);
9658  p += llen2;
9659  }
9660  }
9661  memcpy(p, RSTRING_PTR(str), size);
9662  p += size;
9663  if (flen <= 1) {
9664  memset(p, *f, rlen);
9665  p += rlen;
9666  }
9667  else {
9668  while (rlen >= fclen) {
9669  memcpy(p,f,flen);
9670  p += flen;
9671  rlen -= fclen;
9672  }
9673  if (rlen > 0) {
9674  memcpy(p, f, rlen2);
9675  p += rlen2;
9676  }
9677  }
9678  TERM_FILL(p, termlen);
9679  STR_SET_LEN(res, p-RSTRING_PTR(res));
9680  rb_enc_associate(res, enc);
9681  if (argc == 2)
9682  cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
9683  if (cr != ENC_CODERANGE_BROKEN)
9684  ENC_CODERANGE_SET(res, cr);
9685 
9686  RB_GC_GUARD(pad);
9687  return res;
9688 }
9689 
9690 
9691 /*
9692  * call-seq:
9693  * str.ljust(integer, padstr=' ') -> new_str
9694  *
9695  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
9696  * String of length <i>integer</i> with <i>str</i> left justified
9697  * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
9698  *
9699  * "hello".ljust(4) #=> "hello"
9700  * "hello".ljust(20) #=> "hello               "
9701  * "hello".ljust(20, '1234') #=> "hello123412341234123"
9702  */
9703 
9704 static VALUE
9705 rb_str_ljust(int argc, VALUE *argv, VALUE str)
9706 {
9707  return rb_str_justify(argc, argv, str, 'l');
9708 }
9709 
9710 
9711 /*
9712  * call-seq:
9713  * str.rjust(integer, padstr=' ') -> new_str
9714  *
9715  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
9716  * String of length <i>integer</i> with <i>str</i> right justified
9717  * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
9718  *
9719  * "hello".rjust(4) #=> "hello"
9720  * "hello".rjust(20) #=> "               hello"
9721  * "hello".rjust(20, '1234') #=> "123412341234123hello"
9722  */
9723 
9724 static VALUE
9725 rb_str_rjust(int argc, VALUE *argv, VALUE str)
9726 {
9727  return rb_str_justify(argc, argv, str, 'r');
9728 }
9729 
9730 
9731 /*
9732  * call-seq:
9733  * str.center(width, padstr=' ') -> new_str
9734  *
9735  * Centers +str+ in +width+. If +width+ is greater than the length of +str+,
9736  * returns a new String of length +width+ with +str+ centered and padded with
9737  * +padstr+; otherwise, returns +str+.
9738  *
9739  * "hello".center(4) #=> "hello"
9740  * "hello".center(20) #=> "       hello        "
9741  * "hello".center(20, '123') #=> "1231231hello12312312"
9742  */
9743 
9744 static VALUE
9745 rb_str_center(int argc, VALUE *argv, VALUE str)
9746 {
9747  return rb_str_justify(argc, argv, str, 'c');
9748 }
9749 
9750 /*
9751  * call-seq:
9752  * str.partition(sep) -> [head, sep, tail]
9753  * str.partition(regexp) -> [head, match, tail]
9754  *
9755  * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
9756  * and returns the part before it, the match, and the part
9757  * after it.
9758  * If it is not found, returns <i>str</i> and two empty strings.
9759  *
9760  * "hello".partition("l") #=> ["he", "l", "lo"]
9761  * "hello".partition("x") #=> ["hello", "", ""]
9762  * "hello".partition(/.l/) #=> ["h", "el", "lo"]
9763  */
9764 
9765 static VALUE
9766 rb_str_partition(VALUE str, VALUE sep)
9767 {
9768  long pos;
9769 
9770  sep = get_pat_quoted(sep, 0);
9771  if (RB_TYPE_P(sep, T_REGEXP)) {
9772  pos = rb_reg_search(sep, str, 0, 0);
9773  if (pos < 0) {
9774  failed:
9775  return rb_ary_new3(3, rb_str_dup(str), str_new_empty(str), str_new_empty(str));
9776  }
9777  sep = rb_str_subpat(str, sep, INT2FIX(0));
9778  if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
9779  }
9780  else {
9781  pos = rb_str_index(str, sep, 0);
9782  if (pos < 0) goto failed;
9783  }
9784  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
9785  sep,
9786  rb_str_subseq(str, pos+RSTRING_LEN(sep),
9787  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
9788 }
9789 
9790 /*
9791  * call-seq:
9792  * str.rpartition(sep) -> [head, sep, tail]
9793  * str.rpartition(regexp) -> [head, match, tail]
9794  *
9795  * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
9796  * of the string, and returns the part before it, the match, and the part
9797  * after it.
9798  * If it is not found, returns two empty strings and <i>str</i>.
9799  *
9800  * "hello".rpartition("l") #=> ["hel", "l", "o"]
9801  * "hello".rpartition("x") #=> ["", "", "hello"]
9802  * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
9803  */
9804 
9805 static VALUE
9806 rb_str_rpartition(VALUE str, VALUE sep)
9807 {
9808  long pos = RSTRING_LEN(str);
9809  int regex = FALSE;
9810 
9811  if (RB_TYPE_P(sep, T_REGEXP)) {
9812  pos = rb_reg_search(sep, str, pos, 1);
9813  regex = TRUE;
9814  }
9815  else {
9816  VALUE tmp;
9817 
9818  tmp = rb_check_string_type(sep);
9819  if (NIL_P(tmp)) {
9820  rb_raise(rb_eTypeError, "type mismatch: %s given",
9821  rb_obj_classname(sep));
9822  }
9823  sep = tmp;
9824  pos = rb_str_sublen(str, pos);
9825  pos = rb_str_rindex(str, sep, pos);
9826  }
9827  if (pos < 0) {
9828  return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), rb_str_dup(str));
9829  }
9830  if (regex) {
9831  sep = rb_reg_nth_match(0, rb_backref_get());
9832  }
9833  else {
9834  pos = rb_str_offset(str, pos);
9835  }
9836  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
9837  sep,
9838  rb_str_subseq(str, pos+RSTRING_LEN(sep),
9839  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
9840 }
9841 
9842 /*
9843  * call-seq:
9844  * str.start_with?([prefixes]+) -> true or false
9845  *
9846  * Returns true if +str+ starts with one of the +prefixes+ given.
9847  * Each of the +prefixes+ should be a String or a Regexp.
9848  *
9849  * "hello".start_with?("hell") #=> true
9850  * "hello".start_with?(/H/i) #=> true
9851  *
9852  * # returns true if one of the prefixes matches.
9853  * "hello".start_with?("heaven", "hell") #=> true
9854  * "hello".start_with?("heaven", "paradise") #=> false
9855  */
9856 
9857 static VALUE
9858 rb_str_start_with(int argc, VALUE *argv, VALUE str)
9859 {
9860  int i;
9861 
9862  for (i=0; i<argc; i++) {
9863  VALUE tmp = argv[i];
9864  if (RB_TYPE_P(tmp, T_REGEXP)) {
9865  if (rb_reg_start_with_p(tmp, str))
9866  return Qtrue;
9867  }
9868  else {
9869  StringValue(tmp);
9870  rb_enc_check(str, tmp);
9871  if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
9872  if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
9873  return Qtrue;
9874  }
9875  }
9876  return Qfalse;
9877 }
9878 
9879 /*
9880  * call-seq:
9881  * str.end_with?([suffixes]+) -> true or false
9882  *
9883  * Returns true if +str+ ends with one of the +suffixes+ given.
9884  *
9885  * "hello".end_with?("ello") #=> true
9886  *
9887  * # returns true if one of the +suffixes+ matches.
9888  * "hello".end_with?("heaven", "ello") #=> true
9889  * "hello".end_with?("heaven", "paradise") #=> false
9890  */
9891 
9892 static VALUE
9893 rb_str_end_with(int argc, VALUE *argv, VALUE str)
9894 {
9895  int i;
9896  char *p, *s, *e;
9897  rb_encoding *enc;
9898 
9899  for (i=0; i<argc; i++) {
9900  VALUE tmp = argv[i];
9901  StringValue(tmp);
9902  enc = rb_enc_check(str, tmp);
9903  if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
9904  p = RSTRING_PTR(str);
9905  e = p + RSTRING_LEN(str);
9906  s = e - RSTRING_LEN(tmp);
9907  if (rb_enc_left_char_head(p, s, e, enc) != s)
9908  continue;
9909  if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
9910  return Qtrue;
9911  }
9912  return Qfalse;
9913 }
9914 
9924 static long
9925 deleted_prefix_length(VALUE str, VALUE prefix)
9926 {
9927  char *strptr, *prefixptr;
9928  long olen, prefixlen;
9929 
9930  StringValue(prefix);
9931  if (is_broken_string(prefix)) return 0;
9932  rb_enc_check(str, prefix);
9933 
9934  /* return 0 if not start with prefix */
9935  prefixlen = RSTRING_LEN(prefix);
9936  if (prefixlen <= 0) return 0;
9937  olen = RSTRING_LEN(str);
9938  if (olen < prefixlen) return 0;
9939  strptr = RSTRING_PTR(str);
9940  prefixptr = RSTRING_PTR(prefix);
9941  if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
9942 
9943  return prefixlen;
9944 }
9945 
9946 /*
9947  * call-seq:
9948  * str.delete_prefix!(prefix) -> self or nil
9949  *
9950  * Deletes leading <code>prefix</code> from <i>str</i>, returning
9951  * <code>nil</code> if no change was made.
9952  *
9953  * "hello".delete_prefix!("hel") #=> "lo"
9954  * "hello".delete_prefix!("llo") #=> nil
9955  */
9956 
9957 static VALUE
9958 rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
9959 {
9960  long prefixlen;
9961  str_modify_keep_cr(str);
9962 
9963  prefixlen = deleted_prefix_length(str, prefix);
9964  if (prefixlen <= 0) return Qnil;
9965 
9966  return rb_str_drop_bytes(str, prefixlen);
9967 }
9968 
9969 /*
9970  * call-seq:
9971  * str.delete_prefix(prefix) -> new_str
9972  *
9973  * Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
9974  *
9975  * "hello".delete_prefix("hel") #=> "lo"
9976  * "hello".delete_prefix("llo") #=> "hello"
9977  */
9978 
9979 static VALUE
9980 rb_str_delete_prefix(VALUE str, VALUE prefix)
9981 {
9982  long prefixlen;
9983 
9984  prefixlen = deleted_prefix_length(str, prefix);
9985  if (prefixlen <= 0) return rb_str_dup(str);
9986 
9987  return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
9988 }
9989 
9999 static long
10000 deleted_suffix_length(VALUE str, VALUE suffix)
10001 {
10002  char *strptr, *suffixptr, *s;
10003  long olen, suffixlen;
10004  rb_encoding *enc;
10005 
10006  StringValue(suffix);
10007  if (is_broken_string(suffix)) return 0;
10008  enc = rb_enc_check(str, suffix);
10009 
10010  /* return 0 if not start with suffix */
10011  suffixlen = RSTRING_LEN(suffix);
10012  if (suffixlen <= 0) return 0;
10013  olen = RSTRING_LEN(str);
10014  if (olen < suffixlen) return 0;
10015  strptr = RSTRING_PTR(str);
10016  suffixptr = RSTRING_PTR(suffix);
10017  s = strptr + olen - suffixlen;
10018  if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
10019  if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
10020 
10021  return suffixlen;
10022 }
10023 
10024 /*
10025  * call-seq:
10026  * str.delete_suffix!(suffix) -> self or nil
10027  *
10028  * Deletes trailing <code>suffix</code> from <i>str</i>, returning
10029  * <code>nil</code> if no change was made.
10030  *
10031  * "hello".delete_suffix!("llo") #=> "he"
10032  * "hello".delete_suffix!("hel") #=> nil
10033  */
10034 
/*
 * In-place counterpart of rb_str_delete_suffix(): shortens +str+ by the
 * byte length reported by deleted_suffix_length().  Returns Qnil when that
 * length is not positive, otherwise +str+ itself.
 *
 * NOTE(review): the embedded listing numbers jump 10047 -> 10051, so three
 * lines (including the opening of the block that line 10051 closes) are not
 * visible here -- confirm against the real source before editing.
 */
10035 static VALUE
10036 rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10037 {
10038  long olen, suffixlen, len;
 /* presumably rejects unmodifiable receivers up front -- verify against str_modifiable() */
10039  str_modifiable(str);
10040 
10041  suffixlen = deleted_suffix_length(str, suffix);
10042  if (suffixlen <= 0) return Qnil;
10043 
10044  olen = RSTRING_LEN(str);
10045  str_modify_keep_cr(str);
 /* new byte length once the suffix is cut off */
10046  len = olen - suffixlen;
10047  STR_SET_LEN(str, len);
 /* (listing lines 10048-10050 are missing here) */
10051  }
10052  return str;
10053 }
10054 
10055 /*
10056  * call-seq:
10057  * str.delete_suffix(suffix) -> new_str
10058  *
10059  * Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
10060  *
10061  * "hello".delete_suffix("llo") #=> "he"
10062  * "hello".delete_suffix("hel") #=> "hello"
10063  */
10064 
10065 static VALUE
10066 rb_str_delete_suffix(VALUE str, VALUE suffix)
10067 {
10068  long suffixlen;
10069 
10070  suffixlen = deleted_suffix_length(str, suffix);
10071  if (suffixlen <= 0) return rb_str_dup(str);
10072 
10073  return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10074 }
10075 
10076 void
10077 rb_str_setter(VALUE val, ID id, VALUE *var)
10078 {
10079  if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10080  rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10081  }
10082  *var = val;
10083 }
10084 
/*
 * Variable setter that normalizes +val+ through rb_fs_check(), warns that
 * `$;' is deprecated whenever a non-nil value is stored, and writes the
 * result through +var+.
 *
 * NOTE(review): the embedded listing numbers jump 10089 -> 10091, so the
 * line that opens the call whose arguments appear on lines 10091-10092 is
 * not visible.  Judging by the message text it reports an invalid value --
 * confirm against the real source before editing.
 */
10085 static void
10086 rb_fs_setter(VALUE val, ID id, VALUE *var)
10087 {
10088  val = rb_fs_check(val);
 /* a zero result from rb_fs_check() means the value was rejected */
10089  if (!val) {
 /* (listing line 10090 is missing here) */
10091  "value of %"PRIsVALUE" must be String or Regexp",
10092  rb_id2str(id));
10093  }
10094  if (!NIL_P(val)) {
10095  rb_warn_deprecated("`$;'", NULL);
10096  }
10097  *var = val;
10098 }
10099 
10100 
10101 /*
10102  * call-seq:
10103  * str.force_encoding(encoding) -> str
10104  *
10105  * Changes the encoding to +encoding+ and returns self.
10106  */
10107 
/*
 * NOTE(review): the embedded listing numbers jump 10111 -> 10114, so the
 * two statements between the modifiability check and the return -- where
 * +enc+ would be applied -- are not visible in this listing.  Only the
 * check and the fact that +str+ itself is returned can be confirmed here.
 */
10108 static VALUE
10109 rb_str_force_encoding(VALUE str, VALUE enc)
10110 {
10111  str_modifiable(str);
 /* (listing lines 10112-10113 are missing here) */
10114  return str;
10115 }
10116 
10117 /*
10118  * call-seq:
10119  * str.b -> str
10120  *
10121  * Returns a copied string whose encoding is ASCII-8BIT.
10122  */
10123 
10124 static VALUE
10125 rb_str_b(VALUE str)
10126 {
10127  VALUE str2 = str_alloc(rb_cString);
10128  str_replace_shared_without_enc(str2, str);
10129  ENC_CODERANGE_CLEAR(str2);
10130  return str2;
10131 }
10132 
10133 /*
10134  * call-seq:
10135  * str.valid_encoding? -> true or false
10136  *
10137  * Returns true for a string which is encoded correctly.
10138  *
10139  * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
10140  * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
10141  * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
10142  */
10143 
10144 static VALUE
10145 rb_str_valid_encoding_p(VALUE str)
10146 {
10147  int cr = rb_enc_str_coderange(str);
10148 
10149  return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
10150 }
10151 
10152 /*
10153  * call-seq:
10154  * str.ascii_only? -> true or false
10155  *
10156  * Returns true for a string which has only ASCII characters.
10157  *
10158  * "abc".force_encoding("UTF-8").ascii_only? #=> true
10159  * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
10160  */
10161 
10162 static VALUE
10163 rb_str_is_ascii_only_p(VALUE str)
10164 {
10165  int cr = rb_enc_str_coderange(str);
10166 
10167  return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
10168 }
10169 
/*
 * Shortens a string to at most +len+ characters, ending it with "..."
 * when something had to be cut.
 *
 * NOTE(review): the embedded listing numbers jump 10184 -> 10186, so the
 * line with the function name and parameter list is not visible; the body
 * uses +str+ (the string) and +len+ (a length printed with %ld).
 *
 * Outcomes visible in the body:
 *  - negative len raises IndexError;
 *  - str itself is returned when it already fits within len characters;
 *  - when len is at most 3, or stepping back 3 characters from the cut
 *    point fails, the result is just the first len bytes of "..."
 *    (transcoded for encodings that are not ASCII compatible);
 *  - otherwise the result is the leading part of str followed by "...".
 */
10184 VALUE
10186 {
10187  static const char ellipsis[] = "...";
10188  const long ellipsislen = sizeof(ellipsis) - 1;
10189  rb_encoding *const enc = rb_enc_get(str);
10190  const long blen = RSTRING_LEN(str);
10191  const char *const p = RSTRING_PTR(str), *e = p + blen;
10192  VALUE estr, ret = 0;
10193 
10194  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
 /* fits already: either by the minimum bytes-per-char bound, or because the
  * len-th character offset is the end of the string */
10195  if (len * rb_enc_mbminlen(enc) >= blen ||
10196  (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10197  ret = str;
10198  }
 /* note: the second operand also overwrites len with ellipsislen */
10199  else if (len <= ellipsislen ||
10200  !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10201  if (rb_enc_asciicompat(enc)) {
10202  ret = rb_str_new_with_class(str, ellipsis, len);
10203  rb_enc_associate(ret, enc);
10204  }
10205  else {
10206  estr = rb_usascii_str_new(ellipsis, len);
10207  ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10208  }
10209  }
 /* comma operator: ret is assigned first, then the encoding test decides
  * how the ellipsis is appended */
10210  else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10211  rb_str_cat(ret, ellipsis, ellipsislen);
10212  }
10213  else {
10214  estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10215  rb_enc_from_encoding(enc), 0, Qnil);
10216  rb_str_append(ret, estr);
10217  }
10218  return ret;
10219 }
10220 
10221 static VALUE
10222 str_compat_and_valid(VALUE str, rb_encoding *enc)
10223 {
10224  int cr;
10225  str = StringValue(str);
10226  cr = rb_enc_str_coderange(str);
10227  if (cr == ENC_CODERANGE_BROKEN) {
10228  rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10229  }
10230  else {
10231  rb_encoding *e = STR_ENC_GET(str);
10232  if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10233  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10234  rb_enc_name(enc), rb_enc_name(e));
10235  }
10236  }
10237  return str;
10238 }
10239 
/* Forward declaration: the worker shared by the two wrappers below. */
10240 static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10241 
/*
 * Scrubs +str+ under its own encoding, handing enc_str_scrub() the code
 * range currently cached on the string.
 *
 * NOTE(review): the embedded listing numbers jump 10247 -> 10249, so the
 * line with the function name and parameter list is not visible; the body
 * uses +str+ and +repl+.
 */
10247 VALUE
10249 {
10250  rb_encoding *enc = STR_ENC_GET(str);
10251  return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10252 }
10253 
/*
 * Scrubs +str+ as if it were encoded in +enc+.  The code range cached on
 * the string is reused only when +enc+ is the string's actual encoding;
 * otherwise enc_str_scrub() is told the range is unknown.
 *
 * NOTE(review): the embedded listing numbers jump 10254 -> 10256, so the
 * line with the function name and parameter list is not visible; the body
 * uses +enc+, +str+ and +repl+.
 */
10254 VALUE
10256 {
10257  int cr = ENC_CODERANGE_UNKNOWN;
10258  if (enc == STR_ENC_GET(str)) {
10259  /* cached coderange makes sense only when enc equals the
10260  * actual encoding of str */
10261  cr = ENC_CODERANGE(str);
10262  }
10263  return enc_str_scrub(enc, str, repl, cr);
10264 }
10265 
/*
 * Worker behind the scrub family: scans +str+ as +enc+ text and builds a
 * copy (buf) in which each run of invalid bytes is replaced.
 *
 *   enc  - encoding used to interpret the bytes of str
 *   str  - string to scan
 *   repl - replacement string, or nil to use the block (when one is given)
 *          or a built-in default replacement
 *   cr   - code range the caller supplies for str
 *
 * Returns Qnil when nothing has to change (cr is already clean, enc is a
 * dummy encoding, or the scan reaches the end without having started buf);
 * otherwise returns buf.  Raises ArgumentError when both a block and a
 * non-nil repl are given; replacements are validated through
 * str_compat_and_valid(), which can raise as well.
 *
 * NOTE(review): the embedded listing numbers skip 10344, 10392, 10449,
 * 10483, 10486 and 10504.  The statements that create buf, and those
 * executed just before the Qnil/buf returns in the second half, are
 * therefore not visible in this listing -- confirm against the real source
 * before changing anything near those gaps.
 */
10266 static VALUE
10267 enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
10268 {
10269  int encidx;
10270  VALUE buf = Qnil;
10271  const char *rep, *p, *e, *p1, *sp;
 /* replen: -1 = not decided yet, 0 = take replacements from the block */
10272  long replen = -1;
10273  long slen;
10274 
10275  if (rb_block_given_p()) {
10276  if (!NIL_P(repl))
10277  rb_raise(rb_eArgError, "both of block and replacement given");
10278  replen = 0;
10279  }
10280 
10281  if (ENC_CODERANGE_CLEAN_P(cr))
10282  return Qnil;
10283 
10284  if (!NIL_P(repl)) {
10285  repl = str_compat_and_valid(repl, enc);
10286  }
10287 
10288  if (rb_enc_dummy_p(enc)) {
10289  return Qnil;
10290  }
10291  encidx = rb_enc_to_index(enc);
10292 
 /* Points rep/replen at a static copy of the literal, without its NUL. */
10293 #define DEFAULT_REPLACE_CHAR(str) do { \
10294  static const char replace[sizeof(str)-1] = str; \
10295  rep = replace; replen = (int)sizeof(replace); \
10296  } while (0)
10297 
 /* p: scan cursor, p1: start of the pending valid run not yet copied,
  * sp/slen: original pointer and length for str_mod_check() after yields */
10298  slen = RSTRING_LEN(str);
10299  p = RSTRING_PTR(str);
10300  e = RSTRING_END(str);
10301  p1 = p;
10302  sp = p;
10303 
10304  if (rb_enc_asciicompat(enc)) {
10305  int rep7bit_p;
 /* choose the replacement: block (rep == NULL), explicit repl, or default */
10306  if (!replen) {
10307  rep = NULL;
10308  rep7bit_p = FALSE;
10309  }
10310  else if (!NIL_P(repl)) {
10311  rep = RSTRING_PTR(repl);
10312  replen = RSTRING_LEN(repl);
10313  rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
10314  }
10315  else if (encidx == rb_utf8_encindex()) {
10316  DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
10317  rep7bit_p = FALSE;
10318  }
10319  else {
10320  DEFAULT_REPLACE_CHAR("?");
10321  rep7bit_p = TRUE;
10322  }
 /* start from 7bit; raised to VALID once a multibyte char or a non-7bit
  * replacement goes into the result */
10323  cr = ENC_CODERANGE_7BIT;
10324 
 /* jump over the leading ASCII-only run */
10325  p = search_nonascii(p, e);
10326  if (!p) {
10327  p = e;
10328  }
10329  while (p < e) {
10330  int ret = rb_enc_precise_mbclen(p, e, enc);
10331  if (MBCLEN_NEEDMORE_P(ret)) {
 /* truncated character at the end; handled after the loop */
10332  break;
10333  }
10334  else if (MBCLEN_CHARFOUND_P(ret)) {
10335  cr = ENC_CODERANGE_VALID;
10336  p += MBCLEN_CHARFOUND_LEN(ret);
10337  }
10338  else if (MBCLEN_INVALID_P(ret)) {
10339  /*
10340  * p1~p: valid ascii/multibyte chars
10341  * p ~e: invalid bytes + unknown bytes
10342  */
10343  long clen = rb_enc_mbmaxlen(enc);
 /* (listing line 10344 is missing here; presumably buf is created on first use -- verify) */
10345  if (p > p1) {
10346  rb_str_buf_cat(buf, p1, p - p1);
10347  }
10348 
 /* work out how many bytes (clen) form this invalid run */
10349  if (e - p < clen) clen = e - p;
10350  if (clen <= 2) {
10351  clen = 1;
10352  }
10353  else {
10354  const char *q = p;
10355  clen--;
10356  for (; clen > 1; clen--) {
10357  ret = rb_enc_precise_mbclen(q, q + clen, enc);
10358  if (MBCLEN_NEEDMORE_P(ret)) break;
10359  if (MBCLEN_INVALID_P(ret)) continue;
10360  UNREACHABLE;
10361  }
10362  }
10363  if (rep) {
10364  rb_str_buf_cat(buf, rep, replen);
10365  if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10366  }
10367  else {
 /* block form: yield the invalid bytes, then re-check str and
  * validate whatever the block returned */
10368  repl = rb_yield(rb_enc_str_new(p, clen, enc));
10369  str_mod_check(str, sp, slen);
10370  repl = str_compat_and_valid(repl, enc);
10371  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
10372  if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
10373  cr = ENC_CODERANGE_VALID;
10374  }
10375  p += clen;
10376  p1 = p;
 /* skip the ASCII run that follows the replaced bytes */
10377  p = search_nonascii(p, e);
10378  if (!p) {
10379  p = e;
10380  break;
10381  }
10382  }
10383  else {
10384  UNREACHABLE;
10385  }
10386  }
10387  if (NIL_P(buf)) {
10388  if (p == e) {
 /* nothing was replaced: remember the computed code range on str */
10389  ENC_CODERANGE_SET(str, cr);
10390  return Qnil;
10391  }
 /* (listing line 10392 is missing here) */
10393  }
 /* copy the trailing valid run */
10394  if (p1 < p) {
10395  rb_str_buf_cat(buf, p1, p - p1);
10396  }
 /* bytes left over after the loop broke on a truncated character */
10397  if (p < e) {
10398  if (rep) {
10399  rb_str_buf_cat(buf, rep, replen);
10400  if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10401  }
10402  else {
10403  repl = rb_yield(rb_enc_str_new(p, e-p, enc));
10404  str_mod_check(str, sp, slen);
10405  repl = str_compat_and_valid(repl, enc);
10406  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
10407  if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
10408  cr = ENC_CODERANGE_VALID;
10409  }
10410  }
10411  }
10412  else {
10413  /* ASCII incompatible */
10414  long mbminlen = rb_enc_mbminlen(enc);
 /* choose the replacement: block, explicit repl, or a per-encoding default */
10415  if (!replen) {
10416  rep = NULL;
10417  }
10418  else if (!NIL_P(repl)) {
10419  rep = RSTRING_PTR(repl);
10420  replen = RSTRING_LEN(repl);
10421  }
10422  else if (encidx == ENCINDEX_UTF_16BE) {
10423  DEFAULT_REPLACE_CHAR("\xFF\xFD");
10424  }
10425  else if (encidx == ENCINDEX_UTF_16LE) {
10426  DEFAULT_REPLACE_CHAR("\xFD\xFF");
10427  }
10428  else if (encidx == ENCINDEX_UTF_32BE) {
10429  DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
10430  }
10431  else if (encidx == ENCINDEX_UTF_32LE) {
10432  DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
10433  }
10434  else {
10435  DEFAULT_REPLACE_CHAR("?");
10436  }
10437 
10438  while (p < e) {
10439  int ret = rb_enc_precise_mbclen(p, e, enc);
10440  if (MBCLEN_NEEDMORE_P(ret)) {
10441  break;
10442  }
10443  else if (MBCLEN_CHARFOUND_P(ret)) {
10444  p += MBCLEN_CHARFOUND_LEN(ret);
10445  }
10446  else if (MBCLEN_INVALID_P(ret)) {
10447  const char *q = p;
10448  long clen = rb_enc_mbmaxlen(enc);
 /* (listing line 10449 is missing here; presumably buf is created on first use -- verify) */
10450  if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
10451 
 /* same run-length search as above, but in steps of mbminlen bytes */
10452  if (e - p < clen) clen = e - p;
10453  if (clen <= mbminlen * 2) {
10454  clen = mbminlen;
10455  }
10456  else {
10457  clen -= mbminlen;
10458  for (; clen > mbminlen; clen-=mbminlen) {
10459  ret = rb_enc_precise_mbclen(q, q + clen, enc);
10460  if (MBCLEN_NEEDMORE_P(ret)) break;
10461  if (MBCLEN_INVALID_P(ret)) continue;
10462  UNREACHABLE;
10463  }
10464  }
10465  if (rep) {
10466  rb_str_buf_cat(buf, rep, replen);
10467  }
10468  else {
10469  repl = rb_yield(rb_enc_str_new(p, clen, enc));
10470  str_mod_check(str, sp, slen);
10471  repl = str_compat_and_valid(repl, enc);
10472  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
10473  }
10474  p += clen;
10475  p1 = p;
10476  }
10477  else {
10478  UNREACHABLE;
10479  }
10480  }
10481  if (NIL_P(buf)) {
10482  if (p == e) {
 /* (listing line 10483 is missing here) */
10484  return Qnil;
10485  }
 /* (listing line 10486 is missing here) */
10487  }
 /* copy the trailing valid run */
10488  if (p1 < p) {
10489  rb_str_buf_cat(buf, p1, p - p1);
10490  }
 /* bytes left over after the loop broke on a truncated character */
10491  if (p < e) {
10492  if (rep) {
10493  rb_str_buf_cat(buf, rep, replen);
10494  }
10495  else {
10496  repl = rb_yield(rb_enc_str_new(p, e-p, enc));
10497  str_mod_check(str, sp, slen);
10498  repl = str_compat_and_valid(repl, enc);
10499  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
10500  }
10501  }
10502  cr = ENC_CODERANGE_VALID;
10503  }
 /* (listing line 10504 is missing here) */
10505  return buf;
10506 }
10507 
10508 /*
10509  * call-seq:
10510  * str.scrub -> new_str
10511  * str.scrub(repl) -> new_str
10512  * str.scrub{|bytes|} -> new_str
10513  *
10514  * If the string contains an invalid byte sequence, replaces the invalid bytes with the given
10515  * replacement character; otherwise returns a copy of the string.
10516  * If block is given, replace invalid bytes with returned value of the block.
10517  *
10518  * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
10519  * "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
10520  * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
10521  */
10522 static VALUE
10523 str_scrub(int argc, VALUE *argv, VALUE str)
10524 {
10525  VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
10526  VALUE new = rb_str_scrub(str, repl);
10527  return NIL_P(new) ? rb_str_dup(str): new;
10528 }
10529 
10530 /*
10531  * call-seq:
10532  * str.scrub! -> str
10533  * str.scrub!(repl) -> str
10534  * str.scrub!{|bytes|} -> str
10535  *
10536  * If the string is invalid byte sequence then replace invalid bytes with given replacement
10537  * character, else returns self.
10538  * If block is given, replace invalid bytes with returned value of the block.
10539  *
10540  * "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
10541  * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
10542  * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
10543  */
10544 static VALUE
10545 str_scrub_bang(int argc, VALUE *argv, VALUE str)
10546 {
10547  VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
10548  VALUE new = rb_str_scrub(str, repl);
10549  if (!NIL_P(new)) rb_str_replace(str, new);
10550  return str;
10551 }
10552 
10553 static ID id_normalize;
10554 static ID id_normalized_p;
10555 static VALUE mUnicodeNormalize;
10556 
10557 static VALUE
10558 unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
10559 {
10560  static int UnicodeNormalizeRequired = 0;
10561  VALUE argv2[2];
10562 
10563  if (!UnicodeNormalizeRequired) {
10564  rb_require("unicode_normalize/normalize.rb");
10565  UnicodeNormalizeRequired = 1;
10566  }
10567  argv2[0] = str;
10568  if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
10569  return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
10570 }
10571 
10572 /*
10573  * call-seq:
10574  * str.unicode_normalize(form=:nfc)
10575  *
10576  * Unicode Normalization---Returns a normalized form of +str+,
10577  * using Unicode normalizations NFC, NFD, NFKC, or NFKD.
10578  * The normalization form used is determined by +form+, which can
10579  * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
10580  * The default is +:nfc+.
10581  *
10582  * If the string is not in a Unicode Encoding, then an Exception is raised.
10583  * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
10584  * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
10585  * Anything other than UTF-8 is implemented by converting to UTF-8,
10586  * which makes it slower than UTF-8.
10587  *
10588  * "a\u0300".unicode_normalize #=> "\u00E0"
10589  * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0"
10590  * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300"
10591  * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
10592  * #=> Encoding::CompatibilityError raised
10593  */
10594 static VALUE
10595 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
10596 {
10597  return unicode_normalize_common(argc, argv, str, id_normalize);
10598 }
10599 
10600 /*
10601  * call-seq:
10602  * str.unicode_normalize!(form=:nfc)
10603  *
10604  * Destructive version of String#unicode_normalize, doing Unicode
10605  * normalization in place.
10606  */
10607 static VALUE
10608 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
10609 {
10610  return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
10611 }
10612 
10613 /* call-seq:
10614  * str.unicode_normalized?(form=:nfc)
10615  *
10616  * Checks whether +str+ is in Unicode normalization form +form+,
10617  * which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
10618  * The default is +:nfc+.
10619  *
10620  * If the string is not in a Unicode Encoding, then an Exception is raised.
10621  * For details, see String#unicode_normalize.
10622  *
10623  * "a\u0300".unicode_normalized? #=> false
10624  * "a\u0300".unicode_normalized?(:nfd) #=> true
10625  * "\u00E0".unicode_normalized? #=> true
10626  * "\u00E0".unicode_normalized?(:nfd) #=> false
10627  * "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
10628  * #=> Encoding::CompatibilityError raised
10629  */
10630 static VALUE
10631 rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
10632 {
10633  return unicode_normalize_common(argc, argv, str, id_normalized_p);
10634 }
10635 
10636 /**********************************************************************
10637  * Document-class: Symbol
10638  *
10639  * Symbol objects represent names inside the Ruby interpreter. They
10640  * are generated using the <code>:name</code> and
10641  * <code>:"string"</code> literals syntax, and by the various
10642  * <code>to_sym</code> methods. The same Symbol object will be
10643  * created for a given name or string for the duration of a program's
10644  * execution, regardless of the context or meaning of that name. Thus
10645  * if <code>Fred</code> is a constant in one context, a method in
10646  * another, and a class in a third, the Symbol <code>:Fred</code>
10647  * will be the same object in all three contexts.
10648  *
10649  * module One
10650  * class Fred
10651  * end
10652  * $f1 = :Fred
10653  * end
10654  * module Two
10655  * Fred = 1
10656  * $f2 = :Fred
10657  * end
10658  * def Fred()
10659  * end
10660  * $f3 = :Fred
10661  * $f1.object_id #=> 2514190
10662  * $f2.object_id #=> 2514190
10663  * $f3.object_id #=> 2514190
10664  *
10665  */
10666 
10667 
10668 /*
10669  * call-seq:
10670  * sym == obj -> true or false
10671  *
10672  * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
10673  * symbol, returns <code>true</code>.
10674  */
10675 
10676 #define sym_equal rb_obj_equal
10677 
10678 static int
10679 sym_printable(const char *s, const char *send, rb_encoding *enc)
10680 {
10681  while (s < send) {
10682  int n;
10683  int c = rb_enc_precise_mbclen(s, send, enc);
10684 
10685  if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
10686  n = MBCLEN_CHARFOUND_LEN(c);
10687  c = rb_enc_mbc_to_codepoint(s, send, enc);
10688  if (!rb_enc_isprint(c, enc)) return FALSE;
10689  s += n;
10690  }
10691  return TRUE;
10692 }
10693 
/*
 * Predicate over a string +sym+: returns TRUE only when all of the checks
 * in the condition below pass, FALSE otherwise.
 *
 * NOTE(review): the embedded listing numbers jump 10694 -> 10696 and
 * 10699 -> 10701, so the line with the function name/parameter list and
 * one line of the declarations/setup are not visible.  +resenc+ is used
 * below without a visible declaration or first assignment -- confirm
 * against the real source before editing.
 */
10694 int
10696 {
10697  rb_encoding *enc;
10698  const char *ptr;
10699  long len;
10701 
 /* fall back to the default external encoding when resenc is still NULL */
10702  if (resenc == NULL) resenc = rb_default_external_encoding();
10703  enc = STR_ENC_GET(sym);
10704  ptr = RSTRING_PTR(sym);
10705  len = RSTRING_LEN(sym);
 /* rejected when: the encoding differs from resenc and the text is not
  * ASCII-only; the bytes contain an embedded NUL (strlen stops short of len);
  * rb_enc_symname2_p() rejects the name; or a character is not printable */
10706  if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
10707  !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
10708  return FALSE;
10709  }
10710  return TRUE;
10711 }
10712 
/*
 * Returns +str+ unchanged when it can be shown as is, otherwise its
 * rb_str_inspect() form.
 *
 * NOTE(review): the embedded listing numbers jump 10713 -> 10715 and
 * 10719 -> 10721 -> 10722, so the line with the function name/parameter
 * list and one statement ahead of the encoding lookup are not visible --
 * confirm against the real source before editing.
 */
10713 VALUE
10715 {
10716  rb_encoding *enc;
10717  const char *ptr;
10718  long len;
10719  rb_encoding *resenc;
10720 
 /* (listing line 10721 is missing here) */
 /* prefer the default internal encoding, else the default external one */
10722  resenc = rb_default_internal_encoding();
10723  if (resenc == NULL) resenc = rb_default_external_encoding();
10724  enc = STR_ENC_GET(str);
10725  ptr = RSTRING_PTR(str);
10726  len = RSTRING_LEN(str);
 /* inspect when the encoding differs from resenc and the text is not
  * ASCII-only, or when some character is not printable */
10727  if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
10728  !sym_printable(ptr, ptr + len, enc)) {
10729  return rb_str_inspect(str);
10730  }
10731  return str;
10732 }
10733 
/*
 * Looks up the name string of id with rb_id2str() and returns it unchanged
 * when rb_str_symname_p() accepts it; otherwise returns rb_str_inspect()
 * of that string.
 *
 * NOTE(review): gutter lines 10734-10735 (return type, function name and
 * parameter list) are missing from this listing; the body implies a single
 * parameter named id — confirm against the original string.c.
 */
10736 {
10737  VALUE str = rb_id2str(id);
10738  if (!rb_str_symname_p(str)) {
10739  return rb_str_inspect(str);
10740  }
10741  return str;
10742 }
10743 
10744 /*
10745  * call-seq:
10746  * sym.inspect -> string
10747  *
10748  * Returns the representation of <i>sym</i> as a symbol literal.
10749  *
10750  * :fred.inspect #=> ":fred"
10751  */
10752 
10753 static VALUE
10754 sym_inspect(VALUE sym)
10755 {
10756  VALUE str = rb_sym2str(sym);
10757  const char *ptr;
10758  long len;
10759  char *dest;
10760 
10761  if (!rb_str_symname_p(str)) {
10762  str = rb_str_inspect(str);
10763  len = RSTRING_LEN(str);
10764  rb_str_resize(str, len + 1);
10765  dest = RSTRING_PTR(str);
10766  memmove(dest + 1, dest, len);
10767  }
10768  else {
10769  rb_encoding *enc = STR_ENC_GET(str);
10771  str = rb_enc_str_new(0, len + 1, enc);
10772  dest = RSTRING_PTR(str);
10773  memcpy(dest + 1, ptr, len);
10774  }
10775  dest[0] = ':';
10776  return str;
10777 }
10778 
10779 
10780 /*
10781  * call-seq:
10782  * sym.id2name -> string
10783  * sym.to_s -> string
10784  *
10785  * Returns the name or string corresponding to <i>sym</i>.
10786  *
10787  * :fred.id2name #=> "fred"
10788  * :ginger.to_s #=> "ginger"
10789  */
10790 
10791 
10792 VALUE
10794 {
10795  return str_new_shared(rb_cString, rb_sym2str(sym));
10796 }
10797 
10798 
10799 /*
10800  * call-seq:
10801  * sym.to_sym -> sym
10802  * sym.intern -> sym
10803  *
10804  * In general, <code>to_sym</code> returns the Symbol corresponding
10805  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
10806  * in this case.
10807  */
10808 
10809 static VALUE
10810 sym_to_sym(VALUE sym)
10811 {
10812  return sym;
10813 }
10814 
/*
 * Invokes method mid on the first element of argv, forwarding the remaining
 * argc - 1 arguments together with passed_proc and kw_splat to
 * rb_funcall_with_block_kw(), and returns that call's result.
 * Raises ArgumentError ("no receiver given") when argc < 1.
 *
 * NOTE(review): gutter line 10815, which should carry the return type, is
 * missing from this listing.
 */
10816 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
10817 {
10818  VALUE obj;
10819 
10820  if (argc < 1) {
10821  rb_raise(rb_eArgError, "no receiver given");
10822  }
10823  obj = argv[0];
10824  return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
10825 }
10826 
10827 #if 0
/*
 * Never compiled (guarded by #if 0): the definition below has an empty body
 * and only accompanies the documentation comment.
 * NOTE(review): gutter line 10838, presumably the function name and
 * parameter list, is missing from this listing; "to_proc" is registered
 * with rb_sym_to_proc in the rb_define_method() calls further down in
 * this file.
 */
10828 /*
10829  * call-seq:
10830  * sym.to_proc
10831  *
10832  * Returns a _Proc_ object that calls the method named by _sym_ on its first argument.
10833  *
10834  * (1..3).collect(&:to_s) #=> ["1", "2", "3"]
10835  */
10836 
10837 VALUE
10839 {
10840 }
10841 #endif
10842 
10843 /*
10844  * call-seq:
10845  *
10846  * sym.succ
10847  *
10848  * Same as <code>sym.to_s.succ.intern</code>.
10849  */
10850 
10851 static VALUE
10852 sym_succ(VALUE sym)
10853 {
10855 }
10856 
10857 /*
10858  * call-seq:
10859  *
10860  * symbol <=> other_symbol -> -1, 0, +1, or nil
10861  *
10862  * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
10863  * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
10864  * less than, equal to, or greater than +other_symbol+.
10865  *
10866  * +nil+ is returned if the two values are incomparable.
10867  *
10868  * See String#<=> for more information.
10869  */
10870 
10871 static VALUE
10872 sym_cmp(VALUE sym, VALUE other)
10873 {
10874  if (!SYMBOL_P(other)) {
10875  return Qnil;
10876  }
10877  return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
10878 }
10879 
10880 /*
10881  * call-seq:
10882  * sym.casecmp(other_symbol) -> -1, 0, +1, or nil
10883  *
10884  * Case-insensitive version of Symbol#<=>.
10885  * Currently, case-insensitivity only works on characters A-Z/a-z,
10886  * not all of Unicode. This is different from Symbol#casecmp?.
10887  *
10888  * :aBcDeF.casecmp(:abcde) #=> 1
10889  * :aBcDeF.casecmp(:abcdef) #=> 0
10890  * :aBcDeF.casecmp(:abcdefg) #=> -1
10891  * :abcdef.casecmp(:ABCDEF) #=> 0
10892  *
10893  * +nil+ is returned if the two symbols have incompatible encodings,
10894  * or if +other_symbol+ is not a symbol.
10895  *
10896  * :foo.casecmp(2) #=> nil
10897  * "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym.casecmp(:"\u{c4 d6 dc}") #=> nil
10898  */
10899 
10900 static VALUE
10901 sym_casecmp(VALUE sym, VALUE other)
10902 {
10903  if (!SYMBOL_P(other)) {
10904  return Qnil;
10905  }
10906  return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
10907 }
10908 
10909 /*
10910  * call-seq:
10911  * sym.casecmp?(other_symbol) -> true, false, or nil
10912  *
10913  * Returns +true+ if +sym+ and +other_symbol+ are equal after
10914  * Unicode case folding, +false+ if they are not equal.
10915  *
10916  * :aBcDeF.casecmp?(:abcde) #=> false
10917  * :aBcDeF.casecmp?(:abcdef) #=> true
10918  * :aBcDeF.casecmp?(:abcdefg) #=> false
10919  * :abcdef.casecmp?(:ABCDEF) #=> true
10920  * :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true
10921  *
10922  * +nil+ is returned if the two symbols have incompatible encodings,
10923  * or if +other_symbol+ is not a symbol.
10924  *
10925  * :foo.casecmp?(2) #=> nil
10926  * "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym.casecmp?(:"\u{c4 d6 dc}") #=> nil
10927  */
10928 
10929 static VALUE
10930 sym_casecmp_p(VALUE sym, VALUE other)
10931 {
10932  if (!SYMBOL_P(other)) {
10933  return Qnil;
10934  }
10935  return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
10936 }
10937 
10938 /*
10939  * call-seq:
10940  * sym =~ obj -> integer or nil
10941  *
10942  * Returns <code>sym.to_s =~ obj</code>.
10943  */
10944 
10945 static VALUE
10946 sym_match(VALUE sym, VALUE other)
10947 {
10948  return rb_str_match(rb_sym2str(sym), other);
10949 }
10950 
10951 /*
10952  * call-seq:
10953  * sym.match(pattern) -> matchdata or nil
10954  * sym.match(pattern, pos) -> matchdata or nil
10955  *
10956  * Returns <code>sym.to_s.match</code>.
10957  */
10958 
10959 static VALUE
10960 sym_match_m(int argc, VALUE *argv, VALUE sym)
10961 {
10962  return rb_str_match_m(argc, argv, rb_sym2str(sym));
10963 }
10964 
10965 /*
10966  * call-seq:
10967  * sym.match?(pattern) -> true or false
10968  * sym.match?(pattern, pos) -> true or false
10969  *
10970  * Returns <code>sym.to_s.match?</code>.
10971  */
10972 
10973 static VALUE
10974 sym_match_m_p(int argc, VALUE *argv, VALUE sym)
10975 {
10976  return rb_str_match_m_p(argc, argv, sym);
10977 }
10978 
10979 /*
10980  * call-seq:
10981  * sym[idx] -> char
10982  * sym[b, n] -> string
10983  * sym.slice(idx) -> char
10984  * sym.slice(b, n) -> string
10985  *
10986  * Returns <code>sym.to_s[]</code>.
10987  */
10988 
10989 static VALUE
10990 sym_aref(int argc, VALUE *argv, VALUE sym)
10991 {
10992  return rb_str_aref_m(argc, argv, rb_sym2str(sym));
10993 }
10994 
10995 /*
10996  * call-seq:
10997  * sym.length -> integer
10998  * sym.size -> integer
10999  *
11000  * Same as <code>sym.to_s.length</code>.
11001  */
11002 
11003 static VALUE
11004 sym_length(VALUE sym)
11005 {
11006  return rb_str_length(rb_sym2str(sym));
11007 }
11008 
11009 /*
11010  * call-seq:
11011  * sym.empty? -> true or false
11012  *
11013  * Returns whether _sym_ is :"" or not.
11014  */
11015 
11016 static VALUE
11017 sym_empty(VALUE sym)
11018 {
11019  return rb_str_empty(rb_sym2str(sym));
11020 }
11021 
11022 /*
11023  * call-seq:
11024  * sym.upcase -> symbol
11025  * sym.upcase([options]) -> symbol
11026  *
11027  * Same as <code>sym.to_s.upcase.intern</code>.
11028  */
11029 
11030 static VALUE
11031 sym_upcase(int argc, VALUE *argv, VALUE sym)
11032 {
11033  return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11034 }
11035 
11036 /*
11037  * call-seq:
11038  * sym.downcase -> symbol
11039  * sym.downcase([options]) -> symbol
11040  *
11041  * Same as <code>sym.to_s.downcase.intern</code>.
11042  */
11043 
11044 static VALUE
11045 sym_downcase(int argc, VALUE *argv, VALUE sym)
11046 {
11047  return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11048 }
11049 
11050 /*
11051  * call-seq:
11052  * sym.capitalize -> symbol
11053  * sym.capitalize([options]) -> symbol
11054  *
11055  * Same as <code>sym.to_s.capitalize.intern</code>.
11056  */
11057 
11058 static VALUE
11059 sym_capitalize(int argc, VALUE *argv, VALUE sym)
11060 {
11061  return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11062 }
11063 
11064 /*
11065  * call-seq:
11066  * sym.swapcase -> symbol
11067  * sym.swapcase([options]) -> symbol
11068  *
11069  * Same as <code>sym.to_s.swapcase.intern</code>.
11070  */
11071 
11072 static VALUE
11073 sym_swapcase(int argc, VALUE *argv, VALUE sym)
11074 {
11075  return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11076 }
11077 
11078 /*
11079  * call-seq:
11080  * sym.start_with?([prefixes]+) -> true or false
11081  *
11082  * Returns true if +sym+ starts with one of the +prefixes+ given.
11083  * Each of the +prefixes+ should be a String or a Regexp.
11084  *
11085  * :hello.start_with?("hell") #=> true
11086  * :hello.start_with?(/H/i) #=> true
11087  *
11088  * # returns true if one of the prefixes matches.
11089  * :hello.start_with?("heaven", "hell") #=> true
11090  * :hello.start_with?("heaven", "paradise") #=> false
11091  */
11092 
11093 static VALUE
11094 sym_start_with(int argc, VALUE *argv, VALUE sym)
11095 {
11096  return rb_str_start_with(argc, argv, rb_sym2str(sym));
11097 }
11098 
11099 /*
11100  * call-seq:
11101  * sym.end_with?([suffixes]+) -> true or false
11102  *
11103  * Returns true if +sym+ ends with one of the +suffixes+ given.
11104  *
11105  * :hello.end_with?("ello") #=> true
11106  *
11107  * # returns true if one of the +suffixes+ matches.
11108  * :hello.end_with?("heaven", "ello") #=> true
11109  * :hello.end_with?("heaven", "paradise") #=> false
11110  */
11111 
11112 static VALUE
11113 sym_end_with(int argc, VALUE *argv, VALUE sym)
11114 {
11115  return rb_str_end_with(argc, argv, rb_sym2str(sym));
11116 }
11117 
11118 /*
11119  * call-seq:
11120  * sym.encoding -> encoding
11121  *
11122  * Returns the Encoding object that represents the encoding of _sym_.
11123  */
11124 
11125 static VALUE
11126 sym_encoding(VALUE sym)
11127 {
11128  return rb_obj_encoding(rb_sym2str(sym));
11129 }
11130 
11131 static VALUE
11132 string_for_symbol(VALUE name)
11133 {
11134  if (!RB_TYPE_P(name, T_STRING)) {
11136  if (NIL_P(tmp)) {
11137  rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11138  name);
11139  }
11140  name = tmp;
11141  }
11142  return name;
11143 }
11144 
/*
 * Converts name to an ID: a Symbol yields SYM2ID(name) directly; anything
 * else is first passed through string_for_symbol() and the resulting
 * string is interned with rb_intern_str().
 *
 * NOTE(review): gutter line 11146 (function name and parameter list) is
 * missing from this listing; the body implies one parameter called name —
 * confirm against the original string.c.
 */
11145 ID
11147 {
11148  if (SYMBOL_P(name)) {
11149  return SYM2ID(name);
11150  }
11151  name = string_for_symbol(name);
11152  return rb_intern_str(name);
11153 }
11154 
11155 VALUE
11157 {
11158  if (SYMBOL_P(name)) {
11159  return name;
11160  }
11161  name = string_for_symbol(name);
11162  return rb_str_intern(name);
11163 }
11164 
11165 /*
11166  * call-seq:
11167  * Symbol.all_symbols -> array
11168  *
11169  * Returns an array of all the symbols currently in Ruby's symbol
11170  * table.
11171  *
11172  * Symbol.all_symbols.size #=> 903
11173  * Symbol.all_symbols[1,20] #=> [:floor, :ARGV, :Binding, :symlink,
11174  * :chown, :EOFError, :$;, :String,
11175  * :LOCK_SH, :"setuid?", :$<,
11176  * :default_proc, :compact, :extend,
11177  * :Tms, :getwd, :$=, :ThreadGroup,
11178  * :wait2, :$>]
11179  */
11180 
11181 static VALUE
11182 sym_all_symbols(VALUE _)
11183 {
11184  return rb_sym_all_symbols();
11185 }
11186 
11187 /*
11188  * A String object holds and manipulates an arbitrary sequence of
11189  * bytes, typically representing characters. String objects may be created
11190  * using String::new or as literals.
11191  *
11192  * Because of aliasing issues, users of strings should be aware of the methods
11193  * that modify the contents of a String object. Typically,
11194  * methods with names ending in ``!'' modify their receiver, while those
11195  * without a ``!'' return a new String. However, there are
11196  * exceptions, such as String#[]=.
11197  *
11198  */
11199 
/*
 * Creates the String and Symbol classes and binds their Ruby-visible method
 * names to the C functions of this file.
 *
 * In each rb_define_method()/rb_define_singleton_method() call the last
 * argument is the handler's argument count; e.g. sym_upcase, registered
 * with -1, is declared as (int argc, VALUE *argv, VALUE sym), while
 * sym_length, registered with 0, takes only the receiver.
 *
 * NOTE(review): this listing has dropped lines — the gutter numbers skip
 * 11201 (presumably the function name), 11207, 11209, 11215-11216,
 * 11221-11222, 11228, 11234, 11236, 11258, 11287, 11292, 11367,
 * 11370-11372, 11375-11376 and 11378 — so some registrations present in
 * the original string.c are not shown here.
 */
11200 void
11202 {
       /* From this point to the end of the file, rb_intern(str) expands to
        * rb_intern_const(str). */
11203 #undef rb_intern
11204 #define rb_intern(str) rb_intern_const(str)
11205 
11206  rb_cString = rb_define_class("String", rb_cObject);
       /* Visit every entry of the fstring table with fstring_set_class_i,
        * handing it rb_cString (presumably to give strings created before
        * the class existed their class — TODO confirm). */
11208  st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
11210  rb_define_alloc_func(rb_cString, empty_str_alloc);
11211  rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
11212  rb_define_method(rb_cString, "initialize", rb_str_init, -1);
11213  rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
11214  rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
11217  rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
11218  rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
11219  rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
11220  rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
11223  rb_define_method(rb_cString, "%", rb_str_format_m, 1);
11224  rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
11225  rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
11226  rb_define_method(rb_cString, "insert", rb_str_insert, 2);
11227  rb_define_method(rb_cString, "length", rb_str_length, 0);
11229  rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
11230  rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
11231  rb_define_method(rb_cString, "=~", rb_str_match, 1);
11232  rb_define_method(rb_cString, "match", rb_str_match_m, -1);
11233  rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
11235  rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
11237  rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
11238  rb_define_method(rb_cString, "upto", rb_str_upto, -1);
11239  rb_define_method(rb_cString, "index", rb_str_index_m, -1);
11240  rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
11241  rb_define_method(rb_cString, "replace", rb_str_replace, 1);
11242  rb_define_method(rb_cString, "clear", rb_str_clear, 0);
11243  rb_define_method(rb_cString, "chr", rb_str_chr, 0);
11244  rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
11245  rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
11246  rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
11247  rb_define_method(rb_cString, "scrub", str_scrub, -1);
11248  rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
11249  rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
11250  rb_define_method(rb_cString, "+@", str_uplus, 0);
11251  rb_define_method(rb_cString, "-@", str_uminus, 0);
11252 
11253  rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
11254  rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
11255  rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
11256  rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
11257  rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
11259  rb_define_method(rb_cString, "undump", str_undump, 0);
11260 
       /* Symbols :ascii, :turkic, :lithuanian and :fold are interned once
        * here and kept in file-level variables. */
11261  sym_ascii = ID2SYM(rb_intern("ascii"));
11262  sym_turkic = ID2SYM(rb_intern("turkic"));
11263  sym_lithuanian = ID2SYM(rb_intern("lithuanian"));
11264  sym_fold = ID2SYM(rb_intern("fold"));
11265 
11266  rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
11267  rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
11268  rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
11269  rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
11270 
11271  rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
11272  rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
11273  rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
11274  rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
11275 
11276  rb_define_method(rb_cString, "hex", rb_str_hex, 0);
11277  rb_define_method(rb_cString, "oct", rb_str_oct, 0);
11278  rb_define_method(rb_cString, "split", rb_str_split_m, -1);
11279  rb_define_method(rb_cString, "lines", rb_str_lines, -1);
11280  rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
11281  rb_define_method(rb_cString, "chars", rb_str_chars, 0);
11282  rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
11283  rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
11284  rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
11285  rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
11286  rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
11288  rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
11289  rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
11290  rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
11291  rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
11293 
11294  rb_define_method(rb_cString, "include?", rb_str_include, 1);
11295  rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
11296  rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
11297 
11298  rb_define_method(rb_cString, "scan", rb_str_scan, 1);
11299 
11300  rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
11301  rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
11302  rb_define_method(rb_cString, "center", rb_str_center, -1);
11303 
11304  rb_define_method(rb_cString, "sub", rb_str_sub, -1);
11305  rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
11306  rb_define_method(rb_cString, "chop", rb_str_chop, 0);
11307  rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
11308  rb_define_method(rb_cString, "strip", rb_str_strip, 0);
11309  rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
11310  rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
11311  rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
11312  rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
11313 
11314  rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
11315  rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
11316  rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
11317  rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
11318  rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
11319  rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
11320  rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
11321  rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
11322  rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
11323 
11324  rb_define_method(rb_cString, "tr", rb_str_tr, 2);
11325  rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
11326  rb_define_method(rb_cString, "delete", rb_str_delete, -1);
11327  rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
11328  rb_define_method(rb_cString, "count", rb_str_count, -1);
11329 
11330  rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
11331  rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
11332  rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
11333  rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
11334 
11335  rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
11336  rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
11337  rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
11338  rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
11339  rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
11340 
11341  rb_define_method(rb_cString, "sum", rb_str_sum, -1);
11342 
11343  rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
11344  rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
11345 
11346  rb_define_method(rb_cString, "partition", rb_str_partition, 1);
11347  rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
11348 
11349  rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
11350  rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
11351  rb_define_method(rb_cString, "b", rb_str_b, 0);
11352  rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
11353  rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
11354 
11355  /* define UnicodeNormalize module here so that we don't have to look it up */
11356  mUnicodeNormalize = rb_define_module("UnicodeNormalize");
11357  id_normalize = rb_intern("normalize");
11358  id_normalized_p = rb_intern("normalized?");
11359 
11360  rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
11361  rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
11362  rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
11363 
       /* rb_fs starts out as nil; the global variables $; and $-F are both
        * bound to it, with rb_fs_setter installed as their setter. */
11364  rb_fs = Qnil;
11365  rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
11366  rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
11368 
       /* Symbol: class definition, then its singleton and instance methods. */
11369  rb_cSymbol = rb_define_class("Symbol", rb_cObject);
11373  rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
11374 
11377  rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
11379  rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
11380  rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
11381  rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
11382  rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0);
11383  rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
11384  rb_define_method(rb_cSymbol, "next", sym_succ, 0);
11385 
11386  rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
11387  rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
11388  rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
11389  rb_define_method(rb_cSymbol, "=~", sym_match, 1);
11390 
11391  rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
11392  rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
11393  rb_define_method(rb_cSymbol, "length", sym_length, 0);
11394  rb_define_method(rb_cSymbol, "size", sym_length, 0);
11395  rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
11396  rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
11397  rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
11398 
11399  rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
11400  rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
11401  rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
11402  rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
11403 
11404  rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
11405  rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
11406 
11407  rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
11408 }
STR_EMBEDDABLE_P
#define STR_EMBEDDABLE_P(len, termlen)
Definition: string.c:181
memset
void * memset(void *, int, size_t)
RUBY_MAX_CHAR_LEN
#define RUBY_MAX_CHAR_LEN
Definition: string.c:86
mapping_buffer
Definition: string.c:6493
rb_str_shared_replace
void rb_str_shared_replace(VALUE str, VALUE str2)
Definition: string.c:1391
rb_match_busy
void rb_match_busy(VALUE)
Definition: re.c:1287
OnigDefaultSyntax
ONIG_EXTERN const OnigSyntaxType * OnigDefaultSyntax
Definition: onigmo.h:515
RMATCH_REGS
#define RMATCH_REGS(obj)
Definition: re.h:51
rb_get_kwargs
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Definition: class.c:1886
UNLIKELY
#define UNLIKELY(x)
Definition: ffi_common.h:126
ISASCII
#define ISASCII(c)
Definition: ruby.h:2304
ID
unsigned long ID
Definition: ruby.h:103
rb_str_plus
VALUE rb_str_plus(VALUE str1, VALUE str2)
Definition: string.c:1894
ruby_xfree
void ruby_xfree(void *x)
Definition: gc.c:10170
rb_define_class
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition: class.c:649
rb_str_scrub
VALUE rb_str_scrub(VALUE str, VALUE repl)
Definition: string.c:10248
rb_str_offset
long rb_str_offset(VALUE str, long pos)
Definition: string.c:2416
ONIGENC_CTYPE_ALPHA
#define ONIGENC_CTYPE_ALPHA
Definition: onigmo.h:295
lesser
#define lesser(a, b)
Definition: string.c:3200
Check_Type
#define Check_Type(v, t)
Definition: ruby.h:595
rb_enc_isascii
#define rb_enc_isascii(c, enc)
Definition: encoding.h:230
TRUE
#define TRUE
Definition: nkf.h:175
rb_check_convert_type_with_id
VALUE rb_check_convert_type_with_id(VALUE, int, const char *, ID)
Definition: object.c:2957
RSTRING_GETMEM
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Definition: ruby.h:1018
SIZEOF_VOIDP
#define SIZEOF_VOIDP
Definition: rb_mjit_min_header-2.7.2.h:90
rb_enc_find_index2
int rb_enc_find_index2(const char *name, long len)
Definition: encoding.c:717
rb_enc_unicode_p
int rb_enc_unicode_p(rb_encoding *enc)
Definition: encoding.c:521
rb_reg_start_with_p
bool rb_reg_start_with_p(VALUE re, VALUE str)
Definition: re.c:1618
ONIGENC_CASE_MODIFIED
#define ONIGENC_CASE_MODIFIED
Definition: onigmo.h:119
RB_DEBUG_COUNTER_INC_IF
#define RB_DEBUG_COUNTER_INC_IF(type, cond)
Definition: debug_counter.h:377
rb_include_module
void rb_include_module(VALUE klass, VALUE module)
Definition: class.c:869
rb_enc_strlen
long rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
Definition: string.c:1740
rb_str_buf_new_cstr
VALUE rb_str_buf_new_cstr(const char *ptr)
Definition: string.c:1331
onig_error_code_to_str
ONIG_EXTERN int onig_error_code_to_str(OnigUChar *s, OnigPosition err_code,...)
rb_default_rs
RUBY_EXTERN VALUE rb_default_rs
Definition: intern.h:586
rb_enc_name
#define rb_enc_name(enc)
Definition: encoding.h:177
rb_filesystem_encoding
rb_encoding * rb_filesystem_encoding(void)
Definition: encoding.c:1387
assert
#define assert(x)
Definition: dlmalloc.c:1176
rb_enc_mbc_to_codepoint
#define rb_enc_mbc_to_codepoint(p, e, enc)
Definition: encoding.h:208
RSTRING_EMBED_LEN_MASK
@ RSTRING_EMBED_LEN_MASK
Definition: ruby.h:979
range
#define range(low, item, hi)
Definition: date_strftime.c:21
LONG_MAX
#define LONG_MAX
Definition: ruby.h:220
ENCINDEX_UTF_8
#define ENCINDEX_UTF_8
Definition: encindex.h:43
rb_filesystem_str_new_cstr
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Definition: string.c:1117
NEIGHBOR_NOT_CHAR
@ NEIGHBOR_NOT_CHAR
Definition: string.c:3880
STR_SET_EMBED
#define STR_SET_EMBED(str)
Definition: string.c:97
rb_enc_codepoint
#define rb_enc_codepoint(p, e, enc)
Definition: encoding.h:207
ENCODING_SET_INLINED
#define ENCODING_SET_INLINED(obj, i)
Definition: encoding.h:59
rb_hash_new
VALUE rb_hash_new(void)
Definition: hash.c:1523
rb_enc_mbclen
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1020
rb_str_new_cstr
VALUE rb_str_new_cstr(const char *ptr)
Definition: string.c:808
rb_str_subpos
char * rb_str_subpos(VALUE str, long beg, long *lenp)
Definition: string.c:2497
rb_reg_match_p
VALUE rb_reg_match_p(VALUE re, VALUE str, long pos)
Definition: re.c:3340
rb_enc_mbcput
#define rb_enc_mbcput(c, buf, enc)
Definition: encoding.h:217
rb_str_memsize
RUBY_FUNC_EXPORTED size_t rb_str_memsize(VALUE str)
Definition: string.c:1371
rb_str_buf_cat2
#define rb_str_buf_cat2
Definition: intern.h:911
rb_warn
void rb_warn(const char *fmt,...)
Definition: error.c:315
ENC_CODERANGE_VALID
#define ENC_CODERANGE_VALID
Definition: encoding.h:105
rb_block_given_p
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition: eval.c:898
RUBY_FL_FREEZE
@ RUBY_FL_FREEZE
Definition: ruby.h:851
ISDIGIT
#define ISDIGIT(c)
Definition: ruby.h:2312
rb_warning
void rb_warning(const char *fmt,...)
Definition: error.c:336
ONIGENC_CASE_ASCII_ONLY
#define ONIGENC_CASE_ASCII_ONLY
Definition: onigmo.h:125
gc.h
int
__inline__ int
Definition: rb_mjit_min_header-2.7.2.h:2877
RBASIC_CLEAR_CLASS
#define RBASIC_CLEAR_CLASS(obj)
Definition: internal.h:1987
ST_STOP
@ ST_STOP
Definition: st.h:99
rb_str_subseq
VALUE rb_str_subseq(VALUE str, long beg, long len)
Definition: string.c:2474
RSTRING_EMBED_LEN
#define RSTRING_EMBED_LEN(str)
Definition: ruby.h:1002
TERM_FILL
#define TERM_FILL(ptr, termlen)
Definition: string.c:125
rb_reg_search
long rb_reg_search(VALUE, VALUE, long, int)
Definition: re.c:1612
rb_str_concat
VALUE rb_str_concat(VALUE str1, VALUE str2)
Definition: string.c:3065
STR_SET_NOEMBED
#define STR_SET_NOEMBED(str)
Definition: string.c:93
RESIZE_CAPA_TERM
#define RESIZE_CAPA_TERM(str, capacity, termlen)
Definition: string.c:137
rb_num_to_uint
int rb_num_to_uint(VALUE val, unsigned int *ret)
Definition: numeric.c:244
rb_str_escape
VALUE rb_str_escape(VALUE str)
Definition: string.c:5866
rb_gc_force_recycle
void rb_gc_force_recycle(VALUE obj)
Definition: gc.c:7014
INT2FIX
#define INT2FIX(i)
Definition: ruby.h:263
rb_to_symbol
VALUE rb_to_symbol(VALUE name)
Definition: string.c:11156
ENCINDEX_UTF_16BE
#define ENCINDEX_UTF_16BE
Definition: encindex.h:45
bp
#define bp()
Definition: internal.h:1445
rp
#define rp(obj)
Definition: internal.h:1435
neighbor_char
neighbor_char
Definition: string.c:3879
STR_TMPLOCK
#define STR_TMPLOCK
Definition: string.c:89
rb_external_str_new
VALUE rb_external_str_new(const char *ptr, long len)
Definition: string.c:1087
RSTRING_PTR
#define RSTRING_PTR(str)
Definition: ruby.h:1009
re.h
i
uint32_t i
Definition: rb_mjit_min_header-2.7.2.h:5499
NUM2LONG
#define NUM2LONG(x)
Definition: ruby.h:679
tr
Definition: string.c:6989
rb_str_buf_cat_escaped_char
int rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
Definition: string.c:5815
rb_str_cmp
int rb_str_cmp(VALUE str1, VALUE str2)
Definition: string.c:3228
ENCODING_IS_ASCII8BIT
#define ENCODING_IS_ASCII8BIT(obj)
Definition: encoding.h:63
rb_backref_set_string
void rb_backref_set_string(VALUE string, long pos, long len)
Definition: re.c:1340
rb_hash_aref
VALUE rb_hash_aref(VALUE hash, VALUE key)
Definition: hash.c:2037
rb_usascii_str_new
VALUE rb_usascii_str_new(const char *ptr, long len)
Definition: string.c:780
ONIGENC_CODE_TO_MBC_MAXLEN
#define ONIGENC_CODE_TO_MBC_MAXLEN
Definition: onigmo.h:289
rb_str_buf_append
VALUE rb_str_buf_append(VALUE str, VALUE str2)
Definition: string.c:2950
rb_str_setter
void rb_str_setter(VALUE val, ID id, VALUE *var)
Definition: string.c:10077
rb_str_fill_terminator
char * rb_str_fill_terminator(VALUE str, const int newminlen)
Definition: string.c:2306
TERM_LEN
#define TERM_LEN(str)
Definition: string.c:124
rb_equal
VALUE rb_equal(VALUE, VALUE)
Same as Object#===, case equality.
Definition: object.c:124
rb_utf8_encindex
int rb_utf8_encindex(void)
Definition: encoding.c:1334
FL_FREEZE
#define FL_FREEZE
Definition: ruby.h:1287
rb_locale_encoding
rb_encoding * rb_locale_encoding(void)
Definition: encoding.c:1372
rb_default_external_encoding
rb_encoding * rb_default_external_encoding(void)
Definition: encoding.c:1427
rb_enc_sprintf
VALUE rb_enc_sprintf(rb_encoding *enc, const char *format,...)
Definition: sprintf.c:1178
VALUE
unsigned long VALUE
Definition: ruby.h:102
long
#define long
Definition: rb_mjit_min_header-2.7.2.h:2921
rb_str_dup
VALUE rb_str_dup(VALUE str)
Definition: string.c:1516
rb_obj_encoding
VALUE rb_obj_encoding(VALUE obj)
Definition: encoding.c:1004
rb_eArgError
VALUE rb_eArgError
Definition: error.c:925
encoding.h
RSTRING_EMBED_LEN_MAX
@ RSTRING_EMBED_LEN_MAX
Definition: ruby.h:982
offsetof
#define offsetof(p_type, field)
Definition: addrinfo.h:186
st_delete
int st_delete(st_table *tab, st_data_t *key, st_data_t *value)
Definition: st.c:1418
RB_TYPE_P
#define RB_TYPE_P(obj, type)
Definition: ruby.h:560
rb_intern_const
#define rb_intern_const(str)
Definition: ruby.h:1879
STR_HEAP_PTR
#define STR_HEAP_PTR(str)
Definition: string.c:167
SHARABLE_SUBSTRING_P
#define SHARABLE_SUBSTRING_P(beg, len, end)
Definition: string.c:176
rb_fstring_new
MJIT_FUNC_EXPORTED VALUE rb_fstring_new(const char *ptr, long len)
Definition: string.c:396
STR_BORROWED
#define STR_BORROWED
Definition: string.c:88
idEqTilde
@ idEqTilde
Definition: id.h:103
rb_enc_get
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:872
rb_enc_asciicompat
#define rb_enc_asciicompat(enc)
Definition: encoding.h:245
rb_enc_check
rb_encoding * rb_enc_check(VALUE str1, VALUE str2)
Definition: encoding.c:891
rb_enc_precise_mbclen
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1032
rb_string_value_cstr
char * rb_string_value_cstr(volatile VALUE *ptr)
Definition: string.c:2291
BARE_STRING_P
#define BARE_STRING_P(str)
Definition: string.c:265
RSTRING_LENINT
#define RSTRING_LENINT(str)
Definition: ruby.h:1017
RUBY_DTRACE_CREATE_HOOK
#define RUBY_DTRACE_CREATE_HOOK(name, arg)
Definition: internal.h:2590
ONIGENC_CASE_TITLECASE
#define ONIGENC_CASE_TITLECASE
Definition: onigmo.h:115
rb_define_module
VALUE rb_define_module(const char *name)
Definition: class.c:772
rb_invcmp
VALUE rb_invcmp(VALUE x, VALUE y)
Definition: compar.c:47
SPLIT_STR
#define SPLIT_STR(beg, len)
id.h
STR_NOFREE
#define STR_NOFREE
Definition: string.c:90
UINT2NUM
#define UINT2NUM(x)
Definition: ruby.h:1610
STR_ENC_GET
#define STR_ENC_GET(str)
Definition: string.c:170
rb_reg_regsub
VALUE rb_reg_regsub(VALUE, VALUE, struct re_registers *, VALUE)
Definition: re.c:3776
rb_eIndexError
VALUE rb_eIndexError
Definition: error.c:926
ISALPHA
#define ISALPHA(c)
Definition: ruby.h:2311
rb_str_cat2
#define rb_str_cat2
Definition: intern.h:912
rb_str_succ
VALUE rb_str_succ(VALUE orig)
Definition: string.c:4090
rb_str_new_static
VALUE rb_str_new_static(const char *ptr, long len)
Definition: string.c:872
rb_str_include_range_p
VALUE rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
Definition: string.c:4398
Qundef
#define Qundef
Definition: ruby.h:470
ONIGENC_MBCLEN_CHARFOUND_P
#define ONIGENC_MBCLEN_CHARFOUND_P(r)
Definition: onigmo.h:346
rb_str_drop_bytes
VALUE rb_str_drop_bytes(VALUE str, long len)
Definition: string.c:4573
rb_define_singleton_method
void rb_define_singleton_method(VALUE obj, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a singleton method for obj.
Definition: class.c:1755
rb_econv_t
Definition: transcode.c:111
CHAR_BIT
#define CHAR_BIT
Definition: ruby.h:227
rb_enc_fast_mbclen
int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1014
rb_str_to_dbl
double rb_str_to_dbl(VALUE, int)
Parses a string representation of a floating point number.
Definition: object.c:3371
rb_str_new_frozen
VALUE rb_str_new_frozen(VALUE orig)
Definition: string.c:1203
rb_define_method
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Definition: class.c:1551
rb_str_buf_new
VALUE rb_str_buf_new(long capa)
Definition: string.c:1315
ENC_CODERANGE_SET
#define ENC_CODERANGE_SET(obj, cr)
Definition: encoding.h:110
rb_enc_dummy_p
int rb_enc_dummy_p(rb_encoding *enc)
Definition: encoding.c:131
INT2NUM
#define INT2NUM(x)
Definition: ruby.h:1609
rb_econv_convert
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Definition: transcode.c:1429
RString::shared
VALUE shared
Definition: ruby.h:996
idLE
@ idLE
Definition: id.h:93
ptr
struct RIMemo * ptr
Definition: debug.c:65
rb_int_and
VALUE rb_int_and(VALUE x, VALUE y)
Definition: numeric.c:4467
rb_str_cat_cstr
VALUE rb_str_cat_cstr(VALUE str, const char *ptr)
Definition: string.c:2822
UCHAR_MAX
#define UCHAR_MAX
Definition: rb_mjit_min_header-2.7.2.h:4073
Qfalse
#define Qfalse
Definition: ruby.h:467
ENC_CODERANGE_CLEAR
#define ENC_CODERANGE_CLEAR(obj)
Definition: encoding.h:111
rb_fs
VALUE rb_fs
Definition: string.c:452
uintptr_t
unsigned int uintptr_t
Definition: win32.h:106
RETURN_SIZED_ENUMERATOR
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
Definition: intern.h:271
STR_BUF_MIN_SIZE
#define STR_BUF_MIN_SIZE
Definition: string.c:1311
DBL2NUM
#define DBL2NUM(dbl)
Definition: ruby.h:967
NEIGHBOR_WRAPPED
@ NEIGHBOR_WRAPPED
Definition: string.c:3882
CASE_MAPPING_ADDITIONAL_LENGTH
#define CASE_MAPPING_ADDITIONAL_LENGTH
Definition: string.c:6487
rb_external_str_with_enc
VALUE rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
Definition: string.c:1074
rb_enc_right_char_head
#define rb_enc_right_char_head(s, p, e, enc)
Definition: encoding.h:223
rb_enc_mbmaxlen
#define rb_enc_mbmaxlen(enc)
Definition: encoding.h:181
rb_str_ord
VALUE rb_str_ord(VALUE s)
Definition: string.c:9527
rb_reg_backref_number
int rb_reg_backref_number(VALUE match, VALUE backref)
Definition: re.c:1173
rb_id2str
#define rb_id2str(id)
Definition: vm_backtrace.c:30
ONIGENC_CTYPE_DIGIT
#define ONIGENC_CTYPE_DIGIT
Definition: onigmo.h:298
dp
#define dp(v)
Definition: vm_debug.h:21
SPECIAL_CONST_P
#define SPECIAL_CONST_P(x)
Definition: ruby.h:1313
rb_ary_new3
#define rb_ary_new3
Definition: intern.h:104
NULL
#define NULL
Definition: _sdbm.c:101
ST_DELETE
@ ST_DELETE
Definition: st.h:99
rb_obj_as_string
VALUE rb_obj_as_string(VALUE obj)
Definition: string.c:1440
END
#define END(no)
Definition: string.c:26
rb_sym_all_symbols
VALUE rb_sym_all_symbols(void)
Definition: symbol.c:840
rb_str_ellipsize
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition: string.c:10185
FL_TEST
#define FL_TEST(x, f)
Definition: ruby.h:1353
FL_WB_PROTECTED
#define FL_WB_PROTECTED
Definition: ruby.h:1279
fmt
const VALUE int int int int int int VALUE char * fmt
Definition: rb_mjit_min_header-2.7.2.h:6497
PRIsVALUE
#define PRIsVALUE
Definition: ruby.h:166
rb_str_buf_cat_ascii
VALUE rb_str_buf_cat_ascii(VALUE str, const char *ptr)
Definition: string.c:2926
RBASIC_SET_CLASS
#define RBASIC_SET_CLASS(obj, cls)
Definition: internal.h:1989
rb_intern
#define rb_intern(str)
rb_id_quote_unprintable
MJIT_FUNC_EXPORTED VALUE rb_id_quote_unprintable(ID id)
Definition: string.c:10735
rb_enc_from_encoding
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:116
last
unsigned int last
Definition: nkf.c:4324
rb_str_cat_conv_enc_opts
VALUE rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len, rb_encoding *from, int ecflags, VALUE ecopts)
Definition: string.c:943
ONIGENC_CASE_UPCASE
#define ONIGENC_CASE_UPCASE
Definition: onigmo.h:113
FL_SET
#define FL_SET(x, f)
Definition: ruby.h:1359
mapping_buffer::space
OnigUChar space[FLEX_ARY_LEN]
Definition: string.c:6497
rb_fatal
void rb_fatal(const char *fmt,...)
Definition: error.c:2722
rb_str_tmp_frozen_release
void rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
Definition: string.c:1217
onig_match
ONIG_EXTERN OnigPosition onig_match(OnigRegex, const OnigUChar *str, const OnigUChar *end, const OnigUChar *at, OnigRegion *region, OnigOptionType option)
OnigUChar
unsigned char OnigUChar
Definition: onigmo.h:79
FIX2LONG
#define FIX2LONG(x)
Definition: ruby.h:394
ID2SYM
#define ID2SYM(x)
Definition: ruby.h:414
rb_funcall_with_block_kw
VALUE rb_funcall_with_block_kw(VALUE, ID, int, const VALUE *, VALUE, int)
Definition: vm_eval.c:1060
strlen
size_t strlen(const char *)
OBJ_FREEZE
#define OBJ_FREEZE(x)
Definition: ruby.h:1377
rb_obj_as_string_result
MJIT_FUNC_EXPORTED VALUE rb_obj_as_string_result(VALUE str, VALUE obj)
Definition: string.c:1452
rb_str_locktmp
VALUE rb_str_locktmp(VALUE)
rb_ascii8bit_encindex
int rb_ascii8bit_encindex(void)
Definition: encoding.c:1322
OnigEncodingTypeST::name
const char * name
Definition: onigmo.h:162
rb_str_chomp_string
VALUE rb_str_chomp_string(VALUE str, VALUE rs)
Definition: string.c:8921
rb_str_locktmp_ensure
RUBY_FUNC_EXPORTED VALUE rb_str_locktmp_ensure(VALUE str, VALUE(*func)(VALUE), VALUE arg)
Definition: string.c:2685
RString::ary
char ary[RSTRING_EMBED_LEN_MAX+1]
Definition: ruby.h:999
rb_string_value
VALUE rb_string_value(volatile VALUE *ptr)
Definition: string.c:2175
STR_NOEMBED
#define STR_NOEMBED
Definition: internal.h:2161
L
#define L(x)
Definition: asm.h:125
rb_respond_to
int rb_respond_to(VALUE, ID)
Definition: vm_method.c:2190
ascii_isspace
#define ascii_isspace(c)
Definition: string.c:7808
rb_memsearch
long rb_memsearch(const void *, long, const void *, long, rb_encoding *)
Definition: re.c:239
rb_undef_method
void rb_undef_method(VALUE klass, const char *name)
Definition: class.c:1575
rb_check_arity
#define rb_check_arity
Definition: intern.h:347
RUBY_FUNC_EXPORTED
#define RUBY_FUNC_EXPORTED
Definition: defines.h:391
rb_memhash
st_index_t rb_memhash(const void *ptr, long len)
Definition: random.c:1440
rb_reg_search0
long rb_reg_search0(VALUE, VALUE, long, int, int)
Definition: re.c:1530
RARRAY_LENINT
#define RARRAY_LENINT(ary)
Definition: ruby.h:1071
rb_str_capacity
size_t rb_str_capacity(VALUE str)
Definition: string.c:712
ALLOC_N
#define ALLOC_N(type, n)
Definition: ruby.h:1663
MBCLEN_INVALID_P
#define MBCLEN_INVALID_P(ret)
Definition: encoding.h:193
STR_SHARED_P
#define STR_SHARED_P(s)
Definition: internal.h:2164
rb_fstring
RUBY_FUNC_EXPORTED VALUE rb_fstring(VALUE str)
Definition: string.c:312
void
void
Definition: rb_mjit_min_header-2.7.2.h:13321
UNLIMITED_ARGUMENTS
#define UNLIMITED_ARGUMENTS
Definition: intern.h:57
crypt.h
crypt_data
Definition: crypt.h:230
STR_SET_SHARED
#define STR_SET_SHARED(str, shared_str)
Definition: string.c:157
rb_raise
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:2671
rb_id_encoding
ID rb_id_encoding(void)
Definition: encoding.c:759
CASEMAP_DEBUG
#define CASEMAP_DEBUG
Definition: string.c:6489
rb_str_tmp_frozen_acquire
VALUE rb_str_tmp_frozen_acquire(VALUE orig)
Definition: string.c:1210
STR_SHARED
#define STR_SHARED
Definition: internal.h:2162
rb_eRangeError
VALUE rb_eRangeError
Definition: error.c:928
rb_str_times
VALUE rb_str_times(VALUE str, VALUE times)
Definition: string.c:1966
rb_str_to_inum
VALUE rb_str_to_inum(VALUE str, int base, int badcheck)
Definition: bignum.c:4268
rb_usascii_str_new_cstr
VALUE rb_usascii_str_new_cstr(const char *ptr)
Definition: string.c:820
rb_str_splice
#define rb_str_splice(str, beg, len, val)
Definition: string.c:4687
rb_fstring_cstr
VALUE rb_fstring_cstr(const char *ptr)
Definition: string.c:410
LONG2NUM
#define LONG2NUM(x)
Definition: ruby.h:1644
crypt
RUBY_EXTERN char * crypt(const char *, const char *)
rb_enc_str_new_static
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:890
idTo_s
@ idTo_s
Definition: rb_mjit_min_header-2.7.2.h:8763
rb_obj_class
VALUE rb_obj_class(VALUE)
Equivalent to Object#class in Ruby.
Definition: object.c:217
rb_check_string_type
VALUE rb_check_string_type(VALUE str)
Definition: string.c:2314
ONIGENC_MBCLEN_CHARFOUND_LEN
#define ONIGENC_MBCLEN_CHARFOUND_LEN(r)
Definition: onigmo.h:347
probes.h
rb_fstring_enc_new
VALUE rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:403
rb_str_substr
VALUE rb_str_substr(VALUE str, long beg, long len)
Definition: string.c:2584
rb_str_format
VALUE rb_str_format(int, const VALUE *, VALUE)
Definition: sprintf.c:204
rb_enc_get_index
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:779
PRIuSIZE
#define PRIuSIZE
Definition: ruby.h:208
RESIZE_CAPA
#define RESIZE_CAPA(str, capacity)
Definition: string.c:133
idTo_str
@ idTo_str
Definition: rb_mjit_min_header-2.7.2.h:8757
rb_str_opt_plus
MJIT_FUNC_EXPORTED VALUE rb_str_opt_plus(VALUE str1, VALUE str2)
Definition: string.c:1925
rb_syserr_fail
void rb_syserr_fail(int e, const char *mesg)
Definition: error.c:2783
snprintf
int snprintf(char *__restrict, size_t, const char *__restrict,...) __attribute__((__format__(__printf__
ONIG_OPTION_DEFAULT
#define ONIG_OPTION_DEFAULT
Definition: onigmo.h:447
rb_ascii8bit_encoding
rb_encoding * rb_ascii8bit_encoding(void)
Definition: encoding.c:1316
DEFAULT_REPLACE_CHAR
#define DEFAULT_REPLACE_CHAR(str)
ENC_CODERANGE_MASK
#define ENC_CODERANGE_MASK
Definition: encoding.h:102
rb_econv_open_opts
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Definition: transcode.c:2561
ONIGENC_IS_ALLOWED_REVERSE_MATCH
#define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, s, end)
Definition: onigmo.h:334
DATA_PTR
#define DATA_PTR(dta)
Definition: ruby.h:1175
MBCLEN_CHARFOUND_LEN
#define MBCLEN_CHARFOUND_LEN(ret)
Definition: encoding.h:192
klass
VALUE klass
Definition: rb_mjit_min_header-2.7.2.h:13302
rb_cEncodingConverter
VALUE rb_cEncodingConverter
Definition: transcode.c:25
LIKELY
#define LIKELY(x)
Definition: ffi_common.h:125
CRYPT_END
#define CRYPT_END()
rb_check_frozen
#define rb_check_frozen(obj)
Definition: intern.h:319
rb_tainted_str_new_cstr
VALUE rb_tainted_str_new_cstr(const char *ptr)
Definition: string.c:903
idLTLT
@ idLTLT
Definition: id.h:90
rb_str_intern
VALUE rb_str_intern(VALUE)
Definition: symbol.c:710
MIN_PRE_ALLOC_SIZE
#define MIN_PRE_ALLOC_SIZE
Definition: string.c:2971
ENCINDEX_UTF_16
#define ENCINDEX_UTF_16
Definition: encindex.h:49
rb_enc_left_char_head
#define rb_enc_left_char_head(s, p, e, enc)
Definition: encoding.h:222
rb_str_new_with_class
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Definition: string.c:1298
BEG
#define BEG(no)
Definition: string.c:25
mapping_buffer::capa
size_t capa
Definition: string.c:6494
rb_sym_to_proc
VALUE rb_sym_to_proc(VALUE sym)
Definition: proc.c:1312
rb_sym_proc_call
MJIT_FUNC_EXPORTED VALUE rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
Definition: string.c:10816
rb_enc_str_new
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:796
rb_str_upto_each
VALUE rb_str_upto_each(VALUE beg, VALUE end, int excl, int(*each)(VALUE, VALUE), VALUE arg)
Definition: string.c:4263
rb_fstring_hash_type
const struct st_hash_type rb_fstring_hash_type
Definition: string.c:260
rb_setup_fake_str
VALUE rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
Definition: string.c:385
rb_convert_type_with_id
VALUE rb_convert_type_with_id(VALUE, int, const char *, ID)
Definition: object.c:2914
rb_enc_from_index
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:609
CHAR_ESC_LEN
#define CHAR_ESC_LEN
Definition: string.c:5812
ENCINDEX_UTF_32
#define ENCINDEX_UTF_32
Definition: encindex.h:50
rb_str_replace
VALUE rb_str_replace(VALUE str, VALUE str2)
Definition: string.c:5363
str_buf_cat2
#define str_buf_cat2(str, ptr)
Definition: string.c:2809
tr::max
unsigned int max
Definition: string.c:6991
FL_EXIVAR
#define FL_EXIVAR
Definition: ruby.h:1286
ONIGENC_CASE_DOWNCASE
#define ONIGENC_CASE_DOWNCASE
Definition: onigmo.h:114
rb_str_resize
VALUE rb_str_resize(VALUE str, long len)
Definition: string.c:2709
rb_cSymbol
VALUE rb_cSymbol
Definition: string.c:67
rb_match_unbusy
void rb_match_unbusy(VALUE)
Definition: re.c:1293
OnigEncodingTypeST
Definition: onigmo.h:160
sym
#define sym(x)
Definition: date_core.c:3716
ONIGENC_CODE_TO_MBCLEN
#define ONIGENC_CODE_TO_MBCLEN(enc, code)
Definition: onigmo.h:367
st_data_t
RUBY_SYMBOL_EXPORT_BEGIN typedef unsigned long st_data_t
Definition: st.h:22
rb_to_id
ID rb_to_id(VALUE name)
Definition: string.c:11146
ONIGENC_CASE_FOLD_LITHUANIAN
#define ONIGENC_CASE_FOLD_LITHUANIAN
Definition: onigmo.h:124
T_REGEXP
#define T_REGEXP
Definition: ruby.h:529
FL_UNSET
#define FL_UNSET(x, f)
Definition: ruby.h:1361
rb_hash_lookup
VALUE rb_hash_lookup(VALUE hash, VALUE key)
Definition: hash.c:2063
memchr
void * memchr(const void *, int, size_t)
rb_ary_push
VALUE rb_ary_push(VALUE ary, VALUE item)
Definition: array.c:1195
rb_enc_check_str
rb_encoding * rb_enc_check_str(VALUE str1, VALUE str2)
Definition: encoding.c:880
CHECK_IF_ASCII
#define CHECK_IF_ASCII(c)
st_index_t
st_data_t st_index_t
Definition: st.h:50
st_hash_type
Definition: st.h:61
ENC_CODERANGE_AND
#define ENC_CODERANGE_AND(a, b)
Definition: encoding.h:112
rb_obj_freeze
VALUE rb_obj_freeze(VALUE)
Make the object unmodifiable.
Definition: object.c:1080
rb_enc_str_scrub
VALUE rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
Definition: string.c:10255
rb_enc_mbminlen
#define rb_enc_mbminlen(enc)
Definition: encoding.h:180
rb_enc_to_index
int rb_enc_to_index(rb_encoding *enc)
Definition: encoding.c:125
TypedData_Wrap_Struct
#define TypedData_Wrap_Struct(klass, data_type, sval)
Definition: ruby.h:1231
rb_str_modify
void rb_str_modify(VALUE str)
Definition: string.c:2114
ONIGENC_CASE_FOLD_TURKISH_AZERI
#define ONIGENC_CASE_FOLD_TURKISH_AZERI
Definition: onigmo.h:122
rb_eTypeError
VALUE rb_eTypeError
Definition: error.c:924
rb_require
VALUE rb_require(const char *)
Definition: load.c:1117
RBASIC_CLASS
#define RBASIC_CLASS(obj)
Definition: ruby.h:906
SIZED_REALLOC_N
#define SIZED_REALLOC_N(var, type, n, old_n)
Definition: internal.h:1663
rb_str_split
VALUE rb_str_split(VALUE str, const char *sep0)
Definition: string.c:8116
rb_enc_get_from_index
rb_encoding * rb_enc_get_from_index(int index)
Definition: encoding.c:618
rb_filesystem_str_new
VALUE rb_filesystem_str_new(const char *ptr, long len)
Definition: string.c:1111
ENCODING_CODERANGE_SET
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Definition: encoding.h:113
rb_eRuntimeError
VALUE rb_eRuntimeError
Definition: error.c:922
rb_obj_frozen_p
VALUE rb_obj_frozen_p(VALUE obj)
Determines if the object is frozen.
Definition: object.c:1099
rb_str_new_shared
VALUE rb_str_new_shared(VALUE str)
Definition: string.c:1197
rb_str_tmp_new
VALUE rb_str_tmp_new(long len)
Definition: string.c:1343
ALLOCA_N
#define ALLOCA_N(type, n)
Definition: ruby.h:1684
RETURN_ENUMERATOR
#define RETURN_ENUMERATOR(obj, argc, argv)
Definition: intern.h:279
OnigPosition
ptrdiff_t OnigPosition
Definition: onigmo.h:83
rb_enc_set_index
void rb_enc_set_index(VALUE obj, int idx)
Definition: encoding.c:830
mod
#define mod(x, y)
Definition: date_strftime.c:28
rb_str_dup_frozen
#define rb_str_dup_frozen
rb_utf8_str_new
VALUE rb_utf8_str_new(const char *ptr, long len)
Definition: string.c:788
ruby_sized_xfree
#define ruby_sized_xfree(ptr, size)
Definition: rb_mjit_min_header-2.7.2.h:7408
rb_str_export
VALUE rb_str_export(VALUE str)
Definition: string.c:1123
rb_enc_strlen_cr
long rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
Definition: string.c:1749
rb_str_upto_endless_each
VALUE rb_str_upto_endless_each(VALUE beg, int(*each)(VALUE, VALUE), VALUE arg)
Definition: string.c:4345
rb_enc_is_newline
#define rb_enc_is_newline(p, end, enc)
Definition: encoding.h:227
rb_str_unlocktmp
VALUE rb_str_unlocktmp(VALUE str)
Definition: string.c:2675
rb_enc_copy
void rb_enc_copy(VALUE obj1, VALUE obj2)
Definition: encoding.c:990
size
int size
Definition: encoding.c:58
rb_enc_compatible
rb_encoding * rb_enc_compatible(VALUE str1, VALUE str2)
Definition: encoding.c:974
rb_str_resurrect
VALUE rb_str_resurrect(VALUE str)
Definition: string.c:1522
tr::pend
char * pend
Definition: string.c:6992
FALSE
#define FALSE
Definition: nkf.h:174
rb_backref_get
VALUE rb_backref_get(void)
Definition: vm.c:1304
FIXNUM_P
#define FIXNUM_P(f)
Definition: ruby.h:396
ENCINDEX_UTF_32LE
#define ENCINDEX_UTF_32LE
Definition: encindex.h:48
RString
Definition: ruby.h:988
rb_str_concat_literals
MJIT_FUNC_EXPORTED VALUE rb_str_concat_literals(size_t num, const VALUE *strary)
Definition: string.c:2974
rb_to_int
VALUE rb_to_int(VALUE)
Converts val into Integer.
Definition: object.c:3021
STR_SHARED_ROOT
#define STR_SHARED_ROOT
Definition: string.c:87
RSTRING_FSTR
@ RSTRING_FSTR
Definition: ruby.h:983
arg
VALUE arg
Definition: rb_mjit_min_header-2.7.2.h:5636
MBCLEN_NEEDMORE_P
#define MBCLEN_NEEDMORE_P(ret)
Definition: encoding.h:194
USTR
unsigned char * USTR
Definition: string.c:6987
MEMZERO
#define MEMZERO(p, type, n)
Definition: ruby.h:1752
rb_str_append
VALUE rb_str_append(VALUE str, VALUE str2)
Definition: string.c:2965
FL_SET_RAW
#define FL_SET_RAW(x, f)
Definition: ruby.h:1358
ISSPACE
#define ISSPACE(c)
Definition: ruby.h:2307
rb_gc_register_address
void rb_gc_register_address(VALUE *addr)
Definition: gc.c:7080
memcmp
int memcmp(const void *s1, const void *s2, size_t len)
Definition: memcmp.c:7
STR_HEAP_SIZE
#define STR_HEAP_SIZE(str)
Definition: string.c:168
IS_EVSTR
#define IS_EVSTR(p, e)
Definition: string.c:6023
rb_error_arity
MJIT_STATIC void rb_error_arity(int argc, int min, int max)
Definition: vm_insnhelper.c:387
rb_backref_set
void rb_backref_set(VALUE)
Definition: vm.c:1310
rb_default_internal_encoding
rb_encoding * rb_default_internal_encoding(void)
Definition: encoding.c:1512
RB_OBJ_WRITE
#define RB_OBJ_WRITE(a, slot, b)
Definition: ruby.h:1508
ONIGERR_TOO_BIG_WIDE_CHAR_VALUE
#define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE
Definition: onigmo.h:691
rb_str_hash_cmp
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Definition: string.c:3173
CONST_ID
#define CONST_ID(var, str)
Definition: ruby.h:1841
st_update
int st_update(st_table *tab, st_data_t key, st_update_callback_func *func, st_data_t arg)
Definition: st.c:1510
rb_str_conv_enc_opts
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Definition: string.c:914
ENCINDEX_UTF_16LE
#define ENCINDEX_UTF_16LE
Definition: encindex.h:46
StringValueCStr
#define StringValueCStr(v)
Definition: ruby.h:604
rb_str_equal
VALUE rb_str_equal(VALUE str1, VALUE str2)
Definition: string.c:3267
RString::heap
struct RString::@2::@3 heap
ENC_CODERANGE_BROKEN
#define ENC_CODERANGE_BROKEN
Definition: encoding.h:106
rb_check_array_type
VALUE rb_check_array_type(VALUE ary)
Definition: array.c:909
TR_TABLE_SIZE
#define TR_TABLE_SIZE
Definition: string.c:7357
rb_econv_close
void rb_econv_close(rb_econv_t *ec)
Definition: transcode.c:1685
key
key
Definition: openssl_missing.h:181
rb_external_str_new_cstr
VALUE rb_external_str_new_cstr(const char *ptr)
Definition: string.c:1093
RString::basic
struct RBasic basic
Definition: ruby.h:989
rb_scan_args
#define rb_scan_args(argc, argvp, fmt,...)
Definition: rb_mjit_min_header-2.7.2.h:6407
rb_enc_str_new_cstr
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Definition: string.c:836
ZALLOC_N
#define ZALLOC_N(type, n)
Definition: ruby.h:1665
rb_str_to_cstr
char * rb_str_to_cstr(VALUE str)
Definition: string.c:2284
ENUM_ELEM
#define ENUM_ELEM(ary, e)
Definition: string.c:8140
rb_locale_str_new_cstr
VALUE rb_locale_str_new_cstr(const char *ptr)
Definition: string.c:1105
CLASS_OF
#define CLASS_OF(v)
Definition: ruby.h:484
WANTARRAY
#define WANTARRAY(m, size)
Definition: string.c:8125
NEIGHBOR_FOUND
@ NEIGHBOR_FOUND
Definition: string.c:3881
crypt_r
char * crypt_r(const char *key, const char *setting, struct crypt_data *data)
Definition: crypt.c:396
ruby::backward::cxxanyargs::rb_define_hooked_variable
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Define a function-backended global variable.
Definition: cxxanyargs.hpp:106
MBCLEN_CHARFOUND_P
#define MBCLEN_CHARFOUND_P(ret)
Definition: encoding.h:191
dup
int dup(int __fildes)
rb_str_hash
st_index_t rb_str_hash(VALUE str)
Definition: string.c:3163
rb_str_conv_enc
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Definition: string.c:1030
st_foreach
int st_foreach(st_table *tab, st_foreach_callback_func *func, st_data_t arg)
Definition: st.c:1718
ENCODING_MASK
#define ENCODING_MASK
Definition: encoding.h:42
rb_check_hash_type
VALUE rb_check_hash_type(VALUE hash)
Definition: hash.c:1852
FL_TEST_RAW
#define FL_TEST_RAW(x, f)
Definition: ruby.h:1352
rb_str_comparable
int rb_str_comparable(VALUE str1, VALUE str2)
Definition: string.c:3203
rb_cObject
RUBY_EXTERN VALUE rb_cObject
Definition: ruby.h:2010
rb_ary_new2
#define rb_ary_new2
Definition: intern.h:103
rb_locale_str_new
VALUE rb_locale_str_new(const char *ptr, long len)
Definition: string.c:1099
buf
unsigned char buf[MIME_BUF_SIZE]
Definition: nkf.c:4322
rb_usascii_encindex
int rb_usascii_encindex(void)
Definition: encoding.c:1346
FL_UNSET_RAW
#define FL_UNSET_RAW(x, f)
Definition: ruby.h:1360
rb_str_update
void rb_str_update(VALUE str, long beg, long len, VALUE val)
Definition: string.c:4643
rb_exc_raise
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:668
obj
const VALUE VALUE obj
Definition: rb_mjit_min_header-2.7.2.h:5777
re_pattern_buffer
Definition: onigmo.h:755
RGENGC_WB_PROTECTED_STRING
#define RGENGC_WB_PROTECTED_STRING
Definition: ruby.h:811
rb_vm_fstring_table
st_table * rb_vm_fstring_table(void)
Definition: vm.c:3392
ISPRINT
#define ISPRINT(c)
Definition: ruby.h:2305
rb_bug
void rb_bug(const char *fmt,...)
Definition: error.c:636
scan_hex
#define scan_hex(s, l, e)
Definition: util.h:55
rb_enc_symname2_p
int rb_enc_symname2_p(const char *, long, rb_encoding *)
Definition: symbol.c:339
rb_reg_regcomp
VALUE rb_reg_regcomp(VALUE)
Definition: re.c:2970
internal.h
rb_to_encoding
rb_encoding * rb_to_encoding(VALUE enc)
Definition: encoding.c:245
UChar
#define UChar
Definition: onigmo.h:76
memrchr
void * memrchr(const void *, int, size_t)
argv
char ** argv
Definition: ruby.c:223
f
#define f
rb_must_asciicompat
void rb_must_asciicompat(VALUE str)
Definition: string.c:2166
ONIGERR_INVALID_CODE_POINT_VALUE
#define ONIGERR_INVALID_CODE_POINT_VALUE
Definition: onigmo.h:689
mapping_buffer::used
size_t used
Definition: string.c:6495
ST_CONTINUE
@ ST_CONTINUE
Definition: st.h:99
xmalloc
#define xmalloc
Definition: defines.h:211
rb_str_sublen
long rb_str_sublen(VALUE str, long pos)
Definition: string.c:2463
is_ascii_string
#define is_ascii_string(str)
Definition: internal.h:2165
UNREACHABLE
#define UNREACHABLE
Definition: ruby.h:63
ENCODING_GET
#define ENCODING_GET(obj)
Definition: encoding.h:62
rb_warn_deprecated
void rb_warn_deprecated(const char *fmt, const char *suggest,...)
Definition: error.c:366
ONIG_MAX_ERROR_MESSAGE_LEN
#define ONIG_MAX_ERROR_MESSAGE_LEN
Definition: onigmo.h:443
rb_range_beg_len
VALUE rb_range_beg_len(VALUE, long *, long *, long, int)
Definition: range.c:1278
rb_objspace_garbage_object_p
int rb_objspace_garbage_object_p(VALUE obj)
Definition: gc.c:3607
StringValue
use StringValue() instead")))
rb_str_new
VALUE rb_str_new(const char *ptr, long len)
Definition: string.c:774
rb_utf8_encoding
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1328
tr::gen
int gen
Definition: string.c:6990
onigenc_ascii_only_case_map
ONIG_EXTERN int onigenc_ascii_only_case_map(OnigCaseFoldType *flagP, const OnigUChar **pp, const OnigUChar *end, OnigUChar *to, OnigUChar *to_end, const struct OnigEncodingTypeST *enc)
Definition: regenc.c:955
is_broken_string
#define is_broken_string(str)
Definition: internal.h:2166
rb_obj_alloc
VALUE rb_obj_alloc(VALUE)
Allocates an instance of klass.
Definition: object.c:1895
str
char str[HTML_ESCAPE_MAX_LEN+1]
Definition: escape.c:18
UNALIGNED_WORD_ACCESS
#define UNALIGNED_WORD_ACCESS
Definition: defines.h:492
rb_str_inspect
VALUE rb_str_inspect(VALUE str)
Definition: string.c:5930
RUBY_ALIAS_FUNCTION
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen,(str))
Definition: string.c:2661
src
__inline__ const void *__restrict src
Definition: rb_mjit_min_header-2.7.2.h:2874
rb_str_export_locale
VALUE rb_str_export_locale(VALUE str)
Definition: string.c:1129
rb_mComparable
VALUE rb_mComparable
Definition: compar.c:16
memcpy
void * memcpy(void *__restrict, const void *__restrict, size_t)
RARRAY_CONST_PTR
#define RARRAY_CONST_PTR(s)
Definition: psych_emitter.c:4
debug_counter.h
ENC_CODERANGE_7BIT
#define ENC_CODERANGE_7BIT
Definition: encoding.h:104
MEMCPY
#define MEMCPY(p1, p2, type, n)
Definition: ruby.h:1753
RSTRING
#define RSTRING(obj)
Definition: ruby.h:1271
rb_enc_ascget
int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
Definition: encoding.c:1044
econv_finished
@ econv_finished
Definition: encoding.h:302
rb_enc_str_buf_cat
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
Definition: string.c:2919
rb_hash_aset
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Definition: hash.c:2852
rb_cString
VALUE rb_cString
Definition: string.c:66
RString::capa
long capa
Definition: ruby.h:995
NIL_P
#define NIL_P(v)
Definition: ruby.h:482
OBJ_FREEZE_RAW
#define OBJ_FREEZE_RAW(x)
Definition: ruby.h:1376
rb_funcall
#define rb_funcall(recv, mid, argc,...)
Definition: rb_mjit_min_header-2.7.2.h:6620
OnigEncodingTypeST::case_map
int(* case_map)(OnigCaseFoldType *flagP, const OnigUChar **pp, const OnigUChar *end, OnigUChar *to, OnigUChar *to_end, const struct OnigEncodingTypeST *enc)
Definition: onigmo.h:177
onig_new
ONIG_EXTERN int onig_new(OnigRegex *, const OnigUChar *pattern, const OnigUChar *pattern_end, OnigOptionType option, OnigEncoding enc, const OnigSyntaxType *syntax, OnigErrorInfo *einfo)
memmove
#define memmove(dst, src, len)
Definition: rb_mjit_min_header-2.7.2.h:2886
argc
int argc
Definition: ruby.c:222
rb_str_initialize
VALUE rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
Definition: string.c:964
STR_EMBED_P
#define STR_EMBED_P(str)
Definition: internal.h:2163
ENCODING_GET_INLINED
#define ENCODING_GET_INLINED(obj)
Definition: encoding.h:61
re_registers
Definition: onigmo.h:716
rb_obj_classname
const char * rb_obj_classname(VALUE)
Definition: variable.c:289
rb_string_value_ptr
char * rb_string_value_ptr(volatile VALUE *ptr)
Definition: string.c:2186
rb_utf8_str_new_cstr
VALUE rb_utf8_str_new_cstr(const char *ptr)
Definition: string.c:828
tr::p
char * p
Definition: string.c:6992
err
int err
Definition: win32.c:135
encindex.h
Init_String
void Init_String(void)
Definition: string.c:11201
rb_data_type_struct
Definition: ruby.h:1148
BUILTIN_TYPE
#define BUILTIN_TYPE(x)
Definition: ruby.h:551
CASE_UTF
#define CASE_UTF(e)
xfree
#define xfree
Definition: defines.h:216
st_data_t
unsigned long st_data_t
Definition: rb_mjit_min_header-2.7.2.h:5398
FIXNUM_MAX
#define FIXNUM_MAX
Definition: ruby.h:259
RBASIC
#define RBASIC(obj)
Definition: ruby.h:1267
rb_str_dump
VALUE rb_str_dump(VALUE str)
Definition: string.c:6042
econv_destination_buffer_full
@ econv_destination_buffer_full
Definition: encoding.h:300
size_t
long unsigned int size_t
Definition: rb_mjit_min_header-2.7.2.h:669
OBJ_FROZEN_RAW
#define OBJ_FROZEN_RAW(x)
Definition: ruby.h:1374
rb_rs
RUBY_EXTERN VALUE rb_rs
Definition: intern.h:585
rb_reg_nth_match
VALUE rb_reg_nth_match(int, VALUE)
Definition: re.c:1706
tr::now
unsigned int now
Definition: string.c:6991
rb_sym_to_s
VALUE rb_sym_to_s(VALUE sym)
Definition: string.c:10793
rb_str_coderange_scan_restartable
long rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
Definition: string.c:567
MJIT_FUNC_EXPORTED
#define MJIT_FUNC_EXPORTED
Definition: defines.h:396
_
#define _(args)
Definition: dln.h:28
ENCINDEX_US_ASCII
#define ENCINDEX_US_ASCII
Definition: encindex.h:44
Qtrue
#define Qtrue
Definition: ruby.h:468
errno
int errno
rb_str_export_to_enc
VALUE rb_str_export_to_enc(VALUE str, rb_encoding *enc)
Definition: string.c:1135
OBJ_FROZEN
#define OBJ_FROZEN(x)
Definition: ruby.h:1375
rb_str_eql
MJIT_FUNC_EXPORTED VALUE rb_str_eql(VALUE str1, VALUE str2)
Definition: string.c:3287
re_registers::num_regs
int num_regs
Definition: onigmo.h:718
v
int VALUE v
Definition: rb_mjit_min_header-2.7.2.h:12380
aligned_ptr
#define aligned_ptr(value)
len
uint8_t len
Definition: escape.c:17
rb_str_to_str
VALUE rb_str_to_str(VALUE str)
Definition: string.c:1382
SYMBOL_P
#define SYMBOL_P(x)
Definition: ruby.h:413
RB_DEBUG_COUNTER_INC
#define RB_DEBUG_COUNTER_INC(type)
Definition: debug_counter.h:375
rb_enc_prev_char
#define rb_enc_prev_char(s, p, e, enc)
Definition: encoding.h:220
rb_eEncCompatError
VALUE rb_eEncCompatError
Definition: error.c:931
rb_enc_str_asciionly_p
int rb_enc_str_asciionly_p(VALUE str)
Definition: string.c:678
ENC_CODERANGE
#define ENC_CODERANGE(obj)
Definition: encoding.h:108
cc
const struct rb_call_cache * cc
Definition: rb_mjit_min_header-2.7.2.h:13276
ptrdiff_t
long int ptrdiff_t
Definition: rb_mjit_min_header-2.7.2.h:805
STATIC_ASSERT
STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX)
stderr
#define stderr
Definition: rb_mjit_min_header-2.7.2.h:1522
rb_str_freeze
VALUE rb_str_freeze(VALUE str)
Definition: string.c:2616
u8
unsigned char u8
Definition: many2.c:13
rb_str_free
void rb_str_free(VALUE str)
Definition: string.c:1349
TOLOWER
#define TOLOWER(c)
Definition: ruby.h:2319
FLEX_ARY_LEN
#define FLEX_ARY_LEN
Definition: internal.h:2626
rb_str_length
VALUE rb_str_length(VALUE str)
Definition: string.c:1843
RSTRING_NOEMBED
@ RSTRING_NOEMBED
Definition: ruby.h:978
ONIGENC_CASE_FOLD
#define ONIGENC_CASE_FOLD
Definition: onigmo.h:120
LONG2FIX
#define LONG2FIX(i)
Definition: ruby.h:265
RBASIC_SET_CLASS_RAW
#define RBASIC_SET_CLASS_RAW(obj, cls)
Definition: internal.h:1988
rb_enc_codelen
int rb_enc_codelen(int c, rb_encoding *enc)
Definition: encoding.c:1089
rb_intern_str
#define rb_intern_str(string)
Definition: generator.h:16
rb_str_index
#define rb_str_index(str, sub, offset)
Definition: string.c:3490
T_STRING
#define T_STRING
Definition: ruby.h:528
MAYBE_UNUSED
#define MAYBE_UNUSED
Definition: ffi_common.h:32
rb_str_set_len
void rb_str_set_len(VALUE str, long len)
Definition: string.c:2692
rb_funcallv
#define rb_funcallv(recv, mid, argc, argv)
Definition: rb_mjit_min_header-2.7.2.h:7940
mapping_buffer
struct mapping_buffer mapping_buffer
rb_sym2str
VALUE rb_sym2str(VALUE)
Definition: symbol.c:784
rb_utf8_str_new_static
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Definition: string.c:884
rb_usascii_str_new_static
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Definition: string.c:878
SIZET2NUM
#define SIZET2NUM(v)
Definition: ruby.h:295
old
VALUE ID VALUE old
Definition: rb_mjit_min_header-2.7.2.h:16196
index
int index
Definition: rb_mjit_min_header-2.7.2.h:11294
rb_enc_isctype
#define rb_enc_isctype(c, t, enc)
Definition: encoding.h:229
rb_str_encode
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Definition: transcode.c:2870
rb_yield
VALUE rb_yield(VALUE)
Definition: vm_eval.c:1237
rb_str_symname_p
int rb_str_symname_p(VALUE sym)
Definition: string.c:10695
FIXABLE
#define FIXABLE(f)
Definition: ruby.h:399
rb_str_modify_expand
void rb_str_modify_expand(VALUE str, long expand)
Definition: string.c:2122
RB_INTEGER_TYPE_P
#define RB_INTEGER_TYPE_P(obj)
Definition: ruby_missing.h:15
rb_ensure
VALUE rb_ensure(VALUE(*b_proc)(VALUE), VALUE data1, VALUE(*e_proc)(VALUE), VALUE data2)
An equivalent to ensure clause.
Definition: eval.c:1115
ENCINDEX_ASCII
#define ENCINDEX_ASCII
Definition: encindex.h:42
rb_str_change_terminator_length
void rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
Definition: string.c:2230
rb_enc_nth
char * rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
Definition: string.c:2388
ST2FIX
#define ST2FIX(h)
Definition: ruby_missing.h:21
ENCINDEX_UTF_32BE
#define ENCINDEX_UTF_32BE
Definition: encindex.h:47
rb_ary_new
VALUE rb_ary_new(void)
Definition: array.c:723
NEWOBJ_OF
#define NEWOBJ_OF(obj, type, klass, flags)
Definition: ruby.h:785
NUM2INT
#define NUM2INT(x)
Definition: ruby.h:715
Qnil
#define Qnil
Definition: ruby.h:469
rb_reg_match
VALUE rb_reg_match(VALUE, VALUE)
Definition: re.c:3180
rb_str_quote_unprintable
VALUE rb_str_quote_unprintable(VALUE str)
Definition: string.c:10714
rb_fstring_lit
#define rb_fstring_lit(str)
Definition: internal.h:2129
rb_enc_str_coderange
int rb_enc_str_coderange(VALUE str)
Definition: string.c:657
OnigErrorInfo
Definition: onigmo.h:738
rb_str_buf_cat
#define rb_str_buf_cat
Definition: intern.h:910
STR_SET_LEN
#define STR_SET_LEN(str, n)
Definition: string.c:104
rb_external_str_new_with_enc
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
Definition: string.c:1036
util.h
rb_enc_isprint
#define rb_enc_isprint(c, enc)
Definition: encoding.h:236
rb_undef_alloc_func
void rb_undef_alloc_func(VALUE)
Definition: vm_method.c:722
RB_GC_GUARD
#define RB_GC_GUARD(v)
Definition: ruby.h:585
numberof
#define numberof(array)
Definition: etc.c:618
STR_SET_EMBED_LEN
#define STR_SET_EMBED_LEN(str, n)
Definition: string.c:98
rb_enc_step_back
#define rb_enc_step_back(s, p, e, n, enc)
Definition: encoding.h:224
sym_equal
#define sym_equal
Definition: string.c:10676
RString::as
union RString::@2 as
rb_strlen_lit
#define rb_strlen_lit(str)
Definition: intern.h:913
fprintf
int fprintf(FILE *__restrict, const char *__restrict,...) __attribute__((__format__(__printf__
rb_str_cat
VALUE rb_str_cat(VALUE str, const char *ptr, long len)
Definition: string.c:2812
OnigCaseFoldType
unsigned int OnigCaseFoldType
Definition: onigmo.h:95
RSTRING_LEN
#define RSTRING_LEN(str)
Definition: ruby.h:1005
char
#define char
Definition: rb_mjit_min_header-2.7.2.h:2916
st_table
Definition: st.h:79
ruby_assert.h
ALLOCV
#define ALLOCV(v, n)
Definition: ruby.h:1748
rb_enc_codepoint_len
unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc)
Definition: encoding.c:1068
rb_any_to_s
VALUE rb_any_to_s(VALUE)
Default implementation of #to_s.
Definition: object.c:527
rb_str_strlen
long rb_str_strlen(VALUE str)
Definition: string.c:1829
sub
#define sub(x, y)
Definition: date_strftime.c:24
rb_enc_code_to_mbclen
#define rb_enc_code_to_mbclen(c, enc)
Definition: encoding.h:214
rb_enc_associate
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:866
mapping_buffer::next
struct mapping_buffer * next
Definition: string.c:6496
rb_reg_check_preprocess
VALUE rb_reg_check_preprocess(VALUE)
Definition: re.c:2707
ENC_CODERANGE_UNKNOWN
#define ENC_CODERANGE_UNKNOWN
Definition: encoding.h:103
rb_define_alloc_func
void rb_define_alloc_func(VALUE, rb_alloc_func_t)
RTEST
#define RTEST(v)
Definition: ruby.h:481
__msan_unpoison_string
#define __msan_unpoison_string(x)
Definition: internal.h:123
SYM2ID
#define SYM2ID(x)
Definition: ruby.h:415
ENC_CODERANGE_CLEAN_P
#define ENC_CODERANGE_CLEAN_P(cr)
Definition: encoding.h:107
rb_econv_result_t
rb_econv_result_t
Definition: encoding.h:297
ruby_escaped_char
const char * ruby_escaped_char(int c)
Definition: string.c:5848
RUBY_ASSERT
#define RUBY_ASSERT(expr)
Definition: assert.h:32
rb_enc_associate_index
VALUE rb_enc_associate_index(VALUE obj, int idx)
Definition: encoding.c:838
STR_FAKESTR
#define STR_FAKESTR
Definition: string.c:91
rb_tainted_str_new
VALUE rb_tainted_str_new(const char *ptr, long len)
Definition: string.c:896
rb_usascii_encoding
rb_encoding * rb_usascii_encoding(void)
Definition: encoding.c:1340
RSTRING_END
#define RSTRING_END(str)
Definition: ruby.h:1013
name
const char * name
Definition: nkf.c:208
n
const char size_t n
Definition: rb_mjit_min_header-2.7.2.h:5491