/* * R : A Computer Language for Statistical Data Analysis * Copyright (C) 2001--2017 The R Core Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Pulic License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, a copy is available at * https://www.R-project.org/Licenses/ */ #ifdef HAVE_CONFIG_H # include #endif #include #include #define isRaw(x) (TYPEOF(x) == RAWSXP) /* charToRaw works at byte level, ignores encoding */ SEXP attribute_hidden do_charToRaw(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP ans, x = CAR(args); int nc; checkArity(op, args); if (!isString(x) || LENGTH(x) == 0) error(_("argument must be a character vector of length 1")); if (LENGTH(x) > 1) warning(_("argument should be a character vector of length 1\nall but the first element will be ignored")); nc = LENGTH(STRING_ELT(x, 0)); ans = allocVector(RAWSXP, nc); if (nc) memcpy(RAW(ans), CHAR(STRING_ELT(x, 0)), nc); return ans; } /* rawToChar should work at byte level */ SEXP attribute_hidden do_rawToChar(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP ans, x = CAR(args); checkArity(op, args); if (!isRaw(x)) error(_("argument 'x' must be a raw vector")); int multiple = asLogical(CADR(args)); if (multiple == NA_LOGICAL) error(_("argument 'multiple' must be TRUE or FALSE")); if (multiple) { R_xlen_t i, nc = XLENGTH(x); char buf[2]; buf[1] = '\0'; PROTECT(ans = allocVector(STRSXP, nc)); for (i = 0; i < nc; i++) { buf[0] = (char) RAW(x)[i]; SET_STRING_ELT(ans, i, mkChar(buf)); } /* do we want to copy e.g. names here? */ } else { int i, j, nc = LENGTH(x); /* String is not necessarily 0-terminated and may contain nuls. Strip trailing nuls */ for (i = 0, j = -1; i < nc; i++) if(RAW(x)[i]) j = i; nc = j + 1; PROTECT(ans = allocVector(STRSXP, 1)); SET_STRING_ELT(ans, 0, mkCharLenCE((const char *)RAW(x), j+1, CE_NATIVE)); } UNPROTECT(1); return ans; } SEXP attribute_hidden do_rawShift(SEXP call, SEXP op, SEXP args, SEXP env) { checkArity(op, args); SEXP ans, x = CAR(args); int shift = asInteger(CADR(args)); if (!isRaw(x)) error(_("argument 'x' must be a raw vector")); if (shift == NA_INTEGER || shift < -8 || shift > 8) error(_("argument 'shift' must be a small integer")); PROTECT(ans = duplicate(x)); if (shift > 0) for (R_xlen_t i = 0; i < XLENGTH(x); i++) RAW(ans)[i] <<= shift; else for (R_xlen_t i = 0; i < XLENGTH(x); i++) RAW(ans)[i] >>= (-shift); UNPROTECT(1); return ans; } SEXP attribute_hidden do_rawToBits(SEXP call, SEXP op, SEXP args, SEXP env) { checkArity(op, args); SEXP ans, x = CAR(args); R_xlen_t i, j = 0; unsigned int tmp; if (!isRaw(x)) error(_("argument 'x' must be a raw vector")); PROTECT(ans = allocVector(RAWSXP, 8*XLENGTH(x))); for (i = 0; i < XLENGTH(x); i++) { tmp = (unsigned int) RAW(x)[i]; for (int k = 0; k < 8; k++, tmp >>= 1) RAW(ans)[j++] = tmp & 0x1; } UNPROTECT(1); return ans; } SEXP attribute_hidden do_intToBits(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP ans, x; R_xlen_t i, j = 0; unsigned int tmp; checkArity(op, args); PROTECT(x = coerceVector(CAR(args), INTSXP)); if (!isInteger(x)) error(_("argument 'x' must be an integer vector")); PROTECT(ans = allocVector(RAWSXP, 32*XLENGTH(x))); for (i = 0; i < XLENGTH(x); i++) { tmp = (unsigned int) INTEGER(x)[i]; for (int k = 0; k < 32; k++, tmp >>= 1) RAW(ans)[j++] = tmp & 0x1; } UNPROTECT(2); return ans; } SEXP attribute_hidden do_packBits(SEXP call, SEXP op, SEXP args, SEXP env) { checkArity(op, args); SEXP ans, x = CAR(args), stype = CADR(args); Rboolean useRaw; R_xlen_t i, len = XLENGTH(x), slen; int fac; if (TYPEOF(x) != RAWSXP && TYPEOF(x) != LGLSXP && TYPEOF(x) != INTSXP) error(_("argument 'x' must be raw, integer or logical")); if (!isString(stype) || LENGTH(stype) != 1) error(_("argument '%s' must be a character string"), "type"); useRaw = strcmp(CHAR(STRING_ELT(stype, 0)), "integer"); fac = useRaw ? 8 : 32; if (len% fac) error(_("argument 'x' must be a multiple of %d long"), fac); slen = len/fac; PROTECT(ans = allocVector(useRaw ? RAWSXP : INTSXP, slen)); for (i = 0; i < slen; i++) if (useRaw) { Rbyte btmp = 0; for (int k = 7; k >= 0; k--) { btmp <<= 1; if (isRaw(x)) btmp |= RAW(x)[8*i + k] & 0x1; else if (isLogical(x) || isInteger(x)) { int j = INTEGER(x)[8*i+k]; if (j == NA_INTEGER) error(_("argument 'x' must not contain NAs")); btmp |= j & 0x1; } } RAW(ans)[i] = btmp; } else { unsigned int itmp = 0; for (int k = 31; k >= 0; k--) { itmp <<= 1; if (isRaw(x)) itmp |= RAW(x)[32*i + k] & 0x1; else if (isLogical(x) || isInteger(x)) { int j = INTEGER(x)[32*i+k]; if (j == NA_INTEGER) error(_("argument 'x' must not contain NAs")); itmp |= j & 0x1; } } INTEGER(ans)[i] = (int) itmp; } UNPROTECT(1); return ans; } /* Simplified version for RFC3629 definition of UTF-8 */ static int mbrtoint(int *w, const char *s) { unsigned int byte; byte = *((unsigned char *)s); if (byte == 0) { *w = 0; return 0; } else if (byte < 0xC0) { *w = (int) byte; return 1; } else if (byte < 0xE0) { if (!s[1]) return -2; if ((s[1] & 0xC0) == 0x80) { *w = (int) (((byte & 0x1F) << 6) | (s[1] & 0x3F)); return 2; } else return -1; } else if (byte < 0xF0) { if (!s[1] || !s[2]) return -2; if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) { *w = (int) (((byte & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F)); byte = *w; if (byte >= 0xD800 && byte <= 0xDFFF) return -1; /* surrogate */ // Following Corrigendum 9, these are valid in UTF-8 // if (byte == 0xFFFE || byte == 0xFFFF) return -1; return 3; } else return -1; } else if (byte <= 0xF4) { // for RFC3629 if (!s[1] || !s[2] || !s[3]) return -2; if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80) && ((s[3] & 0xC0) == 0x80)) { *w = (int) (((byte & 0x07) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F)); byte = *w; return (byte <= 0x10FFFF) ? 4 : -1; } else return -1; } else return -1; /* return -2; not reached */ } SEXP attribute_hidden do_utf8ToInt(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP ans, x = CAR(args); int tmp, used = 0; /* -Wall */ R_xlen_t i, j, nc; checkArity(op, args); if (!isString(x) || LENGTH(x) == 0) error(_("argument must be a character vector of length 1")); if (LENGTH(x) > 1) warning(_("argument should be a character vector of length 1\nall but the first element will be ignored")); if (STRING_ELT(x, 0) == NA_STRING) return ScalarInteger(NA_INTEGER); const char *s = CHAR(STRING_ELT(x, 0)); if (!utf8Valid(s)) return ScalarInteger(NA_INTEGER); nc = XLENGTH(STRING_ELT(x, 0)); /* ints will be shorter */ int *ians = (int *) R_alloc(nc, sizeof(int)); for (i = 0, j = 0; i < nc; i++) { used = mbrtoint(&tmp, s); if (used <= 0) break; ians[j++] = tmp; s += used; } if (used < 0) error(_("invalid UTF-8 string")); ans = allocVector(INTSXP, j); if (j) memcpy(INTEGER(ans), ians, sizeof(int) * j); return ans; } /* Based on PCRE, but current Unicode only needs 4 bytes with maximum 0x10ffff */ static const int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff }; static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0 }; static size_t inttomb(char *s, const int wc) { register int i, j; unsigned int cvalue = wc; char buf[10], *b; b = s ? s : buf; if (cvalue == 0) {*b = 0; return 0;} for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++) if (cvalue <= utf8_table1[i]) break; b += i; for (j = i; j > 0; j--) { *b-- = (char)(0x80 | (cvalue & 0x3f)); cvalue >>= 6; } *b = (char)(utf8_table2[i] | cvalue); return i + 1; } #include /* for Calloc/Free */ SEXP attribute_hidden do_intToUtf8(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP ans, x; int multiple, s_pair; size_t used, len; char buf[10], *tmp; checkArity(op, args); PROTECT(x = coerceVector(CAR(args), INTSXP)); if (!isInteger(x)) error(_("argument 'x' must be an integer vector")); multiple = asLogical(CADR(args)); if (multiple == NA_LOGICAL) error(_("argument 'multiple' must be TRUE or FALSE")); s_pair = asLogical(CADDR(args)); if (s_pair == NA_LOGICAL) error(_("argument 'allow_surrogate_pairs' must be TRUE or FALSE")); if (multiple) { if (s_pair) warning("allow_surrogate_pairs = TRUE is incompatible with multiple = TRUE and will be ignored"); R_xlen_t i, nc = XLENGTH(x); PROTECT(ans = allocVector(STRSXP, nc)); for (i = 0; i < nc; i++) { int this = INTEGER(x)[i]; if (this == NA_INTEGER || (this >= 0xD800 && this <= 0xDFFF) || this > 0x10FFFF) SET_STRING_ELT(ans, i, NA_STRING); else { used = inttomb(buf, this); buf[used] = '\0'; SET_STRING_ELT(ans, i, mkCharCE(buf, CE_UTF8)); } } /* do we want to copy e.g. names here? */ } else { int i, nc = LENGTH(x); Rboolean haveNA = FALSE; /* Note that this gives zero length for input '0', so it is omitted */ for (i = 0, len = 0; i < nc; i++) { int this = INTEGER(x)[i]; if (this == NA_INTEGER || (this >= 0xDC00 && this <= 0xDFFF) || this > 0x10FFFF) { haveNA = TRUE; break; } else if (this >= 0xD800 && this <= 0xDBFF) { if(!s_pair || i >= nc-1) {haveNA = TRUE; break;} int next = INTEGER(x)[i+1]; if(next >= 0xDC00 && next <= 0xDFFF) i++; else {haveNA = TRUE; break;} len += 4; // all points not in the basic plane have length 4 } else len += inttomb(NULL, this); } if (haveNA) { PROTECT(ans = allocVector(STRSXP, 1)); SET_STRING_ELT(ans, 0, NA_STRING); UNPROTECT(2); return ans; } if (len >= 10000) { tmp = Calloc(len+1, char); } else { R_CheckStack2(len+1); tmp = alloca(len+1); tmp[len] = '\0'; } for (i = 0, len = 0; i < nc; i++) { int this = INTEGER(x)[i]; if(s_pair && (this >= 0xD800 && this <= 0xDBFF)) { // all the validity checking has already been done. int next = INTEGER(x)[++i]; unsigned int hi = this - 0xD800, lo = next - 0xDC00; this = 0x10000 + (hi << 10) + lo; } used = inttomb(buf, this); strncpy(tmp + len, buf, used); len += used; } PROTECT(ans = allocVector(STRSXP, 1)); SET_STRING_ELT(ans, 0, mkCharLenCE(tmp, (int) len, CE_UTF8)); if(len >= 10000) Free(tmp); } UNPROTECT(2); return ans; }