From aacb3875a9a645880cbfe014fb0c4cb078ff4342 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Mon, 17 Sep 2018 21:27:43 +0200 Subject: [PATCH 4/5] vobject.c - vCard Unicode reading support References: https://github.com/libical/libical/pull/354 RFC 6350 declares vCard to be UTF-8 throughout without exceptions. However, any non-ASCII vCard content is garbled because the "fakeUnicode" botched the conversion to wchar_t: The conversion just copies values from char to wchar_t, which is neither correct for UTF-8 nor (a hypothetical) ISO-8859-1/-15 coded input. This patch fixes that. References: #353 --- src/libicalvcal/vobject.c | 94 ++++++++++++++++++++++++++++++++------- 1 file changed, 78 insertions(+), 16 deletions(-) diff --git a/src/libicalvcal/vobject.c b/src/libicalvcal/vobject.c index 10d0cf5a..b880716f 100644 --- a/src/libicalvcal/vobject.c +++ b/src/libicalvcal/vobject.c @@ -45,6 +45,9 @@ DFARS 252.227-7013 or 48 CFR 52.227-19, as applicable. #ifdef HAVE_CONFIG_H #include #endif +#include +#include +#include #include "vobject.h" @@ -1414,27 +1417,86 @@ char* writeMemVObjects(char *s, int *len, VObject *list) /*---------------------------------------------------------------------- APIs to do fake Unicode stuff. ----------------------------------------------------------------------*/ +/* + * Convert UTF-8 to wide chars. + * + * The only place where this spells Unicode is 1.) in "UTF-8", 2.) when it does + * the secondary pass to replace \n and \r with U+2028 and 2029, respectively. + * That step blindly pretends wchar_t shares the Unicode codepoints (happens to + * work for the important contemporary platforms, but otherwise is nonsense). + */ wchar_t* fakeUnicode(const char *ps, size_t *bytes) { - wchar_t *r, *pw; - size_t len = strlen(ps)+1; + /* + * Assuming the input were all ASCII, then + * + * method1_out_size = zs * sizeof(wchar_t) + * + * would make sense. But if the input were all 3-byte UTF-8 codepoints, + * then that would be a large wasteful allocation, and + * + * method2_out_size = zs * sizeof(wchar_t) / 3 + * + * would be more reasonable. Since there is no way of knowing in + * advance what is in @ps, method 1 will be chosen if that is a 1KB + * allocation (or less), and method 2 otherwise. From there, the + * standard exponential progression for realloc is applied. + */ + size_t zs = strlen(ps), out_size, out_rem; + char *out_block, *out_iter; + iconv_t conv = iconv_open("wchar_t", "utf-8"); - pw = r = (wchar_t*)malloc(sizeof(wchar_t)*len); - if (bytes) - *bytes = len * sizeof(wchar_t); + if (conv == (iconv_t)-1) + return NULL; + if (zs >= (SIZE_MAX - sizeof(wchar_t)) / sizeof(wchar_t)) + /* Input is larger than anything we want to handle */ + return NULL; + /* Initial allocation size as per above. */ + out_size = out_rem = zs * sizeof(wchar_t); + if (out_size >= 1024 - sizeof(wchar_t)) + out_size /= 3; + out_iter = out_block = malloc(out_size + sizeof(wchar_t)); + if (out_block == NULL) { + iconv_close(conv); + return NULL; + } - while (*ps) { - if (*ps == '\n') - *pw = (wchar_t)0x2028; - else if (*ps == '\r') - *pw = (wchar_t)0x2029; - else - *pw = (wchar_t)(unsigned char)*ps; - ps++; pw++; - } - *pw = (wchar_t)0; + while (zs > 0) { + int ret; + errno = 0; + ret = iconv(conv, (char **)&ps, &zs, &out_iter, &out_rem); + if (ret >= 0) + continue; + if (errno == EILSEQ || errno == EINVAL) { + ++ps; + --zs; + continue; + } + if (errno != E2BIG) + break; + out_rem += out_size; + out_size *= 2; + char *new_block = realloc(out_block, out_size + sizeof(wchar_t)); + if (new_block == NULL) { + free(out_block); + iconv_close(conv); + return NULL; + } + out_iter = new_block + (out_iter - out_block); + out_block = new_block; + } - return r; + wchar_t *wide = (wchar_t *)out_block, *p = wide; + for (; p < (wchar_t *)(out_block + out_size - out_rem); ++p) { + if (*p == '\n') + *p = 0x2028; + else if (*p == '\r') + *p = 0x2029; + } + *p = L'\0'; + if (bytes != NULL) + *bytes = (char *)p - out_block; + return wide; } int uStrLen(const wchar_t *u) -- 2.19.1