1
0
mirror of https://github.com/openbsd/src.git synced 2025-01-04 23:35:36 -08:00

Provide C11 <uchar.h>.

OK millert@.
Tested by naddy@ in a bulk and by matthieu@ in the new foot(1) port.
I originally wrote the code in 2022 at the prodding of espie@.
Using one improvement to a manual page from jmc@.
This commit is contained in:
schwarze 2023-08-20 15:02:50 +00:00
parent 7927db41a2
commit 46c354aa2b
16 changed files with 1048 additions and 185 deletions

View File

@ -743,7 +743,7 @@
./usr/lib/crtendS.o
./usr/lib/gcrt0.o
./usr/lib/libagentx.so.1.1
./usr/lib/libc.so.97.0
./usr/lib/libc.so.97.1
./usr/lib/libcbor.so.2.0
./usr/lib/libcrypto.so.52.0
./usr/lib/libcurses.so.14.0
@ -3008,7 +3008,7 @@
./usr/share/relink/kernel.tgz
./usr/share/relink/usr
./usr/share/relink/usr/lib
./usr/share/relink/usr/lib/libc.so.97.0.a
./usr/share/relink/usr/lib/libc.so.97.1.a
./usr/share/relink/usr/lib/libcrypto.so.52.0.a
./usr/share/relink/usr/libexec
./usr/share/relink/usr/libexec/ld.so.a

View File

@ -1435,6 +1435,7 @@
./usr/include/time.h
./usr/include/tls.h
./usr/include/ttyent.h
./usr/include/uchar.h
./usr/include/ufs
./usr/include/ufs/ext2fs
./usr/include/ufs/ext2fs/ext2fs.h
@ -2295,6 +2296,7 @@
./usr/share/man/man3/btowc.3
./usr/share/man/man3/btree.3
./usr/share/man/man3/bzero.3
./usr/share/man/man3/c16rtomb.3
./usr/share/man/man3/cacos.3
./usr/share/man/man3/cacosh.3
./usr/share/man/man3/carg.3
@ -2715,6 +2717,7 @@
./usr/share/man/man3/malloc.3
./usr/share/man/man3/mblen.3
./usr/share/man/man3/mbrlen.3
./usr/share/man/man3/mbrtoc16.3
./usr/share/man/man3/mbrtowc.3
./usr/share/man/man3/mbsinit.3
./usr/share/man/man3/mbsrtowcs.3

View File

@ -1,4 +1,4 @@
# $OpenBSD: Makefile,v 1.230 2022/08/30 18:50:06 krw Exp $
# $OpenBSD: Makefile,v 1.231 2023/08/20 15:02:50 schwarze Exp $
# $NetBSD: Makefile,v 1.59 1996/05/15 21:36:43 jtc Exp $
# @(#)Makefile 5.45.1.1 (Berkeley) 5/6/91
@ -27,7 +27,7 @@ FILES= a.out.h ar.h asr.h assert.h \
signal.h siphash.h sndio.h spawn.h stdbool.h stddef.h \
stdio.h stdlib.h string.h strings.h sysexits.h \
tar.h tgmath.h tib.h time.h ttyent.h \
unistd.h utime.h utmp.h uuid.h \
uchar.h unistd.h utime.h utmp.h uuid.h \
vis.h \
wchar.h wctype.h

38
include/uchar.h Normal file
View File

@ -0,0 +1,38 @@
/* $OpenBSD: uchar.h,v 1.1 2023/08/20 15:02:50 schwarze Exp $ */
/*
* Written by Ingo Schwarze <schwarze@openbsd.org>
* and placed in the public domain on March 19, 2022.
*/
#ifndef _UCHAR_H_
#define _UCHAR_H_

#include <sys/cdefs.h>
#include <sys/_types.h>

/*
 * mbstate_t and size_t are shared with other headers; the
 * *_DEFINED_ guards keep them from being declared twice.
 */
#ifndef _MBSTATE_T_DEFINED_
#define _MBSTATE_T_DEFINED_
typedef __mbstate_t mbstate_t;
#endif

#ifndef _SIZE_T_DEFINED_
#define _SIZE_T_DEFINED_
typedef __size_t size_t;
#endif

/* Announce that char16_t holds UTF-16 and char32_t holds UTF-32. */
#define __STDC_UTF_16__ 1
#define __STDC_UTF_32__ 1

/*
 * Since C++11, char16_t and char32_t are keywords naming built-in
 * types, so defining them with typedef is an error in that language.
 * Only provide the typedefs for C and for older C++ dialects.
 */
#if !defined(__cplusplus) || __cplusplus < 201103L
typedef __uint16_t char16_t;
typedef __uint32_t char32_t;
#endif

__BEGIN_DECLS
/* Restartable conversions between multibyte characters and UTF-16. */
size_t mbrtoc16(char16_t * __restrict, const char * __restrict, size_t,
	mbstate_t * __restrict);
size_t c16rtomb(char * __restrict, char16_t, mbstate_t * __restrict);
/* Restartable conversions between multibyte characters and UTF-32. */
size_t mbrtoc32(char32_t * __restrict, const char * __restrict, size_t,
	mbstate_t * __restrict);
size_t c32rtomb(char * __restrict, char32_t, mbstate_t * __restrict);
__END_DECLS

#endif /* !_UCHAR_H_ */

View File

@ -958,6 +958,8 @@ SipHash_Update
/* locale */
__mb_cur_max
btowc
c16rtomb
c32rtomb
duplocale
freelocale
isalnum_l
@ -1007,6 +1009,8 @@ mbsnrtowcs
mbsrtowcs
mbstowcs
newlocale
mbrtoc16
mbrtoc32
mbtowc
nl_langinfo
nl_langinfo_l

17
lib/libc/hidden/uchar.h Normal file
View File

@ -0,0 +1,17 @@
/* $OpenBSD: uchar.h,v 1.1 2023/08/20 15:02:51 schwarze Exp $ */
/*
* Written by Ingo Schwarze <schwarze@openbsd.org>
* and placed in the public domain on March 19, 2022.
*/
#ifndef _LIBC_UCHAR_H_
#define _LIBC_UCHAR_H_
/* Continue with the next <uchar.h> on the include path, which this file shadows. */
#include_next <uchar.h>
/*
 * One declaration per function that <uchar.h> exports.
 * NOTE(review): PROTO_STD_DEPRECATED is defined outside this file;
 * presumably it marks standard functions that libc itself does not
 * call internally -- confirm against the macro's definition.
 */
PROTO_STD_DEPRECATED(c16rtomb);
PROTO_STD_DEPRECATED(c32rtomb);
PROTO_STD_DEPRECATED(mbrtoc16);
PROTO_STD_DEPRECATED(mbrtoc32);
#endif /* !_LIBC_UCHAR_H_ */

View File

@ -1,14 +1,15 @@
# $OpenBSD: Makefile.inc,v 1.26 2022/07/27 20:00:11 guenther Exp $
# $OpenBSD: Makefile.inc,v 1.27 2023/08/20 15:02:51 schwarze Exp $
# locale sources
.PATH: ${LIBCSRCDIR}/locale
SRCS+= btowc.c _def_messages.c _def_monetary.c _def_numeric.c _def_time.c \
SRCS+= _def_messages.c _def_monetary.c _def_numeric.c _def_time.c \
localeconv.c nl_langinfo.c nl_langinfo_l.c setlocale.c \
duplocale.c freelocale.c newlocale.c uselocale.c \
__mb_cur_max.c _CurrentRuneLocale.c _get_locname.c \
isctype_l.c iswctype.c iswctype_l.c wctype.c \
mblen.c mbrlen.c mbstowcs.c mbtowc.c multibyte_citrus.c wcscoll.c \
mblen.c mbrlen.c mbrtoc16.c mbrtoc32.c mbstowcs.c mbtowc.c \
btowc.c c16rtomb.c c32rtomb.c multibyte_citrus.c wcscoll.c \
wcscoll_l.c \
wcstombs.c wctob.c wctomb.c wcstof.c wcstod.c wcstold.c wcstol.c \
wcstoul.c wcstoll.c wcstoull.c wcstoimax.c wcstoumax.c \
@ -17,7 +18,8 @@ SRCS+= btowc.c _def_messages.c _def_monetary.c _def_numeric.c _def_time.c \
MAN+= nl_langinfo.3 setlocale.3 newlocale.3 uselocale.3 localeconv.3 \
iswalnum.3 towlower.3 \
btowc.3 mblen.3 mbrlen.3 mbrtowc.3 mbsinit.3 mbsrtowcs.3 \
btowc.3 c16rtomb.3 mblen.3 mbrlen.3 mbrtoc16.3 mbrtowc.3 \
mbsinit.3 mbsrtowcs.3 \
mbstowcs.3 mbtowc.3 wcrtomb.3 wcscoll.3 wcsrtombs.3 wcstod.3 \
wcstol.3 wcstombs.3 wcsxfrm.3 wctob.3 wctomb.3 \
wctype.3 iswctype.3 wctrans.3 towctrans.3 wcwidth.3

207
lib/libc/locale/c16rtomb.3 Normal file
View File

@ -0,0 +1,207 @@
.\" $OpenBSD: c16rtomb.3,v 1.1 2023/08/20 15:02:51 schwarze Exp $
.\"
.\" Copyright (c) 2023 Ingo Schwarze <schwarze@openbsd.org>
.\"
.\" Permission to use, copy, modify, and distribute this software for any
.\" purpose with or without fee is hereby granted, provided that the above
.\" copyright notice and this permission notice appear in all copies.
.\"
.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
.\"
.Dd $Mdocdate: August 20 2023 $
.Dt C16RTOMB 3
.Os
.Sh NAME
.Nm c16rtomb
.Nd convert one UTF-16 encoded character to UTF-8
.Sh SYNOPSIS
.In uchar.h
.Ft size_t
.Fo c16rtomb
.Fa "char * restrict s"
.Fa "char16_t c16"
.Fa "mbstate_t * restrict mbs"
.Fc
.Sh DESCRIPTION
This function converts one UTF-16 encoded character to UTF-8.
In some cases, it is necessary to call the function twice
to convert a single character.
.Pp
First, call
.Fn c16rtomb
passing the first 16-bit code unit of the UTF-16 encoded character in
.Fa c16 .
If the return value is greater than 0, the character is part of the UCS-2
range, the complete UTF-8 encoding consisting of at most
.Dv MB_CUR_MAX
bytes has been written to the storage starting at
.Fa s ,
and the function does not need to be called again.
.Pp
If the return value is 0, the first 16-bit code unit is a UTF-16
high surrogate and the function needs to be called a second time,
this time passing the second 16-bit code unit of the UTF-16 encoded
character in
.Fa c16
and passing the same
.Fa mbs
again that was also passed to the first call.
If the second 16-bit code unit is a UTF-16 low surrogate,
the second call returns a value greater than 0,
the surrogate pair represents a Unicode code point
beyond the basic multilingual plane,
and the complete UTF-8 encoding consisting of at most
.Dv MB_CUR_MAX
bytes is written to the storage starting at
.Fa s .
.Pp
The output encoding that
.Fn c16rtomb
uses in
.Fa s
is determined by the
.Dv LC_CTYPE
category of the current locale.
.Ox
only supports UTF-8 and ASCII output,
and this function is only useful for UTF-8.
.Pp
The following arguments cause special processing:
.Bl -tag -width 012345678901
.It Fa c16 No == 0
A NUL byte is stored to
.Pf * Fa s
and the state object pointed to by
.Fa mbs
is reset to the initial state.
On operating systems other than
.Ox
that support state-dependent multibyte encodings,
a special byte sequence
.Pq Dq shift sequence
is written before the NUL byte to return to the initial state
if that is required by the output encoding
and by the current output encoding state.
.It Fa mbs No == Dv NULL
An internal
.Vt mbstate_t
object specific to the
.Fn c16rtomb
function is used instead of the
.Fa mbs
argument.
This internal object is automatically initialized at program startup
and never changed by any
.Em libc
function except
.Fn c16rtomb .
.It Fa s No == Dv NULL
The object pointed to by
.Fa mbs ,
or the internal object if
.Fa mbs
is a
.Dv NULL
pointer, is reset to its initial state,
.Fa c16
is ignored, and 1 is returned.
.El
.Sh RETURN VALUES
.Fn c16rtomb
returns the number of bytes written to
.Fa s
on success or
.Po Vt size_t Pc Ns \-1
on failure, specifically:
.Bl -tag -width 10n
.It 0
The first 16-bit code unit was successfully decoded
as a UTF-16 high surrogate.
Nothing was written to
.Fa s
yet.
.It 1
The first 16-bit code unit was successfully decoded
as a character in the range U+0000 to U+007F, or
.Fa s
is
.Dv NULL .
.It 2
The first 16-bit code unit was successfully decoded
as a character in the range U+0080 to U+07FF.
.It 3
The first 16-bit code unit was successfully decoded
as a character in the range U+0800 to U+D7FF or U+E000 to U+FFFF.
.It 4
The second 16-bit code unit was successfully decoded as a UTF-16 low
surrogate, resulting in a character in the range U+10000 to U+10FFFF.
.It greater
Return values greater than 4 may occur on operating systems other than
.Ox
for output encodings other than UTF-8, in particular when a shift
sequence was written.
.It Po Vt size_t Pc Ns \-1
UTF-16 input decoding or
.Dv LC_CTYPE
output encoding failed, or
.Fa mbs
is invalid.
Nothing was written to
.Fa s ,
and
.Va errno
has been set.
.El
.Sh ERRORS
.Fn c16rtomb
causes an error in the following cases:
.Bl -tag -width Er
.It Bq Er EILSEQ
UTF-16 input decoding failed because the first 16-bit code unit
is neither a UCS-2 character nor a UTF-16 high surrogate,
or because the second 16-bit code unit is not a UTF-16 low surrogate;
or output encoding failed because the resulting character
cannot be represented in the output encoding selected with
.Dv LC_CTYPE .
.It Bq Er EINVAL
.Fa mbs
points to an invalid or uninitialized
.Vt mbstate_t
object.
.El
.Sh SEE ALSO
.Xr mbrtoc16 3 ,
.Xr setlocale 3 ,
.Xr wcrtomb 3
.Sh STANDARDS
.Fn c16rtomb
conforms to
.St -isoC-2011 .
.Sh HISTORY
.Fn c16rtomb
has been available since
.Ox 7.4 .
.Sh CAVEATS
The C11 standard only requires the
.Fa c16
argument to be interpreted according to UTF-16
if the predefined environment macro
.Dv __STDC_UTF_16__
is defined with a value of 1.
On
.Ox ,
.In uchar.h
provides this definition.
Other operating systems which do not define
.Dv __STDC_UTF_16__
could theoretically use a different,
implementation-defined input encoding for
.Fa c16
instead of UTF-16.
Using UTF-16 becomes mandatory in C23.

100
lib/libc/locale/c16rtomb.c Normal file
View File

@ -0,0 +1,100 @@
/* $OpenBSD: c16rtomb.c,v 1.1 2023/08/20 15:02:51 schwarze Exp $ */
/*
* Copyright (c) 2022 Ingo Schwarze <schwarze@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <errno.h>
#include <string.h>
#include <uchar.h>
#include <wchar.h>
/*
 * Conversion state overlaid on the caller's mbstate_t object.
 * Keep this structure compatible with
 * struct _utf8_state in the file citrus/citrus_utf8.c.
 */
struct _utf16_state {
	wchar_t	ch;	/* contribution of a pending high surrogate */
	int	want;	/* -3 while a low surrogate is awaited, else 0 */
};

/*
 * Convert one UTF-16 code unit to a multibyte character.
 *
 * s    output buffer, or NULL to merely reset the conversion state
 * c16  the next UTF-16 code unit
 * ps   conversion state, or NULL to use a state object private
 *      to this function
 *
 * Returns the number of bytes written to s, 0 if c16 was a high
 * surrogate that has been remembered in the state for the next call,
 * 1 if s is NULL, or (size_t)-1 with errno set:
 * EILSEQ if a high surrogate is not followed by a low surrogate,
 * EINVAL if a high surrogate arrives while the state is not initial,
 * or whatever wcrtomb(3) reports for the resulting wide character.
 */
size_t
c16rtomb(char *s, char16_t c16, mbstate_t *ps)
{
	static mbstate_t	 mbs;
	struct _utf16_state	*us;
	wchar_t			 wc;

	if (ps == NULL)
		ps = &mbs;

	/*
	 * Handle the special case of NULL output first
	 * to avoid inspecting c16 and ps and possibly drawing
	 * bogus conclusions from whatever those may contain.
	 * Instead, just restore the initial conversion state.
	 * The return value represents the length of the NUL byte
	 * corresponding to the NUL wide character, even though
	 * there is no place to write that NUL byte to.
	 */
	if (s == NULL) {
		memset(ps, 0, sizeof(*ps));
		return 1;
	}

	us = (struct _utf16_state *)ps;

	if (us->want == -3) {
		/*
		 * The previous call read a high surrogate,
		 * so expect a low surrogate now.
		 */
		if ((c16 & 0xfc00) != 0xdc00) {
			errno = EILSEQ;
			return (size_t)-1;
		}

		/*
		 * Assemble the full code point for processing
		 * by wcrtomb(3).  Since we do not support
		 * state-dependent encodings, our wcrtomb(3)
		 * always expects the initial conversion state,
		 * so clearing the state here is just fine.
		 */
		wc = us->ch + (c16 & 0x3ff);
		us->ch = 0;
		us->want = 0;
	} else if ((c16 & 0xfc00) == 0xd800) {
		/*
		 * A high surrogate is only acceptable in the initial
		 * conversion state.  Refuse to clobber a state object
		 * that holds anything else rather than silently
		 * overwriting it.
		 */
		if (us->want != 0) {
			errno = EINVAL;
			return (size_t)-1;
		}

		/*
		 * Remember the contribution of the high surrogate to
		 * the code point and defer encoding to the next call.
		 */
		us->ch = 0x10000 + ((c16 & 0x3ff) << 10);
		us->want = -3;

		/* Nothing was written to *s just yet. */
		return 0;
	} else
		wc = c16;

	/*
	 * The following correctly returns an error when a low
	 * surrogate is encountered without a preceding high one.
	 */
	return wcrtomb(s, wc, ps);
}

View File

@ -0,0 +1,18 @@
/* $OpenBSD: c32rtomb.c,v 1.1 2023/08/20 15:02:51 schwarze Exp $ */
/*
* Written by Ingo Schwarze <schwarze@openbsd.org>
* and placed in the public domain on March 19, 2022.
*/
#include <uchar.h>
#include <wchar.h>
/*
 * Convert the UTF-32 code unit c32 to a multibyte character by
 * passing it straight on to wcrtomb(3).  When the caller supplies
 * no state object, one private to this function is used instead.
 */
size_t
c32rtomb(char *s, char32_t c32, mbstate_t *ps)
{
	static mbstate_t	 fallback;

	return wcrtomb(s, c32, ps != NULL ? ps : &fallback);
}

265
lib/libc/locale/mbrtoc16.3 Normal file
View File

@ -0,0 +1,265 @@
.\" $OpenBSD: mbrtoc16.3,v 1.1 2023/08/20 15:02:51 schwarze Exp $
.\"
.\" Copyright 2023 Ingo Schwarze <schwarze@openbsd.org>
.\" Copyright 2010 Stefan Sperling <stsp@openbsd.org>
.\"
.\" Permission to use, copy, modify, and distribute this software for any
.\" purpose with or without fee is hereby granted, provided that the above
.\" copyright notice and this permission notice appear in all copies.
.\"
.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
.\"
.Dd $Mdocdate: August 20 2023 $
.Dt MBRTOC16 3
.Os
.Sh NAME
.Nm mbrtoc16
.Nd convert one UTF-8 encoded character to UTF-16
.Sh SYNOPSIS
.In uchar.h
.Ft size_t
.Fo mbrtoc16
.Fa "char16_t * restrict pc16"
.Fa "const char * restrict s"
.Fa "size_t n"
.Fa "mbstate_t * restrict mbs"
.Fc
.Sh DESCRIPTION
The
.Fn mbrtoc16
function examines at most
.Fa n
bytes of the multibyte character byte string pointed to by
.Fa s ,
converts those bytes to a wide character,
and encodes the wide character using UTF-16.
In some cases, it is necessary to call this function
twice to convert a single character.
.Pp
Conversion happens in accordance with the conversion state
.Pf * Fa mbs ,
which must be initialized to zero before the application's first call to
.Fn mbrtoc16 .
For this function,
.Pf * Fa mbs
stores information about both the state of the UTF-8 input encoding
and the state of the UTF-16 output encoding.
If the previous call did not return
.Po Vt size_t Pc Ns \-1 ,
.Fa mbs
can safely be reused without reinitialization.
.Pp
The input encoding that
.Fn mbrtoc16
uses for
.Fa s
is determined by the
.Dv LC_CTYPE
category of the current locale.
If the locale is changed without reinitialization of
.Pf * Fa mbs ,
the behaviour is undefined.
.Pp
Unlike
.Xr mbtowc 3 ,
.Fn mbrtoc16
accepts an incomplete byte sequence pointed to by
.Fa s
which does not form a complete character but is potentially part of
a valid character.
In this case, the function consumes all such bytes.
The conversion state saved in
.Pf * Fa mbs
will be used to restart the suspended conversion during the next call.
.Pp
On systems other than
.Ox
that support state-dependent encodings,
.Fa s
may point to a special sequence of bytes called a
.Dq shift sequence ;
see
.Xr mbrtowc 3
for details.
.Pp
The following arguments cause special processing:
.Bl -tag -width 012345678901
.It Fa pc16 No == Dv NULL
The conversion from a multibyte character to a wide character is performed
and the conversion state may be affected, but the resulting wide character
is discarded.
.It Fa s No == Dv NULL
The arguments
.Fa pc16
and
.Fa n
are ignored and starting or continuing the conversion with an empty string
is attempted, discarding the conversion result.
.It Fa mbs No == Dv NULL
An internal
.Vt mbstate_t
object specific to the
.Fn mbrtoc16
function is used instead of the
.Fa mbs
argument.
This internal object is automatically initialized at program startup
and never changed by any
.Em libc
function except
.Fn mbrtoc16 .
.Pp
If
.Fn mbrtoc16
is called with a
.Dv NULL
.Fa mbs
argument and that call returns
.Po Vt size_t Pc Ns \-1 ,
the internal conversion state of
.Fn mbrtoc16
becomes permanently undefined and there is no way
to reset it to any defined state.
Consequently, after such a mishap, it is not safe to call
.Fn mbrtoc16
with a
.Dv NULL
.Fa mbs
argument ever again until the program is terminated.
.El
.Sh RETURN VALUES
.Bl -tag -width 012345678901
.It 0
The bytes pointed to by
.Fa s
form a terminating NUL character.
If
.Fa pc16
is not
.Dv NULL ,
a NUL wide character has been stored in
.Pf * Fa pc16 .
.It positive
.Fa s
points to a valid character, and the value returned is the number of
bytes completing the character.
If
.Fa pc16
is not
.Dv NULL ,
the first UTF-16 code unit of the corresponding wide character
has been stored in
.Pf * Fa pc16 .
If it is a UTF-16 high surrogate, the function needs to be called
again to retrieve a second UTF-16 code unit, the low surrogate.
On
.Ox ,
this happens if and only if the return value is 4,
but this equivalence does not hold on other operating systems
that support input encodings other than UTF-8.
.It Po Vt size_t Pc Ns \-1
.Fa s
points to an illegal byte sequence which does not form a valid multibyte
character in the current locale, or
.Fa mbs
points to an invalid or uninitialized object.
.Va errno
is set to
.Er EILSEQ
or
.Er EINVAL ,
respectively.
The conversion state object pointed to by
.Fa mbs
is left in an undefined state and must be reinitialized before being
used again.
.It Po Vt size_t Pc Ns \-2
.Fa s
points to an incomplete byte sequence of length
.Fa n
which has been consumed and contains part of a valid multibyte character.
The character may be completed by calling the same function again with
.Fa s
pointing to one or more subsequent bytes of the multibyte character and
.Fa mbs
pointing to the conversion state object used during conversion of the
incomplete byte sequence.
.It Po Vt size_t Pc Ns \-3
The second 16-bit code unit resulting from a previous call
has been stored into
.Pf * Fa pc16 ,
without consuming any additional bytes from
.Fa s .
.El
.Sh ERRORS
.Fn mbrtoc16
causes an error in the following cases:
.Bl -tag -width Er
.It Bq Er EILSEQ
.Fa s
points to an invalid multibyte character.
.It Bq Er EINVAL
.Fa mbs
points to an invalid or uninitialized
.Vt mbstate_t
object.
.El
.Sh SEE ALSO
.Xr c16rtomb 3 ,
.Xr mbrtowc 3 ,
.Xr setlocale 3
.Sh STANDARDS
.Fn mbrtoc16
conforms to
.St -isoC-2011 .
.Sh HISTORY
.Fn mbrtoc16
has been available since
.Ox 7.4 .
.Sh CAVEATS
On operating systems other than
.Ox
that support input encodings other than UTF-8, inspecting the return value
is insufficient to tell whether the function needs to be called again.
If the return value is positive, inspecting
.Pf * Fa pc16
is also required to make that decision.
Consequently, passing a
.Dv NULL
pointer for the
.Fa pc16
argument is discouraged because it can result
in a well-defined but unknown output encoding state.
The simplest way to recover from such an unknown state is to
reinitialize the object pointed to by
.Fa mbs .
.Pp
The C11 standard only requires the
.Fa pc16
argument to be encoded according to UTF-16
if the predefined environment macro
.Dv __STDC_UTF_16__
is defined with a value of 1.
On
.Ox ,
.In uchar.h
provides this definition.
Other operating systems which do not define
.Dv __STDC_UTF_16__
could theoretically use a different,
implementation-defined output encoding for
.Fa pc16
instead of UTF-16.
Writing portable code for an arbitrary output encoding is impossible
because the rules when and how often the function needs to be called
again depend on the output encoding; the rules explained above are
specific to UTF-16.
Using UTF-16 as the output encoding of
.Fn mbrtoc16
becomes mandatory in C23.

102
lib/libc/locale/mbrtoc16.c Normal file
View File

@ -0,0 +1,102 @@
/* $OpenBSD: mbrtoc16.c,v 1.1 2023/08/20 15:02:51 schwarze Exp $ */
/*
* Copyright (c) 2022 Ingo Schwarze <schwarze@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <stdint.h>
#include <uchar.h>
#include <wchar.h>
/*
 * View of the caller's mbstate_t used by this file.
 * The layout has to stay compatible with struct _utf8_state
 * in the file citrus/citrus_utf8.c, and the "want" field may
 * only take values that the function _citrus_utf8_ctype_mbrtowc()
 * does not use itself.
 */
struct _utf16_state {
	wchar_t ch;
	int want;
};

/*
 * Decode one multibyte character from s and deliver it as UTF-16,
 * one code unit per call.  Code points that need a surrogate pair
 * produce the high surrogate first; the low surrogate is handed out
 * by the following call, which returns (size_t)-3 and consumes no input.
 * All other return values are those of mbrtowc(3).
 */
size_t
mbrtoc16(char16_t *pc16, const char *s, size_t n, mbstate_t *ps)
{
	static mbstate_t	 fallback;
	mbstate_t		*state;
	struct _utf16_state	*pending;
	wchar_t			 cp;
	size_t			 len;

	/*
	 * Without a caller-supplied state, use an object private to
	 * this function rather than the fallback object of mbrtowc(3):
	 * a program may interleave mbrtowc(3) and mbrtoc16(3) calls on
	 * different strings, and those must not disturb each other.
	 */
	state = ps != NULL ? ps : &fallback;
	pending = (struct _utf16_state *)state;

	/*
	 * NULL input means "empty string, discard the result".
	 * Dropping pc16 before the surrogate check below makes sure
	 * a left-over low surrogate is not written to the caller's object.
	 */
	if (s == NULL) {
		pc16 = NULL;
		s = "";
		n = 1;
	}

	/*
	 * A high surrogate went out on the previous call:
	 * emit the matching low surrogate and look at no input.
	 */
	if (pending->want == -3) {
		if (pc16 != NULL)
			*pc16 = 0xdc00 + (pending->ch & 0x3ff);
		pending->want = 0;
		pending->ch = 0;
		return (size_t)-3;
	}

	/*
	 * Let mbrtowc(3) do the decoding.  This covers continuing an
	 * incomplete character started earlier, a NUL character,
	 * a valid complete character, an incomplete character to be
	 * continued later, and a decoding error.
	 */
	len = mbrtowc(&cp, s, n, state);

	/* Errors and incomplete input are passed through untouched. */
	if (len >= (size_t)-2)
		return len;

	/* A code point of the basic multilingual plane fits directly. */
	if (cp <= UINT16_MAX) {
		if (pc16 != NULL)
			*pc16 = cp;
		return len;
	}

	/*
	 * Anything larger needs a surrogate pair: store the high
	 * surrogate now and remember that the low one is pending.
	 */
	if (pc16 != NULL)
		*pc16 = 0xd7c0 + (cp >> 10);
	pending->ch = cp;
	pending->want = -3;
	return len;
}

View File

@ -0,0 +1,18 @@
/* $OpenBSD: mbrtoc32.c,v 1.1 2023/08/20 15:02:51 schwarze Exp $ */
/*
* Written by Ingo Schwarze <schwarze@openbsd.org>
* and placed in the public domain on March 19, 2022.
*/
#include <uchar.h>
#include <wchar.h>
/*
 * Convert one multibyte character to UTF-32 by delegating to
 * mbrtowc(3).  Arguments and return values are those of mbrtowc(3),
 * except that a NULL ps selects a state object private to this
 * function instead of the one internal to mbrtowc(3).
 */
size_t
mbrtoc32(char32_t *pc32, const char *s, size_t n, mbstate_t *ps)
{
	static mbstate_t	 mbs;

	if (ps == NULL)
		ps = &mbs;

	/*
	 * char32_t is an unsigned type, so a pointer to it is not
	 * implicitly convertible to a pointer to wchar_t where that
	 * type is signed; convert explicitly.
	 * NOTE(review): like the implicit conversion it replaces, this
	 * relies on wchar_t having the same width as char32_t and
	 * holding Unicode code points -- confirm for new platforms.
	 */
	return mbrtowc((wchar_t *)pc32, s, n, ps);
}

View File

@ -1,6 +1,8 @@
.\" $OpenBSD: mbrtowc.3,v 1.5 2016/02/08 09:56:16 schwarze Exp $
.\" $OpenBSD: mbrtowc.3,v 1.6 2023/08/20 15:02:51 schwarze Exp $
.\" $NetBSD: mbrtowc.3,v 1.5 2003/09/08 17:54:31 wiz Exp $
.\"
.\" Copyright (c)2023 Ingo Schwarze <schwarze@openbsd.org>
.\" Copyright (c)2010 Stefan Sperling <stsp@openbsd.org>
.\" Copyright (c)2002 Citrus Project,
.\" All rights reserved.
.\"
@ -25,27 +27,41 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.Dd $Mdocdate: February 8 2016 $
.Dd $Mdocdate: August 20 2023 $
.Dt MBRTOWC 3
.Os
.Sh NAME
.Nm mbrtowc
.Nd converts a multibyte character to a wide character (restartable)
.Nm mbrtowc ,
.Nm mbrtoc32
.Nd convert a multibyte character to a wide character (restartable)
.Sh SYNOPSIS
.In wchar.h
.Ft size_t
.Fn mbrtowc "wchar_t * restrict wc" "const char * restrict s" "size_t n" \
"mbstate_t * restrict mbs"
.Fo mbrtowc
.Fa "wchar_t * restrict wc"
.Fa "const char * restrict s"
.Fa "size_t n"
.Fa "mbstate_t * restrict mbs"
.Fc
.In uchar.h
.Ft size_t
.Fo mbrtoc32
.Fa "char32_t * restrict wc"
.Fa "const char * restrict s"
.Fa "size_t n"
.Fa "mbstate_t * restrict mbs"
.Fc
.Sh DESCRIPTION
The
.Fn mbrtowc
function examines at most
and
.Fn mbrtoc32
functions examine at most
.Fa n
bytes of the multibyte character byte string pointed to by
.Fa s ,
converts those bytes to a wide character, and stores the wide character
in the wchar_t object pointed to by
.Fa wc
convert those bytes to a wide character, and store the wide character into
.Pf * Fa wc
if
.Fa wc
is not
@ -54,46 +70,47 @@ and
.Fa s
points to a valid character.
.Pp
Conversion happens in accordance with the conversion state described
by the mbstate_t object pointed to by
.Fa mbs .
The mbstate_t object must be initialized to zero before the application's
first call to
.Fn mbrtowc .
If the previous call to
Conversion happens in accordance with the conversion state
.Pf * Fa mbs ,
which must be initialized to zero before the application's first call to
.Fn mbrtowc
did not return (size_t)-1, the mbstate_t object can safely be reused
without reinitialization.
or
.Fn mbrtoc32 .
If the previous call did not return
.Po Vt size_t Pc Ns \-1 ,
.Fa mbs
can safely be reused without reinitialization.
.Pp
The behaviour of
The input encoding that
.Fn mbrtowc
is affected by the
and
.Fn mbrtoc32
use for
.Fa s
is determined by the
.Dv LC_CTYPE
category of the current locale.
If the locale is changed without reinitialization of the mbstate_t object
pointed to by
.Fa mbs ,
the behaviour of
.Fn mbrtowc
is undefined.
If the locale is changed without reinitialization of
.Pf * Fa mbs ,
the behaviour is undefined.
.Pp
Unlike
.Xr mbtowc 3 ,
.Fn mbrtowc
will accept an incomplete byte sequence pointed to by
and
.Fn mbrtoc32
accept an incomplete byte sequence pointed to by
.Fa s
which does not form a complete character but is potentially part of
a valid character.
In this case,
.Fn mbrtowc
consumes all such bytes.
The conversion state saved in the mbstate_t object pointed to by
.Fa mbs
will be used to restart the suspended conversion during the next
call to
.Fn mbrtowc .
In this case, both functions consume all such bytes.
The conversion state saved in
.Pf * Fa mbs
will be used to restart the suspended conversion during the next call.
.Pp
In state-dependent encodings,
On systems other than
.Ox
that support state-dependent encodings,
.Fa s
may point to a special sequence of bytes called a
.Dq shift sequence .
@ -104,61 +121,58 @@ can switch e.g. from ASCII (which uses one byte per character) to
JIS X 0208 (which uses two bytes per character).
Shift sequence bytes correspond to no individual wide character, so
.Fn mbrtowc
treats them as if they were part of the subsequent multibyte character.
and
.Fn mbrtoc32
treat them as if they were part of the subsequent multibyte character.
Therefore they do contribute to the number of bytes in the multibyte character.
.Pp
Special cases in interpretation of arguments are as follows:
The following arguments cause special processing:
.Bl -tag -width 012345678901
.It "wc == NULL "
.It Fa wc No == Dv NULL
The conversion from a multibyte character to a wide character is performed
and the conversion state may be affected, but the resulting wide character
is discarded.
.Pp
This can be used to find out how many bytes are contained in the
multibyte character pointed to by
.Fa s .
.It "s == NULL "
.Fn mbrtowc
ignores
.It Fa s No == Dv NULL
The arguments
.Fa wc
and
.Fa n ,
and behaves equivalent to
.Bd -literal -offset indent
mbrtowc(NULL, "", 1, mbs);
.Ed
.Pp
which attempts to use the mbstate_t object pointed to by
.Fa mbs
to start or continue conversion using the empty string as input,
and discards the conversion result.
.Pp
.Fa n
are ignored and starting or continuing the conversion with an empty string
is attempted, discarding the conversion result.
If conversion succeeds, this call always returns zero.
Unlike
.Xr mbtowc 3 ,
the value returned does not indicate whether the current encoding of
the locale is state-dependent, i.e. uses shift sequences.
.It "mbs == NULL "
.It Fa mbs No == Dv NULL
.Fn mbrtowc
uses its own internal state object to keep the conversion state,
instead of an mbstate_t object pointed to by
.Fa mbs .
This internal conversion state is initialized once at program startup.
It is not safe to call
and
.Fn mbrtoc32
each use their own internal state object instead of the
.Fa mbs
argument.
Both internal state objects are initialized at startup time of the program,
and no other libc function ever changes either of them.
.Pp
If
.Fn mbrtowc
again with a
or
.Fn mbrtoc32
is called with a
.Dv NULL
.Fa mbs
argument if
.Fn mbrtowc
returned (size_t)-1 because at this point the internal conversion state
is undefined.
.Pp
Calling any other functions in
.Em libc
never changes the internal
conversion state object of
.Fn mbrtowc .
argument and that call returns
.Po Vt size_t Pc Ns \-1 ,
the internal conversion state of the respective function becomes
permanently undefined and there is no way to reset it to any defined state.
Consequently, after such a mishap, it is not safe
to call the same function with a
.Dv NULL
.Fa mbs
argument ever again until the program is terminated.
.El
.Sh RETURN VALUES
.Bl -tag -width 012345678901
@ -183,14 +197,18 @@ is not
the corresponding wide character has been stored in the wchar_t object
pointed to by
.Fa wc .
.It (size_t)-1
.It Po Vt size_t Pc Ns \-1
.Fa s
points to an illegal byte sequence which does not form a valid multibyte
character in the current locale.
.Fn mbrtowc
sets
character in the current locale, or
.Fa mbs
points to an invalid or uninitialized object.
.Va errno
to EILSEQ.
is set to
.Er EILSEQ
or
.Er EINVAL ,
respectively.
The conversion state object pointed to by
.Fa mbs
is left in an undefined state and must be reinitialized before being
@ -198,6 +216,8 @@ used again.
.Pp
Because applications using
.Fn mbrtowc
or
.Fn mbrtoc32
are shielded from the specifics of the multibyte character encoding scheme,
it is impossible to repair byte sequences containing encoding errors.
Such byte sequences must be treated as invalid and potentially malicious input.
@ -205,66 +225,90 @@ Applications must stop processing the byte string pointed to by
.Fa s
and either discard any wide characters already converted, or cope with
truncated input.
.It (size_t)-2
.It Po Vt size_t Pc Ns \-2
.Fa s
points to an incomplete byte sequence of length
.Fa n
which has been consumed and contains part of a valid multibyte character.
The character may be completed by calling
.Fn mbrtowc
again with
The character may be completed by calling the same function again with
.Fa s
pointing to one or more subsequent bytes of the multibyte character and
.Fa mbs
pointing to the conversion state object used during conversion of the
incomplete byte sequence.
.It Po Vt size_t Pc Ns \-3
The next character resulting from a previous call has been stored into
.Fa wc ,
without consuming any additional bytes from
.Fa s .
This never happens for
.Fn mbrtowc ,
and on
.Ox ,
it never happens for
.Fn mbrtoc32
either.
.El
.Sh ERRORS
The
.Fn mbrtowc
function may cause an error in the following cases:
and
.Fn mbrtoc32
cause an error in the following cases:
.Bl -tag -width Er
.It Bq Er EILSEQ
.Fa s
points to an invalid multibyte character.
.It Bq Er EINVAL
.Fa mbs
points to an invalid or uninitialized mbstate_t object.
points to an invalid or uninitialized
.Vt mbstate_t
object.
.El
.Sh SEE ALSO
.Xr mbrlen 3 ,
.Xr mbtowc 3 ,
.Xr setlocale 3
.Xr setlocale 3 ,
.Xr wcrtomb 3
.Sh STANDARDS
The
.Fn mbrtowc
function conforms to
.\" .St -isoC-amd1 .
ISO/IEC 9899/AMD1:1995
.Pq Dq ISO C90, Amendment 1 .
The restrict qualifier is added at
.\" .St -isoC99 .
ISO/IEC 9899:1999
.Pq Dq ISO C99 .
conforms to
.St -isoC-amd1 .
The restrict qualifier was added at
.St -isoC-99 .
.Pp
.Fn mbrtoc32
conforms to
.St -isoC-2011 .
.Sh HISTORY
.Fn mbrtowc
has been available since
.Ox 3.8
and has provided support for UTF-8 since
.Ox 4.8 .
.Pp
.Fn mbrtoc32
has been available since
.Ox 7.4 .
.Sh CAVEATS
.Fn mbrtowc
is not suitable for programs that care about internals of the character
and
.Fn mbrtoc32
are not suitable for programs that care about internals of the character
encoding scheme used by the byte string pointed to by
.Fa s .
.Pp
It is possible that
.Fn mbrtowc
fails because of locale configuration errors.
It is possible that these functions
fail because of locale configuration errors.
An
.Dq invalid
character sequence may simply be encoded in a different encoding than that
of the current locale.
.Pp
The special cases for
.Fa s
== NULL and
.Fa mbs
== NULL do not make any sense.
.Fa s No == Dv NULL
and
.Fa mbs No == Dv NULL
do not make any sense.
Instead of passing
.Dv NULL
for

View File

@ -1,6 +1,7 @@
.\" $OpenBSD: wcrtomb.3,v 1.10 2015/03/22 18:02:11 stsp Exp $
.\" $OpenBSD: wcrtomb.3,v 1.11 2023/08/20 15:02:51 schwarze Exp $
.\" $NetBSD: wcrtomb.3,v 1.4 2003/09/08 17:54:31 wiz Exp $
.\"
.\" Copyright (c)2023 Ingo Schwarze <schwarze@openbsd.org>
.\" Copyright (c)2002 Citrus Project,
.\" All rights reserved.
.\"
@ -25,117 +26,161 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.Dd $Mdocdate: March 22 2015 $
.Dd $Mdocdate: August 20 2023 $
.Dt WCRTOMB 3
.Os
.\" ----------------------------------------------------------------------
.Sh NAME
.Nm wcrtomb
.Nd converts a wide character to a multibyte character (restartable)
.\" ----------------------------------------------------------------------
.Nm wcrtomb ,
.Nm c32rtomb
.Nd convert a wide character to a multibyte character
.Sh SYNOPSIS
.In wchar.h
.Ft size_t
.Fn wcrtomb "const char * restrict s" "wchar_t wc" "mbstate_t * restrict ps"
.\" ----------------------------------------------------------------------
.Fo wcrtomb
.Fa "char * restrict s"
.Fa "wchar_t wc"
.Fa "mbstate_t * restrict mbs"
.Fc
.In uchar.h
.Ft size_t
.Fo c32rtomb
.Fa "char * restrict s"
.Fa "char32_t wc"
.Fa "mbstate_t * restrict mbs"
.Fc
.Sh DESCRIPTION
.Fn wcrtomb
converts the wide character given by
and
.Fn c32rtomb
convert the wide character
.Fa wc
to the corresponding multibyte character, and stores up to
to the corresponding multibyte character, and store up to
.Dv MB_CUR_MAX
bytes in the array pointed to by
.Fa s
if
.Fa s
is not a null pointer.
is not a
.Dv NULL
pointer.
The interpretation of
.Fa wc
is implementation-defined.
On
.Ox ,
.Vt wchar_t
and
.Vt char32_t
are of the same width and both are always interpreted as Unicode codepoints.
.Pp
The behaviour of
The output encoding that
.Fn wcrtomb
is affected by the
and
.Fn c32rtomb
use in
.Fa s
is determined by the
.Dv LC_CTYPE
category of the current locale.
.Ox
only supports UTF-8 and ASCII output,
and these functions are only useful for UTF-8.
.Pp
These are the special cases:
The following arguments cause special processing:
.Bl -tag -width 012345678901
.It "wc == 0"
For state-dependent encodings,
.Fn wcrtomb
stores a null byte preceded by a special byte sequence (if any)
to return to an initial state to the array pointed by
.Fa s ,
and the state object pointed by
.Fa ps
also returned to an initial state.
.It "s == NULL"
.Fn wcrtomb
just places
.Fa ps
into an initial state.
It is equivalent to the following call:
.Bd -literal -offset indent
wcrtomb(buf, L'\e0', ps);
.Ed
.Pp
Here,
.Fa buf
is a dummy buffer.
In this case,
.Fa wc
is ignored.
.It "ps == NULL"
.It Fa wc No == 0
A NUL byte is stored to
.Pf * Fa s
and the state object pointed to by
.Fa mbs
is reset to the initial state.
On operating systems other than
.Ox
that support state-dependent multibyte encodings, a special byte sequence
.Pq Dq shift sequence
is written before the NUL byte to return to the initial state
if that is required by the output encoding
and by the current output encoding state.
.It Fa mbs No == Dv NULL
.Fn wcrtomb
uses its own internal state object to keep the conversion state,
instead of
.Fa ps
mentioned in this manual page.
.Pp
Calling any other functions in
and
.Fn c32rtomb
each use their own internal state object instead of the
.Fa mbs
argument.
Both internal state objects are initialized at startup time of the program,
and no other
.Em libc
never change the internal
state of
.Fn mbrtowc ,
which is initialized at startup time of the program.
function ever changes either of them.
.It Fa s No == Dv NULL
The object pointed to by
.Fa mbs ,
or the internal object if
.Fa mbs
is a
.Dv NULL
pointer, is reset to the initial state,
.Fa wc
is ignored, and 1 is returned.
.El
.\" ----------------------------------------------------------------------
.Sh RETURN VALUES
.Fn wcrtomb
returns the number of bytes (including any shift sequences)
and
.Fn c32rtomb
return the number of bytes (including any shift sequences)
which are stored in the array pointed to by
.Fa s .
.Fa s ,
or 1 if
.Fa s
is
.Dv NULL .
If
.Fa wc
is not a valid wide character,
.Fn wcrtomb
returns (size_t)-1
and sets
is not a valid wide character
or if it cannot be represented in the multibyte encoding selected with
.Dv LC_CTYPE ,
both functions return
.Po Vt size_t Pc Ns \-1
and set
.Va errno
to indicate error.
.\" ----------------------------------------------------------------------
to indicate the error.
.Sh ERRORS
.Fn wcrtomb
may cause an error in the following cases:
and
.Fn c32rtomb
cause an error in the following cases:
.Bl -tag -width Er
.It Bq Er EILSEQ
.Fa wc
is not a valid wide character.
is not a valid wide character or cannot be represented using
.Dv LC_CTYPE .
.It Bq Er EINVAL
.Fa ps
points to an invalid or uninitialized mbstate_t object.
.Fa mbs
points to an invalid or uninitialized
.Vt mbstate_t
object.
.El
.\" ----------------------------------------------------------------------
.Sh SEE ALSO
.Xr mbrtowc 3 ,
.Xr setlocale 3 ,
.Xr wctomb 3
.\" ----------------------------------------------------------------------
.Sh STANDARDS
The
.Fn wcrtomb
function conforms to
.\" .St -isoC-amd1 .
ISO/IEC 9899/AMD1:1995
.Pq Dq ISO C90, Amendment 1 .
The restrict qualifier is added at
.\" .St -isoC99 .
ISO/IEC 9899/1999
.Pq Dq ISO C99 .
conforms to
.St -isoC-amd1 .
The restrict qualifier was added at
.St -isoC-99 .
.Pp
.Fn c32rtomb
conforms to
.St -isoC-2011 .
.Sh HISTORY
.Fn wcrtomb
has been available since
.Ox 3.8
and has provided support for UTF-8 since
.Ox 4.8 .
.Pp
.Fn c32rtomb
has been available since
.Ox 7.4 .

View File

@ -1,4 +1,4 @@
major=97
minor=0
minor=1
# note: If changes were made to include/thread_private.h or if system calls
# were added/changed then librthread/shlib_version must also be updated.