mirror of
https://github.com/openbsd/src.git
synced 2025-01-04 23:35:36 -08:00
Provide C11 <uchar.h>.
OK millert@. Tested by naddy@ in a bulk and by matthieu@ in the new foot(1) port. I originally wrote the code in 2022 at the prodding of espie@. Using one improvement to a manual page from jmc@.
This commit is contained in:
parent
7927db41a2
commit
46c354aa2b
@ -743,7 +743,7 @@
|
||||
./usr/lib/crtendS.o
|
||||
./usr/lib/gcrt0.o
|
||||
./usr/lib/libagentx.so.1.1
|
||||
./usr/lib/libc.so.97.0
|
||||
./usr/lib/libc.so.97.1
|
||||
./usr/lib/libcbor.so.2.0
|
||||
./usr/lib/libcrypto.so.52.0
|
||||
./usr/lib/libcurses.so.14.0
|
||||
@ -3008,7 +3008,7 @@
|
||||
./usr/share/relink/kernel.tgz
|
||||
./usr/share/relink/usr
|
||||
./usr/share/relink/usr/lib
|
||||
./usr/share/relink/usr/lib/libc.so.97.0.a
|
||||
./usr/share/relink/usr/lib/libc.so.97.1.a
|
||||
./usr/share/relink/usr/lib/libcrypto.so.52.0.a
|
||||
./usr/share/relink/usr/libexec
|
||||
./usr/share/relink/usr/libexec/ld.so.a
|
||||
|
@ -1435,6 +1435,7 @@
|
||||
./usr/include/time.h
|
||||
./usr/include/tls.h
|
||||
./usr/include/ttyent.h
|
||||
./usr/include/uchar.h
|
||||
./usr/include/ufs
|
||||
./usr/include/ufs/ext2fs
|
||||
./usr/include/ufs/ext2fs/ext2fs.h
|
||||
@ -2295,6 +2296,7 @@
|
||||
./usr/share/man/man3/btowc.3
|
||||
./usr/share/man/man3/btree.3
|
||||
./usr/share/man/man3/bzero.3
|
||||
./usr/share/man/man3/c16rtomb.3
|
||||
./usr/share/man/man3/cacos.3
|
||||
./usr/share/man/man3/cacosh.3
|
||||
./usr/share/man/man3/carg.3
|
||||
@ -2715,6 +2717,7 @@
|
||||
./usr/share/man/man3/malloc.3
|
||||
./usr/share/man/man3/mblen.3
|
||||
./usr/share/man/man3/mbrlen.3
|
||||
./usr/share/man/man3/mbrtoc16.3
|
||||
./usr/share/man/man3/mbrtowc.3
|
||||
./usr/share/man/man3/mbsinit.3
|
||||
./usr/share/man/man3/mbsrtowcs.3
|
||||
|
@ -1,4 +1,4 @@
|
||||
# $OpenBSD: Makefile,v 1.230 2022/08/30 18:50:06 krw Exp $
|
||||
# $OpenBSD: Makefile,v 1.231 2023/08/20 15:02:50 schwarze Exp $
|
||||
# $NetBSD: Makefile,v 1.59 1996/05/15 21:36:43 jtc Exp $
|
||||
|
||||
# @(#)Makefile 5.45.1.1 (Berkeley) 5/6/91
|
||||
@ -27,7 +27,7 @@ FILES= a.out.h ar.h asr.h assert.h \
|
||||
signal.h siphash.h sndio.h spawn.h stdbool.h stddef.h \
|
||||
stdio.h stdlib.h string.h strings.h sysexits.h \
|
||||
tar.h tgmath.h tib.h time.h ttyent.h \
|
||||
unistd.h utime.h utmp.h uuid.h \
|
||||
uchar.h unistd.h utime.h utmp.h uuid.h \
|
||||
vis.h \
|
||||
wchar.h wctype.h
|
||||
|
||||
|
38
include/uchar.h
Normal file
38
include/uchar.h
Normal file
@ -0,0 +1,38 @@
|
||||
/* $OpenBSD: uchar.h,v 1.1 2023/08/20 15:02:50 schwarze Exp $ */
|
||||
/*
|
||||
* Written by Ingo Schwarze <schwarze@openbsd.org>
|
||||
* and placed in the public domain on March 19, 2022.
|
||||
*/
|
||||
|
||||
#ifndef _UCHAR_H_
|
||||
#define _UCHAR_H_
|
||||
|
||||
#include <sys/cdefs.h>
|
||||
#include <sys/_types.h>
|
||||
|
||||
#ifndef _MBSTATE_T_DEFINED_
|
||||
#define _MBSTATE_T_DEFINED_
|
||||
typedef __mbstate_t mbstate_t;
|
||||
#endif
|
||||
|
||||
#ifndef _SIZE_T_DEFINED_
|
||||
#define _SIZE_T_DEFINED_
|
||||
typedef __size_t size_t;
|
||||
#endif
|
||||
|
||||
#define __STDC_UTF_16__ 1
|
||||
#define __STDC_UTF_32__ 1
|
||||
|
||||
/*
 * C++11 and later provide char16_t and char32_t as built-in types,
 * so only declare them for C and for older C++ dialects.
 */
#if !defined(__cplusplus) || __cplusplus < 201103L
typedef __uint16_t	char16_t;
typedef __uint32_t	char32_t;
#endif
|
||||
|
||||
__BEGIN_DECLS
|
||||
size_t mbrtoc16(char16_t * __restrict, const char * __restrict, size_t,
|
||||
mbstate_t * __restrict);
|
||||
size_t c16rtomb(char * __restrict, char16_t, mbstate_t * __restrict);
|
||||
size_t mbrtoc32(char32_t * __restrict, const char * __restrict, size_t,
|
||||
mbstate_t * __restrict);
|
||||
size_t c32rtomb(char * __restrict, char32_t, mbstate_t * __restrict);
|
||||
__END_DECLS
|
||||
|
||||
#endif /* !_UCHAR_H_ */
|
@ -958,6 +958,8 @@ SipHash_Update
|
||||
/* locale */
|
||||
__mb_cur_max
|
||||
btowc
|
||||
c16rtomb
|
||||
c32rtomb
|
||||
duplocale
|
||||
freelocale
|
||||
isalnum_l
|
||||
@ -1007,6 +1009,8 @@ mbsnrtowcs
|
||||
mbsrtowcs
|
||||
mbstowcs
|
||||
newlocale
|
||||
mbrtoc16
|
||||
mbrtoc32
|
||||
mbtowc
|
||||
nl_langinfo
|
||||
nl_langinfo_l
|
||||
|
17
lib/libc/hidden/uchar.h
Normal file
17
lib/libc/hidden/uchar.h
Normal file
@ -0,0 +1,17 @@
|
||||
/* $OpenBSD: uchar.h,v 1.1 2023/08/20 15:02:51 schwarze Exp $ */
|
||||
/*
|
||||
* Written by Ingo Schwarze <schwarze@openbsd.org>
|
||||
* and placed in the public domain on March 19, 2022.
|
||||
*/
|
||||
|
||||
#ifndef _LIBC_UCHAR_H_
|
||||
#define _LIBC_UCHAR_H_
|
||||
|
||||
#include_next <uchar.h>
|
||||
|
||||
PROTO_STD_DEPRECATED(c16rtomb);
|
||||
PROTO_STD_DEPRECATED(c32rtomb);
|
||||
PROTO_STD_DEPRECATED(mbrtoc16);
|
||||
PROTO_STD_DEPRECATED(mbrtoc32);
|
||||
|
||||
#endif /* !_LIBC_UCHAR_H_ */
|
@ -1,14 +1,15 @@
|
||||
# $OpenBSD: Makefile.inc,v 1.26 2022/07/27 20:00:11 guenther Exp $
|
||||
# $OpenBSD: Makefile.inc,v 1.27 2023/08/20 15:02:51 schwarze Exp $
|
||||
|
||||
# locale sources
|
||||
.PATH: ${LIBCSRCDIR}/locale
|
||||
|
||||
SRCS+= btowc.c _def_messages.c _def_monetary.c _def_numeric.c _def_time.c \
|
||||
SRCS+= _def_messages.c _def_monetary.c _def_numeric.c _def_time.c \
|
||||
localeconv.c nl_langinfo.c nl_langinfo_l.c setlocale.c \
|
||||
duplocale.c freelocale.c newlocale.c uselocale.c \
|
||||
__mb_cur_max.c _CurrentRuneLocale.c _get_locname.c \
|
||||
isctype_l.c iswctype.c iswctype_l.c wctype.c \
|
||||
mblen.c mbrlen.c mbstowcs.c mbtowc.c multibyte_citrus.c wcscoll.c \
|
||||
mblen.c mbrlen.c mbrtoc16.c mbrtoc32.c mbstowcs.c mbtowc.c \
|
||||
btowc.c c16rtomb.c c32rtomb.c multibyte_citrus.c wcscoll.c \
|
||||
wcscoll_l.c \
|
||||
wcstombs.c wctob.c wctomb.c wcstof.c wcstod.c wcstold.c wcstol.c \
|
||||
wcstoul.c wcstoll.c wcstoull.c wcstoimax.c wcstoumax.c \
|
||||
@ -17,7 +18,8 @@ SRCS+= btowc.c _def_messages.c _def_monetary.c _def_numeric.c _def_time.c \
|
||||
|
||||
MAN+= nl_langinfo.3 setlocale.3 newlocale.3 uselocale.3 localeconv.3 \
|
||||
iswalnum.3 towlower.3 \
|
||||
btowc.3 mblen.3 mbrlen.3 mbrtowc.3 mbsinit.3 mbsrtowcs.3 \
|
||||
btowc.3 c16rtomb.3 mblen.3 mbrlen.3 mbrtoc16.3 mbrtowc.3 \
|
||||
mbsinit.3 mbsrtowcs.3 \
|
||||
mbstowcs.3 mbtowc.3 wcrtomb.3 wcscoll.3 wcsrtombs.3 wcstod.3 \
|
||||
wcstol.3 wcstombs.3 wcsxfrm.3 wctob.3 wctomb.3 \
|
||||
wctype.3 iswctype.3 wctrans.3 towctrans.3 wcwidth.3
|
||||
|
207
lib/libc/locale/c16rtomb.3
Normal file
207
lib/libc/locale/c16rtomb.3
Normal file
@ -0,0 +1,207 @@
|
||||
.\" $OpenBSD: c16rtomb.3,v 1.1 2023/08/20 15:02:51 schwarze Exp $
|
||||
.\"
|
||||
.\" Copyright (c) 2023 Ingo Schwarze <schwarze@openbsd.org>
|
||||
.\"
|
||||
.\" Permission to use, copy, modify, and distribute this software for any
|
||||
.\" purpose with or without fee is hereby granted, provided that the above
|
||||
.\" copyright notice and this permission notice appear in all copies.
|
||||
.\"
|
||||
.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
.\"
|
||||
.Dd $Mdocdate: August 20 2023 $
|
||||
.Dt C16RTOMB 3
|
||||
.Os
|
||||
.Sh NAME
|
||||
.Nm c16rtomb
|
||||
.Nd convert one UTF-16 encoded character to UTF-8
|
||||
.Sh SYNOPSIS
|
||||
.In uchar.h
|
||||
.Ft size_t
|
||||
.Fo c16rtomb
|
||||
.Fa "char * restrict s"
|
||||
.Fa "char16_t c16"
|
||||
.Fa "mbstate_t * restrict mbs"
|
||||
.Fc
|
||||
.Sh DESCRIPTION
|
||||
This function converts one UTF-16 encoded character to UTF-8.
|
||||
In some cases, it is necessary to call the function twice
|
||||
to convert a single character.
|
||||
.Pp
|
||||
First, call
|
||||
.Fn c16rtomb
|
||||
passing the first 16-bit code unit of the UTF-16 encoded character in
|
||||
.Fa c16 .
|
||||
If the return value is greater than 0, the character is part of the UCS-2
|
||||
range, the complete UTF-8 encoding consisting of at most
|
||||
.Dv MB_CUR_MAX
|
||||
bytes has been written to the storage starting at
|
||||
.Fa s ,
|
||||
and the function does not need to be called again.
|
||||
.Pp
|
||||
If the return value is 0, the first 16-bit code unit is a UTF-16
|
||||
high surrogate and the function needs to be called a second time,
|
||||
this time passing the second 16-bit code unit of the UTF-16 encoded
|
||||
character in
|
||||
.Fa c16
|
||||
and passing the same
|
||||
.Fa mbs
|
||||
again that was also passed to the first call.
|
||||
If the second 16-bit code unit is a UTF-16 low surrogate,
|
||||
the second call returns a value greater than 0,
|
||||
the surrogate pair represents a Unicode code point
|
||||
beyond the basic multilingual plane,
|
||||
and the complete UTF-8 encoding consisting of at most
|
||||
.Dv MB_CUR_MAX
|
||||
bytes is written to the storage starting at
|
||||
.Fa s .
|
||||
.Pp
|
||||
The output encoding that
|
||||
.Fn c16rtomb
|
||||
uses in
|
||||
.Fa s
|
||||
is determined by the
|
||||
.Dv LC_CTYPE
|
||||
category of the current locale.
|
||||
.Ox
|
||||
only supports UTF-8 and ASCII output,
|
||||
and this function is only useful for UTF-8.
|
||||
.Pp
|
||||
The following arguments cause special processing:
|
||||
.Bl -tag -width 012345678901
|
||||
.It Fa c16 No == 0
|
||||
A NUL byte is stored to
|
||||
.Pf * Fa s
|
||||
and the state object pointed to by
|
||||
.Fa mbs
|
||||
is reset to the initial state.
|
||||
On operating systems other than
|
||||
.Ox
|
||||
that support state-dependent multibyte encodings,
|
||||
a special byte sequence
|
||||
.Pq Dq shift sequence
|
||||
is written before the NUL byte to return to the initial state
|
||||
if that is required by the output encoding
|
||||
and by the current output encoding state.
|
||||
.It Fa mbs No == Dv NULL
|
||||
An internal
|
||||
.Vt mbstate_t
|
||||
object specific to the
|
||||
.Fn c16rtomb
|
||||
function is used instead of the
|
||||
.Fa mbs
|
||||
argument.
|
||||
This internal object is automatically initialized at program startup
|
||||
and never changed by any
|
||||
.Em libc
|
||||
function except
|
||||
.Fn c16rtomb .
|
||||
.It Fa s No == Dv NULL
|
||||
The object pointed to by
|
||||
.Fa mbs ,
|
||||
or the internal object if
|
||||
.Fa mbs
|
||||
is a
|
||||
.Dv NULL
|
||||
pointer, is reset to its initial state,
|
||||
.Fa c16
|
||||
is ignored, and 1 is returned.
|
||||
.El
|
||||
.Sh RETURN VALUES
|
||||
.Fn c16rtomb
|
||||
returns the number of bytes written to
|
||||
.Fa s
|
||||
on success or
|
||||
.Po Vt size_t Pc Ns \-1
|
||||
on failure, specifically:
|
||||
.Bl -tag -width 10n
|
||||
.It 0
|
||||
The first 16-bit code unit was successfully decoded
|
||||
as a UTF-16 high surrogate.
|
||||
Nothing was written to
|
||||
.Fa s
|
||||
yet.
|
||||
.It 1
|
||||
The first 16-bit code unit was successfully decoded
|
||||
as a character in the range U+0000 to U+007F, or
|
||||
.Fa s
|
||||
is
|
||||
.Dv NULL .
|
||||
.It 2
|
||||
The first 16-bit code unit was successfully decoded
|
||||
as a character in the range U+0080 to U+07FF.
|
||||
.It 3
|
||||
The first 16-bit code unit was successfully decoded
|
||||
as a character in the range U+0800 to U+D7FF or U+E000 to U+FFFF.
|
||||
.It 4
|
||||
The second 16-bit code unit was successfully decoded as a UTF-16 low
|
||||
surrogate, resulting in a character in the range U+10000 to U+10FFFF.
|
||||
.It greater
|
||||
Return values greater than 4 may occur on operating systems other than
|
||||
.Ox
|
||||
for output encodings other than UTF-8, in particular when a shift
|
||||
sequence was written.
|
||||
.It Po Vt size_t Pc Ns \-1
|
||||
UTF-16 input decoding or
|
||||
.Dv LC_CTYPE
|
||||
output encoding failed, or
|
||||
.Fa mbs
|
||||
is invalid.
|
||||
Nothing was written to
|
||||
.Fa s ,
|
||||
and
|
||||
.Va errno
|
||||
has been set.
|
||||
.El
|
||||
.Sh ERRORS
|
||||
.Fn c16rtomb
|
||||
causes an error in the following cases:
|
||||
.Bl -tag -width Er
|
||||
.It Bq Er EILSEQ
|
||||
UTF-16 input decoding failed because the first 16-bit code unit
|
||||
is neither a UCS-2 character nor a UTF-16 high surrogate,
|
||||
or because the second 16-bit code unit is not a UTF-16 low surrogate;
|
||||
or output encoding failed because the resulting character
|
||||
cannot be represented in the output encoding selected with
|
||||
.Dv LC_CTYPE .
|
||||
.It Bq Er EINVAL
|
||||
.Fa mbs
|
||||
points to an invalid or uninitialized
|
||||
.Vt mbstate_t
|
||||
object.
|
||||
.El
|
||||
.Sh SEE ALSO
|
||||
.Xr mbrtoc16 3 ,
|
||||
.Xr setlocale 3 ,
|
||||
.Xr wcrtomb 3
|
||||
.Sh STANDARDS
|
||||
.Fn c16rtomb
|
||||
conforms to
|
||||
.St -isoC-2011 .
|
||||
.Sh HISTORY
|
||||
.Fn c16rtomb
|
||||
has been available since
|
||||
.Ox 7.4 .
|
||||
.Sh CAVEATS
|
||||
The C11 standard only requires the
|
||||
.Fa c16
|
||||
argument to be interpreted according to UTF-16
|
||||
if the predefined environment macro
|
||||
.Dv __STDC_UTF_16__
|
||||
is defined with a value of 1.
|
||||
On
|
||||
.Ox ,
|
||||
.In uchar.h
|
||||
provides this definition.
|
||||
Other operating systems which do not define
|
||||
.Dv __STDC_UTF_16__
|
||||
could theoretically use a different,
|
||||
implementation-defined input encoding for
|
||||
.Fa c16
|
||||
instead of UTF-16.
|
||||
Using UTF-16 becomes mandatory in C23.
|
100
lib/libc/locale/c16rtomb.c
Normal file
100
lib/libc/locale/c16rtomb.c
Normal file
@ -0,0 +1,100 @@
|
||||
/* $OpenBSD: c16rtomb.c,v 1.1 2023/08/20 15:02:51 schwarze Exp $ */
|
||||
/*
|
||||
* Copyright (c) 2022 Ingo Schwarze <schwarze@openbsd.org>
|
||||
*
|
||||
* Permission to use, copy, modify, and distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <errno.h>
|
||||
#include <uchar.h>
|
||||
#include <wchar.h>
|
||||
|
||||
/*
|
||||
* Keep this structure compatible with
|
||||
* struct _utf8_state in the file citrus/citrus_utf8.c.
|
||||
*/
|
||||
struct _utf16_state {
	wchar_t	ch;	/* code point bits taken from a pending high surrogate */
	int	want;	/* -3 while a low surrogate is still expected */
};

/*
 * Convert one UTF-16 code unit to a multibyte character by way
 * of wcrtomb(3).
 *
 * s	output buffer; if NULL, the conversion state is reset
 *	and 1 is returned without looking at c16
 * c16	the UTF-16 code unit to convert
 * ps	conversion state; if NULL, a state object private
 *	to this function is used instead
 *
 * Returns 0 after remembering a high surrogate without writing
 * anything, (size_t)-1 with errno set to EILSEQ if a remembered
 * high surrogate is not followed by a low surrogate, and otherwise
 * the return value of wcrtomb(3) for the assembled wide character.
 */
size_t
c16rtomb(char *s, char16_t c16, mbstate_t *ps)
{
	static const mbstate_t	 initial = { 0 };
	static mbstate_t	 mbs;
	struct _utf16_state	*us;
	wchar_t			 wc;

	if (ps == NULL)
		ps = &mbs;

	/*
	 * Handle the special case of NULL output first
	 * to avoid inspecting c16 and ps and possibly drawing
	 * bogus conclusions from whatever those may contain.
	 * Instead, just restore the initial conversion state.
	 * Structure assignment is used for that such that no
	 * declaration from <string.h> is required.
	 * The return value represents the length of the NUL byte
	 * corresponding to the NUL wide character, even though
	 * there is no place to write that NUL byte to.
	 */
	if (s == NULL) {
		*ps = initial;
		return 1;
	}

	us = (struct _utf16_state *)ps;

	/* The want field is a signed int, so compare with a signed value. */
	if (us->want == -3) {

		/*
		 * The previous call read a high surrogate,
		 * so expect a low surrogate now.
		 */
		if ((c16 & 0xfc00) != 0xdc00) {
			errno = EILSEQ;
			return (size_t)-1;
		}

		/*
		 * Assemble the full code point for processing
		 * by wcrtomb(3).  Since we do not support
		 * state-dependent encodings, our wcrtomb(3)
		 * always expects the initial conversion state,
		 * so clearing the state here is just fine.
		 */
		wc = us->ch + (c16 & 0x3ff);
		us->ch = 0;
		us->want = 0;

	} else if ((c16 & 0xfc00) == 0xd800) {

		/*
		 * Got a high surrogate while being in the initial
		 * conversion state.  Remember its contribution to
		 * the codepoint and defer encoding to the next call.
		 */
		us->ch = 0x10000 + ((c16 & 0x3ff) << 10);
		us->want = -3;

		/* Nothing was written to *s just yet. */
		return 0;

	} else
		wc = c16;

	/*
	 * The following correctly returns an error when a low
	 * surrogate is encountered without a preceding high one.
	 */
	return wcrtomb(s, wc, ps);
}
|
18
lib/libc/locale/c32rtomb.c
Normal file
18
lib/libc/locale/c32rtomb.c
Normal file
@ -0,0 +1,18 @@
|
||||
/* $OpenBSD: c32rtomb.c,v 1.1 2023/08/20 15:02:51 schwarze Exp $ */
|
||||
/*
|
||||
* Written by Ingo Schwarze <schwarze@openbsd.org>
|
||||
* and placed in the public domain on March 19, 2022.
|
||||
*/
|
||||
|
||||
#include <uchar.h>
|
||||
#include <wchar.h>
|
||||
|
||||
/*
 * Encode the character c32 into s by passing it on to wcrtomb(3).
 * When ps is NULL, a state object private to this function is used.
 * The return value is whatever wcrtomb(3) returns.
 */
size_t
c32rtomb(char *s, char32_t c32, mbstate_t *ps)
{
	static mbstate_t	 fallback;

	return wcrtomb(s, c32, ps != NULL ? ps : &fallback);
}
|
265
lib/libc/locale/mbrtoc16.3
Normal file
265
lib/libc/locale/mbrtoc16.3
Normal file
@ -0,0 +1,265 @@
|
||||
.\" $OpenBSD: mbrtoc16.3,v 1.1 2023/08/20 15:02:51 schwarze Exp $
|
||||
.\"
|
||||
.\" Copyright 2023 Ingo Schwarze <schwarze@openbsd.org>
|
||||
.\" Copyright 2010 Stefan Sperling <stsp@openbsd.org>
|
||||
.\"
|
||||
.\" Permission to use, copy, modify, and distribute this software for any
|
||||
.\" purpose with or without fee is hereby granted, provided that the above
|
||||
.\" copyright notice and this permission notice appear in all copies.
|
||||
.\"
|
||||
.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
.\"
|
||||
.Dd $Mdocdate: August 20 2023 $
|
||||
.Dt MBRTOC16 3
|
||||
.Os
|
||||
.Sh NAME
|
||||
.Nm mbrtoc16
|
||||
.Nd convert one UTF-8 encoded character to UTF-16
|
||||
.Sh SYNOPSIS
|
||||
.In uchar.h
|
||||
.Ft size_t
|
||||
.Fo mbrtoc16
|
||||
.Fa "char16_t * restrict pc16"
|
||||
.Fa "const char * restrict s"
|
||||
.Fa "size_t n"
|
||||
.Fa "mbstate_t * restrict mbs"
|
||||
.Fc
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn mbrtoc16
|
||||
function examines at most
|
||||
.Fa n
|
||||
bytes of the multibyte character byte string pointed to by
|
||||
.Fa s ,
|
||||
converts those bytes to a wide character,
|
||||
and encodes the wide character using UTF-16.
|
||||
In some cases, it is necessary to call this function
|
||||
twice to convert a single character.
|
||||
.Pp
|
||||
Conversion happens in accordance with the conversion state
|
||||
.Pf * Fa mbs ,
|
||||
which must be initialized to zero before the application's first call to
|
||||
.Fn mbrtoc16 .
|
||||
For this function,
|
||||
.Pf * Fa mbs
|
||||
stores information about both the state of the UTF-8 input encoding
|
||||
and the state of the UTF-16 output encoding.
|
||||
If the previous call did not return
|
||||
.Po Vt size_t Pc Ns \-1 ,
|
||||
.Fa mbs
|
||||
can safely be reused without reinitialization.
|
||||
.Pp
|
||||
The input encoding that
|
||||
.Fn mbrtoc16
|
||||
uses for
|
||||
.Fa s
|
||||
is determined by the
|
||||
.Dv LC_CTYPE
|
||||
category of the current locale.
|
||||
If the locale is changed without reinitialization of
|
||||
.Pf * Fa mbs ,
|
||||
the behaviour is undefined.
|
||||
.Pp
|
||||
Unlike
|
||||
.Xr mbtowc 3 ,
|
||||
.Fn mbrtoc16
|
||||
accepts an incomplete byte sequence pointed to by
|
||||
.Fa s
|
||||
which does not form a complete character but is potentially part of
|
||||
a valid character.
|
||||
In this case, the function consumes all such bytes.
|
||||
The conversion state saved in
|
||||
.Pf * Fa mbs
|
||||
will be used to restart the suspended conversion during the next call.
|
||||
.Pp
|
||||
On systems other than
|
||||
.Ox
|
||||
that support state-dependent encodings,
|
||||
.Fa s
|
||||
may point to a special sequence of bytes called a
|
||||
.Dq shift sequence ;
|
||||
see
|
||||
.Xr mbrtowc 3
|
||||
for details.
|
||||
.Pp
|
||||
The following arguments cause special processing:
|
||||
.Bl -tag -width 012345678901
|
||||
.It Fa pc16 No == Dv NULL
|
||||
The conversion from a multibyte character to a wide character is performed
|
||||
and the conversion state may be affected, but the resulting wide character
|
||||
is discarded.
|
||||
.It Fa s No == Dv NULL
|
||||
The arguments
|
||||
.Fa pc16
|
||||
and
|
||||
.Fa n
|
||||
are ignored and starting or continuing the conversion with an empty string
|
||||
is attempted, discarding the conversion result.
|
||||
.It Fa mbs No == Dv NULL
|
||||
An internal
|
||||
.Vt mbstate_t
|
||||
object specific to the
|
||||
.Fn mbrtoc16
|
||||
function is used instead of the
|
||||
.Fa mbs
|
||||
argument.
|
||||
This internal object is automatically initialized at program startup
|
||||
and never changed by any
|
||||
.Em libc
|
||||
function except
|
||||
.Fn mbrtoc16 .
|
||||
.Pp
|
||||
If
|
||||
.Fn mbrtoc16
|
||||
is called with a
|
||||
.Dv NULL
|
||||
.Fa mbs
|
||||
argument and that call returns
|
||||
.Po Vt size_t Pc Ns \-1 ,
|
||||
the internal conversion state of
|
||||
.Fn mbrtoc16
|
||||
becomes permanently undefined and there is no way
|
||||
to reset it to any defined state.
|
||||
Consequently, after such a mishap, it is not safe to call
|
||||
.Fn mbrtoc16
|
||||
with a
|
||||
.Dv NULL
|
||||
.Fa mbs
|
||||
argument ever again until the program is terminated.
|
||||
.El
|
||||
.Sh RETURN VALUES
|
||||
.Bl -tag -width 012345678901
|
||||
.It 0
|
||||
The bytes pointed to by
|
||||
.Fa s
|
||||
form a terminating NUL character.
|
||||
If
|
||||
.Fa pc16
|
||||
is not
|
||||
.Dv NULL ,
|
||||
a NUL wide character has been stored in
|
||||
.Pf * Fa pc16 .
|
||||
.It positive
|
||||
.Fa s
|
||||
points to a valid character, and the value returned is the number of
|
||||
bytes completing the character.
|
||||
If
|
||||
.Fa pc16
|
||||
is not
|
||||
.Dv NULL ,
|
||||
the first UTF-16 code unit of the corresponding wide character
|
||||
has been stored in
|
||||
.Pf * Fa pc16 .
|
||||
If it is a UTF-16 high surrogate, the function needs to be called
|
||||
again to retrieve a second UTF-16 code unit, the low surrogate.
|
||||
On
|
||||
.Ox ,
|
||||
this happens if and only if the return value is 4,
|
||||
but this equivalence does not hold on other operating systems
|
||||
that support input encodings other than UTF-8.
|
||||
.It Po Vt size_t Pc Ns \-1
|
||||
.Fa s
|
||||
points to an illegal byte sequence which does not form a valid multibyte
|
||||
character in the current locale, or
|
||||
.Fa mbs
|
||||
points to an invalid or uninitialized object.
|
||||
.Va errno
|
||||
is set to
|
||||
.Er EILSEQ
|
||||
or
|
||||
.Er EINVAL ,
|
||||
respectively.
|
||||
The conversion state object pointed to by
|
||||
.Fa mbs
|
||||
is left in an undefined state and must be reinitialized before being
|
||||
used again.
|
||||
.It Po Vt size_t Pc Ns \-2
|
||||
.Fa s
|
||||
points to an incomplete byte sequence of length
|
||||
.Fa n
|
||||
which has been consumed and contains part of a valid multibyte character.
|
||||
The character may be completed by calling the same function again with
|
||||
.Fa s
|
||||
pointing to one or more subsequent bytes of the multibyte character and
|
||||
.Fa mbs
|
||||
pointing to the conversion state object used during conversion of the
|
||||
incomplete byte sequence.
|
||||
.It Po Vt size_t Pc Ns \-3
|
||||
The second 16-bit code unit resulting from a previous call
|
||||
has been stored into
|
||||
.Pf * Fa pc16 ,
|
||||
without consuming any additional bytes from
|
||||
.Fa s .
|
||||
.El
|
||||
.Sh ERRORS
|
||||
.Fn mbrtoc16
|
||||
causes an error in the following cases:
|
||||
.Bl -tag -width Er
|
||||
.It Bq Er EILSEQ
|
||||
.Fa s
|
||||
points to an invalid multibyte character.
|
||||
.It Bq Er EINVAL
|
||||
.Fa mbs
|
||||
points to an invalid or uninitialized
|
||||
.Vt mbstate_t
|
||||
object.
|
||||
.El
|
||||
.Sh SEE ALSO
|
||||
.Xr c16rtomb 3 ,
|
||||
.Xr mbrtowc 3 ,
|
||||
.Xr setlocale 3
|
||||
.Sh STANDARDS
|
||||
.Fn mbrtoc16
|
||||
conforms to
|
||||
.St -isoC-2011 .
|
||||
.Sh HISTORY
|
||||
.Fn mbrtoc16
|
||||
has been available since
|
||||
.Ox 7.4 .
|
||||
.Sh CAVEATS
|
||||
On operating systems other than
|
||||
.Ox
|
||||
that support input encodings other than UTF-8, inspecting the return value
|
||||
is insufficient to tell whether the function needs to be called again.
|
||||
If the return value is positive, inspecting
|
||||
.Pf * Fa pc16
|
||||
is also required to make that decision.
|
||||
Consequently, passing a
|
||||
.Dv NULL
|
||||
pointer for the
|
||||
.Fa pc16
|
||||
argument is discouraged because it can result
|
||||
in a well-defined but unknown output encoding state.
|
||||
The simplest way to recover from such an unknown state is to
|
||||
reinitialize the object pointed to by
|
||||
.Fa mbs .
|
||||
.Pp
|
||||
The C11 standard only requires the
|
||||
.Fa pc16
|
||||
argument to be encoded according to UTF-16
|
||||
if the predefined environment macro
|
||||
.Dv __STDC_UTF_16__
|
||||
is defined with a value of 1.
|
||||
On
|
||||
.Ox ,
|
||||
.In uchar.h
|
||||
provides this definition.
|
||||
Other operating systems which do not define
|
||||
.Dv __STDC_UTF_16__
|
||||
could theoretically use a different,
|
||||
implementation-defined output encoding for
|
||||
.Fa pc16
|
||||
instead of UTF-16.
|
||||
Writing portable code for an arbitrary output encoding is impossible
|
||||
because the rules when and how often the function needs to be called
|
||||
again depend on the output encoding; the rules explained above are
|
||||
specific to UTF-16.
|
||||
Using UTF-16 as the output encoding of
|
||||
.Fn mbrtoc16
|
||||
becomes mandatory in C23.
|
102
lib/libc/locale/mbrtoc16.c
Normal file
102
lib/libc/locale/mbrtoc16.c
Normal file
@ -0,0 +1,102 @@
|
||||
/* $OpenBSD: mbrtoc16.c,v 1.1 2023/08/20 15:02:51 schwarze Exp $ */
|
||||
/*
|
||||
* Copyright (c) 2022 Ingo Schwarze <schwarze@openbsd.org>
|
||||
*
|
||||
* Permission to use, copy, modify, and distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <uchar.h>
|
||||
#include <wchar.h>
|
||||
|
||||
/*
|
||||
* Keep this structure compatible with
|
||||
* struct _utf8_state in the file citrus/citrus_utf8.c.
|
||||
* In particular, only use values for the "want" field
|
||||
* that do not collide with values used by the function
|
||||
* _citrus_utf8_ctype_mbrtowc().
|
||||
*/
|
||||
struct _utf16_state {
	wchar_t	ch;	/* code point whose low surrogate is still pending */
	int	want;	/* -3 while a low surrogate is pending */
};

/*
 * Decode one multibyte character with mbrtowc(3) and store it
 * to *pc16 as UTF-16, one 16-bit code unit per call.
 *
 * pc16	where to store the code unit; may be NULL to discard it
 * s	input bytes; if NULL, an empty string is used and pc16 is ignored
 * n	maximum number of bytes to inspect at s
 * ps	conversion state; if NULL, a state object private
 *	to this function is used instead
 *
 * Returns (size_t)-3 when the low surrogate left pending by the
 * previous call was delivered without consuming any input, and
 * otherwise the return value of mbrtowc(3).
 */
size_t
mbrtoc16(char16_t *pc16, const char *s, size_t n, mbstate_t *ps)
{
	static mbstate_t	 mbs;
	struct _utf16_state	*us;
	size_t			 rv;
	wchar_t			 wc;

	/*
	 * Fall back to a state object local to this function
	 * and do not use the fallback object in mbrtowc(3)
	 * because an application program might mix calls to mbrtowc(3)
	 * and mbrtoc16(3) decoding different strings, and they must
	 * not clobber each other's state.
	 */
	if (ps == NULL)
		ps = &mbs;

	us = (struct _utf16_state *)ps;

	/*
	 * Handle the special case of NULL input first such that
	 * a low surrogate left over from a previous call does not
	 * clobber an object pointed to by the pc16 argument.
	 */
	if (s == NULL) {
		s = "";
		n = 1;
		pc16 = NULL;
	}

	/*
	 * If the previous call stored a high surrogate,
	 * store the corresponding low surrogate now
	 * and do not inspect any further input yet.
	 * The want field is a signed int, so compare with a signed value.
	 */
	if (us->want == -3) {
		if (pc16 != NULL)
			*pc16 = 0xdc00 + (us->ch & 0x3ff);
		us->ch = 0;
		us->want = 0;
		return (size_t)-3;
	}

	/*
	 * Decode the multibyte character.
	 * All the mbrtowc(3) use cases can be reached from here,
	 * including continuing an incomplete character started earlier,
	 * decoding a NUL character, a valid complete character,
	 * an incomplete character to be continued later,
	 * or a decoding error.
	 */
	rv = mbrtowc(&wc, s, n, ps);

	/* Incomplete or invalid input: there is nothing to store. */
	if (rv >= (size_t)-2)
		return rv;

	/* A new character that is valid and complete. */
	if (wc > UINT16_MAX) {
		/* Store a high surrogate. */
		if (pc16 != NULL)
			*pc16 = 0xd7c0 + (wc >> 10);
		/* Remember that the low surrogate is pending. */
		us->ch = wc;
		us->want = -3;
	} else if (pc16 != NULL) {
		/* Store a basic multilingual plane codepoint. */
		*pc16 = wc;
	}
	return rv;
}
|
18
lib/libc/locale/mbrtoc32.c
Normal file
18
lib/libc/locale/mbrtoc32.c
Normal file
@ -0,0 +1,18 @@
|
||||
/* $OpenBSD: mbrtoc32.c,v 1.1 2023/08/20 15:02:51 schwarze Exp $ */
|
||||
/*
|
||||
* Written by Ingo Schwarze <schwarze@openbsd.org>
|
||||
* and placed in the public domain on March 19, 2022.
|
||||
*/
|
||||
|
||||
#include <uchar.h>
|
||||
#include <wchar.h>
|
||||
|
||||
/*
 * Decode one multibyte character from s into *pc32 by passing
 * the arguments on to mbrtowc(3).
 * When ps is NULL, a state object private to this function is used.
 * The return value is whatever mbrtowc(3) returns.
 */
size_t
mbrtoc32(char32_t *pc32, const char *s, size_t n, mbstate_t *ps)
{
	static mbstate_t mbs;

	if (ps == NULL)
		ps = &mbs;

	/*
	 * char32_t and wchar_t are distinct types, so the pointer
	 * conversion has to be explicit.
	 * NOTE(review): this assumes both types have the same size
	 * and representation of code points - confirm for new platforms.
	 */
	return mbrtowc((wchar_t *)pc32, s, n, ps);
}
|
@ -1,6 +1,8 @@
|
||||
.\" $OpenBSD: mbrtowc.3,v 1.5 2016/02/08 09:56:16 schwarze Exp $
|
||||
.\" $OpenBSD: mbrtowc.3,v 1.6 2023/08/20 15:02:51 schwarze Exp $
|
||||
.\" $NetBSD: mbrtowc.3,v 1.5 2003/09/08 17:54:31 wiz Exp $
|
||||
.\"
|
||||
.\" Copyright (c)2023 Ingo Schwarze <schwarze@openbsd.org>
|
||||
.\" Copyright (c)2010 Stefan Sperling <stsp@openbsd.org>
|
||||
.\" Copyright (c)2002 Citrus Project,
|
||||
.\" All rights reserved.
|
||||
.\"
|
||||
@ -25,27 +27,41 @@
|
||||
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
.\" SUCH DAMAGE.
|
||||
.\"
|
||||
.Dd $Mdocdate: February 8 2016 $
|
||||
.Dd $Mdocdate: August 20 2023 $
|
||||
.Dt MBRTOWC 3
|
||||
.Os
|
||||
.Sh NAME
|
||||
.Nm mbrtowc
|
||||
.Nd converts a multibyte character to a wide character (restartable)
|
||||
.Nm mbrtowc ,
|
||||
.Nm mbrtoc32
|
||||
.Nd convert a multibyte character to a wide character (restartable)
|
||||
.Sh SYNOPSIS
|
||||
.In wchar.h
|
||||
.Ft size_t
|
||||
.Fn mbrtowc "wchar_t * restrict wc" "const char * restrict s" "size_t n" \
|
||||
"mbstate_t * restrict mbs"
|
||||
.Fo mbrtowc
|
||||
.Fa "wchar_t * restrict wc"
|
||||
.Fa "const char * restrict s"
|
||||
.Fa "size_t n"
|
||||
.Fa "mbstate_t * restrict mbs"
|
||||
.Fc
|
||||
.In uchar.h
|
||||
.Ft size_t
|
||||
.Fo mbrtoc32
|
||||
.Fa "char32_t * restrict wc"
|
||||
.Fa "const char * restrict s"
|
||||
.Fa "size_t n"
|
||||
.Fa "mbstate_t * restrict mbs"
|
||||
.Fc
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn mbrtowc
|
||||
function examines at most
|
||||
and
|
||||
.Fn mbrtoc32
|
||||
functions examine at most
|
||||
.Fa n
|
||||
bytes of the multibyte character byte string pointed to by
|
||||
.Fa s ,
|
||||
converts those bytes to a wide character, and stores the wide character
|
||||
in the wchar_t object pointed to by
|
||||
.Fa wc
|
||||
convert those bytes to a wide character, and store the wide character into
|
||||
.Pf * Fa wc
|
||||
if
|
||||
.Fa wc
|
||||
is not
|
||||
@ -54,46 +70,47 @@ and
|
||||
.Fa s
|
||||
points to a valid character.
|
||||
.Pp
|
||||
Conversion happens in accordance with the conversion state described
|
||||
by the mbstate_t object pointed to by
|
||||
.Fa mbs .
|
||||
The mbstate_t object must be initialized to zero before the application's
|
||||
first call to
|
||||
.Fn mbrtowc .
|
||||
If the previous call to
|
||||
Conversion happens in accordance with the conversion state
|
||||
.Pf * Fa mbs ,
|
||||
which must be initialized to zero before the application's first call to
|
||||
.Fn mbrtowc
|
||||
did not return (size_t)-1, the mbstate_t object can safely be reused
|
||||
without reinitialization.
|
||||
or
|
||||
.Fn mbrtoc32 .
|
||||
If the previous call did not return
|
||||
.Po Vt size_t Pc Ns \-1 ,
|
||||
.Fa mbs
|
||||
can safely be reused without reinitialization.
|
||||
.Pp
|
||||
The behaviour of
|
||||
The input encoding that
|
||||
.Fn mbrtowc
|
||||
is affected by the
|
||||
and
|
||||
.Fn mbrtoc32
|
||||
use for
|
||||
.Fa s
|
||||
is determined by the
|
||||
.Dv LC_CTYPE
|
||||
category of the current locale.
|
||||
If the locale is changed without reinitialization of the mbstate_t object
|
||||
pointed to by
|
||||
.Fa mbs ,
|
||||
the behaviour of
|
||||
.Fn mbrtowc
|
||||
is undefined.
|
||||
If the locale is changed without reinitialization of
|
||||
.Pf * Fa mbs ,
|
||||
the behaviour is undefined.
|
||||
.Pp
|
||||
Unlike
|
||||
.Xr mbtowc 3 ,
|
||||
.Fn mbrtowc
|
||||
will accept an incomplete byte sequence pointed to by
|
||||
and
|
||||
.Fn mbrtoc32
|
||||
accept an incomplete byte sequence pointed to by
|
||||
.Fa s
|
||||
which does not form a complete character but is potentially part of
|
||||
a valid character.
|
||||
In this case,
|
||||
.Fn mbrtowc
|
||||
consumes all such bytes.
|
||||
The conversion state saved in the mbstate_t object pointed to by
|
||||
.Fa mbs
|
||||
will be used to restart the suspended conversion during the next
|
||||
call to
|
||||
.Fn mbrtowc .
|
||||
In this case, both functions consume all such bytes.
|
||||
The conversion state saved in
|
||||
.Pf * Fa mbs
|
||||
will be used to restart the suspended conversion during the next call.
|
||||
.Pp
|
||||
In state-dependent encodings,
|
||||
On systems other than
|
||||
.Ox
|
||||
that support state-dependent encodings,
|
||||
.Fa s
|
||||
may point to a special sequence of bytes called a
|
||||
.Dq shift sequence .
|
||||
@ -104,61 +121,58 @@ can switch e.g. from ASCII (which uses one byte per character) to
|
||||
JIS X 0208 (which uses two bytes per character).
|
||||
Shift sequence bytes correspond to no individual wide character, so
|
||||
.Fn mbrtowc
|
||||
treats them as if they were part of the subsequent multibyte character.
|
||||
and
|
||||
.Fn mbrtoc32
|
||||
treat them as if they were part of the subsequent multibyte character.
|
||||
Therefore they do contribute to the number of bytes in the multibyte character.
|
||||
.Pp
|
||||
Special cases in interpretation of arguments are as follows:
|
||||
The following arguments cause special processing:
|
||||
.Bl -tag -width 012345678901
|
||||
.It "wc == NULL "
|
||||
.It Fa wc No == Dv NULL
|
||||
The conversion from a multibyte character to a wide character is performed
|
||||
and the conversion state may be affected, but the resulting wide character
|
||||
is discarded.
|
||||
.Pp
|
||||
This can be used to find out how many bytes are contained in the
|
||||
multibyte character pointed to by
|
||||
.Fa s .
|
||||
.It "s == NULL "
|
||||
.Fn mbrtowc
|
||||
ignores
|
||||
.It Fa s No == Dv NULL
|
||||
The arguments
|
||||
.Fa wc
|
||||
and
|
||||
.Fa n ,
|
||||
and behaves equivalent to
|
||||
.Bd -literal -offset indent
|
||||
mbrtowc(NULL, "", 1, mbs);
|
||||
.Ed
|
||||
.Pp
|
||||
which attempts to use the mbstate_t object pointed to by
|
||||
.Fa mbs
|
||||
to start or continue conversion using the empty string as input,
|
||||
and discards the conversion result.
|
||||
.Pp
|
||||
.Fa n
|
||||
are ignored and starting or continuing the conversion with an empty string
|
||||
is attempted, discarding the conversion result.
|
||||
If conversion succeeds, this call always returns zero.
|
||||
Unlike
|
||||
.Xr mbtowc 3 ,
|
||||
the value returned does not indicate whether the current encoding of
|
||||
the locale is state-dependent, i.e. uses shift sequences.
|
||||
.It "mbs == NULL "
|
||||
.It Fa mbs No == Dv NULL
|
||||
.Fn mbrtowc
|
||||
uses its own internal state object to keep the conversion state,
|
||||
instead of an mbstate_t object pointed to by
|
||||
.Fa mbs .
|
||||
This internal conversion state is initialized once at program startup.
|
||||
It is not safe to call
|
||||
and
|
||||
.Fn mbrtoc32
|
||||
each use their own internal state object instead of the
|
||||
.Fa mbs
|
||||
argument.
|
||||
Both internal state objects are initialized at startup time of the program,
|
||||
and no other libc function ever changes either of them.
|
||||
.Pp
|
||||
If
|
||||
.Fn mbrtowc
|
||||
again with a
|
||||
or
|
||||
.Fn mbrtoc32
|
||||
is called with a
|
||||
.Dv NULL
|
||||
.Fa mbs
|
||||
argument if
|
||||
.Fn mbrtowc
|
||||
returned (size_t)-1 because at this point the internal conversion state
|
||||
is undefined.
|
||||
.Pp
|
||||
Calling any other functions in
|
||||
.Em libc
|
||||
never changes the internal
|
||||
conversion state object of
|
||||
.Fn mbrtowc .
|
||||
argument and that call returns
|
||||
.Po Vt size_t Pc Ns \-1 ,
|
||||
the internal conversion state of the respective function becomes
|
||||
permanently undefined and there is no way to reset it to any defined state.
|
||||
Consequently, after such a mishap, it is not safe
|
||||
to call the same function with a
|
||||
.Dv NULL
|
||||
.Fa mbs
|
||||
argument ever again until the program is terminated.
|
||||
.El
|
||||
.Sh RETURN VALUES
|
||||
.Bl -tag -width 012345678901
|
||||
@ -183,14 +197,18 @@ is not
|
||||
the corresponding wide character has been stored in the wchar_t object
|
||||
pointed to by
|
||||
.Fa wc .
|
||||
.It (size_t)-1
|
||||
.It Po Vt size_t Pc Ns \-1
|
||||
.Fa s
|
||||
points to an illegal byte sequence which does not form a valid multibyte
|
||||
character in the current locale.
|
||||
.Fn mbrtowc
|
||||
sets
|
||||
character in the current locale, or
|
||||
.Fa mbs
|
||||
points to an invalid or uninitialized object.
|
||||
.Va errno
|
||||
to EILSEQ.
|
||||
is set to
|
||||
.Er EILSEQ
|
||||
or
|
||||
.Er EINVAL ,
|
||||
respectively.
|
||||
The conversion state object pointed to by
|
||||
.Fa mbs
|
||||
is left in an undefined state and must be reinitialized before being
|
||||
@ -198,6 +216,8 @@ used again.
|
||||
.Pp
|
||||
Because applications using
|
||||
.Fn mbrtowc
|
||||
or
|
||||
.Fn mbrtoc32
|
||||
are shielded from the specifics of the multibyte character encoding scheme,
|
||||
it is impossible to repair byte sequences containing encoding errors.
|
||||
Such byte sequences must be treated as invalid and potentially malicious input.
|
||||
@ -205,66 +225,90 @@ Applications must stop processing the byte string pointed to by
|
||||
.Fa s
|
||||
and either discard any wide characters already converted, or cope with
|
||||
truncated input.
|
||||
.It (size_t)-2
|
||||
.It Po Vt size_t Pc Ns \-2
|
||||
.Fa s
|
||||
points to an incomplete byte sequence of length
|
||||
.Fa n
|
||||
which has been consumed and contains part of a valid multibyte character.
|
||||
The character may be completed by calling
|
||||
.Fn mbrtowc
|
||||
again with
|
||||
The character may be completed by calling the same function again with
|
||||
.Fa s
|
||||
pointing to one or more subsequent bytes of the multibyte character and
|
||||
.Fa mbs
|
||||
pointing to the conversion state object used during conversion of the
|
||||
incomplete byte sequence.
|
||||
.It Po Vt size_t Pc Ns \-3
|
||||
The next character resulting from a previous call has been stored into
|
||||
.Fa wc ,
|
||||
without consuming any additional bytes from
|
||||
.Fa s .
|
||||
This never happens for
|
||||
.Fn mbrtowc ,
|
||||
and on
|
||||
.Ox ,
|
||||
it never happens for
|
||||
.Fn mbrtoc32
|
||||
either.
|
||||
.El
|
||||
.Sh ERRORS
|
||||
The
|
||||
.Fn mbrtowc
|
||||
function may cause an error in the following cases:
|
||||
and
|
||||
.Fn mbrtoc32
|
||||
cause an error in the following cases:
|
||||
.Bl -tag -width Er
|
||||
.It Bq Er EILSEQ
|
||||
.Fa s
|
||||
points to an invalid multibyte character.
|
||||
.It Bq Er EINVAL
|
||||
.Fa mbs
|
||||
points to an invalid or uninitialized mbstate_t object.
|
||||
points to an invalid or uninitialized
|
||||
.Vt mbstate_t
|
||||
object.
|
||||
.El
|
||||
.Sh SEE ALSO
|
||||
.Xr mbrlen 3 ,
|
||||
.Xr mbtowc 3 ,
|
||||
.Xr setlocale 3
|
||||
.Xr setlocale 3 ,
|
||||
.Xr wcrtomb 3
|
||||
.Sh STANDARDS
|
||||
The
|
||||
.Fn mbrtowc
|
||||
function conforms to
|
||||
.\" .St -isoC-amd1 .
|
||||
ISO/IEC 9899/AMD1:1995
|
||||
.Pq Dq ISO C90, Amendment 1 .
|
||||
The restrict qualifier is added at
|
||||
.\" .St -isoC99 .
|
||||
ISO/IEC 9899:1999
|
||||
.Pq Dq ISO C99 .
|
||||
conforms to
|
||||
.St -isoC-amd1 .
|
||||
The restrict qualifier was added at
|
||||
.St -isoC-99 .
|
||||
.Pp
|
||||
.Fn mbrtoc32
|
||||
conforms to
|
||||
.St -isoC-2011 .
|
||||
.Sh HISTORY
|
||||
.Fn mbrtowc
|
||||
has been available since
|
||||
.Ox 3.8
|
||||
and has provided support for UTF-8 since
|
||||
.Ox 4.8 .
|
||||
.Pp
|
||||
.Fn mbrtoc32
|
||||
has been available since
|
||||
.Ox 7.4 .
|
||||
.Sh CAVEATS
|
||||
.Fn mbrtowc
|
||||
is not suitable for programs that care about internals of the character
|
||||
and
|
||||
.Fn mbrtoc32
|
||||
are not suitable for programs that care about internals of the character
|
||||
encoding scheme used by the byte string pointed to by
|
||||
.Fa s .
|
||||
.Pp
|
||||
It is possible that
|
||||
.Fn mbrtowc
|
||||
fails because of locale configuration errors.
|
||||
It is possible that these functions
|
||||
fail because of locale configuration errors.
|
||||
An
|
||||
.Dq invalid
|
||||
character sequence may simply be encoded in a different encoding than that
|
||||
of the current locale.
|
||||
.Pp
|
||||
The special cases for
|
||||
.Fa s
|
||||
== NULL and
|
||||
.Fa mbs
|
||||
== NULL do not make any sense.
|
||||
.Fa s No == Dv NULL
|
||||
and
|
||||
.Fa mbs No == Dv NULL
|
||||
do not make any sense.
|
||||
Instead of passing
|
||||
.Dv NULL
|
||||
for
|
||||
|
@ -1,6 +1,7 @@
|
||||
.\" $OpenBSD: wcrtomb.3,v 1.10 2015/03/22 18:02:11 stsp Exp $
|
||||
.\" $OpenBSD: wcrtomb.3,v 1.11 2023/08/20 15:02:51 schwarze Exp $
|
||||
.\" $NetBSD: wcrtomb.3,v 1.4 2003/09/08 17:54:31 wiz Exp $
|
||||
.\"
|
||||
.\" Copyright (c)2023 Ingo Schwarze <schwarze@openbsd.org>
|
||||
.\" Copyright (c)2002 Citrus Project,
|
||||
.\" All rights reserved.
|
||||
.\"
|
||||
@ -25,117 +26,161 @@
|
||||
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
.\" SUCH DAMAGE.
|
||||
.\"
|
||||
.Dd $Mdocdate: March 22 2015 $
|
||||
.Dd $Mdocdate: August 20 2023 $
|
||||
.Dt WCRTOMB 3
|
||||
.Os
|
||||
.\" ----------------------------------------------------------------------
|
||||
.Sh NAME
|
||||
.Nm wcrtomb
|
||||
.Nd converts a wide character to a multibyte character (restartable)
|
||||
.\" ----------------------------------------------------------------------
|
||||
.Nm wcrtomb ,
|
||||
.Nm c32rtomb
|
||||
.Nd convert a wide character to a multibyte character
|
||||
.Sh SYNOPSIS
|
||||
.In wchar.h
|
||||
.Ft size_t
|
||||
.Fn wcrtomb "const char * restrict s" "wchar_t wc" "mbstate_t * restrict ps"
|
||||
.\" ----------------------------------------------------------------------
|
||||
.Fo wcrtomb
|
||||
.Fa "char * restrict s"
|
||||
.Fa "wchar_t wc"
|
||||
.Fa "mbstate_t * restrict mbs"
|
||||
.Fc
|
||||
.In uchar.h
|
||||
.Ft size_t
|
||||
.Fo c32rtomb
|
||||
.Fa "char * restrict s"
|
||||
.Fa "char32_t wc"
|
||||
.Fa "mbstate_t * restrict mbs"
|
||||
.Fc
|
||||
.Sh DESCRIPTION
|
||||
.Fn wcrtomb
|
||||
converts the wide character given by
|
||||
and
|
||||
.Fn c32rtomb
|
||||
convert the wide character
|
||||
.Fa wc
|
||||
to the corresponding multibyte character, and stores up to
|
||||
to the corresponding multibyte character, and store up to
|
||||
.Dv MB_CUR_MAX
|
||||
bytes in the array pointed to by
|
||||
.Fa s
|
||||
if
|
||||
.Fa s
|
||||
is not a null pointer.
|
||||
is not a
|
||||
.Dv NULL
|
||||
pointer.
|
||||
The interpretation of
|
||||
.Fa wc
|
||||
is implementation-defined.
|
||||
On
|
||||
.Ox ,
|
||||
.Vt wchar_t
|
||||
and
|
||||
.Vt char32_t
|
||||
are of the same width and both are always interpreted as Unicode codepoints.
|
||||
.Pp
|
||||
The behaviour of
|
||||
The output encoding that
|
||||
.Fn wcrtomb
|
||||
is affected by the
|
||||
and
|
||||
.Fn c32rtomb
|
||||
use in
|
||||
.Fa s
|
||||
is determined by the
|
||||
.Dv LC_CTYPE
|
||||
category of the current locale.
|
||||
.Ox
|
||||
only supports UTF-8 and ASCII output,
|
||||
and these functions are only useful for UTF-8.
|
||||
.Pp
|
||||
These are the special cases:
|
||||
The following arguments cause special processing:
|
||||
.Bl -tag -width 012345678901
|
||||
.It "wc == 0"
|
||||
For state-dependent encodings,
|
||||
.Fn wcrtomb
|
||||
stores a null byte preceded by a special byte sequence (if any)
|
||||
to return to an initial state to the array pointed by
|
||||
.Fa s ,
|
||||
and the state object pointed by
|
||||
.Fa ps
|
||||
also returned to an initial state.
|
||||
.It "s == NULL"
|
||||
.Fn wcrtomb
|
||||
just places
|
||||
.Fa ps
|
||||
into an initial state.
|
||||
It is equivalent to the following call:
|
||||
.Bd -literal -offset indent
|
||||
wcrtomb(buf, L'\e0', ps);
|
||||
.Ed
|
||||
.Pp
|
||||
Here,
|
||||
.Fa buf
|
||||
is a dummy buffer.
|
||||
In this case,
|
||||
.Fa wc
|
||||
is ignored.
|
||||
.It "ps == NULL"
|
||||
.It Fa wc No == 0
|
||||
A NUL byte is stored to
|
||||
.Pf * Fa s
|
||||
and the state object pointed to by
|
||||
.Fa mbs
|
||||
is reset to the initial state.
|
||||
On operating systems other than
|
||||
.Ox
|
||||
that support state-dependent multibyte encodings, a special byte sequence
|
||||
.Pq Dq shift sequence
|
||||
is written before the NUL byte to return to the initial state
|
||||
if that is required by the output encoding
|
||||
and by the current output encoding state.
|
||||
.It Fa mbs No == Dv NULL
|
||||
.Fn wcrtomb
|
||||
uses its own internal state object to keep the conversion state,
|
||||
instead of
|
||||
.Fa ps
|
||||
mentioned in this manual page.
|
||||
.Pp
|
||||
Calling any other functions in
|
||||
and
|
||||
.Fn c32rtomb
|
||||
each use their own internal state object instead of the
|
||||
.Fa mbs
|
||||
argument.
|
||||
Both internal state objects are initialized at startup time of the program,
|
||||
and no other
|
||||
.Em libc
|
||||
never change the internal
|
||||
state of
|
||||
.Fn mbrtowc ,
|
||||
which is initialized at startup time of the program.
|
||||
function ever changes either of them.
|
||||
.It Fa s No == Dv NULL
|
||||
The object pointed to by
|
||||
.Fa mbs ,
|
||||
or the internal object if
|
||||
.Fa mbs
|
||||
is a
|
||||
.Dv NULL
|
||||
pointer, is reset to the initial state,
|
||||
.Fa wc
|
||||
is ignored, and 1 is returned.
|
||||
.El
|
||||
.\" ----------------------------------------------------------------------
|
||||
.Sh RETURN VALUES
|
||||
.Fn wcrtomb
|
||||
returns the number of bytes (including any shift sequences)
|
||||
and
|
||||
.Fn c32rtomb
|
||||
return the number of bytes (including any shift sequences)
|
||||
which are stored in the array pointed to by
|
||||
.Fa s .
|
||||
.Fa s ,
|
||||
or 1 if
|
||||
.Fa s
|
||||
is
|
||||
.Dv NULL .
|
||||
If
|
||||
.Fa wc
|
||||
is not a valid wide character,
|
||||
.Fn wcrtomb
|
||||
returns (size_t)-1
|
||||
and sets
|
||||
is not a valid wide character
|
||||
or if it cannot be represented in the multibyte encoding selected with
|
||||
.Dv LC_CTYPE ,
|
||||
both functions return
|
||||
.Po Vt size_t Pc Ns \-1
|
||||
and set
|
||||
.Va errno
|
||||
to indicate error.
|
||||
.\" ----------------------------------------------------------------------
|
||||
to indicate the error.
|
||||
.Sh ERRORS
|
||||
.Fn wcrtomb
|
||||
may cause an error in the following cases:
|
||||
and
|
||||
.Fn c32rtomb
|
||||
cause an error in the following cases:
|
||||
.Bl -tag -width Er
|
||||
.It Bq Er EILSEQ
|
||||
.Fa wc
|
||||
is not a valid wide character.
|
||||
is not a valid wide character or cannot be represented using
|
||||
.Dv LC_CTYPE .
|
||||
.It Bq Er EINVAL
|
||||
.Fa ps
|
||||
points to an invalid or uninitialized mbstate_t object.
|
||||
.Fa mbs
|
||||
points to an invalid or uninitialized
|
||||
.Vt mbstate_t
|
||||
object.
|
||||
.El
|
||||
.\" ----------------------------------------------------------------------
|
||||
.Sh SEE ALSO
|
||||
.Xr mbrtowc 3 ,
|
||||
.Xr setlocale 3 ,
|
||||
.Xr wctomb 3
|
||||
.\" ----------------------------------------------------------------------
|
||||
.Sh STANDARDS
|
||||
The
|
||||
.Fn wcrtomb
|
||||
function conforms to
|
||||
.\" .St -isoC-amd1 .
|
||||
ISO/IEC 9899/AMD1:1995
|
||||
.Pq Dq ISO C90, Amendment 1 .
|
||||
The restrict qualifier is added at
|
||||
.\" .St -isoC99 .
|
||||
ISO/IEC 9899/1999
|
||||
.Pq Dq ISO C99 .
|
||||
conforms to
|
||||
.St -isoC-amd1 .
|
||||
The restrict qualifier was added at
|
||||
.St -isoC-99 .
|
||||
.Pp
|
||||
.Fn c32rtomb
|
||||
conforms to
|
||||
.St -isoC-2011 .
|
||||
.Sh HISTORY
|
||||
.Fn wcrtomb
|
||||
has been available since
|
||||
.Ox 3.8
|
||||
and has provided support for UTF-8 since
|
||||
.Ox 4.8 .
|
||||
.Pp
|
||||
.Fn c32rtomb
|
||||
has been available since
|
||||
.Ox 7.4 .
|
||||
|
@ -1,4 +1,4 @@
|
||||
major=97
|
||||
minor=0
|
||||
minor=1
|
||||
# note: If changes were made to include/thread_private.h or if system calls
|
||||
# were added/changed then librthread/shlib_version must also be updated.
|
||||
|
Loading…
Reference in New Issue
Block a user