1
0
mirror of https://github.com/openbsd/src.git synced 2025-01-04 15:25:38 -08:00
openbsd-src/usr.bin/mandoc/roff_escape.c
schwarze 700ead5451 Fix UTF-16 surrogate detection:
lower case variants have to be rejected, too.
2024-05-16 21:21:08 +00:00

547 lines
12 KiB
C

/* $OpenBSD: roff_escape.c,v 1.15 2024/05/16 21:21:08 schwarze Exp $ */
/*
* Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
* Ingo Schwarze <schwarze@openbsd.org>
* Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* Parser for roff(7) escape sequences.
* To be used by all mandoc(1) parsers and formatters.
*/
#include <assert.h>
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include "mandoc.h"
#include "roff.h"
#include "roff_int.h"
/*
* Traditional escape sequence interpreter for general use
* including in high-level formatters. This function does not issue
* diagnostics and is not usable for expansion in the roff(7) parser.
* It is documented in the mandoc_escape(3) manual page.
*/
enum mandoc_esc
mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
{
int iarg, iendarg, iend;
enum mandoc_esc rval;
rval = roff_escape(--*rendarg, 0, 0,
NULL, NULL, &iarg, &iendarg, &iend);
assert(rval != ESCAPE_EXPAND);
if (rarg != NULL)
*rarg = *rendarg + iarg;
if (rargl != NULL)
*rargl = iendarg - iarg;
*rendarg += iend;
return rval;
}
/*
 * Full-featured escape sequence parser.
 *
 * Parameters:
 *   buf      the input buffer; all positions below are indices into it
 *   ln       input line number; diagnostic messages are generated
 *            if and only if ln != 0 (according to the original
 *            author's note, that is the case if and only if the
 *            caller is roff_expand())
 *   aesc     index of the escape character starting the sequence
 *   resc     out: index of the leading escape character
 *   rnam     out: index of the escape name
 *   rarg     out: index of the beginning of the argument
 *   rendarg  out: index right after the argument
 *   rend     out: index right after the whole escape sequence
 * Each of the five output pointers may be NULL, in which case
 * the respective position is not reported.
 *
 * If a nested escape sequence is encountered that requires expansion
 * by the parser and re-parsing, ESCAPE_EXPAND is returned and the
 * positions of that inner escape sequence are returned in
 * *resc ... *rend.  Otherwise, *resc is set to aesc and the
 * positions of the escape sequence starting at aesc are returned.
 *
 * Returns the type of the escape sequence.
 */
enum mandoc_esc
roff_escape(const char *buf, const int ln, const int aesc,
    int *resc, int *rnam, int *rarg, int *rendarg, int *rend)
{
	int		 iesc;		/* index of leading escape char */
	int		 inam;		/* index of escape name */
	int		 iarg;		/* index beginning the argument */
	int		 iendarg;	/* index right after the argument */
	int		 iend;		/* index right after the sequence */
	int		 sesc, snam, sarg, sendarg, send; /* for sub-escape */
	int		 escterm;	/* whether term is escaped */
	int		 maxl;		/* expected length of the argument */
	int		 argl;		/* actual length of the argument */
	int		 c, i;		/* for \[char...] parsing */
	int		 valid_A;	/* for \A parsing */
	enum mandoc_esc	 rval;		/* return value */
	enum mandoc_esc	 stype;		/* for sub-escape */
	enum mandocerr	 err;		/* diagnostic code */
	char		 term;		/* byte terminating the argument */

	/*
	 * Treat "\E" just like "\";
	 * it only makes a difference in copy mode.
	 */

	iesc = inam = aesc;
	do {
		inam++;
	} while (buf[inam] == 'E');

	/*
	 * Sort the following cases first by syntax category,
	 * then by escape sequence type, and finally by ASCII code.
	 *
	 * By default, assume an argument of unlimited length (maxl)
	 * without a terminating byte (term) that starts right after
	 * the escape name.  In the following, term == '\b' is used
	 * as a marker meaning "quoted argument, delimiter not yet
	 * determined".
	 */

	iarg = iendarg = iend = inam + 1;
	maxl = INT_MAX;
	term = '\0';
	err = MANDOCERR_OK;
	switch (buf[inam]) {

	/* Escape sequences taking no arguments at all. */

	case '!':
	case '?':
	case 'r':
		rval = ESCAPE_UNSUPP;
		goto out;

	case '%':
	case '&':
	case ')':
	case ',':
	case '/':
	case '^':
	case 'a':
	case 'd':
	case 't':
	case 'u':
	case '{':
	case '|':
	case '}':
		rval = ESCAPE_IGNORE;
		goto out;

	case '\0':
		/* End of input: do not step past the terminating NUL. */
		iendarg = --iend;
		/* FALLTHROUGH */
	case '.':
	case '\\':
	default:
		/* The name byte itself is reported as the argument. */
		iarg--;
		rval = ESCAPE_UNDEF;
		goto out;

	case ' ':
	case '\'':
	case '-':
	case '0':
	case ':':
	case '_':
	case '`':
	case 'e':
	case '~':
		/* One-byte special character: the name is the argument. */
		iarg--;
		argl = 1;
		rval = ESCAPE_SPECIAL;
		goto out;
	case 'p':
		rval = ESCAPE_BREAK;
		goto out;
	case 'c':
		rval = ESCAPE_NOSPACE;
		goto out;
	case 'z':
		rval = ESCAPE_SKIPCHAR;
		goto out;

	/* Standard argument format. */

	case '$':
	case '*':
	case 'V':
	case 'g':
	case 'n':
		rval = ESCAPE_EXPAND;
		break;
	case 'F':
	case 'M':
	case 'O':
	case 'Y':
	case 'k':
	case 'm':
		rval = ESCAPE_IGNORE;
		break;
	case '(':
	case '[':
		/*
		 * Back up onto the opening byte such that the code
		 * for the standard argument format below sees it.
		 */
		rval = ESCAPE_SPECIAL;
		iendarg = iend = --iarg;
		break;
	case 'f':
		rval = ESCAPE_FONT;
		break;

	/* Quoted arguments */

	case 'A':
	case 'B':
	case 'w':
		rval = ESCAPE_EXPAND;
		term = '\b';
		break;
	case 'D':
	case 'H':
	case 'L':
	case 'R':
	case 'S':
	case 'X':
	case 'Z':
	case 'b':
	case 'v':
	case 'x':
		rval = ESCAPE_IGNORE;
		term = '\b';
		break;
	case 'C':
		rval = ESCAPE_SPECIAL;
		term = '\b';
		break;
	case 'N':
		rval = ESCAPE_NUMBERED;
		term = '\b';
		break;
	case 'h':
		rval = ESCAPE_HORIZ;
		term = '\b';
		break;
	case 'l':
		rval = ESCAPE_HLINE;
		term = '\b';
		break;
	case 'o':
		rval = ESCAPE_OVERSTRIKE;
		term = '\b';
		break;

	/* Sizes support both forms, with additional peculiarities. */

	case 's':
		rval = ESCAPE_IGNORE;
		/* Skip an optional sign. */
		if (buf[iarg] == '+' || buf[iarg] == '-'||
		    buf[iarg] == ASCII_HYPH)
			iarg++;
		switch (buf[iarg]) {
		case '(':
			maxl = 2;
			iarg++;
			break;
		case '[':
			term = ']';
			iarg++;
			break;
		case '\'':
			term = '\'';
			iarg++;
			break;
		case '1':
		case '2':
		case '3':
			/*
			 * Without a sign, a leading digit 1, 2, or 3
			 * followed by another digit forms a two-digit
			 * argument.
			 */
			if (buf[iarg - 1] == 's' &&
			    isdigit((unsigned char)buf[iarg + 1])) {
				maxl = 2;
				break;
			}
			/* FALLTHROUGH */
		default:
			maxl = 1;
			break;
		}
		iendarg = iend = iarg;
	}

	/*
	 * Decide how to end the argument.
	 * If the argument starts with another escape character,
	 * parse that nested sequence first; if it requires
	 * expansion, report it to the caller instead of this one.
	 */

	escterm = 0;
	stype = ESCAPE_EXPAND;
	if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
	    buf[iarg] == buf[iesc]) {
		stype = roff_escape(buf, ln, iendarg,
		    &sesc, &snam, &sarg, &sendarg, &send);
		if (stype == ESCAPE_EXPAND)
			goto out_sub;
	}

	if (term == '\b') {
		/* An undefined escape: skip the escape character. */
		if (stype == ESCAPE_UNDEF)
			iarg++;
		if (stype != ESCAPE_EXPAND && stype != ESCAPE_UNDEF) {
			/* A nested escape sequence acts as delimiter. */
			if (strchr("BHLRSNhlvx", buf[inam]) != NULL &&
			    strchr(" ,.0DLOXYZ^abdhlortuvx|~",
				    buf[snam]) != NULL) {
				err = MANDOCERR_ESC_DELIM;
				iend = send;
				iarg = iendarg = sesc;
				goto out;
			}
			escterm = 1;
			iarg = send;
			term = buf[snam];
		} else if (strchr("BDHLRSvxNhl", buf[inam]) != NULL &&
		    strchr(" %&()*+-./0123456789:<=>", buf[iarg]) != NULL) {
			/* A plain byte that is not a valid delimiter. */
			err = MANDOCERR_ESC_DELIM;
			if (rval != ESCAPE_EXPAND)
				rval = ESCAPE_ERROR;
			if (buf[inam] != 'D') {
				iendarg = iend = iarg + 1;
				goto out;
			}
		}
		/* Still undetermined: the next byte is the delimiter. */
		if (term == '\b')
			term = buf[iarg++];
	} else if (term == '\0' && maxl == INT_MAX) {
		/* Standard argument format: "x", "(xx", or "[...]". */
		if (buf[inam] == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
			iarg++;
		switch (buf[iarg]) {
		case '(':
			maxl = 2;
			iarg++;
			break;
		case '[':
			if (buf[++iarg] == ' ') {
				iendarg = iend = iarg + 1;
				err = MANDOCERR_ESC_ARG;
				rval = ESCAPE_ERROR;
				goto out;
			}
			term = ']';
			break;
		default:
			maxl = 1;
			break;
		}
	}

	/* Advance to the end of the argument. */

	valid_A = 1;
	iendarg = iarg;
	while (maxl > 0) {
		if (buf[iendarg] == '\0') {
			err = MANDOCERR_ESC_INCOMPLETE;
			if (rval != ESCAPE_EXPAND &&
			    rval != ESCAPE_OVERSTRIKE)
				rval = ESCAPE_ERROR;
			/* Usually, ignore an incomplete argument. */
			if (strchr("Aow", buf[inam]) == NULL)
				iendarg = iarg;
			break;
		}
		/* An unescaped terminator ends the argument. */
		if (escterm == 0 && buf[iendarg] == term) {
			iend = iendarg + 1;
			break;
		}
		if (buf[iendarg] == buf[iesc]) {
			/* Nested escape sequence inside the argument. */
			stype = roff_escape(buf, ln, iendarg,
			    &sesc, &snam, &sarg, &sendarg, &send);
			if (stype == ESCAPE_EXPAND)
				goto out_sub;
			iend = send;
			/* An escaped terminator ends the argument. */
			if (escterm == 1 &&
			    (buf[snam] == term || buf[inam] == 'N'))
				break;
			if (stype != ESCAPE_UNDEF)
				valid_A = 0;
			iendarg = send;
		} else if (buf[inam] == 'N' &&
		    isdigit((unsigned char)buf[iendarg]) == 0) {
			/* For \N, any non-digit ends the argument. */
			iend = iendarg + 1;
			break;
		} else {
			if (buf[iendarg] == ' ' || buf[iendarg] == '\t')
				valid_A = 0;
			if (maxl != INT_MAX)
				maxl--;
			iend = ++iendarg;
		}
	}

	/* Post-process depending on the content of the argument. */

	argl = iendarg - iarg;
	switch (buf[inam]) {
	case '*':
		/* \*(.T, recognized only when resc is not requested. */
		if (resc == NULL && argl == 2 &&
		    buf[iarg] == '.' && buf[iarg + 1] == 'T')
			rval = ESCAPE_DEVICE;
		break;
	case 'A':
		/* Report an empty argument if it is not valid. */
		if (valid_A == 0)
			iendarg = iarg;
		break;
	case 'O':
		switch (buf[iarg]) {
		case '0':
			rval = ESCAPE_UNSUPP;
			break;
		case '1':
		case '2':
		case '3':
		case '4':
			if (argl == 1)
				rval = ESCAPE_IGNORE;
			else {
				err = MANDOCERR_ESC_ARG;
				rval = ESCAPE_ERROR;
			}
			break;
		case '5':
			/* Only accepted in the bracketed form. */
			if (buf[iarg - 1] == '[')
				rval = ESCAPE_UNSUPP;
			else {
				err = MANDOCERR_ESC_ARG;
				rval = ESCAPE_ERROR;
			}
			break;
		default:
			err = MANDOCERR_ESC_ARG;
			rval = ESCAPE_ERROR;
			break;
		}
		break;
	default:
		break;
	}

	switch (rval) {
	case ESCAPE_FONT:
		rval = mandoc_font(buf + iarg, argl);
		if (rval == ESCAPE_ERROR)
			err = MANDOCERR_ESC_ARG;
		break;

	case ESCAPE_SPECIAL:
		if (argl == 0) {
			err = MANDOCERR_ESC_BADCHAR;
			rval = ESCAPE_ERROR;
			break;
		}

		/*
		 * The file chars.c only provides one common list of
		 * character names, but \[-] == \- is the only one of
		 * the characters with one-byte names that allows
		 * enclosing the name in brackets.
		 */

		if (term != '\0' && argl == 1 && buf[iarg] != '-') {
			err = MANDOCERR_ESC_BADCHAR;
			rval = ESCAPE_ERROR;
			break;
		}

		/* Treat \[char...] as an alias for \N'...'. */

		if (buf[iarg] == 'c') {
			if (argl < 6 || argl > 7 ||
			    strncmp(buf + iarg, "char", 4) != 0 ||
			    (int)strspn(buf + iarg + 4, "0123456789")
			     + 4 < argl)
				break;

			/*
			 * Convert only the decimal digits following
			 * the four-byte "char" prefix; the check above
			 * guarantees that all bytes from iarg + 4 up
			 * to iendarg are digits.  Including the prefix
			 * letters would always push the value above
			 * 0xff and reject every \[charNNN] sequence.
			 */

			c = 0;
			for (i = iarg + 4; i < iendarg; i++)
				c = 10 * c + (buf[i] - '0');
			if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) {
				err = MANDOCERR_ESC_BADCHAR;
				break;
			}
			/* Let the argument start at the number. */
			iarg += 4;
			rval = ESCAPE_NUMBERED;
			break;
		}

		/*
		 * Unicode escapes are defined in groff as \[u0000]
		 * to \[u10FFFF], where the contained value must be
		 * a valid Unicode codepoint.
		 */

		if (buf[iarg] != 'u' || argl < 5 || argl > 7)
			break;
		if (argl == 7 &&  /* beyond the Unicode range */
		    (buf[iarg + 1] != '1' || buf[iarg + 2] != '0')) {
			err = MANDOCERR_ESC_BADCHAR;
			break;
		}
		/* Five hex digits must not start with a zero. */
		if (argl == 6 && buf[iarg + 1] == '0') {
			err = MANDOCERR_ESC_BADCHAR;
			break;
		}
		if (argl == 5 &&  /* UTF-16 surrogate */
		    toupper((unsigned char)buf[iarg + 1]) == 'D' &&
		    strchr("89ABCDEFabcdef", buf[iarg + 2]) != NULL) {
			err = MANDOCERR_ESC_BADCHAR;
			break;
		}
		/* Accept only if everything after 'u' is hexadecimal. */
		if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
		    + 1 == argl)
			rval = ESCAPE_UNICODE;
		break;
	default:
		break;
	}
	goto out;

out_sub:
	/* Report the nested sequence requiring expansion instead. */
	iesc = sesc;
	inam = snam;
	iarg = sarg;
	iendarg = sendarg;
	iend = send;
	rval = ESCAPE_EXPAND;

out:
	if (resc != NULL)
		*resc = iesc;
	if (rnam != NULL)
		*rnam = inam;
	if (rarg != NULL)
		*rarg = iarg;
	if (rendarg != NULL)
		*rendarg = iendarg;
	if (rend != NULL)
		*rend = iend;
	if (ln == 0)
		return rval;

	/*
	 * Diagnostic messages are only issued when called
	 * from the parser, not when called from the formatters.
	 */

	switch (rval) {
	case ESCAPE_UNSUPP:
		err = MANDOCERR_ESC_UNSUPP;
		break;
	case ESCAPE_UNDEF:
		if (buf[inam] != '\\' && buf[inam] != '.')
			err = MANDOCERR_ESC_UNDEF;
		break;
	case ESCAPE_SPECIAL:
		/* A name known to the character table is never an error. */
		if (mchars_spec2cp(buf + iarg, argl) >= 0)
			err = MANDOCERR_OK;
		else if (err == MANDOCERR_OK)
			err = MANDOCERR_ESC_UNKCHAR;
		break;
	default:
		break;
	}
	if (err != MANDOCERR_OK)
		mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
	return rval;
}