1
0
mirror of https://github.com/openbsd/src.git synced 2024-12-22 07:27:59 -08:00

Update awk to the Nov 20, 2023 version.

This includes a rewrite of the fnematch() function as well as a
refactoring of the sub and gsub implementation.
This commit is contained in:
millert 2023-11-22 01:01:21 +00:00
parent 8d94b3475e
commit 6e363ec32f
6 changed files with 201 additions and 252 deletions

View File

@ -25,6 +25,18 @@ THIS SOFTWARE.
This file lists all bug fixes, changes, etc., made since the
second edition of the AWK book was published in September 2023.
Nov 20, 2023
rewrite of fnematch to fix a number of issues, including
extraneous output, out-of-bounds access, and the number of
bytes to push back after a failed match, etc.
Thanks to Miguel Pineiro Jr.
Nov 15, 2023
Man page edit, regression test fixes. Thanks to Arnold Robbins.
Consolidation of sub and gsub into dosub, removing duplicate
code. Thanks to Miguel Pineiro Jr.
gcc replaced with cc everywhere.
Oct 30, 2023:
multiple fixes and a minor code cleanup.
disabled utf-8 for non-multibyte locales, such as C or POSIX.

View File

@ -1,4 +1,4 @@
/* $OpenBSD: b.c,v 1.47 2023/11/15 18:56:53 millert Exp $ */
/* $OpenBSD: b.c,v 1.48 2023/11/22 01:01:21 millert Exp $ */
/****************************************************************
Copyright (C) Lucent Technologies 1997
All Rights Reserved
@ -770,59 +770,6 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */
#define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long
// Read one rune at a time from the given FILE*. Return both
// the bytes and the actual rune.
/*
 * Result of one getrune() call: the raw bytes that were read, how many
 * of them belong to the character, and the rune value.
 */
struct runedata {
int rune;	// rune value; 0 at EOF, but also for a NUL input byte, so check len
size_t len;	// number of bytes kept in bytes[]; left at 0 when getrune() hits EOF
char bytes[6];	// raw bytes; getrune() stores at most MAX_UTF_BYTES, then a '\0'
};
/*
 * Read one character from fp and return its bytes, byte count and rune.
 *
 * At immediate EOF the zero-filled result is returned unchanged
 * (len == 0, rune == 0).  A byte below 128, or any byte when
 * awk_mb_cur_max == 1, is returned as a one-byte rune.  Otherwise up to
 * MAX_UTF_BYTES bytes are read ahead, the bytes beyond the length
 * reported by u8_nextlen() are pushed back onto fp, and the rune is
 * filled in by u8_rune().
 */
struct runedata getrune(FILE *fp)
{
struct runedata result;
int c, i, next;
// start from all zeroes so the EOF return below needs no further setup
memset(&result, 0, sizeof(result));
c = getc(fp);
if (c == EOF)
return result; // result.rune == 0 --> EOF
// presumably awk_mb_cur_max == 1 means a single-byte locale -- confirm
else if (c < 128 || awk_mb_cur_max == 1) {
result.bytes[0] = c;
result.len = 1;
result.rune = c;
return result;
}
// need to get bytes and fill things in
result.bytes[0] = c;
result.len = 1;
next = 1;
// read ahead until MAX_UTF_BYTES bytes are held or input runs out
for (i = 1; i < MAX_UTF_BYTES; i++) {
c = getc(fp);
if (c == EOF)
break;
result.bytes[next++] = c;
result.len++;
}
// put back any extra input bytes
// NOTE(review): result.len is size_t and actual_len is int, so the
// comparison below is done unsigned; assumes u8_nextlen() never
// returns a negative value -- confirm.
int actual_len = u8_nextlen(result.bytes);
// Surplus bytes are pushed back last-read first so they are re-read in
// their original order.
// NOTE(review): the ungetc() result is not checked, and ISO C only
// guarantees one byte of pushback; up to MAX_UTF_BYTES - 1 may be
// pushed here.
while (result.len > actual_len) {
ungetc(result.bytes[--result.len], fp);
}
// terminate the kept bytes before handing them to u8_rune()
result.bytes[result.len] = '\0';
// only the rune stored through the first argument is used; the return
// value is deliberately discarded
(void) u8_rune(& result.rune, (uschar *) result.bytes);
return result;
}
/*
* NAME
* fnematch
@ -840,60 +787,76 @@ struct runedata getrune(FILE *fp)
bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
{
char *buf = *pbuf;
char *i, *j, *k, *buf = *pbuf;
int bufsize = *pbufsize;
int i, j, k, ns, s;
struct runedata r;
int c, n, ns, s;
s = pfa->initstat;
patlen = 0;
/*
* All indices relative to buf.
* i <= j <= k <= bufsize
* buf <= i <= j <= k <= buf+bufsize
*
* i: origin of active substring (first byte of first character)
* j: current character (last byte of current character)
* k: destination of next getc()
* i: origin of active substring
* j: current character
* k: destination of the next getc
*/
i = -1, k = 0;
do {
j = i++;
do {
r = getrune(f);
if (r.len == 0) {
r.len = 1; // store NUL byte for EOF
}
j += r.len;
if (j >= bufsize) {
if (!adjbuf(&buf, &bufsize, j+1, quantum, 0, "fnematch"))
FATAL("stream '%.30s...' too long", buf);
}
memcpy(buf + k, r.bytes, r.len);
k += r.len;
if ((ns = get_gototab(pfa, s, r.rune)) != 0)
s = ns;
else
s = cgoto(pfa, s, r.rune);
i = j = k = buf;
if (pfa->out[s]) { /* final state */
patlen = j - i + 1;
if (r.rune == 0) /* don't count $ */
patlen--;
do {
/*
* Call u8_rune with at least MAX_UTF_BYTES ahead in
* the buffer until EOF interferes.
*/
if (k - j < MAX_UTF_BYTES) {
if (k + MAX_UTF_BYTES > buf + bufsize) {
adjbuf(&buf, &bufsize,
bufsize + MAX_UTF_BYTES,
quantum, 0, "fnematch");
}
} while (buf[j] && s != 1);
for (n = MAX_UTF_BYTES ; n > 0; n--) {
*k++ = (c = getc(f)) != EOF ? c : 0;
if (c == EOF) {
if (ferror(f))
FATAL("fnematch: getc error");
break;
}
}
}
j += u8_rune(&c, (uschar *)j);
if ((ns = get_gototab(pfa, s, c)) != 0)
s = ns;
else
s = cgoto(pfa, s, c);
if (pfa->out[s]) { /* final state */
patbeg = i;
patlen = j - i;
if (c == 0) /* don't count $ */
patlen--;
}
if (c && s != 1)
continue; /* origin i still viable, next j */
if (patlen)
break; /* best match found */
/* no match at origin i, next i and start over */
i += u8_rune(&c, (uschar *)i);
if (c == 0)
break; /* no match */
j = i;
s = 2;
if (r.len > 1)
i += r.len - 1; // i incremented around the loop
} while (buf[i] && !patlen);
} while (1);
/* adjbuf() may have relocated a resized buffer. Inform the world. */
*pbuf = buf;
*pbufsize = bufsize;
if (patlen) {
patbeg = buf + i;
/*
* Under no circumstances is the last character fed to
* the automaton part of the match. It is EOF's nullbyte,
@ -905,10 +868,11 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
* (except for EOF's nullbyte, if present) and null
* terminate the buffer.
*/
for (; r.len > 0; r.len--)
if (buf[--k] && ungetc(buf[k], f) == EOF)
FATAL("unable to ungetc '%c'", buf[k]);
buf[k-patlen] = '\0';
do
if (*--k && ungetc(*k, f) == EOF)
FATAL("unable to ungetc '%c'", *k);
while (k > patbeg + patlen);
*k = '\0';
return true;
}
else

View File

@ -1,4 +1,4 @@
/* $OpenBSD: main.c,v 1.64 2023/10/31 01:08:51 millert Exp $ */
/* $OpenBSD: main.c,v 1.65 2023/11/22 01:01:21 millert Exp $ */
/****************************************************************
Copyright (C) Lucent Technologies 1997
All Rights Reserved
@ -23,7 +23,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
****************************************************************/
const char *version = "version 20231030";
const char *version = "version 20231120";
#define DEBUG
#include <stdio.h>

View File

@ -1,4 +1,4 @@
/* $OpenBSD: maketab.c,v 1.21 2023/10/30 17:52:54 millert Exp $ */
/* $OpenBSD: maketab.c,v 1.22 2023/11/22 01:01:21 millert Exp $ */
/****************************************************************
Copyright (C) Lucent Technologies 1997
All Rights Reserved
@ -53,8 +53,8 @@ struct xx
{ ARRAY, "array", NULL },
{ INDIRECT, "indirect", "$(" },
{ SUBSTR, "substr", "substr" },
{ SUB, "sub", "sub" },
{ GSUB, "gsub", "gsub" },
{ SUB, "dosub", "sub" },
{ GSUB, "dosub", "gsub" },
{ INDEX, "sindex", "sindex" },
{ SPRINTF, "awksprintf", "sprintf " },
{ ADD, "arith", " + " },

View File

@ -1,4 +1,4 @@
/* $OpenBSD: proto.h,v 1.22 2023/09/17 14:49:44 millert Exp $ */
/* $OpenBSD: proto.h,v 1.23 2023/11/22 01:01:21 millert Exp $ */
/****************************************************************
Copyright (C) Lucent Technologies 1997
All Rights Reserved
@ -199,8 +199,7 @@ extern FILE *openfile(int, const char *, bool *);
extern const char *filename(FILE *);
extern Cell *closefile(Node **, int);
extern void closeall(void);
extern Cell *sub(Node **, int);
extern Cell *gsub(Node **, int);
extern Cell *dosub(Node **, int);
extern Cell *gensub(Node **, int);
extern FILE *popen(const char *, const char *);

View File

@ -1,4 +1,4 @@
/* $OpenBSD: run.c,v 1.80 2023/10/28 22:38:22 millert Exp $ */
/* $OpenBSD: run.c,v 1.81 2023/11/22 01:01:21 millert Exp $ */
/****************************************************************
Copyright (C) Lucent Technologies 1997
All Rights Reserved
@ -2518,169 +2518,143 @@ static void flush_all(void)
void backsub(char **pb_ptr, const char **sptr_ptr);
Cell *sub(Node **a, int nnn) /* substitute command */
Cell *dosub(Node **a, int subop) /* sub and gsub */
{
const char *sptr, *q;
Cell *x, *y, *result;
char *t, *buf, *pb;
fa *pfa;
int tempstat;
char *repl;
Cell *x;
char *buf = NULL;
char *pb = NULL;
int bufsz = recsize;
if ((buf = (char *) malloc(bufsz)) == NULL)
FATAL("out of memory in sub");
x = execute(a[3]); /* target string */
t = getsval(x);
if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */
pfa = (fa *) a[1]; /* regular expression */
else {
y = execute(a[1]);
pfa = makedfa(getsval(y), 1);
tempfree(y);
}
y = execute(a[2]); /* replacement string */
result = False;
if (pmatch(pfa, t)) {
sptr = t;
adjbuf(&buf, &bufsz, 1+patbeg-sptr, recsize, 0, "sub");
pb = buf;
while (sptr < patbeg)
*pb++ = *sptr++;
sptr = getsval(y);
while (*sptr != '\0') {
adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "sub");
if (*sptr == '\\') {
backsub(&pb, &sptr);
} else if (*sptr == '&') {
sptr++;
adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "sub");
for (q = patbeg; q < patbeg+patlen; )
*pb++ = *q++;
} else
*pb++ = *sptr++;
}
*pb = '\0';
if (pb > buf + bufsz)
FATAL("sub result1 %.30s too big; can't happen", buf);
sptr = patbeg + patlen;
if ((patlen == 0 && *patbeg) || (patlen && *(sptr-1))) {
adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "sub");
while ((*pb++ = *sptr++) != '\0')
continue;
}
if (pb > buf + bufsz)
FATAL("sub result2 %.30s too big; can't happen", buf);
setsval(x, buf); /* BUG: should be able to avoid copy */
result = True;
const char *r, *s;
const char *start;
const char *noempty = NULL; /* empty match disallowed here */
size_t m = 0; /* match count */
size_t whichm; /* which match to select, 0 = global */
int mtype; /* match type */
if (a[0] == NULL) { /* 0 => a[1] is already-compiled regexpr */
pfa = (fa *) a[1];
} else {
x = execute(a[1]);
pfa = makedfa(getsval(x), 1);
tempfree(x);
}
x = execute(a[2]); /* replacement string */
repl = tostring(getsval(x));
tempfree(x);
tempfree(y);
free(buf);
return result;
}
Cell *gsub(Node **a, int nnn) /* global substitute */
{
Cell *x, *y;
char *rptr, *pb;
const char *q, *t, *sptr;
char *buf;
fa *pfa;
int mflag, tempstat, num;
int bufsz = recsize;
int charlen = 0;
if ((buf = (char *) malloc(bufsz)) == NULL)
FATAL("out of memory in gsub");
mflag = 0; /* if mflag == 0, can replace empty string */
num = 0;
x = execute(a[3]); /* target string */
t = getsval(x);
if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */
pfa = (fa *) a[1]; /* regular expression */
else {
y = execute(a[1]);
pfa = makedfa(getsval(y), 1);
tempfree(y);
switch (subop) {
case SUB:
whichm = 1;
x = execute(a[3]); /* source string */
break;
case GSUB:
whichm = 0;
x = execute(a[3]); /* source string */
break;
default:
FATAL("dosub: unrecognized subop: %d", subop);
}
y = execute(a[2]); /* replacement string */
if (pmatch(pfa, t)) {
tempstat = pfa->initstat;
pfa->initstat = 2;
pb = buf;
rptr = getsval(y);
do {
if (patlen == 0 && *patbeg != '\0') { /* matched empty string */
if (mflag == 0) { /* can replace empty */
num++;
sptr = rptr;
while (*sptr != '\0') {
adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
if (*sptr == '\\') {
backsub(&pb, &sptr);
} else if (*sptr == '&') {
sptr++;
adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
for (q = patbeg; q < patbeg+patlen; )
*pb++ = *q++;
} else
*pb++ = *sptr++;
}
}
if (*t == '\0') /* at end */
goto done;
adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gsub");
charlen = u8_nextlen(t);
while (charlen-- > 0)
*pb++ = *t++;
if (pb > buf + bufsz) /* BUG: not sure of this test */
FATAL("gsub result0 %.30s too big; can't happen", buf);
mflag = 0;
start = getsval(x);
while (pmatch(pfa, start)) {
if (buf == NULL) {
if ((pb = buf = malloc(bufsz)) == NULL)
FATAL("out of memory in dosub");
tempstat = pfa->initstat;
pfa->initstat = 2;
}
/* match types */
#define MT_IGNORE 0 /* unselected or invalid */
#define MT_INSERT 1 /* selected, empty */
#define MT_REPLACE 2 /* selected, not empty */
/* an empty match just after replacement is invalid */
if (patbeg == noempty && patlen == 0) {
mtype = MT_IGNORE; /* invalid, not counted */
} else if (whichm == ++m || whichm == 0) {
mtype = patlen ? MT_REPLACE : MT_INSERT;
} else {
mtype = MT_IGNORE; /* unselected, but counted */
}
/* leading text: */
if (patbeg > start) {
adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start),
recsize, &pb, "dosub");
s = start;
while (s < patbeg)
*pb++ = *s++;
}
if (mtype == MT_IGNORE)
goto matching_text; /* skip replacement text */
r = repl;
while (*r != 0) {
adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub");
if (*r == '\\') {
backsub(&pb, &r);
} else if (*r == '&') {
r++;
adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize,
&pb, "dosub");
for (s = patbeg; s < patbeg+patlen; )
*pb++ = *s++;
} else {
*pb++ = *r++;
}
else { /* matched nonempty string */
num++;
sptr = t;
adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gsub");
while (sptr < patbeg)
*pb++ = *sptr++;
sptr = rptr;
while (*sptr != '\0') {
adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
if (*sptr == '\\') {
backsub(&pb, &sptr);
} else if (*sptr == '&') {
sptr++;
adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
for (q = patbeg; q < patbeg+patlen; )
*pb++ = *q++;
} else
*pb++ = *sptr++;
}
t = patbeg + patlen;
if (patlen == 0 || *t == '\0' || *(t-1) == '\0')
goto done;
if (pb > buf + bufsz)
FATAL("gsub result1 %.30s too big; can't happen", buf);
mflag = 1;
}
} while (pmatch(pfa,t));
sptr = t;
adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gsub");
while ((*pb++ = *sptr++) != '\0')
continue;
done: if (pb < buf + bufsz)
*pb = '\0';
else if (*(pb-1) != '\0')
FATAL("gsub result2 %.30s truncated; can't happen", buf);
setsval(x, buf); /* BUG: should be able to avoid copy + free */
}
matching_text:
if (mtype == MT_REPLACE || *patbeg == '\0')
goto next_search; /* skip matching text */
if (patlen == 0)
patlen = u8_nextlen(patbeg);
adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub");
s = patbeg;
while (s < patbeg + patlen)
*pb++ = *s++;
next_search:
start = patbeg + patlen;
if (m == whichm || *patbeg == '\0')
break;
if (mtype == MT_REPLACE)
noempty = start;
#undef MT_IGNORE
#undef MT_INSERT
#undef MT_REPLACE
}
xfree(repl);
if (buf != NULL) {
pfa->initstat = tempstat;
/* trailing text */
adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub");
while ((*pb++ = *start++) != '\0')
;
setsval(x, buf);
free(buf);
}
tempfree(x);
tempfree(y);
x = gettemp();
x->tval = NUM;
x->fval = num;
free(buf);
return(x);
x->fval = m;
return x;
}
Cell *gensub(Node **a, int nnn) /* global selective substitute */