mirror of
https://github.com/openbsd/src.git
synced 2024-12-22 07:27:59 -08:00
Update awk to the Nov 20, 2023 version.
This includes a rewrite of the fnematch() function as well as a refactoring of the sub and gsub implementation.
This commit is contained in:
parent
8d94b3475e
commit
6e363ec32f
@ -25,6 +25,18 @@ THIS SOFTWARE.
|
||||
This file lists all bug fixes, changes, etc., made since the
|
||||
second edition of the AWK book was published in September 2023.
|
||||
|
||||
Nov 20, 2023
|
||||
rewrite of fnematch to fix a number of issues, including
|
||||
extraneous output, out-of-bounds access, number of bytes
|
||||
to push back after a failed match, etc.
|
||||
Thanks to Miguel Pineiro Jr.
|
||||
|
||||
Nov 15, 2023
|
||||
Man page edit, regression test fixes. Thanks to Arnold Robbins.
|
||||
consolidation of sub and gsub into dosub, removing duplicate
|
||||
code. Thanks to Miguel Pineiro Jr.
|
||||
gcc replaced with cc everywhere.
|
||||
|
||||
Oct 30, 2023:
|
||||
multiple fixes and a minor code cleanup.
|
||||
disabled utf-8 for non-multibyte locales, such as C or POSIX.
|
||||
|
152
usr.bin/awk/b.c
152
usr.bin/awk/b.c
@ -1,4 +1,4 @@
|
||||
/* $OpenBSD: b.c,v 1.47 2023/11/15 18:56:53 millert Exp $ */
|
||||
/* $OpenBSD: b.c,v 1.48 2023/11/22 01:01:21 millert Exp $ */
|
||||
/****************************************************************
|
||||
Copyright (C) Lucent Technologies 1997
|
||||
All Rights Reserved
|
||||
@ -770,59 +770,6 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */
|
||||
|
||||
#define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long
|
||||
|
||||
// Read one rune at a time from the given FILE*. Return both
|
||||
// the bytes and the actual rune.
|
||||
|
||||
struct runedata {
|
||||
int rune;
|
||||
size_t len;
|
||||
char bytes[6];
|
||||
};
|
||||
|
||||
struct runedata getrune(FILE *fp)
|
||||
{
|
||||
struct runedata result;
|
||||
int c, i, next;
|
||||
|
||||
memset(&result, 0, sizeof(result));
|
||||
|
||||
c = getc(fp);
|
||||
if (c == EOF)
|
||||
return result; // result.rune == 0 --> EOF
|
||||
else if (c < 128 || awk_mb_cur_max == 1) {
|
||||
result.bytes[0] = c;
|
||||
result.len = 1;
|
||||
result.rune = c;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// need to get bytes and fill things in
|
||||
result.bytes[0] = c;
|
||||
result.len = 1;
|
||||
|
||||
next = 1;
|
||||
for (i = 1; i < MAX_UTF_BYTES; i++) {
|
||||
c = getc(fp);
|
||||
if (c == EOF)
|
||||
break;
|
||||
result.bytes[next++] = c;
|
||||
result.len++;
|
||||
}
|
||||
|
||||
// put back any extra input bytes
|
||||
int actual_len = u8_nextlen(result.bytes);
|
||||
while (result.len > actual_len) {
|
||||
ungetc(result.bytes[--result.len], fp);
|
||||
}
|
||||
|
||||
result.bytes[result.len] = '\0';
|
||||
(void) u8_rune(& result.rune, (uschar *) result.bytes);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* NAME
|
||||
* fnematch
|
||||
@ -840,60 +787,76 @@ struct runedata getrune(FILE *fp)
|
||||
|
||||
bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
|
||||
{
|
||||
char *buf = *pbuf;
|
||||
char *i, *j, *k, *buf = *pbuf;
|
||||
int bufsize = *pbufsize;
|
||||
int i, j, k, ns, s;
|
||||
struct runedata r;
|
||||
int c, n, ns, s;
|
||||
|
||||
s = pfa->initstat;
|
||||
patlen = 0;
|
||||
|
||||
/*
|
||||
* All indices relative to buf.
|
||||
* i <= j <= k <= bufsize
|
||||
* buf <= i <= j <= k <= buf+bufsize
|
||||
*
|
||||
* i: origin of active substring (first byte of first character)
|
||||
* j: current character (last byte of current character)
|
||||
* k: destination of next getc()
|
||||
* i: origin of active substring
|
||||
* j: current character
|
||||
* k: destination of the next getc
|
||||
*/
|
||||
i = -1, k = 0;
|
||||
do {
|
||||
j = i++;
|
||||
do {
|
||||
r = getrune(f);
|
||||
if (r.len == 0) {
|
||||
r.len = 1; // store NUL byte for EOF
|
||||
}
|
||||
j += r.len;
|
||||
if (j >= bufsize) {
|
||||
if (!adjbuf(&buf, &bufsize, j+1, quantum, 0, "fnematch"))
|
||||
FATAL("stream '%.30s...' too long", buf);
|
||||
}
|
||||
memcpy(buf + k, r.bytes, r.len);
|
||||
k += r.len;
|
||||
|
||||
if ((ns = get_gototab(pfa, s, r.rune)) != 0)
|
||||
s = ns;
|
||||
else
|
||||
s = cgoto(pfa, s, r.rune);
|
||||
i = j = k = buf;
|
||||
|
||||
if (pfa->out[s]) { /* final state */
|
||||
patlen = j - i + 1;
|
||||
if (r.rune == 0) /* don't count $ */
|
||||
patlen--;
|
||||
do {
|
||||
/*
|
||||
* Call u8_rune with at least MAX_UTF_BYTES ahead in
|
||||
* the buffer until EOF interferes.
|
||||
*/
|
||||
if (k - j < MAX_UTF_BYTES) {
|
||||
if (k + MAX_UTF_BYTES > buf + bufsize) {
|
||||
adjbuf(&buf, &bufsize,
|
||||
bufsize + MAX_UTF_BYTES,
|
||||
quantum, 0, "fnematch");
|
||||
}
|
||||
} while (buf[j] && s != 1);
|
||||
for (n = MAX_UTF_BYTES ; n > 0; n--) {
|
||||
*k++ = (c = getc(f)) != EOF ? c : 0;
|
||||
if (c == EOF) {
|
||||
if (ferror(f))
|
||||
FATAL("fnematch: getc error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
j += u8_rune(&c, (uschar *)j);
|
||||
|
||||
if ((ns = get_gototab(pfa, s, c)) != 0)
|
||||
s = ns;
|
||||
else
|
||||
s = cgoto(pfa, s, c);
|
||||
|
||||
if (pfa->out[s]) { /* final state */
|
||||
patbeg = i;
|
||||
patlen = j - i;
|
||||
if (c == 0) /* don't count $ */
|
||||
patlen--;
|
||||
}
|
||||
|
||||
if (c && s != 1)
|
||||
continue; /* origin i still viable, next j */
|
||||
if (patlen)
|
||||
break; /* best match found */
|
||||
|
||||
/* no match at origin i, next i and start over */
|
||||
i += u8_rune(&c, (uschar *)i);
|
||||
if (c == 0)
|
||||
break; /* no match */
|
||||
j = i;
|
||||
s = 2;
|
||||
if (r.len > 1)
|
||||
i += r.len - 1; // i incremented around the loop
|
||||
} while (buf[i] && !patlen);
|
||||
} while (1);
|
||||
|
||||
/* adjbuf() may have relocated a resized buffer. Inform the world. */
|
||||
*pbuf = buf;
|
||||
*pbufsize = bufsize;
|
||||
|
||||
if (patlen) {
|
||||
patbeg = buf + i;
|
||||
/*
|
||||
* Under no circumstances is the last character fed to
|
||||
* the automaton part of the match. It is EOF's nullbyte,
|
||||
@ -905,10 +868,11 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
|
||||
* (except for EOF's nullbyte, if present) and null
|
||||
* terminate the buffer.
|
||||
*/
|
||||
for (; r.len > 0; r.len--)
|
||||
if (buf[--k] && ungetc(buf[k], f) == EOF)
|
||||
FATAL("unable to ungetc '%c'", buf[k]);
|
||||
buf[k-patlen] = '\0';
|
||||
do
|
||||
if (*--k && ungetc(*k, f) == EOF)
|
||||
FATAL("unable to ungetc '%c'", *k);
|
||||
while (k > patbeg + patlen);
|
||||
*k = '\0';
|
||||
return true;
|
||||
}
|
||||
else
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* $OpenBSD: main.c,v 1.64 2023/10/31 01:08:51 millert Exp $ */
|
||||
/* $OpenBSD: main.c,v 1.65 2023/11/22 01:01:21 millert Exp $ */
|
||||
/****************************************************************
|
||||
Copyright (C) Lucent Technologies 1997
|
||||
All Rights Reserved
|
||||
@ -23,7 +23,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
|
||||
THIS SOFTWARE.
|
||||
****************************************************************/
|
||||
|
||||
const char *version = "version 20231030";
|
||||
const char *version = "version 20231120";
|
||||
|
||||
#define DEBUG
|
||||
#include <stdio.h>
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* $OpenBSD: maketab.c,v 1.21 2023/10/30 17:52:54 millert Exp $ */
|
||||
/* $OpenBSD: maketab.c,v 1.22 2023/11/22 01:01:21 millert Exp $ */
|
||||
/****************************************************************
|
||||
Copyright (C) Lucent Technologies 1997
|
||||
All Rights Reserved
|
||||
@ -53,8 +53,8 @@ struct xx
|
||||
{ ARRAY, "array", NULL },
|
||||
{ INDIRECT, "indirect", "$(" },
|
||||
{ SUBSTR, "substr", "substr" },
|
||||
{ SUB, "sub", "sub" },
|
||||
{ GSUB, "gsub", "gsub" },
|
||||
{ SUB, "dosub", "sub" },
|
||||
{ GSUB, "dosub", "gsub" },
|
||||
{ INDEX, "sindex", "sindex" },
|
||||
{ SPRINTF, "awksprintf", "sprintf " },
|
||||
{ ADD, "arith", " + " },
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* $OpenBSD: proto.h,v 1.22 2023/09/17 14:49:44 millert Exp $ */
|
||||
/* $OpenBSD: proto.h,v 1.23 2023/11/22 01:01:21 millert Exp $ */
|
||||
/****************************************************************
|
||||
Copyright (C) Lucent Technologies 1997
|
||||
All Rights Reserved
|
||||
@ -199,8 +199,7 @@ extern FILE *openfile(int, const char *, bool *);
|
||||
extern const char *filename(FILE *);
|
||||
extern Cell *closefile(Node **, int);
|
||||
extern void closeall(void);
|
||||
extern Cell *sub(Node **, int);
|
||||
extern Cell *gsub(Node **, int);
|
||||
extern Cell *dosub(Node **, int);
|
||||
extern Cell *gensub(Node **, int);
|
||||
|
||||
extern FILE *popen(const char *, const char *);
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* $OpenBSD: run.c,v 1.80 2023/10/28 22:38:22 millert Exp $ */
|
||||
/* $OpenBSD: run.c,v 1.81 2023/11/22 01:01:21 millert Exp $ */
|
||||
/****************************************************************
|
||||
Copyright (C) Lucent Technologies 1997
|
||||
All Rights Reserved
|
||||
@ -2518,169 +2518,143 @@ static void flush_all(void)
|
||||
|
||||
void backsub(char **pb_ptr, const char **sptr_ptr);
|
||||
|
||||
Cell *sub(Node **a, int nnn) /* substitute command */
|
||||
Cell *dosub(Node **a, int subop) /* sub and gsub */
|
||||
{
|
||||
const char *sptr, *q;
|
||||
Cell *x, *y, *result;
|
||||
char *t, *buf, *pb;
|
||||
fa *pfa;
|
||||
int tempstat;
|
||||
char *repl;
|
||||
Cell *x;
|
||||
|
||||
char *buf = NULL;
|
||||
char *pb = NULL;
|
||||
int bufsz = recsize;
|
||||
|
||||
if ((buf = (char *) malloc(bufsz)) == NULL)
|
||||
FATAL("out of memory in sub");
|
||||
x = execute(a[3]); /* target string */
|
||||
t = getsval(x);
|
||||
if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */
|
||||
pfa = (fa *) a[1]; /* regular expression */
|
||||
else {
|
||||
y = execute(a[1]);
|
||||
pfa = makedfa(getsval(y), 1);
|
||||
tempfree(y);
|
||||
}
|
||||
y = execute(a[2]); /* replacement string */
|
||||
result = False;
|
||||
if (pmatch(pfa, t)) {
|
||||
sptr = t;
|
||||
adjbuf(&buf, &bufsz, 1+patbeg-sptr, recsize, 0, "sub");
|
||||
pb = buf;
|
||||
while (sptr < patbeg)
|
||||
*pb++ = *sptr++;
|
||||
sptr = getsval(y);
|
||||
while (*sptr != '\0') {
|
||||
adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "sub");
|
||||
if (*sptr == '\\') {
|
||||
backsub(&pb, &sptr);
|
||||
} else if (*sptr == '&') {
|
||||
sptr++;
|
||||
adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "sub");
|
||||
for (q = patbeg; q < patbeg+patlen; )
|
||||
*pb++ = *q++;
|
||||
} else
|
||||
*pb++ = *sptr++;
|
||||
}
|
||||
*pb = '\0';
|
||||
if (pb > buf + bufsz)
|
||||
FATAL("sub result1 %.30s too big; can't happen", buf);
|
||||
sptr = patbeg + patlen;
|
||||
if ((patlen == 0 && *patbeg) || (patlen && *(sptr-1))) {
|
||||
adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "sub");
|
||||
while ((*pb++ = *sptr++) != '\0')
|
||||
continue;
|
||||
}
|
||||
if (pb > buf + bufsz)
|
||||
FATAL("sub result2 %.30s too big; can't happen", buf);
|
||||
setsval(x, buf); /* BUG: should be able to avoid copy */
|
||||
result = True;
|
||||
const char *r, *s;
|
||||
const char *start;
|
||||
const char *noempty = NULL; /* empty match disallowed here */
|
||||
size_t m = 0; /* match count */
|
||||
size_t whichm; /* which match to select, 0 = global */
|
||||
int mtype; /* match type */
|
||||
|
||||
if (a[0] == NULL) { /* 0 => a[1] is already-compiled regexpr */
|
||||
pfa = (fa *) a[1];
|
||||
} else {
|
||||
x = execute(a[1]);
|
||||
pfa = makedfa(getsval(x), 1);
|
||||
tempfree(x);
|
||||
}
|
||||
|
||||
x = execute(a[2]); /* replacement string */
|
||||
repl = tostring(getsval(x));
|
||||
tempfree(x);
|
||||
tempfree(y);
|
||||
free(buf);
|
||||
return result;
|
||||
}
|
||||
|
||||
Cell *gsub(Node **a, int nnn) /* global substitute */
|
||||
{
|
||||
Cell *x, *y;
|
||||
char *rptr, *pb;
|
||||
const char *q, *t, *sptr;
|
||||
char *buf;
|
||||
fa *pfa;
|
||||
int mflag, tempstat, num;
|
||||
int bufsz = recsize;
|
||||
int charlen = 0;
|
||||
|
||||
if ((buf = (char *) malloc(bufsz)) == NULL)
|
||||
FATAL("out of memory in gsub");
|
||||
mflag = 0; /* if mflag == 0, can replace empty string */
|
||||
num = 0;
|
||||
x = execute(a[3]); /* target string */
|
||||
t = getsval(x);
|
||||
if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */
|
||||
pfa = (fa *) a[1]; /* regular expression */
|
||||
else {
|
||||
y = execute(a[1]);
|
||||
pfa = makedfa(getsval(y), 1);
|
||||
tempfree(y);
|
||||
switch (subop) {
|
||||
case SUB:
|
||||
whichm = 1;
|
||||
x = execute(a[3]); /* source string */
|
||||
break;
|
||||
case GSUB:
|
||||
whichm = 0;
|
||||
x = execute(a[3]); /* source string */
|
||||
break;
|
||||
default:
|
||||
FATAL("dosub: unrecognized subop: %d", subop);
|
||||
}
|
||||
y = execute(a[2]); /* replacement string */
|
||||
if (pmatch(pfa, t)) {
|
||||
tempstat = pfa->initstat;
|
||||
pfa->initstat = 2;
|
||||
pb = buf;
|
||||
rptr = getsval(y);
|
||||
do {
|
||||
if (patlen == 0 && *patbeg != '\0') { /* matched empty string */
|
||||
if (mflag == 0) { /* can replace empty */
|
||||
num++;
|
||||
sptr = rptr;
|
||||
while (*sptr != '\0') {
|
||||
adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
|
||||
if (*sptr == '\\') {
|
||||
backsub(&pb, &sptr);
|
||||
} else if (*sptr == '&') {
|
||||
sptr++;
|
||||
adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
|
||||
for (q = patbeg; q < patbeg+patlen; )
|
||||
*pb++ = *q++;
|
||||
} else
|
||||
*pb++ = *sptr++;
|
||||
}
|
||||
}
|
||||
if (*t == '\0') /* at end */
|
||||
goto done;
|
||||
adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gsub");
|
||||
charlen = u8_nextlen(t);
|
||||
while (charlen-- > 0)
|
||||
*pb++ = *t++;
|
||||
if (pb > buf + bufsz) /* BUG: not sure of this test */
|
||||
FATAL("gsub result0 %.30s too big; can't happen", buf);
|
||||
mflag = 0;
|
||||
|
||||
start = getsval(x);
|
||||
while (pmatch(pfa, start)) {
|
||||
if (buf == NULL) {
|
||||
if ((pb = buf = malloc(bufsz)) == NULL)
|
||||
FATAL("out of memory in dosub");
|
||||
tempstat = pfa->initstat;
|
||||
pfa->initstat = 2;
|
||||
}
|
||||
|
||||
/* match types */
|
||||
#define MT_IGNORE 0 /* unselected or invalid */
|
||||
#define MT_INSERT 1 /* selected, empty */
|
||||
#define MT_REPLACE 2 /* selected, not empty */
|
||||
|
||||
/* an empty match just after replacement is invalid */
|
||||
|
||||
if (patbeg == noempty && patlen == 0) {
|
||||
mtype = MT_IGNORE; /* invalid, not counted */
|
||||
} else if (whichm == ++m || whichm == 0) {
|
||||
mtype = patlen ? MT_REPLACE : MT_INSERT;
|
||||
} else {
|
||||
mtype = MT_IGNORE; /* unselected, but counted */
|
||||
}
|
||||
|
||||
/* leading text: */
|
||||
if (patbeg > start) {
|
||||
adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start),
|
||||
recsize, &pb, "dosub");
|
||||
s = start;
|
||||
while (s < patbeg)
|
||||
*pb++ = *s++;
|
||||
}
|
||||
|
||||
if (mtype == MT_IGNORE)
|
||||
goto matching_text; /* skip replacement text */
|
||||
|
||||
r = repl;
|
||||
while (*r != 0) {
|
||||
adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub");
|
||||
if (*r == '\\') {
|
||||
backsub(&pb, &r);
|
||||
} else if (*r == '&') {
|
||||
r++;
|
||||
adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize,
|
||||
&pb, "dosub");
|
||||
for (s = patbeg; s < patbeg+patlen; )
|
||||
*pb++ = *s++;
|
||||
} else {
|
||||
*pb++ = *r++;
|
||||
}
|
||||
else { /* matched nonempty string */
|
||||
num++;
|
||||
sptr = t;
|
||||
adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gsub");
|
||||
while (sptr < patbeg)
|
||||
*pb++ = *sptr++;
|
||||
sptr = rptr;
|
||||
while (*sptr != '\0') {
|
||||
adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
|
||||
if (*sptr == '\\') {
|
||||
backsub(&pb, &sptr);
|
||||
} else if (*sptr == '&') {
|
||||
sptr++;
|
||||
adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
|
||||
for (q = patbeg; q < patbeg+patlen; )
|
||||
*pb++ = *q++;
|
||||
} else
|
||||
*pb++ = *sptr++;
|
||||
}
|
||||
t = patbeg + patlen;
|
||||
if (patlen == 0 || *t == '\0' || *(t-1) == '\0')
|
||||
goto done;
|
||||
if (pb > buf + bufsz)
|
||||
FATAL("gsub result1 %.30s too big; can't happen", buf);
|
||||
mflag = 1;
|
||||
}
|
||||
} while (pmatch(pfa,t));
|
||||
sptr = t;
|
||||
adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gsub");
|
||||
while ((*pb++ = *sptr++) != '\0')
|
||||
continue;
|
||||
done: if (pb < buf + bufsz)
|
||||
*pb = '\0';
|
||||
else if (*(pb-1) != '\0')
|
||||
FATAL("gsub result2 %.30s truncated; can't happen", buf);
|
||||
setsval(x, buf); /* BUG: should be able to avoid copy + free */
|
||||
}
|
||||
|
||||
matching_text:
|
||||
if (mtype == MT_REPLACE || *patbeg == '\0')
|
||||
goto next_search; /* skip matching text */
|
||||
|
||||
if (patlen == 0)
|
||||
patlen = u8_nextlen(patbeg);
|
||||
adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub");
|
||||
s = patbeg;
|
||||
while (s < patbeg + patlen)
|
||||
*pb++ = *s++;
|
||||
|
||||
next_search:
|
||||
start = patbeg + patlen;
|
||||
if (m == whichm || *patbeg == '\0')
|
||||
break;
|
||||
if (mtype == MT_REPLACE)
|
||||
noempty = start;
|
||||
|
||||
#undef MT_IGNORE
|
||||
#undef MT_INSERT
|
||||
#undef MT_REPLACE
|
||||
}
|
||||
|
||||
xfree(repl);
|
||||
|
||||
if (buf != NULL) {
|
||||
pfa->initstat = tempstat;
|
||||
|
||||
/* trailing text */
|
||||
adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub");
|
||||
while ((*pb++ = *start++) != '\0')
|
||||
;
|
||||
|
||||
setsval(x, buf);
|
||||
free(buf);
|
||||
}
|
||||
|
||||
tempfree(x);
|
||||
tempfree(y);
|
||||
x = gettemp();
|
||||
x->tval = NUM;
|
||||
x->fval = num;
|
||||
free(buf);
|
||||
return(x);
|
||||
x->fval = m;
|
||||
return x;
|
||||
}
|
||||
|
||||
Cell *gensub(Node **a, int nnn) /* global selective substitute */
|
||||
|
Loading…
Reference in New Issue
Block a user