#include #include #include /* compile ed-like regular expressions to x86 code. * * mpr * * Note: this is much slower than GNU grep. Sorry. I found out about NFAs and * DFAs and Boyer-Moore only *after* having coded this. The code generated * represents a NFA, of course. */ /* meta-characters: * * ^ beginning of line * $ end of line * . any character except newline * [ ] character class (negated class if first character is `^'; `-' means * range, e.g. a-z) * * previous exp, zero or more times * \+ previous exp, one or more times * \? previous exp, zero or one time * \( \) subexpression (may nest) * * example: * * \(\(f[o-qz]*b\)\+a\)\+c matches fobfbfqqqbafooobfpzqbac */ /* BUGS: * * - compiled code for an expression or subexpression may break if it exceeds * 128 bytes (because of short jumps) */ /* TODO: * * - alternation ( \| ) */ #define CHAR 0 #define ANY 1 #define CLASS 2 #define NCLASS 3 #define STAR 4 #define PLUS 5 #define EXPR 6 #define QUESTION 7 #define BOL 8 #define EOL 9 #define END 10 struct patrn { int type; void *data; struct patrn *next, *prev; }; void setbit(b, n) char *b; { b[n >> 3] |= 1 << (n & 7); } int getbit(b, n) char *b; { return b[n >> 3] & (1 << (n & 7)); } void store(ppatrn, type, data) struct patrn **ppatrn; void *data; { struct patrn *p; p = (struct patrn *)malloc(sizeof(struct patrn)); p->type = type; p->data = data; p->next = p->prev = p; if (*ppatrn == NULL) { *ppatrn = p; } else { if (type == STAR || type == PLUS || type == QUESTION) { struct patrn *last; last = (*ppatrn)->prev; if (last == *ppatrn) *ppatrn = p; p->next = last; last->prev->next = p; p->prev = last->prev; last->prev = p; } else { (*ppatrn)->prev->next = p; p->prev = (*ppatrn)->prev; (*ppatrn)->prev = p; p->next = *ppatrn; } } } char *cclass(ppatrn, p) struct patrn **ppatrn; char *p; { char *bfield; int type; bfield = malloc(256 / 8); bzero(bfield, 256 / 8); if (*p == '^') { type = NCLASS; ++p; } else { type = CLASS; } for (; *p && *p != ']'; p++) { if (*p == '-' && p[-1] != '[') { int c; if (p[1] == ']' || !p[1]) return NULL; for (c = p[-1]; c != p[1]; c += p[1] > p[-1] ? 1 : -1) setbit(bfield, c); } else { setbit(bfield, *p); } } if (!*p) return NULL; store(ppatrn, type, bfield); return p; } struct patrn *study(pp, r) char **pp; { struct patrn *patrn = NULL; char *p = *pp; for(; *p; p++) { switch (*p) { case '.': store(&patrn, ANY, 0); break; case '[': if ((p = cclass(&patrn, p + 1)) == NULL) return NULL; break; case '*': if (patrn == NULL) return NULL; store(&patrn, STAR, 0); break; case '^': if (patrn != NULL) return NULL; store(&patrn, BOL, 0); break; case '$': if (p[1]) return NULL; store(&patrn, EOL, 0); break; case '\\': if (p[1]) { /* '+' and '?' have to be preceded by * backslash */ if (p[1] == '+' || p[1] == '?') { if (patrn == NULL) return NULL; store(&patrn, p[1] == '+' ? PLUS : QUESTION, 0); p++; break; } else if (p[1] == '(') { struct patrn *ipatrn; p += 2; if ((ipatrn = study(&p, 1)) == NULL) { return NULL; } store(&patrn, EXPR, ipatrn); break; } else if (p[1] == ')') { if (!r) return NULL; p++; goto leave; } ++p; } default: store(&patrn, CHAR, *p); break; } } leave: store(&patrn, END, 0); *pp = p; return patrn; } #define MAGIC_OFFSET 0xff void fixjump(b, e, c, magic) unsigned char *b, *e, *c, magic; { unsigned char *p; for (p = b; p < e; p++) { if ((*p == 0x74 || *p == 0x75 || *p == 0xeb || *p == 0x7e) && p[1] == magic) { p[1] = (unsigned)c - (unsigned)p - 2; p++; } } } char *compile(struct patrn *p); unsigned char *compilec(p, b, incptr) struct patrn *p; unsigned char *b; { int i, j; unsigned char *s, *eaddr; if (p->type != EXPR) { if (incptr) { /* lods %ds:(%esi),%al */ *b++ = 0xac; } else { /* movl %al,(%esi) */ *b++ = 0x8a; *b++ = 0x06; } } switch (p->type) { case CHAR: /* cmp $(p->data),%al */ *b++ = 0x3c; *b++ = (int)p->data; *b++ = 0x75; *b++ = MAGIC_OFFSET; /* jne fail */ break; case ANY: *b++ = 0x84; *b++ = 0xc0; /* test %al,%al */ *b++ = 0x74; *b++ = MAGIC_OFFSET; /* je fail */ break; case EOL: *b++ = 0x84; *b++ = 0xc0; /* test %al,%al */ *b++ = 0x75; *b++ = MAGIC_OFFSET; /* jne fail */ break; case EXPR: eaddr = compile((struct patrn *)p->data); if (!incptr) *b++ = 0x56; /* pushl %esi */ *b++ = 0xe8; /* call subexpr */ /* skip first 6 bytes of subexpression code */ *(unsigned int *)b = (unsigned)eaddr + 6 - (unsigned)b - 4; b += 4; if (!incptr) *b++ = 0x5e; /* popl %esi */ *b++ = 0x85; *b++ = 0xc0; /* testl %eax,%eax*/ *b++ = 0x75; *b++ = MAGIC_OFFSET; /* jne fail */ break; case NCLASS: case CLASS: s = b; for (i = 0; i < 256; i++) { if (getbit((char *)p->data, i)) { j = i; while (getbit((char *)p->data, ++i)) ; if (i == j + 1) { /* cmp $i,%al */ /* je $xx */ *b++ = 0x3c; *b++ = j; *b++ = 0x74; *b++ = MAGIC_OFFSET; } else { /* cmp $j,%eax */ *b++ = 0x3d; *(unsigned *)b = j; b += 4; /* jl $9 */ *b++ = 0x7c; *b++ = 0x07; /* cmp $(i - 1),%eax */ *b++ = 0x3d; *(unsigned *)b = i - 1; b += 4; /* jle $xx */ *b++ = 0x7e; *b++ = MAGIC_OFFSET; } } } if (p->type == CLASS) { *b++ = 0xeb; *b++ = MAGIC_OFFSET; /* jmp fail */ fixjump(s, b - 2, b, MAGIC_OFFSET); } break; } return b; } char *compileplus(p, b) struct patrn *p; char *b; { unsigned char *jpaddr, *caddr, *saddr, *esaddr; if (p->type == STAR) { saddr = b; b = compilec(p->next, b, 0); esaddr = b; } jpaddr = b; /* foo: */ b = compilec(p->next, b, 1); *b++ = 0x56; /* pushl %esi */ *b++ = 0xe8; /* call bar: */ caddr = b; b += 4; *b++ = 0x5e; /* popl %esi */ *b++ = 0x85; *b++ = 0xc0; /* testl %eax,%eax */ *b++ = 0x75; /* jne foo */ *b++ = (unsigned)jpaddr - (unsigned)b - 1; *b++ = 0xc3; /* ret */ if (p->type == STAR) fixjump(saddr, esaddr, b, MAGIC_OFFSET); /* bar: */ /* fix call offset */ *(unsigned *)caddr = (unsigned)b - (unsigned)caddr - 4; return b; } char *compile(p) struct patrn *p; { struct patrn *q; unsigned char *b, *buf, *s; b = buf = (char *)malloc(1000); *b++ = 0x8b; *b++ = 0x74; /* movl $4(%esp),%esi */ *b++ = 0x24; *b++ = 0x04; *b++ = 0x31; *b++ = 0xc0; /* xorl %eax,%eax */ for (q = p; q->type != END; q = q->next) { switch (q->type) { case STAR: case PLUS: b = compileplus(q, b); q = q->next; break; case QUESTION: if (q->next->type == EXPR) *b++ = 0x56; /* pushl %esi */ s = b; b = compilec(q->next, b, q->next->type == EXPR); if (q->next->type == EXPR) { /* addl $4,%esp */ *b++ = 0x83; *b++ = 0xc4; *b++ = 0x04; *b++ = 0xeb; *b++ = 0x01; /* jmp $1 */ fixjump(s, b - 5, b, MAGIC_OFFSET); *b++ = 0x5e; /* popl %esi */ } else { *b++ = 0x46; /* incl %esi */ fixjump(s, b - 1, b, MAGIC_OFFSET); } q = q->next; break; case BOL: break; default: b = compilec(q, b, 1); break; } } *b++ = 0x31; *b++ = 0xc0; /* xorl %eax,%eax */ *b++ = 0xc3; /* ret */ fixjump(buf, b - 3, b, MAGIC_OFFSET); *b++ = 0x31; *b++ = 0xc0; /* xorl %eax,%eax */ *b++ = 0x40; /* incl %eax */ *b++ = 0xc3; /* ret */ return buf; } #define LINESIZ 1024 void grep(rfn, in, bol) int(*rfn)(char *); FILE *in; { static char line[LINESIZ]; int len; while (fgets(line, sizeof(line), in)) { len = strlen(line); if (line[len - 1] == '\n') line[len - 1] = '\0'; if (bol) { if (!rfn(line)) printf("%s\n", line); } else { char *p; for (p = line; *p; p++) { if (!rfn(p)) { printf("%s\n", line); break; } } } } } int main(c, v) int c; char *v[]; { struct patrn *patrn; int(*rfn)(char *); if (c == 1) { fprintf(stderr, "Usage: %s PATTERN [FILE]...\n", *v); return -1; } if ((patrn = study(&v[1], 0)) == NULL) { fprintf(stderr, "Invalid regexp\n");; return -1; } rfn = (int(*)(char *))compile(patrn); #ifdef DEBUG write(1, rfn, 1000); #else if (c == 2) { grep(rfn, stdin, patrn->type == BOL); } else { FILE *in; for(v += 2; *v; v++) { if ((in = fopen(*v, "r"))) { grep(rfn, in, patrn->type == BOL); fclose(in); } else { perror(*v); } } } #endif return 0; }