Logo Search packages:      
Sourcecode: qtads version File versions

regex.c

#ifdef RCSID
static char RCSid[] =
"$Header: d:/cvsroot/tads/TADS2/regex.c,v 1.3 1999/07/11 00:46:34 MJRoberts Exp $";
#endif

/* 
 *   Copyright (c) 1998, 2002 Michael J. Roberts.  All Rights Reserved.
 *   
 *   Please see the accompanying license file, LICENSE.TXT, for information
 *   on using and copying this software.  
 */
/*
Name
  regex.c - Regular Expression Parser and Recognizer for TADS
Function
  Parses and recognizes regular expressions
Notes
  Regular expression syntax:

     abc|def    either abc or def
     (abc)      abc
     abc+       abc, abcc, abccc, ...
     abc*       ab, abc, abcc, ...
     abc?       ab or abc
     .          any single character
     abc$       abc at the end of the line
     ^abc       abc at the beginning of the line
     %^abc      literally ^abc
     [abcx-z]   matches a, b, c, x, y, or z
     [^abcx-z]  matches any character except a, b, c, x, y, or z
     [^]-q]     matches any character except ], -, or q

  Note that using ']' or '-' in a character range expression requires
  special ordering.  If ']' is to be used, it must be the first character
  after the '^', if present, within the brackets.  If '-' is to be used,
  it must be the first character after the '^' and/or ']', if present.

  '%' is used to escape the special characters: | . ( ) * ? + ^ $ % [
  (We use '%' rather than a backslash because it's less trouble to
  enter in a TADS string -- a backslash needs to be quoted with another
  backslash, which is error-prone and hard to read.  '%' doesn't need
  any special quoting in a TADS string, which makes it a lot more
  readable.)

  In addition, '%' is used to introduce additional special sequences:

     %1         text matching first parenthesized expression
     %9         text matching ninth parenthesized experssion
     %<         matches at the beginning of a word only
     %>         matches at the end of a word only
     %w         matches any word character
     %W         matches any non-word character
     %b         matches at any word boundary (beginning or end of word)
     %B         matches except at a word boundary

  For the word matching sequences, a word is any sequence of letters and
  numbers.

Modified
  09/06/98 MJRoberts  - Creation
*/


#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "std.h"
#include "mch.h"
#include "ler.h"
#include "regex.h"


/* ------------------------------------------------------------------------ */
/*
 *   A "machine" (i.e., a finite state automaton) is a set of state
 *   transition tuples.  A tuple has three elements: the state ID, the ID
 *   of the state that we transition to, and the condition for the
 *   transition.  The condition is simply the character that we must match
 *   to make the transition, or a special distinguished symbol "epsilon,"
 *   which refers to a transition with no input character consumed.
 *   
 *   The primitive elements of our machines guarantee that we never have
 *   more than two transitions out of a particular state, so we can
 *   denormalize the representation of a state by storing the two possible
 *   tuples for that state in a single combined tuple.  This has the
 *   performance advantage that we can use the state ID as an index into
 *   an array of state tuples.
 *   
 *   A particular machine always has a single initial and single final
 *   (successful) state, so we can define a machine by its initial and
 *   final state ID's.  
 */

/* the special symbol value for "epsilon" */
#define RE_EPSILON            '\001'

/* the special symbol value for a wildcard character */
#define RE_WILDCARD           '\002'

/* special symbol values for beginning and end of text */
#define RE_TEXT_BEGIN         '\003'
#define RE_TEXT_END           '\004'

/* special symbol values for start and end of a word */
#define RE_WORD_BEGIN         '\005'
#define RE_WORD_END           '\006'

/* special symbols for word-char and non-word-char */
#define RE_WORD_CHAR          '\007'
#define RE_NON_WORD_CHAR      '\010'

/* special symbols for word-boundary and non-word-boundary */
#define RE_WORD_BOUNDARY      '\011'
#define RE_NON_WORD_BOUNDARY  '\012'

/* special symbol for a character range/exclusion range */
#define RE_RANGE              '\013'
#define RE_RANGE_EXCL         '\014'

/* a range of special symbol values for group matchers */
#define RE_GROUP_MATCH_0      '\015'
#define RE_GROUP_MATCH_9      (RE_GROUP_MATCH_0 + 9)


/* ------------------------------------------------------------------------ */
/*
 *   A machine description.  Machines are fully described by their initial
 *   and final state ID's.  
 */
typedef struct
{
    /* the machine's initial state */
    re_state_id init;

    /* the machine's final state */
    re_state_id final;
} re_machine;


/* ------------------------------------------------------------------------ */
/*
 *   Initialize the context.  The memory for the context structure itself
 *   is allocated and maintained by the caller.  
 */
void re_init(re_context *ctx, errcxdef *errctx)
{
    /* save the error context */
    ctx->errctx = errctx;
    
    /* no tuple array yet */
    ctx->tuple_arr = 0;
    ctx->tuples_alloc = 0;

    /* clear states */
    ctx->next_state = RE_STATE_FIRST_VALID;

    /* clear groups */
    ctx->cur_group = 0;

    /* no string buffer yet */
    ctx->strbuf = 0;
}

/* ------------------------------------------------------------------------ */
/*
 *   Reset compiler - clears states and tuples 
 */
static void re_reset(re_context *ctx)
{
    int i;
    
    /* delete any range tables we've allocated */
    for (i = 0 ; i < ctx->next_state ; ++i)
    {
        if (ctx->tuple_arr[i].char_range != 0)
        {
            mchfre(ctx->tuple_arr[i].char_range);
            ctx->tuple_arr[i].char_range = 0;
        }
    }

    /* clear states */
    ctx->next_state = RE_STATE_FIRST_VALID;

    /* clear groups */
    ctx->cur_group = 0;
}

/* ------------------------------------------------------------------------ */
/*
 *   Delete the context - frees structures associated with the context.
 *   Does NOT free the memory used by the context structure itself.  
 */
void re_delete(re_context *ctx)
{
    /* reset state */
    re_reset(ctx);
    
    /* if we've allocated an array, delete it */
    if (ctx->tuple_arr != 0)
    {
        mchfre(ctx->tuple_arr);
        ctx->tuple_arr = 0;
    }

    /* if we allocated a string buffer, delete it */
    if (ctx->strbuf != 0)
    {
        mchfre(ctx->strbuf);
        ctx->strbuf = 0;
    }
}

/* ------------------------------------------------------------------------ */
/*
 *   Allocate a new state ID 
 */
static re_state_id re_alloc_state(re_context *ctx)
{
    /*
     *   If we don't have enough room for another state, expand the array 
     */
    if (ctx->next_state >= ctx->tuples_alloc)
    {
        int new_alloc;
        
        /* bump the size by a bit */
        new_alloc = ctx->tuples_alloc + 100;
        
        /* allocate or expand the array */
        if (ctx->tuples_alloc == 0)
        {
            /* allocate the initial memory block */
            ctx->tuple_arr =
                (re_tuple *)mchalo(ctx->errctx,
                                   (ushort)(new_alloc * sizeof(re_tuple)),
                                   "regex");
        }
        else
        {
            re_tuple *ptr;
            
            /* allocate a new memory block */
            ptr = (re_tuple *)mchalo(ctx->errctx,
                                     (ushort)(new_alloc * sizeof(re_tuple)),
                                     "regex");
            
            /* copy the old memory to the new memory */
            memcpy(ptr, ctx->tuple_arr, ctx->tuples_alloc * sizeof(re_tuple));

            /* free the old block */
            mchfre(ctx->tuple_arr);

            /* use the new block */
            ctx->tuple_arr = ptr;
        }

        /* remember the new allocation size */
        ctx->tuples_alloc = new_alloc;
    }

    /* initialize the next state */
    ctx->tuple_arr[ctx->next_state].next_state_1 = RE_STATE_INVALID;
    ctx->tuple_arr[ctx->next_state].next_state_2 = RE_STATE_INVALID;
    ctx->tuple_arr[ctx->next_state].ch = RE_EPSILON;
    ctx->tuple_arr[ctx->next_state].flags = 0;
    ctx->tuple_arr[ctx->next_state].char_range = 0;

    /* return the new state's ID */
    return ctx->next_state++;
}


/* ------------------------------------------------------------------------ */
/*
 *   Set a transition from a state to a given destination state.  
 */
static void re_set_trans(re_context *ctx,
                         re_state_id id, re_state_id dest_id, char ch)
{
    re_tuple *tuple;
    
    /* 
     *   get the tuple containing the transitions for this state ID - the
     *   state ID is the index of the state's transition tuple in the
     *   array 
     */
    tuple = &ctx->tuple_arr[id];

    /*
     *   If the first state pointer hasn't been set yet, set it to the new
     *   destination.  Otherwise, set the second state pointer.
     *   
     *   Only set the character on setting the first state.  When setting
     *   the second state, we must assume that the character for the state
     *   has already been set, since any given state can have only one
     *   character setting.  
     */
    if (tuple->next_state_1 == RE_STATE_INVALID)
    {
        /* 
         *   set the character ID, unless the state has been marked with a
         *   special flag which indicates that the character value has
         *   another meaning (in particular, a group marker) 
         */
        if (!(tuple->flags & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END)))
            tuple->ch = ch;

        /* set the first transition */
        tuple->next_state_1 = dest_id;
    }
    else
    {
        /* set only the second transition state - don't set the character */
        tuple->next_state_2 = dest_id;
    }
}

/* ------------------------------------------------------------------------ */
/*
 *   Initialize a new machine, giving it an initial and final state 
 */
static void re_init_machine(re_context *ctx, re_machine *machine)
{
    machine->init = re_alloc_state(ctx);
    machine->final = re_alloc_state(ctx);
}

/*
 *   Build a character recognizer
 */
static void re_build_char(re_context *ctx, re_machine *machine, char ch)
{
    /* initialize our new machine */
    re_init_machine(ctx, machine);

    /* allocate a transition tuple for the new state */
    re_set_trans(ctx, machine->init, machine->final, ch);
}

/*
 *   Build a character range recognizer.  'range' is a 256-bit (32-byte)
 *   bit vector.  
 */
static void re_build_char_range(re_context *ctx, re_machine *machine,
                                unsigned char *range, int exclusion)
{
    unsigned char *range_copy;
    
    /* initialize our new machine */
    re_init_machine(ctx, machine);

    /* allocate a transition table for the new state */
    re_set_trans(ctx, machine->init, machine->final,
                 (char)(exclusion ? RE_RANGE_EXCL : RE_RANGE));

    /* allocate a copy of the range bit vector */
    range_copy = (unsigned char *)mchalo(ctx->errctx, 32, "regex range");

    /* copy the caller's range */
    memcpy(range_copy, range, 32);

    /* store it in the tuple */
    ctx->tuple_arr[machine->init].char_range = range_copy;
}
                                

/*
 *   Build a group recognizer.  This is almost the same as a character
 *   recognizer, but matches a previous group rather than a literal
 *   character. 
 */
static void re_build_group_matcher(re_context *ctx,
                                   re_machine *machine, int group_num)
{
    /* initialize our new machine */
    re_init_machine(ctx, machine);

    /* 
     *   Allocate a transition tuple for the new state, using the group ID
     *   as the character code.  Store the special code for a group
     *   recognizer rather than the normal literal character code.  
     */
    re_set_trans(ctx, machine->init, machine->final,
                 (char)(group_num + RE_GROUP_MATCH_0));
}


/*
 *   Build a concatenation recognizer 
 */
static void re_build_concat(re_context *ctx, re_machine *new_machine,
                            re_machine *lhs, re_machine *rhs)
{
    /* initialize the new machine */
    re_init_machine(ctx, new_machine);

    /* 
     *   set up an epsilon transition from the new machine's initial state
     *   to the first submachine's initial state 
     */
    re_set_trans(ctx, new_machine->init, lhs->init, RE_EPSILON);

    /*
     *   Set up an epsilon transition from the first submachine's final
     *   state to the second submachine's initial state 
     */
    re_set_trans(ctx, lhs->final, rhs->init, RE_EPSILON);

    /*
     *   Set up an epsilon transition from the second submachine's final
     *   state to our new machine's final state 
     */
    re_set_trans(ctx, rhs->final, new_machine->final, RE_EPSILON);
}

/*
 *   Build a group machine.  sub_machine contains the machine that
 *   expresses the group's contents; we'll fill in new_machine with a
 *   newly-created machine that encloses and marks the group.  
 */
static void re_build_group(re_context *ctx, re_machine *new_machine,
                           re_machine *sub_machine, int group_id)
{
    /* initialize the container machine */
    re_init_machine(ctx, new_machine);

    /* 
     *   set up an epsilon transition from the new machine's initial state
     *   into the initial state of the group, and another transition from
     *   the group's final state into the container's final state 
     */
    re_set_trans(ctx, new_machine->init, sub_machine->init, RE_EPSILON);
    re_set_trans(ctx, sub_machine->final, new_machine->final, RE_EPSILON);

    /*
     *   Mark the initial and final states of the group machine as being
     *   group markers.  
     */
    ctx->tuple_arr[new_machine->init].flags |= RE_STATE_GROUP_BEGIN;
    ctx->tuple_arr[new_machine->final].flags |= RE_STATE_GROUP_END;

    /* store the group ID in the 'ch' member of the start and end states */
    ctx->tuple_arr[new_machine->init].ch = group_id;
    ctx->tuple_arr[new_machine->final].ch = group_id;
}

/*
 *   Build an alternation recognizer 
 */
static void re_build_alter(re_context *ctx, re_machine *new_machine,
                           re_machine *lhs, re_machine *rhs)
{
    /* initialize the new machine */
    re_init_machine(ctx, new_machine);

    /*
     *   Set up an epsilon transition from our new machine's initial state
     *   to the initial state of each submachine 
     */
    re_set_trans(ctx, new_machine->init, lhs->init, RE_EPSILON);
    re_set_trans(ctx, new_machine->init, rhs->init, RE_EPSILON);

    /*
     *   Set up an epsilon transition from the final state of each
     *   submachine to our final state 
     */
    re_set_trans(ctx, lhs->final, new_machine->final, RE_EPSILON);
    re_set_trans(ctx, rhs->final, new_machine->final, RE_EPSILON);
}

/*
 *   Build a closure recognizer
 */
static void re_build_closure(re_context *ctx,
                             re_machine *new_machine, re_machine *sub,
                             char specifier)
{
    /* initialize the new machine */
    re_init_machine(ctx, new_machine);

    /* 
     *   set up an epsilon transition from our initial state to the
     *   submachine's initial state, and from the submachine's final state
     *   to our final state 
     */
    re_set_trans(ctx, new_machine->init, sub->init, RE_EPSILON);
    re_set_trans(ctx, sub->final, new_machine->final, RE_EPSILON);

    /*
     *   If this is an unbounded closure ('*' or '+', but not '?'), set up
     *   the loop transition that takes us from the new machine's final
     *   state back to its initial state.  We don't do this on the
     *   zero-or-one closure, because we can only match the expression
     *   once.  
     */
    if (specifier != '?')
        re_set_trans(ctx, sub->final, sub->init, RE_EPSILON);

    /*
     *   If this is a zero-or-one closure or a zero-or-more closure, set
     *   up an epsilon transition from our initial state to our final
     *   state, since we can skip the entire subexpression.  We don't do
     *   this on the one-or-more closure, because we can't skip the
     *   subexpression in this case.  
     */
    if (specifier != '+')
        re_set_trans(ctx, new_machine->init, new_machine->final, RE_EPSILON);
}

/*
 *   Build a null machine 
 */
static void re_build_null_machine(re_context *ctx, re_machine *machine)
{
    machine->init = machine->final = RE_STATE_INVALID;
}

/* ------------------------------------------------------------------------ */
/*
 *   Determine if a machine is null 
 */
static int re_is_machine_null(re_context *ctx, re_machine *machine)
{
    return (machine->init == RE_STATE_INVALID);
}


/* ------------------------------------------------------------------------ */
/*
 *   Concatenate the second machine onto the first machine, replacing the
 *   first machine with the resulting machine.  If the first machine is a
 *   null machine (created with re_build_null_machine), we'll simply copy
 *   the second machine into the first. 
 */
static void re_concat_onto(re_context *ctx,
                           re_machine *dest, re_machine *rhs)
{
    /* check for a null destination machine */
    if (re_is_machine_null(ctx, dest))
    {
        /* 
         *   the first machine is null - simply copy the second machine
         *   onto the first unchanged 
         */
        *dest = *rhs;
    }
    else
    {
        re_machine new_machine;
        
        /* build the concatenated machine */
        re_build_concat(ctx, &new_machine, dest, rhs);

        /* copy the concatenated machine onto the first machine */
        *dest = new_machine;
    }
}

/*
 *   Alternate the second machine onto the first machine, replacing the
 *   first machine with the resulting machine.  If the first machine is a
 *   null machine, this simply replaces the first machine with the second
 *   machine.  If the second machine is null, this simply leaves the first
 *   machine unchanged. 
 */
static void re_alternate_onto(re_context *ctx,
                              re_machine *dest, re_machine *rhs)
{
    /* check to see if the first machine is null */
    if (re_is_machine_null(ctx, dest))
    {
        /* 
         *   the first machine is null - simply copy the second machine
         *   onto the first 
         */
        *dest = *rhs;
    }
    else
    {
        /* 
         *   if the second machine is null, don't do anything; otherwise,
         *   build the alternation 
         */
        if (!re_is_machine_null(ctx, rhs))
        {
            re_machine new_machine;
            
            /* build the alternation */
            re_build_alter(ctx, &new_machine, dest, rhs);

            /* replace the first machine with the alternation */
            *dest = new_machine;
        }
    }
}

/* ------------------------------------------------------------------------ */
/*
 *   Set a bit in a bit vector.
 */
#define re_set_bit(set, bit) \
    (((unsigned char *)(set))[(bit) >> 3] |= (1 << ((bit) & 7)))

/*
 *   Test a bit in a bit vector 
 */
#define re_is_bit_set(set, bit) \
    ((((unsigned char *)(set))[(bit) >> 3] & (1 << ((bit) & 7))) != 0)

/* ------------------------------------------------------------------------ */
/*
 *   Compile an expression 
 */
static re_status_t re_compile(re_context *ctx,
                              const char *expr, size_t exprlen,
                              re_machine *result_machine)
{
    re_machine cur_machine;
    re_machine alter_machine;
    re_machine new_machine;
    size_t group_stack_level;
    struct
    {
        re_machine old_cur;
        re_machine old_alter;
        int group_id;
    } group_stack[50];

    /* reset everything */
    re_reset(ctx);

    /* start out with no current machine and no alternate machine */
    re_build_null_machine(ctx, &cur_machine);
    re_build_null_machine(ctx, &alter_machine);

    /* nothing on the stack yet */
    group_stack_level = 0;

    /* loop until we run out of expression to parse */
    for ( ; exprlen != 0 ; ++expr, --exprlen)
    {
        switch(*expr)
        {
        case '^':
            /*
             *   beginning of line - if we're not at the beginning of the
             *   current expression (i.e., we already have some
             *   concatentations accumulated), treat it as an ordinary
             *   character 
             */
            if (!re_is_machine_null(ctx, &cur_machine))
                goto normal_char;

            /* build a new start-of-text recognizer */
            re_build_char(ctx, &new_machine, RE_TEXT_BEGIN);

            /* 
             *   concatenate it onto the string - note that this can't
             *   have any postfix operators 
             */
            re_concat_onto(ctx, &cur_machine, &new_machine);
            break;

        case '$':
            /*
             *   End of line specifier - if there's anything left after
             *   the '$' other than a close parens or alternation
             *   specifier, great it as a normal character 
             */
            if (exprlen > 1
                && (*(expr+1) != ')' && *(expr+1) != '|'))
                goto normal_char;

            /* build a new end-of-text recognizer */
            re_build_char(ctx, &new_machine, RE_TEXT_END);

            /* 
             *   concatenate it onto the string - note that this can't
             *   have any postfix operators 
             */
            re_concat_onto(ctx, &cur_machine, &new_machine);
            break;
            
        case '(':
            /* 
             *   Add a nesting level.  Push the current machine and
             *   alternate machines onto the group stack, and clear
             *   everything out for the new group. 
             */
            if (group_stack_level
                > sizeof(group_stack)/sizeof(group_stack[0]))
            {
                /* we cannot proceed - return an error */
                return RE_STATUS_GROUP_NESTING_TOO_DEEP;
            }

            /* save the current state on the stack */
            group_stack[group_stack_level].old_cur = cur_machine;
            group_stack[group_stack_level].old_alter = alter_machine;

            /* 
             *   Assign the group a group ID - groups are numbered in
             *   order of their opening (left) parentheses, so we want to
             *   assign a group number now.  We won't actually need to
             *   know the group number until we get to the matching close
             *   paren, but we need to assign it now, so store it in the
             *   group stack. 
             */
            group_stack[group_stack_level].group_id = ctx->cur_group;

            /* consume the group number */
            ctx->cur_group++;

            /* push the level */
            ++group_stack_level;

            /* start the new group with empty machines */
            re_build_null_machine(ctx, &cur_machine);
            re_build_null_machine(ctx, &alter_machine);
            break;

        case ')':
            /* if there's nothing on the stack, ignore this */
            if (group_stack_level == 0)
                break;

            /* take a level off the stack */
            --group_stack_level;

            /* 
             *   Remove a nesting level.  If we have a pending alternate
             *   expression, build the alternation expression.  This will
             *   leave the entire group expression in alter_machine,
             *   regardless of whether an alternation was in progress or
             *   not.  
             */
            re_alternate_onto(ctx, &alter_machine, &cur_machine);

            /*
             *   Create a group machine that encloses the group and marks
             *   it with a group number.  We assigned the group number
             *   when we parsed the open paren, so read that group number
             *   from the stack.
             *   
             *   Note that this will leave 'new_machine' with the entire
             *   group machine.  
             */
            re_build_group(ctx, &new_machine, &alter_machine,
                           group_stack[group_stack_level].group_id);

            /*
             *   Pop the stack - restore the alternation and current
             *   machines that were in progress before the group started. 
             */
            cur_machine = group_stack[group_stack_level].old_cur;
            alter_machine = group_stack[group_stack_level].old_alter;

            /*
             *   Check the group expression (in new_machine) for postfix
             *   expressions 
             */
            goto apply_postfix;

        case '|':
            /* 
             *   Start a new alternation.  This ends the current
             *   alternation; if we have a previous pending alternate,
             *   build an alternation machine out of the previous
             *   alternate and the current machine and move that to the
             *   alternate; otherwise, simply move the current machine to
             *   the pending alternate. 
             */
            re_alternate_onto(ctx, &alter_machine, &cur_machine);

            /* 
             *   the alternation starts out with a blank slate, so null
             *   out the current machine 
             */
            re_build_null_machine(ctx, &cur_machine);
            break;

        case '%':
            /* 
             *   quoted character - skip the quote mark and see what we
             *   have 
             */
            ++expr;
            --exprlen;

            /* check to see if we're at the end of the expression */
            if (exprlen == 0)
            {
                /* 
                 *   end of the string - ignore it, but undo the extra
                 *   increment of the expression index so that we exit the
                 *   enclosing loop properly 
                 */
                --expr;
                ++exprlen;
                break;
            }

            /* see what we have */
            switch(*expr)
            {
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                /* group match - build a new literal group recognizer */
                re_build_group_matcher(ctx, &new_machine, (int)(*expr - '1'));

                /* apply any postfix expression to the group recognizer */
                goto apply_postfix;

            case '<':
                /* build a beginning-of-word recognizer */
                re_build_char(ctx, &new_machine, RE_WORD_BEGIN);

                /* it can't be postfixed - just concatenate it */
                re_concat_onto(ctx, &cur_machine, &new_machine);
                break;

            case '>':
                /* build an end-of-word recognizer */
                re_build_char(ctx, &new_machine, RE_WORD_END);

                /* it can't be postfixed - just concatenate it */
                re_concat_onto(ctx, &cur_machine, &new_machine);
                break;

            case 'w':
                /* word character */
                re_build_char(ctx, &new_machine, RE_WORD_CHAR);
                goto apply_postfix;

            case 'W':
                /* non-word character */
                re_build_char(ctx, &new_machine, RE_NON_WORD_CHAR);
                goto apply_postfix;

            case 'b':
                /* word boundary */
                re_build_char(ctx, &new_machine, RE_WORD_BOUNDARY);

                /* it can't be postfixed */
                re_concat_onto(ctx, &cur_machine, &new_machine);
                break;

            case 'B':
                /* not a word boundary */
                re_build_char(ctx, &new_machine, RE_NON_WORD_BOUNDARY);

                /* it can't be postfixed */
                re_concat_onto(ctx, &cur_machine, &new_machine);
                break;

            default:
                /* build a new literal character recognizer */
                re_build_char(ctx, &new_machine, *expr);

                /* apply any postfix expression to the character */
                goto apply_postfix;
            }
            break;

        case '.':
            /* 
             *   wildcard character - build a single character recognizer
             *   for the special wildcard symbol, then go check it for a
             *   postfix operator 
             */
            re_build_char(ctx, &new_machine, RE_WILDCARD);
            goto apply_postfix;
            break;

        case '[':
            /* range expression */
            {
                int is_exclusive = FALSE;
                unsigned char set[32];

                /* clear out the set of characters in the range */
                memset(set, 0, sizeof(set));

                /* first, skip the open bracket */
                ++expr;
                --exprlen;

                /* check to see if starts with the exclusion character */
                if (exprlen != 0 && *expr == '^')
                {
                    /* skip the exclusion specifier */
                    ++expr;
                    --exprlen;

                    /* note it */
                    is_exclusive = TRUE;
                }

                /* 
                 *   if the first character is a ']', include it in the
                 *   range 
                 */
                if (exprlen != 0 && *expr == ']')
                {
                    re_set_bit(set, (int)']');
                    ++expr;
                    --exprlen;
                }

                /*
                 *   if the next character is a '-', include it in the
                 *   range 
                 */
                if (exprlen != 0 && *expr == '-')
                {
                    re_set_bit(set, (int)'-');
                    ++expr;
                    --exprlen;
                }

                /* scan the character set */
                while (exprlen != 0 && *expr != ']')
                {
                    int ch;
                    
                    /* note this character */
                    ch = (int)(unsigned char)*expr;

                    /* set it */
                    re_set_bit(set, ch);

                    /* skip this character of the expression */
                    ++expr;
                    --exprlen;

                    /* check for a range */
                    if (exprlen != 0 && *expr == '-')
                    {
                        int ch2;
                        
                        /* skip the '-' */
                        ++expr;
                        --exprlen;
                        if (exprlen != 0)
                        {
                            /* get the other end of the range */
                            ch2 = (int)(unsigned char)*expr;

                            /* skip the second character */
                            ++expr;
                            --exprlen;

                            /* if the range is reversed, swap it */
                            if (ch > ch2)
                            {
                                int tmp = ch;
                                ch = ch2;
                                ch2 = tmp;
                            }

                            /* fill in the range */
                            for ( ; ch <= ch2 ; ++ch)
                                re_set_bit(set, ch);
                        }
                    }
                }

                /* create a character range machine */
                re_build_char_range(ctx, &new_machine, set, is_exclusive);

                /* apply any postfix operator */
                goto apply_postfix;
            }            
            break;

        default:
        normal_char:
            /* 
             *   it's an ordinary character - build a single character
             *   recognizer machine, and then concatenate it onto any
             *   existing machine 
             */
            re_build_char(ctx, &new_machine, *expr);

        apply_postfix:
            /*
             *   Check for a postfix operator, and apply it to the machine
             *   in 'new_machine' if present.  In any case, concatenate
             *   the 'new_machine' (modified by a postix operator or not)
             *   to the current machien.  
             */
            if (exprlen > 1)
            {
                switch(*(expr+1))
                {
                case '*':
                case '+':
                case '?':
                    /*
                     *   We have a postfix closure operator.  Build a new
                     *   closure machine out of 'new_machine'.  
                     */
                    {
                        re_machine closure_machine;
                        
                        /* move onto the closure operator */
                        ++expr;
                        --exprlen;
                        
                        /* build the closure machine */
                        re_build_closure(ctx, &closure_machine,
                                         &new_machine, *expr);
                        
                        /* replace the original machine with the closure */
                        new_machine = closure_machine;
                        
                        /* 
                         *   skip any redundant closure symbols, keeping
                         *   only the first one we saw 
                         */
                        while (exprlen > 1 && (*(expr+1) == '?'
                                               || *(expr+1) == '+'
                                               || *(expr+1) == '*'))
                        {
                            ++expr;
                            --exprlen;
                        }
                    }
                    break;
                    
                default:
                    /* no postfix operator */
                    break;
                }
            }

            /*
             *   Concatenate the new machine onto the current machine
             *   under construction.  
             */
            re_concat_onto(ctx, &cur_machine, &new_machine);
            break;
        }
    }

    /* complete any pending alternation */
    re_alternate_onto(ctx, &alter_machine, &cur_machine);

    /* store the resulting machine in the caller's machine descriptor */
    *result_machine = alter_machine;

    /* no errors encountered */
    return RE_STATUS_SUCCESS;
}


/* ------------------------------------------------------------------------ */
/*
 *   Pattern recognizer 
 */

/*
 *   Note a group position if appropriate 
 */
static void re_note_group(re_context *ctx, re_group_register *regs,
                          re_state_id id, const char *p)
{
    int group_index;

    /*
     *   Check to see if this is a valid state and it's a group marker -
     *   if not, there's nothing to do 
     */
    if (id == RE_STATE_INVALID
        || !(ctx->tuple_arr[id].flags
             & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END))
        || (group_index = (int)ctx->tuple_arr[id].ch) >= RE_GROUP_REG_CNT)
        return;

    /*
     *   It's a valid group marker - note the appropriate register value 
     */
    if ((ctx->tuple_arr[id].flags & RE_STATE_GROUP_BEGIN) != 0)
        regs[group_index].start_ofs = p;
    else
        regs[group_index].end_ofs = p;
}

/*
 *   Determine if a character is part of a word.  We consider letters and
 *   numbers to be word characters.  
 */
static int re_is_word_char(char c)
{
    return (isalpha((unsigned char)c) || isdigit((unsigned char)c));
}

/*
 *   Match a string to a compiled expression.  Returns the length of the
 *   match if successful, or -1 if no match was found.  
 */
static int re_match(re_context *ctx, const char *entire_str,
                    const char *str, size_t origlen,
                    const re_machine *machine, re_group_register *regs)
{
    re_state_id cur_state;
    const char *p;
    size_t curlen;

    /* start at the machine's initial state */
    cur_state = machine->init;

    /* start at the beginning of the string */
    p = str;
    curlen = origlen;

    /* note any group involved in the initial state */
    re_note_group(ctx, regs, cur_state, p);

    /* 
     *   if we're starting in the final state, immediately return success
     *   with a zero-length match 
     */
    if (cur_state == machine->final)
    {
        /* return success with a zero-length match */
        return 0;
    }

    /* run the machine */
    for (;;)
    {
        re_tuple *tuple;

        /* get the tuple for this state */
        tuple = &ctx->tuple_arr[cur_state];

        /* if this is a group state, adjust the group registers */
        re_note_group(ctx, regs, cur_state, p);
        
        /* see what kind of state we're in */
        if (!(tuple->flags & (RE_STATE_GROUP_BEGIN | RE_STATE_GROUP_END))
            && tuple->ch != RE_EPSILON)
        {
            /*
             *   This is a character or group recognizer state.  If we
             *   match the character or group, continue on to the next
             *   state; otherwise, return failure.
             */
            switch(tuple->ch)
            {
            case RE_GROUP_MATCH_0:
            case RE_GROUP_MATCH_0 + 1:
            case RE_GROUP_MATCH_0 + 2:
            case RE_GROUP_MATCH_0 + 3:
            case RE_GROUP_MATCH_0 + 4:
            case RE_GROUP_MATCH_0 + 5:
            case RE_GROUP_MATCH_0 + 6:
            case RE_GROUP_MATCH_0 + 7:
            case RE_GROUP_MATCH_0 + 8:
            case RE_GROUP_MATCH_0 + 9:
                {
                    int group_num;
                    re_group_register *group_reg;
                    size_t reg_len;
                    
                    /* it's a group - get the group number */
                    group_num = tuple->ch - RE_GROUP_MATCH_0;
                    group_reg = &regs[group_num];
                    
                    /* 
                     *   if this register isn't defined, there's nothing
                     *   to match, so fail 
                     */
                    if (group_reg->start_ofs == 0 || group_reg->end_ofs == 0)
                        return -1;
                    
                    /* calculate the length of the register value */
                    reg_len = group_reg->end_ofs - group_reg->start_ofs;
                    
                    /* if we don't have enough left to match, it fails */
                    if (curlen < reg_len)
                        return -1;
                    
                    /* if the string doesn't match exactly, we fail */
                    if (memcmp(p, group_reg->start_ofs, reg_len) != 0)
                        return -1;
                    
                    /*
                     *   It matches exactly - skip the entire length of
                     *   the register in the source string 
                     */
                    p += reg_len;
                    curlen -= reg_len;
                }
                break;

            case RE_TEXT_BEGIN:
                /* 
                 *   Match only the exact beginning of the string - if
                 *   we're anywhere else, this isn't a match.  If this
                 *   succeeds, we don't skip any characters.  
                 */
                if (p != entire_str)
                    return -1;
                break;

            case RE_TEXT_END:
                /*
                 *   Match only the exact end of the string - if we're
                 *   anywhere else, this isn't a match.  Don't skip any
                 *   characters on success.  
                 */
                if (curlen != 0)
                    return -1;
                break;

            case RE_WORD_BEGIN:
                /* 
                 *   if the previous character is a word character, we're
                 *   not at the beginning of a word 
                 */
                if (p != entire_str && re_is_word_char(*(p-1)))
                    return -1;

                /* 
                 *   if we're at the end of the string, or the current
                 *   character isn't the start of a word, we're not at the
                 *   beginning of a word 
                 */
                if (curlen == 0 || !re_is_word_char(*p))
                    return -1;
                break;

            case RE_WORD_END:
                /*
                 *   if the current character is a word character, we're not
                 *   at the end of a word 
                 */
                if (curlen != 0 && re_is_word_char(*p))
                    return -1;

                /*
                 *   if we're at the beginning of the string, or the
                 *   previous character is not a word character, we're not
                 *   at the end of a word 
                 */
                if (p == entire_str || !re_is_word_char(*(p-1)))
                    return -1;
                break;

            case RE_WORD_CHAR:
                /* if it's not a word character, it's a failure */
                if (curlen == 0 || !re_is_word_char(*p))
                    return -1;

                /* skip this character of input */
                ++p;
                --curlen;
                break;

            case RE_NON_WORD_CHAR:
                /* if it's a word character, it's a failure */
                if (curlen == 0 || re_is_word_char(*p))
                    return -1;

                /* skip the input */
                ++p;
                --curlen;
                break;

            case RE_WORD_BOUNDARY:
            case RE_NON_WORD_BOUNDARY:
                {
                    int prev_is_word;
                    int next_is_word;
                    int boundary;

                    /*
                     *   Determine if the previous character is a word
                     *   character -- if we're at the beginning of the
                     *   string, it's obviously not, otherwise check its
                     *   classification 
                     */
                    prev_is_word = (p != entire_str
                                    && re_is_word_char(*(p-1)));

                    /* make the same check for the current character */
                    next_is_word = (curlen != 0
                                    && re_is_word_char(*p));

                    /*
                     *   Determine if this is a boundary - it is if the
                     *   two states are different 
                     */
                    boundary = ((prev_is_word != 0) ^ (next_is_word != 0));

                    /* 
                     *   make sure it matches what was desired, and return
                     *   failure if not 
                     */
                    if ((tuple->ch == RE_WORD_BOUNDARY && !boundary)
                        || (tuple->ch == RE_NON_WORD_BOUNDARY && boundary))
                        return -1;
                }
                break;

            case RE_WILDCARD:
                /* make sure we have a character to match */
                if (curlen == 0)
                    return -1;

                /* skip this character */
                ++p;
                --curlen;
                break;

            case RE_RANGE:
            case RE_RANGE_EXCL:
                {
                    int match;
                    
                    /* make sure we have a character to match */
                    if (curlen == 0)
                        return -1;
                    
                    /* see if we match */
                    match = re_is_bit_set(tuple->char_range,
                                          (int)(unsigned char)*p);
                    
                    /* make sure we got what we wanted */
                    if ((tuple->ch == RE_RANGE && !match)
                        || (tuple->ch == RE_RANGE_EXCL && match))
                        return -1;

                    /* skip this character of the input */
                    ++p;
                    --curlen;
                }
                break;

            default:
                /* make sure we have an exact match */
                if (curlen == 0 || tuple->ch != *p)
                    return -1;

                /* skip this character of the input */
                ++p;
                --curlen;
                break;
            }

            /* 
             *   if we got this far, we were successful - move on to the
             *   next state 
             */
            cur_state = tuple->next_state_1;
        }
        else if (tuple->next_state_2 == RE_STATE_INVALID)
        {
            /*
             *   We have only one transition, so this state is entirely
             *   deterministic.  Simply move on to the next state. 
             */
            cur_state = tuple->next_state_1;
        }
        else
        {
            re_machine sub_machine;
            re_group_register regs1[RE_GROUP_REG_CNT];
            re_group_register regs2[RE_GROUP_REG_CNT];
            int ret1;
            int ret2;
            
            /*
             *   This state has two possible transitions, and we don't
             *   know which one to take.  So, try both, see which one
             *   works better, and return the result.  Try the first
             *   transition first.  Note that each separate attempt must
             *   use a separate copy of the registers.  
             */
            memcpy(regs1, regs, sizeof(regs1));
            sub_machine.init = tuple->next_state_1;
            sub_machine.final = machine->final;
            ret1 = re_match(ctx, entire_str, p, curlen, &sub_machine, regs1);

            /*
             *   Now try the second transition 
             */
            memcpy(regs2, regs, sizeof(regs2));
            sub_machine.init = tuple->next_state_2;
            sub_machine.final = machine->final;
            ret2 = re_match(ctx, entire_str, p, curlen, &sub_machine, regs2);

            /*
             *   If they both failed, the whole thing failed.  Otherwise,
             *   return the longer of the two, plus the length we
             *   ourselves matched previously.  Note that we return the
             *   register set from the winning match.  
             */
            if (ret1 < 0 && ret2 < 0)
            {
                /* they both failed */
                return -1;
            }
            else if (ret1 > ret2)
            {
                /* use the first register set and result length */
                memcpy(regs, regs1, sizeof(regs1));
                return ret1 + (p - str);
            }
            else
            {
                /* use the second register set and result length */
                memcpy(regs, regs2, sizeof(regs2));
                return ret2 + (p - str);
            }
        }

        /*
         *   If we're in the final state, return success 
         */
        if (cur_state == machine->final)
        {
            /* finish off any group involved in the final state */
            re_note_group(ctx, regs, cur_state, p);
            
            /* return the length we matched */
            return p - str;
        }
    }
}

/* ------------------------------------------------------------------------ */
/*
 *   Search for a regular expression within a string.  Returns -1 if the
 *   string cannot be found, otherwise returns the offset from the start
 *   of the string to be searched of the start of the first match for the
 *   pattern.  
 */
static int re_search(re_context *ctx, const char *str, size_t len,
                     const re_machine *machine, re_group_register *regs,
                     int *result_len)
{
    int ofs;
    
    /*
     *   Starting at the first character in the string, search for the
     *   pattern at each subsequent character until we either find the
     *   pattern or run out of string to test. 
     */
    for (ofs = 0 ; ofs < (int)len ; ++ofs)
    {
        int matchlen;
        
        /* check for a match */
        matchlen = re_match(ctx, str, str + ofs, len - ofs,
                            machine, regs);
        if (matchlen >= 0)
        {
            /* we found a match here - return the length and offset */
            *result_len = matchlen;
            return ofs;
        }
    }

    /* we didn't find a match */
    return -1;
}

/* ------------------------------------------------------------------------ */
/*
 *   Make a copy of a search string in our private buffer.
 */
static void re_save_search_str(re_context *ctx, const char *str, size_t len)
{
    /* if the string is empty, this is easy */
    if (len == 0)
    {
        /* nothing to store - just save the length and return */
        ctx->curlen = 0;
        return;
    }
    
    /* if the current buffer isn't big enough, allocate a new one */
    if (ctx->strbuf == 0 || ctx->strbufsiz < len)
    {
        /* 
         *   free any previous buffer - its contents are no longer
         *   important, since we're about to overwrite it with a new
         *   string 
         */
        if (ctx->strbuf != 0)
            mchfre(ctx->strbuf);

        /* 
         *   allocate a new buffer; round up to the next 256-byte
         *   increment to make sure we're not constantly reallocating to
         *   random sizes 
         */
        ctx->strbufsiz = ((len + 255) & ~255);

        /* allocate it */
        ctx->strbuf = (char *)mchalo(ctx->errctx, (ushort)ctx->strbufsiz,
                                     "regex str");
    }

    /* copy the string */
    memcpy(ctx->strbuf, str, len);

    /* save the length */
    ctx->curlen = len;
}

/* ------------------------------------------------------------------------ */
/*
 *   Compile an expression and search for a match within the given string.
 *   Returns the offset of the match, or -1 if no match was found.  
 */
int re_compile_and_search(re_context *ctx,
                          const char *pattern, size_t patlen,
                          const char *searchstr, size_t searchlen,
                          int *result_len)
{
    re_machine machine;
    
    /* compile the expression - return failure if we get an error */
    if (re_compile(ctx, pattern, patlen, &machine) != RE_STATUS_SUCCESS)
        return -1;

    /* save the search string in our internal buffer */
    re_save_search_str(ctx, searchstr, searchlen);

    /* clear the group registers */
    memset(ctx->regs, 0, sizeof(ctx->regs));

    /* 
     *   search for the pattern in our copy of the string - use the copy
     *   so that the group registers stay valid even if the caller
     *   deallocates the original string after we return
     */
    return re_search(ctx, ctx->strbuf, ctx->curlen, &machine,
                     ctx->regs, result_len);
}

/* ------------------------------------------------------------------------ */
/*
 *   Compile an expression and check for a match.  Returns the length of
 *   the match if we found a match, -1 if we found no match.  This is not
 *   a search function; we merely match the leading substring of the given
 *   string to the given pattern.  
 */
int re_compile_and_match(re_context *ctx,
                         const char *pattern, size_t patlen,
                         const char *searchstr, size_t searchlen)
{
    re_machine machine;

    /* compile the expression - return failure if we get an error */
    if (re_compile(ctx, pattern, patlen, &machine) != RE_STATUS_SUCCESS)
        return FALSE;

    /* save the search string in our internal buffer */
    re_save_search_str(ctx, searchstr, searchlen);

    /* clear the group registers */
    memset(ctx->regs, 0, sizeof(ctx->regs));

    /* match the string */
    return re_match(ctx, ctx->strbuf, ctx->strbuf, ctx->curlen,
                    &machine, ctx->regs);
}


Generated by  Doxygen 1.6.0   Back to index