? grep.cat1
Index: grep.c
===================================================================
RCS file: /cvsroot/othersrc/usr.bin/grep/grep.c,v
retrieving revision 1.43
diff -u -u -r1.43 grep.c
--- grep.c	8 Nov 2003 20:10:08 -0000	1.43
+++ grep.c	8 Nov 2003 21:39:19 -0000
@@ -64,6 +64,8 @@
 char **pattern;
 regex_t	*r_pattern;
 
+fastgrep_t *fg_pattern;
+
 /* For regex errors  */
 char re_error[RE_ERROR_BUF + 1];
 
@@ -80,6 +82,7 @@
 int bflag;		/* -b: show block numbers for each match */
 int cflag;		/* -c: only show a count of matching lines */
 int hflag;		/* -h: Never print filenames. -H overrides */
+int iflag;		/* -i: Ignore case */
 int lflag;		/* -l: only show names of files with matches */
 int mflag;		/* -m: specify maximum line matches (per file) */
 int nflag;		/* -n: show line numbers in front of matching lines */
@@ -124,6 +127,8 @@
 /* Housekeeping */
 int first;		/* flag whether or not this is our first match */
 int tail;		/* lines left to print */
+int boleol;		/* At least one pattern has a BOL or EOL */
+size_t maxPatternLen = 0;	/* Longest length of all patterns */
 
 static void
 usage(void)
@@ -201,6 +206,9 @@
 	strncpy(pattern[patterns], pat, len);
 	pattern[patterns][len] = '\0';
 	++patterns;
+
+	if (len > maxPatternLen)
+		maxPatternLen = len;
 }
 
 static void
@@ -231,8 +239,26 @@
 	fclose(f);
 }
 
+static void
+free_patterns(void) 
+{
+	int i;
+
+	for (i = 0; i < patterns; i++) {
+		if (fg_pattern[i].pattern)
+			free(fg_pattern[i].pattern);
+		else
+			regfree(&r_pattern[i]);
+		free(pattern[i]);
+	}
+	free(fg_pattern);
+	free(r_pattern);
+	free(pattern);
+}
+
 static int
-check_context_arg(char const *str) {
+check_context_arg(char const *str) 
+{
 	char *ep;
 	long lval;
 
@@ -414,6 +440,7 @@
 			break;
 		case 'i':
 		case 'y':
+			iflag = 1;
 			cflags |= REG_ICASE;
 			break;
 		case 'l':
@@ -529,11 +556,19 @@
 		cflags |= REG_EXTENDED;
 	else if (Fflag)
 		cflags |= REG_NOSPEC;
+	
+	fg_pattern = grep_malloc(patterns * sizeof(*fg_pattern));
 	r_pattern = grep_malloc(patterns * sizeof(*r_pattern));
+
 	for (i = 0; i < patterns; ++i) {
-		if ((c = regcomp(&r_pattern[i], pattern[i], cflags))) {
-			regerror(c, &r_pattern[i], re_error, RE_ERROR_BUF);
-			errx(2, "%s", re_error);
+		/* Check if fast algorithm is allowed */
+		if (fastcomp(&fg_pattern[i], pattern[i])) {
+			/* Use regex library */
+			if ((c = regcomp(&r_pattern[i], pattern[i], cflags))) {
+				regerror(c, &r_pattern[i], re_error, 
+					RE_ERROR_BUF);
+				errx(2, "%s", re_error);
+			}
 		}
 	}
 
@@ -552,6 +587,8 @@
 	else
 		for (c = 0; argc--; ++argv)
 			c += procfile(*argv);
+
+	free_patterns();
 
 	exit(!c);
 }
Index: grep.h
===================================================================
RCS file: /cvsroot/othersrc/usr.bin/grep/grep.h,v
retrieving revision 1.25
diff -u -u -r1.25 grep.h
--- grep.h	8 Nov 2003 17:24:07 -0000	1.25
+++ grep.h	8 Nov 2003 21:39:20 -0000
@@ -28,11 +28,12 @@
 
 #include <sys/types.h>
 
+#include <limits.h>
 #include <regex.h>
 #include <stdio.h>
 #include <zlib.h>
 
-#define VERSION "20031108"
+#define VERSION "20031108a"
 
 #define	GREP_READ	0
 #define GREP_SKIP	1
@@ -56,11 +57,21 @@
 	char *dat;
 } str_t;
 
+typedef struct {
+	unsigned char *pattern;
+	int patternLen;
+	int qsBc[UCHAR_MAX + 1];
+	/* flags */
+	int bol;
+	int eol;
+	int reversedSearch;
+} fastgrep_t;
+
 /* Flags passed to regcomp() and regexec() */
 extern int cflags, eflags;
 
 /* Command line flags */
-extern int Aflag, Bflag, Fflag, Lflag, bflag, cflag, lflag, mflag, 
+extern int Aflag, Bflag, Fflag, Lflag, bflag, cflag, iflag, lflag, mflag, 
 	nflag, oflag, qflag, sflag, vflag, wflag, xflag;
 
 extern int colours;
@@ -77,10 +88,14 @@
 
 extern int binbehave, dirbehave, devbehave; 
 /* extern int linkbehave; */
+extern int boleol;
+extern size_t maxPatternLen;
 extern char *stdin_label;
 
 extern int first, matchall, patterns, tail;
 extern char **pattern;
+
+extern fastgrep_t *fg_pattern;
 extern regex_t *r_pattern;
 
 /* 
@@ -96,7 +111,9 @@
 
 void *grep_malloc(size_t size);
 void *grep_realloc(void *ptr, size_t size);
+unsigned char *grep_strdup(const char *);
 void printline(str_t *line, int sep, regmatch_t *matches, int m);
+int fastcomp(fastgrep_t *, const char *);
 
 /* queue.c */
 void initqueue(void);
Index: mmfile.c
===================================================================
RCS file: /cvsroot/othersrc/usr.bin/grep/mmfile.c,v
retrieving revision 1.5
diff -u -u -r1.5 mmfile.c
--- mmfile.c	8 Nov 2003 20:11:19 -0000	1.5
+++ mmfile.c	8 Nov 2003 21:39:20 -0000
@@ -44,6 +44,7 @@
 #include "grep.h"
 
 #define MAX_MAP_LEN 1048576
+#define BLOCKSIZE 32768
 
 mmf_t *
 mmopen(char *fn, char *mode)
@@ -94,9 +95,26 @@
 
 	if (mmf->ptr >= mmf->end)
 		return NULL;
-	for (p = mmf->ptr; mmf->ptr < mmf->end; ++mmf->ptr)
-		if (*mmf->ptr == line_endchar)
-			break;
+	
+	if ((lflag || qflag) && !boleol) {
+		/* Find starting point to search */
+		if (mmf->ptr == mmf->base)
+			p = mmf->ptr;
+		else
+			p = mmf->ptr - maxPatternLen;
+
+		/* Move the start pointer ahead for next iteration */
+		if (mmf->end - mmf->ptr > BLOCKSIZE)
+			mmf->ptr += BLOCKSIZE;
+		else
+			mmf->ptr = mmf->end;
+	} else {
+
+		for (p = mmf->ptr; mmf->ptr < mmf->end; ++mmf->ptr)
+			if (*mmf->ptr == line_endchar)
+				break;
+	}
+
 	*l = mmf->ptr - p;
 	++mmf->ptr;
 	return p;
Index: util.c
===================================================================
RCS file: /cvsroot/othersrc/usr.bin/grep/util.c,v
retrieving revision 1.29
diff -u -u -r1.29 util.c
--- util.c	8 Nov 2003 17:24:07 -0000	1.29
+++ util.c	8 Nov 2003 21:39:21 -0000
@@ -54,6 +54,9 @@
 
 static int linesqueued, newfile;
 static int procline(str_t *l, int nottext);
+static int grep_search(fastgrep_t *, unsigned char *, int, regmatch_t *);
+static int grep_cmp(const unsigned char *, const unsigned char *, size_t);
+static void grep_revstr(unsigned char *, int);
 
 int 
 grep_tree(char **argv)
@@ -223,7 +226,14 @@
 		pmatch.rm_so = st;
 		pmatch.rm_eo = l->len;
 		for (i = 0; i < patterns; i++) {
-			r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
+			if (fg_pattern[i].pattern)
+				r = grep_search(&fg_pattern[i],
+						(unsigned char *)l->dat,
+						l->len, &pmatch);
+			else 
+				r = regexec(&r_pattern[i], l->dat, 1, &pmatch, 
+						eflags);
+
 			if (r == REG_NOMATCH && t == 0)
 				continue;
 			if (r == 0) {
@@ -286,6 +296,212 @@
 	return c;
 }
 
+int
+fastcomp(fastgrep_t *fg, const char *fast_pattern)
+{
+	int i;
+	int bol = 0;
+	int eol = 0;
+	int origPatternLen;
+	int shiftPatternLen;
+	int hasDot = 0;
+	int firstHalfDot = -1;
+	int firstLastHalfDot = -1;
+	int lastHalfDot = 0;
+
+	if (Fflag) {
+		fg->pattern = NULL;
+		return (-1);
+	}
+
+	/* Initialize. */
+	origPatternLen = fg->patternLen = strlen(fast_pattern);
+	fg->bol = 0;
+	fg->eol = 0;
+	fg->reversedSearch = 0;
+
+	/* Remove end-of-line character ('$'). */
+	if (fast_pattern[fg->patternLen - 1] == '$') {
+		eol++;
+		fg->eol = 1;
+		fg->patternLen--;
+		boleol = 1;
+	}
+
+	/* Remove beginning-of-line character ('^'). */
+	if (fast_pattern[0] == '^') {
+		bol++;
+		fg->bol = 1;
+		fg->patternLen--;
+		boleol = 1;
+	}
+
+	/*
+	 * Copy pattern minus '^' and '$' characters at the beginning and
+	 * ending of the string respectively.
+	 */
+	fg->pattern = grep_strdup(fast_pattern + bol);
+
+	/* Look for ways to cheat...er...avoid the full regex engine. */
+	for (i = 0; i < fg->patternLen; i++)
+	{
+		/* Can still cheat? */
+		if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) ||
+		    (fg->pattern[i] == '_') || (fg->pattern[i] == ',') ||
+		    (fg->pattern[i] == '^') || (fg->pattern[i] == '$') ||
+		    (fg->pattern[i] == '=') || (fg->pattern[i] == '-') ||
+		    (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) {
+			/* As long as it is good, upper case it for later. */
+			if (iflag)
+				fg->pattern[i] = toupper(fg->pattern[i]);
+		} else if (fg->pattern[i] == '.') {
+			hasDot = i;
+			if (i < fg->patternLen / 2) {
+				if (firstHalfDot < -1)
+					/* Closest dot to the beginning */
+					firstHalfDot = i;
+			} else {
+				/* Closest dot to the end of the pattern. */
+				lastHalfDot = i;
+				if (firstLastHalfDot < 0)
+					firstLastHalfDot = i;
+			}
+		} else {
+			/* Free memory and let others know this is empty. */
+			free(fg->pattern);
+			fg->pattern = NULL;
+			return (-1);
+		}
+	}
+
+	/*
+	 * Determine if a reverse search would be faster based on the placement
+	 * of the dots.
+	 */
+	if ((!(lflag || cflag)) && ((!(bol || eol)) &&
+	    ((lastHalfDot) && ((firstHalfDot < 0) ||
+	    ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
+		fg->reversedSearch = 1;
+		hasDot = fg->patternLen - (firstHalfDot < 0 ?
+		    firstLastHalfDot : firstHalfDot) - 1;
+		grep_revstr(fg->pattern, fg->patternLen);
+	}
+
+	/*
+	 * Normal Quick Search would require a shift based on the position the
+	 * next character after the comparison is within the pattern.  With
+	 * wildcards, the position of the last dot effects the maximum shift
+	 * distance.
+	 * The closer to the end the wild card is the slower the search.  A
+	 * reverse version of this algorithm would be useful for wildcards near
+	 * the end of the string.
+	 *
+	 * Examples:
+	 * Pattern	Max shift
+	 * -------	---------
+	 * this		5
+	 * .his		4
+	 * t.is		3
+	 * th.s		2
+	 * thi.		1
+	 */
+
+	/* Adjust the shift based on location of the last dot ('.'). */
+	shiftPatternLen = fg->patternLen - hasDot;
+
+	/* Preprocess pattern. */
+	for (i = 0; i <= UCHAR_MAX; i++)
+		fg->qsBc[i] = shiftPatternLen;
+	for (i = hasDot + 1; i < fg->patternLen; i++) {
+		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
+		/*
+		 * If case is ignored, make the jump apply to both upper and
+		 * lower cased characters.  As the pattern is stored in upper
+		 * case, apply the same to the lower case equivalents.
+		 */
+		if (iflag)
+			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
+	}
+
+	/*
+	 * Put pattern back to normal after pre-processing to allow for easy
+	 * comparisons later.
+	 */
+	if (fg->reversedSearch)
+		grep_revstr(fg->pattern, fg->patternLen);
+
+	return (0);
+}
+
+
+static int
+grep_search(fastgrep_t *fg, unsigned char *data, int dataLen, regmatch_t *pmatch)
+{
+	int j;
+	int rtrnVal = REG_NOMATCH;
+
+	pmatch->rm_so = -1;
+	pmatch->rm_eo = -1;
+
+	/* No point in going farther if we do not have enough data. */
+	if (dataLen < fg->patternLen)
+		return (rtrnVal);
+
+	/* Only try once at the beginning or ending of the line. */
+	if (fg->bol || fg->eol) {
+		/* Simple text comparison. */
+		/* Verify data is >= pattern length before searching on it. */
+		if (dataLen >= fg->patternLen) {
+			/* Determine where in data to start search at. */
+			if (fg->eol)
+				j = dataLen - fg->patternLen;
+			else
+				j = 0;
+			if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
+				if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
+					rtrnVal = 0;
+					pmatch->rm_so = j;
+					pmatch->rm_eo = j + fg->patternLen;
+				}
+		}
+	} else if (fg->reversedSearch) {
+		/* Quick Search algorithm. */
+		j = dataLen;
+		do {
+			if (grep_cmp(fg->pattern, data + j - fg->patternLen,
+			    fg->patternLen) == -1) {
+				rtrnVal = 0;
+				pmatch->rm_so = j - fg->patternLen;
+				pmatch->rm_eo = j;
+				break;
+			}
+			/* Shift if within bounds, otherwise, we are done. */
+			if (j == fg->patternLen)
+				break;
+			j -= fg->qsBc[data[j - fg->patternLen - 1]];
+		} while (j >= fg->patternLen);
+	} else {
+		/* Quick Search algorithm. */
+		j = 0;
+		do {
+			if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
+				rtrnVal = 0;
+				pmatch->rm_so = j;
+				pmatch->rm_eo = j + fg->patternLen;
+				break;
+			}
+
+			/* Shift if within bounds, otherwise, we are done. */
+			if (j + fg->patternLen == dataLen)
+				break;
+			else
+				j += fg->qsBc[data[j + fg->patternLen]];
+		} while (j <= (dataLen - fg->patternLen));
+	}
+
+	return (rtrnVal);
+}
+
 void *
 grep_malloc(size_t size)
 {
@@ -303,6 +519,48 @@
 		err(2, "realloc");
 	return ptr;
 }
+
+unsigned char *
+grep_strdup(const char *str)
+{ 
+	unsigned char *ptr; 
+
+	if ((ptr = (unsigned char *)strdup(str)) == NULL)
+		err(2, "strdup"); 
+	return ptr;
+}                       
+                        
+/*                              
+ * Returns:     i >= 0 on failure (position that it failed)
+ *              -1 on success   
+ */            
+static int
+grep_cmp(const unsigned char *pat, const unsigned char *data, size_t len)
+{
+	int i;
+
+	for (i = 0; i < len; i++) {     
+		if (((pat[i] == data[i]) || (pat[i] == '.')) ||
+			(iflag && pat[i] == toupper(data[i])))
+			continue;
+		return (i);
+	}       
+                
+	return (-1);    
+}                  
+                                
+static void
+grep_revstr(unsigned char *str, int len)
+{
+	int i;
+	char c;         
+
+	for (i = 0; i < len / 2; i++) {
+		c = str[i];
+		str[i] = str[len - i - 1];
+		str[len - i - 1] = c;
+	}       
+}              
 
 void
 printline(str_t *line, int sep, regmatch_t *matches, int m)