Add an --include-regex (-g) option to wget 1.11.

During recursive retrieval, a URL whose path does not match the given
PCRE pattern is skipped.  The pattern is stored in opt.includeregex and
is compiled on first use by accregex() in utils.c.

NOTE(review): this copy of the patch had lost its line breaks, its
indentation and the header names on the `#include' lines.  Context-line
whitespace below is reconstructed, so apply with `patch -l' if hunks
are rejected.  The two context includes in the first utils.c hunk are
assumed to be <stdarg.h> and <locale.h>, and the one under
HAVE_SYS_IOCTL_H to be <sys/ioctl.h>; confirm against the wget 1.11
tree.

diff -ur /tmp/wget-1.11/src/init.c wget-1.11/src/init.c
--- /tmp/wget-1.11/src/init.c 2008-01-26 11:26:57.000000000 +0200
+++ wget-1.11/src/init.c 2008-02-01 13:23:52.000000000 +0200
@@ -174,6 +174,7 @@
   { "ignorelength", &opt.ignore_length, cmd_boolean },
   { "ignoretags", &opt.ignore_tags, cmd_vector },
   { "includedirectories", &opt.includes, cmd_directory_vector },
+  { "includeregex", &opt.includeregex, cmd_string },
 #ifdef ENABLE_IPV6
   { "inet4only", &opt.ipv4_only, cmd_boolean },
   { "inet6only", &opt.ipv6_only, cmd_boolean },
@@ -1494,6 +1495,7 @@
   free_vec (opt.rejects);
   free_vec (opt.excludes);
   free_vec (opt.includes);
+  xfree_null (opt.includeregex);
   free_vec (opt.domains);
   free_vec (opt.follow_tags);
   free_vec (opt.ignore_tags);
diff -ur /tmp/wget-1.11/src/main.c wget-1.11/src/main.c
--- /tmp/wget-1.11/src/main.c 2008-01-26 11:26:57.000000000 +0200
+++ wget-1.11/src/main.c 2008-02-01 14:06:34.000000000 +0200
@@ -183,6 +183,7 @@
     { "ignore-length", 0, OPT_BOOLEAN, "ignorelength", -1 },
     { "ignore-tags", 0, OPT_VALUE, "ignoretags", -1 },
     { "include-directories", 'I', OPT_VALUE, "includedirectories", -1 },
+    { "include-regex", 'g', OPT_VALUE, "includeregex", -1 },
 #ifdef ENABLE_IPV6
     { "inet4-only", '4', OPT_BOOLEAN, "inet4only", -1 },
     { "inet6-only", '6', OPT_BOOLEAN, "inet6only", -1 },
@@ -626,6 +627,8 @@
     N_("\
   -X,  --exclude-directories=LIST  list of excluded directories.\n"),
     N_("\
+  -g,  --include-regex=REGEX       a regex that path should match to be included.\n"),
+    N_("\
   -np, --no-parent                 don't ascend to the parent directory.\n"),
     "\n",
 
diff -ur /tmp/wget-1.11/src/options.h wget-1.11/src/options.h
--- /tmp/wget-1.11/src/options.h 2008-01-26 11:26:57.000000000 +0200
+++ wget-1.11/src/options.h 2008-02-01 13:24:50.000000000 +0200
@@ -66,6 +66,8 @@
   char **excludes;              /* List of excluded FTP directories. */
   char **includes;              /* List of FTP directories to
                                    follow. */
+  char *includeregex;           /* Regex which must match for including the
+                                   file in download */
   bool ignore_case;             /* Whether to ignore case when
                                    matching dirs and files */
 
diff -ur /tmp/wget-1.11/src/recur.c wget-1.11/src/recur.c
--- /tmp/wget-1.11/src/recur.c 2008-01-26 11:26:58.000000000 +0200
+++ wget-1.11/src/recur.c 2008-02-01 14:16:12.000000000 +0200
@@ -527,6 +527,16 @@
         }
     }
 
+  /* If the path is not covered by regex, skip it */
+  if (opt.includeregex)
+    {
+      if (!accregex(u->path))
+        {
+          DEBUGP (("%s (%s) is not-included by regex.\n", url, u->path));
+          goto out;
+        }
+    }
+
   /* 6. Check for acceptance/rejection rules.  We ignore these rules
      for directories (no file name to match) and for non-leaf HTMLs,
      which can lead to other files that do need to be downloaded.  (-p
diff -ur /tmp/wget-1.11/src/utils.c wget-1.11/src/utils.c
--- /tmp/wget-1.11/src/utils.c 2008-01-26 11:26:58.000000000 +0200
+++ wget-1.11/src/utils.c 2008-02-01 14:46:23.000000000 +0200
@@ -58,6 +58,8 @@
 #include <stdarg.h>
 #include <locale.h>
 
+#include <pcre.h>
+
 /* For TIOCGWINSZ and friends: */
 #ifdef HAVE_SYS_IOCTL_H
 # include <sys/ioctl.h>
@@ -744,6 +746,39 @@
   return true;
 }
 
+/* Return true when PATH matches the PCRE pattern in opt.includeregex.
+
+   The pattern is compiled once, on first use.  If compilation fails,
+   the error is reported once and every later call returns false
+   without trying again.  */
+bool
+accregex (const char *path)
+{
+  static pcre *expr;
+  static bool compile_failed;
+
+  if (compile_failed)
+    return false;
+
+  if (!expr)
+    {
+      const char *errstr;
+      int erroffset;
+
+      expr = pcre_compile (opt.includeregex, 0, &errstr, &erroffset, NULL);
+      if (!expr)
+        {
+          fprintf (stderr, "%s: %s\n", opt.includeregex, errstr);
+          compile_failed = true;
+          return false;
+        }
+    }
+
+  /* Only match/no-match matters here, so no capture vector is passed;
+     pcre_exec returns a negative value on no-match or error.  */
+  return pcre_exec (expr, NULL, path, (int) strlen (path), 0, 0, NULL, 0) >= 0;
+}
+
 /* Return true if STRING ends with TAIL.  For instance:
 
    match_tail ("abc", "bc", false)  -> 1
diff -ur /tmp/wget-1.11/src/utils.h wget-1.11/src/utils.h
--- /tmp/wget-1.11/src/utils.h 2008-01-26 11:26:58.000000000 +0200
+++ wget-1.11/src/utils.h 2008-02-01 14:01:27.000000000 +0200
@@ -72,6 +72,8 @@
 int fnmatch_nocase (const char *, const char *, int);
 bool acceptable (const char *);
 bool accdir (const char *s);
+bool accregex (const char *s);
+
 char *suffix (const char *s);
 bool match_tail (const char *, const char *, bool);
 bool has_wildcards_p (const char *);