diff --git a/src/burl.c b/src/burl.c index fe999b21..1c3ff7bc 100644 --- a/src/burl.c +++ b/src/burl.c @@ -66,8 +66,8 @@ static int burl_normalize_basic_unreserved_fix (buffer *b, buffer *t, int i, int memcpy(p, s, (size_t)i); for (; i < used; ++i, ++j) { if (!encoded_chars_http_uri_reqd[s[i]]) { - if (s[i] == '?' && -1 == qs) qs = j; p[j] = s[i]; + if (__builtin_expect( (s[i] == '?'), 0) && -1 == qs) qs = j; } else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) { const unsigned int x = (n1 << 4) | n2; @@ -104,7 +104,7 @@ static int burl_normalize_basic_unreserved (buffer *b, buffer *t) for (int i = 0; i < used; ++i) { if (!encoded_chars_http_uri_reqd[s[i]]) { - if (s[i] == '?' && -1 == qs) qs = i; + if (__builtin_expect( (s[i] == '?'), 0) && -1 == qs) qs = i; } else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2) && !burl_is_unreserved((x = (n1 << 4) | n2))) { @@ -135,11 +135,12 @@ static int burl_normalize_basic_required_fix (buffer *b, buffer *t, int i, int q unsigned char * const p = (unsigned char *)buffer_string_prepare_copy(t,i+(used-i)*3+1); unsigned int n1, n2; + int invalid_utf8 = 0; memcpy(p, s, (size_t)i); for (; i < used; ++i, ++j) { if (!encoded_chars_http_uri_reqd[s[i]]) { - if (s[i] == '?' 
&& -1 == qs) qs = j; p[j] = s[i]; + if (__builtin_expect( (s[i] == '?'), 0) && -1 == qs) qs = j; } else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) { const unsigned int x = (n1 << 4) | n2; @@ -153,7 +154,7 @@ static int burl_normalize_basic_required_fix (buffer *b, buffer *t, int i, int q p[j] = '%'; p[++j] = hex_chars_uc[n1]; /*(s[i+1] & 0xdf)*/ p[++j] = hex_chars_uc[n2]; /*(s[i+2] & 0xdf)*/ - if (li_utf8_invalid_byte(x)) qs = -2; + invalid_utf8 |= li_utf8_invalid_byte(x); } i+=2; } @@ -162,11 +163,11 @@ static int burl_normalize_basic_required_fix (buffer *b, buffer *t, int i, int q p[j] = '%'; p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF]; p[++j] = hex_chars_uc[s[i] & 0xF]; - if (li_utf8_invalid_byte(s[i])) qs = -2; + invalid_utf8 |= li_utf8_invalid_byte(s[i]); } } buffer_copy_string_len(b, (char *)p, (size_t)j); - return qs; + return !invalid_utf8 ? qs : -2; } @@ -176,17 +177,18 @@ static int burl_normalize_basic_required (buffer *b, buffer *t) const int used = (int)buffer_clen(b); unsigned int n1, n2, x; int qs = -1; + int invalid_utf8 = 0; for (int i = 0; i < used; ++i) { if (!encoded_chars_http_uri_reqd[s[i]]) { - if (s[i] == '?' && -1 == qs) qs = i; + if (__builtin_expect( (s[i] == '?'), 0) && -1 == qs) qs = i; } else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2) && (encoded_chars_http_uri_reqd[(x = (n1 << 4) | n2)] || (qs < 0 ? (x == '/' || x == '?') : (x == '&' || x == '=' || x == ';' || x == '+')))) { - if (li_utf8_invalid_byte(x)) qs = -2; + invalid_utf8 |= li_utf8_invalid_byte(x); if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf; /* uppercase hex */ if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf; /* uppercase hex */ i+=2; @@ -201,7 +203,7 @@ static int burl_normalize_basic_required (buffer *b, buffer *t) } } - return qs; + return !invalid_utf8 ? 
qs : -2; } @@ -323,6 +325,15 @@ static int burl_normalize_path (buffer *b, buffer *t, int qs, int flags) } +__attribute_cold__ +__attribute_noinline__ +__attribute_pure__ +static int burl_scan_qmark (const buffer * const b) { + const char * const qmark = strchr(b->ptr, '?'); + return qmark ? (int)(qmark - b->ptr) : -1; +} + + int burl_normalize (buffer *b, buffer *t, int flags) { int qs; @@ -342,7 +353,10 @@ int burl_normalize (buffer *b, buffer *t, int flags) qs = (flags & HTTP_PARSEOPT_URL_NORMALIZE_REQUIRED) ? burl_normalize_basic_required(b, t) : burl_normalize_basic_unreserved(b, t); - if (-2 == qs) return -2; + if (-2 == qs) { + if (flags & HTTP_PARSEOPT_URL_NORMALIZE_INVALID_UTF8_REJECT) return -2; + qs = burl_scan_qmark(b); + } if (flags & HTTP_PARSEOPT_URL_NORMALIZE_CTRLS_REJECT) { if (burl_contains_ctrls(b)) return -2; diff --git a/src/burl.h b/src/burl.h index a013cc07..2d66f192 100644 --- a/src/burl.h +++ b/src/burl.h @@ -26,6 +26,7 @@ enum burl_opts_e { ,HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REMOVE =0x400/* "." ".." 
"//" */ ,HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT =0x800 ,HTTP_PARSEOPT_URL_NORMALIZE_QUERY_20_PLUS =0x1000 + ,HTTP_PARSEOPT_URL_NORMALIZE_INVALID_UTF8_REJECT =0x2000 ,HTTP_PARSEOPT_METHOD_GET_BODY =0x8000 }; diff --git a/src/configfile.c b/src/configfile.c index 23cb8d06..1961ff33 100644 --- a/src/configfile.c +++ b/src/configfile.c @@ -574,6 +574,8 @@ static int config_http_parseopts (server *srv, const array *a) { opt = HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT; else if (buffer_eq_slen(k, CONST_STR_LEN("url-query-20-plus"))) opt = HTTP_PARSEOPT_URL_NORMALIZE_QUERY_20_PLUS; + else if (buffer_eq_slen(k, CONST_STR_LEN("url-invalid-utf8-reject"))) + opt = HTTP_PARSEOPT_URL_NORMALIZE_INVALID_UTF8_REJECT; else if (buffer_eq_slen(k, CONST_STR_LEN("header-strict"))) { srv->srvconf.http_header_strict = val; continue; @@ -631,7 +633,8 @@ static int config_http_parseopts (server *srv, const array *a) { } if (!(opts & (HTTP_PARSEOPT_URL_NORMALIZE_UNRESERVED |HTTP_PARSEOPT_URL_NORMALIZE_REQUIRED))) { - opts |= HTTP_PARSEOPT_URL_NORMALIZE_UNRESERVED; + opts |= HTTP_PARSEOPT_URL_NORMALIZE_UNRESERVED + | HTTP_PARSEOPT_URL_NORMALIZE_INVALID_UTF8_REJECT; if (decode_2f && !(opts & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)) opts |= HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE; @@ -1521,7 +1524,8 @@ void config_init(server *srv) { | HTTP_PARSEOPT_URL_NORMALIZE_UNRESERVED | HTTP_PARSEOPT_URL_NORMALIZE_CTRLS_REJECT | HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE - | HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REMOVE; + | HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REMOVE + | HTTP_PARSEOPT_URL_NORMALIZE_INVALID_UTF8_REJECT; srv->srvconf.modules = array_init(16); srv->srvconf.modules_dir = LIBRARY_DIR; diff --git a/src/t/test_burl.c b/src/t/test_burl.c index 174d1c3c..fa9190ff 100644 --- a/src/t/test_burl.c +++ b/src/t/test_burl.c @@ -31,6 +31,11 @@ static void test_burl_normalize (void) { int flags; flags = HTTP_PARSEOPT_URL_NORMALIZE_UNRESERVED; + run_burl_normalize(psrc, ptmp, flags, 
__LINE__, CONST_STR_LEN("/%C0"), CONST_STR_LEN("/%C0")); + run_burl_normalize(psrc, ptmp, flags, __LINE__, CONST_STR_LEN("/\377"), CONST_STR_LEN("/%FF")); + + flags = HTTP_PARSEOPT_URL_NORMALIZE_UNRESERVED + | HTTP_PARSEOPT_URL_NORMALIZE_INVALID_UTF8_REJECT; run_burl_normalize(psrc, ptmp, flags, __LINE__, CONST_STR_LEN("no-slash"), CONST_STR_LEN("no-slash")); run_burl_normalize(psrc, ptmp, flags, __LINE__, CONST_STR_LEN("/"), CONST_STR_LEN("/")); run_burl_normalize(psrc, ptmp, flags, __LINE__, CONST_STR_LEN("/abc"), CONST_STR_LEN("/abc")); @@ -53,11 +58,13 @@ static void test_burl_normalize (void) { run_burl_normalize(psrc, ptmp, flags, __LINE__, CONST_STR_LEN("/%3a"), CONST_STR_LEN("/%3A")); run_burl_normalize(psrc, ptmp, flags, __LINE__, CONST_STR_LEN("/%3A"), CONST_STR_LEN("/%3A")); run_burl_normalize(psrc, ptmp, flags, __LINE__, CONST_STR_LEN("/~test%20ä_"), CONST_STR_LEN("/~test%20%C3%A4_")); + run_burl_normalize(psrc, ptmp, flags, __LINE__, CONST_STR_LEN("/%C0"), "", (size_t)-2); run_burl_normalize(psrc, ptmp, flags, __LINE__, CONST_STR_LEN("/\375"), "", (size_t)-2); run_burl_normalize(psrc, ptmp, flags, __LINE__, CONST_STR_LEN("/\376"), "", (size_t)-2); run_burl_normalize(psrc, ptmp, flags, __LINE__, CONST_STR_LEN("/\377"), "", (size_t)-2); - flags = HTTP_PARSEOPT_URL_NORMALIZE_REQUIRED; + flags = HTTP_PARSEOPT_URL_NORMALIZE_REQUIRED + | HTTP_PARSEOPT_URL_NORMALIZE_INVALID_UTF8_REJECT; run_burl_normalize(psrc, ptmp, flags, __LINE__, CONST_STR_LEN("/"), CONST_STR_LEN("/")); run_burl_normalize(psrc, ptmp, flags, __LINE__, CONST_STR_LEN("/abc"), CONST_STR_LEN("/abc")); run_burl_normalize(psrc, ptmp, flags, __LINE__, CONST_STR_LEN("/abc/"), CONST_STR_LEN("/abc/"));