// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file tests the robots.txt parsing and matching code found in robots.cc
// against the current Robots Exclusion Protocol (REP) internet draft (I-D).
// https://tools.ietf.org/html/draft-koster-rep

module unrobotstxt.test;

import unrobotstxt;

version (unittest)
{
    // Convenience wrapper: parses `robotstxt` and reports whether `useragent`
    // is allowed to fetch `url` according to it. A fresh matcher is created
    // per call so individual checks cannot leak state into each other.
    bool IsUserAgentAllowed(string robotstxt, string useragent, string url)
    {
        auto matcher = new RobotsMatcher;
        return matcher.OneAgentAllowedByRobots(robotstxt, useragent, url);
    }
}

// Google-specific: system test.
unittest
{
    // GoogleOnly_SystemTest
    immutable robotstxt =
        "user-agent: FooBot\n" ~
        "disallow: /\n";
    // Empty robots.txt: everything allowed.
    assert (IsUserAgentAllowed("", "FooBot", ""));

    // Empty user-agent to be matched: everything allowed.
    assert (IsUserAgentAllowed(robotstxt, "", ""));

    // Empty url: implicitly disallowed, see method comment for GetPathParamsQuery
    // in robots.cc.
    assert (!IsUserAgentAllowed(robotstxt, "FooBot", ""));

    // All params empty: same as robots.txt empty, everything allowed.
    assert (IsUserAgentAllowed("", "", ""));
}
// Rules are colon separated name-value pairs. The following names are
// provisioned:
//     user-agent: <value>
//     allow: <value>
//     disallow: <value>
// See REP I-D section "Protocol Definition".
// https://tools.ietf.org/html/draft-koster-rep#section-2.1
//
// Google specific: webmasters sometimes miss the colon separator, but it's
// obvious what they mean by "disallow /", so we assume the colon if it's
// missing.
unittest
{
    // ID_LineSyntax_Line
    immutable robotstxt_correct =
        "user-agent: FooBot\n" ~
        "disallow: /\n";
    immutable robotstxt_incorrect =
        "foo: FooBot\n" ~
        "bar: /\n";
    immutable robotstxt_incorrect_accepted =
        "user-agent FooBot\n" ~
        "disallow /\n";
    immutable url = "http://foo.bar/x/y";

    assert (!IsUserAgentAllowed(robotstxt_correct, "FooBot", url));
    assert (IsUserAgentAllowed(robotstxt_incorrect, "FooBot", url));
    assert (!IsUserAgentAllowed(robotstxt_incorrect_accepted, "FooBot", url));
}

// A group is one or more user-agent lines followed by rules, and terminated
// by another user-agent line. Rules for the same user-agents are combined
// opaquely into one group. Rules outside groups are ignored.
// See REP I-D section "Protocol Definition".
// https://tools.ietf.org/html/draft-koster-rep#section-2.1
unittest
{
    // ID_LineSyntax_Groups
    immutable robotstxt =
        "allow: /foo/bar/\n" ~
        "\n" ~
        "user-agent: FooBot\n" ~
        "disallow: /\n" ~
        "allow: /x/\n" ~
        "user-agent: BarBot\n" ~
        "disallow: /\n" ~
        "allow: /y/\n" ~
        "\n" ~
        "\n" ~
        "allow: /w/\n" ~
        "user-agent: BazBot\n" ~
        "\n" ~
        "user-agent: FooBot\n" ~
        "allow: /z/\n" ~
        "disallow: /\n";

    immutable url_w = "http://foo.bar/w/a";
    immutable url_x = "http://foo.bar/x/b";
    immutable url_y = "http://foo.bar/y/c";
    immutable url_z = "http://foo.bar/z/d";
    immutable url_foo = "http://foo.bar/foo/bar/";

    assert (IsUserAgentAllowed(robotstxt, "FooBot", url_x));
    assert (IsUserAgentAllowed(robotstxt, "FooBot", url_z));
    assert (!IsUserAgentAllowed(robotstxt, "FooBot", url_y));
    assert (IsUserAgentAllowed(robotstxt, "BarBot", url_y));
    assert (IsUserAgentAllowed(robotstxt, "BarBot", url_w));
    assert (!IsUserAgentAllowed(robotstxt, "BarBot", url_z));
    assert (IsUserAgentAllowed(robotstxt, "BazBot", url_z));

    // Lines with rules outside groups are ignored.
    assert (!IsUserAgentAllowed(robotstxt, "FooBot", url_foo));
    assert (!IsUserAgentAllowed(robotstxt, "BarBot", url_foo));
    assert (!IsUserAgentAllowed(robotstxt, "BazBot", url_foo));
}

// REP lines are case insensitive. See REP I-D section "Protocol Definition".
// https://tools.ietf.org/html/draft-koster-rep#section-2.1
unittest
{
    // ID_REPLineNamesCaseInsensitive
    immutable robotstxt_upper =
        "USER-AGENT: FooBot\n" ~
        "ALLOW: /x/\n" ~
        "DISALLOW: /\n";
    immutable robotstxt_lower =
        "user-agent: FooBot\n" ~
        "allow: /x/\n" ~
        "disallow: /\n";
    immutable robotstxt_camel =
        "uSeR-aGeNt: FooBot\n" ~
        "AlLoW: /x/\n" ~
        "dIsAlLoW: /\n";
    immutable url_allowed = "http://foo.bar/x/y";
    immutable url_disallowed = "http://foo.bar/a/b";

    assert (IsUserAgentAllowed(robotstxt_upper, "FooBot", url_allowed));
    assert (IsUserAgentAllowed(robotstxt_lower, "FooBot", url_allowed));
    assert (IsUserAgentAllowed(robotstxt_camel, "FooBot", url_allowed));
    assert (!IsUserAgentAllowed(robotstxt_upper, "FooBot", url_disallowed));
    assert (!IsUserAgentAllowed(robotstxt_lower, "FooBot", url_disallowed));
    assert (!IsUserAgentAllowed(robotstxt_camel, "FooBot", url_disallowed));
}

// A user-agent line is expected to contain only [a-zA-Z_-] characters and must
// not be empty. See REP I-D section "The user-agent line".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
unittest
{
    // ID_VerifyValidUserAgentsToObey
    assert (RobotsMatcher.IsValidUserAgentToObey("Foobot"));
    assert (RobotsMatcher.IsValidUserAgentToObey("Foobot-Bar"));
    assert (RobotsMatcher.IsValidUserAgentToObey("Foo_Bar"));

    assert (!RobotsMatcher.IsValidUserAgentToObey(""));
    assert (!RobotsMatcher.IsValidUserAgentToObey("ツ"));

    assert (!RobotsMatcher.IsValidUserAgentToObey("Foobot*"));
    assert (!RobotsMatcher.IsValidUserAgentToObey(" Foobot "));
    assert (!RobotsMatcher.IsValidUserAgentToObey("Foobot/2.1"));

    assert (!RobotsMatcher.IsValidUserAgentToObey("Foobot Bar"));
}

// User-agent line values are case insensitive. See REP I-D section "The
// user-agent line".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
unittest
{
    // ID_UserAgentValueCaseInsensitive
    immutable robotstxt_upper =
        "User-Agent: FOO BAR\n" ~
        "Allow: /x/\n" ~
        "Disallow: /\n";
    immutable robotstxt_lower =
        "User-Agent: foo bar\n" ~
        "Allow: /x/\n" ~
        "Disallow: /\n";
    immutable robotstxt_camel =
        "User-Agent: FoO bAr\n" ~
        "Allow: /x/\n" ~
        "Disallow: /\n";
    immutable url_allowed = "http://foo.bar/x/y";
    immutable url_disallowed = "http://foo.bar/a/b";

    assert (IsUserAgentAllowed(robotstxt_upper, "Foo", url_allowed));
    assert (IsUserAgentAllowed(robotstxt_lower, "Foo", url_allowed));
    assert (IsUserAgentAllowed(robotstxt_camel, "Foo", url_allowed));
    assert (!IsUserAgentAllowed(robotstxt_upper, "Foo", url_disallowed));
    assert (!IsUserAgentAllowed(robotstxt_lower, "Foo", url_disallowed));
    assert (!IsUserAgentAllowed(robotstxt_camel, "Foo", url_disallowed));
    assert (IsUserAgentAllowed(robotstxt_upper, "foo", url_allowed));
    assert (IsUserAgentAllowed(robotstxt_lower, "foo", url_allowed));
    assert (IsUserAgentAllowed(robotstxt_camel, "foo", url_allowed));
    assert (!IsUserAgentAllowed(robotstxt_upper, "foo", url_disallowed));
    assert (!IsUserAgentAllowed(robotstxt_lower, "foo", url_disallowed));
    assert (!IsUserAgentAllowed(robotstxt_camel, "foo", url_disallowed));
}

// Google specific: accept user-agent value up to the first space. Space is not
// allowed in user-agent values, but that doesn't stop webmasters from using
// them. This is more restrictive than the I-D, since in case of the bad value
// "Googlebot Images" we'd still obey the rules with "Googlebot".
// Extends REP I-D section "The user-agent line"
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
unittest
{
    // GoogleOnly_AcceptUserAgentUpToFirstSpace
    assert (!RobotsMatcher.IsValidUserAgentToObey("Foobot Bar"));
    immutable robotstxt =
        "User-Agent: *\n" ~
        "Disallow: /\n" ~
        "User-Agent: Foo Bar\n" ~
        "Allow: /x/\n" ~
        "Disallow: /\n";
    immutable url = "http://foo.bar/x/y";

    assert (IsUserAgentAllowed(robotstxt, "Foo", url));
    assert (!IsUserAgentAllowed(robotstxt, "Foo Bar", url));
}

// If no group matches the user-agent, crawlers must obey the first group with a
// user-agent line with a "*" value, if present. If no group satisfies either
// condition, or no groups are present at all, no rules apply.
// See REP I-D section "The user-agent line".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
unittest
{
    // ID_GlobalGroups_Secondary
    immutable robotstxt_empty = "";
    immutable robotstxt_global =
        "user-agent: *\n" ~
        "allow: /\n" ~
        "user-agent: FooBot\n" ~
        "disallow: /\n";
    immutable robotstxt_only_specific =
        "user-agent: FooBot\n" ~
        "allow: /\n" ~
        "user-agent: BarBot\n" ~
        "disallow: /\n" ~
        "user-agent: BazBot\n" ~
        "disallow: /\n";
    immutable url = "http://foo.bar/x/y";

    assert (IsUserAgentAllowed(robotstxt_empty, "FooBot", url));
    assert (!IsUserAgentAllowed(robotstxt_global, "FooBot", url));
    assert (IsUserAgentAllowed(robotstxt_global, "BarBot", url));
    assert (IsUserAgentAllowed(robotstxt_only_specific, "QuxBot", url));
}

// Matching rules against URIs is case sensitive.
// See REP I-D section "The Allow and Disallow lines".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.2
unittest
{
    // ID_AllowDisallow_Value_CaseSensitive
    immutable robotstxt_lowercase_url =
        "user-agent: FooBot\n" ~
        "disallow: /x/\n";
    immutable robotstxt_uppercase_url =
        "user-agent: FooBot\n" ~
        "disallow: /X/\n";
    immutable url = "http://foo.bar/x/y";

    assert (!IsUserAgentAllowed(robotstxt_lowercase_url, "FooBot", url));
    assert (IsUserAgentAllowed(robotstxt_uppercase_url, "FooBot", url));
}

// The most specific match found MUST be used. The most specific match is the
// match that has the most octets. In case of multiple rules with the same
// length, the least strict rule must be used.
// See REP I-D section "The Allow and Disallow lines".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.2
unittest
{
    // ID_LongestMatch
    immutable url = "http://foo.bar/x/page.html";
    {
        immutable robotstxt =
            "user-agent: FooBot\n" ~
            "disallow: /x/page.html\n" ~
            "allow: /x/\n";

        assert (!IsUserAgentAllowed(robotstxt, "FooBot", url));
    }
    {
        immutable robotstxt =
            "user-agent: FooBot\n" ~
            "allow: /x/page.html\n" ~
            "disallow: /x/\n";

        assert (IsUserAgentAllowed(robotstxt, "FooBot", url));
        assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/"));
    }
    {
        immutable robotstxt =
            "user-agent: FooBot\n" ~
            "disallow: \n" ~
            "allow: \n";
        // In case of equivalent disallow and allow patterns for the same
        // user-agent, allow is used.
        assert (IsUserAgentAllowed(robotstxt, "FooBot", url));
    }
    {
        immutable robotstxt =
            "user-agent: FooBot\n" ~
            "disallow: /\n" ~
            "allow: /\n";
        // In case of equivalent disallow and allow patterns for the same
        // user-agent, allow is used.
        assert (IsUserAgentAllowed(robotstxt, "FooBot", url));
    }
    {
        immutable url_a = "http://foo.bar/x";
        immutable url_b = "http://foo.bar/x/";
        immutable robotstxt =
            "user-agent: FooBot\n" ~
            "disallow: /x\n" ~
            "allow: /x/\n";
        assert (!IsUserAgentAllowed(robotstxt, "FooBot", url_a));
        assert (IsUserAgentAllowed(robotstxt, "FooBot", url_b));
    }

    {
        immutable robotstxt =
            "user-agent: FooBot\n" ~
            "disallow: /x/page.html\n" ~
            "allow: /x/page.html\n";
        // In case of equivalent disallow and allow patterns for the same
        // user-agent, allow is used.
        assert (IsUserAgentAllowed(robotstxt, "FooBot", url));
    }
    {
        immutable robotstxt =
            "user-agent: FooBot\n" ~
            "allow: /page\n" ~
            "disallow: /*.html\n";
        // Longest match wins.
        assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/page.html"));
        assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/page"));
    }
    {
        immutable robotstxt =
            "user-agent: FooBot\n" ~
            "allow: /x/page.\n" ~
            "disallow: /*.html\n";
        // Longest match wins.
        assert (IsUserAgentAllowed(robotstxt, "FooBot", url));
        assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/y.html"));
    }
    {
        immutable robotstxt =
            "User-agent: *\n" ~
            "Disallow: /x/\n" ~
            "User-agent: FooBot\n" ~
            "Disallow: /y/\n";
        // Most specific group for FooBot allows implicitly /x/page.
        assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/page"));
        assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/y/page"));
    }
}

// Octets in the URI and robots.txt paths outside the range of the US-ASCII
// coded character set, and those in the reserved range defined by RFC3986,
// MUST be percent-encoded as defined by RFC3986 prior to comparison.
// See REP I-D section "The Allow and Disallow lines".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.2
//
// NOTE: It's up to the caller to percent encode a URL before passing it to the
// parser. Percent encoding URIs in the rules is unnecessary.
unittest
{
    // ID_Encoding
    // /foo/bar?baz=http://foo.bar stays unencoded.
    {
        immutable robotstxt =
            "User-agent: FooBot\n" ~
            "Disallow: /\n" ~
            "Allow: /foo/bar?qux=taz&baz=http://foo.bar?tar&par\n";
        assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar?qux=taz&baz=http://foo.bar?tar&par"));
    }

    // 3 byte character: /foo/bar/ツ -> /foo/bar/%E3%83%84
    {
        immutable robotstxt =
            "User-agent: FooBot\n" ~
            "Disallow: /\n" ~
            "Allow: /foo/bar/ツ\n";
        assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/%E3%83%84"));
        // The parser encodes the 3-byte character, but the URL is not %-encoded.
        assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/ツ"));
    }
    // Percent encoded 3 byte character: /foo/bar/%E3%83%84 -> /foo/bar/%E3%83%84
    {
        immutable robotstxt =
            "User-agent: FooBot\n" ~
            "Disallow: /\n" ~
            "Allow: /foo/bar/%E3%83%84\n";
        assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/%E3%83%84"));
        assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/ツ"));
    }
    // Percent encoded unreserved US-ASCII: /foo/bar/%62%61%7A -> NULL
    // This is illegal according to RFC3986 and while it may work here due to
    // simple string matching, it should not be relied on.
    {
        immutable robotstxt =
            "User-agent: FooBot\n" ~
            "Disallow: /\n" ~
            "Allow: /foo/bar/%62%61%7A\n";
        assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/baz"));
        assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/%62%61%7A"));
    }
}

// The REP I-D defines the following characters that have special meaning in
// robots.txt:
//     # - inline comment.
//     $ - end of pattern.
//     * - any number of characters.
// See REP I-D section "Special Characters".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.3
unittest
{
    // ID_SpecialCharacters
    {
        immutable robotstxt =
            "User-agent: FooBot\n" ~
            "Disallow: /foo/bar/quz\n" ~
            "Allow: /foo/*/qux\n";
        assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/quz"));
        assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/quz"));
        assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo//quz"));
        assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bax/quz"));
    }
    {
        immutable robotstxt =
            "User-agent: FooBot\n" ~
            "Disallow: /foo/bar$\n" ~
            "Allow: /foo/bar/qux\n";
        assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar"));
        assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/qux"));
        assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/"));
        assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/baz"));
    }
    {
        immutable robotstxt =
            "User-agent: FooBot\n" ~
            "# Disallow: /\n" ~
            "Disallow: /foo/quz#qux\n" ~
            "Allow: /\n";
        assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar"));
        assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/quz"));
    }
}

// Google-specific: "index.html" (and only that) at the end of a pattern is
// equivalent to "/".
unittest
{
    // GoogleOnly_IndexHTMLisDirectory
    immutable robotstxt =
        "User-Agent: *\n" ~
        "Allow: /allowed-slash/index.html\n" ~
        "Disallow: /\n";
    // If index.html is allowed, we interpret this as / being allowed too.
    assert (IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/allowed-slash/"));
    // Does not exactly match.
    assert (!IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/allowed-slash/index.htm"));
    // Exact match.
    assert (IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/allowed-slash/index.html"));
    assert (!IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/anyother-url"));
}

// Google-specific: long lines are ignored after 8 * 2083 bytes. See comment in
// RobotsTxtParser::Parse().
unittest
{
    // GoogleOnly_LineTooLong
    immutable kEOLLen = "\n".length;
    immutable size_t kMaxLineLen = 2083 * 8;
    immutable allow = "allow: ";
    immutable disallow = "disallow: ";

    // Disallow rule pattern matches the URL after being cut off at kMaxLineLen.
    {
        string robotstxt = "user-agent: FooBot\n";
        string longline = "/x/";
        immutable size_t max_length = kMaxLineLen - longline.length - disallow.length + kEOLLen;
        while (longline.length < max_length)
        {
            longline ~= "a";
        }
        robotstxt ~= disallow ~ longline ~ "/qux\n";

        // Matches nothing, so URL is allowed.
        assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fux"));
        // Matches cut off disallow rule.
        assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar" ~ longline ~ "/fux"));
    }

    {
        string robotstxt =
            "user-agent: FooBot\n" ~
            "disallow: /\n";
        string longline_a = "/x/";
        string longline_b = "/x/";
        immutable size_t max_length = kMaxLineLen - longline_a.length - allow.length + kEOLLen;
        while (longline_a.length < max_length)
        {
            longline_a ~= "a";
            longline_b ~= "b";
        }
        robotstxt ~= allow ~ longline_a ~ "/qux\n";
        robotstxt ~= allow ~ longline_b ~ "/qux\n";

        // URL matches the disallow rule.
        assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/"));
        // Matches the allow rule exactly.
        assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar" ~ longline_a ~ "/qux"));
        // Matches cut off allow rule.
        assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar" ~ longline_b ~ "/fux"));
    }
}

unittest
{
    // GoogleOnly_DocumentationChecks
    // Test documentation from
    // https://developers.google.com/search/reference/robots_txt
    // Section "URL matching based on path values".
544 { 545 immutable robotstxt = 546 "user-agent: FooBot\n" ~ 547 "disallow: /\n" ~ 548 "allow: /fish\n"; 549 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar")); 550 551 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish")); 552 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html")); 553 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/salmon.html")); 554 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads")); 555 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads/yummy.html")); 556 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html?id=anything")); 557 558 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.asp")); 559 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/catfish")); 560 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/?id=fish")); 561 } 562 // "/fish*" equals "/fish" 563 { 564 immutable robotstxt = 565 "user-agent: FooBot\n" ~ 566 "disallow: /\n" ~ 567 "allow: /fish*\n"; 568 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar")); 569 570 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish")); 571 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html")); 572 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/salmon.html")); 573 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads")); 574 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads/yummy.html")); 575 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html?id=anything")); 576 577 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.bar")); 578 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/catfish")); 579 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/?id=fish")); 580 } 581 // "/fish/" does not equal "/fish" 582 { 583 
immutable robotstxt = 584 "user-agent: FooBot\n" ~ 585 "disallow: /\n" ~ 586 "allow: /fish/\n"; 587 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar")); 588 589 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/")); 590 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/salmon")); 591 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/?salmon")); 592 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/salmon.html")); 593 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/?id=anything")); 594 595 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish")); 596 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html")); 597 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish/Salmon.html")); 598 } 599 // "/*.php" 600 { 601 immutable robotstxt = 602 "user-agent: FooBot\n" ~ 603 "disallow: /\n" ~ 604 "allow: /*.php\n"; 605 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar")); 606 607 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php")); 608 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/folder/filename.php")); 609 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/folder/filename.php?parameters")); 610 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar//folder/any.php.file.html")); 611 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php/")); 612 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/index?f=filename.php/")); 613 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/php/")); 614 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/index?php")); 615 616 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/windows.PHP")); 617 } 618 // "/*.php$" 619 { 620 immutable robotstxt = 621 "user-agent: FooBot\n" ~ 622 "disallow: /\n" ~ 623 "allow: 
/*.php$\n"; 624 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar")); 625 626 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php")); 627 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/folder/filename.php")); 628 629 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php?parameters")); 630 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php/")); 631 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php5")); 632 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/php/")); 633 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename?php")); 634 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/aaaphpaaa")); 635 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar//windows.PHP")); 636 } 637 // "/fish*.php" 638 { 639 immutable robotstxt = 640 "user-agent: FooBot\n" ~ 641 "disallow: /\n" ~ 642 "allow: /fish*.php\n"; 643 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar")); 644 645 assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.php")); 646 assert ( IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads/catfish.php?parameters")); 647 648 assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.PHP")); 649 } 650 // Section "Order of precedence for group-member records". 
651 { 652 immutable robotstxt = 653 "user-agent: FooBot\n" ~ 654 "allow: /p\n" ~ 655 "disallow: /\n"; 656 immutable url = "http://example.com/page"; 657 assert (IsUserAgentAllowed(robotstxt, "FooBot", url)); 658 } 659 { 660 immutable robotstxt = 661 "user-agent: FooBot\n" ~ 662 "allow: /folder\n" ~ 663 "disallow: /folder\n"; 664 immutable url = "http://example.com/folder/page"; 665 assert (IsUserAgentAllowed(robotstxt, "FooBot", url)); 666 } 667 { 668 immutable robotstxt = 669 "user-agent: FooBot\n" ~ 670 "allow: /page\n" ~ 671 "disallow: /*.htm\n"; 672 immutable url = "http://example.com/page.htm"; 673 assert (!IsUserAgentAllowed(robotstxt, "FooBot", url)); 674 } 675 { 676 immutable robotstxt = 677 "user-agent: FooBot\n" ~ 678 "allow: /$\n" ~ 679 "disallow: /\n"; 680 immutable url = "http://example.com/"; 681 immutable url_page = "http://example.com/page.html"; 682 assert (IsUserAgentAllowed(robotstxt, "FooBot", url)); 683 assert (!IsUserAgentAllowed(robotstxt, "FooBot", url_page)); 684 } 685 } 686 687 version (unittest) 688 { 689 class RobotsStatsReporter : RobotsParseHandler 690 { 691 public: 692 override void HandleRobotsStart() 693 { 694 last_line_seen_ = 0; 695 valid_directives_ = 0; 696 unknown_directives_ = 0; 697 sitemap_ = ""; 698 } 699 override void HandleRobotsEnd() {} 700 701 override void HandleUserAgent(int line_num, string value) 702 { 703 Digest(line_num); 704 } 705 override void HandleAllow(int line_num, string value) 706 { 707 Digest(line_num); 708 } 709 override void HandleDisallow(int line_num, string value) 710 { 711 Digest(line_num); 712 } 713 714 override void HandleSitemap(int line_num, string value) 715 { 716 Digest(line_num); 717 sitemap_ ~= value; 718 } 719 720 // Any other unrecognized name/v pairs. 
        override void HandleUnknownAction(int line_num, string action, string value)
        {
            last_line_seen_ = line_num;
            unknown_directives_++;
        }

        // Line number of the last directive seen (valid or unknown).
        int last_line_seen() const { return last_line_seen_; }

        // All directives found, including unknown.
        int valid_directives() const { return valid_directives_; }

        // Number of unknown directives.
        int unknown_directives() const { return unknown_directives_; }

        // Parsed sitemap line.
        string sitemap() const { return sitemap_; }

    private:
        // Records a valid directive at line_num; line numbers must not decrease.
        void Digest(int line_num) @safe
        {
            assert (line_num >= last_line_seen_);
            last_line_seen_ = line_num;
            valid_directives_++;
        }

        int last_line_seen_ = 0;
        int valid_directives_ = 0;
        int unknown_directives_ = 0;
        string sitemap_;
    }
}

// Different kinds of line endings are all supported: %x0D / %x0A / %x0D.0A
unittest
{
    // ID_LinesNumbersAreCountedCorrectly

    auto report = new RobotsStatsReporter;

    immutable string kUnixFile =
        "User-Agent: foo\n" ~
        "Allow: /some/path\n" ~
        "User-Agent: bar\n" ~
        "\n" ~
        "\n" ~
        "Disallow: /\n";
    ParseRobotsTxt(kUnixFile, report);
    assert (report.valid_directives() == 4);
    assert (report.last_line_seen() == 6);

    immutable string kDosFile =
        "User-Agent: foo\r\n" ~
        "Allow: /some/path\r\n" ~
        "User-Agent: bar\r\n" ~
        "\r\n" ~
        "\r\n" ~
        "Disallow: /\r\n";
    ParseRobotsTxt(kDosFile, report);
    assert (report.valid_directives() == 4);
    assert (report.last_line_seen() == 6);

    immutable string kMacFile =
        "User-Agent: foo\r" ~
        "Allow: /some/path\r" ~
        "User-Agent: bar\r" ~
        "\r" ~
        "\r" ~
        "Disallow: /\r";
    ParseRobotsTxt(kMacFile, report);
    assert (report.valid_directives() == 4);
    assert (report.last_line_seen() == 6);

    immutable string kNoFinalNewline =
        "User-Agent: foo\n" ~
        "Allow: /some/path\n" ~
        "User-Agent: bar\n" ~
        "\n" ~
        "\n" ~
        "Disallow: /";
    ParseRobotsTxt(kNoFinalNewline, report);
    assert (report.valid_directives() == 4);
    assert (report.last_line_seen() == 6);

    immutable string kMixedFile =
        "User-Agent: foo\n" ~
        "Allow: /some/path\r\n" ~
        "User-Agent: bar\n" ~
        "\r\n" ~
        "\n" ~
        "Disallow: /";
    ParseRobotsTxt(kMixedFile, report);
    assert (report.valid_directives() == 4);
    assert (report.last_line_seen() == 6);
}

// BOM characters are unparseable and thus skipped. The rules following the line
// are used.
unittest
{
    // ID_UTF8ByteOrderMarkIsSkipped

    auto report = new RobotsStatsReporter;

    immutable kUtf8FileFullBOM =
        "\xEF\xBB\xBF" ~
        "User-Agent: foo\n" ~
        "Allow: /AnyValue\n";
    ParseRobotsTxt(kUtf8FileFullBOM, report);
    assert (report.valid_directives() == 2);
    assert (report.unknown_directives() == 0);

    // We allow as well partial ByteOrderMarks.
    immutable kUtf8FilePartial2BOM =
        "\xEF\xBB" ~
        "User-Agent: foo\n" ~
        "Allow: /AnyValue\n";
    ParseRobotsTxt(kUtf8FilePartial2BOM, report);
    assert (report.valid_directives() == 2);
    assert (report.unknown_directives() == 0);

    immutable kUtf8FilePartial1BOM =
        "\xEF" ~
        "User-Agent: foo\n" ~
        "Allow: /AnyValue\n";
    ParseRobotsTxt(kUtf8FilePartial1BOM, report);
    assert (report.valid_directives() == 2);
    assert (report.unknown_directives() == 0);

    // If the BOM is not the right sequence, the first line looks like garbage
    // that is skipped (we essentially see "\x11\xBFUser-Agent").
    immutable kUtf8FileBrokenBOM =
        "\xEF\x11\xBF" ~
        "User-Agent: foo\n" ~
        "Allow: /AnyValue\n";
    ParseRobotsTxt(kUtf8FileBrokenBOM, report);
    assert (report.valid_directives() == 1);
    assert (report.unknown_directives() == 1); // We get one broken line.

    // Some other messed up file: BOMs only valid in the beginning of the file.
    immutable kUtf8BOMSomewhereInMiddleOfFile =
        "User-Agent: foo\n" ~
        "\xEF\xBB\xBF" ~
        "Allow: /AnyValue\n";
    ParseRobotsTxt(kUtf8BOMSomewhereInMiddleOfFile, report);
    assert (report.valid_directives() == 1);
    assert (report.unknown_directives() == 1);
}

// Google specific: the I-D allows any line that crawlers might need, such as
// sitemaps, which Google supports.
// See REP I-D section "Other records".
// https://tools.ietf.org/html/draft-koster-rep#section-2.2.4
unittest
{
    // ID_NonStandardLineExample_Sitemap
    auto report = new RobotsStatsReporter;

    {
        auto sitemap_loc = "http://foo.bar/sitemap.xml";
        auto robotstxt =
            "User-Agent: foo\n" ~
            "Allow: /some/path\n" ~
            "User-Agent: bar\n" ~
            "\n" ~
            "\n" ~
            "Sitemap: " ~ sitemap_loc ~ "\n";
        ParseRobotsTxt(robotstxt, report);
        assert (sitemap_loc == report.sitemap);
    }

    // A sitemap line may appear anywhere in the file.
    {
        auto sitemap_loc = "http://foo.bar/sitemap.xml";
        auto robotstxt =
            "Sitemap: " ~ sitemap_loc ~ "\n" ~
            "User-Agent: foo\n" ~
            "Allow: /some/path\n" ~
            "User-Agent: bar\n" ~
            "\n" ~
            "\n";
        ParseRobotsTxt(robotstxt, report);
        assert (sitemap_loc == report.sitemap);
    }
}

version (unittest)
{
    // Asserts that GetPathParamsQuery(url) produces expected_path.
    void TestPath(string url, string expected_path)
    {
        assert (expected_path == GetPathParamsQuery(url));
    }

    // Asserts that MaybeEscapePattern(url) produces expected.
    void TestEscape(string url, string expected)
    {
        assert (expected == MaybeEscapePattern(url));
    }
}

unittest
{
    // TestGetPathParamsQuery
    // Only testing URLs that are already correctly escaped here.
    TestPath("", "/");
    TestPath("http://www.example.com", "/");
    TestPath("http://www.example.com/", "/");
    TestPath("http://www.example.com/a", "/a");
    TestPath("http://www.example.com/a/", "/a/");
    TestPath("http://www.example.com/a/b?c=http://d.e/", "/a/b?c=http://d.e/");
    TestPath("http://www.example.com/a/b?c=d&e=f#fragment", "/a/b?c=d&e=f");
    TestPath("example.com", "/");
    TestPath("example.com/", "/");
    TestPath("example.com/a", "/a");
    TestPath("example.com/a/", "/a/");
    TestPath("example.com/a/b?c=d&e=f#fragment", "/a/b?c=d&e=f");
    TestPath("a", "/");
    TestPath("a/", "/");
    TestPath("/a", "/a");
    TestPath("a/b", "/b");
    TestPath("example.com?a", "/?a");
    TestPath("example.com/a;b#c", "/a;b");
    TestPath("//a/b/c", "/b/c");
}

unittest
{
    // TestMaybeEscapePattern
    TestEscape("http://www.example.com", "http://www.example.com");
    TestEscape("/a/b/c", "/a/b/c");
    TestEscape("á", "%C3%A1");
    TestEscape("%aa", "%AA");
}