1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // This file tests the robots.txt parsing and matching code found in robots.cc
16 // against the current Robots Exclusion Protocol (REP) internet draft (I-D).
17 // https://tools.ietf.org/html/draft-koster-rep
18 
19 module unrobotstxt.test;
20 
21 import unrobotstxt;
22 
version (unittest)
{
	/// Convenience wrapper for the tests below: parses `robotstxt` and
	/// reports whether `useragent` is permitted to fetch `url` under it.
	bool IsUserAgentAllowed(string robotstxt, string useragent, string url)
	{
		return (new RobotsMatcher).OneAgentAllowedByRobots(robotstxt, useragent, url);
	}
}
31 
32 // Google-specific: system test.
unittest
{
	// GoogleOnly_SystemTest
	// Minimal file: one group that disallows everything for FooBot.
	immutable robotstxt =
		"user-agent: FooBot\n" ~
		"disallow: /\n";
	// Empty robots.txt: everything allowed.
	assert (IsUserAgentAllowed("", "FooBot", ""));

	// Empty user-agent to be matched: everything allowed.
	assert (IsUserAgentAllowed(robotstxt, "", ""));

	// Empty url: implicitly disallowed, see method comment for GetPathParamsQuery
	// in robots.cc.
	assert (!IsUserAgentAllowed(robotstxt, "FooBot", ""));

	// All params empty: same as robots.txt empty, everything allowed.
	assert (IsUserAgentAllowed("", "", ""));
}
52 // Rules are colon separated name-value pairs. The following names are
53 // provisioned:
54 //     user-agent: <value>
55 //     allow: <value>
56 //     disallow: <value>
57 // See REP I-D section "Protocol Definition".
58 // https://tools.ietf.org/html/draft-koster-rep#section-2.1
59 //
60 // Google specific: webmasters sometimes miss the colon separator, but it's
61 // obvious what they mean by "disallow /", so we assume the colon if it's
62 // missing.
unittest
{
	// ID_LineSyntax_Line
	// Well-formed "name: value" rule lines.
	immutable robotstxt_correct =
		"user-agent: FooBot\n" ~
		"disallow: /\n";
	// Unknown directive names: no group forms, so nothing is disallowed.
	immutable robotstxt_incorrect =
		"foo: FooBot\n" ~
		"bar: /\n";
	// Known directive names with the colon missing: Google assumes the colon.
	immutable robotstxt_incorrect_accepted =
		"user-agent FooBot\n" ~
		"disallow /\n";
	immutable url = "http://foo.bar/x/y";

	assert (!IsUserAgentAllowed(robotstxt_correct, "FooBot", url));
	assert (IsUserAgentAllowed(robotstxt_incorrect, "FooBot", url));
	assert (!IsUserAgentAllowed(robotstxt_incorrect_accepted, "FooBot", url));
}
81 
// A group is one or more user-agent lines followed by rules, and terminated
// by another user-agent line. Rules for the same user-agent are combined
// opaquely into one group. Rules outside groups are ignored.
85 // See REP I-D section "Protocol Definition".
86 // https://tools.ietf.org/html/draft-koster-rep#section-2.1
unittest
{
	// ID_LineSyntax_Groups
	// The leading "allow: /foo/bar/" and the "allow: /w/" after two blank
	// lines sit outside any user-agent group and must be ignored. The two
	// FooBot groups (lines 3-5 and the trailing one) are merged.
	immutable robotstxt =
		"allow: /foo/bar/\n" ~
		"\n" ~
		"user-agent: FooBot\n" ~
		"disallow: /\n" ~
		"allow: /x/\n" ~
		"user-agent: BarBot\n" ~
		"disallow: /\n" ~
		"allow: /y/\n" ~
		"\n" ~
		"\n" ~
		"allow: /w/\n" ~
		"user-agent: BazBot\n" ~
		"\n" ~
		"user-agent: FooBot\n" ~
		"allow: /z/\n" ~
		"disallow: /\n";

	immutable url_w = "http://foo.bar/w/a";
	immutable url_x = "http://foo.bar/x/b";
	immutable url_y = "http://foo.bar/y/c";
	immutable url_z = "http://foo.bar/z/d";
	immutable url_foo = "http://foo.bar/foo/bar/";

	assert (IsUserAgentAllowed(robotstxt, "FooBot", url_x));
	// /z/ comes from the second FooBot group, merged with the first.
	assert (IsUserAgentAllowed(robotstxt, "FooBot", url_z));
	assert (!IsUserAgentAllowed(robotstxt, "FooBot", url_y));
	assert (IsUserAgentAllowed(robotstxt, "BarBot", url_y));
	assert (IsUserAgentAllowed(robotstxt, "BarBot", url_w));
	assert (!IsUserAgentAllowed(robotstxt, "BarBot", url_z));
	assert (IsUserAgentAllowed(robotstxt, "BazBot", url_z));

	// Lines with rules outside groups are ignored.
	assert (!IsUserAgentAllowed(robotstxt, "FooBot", url_foo));
	assert (!IsUserAgentAllowed(robotstxt, "BarBot", url_foo));
	assert (!IsUserAgentAllowed(robotstxt, "BazBot", url_foo));
}
127 
128 // REP lines are case insensitive. See REP I-D section "Protocol Definition".
129 // https://tools.ietf.org/html/draft-koster-rep#section-2.1
unittest
{
	// ID_REPLineNamesCaseInsensitive
	immutable robotstxt_upper =
		"USER-AGENT: FooBot\n" ~
		"ALLOW: /x/\n" ~
		"DISALLOW: /\n";
	immutable robotstxt_lower =
		"user-agent: FooBot\n" ~
		"allow: /x/\n" ~
		"disallow: /\n";
	immutable robotstxt_camel =
		"uSeR-aGeNt: FooBot\n" ~
		"AlLoW: /x/\n" ~
		"dIsAlLoW: /\n";
	immutable url_allowed = "http://foo.bar/x/y";
	immutable url_disallowed = "http://foo.bar/a/b";

	// Every capitalization of the directive names must behave identically.
	foreach (txt; [robotstxt_upper, robotstxt_lower, robotstxt_camel])
	{
		assert (IsUserAgentAllowed(txt, "FooBot", url_allowed));
		assert (!IsUserAgentAllowed(txt, "FooBot", url_disallowed));
	}
}
155 
156 // A user-agent line is expected to contain only [a-zA-Z_-] characters and must
157 // not be empty. See REP I-D section "The user-agent line".
158 // https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
unittest
{
	// ID_VerifyValidUserAgentsToObey
	// Product tokens made of [a-zA-Z_-] only are valid.
	static immutable string[] validAgents = ["Foobot", "Foobot-Bar", "Foo_Bar"];
	foreach (agent; validAgents)
		assert (RobotsMatcher.IsValidUserAgentToObey(agent));

	// Empty values, non-ASCII, globbing characters, surrounding spaces,
	// version suffixes and embedded spaces are all rejected.
	static immutable string[] invalidAgents =
		["", "ツ", "Foobot*", " Foobot ", "Foobot/2.1", "Foobot Bar"];
	foreach (agent; invalidAgents)
		assert (!RobotsMatcher.IsValidUserAgentToObey(agent));
}
175 
176 // User-agent line values are case insensitive. See REP I-D section "The
177 // user-agent line".
178 // https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
unittest
{
	// ID_UserAgentValueCaseInsensitive
	immutable robotstxt_upper =
		"User-Agent: FOO BAR\n" ~
		"Allow: /x/\n" ~
		"Disallow: /\n";
	immutable robotstxt_lower =
		"User-Agent: foo bar\n" ~
		"Allow: /x/\n" ~
		"Disallow: /\n";
	immutable robotstxt_camel =
		"User-Agent: FoO bAr\n" ~
		"Allow: /x/\n" ~
		"Disallow: /\n";
	immutable url_allowed = "http://foo.bar/x/y";
	immutable url_disallowed = "http://foo.bar/a/b";

	// Any capitalization of the user-agent value in the file must match any
	// capitalization of the crawler name supplied by the caller.
	foreach (txt; [robotstxt_upper, robotstxt_lower, robotstxt_camel])
	{
		foreach (agent; ["Foo", "foo"])
		{
			assert (IsUserAgentAllowed(txt, agent, url_allowed));
			assert (!IsUserAgentAllowed(txt, agent, url_disallowed));
		}
	}
}
210 
211 // Google specific: accept user-agent value up to the first space. Space is not
212 // allowed in user-agent values, but that doesn't stop webmasters from using
213 // them. This is more restrictive than the I-D, since in case of the bad value
214 // "Googlebot Images" we'd still obey the rules with "Googlebot".
215 // Extends REP I-D section "The user-agent line"
216 // https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
unittest
{
	// GoogleOnly_AcceptUserAgentUpToFirstSpace
	// "Foobot Bar" is not a valid user-agent token by itself.
	assert (!RobotsMatcher.IsValidUserAgentToObey("Foobot Bar"));
	immutable robotstxt =
		"User-Agent: *\n" ~
		"Disallow: /\n" ~
		"User-Agent: Foo Bar\n" ~
		"Allow: /x/\n" ~
		"Disallow: /\n";
	immutable url = "http://foo.bar/x/y";

	// The "Foo Bar" group is read as "Foo" (value up to the first space),
	// so crawler "Foo" picks up that group's allow rule...
	assert (IsUserAgentAllowed(robotstxt, "Foo", url));
	// ...while a crawler named "Foo Bar" matches no specific group and falls
	// back to the "*" group, which disallows everything.
	assert (!IsUserAgentAllowed(robotstxt, "Foo Bar", url));
}
232 
233 // If no group matches the user-agent, crawlers must obey the first group with a
234 // user-agent line with a "*" value, if present. If no group satisfies either
235 // condition, or no groups are present at all, no rules apply.
236 // See REP I-D section "The user-agent line".
237 // https://tools.ietf.org/html/draft-koster-rep#section-2.2.1
unittest
{
	// ID_GlobalGroups_Secondary
	immutable robotstxt_empty = "";
	immutable robotstxt_global =
		"user-agent: *\n" ~
		"allow: /\n" ~
		"user-agent: FooBot\n" ~
		"disallow: /\n";
	immutable robotstxt_only_specific =
		"user-agent: FooBot\n" ~
		"allow: /\n" ~
		"user-agent: BarBot\n" ~
		"disallow: /\n" ~
		"user-agent: BazBot\n" ~
		"disallow: /\n";
	immutable url = "http://foo.bar/x/y";

	// No rules at all: allowed.
	assert (IsUserAgentAllowed(robotstxt_empty, "FooBot", url));
	// A specific group beats the "*" group for FooBot...
	assert (!IsUserAgentAllowed(robotstxt_global, "FooBot", url));
	// ...while BarBot, with no specific group, obeys the "*" group.
	assert (IsUserAgentAllowed(robotstxt_global, "BarBot", url));
	// No matching group and no "*" group: no rules apply, allowed.
	assert (IsUserAgentAllowed(robotstxt_only_specific, "QuxBot", url));
}
261 
262 // Matching rules against URIs is case sensitive.
263 // See REP I-D section "The Allow and Disallow lines".
264 // https://tools.ietf.org/html/draft-koster-rep#section-2.2.2
unittest
{
	// ID_AllowDisallow_Value_CaseSensitive
	// Unlike line names and user-agent values, rule paths are case sensitive.
	immutable robotstxt_lowercase_url =
		"user-agent: FooBot\n" ~
		"disallow: /x/\n";
	immutable robotstxt_uppercase_url =
		"user-agent: FooBot\n" ~
		"disallow: /X/\n";
	immutable url = "http://foo.bar/x/y";

	assert (!IsUserAgentAllowed(robotstxt_lowercase_url, "FooBot", url));
	// "/X/" does not match the lowercase "/x/" path.
	assert (IsUserAgentAllowed(robotstxt_uppercase_url, "FooBot", url));
}
279 
280 // The most specific match found MUST be used. The most specific match is the
281 // match that has the most octets. In case of multiple rules with the same
282 // length, the least strict rule must be used.
283 // See REP I-D section "The Allow and Disallow lines".
284 // https://tools.ietf.org/html/draft-koster-rep#section-2.2.2
unittest
{
	// ID_LongestMatch
	immutable url = "http://foo.bar/x/page.html";
	// The longer disallow pattern beats the shorter allow.
	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"disallow: /x/page.html\n" ~
			"allow: /x/\n";

		assert (!IsUserAgentAllowed(robotstxt, "FooBot", url));
	}
	// The longer allow pattern beats the shorter disallow.
	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"allow: /x/page.html\n" ~
			"disallow: /x/\n";

		assert (IsUserAgentAllowed(robotstxt, "FooBot", url));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/"));
	}
	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"disallow: \n" ~
			"allow: \n";
		// In case of equivalent disallow and allow patterns for the same
		// user-agent, allow is used.
		assert (IsUserAgentAllowed(robotstxt, "FooBot", url));
	}
	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"disallow: /\n" ~
			"allow: /\n";
		// In case of equivalent disallow and allow patterns for the same
		// user-agent, allow is used.
		assert (IsUserAgentAllowed(robotstxt, "FooBot", url));
	}
	// "/x/" (4 octets) is more specific than "/x" (2 octets).
	{
		immutable url_a = "http://foo.bar/x";
		immutable url_b = "http://foo.bar/x/";
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"disallow: /x\n" ~
			"allow: /x/\n";
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", url_a));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", url_b));
	}

	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"disallow: /x/page.html\n" ~
			"allow: /x/page.html\n";
		// In case of equivalent disallow and allow patterns for the same
		// user-agent, allow is used.
		assert (IsUserAgentAllowed(robotstxt, "FooBot", url));
	}
	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"allow: /page\n" ~
			"disallow: /*.html\n";
		// Longest match wins.
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/page.html"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/page"));
	}
	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"allow: /x/page.\n" ~
			"disallow: /*.html\n";
		// Longest match wins.
		assert (IsUserAgentAllowed(robotstxt, "FooBot", url));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/y.html"));
	}
	{
		immutable robotstxt =
			"User-agent: *\n" ~
			"Disallow: /x/\n" ~
			"User-agent: FooBot\n" ~
			"Disallow: /y/\n";
		// Most specific group for FooBot allows implicitly /x/page.
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/x/page"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/y/page"));
	}
}
373 
374 // Octets in the URI and robots.txt paths outside the range of the US-ASCII
375 // coded character set, and those in the reserved range defined by RFC3986,
376 // MUST be percent-encoded as defined by RFC3986 prior to comparison.
377 // See REP I-D section "The Allow and Disallow lines".
378 // https://tools.ietf.org/html/draft-koster-rep#section-2.2.2
379 //
380 // NOTE: It's up to the caller to percent encode a URL before passing it to the
381 // parser. Percent encoding URIs in the rules is unnecessary.
unittest
{
	// ID_Encoding
	// Reserved characters already inside a rule value are left unencoded:
	// "/foo/bar?baz=http://foo.bar" stays as written.
	{
		immutable robotstxt =
			"User-agent: FooBot\n" ~
			"Disallow: /\n" ~
			"Allow: /foo/bar?qux=taz&baz=http://foo.bar?tar&par\n";
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar?qux=taz&baz=http://foo.bar?tar&par"));
	}

	// A raw 3-byte UTF-8 character in a rule is percent-encoded by the
	// parser: /foo/bar/ツ -> /foo/bar/%E3%83%84. The URL argument, however,
	// is taken verbatim, so only the %-encoded URL matches.
	{
		immutable robotstxt =
			"User-agent: FooBot\n" ~
			"Disallow: /\n" ~
			"Allow: /foo/bar/ツ\n";
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/%E3%83%84"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/ツ"));
	}
	// A rule that is already percent-encoded behaves exactly the same.
	{
		immutable robotstxt =
			"User-agent: FooBot\n" ~
			"Disallow: /\n" ~
			"Allow: /foo/bar/%E3%83%84\n";
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/%E3%83%84"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/ツ"));
	}
	// Percent-encoding unreserved US-ASCII ("/foo/bar/%62%61%7A" for "baz")
	// is illegal per RFC3986. Matching here is plain string comparison, so
	// only the literal %-sequence matches — callers must not rely on this.
	{
		immutable robotstxt =
			"User-agent: FooBot\n" ~
			"Disallow: /\n" ~
			"Allow: /foo/bar/%62%61%7A\n";
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/baz"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/%62%61%7A"));
	}
}
427 
428 // The REP I-D defines the following characters that have special meaning in
429 // robots.txt:
430 // # - inline comment.
431 // $ - end of pattern.
432 // * - any number of characters.
433 // See REP I-D section "Special Characters".
434 // https://tools.ietf.org/html/draft-koster-rep#section-2.2.3
unittest
{
	// ID_SpecialCharacters
	// "*" matches any number of characters within a path.
	{
		immutable robotstxt =
			"User-agent: FooBot\n" ~
			"Disallow: /foo/bar/quz\n" ~
			"Allow: /foo/*/qux\n";
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/quz"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/quz"));
		// "*" also matches the empty string between the two slashes.
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo//quz"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bax/quz"));
	}
	// "$" anchors the pattern at the end of the path.
	{
		immutable robotstxt =
			"User-agent: FooBot\n" ~
			"Disallow: /foo/bar$\n" ~
			"Allow: /foo/bar/qux\n";
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/qux"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar/baz"));
	}
	// "#" starts an inline comment; the rest of the line is dropped, so the
	// disallow below is effectively "/foo/quz".
	{
		immutable robotstxt =
			"User-agent: FooBot\n" ~
			"# Disallow: /\n" ~
			"Disallow: /foo/quz#qux\n" ~
			"Allow: /\n";
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/bar"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/foo/quz"));
	}
}
468 
469 // Google-specific: "index.html" (and only that) at the end of a pattern is
470 // equivalent to "/".
unittest
{
	// GoogleOnly_IndexHTMLisDirectory
	immutable robotstxt =
		"User-Agent: *\n" ~
		"Allow: /allowed-slash/index.html\n" ~
		"Disallow: /\n";
	// If index.html is allowed, we interpret this as / being allowed too.
	assert (IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/allowed-slash/"));
	// Does not exactly match.
	assert (!IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/allowed-slash/index.htm"));
	// Exact match.
	assert (IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/allowed-slash/index.html"));
	assert (!IsUserAgentAllowed(robotstxt, "foobot", "http://foo.com/anyother-url"));
}
486 
487 // Google-specific: long lines are ignored after 8 * 2083 bytes. See comment in
488 // RobotsTxtParser::Parse().
unittest
{
	// GoogleOnly_LineTooLong
	immutable kEOLLen = "\n".length;
	// Lines longer than 8 * 2083 bytes are truncated by the parser.
	immutable size_t kMaxLineLen = 2083 * 8;
	immutable allow = "allow: ";
	immutable disallow = "disallow: ";

	// Disallow rule pattern matches the URL after being cut off at kMaxLineLen.
	{
		string robotstxt = "user-agent: FooBot\n";
		string longline = "/x/";
		// Pad the pattern so that "disallow: " + pattern reaches the cutoff
		// just before the trailing "/qux".
		immutable size_t max_length = kMaxLineLen - longline.length - disallow.length + kEOLLen;
		while (longline.length < max_length)
		{
			longline ~= "a";
		}
		robotstxt ~= disallow ~ longline ~ "/qux\n";

		// Matches nothing, so URL is allowed.
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fux"));
		// Matches cut off disallow rule.
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar" ~ longline ~ "/fux"));
	}

	{
		string robotstxt =
			"user-agent: FooBot\n" ~
			"disallow: /\n";
		string longline_a = "/x/";
		string longline_b = "/x/";
		immutable size_t max_length = kMaxLineLen - longline_a.length - allow.length + kEOLLen;
		while (longline_a.length < max_length)
		{
			longline_a ~= "a";
			longline_b ~= "b";
		}
		robotstxt ~= allow ~ longline_a ~ "/qux\n";
		robotstxt ~= allow ~ longline_b ~ "/qux\n";

		// URL matches the disallow rule.
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/"));
		// Matches the allow rule exactly.
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar" ~ longline_a ~ "/qux"));
		// Matches cut off allow rule.
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar" ~ longline_b ~ "/fux"));
	}
}
537 
unittest
{
	// GoogleOnly_DocumentationChecks
	// Test documentation from
	// https://developers.google.com/search/reference/robots_txt
	// Section "URL matching based on path values".
	// "/fish" is a prefix match: it matches /fish itself and anything
	// continuing after it.
	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"disallow: /\n" ~
			"allow: /fish\n";
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/salmon.html"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads/yummy.html"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html?id=anything"));

		// Path matching is case sensitive, and the prefix must start the path.
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.asp"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/catfish"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/?id=fish"));
	}
	// "/fish*" equals "/fish"
	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"disallow: /\n" ~
			"allow: /fish*\n";
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/salmon.html"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads/yummy.html"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html?id=anything"));

		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.bar"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/catfish"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/?id=fish"));
	}
	// "/fish/" does not equal "/fish"
	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"disallow: /\n" ~
			"allow: /fish/\n";
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/salmon"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/?salmon"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/salmon.html"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish/?id=anything"));

		// The trailing slash makes the rule a directory match only.
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.html"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish/Salmon.html"));
	}
	// "/*.php"
	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"disallow: /\n" ~
			"allow: /*.php\n";
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/folder/filename.php"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/folder/filename.php?parameters"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar//folder/any.php.file.html"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php/"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/index?f=filename.php/"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/php/"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/index?php"));

		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/windows.PHP"));
	}
	// "/*.php$"
	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"disallow: /\n" ~
			"allow: /*.php$\n";
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php"));
		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/folder/filename.php"));

		// "$" anchoring: anything after ".php" breaks the match.
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php?parameters"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php/"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename.php5"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/php/"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/filename?php"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/aaaphpaaa"));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar//windows.PHP"));
	}
	// "/fish*.php"
	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"disallow: /\n" ~
			"allow: /fish*.php\n";
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/bar"));

		assert (IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fish.php"));
		assert ( IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/fishheads/catfish.php?parameters"));

		assert (!IsUserAgentAllowed(robotstxt, "FooBot", "http://foo.bar/Fish.PHP"));
	}
	// Section "Order of precedence for group-member records".
	// Longest match wins: "/p" (allow) vs "/" (disallow) on "/page".
	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"allow: /p\n" ~
			"disallow: /\n";
		immutable url = "http://example.com/page";
		assert (IsUserAgentAllowed(robotstxt, "FooBot", url));
	}
	// Equal-length allow and disallow: the least strict rule (allow) wins.
	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"allow: /folder\n" ~
			"disallow: /folder\n";
		immutable url = "http://example.com/folder/page";
		assert (IsUserAgentAllowed(robotstxt, "FooBot", url));
	}
	// "/*.htm" matches more octets of "/page.htm" than "/page" does.
	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"allow: /page\n" ~
			"disallow: /*.htm\n";
		immutable url = "http://example.com/page.htm";
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", url));
	}
	// "/$" allows exactly the root URL and nothing below it.
	{
		immutable robotstxt =
			"user-agent: FooBot\n" ~
			"allow: /$\n" ~
			"disallow: /\n";
		immutable url = "http://example.com/";
		immutable url_page = "http://example.com/page.html";
		assert (IsUserAgentAllowed(robotstxt, "FooBot", url));
		assert (!IsUserAgentAllowed(robotstxt, "FooBot", url_page));
	}
}
686 
version (unittest)
{
	/// Parse handler that records statistics about the parsed file: the last
	/// line number seen, counts of recognized and unrecognized directives,
	/// and the concatenation of all sitemap values. Used by the line-counting
	/// and BOM tests below.
	class RobotsStatsReporter : RobotsParseHandler
	{
		public:
			// Reset all counters at the start of each parse, so one reporter
			// instance can be reused across several ParseRobotsTxt calls.
			override void HandleRobotsStart() 
			{
				last_line_seen_ = 0;
				valid_directives_ = 0;
				unknown_directives_ = 0;
				sitemap_ = "";
			}
			override void HandleRobotsEnd() {}

			override void HandleUserAgent(int line_num, string value) 
			{
				Digest(line_num);
			}
			override void HandleAllow(int line_num, string value) 
			{
				Digest(line_num);
			}
			override void HandleDisallow(int line_num, string value) 
			{
				Digest(line_num);
			}

			override void HandleSitemap(int line_num, string value) 
			{
				Digest(line_num);
				sitemap_ ~= value;
			}

			// Any other unrecognized name/v pairs.
			override void HandleUnknownAction(int line_num, string action, string value) 
			{
				last_line_seen_ = line_num;
				unknown_directives_++;
			}

			int last_line_seen() const { return last_line_seen_; }

			// All directives found, including unknown.
			int valid_directives() const { return valid_directives_; }

			// Number of unknown directives.
			int unknown_directives() const { return unknown_directives_; }

			// Parsed sitemap line.
			string sitemap() const { return sitemap_; }

		private:
			// Record a recognized directive; line numbers must never go
			// backwards within one parse.
			void Digest(int line_num) @safe
			{
				assert (line_num >= last_line_seen_);
				last_line_seen_ = line_num;
				valid_directives_++;
			}

			int last_line_seen_ = 0;
			int valid_directives_ = 0;
			int unknown_directives_ = 0;
			string sitemap_;
	}
}
752 
753 // Different kinds of line endings are all supported: %x0D / %x0A / %x0D.0A
unittest
{
	// ID_LinesNumbersAreCountedCorrectly
	// Each file below has the same 6 logical lines (4 directives, 2 blanks)
	// with different line terminators; the parser must report identical
	// statistics for all of them.

	auto report = new RobotsStatsReporter;

	// Unix line endings (LF).
	immutable string kUnixFile =
		"User-Agent: foo\n" ~
		"Allow: /some/path\n" ~
		"User-Agent: bar\n" ~
		"\n" ~
		"\n" ~
		"Disallow: /\n";
	ParseRobotsTxt(kUnixFile, report);
	assert (report.valid_directives() == 4);
	assert (report.last_line_seen() == 6);

	// DOS line endings (CRLF).
	immutable string kDosFile =
		"User-Agent: foo\r\n" ~
		"Allow: /some/path\r\n" ~
		"User-Agent: bar\r\n" ~
		"\r\n" ~
		"\r\n" ~
		"Disallow: /\r\n";
	ParseRobotsTxt(kDosFile, report);
	assert (report.valid_directives() == 4);
	assert (report.last_line_seen() == 6);

	// Classic Mac line endings (CR only).
	immutable string kMacFile =
		"User-Agent: foo\r" ~
		"Allow: /some/path\r" ~
		"User-Agent: bar\r" ~
		"\r" ~
		"\r" ~
		"Disallow: /\r";
	ParseRobotsTxt(kMacFile, report);
	assert (report.valid_directives() == 4);
	assert (report.last_line_seen() == 6);

	// A missing final newline must not drop the last directive.
	immutable string kNoFinalNewline =
		"User-Agent: foo\n" ~
		"Allow: /some/path\n" ~
		"User-Agent: bar\n" ~
		"\n" ~
		"\n" ~
		"Disallow: /";
	ParseRobotsTxt(kNoFinalNewline, report);
	assert (report.valid_directives() == 4);
	assert (report.last_line_seen() == 6);

	// Mixed terminators within one file.
	immutable string kMixedFile =
		"User-Agent: foo\n" ~
		"Allow: /some/path\r\n" ~
		"User-Agent: bar\n" ~
		"\r\n" ~
		"\n" ~
		"Disallow: /";
	ParseRobotsTxt(kMixedFile, report);
	assert (report.valid_directives() == 4);
	assert (report.last_line_seen() == 6);
}
815 
816 // BOM characters are unparseable and thus skipped. The rules following the line
817 // are used.
unittest
{
	// ID_UTF8ByteOrderMarkIsSkipped

	auto report = new RobotsStatsReporter;

	// Full UTF-8 BOM before the first directive: skipped, both directives
	// parse cleanly.
	immutable kUtf8FileFullBOM =
		"\xEF\xBB\xBF" ~
		"User-Agent: foo\n" ~
		"Allow: /AnyValue\n";
	ParseRobotsTxt(kUtf8FileFullBOM, report);
	assert (report.valid_directives() == 2);
	assert (report.unknown_directives() == 0);

	// We allow as well partial ByteOrderMarks.
	immutable kUtf8FilePartial2BOM =
		"\xEF\xBB" ~
		"User-Agent: foo\n" ~
		"Allow: /AnyValue\n";
	ParseRobotsTxt(kUtf8FilePartial2BOM, report);
	assert (report.valid_directives() == 2);
	assert (report.unknown_directives() == 0);

	immutable kUtf8FilePartial1BOM =
		"\xEF" ~
		"User-Agent: foo\n" ~
		"Allow: /AnyValue\n";
	ParseRobotsTxt(kUtf8FilePartial1BOM, report);
	assert (report.valid_directives() == 2);
	assert (report.unknown_directives() == 0);

	// If the BOM is not the right sequence, the first line looks like garbage
	// that is skipped (we essentially see "\x11\xBFUser-Agent").
	immutable kUtf8FileBrokenBOM =
		"\xEF\x11\xBF" ~
		"User-Agent: foo\n" ~
		"Allow: /AnyValue\n";
	ParseRobotsTxt(kUtf8FileBrokenBOM, report);
	assert (report.valid_directives() == 1);
	assert (report.unknown_directives() == 1);  // We get one broken line.

	// Some other messed up file: BOMs only valid in the beginning of the file.
	immutable kUtf8BOMSomewhereInMiddleOfFile =
		"User-Agent: foo\n" ~
		"\xEF\xBB\xBF" ~
		"Allow: /AnyValue\n";
	ParseRobotsTxt(kUtf8BOMSomewhereInMiddleOfFile, report);
	assert (report.valid_directives() == 1);
	assert (report.unknown_directives() == 1);
}
868 
869 // Google specific: the I-D allows any line that crawlers might need, such as
870 // sitemaps, which Google supports.
871 // See REP I-D section "Other records".
872 // https://tools.ietf.org/html/draft-koster-rep#section-2.2.4
unittest
{
	// ID_NonStandardLineExample_Sitemap
	auto report = new RobotsStatsReporter;

	// Sitemap line after the groups.
	{
		auto sitemap_loc = "http://foo.bar/sitemap.xml";
		auto robotstxt =
			"User-Agent: foo\n" ~
			"Allow: /some/path\n" ~
			"User-Agent: bar\n" ~
			"\n" ~
			"\n" ~
			"Sitemap: " ~ sitemap_loc ~ "\n";
		ParseRobotsTxt(robotstxt, report);
		assert (sitemap_loc == report.sitemap);
	}

	// A sitemap line may appear anywhere in the file.
	{
		auto sitemap_loc = "http://foo.bar/sitemap.xml";
		auto robotstxt =
			"Sitemap: " ~ sitemap_loc ~ "\n" ~
			"User-Agent: foo\n" ~
			"Allow: /some/path\n" ~
			"User-Agent: bar\n" ~
			"\n" ~
			"\n";
		ParseRobotsTxt(robotstxt, report);
		assert (sitemap_loc == report.sitemap);
	}
}
905 
version (unittest)
{
	/// Asserts that extracting path + params + query from `url` yields
	/// `expected_path`.
	void TestPath(string url, string expected_path)
	{
		immutable actual = GetPathParamsQuery(url);
		assert (actual == expected_path);
	}

	/// Asserts that percent-escaping the pattern `url` yields `expected`.
	void TestEscape(string url, string expected)
	{
		immutable actual = MaybeEscapePattern(url);
		assert (actual == expected);
	}
}
918 
unittest
{
	// TestGetPathParamsQuery
	// Only testing URLs that are already correctly escaped here.
	// Bare hosts and empty input normalize to "/".
	TestPath("", "/");
	TestPath("http://www.example.com", "/");
	TestPath("http://www.example.com/", "/");
	TestPath("http://www.example.com/a", "/a");
	TestPath("http://www.example.com/a/", "/a/");
	// Query strings are kept, including embedded URLs.
	TestPath("http://www.example.com/a/b?c=http://d.e/", "/a/b?c=http://d.e/");
	// Fragments are stripped.
	TestPath("http://www.example.com/a/b?c=d&e=f#fragment", "/a/b?c=d&e=f");
	// Scheme-less inputs behave the same way.
	TestPath("example.com", "/");
	TestPath("example.com/", "/");
	TestPath("example.com/a", "/a");
	TestPath("example.com/a/", "/a/");
	TestPath("example.com/a/b?c=d&e=f#fragment", "/a/b?c=d&e=f");
	// A single token is treated as a host, not a path.
	TestPath("a", "/");
	TestPath("a/", "/");
	TestPath("/a", "/a");
	TestPath("a/b", "/b");
	TestPath("example.com?a", "/?a");
	TestPath("example.com/a;b#c", "/a;b");
	// "//a" is authority syntax, so the path starts at "/b/c".
	TestPath("//a/b/c", "/b/c");
}
943 
unittest
{
	// TestMaybeEscapePattern
	static immutable string[2][] cases = [
		// Plain ASCII passes through untouched.
		["http://www.example.com", "http://www.example.com"],
		["/a/b/c", "/a/b/c"],
		// Non-ASCII characters are percent-encoded.
		["á", "%C3%A1"],
		// Existing escape sequences are upper-cased.
		["%aa", "%AA"],
	];
	foreach (c; cases)
		TestEscape(c[0], c[1]);
}