// Copyright 1999 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// -----------------------------------------------------------------------------
// [Translated from] File: robots.cc
// -----------------------------------------------------------------------------
//
// Implements the expired internet draft
//   http://www.robotstxt.org/norobots-rfc.txt
// with Google-specific optimizations detailed at
//   https://developers.google.com/search/reference/robots_txt

module unrobotstxt;

@safe:

import std.algorithm;
import std.array;
import std.ascii;
import std.container.array;
import std.conv;
import std.exception;
import std.range;
import std.typecons : Rebindable;
import std.utf;

/// Handler for directives found in robots.txt. These callbacks are called by
/// ParseRobotsTxt() in the sequence they have been found in the file.
abstract class RobotsParseHandler
{
public:

    abstract void HandleRobotsStart();
    abstract void HandleRobotsEnd();

    abstract void HandleUserAgent(int line_num, string value);
    abstract void HandleAllow(int line_num, string value);
    abstract void HandleDisallow(int line_num, string value);

    abstract void HandleSitemap(int line_num, string value);

    /// Any other unrecognized name/value pairs.
    abstract void HandleUnknownAction(int line_num, string action, string value);
}

/// Parses the body of a robots.txt and emits parse callbacks. This will accept
/// typical typos found in robots.txt, such as 'disalow'.
///
/// Note, this function will accept all kinds of input but will skip
/// everything that does not look like a robots directive.
void ParseRobotsTxt(string robots_body, RobotsParseHandler parse_callback)
{
    auto parser = RobotsTxtParser(robots_body, parse_callback);
    parser.Parse();
}
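// Illustrative usage sketch, not part of the original robots.cc sources: a
// minimal RobotsParseHandler implementation that only records Sitemap
// directives reported by ParseRobotsTxt(). The handler name and the sample
// robots.txt body below are made up for this example.
unittest
{
    final class SitemapCollector : RobotsParseHandler
    {
    @safe:
        string[] sitemaps;

        override void HandleRobotsStart() {}
        override void HandleRobotsEnd() {}
        override void HandleUserAgent(int line_num, string value) {}
        override void HandleAllow(int line_num, string value) {}
        override void HandleDisallow(int line_num, string value) {}
        override void HandleSitemap(int line_num, string value) { sitemaps ~= value; }
        override void HandleUnknownAction(int line_num, string action, string value) {}
    }

    auto collector = new SitemapCollector();
    ParseRobotsTxt("user-agent: *\ndisallow: /private\nsitemap: https://example.com/sitemap.xml\n",
            collector);
    assert (collector.sitemaps == ["https://example.com/sitemap.xml"]);
}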
final class RobotsMatcher : RobotsParseHandler
{
public:

    /// Create a RobotsMatcher with the default matching strategy. The default
    /// matching strategy is longest-match as opposed to the former internet draft
    /// that provisioned first-match strategy. Analysis shows that longest-match,
    /// while more restrictive for crawlers, is what webmasters assume when writing
    /// directives. For example, in case of conflicting matches (both Allow and
    /// Disallow), the longest match is the one the user wants. For example, in
    /// case of a robots.txt file that has the following rules
    ///   Allow: /
    ///   Disallow: /cgi-bin
    /// it's pretty obvious what the webmaster wants: they want to allow crawl of
    /// every URI except /cgi-bin. However, according to the expired internet
    /// standard, crawlers should be allowed to crawl everything with such a rule.
    this()
    {
        match_strategy_ = new LongestMatchRobotsMatchStrategy();
    }

    /// Verifies that the given user agent is valid to be matched against
    /// robots.txt. Valid user agent strings only contain the characters
    /// [a-zA-Z_-].
    static bool IsValidUserAgentToObey(string user_agent)
    {
        return !user_agent.empty && extractUserAgent(user_agent) == user_agent;
    }
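    // Illustrative check, not part of the original test suite: a user-agent
    // token is only valid if it stops before the first character outside
    // [a-zA-Z_-]. The agent names are made up for this example.
    unittest
    {
        assert (IsValidUserAgentToObey("Foobot"));
        assert (!IsValidUserAgentToObey("Foobot/2.1"));
        assert (!IsValidUserAgentToObey(""));
    }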
    /// Returns true iff 'url' is allowed to be fetched by any member of the
    /// 'user_agents' array. 'url' must be %-encoded according to RFC3986.
    bool AllowedByRobots(string robots_body, const(string[]) user_agents, string url)
    {
        string path = GetPathParamsQuery(url);
        InitUserAgentsAndPath(user_agents, path);
        ParseRobotsTxt(robots_body, this);
        return !disallow();
    }

    /// Do robots check for 'url' when there is only one user agent. 'url' must
    /// be %-encoded according to RFC3986.
    bool OneAgentAllowedByRobots(string robots_txt, string user_agent, string url)
    {
        string[] v = [user_agent];
        return AllowedByRobots(robots_txt, v, url);
    }

    /// Returns true if we are disallowed from crawling a matching URI.
    bool disallow() const
    {
        if (allow_.specific.priority() > 0 || disallow_.specific.priority() > 0)
        {
            return (disallow_.specific.priority() > allow_.specific.priority());
        }

        if (ever_seen_specific_agent_)
        {
            // Matching group for user-agent but either without disallow or empty one,
            // i.e. priority == 0.
            return false;
        }

        if (disallow_.global.priority() > 0 || allow_.global.priority() > 0)
        {
            return disallow_.global.priority() > allow_.global.priority();
        }
        return false;
    }

    /// Returns true if we are disallowed from crawling a matching URI. Ignores any
    /// rules specified for the default user agent, and bases its results only on
    /// the specified user agents.
    bool disallow_ignore_global() const
    {
        if (allow_.specific.priority() > 0 || disallow_.specific.priority() > 0)
        {
            return disallow_.specific.priority() > allow_.specific.priority();
        }
        return false;
    }

    /// Returns true iff, when AllowedByRobots() was called, the robots file
    /// referred explicitly to one of the specified user agents.
    bool ever_seen_specific_agent() const
    {
        return ever_seen_specific_agent_;
    }

    /// Returns the line that matched or 0 if none matched.
    int matching_line() const
    {
        if (ever_seen_specific_agent_)
        {
            return Match.HigherPriorityMatch(&disallow_.specific, &allow_.specific).line();
        }
        return Match.HigherPriorityMatch(&disallow_.global, &allow_.global).line();
    }

protected:

    /// Parse callbacks.
    /// Protected because used in unittests. Never override RobotsMatcher, implement
    /// RobotsParseHandler instead.
    override void HandleRobotsStart()
    {
        // This is a new robots.txt file, so we need to reset all the instance member
        // variables. We do it in the same order the instance member variables are
        // declared, so it's easier to keep track of which ones we have (or maybe
        // haven't!) done.
        allow_.Clear();
        disallow_.Clear();

        seen_global_agent_ = false;
        seen_specific_agent_ = false;
        ever_seen_specific_agent_ = false;
        seen_separator_ = false;
    }

    override void HandleRobotsEnd()
    {
    }

    override void HandleUserAgent(int line_num, string user_agent)
    {
        if (seen_separator_)
        {
            seen_specific_agent_ = seen_global_agent_ = seen_separator_ = false;
        }

        // Google-specific optimization: a '*' followed by space and more characters
        // in a user-agent record is still regarded a global rule.
        if (user_agent.length >= 1 && user_agent[0] == '*'
            && (user_agent.length == 1 || std.ascii.isWhite(user_agent[1])))
        {
            seen_global_agent_ = true;
        }
        else
        {
            user_agent = extractUserAgent(user_agent);
            foreach (agent; user_agents_)
            {
                if (equalsIgnoreAsciiCase(user_agent, agent))
                {
                    ever_seen_specific_agent_ = seen_specific_agent_ = true;
                    break;
                }
            }
        }
    }

    override void HandleAllow(int line_num, string value)
    {
        if (!seen_any_agent) return;
        seen_separator_ = true;
        const int priority = match_strategy_.MatchAllow(path_, value);
        if (priority >= 0)
        {
            if (seen_specific_agent_)
            {
                if (allow_.specific.priority() < priority)
                {
                    allow_.specific.Set(priority, line_num);
                }
            }
            else
            {
                assert (seen_global_agent_);
                if (allow_.global.priority() < priority)
                {
                    allow_.global.Set(priority, line_num);
                }
            }
        }
        else
        {
            // Google-specific optimization: 'index.htm' and 'index.html' are normalized
            // to '/'.
            if (value.empty) return;
            auto last_part = value.byCodeUnit.splitter('/').tail(1).front;
            if (last_part.startsWith("index.htm"))
            {
                auto new_pattern = value[0 .. $ - last_part.length] ~ '$';
                HandleAllow(line_num, new_pattern);
            }
        }
    }

    override void HandleDisallow(int line_num, string value)
    {
        if (!seen_any_agent) return;
        seen_separator_ = true;
        const int priority = match_strategy_.MatchDisallow(path_, value);
        if (priority >= 0)
        {
            if (seen_specific_agent_)
            {
                if (disallow_.specific.priority() < priority)
                {
                    disallow_.specific.Set(priority, line_num);
                }
            }
            else
            {
                assert (seen_global_agent_);
                if (disallow_.global.priority() < priority)
                {
                    disallow_.global.Set(priority, line_num);
                }
            }
        }
    }

    override void HandleSitemap(int line_num, string value)
    {
        seen_separator_ = true;
    }

    override void HandleUnknownAction(int line_num, string action, string value)
    {
        seen_separator_ = true;
    }

package:

    /// Initialize next path and user-agents to check. Path must contain only the
    /// path, params, and query (if any) of the url and must start with a '/'.
    void InitUserAgentsAndPath(const(string)[] user_agents, string path)
    {
        path_ = path;
        assert (path_.startsWith('/'));
        user_agents_ = user_agents;
    }

    /// Returns true if any user-agent was seen.
    bool seen_any_agent() const
    {
        return seen_global_agent_ || seen_specific_agent_;
    }

    /// Instead of just maintaining a Boolean indicating whether a given line has
    /// matched, we maintain a count of the maximum number of characters matched by
    /// that pattern.
    ///
    /// This structure stores the information associated with a match (e.g. when a
    /// Disallow is matched) as the priority of the match and the matching line.
    ///
    /// The priority is initialized with a negative value to make sure that a match
    /// of priority 0 is higher priority than no match at all.
    struct Match
    {
    private:

        enum kNoMatchPriority = -1;

    public:

        void Set(int priority, int line)
        {
            priority_ = priority;
            line_ = line;
        }

        void Clear()
        {
            Set(kNoMatchPriority, 0);
        }

        int line() const
        {
            return line_;
        }

        int priority() const
        {
            return priority_;
        }

        static const(Match)* HigherPriorityMatch(const(Match)* a, const(Match)* b)
        {
            if (a.priority > b.priority)
            {
                return a;
            }
            else
            {
                return b;
            }
        }

    private:

        int priority_ = kNoMatchPriority;
        int line_ = 0;
    }

    /// For each of the directives within user-agents, we keep global and specific
    /// match scores.
    struct MatchHierarchy
    {
        /// Match for '*'.
        Match global;
        /// Match for the queried agent.
        Match specific;

        void Clear()
        {
            global.Clear();
            specific.Clear();
        }
    }

    /// Characters of 'url' matching Allow.
    MatchHierarchy allow_;
    /// Characters of 'url' matching Disallow.
    MatchHierarchy disallow_;

    /// True if processing global agent rules.
    bool seen_global_agent_ = false;
    /// True if processing our specific agent.
    bool seen_specific_agent_ = false;
    /// True if we ever saw a block for our agent.
    bool ever_seen_specific_agent_ = false;
    /// True if we saw any key/value pair.
    bool seen_separator_ = false;

    /// The path we want to pattern match. Not owned and only valid during the
    /// lifetime of *AllowedByRobots calls.
    string path_;
    /// The user-agents we are interested in. Not owned and only valid during
    /// the lifetime of *AllowedByRobots calls.
    const(string)[] user_agents_;

    RobotsMatchStrategy match_strategy_;
}
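// Illustrative example, not taken from the original test suite, of the
// longest-match behaviour documented on the constructor: with conflicting
// 'Allow: /' and 'Disallow: /cgi-bin' groups, the longer pattern wins for URIs
// under /cgi-bin while everything else stays crawlable. The user agent and
// URLs are made up for this example.
unittest
{
    enum robotstxt = "user-agent: *\n" ~
                     "allow: /\n" ~
                     "disallow: /cgi-bin\n";
    auto matcher = new RobotsMatcher();
    assert (matcher.OneAgentAllowedByRobots(robotstxt, "FooBot", "http://example.com/index.html"));
    assert (!matcher.OneAgentAllowedByRobots(robotstxt, "FooBot", "http://example.com/cgi-bin/script"));
}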
/// A RobotsMatchStrategy defines a strategy for matching individual lines in a
/// robots.txt file. Each Match* method should return a match priority, which is
/// interpreted as:
///
/// match priority < 0:
///    No match.
///
/// match priority == 0:
///    Match, but treat it as if matched an empty pattern.
///
/// match priority > 0:
///    Match.
abstract class RobotsMatchStrategy
{
public:

    abstract int MatchAllow(string path, string pattern);
    abstract int MatchDisallow(string path, string pattern);
}

/// Extracts path (with params) and query part from URL. Removes scheme,
/// authority, and fragment. Result always starts with "/".
/// Returns "/" if the url doesn't have a path or is not valid.
string GetPathParamsQuery(string url)
{
    auto remainder = url.byCodeUnit;

    // Initial two slashes are ignored.
    if (url.startsWith("//")) remainder = remainder[2 .. $];

    // Find the end of the protocol, if it exists, and remove it plus the
    // following "://". If a path, param or query starts before "://", then
    // "://" doesn't indicate a protocol.
    auto protocol_point = remainder.findAmong(":/?;");
    while (protocol_point.startsWith(":"))
    {
        if (protocol_point[1 .. $].startsWith("//"))
        {
            remainder = protocol_point[3 .. $];
            break;
        }
        protocol_point = protocol_point[1 .. $];
    }

    auto path = remainder.findAmong("/?;");
    if (path.empty) return "/";
    auto hash = remainder.find('#');
    if (hash.length > path.length) return "/";

    path = path[0 .. $ - hash.length];

    if (path[0] != '/') return '/' ~ path.source;
    return path.source;
}
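// Illustrative checks, not from the original test suite, for
// GetPathParamsQuery(): scheme, authority and fragment are stripped, and a
// bare host maps to "/". The URLs are made up for this example.
unittest
{
    assert (GetPathParamsQuery("https://example.com/dir/page.html?x=1#frag") == "/dir/page.html?x=1");
    assert (GetPathParamsQuery("https://example.com") == "/");
    assert (GetPathParamsQuery("example.com/a;b?c=d") == "/a;b?c=d");
}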
package immutable kHexDigits = "0123456789ABCDEF";

/// Canonicalize the allowed/disallowed paths. For example:
///     /SanJoséSellers ==> /SanJos%C3%A9Sellers
///     %aa ==> %AA
string MaybeEscapePattern(string src)
{
    int num_to_escape = 0;
    bool need_capitalize = false;

    // First, scan the buffer to see if changes are needed. Most don't.
    foreach (i; 0 .. src.length)
    {
        // (a) % escape sequence.
        if (i + 2 < src.length && src[i] == '%'
            && std.ascii.isHexDigit(src[i + 1]) && std.ascii.isHexDigit(src[i + 2]))
        {
            if (std.ascii.isLower(src[i + 1]) || std.ascii.isLower(src[i + 2]))
            {
                need_capitalize = true;
            }
            i += 2;
            // (b) needs escaping.
        }
        else if (!isASCII(src[i]))
        {
            num_to_escape++;
        }
        // (c) Already escaped and escape-characters normalized (eg. %2f -> %2F).
    }

    // Return if no changes needed.
    if (!num_to_escape && !need_capitalize) return src;

    auto ret = new char[num_to_escape * 2 + src.length];
    int j = 0;
    size_t i = 0;
    while (i < src.length)
    {
        // (a) Normalize %-escaped sequence (eg. %2f -> %2F).
        if (i + 2 < src.length && src[i] == '%'
            && std.ascii.isHexDigit(src[i + 1]) && std.ascii.isHexDigit(src[i + 2]))
        {
            ret[j++] = src[i++];
            ret[j++] = std.ascii.toUpper(src[i++]);
            ret[j++] = std.ascii.toUpper(src[i]);
            // (b) %-escape octets whose highest bit is set. These are outside the
            // ASCII range.
        }
        else if (!isASCII(src[i]))
        {
            ret[j++] = '%';
            ret[j++] = kHexDigits[(src[i] >> 4) & 0xf];
            ret[j++] = kHexDigits[src[i] & 0xf];
            // (c) Normal character, no modification needed.
        }
        else
        {
            ret[j++] = src[i];
        }
        i++;
    }
    // ret is a locally allocated array, so casting it to immutable here is safe.
    return (() @trusted => ret.assumeUnique())();
}

unittest
{
    assert (MaybeEscapePattern("/SanJoséSellers") == "/SanJos%C3%A9Sellers");
    assert (MaybeEscapePattern("%aa") == "%AA");
}

/// Implements robots.txt pattern matching.
bool Matches(string path, string pattern) pure
{
    const pathlen = path.length;
    auto pos = new size_t[pathlen + 1];

    // The pos[] array holds a sorted list of indexes of 'path', with length
    // 'numpos'. At the start and end of each iteration of the main loop below,
    // the pos[] array will hold a list of the prefixes of the 'path' which can
    // match the current prefix of 'pattern'. If this list is ever empty,
    // return false. If we reach the end of 'pattern' with at least one element
    // in pos[], return true.

    pos[0] = 0;
    size_t numpos = 1;

    foreach (idx, pat; pattern)
    {
        if (pat == '$' && idx == pattern.length - 1)
        {
            return pos[numpos - 1] == pathlen;
        }
        if (pat == '*')
        {
            numpos = pathlen - pos[0] + 1;
            foreach (i; 1 .. numpos)
            {
                pos[i] = pos[i - 1] + 1;
            }
        }
        else
        {
            // Includes '$' when not at end of pattern.
            size_t newnumpos = 0;
            foreach (i; 0 .. numpos)
            {
                if (pos[i] < pathlen && path[pos[i]] == pat)
                {
                    pos[newnumpos++] = pos[i] + 1;
                }
            }
            numpos = newnumpos;
            if (numpos == 0) return false;
        }
    }

    return true;
}
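// Illustrative checks, not from the original test suite, of the pattern
// matcher: '*' matches any run of characters and a trailing '$' anchors the
// pattern to the end of the path. The paths below are made up for this example.
unittest
{
    assert (Matches("/fish/salmon.html", "/fish"));
    assert (Matches("/filename.php", "/*.php"));
    assert (Matches("/filename.php", "/*.php$"));
    assert (!Matches("/filename.php?parameters", "/*.php$"));
}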
package:

/// A robots.txt has lines of key/value pairs. A ParsedRobotsKey represents
/// a key. This struct can parse a text representation (including common typos)
/// and represent it as an enumeration which allows for faster processing
/// afterwards.
/// For unparsable keys, the original string representation is kept.
struct ParsedRobotsKey
{
public:

    enum KeyType
    {
        /// Generic highlevel fields.
        USER_AGENT,
        SITEMAP,

        /// Fields within a user-agent.
        ALLOW,
        DISALLOW,

        /// Unrecognized field; kept as-is. High number so that additions to the
        /// enumeration above do not change the serialization.
        UNKNOWN = 128
    }

    /// Parse given key text. Does not copy the text, so the key text must stay
    /// valid for the object's life-time or the next Parse() call.
    void Parse(string key)
    {
        key_text_ = "";
        with (KeyType)
        {
            if (KeyIsUserAgent(key))
            {
                type_ = USER_AGENT;
            }
            else if (KeyIsAllow(key))
            {
                type_ = ALLOW;
            }
            else if (KeyIsDisallow(key))
            {
                type_ = DISALLOW;
            }
            else if (KeyIsSitemap(key))
            {
                type_ = SITEMAP;
            }
            else
            {
                type_ = UNKNOWN;
                key_text_ = key;
            }
        }
    }

    /// Returns the type of key.
    KeyType type() const
    {
        return type_;
    }

    /// If this is an unknown key, get the text.
    string GetUnknownText() const
    {
        assert (type_ == KeyType.UNKNOWN && !key_text_.empty);
        return key_text_;
    }

private:

    static bool KeyIsUserAgent(string key)
    {
        const ret = startsWithIgnoreCase(key, "user-agent");
        version (StrictSpelling)
        {
            return ret;
        }
        else
        {
            return ret || startsWithIgnoreCase(key, "useragent")
                || startsWithIgnoreCase(key, "user agent");
        }
    }

    static bool KeyIsAllow(string key)
    {
        return startsWithIgnoreCase(key, "allow");
    }

    static bool KeyIsDisallow(string key)
    {
        const ret = startsWithIgnoreCase(key, "disallow");
        version (StrictSpelling)
        {
            return ret;
        }
        else
        {
            return ret || startsWithIgnoreCase(key, "dissallow") || startsWithIgnoreCase(key, "dissalow")
                || startsWithIgnoreCase(key, "disalow") || startsWithIgnoreCase(key, "diasllow")
                || startsWithIgnoreCase(key, "disallaw");
        }
    }

    static bool KeyIsSitemap(string key)
    {
        return startsWithIgnoreCase(key, "sitemap") || startsWithIgnoreCase(key, "site-map");
    }

    KeyType type_ = KeyType.UNKNOWN;
    string key_text_;
}
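// Illustrative checks, not from the original test suite: canonical keys are
// recognized case-insensitively, unknown keys keep their text, and common
// misspellings are accepted unless the StrictSpelling version is set. The key
// names below are made up for this example.
unittest
{
    ParsedRobotsKey key;
    key.Parse("User-Agent");
    assert (key.type == ParsedRobotsKey.KeyType.USER_AGENT);

    key.Parse("crawl-delay");
    assert (key.type == ParsedRobotsKey.KeyType.UNKNOWN);
    assert (key.GetUnknownText() == "crawl-delay");

    version (StrictSpelling) {}
    else
    {
        key.Parse("disalow");
        assert (key.type == ParsedRobotsKey.KeyType.DISALLOW);
    }
}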
void emitKeyValueToHandler(int line, ref const(ParsedRobotsKey) key, string value,
    RobotsParseHandler handler)
{
    with (ParsedRobotsKey.KeyType) final switch (key.type)
    {
        case USER_AGENT:
            handler.HandleUserAgent(line, value);
            break;
        case ALLOW:
            handler.HandleAllow(line, value);
            break;
        case DISALLOW:
            handler.HandleDisallow(line, value);
            break;
        case SITEMAP:
            handler.HandleSitemap(line, value);
            break;
        case UNKNOWN:
            handler.HandleUnknownAction(line, key.GetUnknownText(), value);
            break;
    }
}

struct RobotsTxtParser
{
public:

    alias Key = ParsedRobotsKey;

    void Parse()
    {
        /// The UTF-8 byte order mark.
        static immutable utf_bom = "\xef\xbb\xbf";

        /// Certain browsers limit the URL length to 2083 bytes. In a robots.txt, it's
        /// fairly safe to assume any valid line isn't going to be more than many times
        /// that max URL length of 2KB. We want some padding for
        /// UTF-8 encoding/nulls/etc., but a much smaller bound would be okay as well.
        /// If so, we can ignore the chars on a line past that.
        enum kMaxLineLen = 2083 * 8 - 1;
        /// Allocate a buffer used to process the current line.
        auto line_buffer = new char[kMaxLineLen];
        size_t line_pos = 0;
        int line_num = 0;
        size_t bom_pos = 0;
        bool last_was_carriage_return = false;
        handler_.HandleRobotsStart();

        foreach (ch; robots_body_)
        {
            assert (line_pos <= kMaxLineLen);
            // Google-specific optimization: UTF-8 byte order marks should never
            // appear in a robots.txt file, but they do nevertheless. Skipping
            // possible BOM-prefix in the first bytes of the input.
            if (bom_pos < utf_bom.length && ch == utf_bom[bom_pos++]) continue;
            bom_pos = utf_bom.length;
            if (ch != '\x0a' && ch != '\x0d')
            {
                // Non-line-ending char case.
                // Put in next spot on current line, as long as there's room.
                if (line_pos < kMaxLineLen) line_buffer[line_pos++] = ch;
            }
            else
            {
                // Line-ending character case.
                // Only emit an empty line if this was not due to the second character
                // of the DOS line-ending \r\n .
                const bool is_CRLF_continuation =
                    line_pos == 0 && last_was_carriage_return && ch == '\x0a';
                if (!is_CRLF_continuation) ParseAndEmitLine(++line_num, line_buffer[0 .. line_pos]);
                line_pos = 0;
                last_was_carriage_return = (ch == '\x0d');
            }
        }
        ParseAndEmitLine(++line_num, line_buffer[0 .. line_pos]);
        handler_.HandleRobotsEnd();
    }

private:

    void ParseAndEmitLine(int current_line, char[] line)
    {
        string string_key, value;
        if (!getKeyAndValueFrom(string_key, value, line)) return;

        Key key;
        key.Parse(string_key);
        if (NeedEscapeValueForKey(key)) value = MaybeEscapePattern(value);
        emitKeyValueToHandler(current_line, key, value, handler_);
    }

    bool NeedEscapeValueForKey(ref const(Key) key)
    {
        with (Key.KeyType) return !key.type.among(USER_AGENT, SITEMAP);
    }

    string robots_body_;
    Rebindable!RobotsParseHandler handler_;
}

final class LongestMatchRobotsMatchStrategy : RobotsMatchStrategy
{
public:

    override int MatchAllow(string path, string pattern)
    {
        return Matches(path, pattern) ? pattern.length.to!int : -1;
    }

    override int MatchDisallow(string path, string pattern)
    {
        return Matches(path, pattern) ? pattern.length.to!int : -1;
    }
}

bool equalsIgnoreAsciiCase(string s1, string s2) @nogc pure
{
    return equal!((c1, c2) => std.ascii.toLower(c1) == std.ascii.toLower(c2))(s1.byChar, s2.byChar);
}

bool startsWithIgnoreCase(string target, string prefix) @nogc pure
{
    return target.length >= prefix.length && equalsIgnoreAsciiCase(target[0 .. prefix.length], prefix);
}
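// Illustrative checks, not from the original test suite, for the ASCII
// case-insensitive helpers used throughout the parser.
unittest
{
    assert (equalsIgnoreAsciiCase("GoogleBot", "googlebot"));
    assert (!equalsIgnoreAsciiCase("Googlebot", "Googlebot-Image"));
    assert (startsWithIgnoreCase("Disallow: /x", "disallow"));
    assert (!startsWithIgnoreCase("dis", "disallow"));
}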
bool getKeyAndValueFrom(ref string key, ref string value, char[] line) pure
{
    auto line_c = line.byCodeUnit;
    static void asciiStripLeft(ref typeof(line_c) s) pure
    {
        s.findSkip!(std.ascii.isWhite);
    }

    static void asciiStripRight(ref typeof(line_c) s) pure
    {
        if (s.empty) return;
        // countUntil yields -1 when the whole slice is whitespace.
        const white_suffix_len = s.retro.countUntil!(c => !std.ascii.isWhite(c));
        s = white_suffix_len < 0 ? s[0 .. 0] : s[0 .. $ - white_suffix_len];
    }

    // Remove comments from the current robots.txt line.
    if (auto comment_pieces = line_c.findSplitBefore("#"))
    {
        line_c = comment_pieces[0];
    }
    if (line_c.empty) return false;
    asciiStripLeft(line_c);
    asciiStripRight(line_c);

    // Rules must match the following pattern:
    //   <key>[ \t]*:[ \t]*<value>
    if (auto pieces = line_c.findSplit(":"))
    {
        asciiStripRight(pieces[0]);
        key = pieces[0].source.idup;
        asciiStripLeft(pieces[2]);
        value = pieces[2].source.idup;
    }
    else
    {
        // Google-specific optimization: some people forget the colon, so we need to
        // accept whitespace in its stead.
        auto after_whitespace = line_c.findAmong(" \t");
        if (after_whitespace.empty) return false;
        key = line_c[0 .. $ - after_whitespace.length].source.idup;
        asciiStripLeft(after_whitespace);
        // We only accept whitespace as a separator if there are exactly two
        // sequences of non-whitespace characters. Since trailing whitespace was
        // stripped above, any whitespace left in the value means there were more
        // than two such sequences, so the line is rejected.
        if (!after_whitespace.findAmong(" \t").empty) return false;
        value = after_whitespace.source.idup;
    }
    return !key.empty;
}

/// Extract the matchable part of a user agent string, essentially stopping at
/// the first invalid character.
/// Example: 'Googlebot/2.1' becomes 'Googlebot'
static string extractUserAgent(string user_agent) @nogc pure
{
    static bool isUserAgentChar(char c) pure @nogc
    {
        // Allowed characters in user-agent are [a-zA-Z_-].
        return std.ascii.isAlpha(c) || c == '-' || c == '_';
    }

    auto len = user_agent.byCodeUnit.countUntil!(c => !isUserAgentChar(c));
    if (len < 0) len = user_agent.length;
    return user_agent[0 .. len];
}

unittest
{
    assert (extractUserAgent("Googlebot/2.1") == "Googlebot");
    assert (extractUserAgent("Googlebot/2.1/2.1") == "Googlebot");
    assert (extractUserAgent("Googlebot2.1") == "Googlebot");
    assert (extractUserAgent("Googlebot") == "Googlebot");
    assert (extractUserAgent("/") == "");
    assert (extractUserAgent("") == "");
}
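// Illustrative checks, not from the original test suite, for
// getKeyAndValueFrom() above: the usual colon form, the whitespace-separator
// fallback, and comment-only lines. The sample lines are made up for this example.
unittest
{
    string key, value;

    assert (getKeyAndValueFrom(key, value, "user-agent: Googlebot".dup));
    assert (key == "user-agent" && value == "Googlebot");

    // Missing colon: a single run of whitespace is accepted as the separator.
    assert (getKeyAndValueFrom(key, value, "disallow /private".dup));
    assert (key == "disallow" && value == "/private");

    // Comment-only lines carry no key/value pair.
    assert (!getKeyAndValueFrom(key, value, "# just a comment".dup));
}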