// Copyright 1999 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// -----------------------------------------------------------------------------
// [Translated from] File: robots.cc
// -----------------------------------------------------------------------------
//
// Implements the expired internet draft
//   http://www.robotstxt.org/norobots-rfc.txt
// with Google-specific optimizations detailed at
//   https://developers.google.com/search/reference/robots_txt

module unrobotstxt;

@safe:

import std.algorithm;
import std.array;
import std.ascii;
import std.container.array;
import std.conv;
import std.exception;
import std.range;
import std.typecons : Rebindable;
import std.utf;

/// Handler for directives found in robots.txt. These callbacks are called by
/// ParseRobotsTxt() in the sequence they have been found in the file.
abstract class RobotsParseHandler
{
public:

    abstract void HandleRobotsStart();
    abstract void HandleRobotsEnd();

    abstract void HandleUserAgent(int line_num, string value);
    abstract void HandleAllow(int line_num, string value);
    abstract void HandleDisallow(int line_num, string value);

    abstract void HandleSitemap(int line_num, string value);

    /// Any other unrecognized name/value pairs.
    abstract void HandleUnknownAction(int line_num, string action, string value);
}

/// Parses the body of a robots.txt and emits parse callbacks. This will accept
/// typical typos found in robots.txt, such as 'disalow'.
///
/// Note, this function will accept all kinds of input but will skip
/// everything that does not look like a robots directive.
void ParseRobotsTxt(string robots_body, RobotsParseHandler parse_callback)
{
    auto parser = RobotsTxtParser(robots_body, parse_callback);
    parser.Parse();
}
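// Illustrative usage sketch, not part of the original robots.cc sources: a
// minimal RobotsParseHandler implementation that only records Sitemap
// directives reported by ParseRobotsTxt(). The handler name and the sample
// robots.txt body below are made up for this example.
unittest
{
    final class SitemapCollector : RobotsParseHandler
    {
    @safe:
        string[] sitemaps;

        override void HandleRobotsStart() {}
        override void HandleRobotsEnd() {}
        override void HandleUserAgent(int line_num, string value) {}
        override void HandleAllow(int line_num, string value) {}
        override void HandleDisallow(int line_num, string value) {}
        override void HandleSitemap(int line_num, string value) { sitemaps ~= value; }
        override void HandleUnknownAction(int line_num, string action, string value) {}
    }

    auto collector = new SitemapCollector();
    ParseRobotsTxt("user-agent: *\ndisallow: /private\nsitemap: https://example.com/sitemap.xml\n",
            collector);
    assert (collector.sitemaps == ["https://example.com/sitemap.xml"]);
}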
final class RobotsMatcher : RobotsParseHandler
{
public:

    /// Create a RobotsMatcher with the default matching strategy. The default
    /// matching strategy is longest-match as opposed to the former internet draft
    /// that provisioned first-match strategy. Analysis shows that longest-match,
    /// while more restrictive for crawlers, is what webmasters assume when writing
    /// directives. For example, in case of conflicting matches (both Allow and
    /// Disallow), the longest match is the one the user wants. For example, in
    /// case of a robots.txt file that has the following rules
    ///   Allow: /
    ///   Disallow: /cgi-bin
    /// it's pretty obvious what the webmaster wants: they want to allow crawl of
    /// every URI except /cgi-bin. However, according to the expired internet
    /// standard, crawlers should be allowed to crawl everything with such a rule.
    this()
    {
        match_strategy_ = new LongestMatchRobotsMatchStrategy();
    }

    /// Verifies that the given user agent is valid to be matched against
    /// robots.txt. Valid user agent strings only contain the characters
    /// [a-zA-Z_-].
    static bool IsValidUserAgentToObey(string user_agent)
    {
        return !user_agent.empty && extractUserAgent(user_agent) == user_agent;
    }
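    // Illustrative check, not part of the original test suite: a user-agent
    // token is only valid if it stops before the first character outside
    // [a-zA-Z_-]. The agent names are made up for this example.
    unittest
    {
        assert (IsValidUserAgentToObey("Foobot"));
        assert (!IsValidUserAgentToObey("Foobot/2.1"));
        assert (!IsValidUserAgentToObey(""));
    }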
    /// Returns true iff 'url' is allowed to be fetched by any member of the
    /// 'user_agents' array. 'url' must be %-encoded according to RFC3986.
    bool AllowedByRobots(string robots_body, const(string[]) user_agents, string url)
    {
        string path = GetPathParamsQuery(url);
        InitUserAgentsAndPath(user_agents, path);
        ParseRobotsTxt(robots_body, this);
        return !disallow();
    }

    /// Do robots check for 'url' when there is only one user agent. 'url' must
    /// be %-encoded according to RFC3986.
    bool OneAgentAllowedByRobots(string robots_txt, string user_agent, string url)
    {
        string[] v = [user_agent];
        return AllowedByRobots(robots_txt, v, url);
    }

    /// Returns true if we are disallowed from crawling a matching URI.
    bool disallow() const
    {
        if (allow_.specific.priority() > 0 || disallow_.specific.priority() > 0)
        {
            return (disallow_.specific.priority() > allow_.specific.priority());
        }

        if (ever_seen_specific_agent_)
        {
            // Matching group for user-agent but either without disallow or empty one,
            // i.e. priority == 0.
            return false;
        }

        if (disallow_.global.priority() > 0 || allow_.global.priority() > 0)
        {
            return disallow_.global.priority() > allow_.global.priority();
        }
        return false;
    }

    /// Returns true if we are disallowed from crawling a matching URI. Ignores any
    /// rules specified for the default user agent, and bases its results only on
    /// the specified user agents.
    bool disallow_ignore_global() const
    {
        if (allow_.specific.priority() > 0 || disallow_.specific.priority() > 0)
        {
            return disallow_.specific.priority() > allow_.specific.priority();
        }
        return false;
    }

    /// Returns true iff, when AllowedByRobots() was called, the robots file
    /// referred explicitly to one of the specified user agents.
    bool ever_seen_specific_agent() const
    {
        return ever_seen_specific_agent_;
    }

    /// Returns the line that matched or 0 if none matched.
    int matching_line() const
    {
        if (ever_seen_specific_agent_)
        {
            return Match.HigherPriorityMatch(&disallow_.specific, &allow_.specific).line();
        }
        return Match.HigherPriorityMatch(&disallow_.global, &allow_.global).line();
    }

protected:

    /// Parse callbacks.
    /// Protected because used in unittests. Never override RobotsMatcher, implement
    /// RobotsParseHandler instead.
    override void HandleRobotsStart()
    {
        // This is a new robots.txt file, so we need to reset all the instance member
        // variables. We do it in the same order the instance member variables are
        // declared, so it's easier to keep track of which ones we have (or maybe
        // haven't!) done.
        allow_.Clear();
        disallow_.Clear();

        seen_global_agent_ = false;
        seen_specific_agent_ = false;
        ever_seen_specific_agent_ = false;
        seen_separator_ = false;
    }

    override void HandleRobotsEnd()
    {
    }

    override void HandleUserAgent(int line_num, string user_agent)
    {
        if (seen_separator_)
        {
            seen_specific_agent_ = seen_global_agent_ = seen_separator_ = false;
        }

        // Google-specific optimization: a '*' followed by space and more characters
        // in a user-agent record is still regarded a global rule.
        if (user_agent.length >= 1 && user_agent[0] == '*'
            && (user_agent.length == 1 || std.ascii.isWhite(user_agent[1])))
        {
            seen_global_agent_ = true;
        }
        else
        {
            user_agent = extractUserAgent(user_agent);
            foreach (agent; user_agents_)
            {
                if (equalsIgnoreAsciiCase(user_agent, agent))
                {
                    ever_seen_specific_agent_ = seen_specific_agent_ = true;
                    break;
                }
            }
        }
    }

    override void HandleAllow(int line_num, string value)
    {
        if (!seen_any_agent) return;
        seen_separator_ = true;
        const int priority = match_strategy_.MatchAllow(path_, value);
        if (priority >= 0)
        {
            if (seen_specific_agent_)
            {
                if (allow_.specific.priority() < priority)
                {
                    allow_.specific.Set(priority, line_num);
                }
            }
            else
            {
                assert (seen_global_agent_);
                if (allow_.global.priority() < priority)
                {
                    allow_.global.Set(priority, line_num);
                }
            }
        }
        else
        {
            // Google-specific optimization: 'index.htm' and 'index.html' are normalized
            // to '/'.
            if (value.empty) return;
            auto last_part = value.byCodeUnit.splitter('/').tail(1).front;
            if (last_part.startsWith("index.htm"))
            {
                auto new_pattern = value[0 .. $ - last_part.length] ~ '$';
                HandleAllow(line_num, new_pattern);
            }
        }
    }

    override void HandleDisallow(int line_num, string value)
    {
        if (!seen_any_agent) return;
        seen_separator_ = true;
        const int priority = match_strategy_.MatchDisallow(path_, value);
        if (priority >= 0)
        {
            if (seen_specific_agent_)
            {
                if (disallow_.specific.priority() < priority)
                {
                    disallow_.specific.Set(priority, line_num);
                }
            }
            else
            {
                assert (seen_global_agent_);
                if (disallow_.global.priority() < priority)
                {
                    disallow_.global.Set(priority, line_num);
                }
            }
        }
    }

    override void HandleSitemap(int line_num, string value)
    {
        seen_separator_ = true;
    }

    override void HandleUnknownAction(int line_num, string action, string value)
    {
        seen_separator_ = true;
    }

package:

    /// Initialize next path and user-agents to check. Path must contain only the
    /// path, params, and query (if any) of the url and must start with a '/'.
    void InitUserAgentsAndPath(const(string)[] user_agents, string path)
    {
        path_ = path;
        assert (path_.startsWith('/'));
        user_agents_ = user_agents;
    }

    /// Returns true if any user-agent was seen.
    bool seen_any_agent() const
    {
        return seen_global_agent_ || seen_specific_agent_;
    }

    /// Instead of just maintaining a Boolean indicating whether a given line has
    /// matched, we maintain a count of the maximum number of characters matched by
    /// that pattern.
    ///
    /// This structure stores the information associated with a match (e.g. when a
    /// Disallow is matched) as the priority of the match and the matching line.
    ///
    /// The priority is initialized with a negative value to make sure that a match
    /// of priority 0 is higher priority than no match at all.
    struct Match
    {
    private:

        enum kNoMatchPriority = -1;

    public:

        void Set(int priority, int line)
        {
            priority_ = priority;
            line_ = line;
        }

        void Clear()
        {
            Set(kNoMatchPriority, 0);
        }

        int line() const
        {
            return line_;
        }

        int priority() const
        {
            return priority_;
        }

        static const(Match)* HigherPriorityMatch(const(Match)* a, const(Match)* b)
        {
            if (a.priority > b.priority)
            {
                return a;
            }
            else
            {
                return b;
            }
        }

    private:

        int priority_ = kNoMatchPriority;
        int line_ = 0;
    }

    /// For each of the directives within user-agents, we keep global and specific
    /// match scores.
    struct MatchHierarchy
    {
        /// Match for '*'.
        Match global;
        /// Match for the queried agent.
        Match specific;

        void Clear()
        {
            global.Clear();
            specific.Clear();
        }
    }

    /// Characters of 'url' matching Allow.
    MatchHierarchy allow_;
    /// Characters of 'url' matching Disallow.
    MatchHierarchy disallow_;

    /// True if processing global agent rules.
    bool seen_global_agent_ = false;
    /// True if processing our specific agent.
    bool seen_specific_agent_ = false;
    /// True if we ever saw a block for our agent.
    bool ever_seen_specific_agent_ = false;
    /// True if we saw any key/value pair.
    bool seen_separator_ = false;

    /// The path we want to pattern match. Not owned and only valid during the
    /// lifetime of *AllowedByRobots calls.
    string path_;
    /// The user-agents we are interested in. Not owned and only valid during
    /// the lifetime of *AllowedByRobots calls.
    const(string)[] user_agents_;

    RobotsMatchStrategy match_strategy_;
}
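// Illustrative example, not taken from the original test suite, of the
// longest-match behaviour documented on the constructor: with conflicting
// 'Allow: /' and 'Disallow: /cgi-bin' groups, the longer pattern wins for URIs
// under /cgi-bin while everything else stays crawlable. The user agent and
// URLs are made up for this example.
unittest
{
    enum robotstxt = "user-agent: *\n" ~
                     "allow: /\n" ~
                     "disallow: /cgi-bin\n";
    auto matcher = new RobotsMatcher();
    assert (matcher.OneAgentAllowedByRobots(robotstxt, "FooBot", "http://example.com/index.html"));
    assert (!matcher.OneAgentAllowedByRobots(robotstxt, "FooBot", "http://example.com/cgi-bin/script"));
}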
/// A RobotsMatchStrategy defines a strategy for matching individual lines in a
/// robots.txt file. Each Match* method should return a match priority, which is
/// interpreted as:
///
/// match priority < 0:
///    No match.
///
/// match priority == 0:
///    Match, but treat it as if matched an empty pattern.
///
/// match priority > 0:
///    Match.
abstract class RobotsMatchStrategy
{
public:

    abstract int MatchAllow(string path, string pattern);
    abstract int MatchDisallow(string path, string pattern);
}

/// Extracts path (with params) and query part from URL. Removes scheme,
/// authority, and fragment. Result always starts with "/".
/// Returns "/" if the url doesn't have a path or is not valid.
string GetPathParamsQuery(string url)
{
    auto remainder = url.byCodeUnit;

    // Initial two slashes are ignored.
    if (url.startsWith("//")) remainder = remainder[2 .. $];

    // Find the end of the protocol, if it exists, and remove it plus the
    // following "://". If a path, param or query starts before "://", then
    // "://" doesn't indicate a protocol.
    auto protocol_point = remainder.findAmong(":/?;");
    while (protocol_point.startsWith(":"))
    {
        if (protocol_point[1 .. $].startsWith("//"))
        {
            remainder = protocol_point[3 .. $];
            break;
        }
        protocol_point = protocol_point[1 .. $];
    }

    auto path = remainder.findAmong("/?;");
    if (path.empty) return "/";
    auto hash = remainder.find('#');
    if (hash.length > path.length) return "/";

    path = path[0 .. $ - hash.length];

    if (path[0] != '/') return '/' ~ path.source;
    return path.source;
}
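// Illustrative checks, not from the original test suite, for
// GetPathParamsQuery(): scheme, authority and fragment are stripped, and a
// bare host maps to "/". The URLs are made up for this example.
unittest
{
    assert (GetPathParamsQuery("https://example.com/dir/page.html?x=1#frag") == "/dir/page.html?x=1");
    assert (GetPathParamsQuery("https://example.com") == "/");
    assert (GetPathParamsQuery("example.com/a;b?c=d") == "/a;b?c=d");
}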
package immutable kHexDigits = "0123456789ABCDEF";

/// Canonicalize the allowed/disallowed paths. For example:
///     /SanJoséSellers ==> /SanJos%C3%A9Sellers
///     %aa ==> %AA
string MaybeEscapePattern(string src)
{
    int num_to_escape = 0;
    bool need_capitalize = false;

    // First, scan the buffer to see if changes are needed. Most don't.
    foreach (i; 0 .. src.length)
    {
        // (a) % escape sequence.
        if (i + 2 < src.length && src[i] == '%'
            && std.ascii.isHexDigit(src[i + 1]) && std.ascii.isHexDigit(src[i + 2]))
        {
            if (std.ascii.isLower(src[i + 1]) || std.ascii.isLower(src[i + 2]))
            {
                need_capitalize = true;
            }
            i += 2;
            // (b) needs escaping.
        }
        else if (!isASCII(src[i]))
        {
            num_to_escape++;
        }
        // (c) Already escaped and escape-characters normalized (eg. %2f -> %2F).
    }

    // Return if no changes needed.
    if (!num_to_escape && !need_capitalize) return src;

    auto ret = new char[num_to_escape * 2 + src.length];
    int j = 0;
    size_t i = 0;
    while (i < src.length)
    {
        // (a) Normalize %-escaped sequence (eg. %2f -> %2F).
        if (i + 2 < src.length && src[i] == '%'
            && std.ascii.isHexDigit(src[i + 1]) && std.ascii.isHexDigit(src[i + 2]))
        {
            ret[j++] = src[i++];
            ret[j++] = std.ascii.toUpper(src[i++]);
            ret[j++] = std.ascii.toUpper(src[i]);
            // (b) %-escape octets whose highest bit is set. These are outside the
            // ASCII range.
        }
        else if (!isASCII(src[i]))
        {
            ret[j++] = '%';
            ret[j++] = kHexDigits[(src[i] >> 4) & 0xf];
            ret[j++] = kHexDigits[src[i] & 0xf];
            // (c) Normal character, no modification needed.
        }
        else
        {
            ret[j++] = src[i];
        }
        i++;
    }
    // ret is a locally allocated array, so casting it to immutable here is safe.
    return (() @trusted => ret.assumeUnique())();
}

unittest
{
    assert (MaybeEscapePattern("/SanJoséSellers") == "/SanJos%C3%A9Sellers");
    assert (MaybeEscapePattern("%aa") == "%AA");
}

/// Implements robots.txt pattern matching.
bool Matches(string path, string pattern) pure
{
    const pathlen = path.length;
    auto pos = new size_t[pathlen + 1];

    // The pos[] array holds a sorted list of indexes of 'path', with length
    // 'numpos'. At the start and end of each iteration of the main loop below,
    // the pos[] array will hold a list of the prefixes of the 'path' which can
    // match the current prefix of 'pattern'. If this list is ever empty,
    // return false. If we reach the end of 'pattern' with at least one element
    // in pos[], return true.

    pos[0] = 0;
    size_t numpos = 1;

    foreach (idx, pat; pattern)
    {
        if (pat == '$' && idx == pattern.length - 1)
        {
            return pos[numpos - 1] == pathlen;
        }
        if (pat == '*')
        {
            numpos = pathlen - pos[0] + 1;
            foreach (i; 1 .. numpos)
            {
                pos[i] = pos[i - 1] + 1;
            }
        }
        else
        {
            // Includes '$' when not at end of pattern.
            size_t newnumpos = 0;
            foreach (i; 0 .. numpos)
            {
                if (pos[i] < pathlen && path[pos[i]] == pat)
                {
                    pos[newnumpos++] = pos[i] + 1;
                }
            }
            numpos = newnumpos;
            if (numpos == 0) return false;
        }
    }

    return true;
}
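// Illustrative checks, not from the original test suite, of the pattern
// matcher: '*' matches any run of characters and a trailing '$' anchors the
// pattern to the end of the path. The paths below are made up for this example.
unittest
{
    assert (Matches("/fish/salmon.html", "/fish"));
    assert (Matches("/filename.php", "/*.php"));
    assert (Matches("/filename.php", "/*.php$"));
    assert (!Matches("/filename.php?parameters", "/*.php$"));
}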
package:

/// A robots.txt has lines of key/value pairs. A ParsedRobotsKey represents
/// a key. This struct can parse a text representation (including common typos)
/// and represent it as an enumeration which allows for faster processing
/// afterwards.
/// For unparsable keys, the original string representation is kept.
struct ParsedRobotsKey
{
public:

    enum KeyType
    {
        /// Generic highlevel fields.
        USER_AGENT,
        SITEMAP,

        /// Fields within a user-agent.
        ALLOW,
        DISALLOW,

        /// Unrecognized field; kept as-is. High number so that additions to the
        /// enumeration above do not change the serialization.
        UNKNOWN = 128
    }

    /// Parse given key text. Does not copy the text, so the key text must stay
    /// valid for the object's life-time or the next Parse() call.
    void Parse(string key)
    {
        key_text_ = "";
        with (KeyType)
        {
            if (KeyIsUserAgent(key))
            {
                type_ = USER_AGENT;
            }
            else if (KeyIsAllow(key))
            {
                type_ = ALLOW;
            }
            else if (KeyIsDisallow(key))
            {
                type_ = DISALLOW;
            }
            else if (KeyIsSitemap(key))
            {
                type_ = SITEMAP;
            }
            else
            {
                type_ = UNKNOWN;
                key_text_ = key;
            }
        }
    }

    /// Returns the type of key.
    KeyType type() const
    {
        return type_;
    }

    /// If this is an unknown key, get the text.
    string GetUnknownText() const
    {
        assert (type_ == KeyType.UNKNOWN && !key_text_.empty);
        return key_text_;
    }

private:

    static bool KeyIsUserAgent(string key)
    {
        const ret = startsWithIgnoreCase(key, "user-agent");
        version (StrictSpelling)
        {
            return ret;
        }
        else
        {
            return ret || startsWithIgnoreCase(key, "useragent")
                || startsWithIgnoreCase(key, "user agent");
        }
    }

    static bool KeyIsAllow(string key)
    {
        return startsWithIgnoreCase(key, "allow");
    }

    static bool KeyIsDisallow(string key)
    {
        const ret = startsWithIgnoreCase(key, "disallow");
        version (StrictSpelling)
        {
            return ret;
        }
        else
        {
            return ret || startsWithIgnoreCase(key, "dissallow") || startsWithIgnoreCase(key, "dissalow")
                || startsWithIgnoreCase(key, "disalow") || startsWithIgnoreCase(key, "diasllow")
                || startsWithIgnoreCase(key, "disallaw");
        }
    }

    static bool KeyIsSitemap(string key)
    {
        return startsWithIgnoreCase(key, "sitemap") || startsWithIgnoreCase(key, "site-map");
    }

    KeyType type_ = KeyType.UNKNOWN;
    string key_text_;
}
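// Illustrative checks, not from the original test suite: canonical keys are
// recognized case-insensitively, unknown keys keep their text, and common
// misspellings are accepted unless the StrictSpelling version is set. The key
// names below are made up for this example.
unittest
{
    ParsedRobotsKey key;
    key.Parse("User-Agent");
    assert (key.type == ParsedRobotsKey.KeyType.USER_AGENT);

    key.Parse("crawl-delay");
    assert (key.type == ParsedRobotsKey.KeyType.UNKNOWN);
    assert (key.GetUnknownText() == "crawl-delay");

    version (StrictSpelling) {}
    else
    {
        key.Parse("disalow");
        assert (key.type == ParsedRobotsKey.KeyType.DISALLOW);
    }
}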
void emitKeyValueToHandler(int line, ref const(ParsedRobotsKey) key, string value,
    RobotsParseHandler handler)
{
    with (ParsedRobotsKey.KeyType) final switch (key.type)
    {
        case USER_AGENT:
            handler.HandleUserAgent(line, value);
            break;
        case ALLOW:
            handler.HandleAllow(line, value);
            break;
        case DISALLOW:
            handler.HandleDisallow(line, value);
            break;
        case SITEMAP:
            handler.HandleSitemap(line, value);
            break;
        case UNKNOWN:
            handler.HandleUnknownAction(line, key.GetUnknownText(), value);
            break;
    }
}

struct RobotsTxtParser
{
public:

    alias Key = ParsedRobotsKey;

    void Parse()
    {
        /// The UTF-8 byte order mark.
        static immutable utf_bom = "\xef\xbb\xbf";

        /// Certain browsers limit the URL length to 2083 bytes. In a robots.txt, it's
        /// fairly safe to assume any valid line isn't going to be more than many times
        /// that max URL length of 2KB. We want some padding for
        /// UTF-8 encoding/nulls/etc., but a much smaller bound would be okay as well.
        /// If so, we can ignore the chars on a line past that.
        enum kMaxLineLen = 2083 * 8 - 1;
        /// Allocate a buffer used to process the current line.
        auto line_buffer = new char[kMaxLineLen];
        size_t line_pos = 0;
        int line_num = 0;
        size_t bom_pos = 0;
        bool last_was_carriage_return = false;
        handler_.HandleRobotsStart();

        foreach (ch; robots_body_)
        {
            assert (line_pos <= kMaxLineLen);
            // Google-specific optimization: UTF-8 byte order marks should never
            // appear in a robots.txt file, but they do nevertheless. Skipping
            // possible BOM-prefix in the first bytes of the input.
            if (bom_pos < utf_bom.length && ch == utf_bom[bom_pos++]) continue;
            bom_pos = utf_bom.length;
            if (ch != '\x0a' && ch != '\x0d')
            {
                // Non-line-ending char case.
                // Put in next spot on current line, as long as there's room.
                if (line_pos < kMaxLineLen) line_buffer[line_pos++] = ch;
            }
            else
            {
                // Line-ending character case.
                // Only emit an empty line if this was not due to the second character
                // of the DOS line-ending \r\n .
                const bool is_CRLF_continuation =
                    line_pos == 0 && last_was_carriage_return && ch == '\x0a';
                if (!is_CRLF_continuation) ParseAndEmitLine(++line_num, line_buffer[0 .. line_pos]);
                line_pos = 0;
                last_was_carriage_return = (ch == '\x0d');
            }
        }
        ParseAndEmitLine(++line_num, line_buffer[0 .. line_pos]);
        handler_.HandleRobotsEnd();
    }

private:

    void ParseAndEmitLine(int current_line, char[] line)
    {
        string string_key, value;
        if (!getKeyAndValueFrom(string_key, value, line)) return;

        Key key;
        key.Parse(string_key);
        if (NeedEscapeValueForKey(key)) value = MaybeEscapePattern(value);
        emitKeyValueToHandler(current_line, key, value, handler_);
    }

    bool NeedEscapeValueForKey(ref const(Key) key)
    {
        with (Key.KeyType) return !key.type.among(USER_AGENT, SITEMAP);
    }

    string robots_body_;
    Rebindable!RobotsParseHandler handler_;
}

final class LongestMatchRobotsMatchStrategy : RobotsMatchStrategy
{
public:

    override int MatchAllow(string path, string pattern)
    {
        return Matches(path, pattern) ? pattern.length.to!int : -1;
    }

    override int MatchDisallow(string path, string pattern)
    {
        return Matches(path, pattern) ? pattern.length.to!int : -1;
    }
}

bool equalsIgnoreAsciiCase(string s1, string s2) @nogc pure
{
    return equal!((c1, c2) => std.ascii.toLower(c1) == std.ascii.toLower(c2))(s1.byChar, s2.byChar);
}

bool startsWithIgnoreCase(string target, string prefix) @nogc pure
{
    return target.length >= prefix.length && equalsIgnoreAsciiCase(target[0 .. prefix.length], prefix);
}
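// Illustrative checks, not from the original test suite, for the ASCII
// case-insensitive helpers used throughout the parser.
unittest
{
    assert (equalsIgnoreAsciiCase("GoogleBot", "googlebot"));
    assert (!equalsIgnoreAsciiCase("Googlebot", "Googlebot-Image"));
    assert (startsWithIgnoreCase("Disallow: /x", "disallow"));
    assert (!startsWithIgnoreCase("dis", "disallow"));
}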
bool getKeyAndValueFrom(ref string key, ref string value, char[] line) pure
{
    auto line_c = line.byCodeUnit;
    static void asciiStripLeft(ref typeof(line_c) s) pure
    {
        s.findSkip!(std.ascii.isWhite);
    }

    static void asciiStripRight(ref typeof(line_c) s) pure
    {
        if (s.empty) return;
        // countUntil yields -1 when the whole slice is whitespace.
        const white_suffix_len = s.retro.countUntil!(c => !std.ascii.isWhite(c));
        s = white_suffix_len < 0 ? s[0 .. 0] : s[0 .. $ - white_suffix_len];
    }

    // Remove comments from the current robots.txt line.
    if (auto comment_pieces = line_c.findSplitBefore("#"))
    {
        line_c = comment_pieces[0];
    }
    if (line_c.empty) return false;
    asciiStripLeft(line_c);
    asciiStripRight(line_c);

    // Rules must match the following pattern:
    //   <key>[ \t]*:[ \t]*<value>
    if (auto pieces = line_c.findSplit(":"))
    {
        asciiStripRight(pieces[0]);
        key = pieces[0].source.idup;
        asciiStripLeft(pieces[2]);
        value = pieces[2].source.idup;
    }
    else
    {
        // Google-specific optimization: some people forget the colon, so we need to
        // accept whitespace in its stead.
        auto after_whitespace = line_c.findAmong(" \t");
        if (after_whitespace.empty) return false;
        key = line_c[0 .. $ - after_whitespace.length].source.idup;
        asciiStripLeft(after_whitespace);
        // We only accept whitespace as a separator if there are exactly two
        // sequences of non-whitespace characters. Since trailing whitespace was
        // stripped above, any whitespace left in the value means there were more
        // than two such sequences, so the line is rejected.
        if (!after_whitespace.findAmong(" \t").empty) return false;
        value = after_whitespace.source.idup;
    }
    return !key.empty;
}

/// Extract the matchable part of a user agent string, essentially stopping at
/// the first invalid character.
/// Example: 'Googlebot/2.1' becomes 'Googlebot'
static string extractUserAgent(string user_agent) @nogc pure
{
    static bool isUserAgentChar(char c) pure @nogc
    {
        // Allowed characters in user-agent are [a-zA-Z_-].
        return std.ascii.isAlpha(c) || c == '-' || c == '_';
    }

    auto len = user_agent.byCodeUnit.countUntil!(c => !isUserAgentChar(c));
    if (len < 0) len = user_agent.length;
    return user_agent[0 .. len];
}

unittest
{
    assert (extractUserAgent("Googlebot/2.1") == "Googlebot");
    assert (extractUserAgent("Googlebot/2.1/2.1") == "Googlebot");
    assert (extractUserAgent("Googlebot2.1") == "Googlebot");
    assert (extractUserAgent("Googlebot") == "Googlebot");
    assert (extractUserAgent("/") == "");
    assert (extractUserAgent("") == "");
}
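// Illustrative checks, not from the original test suite, for
// getKeyAndValueFrom() above: the usual colon form, the whitespace-separator
// fallback, and comment-only lines. The sample lines are made up for this example.
unittest
{
    string key, value;

    assert (getKeyAndValueFrom(key, value, "user-agent: Googlebot".dup));
    assert (key == "user-agent" && value == "Googlebot");

    // Missing colon: a single run of whitespace is accepted as the separator.
    assert (getKeyAndValueFrom(key, value, "disallow /private".dup));
    assert (key == "disallow" && value == "/private");

    // Comment-only lines carry no key/value pair.
    assert (!getKeyAndValueFrom(key, value, "# just a comment".dup));
}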