class WebRobots
Parent: Object
Creates a WebRobots object for a robot named user_agent, with an optional options hash.
:http_get => a custom method, proc, or anything that responds to .call(uri), to be used for fetching robots.txt. It must return the response body if successful, return an empty string if the resource is not found, and return nil or raise any error on failure. Redirects should be handled within this proc.
# File lib/webrobots.rb, line 20
def initialize(user_agent, options = nil)
  @user_agent = user_agent
  @parser = RobotsTxt::Parser.new(user_agent)
  @parser_mutex = Mutex.new

  options ||= {}
  @http_get = options[:http_get] || method(:http_get)

  @robotstxt = create_cache()
end
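For example, a custom fetcher can be passed via :http_get. The sketch below is illustrative only and assumes open-uri; the fetcher and the user agent string are made up, but the lambda follows the contract above (body on success, empty string if robots.txt is missing, nil on other failures, redirects handled by open-uri):

  require 'webrobots'
  require 'open-uri'

  # Hypothetical fetcher following the :http_get contract.
  fetcher = lambda { |uri|
    begin
      uri.read('User-Agent' => 'MyBot/1.0')         # response body on success
    rescue OpenURI::HTTPError => e
      e.io.status.first == '404' ? '' : nil         # '' if not found, nil on failure
    end
  }

  robots = WebRobots.new('MyBot/1.0', :http_get => fetcher)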
Tests if the robot is allowed to access a resource at url. If a malformed URI string, a relative URI, or a non-HTTP/HTTPS URI is given, ArgumentError is raised.
# File lib/webrobots.rb, line 48
def allowed?(url)
  site, request_uri = split_uri(url)
  return true if request_uri == '/robots.txt'
  robots_txt = get_robots_txt(site)
  robots_txt.allow?(request_uri)
end
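A typical check, using a made-up site and user agent for illustration:

  robots = WebRobots.new('MyBot/1.0')
  robots.allowed?('http://www.example.com/index.html')   # => true or false, per the site's robots.txt
  robots.disallowed?('http://www.example.com/private/')  # => the opposite of allowed?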
Creates the cache object used to store parsed robots.txt data per site. The returned object must respond to [], []=, delete and clear.
# File lib/webrobots.rb, line 32
def create_cache
  Hash.new  # Must respond to [], []=, delete and clear.
end
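Because the cache is always created through this method, a subclass can swap in a different store. A minimal sketch (the subclass and cache class names below are hypothetical); any object responding to [], []=, delete and clear works:

  class MyWebRobots < WebRobots
    # Hypothetical cache that logs writes.
    class LoggingCache < Hash
      def []=(site, robots_txt)
        warn "caching robots.txt for #{site}"
        super
      end
    end

    def create_cache
      LoggingCache.new
    end
  end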
Equivalent to !allowed?(url).
# File lib/webrobots.rb, line 56
def disallowed?(url)
  !allowed?(url)
end
Returns an error object if there is an error in fetching or parsing robots.txt of the site url.
# File lib/webrobots.rb, line 80
def error(url)
  robots_txt_for(url).error
end
Raises the error if there was an error in fetching or parsing robots.txt of the site url.
# File lib/webrobots.rb, line 86
def error!(url)
  robots_txt_for(url).error!
end
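For example, after a lookup you can inspect or re-raise any fetch/parse failure for the site (the URLs below are illustrative):

  robots = WebRobots.new('MyBot/1.0')
  robots.allowed?('http://www.example.com/')          # fetches and caches robots.txt
  if (e = robots.error('http://www.example.com/'))
    warn "robots.txt could not be used: #{e.message}"
  end
  # robots.error!('http://www.example.com/')          # would raise the stored error instead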
Flushes robots.txt cache.
# File lib/webrobots.rb, line 37
def flush_cache
  @robotstxt.clear
end
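Since robots.txt responses are cached per site, flushing is how you pick up changes (illustrative URL):

  robots = WebRobots.new('MyBot/1.0')
  robots.allowed?('http://www.example.com/')   # first call fetches and caches robots.txt
  robots.flush_cache                           # drop all cached robots.txt data
  robots.allowed?('http://www.example.com/')   # fetches robots.txt again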
Equivalent to options(url)[token.downcase].
# File lib/webrobots.rb, line 68
def option(url, token)
  options(url)[token.downcase]
end
Returns extended option values for a resource at url in a hash with each field name lower-cased. See allowed?() for a list of errors that may be raised.
# File lib/webrobots.rb, line 63
def options(url)
  robots_txt_for(url).options
end
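A sketch of reading non-standard fields; it assumes the matching record of the site's robots.txt carries a hypothetical extension line such as "Option-Name: value" (field names come back lower-cased):

  robots = WebRobots.new('MyBot/1.0')
  robots.options('http://www.example.com/')                 # => {"option-name" => "value"}
  robots.option('http://www.example.com/', 'Option-Name')   # => "value"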
# File lib/webrobots.rb, line 133
def fetch_robots_txt(site)
  begin
    body = @http_get.call(site + 'robots.txt') or raise 'robots.txt unfetchable'
  rescue => e
    return RobotsTxt.unfetchable(site, e, @user_agent)
  end
  @parser_mutex.synchronize {
    @parser.parse!(body, site)
  }
end
# File lib/webrobots.rb, line 129
def get_robots_txt(site)
  @robotstxt[site] ||= fetch_robots_txt(site)
end
# File lib/webrobots.rb, line 144
def http_get(uri)
  referer = nil
  10.times {
    http = Net::HTTP.new(uri.host, uri.port)
    if http.use_ssl = uri.is_a?(URI::HTTPS)
      http.verify_mode = OpenSSL::SSL::VERIFY_PEER
      http.cert_store = OpenSSL::X509::Store.new.tap { |store|
        store.set_default_paths
      }
    end
    header = { 'User-Agent' => @user_agent }
    header['Referer'] = referer if referer
    # header is destroyed by this in ruby 1.9.2!
    response = http.get(uri.request_uri, header)
    case response
    when Net::HTTPSuccess
      return response.body
    when Net::HTTPRedirection
      referer = uri.to_s
      uri = URI(response['location'])
    when Net::HTTPNotFound
      return ''
    else
      response.value
    end
  }
  raise 'too many HTTP redirects'
end
# File lib/webrobots.rb, line 124
def robots_txt_for(url)
  site, = split_uri(url)
  get_robots_txt(site)
end
# File lib/webrobots.rb, line 98
def split_uri(url)
  site =
    if url.is_a?(URI)
      url.dup
    else
      begin
        URI.parse(url)
      rescue => e
        raise ArgumentError, e.message
      end
    end

  site.scheme && site.host or
    raise ArgumentError, "non-absolute URI: #{url}"

  site.is_a?(URI::HTTP) or
    raise ArgumentError, "non-HTTP/HTTPS URI: #{url}"

  request_uri = site.request_uri
  if (host = site.host).match(/[[:upper:]]/)
    site.host = host.downcase
  end
  site.path = '/'
  return site, request_uri
end