Class Index [+]

Quicksearch

Nokogiri::HTML::Document

Public Class Methods

new click to toggle source

Create a new document

/*
 * Build an empty HTML document and wrap it in a Ruby object of +klass+.
 *
 * All arguments are optional. The first and second, when truthy, are
 * converted to C strings and handed to htmlNewDoc() as its two parameters;
 * otherwise NULL is passed. The full argument list is then forwarded to the
 * wrapper's initialize.
 */
static VALUE new(int argc, VALUE *argv, VALUE klass)
{
  VALUE args, rb_uri, rb_external_id, wrapped;
  const xmlChar *c_uri         = NULL;
  const xmlChar *c_external_id = NULL;
  htmlDocPtr c_doc;

  /* Collect every argument into one array; missing entries read back as nil. */
  rb_scan_args(argc, argv, "0*", &args);
  rb_uri         = rb_ary_entry(args, (long)0);
  rb_external_id = rb_ary_entry(args, (long)1);

  if (RTEST(rb_uri)) {
    c_uri = (const xmlChar *)StringValuePtr(rb_uri);
  }
  if (RTEST(rb_external_id)) {
    c_external_id = (const xmlChar *)StringValuePtr(rb_external_id);
  }

  c_doc   = htmlNewDoc(c_uri, c_external_id);
  wrapped = Nokogiri_wrap_xml_document(klass, c_doc);
  rb_obj_call_init(wrapped, argc, argv);

  return wrapped;
}
parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML) click to toggle source
 

Parse HTML. string_or_io may be a String, or any object that responds to read and close, such as an IO or StringIO. url is the resource where this document is located. encoding is the encoding that should be used when processing the document. options is a number that sets options in the parser, such as Nokogiri::XML::ParseOptions::RECOVER. See the constants in Nokogiri::XML::ParseOptions.

     # File lib/nokogiri/html/document.rb, line 83
 83:         def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML
 84: 
 85:           options = Nokogiri::XML::ParseOptions.new(options) if Fixnum === options
 86:           # Give the options to the user
 87:           yield options if block_given?
 88: 
 89:           if string_or_io.respond_to?(:encoding)
 90:             unless string_or_io.encoding.name == "ASCII-8BIT"
 91:               encoding ||= string_or_io.encoding.name
 92:             end
 93:           end
 94: 
 95:           if string_or_io.respond_to?(:read)
 96:             url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
 97:             if !encoding
 98:               # Libxml2's parser has poor support for encoding
 99:               # detection.  First, it does not recognize the HTML5
100:               # style meta charset declaration.  Secondly, even if it
101:               # successfully detects an encoding hint, it does not
102:               # re-decode or re-parse the preceding part which may be
103:               # garbled.
104:               #
105:               # EncodingReader aims to perform advanced encoding
106:               # detection beyond what Libxml2 does, and to emulate
107:               # rewinding of a stream and make Libxml2 redo parsing
108:               # from the start when an encoding hint is found.
109:               string_or_io = EncodingReader.new(string_or_io)
110:               begin
111:                 return read_io(string_or_io, url, encoding, options.to_i)
112:               rescue EncodingFound => e
113:                 encoding = e.found_encoding
114:               end
115:             end
116:             return read_io(string_or_io, url, encoding, options.to_i)
117:           end
118: 
119:           # read_memory pukes on empty docs
120:           return new if string_or_io.nil? or string_or_io.empty?
121: 
122:           encoding ||= EncodingReader.detect_encoding(string_or_io)
123: 
124:           read_memory(string_or_io, url, encoding, options.to_i)
125:         end
read_io(io, url, encoding, options) click to toggle source

Read the HTML document from io with given url, encoding, and options. See Nokogiri::HTML.parse

/*
 * Parse an HTML document by reading from the Ruby object +io+ and return it
 * wrapped as an instance of +klass+.
 *
 * url      - nil, or a String passed to htmlReadIO() as the document URL
 * encoding - nil, or a String passed to htmlReadIO() as the encoding name
 * options  - Integer parser option bitmask
 *
 * The array handed to the structured error handler during parsing is stored
 * on the returned document as @errors.
 *
 * Raises whatever the io's id_encoding_found method returns when that value
 * is not nil, a wrapped libxml2 syntax error when parsing yields no document
 * and libxml2 recorded an error, or RuntimeError otherwise.
 */
static VALUE read_io( VALUE klass,
                      VALUE io,
                      VALUE url,
                      VALUE encoding,
                      VALUE options )
{
  const char * c_url    = NIL_P(url)      ? NULL : StringValuePtr(url);
  const char * c_enc    = NIL_P(encoding) ? NULL : StringValuePtr(encoding);
  /*
   * Convert the options BEFORE the error handler is installed: NUM2INT
   * raises on a non-integer, and raising while the handler is set would
   * leave libxml2's global handler pointing at error_list after this
   * frame is gone.
   */
  int c_options         = (int)NUM2INT(options);
  VALUE error_list      = rb_ary_new();
  VALUE document;
  htmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

  doc = htmlReadIO(
      io_read_callback,
      io_close_callback,
      (void *)io,
      c_url,
      c_enc,
      c_options
  );
  xmlSetStructuredErrorFunc(NULL, NULL);

  /*
   * If EncodingFound has occurred in EncodingReader, make sure to do
   * a cleanup and propagate the error.
   */
  if (rb_respond_to(io, id_encoding_found)) {
    VALUE encoding_found = rb_funcall(io, id_encoding_found, 0);
    if (!NIL_P(encoding_found)) {
      xmlFreeDoc(doc);
      rb_exc_raise(encoding_found);
    }
  }

  if(doc == NULL) {
    /* Nothing to free here: there is no document. */
    xmlErrorPtr error = xmlGetLastError();

    if(error)
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
    else
      rb_raise(rb_eRuntimeError, "Could not parse document");

    return Qnil;
  }

  document = Nokogiri_wrap_xml_document(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}
read_memory(string, url, encoding, options) click to toggle source

Read the HTML document contained in string with given url, encoding, and options. See Nokogiri::HTML.parse

/*
 * Parse the HTML document held in the Ruby String +string+ and return it
 * wrapped as an instance of +klass+.
 *
 * url      - nil, or a String passed to htmlReadMemory() as the document URL
 * encoding - nil, or a String passed to htmlReadMemory() as the encoding name
 * options  - Integer parser option bitmask
 *
 * The array handed to the structured error handler during parsing is stored
 * on the returned document as @errors.
 *
 * Raises a wrapped libxml2 syntax error when parsing yields no document and
 * libxml2 recorded an error, or RuntimeError otherwise.
 */
static VALUE read_memory( VALUE klass,
                          VALUE string,
                          VALUE url,
                          VALUE encoding,
                          VALUE options )
{
  const char * c_buffer = StringValuePtr(string);
  const char * c_url    = NIL_P(url)      ? NULL : StringValuePtr(url);
  const char * c_enc    = NIL_P(encoding) ? NULL : StringValuePtr(encoding);
  int len               = (int)RSTRING_LEN(string);
  /*
   * Convert the options BEFORE the error handler is installed: NUM2INT
   * raises on a non-integer, and raising while the handler is set would
   * leave libxml2's global handler pointing at error_list after this
   * frame is gone.
   */
  int c_options         = (int)NUM2INT(options);
  VALUE error_list      = rb_ary_new();
  VALUE document;
  htmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

  doc = htmlReadMemory(c_buffer, len, c_url, c_enc, c_options);
  xmlSetStructuredErrorFunc(NULL, NULL);

  if(doc == NULL) {
    /* Nothing to free here: there is no document. */
    xmlErrorPtr error = xmlGetLastError();

    if(error)
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
    else
      rb_raise(rb_eRuntimeError, "Could not parse document");

    return Qnil;
  }

  document = Nokogiri_wrap_xml_document(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}

Public Instance Methods

fragment(tags = nil) click to toggle source
  

Create a Nokogiri::XML::DocumentFragment from tags

    # File lib/nokogiri/html/document.rb, line 70
70:       def fragment tags = nil
71:         DocumentFragment.new(self, tags, self.root)
72:       end
meta_encoding() click to toggle source
 

Get the meta tag encoding for this document. If there is no meta tag, then nil is returned.

    # File lib/nokogiri/html/document.rb, line 7
 7:       def meta_encoding
 8:         meta = meta_content_type and
 9:           match = /charset\s*=\s*([\w-]+)/.match(meta['content']) and
10:           match[1]
11:       end
meta_encoding=(encoding) click to toggle source
 

Set the meta tag encoding for this document. If there is no meta content tag, the encoding is not set.

    # File lib/nokogiri/html/document.rb, line 16
16:       def meta_encoding= encoding
17:         meta = meta_content_type and
18:           meta['content'] = "text/html; charset=%s" % encoding
19:       end
serialize(options = {}) click to toggle source
  

Serialize Node using options. Save options can also be set using a block. See SaveOptions.

These two statements are equivalent:

 node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)

or

  node.serialize(:encoding => 'UTF-8') do |config|
    config.format.as_xml
  end
    # File lib/nokogiri/html/document.rb, line 63
63:       def serialize options = {}
64:         options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
65:         super
66:       end
title() click to toggle source
 

Get the title string of this document. Return nil if there is no title tag.

    # File lib/nokogiri/html/document.rb, line 33
33:       def title
34:         title = at('title') and title.inner_text
35:       end
title=(text) click to toggle source
 

Set the title string of this document. If there is no head element, the title is not set.

    # File lib/nokogiri/html/document.rb, line 40
40:       def title=(text)
41:         unless title = at('title')
42:           head = at('head') or return nil
43:           title = Nokogiri::XML::Node.new('title', self)
44:           head << title
45:         end
46:         title.children = XML::Text.new(text, self)
47:       end
type click to toggle source

The type for this document

/* Return the type field of the wrapped document struct as a Ruby Integer. */
static VALUE type(VALUE self)
{
  htmlDocPtr c_doc;
  long doc_type;

  Data_Get_Struct(self, xmlDoc, c_doc);
  doc_type = (long)c_doc->type;

  return INT2NUM(doc_type);
}

Private Instance Methods

meta_content_type() click to toggle source
    # File lib/nokogiri/html/document.rb, line 21
21:       def meta_content_type
22:         css('meta[@http-equiv]').find { |node|
23:           node['http-equiv'] =~ /\AContent-Type\z/ and
24:             !node['content'].nil? and
25:             !node['content'].empty?
26:         }
27:       end

Disabled; run with --debug to generate this.

[Validate]

Generated with the Darkfish Rdoc Generator 1.1.6.