class Nokogiri::XML::Document
Nokogiri::XML::Document
wraps an XML document.
Nokogiri::XML::Document
is the main entry point for dealing with XML
documents. The Document
is created by parsing an XML
document. See Nokogiri::XML::Document.parse()
for more information on parsing.
For searching a Document
, see Nokogiri::XML::Searchable#css
and Nokogiri::XML::Searchable#xpath
Constants
- NCNAME_CHAR
- NCNAME_RE
- NCNAME_START_CHAR
-
I'm ignoring unicode characters here. See www.w3.org/TR/REC-xml-names/#ns-decl for more details.
Attributes
errors: a list of Nokogiri::XML::SyntaxError
found when parsing a document
Public Class Methods
/*
 * Allocate a fresh, empty XML document and wrap it in a Ruby object of
 * class +klass+.
 *
 * The first optional argument is the XML version string; when it is absent
 * (or nil) the version is "1.0". All original arguments are forwarded to the
 * Ruby-level initializer via rb_obj_call_init.
 */
static VALUE new(int argc, VALUE *argv, VALUE klass)
{
  VALUE extra_args;
  VALUE requested_version;
  VALUE wrapped;
  xmlDocPtr c_doc;

  /* "0*": no mandatory arguments, everything is collected into an array. */
  rb_scan_args(argc, argv, "0*", &extra_args);

  requested_version = rb_ary_entry(extra_args, (long)0);
  if (NIL_P(requested_version)) {
    requested_version = rb_str_new2("1.0");
  }

  c_doc = xmlNewDoc((xmlChar *)StringValueCStr(requested_version));
  wrapped = Nokogiri_wrap_xml_document(klass, c_doc);

  /* Run #initialize with the caller's original argument list. */
  rb_obj_call_init(wrapped, argc, argv);

  return wrapped;
}
Create a new document with version
(defaults to “1.0”)
# File lib/nokogiri/xml/document.rb, line 44
#
# Parse +string_or_io+ into a document.
#
# +options+ may be an Integer bitmask (it is wrapped in a ParseOptions) or a
# ParseOptions-like object. If a block is given it receives the options
# object before parsing so the caller can adjust it.
#
# An empty input raises Nokogiri::XML::SyntaxError in strict mode; otherwise
# a new empty document is returned (with +encoding+ applied when given).
def self.parse string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML
  if Integer === options
    options = Nokogiri::XML::ParseOptions.new(options)
  end

  # Give the options to the user
  yield options if block_given?

  if empty_doc?(string_or_io)
    raise Nokogiri::XML::SyntaxError.new("Empty document") if options.strict?

    blank = new
    blank.encoding = encoding if encoding
    return blank
  end

  parsed =
    if string_or_io.respond_to?(:read)
      # Fall back to the IO's path (when it has one) as the document URL.
      url ||= (string_or_io.respond_to?(:path) ? string_or_io.path : nil)
      read_io(string_or_io, url, encoding, options.to_i)
    else
      # read_memory pukes on empty docs
      read_memory(string_or_io, url, encoding, options.to_i)
    end

  # do xinclude processing
  parsed.do_xinclude(options) if options.xinclude?

  parsed
end
Parse an XML
document.
string_or_io
may be a String, or any object that responds to read and close such as an IO, or StringIO.
url
(optional) is the URI where this document is located.
encoding
(optional) is the encoding that should be used when processing the document.
options
(optional) is a configuration object that sets options during parsing, such as Nokogiri::XML::ParseOptions::RECOVER. See the Nokogiri::XML::ParseOptions
for more information.
block
(optional) is passed a configuration object on which parse options may be set.
By default, Nokogiri
treats documents as untrusted, and so does not attempt to load DTDs or access the network. See Nokogiri::XML::ParseOptions
for a complete list of options; and that module's DEFAULT_XML constant for what's set (and not set) by default.
Nokogiri
.XML() is a convenience method which will call this method.
/*
 * Build a document by reading from the Ruby IO-like object +io+.
 *
 * +url+ and +encoding+ may be nil, in which case NULL is handed to libxml2.
 * +options+ is the integer parse-option bitmask. Structured parse errors are
 * pushed onto a Ruby array while parsing, and that array is stored on the
 * resulting document as @errors.
 *
 * Raises the wrapped libxml2 error (or RuntimeError when libxml2 reports
 * none) if no document could be produced.
 */
static VALUE read_io(VALUE klass, VALUE io, VALUE url, VALUE encoding, VALUE options)
{
  const char *c_url = NULL;
  const char *c_enc = NULL;
  VALUE errors;
  VALUE rb_doc;
  xmlDocPtr c_doc;

  if (!NIL_P(url)) {
    c_url = StringValueCStr(url);
  }
  if (!NIL_P(encoding)) {
    c_enc = StringValueCStr(encoding);
  }
  errors = rb_ary_new();

  /* Collect errors into `errors` only for the duration of the parse. */
  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)errors, Nokogiri_error_array_pusher);

  c_doc = xmlReadIO(
            (xmlInputReadCallback)io_read_callback,
            (xmlInputCloseCallback)io_close_callback,
            (void *)io,
            c_url,
            c_enc,
            (int)NUM2INT(options)
          );

  xmlSetStructuredErrorFunc(NULL, NULL);

  if (NULL == c_doc) {
    xmlErrorPtr last_error;

    xmlFreeDoc(c_doc);

    last_error = xmlGetLastError();
    if (last_error) {
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error(last_error));
    } else {
      rb_raise(rb_eRuntimeError, "Could not parse document");
    }

    return Qnil;
  }

  rb_doc = Nokogiri_wrap_xml_document(klass, c_doc);
  rb_iv_set(rb_doc, "@errors", errors);

  return rb_doc;
}
Create a new document from an IO object
/*
 * Build a document by parsing the bytes of the Ruby String +string+.
 *
 * +url+ and +encoding+ may be nil, in which case NULL is handed to libxml2.
 * +options+ is the integer parse-option bitmask. Structured parse errors are
 * pushed onto a Ruby array while parsing, and that array is stored on the
 * resulting document as @errors.
 *
 * Raises the wrapped libxml2 error (or RuntimeError when libxml2 reports
 * none) if no document could be produced.
 */
static VALUE read_memory(VALUE klass, VALUE string, VALUE url, VALUE encoding, VALUE options)
{
  const char *c_buffer;
  const char *c_url = NULL;
  const char *c_enc = NULL;
  int byte_count;
  VALUE errors;
  VALUE rb_doc;
  xmlDocPtr c_doc;

  c_buffer = StringValuePtr(string);
  if (!NIL_P(url)) {
    c_url = StringValueCStr(url);
  }
  if (!NIL_P(encoding)) {
    c_enc = StringValueCStr(encoding);
  }

  /* NOTE(review): the Ruby string length is narrowed to int here. */
  byte_count = (int)RSTRING_LEN(string);
  errors = rb_ary_new();

  /* Collect errors into `errors` only for the duration of the parse. */
  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)errors, Nokogiri_error_array_pusher);

  c_doc = xmlReadMemory(c_buffer, byte_count, c_url, c_enc, (int)NUM2INT(options));

  xmlSetStructuredErrorFunc(NULL, NULL);

  if (NULL == c_doc) {
    xmlErrorPtr last_error;

    xmlFreeDoc(c_doc);

    last_error = xmlGetLastError();
    if (last_error) {
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error(last_error));
    } else {
      rb_raise(rb_eRuntimeError, "Could not parse document");
    }

    return Qnil;
  }

  rb_doc = Nokogiri_wrap_xml_document(klass, c_doc);
  rb_iv_set(rb_doc, "@errors", errors);

  return rb_doc;
}
Create a new document from a String
# File lib/nokogiri/xml/document.rb, line 257
#
# JRuby only: hand +document+ to wrapJavaDocument and return its result.
# Raises a RuntimeError ("JRuby only method") on any other platform.
def self.wrap document
  unless Nokogiri.jruby?
    raise "JRuby only method"
  end

  wrapJavaDocument(document)
end
JRuby
Wraps Java's org.w3c.dom.Document and returns Nokogiri::XML::Document
Public Instance Methods
# File lib/nokogiri/xml/document.rb, line 242
#
# Add +node_or_tags+ as a child of the document.
#
# Raises "A document may not have multiple root nodes." when the document
# already has a root (other than the 'nokogiri_text_wrapper' placeholder) and
# the argument is neither a comment nor a processing instruction, or when the
# coerced argument is a NodeSet holding more than one node.
def add_child node_or_tags
  has_real_root = root && root.name != 'nokogiri_text_wrapper'
  if has_real_root && !(node_or_tags.comment? || node_or_tags.processing_instruction?)
    raise "A document may not have multiple root nodes."
  end

  node_or_tags = coerce(node_or_tags)

  # Bare `super` forwards the (now coerced) node_or_tags.
  return super unless node_or_tags.is_a?(XML::NodeSet)

  raise "A document may not have multiple root nodes." if node_or_tags.size > 1

  super(node_or_tags.first)
end
See Nokogiri::XML::Node#add_child
/*
 * Canonicalize (C14N) the document and return the result as a Ruby String.
 *
 * Optional arguments, in order:
 *   mode          - integer C14N mode passed to xmlC14NExecute (0 when nil)
 *   incl_ns       - Array of inclusive namespace prefixes, or nil
 *   with_comments - truthy to keep comments in the output
 *
 * When a block is given it is handed to libxml2 (through block_caller) as
 * the node-visibility callback.
 *
 * Raises TypeError if incl_ns is neither nil nor an Array, and NoMemoryError
 * if a native allocation fails.
 */
static VALUE canonicalize(int argc, VALUE *argv, VALUE self)
{
  VALUE mode;
  VALUE incl_ns;
  VALUE with_comments;
  VALUE rb_cStringIO;
  VALUE io;
  xmlChar **ns = NULL;
  long ns_len, i;
  xmlDocPtr doc;
  xmlOutputBufferPtr buf;
  xmlC14NIsVisibleCallback cb = NULL;
  void *ctx = NULL;

  rb_scan_args(argc, argv, "03", &mode, &incl_ns, &with_comments);
  Data_Get_Struct(self, xmlDoc, doc);

  /* Validate and build the prefix list BEFORE allocating the output buffer,
   * so a TypeError raised here cannot leak the buffer. */
  if (!NIL_P(incl_ns)) {
    Check_Type(incl_ns, T_ARRAY);
    ns_len = RARRAY_LEN(incl_ns);

    /* One extra zeroed slot: the list is NULL-terminated. */
    ns = calloc((size_t)ns_len + 1, sizeof(xmlChar *));
    if (ns == NULL) {
      rb_raise(rb_eNoMemError, "Could not allocate namespace list");
    }

    /* NOTE(review): StringValueCStr may still raise on a bad entry, in which
     * case `ns` is not released. */
    for (i = 0 ; i < ns_len ; i++) {
      VALUE entry = rb_ary_entry(incl_ns, i);
      ns[i] = (xmlChar *)StringValueCStr(entry);
    }
  }

  rb_cStringIO = rb_const_get_at(rb_cObject, rb_intern("StringIO"));
  io = rb_class_new_instance(0, 0, rb_cStringIO);

  buf = xmlAllocOutputBuffer(NULL);
  if (buf == NULL) {
    free(ns);
    rb_raise(rb_eNoMemError, "Could not allocate output buffer");
  }

  /* Route libxml2's output into the StringIO instance. */
  buf->writecallback = (xmlOutputWriteCallback)io_write_callback;
  buf->closecallback = (xmlOutputCloseCallback)io_close_callback;
  buf->context = (void *)io;

  if (rb_block_given_p()) {
    cb = block_caller;
    ctx = (void *)rb_block_proc();
  }

  xmlC14NExecute(doc, cb, ctx,
                 (int)(NIL_P(mode) ? 0 : NUM2INT(mode)),
                 ns,
                 (int)RTEST(with_comments),
                 buf);

  /* The pointer array is ours (the strings it points at belong to Ruby). */
  free(ns);
  xmlOutputBufferClose(buf);

  return rb_funcall(io, rb_intern("string"), 0);
}
Canonicalize a document and return the results. Takes an optional block that takes two parameters: the obj
and that node's parent
. The obj
will be either a Nokogiri::XML::Node
, or a Nokogiri::XML::Namespace.
The block must return a non-nil, non-false value if the obj
passed in should be included in the canonicalized document.
# File lib/nokogiri/xml/document.rb, line 167
#
# Gather every namespace found by the "//namespace::*" XPath query into a
# Hash. Keys are "xmlns" joined with the prefix by ":" (just "xmlns" when the
# prefix is nil); values are the namespace hrefs. Namespaces whose prefix is
# "xml" are skipped. A later namespace with the same key overwrites an
# earlier one.
def collect_namespaces
  xpath("//namespace::*").each_with_object({}) do |ns, collected|
    next if ns.prefix == "xml"

    key = ["xmlns", ns.prefix].compact.join(":")
    collected[key] = ns.href
  end
end
Recursively get all namespaces from this node and its subtree and return them as a hash.
For example, given this document:
<root xmlns:foo="bar"> <bar xmlns:hello="world" /> </root>
This method will return:
{ 'xmlns:foo' => 'bar', 'xmlns:hello' => 'world' }
WARNING: this method will clobber duplicate names in the keys. For example, given this document:
<root xmlns:foo="bar"> <bar xmlns:foo="baz" /> </root>
The hash returned will look like this: { 'xmlns:foo' => 'bar' }
Non-prefixed default namespaces (as in “xmlns=”) are not included in the hash.
Note that this method does an xpath lookup for nodes with namespaces, and as a result the order may be dependent on the implementation of the underlying XML
library.
# File lib/nokogiri/xml/document.rb, line 88
#
# Build a new element called +name+ that belongs to this document.
#
# Each extra argument is either a Hash or the element content. Hash keys
# matching NCNAME_RE are added as namespace definitions (using the part after
# the first ":" as the prefix); all other keys become attributes with
# stringified values. Any non-Hash argument is assigned as the content.
#
# If the element ends up with a namespace definition whose prefix is nil or
# empty, that definition becomes the element's namespace. The optional block
# is passed through to Element.new.
def create_element name, *args, &block
  elm = Nokogiri::XML::Element.new(name, self, &block)

  args.each do |arg|
    if Hash === arg
      arg.each do |k, v|
        key = k.to_s
        if key =~ NCNAME_RE
          prefix = key.split(":", 2)[1]
          elm.add_namespace_definition prefix, v
        else
          elm[k.to_s] = v.to_s
        end
      end
    else
      elm.content = arg
    end
  end

  default_ns = elm.namespace_definitions.find { |n| n.prefix.nil? or n.prefix == '' }
  elm.namespace = default_ns if default_ns

  elm
end
Create an element with name
, and optionally setting the content and attributes.
doc.create_element "div" # <div></div> doc.create_element "div", :class => "container" # <div class='container'></div> doc.create_element "div", "contents" # <div>contents</div> doc.create_element "div", "contents", :class => "container" # <div class='container'>contents</div> doc.create_element("div") { |node| node['class'] = "container" } # <div class='container'></div>
/*
 * Declare a new entity on this document and return it wrapped as a
 * Nokogiri::XML::EntityDecl.
 *
 * Arguments: name (required), then optional type, external_id, system_id and
 * content. A nil type means XML_INTERNAL_GENERAL_ENTITY; any other nil
 * argument is passed to libxml2 as NULL.
 *
 * Raises the wrapped libxml2 error (or RuntimeError when libxml2 reports
 * none) if the entity could not be created.
 */
static VALUE create_entity(int argc, VALUE *argv, VALUE self)
{
  VALUE name, type, external_id, system_id, content;
  xmlDocPtr c_doc;
  xmlEntityPtr c_entity;

  Data_Get_Struct(self, xmlDoc, c_doc);

  /* "14": one mandatory argument followed by up to four optional ones. */
  rb_scan_args(argc, argv, "14", &name, &type, &external_id, &system_id, &content);

  xmlResetLastError();
  c_entity = xmlAddDocEntity(
               c_doc,
               (xmlChar *)(NIL_P(name)        ? NULL                        : StringValueCStr(name)),
               (int)(NIL_P(type)              ? XML_INTERNAL_GENERAL_ENTITY : NUM2INT(type)),
               (xmlChar *)(NIL_P(external_id) ? NULL                        : StringValueCStr(external_id)),
               (xmlChar *)(NIL_P(system_id)   ? NULL                        : StringValueCStr(system_id)),
               (xmlChar *)(NIL_P(content)     ? NULL                        : StringValueCStr(content))
             );

  if (c_entity != NULL) {
    return Nokogiri_wrap_xml_node(cNokogiriXmlEntityDecl, (xmlNodePtr)c_entity);
  }

  {
    xmlErrorPtr last_error = xmlGetLastError();
    if (last_error) {
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error(last_error));
    } else {
      rb_raise(rb_eRuntimeError, "Could not create entity");
    }
  }

  return Qnil;
}
Create a new entity named name
.
type
is an integer representing the type of entity to be created, and it defaults to Nokogiri::XML::EntityDecl::INTERNAL_GENERAL. See the constants on Nokogiri::XML::EntityDecl
for more information.
external_id
, system_id
, and content
set the External ID, System ID, and content respectively. All of these parameters are optional.
# File lib/nokogiri/xml/document.rb, line 215
#
# Extend +node+ with every registered decorator module whose key class the
# node is an instance of. Does nothing (and returns nil) when no decorators
# have been registered.
def decorate node
  return unless @decorators

  @decorators.each do |klass, list|
    next unless node.is_a?(klass)

    list.each do |decorator_module|
      node.extend(decorator_module)
    end
  end
end
Apply any decorators to node
# File lib/nokogiri/xml/document.rb, line 175
#
# Return the (mutable) list of decorator modules registered under +key+,
# creating the registry and an empty list on first use.
def decorators key
  registry = (@decorators ||= Hash.new)

  existing = registry[key]
  return existing if existing

  registry[key] = []
end
Get the list of decorators given key
# File lib/nokogiri/xml/document.rb, line 133
#
# A document is its own document: returns the receiver.
def document
  return self
end
A reference to self
/*
 * Copy this document and return the copy wrapped in the receiver's class.
 *
 * Takes one optional integer argument that is passed to xmlCopyDoc as the
 * copy level; it is 1 when omitted. Returns nil if libxml2 fails to produce
 * a copy. The copy keeps the original's node type and shares the original's
 * @errors value.
 */
static VALUE duplicate_document(int argc, VALUE *argv, VALUE self)
{
  xmlDocPtr c_source;
  xmlDocPtr c_copy;
  VALUE level;
  VALUE rb_copy;
  VALUE errors;

  /* rb_scan_args returns the number of arguments actually supplied. */
  if (rb_scan_args(argc, argv, "01", &level) == 0) {
    level = INT2NUM((long)1);
  }

  Data_Get_Struct(self, xmlDoc, c_source);

  c_copy = xmlCopyDoc(c_source, (int)NUM2INT(level));
  if (NULL == c_copy) {
    return Qnil;
  }

  c_copy->type = c_source->type;

  rb_copy = Nokogiri_wrap_xml_document(rb_obj_class(self), c_copy);

  errors = rb_iv_get(self, "@errors");
  rb_iv_set(rb_copy, "@errors", errors);

  return rb_copy;
}
Copy this Document
. An optional depth may be passed in, but it defaults to a deep copy. 0 is a shallow copy, 1 is a deep copy.
/*
 * Return the document's encoding as a Ruby String, or nil when the
 * underlying xmlDoc has no encoding set.
 */
static VALUE encoding(VALUE self)
{
  xmlDocPtr c_doc;

  Data_Get_Struct(self, xmlDoc, c_doc);

  if (c_doc->encoding) {
    return NOKOGIRI_STR_NEW2(c_doc->encoding);
  }

  return Qnil;
}
Get the encoding for this Document
/*
 * Replace the document's encoding string with a copy of +encoding+ and
 * return +encoding+.
 *
 * Raises (via StringValueCStr) if +encoding+ cannot be used as a C string;
 * in that case the document's existing encoding is left untouched.
 */
static VALUE set_encoding(VALUE self, VALUE encoding)
{
  xmlDocPtr doc;
  xmlChar *new_encoding;

  Data_Get_Struct(self, xmlDoc, doc);

  /* Convert and duplicate first: StringValueCStr can raise, and raising
   * after the old string was released would leave doc->encoding dangling. */
  new_encoding = xmlStrdup((xmlChar *)StringValueCStr(encoding));

  if (doc->encoding) {
    /* The old string is owned by libxml2, so it must be released with
     * xmlFree (libxml2's configured deallocator) rather than libc free().
     * The uintptr_t round-trip drops the const qualifier without a warning. */
    xmlFree((void *)(uintptr_t)doc->encoding);
  }

  doc->encoding = new_encoding;

  return encoding;
}
Set the encoding string for this Document
# File lib/nokogiri/xml/document.rb, line 234
#
# Build a DocumentFragment for this document from +tags+ (nil by default),
# passing the document's root as the third constructor argument.
def fragment tags = nil
  context_node = self.root
  DocumentFragment.new(self, tags, context_node)
end
Create a Nokogiri::XML::DocumentFragment
from tags
Returns an empty fragment if tags
is nil.
# File lib/nokogiri/xml/document.rb, line 128
#
# The node name of a document is always the fixed string 'document'.
def name
  return 'document'
end
The name of this document. Always returns “document”
# File lib/nokogiri/xml/document.rb, line 227
#
# Namespaces of the root node, or an empty Hash when the document has no
# root.
def namespaces
  return {} unless root

  root.namespaces
end
Get the hash of namespaces on the root Nokogiri::XML::Node
/*
 * Strip namespaces from the document by running
 * recursively_remove_namespaces_from_node on the document node itself.
 * Returns the receiver.
 */
VALUE remove_namespaces_bang(VALUE self)
{
  xmlDocPtr c_doc;

  Data_Get_Struct(self, xmlDoc, c_doc);

  /* The document is treated as the top-most node of the traversal. */
  recursively_remove_namespaces_from_node((xmlNodePtr)c_doc);

  return self;
}
Remove all namespaces from all nodes in the document.
This could be useful for developers who either don't understand namespaces or don't care about them.
The following example shows a use case, and you can decide for yourself whether this is a good thing or not:
doc = Nokogiri::XML <<-EOXML <root> <car xmlns:part="http://general-motors.com/"> <part:tire>Michelin Model XGV</part:tire> </car> <bicycle xmlns:part="http://schwinn.com/"> <part:tire>I'm a bicycle tire!</part:tire> </bicycle> </root> EOXML doc.xpath("//tire").to_s # => "" doc.xpath("//part:tire", "part" => "http://general-motors.com/").to_s # => "<part:tire>Michelin Model XGV</part:tire>" doc.xpath("//part:tire", "part" => "http://schwinn.com/").to_s # => "<part:tire>I'm a bicycle tire!</part:tire>" doc.remove_namespaces! doc.xpath("//tire").to_s # => "<tire>Michelin Model XGV</tire><tire>I'm a bicycle tire!</tire>" doc.xpath("//part:tire", "part" => "http://general-motors.com/").to_s # => "" doc.xpath("//part:tire", "part" => "http://schwinn.com/").to_s # => ""
For more information on why this probably is not a good thing in general, please direct your browser to tenderlovemaking.com/2009/04/23/namespaces-in-xml.html
/*
 * Return the document's root element wrapped as a Ruby node, or nil when
 * the document has no root element.
 */
static VALUE root(VALUE self)
{
  xmlDocPtr c_doc;
  xmlNodePtr c_root;

  Data_Get_Struct(self, xmlDoc, c_doc);

  c_root = xmlDocGetRootElement(c_doc);
  if (c_root) {
    return Nokogiri_wrap_xml_node(Qnil, c_root);
  }

  return Qnil;
}
Get the root node for this document.
/*
 * Set (or clear) the document's root element and return +root+.
 *
 * Passing nil unlinks the current root element, if any. Otherwise +root+ is
 * installed as the root element; a node that belongs to a different document
 * is first copied into this one.
 *
 * Raises RuntimeError if the node cannot be copied into this document.
 */
static VALUE set_root(VALUE self, VALUE root)
{
  xmlDocPtr doc;
  xmlNodePtr new_root;
  xmlNodePtr old_root;

  Data_Get_Struct(self, xmlDoc, doc);

  old_root = NULL;

  /* nil clears the root: detach the current root element, if there is one. */
  if(NIL_P(root)) {
    old_root = xmlDocGetRootElement(doc);

    if(old_root) {
      xmlUnlinkNode(old_root);
      /* nokogiri_root_node is defined elsewhere; presumably it records the
       * now-parentless node so it is still cleaned up -- TODO confirm. */
      nokogiri_root_node(old_root);
    }

    return root;
  }

  Data_Get_Struct(root, xmlNode, new_root);

  /* If the new root's document is not the same as the current document,
   * then we need to dup the node in to this document. */
  if(new_root->doc != doc) {
    old_root = xmlDocGetRootElement(doc);
    if (!(new_root = xmlDocCopyNode(new_root, doc, 1))) {
      rb_raise(rb_eRuntimeError, "Could not reparent node (xmlDocCopyNode)");
    }
  }

  /* NOTE(review): old_root is only captured in the cross-document branch
   * above; when new_root already belongs to this document, the previous
   * root is not passed to nokogiri_root_node below. */
  xmlDocSetRootElement(doc, new_root);
  if(old_root) nokogiri_root_node(old_root);
  return root;
}
Set the root element on this document
# File lib/nokogiri/xml/document.rb, line 204
#
# Register the Slop decorator for XML::Node (once) and re-run decoration via
# decorate!. Returns the receiver.
def slop!
  already_sloppy = decorators(XML::Node).include? Nokogiri::Decorators::Slop

  unless already_sloppy
    decorators(XML::Node) << Nokogiri::Decorators::Slop
    decorate!
  end

  self
end
Explore a document with shortcut methods. See Nokogiri::Slop for details.
Note that any nodes that have been instantiated before slop!
is called will not be decorated with sloppy behavior. So, if you're in irb, the preferred idiom is:
irb> doc = Nokogiri::Slop my_markup
and not
irb> doc = Nokogiri::HTML my_markup ... followed by irb's implicit inspect (and therefore instantiation of every node) ... irb> doc.slop! ... which does absolutely nothing.
# File lib/nokogiri/xml/document.rb, line 265
#
# JRuby only: return the result of toJavaDocument for this document.
# Raises a RuntimeError ("JRuby only method") on any other platform.
def to_java
  unless Nokogiri.jruby?
    raise "JRuby only method"
  end

  toJavaDocument()
end
JRuby
Returns Java's org.w3c.dom.Document of this Document
.
/*
 * Return the document's URL as a Ruby String, or nil when the underlying
 * xmlDoc has no URL set.
 */
static VALUE url(VALUE self)
{
  xmlDocPtr c_doc;

  Data_Get_Struct(self, xmlDoc, c_doc);

  if (!c_doc->URL) {
    return Qnil;
  }

  return NOKOGIRI_STR_NEW2(c_doc->URL);
}
Get the URL for this document.
© 2008–2018 Aaron Patterson, Mike Dalessio, Charles Nutter, Sergio Arbeo,
Patrick Mahoney, Yoko Harada, Akinori MUSHA, John Shahid, Lars Kanis
Licensed under the MIT License.