XML parsing

original
2014/09/09 09:55
Reading number 939

The object library only provides parsing for a fixed set of object elements. It is simple and convenient to use, but has some limitations. If you need to parse large XML documents or want more flexible control over elements, you can directly use the lower-level xml module that tbox provides separately.

The xml library of tbox provides two parsing modes: DOM parsing and SAX parsing.

DOM mode builds a DOM object tree, parsing the whole document into memory at once. This is similar to the object library, but it gives you control over every element tag. SAX mode uses external iteration, which offers more flexibility and better performance. It also supports user-defined path resolution: similar to XPath, you can select a specific path to parse.

The DOM mode is relatively simple; the following example makes it clear at a glance:

 //Initialize Stream tb_stream_ref_t istream = tb_stream_init_from_url(" http://localhost/file.xml "); if (istream) { //Open Stream if (tb_stream_open(istream)) { //Initialize Reader tb_xml_reader_ref_t reader = tb_xml_reader_init(istream); if (reader) { //Load data, root is the root node tb_xml_node_t* root = tb_xml_reader_load(reader); if (root)  { //Resolve node operation // ... //Release root node tb_xml_node_exit(root); } //Release Reader tb_xml_reader_exit(reader); } } //Release stream tb_stream_exit(istream); }

The SAX mode is more efficient and flexible, and handles large XML documents better, because it uses an iterator model: it parses while reading and only processes the data you are interested in. This saves memory, since the whole document never has to be loaded at once. Combined with a stream, network data can therefore be parsed as it arrives.

Without further ado, let's go straight to the code:

 //Initialize Stream tb_stream_ref_t istream = tb_stream_init_from_url(" http://localhost/file.xml "); if (istream) { //Open Stream if (tb_stream_open(istream)) { //Initialize Reader tb_xml_reader_ref_t reader = tb_xml_reader_init(istream); if (reader) { //Initialize xml reader events tb_size_t event = TB_XML_READER_EVENT_NONE; //Traverse all xml node elements. If an empty event is returned, it ends while ((event = tb_xml_reader_next(reader))) { switch (event) { //Xml document node type event case TB_XML_READER_EVENT_DOCUMENT:  { tb_printf("<?xml version = \"%s\" encoding = \"%s\" ?>\n" , tb_xml_reader_version(reader), tb_xml_reader_charset(reader)); } break; //Document Type Node Type Event case TB_XML_READER_EVENT_DOCUMENT_TYPE:  { tb_printf("<!DOCTYPE>\n"); } break; //Empty element node type event, for example:<element/> case TB_XML_READER_EVENT_ELEMENT_EMPTY:  { //Node element name tb_char_t const*         name = tb_xml_reader_element(reader); //Node element attribute list tb_xml_node_t const*     attr = tb_xml_reader_attributes(reader); //XML node hierarchy, used to display indented layout tb_size_t                 t = tb_xml_reader_level(reader); while (t--) tb_printf("\t"); //Traverse all element attributes if (! attr) tb_printf("<%s/>\n", name); else { tb_printf("<%s", name); for (;  attr; attr = attr->next) tb_printf(" %s = \"%s\"", tb_pstring_cstr(&attr->name), tb_pstring_cstr(&attr->data)); tb_printf("/>\n"); } } break; //Element start node event, for example:<element> case TB_XML_READER_EVENT_ELEMENT_BEG:  { //Node element name tb_char_t const*         name = tb_xml_reader_element(reader); //Node element attribute list tb_xml_node_t const*     attr = tb_xml_reader_attributes(reader);     //XML node hierarchy, used to display indented layout tb_size_t                 t = tb_xml_reader_level(reader) - 1; while (t--) tb_printf("\t"); //Traverse all element attributes if (! 
attr) tb_printf("<%s>\n", name); else { tb_printf("<%s", name); for (;  attr; attr = attr->next) tb_printf(" %s = \"%s\"", tb_pstring_cstr(&attr->name), tb_pstring_cstr(&attr->data)); tb_printf(">\n"); } } break; //Element end node event, for example:</ element> case TB_XML_READER_EVENT_ELEMENT_END:  { tb_size_t t = tb_xml_reader_level(reader); while (t--) tb_printf("\t"); tb_printf("</%s>\n", tb_xml_reader_element(reader)); } break; //Text Node Events case TB_XML_READER_EVENT_TEXT:  { tb_size_t t = tb_xml_reader_level(reader); while (t--) tb_printf("\t"); tb_printf("%s", tb_xml_reader_text(reader)); tb_printf("\n"); } break; //CDATA node event, for example:<! CDATA[data]> case TB_XML_READER_EVENT_CDATA:  { tb_size_t t = tb_xml_reader_level(reader); while (t--) tb_printf("\t"); tb_printf("<![CDATA[%s]]>", tb_xml_reader_cdata(reader)); tb_printf("\n"); } break; //Comment node events, for example:<-- comment --> case TB_XML_READER_EVENT_COMMENT:  { tb_size_t t = tb_xml_reader_level(reader); while (t--) tb_printf("\t"); tb_printf("<!--%s-->", tb_xml_reader_comment(reader)); tb_printf("\n"); } break; default: break; } } //Release Reader tb_xml_reader_exit(reader); } } //Release stream tb_stream_exit(istream); }

If you only want to parse part of a document, you can use tb_xml_reader_goto to jump to a specified path and start parsing from there:

 //Initialize Stream tb_stream_ref_t istream = tb_stream_init_from_url(" http://localhost/file.xml "); if (istream) { //Open Stream if (tb_stream_open(istream)) { //Initialize Reader tb_xml_reader_ref_t reader = tb_xml_reader_init(istream); if (reader) { //Jump the reader to the specified path if (tb_xml_reader_goto(reader, "/root/node/data")) { //Load data, root is the root node tb_xml_node_t* root = tb_xml_reader_load(reader); if (root)  { //Resolve node operation // ... //Release root node tb_xml_node_exit(root); } } //Release Reader tb_xml_reader_exit(reader); } } //Release stream tb_stream_exit(istream); }

The tb_xml_node_t node type is actually a tree built from linked lists. Once the entire object tree has been loaded, you can easily traverse it:

 //Node type definition description. All other nodes inherit this node typedef struct __tb_xml_node_t { ///Type of node tb_size_t                    type; ///Name of the node tb_pstring_t                 name; ///Data of nodes tb_pstring_t                 data; ///Next node, single linked list struct __tb_xml_node_t*      next; //Head of child node, single linked list struct __tb_xml_node_t*      chead; //Tail of child node struct __tb_xml_node_t*      ctail; //Number of child nodes tb_size_t                    csize; //Head of attribute node, single linked list struct __tb_xml_node_t*      ahead; //Tail of attribute node struct __tb_xml_node_t*      atail; //Number of attribute nodes tb_size_t                    asize; ///Parent node struct __tb_xml_node_t*      parent; }tb_xml_node_t;

Traverse all child nodes:

 tb_xml_node_t* head = node->chead; for (node = head;  node; node = node->next) { //Only element nodes are processed here:<element></ Element>or<element/> if (node->type == TB_XML_NODE_TYPE_ELEMENT) { //Name size of element node tb_size_t m = tb_pstring_size(&node->name); //Print Element Node Name Child tb_trace_d("%s", tb_pstring_cstr(&node->name)); }

}

Traverse all attribute nodes:

 tb_xml_node_t* head = node->ahead; for (node = head;  node; node = node->next) { //Print the name and data of the attribute node, for example: attr_name="data" tb_trace_d("%s=\"%s\"", tb_pstring_cstr(&node->name), tb_pstring_cstr(&node->data)); }

Expand to read the full text
Loading
Click to lead the topic 📣 Post and join the discussion 🔥
Reward
zero comment
five Collection
zero fabulous
 Back to top
Top