Skip to content

XMLConverter

A class to convert data from document tree format (nested dict) to and from XML.

Parameters:

Name Type Description Default
data_model DataModel

The DataModel object used to parse XML files

required
document_tree dict

Data in the document tree format (optional, can be built later by the parse_xml method)

None
Source code in xml2db/xml_converter.py
def __init__(self, data_model: "DataModel", document_tree: dict = None):
    """Create a converter bound to a data model, optionally seeded with existing data.

    Args:
        data_model: The [`DataModel`](./data_model.md#xml2db.model.DataModel) object, kept as `self.model`
        document_tree: Data in the document tree format, kept as `self.document_tree`
            (optional, `None` until a tree is provided or built by the `parse_xml` method)
    """
    # Both attributes are plain references; nothing is copied or validated here.
    self.document_tree, self.model = document_tree, data_model

parse_xml(xml_file, file_path=None, skip_validation=False, recover=False, iterparse=True)

Parse an XML document into a nested dict and perform the simplifications defined in the DataModel object ("pull" child to upper level, transform a choice model into "type" and "value" fields or concatenate children as string).

Parameters:

Name Type Description Default
xml_file Union[str, BytesIO]

An XML file path or file content to be converted

required
file_path str

The file path to be printed in logs

None
skip_validation bool

Whether to skip validating the XML against the schema before parsing

False
recover bool

Try to process malformed XML (lxml option)

False
iterparse bool

Parse XML using iterative parsing, which is a bit slower but uses less memory

True

Returns:

Type Description
tuple

The parsed data in the document tree format (nested dict)

Source code in xml2db/xml_converter.py
def parse_xml(
    self,
    xml_file: Union[str, BytesIO],
    file_path: str = None,
    skip_validation: bool = False,
    recover: bool = False,
    iterparse: bool = True,
) -> tuple:
    """Parse an XML document into a nested dict and perform the simplifications defined in the
    DataModel object ("pull" child to upper level, transform a choice model into "type" and "value"
    fields or concatenate children as string).

    The result is stored in `self.document_tree` and also returned.

    Args:
        xml_file: An XML file path or file content to be converted
        file_path: The file path to be printed in logs
        skip_validation: Whether to skip validating the XML against the schema before parsing
        recover: Try to process malformed XML (lxml option)
        iterparse: Parse XML using iterative parsing, which is a bit slower but uses less memory

    Returns:
        The parsed data in the document tree format (nested dict)

    Raises:
        ValueError: If validation is enabled and the XML does not conform with the schema
    """
    # A file-like input can be read up to twice (once for the full parse/validation,
    # once for the iterative pass). Remember where it started so the second read does
    # not begin at end-of-stream. Paths (str) and non-seekable streams are left alone.
    is_seekable = getattr(xml_file, "seekable", None)
    start_pos = xml_file.tell() if callable(is_seekable) and is_seekable() else None

    xt = None
    # A full tree is needed when not parsing iteratively, or when validating a
    # document that must be read in recover mode.
    if not iterparse or (not skip_validation and recover):
        logger.info("Parsing XML file")
        xt = etree.parse(xml_file, parser=etree.XMLParser(recover=recover))

    if skip_validation:
        logger.info("Skipping XML file validation")
    else:
        logger.info("Validating XML file against the schema")
        # Explicit None check: do not rely on the truthiness of an lxml object.
        tree_to_validate = xt if xt is not None else etree.parse(xml_file)
        if not self.model.lxml_schema.validate(tree_to_validate):
            message = f"XML file {file_path} does not conform with the schema"
            logger.error(message)
            raise ValueError(message)
        logger.info("XML file conforms with the schema")

    if iterparse:
        if start_pos is not None:
            # Rewind the stream consumed above before handing it to the iterative parser.
            xml_file.seek(start_pos)
        self.document_tree = self._parse_iterative(xml_file, recover)
    else:
        self.document_tree = self._parse_element_tree(xt)

    return self.document_tree

to_xml(out_file=None, nsmap=None, indent='  ')

Convert a document tree (nested dict) into an XML file

Parameters:

Name Type Description Default
out_file str

If provided, write output to a file.

None
nsmap dict

An optional namespace mapping.

None
indent str

A string used as indent in XML output.

'  '

Returns:

Type Description
Element

The etree object corresponding to the root XML node.

Source code in xml2db/xml_converter.py
def to_xml(
    self, out_file: str = None, nsmap: dict = None, indent: str = "  "
) -> etree.Element:
    """Convert a document tree (nested dict) into an XML file

    Args:
        out_file: If provided, write output to a file (always encoded as UTF-8).
        nsmap: An optional namespace mapping.
        indent: A string used as indent in XML output. It is only applied when
            `out_file` is provided, in which case the returned node is indented in place.

    Returns:
        The etree object corresponding to the root XML node.
    """
    doc = self._make_xml_node(
        self.document_tree,
        self.model.tables[self.document_tree[0]].name,
        nsmap,
    )
    if self.model.tables[self.model.root_table].is_virtual_node:
        # The root table is a virtual wrapper: return its first child instead
        # (None when the wrapper has no children).
        doc = next(iter(doc), None)
    if out_file:
        etree.indent(doc, space=indent)
        xml_text = etree.tostring(
            doc,
            pretty_print=True,
            encoding="utf-8",
            xml_declaration=True,
        ).decode("utf-8")
        # The XML declaration announces UTF-8, so the file must be written as UTF-8
        # regardless of the platform's default text encoding.
        with open(out_file, "wt", encoding="utf-8") as f:
            f.write(xml_text)
    return doc