XMLConverter¶

A class to convert data from document tree format (nested dict) to and from XML.

Parameters:

Name	Type	Description	Default
`data_model`	`DataModel`	The `DataModel` object used to parse XML files	required
`document_tree`	`dict`	Data in the document tree format (optional, can be built later by the `parse_xml` method)	`None`

Source code in xml2db/xml_converter.py

def __init__(self, data_model: "DataModel", document_tree: dict = None):
    """Set up a converter between the document tree format (nested dict) and XML.

    Args:
        data_model: The [`DataModel`](./data_model.md#xml2db.model.DataModel) object used to parse XML files
        document_tree: Data in the document tree format; may be left as `None` and
            built later by the `parse_xml` method
    """
    # The tree is optional at construction time; `parse_xml` overwrites it.
    self.document_tree = document_tree
    self.model = data_model

`parse_xml(xml_file, file_path=None, skip_validation=False, recover=False, iterparse=True)` ¶

Parse an XML document into a nested dict and perform the simplifications defined in the DataModel object ("pull" child to upper level, transform a choice model into "type" and "value" fields or concatenate children as string).

Parameters:

Name	Type	Description	Default
`xml_file`	`Union[str, BytesIO]`	An XML file path or file content to be converted	required
`file_path`	`str`	The file path to be printed in logs	`None`
`skip_validation`	`bool`	Whether to skip validating the XML against the schema before parsing	`False`
`recover`	`bool`	Try to process malformed XML (lxml option)	`False`
`iterparse`	`bool`	Parse XML using iterative parsing, which is a bit slower but uses less memory	`True`

Returns:

Type	Description
`tuple`	The parsed data in the document tree format (nested dict)

Source code in xml2db/xml_converter.py

def parse_xml(
    self,
    xml_file: Union[str, BytesIO],
    file_path: str = None,
    skip_validation: bool = False,
    recover: bool = False,
    iterparse: bool = True,
) -> tuple:
    """Parse an XML document into a nested dict and perform the simplifications defined in the
    DataModel object ("pull" child to upper level, transform a choice model into "type" and "value"
    fields or concatenate children as string).

    The result is also stored in `self.document_tree`.

    Args:
        xml_file: An XML file path or file content to be converted
        file_path: The file path to be printed in logs
        skip_validation: Whether to skip validating the XML against the schema before parsing
        recover: Try to process malformed XML (lxml option)
        iterparse: Parse XML using iterative parsing, which is a bit slower but uses less memory

    Returns:
        The parsed data in the document tree format (nested dict)

    Raises:
        ValueError: If validation is enabled and the XML does not conform with the schema
    """

    # A file-like input is read to its end by every parse. Remember where it
    # starts so that it can be rewound before being read a second time.
    seekable = getattr(xml_file, "seekable", None)
    rewindable = callable(seekable) and seekable()
    start_pos = xml_file.tell() if rewindable else None

    # A full element tree is needed either because iterative parsing is disabled,
    # or because validation must run on the tree parsed in recover mode.
    xt = None
    if not iterparse or (not skip_validation and recover):
        logger.info("Parsing XML file")
        xt = etree.parse(xml_file, parser=etree.XMLParser(recover=recover))

    if skip_validation:
        logger.info("Skipping XML file validation")
    else:
        logger.info("Validating XML file against the schema")
        # Reuse the tree parsed above when there is one; otherwise nothing has
        # been read from `xml_file` yet and it can be parsed as is.
        tree_to_validate = xt if xt is not None else etree.parse(xml_file)
        if not self.model.lxml_schema.validate(tree_to_validate):
            logger.error(f"XML file {file_path} does not conform with the schema")
            raise ValueError(
                f"XML file {file_path} does not conform with the schema"
            )
        logger.info("XML file conforms with the schema")

    if iterparse:
        # The stream may have been consumed by one of the parses above.
        if rewindable:
            xml_file.seek(start_pos)
        self.document_tree = self._parse_iterative(xml_file, recover)
    else:
        self.document_tree = self._parse_element_tree(xt)

    return self.document_tree

`to_xml(out_file=None, nsmap=None, indent='  ')` ¶

Convert a document tree (nested dict) into an XML file

Parameters:

Name	Type	Description	Default
`out_file`	`str`	If provided, write output to a file.	`None`
`nsmap`	`dict`	An optional namespace mapping.	`None`
`indent`	`str`	A string used as indent in XML output.	`'  '`

Returns:

Type	Description
`Element`	The etree object corresponding to the root XML node.

Source code in xml2db/xml_converter.py

def to_xml(
    self, out_file: str = None, nsmap: dict = None, indent: str = "  "
) -> etree.Element:
    """Convert a document tree (nested dict) into an XML file

    The XML is built from `self.document_tree`. It is written to disk only when
    `out_file` is provided; `indent` is applied only in that case.

    Args:
        out_file: If provided, write output to a file.
        nsmap: An optional namespace mapping.
        indent: A string used as indent in XML output.

    Returns:
        The etree object corresponding to the root XML node.
    """
    doc = self._make_xml_node(
        self.document_tree,
        self.model.tables[self.document_tree[0]].name,
        nsmap,
    )
    if self.model.tables[self.model.root_table].is_virtual_node:
        # The root table is a virtual node: return its first child instead
        # (or None if it has no child).
        doc = next(iter(doc), None)
    if out_file:
        etree.indent(doc, space=indent)
        serialized = etree.tostring(
            doc,
            pretty_print=True,
            encoding="utf-8",
            xml_declaration=True,
        ).decode("utf-8")
        # The XML declaration announces UTF-8, so the file must be written as
        # UTF-8 whatever the platform default encoding is.
        with open(out_file, "wt", encoding="utf-8") as f:
            f.write(serialized)
    return doc

XMLConverter¶

parse_xml(xml_file, file_path=None, skip_validation=False, recover=False, iterparse=True) ¶

to_xml(out_file=None, nsmap=None, indent='  ') ¶

`parse_xml(xml_file, file_path=None, skip_validation=False, recover=False, iterparse=True)` ¶

`to_xml(out_file=None, nsmap=None, indent='  ')` ¶