8.4.5. XLSX or XLSM Workbook

import logging
import pprint
import xml.etree.cElementTree as dom
import re
import zipfile
from collections import defaultdict

from stingray.workbook.base import Workbook
import stingray.sheet
import stingray.cell
class workbook.xlsx.XLSX_Workbook

Extract sheets, rows and cells from an XLSX format file.

We’re opening a ZIP archive and parsing the various XML documents that we find therein.

The ElementTree incremental parser provides parse “events” for specific tags, allowing for lower-memory parsing of the sometimes large XML documents.

See http://effbot.org/zone/element-iterparse.htm

The class as a whole defines some handy constants like XML namespaces and a pattern for parsing Cell ID’s to separate the letters from the numbers.

In addition to the superclass attributes, some additional unique attributes are introduced here.

zip_archive

A zip archive for this file.

strings_dict

The strings in this workbook

class XLSX_Workbook( Workbook ):
    """ECMA Standard XLSX or XLSM documents.
    Locate sheets and rows within a given sheet.

    See http://www.ecma-international.org/publications/standards/Ecma-376.htm
    """
    # Relevant subset of namespaces used
    XLSX_NS = {
    "main":"http://schemas.openxmlformats.org/spreadsheetml/2006/main",
    "r":"http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "rel":"http://schemas.openxmlformats.org/package/2006/relationships",
    }
    cell_id_pat = re.compile( r"(\D+)(\d+)" )
    def __init__( self, name, file_object=None ):
        """Prepare the workbook for reading.
        :param name: File name
        :param file_object: Optional file-like object.  If omitted, the named file is opened.
        """
        super().__init__( name, file_object )
        self.zip_archive= zipfile.ZipFile( file_object or name, "r" )
        self._prepare()

The are two preparation steps required for reading these files. First, the sheets must be located. This involves resolving internal rID numbers. Second, the shared strings need to be loaded into memory.

def _prepare( self ):
    self._locate_sheets()
    self._get_shared_strings()

Locate all sheets involves building a name_to_id mapping and and id_to_member mapping. This allows is to map the user-oriented name to an id and the id to the XLSX zipfile member.

def _locate_sheets( self ):
    """Locate the name to id mapping and the id to member mapping.
    """
    # 1a. Open "workbook.xml" member.
    workbook_zip= self.zip_archive.getinfo("xl/workbook.xml")
    workbook_doc= dom.parse( self.zip_archive.open(workbook_zip) )
    # 1b. Get a dict of sheet names and their rIdx values.
    key_attr_id= 'name'
    val_attr_id= dom.QName( self.XLSX_NS['r'], 'id' )
    self.name_to_id = dict(
        ( s.attrib[key_attr_id], s.attrib[val_attr_id] )
        for s in workbook_doc.findall("*/main:sheet", namespaces=self.XLSX_NS)
    )
    logging.debug( self.name_to_id )

    # 2a. Open the "_rels/workbook.xml.rels" member
    rels_zip= self.zip_archive.getinfo("xl/_rels/workbook.xml.rels")
    rels_doc= dom.parse( self.zip_archive.open(rels_zip) )
    # 2b. Get a dict of rIdx to Target member name
    logging.debug( dom.tostring( rels_doc.getroot() ) )
    key_attr_id= 'Id'
    val_attr_id= 'Target'
    self.id_to_member = dict(
        ( r.attrib[key_attr_id], r.attrib[val_attr_id] )
        for r in rels_doc.findall("rel:Relationship", namespaces=self.XLSX_NS)
    )
    logging.debug( self.id_to_member )

Get Shared Strings walks a fine line. Ideally, we’d like to parse the document and simply use itertext to gather all of the text within a given string instance (<si>) tag. However.

In practice, these documents can be so huge that they don’t fit in memory comfortably. We rely on incremental parsing via the iterparse function.

def _get_shared_strings( self ):
    """Build ``strings_dict`` with all shared strings.
    """
    self.strings_dict= defaultdict(str)
    count= 0
    text_tag= dom.QName( self.XLSX_NS['main'], "t" )
    string_tag= dom.QName( self.XLSX_NS['main'], "si" )
    # 1. Open the "xl/sharedStrings.xml" member
    sharedStrings_zip= self.zip_archive.getinfo("xl/sharedStrings.xml")
    for event, element in dom.iterparse(
        self.zip_archive.open( sharedStrings_zip ), events=('end',) ):
        logging.debug( event, element.tag )
        if element.tag == text_tag:
            self.strings_dict[ count ]+= element.text
        elif element.tag == string_tag:
            count += 1
        element.clear()
    logging.debug( self.strings_dict )

The shared strings may be too massive for in-memory incremental parsing. We can create a temporary extract file to handle this case. Here’s the kind of code we might use.

with tempfile.TemporaryFile( ) as temp:
    self.zip_archive.extract( sharedStrings_mbr, temp.filename )
    for event, element in dom.iterparse( temp.filename ):
        process event and element
XLSX_Workbook.sheets()

Return the list of sheets for this workbook.

def sheets( self ):
    return self.name_to_id.keys()

Translate a col-row pair from (letter, number) to proper 0-based Python index of (row, col).

@staticmethod
def make_row_col( col_row_pair ):
    col, row = col_row_pair
    cn = 0
    for char in col_row_pair[0]:
        cn = cn*26 + (ord(char)-ord("A")+1)
    return int(row), cn-1

We can build an eager sheet.Row or a sheet.LazyRow from the available data. The eager sheet.Row is built from cell.Cell objects. The sheet.LazyRow delegates the creation of cell.Cell objects to Workbook.row_get().

This uses an incremental parser, also. There are four kinds of tags that have to be located.

  • <row>row</row>, end event. Finish (and yield) the row of cells. Since XLSX is sparse, missing empty cells must be filled in.
  • <c t="type" r="id">cell</c>.
    • Start event for c. Get the cell type and id. Empty the value accumulator.
    • End event for c. Save the accumulated value. This allows the cell to have mixed content model.
  • <v>value</v>, end event. Use the cell() method to track down enough information to build the Cell instance.
XLSX_Workbook.rows_of(sheet)

Iterate through rows of the given sheet.

def rows_of( self, sheet ):
    """Iterator over rows as a list of Cells for a named worksheet."""
    # 1. Map user name to member.
    rId = self.name_to_id[sheet.name]
    self.sheet_member_name = self.id_to_member[rId]
    # 2. Open member.
    sheet_zip= self.zip_archive.getinfo("xl/"+self.sheet_member_name)
    self.row= {}
    # 3. Assemble each row, allowing for missing cells.
    row_tag= dom.QName(self.XLSX_NS['main'], "row")
    cell_tag= dom.QName(self.XLSX_NS['main'], "c")
    value_tag= dom.QName(self.XLSX_NS['main'], "v")
    format_tag= dom.QName(self.XLSX_NS['main'], "f")

    for event, element in dom.iterparse(
        self.zip_archive.open(sheet_zip), events=('start','end') ):
        logging.debug( element.tag, repr(element.text) )
        if event=='end' and element.tag == row_tag:
            # End of row: fill in missing cells
            if self.row.keys():
                data= stingray.sheet.Row( sheet, *(
                    self.row.get(i, stingray.cell.EmptyCell('', self))
                    for i in range(max(self.row.keys())+1) ) )
                yield data
            else:
                yield stingray.sheet.Row( sheet )
            self.row= {}
            element.clear()
        elif event=='end' and element.tag == cell_tag:
            # End of cell: consolidate the final string
            self.row[self.row_col[1]] = self.value
            self.value= stingray.cell.EmptyCell( '', self )
        elif event=='start' and element.tag == cell_tag:
            # Start of cell: collect a string in pieces.
            self.cell_type= element.attrib.get('t',None)
            self.cell_id = element.attrib['r']
            id_match = self.cell_id_pat.match( self.cell_id )
            self.row_col = self.make_row_col( id_match.groups() )
            self.value= stingray.cell.EmptyCell( '', self )
        elif event=='end' and element.tag == value_tag:
            # End of a value; what type was it?
            self.value= self.cell( element )

        elif event=='end' and element.tag == format_tag:
            pass # A format string
        else:
            pass
            logging.debug( "Ignoring", end="" ) # Numerous bits of structure exposed.
            logging.debug( dom.tostring(element) )
XLSX_Workbook.row_get(row, attribute)

Low-level get of a particular attribute from the given row.

def row_get( self, row, attribute ):
    """Create a Cell from the row's data."""
    return row[attribute.position]
XLSX_Workbook.cell(row, element)

Build a subclass of cell.Cell from the current value tag content plus the containing cell type information.

def cell( self, element ):
    """Create a proper :py:class:`cell.Cell` subclass from cell and value information."""
    logging.debug( self.cell_type, self.cell_id, element.text )
    if self.cell_type is None or self.cell_type == 'n':
        try:
            return stingray.cell.NumberCell( float(element.text), self )
        except ValueError:
            print( self.cell_id, element.attrib, element.text )
            return None
    elif self.cell_type == "s":
        try:
            # Shared String?
            return stingray.cell.TextCell( self.strings_dict[int(element.text)], self )
        except ValueError:
            # Inline String?
            logging.debug( self.cell_id, element.attrib, element.text )
            return stingray.cell.TextCell( element.text, self )
        except KeyError:
            # Not a valid shared string identifier?
            logging.debug( self.cell_id, element.attrib, element.text )
            return stingray.cell.TextCell( element.text, self )
    elif self.cell_type == "b":
        return stingray.cell.BooleanCell( float(element.text), self )
    elif self.cell_type == "d":
        return stingray.cell.FloatDateCell( float(element.text), self )
    elif self.cell_type == "e":
        return stingray.cell.ErrorCell( element.text, self )
    else:
        # 'str' (formula), 'inlineStr' (string), 'e' (error)
        print( self.cell_type, self.cell_id, element.attrib, element.text )
        logging.debug( self.strings_dict.get(int(element.text)) )
        return None