8.4.6. ODS Workbook

import logging
import pprint
import xml.etree.cElementTree as dom
import zipfile
import datetime

from stingray.workbook.base import Workbook
import stingray.sheet
import stingray.cell

We should use iterparse rather than simply parsing the entire XML document. If the document is large, then we can’t hold it all in memory.

class workbook.ods.ODS_Workbook

Extract sheets, rows and cells from a OOO ODS format file.

In addition to the superclass attributes, some additional unique attributes are introduced here.

zip_archive

A zip archive for this file.

tables

A mapping that provides sheet names.

class ODS_Workbook( Workbook ):
    """Standard OOO ODS document.
    Locate sheets and rows within a given sheet.
    """
    ODS_NS = {
    "office":"urn:oasis:names:tc:opendocument:xmlns:office:1.0",
    "table":"urn:oasis:names:tc:opendocument:xmlns:table:1.0",
    "text":"urn:oasis:names:tc:opendocument:xmlns:text:1.0",
    }
    date_format = "%Y-%m-%d"
    def __init__( self, name, file_object=None ):
        """Prepare the workbook for reading.
        :param name: File name
        :param file_object: Optional file-like object.  If omitted, the named file is opened.
        """
        super().__init__( name, file_object )
        self.zip_archive= zipfile.ZipFile( file_object or name, "r" )
        self._prepare()

As preparation for reading these files, we locate all the sheet names.

def _prepare( self ):
    self._locate_sheets()

Locating all the sheets is a matter of doing an XPath search for body/spreadsheet/table and getting the name attribute from the <table name="name"> tags.

def _locate_sheets( self ):
    """Create ``tables`` map from name to table."""
    workbook_zip= self.zip_archive.getinfo("content.xml")
    workbook_doc= dom.parse( self.zip_archive.open(workbook_zip) )
    name_attr_id= dom.QName( self.ODS_NS["table"], "name" )
    logging.debug( dom.tostring( workbook_doc.getroot() ) )
    self.tables= dict(
        (t.attrib[name_attr_id],t)
        for t in workbook_doc.findall("office:body/office:spreadsheet/table:table",
            namespaces=self.ODS_NS) )

An iterparse version to locate sheets would look for start of table tags and then get the name attribute from that tag.

ODS_Workbook.sheets()

Return the list of sheets for this workbook.

def sheets( self ):
    return self.tables.keys()

We can build an eager sheet.Row or a sheet.LazyRow from the available data. The eager Row includes the conversions. The LazyRow defers the conversions to ODS_Workbook.row_get().

In ODS documents, the cell’s value can be carried in the value attribute or it can be a mixed content value of the element. There are three cases.

  • <table-cell value-type="type" value="value">...</table-cell>
  • <table-cell value-type="type" date-value="value">...</table-cell>
  • <table-cell value-type="type">value</table-cell>
ODS_Workbook.rows_of(sheet)

Iterate through rows of the given sheet.

def rows_of( self, sheet ):
    """Iterator over rows as a list of Cells for a named worksheet."""
    for r, row_doc in enumerate(
        self.tables[sheet.name].findall( "table:table-row", namespaces=self.ODS_NS ) ):
        row= []
        for c, cell_doc in enumerate( row_doc.findall( "table:table-cell", namespaces=self.ODS_NS ) ):
            row.append( self.cell(cell_doc) )
        yield row
ODS_Workbook.row_get(row, attribute)

Low-level get of a particular attribute from the given row.

def row_get( self, row, attribute ):
    """Create a Cell from the row's data."""
    return row[attribute.position]

Build a subclass of cell.Cell from the current type name and value.

Todo

Refactor this, it feels clunky.

def cell( self, cell_doc ):
    logging.debug( dom.tostring(cell_doc) )
    value_attr_id= dom.QName( self.ODS_NS['office'], 'value' )
    date_attr_id= dom.QName( self.ODS_NS['office'], 'date-value' )
    type_attr_id= dom.QName( self.ODS_NS['office'], 'value-type' )
    # Get the type
    try:
        type_name= cell_doc.attrib[type_attr_id]
    except KeyError:
        return stingray.cell.EmptyCell( '', self )
    value= None
    # Date value as attribute?
    if not value:
        try:
            value= cell_doc.attrib[date_attr_id]
        except KeyError:
            pass
    # Other value as attribute?
    if not value:
        try:
            value= cell_doc.attrib[value_attr_id]
        except KeyError:
            pass
    # No value attributes, get *all* the text content.
    if not value:
        value= "".join( x for x in cell_doc.itertext() )
    if not value:
        # TODO: Proper warning.
        dom.dump( cell_doc )
    logging.debug( type_name, repr(value) )
    if type_name == "string":
        return stingray.cell.TextCell( value, self )
    elif type_name == "float":
        return stingray.cell.NumberCell( float(value), self )
    elif type_name == "date":
        theDate= datetime.datetime.strptime(
            value, ODS_Workbook.date_format )
        return stingray.cell.FloatDateCell( theDate, self )
    elif type_name == "boolean":
        return stingray.cell.BooleanCell(
            float(value.upper()=='TRUE'),  self )
    elif type_name == "empty":
        return stingray.cell.EmptyCell( '', self )
    else:
        raise Exception( "Unknown cell {0}".format( dom.tostring(cell_doc) ) )

An iterparse version of building a row would look for start of table tags and then get the name attribute from that tag just to locate the right sheet.

Once the sheet was located, then the row and cell tags would be used

  • At <table-row start: increment row number, reset buffer
  • At <table-row end: yield the row
  • At <table-cell start: check for empty, date, float, boolean types, which are available as an attribute at start. For strings, start accumulating string values.
  • At <table-cell end: finalize the accumulated value.