8.4.7. Apple iWorks Numbers ‘09 Workbook

The Stingray model of sheet/row/cell structure does not easily fit the Numbers sheet/table/row/cell structure. How can we handle the extra layer of names introduced by Numbers?

Option 1: navigation hierarchy.

Workbook ➞ new layer (Numbers “Workspace”) ➞ Sheet (Numbers “Table”) ➞ Row ➞ Cell

Option 2: navigation hierarchy.

Combine (Workspace,Table) into a 2-tuple, and call this a “sheet” name when working with Numbers documents.

This will fit with Stingray acceptably.

The imports required to process this kind of file.

import logging
import pprint
import xml.etree.cElementTree as dom
import zipfile
import datetime
import decimal

from stingray.workbook.base import Workbook
import stingray.sheet
import stingray.cell

The iWork Numbers 09 format is a Zip file with an XML document inside it. There may be slight variations between native Numbers ‘09 and Numbers ‘13 doing a “save as” in Numbers ‘09 format. It’s not clear; we haven’t done exhaustive checking.

Numbers ‘13 is entirely different. See Apple iWorks Numbers ‘13 Workbook.

class workbook.numbers09.Numbers09_Workbook

Extract sheets, rows and cells from a Numbers ‘09 format file.

The .numbers “file” is a ZIP file.

The index.xml member is the interesting part of the archive.

In addition to the superclass attributes, some additional unique attributes are introduced here.

zip_archive

A zip archive for this file.

workspace

The “workspaces”: pages with tables inside them.

class Numbers09_Workbook( Workbook ):
    """Reader for Mac OS X Numbers workbooks saved in the iWork '09 format.

    The document is opened as a ZIP archive (``zip_archive``) and then
    ``_prepare()`` is called to index its contents.
    """
    # Namespace prefixes used by the XPath queries against the document.
    NUMBERS_NS = dict(
        ls="http://developer.apple.com/namespaces/ls",
        sf="http://developer.apple.com/namespaces/sf",
        sfa="http://developer.apple.com/namespaces/sfa",
    )
    row_debug = False

    def __init__( self, name, file_object=None ):
        """Open the archive and prepare the workbook for reading.

        :param name: File name; opened when no ``file_object`` is supplied.
        :param file_object: Optional file-like object. When given (and
            truthy) it is handed to :class:`zipfile.ZipFile` instead of
            ``name``.
        """
        super().__init__( name, file_object )
        archive_source = file_object or name
        self.zip_archive = zipfile.ZipFile( archive_source, "r" )
        self._prepare()

As preparation for reading these files, we locate all the sheet names and all the number styles.

def _prepare( self ):
    """Locate sheets/tables and styles."""
    root= dom.parse( self.zip_archive.open('index.xml') ).getroot()
    self._locate_sheets(root)
    self._get_styles(root)

Locating all the sheets is a matter of doing an XPath search for workspace-array/workspace and getting the workspace-name attribute from each workspace tag; the table names come from the name attribute of the tabular-model tags described below.

Within each workspace we have to find page-info/tabular-info/tabular-model to get the tables within the workspaces.

def _locate_sheets( self, root ):
    """Create ``workspace_table`` map from name to workspace and table."""
    self.workspace= dict()

    ws_name_attr= dom.QName( self.NUMBERS_NS["ls"], 'workspace-name' )
    name_attr= dom.QName( self.NUMBERS_NS["sf"], 'name' )
    workspace_array= root.find("ls:workspace-array", namespaces=self.NUMBERS_NS )
    for workspace in workspace_array.findall('.//ls:workspace', namespaces=self.NUMBERS_NS ):
        # Populate tables within this workspace.
        tables= dict()
        page_info = workspace.find('ls:page-info', namespaces=self.NUMBERS_NS)
        for tabular_info in page_info.findall('.//sf:tabular-info', namespaces=self.NUMBERS_NS):
            tabular_model = tabular_info.find( 'sf:tabular-model', namespaces=self.NUMBERS_NS)
            tables[ tabular_model.get(name_attr) ] = tabular_model
        self.workspace[ workspace.get(ws_name_attr) ]= workspace, tables

Locate a “data source” within the XML document. Create Cell instances.

def _datasource( self, grid ):
    """The data source for cell values within a grid.
    This yields each individual cell value, transformed into
    string, Decimal, datetime.
    """
    datasource = grid.find('.//sf:datasource', namespaces=self.NUMBERS_NS)
    for cell_doc in datasource:
        yield self.cell( cell_doc )
    # or return map( self.cell, datasource )
Numbers09_Workbook.cell(cell)

Create a Cell instance from the decoded data.

def cell( self, cell ):
    """Create a Cell instance from one ``datasource`` child element.

    The element's tag selects the kind of Cell:

    - ``sf:d`` -- DateCell; ``sf:cell-date`` is a count of seconds added to
      2001-01-01.
    - ``sf:f`` -- formula; NumberCell built from the ``sf:r/sf:rn`` result,
      otherwise ErrorCell.
    - ``sf:g`` -- EmptyCell.
    - ``sf:n`` -- NumberCell.
    - ``sf:o``, ``sf:s`` -- EmptyCell, after a ``_cell_warning()``.
    - ``sf:t`` -- TextCell.
    - ``sf:b`` -- BooleanCell.
    - ``sf:pm`` -- popup menu; Text or Number cell for the menu choice whose
      ``sfa:ID`` matches the ``sfa:IDREF`` of ``sf:proxied-cell-ref``.

    :param cell: XML element for one cell.
    :returns: a ``stingray.cell`` instance, or ``None`` for a popup menu whose
        selected choice is not among its menu choices.
    :raises Exception: for an unrecognised cell tag or an unrecognised
        popup-menu choice tag.
    """
    # Serialising every cell is costly; only do it when the root logger
    # would actually emit the DEBUG record.
    if logging.getLogger().isEnabledFor( logging.DEBUG ):
        logging.debug( dom.tostring(cell) )

    date_tag= dom.QName( self.NUMBERS_NS["sf"], 'd' )
    date_attr= dom.QName( self.NUMBERS_NS["sf"], 'cell-date' )
    formula_tag= dom.QName( self.NUMBERS_NS["sf"], 'f' )
    s_attr= dom.QName( self.NUMBERS_NS["sf"], 's' )
    v_attr= dom.QName( self.NUMBERS_NS["sf"], 'v' )
    general_tag= dom.QName( self.NUMBERS_NS["sf"], 'g' )
    number_tag= dom.QName( self.NUMBERS_NS["sf"], 'n' )
    text_tag= dom.QName( self.NUMBERS_NS["sf"], 't' )
    o_tag= dom.QName( self.NUMBERS_NS["sf"], 'o' )
    span_tag= dom.QName( self.NUMBERS_NS["sf"], 's' )
    bool_tag= dom.QName( self.NUMBERS_NS["sf"], 'b' )
    popup_menu_tag= dom.QName( self.NUMBERS_NS["sf"], 'pm' )
    IDREF_attr= dom.QName( self.NUMBERS_NS["sfa"], 'IDREF' )
    ID_attr= dom.QName( self.NUMBERS_NS["sfa"], 'ID' )
    fs_attr= dom.QName( self.NUMBERS_NS["sf"],"fs")

    if cell.tag == date_tag:
        seconds= int(cell.attrib[date_attr])
        epoch= datetime.datetime(2001, 1, 1)
        delta= datetime.timedelta( seconds=seconds )
        theDate= epoch + delta
        return stingray.cell.DateCell( theDate, self )

    elif cell.tag == formula_tag: # formula or error.
        s= cell.get(s_attr)
        fo= cell.find('sf:fo', namespaces=self.NUMBERS_NS)
        # Numeric Result? What about non-numeric results?
        r= cell.find('sf:r', namespaces=self.NUMBERS_NS)
        # Compare against None explicitly: the truth value of an Element
        # reflects its number of children, not whether it was found.
        rn= None if r is None else r.find('sf:rn', namespaces=self.NUMBERS_NS)
        if rn is not None:
            # Result; a formula with no value is treated as zero.
            value= self._to_decimal( rn.get(v_attr, '0'), s )
            return stingray.cell.NumberCell( value, self )
        # Error, or a result without a numeric ``sf:rn`` node.
        # NOTE(review): non-numeric results are not decoded yet and are
        # reported as error cells here -- confirm against real documents.
        #self._cell_warning("Formula error", cell)
        formula_string= None if fo is None else fo.get(fs_attr)
        value= "#Error in {0}".format(formula_string)
        return stingray.cell.ErrorCell( value, self )

    elif cell.tag == general_tag: # General?
        return stingray.cell.EmptyCell( '', self )
    elif cell.tag == number_tag: # Number
        value= self._decode_number( cell )
        return stingray.cell.NumberCell( value, self )
    elif cell.tag == o_tag: #??
        self._cell_warning("Unknown cell type", cell)
        return stingray.cell.EmptyCell( '', self )
    elif cell.tag == span_tag: # Span?
        self._cell_warning("Unknown cell type", cell)
        return stingray.cell.EmptyCell( '', self )
    elif cell.tag == text_tag: # Text
        value= self._decode_text( cell )
        return stingray.cell.TextCell( value, self )
    elif cell.tag == bool_tag: # Boolean
        value= self._decode_number( cell )
        return stingray.cell.BooleanCell( value, self )
    elif cell.tag == popup_menu_tag: # popup menu
        # TODO:: Better Xpath query: ``menu-choices/*[@ID='name']``
        selected= cell.find('sf:proxied-cell-ref', namespaces=self.NUMBERS_NS)
        name= selected.get(IDREF_attr)
        mc= cell.find('sf:menu-choices', namespaces=self.NUMBERS_NS)
        for t in mc:
            if t.get(ID_attr) == name:
                # t's tag could end in "t" (text) or "n" (number).
                if t.tag.endswith('t'): # Text
                    value= self._decode_text( t )
                    return stingray.cell.TextCell( value, self )
                elif t.tag.endswith('n'): # Number
                    value= self._decode_number( t )
                    return stingray.cell.NumberCell( value, self )
                else:
                    raise Exception( "Unknown popup menu {0}".format(dom.tostring(cell)))
        # No menu choice matched the selection: there is no value to wrap.
        return None
    else:
        raise Exception( "Unknown cell {0}".format( dom.tostring(cell) ) )

Some lower-level conversions.

def _to_decimal( self, value_txt, style_id ):
    """Convert a given numeric value_text using the named style.

    TODO: From the style, get the number of decimal places, use that to
    build a string version of the float value.
    """
    fdp_attr= dom.QName( self.NUMBERS_NS["sf"], 'format-decimal-places' )
    fs_attr= dom.QName( self.NUMBERS_NS["sf"], 'format-string' )
    cell_style= self.cell_style.get(style_id)
    #print( "TO_DECIMAL", value_txt, style_id, "=", cell_style )

    fs= None # cell_style.get(fs_attr) # Doesn't seem correct
    fdp= None # cell_style.get(fdp_attr) # Doesn't seem correct

    # Transform fs into proper Python format, otherwise, use the number of
    # decimal places.

    if fs is not None:
        fmt= self._rewrite_fmt( fs )
        #print( "Decimal: {{0:{0}}}.format({1}) = ".format( fmt, value_txt ), end="" )
        value= decimal.Decimal( "{:{fmt}}".format(float(value_txt), fmt=fmt) )
        #print( value )
        return value
    elif fdp is not None:
        #fmt= "{{0:.{0}f}}".format(fdp)
        value= decimal.Decimal( "{:.{fdp}f}".format(float(value_txt), fdp=fdp) )
        #print( "Decimal: {0}.format({1}) = {2!r}".format( fmt, value_txt, value ) )
        return value
    else:
        value= decimal.Decimal( value_txt )
        #print( "Decimal: {0} = {1!r}".format( value_txt, value ) )
    return value

def _decode_text( self, cell ):
    """Decode a <t> tag's value."""
    sfa_s_attr= dom.QName( self.NUMBERS_NS["sfa"], 's' )
    ct= cell.find( 'sf:ct', namespaces=self.NUMBERS_NS )
    value= ct.get(sfa_s_attr)
    if value is None:
        value= "\n".join( cell.itertext() )
    return value

def _decode_number( self, cell ):
    """Decode a <n> tag's value, applying the style."""
    s_attr= dom.QName( self.NUMBERS_NS["sf"], 's' )
    v_attr= dom.QName( self.NUMBERS_NS["sf"], 'v' )
    s= cell.get(s_attr)
    cell_style= self.cell_style.get(s)
    try:
        value_txt= cell.attrib[v_attr]
        value= self._to_decimal( value_txt, s )
    except KeyError as ex:
        #self._cell_warning("Number with no value", cell)
        value= self._to_decimal( '0', s )
    return value

The styles are also important because we can use them to parse the numbers more precisely.

def _get_styles( self, root ):
    """Get the styles."""
    ID_attr= dom.QName( self.NUMBERS_NS["sfa"], 'ID' )
    ident_attr= dom.QName( self.NUMBERS_NS["sf"], 'ident' )
    parent_ident_attr= dom.QName( self.NUMBERS_NS["sf"], 'parent-ident' )

    self.cell_style= {}
    for cs in root.findall('.//sf:cell-style', namespaces=self.NUMBERS_NS):
        #print( "STYLE", dom.tostring(cs) )
        ID= cs.get(ID_attr)
        ident= cs.get(ident_attr)
        parent_ident= cs.get(parent_ident_attr)
        property_number_format= cs.find('.//sf:SFTCellStylePropertyNumberFormat', namespaces=self.NUMBERS_NS)
        if property_number_format is None:
            if parent_ident is not None:
                self.cell_style[ID]= self.cell_style[parent_ident]
        else:
            number_format= property_number_format.find('sf:number-format', namespaces=self.NUMBERS_NS)
            if number_format is None:
                if parent_ident is not None:
                    self.cell_style[ID]= self.cell_style[parent_ident]
            else:
                self.cell_style[ID]= number_format.attrib
                if ident is not None:
                    self.cell_style[ident]= number_format.attrib
            #print( ID, self.cell_style.get(ID,None) )

Rewrite a number format from Numbers to Python

def _rewrite_fmt( self, format_string ):
    """Parse the mini-language: '#,##0.###;-#,##0.###' is an example.
    This becomes "{:10,.3f}"
    """
    positive, _, negative = format_string.partition(";")
    fmt= negative or positive
    digits= len(fmt)
    comma= "," if "," in fmt else ""
    whole, _, frac= fmt.partition(".")
    precision= len(frac)
    return "{digits}{comma}.{precision}f".format(
        digits= digits, comma=comma, precision=precision )
Numbers09_Workbook.sheets()

Return a list of “sheets” (actually underlying tables.)

The “sheets” are [ ( workspace, table ), ... ] pairs.

Picking a sheet involves matching a two-part name: (workspace, table).

def sheets( self ):
    """List the "sheet" names, one ``(workspace name, table name)`` pair
    per table, in the iteration order of ``self.workspace`` and of each
    workspace's table map.
    """
    return [
        (w_name, t_name)
        for w_name, (_, tables) in self.workspace.items()
        for t_name in tables
    ]
Numbers09_Workbook.rows_of(sheet)

Iterator through all rows of a sheet.

def rows_of( self, sheet ):
    """Iterator over rows.

    Two parallel traversals:

    Internal iterator over grid/datasource/* has d, t, n, pm, g, o and s
        yields individual cell values.

    Iterator over grid/rows/grid-row may have ``nc``, number of columns in that row.
        Each grid-row fetches a number of cell values to assemble a row.
        Rows may be variable length (sigh) but are padded with ``None`` to
        the number of columns specified in the grid.

    While a row is assembled, ``self.debug_row`` and ``self.debug_col`` hold
    the position of the cell being fetched.

    :param sheet: a Sheet object to retrieve rows from; ``sheet.name`` must
        be a ``(workspace name, table name)`` pair.
    :returns: generator of lists of cell values, one list per ``grid-row``.
    """
    self.log.debug( "rows of {0}: {1}".format(sheet, sheet.name) )
    ws_name, t_name = sheet.name
    ws, tables= self.workspace[ws_name]
    tabular_model= tables[t_name]

    grid= tabular_model.find( 'sf:grid', namespaces=self.NUMBERS_NS )
    numcols_attr= dom.QName( self.NUMBERS_NS["sf"], 'numcols' )
    numcols = int(grid.attrib[numcols_attr])

    nc_attr= dom.QName( self.NUMBERS_NS["sf"], 'nc' )

    datasource= iter( self._datasource(grid) )

    rows = grid.find('sf:rows', namespaces=self.NUMBERS_NS)
    for n, r in enumerate(rows.findall( 'sf:grid-row', namespaces=self.NUMBERS_NS )):
        self.debug_row= n
        # Is this really relevant for Numbers '09?
        nc= int(r.get(nc_attr,numcols))
        # Build the row cell by cell so that the cells already fetched are
        # kept when the datasource runs out part-way through a row.
        row= []
        try:
            for self.debug_col in range(nc):
                row.append( next(datasource) )
        except StopIteration:
            pass # Last row will exhaust the datasource.
        # Pad from the actual length; a non-positive count adds nothing.
        row.extend( (numcols-len(row))*[None] )
        yield row