8.4.8. Apple iWorks Numbers ‘13 Workbook¶
See Apple iWorks Numbers ‘09 Workbook for a note on the naming of sheets.
Here are the modules we need to import to process these Workbooks.
import logging
import pprint
import zipfile
import datetime
import decimal
import os
import struct
from stingray.workbook.base import Workbook
import stingray.sheet
import stingray.cell
from stingray.snappy import Snappy
from stingray.protobuf import Archive_Reader, Message
Todo
Additional Numbers13_Workbook Feature
Translate Formula and Formula error to Text
The iWork 13 format is a directory with an index.zip
file. The ZIP contains
a number of .IWA
files. Each .IWA
is compressed using the Snappy protocol.
The uncompressed data is messages in Protobuf format.
We could depend on other Snappy and Protobuf implementations. We provide our own fall-back implementation, instead. See The “Other” Modules: snappy and protobuf for more information on these two supporting modules.
-
class
workbook.numbers13.
Numbers13_Workbook
¶ Extract sheets, rows and cells from a Numbers ‘13 format file.
The
.numbers
“file” is a directory bundle or package.The
index.zip
file is the interesting part of the bundle.In addition to the superclass attributes, some additional unique attributes are introduced here.
-
archive
¶ The snappy archive for this file.
-
class Numbers13_Workbook( Workbook ):
"""Mac OS X Numbers Workbook for iWork '13.
"""
def __init__( self, name, file_object=None ):
"""Prepare the workbook for reading.
:param name: File name
:param file_object: Ignored for iWork13 files.
We might be able to use the file's internal handle to open
it as a proper directory. But we don't.
"""
super().__init__( name, None )
self.archive= self._load_archive( name )
Read the archive to get the serialized messages. This method deserializes all of the protobuf-encoded messages. It’s a shabby stand-in for proper protobuf processing.
One thing we could do is to refactor this method into a bunch of methods, each of which is tied to a specific class of message. That would parallel the way protobuf really works.
@staticmethod
def _load_archive( filename ):
"""Extract all the protobuf-serialized Archived messages.
We don't actually need to read **all** of them.
We really only need Index/Document.iwa, Index/CalculationEngine.iwa, and
all Index/Tables/*.iwa. But that's almost everything.
:param filename: File name
"""
log= logging.getLogger( "load_archive" )
snappy= Snappy()
reader= Archive_Reader()
archive=dict()
with zipfile.ZipFile( os.path.join( filename, "index.zip" ) ) as index:
for n in index.namelist():
log.info( n )
with index.open( n ) as member:
data= snappy.decompress( member )
log.debug( "{0} bytes".format(len(data)) )
for id, m in reader.archive_iter( data ):
log.debug( "{0:4d}: {1}".format( id, m ) )
archive[id]= m
if m.name_ == "TN.DocumentArchive":
m.sheets= [Message("Reference", sheet) for sheet in m[1]]
log.debug( "{0} {1}".format(m.name_, m.sheets) )
elif m.name_ == "TN.SheetArchive":
m.name= bytes(m[1][0]).decode("UTF-8")
m.drawable_infos= [Message('Reference', ref) for ref in m[2]]
log.debug( "{0} {1} {2}".format(m.name_, m.name, m.drawable_infos) )
elif m.name_ == "TST.TableInfoArchive":
m.super= Message( 'DrawableArchive', m[1][0] )
m.tableModel= Message('Reference', m[2][0])
log.debug( "{0} {1} {2}".format(m.name_, m.super, m.tableModel) )
elif m.name_ == "TST.TableModelArchive":
m.data_store= Message( "DataStore", m[4][0] )
m.table_id= bytes(m[1][0])
m.table_name= bytes(m[8][0]).decode("UTF-8")
log.debug( "{0} {1} {2}".format(m.name_, m.table_name, m.data_store) )
m.data_store.tiles= [Message("TileStorage", tile) for tile in m.data_store[3]]
log.debug( "{0} {1}".format(m.name_,m.data_store.tiles) )
for ts in m.data_store.tiles:
ts.tiles= [Message("TileStorage.Tile", tile) for tile in ts[1]]
for tile in ts.tiles:
tile.id= tile[1][0]
tile.ref= Message('Reference', tile[2][0])
log.debug( "{0} {1} {2} {3}".format(m.name_, tile, tile.id, tile.ref) )
m.data_store.stringTable= [Message('Reference', string) for string in m.data_store[4]]
m.data_store.formulaTable= [Message('Reference', formula) for formula in m.data_store[6]]
m.data_store.formulaErrorTable= [Message('Reference', error) for error in m.data_store[12]]
log.debug( "DataStore stringTable {0}, formulaTable {1}, formulaErrorTable {2}".format(
m.data_store.stringTable, m.data_store.formulaTable,
m.data_store.formulaErrorTable) )
elif m.name_ == "TST.TableDataList":
m.listType= m[1][0]
m.nextListID= m[2][0]
m.entries= [Message('ListEntry', entry) for entry in m[3]]
log.debug( "{0} {1} {2}".format( m.name_, m.listType, m.nextListID ) )
for entry in m.entries:
entry.key= entry[1][0]
try:
entry.string= bytes(entry[3][0])
except IndexError:
entry.string= b''
try:
entry.formula= Message('FormulaArchive', entry[5][0])
entry.formula.AST_node_array= Message("ASTNodeArrayArchive", entry.formula[1][0])
entry.formula.AST_node_array.AST_node= [Message("AST_node", n) for n in entry.formula.AST_node_array[1]]
except IndexError:
entry.formula= None
log.debug( "ListEntry {0} {1} {2}".format( entry, entry.string, entry.formula ) )
elif m.name_ == "TST.Tile":
m.rowInfos= [ Message("TileRowInfo", row_info) for row_info in m[5] ]
for row_info in m.rowInfos:
row_info.tileRowIndex= row_info[1][0]
row_info.cellCount= row_info[2][0]
row_info.cellStorageBuffer= row_info[3][0]
row_info.cellOffsets= row_info[4][0]
log.debug( "tileRowIndex {0} cellCount {1}".format(row_info.tileRowIndex, row_info.cellCount) )
format= "<{0}h".format(len(row_info.cellOffsets)//2)
offsets= struct.unpack( format, bytes(row_info.cellOffsets) )
log.debug( "{0} {1}".format( m.name_, offsets ) )
row_info.cells= dict()
count= row_info.cellCount
for col, offset in enumerate( offsets ):
if offset == -1:
log.debug( "{0} {1} {2}".format( m.name_, col, offset ) )
row_info.cells[col]= (None,None)
continue
# A little needless copying to simplify the elif sequence.
cell_raw= row_info.cellStorageBuffer[offset:offset+32]
version, celltype = struct.unpack( "<hbx", bytes(cell_raw[:4]) )
if celltype == 2: # Number
data= struct.unpack( "<IIIdi", bytes(cell_raw[4:28]) )
elif celltype == 3: # Text (in the associated TableDataList)
data= struct.unpack( "<IIIi", bytes(cell_raw[4:20]) )
elif celltype == 5: # Date
data= struct.unpack( "<IIIdi", bytes(cell_raw[4:28]) )
elif celltype == 6: # Boolean
data= struct.unpack( "<IIId", bytes(cell_raw[4:24]) )
elif celltype == 8: # Error
data= struct.unpack( "<hhII", bytes(cell_raw[4:16]) )
else:
raise Exception( "Unsupported {0}".format(celltype) )
log.debug( "{0} {1} {2} {3} {4}".format( m.name_, col, offset, celltype, data ) )
row_info.cells[col]= (celltype,data[3])
count -= 1
if count == 0: break
return archive
Once we’ve decoded the archive, we can fetch sheets, tables, rows and cells.
We start with document.iwa and get the message with an id of 1. This is the TN.DocumentArchive and it lists the TN.SheetArchive by id.
Each TN.SheetArchive has drawable_infos references to TST.TableInfoArchive. TST.TableInfoArchive has reference to TST.TableModelArchive (and a super reference to SheetArchive.) TST.TableModelArchive references a DataStore.
- TST.DataStore.stringTable references a TST.TableDataList. This has string literals
- or formulas (or styles) referenced by cells.
- TST.DataStore.tiles references a TST.Tile.
- The Tile has some bytes which contain the cells as hard-to-parse raw structures. This includes text cells with references to the data lists string literals. It includes number, date and boolean cells with numeric-looking data.
-
Numbers13_Workbook.
sheets
()¶ Return a list of “sheets” (actually underlying tables.)
The “sheets” are
[ (
workspace, table), ... ]
pairs.Picking a sheet involves matching a two-part name: (workspace, table).
def sheets( self ):
sheet_list= []
document = self.archive[1]
for sheet_ref in document.sheets:
sheet= self.archive[sheet_ref[1][0]]
for table_ref in sheet.drawable_infos:
tableinfo= self.archive[table_ref[1][0]]
tablemodel= self.archive[tableinfo.tableModel[1][0]]
sheet_list.append( (sheet.name, tablemodel.table_name) )
return sheet_list
The proto files include this.
enum CellType {
genericCellType = 0;
spanCellType = 1;
numberCellType = 2;
textCellType = 3;
formulaCellType = 4;
dateCellType = 5;
boolCellType = 6;
durationCellType = 7;
formulaErrorCellType = 8;
automaticCellType = 9;
}
def _cell( self, cell, strings, formulae, errors ):
"""Given a decoded row cell message, pluck out the relevant data.
Look into the collection of strings or formulae for the data object.
:param cell: cell message
:param strings: dict of strings
:param formulae: dict of formulae
:param errors: dict of errors
:returns: Cell object
"""
celltype, value = cell
if celltype == 0: # What does "generic" mean?
v= stingray.cell.NumberCell( value, self )
elif celltype == 1 or celltype is None:
v= stingray.cell.EmptyCell( "", self )
elif celltype == 2:
v= stingray.cell.NumberCell( value, self )
elif celltype in (3, 9): # Text and Rich Text
v= stingray.cell.TextCell( strings[value].decode("UTF-8"), self )
elif celltype == 5:
seconds= int(value)
epoch= datetime.datetime(2001, 1, 1)
delta= datetime.timedelta( seconds=seconds )
theDate= epoch + delta
v= stingray.cell.DateCell( theDate, self )
elif celltype == 6:
v= stingray.cell.BooleanCell( value, self )
elif celltype == 7: # Actually a duration
v= stingray.cell.NumberCell( value, self )
elif celltype in (4, 8):
ast_0= formulae[value][0]
v= stingray.cell.ErrorCell( "Formula {0}".format(ast_0[1]), self )
else:
raise Exception( "Unknown cell type {0!r}".format(cell) )
self.log.debug( "_cell {0} = {1}".format( cell, v) )
return v
-
Numbers13_Workbook.
rows_of
(sheet)¶ Iterator through all rows.
def rows_of( self, sheet ):
"""Iterator over rows.
:param sheet: a Sheet object to retrieve rows from.
"""
self.log.debug( "rows of {0}: {1}".format(sheet, sheet.name) )
document = self.archive[1]
for sheet_ref in document.sheets:
sheet_msg= self.archive[sheet_ref[1][0]]
for table_ref in sheet_msg.drawable_infos:
tableinfo= self.archive[table_ref[1][0]]
tablemodel= self.archive[tableinfo.tableModel[1][0]]
if (sheet_msg.name, tablemodel.table_name) != sheet.name:
continue
for row in self._row_iter( tablemodel ):
yield row
break
def _row_iter( self, tablemodel ):
str_values= dict()
for str_ref in tablemodel.data_store.stringTable:
strs= self.archive[str_ref[1][0]]
for entry in strs.entries:
str_values[entry.key]= entry.string
form_values= dict()
for form_ref in tablemodel.data_store.formulaTable:
form= self.archive[form_ref[1][0]]
for entry in form.entries:
form_values[entry.key]= entry.formula.AST_node_array.AST_node
error_values= dict()
for error_ref in tablemodel.data_store.formulaErrorTable:
error= self.archive[error_ref[1][0]]
# TODO: decode error details
#for entry in error.entries:
# error_values[entry.key]= entry.formula # entry.string?
for t in tablemodel.data_store.tiles:
for tt_ref in t.tiles:
tile= self.archive[tt_ref.ref[1][0]]
for row in tile.rowInfos:
yield [ self._cell(row.cells[col], str_values, form_values, error_values)
for col in row.cells ]