Source code for PyDvi.Dvi.DviParser

####################################################################################################
# 
# PyDvi - A Python Library to Process DVI Stream
# Copyright (C) 2014 Fabrice Salvaire
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# 
####################################################################################################

""" This module implements a DVI Stream Parser.
"""

####################################################################################################

import logging
import os

####################################################################################################

from ..OpcodeParser import OpcodeParserSet, OpcodeParser
from ..Tools.EnumFactory import EnumFactory, ExplicitEnumFactory
from ..Tools.Stream import AbstractStream
from .DviMachine import *

####################################################################################################

_module_logger = logging.getLogger(__name__)

####################################################################################################

# Ordered list of the symbolic DVI opcode names.  The order is significant: the list is handed to
# EnumFactory to build ``dvi_opcodes`` (presumably numbering the names by position -- confirm
# against EnumFactory), and the parser compares raw opcode bytes with these enumeration members.
#
# ``range`` is used instead of ``xrange`` so that the list is built identically on Python 2 and
# Python 3.

# set_char_0 ... set_char_127
dvi_opcodes_tuple = ['SETC_%03d' % i for i in range(128)]

dvi_opcodes_tuple += [
    'SET1', 'SET2', 'SET3', 'SET4',
    'SET_RULE',
    'PUT1', 'PUT2', 'PUT3', 'PUT4',
    'PUT_RULE',
    'NOP',
    'BOP',
    'EOP',
    'PUSH',
    'POP',
    'RIGHT1', 'RIGHT2', 'RIGHT3', 'RIGHT4',
    'W0', 'W1', 'W2', 'W3', 'W4',
    'X0', 'X1', 'X2', 'X3', 'X4',
    'DOWN1', 'DOWN2', 'DOWN3', 'DOWN4',
    'Y0', 'Y1', 'Y2', 'Y3', 'Y4',
    'Z0', 'Z1', 'Z2', 'Z3', 'Z4',
    ]

# fnt_num_0 ... fnt_num_63
dvi_opcodes_tuple += ['FONT_%02d' % i for i in range(64)]

dvi_opcodes_tuple += [
    'FNT1', 'FNT2', 'FNT3', 'FNT4',
    'XXX1', 'XXX2', 'XXX3', 'XXX4',
    'FNT_DEF1', 'FNT_DEF2', 'FNT_DEF3', 'FNT_DEF4',
    'PRE',
    'POST',
    'POST_POST',
    ]

# Enumeration of the DVI opcodes, built from the ordered list of opcode names.
dvi_opcodes = EnumFactory(
    'DviOpcodes',
    dvi_opcodes_tuple,
)

####################################################################################################

# Byte value of the padding that terminates a DVI stream; the postamble is located by skipping
# backward over these bytes.
DVI_EOF_SIGNATURE = 223

# Format identifiers accepted in the preamble of a DVI stream.
dvi_formats = ExplicitEnumFactory(
    'DviFormats',
    {
        'DVI': 2,
        'DVIV': 3,
        'XDVI': 5,
    },
)

# Description shared by the opcode parsers that typeset a character and advance.
set_char_description = 'typeset a character and move right'

####################################################################################################

class OpcodeParser_set_char(OpcodeParser):

    """ Parser for the ``set_char`` opcodes.

    These opcodes carry no parameter bytes in the stream: the opcode value itself is returned as
    the single parameter.
    """

    ##############################################

    def __init__(self, opcode):

        super(OpcodeParser_set_char, self).__init__(
            opcode,
            'set',
            set_char_description,
            opcode_class=Opcode_set_char,
        )

    ##############################################

    def read_parameters(self, dvi_parser):

        """ Return the opcode as a one-item list; nothing is read from *dvi_parser*. """

        parameters = [self.opcode]
        return parameters
####################################################################################################
class OpcodeParser_font(OpcodeParser):

    """ Parser for the ``font`` opcodes.

    These opcodes carry no parameter bytes in the stream: the font number is the offset of the
    opcode from ``FONT_00``.
    """

    ##############################################

    def __init__(self, opcode):

        super(OpcodeParser_font, self).__init__(
            opcode,
            'fnt num',
            'set current font to i',
            opcode_class=Opcode_font,
        )

    ##############################################

    def read_parameters(self, dvi_parser):

        """ Return the font number as a one-item list; nothing is read from *dvi_parser*. """

        font_number = self.opcode - dvi_opcodes.FONT_00
        return [font_number]
####################################################################################################
class OpcodeParser_xxx(OpcodeParser):

    """ Parser for the ``xxx`` opcodes (extensions to the DVI primitives).

    The reader used for the length field is picked from ``AbstractStream.read_unsigned_byten``
    using the offset of the opcode from ``XXX1``.
    """

    # First opcode of the xxx family, used to index the table of readers.
    base_opcode = dvi_opcodes.XXX1

    ##############################################

    def __init__(self, opcode):

        super(OpcodeParser_xxx, self).__init__(
            opcode,
            'xxx',
            'extension to DVI primitives',
            opcode_class=Opcode_xxx,
        )

        reader_index = self.opcode - self.base_opcode
        self.read_unsigned_byten = AbstractStream.read_unsigned_byten[reader_index]

    ##############################################

    def read_parameters(self, dvi_parser):

        """ Read the length field then that many bytes from the stream of *dvi_parser*, and return
        the payload converted with ``str`` as a one-item list.
        """

        stream = dvi_parser.stream
        payload_length = self.read_unsigned_byten(stream)
        payload = stream.read(payload_length)

        return [str(payload)]
####################################################################################################
class OpcodeParser_fnt_def(OpcodeParser):

    """ Parser for the ``fnt_def`` opcodes.

    No opcode class is attached to this parser: reading the parameters registers the font in the
    DVI program of the calling parser.
    """

    # First opcode of the fnt_def family, used to index the table of readers.
    base_opcode = dvi_opcodes.FNT_DEF1

    ##############################################

    def __init__(self, opcode):

        super(OpcodeParser_fnt_def, self).__init__(
            opcode,
            'fnt def',
            'define the meaning of a font number',
        )

        reader_index = opcode - self.base_opcode
        self.read_unsigned_byten = AbstractStream.read_unsigned_byten[reader_index]

    ##############################################

    def read_parameters(self, dvi_parser):

        """ Read a font definition from the stream of *dvi_parser* and register the resulting
        :class:`DviFont` with ``dvi_parser.dvi_program``.  Nothing is returned.
        """

        stream = dvi_parser.stream

        # The fields are stored in this order in the stream.
        font_id = self.read_unsigned_byten(stream)
        checksum = stream.read_unsigned_byte4()
        scale_factor = stream.read_unsigned_byte4()
        design_size = stream.read_unsigned_byte4()

        # Two one-byte lengths precede the font name, which spans their sum
        # (presumably the directory part and the file name -- confirm against the DVI format).
        first_length = stream.read_unsigned_byte1()
        second_length = stream.read_unsigned_byte1()
        font_name = str(stream.read(first_length + second_length))

        dvi_parser.dvi_program.register_font(
            DviFont(font_id, font_name, checksum, scale_factor, design_size))
####################################################################################################

# Table of opcode definitions handed to OpcodeParserSet.
#
# NOTE(review): the entry layout below is read off the inline comments and should be confirmed
# against OpcodeParserSet:
#   - ( [first opcode, last opcode], parser class ) binds a range of opcodes to a dedicated parser;
#   - ( opcode, name, description, parameters, opcode class ) describes a generic opcode, where a
#     list [n, m] as parameters stands for a family of opcodes (cf. "... to SET4"), a tuple gives
#     the size in bytes of each parameter, and a negative size is used for the signed quantities.
#
# The BOP entry declares ten 4-byte counters followed by a signed 4-byte pointer, which is what
# DviParser reads by hand after a BOP.  It previously declared only nine counters, which would
# have left a table-driven read four bytes short.

opcode_definitions = (
    ( [dvi_opcodes.SETC_000, dvi_opcodes.SETC_127], OpcodeParser_set_char ),
    ( dvi_opcodes.SET1, 'set', set_char_description, [1,4], Opcode_set_char ), # ... to SET4
    ( dvi_opcodes.SET_RULE, 'set rule', 'typeset a rule and move right', (4,4), Opcode_set_rule ),
    ( dvi_opcodes.PUT1, 'put', 'typeset a character', [1,4], Opcode_put_char ), # ... to PUT4
    ( dvi_opcodes.PUT_RULE, 'put rule', 'typeset a rule', (4,4), Opcode_put_rule ),
    ( dvi_opcodes.NOP, 'nop', 'no operation', None, None ),
    ( dvi_opcodes.BOP, 'bop', 'beginning of page', tuple([4]*10 + [-4]), None ),
    ( dvi_opcodes.EOP, 'eop', 'ending of page', None, None ),
    ( dvi_opcodes.PUSH, 'push', 'save the current positions', None, Opcode_push ),
    ( dvi_opcodes.POP, 'pop', 'restore previous positions', None, Opcode_pop ),
    ( dvi_opcodes.RIGHT1, 'right', 'move right', [-1,-4], Opcode_right ), # ... to RIGHT4
    ( dvi_opcodes.W0, 'w0', 'move right by w', None, Opcode_w0 ),
    ( dvi_opcodes.W1, 'w', 'move right and set w', [-1,-4], Opcode_w ), # ... to W4
    ( dvi_opcodes.X0, 'x0', 'move right by x', None, Opcode_x0 ),
    ( dvi_opcodes.X1, 'x', 'move right and set x', [-1,-4], Opcode_x ), # ... to X4
    ( dvi_opcodes.DOWN1, 'down', 'move down', [-1,-4], Opcode_down ), # ... to DOWN4
    ( dvi_opcodes.Y0, 'y0', 'move down by y', None, Opcode_y0 ),
    ( dvi_opcodes.Y1, 'y', 'move down and set y', [-1,-4], Opcode_y ), # ... to Y4
    ( dvi_opcodes.Z0, 'z0', 'move down by z', None, Opcode_z0 ),
    ( dvi_opcodes.Z1, 'z', 'move down and set z', [-1,-4], Opcode_z ), # ... to Z4
    ( [dvi_opcodes.FONT_00, dvi_opcodes.FONT_63], OpcodeParser_font ),
    ( dvi_opcodes.FNT1, 'fnt', 'set current font', [1,4], Opcode_font ),
    ( [dvi_opcodes.XXX1, dvi_opcodes.XXX4], OpcodeParser_xxx ),
    ( [dvi_opcodes.FNT_DEF1, dvi_opcodes.FNT_DEF4], OpcodeParser_fnt_def ),
    ( dvi_opcodes.PRE, 'pre', 'preamble', (), None ),
    ( dvi_opcodes.POST, 'post', 'postamble beginning', None, None ),
    ( dvi_opcodes.POST_POST, 'post post', 'postamble ending', None, None ),
    )

# Opcode parsers built from the table above; the DVI parsers index it with the opcode byte read
# from the stream.
opcode_parser_set = OpcodeParserSet(opcode_definitions)

####################################################################################################

# Exception instance raised when the stream does not have the expected structure.
BadDviStream = NameError('Bad DVI stream')

####################################################################################################
class DviParser(object):

    """ This class implements a DVI Stream Parser.

    :meth:`process_stream` is the entry point: it reads the preamble, then the postamble, and then
    walks the pages from the last one to the first one by following the BOP pointers.
    """

    _logger = _module_logger.getChild('DviParser')

    ##############################################

    def _reset(self):

        """ Reset the DVI parser. """

        self.dvi_program = DviProgam()
        self.post_pointer = None
        self.page_number = None
        # Cleared as well so that a parser used for several streams never sees a stale value.
        self.number_of_pages = None
        self.bop_pointer_stack = [] # can be used for lazy loading, reverse if backward

    ##############################################

    def process_stream(self, stream):

        """ Process a DVI stream and return a :class:`DviProgam` instance.

        The reference to *stream* is dropped when the method exits, whether the parsing succeeded
        or raised.
        """

        # Fixme: read pages before postamble (note: why ?)
        # Fixme: it would be better to read page on demand for long documents.

        self._reset()

        self.stream = stream
        try:
            self._process_preambule()
            self._process_postambule()
            self._process_pages_backward() # Fixme: retrieve bop pointers but don't read the page
        finally:
            # Do not keep the stream alive after a failure
            self.stream = None

        return self.dvi_program

    ##############################################

    def _process_preambule(self):

        """ Process the preamble where we get the magnification.

        Raise :class:`NameError` if the stream doesn't start with the PRE opcode or if the DVI
        format is unknown.
        """

        self._logger.debug('Process the preamble')

        stream = self.stream

        stream.seek(0)

        if stream.read_unsigned_byte1() != dvi_opcodes.PRE:
            raise NameError("DVI stream don't start by PRE")

        dvi_format = stream.read_unsigned_byte1()
        if dvi_format not in dvi_formats:
            raise NameError('Unknown DVI Format')

        numerator = stream.read_unsigned_byte4()
        denominator = stream.read_unsigned_byte4()
        magnification = stream.read_unsigned_byte4()
        # The comment is prefixed by its length stored on one byte
        comment = stream.read(stream.read_unsigned_byte1())

        self.dvi_program.set_preambule_data(comment,
                                            dvi_format,
                                            numerator, denominator, magnification)

        self._logger.debug('Preamble end at {}'.format(stream.tell() -1))

    ##############################################

    def _process_postambule(self):

        """ Process the postamble where we get the number of pages and the fonts.

        Raise :data:`BadDviStream` if the POST or the POST POST opcode is not found where expected.
        """

        # DVI postamble format:
        #   postamble: post opcode
        #   <font definitions>
        #   post post opcode
        #   post pointer
        #   dvi format
        #   EOF_SIGNATURE [at least 4 times]

        self._logger.debug('Process the postamble')

        stream = self.stream

        # DVI file end with at least four EOF_SIGNATURE
        # Read stream[-5] and move backward until opcode != EOF_SIGNATURE
        stream.seek(-5, os.SEEK_END)
        while stream.read_unsigned_byte1() == DVI_EOF_SIGNATURE:
            # seek to previous byte
            stream.seek(-2, os.SEEK_CUR)

        # The byte just read is the dvi format, we are now right after it.
        # Move backward over the dvi format and the post pointer, then read the post pointer
        stream.seek(-5, os.SEEK_CUR)
        self.post_pointer = stream.read_unsigned_byte4()

        # Move to Postamble
        stream.seek(self.post_pointer)

        self._logger.debug('Postamble start at {}'.format(stream.tell()))

        if stream.read_unsigned_byte1() != dvi_opcodes.POST:
            raise BadDviStream

        # Push pointer to the last page
        self.bop_pointer_stack.append(stream.read_signed_byte4())

        # Skip the numerator, the denominator and the magnification: these values are taken from
        # the preamble, but they must be read to reach the next fields.
        for _ in range(3):
            stream.read_unsigned_byte4()

        max_height = stream.read_unsigned_byte4()
        max_width = stream.read_unsigned_byte4()
        stack_depth = stream.read_unsigned_byte2()
        number_of_pages = stream.read_unsigned_byte2()

        # Read Font definitions
        while True:
            opcode = stream.read_unsigned_byte1()
            if dvi_opcodes.FNT_DEF1 <= opcode <= dvi_opcodes.FNT_DEF4:
                # The font is registered in the DVI program by the opcode parser
                opcode_parser_set[opcode].read_parameters(self)
            elif opcode != dvi_opcodes.NOP:
                break
            # Fixme: else

        # We must reach POST POST
        if opcode != dvi_opcodes.POST_POST:
            raise BadDviStream

        # The post pointer and the dvi format that follow POST POST were read above while moving
        # backward from the end of the stream.

        self.number_of_pages = number_of_pages
        self.dvi_program.set_postambule_data(max_height, max_width, stack_depth, number_of_pages)

        self._logger.debug('Number of pages: {}'.format(number_of_pages))
        self._logger.debug('Stack depth: {}'.format(stack_depth))

    ##############################################

    def _read_bop(self):

        """ Read a BOP opcode at the current position of the stream and return the pointer stored
        after its ten counters.

        The counters are read and discarded.  Raise :data:`BadDviStream` if the opcode found is not
        BOP.
        """

        stream = self.stream

        if stream.read_unsigned_byte1() != dvi_opcodes.BOP:
            raise BadDviStream

        for _ in range(10):
            stream.read_unsigned_byte4()

        return stream.read_signed_byte4()

    ##############################################

    def _process_pages_backward(self):

        """ Process the pages in backward order.

        The walk starts from the pointer to the last page pushed by :meth:`_process_postambule` and
        follows the pointer stored in each BOP until a negative pointer is found.
        """

        self._logger.debug('Process the pages in backward order.')

        stream = self.stream

        self.page_number = self.number_of_pages

        # Get pointer to the last page
        bop_pointer = self.bop_pointer_stack[0]

        # Move backward from page to page and process the pages
        while bop_pointer >= 0:
            stream.seek(bop_pointer)
            self.page_number -= 1
            self._logger.debug('BOP at {}, page # {}'.format(stream.tell(), self.page_number))

            bop_pointer = self._read_bop()
            self.bop_pointer_stack.append(bop_pointer)

            # Fixme: page?
            self.process_page()

    ##############################################

    def process_page_forward(self):

        """ Append a new page to the DVI program and process it from the current position of the
        stream, which must be at a BOP opcode.

        Raise :data:`BadDviStream` if the opcode found is not BOP.
        """

        # Fixme: test this code

        stream = self.stream

        if self.page_number is None:
            self.page_number = 0
        else:
            self.page_number += 1
        self.dvi_program.append_page(self.page_number)

        self._logger.debug('BOP at {}, page # {}'.format(stream.tell(), self.page_number))

        # The pointer returned is not used when moving forward
        #? forward
        self._read_bop()

        # Fixme: page?
        self.process_page()

    ##############################################

    def process_page(self):

        """ Read the opcodes of the current page up to EOP and append them to the opcode program of
        the page number :attr:`page_number`.

        Consecutive set char opcodes, respectively put char opcodes, are merged in a single opcode
        object.  The number of characters by font and the number of rules are stored in the
        opcode program.
        """

        stream = self.stream

        opcode_program = self.dvi_program[self.page_number]

        # Define some counters to track fonts, characters and rules
        # These counters are intended to allocate memory at the beginning of a page rendering.
        font_id = None
        char_counter = {}
        rule_counter = 0
        # opcode tracker to merge char opcode
        previous_opcode_obj = None
        previous_opcode_was_set = None
        # Fixme: tracking versus program simplification
        #  could merge same opcode using a test and a merge method
        #  char pop push positionning
        while True:
            opcode = stream.read_unsigned_byte1()
            if opcode == dvi_opcodes.EOP:
                break

            opcode_parser = opcode_parser_set[opcode]
            parameters = opcode_parser.read_parameters(self)
            # self._logger.debug('Opcode {} {} {}'.format(opcode, opcode_parser.name, parameters))

            # count characters by font
            is_font = dvi_opcodes.FONT_00 <= opcode <= dvi_opcodes.FNT4
            # SETC_000 is the first opcode, thus there is no lower bound to check
            is_set_char = opcode <= dvi_opcodes.SET4
            is_put_char = dvi_opcodes.PUT1 <= opcode <= dvi_opcodes.PUT4
            is_char = is_set_char or is_put_char
            if is_char:
                char_counter[font_id] = char_counter.get(font_id, 0) + 1
            # count rules
            if opcode == dvi_opcodes.SET_RULE or opcode == dvi_opcodes.PUT_RULE:
                rule_counter += 1

            # If the current and the previous opcode correspond to set/put char then the new
            # char is concatenated.
            if (is_char and previous_opcode_obj is not None and
                previous_opcode_was_set == is_set_char):
                previous_opcode_obj.append(parameters[0])
            else:
                opcode_obj = opcode_parser.to_opcode(parameters)
                if opcode_obj is not None:
                    opcode_program.append(opcode_obj)
                    if is_font:
                        font_id = opcode_obj.font_id
                if is_char:
                    previous_opcode_obj = opcode_obj
                    previous_opcode_was_set = is_set_char
                else:
                    # Any other opcode ends the current run of characters
                    previous_opcode_obj = None
                    previous_opcode_was_set = None
        # end of while loop

        opcode_program.number_of_chars = char_counter
        opcode_program.number_of_rules = rule_counter
####################################################################################################
class DviSubroutineParser(object):

    """ Parser that turns a stream of DVI opcodes into a :class:`DviSubroutine`. """

    ##############################################

    def __init__(self, stream):

        self.stream = stream

    ##############################################

    def parse(self):

        """ Read the opcodes up to the end of the stream, or up to an EOP opcode, and return them
        as a :class:`DviSubroutine`.

        Consecutive set char opcodes, respectively put char opcodes, are merged in a single opcode
        object.  The number of characters by font and the number of rules are stored in the
        subroutine.
        """

        stream = self.stream

        subroutine = DviSubroutine()

        # Counters intended to allocate memory at the beginning of a rendering
        current_font_id = None
        chars_by_font = {}
        number_of_rules = 0
        # Last char opcode object, used to merge the following chars of the same kind
        last_char_obj = None
        last_char_was_set = None
        # Fixme: tracking versus program simplification
        #  could merge same opcode using a test and a merge method
        #  char pop push positionning
        while not stream.end_of_stream():
            opcode = stream.read_unsigned_byte1()
            if opcode == dvi_opcodes.EOP:
                break

            parser = opcode_parser_set[opcode]
            parameters = parser.read_parameters(self)

            # Classify the opcode
            is_font = dvi_opcodes.FONT_00 <= opcode <= dvi_opcodes.FNT4
            is_set_char = opcode <= dvi_opcodes.SET4
            is_put_char = dvi_opcodes.PUT1 <= opcode <= dvi_opcodes.PUT4
            is_char = is_set_char or is_put_char

            # Count the characters by font and the rules
            if is_char:
                chars_by_font[current_font_id] = chars_by_font.get(current_font_id, 0) + 1
            if opcode == dvi_opcodes.SET_RULE or opcode == dvi_opcodes.PUT_RULE:
                number_of_rules += 1

            # A char following a char of the same kind (set or put) is concatenated to it
            mergeable = (is_char
                         and last_char_obj is not None
                         and last_char_was_set == is_set_char)
            if mergeable:
                last_char_obj.append(parameters[0])
                continue

            opcode_obj = parser.to_opcode(parameters)
            if opcode_obj is not None:
                subroutine.append(opcode_obj)
                if is_font:
                    current_font_id = opcode_obj.font_id

            if is_char:
                last_char_obj, last_char_was_set = opcode_obj, is_set_char
            else:
                last_char_obj, last_char_was_set = None, None

        subroutine.number_of_chars = chars_by_font
        subroutine.number_of_rules = number_of_rules

        return subroutine
#################################################################################################### # # End # ####################################################################################################