Spaces:
Sleeping
Sleeping
| //============================================================+ | |
| // File name : tcpdf_parser.php | |
| // Version : 1.0.001 | |
| // Begin : 2011-05-23 | |
| // Last Update : 2012-05-03 | |
| // Author : Nicola Asuni - Tecnick.com LTD - Manor Coach House, Church Hill, Aldershot, Hants, GU12 4RQ, UK - www.tecnick.com - info@tecnick.com | |
| // License : http://www.tecnick.com/pagefiles/tcpdf/LICENSE.TXT GNU-LGPLv3 | |
| // ------------------------------------------------------------------- | |
| // Copyright (C) 2011-2012 Nicola Asuni - Tecnick.com LTD | |
| // | |
| // This file is part of TCPDF software library. | |
| // | |
| // TCPDF is free software: you can redistribute it and/or modify it | |
| // under the terms of the GNU Lesser General Public License as | |
| // published by the Free Software Foundation, either version 3 of the | |
| // License, or (at your option) any later version. | |
| // | |
| // TCPDF is distributed in the hope that it will be useful, but | |
| // WITHOUT ANY WARRANTY; without even the implied warranty of | |
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
| // See the GNU Lesser General Public License for more details. | |
| // | |
| // You should have received a copy of the License | |
| // along with TCPDF. If not, see | |
| // <http://www.tecnick.com/pagefiles/tcpdf/LICENSE.TXT>. | |
| // | |
| // See LICENSE.TXT file for more information. | |
| // ------------------------------------------------------------------- | |
| // | |
| // Description : This is a PHP class for parsing PDF documents. | |
| // | |
| //============================================================+ | |
| /** | |
| * @file | |
| * This is a PHP class for parsing PDF documents.<br> | |
| * @package com.tecnick.tcpdf | |
| * @author Nicola Asuni | |
| * @version 1.0.001 | |
| */ | |
| // include class for decoding filters | |
| require_once(dirname(__FILE__).'/tcpdf_filters.php'); | |
| /** | |
| * @class TCPDF_PARSER | |
| * This is a PHP class for parsing PDF documents.<br> | |
| * @package com.tecnick.tcpdf | |
| * @brief This is a PHP class for parsing PDF documents.. | |
| * @version 1.0.001 | |
| * @author Nicola Asuni - info@tecnick.com | |
| */ | |
| class TCPDF_PARSER { | |
| /** | |
| * Raw content of the PDF document. | |
| * @private | |
| */ | |
| private $pdfdata = ''; | |
| /** | |
| * XREF data. | |
| * @protected | |
| */ | |
| protected $xref = array(); | |
| /** | |
| * Array of PDF objects. | |
| * @protected | |
| */ | |
| protected $objects = array(); | |
| /** | |
| * Class object for decoding filters. | |
| * @private | |
| */ | |
| private $FilterDecoders; | |
| // ----------------------------------------------------------------------------- | |
| /** | |
| * Parse a PDF document an return an array of objects. | |
| * @param $data (string) PDF data to parse. | |
| * @public | |
| * @since 1.0.000 (2011-05-24) | |
| */ | |
| public function __construct($data) { | |
| if (empty($data)) { | |
| $this->Error('Empty PDF data.'); | |
| } | |
| $this->pdfdata = $data; | |
| // get length | |
| $pdflen = strlen($this->pdfdata); | |
| // initialize class for decoding filters | |
| $this->FilterDecoders = new TCPDF_FILTERS(); | |
| // get xref and trailer data | |
| $this->xref = $this->getXrefData(); | |
| // parse all document objects | |
| $this->objects = array(); | |
| foreach ($this->xref['xref'] as $obj => $offset) { | |
| if (!isset($this->objects[$obj])) { | |
| $this->objects[$obj] = $this->getIndirectObject($obj, $offset, true); | |
| } | |
| } | |
| // release some memory | |
| unset($this->pdfdata); | |
| $this->pdfdata = ''; | |
| } | |
| /** | |
| * Return an array of parsed PDF document objects. | |
| * @return (array) Array of parsed PDF document objects. | |
| * @public | |
| * @since 1.0.000 (2011-06-26) | |
| */ | |
| public function getParsedData() { | |
| return array($this->xref, $this->objects); | |
| } | |
| /** | |
| * Get xref (cross-reference table) and trailer data from PDF document data. | |
| * @param $offset (int) xref offset (if know). | |
| * @param $xref (array) previous xref array (if any). | |
| * @return Array containing xref and trailer data. | |
| * @protected | |
| * @since 1.0.000 (2011-05-24) | |
| */ | |
| protected function getXrefData($offset=0, $xref=array()) { | |
| if ($offset == 0) { | |
| // find last startxref | |
| if (preg_match_all('/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', $this->pdfdata, $matches, PREG_SET_ORDER, $offset) == 0) { | |
| $this->Error('Unable to find startxref'); | |
| } | |
| $matches = array_pop($matches); | |
| $startxref = $matches[1]; | |
| } else { | |
| // get the first xref at the specified offset | |
| if (preg_match('/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $offset) == 0) { | |
| $this->Error('Unable to find startxref'); | |
| } | |
| $startxref = $matches[1][0]; | |
| } | |
| // check xref position | |
| if (strpos($this->pdfdata, 'xref', $startxref) != $startxref) { | |
| $this->Error('Unable to find xref'); | |
| } | |
| // extract xref data (object indexes and offsets) | |
| $xoffset = $startxref + 5; | |
| // initialize object number | |
| $obj_num = 0; | |
| $offset = $xoffset; | |
| while (preg_match('/^([0-9]+)[\s]([0-9]+)[\s]?([nf]?)/im', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $offset) > 0) { | |
| $offset = (strlen($matches[0][0]) + $matches[0][1]); | |
| if ($matches[3][0] == 'n') { | |
| // create unique object index: [object number]_[generation number] | |
| $index = $obj_num.'_'.intval($matches[2][0]); | |
| // check if object already exist | |
| if (!isset($xref['xref'][$index])) { | |
| // store object offset position | |
| $xref['xref'][$index] = intval($matches[1][0]); | |
| } | |
| ++$obj_num; | |
| $offset += 2; | |
| } elseif ($matches[3][0] == 'f') { | |
| ++$obj_num; | |
| $offset += 2; | |
| } else { | |
| // object number (index) | |
| $obj_num = intval($matches[1][0]); | |
| } | |
| } | |
| // get trailer data | |
| if (preg_match('/trailer[\s]*<<(.*)>>[\s]*[\r\n]+startxref[\s]*[\r\n]+/isU', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $xoffset) > 0) { | |
| $trailer_data = $matches[1][0]; | |
| if (!isset($xref['trailer'])) { | |
| // get only the last updated version | |
| $xref['trailer'] = array(); | |
| // parse trailer_data | |
| if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { | |
| $xref['trailer']['size'] = intval($matches[1]); | |
| } | |
| if (preg_match('/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { | |
| $xref['trailer']['root'] = intval($matches[1]).'_'.intval($matches[2]); | |
| } | |
| if (preg_match('/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { | |
| $xref['trailer']['encrypt'] = intval($matches[1]).'_'.intval($matches[2]); | |
| } | |
| if (preg_match('/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { | |
| $xref['trailer']['info'] = intval($matches[1]).'_'.intval($matches[2]); | |
| } | |
| if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $matches) > 0) { | |
| $xref['trailer']['id'] = array(); | |
| $xref['trailer']['id'][0] = $matches[1]; | |
| $xref['trailer']['id'][1] = $matches[2]; | |
| } | |
| } | |
| if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { | |
| // get previous xref | |
| $xref = $this->getXrefData(intval($matches[1]), $xref); | |
| } | |
| } else { | |
| $this->Error('Unable to find trailer'); | |
| } | |
| return $xref; | |
| } | |
| /** | |
| * Get object type, raw value and offset to next object | |
| * @param $offset (int) Object offset. | |
| * @return array containing object type, raw value and offset to next object | |
| * @protected | |
| * @since 1.0.000 (2011-06-20) | |
| */ | |
| protected function getRawObject($offset=0) { | |
| $objtype = ''; // object type to be returned | |
| $objval = ''; // object value to be returned | |
| // skip initial white space chars: \x00 null (NUL), \x09 horizontal tab (HT), \x0A line feed (LF), \x0C form feed (FF), \x0D carriage return (CR), \x20 space (SP) | |
| $offset += strspn($this->pdfdata, "\x00\x09\x0a\x0c\x0d\x20", $offset); | |
| // get first char | |
| $char = $this->pdfdata{$offset}; | |
| // get object type | |
| switch ($char) { | |
| case '%': { // \x25 PERCENT SIGN | |
| // skip comment and search for next token | |
| $next = strcspn($this->pdfdata, "\r\n", $offset); | |
| if ($next > 0) { | |
| $offset += $next; | |
| return $this->getRawObject($this->pdfdata, $offset); | |
| } | |
| break; | |
| } | |
| case '/': { // \x2F SOLIDUS | |
| // name object | |
| $objtype = $char; | |
| ++$offset; | |
| if (preg_match('/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/', substr($this->pdfdata, $offset, 256), $matches) == 1) { | |
| $objval = $matches[1]; // unescaped value | |
| $offset += strlen($objval); | |
| } | |
| break; | |
| } | |
| case '(': // \x28 LEFT PARENTHESIS | |
| case ')': { // \x29 RIGHT PARENTHESIS | |
| // literal string object | |
| $objtype = $char; | |
| ++$offset; | |
| $strpos = $offset; | |
| if ($char == '(') { | |
| $open_bracket = 1; | |
| while ($open_bracket > 0) { | |
| if (!isset($this->pdfdata{$strpos})) { | |
| break; | |
| } | |
| $ch = $this->pdfdata{$strpos}; | |
| switch ($ch) { | |
| case '\\': { // REVERSE SOLIDUS (5Ch) (Backslash) | |
| // skip next character | |
| ++$strpos; | |
| break; | |
| } | |
| case '(': { // LEFT PARENHESIS (28h) | |
| ++$open_bracket; | |
| break; | |
| } | |
| case ')': { // RIGHT PARENTHESIS (29h) | |
| --$open_bracket; | |
| break; | |
| } | |
| } | |
| ++$strpos; | |
| } | |
| $objval = substr($this->pdfdata, $offset, ($strpos - $offset - 1)); | |
| $offset = $strpos; | |
| } | |
| break; | |
| } | |
| case '[': // \x5B LEFT SQUARE BRACKET | |
| case ']': { // \x5D RIGHT SQUARE BRACKET | |
| // array object | |
| $objtype = $char; | |
| ++$offset; | |
| if ($char == '[') { | |
| // get array content | |
| $objval = array(); | |
| do { | |
| // get element | |
| $element = $this->getRawObject($offset); | |
| $offset = $element[2]; | |
| $objval[] = $element; | |
| } while ($element[0] != ']'); | |
| // remove closing delimiter | |
| array_pop($objval); | |
| } | |
| break; | |
| } | |
| case '<': // \x3C LESS-THAN SIGN | |
| case '>': { // \x3E GREATER-THAN SIGN | |
| if (isset($this->pdfdata{($offset + 1)}) AND ($this->pdfdata{($offset + 1)} == $char)) { | |
| // dictionary object | |
| $objtype = $char.$char; | |
| $offset += 2; | |
| if ($char == '<') { | |
| // get array content | |
| $objval = array(); | |
| do { | |
| // get element | |
| $element = $this->getRawObject($offset); | |
| $offset = $element[2]; | |
| $objval[] = $element; | |
| } while ($element[0] != '>>'); | |
| // remove closing delimiter | |
| array_pop($objval); | |
| } | |
| } else { | |
| // hexadecimal string object | |
| $objtype = $char; | |
| ++$offset; | |
| if (($char == '<') AND (preg_match('/^([0-9A-Fa-f]+)[>]/iU', substr($this->pdfdata, $offset), $matches) == 1)) { | |
| $objval = $matches[1]; | |
| $offset += strlen($matches[0]); | |
| } | |
| } | |
| break; | |
| } | |
| default: { | |
| if (substr($this->pdfdata, $offset, 6) == 'endobj') { | |
| // indirect object | |
| $objtype = 'endobj'; | |
| $offset += 6; | |
| } elseif (substr($this->pdfdata, $offset, 4) == 'null') { | |
| // null object | |
| $objtype = 'null'; | |
| $offset += 4; | |
| $objval = 'null'; | |
| } elseif (substr($this->pdfdata, $offset, 4) == 'true') { | |
| // boolean true object | |
| $objtype = 'boolean'; | |
| $offset += 4; | |
| $objval = 'true'; | |
| } elseif (substr($this->pdfdata, $offset, 5) == 'false') { | |
| // boolean false object | |
| $objtype = 'boolean'; | |
| $offset += 5; | |
| $objval = 'false'; | |
| } elseif (substr($this->pdfdata, $offset, 6) == 'stream') { | |
| // start stream object | |
| $objtype = 'stream'; | |
| $offset += 6; | |
| if (preg_match('/^[\r\n]+(.*)[\r\n]*endstream/isU', substr($this->pdfdata, $offset), $matches) == 1) { | |
| $objval = $matches[1]; | |
| $offset += strlen($matches[0]); | |
| } | |
| } elseif (substr($this->pdfdata, $offset, 9) == 'endstream') { | |
| // end stream object | |
| $objtype = 'endstream'; | |
| $offset += 9; | |
| } elseif (preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($this->pdfdata, $offset, 33), $matches) == 1) { | |
| // indirect object reference | |
| $objtype = 'ojbref'; | |
| $offset += strlen($matches[0]); | |
| $objval = intval($matches[1]).'_'.intval($matches[2]); | |
| } elseif (preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($this->pdfdata, $offset, 33), $matches) == 1) { | |
| // object start | |
| $objtype = 'ojb'; | |
| $objval = intval($matches[1]).'_'.intval($matches[2]); | |
| $offset += strlen ($matches[0]); | |
| } elseif (($numlen = strspn($this->pdfdata, '+-.0123456789', $offset)) > 0) { | |
| // numeric object | |
| $objtype = 'numeric'; | |
| $objval = substr($this->pdfdata, $offset, $numlen); | |
| $offset += $numlen; | |
| } | |
| break; | |
| } | |
| } | |
| return array($objtype, $objval, $offset); | |
| } | |
| /** | |
| * Get content of indirect object. | |
| * @param $obj_ref (string) Object number and generation number separated by underscore character. | |
| * @param $offset (int) Object offset. | |
| * @param $decoding (boolean) If true decode streams. | |
| * @return array containing object data. | |
| * @protected | |
| * @since 1.0.000 (2011-05-24) | |
| */ | |
| protected function getIndirectObject($obj_ref, $offset=0, $decoding=true) { | |
| $obj = explode('_', $obj_ref); | |
| if (($obj === false) OR (count($obj) != 2)) { | |
| $this->Error('Invalid object reference: '.$obj); | |
| return; | |
| } | |
| $objref = $obj[0].' '.$obj[1].' obj'; | |
| if (strpos($this->pdfdata, $objref, $offset) != $offset) { | |
| // an indirect reference to an undefined object shall be considered a reference to the null object | |
| return array('null', 'null', $offset); | |
| } | |
| // starting position of object content | |
| $offset += strlen($objref); | |
| // get array of object content | |
| $objdata = array(); | |
| $i = 0; // object main index | |
| do { | |
| // get element | |
| $element = $this->getRawObject($offset); | |
| $offset = $element[2]; | |
| // decode stream using stream's dictionary information | |
| if ($decoding AND ($element[0] == 'stream') AND (isset($objdata[($i - 1)][0])) AND ($objdata[($i - 1)][0] == '<<')) { | |
| $element[3] = $this->decodeStream($objdata[($i - 1)][1], substr($element[1], 1)); | |
| } | |
| $objdata[$i] = $element; | |
| ++$i; | |
| } while ($element[0] != 'endobj'); | |
| // remove closing delimiter | |
| array_pop($objdata); | |
| // return raw object content | |
| return $objdata; | |
| } | |
| /** | |
| * Get the content of object, resolving indect object reference if necessary. | |
| * @param $obj (string) Object value. | |
| * @return array containing object data. | |
| * @protected | |
| * @since 1.0.000 (2011-06-26) | |
| */ | |
| protected function getObjectVal($obj) { | |
| if ($obj[0] == 'objref') { | |
| // reference to indirect object | |
| if (isset($this->objects[$obj[1]])) { | |
| // this object has been already parsed | |
| return $this->objects[$obj[1]]; | |
| } elseif (isset($this->xref[$obj[1]])) { | |
| // parse new object | |
| $this->objects[$obj[1]] = $this->getIndirectObject($obj[1], $this->xref[$obj[1]], false); | |
| return $this->objects[$obj[1]]; | |
| } | |
| } | |
| return $obj; | |
| } | |
| /** | |
| * Decode the specified stream. | |
| * @param $sdic (array) Stream's dictionary array. | |
| * @param $stream (string) Stream to decode. | |
| * @return array containing decoded stream data and remaining filters. | |
| * @protected | |
| * @since 1.0.000 (2011-06-22) | |
| */ | |
| protected function decodeStream($sdic, $stream) { | |
| // get stream lenght and filters | |
| $slength = strlen($stream); | |
| $filters = array(); | |
| foreach ($sdic as $k => $v) { | |
| if ($v[0] == '/') { | |
| if (($v[1] == 'Length') AND (isset($sdic[($k + 1)])) AND ($sdic[($k + 1)][0] == 'numeric')) { | |
| // get declared stream lenght | |
| $declength = intval($sdic[($k + 1)][1]); | |
| if ($declength < $slength) { | |
| $stream = substr($stream, 0, $declength); | |
| $slength = $declength; | |
| } | |
| } elseif (($v[1] == 'Filter') AND (isset($sdic[($k + 1)]))) { | |
| // resolve indirect object | |
| $objval = $this->getObjectVal($sdic[($k + 1)]); | |
| if ($objval[0] == '/') { | |
| // single filter | |
| $filters[] = $objval[1]; | |
| } elseif ($objval[0] == '[') { | |
| // array of filters | |
| foreach ($objval[1] as $flt) { | |
| if ($flt[0] == '/') { | |
| $filters[] = $flt[1]; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| // decode the stream | |
| $remaining_filters = array(); | |
| foreach ($filters as $filter) { | |
| if (in_array($filter, $this->FilterDecoders->getAvailableFilters())) { | |
| $stream = $this->FilterDecoders->decodeFilter($filter, $stream); | |
| } else { | |
| // add missing filter to array | |
| $remaining_filters[] = $filter; | |
| } | |
| } | |
| return array($stream, $remaining_filters); | |
| } | |
| /** | |
| * This method is automatically called in case of fatal error; it simply outputs the message and halts the execution. | |
| * @param $msg (string) The error message | |
| * @public | |
| * @since 1.0.000 (2011-05-23) | |
| */ | |
| public function Error($msg) { | |
| // exit program and print error | |
| die('<strong>TCPDF_PARSER ERROR: </strong>'.$msg); | |
| } | |
| } // END OF TCPDF_PARSER CLASS | |
| //============================================================+ | |
| // END OF FILE | |
| //============================================================+ | |