def _percentage(part, total):
    """Return ``part`` as a percentage of ``total``; ``0.0`` when ``total`` is 0."""
    return 100 * part / total if total else 0.0


def _collect_worker_shards(pool, pending, builder, shard_prefixes):
    """Wait for the workers, surface their failures, then merge their shards.

    Args:
        pool: the already-closed ``multiprocessing.Pool`` running the workers.
        pending: the ``AsyncResult`` objects returned by ``pool.apply_async``.
        builder: dataset builder that receives every worker shard.
        shard_prefixes: per-worker output prefixes, in worker order.

    Raises:
        Exception: whatever a worker raised; ``AsyncResult.get`` re-raises it
            here instead of letting it vanish inside the pool.
    """
    pool.join()
    for result in pending:
        # Without this, a crashed worker is only noticed later (if at all)
        # as a missing shard file.
        result.get()
    for shard_prefix in shard_prefixes:
        builder.merge_file_(shard_prefix)
        os.remove(indexed_dataset.data_file_path(shard_prefix))
        os.remove(indexed_dataset.index_file_path(shard_prefix))


def main(args):
    """Build or load the dictionaries, then write the preprocessed datasets.

    Steps, in order:

    1. create ``args.destdir`` and mirror the log into ``preprocess.log``;
    2. load the source/target dictionaries given by ``--srcdict``/``--tgtdict``
       or build them from ``--trainpref`` (a single shared one with
       ``--joined-dictionary``), and save them as ``dict.<lang>.txt``;
    3. binarize (or, for ``--dataset-impl raw``, copy) the train/valid/test
       files of each language, plus alignment files when ``--align-suffix``
       is set;
    4. when ``--alignfile`` is set, write ``alignment.<src>-<tgt>.txt`` mapping
       each source word to the target word it is most often aligned with.

    Args:
        args: parsed preprocessing options (an ``argparse.Namespace``).

    Raises:
        FileExistsError: a dictionary would be built but its destination file
            already exists.
        ValueError: ``--alignfile`` and the training files differ in length.
    """
    utils.import_user_module(args)

    os.makedirs(args.destdir, exist_ok=True)

    logger.addHandler(
        logging.FileHandler(
            filename=os.path.join(args.destdir, "preprocess.log"),
        )
    )
    logger.info(args)

    task = tasks.get_task(args.task)

    def train_path(lang):
        """Path of the training file for ``lang`` (no suffix if ``lang`` is falsy)."""
        return "{}{}".format(args.trainpref, ("." + lang) if lang else "")

    def file_name(prefix, lang):
        """``prefix`` with ``.<lang>`` appended unless ``lang`` is None."""
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        """Location of ``file_name(prefix, lang)`` inside ``args.destdir``."""
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        """Destination of the saved dictionary for ``lang``."""
        return dest_path("dict", lang) + ".txt"

    def build_dictionary(filenames, src=False, tgt=False):
        """Build a dictionary with the source- or target-side thresholds."""
        assert src ^ tgt
        return task.build_dictionary(
            filenames,
            workers=args.workers,
            threshold=args.thresholdsrc if src else args.thresholdtgt,
            nwords=args.nwordssrc if src else args.nwordstgt,
            padding_factor=args.padding_factor,
        )

    target = not args.only_source

    # Refuse to overwrite a dictionary that a previous run left behind.
    if not args.srcdict and os.path.exists(dict_path(args.source_lang)):
        raise FileExistsError(dict_path(args.source_lang))
    if target and not args.tgtdict and os.path.exists(dict_path(args.target_lang)):
        raise FileExistsError(dict_path(args.target_lang))

    if args.joined_dictionary:
        assert (
            not args.srcdict or not args.tgtdict
        ), "cannot use both --srcdict and --tgtdict with --joined-dictionary"

        if args.srcdict:
            src_dict = task.load_dictionary(args.srcdict)
        elif args.tgtdict:
            src_dict = task.load_dictionary(args.tgtdict)
        else:
            assert (
                args.trainpref
            ), "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary(
                {train_path(lang) for lang in [args.source_lang, args.target_lang]},
                src=True,
            )
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = task.load_dictionary(args.srcdict)
        else:
            assert (
                args.trainpref
            ), "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args.source_lang)], src=True)

        if target:
            if args.tgtdict:
                tgt_dict = task.load_dictionary(args.tgtdict)
            else:
                assert (
                    args.trainpref
                ), "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args.target_lang)], tgt=True)
        else:
            tgt_dict = None

    src_dict.save(dict_path(args.source_lang))
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args.target_lang))

    def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers):
        """Binarize one text file, splitting the work across ``num_workers``."""
        logger.info("[{}] Dictionary: {} types".format(lang, len(vocab)))
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(
            input_prefix, ("." + lang) if lang is not None else ""
        )
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        pending = []
        shard_prefixes = []
        if num_workers > 1:
            # Worker 0 is this process; workers 1..n-1 each write their own shard.
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                shard_prefixes.append(dataset_dest_prefix(args, prefix, lang))
                pending.append(
                    pool.apply_async(
                        binarize,
                        (
                            args,
                            input_file,
                            vocab,
                            prefix,
                            lang,
                            offsets[worker_id],
                            offsets[worker_id + 1],
                        ),
                        callback=merge_result,
                    )
                )
            pool.close()

        ds = indexed_dataset.make_builder(
            dataset_dest_file(args, output_prefix, lang, "bin"),
            impl=args.dataset_impl,
            vocab_size=len(vocab),
        )
        merge_result(
            Binarizer.binarize(
                input_file, vocab, ds.add_item, offset=0, end=offsets[1]
            )
        )
        if num_workers > 1:
            _collect_worker_shards(pool, pending, ds, shard_prefixes)

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        logger.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                # An empty input has zero tokens; report 0% instead of crashing.
                _percentage(sum(replaced.values()), n_seq_tok[1]),
                vocab.unk_word,
            )
        )

    def make_binary_alignment_dataset(input_prefix, output_prefix, num_workers):
        """Binarize one alignment file, splitting the work across workers."""
        nseq = [0]

        def merge_result(worker_result):
            nseq[0] += worker_result["nseq"]

        input_file = input_prefix
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        pending = []
        shard_prefixes = []
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                shard_prefixes.append(dataset_dest_prefix(args, prefix, None))
                pending.append(
                    pool.apply_async(
                        binarize_alignments,
                        (
                            args,
                            input_file,
                            utils.parse_alignment,
                            prefix,
                            offsets[worker_id],
                            offsets[worker_id + 1],
                        ),
                        callback=merge_result,
                    )
                )
            pool.close()

        ds = indexed_dataset.make_builder(
            dataset_dest_file(args, output_prefix, None, "bin"), impl=args.dataset_impl
        )

        merge_result(
            Binarizer.binarize_alignments(
                input_file,
                utils.parse_alignment,
                ds.add_item,
                offset=0,
                end=offsets[1],
            )
        )
        if num_workers > 1:
            _collect_worker_shards(pool, pending, ds, shard_prefixes)

        ds.finalize(dataset_dest_file(args, output_prefix, None, "idx"))

        logger.info("[alignments] {}: parsed {} alignments".format(input_file, nseq[0]))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        """Copy the text file for ``raw`` datasets, otherwise binarize it."""
        if args.dataset_impl == "raw":
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix + ".{}-{}".format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)
        else:
            make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers)

    def make_all(lang, vocab):
        """Process the train, valid and test prefixes that were given for ``lang``."""
        if args.trainpref:
            make_dataset(vocab, args.trainpref, "train", lang, num_workers=args.workers)
        if args.validpref:
            # Comma-separated prefixes become valid, valid1, valid2, ...
            for k, validpref in enumerate(args.validpref.split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(
                    vocab, validpref, outprefix, lang, num_workers=args.workers
                )
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, testpref, outprefix, lang, num_workers=args.workers)

    def make_all_alignments():
        """Binarize ``<prefix>.<align_suffix>`` for each split where it exists."""
        if args.trainpref and os.path.exists(args.trainpref + "." + args.align_suffix):
            make_binary_alignment_dataset(
                args.trainpref + "." + args.align_suffix,
                "train.align",
                num_workers=args.workers,
            )
        if args.validpref and os.path.exists(args.validpref + "." + args.align_suffix):
            make_binary_alignment_dataset(
                args.validpref + "." + args.align_suffix,
                "valid.align",
                num_workers=args.workers,
            )
        if args.testpref and os.path.exists(args.testpref + "." + args.align_suffix):
            make_binary_alignment_dataset(
                args.testpref + "." + args.align_suffix,
                "test.align",
                num_workers=args.workers,
            )

    make_all(args.source_lang, src_dict)
    if target:
        make_all(args.target_lang, tgt_dict)
    if args.align_suffix:
        make_all_alignments()

    logger.info("Wrote preprocessed data to {}".format(args.destdir))

    if args.alignfile:
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        # source index -> Counter of the target indices it was aligned with
        freq_map = {}
        with open(args.alignfile, "r", encoding="utf-8") as align_file:
            with open(src_file_name, "r", encoding="utf-8") as src_file:
                with open(tgt_file_name, "r", encoding="utf-8") as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        # zip_longest pads the shorter files with None.
                        if a is None or s is None or t is None:
                            raise ValueError(
                                "{}, {} and {} must have the same number of "
                                "lines".format(
                                    args.alignfile, src_file_name, tgt_file_name
                                )
                            )
                        si = src_dict.encode_line(s, add_if_not_exist=False)
                        ti = tgt_dict.encode_line(t, add_if_not_exist=False)
                        ai = [tuple(x.split("-")) for x in a.split()]
                        for sai, tai in ai:
                            # int() makes equal indices share one dict key,
                            # whether encode_line yields plain ints or
                            # tensor-like scalars (its type is not visible here).
                            srcidx = int(si[int(sai)])
                            tgtidx = int(ti[int(tai)])
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                freq_map.setdefault(srcidx, Counter())[tgtidx] += 1

        # Keep, for every source index, its most frequent target index.
        align_dict = {
            srcidx: max(counts, key=counts.get) for srcidx, counts in freq_map.items()
        }

        with open(
            os.path.join(
                args.destdir,
                "alignment.{}-{}.txt".format(args.source_lang, args.target_lang),
            ),
            "w",
            encoding="utf-8",
        ) as f:
            for k, v in align_dict.items():
                print("{} {}".format(src_dict[k], tgt_dict[v]), file=f)


def binarize(args, filename, vocab, output_prefix, lang, offset, end, append_eos=True):
    """Binarize the byte range ``[offset, end)`` of ``filename`` into its own shard.

    Used as the worker entry point of the multiprocessing pool in ``main``.

    Returns:
        The statistics returned by ``Binarizer.binarize``.
    """
    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, lang, "bin"),
        impl=args.dataset_impl,
        vocab_size=len(vocab),
    )

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize(
        filename, vocab, consumer, append_eos=append_eos, offset=offset, end=end
    )
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res


def binarize_alignments(args, filename, parse_alignment, output_prefix, offset, end):
    """Binarize the byte range ``[offset, end)`` of an alignment file into a shard.

    Returns:
        The statistics returned by ``Binarizer.binarize_alignments``.
    """
    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, None, "bin"),
        impl=args.dataset_impl,
        vocab_size=None,
    )

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize_alignments(
        filename, parse_alignment, consumer, offset=offset, end=end
    )
    ds.finalize(dataset_dest_file(args, output_prefix, None, "idx"))
    return res


def dataset_dest_prefix(args, output_prefix, lang):
    """Return the extension-less destination path of a dataset.

    The result is ``<destdir>/<output_prefix>`` followed by
    ``.<src>-<tgt>.<lang>`` when ``lang`` is given, by nothing when ``lang`` is
    None and ``args.only_source`` is set, and by ``.<src>-<tgt>`` otherwise.
    """
    base = "{}/{}".format(args.destdir, output_prefix)
    if lang is not None:
        lang_part = ".{}-{}.{}".format(args.source_lang, args.target_lang, lang)
    elif args.only_source:
        lang_part = ""
    else:
        lang_part = ".{}-{}".format(args.source_lang, args.target_lang)

    return "{}{}".format(base, lang_part)


def dataset_dest_file(args, output_prefix, lang, extension):
    """Return ``dataset_dest_prefix(...)`` with ``.<extension>`` appended."""
    base = dataset_dest_prefix(args, output_prefix, lang)
    return "{}.{}".format(base, extension)


def get_offsets(input_file, num_workers):
    """Return ``Binarizer.find_offsets(input_file, num_workers)``."""
    return Binarizer.find_offsets(input_file, num_workers)


def cli_main():
    """Parse the preprocessing command line and run ``main``."""
    parser = options.get_preprocessing_parser()
    args = parser.parse_args()
    main(args)


if __name__ == "__main__":
    cli_main()
def get_parser():
    """Build the argument parser of the BLEU scoring command-line tool."""
    parser = argparse.ArgumentParser(
        description="Command-line script for BLEU scoring."
    )
    # fmt: off
    parser.add_argument('-s', '--sys', default='-', help='system output')
    parser.add_argument('-r', '--ref', required=True, help='references')
    parser.add_argument('-o', '--order', default=4, metavar='N',
                        type=int, help='consider ngrams up to this order')
    parser.add_argument('--ignore-case', action='store_true',
                        help='case-insensitive scoring')
    parser.add_argument('--sacrebleu', action='store_true',
                        help='score with sacrebleu')
    parser.add_argument('--sentence-bleu', action='store_true',
                        help='report sentence-level BLEUs (i.e., with +1 smoothing)')
    # fmt: on
    return parser


def cli_main():
    """Score a system output against a reference file and print BLEU.

    The system output is read from ``--sys`` (``-`` means standard input) and
    the references from ``--ref``. Depending on the flags the score is computed
    with sacrebleu, per sentence, or over the whole corpus.

    Raises:
        FileNotFoundError: the system output or the reference file is missing.
    """
    parser = get_parser()
    args = parser.parse_args()
    print(args)

    # Explicit checks instead of ``assert`` so they survive ``python -O``.
    if args.sys != "-" and not os.path.exists(args.sys):
        raise FileNotFoundError(
            "System output file {} does not exist".format(args.sys)
        )
    if not os.path.exists(args.ref):
        raise FileNotFoundError("Reference file {} does not exist".format(args.ref))

    vocab = dictionary.Dictionary()

    def readlines(fd):
        """Yield the lines of ``fd``, lower-cased when --ignore-case is set."""
        for line in fd:
            yield line.lower() if args.ignore_case else line

    if args.sacrebleu:
        import sacrebleu

        def score(fdsys):
            with open(args.ref) as fdref:
                # Go through readlines() so --ignore-case applies here too,
                # and hand sacrebleu plain lists of strings.
                sys_lines = list(readlines(fdsys))
                ref_lines = list(readlines(fdref))
                print(sacrebleu.corpus_bleu(sys_lines, [ref_lines]))

    elif args.sentence_bleu:

        def score(fdsys):
            with open(args.ref) as fdref:
                scorer = bleu.Scorer(vocab.pad(), vocab.eos(), vocab.unk())
                for i, (sys_tok, ref_tok) in enumerate(
                    zip(readlines(fdsys), readlines(fdref))
                ):
                    # A fresh scorer state per sentence gives sentence-level BLEU.
                    scorer.reset(one_init=True)
                    sys_tok = vocab.encode_line(sys_tok)
                    ref_tok = vocab.encode_line(ref_tok)
                    scorer.add(ref_tok, sys_tok)
                    print(i, scorer.result_string(args.order))

    else:

        def score(fdsys):
            with open(args.ref) as fdref:
                scorer = bleu.Scorer(vocab.pad(), vocab.eos(), vocab.unk())
                for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)):
                    sys_tok = vocab.encode_line(sys_tok)
                    ref_tok = vocab.encode_line(ref_tok)
                    scorer.add(ref_tok, sys_tok)
                print(scorer.result_string(args.order))

    if args.sys == "-":
        score(sys.stdin)
    else:
        with open(args.sys, "r") as f:
            score(f)


if __name__ == "__main__":
    cli_main()
output_collater=TransformEosDataset( + dataset=tgt_dataset, + eos=self.tgt_dict.eos(), + # if we remove eos from the input src, then we need to add it + # back to the output tgt + append_eos_to_tgt=remove_eos_from_input_src, + remove_eos_from_src=remove_eos_from_output_src, + ).collater, + cuda=self.cuda, + ) + dataloader = torch.utils.data.DataLoader( + backtranslation_dataset, + batch_size=2, + collate_fn=backtranslation_dataset.collater, + ) + backtranslation_batch_result = next(iter(dataloader)) + + eos, pad, w1, w2 = self.tgt_dict.eos(), self.tgt_dict.pad(), self.w1, self.w2 + + # Note that we sort by src_lengths and add left padding, so actually + # ids will look like: [1, 0] + expected_src = torch.LongTensor([[w1, w2, w1, eos], [pad, pad, w1, eos]]) + if remove_eos_from_output_src: + expected_src = expected_src[:, :-1] + expected_tgt = torch.LongTensor([[w1, w2, eos], [w1, w2, eos]]) + generated_src = backtranslation_batch_result["net_input"]["src_tokens"] + tgt_tokens = backtranslation_batch_result["target"] + + self.assertTensorEqual(expected_src, generated_src) + self.assertTensorEqual(expected_tgt, tgt_tokens) + + def test_backtranslation_dataset_no_eos_in_output_src(self): + self._backtranslation_dataset_helper( + remove_eos_from_input_src=False, + remove_eos_from_output_src=True, + ) + + def test_backtranslation_dataset_with_eos_in_output_src(self): + self._backtranslation_dataset_helper( + remove_eos_from_input_src=False, + remove_eos_from_output_src=False, + ) + + def test_backtranslation_dataset_no_eos_in_input_src(self): + self._backtranslation_dataset_helper( + remove_eos_from_input_src=True, + remove_eos_from_output_src=False, + ) + + def assertTensorEqual(self, t1, t2): + self.assertEqual(t1.size(), t2.size(), "size mismatch") + self.assertEqual(t1.ne(t2).long().sum(), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq-0.10.2/tests/test_constraints.py b/fairseq-0.10.2/tests/test_constraints.py new file mode 100644 index 
0000000000000000000000000000000000000000..1c37f7e1fb26d8ea5349fedd3a60f566d09cf598 --- /dev/null +++ b/fairseq-0.10.2/tests/test_constraints.py @@ -0,0 +1,269 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import sys +import unittest + +import torch +from fairseq.token_generation_constraints import * + + +def tensorize(constraints: List[List[int]]) -> torch.Tensor: + return [torch.tensor(x) for x in constraints] + + +class TestHelperRoutines(unittest.TestCase): + def setUp(self): + self.examples = [ + ([[]], torch.tensor([[0]])), + ([[], []], torch.tensor([[0], [0]])), + ([[torch.tensor([1, 2])], []], torch.tensor([[1, 1, 2, 0], [0, 0, 0, 0]])), + ( + [ + [ + torch.tensor([3, 1, 2]), + torch.tensor([3]), + torch.tensor([4, 5, 6, 7]), + ], + [], + [torch.tensor([1, 8, 9, 10, 1, 4, 11, 12])], + ], + torch.tensor( + [ + [3, 3, 1, 2, 0, 3, 0, 4, 5, 6, 7, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 8, 9, 10, 1, 4, 11, 12, 0, 0, 0], + ] + ), + ), + ] + + def test_packing(self): + """Ensures the list of lists of tensors gets packed correctly.""" + for batch_constraints, expected_tensor in self.examples: + packed = pack_constraints(batch_constraints) + assert torch.equal(packed, expected_tensor) + + +class TestUnorderedConstraintState(unittest.TestCase): + def setUp(self): + # Tuples of (contraint set, expected printed graph, token counts per node) + self.examples = [ + ( + tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]), + "([None].False#6 ([1].True#4 ([2].False#1 [3].True#1) [3].True#1 [4].True#1) ([4].False#2 ([5].True#2 ([6].False#1 [7].True#1))))", + {1: 4, 2: 1, 3: 2, 4: 3, 5: 2, 6: 1, 7: 1}, + ), + ([], "[None].False#0", {}), + (tensorize([[0]]), "([None].False#1 [0].True#1)", {0: 1}), + ( + tensorize([[100000, 1, 2, 3, 4, 5]]), + "([None].False#1 ([100000].False#1 ([1].False#1 ([2].False#1 
class TestUnorderedConstraintState(unittest.TestCase):
    """Checks the constraint graph and the unordered constraint state."""

    def setUp(self):
        # Tuples of (constraint set, expected printed graph, token counts per node)
        self.examples = [
            (
                tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]),
                "([None].False#6 ([1].True#4 ([2].False#1 [3].True#1) [3].True#1 [4].True#1) ([4].False#2 ([5].True#2 ([6].False#1 [7].True#1))))",
                {1: 4, 2: 1, 3: 2, 4: 3, 5: 2, 6: 1, 7: 1},
            ),
            ([], "[None].False#0", {}),
            (tensorize([[0]]), "([None].False#1 [0].True#1)", {0: 1}),
            (
                tensorize([[100000, 1, 2, 3, 4, 5]]),
                "([None].False#1 ([100000].False#1 ([1].False#1 ([2].False#1 ([3].False#1 ([4].False#1 [5].True#1))))))",
                {100000: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1},
            ),
            (
                tensorize([[1, 2], [1, 2]]),
                "([None].False#2 ([1].False#2 [2].True#2))",
                {1: 2, 2: 2},
            ),
            (
                tensorize([[1, 2], [3, 4]]),
                "([None].False#2 ([1].False#1 [2].True#1) ([3].False#1 [4].True#1))",
                {1: 1, 2: 1, 3: 1, 4: 1},
            ),
        ]

        # Tuples of (constraint set, tokens to advance over, expected state
        # attributes afterwards). Several entries reuse a constraint set from
        # self.examples by index, so the order of that list matters.
        self.sequences = [
            (
                self.examples[0][0],
                [],
                {"bank": 0, "num_completed": 0, "finished": False, "is_root": True},
            ),
            (
                self.examples[0][0],
                [1, 2],
                {"bank": 2, "num_completed": 0, "finished": False, "is_root": False},
            ),
            (
                self.examples[0][0],
                [1, 2, 94],
                {"bank": 1, "num_completed": 1, "finished": False, "is_root": True},
            ),
            (
                self.examples[0][0],
                [1, 3, 999, 1, 4],
                {"bank": 4, "num_completed": 2, "finished": False, "is_root": False},
            ),
            (
                self.examples[0][0],
                [1, 3, 999, 1, 4, 999],
                {"bank": 4, "num_completed": 2, "finished": False, "is_root": True},
            ),
            (
                self.examples[0][0],
                [4, 5, 6, 8],
                {"bank": 2, "num_completed": 1, "finished": False, "is_root": True},
            ),
            (
                self.examples[0][0],
                # Tricky, because in last three, goes down [1->4] branch, could miss [1] and [4->5]
                # [[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]],
                [1, 2, 3, 1, 3, 1, 4, 4, 5, 6, 7, 1, 4, 5],
                {"bank": 14, "num_completed": 6, "finished": True, "is_root": False},
            ),
            (
                self.examples[0][0],
                [1, 2, 3, 999, 1, 3, 1, 4, 4, 5, 6, 7, 1, 4, 5, 117],
                {"bank": 14, "num_completed": 6, "finished": True, "is_root": True},
            ),
            (
                tensorize([[1], [2, 3]]),
                # Should not be able to get credit for entering 1 a second time
                [1, 1],
                {"bank": 1, "num_completed": 1, "finished": False, "is_root": True},
            ),
            (
                self.examples[4][0],
                [1, 2, 1, 2],
                {"bank": 4, "num_completed": 2, "finished": True, "is_root": False},
            ),
            (
                self.examples[4][0],
                [1, 2, 1, 2, 1],
                {"bank": 4, "num_completed": 2, "finished": True, "is_root": True},
            ),
            (
                self.examples[5][0],
                [1, 2, 3, 4, 5],
                {"bank": 4, "num_completed": 2, "finished": True, "is_root": True},
            ),
        ]

    def test_graphs(self):
        """
        Test whether unordered graph systems are created correctly.
        """
        for example in self.examples:
            constraints, expected, gold_counts = example
            c = ConstraintNode.create(constraints)
            # Compare the printed graph first, then the per-token counts.
            assert (
                ConstraintNode.print_graph(c) == expected
            ), f"got {ConstraintNode.print_graph(c)}, expected {expected}"
            assert (
                c.token_counts() == gold_counts
            ), f"{c} got {c.token_counts()} wanted {gold_counts}"

    def test_next_tokens(self):
        """
        Tests that the set of next tokens is correct.
        """
        for example in self.examples:
            constraints, expected, gold_counts = example
            root = ConstraintNode.create(constraints)

            root_tokens = set(root.children.keys())
            for sequence in constraints:
                # Each constraint is walked from a fresh state at the root.
                state = UnorderedConstraintState(root)
                for token in sequence:
                    # Expected candidates: the root's children plus the
                    # children of the node the state currently sits on.
                    all_tokens = root_tokens.union(state.node.children.keys())
                    assert (
                        all_tokens == state.next_tokens()
                    ), f"ALL {all_tokens} NEXT {state.next_tokens()}"
                    state = state.advance(token)

    def test_sequences(self):
        """Advance over each token sequence and compare the final state attributes."""
        for constraints, tokens, expected in self.sequences:
            state = UnorderedConstraintState.create(pack_constraints([constraints])[0])
            for token in tokens:
                state = state.advance(token)
            # Read back only the attributes named in the expectation.
            result = {}
            for attr in expected.keys():
                result[attr] = getattr(state, attr)

            assert (
                result == expected
            ), f"TEST({tokens}) GOT: {result} WANTED: {expected}"
class TestOrderedConstraintState(unittest.TestCase):
    """Checks ``OrderedConstraintState`` after advancing over token sequences."""

    def setUp(self):
        def status(bank, num_completed, finished, is_root):
            return {
                "bank": bank,
                "num_completed": num_completed,
                "finished": finished,
                "is_root": is_root,
            }

        # Constraint set shared by the first eight cases; every case still gets
        # its own freshly tensorized copy.
        six = [[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]

        # Triples of (constraints, tokens to advance over, expected attributes).
        self.sequences = [
            (tensorize(six), [], status(0, 0, False, True)),
            (tensorize(six), [1, 2], status(2, 0, False, False)),
            (tensorize(six), [1, 2, 94], status(0, 0, False, True)),
            (tensorize(six), [1, 3, 999, 1, 4], status(0, 0, False, True)),
            (tensorize(six), [1, 2, 3, 999, 999], status(3, 1, False, False)),
            (tensorize(six), [1, 2, 3, 77, 1, 3, 1], status(6, 2, False, False)),
            (
                tensorize(six),
                [1, 2, 3, 1, 3, 1, 4, 4, 5, 6, 7, 1, 4, 5],
                status(14, 6, True, False),
            ),
            (
                tensorize(six),
                [1, 2, 999, 1, 2, 3, 999, 1, 3, 1, 4, 4, 5, 6, 7, 1, 4, 5, 117],
                status(14, 6, True, False),
            ),
            (tensorize([[1], [2, 3]]), [1, 1], status(1, 1, False, False)),
            (tensorize([[1, 2], [1, 2]]), [1, 2, 1, 2], status(4, 2, True, False)),
            (tensorize([[1, 2], [1, 2]]), [1, 2, 1, 2, 1], status(4, 2, True, False)),
            (tensorize([[1, 2], [3, 4]]), [1, 2, 3, 4, 5], status(4, 2, True, False)),
        ]

    def test_sequences(self):
        for constraints, tokens, expected in self.sequences:
            packed = pack_constraints([constraints])[0]
            state = OrderedConstraintState.create(packed)
            for token in tokens:
                state = state.advance(token)
            # Read back only the attributes named in the expectation.
            result = {attr: getattr(state, attr) for attr in expected}
            assert (
                result == expected
            ), f"TEST({tokens}) GOT: {result} WANTED: {expected}"


if __name__ == "__main__":
    unittest.main()
b/fairseq-0.10.2/tests/test_convtbc.py new file mode 100644 index 0000000000000000000000000000000000000000..3a3c9b91e70f597ab77b9b01459cc429db5d7956 --- /dev/null +++ b/fairseq-0.10.2/tests/test_convtbc.py @@ -0,0 +1,54 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +import torch.nn as nn +from fairseq.modules import ConvTBC + + +class TestConvTBC(unittest.TestCase): + def test_convtbc(self): + # ksz, in_channels, out_channels + conv_tbc = ConvTBC(4, 5, kernel_size=3, padding=1) + # out_channels, in_channels, ksz + conv1d = nn.Conv1d(4, 5, kernel_size=3, padding=1) + + conv_tbc.weight.data.copy_(conv1d.weight.data.transpose(0, 2)) + conv_tbc.bias.data.copy_(conv1d.bias.data) + + input_tbc = torch.randn(7, 2, 4, requires_grad=True) + input1d = input_tbc.data.transpose(0, 1).transpose(1, 2) + input1d.requires_grad = True + + output_tbc = conv_tbc(input_tbc) + output1d = conv1d(input1d) + + self.assertAlmostEqual( + output_tbc.data.transpose(0, 1).transpose(1, 2), output1d.data + ) + + grad_tbc = torch.randn(output_tbc.size()) + grad1d = grad_tbc.transpose(0, 1).transpose(1, 2).contiguous() + + output_tbc.backward(grad_tbc) + output1d.backward(grad1d) + + self.assertAlmostEqual( + conv_tbc.weight.grad.data.transpose(0, 2), conv1d.weight.grad.data + ) + self.assertAlmostEqual(conv_tbc.bias.grad.data, conv1d.bias.grad.data) + self.assertAlmostEqual( + input_tbc.grad.data.transpose(0, 1).transpose(1, 2), input1d.grad.data + ) + + def assertAlmostEqual(self, t1, t2): + self.assertEqual(t1.size(), t2.size(), "size mismatch") + self.assertLess((t1 - t2).abs().max(), 1e-4) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq-0.10.2/tests/test_inference_dropout.py b/fairseq-0.10.2/tests/test_inference_dropout.py new file mode 100644 index 
class TestInferenceDropout(unittest.TestCase):
    """Checks the dropout-at-inference flags of a small ``TransformerModel``."""

    def setUp(self):
        task, parser = get_dummy_task_and_parser()
        self.task = task
        self.parser = parser
        TransformerModel.add_args(self.parser)
        self.args = self.parser.parse_args([])
        self.args.encoder_layers = 2
        self.args.decoder_layers = 1
        # Silence all logging while a test runs; tearDown re-enables it.
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def _build(self, prepare=True):
        """Build the model from ``self.args``; optionally prepare it for inference."""
        self.transformer_model = TransformerModel.build_model(self.args, self.task)
        if prepare:
            self.transformer_model.prepare_for_inference_(self.args)
        return self.transformer_model

    def test_sets_inference_dropout_to_true(self):
        self.args.retain_dropout = True
        model = self._build()
        assert model.encoder.dropout_module.apply_during_inference
        assert model.decoder.dropout_module.apply_during_inference
        assert all(
            layer.dropout_module.apply_during_inference
            for layer in model.encoder.layers
        )

    def test_inference_dropout_false_by_default(self):
        model = self._build()
        assert not model.encoder.dropout_module.apply_during_inference
        assert not model.decoder.dropout_module.apply_during_inference
        assert not any(
            layer.dropout_module.apply_during_inference
            for layer in model.encoder.layers
        )
        assert not any(
            layer.dropout_module.apply_during_inference
            for layer in model.decoder.layers
        )

    def test_applies_training_mode(self):
        model = self._build(prepare=False)
        assert model.encoder.dropout_module.training
        assert all(layer.dropout_module.training for layer in model.encoder.layers)

        model.eval()
        assert not model.decoder.dropout_module.training
        assert not any(layer.dropout_module.training for layer in model.encoder.layers)

    def test_retain_modules(self):
        self.args.retain_dropout = True
        # Only the encoder-side module classes are listed here.
        self.args.retain_dropout_modules = [
            "TransformerEncoder",
            "TransformerEncoderLayer",
        ]
        model = self._build()
        assert model.encoder.dropout_module.apply_during_inference
        assert not model.decoder.dropout_module.apply_during_inference
        assert not any(
            layer.dropout_module.apply_during_inference
            for layer in model.decoder.layers
        )
class TestMetrics(unittest.TestCase):
    """Checks how nested and named ``metrics.aggregate`` contexts combine values."""

    @staticmethod
    def _loss(aggregator):
        """Smoothed "loss" recorded by an aggregator."""
        return aggregator.get_smoothed_values()["loss"]

    def test_nesting(self):
        with metrics.aggregate() as outer:
            metrics.log_scalar("loss", 1)
            with metrics.aggregate() as inner:
                metrics.log_scalar("loss", 2)

        # The outer context also receives what is logged inside the inner one.
        self.assertEqual(self._loss(outer), 1.5)
        self.assertEqual(self._loss(inner), 2)

    def test_new_root(self):
        with metrics.aggregate() as outer:
            metrics.log_scalar("loss", 1)
            with metrics.aggregate(new_root=True) as isolated:
                metrics.log_scalar("loss", 2)

        # With new_root=True the inner value does not reach the outer context.
        self.assertEqual(self._loss(outer), 1)
        self.assertEqual(self._loss(isolated), 2)

    def test_nested_new_root(self):
        with metrics.aggregate() as layer1:
            metrics.log_scalar("loss", 1)
            with metrics.aggregate(new_root=True) as layer2:
                metrics.log_scalar("loss", 2)
                with metrics.aggregate() as layer3:
                    metrics.log_scalar("loss", 3)
                    with metrics.aggregate(new_root=True) as layer4:
                        metrics.log_scalar("loss", 4)
            metrics.log_scalar("loss", 1.5)

        expected_by_layer = (
            (layer4, 4),
            (layer3, 3),
            (layer2, 2.5),
            (layer1, 1.25),
        )
        for layer, expected in expected_by_layer:
            self.assertEqual(self._loss(layer), expected)

    def test_named(self):
        name = str(uuid.uuid4())
        metrics.reset_meters(name)

        with metrics.aggregate(name):
            metrics.log_scalar("loss", 1)

        # Logged outside the named context, so it is expected not to count.
        metrics.log_scalar("loss", 3)

        with metrics.aggregate(name):
            metrics.log_scalar("loss", 2)

        self.assertEqual(metrics.get_smoothed_values(name)["loss"], 1.5)

    def test_nested_duplicate_names(self):
        name = str(uuid.uuid4())
        metrics.reset_meters(name)

        with metrics.aggregate(name):
            metrics.log_scalar("loss", 1)
            with metrics.aggregate() as other:
                with metrics.aggregate(name):
                    metrics.log_scalar("loss", 2)
            metrics.log_scalar("loss", 6)

        # 1, 2 and 6 are each expected to be counted once under the name.
        self.assertEqual(metrics.get_smoothed_values(name)["loss"], 3)
        self.assertEqual(self._loss(other), 2)


if __name__ == "__main__":
    unittest.main()
self.assertEqual(other.get_smoothed_values()["loss"], 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq-0.10.2/tests/test_sequence_scorer.py b/fairseq-0.10.2/tests/test_sequence_scorer.py new file mode 100644 index 0000000000000000000000000000000000000000..42f9447b599bcd7a9913aec37d94ea5078ff43a3 --- /dev/null +++ b/fairseq-0.10.2/tests/test_sequence_scorer.py @@ -0,0 +1,120 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import unittest + +import tests.utils as test_utils +import torch +from fairseq.sequence_scorer import SequenceScorer + + +class TestSequenceScorer(unittest.TestCase): + def test_sequence_scorer(self): + # construct dummy dictionary + d = test_utils.dummy_dictionary(vocab_size=2) + self.assertEqual(d.pad(), 1) + self.assertEqual(d.eos(), 2) + self.assertEqual(d.unk(), 3) + eos = d.eos() + w1 = 4 + w2 = 5 + + # construct dataloader + data = [ + { + "source": torch.LongTensor([w1, w2, eos]), + "target": torch.LongTensor([w1, w2, w1, eos]), + }, + { + "source": torch.LongTensor([w2, eos]), + "target": torch.LongTensor([w2, w1, eos]), + }, + { + "source": torch.LongTensor([w2, eos]), + "target": torch.LongTensor([w2, eos]), + }, + ] + data_itr = test_utils.dummy_dataloader(data) + + # specify expected output probabilities + args = argparse.Namespace() + unk = 0.0 + args.beam_probs = [ + # step 0: + torch.FloatTensor( + [ + # eos w1 w2 + [0.0, unk, 0.6, 0.4], # sentence 1 + [0.0, unk, 0.4, 0.6], # sentence 2 + [0.0, unk, 0.7, 0.3], # sentence 3 + ] + ), + # step 1: + torch.FloatTensor( + [ + # eos w1 w2 + [0.0, unk, 0.2, 0.7], # sentence 1 + [0.0, unk, 0.8, 0.2], # sentence 2 + [0.7, unk, 0.1, 0.2], # sentence 3 + ] + ), + # step 2: + torch.FloatTensor( + [ + # eos w1 w2 + [0.10, unk, 0.50, 0.4], # sentence 1 + [0.15, unk, 0.15, 0.7], # sentence 2 + [0.00, unk, 0.00, 0.0], # 
sentence 3 + ] + ), + # step 3: + torch.FloatTensor( + [ + # eos w1 w2 + [0.9, unk, 0.05, 0.05], # sentence 1 + [0.0, unk, 0.00, 0.0], # sentence 2 + [0.0, unk, 0.00, 0.0], # sentence 3 + ] + ), + ] + expected_scores = [ + [0.6, 0.7, 0.5, 0.9], # sentence 1 + [0.6, 0.8, 0.15], # sentence 2 + [0.3, 0.7], # sentence 3 + ] + + task = test_utils.TestTranslationTask.setup_task(args, d, d) + model = task.build_model(args) + scorer = SequenceScorer(task.target_dictionary) + for sample in data_itr: + hypos = task.inference_step(scorer, [model], sample) + for id, hypos_id in zip(sample["id"].tolist(), hypos): + self.assertHypoTokens(hypos_id[0], data[id]["target"]) + self.assertHypoScore(hypos_id[0], expected_scores[id]) + + def assertHypoTokens(self, hypo, tokens): + self.assertTensorEqual(hypo["tokens"], torch.LongTensor(tokens)) + + def assertHypoScore(self, hypo, pos_probs, normalized=True, lenpen=1.0): + pos_scores = torch.FloatTensor(pos_probs).log() + self.assertAlmostEqual(hypo["positional_scores"], pos_scores) + self.assertEqual(pos_scores.numel(), hypo["tokens"].numel()) + score = pos_scores.sum() + if normalized: + score /= pos_scores.numel() ** lenpen + self.assertLess(abs(score - hypo["score"]), 1e-6) + + def assertAlmostEqual(self, t1, t2): + self.assertEqual(t1.size(), t2.size(), "size mismatch") + self.assertLess((t1 - t2).abs().max(), 1e-4) + + def assertTensorEqual(self, t1, t2): + self.assertEqual(t1.size(), t2.size(), "size mismatch") + self.assertEqual(t1.ne(t2).long().sum(), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq-0.10.2/tests/test_token_block_dataset.py b/fairseq-0.10.2/tests/test_token_block_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..ea315b4e67a6176feb3e35c468ca1179b4e0e3c4 --- /dev/null +++ b/fairseq-0.10.2/tests/test_token_block_dataset.py @@ -0,0 +1,79 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import tests.utils as test_utils +import torch +from fairseq.data import TokenBlockDataset + + +class TestTokenBlockDataset(unittest.TestCase): + def _build_dataset(self, data, **kwargs): + sizes = [len(x) for x in data] + underlying_ds = test_utils.TestDataset(data) + return TokenBlockDataset(underlying_ds, sizes, **kwargs) + + def test_eos_break_mode(self): + data = [ + torch.tensor([5, 4, 3, 2, 1], dtype=torch.long), + torch.tensor([1], dtype=torch.long), + torch.tensor([8, 7, 6, 1], dtype=torch.long), + ] + ds = self._build_dataset(data, block_size=None, pad=0, eos=1, break_mode="eos") + self.assertEqual(ds[0].tolist(), [5, 4, 3, 2, 1]) + self.assertEqual(ds[1].tolist(), [1]) + self.assertEqual(ds[2].tolist(), [8, 7, 6, 1]) + + data = [ + torch.tensor([5, 4, 3, 2, 1], dtype=torch.long), + torch.tensor([8, 7, 6, 1], dtype=torch.long), + torch.tensor([1], dtype=torch.long), + ] + ds = self._build_dataset(data, block_size=None, pad=0, eos=1, break_mode="eos") + self.assertEqual(ds[0].tolist(), [5, 4, 3, 2, 1]) + self.assertEqual(ds[1].tolist(), [8, 7, 6, 1]) + self.assertEqual(ds[2].tolist(), [1]) + + def test_block_break_mode(self): + data = [ + torch.tensor([5, 4, 3, 2, 1], dtype=torch.long), + torch.tensor([8, 7, 6, 1], dtype=torch.long), + torch.tensor([9, 1], dtype=torch.long), + ] + ds = self._build_dataset(data, block_size=3, pad=0, eos=1, break_mode="none") + self.assertEqual(ds[0].tolist(), [5, 4, 3]) + self.assertEqual(ds[1].tolist(), [2, 1, 8]) + self.assertEqual(ds[2].tolist(), [7, 6, 1]) + self.assertEqual(ds[3].tolist(), [9, 1]) + + def test_complete_break_mode(self): + data = [ + torch.tensor([5, 4, 3, 2, 1], dtype=torch.long), + torch.tensor([8, 7, 6, 1], dtype=torch.long), + torch.tensor([9, 1], dtype=torch.long), + ] + ds = self._build_dataset( + data, block_size=6, pad=0, eos=1, 
break_mode="complete" + ) + self.assertEqual(ds[0].tolist(), [5, 4, 3, 2, 1]) + self.assertEqual(ds[1].tolist(), [8, 7, 6, 1, 9, 1]) + + data = [ + torch.tensor([4, 3, 2, 1], dtype=torch.long), + torch.tensor([5, 1], dtype=torch.long), + torch.tensor([1], dtype=torch.long), + torch.tensor([6, 1], dtype=torch.long), + ] + ds = self._build_dataset( + data, block_size=3, pad=0, eos=1, break_mode="complete" + ) + self.assertEqual(ds[0].tolist(), [4, 3, 2, 1]) + self.assertEqual(ds[1].tolist(), [5, 1, 1]) + self.assertEqual(ds[2].tolist(), [6, 1]) + + +if __name__ == "__main__": + unittest.main() diff --git a/mosesdecoder/.beautify-ignore b/mosesdecoder/.beautify-ignore new file mode 100644 index 0000000000000000000000000000000000000000..b7eb51a205542217c7b265ec47c077d9d3f89e95 --- /dev/null +++ b/mosesdecoder/.beautify-ignore @@ -0,0 +1,38 @@ +# Files and directories that beautify.py should not clean up. +# +# This file is not as advanced as, say, .gitignore. It only supports files +# and directory paths relative to the project root, one per line, no globs, +# no quotes. +# +# Leading and trailing whitespace is stripped from filenames, but internal +# whitespace is preserved. +# +# Lines starting with a hash mark, such as this one, are comments. The hash +# mark must be the first character on the line. Blank lines are ignored. +# +# The .beautify-ignore file must be encoded in UTF-8. + +boost +contrib +irstlm +jam-files +lm +mingw/MosesGUI/icons_rc.py +mingw/MosesGUI/Ui_credits.py +mingw/MosesGUI/Ui_mainWindow.py +moses/TranslationModel/UG +moses/server +moses/parameters +moses/thread_safe_container.h +phrase-extract/pcfg-common +phrase-extract/syntax-common +randlm +# Filename suffixes in here are language codes, so e.g. ".pl" means +# Polish, not Perl. 
+scripts/share/nonbreaking_prefixes +search +srilm +util +xmlrpc-c +.git +util/ug_cache_with_timeout.h diff --git a/mosesdecoder/.travis.yml b/mosesdecoder/.travis.yml new file mode 100644 index 0000000000000000000000000000000000000000..c80b60de57a5e588801e429aebaeb2352194f663 --- /dev/null +++ b/mosesdecoder/.travis.yml @@ -0,0 +1,24 @@ +sudo: false +dist: trusty +language: c +compiler: gcc +env: + matrix: +addons: + apt: + sources: + - ubuntu-toolchain-r-test + packages: + - subversion + - automake + - libtool + - zlib1g-dev + - libbz2-dev + - liblzma-dev + - libboost-all-dev + - libgoogle-perftools-dev + - libxmlrpc-c++.*-dev + - cmake + - csh +script: +- ./bjam -j4 diff --git a/mosesdecoder/azure-pipelines.yml b/mosesdecoder/azure-pipelines.yml new file mode 100644 index 0000000000000000000000000000000000000000..fddd0faea1e935437ee9d67dfb4bd6414f86b636 --- /dev/null +++ b/mosesdecoder/azure-pipelines.yml @@ -0,0 +1,100 @@ +# Starter pipeline +# Start with a minimal pipeline that you can customize to build and deploy your code. 
+# Add steps that build, run tests, deploy, and more: +# https://aka.ms/yaml + +trigger: +- master + +pool: + #vmImage: 'ubuntu-latest' + vmImage: 'ubuntu-16.04' + +steps: + +- script: | + echo Printing some environment information + echo HOME: $HOME + echo + echo UBUNTU VERSION: + cat /etc/lsb-release + echo + echo CPU INFO + cat /proc/cpuinfo + echo + echo MEM INFO + cat /proc/meminfo + echo + echo DISK INFO + df -h + echo + echo PWD: $PWD + echo + ls + displayName: 'Printing some environment information' + + +## Installation commands for Ubuntu +- script: | + sudo apt-get install \ + g++ \ + git \ + subversion \ + automake \ + libtool \ + zlib1g-dev \ + libicu-dev \ + libboost-all-dev \ + libssl-dev \ + libbz2-dev \ + liblzma-dev \ + python-dev \ + graphviz \ + imagemagick \ + make \ + cmake \ + libgoogle-perftools-dev \ + autoconf \ + doxygen + displayName: 'Install Ubuntu packages' + +- script: | + wget "https://sourceforge.net/projects/cmph/files/v2.0.2/cmph-2.0.2.tar.gz/download" + mv download cmph-2.0.2.tar.gz + tar xvzf cmph-2.0.2.tar.gz + cd cmph-2.0.2 + ./configure --prefix=$PWD + make + make install + cd .. + displayName: 'Build and Install cmph' + +- script: | + wget "https://sourceforge.net/projects/xmlrpc-c/files/Xmlrpc-c%20Super%20Stable/1.51.06/xmlrpc-c-1.51.06.tgz/download" + mv download xmlrpc-c-1.51.06.tgz + tar xvzf xmlrpc-c-1.51.06.tgz + cd xmlrpc-c-1.51.06 + ./configure --prefix=$PWD + make + make install + sudo ldconfig + cd .. 
+ displayName: 'Build and Install xmlrpc-c' + +- script: | + ./bjam \ + --with-cmph=$PWD/cmph-2.0.2 \ + --with-xmlrpc-c=$PWD/xmlrpc-c-1.51.06 \ + -j2 + displayName: 'Build Moses' + +# - script: | +# ./bjam \ +# -j2 +# displayName: 'Build Moses' + +# - task: ComponentGovernanceComponentDetection@0 +# inputs: +# scanType: 'Register' +# verbosity: 'Verbose' +# alertWarningLevel: 'High' \ No newline at end of file diff --git a/mosesdecoder/biconcor/Jamfile b/mosesdecoder/biconcor/Jamfile new file mode 100644 index 0000000000000000000000000000000000000000..83a73800072c01fb80904e897a515191ebc62fb6 --- /dev/null +++ b/mosesdecoder/biconcor/Jamfile @@ -0,0 +1,2 @@ +exe biconcor : Vocabulary.cpp SuffixArray.cpp TargetCorpus.cpp Alignment.cpp Mismatch.cpp PhrasePair.cpp PhrasePairCollection.cpp biconcor.cpp base64.cpp ; +exe phrase-lookup : Vocabulary.cpp SuffixArray.cpp phrase-lookup.cpp ; diff --git a/mosesdecoder/biconcor/PhrasePairCollection.cpp b/mosesdecoder/biconcor/PhrasePairCollection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..540aaac6f424d47029c5ddb40cdd7a2aea1c094f --- /dev/null +++ b/mosesdecoder/biconcor/PhrasePairCollection.cpp @@ -0,0 +1,209 @@ +#include "PhrasePairCollection.h" + +#include +#include +#include + +#include "Vocabulary.h" +#include "SuffixArray.h" +#include "TargetCorpus.h" +#include "Alignment.h" +#include "PhrasePair.h" +#include "Mismatch.h" + +using namespace std; + +PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a, int max_translation, int max_example ) + :m_suffixArray(sa) + ,m_targetCorpus(tc) + ,m_alignment(a) + ,m_size(0) + ,m_max_lookup(10000) // maximum number of source occurrences sampled + ,m_max_translation(max_translation) // max number of different distinct translations returned + ,m_max_example(max_example) // max number of examples returned for each distinct translation +{} + +PhrasePairCollection::~PhrasePairCollection() +{} + +int 
PhrasePairCollection::GetCollection( const vector< string >& sourceString ) +{ + INDEX first_match, last_match; + if (! m_suffixArray->FindMatches( sourceString, first_match, last_match )) { + return 0; + } + //cerr << "\tfirst match " << first_match << endl; + //cerr << "\tlast match " << last_match << endl; + + INDEX found = last_match - first_match +1; + + map< vector< WORD_ID >, INDEX > index; + int real_count = 0; + for( INDEX i=first_match; i<=last_match; i++ ) { + int position = m_suffixArray->GetPosition( i ); + int source_start = m_suffixArray->GetWordInSentence( position ); + int source_end = source_start + sourceString.size()-1; + INDEX sentence_id = m_suffixArray->GetSentence( position ); + int sentence_length = m_suffixArray->GetSentenceLength( sentence_id ); + int target_length = m_targetCorpus->GetSentenceLength( sentence_id ); + //cerr << "match " << (i-first_match) + //<< " in sentence " << sentence_id + //<< ", starting at word " << source_start + //<< " of " << sentence_length + //<< ". 
target sentence has " << target_length << " words."; + int target_start, target_end, pre_null, post_null; + if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) { + //cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]"; + //cerr << " +(" << (int)pre_null << "," << (int)post_null << ")"; + bool null_boundary_words = false; + for (int pre = 0; pre <= pre_null && (pre == 0 || null_boundary_words); pre++ ) { + for (int post = 0; post <= post_null && (post == 0 || null_boundary_words); post++ ) { + vector< WORD_ID > targetString; + //cerr << "; "; + for (int target = target_start - pre; target <= target_end + post; target++) { + targetString.push_back( m_targetCorpus->GetWordId( sentence_id, target) ); + //cerr << m_targetCorpus->GetWord( sentence_id, target) << " "; + } + PhrasePair *phrasePair = new PhrasePair( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, target_length, position, source_start, source_end, target_start-pre, target_end+post, pre, post, pre_null-pre, post_null-post); + // matchCollection.Add( sentence_id, ) + if (index.find( targetString ) == index.end()) { + index[targetString] = m_collection.size(); + vector< PhrasePair* > emptyVector; + m_collection.push_back( emptyVector ); + } + m_collection[ index[targetString] ].push_back( phrasePair ); + m_size++; + } + } + } else { + //cerr << "mismatch " << (i-first_match) + // << " in sentence " << sentence_id + // << ", starting at word " << source_start + // << " of " << sentence_length + // << ". 
target sentence has " << target_length << " words."; + Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end ); + if (mismatch->Unaligned()) + m_unaligned.push_back( mismatch ); + else + m_mismatch.push_back( mismatch ); + } + //cerr << endl; + + if (found > (INDEX)m_max_lookup) { + i += found/m_max_lookup-1; + } + real_count++; + } + sort(m_collection.begin(), m_collection.end(), CompareBySize()); + return real_count; +} + +void PhrasePairCollection::Print(bool pretty) const +{ + vector< vector >::const_iterator ppWithSameTarget; + int i=0; + for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && ibegin()))->PrintTarget( &cout ); + int count = ppWithSameTarget->size(); + cout << "(" << count << ")" << endl; + vector< PhrasePair* >::const_iterator p = ppWithSameTarget->begin(); + for(int j=0; jsize() && jPrintPretty( &cout, 100 ); + } else { + (*p)->Print( &cout ); + } + if (ppWithSameTarget->size() > m_max_example) { + p += ppWithSameTarget->size()/m_max_example-1; + } + } + } +} + +void PhrasePairCollection::PrintHTML() const +{ + int pp_target = 0; + bool singleton = false; + // loop over all translations + vector< vector >::const_iterator ppWithSameTarget; + for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_targetsize(); + if (!singleton) { + if (count == 1) { + singleton = true; + cout << "

singleton" + << (m_collection.end() - ppWithSameTarget==1?"":"s") << " (" + << (m_collection.end() - ppWithSameTarget) + << "/" << m_size << ")

"; + } else { + cout << "

"; + (*(ppWithSameTarget->begin()))->PrintTarget( &cout ); + cout << " (" << count << "/" << m_size << ")" << endl; + cout << "

"; + } + cout << ""; + } + + vector< PhrasePair* >::const_iterator p; + // loop over all sentences where translation occurs + int pp=0; + int i=0; + for(p = ppWithSameTarget->begin(); i<10 && ppend(); p++, pp++, i++ ) { + (*p)->PrintClippedHTML( &cout, 160 ); + if (count > m_max_example) { + p += count/m_max_example-1; + pp += count/m_max_example-1; + } + } + if (i == 10 && pp < count) { + // extended table + cout << "
(more)
"; + cout << "
"; + cout << ""; + for(i=0, pp=0, p = ppWithSameTarget->begin(); iend(); p++, pp++, i++ ) { + (*p)->PrintClippedHTML( &cout, 160 ); + if (count > m_max_example) { + p += count/m_max_example-1; + pp += count/m_max_example-1; + } + } + } + if (!singleton) cout << "
\n"; + + if (!singleton && pp_target == 9) { + cout << "
"; + cout << "

(more)

"; + cout << "
"; + } + } + if (singleton) cout << "
\n"; + else if (pp_target > 9) cout << ""; + + size_t max_mismatch = m_max_example/3; + // unaligned phrases + if (m_unaligned.size() > 0) { + cout << "

unaligned" + << " (" << (m_unaligned.size()) << ")

"; + cout << ""; + int step_size = 1; + if (m_unaligned.size() > max_mismatch) + step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch; + for(size_t i=0; iPrintClippedHTML( &cout, 160 ); + cout << "
"; + } + + // mismatched phrases + if (m_mismatch.size() > 0) { + cout << "

mismatched" + << " (" << (m_mismatch.size()) << ")

"; + cout << ""; + int step_size = 1; + if (m_mismatch.size() > max_mismatch) + step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch; + for(size_t i=0; iPrintClippedHTML( &cout, 160 ); + cout << "
"; + } +} diff --git a/mosesdecoder/biconcor/PhrasePairCollection.h b/mosesdecoder/biconcor/PhrasePairCollection.h new file mode 100644 index 0000000000000000000000000000000000000000..e076eba9bbc1c1ae76f3c1f80c0887c7167a1f7e --- /dev/null +++ b/mosesdecoder/biconcor/PhrasePairCollection.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include + +class Alignment; +class PhrasePair; +class SuffixArray; +class TargetCorpus; +class Mismatch; + +class PhrasePairCollection +{ +public: + typedef unsigned int INDEX; + +private: + SuffixArray *m_suffixArray; + TargetCorpus *m_targetCorpus; + Alignment *m_alignment; + std::vector > m_collection; + std::vector< Mismatch* > m_mismatch, m_unaligned; + int m_size; + int m_max_lookup; + int m_max_translation; + int m_max_example; + + // No copying allowed. + PhrasePairCollection(const PhrasePairCollection&); + void operator=(const PhrasePairCollection&); + +public: + PhrasePairCollection ( SuffixArray *, TargetCorpus *, Alignment *, int, int ); + ~PhrasePairCollection (); + + int GetCollection( const std::vector& sourceString ); + void Print(bool pretty) const; + void PrintHTML() const; +}; + +// sorting helper +struct CompareBySize { + bool operator()(const std::vector& a, const std::vector& b ) const { + return a.size() > b.size(); + } +}; diff --git a/mosesdecoder/biconcor/SuffixArray.h b/mosesdecoder/biconcor/SuffixArray.h new file mode 100644 index 0000000000000000000000000000000000000000..f20702e41f0e283a27ce4074f1f2f8ae08964d11 --- /dev/null +++ b/mosesdecoder/biconcor/SuffixArray.h @@ -0,0 +1,82 @@ +#pragma once + +#include "Vocabulary.h" + +class SuffixArray +{ +public: + typedef unsigned int INDEX; + +private: + WORD_ID *m_array; + INDEX *m_index; + INDEX *m_buffer; + char *m_wordInSentence; + INDEX *m_sentence; + char *m_sentenceLength; + WORD_ID m_endOfSentence; + INDEX *m_document; + INDEX *m_documentName; + char *m_documentNameBuffer; + size_t m_documentNameLength; + size_t m_documentCount; + bool m_useDocument; + 
Vocabulary m_vcb; + INDEX m_size; + INDEX m_sentenceCount; + + // No copying allowed. + SuffixArray(const SuffixArray&); + void operator=(const SuffixArray&); + +public: + SuffixArray(); + ~SuffixArray(); + + void Create(const std::string& fileName ); + bool ProcessDocumentLine( const char* const, const size_t ); + void Sort(INDEX start, INDEX end); + int CompareIndex( INDEX a, INDEX b ) const; + inline int CompareWord( WORD_ID a, WORD_ID b ) const; + int Count( const std::vector< WORD > &phrase ); + bool MinCount( const std::vector< WORD > &phrase, INDEX min ); + bool Exists( const std::vector< WORD > &phrase ); + int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 ); + int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 ); + INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end ); + INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction ); + int Match( const std::vector< WORD > &phrase, INDEX index ); + void List( INDEX start, INDEX end ); + void PrintSentenceMatches( const std::vector< WORD > &phrase ); + inline INDEX GetPosition( INDEX index ) const { + return m_index[ index ]; + } + inline INDEX GetSentence( INDEX position ) const { + return m_sentence[position]; + } + inline char GetWordInSentence( INDEX position ) const { + return m_wordInSentence[position]; + } + inline char GetSentenceLength( INDEX sentenceId ) const { + return m_sentenceLength[sentenceId]; + } + inline INDEX GetSize() const { + return m_size; + } + inline WORD GetWord( INDEX position ) const { + return m_vcb.GetWord( m_array[position] ); + } + void UseDocument() { + m_useDocument = true; + } + INDEX GetDocument( INDEX sentence ) const; + void PrintDocumentName( INDEX document ) { + for(INDEX i=m_documentName[ document ]; m_documentNameBuffer[i] != 0; i++) { 
+ std::cout << m_documentNameBuffer[ i ]; + } + } + void Save(const std::string& fileName ) const; + void Load(const std::string& fileName ); + void CheckAllocation(bool, const char *dataStructure) const; + bool Error( const char* message, const std::string& fileName) const; +}; diff --git a/mosesdecoder/biconcor/biconcor.cpp b/mosesdecoder/biconcor/biconcor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cb63e855d82e4b29a3ff3c7ada13fe7996a481cb --- /dev/null +++ b/mosesdecoder/biconcor/biconcor.cpp @@ -0,0 +1,171 @@ +#include "SuffixArray.h" +#include "TargetCorpus.h" +#include "Alignment.h" +#include "PhrasePairCollection.h" +#include +#include "base64.h" + +using namespace std; + +int main(int argc, char* argv[]) +{ + // handle parameters + string query; + string fileNameSuffix; + string fileNameSource; + string fileNameTarget = ""; + string fileNameAlignment = ""; + int loadFlag = false; + int saveFlag = false; + int createFlag = false; + int queryFlag = false; + int htmlFlag = false; // output as HTML + int prettyFlag = false; // output readable on screen + int stdioFlag = false; // receive requests from STDIN, respond to STDOUT + int max_translation = 20; + int max_example = 50; + string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n\t[--translations count]\n\t[--examples count]\n\t[--html]\n\t[--stdio]\n"; + while(1) { + static struct option long_options[] = { + {"load", required_argument, 0, 'l'}, + {"save", required_argument, 0, 's'}, + {"create", required_argument, 0, 'c'}, + {"query", required_argument, 0, 'q'}, + {"target", required_argument, 0, 't'}, + {"alignment", required_argument, 0, 'a'}, + {"html", no_argument, 0, 'h'}, + {"pretty", no_argument, 0, 'p'}, + {"stdio", no_argument, 0, 'i'}, + {"translations", required_argument, 0, 'o'}, + {"examples", required_argument, 0, 'e'}, + {0, 0, 0, 0} + }; + int 
option_index = 0; + int c = getopt_long (argc, argv, "l:s:c:q:Q:t:a:hpio:e:", long_options, &option_index); + if (c == -1) break; + switch (c) { + case 'l': + fileNameSuffix = string(optarg); + loadFlag = true; + break; + case 't': + fileNameTarget = string(optarg); + break; + case 'a': + fileNameAlignment = string(optarg); + break; + case 's': + fileNameSuffix = string(optarg); + saveFlag = true; + break; + case 'c': + fileNameSource = string(optarg); + createFlag = true; + break; + case 'Q': + query = base64_decode(string(optarg)); + queryFlag = true; + break; + case 'q': + query = string(optarg); + queryFlag = true; + break; + case 'o': + max_translation = atoi(optarg); + break; + case 'e': + max_example = atoi(optarg); + break; + case 'p': + prettyFlag = true; + break; + case 'h': + htmlFlag = true; + break; + case 'i': + stdioFlag = true; + break; + default: + cerr << info; + exit(1); + } + } + if (stdioFlag) { + queryFlag = true; + } + + // check if parameter settings are legal + if (saveFlag && !createFlag) { + cerr << "error: cannot save without creating\n" << info; + exit(1); + } + if (saveFlag && loadFlag) { + cerr << "error: cannot load and save at the same time\n" << info; + exit(1); + } + if (!loadFlag && !createFlag) { + cerr << "error: neither load or create - i have no info!\n" << info; + exit(1); + } + if (createFlag && (fileNameTarget == "" || fileNameAlignment == "")) { + cerr << "error: i have no target corpus or alignment\n" << info; + exit(1); + } + + // do your thing + SuffixArray suffixArray; + TargetCorpus targetCorpus; + Alignment alignment; + if (createFlag) { + cerr << "will create\n"; + cerr << "source corpus is in " << fileNameSource << endl; + suffixArray.Create( fileNameSource ); + cerr << "target corpus is in " << fileNameTarget << endl; + targetCorpus.Create( fileNameTarget ); + cerr << "alignment is in " << fileNameAlignment << endl; + alignment.Create( fileNameAlignment ); + if (saveFlag) { + suffixArray.Save( fileNameSuffix ); + 
targetCorpus.Save( fileNameSuffix ); + alignment.Save( fileNameSuffix ); + cerr << "will save in " << fileNameSuffix << endl; + } + } + if (loadFlag) { + cerr << "will load from " << fileNameSuffix << endl; + suffixArray.Load( fileNameSuffix ); + targetCorpus.Load( fileNameSuffix ); + alignment.Load( fileNameSuffix ); + } + if (stdioFlag) { + cout << "-|||- BICONCOR START -|||-" << endl << flush; + while(true) { + string query; + if (getline(cin, query, '\n').eof()) { + return 0; + } + vector< string > queryString = alignment.Tokenize( query.c_str() ); + PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example ); + int total = ppCollection.GetCollection( queryString ); + cout << "TOTAL: " << total << endl; + if (htmlFlag) { + ppCollection.PrintHTML(); + } else { + ppCollection.Print(prettyFlag); + } + cout << "-|||- BICONCOR END -|||-" << endl << flush; + } + } else if (queryFlag) { + cerr << "query is " << query << endl; + vector< string > queryString = alignment.Tokenize( query.c_str() ); + PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example ); + ppCollection.GetCollection( queryString ); + if (htmlFlag) { + ppCollection.PrintHTML(); + } else { + ppCollection.Print(prettyFlag); + } + } + + return 0; +} diff --git a/mosesdecoder/chk.tmp b/mosesdecoder/chk.tmp new file mode 100644 index 0000000000000000000000000000000000000000..9daeafb9864cf43055ae93beb0afd6c7d144bfa4 --- /dev/null +++ b/mosesdecoder/chk.tmp @@ -0,0 +1 @@ +test diff --git a/mosesdecoder/doxygen.conf b/mosesdecoder/doxygen.conf new file mode 100644 index 0000000000000000000000000000000000000000..3cd93e9ed54ea8283c01b0dce52a402dbe59c92f --- /dev/null +++ b/mosesdecoder/doxygen.conf @@ -0,0 +1,1781 @@ +# Doxyfile 1.7.6.1 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. 
+# +# All text after a hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" "). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# http://www.gnu.org/software/libiconv for the list of possible encodings. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or sequence of words) that should +# identify the project. Note that if you do not use Doxywizard you need +# to put quotes around the project name if it contains spaces. + +PROJECT_NAME = "Moses Decoder" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. +# This could be handy for archiving the generated documentation or +# if some version control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer +# a quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify an logo or icon that is +# included in the documentation. The maximum height of the logo should not +# exceed 55 pixels and the maximum width should not exceed 200 pixels. +# Doxygen will copy the logo to the output directory. 
+ +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. + +OUTPUT_DIRECTORY = doxy + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, +# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English +# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, +# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, +# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. 
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. +# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = YES + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. 
+ +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. This can be useful if your file system +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a JavaDoc-style +# comment as the brief description. If set to NO, the JavaDoc +# comments will behave just like regular Qt-style comments +# (thus requiring an explicit @brief command for a brief description.) + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then Doxygen will +# interpret the first line (until the first dot) of a Qt-style +# comment as the brief description. If set to NO, the comments +# will behave just like regular Qt-style comments (thus requiring +# an explicit \brief command for a brief description.) + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen +# treat a multi-line C++ special comment block (i.e. a block of //! or /// +# comments) as a brief description. This used to be the default behaviour. +# The new default is to treat a multi-line C++ comment block as a detailed +# description. Set this tag to YES if you prefer the old behaviour instead. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# re-implements. 
+ +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce +# a new page for each member. If set to NO, the documentation of a member will +# be part of the file/class/namespace that contains it. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments. + +TAB_SIZE = 8 + +# This tag can be used to specify a number of aliases that acts +# as commands in the documentation. An alias has the form "name=value". +# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding +# "class=itcl::class" will allow you to use the command class in the +# itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for +# Java. For instance, namespaces will be presented as packages, qualified +# scopes will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources only. Doxygen will then generate output that is more tailored for +# Fortran. 
+ +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for +# VHDL. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given extension. +# Doxygen has a built-in mapping, but you can override or extend it using this +# tag. The format is ext=language, where ext is a file extension, and language +# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C, +# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make +# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C +# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions +# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should +# set this tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. +# func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. +# Doxygen will parse them like normal C++ but will assume all classes use public +# instead of private inheritance when no explicit protection keyword is present. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate getter +# and setter methods for a property. 
Setting this option to YES (the default) +# will make doxygen replace the get and set methods by a property in the +# documentation. This will only work if the methods are indeed getting or +# setting a simple type. If this is not the case, or you want to show the +# methods anyway, you should set this option to NO. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. + +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES (the default) to allow class member groups of +# the same type (for instance a group of public functions) to be put as a +# subgroup of that type (e.g. under the Public Functions section). Set it to +# NO to prevent subgrouping. Alternatively, this can be done per class using +# the \nosubgrouping command. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and +# unions are shown inside the group in which they are included (e.g. using +# @ingroup) instead of on a separate page (for HTML and Man pages) or +# section (for LaTeX and RTF). + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and +# unions with only public data fields will be shown inline in the documentation +# of the scope in which they are defined (i.e. file, namespace, or group +# documentation), provided this scope is documented. If set to NO (the default), +# structs, classes, and unions are shown on a separate page (for HTML and Man +# pages) or section (for LaTeX and RTF). + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum +# is documented as struct, union, or enum with the name of the typedef. 
So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically +# be useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. + +TYPEDEF_HIDES_STRUCT = NO + +# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to +# determine which symbols to keep in memory and which to flush to disk. +# When the cache is full, less often used symbols will be written to disk. +# For small to medium size projects (<1000 input files) the default value is +# probably good enough. For larger projects a too small cache size can cause +# doxygen to be busy swapping symbols to and from disk most of the time +# causing a significant performance penalty. +# If the system has enough physical memory increasing the cache will improve the +# performance by keeping more symbols in memory. Note that the value works on +# a logarithmic scale so increasing the size by one will roughly double the +# memory usage. The cache size is given by this formula: +# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, +# corresponding to a cache size of 2^16 = 65536 symbols. + +SYMBOL_CACHE_SIZE = 0 + +# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be +# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given +# their name and scope. Since this can be an expensive process and often the +# same symbol appear multiple times in the code, doxygen keeps a cache of +# pre-resolved symbols. If the cache is too small doxygen will become slower. +# If the cache is too large, memory is wasted. The cache size is given by this +# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0, +# corresponding to a cache size of 2^16 = 65536 symbols. 
+ +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. +# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = YES + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = NO + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base +# name of the file that contains the anonymous namespace. By default +# anonymous namespaces are hidden. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. 
+# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. +# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. + +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. 
+ +SHOW_INCLUDE_FILES = YES + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen +# will list include files with double quotes in the documentation +# rather than with sharp brackets. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen +# will sort the (brief and detailed) documentation of class members so that +# constructors and destructors are listed first. If set to NO (the default) +# the constructors will appear in the respective orders defined by +# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. +# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO +# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the +# hierarchy of group names into alphabetical order. If set to NO (the default) +# the group names will appear in their defined order. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. 
+# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to +# do proper type resolution of all parameters of a function it will reject a +# match between the prototype and the implementation of a member function even +# if there is only one candidate or it is obvious which candidate to choose +# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen +# will still accept a match between prototype and implementation in such cases. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if sectionname ... \endif. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines +# the initial value of a variable or macro consists of for it to appear in +# the documentation. If the initializer consists of more lines than specified +# here it will be hidden. Use a value of 0 to hide initializers completely. 
+# The appearance of the initializer of individual variables and macros in the +# documentation can be controlled using \showinitializer or \hideinitializer +# command in the documentation regardless of this setting. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated +# at the bottom of the documentation of classes and structs. If set to YES the +# list will mention the files that were used to generate the documentation. + +SHOW_USED_FILES = YES + +# If the sources in your project are distributed over multiple directories +# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy +# in the documentation. The default is NO. + +SHOW_DIRECTORIES = NO + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. +# This will remove the Files entry from the Quick Index and from the +# Folder Tree View (if specified). The default is YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the +# Namespaces page. +# This will remove the Namespaces entry from the Quick Index +# and from the Folder Tree View (if specified). The default is YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command <command> <input-file>, where <command> is the value of +# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file +# provided by doxygen. Whatever the program writes to standard output +# is used as the file version. See the manual for examples. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way.
To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. +# You can optionally specify a file name after the option, if omitted +# DoxygenLayout.xml will be used as the name of the layout file. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files +# containing the references data. This must be a list of .bib files. The +# .bib extension is automatically appended if omitted. Using this command +# requires the bibtex tool to be installed. See also +# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style +# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this +# feature you need bibtex and perl available in the search path. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated +# by doxygen. Possible values are YES and NO. If left blank NO is used. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated by doxygen. Possible values are YES and NO. If left blank +# NO is used. + +WARNINGS = YES + +# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings +# for undocumented members. If EXTRACT_ALL is set to YES then this flag will +# automatically be disabled. + +WARN_IF_UNDOCUMENTED = YES + +# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some +# parameters in a documented function, or documenting parameters that +# don't exist or using markup commands wrongly.
+ +WARN_IF_DOC_ERROR = YES + +# The WARN_NO_PARAMDOC option can be enabled to get warnings for +# functions that are documented, but have no documentation for their parameters +# or return value. If set to NO (the default) doxygen will only warn about +# wrong or incomplete parameter documentation, but not about the absence of +# documentation. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that +# doxygen can produce. The string should contain the $file, $line, and $text +# tags, which will be replaced by the file and line number from which the +# warning originated and the warning text. Optionally the format may contain +# $version, which will be replaced by the version of the file (if it could +# be obtained via FILE_VERSION_FILTER) + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning +# and error messages should be written. If left blank the output is written +# to stderr. + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag can be used to specify the files and/or directories that contain +# documented source files. You may enter file names like "myfile.cpp" or +# directories like "/usr/src/myproject". Separate the files or directories +# with spaces. + +INPUT = moses + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is +# also the default input encoding. Doxygen uses libiconv (or the iconv built +# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for +# the list of possible encodings. 
+ +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank the following patterns are tested: +# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh +# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py +# *.f90 *.f *.for *.vhd *.vhdl + +FILE_PATTERNS = + +# The RECURSIVE tag can be used to specify whether or not subdirectories +# should be searched for input files as well. Possible values are YES and NO. +# If left blank NO is used. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = opt regtest doxy + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. Note that the wildcards are matched +# against the file with absolute path, so to exclude all test directories +# for example use the pattern */test/* + +EXCLUDE_PATTERNS = opt/* regtest/* doxy/* + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring.
Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or +# directories that contain example code fragments that are included (see +# the \include command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank all files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude +# commands irrespective of the value of the RECURSIVE tag. +# Possible values are YES and NO. If left blank NO is used. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or +# directories that contain images that are included in the documentation (see +# the \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command <filter> <input-file>, where +# <filter> is the value of the INPUT_FILTER tag, and <input-file> is the name of an +# input file. Doxygen will then use the output that the filter program writes +# to standard output. +# If FILTER_PATTERNS is specified, this tag will be +# ignored. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. +# Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. +# The filters are a list of the form: +# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further +# info on how filters are used. If FILTER_PATTERNS is empty or if +# none of the patterns match the file name, INPUT_FILTER is applied.
+ +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse (i.e. when SOURCE_BROWSER is set to YES). + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERNS (if any) +# and it is also possible to disable source filtering for a specific pattern +# using *.ext= (so without naming a filter). This option only has effect when +# FILTER_SOURCE_FILES is enabled. + +FILTER_SOURCE_PATTERNS = + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C and C++ comments will always remain visible. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES +# then for each documented function all documented +# functions referencing it will be listed. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES +# then for each documented function all documented entities +# called/used by that function will be listed.
+ +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. +# Otherwise they will link to the documentation. + +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. + +ALPHABETICAL_INDEX = YES + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. +# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. 
+ +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. If it is left blank doxygen will generate a +# standard header. Note that when using a custom header you are responsible +# for the proper inclusion of any scripts and style sheets that doxygen +# needs, which is dependent on the configuration options used. +# It is advised to generate a default header using "doxygen -w html +# header.html footer.html stylesheet.css YourConfigFile" and then modify +# that header. Note that the header is subject to change so you typically +# have to redo this when upgrading to a newer version of doxygen or when +# changing the value of configuration settings such as GENERATE_TREEVIEW! + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. 
If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# style sheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that +# the files will be copied as-is; there are no commands or markers available. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. +# Doxygen will adjust the colors in the style sheet and background images +# according to this color. Hue is specified as an angle on a colorwheel, +# see http://en.wikipedia.org/wiki/Hue for more information. +# For instance the value 0 represents red, 60 is yellow, 120 is green, +# 180 is cyan, 240 is blue, 300 purple, and 360 is red again. +# The allowed range is 0 to 359. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of +# the colors in the HTML output. For a value of 0 the output will use +# grayscales only. A value of 255 will produce the most vivid colors. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to +# the luminance component of the colors in the HTML output. Values below +# 100 gradually make the output lighter, whereas values above 100 make +# the output darker. The value divided by 100 is the actual gamma applied, +# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, +# and 100 does not change the gamma. 
+ +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting +# this to NO can help when comparing the output of multiple runs. + +HTML_TIMESTAMP = YES + +# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, +# files or namespaces will be aligned in HTML using tables. If set to +# NO a bullet list will be used. + +HTML_ALIGN_MEMBERS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. For this to work a browser that supports +# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox +# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). + +HTML_DYNAMIC_SECTIONS = NO + +# If the GENERATE_DOCSET tag is set to YES, additional index files +# will be generated that can be used as input for Apple's Xcode 3 +# integrated development environment, introduced with OSX 10.5 (Leopard). +# To create a documentation set, doxygen will generate a Makefile in the +# HTML output directory. Running make will produce the docset in that +# directory and running "make install" will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find +# it at startup. +# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. + +GENERATE_DOCSET = NO + +# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the +# feed. A documentation feed provides an umbrella under which multiple +# documentation sets from a single provider (such as a company or product suite) +# can be grouped. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that +# should uniquely identify the documentation set bundle. 
This should be a +# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen +# will append .docset to the name. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING +# is used to encode HtmlHelp index (hhk), content (hhc) and project file +# content. + +CHM_INDEX_ENCODING = + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file.
+ +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated +# that can be used as input for Qt's qhelpgenerator to generate a +# Qt Compressed Help (.qch) of the generated HTML documentation. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can +# be used to specify the file name of the resulting .qch file. +# The path specified is relative to the HTML output folder. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#namespace + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#virtual-folders + +QHP_VIRTUAL_FOLDER = doc + +# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to +# add. For more information please see +# http://doc.trolltech.com/qthelpproject.html#custom-filters + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see +# +# Qt Help Project / Custom Filters. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's +# filter section matches. +# +# Qt Help Project / Filter Attributes. + +QHP_SECT_FILTER_ATTRS = + +# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can +# be used to specify the location of Qt's qhelpgenerator. +# If non-empty doxygen will try to run qhelpgenerator on the generated +# .qhp file.
+ +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files +# will be generated, which together with the HTML files, form an Eclipse help +# plugin. To install this plugin and make it available under the help contents +# menu in Eclipse, the contents of the directory containing the HTML and XML +# files need to be copied into the plugins directory of eclipse. The name of +# the directory within the plugins directory should be the same as +# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before +# the help appears. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have +# this name. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) +# at top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it. Since the tabs have the same information as the +# navigation tree you can set this option to YES if you already set +# GENERATE_TREEVIEW to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. +# If the tag value is set to YES, a side panel will be generated +# containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). +# Windows users are probably better off using the HTML help feature. +# Since the tree basically has the same information as the tab index you +# could consider setting DISABLE_INDEX to YES when enabling this option.
+ +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values +# (range [0,1..20]) that doxygen will group on one line in the generated HTML +# documentation. Note that a value of 0 will completely suppress the enum +# values from appearing in the overview section. + +ENUM_VALUES_PER_LINE = 4 + +# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, +# and Class Hierarchy pages using a tree view instead of an ordered list. + +USE_INLINE_TREES = NO + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. + +TREEVIEW_WIDTH = 250 + +# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open +# links to external symbols imported via tag files in a separate window. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of Latex formulas included +# as images in the HTML documentation. The default is 10. Note that +# when you change the font size after a successful doxygen run you need +# to manually remove any form_*.png images from the HTML output directory +# to force them to be regenerated. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are +# not supported properly for IE 6.0, but are supported on all modern browsers. +# Note that when changing this option you need to delete any form_*.png files +# in the HTML output before the changes have effect. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax +# (see http://www.mathjax.org) which uses client side Javascript for the +# rendering instead of using prerendered bitmaps. Use this if you do not +# have LaTeX installed or if you want the formulas to look prettier in the HTML +# output.
When enabled you also need to install MathJax separately and +# configure the path to it using the MATHJAX_RELPATH option. + +USE_MATHJAX = NO + +# When MathJax is enabled you need to specify the location relative to the +# HTML output directory using the MATHJAX_RELPATH option. The destination +# directory should contain the MathJax.js script. For instance, if the mathjax +# directory is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the +# mathjax.org site, so you can quickly see the result without installing +# MathJax, but it is strongly recommended to install a local copy of MathJax +# before deployment. + +MATHJAX_RELPATH = http://www.mathjax.org/mathjax + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension +# names that should be enabled during MathJax rendering. + +MATHJAX_EXTENSIONS = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box +# for the HTML output. The underlying search engine uses javascript +# and DHTML and should work on any modern browser. Note that when using +# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets +# (GENERATE_DOCSET) there is already a search function so this one should +# typically be disabled. For large projects the javascript based search engine +# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. + +SEARCHENGINE = YES + +# When the SERVER_BASED_SEARCH tag is enabled the search engine will be +# implemented using a PHP enabled web server instead of at the web client +# using Javascript. Doxygen will generate the search PHP script and index +# file to put on the web server. The advantage of the server +# based approach is that it scales better to large projects and allows +# full text search. The disadvantages are that it is more difficult to setup +# and does not have live searching capabilities.
+ +SERVER_BASED_SEARCH = NO + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = NO + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. + +LATEX_OUTPUT = latex + +# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be +# invoked. If left blank `latex' will be used as the default command name. +# Note that when enabling USE_PDFLATEX this option is only used for +# generating bitmaps for formulas in the HTML output, but not in the +# Makefile that is written to the output directory. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for LaTeX. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer. Possible values are: a4, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = a4 + +# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. + +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header.
Notice: only use this tag if you know what you are doing! + +LATEX_HEADER = + +# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for +# the generated latex document. The footer should contain everything after +# the last chapter. If it is left blank doxygen will generate a +# standard footer. Notice: only use this tag if you know what you are doing! + +LATEX_FOOTER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = YES + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = YES + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. +# command to the generated LaTeX files. This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output. + +LATEX_HIDE_INDICES = NO + +# If LATEX_SOURCE_CODE is set to YES then doxygen will include +# source code with syntax highlighting in the LaTeX output. +# Note that which sources are shown also depends on other settings +# such as SOURCE_BROWSER. + +LATEX_SOURCE_CODE = NO + +# The LATEX_BIB_STYLE tag can be used to specify the style to use for the +# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See +# http://en.wikipedia.org/wiki/BibTeX for more info. 
+ +LATEX_BIB_STYLE = plain + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors. + +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. + +RTF_OUTPUT = rtf + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load style sheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. 
+ +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = man + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = .3 + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. + +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. 
+ +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. + +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. + +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. + +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. +# This is useful +# if you want to understand what is going on. 
+# On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = NO + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files +# pointed to by INCLUDE_PATH will be searched when a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. 
+ +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition that +# overrules the definition found in the source code. + +EXPAND_AS_DEFINED = + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all references to function-like macros +# that are alone on a line, have an all uppercase name, and do not end with a +# semicolon, because these will confuse the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES option can be used to specify one or more tagfiles. +# Optionally an initial location of the external documentation +# can be added for each tagfile. The format of a tag file without +# this location is as follows: +# +# TAGFILES = file1 file2 ... +# Adding location for the tag files is done as follows: +# +# TAGFILES = file1=loc1 "file2 = loc2" ... +# where "loc1" and "loc2" can be relative or absolute paths or +# URLs. If a location is present for each tag, the installdox tool +# does not have to be run to correct the links. 
+# Note that each tag file must have a unique name +# (where the name does NOT include the path) +# If a tag file is not located in the directory in which doxygen +# is run, you must also specify the path to the tagfile here. + +TAGFILES = + +# When a file name is specified after GENERATE_TAGFILE, doxygen will create +# a tag file that is based on the input files it reads. + +GENERATE_TAGFILE = + +# If the ALLEXTERNALS tag is set to YES all external classes will be listed +# in the class index. If set to NO only the inherited external classes +# will be listed. + +ALLEXTERNALS = NO + +# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed +# in the modules index. If set to NO, only the current project's groups will +# be listed. + +EXTERNAL_GROUPS = YES + +# The PERL_PATH should be the absolute path and name of the perl script +# interpreter (i.e. the result of `which perl'). + +PERL_PATH = /usr/bin/perl + +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- + +# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will +# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base +# or super classes. Setting the tag to NO turns the diagrams off. Note that +# this option also works with HAVE_DOT disabled, but it is recommended to +# install and use dot, since it yields more powerful graphs. + +CLASS_DIAGRAMS = YES + +# You can define message sequence charts within doxygen comments using the \msc +# command. Doxygen will then run the mscgen tool (see +# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the +# documentation. The MSCGEN_PATH tag allows you to specify the directory where +# the mscgen tool resides. If left empty the tool is assumed to be found in the +# default search path. 
+ +MSCGEN_PATH = + +# If set to YES, the inheritance and collaboration graphs will hide +# inheritance and usage relations if the target is undocumented +# or is not a class. + +HIDE_UNDOC_RELATIONS = YES + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = YES + +# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is +# allowed to run in parallel. When set to 0 (the default) doxygen will +# base this on the number of processors available in the system. You can set it +# explicitly to a value larger than 0 to get control over the balance +# between CPU load and processing speed. + +DOT_NUM_THREADS = 0 + +# By default doxygen will use the Helvetica font for all dot files that +# doxygen generates. When you want a differently looking font you can specify +# the font name using DOT_FONTNAME. You need to make sure dot is able to find +# the font, which can be done by putting it in a standard location or by setting +# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the +# directory containing the font. + +DOT_FONTNAME = Helvetica + +# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. +# The default size is 10pt. + +DOT_FONTSIZE = 10 + +# By default doxygen will tell dot to use the Helvetica font. +# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to +# set the path where dot can find it. + +DOT_FONTPATH = + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# CLASS_DIAGRAMS tag to NO. 
+ +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = YES + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct groups dependencies + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = NO + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. + +INCLUDE_GRAPH = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and +# HAVE_DOT tags are set to YES then doxygen will generate a graph for each +# documented header file showing the documented files that directly or +# indirectly include this file. + +INCLUDED_BY_GRAPH = YES + +# If the CALL_GRAPH and HAVE_DOT options are set to YES then +# doxygen will generate a call dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable call graphs +# for selected functions only using the \callgraph command. + +CALL_GRAPH = NO + +# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then +# doxygen will generate a caller dependency graph for every global function +# or class method. 
Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable caller +# graphs for selected functions only using the \callergraph command. + +CALLER_GRAPH = NO + +# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen +# will generate a graphical hierarchy of all classes instead of a textual one. + +GRAPHICAL_HIERARCHY = YES + +# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES +# then doxygen will show the dependencies a directory has on other directories +# in a graphical way. The dependency relations are determined by the #include +# relations between the files in the directories. + +DIRECTORY_GRAPH = YES + +# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images +# generated by dot. Possible values are svg, png, jpg, or gif. +# If left blank png will be used. If you choose svg you need to set +# HTML_FILE_EXTENSION to xhtml in order to make the SVG files +# visible in IE 9+ (other browsers do not have this requirement). + +DOT_IMAGE_FORMAT = png + +# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to +# enable generation of interactive SVG images that allow zooming and panning. +# Note that this requires a modern browser other than Internet Explorer. +# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you +# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files +# visible. Older versions of IE do not have SVG support. + +INTERACTIVE_SVG = NO + +# The tag DOT_PATH can be used to specify the path where the dot tool can be +# found. If left blank, it is assumed the dot tool can be found in the path. + +DOT_PATH = + +# The DOTFILE_DIRS tag can be used to specify one or more directories that +# contain dot files that are included in the documentation (see the +# \dotfile command). 
+ +DOTFILE_DIRS = + +# The MSCFILE_DIRS tag can be used to specify one or more directories that +# contain msc files that are included in the documentation (see the +# \mscfile command). + +MSCFILE_DIRS = + +# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of +# nodes that will be shown in the graph. If the number of nodes in a graph +# becomes larger than this value, doxygen will truncate the graph, which is +# visualized by representing a node as a red box. Note that doxygen if the +# number of direct children of the root node in a graph is already larger than +# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note +# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. + +DOT_GRAPH_MAX_NODES = 50 + +# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the +# graphs generated by dot. A depth value of 3 means that only nodes reachable +# from the root by following a path via at most 3 edges will be shown. Nodes +# that lay further from the root node will be omitted. Note that setting this +# option to 1 or 2 may greatly reduce the computation time needed for large +# code bases. Also note that the size of a graph can be further restricted by +# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. + +MAX_DOT_GRAPH_DEPTH = 0 + +# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent +# background. This is disabled by default, because dot on Windows does not +# seem to support this out of the box. Warning: Depending on the platform used, +# enabling this option may lead to badly anti-aliased labels on the edges of +# a graph (i.e. they become hard to read). + +DOT_TRANSPARENT = NO + +# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output +# files in one run (i.e. multiple -o and -T options on the command line). 
This +# makes dot run faster, but since only newer versions of dot (>1.8.10) +# support this, this feature is disabled by default. + +DOT_MULTI_TARGETS = YES + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. + +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. + +DOT_CLEANUP = YES diff --git a/mosesdecoder/moses-cmd/Jamfile b/mosesdecoder/moses-cmd/Jamfile new file mode 100644 index 0000000000000000000000000000000000000000..f705732f89bc84178dbec10077a89f25b074254b --- /dev/null +++ b/mosesdecoder/moses-cmd/Jamfile @@ -0,0 +1,7 @@ +alias deps : ..//z ..//boost_iostreams ..//boost_filesystem ../moses//moses ; + +exe moses : Main.cpp deps ; +exe vwtrainer : MainVW.cpp deps ; +exe lmbrgrid : LatticeMBRGrid.cpp deps ; +alias programs : moses lmbrgrid vwtrainer ; + diff --git a/mosesdecoder/moses-cmd/LatticeMBRGrid.cpp b/mosesdecoder/moses-cmd/LatticeMBRGrid.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3cb8013426d82f4a387e4d4ecad8b072975a0da7 --- /dev/null +++ b/mosesdecoder/moses-cmd/LatticeMBRGrid.cpp @@ -0,0 +1,215 @@ +// $Id: LatticeMBRGrid.cpp 3045 2010-04-05 13:07:29Z hieuhoang1972 $ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (c) 2010 University of Edinburgh +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of the University of Edinburgh nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS +BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ +/** +* Lattice MBR grid search. Enables a grid search through the four parameters (p,r,scale and prune) used in lattice MBR. + See 'Lattice Minimum Bayes-Risk Decoding for Statistical Machine Translation by Tromble, Kumar, Och and Macherey, + EMNLP 2008 for details of the parameters. + + The grid search is controlled by specifying comma separated lists for the lmbr parameters (-lmbr-p, -lmbr-r, + -lmbr-pruning-factor and -mbr-scale). All other parameters are passed through to moses. If any of the lattice mbr + parameters are missing, then they are set to their default values. 
Output is of the form: + sentence-id ||| p r prune scale ||| translation-hypothesis +**/ + +#include +#include +#include +#include +#include + +#include "moses/IOWrapper.h" +#include "moses/LatticeMBR.h" +#include "moses/Manager.h" +#include "moses/Timer.h" +#include "moses/StaticData.h" +#include "util/exception.hh" + +#include +#include "moses/TranslationTask.h" + +using namespace std; +using namespace Moses; + +//keys +enum gridkey {lmbr_p,lmbr_r,lmbr_prune,lmbr_scale}; + +namespace Moses +{ + +class Grid +{ +public: + /** Add a parameter with key, command line argument, and default value */ + void addParam(gridkey key, const string& arg, float defaultValue) { + m_args[arg] = key; + UTIL_THROW_IF2(m_grid.find(key) != m_grid.end(), + "Couldn't find value for key " << (int) key); + m_grid[key].push_back(defaultValue); + } + + /** Parse the arguments, removing those that define the grid and returning a copy of the rest */ + void parseArgs(int& argc, char const**& argv) { + char const** newargv = new char const*[argc+1]; //Space to add mbr parameter + int newargc = 0; + for (int i = 0; i < argc; ++i) { + bool consumed = false; + for (map::const_iterator argi = m_args.begin(); argi != m_args.end(); ++argi) { + if (!strcmp(argv[i], argi->first.c_str())) { + ++i; + if (i >= argc) { + cerr << "Error: missing parameter for " << argi->first << endl; + throw runtime_error("Missing parameter"); + } else { + string value = argv[i]; + gridkey key = argi->second; + if (m_grid[key].size() != 1) { + throw runtime_error("Duplicate grid argument"); + } + m_grid[key].clear(); + char delim = ','; + string::size_type lastpos = value.find_first_not_of(delim); + string::size_type pos = value.find_first_of(delim,lastpos); + while (string::npos != pos || string::npos != lastpos) { + float param = atof(value.substr(lastpos, pos-lastpos).c_str()); + if (!param) { + cerr << "Error: Illegal grid parameter for " << argi->first << endl; + throw runtime_error("Illegal grid parameter"); + } + 
m_grid[key].push_back(param); + lastpos = value.find_first_not_of(delim,pos); + pos = value.find_first_of(delim,lastpos); + } + consumed = true; + } + if (consumed) break; + } + } + if (!consumed) { + // newargv[newargc] = new char[strlen(argv[i]) + 1]; + // strcpy(newargv[newargc],argv[i]); + newargv[newargc] = argv[i]; + ++newargc; + } + } + argc = newargc; + argv = newargv; + } + + /** Get the grid for a particular key.*/ + const vector& getGrid(gridkey key) const { + map >::const_iterator iter = m_grid.find(key); + assert (iter != m_grid.end()); + return iter->second; + + } + +private: + map > m_grid; + map m_args; +}; + +} // namespace + +int main(int argc, char const* argv[]) +{ + cerr << "Lattice MBR Grid search" << endl; + + Grid grid; + grid.addParam(lmbr_p, "-lmbr-p", 0.5); + grid.addParam(lmbr_r, "-lmbr-r", 0.5); + grid.addParam(lmbr_prune, "-lmbr-pruning-factor",30.0); + grid.addParam(lmbr_scale, "-mbr-scale",1.0); + + grid.parseArgs(argc,argv); + + Parameter* params = new Parameter(); + if (!params->LoadParam(argc,argv)) { + params->Explain(); + exit(1); + } + + ResetUserTime(); + if (!StaticData::LoadDataStatic(params, argv[0])) { + exit(1); + } + + StaticData& SD = const_cast(StaticData::Instance()); + boost::shared_ptr opts(new AllOptions(*SD.options())); + LMBR_Options& lmbr = opts->lmbr; + MBR_Options& mbr = opts->mbr; + lmbr.enabled = true; + + boost::shared_ptr ioWrapper(new IOWrapper(*opts)); + if (!ioWrapper) { + throw runtime_error("Failed to initialise IOWrapper"); + } + size_t nBestSize = mbr.size; + + if (nBestSize <= 0) { + throw new runtime_error("Non-positive size specified for n-best list"); + } + + const vector& pgrid = grid.getGrid(lmbr_p); + const vector& rgrid = grid.getGrid(lmbr_r); + const vector& prune_grid = grid.getGrid(lmbr_prune); + const vector& scale_grid = grid.getGrid(lmbr_scale); + + boost::shared_ptr source; + while((source = ioWrapper->ReadInput()) != NULL) { + // set up task of translating one sentence + 
boost::shared_ptr ttask; + ttask = TranslationTask::create(source, ioWrapper); + Manager manager(ttask); + manager.Decode(); + TrellisPathList nBestList; + manager.CalcNBest(nBestSize, nBestList,true); + //grid search + BOOST_FOREACH(float const& p, pgrid) { + lmbr.precision = p; + BOOST_FOREACH(float const& r, rgrid) { + lmbr.ratio = r; + BOOST_FOREACH(size_t const prune_i, prune_grid) { + lmbr.pruning_factor = prune_i; + BOOST_FOREACH(float const& scale_i, scale_grid) { + mbr.scale = scale_i; + size_t lineCount = source->GetTranslationId(); + cout << lineCount << " ||| " << p << " " + << r << " " << size_t(prune_i) << " " << scale_i + << " ||| "; + vector mbrBestHypo = doLatticeMBR(manager,nBestList); + manager.OutputBestHypo(mbrBestHypo, cout); + } + } + } + } + } +} diff --git a/mosesdecoder/moses-cmd/Main.cpp b/mosesdecoder/moses-cmd/Main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0318d8f4e6f25882931b12baa2a45f68bdeb99c9 --- /dev/null +++ b/mosesdecoder/moses-cmd/Main.cpp @@ -0,0 +1,33 @@ +// $Id: MainMT.cpp 3045 2010-04-05 13:07:29Z hieuhoang1972 $ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2009 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +/** + * Moses main wrapper for executable for single-threaded and multi-threaded, simply calling decoder_main. + **/ +#include "moses/ExportInterface.h" +#include "util/string_stream.hh" + +/** main function of the command line version of the decoder **/ +int main(int argc, char const** argv) +{ + return decoder_main(argc, argv); +} + diff --git a/mosesdecoder/moses-cmd/MainVW.cpp b/mosesdecoder/moses-cmd/MainVW.cpp new file mode 100644 index 0000000000000000000000000000000000000000..694dcee8af0629e471f29e441621f4efbfa88044 --- /dev/null +++ b/mosesdecoder/moses-cmd/MainVW.cpp @@ -0,0 +1,186 @@ +// $Id: MainMT.cpp 3045 2010-04-05 13:07:29Z hieuhoang1972 $ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2009 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +/** + * Moses main, for single-threaded and multi-threaded. 
+ **/ +#include +#include +#include +#include + +#include "util/usage.hh" + +#ifdef WIN32 +// Include Visual Leak Detector +//#include +#endif + +#include "moses/IOWrapper.h" +#include "moses/Hypothesis.h" +#include "moses/Manager.h" +#include "moses/StaticData.h" +#include "moses/TypeDef.h" +#include "moses/Util.h" +#include "moses/Timer.h" +#include "moses/TranslationModel/PhraseDictionary.h" +#include "moses/FF/StatefulFeatureFunction.h" +#include "moses/FF/StatelessFeatureFunction.h" +#include "moses/TrainingTask.h" +#include "util/random.hh" + +#ifdef HAVE_PROTOBUF +#include "hypergraph.pb.h" +#endif + +using namespace std; +using namespace Moses; + +namespace Moses +{ + +void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream) +{ + outputSearchGraphStream.setf(std::ios::fixed); + outputSearchGraphStream.precision(6); + StaticData::Instance().GetAllWeights().Save(outputSearchGraphStream); +} + + +} //namespace + +/** main function of the command line version of the decoder **/ +int main(int argc, char const** argv) +{ + //setting in the Staticdata a link between the thread id of this process and a NULL tasksptr + // StaticData::InstanceNonConst().SetTask(); // => moved into StaticData constructor + + try { + +#ifdef HAVE_PROTOBUF + GOOGLE_PROTOBUF_VERIFY_VERSION; +#endif + + // echo command line, if verbose + IFVERBOSE(1) { + TRACE_ERR("command: "); + for(int i=0; i just dump out weights and exit + if (params.isParamSpecified("show-weights")) { + ShowWeights(); + exit(0); + } + + // shorthand for accessing information in StaticData + const StaticData& staticData = StaticData::Instance(); + + + //initialise random numbers + util::rand_init(); + + // set up read/writing class + IFVERBOSE(1) { + PrintUserTime("Created input-output object"); + } + AllOptions::ptr opts(new AllOptions(*StaticData::Instance().options())); + boost::shared_ptr ioWrapper(new IOWrapper(*opts)); + if (ioWrapper == NULL) { + cerr << "Error; Failed to create IO object" 
<< endl; + exit(1); + } + + // check on weights + const ScoreComponentCollection& weights = staticData.GetAllWeights(); + IFVERBOSE(2) { + TRACE_ERR("The global weight vector looks like this: "); + TRACE_ERR(weights); + TRACE_ERR("\n"); + } + +#ifdef WITH_THREADS +#pragma message ("Compiling with Threads.") + ThreadPool pool(staticData.ThreadCount()); +#endif + + // main loop over set of input sentences + + boost::shared_ptr scope(new ContextScope); + boost::shared_ptr source; + while ((source = ioWrapper->ReadInput()) != NULL) { + IFVERBOSE(1) { + ResetUserTime(); + } + + // set up task of training one sentence + boost::shared_ptr task; + task = TrainingTask::create(source, ioWrapper, scope); + + // execute task +#ifdef WITH_THREADS + pool.Submit(task); +#else + task->Run(); +#endif + } + + // we are done, finishing up +#ifdef WITH_THREADS + pool.Stop(true); //flush remaining jobs +#endif + + FeatureFunction::Destroy(); + + } catch (const std::exception &e) { + std::cerr << "Exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } + + IFVERBOSE(1) util::PrintUsage(std::cerr); + +#ifndef EXIT_RETURN + //This avoids that destructors are called (it can take a long time) + exit(EXIT_SUCCESS); +#else + return EXIT_SUCCESS; +#endif +} diff --git a/mosesdecoder/moses-cmd/MainVW.h b/mosesdecoder/moses-cmd/MainVW.h new file mode 100644 index 0000000000000000000000000000000000000000..49fee0219ec069f2ddb9353995d1ffe0a804389c --- /dev/null +++ b/mosesdecoder/moses-cmd/MainVW.h @@ -0,0 +1,42 @@ +#pragma once +// $Id$ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (c) 2006 University of Edinburgh +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of the University of Edinburgh nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS +BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +// example file on how to use moses library + + +#include "moses/StaticData.h" + +class IOWrapper; + +int main(int argc, char* argv[]); + diff --git a/mosesdecoder/moses2/AlignmentInfo.h b/mosesdecoder/moses2/AlignmentInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..89b31a1fc44c160baed53909b06ba9b06f21399e --- /dev/null +++ b/mosesdecoder/moses2/AlignmentInfo.h @@ -0,0 +1,148 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2011 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include "TypeDef.h" + +namespace Moses2 +{ + +class AlignmentInfoCollection; +class System; + +/** Collection of non-terminal alignment pairs, ordered by source index. 
+ * Usually held by a TargetPhrase to map non-terms in hierarchical/syntax models + */ +class AlignmentInfo +{ + friend struct AlignmentInfoOrderer; + friend struct AlignmentInfoHasher; + friend class AlignmentInfoCollection; + friend class VW; + + friend std::ostream& operator<<(std::ostream& out, const AlignmentInfo& obj); + +public: + typedef std::set > CollType; + typedef std::vector NonTermIndexMap; + typedef CollType::const_iterator const_iterator; + + const_iterator begin() const { + return m_collection.begin(); + } + const_iterator end() const { + return m_collection.end(); + } + + void Add(size_t sourcePos, size_t targetPos) { + m_collection.insert(std::pair(sourcePos, targetPos)); + } + /** Provides a map from target-side to source-side non-terminal indices. + * The target-side index should be the rule symbol index (COUNTING terminals). + * The index returned is the rule non-terminal index (IGNORING terminals). + */ + const NonTermIndexMap &GetNonTermIndexMap() const { + return m_nonTermIndexMap; + } + + /** Like GetNonTermIndexMap but the return value is the symbol index (i.e. + * the index counting both terminals and non-terminals) */ + const NonTermIndexMap &GetNonTermIndexMap2() const { + return m_nonTermIndexMap2; + } + + const CollType &GetAlignments() const { + return m_collection; + } + + std::set GetAlignmentsForSource(size_t sourcePos) const; + std::set GetAlignmentsForTarget(size_t targetPos) const; + + size_t GetSize() const { + return m_collection.size(); + } + + std::vector< const std::pair* > + GetSortedAlignments(Moses2::WordAlignmentSort SortOrder) const; + + std::vector GetSourceIndex2PosMap() const; + + bool operator==(const AlignmentInfo& rhs) const { + return m_collection == rhs.m_collection && + m_nonTermIndexMap == rhs.m_nonTermIndexMap; + } + + std::string Debug(const System &system) const; + +private: + //! 
AlignmentInfo objects should only be created by an AlignmentInfoCollection + explicit AlignmentInfo(const std::set > &pairs); + explicit AlignmentInfo(const std::vector &aln); + + // used only by VW to load word alignment between sentences + explicit AlignmentInfo(const std::string &str); + + void BuildNonTermIndexMaps(); + + CollType m_collection; + NonTermIndexMap m_nonTermIndexMap; + NonTermIndexMap m_nonTermIndexMap2; +}; + +/** Define an arbitrary strict weak ordering between AlignmentInfo objects + * for use by AlignmentInfoCollection. + */ +struct AlignmentInfoOrderer { + bool operator()(const AlignmentInfo &a, const AlignmentInfo &b) const { + if (a.m_collection == b.m_collection) { + return a.m_nonTermIndexMap < b.m_nonTermIndexMap; + } else { + return a.m_collection < b.m_collection; + } + } +}; + +/** + * Hashing functoid + **/ +struct AlignmentInfoHasher { + size_t operator()(const AlignmentInfo& a) const { + size_t seed = 0; + boost::hash_combine(seed,a.m_collection); + boost::hash_combine(seed,a.m_nonTermIndexMap); + return seed; + } + +}; + +inline size_t hash_value(const AlignmentInfo& a) +{ + static AlignmentInfoHasher hasher; + return hasher(a); +} + +} diff --git a/mosesdecoder/moses2/AlignmentInfoCollection.cpp b/mosesdecoder/moses2/AlignmentInfoCollection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a6116400c417b6fd51204b03a42b6552e6b401c6 --- /dev/null +++ b/mosesdecoder/moses2/AlignmentInfoCollection.cpp @@ -0,0 +1,62 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2011 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "AlignmentInfoCollection.h" + +using namespace std; + +namespace Moses2 +{ + +AlignmentInfoCollection AlignmentInfoCollection::s_instance; + +AlignmentInfoCollection::AlignmentInfoCollection() +{ + std::set > pairs; + m_emptyAlignmentInfo = Add(pairs); +} + +AlignmentInfoCollection::~AlignmentInfoCollection() +{} + +const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const +{ + return *m_emptyAlignmentInfo; +} + +AlignmentInfo const * +AlignmentInfoCollection:: +Add(AlignmentInfo const& ainfo) +{ +#ifdef WITH_THREADS + { + boost::shared_lock read_lock(m_accessLock); + AlignmentInfoSet::const_iterator i = m_collection.find(ainfo); + if (i != m_collection.end()) + return &*i; + } + boost::unique_lock lock(m_accessLock); +#endif + std::pair ret = m_collection.insert(ainfo); + return &(*ret.first); +} + + + +} diff --git a/mosesdecoder/moses2/ArcLists.cpp b/mosesdecoder/moses2/ArcLists.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1143024c0bcb9aee43c4ae39bb54710d1229273b --- /dev/null +++ b/mosesdecoder/moses2/ArcLists.cpp @@ -0,0 +1,127 @@ +/* + * ArcList.cpp + * + * Created on: 26 Oct 2015 + * Author: hieu + */ +#include +#include +#include +#include +#include "ArcLists.h" +#include "HypothesisBase.h" +#include "util/exception.hh" + +using namespace std; + +namespace Moses2 +{ + +ArcLists::ArcLists() +{ + // TODO Auto-generated constructor stub + +} + +ArcLists::~ArcLists() +{ + 
BOOST_FOREACH(const Coll::value_type &collPair, m_coll) { + const ArcList *arcList = collPair.second; + delete arcList; + } +} + +void ArcLists::AddArc(bool added, const HypothesisBase *currHypo, + const HypothesisBase *otherHypo) +{ + //cerr << added << " " << currHypo << " " << otherHypo << endl; + ArcList *arcList; + if (added) { + // we're winners! + if (otherHypo) { + // there was a existing losing hypo + arcList = &GetAndDetachArcList(otherHypo); + } else { + // there was no existing hypo + arcList = new ArcList; + } + m_coll[currHypo] = arcList; + } else { + // we're losers! + // there should be a winner, we're not doing beam pruning + UTIL_THROW_IF2(otherHypo == NULL, "There must have been a winning hypo"); + arcList = &GetArcList(otherHypo); + } + + // in any case, add the curr hypo + arcList->push_back(currHypo); +} + +ArcList &ArcLists::GetArcList(const HypothesisBase *hypo) +{ + Coll::iterator iter = m_coll.find(hypo); + UTIL_THROW_IF2(iter == m_coll.end(), "Can't find arc list"); + ArcList &arcList = *iter->second; + return arcList; +} + +const ArcList &ArcLists::GetArcList(const HypothesisBase *hypo) const +{ + Coll::const_iterator iter = m_coll.find(hypo); + + if (iter == m_coll.end()) { + cerr << "looking for:" << hypo << " have " << m_coll.size() << " :"; + BOOST_FOREACH(const Coll::value_type &collPair, m_coll) { + const HypothesisBase *hypo = collPair.first; + cerr << hypo << " "; + } + } + + UTIL_THROW_IF2(iter == m_coll.end(), "Can't find arc list for " << hypo); + ArcList &arcList = *iter->second; + return arcList; +} + +ArcList &ArcLists::GetAndDetachArcList(const HypothesisBase *hypo) +{ + Coll::iterator iter = m_coll.find(hypo); + UTIL_THROW_IF2(iter == m_coll.end(), "Can't find arc list"); + ArcList &arcList = *iter->second; + + m_coll.erase(iter); + + return arcList; +} + +void ArcLists::Sort() +{ + BOOST_FOREACH(Coll::value_type &collPair, m_coll) { + ArcList &list = *collPair.second; + std::sort(list.begin(), list.end(), 
HypothesisFutureScoreOrderer() ); + } +} + +void ArcLists::Delete(const HypothesisBase *hypo) +{ + //cerr << "hypo=" << hypo->Debug() << endl; + //cerr << "m_coll=" << m_coll.size() << endl; + Coll::iterator iter = m_coll.find(hypo); + UTIL_THROW_IF2(iter == m_coll.end(), "Can't find arc list"); + ArcList *arcList = iter->second; + + m_coll.erase(iter); + delete arcList; +} + +std::string ArcLists::Debug(const System &system) const +{ + stringstream strm; + BOOST_FOREACH(const Coll::value_type &collPair, m_coll) { + const ArcList *arcList = collPair.second; + strm << arcList << "(" << arcList->size() << ") "; + } + return strm.str(); +} + +} + diff --git a/mosesdecoder/moses2/Array.h b/mosesdecoder/moses2/Array.h new file mode 100644 index 0000000000000000000000000000000000000000..8d40ce2688fd13d80d721d9ae988b0e0794d7d09 --- /dev/null +++ b/mosesdecoder/moses2/Array.h @@ -0,0 +1,85 @@ +#pragma once +#include +#include +#include "MemPool.h" + +namespace Moses2 +{ + +template +class Array +{ +public: + typedef T* iterator; + typedef const T* const_iterator; + //! 
iterators + const_iterator begin() const { + return m_arr; + } + const_iterator end() const { + return m_arr + m_size; + } + + iterator begin() { + return m_arr; + } + iterator end() { + return m_arr + m_size; + } + + Array(MemPool &pool, size_t size = 0, const T &val = T()) { + m_size = size; + m_maxSize = size; + m_arr = pool.Allocate(size); + for (size_t i = 0; i < size; ++i) { + m_arr[i] = val; + } + } + + size_t size() const { + return m_size; + } + + const T& operator[](size_t ind) const { + assert(ind < m_size); + return m_arr[ind]; + } + + T& operator[](size_t ind) { + assert(ind < m_size); + return m_arr[ind]; + } + + T *GetArray() { + return m_arr; + } + + size_t hash() const { + size_t seed = 0; + for (size_t i = 0; i < m_size; ++i) { + boost::hash_combine(seed, m_arr[i]); + } + return seed; + } + + int Compare(const Array &compare) const { + + int cmp = memcmp(m_arr, compare.m_arr, sizeof(T) * m_size); + return cmp; + } + + bool operator==(const Array &compare) const { + int cmp = Compare(compare); + return cmp == 0; + } + + void resize(size_t newSize) { + assert(m_size <= m_maxSize); + m_size = newSize; + } +protected: + size_t m_size, m_maxSize; + T *m_arr; +}; + +} diff --git a/mosesdecoder/moses2/EstimatedScores.h b/mosesdecoder/moses2/EstimatedScores.h new file mode 100644 index 0000000000000000000000000000000000000000..f854707839b6df1c4e44412b4f456711029eb15a --- /dev/null +++ b/mosesdecoder/moses2/EstimatedScores.h @@ -0,0 +1,59 @@ +// $Id$ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include "legacy/Util2.h" +#include "legacy/Bitmap.h" +#include "legacy/Matrix.h" + +namespace Moses2 +{ +class MemPool; +class System; + +//! A square array of floats to store future costs in the phrase-based decoder +class EstimatedScores: public Matrix +{ +public: + EstimatedScores(MemPool &pool, size_t size) : + Matrix(pool, size, size) { + } + + ~EstimatedScores(); // not implemented + + float CalcEstimatedScore(Bitmap const&) const; + float CalcEstimatedScore(Bitmap const&, size_t startPos, size_t endPos) const; + + std::ostream &Debug(std::ostream &out, const System &system) const { + for (size_t endPos = 0; endPos < GetSize(); endPos++) { + for (size_t startPos = 0; startPos < GetSize(); startPos++) + out << GetValue(startPos, endPos) << " "; + out << std::endl; + } + return out; + } + +}; + +} + diff --git a/mosesdecoder/moses2/HypothesisBase.cpp b/mosesdecoder/moses2/HypothesisBase.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c124866d1036d62bdd7bd98c047b94195f1080cd --- /dev/null +++ b/mosesdecoder/moses2/HypothesisBase.cpp @@ -0,0 +1,81 @@ +/* + * Hypothesis.cpp + * + * Created on: 24 Oct 2015 + * Author: hieu + */ + +#include +#include +#include +#include "HypothesisBase.h" +#include "System.h" +#include "Scores.h" +#include "ManagerBase.h" +#include "MemPool.h" +#include "FF/StatefulFeatureFunction.h" + +using namespace std; + +namespace Moses2 +{ + +//size_t 
g_numHypos = 0; + +HypothesisBase::HypothesisBase(MemPool &pool, const System &system) +{ + m_scores = new (pool.Allocate()) Scores(system, pool, + system.featureFunctions.GetNumScores()); + + // FF states + const std::vector &sfffs = + system.featureFunctions.GetStatefulFeatureFunctions(); + size_t numStatefulFFs = sfffs.size(); + m_ffStates = (FFState **) pool.Allocate(sizeof(FFState*) * numStatefulFFs); + + BOOST_FOREACH(const StatefulFeatureFunction *sfff, sfffs) { + size_t statefulInd = sfff->GetStatefulInd(); + FFState *state = sfff->BlankState(pool, system); + m_ffStates[statefulInd] = state; + } +} + +size_t HypothesisBase::hash() const +{ + return hash(0); +} + +size_t HypothesisBase::hash(size_t seed) const +{ + size_t numStatefulFFs = + GetManager().system.featureFunctions.GetStatefulFeatureFunctions().size(); + + // states + for (size_t i = 0; i < numStatefulFFs; ++i) { + const FFState *state = m_ffStates[i]; + size_t hash = state->hash(); + boost::hash_combine(seed, hash); + } + return seed; + +} + +bool HypothesisBase::operator==(const HypothesisBase &other) const +{ + size_t numStatefulFFs = + GetManager().system.featureFunctions.GetStatefulFeatureFunctions().size(); + + // states + for (size_t i = 0; i < numStatefulFFs; ++i) { + const FFState &thisState = *m_ffStates[i]; + const FFState &otherState = *other.m_ffStates[i]; + if (thisState != otherState) { + return false; + } + } + return true; + +} + +} + diff --git a/mosesdecoder/moses2/HypothesisBase.h b/mosesdecoder/moses2/HypothesisBase.h new file mode 100644 index 0000000000000000000000000000000000000000..55747990667b886913227c27cfee933b945e6c54 --- /dev/null +++ b/mosesdecoder/moses2/HypothesisBase.h @@ -0,0 +1,76 @@ +/* + * Hypothesis.h + * + * Created on: 24 Oct 2015 + * Author: hieu + */ +#pragma once + +#include +#include +#include "FF/FFState.h" +#include "Scores.h" + +namespace Moses2 +{ + +class ManagerBase; +class Scores; + +class HypothesisBase +{ +public: + virtual ~HypothesisBase() { 
+ } + + inline ManagerBase &GetManager() const { + return *m_mgr; + } + + template + const T &Cast() const { + return static_cast(*this); + } + + const Scores &GetScores() const { + return *m_scores; + } + Scores &GetScores() { + return *m_scores; + } + + const FFState *GetState(size_t ind) const { + return m_ffStates[ind]; + } + FFState *GetState(size_t ind) { + return m_ffStates[ind]; + } + + virtual size_t hash() const; + virtual size_t hash(size_t seed) const; + virtual bool operator==(const HypothesisBase &other) const; + + virtual SCORE GetFutureScore() const = 0; + virtual void EvaluateWhenApplied() = 0; + + virtual std::string Debug(const System &system) const = 0; + +protected: + ManagerBase *m_mgr; + Scores *m_scores; + FFState **m_ffStates; + + HypothesisBase(MemPool &pool, const System &system); +}; + +//////////////////////////////////////////////////////////////////////////////////// +class HypothesisFutureScoreOrderer +{ +public: + bool operator()(const HypothesisBase* a, const HypothesisBase* b) const { + return a->GetFutureScore() > b->GetFutureScore(); + } +}; + +} + diff --git a/mosesdecoder/moses2/HypothesisColl.cpp b/mosesdecoder/moses2/HypothesisColl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6fd8383e4c352f612fca2f55344a4cea2ee1b09f --- /dev/null +++ b/mosesdecoder/moses2/HypothesisColl.cpp @@ -0,0 +1,286 @@ +/* + * HypothesisColl.cpp + * + * Created on: 26 Feb 2016 + * Author: hieu + */ +#include +#include +#include +#include +#include "HypothesisColl.h" +#include "ManagerBase.h" +#include "System.h" +#include "MemPoolAllocator.h" + +using namespace std; + +namespace Moses2 +{ + +HypothesisColl::HypothesisColl(const ManagerBase &mgr) + :m_coll(MemPoolAllocator(mgr.GetPool())) + ,m_sortedHypos(NULL) +{ + m_bestScore = -std::numeric_limits::infinity(); + m_worstScore = std::numeric_limits::infinity(); +} + +const HypothesisBase *HypothesisColl::GetBestHypo() const +{ + if (GetSize() == 0) { + return NULL; + } + if 
(m_sortedHypos) { + return (*m_sortedHypos)[0]; + } + + SCORE bestScore = -std::numeric_limits::infinity(); + const HypothesisBase *bestHypo; + BOOST_FOREACH(const HypothesisBase *hypo, m_coll) { + if (hypo->GetFutureScore() > bestScore) { + bestScore = hypo->GetFutureScore(); + bestHypo = hypo; + } + } + return bestHypo; +} + +void HypothesisColl::Add( + const ManagerBase &mgr, + HypothesisBase *hypo, + Recycler &hypoRecycle, + ArcLists &arcLists) +{ + size_t maxStackSize = mgr.system.options.search.stack_size; + + if (GetSize() > maxStackSize * 2) { + //cerr << "maxStackSize=" << maxStackSize << " " << GetSize() << endl; + PruneHypos(mgr, mgr.arcLists); + } + + SCORE futureScore = hypo->GetFutureScore(); + + /* + cerr << "scores:" + << futureScore << " " + << m_bestScore << " " + << GetSize() << " " + << endl; + */ + if (GetSize() >= maxStackSize && futureScore < m_worstScore) { + // beam threshold or really bad hypo that won't make the pruning cut + // as more hypos are added, the m_worstScore stat gets out of date and isn't the optimum cut-off point + //cerr << "Discard, really bad score:" << hypo->Debug(mgr.system) << endl; + hypoRecycle.Recycle(hypo); + return; + } + + StackAdd added = Add(hypo); + + size_t nbestSize = mgr.system.options.nbest.nbest_size; + if (nbestSize) { + arcLists.AddArc(added.added, hypo, added.other); + } else { + if (added.added) { + if (added.other) { + hypoRecycle.Recycle(added.other); + } + } else { + hypoRecycle.Recycle(hypo); + } + } + + // update beam variables + if (added.added) { + if (futureScore > m_bestScore) { + m_bestScore = futureScore; + float beamWidth = mgr.system.options.search.beam_width; + if ( m_bestScore + beamWidth > m_worstScore ) { + m_worstScore = m_bestScore + beamWidth; + } + } else if (GetSize() <= maxStackSize && futureScore < m_worstScore) { + m_worstScore = futureScore; + } + } +} + +StackAdd HypothesisColl::Add(const HypothesisBase *hypo) +{ + std::pair<_HCType::iterator, bool> addRet = 
m_coll.insert(hypo); + //cerr << endl << "new=" << hypo->Debug(hypo->GetManager().system) << endl; + + // CHECK RECOMBINATION + if (addRet.second) { + // equiv hypo doesn't exists + //cerr << "Added " << hypo << endl; + return StackAdd(true, NULL); + } else { + HypothesisBase *hypoExisting = const_cast(*addRet.first); + //cerr << "hypoExisting=" << hypoExisting->Debug(hypo->GetManager().system) << endl; + + if (hypo->GetFutureScore() > hypoExisting->GetFutureScore()) { + // incoming hypo is better than the one we have + //cerr << "Add " << hypo << "(" << hypo->hash() << ")" + // << " discard existing " << hypoExisting << "(" << hypoExisting->hash() << ")" + // << endl; + + const HypothesisBase * const &hypoExisting1 = *addRet.first; + const HypothesisBase *&hypoExisting2 = + const_cast(hypoExisting1); + hypoExisting2 = hypo; + + return StackAdd(true, hypoExisting); + } else { + // already storing the best hypo. discard incoming hypo + //cerr << "Keep existing " << hypoExisting << "(" << hypoExisting->hash() << ")" + // << " discard new " << hypo << "(" << hypo->hash() << ")" + // << endl; + return StackAdd(false, hypoExisting); + } + } + + //assert(false); +} + +const Hypotheses &HypothesisColl::GetSortedAndPrunedHypos( + const ManagerBase &mgr, + ArcLists &arcLists) const +{ + if (m_sortedHypos == NULL) { + // create sortedHypos first + MemPool &pool = mgr.GetPool(); + m_sortedHypos = new (pool.Allocate()) Hypotheses(pool, + m_coll.size()); + + SortHypos(mgr, m_sortedHypos->GetArray()); + + // prune + Recycler &recycler = mgr.GetHypoRecycler(); + + size_t maxStackSize = mgr.system.options.search.stack_size; + if (maxStackSize && m_sortedHypos->size() > maxStackSize) { + for (size_t i = maxStackSize; i < m_sortedHypos->size(); ++i) { + HypothesisBase *hypo = const_cast((*m_sortedHypos)[i]); + recycler.Recycle(hypo); + + // delete from arclist + if (mgr.system.options.nbest.nbest_size) { + arcLists.Delete(hypo); + } + } + m_sortedHypos->resize(maxStackSize); + } + + 
} + + return *m_sortedHypos; +} + +void HypothesisColl::PruneHypos(const ManagerBase &mgr, ArcLists &arcLists) +{ + size_t maxStackSize = mgr.system.options.search.stack_size; + + Recycler &recycler = mgr.GetHypoRecycler(); + + const HypothesisBase **sortedHypos = (const HypothesisBase **) alloca(GetSize() * sizeof(const HypothesisBase *)); + SortHypos(mgr, sortedHypos); + + // update worse score + m_worstScore = sortedHypos[maxStackSize - 1]->GetFutureScore(); + + // prune + for (size_t i = maxStackSize; i < GetSize(); ++i) { + HypothesisBase *hypo = const_cast(sortedHypos[i]); + + // delete from arclist + if (mgr.system.options.nbest.nbest_size) { + arcLists.Delete(hypo); + } + + // delete from collection + Delete(hypo); + + recycler.Recycle(hypo); + } + +} + +void HypothesisColl::SortHypos(const ManagerBase &mgr, const HypothesisBase **sortedHypos) const +{ + size_t maxStackSize = mgr.system.options.search.stack_size; + //assert(maxStackSize); // can't do stack=0 - unlimited stack size. 
No-one ever uses that + //assert(GetSize() > maxStackSize); + //assert(sortedHypos.size() == GetSize()); + + /* + cerr << "UNSORTED hypos: "; + BOOST_FOREACH(const HypothesisBase *hypo, m_coll) { + cerr << hypo << "(" << hypo->GetFutureScore() << ")" << " "; + } + cerr << endl; + */ + size_t ind = 0; + BOOST_FOREACH(const HypothesisBase *hypo, m_coll) { + sortedHypos[ind] = hypo; + ++ind; + } + + size_t indMiddle; + if (maxStackSize == 0) { + indMiddle = GetSize(); + } else if (GetSize() > maxStackSize) { + indMiddle = maxStackSize; + } else { + // GetSize() <= maxStackSize + indMiddle = GetSize(); + } + + const HypothesisBase **iterMiddle = sortedHypos + indMiddle; + + std::partial_sort( + sortedHypos, + iterMiddle, + sortedHypos + GetSize(), + HypothesisFutureScoreOrderer()); + + /* + cerr << "sorted hypos: "; + for (size_t i = 0; i < sortedHypos.size(); ++i) { + const HypothesisBase *hypo = sortedHypos[i]; + cerr << hypo << " "; + } + cerr << endl; + */ +} + +void HypothesisColl::Delete(const HypothesisBase *hypo) +{ + //cerr << " Delete hypo=" << hypo << "(" << hypo->hash() << ")" + // << " m_coll=" << m_coll.size() << endl; + + size_t erased = m_coll.erase(hypo); + UTIL_THROW_IF2(erased != 1, "couldn't erase hypo " << hypo); +} + +void HypothesisColl::Clear() +{ + m_sortedHypos = NULL; + m_coll.clear(); + + m_bestScore = -std::numeric_limits::infinity(); + m_worstScore = std::numeric_limits::infinity(); +} + +std::string HypothesisColl::Debug(const System &system) const +{ + stringstream out; + BOOST_FOREACH (const HypothesisBase *hypo, m_coll) { + out << hypo->Debug(system); + out << std::endl << std::endl; + } + + return out.str(); +} + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/HypothesisColl.h b/mosesdecoder/moses2/HypothesisColl.h new file mode 100644 index 0000000000000000000000000000000000000000..9f1fa4bc755c5d4fac62363b1592c59f2d52b353 --- /dev/null +++ b/mosesdecoder/moses2/HypothesisColl.h @@ -0,0 +1,75 @@ +/* + * HypothesisColl.h + * 
+ * Created on: 26 Feb 2016 + * Author: hieu + */ +#pragma once +#include +#include "HypothesisBase.h" +#include "MemPoolAllocator.h" +#include "Recycler.h" +#include "Array.h" +#include "legacy/Util2.h" + +namespace Moses2 +{ + +class ManagerBase; +class ArcLists; + +typedef Array Hypotheses; + +//////////////////////////////////////////////////// +class HypothesisColl +{ +public: + HypothesisColl(const ManagerBase &mgr); + + void Add(const ManagerBase &mgr, + HypothesisBase *hypo, + Recycler &hypoRecycle, + ArcLists &arcLists); + + size_t GetSize() const { + return m_coll.size(); + } + + void Clear(); + + const Hypotheses &GetSortedAndPrunedHypos( + const ManagerBase &mgr, + ArcLists &arcLists) const; + + const HypothesisBase *GetBestHypo() const; + + template + const T *GetBestHypo() const { + const HypothesisBase *hypo = GetBestHypo(); + return hypo ? &hypo->Cast() : NULL; + } + + void Delete(const HypothesisBase *hypo); + + std::string Debug(const System &system) const; + +protected: + typedef std::unordered_set, UnorderedComparer, + MemPoolAllocator > _HCType; + + _HCType m_coll; + mutable Hypotheses *m_sortedHypos; + + SCORE m_bestScore; + SCORE m_worstScore; + + StackAdd Add(const HypothesisBase *hypo); + + void PruneHypos(const ManagerBase &mgr, ArcLists &arcLists); + void SortHypos(const ManagerBase &mgr, const HypothesisBase **sortedHypos) const; + +}; + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/InputPathsBase.h b/mosesdecoder/moses2/InputPathsBase.h new file mode 100644 index 0000000000000000000000000000000000000000..88e69ea04ef14800a849a12fefb099a6022ed3c4 --- /dev/null +++ b/mosesdecoder/moses2/InputPathsBase.h @@ -0,0 +1,54 @@ +/* + * InputPaths.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include "MemPool.h" + +namespace Moses2 +{ + +class InputType; +class System; +class ManagerBase; +class InputPathBase; + +class InputPathsBase +{ + typedef std::vector Coll; +public: + InputPathsBase() { 
+ } + virtual ~InputPathsBase(); + + //! iterators + typedef Coll::iterator iterator; + typedef Coll::const_iterator const_iterator; + + const_iterator begin() const { + return m_inputPaths.begin(); + } + const_iterator end() const { + return m_inputPaths.end(); + } + + iterator begin() { + return m_inputPaths.begin(); + } + iterator end() { + return m_inputPaths.end(); + } + + virtual void Init(const InputType &input, const ManagerBase &mgr) = 0; + +protected: + Coll m_inputPaths; +}; + +} + diff --git a/mosesdecoder/moses2/Main.cpp b/mosesdecoder/moses2/Main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9b043dd0db0a4d51e12bd13dd2f029dbfdff08ca --- /dev/null +++ b/mosesdecoder/moses2/Main.cpp @@ -0,0 +1,116 @@ +#include +#include +#include +#include "Main.h" +#include "System.h" +#include "Phrase.h" +#include "TranslationTask.h" +#include "MemPoolAllocator.h" +#ifdef HAVE_XMLRPC_C + #include "server/Server.h" +#endif // HAVE_XMLRPC_C + +#include "legacy/InputFileStream.h" +#include "legacy/Parameter.h" +#include "legacy/ThreadPool.h" +#include "legacy/Timer.h" +#include "legacy/Util2.h" +#include "util/usage.hh" + +//#include + +using namespace std; + +//extern size_t g_numHypos; + +int main(int argc, char** argv) +{ + cerr << "Starting..." 
<< endl; + + Moses2::Timer timer; + timer.start(); + //Temp(); + + Moses2::Parameter params; + if (!params.LoadParam(argc, argv)) { + return EXIT_FAILURE; + } + Moses2::System system(params); + timer.check("Loaded"); + + if (params.GetParam("show-weights")) { + return EXIT_SUCCESS; + } + + //cerr << "system.numThreads=" << system.options.server.numThreads << endl; + Moses2::ThreadPool pool(system.options.server.numThreads, system.cpuAffinityOffset, system.cpuAffinityOffsetIncr); + //cerr << "CREATED POOL" << endl; + + if (params.GetParam("server")) { + std::cerr << "RUN SERVER" << std::endl; + run_as_server(system); + } + else { + std::cerr << "RUN BATCH" << std::endl; + batch_run(params, system, pool); + } + + cerr << "Decoding took " << timer.get_elapsed_time() << endl; + // cerr << "g_numHypos=" << g_numHypos << endl; + cerr << "Finished" << endl; + return EXIT_SUCCESS; +} + +//////////////////////////////////////////////////////////////////////////////////////////////// +void run_as_server(Moses2::System& system) +{ +#ifdef HAVE_XMLRPC_C + Moses2::Server server(system.options.server, system); + server.run(system); // actually: don't return. see Server::run() +#else + UTIL_THROW2("Moses2 was compiled without xmlrpc-c. 
" + << "No server functionality available."); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////// +istream &GetInputStream(Moses2::Parameter ¶ms) +{ + const Moses2::PARAM_VEC *vec = params.GetParam("input-file"); + if (vec && vec->size()) { + Moses2::InputFileStream *stream = new Moses2::InputFileStream(vec->at(0)); + return *stream; + } else { + return cin; + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////// + +void batch_run(Moses2::Parameter& params, Moses2::System& system, Moses2::ThreadPool& pool) +{ + istream& inStream = GetInputStream(params); + + long translationId = 0; + string line; + while (getline(inStream, line)) { + //cerr << "line=" << line << endl; + boost::shared_ptr task(new Moses2::TranslationTask(system, line, translationId)); + + //cerr << "START pool.Submit()" << endl; + pool.Submit(task); + //task->Run(); + ++translationId; + } + + pool.Stop(true); + + if (&inStream != &cin) { + delete& inStream; + } + + //util::PrintUsage(std::cerr); + +} + +//////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/mosesdecoder/moses2/Main.h b/mosesdecoder/moses2/Main.h new file mode 100644 index 0000000000000000000000000000000000000000..731d6385bc85fd2b3b9778a1c314bbe75ef603ea --- /dev/null +++ b/mosesdecoder/moses2/Main.h @@ -0,0 +1,23 @@ +/* + * Main.h + * + * Created on: 1 Apr 2016 + * Author: hieu + */ +#pragma once +#include + +namespace Moses2 +{ +class Parameter; +class System; +class ThreadPool; +} + +std::istream &GetInputStream(Moses2::Parameter ¶ms); +void batch_run(Moses2::Parameter ¶ms, Moses2::System &system, Moses2::ThreadPool &pool); +void run_as_server(Moses2::System &system); + +void Temp(); + + diff --git a/mosesdecoder/moses2/ManagerBase.cpp b/mosesdecoder/moses2/ManagerBase.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..ed38075decbdb07377791944891175a55f3f07c3 --- /dev/null +++ b/mosesdecoder/moses2/ManagerBase.cpp @@ -0,0 +1,53 @@ +/* + * Manager.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include +#include +#include "System.h" +#include "ManagerBase.h" +#include "Phrase.h" +#include "InputPathsBase.h" +#include "InputPathBase.h" +#include "TranslationModel/PhraseTable.h" +#include "legacy/Range.h" +#include "PhraseBased/Sentence.h" + +using namespace std; + +namespace Moses2 +{ +ManagerBase::ManagerBase(System &sys, const TranslationTask &task, + const std::string &inputStr, long translationId) + :system(sys) + ,task(task) + ,m_inputStr(inputStr) + ,m_translationId(translationId) + ,m_pool(NULL) + ,m_systemPool(NULL) + ,m_hypoRecycler(NULL) + ,m_input(NULL) +{ +} + +ManagerBase::~ManagerBase() +{ + system.featureFunctions.CleanUpAfterSentenceProcessing(*m_input); + + GetPool().Reset(); + GetHypoRecycler().Clear(); +} + +void ManagerBase::InitPools() +{ + m_pool = &system.GetManagerPool(); + m_systemPool = &system.GetSystemPool(); + m_hypoRecycler = &system.GetHypoRecycler(); + //cerr << "pool size " << m_pool->Size() << " " << m_systemPool->Size() << endl; +} + +} + diff --git a/mosesdecoder/moses2/ManagerBase.h b/mosesdecoder/moses2/ManagerBase.h new file mode 100644 index 0000000000000000000000000000000000000000..b9d5556c7f47f7fad2fd20698d9439b31edbf2d9 --- /dev/null +++ b/mosesdecoder/moses2/ManagerBase.h @@ -0,0 +1,81 @@ +/* + * Manager.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include +#include +#include +#include "Phrase.h" +#include "MemPool.h" +#include "Recycler.h" +#include "EstimatedScores.h" +#include "ArcLists.h" +#include "legacy/Bitmaps.h" + +namespace Moses2 +{ + +class System; +class TranslationTask; +class PhraseImpl; +class SearchNormal; +class Search; +class InputType; +class OutputCollector; +class HypothesisBase; + +class ManagerBase +{ +public: + 
System &system; + const TranslationTask &task; + mutable ArcLists arcLists; + + ManagerBase(System &sys, const TranslationTask &task, + const std::string &inputStr, long translationId); + virtual ~ManagerBase(); + virtual void Decode() = 0; + virtual std::string OutputBest() const = 0; + virtual std::string OutputNBest() = 0; + virtual std::string OutputTransOpt() = 0; + + MemPool &GetPool() const { + return *m_pool; + } + + MemPool &GetSystemPool() const { + return *m_systemPool; + } + + Recycler &GetHypoRecycler() const { + return *m_hypoRecycler; + } + + const InputType &GetInput() const { + return *m_input; + } + + long GetTranslationId() const { + return m_translationId; + } + +protected: + std::string m_inputStr; + long m_translationId; + InputType *m_input; + + mutable MemPool *m_pool, *m_systemPool; + mutable Recycler *m_hypoRecycler; + + void InitPools(); + +}; + +} + diff --git a/mosesdecoder/moses2/MemPool.h b/mosesdecoder/moses2/MemPool.h new file mode 100644 index 0000000000000000000000000000000000000000..d71f7948ddbdab8b4005e801383648faf4fb66f8 --- /dev/null +++ b/mosesdecoder/moses2/MemPool.h @@ -0,0 +1,77 @@ +/* + * MemPool.h + * + * Created on: 28 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace Moses2 +{ + +class MemPool +{ + struct Page { + uint8_t *mem; + uint8_t *end; + size_t size; + + Page() = delete; + Page(std::size_t size); + ~Page(); + }; + +public: + MemPool(std::size_t initSize = 10240); + + ~MemPool(); + + uint8_t* Allocate(std::size_t size); + + template + T *Allocate() { + uint8_t *ret = Allocate(sizeof(T)); + return (T*) ret; + } + + template + T *Allocate(size_t num) { + size_t size = sizeof(T); + size_t m = size % 16; + size += m; + + uint8_t *ret = Allocate(size * num); + return (T*) ret; + } + + // re-use pool + void Reset(); + + size_t Size(); + +private: + uint8_t *More(std::size_t size); + + std::vector m_pages; + + size_t m_currSize; + size_t 
m_currPage; + uint8_t *current_; + + // no copying + MemPool(const MemPool &) = delete; + MemPool &operator=(const MemPool &) = delete; +}; + + +} + diff --git a/mosesdecoder/moses2/MemPoolAllocator.h b/mosesdecoder/moses2/MemPoolAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..994bb77112e0c9cb75b09f4160c3d5a285b5e1b6 --- /dev/null +++ b/mosesdecoder/moses2/MemPoolAllocator.h @@ -0,0 +1,85 @@ +#pragma once +#include "MemPool.h" + +namespace Moses2 +{ + +template +class MemPoolAllocator +{ +public: + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + template + struct rebind { + typedef MemPoolAllocator other; + }; + + MemPoolAllocator(Moses2::MemPool &pool) : + m_pool(pool) { + } + MemPoolAllocator(const MemPoolAllocator &other) : + m_pool(other.m_pool) { + } + + template + MemPoolAllocator(const MemPoolAllocator& other) : + m_pool(other.m_pool) { + } + + size_type max_size() const { + return std::numeric_limits::max(); + } + + void deallocate(pointer p, size_type n) { + //std::cerr << "deallocate " << p << " " << n << std::endl; + } + + pointer allocate(size_type n, std::allocator::const_pointer hint = 0) { + //std::cerr << "allocate " << n << " " << hint << std::endl; + pointer ret = m_pool.Allocate(n); + return ret; + } + + void construct(pointer p, const_reference val) { + //std::cerr << "construct " << p << " " << n << std::endl; + new ((void *) p) T(val); + } + + void destroy(pointer p) { + //std::cerr << "destroy " << p << " " << n << std::endl; + } + + // return address of values + pointer address (reference value) const { + return &value; + } + const_pointer address (const_reference value) const { + return &value; + } + + bool operator==(const MemPoolAllocator &allocator) const { + return true; + } + + bool operator!=(const MemPoolAllocator &allocator) const { + 
return false; + } + + MemPoolAllocator& operator=(const MemPoolAllocator& allocator) { + return *this; + } + + MemPool &m_pool; +protected: +}; + +} + + diff --git a/mosesdecoder/moses2/Moses2Wrapper.h b/mosesdecoder/moses2/Moses2Wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..00bcc07767340ac512877d6646bfef6e4bab8bd7 --- /dev/null +++ b/mosesdecoder/moses2/Moses2Wrapper.h @@ -0,0 +1,30 @@ +#pragma once +#include + +namespace Moses2 { + class Parameter; + class System; + extern "C" { + enum MosesApiErrorCode { + MS_API_OK, + MS_API_E_FAILURE, + MS_API_E_INPUT, + MS_API_E_TIMEOUT + }; + } + class Moses2Wrapper + { + Parameter* m_param; + System* m_system; + + public: + Moses2Wrapper(const std::string& filePath); + ~Moses2Wrapper(); + std::string Translate(const std::string& input, long id, bool nbest); + void UpdateLMPath(const std::string& filePath); + + static char* CopyString(const char* str); + static void Free(void* ptr); + }; + +} \ No newline at end of file diff --git a/mosesdecoder/moses2/Phrase.h b/mosesdecoder/moses2/Phrase.h new file mode 100644 index 0000000000000000000000000000000000000000..5a55648d66a1203208123211c770bc88a1f76a0c --- /dev/null +++ b/mosesdecoder/moses2/Phrase.h @@ -0,0 +1,146 @@ +/* + * PhraseImpl.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once + +#include +#include +#include +#include +#include "Word.h" +#include "MemPool.h" +#include "TypeDef.h" +#include "legacy/FactorCollection.h" +#include "SCFG/Word.h" +#include + +namespace Moses2 +{ + +template +class SubPhrase; + +class Scores; +class PhraseTable; +class MemPool; +class System; + +template +class Phrase +{ +public: + virtual ~Phrase() { + } + virtual const WORD& operator[](size_t pos) const = 0; + virtual size_t GetSize() const = 0; + + virtual const WORD& Back() const { + assert(GetSize()); + return (*this)[GetSize() - 1]; + } + + virtual size_t hash() const { + size_t seed = 0; + + for (size_t i = 0; i < GetSize(); ++i) { 
+ const WORD &word = (*this)[i]; + size_t wordHash = word.hash(); + boost::hash_combine(seed, wordHash); + } + + return seed; + } + + virtual bool operator==(const Phrase &compare) const { + if (GetSize() != compare.GetSize()) { + return false; + } + + for (size_t i = 0; i < GetSize(); ++i) { + const WORD &word = (*this)[i]; + const WORD &otherWord = compare[i]; + if (word != otherWord) { + return false; + } + } + + return true; + } + + virtual bool operator!=(const Phrase &compare) const { + return !((*this) == compare); + } + + virtual std::string GetString(const FactorList &factorTypes) const { + if (GetSize() == 0) { + return ""; + } + + std::stringstream ret; + + const WORD &word = (*this)[0]; + ret << word.GetString(factorTypes); + for (size_t i = 1; i < GetSize(); ++i) { + const WORD &word = (*this)[i]; + ret << " " << word.GetString(factorTypes); + } + return ret.str(); + } + + virtual SubPhrase GetSubPhrase(size_t start, size_t size) const = 0; + + virtual std::string Debug(const System &system) const { + std::stringstream out; + size_t size = GetSize(); + if (size) { + out << (*this)[0].Debug(system); + for (size_t i = 1; i < size; ++i) { + const WORD &word = (*this)[i]; + out << " " << word.Debug(system); + } + } + + return out.str(); + } + + virtual void OutputToStream(const System &system, std::ostream &out) const { + size_t size = GetSize(); + if (size) { + (*this)[0].OutputToStream(system, out); + for (size_t i = 1; i < size; ++i) { + const WORD &word = (*this)[i]; + out << " "; + word.OutputToStream(system, out); + } + } + } + + +}; + +//////////////////////////////////////////////////////////////////////// +template +class PhraseOrdererLexical +{ +public: + bool operator()(const Phrase &a, const Phrase &b) const { + size_t minSize = std::min(a.GetSize(), b.GetSize()); + for (size_t i = 0; i < minSize; ++i) { + const Word &aWord = a[i]; + const Word &bWord = b[i]; + int cmp = aWord.Compare(bWord); + //std::cerr << "WORD: " << aWord << " ||| " << 
bWord << " ||| " << lessThan << std::endl; + if (cmp) { + return (cmp < 0); + } + } + return a.GetSize() < b.GetSize(); + } +}; + +} + diff --git a/mosesdecoder/moses2/Recycler.cpp b/mosesdecoder/moses2/Recycler.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b7a8fb77dccbb1d525bbd60ef3b849c564533eee --- /dev/null +++ b/mosesdecoder/moses2/Recycler.cpp @@ -0,0 +1,13 @@ +/* + * Recycler.cpp + * + * Created on: 2 Jan 2016 + * Author: hieu + */ + +#include "Recycler.h" + +namespace Moses2 +{ + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/Scores.h b/mosesdecoder/moses2/Scores.h new file mode 100644 index 0000000000000000000000000000000000000000..5069fda3608039d549a53c7704e6a61bb174c462 --- /dev/null +++ b/mosesdecoder/moses2/Scores.h @@ -0,0 +1,81 @@ +/* + * Scores.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ + +#pragma once +#include +#include +#include "TypeDef.h" +#include "MemPool.h" + +namespace Moses2 +{ + +class FeatureFunction; +class FeatureFunctions; +class System; + +class Scores +{ +public: + Scores(const System &system, MemPool &pool, size_t numScores); + Scores(const System &system, MemPool &pool, size_t numScores, + const Scores &origScores); + + virtual ~Scores(); + + SCORE GetTotalScore() const { + return m_total; + } + + const SCORE *GetScores(const FeatureFunction &featureFunction) const; + + void Reset(const System &system); + + void CreateFromString(const std::string &str, + const FeatureFunction &featureFunction, const System &system, + bool transformScores); + + void PlusEquals(const System &system, const FeatureFunction &featureFunction, + const SCORE &score); + + void PlusEquals(const System &system, const FeatureFunction &featureFunction, + const SCORE &score, size_t offset); + + void PlusEquals(const System &system, const FeatureFunction &featureFunction, + const std::vector &scores); + + void PlusEquals(const System &system, const FeatureFunction &featureFunction, + SCORE scores[]); + + void 
PlusEquals(const System &system, const Scores &scores); + + void MinusEquals(const System &system, const Scores &scores); + + void Assign(const System &system, const FeatureFunction &featureFunction, + const SCORE &score); + + void Assign(const System &system, const FeatureFunction &featureFunction, + const std::vector &scores); + + std::string Debug(const System &system) const; + + void OutputBreakdown(std::ostream &out, const System &system) const; + + // static functions to work out estimated scores + static SCORE CalcWeightedScore(const System &system, + const FeatureFunction &featureFunction, SCORE scores[]); + + static SCORE CalcWeightedScore(const System &system, + const FeatureFunction &featureFunction, SCORE score); + +protected: + SCORE *m_scores; + SCORE m_total; +}; + +} + diff --git a/mosesdecoder/moses2/SubPhrase.h b/mosesdecoder/moses2/SubPhrase.h new file mode 100644 index 0000000000000000000000000000000000000000..21b003912666f5150f9314660595873b83b6d825 --- /dev/null +++ b/mosesdecoder/moses2/SubPhrase.h @@ -0,0 +1,54 @@ +#pragma once +#include +#include "Phrase.h" +#include "Word.h" +#include "SCFG/Word.h" + +namespace Moses2 +{ +class System; + +template +class SubPhrase: public Phrase +{ +public: + SubPhrase(const Phrase &origPhrase, size_t start, size_t size) + :m_origPhrase(&origPhrase) + ,m_start(start) + ,m_size(size) + {} + + virtual const WORD& operator[](size_t pos) const { + return (*m_origPhrase)[pos + m_start]; + } + + virtual size_t GetSize() const { + return m_size; + } + + SubPhrase GetSubPhrase(size_t start, size_t size) const { + SubPhrase ret(*m_origPhrase, m_start + start, size); + return ret; + } + + virtual std::string Debug(const System &system) const { + std::stringstream out; + if (GetSize()) { + out << (*this)[0].Debug(system); + for (size_t i = 1; i < GetSize(); ++i) { + const WORD &word = (*this)[i]; + out << " " << word.Debug(system); + } + } + + return out.str(); + } + +protected: + const Phrase *m_origPhrase; + size_t 
m_start, m_size; +}; + + +} + diff --git a/mosesdecoder/moses2/TargetPhrase.cpp b/mosesdecoder/moses2/TargetPhrase.cpp new file mode 100644 index 0000000000000000000000000000000000000000..600d41ae75fd19578737d70a01f0b57b9c853b1b --- /dev/null +++ b/mosesdecoder/moses2/TargetPhrase.cpp @@ -0,0 +1,15 @@ +/* + * TargetPhrase.cpp + * + * Created on: 26 Apr 2016 + * Author: hieu + */ + +#include "TargetPhrase.h" +#include "System.h" +#include "Scores.h" + +namespace Moses2 +{ + +} /* namespace Moses2 */ diff --git a/mosesdecoder/moses2/TranslationTask.cpp b/mosesdecoder/moses2/TranslationTask.cpp new file mode 100644 index 0000000000000000000000000000000000000000..07ec0a01131523c2ad716bdd9f8abf4202a4d657 --- /dev/null +++ b/mosesdecoder/moses2/TranslationTask.cpp @@ -0,0 +1,65 @@ +#include "TranslationTask.h" +#include "System.h" +#include "InputType.h" +#include "PhraseBased/Manager.h" +#include "SCFG/Manager.h" + +using namespace std; + +namespace Moses2 +{ + +TranslationTask::TranslationTask(System &system, + const std::string &line, + long translationId) +{ + if (system.isPb) { + m_mgr = new Manager(system, *this, line, translationId); + } else { + m_mgr = new SCFG::Manager(system, *this, line, translationId); + } +} + +TranslationTask::~TranslationTask() +{ +} + +std::string TranslationTask::ReturnTranslation(bool nbest) const +{ + m_mgr->Decode(); + string out; + if (nbest) { + out = m_mgr->OutputNBest() + "\n"; + } + else { + out = m_mgr->OutputBest() + "\n"; + } + delete m_mgr; + return out; +} + +void TranslationTask::Run() +{ + + m_mgr->Decode(); + + string out; + + out = m_mgr->OutputBest() + "\n"; + m_mgr->system.bestCollector->Write(m_mgr->GetTranslationId(), out); + + if (m_mgr->system.options.nbest.nbest_size) { + out = m_mgr->OutputNBest(); + m_mgr->system.nbestCollector->Write(m_mgr->GetTranslationId(), out); + } + + if (!m_mgr->system.options.output.detailed_transrep_filepath.empty()) { + out = m_mgr->OutputTransOpt(); + 
m_mgr->system.detailedTranslationCollector->Write(m_mgr->GetTranslationId(), out); + } + + delete m_mgr; +} + +} + diff --git a/mosesdecoder/moses2/TrellisPaths.h b/mosesdecoder/moses2/TrellisPaths.h new file mode 100644 index 0000000000000000000000000000000000000000..6a6a59c1a64abd681e5c1f11aef7d4a5988b5741 --- /dev/null +++ b/mosesdecoder/moses2/TrellisPaths.h @@ -0,0 +1,64 @@ +/* + * TrellisPaths.h + * + * Created on: 16 Mar 2016 + * Author: hieu + */ +#pragma once + +#include +#include +#include "PhraseBased/TrellisPath.h" + +namespace Moses2 +{ + +template +struct CompareTrellisPath { + bool operator()(const T* pathA, const T* pathB) const { + return (pathA->GetFutureScore() < pathB->GetFutureScore()); + } +}; + +template +class TrellisPaths +{ +public: + TrellisPaths() {} + + virtual ~TrellisPaths() { + while (!empty()) { + T *path = Get(); + delete path; + } + } + + bool empty() const { + return m_coll.empty(); + } + + //! add a new entry into collection + void Add(T *trellisPath) { + m_coll.push(trellisPath); + } + + T *Get() { + T *top = m_coll.top(); + + // Detach + m_coll.pop(); + return top; + } + + size_t GetSize() const { + return m_coll.size(); + } + +protected: + typedef std::priority_queue, + CompareTrellisPath > CollectionType; + CollectionType m_coll; +}; + +} /* namespace Moses2 */ + diff --git a/mosesdecoder/moses2/TypeDef.h b/mosesdecoder/moses2/TypeDef.h new file mode 100644 index 0000000000000000000000000000000000000000..d96257ac29e3cca13167ebc61cfae65735d6fa0f --- /dev/null +++ b/mosesdecoder/moses2/TypeDef.h @@ -0,0 +1,125 @@ +/* + * TypeDef.h + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#pragma once + +#include +#include +#include +#include "Vector.h" + +namespace Moses2 +{ + +class HypothesisBase; + +#define NOT_FOUND std::numeric_limits::max() +const size_t DEFAULT_MAX_PHRASE_LENGTH = 20; +const size_t DEFAULT_MAX_CHART_SPAN = 20; +const size_t DEFAULT_MAX_HYPOSTACK_SIZE = 200; +const size_t DEFAULT_CUBE_PRUNING_POP_LIMIT = 
1000; +const size_t DEFAULT_CUBE_PRUNING_DIVERSITY = 0; +const size_t DEFAULT_MAX_TRANS_OPT_SIZE = 5000; + +const size_t DEFAULT_MAX_PART_TRANS_OPT_SIZE = 10000; +const size_t DEFAULT_MAX_TRANS_OPT_CACHE_SIZE = 10000; +const float LOWEST_SCORE = -100.0f; + +const float DEFAULT_BEAM_WIDTH = 0.00001f; +const float DEFAULT_EARLY_DISCARDING_THRESHOLD = 0.0f; +const float DEFAULT_TRANSLATION_OPTION_THRESHOLD = 0.0f; + +#ifndef BOS_ +#define BOS_ "" //Beginning of sentence symbol +#endif +#ifndef EOS_ +#define EOS_ "" //End of sentence symbol +#endif + +typedef size_t FactorType; +typedef float SCORE; +typedef std::vector FactorList; + +// Note: StaticData uses SearchAlgorithm to determine whether the translation +// model is phrase-based or syntax-based. If you add a syntax-based search +// algorithm here then you should also update StaticData::IsSyntax(). +enum SearchAlgorithm { + Normal = 0, CubePruning = 1, + //,CubeGrowing = 2 + CYKPlus = 3, + NormalBatch = 4, + ChartIncremental = 5, + SyntaxS2T = 6, + SyntaxT2S = 7, + SyntaxT2S_SCFG = 8, + SyntaxF2S = 9, + CubePruningPerMiniStack = 10, + CubePruningPerBitmap = 11, + CubePruningCardinalStack = 12, + CubePruningBitmapStack = 13, + CubePruningMiniStack = 14, + DefaultSearchAlgorithm = 777 // means: use StaticData.m_searchAlgorithm +}; + +enum InputTypeEnum { + SentenceInput = 0, + ConfusionNetworkInput = 1, + WordLatticeInput = 2, + TreeInputType = 3, + //,WordLatticeInput2 = 4, + TabbedSentenceInput = 5, + ForestInputType = 6, + SentenceInputWithCandidates = 7, +}; + +enum XmlInputType { + XmlPassThrough = 0, + XmlIgnore = 1, + XmlExclusive = 2, + XmlInclusive = 3, + XmlConstraint = 4 +}; + +enum WordAlignmentSort { + NoSort = 0, + TargetOrder = 1 +}; + +enum S2TParsingAlgorithm { + RecursiveCYKPlus, + Scope3 +}; + +enum SourceLabelOverlap { + SourceLabelOverlapAdd = 0, + SourceLabelOverlapReplace = 1, + SourceLabelOverlapDiscard = 2 +}; + +///////////////////////// +// MOSES2 only + +class StackAdd +{ +public: + 
bool added; + HypothesisBase *other; + + StackAdd() { + } + StackAdd(bool vadded, HypothesisBase *vOther) : + added(vadded), other(vOther) { + } +}; + +class Hypothesis; +typedef Vector Batch; + +class Factor; +typedef std::vector Context; + +} + diff --git a/mosesdecoder/moses2/Vector.cpp b/mosesdecoder/moses2/Vector.cpp new file mode 100644 index 0000000000000000000000000000000000000000..46af0f7934b7c0b749e40811d65680148a44a3e2 --- /dev/null +++ b/mosesdecoder/moses2/Vector.cpp @@ -0,0 +1,14 @@ +/* + * Vector.cpp + * + * Created on: 7 Dec 2015 + * Author: hieu + */ + +#include "Vector.h" + +namespace Moses2 +{ + +} + diff --git a/mosesdecoder/moses2/Weights.h b/mosesdecoder/moses2/Weights.h new file mode 100644 index 0000000000000000000000000000000000000000..96fdb5a71a20cc360d1d618a02d5abcdbb4810fa --- /dev/null +++ b/mosesdecoder/moses2/Weights.h @@ -0,0 +1,38 @@ +/* + * Weights.h + * + * Created on: 24 Oct 2015 + * Author: hieu + */ +#pragma once + +#include +#include +#include "TypeDef.h" + +namespace Moses2 +{ + +class FeatureFunctions; + +class Weights +{ +public: + Weights(); + virtual ~Weights(); + void Init(const FeatureFunctions &ffs); + + SCORE operator[](size_t ind) const { + return m_weights[ind]; + } + + std::vector GetWeights(const FeatureFunction &ff) const; + + void SetWeights(const FeatureFunctions &ffs, const std::string &ffName, const std::vector &weights); + +protected: + std::vector m_weights; +}; + +} + diff --git a/mosesdecoder/moses2/Word.cpp b/mosesdecoder/moses2/Word.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f272f7cdcc135aa10a42ea11939b94157ddb54ec --- /dev/null +++ b/mosesdecoder/moses2/Word.cpp @@ -0,0 +1,136 @@ +/* + * Word.cpp + * + * Created on: 23 Oct 2015 + * Author: hieu + */ +#include +#include +#include +#include "Word.h" +#include "System.h" +#include "legacy/Util2.h" +#include "util/murmur_hash.hh" + +using namespace std; + +namespace Moses2 +{ + +Word::Word() +{ + Init(m_factors, 
MAX_NUM_FACTORS, NULL); +} + +Word::Word(const Word ©) +{ + memcpy(m_factors, copy.m_factors, sizeof(const Factor *) * MAX_NUM_FACTORS); +} + +Word::~Word() +{ + // TODO Auto-generated destructor stub +} + +void Word::CreateFromString(FactorCollection &vocab, const System &system, + const std::string &str) +{ + vector toks = Tokenize(str, "|"); + for (size_t i = 0; i < toks.size(); ++i) { + const string &tok = toks[i]; + //cerr << "tok=" << tok << endl; + const Factor *factor = vocab.AddFactor(tok, system, false); + m_factors[i] = factor; + } + + // null the rest + for (size_t i = toks.size(); i < MAX_NUM_FACTORS; ++i) { + m_factors[i] = NULL; + } +} + +size_t Word::hash() const +{ + uint64_t seed = 0; + size_t ret = util::MurmurHashNative(m_factors, + sizeof(Factor*) * MAX_NUM_FACTORS, seed); + return ret; +} + +size_t Word::hash(const std::vector &factors) const +{ + size_t seed = 0; + for (size_t i = 0; i < factors.size(); ++i) { + FactorType factorType = factors[i]; + const Factor *factor = m_factors[factorType]; + boost::hash_combine(seed, factor); + } + return seed; +} + + +int Word::Compare(const Word &compare) const +{ + + int cmp = memcmp(m_factors, compare.m_factors, + sizeof(Factor*) * MAX_NUM_FACTORS); + return cmp; + + /* + int ret = m_factors[0]->GetString().compare(compare.m_factors[0]->GetString()); + return ret; + */ +} + +bool Word::operator<(const Word &compare) const +{ + int cmp = Compare(compare); + return (cmp < 0); +} + +std::string Word::Debug(const System &system) const +{ + stringstream out; + bool outputAlready = false; + for (size_t i = 0; i < MAX_NUM_FACTORS; ++i) { + const Factor *factor = m_factors[i]; + if (factor) { + if (outputAlready) { + out << "|"; + } + out << *factor; + outputAlready = true; + } + } + + return out.str(); +} + +void Word::OutputToStream(const System &system, std::ostream &out) const +{ + const std::vector &factorTypes = system.options.output.factor_order; + out << *m_factors[ factorTypes[0] ]; + + for (size_t 
i = 1; i < factorTypes.size(); ++i) { + FactorType factorType = factorTypes[i]; + const Factor *factor = m_factors[factorType]; + + out << "|" << *factor; + } +} + +std::string Word::GetString(const FactorList &factorTypes) const +{ + assert(factorTypes.size()); + std::stringstream ret; + + ret << m_factors[factorTypes[0]]->GetString(); + for (size_t i = 1; i < factorTypes.size(); ++i) { + FactorType factorType = factorTypes[i]; + ret << "|" << m_factors[factorType]; + } + return ret.str(); +} + +} + diff --git a/mosesdecoder/moses2/pugixml.cpp b/mosesdecoder/moses2/pugixml.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a39f25880de738604ffe18dd7a6f781bc70afd6b --- /dev/null +++ b/mosesdecoder/moses2/pugixml.cpp @@ -0,0 +1,11456 @@ +/** + * pugixml parser - version 1.7 + * -------------------------------------------------------- + * Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Report bugs and download new versions at http://pugixml.org/ + * + * This library is distributed under the MIT License. See notice at the end + * of this file. 
+ * + * This work is based on the pugxml parser, which is: + * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) + */ + +#ifndef SOURCE_PUGIXML_CPP +#define SOURCE_PUGIXML_CPP + +#include "pugixml.hpp" + +#include +#include +#include +#include +#include + +#ifdef PUGIXML_WCHAR_MODE +# include +#endif + +#ifndef PUGIXML_NO_XPATH +# include +# include +# ifdef PUGIXML_NO_EXCEPTIONS +# include +# endif +#endif + +#ifndef PUGIXML_NO_STL +# include +# include +# include +#endif + +// For placement new +#include + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable: 4127) // conditional expression is constant +# pragma warning(disable: 4324) // structure was padded due to __declspec(align()) +# pragma warning(disable: 4611) // interaction between '_setjmp' and C++ object destruction is non-portable +# pragma warning(disable: 4702) // unreachable code +# pragma warning(disable: 4996) // this function or variable may be unsafe +# pragma warning(disable: 4793) // function compiled as native: presence of '_setjmp' makes a function unmanaged +#endif + +#ifdef __INTEL_COMPILER +# pragma warning(disable: 177) // function was declared but never referenced +# pragma warning(disable: 279) // controlling expression is constant +# pragma warning(disable: 1478 1786) // function was declared "deprecated" +# pragma warning(disable: 1684) // conversion from pointer to same-sized integral type +#endif + +#if defined(__BORLANDC__) && defined(PUGIXML_HEADER_ONLY) +# pragma warn -8080 // symbol is declared but never used; disabling this inside push/pop bracket does not make the warning go away +#endif + +#ifdef __BORLANDC__ +# pragma option push +# pragma warn -8008 // condition is always false +# pragma warn -8066 // unreachable code +#endif + +#ifdef __SNC__ +// Using diag_push/diag_pop does not disable the warnings inside templates due to a compiler bug +# pragma diag_suppress=178 // function was declared but never referenced +# pragma diag_suppress=237 // 
controlling expression is constant +#endif + +// Inlining controls +#if defined(_MSC_VER) && _MSC_VER >= 1300 +# define PUGI__NO_INLINE __declspec(noinline) +#elif defined(__GNUC__) +# define PUGI__NO_INLINE __attribute__((noinline)) +#else +# define PUGI__NO_INLINE +#endif + +// Branch weight controls +#if defined(__GNUC__) +# define PUGI__UNLIKELY(cond) __builtin_expect(cond, 0) +#else +# define PUGI__UNLIKELY(cond) (cond) +#endif + +// Simple static assertion +#define PUGI__STATIC_ASSERT(cond) { static const char condition_failed[(cond) ? 1 : -1] = {0}; (void)condition_failed[0]; } + +// Digital Mars C++ bug workaround for passing char loaded from memory via stack +#ifdef __DMC__ +# define PUGI__DMC_VOLATILE volatile +#else +# define PUGI__DMC_VOLATILE +#endif + +// Borland C++ bug workaround for not defining ::memcpy depending on header include order (can't always use std::memcpy because some compilers don't have it at all) +#if defined(__BORLANDC__) && !defined(__MEM_H_USING_LIST) +using std::memcpy; +using std::memmove; +using std::memset; +#endif + +// In some environments MSVC is a compiler but the CRT lacks certain MSVC-specific features +#if defined(_MSC_VER) && !defined(__S3E__) +# define PUGI__MSVC_CRT_VERSION _MSC_VER +#endif + +#ifdef PUGIXML_HEADER_ONLY +# define PUGI__NS_BEGIN namespace pugi { namespace impl { +# define PUGI__NS_END } } +# define PUGI__FN inline +# define PUGI__FN_NO_INLINE inline +#else +# if defined(_MSC_VER) && _MSC_VER < 1300 // MSVC6 seems to have an amusing bug with anonymous namespaces inside namespaces +# define PUGI__NS_BEGIN namespace pugi { namespace impl { +# define PUGI__NS_END } } +# else +# define PUGI__NS_BEGIN namespace pugi { namespace impl { namespace { +# define PUGI__NS_END } } } +# endif +# define PUGI__FN +# define PUGI__FN_NO_INLINE PUGI__NO_INLINE +#endif + +// uintptr_t +#if !defined(_MSC_VER) || _MSC_VER >= 1600 +# include +#else +namespace pugi +{ +# ifndef _UINTPTR_T_DEFINED +typedef size_t uintptr_t; +# 
endif + +typedef unsigned __int8 uint8_t; +typedef unsigned __int16 uint16_t; +typedef unsigned __int32 uint32_t; +} +#endif + +// Memory allocation +PUGI__NS_BEGIN +PUGI__FN void* default_allocate(size_t size) +{ + return malloc(size); +} + +PUGI__FN void default_deallocate(void* ptr) +{ + free(ptr); +} + +template +struct xml_memory_management_function_storage { + static allocation_function allocate; + static deallocation_function deallocate; +}; + +// Global allocation functions are stored in class statics so that in header mode linker deduplicates them +// Without a template<> we'll get multiple definitions of the same static +template allocation_function xml_memory_management_function_storage::allocate = default_allocate; +template deallocation_function xml_memory_management_function_storage::deallocate = default_deallocate; + +typedef xml_memory_management_function_storage xml_memory; +PUGI__NS_END + +// String utilities +PUGI__NS_BEGIN +// Get string length +PUGI__FN size_t strlength(const char_t* s) +{ + assert(s); + +#ifdef PUGIXML_WCHAR_MODE + return wcslen(s); +#else + return strlen(s); +#endif +} + +// Compare two strings +PUGI__FN bool strequal(const char_t* src, const char_t* dst) +{ + assert(src && dst); + +#ifdef PUGIXML_WCHAR_MODE + return wcscmp(src, dst) == 0; +#else + return strcmp(src, dst) == 0; +#endif +} + +// Compare lhs with [rhs_begin, rhs_end) +PUGI__FN bool strequalrange(const char_t* lhs, const char_t* rhs, size_t count) +{ + for (size_t i = 0; i < count; ++i) + if (lhs[i] != rhs[i]) + return false; + + return lhs[count] == 0; +} + +// Get length of wide string, even if CRT lacks wide character support +PUGI__FN size_t strlength_wide(const wchar_t* s) +{ + assert(s); + +#ifdef PUGIXML_WCHAR_MODE + return wcslen(s); +#else + const wchar_t* end = s; + while (*end) end++; + return static_cast(end - s); +#endif +} +PUGI__NS_END + +// auto_ptr-like object for exception recovery +PUGI__NS_BEGIN +template struct auto_deleter { + T* data; + D 
deleter; + + auto_deleter(T* data_, D deleter_): data(data_), deleter(deleter_) { + } + + ~auto_deleter() { + if (data) deleter(data); + } + + T* release() { + T* result = data; + data = 0; + return result; + } +}; +PUGI__NS_END + +#ifdef PUGIXML_COMPACT +PUGI__NS_BEGIN +class compact_hash_table +{ +public: + compact_hash_table(): _items(0), _capacity(0), _count(0) { + } + + void clear() { + if (_items) { + xml_memory::deallocate(_items); + _items = 0; + _capacity = 0; + _count = 0; + } + } + + void** find(const void* key) { + assert(key); + + if (_capacity == 0) return 0; + + size_t hashmod = _capacity - 1; + size_t bucket = hash(key) & hashmod; + + for (size_t probe = 0; probe <= hashmod; ++probe) { + item_t& probe_item = _items[bucket]; + + if (probe_item.key == key) + return &probe_item.value; + + if (probe_item.key == 0) + return 0; + + // hash collision, quadratic probing + bucket = (bucket + probe + 1) & hashmod; + } + + assert(!"Hash table is full"); + return 0; + } + + void** insert(const void* key) { + assert(key); + assert(_capacity != 0 && _count < _capacity - _capacity / 4); + + size_t hashmod = _capacity - 1; + size_t bucket = hash(key) & hashmod; + + for (size_t probe = 0; probe <= hashmod; ++probe) { + item_t& probe_item = _items[bucket]; + + if (probe_item.key == 0) { + probe_item.key = key; + _count++; + return &probe_item.value; + } + + if (probe_item.key == key) + return &probe_item.value; + + // hash collision, quadratic probing + bucket = (bucket + probe + 1) & hashmod; + } + + assert(!"Hash table is full"); + return 0; + } + + bool reserve() { + if (_count + 16 >= _capacity - _capacity / 4) + return rehash(); + + return true; + } + +private: + struct item_t { + const void* key; + void* value; + }; + + item_t* _items; + size_t _capacity; + + size_t _count; + + bool rehash(); + + static unsigned int hash(const void* key) { + unsigned int h = static_cast(reinterpret_cast(key)); + + // MurmurHash3 32-bit finalizer + h ^= h >> 16; + h *= 
0x85ebca6bu; + h ^= h >> 13; + h *= 0xc2b2ae35u; + h ^= h >> 16; + + return h; + } +}; + +PUGI__FN_NO_INLINE bool compact_hash_table::rehash() +{ + compact_hash_table rt; + rt._capacity = (_capacity == 0) ? 32 : _capacity * 2; + rt._items = static_cast(xml_memory::allocate(sizeof(item_t) * rt._capacity)); + + if (!rt._items) + return false; + + memset(rt._items, 0, sizeof(item_t) * rt._capacity); + + for (size_t i = 0; i < _capacity; ++i) + if (_items[i].key) + *rt.insert(_items[i].key) = _items[i].value; + + if (_items) + xml_memory::deallocate(_items); + + _capacity = rt._capacity; + _items = rt._items; + + assert(_count == rt._count); + + return true; +} + +PUGI__NS_END +#endif + +PUGI__NS_BEGIN +static const size_t xml_memory_page_size = +#ifdef PUGIXML_MEMORY_PAGE_SIZE + PUGIXML_MEMORY_PAGE_SIZE +#else + 32768 +#endif + ; + +#ifdef PUGIXML_COMPACT +static const uintptr_t xml_memory_block_alignment = 4; + +static const uintptr_t xml_memory_page_alignment = sizeof(void*); +#else +static const uintptr_t xml_memory_block_alignment = sizeof(void*); + +static const uintptr_t xml_memory_page_alignment = 64; +static const uintptr_t xml_memory_page_pointer_mask = ~(xml_memory_page_alignment - 1); +#endif + +// extra metadata bits +static const uintptr_t xml_memory_page_contents_shared_mask = 32; +static const uintptr_t xml_memory_page_name_allocated_mask = 16; +static const uintptr_t xml_memory_page_value_allocated_mask = 8; +static const uintptr_t xml_memory_page_type_mask = 7; + +// combined masks for string uniqueness +static const uintptr_t xml_memory_page_name_allocated_or_shared_mask = xml_memory_page_name_allocated_mask | xml_memory_page_contents_shared_mask; +static const uintptr_t xml_memory_page_value_allocated_or_shared_mask = xml_memory_page_value_allocated_mask | xml_memory_page_contents_shared_mask; + +#ifdef PUGIXML_COMPACT +#define PUGI__GETPAGE_IMPL(header) (header).get_page() +#else +#define PUGI__GETPAGE_IMPL(header) reinterpret_cast((header) & 
impl::xml_memory_page_pointer_mask) +#endif + +#define PUGI__GETPAGE(n) PUGI__GETPAGE_IMPL((n)->header) +#define PUGI__NODETYPE(n) static_cast(((n)->header & impl::xml_memory_page_type_mask) + 1) + +struct xml_allocator; + +struct xml_memory_page { + static xml_memory_page* construct(void* memory) { + xml_memory_page* result = static_cast(memory); + + result->allocator = 0; + result->prev = 0; + result->next = 0; + result->busy_size = 0; + result->freed_size = 0; + +#ifdef PUGIXML_COMPACT + result->compact_string_base = 0; + result->compact_shared_parent = 0; + result->compact_page_marker = 0; +#endif + + return result; + } + + xml_allocator* allocator; + + xml_memory_page* prev; + xml_memory_page* next; + + size_t busy_size; + size_t freed_size; + +#ifdef PUGIXML_COMPACT + char_t* compact_string_base; + void* compact_shared_parent; + uint32_t* compact_page_marker; +#endif +}; + +struct xml_memory_string_header { + uint16_t page_offset; // offset from page->data + uint16_t full_size; // 0 if string occupies whole page +}; + +struct xml_allocator { + xml_allocator(xml_memory_page* root): _root(root), _busy_size(root->busy_size) { +#ifdef PUGIXML_COMPACT + _hash = 0; +#endif + } + + xml_memory_page* allocate_page(size_t data_size) { + size_t size = sizeof(xml_memory_page) + data_size; + + // allocate block with some alignment, leaving memory for worst-case padding + void* memory = xml_memory::allocate(size + xml_memory_page_alignment); + if (!memory) return 0; + + // align to next page boundary (note: this guarantees at least 1 usable byte before the page) + char* page_memory = reinterpret_cast((reinterpret_cast(memory) + xml_memory_page_alignment) & ~(xml_memory_page_alignment - 1)); + + // prepare page structure + xml_memory_page* page = xml_memory_page::construct(page_memory); + assert(page); + + page->allocator = _root->allocator; + + // record the offset for freeing the memory block + assert(page_memory > memory && page_memory - static_cast(memory) <= 127); + 
page_memory[-1] = static_cast(page_memory - static_cast(memory)); + + return page; + } + + static void deallocate_page(xml_memory_page* page) { + char* page_memory = reinterpret_cast(page); + + xml_memory::deallocate(page_memory - page_memory[-1]); + } + + void* allocate_memory_oob(size_t size, xml_memory_page*& out_page); + + void* allocate_memory(size_t size, xml_memory_page*& out_page) { + if (PUGI__UNLIKELY(_busy_size + size > xml_memory_page_size)) + return allocate_memory_oob(size, out_page); + + void* buf = reinterpret_cast(_root) + sizeof(xml_memory_page) + _busy_size; + + _busy_size += size; + + out_page = _root; + + return buf; + } + +#ifdef PUGIXML_COMPACT + void* allocate_object(size_t size, xml_memory_page*& out_page) { + void* result = allocate_memory(size + sizeof(uint32_t), out_page); + if (!result) return 0; + + // adjust for marker + ptrdiff_t offset = static_cast(result) - reinterpret_cast(out_page->compact_page_marker); + + if (PUGI__UNLIKELY(static_cast(offset) >= 256 * xml_memory_block_alignment)) { + // insert new marker + uint32_t* marker = static_cast(result); + + *marker = static_cast(reinterpret_cast(marker) - reinterpret_cast(out_page)); + out_page->compact_page_marker = marker; + + // since we don't reuse the page space until we reallocate it, we can just pretend that we freed the marker block + // this will make sure deallocate_memory correctly tracks the size + out_page->freed_size += sizeof(uint32_t); + + return marker + 1; + } else { + // roll back uint32_t part + _busy_size -= sizeof(uint32_t); + + return result; + } + } +#else + void* allocate_object(size_t size, xml_memory_page*& out_page) { + return allocate_memory(size, out_page); + } +#endif + + void deallocate_memory(void* ptr, size_t size, xml_memory_page* page) { + if (page == _root) page->busy_size = _busy_size; + + assert(ptr >= reinterpret_cast(page) + sizeof(xml_memory_page) && ptr < reinterpret_cast(page) + sizeof(xml_memory_page) + page->busy_size); + (void)!ptr; + + 
page->freed_size += size; + assert(page->freed_size <= page->busy_size); + + if (page->freed_size == page->busy_size) { + if (page->next == 0) { + assert(_root == page); + + // top page freed, just reset sizes + page->busy_size = 0; + page->freed_size = 0; + +#ifdef PUGIXML_COMPACT + // reset compact state to maximize efficiency + page->compact_string_base = 0; + page->compact_shared_parent = 0; + page->compact_page_marker = 0; +#endif + + _busy_size = 0; + } else { + assert(_root != page); + assert(page->prev); + + // remove from the list + page->prev->next = page->next; + page->next->prev = page->prev; + + // deallocate + deallocate_page(page); + } + } + } + + char_t* allocate_string(size_t length) { + static const size_t max_encoded_offset = (1 << 16) * xml_memory_block_alignment; + + PUGI__STATIC_ASSERT(xml_memory_page_size <= max_encoded_offset); + + // allocate memory for string and header block + size_t size = sizeof(xml_memory_string_header) + length * sizeof(char_t); + + // round size up to block alignment boundary + size_t full_size = (size + (xml_memory_block_alignment - 1)) & ~(xml_memory_block_alignment - 1); + + xml_memory_page* page; + xml_memory_string_header* header = static_cast(allocate_memory(full_size, page)); + + if (!header) return 0; + + // setup header + ptrdiff_t page_offset = reinterpret_cast(header) - reinterpret_cast(page) - sizeof(xml_memory_page); + + assert(page_offset % xml_memory_block_alignment == 0); + assert(page_offset >= 0 && static_cast(page_offset) < max_encoded_offset); + header->page_offset = static_cast(static_cast(page_offset) / xml_memory_block_alignment); + + // full_size == 0 for large strings that occupy the whole page + assert(full_size % xml_memory_block_alignment == 0); + assert(full_size < max_encoded_offset || (page->busy_size == full_size && page_offset == 0)); + header->full_size = static_cast(full_size < max_encoded_offset ? 
full_size / xml_memory_block_alignment : 0); + + // round-trip through void* to avoid 'cast increases required alignment of target type' warning + // header is guaranteed a pointer-sized alignment, which should be enough for char_t + return static_cast(static_cast(header + 1)); + } + + void deallocate_string(char_t* string) { + // this function casts pointers through void* to avoid 'cast increases required alignment of target type' warnings + // we're guaranteed the proper (pointer-sized) alignment on the input string if it was allocated via allocate_string + + // get header + xml_memory_string_header* header = static_cast(static_cast(string)) - 1; + assert(header); + + // deallocate + size_t page_offset = sizeof(xml_memory_page) + header->page_offset * xml_memory_block_alignment; + xml_memory_page* page = reinterpret_cast(static_cast(reinterpret_cast(header) - page_offset)); + + // if full_size == 0 then this string occupies the whole page + size_t full_size = header->full_size == 0 ? page->busy_size : header->full_size * xml_memory_block_alignment; + + deallocate_memory(header, full_size, page); + } + + bool reserve() { +#ifdef PUGIXML_COMPACT + return _hash->reserve(); +#else + return true; +#endif + } + + xml_memory_page* _root; + size_t _busy_size; + +#ifdef PUGIXML_COMPACT + compact_hash_table* _hash; +#endif +}; + +PUGI__FN_NO_INLINE void* xml_allocator::allocate_memory_oob(size_t size, xml_memory_page*& out_page) +{ + const size_t large_allocation_threshold = xml_memory_page_size / 4; + + xml_memory_page* page = allocate_page(size <= large_allocation_threshold ? 
xml_memory_page_size : size); + out_page = page; + + if (!page) return 0; + + if (size <= large_allocation_threshold) { + _root->busy_size = _busy_size; + + // insert page at the end of linked list + page->prev = _root; + _root->next = page; + _root = page; + + _busy_size = size; + } else { + // insert page before the end of linked list, so that it is deleted as soon as possible + // the last page is not deleted even if it's empty (see deallocate_memory) + assert(_root->prev); + + page->prev = _root->prev; + page->next = _root; + + _root->prev->next = page; + _root->prev = page; + + page->busy_size = size; + } + + return reinterpret_cast(page) + sizeof(xml_memory_page); +} +PUGI__NS_END + +#ifdef PUGIXML_COMPACT +PUGI__NS_BEGIN +static const uintptr_t compact_alignment_log2 = 2; +static const uintptr_t compact_alignment = 1 << compact_alignment_log2; + +class compact_header +{ +public: + compact_header(xml_memory_page* page, unsigned int flags) { + PUGI__STATIC_ASSERT(xml_memory_block_alignment == compact_alignment); + + ptrdiff_t offset = (reinterpret_cast(this) - reinterpret_cast(page->compact_page_marker)); + assert(offset % compact_alignment == 0 && static_cast(offset) < 256 * compact_alignment); + + _page = static_cast(offset >> compact_alignment_log2); + _flags = static_cast(flags); + } + + void operator&=(uintptr_t mod) { + _flags &= static_cast(mod); + } + + void operator|=(uintptr_t mod) { + _flags |= static_cast(mod); + } + + uintptr_t operator&(uintptr_t mod) const { + return _flags & mod; + } + + xml_memory_page* get_page() const { + // round-trip through void* to silence 'cast increases required alignment of target type' warnings + const char* page_marker = reinterpret_cast(this) - (_page << compact_alignment_log2); + const char* page = page_marker - *reinterpret_cast(static_cast(page_marker)); + + return const_cast(reinterpret_cast(static_cast(page))); + } + +private: + unsigned char _page; + unsigned char _flags; +}; + +PUGI__FN xml_memory_page* 
compact_get_page(const void* object, int header_offset) +{ + const compact_header* header = reinterpret_cast(static_cast(object) - header_offset); + + return header->get_page(); +} + +template PUGI__FN_NO_INLINE T* compact_get_value(const void* object) +{ + return static_cast(*compact_get_page(object, header_offset)->allocator->_hash->find(object)); +} + +template PUGI__FN_NO_INLINE void compact_set_value(const void* object, T* value) +{ + *compact_get_page(object, header_offset)->allocator->_hash->insert(object) = value; +} + +template class compact_pointer +{ +public: + compact_pointer(): _data(0) { + } + + void operator=(const compact_pointer& rhs) { + *this = rhs + 0; + } + + void operator=(T* value) { + if (value) { + // value is guaranteed to be compact-aligned; 'this' is not + // our decoding is based on 'this' aligned to compact alignment downwards (see operator T*) + // so for negative offsets (e.g. -3) we need to adjust the diff by compact_alignment - 1 to + // compensate for arithmetic shift rounding for negative values + ptrdiff_t diff = reinterpret_cast(value) - reinterpret_cast(this); + ptrdiff_t offset = ((diff + int(compact_alignment - 1)) >> compact_alignment_log2) - start; + + if (static_cast(offset) <= 253) + _data = static_cast(offset + 1); + else { + compact_set_value(this, value); + + _data = 255; + } + } else + _data = 0; + } + + operator T*() const { + if (_data) { + if (_data < 255) { + uintptr_t base = reinterpret_cast(this) & ~(compact_alignment - 1); + + return reinterpret_cast(base + ((_data - 1 + start) << compact_alignment_log2)); + } else + return compact_get_value(this); + } else + return 0; + } + + T* operator->() const { + return *this; + } + +private: + unsigned char _data; +}; + +template class compact_pointer_parent +{ +public: + compact_pointer_parent(): _data(0) { + } + + void operator=(const compact_pointer_parent& rhs) { + *this = rhs + 0; + } + + void operator=(T* value) { + if (value) { + // value is guaranteed to be 
compact-aligned; 'this' is not + // our decoding is based on 'this' aligned to compact alignment downwards (see operator T*) + // so for negative offsets (e.g. -3) we need to adjust the diff by compact_alignment - 1 to + // compensate for arithmetic shift behavior for negative values + ptrdiff_t diff = reinterpret_cast(value) - reinterpret_cast(this); + ptrdiff_t offset = ((diff + int(compact_alignment - 1)) >> compact_alignment_log2) + 65533; + + if (static_cast(offset) <= 65533) { + _data = static_cast(offset + 1); + } else { + xml_memory_page* page = compact_get_page(this, header_offset); + + if (PUGI__UNLIKELY(page->compact_shared_parent == 0)) + page->compact_shared_parent = value; + + if (page->compact_shared_parent == value) { + _data = 65534; + } else { + compact_set_value(this, value); + + _data = 65535; + } + } + } else { + _data = 0; + } + } + + operator T*() const { + if (_data) { + if (_data < 65534) { + uintptr_t base = reinterpret_cast(this) & ~(compact_alignment - 1); + + return reinterpret_cast(base + ((_data - 1 - 65533) << compact_alignment_log2)); + } else if (_data == 65534) + return static_cast(compact_get_page(this, header_offset)->compact_shared_parent); + else + return compact_get_value(this); + } else + return 0; + } + + T* operator->() const { + return *this; + } + +private: + uint16_t _data; +}; + +template class compact_string +{ +public: + compact_string(): _data(0) { + } + + void operator=(const compact_string& rhs) { + *this = rhs + 0; + } + + void operator=(char_t* value) { + if (value) { + xml_memory_page* page = compact_get_page(this, header_offset); + + if (PUGI__UNLIKELY(page->compact_string_base == 0)) + page->compact_string_base = value; + + ptrdiff_t offset = value - page->compact_string_base; + + if (static_cast(offset) < (65535 << 7)) { + // round-trip through void* to silence 'cast increases required alignment of target type' warnings + uint16_t* base = reinterpret_cast(static_cast(reinterpret_cast(this) - base_offset)); + 
+ if (*base == 0) { + *base = static_cast((offset >> 7) + 1); + _data = static_cast((offset & 127) + 1); + } else { + ptrdiff_t remainder = offset - ((*base - 1) << 7); + + if (static_cast(remainder) <= 253) { + _data = static_cast(remainder + 1); + } else { + compact_set_value(this, value); + + _data = 255; + } + } + } else { + compact_set_value(this, value); + + _data = 255; + } + } else { + _data = 0; + } + } + + operator char_t*() const { + if (_data) { + if (_data < 255) { + xml_memory_page* page = compact_get_page(this, header_offset); + + // round-trip through void* to silence 'cast increases required alignment of target type' warnings + const uint16_t* base = reinterpret_cast(static_cast(reinterpret_cast(this) - base_offset)); + assert(*base); + + ptrdiff_t offset = ((*base - 1) << 7) + (_data - 1); + + return page->compact_string_base + offset; + } else { + return compact_get_value(this); + } + } else + return 0; + } + +private: + unsigned char _data; +}; +PUGI__NS_END +#endif + +#ifdef PUGIXML_COMPACT +namespace pugi +{ +struct xml_attribute_struct { + xml_attribute_struct(impl::xml_memory_page* page): header(page, 0), namevalue_base(0) { + PUGI__STATIC_ASSERT(sizeof(xml_attribute_struct) == 8); + } + + impl::compact_header header; + + uint16_t namevalue_base; + + impl::compact_string<4, 2> name; + impl::compact_string<5, 3> value; + + impl::compact_pointer prev_attribute_c; + impl::compact_pointer next_attribute; +}; + +struct xml_node_struct { + xml_node_struct(impl::xml_memory_page* page, xml_node_type type): header(page, type - 1), namevalue_base(0) { + PUGI__STATIC_ASSERT(sizeof(xml_node_struct) == 12); + } + + impl::compact_header header; + + uint16_t namevalue_base; + + impl::compact_string<4, 2> name; + impl::compact_string<5, 3> value; + + impl::compact_pointer_parent parent; + + impl::compact_pointer first_child; + + impl::compact_pointer prev_sibling_c; + impl::compact_pointer next_sibling; + + impl::compact_pointer first_attribute; +}; +} 
+#else +namespace pugi +{ +struct xml_attribute_struct { + xml_attribute_struct(impl::xml_memory_page* page): header(reinterpret_cast(page)), name(0), value(0), prev_attribute_c(0), next_attribute(0) { + } + + uintptr_t header; + + char_t* name; + char_t* value; + + xml_attribute_struct* prev_attribute_c; + xml_attribute_struct* next_attribute; +}; + +struct xml_node_struct { + xml_node_struct(impl::xml_memory_page* page, xml_node_type type): header(reinterpret_cast(page) | (type - 1)), name(0), value(0), parent(0), first_child(0), prev_sibling_c(0), next_sibling(0), first_attribute(0) { + } + + uintptr_t header; + + char_t* name; + char_t* value; + + xml_node_struct* parent; + + xml_node_struct* first_child; + + xml_node_struct* prev_sibling_c; + xml_node_struct* next_sibling; + + xml_attribute_struct* first_attribute; +}; +} +#endif + +PUGI__NS_BEGIN +struct xml_extra_buffer { + char_t* buffer; + xml_extra_buffer* next; +}; + +struct xml_document_struct: public xml_node_struct, public xml_allocator { + xml_document_struct(xml_memory_page* page): xml_node_struct(page, node_document), xml_allocator(page), buffer(0), extra_buffers(0) { +#ifdef PUGIXML_COMPACT + _hash = &hash; +#endif + } + + const char_t* buffer; + + xml_extra_buffer* extra_buffers; + +#ifdef PUGIXML_COMPACT + compact_hash_table hash; +#endif +}; + +template inline xml_allocator& get_allocator(const Object* object) +{ + assert(object); + + return *PUGI__GETPAGE(object)->allocator; +} + +template inline xml_document_struct& get_document(const Object* object) +{ + assert(object); + + return *static_cast(PUGI__GETPAGE(object)->allocator); +} +PUGI__NS_END + +// Low-level DOM operations +PUGI__NS_BEGIN +inline xml_attribute_struct* allocate_attribute(xml_allocator& alloc) +{ + xml_memory_page* page; + void* memory = alloc.allocate_object(sizeof(xml_attribute_struct), page); + if (!memory) return 0; + + return new (memory) xml_attribute_struct(page); +} + +inline xml_node_struct* 
allocate_node(xml_allocator& alloc, xml_node_type type) +{ + xml_memory_page* page; + void* memory = alloc.allocate_object(sizeof(xml_node_struct), page); + if (!memory) return 0; + + return new (memory) xml_node_struct(page, type); +} + +inline void destroy_attribute(xml_attribute_struct* a, xml_allocator& alloc) +{ + if (a->header & impl::xml_memory_page_name_allocated_mask) + alloc.deallocate_string(a->name); + + if (a->header & impl::xml_memory_page_value_allocated_mask) + alloc.deallocate_string(a->value); + + alloc.deallocate_memory(a, sizeof(xml_attribute_struct), PUGI__GETPAGE(a)); +} + +inline void destroy_node(xml_node_struct* n, xml_allocator& alloc) +{ + if (n->header & impl::xml_memory_page_name_allocated_mask) + alloc.deallocate_string(n->name); + + if (n->header & impl::xml_memory_page_value_allocated_mask) + alloc.deallocate_string(n->value); + + for (xml_attribute_struct* attr = n->first_attribute; attr; ) { + xml_attribute_struct* next = attr->next_attribute; + + destroy_attribute(attr, alloc); + + attr = next; + } + + for (xml_node_struct* child = n->first_child; child; ) { + xml_node_struct* next = child->next_sibling; + + destroy_node(child, alloc); + + child = next; + } + + alloc.deallocate_memory(n, sizeof(xml_node_struct), PUGI__GETPAGE(n)); +} + +inline void append_node(xml_node_struct* child, xml_node_struct* node) +{ + child->parent = node; + + xml_node_struct* head = node->first_child; + + if (head) { + xml_node_struct* tail = head->prev_sibling_c; + + tail->next_sibling = child; + child->prev_sibling_c = tail; + head->prev_sibling_c = child; + } else { + node->first_child = child; + child->prev_sibling_c = child; + } +} + +inline void prepend_node(xml_node_struct* child, xml_node_struct* node) +{ + child->parent = node; + + xml_node_struct* head = node->first_child; + + if (head) { + child->prev_sibling_c = head->prev_sibling_c; + head->prev_sibling_c = child; + } else + child->prev_sibling_c = child; + + child->next_sibling = head; + 
node->first_child = child; +} + +inline void insert_node_after(xml_node_struct* child, xml_node_struct* node) +{ + xml_node_struct* parent = node->parent; + + child->parent = parent; + + if (node->next_sibling) + node->next_sibling->prev_sibling_c = child; + else + parent->first_child->prev_sibling_c = child; + + child->next_sibling = node->next_sibling; + child->prev_sibling_c = node; + + node->next_sibling = child; +} + +inline void insert_node_before(xml_node_struct* child, xml_node_struct* node) +{ + xml_node_struct* parent = node->parent; + + child->parent = parent; + + if (node->prev_sibling_c->next_sibling) + node->prev_sibling_c->next_sibling = child; + else + parent->first_child = child; + + child->prev_sibling_c = node->prev_sibling_c; + child->next_sibling = node; + + node->prev_sibling_c = child; +} + +inline void remove_node(xml_node_struct* node) +{ + xml_node_struct* parent = node->parent; + + if (node->next_sibling) + node->next_sibling->prev_sibling_c = node->prev_sibling_c; + else + parent->first_child->prev_sibling_c = node->prev_sibling_c; + + if (node->prev_sibling_c->next_sibling) + node->prev_sibling_c->next_sibling = node->next_sibling; + else + parent->first_child = node->next_sibling; + + node->parent = 0; + node->prev_sibling_c = 0; + node->next_sibling = 0; +} + +inline void append_attribute(xml_attribute_struct* attr, xml_node_struct* node) +{ + xml_attribute_struct* head = node->first_attribute; + + if (head) { + xml_attribute_struct* tail = head->prev_attribute_c; + + tail->next_attribute = attr; + attr->prev_attribute_c = tail; + head->prev_attribute_c = attr; + } else { + node->first_attribute = attr; + attr->prev_attribute_c = attr; + } +} + +inline void prepend_attribute(xml_attribute_struct* attr, xml_node_struct* node) +{ + xml_attribute_struct* head = node->first_attribute; + + if (head) { + attr->prev_attribute_c = head->prev_attribute_c; + head->prev_attribute_c = attr; + } else + attr->prev_attribute_c = attr; + + 
attr->next_attribute = head; + node->first_attribute = attr; +} + +inline void insert_attribute_after(xml_attribute_struct* attr, xml_attribute_struct* place, xml_node_struct* node) +{ + if (place->next_attribute) + place->next_attribute->prev_attribute_c = attr; + else + node->first_attribute->prev_attribute_c = attr; + + attr->next_attribute = place->next_attribute; + attr->prev_attribute_c = place; + place->next_attribute = attr; +} + +inline void insert_attribute_before(xml_attribute_struct* attr, xml_attribute_struct* place, xml_node_struct* node) +{ + if (place->prev_attribute_c->next_attribute) + place->prev_attribute_c->next_attribute = attr; + else + node->first_attribute = attr; + + attr->prev_attribute_c = place->prev_attribute_c; + attr->next_attribute = place; + place->prev_attribute_c = attr; +} + +inline void remove_attribute(xml_attribute_struct* attr, xml_node_struct* node) +{ + if (attr->next_attribute) + attr->next_attribute->prev_attribute_c = attr->prev_attribute_c; + else + node->first_attribute->prev_attribute_c = attr->prev_attribute_c; + + if (attr->prev_attribute_c->next_attribute) + attr->prev_attribute_c->next_attribute = attr->next_attribute; + else + node->first_attribute = attr->next_attribute; + + attr->prev_attribute_c = 0; + attr->next_attribute = 0; +} + +PUGI__FN_NO_INLINE xml_node_struct* append_new_node(xml_node_struct* node, xml_allocator& alloc, xml_node_type type = node_element) +{ + if (!alloc.reserve()) return 0; + + xml_node_struct* child = allocate_node(alloc, type); + if (!child) return 0; + + append_node(child, node); + + return child; +} + +PUGI__FN_NO_INLINE xml_attribute_struct* append_new_attribute(xml_node_struct* node, xml_allocator& alloc) +{ + if (!alloc.reserve()) return 0; + + xml_attribute_struct* attr = allocate_attribute(alloc); + if (!attr) return 0; + + append_attribute(attr, node); + + return attr; +} +PUGI__NS_END + +// Helper classes for code generation +PUGI__NS_BEGIN +struct opt_false { + enum { 
value = 0 }; +}; + +struct opt_true { + enum { value = 1 }; +}; +PUGI__NS_END + +// Unicode utilities +PUGI__NS_BEGIN +inline uint16_t endian_swap(uint16_t value) +{ + return static_cast(((value & 0xff) << 8) | (value >> 8)); +} + +inline uint32_t endian_swap(uint32_t value) +{ + return ((value & 0xff) << 24) | ((value & 0xff00) << 8) | ((value & 0xff0000) >> 8) | (value >> 24); +} + +struct utf8_counter { + typedef size_t value_type; + + static value_type low(value_type result, uint32_t ch) { + // U+0000..U+007F + if (ch < 0x80) return result + 1; + // U+0080..U+07FF + else if (ch < 0x800) return result + 2; + // U+0800..U+FFFF + else return result + 3; + } + + static value_type high(value_type result, uint32_t) { + // U+10000..U+10FFFF + return result + 4; + } +}; + +struct utf8_writer { + typedef uint8_t* value_type; + + static value_type low(value_type result, uint32_t ch) { + // U+0000..U+007F + if (ch < 0x80) { + *result = static_cast(ch); + return result + 1; + } + // U+0080..U+07FF + else if (ch < 0x800) { + result[0] = static_cast(0xC0 | (ch >> 6)); + result[1] = static_cast(0x80 | (ch & 0x3F)); + return result + 2; + } + // U+0800..U+FFFF + else { + result[0] = static_cast(0xE0 | (ch >> 12)); + result[1] = static_cast(0x80 | ((ch >> 6) & 0x3F)); + result[2] = static_cast(0x80 | (ch & 0x3F)); + return result + 3; + } + } + + static value_type high(value_type result, uint32_t ch) { + // U+10000..U+10FFFF + result[0] = static_cast(0xF0 | (ch >> 18)); + result[1] = static_cast(0x80 | ((ch >> 12) & 0x3F)); + result[2] = static_cast(0x80 | ((ch >> 6) & 0x3F)); + result[3] = static_cast(0x80 | (ch & 0x3F)); + return result + 4; + } + + static value_type any(value_type result, uint32_t ch) { + return (ch < 0x10000) ? 
low(result, ch) : high(result, ch); + } +}; + +struct utf16_counter { + typedef size_t value_type; + + static value_type low(value_type result, uint32_t) { + return result + 1; + } + + static value_type high(value_type result, uint32_t) { + return result + 2; + } +}; + +struct utf16_writer { + typedef uint16_t* value_type; + + static value_type low(value_type result, uint32_t ch) { + *result = static_cast(ch); + + return result + 1; + } + + static value_type high(value_type result, uint32_t ch) { + uint32_t msh = static_cast(ch - 0x10000) >> 10; + uint32_t lsh = static_cast(ch - 0x10000) & 0x3ff; + + result[0] = static_cast(0xD800 + msh); + result[1] = static_cast(0xDC00 + lsh); + + return result + 2; + } + + static value_type any(value_type result, uint32_t ch) { + return (ch < 0x10000) ? low(result, ch) : high(result, ch); + } +}; + +struct utf32_counter { + typedef size_t value_type; + + static value_type low(value_type result, uint32_t) { + return result + 1; + } + + static value_type high(value_type result, uint32_t) { + return result + 1; + } +}; + +struct utf32_writer { + typedef uint32_t* value_type; + + static value_type low(value_type result, uint32_t ch) { + *result = ch; + + return result + 1; + } + + static value_type high(value_type result, uint32_t ch) { + *result = ch; + + return result + 1; + } + + static value_type any(value_type result, uint32_t ch) { + *result = ch; + + return result + 1; + } +}; + +struct latin1_writer { + typedef uint8_t* value_type; + + static value_type low(value_type result, uint32_t ch) { + *result = static_cast(ch > 255 ? '?' 
: ch); + + return result + 1; + } + + static value_type high(value_type result, uint32_t ch) { + (void)ch; + + *result = '?'; + + return result + 1; + } +}; + +struct utf8_decoder { + typedef uint8_t type; + + template static inline typename Traits::value_type process(const uint8_t* data, size_t size, typename Traits::value_type result, Traits) { + const uint8_t utf8_byte_mask = 0x3f; + + while (size) { + uint8_t lead = *data; + + // 0xxxxxxx -> U+0000..U+007F + if (lead < 0x80) { + result = Traits::low(result, lead); + data += 1; + size -= 1; + + // process aligned single-byte (ascii) blocks + if ((reinterpret_cast(data) & 3) == 0) { + // round-trip through void* to silence 'cast increases required alignment of target type' warnings + while (size >= 4 && (*static_cast(static_cast(data)) & 0x80808080) == 0) { + result = Traits::low(result, data[0]); + result = Traits::low(result, data[1]); + result = Traits::low(result, data[2]); + result = Traits::low(result, data[3]); + data += 4; + size -= 4; + } + } + } + // 110xxxxx -> U+0080..U+07FF + else if (static_cast(lead - 0xC0) < 0x20 && size >= 2 && (data[1] & 0xc0) == 0x80) { + result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask)); + data += 2; + size -= 2; + } + // 1110xxxx -> U+0800-U+FFFF + else if (static_cast(lead - 0xE0) < 0x10 && size >= 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80) { + result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask)); + data += 3; + size -= 3; + } + // 11110xxx -> U+10000..U+10FFFF + else if (static_cast(lead - 0xF0) < 0x08 && size >= 4 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80) { + result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask)); + data += 4; + size -= 4; + } + // 10xxxxxx or 11111xxx -> invalid + else { + data += 1; + size -= 1; + } + } 
+ + return result; + } +}; + +template struct utf16_decoder { + typedef uint16_t type; + + template static inline typename Traits::value_type process(const uint16_t* data, size_t size, typename Traits::value_type result, Traits) { + while (size) { + uint16_t lead = opt_swap::value ? endian_swap(*data) : *data; + + // U+0000..U+D7FF + if (lead < 0xD800) { + result = Traits::low(result, lead); + data += 1; + size -= 1; + } + // U+E000..U+FFFF + else if (static_cast(lead - 0xE000) < 0x2000) { + result = Traits::low(result, lead); + data += 1; + size -= 1; + } + // surrogate pair lead + else if (static_cast(lead - 0xD800) < 0x400 && size >= 2) { + uint16_t next = opt_swap::value ? endian_swap(data[1]) : data[1]; + + if (static_cast(next - 0xDC00) < 0x400) { + result = Traits::high(result, 0x10000 + ((lead & 0x3ff) << 10) + (next & 0x3ff)); + data += 2; + size -= 2; + } else { + data += 1; + size -= 1; + } + } else { + data += 1; + size -= 1; + } + } + + return result; + } +}; + +template struct utf32_decoder { + typedef uint32_t type; + + template static inline typename Traits::value_type process(const uint32_t* data, size_t size, typename Traits::value_type result, Traits) { + while (size) { + uint32_t lead = opt_swap::value ? 
endian_swap(*data) : *data; + + // U+0000..U+FFFF + if (lead < 0x10000) { + result = Traits::low(result, lead); + data += 1; + size -= 1; + } + // U+10000..U+10FFFF + else { + result = Traits::high(result, lead); + data += 1; + size -= 1; + } + } + + return result; + } +}; + +struct latin1_decoder { + typedef uint8_t type; + + template static inline typename Traits::value_type process(const uint8_t* data, size_t size, typename Traits::value_type result, Traits) { + while (size) { + result = Traits::low(result, *data); + data += 1; + size -= 1; + } + + return result; + } +}; + +template struct wchar_selector; + +template <> struct wchar_selector<2> { + typedef uint16_t type; + typedef utf16_counter counter; + typedef utf16_writer writer; + typedef utf16_decoder decoder; +}; + +template <> struct wchar_selector<4> { + typedef uint32_t type; + typedef utf32_counter counter; + typedef utf32_writer writer; + typedef utf32_decoder decoder; +}; + +typedef wchar_selector::counter wchar_counter; +typedef wchar_selector::writer wchar_writer; + +struct wchar_decoder { + typedef wchar_t type; + + template static inline typename Traits::value_type process(const wchar_t* data, size_t size, typename Traits::value_type result, Traits traits) { + typedef wchar_selector::decoder decoder; + + return decoder::process(reinterpret_cast(data), size, result, traits); + } +}; + +#ifdef PUGIXML_WCHAR_MODE +PUGI__FN void convert_wchar_endian_swap(wchar_t* result, const wchar_t* data, size_t length) +{ + for (size_t i = 0; i < length; ++i) + result[i] = static_cast(endian_swap(static_cast::type>(data[i]))); +} +#endif +PUGI__NS_END + +PUGI__NS_BEGIN +enum chartype_t { + ct_parse_pcdata = 1, // \0, &, \r, < + ct_parse_attr = 2, // \0, &, \r, ', " + ct_parse_attr_ws = 4, // \0, &, \r, ', ", \n, tab + ct_space = 8, // \r, \n, space, tab + ct_parse_cdata = 16, // \0, ], >, \r + ct_parse_comment = 32, // \0, -, >, \r + ct_symbol = 64, // Any symbol > 127, a-z, A-Z, 0-9, _, :, -, . 
+ ct_start_symbol = 128 // Any symbol > 127, a-z, A-Z, _, : +}; + +static const unsigned char chartype_table[256] = { + 55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31 + 8, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 96, 64, 0, // 32-47 + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 192, 0, 1, 0, 48, 0, // 48-63 + 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 64-79 + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 16, 0, 192, // 80-95 + 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 96-111 + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 0, 0, 0, // 112-127 + + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 128+ + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192 +}; + +enum chartypex_t { + ctx_special_pcdata = 1, // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, > + ctx_special_attr = 2, // Any symbol >= 0 and < 32 (except \t), &, <, >, " + ctx_start_symbol = 4, // Any symbol > 127, a-z, A-Z, _ + ctx_digit = 8, // 0-9 + ctx_symbol = 16 // Any symbol > 127, a-z, A-Z, 0-9, _, -, . 
+}; + +static const unsigned char chartypex_table[256] = { + 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 2, 3, 3, // 0-15 + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 16-31 + 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 16, 16, 0, // 32-47 + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 3, 0, 3, 0, // 48-63 + + 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 64-79 + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 20, // 80-95 + 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 96-111 + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, // 112-127 + + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 128+ + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 +}; + +#ifdef PUGIXML_WCHAR_MODE +#define PUGI__IS_CHARTYPE_IMPL(c, ct, table) ((static_cast(c) < 128 ? table[static_cast(c)] : table[128]) & (ct)) +#else +#define PUGI__IS_CHARTYPE_IMPL(c, ct, table) (table[static_cast(c)] & (ct)) +#endif + +#define PUGI__IS_CHARTYPE(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartype_table) +#define PUGI__IS_CHARTYPEX(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartypex_table) + +PUGI__FN bool is_little_endian() +{ + unsigned int ui = 1; + + return *reinterpret_cast(&ui) == 1; +} + +PUGI__FN xml_encoding get_wchar_encoding() +{ + PUGI__STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4); + + if (sizeof(wchar_t) == 2) + return is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + else + return is_little_endian() ? 
encoding_utf32_le : encoding_utf32_be; +} + +PUGI__FN xml_encoding guess_buffer_encoding(uint8_t d0, uint8_t d1, uint8_t d2, uint8_t d3) +{ + // look for BOM in first few bytes + if (d0 == 0 && d1 == 0 && d2 == 0xfe && d3 == 0xff) return encoding_utf32_be; + if (d0 == 0xff && d1 == 0xfe && d2 == 0 && d3 == 0) return encoding_utf32_le; + if (d0 == 0xfe && d1 == 0xff) return encoding_utf16_be; + if (d0 == 0xff && d1 == 0xfe) return encoding_utf16_le; + if (d0 == 0xef && d1 == 0xbb && d2 == 0xbf) return encoding_utf8; + + // look for <, (contents); + + PUGI__DMC_VOLATILE uint8_t d0 = data[0], d1 = data[1], d2 = data[2], d3 = data[3]; + + return guess_buffer_encoding(d0, d1, d2, d3); +} + +PUGI__FN bool get_mutable_buffer(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) +{ + size_t length = size / sizeof(char_t); + + if (is_mutable) { + out_buffer = static_cast(const_cast(contents)); + out_length = length; + } else { + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + if (contents) + memcpy(buffer, contents, length * sizeof(char_t)); + else + assert(length == 0); + + buffer[length] = 0; + + out_buffer = buffer; + out_length = length + 1; + } + + return true; +} + +#ifdef PUGIXML_WCHAR_MODE +PUGI__FN bool need_endian_swap_utf(xml_encoding le, xml_encoding re) +{ + return (le == encoding_utf16_be && re == encoding_utf16_le) || (le == encoding_utf16_le && re == encoding_utf16_be) || + (le == encoding_utf32_be && re == encoding_utf32_le) || (le == encoding_utf32_le && re == encoding_utf32_be); +} + +PUGI__FN bool convert_buffer_endian_swap(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) +{ + const char_t* data = static_cast(contents); + size_t length = size / sizeof(char_t); + + if (is_mutable) { + char_t* buffer = const_cast(data); + + convert_wchar_endian_swap(buffer, data, length); + + out_buffer = buffer; + 
out_length = length; + } else { + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + convert_wchar_endian_swap(buffer, data, length); + buffer[length] = 0; + + out_buffer = buffer; + out_length = length + 1; + } + + return true; +} + +template PUGI__FN bool convert_buffer_generic(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, D) +{ + const typename D::type* data = static_cast(contents); + size_t data_length = size / sizeof(typename D::type); + + // first pass: get length in wchar_t units + size_t length = D::process(data, data_length, 0, wchar_counter()); + + // allocate buffer of suitable length + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + // second pass: convert utf16 input to wchar_t + wchar_writer::value_type obegin = reinterpret_cast(buffer); + wchar_writer::value_type oend = D::process(data, data_length, obegin, wchar_writer()); + + assert(oend == obegin + length); + *oend = 0; + + out_buffer = buffer; + out_length = length + 1; + + return true; +} + +PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable) +{ + // get native encoding + xml_encoding wchar_encoding = get_wchar_encoding(); + + // fast path: no conversion required + if (encoding == wchar_encoding) + return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable); + + // only endian-swapping is required + if (need_endian_swap_utf(encoding, wchar_encoding)) + return convert_buffer_endian_swap(out_buffer, out_length, contents, size, is_mutable); + + // source encoding is utf8 + if (encoding == encoding_utf8) + return convert_buffer_generic(out_buffer, out_length, contents, size, utf8_decoder()); + + // source encoding is utf16 + if (encoding == encoding_utf16_be || encoding == encoding_utf16_le) { + xml_encoding native_encoding = 
is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + return (native_encoding == encoding) ? + convert_buffer_generic(out_buffer, out_length, contents, size, utf16_decoder()) : + convert_buffer_generic(out_buffer, out_length, contents, size, utf16_decoder()); + } + + // source encoding is utf32 + if (encoding == encoding_utf32_be || encoding == encoding_utf32_le) { + xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be; + + return (native_encoding == encoding) ? + convert_buffer_generic(out_buffer, out_length, contents, size, utf32_decoder()) : + convert_buffer_generic(out_buffer, out_length, contents, size, utf32_decoder()); + } + + // source encoding is latin1 + if (encoding == encoding_latin1) + return convert_buffer_generic(out_buffer, out_length, contents, size, latin1_decoder()); + + assert(!"Invalid encoding"); + return false; +} +#else +template PUGI__FN bool convert_buffer_generic(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, D) +{ + const typename D::type* data = static_cast(contents); + size_t data_length = size / sizeof(typename D::type); + + // first pass: get length in utf8 units + size_t length = D::process(data, data_length, 0, utf8_counter()); + + // allocate buffer of suitable length + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + // second pass: convert utf16 input to utf8 + uint8_t* obegin = reinterpret_cast(buffer); + uint8_t* oend = D::process(data, data_length, obegin, utf8_writer()); + + assert(oend == obegin + length); + *oend = 0; + + out_buffer = buffer; + out_length = length + 1; + + return true; +} + +PUGI__FN size_t get_latin1_7bit_prefix_length(const uint8_t* data, size_t size) +{ + for (size_t i = 0; i < size; ++i) + if (data[i] > 127) + return i; + + return size; +} + +PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool 
is_mutable) +{ + const uint8_t* data = static_cast(contents); + size_t data_length = size; + + // get size of prefix that does not need utf8 conversion + size_t prefix_length = get_latin1_7bit_prefix_length(data, data_length); + assert(prefix_length <= data_length); + + const uint8_t* postfix = data + prefix_length; + size_t postfix_length = data_length - prefix_length; + + // if no conversion is needed, just return the original buffer + if (postfix_length == 0) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable); + + // first pass: get length in utf8 units + size_t length = prefix_length + latin1_decoder::process(postfix, postfix_length, 0, utf8_counter()); + + // allocate buffer of suitable length + char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; + + // second pass: convert latin1 input to utf8 + memcpy(buffer, data, prefix_length); + + uint8_t* obegin = reinterpret_cast(buffer); + uint8_t* oend = latin1_decoder::process(postfix, postfix_length, obegin + prefix_length, utf8_writer()); + + assert(oend == obegin + length); + *oend = 0; + + out_buffer = buffer; + out_length = length + 1; + + return true; +} + +PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable) +{ + // fast path: no conversion required + if (encoding == encoding_utf8) + return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable); + + // source encoding is utf16 + if (encoding == encoding_utf16_be || encoding == encoding_utf16_le) { + xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + return (native_encoding == encoding) ? 
+ convert_buffer_generic(out_buffer, out_length, contents, size, utf16_decoder()) : + convert_buffer_generic(out_buffer, out_length, contents, size, utf16_decoder()); + } + + // source encoding is utf32 + if (encoding == encoding_utf32_be || encoding == encoding_utf32_le) { + xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be; + + return (native_encoding == encoding) ? + convert_buffer_generic(out_buffer, out_length, contents, size, utf32_decoder()) : + convert_buffer_generic(out_buffer, out_length, contents, size, utf32_decoder()); + } + + // source encoding is latin1 + if (encoding == encoding_latin1) + return convert_buffer_latin1(out_buffer, out_length, contents, size, is_mutable); + + assert(!"Invalid encoding"); + return false; +} +#endif + +PUGI__FN size_t as_utf8_begin(const wchar_t* str, size_t length) +{ + // get length in utf8 characters + return wchar_decoder::process(str, length, 0, utf8_counter()); +} + +PUGI__FN void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length) +{ + // convert to utf8 + uint8_t* begin = reinterpret_cast(buffer); + uint8_t* end = wchar_decoder::process(str, length, begin, utf8_writer()); + + assert(begin + size == end); + (void)!end; + (void)!size; +} + +#ifndef PUGIXML_NO_STL +PUGI__FN std::string as_utf8_impl(const wchar_t* str, size_t length) +{ + // first pass: get length in utf8 characters + size_t size = as_utf8_begin(str, length); + + // allocate resulting string + std::string result; + result.resize(size); + + // second pass: convert to utf8 + if (size > 0) as_utf8_end(&result[0], size, str, length); + + return result; +} + +PUGI__FN std::basic_string as_wide_impl(const char* str, size_t size) +{ + const uint8_t* data = reinterpret_cast(str); + + // first pass: get length in wchar_t units + size_t length = utf8_decoder::process(data, size, 0, wchar_counter()); + + // allocate resulting string + std::basic_string result; + result.resize(length); + + // second 
pass: convert to wchar_t + if (length > 0) { + wchar_writer::value_type begin = reinterpret_cast(&result[0]); + wchar_writer::value_type end = utf8_decoder::process(data, size, begin, wchar_writer()); + + assert(begin + length == end); + (void)!end; + } + + return result; +} +#endif + +template +inline bool strcpy_insitu_allow(size_t length, const Header& header, uintptr_t header_mask, char_t* target) +{ + // never reuse shared memory + if (header & xml_memory_page_contents_shared_mask) return false; + + size_t target_length = strlength(target); + + // always reuse document buffer memory if possible + if ((header & header_mask) == 0) return target_length >= length; + + // reuse heap memory if waste is not too great + const size_t reuse_threshold = 32; + + return target_length >= length && (target_length < reuse_threshold || target_length - length < target_length / 2); +} + +template +PUGI__FN bool strcpy_insitu(String& dest, Header& header, uintptr_t header_mask, const char_t* source, size_t source_length) +{ + if (source_length == 0) { + // empty string and null pointer are equivalent, so just deallocate old memory + xml_allocator* alloc = PUGI__GETPAGE_IMPL(header)->allocator; + + if (header & header_mask) alloc->deallocate_string(dest); + + // mark the string as not allocated + dest = 0; + header &= ~header_mask; + + return true; + } else if (dest && strcpy_insitu_allow(source_length, header, header_mask, dest)) { + // we can reuse old buffer, so just copy the new data (including zero terminator) + memcpy(dest, source, source_length * sizeof(char_t)); + dest[source_length] = 0; + + return true; + } else { + xml_allocator* alloc = PUGI__GETPAGE_IMPL(header)->allocator; + + if (!alloc->reserve()) return false; + + // allocate new buffer + char_t* buf = alloc->allocate_string(source_length + 1); + if (!buf) return false; + + // copy the string (including zero terminator) + memcpy(buf, source, source_length * sizeof(char_t)); + buf[source_length] = 0; + + // 
deallocate old buffer (*after* the above to protect against overlapping memory and/or allocation failures) + if (header & header_mask) alloc->deallocate_string(dest); + + // the string is now allocated, so set the flag + dest = buf; + header |= header_mask; + + return true; + } +} + +struct gap { + char_t* end; + size_t size; + + gap(): end(0), size(0) { + } + + // Push new gap, move s count bytes further (skipping the gap). + // Collapse previous gap. + void push(char_t*& s, size_t count) { + if (end) { // there was a gap already; collapse it + // Move [old_gap_end, new_gap_start) to [old_gap_start, ...) + assert(s >= end); + memmove(end - size, end, reinterpret_cast(s) - reinterpret_cast(end)); + } + + s += count; // end of current gap + + // "merge" two gaps + end = s; + size += count; + } + + // Collapse all gaps, return past-the-end pointer + char_t* flush(char_t* s) { + if (end) { + // Move [old_gap_end, current_pos) to [old_gap_start, ...) + assert(s >= end); + memmove(end - size, end, reinterpret_cast(s) - reinterpret_cast(end)); + + return s - size; + } else return s; + } +}; + +PUGI__FN char_t* strconv_escape(char_t* s, gap& g) +{ + char_t* stre = s + 1; + + switch (*stre) { + case '#': { // &#... + unsigned int ucsc = 0; + + if (stre[1] == 'x') { // &#x... (hex code) + stre += 2; + + char_t ch = *stre; + + if (ch == ';') return stre; + + for (;;) { + if (static_cast(ch - '0') <= 9) + ucsc = 16 * ucsc + (ch - '0'); + else if (static_cast((ch | ' ') - 'a') <= 5) + ucsc = 16 * ucsc + ((ch | ' ') - 'a' + 10); + else if (ch == ';') + break; + else // cancel + return stre; + + ch = *++stre; + } + + ++stre; + } else { // &#... 
(dec code) + char_t ch = *++stre; + + if (ch == ';') return stre; + + for (;;) { + if (static_cast(static_cast(ch) - '0') <= 9) + ucsc = 10 * ucsc + (ch - '0'); + else if (ch == ';') + break; + else // cancel + return stre; + + ch = *++stre; + } + + ++stre; + } + +#ifdef PUGIXML_WCHAR_MODE + s = reinterpret_cast(wchar_writer::any(reinterpret_cast(s), ucsc)); +#else + s = reinterpret_cast(utf8_writer::any(reinterpret_cast(s), ucsc)); +#endif + + g.push(s, stre - s); + return stre; + } + + case 'a': { // &a + ++stre; + + if (*stre == 'm') { // &am + if (*++stre == 'p' && *++stre == ';') { // & + *s++ = '&'; + ++stre; + + g.push(s, stre - s); + return stre; + } + } else if (*stre == 'p') { // &ap + if (*++stre == 'o' && *++stre == 's' && *++stre == ';') { // ' + *s++ = '\''; + ++stre; + + g.push(s, stre - s); + return stre; + } + } + break; + } + + case 'g': { // &g + if (*++stre == 't' && *++stre == ';') { // > + *s++ = '>'; + ++stre; + + g.push(s, stre - s); + return stre; + } + break; + } + + case 'l': { // &l + if (*++stre == 't' && *++stre == ';') { // < + *s++ = '<'; + ++stre; + + g.push(s, stre - s); + return stre; + } + break; + } + + case 'q': { // &q + if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') { // " + *s++ = '"'; + ++stre; + + g.push(s, stre - s); + return stre; + } + break; + } + + default: + break; + } + + return stre; +} + +// Parser utilities +#define PUGI__ENDSWITH(c, e) ((c) == (e) || ((c) == 0 && endch == (e))) +#define PUGI__SKIPWS() { while (PUGI__IS_CHARTYPE(*s, ct_space)) ++s; } +#define PUGI__OPTSET(OPT) ( optmsk & (OPT) ) +#define PUGI__PUSHNODE(TYPE) { cursor = append_new_node(cursor, alloc, TYPE); if (!cursor) PUGI__THROW_ERROR(status_out_of_memory, s); } +#define PUGI__POPNODE() { cursor = cursor->parent; } +#define PUGI__SCANFOR(X) { while (*s != 0 && !(X)) ++s; } +#define PUGI__SCANWHILE(X) { while (X) ++s; } +#define PUGI__SCANWHILE_UNROLL(X) { for (;;) { char_t ss = s[0]; if (PUGI__UNLIKELY(!(X))) { break; 
} ss = s[1]; if (PUGI__UNLIKELY(!(X))) { s += 1; break; } ss = s[2]; if (PUGI__UNLIKELY(!(X))) { s += 2; break; } ss = s[3]; if (PUGI__UNLIKELY(!(X))) { s += 3; break; } s += 4; } } +#define PUGI__ENDSEG() { ch = *s; *s = 0; ++s; } +#define PUGI__THROW_ERROR(err, m) return error_offset = m, error_status = err, static_cast(0) +#define PUGI__CHECK_ERROR(err, m) { if (*s == 0) PUGI__THROW_ERROR(err, m); } + +PUGI__FN char_t* strconv_comment(char_t* s, char_t endch) +{ + gap g; + + while (true) { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_comment)); + + if (*s == '\r') { // Either a single 0x0d or 0x0d 0x0a pair + *s++ = '\n'; // replace first one with 0x0a + + if (*s == '\n') g.push(s, 1); + } else if (s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>')) { // comment ends here + *g.flush(s) = 0; + + return s + (s[2] == '>' ? 3 : 2); + } else if (*s == 0) { + return 0; + } else ++s; + } +} + +PUGI__FN char_t* strconv_cdata(char_t* s, char_t endch) +{ + gap g; + + while (true) { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_cdata)); + + if (*s == '\r') { // Either a single 0x0d or 0x0d 0x0a pair + *s++ = '\n'; // replace first one with 0x0a + + if (*s == '\n') g.push(s, 1); + } else if (s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>')) { // CDATA ends here + *g.flush(s) = 0; + + return s + 1; + } else if (*s == 0) { + return 0; + } else ++s; + } +} + +typedef char_t* (*strconv_pcdata_t)(char_t*); + +template struct strconv_pcdata_impl { + static char_t* parse(char_t* s) { + gap g; + + char_t* begin = s; + + while (true) { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_pcdata)); + + if (*s == '<') { // PCDATA ends here + char_t* end = g.flush(s); + + if (opt_trim::value) + while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space)) + --end; + + *end = 0; + + return s + 1; + } else if (opt_eol::value && *s == '\r') { // Either a single 0x0d or 0x0d 0x0a pair + *s++ = '\n'; // replace first one with 0x0a + + if (*s == '\n') 
g.push(s, 1); + } else if (opt_escape::value && *s == '&') { + s = strconv_escape(s, g); + } else if (*s == 0) { + char_t* end = g.flush(s); + + if (opt_trim::value) + while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space)) + --end; + + *end = 0; + + return s; + } else ++s; + } + } +}; + +PUGI__FN strconv_pcdata_t get_strconv_pcdata(unsigned int optmask) +{ + PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_trim_pcdata == 0x0800); + + switch (((optmask >> 4) & 3) | ((optmask >> 9) & 4)) { // get bitmask for flags (eol escapes trim) + case 0: + return strconv_pcdata_impl::parse; + case 1: + return strconv_pcdata_impl::parse; + case 2: + return strconv_pcdata_impl::parse; + case 3: + return strconv_pcdata_impl::parse; + case 4: + return strconv_pcdata_impl::parse; + case 5: + return strconv_pcdata_impl::parse; + case 6: + return strconv_pcdata_impl::parse; + case 7: + return strconv_pcdata_impl::parse; + default: + assert(false); + return 0; // should not get here + } +} + +typedef char_t* (*strconv_attribute_t)(char_t*, char_t); + +template struct strconv_attribute_impl { + static char_t* parse_wnorm(char_t* s, char_t end_quote) { + gap g; + + // trim leading whitespaces + if (PUGI__IS_CHARTYPE(*s, ct_space)) { + char_t* str = s; + + do ++str; + while (PUGI__IS_CHARTYPE(*str, ct_space)); + + g.push(s, str - s); + } + + while (true) { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws | ct_space)); + + if (*s == end_quote) { + char_t* str = g.flush(s); + + do *str-- = 0; + while (PUGI__IS_CHARTYPE(*str, ct_space)); + + return s + 1; + } else if (PUGI__IS_CHARTYPE(*s, ct_space)) { + *s++ = ' '; + + if (PUGI__IS_CHARTYPE(*s, ct_space)) { + char_t* str = s + 1; + while (PUGI__IS_CHARTYPE(*str, ct_space)) ++str; + + g.push(s, str - s); + } + } else if (opt_escape::value && *s == '&') { + s = strconv_escape(s, g); + } else if (!*s) { + return 0; + } else ++s; + } + } + + static char_t* parse_wconv(char_t* s, char_t end_quote) { + 
gap g; + + while (true) { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws)); + + if (*s == end_quote) { + *g.flush(s) = 0; + + return s + 1; + } else if (PUGI__IS_CHARTYPE(*s, ct_space)) { + if (*s == '\r') { + *s++ = ' '; + + if (*s == '\n') g.push(s, 1); + } else *s++ = ' '; + } else if (opt_escape::value && *s == '&') { + s = strconv_escape(s, g); + } else if (!*s) { + return 0; + } else ++s; + } + } + + static char_t* parse_eol(char_t* s, char_t end_quote) { + gap g; + + while (true) { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr)); + + if (*s == end_quote) { + *g.flush(s) = 0; + + return s + 1; + } else if (*s == '\r') { + *s++ = '\n'; + + if (*s == '\n') g.push(s, 1); + } else if (opt_escape::value && *s == '&') { + s = strconv_escape(s, g); + } else if (!*s) { + return 0; + } else ++s; + } + } + + static char_t* parse_simple(char_t* s, char_t end_quote) { + gap g; + + while (true) { + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr)); + + if (*s == end_quote) { + *g.flush(s) = 0; + + return s + 1; + } else if (opt_escape::value && *s == '&') { + s = strconv_escape(s, g); + } else if (!*s) { + return 0; + } else ++s; + } + } +}; + +PUGI__FN strconv_attribute_t get_strconv_attribute(unsigned int optmask) +{ + PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80); + + switch ((optmask >> 4) & 15) { // get bitmask for flags (wconv wnorm eol escapes) + case 0: + return strconv_attribute_impl::parse_simple; + case 1: + return strconv_attribute_impl::parse_simple; + case 2: + return strconv_attribute_impl::parse_eol; + case 3: + return strconv_attribute_impl::parse_eol; + case 4: + return strconv_attribute_impl::parse_wconv; + case 5: + return strconv_attribute_impl::parse_wconv; + case 6: + return strconv_attribute_impl::parse_wconv; + case 7: + return strconv_attribute_impl::parse_wconv; + case 8: + return strconv_attribute_impl::parse_wnorm; 
+ case 9: + return strconv_attribute_impl::parse_wnorm; + case 10: + return strconv_attribute_impl::parse_wnorm; + case 11: + return strconv_attribute_impl::parse_wnorm; + case 12: + return strconv_attribute_impl::parse_wnorm; + case 13: + return strconv_attribute_impl::parse_wnorm; + case 14: + return strconv_attribute_impl::parse_wnorm; + case 15: + return strconv_attribute_impl::parse_wnorm; + default: + assert(false); + return 0; // should not get here + } +} + +inline xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0) +{ + xml_parse_result result; + result.status = status; + result.offset = offset; + + return result; +} + +struct xml_parser { + xml_allocator alloc; + xml_allocator* alloc_state; + char_t* error_offset; + xml_parse_status error_status; + + xml_parser(xml_allocator* alloc_): alloc(*alloc_), alloc_state(alloc_), error_offset(0), error_status(status_ok) { + } + + ~xml_parser() { + *alloc_state = alloc; + } + + // DOCTYPE consists of nested sections of the following possible types: + // , , "...", '...' + // + // + // First group can not contain nested groups + // Second group can contain nested groups of the same type + // Third group can contain all other groups + char_t* parse_doctype_primitive(char_t* s) { + if (*s == '"' || *s == '\'') { + // quoted string + char_t ch = *s++; + PUGI__SCANFOR(*s == ch); + if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s); + + s++; + } else if (s[0] == '<' && s[1] == '?') { + // + s += 2; + PUGI__SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype + if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s); + + s += 2; + } else if (s[0] == '<' && s[1] == '!' 
&& s[2] == '-' && s[3] == '-') { + s += 4; + PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype + if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s); + + s += 3; + } else PUGI__THROW_ERROR(status_bad_doctype, s); + + return s; + } + + char_t* parse_doctype_ignore(char_t* s) { + size_t depth = 0; + + assert(s[0] == '<' && s[1] == '!' && s[2] == '['); + s += 3; + + while (*s) { + if (s[0] == '<' && s[1] == '!' && s[2] == '[') { + // nested ignore section + s += 3; + depth++; + } else if (s[0] == ']' && s[1] == ']' && s[2] == '>') { + // ignore section end + s += 3; + + if (depth == 0) + return s; + + depth--; + } else s++; + } + + PUGI__THROW_ERROR(status_bad_doctype, s); + } + + char_t* parse_doctype_group(char_t* s, char_t endch) { + size_t depth = 0; + + assert((s[0] == '<' || s[0] == 0) && s[1] == '!'); + s += 2; + + while (*s) { + if (s[0] == '<' && s[1] == '!' && s[2] != '-') { + if (s[2] == '[') { + // ignore + s = parse_doctype_ignore(s); + if (!s) return s; + } else { + // some control group + s += 2; + depth++; + } + } else if (s[0] == '<' || s[0] == '"' || s[0] == '\'') { + // unknown tag (forbidden), or some primitive group + s = parse_doctype_primitive(s); + if (!s) return s; + } else if (*s == '>') { + if (depth == 0) + return s; + + depth--; + s++; + } else s++; + } + + if (depth != 0 || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s); + + return s; + } + + char_t* parse_exclamation(char_t* s, xml_node_struct* cursor, unsigned int optmsk, char_t endch) { + // parse node contents, starting with exclamation mark + ++s; + + if (*s == '-') { // 'value = s; // Save the offset. + } + + if (PUGI__OPTSET(parse_eol) && PUGI__OPTSET(parse_comments)) { + s = strconv_comment(s, endch); + + if (!s) PUGI__THROW_ERROR(status_bad_comment, cursor->value); + } else { + // Scan for terminating '-->'. 
+ PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>')); + PUGI__CHECK_ERROR(status_bad_comment, s); + + if (PUGI__OPTSET(parse_comments)) + *s = 0; // Zero-terminate this segment at the first terminating '-'. + + s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'. + } + } else PUGI__THROW_ERROR(status_bad_comment, s); + } else if (*s == '[') { + // 'value = s; // Save the offset. + + if (PUGI__OPTSET(parse_eol)) { + s = strconv_cdata(s, endch); + + if (!s) PUGI__THROW_ERROR(status_bad_cdata, cursor->value); + } else { + // Scan for terminating ']]>'. + PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>')); + PUGI__CHECK_ERROR(status_bad_cdata, s); + + *s++ = 0; // Zero-terminate this segment. + } + } else { // Flagged for discard, but we still have to scan for the terminator. + // Scan for terminating ']]>'. + PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>')); + PUGI__CHECK_ERROR(status_bad_cdata, s); + + ++s; + } + + s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'. 
+ } else PUGI__THROW_ERROR(status_bad_cdata, s); + } else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && PUGI__ENDSWITH(s[6], 'E')) { + s -= 2; + + if (cursor->parent) PUGI__THROW_ERROR(status_bad_doctype, s); + + char_t* mark = s + 9; + + s = parse_doctype_group(s, endch); + if (!s) return s; + + assert((*s == 0 && endch == '>') || *s == '>'); + if (*s) *s++ = 0; + + if (PUGI__OPTSET(parse_doctype)) { + while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark; + + PUGI__PUSHNODE(node_doctype); + + cursor->value = mark; + } + } else if (*s == 0 && endch == '-') PUGI__THROW_ERROR(status_bad_comment, s); + else if (*s == 0 && endch == '[') PUGI__THROW_ERROR(status_bad_cdata, s); + else PUGI__THROW_ERROR(status_unrecognized_tag, s); + + return s; + } + + char_t* parse_question(char_t* s, xml_node_struct*& ref_cursor, unsigned int optmsk, char_t endch) { + // load into registers + xml_node_struct* cursor = ref_cursor; + char_t ch = 0; + + // parse node contents, starting with question mark + ++s; + + // read PI target + char_t* target = s; + + if (!PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_pi, s); + + PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol)); + PUGI__CHECK_ERROR(status_bad_pi, s); + + // determine node type; stricmp / strcasecmp is not portable + bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' && (target[2] | ' ') == 'l' && target + 3 == s; + + if (declaration ? 
PUGI__OPTSET(parse_declaration) : PUGI__OPTSET(parse_pi)) { + if (declaration) { + // disallow non top-level declarations + if (cursor->parent) PUGI__THROW_ERROR(status_bad_pi, s); + + PUGI__PUSHNODE(node_declaration); + } else { + PUGI__PUSHNODE(node_pi); + } + + cursor->name = target; + + PUGI__ENDSEG(); + + // parse value/attributes + if (ch == '?') { + // empty node + if (!PUGI__ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_pi, s); + s += (*s == '>'); + + PUGI__POPNODE(); + } else if (PUGI__IS_CHARTYPE(ch, ct_space)) { + PUGI__SKIPWS(); + + // scan for tag end + char_t* value = s; + + PUGI__SCANFOR(s[0] == '?' && PUGI__ENDSWITH(s[1], '>')); + PUGI__CHECK_ERROR(status_bad_pi, s); + + if (declaration) { + // replace ending ? with / so that 'element' terminates properly + *s = '/'; + + // we exit from this function with cursor at node_declaration, which is a signal to parse() to go to LOC_ATTRIBUTES + s = value; + } else { + // store value and step over > + cursor->value = value; + + PUGI__POPNODE(); + + PUGI__ENDSEG(); + + s += (*s == '>'); + } + } else PUGI__THROW_ERROR(status_bad_pi, s); + } else { + // scan for tag end + PUGI__SCANFOR(s[0] == '?' && PUGI__ENDSWITH(s[1], '>')); + PUGI__CHECK_ERROR(status_bad_pi, s); + + s += (s[1] == '>' ? 2 : 1); + } + + // store from registers + ref_cursor = cursor; + + return s; + } + + char_t* parse_tree(char_t* s, xml_node_struct* root, unsigned int optmsk, char_t endch) { + strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk); + strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk); + + char_t ch = 0; + xml_node_struct* cursor = root; + char_t* mark = s; + + while (*s != 0) { + if (*s == '<') { + ++s; + +LOC_TAG: + if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) { // '<#...' + PUGI__PUSHNODE(node_element); // Append a new node to the tree. + + cursor->name = s; + + PUGI__SCANWHILE_UNROLL(PUGI__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator. 
+ PUGI__ENDSEG(); // Save char in 'ch', terminate & step over. + + if (ch == '>') { + // end of tag + } else if (PUGI__IS_CHARTYPE(ch, ct_space)) { +LOC_ATTRIBUTES: + while (true) { + PUGI__SKIPWS(); // Eat any whitespace. + + if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) { // <... #... + xml_attribute_struct* a = append_new_attribute(cursor, alloc); // Make space for this attribute. + if (!a) PUGI__THROW_ERROR(status_out_of_memory, s); + + a->name = s; // Save the offset. + + PUGI__SCANWHILE_UNROLL(PUGI__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator. + PUGI__ENDSEG(); // Save char in 'ch', terminate & step over. + + if (PUGI__IS_CHARTYPE(ch, ct_space)) { + PUGI__SKIPWS(); // Eat any whitespace. + + ch = *s; + ++s; + } + + if (ch == '=') { // '<... #=...' + PUGI__SKIPWS(); // Eat any whitespace. + + if (*s == '"' || *s == '\'') { // '<... #="...' + ch = *s; // Save quote char to avoid breaking on "''" -or- '""'. + ++s; // Step over the quote. + a->value = s; // Save the offset. + + s = strconv_attribute(s, ch); + + if (!s) PUGI__THROW_ERROR(status_bad_attribute, a->value); + + // After this line the loop continues from the start; + // Whitespaces, / and > are ok, symbols and EOF are wrong, + // everything else will be detected + if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_attribute, s); + } else PUGI__THROW_ERROR(status_bad_attribute, s); + } else PUGI__THROW_ERROR(status_bad_attribute, s); + } else if (*s == '/') { + ++s; + + if (*s == '>') { + PUGI__POPNODE(); + s++; + break; + } else if (*s == 0 && endch == '>') { + PUGI__POPNODE(); + break; + } else PUGI__THROW_ERROR(status_bad_start_element, s); + } else if (*s == '>') { + ++s; + + break; + } else if (*s == 0 && endch == '>') { + break; + } else PUGI__THROW_ERROR(status_bad_start_element, s); + } + + // !!! + } else if (ch == '/') { // '<#.../' + if (!PUGI__ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_start_element, s); + + PUGI__POPNODE(); // Pop. 
+ + s += (*s == '>'); + } else if (ch == 0) { + // we stepped over null terminator, backtrack & handle closing tag + --s; + + if (endch != '>') PUGI__THROW_ERROR(status_bad_start_element, s); + } else PUGI__THROW_ERROR(status_bad_start_element, s); + } else if (*s == '/') { + ++s; + + char_t* name = cursor->name; + if (!name) PUGI__THROW_ERROR(status_end_element_mismatch, s); + + while (PUGI__IS_CHARTYPE(*s, ct_symbol)) { + if (*s++ != *name++) PUGI__THROW_ERROR(status_end_element_mismatch, s); + } + + if (*name) { + if (*s == 0 && name[0] == endch && name[1] == 0) PUGI__THROW_ERROR(status_bad_end_element, s); + else PUGI__THROW_ERROR(status_end_element_mismatch, s); + } + + PUGI__POPNODE(); // Pop. + + PUGI__SKIPWS(); + + if (*s == 0) { + if (endch != '>') PUGI__THROW_ERROR(status_bad_end_element, s); + } else { + if (*s != '>') PUGI__THROW_ERROR(status_bad_end_element, s); + ++s; + } + } else if (*s == '?') { // 'first_child) continue; + } + } + + if (!PUGI__OPTSET(parse_trim_pcdata)) + s = mark; + + if (cursor->parent || PUGI__OPTSET(parse_fragment)) { + PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree. + cursor->value = s; // Save the offset. + + s = strconv_pcdata(s); + + PUGI__POPNODE(); // Pop since this is a standalone. + + if (!*s) break; + } else { + PUGI__SCANFOR(*s == '<'); // '...<' + if (!*s) break; + + ++s; + } + + // We're after '<' + goto LOC_TAG; + } + } + + // check that last tag is closed + if (cursor != root) PUGI__THROW_ERROR(status_end_element_mismatch, s); + + return s; + } + +#ifdef PUGIXML_WCHAR_MODE + static char_t* parse_skip_bom(char_t* s) { + unsigned int bom = 0xfeff; + return (s[0] == static_cast(bom)) ? s + 1 : s; + } +#else + static char_t* parse_skip_bom(char_t* s) { + return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? 
s + 3 : s; + } +#endif + + static bool has_element_node_siblings(xml_node_struct* node) { + while (node) { + if (PUGI__NODETYPE(node) == node_element) return true; + + node = node->next_sibling; + } + + return false; + } + + static xml_parse_result parse(char_t* buffer, size_t length, xml_document_struct* xmldoc, xml_node_struct* root, unsigned int optmsk) { + // early-out for empty documents + if (length == 0) + return make_parse_result(PUGI__OPTSET(parse_fragment) ? status_ok : status_no_document_element); + + // get last child of the root before parsing + xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c + 0 : 0; + + // create parser on stack + xml_parser parser(static_cast(xmldoc)); + + // save last character and make buffer zero-terminated (speeds up parsing) + char_t endch = buffer[length - 1]; + buffer[length - 1] = 0; + + // skip BOM to make sure it does not end up as part of parse output + char_t* buffer_data = parse_skip_bom(buffer); + + // perform actual parsing + parser.parse_tree(buffer_data, root, optmsk, endch); + + xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0); + assert(result.offset >= 0 && static_cast(result.offset) <= length); + + if (result) { + // since we removed last character, we have to handle the only possible false positive (stray <) + if (endch == '<') + return make_parse_result(status_unrecognized_tag, length - 1); + + // check if there are any element nodes parsed + xml_node_struct* first_root_child_parsed = last_root_child ? 
last_root_child->next_sibling + 0 : root->first_child+ 0; + + if (!PUGI__OPTSET(parse_fragment) && !has_element_node_siblings(first_root_child_parsed)) + return make_parse_result(status_no_document_element, length - 1); + } else { + // roll back offset if it occurs on a null terminator in the source buffer + if (result.offset > 0 && static_cast(result.offset) == length - 1 && endch == 0) + result.offset--; + } + + return result; + } +}; + +// Output facilities +PUGI__FN xml_encoding get_write_native_encoding() +{ +#ifdef PUGIXML_WCHAR_MODE + return get_wchar_encoding(); +#else + return encoding_utf8; +#endif +} + +PUGI__FN xml_encoding get_write_encoding(xml_encoding encoding) +{ + // replace wchar encoding with utf implementation + if (encoding == encoding_wchar) return get_wchar_encoding(); + + // replace utf16 encoding with utf16 with specific endianness + if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + // replace utf32 encoding with utf32 with specific endianness + if (encoding == encoding_utf32) return is_little_endian() ? 
encoding_utf32_le : encoding_utf32_be; + + // only do autodetection if no explicit encoding is requested + if (encoding != encoding_auto) return encoding; + + // assume utf8 encoding + return encoding_utf8; +} + +template PUGI__FN size_t convert_buffer_output_generic(typename T::value_type dest, const char_t* data, size_t length, D, T) +{ + PUGI__STATIC_ASSERT(sizeof(char_t) == sizeof(typename D::type)); + + typename T::value_type end = D::process(reinterpret_cast(data), length, dest, T()); + + return static_cast(end - dest) * sizeof(*dest); +} + +template PUGI__FN size_t convert_buffer_output_generic(typename T::value_type dest, const char_t* data, size_t length, D, T, bool opt_swap) +{ + PUGI__STATIC_ASSERT(sizeof(char_t) == sizeof(typename D::type)); + + typename T::value_type end = D::process(reinterpret_cast(data), length, dest, T()); + + if (opt_swap) { + for (typename T::value_type i = dest; i != end; ++i) + *i = endian_swap(*i); + } + + return static_cast(end - dest) * sizeof(*dest); +} + +#ifdef PUGIXML_WCHAR_MODE +PUGI__FN size_t get_valid_length(const char_t* data, size_t length) +{ + if (length < 1) return 0; + + // discard last character if it's the lead of a surrogate pair + return (sizeof(wchar_t) == 2 && static_cast(static_cast(data[length - 1]) - 0xD800) < 0x400) ? length - 1 : length; +} + +PUGI__FN size_t convert_buffer_output(char_t* r_char, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding) +{ + // only endian-swapping is required + if (need_endian_swap_utf(encoding, get_wchar_encoding())) { + convert_wchar_endian_swap(r_char, data, length); + + return length * sizeof(char_t); + } + + // convert to utf8 + if (encoding == encoding_utf8) + return convert_buffer_output_generic(r_u8, data, length, wchar_decoder(), utf8_writer()); + + // convert to utf16 + if (encoding == encoding_utf16_be || encoding == encoding_utf16_le) { + xml_encoding native_encoding = is_little_endian() ? 
encoding_utf16_le : encoding_utf16_be; + + return convert_buffer_output_generic(r_u16, data, length, wchar_decoder(), utf16_writer(), native_encoding != encoding); + } + + // convert to utf32 + if (encoding == encoding_utf32_be || encoding == encoding_utf32_le) { + xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be; + + return convert_buffer_output_generic(r_u32, data, length, wchar_decoder(), utf32_writer(), native_encoding != encoding); + } + + // convert to latin1 + if (encoding == encoding_latin1) + return convert_buffer_output_generic(r_u8, data, length, wchar_decoder(), latin1_writer()); + + assert(!"Invalid encoding"); + return 0; +} +#else +PUGI__FN size_t get_valid_length(const char_t* data, size_t length) +{ + if (length < 5) return 0; + + for (size_t i = 1; i <= 4; ++i) { + uint8_t ch = static_cast(data[length - i]); + + // either a standalone character or a leading one + if ((ch & 0xc0) != 0x80) return length - i; + } + + // there are four non-leading characters at the end, sequence tail is broken so might as well process the whole chunk + return length; +} + +PUGI__FN size_t convert_buffer_output(char_t* /* r_char */, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding) +{ + if (encoding == encoding_utf16_be || encoding == encoding_utf16_le) { + xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be; + + return convert_buffer_output_generic(r_u16, data, length, utf8_decoder(), utf16_writer(), native_encoding != encoding); + } + + if (encoding == encoding_utf32_be || encoding == encoding_utf32_le) { + xml_encoding native_encoding = is_little_endian() ? 
encoding_utf32_le : encoding_utf32_be; + + return convert_buffer_output_generic(r_u32, data, length, utf8_decoder(), utf32_writer(), native_encoding != encoding); + } + + if (encoding == encoding_latin1) + return convert_buffer_output_generic(r_u8, data, length, utf8_decoder(), latin1_writer()); + + assert(!"Invalid encoding"); + return 0; +} +#endif + +class xml_buffered_writer +{ + xml_buffered_writer(const xml_buffered_writer&); + xml_buffered_writer& operator=(const xml_buffered_writer&); + +public: + xml_buffered_writer(xml_writer& writer_, xml_encoding user_encoding): writer(writer_), bufsize(0), encoding(get_write_encoding(user_encoding)) { + PUGI__STATIC_ASSERT(bufcapacity >= 8); + } + + size_t flush() { + flush(buffer, bufsize); + bufsize = 0; + return 0; + } + + void flush(const char_t* data, size_t size) { + if (size == 0) return; + + // fast path, just write data + if (encoding == get_write_native_encoding()) + writer.write(data, size * sizeof(char_t)); + else { + // convert chunk + size_t result = convert_buffer_output(scratch.data_char, scratch.data_u8, scratch.data_u16, scratch.data_u32, data, size, encoding); + assert(result <= sizeof(scratch)); + + // write data + writer.write(scratch.data_u8, result); + } + } + + void write_direct(const char_t* data, size_t length) { + // flush the remaining buffer contents + flush(); + + // handle large chunks + if (length > bufcapacity) { + if (encoding == get_write_native_encoding()) { + // fast path, can just write data chunk + writer.write(data, length * sizeof(char_t)); + return; + } + + // need to convert in suitable chunks + while (length > bufcapacity) { + // get chunk size by selecting such number of characters that are guaranteed to fit into scratch buffer + // and form a complete codepoint sequence (i.e. 
discard start of last codepoint if necessary) + size_t chunk_size = get_valid_length(data, bufcapacity); + assert(chunk_size); + + // convert chunk and write + flush(data, chunk_size); + + // iterate + data += chunk_size; + length -= chunk_size; + } + + // small tail is copied below + bufsize = 0; + } + + memcpy(buffer + bufsize, data, length * sizeof(char_t)); + bufsize += length; + } + + void write_buffer(const char_t* data, size_t length) { + size_t offset = bufsize; + + if (offset + length <= bufcapacity) { + memcpy(buffer + offset, data, length * sizeof(char_t)); + bufsize = offset + length; + } else { + write_direct(data, length); + } + } + + void write_string(const char_t* data) { + // write the part of the string that fits in the buffer + size_t offset = bufsize; + + while (*data && offset < bufcapacity) + buffer[offset++] = *data++; + + // write the rest + if (offset < bufcapacity) { + bufsize = offset; + } else { + // backtrack a bit if we have split the codepoint + size_t length = offset - bufsize; + size_t extra = length - get_valid_length(data - length, length); + + bufsize = offset - extra; + + write_direct(data - extra, strlength(data) + extra); + } + } + + void write(char_t d0) { + size_t offset = bufsize; + if (offset > bufcapacity - 1) offset = flush(); + + buffer[offset + 0] = d0; + bufsize = offset + 1; + } + + void write(char_t d0, char_t d1) { + size_t offset = bufsize; + if (offset > bufcapacity - 2) offset = flush(); + + buffer[offset + 0] = d0; + buffer[offset + 1] = d1; + bufsize = offset + 2; + } + + void write(char_t d0, char_t d1, char_t d2) { + size_t offset = bufsize; + if (offset > bufcapacity - 3) offset = flush(); + + buffer[offset + 0] = d0; + buffer[offset + 1] = d1; + buffer[offset + 2] = d2; + bufsize = offset + 3; + } + + void write(char_t d0, char_t d1, char_t d2, char_t d3) { + size_t offset = bufsize; + if (offset > bufcapacity - 4) offset = flush(); + + buffer[offset + 0] = d0; + buffer[offset + 1] = d1; + buffer[offset + 
2] = d2; + buffer[offset + 3] = d3; + bufsize = offset + 4; + } + + void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4) { + size_t offset = bufsize; + if (offset > bufcapacity - 5) offset = flush(); + + buffer[offset + 0] = d0; + buffer[offset + 1] = d1; + buffer[offset + 2] = d2; + buffer[offset + 3] = d3; + buffer[offset + 4] = d4; + bufsize = offset + 5; + } + + void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4, char_t d5) { + size_t offset = bufsize; + if (offset > bufcapacity - 6) offset = flush(); + + buffer[offset + 0] = d0; + buffer[offset + 1] = d1; + buffer[offset + 2] = d2; + buffer[offset + 3] = d3; + buffer[offset + 4] = d4; + buffer[offset + 5] = d5; + bufsize = offset + 6; + } + + // utf8 maximum expansion: x4 (-> utf32) + // utf16 maximum expansion: x2 (-> utf32) + // utf32 maximum expansion: x1 + enum { + bufcapacitybytes = +#ifdef PUGIXML_MEMORY_OUTPUT_STACK + PUGIXML_MEMORY_OUTPUT_STACK +#else + 10240 +#endif + , + bufcapacity = bufcapacitybytes / (sizeof(char_t) + 4) + }; + + char_t buffer[bufcapacity]; + + union { + uint8_t data_u8[4 * bufcapacity]; + uint16_t data_u16[2 * bufcapacity]; + uint32_t data_u32[bufcapacity]; + char_t data_char[bufcapacity]; + } scratch; + + xml_writer& writer; + size_t bufsize; + xml_encoding encoding; +}; + +PUGI__FN void text_output_escaped(xml_buffered_writer& writer, const char_t* s, chartypex_t type) +{ + while (*s) { + const char_t* prev = s; + + // While *s is a usual symbol + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPEX(ss, type)); + + writer.write_buffer(prev, static_cast(s - prev)); + + switch (*s) { + case 0: + break; + case '&': + writer.write('&', 'a', 'm', 'p', ';'); + ++s; + break; + case '<': + writer.write('&', 'l', 't', ';'); + ++s; + break; + case '>': + writer.write('&', 'g', 't', ';'); + ++s; + break; + case '"': + writer.write('&', 'q', 'u', 'o', 't', ';'); + ++s; + break; + default: { // s is not a usual symbol + unsigned int ch = static_cast(*s++); + assert(ch < 
32); + + writer.write('&', '#', static_cast((ch / 10) + '0'), static_cast((ch % 10) + '0'), ';'); + } + } + } +} + +PUGI__FN void text_output(xml_buffered_writer& writer, const char_t* s, chartypex_t type, unsigned int flags) +{ + if (flags & format_no_escapes) + writer.write_string(s); + else + text_output_escaped(writer, s, type); +} + +PUGI__FN void text_output_cdata(xml_buffered_writer& writer, const char_t* s) +{ + do { + writer.write('<', '!', '[', 'C', 'D'); + writer.write('A', 'T', 'A', '['); + + const char_t* prev = s; + + // look for ]]> sequence - we can't output it as is since it terminates CDATA + while (*s && !(s[0] == ']' && s[1] == ']' && s[2] == '>')) ++s; + + // skip ]] if we stopped at ]]>, > will go to the next CDATA section + if (*s) s += 2; + + writer.write_buffer(prev, static_cast(s - prev)); + + writer.write(']', ']', '>'); + } while (*s); +} + +PUGI__FN void text_output_indent(xml_buffered_writer& writer, const char_t* indent, size_t indent_length, unsigned int depth) +{ + switch (indent_length) { + case 1: { + for (unsigned int i = 0; i < depth; ++i) + writer.write(indent[0]); + break; + } + + case 2: { + for (unsigned int i = 0; i < depth; ++i) + writer.write(indent[0], indent[1]); + break; + } + + case 3: { + for (unsigned int i = 0; i < depth; ++i) + writer.write(indent[0], indent[1], indent[2]); + break; + } + + case 4: { + for (unsigned int i = 0; i < depth; ++i) + writer.write(indent[0], indent[1], indent[2], indent[3]); + break; + } + + default: { + for (unsigned int i = 0; i < depth; ++i) + writer.write_buffer(indent, indent_length); + } + } +} + +PUGI__FN void node_output_comment(xml_buffered_writer& writer, const char_t* s) +{ + writer.write('<', '!', '-', '-'); + + while (*s) { + const char_t* prev = s; + + // look for -\0 or -- sequence - we can't output it since -- is illegal in comment body + while (*s && !(s[0] == '-' && (s[1] == '-' || s[1] == 0))) ++s; + + writer.write_buffer(prev, static_cast(s - prev)); + + if (*s) { + 
assert(*s == '-'); + + writer.write('-', ' '); + ++s; + } + } + + writer.write('-', '-', '>'); +} + +PUGI__FN void node_output_pi_value(xml_buffered_writer& writer, const char_t* s) +{ + while (*s) { + const char_t* prev = s; + + // look for ?> sequence - we can't output it since ?> terminates PI + while (*s && !(s[0] == '?' && s[1] == '>')) ++s; + + writer.write_buffer(prev, static_cast(s - prev)); + + if (*s) { + assert(s[0] == '?' && s[1] == '>'); + + writer.write('?', ' ', '>'); + s += 2; + } + } +} + +PUGI__FN void node_output_attributes(xml_buffered_writer& writer, xml_node_struct* node, const char_t* indent, size_t indent_length, unsigned int flags, unsigned int depth) +{ + const char_t* default_name = PUGIXML_TEXT(":anonymous"); + + for (xml_attribute_struct* a = node->first_attribute; a; a = a->next_attribute) { + if ((flags & (format_indent_attributes | format_raw)) == format_indent_attributes) { + writer.write('\n'); + + text_output_indent(writer, indent, indent_length, depth + 1); + } else { + writer.write(' '); + } + + writer.write_string(a->name ? a->name + 0 : default_name); + writer.write('=', '"'); + + if (a->value) + text_output(writer, a->value, ctx_special_attr, flags); + + writer.write('"'); + } +} + +PUGI__FN bool node_output_start(xml_buffered_writer& writer, xml_node_struct* node, const char_t* indent, size_t indent_length, unsigned int flags, unsigned int depth) +{ + const char_t* default_name = PUGIXML_TEXT(":anonymous"); + const char_t* name = node->name ? 
node->name + 0 : default_name; + + writer.write('<'); + writer.write_string(name); + + if (node->first_attribute) + node_output_attributes(writer, node, indent, indent_length, flags, depth); + + if (!node->first_child) { + writer.write(' ', '/', '>'); + + return false; + } else { + writer.write('>'); + + return true; + } +} + +PUGI__FN void node_output_end(xml_buffered_writer& writer, xml_node_struct* node) +{ + const char_t* default_name = PUGIXML_TEXT(":anonymous"); + const char_t* name = node->name ? node->name + 0 : default_name; + + writer.write('<', '/'); + writer.write_string(name); + writer.write('>'); +} + +PUGI__FN void node_output_simple(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags) +{ + const char_t* default_name = PUGIXML_TEXT(":anonymous"); + + switch (PUGI__NODETYPE(node)) { + case node_pcdata: + text_output(writer, node->value ? node->value + 0 : PUGIXML_TEXT(""), ctx_special_pcdata, flags); + break; + + case node_cdata: + text_output_cdata(writer, node->value ? node->value + 0 : PUGIXML_TEXT("")); + break; + + case node_comment: + node_output_comment(writer, node->value ? node->value + 0 : PUGIXML_TEXT("")); + break; + + case node_pi: + writer.write('<', '?'); + writer.write_string(node->name ? node->name + 0 : default_name); + + if (node->value) { + writer.write(' '); + node_output_pi_value(writer, node->value); + } + + writer.write('?', '>'); + break; + + case node_declaration: + writer.write('<', '?'); + writer.write_string(node->name ? 
node->name + 0 : default_name); + node_output_attributes(writer, node, PUGIXML_TEXT(""), 0, flags | format_raw, 0); + writer.write('?', '>'); + break; + + case node_doctype: + writer.write('<', '!', 'D', 'O', 'C'); + writer.write('T', 'Y', 'P', 'E'); + + if (node->value) { + writer.write(' '); + writer.write_string(node->value); + } + + writer.write('>'); + break; + + default: + assert(!"Invalid node type"); + } +} + +enum indent_flags_t { + indent_newline = 1, + indent_indent = 2 +}; + +PUGI__FN void node_output(xml_buffered_writer& writer, xml_node_struct* root, const char_t* indent, unsigned int flags, unsigned int depth) +{ + size_t indent_length = ((flags & (format_indent | format_indent_attributes)) && (flags & format_raw) == 0) ? strlength(indent) : 0; + unsigned int indent_flags = indent_indent; + + xml_node_struct* node = root; + + do { + assert(node); + + // begin writing current node + if (PUGI__NODETYPE(node) == node_pcdata || PUGI__NODETYPE(node) == node_cdata) { + node_output_simple(writer, node, flags); + + indent_flags = 0; + } else { + if ((indent_flags & indent_newline) && (flags & format_raw) == 0) + writer.write('\n'); + + if ((indent_flags & indent_indent) && indent_length) + text_output_indent(writer, indent, indent_length, depth); + + if (PUGI__NODETYPE(node) == node_element) { + indent_flags = indent_newline | indent_indent; + + if (node_output_start(writer, node, indent, indent_length, flags, depth)) { + node = node->first_child; + depth++; + continue; + } + } else if (PUGI__NODETYPE(node) == node_document) { + indent_flags = indent_indent; + + if (node->first_child) { + node = node->first_child; + continue; + } + } else { + node_output_simple(writer, node, flags); + + indent_flags = indent_newline | indent_indent; + } + } + + // continue to the next node + while (node != root) { + if (node->next_sibling) { + node = node->next_sibling; + break; + } + + node = node->parent; + + // write closing node + if (PUGI__NODETYPE(node) == 
node_element) { + depth--; + + if ((indent_flags & indent_newline) && (flags & format_raw) == 0) + writer.write('\n'); + + if ((indent_flags & indent_indent) && indent_length) + text_output_indent(writer, indent, indent_length, depth); + + node_output_end(writer, node); + + indent_flags = indent_newline | indent_indent; + } + } + } while (node != root); + + if ((indent_flags & indent_newline) && (flags & format_raw) == 0) + writer.write('\n'); +} + +PUGI__FN bool has_declaration(xml_node_struct* node) +{ + for (xml_node_struct* child = node->first_child; child; child = child->next_sibling) { + xml_node_type type = PUGI__NODETYPE(child); + + if (type == node_declaration) return true; + if (type == node_element) return false; + } + + return false; +} + +PUGI__FN bool is_attribute_of(xml_attribute_struct* attr, xml_node_struct* node) +{ + for (xml_attribute_struct* a = node->first_attribute; a; a = a->next_attribute) + if (a == attr) + return true; + + return false; +} + +PUGI__FN bool allow_insert_attribute(xml_node_type parent) +{ + return parent == node_element || parent == node_declaration; +} + +PUGI__FN bool allow_insert_child(xml_node_type parent, xml_node_type child) +{ + if (parent != node_document && parent != node_element) return false; + if (child == node_document || child == node_null) return false; + if (parent != node_document && (child == node_declaration || child == node_doctype)) return false; + + return true; +} + +PUGI__FN bool allow_move(xml_node parent, xml_node child) +{ + // check that child can be a child of parent + if (!allow_insert_child(parent.type(), child.type())) + return false; + + // check that node is not moved between documents + if (parent.root() != child.root()) + return false; + + // check that new parent is not in the child subtree + xml_node cur = parent; + + while (cur) { + if (cur == child) + return false; + + cur = cur.parent(); + } + + return true; +} + +template +PUGI__FN void node_copy_string(String& dest, Header& header, 
uintptr_t header_mask, char_t* source, Header& source_header, xml_allocator* alloc) +{ + assert(!dest && (header & header_mask) == 0); + + if (source) { + if (alloc && (source_header & header_mask) == 0) { + dest = source; + + // since strcpy_insitu can reuse document buffer memory we need to mark both source and dest as shared + header |= xml_memory_page_contents_shared_mask; + source_header |= xml_memory_page_contents_shared_mask; + } else + strcpy_insitu(dest, header, header_mask, source, strlength(source)); + } +} + +PUGI__FN void node_copy_contents(xml_node_struct* dn, xml_node_struct* sn, xml_allocator* shared_alloc) +{ + node_copy_string(dn->name, dn->header, xml_memory_page_name_allocated_mask, sn->name, sn->header, shared_alloc); + node_copy_string(dn->value, dn->header, xml_memory_page_value_allocated_mask, sn->value, sn->header, shared_alloc); + + for (xml_attribute_struct* sa = sn->first_attribute; sa; sa = sa->next_attribute) { + xml_attribute_struct* da = append_new_attribute(dn, get_allocator(dn)); + + if (da) { + node_copy_string(da->name, da->header, xml_memory_page_name_allocated_mask, sa->name, sa->header, shared_alloc); + node_copy_string(da->value, da->header, xml_memory_page_value_allocated_mask, sa->value, sa->header, shared_alloc); + } + } +} + +PUGI__FN void node_copy_tree(xml_node_struct* dn, xml_node_struct* sn) +{ + xml_allocator& alloc = get_allocator(dn); + xml_allocator* shared_alloc = (&alloc == &get_allocator(sn)) ? 
&alloc : 0; + + node_copy_contents(dn, sn, shared_alloc); + + xml_node_struct* dit = dn; + xml_node_struct* sit = sn->first_child; + + while (sit && sit != sn) { + if (sit != dn) { + xml_node_struct* copy = append_new_node(dit, alloc, PUGI__NODETYPE(sit)); + + if (copy) { + node_copy_contents(copy, sit, shared_alloc); + + if (sit->first_child) { + dit = copy; + sit = sit->first_child; + continue; + } + } + } + + // continue to the next node + do { + if (sit->next_sibling) { + sit = sit->next_sibling; + break; + } + + sit = sit->parent; + dit = dit->parent; + } while (sit != sn); + } +} + +PUGI__FN void node_copy_attribute(xml_attribute_struct* da, xml_attribute_struct* sa) +{ + xml_allocator& alloc = get_allocator(da); + xml_allocator* shared_alloc = (&alloc == &get_allocator(sa)) ? &alloc : 0; + + node_copy_string(da->name, da->header, xml_memory_page_name_allocated_mask, sa->name, sa->header, shared_alloc); + node_copy_string(da->value, da->header, xml_memory_page_value_allocated_mask, sa->value, sa->header, shared_alloc); +} + +inline bool is_text_node(xml_node_struct* node) +{ + xml_node_type type = PUGI__NODETYPE(node); + + return type == node_pcdata || type == node_cdata; +} + +// get value with conversion functions +template U string_to_integer(const char_t* value, U minneg, U maxpos) +{ + U result = 0; + const char_t* s = value; + + while (PUGI__IS_CHARTYPE(*s, ct_space)) + s++; + + bool negative = (*s == '-'); + + s += (*s == '+' || *s == '-'); + + bool overflow = false; + + if (s[0] == '0' && (s[1] | ' ') == 'x') { + s += 2; + + // since overflow detection relies on length of the sequence skip leading zeros + while (*s == '0') + s++; + + const char_t* start = s; + + for (;;) { + if (static_cast(*s - '0') < 10) + result = result * 16 + (*s - '0'); + else if (static_cast((*s | ' ') - 'a') < 6) + result = result * 16 + ((*s | ' ') - 'a' + 10); + else + break; + + s++; + } + + size_t digits = static_cast(s - start); + + overflow = digits > sizeof(U) * 2; + } 
else { + // since overflow detection relies on length of the sequence skip leading zeros + while (*s == '0') + s++; + + const char_t* start = s; + + for (;;) { + if (static_cast(*s - '0') < 10) + result = result * 10 + (*s - '0'); + else + break; + + s++; + } + + size_t digits = static_cast(s - start); + + PUGI__STATIC_ASSERT(sizeof(U) == 8 || sizeof(U) == 4 || sizeof(U) == 2); + + const size_t max_digits10 = sizeof(U) == 8 ? 20 : sizeof(U) == 4 ? 10 : 5; + const char_t max_lead = sizeof(U) == 8 ? '1' : sizeof(U) == 4 ? '4' : '6'; + const size_t high_bit = sizeof(U) * 8 - 1; + + overflow = digits >= max_digits10 && !(digits == max_digits10 && (*start < max_lead || (*start == max_lead && result >> high_bit))); + } + + if (negative) + return (overflow || result > minneg) ? 0 - minneg : 0 - result; + else + return (overflow || result > maxpos) ? maxpos : result; +} + +PUGI__FN int get_value_int(const char_t* value) +{ + return string_to_integer(value, 0 - static_cast(INT_MIN), INT_MAX); +} + +PUGI__FN unsigned int get_value_uint(const char_t* value) +{ + return string_to_integer(value, 0, UINT_MAX); +} + +PUGI__FN double get_value_double(const char_t* value) +{ +#ifdef PUGIXML_WCHAR_MODE + return wcstod(value, 0); +#else + return strtod(value, 0); +#endif +} + +PUGI__FN float get_value_float(const char_t* value) +{ +#ifdef PUGIXML_WCHAR_MODE + return static_cast(wcstod(value, 0)); +#else + return static_cast(strtod(value, 0)); +#endif +} + +PUGI__FN bool get_value_bool(const char_t* value) +{ + // only look at first char + char_t first = *value; + + // 1*, t* (true), T* (True), y* (yes), Y* (YES) + return (first == '1' || first == 't' || first == 'T' || first == 'y' || first == 'Y'); +} + +#ifdef PUGIXML_HAS_LONG_LONG +PUGI__FN long long get_value_llong(const char_t* value) +{ + return string_to_integer(value, 0 - static_cast(LLONG_MIN), LLONG_MAX); +} + +PUGI__FN unsigned long long get_value_ullong(const char_t* value) +{ + return string_to_integer(value, 0, 
ULLONG_MAX); +} +#endif + +template +PUGI__FN char_t* integer_to_string(char_t* begin, char_t* end, U value, bool negative) +{ + char_t* result = end - 1; + U rest = negative ? 0 - value : value; + + do { + *result-- = static_cast('0' + (rest % 10)); + rest /= 10; + } while (rest); + + assert(result >= begin); + (void)begin; + + *result = '-'; + + return result + !negative; +} + +// set value with conversion functions +template +PUGI__FN bool set_value_ascii(String& dest, Header& header, uintptr_t header_mask, char* buf) +{ +#ifdef PUGIXML_WCHAR_MODE + char_t wbuf[128]; + assert(strlen(buf) < sizeof(wbuf) / sizeof(wbuf[0])); + + size_t offset = 0; + for (; buf[offset]; ++offset) wbuf[offset] = buf[offset]; + + return strcpy_insitu(dest, header, header_mask, wbuf, offset); +#else + return strcpy_insitu(dest, header, header_mask, buf, strlen(buf)); +#endif +} + +template +PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, int value) +{ + char_t buf[64]; + char_t* end = buf + sizeof(buf) / sizeof(buf[0]); + char_t* begin = integer_to_string(buf, end, value, value < 0); + + return strcpy_insitu(dest, header, header_mask, begin, end - begin); +} + +template +PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, unsigned int value) +{ + char_t buf[64]; + char_t* end = buf + sizeof(buf) / sizeof(buf[0]); + char_t* begin = integer_to_string(buf, end, value, false); + + return strcpy_insitu(dest, header, header_mask, begin, end - begin); +} + +template +PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, float value) +{ + char buf[128]; + sprintf(buf, "%.9g", value); + + return set_value_ascii(dest, header, header_mask, buf); +} + +template +PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, double value) +{ + char buf[128]; + sprintf(buf, "%.17g", value); + + return set_value_ascii(dest, header, header_mask, buf); +} + +template +PUGI__FN bool 
set_value_convert(String& dest, Header& header, uintptr_t header_mask, bool value) +{ + return strcpy_insitu(dest, header, header_mask, value ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false"), value ? 4 : 5); +} + +#ifdef PUGIXML_HAS_LONG_LONG +template +PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, long long value) +{ + char_t buf[64]; + char_t* end = buf + sizeof(buf) / sizeof(buf[0]); + char_t* begin = integer_to_string(buf, end, value, value < 0); + + return strcpy_insitu(dest, header, header_mask, begin, end - begin); +} + +template +PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, unsigned long long value) +{ + char_t buf[64]; + char_t* end = buf + sizeof(buf) / sizeof(buf[0]); + char_t* begin = integer_to_string(buf, end, value, false); + + return strcpy_insitu(dest, header, header_mask, begin, end - begin); +} +#endif + +PUGI__FN xml_parse_result load_buffer_impl(xml_document_struct* doc, xml_node_struct* root, void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own, char_t** out_buffer) +{ + // check input buffer + if (!contents && size) return make_parse_result(status_io_error); + + // get actual encoding + xml_encoding buffer_encoding = impl::get_buffer_encoding(encoding, contents, size); + + // get private buffer + char_t* buffer = 0; + size_t length = 0; + + if (!impl::convert_buffer(buffer, length, buffer_encoding, contents, size, is_mutable)) return impl::make_parse_result(status_out_of_memory); + + // delete original buffer if we performed a conversion + if (own && buffer != contents && contents) impl::xml_memory::deallocate(contents); + + // grab onto buffer if it's our buffer, user is responsible for deallocating contents himself + if (own || buffer != contents) *out_buffer = buffer; + + // store buffer for offset_debug + doc->buffer = buffer; + + // parse + xml_parse_result res = impl::xml_parser::parse(buffer, length, doc, root, 
options); + + // remember encoding + res.encoding = buffer_encoding; + + return res; +} + +// we need to get length of entire file to load it in memory; the only (relatively) sane way to do it is via seek/tell trick +PUGI__FN xml_parse_status get_file_size(FILE* file, size_t& out_result) +{ +#if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE) + // there are 64-bit versions of fseek/ftell, let's use them + typedef __int64 length_type; + + _fseeki64(file, 0, SEEK_END); + length_type length = _ftelli64(file); + _fseeki64(file, 0, SEEK_SET); +#elif defined(__MINGW32__) && !defined(__NO_MINGW_LFS) && (!defined(__STRICT_ANSI__) || defined(__MINGW64_VERSION_MAJOR)) + // there are 64-bit versions of fseek/ftell, let's use them + typedef off64_t length_type; + + fseeko64(file, 0, SEEK_END); + length_type length = ftello64(file); + fseeko64(file, 0, SEEK_SET); +#else + // if this is a 32-bit OS, long is enough; if this is a unix system, long is 64-bit, which is enough; otherwise we can't do anything anyway. 
+ typedef long length_type; + + fseek(file, 0, SEEK_END); + length_type length = ftell(file); + fseek(file, 0, SEEK_SET); +#endif + + // check for I/O errors + if (length < 0) return status_io_error; + + // check for overflow + size_t result = static_cast(length); + + if (static_cast(result) != length) return status_out_of_memory; + + // finalize + out_result = result; + + return status_ok; +} + +// This function assumes that buffer has extra sizeof(char_t) writable bytes after size +PUGI__FN size_t zero_terminate_buffer(void* buffer, size_t size, xml_encoding encoding) +{ + // We only need to zero-terminate if encoding conversion does not do it for us +#ifdef PUGIXML_WCHAR_MODE + xml_encoding wchar_encoding = get_wchar_encoding(); + + if (encoding == wchar_encoding || need_endian_swap_utf(encoding, wchar_encoding)) { + size_t length = size / sizeof(char_t); + + static_cast(buffer)[length] = 0; + return (length + 1) * sizeof(char_t); + } +#else + if (encoding == encoding_utf8) { + static_cast(buffer)[size] = 0; + return size + 1; + } +#endif + + return size; +} + +PUGI__FN xml_parse_result load_file_impl(xml_document_struct* doc, FILE* file, unsigned int options, xml_encoding encoding, char_t** out_buffer) +{ + if (!file) return make_parse_result(status_file_not_found); + + // get file size (can result in I/O errors) + size_t size = 0; + xml_parse_status size_status = get_file_size(file, size); + if (size_status != status_ok) return make_parse_result(size_status); + + size_t max_suffix_size = sizeof(char_t); + + // allocate buffer for the whole file + char* contents = static_cast(xml_memory::allocate(size + max_suffix_size)); + if (!contents) return make_parse_result(status_out_of_memory); + + // read file in memory + size_t read_size = fread(contents, 1, size, file); + + if (read_size != size) { + xml_memory::deallocate(contents); + return make_parse_result(status_io_error); + } + + xml_encoding real_encoding = get_buffer_encoding(encoding, contents, size); + + 
return load_buffer_impl(doc, doc, contents, zero_terminate_buffer(contents, size, real_encoding), options, real_encoding, true, true, out_buffer); +} + +#ifndef PUGIXML_NO_STL +template struct xml_stream_chunk { + static xml_stream_chunk* create() { + void* memory = xml_memory::allocate(sizeof(xml_stream_chunk)); + if (!memory) return 0; + + return new (memory) xml_stream_chunk(); + } + + static void destroy(xml_stream_chunk* chunk) { + // free chunk chain + while (chunk) { + xml_stream_chunk* next_ = chunk->next; + + xml_memory::deallocate(chunk); + + chunk = next_; + } + } + + xml_stream_chunk(): next(0), size(0) { + } + + xml_stream_chunk* next; + size_t size; + + T data[xml_memory_page_size / sizeof(T)]; +}; + +template PUGI__FN xml_parse_status load_stream_data_noseek(std::basic_istream& stream, void** out_buffer, size_t* out_size) +{ + auto_deleter > chunks(0, xml_stream_chunk::destroy); + + // read file to a chunk list + size_t total = 0; + xml_stream_chunk* last = 0; + + while (!stream.eof()) { + // allocate new chunk + xml_stream_chunk* chunk = xml_stream_chunk::create(); + if (!chunk) return status_out_of_memory; + + // append chunk to list + if (last) last = last->next = chunk; + else chunks.data = last = chunk; + + // read data to chunk + stream.read(chunk->data, static_cast(sizeof(chunk->data) / sizeof(T))); + chunk->size = static_cast(stream.gcount()) * sizeof(T); + + // read may set failbit | eofbit in case gcount() is less than read length, so check for other I/O errors + if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error; + + // guard against huge files (chunk size is small enough to make this overflow check work) + if (total + chunk->size < total) return status_out_of_memory; + total += chunk->size; + } + + size_t max_suffix_size = sizeof(char_t); + + // copy chunk list to a contiguous buffer + char* buffer = static_cast(xml_memory::allocate(total + max_suffix_size)); + if (!buffer) return status_out_of_memory; + + char* 
write = buffer; + + for (xml_stream_chunk* chunk = chunks.data; chunk; chunk = chunk->next) { + assert(write + chunk->size <= buffer + total); + memcpy(write, chunk->data, chunk->size); + write += chunk->size; + } + + assert(write == buffer + total); + + // return buffer + *out_buffer = buffer; + *out_size = total; + + return status_ok; +} + +template PUGI__FN xml_parse_status load_stream_data_seek(std::basic_istream& stream, void** out_buffer, size_t* out_size) +{ + // get length of remaining data in stream + typename std::basic_istream::pos_type pos = stream.tellg(); + stream.seekg(0, std::ios::end); + std::streamoff length = stream.tellg() - pos; + stream.seekg(pos); + + if (stream.fail() || pos < 0) return status_io_error; + + // guard against huge files + size_t read_length = static_cast(length); + + if (static_cast(read_length) != length || length < 0) return status_out_of_memory; + + size_t max_suffix_size = sizeof(char_t); + + // read stream data into memory (guard against stream exceptions with buffer holder) + auto_deleter buffer(xml_memory::allocate(read_length * sizeof(T) + max_suffix_size), xml_memory::deallocate); + if (!buffer.data) return status_out_of_memory; + + stream.read(static_cast(buffer.data), static_cast(read_length)); + + // read may set failbit | eofbit in case gcount() is less than read_length (i.e. 
line ending conversion), so check for other I/O errors + if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error; + + // return buffer + size_t actual_length = static_cast(stream.gcount()); + assert(actual_length <= read_length); + + *out_buffer = buffer.release(); + *out_size = actual_length * sizeof(T); + + return status_ok; +} + +template PUGI__FN xml_parse_result load_stream_impl(xml_document_struct* doc, std::basic_istream& stream, unsigned int options, xml_encoding encoding, char_t** out_buffer) +{ + void* buffer = 0; + size_t size = 0; + xml_parse_status status = status_ok; + + // if stream has an error bit set, bail out (otherwise tellg() can fail and we'll clear error bits) + if (stream.fail()) return make_parse_result(status_io_error); + + // load stream to memory (using seek-based implementation if possible, since it's faster and takes less memory) + if (stream.tellg() < 0) { + stream.clear(); // clear error flags that could be set by a failing tellg + status = load_stream_data_noseek(stream, &buffer, &size); + } else + status = load_stream_data_seek(stream, &buffer, &size); + + if (status != status_ok) return make_parse_result(status); + + xml_encoding real_encoding = get_buffer_encoding(encoding, buffer, size); + + return load_buffer_impl(doc, doc, buffer, zero_terminate_buffer(buffer, size, real_encoding), options, real_encoding, true, true, out_buffer); +} +#endif + +#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__) || (defined(__MINGW32__) && (!defined(__STRICT_ANSI__) || defined(__MINGW64_VERSION_MAJOR))) +PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode) +{ + return _wfopen(path, mode); +} +#else +PUGI__FN char* convert_path_heap(const wchar_t* str) +{ + assert(str); + + // first pass: get length in utf8 characters + size_t length = strlength_wide(str); + size_t size = as_utf8_begin(str, length); + + // allocate resulting string + char* result = static_cast(xml_memory::allocate(size + 1)); + if 
(!result) return 0; + + // second pass: convert to utf8 + as_utf8_end(result, size, str, length); + + // zero-terminate + result[size] = 0; + + return result; +} + +PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode) +{ + // there is no standard function to open wide paths, so our best bet is to try utf8 path + char* path_utf8 = convert_path_heap(path); + if (!path_utf8) return 0; + + // convert mode to ASCII (we mirror _wfopen interface) + char mode_ascii[4] = {0}; + for (size_t i = 0; mode[i]; ++i) mode_ascii[i] = static_cast(mode[i]); + + // try to open the utf8 path + FILE* result = fopen(path_utf8, mode_ascii); + + // free dummy buffer + xml_memory::deallocate(path_utf8); + + return result; +} +#endif + +PUGI__FN bool save_file_impl(const xml_document& doc, FILE* file, const char_t* indent, unsigned int flags, xml_encoding encoding) +{ + if (!file) return false; + + xml_writer_file writer(file); + doc.save(writer, indent, flags, encoding); + + return ferror(file) == 0; +} + +struct name_null_sentry { + xml_node_struct* node; + char_t* name; + + name_null_sentry(xml_node_struct* node_): node(node_), name(node_->name) { + node->name = 0; + } + + ~name_null_sentry() { + node->name = name; + } +}; +PUGI__NS_END + +namespace pugi +{ +PUGI__FN xml_writer_file::xml_writer_file(void* file_): file(file_) +{ +} + +PUGI__FN void xml_writer_file::write(const void* data, size_t size) +{ + size_t result = fwrite(data, 1, size, static_cast(file)); + (void)!result; // unfortunately we can't do proper error handling here +} + +#ifndef PUGIXML_NO_STL +PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream >& stream): narrow_stream(&stream), wide_stream(0) +{ +} + +PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream >& stream): narrow_stream(0), wide_stream(&stream) +{ +} + +PUGI__FN void xml_writer_stream::write(const void* data, size_t size) +{ + if (narrow_stream) { + assert(!wide_stream); + 
narrow_stream->write(reinterpret_cast(data), static_cast(size)); + } else { + assert(wide_stream); + assert(size % sizeof(wchar_t) == 0); + + wide_stream->write(reinterpret_cast(data), static_cast(size / sizeof(wchar_t))); + } +} +#endif + +PUGI__FN xml_tree_walker::xml_tree_walker(): _depth(0) +{ +} + +PUGI__FN xml_tree_walker::~xml_tree_walker() +{ +} + +PUGI__FN int xml_tree_walker::depth() const +{ + return _depth; +} + +PUGI__FN bool xml_tree_walker::begin(xml_node&) +{ + return true; +} + +PUGI__FN bool xml_tree_walker::end(xml_node&) +{ + return true; +} + +PUGI__FN xml_attribute::xml_attribute(): _attr(0) +{ +} + +PUGI__FN xml_attribute::xml_attribute(xml_attribute_struct* attr): _attr(attr) +{ +} + +PUGI__FN static void unspecified_bool_xml_attribute(xml_attribute***) +{ +} + +PUGI__FN xml_attribute::operator xml_attribute::unspecified_bool_type() const +{ + return _attr ? unspecified_bool_xml_attribute : 0; +} + +PUGI__FN bool xml_attribute::operator!() const +{ + return !_attr; +} + +PUGI__FN bool xml_attribute::operator==(const xml_attribute& r) const +{ + return (_attr == r._attr); +} + +PUGI__FN bool xml_attribute::operator!=(const xml_attribute& r) const +{ + return (_attr != r._attr); +} + +PUGI__FN bool xml_attribute::operator<(const xml_attribute& r) const +{ + return (_attr < r._attr); +} + +PUGI__FN bool xml_attribute::operator>(const xml_attribute& r) const +{ + return (_attr > r._attr); +} + +PUGI__FN bool xml_attribute::operator<=(const xml_attribute& r) const +{ + return (_attr <= r._attr); +} + +PUGI__FN bool xml_attribute::operator>=(const xml_attribute& r) const +{ + return (_attr >= r._attr); +} + +PUGI__FN xml_attribute xml_attribute::next_attribute() const +{ + return _attr ? xml_attribute(_attr->next_attribute) : xml_attribute(); +} + +PUGI__FN xml_attribute xml_attribute::previous_attribute() const +{ + return _attr && _attr->prev_attribute_c->next_attribute ? 
xml_attribute(_attr->prev_attribute_c) : xml_attribute(); +} + +PUGI__FN const char_t* xml_attribute::as_string(const char_t* def) const +{ + return (_attr && _attr->value) ? _attr->value + 0 : def; +} + +PUGI__FN int xml_attribute::as_int(int def) const +{ + return (_attr && _attr->value) ? impl::get_value_int(_attr->value) : def; +} + +PUGI__FN unsigned int xml_attribute::as_uint(unsigned int def) const +{ + return (_attr && _attr->value) ? impl::get_value_uint(_attr->value) : def; +} + +PUGI__FN double xml_attribute::as_double(double def) const +{ + return (_attr && _attr->value) ? impl::get_value_double(_attr->value) : def; +} + +PUGI__FN float xml_attribute::as_float(float def) const +{ + return (_attr && _attr->value) ? impl::get_value_float(_attr->value) : def; +} + +PUGI__FN bool xml_attribute::as_bool(bool def) const +{ + return (_attr && _attr->value) ? impl::get_value_bool(_attr->value) : def; +} + +#ifdef PUGIXML_HAS_LONG_LONG +PUGI__FN long long xml_attribute::as_llong(long long def) const +{ + return (_attr && _attr->value) ? impl::get_value_llong(_attr->value) : def; +} + +PUGI__FN unsigned long long xml_attribute::as_ullong(unsigned long long def) const +{ + return (_attr && _attr->value) ? impl::get_value_ullong(_attr->value) : def; +} +#endif + +PUGI__FN bool xml_attribute::empty() const +{ + return !_attr; +} + +PUGI__FN const char_t* xml_attribute::name() const +{ + return (_attr && _attr->name) ? _attr->name + 0 : PUGIXML_TEXT(""); +} + +PUGI__FN const char_t* xml_attribute::value() const +{ + return (_attr && _attr->value) ? 
_attr->value + 0 : PUGIXML_TEXT(""); +} + +PUGI__FN size_t xml_attribute::hash_value() const +{ + return static_cast(reinterpret_cast(_attr) / sizeof(xml_attribute_struct)); +} + +PUGI__FN xml_attribute_struct* xml_attribute::internal_object() const +{ + return _attr; +} + +PUGI__FN xml_attribute& xml_attribute::operator=(const char_t* rhs) +{ + set_value(rhs); + return *this; +} + +PUGI__FN xml_attribute& xml_attribute::operator=(int rhs) +{ + set_value(rhs); + return *this; +} + +PUGI__FN xml_attribute& xml_attribute::operator=(unsigned int rhs) +{ + set_value(rhs); + return *this; +} + +PUGI__FN xml_attribute& xml_attribute::operator=(double rhs) +{ + set_value(rhs); + return *this; +} + +PUGI__FN xml_attribute& xml_attribute::operator=(float rhs) +{ + set_value(rhs); + return *this; +} + +PUGI__FN xml_attribute& xml_attribute::operator=(bool rhs) +{ + set_value(rhs); + return *this; +} + +#ifdef PUGIXML_HAS_LONG_LONG +PUGI__FN xml_attribute& xml_attribute::operator=(long long rhs) +{ + set_value(rhs); + return *this; +} + +PUGI__FN xml_attribute& xml_attribute::operator=(unsigned long long rhs) +{ + set_value(rhs); + return *this; +} +#endif + +PUGI__FN bool xml_attribute::set_name(const char_t* rhs) +{ + if (!_attr) return false; + + return impl::strcpy_insitu(_attr->name, _attr->header, impl::xml_memory_page_name_allocated_mask, rhs, impl::strlength(rhs)); +} + +PUGI__FN bool xml_attribute::set_value(const char_t* rhs) +{ + if (!_attr) return false; + + return impl::strcpy_insitu(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs, impl::strlength(rhs)); +} + +PUGI__FN bool xml_attribute::set_value(int rhs) +{ + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); +} + +PUGI__FN bool xml_attribute::set_value(unsigned int rhs) +{ + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, 
impl::xml_memory_page_value_allocated_mask, rhs); +} + +PUGI__FN bool xml_attribute::set_value(double rhs) +{ + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); +} + +PUGI__FN bool xml_attribute::set_value(float rhs) +{ + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); +} + +PUGI__FN bool xml_attribute::set_value(bool rhs) +{ + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); +} + +#ifdef PUGIXML_HAS_LONG_LONG +PUGI__FN bool xml_attribute::set_value(long long rhs) +{ + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); +} + +PUGI__FN bool xml_attribute::set_value(unsigned long long rhs) +{ + if (!_attr) return false; + + return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs); +} +#endif + +#ifdef __BORLANDC__ +PUGI__FN bool operator&&(const xml_attribute& lhs, bool rhs) +{ + return (bool)lhs && rhs; +} + +PUGI__FN bool operator||(const xml_attribute& lhs, bool rhs) +{ + return (bool)lhs || rhs; +} +#endif + +PUGI__FN xml_node::xml_node(): _root(0) +{ +} + +PUGI__FN xml_node::xml_node(xml_node_struct* p): _root(p) +{ +} + +PUGI__FN static void unspecified_bool_xml_node(xml_node***) +{ +} + +PUGI__FN xml_node::operator xml_node::unspecified_bool_type() const +{ + return _root ? unspecified_bool_xml_node : 0; +} + +PUGI__FN bool xml_node::operator!() const +{ + return !_root; +} + +PUGI__FN xml_node::iterator xml_node::begin() const +{ + return iterator(_root ? 
_root->first_child + 0 : 0, _root); +} + +PUGI__FN xml_node::iterator xml_node::end() const +{ + return iterator(0, _root); +} + +PUGI__FN xml_node::attribute_iterator xml_node::attributes_begin() const +{ + return attribute_iterator(_root ? _root->first_attribute + 0 : 0, _root); +} + +PUGI__FN xml_node::attribute_iterator xml_node::attributes_end() const +{ + return attribute_iterator(0, _root); +} + +PUGI__FN xml_object_range xml_node::children() const +{ + return xml_object_range(begin(), end()); +} + +PUGI__FN xml_object_range xml_node::children(const char_t* name_) const +{ + return xml_object_range(xml_named_node_iterator(child(name_)._root, _root, name_), xml_named_node_iterator(0, _root, name_)); +} + +PUGI__FN xml_object_range xml_node::attributes() const +{ + return xml_object_range(attributes_begin(), attributes_end()); +} + +PUGI__FN bool xml_node::operator==(const xml_node& r) const +{ + return (_root == r._root); +} + +PUGI__FN bool xml_node::operator!=(const xml_node& r) const +{ + return (_root != r._root); +} + +PUGI__FN bool xml_node::operator<(const xml_node& r) const +{ + return (_root < r._root); +} + +PUGI__FN bool xml_node::operator>(const xml_node& r) const +{ + return (_root > r._root); +} + +PUGI__FN bool xml_node::operator<=(const xml_node& r) const +{ + return (_root <= r._root); +} + +PUGI__FN bool xml_node::operator>=(const xml_node& r) const +{ + return (_root >= r._root); +} + +PUGI__FN bool xml_node::empty() const +{ + return !_root; +} + +PUGI__FN const char_t* xml_node::name() const +{ + return (_root && _root->name) ? _root->name + 0 : PUGIXML_TEXT(""); +} + +PUGI__FN xml_node_type xml_node::type() const +{ + return _root ? PUGI__NODETYPE(_root) : node_null; +} + +PUGI__FN const char_t* xml_node::value() const +{ + return (_root && _root->value) ? 
_root->value + 0 : PUGIXML_TEXT(""); +} + +PUGI__FN xml_node xml_node::child(const char_t* name_) const +{ + if (!_root) return xml_node(); + + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + if (i->name && impl::strequal(name_, i->name)) return xml_node(i); + + return xml_node(); +} + +PUGI__FN xml_attribute xml_node::attribute(const char_t* name_) const +{ + if (!_root) return xml_attribute(); + + for (xml_attribute_struct* i = _root->first_attribute; i; i = i->next_attribute) + if (i->name && impl::strequal(name_, i->name)) + return xml_attribute(i); + + return xml_attribute(); +} + +PUGI__FN xml_node xml_node::next_sibling(const char_t* name_) const +{ + if (!_root) return xml_node(); + + for (xml_node_struct* i = _root->next_sibling; i; i = i->next_sibling) + if (i->name && impl::strequal(name_, i->name)) return xml_node(i); + + return xml_node(); +} + +PUGI__FN xml_node xml_node::next_sibling() const +{ + return _root ? xml_node(_root->next_sibling) : xml_node(); +} + +PUGI__FN xml_node xml_node::previous_sibling(const char_t* name_) const +{ + if (!_root) return xml_node(); + + for (xml_node_struct* i = _root->prev_sibling_c; i->next_sibling; i = i->prev_sibling_c) + if (i->name && impl::strequal(name_, i->name)) return xml_node(i); + + return xml_node(); +} + +PUGI__FN xml_attribute xml_node::attribute(const char_t* name_, xml_attribute& hint_) const +{ + xml_attribute_struct* hint = hint_._attr; + + // if hint is not an attribute of node, behavior is not defined + assert(!hint || (_root && impl::is_attribute_of(hint, _root))); + + if (!_root) return xml_attribute(); + + // optimistically search from hint up until the end + for (xml_attribute_struct* i = hint; i; i = i->next_attribute) + if (i->name && impl::strequal(name_, i->name)) { + // update hint to maximize efficiency of searching for consecutive attributes + hint_._attr = i->next_attribute; + + return xml_attribute(i); + } + + // wrap around and search from the first 
attribute until the hint + // 'j' null pointer check is technically redundant, but it prevents a crash in case the assertion above fails + for (xml_attribute_struct* j = _root->first_attribute; j && j != hint; j = j->next_attribute) + if (j->name && impl::strequal(name_, j->name)) { + // update hint to maximize efficiency of searching for consecutive attributes + hint_._attr = j->next_attribute; + + return xml_attribute(j); + } + + return xml_attribute(); +} + +PUGI__FN xml_node xml_node::previous_sibling() const +{ + if (!_root) return xml_node(); + + if (_root->prev_sibling_c->next_sibling) return xml_node(_root->prev_sibling_c); + else return xml_node(); +} + +PUGI__FN xml_node xml_node::parent() const +{ + return _root ? xml_node(_root->parent) : xml_node(); +} + +PUGI__FN xml_node xml_node::root() const +{ + return _root ? xml_node(&impl::get_document(_root)) : xml_node(); +} + +PUGI__FN xml_text xml_node::text() const +{ + return xml_text(_root); +} + +PUGI__FN const char_t* xml_node::child_value() const +{ + if (!_root) return PUGIXML_TEXT(""); + + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + if (impl::is_text_node(i) && i->value) + return i->value; + + return PUGIXML_TEXT(""); +} + +PUGI__FN const char_t* xml_node::child_value(const char_t* name_) const +{ + return child(name_).child_value(); +} + +PUGI__FN xml_attribute xml_node::first_attribute() const +{ + return _root ? xml_attribute(_root->first_attribute) : xml_attribute(); +} + +PUGI__FN xml_attribute xml_node::last_attribute() const +{ + return _root && _root->first_attribute ? xml_attribute(_root->first_attribute->prev_attribute_c) : xml_attribute(); +} + +PUGI__FN xml_node xml_node::first_child() const +{ + return _root ? xml_node(_root->first_child) : xml_node(); +} + +PUGI__FN xml_node xml_node::last_child() const +{ + return _root && _root->first_child ? 
xml_node(_root->first_child->prev_sibling_c) : xml_node(); +} + +PUGI__FN bool xml_node::set_name(const char_t* rhs) +{ + xml_node_type type_ = _root ? PUGI__NODETYPE(_root) : node_null; + + if (type_ != node_element && type_ != node_pi && type_ != node_declaration) + return false; + + return impl::strcpy_insitu(_root->name, _root->header, impl::xml_memory_page_name_allocated_mask, rhs, impl::strlength(rhs)); +} + +PUGI__FN bool xml_node::set_value(const char_t* rhs) +{ + xml_node_type type_ = _root ? PUGI__NODETYPE(_root) : node_null; + + if (type_ != node_pcdata && type_ != node_cdata && type_ != node_comment && type_ != node_pi && type_ != node_doctype) + return false; + + return impl::strcpy_insitu(_root->value, _root->header, impl::xml_memory_page_value_allocated_mask, rhs, impl::strlength(rhs)); +} + +PUGI__FN xml_attribute xml_node::append_attribute(const char_t* name_) +{ + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(alloc)); + if (!a) return xml_attribute(); + + impl::append_attribute(a._attr, _root); + + a.set_name(name_); + + return a; +} + +PUGI__FN xml_attribute xml_node::prepend_attribute(const char_t* name_) +{ + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(alloc)); + if (!a) return xml_attribute(); + + impl::prepend_attribute(a._attr, _root); + + a.set_name(name_); + + return a; +} + +PUGI__FN xml_attribute xml_node::insert_attribute_after(const char_t* name_, const xml_attribute& attr) +{ + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + if (!attr || !impl::is_attribute_of(attr._attr, _root)) return xml_attribute(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if 
(!alloc.reserve()) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(alloc)); + if (!a) return xml_attribute(); + + impl::insert_attribute_after(a._attr, attr._attr, _root); + + a.set_name(name_); + + return a; +} + +PUGI__FN xml_attribute xml_node::insert_attribute_before(const char_t* name_, const xml_attribute& attr) +{ + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + if (!attr || !impl::is_attribute_of(attr._attr, _root)) return xml_attribute(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(alloc)); + if (!a) return xml_attribute(); + + impl::insert_attribute_before(a._attr, attr._attr, _root); + + a.set_name(name_); + + return a; +} + +PUGI__FN xml_attribute xml_node::append_copy(const xml_attribute& proto) +{ + if (!proto) return xml_attribute(); + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(alloc)); + if (!a) return xml_attribute(); + + impl::append_attribute(a._attr, _root); + impl::node_copy_attribute(a._attr, proto._attr); + + return a; +} + +PUGI__FN xml_attribute xml_node::prepend_copy(const xml_attribute& proto) +{ + if (!proto) return xml_attribute(); + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(alloc)); + if (!a) return xml_attribute(); + + impl::prepend_attribute(a._attr, _root); + impl::node_copy_attribute(a._attr, proto._attr); + + return a; +} + +PUGI__FN xml_attribute xml_node::insert_copy_after(const xml_attribute& proto, const xml_attribute& attr) +{ + if (!proto) return xml_attribute(); + if (!impl::allow_insert_attribute(type())) return 
xml_attribute(); + if (!attr || !impl::is_attribute_of(attr._attr, _root)) return xml_attribute(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(alloc)); + if (!a) return xml_attribute(); + + impl::insert_attribute_after(a._attr, attr._attr, _root); + impl::node_copy_attribute(a._attr, proto._attr); + + return a; +} + +PUGI__FN xml_attribute xml_node::insert_copy_before(const xml_attribute& proto, const xml_attribute& attr) +{ + if (!proto) return xml_attribute(); + if (!impl::allow_insert_attribute(type())) return xml_attribute(); + if (!attr || !impl::is_attribute_of(attr._attr, _root)) return xml_attribute(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_attribute(); + + xml_attribute a(impl::allocate_attribute(alloc)); + if (!a) return xml_attribute(); + + impl::insert_attribute_before(a._attr, attr._attr, _root); + impl::node_copy_attribute(a._attr, proto._attr); + + return a; +} + +PUGI__FN xml_node xml_node::append_child(xml_node_type type_) +{ + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + xml_node n(impl::allocate_node(alloc, type_)); + if (!n) return xml_node(); + + impl::append_node(n._root, _root); + + if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml")); + + return n; +} + +PUGI__FN xml_node xml_node::prepend_child(xml_node_type type_) +{ + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + xml_node n(impl::allocate_node(alloc, type_)); + if (!n) return xml_node(); + + impl::prepend_node(n._root, _root); + + if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml")); + + return n; +} + +PUGI__FN xml_node 
xml_node::insert_child_before(xml_node_type type_, const xml_node& node) +{ + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + xml_node n(impl::allocate_node(alloc, type_)); + if (!n) return xml_node(); + + impl::insert_node_before(n._root, node._root); + + if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml")); + + return n; +} + +PUGI__FN xml_node xml_node::insert_child_after(xml_node_type type_, const xml_node& node) +{ + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + xml_node n(impl::allocate_node(alloc, type_)); + if (!n) return xml_node(); + + impl::insert_node_after(n._root, node._root); + + if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml")); + + return n; +} + +PUGI__FN xml_node xml_node::append_child(const char_t* name_) +{ + xml_node result = append_child(node_element); + + result.set_name(name_); + + return result; +} + +PUGI__FN xml_node xml_node::prepend_child(const char_t* name_) +{ + xml_node result = prepend_child(node_element); + + result.set_name(name_); + + return result; +} + +PUGI__FN xml_node xml_node::insert_child_after(const char_t* name_, const xml_node& node) +{ + xml_node result = insert_child_after(node_element, node); + + result.set_name(name_); + + return result; +} + +PUGI__FN xml_node xml_node::insert_child_before(const char_t* name_, const xml_node& node) +{ + xml_node result = insert_child_before(node_element, node); + + result.set_name(name_); + + return result; +} + +PUGI__FN xml_node xml_node::append_copy(const xml_node& proto) +{ + xml_node_type type_ = proto.type(); + if (!impl::allow_insert_child(type(), type_)) return 
xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + xml_node n(impl::allocate_node(alloc, type_)); + if (!n) return xml_node(); + + impl::append_node(n._root, _root); + impl::node_copy_tree(n._root, proto._root); + + return n; +} + +PUGI__FN xml_node xml_node::prepend_copy(const xml_node& proto) +{ + xml_node_type type_ = proto.type(); + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + xml_node n(impl::allocate_node(alloc, type_)); + if (!n) return xml_node(); + + impl::prepend_node(n._root, _root); + impl::node_copy_tree(n._root, proto._root); + + return n; +} + +PUGI__FN xml_node xml_node::insert_copy_after(const xml_node& proto, const xml_node& node) +{ + xml_node_type type_ = proto.type(); + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + xml_node n(impl::allocate_node(alloc, type_)); + if (!n) return xml_node(); + + impl::insert_node_after(n._root, node._root); + impl::node_copy_tree(n._root, proto._root); + + return n; +} + +PUGI__FN xml_node xml_node::insert_copy_before(const xml_node& proto, const xml_node& node) +{ + xml_node_type type_ = proto.type(); + if (!impl::allow_insert_child(type(), type_)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + xml_node n(impl::allocate_node(alloc, type_)); + if (!n) return xml_node(); + + impl::insert_node_before(n._root, node._root); + impl::node_copy_tree(n._root, proto._root); + + return n; +} + +PUGI__FN xml_node xml_node::append_move(const xml_node& moved) +{ + if (!impl::allow_move(*this, 
moved)) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers + impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask; + + impl::remove_node(moved._root); + impl::append_node(moved._root, _root); + + return moved; +} + +PUGI__FN xml_node xml_node::prepend_move(const xml_node& moved) +{ + if (!impl::allow_move(*this, moved)) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers + impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask; + + impl::remove_node(moved._root); + impl::prepend_node(moved._root, _root); + + return moved; +} + +PUGI__FN xml_node xml_node::insert_move_after(const xml_node& moved, const xml_node& node) +{ + if (!impl::allow_move(*this, moved)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + if (moved._root == node._root) return xml_node(); + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers + impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask; + + impl::remove_node(moved._root); + impl::insert_node_after(moved._root, node._root); + + return moved; +} + +PUGI__FN xml_node xml_node::insert_move_before(const xml_node& moved, const xml_node& node) +{ + if (!impl::allow_move(*this, moved)) return xml_node(); + if (!node._root || node._root->parent != _root) return xml_node(); + if (moved._root == node._root) return xml_node(); + + impl::xml_allocator& alloc 
= impl::get_allocator(_root); + if (!alloc.reserve()) return xml_node(); + + // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers + impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask; + + impl::remove_node(moved._root); + impl::insert_node_before(moved._root, node._root); + + return moved; +} + +PUGI__FN bool xml_node::remove_attribute(const char_t* name_) +{ + return remove_attribute(attribute(name_)); +} + +PUGI__FN bool xml_node::remove_attribute(const xml_attribute& a) +{ + if (!_root || !a._attr) return false; + if (!impl::is_attribute_of(a._attr, _root)) return false; + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return false; + + impl::remove_attribute(a._attr, _root); + impl::destroy_attribute(a._attr, alloc); + + return true; +} + +PUGI__FN bool xml_node::remove_child(const char_t* name_) +{ + return remove_child(child(name_)); +} + +PUGI__FN bool xml_node::remove_child(const xml_node& n) +{ + if (!_root || !n._root || n._root->parent != _root) return false; + + impl::xml_allocator& alloc = impl::get_allocator(_root); + if (!alloc.reserve()) return false; + + impl::remove_node(n._root); + impl::destroy_node(n._root, alloc); + + return true; +} + +PUGI__FN xml_parse_result xml_node::append_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding) +{ + // append_buffer is only valid for elements/documents + if (!impl::allow_insert_child(type(), node_element)) return impl::make_parse_result(status_append_invalid_root); + + // get document node + impl::xml_document_struct* doc = &impl::get_document(_root); + + // disable document_buffer_order optimization since in a document with multiple buffers comparing buffer pointers does not make sense + doc->header |= impl::xml_memory_page_contents_shared_mask; + + // get extra buffer element (we'll store the document fragment buffer there so that we 
can deallocate it later) + impl::xml_memory_page* page = 0; + impl::xml_extra_buffer* extra = static_cast(doc->allocate_memory(sizeof(impl::xml_extra_buffer), page)); + (void)page; + + if (!extra) return impl::make_parse_result(status_out_of_memory); + + // add extra buffer to the list + extra->buffer = 0; + extra->next = doc->extra_buffers; + doc->extra_buffers = extra; + + // name of the root has to be NULL before parsing - otherwise closing node mismatches will not be detected at the top level + impl::name_null_sentry sentry(_root); + + return impl::load_buffer_impl(doc, _root, const_cast(contents), size, options, encoding, false, false, &extra->buffer); +} + +PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* name_, const char_t* attr_name, const char_t* attr_value) const +{ + if (!_root) return xml_node(); + + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + if (i->name && impl::strequal(name_, i->name)) { + for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute) + if (a->name && impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value ? a->value + 0 : PUGIXML_TEXT(""))) + return xml_node(i); + } + + return xml_node(); +} + +PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const +{ + if (!_root) return xml_node(); + + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute) + if (a->name && impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value ? a->value + 0 : PUGIXML_TEXT(""))) + return xml_node(i); + + return xml_node(); +} + +#ifndef PUGIXML_NO_STL +PUGI__FN string_t xml_node::path(char_t delimiter) const +{ + if (!_root) return string_t(); + + size_t offset = 0; + + for (xml_node_struct* i = _root; i; i = i->parent) { + offset += (i != _root); + offset += i->name ? 
impl::strlength(i->name) : 0; + } + + string_t result; + result.resize(offset); + + for (xml_node_struct* j = _root; j; j = j->parent) { + if (j != _root) + result[--offset] = delimiter; + + if (j->name && *j->name) { + size_t length = impl::strlength(j->name); + + offset -= length; + memcpy(&result[offset], j->name, length * sizeof(char_t)); + } + } + + assert(offset == 0); + + return result; +} +#endif + +PUGI__FN xml_node xml_node::first_element_by_path(const char_t* path_, char_t delimiter) const +{ + xml_node found = *this; // Current search context. + + if (!_root || !path_ || !path_[0]) return found; + + if (path_[0] == delimiter) { + // Absolute path; e.g. '/foo/bar' + found = found.root(); + ++path_; + } + + const char_t* path_segment = path_; + + while (*path_segment == delimiter) ++path_segment; + + const char_t* path_segment_end = path_segment; + + while (*path_segment_end && *path_segment_end != delimiter) ++path_segment_end; + + if (path_segment == path_segment_end) return found; + + const char_t* next_segment = path_segment_end; + + while (*next_segment == delimiter) ++next_segment; + + if (*path_segment == '.' && path_segment + 1 == path_segment_end) + return found.first_element_by_path(next_segment, delimiter); + else if (*path_segment == '.' && *(path_segment+1) == '.' 
&& path_segment + 2 == path_segment_end) + return found.parent().first_element_by_path(next_segment, delimiter); + else { + for (xml_node_struct* j = found._root->first_child; j; j = j->next_sibling) { + if (j->name && impl::strequalrange(j->name, path_segment, static_cast(path_segment_end - path_segment))) { + xml_node subsearch = xml_node(j).first_element_by_path(next_segment, delimiter); + + if (subsearch) return subsearch; + } + } + + return xml_node(); + } +} + +PUGI__FN bool xml_node::traverse(xml_tree_walker& walker) +{ + walker._depth = -1; + + xml_node arg_begin = *this; + if (!walker.begin(arg_begin)) return false; + + xml_node cur = first_child(); + + if (cur) { + ++walker._depth; + + do { + xml_node arg_for_each = cur; + if (!walker.for_each(arg_for_each)) + return false; + + if (cur.first_child()) { + ++walker._depth; + cur = cur.first_child(); + } else if (cur.next_sibling()) + cur = cur.next_sibling(); + else { + // Borland C++ workaround + while (!cur.next_sibling() && cur != *this && !cur.parent().empty()) { + --walker._depth; + cur = cur.parent(); + } + + if (cur != *this) + cur = cur.next_sibling(); + } + } while (cur && cur != *this); + } + + assert(walker._depth == -1); + + xml_node arg_end = *this; + return walker.end(arg_end); +} + +PUGI__FN size_t xml_node::hash_value() const +{ + return static_cast(reinterpret_cast(_root) / sizeof(xml_node_struct)); +} + +PUGI__FN xml_node_struct* xml_node::internal_object() const +{ + return _root; +} + +PUGI__FN void xml_node::print(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const +{ + if (!_root) return; + + impl::xml_buffered_writer buffered_writer(writer, encoding); + + impl::node_output(buffered_writer, _root, indent, flags, depth); + + buffered_writer.flush(); +} + +#ifndef PUGIXML_NO_STL +PUGI__FN void xml_node::print(std::basic_ostream >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const 
+{ + xml_writer_stream writer(stream); + + print(writer, indent, flags, encoding, depth); +} + +PUGI__FN void xml_node::print(std::basic_ostream >& stream, const char_t* indent, unsigned int flags, unsigned int depth) const +{ + xml_writer_stream writer(stream); + + print(writer, indent, flags, encoding_wchar, depth); +} +#endif + +PUGI__FN ptrdiff_t xml_node::offset_debug() const +{ + if (!_root) return -1; + + impl::xml_document_struct& doc = impl::get_document(_root); + + // we can determine the offset reliably only if there is exactly once parse buffer + if (!doc.buffer || doc.extra_buffers) return -1; + + switch (type()) { + case node_document: + return 0; + + case node_element: + case node_declaration: + case node_pi: + return _root->name && (_root->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0 ? _root->name - doc.buffer : -1; + + case node_pcdata: + case node_cdata: + case node_comment: + case node_doctype: + return _root->value && (_root->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0 ? 
_root->value - doc.buffer : -1; + + default: + return -1; + } +} + +#ifdef __BORLANDC__ +PUGI__FN bool operator&&(const xml_node& lhs, bool rhs) +{ + return (bool)lhs && rhs; +} + +PUGI__FN bool operator||(const xml_node& lhs, bool rhs) +{ + return (bool)lhs || rhs; +} +#endif + +PUGI__FN xml_text::xml_text(xml_node_struct* root): _root(root) +{ +} + +PUGI__FN xml_node_struct* xml_text::_data() const +{ + if (!_root || impl::is_text_node(_root)) return _root; + + for (xml_node_struct* node = _root->first_child; node; node = node->next_sibling) + if (impl::is_text_node(node)) + return node; + + return 0; +} + +PUGI__FN xml_node_struct* xml_text::_data_new() +{ + xml_node_struct* d = _data(); + if (d) return d; + + return xml_node(_root).append_child(node_pcdata).internal_object(); +} + +PUGI__FN xml_text::xml_text(): _root(0) +{ +} + +PUGI__FN static void unspecified_bool_xml_text(xml_text***) +{ +} + +PUGI__FN xml_text::operator xml_text::unspecified_bool_type() const +{ + return _data() ? unspecified_bool_xml_text : 0; +} + +PUGI__FN bool xml_text::operator!() const +{ + return !_data(); +} + +PUGI__FN bool xml_text::empty() const +{ + return _data() == 0; +} + +PUGI__FN const char_t* xml_text::get() const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? d->value + 0 : PUGIXML_TEXT(""); +} + +PUGI__FN const char_t* xml_text::as_string(const char_t* def) const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? d->value + 0 : def; +} + +PUGI__FN int xml_text::as_int(int def) const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? impl::get_value_int(d->value) : def; +} + +PUGI__FN unsigned int xml_text::as_uint(unsigned int def) const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? impl::get_value_uint(d->value) : def; +} + +PUGI__FN double xml_text::as_double(double def) const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? 
impl::get_value_double(d->value) : def; +} + +PUGI__FN float xml_text::as_float(float def) const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? impl::get_value_float(d->value) : def; +} + +PUGI__FN bool xml_text::as_bool(bool def) const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? impl::get_value_bool(d->value) : def; +} + +#ifdef PUGIXML_HAS_LONG_LONG +PUGI__FN long long xml_text::as_llong(long long def) const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? impl::get_value_llong(d->value) : def; +} + +PUGI__FN unsigned long long xml_text::as_ullong(unsigned long long def) const +{ + xml_node_struct* d = _data(); + + return (d && d->value) ? impl::get_value_ullong(d->value) : def; +} +#endif + +PUGI__FN bool xml_text::set(const char_t* rhs) +{ + xml_node_struct* dn = _data_new(); + + return dn ? impl::strcpy_insitu(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs, impl::strlength(rhs)) : false; +} + +PUGI__FN bool xml_text::set(int rhs) +{ + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; +} + +PUGI__FN bool xml_text::set(unsigned int rhs) +{ + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; +} + +PUGI__FN bool xml_text::set(float rhs) +{ + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; +} + +PUGI__FN bool xml_text::set(double rhs) +{ + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; +} + +PUGI__FN bool xml_text::set(bool rhs) +{ + xml_node_struct* dn = _data_new(); + + return dn ? 
impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; +} + +#ifdef PUGIXML_HAS_LONG_LONG +PUGI__FN bool xml_text::set(long long rhs) +{ + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; +} + +PUGI__FN bool xml_text::set(unsigned long long rhs) +{ + xml_node_struct* dn = _data_new(); + + return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false; +} +#endif + +PUGI__FN xml_text& xml_text::operator=(const char_t* rhs) +{ + set(rhs); + return *this; +} + +PUGI__FN xml_text& xml_text::operator=(int rhs) +{ + set(rhs); + return *this; +} + +PUGI__FN xml_text& xml_text::operator=(unsigned int rhs) +{ + set(rhs); + return *this; +} + +PUGI__FN xml_text& xml_text::operator=(double rhs) +{ + set(rhs); + return *this; +} + +PUGI__FN xml_text& xml_text::operator=(float rhs) +{ + set(rhs); + return *this; +} + +PUGI__FN xml_text& xml_text::operator=(bool rhs) +{ + set(rhs); + return *this; +} + +#ifdef PUGIXML_HAS_LONG_LONG +PUGI__FN xml_text& xml_text::operator=(long long rhs) +{ + set(rhs); + return *this; +} + +PUGI__FN xml_text& xml_text::operator=(unsigned long long rhs) +{ + set(rhs); + return *this; +} +#endif + +PUGI__FN xml_node xml_text::data() const +{ + return xml_node(_data()); +} + +#ifdef __BORLANDC__ +PUGI__FN bool operator&&(const xml_text& lhs, bool rhs) +{ + return (bool)lhs && rhs; +} + +PUGI__FN bool operator||(const xml_text& lhs, bool rhs) +{ + return (bool)lhs || rhs; +} +#endif + +PUGI__FN xml_node_iterator::xml_node_iterator() +{ +} + +PUGI__FN xml_node_iterator::xml_node_iterator(const xml_node& node): _wrap(node), _parent(node.parent()) +{ +} + +PUGI__FN xml_node_iterator::xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent) +{ +} + +PUGI__FN bool xml_node_iterator::operator==(const 
xml_node_iterator& rhs) const +{ + return _wrap._root == rhs._wrap._root && _parent._root == rhs._parent._root; +} + +PUGI__FN bool xml_node_iterator::operator!=(const xml_node_iterator& rhs) const +{ + return _wrap._root != rhs._wrap._root || _parent._root != rhs._parent._root; +} + +PUGI__FN xml_node& xml_node_iterator::operator*() const +{ + assert(_wrap._root); + return _wrap; +} + +PUGI__FN xml_node* xml_node_iterator::operator->() const +{ + assert(_wrap._root); + return const_cast(&_wrap); // BCC32 workaround +} + +PUGI__FN const xml_node_iterator& xml_node_iterator::operator++() +{ + assert(_wrap._root); + _wrap._root = _wrap._root->next_sibling; + return *this; +} + +PUGI__FN xml_node_iterator xml_node_iterator::operator++(int) +{ + xml_node_iterator temp = *this; + ++*this; + return temp; +} + +PUGI__FN const xml_node_iterator& xml_node_iterator::operator--() +{ + _wrap = _wrap._root ? _wrap.previous_sibling() : _parent.last_child(); + return *this; +} + +PUGI__FN xml_node_iterator xml_node_iterator::operator--(int) +{ + xml_node_iterator temp = *this; + --*this; + return temp; +} + +PUGI__FN xml_attribute_iterator::xml_attribute_iterator() +{ +} + +PUGI__FN xml_attribute_iterator::xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent): _wrap(attr), _parent(parent) +{ +} + +PUGI__FN xml_attribute_iterator::xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent) +{ +} + +PUGI__FN bool xml_attribute_iterator::operator==(const xml_attribute_iterator& rhs) const +{ + return _wrap._attr == rhs._wrap._attr && _parent._root == rhs._parent._root; +} + +PUGI__FN bool xml_attribute_iterator::operator!=(const xml_attribute_iterator& rhs) const +{ + return _wrap._attr != rhs._wrap._attr || _parent._root != rhs._parent._root; +} + +PUGI__FN xml_attribute& xml_attribute_iterator::operator*() const +{ + assert(_wrap._attr); + return _wrap; +} + +PUGI__FN xml_attribute* 
xml_attribute_iterator::operator->() const +{ + assert(_wrap._attr); + return const_cast(&_wrap); // BCC32 workaround +} + +PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator++() +{ + assert(_wrap._attr); + _wrap._attr = _wrap._attr->next_attribute; + return *this; +} + +PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator++(int) +{ + xml_attribute_iterator temp = *this; + ++*this; + return temp; +} + +PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator--() +{ + _wrap = _wrap._attr ? _wrap.previous_attribute() : _parent.last_attribute(); + return *this; +} + +PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator--(int) +{ + xml_attribute_iterator temp = *this; + --*this; + return temp; +} + +PUGI__FN xml_named_node_iterator::xml_named_node_iterator(): _name(0) +{ +} + +PUGI__FN xml_named_node_iterator::xml_named_node_iterator(const xml_node& node, const char_t* name): _wrap(node), _parent(node.parent()), _name(name) +{ +} + +PUGI__FN xml_named_node_iterator::xml_named_node_iterator(xml_node_struct* ref, xml_node_struct* parent, const char_t* name): _wrap(ref), _parent(parent), _name(name) +{ +} + +PUGI__FN bool xml_named_node_iterator::operator==(const xml_named_node_iterator& rhs) const +{ + return _wrap._root == rhs._wrap._root && _parent._root == rhs._parent._root; +} + +PUGI__FN bool xml_named_node_iterator::operator!=(const xml_named_node_iterator& rhs) const +{ + return _wrap._root != rhs._wrap._root || _parent._root != rhs._parent._root; +} + +PUGI__FN xml_node& xml_named_node_iterator::operator*() const +{ + assert(_wrap._root); + return _wrap; +} + +PUGI__FN xml_node* xml_named_node_iterator::operator->() const +{ + assert(_wrap._root); + return const_cast(&_wrap); // BCC32 workaround +} + +PUGI__FN const xml_named_node_iterator& xml_named_node_iterator::operator++() +{ + assert(_wrap._root); + _wrap = _wrap.next_sibling(_name); + return *this; +} + +PUGI__FN xml_named_node_iterator 
xml_named_node_iterator::operator++(int) +{ + xml_named_node_iterator temp = *this; + ++*this; + return temp; +} + +PUGI__FN const xml_named_node_iterator& xml_named_node_iterator::operator--() +{ + if (_wrap._root) + _wrap = _wrap.previous_sibling(_name); + else { + _wrap = _parent.last_child(); + + if (!impl::strequal(_wrap.name(), _name)) + _wrap = _wrap.previous_sibling(_name); + } + + return *this; +} + +PUGI__FN xml_named_node_iterator xml_named_node_iterator::operator--(int) +{ + xml_named_node_iterator temp = *this; + --*this; + return temp; +} + +PUGI__FN xml_parse_result::xml_parse_result(): status(status_internal_error), offset(0), encoding(encoding_auto) +{ +} + +PUGI__FN xml_parse_result::operator bool() const +{ + return status == status_ok; +} + +PUGI__FN const char* xml_parse_result::description() const +{ + switch (status) { + case status_ok: + return "No error"; + + case status_file_not_found: + return "File was not found"; + case status_io_error: + return "Error reading from file/stream"; + case status_out_of_memory: + return "Could not allocate memory"; + case status_internal_error: + return "Internal error occurred"; + + case status_unrecognized_tag: + return "Could not determine tag type"; + + case status_bad_pi: + return "Error parsing document declaration/processing instruction"; + case status_bad_comment: + return "Error parsing comment"; + case status_bad_cdata: + return "Error parsing CDATA section"; + case status_bad_doctype: + return "Error parsing document type declaration"; + case status_bad_pcdata: + return "Error parsing PCDATA section"; + case status_bad_start_element: + return "Error parsing start element tag"; + case status_bad_attribute: + return "Error parsing element attribute"; + case status_bad_end_element: + return "Error parsing end element tag"; + case status_end_element_mismatch: + return "Start-end tags mismatch"; + + case status_append_invalid_root: + return "Unable to append nodes: root is not an element or document"; 
+ + case status_no_document_element: + return "No document element found"; + + default: + return "Unknown error"; + } +} + +PUGI__FN xml_document::xml_document(): _buffer(0) +{ + create(); +} + +PUGI__FN xml_document::~xml_document() +{ + destroy(); +} + +PUGI__FN void xml_document::reset() +{ + destroy(); + create(); +} + +PUGI__FN void xml_document::reset(const xml_document& proto) +{ + reset(); + + for (xml_node cur = proto.first_child(); cur; cur = cur.next_sibling()) + append_copy(cur); +} + +PUGI__FN void xml_document::create() +{ + assert(!_root); + +#ifdef PUGIXML_COMPACT + const size_t page_offset = sizeof(uint32_t); +#else + const size_t page_offset = 0; +#endif + + // initialize sentinel page + PUGI__STATIC_ASSERT(sizeof(impl::xml_memory_page) + sizeof(impl::xml_document_struct) + impl::xml_memory_page_alignment - sizeof(void*) + page_offset <= sizeof(_memory)); + + // align upwards to page boundary + void* page_memory = reinterpret_cast((reinterpret_cast(_memory) + (impl::xml_memory_page_alignment - 1)) & ~(impl::xml_memory_page_alignment - 1)); + + // prepare page structure + impl::xml_memory_page* page = impl::xml_memory_page::construct(page_memory); + assert(page); + + page->busy_size = impl::xml_memory_page_size; + + // setup first page marker +#ifdef PUGIXML_COMPACT + // round-trip through void* to avoid 'cast increases required alignment of target type' warning + page->compact_page_marker = reinterpret_cast(static_cast(reinterpret_cast(page) + sizeof(impl::xml_memory_page))); + *page->compact_page_marker = sizeof(impl::xml_memory_page); +#endif + + // allocate new root + _root = new (reinterpret_cast(page) + sizeof(impl::xml_memory_page) + page_offset) impl::xml_document_struct(page); + _root->prev_sibling_c = _root; + + // setup sentinel page + page->allocator = static_cast(_root); + + // verify the document allocation + assert(reinterpret_cast(_root) + sizeof(impl::xml_document_struct) <= _memory + sizeof(_memory)); +} + +PUGI__FN void 
xml_document::destroy() +{ + assert(_root); + + // destroy static storage + if (_buffer) { + impl::xml_memory::deallocate(_buffer); + _buffer = 0; + } + + // destroy extra buffers (note: no need to destroy linked list nodes, they're allocated using document allocator) + for (impl::xml_extra_buffer* extra = static_cast(_root)->extra_buffers; extra; extra = extra->next) { + if (extra->buffer) impl::xml_memory::deallocate(extra->buffer); + } + + // destroy dynamic storage, leave sentinel page (it's in static memory) + impl::xml_memory_page* root_page = PUGI__GETPAGE(_root); + assert(root_page && !root_page->prev); + assert(reinterpret_cast(root_page) >= _memory && reinterpret_cast(root_page) < _memory + sizeof(_memory)); + + for (impl::xml_memory_page* page = root_page->next; page; ) { + impl::xml_memory_page* next = page->next; + + impl::xml_allocator::deallocate_page(page); + + page = next; + } + +#ifdef PUGIXML_COMPACT + // destroy hash table + static_cast(_root)->hash.clear(); +#endif + + _root = 0; +} + +#ifndef PUGIXML_NO_STL +PUGI__FN xml_parse_result xml_document::load(std::basic_istream >& stream, unsigned int options, xml_encoding encoding) +{ + reset(); + + return impl::load_stream_impl(static_cast(_root), stream, options, encoding, &_buffer); +} + +PUGI__FN xml_parse_result xml_document::load(std::basic_istream >& stream, unsigned int options) +{ + reset(); + + return impl::load_stream_impl(static_cast(_root), stream, options, encoding_wchar, &_buffer); +} +#endif + +PUGI__FN xml_parse_result xml_document::load_string(const char_t* contents, unsigned int options) +{ + // Force native encoding (skip autodetection) +#ifdef PUGIXML_WCHAR_MODE + xml_encoding encoding = encoding_wchar; +#else + xml_encoding encoding = encoding_utf8; +#endif + + return load_buffer(contents, impl::strlength(contents) * sizeof(char_t), options, encoding); +} + +PUGI__FN xml_parse_result xml_document::load(const char_t* contents, unsigned int options) +{ + return 
load_string(contents, options); +} + +PUGI__FN xml_parse_result xml_document::load_file(const char* path_, unsigned int options, xml_encoding encoding) +{ + reset(); + + using impl::auto_deleter; // MSVC7 workaround + auto_deleter file(fopen(path_, "rb"), fclose); + + return impl::load_file_impl(static_cast(_root), file.data, options, encoding, &_buffer); +} + +PUGI__FN xml_parse_result xml_document::load_file(const wchar_t* path_, unsigned int options, xml_encoding encoding) +{ + reset(); + + using impl::auto_deleter; // MSVC7 workaround + auto_deleter file(impl::open_file_wide(path_, L"rb"), fclose); + + return impl::load_file_impl(static_cast(_root), file.data, options, encoding, &_buffer); +} + +PUGI__FN xml_parse_result xml_document::load_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding) +{ + reset(); + + return impl::load_buffer_impl(static_cast(_root), _root, const_cast(contents), size, options, encoding, false, false, &_buffer); +} + +PUGI__FN xml_parse_result xml_document::load_buffer_inplace(void* contents, size_t size, unsigned int options, xml_encoding encoding) +{ + reset(); + + return impl::load_buffer_impl(static_cast(_root), _root, contents, size, options, encoding, true, false, &_buffer); +} + +PUGI__FN xml_parse_result xml_document::load_buffer_inplace_own(void* contents, size_t size, unsigned int options, xml_encoding encoding) +{ + reset(); + + return impl::load_buffer_impl(static_cast(_root), _root, contents, size, options, encoding, true, true, &_buffer); +} + +PUGI__FN void xml_document::save(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding) const +{ + impl::xml_buffered_writer buffered_writer(writer, encoding); + + if ((flags & format_write_bom) && encoding != encoding_latin1) { + // BOM always represents the codepoint U+FEFF, so just write it in native encoding +#ifdef PUGIXML_WCHAR_MODE + unsigned int bom = 0xfeff; + buffered_writer.write(static_cast(bom)); +#else + 
buffered_writer.write('\xef', '\xbb', '\xbf'); +#endif + } + + if (!(flags & format_no_declaration) && !impl::has_declaration(_root)) { + buffered_writer.write_string(PUGIXML_TEXT("'); + if (!(flags & format_raw)) buffered_writer.write('\n'); + } + + impl::node_output(buffered_writer, _root, indent, flags, 0); + + buffered_writer.flush(); +} + +#ifndef PUGIXML_NO_STL +PUGI__FN void xml_document::save(std::basic_ostream >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding) const +{ + xml_writer_stream writer(stream); + + save(writer, indent, flags, encoding); +} + +PUGI__FN void xml_document::save(std::basic_ostream >& stream, const char_t* indent, unsigned int flags) const +{ + xml_writer_stream writer(stream); + + save(writer, indent, flags, encoding_wchar); +} +#endif + +PUGI__FN bool xml_document::save_file(const char* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const +{ + using impl::auto_deleter; // MSVC7 workaround + auto_deleter file(fopen(path_, (flags & format_save_file_text) ? "w" : "wb"), fclose); + + return impl::save_file_impl(*this, file.data, indent, flags, encoding); +} + +PUGI__FN bool xml_document::save_file(const wchar_t* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const +{ + using impl::auto_deleter; // MSVC7 workaround + auto_deleter file(impl::open_file_wide(path_, (flags & format_save_file_text) ? 
L"w" : L"wb"), fclose); + + return impl::save_file_impl(*this, file.data, indent, flags, encoding); +} + +PUGI__FN xml_node xml_document::document_element() const +{ + assert(_root); + + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + if (PUGI__NODETYPE(i) == node_element) + return xml_node(i); + + return xml_node(); +} + +#ifndef PUGIXML_NO_STL +PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const wchar_t* str) +{ + assert(str); + + return impl::as_utf8_impl(str, impl::strlength_wide(str)); +} + +PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const std::basic_string& str) +{ + return impl::as_utf8_impl(str.c_str(), str.size()); +} + +PUGI__FN std::basic_string PUGIXML_FUNCTION as_wide(const char* str) +{ + assert(str); + + return impl::as_wide_impl(str, strlen(str)); +} + +PUGI__FN std::basic_string PUGIXML_FUNCTION as_wide(const std::string& str) +{ + return impl::as_wide_impl(str.c_str(), str.size()); +} +#endif + +PUGI__FN void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate) +{ + impl::xml_memory::allocate = allocate; + impl::xml_memory::deallocate = deallocate; +} + +PUGI__FN allocation_function PUGIXML_FUNCTION get_memory_allocation_function() +{ + return impl::xml_memory::allocate; +} + +PUGI__FN deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function() +{ + return impl::xml_memory::deallocate; +} +} + +#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC)) +namespace std +{ +// Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier) +PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_node_iterator&) +{ + return std::bidirectional_iterator_tag(); +} + +PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_attribute_iterator&) +{ + return std::bidirectional_iterator_tag(); +} + +PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_named_node_iterator&) +{ 
+ return std::bidirectional_iterator_tag(); +} +} +#endif + +#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC) +namespace std +{ +// Workarounds for (non-standard) iterator category detection +PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_node_iterator&) +{ + return std::bidirectional_iterator_tag(); +} + +PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_attribute_iterator&) +{ + return std::bidirectional_iterator_tag(); +} + +PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_named_node_iterator&) +{ + return std::bidirectional_iterator_tag(); +} +} +#endif + +#ifndef PUGIXML_NO_XPATH +// STL replacements +PUGI__NS_BEGIN +struct equal_to { + template bool operator()(const T& lhs, const T& rhs) const { + return lhs == rhs; + } +}; + +struct not_equal_to { + template bool operator()(const T& lhs, const T& rhs) const { + return lhs != rhs; + } +}; + +struct less { + template bool operator()(const T& lhs, const T& rhs) const { + return lhs < rhs; + } +}; + +struct less_equal { + template bool operator()(const T& lhs, const T& rhs) const { + return lhs <= rhs; + } +}; + +template void swap(T& lhs, T& rhs) +{ + T temp = lhs; + lhs = rhs; + rhs = temp; +} + +template I min_element(I begin, I end, const Pred& pred) +{ + I result = begin; + + for (I it = begin + 1; it != end; ++it) + if (pred(*it, *result)) + result = it; + + return result; +} + +template void reverse(I begin, I end) +{ + while (end - begin > 1) swap(*begin++, *--end); +} + +template I unique(I begin, I end) +{ + // fast skip head + while (end - begin > 1 && *begin != *(begin + 1)) begin++; + + if (begin == end) return begin; + + // last written element + I write = begin++; + + // merge unique elements + while (begin != end) { + if (*begin != *write) + *++write = *begin++; + else + begin++; + } + + // past-the-end (write points to live element) + return write + 1; +} + +template void copy_backwards(I begin, I end, I 
target) +{ + while (begin != end) *--target = *--end; +} + +template void insertion_sort(I begin, I end, const Pred& pred, T*) +{ + assert(begin != end); + + for (I it = begin + 1; it != end; ++it) { + T val = *it; + + if (pred(val, *begin)) { + // move to front + copy_backwards(begin, it, it + 1); + *begin = val; + } else { + I hole = it; + + // move hole backwards + while (pred(val, *(hole - 1))) { + *hole = *(hole - 1); + hole--; + } + + // fill hole with element + *hole = val; + } + } +} + +// std variant for elements with == +template void partition(I begin, I middle, I end, const Pred& pred, I* out_eqbeg, I* out_eqend) +{ + I eqbeg = middle, eqend = middle + 1; + + // expand equal range + while (eqbeg != begin && *(eqbeg - 1) == *eqbeg) --eqbeg; + while (eqend != end && *eqend == *eqbeg) ++eqend; + + // process outer elements + I ltend = eqbeg, gtbeg = eqend; + + for (;;) { + // find the element from the right side that belongs to the left one + for (; gtbeg != end; ++gtbeg) + if (!pred(*eqbeg, *gtbeg)) { + if (*gtbeg == *eqbeg) swap(*gtbeg, *eqend++); + else break; + } + + // find the element from the left side that belongs to the right one + for (; ltend != begin; --ltend) + if (!pred(*(ltend - 1), *eqbeg)) { + if (*eqbeg == *(ltend - 1)) swap(*(ltend - 1), *--eqbeg); + else break; + } + + // scanned all elements + if (gtbeg == end && ltend == begin) { + *out_eqbeg = eqbeg; + *out_eqend = eqend; + return; + } + + // make room for elements by moving equal area + if (gtbeg == end) { + if (--ltend != --eqbeg) swap(*ltend, *eqbeg); + swap(*eqbeg, *--eqend); + } else if (ltend == begin) { + if (eqend != gtbeg) swap(*eqbeg, *eqend); + ++eqend; + swap(*gtbeg++, *eqbeg++); + } else swap(*gtbeg++, *--ltend); + } +} + +template void median3(I first, I middle, I last, const Pred& pred) +{ + if (pred(*middle, *first)) swap(*middle, *first); + if (pred(*last, *middle)) swap(*last, *middle); + if (pred(*middle, *first)) swap(*middle, *first); +} + +template void median(I 
first, I middle, I last, const Pred& pred) +{ + if (last - first <= 40) { + // median of three for small chunks + median3(first, middle, last, pred); + } else { + // median of nine + size_t step = (last - first + 1) / 8; + + median3(first, first + step, first + 2 * step, pred); + median3(middle - step, middle, middle + step, pred); + median3(last - 2 * step, last - step, last, pred); + median3(first + step, middle, last - step, pred); + } +} + +template void sort(I begin, I end, const Pred& pred) +{ + // sort large chunks + while (end - begin > 32) { + // find median element + I middle = begin + (end - begin) / 2; + median(begin, middle, end - 1, pred); + + // partition in three chunks (< = >) + I eqbeg, eqend; + partition(begin, middle, end, pred, &eqbeg, &eqend); + + // loop on larger half + if (eqbeg - begin > end - eqend) { + sort(eqend, end, pred); + end = eqbeg; + } else { + sort(begin, eqbeg, pred); + begin = eqend; + } + } + + // insertion sort small chunk + if (begin != end) insertion_sort(begin, end, pred, &*begin); +} +PUGI__NS_END + +// Allocator used for AST and evaluation stacks +PUGI__NS_BEGIN +static const size_t xpath_memory_page_size = +#ifdef PUGIXML_MEMORY_XPATH_PAGE_SIZE + PUGIXML_MEMORY_XPATH_PAGE_SIZE +#else + 4096 +#endif + ; + +static const uintptr_t xpath_memory_block_alignment = sizeof(double) > sizeof(void*) ? 
sizeof(double) : sizeof(void*); + +struct xpath_memory_block { + xpath_memory_block* next; + size_t capacity; + + union { + char data[xpath_memory_page_size]; + double alignment; + }; +}; + +class xpath_allocator +{ + xpath_memory_block* _root; + size_t _root_size; + +public: +#ifdef PUGIXML_NO_EXCEPTIONS + jmp_buf* error_handler; +#endif + + xpath_allocator(xpath_memory_block* root, size_t root_size = 0): _root(root), _root_size(root_size) { +#ifdef PUGIXML_NO_EXCEPTIONS + error_handler = 0; +#endif + } + + void* allocate_nothrow(size_t size) { + // round size up to block alignment boundary + size = (size + xpath_memory_block_alignment - 1) & ~(xpath_memory_block_alignment - 1); + + if (_root_size + size <= _root->capacity) { + void* buf = &_root->data[0] + _root_size; + _root_size += size; + return buf; + } else { + // make sure we have at least 1/4th of the page free after allocation to satisfy subsequent allocation requests + size_t block_capacity_base = sizeof(_root->data); + size_t block_capacity_req = size + block_capacity_base / 4; + size_t block_capacity = (block_capacity_base > block_capacity_req) ? 
block_capacity_base : block_capacity_req; + + size_t block_size = block_capacity + offsetof(xpath_memory_block, data); + + xpath_memory_block* block = static_cast(xml_memory::allocate(block_size)); + if (!block) return 0; + + block->next = _root; + block->capacity = block_capacity; + + _root = block; + _root_size = size; + + return block->data; + } + } + + void* allocate(size_t size) { + void* result = allocate_nothrow(size); + + if (!result) { +#ifdef PUGIXML_NO_EXCEPTIONS + assert(error_handler); + longjmp(*error_handler, 1); +#else + throw std::bad_alloc(); +#endif + } + + return result; + } + + void* reallocate(void* ptr, size_t old_size, size_t new_size) { + // round size up to block alignment boundary + old_size = (old_size + xpath_memory_block_alignment - 1) & ~(xpath_memory_block_alignment - 1); + new_size = (new_size + xpath_memory_block_alignment - 1) & ~(xpath_memory_block_alignment - 1); + + // we can only reallocate the last object + assert(ptr == 0 || static_cast(ptr) + old_size == &_root->data[0] + _root_size); + + // adjust root size so that we have not allocated the object at all + bool only_object = (_root_size == old_size); + + if (ptr) _root_size -= old_size; + + // allocate a new version (this will obviously reuse the memory if possible) + void* result = allocate(new_size); + assert(result); + + // we have a new block + if (result != ptr && ptr) { + // copy old data + assert(new_size >= old_size); + memcpy(result, ptr, old_size); + + // free the previous page if it had no other objects + if (only_object) { + assert(_root->data == result); + assert(_root->next); + + xpath_memory_block* next = _root->next->next; + + if (next) { + // deallocate the whole page, unless it was the first one + xml_memory::deallocate(_root->next); + _root->next = next; + } + } + } + + return result; + } + + void revert(const xpath_allocator& state) { + // free all new pages + xpath_memory_block* cur = _root; + + while (cur != state._root) { + xpath_memory_block* next = 
cur->next; + + xml_memory::deallocate(cur); + + cur = next; + } + + // restore state + _root = state._root; + _root_size = state._root_size; + } + + void release() { + xpath_memory_block* cur = _root; + assert(cur); + + while (cur->next) { + xpath_memory_block* next = cur->next; + + xml_memory::deallocate(cur); + + cur = next; + } + } +}; + +struct xpath_allocator_capture { + xpath_allocator_capture(xpath_allocator* alloc): _target(alloc), _state(*alloc) { + } + + ~xpath_allocator_capture() { + _target->revert(_state); + } + + xpath_allocator* _target; + xpath_allocator _state; +}; + +struct xpath_stack { + xpath_allocator* result; + xpath_allocator* temp; +}; + +struct xpath_stack_data { + xpath_memory_block blocks[2]; + xpath_allocator result; + xpath_allocator temp; + xpath_stack stack; + +#ifdef PUGIXML_NO_EXCEPTIONS + jmp_buf error_handler; +#endif + + xpath_stack_data(): result(blocks + 0), temp(blocks + 1) { + blocks[0].next = blocks[1].next = 0; + blocks[0].capacity = blocks[1].capacity = sizeof(blocks[0].data); + + stack.result = &result; + stack.temp = &temp; + +#ifdef PUGIXML_NO_EXCEPTIONS + result.error_handler = temp.error_handler = &error_handler; +#endif + } + + ~xpath_stack_data() { + result.release(); + temp.release(); + } +}; +PUGI__NS_END + +// String class +PUGI__NS_BEGIN +class xpath_string +{ + const char_t* _buffer; + bool _uses_heap; + size_t _length_heap; + + static char_t* duplicate_string(const char_t* string, size_t length, xpath_allocator* alloc) { + char_t* result = static_cast(alloc->allocate((length + 1) * sizeof(char_t))); + assert(result); + + memcpy(result, string, length * sizeof(char_t)); + result[length] = 0; + + return result; + } + + xpath_string(const char_t* buffer, bool uses_heap_, size_t length_heap): _buffer(buffer), _uses_heap(uses_heap_), _length_heap(length_heap) { + } + +public: + static xpath_string from_const(const char_t* str) { + return xpath_string(str, false, 0); + } + + static xpath_string 
from_heap_preallocated(const char_t* begin, const char_t* end) { + assert(begin <= end && *end == 0); + + return xpath_string(begin, true, static_cast(end - begin)); + } + + static xpath_string from_heap(const char_t* begin, const char_t* end, xpath_allocator* alloc) { + assert(begin <= end); + + size_t length = static_cast(end - begin); + + return length == 0 ? xpath_string() : xpath_string(duplicate_string(begin, length, alloc), true, length); + } + + xpath_string(): _buffer(PUGIXML_TEXT("")), _uses_heap(false), _length_heap(0) { + } + + void append(const xpath_string& o, xpath_allocator* alloc) { + // skip empty sources + if (!*o._buffer) return; + + // fast append for constant empty target and constant source + if (!*_buffer && !_uses_heap && !o._uses_heap) { + _buffer = o._buffer; + } else { + // need to make heap copy + size_t target_length = length(); + size_t source_length = o.length(); + size_t result_length = target_length + source_length; + + // allocate new buffer + char_t* result = static_cast(alloc->reallocate(_uses_heap ? const_cast(_buffer) : 0, (target_length + 1) * sizeof(char_t), (result_length + 1) * sizeof(char_t))); + assert(result); + + // append first string to the new buffer in case there was no reallocation + if (!_uses_heap) memcpy(result, _buffer, target_length * sizeof(char_t)); + + // append second string to the new buffer + memcpy(result + target_length, o._buffer, source_length * sizeof(char_t)); + result[result_length] = 0; + + // finalize + _buffer = result; + _uses_heap = true; + _length_heap = result_length; + } + } + + const char_t* c_str() const { + return _buffer; + } + + size_t length() const { + return _uses_heap ? 
_length_heap : strlength(_buffer); + } + + char_t* data(xpath_allocator* alloc) { + // make private heap copy + if (!_uses_heap) { + size_t length_ = strlength(_buffer); + + _buffer = duplicate_string(_buffer, length_, alloc); + _uses_heap = true; + _length_heap = length_; + } + + return const_cast(_buffer); + } + + bool empty() const { + return *_buffer == 0; + } + + bool operator==(const xpath_string& o) const { + return strequal(_buffer, o._buffer); + } + + bool operator!=(const xpath_string& o) const { + return !strequal(_buffer, o._buffer); + } + + bool uses_heap() const { + return _uses_heap; + } +}; +PUGI__NS_END + +PUGI__NS_BEGIN +PUGI__FN bool starts_with(const char_t* string, const char_t* pattern) +{ + while (*pattern && *string == *pattern) { + string++; + pattern++; + } + + return *pattern == 0; +} + +PUGI__FN const char_t* find_char(const char_t* s, char_t c) +{ +#ifdef PUGIXML_WCHAR_MODE + return wcschr(s, c); +#else + return strchr(s, c); +#endif +} + +PUGI__FN const char_t* find_substring(const char_t* s, const char_t* p) +{ +#ifdef PUGIXML_WCHAR_MODE + // MSVC6 wcsstr bug workaround (if s is empty it always returns 0) + return (*p == 0) ? s : wcsstr(s, p); +#else + return strstr(s, p); +#endif +} + +// Converts symbol to lower case, if it is an ASCII one +PUGI__FN char_t tolower_ascii(char_t ch) +{ + return static_cast(ch - 'A') < 26 ? 
static_cast(ch | ' ') : ch; +} + +PUGI__FN xpath_string string_value(const xpath_node& na, xpath_allocator* alloc) +{ + if (na.attribute()) + return xpath_string::from_const(na.attribute().value()); + else { + xml_node n = na.node(); + + switch (n.type()) { + case node_pcdata: + case node_cdata: + case node_comment: + case node_pi: + return xpath_string::from_const(n.value()); + + case node_document: + case node_element: { + xpath_string result; + + xml_node cur = n.first_child(); + + while (cur && cur != n) { + if (cur.type() == node_pcdata || cur.type() == node_cdata) + result.append(xpath_string::from_const(cur.value()), alloc); + + if (cur.first_child()) + cur = cur.first_child(); + else if (cur.next_sibling()) + cur = cur.next_sibling(); + else { + while (!cur.next_sibling() && cur != n) + cur = cur.parent(); + + if (cur != n) cur = cur.next_sibling(); + } + } + + return result; + } + + default: + return xpath_string(); + } + } +} + +PUGI__FN bool node_is_before_sibling(xml_node_struct* ln, xml_node_struct* rn) +{ + assert(ln->parent == rn->parent); + + // there is no common ancestor (the shared parent is null), nodes are from different documents + if (!ln->parent) return ln < rn; + + // determine sibling order + xml_node_struct* ls = ln; + xml_node_struct* rs = rn; + + while (ls && rs) { + if (ls == rn) return true; + if (rs == ln) return false; + + ls = ls->next_sibling; + rs = rs->next_sibling; + } + + // if rn sibling chain ended ln must be before rn + return !rs; +} + +PUGI__FN bool node_is_before(xml_node_struct* ln, xml_node_struct* rn) +{ + // find common ancestor at the same depth, if any + xml_node_struct* lp = ln; + xml_node_struct* rp = rn; + + while (lp && rp && lp->parent != rp->parent) { + lp = lp->parent; + rp = rp->parent; + } + + // parents are the same! 
+ if (lp && rp) return node_is_before_sibling(lp, rp); + + // nodes are at different depths, need to normalize heights + bool left_higher = !lp; + + while (lp) { + lp = lp->parent; + ln = ln->parent; + } + + while (rp) { + rp = rp->parent; + rn = rn->parent; + } + + // one node is the ancestor of the other + if (ln == rn) return left_higher; + + // find common ancestor... again + while (ln->parent != rn->parent) { + ln = ln->parent; + rn = rn->parent; + } + + return node_is_before_sibling(ln, rn); +} + +PUGI__FN bool node_is_ancestor(xml_node_struct* parent, xml_node_struct* node) +{ + while (node && node != parent) node = node->parent; + + return parent && node == parent; +} + +PUGI__FN const void* document_buffer_order(const xpath_node& xnode) +{ + xml_node_struct* node = xnode.node().internal_object(); + + if (node) { + if ((get_document(node).header & xml_memory_page_contents_shared_mask) == 0) { + if (node->name && (node->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0) return node->name; + if (node->value && (node->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0) return node->value; + } + + return 0; + } + + xml_attribute_struct* attr = xnode.attribute().internal_object(); + + if (attr) { + if ((get_document(attr).header & xml_memory_page_contents_shared_mask) == 0) { + if ((attr->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0) return attr->name; + if ((attr->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0) return attr->value; + } + + return 0; + } + + return 0; +} + +struct document_order_comparator { + bool operator()(const xpath_node& lhs, const xpath_node& rhs) const { + // optimized document order based check + const void* lo = document_buffer_order(lhs); + const void* ro = document_buffer_order(rhs); + + if (lo && ro) return lo < ro; + + // slow comparison + xml_node ln = lhs.node(), rn = rhs.node(); + + // compare attributes + if (lhs.attribute() && rhs.attribute()) { + // 
shared parent + if (lhs.parent() == rhs.parent()) { + // determine sibling order + for (xml_attribute a = lhs.attribute(); a; a = a.next_attribute()) + if (a == rhs.attribute()) + return true; + + return false; + } + + // compare attribute parents + ln = lhs.parent(); + rn = rhs.parent(); + } else if (lhs.attribute()) { + // attributes go after the parent element + if (lhs.parent() == rhs.node()) return false; + + ln = lhs.parent(); + } else if (rhs.attribute()) { + // attributes go after the parent element + if (rhs.parent() == lhs.node()) return true; + + rn = rhs.parent(); + } + + if (ln == rn) return false; + + if (!ln || !rn) return ln < rn; + + return node_is_before(ln.internal_object(), rn.internal_object()); + } +}; + +struct duplicate_comparator { + bool operator()(const xpath_node& lhs, const xpath_node& rhs) const { + if (lhs.attribute()) return rhs.attribute() ? lhs.attribute() < rhs.attribute() : true; + else return rhs.attribute() ? false : lhs.node() < rhs.node(); + } +}; + +PUGI__FN double gen_nan() +{ +#if defined(__STDC_IEC_559__) || ((FLT_RADIX - 0 == 2) && (FLT_MAX_EXP - 0 == 128) && (FLT_MANT_DIG - 0 == 24)) + union { + float f; + uint32_t i; + } u[sizeof(float) == sizeof(uint32_t) ? 1 : -1]; + u[0].i = 0x7fc00000; + return u[0].f; +#else + // fallback + const volatile double zero = 0.0; + return zero / zero; +#endif +} + +PUGI__FN bool is_nan(double value) +{ +#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__) + return !!_isnan(value); +#elif defined(fpclassify) && defined(FP_NAN) + return fpclassify(value) == FP_NAN; +#else + // fallback + const volatile double v = value; + return v != v; +#endif +} + +PUGI__FN const char_t* convert_number_to_string_special(double value) +{ +#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__) + if (_finite(value)) return (value == 0) ? PUGIXML_TEXT("0") : 0; + if (_isnan(value)) return PUGIXML_TEXT("NaN"); + return value > 0 ? 
PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity"); +#elif defined(fpclassify) && defined(FP_NAN) && defined(FP_INFINITE) && defined(FP_ZERO) + switch (fpclassify(value)) { + case FP_NAN: + return PUGIXML_TEXT("NaN"); + + case FP_INFINITE: + return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity"); + + case FP_ZERO: + return PUGIXML_TEXT("0"); + + default: + return 0; + } +#else + // fallback + const volatile double v = value; + + if (v == 0) return PUGIXML_TEXT("0"); + if (v != v) return PUGIXML_TEXT("NaN"); + if (v * 2 == v) return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity"); + return 0; +#endif +} + +PUGI__FN bool convert_number_to_boolean(double value) +{ + return (value != 0 && !is_nan(value)); +} + +PUGI__FN void truncate_zeros(char* begin, char* end) +{ + while (begin != end && end[-1] == '0') end--; + + *end = 0; +} + +// gets mantissa digits in the form of 0.xxxxx with 0. implied and the exponent +#if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE) +PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent) +{ + // get base values + int sign, exponent; + _ecvt_s(buffer, buffer_size, value, DBL_DIG + 1, &exponent, &sign); + + // truncate redundant zeros + truncate_zeros(buffer, buffer + strlen(buffer)); + + // fill results + *out_mantissa = buffer; + *out_exponent = exponent; +} +#else +PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent) +{ + // get a scientific notation value with IEEE DBL_DIG decimals + sprintf(buffer, "%.*e", DBL_DIG, value); + assert(strlen(buffer) < buffer_size); + (void)!buffer_size; + + // get the exponent (possibly negative) + char* exponent_string = strchr(buffer, 'e'); + assert(exponent_string); + + int exponent = atoi(exponent_string + 1); + + // extract mantissa string: skip sign + char* 
mantissa = buffer[0] == '-' ? buffer + 1 : buffer; + assert(mantissa[0] != '0' && mantissa[1] == '.'); + + // divide mantissa by 10 to eliminate integer part + mantissa[1] = mantissa[0]; + mantissa++; + exponent++; + + // remove extra mantissa digits and zero-terminate mantissa + truncate_zeros(mantissa, exponent_string); + + // fill results + *out_mantissa = mantissa; + *out_exponent = exponent; +} +#endif + +PUGI__FN xpath_string convert_number_to_string(double value, xpath_allocator* alloc) +{ + // try special number conversion + const char_t* special = convert_number_to_string_special(value); + if (special) return xpath_string::from_const(special); + + // get mantissa + exponent form + char mantissa_buffer[32]; + + char* mantissa; + int exponent; + convert_number_to_mantissa_exponent(value, mantissa_buffer, sizeof(mantissa_buffer), &mantissa, &exponent); + + // allocate a buffer of suitable length for the number + size_t result_size = strlen(mantissa_buffer) + (exponent > 0 ? exponent : -exponent) + 4; + char_t* result = static_cast(alloc->allocate(sizeof(char_t) * result_size)); + assert(result); + + // make the number! + char_t* s = result; + + // sign + if (value < 0) *s++ = '-'; + + // integer part + if (exponent <= 0) { + *s++ = '0'; + } else { + while (exponent > 0) { + assert(*mantissa == 0 || static_cast(static_cast(*mantissa) - '0') <= 9); + *s++ = *mantissa ? 
*mantissa++ : '0'; + exponent--; + } + } + + // fractional part + if (*mantissa) { + // decimal point + *s++ = '.'; + + // extra zeroes from negative exponent + while (exponent < 0) { + *s++ = '0'; + exponent++; + } + + // extra mantissa digits + while (*mantissa) { + assert(static_cast(*mantissa - '0') <= 9); + *s++ = *mantissa++; + } + } + + // zero-terminate + assert(s < result + result_size); + *s = 0; + + return xpath_string::from_heap_preallocated(result, s); +} + +PUGI__FN bool check_string_to_number_format(const char_t* string) +{ + // parse leading whitespace + while (PUGI__IS_CHARTYPE(*string, ct_space)) ++string; + + // parse sign + if (*string == '-') ++string; + + if (!*string) return false; + + // if there is no integer part, there should be a decimal part with at least one digit + if (!PUGI__IS_CHARTYPEX(string[0], ctx_digit) && (string[0] != '.' || !PUGI__IS_CHARTYPEX(string[1], ctx_digit))) return false; + + // parse integer part + while (PUGI__IS_CHARTYPEX(*string, ctx_digit)) ++string; + + // parse decimal part + if (*string == '.') { + ++string; + + while (PUGI__IS_CHARTYPEX(*string, ctx_digit)) ++string; + } + + // parse trailing whitespace + while (PUGI__IS_CHARTYPE(*string, ct_space)) ++string; + + return *string == 0; +} + +PUGI__FN double convert_string_to_number(const char_t* string) +{ + // check string format + if (!check_string_to_number_format(string)) return gen_nan(); + + // parse string +#ifdef PUGIXML_WCHAR_MODE + return wcstod(string, 0); +#else + return strtod(string, 0); +#endif +} + +PUGI__FN bool convert_string_to_number_scratch(char_t (&buffer)[32], const char_t* begin, const char_t* end, double* out_result) +{ + size_t length = static_cast(end - begin); + char_t* scratch = buffer; + + if (length >= sizeof(buffer) / sizeof(buffer[0])) { + // need to make dummy on-heap copy + scratch = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!scratch) return false; + } + + // copy string to zero-terminated 
buffer and perform conversion + memcpy(scratch, begin, length * sizeof(char_t)); + scratch[length] = 0; + + *out_result = convert_string_to_number(scratch); + + // free dummy buffer + if (scratch != buffer) xml_memory::deallocate(scratch); + + return true; +} + +PUGI__FN double round_nearest(double value) +{ + return floor(value + 0.5); +} + +PUGI__FN double round_nearest_nzero(double value) +{ + // same as round_nearest, but returns -0 for [-0.5, -0] + // ceil is used to differentiate between +0 and -0 (we return -0 for [-0.5, -0] and +0 for +0) + return (value >= -0.5 && value <= 0) ? ceil(value) : floor(value + 0.5); +} + +PUGI__FN const char_t* qualified_name(const xpath_node& node) +{ + return node.attribute() ? node.attribute().name() : node.node().name(); +} + +PUGI__FN const char_t* local_name(const xpath_node& node) +{ + const char_t* name = qualified_name(node); + const char_t* p = find_char(name, ':'); + + return p ? p + 1 : name; +} + +struct namespace_uri_predicate { + const char_t* prefix; + size_t prefix_length; + + namespace_uri_predicate(const char_t* name) { + const char_t* pos = find_char(name, ':'); + + prefix = pos ? name : 0; + prefix_length = pos ? static_cast(pos - name) : 0; + } + + bool operator()(xml_attribute a) const { + const char_t* name = a.name(); + + if (!starts_with(name, PUGIXML_TEXT("xmlns"))) return false; + + return prefix ? 
name[5] == ':' && strequalrange(name + 6, prefix, prefix_length) : name[5] == 0; + } +}; + +PUGI__FN const char_t* namespace_uri(xml_node node) +{ + namespace_uri_predicate pred = node.name(); + + xml_node p = node; + + while (p) { + xml_attribute a = p.find_attribute(pred); + + if (a) return a.value(); + + p = p.parent(); + } + + return PUGIXML_TEXT(""); +} + +PUGI__FN const char_t* namespace_uri(xml_attribute attr, xml_node parent) +{ + namespace_uri_predicate pred = attr.name(); + + // Default namespace does not apply to attributes + if (!pred.prefix) return PUGIXML_TEXT(""); + + xml_node p = parent; + + while (p) { + xml_attribute a = p.find_attribute(pred); + + if (a) return a.value(); + + p = p.parent(); + } + + return PUGIXML_TEXT(""); +} + +PUGI__FN const char_t* namespace_uri(const xpath_node& node) +{ + return node.attribute() ? namespace_uri(node.attribute(), node.parent()) : namespace_uri(node.node()); +} + +PUGI__FN char_t* normalize_space(char_t* buffer) +{ + char_t* write = buffer; + + for (char_t* it = buffer; *it; ) { + char_t ch = *it++; + + if (PUGI__IS_CHARTYPE(ch, ct_space)) { + // replace whitespace sequence with single space + while (PUGI__IS_CHARTYPE(*it, ct_space)) it++; + + // avoid leading spaces + if (write != buffer) *write++ = ' '; + } else *write++ = ch; + } + + // remove trailing space + if (write != buffer && PUGI__IS_CHARTYPE(write[-1], ct_space)) write--; + + // zero-terminate + *write = 0; + + return write; +} + +PUGI__FN char_t* translate(char_t* buffer, const char_t* from, const char_t* to, size_t to_length) +{ + char_t* write = buffer; + + while (*buffer) { + PUGI__DMC_VOLATILE char_t ch = *buffer++; + + const char_t* pos = find_char(from, ch); + + if (!pos) + *write++ = ch; // do not process + else if (static_cast(pos - from) < to_length) + *write++ = to[pos - from]; // replace + } + + // zero-terminate + *write = 0; + + return write; +} + +PUGI__FN unsigned char* translate_table_generate(xpath_allocator* alloc, const char_t* 
from, const char_t* to) +{ + unsigned char table[128] = {0}; + + while (*from) { + unsigned int fc = static_cast(*from); + unsigned int tc = static_cast(*to); + + if (fc >= 128 || tc >= 128) + return 0; + + // code=128 means "skip character" + if (!table[fc]) + table[fc] = static_cast(tc ? tc : 128); + + from++; + if (tc) to++; + } + + for (int i = 0; i < 128; ++i) + if (!table[i]) + table[i] = static_cast(i); + + void* result = alloc->allocate_nothrow(sizeof(table)); + + if (result) { + memcpy(result, table, sizeof(table)); + } + + return static_cast(result); +} + +PUGI__FN char_t* translate_table(char_t* buffer, const unsigned char* table) +{ + char_t* write = buffer; + + while (*buffer) { + char_t ch = *buffer++; + unsigned int index = static_cast(ch); + + if (index < 128) { + unsigned char code = table[index]; + + // code=128 means "skip character" (table size is 128 so 128 can be a special value) + // this code skips these characters without extra branches + *write = static_cast(code); + write += 1 - (code >> 7); + } else { + *write++ = ch; + } + } + + // zero-terminate + *write = 0; + + return write; +} + +inline bool is_xpath_attribute(const char_t* name) +{ + return !(starts_with(name, PUGIXML_TEXT("xmlns")) && (name[5] == 0 || name[5] == ':')); +} + +struct xpath_variable_boolean: xpath_variable { + xpath_variable_boolean(): xpath_variable(xpath_type_boolean), value(false) { + } + + bool value; + char_t name[1]; +}; + +struct xpath_variable_number: xpath_variable { + xpath_variable_number(): xpath_variable(xpath_type_number), value(0) { + } + + double value; + char_t name[1]; +}; + +struct xpath_variable_string: xpath_variable { + xpath_variable_string(): xpath_variable(xpath_type_string), value(0) { + } + + ~xpath_variable_string() { + if (value) xml_memory::deallocate(value); + } + + char_t* value; + char_t name[1]; +}; + +struct xpath_variable_node_set: xpath_variable { + xpath_variable_node_set(): xpath_variable(xpath_type_node_set) { + } + + 
xpath_node_set value; + char_t name[1]; +}; + +static const xpath_node_set dummy_node_set; + +PUGI__FN unsigned int hash_string(const char_t* str) +{ + // Jenkins one-at-a-time hash (http://en.wikipedia.org/wiki/Jenkins_hash_function#one-at-a-time) + unsigned int result = 0; + + while (*str) { + result += static_cast(*str++); + result += result << 10; + result ^= result >> 6; + } + + result += result << 3; + result ^= result >> 11; + result += result << 15; + + return result; +} + +template PUGI__FN T* new_xpath_variable(const char_t* name) +{ + size_t length = strlength(name); + if (length == 0) return 0; // empty variable names are invalid + + // $$ we can't use offsetof(T, name) because T is non-POD, so we just allocate additional length characters + void* memory = xml_memory::allocate(sizeof(T) + length * sizeof(char_t)); + if (!memory) return 0; + + T* result = new (memory) T(); + + memcpy(result->name, name, (length + 1) * sizeof(char_t)); + + return result; +} + +PUGI__FN xpath_variable* new_xpath_variable(xpath_value_type type, const char_t* name) +{ + switch (type) { + case xpath_type_node_set: + return new_xpath_variable(name); + + case xpath_type_number: + return new_xpath_variable(name); + + case xpath_type_string: + return new_xpath_variable(name); + + case xpath_type_boolean: + return new_xpath_variable(name); + + default: + return 0; + } +} + +template PUGI__FN void delete_xpath_variable(T* var) +{ + var->~T(); + xml_memory::deallocate(var); +} + +PUGI__FN void delete_xpath_variable(xpath_value_type type, xpath_variable* var) +{ + switch (type) { + case xpath_type_node_set: + delete_xpath_variable(static_cast(var)); + break; + + case xpath_type_number: + delete_xpath_variable(static_cast(var)); + break; + + case xpath_type_string: + delete_xpath_variable(static_cast(var)); + break; + + case xpath_type_boolean: + delete_xpath_variable(static_cast(var)); + break; + + default: + assert(!"Invalid variable type"); + } +} + +PUGI__FN bool 
copy_xpath_variable(xpath_variable* lhs, const xpath_variable* rhs) +{ + switch (rhs->type()) { + case xpath_type_node_set: + return lhs->set(static_cast(rhs)->value); + + case xpath_type_number: + return lhs->set(static_cast(rhs)->value); + + case xpath_type_string: + return lhs->set(static_cast(rhs)->value); + + case xpath_type_boolean: + return lhs->set(static_cast(rhs)->value); + + default: + assert(!"Invalid variable type"); + return false; + } +} + +PUGI__FN bool get_variable_scratch(char_t (&buffer)[32], xpath_variable_set* set, const char_t* begin, const char_t* end, xpath_variable** out_result) +{ + size_t length = static_cast(end - begin); + char_t* scratch = buffer; + + if (length >= sizeof(buffer) / sizeof(buffer[0])) { + // need to make dummy on-heap copy + scratch = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!scratch) return false; + } + + // copy string to zero-terminated buffer and perform lookup + memcpy(scratch, begin, length * sizeof(char_t)); + scratch[length] = 0; + + *out_result = set->get(scratch); + + // free dummy buffer + if (scratch != buffer) xml_memory::deallocate(scratch); + + return true; +} +PUGI__NS_END + +// Internal node set class +PUGI__NS_BEGIN +PUGI__FN xpath_node_set::type_t xpath_get_order(const xpath_node* begin, const xpath_node* end) +{ + if (end - begin < 2) + return xpath_node_set::type_sorted; + + document_order_comparator cmp; + + bool first = cmp(begin[0], begin[1]); + + for (const xpath_node* it = begin + 1; it + 1 < end; ++it) + if (cmp(it[0], it[1]) != first) + return xpath_node_set::type_unsorted; + + return first ? xpath_node_set::type_sorted : xpath_node_set::type_sorted_reverse; +} + +PUGI__FN xpath_node_set::type_t xpath_sort(xpath_node* begin, xpath_node* end, xpath_node_set::type_t type, bool rev) +{ + xpath_node_set::type_t order = rev ? 
xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted; + + if (type == xpath_node_set::type_unsorted) { + xpath_node_set::type_t sorted = xpath_get_order(begin, end); + + if (sorted == xpath_node_set::type_unsorted) { + sort(begin, end, document_order_comparator()); + + type = xpath_node_set::type_sorted; + } else + type = sorted; + } + + if (type != order) reverse(begin, end); + + return order; +} + +PUGI__FN xpath_node xpath_first(const xpath_node* begin, const xpath_node* end, xpath_node_set::type_t type) +{ + if (begin == end) return xpath_node(); + + switch (type) { + case xpath_node_set::type_sorted: + return *begin; + + case xpath_node_set::type_sorted_reverse: + return *(end - 1); + + case xpath_node_set::type_unsorted: + return *min_element(begin, end, document_order_comparator()); + + default: + assert(!"Invalid node set type"); + return xpath_node(); + } +} + +class xpath_node_set_raw +{ + xpath_node_set::type_t _type; + + xpath_node* _begin; + xpath_node* _end; + xpath_node* _eos; + +public: + xpath_node_set_raw(): _type(xpath_node_set::type_unsorted), _begin(0), _end(0), _eos(0) { + } + + xpath_node* begin() const { + return _begin; + } + + xpath_node* end() const { + return _end; + } + + bool empty() const { + return _begin == _end; + } + + size_t size() const { + return static_cast(_end - _begin); + } + + xpath_node first() const { + return xpath_first(_begin, _end, _type); + } + + void push_back_grow(const xpath_node& node, xpath_allocator* alloc); + + void push_back(const xpath_node& node, xpath_allocator* alloc) { + if (_end != _eos) + *_end++ = node; + else + push_back_grow(node, alloc); + } + + void append(const xpath_node* begin_, const xpath_node* end_, xpath_allocator* alloc) { + if (begin_ == end_) return; + + size_t size_ = static_cast(_end - _begin); + size_t capacity = static_cast(_eos - _begin); + size_t count = static_cast(end_ - begin_); + + if (size_ + count > capacity) { + // reallocate the old array or allocate a new one 
+ xpath_node* data = static_cast(alloc->reallocate(_begin, capacity * sizeof(xpath_node), (size_ + count) * sizeof(xpath_node))); + assert(data); + + // finalize + _begin = data; + _end = data + size_; + _eos = data + size_ + count; + } + + memcpy(_end, begin_, count * sizeof(xpath_node)); + _end += count; + } + + void sort_do() { + _type = xpath_sort(_begin, _end, _type, false); + } + + void truncate(xpath_node* pos) { + assert(_begin <= pos && pos <= _end); + + _end = pos; + } + + void remove_duplicates() { + if (_type == xpath_node_set::type_unsorted) + sort(_begin, _end, duplicate_comparator()); + + _end = unique(_begin, _end); + } + + xpath_node_set::type_t type() const { + return _type; + } + + void set_type(xpath_node_set::type_t value) { + _type = value; + } +}; + +PUGI__FN_NO_INLINE void xpath_node_set_raw::push_back_grow(const xpath_node& node, xpath_allocator* alloc) +{ + size_t capacity = static_cast(_eos - _begin); + + // get new capacity (1.5x rule) + size_t new_capacity = capacity + capacity / 2 + 1; + + // reallocate the old array or allocate a new one + xpath_node* data = static_cast(alloc->reallocate(_begin, capacity * sizeof(xpath_node), new_capacity * sizeof(xpath_node))); + assert(data); + + // finalize + _begin = data; + _end = data + capacity; + _eos = data + new_capacity; + + // push + *_end++ = node; +} +PUGI__NS_END + +PUGI__NS_BEGIN +struct xpath_context { + xpath_node n; + size_t position, size; + + xpath_context(const xpath_node& n_, size_t position_, size_t size_): n(n_), position(position_), size(size_) { + } +}; + +enum lexeme_t { + lex_none = 0, + lex_equal, + lex_not_equal, + lex_less, + lex_greater, + lex_less_or_equal, + lex_greater_or_equal, + lex_plus, + lex_minus, + lex_multiply, + lex_union, + lex_var_ref, + lex_open_brace, + lex_close_brace, + lex_quoted_string, + lex_number, + lex_slash, + lex_double_slash, + lex_open_square_brace, + lex_close_square_brace, + lex_string, + lex_comma, + lex_axis_attribute, + lex_dot, + 
lex_double_dot, + lex_double_colon, + lex_eof +}; + +struct xpath_lexer_string { + const char_t* begin; + const char_t* end; + + xpath_lexer_string(): begin(0), end(0) { + } + + bool operator==(const char_t* other) const { + size_t length = static_cast(end - begin); + + return strequalrange(other, begin, length); + } +}; + +class xpath_lexer +{ + const char_t* _cur; + const char_t* _cur_lexeme_pos; + xpath_lexer_string _cur_lexeme_contents; + + lexeme_t _cur_lexeme; + +public: + explicit xpath_lexer(const char_t* query): _cur(query) { + next(); + } + + const char_t* state() const { + return _cur; + } + + void next() { + const char_t* cur = _cur; + + while (PUGI__IS_CHARTYPE(*cur, ct_space)) ++cur; + + // save lexeme position for error reporting + _cur_lexeme_pos = cur; + + switch (*cur) { + case 0: + _cur_lexeme = lex_eof; + break; + + case '>': + if (*(cur+1) == '=') { + cur += 2; + _cur_lexeme = lex_greater_or_equal; + } else { + cur += 1; + _cur_lexeme = lex_greater; + } + break; + + case '<': + if (*(cur+1) == '=') { + cur += 2; + _cur_lexeme = lex_less_or_equal; + } else { + cur += 1; + _cur_lexeme = lex_less; + } + break; + + case '!': + if (*(cur+1) == '=') { + cur += 2; + _cur_lexeme = lex_not_equal; + } else { + _cur_lexeme = lex_none; + } + break; + + case '=': + cur += 1; + _cur_lexeme = lex_equal; + + break; + + case '+': + cur += 1; + _cur_lexeme = lex_plus; + + break; + + case '-': + cur += 1; + _cur_lexeme = lex_minus; + + break; + + case '*': + cur += 1; + _cur_lexeme = lex_multiply; + + break; + + case '|': + cur += 1; + _cur_lexeme = lex_union; + + break; + + case '$': + cur += 1; + + if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol)) { + _cur_lexeme_contents.begin = cur; + + while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++; + + if (cur[0] == ':' && PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) { // qname + cur++; // : + + while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++; + } + + _cur_lexeme_contents.end = cur; + + _cur_lexeme = lex_var_ref; + } else { 
+ _cur_lexeme = lex_none; + } + + break; + + case '(': + cur += 1; + _cur_lexeme = lex_open_brace; + + break; + + case ')': + cur += 1; + _cur_lexeme = lex_close_brace; + + break; + + case '[': + cur += 1; + _cur_lexeme = lex_open_square_brace; + + break; + + case ']': + cur += 1; + _cur_lexeme = lex_close_square_brace; + + break; + + case ',': + cur += 1; + _cur_lexeme = lex_comma; + + break; + + case '/': + if (*(cur+1) == '/') { + cur += 2; + _cur_lexeme = lex_double_slash; + } else { + cur += 1; + _cur_lexeme = lex_slash; + } + break; + + case '.': + if (*(cur+1) == '.') { + cur += 2; + _cur_lexeme = lex_double_dot; + } else if (PUGI__IS_CHARTYPEX(*(cur+1), ctx_digit)) { + _cur_lexeme_contents.begin = cur; // . + + ++cur; + + while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++; + + _cur_lexeme_contents.end = cur; + + _cur_lexeme = lex_number; + } else { + cur += 1; + _cur_lexeme = lex_dot; + } + break; + + case '@': + cur += 1; + _cur_lexeme = lex_axis_attribute; + + break; + + case '"': + case '\'': { + char_t terminator = *cur; + + ++cur; + + _cur_lexeme_contents.begin = cur; + while (*cur && *cur != terminator) cur++; + _cur_lexeme_contents.end = cur; + + if (!*cur) + _cur_lexeme = lex_none; + else { + cur += 1; + _cur_lexeme = lex_quoted_string; + } + + break; + } + + case ':': + if (*(cur+1) == ':') { + cur += 2; + _cur_lexeme = lex_double_colon; + } else { + _cur_lexeme = lex_none; + } + break; + + default: + if (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) { + _cur_lexeme_contents.begin = cur; + + while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++; + + if (*cur == '.') { + cur++; + + while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++; + } + + _cur_lexeme_contents.end = cur; + + _cur_lexeme = lex_number; + } else if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol)) { + _cur_lexeme_contents.begin = cur; + + while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++; + + if (cur[0] == ':') { + if (cur[1] == '*') { // namespace test ncname:* + cur += 2; // :* + } else if 
(PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) { // namespace test qname + cur++; // : + + while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++; + } + } + + _cur_lexeme_contents.end = cur; + + _cur_lexeme = lex_string; + } else { + _cur_lexeme = lex_none; + } + } + + _cur = cur; + } + + lexeme_t current() const { + return _cur_lexeme; + } + + const char_t* current_pos() const { + return _cur_lexeme_pos; + } + + const xpath_lexer_string& contents() const { + assert(_cur_lexeme == lex_var_ref || _cur_lexeme == lex_number || _cur_lexeme == lex_string || _cur_lexeme == lex_quoted_string); + + return _cur_lexeme_contents; + } +}; + +enum ast_type_t { + ast_unknown, + ast_op_or, // left or right + ast_op_and, // left and right + ast_op_equal, // left = right + ast_op_not_equal, // left != right + ast_op_less, // left < right + ast_op_greater, // left > right + ast_op_less_or_equal, // left <= right + ast_op_greater_or_equal, // left >= right + ast_op_add, // left + right + ast_op_subtract, // left - right + ast_op_multiply, // left * right + ast_op_divide, // left / right + ast_op_mod, // left % right + ast_op_negate, // left - right + ast_op_union, // left | right + ast_predicate, // apply predicate to set; next points to next predicate + ast_filter, // select * from left where right + ast_string_constant, // string constant + ast_number_constant, // number constant + ast_variable, // variable + ast_func_last, // last() + ast_func_position, // position() + ast_func_count, // count(left) + ast_func_id, // id(left) + ast_func_local_name_0, // local-name() + ast_func_local_name_1, // local-name(left) + ast_func_namespace_uri_0, // namespace-uri() + ast_func_namespace_uri_1, // namespace-uri(left) + ast_func_name_0, // name() + ast_func_name_1, // name(left) + ast_func_string_0, // string() + ast_func_string_1, // string(left) + ast_func_concat, // concat(left, right, siblings) + ast_func_starts_with, // starts_with(left, right) + ast_func_contains, // contains(left, right) + 
ast_func_substring_before, // substring-before(left, right) + ast_func_substring_after, // substring-after(left, right) + ast_func_substring_2, // substring(left, right) + ast_func_substring_3, // substring(left, right, third) + ast_func_string_length_0, // string-length() + ast_func_string_length_1, // string-length(left) + ast_func_normalize_space_0, // normalize-space() + ast_func_normalize_space_1, // normalize-space(left) + ast_func_translate, // translate(left, right, third) + ast_func_boolean, // boolean(left) + ast_func_not, // not(left) + ast_func_true, // true() + ast_func_false, // false() + ast_func_lang, // lang(left) + ast_func_number_0, // number() + ast_func_number_1, // number(left) + ast_func_sum, // sum(left) + ast_func_floor, // floor(left) + ast_func_ceiling, // ceiling(left) + ast_func_round, // round(left) + ast_step, // process set left with step + ast_step_root, // select root node + + ast_opt_translate_table, // translate(left, right, third) where right/third are constants + ast_opt_compare_attribute // @name = 'string' +}; + +enum axis_t { + axis_ancestor, + axis_ancestor_or_self, + axis_attribute, + axis_child, + axis_descendant, + axis_descendant_or_self, + axis_following, + axis_following_sibling, + axis_namespace, + axis_parent, + axis_preceding, + axis_preceding_sibling, + axis_self +}; + +enum nodetest_t { + nodetest_none, + nodetest_name, + nodetest_type_node, + nodetest_type_comment, + nodetest_type_pi, + nodetest_type_text, + nodetest_pi, + nodetest_all, + nodetest_all_in_namespace +}; + +enum predicate_t { + predicate_default, + predicate_posinv, + predicate_constant, + predicate_constant_one +}; + +enum nodeset_eval_t { + nodeset_eval_all, + nodeset_eval_any, + nodeset_eval_first +}; + +template struct axis_to_type { + static const axis_t axis; +}; + +template const axis_t axis_to_type::axis = N; + +class xpath_ast_node +{ +private: + // node type + char _type; + char _rettype; + + // for ast_step + char _axis; + + // for 
ast_step/ast_predicate/ast_filter + char _test; + + // tree node structure + xpath_ast_node* _left; + xpath_ast_node* _right; + xpath_ast_node* _next; + + union { + // value for ast_string_constant + const char_t* string; + // value for ast_number_constant + double number; + // variable for ast_variable + xpath_variable* variable; + // node test for ast_step (node name/namespace/node type/pi target) + const char_t* nodetest; + // table for ast_opt_translate_table + const unsigned char* table; + } _data; + + xpath_ast_node(const xpath_ast_node&); + xpath_ast_node& operator=(const xpath_ast_node&); + + template static bool compare_eq(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp) { + xpath_value_type lt = lhs->rettype(), rt = rhs->rettype(); + + if (lt != xpath_type_node_set && rt != xpath_type_node_set) { + if (lt == xpath_type_boolean || rt == xpath_type_boolean) + return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack)); + else if (lt == xpath_type_number || rt == xpath_type_number) + return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack)); + else if (lt == xpath_type_string || rt == xpath_type_string) { + xpath_allocator_capture cr(stack.result); + + xpath_string ls = lhs->eval_string(c, stack); + xpath_string rs = rhs->eval_string(c, stack); + + return comp(ls, rs); + } + } else if (lt == xpath_type_node_set && rt == xpath_type_node_set) { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* li = ls.begin(); li != ls.end(); ++li) + for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) { + xpath_allocator_capture cri(stack.result); + + if (comp(string_value(*li, stack.result), string_value(*ri, stack.result))) + return true; + } + + return false; + } else { + if (lt == xpath_type_node_set) { + swap(lhs, 
rhs); + swap(lt, rt); + } + + if (lt == xpath_type_boolean) + return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack)); + else if (lt == xpath_type_number) { + xpath_allocator_capture cr(stack.result); + + double l = lhs->eval_number(c, stack); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) { + xpath_allocator_capture cri(stack.result); + + if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str()))) + return true; + } + + return false; + } else if (lt == xpath_type_string) { + xpath_allocator_capture cr(stack.result); + + xpath_string l = lhs->eval_string(c, stack); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) { + xpath_allocator_capture cri(stack.result); + + if (comp(l, string_value(*ri, stack.result))) + return true; + } + + return false; + } + } + + assert(!"Wrong types"); + return false; + } + + static bool eval_once(xpath_node_set::type_t type, nodeset_eval_t eval) { + return type == xpath_node_set::type_sorted ? 
eval != nodeset_eval_all : eval == nodeset_eval_any; + } + + template static bool compare_rel(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp) { + xpath_value_type lt = lhs->rettype(), rt = rhs->rettype(); + + if (lt != xpath_type_node_set && rt != xpath_type_node_set) + return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack)); + else if (lt == xpath_type_node_set && rt == xpath_type_node_set) { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* li = ls.begin(); li != ls.end(); ++li) { + xpath_allocator_capture cri(stack.result); + + double l = convert_string_to_number(string_value(*li, stack.result).c_str()); + + for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) { + xpath_allocator_capture crii(stack.result); + + if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str()))) + return true; + } + } + + return false; + } else if (lt != xpath_type_node_set && rt == xpath_type_node_set) { + xpath_allocator_capture cr(stack.result); + + double l = lhs->eval_number(c, stack); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) { + xpath_allocator_capture cri(stack.result); + + if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str()))) + return true; + } + + return false; + } else if (lt == xpath_type_node_set && rt != xpath_type_node_set) { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all); + double r = rhs->eval_number(c, stack); + + for (const xpath_node* li = ls.begin(); li != ls.end(); ++li) { + xpath_allocator_capture cri(stack.result); + + if (comp(convert_string_to_number(string_value(*li, stack.result).c_str()), r)) + 
return true; + } + + return false; + } else { + assert(!"Wrong types"); + return false; + } + } + + static void apply_predicate_boolean(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack, bool once) { + assert(ns.size() >= first); + assert(expr->rettype() != xpath_type_number); + + size_t i = 1; + size_t size = ns.size() - first; + + xpath_node* last = ns.begin() + first; + + // remove_if... or well, sort of + for (xpath_node* it = last; it != ns.end(); ++it, ++i) { + xpath_context c(*it, i, size); + + if (expr->eval_boolean(c, stack)) { + *last++ = *it; + + if (once) break; + } + } + + ns.truncate(last); + } + + static void apply_predicate_number(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack, bool once) { + assert(ns.size() >= first); + assert(expr->rettype() == xpath_type_number); + + size_t i = 1; + size_t size = ns.size() - first; + + xpath_node* last = ns.begin() + first; + + // remove_if... or well, sort of + for (xpath_node* it = last; it != ns.end(); ++it, ++i) { + xpath_context c(*it, i, size); + + if (expr->eval_number(c, stack) == i) { + *last++ = *it; + + if (once) break; + } + } + + ns.truncate(last); + } + + static void apply_predicate_number_const(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack) { + assert(ns.size() >= first); + assert(expr->rettype() == xpath_type_number); + + size_t size = ns.size() - first; + + xpath_node* last = ns.begin() + first; + + xpath_context c(xpath_node(), 1, size); + + double er = expr->eval_number(c, stack); + + if (er >= 1.0 && er <= size) { + size_t eri = static_cast(er); + + if (er == eri) { + xpath_node r = last[eri - 1]; + + *last++ = r; + } + } + + ns.truncate(last); + } + + void apply_predicate(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack, bool once) { + if (ns.size() == first) return; + + assert(_type == ast_filter || _type == ast_predicate); + + if (_test == predicate_constant || 
_test == predicate_constant_one) + apply_predicate_number_const(ns, first, _right, stack); + else if (_right->rettype() == xpath_type_number) + apply_predicate_number(ns, first, _right, stack, once); + else + apply_predicate_boolean(ns, first, _right, stack, once); + } + + void apply_predicates(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack, nodeset_eval_t eval) { + if (ns.size() == first) return; + + bool last_once = eval_once(ns.type(), eval); + + for (xpath_ast_node* pred = _right; pred; pred = pred->_next) + pred->apply_predicate(ns, first, stack, !pred->_next && last_once); + } + + bool step_push(xpath_node_set_raw& ns, xml_attribute_struct* a, xml_node_struct* parent, xpath_allocator* alloc) { + assert(a); + + const char_t* name = a->name ? a->name + 0 : PUGIXML_TEXT(""); + + switch (_test) { + case nodetest_name: + if (strequal(name, _data.nodetest) && is_xpath_attribute(name)) { + ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc); + return true; + } + break; + + case nodetest_type_node: + case nodetest_all: + if (is_xpath_attribute(name)) { + ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc); + return true; + } + break; + + case nodetest_all_in_namespace: + if (starts_with(name, _data.nodetest) && is_xpath_attribute(name)) { + ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc); + return true; + } + break; + + default: + ; + } + + return false; + } + + bool step_push(xpath_node_set_raw& ns, xml_node_struct* n, xpath_allocator* alloc) { + assert(n); + + xml_node_type type = PUGI__NODETYPE(n); + + switch (_test) { + case nodetest_name: + if (type == node_element && n->name && strequal(n->name, _data.nodetest)) { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case nodetest_type_node: + ns.push_back(xml_node(n), alloc); + return true; + + case nodetest_type_comment: + if (type == node_comment) { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case 
nodetest_type_text: + if (type == node_pcdata || type == node_cdata) { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case nodetest_type_pi: + if (type == node_pi) { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case nodetest_pi: + if (type == node_pi && n->name && strequal(n->name, _data.nodetest)) { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case nodetest_all: + if (type == node_element) { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + case nodetest_all_in_namespace: + if (type == node_element && n->name && starts_with(n->name, _data.nodetest)) { + ns.push_back(xml_node(n), alloc); + return true; + } + break; + + default: + assert(!"Unknown axis"); + } + + return false; + } + + template void step_fill(xpath_node_set_raw& ns, xml_node_struct* n, xpath_allocator* alloc, bool once, T) { + const axis_t axis = T::axis; + + switch (axis) { + case axis_attribute: { + for (xml_attribute_struct* a = n->first_attribute; a; a = a->next_attribute) + if (step_push(ns, a, n, alloc) & once) + return; + + break; + } + + case axis_child: { + for (xml_node_struct* c = n->first_child; c; c = c->next_sibling) + if (step_push(ns, c, alloc) & once) + return; + + break; + } + + case axis_descendant: + case axis_descendant_or_self: { + if (axis == axis_descendant_or_self) + if (step_push(ns, n, alloc) & once) + return; + + xml_node_struct* cur = n->first_child; + + while (cur) { + if (step_push(ns, cur, alloc) & once) + return; + + if (cur->first_child) + cur = cur->first_child; + else { + while (!cur->next_sibling) { + cur = cur->parent; + + if (cur == n) return; + } + + cur = cur->next_sibling; + } + } + + break; + } + + case axis_following_sibling: { + for (xml_node_struct* c = n->next_sibling; c; c = c->next_sibling) + if (step_push(ns, c, alloc) & once) + return; + + break; + } + + case axis_preceding_sibling: { + for (xml_node_struct* c = n->prev_sibling_c; c->next_sibling; c = 
c->prev_sibling_c) + if (step_push(ns, c, alloc) & once) + return; + + break; + } + + case axis_following: { + xml_node_struct* cur = n; + + // exit from this node so that we don't include descendants + while (!cur->next_sibling) { + cur = cur->parent; + + if (!cur) return; + } + + cur = cur->next_sibling; + + while (cur) { + if (step_push(ns, cur, alloc) & once) + return; + + if (cur->first_child) + cur = cur->first_child; + else { + while (!cur->next_sibling) { + cur = cur->parent; + + if (!cur) return; + } + + cur = cur->next_sibling; + } + } + + break; + } + + case axis_preceding: { + xml_node_struct* cur = n; + + // exit from this node so that we don't include descendants + while (!cur->prev_sibling_c->next_sibling) { + cur = cur->parent; + + if (!cur) return; + } + + cur = cur->prev_sibling_c; + + while (cur) { + if (cur->first_child) + cur = cur->first_child->prev_sibling_c; + else { + // leaf node, can't be ancestor + if (step_push(ns, cur, alloc) & once) + return; + + while (!cur->prev_sibling_c->next_sibling) { + cur = cur->parent; + + if (!cur) return; + + if (!node_is_ancestor(cur, n)) + if (step_push(ns, cur, alloc) & once) + return; + } + + cur = cur->prev_sibling_c; + } + } + + break; + } + + case axis_ancestor: + case axis_ancestor_or_self: { + if (axis == axis_ancestor_or_self) + if (step_push(ns, n, alloc) & once) + return; + + xml_node_struct* cur = n->parent; + + while (cur) { + if (step_push(ns, cur, alloc) & once) + return; + + cur = cur->parent; + } + + break; + } + + case axis_self: { + step_push(ns, n, alloc); + + break; + } + + case axis_parent: { + if (n->parent) + step_push(ns, n->parent, alloc); + + break; + } + + default: + assert(!"Unimplemented axis"); + } + } + + template void step_fill(xpath_node_set_raw& ns, xml_attribute_struct* a, xml_node_struct* p, xpath_allocator* alloc, bool once, T v) { + const axis_t axis = T::axis; + + switch (axis) { + case axis_ancestor: + case axis_ancestor_or_self: { + if (axis == 
axis_ancestor_or_self && _test == nodetest_type_node) // reject attributes based on principal node type test + if (step_push(ns, a, p, alloc) & once) + return; + + xml_node_struct* cur = p; + + while (cur) { + if (step_push(ns, cur, alloc) & once) + return; + + cur = cur->parent; + } + + break; + } + + case axis_descendant_or_self: + case axis_self: { + if (_test == nodetest_type_node) // reject attributes based on principal node type test + step_push(ns, a, p, alloc); + + break; + } + + case axis_following: { + xml_node_struct* cur = p; + + while (cur) { + if (cur->first_child) + cur = cur->first_child; + else { + while (!cur->next_sibling) { + cur = cur->parent; + + if (!cur) return; + } + + cur = cur->next_sibling; + } + + if (step_push(ns, cur, alloc) & once) + return; + } + + break; + } + + case axis_parent: { + step_push(ns, p, alloc); + + break; + } + + case axis_preceding: { + // preceding:: axis does not include attribute nodes and attribute ancestors (they are the same as parent's ancestors), so we can reuse node preceding + step_fill(ns, p, alloc, once, v); + break; + } + + default: + assert(!"Unimplemented axis"); + } + } + + template void step_fill(xpath_node_set_raw& ns, const xpath_node& xn, xpath_allocator* alloc, bool once, T v) { + const axis_t axis = T::axis; + const bool axis_has_attributes = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_descendant_or_self || axis == axis_following || axis == axis_parent || axis == axis_preceding || axis == axis_self); + + if (xn.node()) + step_fill(ns, xn.node().internal_object(), alloc, once, v); + else if (axis_has_attributes && xn.attribute() && xn.parent()) + step_fill(ns, xn.attribute().internal_object(), xn.parent().internal_object(), alloc, once, v); + } + + template xpath_node_set_raw step_do(const xpath_context& c, const xpath_stack& stack, nodeset_eval_t eval, T v) { + const axis_t axis = T::axis; + const bool axis_reverse = (axis == axis_ancestor || axis == 
axis_ancestor_or_self || axis == axis_preceding || axis == axis_preceding_sibling); + const xpath_node_set::type_t axis_type = axis_reverse ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted; + + bool once = + (axis == axis_attribute && _test == nodetest_name) || + (!_right && eval_once(axis_type, eval)) || + (_right && !_right->_next && _right->_test == predicate_constant_one); + + xpath_node_set_raw ns; + ns.set_type(axis_type); + + if (_left) { + xpath_node_set_raw s = _left->eval_node_set(c, stack, nodeset_eval_all); + + // self axis preserves the original order + if (axis == axis_self) ns.set_type(s.type()); + + for (const xpath_node* it = s.begin(); it != s.end(); ++it) { + size_t size = ns.size(); + + // in general, all axes generate elements in a particular order, but there is no order guarantee if axis is applied to two nodes + if (axis != axis_self && size != 0) ns.set_type(xpath_node_set::type_unsorted); + + step_fill(ns, *it, stack.result, once, v); + if (_right) apply_predicates(ns, size, stack, eval); + } + } else { + step_fill(ns, c.n, stack.result, once, v); + if (_right) apply_predicates(ns, 0, stack, eval); + } + + // child, attribute and self axes always generate unique set of nodes + // for other axis, if the set stayed sorted, it stayed unique because the traversal algorithms do not visit the same node twice + if (axis != axis_child && axis != axis_attribute && axis != axis_self && ns.type() == xpath_node_set::type_unsorted) + ns.remove_duplicates(); + + return ns; + } + +public: + xpath_ast_node(ast_type_t type, xpath_value_type rettype_, const char_t* value): + _type(static_cast(type)), _rettype(static_cast(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0) { + assert(type == ast_string_constant); + _data.string = value; + } + + xpath_ast_node(ast_type_t type, xpath_value_type rettype_, double value): + _type(static_cast(type)), _rettype(static_cast(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0) { + 
assert(type == ast_number_constant); + _data.number = value; + } + + xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_variable* value): + _type(static_cast(type)), _rettype(static_cast(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0) { + assert(type == ast_variable); + _data.variable = value; + } + + xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_ast_node* left = 0, xpath_ast_node* right = 0): + _type(static_cast(type)), _rettype(static_cast(rettype_)), _axis(0), _test(0), _left(left), _right(right), _next(0) { + } + + xpath_ast_node(ast_type_t type, xpath_ast_node* left, axis_t axis, nodetest_t test, const char_t* contents): + _type(static_cast(type)), _rettype(xpath_type_node_set), _axis(static_cast(axis)), _test(static_cast(test)), _left(left), _right(0), _next(0) { + assert(type == ast_step); + _data.nodetest = contents; + } + + xpath_ast_node(ast_type_t type, xpath_ast_node* left, xpath_ast_node* right, predicate_t test): + _type(static_cast(type)), _rettype(xpath_type_node_set), _axis(0), _test(static_cast(test)), _left(left), _right(right), _next(0) { + assert(type == ast_filter || type == ast_predicate); + } + + void set_next(xpath_ast_node* value) { + _next = value; + } + + void set_right(xpath_ast_node* value) { + _right = value; + } + + bool eval_boolean(const xpath_context& c, const xpath_stack& stack) { + switch (_type) { + case ast_op_or: + return _left->eval_boolean(c, stack) || _right->eval_boolean(c, stack); + + case ast_op_and: + return _left->eval_boolean(c, stack) && _right->eval_boolean(c, stack); + + case ast_op_equal: + return compare_eq(_left, _right, c, stack, equal_to()); + + case ast_op_not_equal: + return compare_eq(_left, _right, c, stack, not_equal_to()); + + case ast_op_less: + return compare_rel(_left, _right, c, stack, less()); + + case ast_op_greater: + return compare_rel(_right, _left, c, stack, less()); + + case ast_op_less_or_equal: + return compare_rel(_left, _right, c, stack, 
less_equal()); + + case ast_op_greater_or_equal: + return compare_rel(_right, _left, c, stack, less_equal()); + + case ast_func_starts_with: { + xpath_allocator_capture cr(stack.result); + + xpath_string lr = _left->eval_string(c, stack); + xpath_string rr = _right->eval_string(c, stack); + + return starts_with(lr.c_str(), rr.c_str()); + } + + case ast_func_contains: { + xpath_allocator_capture cr(stack.result); + + xpath_string lr = _left->eval_string(c, stack); + xpath_string rr = _right->eval_string(c, stack); + + return find_substring(lr.c_str(), rr.c_str()) != 0; + } + + case ast_func_boolean: + return _left->eval_boolean(c, stack); + + case ast_func_not: + return !_left->eval_boolean(c, stack); + + case ast_func_true: + return true; + + case ast_func_false: + return false; + + case ast_func_lang: { + if (c.n.attribute()) return false; + + xpath_allocator_capture cr(stack.result); + + xpath_string lang = _left->eval_string(c, stack); + + for (xml_node n = c.n.node(); n; n = n.parent()) { + xml_attribute a = n.attribute(PUGIXML_TEXT("xml:lang")); + + if (a) { + const char_t* value = a.value(); + + // strnicmp / strncasecmp is not portable + for (const char_t* lit = lang.c_str(); *lit; ++lit) { + if (tolower_ascii(*lit) != tolower_ascii(*value)) return false; + ++value; + } + + return *value == 0 || *value == '-'; + } + } + + return false; + } + + case ast_opt_compare_attribute: { + const char_t* value = (_right->_type == ast_string_constant) ? 
_right->_data.string : _right->_data.variable->get_string(); + + xml_attribute attr = c.n.node().attribute(_left->_data.nodetest); + + return attr && strequal(attr.value(), value) && is_xpath_attribute(attr.name()); + } + + case ast_variable: { + assert(_rettype == _data.variable->type()); + + if (_rettype == xpath_type_boolean) + return _data.variable->get_boolean(); + + // fallthrough to type conversion + } + + default: { + switch (_rettype) { + case xpath_type_number: + return convert_number_to_boolean(eval_number(c, stack)); + + case xpath_type_string: { + xpath_allocator_capture cr(stack.result); + + return !eval_string(c, stack).empty(); + } + + case xpath_type_node_set: { + xpath_allocator_capture cr(stack.result); + + return !eval_node_set(c, stack, nodeset_eval_any).empty(); + } + + default: + assert(!"Wrong expression for return type boolean"); + return false; + } + } + } + } + + double eval_number(const xpath_context& c, const xpath_stack& stack) { + switch (_type) { + case ast_op_add: + return _left->eval_number(c, stack) + _right->eval_number(c, stack); + + case ast_op_subtract: + return _left->eval_number(c, stack) - _right->eval_number(c, stack); + + case ast_op_multiply: + return _left->eval_number(c, stack) * _right->eval_number(c, stack); + + case ast_op_divide: + return _left->eval_number(c, stack) / _right->eval_number(c, stack); + + case ast_op_mod: + return fmod(_left->eval_number(c, stack), _right->eval_number(c, stack)); + + case ast_op_negate: + return -_left->eval_number(c, stack); + + case ast_number_constant: + return _data.number; + + case ast_func_last: + return static_cast(c.size); + + case ast_func_position: + return static_cast(c.position); + + case ast_func_count: { + xpath_allocator_capture cr(stack.result); + + return static_cast(_left->eval_node_set(c, stack, nodeset_eval_all).size()); + } + + case ast_func_string_length_0: { + xpath_allocator_capture cr(stack.result); + + return static_cast(string_value(c.n, 
stack.result).length()); + } + + case ast_func_string_length_1: { + xpath_allocator_capture cr(stack.result); + + return static_cast(_left->eval_string(c, stack).length()); + } + + case ast_func_number_0: { + xpath_allocator_capture cr(stack.result); + + return convert_string_to_number(string_value(c.n, stack.result).c_str()); + } + + case ast_func_number_1: + return _left->eval_number(c, stack); + + case ast_func_sum: { + xpath_allocator_capture cr(stack.result); + + double r = 0; + + xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_all); + + for (const xpath_node* it = ns.begin(); it != ns.end(); ++it) { + xpath_allocator_capture cri(stack.result); + + r += convert_string_to_number(string_value(*it, stack.result).c_str()); + } + + return r; + } + + case ast_func_floor: { + double r = _left->eval_number(c, stack); + + return r == r ? floor(r) : r; + } + + case ast_func_ceiling: { + double r = _left->eval_number(c, stack); + + return r == r ? ceil(r) : r; + } + + case ast_func_round: + return round_nearest_nzero(_left->eval_number(c, stack)); + + case ast_variable: { + assert(_rettype == _data.variable->type()); + + if (_rettype == xpath_type_number) + return _data.variable->get_number(); + + // fallthrough to type conversion + } + + default: { + switch (_rettype) { + case xpath_type_boolean: + return eval_boolean(c, stack) ? 
1 : 0; + + case xpath_type_string: { + xpath_allocator_capture cr(stack.result); + + return convert_string_to_number(eval_string(c, stack).c_str()); + } + + case xpath_type_node_set: { + xpath_allocator_capture cr(stack.result); + + return convert_string_to_number(eval_string(c, stack).c_str()); + } + + default: + assert(!"Wrong expression for return type number"); + return 0; + } + + } + } + } + + xpath_string eval_string_concat(const xpath_context& c, const xpath_stack& stack) { + assert(_type == ast_func_concat); + + xpath_allocator_capture ct(stack.temp); + + // count the string number + size_t count = 1; + for (xpath_ast_node* nc = _right; nc; nc = nc->_next) count++; + + // gather all strings + xpath_string static_buffer[4]; + xpath_string* buffer = static_buffer; + + // allocate on-heap for large concats + if (count > sizeof(static_buffer) / sizeof(static_buffer[0])) { + buffer = static_cast(stack.temp->allocate(count * sizeof(xpath_string))); + assert(buffer); + } + + // evaluate all strings to temporary stack + xpath_stack swapped_stack = {stack.temp, stack.result}; + + buffer[0] = _left->eval_string(c, swapped_stack); + + size_t pos = 1; + for (xpath_ast_node* n = _right; n; n = n->_next, ++pos) buffer[pos] = n->eval_string(c, swapped_stack); + assert(pos == count); + + // get total length + size_t length = 0; + for (size_t i = 0; i < count; ++i) length += buffer[i].length(); + + // create final string + char_t* result = static_cast(stack.result->allocate((length + 1) * sizeof(char_t))); + assert(result); + + char_t* ri = result; + + for (size_t j = 0; j < count; ++j) + for (const char_t* bi = buffer[j].c_str(); *bi; ++bi) + *ri++ = *bi; + + *ri = 0; + + return xpath_string::from_heap_preallocated(result, ri); + } + + xpath_string eval_string(const xpath_context& c, const xpath_stack& stack) { + switch (_type) { + case ast_string_constant: + return xpath_string::from_const(_data.string); + + case ast_func_local_name_0: { + xpath_node na = c.n; + + return 
xpath_string::from_const(local_name(na)); + } + + case ast_func_local_name_1: { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first); + xpath_node na = ns.first(); + + return xpath_string::from_const(local_name(na)); + } + + case ast_func_name_0: { + xpath_node na = c.n; + + return xpath_string::from_const(qualified_name(na)); + } + + case ast_func_name_1: { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first); + xpath_node na = ns.first(); + + return xpath_string::from_const(qualified_name(na)); + } + + case ast_func_namespace_uri_0: { + xpath_node na = c.n; + + return xpath_string::from_const(namespace_uri(na)); + } + + case ast_func_namespace_uri_1: { + xpath_allocator_capture cr(stack.result); + + xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first); + xpath_node na = ns.first(); + + return xpath_string::from_const(namespace_uri(na)); + } + + case ast_func_string_0: + return string_value(c.n, stack.result); + + case ast_func_string_1: + return _left->eval_string(c, stack); + + case ast_func_concat: + return eval_string_concat(c, stack); + + case ast_func_substring_before: { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_string s = _left->eval_string(c, swapped_stack); + xpath_string p = _right->eval_string(c, swapped_stack); + + const char_t* pos = find_substring(s.c_str(), p.c_str()); + + return pos ? 
xpath_string::from_heap(s.c_str(), pos, stack.result) : xpath_string(); + } + + case ast_func_substring_after: { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_string s = _left->eval_string(c, swapped_stack); + xpath_string p = _right->eval_string(c, swapped_stack); + + const char_t* pos = find_substring(s.c_str(), p.c_str()); + if (!pos) return xpath_string(); + + const char_t* rbegin = pos + p.length(); + const char_t* rend = s.c_str() + s.length(); + + return s.uses_heap() ? xpath_string::from_heap(rbegin, rend, stack.result) : xpath_string::from_const(rbegin); + } + + case ast_func_substring_2: { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_string s = _left->eval_string(c, swapped_stack); + size_t s_length = s.length(); + + double first = round_nearest(_right->eval_number(c, stack)); + + if (is_nan(first)) return xpath_string(); // NaN + else if (first >= s_length + 1) return xpath_string(); + + size_t pos = first < 1 ? 1 : static_cast(first); + assert(1 <= pos && pos <= s_length + 1); + + const char_t* rbegin = s.c_str() + (pos - 1); + const char_t* rend = s.c_str() + s.length(); + + return s.uses_heap() ? xpath_string::from_heap(rbegin, rend, stack.result) : xpath_string::from_const(rbegin); + } + + case ast_func_substring_3: { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_string s = _left->eval_string(c, swapped_stack); + size_t s_length = s.length(); + + double first = round_nearest(_right->eval_number(c, stack)); + double last = first + round_nearest(_right->_next->eval_number(c, stack)); + + if (is_nan(first) || is_nan(last)) return xpath_string(); + else if (first >= s_length + 1) return xpath_string(); + else if (first >= last) return xpath_string(); + else if (last < 1) return xpath_string(); + + size_t pos = first < 1 ? 
1 : static_cast(first); + size_t end = last >= s_length + 1 ? s_length + 1 : static_cast(last); + + assert(1 <= pos && pos <= end && end <= s_length + 1); + const char_t* rbegin = s.c_str() + (pos - 1); + const char_t* rend = s.c_str() + (end - 1); + + return (end == s_length + 1 && !s.uses_heap()) ? xpath_string::from_const(rbegin) : xpath_string::from_heap(rbegin, rend, stack.result); + } + + case ast_func_normalize_space_0: { + xpath_string s = string_value(c.n, stack.result); + + char_t* begin = s.data(stack.result); + char_t* end = normalize_space(begin); + + return xpath_string::from_heap_preallocated(begin, end); + } + + case ast_func_normalize_space_1: { + xpath_string s = _left->eval_string(c, stack); + + char_t* begin = s.data(stack.result); + char_t* end = normalize_space(begin); + + return xpath_string::from_heap_preallocated(begin, end); + } + + case ast_func_translate: { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_string s = _left->eval_string(c, stack); + xpath_string from = _right->eval_string(c, swapped_stack); + xpath_string to = _right->_next->eval_string(c, swapped_stack); + + char_t* begin = s.data(stack.result); + char_t* end = translate(begin, from.c_str(), to.c_str(), to.length()); + + return xpath_string::from_heap_preallocated(begin, end); + } + + case ast_opt_translate_table: { + xpath_string s = _left->eval_string(c, stack); + + char_t* begin = s.data(stack.result); + char_t* end = translate_table(begin, _data.table); + + return xpath_string::from_heap_preallocated(begin, end); + } + + case ast_variable: { + assert(_rettype == _data.variable->type()); + + if (_rettype == xpath_type_string) + return xpath_string::from_const(_data.variable->get_string()); + + // fallthrough to type conversion + } + + default: { + switch (_rettype) { + case xpath_type_boolean: + return xpath_string::from_const(eval_boolean(c, stack) ? 
PUGIXML_TEXT("true") : PUGIXML_TEXT("false")); + + case xpath_type_number: + return convert_number_to_string(eval_number(c, stack), stack.result); + + case xpath_type_node_set: { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_node_set_raw ns = eval_node_set(c, swapped_stack, nodeset_eval_first); + return ns.empty() ? xpath_string() : string_value(ns.first(), stack.result); + } + + default: + assert(!"Wrong expression for return type string"); + return xpath_string(); + } + } + } + } + + xpath_node_set_raw eval_node_set(const xpath_context& c, const xpath_stack& stack, nodeset_eval_t eval) { + switch (_type) { + case ast_op_union: { + xpath_allocator_capture cr(stack.temp); + + xpath_stack swapped_stack = {stack.temp, stack.result}; + + xpath_node_set_raw ls = _left->eval_node_set(c, swapped_stack, eval); + xpath_node_set_raw rs = _right->eval_node_set(c, stack, eval); + + // we can optimize merging two sorted sets, but this is a very rare operation, so don't bother + rs.set_type(xpath_node_set::type_unsorted); + + rs.append(ls.begin(), ls.end(), stack.result); + rs.remove_duplicates(); + + return rs; + } + + case ast_filter: { + xpath_node_set_raw set = _left->eval_node_set(c, stack, _test == predicate_constant_one ? 
nodeset_eval_first : nodeset_eval_all); + + // either expression is a number or it contains position() call; sort by document order + if (_test != predicate_posinv) set.sort_do(); + + bool once = eval_once(set.type(), eval); + + apply_predicate(set, 0, stack, once); + + return set; + } + + case ast_func_id: + return xpath_node_set_raw(); + + case ast_step: { + switch (_axis) { + case axis_ancestor: + return step_do(c, stack, eval, axis_to_type()); + + case axis_ancestor_or_self: + return step_do(c, stack, eval, axis_to_type()); + + case axis_attribute: + return step_do(c, stack, eval, axis_to_type()); + + case axis_child: + return step_do(c, stack, eval, axis_to_type()); + + case axis_descendant: + return step_do(c, stack, eval, axis_to_type()); + + case axis_descendant_or_self: + return step_do(c, stack, eval, axis_to_type()); + + case axis_following: + return step_do(c, stack, eval, axis_to_type()); + + case axis_following_sibling: + return step_do(c, stack, eval, axis_to_type()); + + case axis_namespace: + // namespaced axis is not supported + return xpath_node_set_raw(); + + case axis_parent: + return step_do(c, stack, eval, axis_to_type()); + + case axis_preceding: + return step_do(c, stack, eval, axis_to_type()); + + case axis_preceding_sibling: + return step_do(c, stack, eval, axis_to_type()); + + case axis_self: + return step_do(c, stack, eval, axis_to_type()); + + default: + assert(!"Unknown axis"); + return xpath_node_set_raw(); + } + } + + case ast_step_root: { + assert(!_right); // root step can't have any predicates + + xpath_node_set_raw ns; + + ns.set_type(xpath_node_set::type_sorted); + + if (c.n.node()) ns.push_back(c.n.node().root(), stack.result); + else if (c.n.attribute()) ns.push_back(c.n.parent().root(), stack.result); + + return ns; + } + + case ast_variable: { + assert(_rettype == _data.variable->type()); + + if (_rettype == xpath_type_node_set) { + const xpath_node_set& s = _data.variable->get_node_set(); + + xpath_node_set_raw ns; + + 
ns.set_type(s.type()); + ns.append(s.begin(), s.end(), stack.result); + + return ns; + } + + // fallthrough to type conversion + } + + default: + assert(!"Wrong expression for return type node set"); + return xpath_node_set_raw(); + } + } + + void optimize(xpath_allocator* alloc) { + if (_left) _left->optimize(alloc); + if (_right) _right->optimize(alloc); + if (_next) _next->optimize(alloc); + + optimize_self(alloc); + } + + void optimize_self(xpath_allocator* alloc) { + // Rewrite [position()=expr] with [expr] + // Note that this step has to go before classification to recognize [position()=1] + if ((_type == ast_filter || _type == ast_predicate) && + _right->_type == ast_op_equal && _right->_left->_type == ast_func_position && _right->_right->_rettype == xpath_type_number) { + _right = _right->_right; + } + + // Classify filter/predicate ops to perform various optimizations during evaluation + if (_type == ast_filter || _type == ast_predicate) { + assert(_test == predicate_default); + + if (_right->_type == ast_number_constant && _right->_data.number == 1.0) + _test = predicate_constant_one; + else if (_right->_rettype == xpath_type_number && (_right->_type == ast_number_constant || _right->_type == ast_variable || _right->_type == ast_func_last)) + _test = predicate_constant; + else if (_right->_rettype != xpath_type_number && _right->is_posinv_expr()) + _test = predicate_posinv; + } + + // Rewrite descendant-or-self::node()/child::foo with descendant::foo + // The former is a full form of //foo, the latter is much faster since it executes the node test immediately + // Do a similar kind of rewrite for self/descendant/descendant-or-self axes + // Note that we only rewrite positionally invariant steps (//foo[1] != /descendant::foo[1]) + if (_type == ast_step && (_axis == axis_child || _axis == axis_self || _axis == axis_descendant || _axis == axis_descendant_or_self) && _left && + _left->_type == ast_step && _left->_axis == axis_descendant_or_self && 
_left->_test == nodetest_type_node && !_left->_right && + is_posinv_step()) { + if (_axis == axis_child || _axis == axis_descendant) + _axis = axis_descendant; + else + _axis = axis_descendant_or_self; + + _left = _left->_left; + } + + // Use optimized lookup table implementation for translate() with constant arguments + if (_type == ast_func_translate && _right->_type == ast_string_constant && _right->_next->_type == ast_string_constant) { + unsigned char* table = translate_table_generate(alloc, _right->_data.string, _right->_next->_data.string); + + if (table) { + _type = ast_opt_translate_table; + _data.table = table; + } + } + + // Use optimized path for @attr = 'value' or @attr = $value + if (_type == ast_op_equal && + _left->_type == ast_step && _left->_axis == axis_attribute && _left->_test == nodetest_name && !_left->_left && !_left->_right && + (_right->_type == ast_string_constant || (_right->_type == ast_variable && _right->_rettype == xpath_type_string))) { + _type = ast_opt_compare_attribute; + } + } + + bool is_posinv_expr() const { + switch (_type) { + case ast_func_position: + case ast_func_last: + return false; + + case ast_string_constant: + case ast_number_constant: + case ast_variable: + return true; + + case ast_step: + case ast_step_root: + return true; + + case ast_predicate: + case ast_filter: + return true; + + default: + if (_left && !_left->is_posinv_expr()) return false; + + for (xpath_ast_node* n = _right; n; n = n->_next) + if (!n->is_posinv_expr()) return false; + + return true; + } + } + + bool is_posinv_step() const { + assert(_type == ast_step); + + for (xpath_ast_node* n = _right; n; n = n->_next) { + assert(n->_type == ast_predicate); + + if (n->_test != predicate_posinv) + return false; + } + + return true; + } + + xpath_value_type rettype() const { + return static_cast(_rettype); + } +}; + +struct xpath_parser { + xpath_allocator* _alloc; + xpath_lexer _lexer; + + const char_t* _query; + xpath_variable_set* _variables; + + 
xpath_parse_result* _result; + + char_t _scratch[32]; + +#ifdef PUGIXML_NO_EXCEPTIONS + jmp_buf _error_handler; +#endif + + void throw_error(const char* message) { + _result->error = message; + _result->offset = _lexer.current_pos() - _query; + +#ifdef PUGIXML_NO_EXCEPTIONS + longjmp(_error_handler, 1); +#else + throw xpath_exception(*_result); +#endif + } + + void throw_error_oom() { +#ifdef PUGIXML_NO_EXCEPTIONS + throw_error("Out of memory"); +#else + throw std::bad_alloc(); +#endif + } + + void* alloc_node() { + void* result = _alloc->allocate_nothrow(sizeof(xpath_ast_node)); + + if (!result) throw_error_oom(); + + return result; + } + + const char_t* alloc_string(const xpath_lexer_string& value) { + if (value.begin) { + size_t length = static_cast(value.end - value.begin); + + char_t* c = static_cast(_alloc->allocate_nothrow((length + 1) * sizeof(char_t))); + if (!c) throw_error_oom(); + assert(c); // workaround for clang static analysis + + memcpy(c, value.begin, length * sizeof(char_t)); + c[length] = 0; + + return c; + } else return 0; + } + + xpath_ast_node* parse_function_helper(ast_type_t type0, ast_type_t type1, size_t argc, xpath_ast_node* args[2]) { + assert(argc <= 1); + + if (argc == 1 && args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set"); + + return new (alloc_node()) xpath_ast_node(argc == 0 ? 
type0 : type1, xpath_type_string, args[0]); + } + + xpath_ast_node* parse_function(const xpath_lexer_string& name, size_t argc, xpath_ast_node* args[2]) { + switch (name.begin[0]) { + case 'b': + if (name == PUGIXML_TEXT("boolean") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_boolean, xpath_type_boolean, args[0]); + + break; + + case 'c': + if (name == PUGIXML_TEXT("count") && argc == 1) { + if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set"); + return new (alloc_node()) xpath_ast_node(ast_func_count, xpath_type_number, args[0]); + } else if (name == PUGIXML_TEXT("contains") && argc == 2) + return new (alloc_node()) xpath_ast_node(ast_func_contains, xpath_type_boolean, args[0], args[1]); + else if (name == PUGIXML_TEXT("concat") && argc >= 2) + return new (alloc_node()) xpath_ast_node(ast_func_concat, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("ceiling") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_ceiling, xpath_type_number, args[0]); + + break; + + case 'f': + if (name == PUGIXML_TEXT("false") && argc == 0) + return new (alloc_node()) xpath_ast_node(ast_func_false, xpath_type_boolean); + else if (name == PUGIXML_TEXT("floor") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_floor, xpath_type_number, args[0]); + + break; + + case 'i': + if (name == PUGIXML_TEXT("id") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_id, xpath_type_node_set, args[0]); + + break; + + case 'l': + if (name == PUGIXML_TEXT("last") && argc == 0) + return new (alloc_node()) xpath_ast_node(ast_func_last, xpath_type_number); + else if (name == PUGIXML_TEXT("lang") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_lang, xpath_type_boolean, args[0]); + else if (name == PUGIXML_TEXT("local-name") && argc <= 1) + return parse_function_helper(ast_func_local_name_0, ast_func_local_name_1, argc, args); + + break; + + case 'n': + if 
(name == PUGIXML_TEXT("name") && argc <= 1) + return parse_function_helper(ast_func_name_0, ast_func_name_1, argc, args); + else if (name == PUGIXML_TEXT("namespace-uri") && argc <= 1) + return parse_function_helper(ast_func_namespace_uri_0, ast_func_namespace_uri_1, argc, args); + else if (name == PUGIXML_TEXT("normalize-space") && argc <= 1) + return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_normalize_space_0 : ast_func_normalize_space_1, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("not") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_not, xpath_type_boolean, args[0]); + else if (name == PUGIXML_TEXT("number") && argc <= 1) + return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_number_0 : ast_func_number_1, xpath_type_number, args[0]); + + break; + + case 'p': + if (name == PUGIXML_TEXT("position") && argc == 0) + return new (alloc_node()) xpath_ast_node(ast_func_position, xpath_type_number); + + break; + + case 'r': + if (name == PUGIXML_TEXT("round") && argc == 1) + return new (alloc_node()) xpath_ast_node(ast_func_round, xpath_type_number, args[0]); + + break; + + case 's': + if (name == PUGIXML_TEXT("string") && argc <= 1) + return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_0 : ast_func_string_1, xpath_type_string, args[0]); + else if (name == PUGIXML_TEXT("string-length") && argc <= 1) + return new (alloc_node()) xpath_ast_node(argc == 0 ? 
ast_func_string_length_0 : ast_func_string_length_1, xpath_type_number, args[0]); + else if (name == PUGIXML_TEXT("starts-with") && argc == 2) + return new (alloc_node()) xpath_ast_node(ast_func_starts_with, xpath_type_boolean, args[0], args[1]); + else if (name == PUGIXML_TEXT("substring-before") && argc == 2) + return new (alloc_node()) xpath_ast_node(ast_func_substring_before, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("substring-after") && argc == 2) + return new (alloc_node()) xpath_ast_node(ast_func_substring_after, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("substring") && (argc == 2 || argc == 3)) + return new (alloc_node()) xpath_ast_node(argc == 2 ? ast_func_substring_2 : ast_func_substring_3, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("sum") && argc == 1) { + if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set"); + return new (alloc_node()) xpath_ast_node(ast_func_sum, xpath_type_number, args[0]); + } + + break; + + case 't': + if (name == PUGIXML_TEXT("translate") && argc == 3) + return new (alloc_node()) xpath_ast_node(ast_func_translate, xpath_type_string, args[0], args[1]); + else if (name == PUGIXML_TEXT("true") && argc == 0) + return new (alloc_node()) xpath_ast_node(ast_func_true, xpath_type_boolean); + + break; + + default: + break; + } + + throw_error("Unrecognized function or wrong parameter count"); + + return 0; + } + + axis_t parse_axis_name(const xpath_lexer_string& name, bool& specified) { + specified = true; + + switch (name.begin[0]) { + case 'a': + if (name == PUGIXML_TEXT("ancestor")) + return axis_ancestor; + else if (name == PUGIXML_TEXT("ancestor-or-self")) + return axis_ancestor_or_self; + else if (name == PUGIXML_TEXT("attribute")) + return axis_attribute; + + break; + + case 'c': + if (name == PUGIXML_TEXT("child")) + return axis_child; + + break; + + case 'd': + if (name == PUGIXML_TEXT("descendant")) + 
return axis_descendant; + else if (name == PUGIXML_TEXT("descendant-or-self")) + return axis_descendant_or_self; + + break; + + case 'f': + if (name == PUGIXML_TEXT("following")) + return axis_following; + else if (name == PUGIXML_TEXT("following-sibling")) + return axis_following_sibling; + + break; + + case 'n': + if (name == PUGIXML_TEXT("namespace")) + return axis_namespace; + + break; + + case 'p': + if (name == PUGIXML_TEXT("parent")) + return axis_parent; + else if (name == PUGIXML_TEXT("preceding")) + return axis_preceding; + else if (name == PUGIXML_TEXT("preceding-sibling")) + return axis_preceding_sibling; + + break; + + case 's': + if (name == PUGIXML_TEXT("self")) + return axis_self; + + break; + + default: + break; + } + + specified = false; + return axis_child; + } + + nodetest_t parse_node_test_type(const xpath_lexer_string& name) { + switch (name.begin[0]) { + case 'c': + if (name == PUGIXML_TEXT("comment")) + return nodetest_type_comment; + + break; + + case 'n': + if (name == PUGIXML_TEXT("node")) + return nodetest_type_node; + + break; + + case 'p': + if (name == PUGIXML_TEXT("processing-instruction")) + return nodetest_type_pi; + + break; + + case 't': + if (name == PUGIXML_TEXT("text")) + return nodetest_type_text; + + break; + + default: + break; + } + + return nodetest_none; + } + + // PrimaryExpr ::= VariableReference | '(' Expr ')' | Literal | Number | FunctionCall + xpath_ast_node* parse_primary_expression() { + switch (_lexer.current()) { + case lex_var_ref: { + xpath_lexer_string name = _lexer.contents(); + + if (!_variables) + throw_error("Unknown variable: variable set is not provided"); + + xpath_variable* var = 0; + if (!get_variable_scratch(_scratch, _variables, name.begin, name.end, &var)) + throw_error_oom(); + + if (!var) + throw_error("Unknown variable: variable set does not contain the given name"); + + _lexer.next(); + + return new (alloc_node()) xpath_ast_node(ast_variable, var->type(), var); + } + + case lex_open_brace: { + 
_lexer.next(); + + xpath_ast_node* n = parse_expression(); + + if (_lexer.current() != lex_close_brace) + throw_error("Unmatched braces"); + + _lexer.next(); + + return n; + } + + case lex_quoted_string: { + const char_t* value = alloc_string(_lexer.contents()); + + xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_string_constant, xpath_type_string, value); + _lexer.next(); + + return n; + } + + case lex_number: { + double value = 0; + + if (!convert_string_to_number_scratch(_scratch, _lexer.contents().begin, _lexer.contents().end, &value)) + throw_error_oom(); + + xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_number_constant, xpath_type_number, value); + _lexer.next(); + + return n; + } + + case lex_string: { + xpath_ast_node* args[2] = {0}; + size_t argc = 0; + + xpath_lexer_string function = _lexer.contents(); + _lexer.next(); + + xpath_ast_node* last_arg = 0; + + if (_lexer.current() != lex_open_brace) + throw_error("Unrecognized function call"); + _lexer.next(); + + if (_lexer.current() != lex_close_brace) + args[argc++] = parse_expression(); + + while (_lexer.current() != lex_close_brace) { + if (_lexer.current() != lex_comma) + throw_error("No comma between function arguments"); + _lexer.next(); + + xpath_ast_node* n = parse_expression(); + + if (argc < 2) args[argc] = n; + else last_arg->set_next(n); + + argc++; + last_arg = n; + } + + _lexer.next(); + + return parse_function(function, argc, args); + } + + default: + throw_error("Unrecognizable primary expression"); + + return 0; + } + } + + // FilterExpr ::= PrimaryExpr | FilterExpr Predicate + // Predicate ::= '[' PredicateExpr ']' + // PredicateExpr ::= Expr + xpath_ast_node* parse_filter_expression() { + xpath_ast_node* n = parse_primary_expression(); + + while (_lexer.current() == lex_open_square_brace) { + _lexer.next(); + + xpath_ast_node* expr = parse_expression(); + + if (n->rettype() != xpath_type_node_set) throw_error("Predicate has to be applied to node set"); + + n = new 
(alloc_node()) xpath_ast_node(ast_filter, n, expr, predicate_default); + + if (_lexer.current() != lex_close_square_brace) + throw_error("Unmatched square brace"); + + _lexer.next(); + } + + return n; + } + + // Step ::= AxisSpecifier NodeTest Predicate* | AbbreviatedStep + // AxisSpecifier ::= AxisName '::' | '@'? + // NodeTest ::= NameTest | NodeType '(' ')' | 'processing-instruction' '(' Literal ')' + // NameTest ::= '*' | NCName ':' '*' | QName + // AbbreviatedStep ::= '.' | '..' + xpath_ast_node* parse_step(xpath_ast_node* set) { + if (set && set->rettype() != xpath_type_node_set) + throw_error("Step has to be applied to node set"); + + bool axis_specified = false; + axis_t axis = axis_child; // implied child axis + + if (_lexer.current() == lex_axis_attribute) { + axis = axis_attribute; + axis_specified = true; + + _lexer.next(); + } else if (_lexer.current() == lex_dot) { + _lexer.next(); + + return new (alloc_node()) xpath_ast_node(ast_step, set, axis_self, nodetest_type_node, 0); + } else if (_lexer.current() == lex_double_dot) { + _lexer.next(); + + return new (alloc_node()) xpath_ast_node(ast_step, set, axis_parent, nodetest_type_node, 0); + } + + nodetest_t nt_type = nodetest_none; + xpath_lexer_string nt_name; + + if (_lexer.current() == lex_string) { + // node name test + nt_name = _lexer.contents(); + _lexer.next(); + + // was it an axis name? 
+ if (_lexer.current() == lex_double_colon) { + // parse axis name + if (axis_specified) throw_error("Two axis specifiers in one step"); + + axis = parse_axis_name(nt_name, axis_specified); + + if (!axis_specified) throw_error("Unknown axis"); + + // read actual node test + _lexer.next(); + + if (_lexer.current() == lex_multiply) { + nt_type = nodetest_all; + nt_name = xpath_lexer_string(); + _lexer.next(); + } else if (_lexer.current() == lex_string) { + nt_name = _lexer.contents(); + _lexer.next(); + } else throw_error("Unrecognized node test"); + } + + if (nt_type == nodetest_none) { + // node type test or processing-instruction + if (_lexer.current() == lex_open_brace) { + _lexer.next(); + + if (_lexer.current() == lex_close_brace) { + _lexer.next(); + + nt_type = parse_node_test_type(nt_name); + + if (nt_type == nodetest_none) throw_error("Unrecognized node type"); + + nt_name = xpath_lexer_string(); + } else if (nt_name == PUGIXML_TEXT("processing-instruction")) { + if (_lexer.current() != lex_quoted_string) + throw_error("Only literals are allowed as arguments to processing-instruction()"); + + nt_type = nodetest_pi; + nt_name = _lexer.contents(); + _lexer.next(); + + if (_lexer.current() != lex_close_brace) + throw_error("Unmatched brace near processing-instruction()"); + _lexer.next(); + } else + throw_error("Unmatched brace near node type test"); + + } + // QName or NCName:* + else { + if (nt_name.end - nt_name.begin > 2 && nt_name.end[-2] == ':' && nt_name.end[-1] == '*') { // NCName:* + nt_name.end--; // erase * + + nt_type = nodetest_all_in_namespace; + } else nt_type = nodetest_name; + } + } + } else if (_lexer.current() == lex_multiply) { + nt_type = nodetest_all; + _lexer.next(); + } else throw_error("Unrecognized node test"); + + xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step, set, axis, nt_type, alloc_string(nt_name)); + + xpath_ast_node* last = 0; + + while (_lexer.current() == lex_open_square_brace) { + _lexer.next(); + + 
xpath_ast_node* expr = parse_expression(); + + xpath_ast_node* pred = new (alloc_node()) xpath_ast_node(ast_predicate, 0, expr, predicate_default); + + if (_lexer.current() != lex_close_square_brace) + throw_error("Unmatched square brace"); + _lexer.next(); + + if (last) last->set_next(pred); + else n->set_right(pred); + + last = pred; + } + + return n; + } + + // RelativeLocationPath ::= Step | RelativeLocationPath '/' Step | RelativeLocationPath '//' Step + xpath_ast_node* parse_relative_location_path(xpath_ast_node* set) { + xpath_ast_node* n = parse_step(set); + + while (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash) { + lexeme_t l = _lexer.current(); + _lexer.next(); + + if (l == lex_double_slash) + n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0); + + n = parse_step(n); + } + + return n; + } + + // LocationPath ::= RelativeLocationPath | AbsoluteLocationPath + // AbsoluteLocationPath ::= '/' RelativeLocationPath? 
| '//' RelativeLocationPath + xpath_ast_node* parse_location_path() { + if (_lexer.current() == lex_slash) { + _lexer.next(); + + xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set); + + // relative location path can start from axis_attribute, dot, double_dot, multiply and string lexemes; any other lexeme means standalone root path + lexeme_t l = _lexer.current(); + + if (l == lex_string || l == lex_axis_attribute || l == lex_dot || l == lex_double_dot || l == lex_multiply) + return parse_relative_location_path(n); + else + return n; + } else if (_lexer.current() == lex_double_slash) { + _lexer.next(); + + xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set); + n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0); + + return parse_relative_location_path(n); + } + + // else clause moved outside of if because of bogus warning 'control may reach end of non-void function being inlined' in gcc 4.0.1 + return parse_relative_location_path(0); + } + + // PathExpr ::= LocationPath + // | FilterExpr + // | FilterExpr '/' RelativeLocationPath + // | FilterExpr '//' RelativeLocationPath + // UnionExpr ::= PathExpr | UnionExpr '|' PathExpr + // UnaryExpr ::= UnionExpr | '-' UnaryExpr + xpath_ast_node* parse_path_or_unary_expression() { + // Clarification. + // PathExpr begins with either LocationPath or FilterExpr. + // FilterExpr begins with PrimaryExpr + // PrimaryExpr begins with '$' in case of it being a variable reference, + // '(' in case of it being an expression, string literal, number constant or + // function call. 
+ + if (_lexer.current() == lex_var_ref || _lexer.current() == lex_open_brace || + _lexer.current() == lex_quoted_string || _lexer.current() == lex_number || + _lexer.current() == lex_string) { + if (_lexer.current() == lex_string) { + // This is either a function call, or not - if not, we shall proceed with location path + const char_t* state = _lexer.state(); + + while (PUGI__IS_CHARTYPE(*state, ct_space)) ++state; + + if (*state != '(') return parse_location_path(); + + // This looks like a function call; however this still can be a node-test. Check it. + if (parse_node_test_type(_lexer.contents()) != nodetest_none) return parse_location_path(); + } + + xpath_ast_node* n = parse_filter_expression(); + + if (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash) { + lexeme_t l = _lexer.current(); + _lexer.next(); + + if (l == lex_double_slash) { + if (n->rettype() != xpath_type_node_set) throw_error("Step has to be applied to node set"); + + n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0); + } + + // select from location path + return parse_relative_location_path(n); + } + + return n; + } else if (_lexer.current() == lex_minus) { + _lexer.next(); + + // precedence 7+ - only parses union expressions + xpath_ast_node* expr = parse_expression_rec(parse_path_or_unary_expression(), 7); + + return new (alloc_node()) xpath_ast_node(ast_op_negate, xpath_type_number, expr); + } else + return parse_location_path(); + } + + struct binary_op_t { + ast_type_t asttype; + xpath_value_type rettype; + int precedence; + + binary_op_t(): asttype(ast_unknown), rettype(xpath_type_none), precedence(0) { + } + + binary_op_t(ast_type_t asttype_, xpath_value_type rettype_, int precedence_): asttype(asttype_), rettype(rettype_), precedence(precedence_) { + } + + static binary_op_t parse(xpath_lexer& lexer) { + switch (lexer.current()) { + case lex_string: + if (lexer.contents() == PUGIXML_TEXT("or")) + return 
binary_op_t(ast_op_or, xpath_type_boolean, 1); + else if (lexer.contents() == PUGIXML_TEXT("and")) + return binary_op_t(ast_op_and, xpath_type_boolean, 2); + else if (lexer.contents() == PUGIXML_TEXT("div")) + return binary_op_t(ast_op_divide, xpath_type_number, 6); + else if (lexer.contents() == PUGIXML_TEXT("mod")) + return binary_op_t(ast_op_mod, xpath_type_number, 6); + else + return binary_op_t(); + + case lex_equal: + return binary_op_t(ast_op_equal, xpath_type_boolean, 3); + + case lex_not_equal: + return binary_op_t(ast_op_not_equal, xpath_type_boolean, 3); + + case lex_less: + return binary_op_t(ast_op_less, xpath_type_boolean, 4); + + case lex_greater: + return binary_op_t(ast_op_greater, xpath_type_boolean, 4); + + case lex_less_or_equal: + return binary_op_t(ast_op_less_or_equal, xpath_type_boolean, 4); + + case lex_greater_or_equal: + return binary_op_t(ast_op_greater_or_equal, xpath_type_boolean, 4); + + case lex_plus: + return binary_op_t(ast_op_add, xpath_type_number, 5); + + case lex_minus: + return binary_op_t(ast_op_subtract, xpath_type_number, 5); + + case lex_multiply: + return binary_op_t(ast_op_multiply, xpath_type_number, 6); + + case lex_union: + return binary_op_t(ast_op_union, xpath_type_node_set, 7); + + default: + return binary_op_t(); + } + } + }; + + xpath_ast_node* parse_expression_rec(xpath_ast_node* lhs, int limit) { + binary_op_t op = binary_op_t::parse(_lexer); + + while (op.asttype != ast_unknown && op.precedence >= limit) { + _lexer.next(); + + xpath_ast_node* rhs = parse_path_or_unary_expression(); + + binary_op_t nextop = binary_op_t::parse(_lexer); + + while (nextop.asttype != ast_unknown && nextop.precedence > op.precedence) { + rhs = parse_expression_rec(rhs, nextop.precedence); + + nextop = binary_op_t::parse(_lexer); + } + + if (op.asttype == ast_op_union && (lhs->rettype() != xpath_type_node_set || rhs->rettype() != xpath_type_node_set)) + throw_error("Union operator has to be applied to node sets"); + + lhs = new 
(alloc_node()) xpath_ast_node(op.asttype, op.rettype, lhs, rhs); + + op = binary_op_t::parse(_lexer); + } + + return lhs; + } + + // Expr ::= OrExpr + // OrExpr ::= AndExpr | OrExpr 'or' AndExpr + // AndExpr ::= EqualityExpr | AndExpr 'and' EqualityExpr + // EqualityExpr ::= RelationalExpr + // | EqualityExpr '=' RelationalExpr + // | EqualityExpr '!=' RelationalExpr + // RelationalExpr ::= AdditiveExpr + // | RelationalExpr '<' AdditiveExpr + // | RelationalExpr '>' AdditiveExpr + // | RelationalExpr '<=' AdditiveExpr + // | RelationalExpr '>=' AdditiveExpr + // AdditiveExpr ::= MultiplicativeExpr + // | AdditiveExpr '+' MultiplicativeExpr + // | AdditiveExpr '-' MultiplicativeExpr + // MultiplicativeExpr ::= UnaryExpr + // | MultiplicativeExpr '*' UnaryExpr + // | MultiplicativeExpr 'div' UnaryExpr + // | MultiplicativeExpr 'mod' UnaryExpr + xpath_ast_node* parse_expression() { + return parse_expression_rec(parse_path_or_unary_expression(), 0); + } + + xpath_parser(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result): _alloc(alloc), _lexer(query), _query(query), _variables(variables), _result(result) { + } + + xpath_ast_node* parse() { + xpath_ast_node* result = parse_expression(); + + if (_lexer.current() != lex_eof) { + // there are still unparsed tokens left, error + throw_error("Incorrect query"); + } + + return result; + } + + static xpath_ast_node* parse(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result) { + xpath_parser parser(query, variables, alloc, result); + +#ifdef PUGIXML_NO_EXCEPTIONS + int error = setjmp(parser._error_handler); + + return (error == 0) ? 
parser.parse() : 0; +#else + return parser.parse(); +#endif + } +}; + +struct xpath_query_impl { + static xpath_query_impl* create() { + void* memory = xml_memory::allocate(sizeof(xpath_query_impl)); + if (!memory) return 0; + + return new (memory) xpath_query_impl(); + } + + static void destroy(xpath_query_impl* impl) { + // free all allocated pages + impl->alloc.release(); + + // free allocator memory (with the first page) + xml_memory::deallocate(impl); + } + + xpath_query_impl(): root(0), alloc(&block) { + block.next = 0; + block.capacity = sizeof(block.data); + } + + xpath_ast_node* root; + xpath_allocator alloc; + xpath_memory_block block; +}; + +PUGI__FN xpath_string evaluate_string_impl(xpath_query_impl* impl, const xpath_node& n, xpath_stack_data& sd) +{ + if (!impl) return xpath_string(); + +#ifdef PUGIXML_NO_EXCEPTIONS + if (setjmp(sd.error_handler)) return xpath_string(); +#endif + + xpath_context c(n, 1, 1); + + return impl->root->eval_string(c, sd.stack); +} + +PUGI__FN impl::xpath_ast_node* evaluate_node_set_prepare(xpath_query_impl* impl) +{ + if (!impl) return 0; + + if (impl->root->rettype() != xpath_type_node_set) { +#ifdef PUGIXML_NO_EXCEPTIONS + return 0; +#else + xpath_parse_result res; + res.error = "Expression does not evaluate to node set"; + + throw xpath_exception(res); +#endif + } + + return impl->root; +} +PUGI__NS_END + +namespace pugi +{ +#ifndef PUGIXML_NO_EXCEPTIONS +PUGI__FN xpath_exception::xpath_exception(const xpath_parse_result& result_): _result(result_) +{ + assert(_result.error); +} + +PUGI__FN const char* xpath_exception::what() const throw() +{ + return _result.error; +} + +PUGI__FN const xpath_parse_result& xpath_exception::result() const +{ + return _result; +} +#endif + +PUGI__FN xpath_node::xpath_node() +{ +} + +PUGI__FN xpath_node::xpath_node(const xml_node& node_): _node(node_) +{ +} + +PUGI__FN xpath_node::xpath_node(const xml_attribute& attribute_, const xml_node& parent_): _node(attribute_ ? 
parent_ : xml_node()), _attribute(attribute_) +{ +} + +PUGI__FN xml_node xpath_node::node() const +{ + return _attribute ? xml_node() : _node; +} + +PUGI__FN xml_attribute xpath_node::attribute() const +{ + return _attribute; +} + +PUGI__FN xml_node xpath_node::parent() const +{ + return _attribute ? _node : _node.parent(); +} + +PUGI__FN static void unspecified_bool_xpath_node(xpath_node***) +{ +} + +PUGI__FN xpath_node::operator xpath_node::unspecified_bool_type() const +{ + return (_node || _attribute) ? unspecified_bool_xpath_node : 0; +} + +PUGI__FN bool xpath_node::operator!() const +{ + return !(_node || _attribute); +} + +PUGI__FN bool xpath_node::operator==(const xpath_node& n) const +{ + return _node == n._node && _attribute == n._attribute; +} + +PUGI__FN bool xpath_node::operator!=(const xpath_node& n) const +{ + return _node != n._node || _attribute != n._attribute; +} + +#ifdef __BORLANDC__ +PUGI__FN bool operator&&(const xpath_node& lhs, bool rhs) +{ + return (bool)lhs && rhs; +} + +PUGI__FN bool operator||(const xpath_node& lhs, bool rhs) +{ + return (bool)lhs || rhs; +} +#endif + +PUGI__FN void xpath_node_set::_assign(const_iterator begin_, const_iterator end_, type_t type_) +{ + assert(begin_ <= end_); + + size_t size_ = static_cast(end_ - begin_); + + if (size_ <= 1) { + // deallocate old buffer + if (_begin != &_storage) impl::xml_memory::deallocate(_begin); + + // use internal buffer + if (begin_ != end_) _storage = *begin_; + + _begin = &_storage; + _end = &_storage + size_; + _type = type_; + } else { + // make heap copy + xpath_node* storage = static_cast(impl::xml_memory::allocate(size_ * sizeof(xpath_node))); + + if (!storage) { +#ifdef PUGIXML_NO_EXCEPTIONS + return; +#else + throw std::bad_alloc(); +#endif + } + + memcpy(storage, begin_, size_ * sizeof(xpath_node)); + + // deallocate old buffer + if (_begin != &_storage) impl::xml_memory::deallocate(_begin); + + // finalize + _begin = storage; + _end = storage + size_; + _type = type_; + 
} +} + +#if __cplusplus >= 201103 +PUGI__FN void xpath_node_set::_move(xpath_node_set& rhs) +{ + _type = rhs._type; + _storage = rhs._storage; + _begin = (rhs._begin == &rhs._storage) ? &_storage : rhs._begin; + _end = _begin + (rhs._end - rhs._begin); + + rhs._type = type_unsorted; + rhs._begin = &rhs._storage; + rhs._end = rhs._begin; +} +#endif + +PUGI__FN xpath_node_set::xpath_node_set(): _type(type_unsorted), _begin(&_storage), _end(&_storage) +{ +} + +PUGI__FN xpath_node_set::xpath_node_set(const_iterator begin_, const_iterator end_, type_t type_): _type(type_unsorted), _begin(&_storage), _end(&_storage) +{ + _assign(begin_, end_, type_); +} + +PUGI__FN xpath_node_set::~xpath_node_set() +{ + if (_begin != &_storage) + impl::xml_memory::deallocate(_begin); +} + +PUGI__FN xpath_node_set::xpath_node_set(const xpath_node_set& ns): _type(type_unsorted), _begin(&_storage), _end(&_storage) +{ + _assign(ns._begin, ns._end, ns._type); +} + +PUGI__FN xpath_node_set& xpath_node_set::operator=(const xpath_node_set& ns) +{ + if (this == &ns) return *this; + + _assign(ns._begin, ns._end, ns._type); + + return *this; +} + +#if __cplusplus >= 201103 +PUGI__FN xpath_node_set::xpath_node_set(xpath_node_set&& rhs): _type(type_unsorted), _begin(&_storage), _end(&_storage) +{ + _move(rhs); +} + +PUGI__FN xpath_node_set& xpath_node_set::operator=(xpath_node_set&& rhs) +{ + if (this == &rhs) return *this; + + if (_begin != &_storage) + impl::xml_memory::deallocate(_begin); + + _move(rhs); + + return *this; +} +#endif + +PUGI__FN xpath_node_set::type_t xpath_node_set::type() const +{ + return _type; +} + +PUGI__FN size_t xpath_node_set::size() const +{ + return _end - _begin; +} + +PUGI__FN bool xpath_node_set::empty() const +{ + return _begin == _end; +} + +PUGI__FN const xpath_node& xpath_node_set::operator[](size_t index) const +{ + assert(index < size()); + return _begin[index]; +} + +PUGI__FN xpath_node_set::const_iterator xpath_node_set::begin() const +{ + return _begin; +} + 
+PUGI__FN xpath_node_set::const_iterator xpath_node_set::end() const +{ + return _end; +} + +PUGI__FN void xpath_node_set::sort(bool reverse) +{ + _type = impl::xpath_sort(_begin, _end, _type, reverse); +} + +PUGI__FN xpath_node xpath_node_set::first() const +{ + return impl::xpath_first(_begin, _end, _type); +} + +PUGI__FN xpath_parse_result::xpath_parse_result(): error("Internal error"), offset(0) +{ +} + +PUGI__FN xpath_parse_result::operator bool() const +{ + return error == 0; +} + +PUGI__FN const char* xpath_parse_result::description() const +{ + return error ? error : "No error"; +} + +PUGI__FN xpath_variable::xpath_variable(xpath_value_type type_): _type(type_), _next(0) +{ +} + +PUGI__FN const char_t* xpath_variable::name() const +{ + switch (_type) { + case xpath_type_node_set: + return static_cast(this)->name; + + case xpath_type_number: + return static_cast(this)->name; + + case xpath_type_string: + return static_cast(this)->name; + + case xpath_type_boolean: + return static_cast(this)->name; + + default: + assert(!"Invalid variable type"); + return 0; + } +} + +PUGI__FN xpath_value_type xpath_variable::type() const +{ + return _type; +} + +PUGI__FN bool xpath_variable::get_boolean() const +{ + return (_type == xpath_type_boolean) ? static_cast(this)->value : false; +} + +PUGI__FN double xpath_variable::get_number() const +{ + return (_type == xpath_type_number) ? static_cast(this)->value : impl::gen_nan(); +} + +PUGI__FN const char_t* xpath_variable::get_string() const +{ + const char_t* value = (_type == xpath_type_string) ? static_cast(this)->value : 0; + return value ? value : PUGIXML_TEXT(""); +} + +PUGI__FN const xpath_node_set& xpath_variable::get_node_set() const +{ + return (_type == xpath_type_node_set) ? 
static_cast(this)->value : impl::dummy_node_set; +} + +PUGI__FN bool xpath_variable::set(bool value) +{ + if (_type != xpath_type_boolean) return false; + + static_cast(this)->value = value; + return true; +} + +PUGI__FN bool xpath_variable::set(double value) +{ + if (_type != xpath_type_number) return false; + + static_cast(this)->value = value; + return true; +} + +PUGI__FN bool xpath_variable::set(const char_t* value) +{ + if (_type != xpath_type_string) return false; + + impl::xpath_variable_string* var = static_cast(this); + + // duplicate string + size_t size = (impl::strlength(value) + 1) * sizeof(char_t); + + char_t* copy = static_cast(impl::xml_memory::allocate(size)); + if (!copy) return false; + + memcpy(copy, value, size); + + // replace old string + if (var->value) impl::xml_memory::deallocate(var->value); + var->value = copy; + + return true; +} + +PUGI__FN bool xpath_variable::set(const xpath_node_set& value) +{ + if (_type != xpath_type_node_set) return false; + + static_cast(this)->value = value; + return true; +} + +PUGI__FN xpath_variable_set::xpath_variable_set() +{ + for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) + _data[i] = 0; +} + +PUGI__FN xpath_variable_set::~xpath_variable_set() +{ + for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) + _destroy(_data[i]); +} + +PUGI__FN xpath_variable_set::xpath_variable_set(const xpath_variable_set& rhs) +{ + for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) + _data[i] = 0; + + _assign(rhs); +} + +PUGI__FN xpath_variable_set& xpath_variable_set::operator=(const xpath_variable_set& rhs) +{ + if (this == &rhs) return *this; + + _assign(rhs); + + return *this; +} + +#if __cplusplus >= 201103 +PUGI__FN xpath_variable_set::xpath_variable_set(xpath_variable_set&& rhs) +{ + for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) { + _data[i] = rhs._data[i]; + rhs._data[i] = 0; + } +} + +PUGI__FN xpath_variable_set& xpath_variable_set::operator=(xpath_variable_set&& 
rhs) +{ + for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) { + _destroy(_data[i]); + + _data[i] = rhs._data[i]; + rhs._data[i] = 0; + } + + return *this; +} +#endif + +PUGI__FN void xpath_variable_set::_assign(const xpath_variable_set& rhs) +{ + xpath_variable_set temp; + + for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) + if (rhs._data[i] && !_clone(rhs._data[i], &temp._data[i])) + return; + + _swap(temp); +} + +PUGI__FN void xpath_variable_set::_swap(xpath_variable_set& rhs) +{ + for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) { + xpath_variable* chain = _data[i]; + + _data[i] = rhs._data[i]; + rhs._data[i] = chain; + } +} + +PUGI__FN xpath_variable* xpath_variable_set::_find(const char_t* name) const +{ + const size_t hash_size = sizeof(_data) / sizeof(_data[0]); + size_t hash = impl::hash_string(name) % hash_size; + + // look for existing variable + for (xpath_variable* var = _data[hash]; var; var = var->_next) + if (impl::strequal(var->name(), name)) + return var; + + return 0; +} + +PUGI__FN bool xpath_variable_set::_clone(xpath_variable* var, xpath_variable** out_result) +{ + xpath_variable* last = 0; + + while (var) { + // allocate storage for new variable + xpath_variable* nvar = impl::new_xpath_variable(var->_type, var->name()); + if (!nvar) return false; + + // link the variable to the result immediately to handle failures gracefully + if (last) + last->_next = nvar; + else + *out_result = nvar; + + last = nvar; + + // copy the value; this can fail due to out-of-memory conditions + if (!impl::copy_xpath_variable(nvar, var)) return false; + + var = var->_next; + } + + return true; +} + +PUGI__FN void xpath_variable_set::_destroy(xpath_variable* var) +{ + while (var) { + xpath_variable* next = var->_next; + + impl::delete_xpath_variable(var->_type, var); + + var = next; + } +} + +PUGI__FN xpath_variable* xpath_variable_set::add(const char_t* name, xpath_value_type type) +{ + const size_t hash_size = sizeof(_data) / 
sizeof(_data[0]); + size_t hash = impl::hash_string(name) % hash_size; + + // look for existing variable + for (xpath_variable* var = _data[hash]; var; var = var->_next) + if (impl::strequal(var->name(), name)) + return var->type() == type ? var : 0; + + // add new variable + xpath_variable* result = impl::new_xpath_variable(type, name); + + if (result) { + result->_next = _data[hash]; + + _data[hash] = result; + } + + return result; +} + +PUGI__FN bool xpath_variable_set::set(const char_t* name, bool value) +{ + xpath_variable* var = add(name, xpath_type_boolean); + return var ? var->set(value) : false; +} + +PUGI__FN bool xpath_variable_set::set(const char_t* name, double value) +{ + xpath_variable* var = add(name, xpath_type_number); + return var ? var->set(value) : false; +} + +PUGI__FN bool xpath_variable_set::set(const char_t* name, const char_t* value) +{ + xpath_variable* var = add(name, xpath_type_string); + return var ? var->set(value) : false; +} + +PUGI__FN bool xpath_variable_set::set(const char_t* name, const xpath_node_set& value) +{ + xpath_variable* var = add(name, xpath_type_node_set); + return var ? 
var->set(value) : false; +} + +PUGI__FN xpath_variable* xpath_variable_set::get(const char_t* name) +{ + return _find(name); +} + +PUGI__FN const xpath_variable* xpath_variable_set::get(const char_t* name) const +{ + return _find(name); +} + +PUGI__FN xpath_query::xpath_query(const char_t* query, xpath_variable_set* variables): _impl(0) +{ + impl::xpath_query_impl* qimpl = impl::xpath_query_impl::create(); + + if (!qimpl) { +#ifdef PUGIXML_NO_EXCEPTIONS + _result.error = "Out of memory"; +#else + throw std::bad_alloc(); +#endif + } else { + using impl::auto_deleter; // MSVC7 workaround + auto_deleter impl(qimpl, impl::xpath_query_impl::destroy); + + qimpl->root = impl::xpath_parser::parse(query, variables, &qimpl->alloc, &_result); + + if (qimpl->root) { + qimpl->root->optimize(&qimpl->alloc); + + _impl = impl.release(); + _result.error = 0; + } + } +} + +PUGI__FN xpath_query::xpath_query(): _impl(0) +{ +} + +PUGI__FN xpath_query::~xpath_query() +{ + if (_impl) + impl::xpath_query_impl::destroy(static_cast(_impl)); +} + +#if __cplusplus >= 201103 +PUGI__FN xpath_query::xpath_query(xpath_query&& rhs) +{ + _impl = rhs._impl; + _result = rhs._result; + rhs._impl = 0; + rhs._result = xpath_parse_result(); +} + +PUGI__FN xpath_query& xpath_query::operator=(xpath_query&& rhs) +{ + if (this == &rhs) return *this; + + if (_impl) + impl::xpath_query_impl::destroy(static_cast(_impl)); + + _impl = rhs._impl; + _result = rhs._result; + rhs._impl = 0; + rhs._result = xpath_parse_result(); + + return *this; +} +#endif + +PUGI__FN xpath_value_type xpath_query::return_type() const +{ + if (!_impl) return xpath_type_none; + + return static_cast(_impl)->root->rettype(); +} + +PUGI__FN bool xpath_query::evaluate_boolean(const xpath_node& n) const +{ + if (!_impl) return false; + + impl::xpath_context c(n, 1, 1); + impl::xpath_stack_data sd; + +#ifdef PUGIXML_NO_EXCEPTIONS + if (setjmp(sd.error_handler)) return false; +#endif + + return static_cast(_impl)->root->eval_boolean(c, 
sd.stack); +} + +PUGI__FN double xpath_query::evaluate_number(const xpath_node& n) const +{ + if (!_impl) return impl::gen_nan(); + + impl::xpath_context c(n, 1, 1); + impl::xpath_stack_data sd; + +#ifdef PUGIXML_NO_EXCEPTIONS + if (setjmp(sd.error_handler)) return impl::gen_nan(); +#endif + + return static_cast(_impl)->root->eval_number(c, sd.stack); +} + +#ifndef PUGIXML_NO_STL +PUGI__FN string_t xpath_query::evaluate_string(const xpath_node& n) const +{ + impl::xpath_stack_data sd; + + impl::xpath_string r = impl::evaluate_string_impl(static_cast(_impl), n, sd); + + return string_t(r.c_str(), r.length()); +} +#endif + +PUGI__FN size_t xpath_query::evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const +{ + impl::xpath_stack_data sd; + + impl::xpath_string r = impl::evaluate_string_impl(static_cast(_impl), n, sd); + + size_t full_size = r.length() + 1; + + if (capacity > 0) { + size_t size = (full_size < capacity) ? full_size : capacity; + assert(size > 0); + + memcpy(buffer, r.c_str(), (size - 1) * sizeof(char_t)); + buffer[size - 1] = 0; + } + + return full_size; +} + +PUGI__FN xpath_node_set xpath_query::evaluate_node_set(const xpath_node& n) const +{ + impl::xpath_ast_node* root = impl::evaluate_node_set_prepare(static_cast(_impl)); + if (!root) return xpath_node_set(); + + impl::xpath_context c(n, 1, 1); + impl::xpath_stack_data sd; + +#ifdef PUGIXML_NO_EXCEPTIONS + if (setjmp(sd.error_handler)) return xpath_node_set(); +#endif + + impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack, impl::nodeset_eval_all); + + return xpath_node_set(r.begin(), r.end(), r.type()); +} + +PUGI__FN xpath_node xpath_query::evaluate_node(const xpath_node& n) const +{ + impl::xpath_ast_node* root = impl::evaluate_node_set_prepare(static_cast(_impl)); + if (!root) return xpath_node(); + + impl::xpath_context c(n, 1, 1); + impl::xpath_stack_data sd; + +#ifdef PUGIXML_NO_EXCEPTIONS + if (setjmp(sd.error_handler)) return xpath_node(); +#endif + + 
impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack, impl::nodeset_eval_first); + + return r.first(); +} + +PUGI__FN const xpath_parse_result& xpath_query::result() const +{ + return _result; +} + +PUGI__FN static void unspecified_bool_xpath_query(xpath_query***) +{ +} + +PUGI__FN xpath_query::operator xpath_query::unspecified_bool_type() const +{ + return _impl ? unspecified_bool_xpath_query : 0; +} + +PUGI__FN bool xpath_query::operator!() const +{ + return !_impl; +} + +PUGI__FN xpath_node xml_node::select_node(const char_t* query, xpath_variable_set* variables) const +{ + xpath_query q(query, variables); + return select_node(q); +} + +PUGI__FN xpath_node xml_node::select_node(const xpath_query& query) const +{ + return query.evaluate_node(*this); +} + +PUGI__FN xpath_node_set xml_node::select_nodes(const char_t* query, xpath_variable_set* variables) const +{ + xpath_query q(query, variables); + return select_nodes(q); +} + +PUGI__FN xpath_node_set xml_node::select_nodes(const xpath_query& query) const +{ + return query.evaluate_node_set(*this); +} + +PUGI__FN xpath_node xml_node::select_single_node(const char_t* query, xpath_variable_set* variables) const +{ + xpath_query q(query, variables); + return select_single_node(q); +} + +PUGI__FN xpath_node xml_node::select_single_node(const xpath_query& query) const +{ + return query.evaluate_node(*this); +} +} + +#endif + +#ifdef __BORLANDC__ +# pragma option pop +#endif + +// Intel C++ does not properly keep warning state for function templates, +// so popping warning state at the end of translation unit leads to warnings in the middle. 
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +# pragma warning(pop) +#endif + +// Undefine all local macros (makes sure we're not leaking macros in header-only mode) +#undef PUGI__NO_INLINE +#undef PUGI__UNLIKELY +#undef PUGI__STATIC_ASSERT +#undef PUGI__DMC_VOLATILE +#undef PUGI__MSVC_CRT_VERSION +#undef PUGI__NS_BEGIN +#undef PUGI__NS_END +#undef PUGI__FN +#undef PUGI__FN_NO_INLINE +#undef PUGI__GETPAGE_IMPL +#undef PUGI__GETPAGE +#undef PUGI__NODETYPE +#undef PUGI__IS_CHARTYPE_IMPL +#undef PUGI__IS_CHARTYPE +#undef PUGI__IS_CHARTYPEX +#undef PUGI__ENDSWITH +#undef PUGI__SKIPWS +#undef PUGI__OPTSET +#undef PUGI__PUSHNODE +#undef PUGI__POPNODE +#undef PUGI__SCANFOR +#undef PUGI__SCANWHILE +#undef PUGI__SCANWHILE_UNROLL +#undef PUGI__ENDSEG +#undef PUGI__THROW_ERROR +#undef PUGI__CHECK_ERROR + +#endif + +/** + * Copyright (c) 2006-2015 Arseny Kapoulkine + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ diff --git a/mosesdecoder/moses2/pugixml.hpp b/mosesdecoder/moses2/pugixml.hpp new file mode 100644 index 0000000000000000000000000000000000000000..13bf7917b727865b40dfeb4bb547e9add706000b --- /dev/null +++ b/mosesdecoder/moses2/pugixml.hpp @@ -0,0 +1,1391 @@ +/** + * pugixml parser - version 1.7 + * -------------------------------------------------------- + * Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Report bugs and download new versions at http://pugixml.org/ + * + * This library is distributed under the MIT License. See notice at the end + * of this file. + * + * This work is based on the pugxml parser, which is: + * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) + */ + +#ifndef PUGIXML_VERSION +// Define version macro; evaluates to major * 100 + minor so that it's safe to use in less-than comparisons +# define PUGIXML_VERSION 170 +#endif + +// Include user configuration file (this can define various configuration macros) +#include "pugiconfig.hpp" + +#ifndef HEADER_PUGIXML_HPP +#define HEADER_PUGIXML_HPP + +// Include stddef.h for size_t and ptrdiff_t +#include + +// Include exception header for XPath +#if !defined(PUGIXML_NO_XPATH) && !defined(PUGIXML_NO_EXCEPTIONS) +# include +#endif + +// Include STL headers +#ifndef PUGIXML_NO_STL +# include +# include +# include +#endif + +// Macro for deprecated features +#ifndef PUGIXML_DEPRECATED +# if defined(__GNUC__) +# define PUGIXML_DEPRECATED __attribute__((deprecated)) +# elif defined(_MSC_VER) && _MSC_VER >= 1300 +# define PUGIXML_DEPRECATED __declspec(deprecated) +# else +# define PUGIXML_DEPRECATED +# endif +#endif + +// If no API is defined, assume default +#ifndef PUGIXML_API +# define PUGIXML_API +#endif + +// If no API for classes is defined, assume default +#ifndef PUGIXML_CLASS +# define PUGIXML_CLASS PUGIXML_API +#endif + +// If no API for functions is defined, assume default +#ifndef PUGIXML_FUNCTION +# define PUGIXML_FUNCTION PUGIXML_API +#endif + 
+// If the platform is known to have long long support, enable long long functions +#ifndef PUGIXML_HAS_LONG_LONG +# if __cplusplus >= 201103 +# define PUGIXML_HAS_LONG_LONG +# elif defined(_MSC_VER) && _MSC_VER >= 1400 +# define PUGIXML_HAS_LONG_LONG +# endif +#endif + +// Character interface macros +#ifdef PUGIXML_WCHAR_MODE +# define PUGIXML_TEXT(t) L ## t +# define PUGIXML_CHAR wchar_t +#else +# define PUGIXML_TEXT(t) t +# define PUGIXML_CHAR char +#endif + +namespace pugi +{ +// Character type used for all internal storage and operations; depends on PUGIXML_WCHAR_MODE +typedef PUGIXML_CHAR char_t; + +#ifndef PUGIXML_NO_STL +// String type used for operations that work with STL string; depends on PUGIXML_WCHAR_MODE +typedef std::basic_string, std::allocator > string_t; +#endif +} + +// The PugiXML namespace +namespace pugi +{ +// Tree node types +enum xml_node_type { + node_null, // Empty (null) node handle + node_document, // A document tree's absolute root + node_element, // Element tag, i.e. '' + node_pcdata, // Plain character data, i.e. 'text' + node_cdata, // Character data, i.e. '' + node_comment, // Comment tag, i.e. '' + node_pi, // Processing instruction, i.e. '' + node_declaration, // Document declaration, i.e. '' + node_doctype // Document type declaration, i.e. '' +}; + +// Parsing options + +// Minimal parsing mode (equivalent to turning all other flags off). +// Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed. +const unsigned int parse_minimal = 0x0000; + +// This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is off by default. +const unsigned int parse_pi = 0x0001; + +// This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by default. +const unsigned int parse_comments = 0x0002; + +// This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by default. 
+const unsigned int parse_cdata = 0x0004; + +// This flag determines if plain character data (node_pcdata) that consist only of whitespace are added to the DOM tree. +// This flag is off by default; turning it on usually results in slower parsing and more memory consumption. +const unsigned int parse_ws_pcdata = 0x0008; + +// This flag determines if character and entity references are expanded during parsing. This flag is on by default. +const unsigned int parse_escapes = 0x0010; + +// This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default. +const unsigned int parse_eol = 0x0020; + +// This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default. +const unsigned int parse_wconv_attribute = 0x0040; + +// This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default. +const unsigned int parse_wnorm_attribute = 0x0080; + +// This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default. +const unsigned int parse_declaration = 0x0100; + +// This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default. +const unsigned int parse_doctype = 0x0200; + +// This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only +// of whitespace is added to the DOM tree. +// This flag is off by default; turning it on may result in slower parsing and more memory consumption. +const unsigned int parse_ws_pcdata_single = 0x0400; + +// This flag determines if leading and trailing whitespace is to be removed from plain character data. This flag is off by default. 
+const unsigned int parse_trim_pcdata = 0x0800; + +// This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document +// is a valid document. This flag is off by default. +const unsigned int parse_fragment = 0x1000; + +// The default parsing mode. +// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded, +// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. +const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol; + +// The full parsing mode. +// Nodes of all types are added to the DOM tree, character/reference entities are expanded, +// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. +const unsigned int parse_full = parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype; + +// These flags determine the encoding of input data for XML document +enum xml_encoding { + encoding_auto, // Auto-detect input encoding using BOM or < / class xml_object_range +{ +public: + typedef It const_iterator; + typedef It iterator; + + xml_object_range(It b, It e): _begin(b), _end(e) { + } + + It begin() const { + return _begin; + } + It end() const { + return _end; + } + +private: + It _begin, _end; +}; + +// Writer interface for node printing (see xml_node::print) +class PUGIXML_CLASS xml_writer +{ +public: + virtual ~xml_writer() {} + + // Write memory chunk into stream/file/whatever + virtual void write(const void* data, size_t size) = 0; +}; + +// xml_writer implementation for FILE* +class PUGIXML_CLASS xml_writer_file: public xml_writer +{ +public: + // Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio + xml_writer_file(void* file); + + virtual void write(const void* data, size_t size); + +private: + void* file; +}; + +#ifndef PUGIXML_NO_STL +// 
xml_writer implementation for streams +class PUGIXML_CLASS xml_writer_stream: public xml_writer +{ +public: + // Construct writer from an output stream object + xml_writer_stream(std::basic_ostream >& stream); + xml_writer_stream(std::basic_ostream >& stream); + + virtual void write(const void* data, size_t size); + +private: + std::basic_ostream >* narrow_stream; + std::basic_ostream >* wide_stream; +}; +#endif + +// A light-weight handle for manipulating attributes in DOM tree +class PUGIXML_CLASS xml_attribute +{ + friend class xml_attribute_iterator; + friend class xml_node; + +private: + xml_attribute_struct* _attr; + + typedef void (*unspecified_bool_type)(xml_attribute***); + +public: + // Default constructor. Constructs an empty attribute. + xml_attribute(); + + // Constructs attribute from internal pointer + explicit xml_attribute(xml_attribute_struct* attr); + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + + // Comparison operators (compares wrapped attribute pointers) + bool operator==(const xml_attribute& r) const; + bool operator!=(const xml_attribute& r) const; + bool operator<(const xml_attribute& r) const; + bool operator>(const xml_attribute& r) const; + bool operator<=(const xml_attribute& r) const; + bool operator>=(const xml_attribute& r) const; + + // Check if attribute is empty + bool empty() const; + + // Get attribute name/value, or "" if attribute is empty + const char_t* name() const; + const char_t* value() const; + + // Get attribute value, or the default value if attribute is empty + const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const; + + // Get attribute value as a number, or the default value if conversion did not succeed or attribute is empty + int as_int(int def = 0) const; + unsigned int as_uint(unsigned int def = 0) const; + double as_double(double def = 0) const; + float as_float(float def = 0) const; + +#ifdef 
PUGIXML_HAS_LONG_LONG + long long as_llong(long long def = 0) const; + unsigned long long as_ullong(unsigned long long def = 0) const; +#endif + + // Get attribute value as bool (returns true if first character is in '1tTyY' set), or the default value if attribute is empty + bool as_bool(bool def = false) const; + + // Set attribute name/value (returns false if attribute is empty or there is not enough memory) + bool set_name(const char_t* rhs); + bool set_value(const char_t* rhs); + + // Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false") + bool set_value(int rhs); + bool set_value(unsigned int rhs); + bool set_value(double rhs); + bool set_value(float rhs); + bool set_value(bool rhs); + +#ifdef PUGIXML_HAS_LONG_LONG + bool set_value(long long rhs); + bool set_value(unsigned long long rhs); +#endif + + // Set attribute value (equivalent to set_value without error checking) + xml_attribute& operator=(const char_t* rhs); + xml_attribute& operator=(int rhs); + xml_attribute& operator=(unsigned int rhs); + xml_attribute& operator=(double rhs); + xml_attribute& operator=(float rhs); + xml_attribute& operator=(bool rhs); + +#ifdef PUGIXML_HAS_LONG_LONG + xml_attribute& operator=(long long rhs); + xml_attribute& operator=(unsigned long long rhs); +#endif + + // Get next/previous attribute in the attribute list of the parent node + xml_attribute next_attribute() const; + xml_attribute previous_attribute() const; + + // Get hash value (unique for handles to the same object) + size_t hash_value() const; + + // Get internal pointer + xml_attribute_struct* internal_object() const; +}; + +#ifdef __BORLANDC__ +// Borland C++ workaround +bool PUGIXML_FUNCTION operator&&(const xml_attribute& lhs, bool rhs); +bool PUGIXML_FUNCTION operator||(const xml_attribute& lhs, bool rhs); +#endif + +// A light-weight handle for manipulating nodes in DOM tree +class PUGIXML_CLASS xml_node +{ + friend class xml_attribute_iterator; 
+ friend class xml_node_iterator; + friend class xml_named_node_iterator; + +protected: + xml_node_struct* _root; + + typedef void (*unspecified_bool_type)(xml_node***); + +public: + // Default constructor. Constructs an empty node. + xml_node(); + + // Constructs node from internal pointer + explicit xml_node(xml_node_struct* p); + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + + // Comparison operators (compares wrapped node pointers) + bool operator==(const xml_node& r) const; + bool operator!=(const xml_node& r) const; + bool operator<(const xml_node& r) const; + bool operator>(const xml_node& r) const; + bool operator<=(const xml_node& r) const; + bool operator>=(const xml_node& r) const; + + // Check if node is empty. + bool empty() const; + + // Get node type + xml_node_type type() const; + + // Get node name, or "" if node is empty or it has no name + const char_t* name() const; + + // Get node value, or "" if node is empty or it has no value + // Note: For text node.value() does not return "text"! Use child_value() or text() methods to access text inside nodes. 
+ const char_t* value() const; + + // Get attribute list + xml_attribute first_attribute() const; + xml_attribute last_attribute() const; + + // Get children list + xml_node first_child() const; + xml_node last_child() const; + + // Get next/previous sibling in the children list of the parent node + xml_node next_sibling() const; + xml_node previous_sibling() const; + + // Get parent node + xml_node parent() const; + + // Get root of DOM tree this node belongs to + xml_node root() const; + + // Get text object for the current node + xml_text text() const; + + // Get child, attribute or next/previous sibling with the specified name + xml_node child(const char_t* name) const; + xml_attribute attribute(const char_t* name) const; + xml_node next_sibling(const char_t* name) const; + xml_node previous_sibling(const char_t* name) const; + + // Get attribute, starting the search from a hint (and updating hint so that searching for a sequence of attributes is fast) + xml_attribute attribute(const char_t* name, xml_attribute& hint) const; + + // Get child value of current node; that is, value of the first child node of type PCDATA/CDATA + const char_t* child_value() const; + + // Get child value of child with specified name. Equivalent to child(name).child_value(). + const char_t* child_value(const char_t* name) const; + + // Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value) + bool set_name(const char_t* rhs); + bool set_value(const char_t* rhs); + + // Add attribute with specified name. Returns added attribute, or empty attribute on errors. + xml_attribute append_attribute(const char_t* name); + xml_attribute prepend_attribute(const char_t* name); + xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr); + xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr); + + // Add a copy of the specified attribute. 
Returns added attribute, or empty attribute on errors. + xml_attribute append_copy(const xml_attribute& proto); + xml_attribute prepend_copy(const xml_attribute& proto); + xml_attribute insert_copy_after(const xml_attribute& proto, const xml_attribute& attr); + xml_attribute insert_copy_before(const xml_attribute& proto, const xml_attribute& attr); + + // Add child node with specified type. Returns added node, or empty node on errors. + xml_node append_child(xml_node_type type = node_element); + xml_node prepend_child(xml_node_type type = node_element); + xml_node insert_child_after(xml_node_type type, const xml_node& node); + xml_node insert_child_before(xml_node_type type, const xml_node& node); + + // Add child element with specified name. Returns added node, or empty node on errors. + xml_node append_child(const char_t* name); + xml_node prepend_child(const char_t* name); + xml_node insert_child_after(const char_t* name, const xml_node& node); + xml_node insert_child_before(const char_t* name, const xml_node& node); + + // Add a copy of the specified node as a child. Returns added node, or empty node on errors. + xml_node append_copy(const xml_node& proto); + xml_node prepend_copy(const xml_node& proto); + xml_node insert_copy_after(const xml_node& proto, const xml_node& node); + xml_node insert_copy_before(const xml_node& proto, const xml_node& node); + + // Move the specified node to become a child of this node. Returns moved node, or empty node on errors. 
+ xml_node append_move(const xml_node& moved); + xml_node prepend_move(const xml_node& moved); + xml_node insert_move_after(const xml_node& moved, const xml_node& node); + xml_node insert_move_before(const xml_node& moved, const xml_node& node); + + // Remove specified attribute + bool remove_attribute(const xml_attribute& a); + bool remove_attribute(const char_t* name); + + // Remove specified child + bool remove_child(const xml_node& n); + bool remove_child(const char_t* name); + + // Parses buffer as an XML document fragment and appends all nodes as children of the current node. + // Copies/converts the buffer, so it may be deleted or changed after the function returns. + // Note: append_buffer allocates memory that has the lifetime of the owning document; removing the appended nodes does not immediately reclaim that memory. + xml_parse_result append_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Find attribute using predicate. Returns first attribute for which predicate returned true. + template xml_attribute find_attribute(Predicate pred) const { + if (!_root) return xml_attribute(); + + for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute()) + if (pred(attrib)) + return attrib; + + return xml_attribute(); + } + + // Find child node using predicate. Returns first child for which predicate returned true. + template xml_node find_child(Predicate pred) const { + if (!_root) return xml_node(); + + for (xml_node node = first_child(); node; node = node.next_sibling()) + if (pred(node)) + return node; + + return xml_node(); + } + + // Find node from subtree using predicate. Returns first node from subtree (depth-first), for which predicate returned true. 
+ template xml_node find_node(Predicate pred) const { + if (!_root) return xml_node(); + + xml_node cur = first_child(); + + while (cur._root && cur._root != _root) { + if (pred(cur)) return cur; + + if (cur.first_child()) cur = cur.first_child(); + else if (cur.next_sibling()) cur = cur.next_sibling(); + else { + while (!cur.next_sibling() && cur._root != _root) cur = cur.parent(); + + if (cur._root != _root) cur = cur.next_sibling(); + } + } + + return xml_node(); + } + + // Find child node by attribute name/value + xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const; + xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const; + +#ifndef PUGIXML_NO_STL + // Get the absolute node path from root as a text string. + string_t path(char_t delimiter = '/') const; +#endif + + // Search for a node by path consisting of node names and . or .. elements. + xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const; + + // Recursively traverse subtree with xml_tree_walker + bool traverse(xml_tree_walker& walker); + +#ifndef PUGIXML_NO_XPATH + // Select single node by evaluating XPath query. Returns first node from the resulting node set. + xpath_node select_node(const char_t* query, xpath_variable_set* variables = 0) const; + xpath_node select_node(const xpath_query& query) const; + + // Select node set by evaluating XPath query + xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = 0) const; + xpath_node_set select_nodes(const xpath_query& query) const; + + // (deprecated: use select_node instead) Select single node by evaluating XPath query. 
+ xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const; + xpath_node select_single_node(const xpath_query& query) const; + +#endif + + // Print subtree using a writer object + void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const; + +#ifndef PUGIXML_NO_STL + // Print subtree to stream + void print(std::basic_ostream >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const; + void print(std::basic_ostream >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const; +#endif + + // Child nodes iterators + typedef xml_node_iterator iterator; + + iterator begin() const; + iterator end() const; + + // Attribute iterators + typedef xml_attribute_iterator attribute_iterator; + + attribute_iterator attributes_begin() const; + attribute_iterator attributes_end() const; + + // Range-based for support + xml_object_range children() const; + xml_object_range children(const char_t* name) const; + xml_object_range attributes() const; + + // Get node offset in parsed file/string (in char_t units) for debugging purposes + ptrdiff_t offset_debug() const; + + // Get hash value (unique for handles to the same object) + size_t hash_value() const; + + // Get internal pointer + xml_node_struct* internal_object() const; +}; + +#ifdef __BORLANDC__ +// Borland C++ workaround +bool PUGIXML_FUNCTION operator&&(const xml_node& lhs, bool rhs); +bool PUGIXML_FUNCTION operator||(const xml_node& lhs, bool rhs); +#endif + +// A helper for working with text inside PCDATA nodes +class PUGIXML_CLASS xml_text +{ + friend class xml_node; + + xml_node_struct* _root; + + typedef void (*unspecified_bool_type)(xml_text***); + + explicit xml_text(xml_node_struct* root); + + xml_node_struct* 
_data_new(); + xml_node_struct* _data() const; + +public: + // Default constructor. Constructs an empty object. + xml_text(); + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + + // Check if text object is empty + bool empty() const; + + // Get text, or "" if object is empty + const char_t* get() const; + + // Get text, or the default value if object is empty + const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const; + + // Get text as a number, or the default value if conversion did not succeed or object is empty + int as_int(int def = 0) const; + unsigned int as_uint(unsigned int def = 0) const; + double as_double(double def = 0) const; + float as_float(float def = 0) const; + +#ifdef PUGIXML_HAS_LONG_LONG + long long as_llong(long long def = 0) const; + unsigned long long as_ullong(unsigned long long def = 0) const; +#endif + + // Get text as bool (returns true if first character is in '1tTyY' set), or the default value if object is empty + bool as_bool(bool def = false) const; + + // Set text (returns false if object is empty or there is not enough memory) + bool set(const char_t* rhs); + + // Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false") + bool set(int rhs); + bool set(unsigned int rhs); + bool set(double rhs); + bool set(float rhs); + bool set(bool rhs); + +#ifdef PUGIXML_HAS_LONG_LONG + bool set(long long rhs); + bool set(unsigned long long rhs); +#endif + + // Set text (equivalent to set without error checking) + xml_text& operator=(const char_t* rhs); + xml_text& operator=(int rhs); + xml_text& operator=(unsigned int rhs); + xml_text& operator=(double rhs); + xml_text& operator=(float rhs); + xml_text& operator=(bool rhs); + +#ifdef PUGIXML_HAS_LONG_LONG + xml_text& operator=(long long rhs); + xml_text& operator=(unsigned long long rhs); +#endif + + // Get the data node (node_pcdata or node_cdata) for 
this object + xml_node data() const; +}; + +#ifdef __BORLANDC__ +// Borland C++ workaround +bool PUGIXML_FUNCTION operator&&(const xml_text& lhs, bool rhs); +bool PUGIXML_FUNCTION operator||(const xml_text& lhs, bool rhs); +#endif + +// Child node iterator (a bidirectional iterator over a collection of xml_node) +class PUGIXML_CLASS xml_node_iterator +{ + friend class xml_node; + +private: + mutable xml_node _wrap; + xml_node _parent; + + xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent); + +public: + // Iterator traits + typedef ptrdiff_t difference_type; + typedef xml_node value_type; + typedef xml_node* pointer; + typedef xml_node& reference; + +#ifndef PUGIXML_NO_STL + typedef std::bidirectional_iterator_tag iterator_category; +#endif + + // Default constructor + xml_node_iterator(); + + // Construct an iterator which points to the specified node + xml_node_iterator(const xml_node& node); + + // Iterator operators + bool operator==(const xml_node_iterator& rhs) const; + bool operator!=(const xml_node_iterator& rhs) const; + + xml_node& operator*() const; + xml_node* operator->() const; + + const xml_node_iterator& operator++(); + xml_node_iterator operator++(int); + + const xml_node_iterator& operator--(); + xml_node_iterator operator--(int); +}; + +// Attribute iterator (a bidirectional iterator over a collection of xml_attribute) +class PUGIXML_CLASS xml_attribute_iterator +{ + friend class xml_node; + +private: + mutable xml_attribute _wrap; + xml_node _parent; + + xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent); + +public: + // Iterator traits + typedef ptrdiff_t difference_type; + typedef xml_attribute value_type; + typedef xml_attribute* pointer; + typedef xml_attribute& reference; + +#ifndef PUGIXML_NO_STL + typedef std::bidirectional_iterator_tag iterator_category; +#endif + + // Default constructor + xml_attribute_iterator(); + + // Construct an iterator which points to the specified attribute + 
xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent); + + // Iterator operators + bool operator==(const xml_attribute_iterator& rhs) const; + bool operator!=(const xml_attribute_iterator& rhs) const; + + xml_attribute& operator*() const; + xml_attribute* operator->() const; + + const xml_attribute_iterator& operator++(); + xml_attribute_iterator operator++(int); + + const xml_attribute_iterator& operator--(); + xml_attribute_iterator operator--(int); +}; + +// Named node range helper +class PUGIXML_CLASS xml_named_node_iterator +{ + friend class xml_node; + +public: + // Iterator traits + typedef ptrdiff_t difference_type; + typedef xml_node value_type; + typedef xml_node* pointer; + typedef xml_node& reference; + +#ifndef PUGIXML_NO_STL + typedef std::bidirectional_iterator_tag iterator_category; +#endif + + // Default constructor + xml_named_node_iterator(); + + // Construct an iterator which points to the specified node + xml_named_node_iterator(const xml_node& node, const char_t* name); + + // Iterator operators + bool operator==(const xml_named_node_iterator& rhs) const; + bool operator!=(const xml_named_node_iterator& rhs) const; + + xml_node& operator*() const; + xml_node* operator->() const; + + const xml_named_node_iterator& operator++(); + xml_named_node_iterator operator++(int); + + const xml_named_node_iterator& operator--(); + xml_named_node_iterator operator--(int); + +private: + mutable xml_node _wrap; + xml_node _parent; + const char_t* _name; + + xml_named_node_iterator(xml_node_struct* ref, xml_node_struct* parent, const char_t* name); +}; + +// Abstract tree walker class (see xml_node::traverse) +class PUGIXML_CLASS xml_tree_walker +{ + friend class xml_node; + +private: + int _depth; + +protected: + // Get current traversal depth + int depth() const; + +public: + xml_tree_walker(); + virtual ~xml_tree_walker(); + + // Callback that is called when traversal begins + virtual bool begin(xml_node& node); + + // Callback that is 
called for each node traversed + virtual bool for_each(xml_node& node) = 0; + + // Callback that is called when traversal ends + virtual bool end(xml_node& node); +}; + +// Parsing status, returned as part of xml_parse_result object +enum xml_parse_status { + status_ok = 0, // No error + + status_file_not_found, // File was not found during load_file() + status_io_error, // Error reading from file/stream + status_out_of_memory, // Could not allocate memory + status_internal_error, // Internal error occurred + + status_unrecognized_tag, // Parser could not determine tag type + + status_bad_pi, // Parsing error occurred while parsing document declaration/processing instruction + status_bad_comment, // Parsing error occurred while parsing comment + status_bad_cdata, // Parsing error occurred while parsing CDATA section + status_bad_doctype, // Parsing error occurred while parsing document type declaration + status_bad_pcdata, // Parsing error occurred while parsing PCDATA section + status_bad_start_element, // Parsing error occurred while parsing start element tag + status_bad_attribute, // Parsing error occurred while parsing element attribute + status_bad_end_element, // Parsing error occurred while parsing end element tag + status_end_element_mismatch,// There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag) + + status_append_invalid_root, // Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer) + + status_no_document_element // Parsing resulted in a document without element nodes +}; + +// Parsing result +struct PUGIXML_CLASS xml_parse_result { + // Parsing status (see xml_parse_status) + xml_parse_status status; + + // Last parsed offset (in char_t units from start of input data) + ptrdiff_t offset; + + // Source document encoding + xml_encoding encoding; + + // Default constructor, initializes object to failed state + 
xml_parse_result(); + + // Cast to bool operator + operator bool() const; + + // Get error description + const char* description() const; +}; + +// Document class (DOM tree root) +class PUGIXML_CLASS xml_document: public xml_node +{ +private: + char_t* _buffer; + + char _memory[192]; + + // Non-copyable semantics + xml_document(const xml_document&); + xml_document& operator=(const xml_document&); + + void create(); + void destroy(); + +public: + // Default constructor, makes empty document + xml_document(); + + // Destructor, invalidates all node/attribute handles to this document + ~xml_document(); + + // Removes all nodes, leaving the empty document + void reset(); + + // Removes all nodes, then copies the entire contents of the specified document + void reset(const xml_document& proto); + +#ifndef PUGIXML_NO_STL + // Load document from stream. + xml_parse_result load(std::basic_istream >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + xml_parse_result load(std::basic_istream >& stream, unsigned int options = parse_default); +#endif + + // (deprecated: use load_string instead) Load document from zero-terminated string. No encoding conversions are applied. + xml_parse_result load(const char_t* contents, unsigned int options = parse_default); + + // Load document from zero-terminated string. No encoding conversions are applied. + xml_parse_result load_string(const char_t* contents, unsigned int options = parse_default); + + // Load document from file + xml_parse_result load_file(const char* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + xml_parse_result load_file(const wchar_t* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Load document from buffer. Copies/converts the buffer, so it may be deleted or changed after the function returns. 
+ xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data). + // You should ensure that buffer data will persist throughout the document's lifetime, and free the buffer memory manually once document is destroyed. + xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data). + // You should allocate the buffer with pugixml allocation function; document will free the buffer when it is no longer needed (you can't use it anymore). + xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Save XML document to writer (semantics is slightly different from xml_node::print, see documentation for details). + void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + +#ifndef PUGIXML_NO_STL + // Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details). 
+ void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const; +#endif + + // Save XML to file + bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + bool save_file(const wchar_t* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + + // Get document element + xml_node document_element() const; +}; + +#ifndef PUGIXML_NO_XPATH +// XPath query return type +enum xpath_value_type { + xpath_type_none, // Unknown type (query failed to compile) + xpath_type_node_set, // Node set (xpath_node_set) + xpath_type_number, // Number + xpath_type_string, // String + xpath_type_boolean // Boolean +}; + +// XPath parsing result +struct PUGIXML_CLASS xpath_parse_result { + // Error message (0 if no error) + const char* error; + + // Last parsed offset (in char_t units from string start) + ptrdiff_t offset; + + // Default constructor, initializes object to failed state + xpath_parse_result(); + + // Cast to bool operator + operator bool() const; + + // Get error description + const char* description() const; +}; + +// A single XPath variable +class PUGIXML_CLASS xpath_variable +{ + friend class xpath_variable_set; + +protected: + xpath_value_type _type; + xpath_variable* _next; + + xpath_variable(xpath_value_type type); + + // Non-copyable semantics + xpath_variable(const xpath_variable&); + xpath_variable& operator=(const xpath_variable&); + +public: + // Get variable name + const char_t* name() const; + + // Get variable type + xpath_value_type type() const; + + // Get variable value; no type conversion is performed, default value (false, NaN, empty string, empty 
node set) is returned on type mismatch error + bool get_boolean() const; + double get_number() const; + const char_t* get_string() const; + const xpath_node_set& get_node_set() const; + + // Set variable value; no type conversion is performed, false is returned on type mismatch error + bool set(bool value); + bool set(double value); + bool set(const char_t* value); + bool set(const xpath_node_set& value); +}; + +// A set of XPath variables +class PUGIXML_CLASS xpath_variable_set +{ +private: + xpath_variable* _data[64]; + + void _assign(const xpath_variable_set& rhs); + void _swap(xpath_variable_set& rhs); + + xpath_variable* _find(const char_t* name) const; + + static bool _clone(xpath_variable* var, xpath_variable** out_result); + static void _destroy(xpath_variable* var); + +public: + // Default constructor/destructor + xpath_variable_set(); + ~xpath_variable_set(); + + // Copy constructor/assignment operator + xpath_variable_set(const xpath_variable_set& rhs); + xpath_variable_set& operator=(const xpath_variable_set& rhs); + +#if __cplusplus >= 201103 + // Move semantics support + xpath_variable_set(xpath_variable_set&& rhs); + xpath_variable_set& operator=(xpath_variable_set&& rhs); +#endif + + // Add a new variable or get the existing one, if the types match + xpath_variable* add(const char_t* name, xpath_value_type type); + + // Set value of an existing variable; no type conversion is performed, false is returned if there is no such variable or if types mismatch + bool set(const char_t* name, bool value); + bool set(const char_t* name, double value); + bool set(const char_t* name, const char_t* value); + bool set(const char_t* name, const xpath_node_set& value); + + // Get existing variable by name + xpath_variable* get(const char_t* name); + const xpath_variable* get(const char_t* name) const; +}; + +// A compiled XPath query object +class PUGIXML_CLASS xpath_query +{ +private: + void* _impl; + xpath_parse_result _result; + + typedef void 
(*unspecified_bool_type)(xpath_query***); + + // Non-copyable semantics + xpath_query(const xpath_query&); + xpath_query& operator=(const xpath_query&); + +public: + // Construct a compiled object from XPath expression. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on compilation errors. + explicit xpath_query(const char_t* query, xpath_variable_set* variables = 0); + + // Constructor + xpath_query(); + + // Destructor + ~xpath_query(); + +#if __cplusplus >= 201103 + // Move semantics support + xpath_query(xpath_query&& rhs); + xpath_query& operator=(xpath_query&& rhs); +#endif + + // Get query expression return type + xpath_value_type return_type() const; + + // Evaluate expression as boolean value in the specified context; performs type conversion if necessary. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. + bool evaluate_boolean(const xpath_node& n) const; + + // Evaluate expression as double value in the specified context; performs type conversion if necessary. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. + double evaluate_number(const xpath_node& n) const; + +#ifndef PUGIXML_NO_STL + // Evaluate expression as string value in the specified context; performs type conversion if necessary. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. + string_t evaluate_string(const xpath_node& n) const; +#endif + + // Evaluate expression as string value in the specified context; performs type conversion if necessary. + // At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero). + // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. + // If PUGIXML_NO_EXCEPTIONS is defined, returns empty set instead. 
+ size_t evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const; + + // Evaluate expression as node set in the specified context. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors. + // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node set instead. + xpath_node_set evaluate_node_set(const xpath_node& n) const; + + // Evaluate expression as node set in the specified context. + // Return first node in document order, or empty node if node set is empty. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors. + // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node instead. + xpath_node evaluate_node(const xpath_node& n) const; + + // Get parsing result (used to get compilation errors in PUGIXML_NO_EXCEPTIONS mode) + const xpath_parse_result& result() const; + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; +}; + +#ifndef PUGIXML_NO_EXCEPTIONS +// XPath exception class +class PUGIXML_CLASS xpath_exception: public std::exception +{ +private: + xpath_parse_result _result; + +public: + // Construct exception from parse result + explicit xpath_exception(const xpath_parse_result& result); + + // Get error message + virtual const char* what() const throw(); + + // Get parse result + const xpath_parse_result& result() const; +}; +#endif + +// XPath node class (either xml_node or xml_attribute) +class PUGIXML_CLASS xpath_node +{ +private: + xml_node _node; + xml_attribute _attribute; + + typedef void (*unspecified_bool_type)(xpath_node***); + +public: + // Default constructor; constructs empty XPath node + xpath_node(); + + // Construct XPath node from XML node/attribute + xpath_node(const xml_node& node); + xpath_node(const xml_attribute& attribute, const xml_node& parent); + + // Get node/attribute, if any + 
xml_node node() const; + xml_attribute attribute() const; + + // Get parent of contained node/attribute + xml_node parent() const; + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + + // Comparison operators + bool operator==(const xpath_node& n) const; + bool operator!=(const xpath_node& n) const; +}; + +#ifdef __BORLANDC__ +// Borland C++ workaround +bool PUGIXML_FUNCTION operator&&(const xpath_node& lhs, bool rhs); +bool PUGIXML_FUNCTION operator||(const xpath_node& lhs, bool rhs); +#endif + +// A fixed-size collection of XPath nodes +class PUGIXML_CLASS xpath_node_set +{ +public: + // Collection type + enum type_t { + type_unsorted, // Not ordered + type_sorted, // Sorted by document order (ascending) + type_sorted_reverse // Sorted by document order (descending) + }; + + // Constant iterator type + typedef const xpath_node* const_iterator; + + // We define non-constant iterator to be the same as constant iterator so that various generic algorithms (i.e. boost foreach) work + typedef const xpath_node* iterator; + + // Default constructor. Constructs empty set. 
+ xpath_node_set(); + + // Constructs a set from iterator range; data is not checked for duplicates and is not sorted according to provided type, so be careful + xpath_node_set(const_iterator begin, const_iterator end, type_t type = type_unsorted); + + // Destructor + ~xpath_node_set(); + + // Copy constructor/assignment operator + xpath_node_set(const xpath_node_set& ns); + xpath_node_set& operator=(const xpath_node_set& ns); + +#if __cplusplus >= 201103 + // Move semantics support + xpath_node_set(xpath_node_set&& rhs); + xpath_node_set& operator=(xpath_node_set&& rhs); +#endif + + // Get collection type + type_t type() const; + + // Get collection size + size_t size() const; + + // Indexing operator + const xpath_node& operator[](size_t index) const; + + // Collection iterators + const_iterator begin() const; + const_iterator end() const; + + // Sort the collection in ascending/descending order by document order + void sort(bool reverse = false); + + // Get first node in the collection by document order + xpath_node first() const; + + // Check if collection is empty + bool empty() const; + +private: + type_t _type; + + xpath_node _storage; + + xpath_node* _begin; + xpath_node* _end; + + void _assign(const_iterator begin, const_iterator end, type_t type); + void _move(xpath_node_set& rhs); +}; +#endif + +#ifndef PUGIXML_NO_STL +// Convert wide string to UTF8 +std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const wchar_t* str); +std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const std::basic_string, std::allocator >& str); + +// Convert UTF8 to wide string +std::basic_string, std::allocator > PUGIXML_FUNCTION as_wide(const char* str); +std::basic_string, std::allocator > PUGIXML_FUNCTION as_wide(const std::basic_string, std::allocator >& str); +#endif + +// Memory allocation function interface; returns pointer to allocated memory or NULL on failure +typedef void* (*allocation_function)(size_t size); + +// Memory deallocation function 
interface +typedef void (*deallocation_function)(void* ptr); + +// Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions. +void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate); + +// Get current memory management functions +allocation_function PUGIXML_FUNCTION get_memory_allocation_function(); +deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function(); +} + +#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC)) +namespace std +{ +// Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier) +std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_node_iterator&); +std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_attribute_iterator&); +std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_named_node_iterator&); +} +#endif + +#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC) +namespace std +{ +// Workarounds for (non-standard) iterator category detection +std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_node_iterator&); +std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_attribute_iterator&); +std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_named_node_iterator&); +} +#endif + +#endif + +// Make sure implementation is included in header-only mode +// Use macro expansion in #include to work around QMake (QTBUG-11923) +#if defined(PUGIXML_HEADER_ONLY) && !defined(PUGIXML_SOURCE) +# define PUGIXML_SOURCE "pugixml.cpp" +# include PUGIXML_SOURCE +#endif + +/** + * Copyright (c) 2006-2015 Arseny Kapoulkine + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software 
without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ diff --git a/mosesdecoder/phrase-extract/extract-mixed-syntax/AlignedSentence.h b/mosesdecoder/phrase-extract/extract-mixed-syntax/AlignedSentence.h new file mode 100644 index 0000000000000000000000000000000000000000..30c3d06a26d2b98d5f5e0cc1798b754f91afe392 --- /dev/null +++ b/mosesdecoder/phrase-extract/extract-mixed-syntax/AlignedSentence.h @@ -0,0 +1,54 @@ +/* + * AlignedSentence.h + * + * Created on: 18 Feb 2014 + * Author: s0565741 + */ +#pragma once + +#include +#include +#include "ConsistentPhrases.h" +#include "Phrase.h" +#include "moses/TypeDef.h" + +class Parameter; + +class AlignedSentence +{ +public: + AlignedSentence(int lineNum) + :m_lineNum(lineNum) + {} + + AlignedSentence(int lineNum, + const std::string &source, + const std::string &target, + const std::string &alignment); + virtual ~AlignedSentence(); + virtual void Create(const Parameter ¶ms); + + const Phrase &GetPhrase(Moses::FactorDirection direction) const { + return (direction == Moses::Input) ? 
m_source : m_target; + } + + const ConsistentPhrases &GetConsistentPhrases() const { + return m_consistentPhrases; + } + + virtual std::string Debug() const; + + int m_lineNum; +protected: + Phrase m_source, m_target; + ConsistentPhrases m_consistentPhrases; + + void CreateConsistentPhrases(const Parameter ¶ms); + void PopulateWordVec(Phrase &vec, const std::string &line); + + // m_source and m_target MUST be populated before calling this + void PopulateAlignment(const std::string &line); + std::vector GetSourceAlignmentCount() const; +}; + + diff --git a/mosesdecoder/phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.cpp b/mosesdecoder/phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cb088f5b4dd4d8823a297be7b208c1d0a6a7eebc --- /dev/null +++ b/mosesdecoder/phrase-extract/extract-mixed-syntax/AlignedSentenceSyntax.cpp @@ -0,0 +1,182 @@ +/* + * AlignedSentenceSyntax.cpp + * + * Created on: 26 Feb 2014 + * Author: hieu + */ + +#include "AlignedSentenceSyntax.h" +#include "Parameter.h" +#include "pugixml.hpp" +#include "moses/Util.h" + +using namespace std; + +AlignedSentenceSyntax::AlignedSentenceSyntax(int lineNum, + const std::string &source, + const std::string &target, + const std::string &alignment) + :AlignedSentence(lineNum) + ,m_sourceStr(source) + ,m_targetStr(target) + ,m_alignmentStr(alignment) +{ +} + +AlignedSentenceSyntax::~AlignedSentenceSyntax() +{ + // TODO Auto-generated destructor stub +} + +void AlignedSentenceSyntax::Populate(bool isSyntax, int mixedSyntaxType, const Parameter ¶ms, + string line, Phrase &phrase, SyntaxTree &tree) +{ + // parse source and target string + if (isSyntax) { + line = "" + line + ""; + XMLParse(phrase, tree, line, params); + + if (mixedSyntaxType != 0) { + // mixed syntax. 
Always add [X] where there isn't 1 + tree.SetHieroLabel(params.hieroNonTerm); + if (mixedSyntaxType == 2) { + tree.AddToAll(params.hieroNonTerm); + } + } + } else { + PopulateWordVec(phrase, line); + tree.SetHieroLabel(params.hieroNonTerm); + } + +} + +void AlignedSentenceSyntax::Create(const Parameter ¶ms) +{ + Populate(params.sourceSyntax, params.mixedSyntaxType, params, m_sourceStr, + m_source, m_sourceTree); + Populate(params.targetSyntax, params.mixedSyntaxType, params, m_targetStr, + m_target, m_targetTree); + + PopulateAlignment(m_alignmentStr); + CreateConsistentPhrases(params); + + // create labels + CreateNonTerms(); +} + +void Escape(string &text) +{ + text = Moses::Replace(text, "&", "&"); + text = Moses::Replace(text, "|", "|"); + text = Moses::Replace(text, "<", "<"); + text = Moses::Replace(text, ">", ">"); + text = Moses::Replace(text, "'", "'"); + text = Moses::Replace(text, "\"", """); + text = Moses::Replace(text, "[", "["); + text = Moses::Replace(text, "]", "]"); + +} + +void AlignedSentenceSyntax::XMLParse(Phrase &output, + SyntaxTree &tree, + const pugi::xml_node &parentNode, + const Parameter ¶ms) +{ + int childNum = 0; + for (pugi::xml_node childNode = parentNode.first_child(); childNode; childNode = childNode.next_sibling()) { + string nodeName = childNode.name(); + + // span label + string label; + int startPos = output.size(); + + if (!nodeName.empty()) { + pugi::xml_attribute attribute = childNode.attribute("label"); + label = attribute.as_string(); + + // recursively call this function. For proper recursive trees + XMLParse(output, tree, childNode, params); + } + + + + // fill phrase vector + string text = childNode.value(); + Escape(text); + //cerr << childNum << " " << label << "=" << text << endl; + + std::vector toks; + Moses::Tokenize(toks, text); + + for (size_t i = 0; i < toks.size(); ++i) { + const string &tok = toks[i]; + Word *word = new Word(output.size(), tok); + output.push_back(word); + } + + // is it a labelled span? 
+ int endPos = output.size() - 1; + + // fill syntax labels + if (!label.empty()) { + label = "[" + label + "]"; + tree.Add(startPos, endPos, label, params); + } + + ++childNum; + } + +} + +void AlignedSentenceSyntax::XMLParse(Phrase &output, + SyntaxTree &tree, + const std::string input, + const Parameter ¶ms) +{ + pugi::xml_document doc; + pugi::xml_parse_result result = doc.load(input.c_str(), + pugi::parse_default | pugi::parse_comments); + + pugi::xml_node topNode = doc.child("xml"); + XMLParse(output, tree, topNode, params); +} + +void AlignedSentenceSyntax::CreateNonTerms() +{ + for (int sourceStart = 0; sourceStart < m_source.size(); ++sourceStart) { + for (int sourceEnd = sourceStart; sourceEnd < m_source.size(); ++sourceEnd) { + ConsistentPhrases::Coll &coll = m_consistentPhrases.GetColl(sourceStart, sourceEnd); + const SyntaxTree::Labels &sourceLabels = m_sourceTree.Find(sourceStart, sourceEnd); + + ConsistentPhrases::Coll::iterator iter; + for (iter = coll.begin(); iter != coll.end(); ++iter) { + ConsistentPhrase &cp = **iter; + + int targetStart = cp.corners[2]; + int targetEnd = cp.corners[3]; + const SyntaxTree::Labels &targetLabels = m_targetTree.Find(targetStart, targetEnd); + + CreateNonTerms(cp, sourceLabels, targetLabels); + } + } + } + +} + +void AlignedSentenceSyntax::CreateNonTerms(ConsistentPhrase &cp, + const SyntaxTree::Labels &sourceLabels, + const SyntaxTree::Labels &targetLabels) +{ + SyntaxTree::Labels::const_iterator iterSource; + for (iterSource = sourceLabels.begin(); iterSource != sourceLabels.end(); ++iterSource) { + const string &sourceLabel = *iterSource; + + SyntaxTree::Labels::const_iterator iterTarget; + for (iterTarget = targetLabels.begin(); iterTarget != targetLabels.end(); ++iterTarget) { + const string &targetLabel = *iterTarget; + cp.AddNonTerms(sourceLabel, targetLabel); + } + } +} + + diff --git a/mosesdecoder/phrase-extract/extract-mixed-syntax/ConsistentPhrases.h 
b/mosesdecoder/phrase-extract/extract-mixed-syntax/ConsistentPhrases.h new file mode 100644 index 0000000000000000000000000000000000000000..1347b46659a42e4a446fe75e9e2233c23819a4eb --- /dev/null +++ b/mosesdecoder/phrase-extract/extract-mixed-syntax/ConsistentPhrases.h @@ -0,0 +1,41 @@ +/* + * ConsistentPhrases.h + * + * Created on: 20 Feb 2014 + * Author: hieu + */ +#pragma once + +#include +#include +#include +#include "ConsistentPhrase.h" + +class Word; +class Parameter; + +class ConsistentPhrases +{ +public: + typedef std::set Coll; + + ConsistentPhrases(); + virtual ~ConsistentPhrases(); + + void Initialize(size_t size); + + void Add(int sourceStart, int sourceEnd, + int targetStart, int targetEnd, + const Parameter ¶ms); + + void AddHieroNonTerms(const Parameter ¶ms); + + const Coll &GetColl(int sourceStart, int sourceEnd) const; + Coll &GetColl(int sourceStart, int sourceEnd); + + std::string Debug() const; + +protected: + std::vector< std::vector > m_coll; +}; + diff --git a/mosesdecoder/phrase-extract/extract-mixed-syntax/InputFileStream.h b/mosesdecoder/phrase-extract/extract-mixed-syntax/InputFileStream.h new file mode 100644 index 0000000000000000000000000000000000000000..5de41623787bf0a8237f61f438b484f26f6290b1 --- /dev/null +++ b/mosesdecoder/phrase-extract/extract-mixed-syntax/InputFileStream.h @@ -0,0 +1,48 @@ +// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#ifndef moses_InputFileStream_h +#define moses_InputFileStream_h + +#include +#include +#include + +namespace Moses +{ + +/** Used in place of std::istream, can read zipped files if it ends in .gz + */ +class InputFileStream : public std::istream +{ +protected: + std::streambuf *m_streambuf; +public: + + explicit InputFileStream(const std::string &filePath); + ~InputFileStream(); + + void Close(); +}; + +} + +#endif diff --git a/mosesdecoder/phrase-extract/extract-mixed-syntax/Main.h b/mosesdecoder/phrase-extract/extract-mixed-syntax/Main.h new file mode 100644 index 0000000000000000000000000000000000000000..9744ba389b1b36dd7afab06c4b36379215db6780 --- /dev/null +++ b/mosesdecoder/phrase-extract/extract-mixed-syntax/Main.h @@ -0,0 +1,12 @@ +/* + * Main.h + * + * Created on: 28 Feb 2014 + * Author: hieu + */ +#pragma once + +#include "OutputFileStream.h" + +void CreateGlueGrammar(Moses::OutputFileStream &glueFile); + diff --git a/mosesdecoder/phrase-extract/extract-mixed-syntax/NonTerm.cpp b/mosesdecoder/phrase-extract/extract-mixed-syntax/NonTerm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..456be8932d320e2914e61ea2d0ecac2ca2946c98 --- /dev/null +++ b/mosesdecoder/phrase-extract/extract-mixed-syntax/NonTerm.cpp @@ -0,0 +1,69 @@ +/* + * NonTerm.cpp + * + * Created on: 22 Feb 2014 + * Author: hieu + */ + +#include +#include "NonTerm.h" +#include "Word.h" +#include "ConsistentPhrase.h" +#include 
"Parameter.h" + +using namespace std; + +NonTerm::NonTerm(const ConsistentPhrase &consistentPhrase, + const std::string &source, + const std::string &target) + :m_consistentPhrase(&consistentPhrase) + ,m_source(source) + ,m_target(target) +{ + // TODO Auto-generated constructor stub + +} + +NonTerm::~NonTerm() +{ + // TODO Auto-generated destructor stub +} + +std::string NonTerm::Debug() const +{ + stringstream out; + out << m_source << m_target; + out << m_consistentPhrase->Debug(); + return out.str(); +} + +void NonTerm::Output(std::ostream &out) const +{ + out << m_source << m_target; +} + +void NonTerm::Output(std::ostream &out, Moses::FactorDirection direction) const +{ + out << GetLabel(direction); +} + +const std::string &NonTerm::GetLabel(Moses::FactorDirection direction) const +{ + return (direction == Moses::Input) ? m_source : m_target; +} + +bool NonTerm::IsHiero(Moses::FactorDirection direction, const Parameter ¶ms) const +{ + const std::string &label = NonTerm::GetLabel(direction); + return label == params.hieroNonTerm; +} + +bool NonTerm::IsHiero(const Parameter ¶ms) const +{ + return IsHiero(Moses::Input, params) && IsHiero(Moses::Output, params); +} + +int NonTerm::GetWidth(Moses::FactorDirection direction) const +{ + return GetConsistentPhrase().GetWidth(direction); +} diff --git a/mosesdecoder/phrase-extract/extract-mixed-syntax/NonTerm.h b/mosesdecoder/phrase-extract/extract-mixed-syntax/NonTerm.h new file mode 100644 index 0000000000000000000000000000000000000000..7019787310f0263e3f45a453d2e0ec873f7d5bad --- /dev/null +++ b/mosesdecoder/phrase-extract/extract-mixed-syntax/NonTerm.h @@ -0,0 +1,50 @@ +/* + * NonTerm.h + * + * Created on: 22 Feb 2014 + * Author: hieu + */ +#pragma once +#include +#include "RuleSymbol.h" +#include "moses/TypeDef.h" + +class ConsistentPhrase; +class Parameter; + +class NonTerm : public RuleSymbol +{ +public: + + NonTerm(const ConsistentPhrase &consistentPhrase, + const std::string &source, + const std::string 
&target); + virtual ~NonTerm(); + + const ConsistentPhrase &GetConsistentPhrase() const { + return *m_consistentPhrase; + } + + int GetWidth(Moses::FactorDirection direction) const; + + virtual bool IsNonTerm() const { + return true; + } + + std::string GetString() const { + return m_source + m_target; + } + + virtual std::string Debug() const; + virtual void Output(std::ostream &out) const; + void Output(std::ostream &out, Moses::FactorDirection direction) const; + + const std::string &GetLabel(Moses::FactorDirection direction) const; + bool IsHiero(Moses::FactorDirection direction, const Parameter ¶ms) const; + bool IsHiero(const Parameter ¶ms) const; + +protected: + const ConsistentPhrase *m_consistentPhrase; + std::string m_source, m_target; +}; + diff --git a/mosesdecoder/phrase-extract/extract-mixed-syntax/Phrase.h b/mosesdecoder/phrase-extract/extract-mixed-syntax/Phrase.h new file mode 100644 index 0000000000000000000000000000000000000000..57cfaf9c26789f52fbea5201bf4ac61158941e36 --- /dev/null +++ b/mosesdecoder/phrase-extract/extract-mixed-syntax/Phrase.h @@ -0,0 +1,19 @@ +#pragma once + +#include +#include "Word.h" + +// a vector of terminals +class Phrase : public std::vector +{ +public: + Phrase() + {} + + Phrase(size_t size) + :std::vector(size) + {} + + std::string Debug() const; + +}; diff --git a/mosesdecoder/phrase-extract/extract-mixed-syntax/Word.cpp b/mosesdecoder/phrase-extract/extract-mixed-syntax/Word.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f36391f1aa74bfa452ce0d4f38b07a867007d67d --- /dev/null +++ b/mosesdecoder/phrase-extract/extract-mixed-syntax/Word.cpp @@ -0,0 +1,69 @@ +/* + * Word.cpp + * + * Created on: 18 Feb 2014 + * Author: s0565741 + */ +#include +#include "Word.h" +#include "moses/Util.h" + +using namespace std; + +Word::Word(int pos, const std::string &str) + :m_pos(pos) + ,m_str(str) +{ + // TODO Auto-generated constructor stub + +} + +Word::~Word() +{ + // TODO Auto-generated destructor stub +} 
+ +void Word::AddAlignment(const Word *other) +{ + m_alignment.insert(other); +} + +std::set Word::GetAlignmentIndex() const +{ + std::set ret; + + std::set::const_iterator iter; + for (iter = m_alignment.begin(); iter != m_alignment.end(); ++iter) { + const Word &otherWord = **iter; + int otherPos = otherWord.GetPos(); + ret.insert(otherPos); + } + + return ret; +} + +void Word::Output(std::ostream &out) const +{ + out << m_str; +} + +std::string Word::Debug() const +{ + return m_str; +} + +int Word::CompareString(const Word &other) const +{ + return m_str.compare(other.m_str); +} + +std::string Word::GetString(int factor) const +{ + vector toks; + Moses::Tokenize(toks, m_str, "|"); + + assert(factor < toks.size()); + return toks[factor]; +} + + diff --git a/mosesdecoder/phrase-extract/extract-mixed-syntax/pugiconfig.hpp b/mosesdecoder/phrase-extract/extract-mixed-syntax/pugiconfig.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5a63fd488ef411bbf72118254eeb749cb7bca0ef --- /dev/null +++ b/mosesdecoder/phrase-extract/extract-mixed-syntax/pugiconfig.hpp @@ -0,0 +1,69 @@ +/** + * pugixml parser - version 1.2 + * -------------------------------------------------------- + * Copyright (C) 2006-2012, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Report bugs and download new versions at http://pugixml.org/ + * + * This library is distributed under the MIT License. See notice at the end + * of this file. 
+ * + * This work is based on the pugxml parser, which is: + * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) + */ + +#ifndef HEADER_PUGICONFIG_HPP +#define HEADER_PUGICONFIG_HPP + +// Uncomment this to enable wchar_t mode +// #define PUGIXML_WCHAR_MODE + +// Uncomment this to disable XPath +// #define PUGIXML_NO_XPATH + +// Uncomment this to disable STL +// #define PUGIXML_NO_STL + +// Uncomment this to disable exceptions +// #define PUGIXML_NO_EXCEPTIONS + +// Set this to control attributes for public classes/functions, i.e.: +// #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL +// #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL +// #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall +// In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead + +// Uncomment this to switch to header-only version +// #define PUGIXML_HEADER_ONLY +// #include "pugixml.cpp" + +// Tune these constants to adjust memory-related behavior +// #define PUGIXML_MEMORY_PAGE_SIZE 32768 +// #define PUGIXML_MEMORY_OUTPUT_STACK 10240 +// #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096 + +#endif + +/** + * Copyright (c) 2006-2012 Arseny Kapoulkine + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ diff --git a/mosesdecoder/phrase-extract/extract-mixed-syntax/pugixml.hpp b/mosesdecoder/phrase-extract/extract-mixed-syntax/pugixml.hpp new file mode 100644 index 0000000000000000000000000000000000000000..cbc527bef429f40438e774b32d8cc76c8e4c1d6c --- /dev/null +++ b/mosesdecoder/phrase-extract/extract-mixed-syntax/pugixml.hpp @@ -0,0 +1,1256 @@ +/** + * pugixml parser - version 1.2 + * -------------------------------------------------------- + * Copyright (C) 2006-2012, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Report bugs and download new versions at http://pugixml.org/ + * + * This library is distributed under the MIT License. See notice at the end + * of this file. 
+ * + * This work is based on the pugxml parser, which is: + * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) + */ + +#ifndef PUGIXML_VERSION +// Define version macro; evaluates to major * 100 + minor so that it's safe to use in less-than comparisons +# define PUGIXML_VERSION 120 +#endif + +// Include user configuration file (this can define various configuration macros) +#include "pugiconfig.hpp" + +#ifndef HEADER_PUGIXML_HPP +#define HEADER_PUGIXML_HPP + +// Include stddef.h for size_t and ptrdiff_t +#include + +// Include exception header for XPath +#if !defined(PUGIXML_NO_XPATH) && !defined(PUGIXML_NO_EXCEPTIONS) +# include +#endif + +// Include STL headers +#ifndef PUGIXML_NO_STL +# include +# include +# include +#endif + +// Macro for deprecated features +#ifndef PUGIXML_DEPRECATED +# if defined(__GNUC__) +# define PUGIXML_DEPRECATED __attribute__((deprecated)) +# elif defined(_MSC_VER) && _MSC_VER >= 1300 +# define PUGIXML_DEPRECATED __declspec(deprecated) +# else +# define PUGIXML_DEPRECATED +# endif +#endif + +// If no API is defined, assume default +#ifndef PUGIXML_API +# define PUGIXML_API +#endif + +// If no API for classes is defined, assume default +#ifndef PUGIXML_CLASS +# define PUGIXML_CLASS PUGIXML_API +#endif + +// If no API for functions is defined, assume default +#ifndef PUGIXML_FUNCTION +# define PUGIXML_FUNCTION PUGIXML_API +#endif + +// Character interface macros +#ifdef PUGIXML_WCHAR_MODE +# define PUGIXML_TEXT(t) L ## t +# define PUGIXML_CHAR wchar_t +#else +# define PUGIXML_TEXT(t) t +# define PUGIXML_CHAR char +#endif + +namespace pugi +{ +// Character type used for all internal storage and operations; depends on PUGIXML_WCHAR_MODE +typedef PUGIXML_CHAR char_t; + +#ifndef PUGIXML_NO_STL +// String type used for operations that work with STL string; depends on PUGIXML_WCHAR_MODE +typedef std::basic_string, std::allocator > string_t; +#endif +} + +// The PugiXML namespace +namespace pugi +{ +// Tree node types +enum 
xml_node_type { + node_null, // Empty (null) node handle + node_document, // A document tree's absolute root + node_element, // Element tag, i.e. '' + node_pcdata, // Plain character data, i.e. 'text' + node_cdata, // Character data, i.e. '' + node_comment, // Comment tag, i.e. '' + node_pi, // Processing instruction, i.e. '' + node_declaration, // Document declaration, i.e. '' + node_doctype // Document type declaration, i.e. '' +}; + +// Parsing options + +// Minimal parsing mode (equivalent to turning all other flags off). +// Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed. +const unsigned int parse_minimal = 0x0000; + +// This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is off by default. +const unsigned int parse_pi = 0x0001; + +// This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by default. +const unsigned int parse_comments = 0x0002; + +// This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by default. +const unsigned int parse_cdata = 0x0004; + +// This flag determines if plain character data (node_pcdata) that consist only of whitespace are added to the DOM tree. +// This flag is off by default; turning it on usually results in slower parsing and more memory consumption. +const unsigned int parse_ws_pcdata = 0x0008; + +// This flag determines if character and entity references are expanded during parsing. This flag is on by default. +const unsigned int parse_escapes = 0x0010; + +// This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default. +const unsigned int parse_eol = 0x0020; + +// This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default. 
+const unsigned int parse_wconv_attribute = 0x0040; + +// This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default. +const unsigned int parse_wnorm_attribute = 0x0080; + +// This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default. +const unsigned int parse_declaration = 0x0100; + +// This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default. +const unsigned int parse_doctype = 0x0200; + +// This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only +// of whitespace is added to the DOM tree. +// This flag is off by default; turning it on may result in slower parsing and more memory consumption. +const unsigned int parse_ws_pcdata_single = 0x0400; + +// The default parsing mode. +// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded, +// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. +const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol; + +// The full parsing mode. +// Nodes of all types are added to the DOM tree, character/reference entities are expanded, +// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. 
+const unsigned int parse_full = parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype; + +// These flags determine the encoding of input data for XML document +enum xml_encoding { + encoding_auto, // Auto-detect input encoding using BOM or < / class xml_object_range +{ +public: + typedef It const_iterator; + + xml_object_range(It b, It e): _begin(b), _end(e) { + } + + It begin() const { + return _begin; + } + It end() const { + return _end; + } + +private: + It _begin, _end; +}; + +// Writer interface for node printing (see xml_node::print) +class PUGIXML_CLASS xml_writer +{ +public: + virtual ~xml_writer() {} + + // Write memory chunk into stream/file/whatever + virtual void write(const void* data, size_t size) = 0; +}; + +// xml_writer implementation for FILE* +class PUGIXML_CLASS xml_writer_file: public xml_writer +{ +public: + // Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio + xml_writer_file(void* file); + + virtual void write(const void* data, size_t size); + +private: + void* file; +}; + +#ifndef PUGIXML_NO_STL +// xml_writer implementation for streams +class PUGIXML_CLASS xml_writer_stream: public xml_writer +{ +public: + // Construct writer from an output stream object + xml_writer_stream(std::basic_ostream >& stream); + xml_writer_stream(std::basic_ostream >& stream); + + virtual void write(const void* data, size_t size); + +private: + std::basic_ostream >* narrow_stream; + std::basic_ostream >* wide_stream; +}; +#endif + +// A light-weight handle for manipulating attributes in DOM tree +class PUGIXML_CLASS xml_attribute +{ + friend class xml_attribute_iterator; + friend class xml_node; + +private: + xml_attribute_struct* _attr; + + typedef void (*unspecified_bool_type)(xml_attribute***); + +public: + // Default constructor. Constructs an empty attribute. 
+ xml_attribute(); + + // Constructs attribute from internal pointer + explicit xml_attribute(xml_attribute_struct* attr); + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + + // Comparison operators (compares wrapped attribute pointers) + bool operator==(const xml_attribute& r) const; + bool operator!=(const xml_attribute& r) const; + bool operator<(const xml_attribute& r) const; + bool operator>(const xml_attribute& r) const; + bool operator<=(const xml_attribute& r) const; + bool operator>=(const xml_attribute& r) const; + + // Check if attribute is empty + bool empty() const; + + // Get attribute name/value, or "" if attribute is empty + const char_t* name() const; + const char_t* value() const; + + // Get attribute value, or the default value if attribute is empty + const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const; + + // Get attribute value as a number, or the default value if conversion did not succeed or attribute is empty + int as_int(int def = 0) const; + unsigned int as_uint(unsigned int def = 0) const; + double as_double(double def = 0) const; + float as_float(float def = 0) const; + + // Get attribute value as bool (returns true if first character is in '1tTyY' set), or the default value if attribute is empty + bool as_bool(bool def = false) const; + + // Set attribute name/value (returns false if attribute is empty or there is not enough memory) + bool set_name(const char_t* rhs); + bool set_value(const char_t* rhs); + + // Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false") + bool set_value(int rhs); + bool set_value(unsigned int rhs); + bool set_value(double rhs); + bool set_value(bool rhs); + + // Set attribute value (equivalent to set_value without error checking) + xml_attribute& operator=(const char_t* rhs); + xml_attribute& operator=(int rhs); + xml_attribute& operator=(unsigned 
int rhs); + xml_attribute& operator=(double rhs); + xml_attribute& operator=(bool rhs); + + // Get next/previous attribute in the attribute list of the parent node + xml_attribute next_attribute() const; + xml_attribute previous_attribute() const; + + // Get hash value (unique for handles to the same object) + size_t hash_value() const; + + // Get internal pointer + xml_attribute_struct* internal_object() const; +}; + +#ifdef __BORLANDC__ +// Borland C++ workaround +bool PUGIXML_FUNCTION operator&&(const xml_attribute& lhs, bool rhs); +bool PUGIXML_FUNCTION operator||(const xml_attribute& lhs, bool rhs); +#endif + +// A light-weight handle for manipulating nodes in DOM tree +class PUGIXML_CLASS xml_node +{ + friend class xml_attribute_iterator; + friend class xml_node_iterator; + friend class xml_named_node_iterator; + +protected: + xml_node_struct* _root; + + typedef void (*unspecified_bool_type)(xml_node***); + +public: + // Default constructor. Constructs an empty node. + xml_node(); + + // Constructs node from internal pointer + explicit xml_node(xml_node_struct* p); + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + + // Comparison operators (compares wrapped node pointers) + bool operator==(const xml_node& r) const; + bool operator!=(const xml_node& r) const; + bool operator<(const xml_node& r) const; + bool operator>(const xml_node& r) const; + bool operator<=(const xml_node& r) const; + bool operator>=(const xml_node& r) const; + + // Check if node is empty. 
+ bool empty() const; + + // Get node type + xml_node_type type() const; + + // Get node name/value, or "" if node is empty or it has no name/value + const char_t* name() const; + const char_t* value() const; + + // Get attribute list + xml_attribute first_attribute() const; + xml_attribute last_attribute() const; + + // Get children list + xml_node first_child() const; + xml_node last_child() const; + + // Get next/previous sibling in the children list of the parent node + xml_node next_sibling() const; + xml_node previous_sibling() const; + + // Get parent node + xml_node parent() const; + + // Get root of DOM tree this node belongs to + xml_node root() const; + + // Get text object for the current node + xml_text text() const; + + // Get child, attribute or next/previous sibling with the specified name + xml_node child(const char_t* name) const; + xml_attribute attribute(const char_t* name) const; + xml_node next_sibling(const char_t* name) const; + xml_node previous_sibling(const char_t* name) const; + + // Get child value of current node; that is, value of the first child node of type PCDATA/CDATA + const char_t* child_value() const; + + // Get child value of child with specified name. Equivalent to child(name).child_value(). + const char_t* child_value(const char_t* name) const; + + // Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value) + bool set_name(const char_t* rhs); + bool set_value(const char_t* rhs); + + // Add attribute with specified name. Returns added attribute, or empty attribute on errors. + xml_attribute append_attribute(const char_t* name); + xml_attribute prepend_attribute(const char_t* name); + xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr); + xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr); + + // Add a copy of the specified attribute. Returns added attribute, or empty attribute on errors. 
+ xml_attribute append_copy(const xml_attribute& proto); + xml_attribute prepend_copy(const xml_attribute& proto); + xml_attribute insert_copy_after(const xml_attribute& proto, const xml_attribute& attr); + xml_attribute insert_copy_before(const xml_attribute& proto, const xml_attribute& attr); + + // Add child node with specified type. Returns added node, or empty node on errors. + xml_node append_child(xml_node_type type = node_element); + xml_node prepend_child(xml_node_type type = node_element); + xml_node insert_child_after(xml_node_type type, const xml_node& node); + xml_node insert_child_before(xml_node_type type, const xml_node& node); + + // Add child element with specified name. Returns added node, or empty node on errors. + xml_node append_child(const char_t* name); + xml_node prepend_child(const char_t* name); + xml_node insert_child_after(const char_t* name, const xml_node& node); + xml_node insert_child_before(const char_t* name, const xml_node& node); + + // Add a copy of the specified node as a child. Returns added node, or empty node on errors. + xml_node append_copy(const xml_node& proto); + xml_node prepend_copy(const xml_node& proto); + xml_node insert_copy_after(const xml_node& proto, const xml_node& node); + xml_node insert_copy_before(const xml_node& proto, const xml_node& node); + + // Remove specified attribute + bool remove_attribute(const xml_attribute& a); + bool remove_attribute(const char_t* name); + + // Remove specified child + bool remove_child(const xml_node& n); + bool remove_child(const char_t* name); + + // Find attribute using predicate. Returns first attribute for which predicate returned true. + template xml_attribute find_attribute(Predicate pred) const { + if (!_root) return xml_attribute(); + + for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute()) + if (pred(attrib)) + return attrib; + + return xml_attribute(); + } + + // Find child node using predicate. 
Returns first child for which predicate returned true. + template xml_node find_child(Predicate pred) const { + if (!_root) return xml_node(); + + for (xml_node node = first_child(); node; node = node.next_sibling()) + if (pred(node)) + return node; + + return xml_node(); + } + + // Find node from subtree using predicate. Returns first node from subtree (depth-first), for which predicate returned true. + template xml_node find_node(Predicate pred) const { + if (!_root) return xml_node(); + + xml_node cur = first_child(); + + while (cur._root && cur._root != _root) { + if (pred(cur)) return cur; + + if (cur.first_child()) cur = cur.first_child(); + else if (cur.next_sibling()) cur = cur.next_sibling(); + else { + while (!cur.next_sibling() && cur._root != _root) cur = cur.parent(); + + if (cur._root != _root) cur = cur.next_sibling(); + } + } + + return xml_node(); + } + + // Find child node by attribute name/value + xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const; + xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const; + +#ifndef PUGIXML_NO_STL + // Get the absolute node path from root as a text string. + string_t path(char_t delimiter = '/') const; +#endif + + // Search for a node by path consisting of node names and . or .. elements. + xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const; + + // Recursively traverse subtree with xml_tree_walker + bool traverse(xml_tree_walker& walker); + +#ifndef PUGIXML_NO_XPATH + // Select single node by evaluating XPath query. Returns first node from the resulting node set. 
+ xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const; + xpath_node select_single_node(const xpath_query& query) const; + + // Select node set by evaluating XPath query + xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = 0) const; + xpath_node_set select_nodes(const xpath_query& query) const; +#endif + + // Print subtree using a writer object + void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const; + +#ifndef PUGIXML_NO_STL + // Print subtree to stream + void print(std::basic_ostream >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const; + void print(std::basic_ostream >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const; +#endif + + // Child nodes iterators + typedef xml_node_iterator iterator; + + iterator begin() const; + iterator end() const; + + // Attribute iterators + typedef xml_attribute_iterator attribute_iterator; + + attribute_iterator attributes_begin() const; + attribute_iterator attributes_end() const; + + // Range-based for support + xml_object_range children() const; + xml_object_range children(const char_t* name) const; + xml_object_range attributes() const; + + // Get node offset in parsed file/string (in char_t units) for debugging purposes + ptrdiff_t offset_debug() const; + + // Get hash value (unique for handles to the same object) + size_t hash_value() const; + + // Get internal pointer + xml_node_struct* internal_object() const; +}; + +#ifdef __BORLANDC__ +// Borland C++ workaround +bool PUGIXML_FUNCTION operator&&(const xml_node& lhs, bool rhs); +bool PUGIXML_FUNCTION operator||(const xml_node& lhs, bool rhs); +#endif + +// A helper for working with text inside PCDATA nodes +class 
PUGIXML_CLASS xml_text +{ + friend class xml_node; + + xml_node_struct* _root; + + typedef void (*unspecified_bool_type)(xml_text***); + + explicit xml_text(xml_node_struct* root); + + xml_node_struct* _data_new(); + xml_node_struct* _data() const; + +public: + // Default constructor. Constructs an empty object. + xml_text(); + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + + // Check if text object is empty + bool empty() const; + + // Get text, or "" if object is empty + const char_t* get() const; + + // Get text, or the default value if object is empty + const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const; + + // Get text as a number, or the default value if conversion did not succeed or object is empty + int as_int(int def = 0) const; + unsigned int as_uint(unsigned int def = 0) const; + double as_double(double def = 0) const; + float as_float(float def = 0) const; + + // Get text as bool (returns true if first character is in '1tTyY' set), or the default value if object is empty + bool as_bool(bool def = false) const; + + // Set text (returns false if object is empty or there is not enough memory) + bool set(const char_t* rhs); + + // Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false") + bool set(int rhs); + bool set(unsigned int rhs); + bool set(double rhs); + bool set(bool rhs); + + // Set text (equivalent to set without error checking) + xml_text& operator=(const char_t* rhs); + xml_text& operator=(int rhs); + xml_text& operator=(unsigned int rhs); + xml_text& operator=(double rhs); + xml_text& operator=(bool rhs); + + // Get the data node (node_pcdata or node_cdata) for this object + xml_node data() const; +}; + +#ifdef __BORLANDC__ +// Borland C++ workaround +bool PUGIXML_FUNCTION operator&&(const xml_text& lhs, bool rhs); +bool PUGIXML_FUNCTION operator||(const xml_text& lhs, bool rhs); +#endif + +// 
Child node iterator (a bidirectional iterator over a collection of xml_node) +class PUGIXML_CLASS xml_node_iterator +{ + friend class xml_node; + +private: + mutable xml_node _wrap; + xml_node _parent; + + xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent); + +public: + // Iterator traits + typedef ptrdiff_t difference_type; + typedef xml_node value_type; + typedef xml_node* pointer; + typedef xml_node& reference; + +#ifndef PUGIXML_NO_STL + typedef std::bidirectional_iterator_tag iterator_category; +#endif + + // Default constructor + xml_node_iterator(); + + // Construct an iterator which points to the specified node + xml_node_iterator(const xml_node& node); + + // Iterator operators + bool operator==(const xml_node_iterator& rhs) const; + bool operator!=(const xml_node_iterator& rhs) const; + + xml_node& operator*() const; + xml_node* operator->() const; + + const xml_node_iterator& operator++(); + xml_node_iterator operator++(int); + + const xml_node_iterator& operator--(); + xml_node_iterator operator--(int); +}; + +// Attribute iterator (a bidirectional iterator over a collection of xml_attribute) +class PUGIXML_CLASS xml_attribute_iterator +{ + friend class xml_node; + +private: + mutable xml_attribute _wrap; + xml_node _parent; + + xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent); + +public: + // Iterator traits + typedef ptrdiff_t difference_type; + typedef xml_attribute value_type; + typedef xml_attribute* pointer; + typedef xml_attribute& reference; + +#ifndef PUGIXML_NO_STL + typedef std::bidirectional_iterator_tag iterator_category; +#endif + + // Default constructor + xml_attribute_iterator(); + + // Construct an iterator which points to the specified attribute + xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent); + + // Iterator operators + bool operator==(const xml_attribute_iterator& rhs) const; + bool operator!=(const xml_attribute_iterator& rhs) const; + + xml_attribute& operator*() 
const; + xml_attribute* operator->() const; + + const xml_attribute_iterator& operator++(); + xml_attribute_iterator operator++(int); + + const xml_attribute_iterator& operator--(); + xml_attribute_iterator operator--(int); +}; + +// Named node range helper +class xml_named_node_iterator +{ +public: + // Iterator traits + typedef ptrdiff_t difference_type; + typedef xml_node value_type; + typedef xml_node* pointer; + typedef xml_node& reference; + +#ifndef PUGIXML_NO_STL + typedef std::forward_iterator_tag iterator_category; +#endif + + // Default constructor + xml_named_node_iterator(); + + // Construct an iterator which points to the specified node + xml_named_node_iterator(const xml_node& node, const char_t* name); + + // Iterator operators + bool operator==(const xml_named_node_iterator& rhs) const; + bool operator!=(const xml_named_node_iterator& rhs) const; + + xml_node& operator*() const; + xml_node* operator->() const; + + const xml_named_node_iterator& operator++(); + xml_named_node_iterator operator++(int); + +private: + mutable xml_node _node; + const char_t* _name; +}; + +// Abstract tree walker class (see xml_node::traverse) +class PUGIXML_CLASS xml_tree_walker +{ + friend class xml_node; + +private: + int _depth; + +protected: + // Get current traversal depth + int depth() const; + +public: + xml_tree_walker(); + virtual ~xml_tree_walker(); + + // Callback that is called when traversal begins + virtual bool begin(xml_node& node); + + // Callback that is called for each node traversed + virtual bool for_each(xml_node& node) = 0; + + // Callback that is called when traversal ends + virtual bool end(xml_node& node); +}; + +// Parsing status, returned as part of xml_parse_result object +enum xml_parse_status { + status_ok = 0, // No error + + status_file_not_found, // File was not found during load_file() + status_io_error, // Error reading from file/stream + status_out_of_memory, // Could not allocate memory + status_internal_error, // Internal error 
occurred + + status_unrecognized_tag, // Parser could not determine tag type + + status_bad_pi, // Parsing error occurred while parsing document declaration/processing instruction + status_bad_comment, // Parsing error occurred while parsing comment + status_bad_cdata, // Parsing error occurred while parsing CDATA section + status_bad_doctype, // Parsing error occurred while parsing document type declaration + status_bad_pcdata, // Parsing error occurred while parsing PCDATA section + status_bad_start_element, // Parsing error occurred while parsing start element tag + status_bad_attribute, // Parsing error occurred while parsing element attribute + status_bad_end_element, // Parsing error occurred while parsing end element tag + status_end_element_mismatch // There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag) +}; + +// Parsing result +struct PUGIXML_CLASS xml_parse_result { + // Parsing status (see xml_parse_status) + xml_parse_status status; + + // Last parsed offset (in char_t units from start of input data) + ptrdiff_t offset; + + // Source document encoding + xml_encoding encoding; + + // Default constructor, initializes object to failed state + xml_parse_result(); + + // Cast to bool operator + operator bool() const; + + // Get error description + const char* description() const; +}; + +// Document class (DOM tree root) +class PUGIXML_CLASS xml_document: public xml_node +{ +private: + char_t* _buffer; + + char _memory[192]; + + // Non-copyable semantics + xml_document(const xml_document&); + const xml_document& operator=(const xml_document&); + + void create(); + void destroy(); + + xml_parse_result load_buffer_impl(void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own); + +public: + // Default constructor, makes empty document + xml_document(); + + // Destructor, invalidates all node/attribute handles to this document + 
~xml_document(); + + // Removes all nodes, leaving the empty document + void reset(); + + // Removes all nodes, then copies the entire contents of the specified document + void reset(const xml_document& proto); + +#ifndef PUGIXML_NO_STL + // Load document from stream. + xml_parse_result load(std::basic_istream >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + xml_parse_result load(std::basic_istream >& stream, unsigned int options = parse_default); +#endif + + // Load document from zero-terminated string. No encoding conversions are applied. + xml_parse_result load(const char_t* contents, unsigned int options = parse_default); + + // Load document from file + xml_parse_result load_file(const char* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + xml_parse_result load_file(const wchar_t* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Load document from buffer. Copies/converts the buffer, so it may be deleted or changed after the function returns. + xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data). + // You should ensure that buffer data will persist throughout the document's lifetime, and free the buffer memory manually once document is destroyed. + xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data). + // You should allocate the buffer with pugixml allocation function; document will free the buffer when it is no longer needed (you can't use it anymore). 
+ xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); + + // Save XML document to writer (semantics is slightly different from xml_node::print, see documentation for details). + void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + +#ifndef PUGIXML_NO_STL + // Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details). + void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const; +#endif + + // Save XML to file + bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + bool save_file(const wchar_t* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; + + // Get document element + xml_node document_element() const; +}; + +#ifndef PUGIXML_NO_XPATH +// XPath query return type +enum xpath_value_type { + xpath_type_none, // Unknown type (query failed to compile) + xpath_type_node_set, // Node set (xpath_node_set) + xpath_type_number, // Number + xpath_type_string, // String + xpath_type_boolean // Boolean +}; + +// XPath parsing result +struct PUGIXML_CLASS xpath_parse_result { + // Error message (0 if no error) + const char* error; + + // Last parsed offset (in char_t units from string start) + ptrdiff_t offset; + + // Default constructor, initializes object to failed state + xpath_parse_result(); + + // Cast to bool operator + operator bool() const; + + // Get error description + const char* description() 
const; +}; + +// A single XPath variable +class PUGIXML_CLASS xpath_variable +{ + friend class xpath_variable_set; + +protected: + xpath_value_type _type; + xpath_variable* _next; + + xpath_variable(); + + // Non-copyable semantics + xpath_variable(const xpath_variable&); + xpath_variable& operator=(const xpath_variable&); + +public: + // Get variable name + const char_t* name() const; + + // Get variable type + xpath_value_type type() const; + + // Get variable value; no type conversion is performed, default value (false, NaN, empty string, empty node set) is returned on type mismatch error + bool get_boolean() const; + double get_number() const; + const char_t* get_string() const; + const xpath_node_set& get_node_set() const; + + // Set variable value; no type conversion is performed, false is returned on type mismatch error + bool set(bool value); + bool set(double value); + bool set(const char_t* value); + bool set(const xpath_node_set& value); +}; + +// A set of XPath variables +class PUGIXML_CLASS xpath_variable_set +{ +private: + xpath_variable* _data[64]; + + // Non-copyable semantics + xpath_variable_set(const xpath_variable_set&); + xpath_variable_set& operator=(const xpath_variable_set&); + + xpath_variable* find(const char_t* name) const; + +public: + // Default constructor/destructor + xpath_variable_set(); + ~xpath_variable_set(); + + // Add a new variable or get the existing one, if the types match + xpath_variable* add(const char_t* name, xpath_value_type type); + + // Set value of an existing variable; no type conversion is performed, false is returned if there is no such variable or if types mismatch + bool set(const char_t* name, bool value); + bool set(const char_t* name, double value); + bool set(const char_t* name, const char_t* value); + bool set(const char_t* name, const xpath_node_set& value); + + // Get existing variable by name + xpath_variable* get(const char_t* name); + const xpath_variable* get(const char_t* name) const; +}; + +// A 
compiled XPath query object +class PUGIXML_CLASS xpath_query +{ +private: + void* _impl; + xpath_parse_result _result; + + typedef void (*unspecified_bool_type)(xpath_query***); + + // Non-copyable semantics + xpath_query(const xpath_query&); + xpath_query& operator=(const xpath_query&); + +public: + // Construct a compiled object from XPath expression. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on compilation errors. + explicit xpath_query(const char_t* query, xpath_variable_set* variables = 0); + + // Destructor + ~xpath_query(); + + // Get query expression return type + xpath_value_type return_type() const; + + // Evaluate expression as boolean value in the specified context; performs type conversion if necessary. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. + bool evaluate_boolean(const xpath_node& n) const; + + // Evaluate expression as double value in the specified context; performs type conversion if necessary. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. + double evaluate_number(const xpath_node& n) const; + +#ifndef PUGIXML_NO_STL + // Evaluate expression as string value in the specified context; performs type conversion if necessary. + // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. + string_t evaluate_string(const xpath_node& n) const; +#endif + + // Evaluate expression as string value in the specified context; performs type conversion if necessary. + // At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero). + // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. + // If PUGIXML_NO_EXCEPTIONS is defined, returns empty set instead. + size_t evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const; + + // Evaluate expression as node set in the specified context. 
+ // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors. + // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node set instead. + xpath_node_set evaluate_node_set(const xpath_node& n) const; + + // Get parsing result (used to get compilation errors in PUGIXML_NO_EXCEPTIONS mode) + const xpath_parse_result& result() const; + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; +}; + +#ifndef PUGIXML_NO_EXCEPTIONS +// XPath exception class +class PUGIXML_CLASS xpath_exception: public std::exception +{ +private: + xpath_parse_result _result; + +public: + // Construct exception from parse result + explicit xpath_exception(const xpath_parse_result& result); + + // Get error message + virtual const char* what() const throw(); + + // Get parse result + const xpath_parse_result& result() const; +}; +#endif + +// XPath node class (either xml_node or xml_attribute) +class PUGIXML_CLASS xpath_node +{ +private: + xml_node _node; + xml_attribute _attribute; + + typedef void (*unspecified_bool_type)(xpath_node***); + +public: + // Default constructor; constructs empty XPath node + xpath_node(); + + // Construct XPath node from XML node/attribute + xpath_node(const xml_node& node); + xpath_node(const xml_attribute& attribute, const xml_node& parent); + + // Get node/attribute, if any + xml_node node() const; + xml_attribute attribute() const; + + // Get parent of contained node/attribute + xml_node parent() const; + + // Safe bool conversion operator + operator unspecified_bool_type() const; + + // Borland C++ workaround + bool operator!() const; + + // Comparison operators + bool operator==(const xpath_node& n) const; + bool operator!=(const xpath_node& n) const; +}; + +#ifdef __BORLANDC__ +// Borland C++ workaround +bool PUGIXML_FUNCTION operator&&(const xpath_node& lhs, bool rhs); +bool PUGIXML_FUNCTION operator||(const 
xpath_node& lhs, bool rhs); +#endif + +// A fixed-size collection of XPath nodes +class PUGIXML_CLASS xpath_node_set +{ +public: + // Collection type + enum type_t { + type_unsorted, // Not ordered + type_sorted, // Sorted by document order (ascending) + type_sorted_reverse // Sorted by document order (descending) + }; + + // Constant iterator type + typedef const xpath_node* const_iterator; + + // Default constructor. Constructs empty set. + xpath_node_set(); + + // Constructs a set from iterator range; data is not checked for duplicates and is not sorted according to provided type, so be careful + xpath_node_set(const_iterator begin, const_iterator end, type_t type = type_unsorted); + + // Destructor + ~xpath_node_set(); + + // Copy constructor/assignment operator + xpath_node_set(const xpath_node_set& ns); + xpath_node_set& operator=(const xpath_node_set& ns); + + // Get collection type + type_t type() const; + + // Get collection size + size_t size() const; + + // Indexing operator + const xpath_node& operator[](size_t index) const; + + // Collection iterators + const_iterator begin() const; + const_iterator end() const; + + // Sort the collection in ascending/descending order by document order + void sort(bool reverse = false); + + // Get first node in the collection by document order + xpath_node first() const; + + // Check if collection is empty + bool empty() const; + +private: + type_t _type; + + xpath_node _storage; + + xpath_node* _begin; + xpath_node* _end; + + void _assign(const_iterator begin, const_iterator end); +}; +#endif + +#ifndef PUGIXML_NO_STL +// Convert wide string to UTF8 +std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const wchar_t* str); +std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const std::basic_string, std::allocator >& str); + +// Convert UTF8 to wide string +std::basic_string, std::allocator > PUGIXML_FUNCTION as_wide(const char* str); +std::basic_string, std::allocator > PUGIXML_FUNCTION as_wide(const 
std::basic_string, std::allocator >& str); +#endif + +// Memory allocation function interface; returns pointer to allocated memory or NULL on failure +typedef void* (*allocation_function)(size_t size); + +// Memory deallocation function interface +typedef void (*deallocation_function)(void* ptr); + +// Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions. +void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate); + +// Get current memory management functions +allocation_function PUGIXML_FUNCTION get_memory_allocation_function(); +deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function(); +} + +#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC)) +namespace std +{ +// Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier) +std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_node_iterator&); +std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_attribute_iterator&); +std::forward_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_named_node_iterator&); +} +#endif + +#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC) +namespace std +{ +// Workarounds for (non-standard) iterator category detection +std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_node_iterator&); +std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_attribute_iterator&); +std::forward_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_named_node_iterator&); +} +#endif + +#endif + +/** + * Copyright (c) 2006-2012 Arseny Kapoulkine + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without 
limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ diff --git a/mosesdecoder/phrase-extract/score-stsg/Jamfile b/mosesdecoder/phrase-extract/score-stsg/Jamfile new file mode 100644 index 0000000000000000000000000000000000000000..6ae17b565158a726c9f58772da412672d87571b0 --- /dev/null +++ b/mosesdecoder/phrase-extract/score-stsg/Jamfile @@ -0,0 +1 @@ +exe score-stsg : [ glob *.cpp ] ..//syntax-common ..//deps ../..//boost_iostreams ../..//boost_program_options ../..//z : .. 
; diff --git a/mosesdecoder/phrase-extract/score-stsg/LexicalTable.h b/mosesdecoder/phrase-extract/score-stsg/LexicalTable.h new file mode 100644 index 0000000000000000000000000000000000000000..54bae1dec0f9d9c4362bea71045df36f11a3a15f --- /dev/null +++ b/mosesdecoder/phrase-extract/score-stsg/LexicalTable.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include +#include + +#include + +#include "Vocabulary.h" + +namespace MosesTraining +{ +namespace Syntax +{ +namespace ScoreStsg +{ + +class LexicalTable +{ +public: + LexicalTable(Vocabulary &, Vocabulary &); + + void Load(std::istream &); + + double PermissiveLookup(Vocabulary::IdType s, Vocabulary::IdType t) { + OuterMap::const_iterator p = m_table.find(s); + if (p == m_table.end()) { + return 1.0; + } + const InnerMap &inner = p->second; + InnerMap::const_iterator q = inner.find(t); + return q == inner.end() ? 1.0 : q->second; + } + +private: + typedef boost::unordered_map InnerMap; + typedef boost::unordered_map OuterMap; + + Vocabulary &m_srcVocab; + Vocabulary &m_tgtVocab; + OuterMap m_table; +}; + +} // namespace ScoreStsg +} // namespace Syntax +} // namespace MosesTraining diff --git a/mosesdecoder/phrase-extract/score-stsg/RuleGroup.cpp b/mosesdecoder/phrase-extract/score-stsg/RuleGroup.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a4e6ff3a2145ac2babe1a7c3d2445a17bcf09724 --- /dev/null +++ b/mosesdecoder/phrase-extract/score-stsg/RuleGroup.cpp @@ -0,0 +1,47 @@ +#include "RuleGroup.h" + +namespace MosesTraining +{ +namespace Syntax +{ +namespace ScoreStsg +{ + +void RuleGroup::SetNewSource(const StringPiece &source) +{ + source.CopyToString(&m_source); + m_distinctRules.clear(); + m_totalCount = 0; +} + +void RuleGroup::AddRule(const StringPiece &target, const StringPiece &ntAlign, + const StringPiece &fullAlign, int count, + double treeScore) +{ + if (m_distinctRules.empty() || + ntAlign != m_distinctRules.back().ntAlign || + target != m_distinctRules.back().target) { + 
DistinctRule r; + target.CopyToString(&r.target); + ntAlign.CopyToString(&r.ntAlign); + r.alignments.resize(r.alignments.size()+1); + fullAlign.CopyToString(&r.alignments.back().first); + r.alignments.back().second = count; + r.count = count; + r.treeScore = treeScore; + m_distinctRules.push_back(r); + } else { + DistinctRule &r = m_distinctRules.back(); + if (r.alignments.back().first != fullAlign) { + r.alignments.resize(r.alignments.size()+1); + fullAlign.CopyToString(&r.alignments.back().first); + } + r.alignments.back().second += count; + r.count += count; + } + m_totalCount += count; +} + +} // namespace ScoreStsg +} // namespace Syntax +} // namespace MosesTraining diff --git a/mosesdecoder/phrase-extract/score-stsg/RuleSymbol.h b/mosesdecoder/phrase-extract/score-stsg/RuleSymbol.h new file mode 100644 index 0000000000000000000000000000000000000000..e8cd9645888b9cdb080461e4314ef96f9e70c6cd --- /dev/null +++ b/mosesdecoder/phrase-extract/score-stsg/RuleSymbol.h @@ -0,0 +1,19 @@ +#pragma once + +#include "util/string_piece.hh" + +namespace MosesTraining +{ +namespace Syntax +{ +namespace ScoreStsg +{ + +struct RuleSymbol { + StringPiece value; + bool isNonTerminal; +}; + +} // namespace ScoreStsg +} // namespace Syntax +} // namespace MosesTraining diff --git a/mosesdecoder/phrase-extract/score-stsg/RuleTableWriter.h b/mosesdecoder/phrase-extract/score-stsg/RuleTableWriter.h new file mode 100644 index 0000000000000000000000000000000000000000..4f7df99244789f634d12beebb3c0192097b61d19 --- /dev/null +++ b/mosesdecoder/phrase-extract/score-stsg/RuleTableWriter.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include + +#include "OutputFileStream.h" + +#include "Options.h" +#include "TokenizedRuleHalf.h" + +namespace MosesTraining +{ +namespace Syntax +{ +namespace ScoreStsg +{ + +class RuleTableWriter +{ +public: + RuleTableWriter(const Options &options, Moses::OutputFileStream &out) + : m_options(options) + , m_out(out) {} + + void WriteLine(const TokenizedRuleHalf 
&, const TokenizedRuleHalf &, + const std::string &, double, double, int, int, int); + +private: + double MaybeLog(double a) const { + if (!m_options.logProb) { + return a; + } + return m_options.negLogProb ? -log(a) : log(a); + } + + void WriteRuleHalf(const TokenizedRuleHalf &); + + const Options &m_options; + Moses::OutputFileStream &m_out; +}; + +} // namespace ScoreStsg +} // namespace Syntax +} // namespace MosesTraining diff --git a/mosesdecoder/phrase-extract/score-stsg/ScoreStsg.cpp b/mosesdecoder/phrase-extract/score-stsg/ScoreStsg.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f6df0d0da80b86733df1c46d480d2995ccc4b758 --- /dev/null +++ b/mosesdecoder/phrase-extract/score-stsg/ScoreStsg.cpp @@ -0,0 +1,431 @@ +#include "ScoreStsg.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "util/string_piece.hh" +#include "util/string_piece_hash.hh" +#include "util/tokenize_piece.hh" + +#include "InputFileStream.h" +#include "OutputFileStream.h" + +#include "syntax-common/exception.h" + +#include "LexicalTable.h" +#include "Options.h" +#include "RuleGroup.h" +#include "RuleTableWriter.h" + +namespace MosesTraining +{ +namespace Syntax +{ +namespace ScoreStsg +{ + +const int ScoreStsg::kCountOfCountsMax = 10; + +ScoreStsg::ScoreStsg() + : Tool("score-stsg") + , m_lexTable(m_srcVocab, m_tgtVocab) + , m_countOfCounts(kCountOfCountsMax, 0) + , m_totalDistinct(0) +{ +} + +int ScoreStsg::Main(int argc, char *argv[]) +{ + // Process command-line options. + ProcessOptions(argc, argv, m_options); + + // Open input files. + Moses::InputFileStream extractStream(m_options.extractFile); + Moses::InputFileStream lexStream(m_options.lexFile); + + // Open output files. 
+ Moses::OutputFileStream outStream; + Moses::OutputFileStream countOfCountsStream; + OpenOutputFileOrDie(m_options.tableFile, outStream); + if (m_options.goodTuring || m_options.kneserNey) { + OpenOutputFileOrDie(m_options.tableFile+".coc", countOfCountsStream); + } + + // Load lexical table. + if (!m_options.noLex) { + m_lexTable.Load(lexStream); + } + + const util::MultiCharacter delimiter("|||"); + std::size_t lineNum = 0; + std::size_t startLine= 0; + std::string line; + std::string tmp; + RuleGroup ruleGroup; + RuleTableWriter ruleTableWriter(m_options, outStream); + + while (std::getline(extractStream, line)) { + ++lineNum; + + // Tokenize the input line. + util::TokenIter it(line, delimiter); + StringPiece source = *it++; + StringPiece target = *it++; + StringPiece ntAlign = *it++; + StringPiece fullAlign = *it++; + it->CopyToString(&tmp); + int count = std::atoi(tmp.c_str()); + double treeScore = 0.0f; + if (m_options.treeScore && !m_options.inverse) { + ++it; + it->CopyToString(&tmp); + treeScore = std::atof(tmp.c_str()); + } + + // If this is the first line or if source has changed since the last + // line then process the current rule group and start a new one. + if (source != ruleGroup.GetSource()) { + if (lineNum > 1) { + ProcessRuleGroupOrDie(ruleGroup, ruleTableWriter, startLine, lineNum-1); + } + startLine = lineNum; + ruleGroup.SetNewSource(source); + } + + // Add the rule to the current rule group. + ruleGroup.AddRule(target, ntAlign, fullAlign, count, treeScore); + } + + // Process the final rule group. + ProcessRuleGroupOrDie(ruleGroup, ruleTableWriter, startLine, lineNum); + + // Write count of counts file. + if (m_options.goodTuring || m_options.kneserNey) { + // Kneser-Ney needs the total number of distinct rules. + countOfCountsStream << m_totalDistinct << std::endl; + // Write out counts of counts. 
+ for (int i = 1; i <= kCountOfCountsMax; ++i) { + countOfCountsStream << m_countOfCounts[i] << std::endl; + } + } + + return 0; +} + +void ScoreStsg::TokenizeRuleHalf(const std::string &s, TokenizedRuleHalf &half) +{ + // Copy s to half.string, but strip any leading or trailing whitespace. + std::size_t start = s.find_first_not_of(" \t"); + if (start == std::string::npos) { + throw Exception("rule half is empty"); + } + std::size_t end = s.find_last_not_of(" \t"); + assert(end != std::string::npos); + half.string = s.substr(start, end-start+1); + + // Tokenize half.string. + half.tokens.clear(); + for (TreeFragmentTokenizer p(half.string); + p != TreeFragmentTokenizer(); ++p) { + half.tokens.push_back(*p); + } + + // Extract the frontier symbols. + half.frontierSymbols.clear(); + const std::size_t numTokens = half.tokens.size(); + for (int i = 0; i < numTokens; ++i) { + if (half.tokens[i].type != TreeFragmentToken_WORD) { + continue; + } + if (i == 0 || half.tokens[i-1].type != TreeFragmentToken_LSB) { + // A word is a terminal iff it doesn't follow '[' + half.frontierSymbols.resize(half.frontierSymbols.size()+1); + half.frontierSymbols.back().value = half.tokens[i].value; + half.frontierSymbols.back().isNonTerminal = false; + } else if (i+1 < numTokens && + half.tokens[i+1].type == TreeFragmentToken_RSB) { + // A word is a non-terminal iff it it follows '[' and is succeeded by ']' + half.frontierSymbols.resize(half.frontierSymbols.size()+1); + half.frontierSymbols.back().value = half.tokens[i].value; + half.frontierSymbols.back().isNonTerminal = true; + ++i; // Skip over the ']' + } + } +} + +void ScoreStsg::ProcessRuleGroupOrDie(const RuleGroup &group, + RuleTableWriter &writer, + std::size_t start, + std::size_t end) +{ + try { + ProcessRuleGroup(group, writer); + } catch (const Exception &e) { + std::ostringstream msg; + msg << "failed to process rule group at lines " << start << "-" << end + << ": " << e.msg(); + Error(msg.str()); + } catch (const 
std::exception &e) { + std::ostringstream msg; + msg << "failed to process rule group at lines " << start << "-" << end + << ": " << e.what(); + Error(msg.str()); + } +} + +void ScoreStsg::ProcessRuleGroup(const RuleGroup &group, + RuleTableWriter &writer) +{ + const std::size_t totalCount = group.GetTotalCount(); + const std::size_t distinctCount = group.GetSize(); + + TokenizeRuleHalf(group.GetSource(), m_sourceHalf); + + const bool fullyLexical = m_sourceHalf.IsFullyLexical(); + + // Process each distinct rule in turn. + for (RuleGroup::ConstIterator p = group.Begin(); p != group.End(); ++p) { + const RuleGroup::DistinctRule &rule = *p; + + // Update count of count statistics. + if (m_options.goodTuring || m_options.kneserNey) { + ++m_totalDistinct; + int countInt = rule.count + 0.99999; + if (countInt <= kCountOfCountsMax) { + ++m_countOfCounts[countInt]; + } + } + + // If the rule is not fully lexical then discard it if the count is below + // the threshold value. + if (!fullyLexical && rule.count < m_options.minCountHierarchical) { + continue; + } + + TokenizeRuleHalf(rule.target, m_targetHalf); + + // Find the most frequent alignment (if there's a tie, take the first one). + std::vector >::const_iterator q = + rule.alignments.begin(); + const std::pair *bestAlignmentAndCount = &(*q++); + for (; q != rule.alignments.end(); ++q) { + if (q->second > bestAlignmentAndCount->second) { + bestAlignmentAndCount = &(*q); + } + } + const std::string &bestAlignment = bestAlignmentAndCount->first; + ParseAlignmentString(bestAlignment, m_targetHalf.frontierSymbols.size(), + m_tgtToSrc); + + // Compute the lexical translation probability. + double lexProb = ComputeLexProb(m_sourceHalf.frontierSymbols, + m_targetHalf.frontierSymbols, m_tgtToSrc); + + // Write a line to the rule table. 
+ writer.WriteLine(m_sourceHalf, m_targetHalf, bestAlignment, lexProb, + rule.treeScore, p->count, totalCount, distinctCount); + } +} + +void ScoreStsg::ParseAlignmentString(const std::string &s, int numTgtWords, + ALIGNMENT &tgtToSrc) +{ + tgtToSrc.clear(); + tgtToSrc.resize(numTgtWords); + + const std::string digits = "0123456789"; + + std::string::size_type begin = 0; + while (true) { + std::string::size_type end = s.find("-", begin); + if (end == std::string::npos) { + return; + } + int src = std::atoi(s.substr(begin, end-begin).c_str()); + if (end+1 == s.size()) { + throw Exception("Target index missing"); + } + begin = end+1; + end = s.find_first_not_of(digits, begin+1); + int tgt; + if (end == std::string::npos) { + tgt = std::atoi(s.substr(begin).c_str()); + tgtToSrc[tgt].insert(src); + return; + } else { + tgt = std::atoi(s.substr(begin, end-begin).c_str()); + tgtToSrc[tgt].insert(src); + } + begin = end+1; + } +} + +double ScoreStsg::ComputeLexProb(const std::vector &sourceFrontier, + const std::vector &targetFrontier, + const ALIGNMENT &tgtToSrc) +{ + double lexScore = 1.0; + for (std::size_t i = 0; i < targetFrontier.size(); ++i) { + if (targetFrontier[i].isNonTerminal) { + continue; + } + Vocabulary::IdType tgtId = m_tgtVocab.Lookup(targetFrontier[i].value, + StringPieceCompatibleHash(), + StringPieceCompatibleEquals()); + const std::set &srcIndices = tgtToSrc[i]; + if (srcIndices.empty()) { + // Explain unaligned word by NULL. 
+ lexScore *= m_lexTable.PermissiveLookup(Vocabulary::NullId(), tgtId); + } else { + double thisWordScore = 0.0; + for (std::set::const_iterator p = srcIndices.begin(); + p != srcIndices.end(); ++p) { + Vocabulary::IdType srcId = + m_srcVocab.Lookup(sourceFrontier[*p].value, + StringPieceCompatibleHash(), + StringPieceCompatibleEquals()); + thisWordScore += m_lexTable.PermissiveLookup(srcId, tgtId); + } + lexScore *= thisWordScore / static_cast(srcIndices.size()); + } + } + return lexScore; +} + +void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const +{ + namespace po = boost::program_options; + namespace cls = boost::program_options::command_line_style; + + // Construct the 'top' of the usage message: the bit that comes before the + // options list. + std::ostringstream usageTop; + usageTop << "Usage: " << name() + << " [OPTION]... EXTRACT LEX TABLE\n\n" + << "STSG rule scorer\n\n" + << "Options"; + + // Construct the 'bottom' of the usage message. + std::ostringstream usageBottom; + usageBottom << "TODO"; + + // Declare the command line options that are visible to the user. 
+ po::options_description visible(usageTop.str()); + visible.add_options() + ("GoodTuring", + "apply Good-Turing smoothing to relative frequency probability estimates") + ("Hierarchical", + "ignored (included for compatibility with score)") + ("Inverse", + "use inverse mode") + ("KneserNey", + "apply Kneser-Ney smoothing to relative frequency probability estimates") + ("LogProb", + "output log probabilities") + ("MinCountHierarchical", + po::value(&options.minCountHierarchical)-> + default_value(options.minCountHierarchical), + "filter out rules with frequency < arg (except fully lexical rules)") + ("NegLogProb", + "output negative log probabilities") + ("NoLex", + "do not compute lexical translation score") + ("NoWordAlignment", + "do not output word alignments") + ("PCFG", + "synonym for TreeScore (included for compatibility with score)") + ("TreeScore", + "include pre-computed tree score from extract") + ("UnpairedExtractFormat", + "ignored (included for compatibility with score)") + ; + + // Declare the command line options that are hidden from the user + // (these are used as positional options). + po::options_description hidden("Hidden options"); + hidden.add_options() + ("ExtractFile", + po::value(&options.extractFile), + "extract file") + ("LexFile", + po::value(&options.lexFile), + "lexical probability file") + ("TableFile", + po::value(&options.tableFile), + "output file") + ; + + // Compose the full set of command-line options. + po::options_description cmdLineOptions; + cmdLineOptions.add(visible).add(hidden); + + // Register the positional options. + po::positional_options_description p; + p.add("ExtractFile", 1); + p.add("LexFile", 1); + p.add("TableFile", 1); + + // Process the command-line. + po::variables_map vm; + try { + po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()). 
+ options(cmdLineOptions).positional(p).run(), vm); + po::notify(vm); + } catch (const std::exception &e) { + std::ostringstream msg; + msg << e.what() << "\n\n" << visible << usageBottom.str(); + Error(msg.str()); + } + + if (vm.count("help")) { + std::cout << visible << usageBottom.str() << std::endl; + std::exit(0); + } + + // Check all positional options were given. + if (!vm.count("ExtractFile") || + !vm.count("LexFile") || + !vm.count("TableFile")) { + std::ostringstream msg; + std::cerr << visible << usageBottom.str() << std::endl; + std::exit(1); + } + + // Process Boolean options. + if (vm.count("GoodTuring")) { + options.goodTuring = true; + } + if (vm.count("Inverse")) { + options.inverse = true; + } + if (vm.count("KneserNey")) { + options.kneserNey = true; + } + if (vm.count("LogProb")) { + options.logProb = true; + } + if (vm.count("NegLogProb")) { + options.negLogProb = true; + } + if (vm.count("NoLex")) { + options.noLex = true; + } + if (vm.count("NoWordAlignment")) { + options.noWordAlignment = true; + } + if (vm.count("TreeScore") || vm.count("PCFG")) { + options.treeScore = true; + } +} + +} // namespace ScoreStsg +} // namespace Syntax +} // namespace MosesTraining diff --git a/mosesdecoder/phrase-extract/score-stsg/ScoreStsg.h b/mosesdecoder/phrase-extract/score-stsg/ScoreStsg.h new file mode 100644 index 0000000000000000000000000000000000000000..1757e181bca1cf4c711b7b3641f7ec583e7299da --- /dev/null +++ b/mosesdecoder/phrase-extract/score-stsg/ScoreStsg.h @@ -0,0 +1,69 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "ExtractionPhrasePair.h" +#include "OutputFileStream.h" + +#include "syntax-common/tool.h" + +#include "LexicalTable.h" +#include "Options.h" +#include "RuleSymbol.h" +#include "TokenizedRuleHalf.h" +#include "Vocabulary.h" + +namespace MosesTraining +{ +namespace Syntax +{ +namespace ScoreStsg +{ + +class RuleGroup; +class RuleTableWriter; + +class ScoreStsg : public Tool +{ +public: + 
ScoreStsg(); + + virtual int Main(int argc, char *argv[]); + +private: + static const int kCountOfCountsMax; + + double ComputeLexProb(const std::vector &, + const std::vector &, + const ALIGNMENT &); + + void ParseAlignmentString(const std::string &, int, + ALIGNMENT &); + + void ProcessOptions(int, char *[], Options &) const; + + void ProcessRuleGroup(const RuleGroup &, RuleTableWriter &); + + void ProcessRuleGroupOrDie(const RuleGroup &, RuleTableWriter &, + std::size_t, std::size_t); + + void TokenizeRuleHalf(const std::string &, TokenizedRuleHalf &); + + Options m_options; + Vocabulary m_srcVocab; + Vocabulary m_tgtVocab; + LexicalTable m_lexTable; + std::vector m_countOfCounts; + int m_totalDistinct; + TokenizedRuleHalf m_sourceHalf; + TokenizedRuleHalf m_targetHalf; + ALIGNMENT m_tgtToSrc; +}; + +} // namespace ScoreStsg +} // namespace Syntax +} // namespace MosesTraining diff --git a/mosesdecoder/phrase-extract/score-stsg/TokenizedRuleHalf.cpp b/mosesdecoder/phrase-extract/score-stsg/TokenizedRuleHalf.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6ccc2a311dd3262d2d22318428947bf76b309e29 --- /dev/null +++ b/mosesdecoder/phrase-extract/score-stsg/TokenizedRuleHalf.cpp @@ -0,0 +1,40 @@ +#include "TokenizedRuleHalf.h" + +namespace MosesTraining +{ +namespace Syntax +{ +namespace ScoreStsg +{ + +bool TokenizedRuleHalf::IsFullyLexical() const +{ + for (std::vector::const_iterator p = frontierSymbols.begin(); + p != frontierSymbols.end(); ++p) { + if (p->isNonTerminal) { + return false; + } + } + return true; +} + +bool TokenizedRuleHalf::IsString() const +{ + // A rule half is either a string (like "[X] and [X]") or a tree (like + // "[NP [NP] [CC and] [NP]]"). + // + // A string must start with a terminal or a non-terminal (in square brackets). + // A tree must start with '[' followed by a word then either another word or + // another '['. 
#!/bin/bash
# Build Moses with bjam and run the regression tests.
#
# This script assumes that all 3rd-party dependencies are installed under ./opt.
# You can install all 3rd-party dependencies by running
#   make -f contrib/Makefiles/install-dependencies.gmake
#
# Options:
#   -j N                  number of parallel build jobs (default: online CPUs)
#   -a                    passed through to bjam
#   -q                    passed through to bjam; also forces -j1 unless --full
#   --with-irstlm DIR     IRSTLM installation   (default: ./opt/irstlm-5.80.08)
#   --with-boost DIR      Boost installation    (default: ./opt)
#   --with-cmph DIR       CMPH installation     (default: ./opt)
#   --with-regtest DIR    regression test data  (default: ./regtest)
#   --with-xmlrpc-c DIR   xmlrpc-c installation (default: ./opt)
#   --no-xmlrpc-c         build without xmlrpc-c
#   --full                build twice from scratch: without, then with, server
# Any remaining arguments are handed to bjam unchanged.

set -e -o pipefail

opt=$(pwd)/opt

# Because of `set -e`, the script stops here if getopt rejects the command
# line, so no separate exit-status check is needed after option parsing.
args=$(getopt -oj:aq -lwith-irstlm:,with-boost:,with-cmph:,with-regtest:,no-xmlrpc-c,with-xmlrpc-c:,full -- "$@")
eval set -- "$args"

# default settings
noserver=false;   # currently not read anywhere in this script
full=false;
j=$(getconf _NPROCESSORS_ONLN)
irstlm=$opt/irstlm-5.80.08
boost=$opt
cmph=$opt
xmlrpc=--with-xmlrpc-c\=$opt
regtest=$(pwd)/regtest
unset q
unset a
# the regression test for the compactpt bug is currently known to fail,
# let's skip it for the time being
skipcompact=--regtest-skip-compactpt

# overrides from command line
while true ; do
    case "$1" in
	-j ) j=$2; shift 2 ;;
	-a ) a=-a; shift ;;
	-q ) q=-q; shift ;;
	--no-xmlrpc-c ) xmlrpc=$1; shift ;;
	--with-xmlrpc-c )
	    xmlrpc=--with-xmlrpc-c\=$2; shift 2 ;;
	--with-irstlm ) irstlm=$2; shift 2 ;;
	--with-boost ) boost=$2; shift 2 ;;
	--with-cmph ) cmph=$2; shift 2 ;;
	--with-regtest ) regtest=$2; shift 2 ;;
	# --full takes no argument, so only the flag itself is consumed;
	# shifting by two would silently drop the option that follows it.
	--full ) full=true; shift ;;
	-- ) shift; break ;;
	* ) break ;;
    esac
done

git submodule init
git submodule update regtest

# full test means
# -- compile from scratch without server, run regtests
# -- compile from scratch with server, run regtests
#
# $q and $a are deliberately left unquoted below: they are either unset or a
# single flag, and an unset value must expand to no argument at all.
set -x
if [ "$full" == true ] ; then
    ./bjam -j"$j" --with-mm --with-mm-extras --with-irstlm="$irstlm" --with-boost="$boost" --with-cmph="$cmph" --no-xmlrpc-c --with-regtest="$regtest" -a $skipcompact "$@" $q || exit $?
    if ./regression-testing/run-single-test.perl --server --startuptest ; then
	./bjam -j"$j" --with-mm --with-mm-extras --with-irstlm="$irstlm" --with-boost="$boost" --with-cmph="$cmph" "$xmlrpc" --with-regtest="$regtest" -a $skipcompact "$@" $q
    fi
else
    # when investigating failures, always run single-threaded
    if [ "$q" == "-q" ] ; then j=1; fi

    if ./regression-testing/run-single-test.perl --server --startuptest ; then
	./bjam -j"$j" --with-mm $q $a --with-irstlm="$irstlm" --with-boost="$boost" --with-cmph="$cmph" "$xmlrpc" --with-regtest="$regtest" $skipcompact "$@"
    else
	./bjam -j"$j" --with-mm --with-mm-extras $q $a --with-irstlm="$irstlm" --with-boost="$boost" --with-cmph="$cmph" --no-xmlrpc-c --with-regtest="$regtest" $skipcompact "$@"
    fi
fi

# if [ "$RECOMPILE" == "NO" ] ; then
#   RECOMPILE=
# else
#   RECOMPILE="-a"
# fi

# # test compilation without xmlrpc-c
# # ./bjam -j$(nproc) --with-irstlm=$opt --with-boost=$opt --with-cmph=$opt --no-xmlrpc-c --with-regtest=$(pwd)/regtest -a -q $@ || exit $?

# # test compilation with xmlrpc-c
# if ./regression-testing/run-single-test.perl --server --startuptest ; then
#   ./bjam -j$(nproc) --with-irstlm=$opt --with-boost=$opt --with-cmph=$opt --with-xmlrpc-c=$opt --with-regtest=$(pwd)/regtest $RECOMPILE -q --regtest-skip-compactpt $@
# fi