File size: 5,418 Bytes
a579dd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# -*- coding:utf-8 -*-
import re

from ultradata_math_parser.config import Forum_XPATH, Unique_ID
from ultradata_math_parser.utils import *
from ultradata_math_parser.parsers.base_parser import BaseParser
from ultradata_math_parser.parsers.title_parser import TitleParser


class ForumParser(BaseParser):
    """Extract the main content plus comment posts from a forum HTML page.

    The single public entry point is :meth:`extract`, which cleans the page,
    isolates the main content, then re-attaches forum posts/comments that the
    generic pruning removed, and returns the assembled body HTML with some
    bookkeeping metadata.
    """

    # Matches forum post-container ids such as "post-123" or "post_123".
    # Compiled once at class level instead of re-running re.findall per node.
    _POST_ID_RE = re.compile(r"post[-_]\d+")

    def __init__(self) -> None:
        super().__init__()

    def extract(self, html="", base_url="", **kwargs) -> dict:
        """Parse *html* and return the extracted forum content.

        Parameters
        ----------
        html : str
            Raw HTML of the forum page.
        base_url : str
            Fallback base URL; overridden by a ``<base href>`` tag when one
            with an http(s) URL is present in the document.
        **kwargs
            Optional overrides for the instance flags
            ``preserve_math_containers``, ``process_math``,
            ``include_tables`` and ``include_images``.

        Returns
        -------
        dict
            Keys: ``xp_num``, ``drop_list``, ``html`` (final body HTML),
            ``title``, ``base_url``, ``text_length``.

        Raises
        ------
        ValueError
            If the HTML cannot be parsed into a document tree.
        """
        self.preserve_math_containers = kwargs.get("preserve_math_containers", self.preserve_math_containers)
        self.process_math = kwargs.get("process_math", self.process_math)
        self.include_tables = kwargs.get("include_tables", self.include_tables)
        self.include_images = kwargs.get("include_images", self.include_images)
        self.need_comment = True
        # Normalize special unicode space characters to plain ASCII spaces.
        html = html.replace(" ", " ").replace(" ", " ")
        tree = load_html(html)
        if tree is None:
            raise ValueError("failed to parse HTML into a document tree")

        # Page title.
        title = TitleParser().process(tree)

        # Prefer an explicit <base href> over the caller-supplied base_url.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]
        self.generate_unique_id(tree)

        format_tree = self.convert_tags(tree, base_url=base_url)
        format_tree = self._remove_tables_from_tree(format_tree)
        format_tree = self._remove_images_from_tree(format_tree)

        normal_tree = self.clean_tags(format_tree)
        normal_tree = self._remove_tables_from_tree(normal_tree)
        normal_tree = self._remove_images_from_tree(normal_tree)

        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
        if xp_num == "others":
            subtree, drop_list = self.prune_unwanted_sections(normal_tree)
        body_html = self.get_content_html(subtree, xp_num, base_url)
        body_html = self._strip_tables_from_html(body_html)
        body_html = self._strip_images_from_html(body_html)

        # Forum-specific step: re-attach comment/post nodes around the body.
        body_html_tree = fromstring(body_html)
        try:
            body_tree = body_html_tree.body
        except Exception:
            # The extracted HTML is a fragment without a <body>; wrap it.
            body_tree = Element("body")
            body_tree.extend(body_html_tree)
        main_ids = body_tree.xpath(f".//@{Unique_ID}")

        # Drop from the full tree every node already present in the body so it
        # is not duplicated when posts are transplanted below. The id is
        # quoted so the XPath stays valid even for non-numeric values.
        for main_id in main_ids:
            main_tree = normal_tree.xpath(f".//*[@{Unique_ID}='{main_id}']")
            if main_tree:
                self.remove_node(main_tree[0])
        if not main_ids:
            # Sentinel: every node compares as "after" the (empty) main body.
            main_ids = [-1]

        if xp_num != "others":
            normal_tree, _ = self.prune_unwanted_sections(normal_tree)
        # Unique_ID of the last node kept in the main body; loop-invariant.
        last_main_id = int(main_ids[-1])
        for c_xpath in Forum_XPATH:
            while normal_tree.xpath(c_xpath):
                x = normal_tree.xpath(c_xpath)[0]
                self.remove_node(x)
                if "'post-'" in c_xpath:
                    # Only genuine "post-<n>"/"post_<n>" containers get
                    # re-attached; anything else stays removed.
                    if not self._POST_ID_RE.search(x.attrib.get("id", "").lower()):
                        continue
                if (
                        "header" in x.attrib.get("class", "").lower()
                        or "header" in x.attrib.get("id", "").lower()
                ):
                    # Headers are navigation chrome, never comment content.
                    continue
                try:
                    if int(x.attrib.get(Unique_ID, "0")) > last_main_id:
                        # Node occurs entirely after the main content:
                        # append it as a trailing comment.
                        body_tree.append(x)
                    else:
                        # Node straddles the main content: split descendants
                        # into those before and after the last main node.
                        prefix_div = Element("div")
                        suffix_div = Element("div")
                        need_prefix = False
                        need_suffix = False
                        while x.xpath(
                                f".//*[number(@{Unique_ID}) > {last_main_id}]"
                        ):
                            tmp_x = x.xpath(
                                f".//*[number(@{Unique_ID}) > {last_main_id}]"
                            )[0]
                            self.remove_node(tmp_x)
                            suffix_div.append(tmp_x)
                            need_suffix = True
                        while x.xpath(
                                f".//*[number(@{Unique_ID}) < {last_main_id}]"
                        ):
                            tmp_x = x.xpath(
                                f".//*[number(@{Unique_ID}) < {last_main_id}]"
                            )[0]
                            self.remove_node(tmp_x)
                            prefix_div.append(tmp_x)
                            need_prefix = True
                        if need_prefix:
                            body_tree.insert(0, prefix_div)
                        if need_suffix:
                            body_tree.append(suffix_div)
                except Exception:
                    # Best-effort transplant: a malformed Unique_ID value
                    # simply skips this node rather than aborting extraction.
                    pass

        # Strip the internal Unique_ID bookkeeping attribute from the output.
        body_html = re.sub(
            rf' {Unique_ID}="\d+"',
            "",
            tostring(body_tree, encoding=str),
        )

        text_length = self._text_length_from_html(body_html)

        return {
            "xp_num": xp_num,
            "drop_list": drop_list,
            "html": body_html,
            "title": title,
            "base_url": base_url,
            "text_length": text_length,
        }