File size: 2,364 Bytes
26aeab0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# parser/table_extractor.py
from bs4 import BeautifulSoup, Tag
from typing import List, Dict, Any
from helpers.cleaner import clean_text
from parser.article_extractor import is_article_line, extract_article_number
from parser.section_extractor import is_section_line

def tables_from_soup(soup: BeautifulSoup):
    """Collect every ``<table>`` element found in *soup*.

    Args:
        soup: Parsed document (or any tag) to search.

    Returns:
        The result of ``soup.find_all("table")``; the search is recursive,
        so tables nested inside other tables are included as well.
    """
    found_tables = soup.find_all("table")
    return found_tables

def table_to_struct(table: Tag):
    """Convert an HTML table element into a headers/rows dictionary.

    The first ``<tr>`` is treated as the header row; every following ``<tr>``
    becomes one data row. Each cell's text fragments are joined with single
    spaces and passed through ``clean_text``.

    Args:
        table: The ``<table>`` element to convert.

    Returns:
        ``{"headers": [...], "rows": [[...], ...]}``. Rows shorter than the
        header are right-padded with empty strings; longer rows are kept
        as they are. A table without any ``<tr>`` yields two empty lists.
    """

    def _cell_texts(row, tag_names):
        # One cleaned string per cell, in the order the cells appear.
        return [
            clean_text(" ".join(cell.stripped_strings))
            for cell in row.find_all(tag_names)
        ]

    all_rows = table.find_all("tr")
    if not all_rows:
        return {"headers": [], "rows": []}

    header_row, *body_rows = all_rows
    headers = _cell_texts(header_row, ["th", "td"])
    width = len(headers)

    rows = []
    for body_row in body_rows:
        cells = _cell_texts(body_row, ["td", "th"])
        missing = width - len(cells)
        if missing > 0:
            # Pad short rows so they line up with the header columns.
            cells.extend([""] * missing)
        rows.append(cells)

    return {"headers": headers, "rows": rows}

def _find_article_by_number(sections, number):
    """Return the last article (searching sections in reverse) whose "number" equals *number*, else None."""
    for sec in reversed(sections):
        # Tolerate sections that carry no "articles" list at all.
        for art in reversed(sec.get("articles") or []):
            if art["number"] == number:
                return art
    return None


def _find_section_by_title(sections, title):
    """Return the last section whose "title" equals *title*, else None."""
    for sec in reversed(sections):
        if sec.get("title") == title:
            return sec
    return None


def link_tables_to_sections_and_articles(soup: BeautifulSoup, sections: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Attach every table in *soup* to the article or section preceding it.

    For each table (in the order returned by ``tables_from_soup``) the
    document is walked backwards, one text node at a time, starting just
    before the table. The walk stops at the first text node that is either

    * an article line whose extracted number matches a known article, or
    * a section line whose stripped text equals a known section title.

    The table's structure (see ``table_to_struct``), extended with a
    ``"position"`` key holding the table's index among all tables, is
    appended to the ``"tables"`` list of that article or section. When no
    owner is found, the table is attached to the first section.

    Args:
        soup: Parsed document containing the tables.
        sections: Section dictionaries; each may hold a ``"title"`` and an
            ``"articles"`` list of dictionaries with a ``"number"`` key.
            The dictionaries are modified in place.

    Returns:
        The same *sections* list that was passed in. An empty list is
        returned unchanged, since there is nothing to attach tables to.
    """
    # Without any section there is neither a match target nor a fallback;
    # bail out instead of failing on sections[0] below.
    if not sections:
        return sections

    for idx, table in enumerate(tables_from_soup(soup)):
        struct = table_to_struct(table)
        struct["position"] = idx

        owner = None
        prev = table.find_previous(string=True)

        # Compare against None explicitly: an empty text node is falsy but
        # must not end the backwards walk.
        while prev is not None and owner is None:
            text = prev.strip()

            if is_article_line(text):
                owner = _find_article_by_number(
                    sections, extract_article_number(text)
                )

            # An article line that matched nothing may still be a section line.
            if owner is None and is_section_line(text):
                owner = _find_section_by_title(sections, text)

            prev = prev.find_previous(string=True)

        if owner is None:
            # No heading precedes the table: fall back to the first section.
            owner = sections[0]
        owner.setdefault("tables", []).append(struct)

    return sections