File size: 1,990 Bytes
34b531b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from __future__ import annotations

from app.processing.chunking import chunk_documents, chunk_blocks, split_block_by_tokens
from app.processing.documents import iter_raw_documents, modality_from_path, text_dump_exists, ticker_from_path
from app.processing.outputs import summarize_processed_data, write_processed_outputs
from app.processing.readers import load_metadata_for_artifact, read_csv_rows, read_pdf_text, read_raw_file
from app.processing.structures import HTMLStructureParser, parse_document_structures, parse_text_blocks
from app.processing.text_utils import (
    clean_document_text,
    detokenize,
    is_noise_line,
    looks_like_heading,
    looks_like_table,
    looks_like_widget,
    normalize_text,
    rows_to_table_text,
    stable_id,
    token_count,
    tokenize,
)


def process_raw_data():
    """Run the raw-data processing pipeline and return the chunks.

    Loads the raw documents with ``iter_raw_documents``, chunks them with
    ``chunk_documents``, and passes both the documents and the chunks to
    ``write_processed_outputs``.

    Returns:
        The value produced by ``chunk_documents`` for the loaded documents.
    """
    # The documents are consumed twice below (chunking, then writing).
    # Materialize them once so that a single-pass iterator is not already
    # exhausted by the time it reaches write_processed_outputs.
    # NOTE(review): iter_raw_documents is defined elsewhere; if it already
    # returns a list this is only a shallow copy -- confirm.
    documents = list(iter_raw_documents())
    chunks = chunk_documents(documents)
    write_processed_outputs(documents, chunks)
    return chunks


def process_raw_data_with_summary():
    """Run the raw-data processing pipeline and also return a summary.

    Performs the same load / chunk / write sequence as ``process_raw_data``
    and additionally calls ``summarize_processed_data`` on the documents and
    chunks.

    Returns:
        A ``(chunks, summary)`` tuple, where ``chunks`` is the result of
        ``chunk_documents`` and ``summary`` is the result of
        ``summarize_processed_data(documents, chunks)``.
    """
    # The documents are consumed three times below (chunking, writing,
    # summarizing). Materialize them once so that a single-pass iterator is
    # not exhausted after the first consumer.
    # NOTE(review): iter_raw_documents is defined elsewhere; if it already
    # returns a list this is only a shallow copy -- confirm.
    documents = list(iter_raw_documents())
    chunks = chunk_documents(documents)
    write_processed_outputs(documents, chunks)
    summary = summarize_processed_data(documents, chunks)
    return chunks, summary


# Public API of this module: every name imported above from the
# app.processing submodules, plus the two pipeline entry points defined in
# this file (process_raw_data and process_raw_data_with_summary).
# Entries are kept in ASCII-sorted order (uppercase names sort first).
__all__ = [
    "HTMLStructureParser",
    "chunk_blocks",
    "chunk_documents",
    "clean_document_text",
    "detokenize",
    "is_noise_line",
    "iter_raw_documents",
    "load_metadata_for_artifact",
    "looks_like_heading",
    "looks_like_table",
    "looks_like_widget",
    "modality_from_path",
    "normalize_text",
    "parse_document_structures",
    "parse_text_blocks",
    "process_raw_data",
    "process_raw_data_with_summary",
    "read_csv_rows",
    "read_pdf_text",
    "read_raw_file",
    "rows_to_table_text",
    "split_block_by_tokens",
    "stable_id",
    "summarize_processed_data",
    "text_dump_exists",
    "ticker_from_path",
    "token_count",
    "tokenize",
    "write_processed_outputs",
]