File size: 1,990 Bytes
34b531b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 | from __future__ import annotations
from app.processing.chunking import chunk_documents, chunk_blocks, split_block_by_tokens
from app.processing.documents import iter_raw_documents, modality_from_path, text_dump_exists, ticker_from_path
from app.processing.outputs import summarize_processed_data, write_processed_outputs
from app.processing.readers import load_metadata_for_artifact, read_csv_rows, read_pdf_text, read_raw_file
from app.processing.structures import HTMLStructureParser, parse_document_structures, parse_text_blocks
from app.processing.text_utils import (
clean_document_text,
detokenize,
is_noise_line,
looks_like_heading,
looks_like_table,
looks_like_widget,
normalize_text,
rows_to_table_text,
stable_id,
token_count,
tokenize,
)
def process_raw_data():
    """Run the raw-data pipeline end to end and return the chunks.

    Fetches the raw documents via ``iter_raw_documents``, chunks them with
    ``chunk_documents``, and hands both the documents and the chunks to
    ``write_processed_outputs``.

    Returns:
        The value produced by ``chunk_documents`` for the raw documents.
    """
    raw_docs = iter_raw_documents()
    # NOTE(review): ``raw_docs`` is passed to two consumers below. If
    # ``iter_raw_documents`` returns a one-shot iterator rather than a
    # reusable collection, the writer would see it already exhausted --
    # confirm against app.processing.documents.
    doc_chunks = chunk_documents(raw_docs)
    write_processed_outputs(raw_docs, doc_chunks)
    return doc_chunks
def process_raw_data_with_summary():
    """Run the raw-data pipeline and also return a summary of the result.

    Performs the same three steps as the plain pipeline -- fetch documents
    with ``iter_raw_documents``, chunk them with ``chunk_documents``, pass
    both to ``write_processed_outputs`` -- and then calls
    ``summarize_processed_data`` on the same documents and chunks.

    Returns:
        A 2-tuple ``(chunks, summary)`` where ``chunks`` is the result of
        ``chunk_documents`` and ``summary`` is the result of
        ``summarize_processed_data``.
    """
    raw_docs = iter_raw_documents()
    # NOTE(review): ``raw_docs`` is passed to three consumers below. If
    # ``iter_raw_documents`` returns a one-shot iterator rather than a
    # reusable collection, the later consumers would see it already
    # exhausted -- confirm against app.processing.documents.
    doc_chunks = chunk_documents(raw_docs)
    write_processed_outputs(raw_docs, doc_chunks)
    summary = summarize_processed_data(raw_docs, doc_chunks)
    return doc_chunks, summary
# Public API of this module: every name imported above from the
# app.processing submodules, plus the two pipeline entry points defined in
# this file (process_raw_data, process_raw_data_with_summary).
# Entries are kept in ASCII sort order (uppercase before lowercase).
__all__ = [
"HTMLStructureParser",
"chunk_blocks",
"chunk_documents",
"clean_document_text",
"detokenize",
"is_noise_line",
"iter_raw_documents",
"load_metadata_for_artifact",
"looks_like_heading",
"looks_like_table",
"looks_like_widget",
"modality_from_path",
"normalize_text",
"parse_document_structures",
"parse_text_blocks",
"process_raw_data",
"process_raw_data_with_summary",
"read_csv_rows",
"read_pdf_text",
"read_raw_file",
"rows_to_table_text",
"split_block_by_tokens",
"stable_id",
"summarize_processed_data",
"text_dump_exists",
"ticker_from_path",
"token_count",
"tokenize",
"write_processed_outputs",
]
|