| from __future__ import annotations | |
| from app.processing.chunking import chunk_documents, chunk_blocks, split_block_by_tokens | |
| from app.processing.documents import iter_raw_documents, modality_from_path, text_dump_exists, ticker_from_path | |
| from app.processing.outputs import summarize_processed_data, write_processed_outputs | |
| from app.processing.readers import load_metadata_for_artifact, read_csv_rows, read_pdf_text, read_raw_file | |
| from app.processing.structures import HTMLStructureParser, parse_document_structures, parse_text_blocks | |
| from app.processing.text_utils import ( | |
| clean_document_text, | |
| detokenize, | |
| is_noise_line, | |
| looks_like_heading, | |
| looks_like_table, | |
| looks_like_widget, | |
| normalize_text, | |
| rows_to_table_text, | |
| stable_id, | |
| token_count, | |
| tokenize, | |
| ) | |
def process_raw_data():
    """Run the raw-data processing pipeline and return the chunks.

    Obtains the documents from ``iter_raw_documents()``, chunks them with
    ``chunk_documents``, hands both documents and chunks to
    ``write_processed_outputs``, and returns the chunks.

    NOTE(review): the value returned by ``iter_raw_documents()`` is passed to
    two consumers in sequence. This presumably relies on it being a reusable
    collection rather than a one-shot iterator -- confirm against its
    implementation.
    """
    raw_docs = iter_raw_documents()
    doc_chunks = chunk_documents(raw_docs)
    write_processed_outputs(raw_docs, doc_chunks)
    return doc_chunks
def process_raw_data_with_summary():
    """Run the raw-data processing pipeline and return chunks plus a summary.

    Performs the same steps as the plain pipeline (load documents, chunk
    them, write outputs) and then calls ``summarize_processed_data`` on the
    documents and chunks.

    Returns a 2-tuple ``(chunks, summary)`` where ``summary`` is whatever
    ``summarize_processed_data`` returns.

    NOTE(review): the value returned by ``iter_raw_documents()`` is passed to
    three consumers in sequence. This presumably relies on it being a
    reusable collection rather than a one-shot iterator -- confirm against
    its implementation.
    """
    raw_docs = iter_raw_documents()
    doc_chunks = chunk_documents(raw_docs)
    # Outputs are written before the summary is computed, as in the original.
    write_processed_outputs(raw_docs, doc_chunks)
    summary = summarize_processed_data(raw_docs, doc_chunks)
    return doc_chunks, summary
# Public API of this module: every helper imported above plus the two
# pipeline entry points defined here. Kept as a plain literal list, in ASCII
# sort order (uppercase names first).
__all__ = [
    "HTMLStructureParser",
    "chunk_blocks",
    "chunk_documents",
    "clean_document_text",
    "detokenize",
    "is_noise_line",
    "iter_raw_documents",
    "load_metadata_for_artifact",
    "looks_like_heading",
    "looks_like_table",
    "looks_like_widget",
    "modality_from_path",
    "normalize_text",
    "parse_document_structures",
    "parse_text_blocks",
    "process_raw_data",
    "process_raw_data_with_summary",
    "read_csv_rows",
    "read_pdf_text",
    "read_raw_file",
    "rows_to_table_text",
    "split_block_by_tokens",
    "stable_id",
    "summarize_processed_data",
    "text_dump_exists",
    "ticker_from_path",
    "token_count",
    "tokenize",
    "write_processed_outputs",
]