# chatvns/app/processing/__init__.py
# Provenance: uploaded by liamxdev with huggingface_hub
# ("Upload folder using huggingface_hub"), commit 34b531b (verified), 1.99 kB.
from __future__ import annotations
from app.processing.chunking import chunk_documents, chunk_blocks, split_block_by_tokens
from app.processing.documents import iter_raw_documents, modality_from_path, text_dump_exists, ticker_from_path
from app.processing.outputs import summarize_processed_data, write_processed_outputs
from app.processing.readers import load_metadata_for_artifact, read_csv_rows, read_pdf_text, read_raw_file
from app.processing.structures import HTMLStructureParser, parse_document_structures, parse_text_blocks
from app.processing.text_utils import (
clean_document_text,
detokenize,
is_noise_line,
looks_like_heading,
looks_like_table,
looks_like_widget,
normalize_text,
rows_to_table_text,
stable_id,
token_count,
tokenize,
)
def process_raw_data():
    """Run the raw-data pipeline end to end and return the chunks.

    Obtains the documents from ``iter_raw_documents()``, chunks them with
    ``chunk_documents()``, passes both the documents and the chunks to
    ``write_processed_outputs()``, and finally returns the chunks.
    """
    # NOTE(review): the same object is handed to two consumers below; this
    # presumes ``iter_raw_documents()`` yields a re-iterable collection
    # rather than a one-shot iterator -- confirm against its definition.
    raw_docs = iter_raw_documents()
    doc_chunks = chunk_documents(raw_docs)

    # Write the outputs before returning so callers get chunks only after
    # the write step has completed.
    write_processed_outputs(raw_docs, doc_chunks)
    return doc_chunks
def process_raw_data_with_summary():
    """Run the raw-data pipeline and return ``(chunks, summary)``.

    Performs the same steps as ``process_raw_data`` -- load documents, chunk
    them, write the processed outputs -- and then additionally calls
    ``summarize_processed_data()`` on the documents and chunks. The result is
    a 2-tuple of the chunks and whatever that summary call returns.
    """
    # NOTE(review): the documents object is consumed three times here
    # (chunking, writing, summarising); this presumes it is re-iterable --
    # confirm against ``iter_raw_documents``.
    raw_docs = iter_raw_documents()
    doc_chunks = chunk_documents(raw_docs)
    write_processed_outputs(raw_docs, doc_chunks)

    # The summary is computed only after the outputs have been written,
    # matching the original evaluation order.
    summary = summarize_processed_data(raw_docs, doc_chunks)
    return doc_chunks, summary
# Public API of the ``app.processing`` package: every helper re-exported from
# the submodules imported at the top of this file, plus the two pipeline entry
# points defined here (``process_raw_data`` and
# ``process_raw_data_with_summary``). Entries are kept in alphabetical order.
__all__ = [
"HTMLStructureParser",
"chunk_blocks",
"chunk_documents",
"clean_document_text",
"detokenize",
"is_noise_line",
"iter_raw_documents",
"load_metadata_for_artifact",
"looks_like_heading",
"looks_like_table",
"looks_like_widget",
"modality_from_path",
"normalize_text",
"parse_document_structures",
"parse_text_blocks",
"process_raw_data",
"process_raw_data_with_summary",
"read_csv_rows",
"read_pdf_text",
"read_raw_file",
"rows_to_table_text",
"split_block_by_tokens",
"stable_id",
"summarize_processed_data",
"text_dump_exists",
"ticker_from_path",
"token_count",
"tokenize",
"write_processed_outputs",
]