Spaces:
Sleeping
Sleeping
| # file: chunking.py | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_core.documents import Document | |
| from typing import List | |
| from unstructured.partition.md import partition_md | |
| from unstructured.documents.elements import Header, Footer, PageBreak, Table, NarrativeText | |
# --- Configuration ---
# Both limits are measured in characters: the splitter in process_and_chunk
# is created with length_function=len.
CHUNK_SIZE = 1000  # upper bound for a single narrative-text chunk
CHUNK_OVERLAP = 200  # characters shared by consecutive chunks


def _tagged_metadata(element, content_type: str) -> dict:
    """Return a fresh metadata dict for *element*, labelled with *content_type*.

    A new dict is built on every call so that no two Documents share (and
    accidentally mutate) the same metadata object.
    """
    metadata = element.metadata.to_dict()
    metadata['content_type'] = content_type
    return metadata


def process_and_chunk(raw_text: str) -> List[Document]:
    """
    Partition markdown text with 'unstructured' and turn it into Documents.

    Header, Footer and PageBreak elements are dropped. Each table becomes one
    Document (HTML rendering when available), narrative text is split into
    overlapping chunks, and every other element is kept as a single Document.
    Every Document carries the element's metadata plus a 'content_type' key
    set to 'table', 'text' or 'other'.

    Args:
        raw_text: The raw string content of the document (expected to be markdown).

    Returns:
        A list of Document objects in element order. The list is empty when
        the input is empty or contains only whitespace.
    """
    # Whitespace-only input has nothing to partition; treat it like empty
    # input instead of handing it to the parser.
    if not raw_text or not raw_text.strip():
        print("Warning: Input text for chunking is empty.")
        return []

    print(f"Processing raw text of length {len(raw_text)} with 'unstructured' markdown parser.")

    # partition_md interprets the input as markdown, so tables arrive as
    # Table elements rather than as flattened text.
    elements = partition_md(text=raw_text)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
        is_separator_regex=False,
    )

    documents = []
    for element in elements:
        # Page furniture carries no content worth indexing.
        if isinstance(element, (Header, Footer, PageBreak)):
            continue

        if isinstance(element, Table):
            # Tables are kept whole. text_as_html is optional metadata, so
            # fall back to the plain text rather than passing None as
            # page_content.
            table_content = element.metadata.text_as_html or element.text
            documents.append(
                Document(
                    page_content=table_content,
                    metadata=_tagged_metadata(element, 'table'),
                )
            )
        elif isinstance(element, NarrativeText):
            # Only narrative text is split into overlapping chunks.
            for chunk in text_splitter.split_text(element.text):
                documents.append(
                    Document(
                        page_content=chunk,
                        metadata=_tagged_metadata(element, 'text'),
                    )
                )
        else:
            # Titles, list items, etc. are passed through unchanged.
            documents.append(
                Document(
                    page_content=element.text,
                    metadata=_tagged_metadata(element, 'other'),
                )
            )

    print(f"Created {len(documents)} documents (chunks and tables).")
    return documents