# Chunking-step data handlers: split cleaned documents into chunk domain models.
import hashlib
from abc import ABC, abstractmethod
from typing import Generic, TypeVar
from uuid import UUID

from llm_engineering.domain.chunks import ArticleChunk, Chunk, PostChunk, RepositoryChunk
from llm_engineering.domain.cleaned_documents import (
    CleanedArticleDocument,
    CleanedDocument,
    CleanedPostDocument,
    CleanedRepositoryDocument,
)

from .operations import chunk_article, chunk_text
# Type variables tying each handler to its (cleaned document -> chunk) pair.
CleanedDocumentT = TypeVar("CleanedDocumentT", bound=CleanedDocument)
ChunkT = TypeVar("ChunkT", bound=Chunk)
class ChunkingDataHandler(ABC, Generic[CleanedDocumentT, ChunkT]):
    """
    Abstract class for all chunking data handlers.

    All data transformation logic for the chunking step is done here.
    Each subclass pairs one cleaned document type with its chunk type and
    overrides `metadata` and `chunk` accordingly.
    """

    @property
    def metadata(self) -> dict:
        """Default chunking parameters; subclasses override with their own.

        Must be a property (not a plain method): `chunk()` implementations
        subscript it directly (`self.metadata["chunk_size"]`) and pass it
        as a dict to the chunk models (`metadata=self.metadata`).
        """
        return {
            "chunk_size": 500,
            "chunk_overlap": 50,
        }

    @abstractmethod
    def chunk(self, data_model: CleanedDocumentT) -> list[ChunkT]:
        """Split `data_model.content` into a list of chunk domain models."""
        ...
class PostChunkingHandler(ChunkingDataHandler):
    """Chunks cleaned social-media posts into `PostChunk` models."""

    @property
    def metadata(self) -> dict:
        """Chunking parameters for posts (smaller window than the base default).

        A property so `chunk()` can subscript it and embed it in each model.
        """
        return {
            "chunk_size": 250,
            "chunk_overlap": 25,
        }

    def chunk(self, data_model: CleanedPostDocument) -> list[PostChunk]:
        """Split the post content and wrap each piece in a `PostChunk`.

        Chunk ids are derived from an MD5 digest of the chunk text, so they
        are deterministic: the same text always maps to the same id.
        """
        # Hoist the property access: it builds a fresh dict on every read.
        metadata = self.metadata
        chunks = chunk_text(
            data_model.content,
            chunk_size=metadata["chunk_size"],
            chunk_overlap=metadata["chunk_overlap"],
        )

        return [
            PostChunk(
                # MD5 hex is 32 hex chars, a valid UUID payload.
                id=UUID(hashlib.md5(chunk.encode()).hexdigest(), version=4),
                content=chunk,
                platform=data_model.platform,
                document_id=data_model.id,
                author_id=data_model.author_id,
                author_full_name=data_model.author_full_name,
                image=data_model.image if data_model.image else None,
                metadata=metadata,
            )
            for chunk in chunks
        ]
class ArticleChunkingHandler(ChunkingDataHandler):
    """Chunks cleaned articles into `ArticleChunk` models."""

    @property
    def metadata(self) -> dict:
        """Chunking parameters for articles (length bounds, not a sliding window).

        A property so `chunk()` can subscript it and embed it in each model.
        """
        return {
            "min_length": 1000,
            "max_length": 2000,
        }

    def chunk(self, data_model: CleanedArticleDocument) -> list[ArticleChunk]:
        """Split the article content and wrap each piece in an `ArticleChunk`.

        Chunk ids are derived from an MD5 digest of the chunk text, so they
        are deterministic: the same text always maps to the same id.
        """
        # Hoist the property access: it builds a fresh dict on every read.
        metadata = self.metadata
        chunks = chunk_article(
            data_model.content,
            min_length=metadata["min_length"],
            max_length=metadata["max_length"],
        )

        return [
            ArticleChunk(
                # MD5 hex is 32 hex chars, a valid UUID payload.
                id=UUID(hashlib.md5(chunk.encode()).hexdigest(), version=4),
                content=chunk,
                platform=data_model.platform,
                link=data_model.link,
                document_id=data_model.id,
                author_id=data_model.author_id,
                author_full_name=data_model.author_full_name,
                metadata=metadata,
            )
            for chunk in chunks
        ]
class RepositoryChunkingHandler(ChunkingDataHandler):
    """Chunks cleaned repository files into `RepositoryChunk` models."""

    @property
    def metadata(self) -> dict:
        """Chunking parameters for code repositories (larger window than posts).

        A property so `chunk()` can subscript it and embed it in each model.
        """
        return {
            "chunk_size": 1500,
            "chunk_overlap": 100,
        }

    def chunk(self, data_model: CleanedRepositoryDocument) -> list[RepositoryChunk]:
        """Split the repository content and wrap each piece in a `RepositoryChunk`.

        Chunk ids are derived from an MD5 digest of the chunk text, so they
        are deterministic: the same text always maps to the same id.
        """
        # Hoist the property access: it builds a fresh dict on every read.
        metadata = self.metadata
        chunks = chunk_text(
            data_model.content,
            chunk_size=metadata["chunk_size"],
            chunk_overlap=metadata["chunk_overlap"],
        )

        return [
            RepositoryChunk(
                # MD5 hex is 32 hex chars, a valid UUID payload.
                id=UUID(hashlib.md5(chunk.encode()).hexdigest(), version=4),
                content=chunk,
                platform=data_model.platform,
                name=data_model.name,
                link=data_model.link,
                document_id=data_model.id,
                author_id=data_model.author_id,
                author_full_name=data_model.author_full_name,
                metadata=metadata,
            )
            for chunk in chunks
        ]