NavyDevilDoc commited on
Commit
c0f31c1
·
verified ·
1 Parent(s): 3755446

Upload 10 files

Browse files
src/core/BaseChunker.py ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BaseChunker.py
3
+
4
+ An abstract base class defining the interface for document chunking strategies.
5
+ """
6
+
7
+ import logging
8
+ from core.OCREnhancedPDFLoader import OCREnhancedPDFLoader
9
+ from core.TextPreprocessor import TextPreprocessor
10
+ import numpy as np
11
+ from abc import ABC, abstractmethod
12
+ from typing import List, Dict, Any, Optional, Union
13
+
14
+ import spacy
15
+ from langchain_core.documents import Document
16
+
17
+ # Import tiktoken at the module level
18
+ try:
19
+ import tiktoken
20
+ TIKTOKEN_AVAILABLE = True
21
+ except ImportError:
22
+ TIKTOKEN_AVAILABLE = False
23
+ logging.warning("tiktoken not installed. Some tokenization features will be limited. "
24
+ "Install with: pip install tiktoken")
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
class BaseChunker(ABC):
    """Abstract base class for document chunking strategies.

    Provides the shared machinery concrete chunkers rely on: tokenizer
    selection (tiktoken / transformers / whitespace fallback), token
    counting, embedding generation, text analysis, and document
    loading/validation helpers.
    """

    # Common constants
    BLANK_THRESHOLD = 20  # Minimum characters for non-blank text
    TOKEN_THRESHOLD = 10  # Minimum tokens for valid content

    # Model type indicators
    TIKTOKEN_MODELS = ["gpt", "davinci", "curie", "babbage", "ada"]
    BASIC_TOKENIZER_MODELS = ["llama", "mistral", "granite"]

    # Encoding names tiktoken accepts directly via get_encoding()
    TIKTOKEN_ENCODINGS = ["cl100k_base", "p50k_base", "r50k_base", "gpt2"]

    def __init__(self, model_name: Optional[str] = None, embedding_model: Optional[Any] = None):
        """
        Initialize base chunker with model settings.

        Args:
            model_name: Name of the model for tokenization
            embedding_model: Model for generating embeddings
        """
        self.model_name = model_name
        self.embedding_model = embedding_model
        self.uses_tiktoken = False
        self.uses_basic_tokenizer = False
        self.tokenizer = None
        self._initialize_tokenizer()

        # Initialize NLP pipeline for text analysis (sentence segmentation)
        self.nlp = spacy.load("en_core_web_sm")

    @staticmethod
    def _wrap_tiktoken(encoding):
        """Adapt a tiktoken encoding to the tokenize() interface used below."""
        class TiktokenWrapper:
            def __init__(self, encoding):
                self.encoding = encoding

            def tokenize(self, text):
                return self.encoding.encode(text)

        return TiktokenWrapper(encoding)

    def _use_tiktoken_encoding(self, encoding) -> None:
        """Install *encoding* as the active tiktoken-backed tokenizer."""
        self.tokenizer = self._wrap_tiktoken(encoding)
        self.uses_tiktoken = True

    def _tiktoken_fallback(self) -> None:
        """Fall back to the standard cl100k_base encoding, else basic tokenization."""
        try:
            self._use_tiktoken_encoding(tiktoken.get_encoding("cl100k_base"))
            logger.info("Initialized tiktoken with cl100k_base encoding")
        except Exception as e:
            logger.warning(f"Error initializing tiktoken: {e}")
            self.uses_basic_tokenizer = True

    def _initialize_tokenizer(self):
        """Initialize the appropriate tokenizer based on model name.

        Resolution order:
          1. model_name is itself a tiktoken encoding name
          2. model_name looks like an OpenAI-family model (tiktoken)
          3. model_name is known to need only basic tokenization
          4. transformers AutoTokenizer, else basic tokenization
        """
        if not self.model_name:
            logger.warning("No model name provided. Using basic tokenization.")
            self.uses_basic_tokenizer = True
            return

        # Case 1: explicit tiktoken encoding name.
        if TIKTOKEN_AVAILABLE and self.model_name in self.TIKTOKEN_ENCODINGS:
            try:
                self._use_tiktoken_encoding(tiktoken.get_encoding(self.model_name))
                logger.info(f"Initialized tiktoken tokenizer for model: {self.model_name}")
            except Exception as e:
                logger.warning(f"Error with specified tiktoken model: {e}")
                self._tiktoken_fallback()
            # BUGFIX: the original fell through into the branches below after a
            # successful fallback, letting the transformers/basic fallback
            # clobber an already-working tiktoken tokenizer.
            return

        # Case 2: OpenAI-family model name (e.g. "gpt-4").
        if TIKTOKEN_AVAILABLE and (
            any(model in self.model_name.lower() for model in self.TIKTOKEN_MODELS) or
            self.model_name.startswith("gpt-") or
            self.model_name.endswith("-base")
        ):
            try:
                # BUGFIX: model names require encoding_for_model();
                # get_encoding() only accepts encoding names and would raise
                # for every real model name, forcing the fallback path.
                self._use_tiktoken_encoding(tiktoken.encoding_for_model(self.model_name))
                logger.info(f"Initialized tiktoken tokenizer for model: {self.model_name}")
            except Exception as e:
                logger.warning(f"Error with specified tiktoken model: {e}")
                self._tiktoken_fallback()

        # Case 3: models known to use basic tokenization.
        elif any(model in self.model_name.lower() for model in self.BASIC_TOKENIZER_MODELS):
            self.uses_basic_tokenizer = True
            logger.info("Using basic tokenization for model")

        # Case 4: fall back to a transformers tokenizer.
        else:
            try:
                from transformers import AutoTokenizer
                self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
                logger.info(f"Initialized transformers tokenizer for model: {self.model_name}")
            except Exception as e:
                logger.warning(f"Error initializing transformer tokenizer: {e}")
                logger.warning("Falling back to basic tokenization")
                self.uses_basic_tokenizer = True

    def count_tokens(self, text: str) -> int:
        """Count tokens in *text* using the best available tokenizer.

        Falls back to whitespace splitting, then to a rough 4-chars-per-token
        estimate, so this never raises.
        """
        if not text:
            return 0

        try:
            if self.tokenizer:
                # Both the tiktoken wrapper and transformers tokenizers expose
                # tokenize(); the length of the token list is the count.
                return len(self.tokenizer.tokenize(text))
        except Exception as e:
            logger.warning(f"Primary tokenization failed: {e}")

        # Basic tokenization fallback: approximate with word count.
        if self.uses_basic_tokenizer or not self.tokenizer:
            return len(text.split())

        # Tokenizer exists but failed: rough character-to-token ratio.
        return len(text) // 4

    def get_embedding(self, text: str) -> Optional[np.ndarray]:
        """Generate an embedding vector for *text*.

        Returns None for blank input, when no embedding model is configured,
        or when the encoder raises.
        """
        if not text.strip() or not self.embedding_model:
            return None

        try:
            return self.embedding_model.encode(text)
        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            return None

    def analyze_text(self, text: str) -> Dict[str, Any]:
        """Perform detailed analysis of text content.

        Returns:
            Dict with char_count, token_count, sentence_count, word_count,
            embedding_dim and has_content. Counts degrade to zero when the
            NLP/embedding analysis fails.
        """
        if not text.strip():
            return {
                "char_count": 0,
                "token_count": 0,
                "sentence_count": 0,
                "word_count": 0,
                "embedding_dim": 0,
                "has_content": False
            }

        try:
            embedding = self.get_embedding(text)
            doc = self.nlp(text)

            return {
                "char_count": len(text),
                "token_count": self.count_tokens(text),
                "sentence_count": len(list(doc.sents)),
                "word_count": len(text.split()),
                "embedding_dim": len(embedding) if embedding is not None else 0,
                "has_content": bool(text.strip())
            }

        except Exception as e:
            logger.error(f"Error analyzing text: {e}")
            # Degraded result: only cheap, exception-free counts.
            return {
                "char_count": len(text),
                "token_count": 0,
                "sentence_count": 0,
                "word_count": len(text.split()),
                "embedding_dim": 0,
                "has_content": bool(text.strip())
            }

    def is_content_valid(self, text: str, min_chars: Optional[int] = None,
                         min_tokens: Optional[int] = None) -> bool:
        """Check if content meets minimum length requirements.

        Args:
            text: Content to check
            min_chars: Minimum characters (defaults to BLANK_THRESHOLD)
            min_tokens: Minimum tokens (defaults to TOKEN_THRESHOLD)
        """
        if not text.strip():
            return False

        # BUGFIX: explicit None checks so callers may pass 0 to disable a
        # threshold (the original `min_chars or ...` treated 0 as "default").
        if min_chars is None:
            min_chars = self.BLANK_THRESHOLD
        if min_tokens is None:
            min_tokens = self.TOKEN_THRESHOLD

        if len(text.strip()) < min_chars:
            return False

        return self.count_tokens(text) >= min_tokens

    def validate_documents(self, documents: List[Document]) -> List[Document]:
        """Validate documents before sending to a vector database.

        Drops empty documents and strips BOM/zero-width characters from the
        start of each remaining document's content (in place).
        """
        valid_documents = []

        for i, doc in enumerate(documents):
            # Skip documents whose content is empty or only whitespace.
            if not doc.page_content or not doc.page_content.strip():
                print(f"Skipping document {i}: Empty content")
                continue

            # Remove any potential BOM or invisible characters at the start;
            # these can confuse downstream embedders/vector stores.
            doc.page_content = doc.page_content.lstrip(
                '\ufeff\u200b\u200c\u200d\u200e\u200f\u2060')

            valid_documents.append(doc)

        print(f"Validated {len(valid_documents)} of {len(documents)} documents")
        return valid_documents

    def debug_documents(self, documents: List[Document], num_chars: int = 50) -> None:
        """Print diagnostic information (length, leading char codes, preview)
        for each document."""
        print(f"\nDEBUG INFO: Examining {len(documents)} documents")

        for i, doc in enumerate(documents):
            content = doc.page_content
            if not content:
                print(f"  Doc {i}: EMPTY CONTENT")
                continue

            # Show the first few characters and their Unicode code points to
            # expose invisible/garbled leading characters.
            first_chars = content[:num_chars]
            char_codes = [f"{c}({ord(c)})" for c in first_chars[:10]]

            print(f"  Doc {i}: Length={len(content)}, First chars: {''.join(char_codes)}")
            print(f"  Preview: {first_chars!r}")

        print("DEBUG INFO END\n")

    def load_document(self, file_path: str) -> List[Document]:
        """Load a PDF using OCREnhancedPDFLoader, then debug and validate it.

        Raises:
            Exception: Re-raises any loader failure after logging it.
        """
        try:
            loader = OCREnhancedPDFLoader(file_path)
            documents = loader.load()
            self.debug_documents(documents)
            return self.validate_documents(documents)
        except Exception as e:
            logger.error(f"Error loading document: {e}")
            raise

    def preprocess_text(self, text: str, remove_headers_footers: bool = True) -> str:
        """Preprocess text using TextPreprocessor; returns the input unchanged
        if preprocessing fails."""
        try:
            preprocessor = TextPreprocessor()
            return preprocessor.preprocess(text, remove_headers_footers)
        except Exception as e:
            logger.error(f"Error preprocessing text: {e}")
            return text

    @abstractmethod
    def process_document(self, file_path: str, preprocess: bool = True) -> Union[List[Document], Dict[str, List[Document]]]:
        """Process document using the concrete chunking strategy."""
        pass

    def load_text_file(self, file_path: str) -> str:
        """
        Load raw text file content.

        Args:
            file_path: Path to the text file

        Returns:
            Raw text content

        Raises:
            OSError/UnicodeDecodeError: Re-raised after logging.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            logger.info(f"Loaded text file: {file_path} ({len(content)} characters)")
            return content
        except Exception as e:
            logger.error(f"Error loading text file {file_path}: {e}")
            raise

    def clean_text_for_processing(self, text: str) -> str:
        """
        Clean text using Unicode character replacement (same as PDF conversion logic).

        Smart quotes, dashes, ellipses, bullets and arrows are mapped to ASCII
        equivalents; zero-width spaces are dropped.

        Args:
            text: Raw text content

        Returns:
            Cleaned text content
        """
        replacements = {
            '\u2019': "'", '\u2018': "'", '\u201c': '"', '\u201d': '"',
            '\u2014': '-', '\u2013': '-', '\u2026': '...',
            '\u200b': '', '\u00a0': ' ', '\u2022': '*',
            '\u2192': '->', '\u2190': '<-',
        }

        for old, new in replacements.items():
            text = text.replace(old, new)

        return text

    def process_text_file(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """
        Default text file processing method; must be overridden by chunkers
        that support plain-text input (e.g. ParagraphChunker, TokenChunker).

        Args:
            file_path: Path to the text file
            preprocess: Whether to preprocess the text

        Returns:
            List of Document objects

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError("Subclasses must implement process_text_file method")
src/core/ChunkingManager.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ChunkingManager.py
3
+
4
+ A manager class that orchestrates document chunking using different strategies.
5
+ """
6
+
7
+ from typing import Dict, List, Optional, Union
8
+ from pathlib import Path
9
+ from sentence_transformers import SentenceTransformer
10
+ from langchain_core.documents import Document
11
+
12
+ # Import chunker strategies
13
+ from core.BaseChunker import BaseChunker
14
+ from core.PageChunker import PageChunker
15
+ from core.ParagraphChunker import ParagraphChunker
16
+ from core.SemanticChunker import SemanticChunker
17
+ from core.HierarchicalChunker import HierarchicalChunker
18
+ from core.TokenChunker import TokenChunker
19
+ import logging
20
+
21
+ # Configure logging
22
+ logging.basicConfig(level=logging.INFO)
23
+ logger = logging.getLogger(__name__)
24
+
25
class ChunkingStrategy:
    """Enumeration of available chunking strategies."""
    # Plain string constants (not enum.Enum) so callers can pass bare strings;
    # ChunkingManager lower-cases its input before comparing against these.
    PAGE = "page"
    PARAGRAPH = "paragraph"
    SEMANTIC = "semantic"
    HIERARCHICAL = "hierarchical"
    TOKEN = "token"
32
+
33
class ChunkingManager:
    """Manager class for document chunking strategies.

    Lazily constructs one chunker instance per strategy and dispatches PDF,
    CSV, and TXT files to the appropriate processing routine.
    """

    class _DummyEmbedder:
        """Stand-in embedder used when a real SentenceTransformer is
        unavailable or unsuitable; encode() returns a fixed zero vector."""

        def encode(self, text, **kwargs):
            return [0.0] * 384  # Return dummy vector

    def __init__(
        self,
        embedding_model_name: str = "all-mpnet-base-v2",
        token_model_name: Optional[str] = None
    ):
        """
        Initialize chunking manager.

        Args:
            embedding_model_name: Name of the sentence transformer model
            token_model_name: Name of the token counting model
        """
        self.token_model_name = token_model_name
        self.embedding_model_name = embedding_model_name
        self._embedding_model = None
        self._chunkers = {}  # strategy name -> cached chunker instance

    @property
    def embedding_model(self):
        """Lazy-load the embedding model (or a dummy for tokenization-only use)."""
        if self._embedding_model is None:
            try:
                # Only try to load as SentenceTransformer if it's a known
                # SentenceTransformer model (not an OpenAI-style name).
                if self.embedding_model_name and not any(
                    x in self.embedding_model_name.lower()
                    for x in ["gpt", "text-embedding", "openai"]
                ):
                    logger.info(f"Loading embedding model: {self.embedding_model_name}")
                    self._embedding_model = SentenceTransformer(self.embedding_model_name)
                else:
                    logger.info("Using dummy embedding model for tokenization only")
                    self._embedding_model = self._DummyEmbedder()
            except Exception as e:
                logger.error(f"Error loading embedding model: {e}")
                self._embedding_model = self._DummyEmbedder()
        return self._embedding_model

    def _get_chunker(self, strategy: str) -> BaseChunker:
        """Get or create the cached chunker for *strategy* (case-insensitive).

        Raises:
            ValueError: If the strategy is unknown.
        """
        strategy = strategy.lower()

        if strategy not in self._chunkers:
            if strategy == ChunkingStrategy.PAGE:
                self._chunkers[strategy] = PageChunker(
                    model_name=self.token_model_name,
                    embedding_model=self.embedding_model
                )
            elif strategy == ChunkingStrategy.PARAGRAPH:
                self._chunkers[strategy] = ParagraphChunker(
                    model_name=self.token_model_name,
                    embedding_model=self.embedding_model
                )
            elif strategy == ChunkingStrategy.SEMANTIC:
                self._chunkers[strategy] = SemanticChunker(
                    embedding_model=self.embedding_model,
                    model_name=self.token_model_name
                )
            elif strategy == ChunkingStrategy.HIERARCHICAL:
                self._chunkers[strategy] = HierarchicalChunker(
                    model_name=self.token_model_name,
                    embedding_model=self.embedding_model
                )
            elif strategy == ChunkingStrategy.TOKEN:
                self._chunkers[strategy] = TokenChunker(
                    model_name=self.token_model_name,
                    embedding_model=self.embedding_model,
                    chunk_size=256,  # Default values, could be made configurable
                    chunk_overlap=50
                )
            else:
                raise ValueError(f"Unknown chunking strategy: {strategy}")

        return self._chunkers[strategy]

    def process_document(
        self,
        file_path: str,
        strategy: str = ChunkingStrategy.PARAGRAPH,
        preprocess: bool = True
    ) -> Union[List[Document], Dict[str, List[Document]]]:
        """
        Process document using specified chunking strategy.

        Args:
            file_path: Path to document file
            strategy: Chunking strategy to use
            preprocess: Whether to preprocess text

        Returns:
            Chunked document(s) according to strategy

        Raises:
            FileNotFoundError: If file_path does not exist.
            ValueError: For unsupported file types or strategies.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Dispatch on file type.
        file_extension = path.suffix.lower()

        if file_extension == '.csv':
            return self._process_csv(file_path, strategy)
        elif file_extension == '.txt':
            return self._process_txt(file_path, strategy, preprocess)
        elif file_extension == '.pdf':
            chunker = self._get_chunker(strategy)

            logger.info(f"Processing document using {strategy} chunking strategy")

            # Each chunker exposes a strategy-specific entry point.
            if strategy == ChunkingStrategy.PAGE:
                return chunker.page_process_document(file_path, preprocess)
            elif strategy == ChunkingStrategy.PARAGRAPH:
                return chunker.paragraph_process_document(file_path, preprocess)
            elif strategy == ChunkingStrategy.SEMANTIC:
                return chunker.semantic_process_document(file_path, preprocess)
            elif strategy == ChunkingStrategy.HIERARCHICAL:
                return chunker.hierarchical_process_document(file_path, preprocess)
            elif strategy == ChunkingStrategy.TOKEN:
                return chunker.token_process_document(file_path, preprocess)
            else:
                raise ValueError(f"Unknown chunking strategy: {strategy}")
        else:
            raise ValueError(f"Unsupported file type: {file_extension}. Supported types: .pdf, .csv, .txt")

    def process_directory(
        self,
        dir_path: str,
        strategy: str = ChunkingStrategy.PARAGRAPH,
        preprocess: bool = True
    ) -> Dict[str, Union[List[Document], Dict[str, List[Document]]]]:
        """
        Process all supported documents in a directory (recursively).

        Per-file failures are captured as {"error": message} entries instead
        of aborting the whole run.

        Args:
            dir_path: Directory containing files
            strategy: Chunking strategy to use
            preprocess: Whether to preprocess text

        Returns:
            Dictionary mapping filenames to their processed documents

        Raises:
            NotADirectoryError: If dir_path is not a directory.
        """
        path = Path(dir_path)
        if not path.is_dir():
            raise NotADirectoryError(f"Not a directory: {dir_path}")

        results = {}

        # Find supported files (PDFs, CSVs, and TXT files)
        pdf_files = list(path.glob("**/*.pdf"))
        csv_files = list(path.glob("**/*.csv"))
        txt_files = list(path.glob("**/*.txt"))
        all_files = pdf_files + csv_files + txt_files

        logger.info(f"Found {len(pdf_files)} PDF files, {len(csv_files)} CSV files, and {len(txt_files)} TXT files in {dir_path}")

        for file in all_files:
            try:
                logger.info(f"Processing {file.name}")
                results[file.name] = self.process_document(
                    str(file),
                    strategy=strategy,
                    preprocess=preprocess
                )
            except Exception as e:
                logger.error(f"Error processing {file.name}: {e}")
                results[file.name] = {"error": str(e)}

        return results

    # BUGFIX: this method was defined twice verbatim in the original; the
    # duplicate silently shadowed the first definition and has been removed.
    def _process_txt(self, file_path: str, strategy: str, preprocess: bool) -> List[Document]:
        """Process a TXT file into document chunks.

        Raises:
            ValueError: If strategy is not paragraph or token.
        """
        logger.info(f"Processing TXT file: {file_path}")

        # Only chunkers that implement process_text_file support TXT input.
        if strategy not in [ChunkingStrategy.PARAGRAPH, ChunkingStrategy.TOKEN]:
            raise ValueError(f"TXT files only support paragraph and token chunking strategies. Got: {strategy}")

        chunker = self._get_chunker(strategy)
        return chunker.process_text_file(file_path, preprocess)

    def _process_csv(self, file_path: str, strategy: str) -> Union[List[Document], Dict[str, List[Document]]]:
        """Process a CSV file into document chunks.

        BUGFIX: return annotation widened — the hierarchical branch returns a
        dict, which the original `List[Document]` annotation misrepresented.

        Raises:
            ValueError: If strategy is unsupported for CSV input.
        """
        import pandas as pd

        logger.info(f"Loading CSV file: {file_path}")

        df = pd.read_csv(file_path)

        if strategy == ChunkingStrategy.PARAGRAPH:
            # Treat each row as a separate document with columns combined
            # into a structured text format.
            return self._chunk_csv_by_row(df, file_path)
        elif strategy == ChunkingStrategy.PAGE:
            # Create larger chunks containing multiple rows.
            return self._chunk_csv_by_page(df, file_path)
        elif strategy == ChunkingStrategy.HIERARCHICAL:
            # Hierarchical output mirrors the PDF shape: a dict of chunks.
            return {"chunks": self._chunk_csv_by_row(df, file_path)}
        else:
            raise ValueError(f"Unsupported chunking strategy for CSV: {strategy}")

    def _chunk_csv_by_row(self, df, file_path: str) -> List[Document]:
        """Convert each CSV row to one Document chunk.

        Each column value also lands in metadata under a `csv_<col>` key
        (stringified for vector-store compatibility).
        """
        chunks = []
        file_name = Path(file_path).name
        columns = df.columns.tolist()

        for i, row in df.iterrows():
            # Render the row as "column: value" lines.
            content = "\n".join([f"{col}: {row[col]}" for col in columns])

            metadata = {
                "source": file_path,
                "file_name": file_name,
                "file_type": "csv",
                "row_index": i,
                "chunk_type": "csv_row",
            }

            # Add columns as additional metadata (stringified).
            for col in columns:
                metadata[f"csv_{col}"] = str(row[col])

            chunks.append(Document(page_content=content, metadata=metadata))

        logger.info(f"Created {len(chunks)} chunks from CSV (row-based)")
        return chunks

    def _chunk_csv_by_page(self, df, file_path: str, rows_per_chunk: int = 20) -> List[Document]:
        """Convert the CSV into larger "page" chunks of *rows_per_chunk* rows,
        rendered as a pipe-separated table with a header line."""
        chunks = []
        file_name = Path(file_path).name
        columns = df.columns.tolist()

        total_rows = len(df)
        chunk_count = (total_rows + rows_per_chunk - 1) // rows_per_chunk  # Ceiling division

        for chunk_idx in range(chunk_count):
            start_row = chunk_idx * rows_per_chunk
            end_row = min(start_row + rows_per_chunk, total_rows)

            chunk_df = df.iloc[start_row:end_row]

            # Header block (1-based row numbers for human readability).
            content = f"CSV Data (Rows {start_row+1}-{end_row}):\n\n"
            content += " | ".join(columns) + "\n"
            content += "-" * (sum(len(col) for col in columns) + 3 * (len(columns) - 1)) + "\n"

            # Data rows.
            for _, row in chunk_df.iterrows():
                content += " | ".join(str(row[col]) for col in columns) + "\n"

            metadata = {
                "source": file_path,
                "file_name": file_name,
                "file_type": "csv",
                "chunk_type": "csv_page",
                "start_row": start_row,
                "end_row": end_row - 1,
                "row_count": end_row - start_row,
            }

            chunks.append(Document(page_content=content, metadata=metadata))

        logger.info(f"Created {len(chunks)} chunks from CSV (page-based)")
        return chunks
src/core/HierarchicalChunker.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HierarchicalChunker.py
3
+
4
+ A module for hierarchical document chunking that combines page-level and semantic chunking.
5
+
6
+ Features:
7
+ - Multi-level document representation (pages and chunks)
8
+ - Semantic chunking with sentence boundaries
9
+ - Size and overlap controls
10
+ - Hierarchical metadata
11
+ """
12
+
13
+ import logging
14
+ import spacy
15
+ from typing import Dict, List, Optional, Any
16
+ from langchain_core.documents import Document
17
+ from core.PageChunker import PageChunker
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
class HierarchicalChunker(PageChunker):
    """Handles document chunking at multiple hierarchical levels.

    Produces a two-level representation: page-level Documents (from
    PageChunker) plus sentence-aligned semantic chunks within each page.
    """

    def __init__(
        self,
        model_name: Optional[str] = None,
        embedding_model: Optional[Any] = None,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        similarity_threshold: float = 0.85
    ):
        """
        Initialize hierarchical chunker with specified models and parameters.

        Args:
            model_name: Name of the model for tokenization
            embedding_model: Model for generating embeddings
            chunk_size: Maximum size (characters) of semantic chunks
            chunk_overlap: Overlap between chunks
            similarity_threshold: Similarity threshold for merging chunks
        """
        super().__init__(model_name, embedding_model)
        self.chunk_size = chunk_size
        # NOTE(review): chunk_overlap and similarity_threshold are stored but
        # never referenced by the chunking logic below — confirm intended use.
        self.chunk_overlap = chunk_overlap
        self.similarity_threshold = similarity_threshold

        # Initialize spaCy for sentence segmentation, downloading the model
        # on first use if it is missing.
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            logger.info("Installing spaCy model...")
            import subprocess
            subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"],
                           capture_output=True)
            self.nlp = spacy.load("en_core_web_sm")

    def _build_chunk_doc(self, chunk_text: str, page_number: int, chunk_num: int) -> Document:
        """Create one chunk-level Document with hierarchical metadata.

        Extracted helper: the original duplicated this 15-line construction
        verbatim in both flush paths of _create_semantic_chunks.
        """
        stats = self.analyze_text(chunk_text)
        return Document(
            page_content=chunk_text,
            metadata={
                "level": "chunk",
                "page_num": page_number,
                "chunk_num": chunk_num,
                "parent_page": page_number,
                "char_count": stats["char_count"],
                "token_count": stats["token_count"],
                "sentence_count": stats["sentence_count"],
                "word_count": stats["word_count"],
                # Preserved from the original: falls back to the STRING "true"
                # (not a bool) when analyze_text omits has_content — callers
                # may depend on this; confirm before normalizing.
                "has_ocr": stats.get("has_content", "true")
            }
        )

    def _create_semantic_chunks(self, content: str, page_number: int) -> List[Document]:
        """
        Create semantic chunks with detailed metadata.

        Sentences are accumulated until adding the next one would exceed
        chunk_size characters, at which point the current chunk is flushed.

        Args:
            content: The page content to chunk
            page_number: The page number

        Returns:
            List of Document objects representing semantic chunks
        """
        if not content.strip():
            return []

        sentences = list(self.nlp(content).sents)
        chunks = []
        current_chunk = []
        current_length = 0

        for sent in sentences:
            sent_text = sent.text.strip()
            sent_length = len(sent_text)

            if current_length + sent_length > self.chunk_size:
                # Flush the accumulated chunk and start a new one.
                if current_chunk:
                    chunks.append(self._build_chunk_doc(
                        " ".join(current_chunk), page_number, len(chunks) + 1))
                current_chunk = [sent_text]
                current_length = sent_length
            else:
                current_chunk.append(sent_text)
                current_length += sent_length

        # Handle final chunk
        if current_chunk:
            chunks.append(self._build_chunk_doc(
                " ".join(current_chunk), page_number, len(chunks) + 1))

        self.page_stats.append(f"Created {len(chunks)} chunks for page {page_number}")
        return chunks

    def hierarchical_process_document(self, file_path: str, preprocess: bool = True) -> Dict[str, List[Document]]:
        """
        Process document with hierarchical chunking strategy.

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess text

        Returns:
            Dictionary with 'pages' and 'chunks' lists of Documents
        """
        self.page_stats = []  # Reset stats

        # First get the page-level documents using PageChunker.
        page_docs = super().page_process_document(file_path, preprocess)

        # Now create chunk-level documents.
        chunk_docs = []
        total_chunks = 0

        for page_doc in page_docs:
            page_num = page_doc.metadata["page"]

            # Mark this as a page-level document.
            page_doc.metadata["level"] = "page"

            # Create chunks for this page.
            page_chunks = self._create_semantic_chunks(
                page_doc.page_content,
                page_num
            )

            chunk_docs.extend(page_chunks)
            total_chunks += len(page_chunks)

        # Log summary information.
        logger.info(f"\nHierarchical Processing Summary:")
        logger.info(f"Total Pages: {len(page_docs)}")
        logger.info(f"Total Chunks: {total_chunks}")
        logger.info("\n".join(self.page_stats))

        return {
            "pages": page_docs,
            "chunks": chunk_docs
        }

    def process_document(self, file_path: str, preprocess: bool = True) -> Dict[str, List[Document]]:
        """
        Process document using hierarchical chunking strategy (implements abstract method).

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess text

        Returns:
            Dictionary with 'pages' and 'chunks' lists of Documents
        """
        return self.hierarchical_process_document(file_path, preprocess)
src/core/OCREnhancedPDFLoader.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pytesseract
3
+ from langchain_community.document_loaders import PyMuPDFLoader
4
+ from langchain_core.documents import Document
5
+ from pdf2image import convert_from_path
6
+
7
class OCREnhancedPDFLoader:
    """Loads PDFs with OCR support for text extraction.

    Uses PyMuPDF for native text extraction and falls back to Tesseract
    OCR (via a page image rendered by pdf2image) when a page has little
    or no embedded text.  Blank pages are skipped and recorded in
    ``self.skipped_pages``.
    """

    # Minimum characters (after whitespace stripping) for a page to be
    # considered non-blank.
    BLANK_THRESHOLD = 10

    def __init__(self, file_path: str, tesseract_path: str = None):
        """
        Args:
            file_path: Path to the PDF file to load.
            tesseract_path: Optional explicit path to the tesseract
                executable.  When omitted, the binary is resolved from
                the system PATH.

        Raises:
            FileNotFoundError: If the PDF does not exist.
            ValueError: If an explicit tesseract path is given but no
                file exists there.
        """
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"PDF file not found at path: {file_path}")

        self.file_path = file_path
        self.skipped_pages = []

        # Only set cmd if specific path provided, otherwise trust PATH.
        if tesseract_path:
            if not os.path.isfile(tesseract_path):
                raise ValueError(f"Tesseract executable not found at path: {tesseract_path}")
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

    def _is_blank_page(self, text: str) -> bool:
        """Return True if the page text is empty or below BLANK_THRESHOLD
        meaningful characters once whitespace control chars are removed."""
        if not text or not text.strip():
            return True
        cleaned_text = text.strip().replace('\n', '').replace('\r', '').replace('\t', '')
        return len(cleaned_text) < self.BLANK_THRESHOLD

    def _process_page(self, doc, img, page_number: int):
        """Build a Document for one page, OCR-ing only when needed.

        Args:
            doc: The page Document produced by PyMuPDFLoader.
            img: The rendered page image (PIL) for OCR fallback.
            page_number: 1-based page number.

        Returns:
            A Document with enriched metadata, or None if the page is
            blank (the page number is then recorded in skipped_pages).
        """
        existing_text = doc.page_content

        # Use existing text if substantial; the factor of 5 means a page
        # needs a few words of embedded text before we trust it over OCR.
        if len(existing_text.strip()) > self.BLANK_THRESHOLD * 5:
            combined_text = existing_text
            ocr_used = False
        else:
            # Fallback to OCR
            try:
                ocr_text = pytesseract.image_to_string(img)
                combined_text = ocr_text
                ocr_used = True
            except Exception as e:
                # Best effort: keep whatever embedded text we had.
                print(f"Error applying OCR to page {page_number}: {e}")
                combined_text = existing_text
                ocr_used = False

        if self._is_blank_page(combined_text):
            self.skipped_pages.append(page_number)
            return None

        return Document(
            page_content=combined_text,
            metadata={
                **doc.metadata,
                "source": "ocr" if ocr_used else "text_extraction",
                "page": page_number,
                "is_blank": "false",
                "has_ocr": str(ocr_used)
            }
        )

    def load(self):
        """Load the PDF, returning a list of non-blank page Documents.

        Raises:
            Exception: Re-raises any loading/conversion failure after
                printing a diagnostic.
        """
        try:
            # 1. Standard load of embedded text.
            loader = PyMuPDFLoader(self.file_path)
            text_documents = loader.load()

            # 2. Image conversion for OCR fallback (requires poppler-utils
            #    on Linux).
            images = convert_from_path(self.file_path, dpi=300)

            # zip() truncates to the shorter sequence; warn instead of
            # silently dropping pages when the counts disagree.
            if len(text_documents) != len(images):
                print(f"Warning: page count mismatch (text pages: {len(text_documents)}, "
                      f"images: {len(images)}); trailing pages will be skipped")

            enhanced_documents = []
            for idx, (doc, img) in enumerate(zip(text_documents, images)):
                page_number = idx + 1
                enhanced_doc = self._process_page(doc, img, page_number)

                if enhanced_doc:
                    enhanced_documents.append(enhanced_doc)

            if self.skipped_pages:
                print(f"Skipped blank pages: {self.skipped_pages}")

            return enhanced_documents

        except Exception as e:
            print(f"Error in OCR-enhanced loading: {e}")
            raise
src/core/PageChunker.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PageChunker.py
3
+
4
+ A module for page-level document chunking with token counting and preprocessing.
5
+
6
+ Features:
7
+ - Page-based document splitting
8
+ - Content validation
9
+ - Blank page detection
10
+ - Document metadata enrichment
11
+ """
12
+
13
+ from typing import List, Optional
14
+ import logging
15
+ from langchain_core.documents import Document
16
+ from core.BaseChunker import BaseChunker
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
class PageChunker(BaseChunker):
    """Splits a PDF into one Document per non-blank page."""

    def __init__(self, model_name=None, embedding_model=None):
        """
        Initialize page chunker with specified models.

        Args:
            model_name: Name of the model for tokenization
            embedding_model: Model for generating embeddings
        """
        super().__init__(model_name, embedding_model)
        self.page_stats = []

    def _is_blank_page(self, text: str) -> bool:
        """Check if page is blank or contains only whitespace/special characters."""
        stripped = text.strip()
        for control_char in ('\n', '\r', '\t'):
            stripped = stripped.replace(control_char, '')
        return len(stripped) < self.BLANK_THRESHOLD

    def _process_single_page(self, content: str, page_number: int, preprocess: bool) -> Optional[Document]:
        """
        Process a single page with optional preprocessing and analysis.

        Args:
            content: The page content
            page_number: The page number
            preprocess: Whether to preprocess the text

        Returns:
            Document object with processed content and metadata, or None if page is blank
        """
        # Blank pages are recorded and dropped.
        if self._is_blank_page(content):
            self.page_stats.append(f"Page {page_number} is blank.")
            return None

        text = self.preprocess_text(content) if preprocess else content

        # Analyze the page and attach the statistics as metadata.
        analysis = self.analyze_text(text)
        page_metadata = {
            "page": page_number,
            "char_count": analysis["char_count"],
            "token_count": analysis["token_count"],
            "sentence_count": analysis["sentence_count"],
            "word_count": analysis["word_count"],
            "has_ocr": str(analysis.get("has_content", True)),
            "is_blank": "false",
        }

        return Document(page_content=text, metadata=page_metadata)

    def page_process_document(self, file_path: str, preprocess: bool = False) -> List[Document]:
        """
        Process PDF document page by page with analysis and optional preprocessing.

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess page text

        Returns:
            List of Document objects, one per non-blank page
        """
        try:
            self.page_stats = []  # Fresh stats for every document
            pages = self.load_document(file_path)
            results = []

            logger.info(f"Processing document with {len(pages)} pages")

            for page_index, raw_page in enumerate(pages):
                document = self._process_single_page(raw_page.page_content, page_index + 1, preprocess)
                if document:
                    results.append(document)

            # Surface any skipped-page messages for transparency.
            if self.page_stats:
                logger.info("\n".join(self.page_stats))

            logger.info(f"Processed {len(results)} non-blank pages")
            return results

        except Exception as e:
            logger.error(f"Error in page_process_document: {e}")
            raise

    def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """
        Process document using page chunking strategy (implements abstract method).

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess page text

        Returns:
            List of Document objects, one per non-blank page
        """
        return self.page_process_document(file_path, preprocess)
src/core/ParagraphChunker.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ParagraphChunker.py
3
+
4
+ A module for paragraph-level document chunking with token counting and preprocessing.
5
+
6
+ Features:
7
+ - Paragraph-based document splitting
8
+ - Content validation
9
+ - Multi-level delimiter detection
10
+ - Smart paragraph boundary detection
11
+ """
12
+
13
+ import logging
14
+ import spacy
15
+ from typing import List, Optional
16
+ from pathlib import Path
17
+ from datetime import datetime
18
+ from langchain_core.documents import Document
19
+ from core.BaseChunker import BaseChunker
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
class ParagraphChunker(BaseChunker):
    """Handles document chunking at the paragraph level with token counting."""

    PARAGRAPH_MIN_LENGTH = 50  # Minimum characters for a valid paragraph

    def __init__(self, model_name=None, embedding_model=None):
        """
        Initialize paragraph chunker with specified models.

        Args:
            model_name: Name of the model for tokenization
            embedding_model: Model for generating embeddings
        """
        super().__init__(model_name, embedding_model)
        self.page_stats = []

        # Initialize spaCy for NLP tasks
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except Exception as e:
            logger.error(f"Error loading spaCy model: {e}")
            import subprocess
            import sys
            logger.info("Installing spaCy model...")
            # Use the running interpreter, not whatever "python" resolves
            # to on PATH (may be a different environment or missing).
            subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
                           capture_output=True)
            self.nlp = spacy.load("en_core_web_sm")

    def _split_into_paragraphs(self, text: str) -> List[str]:
        """
        Split text into paragraphs using length and punctuation heuristics.

        Args:
            text: The text content to split

        Returns:
            List of paragraphs
        """
        # Pre-clean the text
        text = text.replace('\r', '\n')

        # First, try double line breaks
        paragraphs = text.split('\n\n')

        # If that fails (PDF extraction issue), use sentence-based reconstruction
        if len(paragraphs) <= 3:
            # Use the module logger (not print) so output respects the
            # application's logging configuration.
            logger.info("PDF extraction flattened structure. Reconstructing from sentences...")

            # Use spaCy for sentence detection
            doc = self.nlp(text)
            paragraphs = []
            current_para = []
            current_length = 0

            for sent in doc.sents:
                sent_text = sent.text.strip()
                if not sent_text:
                    continue

                # Add sentence to current paragraph
                current_para.append(sent_text)
                current_length += len(sent_text)

                # Check if we should end the current paragraph
                should_end_paragraph = (
                    # Paragraph is getting long (300-600 chars is typical)
                    current_length > 300 and
                    # Current sentence ends with proper punctuation
                    sent_text.endswith(('.', '!', '?')) and
                    # We have substantial content
                    len(current_para) >= 2
                )

                if should_end_paragraph:
                    paragraphs.append(' '.join(current_para))
                    current_para = []
                    current_length = 0

            # Add the last paragraph
            if current_para:
                paragraphs.append(' '.join(current_para))

            logger.info(f"Reconstructed {len(paragraphs)} paragraphs using length heuristics")

        # Clean and filter paragraphs: collapse internal whitespace and
        # drop anything below the minimum length.
        cleaned_paragraphs = []
        for para in paragraphs:
            clean_para = ' '.join(para.split())
            if len(clean_para) >= self.PARAGRAPH_MIN_LENGTH:
                cleaned_paragraphs.append(clean_para)

        logger.info(f"Final paragraph count: {len(cleaned_paragraphs)}")
        return cleaned_paragraphs

    def _process_single_paragraph(self, content: str, page_number: int,
                                  para_number: int, preprocess: bool) -> Optional[Document]:
        """
        Process a single paragraph with analysis and metadata.

        Args:
            content: The paragraph content
            page_number: The page number
            para_number: The paragraph number
            preprocess: Whether to preprocess the text

        Returns:
            Document object with processed content and metadata, or None if paragraph is invalid
        """
        # First check character length
        if len(content.strip()) < self.PARAGRAPH_MIN_LENGTH:
            self.page_stats.append(f"Paragraph {para_number} on page {page_number} is too short.")
            return None

        # Optionally preprocess the text
        if preprocess:
            content = self.preprocess_text(content)

        # Analyze the paragraph and generate metadata
        stats = self.analyze_text(content)

        # Check token threshold
        if stats["token_count"] < self.TOKEN_THRESHOLD:
            self.page_stats.append(
                f"Paragraph {para_number} on page {page_number} dropped: "
                f"only {stats['token_count']} tokens"
            )
            return None

        metadata = {
            "page": page_number,
            "paragraph": para_number,
            "char_count": stats["char_count"],
            "token_count": stats["token_count"],
            "sentence_count": stats["sentence_count"],
            "word_count": stats["word_count"],
            "has_ocr": str(stats.get("has_content", True))
        }

        return Document(page_content=content, metadata=metadata)

    def paragraph_process_document(self, file_path: str, preprocess: bool = False) -> List[Document]:
        """
        Process PDF document paragraph by paragraph with analysis.

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess paragraph text

        Returns:
            List of Document objects, one per valid paragraph
        """
        try:
            self.page_stats = []  # Reset stats for this document
            raw_pages = self.load_document(file_path)
            processed_paragraphs = []

            logger.info(f"Processing document with {len(raw_pages)} pages")

            for page_idx, page in enumerate(raw_pages):
                paragraphs = self._split_into_paragraphs(page.page_content)
                logger.info(f"Page {page_idx+1}: Found {len(paragraphs)} paragraphs")

                for para_idx, paragraph in enumerate(paragraphs):
                    processed_para = self._process_single_paragraph(
                        paragraph,
                        page_idx + 1,
                        para_idx + 1,
                        preprocess
                    )
                    if processed_para:
                        processed_paragraphs.append(processed_para)

            # Output skipped paragraphs for transparency
            if self.page_stats:
                logger.info("\n".join(self.page_stats))

            logger.info(f"Processed {len(processed_paragraphs)} valid paragraphs")
            return processed_paragraphs

        except Exception as e:
            logger.error(f"Error in paragraph_process_document: {e}")
            raise

    def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """
        Process document using paragraph chunking strategy (implements abstract method).

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess paragraph text

        Returns:
            List of Document objects, one per valid paragraph
        """
        return self.paragraph_process_document(file_path, preprocess)

    def process_text_file(self, file_path: str, preprocess: bool = False) -> List[Document]:
        """
        Process text file directly, preserving paragraph structure.

        Args:
            file_path: Path to the text file
            preprocess: Whether to preprocess paragraph text

        Returns:
            List of Document objects, one per valid paragraph
        """
        try:
            # Load the text file directly
            content = self.load_text_file(file_path)

            # Clean the text using the same logic as PDF conversion
            content = self.clean_text_for_processing(content)

            # Split into paragraphs using double line breaks
            paragraphs = content.split('\n\n')

            logger.info(f"Found {len(paragraphs)} paragraphs in text file: {file_path}")

            processed_paragraphs = []
            file_name = Path(file_path).name

            for para_idx, paragraph in enumerate(paragraphs):
                paragraph = paragraph.strip()
                if paragraph:
                    processed_para = self._process_single_paragraph_from_text(
                        paragraph,
                        file_path,
                        file_name,
                        para_idx + 1,
                        preprocess
                    )
                    if processed_para:
                        processed_paragraphs.append(processed_para)

            logger.info(f"Processed {len(processed_paragraphs)} valid paragraphs from text file")
            return processed_paragraphs

        except Exception as e:
            logger.error(f"Error processing text file: {e}")
            raise

    def _process_single_paragraph_from_text(self, content: str, file_path: str,
                                            file_name: str, para_number: int,
                                            preprocess: bool) -> Optional[Document]:
        """
        Process a single paragraph from text file with analysis and metadata.

        Args:
            content: The paragraph content
            file_path: Full path to the source file
            file_name: Name of the source file
            para_number: The paragraph number
            preprocess: Whether to preprocess the text

        Returns:
            Document object with processed content and metadata, or None if paragraph is invalid
        """
        # First check character length
        if len(content.strip()) < self.PARAGRAPH_MIN_LENGTH:
            logger.debug(f"Paragraph {para_number} too short ({len(content)} chars), skipping")
            return None

        # Preprocess if requested
        if preprocess:
            content = self.preprocess_text(content, remove_headers_footers=False)

        # Analyze the paragraph
        analysis = self.analyze_text(content)

        # Validate content quality
        if not self.is_content_valid(content):
            logger.debug(f"Paragraph {para_number} failed content validation, skipping")
            return None

        # Create metadata
        metadata = {
            "source": file_path,
            "file_name": file_name,
            "file_type": "txt",
            "paragraph": para_number,
            "char_count": analysis["char_count"],
            "token_count": analysis["token_count"],
            "sentence_count": analysis["sentence_count"],
            "word_count": analysis["word_count"],
            "chunk_type": "paragraph",
            "processing_timestamp": datetime.now().isoformat(),
        }

        # Create and return document
        doc = Document(page_content=content, metadata=metadata)
        logger.debug(f"Created paragraph {para_number}: {analysis['char_count']} chars, {analysis['token_count']} tokens")

        return doc
src/core/SemanticChunker.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SemanticChunker.py
3
+ A module for semantic-aware text chunking using embeddings and similarity metrics.
4
+ """
5
+
6
+ import logging
7
+ from typing import List, Optional, Any
8
+ import numpy as np
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ from langchain_core.documents import Document
11
+ # FIXED IMPORT: Updated for LangChain v0.2+
12
+ from langchain_text_splitters import SpacyTextSplitter
13
+ from sentence_transformers import SentenceTransformer
14
+ from core.BaseChunker import BaseChunker
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
class SemanticChunker(BaseChunker):
    """Chunks text based on semantic similarity and size constraints"""

    def __init__(
        self,
        model_name: Optional[str] = None,
        embedding_model: Optional[Any] = None,
        chunk_size: int = 200,
        chunk_overlap: int = 0,
        similarity_threshold: float = 0.9,
        separator: str = " "
    ):
        """
        Initialize the semantic chunker with configurable parameters.

        Args:
            model_name: Name of the model for tokenization.
            embedding_model: Model for generating embeddings; a working
                SentenceTransformer is loaded if this is None or a dummy.
            chunk_size: Maximum characters per emitted chunk.
            chunk_overlap: Overlap passed to the initial text splitter.
            similarity_threshold: Cosine similarity required to merge
                adjacent base chunks (0..1).
            separator: Separator used by the initial text splitter.

        Raises:
            ValueError: If chunk_size or similarity_threshold is invalid.
        """
        # Validate parameters
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer.")
        if not (0 <= similarity_threshold <= 1):
            raise ValueError("similarity_threshold must be between 0 and 1.")

        # Initialize BaseChunker first
        super().__init__(model_name, embedding_model)

        # Set semantic chunking parameters
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.similarity_threshold = similarity_threshold
        self.separator = separator

        # Detect the known dummy embedder (a 384-dim all-zero vector) so a
        # real model can be substituted.
        is_dummy = False
        if embedding_model is not None:
            try:
                test_output = embedding_model.encode("test")
                if isinstance(test_output, list) and len(test_output) == 384 and all(x == 0.0 for x in test_output):
                    is_dummy = True
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt /
                # SystemExit are not swallowed; any encode failure simply
                # means "not the dummy".
                pass

        if embedding_model is None or is_dummy:
            try:
                self.sentence_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
                self.embedding_model = self.sentence_model
                logger.info("Initialized SentenceTransformer for semantic chunking")
            except Exception as e:
                logger.error(f"Error loading SentenceTransformer: {e}")

                # Last-resort stand-in so chunking degrades instead of crashing.
                class DummyEmbedder:
                    def encode(self, text, **kwargs):
                        return [0.0] * 384
                self.sentence_model = DummyEmbedder()
                self.embedding_model = self.sentence_model
        else:
            self.sentence_model = embedding_model
            logger.info("Using provided embedding model for semantic chunking")

        # Initialize text splitter for initial chunking
        self.text_splitter = SpacyTextSplitter(
            chunk_size=self.chunk_size - self.chunk_overlap,
            chunk_overlap=self.chunk_overlap,
            separator=self.separator
        )

    def _enforce_size_immediately(self, text: str) -> List[str]:
        """Greedily re-split text into word-boundary chunks of at most
        self.chunk_size characters (counting joining spaces)."""
        if not text.strip():
            return []

        chunks = []
        current_chunk = []
        words = text.split()

        for word in words:
            # len(current_chunk) accounts for the spaces that will join words.
            if sum(len(w) for w in current_chunk) + len(word) + len(current_chunk) <= self.chunk_size:
                current_chunk.append(word)
            else:
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                current_chunk = [word]

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

    def get_semantic_chunks(self, documents: List[Document]) -> List[Document]:
        """Group base chunks by embedding similarity, then enforce size.

        Returns the original documents unchanged if chunking fails.
        """
        if not documents:
            logger.warning("No documents provided for semantic chunking")
            return []

        try:
            base_chunks = self.text_splitter.split_documents(documents)
            logger.info(f"Initial splitting created {len(base_chunks)} base chunks")

            if not base_chunks:
                return []

            chunk_contents = [doc.page_content for doc in base_chunks]
            chunk_embeddings = self.sentence_model.encode(chunk_contents)

            grouped_chunks = []
            current_group = []
            current_embedding = None

            for i, base_chunk in enumerate(base_chunks):
                if not current_group:
                    current_group.append(base_chunk)
                    current_embedding = chunk_embeddings[i].reshape(1, -1)
                    continue

                # Similarity is measured against the first chunk of the
                # current group (its embedding anchors the group).
                similarity = cosine_similarity(current_embedding, chunk_embeddings[i].reshape(1, -1))[0][0]
                combined_content = " ".join([doc.page_content for doc in current_group] + [base_chunk.page_content])

                if similarity >= self.similarity_threshold and len(combined_content) <= self.chunk_size:
                    current_group.append(base_chunk)
                else:
                    grouped_chunks.extend(self._finalize_chunk_group(current_group))
                    current_group = [base_chunk]
                    current_embedding = chunk_embeddings[i].reshape(1, -1)

            if current_group:
                grouped_chunks.extend(self._finalize_chunk_group(current_group))

            logger.info(f"Created {len(grouped_chunks)} semantic chunks")
            return grouped_chunks

        except Exception as e:
            logger.error(f"Error in semantic chunking: {e}")
            return documents

    def _finalize_chunk_group(self, group: List[Document]) -> List[Document]:
        """Merge a group of Documents, re-split to size, and attach
        per-chunk analysis metadata (inherits metadata from the first
        Document in the group)."""
        if not group:
            return []

        processed_chunks = []
        content = " ".join([doc.page_content for doc in group])
        size_limited_chunks = self._enforce_size_immediately(content)

        base_metadata = group[0].metadata.copy()

        for i, chunk in enumerate(size_limited_chunks):
            stats = self.analyze_text(chunk)
            metadata = base_metadata.copy()
            metadata.update({
                "chunk_index": i + 1,
                "chunk_count": len(size_limited_chunks),
                "char_count": stats["char_count"],
                "token_count": stats["token_count"],
                "sentence_count": stats["sentence_count"],
                "word_count": stats["word_count"],
                "chunk_type": "semantic"
            })

            processed_chunks.append(Document(page_content=chunk, metadata=metadata))

        return processed_chunks

    def semantic_process_document(self, file_path: str, preprocess: bool = False) -> List[Document]:
        """Load a PDF, optionally preprocess each page, and return
        semantically grouped chunks.

        Raises:
            Exception: Re-raises any loading/processing failure after logging.
        """
        try:
            logger.info(f"Processing document with semantic chunking: {file_path}")

            raw_documents = self.load_document(file_path)

            processed_documents = []
            for doc in raw_documents:
                content = doc.page_content
                if preprocess:
                    content = self.preprocess_text(content)
                processed_documents.append(Document(
                    page_content=content,
                    metadata=doc.metadata
                ))

            documents = self.get_semantic_chunks(processed_documents)
            logger.info(f"Created {len(documents)} semantic chunks")

            return documents

        except Exception as e:
            logger.error(f"Error in semantic_process_document: {e}")
            raise

    def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """Entry point required by BaseChunker; delegates to
        semantic_process_document."""
        return self.semantic_process_document(file_path, preprocess)
src/core/TextPreprocessor.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from nltk.corpus import stopwords
3
+ from nltk.stem import WordNetLemmatizer
4
+ import re
5
+
6
class TextPreprocessor:
    """Cleans and normalizes extracted document text.

    Pipeline: header/footer removal, PDF-artifact stripping, case
    folding, punctuation removal, whitespace normalization, stopword
    removal, and lemmatization (NLTK).
    """

    def __init__(self):
        # Assign the logger BEFORE touching NLTK: the original order set
        # self.logger last, so a failure in stopwords.words() made the
        # except handler crash with AttributeError on self.logger.
        self.logger = logging.getLogger(__name__)
        try:
            self.stopwords = set(stopwords.words('english'))
            self.lemmatizer = WordNetLemmatizer()
        except Exception as e:
            self.logger.error(f"Failed to initialize NLTK resources: {e}")
            raise

    def standardize_case(self, text):
        """Lowercase the text."""
        return text.lower()

    def remove_punctuation(self, text):
        """Strip everything that is not a word character or whitespace."""
        return re.sub(r'[^\w\s]', '', text)

    def normalize_whitespace(self, text):
        """Collapse runs of whitespace to single spaces and trim ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def remove_stopwords(self, words):
        """Drop English stopwords from a word list."""
        return [word for word in words if word not in self.stopwords]

    def lemmatize_words(self, words):
        """Lemmatize each word with WordNet."""
        return [self.lemmatizer.lemmatize(word) for word in words]

    def remove_headers_and_footers(self, text, aggressive=False, pattern=None):
        """Remove likely header/footer lines from a page of text.

        Args:
            text: Page text.
            aggressive: If True (and not slide-like), trim the first and
                last two lines outright.
            pattern: Optional regex; matching lines are removed.

        Returns:
            Cleaned text; the original text on error or when removal
            would delete more than 30% of the lines.
        """
        try:
            if not text or not text.strip():
                return text

            lines = text.splitlines()
            if len(lines) <= 4:  # For very short text, don't remove anything
                return text

            # Store original lines for fallback
            original_lines = lines.copy()

            # Use different strategies based on document characteristics
            if self._appears_to_be_slide(lines):
                # Slide-friendly approach - only remove obvious headers/footers
                cleaned_lines = self._clean_slide_headers_footers(lines, pattern)
            elif aggressive:
                # Traditional document approach - remove first/last few lines
                num_lines = 2
                cleaned_lines = lines[num_lines:-num_lines]
            else:
                # Conservative approach - only remove based on patterns
                cleaned_lines = self._pattern_based_removal(lines, pattern)

            # If we removed too much (over 30% of content), revert to original
            if len(cleaned_lines) < len(lines) * 0.7:
                self.logger.warning("Header/footer removal eliminated too much content, reverting")
                cleaned_lines = original_lines

            # Additional heuristic: Remove single-word lines that might be page numbers
            cleaned_lines = [line for line in cleaned_lines
                             if not (len(line.strip().split()) == 1 and
                                     line.strip().isdigit())]

            # Join lines back into text
            return '\n'.join(cleaned_lines)

        except Exception as e:
            self.logger.error(f"Error removing headers/footers: {e}")
            return text  # Return original text on error

    def _appears_to_be_slide(self, lines):
        """Detect if the content appears to be from a slide/presentation."""
        # Characteristics of slides:
        # - Shorter overall text
        # - Fewer lines
        # - More bullet points
        # - Title followed by bullet points

        if len(lines) < 15:  # Short content
            return True

        # Check for bullet point patterns
        bullet_pattern = r'^\s*[•\-\*\>\◦\○\◆\◇\▪\▫\⚫\⚪\✓\✔\✕\✖\✗\✘]'
        bullet_lines = sum(1 for line in lines if re.match(bullet_pattern, line))

        # If more than 20% of lines are bullets, likely a slide
        if bullet_lines > len(lines) * 0.2:
            return True

        # If first non-empty line is short (likely a title) and followed by bullet points
        non_empty_lines = [line for line in lines if line.strip()]
        if non_empty_lines and len(non_empty_lines[0].strip()) < 60:
            # Check for bullet points in the following lines
            for line in non_empty_lines[1:4]:  # Check next few lines
                if re.match(bullet_pattern, line):
                    return True

        return False

    def _clean_slide_headers_footers(self, lines, pattern=None):
        """Clean headers/footers from slide-based content."""
        cleaned_lines = lines.copy()

        # For slides, we primarily rely on pattern matching rather than line position
        if pattern:
            cleaned_lines = [line for line in cleaned_lines
                             if not re.search(pattern, line)]

        # Common slide footer patterns to remove
        footer_patterns = [
            r'^\s*\d+\s*$',          # Standalone page number
            r'confidential',          # Confidentiality notices
            r'all rights reserved',
            r'proprietary',
            r'^\s*www\.',             # Website in footer
            r'^\s*https?://',         # URL in footer
            r'\bpage\s+\d+\b',        # "Page X" footer
            r'^\s*[©Ⓒ]\s*\d{4}'       # Copyright notice
        ]

        # Combine all patterns
        combined_pattern = '|'.join(f'({p})' for p in footer_patterns)

        # Filter out footer lines
        if combined_pattern:
            cleaned_lines = [line for line in cleaned_lines
                             if not re.search(combined_pattern, line, re.IGNORECASE)]

        return cleaned_lines

    def _pattern_based_removal(self, lines, pattern=None):
        """Remove headers/footers based only on patterns, not position."""
        if not pattern:
            # Default patterns for headers/footers
            patterns = [
                r'^\s*\d+\s*$',                     # Standalone page numbers
                r'^\s*page\s+\d+\s+of\s+\d+\s*$',   # Page X of Y
                r'^\s*[©Ⓒ]\s*\d{4}.*$',             # Copyright lines
                r'^\s*confidential\s*$',            # Confidentiality markers
                r'^\s*https?://.*$',                # URLs alone on a line
                r'^\s*www\..*$',                    # Website alone on a line
                r'^\s*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\s*$'  # Email addresses
            ]
            combined_pattern = '|'.join(f'({p})' for p in patterns)
        else:
            combined_pattern = pattern

        return [line for line in lines
                if not re.search(combined_pattern, line, re.IGNORECASE)]

    def remove_common_pdf_artifacts(self, text):
        """Strip form-field markers, PDF annotations, and object refs."""
        try:
            # Remove form field indicators
            text = re.sub(r'\[\s*\]\s*|\[\s*X\s*\]|\(\s*\)\s*|\(\s*X\s*\)', '', text)

            # Remove common PDF annotations
            text = re.sub(r'<<[^>]*>>', '', text)

            # Remove artifact markers often found in PDFs
            text = re.sub(r'obj\s*\d+\s*\d+\s*R', '', text)

            return text

        except Exception as e:
            self.logger.error(f"Error removing PDF artifacts: {e}")
            return text

    def preprocess(self, text, remove_headers_footers=True, aggressive_removal=False):
        """Run the full cleaning pipeline and return normalized text.

        Raises:
            Exception: Re-raises any pipeline failure after logging.
        """
        try:
            if remove_headers_footers:
                text = self.remove_headers_and_footers(text, aggressive=aggressive_removal)

            text = self.remove_common_pdf_artifacts(text)

            text = self.standardize_case(text)
            text = self.remove_punctuation(text)
            text = self.normalize_whitespace(text)

            words = text.split()
            words = self.remove_stopwords(words)
            words = self.lemmatize_words(words)

            return ' '.join(words)
        except Exception as e:
            self.logger.error(f"Error preprocessing text: {e}")
            raise
src/core/TokenChunker.py ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ TokenChunker.py
3
+
4
+ A module for token-based document chunking with configurable overlap and preprocessing.
5
+
6
+ Features:
7
+ - Token-based document splitting with overlap
8
+ - Content validation and token counting
9
+ - Smart boundary detection to preserve word integrity
10
+ - Compatible with multiple tokenizer types (tiktoken, transformers, basic)
11
+ """
12
+
13
+ import logging
14
+ import re
15
+ from typing import List, Optional, Dict, Any
16
+ from langchain_core.documents import Document
17
+ from core.BaseChunker import BaseChunker
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
class TokenChunker(BaseChunker):
    """Handles document chunking at the token level with configurable overlap."""

    def __init__(
        self,
        model_name=None,
        embedding_model=None,
        chunk_size: int = 256,
        chunk_overlap: int = 50,
        min_chunk_size: int = 50
    ):
        """
        Initialize token chunker with specified models and parameters.

        Args:
            model_name: Name of the model for tokenization
            embedding_model: Model for generating embeddings
            chunk_size: Maximum tokens per chunk (must be positive)
            chunk_overlap: Number of tokens to overlap between chunks
                (must be non-negative and less than chunk_size)
            min_chunk_size: Minimum tokens for a valid chunk (must be positive)

        Raises:
            ValueError: If any chunking parameter is out of range.
        """
        super().__init__(model_name, embedding_model)

        # Validate chunking parameters up front. A non-positive chunk_size or
        # a negative chunk_overlap previously slipped past validation and
        # caused _create_token_chunks to silently skip tokens (the window
        # start would jump past `end`), so fail loudly here instead.
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if chunk_overlap < 0:
            raise ValueError("chunk_overlap must be non-negative")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be less than chunk_size")
        if min_chunk_size <= 0:
            raise ValueError("min_chunk_size must be positive")

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size
        self.chunk_stats = []  # per-document, human-readable chunking notes

        logger.info(f"TokenChunker initialized: chunk_size={chunk_size}, overlap={chunk_overlap}, min_size={min_chunk_size}")
+
57
+ def _smart_tokenize(self, text: str) -> List[str]:
58
+ """
59
+ Tokenize text while preserving word boundaries for reconstruction.
60
+
61
+ Args:
62
+ text: The text content to tokenize
63
+
64
+ Returns:
65
+ List of tokens that can be cleanly rejoined
66
+ """
67
+ if not text.strip():
68
+ return []
69
+
70
+ try:
71
+ if self.uses_tiktoken:
72
+ # For tiktoken, we need a hybrid approach to preserve boundaries
73
+ return self._tiktoken_boundary_aware_split(text)
74
+
75
+ elif hasattr(self.tokenizer, 'tokenize'):
76
+ # For transformers tokenizers
77
+ tokens = self.tokenizer.tokenize(text)
78
+ return self._clean_subword_tokens(tokens)
79
+
80
+ else:
81
+ # Fallback to intelligent word splitting
82
+ return self._word_boundary_split(text)
83
+
84
+ except Exception as e:
85
+ logger.warning(f"Tokenization failed: {e}. Using word boundary fallback.")
86
+ return self._word_boundary_split(text)
87
+
88
+ def _tiktoken_boundary_aware_split(self, text: str) -> List[str]:
89
+ """
90
+ Split text in a way that's compatible with tiktoken while preserving boundaries.
91
+
92
+ Args:
93
+ text: Input text
94
+
95
+ Returns:
96
+ List of text segments that approximate tokens
97
+ """
98
+ # Get actual token count for validation
99
+ target_token_count = self.count_tokens(text)
100
+
101
+ # Split on natural boundaries (spaces, punctuation)
102
+ words = re.findall(r'\S+|\s+', text)
103
+
104
+ # If we have roughly the right number of words, use them
105
+ if abs(len(words) - target_token_count) / max(target_token_count, 1) < 0.3:
106
+ return [w for w in words if w.strip()]
107
+
108
+ # Otherwise, use a more granular split
109
+ segments = re.findall(r'\w+|[^\w\s]|\s+', text)
110
+ return [s for s in segments if s.strip()]
111
+
112
+ def _clean_subword_tokens(self, tokens: List[str]) -> List[str]:
113
+ """
114
+ Clean subword tokens for better reconstruction.
115
+
116
+ Args:
117
+ tokens: Raw tokens from tokenizer
118
+
119
+ Returns:
120
+ Cleaned tokens
121
+ """
122
+ cleaned = []
123
+ for token in tokens:
124
+ # Remove special tokens but keep the content
125
+ if token.startswith('##'):
126
+ # BERT-style subwords
127
+ cleaned.append(token[2:])
128
+ elif token.startswith('▁'):
129
+ # SentencePiece-style
130
+ cleaned.append(' ' + token[1:])
131
+ else:
132
+ cleaned.append(token)
133
+ return [t for t in cleaned if t.strip()]
134
+
135
+ def _word_boundary_split(self, text: str) -> List[str]:
136
+ """
137
+ Split text on word boundaries as fallback tokenization.
138
+
139
+ Args:
140
+ text: Input text
141
+
142
+ Returns:
143
+ List of words
144
+ """
145
+ # Split on whitespace but preserve some punctuation as separate tokens
146
+ tokens = re.findall(r'\w+|[.!?;,]', text)
147
+ return tokens
148
+
149
+ def _detokenize(self, tokens: List[str]) -> str:
150
+ """
151
+ Reconstruct text from tokens, handling different tokenizer types.
152
+
153
+ Args:
154
+ tokens: List of token strings
155
+
156
+ Returns:
157
+ Reconstructed text
158
+ """
159
+ if not tokens:
160
+ return ""
161
+
162
+ if self.uses_tiktoken or not hasattr(self.tokenizer, 'tokenize'):
163
+ # For tiktoken and basic tokenizers, use space joining with smart spacing
164
+ result = ""
165
+ for i, token in enumerate(tokens):
166
+ if not token.strip():
167
+ continue
168
+
169
+ if i == 0:
170
+ result = token
171
+ elif token in '.,!?;:':
172
+ result += token
173
+ elif result and result[-1] in '.,!?;:':
174
+ result += " " + token
175
+ else:
176
+ result += " " + token
177
+ return result
178
+
179
+ else:
180
+ # For transformers tokenizers, handle subword reconstruction
181
+ text = "".join(tokens)
182
+ # Clean up spacing around punctuation
183
+ text = re.sub(r'\s+([.!?;,])', r'\1', text)
184
+ text = re.sub(r'\s+', ' ', text)
185
+ return text.strip()
186
+
187
+ def _create_token_chunks(self, tokens: List[str]) -> List[List[str]]:
188
+ """
189
+ Split tokens into overlapping chunks of specified size.
190
+
191
+ Args:
192
+ tokens: List of token strings
193
+
194
+ Returns:
195
+ List of token chunks
196
+ """
197
+ if not tokens:
198
+ return []
199
+
200
+ chunks = []
201
+ start = 0
202
+
203
+ while start < len(tokens):
204
+ # Calculate end position for this chunk
205
+ end = min(start + self.chunk_size, len(tokens))
206
+
207
+ # Extract the chunk
208
+ chunk_tokens = tokens[start:end]
209
+
210
+ # Only add chunks that meet minimum size requirement
211
+ if len(chunk_tokens) >= self.min_chunk_size:
212
+ chunks.append(chunk_tokens)
213
+ self.chunk_stats.append(f"Created chunk with {len(chunk_tokens)} tokens")
214
+ else:
215
+ self.chunk_stats.append(f"Skipped small chunk with {len(chunk_tokens)} tokens")
216
+
217
+ # Break if we've reached the end
218
+ if end >= len(tokens):
219
+ break
220
+
221
+ # Calculate next start position with overlap
222
+ start = end - self.chunk_overlap
223
+
224
+ # Ensure forward progress
225
+ if start <= 0:
226
+ start = end
227
+
228
+ return chunks
229
+
230
+ def _process_single_chunk(self, chunk_tokens: List[str], chunk_index: int,
231
+ source_metadata: Dict[str, Any]) -> Optional[Document]:
232
+ """
233
+ Process a single token chunk into a Document with metadata.
234
+
235
+ Args:
236
+ chunk_tokens: List of tokens for this chunk
237
+ chunk_index: Index of this chunk in the document
238
+ source_metadata: Metadata from source document
239
+
240
+ Returns:
241
+ Document object with processed content and metadata, or None if invalid
242
+ """
243
+ # Reconstruct text from tokens
244
+ chunk_text = self._detokenize(chunk_tokens)
245
+
246
+ # Validate chunk content
247
+ if not self.is_content_valid(chunk_text, min_tokens=self.min_chunk_size):
248
+ self.chunk_stats.append(f"Chunk {chunk_index} failed validation")
249
+ return None
250
+
251
+ # Analyze the chunk content
252
+ stats = self.analyze_text(chunk_text)
253
+
254
+ # Create comprehensive metadata
255
+ metadata = source_metadata.copy()
256
+ metadata.update({
257
+ "chunk_index": chunk_index,
258
+ "chunk_type": "token",
259
+ "chunking_method": "token_based",
260
+ "token_count": len(chunk_tokens),
261
+ "char_count": stats["char_count"],
262
+ "sentence_count": stats["sentence_count"],
263
+ "word_count": stats["word_count"],
264
+ "chunk_size_limit": self.chunk_size,
265
+ "chunk_overlap": self.chunk_overlap
266
+ })
267
+
268
+ return Document(page_content=chunk_text, metadata=metadata)
269
+
270
    def token_process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """
        Process document using token-based chunking with overlap.

        Loads all pages, concatenates the valid ones into a single text
        blob, tokenizes the blob as a whole, and slices the token stream
        into overlapping chunks — so chunks may span page boundaries.

        Args:
            file_path: Path to the document file
            preprocess: Whether to preprocess text content

        Returns:
            List of Document objects, one per valid token chunk

        Raises:
            Exception: Re-raised after logging if any stage fails.
        """
        try:
            self.chunk_stats = []  # Reset stats for this document
            raw_pages = self.load_document(file_path)
            processed_chunks = []

            logger.info(f"Processing document with {len(raw_pages)} pages using token chunking")

            # Combine all pages into a single text for token-based processing
            full_text = ""
            combined_metadata = {}
            page_info = []  # Track which pages contributed to the text

            for page_idx, page in enumerate(raw_pages):
                content = page.page_content

                # Skip invalid content (blank pages / too little text)
                if not self.is_content_valid(content):
                    logger.debug(f"Skipping invalid content on page {page_idx + 1}")
                    continue

                # Preprocess if requested; re-validate because preprocessing
                # can strip a page down to nothing.
                if preprocess:
                    content = self.preprocess_text(content)
                    if not self.is_content_valid(content):
                        continue

                # Track page information for the combined metadata below
                page_info.append({
                    "page_number": page_idx + 1,
                    "original_metadata": page.metadata
                })

                # Combine text with page separation (blank line between pages)
                if full_text:
                    full_text += "\n\n" + content
                else:
                    full_text = content
                    # Use metadata from first valid page as base
                    combined_metadata = page.metadata.copy()

            # Update combined metadata to reflect all pages
            if page_info:
                combined_metadata.update({
                    "total_pages_processed": len(page_info),
                    "page_range": f"{page_info[0]['page_number']}-{page_info[-1]['page_number']}",
                    "source_pages": [str(p["page_number"]) for p in page_info]  # stringified for serialization-friendly metadata
                })
                # Remove the single "page" field since this represents multiple pages
                combined_metadata.pop("page", None)

            if not full_text.strip():
                logger.warning("No valid content found in document")
                return []

            # Tokenize the entire document
            all_tokens = self._smart_tokenize(full_text)
            logger.info(f"Document tokenized into {len(all_tokens)} tokens")

            if len(all_tokens) < self.min_chunk_size:
                logger.warning(f"Document too short for chunking ({len(all_tokens)} tokens)")
                return []

            # Create overlapping token chunks
            token_chunks = self._create_token_chunks(all_tokens)
            logger.info(f"Created {len(token_chunks)} token chunks")

            # Convert token chunks to Document objects; helpers return None
            # for chunks that fail validation.
            for chunk_idx, chunk_tokens in enumerate(token_chunks):
                chunk_doc = self._process_single_chunk(
                    chunk_tokens,
                    chunk_idx,
                    combined_metadata
                )
                if chunk_doc:
                    processed_chunks.append(chunk_doc)

            # Output processing statistics accumulated by the helpers
            if self.chunk_stats:
                logger.info("\n".join(self.chunk_stats))

            logger.info(f"Processed {len(processed_chunks)} valid token chunks")
            return processed_chunks

        except Exception as e:
            logger.error(f"Error in token_process_document: {e}")
            raise
367
+
368
+ def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
369
+ """
370
+ Process document using token chunking strategy (implements abstract method).
371
+
372
+ Args:
373
+ file_path: Path to the document file
374
+ preprocess: Whether to preprocess text content
375
+
376
+ Returns:
377
+ List of Document objects, one per valid token chunk
378
+ """
379
+ return self.token_process_document(file_path, preprocess)
380
+
381
    def process_text_file(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """
        Process text file directly using token-based chunking with overlap.

        Unlike :meth:`token_process_document`, this path skips the page
        loader and header/footer removal: the whole file is read, lightly
        cleaned, tokenized, and chunked in one pass.

        Args:
            file_path: Path to the text file
            preprocess: Whether to preprocess text content

        Returns:
            List of Document objects, one per valid token chunk

        Raises:
            Exception: Re-raised after logging if any stage fails.
        """
        try:
            # Local imports keep these out of module import time.
            from pathlib import Path
            from datetime import datetime

            self.chunk_stats = []  # Reset stats for this document

            # Load the text file directly
            content = self.load_text_file(file_path)

            # Clean the text using the same logic as PDF conversion
            content = self.clean_text_for_processing(content)

            # Basic validation (blank / too-short content)
            if not self.is_content_valid(content):
                logger.warning("Text file content failed validation")
                return []

            # Light preprocessing if requested (no header/footer removal for txt files)
            if preprocess:
                # Only apply basic text cleaning, not aggressive preprocessing
                content = ' '.join(content.split())  # Normalize whitespace

            # Create file-level metadata shared by every chunk
            file_path_obj = Path(file_path)
            file_metadata = {
                "source": file_path,
                "file_name": file_path_obj.name,
                "file_type": "txt",
                "total_characters": len(content),
                "processing_timestamp": datetime.now().isoformat(),
            }

            logger.info(f"Processing text file: {file_path_obj.name} ({len(content)} characters)")

            # Tokenize the entire document
            all_tokens = self._smart_tokenize(content)
            logger.info(f"Text file tokenized into {len(all_tokens)} tokens")

            if len(all_tokens) < self.min_chunk_size:
                logger.warning(f"Text file too short for chunking ({len(all_tokens)} tokens)")
                return []

            # Create overlapping token chunks
            token_chunks = self._create_token_chunks(all_tokens)
            logger.info(f"Created {len(token_chunks)} token chunks from text file")

            # Convert token chunks to Document objects; invalid chunks are
            # dropped (helper returns None).
            processed_chunks = []
            for chunk_idx, chunk_tokens in enumerate(token_chunks):
                chunk_doc = self._process_single_chunk(
                    chunk_tokens,
                    chunk_idx,
                    file_metadata
                )
                if chunk_doc:
                    processed_chunks.append(chunk_doc)

            # Output processing statistics accumulated by the helpers
            if self.chunk_stats:
                logger.info("\n".join(self.chunk_stats))

            logger.info(f"Processed {len(processed_chunks)} valid token chunks from text file")
            return processed_chunks

        except Exception as e:
            logger.error(f"Error processing text file: {e}")
            raise
+ raise
src/core/__init__.py ADDED
File without changes