Spaces:
Sleeping
Sleeping
"""
Document Reader for page-level document access.
"""
import json
import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
logger = logging.getLogger(__name__)


class DocumentReader:
    """Read documents that are stored as one plain-text file per page.

    Files are looked up inside ``pages_dir`` using these naming patterns:

    * ``<name>_TOC.txt`` - table of contents of a document
    * ``<name>_page_<NNN>.txt`` - one page, number zero-padded to 3 digits
    * ``document_index.json`` - optional JSON object keyed by document name

    For every lookup the requested document name is tried in several
    spellings (see ``_candidate_names``), and only files located inside
    ``pages_dir`` are ever opened.
    """

    _INDEX_FILENAME = "document_index.json"
    _TOC_SUFFIX = "_TOC"
    _PAGE_MARKER = "_page_"
    _PAGE_EXTENSION = ".txt"
    _KNOWN_SUFFIXES = ("UserGuide", "InstallationGuide", "QuickStartGuide")

    def __init__(self, pages_dir: Optional[Path] = None):
        """Initialize the document reader.

        Args:
            pages_dir: Directory holding the page/TOC files. When omitted
                (or falsy) the ``pages`` directory next to this module's
                parent package is used. Plain strings are accepted too.
        """
        if pages_dir:
            self.pages_dir = Path(pages_dir)
        else:
            self.pages_dir = Path(__file__).parent.parent / "pages"
        self.document_index = self._load_document_index()

    def _load_document_index(self) -> Dict:
        """Load ``document_index.json`` from the pages directory.

        Returns:
            The parsed index, or an empty dict when the file is missing,
            unreadable, not valid JSON, or not a JSON object.
        """
        index_path = self.pages_dir / self._INDEX_FILENAME
        if not index_path.exists():
            return {}
        try:
            with open(index_path, "r", encoding="utf-8") as handle:
                data = json.load(handle)
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            logger.error("Error loading document index: %s", exc)
            return {}
        if not isinstance(data, dict):
            # list_available_documents() relies on .keys(); refuse other shapes.
            logger.error(
                "Error loading document index: expected a JSON object, got %s",
                type(data).__name__,
            )
            return {}
        return data

    def _normalize_document_name(self, document_name: str) -> str:
        """Normalize a document name for consistent file matching.

        Surrounding whitespace is stripped, spaces and dots become
        underscores, and a recognised guide type written with underscores
        (``User_Guide``, ``Installation_Guide``, ``Quick_Start_Guide``) is
        collapsed into its CamelCase form. Only the first matching guide
        type is rewritten.
        """
        name = document_name.strip().replace(" ", "_").replace(".", "_")
        if name.endswith(self._KNOWN_SUFFIXES):
            return name

        lowered = name.lower()
        if "user" in lowered and "guide" in lowered:
            return name.replace("User_Guide", "UserGuide")
        if "installation" in lowered and "guide" in lowered:
            return name.replace("Installation_Guide", "InstallationGuide")
        if "quick" in lowered and "start" in lowered:
            return name.replace("Quick_Start_Guide", "QuickStartGuide")
        return name

    def _candidate_names(self, document_name: str) -> List[str]:
        """Return the spellings of ``document_name`` to try, in priority order.

        The normalized name comes first, then the raw name, the raw name
        with spaces replaced, and the raw name with dots replaced.
        Duplicates are dropped while keeping the order.
        """
        variants = [
            self._normalize_document_name(document_name),
            document_name,
            document_name.replace(" ", "_"),
            document_name.replace(".", "_"),
        ]
        return list(dict.fromkeys(variants))

    def _is_inside_pages_dir(self, path: Path) -> bool:
        """Tell whether ``path`` stays within ``pages_dir``.

        The comparison is purely lexical (``..`` segments are collapsed,
        symlinks are not followed), which is enough to reject names such as
        ``../secret`` or absolute paths.
        """
        base = os.path.normcase(os.path.abspath(str(self.pages_dir)))
        target = os.path.normcase(os.path.abspath(str(path)))
        try:
            return os.path.commonpath([base, target]) == base
        except ValueError:
            # Raised on Windows when the two paths live on different drives.
            return False

    def _find_file(self, document_name: str, suffix: str) -> Optional[Path]:
        """Locate ``<name><suffix>`` for the first candidate name that exists.

        Returns:
            The path of the first matching regular file inside
            ``pages_dir``, or ``None`` when no candidate matches.
        """
        for name in self._candidate_names(document_name):
            candidate = self.pages_dir / f"{name}{suffix}"
            if self._is_inside_pages_dir(candidate) and candidate.is_file():
                return candidate
        return None

    def _scan_page_numbers(self, name: str) -> List[int]:
        """Collect the page numbers stored on disk for one exact name.

        The directory is listed and filtered by plain prefix/suffix
        comparison instead of a glob pattern, so characters such as ``*``
        or ``[`` in ``name`` are matched literally.

        Returns:
            Sorted, de-duplicated page numbers; empty when nothing matches.
        """
        probe = self.pages_dir / f"{name}{self._PAGE_MARKER}"
        directory = probe.parent
        if not self._is_inside_pages_dir(directory) or not directory.is_dir():
            return []

        try:
            entries = list(directory.iterdir())
        except OSError as exc:
            logger.error("Error listing pages in %s: %s", directory, exc)
            return []

        # normcase keeps matching case-insensitive on Windows, like glob did.
        prefix = os.path.normcase(probe.name)
        extension = self._PAGE_EXTENSION
        numbers = set()
        for entry in entries:
            filename = os.path.normcase(entry.name)
            if not (filename.startswith(prefix) and filename.endswith(extension)):
                continue
            number_text = filename[len(prefix):-len(extension)]
            try:
                numbers.add(int(number_text))
            except ValueError:
                # Not a page file after all (e.g. "<name>_page_notes.txt").
                continue
        return sorted(numbers)

    def get_table_of_contents(self, document_name: str) -> Optional[str]:
        """Get the table of contents for a document.

        Returns:
            The TOC text, or ``None`` when no TOC file is found or it
            cannot be read (both cases are logged).
        """
        toc_path = self._find_file(
            document_name, f"{self._TOC_SUFFIX}{self._PAGE_EXTENSION}"
        )
        if toc_path is None:
            logger.warning("TOC file not found for document: %s", document_name)
            return None
        try:
            return toc_path.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as exc:
            logger.error("Error reading TOC file %s: %s", toc_path, exc)
            return None

    def read_pages(self, document_name: str, page_numbers: Optional[List[int]] = None) -> Union[str, Dict[int, str]]:
        """Read specific pages from a document.

        Args:
            document_name: Name of the document, in any supported spelling.
            page_numbers: Pages to read. When ``None`` the table of
                contents is returned instead.

        Returns:
            A formatted string. A single requested page is returned under a
            ``Page N of <doc>`` heading; several pages are returned sorted
            by number, each under a ``=== Page N ===`` marker. Missing or
            unreadable pages are reported inline rather than raised.
        """
        if page_numbers is None:
            toc = self.get_table_of_contents(document_name)
            if toc:
                return f"Table of Contents for {document_name}:\n\n{toc}"
            return f"Table of contents not found for document: {document_name}"

        pages_content: Dict[int, str] = {}
        for page_num in page_numbers:
            suffix = f"{self._PAGE_MARKER}{page_num:03d}{self._PAGE_EXTENSION}"
            page_path = self._find_file(document_name, suffix)
            if page_path is None:
                pages_content[page_num] = f"Page {page_num} not found"
                continue
            try:
                pages_content[page_num] = page_path.read_text(encoding="utf-8")
            except (OSError, UnicodeError) as exc:
                logger.error(
                    "Error reading page %s from %s: %s", page_num, document_name, exc
                )
                pages_content[page_num] = f"Error reading page {page_num}"

        if len(pages_content) == 1:
            (page_num, text), = pages_content.items()
            return f"Page {page_num} of {document_name}:\n\n{text}"

        formatted_pages = [
            f"=== Page {page_num} ===\n{pages_content[page_num]}"
            for page_num in sorted(pages_content)
        ]
        return f"Pages from {document_name}:\n\n" + "\n\n".join(formatted_pages)

    def list_available_documents(self) -> List[str]:
        """List all available documents.

        Returns:
            Sorted names taken from the ``*_TOC.txt`` files in ``pages_dir``
            plus the keys of the document index.
        """
        documents = set()
        for toc_file in self.pages_dir.glob(f"*{self._TOC_SUFFIX}{self._PAGE_EXTENSION}"):
            # Drop only the trailing "_TOC"; the name itself may contain it.
            documents.add(toc_file.stem[:-len(self._TOC_SUFFIX)])
        if self.document_index:
            documents.update(self.document_index.keys())
        return sorted(documents)

    def get_document_info(self, document_name: str) -> Dict[str, Any]:
        """Get information about a document (number of pages, etc.).

        Returns:
            A dict with the keys ``name``, ``normalized_name``, ``has_toc``,
            ``page_count`` and ``available_pages`` (sorted, without
            duplicates). The same name spellings as in ``read_pages`` and
            ``get_table_of_contents`` are tried, so the three agree.
        """
        toc_suffix = f"{self._TOC_SUFFIX}{self._PAGE_EXTENSION}"

        page_numbers: List[int] = []
        for name in self._candidate_names(document_name):
            page_numbers = self._scan_page_numbers(name)
            if page_numbers:
                break

        return {
            "name": document_name,
            "normalized_name": self._normalize_document_name(document_name),
            "has_toc": self._find_file(document_name, toc_suffix) is not None,
            "page_count": len(page_numbers),
            "available_pages": page_numbers,
        }