| """ |
| loader.py — Handles ZIP extraction and file loading. |
| |
| Responsibilities: |
| - Extract uploaded ZIP archives |
| - Filter files by allowed extensions |
| - Read file contents safely |
| - Return a list of raw document dicts |
| """ |
|
|
| import zipfile |
| import os |
| import logging |
| from pathlib import Path |
| from typing import List, Dict |
|
|
| from config import ALLOWED_EXTENSIONS, MAX_FILE_SIZE_MB, UPLOAD_DIR |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
def extract_zip(zip_path: str) -> Path:
    """
    Extract a ZIP archive to a unique subdirectory under UPLOAD_DIR.

    The subdirectory is named after the archive's stem. If that name is
    already taken (for example by an earlier upload with the same file
    name), a numeric suffix (``_1``, ``_2``, ...) is appended so the new
    archive is never extracted on top of existing files.

    Args:
        zip_path: Path to the uploaded .zip file.

    Returns:
        Path to the extraction directory.

    Raises:
        zipfile.BadZipFile: If the file is not a valid ZIP archive.
        OSError: If the archive cannot be opened or the extraction
            directory cannot be created.
    """
    archive_path = Path(zip_path)
    # Coerce so the function also works when config stores UPLOAD_DIR as a str.
    upload_root = Path(UPLOAD_DIR)

    # Open the archive before touching the filesystem so an invalid ZIP
    # does not leave an empty extraction directory behind.
    with zipfile.ZipFile(archive_path, "r") as zf:
        upload_root.mkdir(parents=True, exist_ok=True)

        # Claim a fresh directory. mkdir(exist_ok=False) either creates the
        # directory or fails, so two extractions can never share a target.
        extract_dir = upload_root / archive_path.stem
        attempt = 0
        while True:
            try:
                extract_dir.mkdir(exist_ok=False)
                break
            except FileExistsError:
                attempt += 1
                extract_dir = upload_root / f"{archive_path.stem}_{attempt}"

        zf.extractall(extract_dir)

    logger.info("Extracted ZIP to: %s", extract_dir)
    return extract_dir
|
|
|
|
def load_files(extract_dir: Path) -> List[Dict]:
    """
    Walk the extraction directory and load allowed source files.

    Files whose lower-cased suffix is not in ALLOWED_EXTENSIONS are
    ignored. Files larger than MAX_FILE_SIZE_MB, and files that cannot be
    inspected or read, are skipped with a warning instead of aborting the
    whole load.

    Each returned dict contains:
    - content (str): raw file text, decoded as UTF-8 with undecodable
      bytes replaced
    - file_path (str): path relative to ``extract_dir``
    - extension (str): lower-cased file extension, including the dot

    Args:
        extract_dir: Directory containing extracted files.

    Returns:
        List of raw document dicts, ordered by directory and file name.
    """
    base_dir = Path(extract_dir)
    documents: List[Dict] = []
    max_bytes = MAX_FILE_SIZE_MB * 1024 * 1024

    for root, dirs, files in os.walk(base_dir):
        # Sorting in place steers os.walk's traversal, so the result order
        # does not depend on the filesystem's listing order.
        dirs.sort()
        for filename in sorted(files):
            full_path = Path(root) / filename
            ext = full_path.suffix.lower()

            if ext not in ALLOWED_EXTENSIONS:
                continue

            # stat() and read_text() can both fail (broken symlink,
            # permissions, file removed mid-walk); keep both guarded so a
            # single bad entry cannot abort the entire load.
            try:
                if full_path.stat().st_size > max_bytes:
                    logger.warning("Skipping large file: %s", full_path)
                    continue
                content = full_path.read_text(encoding="utf-8", errors="replace")
            except OSError as e:
                logger.warning("Failed to read %s: %s", full_path, e)
                continue

            documents.append({
                "content": content,
                "file_path": str(full_path.relative_to(base_dir)),
                "extension": ext,
            })

    logger.info("Loaded %d files from %s", len(documents), extract_dir)
    return documents