File size: 2,398 Bytes
"""
loader.py — Handles ZIP extraction and file loading.

Responsibilities:
- Extract uploaded ZIP archives
- Filter files by allowed extensions
- Read file contents safely
- Return a list of raw document dicts
"""
import zipfile
import os
import logging
from pathlib import Path
from typing import List, Dict
from config import ALLOWED_EXTENSIONS, MAX_FILE_SIZE_MB, UPLOAD_DIR
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def extract_zip(zip_path: str) -> Path:
    """
    Extract a ZIP archive to a unique subdirectory under UPLOAD_DIR.

    The directory name is the archive's stem plus a random suffix, so two
    uploads that share a filename never extract into the same directory
    (which would let files from an earlier upload show up in a later load).

    Args:
        zip_path: Path to the uploaded .zip file.

    Returns:
        Path to the newly created extraction directory.

    Raises:
        zipfile.BadZipFile: If zip_path is not a valid ZIP archive.
        OSError: If the archive cannot be opened, or the extraction
            directory cannot be created or written to.
    """
    import uuid  # local import: only this function needs it

    archive = Path(zip_path)

    # Open the archive first so an invalid ZIP fails before any directory
    # is created on disk.
    with zipfile.ZipFile(archive, "r") as zf:
        # The bare stem is not unique: re-uploading "project.zip" would reuse
        # the previous directory and mix old and new files together.
        extract_dir = UPLOAD_DIR / f"{archive.stem}_{uuid.uuid4().hex}"
        # exist_ok=False makes a (practically impossible) name collision
        # fail loudly instead of silently sharing a directory.
        extract_dir.mkdir(parents=True, exist_ok=False)

        # zipfile normalises member names during extraction (absolute paths
        # and ".." components are stripped), so members stay inside
        # extract_dir.
        # NOTE(review): no cap on total decompressed size is applied here.
        zf.extractall(extract_dir)

    logger.info("Extracted ZIP to: %s", extract_dir)
    return extract_dir
def load_files(extract_dir: Path) -> List[Dict]:
    """
    Walk the extraction directory and load allowed source files.

    Files whose extension is not in ALLOWED_EXTENSIONS, or whose size
    exceeds MAX_FILE_SIZE_MB, are skipped. A file that cannot be inspected
    or read is logged and skipped; it does not abort the rest of the load.

    Each returned dict contains:
        - content (str): raw file text, decoded as UTF-8 with undecodable
          bytes replaced
        - file_path (str): path relative to extract_dir
        - extension (str): lower-cased file suffix (as returned by
          Path.suffix, i.e. including the leading dot)

    Args:
        extract_dir: Directory containing extracted files.

    Returns:
        List of raw document dicts.
    """
    documents: List[Dict] = []
    max_bytes = MAX_FILE_SIZE_MB * 1024 * 1024

    for root, _dirs, files in os.walk(extract_dir):
        for filename in files:
            full_path = Path(root) / filename
            ext = full_path.suffix.lower()
            if ext not in ALLOWED_EXTENSIONS:
                continue

            try:
                # stat() lives inside the try: a file that disappears or is
                # unreadable must only skip this entry, not abort the walk.
                if full_path.stat().st_size > max_bytes:
                    logger.warning("Skipping large file: %s", full_path)
                    continue
                content = full_path.read_text(encoding="utf-8", errors="replace")
                relative_path = str(full_path.relative_to(extract_dir))
            except (OSError, ValueError) as e:
                # OSError: stat/read failures; ValueError: relative_to()
                # when the path is not under extract_dir.
                logger.warning("Failed to read %s: %s", full_path, e)
                continue

            documents.append({
                "content": content,
                "file_path": relative_path,
                "extension": ext,
            })

    logger.info("Loaded %s files from %s", len(documents), extract_dir)
    return documents