# DevDocs/ingestion/loader.py
# manan75 — initial commit (f9e2c6d)
"""
loader.py — Handles ZIP extraction and file loading.
Responsibilities:
- Extract uploaded ZIP archives
- Filter files by allowed extensions
- Read file contents safely
- Return a list of raw document dicts
"""
import logging
import os
import uuid
import zipfile
from pathlib import Path
from typing import Dict, List

from config import ALLOWED_EXTENSIONS, MAX_FILE_SIZE_MB, UPLOAD_DIR
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def extract_zip(zip_path: str) -> Path:
    """
    Extract a ZIP archive to a unique subdirectory under UPLOAD_DIR.

    The directory name is the archive's stem followed by a random suffix, so
    two uploads that share a filename never extract into the same directory
    (which would mix files from separate uploads).

    Args:
        zip_path: Path to the uploaded .zip file.

    Returns:
        Path to the newly created extraction directory.

    Raises:
        zipfile.BadZipFile: If the file is not a valid ZIP archive.
        OSError: If the archive cannot be opened or the extraction directory
            cannot be created or written to.
    """
    archive_path = Path(zip_path)

    # Open the archive first: an unreadable or invalid upload fails here,
    # before any directory has been created on disk.
    with zipfile.ZipFile(archive_path, "r") as zf:
        extract_dir = UPLOAD_DIR / f"{archive_path.stem}_{uuid.uuid4().hex}"
        # exist_ok=False: never silently reuse a directory that already exists.
        extract_dir.mkdir(parents=True, exist_ok=False)
        zf.extractall(extract_dir)

    logger.info("Extracted ZIP to: %s", extract_dir)
    return extract_dir
def load_files(extract_dir: Path) -> List[Dict]:
    """
    Walk the extraction directory and load allowed source files.

    Files are skipped when their extension is not in ALLOWED_EXTENSIONS or
    their size exceeds MAX_FILE_SIZE_MB. A file that cannot be inspected or
    read is logged and skipped; it never aborts the rest of the load.

    Each returned dict contains:
        - content (str): raw file text, decoded as UTF-8 with undecodable
          bytes replaced
        - file_path (str): path relative to extract_dir
        - extension (str): lower-cased file suffix (as given by Path.suffix)

    Args:
        extract_dir: Directory containing extracted files.

    Returns:
        List of raw document dicts, in a deterministic order (directories
        and the files within each directory are visited in sorted order).
    """
    documents: List[Dict] = []
    max_bytes = MAX_FILE_SIZE_MB * 1024 * 1024

    for root, dirs, files in os.walk(extract_dir):
        # os.walk yields entries in filesystem order; sorting the directory
        # list in place (top-down walk) and the file list makes the result
        # reproducible across runs and platforms.
        dirs.sort()
        for filename in sorted(files):
            full_path = Path(root) / filename
            ext = full_path.suffix.lower()
            if ext not in ALLOWED_EXTENSIONS:
                continue

            try:
                # stat() is inside the try so that a broken symlink or a file
                # removed mid-walk skips this one entry instead of raising.
                if full_path.stat().st_size > max_bytes:
                    logger.warning("Skipping large file: %s", full_path)
                    continue
                content = full_path.read_text(encoding="utf-8", errors="replace")
                relative_path = str(full_path.relative_to(extract_dir))
            except (OSError, ValueError) as e:
                logger.warning("Failed to read %s: %s", full_path, e)
                continue

            documents.append({
                "content": content,
                "file_path": relative_path,
                "extension": ext,
            })

    logger.info("Loaded %d files from %s", len(documents), extract_dir)
    return documents