Spaces:

boy177
/

DevDocs

Running

manan75

initial commit

f9e2c6d about 2 months ago

2.4 kB

	"""
	loader.py — Handles ZIP extraction and file loading.

	Responsibilities:
	- Extract uploaded ZIP archives
	- Filter files by allowed extensions
	- Read file contents safely
	- Return a list of raw document dicts
	"""

	import zipfile
	import os
	import logging
	from pathlib import Path
	from typing import List, Dict

	from config import ALLOWED_EXTENSIONS, MAX_FILE_SIZE_MB, UPLOAD_DIR

	logger = logging.getLogger(__name__)


	def extract_zip(zip_path: str) -> Path:
	    """
	    Extract a ZIP archive to a unique subdirectory under UPLOAD_DIR.

	    A fresh directory is created for every call (named after the archive's
	    stem plus a random suffix), so two uploads that share a filename never
	    extract into the same location and cannot mix their contents.

	    Args:
	        zip_path: Path to the uploaded .zip file.

	    Returns:
	        Path to the newly created extraction directory.

	    Raises:
	        zipfile.BadZipFile: If the file is not a valid ZIP archive.
	        OSError: If the archive cannot be read or the directory cannot be
	            created or written.
	    """
	    # Function-scope import so this block does not depend on edits elsewhere.
	    import tempfile

	    archive = Path(zip_path)

	    # Path() accepts both str and Path, so either form of UPLOAD_DIR works.
	    upload_root = Path(UPLOAD_DIR)
	    upload_root.mkdir(parents=True, exist_ok=True)

	    # mkdtemp guarantees the directory did not exist before this call;
	    # reusing "<UPLOAD_DIR>/<stem>" would merge files from earlier uploads.
	    extract_dir = Path(tempfile.mkdtemp(prefix=f"{archive.stem}_", dir=upload_root))

	    # NOTE(review): no limit on total uncompressed size is enforced here.
	    with zipfile.ZipFile(archive, "r") as zf:
	        zf.extractall(extract_dir)

	    logger.info("Extracted ZIP to: %s", extract_dir)
	    return extract_dir


	def load_files(extract_dir: Path) -> List[Dict]:
	    """
	    Walk the extraction directory and load allowed source files.

	    Files are visited in sorted order so the result is deterministic.
	    Files whose extension is not in ALLOWED_EXTENSIONS, whose size exceeds
	    MAX_FILE_SIZE_MB, or which cannot be stat'ed or read are skipped; the
	    latter two cases are logged as warnings.

	    Each returned dict contains:
	    - content (str): raw file text (decoded as UTF-8, invalid bytes replaced)
	    - file_path (str): path relative to extract_dir
	    - extension (str): lower-cased file extension, including the dot

	    Args:
	        extract_dir: Directory containing extracted files.

	    Returns:
	        List of raw document dicts.
	    """
	    base_dir = Path(extract_dir)
	    max_bytes = MAX_FILE_SIZE_MB * 1024 * 1024
	    documents: List[Dict] = []

	    for root, dirs, files in os.walk(base_dir):
	        # Sorting in place makes os.walk descend in a stable order.
	        dirs.sort()

	        for filename in sorted(files):
	            full_path = Path(root) / filename
	            ext = full_path.suffix.lower()

	            if ext not in ALLOWED_EXTENSIONS:
	                continue

	            try:
	                # stat() is inside the try: a broken symlink or a file that
	                # vanished mid-walk must skip one entry, not abort the load.
	                if full_path.stat().st_size > max_bytes:
	                    logger.warning("Skipping large file: %s", full_path)
	                    continue

	                content = full_path.read_text(encoding="utf-8", errors="replace")
	            except OSError as e:
	                logger.warning("Failed to read %s: %s", full_path, e)
	                continue

	            documents.append({
	                "content": content,
	                "file_path": str(full_path.relative_to(base_dir)),
	                "extension": ext,
	            })

	    logger.info("Loaded %d files from %s", len(documents), extract_dir)
	    return documents