# src/utils/data_collector.py from __future__ import annotations import json import re from dataclasses import dataclass, asdict from pathlib import Path from typing import Optional, List, Dict, Any # Ensure your project has these modules available: # - config.settings: must define RAW_PDFS_DIR (string path), etc. # - src.utils.logger: exposes get_logger() returning a loguru logger try: from config.settings import settings except Exception as e: raise RuntimeError( "Failed to import settings. Ensure config/settings.py exists and is importable. " "If config is at project root, run from project root and include both '.' and 'src' in PYTHONPATH." ) from e try: from src.utils.logger import get_logger except Exception as e: # Fallback simple logger if project logger not available import logging logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s") def get_logger(name: str): return logging.getLogger(name) logger = get_logger(__name__) @dataclass class ManualEntry: file_name: str file_path: Path brand: Optional[str] = None model: Optional[str] = None year: Optional[str] = None size_bytes: Optional[int] = None pages: Optional[int] = None valid: bool = False notes: Optional[str] = None class LocalManualCollector: """ Collector for locally available car manual PDFs. - Scans data/raw_pdfs/ - Infers metadata from filename (brand, model, year) - Optionally normalizes filenames to Brand_Model_Year.pdf - Performs minimal PDF integrity check - Writes a manifest JSON listing all files and metadata """ def __init__(self, input_dir: Optional[str] = None): self.input_dir: Path = Path(input_dir or settings.RAW_PDFS_DIR) # Manifest always written into RAW_PDFS_DIR self.manifest_path: Path = self.input_dir / "manuals_manifest.json" self._ensure_dirs() # Log resolved absolute paths to avoid confusion logger.info(f"RAW_PDFS_DIR resolved to: {self.input_dir.resolve()}") logger.info(f"Manifest target resolved to: {self.manifest_path.resolve()}") def _ensure_dirs(self) -> None: self.input_dir.mkdir(parents=True, exist_ok=True) def _infer_metadata_from_name(self, name: str) -> Dict[str, Optional[str]]: """ Try to infer brand, model, year from filename. Handles underscores/hyphens/spaces, is case-insensitive. Examples: - be_6_2025.pdf -> Brand: Be, Model: 6, Year: 2025 - HYUNDAI_model_2024.pdf-> Brand: Hyundai, Model: Model, Year: 2024 - Volkswagen_atlas_2024.pdf -> Brand: Volkswagen, Model: Atlas, Year: 2024 """ base = name.rsplit(".", 1)[0] cleaned = re.sub(r"[-_]+", " ", base.strip()) tokens = [t for t in cleaned.split() if t] # Find a 4-digit year token at the end or near the end year = None for t in reversed(tokens): if re.fullmatch(r"(19|20)\d{2}", t): year = t break brand: Optional[str] = None model: Optional[str] = None if tokens: if year and year in tokens: yi = tokens.index(year) # brand is first token; model is everything between brand and year if yi >= 2: brand = tokens[0].title() model = " ".join(tokens[1:yi]).title() elif yi == 1: brand = tokens.title() else: # Year is the first token; likely not intended, fall back brand = tokens.title() if len(tokens) > 1: model = " ".join(tokens[1:]).title() else: # No year found; brand = first token, model = rest if len(tokens) >= 2: brand = tokens.title() model = " ".join(tokens[1:]).title() else: brand = tokens.title() return {"brand": brand, "model": model, "year": year} def _quick_pdf_check(self, path: Path) -> Dict[str, Any]: """ Minimal integrity check: - size > 0 - try to read first page text if PyMuPDF available Notes: - If PyMuPDF is not installed, still mark valid if size_bytes > 0 """ size_bytes = path.stat().st_size if path.exists() else None pages: Optional[int] = None notes: Optional[str] = None valid = False if size_bytes and size_bytes > 0: try: import fitz # PyMuPDF with fitz.open(str(path)) as doc: pages = doc.page_count # Try reading the first page to ensure it opens if pages and pages > 0: try: _ = doc[0].get_text() # ignore content except Exception: pass valid = True else: valid = False except ImportError: # PyMuPDF not available; allow progression if non-empty notes = "PyMuPDF not installed; validated by non-empty file size." valid = True except Exception as e: notes = f"PDF open failed: {e}" valid = False else: notes = "Empty or missing file." return {"size_bytes": size_bytes, "pages": pages, "valid": valid, "notes": notes} def _safe_rename(self, src: Path, dst: Path) -> Path: """ Safely rename src to dst; if dst exists, append a numeric suffix. Returns the final destination path used. """ if src.resolve() == dst.resolve(): return dst candidate = dst stem = dst.stem suffix = dst.suffix parent = dst.parent i = 1 while candidate.exists(): candidate = parent / f"{stem}_{i}{suffix}" i += 1 src.rename(candidate) return candidate def _normalize_filename(self, entry: ManualEntry) -> ManualEntry: """ Normalize filename to Brand_Model_Year.pdf if brand, model, year inferred. Otherwise, keep the original filename. """ if entry.brand and entry.model and entry.year: normalized = f"{entry.brand}_{entry.model}_{entry.year}.pdf" normalized = re.sub(r'[\\/:*?"<>| ]', "_", normalized) target_path = entry.file_path.parent / normalized if target_path.name != entry.file_path.name: try: final_path = self._safe_rename(entry.file_path, target_path) logger.info(f"Renamed '{entry.file_name}' -> '{final_path.name}'") entry.file_name = final_path.name entry.file_path = final_path except Exception as e: logger.warning(f"Could not rename '{entry.file_name}': {e}") return entry def collect(self, normalize: bool = True) -> List[ManualEntry]: """ Collect local PDFs from RAW_PDFS_DIR, optionally normalize filenames, and write/update a manifest file for downstream pipeline stages. Always writes a manifest (even if zero PDFs are found). """ logger.info(f"Scanning directory for PDFs: {self.input_dir}") # Case-insensitive scanning: handle .pdf and .PDF pdf_files: List[Path] = sorted( list(self.input_dir.glob("*.pdf")) + list(self.input_dir.glob("*.PDF")) ) if not pdf_files: logger.warning("No PDF files found. Writing empty manifest for traceability.") entries: List[ManualEntry] = [] for pdf in pdf_files: entry = self._build_entry(pdf) if normalize: entry = self._normalize_filename(entry) entries.append(entry) # Write manifest (always) manifest = { "total_files": len(entries), "valid_files": sum(1 for e in entries if e.valid), "invalid_files": [e.file_name for e in entries if not e.valid], "items": [ { **asdict(e), "file_path": str(e.file_path), # make JSON-serializable } for e in entries ], } # Ensure directory exists and write self.input_dir.mkdir(parents=True, exist_ok=True) with open(self.manifest_path, "w", encoding="utf-8") as f: json.dump(manifest, f, indent=2, ensure_ascii=False) logger.info(f"Manifest written: {self.manifest_path.resolve()}") logger.info(f"Total PDFs: {manifest['total_files']}, Valid: {manifest['valid_files']}") if manifest["invalid_files"]: logger.warning(f"Invalid PDFs: {manifest['invalid_files']}") return entries def _build_entry(self, pdf_path: Path) -> ManualEntry: meta = self._infer_metadata_from_name(pdf_path.name) check = self._quick_pdf_check(pdf_path) entry = ManualEntry( file_name=pdf_path.name, file_path=pdf_path, brand=meta.get("brand"), model=meta.get("model"), year=meta.get("year"), size_bytes=check.get("size_bytes"), pages=check.get("pages"), valid=check.get("valid", False), notes=check.get("notes"), ) return entry if __name__ == "__main__": """ Run from the project root: - Ensure both '.' (project root) and 'src' are on PYTHONPATH. - Example (PowerShell): $env:PYTHONPATH=".;src" python -m src.utils.data_collector """ collector = LocalManualCollector() collector.collect(normalize=True)