# NOTE: removed scrape artifacts ("Spaces:" / "Sleeping" x2) — web-UI residue, not part of this module.
# src/utils/data_collector.py
from __future__ import annotations

import json
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional, List, Dict, Any

# Project-local dependencies expected by this module:
# - config.settings: must define RAW_PDFS_DIR (string path), etc.
# - src.utils.logger: exposes get_logger() returning a loguru logger
try:
    from config.settings import settings
except Exception as e:
    raise RuntimeError(
        "Failed to import settings. Ensure config/settings.py exists and is importable. "
        "If config is at project root, run from project root and include both '.' and 'src' in PYTHONPATH."
    ) from e

try:
    from src.utils.logger import get_logger
except Exception as e:
    # Project logger unavailable: degrade gracefully to a stdlib logger
    # so the collector still works outside the full application context.
    import logging

    logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")

    def get_logger(name: str):
        return logging.getLogger(name)

logger = get_logger(__name__)
@dataclass
class ManualEntry:
    """Metadata record for one locally stored PDF manual.

    Instances are created by LocalManualCollector and serialized into the
    manifest JSON via dataclasses.asdict(), so this MUST be a dataclass.
    (Bug fix: the @dataclass decorator was missing, which made keyword
    construction and asdict() fail at runtime.)
    """

    file_name: str                      # current filename on disk (may change after normalization)
    file_path: Path                     # full path to the PDF
    brand: Optional[str] = None         # inferred from filename; None when not inferrable
    model: Optional[str] = None         # inferred from filename; None when not inferrable
    year: Optional[str] = None          # 4-digit year as a string, e.g. "2024"
    size_bytes: Optional[int] = None    # None when the file is missing
    pages: Optional[int] = None         # page count when PyMuPDF is available, else None
    valid: bool = False                 # True when the quick integrity check passed
    notes: Optional[str] = None         # diagnostic message for invalid/partial checks
class LocalManualCollector:
    """
    Collector for locally available car manual PDFs.
    - Scans data/raw_pdfs/
    - Infers metadata from filename (brand, model, year)
    - Optionally normalizes filenames to Brand_Model_Year.pdf
    - Performs minimal PDF integrity check
    - Writes a manifest JSON listing all files and metadata
    """

    def __init__(self, input_dir: Optional[str] = None):
        """Root the collector at *input_dir*; defaults to settings.RAW_PDFS_DIR."""
        self.input_dir: Path = Path(input_dir or settings.RAW_PDFS_DIR)
        # Manifest always written into RAW_PDFS_DIR
        self.manifest_path: Path = self.input_dir / "manuals_manifest.json"
        self._ensure_dirs()
        # Log resolved absolute paths to avoid confusion
        logger.info(f"RAW_PDFS_DIR resolved to: {self.input_dir.resolve()}")
        logger.info(f"Manifest target resolved to: {self.manifest_path.resolve()}")

    def _ensure_dirs(self) -> None:
        # Create the input directory up front so a fresh checkout works.
        self.input_dir.mkdir(parents=True, exist_ok=True)

    def _infer_metadata_from_name(self, name: str) -> Dict[str, Optional[str]]:
        """
        Try to infer brand, model, year from filename.
        Handles underscores/hyphens/spaces, is case-insensitive.
        Examples:
        - be_6_2025.pdf -> Brand: Be, Model: 6, Year: 2025
        - HYUNDAI_model_2024.pdf-> Brand: Hyundai, Model: Model, Year: 2024
        - Volkswagen_atlas_2024.pdf -> Brand: Volkswagen, Model: Atlas, Year: 2024
        """
        base = name.rsplit(".", 1)[0]
        # Treat runs of '-' / '_' as word separators before tokenizing.
        cleaned = re.sub(r"[-_]+", " ", base.strip())
        tokens = [t for t in cleaned.split() if t]
        # Find a 4-digit year token at the end or near the end
        year = None
        for t in reversed(tokens):
            if re.fullmatch(r"(19|20)\d{2}", t):
                year = t
                break
        brand: Optional[str] = None
        model: Optional[str] = None
        if tokens:
            if year and year in tokens:
                yi = tokens.index(year)
                # brand is first token; model is everything between brand and year
                if yi >= 2:
                    brand = tokens[0].title()
                    model = " ".join(tokens[1:yi]).title()
                elif yi == 1:
                    # Only the brand precedes the year (e.g. Toyota_2024.pdf).
                    # BUG FIX: was `tokens.title()` — AttributeError on a list.
                    brand = tokens[0].title()
                else:
                    # Year is the first token; likely not intended, fall back
                    # BUG FIX: was `tokens.title()` — AttributeError on a list.
                    brand = tokens[0].title()
                    if len(tokens) > 1:
                        model = " ".join(tokens[1:]).title()
            else:
                # No year found; brand = first token, model = rest
                # BUG FIX (both branches): was `tokens.title()` on the list.
                if len(tokens) >= 2:
                    brand = tokens[0].title()
                    model = " ".join(tokens[1:]).title()
                else:
                    brand = tokens[0].title()
        return {"brand": brand, "model": model, "year": year}

    def _quick_pdf_check(self, path: Path) -> Dict[str, Any]:
        """
        Minimal integrity check:
        - size > 0
        - try to read first page text if PyMuPDF available
        Notes:
        - If PyMuPDF is not installed, still mark valid if size_bytes > 0
        """
        size_bytes = path.stat().st_size if path.exists() else None
        pages: Optional[int] = None
        notes: Optional[str] = None
        valid = False
        if size_bytes and size_bytes > 0:
            try:
                import fitz  # PyMuPDF

                with fitz.open(str(path)) as doc:
                    pages = doc.page_count
                    # Try reading the first page to ensure it opens
                    if pages and pages > 0:
                        try:
                            _ = doc[0].get_text()  # ignore content
                        except Exception:
                            # Text extraction failure alone does not invalidate the file.
                            pass
                        valid = True
                    else:
                        valid = False
            except ImportError:
                # PyMuPDF not available; allow progression if non-empty
                notes = "PyMuPDF not installed; validated by non-empty file size."
                valid = True
            except Exception as e:
                notes = f"PDF open failed: {e}"
                valid = False
        else:
            notes = "Empty or missing file."
        return {"size_bytes": size_bytes, "pages": pages, "valid": valid, "notes": notes}

    def _safe_rename(self, src: Path, dst: Path) -> Path:
        """
        Safely rename src to dst; if dst exists, append a numeric suffix.
        Returns the final destination path used.
        """
        if src.resolve() == dst.resolve():
            return dst
        candidate = dst
        stem = dst.stem
        suffix = dst.suffix
        parent = dst.parent
        i = 1
        # Probe Brand_Model_Year_1.pdf, _2.pdf, ... until a free name is found.
        while candidate.exists():
            candidate = parent / f"{stem}_{i}{suffix}"
            i += 1
        src.rename(candidate)
        return candidate

    def _normalize_filename(self, entry: ManualEntry) -> ManualEntry:
        """
        Normalize filename to Brand_Model_Year.pdf if brand, model, year inferred.
        Otherwise, keep the original filename.
        """
        if entry.brand and entry.model and entry.year:
            normalized = f"{entry.brand}_{entry.model}_{entry.year}.pdf"
            # Replace characters unsafe on common filesystems (and spaces).
            normalized = re.sub(r'[\\/:*?"<>| ]', "_", normalized)
            target_path = entry.file_path.parent / normalized
            if target_path.name != entry.file_path.name:
                try:
                    final_path = self._safe_rename(entry.file_path, target_path)
                    logger.info(f"Renamed '{entry.file_name}' -> '{final_path.name}'")
                    entry.file_name = final_path.name
                    entry.file_path = final_path
                except Exception as e:
                    # Best-effort: a failed rename keeps the original entry intact.
                    logger.warning(f"Could not rename '{entry.file_name}': {e}")
        return entry

    def collect(self, normalize: bool = True) -> List[ManualEntry]:
        """
        Collect local PDFs from RAW_PDFS_DIR, optionally normalize filenames,
        and write/update a manifest file for downstream pipeline stages.
        Always writes a manifest (even if zero PDFs are found).
        """
        logger.info(f"Scanning directory for PDFs: {self.input_dir}")
        # Case-insensitive scanning: handle .pdf and .PDF.
        # BUG FIX: use a set union — on case-insensitive filesystems
        # (Windows/macOS) both globs match the same files, which previously
        # produced duplicate entries in the manifest.
        pdf_files: List[Path] = sorted(
            set(self.input_dir.glob("*.pdf")) | set(self.input_dir.glob("*.PDF"))
        )
        if not pdf_files:
            logger.warning("No PDF files found. Writing empty manifest for traceability.")
        entries: List[ManualEntry] = []
        for pdf in pdf_files:
            entry = self._build_entry(pdf)
            if normalize:
                entry = self._normalize_filename(entry)
            entries.append(entry)
        # Write manifest (always)
        manifest = {
            "total_files": len(entries),
            "valid_files": sum(1 for e in entries if e.valid),
            "invalid_files": [e.file_name for e in entries if not e.valid],
            "items": [
                {
                    **asdict(e),
                    "file_path": str(e.file_path),  # make JSON-serializable
                }
                for e in entries
            ],
        }
        # Ensure directory exists and write
        self.input_dir.mkdir(parents=True, exist_ok=True)
        with open(self.manifest_path, "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)
        logger.info(f"Manifest written: {self.manifest_path.resolve()}")
        logger.info(f"Total PDFs: {manifest['total_files']}, Valid: {manifest['valid_files']}")
        if manifest["invalid_files"]:
            logger.warning(f"Invalid PDFs: {manifest['invalid_files']}")
        return entries

    def _build_entry(self, pdf_path: Path) -> ManualEntry:
        """Assemble a ManualEntry from filename inference plus the quick PDF check."""
        meta = self._infer_metadata_from_name(pdf_path.name)
        check = self._quick_pdf_check(pdf_path)
        entry = ManualEntry(
            file_name=pdf_path.name,
            file_path=pdf_path,
            brand=meta.get("brand"),
            model=meta.get("model"),
            year=meta.get("year"),
            size_bytes=check.get("size_bytes"),
            pages=check.get("pages"),
            valid=check.get("valid", False),
            notes=check.get("notes"),
        )
        return entry
if __name__ == "__main__":
    # Run from the project root:
    # - Ensure both '.' (project root) and 'src' are on PYTHONPATH.
    # - Example (PowerShell):
    #     $env:PYTHONPATH=".;src"
    #     python -m src.utils.data_collector
    LocalManualCollector().collect(normalize=True)