# Hugging Face Space: Nihal2000/CarAssistanceQA (status: Sleeping)
# File size: 10,306 Bytes
# Revision: f05e8f9

# src/utils/data_collector.py
from __future__ import annotations

import json
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional, List, Dict, Any

# Ensure your project has these modules available:
# - config.settings: must define RAW_PDFS_DIR (string path), etc.
# - src.utils.logger: exposes get_logger() returning a loguru logger
# Project settings are mandatory: LocalManualCollector reads
# settings.RAW_PDFS_DIR as its default directory, so an import failure is
# re-raised as a RuntimeError carrying setup guidance.
try:
    from config.settings import settings
except Exception as e:
    raise RuntimeError(
        "Failed to import settings. Ensure config/settings.py exists and is importable. "
        "If config is at project root, run from project root and include both '.' and 'src' in PYTHONPATH."
    ) from e

# The project logger is optional: any failure to import it falls back to a
# standard-library logger with the same get_logger(name) entry point.
try:
    from src.utils.logger import get_logger
except Exception as e:
    # Fallback simple logger if project logger not available
    import logging

    # NOTE: basicConfig configures the root logger as an import-time side effect.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
    def get_logger(name: str):
        """Return a standard-library logger for *name* (fallback implementation)."""
        return logging.getLogger(name)

# Module-wide logger used by LocalManualCollector below.
logger = get_logger(__name__)


@dataclass
class ManualEntry:
    """Metadata record describing one manual PDF; serialized into the manifest."""

    # File name of the PDF (updated if the file is renamed during normalization).
    file_name: str
    # Location of the PDF on disk (updated alongside file_name on rename).
    file_path: Path
    # Brand inferred from the filename, or None when it could not be inferred.
    brand: Optional[str] = None
    # Model inferred from the filename, or None when it could not be inferred.
    model: Optional[str] = None
    # Four-digit year token found in the filename, kept as a string.
    year: Optional[str] = None
    # File size in bytes; None when the file did not exist at check time.
    size_bytes: Optional[int] = None
    # Page count reported by PyMuPDF; None when the PDF was not opened.
    pages: Optional[int] = None
    # Result of the quick integrity check.
    valid: bool = False
    # Free-text explanation from the integrity check, if any.
    notes: Optional[str] = None


class LocalManualCollector:
    """

    Collector for locally available car manual PDFs.

    - Scans the input directory (settings.RAW_PDFS_DIR unless overridden)
    - Infers metadata from filename (brand, model, year)
    - Optionally normalizes filenames to Brand_Model_Year.pdf
    - Performs minimal PDF integrity check
    - Writes a manifest JSON listing all files and metadata

    """

    def __init__(self, input_dir: Optional[str] = None):
        """
        Set up the directory to scan and the manifest location.

        Args:
            input_dir: Directory holding the manual PDFs. When omitted (or
                otherwise falsy), ``settings.RAW_PDFS_DIR`` is used.
        """
        chosen_dir = input_dir if input_dir else settings.RAW_PDFS_DIR
        self.input_dir: Path = Path(chosen_dir)
        # The manifest is stored inside the scanned directory itself.
        self.manifest_path: Path = self.input_dir.joinpath("manuals_manifest.json")
        self._ensure_dirs()

        # Absolute paths in the log make a wrong working directory easy to spot.
        resolved_dir = self.input_dir.resolve()
        logger.info(f"RAW_PDFS_DIR resolved to: {resolved_dir}")
        resolved_manifest = self.manifest_path.resolve()
        logger.info(f"Manifest target resolved to: {resolved_manifest}")

    def _ensure_dirs(self) -> None:
        self.input_dir.mkdir(parents=True, exist_ok=True)

    def _infer_metadata_from_name(self, name: str) -> Dict[str, Optional[str]]:
        """

        Try to infer brand, model, year from filename.

        Handles underscores/hyphens/spaces, is case-insensitive.

        Examples:

          - be_6_2025.pdf         -> Brand: Be, Model: 6, Year: 2025

          - HYUNDAI_model_2024.pdf-> Brand: Hyundai, Model: Model, Year: 2024

          - Volkswagen_atlas_2024.pdf -> Brand: Volkswagen, Model: Atlas, Year: 2024

        """
        base = name.rsplit(".", 1)[0]
        cleaned = re.sub(r"[-_]+", " ", base.strip())
        tokens = [t for t in cleaned.split() if t]

        # Find a 4-digit year token at the end or near the end
        year = None
        for t in reversed(tokens):
            if re.fullmatch(r"(19|20)\d{2}", t):
                year = t
                break

        brand: Optional[str] = None
        model: Optional[str] = None

        if tokens:
            if year and year in tokens:
                yi = tokens.index(year)
                # brand is first token; model is everything between brand and year
                if yi >= 2:
                    brand = tokens[0].title()
                    model = " ".join(tokens[1:yi]).title()
                elif yi == 1:
                    brand = tokens.title()
                else:
                    # Year is the first token; likely not intended, fall back
                    brand = tokens.title()
                    if len(tokens) > 1:
                        model = " ".join(tokens[1:]).title()
            else:
                # No year found; brand = first token, model = rest
                if len(tokens) >= 2:
                    brand = tokens.title()
                    model = " ".join(tokens[1:]).title()
                else:
                    brand = tokens.title()

        return {"brand": brand, "model": model, "year": year}

    def _quick_pdf_check(self, path: Path) -> Dict[str, Any]:
        """

        Minimal integrity check:

        - size > 0

        - try to read first page text if PyMuPDF available

        Notes:

        - If PyMuPDF is not installed, still mark valid if size_bytes > 0

        """
        size_bytes = path.stat().st_size if path.exists() else None
        pages: Optional[int] = None
        notes: Optional[str] = None
        valid = False

        if size_bytes and size_bytes > 0:
            try:
                import fitz  # PyMuPDF
                with fitz.open(str(path)) as doc:
                    pages = doc.page_count
                    # Try reading the first page to ensure it opens
                    if pages and pages > 0:
                        try:
                            _ = doc[0].get_text()  # ignore content
                        except Exception:
                            pass
                        valid = True
                    else:
                        valid = False
            except ImportError:
                # PyMuPDF not available; allow progression if non-empty
                notes = "PyMuPDF not installed; validated by non-empty file size."
                valid = True
            except Exception as e:
                notes = f"PDF open failed: {e}"
                valid = False
        else:
            notes = "Empty or missing file."

        return {"size_bytes": size_bytes, "pages": pages, "valid": valid, "notes": notes}

    def _safe_rename(self, src: Path, dst: Path) -> Path:
        """

        Safely rename src to dst; if dst exists, append a numeric suffix.

        Returns the final destination path used.

        """
        if src.resolve() == dst.resolve():
            return dst

        candidate = dst
        stem = dst.stem
        suffix = dst.suffix
        parent = dst.parent

        i = 1
        while candidate.exists():
            candidate = parent / f"{stem}_{i}{suffix}"
            i += 1

        src.rename(candidate)
        return candidate

    def _normalize_filename(self, entry: ManualEntry) -> ManualEntry:
        """

        Normalize filename to Brand_Model_Year.pdf if brand, model, year inferred.

        Otherwise, keep the original filename.

        """
        if entry.brand and entry.model and entry.year:
            normalized = f"{entry.brand}_{entry.model}_{entry.year}.pdf"
            normalized = re.sub(r'[\\/:*?"<>| ]', "_", normalized)
            target_path = entry.file_path.parent / normalized

            if target_path.name != entry.file_path.name:
                try:
                    final_path = self._safe_rename(entry.file_path, target_path)
                    logger.info(f"Renamed '{entry.file_name}' -> '{final_path.name}'")
                    entry.file_name = final_path.name
                    entry.file_path = final_path
                except Exception as e:
                    logger.warning(f"Could not rename '{entry.file_name}': {e}")

        return entry

    def collect(self, normalize: bool = True) -> List[ManualEntry]:
        """
        Collect local PDFs from the input directory and write the manifest.

        Each PDF becomes a ManualEntry (filename metadata plus a quick
        integrity check) and, when requested, is renamed to the normalized
        Brand_Model_Year.pdf form. The manifest is always written, even if
        zero PDFs are found.

        Args:
            normalize: When True, rename files whose brand, model and year
                could all be inferred from the filename.

        Returns:
            One entry per PDF, in the sorted order of the paths as they were
            found; entry paths reflect any renames performed.
        """
        logger.info(f"Scanning directory for PDFs: {self.input_dir}")

        # The directory may have disappeared since __init__; recreate it so
        # both the scan and the manifest write below succeed.
        self.input_dir.mkdir(parents=True, exist_ok=True)

        # One case-insensitive pass over the directory. Globbing "*.pdf" and
        # "*.PDF" separately lists every file twice on case-insensitive
        # filesystems (the second visit then sees a renamed/missing file) and
        # misses mixed-case extensions such as ".Pdf" on case-sensitive ones.
        pdf_files: List[Path] = sorted(
            p
            for p in self.input_dir.iterdir()
            if p.name.lower().endswith(".pdf") and not p.is_dir()
        )

        if not pdf_files:
            logger.warning("No PDF files found. Writing empty manifest for traceability.")

        entries: List[ManualEntry] = []
        for pdf in pdf_files:
            entry = self._build_entry(pdf)
            if normalize:
                entry = self._normalize_filename(entry)
            entries.append(entry)

        # Write manifest (always)
        manifest = {
            "total_files": len(entries),
            "valid_files": sum(1 for e in entries if e.valid),
            "invalid_files": [e.file_name for e in entries if not e.valid],
            "items": [
                {
                    **asdict(e),
                    "file_path": str(e.file_path),  # make JSON-serializable
                }
                for e in entries
            ],
        }

        with open(self.manifest_path, "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)

        logger.info(f"Manifest written: {self.manifest_path.resolve()}")
        logger.info(f"Total PDFs: {manifest['total_files']}, Valid: {manifest['valid_files']}")
        if manifest["invalid_files"]:
            logger.warning(f"Invalid PDFs: {manifest['invalid_files']}")

        return entries

    def _build_entry(self, pdf_path: Path) -> ManualEntry:
        """
        Build the ManualEntry for one PDF.

        Combines the metadata inferred from the filename with the result of
        the quick integrity check; nothing on disk is modified.
        """
        inferred = self._infer_metadata_from_name(pdf_path.name)
        integrity = self._quick_pdf_check(pdf_path)

        # Missing keys fall back to None (or False for "valid").
        name_fields = {key: inferred.get(key) for key in ("brand", "model", "year")}
        check_fields = {key: integrity.get(key) for key in ("size_bytes", "pages", "notes")}

        return ManualEntry(
            file_name=pdf_path.name,
            file_path=pdf_path,
            valid=integrity.get("valid", False),
            **name_fields,
            **check_fields,
        )


if __name__ == "__main__":
    # Script entry point: scan the default directory, normalize filenames and
    # write the manifest.
    #
    # Run from the project root:
    #   - Ensure both '.' (project root) and 'src' are on PYTHONPATH.
    #   - Example (PowerShell):
    #       $env:PYTHONPATH=".;src"
    #       python -m src.utils.data_collector
    collector = LocalManualCollector()
    collector.collect(normalize=True)