File size: 10,306 Bytes
f05e8f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
# src/utils/data_collector.py
from __future__ import annotations

import json
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional, List, Dict, Any

# Ensure your project has these modules available:
# - config.settings: must define RAW_PDFS_DIR (string path), etc.
# - src.utils.logger: exposes get_logger() returning a loguru logger
# Settings are mandatory: if the import fails for any reason, re-raise as a
# RuntimeError with setup guidance, chaining the original exception.
try:
    from config.settings import settings
except Exception as e:
    raise RuntimeError(
        "Failed to import settings. Ensure config/settings.py exists and is importable. "
        "If config is at project root, run from project root and include both '.' and 'src' in PYTHONPATH."
    ) from e

# The project logger is optional: if it cannot be imported, define a
# same-named get_logger() backed by the standard-library logging module.
try:
    from src.utils.logger import get_logger
except Exception as e:
    # Fallback simple logger if project logger not available
    import logging

    # Configure the root logger once with INFO level and a timestamped format.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
    def get_logger(name: str):
        return logging.getLogger(name)

# Module-level logger used by everything below.
logger = get_logger(__name__)


@dataclass
class ManualEntry:
    """Metadata record for a single manual PDF.

    Instances are built by ``LocalManualCollector._build_entry`` and are
    serialized into the manifest via ``dataclasses.asdict``.
    """

    # Name of the file (updated if the file is renamed during normalization).
    file_name: str
    # Location of the file on disk (updated together with file_name).
    file_path: Path
    # Brand, model and year inferred from the filename; None when not inferred.
    brand: Optional[str] = None
    model: Optional[str] = None
    year: Optional[str] = None
    # File size in bytes as reported by the integrity check; None if unavailable.
    size_bytes: Optional[int] = None
    # Page count reported by the integrity check; None if it was not determined.
    pages: Optional[int] = None
    # Result of the integrity check.
    valid: bool = False
    # Free-text explanation attached by the integrity check, if any.
    notes: Optional[str] = None


class LocalManualCollector:
    """Collect locally available car-manual PDFs and describe them in a manifest.

    The collector:

    - scans ``input_dir`` (``settings.RAW_PDFS_DIR`` by default) for PDF files,
    - infers brand, model and year from each filename,
    - optionally renames files to ``Brand_Model_Year.pdf``,
    - performs a minimal PDF integrity check, and
    - writes ``manuals_manifest.json`` listing every file and its metadata.
    """

    # Runs of hyphens/underscores are treated like spaces when tokenising names.
    _SEPARATORS_RE = re.compile(r"[-_]+")
    # A year token is exactly four digits starting with 19 or 20.
    _YEAR_RE = re.compile(r"(19|20)\d{2}")
    # Characters replaced by "_" when building a normalized filename.
    _UNSAFE_CHARS_RE = re.compile(r'[\\/:*?"<>| ]')

    def __init__(self, input_dir: Optional[str] = None):
        """Create a collector.

        Args:
            input_dir: Directory to scan. Falls back to
                ``settings.RAW_PDFS_DIR`` when omitted or empty. The directory
                is created if it does not exist.
        """
        self.input_dir: Path = Path(input_dir or settings.RAW_PDFS_DIR)
        # The manifest always lives next to the PDFs it describes.
        self.manifest_path: Path = self.input_dir / "manuals_manifest.json"
        self._ensure_dirs()

        # Log resolved absolute paths to avoid confusion about relative paths.
        logger.info(f"RAW_PDFS_DIR resolved to: {self.input_dir.resolve()}")
        logger.info(f"Manifest target resolved to: {self.manifest_path.resolve()}")

    def _ensure_dirs(self) -> None:
        """Create the input directory (and parents) if it is missing."""
        self.input_dir.mkdir(parents=True, exist_ok=True)

    def _infer_metadata_from_name(self, name: str) -> Dict[str, Optional[str]]:
        """Infer brand, model and year from a filename.

        Underscores, hyphens and whitespace all act as token separators, and
        brand/model are title-cased. The year is the last token that looks
        like a year (19xx or 20xx). Tokens before the year supply brand
        (first token) and model (the rest); tokens after the year are ignored.
        If the year is the very first token, the tokens after it are used
        instead so the year is never reported as the brand.

        Examples:
            - be_6_2025.pdf             -> Brand: Be, Model: 6, Year: 2025
            - HYUNDAI_model_2024.pdf    -> Brand: Hyundai, Model: Model, Year: 2024
            - Volkswagen_atlas_2024.pdf -> Brand: Volkswagen, Model: Atlas, Year: 2024
            - toyota_2020.pdf           -> Brand: Toyota, Model: None, Year: 2020
            - honda-civic.pdf           -> Brand: Honda, Model: Civic, Year: None

        Args:
            name: Filename, with or without an extension.

        Returns:
            Dict with keys ``brand``, ``model`` and ``year``; any value may be
            ``None`` when it could not be inferred.
        """
        base = name.rsplit(".", 1)[0]
        tokens = self._SEPARATORS_RE.sub(" ", base.strip()).split()

        # Prefer the year-like token closest to the end of the name.
        year = next(
            (tok for tok in reversed(tokens) if self._YEAR_RE.fullmatch(tok)),
            None,
        )

        if year is None:
            name_tokens = tokens
        else:
            year_idx = tokens.index(year)
            # Normally brand/model precede the year; if the year leads the
            # name, fall back to whatever follows it.
            name_tokens = tokens[:year_idx] if year_idx > 0 else tokens[1:]

        brand: Optional[str] = None
        model: Optional[str] = None
        if name_tokens:
            brand = name_tokens[0].title()
            if len(name_tokens) > 1:
                model = " ".join(name_tokens[1:]).title()

        return {"brand": brand, "model": model, "year": year}

    def _quick_pdf_check(self, path: Path) -> Dict[str, Any]:
        """Run a minimal integrity check on a PDF file.

        The file must be non-empty. If PyMuPDF (``fitz``) is importable, the
        document must also open and contain at least one page; reading the
        first page's text is attempted as a best-effort extra check whose
        failure is recorded in ``notes`` without invalidating the file. If
        PyMuPDF is not installed, a non-empty file is accepted as valid.

        Args:
            path: Location of the PDF to check.

        Returns:
            Dict with keys ``size_bytes``, ``pages``, ``valid`` and ``notes``.
        """
        try:
            size_bytes: Optional[int] = path.stat().st_size
        except OSError:
            size_bytes = None

        result: Dict[str, Any] = {
            "size_bytes": size_bytes,
            "pages": None,
            "valid": False,
            "notes": None,
        }

        if not size_bytes:
            result["notes"] = "Empty or missing file."
            return result

        try:
            import fitz  # PyMuPDF (optional dependency)
        except ImportError:
            # PyMuPDF not available; allow progression if non-empty.
            result["valid"] = True
            result["notes"] = "PyMuPDF not installed; validated by non-empty file size."
            return result

        try:
            with fitz.open(str(path)) as doc:
                pages = doc.page_count
                result["pages"] = pages
                if pages and pages > 0:
                    result["valid"] = True
                    try:
                        doc[0].get_text()  # content is irrelevant here
                    except Exception as exc:
                        # Best effort only: keep the file valid but say why
                        # the first page could not be read.
                        result["notes"] = f"First page text extraction failed: {exc}"
                else:
                    result["notes"] = "PDF has no pages."
        except Exception as exc:
            result["valid"] = False
            result["notes"] = f"PDF open failed: {exc}"

        return result

    def _safe_rename(self, src: Path, dst: Path) -> Path:
        """Rename ``src`` to ``dst`` without overwriting an existing file.

        If ``dst`` already exists, ``_1``, ``_2``, ... is appended to its stem
        until an unused name is found.

        Args:
            src: Existing file to rename.
            dst: Desired destination path.

        Returns:
            The destination path actually used (``dst`` itself when both
            paths already resolve to the same location).
        """
        if src.resolve() == dst.resolve():
            return dst

        candidate = dst
        counter = 1
        while candidate.exists():
            candidate = dst.parent / f"{dst.stem}_{counter}{dst.suffix}"
            counter += 1

        src.rename(candidate)
        return candidate

    def _normalize_filename(self, entry: ManualEntry) -> ManualEntry:
        """Rename the entry's file to ``Brand_Model_Year.pdf`` when possible.

        Nothing happens unless brand, model and year were all inferred.
        Characters that are unsafe in filenames, and spaces, become
        underscores. A failed rename is logged as a warning and the entry is
        left pointing at the original file.

        Args:
            entry: Entry to normalize; updated in place on a successful rename.

        Returns:
            The same entry object.
        """
        if not (entry.brand and entry.model and entry.year):
            return entry

        normalized = self._UNSAFE_CHARS_RE.sub(
            "_", f"{entry.brand}_{entry.model}_{entry.year}.pdf"
        )
        target_path = entry.file_path.parent / normalized
        if target_path.name == entry.file_path.name:
            return entry

        try:
            final_path = self._safe_rename(entry.file_path, target_path)
        except Exception as e:
            logger.warning(f"Could not rename '{entry.file_name}': {e}")
        else:
            logger.info(f"Renamed '{entry.file_name}' -> '{final_path.name}'")
            entry.file_name = final_path.name
            entry.file_path = final_path

        return entry

    def _list_pdfs(self) -> List[Path]:
        """Return the PDF files directly inside ``input_dir``, sorted by path.

        The ``.pdf`` extension is matched case-insensitively, and each
        directory entry is visited exactly once, so files are never listed
        twice on case-insensitive filesystems. Directories are skipped.
        """
        return sorted(
            p
            for p in self.input_dir.iterdir()
            if p.name.lower().endswith(".pdf") and not p.is_dir()
        )

    def collect(self, normalize: bool = True) -> List[ManualEntry]:
        """Scan for PDFs, optionally normalize filenames, and write the manifest.

        The manifest is always written, even when no PDFs are found.

        Args:
            normalize: When true, rename files to ``Brand_Model_Year.pdf``
                where brand, model and year could all be inferred.

        Returns:
            One ``ManualEntry`` per PDF found, in sorted path order.
        """
        logger.info(f"Scanning directory for PDFs: {self.input_dir}")

        # Make sure the directory exists before listing it and writing into it.
        self.input_dir.mkdir(parents=True, exist_ok=True)
        pdf_files = self._list_pdfs()

        if not pdf_files:
            logger.warning("No PDF files found. Writing empty manifest for traceability.")

        entries: List[ManualEntry] = []
        for pdf in pdf_files:
            entry = self._build_entry(pdf)
            if normalize:
                entry = self._normalize_filename(entry)
            entries.append(entry)

        invalid_names = [e.file_name for e in entries if not e.valid]
        manifest = {
            "total_files": len(entries),
            "valid_files": len(entries) - len(invalid_names),
            "invalid_files": invalid_names,
            "items": [
                {
                    **asdict(e),
                    "file_path": str(e.file_path),  # make JSON-serializable
                }
                for e in entries
            ],
        }

        with open(self.manifest_path, "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)

        logger.info(f"Manifest written: {self.manifest_path.resolve()}")
        logger.info(f"Total PDFs: {manifest['total_files']}, Valid: {manifest['valid_files']}")
        if manifest["invalid_files"]:
            logger.warning(f"Invalid PDFs: {manifest['invalid_files']}")

        return entries

    def _build_entry(self, pdf_path: Path) -> ManualEntry:
        """Build a ``ManualEntry`` from filename metadata and the integrity check."""
        meta = self._infer_metadata_from_name(pdf_path.name)
        check = self._quick_pdf_check(pdf_path)
        return ManualEntry(
            file_name=pdf_path.name,
            file_path=pdf_path,
            brand=meta.get("brand"),
            model=meta.get("model"),
            year=meta.get("year"),
            size_bytes=check.get("size_bytes"),
            pages=check.get("pages"),
            valid=check.get("valid", False),
            notes=check.get("notes"),
        )


if __name__ == "__main__":
    # Run from the project root, with both '.' (project root) and 'src' on
    # PYTHONPATH. Example (PowerShell):
    #
    #     $env:PYTHONPATH=".;src"
    #     python -m src.utils.data_collector
    collector = LocalManualCollector()
    collector.collect(normalize=True)