CarAssistanceQA / src /utils /data_collector.py
Nihal2000's picture
inferencing using gemma 270 model
f05e8f9
# src/utils/data_collector.py
from __future__ import annotations
import json
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional, List, Dict, Any
# Ensure your project has these modules available:
# - config.settings: must expose a `settings` object with a RAW_PDFS_DIR attribute (string path), etc.
# - src.utils.logger: exposes get_logger(name) returning a loguru logger
# Project settings are mandatory: fail fast with an actionable message when
# they cannot be imported.
try:
    from config.settings import settings
except Exception as exc:
    raise RuntimeError(
        "Failed to import settings. Ensure config/settings.py exists and is importable. "
        "If config is at project root, run from project root and include both '.' and 'src' in PYTHONPATH."
    ) from exc

# The project logger is optional: when it cannot be imported, fall back to a
# plain standard-library logger so the module still works.
try:
    from src.utils.logger import get_logger
except Exception:
    import logging

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(message)s",
    )

    def get_logger(name: str):
        # Stand-in with the same call shape as the project's get_logger.
        return logging.getLogger(name)


logger = get_logger(__name__)
@dataclass
class ManualEntry:
    """
    Record describing one manual PDF handled by LocalManualCollector.

    Only the file name and path are required; every other field defaults to
    "unknown" (None) or, for ``valid``, to False.
    """

    # Location of the file.
    file_name: str
    file_path: Path

    # Metadata filled from the filename by the collector; None when unknown.
    brand: Optional[str] = None
    model: Optional[str] = None
    year: Optional[str] = None

    # Results of the collector's quick PDF check.
    size_bytes: Optional[int] = None
    pages: Optional[int] = None
    valid: bool = False
    notes: Optional[str] = None
class LocalManualCollector:
    """
    Collector for locally available car manual PDFs.

    - Scans the input directory (settings.RAW_PDFS_DIR unless overridden)
    - Infers metadata from filename (brand, model, year)
    - Optionally normalizes filenames to Brand_Model_Year.pdf
    - Performs minimal PDF integrity check
    - Writes a manifest JSON listing all files and metadata
    """

    def __init__(self, input_dir: Optional[str] = None):
        """
        Create a collector.

        Args:
            input_dir: Directory holding the PDFs. Falls back to
                settings.RAW_PDFS_DIR when omitted or empty. The directory
                is created if it does not exist.
        """
        self.input_dir: Path = Path(input_dir or settings.RAW_PDFS_DIR)
        # The manifest is always written into the scanned directory itself.
        self.manifest_path: Path = self.input_dir / "manuals_manifest.json"
        self._ensure_dirs()
        # Log resolved absolute paths to avoid confusion about the working dir.
        logger.info(f"RAW_PDFS_DIR resolved to: {self.input_dir.resolve()}")
        logger.info(f"Manifest target resolved to: {self.manifest_path.resolve()}")

    def _ensure_dirs(self) -> None:
        """Create the input directory (and parents) if it is missing."""
        self.input_dir.mkdir(parents=True, exist_ok=True)

    def _list_pdf_files(self) -> List[Path]:
        """
        Return the regular files in the input directory with a .pdf suffix.

        The suffix is compared case-insensitively in Python rather than by
        globbing "*.pdf" and "*.PDF" separately: on case-insensitive
        filesystems both patterns match the same file (producing duplicate
        entries), and mixed-case suffixes such as ".Pdf" match neither
        pattern on case-sensitive ones.
        """
        return sorted(
            p
            for p in self.input_dir.iterdir()
            if p.is_file() and p.suffix.lower() == ".pdf"
        )

    def _infer_metadata_from_name(self, name: str) -> Dict[str, Optional[str]]:
        """
        Try to infer brand, model, year from filename.

        Handles underscores/hyphens/spaces, is case-insensitive.
        The year is the LAST standalone 4-digit token in 1900-2099.
        Brand is the first non-year token; model is everything between the
        brand and the year (tokens after the year are ignored). If the year
        comes first, brand/model are taken from the tokens that follow it.

        Examples:
        - be_6_2025.pdf -> Brand: Be, Model: 6, Year: 2025
        - HYUNDAI_model_2024.pdf -> Brand: Hyundai, Model: Model, Year: 2024
        - Volkswagen_atlas_2024.pdf -> Brand: Volkswagen, Model: Atlas, Year: 2024
        - 2024_Kia_Seltos.pdf -> Brand: Kia, Model: Seltos, Year: 2024

        Returns:
            Dict with keys "brand", "model", "year"; each value is a string
            or None when it could not be inferred.
        """
        base = name.rsplit(".", 1)[0]
        cleaned = re.sub(r"[-_]+", " ", base.strip())
        tokens = cleaned.split()

        # Locate the last year-like token by index, so a repeated token
        # cannot be confused with an earlier occurrence.
        year_idx: Optional[int] = None
        for idx in range(len(tokens) - 1, -1, -1):
            if re.fullmatch(r"(19|20)\d{2}", tokens[idx]):
                year_idx = idx
                break
        year = tokens[year_idx] if year_idx is not None else None

        if year_idx is None:
            # No year: brand = first token, model = the rest.
            brand_part, model_part = tokens[:1], tokens[1:]
        elif year_idx == 0:
            # Year leads the name: skip it so the year is not used as brand.
            brand_part, model_part = tokens[1:2], tokens[2:]
        else:
            # brand is first token; model is everything between brand and year.
            brand_part, model_part = tokens[:1], tokens[1:year_idx]

        brand = brand_part[0].title() if brand_part else None
        model = " ".join(model_part).title() if model_part else None
        return {"brand": brand, "model": model, "year": year}

    def _quick_pdf_check(self, path: Path) -> Dict[str, Any]:
        """
        Minimal integrity check:
        - size > 0
        - try to open the document and read the first page if PyMuPDF is available

        Notes:
        - If PyMuPDF is not installed, still mark valid if size_bytes > 0
        - A failure to extract text from the first page does not invalidate
          the file, but is recorded in "notes"

        Returns:
            Dict with keys "size_bytes", "pages", "valid", "notes".
        """
        try:
            size_bytes: Optional[int] = path.stat().st_size
        except OSError:
            # Missing or unreadable file.
            size_bytes = None

        pages: Optional[int] = None
        notes: Optional[str] = None
        valid = False

        if not size_bytes:
            notes = "Empty or missing file."
        else:
            try:
                import fitz  # PyMuPDF
            except ImportError:
                # PyMuPDF not available; allow progression if non-empty
                fitz = None
                notes = "PyMuPDF not installed; validated by non-empty file size."
                valid = True

            if fitz is not None:
                try:
                    with fitz.open(str(path)) as doc:
                        pages = doc.page_count
                        if pages:
                            # Best effort: the document opened, so it stays
                            # valid even if text extraction fails.
                            try:
                                doc[0].get_text()
                            except Exception as e:
                                notes = f"First page text extraction failed: {e}"
                            valid = True
                        else:
                            notes = "PDF contains no pages."
                except Exception as e:
                    notes = f"PDF open failed: {e}"
                    valid = False

        return {"size_bytes": size_bytes, "pages": pages, "valid": valid, "notes": notes}

    def _safe_rename(self, src: Path, dst: Path) -> Path:
        """
        Safely rename src to dst; if dst exists, append a numeric suffix.

        If one of the candidate names (dst or a suffixed variant) turns out
        to be the source file itself, the source keeps that name instead of
        being moved to the next free suffix. This makes repeated runs
        idempotent: "X_1.pdf" stays "X_1.pdf" while "X.pdf" is taken.

        Returns:
            The final destination path used.
        """
        if src.resolve() == dst.resolve():
            return dst

        stem, suffix, parent = dst.stem, dst.suffix, dst.parent
        candidate = dst
        counter = 1
        while candidate.exists():
            if candidate.samefile(src):
                # Same file under this name already (possibly differing only
                # in letter case on a case-insensitive filesystem).
                if candidate.name != src.name:
                    src.rename(candidate)
                return candidate
            candidate = parent / f"{stem}_{counter}{suffix}"
            counter += 1

        src.rename(candidate)
        return candidate

    def _normalize_filename(self, entry: ManualEntry) -> ManualEntry:
        """
        Normalize filename to Brand_Model_Year.pdf if brand, model, year inferred.
        Otherwise, keep the original filename.

        Rename failures are logged as warnings and leave the entry untouched.
        The entry is updated in place and also returned.
        """
        if not (entry.brand and entry.model and entry.year):
            return entry

        normalized = f"{entry.brand}_{entry.model}_{entry.year}.pdf"
        # Replace characters that are invalid in filenames (and spaces).
        normalized = re.sub(r'[\\/:*?"<>| ]', "_", normalized)
        target_path = entry.file_path.parent / normalized
        if target_path.name == entry.file_path.name:
            return entry

        try:
            final_path = self._safe_rename(entry.file_path, target_path)
        except Exception as e:
            logger.warning(f"Could not rename '{entry.file_name}': {e}")
            return entry

        # Only report a rename when the name actually changed.
        if final_path.name != entry.file_path.name:
            logger.info(f"Renamed '{entry.file_name}' -> '{final_path.name}'")
        entry.file_name = final_path.name
        entry.file_path = final_path
        return entry

    def collect(self, normalize: bool = True) -> List[ManualEntry]:
        """
        Collect local PDFs from the input directory, optionally normalize
        filenames, and write/update a manifest file for downstream pipeline
        stages. Always writes a manifest (even if zero PDFs are found).

        Args:
            normalize: When True, rename files to Brand_Model_Year.pdf where
                all three parts could be inferred.

        Returns:
            One ManualEntry per PDF found, in sorted path order.
        """
        logger.info(f"Scanning directory for PDFs: {self.input_dir}")
        # The directory may have been removed since construction.
        self._ensure_dirs()
        pdf_files = self._list_pdf_files()
        if not pdf_files:
            logger.warning("No PDF files found. Writing empty manifest for traceability.")

        entries: List[ManualEntry] = []
        for pdf in pdf_files:
            entry = self._build_entry(pdf)
            if normalize:
                entry = self._normalize_filename(entry)
            entries.append(entry)

        # Write manifest (always)
        manifest = {
            "total_files": len(entries),
            "valid_files": sum(1 for e in entries if e.valid),
            "invalid_files": [e.file_name for e in entries if not e.valid],
            "items": [
                {
                    **asdict(e),
                    "file_path": str(e.file_path),  # make JSON-serializable
                }
                for e in entries
            ],
        }
        with open(self.manifest_path, "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)

        logger.info(f"Manifest written: {self.manifest_path.resolve()}")
        logger.info(f"Total PDFs: {manifest['total_files']}, Valid: {manifest['valid_files']}")
        if manifest["invalid_files"]:
            logger.warning(f"Invalid PDFs: {manifest['invalid_files']}")
        return entries

    def _build_entry(self, pdf_path: Path) -> ManualEntry:
        """Build a ManualEntry from filename metadata and the quick PDF check."""
        meta = self._infer_metadata_from_name(pdf_path.name)
        check = self._quick_pdf_check(pdf_path)
        # Keys of both dicts match ManualEntry field names one-to-one.
        return ManualEntry(
            file_name=pdf_path.name,
            file_path=pdf_path,
            **meta,
            **check,
        )
def main() -> None:
    """
    Collect manuals from the configured directory and write the manifest.

    Run from the project root:
    - Ensure both '.' (project root) and 'src' are on PYTHONPATH.
    - Example (PowerShell):
        $env:PYTHONPATH=".;src"
        python -m src.utils.data_collector
    """
    collector = LocalManualCollector()
    collector.collect(normalize=True)


if __name__ == "__main__":
    main()