# NOTE: removed scrape artifacts ("Spaces:" / "Sleeping" x2) — web-UI residue, not part of this module.
# src/utils/data_collector.py
from __future__ import annotations

import json
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional, List, Dict, Any

# Project-local dependencies expected by this module:
# - config.settings: must define RAW_PDFS_DIR (string path), etc.
# - src.utils.logger: exposes get_logger() returning a loguru logger
try:
    from config.settings import settings
except Exception as e:
    raise RuntimeError(
        "Failed to import settings. Ensure config/settings.py exists and is importable. "
        "If config is at project root, run from project root and include both '.' and 'src' in PYTHONPATH."
    ) from e

try:
    from src.utils.logger import get_logger
except Exception as e:
    # Project logger unavailable: degrade gracefully to a stdlib logger
    # so the collector still works outside the full application context.
    import logging

    logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")

    def get_logger(name: str):
        return logging.getLogger(name)

logger = get_logger(__name__)
@dataclass
class ManualEntry:
    """Metadata record for one locally stored PDF manual.

    Instances are created by LocalManualCollector and serialized into the
    manifest JSON via dataclasses.asdict(), so this MUST be a dataclass.
    (Bug fix: the @dataclass decorator was missing, which made keyword
    construction and asdict() fail at runtime.)
    """

    file_name: str                      # current filename on disk (may change after normalization)
    file_path: Path                     # full path to the PDF
    brand: Optional[str] = None         # inferred from filename; None when not inferrable
    model: Optional[str] = None         # inferred from filename; None when not inferrable
    year: Optional[str] = None          # 4-digit year as a string, e.g. "2024"
    size_bytes: Optional[int] = None    # None when the file is missing
    pages: Optional[int] = None         # page count when PyMuPDF is available, else None
    valid: bool = False                 # True when the quick integrity check passed
    notes: Optional[str] = None         # diagnostic message for invalid/partial checks
class LocalManualCollector:
    """
    Collector for locally available car manual PDFs.
    - Scans data/raw_pdfs/
    - Infers metadata from filename (brand, model, year)
    - Optionally normalizes filenames to Brand_Model_Year.pdf
    - Performs minimal PDF integrity check
    - Writes a manifest JSON listing all files and metadata
    """

    def __init__(self, input_dir: Optional[str] = None):
        """Root the collector at *input_dir*; defaults to settings.RAW_PDFS_DIR."""
        self.input_dir: Path = Path(input_dir or settings.RAW_PDFS_DIR)
        # Manifest always written into RAW_PDFS_DIR
        self.manifest_path: Path = self.input_dir / "manuals_manifest.json"
        self._ensure_dirs()
        # Log resolved absolute paths to avoid confusion
        logger.info(f"RAW_PDFS_DIR resolved to: {self.input_dir.resolve()}")
        logger.info(f"Manifest target resolved to: {self.manifest_path.resolve()}")

    def _ensure_dirs(self) -> None:
        # Create the input directory up front so a fresh checkout works.
        self.input_dir.mkdir(parents=True, exist_ok=True)

    def _infer_metadata_from_name(self, name: str) -> Dict[str, Optional[str]]:
        """
        Try to infer brand, model, year from filename.
        Handles underscores/hyphens/spaces, is case-insensitive.
        Examples:
        - be_6_2025.pdf -> Brand: Be, Model: 6, Year: 2025
        - HYUNDAI_model_2024.pdf-> Brand: Hyundai, Model: Model, Year: 2024
        - Volkswagen_atlas_2024.pdf -> Brand: Volkswagen, Model: Atlas, Year: 2024
        """
        base = name.rsplit(".", 1)[0]
        # Treat runs of '-' / '_' as word separators before tokenizing.
        cleaned = re.sub(r"[-_]+", " ", base.strip())
        tokens = [t for t in cleaned.split() if t]
        # Find a 4-digit year token at the end or near the end
        year = None
        for t in reversed(tokens):
            if re.fullmatch(r"(19|20)\d{2}", t):
                year = t
                break
        brand: Optional[str] = None
        model: Optional[str] = None
        if tokens:
            if year and year in tokens:
                yi = tokens.index(year)
                # brand is first token; model is everything between brand and year
                if yi >= 2:
                    brand = tokens[0].title()
                    model = " ".join(tokens[1:yi]).title()
                elif yi == 1:
                    # Only the brand precedes the year (e.g. Toyota_2024.pdf).
                    # BUG FIX: was `tokens.title()` — AttributeError on a list.
                    brand = tokens[0].title()
                else:
                    # Year is the first token; likely not intended, fall back
                    # BUG FIX: was `tokens.title()` — AttributeError on a list.
                    brand = tokens[0].title()
                    if len(tokens) > 1:
                        model = " ".join(tokens[1:]).title()
            else:
                # No year found; brand = first token, model = rest
                # BUG FIX (both branches): was `tokens.title()` on the list.
                if len(tokens) >= 2:
                    brand = tokens[0].title()
                    model = " ".join(tokens[1:]).title()
                else:
                    brand = tokens[0].title()
        return {"brand": brand, "model": model, "year": year}

    def _quick_pdf_check(self, path: Path) -> Dict[str, Any]:
        """
        Minimal integrity check:
        - size > 0
        - try to read first page text if PyMuPDF available
        Notes:
        - If PyMuPDF is not installed, still mark valid if size_bytes > 0
        """
        size_bytes = path.stat().st_size if path.exists() else None
        pages: Optional[int] = None
        notes: Optional[str] = None
        valid = False
        if size_bytes and size_bytes > 0:
            try:
                import fitz  # PyMuPDF

                with fitz.open(str(path)) as doc:
                    pages = doc.page_count
                    # Try reading the first page to ensure it opens
                    if pages and pages > 0:
                        try:
                            _ = doc[0].get_text()  # ignore content
                        except Exception:
                            # Text extraction failure alone does not invalidate the file.
                            pass
                        valid = True
                    else:
                        valid = False
            except ImportError:
                # PyMuPDF not available; allow progression if non-empty
                notes = "PyMuPDF not installed; validated by non-empty file size."
                valid = True
            except Exception as e:
                notes = f"PDF open failed: {e}"
                valid = False
        else:
            notes = "Empty or missing file."
        return {"size_bytes": size_bytes, "pages": pages, "valid": valid, "notes": notes}

    def _safe_rename(self, src: Path, dst: Path) -> Path:
        """
        Safely rename src to dst; if dst exists, append a numeric suffix.
        Returns the final destination path used.
        """
        if src.resolve() == dst.resolve():
            return dst
        candidate = dst
        stem = dst.stem
        suffix = dst.suffix
        parent = dst.parent
        i = 1
        # Probe Brand_Model_Year_1.pdf, _2.pdf, ... until a free name is found.
        while candidate.exists():
            candidate = parent / f"{stem}_{i}{suffix}"
            i += 1
        src.rename(candidate)
        return candidate

    def _normalize_filename(self, entry: ManualEntry) -> ManualEntry:
        """
        Normalize filename to Brand_Model_Year.pdf if brand, model, year inferred.
        Otherwise, keep the original filename.
        """
        if entry.brand and entry.model and entry.year:
            normalized = f"{entry.brand}_{entry.model}_{entry.year}.pdf"
            # Replace characters unsafe on common filesystems (and spaces).
            normalized = re.sub(r'[\\/:*?"<>| ]', "_", normalized)
            target_path = entry.file_path.parent / normalized
            if target_path.name != entry.file_path.name:
                try:
                    final_path = self._safe_rename(entry.file_path, target_path)
                    logger.info(f"Renamed '{entry.file_name}' -> '{final_path.name}'")
                    entry.file_name = final_path.name
                    entry.file_path = final_path
                except Exception as e:
                    # Best-effort: a failed rename keeps the original entry intact.
                    logger.warning(f"Could not rename '{entry.file_name}': {e}")
        return entry

    def collect(self, normalize: bool = True) -> List[ManualEntry]:
        """
        Collect local PDFs from RAW_PDFS_DIR, optionally normalize filenames,
        and write/update a manifest file for downstream pipeline stages.
        Always writes a manifest (even if zero PDFs are found).
        """
        logger.info(f"Scanning directory for PDFs: {self.input_dir}")
        # Case-insensitive scanning: handle .pdf and .PDF.
        # BUG FIX: use a set union — on case-insensitive filesystems
        # (Windows/macOS) both globs match the same files, which previously
        # produced duplicate entries in the manifest.
        pdf_files: List[Path] = sorted(
            set(self.input_dir.glob("*.pdf")) | set(self.input_dir.glob("*.PDF"))
        )
        if not pdf_files:
            logger.warning("No PDF files found. Writing empty manifest for traceability.")
        entries: List[ManualEntry] = []
        for pdf in pdf_files:
            entry = self._build_entry(pdf)
            if normalize:
                entry = self._normalize_filename(entry)
            entries.append(entry)
        # Write manifest (always)
        manifest = {
            "total_files": len(entries),
            "valid_files": sum(1 for e in entries if e.valid),
            "invalid_files": [e.file_name for e in entries if not e.valid],
            "items": [
                {
                    **asdict(e),
                    "file_path": str(e.file_path),  # make JSON-serializable
                }
                for e in entries
            ],
        }
        # Ensure directory exists and write
        self.input_dir.mkdir(parents=True, exist_ok=True)
        with open(self.manifest_path, "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)
        logger.info(f"Manifest written: {self.manifest_path.resolve()}")
        logger.info(f"Total PDFs: {manifest['total_files']}, Valid: {manifest['valid_files']}")
        if manifest["invalid_files"]:
            logger.warning(f"Invalid PDFs: {manifest['invalid_files']}")
        return entries

    def _build_entry(self, pdf_path: Path) -> ManualEntry:
        """Assemble a ManualEntry from filename inference plus the quick PDF check."""
        meta = self._infer_metadata_from_name(pdf_path.name)
        check = self._quick_pdf_check(pdf_path)
        entry = ManualEntry(
            file_name=pdf_path.name,
            file_path=pdf_path,
            brand=meta.get("brand"),
            model=meta.get("model"),
            year=meta.get("year"),
            size_bytes=check.get("size_bytes"),
            pages=check.get("pages"),
            valid=check.get("valid", False),
            notes=check.get("notes"),
        )
        return entry
if __name__ == "__main__":
    # Run from the project root:
    # - Ensure both '.' (project root) and 'src' are on PYTHONPATH.
    # - Example (PowerShell):
    #     $env:PYTHONPATH=".;src"
    #     python -m src.utils.data_collector
    LocalManualCollector().collect(normalize=True)