"""Discover raw artifacts on disk and turn them into ``RawDocument`` records.

The helpers in this module read a raw tree laid out as
``<root>/<category>/<scope>/...``: the first path component below the root
selects the modality and the second one is the scope.
"""

from __future__ import annotations

from pathlib import Path

from app.config import RAW_DIR
from app.processing.constants import IMAGE_EXTENSIONS, PDF_EXTENSIONS, TEXT_EXTENSIONS
from app.processing.readers import load_metadata_for_artifact, read_raw_file
from app.processing.text_utils import clean_document_text, normalize_text, stable_id
from app.schemas import RawDocument

# Placeholder scope returned when a path does not sit two levels below a root.
_UNKNOWN_SCOPE = "unknown"

# Category directory name -> modality label; anything else is treated as "text".
_MODALITY_BY_CATEGORY = {"csv": "table", "pdf": "pdf", "images": "image"}


def _locate(
    path: Path, raw_dir: Path | None = None
) -> tuple[Path, tuple[str, ...]] | None:
    """Return ``(root, parts_below_root)`` for the first root containing *path*.

    ``RAW_DIR`` is always tried first so that paths under the configured root
    resolve exactly as they did before *raw_dir* was supported; *raw_dir* is
    only consulted as a fallback. Returns ``None`` when neither root contains
    *path*.
    """
    candidates = (RAW_DIR,) if raw_dir is None else (RAW_DIR, raw_dir)
    for root in candidates:
        try:
            return root, path.relative_to(root).parts
        except ValueError:
            continue
    return None


def ticker_from_path(path: Path, raw_dir: Path | None = None) -> str:
    """Return the upper-cased scope of *path*, or ``""`` for the market scope.

    Args:
        path: Location of a raw artifact.
        raw_dir: Optional alternative root, used when *path* is not under
            ``RAW_DIR``.

    Returns:
        ``""`` when the scope equals ``"market"`` (case-insensitively),
        otherwise the scope upper-cased. An unresolved scope therefore yields
        ``"UNKNOWN"``.
    """
    scope = scope_from_path(path, raw_dir)
    return "" if scope.lower() == "market" else scope.upper()


def scope_from_path(path: Path, raw_dir: Path | None = None) -> str:
    """Return the second path component of *path* below the raw root.

    Args:
        path: Location of a raw artifact.
        raw_dir: Optional alternative root, used when *path* is not under
            ``RAW_DIR``.

    Returns:
        The scope directory name, or ``"unknown"`` when *path* is outside
        every known root or has fewer than two components below it.
    """
    located = _locate(path, raw_dir)
    if located is None:
        return _UNKNOWN_SCOPE
    parts = located[1]
    return parts[1] if len(parts) > 1 else _UNKNOWN_SCOPE


def modality_from_path(path: Path, raw_dir: Path | None = None) -> str:
    """Map the category directory of *path* to a modality label.

    The category is the first component below the raw root; when *path* is
    outside every known root (or is the root itself) the name of its parent
    directory is used instead.

    Args:
        path: Location of a raw artifact.
        raw_dir: Optional alternative root, used when *path* is not under
            ``RAW_DIR``.

    Returns:
        ``"table"`` for ``csv``, ``"pdf"`` for ``pdf``, ``"image"`` for
        ``images``, and ``"text"`` for any other category.
    """
    located = _locate(path, raw_dir)
    if located is not None and located[1]:
        category = located[1][0]
    else:
        category = path.parent.name
    return _MODALITY_BY_CATEGORY.get(category, "text")


def text_dump_exists(path: Path, raw_dir: Path | None = None) -> bool:
    """Report whether ``<root>/text/<scope>/<stem>.txt`` exists for *path*.

    Args:
        path: Location of a raw artifact.
        raw_dir: Optional alternative root, used when *path* is not under
            ``RAW_DIR``.

    Returns:
        ``True`` when a ``.txt`` file with the same stem exists in the text
        category of the same scope. The lookup happens in the root that
        contains *path*, defaulting to ``RAW_DIR`` when none does.
    """
    located = _locate(path, raw_dir)
    root = located[0] if located is not None else RAW_DIR
    scope = scope_from_path(path, raw_dir)
    return (root / "text" / scope / f"{path.stem}.txt").exists()


def iter_raw_documents(raw_dir: Path = RAW_DIR) -> list[RawDocument]:
    """Collect every readable raw artifact under *raw_dir* as a ``RawDocument``.

    A file is skipped when any of the following holds:

    * its lower-cased suffix is not a known text, PDF, or image extension;
    * it is not a regular file;
    * one of its path components below *raw_dir* is ``metadata``;
    * it is an ``.html`` file for which a text dump already exists;
    * its cleaned text is empty.

    Args:
        raw_dir: Root directory to walk recursively. Defaults to ``RAW_DIR``.

    Returns:
        The documents in sorted path order, so repeated runs over the same
        tree produce the same sequence.
    """
    allowed_extensions = TEXT_EXTENSIONS | PDF_EXTENSIONS | IMAGE_EXTENSIONS
    # Document ids are derived from the path relative to the parent of the
    # root, so the root directory's own name is part of the id.
    id_root = raw_dir.parent
    documents: list[RawDocument] = []

    # rglob order is filesystem-dependent; sort for reproducible output.
    for path in sorted(raw_dir.rglob("*")):
        suffix = path.suffix.lower()
        if suffix not in allowed_extensions:
            continue
        if not path.is_file():
            continue
        # Only look below raw_dir: an ancestor directory that happens to be
        # called "metadata" must not filter out the whole tree.
        if "metadata" in path.relative_to(raw_dir).parts:
            continue
        if suffix == ".html" and text_dump_exists(path, raw_dir):
            continue

        text = clean_document_text(normalize_text(read_raw_file(path)))
        if not text:
            continue

        scope = scope_from_path(path, raw_dir)
        documents.append(
            RawDocument(
                id=stable_id(path.relative_to(id_root).as_posix()),
                text=text,
                source_path=path,
                modality=modality_from_path(path, raw_dir),
                ticker=ticker_from_path(path, raw_dir),
                metadata=load_metadata_for_artifact(path, scope),
                scope=scope,
            )
        )
    return documents