| from __future__ import annotations
|
|
|
| import csv |
| import json |
| from pathlib import Path |
|
|
| from app.config import RAW_DIR |
| from app.processing.constants import IMAGE_EXTENSIONS, PDF_EXTENSIONS |
| from app.processing.text_utils import rows_to_table_text |
|
|
|
|
def read_csv_rows(path: Path) -> list[list[str]]:
    """Parse the CSV file at *path* and return all of its rows.

    The file is opened as ``utf-8-sig`` so a leading byte-order mark is
    stripped, and with ``newline=""`` so the ``csv`` module handles line
    endings (including newlines embedded in quoted fields) itself.

    Returns:
        One list of string cells per CSV record; an empty list when the
        file contains no records.
    """
    with path.open("r", encoding="utf-8-sig", newline="") as stream:
        return list(csv.reader(stream))
|
|
|
|
|
def read_pdf_text(path: Path) -> str:
    """Extract the plain text of the PDF at *path*, never raising on failure.

    The ``fitz`` module is imported lazily inside the function, so a missing
    installation is handled like any other extraction error.

    Returns:
        The text of all pages joined with newlines. If anything goes wrong
        (import, open, extraction, close) a placeholder string naming the
        file and the error is returned instead; if extraction succeeds but
        yields only whitespace, a placeholder naming the file is returned.
    """
    try:
        import fitz

        pdf = fitz.open(str(path))
        try:
            # A page may yield a falsy value; normalise it to "" for join().
            extracted = "\n".join(page.get_text("text") or "" for page in pdf)
        finally:
            # Always release the document, even if a page fails to extract.
            pdf.close()
    except Exception as exc:
        # Deliberate best-effort: report the failure inline rather than raise.
        return f"[PDF artifact without extracted text] {path.name} | pymupdf_error={exc}"

    return extracted if extracted.strip() else f"[PDF artifact without extracted text] {path.name}"
|
|
|
|
def read_raw_file(path: Path) -> str:
    """Return a text representation of the raw artifact at *path*.

    The lower-cased file suffix selects the reader, checked in this order:

    * ``.csv`` - rows are parsed with ``read_csv_rows`` and rendered with
      ``rows_to_table_text``.
    * a suffix in ``PDF_EXTENSIONS`` - delegated to ``read_pdf_text``.
    * a suffix in ``IMAGE_EXTENSIONS`` - a placeholder naming the file;
      the image itself is not read.
    * anything else - read as UTF-8 text, skipping undecodable bytes.
    """
    extension = path.suffix.lower()

    if extension == ".csv":
        content = rows_to_table_text(read_csv_rows(path))
    elif extension in PDF_EXTENSIONS:
        content = read_pdf_text(path)
    elif extension in IMAGE_EXTENSIONS:
        content = f"[Image artifact] {path.name}"
    else:
        content = path.read_text(encoding="utf-8", errors="ignore")

    return content
|
|
|
|
|
def load_metadata_for_artifact(path: Path, ticker: str) -> dict:
    """Load the JSON metadata sidecar for an artifact, if one exists.

    The sidecar is looked up at
    ``RAW_DIR / "metadata" / <ticker> / "<artifact stem>.metadata.json"``.

    Args:
        path: Path of the artifact; only its stem is used.
        ticker: Name of the sub-directory under ``RAW_DIR / "metadata"``.

    Returns:
        The parsed JSON content, or an empty dict when the sidecar is
        missing, unreadable, not valid UTF-8, or not valid JSON.
    """
    # Build the literal path instead of globbing: a stem containing glob
    # metacharacters ("[", "]", "*", "?") must not be treated as a pattern,
    # otherwise e.g. "q[1]" would match "q1.metadata.json".
    sidecar = RAW_DIR / "metadata" / ticker / f"{path.stem}.metadata.json"

    try:
        return json.loads(sidecar.read_text(encoding="utf-8-sig"))
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError from bad encodings.
        return {}
|
|
|