File size: 1,658 Bytes
34b531b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from __future__ import annotations

import csv
import json
from pathlib import Path

from app.config import RAW_DIR
from app.processing.constants import IMAGE_EXTENSIONS, PDF_EXTENSIONS
from app.processing.text_utils import rows_to_table_text


def read_csv_rows(path: Path) -> list[list[str]]:
    """Parse the CSV file at ``path`` and return all of its rows.

    The file is decoded as ``utf-8-sig`` (a leading byte-order mark is
    dropped) and opened with ``newline=""`` so the ``csv`` module handles
    line endings itself. Each row is returned as a list of string cells;
    an empty file yields an empty list.
    """
    with path.open("r", encoding="utf-8-sig", newline="") as csv_file:
        reader = csv.reader(csv_file)
        return list(reader)


def read_pdf_text(path: Path) -> str:
    """Extract plain text from a PDF, falling back to a placeholder string.

    The text of every page is joined with newlines. This function does not
    raise for extraction problems: if importing ``fitz``, opening the
    document, or reading a page fails, a placeholder naming the file and the
    error is returned instead. A document whose extracted text is blank
    (whitespace only) yields the placeholder without an error suffix.
    """
    try:
        # Imported lazily so a missing or broken PDF backend degrades to the
        # placeholder below instead of failing at module import time.
        import fitz

        pdf = fitz.open(str(path))
        try:
            page_texts = [page.get_text("text") or "" for page in pdf]
            extracted = "\n".join(page_texts)
        finally:
            # Always release the document handle, even if a page fails.
            pdf.close()
    except Exception as error:  # noqa: BLE001
        return f"[PDF artifact without extracted text] {path.name} | pymupdf_error={error}"
    if not extracted.strip():
        return f"[PDF artifact without extracted text] {path.name}"
    return extracted


def read_raw_file(path: Path) -> str:
    """Return a text representation of a raw artifact, chosen by file suffix.

    Dispatch is on the lower-cased suffix of ``path``:

    * ``.csv`` -- rows are parsed by ``read_csv_rows`` and rendered with
      ``rows_to_table_text``.
    * a suffix in ``PDF_EXTENSIONS`` -- handled by ``read_pdf_text``.
    * a suffix in ``IMAGE_EXTENSIONS`` -- a placeholder naming the file is
      returned; the image content itself is not read.
    * anything else -- the file is read as UTF-8 text, silently dropping
      bytes that cannot be decoded.

    NOTE(review): the suffix is lower-cased before the membership tests, so
    this presumably expects the extension constants to hold lower-case,
    dot-prefixed values -- confirm against ``app.processing.constants``.
    """
    suffix = path.suffix.lower()
    if suffix == ".csv":
        return rows_to_table_text(read_csv_rows(path))
    if suffix in PDF_EXTENSIONS:
        return read_pdf_text(path)
    if suffix in IMAGE_EXTENSIONS:
        return f"[Image artifact] {path.name}"
    # "utf-8-sig" matches the CSV and metadata readers in this module: a
    # leading byte-order mark is stripped instead of leaking into the
    # returned text as "\ufeff". Files without a BOM decode exactly as before.
    return path.read_text(encoding="utf-8-sig", errors="ignore")


def load_metadata_for_artifact(path: Path, ticker: str) -> dict:
    """Load the JSON metadata sidecar for an artifact, or ``{}`` if unavailable.

    The sidecar is looked up at
    ``RAW_DIR / "metadata" / ticker / "<stem>.metadata.json"``, where
    ``<stem>`` is ``path.stem``.

    Args:
        path: Path of the artifact; only its stem is used.
        ticker: Name of the sub-directory under ``RAW_DIR / "metadata"``
            that is searched.

    Returns:
        The parsed JSON object. An empty dict is returned when the sidecar
        is missing or unreadable, is not valid UTF-8 or valid JSON, or when
        its top-level value is not a JSON object.
    """
    # Build the path directly rather than globbing: a stem containing glob
    # metacharacters such as "[", "]", "*" or "?" would otherwise be
    # interpreted as a pattern and match the wrong file (or none at all).
    metadata_path = RAW_DIR / "metadata" / ticker / f"{path.stem}.metadata.json"
    try:
        payload = json.loads(metadata_path.read_text(encoding="utf-8-sig"))
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError covers both
        # json.JSONDecodeError and the UnicodeDecodeError raised by
        # read_text for bytes that are not valid UTF-8.
        return {}
    # Honour the declared return type even if the file holds a list/scalar.
    return payload if isinstance(payload, dict) else {}