Spaces:

liamxdev
/

chatvns

Sleeping

Upload folder using huggingface_hub

34b531b verified 6 days ago

1.66 kB

	from __future__ import annotations

	import csv
	import json
	from pathlib import Path

	from app.config import RAW_DIR
	from app.processing.constants import IMAGE_EXTENSIONS, PDF_EXTENSIONS
	from app.processing.text_utils import rows_to_table_text


	def read_csv_rows(path: Path) -> list[list[str]]:
	    """Parse a CSV file into a list of rows.

	    The file is decoded as ``utf-8-sig`` so a leading byte-order mark does
	    not end up in the first cell, and opened with ``newline=""`` so the
	    ``csv`` module handles line endings itself.

	    Args:
	        path: Location of the CSV file.

	    Returns:
	        One list of cell strings per CSV record; empty for an empty file.
	    """
	    with path.open("r", encoding="utf-8-sig", newline="") as stream:
	        return list(csv.reader(stream))


	def read_pdf_text(path: Path) -> str:
	    """Extract the plain text of a PDF, or return a placeholder marker.

	    ``fitz`` is imported inside the ``try`` so that a failed import is
	    reported through the same placeholder as any other extraction error
	    instead of propagating to the caller.

	    Args:
	        path: Location of the PDF file.

	    Returns:
	        The per-page text joined with newlines when any non-whitespace text
	        was extracted. Otherwise a ``[PDF artifact without extracted text]``
	        marker naming the file, with ``| pymupdf_error=...`` appended when
	        extraction raised.
	    """
	    try:
	        import fitz

	        document = fitz.open(str(path))
	        try:
	            # A page may yield no text; substitute "" so join never sees None.
	            text = "\n".join(page.get_text("text") or "" for page in document)
	        finally:
	            document.close()
	    except Exception as exc:  # noqa: BLE001 - deliberate best-effort fallback
	        # Plain "|" separator: the previous "\|" was an invalid escape
	        # sequence that leaked a backslash into the marker text.
	        return f"[PDF artifact without extracted text] {path.name} | pymupdf_error={exc}"
	    if text.strip():
	        return text
	    return f"[PDF artifact without extracted text] {path.name}"


	def read_raw_file(path: Path) -> str:
	    """Render a raw artifact as text, dispatching on its file extension.

	    The extension is compared case-insensitively. ``.csv`` files are parsed
	    and passed through ``rows_to_table_text``; PDF extensions go through
	    ``read_pdf_text``; image extensions produce an ``[Image artifact]``
	    placeholder naming the file; anything else is read as UTF-8 with
	    undecodable bytes dropped.

	    Args:
	        path: Location of the artifact.

	    Returns:
	        The textual representation of the artifact.
	    """
	    extension = path.suffix.lower()
	    if extension == ".csv":
	        rows = read_csv_rows(path)
	        content = rows_to_table_text(rows)
	    elif extension in PDF_EXTENSIONS:
	        content = read_pdf_text(path)
	    elif extension in IMAGE_EXTENSIONS:
	        content = f"[Image artifact] {path.name}"
	    else:
	        content = path.read_text(encoding="utf-8", errors="ignore")
	    return content


	def load_metadata_for_artifact(path: Path, ticker: str) -> dict:
	    """Load the JSON metadata sidecar that belongs to a raw artifact.

	    The sidecar is expected at
	    ``RAW_DIR / "metadata" / ticker / "<stem>.metadata.json"``, where
	    ``<stem>`` is the artifact's file name without its final suffix.

	    Args:
	        path: Artifact whose stem names the sidecar file.
	        ticker: Sub-directory of the metadata folder to look in.

	    Returns:
	        The decoded JSON object, or an empty dict when the sidecar is
	        missing, unreadable, not valid JSON, or not a JSON object.
	    """
	    # Address the sidecar directly instead of globbing: a stem containing
	    # glob metacharacters ("[", "]", "*", "?") would be treated as a pattern
	    # and match the wrong file or nothing at all.
	    sidecar = RAW_DIR / "metadata" / ticker / f"{path.stem}.metadata.json"
	    try:
	        payload = json.loads(sidecar.read_text(encoding="utf-8-sig"))
	    except (OSError, ValueError):
	        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
	        return {}
	    # Honour the declared return type even if the file holds a list/scalar.
	    return payload if isinstance(payload, dict) else {}