akryldigital's picture
bug fix
383ecc7 verified
"""Data loading utilities for chunks and JSON files."""
import json
from pathlib import Path
from typing import List, Dict, Any
try:
from langchain.docstore.document import Document
except:
from langchain_core.documents import Document
def load_json(filepath: Path | str) -> List[Dict[str, Any]]:
"""
Load JSON data from file.
Args:
filepath: Path to JSON file
Returns:
List of dictionaries containing the JSON data
"""
filepath = Path(filepath)
if not filepath.exists():
raise FileNotFoundError(f"JSON file not found: {filepath}")
with open(filepath, 'r', encoding='utf-8') as f:
data = json.load(f)
return data
def open_file(filepath: Path | str) -> str:
"""
Open and read a text file.
Args:
filepath: Path to text file
Returns:
File contents as string
"""
filepath = Path(filepath)
if not filepath.exists():
raise FileNotFoundError(f"File not found: {filepath}")
with open(filepath, 'r', encoding='utf-8') as f:
content = f.read()
return content
def load_chunks(chunks_file: Path | str | None = None) -> List[Dict[str, Any]]:
    """
    Load document chunks from JSON file.

    Args:
        chunks_file: Path to chunks JSON file. If None, uses the default
            relative path "reports/docling_chunks.json".

    Returns:
        List of chunk dictionaries, exactly as returned by load_json

    Raises:
        FileNotFoundError: Propagated from load_json when the file is missing
    """
    # None is the "use the default location" sentinel, so the annotation
    # spells it out explicitly instead of relying on an implicit Optional.
    if chunks_file is None:
        chunks_file = Path("reports/docling_chunks.json")
    return load_json(chunks_file)
def chunks_to_documents(chunks: List[Dict[str, Any]]) -> List[Document]:
    """
    Build one LangChain Document per chunk dictionary.

    Args:
        chunks: List of chunk dictionaries

    Returns:
        List of Document objects, in the same order as the input. A chunk
        without "content" yields an empty page_content; a chunk without
        "metadata" yields an empty metadata dict.
    """
    return [
        Document(
            page_content=entry.get("content", ""),
            metadata=entry.get("metadata", {})
        )
        for entry in chunks
    ]
def validate_chunks(chunks: List[Dict[str, Any]]) -> bool:
    """
    Validate that chunks have required fields.

    Every chunk must be a dictionary holding "content" and "metadata" keys,
    and its metadata must itself be a dictionary with a "filename" key.
    Validation stops at the first offending chunk.

    Args:
        chunks: List of chunk dictionaries

    Returns:
        True if valid (an empty list is considered valid)

    Raises:
        ValueError: If a chunk is not a dictionary, is missing a required
            field, has non-dictionary metadata, or its metadata lacks
            'filename'
    """
    required_fields = ("content", "metadata")
    for i, chunk in enumerate(chunks):
        # Reject non-dict entries up front: otherwise the membership test
        # below raises TypeError (e.g. for None) or does a substring match
        # (for strings) instead of reporting the documented ValueError.
        if not isinstance(chunk, dict):
            raise ValueError(f"Chunk {i} must be a dictionary")
        for field in required_fields:
            if field not in chunk:
                raise ValueError(f"Chunk {i} missing required field: {field}")
        # Validate metadata has required fields
        metadata = chunk["metadata"]
        if not isinstance(metadata, dict):
            raise ValueError(f"Chunk {i} metadata must be a dictionary")
        # Check for common metadata fields
        if "filename" not in metadata:
            raise ValueError(f"Chunk {i} metadata missing 'filename' field")
    return True