chatvns/app/processing/outputs.py
liamxdev's picture
Upload folder using huggingface_hub
34b531b verified
Raw
History Blame Contribute Delete
3.68 kB
from __future__ import annotations
import json
from app.config import DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE, PROCESSED_DIR
from app.schemas import Chunk, RawDocument
def output_scope(value) -> str:
    """Return the lowercase scope label used to group *value*'s outputs.

    The first truthy attribute among ``scope`` and ``ticker`` (checked in that
    order) is lowercased and returned. When both are missing or falsy the
    label is ``"unknown"``.
    """
    for attribute in ("scope", "ticker"):
        label = getattr(value, attribute, "")
        if label:
            return label.lower()
    return "unknown"
def write_processed_outputs(documents: list[RawDocument], chunks: list[Chunk]) -> None:
    """Write document text, per-scope chunk files and a manifest under PROCESSED_DIR.

    Layout produced:
      * ``text/<scope>/<source stem>.txt`` -- one file per document.
      * ``chunks/<scope>/chunks.jsonl`` -- one JSON object per line, per chunk.
      * ``metadata/manifest.json`` -- counts, tickers, scopes and chunking settings.

    The scope of each document and chunk comes from ``output_scope``.
    Existing files at those paths are overwritten.
    """
    text_root = PROCESSED_DIR / "text"
    chunks_root = PROCESSED_DIR / "chunks"
    metadata_root = PROCESSED_DIR / "metadata"
    for root in (text_root, chunks_root, metadata_root):
        root.mkdir(parents=True, exist_ok=True)

    # One plain-text file per document, named after the source file's stem.
    for document in documents:
        scope_dir = text_root / output_scope(document)
        scope_dir.mkdir(parents=True, exist_ok=True)
        target = scope_dir / (document.source_path.stem + ".txt")
        target.write_text(document.text, encoding="utf-8")

    # Group chunks by scope, keeping first-seen scope order and chunk order.
    grouped: dict[str, list[Chunk]] = {}
    for chunk in chunks:
        scope = output_scope(chunk)
        if scope not in grouped:
            grouped[scope] = []
        grouped[scope].append(chunk)

    for scope, members in grouped.items():
        scope_dir = chunks_root / scope
        scope_dir.mkdir(parents=True, exist_ok=True)
        with (scope_dir / "chunks.jsonl").open("w", encoding="utf-8") as handle:
            for member in members:
                line = json.dumps(member.__dict__, ensure_ascii=False)
                handle.write(line + "\n")

    tickers = {chunk.ticker for chunk in chunks if chunk.ticker}
    chunking = {
        "strategy": "structure-aware-token-based",
        "max_tokens": DEFAULT_CHUNK_SIZE,
        "overlap_tokens": DEFAULT_CHUNK_OVERLAP,
        "structures": ["headings", "tables", "widgets", "paragraphs"],
    }
    manifest = {
        "document_count": len(documents),
        "chunk_count": len(chunks),
        "tickers": sorted(tickers),
        # The grouping keys are exactly the distinct chunk scopes.
        "scopes": sorted(grouped),
        "chunking": chunking,
    }
    manifest_text = json.dumps(manifest, ensure_ascii=False, indent=2)
    (metadata_root / "manifest.json").write_text(manifest_text, encoding="utf-8")
def summarize_processed_data(documents: list[RawDocument], chunks: list[Chunk]) -> dict:
    """Count documents and chunks by modality, ticker, scope and structure type.

    Returns a dict holding the total document and chunk counts, the sorted
    tickers and scopes found on chunks, and count tables ordered by key.
    Documents and chunks whose ``ticker`` is falsy are left out of the
    per-ticker tables. Scopes come from ``output_scope``.
    """

    def bump(table: dict[str, int], key: str) -> None:
        # Increment the tally for key, starting from zero on first sight.
        table[key] = table.get(key, 0) + 1

    def ordered(table: dict[str, int]) -> dict[str, int]:
        # Rebuild the table with its entries sorted by key.
        return dict(sorted(table.items()))

    docs_per_modality: dict[str, int] = {}
    docs_per_ticker: dict[str, int] = {}
    docs_per_scope: dict[str, int] = {}
    chunks_per_ticker: dict[str, int] = {}
    chunks_per_scope: dict[str, int] = {}
    chunks_per_structure: dict[str, int] = {}

    for document in documents:
        bump(docs_per_modality, document.modality)
        bump(docs_per_scope, output_scope(document))
        if document.ticker:
            bump(docs_per_ticker, document.ticker)

    for chunk in chunks:
        bump(chunks_per_scope, output_scope(chunk))
        if chunk.ticker:
            bump(chunks_per_ticker, chunk.ticker)
        bump(chunks_per_structure, chunk.structure_type)

    summary = {
        "document_count": len(documents),
        "chunk_count": len(chunks),
        # Ticker and scope listings reflect chunks only, not documents.
        "tickers": sorted(chunks_per_ticker),
        "scopes": sorted(chunks_per_scope),
        "modalities": ordered(docs_per_modality),
        "documents_by_ticker": ordered(docs_per_ticker),
        "chunks_by_ticker": ordered(chunks_per_ticker),
        "documents_by_scope": ordered(docs_per_scope),
        "chunks_by_scope": ordered(chunks_per_scope),
        "structures": ordered(chunks_per_structure),
    }
    return summary