File size: 3,684 Bytes
34b531b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from __future__ import annotations

import json

from app.config import DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE, PROCESSED_DIR
from app.schemas import Chunk, RawDocument


def output_scope(value) -> str:
    """Return the lower-cased output bucket name for a document or chunk.

    The ``scope`` attribute is preferred, then ``ticker``. A missing or falsy
    attribute is skipped, and ``"unknown"`` is returned when neither attribute
    yields a usable value.
    """
    for attribute in ("scope", "ticker"):
        candidate = getattr(value, attribute, "")
        if candidate:
            return candidate.lower()
    return "unknown"


def _claim_text_name(stem: str, taken: set[str]) -> str:
    """Return a ``.txt`` file name for ``stem`` that is not yet in ``taken``.

    The first document with a given stem keeps the plain ``<stem>.txt`` name;
    later ones receive ``<stem>_2.txt``, ``<stem>_3.txt`` and so on. The chosen
    name is recorded in ``taken`` before it is returned.
    """
    candidate = f"{stem}.txt"
    suffix = 2
    while candidate in taken:
        candidate = f"{stem}_{suffix}.txt"
        suffix += 1
    taken.add(candidate)
    return candidate


def write_processed_outputs(documents: list[RawDocument], chunks: list[Chunk]) -> None:
    """Persist extracted text, chunk records and a manifest under ``PROCESSED_DIR``.

    Layout produced:

    * ``text/<scope>/<stem>.txt`` - the ``text`` of each document, where
      ``<stem>`` comes from ``document.source_path.stem``.
    * ``chunks/<scope>/chunks.jsonl`` - one JSON object per chunk (the chunk's
      ``__dict__``), grouped by scope; the file is rewritten on every call.
    * ``metadata/manifest.json`` - document/chunk counts, the tickers and
      scopes seen on the chunks, and the chunking settings.

    ``<scope>`` is whatever ``output_scope`` returns for the document or chunk.

    Args:
        documents: Documents whose text is written out.
        chunks: Chunks to serialise and to summarise in the manifest.
    """
    for directory in [PROCESSED_DIR / "text", PROCESSED_DIR / "chunks", PROCESSED_DIR / "metadata"]:
        directory.mkdir(parents=True, exist_ok=True)

    # Two sources in the same scope can share a stem (e.g. report.pdf and
    # report.html). Track the names handed out per scope during this call so
    # the second document gets its own file instead of overwriting the first.
    names_by_scope: dict[str, set[str]] = {}
    for document in documents:
        scope = output_scope(document)
        out_dir = PROCESSED_DIR / "text" / scope
        out_dir.mkdir(parents=True, exist_ok=True)
        name = _claim_text_name(document.source_path.stem, names_by_scope.setdefault(scope, set()))
        (out_dir / name).write_text(document.text, encoding="utf-8")

    chunks_by_scope: dict[str, list[Chunk]] = {}
    for chunk in chunks:
        chunks_by_scope.setdefault(output_scope(chunk), []).append(chunk)

    for scope, scope_chunks in chunks_by_scope.items():
        out_dir = PROCESSED_DIR / "chunks" / scope
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = out_dir / "chunks.jsonl"
        with out_path.open("w", encoding="utf-8") as handle:
            for chunk in scope_chunks:
                # JSON Lines: one serialised chunk per line, non-ASCII kept as-is.
                handle.write(json.dumps(chunk.__dict__, ensure_ascii=False) + "\n")

    manifest = {
        "document_count": len(documents),
        "chunk_count": len(chunks),
        # Tickers and scopes are derived from the chunks, not the documents.
        "tickers": sorted({chunk.ticker for chunk in chunks if chunk.ticker}),
        "scopes": sorted({output_scope(chunk) for chunk in chunks}),
        "chunking": {
            "strategy": "structure-aware-token-based",
            "max_tokens": DEFAULT_CHUNK_SIZE,
            "overlap_tokens": DEFAULT_CHUNK_OVERLAP,
            "structures": ["headings", "tables", "widgets", "paragraphs"],
        },
    }
    (PROCESSED_DIR / "metadata" / "manifest.json").write_text(
        json.dumps(manifest, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )


def summarize_processed_data(documents: list[RawDocument], chunks: list[Chunk]) -> dict:
    """Compute count statistics over the processed documents and chunks.

    Documents are tallied by ``modality``, by output scope and (when set) by
    ``ticker``. Chunks are tallied by output scope, by ``ticker`` (when set)
    and by ``structure_type``. The ``tickers`` and ``scopes`` lists in the
    result are taken from the chunk tallies. Every per-key mapping is returned
    with its keys in sorted order.
    """

    def tally(counter: dict[str, int], key: str) -> None:
        # Increment the count stored for ``key``, starting from zero.
        counter[key] = counter.get(key, 0) + 1

    def ordered(counter: dict[str, int]) -> dict[str, int]:
        # Rebuild the mapping with its entries in sorted order.
        return dict(sorted(counter.items()))

    by_modality: dict[str, int] = {}
    docs_per_ticker: dict[str, int] = {}
    docs_per_scope: dict[str, int] = {}
    chunks_per_ticker: dict[str, int] = {}
    chunks_per_scope: dict[str, int] = {}
    by_structure: dict[str, int] = {}

    for doc in documents:
        tally(by_modality, doc.modality)
        tally(docs_per_scope, output_scope(doc))
        if doc.ticker:
            tally(docs_per_ticker, doc.ticker)

    for piece in chunks:
        tally(chunks_per_scope, output_scope(piece))
        if piece.ticker:
            tally(chunks_per_ticker, piece.ticker)
        tally(by_structure, piece.structure_type)

    summary: dict = {}
    summary["document_count"] = len(documents)
    summary["chunk_count"] = len(chunks)
    summary["tickers"] = sorted(chunks_per_ticker)
    summary["scopes"] = sorted(chunks_per_scope)
    summary["modalities"] = ordered(by_modality)
    summary["documents_by_ticker"] = ordered(docs_per_ticker)
    summary["chunks_by_ticker"] = ordered(chunks_per_ticker)
    summary["documents_by_scope"] = ordered(docs_per_scope)
    summary["chunks_by_scope"] = ordered(chunks_per_scope)
    summary["structures"] = ordered(by_structure)
    return summary