File size: 4,310 Bytes
b15141d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from __future__ import annotations

from dataclasses import dataclass
import re
from typing import Iterable


_ZWSP_RE = re.compile(r"[\u200b\u200c\u200d\ufeff]")


def _clean(s: str) -> str:
    return _ZWSP_RE.sub("", s).strip()


# Whole-line, case-insensitive patterns accepted as a chapter heading:
#   * "chapter" + whitespace + digits (nothing after the number)
#   * a line starting with "new chapter", followed by anything
#   * a line starting with "kungur" that later contains "chapter"
_CHAPTER_HEADER_RE = re.compile(
    r"^(chapter\s+\d+|new\s+chapter\b.*|kungur\b.*chapter\b.*)$",
    flags=re.IGNORECASE,
)


def is_chapter_header(line: str) -> bool:
    """Tell whether *line* is a chapter heading.

    The line is normalised with ``_clean`` before it is compared against
    ``_CHAPTER_HEADER_RE``.
    """
    candidate = _clean(line)
    return _CHAPTER_HEADER_RE.match(candidate) is not None


# A divider line consists of three or more asterisks and nothing else
# (trailing whitespace tolerated).
_DIVIDER_RE = re.compile(r"^\*{3,}\s*$")


def is_segment_divider(line: str) -> bool:
    """Tell whether *line* is a segment divider such as ``***``.

    The line is normalised with ``_clean`` before it is compared against
    ``_DIVIDER_RE``.
    """
    candidate = _clean(line)
    return _DIVIDER_RE.match(candidate) is not None


@dataclass(frozen=True)
class Segment:
    """One block of manuscript text inside a chapter (immutable record)."""

    # Segment identifier. The markdown parser in this module builds it as
    # "<chapter index>-<segment index>", both zero-padded to three digits.
    id: str
    # The segment's text.
    text: str


@dataclass(frozen=True)
class Chapter:
    """A titled chapter holding an ordered list of segments.

    ``frozen=True`` only blocks attribute reassignment: the ``segments`` list
    itself stays mutable, and because it is a list, ``hash()`` of an instance
    raises ``TypeError``.
    """

    # Chapter identifier. The markdown parser in this module builds it as
    # "ch-<zero-padded index>-<slug of the title>".
    id: str
    # Chapter title.
    title: str
    # Segments of the chapter, in document order.
    segments: list[Segment]


@dataclass(frozen=True)
class Manuscript:
    """A whole manuscript: a title plus an ordered list of chapters.

    ``frozen=True`` only blocks attribute reassignment: the ``chapters`` list
    itself stays mutable, and because it is a list, ``hash()`` of an instance
    raises ``TypeError``.
    """

    # Manuscript title.
    title: str
    # Chapters in document order.
    chapters: list[Chapter]
    # Optional, defaults to None.
    # NOTE(review): presumably the path of the file the text was loaded
    # from - confirm with callers.
    source_path: str | None = None


def _slugify(value: str) -> str:
    """Turn *value* into a lowercase, hyphen-separated ASCII slug.

    The value is normalised with ``_clean`` and lower-cased; every run of
    characters other than ``a-z`` / ``0-9`` becomes a single hyphen, and
    hyphens at either end are dropped. When nothing is left the slug is
    ``"untitled"``.
    """
    lowered = _clean(value).lower()
    # Collapse each run of non-alphanumeric characters into one hyphen, then
    # trim hyphens left dangling at the edges.
    slug = re.sub(r"[^a-z0-9]+", "-", lowered).strip("-")
    if slug:
        return slug
    return "untitled"


def split_segments(lines: Iterable[str]) -> list[str]:
    """Split *lines* into segment texts at every divider line.

    Divider lines (see ``is_segment_divider``) are dropped. The lines between
    two dividers are joined with newlines and stripped; groups that end up
    empty are skipped, so repeated, leading or trailing dividers never yield
    empty segments.
    """
    finished: list[str] = []
    pending: list[str] = []

    def flush() -> None:
        # Close the group collected so far and keep it only if it has text.
        body = "\n".join(pending).strip()
        if body:
            finished.append(body)
        pending.clear()

    for line in lines:
        if is_segment_divider(line):
            flush()
        else:
            pending.append(line)

    # The text after the last divider (or the whole input) is a segment too.
    flush()
    return finished


# Optional markdown ATX heading marker ("# " .. "###### ") in front of a title.
_MD_HEADING_PREFIX_RE = re.compile(r"^#{1,6}\s+")


def parse_chapters_from_markdown(
    text: str,
    *,
    manuscript_title: str = "The Shadow of Lillya",
    source_path: str | None = None,
) -> Manuscript:
    """Parse manuscript text into chapters and segments.

    A line starts a new chapter when ``is_chapter_header`` accepts it, either
    as-is or after a leading markdown heading marker (``#`` to ``######``
    followed by whitespace) has been removed, so ``## Chapter 1`` is
    recognised just like ``Chapter 1``. Lines before the first chapter header
    are ignored. Each chapter body is split into segments with
    ``split_segments``.

    If the text contains no chapter header at all, the whole text becomes a
    single chapter titled "Draft".

    Args:
        text: Full manuscript text.
        manuscript_title: Title stored on the returned manuscript.
        source_path: Optional value stored as ``Manuscript.source_path``.

    Returns:
        A ``Manuscript`` whose chapter ids look like ``ch-001-<slug>`` and
        whose segment ids look like ``001-001`` (chapter index, then segment
        index, both 1-based).
    """
    lines = text.splitlines()

    chapters: list[tuple[str, list[str]]] = []
    current_title: str | None = None
    current_lines: list[str] = []

    for raw in lines:
        # Drop an optional "## " style prefix so markdown headings are matched
        # and the stored title does not carry the hashes.
        candidate = _MD_HEADING_PREFIX_RE.sub("", _clean(raw))
        if is_chapter_header(candidate):
            # Flush the chapter collected so far before starting the next one.
            if current_title is not None:
                chapters.append((current_title, current_lines))
            current_title = candidate
            current_lines = []
            continue
        if current_title is None:
            # Ignore preamble until the first chapter marker.
            continue
        current_lines.append(raw)

    if current_title is not None:
        chapters.append((current_title, current_lines))

    parsed: list[Chapter] = []
    for idx, (title, body_lines) in enumerate(chapters, start=1):
        segs = [
            Segment(id=f"{idx:03d}-{sidx:03d}", text=seg_text)
            for sidx, seg_text in enumerate(split_segments(body_lines), start=1)
        ]
        # Truncating the slug may cut right after a hyphen; trim it so the id
        # never ends with "-".
        slug = _slugify(title)[:40].rstrip("-")
        parsed.append(Chapter(id=f"ch-{idx:03d}-{slug}", title=title, segments=segs))

    # If no explicit chapter headers were found, treat the whole text as one chapter.
    if not parsed:
        segs = [
            Segment(id=f"001-{i:03d}", text=t)
            for i, t in enumerate(split_segments(lines), start=1)
        ]
        parsed = [Chapter(id="ch-001-draft", title="Draft", segments=segs)]

    return Manuscript(title=manuscript_title, chapters=parsed, source_path=source_path)


def manuscript_to_dict(m: Manuscript) -> dict:
    """Convert a manuscript into a plain nested dict/list structure.

    The result has the keys ``title``, ``source_path`` and ``chapters``; each
    chapter is a dict with ``id``, ``title`` and ``segments``, and each
    segment is a dict with ``id`` and ``text``.
    """
    result: dict = {"title": m.title, "source_path": m.source_path}

    chapter_dicts = []
    for chapter in m.chapters:
        segment_dicts = [
            {"id": segment.id, "text": segment.text} for segment in chapter.segments
        ]
        chapter_dicts.append(
            {"id": chapter.id, "title": chapter.title, "segments": segment_dicts}
        )

    result["chapters"] = chapter_dicts
    return result


def _value_or(mapping: dict, key: str, default):
    """Return ``mapping[key]``, or *default* when the key is missing or None."""
    value = mapping.get(key)
    return default if value is None else value


def dict_to_manuscript(data: dict) -> Manuscript:
    """Rebuild a ``Manuscript`` from the dict layout of ``manuscript_to_dict``.

    Optional fields fall back to defaults both when the key is missing and
    when its value is ``None`` (e.g. JSON ``null``): ``chapters`` and
    ``segments`` default to an empty list, chapter ``title`` and segment
    ``text`` to ``""``, and the manuscript ``title`` to ``"Manuscript"``.
    ``source_path`` is copied as-is and is ``None`` when absent.

    Raises:
        KeyError: If a chapter or segment entry has no ``"id"`` key.
    """
    chapters: list[Chapter] = []
    for c in _value_or(data, "chapters", []):
        segs = [
            Segment(id=s["id"], text=_value_or(s, "text", ""))
            for s in _value_or(c, "segments", [])
        ]
        chapters.append(Chapter(id=c["id"], title=_value_or(c, "title", ""), segments=segs))
    return Manuscript(
        title=_value_or(data, "title", "Manuscript"),
        chapters=chapters,
        source_path=data.get("source_path"),
    )


def manuscript_to_markdown(m: Manuscript) -> str:
    """Render a manuscript as markdown text.

    Layout: a ``# <title>`` line, then for every chapter a ``## <title>``
    heading (``Chapter <n>`` when the chapter title is empty) followed by its
    segments. Segment texts are stripped; segments that are empty after
    stripping are skipped. A ``***`` divider line is written *between*
    consecutive segments of a chapter only, never after the last one, so the
    document does not end with (and chapters are not closed by) a dangling
    divider. Blocks are separated by blank lines and the result ends with
    exactly one newline.
    """
    parts: list[str] = [f"# {m.title}".strip(), ""]
    for i, c in enumerate(m.chapters, start=1):
        parts.append(f"## {c.title or f'Chapter {i}'}".strip())
        parts.append("")
        bodies = [body for body in (s.text.strip() for s in c.segments) if body]
        for position, body in enumerate(bodies):
            if position:
                # Divider goes between segments, not after each of them.
                parts.extend(("***", ""))
            parts.extend((body, ""))
    return "\n".join(parts).rstrip() + "\n"