Yifei Wang committed on
Commit 9168fe7 · 1 Parent(s): fccac4e

Add RAG toggle app and dependencies

README.md CHANGED
@@ -5,7 +5,7 @@ emoji: ✨
  colorFrom: indigo
  colorTo: purple
  sdk: gradio
- app_file: app.py
+ app_file: app_rag.py
  pinned: false
  ---
  ## Project Structure (Updated)
app_rag.py ADDED
@@ -0,0 +1,797 @@
+ from __future__ import annotations
+
+ import traceback
+ import sys
+ from pathlib import Path
+ sys.path.insert(0, str(Path(__file__).resolve().parent / "src"))
+
+ import os
+ import queue
+ import re
+ import threading
+ import time
+ from functools import lru_cache
+
+ import gradio as gr
+
+ from numen_scriptorium.inference.qwen import get_model_device, load_model, stream_generate
+
+
+ BASE_MODEL = os.getenv("NS_BASE_MODEL", "Qwen/Qwen2.5-7B-Instruct")
+ ADAPTER = os.getenv("NS_ADAPTER", "outputs/qwen2_5_7b_boh_qlora/best").strip() or None
+ USE_4BIT = os.getenv("NS_USE_4BIT", "1") == "1"
+ DEFAULT_INSTRUCTION = os.getenv("NS_DEFAULT_INSTRUCTION", "请将输入翻译为中文,并保持原文风格。")
+
+ _RUNTIME_LOADED = False
+ _ACTIVE_STOP_EVENT: threading.Event | None = None
+ _STOP_LOCK = threading.Lock()
+
+
+ @lru_cache(maxsize=1)
+ def _get_rag_resource_summary() -> str:
+     from infer_hybrid_RAG import rag_resource_summary
+
+     return rag_resource_summary()
+
+
+ def _format_mode_indicator(use_rag: bool) -> str:
+     if use_rag:
+         resources = _get_rag_resource_summary()
+         return (
+             "### Active mode\n"
+             "- **Mode:** `RAG (hybrid)`\n"
+             f"- **Resources:** `{resources}`"
+         )
+     return (
+         "### Active mode\n"
+         "- **Mode:** `Non-RAG (existing stream_generate pipeline)`\n"
+         f"- **Resources:** `base={BASE_MODEL}, adapter={ADAPTER or 'None'}, 4bit={USE_4BIT}`"
+     )
+
+
+ def _on_mode_toggled(use_rag: bool):
+     return _format_mode_indicator(use_rag)
+
+
+ def _is_rag_runtime_loaded() -> bool:
+     try:
+         from infer_hybrid_RAG import get_rag_runtime
+
+         return get_rag_runtime.cache_info().currsize > 0
+     except Exception:
+         return False
+
+
+ def _set_active_stop_event(stop_event: threading.Event | None):
+     global _ACTIVE_STOP_EVENT
+     lock = _STOP_LOCK
+     # During interpreter shutdown, module globals can be partially torn down.
+     # Fall back to a best-effort direct assignment instead of raising.
+     if lock is None:
+         _ACTIVE_STOP_EVENT = stop_event
+         return
+     try:
+         with lock:
+             _ACTIVE_STOP_EVENT = stop_event
+     except Exception:
+         _ACTIVE_STOP_EVENT = stop_event
+
+
+ def _request_stop():
+     lock = _STOP_LOCK
+     if lock is None:
+         event = _ACTIVE_STOP_EVENT
+         if event is not None:
+             event.set()
+         return
+     try:
+         with lock:
+             if _ACTIVE_STOP_EVENT is not None:
+                 _ACTIVE_STOP_EVENT.set()
+     except Exception:
+         event = _ACTIVE_STOP_EVENT
+         if event is not None:
+             event.set()
+
+
+ def _on_stop_clicked():
+     _request_stop()
+     return _format_status(
+         stage="Stop requested",
+         loaded=_RUNTIME_LOADED,
+         device="unknown",
+         loading_percent="--",
+         error="Stop requested. Waiting for backend generation to halt.",
+     )
+
+
+ def _on_clear_clicked():
+     # Clear should also stop any in-flight generation to avoid concurrent
+     # updates from the stream generator after the UI has been reset.
+     _request_stop()
+     return (
+         DEFAULT_INSTRUCTION,
+         "",
+         False,
+         "",
+         _format_status(stage="Idle", loaded=_RUNTIME_LOADED, device="unknown", loading_percent="0%"),
+         _format_mode_indicator(False),
+         "0.00s",
+     )
+
+
+ def _format_loading_percent(value: int) -> str:
+     return f"{max(0, min(100, int(value)))}%"
+
+
+ def _infer_example_label(instruction: str, user_input: str, idx: int) -> str:
+     lower_instruction = instruction.lower()
+     if "sun's design" in user_input.lower():
+         return "BoH EN→ZH (Sun's Design)"
+     if "velvet lesson" in user_input.lower() or "moth and dream" in lower_instruction:
+         return "Moth&Dream EN→ZH (Velvet Lesson)"
+     if "deposition" in lower_instruction:
+         return "EN Generation (Deposition)"
+     if "generate one entry" in lower_instruction or "catalog" in lower_instruction:
+         return "EN Generation (Catalog Entry)"
+     return f"Example {idx + 1}"
+
+
+ def _load_demo_examples():
+     """Load examples from demo_examples.txt / demo_example.txt.
+
+     Expected per block:
+     - python infer_qlora_qwen3_boh.py ...
+     - --instruction "..."
+     - --input "..."
+     - optional --max_new_tokens <int>
+     """
+     candidate_files = [
+         Path(__file__).resolve().parent / "demo_examples.txt",
+         Path(__file__).resolve().parent / "demo_example.txt",
+     ]
+     file_path = next((p for p in candidate_files if p.exists()), None)
+     if file_path is None:
+         return [], "⚠️ Examples file not found (expected demo_examples.txt)."
+
+     try:
+         raw = file_path.read_text(encoding="utf-8")
+     except Exception:
+         return [], "⚠️ Could not read examples file."
+
+     block_pattern = re.compile(
+         r"python\s+infer_qlora_qwen3_boh\.py(?P<body>.*?)(?=(?:\n\s*python\s+infer_qlora_qwen3_boh\.py)|\Z)",
+         re.DOTALL,
+     )
+     instruction_pattern = re.compile(r'--instruction\s+"(?P<instruction>.*?)"\s*`', re.DOTALL)
+     input_pattern = re.compile(r'--input\s+"(?P<input>.*?)"\s*`', re.DOTALL)
+     max_tokens_pattern = re.compile(r"--max_new_tokens\s+(?P<max_new_tokens>\d+)")
+
+     parsed = []
+     for idx, block in enumerate(block_pattern.finditer(raw)):
+         body = block.group("body")
+         instruction_match = instruction_pattern.search(body)
+         input_match = input_pattern.search(body)
+         if not instruction_match or not input_match:
+             continue
+
+         instruction = instruction_match.group("instruction").strip()
+         user_input = input_match.group("input").strip()
+         max_match = max_tokens_pattern.search(body)
+         max_new_tokens = int(max_match.group("max_new_tokens")) if max_match else None
+
+         parsed.append(
+             {
+                 "label": _infer_example_label(instruction, user_input, idx),
+                 "instruction": instruction,
+                 "input": user_input,
+                 "max_new_tokens": max_new_tokens,
+                 "use_rag": False,
+             }
+         )
+
+     if not parsed:
+         return [], "⚠️ Failed to parse demo examples. Please check examples file format."
+
+     has_rag_example = any("rag" in ex["label"].lower() or ex.get("use_rag") for ex in parsed)
+     if not has_rag_example:
+         parsed.append(
+             {
+                 "label": "RAG Example (hybrid terms)",
+                 "instruction": "You are a translator. Translate English into Chinese while preserving lore style and preferred lore term mappings.",
+                 "input": "In Emesa, the Sun-in-Splendour is named in a black corundum tablet beside the Grail and the Forge.",
+                 "max_new_tokens": 384,
+                 "use_rag": True,
+             }
+         )
+     return parsed, None
+
+
+ def _apply_example(example: dict):
+     max_tokens_update = (
+         example["max_new_tokens"] if example.get("max_new_tokens") is not None else gr.update()
+     )
+     use_rag = bool(example.get("use_rag", False))
+     return example["instruction"], example["input"], max_tokens_update, use_rag, _format_mode_indicator(use_rag)
+
+
+ def _format_status(
+     *,
+     stage: str,
+     loaded: bool,
+     device: str,
+     loading_percent: str | None = None,
+     elapsed: float | None = None,
+     error: str | None = None,
+     stream_chunks: int | None = None,
+     output_chars: int | None = None,
+ ):
+     lines = [
+         "### Model / System status",
+         f"- **Stage:** {stage}",
+         f"- **Model loaded:** {'✅ Yes' if loaded else '❌ No'}",
+         f"- **Device:** `{device}`",
+         f"- **Base model:** `{BASE_MODEL}`",
+         f"- **Adapter:** `{ADAPTER or 'None'}`",
+         f"- **4-bit quantization:** `{USE_4BIT}`",
+     ]
+     if loading_percent is not None:
+         lines.append(f"- **Model loading:** `{loading_percent}`")
+     if elapsed is not None:
+         lines.append(f"- **Time per request:** `{elapsed:.2f}s`")
+     if stream_chunks is not None:
+         lines.append(f"- **Stream chunks received:** `{stream_chunks}`")
+     if output_chars is not None:
+         lines.append(f"- **Output characters so far:** `{output_chars}`")
+     if error:
+         lines.append(f"- **Error:** ⚠️ {error}")
+     return "\n".join(lines)
+
+
+ @lru_cache(maxsize=1)
+ def get_runtime():
+     global _RUNTIME_LOADED
+     runtime = load_model(base_model=BASE_MODEL, lora_dir=ADAPTER, use_4bit=USE_4BIT)
+     _RUNTIME_LOADED = True
+     return runtime
+
+
+ def run_inference_stream(
+     instruction: str,
+     user_input: str,
+     max_new_tokens: int,
+     temperature: float,
+     top_p: float,
+     seed: int,
+ ):
+     set_active_stop = _set_active_stop_event
+     start = time.perf_counter()
+     device = "unknown"
+     stage = "Preparing request"
+     load_progress = 0
+     cleaned_instruction = instruction.strip() or DEFAULT_INSTRUCTION
+     cleaned_input = user_input.strip()
+     normalized_seed = None if seed is None or int(seed) < 0 else int(seed)
+     stop_event = threading.Event()
+     set_active_stop(stop_event)
+
+     if not cleaned_input:
+         msg = "⚠️ Please provide input text before running generation."
+         yield (
+             msg,
+             _format_status(
+                 stage="Waiting for input",
+                 loaded=_RUNTIME_LOADED,
+                 device=device,
+                 loading_percent=_format_loading_percent(load_progress),
+             ),
+             "0.00s",
+         )
+         set_active_stop(None)
+         return
+
+     try:
+         stage = "Loading model"
+         if _RUNTIME_LOADED:
+             tokenizer, model = get_runtime()
+             load_progress = 100
+             yield (
+                 "",
+                 _format_status(
+                     stage="Model ready (cached)",
+                     loaded=True,
+                     device=device,
+                     loading_percent=_format_loading_percent(load_progress),
+                 ),
+                 f"{time.perf_counter() - start:.2f}s",
+             )
+         else:
+             runtime_box: dict[str, tuple] = {}
+             err_box: dict[str, Exception] = {}
+
+             def _loader():
+                 try:
+                     runtime_box["runtime"] = get_runtime()
+                 except Exception as exc:
+                     err_box["error"] = exc
+
+             loader_thread = threading.Thread(target=_loader, daemon=True)
+             loader_thread.start()
+
+             load_progress = 3
+             while loader_thread.is_alive():
+                 if stop_event.is_set():
+                     elapsed = time.perf_counter() - start
+                     yield (
+                         "⚠️ Stop requested. Model loading may continue in background.",
+                         _format_status(
+                             stage="Stopped during model loading",
+                             loaded=False,
+                             device=device,
+                             loading_percent=_format_loading_percent(load_progress),
+                             elapsed=elapsed,
+                         ),
+                         f"{elapsed:.2f}s",
+                     )
+                     return
+
+                 load_progress = min(95, load_progress + 4)
+                 elapsed = time.perf_counter() - start
+                 yield (
+                     "",
+                     _format_status(
+                         stage=f"Loading model ({load_progress}%)",
+                         loaded=False,
+                         device=device,
+                         loading_percent=_format_loading_percent(load_progress),
+                         elapsed=elapsed,
+                     ),
+                     f"{elapsed:.2f}s",
+                 )
+                 time.sleep(0.2)
+
+             loader_thread.join()
+             if "error" in err_box:
+                 raise err_box["error"]
+             tokenizer, model = runtime_box["runtime"]
+             load_progress = 100
+
+         device = get_model_device(model)
+
+         stage = "Tokenizing / preparing generation"
+         elapsed = time.perf_counter() - start
+         yield (
+             "",
+             _format_status(
+                 stage=stage,
+                 loaded=True,
+                 device=device,
+                 loading_percent=_format_loading_percent(load_progress),
+                 elapsed=elapsed,
+                 stream_chunks=0,
+                 output_chars=0,
+             ),
+             f"{elapsed:.2f}s",
+         )
+
+         stage = "Generating"
+         partial = ""
+         chunk_count = 0
+         token_queue: queue.Queue[str | None] = queue.Queue()
+         error_queue: queue.Queue[Exception] = queue.Queue()
+
+         def _token_producer():
+             try:
+                 for token in stream_generate(
+                     tokenizer=tokenizer,
+                     model=model,
+                     instruction=cleaned_instruction,
+                     user_input=cleaned_input,
+                     max_new_tokens=max_new_tokens,
+                     temperature=temperature,
+                     top_p=top_p,
+                     do_sample=True,
+                     seed=normalized_seed,
+                     stop_event=stop_event,
+                 ):
+                     token_queue.put(token)
+             except Exception as exc:
+                 error_queue.put(exc)
+             finally:
+                 token_queue.put(None)
+
+         producer = threading.Thread(target=_token_producer, daemon=True)
+         producer.start()
+
+         first_token_seen = False
+         while True:
+             if stop_event.is_set():
+                 elapsed = time.perf_counter() - start
+                 yield (
+                     partial.strip(),
+                     _format_status(
+                         stage="Stopped by user",
+                         loaded=True,
+                         device=device,
+                         loading_percent=_format_loading_percent(load_progress),
+                         elapsed=elapsed,
+                         stream_chunks=chunk_count,
+                         output_chars=len(partial.strip()),
+                     ),
+                     f"{elapsed:.2f}s",
+                 )
+                 return
+
+             if not error_queue.empty():
+                 raise error_queue.get()
+
+             try:
+                 delta = token_queue.get(timeout=0.2)
+             except queue.Empty:
+                 elapsed = time.perf_counter() - start
+                 wait_stage = "Generating (waiting for first token)" if not first_token_seen else "Generating"
+                 yield (
+                     partial,
+                     _format_status(
+                         stage=wait_stage,
+                         loaded=True,
+                         device=device,
+                         loading_percent=_format_loading_percent(load_progress),
+                         elapsed=elapsed,
+                         stream_chunks=chunk_count,
+                         output_chars=len(partial),
+                     ),
+                     f"{elapsed:.2f}s",
+                 )
+                 continue
+
+             if delta is None:
+                 if not error_queue.empty():
+                     raise error_queue.get()
+                 break
+
+             first_token_seen = True
+             chunk_count += 1
+             partial += delta
+             elapsed = time.perf_counter() - start
+             yield (
+                 partial,
+                 _format_status(
+                     stage=stage,
+                     loaded=True,
+                     device=device,
+                     loading_percent=_format_loading_percent(load_progress),
+                     elapsed=elapsed,
+                     stream_chunks=chunk_count,
+                     output_chars=len(partial),
+                 ),
+                 f"{elapsed:.2f}s",
+             )
+
+         elapsed = time.perf_counter() - start
+         yield (
+             partial.strip(),
+             _format_status(
+                 stage="Done",
+                 loaded=True,
+                 device=device,
+                 loading_percent=_format_loading_percent(load_progress),
+                 elapsed=elapsed,
+                 stream_chunks=chunk_count,
+                 output_chars=len(partial.strip()),
+             ),
+             f"{elapsed:.2f}s",
+         )
+     except Exception as e:
+         elapsed = time.perf_counter() - start
+         tb = traceback.format_exc()
+
+         print("=== Generation failure traceback ===")
+         print(tb)
+
+         err = f"{type(e).__name__}: {e}"
+         yield (
+             f"⚠️ Generation failed: {err}",
+             _format_status(
+                 stage=stage,
+                 loaded=_RUNTIME_LOADED,
+                 device=device,
+                 loading_percent=_format_loading_percent(load_progress),
+                 elapsed=elapsed,
+                 error=err,
+             ),
+             f"{elapsed:.2f}s",
+         )
+     finally:
+         set_active_stop(None)
+
+
+ def run_rag_inference_stream(
+     instruction: str,
+     user_input: str,
+     max_new_tokens: int,
+     temperature: float,
+     top_p: float,
+     seed: int,
+ ):
+     set_active_stop = _set_active_stop_event
+     start = time.perf_counter()
+     cleaned_instruction = instruction.strip() or DEFAULT_INSTRUCTION
+     cleaned_input = user_input.strip()
+     normalized_seed = None if seed is None or int(seed) < 0 else int(seed)
+     stop_event = threading.Event()
+     set_active_stop(stop_event)
+     resources = "(lazy-loaded)"
+
+     if not cleaned_input:
+         yield (
+             "⚠️ Please provide input text before running generation.",
+             _format_status(stage="Waiting for input", loaded=False, device="unknown", loading_percent="0%"),
+             "0.00s",
+         )
+         set_active_stop(None)
+         return
+
+     try:
+         stage = "Loading RAG pipeline"
+         yield (
+             "",
+             _format_status(stage=stage, loaded=_is_rag_runtime_loaded(), device="unknown", loading_percent="5%"),
+             f"{time.perf_counter() - start:.2f}s",
+         )
+
+         from infer_hybrid_RAG import rag_answer_stream, rag_resource_summary
+
+         resources = rag_resource_summary()
+         yield (
+             "",
+             _format_status(
+                 stage="Retrieving (hybrid)",
+                 loaded=_is_rag_runtime_loaded(),
+                 device="unknown",
+                 loading_percent="25%",
+             ),
+             f"{time.perf_counter() - start:.2f}s",
+         )
+
+         token_queue: queue.Queue[str | None] = queue.Queue()
+         error_queue: queue.Queue[Exception] = queue.Queue()
+
+         def _token_producer():
+             try:
+                 for token in rag_answer_stream(
+                     instruction=cleaned_instruction,
+                     user_input=cleaned_input,
+                     max_new_tokens=max_new_tokens,
+                     temperature=temperature,
+                     top_p=top_p,
+                     do_sample=True,
+                     seed=normalized_seed,
+                     stop_event=stop_event,
+                 ):
+                     token_queue.put(token)
+             except Exception as exc:
+                 error_queue.put(exc)
+             finally:
+                 token_queue.put(None)
+
+         producer = threading.Thread(target=_token_producer, daemon=True)
+         producer.start()
+
+         partial = ""
+         chunk_count = 0
+         first_token_seen = False
+         while True:
+             if stop_event.is_set():
+                 elapsed = time.perf_counter() - start
+                 yield (
+                     partial.strip(),
+                     _format_status(
+                         stage="Stopped by user (RAG)",
+                         loaded=_is_rag_runtime_loaded(),
+                         device="auto",
+                         loading_percent="--",
+                         elapsed=elapsed,
+                         stream_chunks=chunk_count,
+                         output_chars=len(partial.strip()),
+                     ),
+                     f"{elapsed:.2f}s",
+                 )
+                 return
+
+             if not error_queue.empty():
+                 raise error_queue.get()
+
+             try:
+                 delta = token_queue.get(timeout=0.2)
+             except queue.Empty:
+                 elapsed = time.perf_counter() - start
+                 wait_stage = (
+                     "Generating with RAG (loading/retrieving...)"
+                     if not first_token_seen
+                     else "Generating with RAG"
+                 )
+                 yield (
+                     partial,
+                     _format_status(
+                         stage=wait_stage,
+                         loaded=_is_rag_runtime_loaded(),
+                         device="auto",
+                         loading_percent="90%" if first_token_seen else "60%",
+                         elapsed=elapsed,
+                         stream_chunks=chunk_count,
+                         output_chars=len(partial),
+                     ),
+                     f"{elapsed:.2f}s",
+                 )
+                 continue
+
+             if delta is None:
+                 if not error_queue.empty():
+                     raise error_queue.get()
+                 break
+
+             first_token_seen = True
+             chunk_count += 1
+             partial += delta
+             elapsed = time.perf_counter() - start
+             yield (
+                 partial,
+                 _format_status(
+                     stage="Generating with RAG",
+                     loaded=_is_rag_runtime_loaded(),
+                     device="auto",
+                     loading_percent="95%",
+                     elapsed=elapsed,
+                     stream_chunks=chunk_count,
+                     output_chars=len(partial),
+                 ),
+                 f"{elapsed:.2f}s",
+             )
+
+         elapsed = time.perf_counter() - start
+         yield (
+             partial.strip(),
+             _format_status(
+                 stage=f"Done (RAG) · {resources}",
+                 loaded=_is_rag_runtime_loaded(),
+                 device="auto",
+                 loading_percent="100%",
+                 elapsed=elapsed,
+                 stream_chunks=chunk_count,
+                 output_chars=len(partial.strip()),
+             ),
+             f"{elapsed:.2f}s",
+         )
+     except Exception as e:
+         elapsed = time.perf_counter() - start
+         err = f"{type(e).__name__}: {e}"
+         tb = traceback.format_exc()
+         print("=== RAG generation failure traceback ===")
+         print(tb)
+         yield (
+             f"⚠️ RAG generation failed: {err}",
+             _format_status(
+                 stage="RAG failure",
+                 loaded=False,
+                 device="unknown",
+                 loading_percent="--",
+                 elapsed=elapsed,
+                 error=err,
+             ),
+             f"{elapsed:.2f}s",
+         )
+     finally:
+         set_active_stop(None)
+
+
+ def run_inference_with_mode(
+     instruction: str,
+     user_input: str,
+     max_new_tokens: int,
+     temperature: float,
+     top_p: float,
+     seed: int,
+     use_rag: bool,
+ ):
+     # Routing note: checkbox OFF -> existing non-RAG stream_generate path,
+     # checkbox ON -> hybrid RAG retrieval + generation path.
+     if use_rag:
+         yield from run_rag_inference_stream(
+             instruction=instruction,
+             user_input=user_input,
+             max_new_tokens=max_new_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             seed=seed,
+         )
+         return
+
+     yield from run_inference_stream(
+         instruction=instruction,
+         user_input=user_input,
+         max_new_tokens=max_new_tokens,
+         temperature=temperature,
+         top_p=top_p,
+         seed=seed,
+     )
+
+
+ with gr.Blocks(title="Numen Scriptorium Demo") as demo:
+     gr.Markdown("# ✨ Numen Scriptorium · HF Demo")
+     gr.Markdown(
+         "This demo can: (1) translate EN↔ZH with Book-of-Hours/Cultist-Simulator-like tone, and (2) rewrite/generate text with instructed tone and nouns.\n\n"
+         "For lore-like quality, load a matching LoRA adapter (base model alone is not enough).\n\n"
+         "**How to use**\n"
+         "1. Keep or edit the instruction.\n"
+         "2. Paste your input text.\n"
+         "3. Click **Run** to generate output."
+     )
+
+     with gr.Row():
+         with gr.Column(scale=3):
+             instruction = gr.Textbox(label="Instruction", value=DEFAULT_INSTRUCTION, lines=3)
+             user_input = gr.Textbox(label="Input", placeholder="在这里输入待翻译/待改写文本", lines=8)
+             use_rag = gr.Checkbox(label="Use RAG (hybrid)", value=False)
+             mode_panel = gr.Markdown(_format_mode_indicator(False), label="Inference mode")
+
+             with gr.Accordion("Advanced settings", open=False):
+                 max_new_tokens = gr.Slider(32, 1024, value=256, step=16, label="max_new_tokens")
+                 temperature = gr.Slider(0.1, 1.5, value=0.7, step=0.05, label="temperature")
+                 top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p")
+                 seed = gr.Number(
+                     value=-1,
+                     precision=0,
+                     label="seed (-1 = random)",
+                     info="Use a fixed integer seed for more reproducible sampling.",
+                 )
+
+             gr.Markdown("### Examples")
+             gr.Markdown("Click an example button to auto-fill Instruction and Input.")
+             parsed_examples, example_warning = _load_demo_examples()
+             if example_warning:
+                 gr.Markdown(example_warning)
+
+             with gr.Row():
+                 for example in parsed_examples:
+                     example_btn = gr.Button(example["label"], variant="secondary")
+                     example_btn.click(
+                         fn=lambda ex=example: _apply_example(ex),
+                         inputs=None,
+                         outputs=[instruction, user_input, max_new_tokens, use_rag, mode_panel],
+                     )
+
+             with gr.Row():
+                 run_btn = gr.Button("Run", variant="primary")
+                 stop_btn = gr.Button("Stop")
+                 clear_btn = gr.Button("Clear")
+
+         with gr.Column(scale=2):
+             output = gr.Markdown(label="Output", value="")
+             elapsed_text = gr.Textbox(label="Elapsed", value="0.00s", interactive=False)
+             status_panel = gr.Markdown(
+                 _format_status(stage="Idle", loaded=False, device="unknown", loading_percent="0%"),
+                 label="Model / System status",
+             )
+
+     use_rag.change(fn=_on_mode_toggled, inputs=[use_rag], outputs=[mode_panel])
+
+     run_event = run_btn.click(
+         fn=run_inference_with_mode,
+         inputs=[instruction, user_input, max_new_tokens, temperature, top_p, seed, use_rag],
+         outputs=[output, status_panel, elapsed_text],
+     )
+
+     stop_btn.click(fn=_on_stop_clicked, inputs=None, outputs=[status_panel], cancels=[run_event])
+
+     clear_btn.click(
+         fn=_on_clear_clicked,
+         inputs=None,
+         outputs=[instruction, user_input, use_rag, output, status_panel, mode_panel, elapsed_text],
+         cancels=[run_event],
+     )
+
+
+ if __name__ == "__main__":
+     demo.queue(default_concurrency_limit=1).launch()
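For reference, `_load_demo_examples` above parses PowerShell-style command blocks: the `--instruction` and `--input` regexes each expect the closing quote to be followed by a backtick line continuation. A block the parser would accept might look like the following (hypothetical contents; the actual demo_examples.txt is not part of this commit):

```text
python infer_qlora_qwen3_boh.py `
  --instruction "Translate the passage into Chinese, keeping the lore tone." `
  --input "The Sun's Design is not ours to question." `
  --max_new_tokens 256
```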
build_vector_db.py ADDED
@@ -0,0 +1,100 @@
+ import json
+ import os
+ import chromadb
+ import torch
+ from sentence_transformers import SentenceTransformer
+
+ def load_json(filepath):
+     if not os.path.exists(filepath):
+         print(f"[ERROR] File not found: {filepath}")
+         return {}
+     with open(filepath, "r", encoding="utf-8") as f:
+         return json.load(f)
+
+ def build_vector_db():
+     # m3e-base is recommended: strong retrieval quality for Chinese text with a small footprint.
+     print("[1] Loading embedding model...")
+     embedder = SentenceTransformer('moka-ai/m3e-base', device='cuda' if torch.cuda.is_available() else 'cpu')
+
+     print("[2] Initialising local Chroma vector database...")
+     # This creates a "chroma_data" folder in the current directory to persist the data.
+     chroma_client = chromadb.PersistentClient(path="./chroma_data")
+
+     # Create or fetch a collection, the rough equivalent of a table in a relational database.
+     collection = chroma_client.get_or_create_collection(name="mansus_lore")
+
+     print("[3] Reading JSON data...")
+     hours_data = load_json("data/hours_merged.json")
+     history_data = load_json("data/mansus_history_events_rag.json")
+
+     documents = []  # plain-text chunks
+     metadatas = []  # metadata (for filtering and linking back to the knowledge graph)
+     ids = []        # unique IDs
+
+     print("[4] Processing Hours texts...")
+     for hour in hours_data.get("hours", []):
+         hour_id = hour.get("id", "")
+         desc = hour.get("desc_cn", "")
+         name = hour.get("name_cn", "")
+
+         if not hour_id or not desc:
+             continue
+
+         documents.append(f"【司辰档案】{name}:{desc}")
+         metadatas.append({
+             "type": "hour",
+             "entity_id": hour_id,
+             "entity_name": name
+         })
+         ids.append(f"doc_{hour_id}")
+
+     print("[5] Processing Mansus history events...")
+     for era_name, era_obj in history_data.items():
+         for event_title, event_obj in era_obj.get("events", {}).items():
+             # Prefer the condensed summaries generated earlier with an LLM.
+             summary = event_obj.get("summary_cn", "")
+             if not summary:
+                 # If there is no summary, join the raw paragraphs instead.
+                 summary = "\n".join(event_obj.get("paragraphs", []))
+
+             if summary.strip():
+                 documents.append(f"【历史事件】{era_name} - {event_title}:\n{summary}")
+                 metadatas.append({
+                     "type": "event",
+                     "era": era_name,
+                     "event_title": event_title
+                 })
+                 ids.append(f"doc_event_{event_title}")
+
+             # Process sub-events (h4).
+             for sub_title, sub_obj in event_obj.get("subevents", {}).items():
+                 sub_summary = sub_obj.get("summary_cn", "")
+                 if not sub_summary:
+                     sub_summary = "\n".join(sub_obj.get("paragraphs", []))
+
+                 if sub_summary.strip():
+                     documents.append(f"【历史事件】{era_name} - {event_title} ({sub_title}):\n{sub_summary}")
+                     metadatas.append({
+                         "type": "subevent",
+                         "era": era_name,
+                         "parent_event": event_title,
+                         "event_title": sub_title
+                     })
+                     ids.append(f"doc_subevent_{sub_title}")
+
+     print(f"[6] Embedding {len(documents)} text chunks and writing them to the database...")
+     # Embed everything in one batch.
+     embeddings = embedder.encode(documents, show_progress_bar=True).tolist()
+
+     # Bulk upsert into ChromaDB.
+     # Note: with tens of thousands of rows, insert in batches; here the data is only
+     # a few hundred rows, so a single upsert is fine.
+     collection.upsert(
+         documents=documents,
+         embeddings=embeddings,
+         metadatas=metadatas,
+         ids=ids
+     )
+
+     print("[7] Vector store built. Data persisted under ./chroma_data.")
+
+ if __name__ == "__main__":
+     build_vector_db()
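A minimal sanity check of the store built above, assuming `./chroma_data` exists and `moka-ai/m3e-base` is available; the query string is a hypothetical example:

```python
import chromadb
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("moka-ai/m3e-base")
client = chromadb.PersistentClient(path="./chroma_data")
collection = client.get_collection(name="mansus_lore")

query = "太阳大战"  # any lore question works here
results = collection.query(
    query_embeddings=embedder.encode([query]).tolist(),
    n_results=3,
)
# Chroma returns one result list per query; inspect the first (and only) one.
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(meta["type"], "→", doc[:60])
```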
infer_hybrid_RAG.py ADDED
@@ -0,0 +1,212 @@
+ from __future__ import annotations
+
+ import json
+ import os
+ import sys
+ from functools import lru_cache
+ from pathlib import Path
+ from threading import Event
+ from typing import Iterator
+
+ sys.path.insert(0, str(Path(__file__).resolve().parent / "src"))
+
+ from numen_scriptorium.inference.qwen import generate, load_model, stream_generate
+
+
+ RAG_BASE_MODEL = os.getenv("NS_RAG_BASE_MODEL", os.getenv("NS_BASE_MODEL", "Qwen/Qwen2.5-7B-Instruct"))
+ RAG_ADAPTER = os.getenv("NS_RAG_ADAPTER", os.getenv("NS_ADAPTER", "ICGenAIShare06/boh-qlora-adapter/best")).strip() or None
+ RAG_USE_4BIT = os.getenv("NS_RAG_USE_4BIT", os.getenv("NS_USE_4BIT", "1")) == "1"
+ RAG_CHROMA_DIR = os.getenv("NS_RAG_CHROMA_DIR", "chroma_data")
+ RAG_COLLECTION = os.getenv("NS_RAG_COLLECTION", "mansus_lore")
+ RAG_ALIAS_FILE = os.getenv("NS_RAG_ALIAS_FILE", "data/hours_merged.json")
+ RAG_EMBED_MODEL = os.getenv("NS_RAG_EMBED_MODEL", "moka-ai/m3e-base")
+
+
+ def _resolve_repo_path(path_like: str) -> Path:
+     p = Path(path_like)
+     if p.exists():
+         return p
+     return Path(__file__).resolve().parent / p
+
+
+ class HybridRetriever:
+     def __init__(self, chroma_dir: str, collection_name: str, alias_file: str, embed_model: str):
+         import chromadb
+         import torch
+         from sentence_transformers import SentenceTransformer
+
+         chroma_path = _resolve_repo_path(chroma_dir)
+         self.chroma_client = chromadb.PersistentClient(path=str(chroma_path))
+         self.collection = self.chroma_client.get_or_create_collection(name=collection_name)
+
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.embedder = SentenceTransformer(embed_model, device=device)
+         self.alias_map = self._load_alias_map(alias_file)
+
+     @staticmethod
+     def _load_alias_map(alias_file: str) -> dict[str, str]:
+         path = _resolve_repo_path(alias_file)
+         if not path.exists():
+             return {}
+         with path.open("r", encoding="utf-8") as f:
+             hours_data = json.load(f)
+
+         alias_map: dict[str, str] = {}
+         for hour in hours_data.get("hours", []):
+             standard_name = hour.get("name_cn", "")
+             for alias in hour.get("aliases", []):
+                 alias = alias.strip()
+                 if alias:
+                     alias_map[alias] = standard_name
+         return alias_map
+
+     def retrieve_dict(self, query: str, stop_event: Event | None = None) -> dict[str, str]:
+         rag_dict: dict[str, str] = {}
+         lowered = query.lower()
+         for alias, std_name in self.alias_map.items():
+             if stop_event is not None and stop_event.is_set():
+                 break
+             if len(alias) <= 2:
+                 continue
+             if alias.lower() in lowered:
+                 rag_dict[alias] = std_name
+         return rag_dict
+
+     def retrieve_context(self, query: str, top_k: int = 1) -> str:
+         query_embedding = self.embedder.encode([query]).tolist()
+         results = self.collection.query(query_embeddings=query_embedding, n_results=top_k)
+         docs = results.get("documents", [[]])
+         vector_context = docs[0] if docs else []
+         return "\n".join(vector_context)
+
+
+ @lru_cache(maxsize=1)
+ def get_hybrid_retriever() -> HybridRetriever:
+     return HybridRetriever(
+         chroma_dir=RAG_CHROMA_DIR,
+         collection_name=RAG_COLLECTION,
+         alias_file=RAG_ALIAS_FILE,
+         embed_model=RAG_EMBED_MODEL,
+     )
+
+
+ @lru_cache(maxsize=1)
+ def get_rag_runtime():
+     return load_model(base_model=RAG_BASE_MODEL, lora_dir=RAG_ADAPTER, use_4bit=RAG_USE_4BIT)
+
+
+ def rag_resource_summary() -> str:
+     return (
+         f"base={RAG_BASE_MODEL}, adapter={RAG_ADAPTER or 'None'}, "
+         f"embed={RAG_EMBED_MODEL}, chroma={RAG_CHROMA_DIR}/{RAG_COLLECTION}, alias={RAG_ALIAS_FILE}"
+     )
+
+
+ def prepare_rag_input(
+     user_input: str,
+     stop_event: Event | None = None,
+     top_k: int = 1,
+ ) -> tuple[str, dict[str, str], str]:
+     retriever = get_hybrid_retriever()
+     rag_dict = retriever.retrieve_dict(user_input, stop_event=stop_event)
+     vector_context = ""
+     if stop_event is None or not stop_event.is_set():
+         try:
+             vector_context = retriever.retrieve_context(user_input, top_k=top_k)
+         except Exception:
+             vector_context = ""
+
+     injected_text = user_input
+     for eng_term, cn_term in rag_dict.items():
+         if stop_event is not None and stop_event.is_set():
+             break
+         if eng_term in injected_text:
+             injected_text = injected_text.replace(eng_term, f"{eng_term}({cn_term})")
+     return injected_text, rag_dict, vector_context
+
+
+ def _build_rag_instruction(base_instruction: str, rag_dict: dict[str, str], vector_context: str) -> str:
+     glossary = "\n".join(f"- {k} -> {v}" for k, v in rag_dict.items()) or "- (no matched terms)"
+     context = vector_context.strip() or "(no retrieved context)"
+     return (
+         f"{base_instruction.strip()}\n\n"
+         "[RAG glossary: use these preferred translations when relevant]\n"
+         f"{glossary}\n\n"
+         "[RAG retrieved background context: reference only, do not copy verbatim]\n"
+         f"{context}"
+     )
+
+
+ def rag_answer(
+     instruction: str,
+     user_input: str,
+     *,
+     max_new_tokens: int = 512,
+     temperature: float = 0.3,
+     top_p: float = 0.85,
+     do_sample: bool = True,
+     seed: int | None = None,
+     stop_event: Event | None = None,
+ ) -> str:
+     if stop_event is not None and stop_event.is_set():
+         return ""
+     injected_text, rag_dict, vector_context = prepare_rag_input(user_input, stop_event=stop_event)
+     if stop_event is not None and stop_event.is_set():
+         return ""
+
+     tokenizer, model = get_rag_runtime()
+     rag_instruction = _build_rag_instruction(instruction, rag_dict, vector_context)
+     return generate(
+         tokenizer=tokenizer,
+         model=model,
+         instruction=rag_instruction,
+         user_input=injected_text,
+         max_new_tokens=max_new_tokens,
+         temperature=temperature,
+         top_p=top_p,
+         do_sample=do_sample,
+         seed=seed,
+     )
+
+
+ def rag_answer_stream(
+     instruction: str,
+     user_input: str,
+     *,
+     max_new_tokens: int = 512,
+     temperature: float = 0.3,
+     top_p: float = 0.85,
+     do_sample: bool = True,
+     seed: int | None = None,
+     stop_event: Event | None = None,
+ ) -> Iterator[str]:
+     if stop_event is not None and stop_event.is_set():
+         return
+     injected_text, rag_dict, vector_context = prepare_rag_input(user_input, stop_event=stop_event)
+     if stop_event is not None and stop_event.is_set():
+         return
+
+     tokenizer, model = get_rag_runtime()
+     rag_instruction = _build_rag_instruction(instruction, rag_dict, vector_context)
+     yield from stream_generate(
+         tokenizer=tokenizer,
+         model=model,
+         instruction=rag_instruction,
+         user_input=injected_text,
+         max_new_tokens=max_new_tokens,
+         temperature=temperature,
+         top_p=top_p,
+         do_sample=do_sample,
+         seed=seed,
+         stop_event=stop_event,
+     )
+
+
+ if __name__ == "__main__":
+     sample_instruction = (
+         "You are a translator. Translate the English text into Chinese and keep lore-related style and terms coherent."
+     )
+     sample_input = (
+         "In the city of Emesa, Elagabalus lies beneath black corundum, and the Sun-in-Splendour watches in silence."
+     )
+     print(rag_answer(sample_instruction, sample_input, max_new_tokens=200))
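To make the glossary-injection step in `prepare_rag_input` concrete, here is a self-contained sketch of what it does to the input text, using a hypothetical alias mapping (the real mappings come from data/hours_merged.json):

```python
# Hypothetical alias -> canonical-name entry; "<name_cn>" stands in for the
# actual Chinese name stored in data/hours_merged.json.
text = "The Sun-in-Splendour watches in silence."
rag_dict = {"Sun-in-Splendour": "<name_cn>"}

injected = text
for eng_term, cn_term in rag_dict.items():
    if eng_term in injected:
        injected = injected.replace(eng_term, f"{eng_term}({cn_term})")

print(injected)
# -> The Sun-in-Splendour(<name_cn>) watches in silence.
```

The same mappings are also rendered as a `- term -> translation` glossary in `_build_rag_instruction`, so the model sees the preferred term both inline and in the instruction.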
kg_merge.py ADDED
@@ -0,0 +1,101 @@
+ import json
+ import os
+
+ def load_json(filepath):
+     if not os.path.exists(filepath):
+         print(f"[ERROR] File not found: {filepath}")
+         return {}
+     with open(filepath, "r", encoding="utf-8") as f:
+         return json.load(f)
+
+ def build_knowledge_graph():
+     print("[1] Loading data...")
+     hours_data = load_json("data/hours_merged.json")
+     history_data = load_json("data/mansus_history_events_rag.json")
+
+     triplets = []
+     alias_map = {}
+
+     print("[2] Parsing Hour entities and extracting internal relations (origins, factions)...")
+
+     hours_list = hours_data.get("hours", [])
+     for hour in hours_list:
+         hour_id = hour.get("id", "")
+         hour_name = hour.get("name_cn", "")
+         if not hour_id:
+             continue
+
+         # Extract origins (HAS_ORIGIN).
+         for origin in hour.get("origin", []):
+             triplets.append({
+                 "head_id": hour_id, "head_name": hour_name,
+                 "relation": "HAS_ORIGIN",
+                 "tail_id": f"origin.{origin}", "tail_name": origin
+             })
+
+         # Extract factions (BELONGS_TO).
+         for faction in hour.get("factions", []):
+             triplets.append({
+                 "head_id": hour_id, "head_name": hour_name,
+                 "relation": "BELONGS_TO",
+                 "tail_id": f"faction.{faction}", "tail_name": faction
+             })
+
+         # Build an inverted alias index, used later to spot Hours in the history text.
+         for alias in hour.get("aliases", []):
+             if alias.strip():
+                 # Record the Hour ID and canonical name for this alias.
+                 alias_map[alias.strip()] = {"id": hour_id, "name": hour_name}
+
+     print(f" -> Extracted {len(alias_map)} aliases for entity-linking matches.")
+
+     print("[3] Scanning history events and building participation relations (PARTICIPATED_IN)...")
+     # Walk every era and event in the Mansus history.
+     for era_name, era_obj in history_data.items():
+         events = era_obj.get("events", {})
+
+         for event_title, event_obj in events.items():
+             # Join the main event's summary (and optionally its paragraphs) into one searchable text.
+             texts_to_search = [event_obj.get("summary_cn", "")]  # + event_obj.get("paragraphs", [])
+             full_text = "\n".join(texts_to_search)
+
+             # Use the alias map to look for traces of Hours in the text.
+             matched_hours = set()
+             for alias, hour_info in alias_map.items():
+                 if alias in full_text:
+                     matched_hours.add((hour_info["id"], hour_info["name"]))
+
+             # For every match, emit a participation triplet.
+             for h_id, h_name in matched_hours:
+                 triplets.append({
+                     "head_id": h_id, "head_name": h_name,
+                     "relation": "PARTICIPATED_IN",
+                     "tail_id": f"event.{event_title}", "tail_name": event_title
+                 })
+
+             # Scan sub-events (h4) the same way.
+             for sub_title, sub_obj in event_obj.get("subevents", {}).items():
+                 sub_texts = sub_obj.get("paragraphs", []) + [sub_obj.get("summary_cn", "")]
+                 sub_full_text = "\n".join(sub_texts)
+
+                 sub_matched = set()
+                 for alias, hour_info in alias_map.items():
+                     if alias in sub_full_text:
+                         sub_matched.add((hour_info["id"], hour_info["name"]))
+
+                 for h_id, h_name in sub_matched:
+                     triplets.append({
+                         "head_id": h_id, "head_name": h_name,
+                         "relation": "PARTICIPATED_IN",
+                         "tail_id": f"event.{sub_title}", "tail_name": sub_title
+                     })
+
+     print(f"[4] Done. Generated {len(triplets)} knowledge-graph triplet edges.")
+
+     output_file = "kg_triplets.json"
+     with open(output_file, "w", encoding="utf-8") as f:
+         json.dump(triplets, f, ensure_ascii=False, indent=2)
+     print(f"[5] Data saved to {output_file}")
+
+ if __name__ == "__main__":
+     build_knowledge_graph()
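A quick way to inspect the exported graph, assuming kg_triplets.json was produced by the script above:

```python
import json
from collections import Counter

with open("kg_triplets.json", "r", encoding="utf-8") as f:
    triplets = json.load(f)

# Count edges per relation type defined above.
print(Counter(t["relation"] for t in triplets))
# e.g. Counter({'PARTICIPATED_IN': ..., 'HAS_ORIGIN': ..., 'BELONGS_TO': ...})
```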
requirements.txt CHANGED
@@ -4,4 +4,6 @@ transformers>=4.45.0
  peft>=0.12.0
  accelerate>=0.33.0
  sentencepiece>=0.2.0
- bitsandbytes
+ bitsandbytes
+ chromadb>=0.5.0
+ sentence-transformers>=3.0.1
src/numen_scriptorium/inference/qwen.py CHANGED
@@ -61,7 +61,20 @@ def load_model(base_model: str, lora_dir: str | None, use_4bit: bool = True):
 
      model = base
      if lora_dir:
-         model = PeftModel.from_pretrained(base, _resolve_path(lora_dir))
+         resolved_lora = _resolve_path(lora_dir)
+         try:
+             model = PeftModel.from_pretrained(base, resolved_lora)
+         except ValueError as exc:
+             # Common misconfiguration: passing a ".../best" suffix when the
+             # adapter files are actually stored at the repo root.
+             # Try a graceful fallback before surfacing the original error.
+             lora_text = str(lora_dir).rstrip("/\\")
+             if lora_text.endswith("/best") or lora_text.endswith("\\best"):
+                 parent_lora = lora_text.rsplit("/", 1)[0].rsplit("\\", 1)[0]
+                 resolved_parent = _resolve_path(parent_lora)
+                 model = PeftModel.from_pretrained(base, resolved_parent)
+             else:
+                 raise exc
 
      model.eval()
      return tokenizer, model
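A minimal sketch of the fallback this hunk adds (names taken from the diff; whether the adapter repo actually stores its files at the root rather than under /best is an assumption):

```python
from numen_scriptorium.inference.qwen import load_model

# If loading "ICGenAIShare06/boh-qlora-adapter/best" raises a ValueError
# because the adapter files live at the repo root, load_model now retries
# with "ICGenAIShare06/boh-qlora-adapter" before re-raising.
tokenizer, model = load_model(
    base_model="Qwen/Qwen2.5-7B-Instruct",
    lora_dir="ICGenAIShare06/boh-qlora-adapter/best",
    use_4bit=True,
)
```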
summarise_manus.py ADDED
@@ -0,0 +1,324 @@
+ import json
+ import os
+ import time
+ import re
+ import requests
+ from bs4 import BeautifulSoup
+ from typing import Dict, List, Any
+
+ from google import genai
+ from google.genai import types
+
+ # ========= Configuration =========
+
+ WIKI_URL = "https://mansus.huijiwiki.com/wiki/%E6%BC%AB%E5%AE%BF%E5%8E%86%E5%8F%B2"
+ OUTPUT_JSON = "mansus_history_events_rag.json"
+
+ # Read the API key from the environment and pass it to the client explicitly.
+ client = genai.Client(api_key=os.environ.get("MY_API_KEY"))
+ GEMINI_MODEL = "gemini-2.5-flash"
+
+ # ========= Helpers =========
+
+
+ HTML_CACHE_PATH = "data/mansus_history.html"
+
+ def fetch_html(url: str) -> str:
+     if os.path.exists(HTML_CACHE_PATH):
+         with open(HTML_CACHE_PATH, "r", encoding="utf-8") as f:
+             return f.read()
+
+     headers = {
+         # add your headers here
+     }
+     resp = requests.get(url, headers=headers, timeout=20)
+     resp.raise_for_status()
+     resp.encoding = resp.apparent_encoding
+     html = resp.text
+
+     os.makedirs(os.path.dirname(HTML_CACHE_PATH) or ".", exist_ok=True)
+     with open(HTML_CACHE_PATH, "w", encoding="utf-8") as f:
+         f.write(html)
+
+     return html
+
+ def parse_article_structure(html: str) -> Dict[str, Any]:
+     soup = BeautifulSoup(html, "html.parser")
+     article = soup.find("article", class_="wiki-body-section", role="main")
+     if not article:
+         raise RuntimeError("Cannot find target <article> section.")
+
+     data: Dict[str, Any] = {}
+
+     # 1. Seed an "introduction" state: any <p> tags that appear before the first
+     #    <h2> are caught automatically and filed under this pseudo-event.
+     current_era = "引言"
+     current_h3 = "漫宿历史与时代划分"
+     current_h4 = None
+
+     data[current_era] = {
+         "title": current_era,
+         "events": {
+             current_h3: {
+                 "level": "h3",
+                 "paragraphs": [],
+                 "subevents": {}
+             }
+         }
+     }
+
+     # Walk the DOM tree.
+     for el in article.descendants:
+         if not getattr(el, "name", None):
+             continue
+         name = el.name.lower()
+
+         if name == "h2":
+             # A new h2 switches the era.
+             current_era = el.get_text(strip=True)
+             data.setdefault(current_era, {"title": current_era, "events": {}})
+             current_h3 = None
+             current_h4 = None
+
+         elif name == "h3":
+             if not current_era:
+                 continue
+             current_h3 = el.get_text(strip=True)
+             current_h4 = None
+             data[current_era]["events"].setdefault(
+                 current_h3,
+                 {"level": "h3", "paragraphs": [], "subevents": {}}
+             )
+
+         elif name == "h4":
+             if not current_era or not current_h3:
+                 continue
+             current_h4 = el.get_text(strip=True)
+             data[current_era]["events"][current_h3]["subevents"].setdefault(
+                 current_h4,
+                 {"level": "h4", "paragraphs": []}
+             )
+
+         elif name == "p":
+             if not current_era or not current_h3:
+                 continue
+
+             text = el.get_text(strip=True)
+             if not text:
+                 continue
+
+             text = re.sub(r'\[\d+\]', '', text)  # strip footnote markers like [1]
+
+             event_obj = data[current_era]["events"][current_h3]
+             if current_h4:
+                 event_obj["subevents"][current_h4]["paragraphs"].append(text)
+             else:
+                 event_obj["paragraphs"].append(text)
+
+     # 2. Post-pass cleanup: drop "empty shell" nodes that have no paragraph content.
+     cleaned_data = {}
+     for era, era_obj in data.items():
+         valid_events = {}
+         for h3_title, event_obj in era_obj["events"].items():
+             has_h3_paras = len(event_obj["paragraphs"]) > 0
+
+             # Also prune empty h4 sub-events along the way.
+             valid_subevents = {}
+             for h4_title, sub_obj in event_obj["subevents"].items():
+                 if len(sub_obj["paragraphs"]) > 0:
+                     valid_subevents[h4_title] = sub_obj
+             event_obj["subevents"] = valid_subevents
+
+             # Keep the h3 event if it has paragraphs itself or via its h4 children.
+             if has_h3_paras or len(valid_subevents) > 0:
+                 valid_events[h3_title] = event_obj
+
+         # Keep the era (h2) as long as it still contains valid events.
+         if len(valid_events) > 0:
+             era_obj["events"] = valid_events
+             cleaned_data[era] = era_obj
+     return cleaned_data
+
+
+ def is_conflict_or_death_event(title: str, paragraphs: List[str]) -> bool:
+     """
+     Rough heuristic for major "Hour conflict / death" events, used to decide
+     the summary length. Extend the keyword list as needed.
+     """
+     text = title + "\n" + "\n".join(paragraphs)
+     keywords = [
+         "覆石之战", "太阳大战", "大战", "战争",
+         "被", "杀死", "斩杀", "粉碎", "饮干",
+         "除名", "分裂", "死亡", "陨落", "毁灭", "击败", "猎杀",
+     ]
+     # Simple rule: flag high-risk words such as 战/大战 ("war") or 被…杀死/斩杀 ("was slain").
+     for kw in keywords:
+         if kw in text:
+             return True
+     return False
+
+
+ def summarise_event_text(
+     era: str,
+     title: str,
+     paragraphs: List[str],
+     is_conflict: bool
+ ) -> str:
+     full_text = "\n\n".join(paragraphs)
+
+     if is_conflict:
+         length_hint = "请写 4~6 句中文摘要,适当具体描述关键冲突、参与者与结果。"
+     else:
+         length_hint = "请写 2~4 句中文摘要,突出关键参与者、起因与后果。"
+
+     # Harden the prompt: forbid verbatim copying to get past RECITATION blocking.
+     system_prompt = (
+         "你是一个世界观设定编辑,现在要为漫宿相关的历史事件生成适合 RAG 的精炼摘要。\n"
+         "总体要求:\n"
+         "1. 使用中文输出。\n"
+         "2. 保持信息密度高,不写旁白、不写对白,不编造新设定。\n"
+         "3. 尽量保留关键参与者(司辰/派系/起源)、事件起因与影响。\n"
+         "4. 【极其重要】绝对不可使用引号原样摘抄原文的词句!必须完全使用你自己的语言进行转述(Paraphrase),否则会被判定为抄袭。\n"
+     )
+
+     user_prompt = (
+         f"时代(h2):{era}\n"
+         f"事件标题:{title}\n\n"
+         f"原始段落:\n{full_text}\n\n"
+         f"{length_hint}"
+     )
+
+     # Relax the safety thresholds.
+     safety_settings = [
+         types.SafetySetting(
+             category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
+             threshold=types.HarmBlockThreshold.BLOCK_NONE,
+         ),
+         types.SafetySetting(
+             category=types.HarmCategory.HARM_CATEGORY_HARASSMENT,
+             threshold=types.HarmBlockThreshold.BLOCK_NONE,
+         ),
+         types.SafetySetting(
+             category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
+             threshold=types.HarmBlockThreshold.BLOCK_NONE,
+         ),
+         types.SafetySetting(
+             category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
+             threshold=types.HarmBlockThreshold.BLOCK_NONE,
+         ),
+     ]
+
+     resp = client.models.generate_content(
+         model=GEMINI_MODEL,
+         contents=user_prompt,
+         config=types.GenerateContentConfig(
+             system_instruction=system_prompt,
+             temperature=0.4,
+             max_output_tokens=2048,
+             safety_settings=safety_settings
+         )
+     )
+
+     # Diagnostics: if the text was truncated, report which filter was hit.
+     if resp.candidates:
+         finish_reason = resp.candidates[0].finish_reason.name
+         if finish_reason != "STOP":
+             print(f"\n[BLOCK WARNING] Event '{title}' was truncated unexpectedly. Reason code: {finish_reason}")
+             # RECITATION means the model still copied the source; SAFETY means other sensitive terms remain.
+
+     return resp.text.strip()
+
+ def build_rag_json(structured: Dict[str, Any]) -> Dict[str, Any]:
+     """
+     Output structure:
+     {
+       era_h2: {
+         "title": ...,
+         "events": {
+           h3_title: {
+             "level": "h3",
+             "paragraphs": [...],
+             "summary_cn": "...",
+             "subevents": {
+               h4_title: {
+                 "level": "h4",
+                 "paragraphs": [...],
+                 "summary_cn": "..."
+               }
+             }
+           }
+         }
+       }
+     }
+     """
+     rag = {}
+
+     for era, era_obj in structured.items():
+         rag[era] = {"title": era_obj["title"], "events": {}}
+         for h3_title, event_obj in era_obj["events"].items():
+             paragraphs_h3 = event_obj.get("paragraphs", [])
+             subevents = event_obj.get("subevents", {})
+
+             # Summarise the h3 main event first.
+             event_entry = {
+                 "level": "h3",
+                 "paragraphs": paragraphs_h3,
+                 "summary_cn": ""
+             }
+             if paragraphs_h3:
+                 is_conflict = is_conflict_or_death_event(h3_title, paragraphs_h3)
+                 try:
+                     summary = summarise_event_text(era, h3_title, paragraphs_h3, is_conflict)
+                     time.sleep(1.0)
+                 except Exception as e:
+                     print(f"[WARN] summarise failed for {era} / {h3_title}: {e}")
+                     summary = ""
+                 event_entry["summary_cn"] = summary
+
+             # Then summarise each h4 sub-event.
+             subevents_out = {}
+             for h4_title, sub_obj in subevents.items():
+                 paras_h4 = sub_obj.get("paragraphs", [])
+                 if not paras_h4:
+                     continue
+                 is_conflict_sub = is_conflict_or_death_event(h4_title, paras_h4)
+                 try:
+                     summary_h4 = summarise_event_text(era, h4_title, paras_h4, is_conflict_sub)
+                     time.sleep(1.0)
+                 except Exception as e:
+                     print(f"[WARN] summarise failed for {era} / {h3_title} / {h4_title}: {e}")
+                     summary_h4 = ""
+                 subevents_out[h4_title] = {
+                     "level": "h4",
+                     "paragraphs": paras_h4,
+                     "summary_cn": summary_h4
+                 }
+
+             event_entry["subevents"] = subevents_out
+             rag[era]["events"][h3_title] = event_entry
+
+     return rag
+
+
+ def main():
+     print("[1] Fetching page...")
+     html = fetch_html(WIKI_URL)
+
+     print("[2] Parsing article structure (h2/h3/h4/p)...")
+     structured = parse_article_structure(html)
+
+     print("[3] Summarising events via Gemini (with conflict-aware length)...")
+     rag_json = build_rag_json(structured)
+
+     print(f"[4] Saving JSON to {OUTPUT_JSON}...")
+     os.makedirs(os.path.dirname(OUTPUT_JSON) or ".", exist_ok=True)
+     with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
+         json.dump(rag_json, f, ensure_ascii=False, indent=2)
+
+     print("Done.")
+
+
+ if __name__ == "__main__":
+     main()
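A spot-check of the generated summaries, assuming mansus_history_events_rag.json exists and follows the structure in the build_rag_json docstring above:

```python
import json

with open("mansus_history_events_rag.json", "r", encoding="utf-8") as f:
    rag = json.load(f)

# Print the first 40 characters of each non-empty h3 summary.
for era, era_obj in rag.items():
    for title, event in era_obj["events"].items():
        if event["summary_cn"]:
            print(era, "/", title, "→", event["summary_cn"][:40])
```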