Spaces:

bor
/

aozora-bunko-preprocessor

Sleeping

App Files Files Community

Bor Hodošček committed on Jun 12, 2025

Commit

8cef38d

unverified ·

1 Parent(s): 257858f

feat: initial commit of working demo

Browse files

Files changed (7) hide show

789_14547.html +0 -0
Dockerfile +11 -9
app.py +637 -353
pyproject.toml +23 -0
requirements.txt +0 -5
uv.lock +0 -0
wagahaiwa_nekodearu.txt +0 -0

789_14547.html ADDED Viewed

The diff for this file is too large to render. See raw diff

Dockerfile CHANGED Viewed

@@ -1,5 +1,5 @@
-FROM python:3.12
-COPY --from=ghcr.io/astral-sh/uv:0.4.20 /uv /bin/uv
 RUN useradd -m -u 1000 user
 ENV PATH="/home/user/.local/bin:$PATH"
@@ -7,13 +7,15 @@ ENV UV_SYSTEM_PYTHON=1
 WORKDIR /app
-COPY --chown=user ./requirements.txt requirements.txt
-RUN uv pip install -r requirements.txt
-COPY --chown=user . /app
-RUN mkdir -p /app/__marimo__ && \
-    chown -R user:user /app && \
-    chmod -R 755 /app
 USER user
-CMD ["marimo", "run", "app.py", "--include-code", "--host", "0.0.0.0", "--port", "7860"]

+FROM python:3.12-slim
+COPY --from=ghcr.io/astral-sh/uv:0.7.12 /uv /bin/uv
 RUN useradd -m -u 1000 user
 ENV PATH="/home/user/.local/bin:$PATH"
 WORKDIR /app
+# Native build prerequisites (compiler, pkg-config, libxml2/libxslt/zlib
+# headers) and git (the app declares a git-sourced dependency). apt-get is
+# used because apt's CLI is not meant for scripts; the package lists are
+# removed in the same layer so they do not end up in the image.
+RUN apt-get update && \
+    apt-get install -y git pkg-config libxml2-dev libxslt-dev libz-dev gcc && \
+    rm -rf /var/lib/apt/lists/*
+RUN mkdir -p /app && chown -R user:user /app
+# With several sources the destination must be a directory ending in "/".
+COPY --chown=user ./pyproject.toml ./uv.lock ./app.py ./789_14547.html ./wagahaiwa_nekodearu.txt /app/
+RUN chmod -R u+w /app
 USER user
+RUN uv sync
+CMD ["uv", "run", "marimo", "run", "app.py", "--no-sandbox", "--include-code", "--host", "0.0.0.0", "--port", "7860"]

app.py CHANGED Viewed

@@ -1,469 +1,753 @@
 import marimo
-__generated_with = "0.9.2"
 app = marimo.App()
 @app.cell
-def __():
     import marimo as mo
-    mo.md("# Welcome to marimo! 🌊🍃")
-    return (mo,)
 @app.cell
-def __(mo):
-    slider = mo.ui.slider(1, 22)
-    return (slider,)
 @app.cell
-def __(mo, slider):
-    mo.md(
-        f"""
-        marimo is a **reactive** Python notebook.
-        This means that unlike traditional notebooks, marimo notebooks **run
-        automatically** when you modify them or
-        interact with UI elements, like this slider: {slider}.
-        {"##" + "🍃" * slider.value}
-        """
     )
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.accordion(
-        {
-            "Tip: disabling automatic execution": mo.md(
-                rf"""
-            marimo lets you disable automatic execution: just go into the
-            notebook settings and set
-            "Runtime > On Cell Change" to "lazy".
-            When the runtime is lazy, after running a cell, marimo marks its
-            descendants as stale instead of automatically running them. The
-            lazy runtime puts you in control over when cells are run, while
-            still giving guarantees about the notebook state.
-            """
-            )
-        }
-    )
     return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        Tip: This is a tutorial notebook. You can create your own notebooks
-        by entering `marimo edit` at the command line.
-        """
-    ).callout()
-    return
-@app.cell(hide_code=True)
-def __(mo):
     mo.md(
-        """
-        ## 1. Reactive execution
-        A marimo notebook is made up of small blocks of Python code called
-        cells.
-        marimo reads your cells and models the dependencies among them: whenever
-        a cell that defines a global variable  is run, marimo
-        **automatically runs** all cells that reference that variable.
-        Reactivity keeps your program state and outputs in sync with your code,
-        making for a dynamic programming environment that prevents bugs before they
-        happen.
-        """
     )
-    return
-@app.cell(hide_code=True)
-def __(changed, mo):
-    (
-        mo.md(
-            f"""
-            **✨ Nice!** The value of `changed` is now {changed}.
-            When you updated the value of the variable `changed`, marimo
-            **reacted** by running this cell automatically, because this cell
-            references the global variable `changed`.
-            Reactivity ensures that your notebook state is always
-            consistent, which is crucial for doing good science; it's also what
-            enables marimo notebooks to double as tools and  apps.
-            """
-        )
-        if changed
-        else mo.md(
-            """
-            **🌊 See it in action.** In the next cell, change the value of the
-            variable  `changed` to `True`, then click the run button.
-            """
-        )
     )
-    return
 @app.cell
-def __():
-    changed = False
-    return (changed,)
-@app.cell(hide_code=True)
-def __(mo):
-    mo.accordion(
-        {
-            "Tip: execution order": (
-                """
-                The order of cells on the page has no bearing on
-                the order in which cells are executed: marimo knows that a cell
-                reading a variable must run after the cell that  defines it. This
-                frees you to organize your code in the way that makes the most
-                sense for you.
-                """
-            )
-        }
     )
     return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        **Global names must be unique.** To enable reactivity, marimo imposes a
-        constraint on how names appear in cells: no two cells may define the same
-        variable.
-        """
     )
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.accordion(
-        {
-            "Tip: encapsulation": (
-                """
-                By encapsulating logic in functions, classes, or Python modules,
-                you can minimize the number of global variables in your notebook.
-                """
-            )
-        }
     )
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.accordion(
-        {
-            "Tip: private variables": (
-                """
-                Variables prefixed with an underscore are "private" to a cell, so
-                they can be defined by multiple cells.
-                """
-            )
-        }
     )
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        ## 2. UI elements
-        Cells can output interactive UI elements. Interacting with a UI
-        element **automatically triggers notebook execution**: when
-        you interact with a UI element, its value is sent back to Python, and
-        every cell that references that element is re-run.
-        marimo provides a library of UI elements to choose from under
-        `marimo.ui`.
-        """
-    )
-    return
 @app.cell
-def __(mo):
-    mo.md("""**🌊 Some UI elements.** Try interacting with the below elements.""")
     return
 @app.cell
-def __(mo):
-    icon = mo.ui.dropdown(["🍃", "🌊", "✨"], value="🍃")
-    return (icon,)
-@app.cell
-def __(icon, mo):
-    repetitions = mo.ui.slider(1, 16, label=f"number of {icon.value}: ")
-    return (repetitions,)
 @app.cell
-def __(icon, repetitions):
-    icon, repetitions
-    return
 @app.cell
-def __(icon, mo, repetitions):
-    mo.md("# " + icon.value * repetitions.value)
     return
-@app.cell(hide_code=True)
-def __(mo):
     mo.md(
-        """
-        ## 3. marimo is just Python
-        marimo cells parse Python (and only Python), and marimo notebooks are
-        stored as pure Python files — outputs are _not_ included. There's no
-        magical syntax.
-        The Python files generated by marimo are:
-        - easily versioned with git, yielding minimal diffs
-        - legible for both humans and machines
-        - formattable using your tool of choice,
-        - usable as Python  scripts, with UI  elements taking their default
-        values, and
-        - importable by other modules (more on that in the future).
-        """
     )
     return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        ## 4. Running notebooks as apps
-        marimo notebooks can double as apps. Click the app window icon in the
-        bottom-right to see this notebook in "app view."
-        Serve a notebook as an app with `marimo run` at the command-line.
-        Of course, you can use marimo just to level-up your
-        notebooking, without ever making apps.
-        """
     )
     return
 @app.cell(hide_code=True)
-def __(mo):
     mo.md(
         """
-        ## 5. The `marimo` command-line tool
-        **Creating and editing notebooks.** Use
-        ```
-        marimo edit
-        ```
-        in a terminal to start the marimo notebook server. From here
-        you can create a new notebook or edit existing ones.
-        **Running as apps.** Use
-        ```
-        marimo run notebook.py
-        ```
-        to start a webserver that serves your notebook as an app in read-only mode,
-        with code cells hidden.
-        **Convert a Jupyter notebook.** Convert a Jupyter notebook to a marimo
-        notebook using `marimo convert`:
-        ```
-        marimo convert your_notebook.ipynb > your_app.py
-        ```
-        **Tutorials.** marimo comes packaged with tutorials:
-        - `dataflow`: more on marimo's automatic execution
-        - `ui`: how to use UI elements
-        - `markdown`: how to write markdown, with interpolated values and
-           LaTeX
-        - `plots`: how plotting works in marimo
-        - `sql`: how to use SQL
-        - `layout`: layout elements in marimo
-        - `fileformat`: how marimo's file format works
-        - `markdown-format`: for using `.md` files in marimo
-        - `for-jupyter-users`: if you are coming from Jupyter
-        Start a tutorial with `marimo tutorial`; for example,
-        ```
-        marimo tutorial dataflow
-        ```
-        In addition to tutorials, we have examples in our
-        [our GitHub repo](https://www.github.com/marimo-team/marimo/tree/main/examples).
-        """
     )
     return
 @app.cell(hide_code=True)
-def __(mo):
     mo.md(
         """
-        ## 6. The marimo editor
-        Here are some tips to help you get started with the marimo editor.
-        """
     )
     return
 @app.cell
-def __(mo, tips):
-    mo.accordion(tips)
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.md("""## Finally, a fun fact""")
-    return
-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        The name "marimo" is a reference to a type of algae that, under
-        the right conditions, clumps together to form a small sphere
-        called a "marimo moss ball". Made of just strands of algae, these
-        beloved assemblages are greater than the sum of their parts.
-        """
     )
     return
-@app.cell(hide_code=True)
-def __():
-    tips = {
-        "Saving": (
-            """
-            **Saving**
-            - _Name_ your app using the box at the top of the screen, or
-              with `Ctrl/Cmd+s`. You can also create a named app at the
-              command line, e.g., `marimo edit app_name.py`.
-            - _Save_ by clicking the save icon on the bottom right, or by
-              inputting `Ctrl/Cmd+s`. By default marimo is configured
-              to autosave.
-            """
-        ),
-        "Running": (
-            """
-            1. _Run a cell_ by clicking the play ( ▷ ) button on the top
-            right of a cell, or by inputting `Ctrl/Cmd+Enter`.
-            2. _Run a stale cell_  by clicking the yellow run button on the
-            right of the cell, or by inputting `Ctrl/Cmd+Enter`. A cell is
-            stale when its code has been modified but not run.
-            3. _Run all stale cells_ by clicking the play ( ▷ ) button on
-            the bottom right of the screen, or input `Ctrl/Cmd+Shift+r`.
-            """
-        ),
-        "Console Output": (
-            """
-            Console output (e.g., `print()` statements) is shown below a
-            cell.
-            """
-        ),
-        "Creating, Moving, and Deleting Cells": (
-            """
-            1. _Create_ a new cell above or below a given one by clicking
-                the plus button to the left of the cell, which appears on
-                mouse hover.
-            2. _Move_ a cell up or down by dragging on the handle to the
-                right of the cell, which appears on mouse hover.
-            3. _Delete_ a cell by clicking the trash bin icon. Bring it
-                back by clicking the undo button on the bottom right of the
-                screen, or with `Ctrl/Cmd+Shift+z`.
-            """
-        ),
-        "Disabling Automatic Execution": (
-            """
-            Via the notebook settings (gear icon) or footer panel, you
-            can disable automatic execution. This is helpful when
-            working with expensive notebooks or notebooks that have
-            side-effects like database transactions.
-            """
-        ),
-        "Disabling Cells": (
-            """
-            You can disable a cell via the cell context menu.
-            marimo will never run a disabled cell or any cells that depend on it.
-            This can help prevent accidental execution of expensive computations
-            when editing a notebook.
-            """
-        ),
-        "Code Folding": (
-            """
-            You can collapse or fold the code in a cell by clicking the arrow
-            icons in the line number column to the left, or by using keyboard
-            shortcuts.
-            Use the command palette (`Ctrl/Cmd+k`) or a keyboard shortcut to
-            quickly fold or unfold all cells.
-            """
-        ),
-        "Code Formatting": (
-            """
-            If you have [ruff](https://github.com/astral-sh/ruff) installed,
-            you can format a cell with the keyboard shortcut `Ctrl/Cmd+b`.
-            """
-        ),
-        "Command Palette": (
-            """
-            Use `Ctrl/Cmd+k` to open the command palette.
-            """
-        ),
-        "Keyboard Shortcuts": (
-            """
-            Open the notebook menu (top-right) or input `Ctrl/Cmd+Shift+h` to
-            view a list of all keyboard shortcuts.
-            """
-        ),
-        "Configuration": (
-            """
-           Configure the editor by clicking the gears icon near the top-right
-           of the screen.
-           """
-        ),
-    }
-    return (tips,)
 if __name__ == "__main__":

+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "aozora-corpus-generator==0.1.1",
+#     "cdifflib==1.2.9",
+#     "ginza",
+#     "ja-ginza",
+#     "ipython==7.23.1",
+#     "marimo",
+#     "polars==1.30.0",
+#     "spacy==3.8.7",
+#     "wcwidth",
+# ]
+#
+# [tool.uv.sources]
+# aozora-corpus-generator = { git = "https://github.com/borh/aozora-corpus-generator.git" }
+# ///
 import marimo
+__generated_with = "0.13.15"
 app = marimo.App()
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(
+        rf"""
+    # Aozora Bunko Text Processing Pipeline Demo
+    ### Summary
+    1. Upload a text file from Aozora Bunko (or use the default sample).
+    2. Preprocess using customizable regex patterns.
+    3. Preview the first and last 50 lines of the cleaned text.
+    4. Download the cleaned text.
+    5. Process the XHTML version with a Python library.
+    6. Compare against the regex variant.
+    6. Define token matching patterns.
+    7. Visualize token matches.
+    8. Define dependency matching patterns.
+    9. Visualize dependency matches.
+    ### 概要
+    1. 青空文庫のテキストファイルをアップロードする（またはデフォルトサンプルを利用する）。
+    2. 編集可能な正規表現で前処理する。
+    3. 前処理済みテキストの先頭50行と末尾50行をプレビューする。
+    4. 前処理済みテキストをダウンロードする。
+    5. XHTML版をPythonのパッケージで処理する。
+    6. 正規表現処理版と比較する。
+    7. トークンマッチング用パターンを定義する。
+    8. トークンマッチ結果を可視化する。
+    9. 係り受け（依存）関係マッチング用パターンを定義する。
+    10. 係り受け関係マッチ結果を可視化する。
+    {mo.callout("By default, this demo uses Natsume Soseki's _‘Wagahai wa neko de aru’_")}
+    """
+    )
+    return
 @app.cell
+def _():
+    # Shared imports cell: everything returned here becomes available to the
+    # other cells of the notebook by name.
+    import re
     import marimo as mo
+    import polars as pl
+    import spacy
+    from spacy.tokens import Doc
+    # Load the GiNZA Japanese pipeline once so all analysis cells reuse it.
+    nlp = spacy.load(
+        "ja_ginza"
+    )  # or "ja_ginza_electra"/"ja_ginza_bert_large" if installed
+    return Doc, mo, nlp, pl, re, spacy
 @app.cell
+def upload_aozora_text(mo):
+    """
+    Create the upload widget for a single Aozora Bunko plain-text file.
+    Only the widget is built here; reading the upload (or the bundled
+    sample when nothing was uploaded) is done by load_aozora_text.
+    """
+    aozora_file = mo.ui.file(label="Upload Aozora-Bunko text (.txt)", multiple=False)
+    return (aozora_file,)
 @app.cell
+def select_encoding(mo):
+    """
+    Create the dropdown used to pick the encoding of the uploaded text file.
+    Offers "shift-jis" (the default) and "utf-8".
+    """
+    encoding = mo.ui.dropdown(
+        options=["shift-jis", "utf-8"],
+        value="shift-jis",
+        label="Text file encoding",
+        full_width=False,
     )
+    return (encoding,)
+@app.cell
+def _(aozora_file, encoding, mo):
+    # Lay the upload widget and the encoding dropdown out side by side and
+    # render them under a section heading (the markdown is the cell output).
+    ab_upload_ui = mo.hstack([aozora_file, encoding])
+    mo.md(f"## 青空文庫テキストファイル設定\n{ab_upload_ui}")
     return
+@app.cell
+def load_aozora_text(aozora_file, encoding):
+    """
+    Return the raw text to process.
+    An uploaded file is decoded with the encoding chosen in the dropdown;
+    with no upload, the bundled wagahaiwa_nekodearu.txt is read instead.
+    """
+    enc = encoding.value
+    if aozora_file.value:
+        # Decoding the uploaded bytes directly keeps the file's original
+        # line endings in the resulting string.
+        uploaded = aozora_file.contents()
+        text_raw = uploaded.decode(enc)
+    else:
+        # The bundled sample is always opened as Shift-JIS, regardless of the
+        # dropdown selection.
+        with open("wagahaiwa_nekodearu.txt", encoding="shift-jis") as f:
+            text_raw = f.read()
+    return (text_raw,)
+@app.cell
+def show_raw_head(mo, text_raw):
+    # Show the first and last 500 characters of the raw text so the
+    # Aozora Bunko header and footer can be inspected before cleaning.
     mo.md(
+        f"""
+    ## 青空文庫のヘッダーとフッターを確認
+    最初の500字
+    ```raw
+    {text_raw[:500]}
+    ```
+    最後の500字
+    ```raw
+    {text_raw[-500:]}
+    ```
+    """
+    )
+    return
+@app.cell
+def regex_inputs(mo):
+    # Editable text inputs holding the regular expressions used by the
+    # cleaning step; each one is pre-filled with a default pattern.
+    # Ruby readings enclosed in 《...》.
+    ruby_pattern = mo.ui.text(
+        value=r"《[^》]+》",
+        label="Ruby‐annotation regex",
+        full_width=True,
+    )
+    # The full-width bar that marks where a ruby base starts.
+    ruby_bar_pattern = mo.ui.text(
+        value=r"｜",
+        label="Ruby‐bar regex",
+        full_width=True,
+    )
+    # Inline annotations of the form ［＃...］.
+    annotation_pattern = mo.ui.text(
+        value=r"［＃[^］]+?］",
+        label="Inline‐annotation regex",
+        full_width=True,
+    )
+    # Block delimited by two runs of 55 hyphens (non-greedy, spans newlines).
+    hajime_pattern = mo.ui.text(
+        value=r"-{55}(.|\n)+?-{55}",
+        label="Start‐marker regex",
+        full_width=True,
+    )
+    # Line-initial phrases that mark where the text after the body begins.
+    owari_pattern = mo.ui.text(
+        value=(
+            r"^[　【]?(底本：|訳者あとがき|この翻訳は|この作品.*翻訳|"
+            r"この翻訳.*全訳)"
+        ),
+        label="End‐marker regex",
+        full_width=True,
+    )
+    # Stack all inputs vertically for display.
+    regexes = mo.vstack(
+        [
+            ruby_pattern,
+            ruby_bar_pattern,
+            annotation_pattern,
+            hajime_pattern,
+            owari_pattern,
+        ]
     )
+    mo.md(f"""## 正規表現による前処理
+    （必要な場合は修正）
+    {regexes}
+    """)
+    return (
+        annotation_pattern,
+        hajime_pattern,
+        owari_pattern,
+        ruby_bar_pattern,
+        ruby_pattern,
     )
 @app.cell
+def clean_aozora(
+    annotation_pattern,
+    hajime_pattern,
+    mo,
+    owari_pattern,
+    re,
+    ruby_bar_pattern,
+    ruby_pattern,
+    text_raw,
+):
+    # compile from user‐editable patterns
+    ruby_rx = re.compile(ruby_pattern.value)
+    ruby_bar_rx = re.compile(ruby_bar_pattern.value)
+    annotation_rx = re.compile(annotation_pattern.value)
+    hajime_rx = re.compile(hajime_pattern.value)
+    owari_rx = re.compile(owari_pattern.value, re.M)
+    def clean_text(text: str) -> tuple[str, str, str]:
+        """青空文庫テキスト形式の文字列textを入力とし，改行方式の統一，ルビーと各種のアノーテーションの削除，
+        青空文庫特有の"""
+        title, author, text = (text.split("\n", 2) + ["", ""])[:3]
+        # 青空文庫では改行がDOS形式の\r\nのため，それをUNIX形式の\nに統一する。
+        cleaned_text = re.sub(r"(\r\n)+", "\n", text)
+        # ルビ《...》の記号とその中身を削除
+        cleaned_text = re.sub(ruby_rx, "", cleaned_text)
+        # ルビのもう一つの書き方に対応：「一番｜獰悪《どうあく》」
+        cleaned_text = re.sub(ruby_bar_rx, "", cleaned_text)
+        # 注釈対応：「※［＃「言＋墟のつくり」、第4水準2-88-74］」
+        cleaned_text = re.sub(annotation_rx, "", cleaned_text)
+        # 本文までのテキストを削除
+        cleaned_text = re.sub(hajime_rx, "", cleaned_text)
+        # 本文の後のテキストを削除
+        maybe_owari = owari_rx.search(cleaned_text)
+        if maybe_owari:
+            return (title, author, cleaned_text[0 : maybe_owari.start()].strip())
+        return (title, author, cleaned_text.strip())
+    title, author, cleaned_text = clean_text(text_raw)
+    mo.md(f"""### 前処理結果の確認
+    -   著者：`{author}`
+    -   タイトル：`{title}`
+    最初の100字
+    ```raw
+    {cleaned_text[:100]}
+    ```
+    最後の100字
+    ```raw
+    {cleaned_text[-100:]}
+    ```
+    """)
+    return author, cleaned_text, title
+@app.cell
+def download_cleaned_text(author, cleaned_text, mo, title):
+    """
+    Render a download button for the cleaned text.
+    The file is UTF-8 encoded and named "<author>_<title>.txt".
+    """
+    download_link = mo.download(
+        data=cleaned_text.encode("utf-8"),
+        filename=f"{author}_{title}.txt",
+        mimetype="text/plain",
     )
+    mo.md(f"""
+    前処理済みファイルのダウンロード：
+    {download_link}
+    """)
     return
+@app.cell
+def get_alternative_file(mo):
+    # Optional inputs for the comparison step: an upload widget for the
+    # (X)HTML edition of the same work and a dropdown for its encoding.
+    aozora_xhtml_file = mo.ui.file(
+        label="Upload Aozora-Bunko text (.html)", multiple=False
     )
+    xhtml_encoding = mo.ui.dropdown(
+        options=["shift-jis", "utf-8"],
+        value="shift-jis",
+        label="Text file encoding",
+        full_width=False,
     )
+    mo.md(f"""
+    ## HTMLを使用した前処理との比較（オプショナル）
+    プレインテキスト版を正規表現で前処理した結果を、(X)HTML版をPythonで処理した結果を比較したい場合は同じ作品のHTMLファイルをアップします。
+    {aozora_xhtml_file}
+    {xhtml_encoding}
+    """)
+    return aozora_xhtml_file, xhtml_encoding
+@app.cell
+def show_natsume_head(aozora_xhtml_file, mo, xhtml_encoding):
+    # Run the (X)HTML edition through aozora_corpus_generator and preview the
+    # first and last 200 characters of the extracted text.
+    from aozora_corpus_generator.aozora import parse_aozora_bunko_xml_content
+    xhtml_enc = xhtml_encoding.value
+    # NOTE(review): the two branches hand the parser different types — a
+    # decoded str for uploads, raw bytes for the bundled file. Confirm that
+    # parse_aozora_bunko_xml_content accepts both.
+    if aozora_xhtml_file.value:
+        uploaded_xhtml = aozora_xhtml_file.contents()
+        xhtml_raw = uploaded_xhtml.decode(xhtml_enc)
+    else:
+        with open("789_14547.html", "rb") as xhtml_f:
+            xhtml_raw = xhtml_f.read()
+    aozora_xhtml_processed = parse_aozora_bunko_xml_content(
+        xhtml_raw, do_tokenize=False
     )
+    # Only the "text" entry of the parser result is used from here on.
+    aozora_xhtml_processed_text = aozora_xhtml_processed["text"]
+    mo.md(f"""
+    HTML版の最初の200字
+    ```raw
+    {aozora_xhtml_processed_text[:200]}
+    ```
+    HTML版の最後の200字
+    ```raw
+    {aozora_xhtml_processed_text[-200:]}
+    ```
+    """)
+    return (aozora_xhtml_processed_text,)
 @app.cell
+def _(aozora_xhtml_processed_text, author, mo, title):
+    # Download button for the text extracted from the (X)HTML edition,
+    # UTF-8 encoded and named "<author>_<title>_xhtml.txt".
+    xhtml_download_link = mo.download(
+        data=aozora_xhtml_processed_text.encode("utf-8"),
+        filename=f"{author}_{title}_xhtml.txt",
+        mimetype="text/plain",
+    )
+    mo.md(f"""
+    HTML版の前処理済みファイルをダウンロード：
+    {xhtml_download_link}
+    """)
     return
 @app.cell
+def _():
+    # Diff helpers: build an HTML view that contains only the segments that
+    # differ between two texts. Exposes diff_changes to the other cells.
+    import difflib
+    import html
+    from cdifflib import CSequenceMatcher
+    from IPython.display import HTML
+    from IPython.display import display_html as display
+    # Replace difflib's matcher with the cdifflib implementation; every
+    # difflib.SequenceMatcher(...) call below therefore uses CSequenceMatcher.
+    difflib.SequenceMatcher = CSequenceMatcher
+    DEL_STYLE = "background-color:#f6c6c6;color:#000;"  # red bg, black text
+    INS_STYLE = "background-color:#c6f6c6;color:#000;"  # green bg, black text
+    WRAP_STYLE = (
+        "font-family: ui-monospace, monospace; "
+        "white-space: pre-wrap; line-height:1.4; color:#000;"
+    )
+    # Make whitespace visible: space, tab and newline get printable markers
+    # (the newline itself is kept after its marker).
+    WS_MAP = str.maketrans({" ": "␣", "\t": "⇥", "\n": "↩\n"})
+    def _escape(txt: str) -> str:
+        # Mark whitespace first, then HTML-escape the result.
+        return html.escape(txt.translate(WS_MAP))
+    def _char_changes(a: str, b: str) -> str:
+        """Return HTML for *only* the changed chars between a and b."""
+        sm = difflib.SequenceMatcher(None, a, b, autojunk=False)
+        pieces = []
+        for tag, i1, i2, j1, j2 in sm.get_opcodes():
+            if tag == "delete":
+                pieces.append(f'<span style="{DEL_STYLE}">{_escape(a[i1:i2])}</span>')
+            elif tag == "insert":
+                pieces.append(f'<span style="{INS_STYLE}">{_escape(b[j1:j2])}</span>')
+            elif tag == "replace":
+                # A replacement is shown as the deleted text followed by the
+                # inserted text.
+                pieces.append(f'<span style="{DEL_STYLE}">{_escape(a[i1:i2])}</span>')
+                pieces.append(f'<span style="{INS_STYLE}">{_escape(b[j1:j2])}</span>')
+            # equal → ignore
+        return "".join(pieces)
+    def diff_changes(a: str, b: str, auto_display: bool = True):
+        """
+        Build an inline HTML diff that shows *only the changed segments*.
+        The texts are first compared line by line; replaced lines are then
+        compared character by character. Returns the IPython HTML object and,
+        when auto_display is true, also displays it.
+        """
+        a_lines = a.splitlines(keepends=True)
+        b_lines = b.splitlines(keepends=True)
+        outer = difflib.SequenceMatcher(None, a_lines, b_lines, autojunk=True)
+        html_chunks = []
+        for tag, i1, i2, j1, j2 in outer.get_opcodes():
+            if tag == "replace":  # both sides present
+                # Pair the lines up positionally and diff each pair by char.
+                for la, lb in zip(a_lines[i1:i2], b_lines[j1:j2]):
+                    html_chunks.append(_char_changes(la, lb))
+                # handle length mismatch
+                # Lines left over on the longer side are shown as whole-line
+                # deletions or insertions; the slice for the shorter side is
+                # empty.
+                for la in a_lines[i1 + (j2 - j1) : i2]:
+                    html_chunks.append(
+                        f'<span style="{DEL_STYLE}">{_escape(la)}</span>'
+                    )
+                for lb in b_lines[j1 + (i2 - i1) : j2]:
+                    html_chunks.append(
+                        f'<span style="{INS_STYLE}">{_escape(lb)}</span>'
+                    )
+            elif tag == "delete":
+                for la in a_lines[i1:i2]:
+                    html_chunks.append(
+                        f'<span style="{DEL_STYLE}">{_escape(la)}</span>'
+                    )
+            elif tag == "insert":
+                for lb in b_lines[j1:j2]:
+                    html_chunks.append(
+                        f'<span style="{INS_STYLE}">{_escape(lb)}</span>'
+                    )
+            # equal → skip entirely (we want only changes)
+        rendered = HTML(f'<div style="{WRAP_STYLE}">{"".join(html_chunks)}</div>')
+        if auto_display:
+            display(rendered)
+        return rendered
+    return (diff_changes,)
 @app.cell
+def toggle_diff(mo):
+    # Switch (off by default) that controls whether the diff is computed.
+    run_diff = mo.ui.switch(label="文章間の比較（差分）を表示", value=False)
+    # Bare expression: the switch itself is the cell's output.
+    run_diff
+    return (run_diff,)
 @app.cell
+def compare_preprocessed_vs_old(
+    aozora_xhtml_processed_text,
+    cleaned_text,
+    diff_changes,
+    run_diff,
+):
+    """
+    Diff the regex-cleaned text against the text extracted from the (X)HTML
+    edition. Nothing is computed or shown while the switch is off.
+    """
+    diff_result = None
+    if run_diff.value:
+        # run the expensive diff only when checked
+        # auto_display=False: the result is shown as the cell output below
+        # instead of being displayed by the helper.
+        diff_result = diff_changes(
+            cleaned_text, aozora_xhtml_processed_text, auto_display=False
+        )
+    # else:
+    #     diff_result = mo.md("Diff comparison is turned off.")
+    diff_result
     return
+@app.cell
+def _(mo):
+    # Section header introducing the spaCy (GiNZA) analysis of the
+    # regex-cleaned text; static markdown only.
     mo.md(
+        r"""
+    ## spaCy (GiNZA) による解析
+    以下からは、正規表現で前処理したテキストに対して、
+    -   形態素解析
+    -   係り受け解析
+    を行う。
+    > 作品によっては時間がかかる。
+    """
     )
     return
+@app.cell
+def process_aozora_text(Doc, cleaned_text, mo, nlp, re):
+    """
+    Turn each paragraph into one Doc.  If any paragraph > MAX_BYTES,
+    fall back to sentence-splitting, then raw-byte-splitting, and only
+    in that fallback re-assemble via Doc.from_docs.
+    Returns the list of Docs, one per paragraph.
+    """
+    def split_text_to_paragraphs(text: str) -> list[str]:
+        """Split at each run of newlines (and any whitespace following it)."""
+        return re.split(r"\n+\s*", text)
+    # Upper bound on the UTF-8 size of a single string passed to nlp.
+    # NOTE(review): presumably chosen to stay below the tokenizer's input
+    # size limit — confirm.
+    MAX_BYTES = 40000
+    paras = split_text_to_paragraphs(cleaned_text)
+    aozora_docs: list[Doc] = []
+    with mo.status.progress_bar(total=len(paras), title="spaCy processing") as bar:
+        for para in paras:
+            b = len(para.encode("utf-8"))
+            if b <= MAX_BYTES:
+                doc = nlp(para)
+            else:
+                # 1) try sentence‐level split
+                # The capturing group keeps each terminator as its own item
+                # (odd indices); it is glued back onto the preceding text.
+                parts = re.split(r"([。！？])", para)
+                sents = [
+                    parts[i] + (parts[i + 1] if i + 1 < len(parts) else "")
+                    for i in range(0, len(parts), 2)
+                ]
+                # 2) accumulate into <= MAX_BYTES
+                chunks: list[str] = []
+                cur, cur_b = "", 0
+                for s in sents:
+                    sb = len(s.encode("utf-8"))
+                    if cur_b + sb > MAX_BYTES:
+                        # Adding s would overflow: flush the current chunk and
+                        # start a new one with s (which may itself be too big).
+                        if cur:
+                            chunks.append(cur)
+                        cur, cur_b = s, sb
+                    else:
+                        cur += s
+                        cur_b += sb
+                if cur:
+                    chunks.append(cur)
+                # 3) raw‐byte fallback for any too‐large piece
+                final_chunks: list[str] = []
+                for c in chunks:
+                    if len(c.encode("utf-8")) <= MAX_BYTES:
+                        final_chunks.append(c)
+                    else:
+                        rem = c
+                        while rem:
+                            # Take at most MAX_BYTES bytes; "ignore" drops a
+                            # character that was cut in half at the boundary,
+                            # so it stays in rem for the next round.
+                            pb = rem.encode("utf-8")[:MAX_BYTES]
+                            part = pb.decode("utf-8", "ignore")
+                            final_chunks.append(part)
+                            rem = rem[len(part) :]
+                # 4) merge into one Doc for this paragraph
+                subdocs = list(nlp.pipe(final_chunks, batch_size=20))
+                doc = Doc.from_docs(subdocs)
+            aozora_docs.append(doc)
+            bar.update()
+    return (aozora_docs,)
+@app.cell
+def display_noun_chunks(aozora_docs: "list[Doc]", mo, pl):
+    """
+    Show the noun chunks of the whole text that consist of at least two
+    tokens, with their frequency and token count, most frequent first.
+    """
+    # build, filter (>=2 tokens), group and sort in one go
+    # Both columns iterate the docs' noun_chunks in the same order, so the
+    # rows line up.
+    top_chunks = (
+        pl.DataFrame(
+            {
+                "chunk_text": [c.text for doc in aozora_docs for c in doc.noun_chunks],
+                "token_count": [len(c) for doc in aozora_docs for c in doc.noun_chunks],
+            }
+        )
+        .filter(pl.col("token_count") >= 2)
+        .group_by("chunk_text")
+        .agg([pl.len().alias("frequency"), pl.first("token_count")])
+        .sort("frequency", descending=True)
     )
+    # The table is not truncated here; page_size=25 sets the rows per page.
+    mo.md(f"""
+    spaCyには様々な機能が内蔵されていて、例えば、`noun_chunks`では[名詞句](https://spacy.io/usage/linguistic-features#noun-chunks)を構文（係り受け）解析結果に基づいて。ここでいう名詞句、すなわち「NPチャンク」とは、他の名詞句がその中に入れ子にならない名詞句のことで、名詞句レベルの並列や前置詞句、関係節は含まない。
+    ### 2語以上からなる名詞句トップ25
+    {mo.ui.dataframe(top_chunks, page_size=25)}
+    > カスタマイズも[可能](https://github.com/explosion/spaCy/blob/41e07772dc5805594bab2997a090a9033e26bf56/spacy/lang/ja/syntax_iterators.py#L12)
+    """)
     return
 @app.cell(hide_code=True)
+def _(mo):
+    # Section header for token-based pattern matching; static markdown only.
     mo.md(
         """
+    ## Token Pattern Matching
+    トークンベースのルールを使用して、短単位で分割された動詞の塊をまとめ上げて観察する。
+    > ここで使用されるルールはあくまでも例で、完璧に動詞の塊をまとめ上げていない。また、短単位より長い単位でテキスト分析する場合は長単位による解析も[可能](https://github.com/komiya-lab/monaka)。
+    """
+    )
+    return
+@app.cell
+def token_pattern():
+    # Token-level Matcher pattern: one or more nouns, then one or more verbs,
+    # then one or more tokens tagged VERB or AUX.
+    ###### Replace the pattern below with one copied from the site
+    pattern = [
+        {"POS": "NOUN", "OP": "+"},
+        {"POS": "VERB", "OP": "+"},
+        {"POS": {"REGEX": "VERB|AUX"}, "OP": "+"},
+    ]
+    #####################################################
+    return (pattern,)
+@app.cell
+def token_pattern_match(aozora_docs: "list[Doc]", mo, nlp, pattern, pl, spacy):
+    # Run the token pattern over every paragraph Doc, highlight the first
+    # matches in their sentences and tabulate the most frequent match texts.
+    # https://spacy.io/usage/rule-based-matching#example1
+    from spacy.matcher import Matcher
+    matcher = Matcher(nlp.vocab)
+    matched_sentences = []  # Collect data of matched sentences to be visualized
+    match_texts: list[str] = []
+    def collect_sents(matcher, doc, i, matches):
+        # on_match callback: records the i-th match of the current doc.
+        match_id, start, end = matches[i]
+        span = doc[start:end]  # Matched span
+        sent = span.sent  # Sentence containing matched span
+        # get the match span by offsetting the start/end of the span
+        # (displaCy's manual mode expects offsets relative to the given text)
+        match_ents = [
+            {
+                "start": span.start_char - sent.start_char,
+                "end": span.end_char - sent.start_char,
+                "label": "ヒット",
+            }
+        ]
+        matched_sentences.append({"text": sent.text, "ents": match_ents})
+        match_texts.append(span.text)
+    matcher.add("MyPattern", [pattern], on_match=collect_sents)  # add pattern
+    # run matcher over each paragraph
+    # The return value is not used; results are gathered by the callback.
+    for p_doc2 in aozora_docs:
+        matcher(p_doc2)
+    # only show first 10 matches
+    MAX_PATTERN_MATCHES = 10
+    viz_html = spacy.displacy.render(
+        matched_sentences[:MAX_PATTERN_MATCHES], style="ent", manual=True
+    )
+    # build top‐25 frequency table of matched span texts
+    df = pl.DataFrame({"match_text": match_texts})
+    top_matches = (
+        df.group_by("match_text")
+        .agg(pl.len().alias("frequency"))
+        .sort("frequency", descending=True)
+        .head(25)
     )
+    # display the displaCy‐rendered HTML *and* the frequency table
+    mo.vstack([mo.Html(viz_html), top_matches])
     return
 @app.cell(hide_code=True)
+def _(mo):
     mo.md(
         """
+    ## Dependency Pattern Matching
+    係り受けパターンのルールを記述し、動詞と名詞が[nsubj](https://universaldependencies.org/ja/dep/nsubj.html) (nominal subject) という係り受け関係にあるもの、すなわち動詞とその主語を抽出する。
+    > 係り受け解析は形態素解析のタスクより複雑、その解析制度がより低い。ここでは`ja_ginza`という軽量なモデルを使用しているが、解析制度を求めるのであれば、Transformerベースモデルを使用するとよい。
+    """
     )
     return
 @app.cell
+def dependency_pattern():
+    ###### ここにサイトからコピーしたパターンを入れ変える
+    # this is your dependency‐matcher pattern
+    dep_pattern = [
+        {"RIGHT_ID": "anchor_verb", "RIGHT_ATTRS": {"POS": "VERB"}},
+        {
+            "LEFT_ID": "anchor_verb",
+            "REL_OP": ">",
+            "RIGHT_ID": "verb_subject",
+            "RIGHT_ATTRS": {"DEP": "nsubj"},
+        },
+    ]
+    #####################################################
+    return (dep_pattern,)
+@app.cell
+def show_dependency_matches(
+    aozora_docs: "list[Doc]",
+    dep_pattern,
+    mo,
+    nlp,
+    pl,
+    spacy,
+):
+    from spacy.matcher import DependencyMatcher
+    dep_matcher = DependencyMatcher(nlp.vocab)
+    viz_dep_sents: list[dict] = []
+    dep_pairs: list[dict[str, str]] = []
+    def collect_deps(matcher, doc, i, matches):
+        _, token_ids = matches[i]
+        sent = doc[token_ids[0]].sent
+        # map each RIGHT_ID to its matched Token
+        rid_to_tok = {
+            pat["RIGHT_ID"]: doc[tok_id] for pat, tok_id in zip(dep_pattern, token_ids)
+        }
+        verb = rid_to_tok["anchor_verb"]
+        subj = rid_to_tok["verb_subject"]
+        # build ents for displaCy
+        ents = []
+        for rid, tok in rid_to_tok.items():
+            label = "subject" if rid == "verb_subject" else "verb"
+            ents.append(
+                {
+                    "start": tok.idx - sent.start_char,
+                    "end": tok.idx + len(tok) - sent.start_char,
+                    "label": label,
+                    "text": tok.text,
+                }
+            )
+        viz_dep_sents.append({"text": sent.text, "ents": ents})
+        dep_pairs.append({"subject": subj.text, "verb": verb.text})
+    dep_matcher.add("MyDepPattern", [dep_pattern], on_match=collect_deps)
+    for dep_doc in aozora_docs:
+        dep_matcher(dep_doc)
+    dep_viz_html = spacy.displacy.render(viz_dep_sents[:10], style="ent", manual=True)
+    dep_df = pl.DataFrame(dep_pairs)
+    top_dep_matches = (
+        dep_df.group_by(["subject", "verb"])
+        .agg(pl.len().alias("frequency"))
+        .sort("frequency", descending=True)
+    )
+    mo.vstack(
+        [
+            mo.Html(dep_viz_html),
+            top_dep_matches,
+        ]
     )
     return
+@app.cell
+def _():
+    # Empty cell: defines nothing and produces no output.
+    return
 if __name__ == "__main__":

pyproject.toml ADDED Viewed

	@@ -0,0 +1,23 @@

+# Project metadata and dependencies for the marimo demo app.
+[project]
+# NOTE(review): the project name is empty here — confirm this is intentional.
+name = ""
+version = "0.1.0"
+description = "Regex and Python based preprocessing demo"
+readme = "README.md"
+requires-python = ">=3.12"
+# Entries with "==" are pinned to an exact version; the others are unpinned.
+dependencies = [
+     "aozora-corpus-generator==0.1.1",
+     "cdifflib==1.2.9",
+     "ginza",
+     "ja-ginza",
+     "ipython==7.23.1",
+     "marimo",
+     "polars==1.30.0",
+     "spacy==3.8.7",
+     "wcwidth",
+]
+# aozora-corpus-generator is fetched from its Git repository.
+[tool.uv.sources]
+aozora-corpus-generator = { git = "https://github.com/borh/aozora-corpus-generator.git" }
+# Do not install prebuilt wheels for these packages.
+# NOTE(review): presumably why the Dockerfile installs gcc and the
+# libxml2/libxslt headers — confirm.
+[tool.uv]
+no-binary-package = ["html5-parser", "lxml"]

requirements.txt DELETED Viewed

@@ -1,5 +0,0 @@
-marimo
-# Or a specific version
-# marimo>=0.9.0
-# Add other dependencies as needed

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

wagahaiwa_nekodearu.txt ADDED Viewed

The diff for this file is too large to render. See raw diff