Spaces:
Sleeping
Sleeping
| # /// script | |
| # requires-python = ">=3.12" | |
| # dependencies = [ | |
| # "aozora-corpus-generator==0.1.1", | |
| # "cdifflib==1.2.9", | |
| # "ginza", | |
| # "ja-ginza", | |
| # "ipython==7.23.1", | |
| # "marimo", | |
| # "polars==1.30.0", | |
| # "spacy==3.8.7", | |
| # "wcwidth", | |
| # ] | |
| # | |
| # [tool.uv.sources] | |
| # aozora-corpus-generator = { git = "https://github.com/borh/aozora-corpus-generator.git" } | |
| # /// | |
| import marimo | |
# marimo version string recorded for this notebook file.
__generated_with = "0.13.15"
# Notebook application object; it is run by the script entry point via app.run().
app = marimo.App()
def _(mo):
    """Render the bilingual (English / Japanese) introduction to the notebook.

    The English step list is numbered 1-10 so that it lines up with the
    Japanese list below it (the step after "Compare against the regex
    variant" was previously numbered 6 a second time).

    Args:
        mo: The marimo module, used for ``mo.md``.
    """
    # Plain raw string: the text contains no placeholders, so no f-prefix.
    mo.md(
        r"""
# Aozora Bunko Text Processing Pipeline Demo / 青空文庫テキストの前処理パイプラインデモ
### Summary
This notebook allows you to upload, preprocess, compare, visualize and analyze Aozora Bunko texts.
1. Upload a text file from Aozora Bunko (or use the default sample).
2. Preprocess using customizable regex patterns.
3. Preview the first and last 50 lines of the cleaned text.
4. Download the cleaned text.
5. Process the XHTML version with the `aozora-corpus-generator` Python library for comparison.
6. Compare against the regex variant.
7. Define token matching patterns (not possible in App mode).
8. Visualize token matches.
9. Define dependency matching patterns (not possible in App mode).
10. Visualize dependency matches.
### 概要
このノートブックでは以下の手順で青空文庫テキストを読み込み、前処理、解析、可視化を行います。
1. 青空文庫のテキストファイルをアップロードする(またはデフォルトサンプルを利用する)。
2. 編集可能な正規表現で前処理する。
3. 前処理済みテキストの先頭50行と末尾50行をプレビューし、前処理が正常に本文以外のテキストを除外したか確認する。
4. 前処理済みテキストをダウンロードする。
5. 比較のため、XHTML版をPythonのパッケージで処理する。
6. 正規表現処理版と比較する。
7. トークンマッチング用パターンを定義する(アプリの場合は編集不可)。
8. トークンマッチ結果を可視化する。
9. 係り受け(依存)関係マッチング用パターンを定義する(アプリの場合は編集不可)。
10. 係り受け関係マッチ結果を可視化する。
"""
    )
    return
def _(mo):
    """Show an info callout naming the text used when nothing is uploaded.

    Args:
        mo: The marimo module, used for ``mo.md``.
    """
    _default_text_note = mo.md('''
- By default, this demo uses Natsume Soseki's _‘Wagahai wa neko de aru’_
- ファイルをアップロードしない場合は、デフォルトで夏目漱石『吾輩は猫である』が使用されます。
''')
    # The final bare expression is what the cell displays.
    _default_text_note.callout(kind="info")
    return
def _():
    """Import the libraries shared by later cells and load the GiNZA pipeline.

    Returns:
        ``(Doc, mo, nlp, pl, re, spacy)`` for use by the other cells.
    """
    import re

    import marimo as mo
    import polars as pl
    import spacy
    from spacy.tokens import Doc

    # "ja_ginza_electra" / "ja_ginza_bert_large" can be used instead if installed.
    _model_name = "ja_ginza"
    nlp = spacy.load(_model_name)
    return Doc, mo, nlp, pl, re, spacy
def upload_aozora_text(mo):
    """Create the single-file upload widget for an Aozora Bunko text file.

    Args:
        mo: The marimo module, used for ``mo.ui.file``.

    Returns:
        A one-element tuple holding the upload widget.
    """
    aozora_file = mo.ui.file(
        multiple=False,
        label="Upload Aozora-Bunko text (.txt)",
    )
    return (aozora_file,)
def select_encoding(mo):
    """Build the dropdown for choosing the uploaded text file's encoding.

    The choices are "shift-jis" and "utf-8"; "shift-jis" is preselected.

    Args:
        mo: The marimo module, used for ``mo.ui.dropdown``.

    Returns:
        A one-element tuple holding the dropdown.
    """
    _choices = ["shift-jis", "utf-8"]
    encoding = mo.ui.dropdown(
        label="Text file encoding / 文字コード",
        options=_choices,
        value=_choices[0],
        full_width=False,
    )
    return (encoding,)
def _(aozora_file, encoding, mo):
    """Show the upload widget and encoding dropdown side by side under a heading.

    Args:
        aozora_file: The text-file upload widget.
        encoding: The encoding dropdown.
        mo: The marimo module.
    """
    _widgets = [aozora_file, encoding]
    ab_upload_ui = mo.hstack(_widgets)
    mo.md(f"## 青空文庫テキストファイル設定\n{ab_upload_ui}")
    return
def load_aozora_text(aozora_file, encoding):
    """Return the raw text of the upload, or of the local sample file.

    Args:
        aozora_file: Upload widget; a falsy ``.value`` means nothing was
            uploaded, otherwise ``.contents()`` supplies the bytes.
        encoding: Dropdown whose ``.value`` names the codec used to decode
            the uploaded bytes.

    Returns:
        A one-element tuple holding the decoded text.
    """
    if not aozora_file.value:
        # No upload: read the local sample, which is always opened as Shift-JIS.
        with open("wagahaiwa_nekodearu.txt", encoding="shift-jis") as f:
            text_raw = f.read()
    else:
        text_raw = aozora_file.contents().decode(encoding.value)
    return (text_raw,)
def show_raw_head(mo, text_raw):
    """Display the first and last 500 characters of the raw text.

    Lets the reader inspect the header and footer before any cleaning.

    Args:
        mo: The marimo module.
        text_raw: The undecorated text as loaded.
    """
    _head, _tail = text_raw[:500], text_raw[-500:]
    mo.md(
        f"""
## 青空文庫のヘッダーとフッターを確認
最初の500字
```raw
{_head}
```
最後の500字
```raw
{_tail}
```
"""
    )
    return
| def regex_inputs(mo): | |
| ruby_pattern = mo.ui.text( | |
| value=r"《[^》]+》", | |
| label="ルビ", | |
| full_width=True, | |
| ) | |
| ruby_bar_pattern = mo.ui.text( | |
| value=r"|", | |
| label="ルビのかかる範囲を示す記号", | |
| full_width=True, | |
| ) | |
| annotation_pattern = mo.ui.text( | |
| value=r"[#[^]]+?]", | |
| label="注釈・アノテーション", | |
| full_width=True, | |
| ) | |
| hajime_pattern = mo.ui.text( | |
| value=r"-{55}(.|\n)+?-{55}", | |
| label="青空文庫のヘッダー", | |
| full_width=True, | |
| ) | |
| owari_pattern = mo.ui.text( | |
| value=( | |
| r"^[ 【]?(底本:|訳者あとがき|この翻訳は|この作品.*翻訳|" | |
| r"この翻訳.*全訳)" | |
| ), | |
| label="青空文庫のフッター", | |
| full_width=True, | |
| ) | |
| regexes = mo.vstack( | |
| [ | |
| ruby_pattern, | |
| ruby_bar_pattern, | |
| annotation_pattern, | |
| hajime_pattern, | |
| owari_pattern, | |
| ] | |
| ) | |
| mo.md(f"""## 正規表現による前処理 | |
| (必要な場合は修正) | |
| {regexes} | |
| """) | |
| return ( | |
| annotation_pattern, | |
| hajime_pattern, | |
| owari_pattern, | |
| ruby_bar_pattern, | |
| ruby_pattern, | |
| ) | |
| def clean_aozora( | |
| annotation_pattern, | |
| hajime_pattern, | |
| mo, | |
| owari_pattern, | |
| re, | |
| ruby_bar_pattern, | |
| ruby_pattern, | |
| text_raw, | |
| ): | |
| # compile from user‐editable patterns | |
| ruby_rx = re.compile(ruby_pattern.value) | |
| ruby_bar_rx = re.compile(ruby_bar_pattern.value) | |
| annotation_rx = re.compile(annotation_pattern.value) | |
| hajime_rx = re.compile(hajime_pattern.value) | |
| owari_rx = re.compile(owari_pattern.value, re.M) | |
| def clean_text(text: str) -> tuple[str, str, str]: | |
| """青空文庫テキスト形式の文字列textを入力とし,改行方式の統一,ルビーと各種のアノーテーションの削除, | |
| 青空文庫特有のヘッダーとフッターを取り除く処理を行う。""" | |
| title, author, text = (text.split("\n", 2) + ["", ""])[:3] | |
| # 青空文庫では改行がDOS形式の\r\nのため,それをUNIX形式の\nに統一する。 | |
| cleaned_text = re.sub(r"(\r\n)+", "\n", text) | |
| # ルビ《...》の記号とその中身を削除 | |
| cleaned_text = re.sub(ruby_rx, "", cleaned_text) | |
| # ルビのもう一つの書き方に対応:「一番|獰悪《どうあく》」 | |
| cleaned_text = re.sub(ruby_bar_rx, "", cleaned_text) | |
| # 注釈対応:「※[#「言+墟のつくり」、第4水準2-88-74]」 | |
| cleaned_text = re.sub(annotation_rx, "", cleaned_text) | |
| # 本文までのテキストを削除 | |
| cleaned_text = re.sub(hajime_rx, "", cleaned_text) | |
| # 本文の後のテキストを削除 | |
| maybe_owari = owari_rx.search(cleaned_text) | |
| if maybe_owari: | |
| return (title, author, cleaned_text[0 : maybe_owari.start()].strip()) | |
| return (title, author, cleaned_text.strip()) | |
| title, author, cleaned_text = clean_text(text_raw) | |
| mo.md(f"""### 前処理結果の確認 | |
| - 著者:`{author}` | |
| - タイトル:`{title}` | |
| 最初の100字 | |
| ```raw | |
| {cleaned_text[:100]} | |
| ``` | |
| 最後の100字 | |
| ```raw | |
| {cleaned_text[-100:]} | |
| ``` | |
| """) | |
| return author, cleaned_text, title | |
def download_cleaned_text(author, cleaned_text, mo, title):
    """Offer the cleaned text as a UTF-8 ``<author>_<title>.txt`` download.

    Args:
        author: Author line taken from the text.
        cleaned_text: The cleaned body text.
        mo: The marimo module.
        title: Title line taken from the text.
    """
    _payload = cleaned_text.encode("utf-8")
    download_link = mo.download(
        filename=f"{author}_{title}.txt",
        mimetype="text/plain",
        data=_payload,
    )
    mo.md(f"""
前処理済みファイルのダウンロード (UTF-8):
{download_link}
""")
    return
def get_alternative_file(mo):
    """Create the optional (X)HTML upload widget and its encoding dropdown.

    Args:
        mo: The marimo module.

    Returns:
        ``(aozora_xhtml_file, xhtml_encoding)``.
    """
    _encodings = ["shift-jis", "utf-8"]
    aozora_xhtml_file = mo.ui.file(
        multiple=False,
        label="Upload Aozora-Bunko text (.html)",
    )
    xhtml_encoding = mo.ui.dropdown(
        label="Text file encoding",
        options=_encodings,
        value=_encodings[0],
        full_width=False,
    )
    mo.md(f"""
## HTMLを使用した前処理との比較(オプショナル)
プレインテキスト版を正規表現で前処理した結果を、(X)HTML版をPythonで処理した結果を比較したい場合は同じ作品のHTMLファイルをアップします。
{aozora_xhtml_file}
{xhtml_encoding}
""")
    return aozora_xhtml_file, xhtml_encoding
def show_natsume_head(aozora_xhtml_file, mo, xhtml_encoding):
    """Parse the (X)HTML edition and preview its first and last 200 characters.

    Uses the uploaded file when there is one, otherwise the local
    ``789_14547.html``. The bytes are passed undecoded to
    ``parse_aozora_bunko_xml_content``.

    Args:
        aozora_xhtml_file: The (X)HTML upload widget.
        mo: The marimo module.
        xhtml_encoding: The encoding dropdown for the (X)HTML file.

    Returns:
        A one-element tuple holding the extracted text.
    """
    from aozora_corpus_generator.aozora import parse_aozora_bunko_xml_content

    # NOTE(review): the selected encoding is read here but never passed on;
    # confirm whether the parser is meant to receive it.
    xhtml_enc = xhtml_encoding.value
    if not aozora_xhtml_file.value:
        with open("789_14547.html", "rb") as xhtml_f:
            xhtml_raw = xhtml_f.read()
    else:
        xhtml_raw = aozora_xhtml_file.contents()
    aozora_xhtml_processed_text = parse_aozora_bunko_xml_content(
        xhtml_raw, do_tokenize=False
    )["text"]
    _head = aozora_xhtml_processed_text[:200]
    _tail = aozora_xhtml_processed_text[-200:]
    mo.md(f"""
HTML版の最初の200字
```raw
{_head}
```
HTML版の最後の200字
```raw
{_tail}
```
""")
    return (aozora_xhtml_processed_text,)
def _(aozora_xhtml_processed_text, author, mo, title):
    """Offer the text extracted from the (X)HTML edition as a UTF-8 download.

    The file is named ``<author>_<title>_xhtml.txt``.
    """
    _payload = aozora_xhtml_processed_text.encode("utf-8")
    xhtml_download_link = mo.download(
        filename=f"{author}_{title}_xhtml.txt",
        mimetype="text/plain",
        data=_payload,
    )
    mo.md(f"""
HTML版の前処理済みファイルをダウンロード (UTF-8):
{xhtml_download_link}
""")
    return
def _():
    """Define ``diff_changes``, an HTML diff that shows only changed text.

    Running this cell also replaces ``difflib.SequenceMatcher`` with
    ``cdifflib.CSequenceMatcher``.

    Returns:
        A one-element tuple holding ``diff_changes``.
    """
    import difflib
    import html

    from cdifflib import CSequenceMatcher
    from IPython.display import HTML
    from IPython.display import display_html as display

    difflib.SequenceMatcher = CSequenceMatcher

    DEL_STYLE = "background-color:#f6c6c6;color:#000;"  # red bg, black text
    INS_STYLE = "background-color:#c6f6c6;color:#000;"  # green bg, black text
    WRAP_STYLE = (
        "font-family: ui-monospace, monospace; "
        "white-space: pre-wrap; line-height:1.4; color:#000;"
    )
    # Make whitespace visible: space, tab and newline get marker glyphs.
    WS_MAP = str.maketrans({" ": "␣", "\t": "⇥", "\n": "↩\n"})

    def _escape(txt: str) -> str:
        """Mark whitespace, then HTML-escape the result."""
        return html.escape(txt.translate(WS_MAP))

    def _span(style: str, txt: str) -> str:
        """Wrap the escaped text in a span carrying the given inline style."""
        return f'<span style="{style}">{_escape(txt)}</span>'

    def _char_changes(a: str, b: str) -> str:
        """Return HTML for only the characters that differ between a and b."""
        opcodes = difflib.SequenceMatcher(None, a, b, autojunk=False).get_opcodes()
        pieces = []
        for tag, i1, i2, j1, j2 in opcodes:
            # "replace" emits the removed text followed by the added text;
            # "equal" emits nothing.
            if tag in ("delete", "replace"):
                pieces.append(_span(DEL_STYLE, a[i1:i2]))
            if tag in ("insert", "replace"):
                pieces.append(_span(INS_STYLE, b[j1:j2]))
        return "".join(pieces)

    def diff_changes(a: str, b: str, auto_display: bool = True):
        """Return an inline HTML diff of ``a`` and ``b`` with changes only.

        Lines are matched first; lines paired inside a replaced block are
        then compared character by character. Unchanged lines are omitted.

        Args:
            a: The first text (its unique parts use the deletion style).
            b: The second text (its unique parts use the insertion style).
            auto_display: When true, also display the HTML via IPython.

        Returns:
            The rendered HTML string.
        """
        a_lines = a.splitlines(keepends=True)
        b_lines = b.splitlines(keepends=True)
        outer = difflib.SequenceMatcher(None, a_lines, b_lines, autojunk=True)
        html_chunks = []
        for tag, i1, i2, j1, j2 in outer.get_opcodes():
            if tag == "equal":
                continue
            old_block, new_block = a_lines[i1:i2], b_lines[j1:j2]
            # One side is empty for "delete" / "insert", so nothing is paired
            # and the whole other side falls through as leftovers.
            paired = min(len(old_block), len(new_block))
            html_chunks.extend(
                _char_changes(la, lb) for la, lb in zip(old_block, new_block)
            )
            html_chunks.extend(_span(DEL_STYLE, la) for la in old_block[paired:])
            html_chunks.extend(_span(INS_STYLE, lb) for lb in new_block[paired:])
        rendered = f'<div style="{WRAP_STYLE}">{"".join(html_chunks)}</div>'
        if auto_display:
            display(HTML(rendered))
        return rendered

    return (diff_changes,)
def toggle_diff(mo):
    """Create and show the switch that enables the diff, off by default.

    Returns:
        A one-element tuple holding the switch.
    """
    run_diff = mo.ui.switch(value=False, label="文章間の比較(差分)を表示")
    # The bare expression is the cell's displayed output.
    run_diff
    return (run_diff,)
def compare_preprocessed_vs_old(
    aozora_xhtml_processed_text,
    cleaned_text,
    diff_changes,
    mo,
    run_diff,
):
    """Show the diff between the regex-cleaned text and the (X)HTML-derived text.

    Nothing is computed until the ``run_diff`` switch is on. The legend
    names the colours the diff is called with here: text only in the regex
    version is the first argument, and text only in the HTML version is the
    second. The legend previously said blue for the second entry, which is
    drawn with a green background.

    Args:
        aozora_xhtml_processed_text: Text extracted from the (X)HTML edition.
        cleaned_text: Text cleaned with the regex patterns.
        diff_changes: Callable ``(a, b, auto_display=...)`` returning HTML.
        mo: The marimo module.
        run_diff: Switch whose ``.value`` enables this cell.
    """
    mo.stop(not run_diff.value)
    diff_result = diff_changes(
        cleaned_text, aozora_xhtml_processed_text, auto_display=False
    )
    mo.md(f"""
- 赤: 正規表現版のみにある文字列
- 緑: HTML版のみにある文字列
{diff_result}
""")
    return
def _(mo):
    """Render the heading and notes introducing the spaCy (GiNZA) analysis."""
    _section_intro = r"""
## spaCy (GiNZA) による解析
以下からは、正規表現で前処理したテキストに対して、
- 形態素解析
- 係り受け解析
を行う。
> 作品によっては時間がかかる。
"""
    mo.md(_section_intro)
    return
def _(mo):
    """Create and show the switch that starts the spaCy analysis, off by default.

    Returns:
        A one-element tuple holding the switch.
    """
    run_spacy = mo.ui.switch(value=False, label="spaCyで解析する")
    # The bare expression is the cell's displayed output.
    run_spacy
    return (run_spacy,)
def process_aozora_text(Doc, cleaned_text, mo, nlp, re, run_spacy):
    """Parse the cleaned text with spaCy, producing one ``Doc`` per paragraph.

    A paragraph of at most ``MAX_BYTES`` UTF-8 bytes is parsed directly. A
    longer one is cut into sentence groups that fit the limit; any single
    piece that is still too long is cut on UTF-8 character boundaries. The
    pieces are parsed separately and merged back into one ``Doc``.

    The merge passes ``ensure_whitespace=False`` so that no space is
    inserted between pieces, and the sentence splitter recognises both
    half- and full-width ``!`` / ``?``.

    Args:
        Doc: spaCy's ``Doc`` class (``Doc.from_docs`` is used for merging).
        cleaned_text: The text to parse.
        mo: The marimo module (``mo.stop`` and the progress bar).
        nlp: The loaded spaCy pipeline.
        re: The ``re`` module.
        run_spacy: Switch whose ``.value`` enables this cell.

    Returns:
        A one-element tuple holding the list of per-paragraph docs.
    """
    mo.stop(not run_spacy.value)

    # Upper bound, in UTF-8 bytes, on any text handed to the pipeline at once.
    # NOTE(review): presumably chosen to stay under the tokenizer's input-size
    # limit - confirm against the GiNZA / Sudachi documentation.
    MAX_BYTES = 40000
    # Captured so that the punctuation stays attached to its sentence.
    _sentence_end_rx = re.compile(r"([。!?!?])")

    def split_text_to_paragraphs(text: str) -> list[str]:
        """Split on runs of newlines, also dropping whitespace that follows."""
        return re.split(r"\n+\s*", text)

    def _split_by_bytes(piece: str) -> list[str]:
        """Cut ``piece`` into parts of at most MAX_BYTES bytes each."""
        raw = piece.encode("utf-8")
        parts: list[str] = []
        start = 0
        while start < len(raw):
            end = min(start + MAX_BYTES, len(raw))
            # Never cut inside a character: UTF-8 continuation bytes have
            # the bit pattern 10xxxxxx, so back up until we are off one.
            while end < len(raw) and (raw[end] & 0xC0) == 0x80:
                end -= 1
            parts.append(raw[start:end].decode("utf-8"))
            start = end
        return parts

    def _chunk_paragraph(para: str) -> list[str]:
        """Split an oversized paragraph into pieces of at most MAX_BYTES."""
        # 1) Sentence-level split; even indices are text, odd ones the
        #    punctuation that ended it.
        parts = _sentence_end_rx.split(para)
        sents = ["".join(parts[i : i + 2]) for i in range(0, len(parts), 2)]
        # 2) Greedily pack whole sentences into chunks within the limit.
        chunks: list[str] = []
        cur, cur_b = "", 0
        for s in sents:
            sb = len(s.encode("utf-8"))
            if cur_b + sb > MAX_BYTES:
                if cur:
                    chunks.append(cur)
                cur, cur_b = s, sb
            else:
                cur += s
                cur_b += sb
        if cur:
            chunks.append(cur)
        # 3) A single sentence can still exceed the limit; cut it by bytes.
        final_chunks: list[str] = []
        for c in chunks:
            if len(c.encode("utf-8")) <= MAX_BYTES:
                final_chunks.append(c)
            else:
                final_chunks.extend(_split_by_bytes(c))
        return final_chunks

    paras = split_text_to_paragraphs(cleaned_text)
    aozora_docs: list[Doc] = []
    with mo.status.progress_bar(total=len(paras), title="spaCy processing") as bar:
        for para in paras:
            if len(para.encode("utf-8")) <= MAX_BYTES:
                doc = nlp(para)
            else:
                subdocs = list(nlp.pipe(_chunk_paragraph(para), batch_size=20))
                # Merge without inserting separator spaces between the pieces.
                doc = Doc.from_docs(subdocs, ensure_whitespace=False)
            aozora_docs.append(doc)
            bar.update()
    return (aozora_docs,)
def display_noun_chunks(aozora_docs: "list[Doc]", mo, pl):
    """Tabulate noun chunks of two or more tokens by frequency.

    Each row gives the chunk text, how often it occurs across all docs, and
    its token count, sorted by descending frequency.

    Args:
        aozora_docs: The parsed paragraphs.
        mo: The marimo module.
        pl: The polars module.
    """
    # Walk the noun chunks once and derive both columns from the same list.
    _chunks = [chunk for doc in aozora_docs for chunk in doc.noun_chunks]
    _chunk_table = pl.DataFrame(
        {
            "chunk_text": [chunk.text for chunk in _chunks],
            "token_count": [len(chunk) for chunk in _chunks],
        }
    )
    _multi_token = _chunk_table.filter(pl.col("token_count") >= 2)
    top_chunks = (
        _multi_token.group_by("chunk_text")
        .agg([pl.len().alias("frequency"), pl.first("token_count")])
        .sort("frequency", descending=True)
    )
    mo.md(f"""
spaCyには様々な機能が内蔵されていて、例えば、`noun_chunks`では[名詞句](https://spacy.io/usage/linguistic-features#noun-chunks)を構文(係り受け)解析結果に基づいて。ここでいう名詞句、すなわち「NPチャンク」とは、他の名詞句がその中に入れ子にならない名詞句のことで、名詞句レベルの並列や前置詞句、関係節は含まない。
### 2語以上からなる名詞句トップ25
{mo.ui.dataframe(top_chunks, page_size=25)}
> カスタマイズも[可能](https://github.com/explosion/spaCy/blob/41e07772dc5805594bab2997a090a9033e26bf56/spacy/lang/ja/syntax_iterators.py#L12)
""")
    return
def _(mo):
    """Render the heading and notes introducing token pattern matching."""
    _section_intro = """
## Token Pattern Matching
トークンベースのルールを使用して、短単位で分割された動詞の塊をまとめ上げて観察する。
> ここで使用されるルールはあくまでも例で、完璧に動詞の塊をまとめ上げていない。また、短単位より長い単位でテキスト分析する場合は長単位による解析も[可能](https://github.com/komiya-lab/monaka)。
"""
    mo.md(_section_intro)
    return
def token_pattern():
    """Define the token-level pattern used by the matcher cell.

    The pattern is one or more nouns, then one or more verbs, then one or
    more tokens whose part of speech matches ``VERB|AUX``.

    Returns:
        A one-element tuple holding the pattern (a list of token specs).
    """
    ###### Replace the pattern below with the one copied from the site
    pattern = [
        {
            "POS": "NOUN",
            "OP": "+",
        },
        {
            "POS": "VERB",
            "OP": "+",
        },
        {
            "POS": {"REGEX": "VERB|AUX"},
            "OP": "+",
        },
    ]
    #####################################################
    return (pattern,)
def token_pattern_match(aozora_docs: "list[Doc]", mo, nlp, pattern, pl, spacy):
    """Run the token pattern over every doc and show the hits.

    Displays the first 10 matched sentences with the match highlighted,
    followed by a table of the 25 most frequent matched strings.

    Args:
        aozora_docs: The parsed paragraphs.
        mo: The marimo module.
        nlp: The spaCy pipeline (its vocab backs the matcher).
        pattern: The token pattern to match.
        pl: The polars module.
        spacy: The spacy module (for displaCy).
    """
    # https://spacy.io/usage/rule-based-matching#example1
    from spacy.matcher import Matcher

    matcher = Matcher(nlp.vocab)
    matched_sentences = []  # sentences with highlight offsets, for displaCy
    match_texts: list[str] = []

    def collect_sents(matcher, doc, i, matches):
        """Matcher callback: record the i-th match and its sentence."""
        _, start, end = matches[i]
        hit = doc[start:end]
        sentence = hit.sent
        # displaCy offsets are relative to the sentence text being shown.
        base = sentence.start_char
        highlight = {
            "start": hit.start_char - base,
            "end": hit.end_char - base,
            "label": "ヒット",
        }
        matched_sentences.append({"text": sentence.text, "ents": [highlight]})
        match_texts.append(hit.text)

    matcher.add("MyPattern", [pattern], on_match=collect_sents)
    # Results are gathered through the callback, one paragraph at a time.
    for p_doc2 in aozora_docs:
        matcher(p_doc2)

    # Only the first few matches are rendered.
    MAX_PATTERN_MATCHES = 10
    viz_html = spacy.displacy.render(
        matched_sentences[:MAX_PATTERN_MATCHES], style="ent", manual=True
    )

    # Frequency table of the matched strings, 25 most frequent first.
    df = pl.DataFrame({"match_text": match_texts})
    _counts = df.group_by("match_text").agg(pl.len().alias("frequency"))
    top_matches = _counts.sort("frequency", descending=True).head(25)

    # Show the displaCy HTML above the frequency table.
    mo.vstack([mo.Html(viz_html), top_matches])
    return
def _(mo):
    """Render the heading and notes introducing dependency pattern matching.

    The note about parsing accuracy uses 精度 (accuracy); it previously read
    制度 (institution), a kanji conversion slip, in both places.

    Args:
        mo: The marimo module, used for ``mo.md``.
    """
    mo.md(
        """
## Dependency Pattern Matching
係り受けパターンのルールを記述し、動詞と名詞が[nsubj](https://universaldependencies.org/ja/dep/nsubj.html) (nominal subject) という係り受け関係にあるもの、すなわち動詞とその主語を抽出する。
> 係り受け解析は形態素解析のタスクより複雑、その解析精度がより低い。ここでは`ja_ginza`という軽量なモデルを使用しているが、解析精度を求めるのであれば、Transformerベースモデルを使用するとよい。
"""
    )
    return
def dependency_pattern():
    """Define the dependency pattern used by the dependency-matcher cell.

    The pattern anchors on a verb and requires a token with the ``nsubj``
    relation attached to it through the ``>`` operator.

    Returns:
        A one-element tuple holding the pattern (a list of node specs).
    """
    ###### Replace the pattern below with the one copied from the site
    # Dependency-matcher pattern: the anchor node first, then its relation.
    _anchor = {
        "RIGHT_ID": "anchor_verb",
        "RIGHT_ATTRS": {"POS": "VERB"},
    }
    _subject = {
        "LEFT_ID": "anchor_verb",
        "REL_OP": ">",
        "RIGHT_ID": "verb_subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    }
    dep_pattern = [_anchor, _subject]
    #####################################################
    return (dep_pattern,)
def show_dependency_matches(
    aozora_docs: "list[Doc]",
    dep_pattern,
    mo,
    nlp,
    pl,
    spacy,
):
    """Run the dependency pattern over every doc and show the hits.

    Displays the first 10 matched sentences with subject and verb
    highlighted, followed by a frequency table of (subject, verb) pairs.

    The table is built with an explicit schema so that a pattern with no
    matches yields an empty table instead of failing on missing columns.

    Args:
        aozora_docs: The parsed paragraphs.
        dep_pattern: The dependency pattern; it must define the node ids
            ``anchor_verb`` and ``verb_subject``.
        mo: The marimo module.
        nlp: The spaCy pipeline (its vocab backs the matcher).
        pl: The polars module.
        spacy: The spacy module (for displaCy).
    """
    from spacy.matcher import DependencyMatcher

    dep_matcher = DependencyMatcher(nlp.vocab)
    viz_dep_sents: list[dict] = []
    dep_pairs: list[dict[str, str]] = []

    def collect_deps(matcher, doc, i, matches):
        """Matcher callback: record the i-th match and its sentence."""
        _, token_ids = matches[i]
        sent = doc[token_ids[0]].sent
        # Pair each pattern node with its matched token, by position.
        rid_to_tok = {
            pat["RIGHT_ID"]: doc[tok_id] for pat, tok_id in zip(dep_pattern, token_ids)
        }
        verb = rid_to_tok["anchor_verb"]
        subj = rid_to_tok["verb_subject"]
        # Highlight spans for displaCy, as offsets into the sentence text.
        ents = []
        for rid, tok in rid_to_tok.items():
            label = "subject" if rid == "verb_subject" else "verb"
            ents.append(
                {
                    "start": tok.idx - sent.start_char,
                    "end": tok.idx + len(tok) - sent.start_char,
                    "label": label,
                    "text": tok.text,
                }
            )
        viz_dep_sents.append({"text": sent.text, "ents": ents})
        dep_pairs.append({"subject": subj.text, "verb": verb.text})

    dep_matcher.add("MyDepPattern", [dep_pattern], on_match=collect_deps)
    for dep_doc in aozora_docs:
        dep_matcher(dep_doc)

    dep_viz_html = spacy.displacy.render(viz_dep_sents[:10], style="ent", manual=True)

    # Without the schema an empty match list gives a frame with no columns,
    # and the group_by below cannot find "subject" / "verb".
    dep_df = pl.DataFrame(
        dep_pairs,
        schema={"subject": pl.String, "verb": pl.String},
    )
    top_dep_matches = (
        dep_df.group_by(["subject", "verb"])
        .agg(pl.len().alias("frequency"))
        .sort("frequency", descending=True)
    )
    mo.vstack(
        [
            mo.Html(dep_viz_html),
            top_dep_matches,
        ]
    )
    return
def _():
    # Empty cell: it defines nothing and displays nothing.
    return
# Run the notebook application when this file is executed as a script.
if __name__ == "__main__":
    app.run()