diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..65a05197f6262d96eb50a750160dac656f7410de --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.keras filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..4a4d272041fec679b862264ee9be15170607c090 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +__pycache__ +/.idea +/local \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000000000000000000000000000000000000..0af83df9594a71bd158b6cf41089a2db6b748ee1 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,220 @@ +# Agent 编码规范 + +## 防御性编程精简 + +避免过度防御性编程,遵循以下原则: + +### 1. None 检查 + +- **不要**进行显式的 None 检查 +- 信任输入数据,让程序在真正的错误点上失败 +- 避免 `if x is not None:` 这样的防御性代码 + +```python +# ❌ 避免 +def process(data): + if data is not None: + return data.value + return None + +# ✅ 推荐 +def process(data): + return data.value +``` + +### 2. 类型检查 + +- **不要**使用 `isinstance`、`type()`、`typeof` 等进行运行时类型检查 +- 依靠类型提示和静态类型检查工具(如 mypy) +- 让 Duck Typing 发挥作用 + +```python +# ❌ 避免 +def calculate(obj): + if isinstance(obj, int): + return obj * 2 + elif isinstance(obj, str): + return obj * 2 + else: + raise TypeError("不支持的类型") + +# ✅ 推荐 +def calculate(obj: int | str) -> int | str: + return obj * 2 +``` + +### 3. 
异常处理 + +- **不要**滥用 try-catch 来压制异常 +- **不要**用 try-catch 让程序"容错"运行 +- 只在真正需要处理异常的地方捕获 +- 让未处理的异常自然抛出,暴露真正的问题 + +```python +# ❌ 避免 - 压制异常 +import logging + +logger = logging.getLogger(__name__) + +def parse_config(path): + try: + with open(path) as f: + return json.load(f) + except Exception as e: + logger.error(f"加载配置失败: {e}") + return {} # 返回空配置让程序继续运行 + +# ✅ 推荐 - 让异常传播 +def parse_config(path): + with open(path) as f: + return json.load(f) + +# ✅ 或仅在必要时转换异常类型 +def parse_config(path): + try: + with open(path) as f: + return json.load(f) + except json.JSONDecodeError as e: + raise ConfigError(f"配置文件格式错误: {e}") from e +``` + +### 4. 原则总结 + +1. **早失败(Fail Fast)** - 让错误尽早暴露,不要试图掩盖 +2. **信任调用方** - 假设调用方会提供正确的输入 +3. **清晰错误信息** - 让异常信息直接指出问题所在 +4. **代码简洁** - 减少不必要的检查代码,专注于业务逻辑 + +--- + +**核心信条**:清晰的代码比健壮的代码更重要。让错误暴露,让问题可见。 + +## 编辑文件时的精准修改原则 + +在进行代码编辑时,**只修改必要的部分**,不要进行任何无关改动: + +### 禁止的无关改动 + +- **不要**调整代码缩进或格式 +- **不要**重排 import 语句的顺序 +- **不要**添加或删除空行 +- **不要**修改注释(除非任务明确要求) +- **不要**修改变量名、函数名等标识符(除非任务明确要求) +- **不要**进行任何代码重构(除非任务明确要求) + +### ✅ 正确示例 + +如果任务是将 `import config` 改为 `from mini_gpt import config`: + +```python +# 修改前 +import config +from typing import Callable + +# 修改后 - 只修改 import 语句,其他保持不变 +from mini_gpt import config +from typing import Callable +``` + +### ❌ 错误示例 + +#### 示例1:无关地调整 import 顺序 + +```python +# 修改前 +import config +from typing import Callable + +# 错误 - 无关地调整了 import 顺序 +from typing import Callable +from mini_gpt import config +``` + +#### 示例2:无关地修改函数参数格式 + +```python +# 修改前 +def my_function( + param1, + param2, + param3, +): + pass + +# 错误 - 任务只要求修改函数体,却无关地修改了参数格式 +def my_function( + param1, # 调整了缩进宽度 + param2, + param3 # 去掉了尾部逗号 +): + pass +``` + +**原则**:最小化改动范围,只改必须改的地方。 + +## 运行单元测试 + +本项目使用 pytest 运行单元测试,必须在 `mini-gpt` conda 环境中执行。 + +### 运行命令 + +```bash +/Users/run/anaconda3/envs/mini-gpt/bin/python -m pytest test/ -v +``` + +### 重要提示 + +1. **必须使用 mini-gpt 环境** - 基础环境缺少 tensorflow 依赖,会导致测试收集失败 +2. 
**不要添加 `pytest.importorskip("tensorflow")`** - 这些测试依赖 tensorflow,跳过会掩盖真正的问题 + +## Python 代码风格 + +### 禁止尾逗号 + +**任何情况下都不应出现尾逗号**(trailing comma)。 + +```python +# ❌ 避免 - 尾逗号 +my_list = [ + 1, + 2, + 3, +] + +# ✅ 推荐 +my_list = [ + 1, + 2, + 3 +] + +# ❌ 避免 - 函数参数尾逗号 +def my_func( + arg1, + arg2, +): + pass + +# ✅ 推荐 +def my_func( + arg1, + arg2 +): + pass + +# ❌ 避免 - 字典尾逗号 +my_dict = { + "key1": "value1", + "key2": "value2", +} + +# ✅ 推荐 +my_dict = { + "key1": "value1", + "key2": "value2" +} +``` + +## 禁止命令行参数 + +永远不要在代码中使用命令行参数(如 `argparse`、`sys.argv` 等)。配置应通过代码中硬编码实现。 \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..239350aa4d30b5b0cdc5207a7e5d288305ef3059 --- /dev/null +++ b/README.md @@ -0,0 +1,146 @@ +--- +title: General Deep Learning +emoji: 🏃 +colorFrom: yellow +colorTo: gray +sdk: gradio +sdk_version: 6.12.0 +app_file: app.py +pinned: false +license: mit +short_description: General Deep Learning is a practical deep learning experimen +--- + +# 通用深度学习(General Deep Learning) + +## 项目简介 + +**通用深度学习(General Deep Learning)** 是一个面向实践的深度学习实验平台,致力于打造"训练-部署-体验"一体化的完整工作流。 + +### ✨ 为什么适合你? + +**🎯 我的愿景** +- 构建一个**从零开始、透明可学、工程模块化**的深度学习平台。 + +**🎓学习友好** +- ✅ **纯手工从零构建** - Transformer、RNN 都是一行行代码手撸 +- ✅ **代码即教程** - 没有黑盒封装,每个组件清晰可见 +- ✅ **完整的训练闭环** - 从数据处理到部署,全流程透明 +- +**🔧 技术特性** +- ✅ **覆盖主流模型** - Transformer、RNN,未来将扩展至 CNN、Diffusion 等 +- ✅ **模块化架构** - 可插拔设计,新模型/新数据集快速接入 +- ✅ **生产级部署** - 一键部署到 Hugging Face,支持断点续训、TensorBoard 监控 + +### 📅 关于这个项目 + +> *历时俩月,忙里偷闲。* + +这不是一个追求最新模型的项目,而是一个**"代码即教程"**的个人实验场。 + +**已完成功能**: +- Wiki GPT - 基于中文维基的手写 Transformer +- 诗歌生成器 - GPT 和 RNN 双版本对比 + +**未来规划**: +4 月有事不再投入,5 月开始计划每月新增一个模型,探索更多架构(CNN、Diffusion...) + +- 🔮 逐步扩展至 CV、多模态等领域 +- 🔮 保持"从零手撸"的风格,让每个新模型都成为学习素材 + +**欢迎一起折腾** —— 反馈问题、贡献代码,或单纯聊聊技术! 
+ +### 🤗 在线体验 + +[![Hugging Face Space](https://img.shields.io/badge/🤗-Hugging%20Face%20Space-blue)](https://huggingface.co/spaces/yetrun/general-deep-learning) + +🚀 **在线体验**:[点击访问 Hugging Face Space](https://huggingface.co/spaces/yetrun/general-deep-learning) + +本项目已部署到 Hugging Face Space,你可以在线体验以下功能: + +- **Wiki GPT 文本生成**:基于 Transformer 架构的中文文本生成,训练数据来自中文维基语料库 +- **诗歌生成器(GPT)**:基于 Transformer 的中文诗歌生成,支持五言、七言诗等 +- **诗歌生成器(RNN)**:基于 RNN 架构的中文诗歌生成,支持五言、七言诗等 + +## 部署说明 + +本项目已配置为 Hugging Face Space 兼容格式,如需更新部署: + +```bash +# 1. 在 Hugging Face 创建新的 Space(选择 Gradio SDK) +# 2. 绑定 Space 远程仓库 +git remote add huggingface https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME +# 3. 确保依赖同步(生成 requirements.txt) +python3 generate_requirements.py +# 4. 提交并推送 +git push huggingface master +``` + +## 本地开发 + +### Conda 环境使用 + +使用方法: +```bash +# 创建环境 +conda env create -f ENV_FILE +# 激活环境 +conda activate general-dl +# 更新 environment.yml +conda env update -f ENV_FILE --prune +``` + +上述 `ENV_FILE` 是环境配置文件的路径,需要替换成实际的文件名: + +- 如果你是本地开发,使用 `environment.yml`(Mac Intel 64 环境,`ENV=test`) +- 如果你是在远程服务器上运行,使用 `environments-linux.yml`(Linux 服务器环境,`ENV=production`) + +> **插曲:** +> +> 环境配置出现了问题,强制重新安装 tensorflow-text 才修复。 +> +> ```bash +> pip uninstall tensorflow-text -y +> pip install --no-cache-dir --force-reinstall tensorflow-text==2.20.0 +> ``` + +### 开发工具配置 + +#### TensorBoard 说明 + +训练时,调用 `tensorboard --logdir=LOG_DIR` 来启动 TensorBoard,默认访问地址是 http://localhost:6006/. + +`LOG_DIR` 通常是 `local/tasks/TASK_NAME/tensorboard`,其中 `TASK_NAME` 是任务名称。 + +> 冷知识:tensorboard 中的代数与我们常规认为的代数不一致,第一代的计数是 0. + +#### JetBrains 远程开发配置 + +配置本地代码映射: + +1. 菜单栏:Tools → Deployment → Configuration +2. 配置目录映射:切换到Mappings标签页,Deployment path 设置远程目录路径 +3. 配置排除目录,一般可排除的本地目录包括:`data/dev`, `local`, `test`. + +手工同步: + +- 右键文件/目录 → Deployment → Upload to... 
+ +## 数据集说明 + +### WIKI 数据集 + +*(本项目中 `wiki_gpt` 任务使用了中文维基语料库进行训练)* + +下载维基百科的数据。 + +```bash +wget https://dumps.wikimedia.org/other/mediawiki_content_current/zhwiki/2026-01-01/xml/bzip2/zhwiki-2026-01-01-p1p5254490.xml.bz2 +wget https://dumps.wikimedia.org/other/mediawiki_content_current/zhwiki/2026-01-01/xml/bzip2/zhwiki-2026-01-01-p5254491p9382552.xml.bz2 +``` + +维基百科的数据分成两个文件,可使用 cat 命令合并成一个文件: + +```bash +cat zhwiki-2026-01-01-p1p5254490.xml.bz2 zhwiki-2026-01-01-p5254491p9382552.xml.bz2 > zhwiki-2026-01-01.xml.bz2 +``` diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..b5da2ba2a980b94230f2b89cd6ec3f8380c2721f --- /dev/null +++ b/app.py @@ -0,0 +1,58 @@ +""" +AI 文本生成工具集 - 多页面 Gradio 应用 + +入口点,提供导航到各个子应用: +- /:首页导航 +- /wiki_gpt:Wiki GPT 文本生成器 +- /poetry_gpt:诗歌生成器(GPT) +- /poetry_rnn:诗歌生成器(RNN) + +特点: +- 每个子页面可以独立运行测试 +""" +import gradio as gr + +from tasks.wiki_gpt.gradio import demo as wiki_gpt_demo +from tasks.poetry_gpt.gradio import demo as poetry_gpt_demo +from tasks.poetry_rnn.gradio import demo as poetry_rnn_demo + + +with gr.Blocks(title="AI 文本生成工具集") as demo: + gr.Markdown("# AI 文本生成工具集") + gr.Markdown("请选择要使用的应用:") + + with gr.Row(): + with gr.Column(): + gr.Markdown("## 诗歌生成器(GPT)") + gr.Markdown("基于 Transformer 的中文诗歌生成,支持五言、七言诗等。") + gr.Button("进入诗歌生成器", link="/poetry_gpt") + + with gr.Column(): + gr.Markdown("## 诗歌生成器(RNN)") + gr.Markdown("基于 RNN 的中文诗歌生成,支持五言、七言诗等。") + gr.Button("进入诗歌生成器", link="/poetry_rnn") + + with gr.Column(): + gr.Markdown("## Wiki GPT 文本生成") + gr.Markdown("基于 Transformer 的中文文本生成,训练来自于中文维基语料库。") + gr.Button("进入 Wiki GPT", link="/wiki_gpt") + + gr.Markdown("---") + gr.Markdown("### 说明") + gr.Markdown("每个应用都是独立加载的,进入页面后需要等待模型加载完成。") + + +with demo.route("诗歌生成器(GPT)", "/poetry_gpt"): + poetry_gpt_demo.render() + + +with demo.route("诗歌生成器(RNN)", "/poetry_rnn"): + poetry_rnn_demo.render() + + +with demo.route("Wiki GPT", "/wiki_gpt"): + wiki_gpt_demo.render() + + +if __name__ == 
"__main__": + demo.launch() diff --git a/data/__init__.py b/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..be71827e6400343062b90979466dabfb83276a05 --- /dev/null +++ b/data/__init__.py @@ -0,0 +1,25 @@ +"""数据集模块 + +提供统一的数据集接口,包括 Wiki 和诗歌数据集。 + +Usage: + from data import WikiDataset, PoetryDataset + + # Wiki 数据集 + wiki = WikiDataset(data_dir="~/data/wiki/mini_c4") + doc_ds = wiki.doc_ds() + tokens_ds = wiki.tokens_ds(seq_length=256, batch_size=32) + wiki.stat(seq_length=256) + + # 诗歌数据集 + poetry = PoetryDataset(data_dir="~/data/Poetry") + doc_ds = poetry.doc_ds() + tokens_ds = poetry.tokens_ds(seq_length=100, batch_size=128) + poetry.stat(seq_length=100) +""" + +from data.base import DataBundle, TokenizerBundle +from data.wiki import WikiDataset +from data.poetry import PoetryDataset + +__all__ = ["DataBundle", "TokenizerBundle", "WikiDataset", "PoetryDataset"] diff --git a/data/base.py b/data/base.py new file mode 100644 index 0000000000000000000000000000000000000000..f5d43f2924078cabe28d696b5e4605cd8dba424e --- /dev/null +++ b/data/base.py @@ -0,0 +1,85 @@ +"""数据集抽象基类模块 + +定义 DataBundle 抽象基类,统一数据集的接口规范。 +每个具体的数据集(如 Wiki、诗歌)都应该继承此类并实现相应方法。 +""" + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Callable, Optional + +import tensorflow as tf + + +@dataclass +class TokenizerBundle: + """分词器信息包装类 + + 将分词器相关的属性打包在一起,简化 DataBundle 接口。 + """ + + tokenizer: Callable + decode: Callable + end_of_text: int + vocab_size: int + vocab_path: str = "" + + +@dataclass +class DataBundle(ABC): + """数据集抽象基类 + + 将数据加载、分词、统计等功能绑定在一起,提供统一的数据集接口。 + + Usage: + dataset = WikiDataset(data_dir="~/data/wiki") + doc_ds = dataset.doc_ds() + tokens_ds = dataset.tokens_ds(seq_length=256, batch_size=32) + dataset.stat() + """ + + data_dir: str + sequence_length: int = 256 + + @abstractmethod + def doc_ds(self) -> tf.data.Dataset: + """返回原始文档数据集 + + Returns: + TensorFlow Dataset,每个元素是一个文档字符串 + """ + pass + + 
@abstractmethod + def tokens_ds(self, seq_length: int, batch_size: int) -> tf.data.Dataset: + """返回 tokenized 数据集 + + 将原始文档转换为 token ID 序列,并分割为训练样本。 + + Args: + seq_length: 序列长度 + batch_size: 批次大小 + + Returns: + TensorFlow Dataset,每个元素是 (input_ids, target_ids) 对 + """ + pass + + @abstractmethod + def tokenizer_bundle(self) -> TokenizerBundle: + """返回分词器信息""" + pass + + def stat(self, seq_length: int | None = None) -> None: + """打印数据集统计信息 + + Args: + seq_length: 序列长度,用于估算训练样本数 + """ + from data.common import collect_stats + + info = self.tokenizer_bundle() + stats = collect_stats( + name=self.__class__.__name__, loader=self.doc_ds, tokenizer=info.tokenizer + ) + stats.print_report(seq_length=seq_length) diff --git a/data/common.py b/data/common.py new file mode 100644 index 0000000000000000000000000000000000000000..3e84ad90be5295ab5711d70165569ed215f80c97 --- /dev/null +++ b/data/common.py @@ -0,0 +1,147 @@ +"""数据集共享工具模块 + +提供数据集统计、报告生成等共享功能。 +""" + +import pathlib +from dataclasses import dataclass +from typing import Callable + +import numpy as np +import tensorflow as tf +from keras import layers + + +@dataclass +class DatasetStats: + """数据集统计结果""" + + name: str + doc_count: int + total_chars: int + total_tokens: int + max_length: int + median_length: int + + def print_report(self, seq_length: int | None = 256): + """打印统一格式的统计报表 + + Args: + seq_length: 序列长度,用于估算训练样本数。 + 为 None 时表示不切割,一个文档一个样本。 + """ + avg_chars = self.total_chars / self.doc_count if self.doc_count > 0 else 0 + avg_tokens = self.total_tokens / self.doc_count if self.doc_count > 0 else 0 + + print() + print("=" * 60) + print(f"{self.name} 数据集统计") + print("=" * 60) + print(f"{'文档数:':<20} {self.doc_count:>15,}") + print(f"{'总字符数:':<20} {self.total_chars:>15,}") + print(f"{'总 Token 数:':<20} {self.total_tokens:>15,}") + print("-" * 60) + print(f"{'平均每文档字符数:':<20} {avg_chars:>15.1f}") + print(f"{'平均每文档 Token 数:':<20} {avg_tokens:>15.1f}") + print(f"{'最长文档字符数:':<20} {self.max_length:>15,}") + 
print(f"{'文档长度中位数:':<20} {self.median_length:>15,}") + print("=" * 60) + + if self.total_tokens > 0: + print() + if seq_length is None: + print(f"训练样本数: {self.doc_count:,} 个 (一个文档一个样本)") + else: + print(f"训练样本预估 (seq={seq_length}):") + print(f" 可生成约 {self.total_tokens // seq_length:,} 个训练样本") + + +def collect_stats( + name: str, loader: Callable[[], tf.data.Dataset], tokenizer: Callable +) -> DatasetStats: + """从 DatasetLoader 收集统计数据 + + Args: + name: 数据集名称(用于报表显示) + loader: 返回 tf.data.Dataset 的加载器函数 + tokenizer: 分词器函数,接收文本返回 token ID 列表 + + Returns: + DatasetStats 统计结果对象 + """ + ds = loader() + + doc_count = 0 + total_chars = 0 + total_tokens = 0 + lengths = [] + + for item in ds: + text = item.numpy().decode("utf-8") + if not text.strip(): + continue + + doc_count += 1 + total_chars += len(text) + lengths.append(len(text)) + + # Token 统计,过滤掉末尾的 padding (值为 0 的 token) + try: + import keras + + token_ids = keras.ops.convert_to_numpy(tokenizer(text)) + except ImportError: + # Fallback: assume tokenizer returns numpy array directly + token_ids = np.array(tokenizer(text)) + + # 只去掉末尾的 0,保留中间内容(包括中间的 OOV/padding) + valid_tokens = np.trim_zeros(token_ids, "b") + total_tokens += len(valid_tokens) + + return DatasetStats( + name=name, + doc_count=doc_count, + total_chars=total_chars, + total_tokens=total_tokens, + max_length=max(lengths) if lengths else 0, + median_length=int(np.median(lengths)) if lengths else 0, + ) + + +def save_vocabulary(vocab: list[str], vocab_path: pathlib.Path) -> None: + """保存词汇表到文件 + + Args: + vocab: 词汇表列表 + vocab_path: 保存路径 + """ + vocab_path.parent.mkdir(parents=True, exist_ok=True) + with open(vocab_path, "w", encoding="utf-8") as f: + for char in vocab: + written = char if char != "\n" else r"\n" + f.write(written + "\n") + + +def build_vocab_from_dataset( + doc_ds: tf.data.Dataset, vocab_path: pathlib.Path +) -> list[str]: + """从文档数据集构建词汇表 + + Args: + doc_ds: 文档数据集 + vocab_path: 词汇表保存路径 + + Returns: + 词汇表列表 + """ + vectorizer = 
layers.TextVectorization( + output_mode="int", split="character", standardize=None + ) + vectorizer.adapt(doc_ds, batch_size=128) + + vocab = vectorizer.get_vocabulary() + if "$" not in vocab: + vocab = [*vocab, "$"] + + save_vocabulary(vocab, vocab_path) + return vocab diff --git a/data/dev/mini_c4/file1.txt b/data/dev/mini_c4/file1.txt new file mode 100644 index 0000000000000000000000000000000000000000..413defaba8b39cf20a92cac84a58e9af307cc8c0 --- /dev/null +++ b/data/dev/mini_c4/file1.txt @@ -0,0 +1,3 @@ +first document of first file +second document of first file +third document of first file diff --git a/data/dev/mini_c4/file2.txt b/data/dev/mini_c4/file2.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0b7d284939e523191a8c1ef61c12a497648f592 --- /dev/null +++ b/data/dev/mini_c4/file2.txt @@ -0,0 +1,4 @@ +first document of second file +second document of second file +third document of second file +fourth document of second file diff --git a/data/dev/mini_c4/file3.txt b/data/dev/mini_c4/file3.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce08747bf5964d2d3ef662ecc93ed054d909b89d --- /dev/null +++ b/data/dev/mini_c4/file3.txt @@ -0,0 +1,3 @@ +first document of third file +second document of third file +third document of third file diff --git "a/data/dev/poetry/\345\205\203.csv" "b/data/dev/poetry/\345\205\203.csv" new file mode 100644 index 0000000000000000000000000000000000000000..8b38cfd83007e94a8b937f61372f1c366e67d2c1 --- /dev/null +++ "b/data/dev/poetry/\345\205\203.csv" @@ -0,0 +1,11 @@ +标题,朝代,作者,体裁,内容 +西洱河,元,述律杰,五言排律,洱水何雄壮,源流自邓川。两关龙首尾,九曲势蜿蜒。大理城池固,金汤铁石坚。四洲从古号,三岛至今传。罗阁凭巘崄,蒙人恃极边。要当兵十万,不数客三千。世祖亲征日,初还一统天。雨师清瘴疠,风伯扫氛烟。民物因蕃富,封疆近百年。点苍山色好,铭刻尚依然。 +陟玩春山纪兴,元,忽必烈,七言律诗,时膺韶景陟兰峰,不惮跻攀谒粹容。花色映霞祥彩混,垆烟拂雾瑞光重。雨沾琼干岩边竹,风袭琴声岭际松。净刹玉毫瞻礼罢,回程仙驾驭苍龙。 +结联,元,奥鲁赤,句,久立危栏须北望,无边秋色杳冥冥。 +八月初四日雪坡太守周门拓入云居山中复度岭饮于水月尼寺赋诗书似太守及苏州刺史周义卿,元,杨维桢,七言律诗,文章太守早休牙,五马传呼处士家。好客新分朱露酒,题诗近在白云窝。山中子落千年桂,海上人归八月槎。水月楼头横玉笛,误猜萼绿是韶华。 
+用顾松江韵复理贰守并柬雪坡刺史,元,杨维桢,七言律诗,仙客归来隘九州,身骑黄鹤记南游。乌衣故国江山在,铜柱荒台草木秋。起舞刘琨空有志,登高王粲不胜愁。问君蔗境今何在,祇忆当年顾虎头。 +寄小蓬莱主者闻梅涧并简沈元方宇文仲美贤主宾,元,杨维桢,七言律诗,罗浮主者是仙才,东老诸孙亦俊哉。风雨春城花落尽,江山故国燕归来。酒盟自有乌巾在,笑口应随皓齿开。十八仙人重会处,劫灰不到小蓬莱。 +次韵奉答倪元镇,元,杨维桢,七言律诗,坐断深林事不闻,西窗风日爱余曛。旧经高赤寻三传,新咏山王削五君。翠筱侵床落苍雪,石池洗砚动玄云。东邻书屋最相忆,莫遣草堂移浪文。 +送谢太守,元,杨维桢,七言律诗,朝廷遣使航东海,万里南来送玺书。著屐登山良不恶,分符典郡复何如。白苏事业千年后,吴楚封疆百战馀。今日养民方急务,肯将徵算及舟车。 +送谢太守,元,杨维桢,七言律诗, +回上张太尉(一云"谢赐玳瑁笔见征楚国公碑文"),元,杨维桢,七言律诗,昨夜文星照南极,今朝客省过东维。锦囊颖脱千年兔,斑管光摇九尾龟。墨卷风云随王气,恩分雨露出天池。老夫来草平蛮策,先写新封楚国碑。 \ No newline at end of file diff --git "a/data/dev/poetry/\345\205\210\347\247\246.csv" "b/data/dev/poetry/\345\205\210\347\247\246.csv" new file mode 100644 index 0000000000000000000000000000000000000000..8ca9d666ec809f71b05e1cc5b55cad728c881f66 --- /dev/null +++ "b/data/dev/poetry/\345\205\210\347\247\246.csv" @@ -0,0 +1,5 @@ +标题,朝代,作者,体裁,内容 +禹玉牒辞,先秦,无名氏,古风,祝融司方发其英,沐日浴月百宝生。 +衣铭,先秦,无名氏,古风,桑蚕苦,女工难,得新捐故后必寒。 +书车,先秦,无名氏,古风,出畏之,入惧之。 +击壤歌,先秦,无名氏,古风,日出而作。日入而息。凿井而饮。耕田而食。帝力于我何有哉。 diff --git "a/data/dev/poetry/\345\215\227\345\214\227\346\234\235.csv" "b/data/dev/poetry/\345\215\227\345\214\227\346\234\235.csv" new file mode 100644 index 0000000000000000000000000000000000000000..853964130fd46870f7b0d9fedc7d6d5a4fde8998 --- /dev/null +++ "b/data/dev/poetry/\345\215\227\345\214\227\346\234\235.csv" @@ -0,0 +1,6 @@ +标题,朝代,作者,体裁,内容 +悬瓠方丈竹堂飨侍臣联句诗,南北朝,元宏,古风,白日光天兮无不曜。江左一隅独未照。愿从圣明兮登衡会。万国驰诚混内外。云雷大振兮天门辟。率土来宾一正历。舜舞干戚兮天下归。文德远被莫不思。皇风一鼓兮九地匝。戴日依天清六合。遵彼汝坟兮昔化贞。未若今日道风明。文王政教兮晖江沼。宁如大化光四表。 +歌,南北朝,元宏,句,两菖蒲,新野乐。 +应制赋铜鞮山松诗,南北朝,元协,古风,问松林。松林经几冬。山川何如昔。风云与古同。 +绝命诗二首 其一 ,南北朝,元熙,古风,义实动君子,主辱死忠臣。何以明是节,将解七尺身。 +绝命诗二首 其二 ,南北朝,元熙,古风,平生方寸心,殷勤属知己。从今一销化,悲伤无极已。 diff --git a/data/poetry/__init__.py b/data/poetry/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f2a8f75a0635595ec8609fa8ad68b5b9b136eaec --- /dev/null +++ b/data/poetry/__init__.py @@ -0,0 +1,21 @@ +"""诗歌数据集模块 + +从以下 github 地址下载数据集到目录 ./data/Poetry: + +> https://github.com/xiu-ze/Poetry.git + +数据集的格式是多文件 CSV 格式,统计结果: + +> 找到 22 个 CSV 文件 +> +> 诗歌总数: 
@dataclass
class PoetryDataset(DataBundle):
    """Poetry dataset bundling document loading, tokenization and statistics.

    Usage:
        dataset = PoetryDataset(
            data_dir="~/data/Poetry/诗歌数据集",
            vocab_path="~/data/Poetry/vocabulary.txt",
            sequence_length=100
        )

        doc_ds = dataset.doc_ds()
        tokens_ds = dataset.tokens_ds(seq_length=100, batch_size=128)
        dataset.stat(seq_length=100)
    """

    # Path of the vocabulary file; "~" is expanded in __post_init__.
    vocab_path: str = ""

    # Expanded forms of data_dir / vocab_path, filled in by __post_init__.
    _data_path: pathlib.Path = field(init=False, repr=False)
    _vocab_path: pathlib.Path = field(init=False, repr=False)
    # Cached tokenizer bundle; stays None until first needed.
    _tokenizer_info: Optional[TokenizerBundle] = field(
        init=False, repr=False, default=None
    )

    def __post_init__(self):
        self._data_path = pathlib.Path(self.data_dir).expanduser()
        self._vocab_path = pathlib.Path(self.vocab_path).expanduser()

    def _load_tokenizer(self):
        """Build the tokenizer bundle on first use and cache it."""
        if self._tokenizer_info is not None:
            return

        # One extra position so a sequence can be split into input and
        # target shifted by one token.
        vectorizer = load_vectorizer(self._vocab_path, self.sequence_length + 1)
        vocabulary = vectorizer.get_vocabulary()
        eot_id = vocabulary.index("$")
        size = len(vocabulary)

        def decode(token_ids: list[int]) -> str:
            # Ids at or beyond the vocabulary size are dropped.
            return "".join(vocabulary[i] for i in token_ids if i < size)

        self._tokenizer_info = TokenizerBundle(
            tokenizer=vectorizer,
            decode=decode,
            end_of_text=eot_id,
            vocab_size=size,
            vocab_path=str(self._vocab_path)
        )

    def doc_ds(self) -> tf.data.Dataset:
        """Return the raw document dataset.

        Returns:
            The dataset produced by ``doc_load_with_eot`` for the data directory.
        """
        return doc_load_with_eot(self._data_path)

    def tokens_ds(self, seq_length: int, batch_size: int) -> tf.data.Dataset:
        """Return the tokenized dataset.

        Args:
            seq_length: Accepted for interface compatibility; it is not used
                here. The tokenizer length comes from ``sequence_length``.
            batch_size: Batch size handed to ``transform``.

        Returns:
            The dataset produced by ``transform``.
        """
        self._load_tokenizer()
        return transform(
            ds=self.doc_ds(),
            tokenizer=self._tokenizer_info.tokenizer,
            batch_size=batch_size
        )

    def tokenizer_bundle(self) -> TokenizerBundle:
        """Return the tokenizer bundle, loading it if necessary."""
        self._load_tokenizer()
        return self._tokenizer_info
def load_vocabulary(vocab_path: pathlib.Path) -> list[str]:
    """Load a vocabulary from a text file that stores one token per line.

    A line consisting of the two characters backslash and ``n`` is read back
    as a real newline token, mirroring how the vocabulary writer in this
    repository escapes it.

    Args:
        vocab_path: Path of the vocabulary file (UTF-8).

    Returns:
        The tokens in file order.
    """

    def extract_word(line: str) -> str:
        # Strip only an actual line terminator. Slicing off the last
        # character unconditionally truncated the final token whenever the
        # file did not end with a newline.
        word = line.removesuffix("\n")
        return word if word != r"\n" else "\n"

    with open(vocab_path, "r", encoding="utf-8") as f:
        return [extract_word(line) for line in f]
公共模块 + +提供通用的数据集测试和词汇表生成功能。 + +Usage: + # 在各自 runner.py 中实例化 + from data.runner import DatasetRunner + from data.poetry.dataset import PoetryDataset + from env.resolve import resolve, resolve_saved, resolve_env + + dataset = PoetryDataset( + data_dir=str(resolve_env(resolve("data/dev/poetry"), resolve("~/data/Poetry/诗歌数据集"))), + vocab_path=str(resolve_env(resolve_saved("poetry/vocab.txt"), resolve("~/data/Poetry/vocabulary.txt"))), + sequence_length=100, + ) + runner = DatasetRunner(dataset=dataset, name="poetry") + runner() +""" + +from data.base import DataBundle +from data.common import build_vocab_from_dataset +from env.resolve import resolve_saved +from env.runner import ActionRunner + + +class DatasetRunner(ActionRunner): + """数据集 Runner + + 提供通用的数据集测试和词汇表生成功能。 + + Args: + dataset: 数据集实例(PoetryDataset 或 WikiDataset) + name: 数据集英文名称(如 "poetry", "wiki") + max_docs: 测试时显示的文档数量,默认 5 + max_samples: 测试时显示的 token 样本数量,默认 3 + max_doc_chars: 文档显示的最大字符数,默认 200 + max_text_display: token 文本显示的最大字符数,默认 80 + + Usage: + runner = DatasetRunner(dataset=poetry_dataset, name="poetry") + runner.test_dataset() # 或 runner.build_vocab() + """ + + # 中英文名称映射 + NAME_MAP = { + "poetry": "诗歌", + "wiki": "Wiki", + } + + def __init__( + self, + dataset: DataBundle, + name: str, + max_docs: int = 5, + max_samples: int = 3, + max_doc_chars: int = 200, + max_text_display: int = 80, + ): + self.dataset = dataset + self.name = name + self.display_name = self.NAME_MAP.get(name, name) + self.vocab_path = resolve_saved(f"vocab/{name}/vocab.txt") + self.max_docs = max_docs + self.max_samples = max_samples + self.max_doc_chars = max_doc_chars + self.max_text_display = max_text_display + + def build_vocab(self) -> None: + """生成字符词汇表""" + print(f"正在加载数据集...") + ds = self.dataset.doc_ds() + + print(f"正在保存词汇表到: {self.vocab_path}") + vocab = build_vocab_from_dataset(ds, self.vocab_path) + + print(f"词汇表大小: {len(vocab)}") + print("完成!") + + def test_dataset(self) -> None: + """测试数据集""" + print("\n" + "=" 
* 60) + print(f"{self.display_name} 数据集测试") + print("=" * 60) + + self._view_documents(self.dataset.doc_ds()) + self._view_tokens(self.dataset) + self._show_vocab_info(self.dataset.tokenizer_bundle()) + + print("\n" + "=" * 60) + print("测试完成") + print("=" * 60) + + def _view_documents(self, doc_ds) -> None: + """查看原始文档""" + print("\n【原始文档查看】") + print("-" * 60) + count = 0 + for doc in doc_ds.take(self.max_docs): + count += 1 + text = doc.numpy().decode("utf-8") + if len(text) > self.max_doc_chars: + text = text[: self.max_doc_chars] + "..." + print(f"\n第 {count} 个文档:") + print(f" {text}") + print(f"\n共显示 {count} 个文档") + + def _view_tokens(self, dataset) -> None: + """查看 tokenized 数据""" + print("\n【Tokenized 数据查看】") + print("-" * 60) + + tokenizer_info = dataset.tokenizer_bundle() + tokens_ds = dataset.tokens_ds(seq_length=dataset.sequence_length, batch_size=1) + + count = 0 + for batch_input, batch_target in tokens_ds.take(self.max_samples): + count += 1 + input_ids = batch_input[0].numpy() + target_ids = batch_target[0].numpy() + + input_text = tokenizer_info.decode(input_ids.tolist()) + target_text = tokenizer_info.decode(target_ids.tolist()) + + if len(input_text) > self.max_text_display: + input_text = input_text[: self.max_text_display] + "..." + if len(target_text) > self.max_text_display: + target_text = target_text[: self.max_text_display] + "..." + + print(f"\n第 {count} 个样本:") + print(f" 输入 tokens: {input_ids[:20]}... (长度: {len(input_ids)})") + print(f" 目标 tokens: {target_ids[:20]}... 
(长度: {len(target_ids)})") + print(f" 输入文本: {input_text}") + print(f" 目标文本: {target_text}") + print(f"\n共显示 {count} 个样本") + + @staticmethod + def _show_vocab_info(tokenizer_info) -> None: + """显示词汇表信息""" + print("\n【词汇表信息】") + print("-" * 60) + print(f" 词汇表大小: {tokenizer_info.vocab_size}") + print(f" 结束标记 ID: {tokenizer_info.end_of_text}") diff --git a/data/tokenizers.py b/data/tokenizers.py new file mode 100644 index 0000000000000000000000000000000000000000..c0e049441140964eb889bbc75162192c586dc483 --- /dev/null +++ b/data/tokenizers.py @@ -0,0 +1,89 @@ +""" +GPT模型的共享组件模块: + +- 分词器 +""" + +import keras +import keras_hub +from keras import layers + + +def sentence_piece(): + # 用预训练好的分词器,也就是说我们不去自己训练分词器了 + vocabulary_file = keras.utils.get_file( + origin="https://hf-mirror.com/mattdangerw/spiece/resolve/main/vocabulary.proto" + ) + # [Note] 依然需要 tensorflow_text 包 + tokenizer = keras_hub.tokenizers.SentencePieceTokenizer(vocabulary_file) + + end_of_text = tokenizer.token_to_id("<|endoftext|>") + + def decode(tokens: list[int]) -> str: + return tokenizer.detokenize(tokens) + + return tokenizer, end_of_text, decode + + +def character_vectorization(): + """简单的字符级分词器,适用于测试""" + vectorizer = layers.TextVectorization(output_mode="int", split="character") + vectorizer.set_vocabulary( + list("abcdefghijklmnopqrstuvwxyz0123456789 .,!?;:()[]{}<>-_\n") + + ["<|endoftext|>"] # 兼容 sentence_piece 分词器的特殊标记 + ) + + vocab = vectorizer.get_vocabulary() + for idx, word in enumerate(vocab): + if word == "<|endoftext|>": + end_of_text = idx + break + else: + raise ValueError("Vocabulary does not contain <|endoftext|> token.") + + def decode(tokens: list[int]) -> str: + words = [vocab[token] for token in tokens] + return "".join(words) + + return vectorizer, end_of_text, decode + + +def poetry_character_vectorization( + vocab_path: str = "local/saved/vocab/poetry/vocab.txt", +): + """从文本文件加载诗歌字符级分词器。 + + 词汇表文件格式:每行一个字符,第一行必须是 <|endoftext|>。 + + Args: + vocab_path: 词汇表文件路径,默认为 
@dataclass
class WikiDataset(DataBundle):
    """Wiki dataset bundle tying together document loading and tokenization.

    ``data_dir`` is read in ``__post_init__`` but not declared here; it is
    presumably a field of ``DataBundle`` (confirm against data/base.py).

    Usage:
        dataset = WikiDataset(
            data_dir="~/data/wiki/mini_c4",
            tokenizer_type="sentence_piece"  # or "character"
        )

        # raw document dataset
        doc_ds = dataset.doc_ds()

        # token dataset
        tokens_ds = dataset.tokens_ds(seq_length=256, batch_size=32)

        # print statistics
        dataset.stat(seq_length=256)
    """

    glob_pattern: str = "*"
    tokenizer_type: str = "sentence_piece"

    # Derived in __post_init__ / on first tokenizer use; never passed by callers.
    _data_path: pathlib.Path = field(init=False, repr=False)
    _tokenizer_bundle: Optional[TokenizerBundle] = field(
        init=False, repr=False, default=None
    )

    def __post_init__(self):
        # Expand "~" once so every later access sees a usable path.
        self._data_path = pathlib.Path(self.data_dir).expanduser()

    def _load_tokenizer(self):
        """Build the tokenizer bundle on first use; later calls are no-ops."""
        if self._tokenizer_bundle is not None:
            return

        # Looked up at call time so the module-level factories stay patchable.
        factories = {
            "sentence_piece": sentence_piece,
            "character": character_vectorization,
        }
        factory = factories.get(self.tokenizer_type)
        if factory is None:
            raise ValueError(f"Unknown tokenizer type: {self.tokenizer_type}")

        tokenizer, end_of_text, decode = factory()
        self._tokenizer_bundle = TokenizerBundle(
            tokenizer=tokenizer,
            decode=decode,
            end_of_text=end_of_text,
            vocab_size=tokenizer.vocabulary_size(),
        )

    def doc_ds(self) -> tf.data.Dataset:
        """Return the raw document dataset produced by ``doc_load``.

        Returns:
            The dataset built from files under the data directory that match
            ``glob_pattern``.
        """
        return doc_load(self._data_path, glob_pattern=self.glob_pattern)

    def tokens_ds(self, seq_length: int, batch_size: int) -> tf.data.Dataset:
        """Return the tokenized dataset produced by ``transform``.

        Args:
            seq_length: Sequence length forwarded as ``sequence_length``.
            batch_size: Batch size forwarded to ``transform``.

        Returns:
            Whatever ``transform`` returns for the document dataset.
        """
        self._load_tokenizer()
        bundle = self._tokenizer_bundle
        documents = self.doc_ds()
        return transform(
            ds=documents,
            tokenizer=bundle.tokenizer,
            end_of_text=bundle.end_of_text,
            sequence_length=seq_length,
            batch_size=batch_size,
        )

    def tokenizer_bundle(self) -> TokenizerBundle:
        """Return the lazily created tokenizer bundle."""
        self._load_tokenizer()
        return self._tokenizer_bundle
def doc_load(
    data_dir: pathlib.Path, glob_pattern: str = "*", cycle_length: int = 32
) -> tf.data.Dataset:
    """Load every matching file under ``data_dir`` as one interleaved dataset.

    Files are found recursively with ``data_dir.rglob(glob_pattern)``,
    directories are skipped, and the paths are sorted so the input order is
    stable. Each file is expanded by ``_line_doc_extract`` and the per-file
    datasets are combined with ``Dataset.interleave``.

    Args:
        data_dir: Directory to search.
        glob_pattern: Glob applied recursively, e.g. "*.txt"; "*" matches all.
        cycle_length: ``cycle_length`` passed to ``interleave``.

    Returns:
        The interleaved dataset over all matched files.

    Raises:
        FileNotFoundError: If no file matches.
    """
    matched = sorted(
        str(entry) for entry in data_dir.rglob(glob_pattern) if entry.is_file()
    )
    if not matched:
        raise FileNotFoundError(f"在目录 {data_dir} 中未找到匹配 {glob_pattern} 的文件")

    return tf.data.Dataset.from_tensor_slices(matched).interleave(
        _line_doc_extract,
        cycle_length=cycle_length,
        num_parallel_calls=tf.data.AUTOTUNE,
    )


def _line_doc_extract(path: str) -> tf.data.Dataset:
    """Read one file as a line dataset and rewrite each line with a regex.

    Mini-c4 format: one document per line. Every match of the pattern below
    is replaced with a real newline character.
    """

    def _unescape(line):
        return tf.strings.regex_replace(line, r"\\n", "\n")

    return tf.data.TextLineDataset(path).map(_unescape)
def character_vectorization():
    """Build a simple character-level tokenizer intended for tests.

    A ``TextVectorization`` layer split on characters is given a fixed
    vocabulary plus the "<|endoftext|>" marker (kept for compatibility with
    the sentence_piece tokenizer).

    Returns:
        (tokenizer, end_of_text, decode): the vectorization layer, the index
        of "<|endoftext|>" in the layer's vocabulary, and a function that
        joins the vocabulary entries for a list of token ids.

    Raises:
        ValueError: If the layer's vocabulary lacks the end-of-text marker.
    """
    charset = list("abcdefghijklmnopqrstuvwxyz0123456789 .,!?;:()[]{}\u003c\u003e-_\n")
    vectorizer = layers.TextVectorization(output_mode="int", split="character")
    vectorizer.set_vocabulary(charset + ["<|endoftext|>"])

    # Use the layer's own vocabulary, not the list passed in, for indexing.
    vocab = vectorizer.get_vocabulary()
    end_of_text = next(
        (index for index, entry in enumerate(vocab) if entry == "<|endoftext|>"),
        None,
    )
    if end_of_text is None:
        raise ValueError("Vocabulary does not contain <|endoftext|> token.")

    def decode(tokens: list[int]) -> str:
        return "".join(vocab[token] for token in tokens)

    return vectorizer, end_of_text, decode
# One alternative per bracket family: an opener, optional whitespace, then the
# matching closer. ASCII and full-width parentheses may be mixed.
_EMPTY_BRACKET_PAIR = re.compile(r"[((]\s*[))]|\[\s*\]|【\s*】|\{\s*\}")


def filter_empty_brackets(text: str) -> str:
    """Remove empty bracket pairs from ``text``.

    A pair is an opening bracket, optional whitespace, and its matching
    closing bracket: ``()``, ``( )``, ``[ ]``, ``【 】``, ``{ }`` and the
    full-width parenthesis forms. Brackets with content between them are left
    untouched, so ``f(g(x))`` and ``[[link]]`` are preserved.

    Removal repeats until nothing changes, so nested empties such as ``(())``
    disappear completely.

    Args:
        text: Input text.

    Returns:
        The text with all empty bracket pairs removed.
    """
    while True:
        text, removed = _EMPTY_BRACKET_PAIR.subn("", text)
        if not removed:
            return text
def filter_lang_tags(text: str) -> str:
    """Strip ``-{...}-`` language-conversion markers, including nested ones.

    Example: ``-{H|zh-hans:重定向;zh-hant:重新导向;}-``
    Nested:  ``-{T|zh:-{zh|}-;zh-hans:-{zh-hans|}-;}-``

    The pattern only matches markers with no braces inside, so each pass
    removes the innermost markers; passes repeat until none are left.

    Args:
        text: Input text.

    Returns:
        The text with all such markers (and their content) removed.
    """
    innermost_tag = re.compile(r"-\{[^{}]+?}-")
    remaining = text
    while True:
        remaining, hits = innermost_tag.subn("", remaining)
        if hits == 0:
            return remaining
在本地 PyCharm 中配置远程 Python 解释器,指向远程服务器上的新环境。这一步骤中,注意配置好目录映射,和不自动上传文件。 +4. 等待一段时间,就能正常运作了。 \ No newline at end of file diff --git a/env/keras.py b/env/keras.py new file mode 100644 index 0000000000000000000000000000000000000000..ae2018aa83ead11864a11af40a378e974fc5aef9 --- /dev/null +++ b/env/keras.py @@ -0,0 +1,11 @@ +"""Keras 相关工具模块 + +提供 Keras 配置相关的功能。 +""" + +import keras + + +def enable_mixed_precision(): + """开启混合精度训练/推理""" + keras.config.set_dtype_policy("mixed_float16") diff --git a/env/logger.py b/env/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..fcfdc8f45fa3c63668383de4051e89dfe9f47ff7 --- /dev/null +++ b/env/logger.py @@ -0,0 +1,52 @@ +import logging +from functools import wraps + + +def get_logger(name: str, filepath: str = None): + logger = logging.getLogger(name) + logger.setLevel(logging.INFO) + + # 控制台 + console_handler = logging.StreamHandler() + logger.addHandler(console_handler) + + # 文件 + if filepath: + file_handler = logging.FileHandler(filepath) + logger.addHandler(file_handler) + + return logger + + +def log(enter_message: str = "", exit_message: str = ""): + return _Log(enter_message=enter_message, exit_message=exit_message) + + +class _Log: + def __init__( + self, + enter_message: str = "", + exit_message: str = "" + ): + self.enter_message = enter_message + self.exit_message = exit_message + + def __enter__(self): + if self.enter_message: + print(self.enter_message) + return self + + def __exit__(self, exc_type, exc, tb): + if self.exit_message: + print(self.exit_message) + print("") + return False + + def __call__(self, func): + @wraps(func) + def wrapper(*args, **kwargs): + with _Log(self.enter_message, self.exit_message): + return func(*args, **kwargs) + return None + + return wrapper diff --git a/env/resolve.py b/env/resolve.py new file mode 100644 index 0000000000000000000000000000000000000000..4dc12f0a7fc9e59c814c1d0dd20b83c12f12c9e9 --- /dev/null +++ b/env/resolve.py @@ -0,0 +1,85 @@ +import os +from 
enum import Enum, StrEnum +from pathlib import Path + + +"""定义项目的根路径""" +PROJECT_ROOT = Path(__file__).parent.parent.absolute() + + +"""定义根据环境变量选择配置的函数""" +class Env(StrEnum): + TEST = "test" + PRODUCTION = "production" + +def resolve_env[T](test_conf: T = Env.TEST, prod_conf: T = Env.PRODUCTION) -> T: + env = os.environ.get("ENV", str(Env.TEST)) + return prod_conf if env == str(Env.PRODUCTION) else test_conf + + +"""定义一些预设的目录""" +SAVED_DIR = resolve_env( + PROJECT_ROOT / "local" / "saved", + PROJECT_ROOT / "saved", +) +TASKS_DIR = PROJECT_ROOT / "local" / "tasks" + + +"""定义一些路径解析函数,方便在项目中使用""" +def resolve_saved(path: str | Path = None) -> Path: + """解析相对于 saved 目录的路径 + + 1. 如果本身就是 Path 对象,直接返回。 + 2. 如果 path 是 None,返回 saved 目录本身。 + 3. 否则,将 path 解析为相对于 saved 目录的路径。 + """ + if isinstance(path, Path): + return path + return SAVED_DIR / path if path else SAVED_DIR + + +def resolve_task_dir(task_name: str) -> Path: + """解析任务所在的目录 + + Args: + task_name: 任务名称,即定义在 Pipeline 中的 name 字段,例如 "poetry_gpt" 或 "poetry_rnn"。 + """ + return TASKS_DIR / task_name + + +def resolve_path(path: str | Path) -> Path: + """从项目根目录解析路径 + + 1. 如果路径是 Path 对象,直接返回。 + 2. 如果路径是以 ~ 或 / 开头的绝对路径,则直接返回该路径。 + 3. 
如果路径是相对路径,则将其解析为相对于项目根目录的路径。 + + Args: + path: 相对于项目根目录的路径 + + Returns: + 解析后的绝对路径 + + Example: + >>> resolve_path("data/dev/mini_c4/file.txt") + PosixPath('/Users/.../universal_deeplearning/data/dev/mini_c4/file.txt') + """ + if isinstance(path, Path): + return path + elif path.startswith("~") or path.startswith("/"): + return Path(path).expanduser().resolve() + else: + return PROJECT_ROOT / path + + +def display_path(path: str | Path) -> str: + """将路径转换为适合展示的字符串 + + 如果路径位于项目根目录内,则显示为相对项目根目录的路径; + 否则显示绝对路径。 + """ + resolved = resolve_path(path) + try: + return str(resolved.relative_to(PROJECT_ROOT)) + except ValueError: + return str(resolved) diff --git a/env/runner.py b/env/runner.py new file mode 100644 index 0000000000000000000000000000000000000000..693d290eeb2efceecd5ad44934b914499fad5aab --- /dev/null +++ b/env/runner.py @@ -0,0 +1,23 @@ +import sys +from typing import Callable + + +class ActionRunner: + def __call__(self, default_method: str | Callable = None): + if len(sys.argv) > 1: + method = self._resolve_method(sys.argv[1]) + else: + method = default_method + if type(method) == str: + method = self._resolve_method(method) + + if method: + method() + else: + raise ValueError("没有指定要执行的方法") + + def _resolve_method(self, method_name: str) -> Callable: + method = getattr(self, method_name, None) + if method is None: + raise ValueError(f"没有找到对应的方法:{method_name}") + return method diff --git a/env/vocab.py b/env/vocab.py new file mode 100644 index 0000000000000000000000000000000000000000..7229816d142d95ced0e1d7d18ec645ca2398712b --- /dev/null +++ b/env/vocab.py @@ -0,0 +1,2 @@ +# 定义所有词典中 PAD 的 id token +PAD = 0 \ No newline at end of file diff --git a/environment-linux.yml b/environment-linux.yml new file mode 100644 index 0000000000000000000000000000000000000000..7ea5dbb596b19045ed2f44b71159abb349423cff --- /dev/null +++ b/environment-linux.yml @@ -0,0 +1,15 @@ +name: general-dl +channels: + - defaults +dependencies: + - python=3.12 + - pip + - numpy + - 
tensorflow + - tensorflow-text + - keras + - pip: + - keras-hub + - gradio +variables: + ENV: production \ No newline at end of file diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..4272dce38738fe6c6907eecd4ce31b38d60dfc69 --- /dev/null +++ b/environment.yml @@ -0,0 +1,17 @@ +name: general-dl +channels: + - defaults +dependencies: + - python=3.12 + - pip + - setuptools>=68,<70 + - numpy + - ruff + - pytest + - pytest-mock + - tensorflow + - keras + - pip: + - gradio +variables: + ENV: test diff --git a/generate_requirements.py b/generate_requirements.py new file mode 100644 index 0000000000000000000000000000000000000000..8806a5cee34ed99c41ef2fb7fffa7302f363acce --- /dev/null +++ b/generate_requirements.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +""" +从 environment-linux.yml 生成 requirements.txt +YAML 中的版本号优先级最高 +未指定版本号时查询当前环境的实际版本 +排除 python 和 pip +""" + +import os +from datetime import datetime +import yaml +from importlib.metadata import version, PackageNotFoundError + + +# 排除的包(不加入 requirements.txt) +EXCLUDE_PACKAGES = {"python", "pip"} + + +def get_installed_version(package_name): + """获取包的安装版本,未安装返回 None""" + try: + return version(package_name) + except PackageNotFoundError: + return None + + +def parse_package_string(dep): + """ + 解析包字符串,返回 (包名, yaml版本号或None) + 例如: "tensorflow=2.15.0" -> ("tensorflow", "2.15.0") + "numpy" -> ("numpy", None) + """ + if "=" in dep: + parts = dep.split("=") + pkg_name = parts[0] + pkg_version = parts[1] + return pkg_name, pkg_version + else: + return dep, None + + +def parse_environment_yml(filepath): + """解析 environment-linux.yml,提取包列表和版本信息""" + with open(filepath, "r") as f: + env = yaml.safe_load(f) + + packages = [] + + for dep in env.get("dependencies", []): + if isinstance(dep, str): + # 简单字符串格式:package 或 package=version + pkg_name, yaml_version = parse_package_string(dep) + if pkg_name not in EXCLUDE_PACKAGES: + packages.append((pkg_name, yaml_version)) + 
def main():
    """Generate requirements.txt from environment-linux.yml.

    A version given in the YAML wins. Otherwise the version installed in the
    current environment is used; when the package is not installed the bare
    name is written. The result is written to disk and echoed to stdout.
    """
    yml_file = "environment-linux.yml"
    output_file = "requirements.txt"

    print(f"读取 {yml_file}...")
    packages = parse_environment_yml(yml_file)
    print(f"发现 {len(packages)} 个包(排除 {EXCLUDE_PACKAGES})")

    requirements = []
    for pkg_name, yaml_version in packages:
        if yaml_version:
            # Version pinned in the YAML takes priority.
            entry = f"{pkg_name}=={yaml_version}"
            note = f" ✓ {pkg_name}=={yaml_version} (来自 YAML)"
        else:
            # No YAML version: ask the current environment.
            env_version = get_installed_version(pkg_name)
            if env_version:
                entry = f"{pkg_name}=={env_version}"
                note = f" ✓ {pkg_name}=={env_version} (来自当前环境)"
            else:
                entry = pkg_name
                note = f" ⚠ {pkg_name} (未安装,无版本号)"
        requirements.append(entry)
        print(note)

    # Header comment block placed before the requirement lines.
    header = [
        f"# Generated from {yml_file}",
        f"# Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"# Environment: {os.environ.get('ENV', 'unknown')}",
        "#",
    ]
    content = "\n".join(header + requirements)

    with open(output_file, "w") as f:
        f.write(content + "\n")

    separator = "-" * 40
    print(f"\n已生成 {output_file}:")
    print(separator)
    print(content)
    print(separator)
class PositionalEmbedding(keras.Layer):
    """Token embedding plus learned position embedding, with a reverse mode.

    In forward mode the layer adds a token embedding and a position embedding
    for indices 0, 1, 2, ... along the last axis. With ``reverse=True`` it
    multiplies its input by the transposed token-embedding matrix instead, so
    the same weights map hidden vectors back to vocabulary scores.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(input_dim, output_dim)
        self.position_embeddings = layers.Embedding(sequence_length, output_dim)

    def call(self, inputs, reverse=False):
        if not reverse:
            # Running count of ones minus one gives 0..n-1 along the last axis.
            position_ids = ops.cumsum(ops.ones_like(inputs), axis=-1) - 1
            token_vectors = self.token_embeddings(inputs)
            position_vectors = self.position_embeddings(position_ids)
            return token_vectors + position_vectors

        # Reverse mode: project with the transposed token-embedding matrix.
        embedding_matrix = self.token_embeddings.embeddings
        return ops.matmul(inputs, ops.transpose(embedding_matrix))
@dataclass
class GptModelBuilder:
    """Builds the mini GPT Keras model from a handful of size parameters."""

    hidden_dim: int
    intermediate_dim: int
    num_heads: int
    num_layers: int

    def build_training_artifact(
        self,
        vocab_size: int,
        sequence_length: int
    ) -> ModelArtifact:
        """Assemble embedding, layer norm, decoder stack and tied output.

        Args:
            vocab_size: Passed to ``PositionalEmbedding`` as its input
                dimension.
            sequence_length: Passed to ``PositionalEmbedding`` as the number
                of positions.

        Returns:
            A ``ModelArtifact`` holding the model and a ``generate`` callable
            bound to it via ``generate_with_training_model``.
        """
        token_ids = keras.Input(shape=(None,), dtype="int32", name="inputs")
        embedding = PositionalEmbedding(
            sequence_length,
            vocab_size,
            self.hidden_dim,
            name="embedding"
        )
        hidden = embedding(token_ids)
        hidden = layers.LayerNormalization(name="input_layer_norm")(hidden)

        for index in range(self.num_layers):
            block = TransformerDecoder(
                self.hidden_dim,
                self.intermediate_dim,
                self.num_heads,
                name=f"decoder_{index}"
            )
            hidden = block(hidden)

        # The same embedding layer, in reverse mode, produces the outputs.
        logits = embedding(hidden, reverse=True)
        model = keras.Model(token_ids, logits, name="mini_gpt")
        generate = partial(generate_with_training_model, model)
        return ModelArtifact(model=model, generate=generate)

    def build_inference_artifact(
        self,
        training_artifact: ModelArtifact
    ) -> ModelArtifact:
        """Return ``training_artifact`` unchanged; no separate inference model."""
        return training_artifact
@dataclass
class RNNModelBuilder:
    """Builds a stacked-LSTM language model and a matching inference model.

    The training model maps token ids to per-position logits. The inference
    model takes explicit LSTM states as extra inputs and returns the updated
    states next to the logits.
    """

    # Number of stacked LSTM layers.
    num_layers: int = 2
    # Output size of the token embedding.
    embedding_dim: int = 100
    # Units per LSTM layer; also the size of each h / c state vector.
    hidden_dim: int = 1024

    def build_training_artifact(
        self,
        vocab_size: int,
        sequence_length: int
    ) -> ModelArtifact:
        """Build the training model.

        Args:
            vocab_size: Embedding input size and width of the logits layer.
            sequence_length: Not used by this builder.

        Returns:
            A ``ModelArtifact`` with the model and a ``generate`` callable
            bound to it via ``generate_with_training_model``.
        """
        inputs = keras.Input(shape=(None,), dtype="int32", name="inputs")
        x = layers.Embedding(
            input_dim=vocab_size,
            output_dim=self.embedding_dim,
            mask_zero=True,
            name="embedding"
        )(inputs)

        # Layer names are fixed so the inference builder can look them up.
        for i in range(self.num_layers):
            x = layers.LSTM(
                self.hidden_dim,
                return_sequences=True,
                recurrent_dropout=0.1,
                name=f"lstm_{i}"
            )(x)
            x = layers.Dropout(0.1, name=f"dropout_{i}")(x)

        outputs = layers.Dense(vocab_size, name="logits")(x)
        model = keras.Model(inputs=inputs, outputs=outputs, name="rnn_training")
        return ModelArtifact(
            model=model,
            generate=partial(generate_with_training_model, model)
        )

    def build_inference_artifact(
        self,
        training_artifact: ModelArtifact
    ) -> ModelArtifact:
        """Build the stateful inference model from a trained artifact.

        ``generate`` is bound to the inference model with zero initial states
        for a batch of one.
        """
        inference_model = self._build_inference_model_from_training_model(
            training_artifact.model
        )
        return ModelArtifact(
            model=inference_model,
            generate=partial(
                generate_with_stateful_model,
                inference_model,
                initial_states=self._initial_states(batch_size=1)
            )
        )

    def _build_inference_model_from_training_model(
        self,
        training_model: keras.Model
    ) -> keras.Model:
        """Create a model with explicit state inputs/outputs from ``training_model``.

        The embedding, dropout and logits layers are reused from the training
        model. The LSTM layers are new ones, created with ``return_state=True``,
        that receive the training LSTM weights.
        """
        token_input = keras.Input(shape=(None,), dtype="int32", name="token_input")
        # Flat list [h_0, c_0, h_1, c_1, ...]: one (h, c) pair per LSTM layer.
        state_inputs = []
        for i in range(self.num_layers):
            h_input = keras.Input(shape=(self.hidden_dim,), name=f"h_{i}_input")
            c_input = keras.Input(shape=(self.hidden_dim,), name=f"c_{i}_input")
            state_inputs.extend([h_input, c_input])

        embedding = training_model.get_layer("embedding")
        logits_layer = training_model.get_layer("logits")
        x = embedding(token_input)

        new_states = []
        inference_lstm_layers = []
        for i in range(self.num_layers):
            # Only the last LSTM drops the time axis (return_sequences=False).
            inference_lstm = layers.LSTM(
                self.hidden_dim,
                return_sequences=i < self.num_layers - 1,
                return_state=True,
                recurrent_dropout=0.1,
                name=f"lstm_{i}"
            )
            h_input = state_inputs[i * 2]
            c_input = state_inputs[i * 2 + 1]
            x, new_h, new_c = inference_lstm(x, initial_state=[h_input, c_input])
            new_states.extend([new_h, new_c])
            dropout = training_model.get_layer(f"dropout_{i}")
            x = dropout(x)
            inference_lstm_layers.append(inference_lstm)

        logits = logits_layer(x)
        inference_model = keras.Model(
            [token_input] + state_inputs,
            [logits] + new_states,
            name="rnn_inference"
        )

        # Copy weights after each new LSTM has been called on inputs above.
        for i, inference_lstm in enumerate(inference_lstm_layers):
            training_lstm = training_model.get_layer(f"lstm_{i}")
            inference_lstm.set_weights(training_lstm.get_weights())

        return inference_model

    def _initial_states(self, batch_size: int) -> list:
        """Return zero tensors of shape (batch_size, hidden_dim), two per layer."""
        states = []
        for _ in range(self.num_layers):
            states.append(tf.zeros((batch_size, self.hidden_dim)))
            states.append(tf.zeros((batch_size, self.hidden_dim)))
        return states
# Checkpoint extensions whose own digits (the "5" in ".h5") must not be
# mistaken for an epoch number.
_CHECKPOINT_SUFFIXES = (".weights.h5", ".keras", ".h5")


def extract_number_of_filename(filename: str) -> int:
    """Return the last run of digits in ``filename``, taken as the epoch.

    A trailing checkpoint extension (``.weights.h5``, ``.keras``, ``.h5``) is
    ignored before searching, so both a full file name and a stem work:

    - "model_epoch_001.weights.h5" -> 1
    - "checkpoint_2024_06_30_epoch_002.weights.h5" -> 2
    - "model_epoch_003.weights" -> 3
    - "model_epoch_final.weights.h5" -> raises ValueError

    :param filename: File name or stem expected to contain a number.
    :return: The last number found, as an int.
    :raises ValueError: If the name contains no digits.
    """
    stem = filename
    for suffix in _CHECKPOINT_SUFFIXES:
        if stem.endswith(suffix):
            stem = stem.removesuffix(suffix)
            break

    numbers = re.findall(r"\d+", stem)
    if not numbers:
        raise ValueError(f"No number found in filename: {filename}")
    return int(numbers[-1])  # the last number is assumed to be the epoch
_collect_checkpoint_files( + checkpoint_dirs=resolved_dirs, + suffix=suffix + ) + + if epoch is not None: + matches = [(f, num) for f, num in files_with_number if num == epoch] + if not matches: + raise FileNotFoundError(f"未找到 epoch {epoch} 对应的检查点文件") + if len(matches) > 1: + raise RuntimeError( + f"找到多个 epoch {epoch} 对应的检查点文件: {[match[0].name for match in matches]}" + ) + return matches[0] + + if not files_with_number: + return None, 0 + + return max(files_with_number, key=lambda item: item[1]) + + +def _resolve_checkpoint_dirs( + dirs: list[pathlib.Path | str] | None +) -> list[pathlib.Path]: + if dirs is None: + return [] + return [resolve_path(path) for path in dirs] + + +def _resolve_relative_checkpoint_path( + checkpoint_path: pathlib.Path, + checkpoint_dirs: list[pathlib.Path] +) -> pathlib.Path: + for checkpoint_dir in checkpoint_dirs: + candidate = checkpoint_dir / checkpoint_path + if candidate.exists(): + return candidate + return checkpoint_dirs[0] / checkpoint_path + + +def _collect_checkpoint_files( + checkpoint_dirs: list[pathlib.Path], + suffix: str | None +) -> list[tuple[pathlib.Path, int]]: + files_with_number = [] + for checkpoint_dir in checkpoint_dirs: + if not checkpoint_dir.exists(): + continue + for file_path in sorted(checkpoint_dir.iterdir()): + if not file_path.is_file(): + continue + if suffix is not None and not file_path.name.endswith(suffix): + continue + if suffix is None and not _is_checkpoint_file(file_path): + continue + files_with_number.append((file_path, extract_number_of_filename(file_path.stem))) + return files_with_number + + +def _is_checkpoint_file(file_path: pathlib.Path) -> bool: + return file_path.name.endswith(".keras") or file_path.name.endswith(".weights.h5") diff --git a/pipeline/base/configs.py b/pipeline/base/configs.py new file mode 100644 index 0000000000000000000000000000000000000000..3b739b25de425d62a4b5fed6399778667c38369a --- /dev/null +++ b/pipeline/base/configs.py @@ -0,0 +1,69 @@ +from dataclasses import 
dataclass, field +from pathlib import Path +from typing import Callable + + +@dataclass +class CheckpointConfig: + dirs: list[Path] | None = None + path: Path = None + epoch: int = None + suffix: str = None + + +@dataclass +class ModelConfig: + sequence_length: int = 256 + hidden_dim: int = 512 + intermediate_dim: int = 2056 + num_heads: int = 8 + num_layers: int = 8 + + +@dataclass +class TrainingRule: + batch_size: int = 128 + epochs: int = 1 + steps_per_epoch: int = 30 + validation_batches: int = 1 + + +@dataclass +class GenerationRule: + prompts_generator: Callable + sample_strategy: Callable + + +@dataclass +class CheckpointRules: + training: CheckpointConfig = field(default_factory=CheckpointConfig) + testing: CheckpointConfig = field(default_factory=CheckpointConfig) + deployment: CheckpointConfig = field(default_factory=CheckpointConfig) + + def resolve_training_rule( + self, + default_dirs: list[Path | str] | None = None + ) -> dict: + return self._resolve_rule(self.training, default_dirs) + + def resolve_testing_rule( + self, + default_dirs: list[Path | str] | None = None + ) -> dict: + return self._resolve_rule(self.testing, default_dirs) + + def resolve_deployment_rule( + self, + default_dirs: list[Path | str] | None = None + ) -> dict: + return self._resolve_rule(self.deployment, default_dirs) + + @staticmethod + def _resolve_rule(checkpoint: CheckpointConfig, default_dirs: list[Path | str] | None) -> dict: + dirs = checkpoint.dirs if checkpoint.dirs is not None else default_dirs + return { + "dirs": dirs, + "path": checkpoint.path, + "epoch": checkpoint.epoch, + "suffix": checkpoint.suffix + } diff --git a/pipeline/base/generation.py b/pipeline/base/generation.py new file mode 100644 index 0000000000000000000000000000000000000000..34e2cfec7571760952403b3455b696af9eb68b34 --- /dev/null +++ b/pipeline/base/generation.py @@ -0,0 +1,174 @@ +""" +与生成有关的组件 +""" + +import pathlib +from dataclasses import dataclass +from typing import Any, Callable + +import 
def generate_with_stateful_model(
    model: keras.Model,
    context: GenerationContext,
    prompt_tokens: list[int],
    initial_states: list
) -> GenerationResult:
    """Generate tokens one step at a time with a model that takes explicit state.

    The model is first run on the whole prompt together with ``initial_states``.
    Each later step feeds only the newly sampled token plus the states returned
    by the previous call.

    Args:
        model: Model whose ``predict`` accepts ``[tokens, *states]`` and returns
            ``logits`` followed by the updated states.
        context: Supplies ``end_of_text``, ``max_length`` and ``sample_fn``.
        prompt_tokens: Prompt token ids; the list is copied, not mutated.
        initial_states: State inputs for the first ``predict`` call.

    Returns:
        A GenerationResult whose ``stop_reason`` is ``<|empty|>`` (empty prompt),
        ``<|endoftext|>``, ``<|pad|>`` or ``<|maxlength|>``. The token that
        triggered the stop is never part of ``token_ids``.
    """
    if not prompt_tokens:
        return GenerationResult([], "<|empty|>")

    tokens = list(prompt_tokens)
    batch_tokens = np.array([tokens])
    logits, *states = model.predict([batch_tokens] + initial_states, verbose=0)

    for _ in range(len(tokens), context.max_length):
        next_token = ops.convert_to_numpy(context.sample_fn(logits[0]))
        next_token_id = np.array(next_token).item()

        # Check the stop conditions before appending, so neither stop token
        # reaches the result. The pad branch used to return the list with the
        # pad token still attached, unlike the end-of-text branch and unlike
        # generate_with_training_model.
        if next_token_id == context.end_of_text:
            return GenerationResult(tokens, "<|endoftext|>")
        if next_token_id <= PAD:
            return GenerationResult(tokens, "<|pad|>")

        tokens.append(next_token_id)
        logits, *states = model.predict([np.array([[next_token_id]])] + states, verbose=0)

    return GenerationResult(tokens, "<|maxlength|>")
class TextGenerator:
    """Produce generated tokens or text for a prompt string.

    Wraps a ModelArtifact with a tokenizer and a decode function. The
    end-of-text id, maximum length and sampling function passed to the
    constructor are kept as defaults in a GenerationContext.
    """

    def __init__(
        self,
        artifact: ModelArtifact,
        tokenizer: Any,
        decode: Callable,
        end_of_text: int,
        sample_fn: Callable,
        max_length: int
    ):
        self.artifact = artifact
        self.tokenizer = tokenizer
        self.decode = decode
        self.context = GenerationContext(
            end_of_text=end_of_text,
            max_length=max_length,
            sample_fn=sample_fn
        )

    def generate_tokens(
        self,
        prompt: str,
        max_length: int | None = None,
        sample_fn: Callable | None = None
    ) -> GenerationResult:
        """Generate token ids for ``prompt``.

        ``max_length`` and ``sample_fn`` override the constructor defaults for
        this call only; ``None`` keeps the default.
        """
        defaults = self.context
        effective_max_length = defaults.max_length if max_length is None else max_length
        effective_sample_fn = defaults.sample_fn if sample_fn is None else sample_fn
        call_context = GenerationContext(
            end_of_text=defaults.end_of_text,
            max_length=effective_max_length,
            sample_fn=effective_sample_fn
        )
        return self.artifact.generate(call_context, self._tokenize_prompt(prompt))

    def generate_text(
        self,
        prompt: str,
        max_length: int | None = None,
        sample_fn: Callable | None = None
    ) -> TextGenerationResult:
        """Generate tokens for ``prompt`` and decode them to text."""
        generated = self.generate_tokens(prompt, max_length, sample_fn)
        decoded = self.decode(generated.token_ids)
        return TextGenerationResult(text=decoded, stop_reason=generated.stop_reason)

    def _tokenize_prompt(self, prompt: str) -> list[int]:
        """Tokenize ``prompt`` and keep only ids greater than PAD."""
        kept = []
        for token in ops.convert_to_numpy(self.tokenizer(prompt)):
            if token > PAD:
                kept.append(token)
        return kept
@staticmethod
def init_logger(log_file: pathlib.Path):
    """Create the log file's directory if needed and return the callback logger.

    Args:
        log_file: Path of the log file; its parent directories are created.

    Returns:
        The logger obtained from ``get_logger`` for ``"GenerationCallback"``.
    """
    # exist_ok=True replaces the separate exists() check: it is one call with
    # no window between check and creation, and it matches how the other
    # modules in this project create their log directories.
    log_file.parent.mkdir(parents=True, exist_ok=True)

    logger = get_logger("GenerationCallback", filepath=str(log_file))
    logger.info("Initialized GenerationCallback logger")

    return logger
def format_config_value(value: Any, indent: int = 0) -> str:
    """
    Render a configuration value as text.

    Dataclasses are expanded recursively, one ``name=value`` line per field.
    Callables that have a ``__name__`` are rendered as that name. Everything
    else goes through ``str()``.

    Args:
        value: The value to format.
        indent: Nesting level; an expanded dataclass is prefixed with
            ``INDENT * indent`` and its fields with one further INDENT.

    Returns:
        The formatted string.
    """
    if not is_dataclass(value):
        if callable(value) and hasattr(value, "__name__"):
            return value.__name__
        return str(value)

    outer_pad = INDENT * indent
    class_name = value.__class__.__name__

    # The nested result carries its own leading prefix; strip() drops it so the
    # value starts right after "name=".
    rendered_fields = [
        f"{outer_pad}{INDENT}{item.name}="
        + format_config_value(getattr(value, item.name), indent + 1).strip()
        for item in fields(value)
    ]

    if not rendered_fields:
        return f"{outer_pad}{class_name}()"

    body = "\n".join(rendered_fields)
    return f"{outer_pad}{class_name}(\n" + body + f"\n{outer_pad})"
def load_training_artifact_from_pipeline(
    pipeline: "Pipeline",
    checkpoint_rule: dict
) -> tuple[ModelArtifact, TokenizerBundle]:
    """Load a training artifact from the checkpoint selected by ``checkpoint_rule``.

    A ``.keras`` checkpoint is loaded as a complete model. Any other checkpoint
    is treated as a weights file and loaded into a model freshly built by the
    pipeline's model builder.

    Args:
        pipeline: Provides the dataset (tokenizer bundle, sequence length) and
            the model builder.
        checkpoint_rule: Keyword arguments passed to ``resolve_checkpoint``.

    Returns:
        ``(training_artifact, tokenizer_bundle)``.

    Raises:
        FileNotFoundError: if ``resolve_checkpoint`` returns no path.
    """
    bundle = pipeline.dataset.tokenizer_bundle()

    resolved_path, _epoch = resolve_checkpoint(**checkpoint_rule)
    if resolved_path is None:
        raise FileNotFoundError("未找到任何检查点文件")

    is_full_model = resolved_path.suffix.lower() == ".keras"
    if not is_full_model:
        # Weights-only checkpoint: build the architecture first, then load.
        artifact = pipeline.model_builder.build_training_artifact(
            vocab_size=bundle.vocab_size,
            sequence_length=pipeline.dataset.sequence_length
        )
        artifact.model.load_weights(str(resolved_path))
    else:
        restored = _load_keras_model(resolved_path)
        artifact = ModelArtifact(
            model=restored,
            generate=partial(generate_with_training_model, restored)
        )

    print(f"已加载推理检查点: {resolved_path}")
    return artifact, bundle
def random_prompts(
    num_text: int = 10, text_length: int = 20, taken_samples: int = 100
) -> Callable[[tf.data.Dataset], list[str]]:
    """
    Random strategy: build a function that samples prompt snippets from a dataset.

    :param num_text: number of prompts to select
    :param text_length: maximum length, in characters, of each prompt
    :param taken_samples: number of leading dataset elements to sample from
    :return: function that takes a dataset and returns the list of prompts
    """

    def generate(dataset: tf.data.Dataset) -> list[str]:
        # Materialize only the first `taken_samples` elements to bound memory use.
        texts = list(dataset.take(taken_samples).as_numpy_iterator())
        # Sample positions, not the byte strings themselves, so the documents are
        # never packed into a fixed-width NumPy bytes array. Without replacement,
        # so fewer than `num_text` documents still raises ValueError.
        chosen = np.random.choice(len(texts), size=num_text, replace=False)
        selected_texts = []
        for index in chosen:
            text = texts[index].decode("utf-8")
            # A snippet is at most `text_length` characters and at most half
            # of the document.
            selected_length = min(text_length, len(text) // 2)
            # randint's upper bound is exclusive: +1 makes the last window
            # reachable and keeps an empty document from raising. It yields an
            # empty prompt, as a one-character document already did.
            start_idx = np.random.randint(0, len(text) - selected_length + 1)
            selected_texts.append(text[start_idx : start_idx + selected_length])
        return selected_texts

    return generate
class WarmupSchedule(keras.optimizers.schedules.LearningRateSchedule):
    """
    Learning-rate schedule with a linear warm-up.

    The rate grows linearly from 0 to ``rate`` over the first ``warmup_steps``
    steps and stays at ``rate`` afterwards.
    """

    def __init__(self, rate=2e-4, warmup_steps=1000):
        super().__init__()
        self.rate = rate
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # Fraction of the warm-up completed, capped at 1.0 once it is over.
        progress = ops.cast(step, dtype="float32") / self.warmup_steps
        return self.rate * ops.minimum(progress, 1.0)

    def get_config(self):
        return dict(rate=self.rate, warmup_steps=self.warmup_steps)
log(): + # 开启混合精度训练(TODO: 如果这个不开启这个,貌似训练时内存会爆) + from env.keras import enable_mixed_precision + enable_mixed_precision() + + with log(): + self.log_config() + + with log(): + # 从数据集获取分词器信息 + tokenizer_info = self.dataset.tokenizer_bundle() + + # 数据集和训练数据加载 + docs_ds = self.dataset.doc_ds() + tokens_ds = self.dataset.tokens_ds( + seq_length=self.dataset.sequence_length, + batch_size=self.training_rule.batch_size + ) + validation_ds = tokens_ds.take(self.training_rule.validation_batches) + train_ds = tokens_ds.skip(self.training_rule.validation_batches).repeat() + + with log(): + # 构建并编译模型,加载检查点权重 + self.checkpoint_dir.mkdir(parents=True, exist_ok=True) + training_artifact, checkpoint_epoch = self._build_training_artifact( + vocab_size=tokenizer_info.vocab_size + ) + + with log("构建回调"): + callbacks_list = self._build_callbacks( + training_artifact=training_artifact, + tokenizer=tokenizer_info.tokenizer, + decode=tokenizer_info.decode, + end_of_text=tokenizer_info.end_of_text, + dataset=docs_ds + ) + + with log("开始训练", "训练结束"): + training_artifact.model.fit( + train_ds, + validation_data=validation_ds, + initial_epoch=checkpoint_epoch, + epochs=self.training_rule.epochs, + steps_per_epoch=self.training_rule.steps_per_epoch, + callbacks=callbacks_list + ) + + def save_inference_model(self) -> Path: + """保存推理模型到 local/saved/{task_name}/ + + Returns: + 保存的完整文件路径 + """ + from env.resolve import resolve_saved + + self.log_config() + + # 使用固定路径 + task_model_dir = resolve_saved(f"models/{self.name}") + task_model_dir.mkdir(parents=True, exist_ok=True) + + # 构建训练模型并加载权重 + tokenizer_info = self.dataset.tokenizer_bundle() + checkpoint_rule = self.checkpoint_rules.resolve_testing_rule( + default_dirs=[self.checkpoint_dir] + ) + training_artifact, checkpoint_epoch = self._build_artifact_from_checkpoint( + vocab_size=tokenizer_info.vocab_size, + checkpoint_rule=checkpoint_rule, + checkpoint_must=True + ) + inference_artifact = self.model_builder.build_inference_artifact( + 
def _build_training_artifact(
    self,
    vocab_size: int,
    checkpoint_must: bool = False
) -> tuple[ModelArtifact, int]:
    """Build the training artifact using the training checkpoint rule.

    Args:
        vocab_size: Vocabulary size.
        checkpoint_must: Whether a checkpoint is required. If True, a missing
            checkpoint raises; if False, a model without loaded weights is
            returned instead.

    Returns: (training_artifact, checkpoint_epoch)
        - training_artifact is the built artifact, with weights loaded when a
          checkpoint was found
        - checkpoint_epoch is the epoch of the loaded checkpoint, or 0 when
          there is none
    """
    # The pipeline's own checkpoint directory is the default search location
    # when the training rule names no directories.
    build_args = {
        "vocab_size": vocab_size,
        "checkpoint_rule": self.checkpoint_rules.resolve_training_rule(
            default_dirs=[self.checkpoint_dir]
        ),
        "checkpoint_must": checkpoint_must,
    }
    return self._build_artifact_from_checkpoint(**build_args)
def log_config(self):
    """Write the configuration to ``<log_dir>/config.txt`` and print it to the console."""
    log_dir = self.log_dir
    log_dir.mkdir(parents=True, exist_ok=True)

    target = log_dir / "config.txt"
    # `log_config` here resolves to the module-level helper, not this method.
    rendered = log_config(self, target, header=f"ENV[{ENV}]")
    print(rendered, f"配置已保存到: {target}", sep="\n")
class PipelineRunner:
    """Callable that executes a test or production pipeline."""

    def __init__(self, test_pip: Pipeline, prod_pip: Pipeline):
        self.test_pip, self.prod_pip = test_pip, prod_pip

    def __call__(self, pip: Pipeline = None):
        """
        Execute a pipeline.

        Args:
            pip: Pipeline to run directly. When omitted, ``resolve_env`` chooses
                between ``test_pip`` and ``prod_pip``.
                NOTE(review): per the original docstring the choice follows the
                ENV variable (test or prod); resolve_env is not visible here.
        """
        selected = pip or resolve_env(self.test_pip, self.prod_pip)
        selected.execute()
sha256:fbe2d5438f418fdbb9a627271799c8f952699433a49b71465f10a8fcdfc09665 +size 48486745 diff --git a/saved/models/wiki_gpt/model_epoch_086.keras b/saved/models/wiki_gpt/model_epoch_086.keras new file mode 100644 index 0000000000000000000000000000000000000000..1c8b6e2977ba3ed31f6c06923ac544e1bdcafda9 --- /dev/null +++ b/saved/models/wiki_gpt/model_epoch_086.keras @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d4df6accdab1d8e43c055cccf7d055c7a832ba6dc2e74c6033f72eff41c659d +size 167481070 diff --git a/saved/vocab/poetry/vocab.txt b/saved/vocab/poetry/vocab.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee064cbefd4894e74c996c3423f3fa3185a73748 --- /dev/null +++ b/saved/vocab/poetry/vocab.txt @@ -0,0 +1,14284 @@ + +[UNK] +。 +, +不 +人 +一 +风 +山 +无 +有 +天 +云 +来 +日 +何 +花 +春 +生 +年 +中 +如 +月 +时 +水 +自 +为 +上 +我 +心 +相 +此 +秋 +长 +清 +江 +雨 +知 +君 +未 +得 +白 +是 +归 +子 +千 +三 +今 +高 +里 +行 +空 +见 +明 +青 +去 +万 +下 +夜 +老 +事 +在 +寒 +谁 +可 +家 +玉 +南 +声 +处 +前 +与 +流 +落 +东 +酒 +飞 +门 +金 +已 +客 +多 +看 +成 +新 +西 +烟 +道 +古 +、 +深 +海 +书 +欲 +香 +重 +出 +地 +更 +梦 +开 +头 +光 +诗 +之 +石 +作 +黄 +同 +情 +尽 +几 +入 +能 +朝 +十 +难 +大 +草 +当 +游 +还 +色 +回 +世 +远 +城 +林 +思 +气 +红 +树 +意 +然 +名 +亦 +从 +小 +间 +百 +过 +将 +平 +愁 +雪 +路 +马 +犹 +似 +到 +龙 +尘 +旧 +阳 +身 +好 +独 +望 +满 +歌 +衣 +公 +华 +复 +分 +向 +闻 +莫 +楼 +应 +后 +方 +外 +仙 +起 +非 +五 +问 +笑 +文 +安 +故 +言 +真 +别 +岁 +神 +以 +孤 +影 +须 +初 +坐 +阴 +发 +竹 +两 +留 +叶 +北 +正 +松 +半 +台 +又 +醉 +幽 +边 +吾 +绿 +所 +四 +湖 +语 +岂 +先 +霜 +波 +溪 +少 +碧 +传 +柳 +王 +其 +双 +馀 +曾 +怀 +河 +翠 +野 +随 +共 +物 +国 +儿 +晚 +画 +九 +断 +眼 +吟 +轻 +惊 +堂 +枝 +对 +经 +吹 +离 +露 +浮 +芳 +微 +手 +只 +残 +乐 +舟 +寻 +鸟 +听 +苍 +关 +若 +数 +却 +才 +照 +逢 +居 +怜 +暮 +临 +尚 +久 +者 +二 +窗 +峰 +曲 +堪 +近 +兴 +梅 +依 +凉 +泉 +星 +木 +于 +足 +苦 +鱼 +乡 +且 +忽 +终 +喜 +官 +疏 +转 +绝 +太 +宫 +亭 +汉 +萧 +夕 +车 +忆 +常 +遥 +登 +庭 +连 +鸣 +隐 +首 +閒 +士 +往 +灵 +合 +期 +燕 +泪 +和 +动 +园 +晴 +垂 +鹤 +虚 +胜 +至 +横 +节 +容 +载 +紫 +爱 +识 +灯 +休 +早 +倚 +使 +昔 +说 +田 +度 +点 +丹 +都 +学 +送 +闲 +兮 +待 +忘 +晓 +沙 +那 +因 +便 +: +斜 +穷 +州 +荒 +他 +火 +亲 +六 +池 +群 +桃 +静 +定 +女 +尔 +悲 +散 +雁 +夫 +惟 +信 +觉 +死 +也 +乱 +恨 +丝 +隔 +景 +凤 +冷 +寄 +易 +元 +珠 +解 +通 +面 +许 +病 +遗 
+而 +立 +会 +章 +细 +功 +兰 +但 +奇 +纷 +寂 +争 +论 +斗 +笔 +藏 +用 +杯 +屋 +带 +教 +佳 +霞 +径 +罗 +令 +观 +贤 +魂 +吴 +消 +迟 +目 +迹 +结 +阁 +图 +桥 +陵 +力 +飘 +了 +化 +冰 +颜 +村 +锦 +交 +民 +愿 +主 +欢 +郎 +浪 +原 +著 +舞 +直 +疑 +帘 +本 +翻 +尺 +薄 +字 +念 +忧 +船 +川 +岩 +楚 +漫 +破 +鼓 +食 +想 +底 +把 +宁 +钟 +翁 +乃 +悠 +赋 +虽 +步 +茫 +劳 +卧 +谷 +啼 +素 +息 +惜 +宜 +皆 +八 +移 +恩 +音 +英 +伤 +诸 +端 +取 +师 +良 +口 +异 +感 +暗 +德 +土 +收 +凭 +始 +雄 +最 +乘 +指 +骨 +记 +虎 +帝 +呼 +频 +银 +圣 +皇 +盘 +句 +蓬 +叹 +壁 +敢 +片 +卷 +鸿 +饮 +种 +鬓 +绕 +报 +朱 +杨 +偏 +尊 +及 +美 +缘 +宿 +僧 +谢 +极 +辞 +命 +眉 +或 +剑 +琴 +负 +要 +冠 +短 +住 +臣 +低 +嗟 +涯 +求 +放 +徒 +聊 +鸡 +折 +栖 +洞 +抱 +军 +试 +渐 +题 +齐 +俗 +志 +镜 +采 +即 +牛 +持 +况 +干 +迷 +昏 +岸 +户 +理 +眠 +失 +摇 +根 +桑 +宝 +七 +狂 +梁 +称 +精 +羽 +秦 +宵 +倾 +枕 +逐 +斯 +顾 +周 +携 +必 +乌 +□ +并 +兵 +任 +计 +孙 +悬 +全 +角 +比 +; +市 +接 +存 +李 +肯 +急 +催 +映 +侯 +沧 +含 +傍 +渔 +贵 +竟 +由 +胡 +肠 +变 +遍 +守 +甘 +机 +仍 +法 +阑 +乾 +泽 +盈 +哀 +愧 +沉 +岭 +凄 +室 +沈 +纵 +陈 +读 +注 +举 +骑 +邻 +参 +历 +张 +迎 +总 +写 +每 +苔 +走 +帆 +众 +封 +罢 +武 +幸 +雾 +京 +征 +兹 +潮 +偶 +弦 +拂 +泥 +恐 +倒 +轩 +扬 +衰 +永 +扶 +谈 +淡 +个 +再 +玄 +词 +莺 +殊 +妙 +昨 +药 +哉 +莲 +冥 +佛 +寺 +舍 +荷 +招 +被 +湘 +荡 +味 +汝 +床 +桂 +形 +约 +伴 +渡 +业 +钱 +赤 +壮 +塞 +就 +醒 +禅 +耳 +墨 +母 +净 +瑶 +投 +修 +管 +殿 +实 +庐 +各 +际 +贫 +寿 +危 +茅 +雷 +代 +俱 +勤 +既 +仰 +洗 +父 +暖 +战 +义 +引 +越 +壑 +掩 +洲 +借 +荣 +枯 +唱 +稀 +丘 +追 +菊 +绣 +烛 +史 +内 +? 
+积 +评 +杖 +晨 +调 +拟 +友 +彼 +袖 +夏 +丛 +鸾 +屏 +凝 +湿 +昼 +添 +击 +阙 +披 +弄 +浅 +痕 +第 +策 +挂 +宇 +怨 +性 +钓 +强 +墙 +零 +余 +赏 +琼 +秀 +忍 +壶 +响 +檐 +奈 +丈 +阶 +渺 +旦 +浩 +势 +井 +驱 +妆 +条 +工 +受 +甚 +你 +承 +扫 +拜 +象 +惭 +斋 +怅 +列 +轮 +鸥 +层 +铁 +拥 +童 +席 +洒 +源 +瘦 +礼 +盛 +圆 +避 +环 +怪 +矣 +杜 +浑 +途 +霄 +遇 +淮 +禁 +停 +涛 +寸 +羡 +穿 +襟 +的 +除 +郁 +付 +佩 +遂 +塘 +养 +没 +凌 +暂 +兼 +粉 +茶 +予 +府 +儒 +吏 +盖 +止 +倦 +驰 +繁 +樽 +夷 +耕 +饥 +房 +鬼 +豪 +视 +涧 +话 +谓 +广 +毛 +供 +侵 +浓 +富 +境 +岳 +致 +丰 +差 +旋 +睡 +嘉 +郭 +院 +沾 +推 +苏 +旗 +销 +则 +遣 +血 +咏 +怕 +驾 +严 +弟 +弹 +觞 +羊 +御 +索 +浦 +雅 +系 +厌 +荆 +篱 +布 +蝶 +阿 +挥 +乍 +密 +笛 +鼎 +篇 +固 +社 +熟 +背 +集 +访 +畔 +寥 +翩 +政 +营 +托 +闭 +属 +崖 +场 +忠 +脱 +慰 +聚 +猿 +烦 +宾 +馆 +午 +唐 +唤 +冬 +奔 +堕 +炉 +商 +彩 +妨 +程 +旅 +漏 +农 +踪 +妇 +团 +适 +喧 +棹 +围 +杀 +陶 +懒 +攀 +坤 +造 +亡 +吐 +叠 +欣 +宗 +窥 +绮 +甲 +泛 +踏 +诚 +腰 +益 +剪 +津 +累 +省 +利 +护 +旌 +等 +善 +艳 +暑 +买 +达 +羞 +升 +支 +鹿 +觅 +芙 +卿 +黑 +辉 +祖 +腾 +俯 +珍 +郊 +果 +温 +谋 +末 +筵 +苑 +刀 +禽 +桐 +延 +伊 +服 +饱 +勿 +覆 +囊 +尝 +兄 +漠 +换 +射 +蒲 +迢 +表 +恶 +辰 +候 +这 +奏 +纤 +洛 +姿 +萝 +澄 +裳 +左 +炎 +巢 +鞭 +纸 +宽 +庙 +进 +鸦 +栏 +铜 +笼 +误 +献 +莱 +衔 +压 +堤 +奉 +界 +戏 +伯 +锁 +啸 +仁 +伏 +娇 +展 +箫 +占 +切 +饭 +颇 +牵 +蓉 +舆 +济 +宴 +胸 +( +酬 +) +瑟 +威 +尾 +恋 +号 +阔 +曹 +返 +巾 +瞻 +幕 +柏 +区 +染 +着 +维 +摩 +宅 +杂 +翼 +丽 +虫 +磨 +侧 +渠 +剩 +量 +险 +置 +殷 +趋 +滴 +蒙 +匆 +杳 +趣 +灭 +弃 +增 +潇 +樵 +森 +冲 +毫 +絮 +宣 +福 +麦 +限 +雕 +潜 +屡 +巧 +寞 +座 +蜀 +鲜 +郡 +钩 +哭 +焉 +侍 +润 +澹 +韵 +刘 +履 +酌 +改 +泣 +卜 +略 +帐 +蝉 +辈 +鳞 +司 +降 +扉 +� +渊 +嫌 +屈 +体 +羁 +梧 +绵 +痴 +逸 +扁 +扇 +簪 +违 +烂 +运 +赖 +徐 +渚 +柱 +赠 +荐 +怒 +补 +乞 +振 +袅 +柴 +肉 +姑 +辛 +雀 +唯 +裘 +劝 +治 +颠 +废 +劫 +潭 +祠 +赐 +县 +驻 +脚 +侣 +孰 +打 +次 +柔 +陌 +舒 +番 +碑 +疾 +戈 +退 +排 +筑 +埃 +快 +顷 +徊 +衡 +陆 +榻 +编 +孝 +氏 +施 +齿 +顶 +提 +竞 +健 +笙 +尤 +领 +虹 +忙 +畏 +祇 +宛 +态 +否 +简 +班 +灰 +艰 +肥 +餐 +续 +碎 +免 +认 +料 +芝 +峨 +请 +悟 +刻 +乎 +坛 +鸳 +泊 +誇 +慈 +恰 +制 +抚 +彻 +庄 +巷 +腹 +丁 +姓 +冻 +藉 +迥 +芦 +探 +烧 +网 +斑 +继 +晖 +讵 +诏 +缕 +酣 +翰 +具 +趁 +骚 +联 +庆 +鹭 +遮 +牙 +仪 +裁 +昭 +慨 +歇 +杏 +藤 +蛮 +虞 +旁 +迁 +濛 +崇 +堆 +宦 +驿 +辟 +蛇 +朋 +竿 +霁 +衫 +鲁 +束 +费 +肃 +右 +融 +朔 +仲 +翔 +瑞 +符 +触 +加 +弱 +祝 +己 +阅 +膏 +建 +热 +坚 +霏 +错 +徘 +掌 +遭 +枫 +典 +萦 +困 +芜 +痛 +纪 +岛 +纱 +蛟 +霭 +坠 +算 +麻 +弓 +委 +犬 +印 +启 +私 +输 +弥 +保 +薰 +戎 +泠 +凡 +虑 +幻 +昌 +暇 +织 +孔 +棠 +辨 +爽 +阵 +顿 +蕊 +曰 +妻 +饶 +邀 +袍 +值 +摧 +槛 +怎 +插 +康 +埋 +鹊 +减 +贱 +惠 
+镇 +宋 +坡 +栽 +厚 +惨 +麟 +肩 +阻 +资 +屐 +亩 +濯 +瓜 +妍 +绪 +样 +稍 +习 +查 +瓦 +勋 +类 +反 +娟 +脉 +执 +奴 +咽 +米 +材 +旷 +昆 +璧 +激 +超 +贪 +牧 +鉴 +盟 +滋 +特 +捲 +吞 +拾 +透 +媚 +娥 +逝 +韶 +萍 +烈 +穴 +莽 +妾 +韩 +廊 +涕 +兀 +寐 +涂 +弗 +皋 +标 +戒 +浊 +笺 +绛 +黯 +假 +贼 +峡 +汗 +介 +抛 +涨 +焚 +纶 +娱 +粟 +促 +术 +晋 +役 +活 +局 +男 +慕 +惆 +帽 +籍 +稳 +博 +祥 +敌 +诵 +默 +倍 +暝 +质 +巨 +槎 +洪 +缓 +冈 +笋 +陪 +祗 +滩 +刚 +兔 +魄 +溟 +圃 +徵 +耿 +豆 +愚 +岚 +敛 +藻 +契 +拙 +偷 +冉 +棋 +品 +仗 +梨 +陇 +设 +飒 +惯 +蹄 +枉 +卖 +颓 +黍 +琅 +般 +做 +塔 +科 +梢 +吊 +滨 +淹 +助 +禄 +瓶 +防 +逃 +裙 +贞 +藜 +悴 +邑 +遐 +挽 +凿 +庶 +汤 +尧 +叟 +闾 +览 +咫 +单 +循 +皎 +涉 +怯 +旬 +障 +谩 +龟 +舌 +槐 +仆 +优 +赵 +丑 +邦 +街 +规 +綵 +巡 +腊 +电 +乔 +稻 +操 +袂 +玩 +珊 +钗 +希 +毕 +刺 +夹 +装 +芒 +蟠 +蜂 +告 +畴 +孟 +葛 +垒 +欺 +娘 +案 +敲 +黛 +鲸 +缀 +苗 +缨 +貌 +凋 +闺 +逾 +岑 +价 +筹 +格 +傲 +滞 +锡 +凫 +狼 +蛾 +帷 +乳 +让 +洁 +匪 +瀛 +季 +咸 +峦 +猎 +贾 +律 +伫 +澜 +逼 +拍 +吁 +些 +恼 +鞍 +悔 +颂 +奚 +呵 +臂 +敬 +悄 +夙 +恍 +脂 +夺 +练 +禾 +肝 +沟 +漾 +耀 +垣 +偃 +泰 +茗 +扰 +侬 +软 +据 +线 +飙 +效 +涌 +卫 +鸯 +揽 +柯 +俊 +茂 +剧 +递 +握 +隅 +稽 +跃 +权 +蟾 +赴 +烽 +汀 +缺 +湾 +部 +器 +胆 +整 +衾 +泻 +曳 +廷 +酿 +裂 +菜 +位 +败 +蚕 +锄 +叫 +缠 +棘 +挟 +狐 +么 +鼠 +蚁 +恒 +蒿 +凰 +缥 +烹 +植 +墓 +凛 +渴 +蒸 +憔 +婆 +酸 +宰 +突 +箭 +慵 +掷 +雏 +煮 +骄 +鹃 +浸 +序 +芬 +籁 +宠 +鬟 +充 +涵 +彭 +较 +砚 +速 +梵 +瀑 +禹 +瓮 +株 +授 +呜 +皮 +铺 +斧 +智 +祸 +篆 +庵 +拔 +巴 +《 +浴 +》 +眷 +薇 +曙 +蕉 +逆 +砌 +戍 +诉 +范 +丧 +魏 +! 
+搜 +像 +沦 +议 +幅 +仓 +霖 +跨 +决 +蘋 +岫 +戴 +窟 +胥 +恣 +睹 +卢 +勒 +矫 +叔 +髯 +备 +朵 +靡 +纳 +鹄 +倏 +膝 +坟 +嫩 +叩 +滑 +隆 +蔽 +衮 +秘 +混 +摘 +架 +炊 +拈 +捧 +倩 +选 +霓 +榆 +潺 +衢 +眺 +岐 +耐 +嶂 +巳 +毋 +奋 +氛 +款 +谱 +篷 +俄 +谪 +答 +讶 +蹇 +均 +匹 +汲 +申 +慎 +翳 +蓝 +板 +鹅 +讲 +艇 +梯 +簇 +稚 +互 +荫 +愈 +妖 +专 +磬 +巫 +熙 +猛 +绳 +扑 +患 +疆 +函 +卒 +矶 +躬 +羹 +鳌 +耽 +薪 +豹 +碍 +宸 +沽 +峭 +楫 +伦 +粲 +按 +矜 +祀 +荔 +伐 +巍 +闽 +馨 +晏 +脩 +召 +抽 +壤 +盐 +郑 +职 +葱 +盼 +浆 +赊 +秉 +芽 +妒 +湛 +皓 +吉 +匡 +笳 +震 +辱 +牢 +萤 +臾 +获 +卑 +抵 +杰 +鼻 +眸 +模 +鹰 +寝 +羲 +沐 +檀 +冢 +蓑 +甫 +耻 +贯 +逍 +豁 +蔬 +朽 +崔 +斟 +渭 +瘴 +锋 +黎 +损 +示 +穹 +污 +央 +考 +焦 +谒 +勇 +屯 +窈 +佐 +寰 +顺 +缈 +藓 +寓 +橘 +绡 +罪 +龄 +辽 +躯 +倘 +廿 +戟 +溯 +域 +牖 +辔 +蜡 +萼 +擅 +卓 +盆 +滔 +裹 +廉 +救 +捐 +筇 +顽 +辇 +耸 +虬 +镫 +醪 +洋 +遽 +啄 +惹 +庸 +割 +卮 +贺 +笠 +昂 +仞 +拨 +寡 +疲 +削 +铭 +誓 +嘶 +菲 +憩 +显 +驹 +嫁 +姬 +畦 +朗 +鞋 +率 +棱 +珑 +僻 +貂 +慷 +椒 +医 +粗 +雉 +蔓 +呈 +旱 +旨 +陂 +附 +劲 +释 +厨 +妃 +骊 +悦 +灶 +驴 +髻 +鸠 +状 +赫 +填 +犀 +毡 +沿 +徙 +煌 +苟 +猜 +轴 +桓 +琐 +浣 +例 +潦 +欹 +螺 +肌 +灌 +贡 +署 +盏 +谐 +额 +瓢 +欠 +现 +藩 +纹 +纡 +栋 +陟 +勉 +戚 +骢 +薜 +坏 +捷 +贮 +玲 +针 +窃 +俦 +漂 +晶 +墟 +熊 +敷 +矢 +萋 +钧 +圭 +怡 +辙 +豫 +傅 +赢 +隙 +屠 +丞 +鸭 +裾 +斛 +阮 +蕙 +腥 +盗 +兽 +艺 +溜 +瓯 +冶 +闱 +磴 +榜 +簟 +拳 +篁 +舜 +奸 +冤 +铅 +蹉 +液 +检 +煎 +丸 +鹏 +骏 +梳 +狗 +段 +晦 +偕 +鲤 +茎 +雍 +蛩 +菱 +完 +舫 +乏 +揖 +派 +兆 +油 +灾 +誉 +妄 +嬉 +铃 +箧 +族 +嗔 +迤 +厥 +傥 +敝 +邪 +霸 +箕 +庾 +驭 +沼 +贴 +跳 +陋 +队 +骥 +砂 +包 +毒 +鹦 +襄 +训 +预 +乖 +葭 +迩 +务 +砧 +骖 +昧 +校 +衲 +藕 +袭 +俨 +跎 +剥 +沸 +警 +扃 +仅 +樯 +蔼 +筠 +榭 +瑚 +阖 +缄 +遁 +航 +筝 +晕 +扣 +浇 +攒 +迫 +俎 +铸 +垆 +粤 +钵 +辄 +骤 +闹 +讯 +络 +辕 +檄 +恻 +娑 +伸 +肆 +鲈 +邈 +衷 +巅 +诛 +缚 +祈 +刑 +党 +跻 +琶 +廓 +肤 +恁 +涓 +杪 +璃 +翅 +惬 +挹 +嵩 +熏 +峻 +悽 +翮 +苇 +搔 +嵬 +拱 +仕 +翘 +俺 +挑 +寇 +基 +谏 +咨 +葵 +邮 +磊 +宙 +勾 +犊 +穆 +榛 +卉 +钿 +哲 +虏 +构 +碗 +帏 +蛙 +麓 +尉 +峤 +峥 +咄 +凶 +栗 +樊 +耆 +祭 +旆 +擎 +课 +涤 +恙 +蹑 +溅 +准 +愤 +夸 +店 +坞 +漆 +抹 +麾 +办 +产 +浙 +匀 +窕 +桡 +茧 +辅 +嵘 +钦 +怆 +喷 +奠 +冀 +吸 +撑 +衙 +迂 +冯 +耶 +粮 +竭 +贻 +琵 +夭 +谊 +匣 +弯 +攻 +腐 +援 +缩 +濡 +畅 +奕 +淳 +漪 +迈 +爵 +尖 +蠹 +控 +拘 +述 +拭 +拄 +研 +颐 +赞 +缅 +墀 +吕 +害 +魔 +葬 +湍 +幡 +淋 +暄 +唇 +茸 +培 +舂 +楹 +亏 +雌 +莹 +桨 +侠 +颦 +帖 +慧 +担 +榴 +犁 +抗 +粱 +溢 +槁 +嚣 +簿 +犯 +胎 +稼 +葩 +鄙 +淫 +哦 +滟 +憎 +菰 +爪 +骸 +扪 +朴 +庚 +冕 +靖 +纨 +溶 +莼 +椽 +讨 +鹉 +伟 +纲 +掠 +屑 +唾 +阜 +玕 +恭 +崩 +曛 +储 +蟹 +肺 +婴 +驼 +彷 +銮 +韦 +亚 +逅 +橐 +邂 +琳 +猷 +稷 +择 +截 +播 +践 +淑 +渥 
+柄 +煖 +灿 +忌 +判 +阊 +厉 +幼 +茵 +讴 +彤 +录 +绶 +邱 +萱 +樱 +闷 +芸 +帛 +责 +翛 +牡 +测 +徂 +刍 +徽 +毁 +屦 +监 +窄 +袜 +荧 +焰 +稠 +谅 +灼 +鸱 +嗣 +膺 +绩 +哥 +刹 +嗜 +峙 +狎 +冒 +配 +噪 +褐 +驯 +拚 +蓼 +躅 +蹙 +尼 +诀 +潘 +惧 +栊 +恃 +毂 +敞 +漓 +匝 +吼 +婵 +育 +屿 +蝇 +旭 +橹 +媒 +涩 +蚤 +敦 +陀 +糊 +贝 +组 +漱 +杵 +蓟 +块 +亮 +颗 +克 +仇 +骋 +撼 +尸 +沃 +蔚 +擘 +纯 +替 +舸 +胶 +闪 +蹊 +癖 +逋 +羌 +忝 +蔡 +挺 +颊 +幄 +遵 +醇 +蛰 +吃 +夔 +炼 +帙 +炙 +隈 +狱 +坊 +飕 +谣 +筒 +札 +拖 +饰 +揭 +馥 +艾 +枣 +羸 +碣 +粒 +饷 +暴 +拓 +努 +紧 +普 +琉 +吠 +剖 +滚 +董 +饼 +氲 +聪 +竖 +莎 +脸 +诘 +窦 +坎 +骇 +梓 +捣 +閟 +泼 +斩 +式 +崎 +秩 +辜 +革 +踰 +豺 +缁 +逮 +飏 +瞰 +螭 +导 +奁 +荻 +彦 +册 +炯 +垄 +厓 +剡 +逗 +验 +掉 +霰 +慢 +嵯 +嘲 +虔 +迸 +饵 +婉 +淩 +笏 +碌 +阡 +箸 +享 +麒 +筋 +甸 +琢 +悭 +尹 +麋 +啾 +镌 +鍊 +究 +跸 +麝 +腻 +抑 +挲 +乙 +遑 +财 +韬 +褒 +舄 +枢 +给 +嚼 +谙 +幔 +坦 +悉 +掀 +幢 +楸 +技 +宪 +创 +踞 +湲 +瓣 +汎 +堵 +秃 +蝴 +岱 +鲛 +蜃 +曷 +鸪 +嗤 +虾 +釜 +炬 +债 +庞 +偿 +斤 +缪 +蕃 +翡 +撩 +缝 +梗 +囚 +掣 +豚 +蜜 +苕 +敕 +迅 +绘 +赓 +颁 +穗 +煞 +麈 +蕴 +譬 +衍 +挈 +租 +鹧 +畿 +鸢 +歧 +瞬 +旄 +篙 +烬 +鹓 +胭 +绾 +缆 +税 +孺 +箱 +泗 +刃 +侪 +纬 +泄 +髓 +霆 +墅 +项 +郢 +杉 +粥 +涟 +匠 +罔 +腴 +恤 +龛 +褰 +掬 +巘 +饿 +汾 +崙 +芭 +雠 +萸 +骝 +惮 +嘘 +垢 +驷 +详 +眇 +欧 +姚 +蠡 +亿 +叨 +皱 +蒹 +洽 +瞳 +蜗 +它 +蕤 +矗 +濑 +淅 +篮 +蹈 +氤 +黔 +绅 +泓 +欤 +魁 +鼍 +颖 +臭 +爰 +唳 +伍 +隋 +孩 +苞 +版 +睨 +陛 +酹 +帅 +绽 +叱 +统 +姜 +恬 +抟 +岖 +堞 +询 +芰 +签 +瞥 +饯 +萌 +桧 +證 +仿 +斫 +伶 +浔 +啭 +蓄 +莓 +湄 +协 +睇 +蓦 +梭 +秧 +檠 +屹 +僮 +罍 +拿 +邹 +秾 +胧 +鬣 +搴 +谟 +遨 +妹 +鹜 +骎 +裴 +缟 +椿 +栈 +描 +鹂 +蒂 +弧 +谬 +攲 +茨 +恢 +裔 +苒 +袁 +需 +糟 +轨 +阎 +捉 +缭 +沛 +晤 +隘 +烘 +沥 +讼 +哑 +廛 +珂 +臆 +什 +搅 +诧 +肘 +牍 +掖 +亟 +醺 +骂 +旃 +酥 +觥 +燃 +鲍 +曦 +猗 +喉 +颍 +婿 +砥 +俭 +鼙 +汪 +俟 +陨 +枚 +溺 +跋 +弘 +泯 +宏 +祷 +僚 +盍 +喻 +窜 +稿 +玑 +霎 +朅 +恸 +倡 +哗 +蜕 +皂 +搆 +芹 +罅 +茜 +杞 +蕖 +厦 +摹 +谨 +焕 +锵 +酷 +蚀 +逶 +翎 +诞 +亘 +沤 +镂 +峣 +箔 +曜 +霍 +醴 +蕨 +察 +蛛 +辍 +眩 +攘 +戢 +蹰 +绸 +婚 +腕 +删 +涎 +蘼 +馔 +晞 +淙 +朦 +窍 +钥 +划 +瑰 +斥 +瞿 +槃 +昙 +锥 +灞 +蹴 +徼 +惘 +逡 +彝 +鹢 +邯 +坼 +轰 +翱 +股 +炷 +柘 +僵 +钺 +逞 +督 +茆 +陷 +副 +厄 +巉 +觑 +豕 +啖 +泮 +秪 +髭 +缸 +甜 +仄 +椎 +郸 +惺 +踟 +萄 +盎 +阆 +绰 +哺 +窝 +拆 +筐 +讥 +珰 +嵇 +骞 +呀 +姊 +枪 +芷 +碛 +轸 +邸 +姐 +醅 +讹 +棺 +扈 +颈 +凯 +涸 +逊 +嶙 +寅 +昊 +厮 +浥 +沱 +漳 +愆 +滥 +愔 +岷 +黏 +惕 +炭 +弩 +奥 +咱 +踵 +箨 +趾 +诺 +璞 +疮 +襦 +罕 +摄 +撞 +辩 +踌 +萃 +篝 +笃 +秣 +爨 +级 +脆 +汇 +趺 +妩 +潢 +庖 +榔 +禦 +汹 +港 +轧 +硕 +洄 +允 +绍 +倪 +瑕 +姻 +噫 +叉 +曼 +稔 +邺 +菩 +惑 +馈 +齑 +秽 +徨 +峋 +渌 +旸 +舣 +棂 +蠲 +卸 
+肖 +驺 +轺 +弊 +颅 +闼 +竺 +劣 +芊 +叙 +玻 +兜 +隶 +躇 +茹 +迭 +酝 +奎 +幌 +霾 +偈 +骈 +戛 +窠 +俛 +濠 +爷 +潋 +审 +毵 +婢 +簧 +擢 +铗 +蹲 +渤 +掇 +芋 +屣 +聋 +岧 +娃 +彫 +寤 +柝 +■ +藿 +荼 +肱 +赛 +诰 +谭 +衬 +狸 +狭 +汴 +铿 +球 +沓 +阍 +狮 +铎 +坂 +析 +渍 +悯 +诡 +囷 +聒 +毗 +绥 +阗 +杓 +帻 +员 +恹 +蟀 +腔 +谗 +逦 +觚 +脊 +寮 +奢 +剔 +柰 +潸 +俸 +杭 +铄 +谤 +坑 +罄 +唧 +祚 +励 +炳 +旂 +卅 +斲 +勃 +苧 +耘 +旒 +斸 +腮 +鞠 +芥 +滂 +嫣 +衿 +漉 +嗷 +縠 +氓 +埽 +蜿 +笥 +矧 +鞅 +匿 +蘧 +褪 +枰 +闰 +邓 +蟋 +罨 +芍 +驶 +杼 +沮 +菽 +穰 +柑 +陡 +斝 +茱 +嫦 +货 +蘸 +霹 +酲 +薤 +粼 +惚 +蚊 +瀰 +滇 +釐 +繇 +沫 +绀 +俾 +滓 +啜 +缃 +驽 +脑 +蹋 +瓠 +撒 +辘 +泾 +鹘 +臼 +铢 +拊 +骅 +膻 +赭 +屧 +瘁 +鹗 +苓 +撷 +癯 +栾 +魅 +邃 +貔 +订 +绂 +雳 +茁 +爇 +滕 +湫 +概 +骧 +飔 +擒 +兢 +觐 +徉 +纭 +旻 +孱 +皤 +钝 +演 +卯 +盂 +禊 +婷 +涴 +缣 +忻 +瞩 +锐 +沁 +甑 +娉 +跪 +粘 +岘 +沌 +矛 +矩 +虱 +浃 +妓 +搏 +觌 +诣 +弭 +髣 +渝 +鲙 +髴 +踯 +荏 +彰 +聘 +峒 +亨 +枭 +嵌 +冽 +槽 +遏 +螯 +瞒 +靥 +狡 +伪 +畹 +烁 +樗 +铛 +绚 +隼 +秫 +拒 +姝 +臧 +绢 +曝 +陲 +燎 +肇 +砺 +祐 +帚 +暾 +堑 +筱 +刊 +蒋 +玳 +瞑 +渎 +鲲 +庇 +孥 +塍 +侈 +陬 +簌 +荇 +豸 +萎 +榕 +蔷 +缤 +蜒 +疗 +库 +佞 +颔 +缱 +廑 +勺 +薮 +甃 +燄 +徜 +嬴 +窅 +泫 +奄 +躔 +孜 +丙 +飐 +坳 +谦 +苜 +枥 +纫 +蓿 +盲 +捕 +裤 +穑 +澈 +匏 +薛 +罥 +鄂 +熳 +睛 +砾 +企 +鳖 +竽 +箴 +糠 +汨 +簸 +沆 +娄 +晒 +甍 +摆 +曩 +偎 +牒 +勖 +垠 +绎 +粳 +荦 +咎 +菟 +轲 +炫 +吝 +孽 +绫 +窘 +撚 +楮 +矮 +酡 +杲 +廪 +葺 +憾 +鹳 +荀 +榼 +侔 +琪 +贶 +箪 +赪 +齧 +菁 +洼 +萏 +攫 +挝 +菡 +阃 +圜 +溉 +坰 +亹 +迓 +閤 +皴 +柁 +崦 +厅 +贲 +脍 +槿 +括 +刮 +隽 +讽 +讳 +笈 +沅 +韭 +蛱 +臻 +葳 +糜 +忒 +汩 +胄 +钻 +赌 +嵚 +佑 +舷 +澌 +碾 +砖 +钜 +殆 +葆 +缑 +礴 +泚 +谑 +嘤 +喃 +啻 +舅 +猩 +绻 +牲 +姮 +骀 +淼 +爆 +劈 +聆 +靓 +茕 +伎 +鴂 +绯 +罴 +糁 +垓 +葡 +飧 +闉 +膳 +彊 +崚 +倖 +腑 +棉 +嘱 +罹 +樾 +穫 +湔 +嶒 +吻 +殉 +诬 +虐 +罩 +敏 +禀 +殢 +曈 +邛 +谡 +悼 +谯 +瀣 +棣 +遒 +蚌 +髦 +耗 +摛 +镳 +醑 +鼋 +栩 +宕 +跌 +懿 +孕 +鸰 +卦 +辋 +嘈 +徇 +炮 +漕 +锸 +晷 +撇 +燠 +菖 +磷 +戮 +亸 +耒 +羃 +赚 +证 +伽 +霈 +飖 +祁 +攸 +喝 +璇 +毬 +妥 +圞 +缶 +缬 +靴 +旰 +斓 +诃 +劬 +饕 +芃 +皑 +勘 +批 +嗅 +纻 +惶 +崆 +镵 +帜 +喘 +睿 +桴 +瘠 +咿 +淇 +蔗 +庑 +刷 +绊 +臞 +肢 +窣 +鲵 +址 +蚩 +狄 +慇 +鹫 +荚 +舲 +炽 +柚 +酉 +爹 +弋 +潞 +娜 +卤 +瀼 +晃 +卵 +痍 +珥 +肴 +舶 +玺 +侮 +脾 +溷 +黾 +纠 +牌 +蹶 +跹 +睫 +睢 +畜 +颤 +兕 +呻 +轳 +翥 +堠 +摊 +嫂 +耄 +罚 +撰 +阕 +孚 +羔 +栉 +粹 +欷 +浚 +枳 +菌 +帕 +厢 +恕 +羯 +弛 +硬 +煦 +诩 +舠 +晌 +耦 +湮 +该 +隰 +蘖 +霅 +谕 +粪 +箓 +翟 +绠 +棚 +拣 +摅 +眄 +罂 +槊 +殃 +橙 +泱 +诠 +雩 +绨 +邵 +诮 +戾 +确 +骐 +距 +瀹 +懊 +型 +舻 +纛 +掾 +娲 +闇 +牟 +黼 +酩 +蠢 +煤 +濆 +阒 +禋 +贷 +珀 +猪 +疴 +辖 +蜍 +姥 +筌 +翕 +猴 +鞯 +辀 +憨 +蜚 
+刈 +甄 +刁 +侑 +缰 +弁 +篪 +扼 +忡 +涡 +赎 +釭 +殇 +抬 +縻 +壳 +袈 +抉 +抄 +瀚 +泬 +黠 +蟆 +沄 +铙 +蜉 +伞 +喔 +迦 +蹀 +鹑 +魑 +酊 +汞 +颉 +抔 +谆 +竦 +軿 +莪 +腋 +溽 +歔 +歉 +蓂 +措 +咒 +贳 +藁 +裟 +币 +罾 +煨 +淘 +觜 +睥 +售 +赍 +藐 +髫 +螀 +畬 +猊 +暧 +蚓 +毅 +券 +璚 +琥 +殚 +悰 +墩 +译 +鳄 +沂 +芟 +扳 +脯 +挨 +狞 +瘗 +龚 +亥 +溃 +掘 +遘 +墉 +叵 +惰 +闵 +饫 +貅 +猖 +囿 +酤 +暌 +鬻 +磐 +萨 +疠 +拉 +谶 +挫 +嘻 +嘴 +匕 +骛 +饴 +轶 +蜩 +艘 +淞 +哩 +楞 +某 +串 +燧 +哂 +瑁 +泷 +佯 +荠 +枻 +烨 +刬 +蜻 +盥 +燥 +殁 +啮 +厕 +毳 +蘅 +涔 +岌 +赦 +辗 +谛 +喙 +絷 +酪 +猱 +磔 +歆 +髀 +徕 +呕 +髡 +缯 +汁 +瓷 +烝 +蝼 +濩 +媪 +濂 +鴳 +匮 +蔌 +雒 +覃 +徬 +埙 +馋 +噬 +矍 +狩 +漭 +泳 +桔 +贰 +筛 +槩 +棒 +婺 +咳 +爻 +厩 +髅 +絺 +燐 +怖 +祉 +璋 +核 +喟 +剌 +箬 +颙 +裕 +姹 +祛 +估 +亢 +惩 +届 +飨 +汐 +欸 +邙 +揣 +玦 +鹍 +驮 +赈 +荪 +甥 +熠 +荄 +楠 +醁 +懦 +仑 +诈 +愉 +痒 +蜓 +弈 +扮 +铮 +洵 +荜 +浒 +孑 +酴 +牝 +涪 +姨 +舰 +杆 +押 +寨 +蝗 +熨 +忤 +鼕 +馁 +崛 +畎 +嘹 +咤 +纾 +妪 +醾 +遄 +栎 +镕 +疵 +澒 +噤 +蹬 +屩 +茏 +缔 +缊 +愠 +罟 +猫 +仔 +豳 +歘 +晡 +咀 +槔 +搀 +瞢 +漙 +撤 +扆 +涘 +伺 +鹪 +赀 +荃 +偻 +① +闸 +隳 +阐 +崒 +赘 +脏 +燮 +熬 +楣 +裨 +俞 +殖 +邅 +牺 +挠 +偪 +佗 +缗 +奖 +跛 +篘 +踊 +秖 +盻 +摸 +揩 +儗 +鹚 +跄 +卞 +鞦 +佣 +桁 +戊 +冗 +丕 +壬 +贩 +翊 +璨 +揉 +庠 +娶 +娆 +碓 +岗 +鞚 +纂 +籥 +暨 +揆 +拗 +岿 +戌 +虺 +汶 +琤 +楷 +靳 +瘼 +刳 +谀 +溆 +肓 +殒 +钞 +躞 +苴 +胪 +泡 +呢 +钲 +玷 +柿 +菑 +圈 +鹥 +踽 +斡 +拯 +堰 +饧 +荅 +抒 +彀 +苛 +捻 +匈 +跂 +啁 +阀 +鹡 +诲 +黜 +钳 +鏖 +弼 +岏 +眦 +枌 +憧 +喁 +韆 +诱 +蝣 +珉 +镝 +逵 +狙 +歼 +巇 +蛤 +侄 +袷 +荑 +甓 +皖 +凑 +砻 +疋 +氅 +龊 +衽 +繄 +爬 +俪 +豗 +2 +鸮 +铓 +拼 +饲 +胫 +” +菘 +“ +镞 +锻 +缴 +毓 +抡 +棕 +婀 +喑 +酋 +琚 +濒 +敖 +雹 +戕 +忏 +锤 +蜺 +篌 +慄 +祲 +渑 +鸬 +驩 +鄱 +璀 +崄 +鹩 +癸 +恳 +燔 +缉 +悤 +袒 +檗 +箜 +轼 +旎 +咬 +筼 +抢 +靠 +髑 +肚 +纽 +旐 +捎 +叮 +脐 +鸂 +呦 +胁 +灺 +壖 +埒 +佚 +毹 +茔 +芗 +冱 +鼯 +祓 +忿 +阪 +酾 +蹠 +购 +裸 +蕲 +璠 +怠 +箠 +巩 +崧 +鸽 +栅 +吓 +潼 +祢 +瓈 +叛 +鲂 +煜 +陔 +蓍 +蓠 +揄 +啧 +笞 +淤 +夐 +觇 +蒐 +阱 +琰 +涅 +鶱 +狖 +撄 +垫 +淄 +怦 +倭 +慑 +悍 +娈 +丫 +筜 +玙 +嶷 +醯 +瘳 +焙 +灏 +捍 +俜 +豢 +蠖 +础 +聿 +铨 +澎 +辂 +琊 +拦 +蹁 +楝 +鳏 +痾 +愕 +澍 +氍 +闯 +菅 +筮 +瘿 +瑜 +壅 +鵩 +琯 +涣 +帔 +旖 +騑 +钉 +逻 +耜 +薏 +婪 +葫 +滹 +醽 +衅 +俘 +颃 +渟 +搭 +鎗 +迄 +邕 +秕 +歊 +巑 +吭 +侏 +螟 +赉 +摵 +烱 +昵 +钤 +钏 +舵 +呱 +剂 +绦 +掺 +哽 +饤 +笾 +滪 +埏 +迨 +襆 +莘 +鬯 +隗 +缫 +畀 +茯 +牂 +喈 +拽 +韫 +彬 +贬 +蒯 +禺 +圮 +抖 +嚬 +骆 +莅 +溥 +嫖 +皦 +摺 +颢 +肮 +綦 +凹 +翚 +洙 +欃 +裛 +椰 +鶒 +袋 +砭 +湓 +沚 +鶗 +阛 +镰 +晻 +忱 +呆 +诫 +噩 +缙 +黻 +侃 +葑 +睦 +籀 +彴 +圉 +觏 +澳 +跏 +淬 +槌 +捩 +嶪 +镐 +饬 +躁 +罡 +砢 +蕾 +昽 +1 +纩 +棐 +鳣 
+磅 +霙 +彗 +辑 +蹭 +鸷 +椠 +詹 +硎 +筏 +町 +槟 +枵 +肋 +祟 +痼 +泖 +搦 +叆 +们 +㶁 +祯 +殽 +歙 +搓 +咛 +鹴 +桀 +莞 +飓 +跬 +琮 +欻 +晁 +殄 +酢 +械 +驳 +赣 +茝 +濮 +惫 +崭 +圻 +3 +赁 +艮 +盾 +瓴 +嫠 +稗 +蔻 +峄 +鸲 +骓 +裯 +鹆 +跟 +樛 +嫔 +辚 +褥 +舳 +棼 +懵 +绁 +璜 +幂 +骠 +汰 +崿 +鬖 +摐 +媸 +跚 +袪 +缇 +黝 +陕 +蝀 +聃 +滁 +哎 +谥 +呗 +餍 +讣 +耋 +筦 +哮 +侨 +糕 +畸 +狯 +杷 +啰 +匙 +醍 +苡 +罘 +椟 +襜 +桄 +诳 +您 +魃 +餔 +锢 +蓊 +恺 +伧 +䍦 +琬 +晔 +迍 +瞋 +櫜 +扛 +幰 +丐 +媲 +伥 +赶 +胞 +皈 +鳷 +莉 +窖 +滦 +醐 +皪 +枨 +咍 +锷 +瞽 +皞 +沴 +沪 +怊 +酱 +撮 +轓 +諠 +晰 +枒 +螳 +窑 +矰 +忉 +鋋 +莠 +盱 +蝠 +茉 +玖 +夥 +垤 +骡 +忪 +骁 +媛 +醮 +酽 +裀 +薝 +舐 +耨 +矻 +嫉 +嘒 +訾 +艋 +絜 +狠 +澧 +恂 +俏 +综 +禧 +磻 +巽 +剜 +② +怵 +帑 +侥 +獭 +徭 +嗥 +铩 +谴 +脔 +粕 +睽 +桕 +鬘 +舴 +揶 +籴 +滃 +滉 +隧 +遝 +茭 +猬 +毙 +俚 +邗 +抠 +夤 +籯 +屃 +诙 +笄 +栝 +擞 +懈 +斐 +巀 +嵷 +麰 +觊 +粝 +宥 +芡 +眈 +畛 +鷟 +畚 +孀 +颡 +榱 +挼 +鼐 +潆 +泞 +镛 +铦 +淀 +懋 +噀 +瘵 +淖 +洳 +饘 +圯 +衒 +仝 +鹇 +邢 +羝 +璈 +琛 +湃 +怍 +阨 +锼 +繻 +泸 +扯 +瘖 +黉 +鲫 +阇 +芾 +紬 +惝 +怏 +菹 +菀 +儋 +麇 +踉 +绒 +嵂 +哢 +鹣 +輶 +蜷 +槭 +捋 +鬐 +澥 +谖 +悃 +雎 +醲 +獠 +昃 +嵋 +莳 +戡 +伛 +膜 +鸶 +荟 +窿 +皛 +痊 +泐 +枇 +挛 +怛 +埭 +跣 +熹 +巃 +唼 +惋 +恧 +鼾 +朮 +傀 +鹔 +洮 +隍 +翾 +慌 +悁 +妈 +吮 +辐 +缦 +鸑 +谲 +蒜 +剽 +锱 +蘩 +怿 +龉 +玛 +鹈 +砑 +琨 +鞬 +缛 +磈 +蹐 +胃 +篦 +稊 +孳 +坝 +瑾 +淰 +鹯 +蔑 +苹 +籞 +浯 +桷 +嘿 +酎 +瞠 +瘢 +楯 +朕 +悒 +俩 +韝 +逖 +诜 +沔 +鲠 +鲊 +蹒 +赑 +蛄 +忖 +姗 +飗 +篚 +汜 +塌 +騕 +跕 +裈 +牁 +燹 +榇 +拌 +壕 +镬 +胝 +窭 +猥 +潏 +桩 +鞮 +跫 +虿 +鹖 +荤 +篴 +祜 +僦 +僝 +4 +胚 +狺 +惓 +塿 +蝌 +肿 +砀 +畤 +旟 +哇 +魍 +迪 +诤 +虮 +羿 +唫 +诋 +褚 +薙 +眯 +歈 +惴 +刖 +庥 +闬 +谠 +渗 +憀 +垦 +喋 +龌 +鬨 +瑙 +暍 +龃 +鸩 +隤 +磁 +澡 +咆 +呖 +舡 +坫 +釂 +殡 +啬 +貉 +恪 +嵓 +彪 +堡 +醢 +茑 +璿 +汍 +栀 +儡 +璎 +件 +阓 +募 +钢 +襁 +菼 +绉 +悸 +坪 +镮 +蝙 +肪 +篑 +厖 +荞 +泌 +橡 +嗒 +褓 +蛊 +蔀 +罳 +硗 +寔 +霪 +箑 +晼 +忺 +倔 +贸 +蠕 +纺 +阏 +键 +蓐 +苋 +膴 +獐 +僭 +赝 +褊 +舛 +捞 +鲭 +掞 +鍪 +瞎 +汛 +榾 +宓 +诒 +莸 +荩 +敧 +憺 +倜 +辣 +绷 +槲 +馌 +螂 +璆 +挐 +嘏 +魉 +蹂 +昕 +幛 +匍 +鹁 +褫 +袄 +萁 +揾 +僽 +蝎 +腼 +罽 +玫 +榷 +挞 +幺 +辏 +絪 +坻 +耰 +柮 +赡 +潴 +谧 +诹 +荛 +珞 +铲 +蜮 +浏 +悌 +娴 +坷 +呶 +诔 +蟪 +拢 +塾 +佥 +褶 +踬 +觱 +欬 +彘 +谔 +膊 +盒 +帼 +凸 +縢 +疫 +斁 +颛 +蹻 +虢 +崟 +哄 +幪 +蚪 +匐 +俶 +鞘 +闳 +镊 +趍 +訇 +睁 +疚 +炤 +饔 +悚 +怙 +谺 +苻 +纮 +丱 +馘 +霤 +霢 +瀁 +陉 +猝 +棨 +昴 +唶 +伉 +蟫 +蓁 +苾 +箐 +稏 +犍 +挤 +厂 +乂 +鹞 +鲋 +躲 +簦 +磋 +堙 +佺 +鳜 +肸 +鬅 +仃 +醨 +绤 +淝 +峪 +鲰 +轫 +谽 +禳 +湟 +柢 +忾 +廨 +壒 +頫 +踔 +劾 +缲 +朓 +垩 +矿 +烜 +剿 +蚨 +莆 +狷 +橛 +另 +谚 +谄 +艎 +磕 +甬 +炜 +涝 +吒 +霶 +繐 
+恫 +騧 +鞲 +螮 +蒇 +苌 +戆 +姒 +膂 +壹 +哈 +叇 +③ +鄞 +逭 +赧 +诟 +罝 +熄 +搬 +帨 +贽 +蔫 +脰 +睐 +陴 +獬 +锯 +睍 +嶰 +劄 +剸 +蕞 +踆 +琏 +牯 +涿 +浍 +霂 +讫 +楂 +懔 +奶 +騄 +蓓 +捏 +崤 +噭 +斿 +坌 +醋 +曀 +「 +鞿 +糖 +樨 +嵝 +寘 +砰 +暵 +慊 +嵲 +嵫 +喂 +蚋 +耍 +澨 +窱 +樟 +兑 +鬙 +踣 +跑 +愬 +礧 +瑛 +黩 +鬒 +蛉 +矾 +瓿 +煽 +噎 +勍 +鯈 +郤 +帱 +桅 +搢 +䆉 +苎 +簏 +爝 +渲 +搥 +奂 +僰 +蚃 +綍 +簋 +秤 +淆 +扩 +惇 +尻 +騃 +跗 +蒺 +簉 +箦 +筴 +瓒 +楩 +柬 +埼 +胼 +癃 +疟 +枋 +岵 +剃 +」 +鲑 +锅 +芼 +缋 +绋 +慝 +鬲 +蛆 +舁 +% +嘅 +蜑 +屺 +膈 +篾 +睆 +旺 +媵 +塑 +哆 +齁 +槚 +愦 +尪 +裒 +胤 +黥 +狈 +炀 +柂 +枷 +嫡 +刲 +擪 +壈 +" +霔 +缮 +缡 +榄 +弆 +鸧 +腓 +纥 +殳 +掐 +鼪 +蜘 +茄 +艭 +糗 +抃 +噱 +俳 +駮 +蛸 +胯 +瓞 +戺 +哪 +馑 +锣 +菉 +哨 +凊 +蕡 +徯 +嗫 +觳 +荥 +狻 +湜 +垞 +哓 +猾 +挚 +铉 +詈 +梏 +戋 +糇 +琲 +娩 +謇 +艟 +嵎 +屼 +媳 +咮 +謦 +虻 +萑 +摽 +嗏 +劖 +儆 +踢 +砉 +酺 +炰 +漩 +桢 +愫 +趯 +苫 +搁 +卌 +刓 +諕 +聱 +洑 +氐 +稌 +疣 +疢 +氎 +椅 +哕 +笮 +犴 +螫 +藟 +瞪 +餮 +铠 +艨 +狝 +堧 +蟏 +蔺 +煇 +洧 +惄 +豉 +缧 +枞 +④ +鲥 +魈 +轿 +躏 +浼 +悖 +咻 +劭 +璁 +洎 +娼 +鶄 +雘 +轭 +趱 +襶 +緌 +笭 +洟 +岣 +唏 +鵁 +臊 +肾 +璘 +杠 +昉 +嗈 +倥 +遰 +辎 +踠 +谌 +螗 +肄 +糵 +睎 +疼 +熇 +嚅 +貊 +襞 +篥 +燬 +歃 +哙 +覈 +褦 +衎 +捶 +埤 +耔 +郦 +蠃 +姱 +骦 +镆 +跧 +纚 +嬛 +颀 +趄 +薿 +眵 +椷 +寖 +坱 +阚 +纁 +箵 +桦 +捡 +5 +詟 +觫 +螽 +稂 +龁 +黧 +鷇 +雊 +茀 +粢 +喊 +傩 +刎 +鞶 +贿 +眚 +瘝 +狰 +榈 +鬃 +跽 +襕 +漘 +帡 +匼 +偬 +偓 +骼 +郫 +藋 +窒 +楔 +讷 +褷 +珩 +睒 +畋 +渫 +娓 +娅 +佔 +鳊 +蟒 +龀 +澶 +亶 +莩 +椁 +帮 +嗽 +饾 +隩 +辊 +薨 +忸 +囹 +呷 +鹠 +骕 +邴 +枘 +黮 +郴 +郛 +紞 +眢 +弇 +墦 +嘐 +骭 +訚 +羚 +猺 +枸 +忭 +丢 +蚍 +蘤 +艑 +臈 +耇 +縆 +禆 +鸺 +驎 +阉 +裌 +蚯 +溘 +毯 +扎 +套 +蛎 +聩 +聂 +缵 +囧 +轗 +蹩 +赳 +蠓 +腿 +獍 +桶 +埴 +圄 +讪 +蜴 +篸 +瀍 +湎 +毸 +陁 +邳 +汭 +椹 +鹙 +雰 +诼 +槜 +柽 +嫱 +兖 +篡 +洱 +摭 +鄣 +诅 +诂 +硁 +渰 +呴 +趑 +貙 +蟊 +毰 +怃 +隃 +迕 +菭 +姣 +傞 +趼 +沩 +媾 +圹 +啅 +骍 +韡 +谜 +胙 +筥 +祺 +疥 +猕 +焜 +栱 +鲔 +谍 +硉 +汊 +撝 +彯 +隮 +酂 +豨 +纣 +硙 +犒 +⑤ +鸊 +裆 +歹 +摈 +拶 +崱 +锈 +诎 +蔟 +篯 +镡 +铫 +糅 +殪 +髹 +醵 +蒌 +砦 +痡 +炒 +溱 +柈 +慭 +腷 +灊 +亵 +籧 +疃 +锉 +畯 +咋 +臬 +翣 +缒 +矇 +柞 +庋 +陜 +虓 +堇 +龂 +笆 +站 +穸 +穟 +沲 +桎 +尨 +卬 +仳 +骘 +殂 +桠 +扊 +帢 +赩 +菂 +祧 +溧 +栴 +柙 +噞 +郧 +谳 +蠛 +筲 +砍 +矼 +瞀 +懂 +帟 +翙 +獒 +廖 +帧 +垲 +轹 +赂 +袤 +腆 +粜 +扅 +幮 +侗 +筚 +狨 +洌 +毖 +恊 +崷 +壸 +墐 +魇 +镒 +褕 +濊 +洸 +杈 +雯 +粽 +籋 +彳 +囗 +厝 +亍 +鹬 +赆 +懑 +喳 +呓 +吷 +6 +谎 +詄 +絓 +玆 +涬 +厎 +僛 +佃 +麑 +轾 +谂 +簴 +窀 +秸 +甔 +柩 +屴 +喇 +轣 +輀 +窳 +眊 +猧 +濈 +浈 +赜 +譊 +葹 +縳 +窆 +洫 +擐 +悱 +恽 +孛 +鞞 +鍧 +袛 +蜥 +莒 +殛 +愀 +嫜 +閧 +铘 +笱 +瞭 +橄 +棬 +拮 +岈 +嘬 
+黟 +韨 +郅 +邠 +荈 +绞 +硖 +痏 +妳 +倅 +榑 +抺 +圩 +唁 +䆗 +霣 +篨 +瘥 +搊 +抓 +佻 +酆 +赃 +糈 +沨 +殍 +旴 +搽 +搂 +垛 +驸 +馗 +鄠 +郗 +萚 +漻 +沬 +擿 +挣 +钁 +蚬 +绌 +洏 +楗 +嶕 +嶓 +嫫 +浐 +掊 +悫 +幨 +亳 +鬵 +蚿 +蕝 +祼 +睚 +狴 +歜 +朣 +忼 +冁 +龈 +辫 +裋 +瀫 +卣 +劘 +駃 +竣 +竛 +痹 +湩 +够 +赒 +笕 +竮 +畇 +洹 +杕 +抆 +愒 +僣 +佽 +䨴 +⑥ +郝 +蝮 +捃 +挡 +怩 +9 +駬 +鏦 +玞 +煅 +妫 +襫 +擂 +嫭 +卺 +躠 +薅 +秭 +潨 +姆 +麚 +袯 +蕺 +紊 +眶 +溲 +薖 +梃 +廌 +哳 +俑 +醰 +谮 +诿 +蜾 +薯 +摴 +拇 +娣 +坯 +喤 +卐 +剉 +侩 +驔 +阄 +揎 +扭 +8 +鈇 +蜊 +筤 +窊 +珈 +屎 +麛 +鹾 +驲 +餗 +逄 +轕 +躐 +胖 +翀 +箯 +痈 +玠 +漶 +泺 +匜 +鞓 +誊 +矹 +偾 +躄 +蘉 +槱 +鱍 +蒱 +簖 +瞆 +爚 +柅 +掎 +芑 +皲 +皙 +痗 +汧 +歪 +櫩 +搰 +抨 +娭 +噏 +骷 +镗 +郇 +蠙 +蓺 +籓 +簜 +瞧 +睟 +眨 +珣 +杌 +嚱 +鳅 +鐍 +趫 +荭 +羖 +瘏 +燖 +撕 +悢 +埘 +雇 +镈 +蝥 +蕣 +茇 +绐 +窾 +枹 +恝 +儓 +侦 +騣 +駪 +颧 +邾 +蒻 +罫 +糺 +瑳 +惙 +恚 +嬖 +唔 +鲇 +馡 +颣 +醄 +芘 +胾 +棫 +栌 +擦 +妤 +觋 +裼 +磥 +悛 +嫚 +坭 +厜 +冑 +黦 +鴐 +靧 +莛 +肫 +惔 +嬗 +堁 +咐 +髿 +钮 +醥 +轇 +衩 +涏 +匾 +㕒 +鲚 +鲐 +褴 +苣 +罐 +眙 +癫 +狃 +犷 +烙 +喣 +鸴 +靶 +蛬 +薶 +硠 +盬 +渣 +楖 +桤 +巂 +嶭 +婕 +儤 +鼷 +黁 +髽 +蹡 +菶 +珷 +牗 +沕 +庳 +埸 +嚘 +鳝 +飂 +霮 +蹯 +艖 +腌 +絇 +碉 +矞 +洚 +揪 +捆 +恓 +宄 +埠 +嚏 +噍 +咷 +咙 +呿 +㷀 +鞴 +衄 +糯 +簠 +笯 +璅 +灂 +屟 +厘 +龆 +讧 +猰 +灸 +棍 +憯 +奫 +奡 +佪 +锭 +酖 +遴 +诽 +薾 +笴 +盅 +琦 +湑 +橦 +晬 +憸 +嬿 +堄 +骜 +饩 +鄢 +逑 +踧 +踝 +蚝 +蓰 +蓧 +瀜 +滤 +掳 +喀 +倬 +鲽 +鬑 +賨 +蝤 +蛋 +碇 +症 +甪 +歠 +楛 +拴 +怫 +岝 +鸨 +骰 +駴 +铏 +轘 +躧 +葚 +眴 +疽 +瀌 +杻 +䲭 +鰌 +锹 +锜 +鄮 +郿 +觎 +臛 +纆 +猋 +涖 +媭 +匦 +俐 +䙰 +7 +阈 +詶 +袢 +螣 +蜣 +罛 +窬 +祅 +甡 +瑽 +燋 +沜 +斮 +媕 +嗾 +鹒 +鱮 +镠 +綷 +痂 +犄 +晾 +憬 +儇 +阽 +阂 +蟢 +苶 +艓 +祔 +獗 +怔 +媮 +嚚 +⑦ +鞳 +迮 +罻 +绖 +絙 +瘤 +玎 +猃 +歛 +很 +嫏 +喨 +倨 +饪 +羾 +票 +瓻 +猘 +橱 +尅 +鬌 +驵 +页 +阼 +觯 +菔 +芮 +艅 +纴 +瓤 +瀺 +氂 +拷 +勷 +馺 +馒 +胛 +秬 +熛 +炕 +濞 +槅 +暹 +幞 +寀 +嗛 +仡 +趠 +袆 +蟉 +膑 +禖 +矑 +牸 +牷 +挦 +惛 +嵽 +倕 +䍥 +髧 +锧 +礽 +盦 +涑 +摒 +傺 +颏 +輠 +虡 +胠 +胔 +綮 +綀 +痌 +瑗 +爓 +滫 +渼 +洿 +搯 +戽 +喽 +凘 +靺 +靷 +靮 +阋 +鄜 +郐 +袿 +蒟 +臲 +窔 +礡 +瑀 +椮 +弮 +妮 +咥 +E +鲧 +阯 +遹 +蠍 +蛴 +籊 +矱 +痿 +犗 +橑 +旓 +奭 +僇 +㟏 +麂 +裓 +蝘 +蛹 +虩 +缜 +玮 +桉 +栟 +擗 +撺 +揠 +峍 +壝 +嚷 +侘 +馂 +镃 +锒 +酕 +跷 +趵 +谇 +緉 +眬 +畲 +燉 +岨 +咦 +駏 +駉 +踸 +踖 +讦 +蛭 +纕 +緤 +枲 +捽 +墄 +咈 +匊 +䰐 +鼗 +鬉 +辆 +訢 +腯 +纑 +樕 +嬷 +妣 +勚 +僖 +㠝 +颾 +顼 +镪 +谫 +蚶 +舱 +竁 +窸 +氉 +奼 +佼 +頖 +闿 +裲 +蠋 +蒉 +腭 +弑 +嚄 +呐 +鵔 +魋 +阘 +葄 +笨 +祊 +畺 +殓 +杙 +捺 +怼 +垡 +喓 +伾 +鴃 +踦 +赅 +貐 +臃 +窨 +潾 +榨 +廋 +嬲 +啐 +劓 +劂 +鲎 +铚 +鎞 +郯 +螬 +蝈 +窞 +祏 +狁 +桌 
+愊 +怲 +妁 +吰 +劻 +0 +騋 +芎 +膰 +罶 +筈 +瘃 +澼 +噌 +嗄 +鵊 +镶 +甗 +熔 +滮 +泅 +櫑 +拐 +懆 +嗑 +卼 +偌 +齾 +鹎 +鬈 +郏 +轵 +觿 +穜 +礌 +瑱 +犵 +焱 +栲 +昱 +塼 +圠 +咚 +僶 +侁 +伋 +㵳 +鸃 +騠 +頀 +螓 +筳 +禔 +瘅 +璐 +滏 +楦 +朏 +忮 +嵏 +哟 +傒 +䍐 +鳍 +鳃 +馣 +韎 +覼 +蚴 +蓣 +祫 +漈 +彖 +娵 +姤 +佾 + +馐 +霉 +蚖 +瀡 +湱 +攧 +挍 +幐 +圬 +僾 +颟 +郓 +粔 +箩 +礀 +圳 +噶 +噰 +呸 +剞 +鸤 +颒 +顸 +觔 +蔘 +蓘 +翌 +睪 +癞 +玼 +斌 +塈 +偲 +㕙 +韔 +舀 +绹 +涷 +朘 +晢 +挪 +憝 +媻 +凳 +鹌 +鲨 +鲦 +鬋 +顗 +韺 +韪 +遛 +貤 +襮 +蛀 +緺 +煴 +晛 +娿 +剋 +㪺 +B +鲟 +骙 +靸 +螃 +蜞 +耤 +羜 +籺 +籹 +礁 +碨 +獾 +牣 +焄 +淟 +棰 +敚 +堀 +㡳 +⑧ +蚱 +蔿 +粻 +篓 +箙 +秒 +瞌 +痁 +瀵 +漰 +斨 +敩 +墁 +黤 +鶋 +鮀 +醭 +诊 +薋 +箾 +窌 +焘 +浡 +曚 +捭 +弨 +坍 +吩 +龋 +髼 +骗 +頞 +霿 +輴 +蹢 +珓 +猢 +漼 +浟 +椤 +棁 +岊 +嚎 +〕 +〔 +雺 +襭 +蟭 +蜢 +蜎 +膋 +腠 +羑 +祎 +痔 +疹 +洴 +沗 +桫 +昩 +扺 +慆 +勌 +劵 +伻 +龠 +鶢 +赔 +諵 +襌 +翿 +盔 +炱 +漦 +嫞 +她 +噂 +啴 +咂 +黕 +骹 +騀 +饟 +鞺 +鞨 +軨 +袨 +蟛 +螾 +脓 +胀 +稛 +瘉 +痰 +玃 +狲 +朐 +摔 +愞 +囤 +喏 +鼛 +麸 +麌 +麀 +铤 +趿 +誵 +袱 +蒪 +缿 +缞 +淂 +沭 +汽 +撅 +揲 +嵜 +岕 +噇 +卨 +卡 +㛹 +頩 +蕈 +艚 +穮 +硝 +焞 +橇 +槠 +晟 +揔 +掴 +彄 +崃 +嚭 +钴 +逌 +蓱 +菇 +茈 +芣 +砮 +瑑 +爞 +淦 +汕 +椓 +梼 +梠 +慱 +嵁 +啑 +剐 +鶤 +鳗 +闑 +蟺 +芧 +缢 +稑 +瞅 +癣 +昳 +嗢 +唆 +鲿 +骃 +跱 +趖 +赙 +蕳 +笊 +碜 +瘟 +淜 +涊 +洺 +泂 +氄 +椭 +庀 +嘎 +呫 +騊 +駼 +駊 +韘 +靃 +雭 +锾 +锓 +酗 +谼 +诐 +衱 +蟂 +蔂 +臀 +绺 +绔 +祃 +硫 +盝 +疡 +哤 +䰄 +鞫 +鞟 +锲 +轑 +踶 +谰 +薆 +纰 +絖 +狒 +毷 +晹 +彧 +嵼 +噆 +傻 +㹀 +㡧 +⑨ +鹕 +鲩 +骫 +觖 +褛 +膨 +籽 +礿 +礓 +疐 +淈 +沣 +檛 +椑 +挏 +憙 +慁 +堮 +嗉 +哜 +剨 +佶 +韐 +軥 +蜧 +萟 +珏 +燀 +濉 +溔 +敉 +搒 +憃 +惷 +哏 +㩳 +鲕 +鉧 +诪 +蓏 +芄 +篿 +禨 +畟 +檿 +搪 +搉 +戬 +憭 +憖 +埔 +囝 +哝 +吽 +剺 +䑳 +㸌 +㬠 +鼬 +鸹 +韧 +鋗 +蛑 +莝 +絁 +碡 +硌 +獞 +熯 +潍 +滀 +栳 +挢 +愓 +廒 +岞 +傃 +𥚃 +黪 +靛 +隹 +閜 +醳 +辒 +袽 +薹 +萹 +芩 +簳 +睬 +璺 +歅 +敔 +摝 +掸 +彍 +弸 +弰 +廱 +㫋 +㪷 +㜷 +鬷 +遌 +躨 +諴 +荂 +簃 +稹 +矬 +睊 +揵 +岔 +囱 +㺄 +⑩ +黳 +黐 +魖 +顑 +鞑 +躗 +趐 +諟 +蛚 +罦 +箄 +禘 +磝 +瞍 +珙 +狌 +牴 +泔 +榉 +掏 +峿 +婳 +妲 +鵙 +骳 +饦 +餈 +飋 +颎 +跰 +跜 +衠 +藷 +翏 +繂 +瞷 +滢 +姁 +偨 +鼢 +鸒 +鳦 +鲀 +髟 +雱 +镯 +軏 +豅 +蝝 +蚺 +藭 +絅 +箘 +磲 +痟 +狉 +牾 +爟 +梱 +桹 +晅 +扢 +憪 +悾 +悻 +屙 +尿 +墠 +唈 +份 +A +鼹 +鳀 +阹 +酃 +诨 +讟 +脽 +耞 +耙 +籼 +睩 +皭 +疯 +珧 +玱 +焯 +炸 +潩 +曶 +揫 +岢 +奰 +嗨 +㟧 +⑾ +麃 +鱄 +骴 +陊 +锽 +謷 +觺 +螉 +虒 +茳 +碟 +皿 +璟 +狶 +昶 +攕 +攓 +憍 +嵰 +堋 +埂 +刿 +] +馅 +餦 +酦 +遫 +褧 +蟨 +蝻 +蛣 +艼 +臡 +碰 +码 +獝 +熻 +熸 +氃 +枮 +杮 +怗 +坨 +嘑 +吵 +剁 +[ +鰋 +髁 +餭 +邬 +逤 +轪 +踹 +裎 +袗 +芴 +绑 +縡 +祆 +砗 +眤 +盭 +澉 +湝 +氋 +殴 +櫋 +檎 
+朒 +挖 +夬 +噣 +噙 +劙 +⑿ +鼘 +駹 +馎 +靫 +霫 +鐻 +醊 +襚 +紽 +簺 +睋 +痞 +疁 +檃 +擭 +扒 +愎 +恇 +崮 +崥 +媌 +凇 +儳 +麨 +髾 +鄏 +蹅 +襻 +袐 +脤 +罱 +焮 +欿 +怂 +奘 +吪 +俣 +『 +齰 +齯 +鸐 +鯹 +髇 +餤 +飶 +铻 +躩 +账 +贉 +觕 +羓 +箊 +禜 +瓘 +琫 +煊 +澴 +汔 +汃 +柷 +枅 +杗 +戳 +庨 +峛 +媆 +埆 +黴 +魆 +韽 +豭 +襼 +褉 +蝛 +蛏 +苢 +碞 +甒 +甋 +潗 +泲 +毾 +榰 +楙 +撋 +抶 +慥 +惎 +囋 +唉 +刜 +僬 +㸐 +鸸 +鷾 +鲗 +镀 +錞 +醹 +鄗 +逴 +跞 +跅 +豋 +谾 +褢 +衋 +蠵 +脞 +罭 +簨 +稣 +璊 +揕 +幖 +崺 +媠 +噜 +唬 +䨲 +㨿 +㔩 +馤 +饙 +霳 +锟 +铳 +铣 +輗 +豌 +覂 +虖 +蒀 +芯 +胹 +翯 +罼 +罙 +紏 +秆 +睅 +眹 +溇 +泜 +楥 +柎 +慅 +嶅 +婶 +咢 +劼 +刽 +儃 +僩 +䝟 +⒀ +鵻 +鳐 +鲞 +靘 +蹜 +豝 +謼 +蘦 +萐 +臕 +胍 +羼 +糒 +粞 +簬 +笫 +瓅 +獦 +漴 +毣 +攽 +撙 +找 +懜 +恿 +嫪 +娠 +嗌 +匉 +䟃 +㾕 +㟙 +⿰ +鷕 +鳙 +隉 +邝 +谵 +谞 +蛜 +艧 +膛 +膀 +脝 +羵 +纼 +綖 +磺 +磹 +矉 +濙 +掂 +挶 +庬 +姞 +囵 +唅 +剀 +傧 +㦬 +㗳 +● +𢀩 +鴥 +鲢 +騞 +钹 +錾 +轖 +觓 +蠮 +蜼 +蓖 +莨 +耑 +罿 +縩 +絣 +箍 +硪 +盩 +瘕 +痀 +烤 +浜 +殣 +楪 +柹 +宧 +媢 +嗳 +嗓 +哔 +佹 +佘 +䟎 +䕸 +䏑 +䍡 +⒁ +馏 +靅 +铍 +郾 +邰 +踱 +诖 +蜇 +莴 +莟 +篽 +箖 +礐 +碱 +砯 +玢 +獯 +獧 +涞 +洇 +櫹 +梆 +懡 +峃 +妯 +唠 +剚 +乩 +䰀 +䢴 +䍽 +○ +◆ +鼟 +黹 +黂 +骪 +镘 +镔 +钰 +鍮 +鈯 +鄘 +郜 +輤 +谹 +詀 +褼 +蝂 +茷 +舔 +臜 +胏 +窼 +窎 +瞵 +瘐 +畈 +琇 +玗 +淊 +毚 +槥 +棙 +柸 +昜 +挻 +憁 +峞 +寱 +囮 +囫 +啍 +哿 +儸 +傔 +偭 +偟 +侲 +佝 +㵝 +㲪 +㩁 +㟪 +⒂ +鳇 +饨 +韠 +阸 +镣 +铰 +钖 +褠 +褋 +衵 +蒳 +茙 +腪 +翃 +簩 +稧 +磳 +皠 +痱 +甤 +玓 +犛 +牿 +棽 +搠 +掫 +嶬 +嶝 +媟 +埵 +唽 +吖 +儦 +傫 +侐 +䠞 +⒃ +麖 +髋 +霱 +醝 +鄚 +诌 +诇 +詅 +蘙 +肰 +繘 +糍 +筀 +磾 +睃 +瑄 +琭 +熉 +漯 +滆 +棓 +柤 +柍 +懞 +廧 +屁 +寠 +媞 +嗃 +咭 +匑 +䬟 +䨥 +䌨 +— +齮 +鷤 +鶠 +鴽 +鮹 +颩 +雈 +逯 +辌 +衕 +蝫 +蚣 +藨 +蓛 +苯 +缳 +笇 +稆 +磶 +瘣 +璪 +熢 +焫 +湢 +浽 +欂 +枧 +嵊 +娖 +姽 +妺 +嘛 +呺 +偫 +俙 +䱉 +䨑 +䦟 +䛕 +䆒 +㦲 +㡠 +㠑 +㠎 +⒅ +⒄ +黈 +鳢 +魌 +髵 +騩 +馞 +馄 +頍 +鞈 +銙 +鉥 +邋 +蹛 +跦 +謏 +諓 +觍 +褂 +蜈 +蚹 +薍 +苁 +缎 +穾 +稡 +禫 +禠 +祻 +硊 +珵 +澬 +潎 +欱 +橧 +橤 +椸 +柜 +杅 +昒 +拲 +嵾 +崹 +娌 +妊 +夘 +堲 +埶 +埳 +嚆 +喌 +啽 +唵 +刌 +偢 +䯢 +䋏 +𧥄 +鳆 +魫 +韣 +鏬 +邽 +邶 +遢 +輖 +軷 +貆 +訑 +覛 +蝜 +菆 +莂 +臑 +膢 +膘 +缌 +繣 +箳 +礨 +砟 +瞹 +熌 +瀄 +澾 +潟 +滘 +淠 +涳 +涢 +浤 +沵 +沇 +氿 +桯 +攞 +掿 +愯 +惉 +忑 +峚 +垝 +儜 +伙 +㟢 +』 +鳟 +鲻 +鱎 +饛 +韛 +鋹 +迒 +贙 +豵 +詾 +裥 +蠉 +虙 +薧 +艿 +脡 +脖 +羱 +磉 +砠 +盹 +狫 +焭 +潬 +滍 +檞 +樏 +棻 +柺 +杴 +攴 +掁 +愡 +忐 +廥 +廔 +帣 +峉 +屉 +尵 +尰 +宬 +嫄 +嘂 +唗 +吧 +厔 +剟 +儱 +䎫 +㬋 +㐲 +鼮 +鲮 +駾 +駜 +韸 +釉 +輷 +輘 +讻 +襹 +蚑 +藞 +蔪 +荍 +芚 +芉 +舋 +耏 +耎 +縿 +篂 +禬 +砐 +畼 +玚 +烺 +濲 +澓 +漎 +淢 +涆 +沏 +汳 +歰 +榧 +榝 +椴 +擉 +揝 +尬 +孪 +娙 +娀 
+夯 +埚 +垌 +圾 +圌 +啿 +咯 +呤 +呃 +勠 +刱 +倌 +䫻 +㡢 +㟅 +⒆ +⑴ +鲒 +鬡 +鬝 +髺 +駓 +颌 +韅 +隺 +镏 +锴 +锬 +鏊 +鎝 +赕 +貘 +謟 +襧 +蚻 +蚳 +紃 +穊 +硾 +矕 +矐 +犠 +煼 +灉 +滈 +浞 +洨 +泙 +泆 +橼 +桭 +栭 +扱 +惣 +恟 +嶆 +岙 +夗 +堍 +唪 +哼 +僎 +侜 +侅 +佁 +䭀 +䦱 +䥥 +䡾 +䖟 +䀨 +𢬵 +黭 +黬 +鰅 +鬇 +髲 +餫 +鞔 +霩 +阌 +链 +辴 +赇 +襋 +蠚 +葽 +臒 +綟 +紾 +眑 +皵 +皃 +瘪 +疍 +猵 +梐 +枑 +掔 +懮 +恌 +忔 +尴 +埧 +噷 +唲 +叭 +刭 +刨 +冓 +伹 +仟 +䳇 +䟤 +䓻 +㻞 +㵿 +㩀 +㙷 +⒇ +鼒 +鲖 +鬔 +驉 +馍 +餋 +霃 +镖 +钑 +酏 +轃 +跒 +豻 +豜 +褾 +螷 +蝓 +蝄 +蔹 +蓻 +荬 +茢 +艛 +艗 +艄 +糙 +籸 +秇 +祋 +礰 +碯 +砒 +睱 +睘 +癙 +瘾 +痎 +疱 +玡 +獶 +猳 +牏 +潝 +浭 +檨 +敡 +捖 +拎 +惵 +彟 +廞 +崣 +岋 +屝 +媊 +婥 +嗡 +啕 +唝 +咡 +吆 +冂 +僸 +仵 +乜 +䲔 +䣛 +㣶 +鳸 +馵 +饐 +饎 +鞸 +雴 +隥 +鐏 +鄯 +跲 +跁 +赗 +褆 +袝 +蝍 +蚂 +藘 +艒 +脧 +篬 +笤 +穄 +稃 +秺 +硐 +砅 +矸 +瘸 +疻 +疤 +瑊 +猓 +熺 +煠 +炅 +瀽 +溁 +淯 +樠 +榥 +楶 +棵 +曭 +旡 +敫 +撶 +摏 +揃 +捥 +嶊 +嫇 +婓 +堨 +坋 +唻 +吨 +叚 +倛 +侹 +㴻 +㬅 +㗲 +隣 +齘 +鼚 +黡 +驖 +馊 +韂 +靼 +鋈 +鄹 +踩 +豩 +讱 +覕 +襡 +裰 +袌 +蚰 +蘘 +蒢 +荓 +芿 +艴 +筊 +秠 +碻 +矌 +痯 +甀 +瓟 +玈 +牱 +熭 +烓 +渻 +渹 +毨 +棷 +栘 +晲 +晱 +旝 +拧 +戣 +懠 +懘 +彡 +帓 +帉 +嶟 +嵿 +岪 +寣 +婞 +墺 +嚵 +嚗 +咧 +吗 +佟 +丼 +䳏 +䰒 +䯼 +䔿 +䌟 +䃔 +㩻 +㕮 +◇ +𧽼 +齴 +鱐 +餂 +閦 +铧 +钊 +醟 +鄫 +逛 +轒 +輣 +踼 +跔 +趟 +豮 +誽 +蝹 +葼 +茴 +芈 +膇 +肜 +耝 +羂 +繶 +繠 +縰 +綪 +紒 +籉 +笪 +穈 +碚 +砵 +睺 +眕 +畷 +璲 +璩 +瑴 +玭 +猇 +犉 +熚 +瀔 +漷 +湣 +渒 +涒 +浿 +汋 +樀 +椔 +梜 +摋 +搣 +掤 +挃 +扤 +扐 +戗 +懭 +悝 +庌 +崁 +壄 +叽 +匽 +冼 +俋 +佌 +佅 +丳 +䳵 +䣊 +䒠 +㿽 +㲚 +㗋 +㑴 +… +𨖍 +鼲 +鼫 +鼖 +鵴 +馝 +鞧 +靊 +錀 +釱 +醧 +酳 +郪 +迾 +躤 +蹝 +趢 +誷 +覸 +蟘 +葸 +葐 +菤 +菠 +莋 +舕 +腒 +翐 +翍 +翂 +絧 +磩 +磓 +碝 +矄 +皕 +瘫 +瘈 +瘆 +瘀 +煆 +煁 +溦 +槵 +楬 +椵 +梩 +栻 +枊 +撧 +搐 +捼 +愲 +忳 +庤 +嶶 +嵣 +嵔 +嵉 +孓 +姌 +奓 +塺 +圊 +囒 +嗹 +啊 +咖 +匋 +兟 +侂 +仨 +丏 +䶉 +䪫 +䪥 +䕬 +䉶 +䄡 +㠠 +鵣 +鴶 +鳒 +鯸 +鮰 +髳 +闶 +钐 +鍭 +鋂 +鉼 +酮 +轊 +蹎 +蹍 +趗 +趔 +赸 +袙 +薢 +薁 +蔯 +荽 +茬 +茌 +舚 +耩 +缏 +箛 +筅 +穧 +祴 +磤 +碳 +砣 +砏 +矅 +瘨 +瘘 +瑵 +玒 +獉 +潕 +渀 +浉 +欐 +橚 +槫 +梴 +枓 +捂 +戁 +愰 +弝 +帤 +嶾 +崴 +岰 +婑 +妷 +塕 +囟 +嗞 +嗋 +啀 +唣 +呰 +呞 +吱 +厈 +凓 +冘 +冔 +僄 +倢 +伣 +亝 +䱐 +䯀 +䫉 +䓤 +䋣 +䁡 +㾓 +㲉 +㭊 +㩧 +㡛 +㡚 +㝱 += +﨣 +鼊 +黫 +鹐 +鵀 +饆 +頠 +鞻 +闟 +鎏 +酅 +郲 +轜 +趸 +諆 +觲 +蠈 +螵 +蛲 +蛔 +虥 +虈 +蘑 +蓷 +蓲 +茩 +苙 +芨 +舑 +臐 +臄 +脢 +胳 +胲 +綄 +簥 +篹 +篗 +窹 +禼 +礜 +碙 +碐 +砸 +盉 +皬 +癠 +猲 +猭 +犨 +爢 +烫 +瀸 +瀴 +溰 +湠 +淌 +檩 +楅 +楄 +桼 +暪 +斶 +擽 +憣 +慬 +愺 +惏 +悷 +怚 +怓 +幩 +婘 +妞 +墼 +塸 +堶 +垕 +坩 +啚 +啙 +匌 +佰 +仉 +䵷 +䳱 +䱥 +䫜 +䛏 +䒀 +䏰 +䏈 +㷊 
+㵟 +㮰 +㟹 +㝅 +㜻 +’ +‘ +𦌉 +鼜 +麧 +鳔 +鱽 +騢 +饠 +飑 +飍 +鞗 +靿 +锚 +鐎 +鄛 +遬 +辁 +躣 +踅 +襳 +襘 +襂 +蟳 +蟟 +蛳 +薸 +菏 +苈 +臇 +膧 +膎 +腤 +肛 +罧 +緵 +緎 +絿 +絭 +粺 +穱 +稞 +秷 +礒 +碥 +瞯 +甝 +瓀 +瑿 +珽 +犦 +燂 +瀎 +漇 +溓 +浲 +泃 +毈 +殑 +橥 +樲 +曣 +昈 +擩 +掯 +挜 +懰 +憹 +恅 +巆 +嵺 +峗 +峕 +孅 +嬽 +喾 +啤 +叻 +勔 +僯 +俅 +伃 +䭔 +䦛 +䙚 +䘵 +䘲 +䑲 +䃜 +㬻 +㨂 +㡘 +㖟 +F +C +𣢾 +Q +齼 +鸀 +鳿 +鬗 +髬 +驒 +駥 +鞁 +隒 +闩 +镼 +鏄 +錽 +邜 +轥 +躘 +跙 +趩 +谝 +诓 +詑 +襺 +褙 +螔 +蝑 +蕸 +蕄 +葶 +莙 +粿 +粇 +籦 +簝 +篻 +竻 +瞤 +睗 +癈 +瘽 +瘰 +痘 +甊 +瓓 +瑭 +珌 +玟 +狵 +狤 +灇 +瀩 +漍 +溵 +浠 +毦 +档 +杶 +朩 +暏 +晀 +摰 +摎 +搞 +揥 +拫 +戻 +戙 +戄 +慉 +怮 +忲 +帲 +帗 +崡 +峘 +屮 +嫈 +墋 +塳 +塉 +垍 +嘕 +啦 +咺 +叼 +儚 +伭 +䯻 +䯱 +䮉 +䬯 +䬀 +䠱 +䃸 +䃱 +䃧 +䀜 +㱿 +㭬 +㠥 +㠡 +㒿 +J +𦨣 +鹝 +鸼 +鷩 +鷘 +鷅 +鴹 +鲳 +鮧 +鬤 +髐 +髂 +駻 +駣 +馲 +馓 +颋 +顤 +頣 +霝 +陫 +镴 +铝 +钘 +鉎 +鈜 +郕 +郈 +轞 +輹 +蹦 +跐 +趪 +貒 +譣 +裍 +蜹 +藑 +藇 +薠 +蕹 +蔤 +菵 +荖 +苃 +艽 +膞 +腄 +胱 +罤 +綅 +籔 +窴 +窫 +秏 +礛 +磢 +眭 +痨 +瓛 +璏 +琁 +珇 +獹 +狆 +牦 +爸 +爉 +熣 +炖 +灙 +瀯 +溏 +湉 +浺 +浛 +洓 +泑 +沺 +氶 +毼 +欙 +欈 +槢 +楢 +椌 +杝 +暤 +晙 +昐 +旼 +旊 +攺 +攃 +抿 +扔 +慞 +愐 +惢 +廜 +帩 +嶞 +嶛 +嵞 +嵛 +嵃 +崽 +尞 +婐 +妗 +埌 +嗻 +嗗 +喎 +啡 +啝 +咶 +厹 +冇 +僿 +傋 +偊 +丿 +䶎 +䲹 +䭭 +䝤 +䔍 +㹞 +㷒 +㶿 +㬈 +㤝 +㢝 +㠾 +㠂 +㟽 +㗀 +𣏌 +𡾰 +鷮 +鶑 +鮿 +魮 +骲 +驈 +騝 +駽 +馢 +餲 +颲 +靬 +雚 +陑 +阰 +铒 +钭 +鑮 +鏺 +鏕 +鍝 +酰 +酟 +酘 +鄳 +郘 +迋 +躎 +跶 +趷 +豖 +謻 +謱 +訹 +觭 +觡 +襓 +蠥 +蚧 +蚏 +蘪 +蕻 +蔇 +菻 +菪 +茐 +苘 +芫 +舽 +胗 +羭 +羬 +羫 +罃 +绲 +緁 +簄 +篖 +窐 +窋 +禗 +祤 +祍 +礥 +碆 +碅 +硿 +硼 +瞖 +瑨 +狓 +熂 +煿 +炘 +瀇 +濎 +湥 +湋 +湁 +泇 +沘 +殗 +歾 +歭 +櫰 +檋 +椐 +棔 +框 +枍 +暅 +昡 +斠 +攦 +擖 +撱 +揸 +揬 +捘 +慺 +慠 +惦 +悕 +廫 +崨 +孴 +嫕 +媱 +媐 +妠 +塯 +堈 +垭 +坲 +坁 +圪 +圛 +嚧 +嘀 +哱 +吺 +厣 +俵 +伈 +䱷 +䬾 +䓿 +䑠 +䏶 +䄄 +䃴 +䃭 +䁀 +䀪 +㾪 +㻬 +㲯 +㬢 +㠁 +㟞 +㘖 +㘅 +㗶 +㔉 +㓦 +㑳 +⿱ +] +𧤏 +𦕈 +𡡉 +𠴲 +鼩 +黓 +鷔 +鶧 +鴷 +鴚 +鳘 +魿 +騟 +駷 +馕 +鞙 +霼 +钃 +錍 +鋬 +鉊 +鈌 +軮 +躺 +踮 +趎 +譩 +謞 +覶 +覢 +襵 +褣 +裻 +裱 +螴 +蜳 +蜦 +蛦 +蚷 +虨 +蘠 +藾 +莶 +艂 +舼 +脘 +聣 +翪 +羙 +緃 +糦 +粣 +粑 +笓 +竑 +窲 +稘 +礉 +磡 +碏 +硔 +砫 +瞱 +眛 +盰 +痢 +疄 +畐 +甈 +璹 +璗 +獌 +狘 +狔 +熁 +熀 +煚 +烔 +瀃 +濏 +濋 +澻 +潓 +滭 +溠 +溞 +涾 +涮 +涫 +浻 +殹 +殈 +欋 +榠 +榖 +楟 +椳 +栫 +栓 +栒 +柧 +柣 +暳 +昝 +昄 +旿 +攰 +攑 +攉 +撬 +搕 +揳 +捓 +抇 +懩 +悹 +恛 +忀 +彽 +彺 +嵤 +嵑 +崅 +岦 +岠 +岍 +媫 +媅 +婧 +墯 +堛 +圁 +囡 +嗺 +啃 +唦 +傱 +傢 +傝 +傕 +俇 +佖 +佋 +䨧 +䢅 +䙫 +䖃 +䑸 +䎕 +䈥 +䃺 +䁔 +㷈 +㱲 +㧙 +㠯 +㘘 +[ +X +D +/ +𪄠 +𨤍 +𦦨 +𥿄 +𥰡 +𤏡 +𣲖 +𣪫 +𣠄 +𢅏 +齇 +騵 +馦 
+饽 +餴 +飁 +顉 +頵 +雵 +隑 +陧 +阢 +锊 +鍱 +鈚 +鄄 +轙 +軖 +蹱 +蹖 +踤 +趚 +豏 +讘 +謑 +諲 +袀 +衭 +蠀 +蟮 +蟜 +蜽 +蜲 +藠 +藚 +蔎 +蔍 +蓨 +葴 +葍 +萯 +萩 +莤 +艕 +腢 +脕 +脁 +胈 +肭 +翨 +罯 +綎 +紑 +糳 +篰 +筕 +竫 +窡 +窙 +穋 +秅 +祾 +礩 +硱 +硩 +矆 +皀 +痤 +痣 +疙 +疔 +瓐 +璱 +璙 +璖 +琈 +玘 +獢 +爣 +熿 +濭 +潣 +潒 +漨 +湙 +洭 +泵 +汸 +殰 +歍 +槴 +槯 +榣 +椄 +棡 +梿 +梡 +栮 +柲 +朌 +暸 +暯 +暆 +昪 +撸 +撨 +撠 +摚 +捾 +捅 +抷 +扙 +懪 +憕 +慗 +悊 +怇 +忞 +徔 +弴 +庱 +幧 +崂 +峓 +峐 +岒 +寁 +孵 +孲 +嫛 +嫒 +媖 +婹 +娽 +姎 +墆 +垾 +垃 +圚 +嚾 +嗿 +嗝 +唢 +咜 +咕 +呥 +厸 +厗 +剳 +刔 +儴 +偁 +乒 +䱹 +䨓 +䧢 +䘯 +䓇 +䐶 +䎘 +䌰 +䊵 +䉡 +䈚 +䇲 +䇄 +䆞 +䄖 +䁥 +䀌 +㽛 +㸦 +㷔 +㰤 +㰕 +㰂 +㥏 +㠗 +㠉 +㜮 +㜎 +㙠 +㗵 +㗭 +㔶 +㔣 +㒩 +㑺 +⽟ +𧾨 +𧇠 +𦜕 +𦒘 +𦋏 +𦂳 +𥻘 +𥓂 +𥈭 +𤾂 +𣰕 +𣯀 +𢥞 +𠿕 +𠴨 +兀 + + +鼽 +黵 +鵱 +鴠 +魗 +鬳 +鬕 +鬊 +駚 +馧 +韊 +鞢 +霵 +霋 +镦 +铪 +铋 +錤 +鋘 +鉟 +鉔 +鈲 +鈋 +釽 +醷 +醠 +酵 +鄀 +郙 +邡 +輚 +輑 +踥 +跠 +趻 +豾 +豽 +豇 +讔 +讆 +譓 +譀 +謰 +謈 +詨 +訉 +襛 +袥 +袟 +袎 +衁 +蠠 +蜐 +蚾 +蚵 +蚛 +藖 +薷 +薕 +蕟 +蔊 +蓪 +蓩 +蒄 +菫 +荝 +舤 +膍 +腱 +脬 +肷 +聛 +耛 +罞 +繵 +繊 +紩 +紟 +糸 +穇 +稯 +祡 +磗 +硭 +硞 +矲 +瞾 +瞟 +睻 +睕 +眻 +癹 +瘯 +疰 +畻 +甾 +瑺 +瑢 +琎 +珅 +猡 +猄 +狋 +犿 +爊 +焌 +烻 +潈 +滵 +滱 +溋 +渃 +淲 +浰 +汘 +氇 +氁 +毌 +毊 +殥 +欇 +榅 +梫 +梀 +桋 +曥 +曤 +旞 +攥 +摲 +揱 +掼 +挕 +抏 +扦 +懥 +慜 +愗 +悆 +怐 +徛 +廙 +廇 +幝 +巙 +巏 +嵖 +峱 +岉 +尌 +宎 +嫽 +婗 +姼 +奯 +塽 +塪 +埯 +嚃 +嘡 +啯 +啥 +唓 +哵 +哫 +哧 +哒 +叿 +厾 +厞 +卝 +劢 +剶 +剭 +僲 +僔 +僒 +倠 +佬 +伢 +乓 +䴠 +䳒 +䰂 +䯰 +䬞 +䬝 +䪺 +䨸 +䤹 +䤴 +䣃 +䡮 +䟏 +䜴 +䘭 +䔲 +䓞 +䓁 +䑃 +䎒 +䍙 +䌌 +䊺 +㿱 +㽦 +㺑 +㹶 +㶏 +㵐 +㴸 +㮚 +㬥 +㬒 +㫰 +㪣 +㥾 +㡓 +㠔 +㟼 +㞦 +㜲 +㜪 +㖧 +㖃 +· +e +𩞾 +𩑺 +𩈣 +𩆷 +𨵽 +𨏥 +𧟌 +𦶜 +𥱧 +𢧵 +𢥠 +𠌯 +裏 + +龏 +齽 +齹 +黰 +黣 +黖 +黋 +黅 +麷 +麳 +麡 +鹱 +鸓 +鷢 +鷡 +鷉 +鵽 +鵧 +鴸 +鳛 +鯼 +鯆 +鮬 +鬿 +骣 +馩 +饺 +饀 +飉 +颹 +頄 +韄 +鞜 +鞃 +靽 +陾 +闫 +閛 +閕 +閍 +镺 +镉 +铡 +铖 +鐀 +鍜 +鋻 +鋚 +銶 +釴 +醓 +酶 +逿 +迣 +輁 +軱 +軓 +踷 +跇 +趹 +趡 +譞 +譝 +觼 +覤 +襒 +褡 +裣 +袊 +蟥 +蟙 +蟑 +螇 +蝾 +蜫 +蛖 +蚄 +虸 +虷 +虋 +虇 +蘹 +藙 +薱 +薐 +蒦 +葌 +萛 +菙 +荨 +荙 +荗 +苨 +苐 +艬 +舺 +臁 +膮 +膟 +腩 +肣 +聻 +耴 +耧 +耖 +羕 +缹 +繸 +縓 +緟 +緅 +絟 +籫 +籑 +簶 +簙 +篵 +篢 +篟 +篅 +箶 +筸 +笢 +稨 +祑 +磌 +磄 +矘 +矖 +瞡 +睯 +睑 +眒 +盳 +癗 +癌 +疸 +疳 +疧 +疖 +珫 +玿 +玥 +獚 +獑 +狿 +犏 +犎 +爧 +爁 +烰 +烩 +瀤 +潲 +潫 +潖 +溴 +湴 +湒 +渱 +涚 +泎 +沷 +氆 +毐 +歏 +欀 +檑 +檈 +橶 +樻 +樔 +槺 +槷 +槦 +榹 +榸 +椇 +梣 +桱 +朡 +朄 +暼 +暡 +昤 +攐 +擸 +揞 +懹 +懤 +憢 +憛 +慹 +慸 +愶 +惾 +惼 +怞 +徲 +弣 +廆 +庉 +嶱 +嶉 +嶀 +嵥 +崠 +峮 +峎 +峁 +岯 +岮 +嬥 +媰 +婜 +娗 +姡 +姏 +夌 +墣 +堌 
+埪 +埢 +垸 +囲 +噽 +嘌 +嘁 +喵 +喢 +哻 +咩 +咉 +呬 +呙 +厶 +厏 +厊 +卲 +勮 +剕 +凔 +凅 +僪 +僤 +傪 +偠 +倳 +侻 +侉 +丌 +䵼 +䱜 +䰞 +䮾 +䬘 +䫲 +䫈 +䨟 +䢼 +䢩 +䢇 +䞴 +䝇 +䛡 +䛁 +䘿 +䕩 +䕡 +䓑 +䑰 +䐑 +䏿 +䎖 +䌄 +䈡 +䃂 +䃁 +䂺 +䁾 +䁱 +䀻 +㼾 +㻫 +㺦 +㹱 +㹪 +㵾 +㳯 +㳁 +㱥 +㰚 +㬪 +㩗 +㨾 +㨨 +㧖 +㠍 +㟋 +㞐 +㝗 +㜞 +㚟 +㚇 +㙞 +㘰 +㘆 +㗫 +〉 +〈 +⿸ +⑵ +Q +? +- +𬗟 +𬋩 +𬈑 +𪐴 +𩍐 +𧙪 +𥬞 +𥔀 +𤲬 +𤱔 +𣽅 +𣯶 +𣬈 +𣙜 +𣂏 +𢤱 +𢈪 +𡙇 + +龇 +齸 +齤 +齞 +齝 +齍 +齆 +齃 +鼱 +麔 +鸗 +鶀 +鳻 +鱊 +鮾 +魶 +鬺 +鬞 +驙 +馯 +馚 +餪 +颴 +颬 +顟 +鞣 +靀 +霻 +雿 +隓 +阞 +镤 +锔 +锏 +铔 +铐 +钣 +钚 +鑴 +鏴 +鍖 +鍑 +鍉 +錜 +鋜 +鋉 +銗 +銊 +鈩 +釿 +釰 +醙 +鄦 +邞 +邘 +轠 +轈 +軜 +蹼 +蹃 +踾 +跺 +跆 +趭 +趬 +貁 +譳 +謢 +誻 +訞 +觟 +覗 +襱 +襊 +襈 +褔 +裶 +袺 +袓 +蠘 +蟧 +螖 +螊 +蜰 +蜪 +蜌 +蛘 +蚼 +虉 +蘡 +藫 +薳 +蕬 +蒩 +蒡 +葪 +葧 +葈 +葇 +菨 +菣 +莑 +莃 +荰 +荁 +茖 +茒 +苮 +苪 +芪 +艡 +艐 +腽 +腲 +腝 +胴 +胐 +肦 +聍 +翷 +翲 +羒 +羍 +罬 +繀 +縼 +絻 +糨 +簕 +笲 +笐 +窢 +穬 +稐 +秥 +禭 +礤 +礅 +磏 +碶 +碫 +碖 +瞲 +睴 +睌 +眝 +盯 +癊 +瘙 +痫 +痝 +疬 +甩 +甂 +瓽 +瓬 +珶 +珢 +珝 +獥 +狧 +犼 +犺 +犝 +牉 +熘 +烊 +灢 +瀖 +濧 +澰 +澞 +澝 +湭 +湡 +湆 +渧 +涽 +浘 +汵 +氧 +毻 +殜 +殙 +歫 +欓 +櫊 +檕 +樚 +樆 +楱 +楰 +棜 +棆 +梮 +梬 +梌 +桬 +柠 +曃 +昀 +攎 +擀 +摞 +搳 +搋 +揢 +揗 +揌 +掮 +抻 +抎 +扂 +戵 +戭 +懬 +懙 +憋 +愣 +惈 +惂 +悐 +悇 +恲 +恑 +怢 +弶 +庪 +幭 +幍 +帴 +巊 +嶍 +嵱 +嵍 +崌 +崀 +峬 +岽 +岬 +孊 +嫲 +婼 +姘 +姈 +墱 +塓 +堫 +埻 +埕 +埇 +垑 +囐 +噔 +嘟 +嘄 +嗼 +嗲 +嗍 +唑 +哶 +哌 +咪 +呾 +吡 +厍 +劋 +剬 +刅 +凚 +儭 +僜 +僙 +僁 +偯 +俴 +俓 +䶂 +䵃 +䴹 +䳟 +䲺 +䰠 +䰉 +䭿 +䬐 +䬂 +䫿 +䫸 +䪜 +䪌 +䩬 +䩕 +䨜 +䨚 +䦘 +䦖 +䥶 +䤖 +䤋 +䢱 +䡨 +䟫 +䟆 +䞭 +䞟 +䜪 +䚧 +䚟 +䙬 +䘨 +䗖 +䔫 +䔄 +䒌 +䑵 +䑔 +䏖 +䎉 +䍚 +䋎 +䊸 +䆸 +䆱 +䅘 +䄠 +䄜 +䃘 +䃉 +䂀 +䁹 +䀽 +䀹 +㿻 +㿉 +㽜 +㼜 +㼚 +㼌 +㹳 +㶼 +㶷 +㵫 +㳷 +㳚 +㳐 +㳅 +㲹 +㲄 +㫚 +㩉 +㨖 +㥮 +㤞 +㣻 +㠌 +㟿 +㟴 +㟃 +㙻 +㙡 +㗻 +㗚 +㗇 +㕧 +㔢 +㒽 +㒹 +〇 +s +𩭝 +𩛥 +𩘟 +𨵵 +𨴻 +𨥛 +𨚗 +𨔁 +𨁏 +𧹬 +𧴆 +𧞔 +𧉮 +𦨻 +𦨴 +𦍑 +𦌊 +𦉥 +𥴦 +𥱰 +𥪡 +𥥆 +𥤮 +𤠔 +𤇺 +𣰉 +𣦠 +𣚃 +𢼮 +𡾐 +𡼭 +𡹮 +𡰥 +𡧯 +𡡓 +𡒦 +𡑞 +𡏝 +𡄸 +𠿨 +𠷠 +𠧧 +𠜱 +𠇗 +𠆩 + +齵 +齱 +齂 +鼵 +鼏 +鼁 +黺 +麠 +麜 +鸋 +鸆 +鸁 +鷷 +鷠 +鷎 +鶞 +鵳 +鵛 +鵗 +鵌 +鵋 +鳱 +鳡 +鳕 +鲡 +鱿 +鱲 +鰽 +鰦 +鰞 +鯓 +髸 +骟 +骒 +驓 +騉 +駩 +駖 +馜 +餰 +餩 +餥 +颵 +颥 +颞 +顩 +顈 +頧 +頔 +韹 +鞕 +鞒 +霦 +霕 +陮 +阺 +闛 +镭 +锨 +铴 +铊 +钛 +钌 +钋 +鑯 +鐄 +鎈 +鍦 +錧 +鋞 +鋊 +銎 +鉠 +釬 +釪 +醦 +鄩 +鄡 +郚 +邲 +邭 +遆 +迬 +迀 +輮 +輂 +軵 +軧 +躝 +躆 +踒 +跮 +跤 +跢 +跘 +趰 +赨 +赥 +赟 +貜 +豰 +豃 +诶 +讄 +譒 +謥 +謣 +諈 +誃 +誂 +觻 +觰 +觛 +覭 +襽 +襗 +褅 +袾 +衸 +衯 +衃 +蠼 +蠝 +蠊 +蟓 +蟃 +蝪 +蝢 +蝏 +蜶 +蜬 +蜁 +蛷 +蛓 +蚭 +蚔 +虭 
+虪 +虦 +虤 +蘮 +蘛 +薥 +蕼 +蕶 +蕯 +蕗 +蓹 +蓫 +蓢 +蒶 +蒥 +蒗 +葰 +萭 +萡 +菥 +莚 +荿 +茿 +芲 +芁 +艻 +艞 +舥 +舢 +臶 +腶 +腨 +腧 +腍 +腇 +脀 +胰 +胣 +胊 +肨 +聸 +聎 +耵 +翴 +羉 +罺 +罜 +繴 +繲 +繟 +縺 +緰 +緀 +絯 +糿 +粰 +簎 +篣 +篍 +篊 +箹 +箅 +筃 +筁 +笰 +竲 +竘 +竉 +穚 +稴 +稰 +稙 +稄 +秼 +秝 +禯 +禢 +禓 +礏 +磞 +碽 +碬 +碊 +碀 +硡 +硚 +砳 +砊 +矎 +瞫 +瞄 +瞂 +瞁 +睮 +睖 +睄 +眧 +眜 +眅 +皯 +瘚 +瘒 +痉 +疭 +疝 +畾 +甹 +瓝 +瓆 +瓃 +瑮 +瑦 +瑈 +琄 +珲 +珐 +玾 +玊 +獴 +獳 +獖 +猍 +狾 +狳 +狚 +狍 +犮 +犪 +犘 +犓 +牚 +爃 +燝 +燏 +熝 +煟 +煓 +煏 +焊 +焂 +烞 +炟 +灴 +灒 +灈 +瀊 +瀀 +濿 +濄 +澫 +澚 +澖 +潻 +漅 +溛 +湳 +湗 +湅 +渿 +淍 +涹 +涶 +涠 +泦 +泏 +沰 +毽 +毢 +殾 +歒 +欨 +欦 +櫾 +櫖 +檽 +檷 +檬 +橿 +橎 +橁 +槬 +榽 +楘 +楇 +椈 +椆 +棸 +桲 +桊 +柦 +柖 +柀 +朾 +曫 +暚 +暋 +暀 +晭 +晊 +昲 +旇 +斔 +斄 +敱 +敜 +撽 +撗 +撍 +摫 +摍 +搎 +揿 +掭 +捰 +捬 +挋 +抳 +抪 +抍 +扡 +戠 +懁 +憟 +憞 +憏 +慛 +慒 +恔 +怭 +忨 +彋 +弤 +弡 +弅 +廦 +廍 +廅 +庼 +庲 +庮 +庢 +幵 +嶵 +嶜 +嶈 +嵻 +嵵 +嵦 +崶 +屖 +寴 +寙 +寎 +宨 +孖 +嬮 +嫟 +嫘 +媷 +媗 +媔 +媀 +婠 +婍 +娸 +娋 +姅 +妘 +妎 +奝 +奅 +墈 +堭 +埲 +埦 +埝 +埈 +埄 +垮 +坜 +嚫 +嚑 +嚊 +噈 +嘼 +嘇 +嗦 +嗐 +嗂 +喿 +喍 +啷 +啛 +啒 +啋 +咣 +咔 +咁 +呛 +呔 +呑 +呇 +卟 +匰 +勏 +劶 +刋 +刂 +冫 +儥 +儢 +僷 +傸 +傣 +傏 +傂 +偞 +偄 +倰 +俿 +俈 +俀 +佤 +佂 +仂 +乇 +丨 +丂 +䵠 +䲧 +䱎 +䰱 +䰩 +䯂 +䭲 +䭡 +䭓 +䬅 +䤨 +䤃 +䣻 +䢔 +䢆 +䡅 +䠑 +䠊 +䝿 +䜺 +䜈 +䚢 +䙏 +䘕 +䘃 +䗬 +䗂 +䖴 +䕢 +䕠 +䔌 +䔇 +䑩 +䐹 +䏂 +䎧 +䎙 +䍺 +䍠 +䌥 +䋫 +䋪 +䊗 +䊀 +䉫 +䉋 +䈉 +䈂 +䇢 +䆷 +䆮 +䅌 +䄺 +䃹 +䃬 +䃤 +䃕 +䂮 +䂭 +䂬 +䂌 +䀮 +䀣 +䀡 +䀆 +㽘 +㽎 +㺜 +㷸 +㷇 +㴉 +㳬 +㳇 +㲞 +㲈 +㲀 +㰦 +㯺 +㮶 +㮮 +㮨 +㭿 +㩥 +㩇 +㨏 +㧎 +㥚 +㥄 +㡩 +㠢 +㞚 +㞂 +㝿 +㛐 +㚿 +㘩 +㘞 +㘈 +㗱 +㖶 +㖔 +㖀 +㓤 +㈠ +】 +【 +⿳ +⺡ +◍ +u +o +i +h +a +𪠘 +𪘶 +𪘏 +𪕋 +𪔱 +𪑮 +𪍣 +𪅂 +𩸥 +𩮰 +𩩉 +𩨳 +𩦺 +𩦙 +𩤳 +𩣡 +𩡧 +𩞟 +𩘅 +𩗗 +𩓥 +𩓐 +𩏌 +𩆵 +𩆲 +𩆌 +𩅰 +𩃱 +𩂣 +𨳺 +𨫼 +𨚫 +𨏢 +𨊾 +𨇰 +𨇯 +𨇤 +𧿇 +𧽾 +𧻕 +𧸛 +𧶊 +𧱚 +𧮳 +𧠨 +𧗟 +𧕏 +𧒒 +𧑐 +𧑀 +𧐖 +𧍓 +𧌒 +𦿆 +𦫃 +𦧈 +𦣔 +𦠄 +𦈢 +𥷑 +𥲤 +𥯨 +𥨩 +𥥾 +𥝈 +𥜿 +𥜥 +𥜗 +𥚑 +𥗝 +𥖵 +𥅆 +𥀼 +𤻲 +𤱶 +𤯍 +𤦏 +𤟧 +𤝞 +𤙡 +𤒞 +𤏖 +𤂱 +𣰦 +𣬐 +𣣲 +𣙷 +𣗖 +𣋵 +𢼸 +𢱧 +𢯆 +𢧴 +𢠳 +𢛯 +𢕂 +𢆯 +𢁘 +𡿖 +𡾋 +𡺸 +𡺢 +𡸅 +𡷐 +𡴘 +𡱝 +𡠜 +𡗝 +𡒰 +𡒄 +𡏖 +𡎚 +𡎊 +𡊺 +𡄖 +𠽁 +𠻳 +𠺕 +𠴈 +𠫗 +𠫍 +𠪾 +𠧠 +𠟡 +𠏉 +D + + + + + + + + + + +龥 +龑 +齳 +齈 +鼨 +黊 +黀 +麮 +鹲 +鹮 +鹨 +鹛 +鸙 +鷣 +鷐 +鷏 +鶮 +鵺 +鵵 +鵚 +鵎 +鴡 +鴊 +鴄 +鳼 +鳺 +鳠 +鳑 +鳁 +鲼 +鲌 +鲅 +鱴 +鱛 +鱁 +鰆 +鯬 +鯩 +鯦 +鯄 +鮨 +鮠 +鮛 +鮇 +魱 +魪 +鬾 +骱 +驐 +騼 +騱 +騛 +駍 +饳 +饡 +颰 +颫 +顠 +顐 +韰 +鞇 +鞄 +靋 +霗 +霐 +雔 +隇 +隁 +陭 +阫 +闧 +闆 +閪 +镩 +锳 +锫 +锞 +锗 +锖 +锑 +锃 +铞 +铌 +铂 +钾 +鑺 +鑫 +鑋 
+鐖 +鐉 +鐇 +鏙 +鎯 +鍐 +錝 +鋾 +鋐 +鋀 +釳 +醚 +鄤 +鄋 +鄈 +鄇 +鄅 +鄃 +郋 +遪 +辥 +轱 +輲 +輫 +輨 +輡 +軯 +軞 +軘 +軎 +躽 +躟 +躜 +蹳 +蹗 +踭 +踃 +跿 +跓 +跉 +跈 +趜 +貾 +貕 +貇 +豂 +諰 +諔 +諎 +詷 +詝 +訔 +觷 +觢 +觠 +觗 +覵 +覞 +褟 +褑 +裬 +裪 +裞 +裉 +袲 +袮 +袩 +袑 +衤 +蠰 +蠪 +蠜 +蟷 +蟵 +蟞 +蟖 +蟕 +螶 +螰 +螨 +螝 +螒 +螈 +螆 +蝺 +蝁 +蜭 +蛼 +蛥 +蛡 +蛐 +蛈 +蛃 +蛁 +蚗 +蚆 +虼 +蘥 +蘁 +藦 +藀 +薼 +蕫 +蕧 +蕠 +蔩 +蔜 +蓶 +蓥 +蓌 +蓇 +蓃 +蒽 +葖 +葏 +萓 +萒 +菝 +菋 +菈 +莦 +荺 +荸 +荌 +茾 +茼 +茺 +茪 +茥 +茞 +苷 +苤 +芤 +芓 +芐 +芅 +舮 +臤 +膷 +膱 +膡 +腜 +腂 +脭 +脟 +肽 +肂 +聑 +聈 +耭 +翓 +罠 +罀 +缂 +繗 +縎 +縇 +緳 +緆 +綝 +絩 +紸 +糩 +糔 +糌 +籚 +籗 +籕 +籎 +篺 +篞 +篎 +箷 +箤 +筷 +筬 +筣 +筡 +笿 +笡 +笘 +竱 +穥 +稢 +秞 +禈 +祩 +祌 +礞 +礕 +礊 +磫 +磪 +碲 +硧 +硣 +硓 +砡 +砝 +砎 +矺 +矊 +矃 +睼 +睷 +睲 +眳 +眲 +眓 +眐 +眏 +盿 +盓 +皾 +皽 +皻 +皣 +癓 +瘭 +痷 +痳 +痬 +痦 +疕 +疒 +畑 +甶 +甭 +甐 +璬 +璑 +璂 +瑸 +瑫 +瑖 +瑎 +瑆 +琕 +琑 +珼 +珸 +玹 +玤 +狪 +狦 +狑 +狊 +狇 +牼 +爮 +爖 +爌 +燊 +熤 +熜 +熐 +煹 +煔 +焬 +焥 +焑 +焇 +焆 +焃 +烿 +烚 +炔 +炆 +灪 +灡 +灅 +瀿 +瀷 +濥 +濖 +濌 +澅 +潚 +漐 +滠 +滐 +溻 +溹 +溤 +溎 +湸 +湤 +渨 +淴 +涐 +涀 +浕 +洦 +洣 +洐 +泶 +泧 +泒 +沶 +汿 +汆 +氻 +氯 +毭 +毇 +殟 +殕 +殅 +歂 +欶 +欪 +檡 +檖 +橝 +樴 +樢 +樘 +樃 +槻 +榫 +榡 +榐 +榍 +楺 +楀 +椫 +椃 +椂 +棳 +棦 +棑 +桪 +桖 +栺 +栵 +栯 +柶 +柪 +柆 +枱 +杬 +杫 +杘 +朻 +朷 +朲 +朊 +朁 +暻 +暕 +晵 +昢 +旵 +旲 +斞 +斀 +敹 +敮 +攮 +攠 +攗 +攈 +撵 +摖 +摁 +搷 +搩 +搄 +揰 +揍 +掰 +掕 +掍 +捔 +捒 +挴 +挮 +挭 +挎 +挆 +拹 +抾 +抰 +抧 +抌 +扷 +扴 +戫 +戨 +懫 +懀 +憽 +憵 +慦 +愮 +惀 +悧 +悜 +悘 +悗 +恞 +恖 +恈 +怴 +怑 +忴 +忬 +忓 +忂 +徾 +徱 +彣 +弜 +廲 +廯 +廮 +廗 +幏 +巤 +巁 +嵹 +嵪 +嵢 +嵡 +崳 +崲 +崞 +崜 +崓 +崊 +峺 +峹 +峸 +峳 +峧 +峇 +岾 +岻 +岟 +岓 +屶 +屘 +尯 +尡 +宭 +孬 +孋 +嫳 +嫥 +媦 +媥 +媓 +媂 +婇 +娏 +妭 +妚 +奷 +奵 +奊 +夲 +夋 +壛 +墥 +墤 +墡 +塴 +塭 +塬 +堳 +堉 +埁 +垽 +垷 +垥 +垐 +坉 +圿 +圢 +囃 +嚤 +嚓 +嚋 +嚂 +嚁 +噳 +噗 +噒 +噋 +嘳 +嗀 +喛 +喗 +啵 +啨 +啎 +啌 +啉 +唴 +唌 +唊 +哅 +咠 +咅 +呹 +呲 +呯 +吅 +叁 +厷 +匚 +勭 +剾 +刵 +凼 +凁 +冿 +冮 +儖 +儑 +僼 +僠 +僓 +傽 +傤 +傈 +偩 +偡 +偑 +倴 +倓 +佸 +佒 +伝 +仼 +仹 +仢 +仐 +亻 +亠 +䶀 +䵻 +䵝 +䴮 +䳄 +䱶 +䱬 +䰓 +䯚 +䯉 +䮺 +䮧 +䮦 +䮐 +䭃 +䬼 +䬴 +䬔 +䬏 +䬍 +䫴 +䫌 +䪻 +䪮 +䪧 +䪗 +䪒 +䩮 +䩞 +䩌 +䩉 +䨹 +䧹 +䧙 +䧅 +䦳 +䦪 +䤥 +䤙 +䤑 +䤉 +䢮 +䡴 +䡬 +䡝 +䡘 +䠧 +䠙 +䠄 +䟶 +䟴 +䟡 +䟓 +䟐 +䟌 +䞋 +䝼 +䝁 +䜱 +䜫 +䛠 +䚦 +䚥 +䙾 +䙡 +䙕 +䙔 +䘳 +䘧 +䘢 +䘠 +䗲 +䗱 +䗤 +䗊 +䗈 +䗁 +䖹 +䖳 +䖜 +䖘 +䖅 +䕻 +䕺 +䕭 +䔺 +䔵 +䔟 +䔖 +䔒 +䓪 +䓘 +䒼 +䒶 +䒩 +䒖 +䒐 +䒏 +䑧 +䑜 +䑙 +䑘 +䐯 +䐈 +䏺 +䎃 +䎀 +䍲 +䍢 +䌷 +䌜 +䌖 +䋿 +䋭 +䋈 +䊆 +䉲 +䉞 +䉛 +䉂 +䉀 +䈴 +䈳 +䈙 +䈎 +䇶 
+䆾 +䆵 +䆡 +䆟 +䆛 +䆘 +䆕 +䆅 +䅽 +䅳 +䃮 +䃫 +䃚 +䃙 +䃈 +䃃 +䂪 +䂘 +䂎 +䂊 +䁸 +䁳 +䁭 +䁪 +䁨 +䁆 +䀶 +䀭 +䀢 +䀟 +䀝 +䀛 +㿥 +㾫 +㾦 +㾛 +㾀 +㽿 +㽲 +㽭 +㽨 +㽉 +㼿 +㼟 +㻣 +㻡 +㻒 +㸙 +㷳 +㶕 +㶒 +㵵 +㵧 +㵘 +㴬 +㴐 +㴎 +㲨 +㲥 +㱡 +㱌 +㱂 +㰹 +㯕 +㯃 +㮇 +㭾 +㭰 +㭁 +㬎 +㬌 +㬇 +㫽 +㫲 +㫊 +㫃 +㩼 +㩖 +㨄 +㧿 +㧻 +㧓 +㧐 +㧁 +㧀 +㤯 +㤏 +㢩 +㡿 +㡜 +㡉 +㠟 +㟳 +㟯 +㟮 +㟍 +㟆 +㞏 +㝾 +㝩 +㝔 +㜶 +㛁 +㚶 +㙳 +㙨 +㘽 +㗩 +㗦 +㗅 +㖷 +㖒 +㕸 +㕲 +㕭 +㕉 +㔞 +㔅 +㓢 +㓊 +㒞 +㒔 +㑻 +㑛 +㑊 +㐆 +ヰ +ラ +〖 +々 +⿷ +⺲ +⺘ +⺖ +∴ +t +m +c +H +* +' +𰔔 +虩 +兔 +𭠥 +𭖆 +𫾣 +𫚥 +𫙮 +𫇛 +𫀆 +𪚰 +𪚑 +𪚃 +𪙆 +𪘲 +𪗾 +𪗰 +𪖨 +𪖣 +𪒬 +𪒒 +𪒄 +𪑜 +𪏶 +𪍴 +𪊤 +𪄲 +𪄟 +𪃸 +𪁽 +𪀘 +𩿪 +𩾽 +𩺂 +𩺀 +𩸊 +𩷪 +𩷑 +𩰾 +𩭪 +𩭩 +𩭤 +𩬹 +𩬛 +𩬈 +𩫖 +𩪘 +𩪍 +𩪉 +𩩝 +𩨨 +𩥉 +𩥇 +𩥄 +𩤸 +𩢿 +𩢰 +𩡩 +𩡝 +𩡕 +𩞄 +𩝨 +𩜾 +𩛞 +𩛆 +𩘎 +𩔖 +𩒹 +𩐿 +𩐎 +𩐁 +𩏢 +𩎠 +𩎖 +𩍜 +𩍈 +𩍂 +𩌧 +𩊓 +𩉴 +𩆩 +𩅀 +𩄡 +𩁻 +𨼊 +𨻄 +𨺱 +𨹷 +𨶶 +𨵦 +𨵙 +𨴹 +𨴢 +𨳲 +𨰜 +𨭽 +𨬟 +𨫂 +𨧱 +𨧨 +𨥚 +𨤏 +𨣧 +𨢆 +𨠯 +𨛬 +𨚕 +𨙸 +𨘧 +𨕣 +𨔝 +𨑦 +𨑖 +𨏼 +𨎎 +𨍋 +𨍈 +𨋰 +𨋖 +𨋎 +𨊵 +𨊩 +𨈚 +𨆪 +𨃪 +𨃚 +𨁝 +𨁊 +𨀉 +𧿧 +𧿁 +𧾧 +𧾜 +𧻴 +𧻨 +𧹢 +𧹛 +𧵍 +𧴍 +𧲛 +𧲂 +𧱲 +𧱉 +𧰚 +𧯑 +𧮭 +𧮈 +𧭈 +𧫛 +𧧂 +𧦬 +𧣈 +𧡪 +𧟄 +𧞣 +𧞋 +𧝬 +𧝅 +𧝀 +𧜳 +𧛾 +𧚮 +𧙏 +𧙍 +𧘢 +𧗳 +𧕅 +𧔥 +𧓈 +𧒭 +𧒏 +𧑤 +𧍣 +𧌳 +𧌇 +𧋢 +𧊸 +𧉬 +𧉧 +𧆑 +𧅱 +𧄤 +𧀱 +𦾶 +𦾴 +𦺅 +𦷁 +𦴻 +𦳝 +𦱧 +𦱌 +𦪞 +𦪇 +𦩷 +𦨰 +𦨚 +𦧟 +𦤞 +𦣱 +𦞦 +𦝫 +𦚎 +𦘹 +𦔽 +𦐂 +𦍩 +𦌠 +𦋺 +𦊰 +𦊓 +𦊀 +𦉊 +𦆻 +𦄢 +𦃪 +𦂱 +𦂄 +𦁤 +𦀟 +𥿑 +𥻿 +𥺢 +𥺚 +𥷤 +𥷟 +𥵦 +𥳧 +𥳑 +𥳊 +𥲑 +𥱼 +𥱌 +𥰕 +𥯛 +𥮕 +𥩳 +𥨫 +𥨍 +𥥍 +𥥌 +𥥈 +𥣧 +𥡥 +𥡙 +𥟇 +𥝧 +𥜧 +𥜠 +𥙿 +𥘤 +𥖝 +𥕘 +𥕕 +𥑻 +𥑮 +𥑤 +𥐭 +𥐞 +𥏳 +𥏫 +𥏪 +𥌣 +𥊑 +𥈎 +𥄱 +𥄉 +𥁃 +𥀶 +𤿑 +𤾭 +𤼟 +𤻎 +𤺄 +𤸷 +𤸃 +𤵜 +𤲪 +𤱕 +𤯝 +𤮊 +𤭢 +𤬯 +𤪌 +𤦎 +𤥐 +𤢜 +𤢒 +𤠞 +𤟭 +𤞑 +𤙶 +𤘅 +𤔲 +𤐰 +𤌏 +𤈍 +𤆘 +𤆏 +𤅵 +𤅊 +𤄬 +𤂷 +𤂑 +𤁄 +𣿙 +𣷞 +𣶂 +𣵺 +𣱄 +𣱁 +𣰥 +𣰆 +𣯻 +𣯏 +𣯍 +𣯌 +𣯈 +𣪁 +𣨼 +𣨎 +𣧗 +𣤛 +𣟴 +𣟄 +𣞶 +𣞉 +𣝒 +𣙢 +𣘻 +𣗪 +𣕕 +𣓌 +𣓉 +𣐙 +𣎗 +𣍐 +𣋉 +𣊓 +𣇕 +𣆀 +𣅿 +𣃽 +𣃵 +𣂰 +𣀮 +𢿱 +𢷾 +𢶏 +𢶉 +𢳣 +𢳀 +𢰦 +𢬸 +𢪛 +𢧿 +𢦟 +𢡚 +𢝌 +𢜮 +𢛅 +𢘿 +𢘩 +𢘍 +𢗳 +𢗲 +𢗒 +𢎥 +𢌞 +𢋅 +𢊽 +𢊖 +𢊆 +𢉤 +𢉜 +𢇦 +𢇍 +𢅰 +𢄘 +𢃐 +𢁥 +𡾹 +𡾊 +𡽶 +𡽨 +𡽐 +𡼠 +𡺱 +𡺟 +𡹌 +𡸰 +𡸗 +𡶶 +𡶵 +𡶥 +𡶒 +𡵨 +𡵀 +𡰱 +𡰈 +𡫏 +𡩡 +𡨂 +𡦽 +𡤫 +𡟒 +𡞞 +𡝫 +𡜮 +𡚬 +𡙆 +𡖔 +𡔏 +𡔀 +𡒨 +𡒊 +𡏟 +𡏇 +𡌚 +𡋺 +𡊅 +𡉞 +𡉏 +𡈹 +𡆠 +𠿒 +𠿈 +𠽡 +𠻗 +𠹛 +𠹖 +𠸢 +𠴫 +𠳁 +𠯴 +𠫤 +𠨜 +𠨚 +𠥱 +𠤷 +𠤱 +𠣑 +𠢕 +𠠇 +𠟇 +𠝞 +𠜾 +𠜴 +𠛎 +𠙆 +𠖣 +𠐼 +𠐏 +𠏡 +𠌰 +𠊰 +𠉁 +𠈣 +𠅥 +~ +T +K +I +C +B +A +里 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +龯 +齌 +鼿 +鼸 +鼆 +鼀 +黢 +黚 +麿 +麭 
+麎 +麉 +麆 +鹀 +鸅 +鷶 +鷨 +鷑 +鶽 +鶫 +鶪 +鶦 +鶝 +鶌 +鶅 +鶁 +鵼 +鵹 +鵸 +鵨 +鵅 +鴺 +鴮 +鴘 +鴗 +鴖 +鴅 +鴀 +鳹 +鳭 +鳎 +鳈 +鲹 +鲣 +鲘 +鲉 +鱥 +鱚 +鱙 +鱆 +鱃 +鰿 +鰸 +鰡 +鰊 +鯻 +鯯 +鯠 +鯃 +鯂 +鮤 +鮡 +鮕 +鮔 +鮈 +魾 +魼 +魻 +魝 +魜 +骿 +骮 +骉 +驡 +驞 +騿 +騺 +騯 +騬 +騪 +騜 +駶 +駧 +駌 +馼 +馰 +馛 +馉 +餸 +飵 +颽 +颚 +顭 +顪 +顜 +頯 +韾 +韢 +韗 +韏 +鞎 +鞊 +靾 +靍 +霨 +霌 +雽 +雟 +雗 +雓 +雐 +陏 +闄 +闂 +閺 +閰 +镱 +镢 +镅 +锰 +锌 +铬 +铀 +钽 +钶 +钯 +钬 +钨 +钕 +钅 +钀 +鑨 +鑐 +鑅 +鐼 +鐴 +鐯 +鐪 +鐜 +鐈 +鏸 +鏣 +鏒 +鏏 +鏉 +鎑 +鎆 +鍻 +鍴 +鍒 +鍎 +鍌 +錻 +錵 +錥 +錣 +錋 +錂 +鋺 +鋧 +鋆 +銸 +銵 +銴 +銧 +鉹 +鉷 +鉰 +鉌 +鉃 +鈫 +鈪 +鈤 +鈛 +釶 +釲 +釢 +醶 +醔 +醌 +醀 +酼 +酨 +酠 +酓 +酀 +鄬 +鄪 +鄨 +鄐 +鄍 +郺 +郳 +郱 +郩 +郣 +郂 +郀 +邷 +邟 +邔 +邌 +遳 +逫 +迿 +迶 +迵 +迉 +辿 +轏 +轁 +輱 +輢 +輐 +軴 +軝 +軗 +軕 +蹿 +蹾 +蹽 +蹪 +蹧 +蹚 +踚 +踑 +踂 +跭 +趥 +趌 +趈 +贕 +贌 +賥 +貹 +貚 +貗 +貏 +貈 +豷 +豲 +豦 +谻 +讠 +讝 +讂 +譺 +譶 +譅 +譂 +謵 +謴 +謲 +謯 +謮 +謘 +謍 +謋 +謉 +諿 +諻 +諥 +諣 +諀 +誺 +誧 +誙 +誗 +誔 +誐 +誎 +詽 +詻 +詺 +詜 +詙 +詃 +訰 +訬 +訦 +訋 +訆 +訄 +覣 +襰 +襣 +襔 +襀 +褽 +褺 +褩 +裺 +裷 +裧 +裢 +袻 +袹 +袸 +袧 +袡 +袘 +衼 +衶 +衳 +衏 +蠾 +蠽 +蠸 +蠳 +蠞 +蠂 +蟼 +蟤 +蟡 +蟗 +蟔 +螸 +螱 +螠 +螋 +蝽 +蝳 +蝲 +蝭 +蝞 +蝚 +蝐 +蜟 +蜛 +蛫 +蛢 +蛞 +蛅 +蚽 +蚸 +蚜 +蚒 +蚇 +虴 +虰 +虌 +虅 +虂 +蘱 +蘫 +蘈 +藗 +藔 +藊 +藈 +藄 +藃 +薽 +薣 +薚 +薂 +蕍 +蕂 +蔸 +蔨 +蔝 +蔏 +蔈 +蓎 +蓈 +蒴 +蒮 +蒫 +蒠 +蒛 +蒘 +蒖 +蒒 +蒏 +葾 +葥 +葟 +葜 +葎 +萶 +萠 +萀 +菿 +菾 +菺 +菧 +菒 +菎 +菃 +莮 +莬 +莥 +莜 +莏 +莍 +莌 +莈 +莄 +荾 +荎 +茻 +茛 +苸 +苲 +苰 +苬 +苠 +苖 +苄 +苂 +苀 +芺 +芵 +芔 +芇 +芆 +艹 +艵 +艔 +艏 +艀 +舨 +臷 +臎 +臅 +膦 +膗 +腵 +腛 +腘 +腈 +腅 +腃 +脦 +脠 +脒 +胻 +胺 +胘 +胕 +胉 +胅 +肼 +肶 +肙 +肏 +聧 +聤 +聓 +聄 +聁 +耾 +耹 +耷 +耠 +耚 +翵 +翧 +羺 +羠 +缍 +纞 +纗 +繓 +縬 +縜 +縒 +縏 +縌 +縄 +緸 +緷 +緛 +綛 +綔 +絬 +絗 +絑 +紨 +糫 +糏 +糃 +粆 +籣 +籅 +簭 +簚 +簂 +篧 +篜 +篕 +篃 +箿 +箈 +箂 +筽 +筶 +筫 +筨 +筘 +筄 +笸 +笧 +笣 +笝 +笒 +竨 +竀 +窏 +穼 +穛 +穖 +穔 +穃 +穁 +稫 +稤 +稖 +稕 +稒 +秳 +秲 +秓 +禸 +禷 +禛 +禉 +祹 +祵 +祱 +祂 +礹 +礵 +礭 +礇 +磱 +磭 +磦 +磘 +磂 +碘 +碋 +硺 +硥 +硢 +硛 +硍 +硅 +砱 +砛 +砈 +砄 +矽 +矨 +矠 +瞺 +瞴 +瞝 +瞛 +瞙 +瞔 +瞏 +瞉 +瞈 +睾 +睸 +睧 +睔 +睈 +眮 +眣 +眗 +眖 +眔 +眍 +盷 +盄 +皫 +皝 +癿 +癦 +癔 +癑 +癍 +癋 +瘷 +瘱 +瘩 +瘜 +瘛 +痽 +痻 +痧 +痑 +痆 +痄 +痃 +疨 +疛 +甏 +甉 +瓵 +瓫 +瓩 +瓥 +瓂 +璯 +璒 +璌 +瑷 +瑬 +瑝 +瑅 +瑃 +瑂 +琩 +琧 +琙 +琓 +琋 +珴 +珨 +珦 +珛 +珄 +珃 +玏 +獽 +獩 +獣 +獡 +獕 +獓 +獋 +獊 +狛 +狅 +犾 +犸 +犰 +犩 +牻 +牳 +牑 +牃 +爿 +爔 +爎 +爄 +爂 +燶 +燣 +燡 +熷 +熵 +熴 +熰 +熩 +熕 +熓 +熑 +熃 +煻 +煪 +煐 +煋 +焪 +焖 +焀 +烷 +烠 +烀 +炾 +炪 +炥 +炞 
+炝 +炚 +炌 +灦 +灚 +灀 +瀶 +瀱 +瀪 +濹 +濸 +濦 +濣 +濗 +澿 +澽 +澺 +澭 +澪 +澢 +澛 +潵 +潡 +潐 +漺 +漝 +漛 +漒 +漃 +滳 +滰 +滜 +溭 +溍 +湨 +湚 +湕 +湀 +渷 +渳 +渜 +渓 +渏 +淣 +淔 +淎 +淃 +涺 +涭 +涥 +涍 +浾 +浶 +浧 +洷 +洰 +洢 +洔 +洆 +泹 +泍 +沝 +沎 +沋 +汯 +汥 +汢 +汑 +氽 +氰 +氮 +氨 +氟 +氀 +殸 +欗 +櫼 +櫡 +櫏 +櫍 +櫅 +檼 +檵 +檴 +檧 +橾 +橭 +橪 +橨 +橞 +橖 +橔 +橒 +橆 +橃 +橀 +樖 +樒 +樈 +槾 +槸 +槡 +槙 +榺 +榞 +楤 +楐 +楋 +楉 +楆 +椼 +椲 +椯 +椩 +椢 +椚 +椋 +椊 +棿 +棤 +棎 +棇 +梾 +梽 +桾 +桻 +栥 +栙 +柭 +柫 +柛 +柒 +枼 +枩 +枡 +枟 +枙 +枆 +杸 +杧 +杛 +杚 +杔 +朹 +朸 +朳 +朥 +朜 +朆 +曋 +曂 +暺 +暩 +暥 +暟 +暊 +晫 +晗 +昹 +昋 +旫 +旔 +斣 +敀 +攳 +攍 +擳 +擛 +撜 +撖 +撊 +撂 +摼 +摷 +摮 +摬 +摡 +摗 +摓 +搻 +搸 +搫 +搡 +搟 +搝 +揧 +揟 +揙 +揓 +揇 +揁 +掝 +掜 +掑 +捝 +捌 +捊 +捈 +捀 +挓 +挒 +挄 +挀 +拺 +抲 +抮 +抭 +抐 +抋 +抂 +扻 +扲 +扨 +扜 +扄 +戜 +戓 +懻 +懱 +懝 +懗 +懅 +憼 +憘 +慡 +慔 +慏 +愪 +愥 +愝 +愖 +愄 +惗 +惃 +悿 +悡 +悈 +悂 +恾 +恦 +恎 +恄 +恀 +怽 +怤 +忶 +忚 +忕 +忄 +徦 +徥 +徍 +彸 +彆 +彃 +弲 +弪 +弞 +弖 +廤 +庯 +庝 +庘 +庎 +庁 +幦 +幠 +幓 +幋 +幆 +帺 +帵 +帞 +帍 +帄 +巯 +嶩 +嶥 +嶘 +嶎 +嵨 +嵧 +嵙 +嵅 +崾 +峜 +峑 +峊 +岇 +岆 +岄 +屾 +屻 +屸 +屌 +屄 +尳 +尥 +寷 +寲 +寯 +寏 +寋 +寊 +宱 +宔 +宀 +孢 +孉 +孈 +嬻 +嬳 +嬱 +嬬 +嬩 +嬎 +嬆 +嫶 +嫧 +嫐 +媶 +媙 +媄 +婽 +婸 +婫 +婋 +娺 +娷 +娧 +娞 +娂 +姺 +姶 +姳 +姩 +姖 +姇 +姂 +姀 +妵 +妦 +妢 +妡 +妟 +妑 +奲 +奣 +夼 +夰 +夨 +夃 +壪 +壣 +壆 +墿 +墲 +墭 +墘 +墔 +墌 +塶 +塱 +塥 +塝 +堬 +堥 +堜 +堎 +埨 +垴 +垱 +垘 +垗 +垊 +垈 +坽 +坴 +坬 +坈 +坄 +圵 +圴 +圱 +圙 +圔 +圐 +囍 +囇 +囄 +嚝 +嚛 +嚍 +噺 +噮 +噡 +噅 +嘥 +嘣 +嗴 +嗙 +喱 +喰 +啫 +啇 +啂 +唿 +哾 +哴 +哞 +哖 +哐 +咞 +咝 +咗 +咇 +呎 +呋 +呁 +吣 +吋 +叐 +叀 +厽 +厱 +厬 +厒 +厇 +匴 +匤 +匒 +勫 +劦 +劥 +劜 +劐 +劏 +劆 +劁 +剫 +刏 +凗 +凐 +凎 +冹 +冃 +儾 +儶 +儰 +儩 +儏 +僗 +僈 +傌 +傄 +偤 +偍 +偋 +偅 +倿 +倞 +倝 +倊 +倄 +俧 +俉 +侺 +侳 +侇 +侀 +佲 +佨 +伬 +伡 +仸 +仱 +仫 +仩 +仜 +亇 +亄 +乽 +乀 +丷 +丒 +䶥 +䶙 +䵾 +䵶 +䵬 +䵩 +䵨 +䵤 +䵘 +䵎 +䴾 +䴺 +䴴 +䴚 +䴙 +䴇 +䴅 +䴃 +䴀 +䳽 +䳷 +䳮 +䳭 +䳚 +䳐 +䳂 +䲛 +䲓 +䲀 +䱦 +䱠 +䱓 +䱌 +䱁 +䰼 +䰻 +䰝 +䰙 +䰏 +䰎 +䰋 +䰃 +䯿 +䯲 +䯭 +䯠 +䯗 +䯔 +䯓 +䯋 +䯊 +䯈 +䯄 +䮷 +䮥 +䮝 +䮕 +䮋 +䮂 +䭹 +䭪 +䭢 +䭘 +䭈 +䬵 +䬳 +䬦 +䬠 +䬓 +䬌 +䬆 +䫾 +䫼 +䫹 +䫪 +䫩 +䫥 +䫒 +䫋 +䫁 +䪹 +䪴 +䪤 +䪓 +䪍 +䩳 +䩦 +䩡 +䩐 +䩋 +䨱 +䨨 +䨛 +䨕 +䨏 +䧜 +䧆 +䦵 +䦚 +䥬 +䥝 +䥓 +䥈 +䤬 +䤎 +䤇 +䣼 +䣯 +䣮 +䣣 +䣢 +䣕 +䢷 +䡶 +䡰 +䡦 +䡐 +䠬 +䠪 +䠟 +䠚 +䠎 +䠀 +䟮 +䟛 +䟕 +䟑 +䟂 +䞤 +䞙 +䞘 +䞀 +䝨 +䝙 +䝖 +䝏 +䝍 +䝄 +䜤 +䜝 +䜕 +䜌 +䜉 +䛸 +䛢 +䛒 +䛆 +䚹 +䚮 +䚔 +䚍 +䚇 +䙷 +䙤 +䙞 +䙌 +䙅 +䙃 +䘺 +䘸 +䘦 +䘛 +䘚 +䘓 +䘌 +䗿 +䗽 +䗰 +䗐 +䗋 +䖺 +䖵 +䖱 +䖦 +䖔 +䖑 +䕹 +䕵 +䕦 +䕟 +䕓 
+䕌 +䔾 +䔷 +䔰 +䔯 +䔭 +䔬 +䔑 +䔋 +䔆 +䓶 +䓨 +䓣 +䓏 +䓉 +䒺 +䒷 +䒘 +䒎 +䒍 +䒁 +䑽 +䑼 +䑶 +䑪 +䑦 +䐿 +䐺 +䐪 +䐩 +䐨 +䐧 +䐣 +䐢 +䐘 +䐗 +䐊 +䐂 +䏬 +䏥 +䏒 +䏁 +䎿 +䎼 +䎸 +䎳 +䎩 +䎂 +䍴 +䍣 +䍑 +䍋 +䍃 +䌴 +䌇 +䌁 +䋼 +䋺 +䋁 +䊻 +䊹 +䊮 +䊝 +䉵 +䉬 +䉪 +䉦 +䉥 +䉣 +䉔 +䉌 +䉉 +䈻 +䈵 +䈰 +䈟 +䈝 +䈕 +䈒 +䈑 +䈏 +䈁 +䇞 +䇘 +䇓 +䆲 +䆦 +䆖 +䆎 +䆃 +䆁 +䅼 +䅺 +䅶 +䅩 +䅤 +䅗 +䅖 +䅓 +䅎 +䅋 +䅊 +䄻 +䄱 +䄩 +䄛 +䄌 +䄇 +䃾 +䃳 +䃰 +䃗 +䃑 +䃎 +䃆 +䂽 +䂨 +䂑 +䂍 +䁲 +䁬 +䁠 +䁝 +䁘 +䁈 +䁁 +䀿 +䀫 +䀁 +㿪 +㿟 +㿜 +㿃 +㿂 +㾞 +㾐 +㾊 +㾁 +㽽 +㽻 +㽵 +㽥 +㽤 +㽕 +㽑 +㼽 +㼻 +㼷 +㼶 +㼱 +㼰 +㼨 +㼣 +㼝 +㼘 +㼕 +㼐 +㼋 +㼈 +㼆 +㻛 +㻍 +㻌 +㻁 +㺲 +㺥 +㺠 +㺗 +㺌 +㺊 +㹻 +㹲 +㹗 +㹔 +㹈 +㸽 +㸰 +㸪 +㸡 +㸝 +㸒 +㷼 +㷵 +㷱 +㷠 +㷟 +㶴 +㶳 +㶟 +㶄 +㵶 +㵣 +㵞 +㵗 +㵋 +㴲 +㴦 +㴥 +㴒 +㴂 +㳶 +㳴 +㳞 +㳃 +㲼 +㲰 +㲮 +㲣 +㲠 +㲏 +㱾 +㱪 +㱦 +㱟 +㱙 +㱊 +㱅 +㱁 +㰶 +㰳 +㰪 +㰞 +㰝 +㯻 +㯮 +㯗 +㯐 +㯌 +㯅 +㮿 +㮹 +㮷 +㮴 +㮲 +㮬 +㮙 +㮕 +㮒 +㮀 +㭺 +㭮 +㭦 +㭙 +㭒 +㭑 +㬺 +㬮 +㬭 +㬤 +㬣 +㫿 +㫸 +㫮 +㫬 +㫧 +㫝 +㫙 +㪹 +㪇 +㪁 +㩾 +㩋 +㩈 +㩂 +㨵 +㨭 +㨦 +㨟 +㨝 +㨋 +㧹 +㧶 +㧮 +㧬 +㧣 +㧝 +㧘 +㧗 +㧕 +㧒 +㧑 +㦿 +㦷 +㦤 +㦗 +㦒 +㦎 +㦄 +㥶 +㥎 +㥅 +㥃 +㤱 +㤮 +㤨 +㤕 +㤊 +㤉 +㣱 +㣚 +㣂 +㢋 +㢅 +㢃 +㡤 +㡗 +㡆 +㠿 +㠹 +㠸 +㠷 +㠭 +㠨 +㠛 +㠙 +㠐 +㠏 +㠇 +㟲 +㟰 +㟥 +㟓 +㟒 +㟐 +㟎 +㞳 +㞥 +㞟 +㞘 +㞒 +㞇 +㝵 +㝣 +㝟 +㝏 +㝁 +㜰 +㜬 +㜝 +㜊 +㛺 +㛪 +㛠 +㛙 +㛂 +㚻 +㚵 +㚤 +㚋 +㚃 +㙭 +㙪 +㙩 +㙐 +㙏 +㙋 +㘻 +㘵 +㘲 +㘬 +㘦 +㘋 +㘊 +㘉 +㘃 +㗨 +㗧 +㗙 +㗘 +㗉 +㖿 +㖽 +㖵 +㖮 +㖩 +㖡 +㖞 +㖓 +㖇 +㕬 +㕥 +㕚 +㕎 +㕂 +㔾 +㔨 +㓠 +㓘 +㓖 +㒮 +㒗 +㒓 +㒊 +㑲 +㑯 +㑭 +㑀 +㐸 +㐌 +ㄦ +ㄠ +ㄝ +ㄚ +ㄓ +ㄉ +ㄇ +ゼ +エ +ぷ +び +ね +ず +じ +〗 +〃 +  +⿹ +⿵ +⿲ +⾙ +⻊ +⺼ +⺢ +⺌ +⺅ +⑶ +• +​ +и +Ъ +Ё +Λ +Β +x +r +n +g +d +Y +S +P +L +$ diff --git a/tasks/common/gradio/__init__.py b/tasks/common/gradio/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tasks/common/gradio/pipeline_app.py b/tasks/common/gradio/pipeline_app.py new file mode 100644 index 0000000000000000000000000000000000000000..257bb2582d9cf61d685dbd90d092a72c9d0d08d6 --- /dev/null +++ b/tasks/common/gradio/pipeline_app.py @@ -0,0 +1,256 @@ +""" +基于 Pipeline 的 Gradio 应用构建器 + +提供可配置的 Gradio 应用,支持不同的文本生成场景。 +""" + +from functools import partial + +import gradio as gr + +import pipeline.base.sample_functions as sample_functions +from pipeline import 
class AppBuilderFromPipeline:
    """
    Gradio application builder driven by a Pipeline configuration.

    Builds a text-generation UI (prompt box, sampling controls, output box)
    and creates the underlying TextGenerator lazily, on the first call to
    ``generate_text``.
    """

    def __init__(
        self,
        pipeline: Pipeline,
        title: str = "文本生成器",
        placeholder: str = "请输入提示文本",
        output_label: str = "生成的文本",
        max_length: int = 200,
    ):
        """
        Initialize the application.

        Args:
            pipeline: Pipeline instance
            title: UI title
            placeholder: placeholder text of the prompt box
            output_label: label of the output box
            max_length: default maximum generation length; also used as the
                upper bound of the length slider
        """
        self.pipeline = pipeline
        self.title = title
        self.placeholder = placeholder
        self.output_label = output_label
        self.max_length = max_length
        # Slider components are created in create_ui(); update_ui() uses them as keys.
        self.temp_slider = None
        self.top_k_slider = None
        self._generator = None  # lazily initialized

    def _load_inference_artifact(self) -> tuple:
        """
        Resolve the deployment checkpoint rule and load the inference artifact.

        Returns:
            Whatever load_inference_artifact_from_pipeline returns;
            _init_generator unpacks it as (inference_artifact, tokenizer_info).
        """
        checkpoint_rule = self.pipeline.checkpoint_rules.resolve_deployment_rule(
            default_dirs=[self.pipeline.checkpoint_dir]
        )
        return load_inference_artifact_from_pipeline(self.pipeline, checkpoint_rule)

    def get_model_info(self) -> str:
        """
        Return model information as a single-line Markdown string.

        Raises:
            FileNotFoundError: if resolve_checkpoint yields no checkpoint path
        """
        parts = []
        checkpoint_rule = self.pipeline.checkpoint_rules.resolve_deployment_rule(
            default_dirs=[self.pipeline.checkpoint_dir]
        )

        # Resolve the checkpoint path
        checkpoint_path, _ = resolve_checkpoint(**checkpoint_rule)

        if checkpoint_path is None:
            raise FileNotFoundError("未找到模型检查点文件")
        else:
            # Model file name and size (bytes converted to MB)
            file_name = checkpoint_path.name
            file_size = checkpoint_path.stat().st_size
            parts.append(
                f"**模型文件**: {file_name}({file_size / (1024 * 1024):.2f} MB)"
            )

        # Vocabulary size (with the vocabulary path when one is available)
        tokenizer_info = self.pipeline.dataset.tokenizer_bundle()
        vocab_size = tokenizer_info.vocab_size
        if tokenizer_info.vocab_path:
            parts.append(
                f"**词汇表**: {display_path(tokenizer_info.vocab_path)}({vocab_size}词)"
            )
        else:
            parts.append(f"**词汇表**: {vocab_size}词")

        return ",".join(parts)

    def _init_generator(self) -> TextGenerator:
        """Load the model and tokenizer and build the TextGenerator."""
        print("正在加载模型和分词器...")
        inference_artifact, tokenizer_info = self._load_inference_artifact()
        print("模型加载完成!")
        # top_k is only the default sampler; generate_text passes its own
        # sample_fn on every call.
        generator = TextGenerator(
            artifact=inference_artifact,
            tokenizer=tokenizer_info.tokenizer,
            decode=tokenizer_info.decode,
            end_of_text=tokenizer_info.end_of_text,
            max_length=self.max_length,
            sample_fn=sample_functions.top_k
        )
        return generator

    def _ensure_generator_initialized(self) -> None:
        """Make sure the generator exists (lazy loading)."""
        if self._generator is None:
            self._generator = self._init_generator()

    @staticmethod
    def get_sample_fn(strategy: str, temperature: float, top_k_value: int):
        """
        Return the sampling function for the given strategy and parameters.

        Args:
            strategy: one of "greedy", "random", "top_k"
            temperature: bound into the "random" and "top_k" samplers
            top_k_value: bound as ``k`` into the "top_k" sampler

        Raises:
            ValueError: if the strategy is not one of the three known names
        """
        if strategy == "greedy":
            return sample_functions.greedy_search
        elif strategy == "random":
            return partial(sample_functions.random_sample, temperature=temperature)
        elif strategy == "top_k":
            return partial(
                sample_functions.top_k, k=top_k_value, temperature=temperature
            )
        else:
            raise ValueError(f"未知的采样策略: {strategy}")

    def generate_text(
        self,
        prompt: str,
        max_length: int,
        strategy: str,
        temperature: float,
        top_k_value: int,
    ) -> str:
        """
        Generate text.

        The model is loaded automatically on the first call.

        Args:
            prompt: input prompt text
            max_length: maximum generation length
            strategy: sampling strategy
            temperature: temperature parameter
            top_k_value: top-k value

        Returns:
            The generated text immediately followed by the formatted
            ``stop_reason`` of the generation result.
        """
        # Make sure the generator exists (lazy loading)
        self._ensure_generator_initialized()

        sample_fn = self.get_sample_fn(strategy, temperature, top_k_value)
        result = self._generator.generate_text(
            prompt,
            max_length=max_length,
            sample_fn=sample_fn
        )
        return f"{result.text}{result.stop_reason}"

    def update_ui(self, strategy: str):
        """
        Update the slider states according to the sampling strategy.

        Returns a dict mapping the temperature and top-k slider components to
        ``gr.update(...)`` values: both disabled and reset for "greedy", only
        temperature enabled for "random", both enabled otherwise.
        """
        if strategy == "greedy":
            return {
                self.temp_slider: gr.update(interactive=False, value=1.0),
                self.top_k_slider: gr.update(interactive=False, value=5),
            }
        elif strategy == "random":
            return {
                self.temp_slider: gr.update(interactive=True),
                self.top_k_slider: gr.update(interactive=False, value=5),
            }
        else:  # top_k
            return {
                self.temp_slider: gr.update(interactive=True),
                self.top_k_slider: gr.update(interactive=True),
            }

    def create_ui(self):
        """Build the Gradio Blocks interface and return it."""
        with gr.Blocks(title=self.title) as demo:
            gr.Markdown(f"# {self.title}")
            gr.Markdown("输入提示文本,模型将生成续写内容。")
            # Best effort: a failure to read model info is rendered in the page
            # instead of preventing the UI from being built.
            try:
                gr.Markdown(self.get_model_info())  # model information display
            except Exception as e:
                gr.Markdown(f"**模型信息加载失败**: {str(e)}")

            with gr.Row():
                with gr.Column():
                    # Input area
                    prompt_input = gr.Textbox(
                        label="提示文本 (Prompt)",
                        placeholder=self.placeholder,
                        lines=3,
                    )

                    # Sampling strategy
                    strategy_radio = gr.Radio(
                        choices=["greedy", "random", "top_k"],
                        value="top_k",
                        label="采样策略",
                    )

                    # Parameter controls
                    self.temp_slider = gr.Slider(
                        minimum=0.1,
                        maximum=5.0,
                        value=1.0,
                        step=0.1,
                        label="Temperature",
                    )

                    self.top_k_slider = gr.Slider(
                        minimum=1, maximum=20, value=5, step=1, label="Top-k"
                    )

                    max_length_slider = gr.Slider(
                        minimum=10,
                        maximum=self.max_length,
                        value=self.max_length,
                        step=1,
                        label="最大生成长度",
                    )

                    generate_btn = gr.Button("生成", variant="primary")

                with gr.Column():
                    # Output area
                    output_text = gr.Textbox(
                        label=self.output_label,
                        lines=15,
                        interactive=False,
                    )

            # Event bindings
            strategy_radio.change(
                fn=self.update_ui,
                inputs=[strategy_radio],
                outputs=[self.temp_slider, self.top_k_slider],
            )

            # The input order must match the parameter order of generate_text.
            generate_btn.click(
                fn=self.generate_text,
                inputs=[
                    prompt_input,
                    max_length_slider,
                    strategy_radio,
                    self.temp_slider,
                    self.top_k_slider,
                ],
                outputs=[output_text],
            )

        return demo

    def run(self):
        """Launch the application (without a public share link)."""
        demo = self.create_ui()
        demo.launch(share=False)
class PoetryGenerateRunner(BaseGenerationRunner):
    """Generation runner for the poetry model."""

    title = "诗歌生成器"
    fixed_prompts = [
        "白日依山尽",
        "床前明月光",
        "春眠不觉晓",
        "千山鸟飞绝",
        "空山不见人",
    ]
    random_config = {"num_text": 10, "text_length": 10}

    # TODO: the three generators (poetry_gpt, poetry_rnn, wiki_gpt) are
    # essentially duplicated code
    def _build_generator(self) -> TextGenerator:
        """Build the TextGenerator from the checkpoint selected by the testing rule."""
        pipe = self.pipeline
        rule = pipe.checkpoint_rules.resolve_testing_rule(
            default_dirs=[pipe.checkpoint_dir]
        )
        artifact, tok = load_inference_artifact_from_pipeline(pipe, rule)
        return TextGenerator(
            artifact=artifact,
            tokenizer=tok.tokenizer,
            decode=tok.decode,
            end_of_text=tok.end_of_text,
            max_length=100,
            sample_fn=pipe.generation_rule.sample_strategy,
        )
def main():
    """Save the inference model of the resolved pipeline and print its path."""
    saved_to = resolve_pipeline().save_inference_model()
    print(f"模型已保存到: {saved_to}")
def resolve_pipeline():
    """
    Pick the Pipeline instance from the ``ENV`` environment variable.

    Returns ``prod_pip`` only when ``ENV`` equals "production"; any other
    value, or an unset variable, selects ``test_pip``.
    """
    if os.environ.get("ENV", "test") == "production":
        return prod_pip
    return test_pip
def main():
    """Save the inference model of the resolved pipeline and print its path."""
    saved_to = resolve_pipeline().save_inference_model()
    print(f"模型已保存到: {saved_to}")
# Test configuration: one-layer RNN, 5 epochs of 30 steps, no validation
# batches, reading from the data/dev/poetry directory.
test_pip = Pipeline(
    name="poetry_rnn",
    dataset=PoetryDataset(
        data_dir=str(resolve_path("data/dev/poetry")),
        vocab_path=str(resolve_saved("vocab/poetry/vocab.txt")),
        sequence_length=100,
    ),
    model_builder=RNNModelBuilder(num_layers=1, embedding_dim=50, hidden_dim=50),
    training_rule=TrainingRule(
        batch_size=128, epochs=5, steps_per_epoch=30, validation_batches=0
    ),
    generation_rule=GenerationRule(
        prompts_generator=fixed_prompts(["白日依山"]),
        sample_strategy=top_k
    ),
    checkpoint_rules=CheckpointRules()
)

# Production configuration: two-layer RNN trained on the dataset under
# ~/data/Poetry. The testing rule pins the epoch-41 checkpoint; deployment
# looks for ".keras" files under the saved models/poetry_rnn directory.
prod_pip = Pipeline(
    name="poetry_rnn",
    dataset=PoetryDataset(
        data_dir=str(resolve_path("~/data/Poetry/诗歌数据集")),
        vocab_path=str(resolve_saved("vocab/poetry/vocab.txt")),
        sequence_length=100,
    ),
    model_builder=RNNModelBuilder(num_layers=2, embedding_dim=100, hidden_dim=512),
    training_rule=TrainingRule(
        batch_size=128, epochs=100, steps_per_epoch=2000, validation_batches=200
    ),
    generation_rule=GenerationRule(
        prompts_generator=fixed_prompts(["白日依山"]),
        sample_strategy=top_k
    ),
    checkpoint_rules=CheckpointRules(
        testing=CheckpointConfig(epoch=41),
        deployment=CheckpointConfig(dirs=[resolve_saved("models/poetry_rnn")], suffix=".keras")
    )
)
def main(data_dir: str, glob_pattern: str, tokenizer_type: str, name: str = "数据集"):
    """
    Compute dataset statistics and print the report.

    Args:
        data_dir: directory holding the dataset files
        glob_pattern: glob pattern selecting files inside ``data_dir``
        tokenizer_type: tokenizer type passed to WikiDataset
        name: accepted for callers but currently not used by this function
    """
    WikiDataset(
        data_dir=data_dir,
        glob_pattern=glob_pattern,
        tokenizer_type=tokenizer_type,
    ).stat(seq_length=256)
class GptGenerateRunner(BaseGenerationRunner):
    """Generation runner for the Mini GPT model."""

    title = "Mini GPT 文本生成器"
    fixed_prompts = ["中国的首都是"]
    random_config = {"num_text": 10, "preview_size": 100}

    def _build_generator(self):
        """Build the TextGenerator from the checkpoint selected by the testing rule."""
        pipe = self.pipeline
        rule = pipe.checkpoint_rules.resolve_testing_rule(
            default_dirs=[pipe.checkpoint_dir]
        )
        artifact, tok = load_inference_artifact_from_pipeline(pipe, rule)
        return TextGenerator(
            artifact=artifact,
            tokenizer=tok.tokenizer,
            decode=tok.decode,
            end_of_text=tok.end_of_text,
            max_length=200,
            sample_fn=pipe.generation_rule.sample_strategy,
        )
def main():
    """Save the inference model of the resolved pipeline and print its path."""
    saved_to = resolve_pipeline().save_inference_model()
    print(f"模型已保存到: {saved_to}")
def convert_wiki_to_minic4(input_path: str, output_path: str) -> int:
    """
    Convert one wiki-format file into mini-c4 format.

    The input is expected to contain documents wrapped as
    ``<doc ...>text</doc>``. Each document is stripped, passed through
    ``clean``, and, if anything remains, written as a single output line
    with every newline replaced by the two characters ``\\n``.

    Args:
        input_path: path of the wiki-format file
        output_path: path of the mini-c4 output file (overwritten)

    Returns:
        Number of documents written
    """
    # Read the whole file
    with open(input_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Capture the body of every <doc ...>...</doc> element. DOTALL lets "."
    # cross line breaks; the lazy group stops at the first closing tag.
    pattern = re.compile(r"<doc[^>]*>(.*?)</doc>", re.DOTALL)
    docs = pattern.findall(content)

    # Process and write
    count = 0
    with open(output_path, "w", encoding="utf-8") as f:
        for doc in docs:
            # Strip surrounding whitespace, then clean
            text = clean(doc.strip())

            # Skip documents that are empty after cleaning
            if not text:
                continue

            # Escape newlines so that one document occupies exactly one line
            text = text.replace("\n", "\\n")

            f.write(text + "\n")
            count += 1

    return count
@pytest.mark.parametrize(
    "rval",
    [np.array([0]), tf.constant([0]), [0]],
)
def test_concat_end_of_text(rval: Any):
    """Check tf.concat when the appended value is an ndarray, a tensor, or a list."""
    prefix = tf.constant([1, 2, 3, 4, 5])
    result = tf.concat([prefix, rval], -1)
    expected = tf.constant([1, 2, 3, 4, 5, 0])

    # The concatenated tensor must be one element longer than the prefix
    shape_ok = result.shape == (6,)
    assert shape_ok, f"Expected length 6, got {result.shape[0]}"

    # ...and element-wise equal to the expected sequence
    values_match = tf.reduce_all(tf.equal(result, expected)).numpy()
    assert values_match, f"Expected {expected}, got {result}"
result[8] == b"second document of third file" + assert result[9] == b"third document of third file" diff --git a/test/data/tokenizer_test.py b/test/data/tokenizer_test.py new file mode 100644 index 0000000000000000000000000000000000000000..694e1d2dbcdfd6c92e9f5764c2386a13066bd201 --- /dev/null +++ b/test/data/tokenizer_test.py @@ -0,0 +1,65 @@ +import keras +import keras_hub +import pytest +from keras import layers + + +def test_vectorizer_specified_vocabulary_one(): + vectorizer = layers.TextVectorization( + output_mode="int", + split="character", + output_sequence_length=10, + standardize=None, + ) + vocab = ["", "", "白", "日", "依", "山", "尽", "$"] + vectorizer.set_vocabulary(vocab) + + sample_text = "白日依山尽" + encoded = vectorizer([sample_text]) + assert (encoded[0].numpy() == [4, 5, 6, 7, 8, 0, 0, 0, 0, 0]).all(), ( + "编码结果比词表进了2位,因为前面的两个特殊标记没有被认可" + ) + + +def test_vectorizer_specified_vocabulary_two(): + vectorizer = layers.TextVectorization( + output_mode="int", + split="character", + output_sequence_length=10, + standardize=None, + ) + vocab = ["", "[UNK]", "白", "日", "依", "山", "尽", "$"] + vectorizer.set_vocabulary(vocab) + + sample_text = "白日依山尽" + encoded = vectorizer([sample_text]) + assert (encoded[0].numpy() == [2, 3, 4, 5, 6, 0, 0, 0, 0, 0]).all(), ( + "编码结果与词表的序号一致" + ) + + +def test_batch_encode_decode(): + """测试批量编码和解码功能""" + pytest.importorskip("tensorflow_text") + vocabulary_file = keras.utils.get_file( + origin="https://hf-mirror.com/mattdangerw/spiece/resolve/main/vocabulary.proto" + ) + tokenizer = keras_hub.tokenizers.SentencePieceTokenizer(vocabulary_file) + + # 批量编码 + texts = ["", "Hi!", "Machine learning is amazing."] + tokens = tokenizer.tokenize(texts) + + # 验证编码结果 + # SentencePiece 默认返回 RaggedTensor;但是传递 sequence_length 参数会返回密集 Tensor,不足的部分会被填充为 0. 
+ expected_tokens = [[], [6324, 29991], [6189, 6509, 338, 21863, 292, 29889]] + assert tokens.to_list() == expected_tokens, f"编码结果不匹配: {tokens.to_list()}" + + # 批量解码 + decoded = tokenizer.detokenize(tokens) + + # 验证解码结果 + expected_decoded = [b"", b"Hi!", b"Machine learning is amazing."] + assert decoded.numpy().tolist() == expected_decoded, ( + f"解码结果不匹配: {decoded.numpy().tolist()}" + ) diff --git a/test/data/vocab_test.py b/test/data/vocab_test.py new file mode 100644 index 0000000000000000000000000000000000000000..932213aeb6d89e70b175abf9966cebf593bd1c44 --- /dev/null +++ b/test/data/vocab_test.py @@ -0,0 +1,66 @@ +import pathlib +import tempfile + +from keras import layers + +from data import PoetryDataset +from data.common import build_vocab_from_dataset +from data.poetry.loader import doc_load +from data.poetry.tokenizer import load_vectorizer +from env.resolve import resolve_path + + +def test_create_and_load_vectorizer(): + """测试加载已保存的 TextVectorization 层并验证编码解码""" + + data_dir = resolve_path("data/dev/poetry") + sequence_length = 100 + dataset = doc_load(data_dir) + + def _check_vectorizer_encode(vectorizer: layers.TextVectorization): + vocab = vectorizer.get_vocabulary() + + sample_tensor = dataset.take(1).get_single_element() + sample_text = sample_tensor.numpy().decode("utf-8") + + encoded = vectorizer([sample_text]) + + nonzero_indices = encoded[0].numpy()[encoded[0].numpy() > 0] + decoded = [vocab[idx] for idx in nonzero_indices] + decoded_text = "".join(decoded) + + original_chars = list(sample_text[: len(decoded)]) + decoded_chars = list(decoded_text) + assert decoded_chars == original_chars, "解码的字符应与原始文本一致" + + with tempfile.TemporaryDirectory() as tmpdir: + vocab_path = pathlib.Path(tmpdir) / "poetry_vocab.txt" + vocab = build_vocab_from_dataset(dataset, vocab_path) + + assert vocab_path.exists(), "词汇表文件应该被创建" + assert len(vocab) > 0, "词汇表不应为空" + + loaded_vectorizer = load_vectorizer(vocab_path, sequence_length) + loaded_vocab = 
loaded_vectorizer.get_vocabulary() + + assert len(loaded_vocab) == len(vocab), "加载的词汇表大小应一致" + _check_vectorizer_encode(loaded_vectorizer) + + +def test_poetry_dataset_tokenizer_bundle_contains_vocab_path(): + data_dir = resolve_path("data/dev/poetry") + + with tempfile.TemporaryDirectory() as tmpdir: + vocab_path = pathlib.Path(tmpdir) / "poetry_vocab.txt" + dataset = doc_load(data_dir) + build_vocab_from_dataset(dataset, vocab_path) + + poetry_dataset = PoetryDataset( + data_dir=str(data_dir), + vocab_path=str(vocab_path), + sequence_length=100 + ) + + tokenizer_info = poetry_dataset.tokenizer_bundle() + + assert tokenizer_info.vocab_path == str(vocab_path) diff --git a/test/data/wiki_cleaner_test.py b/test/data/wiki_cleaner_test.py new file mode 100644 index 0000000000000000000000000000000000000000..01d3276266167c6b9be17511ec77fd20abdab72d --- /dev/null +++ b/test/data/wiki_cleaner_test.py @@ -0,0 +1,309 @@ +""" +Wiki 清洗模块的单元测试。 +""" + +from pathlib import Path + +from data.wiki.wiki_cleaner import ( + filter_single_line, + filter_html_tags, + filter_empty_brackets, + filter_lang_tags, + clean, +) +from env.resolve import resolve_path + + +class TestFilterSingleLine: + """测试单行过滤器""" + + def test_single_line_returns_none(self): + """单行文本应该返回 None""" + assert filter_single_line("这是一个重定向") is None + + def test_single_line_with_whitespace_returns_none(self): + """单行但包含空白字符应该返回 None""" + assert filter_single_line(" 这是一个重定向 ") is None + + def test_multiple_lines_returns_original(self): + """多行文本应该返回原文本""" + text = "第一行\n第二行\n第三行" + assert filter_single_line(text) == text + + def test_multiple_lines_with_empty_lines(self): + """多行包含空行应该返回原文本""" + text = "第一行\n\n第二行\n\n" + result = filter_single_line(text) + assert result == text + + def test_empty_string_returns_none(self): + """空字符串应该返回 None""" + assert filter_single_line("") is None + + def test_only_whitespace_returns_none(self): + """只有空白字符应该返回 None""" + assert filter_single_line(" \n \n ") is None + + +class 
TestFilterEmptyBrackets: + """测试空括号过滤器""" + + def test_remove_empty_parentheses_in_text(self): + """移除文本中的空括号 ()""" + text = "这是()一段文本" + result = filter_empty_brackets(text) + assert result == "这是一段文本" + + def test_remove_empty_chinese_brackets_in_text(self): + """移除文本中的空中文括号 ()""" + text = "这是()一段文本" + result = filter_empty_brackets(text) + assert result == "这是一段文本" + + def test_remove_brackets_with_space_in_text(self): + """移除带空格的空括号""" + text = "这是( )一段( )文本" + result = filter_empty_brackets(text) + assert result == "这是一段文本" + + def test_keep_brackets_with_content(self): + """保留有内容的括号""" + text = "这是一个(有内容的)括号" + assert filter_empty_brackets(text) == text + + def test_remove_square_brackets_in_text(self): + """移除文本中的空方括号 []""" + text = "这是[]一段[ ]文本" + result = filter_empty_brackets(text) + assert result == "这是一段文本" + + def test_remove_chinese_square_brackets_in_text(self): + """移除文本中的空中文方括号 【】""" + text = "这是【】一段文本" + result = filter_empty_brackets(text) + assert result == "这是一段文本" + + def test_remove_curly_brackets_in_text(self): + """移除文本中的空花括号 {}""" + text = "这是{}一段{ }文本" + result = filter_empty_brackets(text) + assert result == "这是一段文本" + + def test_no_brackets_returns_original(self): + """没有括号应该返回原文本""" + text = "这是一段普通文本\n没有任何括号" + assert filter_empty_brackets(text) == text + + def test_empty_string(self): + """空字符串应该返回空字符串""" + assert filter_empty_brackets("") == "" + + def test_multiple_empty_brackets(self): + """移除多个空括号""" + text = "()()[]【】" + result = filter_empty_brackets(text) + assert result == "" + + def test_mixed_empty_and_content_brackets(self): + """混合空括号和有内容的括号""" + text = "这是()(有内容的)和[]的测试" + result = filter_empty_brackets(text) + assert result == "这是(有内容的)和的测试" + + def test_multiple_lines_with_empty_brackets(self): + """多行文本中的空括号 ()""" + text = "这是()一段文本\n这是()一段文本" + result = filter_empty_brackets(text) + assert result == "这是一段文本\n这是一段文本" + + +class TestFilterHtmlTags: + """测试 HTML 标签过滤器""" + + def test_remove_templatestyles_tag(self): + 
"""移除 templatestyles 标签(实体编码格式)""" + text = '<templatestyles src="ShareCSS/infobox.css" />正文内容' + result = filter_html_tags(text) + assert result == "正文内容" + + def test_remove_multiple_tags(self): + """移除多个 HTML 标签(实体编码格式)""" + text = "<div><p>段落</p></div>" + result = filter_html_tags(text) + assert result == "段落" + + def test_no_tags_returns_original(self): + """没有标签应该返回原文本""" + text = "这是一段普通文本" + assert filter_html_tags(text) == text + + def test_empty_string(self): + """空字符串应该返回空字符串""" + assert filter_html_tags("") == "" + + def test_only_tags(self): + """只有标签应该返回空字符串""" + text = '<templatestyles src="test.css" />' + assert filter_html_tags(text) == "" + + def test_mixed_content(self): + """混合内容应该只移除标签""" + text = "开头<tag>中间</tag>结尾" + result = filter_html_tags(text) + assert result == "开头中间结尾" + + def test_multiple_lines_with_html_tags(self): + """多行文本中的 HTML 标签""" + text = "第一行<tag>\n第二行<tag>\n第三行" + result = filter_html_tags(text) + assert result == "第一行\n第二行\n第三行" + + +class TestFilterLangTags: + """测试语言转换标记过滤器""" + + def test_remove_single_lang_tags(self): + """移除单个语言转换标记""" + text = "-{H|zh-hans:重定向;zh-hant:重新导向;}-正文" + result = filter_lang_tags(text) + assert result == "正文" + + def test_remove_multiple_lang_tagss(self): + """移除多个语言转换标记""" + text = "-{H|zh-hans:重定向;zh-hant:重新导向;}--{H|zh-cn:字符;zh-tw:字元;}-正文" + result = filter_lang_tags(text) + assert result == "正文" + + def test_remove_complex_lang_tags(self): + """移除复杂的语言转换标记""" + text = ( + "-{H|zh-hans:文件; zh-hant:档案;}--{H|zh-hans:快捷方式; zh-hant:捷径;}-正文" + ) + result = filter_lang_tags(text) + assert result == "正文" + + def test_no_lang_tags_returns_original(self): + """没有语言转换标记应该返回原文本""" + text = "这是一段普通文本" + assert filter_lang_tags(text) == text + + def test_empty_string(self): + """空字符串应该返回空字符串""" + assert filter_lang_tags("") == "" + + def test_only_lang_tags(self): + """只有语言转换标记应该返回空字符串""" + text = "-{H|zh-hans:重定向;zh-hant:重新导向;}-" + assert filter_lang_tags(text) == "" + + def 
test_multiple_lines_with_lang_tags(self): + """多行文本中的语言转换标记""" + text = "第一行-{H|zh-hans:测试1;}-\n第二行-{H|zh-hans:测试2;}-\n第三行" + result = filter_lang_tags(text) + assert result == "第一行\n第二行\n第三行" + + def test_nested_lang_tags(self): + """移除嵌套的语言转换标记""" + text = "-{T|zh:-{zh|}-;zh-hans:-{zh-hans|}-;zh-hant:-{zh-hant|}-;}-正文" + result = filter_lang_tags(text) + assert result == "正文" + + def test_deeply_nested_lang_tags(self): + """移除深度嵌套的语言转换标记""" + text = "-{A|-{B|-{C|内容}-}-}-正文" + result = filter_lang_tags(text) + assert result == "正文" + + +class TestCleanIntegration: + """测试 clean 函数的集成效果""" + + def test_single_line_returns_none(self): + """单行文本应该返回 None""" + assert clean("重定向") is None + + def test_empty_after_filtering_returns_none(self): + """过滤后为空应该返回 None""" + text = "()()[]" + assert clean(text) is None + + def test_multiple_filters_applied(self): + """多个过滤器应该依次应用""" + text = """第一行 +<templatestyles src="test.css" /> +() +-{H|zh-hans:测试;zh-hant:測試;}- +第二行""" + result = clean(text) + assert result is not None + assert "<" not in result + assert "()" not in result + assert "-{" not in result + assert "第一行" in result + assert "第二行" in result + + def test_real_wiki_example(self): + """真实 wiki 文本示例""" + text = """词条标题 +<templatestyles src="ShareCSS/infobox.css" /> +这是正文内容。 +() +-{H|zh-hans:重定向;zh-hant:重新导向;}- +更多内容。""" + result = clean(text) + assert result is not None + assert "<templatestyles" not in result + assert "()" not in result + assert "-{" not in result + assert "这是正文内容" in result + assert "更多内容" in result + + def test_normal_text_unchanged(self): + """正常文本应该保持不变""" + text = """第一行 +第二行 +第三行""" + result = clean(text) + assert result == text + + def test_only_whitespace_returns_none(self): + """只有空白字符应该返回 None""" + assert clean(" \n \n ") is None + + def test_multiple_lines_clean(self): + """多行文本的完整清洗""" + text = """词条标题 +<templatestyles src="test.css" /> +这是()一段()文本 +-{H|zh-hans:测试;zh-hant:測試;}- +第二行 +<div>标签</div> +()空括号 +第三行""" + result = clean(text) + 
assert result is not None + assert "<" not in result + assert "()" not in result + assert "()" not in result + assert "-{" not in result + assert "这是一段文本" in result + assert "第二行" in result + assert "第三行" in result + + +def test_clean_demo_text(): + """读取 demo_text.txt 文件并打印清洗后的内容""" + demo_file = resolve_path("test/fixtures/clean/demo_text.txt") + + with open(demo_file, "r", encoding="utf-8") as f: + content = f.read() + + result = clean(content) + + print("\n" + "=" * 50) + print("清洗后的内容:") + print("=" * 50) + print(result) + print("=" * 50) + + assert result is not None diff --git a/test/data/wiki_dataset_test.py b/test/data/wiki_dataset_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c3bb41c6786289d7a5cdac01423f2f184a403014 --- /dev/null +++ b/test/data/wiki_dataset_test.py @@ -0,0 +1,55 @@ +"""测试 WikiDataset 的功能""" + +import tensorflow as tf + +from data import WikiDataset +from env.resolve import resolve_path + + +def _load_dataset_for_test(batch_size: int, taken_size: int): + """测试数据集处理的基本功能""" + dataset = WikiDataset( + data_dir=str(resolve_path("data/dev/mini_c4")), + tokenizer_type="character", + ) + + ds = dataset.tokens_ds( + seq_length=16, + batch_size=batch_size, + ).repeat() + + for ibatch, batch in enumerate(ds.take(taken_size)): + print(f"\nBatch {ibatch + 1}:") + # 将输入和目标编码合并 + merged = tf.concat([batch[0], batch[1][:, -1:]], axis=-1) + for val in merged: + dec = dataset.tokenizer_bundle().decode(val.numpy().tolist()) + print(" ", dec) + + +def test_load_dataset_batch_one(): + """ + 测试批大小为1时的数据集加载行为。 + + 预期行为: + • 一轮完整的数据集将生成17个有效样本。 + • 到第 18 个样本的时候会重新开始一轮数据集迭代。 + + 注意: + drop_remainder=True 会丢弃最后一个样本,因此你看到输出的最后一个样本是不完整的。 + """ + _load_dataset_for_test(batch_size=1, taken_size=18) + + +def test_load_dataset_batch_four(): + """ + 测试批大小为1时的数据集加载行为。 + + 预期行为: + • 一轮完整的数据集将生成17个有效样本(一个 5 个批次)。 + • 到第 18 个样本的时候会重新开始一轮数据集迭代(第 6 个批次)。 + + 注意: + drop_remainder=True 会丢弃最后一个样本,因此你看到输出的最后一个样本是不完整的。 + """ + 
_load_dataset_for_test(batch_size=4, taken_size=6) diff --git a/test/fixtures/clean/demo_text.txt b/test/fixtures/clean/demo_text.txt new file mode 100644 index 0000000000000000000000000000000000000000..35281d20de08934c8a8f12909ee8ef014a026dfd --- /dev/null +++ b/test/fixtures/clean/demo_text.txt @@ -0,0 +1,44 @@ +Wiki + +-{T|zh:-{zh|}-;zh-hans:-{zh-hans|}-;zh-hant:-{zh-hant|}-;zh-cn:-{zh-cn|}-;zh-hk:-{zh-hk|}-;zh-mo:-{zh-mo|}-;zh-my:-{zh-my|}-;zh-sg:-{zh-sg|}-;zh-tw:-{zh-tw|}-;}- +()是一种可通过浏览器访问并由用户协同编辑其内容的网站。沃德·坎宁安于1995年开发了最初的wiki。他将wiki定义为“一种允许一群用户用简单的描述来创建和连接一组网页的社会计算系统”。 +有些人认为,wiki系统属于一种人类知识的网路系统,让人们可以在web的基础上对wiki文本进行浏览、创建和更改,而且这种创建、更改及发布的成本远比HTML文本小。与此同时,wiki系统还支持那些面向社群的协同写作,为其提供必要的帮助。最后wiki的写作者自然构成一个社群,wiki系统为这个社群提供简单的交流工具。与其它超文本系统相比,wiki有使用简便且开放的特点,有助于在一个社群内共享某个领域的知识。 +词源. +"wiki" 取自夏威夷的Wiki Wiki公车,源自夏威夷语「wiki」,本是「快速」之意。wiki的中文翻译有维客、围纪、快纪、共笔和维基等等,其中「维基」一词是中文维基百科人特别为维基百科而创,属于维基媒体的专用术语。随著「维基」一词能见度增加,常被泛用为wiki的主要音译名。 +历史. +wiki软体由软件设计模式社群开发,用来书写与讨论模式语言。沃德·坎宁安于1995年3月25日成立第一个wiki网站:WikiWikiWeb,用来补充他自己经营的软件设计模式网站。他发明wiki这个名字以及相关概念,并且实作第一个wiki引擎。坎宁安说自己是根据檀香山的Wiki Wiki公车取名的,「wiki」在夏威夷语为「快速」之意,<templatestyles src="Template:Mark I/styles.css" /><mark class="template-facttext" title="需要提供文献来源">这是他到檀香山学会的第一个夏威夷语</mark>,故他将「wiki-wiki」作为「快速」的意思以避免将「这东西」取名为「快速网」(quick-web)。 +坎宁安说,wiki的构想来自他自己在1980年代晚期利用苹果电脑HyperCard程式作出的一个小功能。HyperCard类似名片整理程式,可用来纪录人物与相关事物。HyperCard管理许多称为「卡片」的资料,每张卡片上都可划分栏位、加上图片、有样式的文字或按钮等等,而且这些内容都可在 +查阅卡片的同时修改编辑。HyperCard类似于后来的网页,但是缺乏一些重要特征。 +坎宁安认为原来的HyperCard程式十分有用,但创造卡片与卡片之间的连结却很困难。于是他不用HyperCard程式原本的创造连结功能,而改用「随选搜寻」的方式自己增添了一个新的连结功能。使用者只要将连结输入卡片上的一个特殊栏位,而这个栏位每一行都有一个按钮。按下按钮时如果卡片已经存在,按钮就会带使用者去那张卡片,否则就发出哔声,而继续压著按钮不放,程式就会为使用者产生一张卡片。 +坎宁安向他的朋友展示了这个程式和他自己写的人事卡片,往往会有人指出卡片之中的内容不太对,他们就可当场利用HyperCard初始的功能修正内容,并利用坎宁安加入的新功能补充连结。 +坎宁安后来在别处又写了这样的功能,而且这次他还增加了多使用者写作功能。新功能之一是程式会在每一次任何一张卡片被更改时,自动在「最近更改」卡片上增加一个连往被更改卡片的连结。坎宁安自己常常看「最近更改」卡片,而且还会注意到空白的说明栏位会让他想要描述一下更改的摘要。 +特征. +奥德·坎宁安和波·路夫(Bo Leuf)在《Wiki之道——网上快捷合作》一书中描述wiki概念的几个本质特征: +编辑wiki页面. 
+wiki中用户使用很多方式来编辑。通常需要-{zh-hans:通过; zh-hant:透过;}-文本标记语言。 +应用. +wiki在一些需要内容管理系统的企业中得到了广泛应用、JotSpot和是创wiki企业应用的先河。wiki可以在高校教育环境中发挥积极的作用,但是直到2006年,wiki应用于教育的案例在全球都比较少。wiki除了被用来建立网站外,也被用作编写网志。wiki在中小学教育方面,可以作为学生 +协助学习的平台。 +实施. +wiki软件是运行wiki的群件之统称,允许使用常见的Web浏览器建立和修改网页,被作为应用程式伺服器在多个网页服务器上运作。 +导航. +在大多数页面的文本,通常有大量的超文本链接到其他网页。大多数wiki有一个反向的功能,它显示所有链接到一个给定页面的页面。 +认可与安全. +控制更改. +-{H|zh-hans:重定向;zh-hant:重新导向;}--{H|zh-cn:字符;zh-tw:字元;}--{H|zh-hans:文件; zh-hant:档案;}--{H|zh-hans:快捷方式; zh-hant:捷径;}--{H|zh-hans:项目;zh-hant:专案;zh-tw:计划;zh-hk:计划;zh-mo:计划;}--{H|zh-cn:计算机; zh-sg:电脑; zh-tw:电脑;}- +wiki的基本设计理念是,与其避免人们犯错,倒不如让人们更方便地修正错误。因此,wiki固然相当开放,但它有一个有助检验最近新增页面正确性的功能。几乎每一个wiki网站都有的最突出的功能,就是「最近修订」页面——一个列出最近修订的特殊页面,或是一个在特定时间范围内所做修改的列表。一些wiki可以对此清单作出过滤,筛去小修改或利用自动脚本所做之修改(所谓「机械人」)。 +大部分wiki网站的页面编修纪录页都拥有以下功能:可查看过去的修订版本,亦可在任何两个修订版本之间进行差异对比。编辑者可以利用修订历史浏览并且恢复此条目的前一版本;显示差异功能则能让编辑者更容易决定是否有必要做此更改。一个普通的wiki使用者可以在「最近修订」页面浏览差异、查阅历史、甚至恢复到先前的版本。这个过程基本上是很流畅的,具体细节则要看用的是哪款wiki软体。 +为了避免人们做出差劣的编辑,有些wiki引擎可以对内容编辑权限进行各种程度的限制,以确保一篇或一组条目的品质。当有人修改某个条目时,愿意维护该页面的使用者(们)会收到通知,让他/她得以马上对新编辑进行审查。 +有些wiki会提供「巡逻校订」(patrolled revisions),让有权限的编辑者在正当(非破坏)的修订上做标记。而「标记校订」(flagged revisions)则是让普通用户无法看见尚未通过评审的修订。 +搜索. +wiki提供至少一个标题搜索,有时是一个全文搜索。搜索的可扩展性取决于wiki引擎是否使用一个数据库。一些wiki(如PmWiki)使用文本文件。MediaWiki的第一个版本采用文本文件,但它在21世纪初被李丹尼尔克罗克改写成一个数据库应用程序。 +另外有时可以对wiki使用Google等外部搜索引擎的限定网域搜索功能以获得更精确或更全面的结果。 +规则. +为了保证内容的质量,大部分wiki系统或其所建立的社区都有一系列错综复杂的策略和指导方针,用以对用户的使用行为进行有一系列的规则控制。 +比如总结成以下五个方面:维基百科是一部自由的百科全书;维基百科代表的是一个中立的观点;维基百科自由编辑内容;维基百科的编辑者应该以一种尊重和文明的方式互相交流;维基百科没有一成不变的规章,但不可以自由修改规则。 +社区. +有许多的wiki社区是私密的,尤其是企业的wiki。企业的wiki有可能只允许内部员工修改。 +参阅. +<templatestyles src="Div col/styles.css"/> +外部连结. 
+<templatestyles src="Div col/styles.css"/> diff --git a/test/models/__init__.py b/test/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/test/models/checkpoint_rules_test.py b/test/models/checkpoint_rules_test.py new file mode 100644 index 0000000000000000000000000000000000000000..2e94483e52307ac56475c7f6441f59f76d05d0f4 --- /dev/null +++ b/test/models/checkpoint_rules_test.py @@ -0,0 +1,16 @@ +from pipeline.base.configs import CheckpointConfig, CheckpointRules + + +def test_resolve_testing_rule_uses_default_dirs(tmp_path): + checkpoint_rules = CheckpointRules( + testing=CheckpointConfig(epoch=5, suffix=".keras") + ) + + result = checkpoint_rules.resolve_testing_rule(default_dirs=[tmp_path]) + + assert result == { + "dirs": [tmp_path], + "path": None, + "epoch": 5, + "suffix": ".keras" + } diff --git a/test/models/checkpoint_test.py b/test/models/checkpoint_test.py new file mode 100644 index 0000000000000000000000000000000000000000..d080102d90da9ada7b5816e7ec519fdd33245fff --- /dev/null +++ b/test/models/checkpoint_test.py @@ -0,0 +1,298 @@ +"""检查点解析功能单元测试 + +测试 resolve_checkpoint 函数的各种场景。 + +测试场景: +resolve_checkpoint 测试场景: +├── path 提供 +│ ├── 绝对路径 +│ │ ├── 存在 +│ │ │ ├── .keras文件 → 成功返回 +│ │ │ └── .weights.h5文件 → 成功返回 +│ │ ├── 存在(同时提供dirs)→ 使用绝对路径,忽略dirs,打印警告 +│ │ ├── suffix不匹配 → FileNotFoundError +│ │ └── 不存在 → FileNotFoundError +│ └── 相对路径 +│ ├── dirs提供 +│ │ ├── 单目录 → 成功解析 +│ │ └── 多目录按顺序查找 → 成功解析 +│ └── dirs=None → ValueError +└── path 未提供 + └── dirs 提供 + ├── epoch=None + │ ├── 单目录 + │ │ ├── 存在 .weights.h5 / .keras 文件 → 返回最新的 + │ │ ├── 存在但为空 → 返回 (None, 0) + │ │ └── 目录不存在 → 返回 (None, 0) + │ └── 多目录 → 返回全局最新的检查点 + └── epoch指定 + ├── 未指定suffix + │ ├── 存在对应epoch → 返回对应epoch的检查点 + │ └── epoch不存在 → FileNotFoundError + └── 指定suffix + ├── 存在对应后缀 → 返回对应检查点 + └── 无对应后缀 → FileNotFoundError + └── 两者都为None → ValueError + +extract_number_of_filename 测试场景: +├── 正常提取 +│ ├── 从包含 epoch 
的文件名中提取数字 → 返回数字 +│ ├── 从多个数字的文件名中提取最后一个数字 → 返回最后一个数字 +│ └── 从 .keras 文件名中提取数字 → 返回数字 +└── 异常情况 + ├── 没有数字的文件名 → 抛出 ValueError + └── .weights.h5 文件名中没有数字 → 抛出 ValueError +""" + +import pathlib +import tempfile + +import pytest + +from pipeline.base.configs import CheckpointConfig +from pipeline.base.checkpoint import ( + extract_number_of_filename, + resolve_checkpoint +) + + +class TestCheckpointConfig: + def test_default_values(self): + checkpoint = CheckpointConfig() + + assert checkpoint.dirs is None + assert checkpoint.path is None + assert checkpoint.epoch is None + assert checkpoint.suffix is None + + def test_custom_values(self): + checkpoint = CheckpointConfig( + dirs=[pathlib.Path("dir_a"), pathlib.Path("dir_b")], + path=pathlib.Path("model_epoch_005.weights.h5"), + epoch=5, + suffix=".weights.h5" + ) + + assert checkpoint.dirs == [pathlib.Path("dir_a"), pathlib.Path("dir_b")] + assert checkpoint.path == pathlib.Path("model_epoch_005.weights.h5") + assert checkpoint.epoch == 5 + assert checkpoint.suffix == ".weights.h5" + + +class TestExtractNumberOfFilename: + """测试 extract_number_of_filename 函数""" + + def test_extract_from_epoch_filename(self): + """从包含 epoch 的文件名中提取数字""" + assert extract_number_of_filename("model_epoch_001") == 1 + assert extract_number_of_filename("model_epoch_010") == 10 + assert extract_number_of_filename("model_epoch_100") == 100 + + def test_extract_last_number(self): + """提取最后一个数字""" + assert extract_number_of_filename("checkpoint_2024_06_30_epoch_002") == 2 + assert extract_number_of_filename("model_v1_epoch_005") == 5 + + def test_extract_from_keras_file(self): + """从 .keras 文件名中提取数字""" + assert extract_number_of_filename("epoch_005_model") == 5 + assert extract_number_of_filename("model_epoch_003.keras") == 3 + + def test_no_number_raises_error(self): + """没有数字时抛出 ValueError""" + with pytest.raises(ValueError, match="No number found"): + extract_number_of_filename("model_final") + + def 
test_no_number_in_weights_file_raises_error(self): + """.weights.h5 文件名中没有数字时抛出 ValueError""" + with pytest.raises(ValueError, match="No number found"): + extract_number_of_filename("model_final.weights") + + +class TestResolveCheckpoint: + """测试 resolve_checkpoint 函数""" + + @pytest.fixture + def temp_dir(self): + """创建临时目录""" + with tempfile.TemporaryDirectory() as tmp: + yield pathlib.Path(tmp) + + def test_absolute_path_exists_returns_path_and_epoch(self, temp_dir): + """path=绝对路径且存在 → 成功返回""" + checkpoint_file = temp_dir / "model_epoch_005.keras" + checkpoint_file.write_text("dummy") + + path, epoch = resolve_checkpoint(path=checkpoint_file) + + assert path == checkpoint_file + assert epoch == 5 + + def test_absolute_path_with_dirs_ignores_dir_and_warns(self, temp_dir): + """path=绝对路径且存在(同时提供dirs)→ 使用绝对路径,忽略dirs,打印警告""" + checkpoint_file = temp_dir / "model_epoch_005.keras" + checkpoint_file.write_text("dummy") + other_dir = temp_dir / "other_dir" + other_dir.mkdir() + + with pytest.warns(UserWarning, match="dirs 参数将被忽略"): + path, epoch = resolve_checkpoint( + path=checkpoint_file, + dirs=[other_dir] + ) + + assert path == checkpoint_file + assert epoch == 5 + + def test_absolute_path_not_exists_raises_error(self, temp_dir): + """path=绝对路径但不存在 → FileNotFoundError""" + checkpoint_file = temp_dir / "model_epoch_005.keras" + + with pytest.raises(FileNotFoundError, match="检查点文件不存在"): + resolve_checkpoint(path=checkpoint_file) + + def test_relative_path_with_dirs_returns_path(self, temp_dir): + """path=相对路径+dirs → 成功解析""" + checkpoint_file = temp_dir / "model_epoch_010.weights.h5" + checkpoint_file.write_text("dummy") + + path, epoch = resolve_checkpoint( + dirs=[temp_dir], + path="model_epoch_010.weights.h5" + ) + + assert path == checkpoint_file + assert epoch == 10 + + def test_relative_path_without_dirs_raises_error(self): + """path=相对路径+dirs=None → ValueError""" + with pytest.raises(ValueError, match="path 是相对路径时,必须提供 dirs"): + 
resolve_checkpoint(path="model.keras") + + def test_resolve_latest_weights_h5(self, temp_dir): + """path=None+dirs存在+epoch=None → 返回最新的检查点""" + (temp_dir / "model_epoch_001.weights.h5").write_text("dummy") + (temp_dir / "model_epoch_005.weights.h5").write_text("dummy") + (temp_dir / "model_epoch_003.weights.h5").write_text("dummy") + (temp_dir / "model_epoch_004.keras").write_text("dummy") + + path, epoch = resolve_checkpoint(dirs=[temp_dir]) + + assert path.name == "model_epoch_005.weights.h5" + assert epoch == 5 + + def test_resolve_specific_epoch(self, temp_dir): + """path=None+dirs存在+epoch指定 → 返回对应epoch的检查点""" + (temp_dir / "model_epoch_001.weights.h5").write_text("dummy") + (temp_dir / "model_epoch_005.weights.h5").write_text("dummy") + (temp_dir / "model_epoch_010.weights.h5").write_text("dummy") + + path, epoch = resolve_checkpoint(dirs=[temp_dir], epoch=5) + + assert path.name == "model_epoch_005.weights.h5" + assert epoch == 5 + + def test_resolve_nonexistent_epoch_raises_error(self, temp_dir): + """请求不存在的 epoch → FileNotFoundError""" + (temp_dir / "model_epoch_001.weights.h5").write_text("dummy") + + with pytest.raises(FileNotFoundError, match="未找到 epoch 5"): + resolve_checkpoint(dirs=[temp_dir], epoch=5) + + def test_empty_dirs_returns_none(self, temp_dir): + """path=None+dirs存在但为空 → 返回 (None, 0)""" + path, epoch = resolve_checkpoint(dirs=[temp_dir]) + assert path is None + assert epoch == 0 + + def test_nonexistent_dirs_returns_none(self): + """path=None+dirs不存在 → 返回 (None, 0)""" + path, epoch = resolve_checkpoint(dirs=["/nonexistent/path"]) + assert path is None + assert epoch == 0 + + def test_both_none_raises_error(self): + """两者都为None → ValueError""" + with pytest.raises(ValueError, match="必须提供 dirs 或 path"): + resolve_checkpoint() + + def test_resolve_keras_file(self, temp_dir): + """支持 .keras 文件格式""" + checkpoint_file = temp_dir / "epoch_007_model.keras" + checkpoint_file.write_text("dummy") + + path, epoch = 
resolve_checkpoint(path=checkpoint_file) + + assert path == checkpoint_file + assert epoch == 7 + + def test_resolve_weights_h5_file(self, temp_dir): + """支持 .weights.h5 文件格式""" + checkpoint_file = temp_dir / "model_epoch_012.weights.h5" + checkpoint_file.write_text("dummy") + + path, epoch = resolve_checkpoint(path=checkpoint_file) + + assert path == checkpoint_file + assert epoch == 12 + + def test_relative_path_uses_checkpoint_dirs_in_order(self, temp_dir): + first_dir = temp_dir / "first" + second_dir = temp_dir / "second" + first_dir.mkdir() + second_dir.mkdir() + checkpoint_file = second_dir / "model_epoch_012.weights.h5" + checkpoint_file.write_text("dummy") + + path, epoch = resolve_checkpoint( + dirs=[first_dir, second_dir], + path="model_epoch_012.weights.h5" + ) + + assert path == checkpoint_file + assert epoch == 12 + + def test_resolve_latest_from_checkpoint_dirs(self, temp_dir): + first_dir = temp_dir / "first" + second_dir = temp_dir / "second" + first_dir.mkdir() + second_dir.mkdir() + (first_dir / "model_epoch_003.weights.h5").write_text("dummy") + (second_dir / "model_epoch_008.weights.h5").write_text("dummy") + + path, epoch = resolve_checkpoint(dirs=[first_dir, second_dir]) + + assert path == second_dir / "model_epoch_008.weights.h5" + assert epoch == 8 + + def test_resolve_with_suffix(self, temp_dir): + (temp_dir / "model_epoch_003.weights.h5").write_text("dummy") + (temp_dir / "model_epoch_005.keras").write_text("dummy") + + path, epoch = resolve_checkpoint( + dirs=[temp_dir], + suffix=".keras" + ) + + assert path == temp_dir / "model_epoch_005.keras" + assert epoch == 5 + + def test_resolve_with_missing_suffix_raises_error(self, temp_dir): + (temp_dir / "model_epoch_003.weights.h5").write_text("dummy") + + with pytest.raises(FileNotFoundError, match="未找到 epoch 3"): + resolve_checkpoint( + dirs=[temp_dir], + epoch=3, + suffix=".keras" + ) + + def test_absolute_path_with_suffix_mismatch_raises_error(self, temp_dir): + checkpoint_file = temp_dir 
/ "model_epoch_005.keras" + checkpoint_file.write_text("dummy") + + with pytest.raises(FileNotFoundError, match="检查点文件后缀不匹配"): + resolve_checkpoint( + path=checkpoint_file, + suffix=".weights.h5" + ) diff --git a/test/models/generation_test.py b/test/models/generation_test.py new file mode 100644 index 0000000000000000000000000000000000000000..9411923343bc51e58ac9ba9779bf76b9905922bd --- /dev/null +++ b/test/models/generation_test.py @@ -0,0 +1,37 @@ +from unittest.mock import Mock + +import numpy as np + +from pipeline.base.generation import generate_with_stateful_model, generate_with_training_model +from pipeline.base.model_builder import GenerationContext + + +def test_generate_with_training_model(): + model = Mock() + model.predict = Mock(return_value=np.zeros((1, 10, 100))) + + sample_results = [50, 60, 99] + sample_fn = Mock(side_effect=[np.array([t]) for t in sample_results]) + context = GenerationContext(end_of_text=99, max_length=10, sample_fn=sample_fn) + + result = generate_with_training_model(model, context, [10, 20]) + + assert result.token_ids == [10, 20, 50, 60] + assert result.stop_reason == "<|endoftext|>" + + +def test_generate_with_stateful_model(): + model = Mock() + model.predict = Mock( + return_value=[np.zeros((1, 1, 100)), np.zeros((1, 16)), np.zeros((1, 16))] + ) + + sample_results = [50, 60, 99] + sample_fn = Mock(side_effect=[np.array([t]) for t in sample_results]) + context = GenerationContext(end_of_text=99, max_length=10, sample_fn=sample_fn) + initial_states = [np.zeros((1, 16)), np.zeros((1, 16))] + + result = generate_with_stateful_model(model, context, [10, 20], initial_states) + + assert result.token_ids == [10, 20, 50, 60] + assert result.stop_reason == "<|endoftext|>" diff --git a/test/models/model_builder_test.py b/test/models/model_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..9e1b4f133d0f8f91f25d1a3317dfdf6bc481cdee --- /dev/null +++ b/test/models/model_builder_test.py @@ -0,0 +1,157 @@ 
+import pytest +import tensorflow as tf +import numpy as np + +from models.mini_gpt import GptModelBuilder +from models.rnn import RNNModelBuilder +from pipeline.base.model_builder import GenerationContext + + +def _sample_one(logits): + return tf.constant([1], dtype="int32") + + +@pytest.mark.parametrize( + "builder", + [ + GptModelBuilder( + hidden_dim=8, + intermediate_dim=16, + num_heads=2, + num_layers=1 + ), + RNNModelBuilder( + num_layers=1, + embedding_dim=8, + hidden_dim=16 + ) + ] +) +def test_builder_training_and_inference_generate_match(builder): + training_artifact = builder.build_training_artifact( + vocab_size=32, + sequence_length=16 + ) + inference_artifact = builder.build_inference_artifact( + training_artifact=training_artifact + ) + context = GenerationContext( + end_of_text=99, + max_length=6, + sample_fn=_sample_one + ) + + training_result = training_artifact.generate(context, [2, 3, 4]) + inference_result = inference_artifact.generate(context, [2, 3, 4]) + + assert training_result.token_ids == [2, 3, 4, 1, 1, 1] + assert inference_result.token_ids == training_result.token_ids + assert inference_result.stop_reason == training_result.stop_reason + + +def test_gpt_inference_artifact_reuses_training_artifact(): + builder = GptModelBuilder( + hidden_dim=8, + intermediate_dim=16, + num_heads=2, + num_layers=1 + ) + training_artifact = builder.build_training_artifact( + vocab_size=32, + sequence_length=16 + ) + + inference_artifact = builder.build_inference_artifact( + training_artifact=training_artifact + ) + + assert inference_artifact is training_artifact + assert inference_artifact.model is training_artifact.model + + +def test_rnn_inference_artifact_uses_distinct_model(): + builder = RNNModelBuilder( + num_layers=1, + embedding_dim=8, + hidden_dim=16 + ) + training_artifact = builder.build_training_artifact( + vocab_size=32, + sequence_length=16 + ) + + inference_artifact = builder.build_inference_artifact( + training_artifact=training_artifact 
+ ) + + assert inference_artifact is not training_artifact + assert inference_artifact.model is not training_artifact.model + + +def test_rnn_inference_model_outputs_logits_and_states(): + builder = RNNModelBuilder( + num_layers=2, + embedding_dim=8, + hidden_dim=16 + ) + training_artifact = builder.build_training_artifact( + vocab_size=32, + sequence_length=16 + ) + inference_artifact = builder.build_inference_artifact( + training_artifact=training_artifact + ) + token_input = tf.constant([[2, 3, 4]], dtype="int32") + state_inputs = [] + for _ in range(builder.num_layers): + state_inputs.append(tf.zeros((1, builder.hidden_dim))) + state_inputs.append(tf.zeros((1, builder.hidden_dim))) + + outputs = inference_artifact.model([token_input] + state_inputs, training=False) + + assert len(outputs) == 1 + builder.num_layers * 2 + assert outputs[0].shape == (1, 32) + for state in outputs[1:]: + assert state.shape == (1, builder.hidden_dim) + + +def test_rnn_inference_model_copies_training_weights(): + builder = RNNModelBuilder( + num_layers=2, + embedding_dim=8, + hidden_dim=16 + ) + training_artifact = builder.build_training_artifact( + vocab_size=32, + sequence_length=16 + ) + + inference_artifact = builder.build_inference_artifact( + training_artifact=training_artifact + ) + + training_model = training_artifact.model + inference_model = inference_artifact.model + + np.testing.assert_allclose( + training_model.get_layer("embedding").get_weights()[0], + inference_model.get_layer("embedding").get_weights()[0] + ) + np.testing.assert_allclose( + training_model.get_layer("logits").get_weights()[0], + inference_model.get_layer("logits").get_weights()[0] + ) + np.testing.assert_allclose( + training_model.get_layer("logits").get_weights()[1], + inference_model.get_layer("logits").get_weights()[1] + ) + + for i in range(builder.num_layers): + training_lstm = training_model.get_layer(f"lstm_{i}") + inference_lstm = inference_model.get_layer(f"lstm_{i}") + + for training_weights, 
@dataclass
class DummyDataset(DataBundle):
    """Minimal in-memory DataBundle stub used by the model-loader tests."""

    def doc_ds(self) -> tf.data.Dataset:
        """Return a dataset holding the single document ``"abc"``."""
        documents = ["abc"]
        return tf.data.Dataset.from_tensor_slices(documents)

    def tokens_ds(self, seq_length: int, batch_size: int) -> tf.data.Dataset:
        """Return one fixed (inputs, targets) pair, batched by ``batch_size``.

        ``seq_length`` is part of the interface but is not used by this stub.
        """
        token_pair = (
            tf.constant([[1, 2, 3]], dtype="int32"),
            tf.constant([[2, 3, 4]], dtype="int32"),
        )
        return tf.data.Dataset.from_tensor_slices(token_pair).batch(batch_size)

    def tokenizer_bundle(self) -> TokenizerBundle:
        """Return a tokenizer bundle with constant encoding and a 32-token vocab."""
        return TokenizerBundle(
            # Every text encodes to the same three ids.
            tokenizer=lambda text: tf.constant([1, 2, 3], dtype="int32"),
            # Decoding simply concatenates the ids' string forms.
            decode=lambda ids: "".join(map(str, ids)),
            end_of_text=99,
            vocab_size=32,
        )
def _save_training_checkpoint(model_builder, checkpoint_path: pathlib.Path):
    """Build a training model and write it to ``checkpoint_path``.

    Args:
        model_builder: Object exposing ``build_training_artifact``; it is
            called with ``vocab_size=32`` and ``sequence_length=16``.
        checkpoint_path: Destination file. A ``.keras`` suffix (any case)
            stores the whole model via ``model.save``; any other suffix
            stores weights only via ``model.save_weights``.

    Returns:
        The training artifact whose model was written.
    """
    artifact = model_builder.build_training_artifact(vocab_size=32, sequence_length=16)
    # The checkpoint directory may not exist yet under the temporary task dir.
    checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

    model = artifact.model
    stores_full_model = checkpoint_path.suffix.lower() == ".keras"
    write = model.save if stores_full_model else model.save_weights
    write(str(checkpoint_path))
    return artifact
def test_load_inference_artifact_from_pipeline_returns_gpt_model(tmp_path):
    """Load a GPT inference model from a ``.keras`` checkpoint and run it once."""
    checkpoint_name = "model_epoch_003.keras"
    builder = GptModelBuilder(hidden_dim=8, intermediate_dim=16, num_heads=2, num_layers=1)
    pipeline = _create_pipeline(tmp_path / "task", builder, pathlib.Path(checkpoint_name))
    # Write the checkpoint the testing rule below is expected to resolve.
    _save_training_checkpoint(builder, pipeline.checkpoint_dir / checkpoint_name)

    rule = pipeline.checkpoint_rules.resolve_testing_rule(
        default_dirs=[pipeline.checkpoint_dir],
    )
    inference_artifact, _ = load_inference_artifact_from_pipeline(pipeline, rule)

    token_ids = tf.constant([[2, 3, 4]], dtype="int32")
    outputs = inference_artifact.model(token_ids, training=False)

    # One batch row, three positions, 32 vocabulary logits per position.
    assert outputs.shape == (1, 3, 32)
class TestWarmupScheduleCheckpointLimitation:
    """Check what each checkpoint format keeps of the optimizer's progress.

    The tests assert that a ``.weights.h5`` checkpoint leaves a freshly built
    optimizer at step 0, while a full ``.keras`` save keeps counting from the
    saved step.
    """

    @staticmethod
    def _step_count(optimizer):
        """Return the optimizer's iteration counter as a plain int."""
        return int(optimizer.iterations.numpy())

    @staticmethod
    def _rate_at(schedule, step):
        """Evaluate ``schedule`` at ``step`` and return a Python float."""
        return float(schedule(ops.convert_to_tensor(step)))

    def _create_model(self):
        """Return a compiled and built ``(model, optimizer, schedule)`` triple."""
        model = SimpleModel()
        schedule = WarmupSchedule()
        optimizer = keras.optimizers.Adam(learning_rate=schedule)
        model.compile(optimizer=optimizer, loss="mse")
        # A single forward pass creates the layer variables.
        model(np.zeros((1, 5)))
        return model, optimizer, schedule

    def _train_steps(self, model, steps):
        """Run ``steps`` single-batch updates on random data."""
        for _ in range(steps):
            features = np.random.randn(2, 5).astype(np.float32)
            targets = np.random.randn(2, 10).astype(np.float32)
            model.train_on_batch(features, targets)

    def test_weights_h5_does_not_save_optimizer_state(self):
        """A ``.weights.h5`` round trip restarts the optimizer step at 0.

        After loading the weights into a new model:
        1. the new optimizer's step counter is 0;
        2. the schedule evaluated at step 0 is 0, i.e. warmup starts over.
        """
        trained_steps = 500
        model, optimizer, schedule = self._create_model()
        self._train_steps(model, trained_steps)

        # State right after the first training run.
        assert self._step_count(optimizer) == trained_steps
        assert np.isclose(self._rate_at(schedule, trained_steps), 1e-4, rtol=0.01)

        with tempfile.TemporaryDirectory() as tmpdir:
            weights_file = str(Path(tmpdir) / "model.weights.h5")
            model.save_weights(weights_file)

            fresh_model, fresh_optimizer, fresh_schedule = self._create_model()
            fresh_model.load_weights(weights_file)

            # Loading weights did not bring the step counter along.
            assert self._step_count(fresh_optimizer) == 0
            assert np.isclose(self._rate_at(fresh_schedule, 0), 0.0, atol=1e-7)

            self._train_steps(fresh_model, trained_steps)

            # Progress accumulates again from zero.
            assert self._step_count(fresh_optimizer) == trained_steps
            assert np.isclose(
                self._rate_at(fresh_schedule, trained_steps), 1e-4, rtol=0.01
            )

    def test_keras_format_continue_training(self):
        """A ``.keras`` round trip keeps the step count, so warmup continues.

        Scenario:
        1. train 500 steps;
        2. save the full model and load it back;
        3. train 500 more steps;
        4. expect step 1000 and a learning rate of 2e-4.
        """
        steps_per_run = 500
        model, optimizer, _ = self._create_model()
        self._train_steps(model, steps_per_run)

        assert self._step_count(optimizer) == steps_per_run

        with tempfile.TemporaryDirectory() as tmpdir:
            model_file = str(Path(tmpdir) / "model.keras")
            model.save(model_file)

            restored = keras.models.load_model(
                model_file, custom_objects={"WarmupSchedule": WarmupSchedule}
            )

            # Second run on the restored model: 1000 steps in total.
            self._train_steps(restored, steps_per_run)

            restored_optimizer = restored.optimizer
            assert self._step_count(restored_optimizer) == 1000
            assert np.isclose(
                float(restored_optimizer.learning_rate), 2e-4, rtol=0.01
            )
def test_generation_runner_runs_generation_flow(tmp_path, capsys):
    """Run ``run_fixed`` end to end with a mocked loader and mocked generation."""
    # Minimal pipeline with default checkpoint rules and a mock model builder.
    pipeline = create_pipeline(tmp_path / "task", Mock(), CheckpointRules())
    pipeline.log_config = log_config = Mock()

    # Controllable inference artifact so no real model is loaded or executed.
    generate = Mock(return_value=GenerationResult([7, 8], "<|stop|>"))
    artifact = ModelArtifact(model=Mock(), generate=generate)
    tokenizer_info = pipeline.dataset.tokenizer_bundle()
    loader = Mock(return_value=(artifact, tokenizer_info))
    DummyGenerationRunner.loader = loader

    # Drive the fixed-prompt generation flow.
    DummyGenerationRunner(lambda: pipeline).run_fixed()

    # The config is logged exactly once, without arguments.
    log_config.assert_called_once_with()

    # The generator was assembled from the resolved testing checkpoint rule.
    loader.assert_called_once()
    received_pipeline, received_rule = loader.call_args.args
    assert received_pipeline is pipeline
    expected_rule = {
        "dirs": [pipeline.checkpoint_dir],
        "path": None,
        "epoch": None,
        "suffix": None,
    }
    assert received_rule == expected_rule
    assert artifact.generate.call_count == 2

    # Both fixed prompts and the decoded generation reached stdout.
    printed = capsys.readouterr().out
    for expected_text in ("白日依山尽", "床前明月光", "78<|stop|>"):
        assert expected_text in printed
def save_training_checkpoint(model_builder, checkpoint_path: pathlib.Path):
    """Build a fresh training model and persist it at ``checkpoint_path``.

    The checkpoint is the input that ``save_inference_model`` later reads to
    export an inference model. This helper mirrors
    ``_save_training_checkpoint`` in ``test/models/model_loader_test.py`` so
    both test suites write checkpoints the same way.

    Args:
        model_builder: Object exposing ``build_training_artifact``; it is
            called with ``vocab_size=32`` and ``sequence_length=16``.
        checkpoint_path: Destination file. A ``.keras`` suffix (any case)
            stores the whole model via ``model.save``; any other suffix
            stores weights only via ``model.save_weights``.

    Returns:
        The training artifact whose model was written, so callers can compare
        reloaded weights against it. Callers may ignore it.
    """
    training_artifact = model_builder.build_training_artifact(
        vocab_size=32,
        sequence_length=16
    )
    # The checkpoint directory may not exist yet under a temporary task dir.
    checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
    if checkpoint_path.suffix.lower() == ".keras":
        # Full-model format: weights-only saving does not apply to this suffix.
        training_artifact.model.save(str(checkpoint_path))
    else:
        training_artifact.model.save_weights(str(checkpoint_path))
    return training_artifact
def _assert_fit_kwargs(fit_kwargs):
    """Assert that ``execute`` passed the expected keyword arguments to ``fit``.

    Checks the epoch bookkeeping, the number of callbacks, and the content of
    the first validation batch.

    Args:
        fit_kwargs: Keyword arguments captured from the ``model.fit`` call.
            ``validation_data`` must expose ``as_numpy_iterator()``.

    Raises:
        AssertionError: If any value differs from the expected training setup.
    """
    expected_scalars = (
        ("initial_epoch", 0),
        ("epochs", 1),
        ("steps_per_epoch", 1),
    )
    for key, expected in expected_scalars:
        assert fit_kwargs[key] == expected
    assert len(fit_kwargs["callbacks"]) == 4

    # Only the first validation batch is inspected: (inputs, targets).
    batches = fit_kwargs["validation_data"].as_numpy_iterator()
    first_batch = next(batches)
    assert first_batch[0].tolist() == [[1, 2, 3]]
    assert first_batch[1].tolist() == [[2, 3, 4]]
class TestFormatDataclass:
    """Tests for how ``format_config_value`` renders dataclass instances."""

    def test_simple_dataclass(self):
        """A flat dataclass renders as ``Name(...)`` with ``field=value`` entries."""

        @dataclass
        class Simple:
            x: int = 1
            y: str = "hello"

        rendered = format_config_value(Simple(x=10, y="world"))

        assert rendered.startswith("Simple(")
        assert "x=10" in rendered
        assert "y=world" in rendered
        assert rendered.endswith(")")

    def test_nested_dataclass(self):
        """A nested dataclass is rendered one indent level deeper than its parent."""

        @dataclass
        class Inner:
            value: int = 0

        @dataclass
        class Outer:
            name: str = "outer"
            inner: Inner = None

        rendered = format_config_value(Outer(name="test", inner=Inner(value=42)))

        # Exact layout: one field per line, closing parens at the owner's indent.
        expected_lines = [
            "Outer(",
            f"{INDENT}name=test",
            f"{INDENT}inner=Inner(",
            f"{INDENT}{INDENT}value=42",
            f"{INDENT})",
            ")",
        ]
        assert rendered == "\n".join(expected_lines)

    def test_empty_dataclass(self):
        """A dataclass without fields renders as ``Name()``."""

        @dataclass
        class Empty:
            pass

        assert format_config_value(Empty()) == "Empty()"

    def test_dataclass_with_callable(self):
        """A callable field is rendered by the function's name."""

        @dataclass
        class WithCallable:
            name: str = "test"
            processor: callable = None

        def my_processor():
            pass

        rendered = format_config_value(
            WithCallable(name="doc", processor=my_processor)
        )

        assert "WithCallable(" in rendered
        assert "name=doc" in rendered
        assert "processor=my_processor" in rendered

    def test_dataclass_with_indent(self):
        """``indent=1`` shifts the header by one level and fields by two."""

        @dataclass
        class Simple:
            x: int = 1

        rendered = format_config_value(Simple(x=5), indent=1)

        # The header line carries one level of indentation.
        assert f"{INDENT}Simple(" in rendered
        matching = [line for line in rendered.split("\n") if "x=5" in line]
        field_line = matching[0]
        # Fields sit one level deeper than the header: two levels in total.
        assert field_line.startswith(INDENT * 2)