Spaces:
Sleeping
Sleeping
陳聖勳 committed on
Commit ·
03189b2
1
Parent(s): 72045da
initial commit
Browse files- .gitattributes +2 -0
- .gitignore +74 -0
- app.ipynb +757 -0
- dataset/MOE_word2explanations_8.json +3 -0
- dataset/TMLD_word2explanations_3.json +3 -0
- dataset/cloze_exampleB.txt +3 -0
- dataset/cloze_exampleC.txt +3 -0
- dataset/grammars.jsonl +3 -0
- dataset/sentencepattern.json +3 -0
- dataset/subtopic_map20251002v2.json +3 -0
- dataset/word2TBCL.json +3 -0
- fileio/__init__.py +0 -0
- fileio/jsonio.py +154 -0
- requirements.txt +29 -0
- util/cloze_constants.py +253 -0
- util/fileio/__init__.py +0 -0
- util/fileio/jsonio.py +154 -0
- util/judgement.py +469 -0
- util/reading_constants.py +450 -0
- util/sentence_dealer/.gitignore +165 -0
- util/sentence_dealer/README.md +28 -0
- util/sentence_dealer/__init__.py +7 -0
- util/sentence_dealer/requirements.txt +74 -0
- util/sentence_dealer/sample.py +6 -0
- util/sentence_dealer/sentence_dealer.py +400 -0
- util/sentence_dealer/setup.py +30 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
dataset/** filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
CKIP/** filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
*.egg-info/
|
| 20 |
+
.installed.cfg
|
| 21 |
+
*.egg
|
| 22 |
+
|
| 23 |
+
# Virtual Environment
|
| 24 |
+
venv/
|
| 25 |
+
env/
|
| 26 |
+
ENV/
|
| 27 |
+
env.bak/
|
| 28 |
+
venv.bak/
|
| 29 |
+
dealer/
|
| 30 |
+
|
| 31 |
+
# Jupyter Notebook
|
| 32 |
+
.ipynb_checkpoints
|
| 33 |
+
*/.ipynb_checkpoints/*
|
| 34 |
+
*.ipynb_checkpoints
|
| 35 |
+
|
| 36 |
+
# CKIP Files (auto download from Hugging Face)
|
| 37 |
+
CKIP/
|
| 38 |
+
|
| 39 |
+
# Data folder
|
| 40 |
+
data/
|
| 41 |
+
|
| 42 |
+
# IDE
|
| 43 |
+
.vscode/
|
| 44 |
+
.idea/
|
| 45 |
+
*.swp
|
| 46 |
+
*.swo
|
| 47 |
+
*~
|
| 48 |
+
|
| 49 |
+
# macOS
|
| 50 |
+
.DS_Store
|
| 51 |
+
|
| 52 |
+
# Windows
|
| 53 |
+
Thumbs.db
|
| 54 |
+
ehthumbs.db
|
| 55 |
+
Desktop.ini
|
| 56 |
+
|
| 57 |
+
# Gradio
|
| 58 |
+
.gradio/
|
| 59 |
+
*.db
|
| 60 |
+
flagged/
|
| 61 |
+
gradio_cached_examples/
|
| 62 |
+
|
| 63 |
+
# Logs
|
| 64 |
+
*.log
|
| 65 |
+
|
| 66 |
+
# Environment variables
|
| 67 |
+
.env
|
| 68 |
+
.env.local
|
| 69 |
+
|
| 70 |
+
# Temporary files
|
| 71 |
+
*.tmp
|
| 72 |
+
*.temp
|
| 73 |
+
*.bak
|
| 74 |
+
|
app.ipynb
ADDED
|
@@ -0,0 +1,757 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "341f0828",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"name": "stderr",
|
| 11 |
+
"output_type": "stream",
|
| 12 |
+
"text": [
|
| 13 |
+
"D:\\anaconda3\\envs\\study_prompt\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 14 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
| 15 |
+
]
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"name": "stdout",
|
| 19 |
+
"output_type": "stream",
|
| 20 |
+
"text": [
|
| 21 |
+
"WARNING:tensorflow:From D:\\anaconda3\\envs\\study_prompt\\lib\\site-packages\\keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
|
| 22 |
+
"\n",
|
| 23 |
+
"WARNING:tensorflow:From D:\\anaconda3\\envs\\study_prompt\\lib\\site-packages\\ckiptagger\\api.py:8: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.\n",
|
| 24 |
+
"\n"
|
| 25 |
+
]
|
| 26 |
+
}
|
| 27 |
+
],
|
| 28 |
+
"source": [
|
| 29 |
+
"import re\n",
|
| 30 |
+
"import gradio as gr\n",
|
| 31 |
+
"import json, random\n",
|
| 32 |
+
"from util.reading_constants import READING_LEVEL_CONFIG, FIX_PREFERENCE_PROMPT, FIX_HIGH_LEVEL_PROMPT, FIX_LOW_LEVEL_PROMPT\n",
|
| 33 |
+
"from util.cloze_constants import CLOZE_LEVEL_CONFIG, CLOZE_FIX_PREFERENCE_PROMPT, CLOZE_FIX_HIGH_LEVEL_PROMPT, CLOZE_FIX_LOW_LEVEL_PROMPT\n",
|
| 34 |
+
"from util.sentence_dealer import Sentence_Dealer\n",
|
| 35 |
+
"from util.judgement import READING_JUDGE_PROMPTS, CLOZE_JUDGE_PROMPTS, QUALIFICATION_SCHEMA\n",
|
| 36 |
+
"\n",
|
| 37 |
+
"def load_json(data_path:str):\n",
|
| 38 |
+
" with open(data_path, \"r\", encoding=\"utf-8\") as f:\n",
|
| 39 |
+
" result = json.load(f)\n",
|
| 40 |
+
" return result"
|
| 41 |
+
]
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"cell_type": "code",
|
| 45 |
+
"execution_count": null,
|
| 46 |
+
"id": "bb187536",
|
| 47 |
+
"metadata": {},
|
| 48 |
+
"outputs": [
|
| 49 |
+
{
|
| 50 |
+
"name": "stderr",
|
| 51 |
+
"output_type": "stream",
|
| 52 |
+
"text": [
|
| 53 |
+
"D:\\anaconda3\\envs\\study_prompt\\lib\\site-packages\\ckiptagger\\model_ws.py:106: UserWarning: `tf.nn.rnn_cell.LSTMCell` is deprecated and will be removed in a future version. This class is equivalent as `tf.keras.layers.LSTMCell`, and will be replaced by that in Tensorflow 2.0.\n",
|
| 54 |
+
" cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)\n",
|
| 55 |
+
"D:\\anaconda3\\envs\\study_prompt\\lib\\site-packages\\ckiptagger\\model_pos.py:56: UserWarning: `tf.nn.rnn_cell.LSTMCell` is deprecated and will be removed in a future version. This class is equivalent as `tf.keras.layers.LSTMCell`, and will be replaced by that in Tensorflow 2.0.\n",
|
| 56 |
+
" cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)\n",
|
| 57 |
+
"D:\\anaconda3\\envs\\study_prompt\\lib\\site-packages\\ckiptagger\\model_ner.py:57: UserWarning: `tf.nn.rnn_cell.LSTMCell` is deprecated and will be removed in a future version. This class is equivalent as `tf.keras.layers.LSTMCell`, and will be replaced by that in Tensorflow 2.0.\n",
|
| 58 |
+
" cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)\n"
|
| 59 |
+
]
|
| 60 |
+
}
|
| 61 |
+
],
|
| 62 |
+
"source": [
|
| 63 |
+
"data_folder_path = \"./dataset/\"\n",
|
| 64 |
+
"CHOICE_LABELS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'AA', 'AB', 'AC', 'AD', 'AE', 'AF', 'AG', 'AI', 'AK', 'AL', 'AM', 'AN', 'AP', 'AR', 'AS', 'AT', 'AU', 'AV', 'AW', 'AX', 'BA', 'BB', 'BC', 'BD', 'BE', 'BF', 'BI', 'BL', 'BO', 'BR', 'BS', 'BU', 'BY', 'CA', 'CB', 'CC', 'CD', 'CE', 'CF', 'CG', 'CH', 'CI', 'CK', 'CL', 'CM', 'CN', 'CO', 'CP', 'CR', 'CS', 'CT', 'CV', 'DA', 'DB', 'DC', 'DD', 'DE', 'DF', 'DI', 'DK', 'DL', 'DM', 'DN', 'DO', 'DP', 'DR', 'DS', 'DT', 'DU', 'EB', 'EC', 'ED', 'EE', 'EF', 'EG', 'EL', 'EM', 'EN', 'EP', 'EQ', 'ER', 'ES', 'ET', 'EV', 'EX', 'EY', 'FA', 'FB', 'FC', 'FD', 'FE', 'FF', 'FI', 'FL', 'FM', 'FO', 'FP', 'FR', 'FS', 'FT', 'FX', 'GA', 'GB', 'GC', 'GE', 'GG', 'GL', 'GM', 'GN', 'GO', 'GP', 'GR', 'GS', 'GT', 'GV', 'HA', 'HC', 'HD', 'HE', 'HH', 'HI', 'HL', 'HO', 'HP', 'HR', 'HS', 'HT', 'IA', 'IB', 'IC', 'ID', 'IE', 'IF', 'IG', 'II', 'IK', 'IL', 'IM', 'IN', 'IO', 'IP', 'IR', 'IS', 'IT', 'IV', 'IX', 'IZ', 'JB', 'JS', 'KB', 'KE', 'KS', 'LA', 'LC', 'LD', 'LE', 'LI', 'LL', 'LM', 'LO', 'LP', 'LR', 'LS', 'LT', 'LY', 'MA', 'MB', 'MC', 'MD', 'ME', 'MI', 'ML', 'MM', 'MO', 'MP', 'MQ', 'MR', 'MS', 'MT', 'MW', 'MY', 'NA', 'NB', 'NC', 'ND', 'NE', 'NF', 'NG', 'NI', 'NL', 'NN', 'NO', 'NP', 'NR', 'NS', 'NT', 'NU', 'OB', 'OC', 'OD', 'OF', 'OH', 'OK', 'OL', 'OM', 'ON', 'OP', 'OR', 'OS', 'OT', 'OU', 'OW', 'PA', 'PC', 'PD', 'PE', 'PF', 'PG', 'PH', 'PI', 'PK', 'PL', 'PM', 'PN', 'PO', 'PP', 'PR', 'PS', 'PT', 'PU', 'PY', 'QL', 'QU', 'RA', 'RC', 'RE', 'RI', 'RL', 'RO', 'RS', 'RT', 'RU', 'RY', 'SA', 'SB', 'SC', 'SD', 'SE', 'SF', 'SG', 'SH', 'SI', 'SK', 'SL', 'SM', 'SN', 'SO', 'SP', 'SR', 'SS', 'ST', 'SU', 'SV', 'SW', 'SY', 'TA', 'TB', 'TC', 'TD', 'TE', 'TF', 'TH', 'TI', 'TL', 'TM', 'TO', 'TP', 'TR', 'TS', 'TT', 'TV', 'TW', 'TX', 'TY', 'UB', 'UC', 'UD', 'UE', 'UG', 'UI', 'UK', 'UL', 'UM', 'UN', 'UP', 'UR', 'US', 'UT', 'VA', 'VB', 'VC', 'VD', 'VE', 'VF', 'VI', 'VM', 'VO', 'VP', 
'VS', 'WA', 'WD', 'WE', 'WF', 'WH', 'WI', 'WM', 'WN', 'WP', 'WR', 'WS', 'WT', 'XT', 'XV', 'XX', 'XY', 'YS', 'YY', 'ZE', 'ABC', 'ACE', 'ACK', 'ACT', 'ADD', 'AGE', 'ALL', 'AME', 'AML', 'AMP', 'AND', 'ANG', 'ANT', 'API', 'APP', 'ARD', 'ARN', 'ART', 'ARY', 'ASC', 'ASE', 'ASH', 'ASS', 'AST', 'ATA', 'ATE', 'ATH', 'AUT', 'AVA', 'AXI', 'BER', 'BIT', 'BUG', 'CAA', 'CAT', 'CCE', 'CCN', 'CES', 'CLA', 'CLC', 'CLI', 'COL', 'COM', 'CON', 'CRE', 'CSS', 'CUR', 'DAT', 'DAY', 'DBC', 'DEF', 'DER', 'DEX', 'DIR', 'DIS', 'DOC', 'DOM', 'EAR', 'ECK', 'ECT', 'ELD', 'EMA', 'END', 'ENT', 'ENV', 'ERE', 'ERR', 'ERS', 'ERT', 'ERY', 'EXT', 'FIG', 'FIX', 'FLA', 'FOR', 'GEN', 'GER', 'GET', 'HER', 'IAB', 'IAL', 'ICE', 'IDE', 'IES', 'IGN', 'III', 'ILL', 'IMA', 'IME', 'IND', 'INE', 'INF', 'ING', 'INT', 'ION', 'IOS', 'ISO', 'IST', 'ITE', 'ITH', 'ITY', 'IVE', 'JAX', 'KEY', 'LAB', 'LAY', 'LED', 'LES', 'LIC', 'LIN', 'LOB', 'LOC', 'LOG', 'LOW', 'MAN', 'MAP', 'MAX', 'MIN', 'MIT', 'MON', 'NER', 'NET', 'NEW', 'NOT', 'NUM', 'OFF', 'OIN', 'ONE', 'ONG', 'OPT', 'ORD', 'ORM', 'ORS', 'ORT', 'ORY', 'OST', 'OUR', 'OUT', 'PAR', 'PDF', 'PER', 'PHP', 'POS', 'PRE', 'PRI', 'PRO', 'PUT', 'QUE', 'RAM', 'RAY', 'RED', 'REE', 'REF', 'REG', 'RES', 'RGB', 'RIG', 'ROM', 'ROP', 'ROR', 'ROW', 'SBN', 'SDK', 'SEE', 'SER', 'SET', 'SHA', 'SON', 'SQL', 'SSL', 'SSN', 'STR', 'SUB', 'SUM', 'TAC', 'TAG', 'TER', 'THE', 'UES', 'UID', 'ULL', 'ULT', 'UMN', 'UND', 'UNT', 'URE', 'URI', 'URL', 'URN', 'USA', 'USE', 'UST', 'UTC', 'UTE', 'UTF', 'VAL', 'VAR', 'VER', 'VID', 'VIS', 'WID', 'WIN', 'WOR', 'XML', 'XXX', 'YES', 'YPE']\n",
|
| 65 |
+
"with open(data_folder_path + 'grammars.jsonl', 'r', encoding='utf-8') as f:\n",
|
| 66 |
+
" GRAMMAR_ID2INFO = {item['grammar_id']: item for item in [json.loads(line) for line in f]}\n",
|
| 67 |
+
"\n",
|
| 68 |
+
"word2explanation_infos = load_json(data_folder_path + \"MOE_word2explanations_8.json\")\n",
|
| 69 |
+
"word2explanation_infos.update(load_json(data_folder_path + \"TMLD_word2explanations_3.json\"))\n",
|
| 70 |
+
"\n",
|
| 71 |
+
"# 載入主題和副主題對應\n",
|
| 72 |
+
"subtopic_map = load_json(data_folder_path + \"subtopic_map20251002v2.json\")\n",
|
| 73 |
+
"\n",
|
| 74 |
+
"dealer = Sentence_Dealer(ckiptagger_path=\"./CKIP/\")"
|
| 75 |
+
]
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"cell_type": "code",
|
| 79 |
+
"execution_count": 3,
|
| 80 |
+
"id": "64d44fce",
|
| 81 |
+
"metadata": {},
|
| 82 |
+
"outputs": [],
|
| 83 |
+
"source": [
|
| 84 |
+
"# Modify the ckiptagger_path passed to Sentence_Dealer in the previous cell if you want\n",
|
| 85 |
+
"\n",
|
| 86 |
+
"sample_sentence = \"\"\"朝聞道夕死可矣\"\"\"\n",
|
| 87 |
+
"\n",
|
| 88 |
+
"grammar_ids, grammar_range = dealer.list_all_grammars(sentence=sample_sentence)"
|
| 89 |
+
]
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"cell_type": "code",
|
| 93 |
+
"execution_count": 4,
|
| 94 |
+
"id": "a18b23ad",
|
| 95 |
+
"metadata": {},
|
| 96 |
+
"outputs": [
|
| 97 |
+
{
|
| 98 |
+
"name": "stdout",
|
| 99 |
+
"output_type": "stream",
|
| 100 |
+
"text": [
|
| 101 |
+
"[([242, 243, 244, 309], 5)]\n"
|
| 102 |
+
]
|
| 103 |
+
}
|
| 104 |
+
],
|
| 105 |
+
"source": [
|
| 106 |
+
"grammar_starts = [x[0] for x in grammar_range]\n",
|
| 107 |
+
"grammar_id2range = [(x,y) for x, y in sorted(zip(grammar_ids, grammar_starts), key=lambda p: p[1])]\n",
|
| 108 |
+
"print(grammar_id2range)"
|
| 109 |
+
]
|
| 110 |
+
},
|
| 111 |
+
{
|
| 112 |
+
"cell_type": "code",
|
| 113 |
+
"execution_count": 5,
|
| 114 |
+
"id": "a377fe5e",
|
| 115 |
+
"metadata": {},
|
| 116 |
+
"outputs": [],
|
| 117 |
+
"source": [
|
| 118 |
+
"\n",
|
| 119 |
+
"\n",
|
| 120 |
+
"def remove_example(text):\n",
|
| 121 |
+
" #return text.replace('「', '『').replace('」', '』')\n",
|
| 122 |
+
" if text.startswith('《'):\n",
|
| 123 |
+
" # 如果第一個字是《,保留直到第二次出現《之前的文字\n",
|
| 124 |
+
" tmp_str = re.sub(r'(《.*?《).*', r'\\1', text).rstrip('《')\n",
|
| 125 |
+
" else:\n",
|
| 126 |
+
" # 否則,保留《之前的文字\n",
|
| 127 |
+
" tmp_str = re.sub(r'(.*?)《.*', r'\\1', text)\n",
|
| 128 |
+
" tmp_str = tmp_str.split(\"如:\")[0] #去除\"如:\"以後的字\n",
|
| 129 |
+
" index = tmp_str.find('。')\n",
|
| 130 |
+
" return tmp_str[:index + 1] if index != -1 else text #去除\"。\"以後的字\n",
|
| 131 |
+
"\n",
|
| 132 |
+
"def sentence_segmentation(text):\n",
|
| 133 |
+
" sentence_list = text.split('。')\n",
|
| 134 |
+
" sentence_list = [sen.strip() + '。' for sen in sentence_list if sen.strip() != '']\n",
|
| 135 |
+
" return sentence_list"
|
| 136 |
+
]
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"cell_type": "code",
|
| 140 |
+
"execution_count": 6,
|
| 141 |
+
"id": "b53b89ce",
|
| 142 |
+
"metadata": {},
|
| 143 |
+
"outputs": [
|
| 144 |
+
{
|
| 145 |
+
"data": {
|
| 146 |
+
"text/plain": [
|
| 147 |
+
"['朝聞道夕死可矣。']"
|
| 148 |
+
]
|
| 149 |
+
},
|
| 150 |
+
"execution_count": 6,
|
| 151 |
+
"metadata": {},
|
| 152 |
+
"output_type": "execute_result"
|
| 153 |
+
}
|
| 154 |
+
],
|
| 155 |
+
"source": [
|
| 156 |
+
"sentence_segmentation(sample_sentence)"
|
| 157 |
+
]
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"cell_type": "code",
|
| 161 |
+
"execution_count": 7,
|
| 162 |
+
"id": "8273fac9",
|
| 163 |
+
"metadata": {},
|
| 164 |
+
"outputs": [],
|
| 165 |
+
"source": [
|
| 166 |
+
"## 為了方便理解,我們對任務做了簡化"
|
| 167 |
+
]
|
| 168 |
+
},
|
| 169 |
+
{
|
| 170 |
+
"cell_type": "code",
|
| 171 |
+
"execution_count": null,
|
| 172 |
+
"id": "462508ee",
|
| 173 |
+
"metadata": {},
|
| 174 |
+
"outputs": [],
|
| 175 |
+
"source": [
|
| 176 |
+
"# =========================\n",
|
| 177 |
+
"# 1. 函式定義區(先留空給你實作)\n",
|
| 178 |
+
"# =========================\n",
|
| 179 |
+
"\n",
|
| 180 |
+
"def gen_textbook_prompt(\n",
|
| 181 |
+
" tbcl_level: str,\n",
|
| 182 |
+
" main_topic: str,\n",
|
| 183 |
+
" sub_topic: str,\n",
|
| 184 |
+
" extra_content: str,\n",
|
| 185 |
+
" article_type: str,\n",
|
| 186 |
+
") -> str:\n",
|
| 187 |
+
" \"\"\"\n",
|
| 188 |
+
" 課文生成 prompt 產生器\n",
|
| 189 |
+
" 根據 TBCL 等級、主題、副主題、額外內容和文章類型生成適合的課文生成 Prompt\n",
|
| 190 |
+
" \"\"\"\n",
|
| 191 |
+
" \n",
|
| 192 |
+
" # Prompt 模板庫(參考 inference_codes/extras/prompts.py)\n",
|
| 193 |
+
" PURE_TEMPLATES = [\n",
|
| 194 |
+
" \"請以「{topics}」為主題,撰寫一篇 TBCL {level} 級的華語{textbook_type}課文。\",\n",
|
| 195 |
+
" \"請完成一篇有關「{topics}」且 TBCL 分級為 {level} 級的華語課文,課文形式為{textbook_type}。\",\n",
|
| 196 |
+
" \"以「{topics}」為題,編寫一篇華語課文。\\nTBCL 分級為 {level} 級,要求格式為{textbook_type}\",\n",
|
| 197 |
+
" \"請提供一篇關於「{topics}」的課文。\\n課文形式為{textbook_type},TBCL 分級為 {level} 級。\",\n",
|
| 198 |
+
" \"生成一篇難度為 TBCL {level} 級的{textbook_type}華語課文,內容需與「{topics}」相關\",\n",
|
| 199 |
+
" \"請針對「{topics}」寫出一篇華語{textbook_type}課文,難度必須符合 TBCL {level} 級。\",\n",
|
| 200 |
+
" \"請撰寫一篇華語課文。\\n類型:{textbook_type}\\n主題:{topics}\\nTBCL 分級:{level}\",\n",
|
| 201 |
+
" \"請根據「{topics}」主題,撰寫一篇符合 TBCL {level} 級標準的華語{textbook_type}課文。\",\n",
|
| 202 |
+
" ]\n",
|
| 203 |
+
" \n",
|
| 204 |
+
" SUBTOPIC_TEMPLATES = [\n",
|
| 205 |
+
" \"請以「{topics}」為主題,選擇一至多個邏輯相關的子主題(如:{subtopics}),撰寫一篇TBCL {level}級的華語{textbook_type}課文。請確保選擇的面向能自然地融入內容中。\",\n",
|
| 206 |
+
" \"以「{topics}」為主軸,從此主題的相關面向中選擇一至多個相互關聯的元素,編寫一篇TBCL {level}級的{textbook_type}課文。內容須流暢地整合這些面向。\",\n",
|
| 207 |
+
" \"撰寫一篇TBCL {level}級的華語{textbook_type}課文。主題為「{topics}」,請自由選擇一至多個相關子主題(參考但不限於:{subtopics})。所選面向須具邏輯關聯,並在課文中自然呈現。\",\n",
|
| 208 |
+
" \"請以「{topics}」為核心主題,從相關子主題(如{subtopics})中挑選一至多個具關聯性的元素,編寫一篇TBCL {level}級的{textbook_type}課文。要求各面向之間的過渡自然,內容連貫。\",\n",
|
| 209 |
+
" \"以「{topics}」為主題創作一篇TBCL {level}級華語{textbook_type}課文。請自選一至多個相關連的子主題(可參考:{subtopics}),並設計合適的情境來呈現這些面向。\",\n",
|
| 210 |
+
" ]\n",
|
| 211 |
+
" \n",
|
| 212 |
+
" # 參數驗證\n",
|
| 213 |
+
" valid_levels = [\"第一級\", \"第二級\", \"第三級\", \"第四級\", \"第五級\", \"第六級\"]\n",
|
| 214 |
+
" valid_topics = list(subtopic_map.keys())\n",
|
| 215 |
+
" valid_article_types = [\"短文\", \"對話\"]\n",
|
| 216 |
+
" \n",
|
| 217 |
+
" if tbcl_level not in valid_levels:\n",
|
| 218 |
+
" return f\"❌ 錯誤:TBCL 等級必須是以下之一:{', '.join(valid_levels)}\"\n",
|
| 219 |
+
" \n",
|
| 220 |
+
" if main_topic not in valid_topics:\n",
|
| 221 |
+
" return f\"❌ 錯誤:主題必須是以下之一:{', '.join(valid_topics)}\"\n",
|
| 222 |
+
" \n",
|
| 223 |
+
" if article_type not in valid_article_types:\n",
|
| 224 |
+
" return f\"❌ 錯誤:文章類型必須是以下之一:{', '.join(valid_article_types)}\"\n",
|
| 225 |
+
" \n",
|
| 226 |
+
" # 判斷使用哪種模式\n",
|
| 227 |
+
" # 如果有選擇副主題,使用 subtopic 模式;否則使用 pure 模式\n",
|
| 228 |
+
" if sub_topic and sub_topic in subtopic_map.get(main_topic, []):\n",
|
| 229 |
+
" # 使用副主題模式\n",
|
| 230 |
+
" template = random.choice(SUBTOPIC_TEMPLATES)\n",
|
| 231 |
+
" # 從該主題的副主題列表中隨機選擇幾個作為參考\n",
|
| 232 |
+
" all_subtopics = subtopic_map[main_topic]\n",
|
| 233 |
+
" # 確保選中的副主題在列表中,並添加其他幾個作為參考\n",
|
| 234 |
+
" sample_subtopics = [sub_topic]\n",
|
| 235 |
+
" other_subtopics = [s for s in all_subtopics if s != sub_topic]\n",
|
| 236 |
+
" sample_subtopics.extend(random.sample(other_subtopics, min(4, len(other_subtopics))))\n",
|
| 237 |
+
" subtopics_text = \"、\".join(sample_subtopics)\n",
|
| 238 |
+
" \n",
|
| 239 |
+
" prompt = template.format(\n",
|
| 240 |
+
" topics=main_topic,\n",
|
| 241 |
+
" level=tbcl_level,\n",
|
| 242 |
+
" textbook_type=article_type,\n",
|
| 243 |
+
" subtopics=subtopics_text\n",
|
| 244 |
+
" )\n",
|
| 245 |
+
" else:\n",
|
| 246 |
+
" # 使用純主題模式\n",
|
| 247 |
+
" template = random.choice(PURE_TEMPLATES)\n",
|
| 248 |
+
" prompt = template.format(\n",
|
| 249 |
+
" topics=main_topic,\n",
|
| 250 |
+
" level=tbcl_level,\n",
|
| 251 |
+
" textbook_type=article_type\n",
|
| 252 |
+
" )\n",
|
| 253 |
+
" \n",
|
| 254 |
+
" # 如果有額外內容,附加到 prompt 後面\n",
|
| 255 |
+
" if extra_content and extra_content.strip():\n",
|
| 256 |
+
" prompt += f\"\\n\\n【額外要求】\\n{extra_content.strip()}\"\n",
|
| 257 |
+
" \n",
|
| 258 |
+
" return prompt\n",
|
| 259 |
+
"\n",
|
| 260 |
+
"\n",
|
| 261 |
+
"def gen_word_sense_prompt(\n",
|
| 262 |
+
" sentence: str,\n",
|
| 263 |
+
" target_word: str,\n",
|
| 264 |
+
") -> str:\n",
|
| 265 |
+
" \"\"\"\n",
|
| 266 |
+
" 詞意消歧 prompt 產生器\n",
|
| 267 |
+
" \"\"\"\n",
|
| 268 |
+
" chat_template = \"請判斷「%s」在以下句子中為何種解釋,並直接輸出正確的選項代號。\\n%s\\n\\n%s\"\n",
|
| 269 |
+
"\n",
|
| 270 |
+
" if target_word not in sentence:\n",
|
| 271 |
+
" return \"此詞彙未出現在句子中。\"\n",
|
| 272 |
+
" elif target_word not in word2explanation_infos.keys():\n",
|
| 273 |
+
" return \"此詞彙未出現在字典。\"\n",
|
| 274 |
+
" else:\n",
|
| 275 |
+
" split_idx = sentence.index(target_word)\n",
|
| 276 |
+
" sentence = sentence[:split_idx]+'「'+target_word+'」'+sentence[split_idx+len(target_word):]\n",
|
| 277 |
+
" classfication_table = \"\"\n",
|
| 278 |
+
" for i, explaination_info in enumerate(word2explanation_infos[target_word]):\n",
|
| 279 |
+
" classfication_table += CHOICE_LABELS[i] + \". \" + remove_example(explaination_info[0]) + \"\\n\"\n",
|
| 280 |
+
" prompt = chat_template % (target_word, sentence, classfication_table[:-1])\n",
|
| 281 |
+
" return prompt\n",
|
| 282 |
+
"\n",
|
| 283 |
+
"def gen_word_info(sentence:str, target_word:str, target_choice:str):\n",
|
| 284 |
+
" columns = [\"釋義\", \"TBCL等級\", \"注音\", \"漢語拼音\", \"詞性\", \"英文翻譯\", \"例句\", \"例句漢語拼音\", \"例句翻譯\"]\n",
|
| 285 |
+
" if target_word not in sentence:\n",
|
| 286 |
+
" return \"此詞彙未出現在句子中。\"\n",
|
| 287 |
+
" elif target_word not in word2explanation_infos.keys():\n",
|
| 288 |
+
" return \"此詞彙未出現在字典。\"\n",
|
| 289 |
+
" elif target_choice not in CHOICE_LABELS:\n",
|
| 290 |
+
" return \"請填寫純英文代號。\"\n",
|
| 291 |
+
" ans_id = CHOICE_LABELS.index(target_choice)\n",
|
| 292 |
+
" explanations = word2explanation_infos[target_word]\n",
|
| 293 |
+
" explanation_ans = explanations[ans_id][:6]+explanations[ans_id][6][0]\n",
|
| 294 |
+
" response_str = \"\"\n",
|
| 295 |
+
" for column, explanation_info in zip(columns, explanation_ans):\n",
|
| 296 |
+
" response_str += \"%s: %s\\n\" %(column, explanation_info)\n",
|
| 297 |
+
" return response_str[:-1]\n",
|
| 298 |
+
"\n",
|
| 299 |
+
"def gen_grammar_analysis_prompt(\n",
|
| 300 |
+
" textbook: str,\n",
|
| 301 |
+
") -> str:\n",
|
| 302 |
+
" \"\"\"\n",
|
| 303 |
+
" 課文語法分析 prompt 產生器\n",
|
| 304 |
+
" \"\"\"\n",
|
| 305 |
+
" chat_template = \"請閱讀以下句子並根據解釋選出句子中有使用到的語法,請直接輸出選項代號,若有多個答案則使用頓號(、)分隔。\\n%s\\n\\n\"\n",
|
| 306 |
+
" prompt = chat_template % (textbook)\n",
|
| 307 |
+
" possible_grammars_list = []\n",
|
| 308 |
+
" for sentence in sentence_segmentation(textbook):\n",
|
| 309 |
+
" grammar_ids, grammar_range = dealer.list_all_grammars(sentence)\n",
|
| 310 |
+
" grammar_starts = [x[0] for x in grammar_range]\n",
|
| 311 |
+
" grammar_ids_sorted = [x for x, _ in sorted(zip(grammar_ids, grammar_starts), key=lambda p: p[1])]\n",
|
| 312 |
+
" for grammar_id in grammar_ids_sorted:\n",
|
| 313 |
+
" if isinstance(grammar_id, list):\n",
|
| 314 |
+
" for id in grammar_id:\n",
|
| 315 |
+
" possible_grammars_list.append(id)\n",
|
| 316 |
+
" else:\n",
|
| 317 |
+
" possible_grammars_list.append(grammar_id)\n",
|
| 318 |
+
" already_have = []\n",
|
| 319 |
+
" for i, id in enumerate(possible_grammars_list):\n",
|
| 320 |
+
" if id not in already_have:\n",
|
| 321 |
+
" detail = GRAMMAR_ID2INFO[id]\n",
|
| 322 |
+
" prompt += f\"{CHOICE_LABELS[i]}. {detail['grammar_name']}: \\n{detail['way_to_use_zh']}\\n\\n\"\n",
|
| 323 |
+
" already_have.append(id)\n",
|
| 324 |
+
" return prompt[:-1]\n",
|
| 325 |
+
"\n",
|
| 326 |
+
"\n",
|
| 327 |
+
"def gen_reading_test_prompt(\n",
|
| 328 |
+
" pre_text: str,\n",
|
| 329 |
+
" tocfl_level: str,\n",
|
| 330 |
+
" style: str,\n",
|
| 331 |
+
") -> str:\n",
|
| 332 |
+
" \"\"\"\n",
|
| 333 |
+
" 閱讀測驗生成 prompt 產生器\n",
|
| 334 |
+
" \"\"\"\n",
|
| 335 |
+
" \n",
|
| 336 |
+
" level_map = {\"入門基礎\":\"A-入門基礎\", \"進階高階\":\"B-進階高階\", \"流利精通\":\"C-流利精通\"}\n",
|
| 337 |
+
" level = level_map[tocfl_level]\n",
|
| 338 |
+
" level_config = READING_LEVEL_CONFIG[level]\n",
|
| 339 |
+
" system_prompt = level_config[\"system_prompt\"]\n",
|
| 340 |
+
" definition_prompt = level_config[\"definition_prompt\"]\n",
|
| 341 |
+
" asking_prompt_templates = level_config[\"asking_prompt_templates\"]\n",
|
| 342 |
+
" reference_asking_prompt_template = level_config[\"reference_asking_prompt_template\"]\n",
|
| 343 |
+
" topic_classes = level_config[\"topic_classes\"]\n",
|
| 344 |
+
" style_distribution = level_config[\"style_distribution\"]\n",
|
| 345 |
+
" #conversation_distribution = level_config[\"conversation_distribution\"]\n",
|
| 346 |
+
" question_type_distribution = level_config[\"question_type_distribution\"]\n",
|
| 347 |
+
" json_schema = level_config[\"json_schema\"]\n",
|
| 348 |
+
" prompt = system_prompt + \"\\n\"\n",
|
| 349 |
+
" \n",
|
| 350 |
+
" question_type = None\n",
|
| 351 |
+
" # --- 抽 topic(等機率) ---\n",
|
| 352 |
+
" topic_class = random.choice(topic_classes)\n",
|
| 353 |
+
"\n",
|
| 354 |
+
" # --- 按機率抽 style ---\n",
|
| 355 |
+
" #styles, style_probs = zip(*style_distribution.items())\n",
|
| 356 |
+
" #style = random.choices(styles, weights=style_probs, k=1)[0]\n",
|
| 357 |
+
"\n",
|
| 358 |
+
" #conversation_types, conversation_types = zip(*conversation_distribution.items())\n",
|
| 359 |
+
" #conversation = random.choices(conversation_types, weights=conversation_types, k=1)[0]\n",
|
| 360 |
+
" #if conversation == \"含對話\":\n",
|
| 361 |
+
" # style = style + (\"(含對話)\")\n",
|
| 362 |
+
" #else:\n",
|
| 363 |
+
" # style = style + (\"(純敘述)\")\n",
|
| 364 |
+
" \n",
|
| 365 |
+
" # --- 按機率抽 question_type ---\n",
|
| 366 |
+
" if question_type_distribution is not None:\n",
|
| 367 |
+
" qtypes, q_probs = zip(*question_type_distribution.items())\n",
|
| 368 |
+
" question_type = random.choices(qtypes, weights=q_probs, k=1)[0]\n",
|
| 369 |
+
" if question_type:\n",
|
| 370 |
+
" reference_asking_prompt = reference_asking_prompt_template.safe_substitute(style=style, question_type=question_type, content=pre_text)\n",
|
| 371 |
+
" else:\n",
|
| 372 |
+
" reference_asking_prompt = reference_asking_prompt_template.safe_substitute(style=style, content=pre_text)\n",
|
| 373 |
+
" response_rule = \"【json_schema 輸出格式】\"+\"\\n\"+ str(json_schema)\n",
|
| 374 |
+
" return prompt+definition_prompt+reference_asking_prompt+\"\\n\\n\"+response_rule\n",
|
| 375 |
+
"\n",
|
| 376 |
+
"def gen_cloze_prompts(\n",
|
| 377 |
+
" pre_text: str,\n",
|
| 378 |
+
" tocfl_level: str,\n",
|
| 379 |
+
") -> str:\n",
|
| 380 |
+
" \"\"\"\n",
|
| 381 |
+
" 選詞填空生成 prompt 產生器\n",
|
| 382 |
+
" \"\"\"\n",
|
| 383 |
+
" level_map = {\"入門基礎\":\"A-入門基礎\", \"進階高階\":\"B-進階高階\", \"流利精通\":\"C-流利精通\"}\n",
|
| 384 |
+
" level = level_map[tocfl_level]\n",
|
| 385 |
+
" level_config = CLOZE_LEVEL_CONFIG[level]\n",
|
| 386 |
+
" word_list = level_config[\"word_list\"]\n",
|
| 387 |
+
" grammar_list = level_config[\"grammar_list\"]\n",
|
| 388 |
+
" all_patterns = level_config[\"all_patterns\"]\n",
|
| 389 |
+
" examples = level_config[\"examples\"]\n",
|
| 390 |
+
" system_prompt = level_config[\"system_prompt\"]\n",
|
| 391 |
+
" prompt_definition_template = level_config[\"prompt_definition_template\"]\n",
|
| 392 |
+
" asking_prompt_1 = level_config[\"asking_prompt_1\"]\n",
|
| 393 |
+
" reference_asking_prompt_1_template = level_config[\"reference_asking_prompt_1_template\"]\n",
|
| 394 |
+
" perference_prompts = level_config[\"perference_prompts\"]\n",
|
| 395 |
+
" asking_prompt_2_template = level_config[\"asking_prompt_2_template\"]\n",
|
| 396 |
+
" json_schema = level_config[\"json_schema\"]\n",
|
| 397 |
+
"\n",
|
| 398 |
+
" word_text = \"、\".join(random.sample(word_list, min(30, len(word_list))))\n",
|
| 399 |
+
" grammar_text = \"、\".join(random.sample(grammar_list, min(10, len(grammar_list))))\n",
|
| 400 |
+
" pattern_text = \"、\".join(random.sample(all_patterns, min(9, len(all_patterns))))\n",
|
| 401 |
+
" prompt_definition = prompt_definition_template.safe_substitute(level_word_list=word_text, level_grammar_list=grammar_text, level_pattern_list=pattern_text, level_examples=examples)\n",
|
| 402 |
+
" pre_text_prompt = \"【前置課文】\" + \"\\n\" + pre_text\n",
|
| 403 |
+
" asking_prompt = \"請思考讀者學習完【前置課文】後的克漏字填空出題方向,其中克漏字填空裡的[完整文章]是【前置課文】的延伸,兩者需保持一定的相關性,但不需重複。請先根據【要求】的規範生成不帶任何空格的[完整文章]。\"\n",
|
| 404 |
+
" chat_prompt1 = system_prompt + \"\\n\" + prompt_definition + pre_text_prompt + '\\n' + asking_prompt\n",
|
| 405 |
+
" chat_prompt2 = asking_prompt_2_template.safe_substitute(level_preference1=perference_prompts[0], level_preference2=perference_prompts[1], patterns_text=pattern_text)\n",
|
| 406 |
+
" return chat_prompt1, chat_prompt2\n",
|
| 407 |
+
" "
|
| 408 |
+
]
|
| 409 |
+
},
|
| 410 |
+
{
|
| 411 |
+
"cell_type": "code",
|
| 412 |
+
"execution_count": null,
|
| 413 |
+
"id": "cd6eda82",
|
| 414 |
+
"metadata": {},
|
| 415 |
+
"outputs": [],
|
| 416 |
+
"source": [
|
| 417 |
+
"# =========================\n",
|
| 418 |
+
"# 2. Gradio UI 定義\n",
|
| 419 |
+
"# =========================\n",
|
| 420 |
+
"\n",
|
| 421 |
+
"TBCL_LEVELS = [\"第一級\", \"第二級\", \"第三級\", \"第四級\", \"第五級\", \"第六級\"]\n",
|
| 422 |
+
"TOCFL_LEVELS = [\"入門基礎\", \"進階高階\", \"流利精通\"]\n",
|
| 423 |
+
"ARTICLE_STYLES = [\"記敘文\", \"議論文\", \"説明文\", \"抒情文\"]\n",
|
| 424 |
+
"ARTICLE_TYPES = [\"短文\", \"對話\"]\n",
|
| 425 |
+
"MAIN_TOPICS = list(subtopic_map.keys()) # 從 subtopic_map 取得主題列表\n",
|
| 426 |
+
"NEED_FIX_STATUS = [\"生成題目TOCFL等級過高\", \"生成題目TOCFL等級過低\", \"生成題目品質不過關\"]\n",
|
| 427 |
+
"\n",
|
| 428 |
+
"\n",
|
| 429 |
+
"with gr.Blocks(title=\"教育任務 Prompt 產生器\") as demo:\n",
|
| 430 |
+
" gr.Markdown(\"# 🧩 教育任務 Prompt 產生器\")\n",
|
| 431 |
+
"\n",
|
| 432 |
+
" with gr.Tab(\"課文生成\"):\n",
|
| 433 |
+
" tbcl_level_in = gr.Dropdown(\n",
|
| 434 |
+
" choices=TBCL_LEVELS,\n",
|
| 435 |
+
" value=\"第一級\",\n",
|
| 436 |
+
" label=\"TBCL 等級\",\n",
|
| 437 |
+
" )\n",
|
| 438 |
+
" main_topic_in = gr.Dropdown(\n",
|
| 439 |
+
" choices=MAIN_TOPICS,\n",
|
| 440 |
+
" value=MAIN_TOPICS[0],\n",
|
| 441 |
+
" label=\"主題\",\n",
|
| 442 |
+
" )\n",
|
| 443 |
+
" sub_topic_in = gr.Dropdown(\n",
|
| 444 |
+
" choices=[],\n",
|
| 445 |
+
" value=None,\n",
|
| 446 |
+
" label=\"副主題(選填)\",\n",
|
| 447 |
+
" interactive=True,\n",
|
| 448 |
+
" )\n",
|
| 449 |
+
" extra_content_in = gr.Textbox(\n",
|
| 450 |
+
" lines=5,\n",
|
| 451 |
+
" label=\"額外內容\",\n",
|
| 452 |
+
" placeholder=\"可填寫想加入的設定、情境說明等(選填)\",\n",
|
| 453 |
+
" )\n",
|
| 454 |
+
" article_type_in = gr.Dropdown(\n",
|
| 455 |
+
" choices=ARTICLE_TYPES,\n",
|
| 456 |
+
" value=\"短文\",\n",
|
| 457 |
+
" label=\"文章類型\",\n",
|
| 458 |
+
" )\n",
|
| 459 |
+
"\n",
|
| 460 |
+
" gen_button_1 = gr.Button(\"產生課文生成 Prompt\")\n",
|
| 461 |
+
" output_1 = gr.Textbox(\n",
|
| 462 |
+
" lines=10,\n",
|
| 463 |
+
" label=\"課文生成 Prompt(string)\",\n",
|
| 464 |
+
" )\n",
|
| 465 |
+
"\n",
|
| 466 |
+
" def _update_subtopics(main_topic):\n",
|
| 467 |
+
" \"\"\"當主題改變時,更新副主題選項\"\"\"\n",
|
| 468 |
+
" if main_topic in subtopic_map:\n",
|
| 469 |
+
" subtopics = subtopic_map[main_topic]\n",
|
| 470 |
+
" return gr.Dropdown(choices=subtopics, value=None)\n",
|
| 471 |
+
" return gr.Dropdown(choices=[], value=None)\n",
|
| 472 |
+
"\n",
|
| 473 |
+
" def _on_gen_textbook(tbcl_level, main_topic, sub_topic, extra_content, article_type):\n",
|
| 474 |
+
" return gen_textbook_prompt(\n",
|
| 475 |
+
" tbcl_level=tbcl_level,\n",
|
| 476 |
+
" main_topic=main_topic,\n",
|
| 477 |
+
" sub_topic=sub_topic,\n",
|
| 478 |
+
" extra_content=extra_content,\n",
|
| 479 |
+
" article_type=article_type,\n",
|
| 480 |
+
" )\n",
|
| 481 |
+
"\n",
|
| 482 |
+
" # 當主題改變時,更新副主題選項\n",
|
| 483 |
+
" main_topic_in.change(\n",
|
| 484 |
+
" _update_subtopics,\n",
|
| 485 |
+
" inputs=[main_topic_in],\n",
|
| 486 |
+
" outputs=[sub_topic_in],\n",
|
| 487 |
+
" )\n",
|
| 488 |
+
"\n",
|
| 489 |
+
" gen_button_1.click(\n",
|
| 490 |
+
" _on_gen_textbook,\n",
|
| 491 |
+
" inputs=[tbcl_level_in, main_topic_in, sub_topic_in, extra_content_in, article_type_in],\n",
|
| 492 |
+
" outputs=output_1,\n",
|
| 493 |
+
" )\n",
|
| 494 |
+
"\n",
|
| 495 |
+
" with gr.Tab(\"詞意消歧\"):\n",
|
| 496 |
+
" sentence_in = gr.Textbox(\n",
|
| 497 |
+
" lines=5,\n",
|
| 498 |
+
" label=\"句子\",\n",
|
| 499 |
+
" placeholder=\"請輸入包含目標詞彙的完整句子\",\n",
|
| 500 |
+
" )\n",
|
| 501 |
+
" target_word_in = gr.Textbox(\n",
|
| 502 |
+
" lines=1,\n",
|
| 503 |
+
" label=\"詞彙\",\n",
|
| 504 |
+
" placeholder=\"請輸入要進行詞意消歧的詞彙\",\n",
|
| 505 |
+
" )\n",
|
| 506 |
+
"\n",
|
| 507 |
+
" gen_button_2 = gr.Button(\"產生詞意消歧 Prompt\")\n",
|
| 508 |
+
" output_2 = gr.Textbox(\n",
|
| 509 |
+
" lines=8,\n",
|
| 510 |
+
" label=\"詞意消歧 Prompt(string)\",\n",
|
| 511 |
+
" )\n",
|
| 512 |
+
" target_choice_in = gr.Textbox(\n",
|
| 513 |
+
" lines=1,\n",
|
| 514 |
+
" label=\"選項\",\n",
|
| 515 |
+
" placeholder=\"請輸入正確的選項\",\n",
|
| 516 |
+
" )\n",
|
| 517 |
+
" gen_button_2_2 = gr.Button(\"產生詞彙釋義相關資訊\")\n",
|
| 518 |
+
" output_3 = gr.Textbox(\n",
|
| 519 |
+
" lines=10,\n",
|
| 520 |
+
" label=\"詞彙釋義資訊\",\n",
|
| 521 |
+
" )\n",
|
| 522 |
+
"\n",
|
| 523 |
+
" def _on_gen_word_sense(sentence, target_word):\n",
|
| 524 |
+
" return gen_word_sense_prompt(sentence=sentence, target_word=target_word)\n",
|
| 525 |
+
"\n",
|
| 526 |
+
" gen_button_2.click(\n",
|
| 527 |
+
" _on_gen_word_sense,\n",
|
| 528 |
+
" inputs=[sentence_in, target_word_in],\n",
|
| 529 |
+
" outputs=output_2,\n",
|
| 530 |
+
" )\n",
|
| 531 |
+
"\n",
|
| 532 |
+
" def _on_gen_word_info(sentence, target_word, target_choice):\n",
|
| 533 |
+
" return gen_word_info(sentence=sentence, target_word=target_word, target_choice=target_choice)\n",
|
| 534 |
+
"\n",
|
| 535 |
+
" gen_button_2_2.click(\n",
|
| 536 |
+
" _on_gen_word_info,\n",
|
| 537 |
+
" inputs=[sentence_in, target_word_in, target_choice_in],\n",
|
| 538 |
+
" outputs=output_3,\n",
|
| 539 |
+
" )\n",
|
| 540 |
+
"\n",
|
| 541 |
+
" with gr.Tab(\"課文語法分析\"):\n",
|
| 542 |
+
" textbook_in = gr.Textbox(\n",
|
| 543 |
+
" lines=10,\n",
|
| 544 |
+
" label=\"課文\",\n",
|
| 545 |
+
" placeholder=\"請貼上完整課文內容\",\n",
|
| 546 |
+
" )\n",
|
| 547 |
+
"\n",
|
| 548 |
+
" gen_button_3 = gr.Button(\"產生課文語法分析 Prompt\")\n",
|
| 549 |
+
" output_3 = gr.Textbox(\n",
|
| 550 |
+
" lines=8,\n",
|
| 551 |
+
" label=\"課文語法分析 Prompt(string)\",\n",
|
| 552 |
+
" )\n",
|
| 553 |
+
"\n",
|
| 554 |
+
" def _on_gen_grammar(textbook):\n",
|
| 555 |
+
" return gen_grammar_analysis_prompt(textbook=textbook)\n",
|
| 556 |
+
"\n",
|
| 557 |
+
" gen_button_3.click(\n",
|
| 558 |
+
" _on_gen_grammar,\n",
|
| 559 |
+
" inputs=[textbook_in],\n",
|
| 560 |
+
" outputs=output_3,\n",
|
| 561 |
+
" )\n",
|
| 562 |
+
"\n",
|
| 563 |
+
" with gr.Tab(\"閱讀測驗生成\"):\n",
|
| 564 |
+
" pre_text_in = gr.Textbox(\n",
|
| 565 |
+
" lines=10,\n",
|
| 566 |
+
" label=\"前置課文\",\n",
|
| 567 |
+
" placeholder=\"請貼上作為閱讀測驗基礎的課文\",\n",
|
| 568 |
+
" )\n",
|
| 569 |
+
" tocfl_level_in_1 = gr.Dropdown(\n",
|
| 570 |
+
" choices=TOCFL_LEVELS,\n",
|
| 571 |
+
" value=\"入門基礎\",\n",
|
| 572 |
+
" label=\"TOCFL 等級\",\n",
|
| 573 |
+
" )\n",
|
| 574 |
+
" article_style_in = gr.Dropdown(\n",
|
| 575 |
+
" choices=ARTICLE_STYLES,\n",
|
| 576 |
+
" value=\"記敘文\",\n",
|
| 577 |
+
" label=\"題目文章文體\",\n",
|
| 578 |
+
" )\n",
|
| 579 |
+
"\n",
|
| 580 |
+
" gen_button_4 = gr.Button(\"產生閱讀測驗 Prompt\")\n",
|
| 581 |
+
" output_4 = gr.Textbox(\n",
|
| 582 |
+
" lines=8,\n",
|
| 583 |
+
" label=\"閱讀測驗生成 Prompt(string)\",\n",
|
| 584 |
+
" )\n",
|
| 585 |
+
"\n",
|
| 586 |
+
" judgement_level_status_in = gr.Dropdown(\n",
|
| 587 |
+
" choices=NEED_FIX_STATUS,\n",
|
| 588 |
+
" value=\"題目等級過高\",\n",
|
| 589 |
+
" label=\"遇到需要修正的評估結果\",\n",
|
| 590 |
+
" )\n",
|
| 591 |
+
"\n",
|
| 592 |
+
" gen_button_4_2 = gr.Button(\"產生修正等級的 Prompt\")\n",
|
| 593 |
+
" output_4_2 = gr.Textbox(\n",
|
| 594 |
+
" lines=1,\n",
|
| 595 |
+
" label=\"修正Prompt生成\",\n",
|
| 596 |
+
" )\n",
|
| 597 |
+
"\n",
|
| 598 |
+
" def _on_gen_reading_test(pre_text, tocfl_level, style):\n",
|
| 599 |
+
" return gen_reading_test_prompt(\n",
|
| 600 |
+
" pre_text=pre_text,\n",
|
| 601 |
+
" tocfl_level=tocfl_level,\n",
|
| 602 |
+
" style=style,\n",
|
| 603 |
+
" )\n",
|
| 604 |
+
"\n",
|
| 605 |
+
" gen_button_4.click(\n",
|
| 606 |
+
" _on_gen_reading_test,\n",
|
| 607 |
+
" inputs=[pre_text_in, tocfl_level_in_1, article_style_in],\n",
|
| 608 |
+
" outputs=output_4,\n",
|
| 609 |
+
" )\n",
|
| 610 |
+
" gr.Markdown(\"## 品質評估用的prompt_1\\n\" + READING_JUDGE_PROMPTS.system + \"\\n\\n[待評估之課文]\")\n",
|
| 611 |
+
" gr.Markdown(\"## 品質評估用的prompt_2\\n\" + READING_JUDGE_PROMPTS.additional_check)\n",
|
| 612 |
+
" gr.Markdown(\"## 品質評估用的prompt_3 (格式化輸出)\\n\" +\"【json_schema 輸出格式】\\n\\n\"+ str(READING_JUDGE_PROMPTS.json_schema))\n",
|
| 613 |
+
" \n",
|
| 614 |
+
" def _on_gen_reading_test(judgement_level_status):\n",
|
| 615 |
+
" if judgement_level_status == \"生成題目TOCFL等級過高\":\n",
|
| 616 |
+
" return FIX_HIGH_LEVEL_PROMPT\n",
|
| 617 |
+
" elif judgement_level_status == \"生成題目TOCFL等級過低\":\n",
|
| 618 |
+
" return FIX_LOW_LEVEL_PROMPT\n",
|
| 619 |
+
" elif judgement_level_status == \"生成題目品質不過關\":\n",
|
| 620 |
+
" return FIX_PREFERENCE_PROMPT\n",
|
| 621 |
+
" \n",
|
| 622 |
+
" gen_button_4_2.click(\n",
|
| 623 |
+
" _on_gen_reading_test,\n",
|
| 624 |
+
" inputs=[judgement_level_status_in],\n",
|
| 625 |
+
" outputs=output_4_2,\n",
|
| 626 |
+
" )\n",
|
| 627 |
+
"\n",
|
| 628 |
+
" with gr.Tab(\"選詞填空生成\"):\n",
|
| 629 |
+
" pre_text_in_2 = gr.Textbox(\n",
|
| 630 |
+
" lines=10,\n",
|
| 631 |
+
" label=\"前置課文\",\n",
|
| 632 |
+
" placeholder=\"請貼上作為選詞填空基礎的課文\",\n",
|
| 633 |
+
" )\n",
|
| 634 |
+
" tocfl_level_in_2 = gr.Dropdown(\n",
|
| 635 |
+
" choices=TOCFL_LEVELS[1:],\n",
|
| 636 |
+
" value=\"進階高階\",\n",
|
| 637 |
+
" label=\"TOCFL 等級\",\n",
|
| 638 |
+
" )\n",
|
| 639 |
+
"\n",
|
| 640 |
+
" gen_button_5 = gr.Button(\"產生完整文章生成 Prompt\")\n",
|
| 641 |
+
" output_5 = gr.Textbox(\n",
|
| 642 |
+
" lines=8,\n",
|
| 643 |
+
" label=\"選詞填空完整文章生成 Prompt(string)\",\n",
|
| 644 |
+
" )\n",
|
| 645 |
+
" \n",
|
| 646 |
+
" gen_button_5_2 = gr.Button(\"產生選詞填空出題 Prompt\")\n",
|
| 647 |
+
" output_5_2 = gr.Textbox(\n",
|
| 648 |
+
" lines=8,\n",
|
| 649 |
+
" label=\"選詞填空題目生成 Prompt(string)\",\n",
|
| 650 |
+
" )\n",
|
| 651 |
+
" gr.Markdown(\"## 品質評估用的prompt_1\\n\" + CLOZE_JUDGE_PROMPTS.system + \"\\n\\n[待評估之課文]\")\n",
|
| 652 |
+
" gr.Markdown(\"## 品質評估用的prompt_2\\n\" + CLOZE_JUDGE_PROMPTS.additional_check)\n",
|
| 653 |
+
" gr.Markdown(\"## 品質評估用的prompt_3 (格式化輸出)\\n\" +\"【json_schema 輸出格式】\\n\\n\"+ str(CLOZE_JUDGE_PROMPTS.json_schema))\n",
|
| 654 |
+
"\n",
|
| 655 |
+
" def _on_gen_cloze_complete_text(pre_text, tocfl_level):\n",
|
| 656 |
+
" return gen_cloze_prompts(\n",
|
| 657 |
+
" pre_text=pre_text,\n",
|
| 658 |
+
" tocfl_level=tocfl_level,\n",
|
| 659 |
+
" )[0]\n",
|
| 660 |
+
"\n",
|
| 661 |
+
" gen_button_5.click(\n",
|
| 662 |
+
" _on_gen_cloze_complete_text,\n",
|
| 663 |
+
" inputs=[pre_text_in_2, tocfl_level_in_2],\n",
|
| 664 |
+
" outputs=output_5,\n",
|
| 665 |
+
" )\n",
|
| 666 |
+
" \n",
|
| 667 |
+
" def _on_gen_cloze_text(pre_text, tocfl_level):\n",
|
| 668 |
+
" return gen_cloze_prompts(\n",
|
| 669 |
+
" pre_text=pre_text,\n",
|
| 670 |
+
" tocfl_level=tocfl_level,\n",
|
| 671 |
+
" )[1]\n",
|
| 672 |
+
"\n",
|
| 673 |
+
" gen_button_5.click(\n",
|
| 674 |
+
" _on_gen_cloze_text,\n",
|
| 675 |
+
" inputs=[pre_text_in_2, tocfl_level_in_2],\n",
|
| 676 |
+
" outputs=output_5_2,\n",
|
| 677 |
+
" )"
|
| 678 |
+
]
|
| 679 |
+
},
|
| 680 |
+
{
|
| 681 |
+
"cell_type": "code",
|
| 682 |
+
"execution_count": 10,
|
| 683 |
+
"id": "04ca8272",
|
| 684 |
+
"metadata": {},
|
| 685 |
+
"outputs": [
|
| 686 |
+
{
|
| 687 |
+
"name": "stdout",
|
| 688 |
+
"output_type": "stream",
|
| 689 |
+
"text": [
|
| 690 |
+
"Running on local URL: http://127.0.0.1:7860\n",
|
| 691 |
+
"IMPORTANT: You are using gradio version 3.41.2, however version 4.44.1 is available, please upgrade.\n",
|
| 692 |
+
"--------\n",
|
| 693 |
+
"\n",
|
| 694 |
+
"Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.\n"
|
| 695 |
+
]
|
| 696 |
+
},
|
| 697 |
+
{
|
| 698 |
+
"data": {
|
| 699 |
+
"text/html": [
|
| 700 |
+
"<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
| 701 |
+
],
|
| 702 |
+
"text/plain": [
|
| 703 |
+
"<IPython.core.display.HTML object>"
|
| 704 |
+
]
|
| 705 |
+
},
|
| 706 |
+
"metadata": {},
|
| 707 |
+
"output_type": "display_data"
|
| 708 |
+
}
|
| 709 |
+
],
|
| 710 |
+
"source": [
|
| 711 |
+
"# 直接執行:python this_file.py 後 demo.launch()\n",
|
| 712 |
+
"if __name__ == \"__main__\":\n",
|
| 713 |
+
" demo.launch(share=True)\n",
|
| 714 |
+
" #demo.launch(share=True)"
|
| 715 |
+
]
|
| 716 |
+
},
|
| 717 |
+
{
|
| 718 |
+
"cell_type": "code",
|
| 719 |
+
"execution_count": 11,
|
| 720 |
+
"id": "8f3b4413",
|
| 721 |
+
"metadata": {},
|
| 722 |
+
"outputs": [],
|
| 723 |
+
"source": [
|
| 724 |
+
"WORD_LIST_COLUMN_NAMES = [\"繁體中文\", \"TBCL等級\", \"注音\", \"漢語拼音\", \"詞性\", \"英文翻譯\", \"例句\", \"例句漢語拼音\", \"例句翻譯\", \"位置\", \"釋義\", \"釋義ID\"]\n"
|
| 725 |
+
]
|
| 726 |
+
},
|
| 727 |
+
{
|
| 728 |
+
"cell_type": "code",
|
| 729 |
+
"execution_count": null,
|
| 730 |
+
"id": "73d4d38b",
|
| 731 |
+
"metadata": {},
|
| 732 |
+
"outputs": [],
|
| 733 |
+
"source": []
|
| 734 |
+
}
|
| 735 |
+
],
|
| 736 |
+
"metadata": {
|
| 737 |
+
"kernelspec": {
|
| 738 |
+
"display_name": "Python (study_prompt)",
|
| 739 |
+
"language": "python",
|
| 740 |
+
"name": "study_prompt"
|
| 741 |
+
},
|
| 742 |
+
"language_info": {
|
| 743 |
+
"codemirror_mode": {
|
| 744 |
+
"name": "ipython",
|
| 745 |
+
"version": 3
|
| 746 |
+
},
|
| 747 |
+
"file_extension": ".py",
|
| 748 |
+
"mimetype": "text/x-python",
|
| 749 |
+
"name": "python",
|
| 750 |
+
"nbconvert_exporter": "python",
|
| 751 |
+
"pygments_lexer": "ipython3",
|
| 752 |
+
"version": "3.10.19"
|
| 753 |
+
}
|
| 754 |
+
},
|
| 755 |
+
"nbformat": 4,
|
| 756 |
+
"nbformat_minor": 5
|
| 757 |
+
}
|
dataset/MOE_word2explanations_8.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fefe39650697542809f27992c2a41f24c6d8969c7337edc37b21b0e865e6b7ca
|
| 3 |
+
size 21537358
|
dataset/TMLD_word2explanations_3.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c28ac7148e35ca173ad415c9395963443d6cae2c3913c8d34f4201bbd40e791a
|
| 3 |
+
size 3092273
|
dataset/cloze_exampleB.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0218024f0f1ac12c09bce0222680390584e3444f89a9c1d19e950a43942ba41
|
| 3 |
+
size 7526
|
dataset/cloze_exampleC.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6981c03016abd3a125e788f61c2ab374b62a84900f3c7160676e430f5cd2821b
|
| 3 |
+
size 8146
|
dataset/grammars.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4098c0cd08e469eda96b1ac06e59ed9c7e9dc5de2459267a9cec0bd6ebe396b5
|
| 3 |
+
size 479986
|
dataset/sentencepattern.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7c4082daa6c1d27758e2f665bb1c4a88c1c6da632933bdbcbc3c65356979b33a
|
| 3 |
+
size 3819
|
dataset/subtopic_map20251002v2.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c6f168c516543b721782a136c7fd0571a21f166fd09cce45690133954138ca40
|
| 3 |
+
size 10921
|
dataset/word2TBCL.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a46c135ce1590659694dc135eed3d957daaca9032e8d9f696e46523982de7e71
|
| 3 |
+
size 295094
|
fileio/__init__.py
ADDED
|
File without changes
|
fileio/jsonio.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import io
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import gzip
|
| 6 |
+
import tempfile
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any, Iterable, Iterator, Union, Optional, overload, List, Dict
|
| 9 |
+
|
| 10 |
+
PathLike = Union[str, Path]
|
| 11 |
+
|
| 12 |
+
class FileIOError(Exception):
    """Base exception covering read/write and parsing errors."""
|
| 14 |
+
|
| 15 |
+
class JSONDecodeError(FileIOError):
    """Raised when JSON parsing fails."""
|
| 17 |
+
|
| 18 |
+
class AtomicWriter:
    """
    Context manager that writes a text file atomically.

    The content is first written to a temporary file created in the same
    directory as the target, and only moved over the target with
    ``os.replace()`` when the ``with`` body finishes without an exception.
    With ``gzip_mode=True`` the temporary file is written through gzip.

    Args:
        target: Destination path (``Path`` or ``str``).
        gzip_mode: Write the content gzip-compressed when true.
        encoding: Text encoding used for the file.
        newline: Newline argument forwarded to ``open`` / ``gzip.open``.
    """
    def __init__(self, target: Path, gzip_mode: bool = False, encoding: str = "utf-8", newline: str = "\n"):
        # Normalise so that plain string paths are accepted as well.
        self.target = Path(target)
        self.dir = self.target.parent
        self.encoding = encoding
        self.newline = newline
        self.gzip_mode = gzip_mode
        self._tmp_path: Optional[Path] = None
        self._fh: Optional[io.TextIOBase] = None

    def __enter__(self) -> io.TextIOBase:
        """Create the temporary file and return a text handle writing to it."""
        self.dir.mkdir(parents=True, exist_ok=True)
        fd, tmp_name = tempfile.mkstemp(prefix=f".{self.target.name}.", dir=str(self.dir))
        os.close(fd)  # the file is reopened below through a high-level file object
        self._tmp_path = Path(tmp_name)

        try:
            if self.gzip_mode:
                f = gzip.open(self._tmp_path, mode="wt", encoding=self.encoding, newline=self.newline)
            else:
                f = open(self._tmp_path, mode="w", encoding=self.encoding, newline=self.newline)
        except BaseException:
            # __exit__ is never called when __enter__ fails, so clean up here.
            self._discard_tmp()
            raise
        self._fh = f
        return f

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the handle, then commit the temporary file or discard it."""
        committed = False
        try:
            if self._fh is not None:
                self._fh.close()
            if exc_type is None:
                if self._tmp_path is None:
                    raise RuntimeError("AtomicWriter.__exit__ called without a matching __enter__")
                os.replace(self._tmp_path, self.target)
                committed = True
        finally:
            self._fh = None
            if committed:
                self._tmp_path = None
            else:
                # Body raised, or close()/replace() failed: drop the temp file.
                self._discard_tmp()

    def _discard_tmp(self) -> None:
        """Best-effort removal of the temporary file; cleanup errors are ignored."""
        tmp_path, self._tmp_path = self._tmp_path, None
        if tmp_path is None:
            return
        try:
            tmp_path.unlink()
        except OSError:
            pass  # already gone or not removable; nothing more can be done
|
| 58 |
+
|
| 59 |
+
def _open_read(path: Path, encoding: str = "utf-8") -> io.TextIOBase:
    """Open *path* for text reading, going through gzip when it ends in ``.gz``."""
    is_gzipped = path.suffix == ".gz"
    if not is_gzipped:
        return open(path, mode="r", encoding=encoding)
    return gzip.open(path, mode="rt", encoding=encoding)
|
| 63 |
+
|
| 64 |
+
def _needs_gzip(path: Path) -> bool:
    """Return True when the final suffix of *path* is ``.gz``."""
    suffix = path.suffix
    return suffix == ".gz"
|
| 66 |
+
|
| 67 |
+
class JsonIO:
    """
    Read and write a single JSON document.

    - Non-ASCII text is kept as-is by default (ensure_ascii=False).
    - Output is indented with 2 spaces by default (configurable).
    - Paths ending in .gz are read and written through gzip automatically.
    """
    def __init__(self, path: PathLike, encoding: str = "utf-8"):
        self.path = Path(path)
        self.encoding = encoding

    def load(self) -> Any:
        """Parse and return the JSON document stored at ``self.path``.

        Raises:
            JSONDecodeError: The file content is not valid JSON.
            FileIOError: Any other failure while reading the file.
        """
        try:
            with _open_read(self.path, encoding=self.encoding) as source:
                document = json.load(source)
        except json.JSONDecodeError as err:
            raise JSONDecodeError(f"JSON decode failed at {self.path}: {err}") from err
        except Exception as err:
            raise FileIOError(f"Failed to read {self.path}: {err}") from err
        return document

    def save(self, obj: Any, indent: int = 2, ensure_ascii: bool = False) -> None:
        """Serialize *obj* to ``self.path`` atomically, followed by a newline.

        Raises:
            FileIOError: Any failure while serializing or writing.
        """
        use_gzip = _needs_gzip(self.path)
        try:
            with AtomicWriter(self.path, gzip_mode=use_gzip, encoding=self.encoding) as sink:
                json.dump(obj, sink, ensure_ascii=ensure_ascii, indent=indent)
                sink.write("\n")
        except Exception as err:
            raise FileIOError(f"Failed to write {self.path}: {err}") from err
|
| 95 |
+
|
| 96 |
+
class JsonlIO:
    """
    Line-by-line access to JSONL (NDJSON) files.

    - Iterative reading (memory friendly)
    - Overwriting writes (atomic)
    - Appending writes
    - Paths ending in .gz are handled through gzip automatically
    """
    def __init__(self, path: PathLike, encoding: str = "utf-8"):
        self.path = Path(path)
        self.encoding = encoding

    def iter_load(self) -> Iterator[Dict[str, Any]]:
        """Yield one parsed object per line; blank lines are skipped.

        Raises:
            JSONDecodeError: A line is not valid JSON (message carries the line number).
            FileIOError: Any other failure while reading the file.
        """
        try:
            with _open_read(self.path, encoding=self.encoding) as f:
                for i, line in enumerate(f, start=1):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        row = json.loads(line)
                    except json.JSONDecodeError as e:
                        raise JSONDecodeError(f"JSONL decode failed at line {i} in {self.path}: {e}") from e
                    yield row
        except FileIOError:
            # Keep the specific error type (e.g. JSONDecodeError) instead of
            # letting the generic handler below re-wrap it as a plain FileIOError.
            raise
        except Exception as e:
            raise FileIOError(f"Failed to read {self.path}: {e}") from e

    def load_all(self) -> List[Dict[str, Any]]:
        """Load every row at once (convenient for small files)."""
        return list(self.iter_load())

    def save_all(self, rows: Iterable[Dict[str, Any]], ensure_ascii: bool = False) -> None:
        """Overwrite the file with *rows*, one JSON object per line (atomic).

        Raises:
            FileIOError: Any failure while serializing or writing.
        """
        gzip_mode = _needs_gzip(self.path)
        try:
            with AtomicWriter(self.path, gzip_mode=gzip_mode, encoding=self.encoding) as f:
                for row in rows:
                    f.write(json.dumps(row, ensure_ascii=ensure_ascii))
                    f.write("\n")
        except Exception as e:
            raise FileIOError(f"Failed to write {self.path}: {e}") from e

    def append(self, rows: Iterable[Dict[str, Any]], ensure_ascii: bool = False) -> None:
        """Append *rows* to the file (not atomic; suited to log-style data).

        Raises:
            FileIOError: Any failure while serializing or writing.
        """
        try:
            # Create the parent directory for both the plain and the gzip case.
            self.path.parent.mkdir(parents=True, exist_ok=True)
            if _needs_gzip(self.path):
                # Opened in "at" mode: new data is appended to the existing .gz file.
                handle = gzip.open(self.path, mode="at", encoding=self.encoding, newline="\n")
            else:
                handle = open(self.path, mode="a", encoding=self.encoding, newline="\n")
            with handle as f:
                for row in rows:
                    f.write(json.dumps(row, ensure_ascii=ensure_ascii))
                    f.write("\n")
        except Exception as e:
            raise FileIOError(f"Failed to append {self.path}: {e}") from e
|
requirements.txt
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Gradio UI
|
| 2 |
+
gradio==6.2.0
|
| 3 |
+
|
| 4 |
+
# Sentence Dealer and dependencies
|
| 5 |
+
# Install sentence_dealer package first: pip install -e util/sentence_dealer
|
| 6 |
+
# Or uncomment below to install dependencies directly:
|
| 7 |
+
|
| 8 |
+
# CKIP and NLP
|
| 9 |
+
ckiptagger==0.2.1
|
| 10 |
+
OpenCC==1.1.9
|
| 11 |
+
deep-translator==1.11.4
|
| 12 |
+
|
| 13 |
+
# TensorFlow and ML
|
| 14 |
+
tensorflow==2.15.0
|
| 15 |
+
keras==2.15.0
|
| 16 |
+
numpy==1.26.4
|
| 17 |
+
pandas==2.2.3
|
| 18 |
+
|
| 19 |
+
# Hugging Face
|
| 20 |
+
huggingface_hub==1.2.3
|
| 21 |
+
|
| 22 |
+
# OpenAI / Azure OpenAI
|
| 23 |
+
openai==2.14.0
|
| 24 |
+
|
| 25 |
+
# Utilities
|
| 26 |
+
requests==2.32.3
|
| 27 |
+
tqdm==4.66.6
|
| 28 |
+
typing-extensions==4.15.0
|
| 29 |
+
|
util/cloze_constants.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from string import Template
|
| 3 |
+
from types import MappingProxyType
|
| 4 |
+
def load_text(text_path:str, strip=True):
    """Read a UTF-8 text file and return its contents.

    Args:
        text_path: Path of the file to read.
        strip: When true, leading and trailing whitespace is removed.

    Returns:
        The file contents as a string.
    """
    with open(text_path, "r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents.strip() if strip else contents
|
| 10 |
+
|
| 11 |
+
def load_json(json_path:str):
    """Parse the UTF-8 JSON file at *json_path* and return the decoded object."""
    with open(json_path, "r", encoding="utf-8") as handle:
        return json.load(handle)
|
| 16 |
+
|
| 17 |
+
def load_jsonl(jsonl_path:str, chinese_only=False):
    """Load a UTF-8 JSONL file into a list, one parsed object per line.

    Blank or whitespace-only lines are skipped instead of being passed to
    ``json.loads``.

    Args:
        jsonl_path: Path of the JSONL file to read.
        chinese_only: When true, keep only items whose ``"grammar_name"``
            consists solely of characters in U+4E00..U+9FFF or ``.``.

    Returns:
        The list of parsed items, in file order.
    """
    def _is_chinese_name(name):
        # Every character must be in the U+4E00..U+9FFF range or a literal dot.
        return all('\u4e00' <= ch <= '\u9fff' or ch == '.' for ch in name)

    data_list = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines
            item = json.loads(line)
            # Keep only Chinese grammar names when requested.
            if chinese_only and not _is_chinese_name(item["grammar_name"]):
                continue
            data_list.append(item)
    return data_list
|
| 28 |
+
|
| 29 |
+
example_B_path = "./dataset/cloze_exampleB.txt"
|
| 30 |
+
example_C_path = "./dataset/cloze_exampleC.txt"
|
| 31 |
+
word_definition_path = "./dataset/word2TBCL.json"
|
| 32 |
+
grammars_path = "./dataset/grammars.jsonl"
|
| 33 |
+
patterns_path = "./dataset/sentencepattern.json"
|
| 34 |
+
|
| 35 |
+
example_B = load_text(example_B_path)
|
| 36 |
+
example_C = load_text(example_C_path)
|
| 37 |
+
word2TBCL = load_json(word_definition_path)
|
| 38 |
+
grammar_json = load_jsonl(grammars_path, chinese_only=True)
|
| 39 |
+
pattern_json = load_json(patterns_path)
|
| 40 |
+
|
| 41 |
+
SYSTEM_PROMPT = "你是一位在台灣從事國文教育多年有趣的華語老師,這次負責出克漏字填空(cloze)的題目。"
|
| 42 |
+
CLOZE_TASK_DEFINITION = "每個克漏字填空由[完整文章]、[挖空後的文章]和[題目]所組成。[完整文章]由一連串文字組成,與學校的課文相似;[挖空後的文章]由[完整文章]挖空後取得,空格由底線+數字組成,像是(__1__, __2__, ...)或是(__1__⋯⋯__1__, __2__⋯⋯__2__, ...);[題目]為單選題,共6題,選項為空格內可能可以填入的詞彙或片語,有(A)、(B)、(C)、(D)四個選項供選擇。"
|
| 43 |
+
CLOZE_PROMPT_DEFINITION_TEMPLATE = Template("""【題型說明】
|
| 44 |
+
%s
|
| 45 |
+
【參考資料】
|
| 46 |
+
[詞彙清單]: ${level_word_list}
|
| 47 |
+
[語法結構清單]: ${level_grammar_list}
|
| 48 |
+
[句型結構清單]: ${level_pattern_list}
|
| 49 |
+
|
| 50 |
+
【要求】
|
| 51 |
+
文章長度150~300字,語言難度相當於國中水準,可出現書面詞、抽象概念,難度盡量與後續的【範例】保持一致。
|
| 52 |
+
生成[完整文章]時必須使用3 個[詞彙清單]中的詞彙,需自然嵌入文章中。
|
| 53 |
+
生成[完整文章]時必須使用1~2 個[語法結構清單]中的語法結構,需自然嵌入文章中。
|
| 54 |
+
生成[完整文章]時必須使用1~2 種[句型結構清單]中的句型結構,需自然嵌入文章中。
|
| 55 |
+
預期讀者為學習繁體中文的外國成年人,可以以較為國際化或本土化的文章內容出題。
|
| 56 |
+
為確保題目的多樣性,挖空位置可使用不同詞性(名詞、動詞、形容詞、副詞等)。
|
| 57 |
+
[完整文章]不可出現「.」或「_」這兩種字元。
|
| 58 |
+
|
| 59 |
+
【範例】
|
| 60 |
+
${level_examples}
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
""" %(CLOZE_TASK_DEFINITION))
|
| 64 |
+
"""
|
| 65 |
+
【克漏字填空生產流程】
|
| 66 |
+
1. 閱讀完【題型說明】、【參考資料】、【要求】、【範例】裡的內容,依據【要求】裡生成文章的規範生成沒有空格的[完整文章]。
|
| 67 |
+
2. 參考【參考資料】、【要求】、【範例】裡的資訊,尋找[完整文章]裡適合挖空的位置,必需挖出6個題目的位置,每個位址以__1__, __2__, ...來表示,若挖空位址並非詞彙則以__1__⋯⋯__1__, __2__⋯⋯__2__, ...來表示,其中⋯⋯為[完整文章]裡的部分內容,此階段之結果為[挖空後的文章]。
|
| 68 |
+
3. 參考[完整文章]、[挖空後的文章]、【參考資料】、【要求】、【範例】裡的資訊,設計6題具多樣性及誘答性的題目。
|
| 69 |
+
4. 確認[完整文章]內容自然通順且不含簡體字,確認每題題目只有一個正確答案。
|
| 70 |
+
"""
|
| 71 |
+
level_B_perferences = ["","「挖空位置」偏好選擇日常生活情境相關的詞彙或語法點(如家庭、購物、交通、學習、健康、寵物等),挖空重點放在 「基礎語法、常用詞搭配、生活詞彙」。"]
|
| 72 |
+
level_C_perferences = ["(可出現成語)","「挖空位置」偏好選擇較正式或書面化的文章,挖空重點放在 「成語、書面連接詞、文學修辭、抽象概念詞」,以檢測高階語感與閱讀理解。"]
|
| 73 |
+
ASKING_PROMPT_2_TEMPLATE = Template("""請閱讀上述的[完整文章],依照下列規範設計填空題,讓題組中同時包含詞彙題與語法題(每題為一個詞彙或一組語法結構、副詞、固定詞組或片語填空):
|
| 74 |
+
【要求】
|
| 75 |
+
[完整文章]請務必完全保留原文,不需任何空格、_、⋯。
|
| 76 |
+
[挖空後的文章]必需挖出6個選擇題填空對應的位置,每個位置以__1__, __2__, ..., __6__來表示,若挖空位址並非詞彙則以__1__⋯⋯__1__, __2__⋯⋯__2__, ..., __6__⋯⋯__6__來表示,其中⋯⋯為[完整文章]裡的部分內容
|
| 77 |
+
[挖空後的文章]除挖空位置外不得新增、刪減、替換或改寫任何[完整文章]裡的內容,必須與原文一字不差。
|
| 78 |
+
「⋯⋯」僅允許作為選項中的示意,不計入字數;[完整文章]、[挖空後的文章]皆不得出現「⋯⋯」。
|
| 79 |
+
挖空位置是題目的正確答案,其必須完整取自[完整文章],不能創造[完整文章]中不存在的詞彙或語法結構。
|
| 80 |
+
請從原文中選出六個適合挖空的位置,其中可分為:
|
| 81 |
+
詞彙${level_preference1}
|
| 82 |
+
副詞搭配
|
| 83 |
+
虛詞
|
| 84 |
+
量詞
|
| 85 |
+
語法結構(如複句)
|
| 86 |
+
固定詞組或片語
|
| 87 |
+
${level_preference2}
|
| 88 |
+
「挖空位置」禁止挖空專有名詞(如人名、地名、組織名)、數字(如「三」「五」)或其他不適合設計選項的詞彙。
|
| 89 |
+
|
| 90 |
+
【挖空標記】
|
| 91 |
+
將被選的詞彙/連詞/副詞/片語在原文中「就地」替換為 __1__、__2__…。
|
| 92 |
+
- 複句連詞採「雙空同號」:同一題出現兩次相同標記(如:__1__⋯⋯__1__⋯⋯)。
|
| 93 |
+
- 除被替換的目標詞外,[完整文章]其他內容不得增刪或改寫(包含標點與空白)。
|
| 94 |
+
|
| 95 |
+
【複句連詞對挖空規則】
|
| 96 |
+
定義: [複句連詞對]是由兩個(以上)句子依特定語法關係組合而成,類似於國文課中照樣造句的形式。
|
| 97 |
+
複句類型:${patterns_text}
|
| 98 |
+
連詞對必須來自[完整文章]的實際用詞,一字不差;不可用語義相近或結構相似詞替代。
|
| 99 |
+
不得將單一詞或固定片語(如「非但如此」「因此」「不過」)誤判為連詞對。若句中沒有真正的連詞對,則不得設計成複句題,請改為詞彙、副詞挖空。
|
| 100 |
+
「標點不構成阻斷」:逗號、頓號、分號等標點不影響配對,兩部分仍視為同一句內的「連詞對」。
|
| 101 |
+
挖空時必須同時替換該[複句連詞對]中的兩個詞(雙空同號),不得只替換其中一個。
|
| 102 |
+
兩個空必須在同一句內;每題恰好出現同號標記兩次(例如:__2__ 與 __2__)。
|
| 103 |
+
【示例1】
|
| 104 |
+
- 原句: 「非但氣氛熱烈,而且售票情況也非常理想。」
|
| 105 |
+
- 正確: 「__3__氣氛熱烈,__3__售票情況也非常理想。」
|
| 106 |
+
- 錯誤(禁止): 「__3__⋯⋯__3__⋯⋯售票情況也非常理想。」
|
| 107 |
+
【示例2】
|
| 108 |
+
- 原句: 最後他們班不但贏得了比賽,也獲得了老師的讚賞。
|
| 109 |
+
- 正確: 最後他們班__6__贏得了比賽,__6__獲得了老師的讚賞。
|
| 110 |
+
- 錯誤(禁止): 最後他們班__6__贏得了比賽,也獲得了老師的讚賞。(少挖後半)
|
| 111 |
+
|
| 112 |
+
【選擇題設計】
|
| 113 |
+
每題需提供四個選項 (A~D),且只有一個正確答案。
|
| 114 |
+
正確答案必須取自[完整文章],與原文一字不差。
|
| 115 |
+
四個選項的字數盡量一致**(連詞對以「前半+後半」的總字數為準)。
|
| 116 |
+
- 若為[複句連詞對],四個選項都必須是[複句連詞對],刪節號使用「⋯⋯」。
|
| 117 |
+
- 不得出現單側連詞(如「不如果決」)或「連詞+詞彙」的混種格式。
|
| 118 |
+
至少一個錯誤選項必須設計為「誘答選項」(plausible distractor),不可與正解等義或可替代;錯誤選項不得與原文任一詞語完全一致。
|
| 119 |
+
6題選擇題選項之間,正確答案選項與候選選項必須避免重複或高度近似的詞彙。
|
| 120 |
+
- 不得在不同選項中重複出現同義詞(如「十分」「非常」「極度」)。
|
| 121 |
+
- 6題[題目]盡量涵蓋不同詞類(動詞、副詞、連詞、名詞、片語([複句連詞對])、量詞),確保多樣性。
|
| 122 |
+
|
| 123 |
+
【誘答選項定義與規則】
|
| 124 |
+
- 定義: 在選擇題或選詞填空題中,除了唯一正確答案之外,其餘錯誤但看似合理的選項,稱為「誘答選項」。
|
| 125 |
+
- 規則:
|
| 126 |
+
1. 正確答案只有一個。
|
| 127 |
+
2. 誘答選項必須是「錯誤答案」,不可與正確答案等義或合理替代。
|
| 128 |
+
3. 誘答選項應具備迷惑性,常見設計方式:
|
| 129 |
+
- 語義相反(如「不開心」 vs. 正確答案「開心」)。
|
| 130 |
+
- 搭配錯誤(詞性或語境不合)。
|
| 131 |
+
- 同類詞但不符合上下文(如形近詞、同字根詞,但意思放不進去)。
|
| 132 |
+
4. 不要產生與正確答案意義相近、且同樣合理的詞(例如正解「有趣」,不可出現「有意思」)。
|
| 133 |
+
5. 錯誤選項必須保證不與原文的任何詞語完全一致。
|
| 134 |
+
|
| 135 |
+
【克漏字填空生產流程】
|
| 136 |
+
1. 按照【要求】的內容從[完整文章]裡挖出6個挖空位置產出[挖空後的文章],格式請注意【挖空標記】、【複句連詞對挖空規則】裡的內容。
|
| 137 |
+
2. 將挖空的區域列為之後選擇題的正確答案
|
| 138 |
+
3. 按照【選擇題設計】的內容設計6題具多樣性及誘答性的6題選擇題,誘答性定義請參考【誘答選項定義與規則】,正確答案不要都是A選項,請平均分配。
|
| 139 |
+
4. 務必確認每題選擇題的每一個選項代入句子中的正確性,每題選擇題正確答案只能有一個,不能有兩個選項帶入句子皆合理的情況。
|
| 140 |
+
|
| 141 |
+
請根據【克漏字填空生產流程】的內容一步一步生成克漏字填空。
|
| 142 |
+
""")
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
# First-turn request: produce the complete article (no blanks) before any cloze holes are cut.
ASKING_PROMPT_1 = "請先根據【要求】的規範生成不帶任何空格的[完整文章]"
#ASKING_PROMPT = "請根據【克漏字填空生產流程】產出克漏字填空。"

# Same first-turn request, prefixed with the lesson the cloze article should extend.
# Placeholders: ${topic}, ${subtopic}, ${title}, ${pre_context}.
REFERENCE_ASKING_PROMPT_1_TEMPLATE = Template("""【前置課文資訊】
[主題] ${topic}
[副主題] ${subtopic}
[前置課文標題] ${title}
[前置課文內容] ${pre_context}
請思考讀者學習完【前置課文資訊】後的克漏字填空出題方向,其中克漏字填空裡的[完整文章]是【課文】的延伸,兩者需保持一定的相關性,但不需重複。請先根據【要求】的規範生成不帶任何空格的[完整文章]。
""")

# Follow-up prompts used when a generated cloze item is rejected.
CLOZE_FIX_PREFERENCE_PROMPT = "請根據上述審核結果,包含之前的簡短說明,及詳細不合格的理由,修改克漏字填空的內容。"
# The two difficulty prompts previously said "閱讀測驗" (reading test); this module
# generates cloze items, so they now name the cloze task like the prompt above.
CLOZE_FIX_HIGH_LEVEL_PROMPT = "文章難度過高,請重新閱讀【要求】、【範例】的內容後,降低克漏字填空的難度。"
CLOZE_FIX_LOW_LEVEL_PROMPT = "文章難度過低,請重新閱讀【要求】、【範例】的內容後,增加克漏字填空的難度。"
|
| 158 |
+
# 請先根據【要求】的規範生成不帶任何空格的「完整文章」。
|
| 159 |
+
|
| 160 |
+
# Few-shot example text, keyed by the level letter.
level2examples = {"B": example_B, "C": example_C}

# Vocabulary per level, selected by the numeric value stored in word2TBCL.
level_B_words = [entry for entry, tbcl in word2TBCL.items() if 4 <= tbcl <= 5]
level_C_words = [entry for entry, tbcl in word2TBCL.items() if 6 <= tbcl <= 7]

# Grammar-point names per level, selected by each record's "level" field.
level_B_grammars = [item["grammar_name"] for item in grammar_json if 1 <= item["level"] <= 3]
level_C_grammars = [item["grammar_name"] for item in grammar_json if 4 <= item["level"] <= 5]

# Flatten every group's "patterns" list into one list, in file order.
all_patterns = []
for group in pattern_json["patterns"]:
    all_patterns += group["patterns"]
|
| 168 |
+
|
| 169 |
+
def _build_cloze_schema():
    """Return the JSON-schema payload describing one generated cloze exercise."""
    # One question object; key order is kept because it is serialised as written.
    question_properties = {
        "題號": {
            "type": "integer",
            "minimum": 1,
            "maximum": 6,
            "description": "說明是哪個挖空位置的選擇題。",
        },
        "題目句": {
            "type": "string",
            "description": "挖空後文章的斷句,用來描述此挖空題的準確位置。",
            "minLength": 1,
        },
    }
    # The four answer options share one shape and differ only in their letter.
    for letter in ("A", "B", "C", "D"):
        question_properties[letter] = {
            "type": "string",
            "description": "選項 " + letter,
            "minLength": 1,
        }
    question_properties["答案"] = {
        "type": "string",
        "description": "正確選項。",
        "enum": ["A", "B", "C", "D"],
    }
    question_properties["解析"] = {
        "type": "string",
        "description": "挖空理由。",
    }

    question_item = {
        "type": "object",
        "properties": question_properties,
        "required": ["題號", "題目句", "A", "B", "C", "D", "答案", "解析"],
        "additionalProperties": False,
    }

    return {
        "name": "cloze_output",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "完整文章": {
                    "type": "string",
                    "description": "克漏字填空的完整文章,不含任何_或⋯。",
                },
                "挖空後的文章": {
                    "type": "string",
                    "description": "克漏字填空挖空後的文章。",
                },
                "題目列表": {
                    "type": "array",
                    "description": "本篇文章對應的選擇題列表(1~6題)。",
                    "minItems": 6,
                    "maxItems": 6,
                    "items": question_item,
                },
            },
            "required": ["完整文章", "挖空後的文章", "題目列表"],
            "additionalProperties": False,
        },
    }


# Read-only at the top level only; nested dicts remain ordinary mutable dicts.
CLOZE_SCHEMA = MappingProxyType(_build_cloze_schema())
|
| 225 |
+
|
| 226 |
+
def _cloze_level_entry(words, grammars, examples, preferences):
    """Build the configuration mapping for one difficulty level.

    Only the four arguments differ between levels; every other value is the
    shared module-level prompt, template, pattern list or schema.
    """
    return {
        "word_list": words,
        "grammar_list": grammars,
        "all_patterns": all_patterns,
        "examples": examples,
        "system_prompt": SYSTEM_PROMPT,
        "prompt_definition_template": CLOZE_PROMPT_DEFINITION_TEMPLATE,
        "asking_prompt_1": ASKING_PROMPT_1,
        # The key is spelled "perference_prompts" in the original and is kept as-is.
        "perference_prompts": preferences,
        "asking_prompt_2_template": ASKING_PROMPT_2_TEMPLATE,
        "reference_asking_prompt_1_template": REFERENCE_ASKING_PROMPT_1_TEMPLATE,
        "json_schema": CLOZE_SCHEMA,
    }


# Per-level generation settings, keyed by the level label.
CLOZE_LEVEL_CONFIG = {
    "B-進階高階": _cloze_level_entry(
        level_B_words, level_B_grammars, level2examples["B"], level_B_perferences
    ),
    "C-流利精通": _cloze_level_entry(
        level_C_words, level_C_grammars, level2examples["C"], level_C_perferences
    ),
}
|
util/fileio/__init__.py
ADDED
|
File without changes
|
util/fileio/jsonio.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import io
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import gzip
|
| 6 |
+
import tempfile
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any, Iterable, Iterator, Union, Optional, overload, List, Dict
|
| 9 |
+
|
| 10 |
+
PathLike = Union[str, Path]
|
| 11 |
+
|
| 12 |
+
class FileIOError(Exception):
    """Base exception covering read, write and parse failures."""


class JSONDecodeError(FileIOError):
    """Raised when JSON content cannot be parsed."""
|
| 17 |
+
|
| 18 |
+
class AtomicWriter:
    """Context manager that writes a text file atomically.

    Content is written to a temporary file in the target's directory and moved
    over the target with ``os.replace()`` only if the ``with`` body finished
    without an exception. With ``gzip_mode=True`` the stream is gzip-compressed.

    The temporary file is removed on every failure path: an exception in the
    ``with`` body, a failure to open the stream, or a failure while closing
    the stream or replacing the target.
    """

    def __init__(self, target: Path, gzip_mode: bool = False, encoding: str = "utf-8", newline: str = "\n"):
        self.target = target
        self.dir = target.parent
        self.encoding = encoding
        self.newline = newline
        self.gzip_mode = gzip_mode
        self._tmp_path: Optional[Path] = None
        self._fh: Optional[io.TextIOBase] = None

    def _discard_tmp(self) -> None:
        """Delete the temporary file if one is still registered (best effort)."""
        tmp, self._tmp_path = self._tmp_path, None
        if tmp is None:
            return
        try:
            tmp.unlink()
        except OSError:
            # Cleanup is best effort; the original error (if any) matters more.
            pass

    def __enter__(self) -> io.TextIOBase:
        self.dir.mkdir(parents=True, exist_ok=True)
        fd, tmp_name = tempfile.mkstemp(prefix=f".{self.target.name}.", dir=str(self.dir))
        os.close(fd)  # reopened below through a high-level text file object
        self._tmp_path = Path(tmp_name)

        try:
            if self.gzip_mode:
                f = gzip.open(self._tmp_path, mode="wt", encoding=self.encoding, newline=self.newline)
            else:
                f = open(self._tmp_path, mode="w", encoding=self.encoding, newline=self.newline)
        except BaseException:
            # e.g. unknown encoding: do not leave the mkstemp file behind.
            self._discard_tmp()
            raise
        self._fh = f
        return f

    def __exit__(self, exc_type, exc, tb) -> None:
        try:
            if self._fh is not None:
                self._fh.close()
            if exc_type is None:
                assert self._tmp_path is not None
                os.replace(self._tmp_path, self.target)
                # The temp name no longer exists; nothing left to clean up.
                self._tmp_path = None
        finally:
            self._fh = None
            # Runs when the body failed, and also when close()/replace() raised.
            self._discard_tmp()
|
| 58 |
+
|
| 59 |
+
def _open_read(path: Path, encoding: str = "utf-8") -> io.TextIOBase:
|
| 60 |
+
if path.suffix == ".gz":
|
| 61 |
+
return gzip.open(path, mode="rt", encoding=encoding)
|
| 62 |
+
return open(path, mode="r", encoding=encoding)
|
| 63 |
+
|
| 64 |
+
def _needs_gzip(path: Path) -> bool:
|
| 65 |
+
return path.suffix == ".gz"
|
| 66 |
+
|
| 67 |
+
class JsonIO:
    """Read and write a single JSON document.

    - Non-ASCII text is written as-is by default (``ensure_ascii=False``).
    - Output is indented by two spaces unless another ``indent`` is given.
    - Paths ending in ``.gz`` are read and written through gzip.
    """

    def __init__(self, path: PathLike, encoding: str = "utf-8"):
        self.path = Path(path)
        self.encoding = encoding

    def load(self) -> Any:
        """Parse and return the document.

        Raises:
            JSONDecodeError: the content is not valid JSON.
            FileIOError: any other failure while opening or reading the file.
        """
        try:
            with _open_read(self.path, encoding=self.encoding) as handle:
                document = json.load(handle)
        except json.JSONDecodeError as err:
            raise JSONDecodeError(f"JSON decode failed at {self.path}: {err}") from err
        except Exception as err:
            raise FileIOError(f"Failed to read {self.path}: {err}") from err
        return document

    def save(self, obj: Any, indent: int = 2, ensure_ascii: bool = False) -> None:
        """Serialise *obj* to the path atomically, ending with a newline.

        Raises:
            FileIOError: serialisation or writing failed.
        """
        compress = _needs_gzip(self.path)
        try:
            with AtomicWriter(self.path, gzip_mode=compress, encoding=self.encoding) as handle:
                json.dump(obj, handle, ensure_ascii=ensure_ascii, indent=indent)
                handle.write("\n")
        except Exception as err:
            raise FileIOError(f"Failed to write {self.path}: {err}") from err
|
| 95 |
+
|
| 96 |
+
class JsonlIO:
    """Line-by-line access to a JSONL (NDJSON) file.

    - Lazy iteration for reading (memory friendly).
    - Atomic overwrite via ``save_all``.
    - Non-atomic ``append``.
    - Paths ending in ``.gz`` are read and written through gzip.
    """

    def __init__(self, path: PathLike, encoding: str = "utf-8"):
        self.path = Path(path)
        self.encoding = encoding

    @staticmethod
    def _write_rows(f, rows: Iterable[Dict[str, Any]], ensure_ascii: bool) -> None:
        """Write each row as one JSON document followed by a newline."""
        for row in rows:
            f.write(json.dumps(row, ensure_ascii=ensure_ascii))
            f.write("\n")

    def iter_load(self) -> Iterator[Dict[str, Any]]:
        """Yield one parsed object per non-blank line.

        Raises:
            JSONDecodeError: a line is not valid JSON (message has the line number).
            FileIOError: any other failure while opening or reading the file.
        """
        try:
            with _open_read(self.path, encoding=self.encoding) as f:
                for i, line in enumerate(f, start=1):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        row = json.loads(line)
                    except json.JSONDecodeError as e:
                        raise JSONDecodeError(f"JSONL decode failed at line {i} in {self.path}: {e}") from e
                    yield row
        except FileIOError:
            # Already one of this module's errors (notably JSONDecodeError above).
            # Without this clause the generic handler below re-wrapped it as a
            # plain FileIOError and callers could never catch JSONDecodeError.
            raise
        except Exception as e:
            raise FileIOError(f"Failed to read {self.path}: {e}") from e

    def load_all(self) -> List[Dict[str, Any]]:
        """Load every row into a list (convenient for small files)."""
        return list(self.iter_load())

    def save_all(self, rows: Iterable[Dict[str, Any]], ensure_ascii: bool = False) -> None:
        """Atomically replace the file with *rows*.

        Raises:
            FileIOError: serialisation or writing failed.
        """
        gzip_mode = _needs_gzip(self.path)
        try:
            with AtomicWriter(self.path, gzip_mode=gzip_mode, encoding=self.encoding) as f:
                self._write_rows(f, rows, ensure_ascii)
        except Exception as e:
            raise FileIOError(f"Failed to write {self.path}: {e}") from e

    def append(self, rows: Iterable[Dict[str, Any]], ensure_ascii: bool = False) -> None:
        """Append *rows* to the file (not atomic; meant for log-style data).

        Raises:
            FileIOError: serialisation or writing failed.
        """
        try:
            # Create the parent directory for both branches; previously only
            # the plain-text branch did, so gzip appends to a new directory failed.
            self.path.parent.mkdir(parents=True, exist_ok=True)
            if _needs_gzip(self.path):
                # gzip has no in-place random-access append; mode "at" adds
                # the rows to the end of the existing compressed file.
                with gzip.open(self.path, mode="at", encoding=self.encoding, newline="\n") as f:
                    self._write_rows(f, rows, ensure_ascii)
            else:
                with open(self.path, mode="a", encoding=self.encoding, newline="\n") as f:
                    self._write_rows(f, rows, ensure_ascii)
        except Exception as e:
            raise FileIOError(f"Failed to append {self.path}: {e}") from e
|
util/judgement.py
ADDED
|
@@ -0,0 +1,469 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json, time, re, opencc, math
|
| 2 |
+
# region agent log
|
| 3 |
+
import os as _os, sys as _sys
|
| 4 |
+
def _agent_log(_hypothesisId: str, _location: str, _message: str, _data: dict):
|
| 5 |
+
try:
|
| 6 |
+
_p = r"c:\Users\tom1030507\Desktop\Programming\study_prompt_UI\.cursor\debug.log"
|
| 7 |
+
_payload = {
|
| 8 |
+
"sessionId": "debug-session",
|
| 9 |
+
"runId": _os.environ.get("AGENT_RUN_ID", "pre-fix"),
|
| 10 |
+
"hypothesisId": _hypothesisId,
|
| 11 |
+
"location": _location,
|
| 12 |
+
"message": _message,
|
| 13 |
+
"data": _data,
|
| 14 |
+
"timestamp": int(time.time() * 1000),
|
| 15 |
+
}
|
| 16 |
+
with open(_p, "a", encoding="utf-8") as _f:
|
| 17 |
+
_f.write(json.dumps(_payload, ensure_ascii=False) + "\n")
|
| 18 |
+
except Exception:
|
| 19 |
+
pass
|
| 20 |
+
|
| 21 |
+
_agent_log(
|
| 22 |
+
"H_PATH",
|
| 23 |
+
"util/judgement.py:module_import",
|
| 24 |
+
"judgement module import start",
|
| 25 |
+
{
|
| 26 |
+
"__file__": __file__,
|
| 27 |
+
"cwd": _os.getcwd(),
|
| 28 |
+
"sys.executable": _sys.executable,
|
| 29 |
+
"sys.path_head": _sys.path[:5],
|
| 30 |
+
},
|
| 31 |
+
)
|
| 32 |
+
# endregion
|
| 33 |
+
#from __future__ import annotations
|
| 34 |
+
from dataclasses import dataclass
|
| 35 |
+
from typing import Any, Dict, List, Tuple, Optional, Literal
|
| 36 |
+
from openai import AzureOpenAI
|
| 37 |
+
from openai._exceptions import APIError, RateLimitError, APITimeoutError
|
| 38 |
+
from typing import Any, Mapping
|
| 39 |
+
from types import MappingProxyType
|
| 40 |
+
|
| 41 |
+
# Optional heavy deps (allow importing prompt constants without installing ML stack)
|
| 42 |
+
try:
|
| 43 |
+
import torch # type: ignore
|
| 44 |
+
import torch.nn as nn # type: ignore
|
| 45 |
+
except ModuleNotFoundError: # pragma: no cover
|
| 46 |
+
torch = None # type: ignore
|
| 47 |
+
nn = None # type: ignore
|
| 48 |
+
|
| 49 |
+
try:
|
| 50 |
+
from transformers import AutoModel, AutoTokenizer # type: ignore
|
| 51 |
+
except ModuleNotFoundError: # pragma: no cover
|
| 52 |
+
AutoModel = None # type: ignore
|
| 53 |
+
AutoTokenizer = None # type: ignore
|
| 54 |
+
|
| 55 |
+
# region agent log
|
| 56 |
+
_agent_log(
|
| 57 |
+
"H_DEPS",
|
| 58 |
+
"util/judgement.py:optional_deps",
|
| 59 |
+
"optional deps availability",
|
| 60 |
+
{
|
| 61 |
+
"torch_available": torch is not None,
|
| 62 |
+
"transformers_available": AutoModel is not None and AutoTokenizer is not None,
|
| 63 |
+
},
|
| 64 |
+
)
|
| 65 |
+
# endregion
|
| 66 |
+
|
| 67 |
+
# ---------- Utilities ----------
|
| 68 |
+
|
| 69 |
+
class OpenCCChecker:
    """Detect Simplified Chinese by checking whether an OpenCC conversion changes the text.

    The two converters are created on first use and cached on the class.
    """

    _cc_s2t = None    # converter built from the 's2t.json' configuration
    _cc_s2twp = None  # converter built from the 's2twp.json' configuration

    @classmethod
    def ensure_init(cls):
        """Create whichever cached converters have not been built yet."""
        if cls._cc_s2t is None:
            cls._cc_s2t = opencc.OpenCC('s2t.json')
        if cls._cc_s2twp is None:
            cls._cc_s2twp = opencc.OpenCC('s2twp.json')

    @classmethod
    def has_simplified(cls, text: str, prefer_general_word_check: bool = False) -> bool:
        """Return True when converting *text* yields a different string.

        ``prefer_general_word_check`` selects the 's2twp.json' converter
        instead of the default 's2t.json' one.
        """
        cls.ensure_init()
        if prefer_general_word_check:
            converter = cls._cc_s2twp
        else:
            converter = cls._cc_s2t
        converted = converter.convert(text)
        return text != converted
|
| 85 |
+
|
| 86 |
+
@dataclass(frozen=True, slots=True)
|
| 87 |
+
class PromptSet:
|
| 88 |
+
system: str
|
| 89 |
+
additional_check: str
|
| 90 |
+
json_schema: Mapping[str, Any]
|
| 91 |
+
|
| 92 |
+
# Structured-output schema for a verdict: a pass/fail value plus a short reason.
# Read-only at the top level only; the nested dicts are ordinary dicts.
QUALIFICATION_SCHEMA = MappingProxyType(dict(
    name="qualification_check",
    strict=True,
    schema=dict(
        type="object",
        properties={
            "合格/不合格": dict(
                type="string",
                description="根據歷史紀錄判定結果為『合格』或『不合格』,必須為這兩個之一。",
                enum=["合格", "不合格"],
            ),
            "簡短說明": dict(
                type="string",
                description="針對檢查結果簡要說明原因或依據,不得為空。",
                minLength=1,
            ),
        },
        required=["合格/不合格", "簡短說明"],
        additionalProperties=False,
    ),
))
|
| 113 |
+
|
| 114 |
+
# Prompt set for reviewing reading-comprehension items (an article plus
# multiple-choice questions). `system` holds review criteria 1-5;
# `additional_check` is the second-round message whose examples are numbered
# 6.1-6.3 and which asks whether options give the answer away without the article.
READING_JUDGE_PROMPTS = PromptSet(
    system=(
        "你是一位專業的華語審題老師,長期負責《閱讀測驗》的審核工作。\n"
        "請嚴格依據以下【審題標準】進行判斷:\n"
        "【題目說明】\n"
        "每個閱讀測驗由「文章」和「題目」所組成,「文章」由一連串文字組成,與學校的課文相似;"
        "「題目」為選擇題,提出一個問題後給予(A)、(B)、(C)、(D)四個可能的答案供選擇。\n"
        "【審題標準】\n"
        "1. 「文章」和「題目」不可以出現簡體字。\n"
        "2. 「文章」和「題目」中不能出現明顯的負面用語,需要用較為客觀的用詞取代。例如:「奧客」應改為「惡劣的顧客」。若有此情況視為不合格。\n"
        "3. 「文章」和「題目」語意需清楚,不能出現模稜兩可或不合時宜的病句或詞彙。舉例:「平時我們也常從新聞看到有人因為隨手滑手機而被車撞。」"
        "應改為「平時我們也常從新聞看到有人因為一邊走路一邊滑手機,而被車撞。」……(略)\n"
        "4. 每一個「題目」必須且只能有一個標準答案(唯一性)。\n"
        "5. 「文章」不得有邏輯衝突(身份衝突、視角矛盾等)。"
    ),
    additional_check=(
        "在不考慮「文章」只看「題目」(包含選項)的情況下,確認每一個「題目」裡的每一個選項是否也有很高的機率選出正確答案,只要有一個這種「明顯錯誤」的選項,一律視為「整題不合格」,請嚴格執行此項判斷,以下是一些「選項明顯錯誤」的案例:\n"
        " 6.1 選項中出現明顯錯誤、負面詞語(如「只有」、「只」、「僅以」)或「過度絕對化」的敘述(如「只會增加困難與挫折」、「認為文學只適合考試使用」)。\n"
        " 6.2 選項答案長度差異過大,使正確選項顯而易見。\n"
        " 6.3 選項描述內容高度雷同,使正確答案突出。\n"
        " 例如:根據文章內容,作者如何總結經濟發展與環境保護之間的關係?\n"
        " (A) 經濟發展必然以環境犧牲為代價\n"
        " (B) 兩者本質上無法兼容,須取捨其一\n"
        " (C) 若結合在地經濟行動與濕地復育,反而有助於生態與地方發展\n"
        " (D) 經濟行動與環境保護之間缺乏交集,各自為政"
    ),
    json_schema=QUALIFICATION_SCHEMA
)
|
| 142 |
+
|
| 143 |
+
# Prompt set for reviewing cloze (fill-in-the-blank) items. `system` holds
# review criteria 1-4; `additional_check` is the second-round message asking
# whether two consecutive answers should have been one paired-conjunction blank.
CLOZE_JUDGE_PROMPTS = PromptSet(
    system=(
        "你是一位專業的華語審題老師,長期負責《填空題》的審核工作。\n"
        "請嚴格依據以下【審題標準】進行判斷:\n"
        "【題目說明】\n"
        "每個填空題由「挖空後的文章」和「題目」所組成:\n"
        "「挖空後的文章」中的挖空位置以「__(題號)__」或「__(題號)__⋯⋯__(題號)__」表示,像是「__1__」或「__1__⋯⋯__1__」為給題目1的挖空位置,後者為片語形式。;"
        "「題目」為選擇題,每題對應到一個位置,會有(A)、(B)、(C)、(D)四個詞彙、語法或成語供選擇。\n\n"
        "【審題標準】\n"
        "1. 「挖空後的文章」和「題目」不可以出現簡體字。\n"
        "2. 「挖空後的文章」和「題目」中不能出現明顯的負面用語,需要用較為客觀的用詞取代。例如:「奧客」應改為「惡劣的顧客」。若有此情況視為不合格。\n"
        "3. 每一個「題目」必須且只能有一個標準答案(唯一性),因此請確定每一個題目每一個選項代入句子後的合理性,每一題只能有一個選項合理。\n"
        # Typo fix: this criterion previously read "挖控"; the intended word is
        # "挖空" (the blank), as used everywhere else in this prompt.
        "4. 挖空的位置是否正確。\n"
        "請根據上述標準,逐一檢查每一個項目,並提供最終評判。"
    ),
    additional_check=(
        "除了原先的【審題標準】,請逐一檢查連續兩題的答案是否能合併為一個片語,若能合併則此填空題應判定為不合格,舉例來說:\n"
        "如果「雖然」是第一題的答案,「但是」是第二題的答案,這兩題又在同一句話中出現且可以合併為「雖然⋯⋯但是」,那應該將這兩題合併為第一題,因此此填空題應判定為不合格。"
    ),
    json_schema=QUALIFICATION_SCHEMA
)
|
| 164 |
+
|
| 165 |
+
# Maps a mode name to the PromptSet used for that kind of review.
# PreferenceJudge looks modes up here by their case-folded name, and
# PreferenceJudge.register_mode adds entries at runtime.
PROMPT_REGISTRY: dict[str, PromptSet] = {
    "reading": READING_JUDGE_PROMPTS,
    "cloze": CLOZE_JUDGE_PROMPTS,
}

# Names of the two modes registered above.
Mode = Literal["reading", "cloze"]
|
| 171 |
+
|
| 172 |
+
if nn is None or AutoModel is None:
    # Optional ML dependencies are missing; LevelJudge checks for this sentinel.
    LevelJudgementModel = None  # type: ignore
else:
    class LevelJudgementModel(nn.Module):
        """Pretrained encoder followed by a linear classification head."""

        def __init__(self, model_name: str, num_labels: int = 3):
            super().__init__()
            self.bert = AutoModel.from_pretrained(model_name)
            # Read the width from the loaded config rather than hard-coding 1024.
            hidden = self.bert.config.hidden_size
            # "classifier" is the default attribute name used by Hugging Face's
            # BertForSequenceClassification.
            self.classifier = nn.Linear(hidden, num_labels)
            self.save_hyperparams = {"model_name": model_name, "num_labels": num_labels}

        def forward(self, input_ids, attention_mask):
            """Return classification logits computed from the first token's hidden state.

            Differs slightly from Hugging Face's BertForSequenceClassification:
            there is no tanh activation before the classifier.
            """
            encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            cls_state = encoder_out["last_hidden_state"][:, 0, :]  # [CLS] position
            return self.classifier(cls_state)
|
| 190 |
+
|
| 191 |
+
class PreprocessTool():
    """Flatten a reviewed sample record into the plain-text prompt and its label."""

    question_keys = ["題目1", "題目2", "題目3", "題目4", "題目5", "題目6"]

    # Compiled once as raw strings. The original non-raw literal contained "\(",
    # an invalid string escape that triggers a SyntaxWarning on Python 3.12+.
    # The regex engine reads the raw "\n" as a newline, so matching is unchanged.
    _READING_META_RE = re.compile(r"【.*】\n|[\((]*(考點類型|事實理解|歸納分析|推論判斷|批判評析).*\n")
    _READING_ANSWER_RE = re.compile(r"參考答案: .")
    _CLOZE_ANSWER_RE = re.compile(r"參考答案:.")

    @staticmethod
    def _normalize(prompt: str) -> str:
        """Apply the punctuation/space clean-up shared by both prompt builders."""
        # NOTE(review): as written, the next three replacements map a character
        # to itself and change nothing; presumably the first argument was meant
        # to be the full-width form of each character. Confirm against the
        # original source before altering them.
        prompt = prompt.replace("?", "?")
        prompt = prompt.replace("(", "(")
        prompt = prompt.replace(")", ")")
        prompt = prompt.replace(" ", "")
        return prompt.strip()

    @classmethod
    def make_reading_prompt(cls, sample_data: dict) -> Tuple[str, Any]:
        """Build the reading-comprehension prompt.

        Minimum required fields for sample_data:
        ['文章', '題目1', '題目2', '題目3', '題目4', '題目5', '題目6', '通過/不通過']

        Returns:
            ``(prompt, label)`` where ``label`` is ``sample_data["通過/不通過"]``.
        """
        parts = ["文章:\n" + sample_data["文章"] + "\n\n" + "題目:" + "\n"]
        for question_key in cls.question_keys:
            question_str = sample_data[question_key]
            # Drop bracketed headers / question-type lines, then the answer key.
            question_str = cls._READING_META_RE.sub("", question_str)
            question_str = cls._READING_ANSWER_RE.sub("", question_str)
            parts.append(question_str.strip() + "\n")
        return cls._normalize("".join(parts)), sample_data["通過/不通過"]

    @classmethod
    def make_cloze_prompt(cls, sample_data: dict) -> Tuple[str, Any]:
        """Build the cloze prompt.

        Minimum required fields for sample_data:
        ['挖空後文章', '題目1', '題目2', '題目3', '題目4', '題目5', '題目6', '通過/不通過']

        Returns:
            ``(prompt, label)`` where ``label`` is ``sample_data["通過/不通過"]``.
        """
        parts = [sample_data["挖空後文章"] + "\n"]
        for question_key in cls.question_keys:
            question_str = cls._CLOZE_ANSWER_RE.sub("", sample_data[question_key])
            parts.append(question_str.strip() + "\n")
        return cls._normalize("".join(parts)), sample_data["通過/不通過"]
|
| 227 |
+
|
| 228 |
+
# ---------- Core client wrapper ----------
|
| 229 |
+
|
| 230 |
+
class LLMClient:
    """Wrapper around a chat-completions client adding retry and logprob reranking.

    Attributes:
        client: object exposing ``chat.completions.create(**kwargs)``.
        model: model / deployment name sent with every request.
        temperature: default sampling temperature.
        max_tokens: optional cap forwarded as ``max_tokens`` when not None.
        chat_rerank_log: scored candidates from the most recent ``chat_rerank``.
    """

    # 4xx statuses that may succeed on a later attempt. Any other 4xx (e.g. a
    # 400 validation error) fails the same way every time, so it is not retried.
    _RETRYABLE_4XX = frozenset({408, 409, 429})

    def __init__(self, client: AzureOpenAI, model: str, temperature: float = 0, max_tokens: Optional[int] = None):
        self.client = client
        self.model = model
        self.temperature = float(temperature)
        self.max_tokens = max_tokens
        self.chat_rerank_log = []

    def _call_chat(self, **kwargs):
        """Single call site for chat completions, with exponential backoff.

        Makes up to four attempts with sleeps of 0.5/1/2/4 s after failures,
        then one final attempt whose exception propagates. Errors carrying a
        non-retryable 4xx ``status_code`` are re-raised at once instead of
        being retried, so callers that fall back on such an error (see
        ``chat_rerank``) are not delayed by the backoff.
        """
        backoff = 0.5
        for _ in range(4):
            try:
                return self.client.chat.completions.create(**kwargs)
            except (RateLimitError, APITimeoutError, APIError) as err:
                status = getattr(err, "status_code", None)
                if status is not None and 400 <= status < 500 and status not in self._RETRYABLE_4XX:
                    raise
                time.sleep(backoff)
                backoff *= 2
        # Last attempt: let any exception reach the caller.
        return self.client.chat.completions.create(**kwargs)

    def chat(
        self,
        messages: List[Dict[str, str]],
        response_format: Optional[Dict[str, Any]] = None
    ) -> str:
        """Send *messages* and return the first choice's text ("" when empty)."""
        kwargs = dict(model=self.model, messages=messages, temperature=self.temperature)
        if self.max_tokens is not None:
            kwargs["max_tokens"] = self.max_tokens
        if response_format:
            kwargs["response_format"] = response_format
        resp = self._call_chat(**kwargs)
        return resp.choices[0].message.content or ""

    @staticmethod
    def _token_logprobs(choice) -> List[float]:
        """Extract per-token logprobs from a choice, tolerating two response layouts."""
        lp = getattr(choice, "logprobs", None)
        if not lp:
            return []
        # Chat layout: choice.logprobs.content -> list of items with a .logprob
        content = getattr(lp, "content", None)
        if isinstance(content, list) and content:
            return [
                float(item.logprob)
                for item in content
                if getattr(item, "logprob", None) is not None
            ]
        # Alternative layout: a flat list under token_logprobs / logprobs
        logps = getattr(lp, "token_logprobs", None) or getattr(lp, "logprobs", None)
        if isinstance(logps, list):
            return [float(x) for x in logps]
        return []

    def chat_rerank(
        self,
        messages: List[Dict[str, str]],
        n: int = 2,
        temperature: Optional[float] = None,
        response_format: Optional[Dict[str, Any]] = None,
        top_logprobs: int = 1,
        length_norm: bool = True
    ) -> Dict[str, Any]:
        """Sample *n* candidates and rank them by token log-probability.

        Args:
            messages: chat messages to send.
            n: number of candidates requested.
            temperature: overrides the instance default when given.
            response_format: forwarded unchanged when truthy.
            top_logprobs: value sent as ``top_logprobs`` (or as ``logprobs``
                in the fallback request).
            length_norm: True scores by mean token logprob, False by the sum.

        Returns:
            {
                "best_text": str,
                "best_index": int,    # index within the API response
                "best_score": float,  # exp(mean or summed logprob)
                "log": [{"index", "text", "score", "token_logprobs"}, ...],
                "candidates": same list as "log",
            }
            The list is sorted best first. "candidates" is the key the original
            docstring promised; "log" is kept for existing callers.
        """
        t = self.temperature if temperature is None else float(temperature)

        base_kwargs = dict(
            model=self.model,
            messages=messages,
            n=int(n),
            temperature=t
        )
        if self.max_tokens is not None:
            base_kwargs["max_tokens"] = self.max_tokens
        if response_format:
            base_kwargs["response_format"] = response_format

        # First try the OpenAI-style parameters: logprobs=True + top_logprobs=int
        kwargs = dict(base_kwargs)
        kwargs["logprobs"] = True
        kwargs["top_logprobs"] = int(top_logprobs)

        try:
            resp = self._call_chat(**kwargs)
        except APIError:
            # Per the original note, some Azure versions validate logprobs as an
            # int: retry with logprobs=int and no top_logprobs. At least 1 is
            # needed to get the chosen token's logprob back.
            kwargs = dict(base_kwargs)
            kwargs["logprobs"] = max(1, int(top_logprobs))
            resp = self._call_chat(**kwargs)

        def score_of(logps: List[float]) -> float:
            if not logps:
                return float("-inf")
            return (sum(logps) / len(logps)) if length_norm else sum(logps)

        # Score every candidate; a candidate without logprobs scores exp(-inf) == 0.0
        scored = []
        for i, ch in enumerate(resp.choices):
            token_logps = self._token_logprobs(ch)
            scored.append({
                "index": i,
                "text": (ch.message.content or ""),
                "score": math.exp(score_of(token_logps)),
                "token_logprobs": token_logps
            })

        scored.sort(key=lambda x: x["score"], reverse=True)
        best = scored[0] if scored else {"index": -1, "text": "", "score": float("-inf"), "token_logprobs": []}
        self.chat_rerank_log = scored

        return {
            "best_text": best["text"],
            "best_index": best["index"],
            "best_score": best["score"],
            "log": scored,
            "candidates": scored,
        }
|
| 359 |
+
|
| 360 |
+
# ---------- Orchestrator ----------
|
| 361 |
+
|
| 362 |
+
class PreferenceJudge:
    """Run the multi-round LLM review of one generated item.

    The prompt set comes from ``PROMPT_REGISTRY`` (selected by ``mode``) unless
    ``prompts`` is passed explicitly, in which case ``mode`` is ignored and
    ``self.mode`` is ``"custom"``.
    """

    def __init__(
        self,
        llm: LLMClient,
        mode: Mode = "reading",
        prompts: Optional[PromptSet] = None,  # overrides mode when given
        json_schema_override: Optional[Mapping[str, Any]] = None,
    ):
        """
        Args:
            llm: client used for every round.
            mode: registry key, matched case-insensitively.
            prompts: explicit prompt set; takes precedence over ``mode``.
            json_schema_override: schema used instead of the prompt set's own.

        Raises:
            ValueError: ``prompts`` is None and ``mode`` is not registered.
        """
        self.llm = llm

        if prompts is None:
            key = mode.casefold()
            try:
                self.prompts = PROMPT_REGISTRY[key]
            except KeyError as e:
                valid = ", ".join(sorted(PROMPT_REGISTRY.keys()))
                raise ValueError(f"Invalid mode '{mode}'. Valid modes: {valid}") from e
            self.mode = key
        else:
            self.prompts = prompts
            self.mode = "custom"

        # Use the prompt set's schema unless an override was supplied.
        base_schema = json_schema_override if json_schema_override is not None else self.prompts.json_schema
        # Wrap as read-only so it is not modified at runtime (top level only).
        self.json_schema = base_schema if isinstance(base_schema, MappingProxyType) else MappingProxyType(dict(base_schema))

    @classmethod
    def register_mode(cls, name: str, prompts: PromptSet) -> None:
        """Register an additional mode under the case-folded *name*."""
        PROMPT_REGISTRY[name.casefold()] = prompts

    @staticmethod
    def contains_simplified(text: str, enable_general_word_check: bool = False) -> bool:
        """Delegate to ``OpenCCChecker.has_simplified``."""
        return OpenCCChecker.has_simplified(text, prefer_general_word_check=enable_general_word_check)

    def judge(self, reading_task_prompt: str, run_additional_check: bool = True, run_json_schema: bool = True) -> Tuple[List[Dict[str, str]], Dict[str, Any]]:
        """Review one item and return ``(messages, parsed_result)``.

        Round 1 is the base review. Round 2 (optional) sends the prompt set's
        ``additional_check``. Round 3 (optional) asks for a JSON verdict.

        Returns:
            The full message history, and either the parsed JSON verdict,
            ``{"error": ..., "raw": ...}`` when that reply is not valid JSON,
            or ``{"raw": last_reply}`` when ``run_json_schema`` is False.
        """
        msgs: List[Dict[str, str]] = [
            {"role": "developer", "content": self.prompts.system},
            {"role": "user", "content": reading_task_prompt},
        ]

        # Round 1: base review
        resp1 = self.llm.chat(msgs)
        msgs.append({"role": "assistant", "content": resp1})

        # Round 2: second-round check (can be disabled)
        if run_additional_check:
            msgs.append({"role": "user", "content": self.prompts.additional_check})
            resp2 = self.llm.chat(msgs)
            msgs.append({"role": "assistant", "content": resp2})

        # Round 3: JSON-schema output (can be disabled)
        if run_json_schema:
            resp3 = self.llm.chat(
                msgs,
                # Use the schema resolved in __init__; this previously read
                # self.prompts.json_schema, which silently ignored
                # json_schema_override. A plain dict copy is sent rather than
                # the read-only proxy.
                response_format={"type": "json_schema", "json_schema": dict(self.json_schema)}
            )
            msgs.append({"role": "assistant", "content": resp3})
            try:
                parsed = json.loads(resp3)
            except json.JSONDecodeError:
                parsed = {"error": "model did not return valid JSON", "raw": resp3}
        else:
            parsed = {"raw": msgs[-1]["content"]}
        return msgs, parsed
|
| 428 |
+
|
| 429 |
+
class LevelJudge:
    """Predict the proficiency level of a text with a fine-tuned classifier checkpoint."""

    def __init__(self, model_path: str, device: str = "cuda"):
        """Load the checkpoint at *model_path* onto *device*.

        The checkpoint is read for two entries: "hparams" (constructor
        arguments for LevelJudgementModel, including "model_name") and
        "model_state_dict".

        Raises:
            ModuleNotFoundError: torch / transformers are not installed.
        """
        deps_missing = torch is None or AutoTokenizer is None or LevelJudgementModel is None
        if deps_missing:
            # region agent log
            _agent_log(
                "H_DEPS",
                "util/judgement.py:LevelJudge.__init__",
                "LevelJudge missing optional deps",
                {
                    "torch_available": torch is not None,
                    "transformers_available": AutoTokenizer is not None,
                    "LevelJudgementModel_available": LevelJudgementModel is not None,
                    "model_path": model_path,
                    "device": device,
                },
            )
            # endregion
            raise ModuleNotFoundError(
                "LevelJudge requires optional dependencies. Install 'torch' and 'transformers' to use it."
            )

        # Original author's note: PyTorch 2.6 defaults to weights_only=True,
        # which is fine for loading a state_dict.
        checkpoint = torch.load(model_path, map_location=device)
        hyperparams = checkpoint["hparams"]

        self.model = LevelJudgementModel(**hyperparams).to(device)
        self.model.load_state_dict(checkpoint["model_state_dict"], strict=True)
        # The attribute is spelled "tonkenizer" in the original; judge() reads that name.
        self.tonkenizer = AutoTokenizer.from_pretrained(hyperparams["model_name"])
        self.model.eval()
        self.device = device

    def judge(self, text):
        """Predicts level probabilities.

        Returns a dict mapping each level label to its softmax probability
        rounded to three decimals.
        """
        encoded = self.tonkenizer(
            text, max_length=512, padding=True, truncation=True, return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            logits = self.model(encoded["input_ids"], encoded["attention_mask"])
            probs = torch.nn.functional.softmax(logits, dim=1).squeeze().tolist()

        id2label = {0:'入門基礎', 1:'進階高階', 2:'流利精通'}
        predictions = {}
        for idx, prob in enumerate(probs):
            predictions[id2label[idx]] = round(prob, 3)
        return predictions
|
util/reading_constants.py
ADDED
|
@@ -0,0 +1,450 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from string import Template
|
| 3 |
+
from types import MappingProxyType
|
| 4 |
+
SYSTEM_PROMPT = "你是一位在台灣從事國文教育多年有趣的華語老師,這次負責出閱讀測驗的題目。"
|
| 5 |
+
READING_TASK_DEFINITION = "每個閱讀測驗由「文章」和「題目」所組成,「文章」由一連串文字組成,與學校的課文相似;「題目」為選擇題,提出一個問題後給予(A)、(B)、(C)、(D)四個可能的答案供選擇。"
|
| 6 |
+
|
| 7 |
+
BAND_A_DEFINITION_PROMPT = """【題型說明】
|
| 8 |
+
%s
|
| 9 |
+
|
| 10 |
+
【要求】
|
| 11 |
+
每篇文章只出一題題目。
|
| 12 |
+
文章長度80~150字,語言難度相當於國小中高年級水準,盡量不要出現書面語。
|
| 13 |
+
文章多為日常情境、簡短故事的記敘文,富含一些道理。
|
| 14 |
+
預期讀者為學習繁體中文的外國成年人,可以以較為國際化或本土化的文章內容出題。
|
| 15 |
+
文章文體分為四種:
|
| 16 |
+
[記敘文]: 敘寫人、事、時、地、物。 包含,形塑人物、記述事件、描繪空間、摹寫物品。
|
| 17 |
+
[抒情文]: 抒發對人、事、物、景的情感。包含,因人抒情、藉事抒情、因地抒情、詠物抒情、藉景抒情。
|
| 18 |
+
[説明文]: 說明事理或事物。包含,解釋事理、說明事物、解決問題、說明手法、說明輔助。
|
| 19 |
+
[議論文]: 表達對人、事、物的看法。包含,建立論點、提出論據、因果論證、歸納論證、演繹論證。
|
| 20 |
+
*若文中含有對話則在文題後方標明(含對話)。
|
| 21 |
+
題目類型分為[直接提取]、[主旨判斷]、[直接推論]:
|
| 22 |
+
[直接提取]: 要求讀者直接從文本中找到明確訊息,通常答案在文中有清楚線索。
|
| 23 |
+
[主旨判斷]: 考驗讀者能否掌握文章的核心重點或整體意圖,而不是某一句的資訊。
|
| 24 |
+
[直接推論]: 著重於從上下文推敲作者舉例、用語或段落背後的用意,重點在「為什麼要這樣說/這個例子是要說明什麼」。
|
| 25 |
+
|
| 26 |
+
【閱讀測驗生產流程】
|
| 27 |
+
閱讀完相關要求後,決定一個80~150的數字做為文章長度參考,並依照給定的文體產出文章。
|
| 28 |
+
閱讀完文章內容後,從給定的題型,產出題目問句。
|
| 29 |
+
產出適當的三個誘答選項及標準答案,誘答選項不可以是「只看選項就可以判斷答案的『明顯錯誤答案』」,常見的情況有「強烈的否定用語」或「與常理不符」。
|
| 30 |
+
|
| 31 |
+
【範例】
|
| 32 |
+
文章([記敘文](純敘述)):
|
| 33 |
+
我最近學會一道臺灣菜,叫做「鹹蛋苦瓜」,這道菜是房東陳媽媽教我做的。以前,我覺得苦瓜吃起來和藥一樣苦,一直不喜歡吃。 可是陳媽媽教我做的苦瓜一點都不苦,吃起來又香又甜。而且陳媽媽說,吃沒煮過的苦瓜對身體更好,還說下一次要教我用沒煮過的苦瓜做好喝的果汁,我學會以後,要先做給我的好朋友玉芳喝,希望她和我一樣愛上苦瓜。
|
| 34 |
+
|
| 35 |
+
題目:
|
| 36 |
+
1.寫這段話的人說了什麼?
|
| 37 |
+
(A)他一道臺灣菜也不會做
|
| 38 |
+
(B)陳媽媽和他一樣不吃苦瓜
|
| 39 |
+
(C)他發現苦瓜在臺灣是一種藥
|
| 40 |
+
(D)他的好朋友玉芳也不喜歡苦瓜
|
| 41 |
+
[直接推論]
|
| 42 |
+
|
| 43 |
+
文章([説明文](純敘述)):
|
| 44 |
+
你怎麼處理那些已經不再看的書呢?直接丟掉?送給別人?還是賣給書店?現代網路的進步,使賣書變得更容易。於是有些人就利用網路開店賣舊書,把賺來的錢分給需要的人,有時也將舊書送到鄉下或山裡的學校,這樣的話,不但可以保護環境,還可以讓更多的人有機會讀書。
|
| 45 |
+
|
| 46 |
+
題目:
|
| 47 |
+
1.這篇文章在談什麼?
|
| 48 |
+
(A)用家中舊書換新書的方法
|
| 49 |
+
(B)上網看書才不會用太多紙
|
| 50 |
+
(C)學會賣書可以賺更多的錢
|
| 51 |
+
(D)把不要的書變成有用的書
|
| 52 |
+
[主旨判斷]
|
| 53 |
+
|
| 54 |
+
文章([抒情文](含對話)):
|
| 55 |
+
明麗的朋友都覺得明麗總是開心地過生活,大家從來沒有看過她傷心或生氣。有一次,我問她是怎麼辦到的,她回答我:「每天一早醒來,我都有兩個選擇,可以選擇壞心情,討厭自己的生活;也可以選擇好心情,喜愛自己的生活。如果是你,不會選第二個嗎?」
|
| 56 |
+
|
| 57 |
+
題目:
|
| 58 |
+
1.明麗最後要說的意思是什麼?
|
| 59 |
+
(A)想太多事情對自己沒用
|
| 60 |
+
(B)心情好壞可以自己決定
|
| 61 |
+
(C)早睡早起的人一定快樂
|
| 62 |
+
(D)生活進步的方法有很多
|
| 63 |
+
[直接推論]
|
| 64 |
+
|
| 65 |
+
文章([記敘文](純敘述)):
|
| 66 |
+
有一對夫婦開了一家賣蛋的小店,又高又壯的先生賣的蛋總是比又瘦又矮的太太少得多。他覺得很奇怪,想了又想,原來是因為他的手比較大,當他用手拿著蛋時,客人會覺得蛋變小了。太太用她的小手拿蛋給客人時,蛋看起來居然大一些。之後,先生把蛋放在很小的盤子裡賣,大家就覺得這間店的蛋特別大,先生也就成功賣出了更多蛋。
|
| 67 |
+
|
| 68 |
+
題目:
|
| 69 |
+
1.為什麼太太賣蛋賣得比先生好?
|
| 70 |
+
(A)太太賣的蛋比較便宜
|
| 71 |
+
(B)先生賣蛋的樣子很奇怪
|
| 72 |
+
(C)太太賣的蛋看起來比較大
|
| 73 |
+
(D)先生把蛋放在小盤子裡賣
|
| 74 |
+
[直接提取]
|
| 75 |
+
|
| 76 |
+
""" %(READING_TASK_DEFINITION)
|
| 77 |
+
|
| 78 |
+
BAND_B_DEFINITION_PROMPT = """【題型說明】
|
| 79 |
+
%s
|
| 80 |
+
|
| 81 |
+
【要求】
|
| 82 |
+
每篇文章出2~4題題目。
|
| 83 |
+
文章多樣,富含哲理。
|
| 84 |
+
文章長度150~400字,語言難度相當於國中水準,可出現少量書面語。
|
| 85 |
+
預期讀者為學習繁體中文的外國成年人,因此可考慮使用一些「台灣文化」。
|
| 86 |
+
文章文體分為四種:
|
| 87 |
+
[記敘文]: 敘寫人、事、時、地、物。 包含,形塑人物、記述事件、描繪空間、摹寫物品。
|
| 88 |
+
[抒情文]: 抒發對人、事、物、景的情感。包含,因人抒情、藉事抒情、因地抒情、詠物抒情、藉景抒情。
|
| 89 |
+
[説明文]: 說明事理或事物。包含,解釋事理、說明事物、解決問題、說明手法、說明輔助。
|
| 90 |
+
[議論文]: 表達對人、事、物的看法。包含,建立論點、提出論據、因果論證、歸納論證、演繹論證。
|
| 91 |
+
*若文中含有對話則在文題後方標明(含對話)。
|
| 92 |
+
題目類型分為[直接提取]、[主旨判斷]、[詞語句意]、[直接推論]:
|
| 93 |
+
[直接提取]:要求讀者直接從文本中找到明確訊息,通常答案在文中有清楚線索。
|
| 94 |
+
[主旨判斷]:考驗讀者能否掌握文章的核心重點或整體意圖,而不是某一句的資訊。
|
| 95 |
+
[詞語句意]:針對文中某個詞語、句子或段落的字面意義進行釐清,重點在「它是什麼意思」。
|
| 96 |
+
[直接推論]:著重於從上下文推敲作者舉例、用語或段落背後的用意,重點在「為什麼要這樣說/這個例子是要說明什麼」。
|
| 97 |
+
|
| 98 |
+
【閱讀測驗生產流程】
|
| 99 |
+
閱讀完相關要求後,決定一個150-400的數字做為文章長度參考,並依照給定的文體產出文章。
|
| 100 |
+
閱讀完文章內容後,隨機選取題型,產出2~4個題目問句。
|
| 101 |
+
產出適當的三個誘答選項及標準答案,誘答選項不可以是「只看選項就可以判斷答案的『明顯錯誤答案』」,常見的情況有「強烈的否定用語」或「與常理不符」。
|
| 102 |
+
|
| 103 |
+
【範例】
|
| 104 |
+
文章([議論文](純敘述)):
|
| 105 |
+
在某些社會文化中,由於「一個人吃飯」通常與被孤立、沒朋友、不懂社交等負面印象聯繫在一起,所以一個人走進餐廳就成了許多人害怕的事。但,獨自吃飯真是一件那麼糟糕的事嗎?事實上,一個人並不意味著寂寞,與此相反,更可以將注意力放在享受餐盤上的美食。當然,與他人分享、交流時能帶給人們滿足與快樂,而當你已經在人際相處上付出了許多時間後,更應該為自己保留一些獨處的空間;當你因為獨自吃飯而害怕被貼上「人際障礙」的標籤時,也許根本沒有人注意到你,或是覺得你性格古怪。忙碌的生活常常讓我們忽略與自己獨處的重要性,哪怕是一頓飯、一部電影的時間,都是面對自己的好機會。久而久之,你將發現,那些異樣眼光只是一種自我限制,而能一個人吃飯,竟然是那麼寶貴。
|
| 106 |
+
|
| 107 |
+
題目:
|
| 108 |
+
1. 作者認為有些人害怕一個人吃飯的原因是什麼?
|
| 109 |
+
(A)不敢違反社交規定
|
| 110 |
+
(B)太在意別人的想法
|
| 111 |
+
(C)不能分享最新的消息
|
| 112 |
+
(D)擔心錯過朋友的邀請
|
| 113 |
+
[直接提取]
|
| 114 |
+
2. 關於「害怕一個人吃飯」,下面哪句話可以用來說明作者的觀點?
|
| 115 |
+
(A)無疑是自己嚇自己
|
| 116 |
+
(B)反而更不顧別人感受
|
| 117 |
+
(C)負面評價自有他的道理
|
| 118 |
+
(D)每個人都得面對生活的寂寞
|
| 119 |
+
[直接推論]
|
| 120 |
+
3. 根據本文,作者認為「一個人」的優點是什麼?
|
| 121 |
+
(A)不必擔心事情被中途打斷
|
| 122 |
+
(B)能消除人與人之間的防備
|
| 123 |
+
(C)對所做的事情能更加投入
|
| 124 |
+
(D)能建立起獨立自主的形象
|
| 125 |
+
[直接提取]
|
| 126 |
+
|
| 127 |
+
文章([記敘文](純敘述)):
|
| 128 |
+
我們都知道,小時候大人說給我們聽的故事,大部分都不是真的。即使早就知道這個事實,但長大以後,當我們有了自己的小孩,還是會把聽過的故事重新說一遍,就這樣一代一代,讓故事繼續下去。這種對幻想世界的好奇心,是每個小孩最珍貴的精神財富。而一個大人在經歷無數人生中的現實之後,如果還能保留住這份天真,該是多麼難得。 電影《大魚》,正是在探討這樣的主題。電影描述一位非常有想像力的父親,總是告訴小兒子一些以自己為主角的神奇故事,然而隨著年齡增長,小兒子從原本對父親的敬愛和崇拜,逐漸轉變成忽視。 他不喜歡父親仍然把他當小孩一樣,繼續說著那些聽起來很「誇張」的故事。父子間的關係因此越來越差,彼此甚至不再說話。直到父親去世後,在父親的葬禮上,小兒子看見父親故事裡的那些人物竟然都出席了,他才明白,父親只是希望,他始終能擁有兒時的天真。最後,他也把父親的故事,分享給了自己的孩子。
|
| 129 |
+
|
| 130 |
+
題目:
|
| 131 |
+
1. 這篇短文主要想傳達什麼思想?
|
| 132 |
+
(A)看電影和聽故事都不必太認真
|
| 133 |
+
(B)人們應該完全像小孩一樣活著
|
| 134 |
+
(C)成長過程中要保持單純不容易
|
| 135 |
+
(D)既然長大就應該提早面對現實
|
| 136 |
+
[主旨判斷]
|
| 137 |
+
2. 請問下面哪一段文字最符合第二段中所說的「主題」?
|
| 138 |
+
(A)睡前聽故事
|
| 139 |
+
(B)孝順父母親
|
| 140 |
+
(C)失去的童心
|
| 141 |
+
(D)哲學的智慧
|
| 142 |
+
[詞語句意]
|
| 143 |
+
3. 根據短文,為什麼小兒子後來和父親的關係不好?
|
| 144 |
+
(A)他一直都誤會父親
|
| 145 |
+
(B)他覺得父親不關心他
|
| 146 |
+
(C)他父親很少再和他講故事
|
| 147 |
+
(D)他父親是個不切實際的人
|
| 148 |
+
[直接推論]
|
| 149 |
+
|
| 150 |
+
文章([抒情文](純敘述)):
|
| 151 |
+
孩子,這是你改搭校車回家的第一天。媽媽沒忘記你早上出門前,不停地告訴我搭校車是件很恐怖的事,那語氣裡充滿了面對不確定的不安,更讓媽媽覺得不忍心。然而我只是笑著看你,聽你把所有預期中可怕的後果全部說完,然後篤定地告訴你:「這是你人生的另一個開始,你要學著去面對你自己的人生!」
|
| 152 |
+
|
| 153 |
+
題目:
|
| 154 |
+
1.在這段短文裡,母親面對孩子的不安時,態度怎麼樣?
|
| 155 |
+
(A)既害怕又不忍心
|
| 156 |
+
(B)不知道該怎麼辦
|
| 157 |
+
(C)覺得孩子很可憐
|
| 158 |
+
(D)鼓勵孩子自己解決
|
| 159 |
+
[直接推論]
|
| 160 |
+
|
| 161 |
+
""" %(READING_TASK_DEFINITION)
|
| 162 |
+
|
| 163 |
+
BAND_C_DEFINITION_PROMPT = """【題型說明】
|
| 164 |
+
%s
|
| 165 |
+
|
| 166 |
+
【要求】
|
| 167 |
+
每篇文章出4~6題題目。
|
| 168 |
+
文章多為説明文或議論文,富含哲理、情感、經驗。
|
| 169 |
+
文章長度250~500字,語言難度相當於高中水準,可出現書面詞、抽象概念。
|
| 170 |
+
預期讀者為學習繁體中文的外國成年人,因此可考慮使用一些「台灣文化」。
|
| 171 |
+
文章文體分為四種:
|
| 172 |
+
[記敘文]: 敘寫人、事、時、地、物。 包含,形塑人物、記述事件、描繪空間、摹寫物品。
|
| 173 |
+
[抒情文]: 抒發對人、事、物、景的情感。包含,因人抒情、藉事抒情、因地抒情、詠物抒情、藉景抒情。
|
| 174 |
+
[説明文]: 說明事理或事物。包含,解釋事理、說明事物、解決問題、說明手法、說明輔助。
|
| 175 |
+
[議論文]: 表達對人、事、物的看法。包含,建立論點、提出論據、因果論證、歸納論證、演繹論證。
|
| 176 |
+
*若文中含有對話則在文題後方標明(含對話)。
|
| 177 |
+
題目類型分為[直接提取]、[主旨判斷]、[詞語句意]、[直接推論]:
|
| 178 |
+
[直接提取]:要求讀者直接從文本中找到明確訊息,通常答案在文中有清楚線索。
|
| 179 |
+
[主旨判斷]:考驗讀者能否掌握文章的核心重點或整體意圖,而不是某一句的資訊。
|
| 180 |
+
[詞語句意]:針對文中某個詞語、句子或段落的字面意義進行釐清,重點在「它是什麼意思」。
|
| 181 |
+
[直接推論]:著重於從上下文推敲作者舉例、用語或段落背後的用意,重點在「為什麼要這樣說/這個例子是要說明什麼」。
|
| 182 |
+
|
| 183 |
+
【閱讀測驗生產流程】
|
| 184 |
+
閱讀完相關要求後,決定一個250-500的數字做為文章長度參考,並依照給定的文體產出文章。
|
| 185 |
+
閱讀完文章內容後,隨機選取題型,產出題目問句。
|
| 186 |
+
產出適當的三個誘答選項及標準答案,誘答選項不可以是「只看選項就可以判斷答案的『明顯錯誤答案』」,常見的情況有「強烈的否定用語」或「與常理不符」。
|
| 187 |
+
|
| 188 |
+
【範例】
|
| 189 |
+
文章([抒情文](純敘述)):
|
| 190 |
+
「花間一壺酒,獨酌無相親」,這兩句詩表達出李白的心情。他在奼紫嫣紅的花叢間,擺上一壺酒。在醺然的狀態下,邀了天上的明月一起對飲。這樣的良辰美景,應該是三五好友,知心共享的時刻,李白卻只能喟嘆。月亮、李白與李白的影子,對影成三人。
|
| 191 |
+
月亮不會飲酒,和自己的影子飲酒更是平添寂寞。但是,李白終究灑脫。趁著春光正盛,恣意歌詠,至歡快處,手舞足蹈不也酣暢淋漓、大快人心。
|
| 192 |
+
李白就這樣從最初的落寞,到任著自己的心情,在月下舞著、蹈著、唱著,還和白玉盤有了天際渺渺之約。李白不需要走到人生的困境,才懂得駐足欣賞眼前的風景,他總是隨走隨看隨領略。即便孤身一人,總有艱辛,他也能將之轉化與天地同歡,而這番了悟,妙筆一揮,遂成詩裡行間的風景。
|
| 193 |
+
在我看來,人生泰半是挫折與苦難。但是,若偕李白同行,每每到水窮處時,必定聽見李白放聲大歌。伴著歌聲,試著將挫折烹調成一盤下酒菜。咀嚼於舌尖的況味,終將消化於人生的坦然。
|
| 194 |
+
|
| 195 |
+
題目:
|
| 196 |
+
1.「花間一壺酒,獨酌無相親」表達出李白的心情怎麼樣?
|
| 197 |
+
(A)漠然
|
| 198 |
+
(B)舒暢
|
| 199 |
+
(C)孤寂
|
| 200 |
+
(D)哀痛
|
| 201 |
+
[詞語句意]
|
| 202 |
+
2. 根據本文第二段描述,讓李白手舞足蹈的原因是什麼?
|
| 203 |
+
(A)因為他自在的個性
|
| 204 |
+
(B)因為他和月亮的約定
|
| 205 |
+
(C)因為他醉後就想跳舞
|
| 206 |
+
(D)因為他想看影子的流動
|
| 207 |
+
[直接提取]
|
| 208 |
+
3. 本文第三段提到,「詩裡行間的風景」,作者指的是什麼?
|
| 209 |
+
(A)李白對自然的讚嘆
|
| 210 |
+
(B)李白對官場的感慨
|
| 211 |
+
(C)李白對人生的領悟
|
| 212 |
+
(D)李白對詩詞的看法
|
| 213 |
+
[直接推論]
|
| 214 |
+
4. 根據文本,作者咀嚼於舌尖的「況味」,是一種什麼味道?
|
| 215 |
+
(A)甘味
|
| 216 |
+
(B)鮮味
|
| 217 |
+
(C)辛味
|
| 218 |
+
(D)苦味
|
| 219 |
+
[主旨判斷]
|
| 220 |
+
|
| 221 |
+
文章([記敘文](純敘述)):
|
| 222 |
+
她獨行於瞎燈黑火的林道上,僅有些微月光從葉隙中透出。臨下山前,家中長輩再三囑咐,此去求援,不可妄生事端,她的拳腳功夫僅能自保,若不是族中男子皆已上戰場廝殺,萬萬是不能讓她一個女子孤身上路。是以一切只求護得自身周全,快去快回即可。
|
| 223 |
+
一路行來,她不僅低調,且不停地變換交通工具,先是乘舟走水十來天,接著易舟為馬,在馬上又折騰了十來天。就剩最後這幾里路時,她才改由步行。原以為安全的地兒,眼下卻讓她有不寒而慄之感。
|
| 224 |
+
前方突地傳來沙沙聲,她下意識退後兩步,卻見林中飛鳥穿過樹梢,她有些「惱怒」,暗啐膽小。驀地,身後傳來虛空劃破之聲。她略側頭回看,眼角已閃過一道疾光。堪堪轉身之際,就見劍尖在眼前一晃,幸得她偏頭一閃,並朝對方腰處撞去。她也不知道這一招是否有效,只是拼盡全身的力氣,狠地一撞。這一來一往,間刻不能容髮之際,她都沒察覺自己一顆心已經提到了嗓子眼兒,堵得呼吸都倍顯艱難。
|
| 225 |
+
幸得這一撞,教來者失了準頭,削掉她幾縷青絲。但這一撞,也叫她看清來者何人,一聲驚呼幾乎要從喉間溢出,今天真是給阿爹應了「養虎為患」這句話啊!來人穩住身形,又是一連串凌厲攻勢,她左支右絀閃躲地甚是狼狽。反射性地憶起了與這人無猜的歲月,心下淒苦。喉頭一甜、滿嘴鹹腥。眼下任務不僅完成不了,想她不過一十八歲的年華或許也將了結在此。
|
| 226 |
+
|
| 227 |
+
題目:
|
| 228 |
+
1.關於文章中的主角,下面哪一個是對的?
|
| 229 |
+
(A)她找好救兵準備上戰場
|
| 230 |
+
(B)她跟著一大隊人馬求援
|
| 231 |
+
(C)她的武功已是出神入化
|
| 232 |
+
(D)她的狀態現是生死未卜
|
| 233 |
+
[直接提取]
|
| 234 |
+
2. 文章中的主角與攻擊她的人,以前是什麼關係?
|
| 235 |
+
(A)舊識
|
| 236 |
+
(B)親家
|
| 237 |
+
(C)陌路
|
| 238 |
+
(D)對頭
|
| 239 |
+
[直接提取]
|
| 240 |
+
3. 文章第三段提及主角有些惱怒,她在惱怒什麼?
|
| 241 |
+
(A)她覺得自己的能力不如男子
|
| 242 |
+
(B)她害怕自己無法完成此趟任務
|
| 243 |
+
(C)她覺得長途跋涉的旅程太過疲憊
|
| 244 |
+
(D)她覺得自己不該被一些聲響嚇到
|
| 245 |
+
[直接推論]
|
| 246 |
+
4. 文中,心「提到了嗓子眼兒,堵得呼吸都倍顯艱難」,是怎樣的情緒?
|
| 247 |
+
(A)氣憤
|
| 248 |
+
(B)尷尬
|
| 249 |
+
(C)緊張
|
| 250 |
+
(D)鬱鬱
|
| 251 |
+
[詞語句意]
|
| 252 |
+
5. 文章最後一段提及,主角和敵人對戰的情況怎麼樣?
|
| 253 |
+
(A)她落於下風
|
| 254 |
+
(B)她最終反敗為勝
|
| 255 |
+
(C)她將對方手到擒來
|
| 256 |
+
(D)她與對方勢均力敵
|
| 257 |
+
[直接提取]
|
| 258 |
+
6. 下面哪一個句子可以用來描述攻擊的人?
|
| 259 |
+
(A)相貌玉樹臨風
|
| 260 |
+
(B)出手趕盡殺絕
|
| 261 |
+
(C)招式拖泥帶水
|
| 262 |
+
(D)神色恍恍惚惚
|
| 263 |
+
[直接提取]
|
| 264 |
+
|
| 265 |
+
文章([説明文](純敘述)):
|
| 266 |
+
多數人以為,超大質量黑洞應存在於巨大星系的中心位置。但日前,已有天文學家在銀河系邊緣,發現了幾十個超大質量黑洞。這些黑洞在環繞銀河系主體,由恆星和氣體構成近似於球形的「恆星暈」區域中閒逛著。
|
| 267 |
+
天文學家認為,黑洞之所以存在於宿主星系邊緣的現象,是來自星系合併的結果,這現象經常發生在一個不斷膨脹的宇宙中。當一個較小的星系融入一個更大的主星系時,它會把自己中央的超大質量黑洞置於新的宿主星系上。《天體物理學雜誌快報》刊發的研究報告進一步預測,與銀河系具有相近質量的星系,應該包含著不只一個超大質量的黑洞。
|
| 268 |
+
該報告的研究員表示,由於工具的限制,想直接論斷黑洞的存在是不可能的。即使想間接推斷它們是否存在,也必須要有更好的方法與手段才行。因為,每1000億年左右,才能與影響太陽系的漫遊黑洞近距離接觸,從人類角度來看,其發生頻率極低;再加上,因為預測漫遊的超大質量黑洞位於離星系中心很遠且處於星系盤之外的地方,所以它們不太可能吸收增加更多的氣體,來膨脹規模,讓人類發現它們。
|
| 269 |
+
|
| 270 |
+
題目:
|
| 271 |
+
1.根據本文,關於「超大質量黑洞」描述,下列何者正確?
|
| 272 |
+
(A)在宿主星系遊蕩
|
| 273 |
+
(B)只存在於已經死亡的星系裡
|
| 274 |
+
(C)天文學家預測僅存在銀河系中
|
| 275 |
+
(D)天文學家無法證實其存在星系邊緣
|
| 276 |
+
[直接提取]
|
| 277 |
+
2. 天文學家為什麼提到「星系合併」?
|
| 278 |
+
(A)主要說明黑洞的位置
|
| 279 |
+
(B)解釋超大黑洞發生的時間
|
| 280 |
+
(C)強調太陽系仍在膨脹壯大中
|
| 281 |
+
(D)從合併過程說明宇宙氣體的重要性
|
| 282 |
+
[直接推論]
|
| 283 |
+
3. 關於超大質量黑洞,《天體物理學雜誌快報》預測了什麼?
|
| 284 |
+
(A)黑洞影響太陽星系的時間
|
| 285 |
+
(B)超大質量黑洞運行的軌跡
|
| 286 |
+
(C)數個超大質量黑洞的存在
|
| 287 |
+
(D)「恆星暈」形成的可能因素
|
| 288 |
+
[直接提取]
|
| 289 |
+
4. 根據本文,研究員認為間接推斷其他星系黑洞存在的限制是什麼?
|
| 290 |
+
(A)超大質量黑洞無法靠近「恆星暈」
|
| 291 |
+
(B)觀測工具無法觀測那麼遠的距離
|
| 292 |
+
(C)無法證明銀河系邊緣存在著黑洞
|
| 293 |
+
(D)宇宙時間無垠,而人類時間有限
|
| 294 |
+
[直接提取]
|
| 295 |
+
|
| 296 |
+
文章([議論文](純敘述)):
|
| 297 |
+
在美國,有不少長年戰績不佳的球隊。最經典的,當然是快要一百年沒拿過世界大賽冠軍的芝加哥小熊隊,他們不只打不到冠軍,而且戰績常常落得難看,球季中就失去了爭冠的機會。但小熊隊從來不缺球迷支持者,他們的球迷甚至練就一身自我解嘲、自我安慰的本事,例如在球季開賽的第一場比賽,就高舉牌子,上面寫著:「別擔心,還有明年!」
|
| 298 |
+
近來台灣有不少運動員在國際體壇表現卓越,無論媒體或輿論都以「台灣之光」來稱呼這些為國爭光的運動員,一般民眾也與有榮焉,追著觀看這些台灣之光的各項賽事。不過,這些因運動員知名度提昇而追星的民眾,恐怕很難理解小熊隊球迷的忠誠,更難體會這種忠誠的價值,因為王建民、曾雅妮沒贏球,我們根本就不會注意到他們。我們從這些球星身上得到的,是贏的快感,是沾染贏的光榮;我們選擇的,不是特定的球員,也不是他們所屬的哪一支球隊,而是贏的事實。一旦失去了贏的因素,我們也就毫不客氣地失去對他們的興趣,轉而尋找實力更強的球隊來支持了。
|
| 299 |
+
|
| 300 |
+
題目:
|
| 301 |
+
1.小熊隊球迷舉的牌子,向球隊傳達了什麼訊息?
|
| 302 |
+
(A)我們今年一定贏
|
| 303 |
+
(B)今年別輸得太難看
|
| 304 |
+
(C)今年輸了也沒關係
|
| 305 |
+
(D)明年會和今年一樣好
|
| 306 |
+
[直接提取]
|
| 307 |
+
2. 第二段提到的「他們」指的是誰?
|
| 308 |
+
(A)對小熊隊忠誠的球迷
|
| 309 |
+
(B)注意到明星球員的民眾
|
| 310 |
+
(C)表現優異的台灣運動員
|
| 311 |
+
(D)台灣運動員所屬的球隊
|
| 312 |
+
[詞語句意]
|
| 313 |
+
3. 作者說,台灣民眾觀看運動員和球賽的心態是什麼?
|
| 314 |
+
(A)有贏有輸比賽才好看
|
| 315 |
+
(B)因為球員贏了才支持
|
| 316 |
+
(C)即使球員贏了也沒什麼
|
| 317 |
+
(D)就算球員輸了也依舊支持
|
| 318 |
+
[直接推論]
|
| 319 |
+
4. 本文從哪個角度來比較台灣和美國的球迷?
|
| 320 |
+
(A)球迷如何幫球隊宣傳
|
| 321 |
+
(B)球迷如何選出明星球員
|
| 322 |
+
(C)球迷是否注意媒體報導
|
| 323 |
+
(D)球迷是否在乎球賽輸贏
|
| 324 |
+
[主旨判斷]
|
| 325 |
+
""" %(READING_TASK_DEFINITION)
|
| 326 |
+
|
| 327 |
+
BAND_A_ASKING_PROMPTS = [Template("你覺得在「${topic_class}」這個類別底下適合給成年人看的${style}主題有哪些?"), Template("從這些「${topic_class}」的主題中選擇一個最特別的、最能引起讀者共鳴的,並依照原本的【閱讀測驗生產流程】產出閱讀測驗,文章文體使用${style},題目類型使用${question_type}。")]
|
| 328 |
+
BAND_A_REFERENCE_ASKING_PROMPT = Template("請思考讀者學習完【課文】後的閱讀測驗方向,其中閱讀測驗裡的文章是【課文】的延伸,兩者需保持一定的相關性,但不需重複。請依照原本的【閱讀測驗生產流程】產出閱讀測驗,文章文體使用${style},題目類型使用${question_type}。\n【課文】\n${content}")
|
| 329 |
+
ASKING_PROMPTS = [Template("你覺得在「${topic_class}」這個類別底下適合給成年人看的${style}主題有哪些?"), Template("從這些「${topic_class}」的主題中選擇一個最特別的、最能引起讀者共鳴的,並依照原本的【閱讀測驗生產流程】產出閱讀測驗,文章文體使用${style}")]
|
| 330 |
+
REFERENCE_ASKING_PROMPT = Template("請思考讀者學習完【課文】後的閱讀測驗方向,其中閱讀測驗裡的文章是【課文】的延伸,兩者需保持一定的相關性,但不需重複。請依照原本的【閱讀測驗生產流程】產出閱讀測驗,文章文體使用${style}。\n【課文】\n${content}")
|
| 331 |
+
FIX_PREFERENCE_PROMPT = "請根據上述審核結果,包含之前的簡短說明,及詳細不合格的理由,修改閱讀測驗的內容。"
|
| 332 |
+
FIX_HIGH_LEVEL_PROMPT = "文章難度過高,請重新閱讀【要求】、【範例】的內容後,降低閱讀測驗的難度。"
|
| 333 |
+
FIX_LOW_LEVEL_PROMPT = "文章難度過低,請重新閱讀【要求】、【範例】的內容後,增加閱讀測驗的難度。"
|
| 334 |
+
|
| 335 |
+
#計畫助理們(華語老師)確認中
|
| 336 |
+
# Path is relative to the current working directory at import time.
subtopic_map_path = "./dataset/subtopic_map20251002v2.json"
with open(subtopic_map_path, "r", encoding="utf-8") as f:
    SUBTOPIC_MAP = json.load(f)

# Flatten the sub-topics of every class into one list, following the
# mapping's iteration order. Only the values are needed, and a comprehension
# keeps the loop variables out of the module namespace.
SUBTOPIC_CLASS_LIST = [
    sub_topic
    for sub_class in SUBTOPIC_MAP.values()
    for sub_topic in sub_class
]
|
| 343 |
+
|
| 344 |
+
def _string_field(description, **constraints):
    """Build a string property: type, description, then any extra keywords."""
    return {"type": "string", "description": description, **constraints}


# Answer option labels; each becomes a required, non-empty string property.
_OPTION_LABELS = ("A", "B", "C", "D")

# Properties of one multiple-choice question, in output order.
# NOTE(review): the descriptions of "題號" and "解析" call them optional, yet
# both end up in the item's "required" list below — confirm which is intended.
_QUESTION_PROPERTIES = {
    "題號": {
        "type": "integer",
        "minimum": 1,
        "maximum": 6,
        "description": "可選;不提供則以陣列順序視為題號。",
    },
    "題目類別": _string_field("如:直接提取/主旨判斷/直接推論……"),
    "題目描述": _string_field("題目問句。", minLength=1),
    **{label: _string_field(f"選項 {label}", minLength=1) for label in _OPTION_LABELS},
    "答案": _string_field("正確選項。", enum=list(_OPTION_LABELS)),
    "解析": _string_field("可選;作答理由、關鍵句或提示。"),
}

# Top-level properties of one reading test: article metadata, the article
# text, and between one and six questions.
_ARTICLE_PROPERTIES = {
    "文章文體": _string_field("閱讀測驗的文體。"),
    "文章主題": _string_field("閱讀測驗的文章主題。"),
    "文章主題類別": _string_field("閱讀測驗的文章主題類別。"),
    "文章": _string_field("閱讀測驗的文章內容。"),
    "題目列表": {
        "type": "array",
        "description": "本篇文章對應的選擇題列表(1~6題)。",
        "minItems": 1,
        "maxItems": 6,
        "items": {
            "type": "object",
            "properties": _QUESTION_PROPERTIES,
            # Every declared property is required, in declaration order.
            "required": list(_QUESTION_PROPERTIES),
            "additionalProperties": False,
        },
    },
}

# Named JSON schema for the generated reading test. Only the outermost
# mapping is wrapped read-only; the nested dicts are ordinary dicts.
READING_TEST_SCHEMA = MappingProxyType({
    "name": "reading_test_output",
    "strict": True,
    "schema": {
        "type": "object",
        "properties": _ARTICLE_PROPERTIES,
        "required": list(_ARTICLE_PROPERTIES),
        "additionalProperties": False,
    },
})
|
| 412 |
+
|
| 413 |
+
TEMPERATURE = 0.5
|
| 414 |
+
BEAM_NUM = 2
|
| 415 |
+
|
| 416 |
+
def _level_config(definition_prompt, asking_templates, reference_template,
                  style_distribution, question_type_distribution=None):
    """Assemble one level's settings; shared entries are filled in here.

    The system prompt, topic classes and JSON schema are the same objects for
    every level, so only the level-specific pieces are passed in.
    """
    return {
        "system_prompt": SYSTEM_PROMPT,
        "definition_prompt": definition_prompt,
        "asking_prompt_templates": asking_templates,
        "reference_asking_prompt_template": reference_template,
        "style_distribution": style_distribution,
        "topic_classes": SUBTOPIC_CLASS_LIST,
        "question_type_distribution": question_type_distribution,
        "json_schema": READING_TEST_SCHEMA,
    }


# Per-level generation settings keyed by level name. Style weights are keyed
# by "[style](dialogue flag)". A separate per-level "conversation_distribution"
# entry existed only as commented-out code and stays disabled.
READING_LEVEL_CONFIG = {
    "A-入門基礎": _level_config(
        BAND_A_DEFINITION_PROMPT,
        BAND_A_ASKING_PROMPTS,
        BAND_A_REFERENCE_ASKING_PROMPT,
        {
            '[記敘文](含對話)': 0.206,
            '[記敘文](純敘述)': 0.529,
            '[議論文](純敘述)': 0.059,
            '[説明文](純敘述)': 0.118,
            '[抒情文](純敘述)': 0.059,
            '[議論文](含對話)': 0.029,
        },
        # Only this level supplies explicit question-type weights.
        {
            "[直接提取]": 12 / 33,
            "[主旨判斷]": 7 / 33,
            "[直接推論]": 14 / 33,
        },
    ),
    "B-進階高階": _level_config(
        BAND_B_DEFINITION_PROMPT,
        ASKING_PROMPTS,
        REFERENCE_ASKING_PROMPT,
        {
            '[記敘文](純敘述)': 0.22,
            '[議論文](純敘述)': 0.26,
            '[説明文](純敘述)': 0.34,
            '[記敘文](含對話)': 0.1,
            '[抒情文](純敘述)': 0.06,
            '[説明文](含對話)': 0.02,
        },
    ),
    "C-流利精通": _level_config(
        BAND_C_DEFINITION_PROMPT,
        ASKING_PROMPTS,
        REFERENCE_ASKING_PROMPT,
        # NOTE(review): these weights add up to 0.999, not 1 — confirm the
        # sampler that consumes them tolerates that.
        {
            '[説明文](純敘述)': 0.516,
            '[議論文](純敘述)': 0.419,
            '[抒情文](純敘述)': 0.032,
            '[記敘文](純敘述)': 0.032,
        },
    ),
}
|
util/sentence_dealer/.gitignore
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py,cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# poetry
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 102 |
+
#poetry.lock
|
| 103 |
+
|
| 104 |
+
# pdm
|
| 105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 106 |
+
#pdm.lock
|
| 107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
| 108 |
+
# in version control.
|
| 109 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
| 110 |
+
.pdm.toml
|
| 111 |
+
.pdm-python
|
| 112 |
+
.pdm-build/
|
| 113 |
+
|
| 114 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 115 |
+
__pypackages__/
|
| 116 |
+
|
| 117 |
+
# Celery stuff
|
| 118 |
+
celerybeat-schedule
|
| 119 |
+
celerybeat.pid
|
| 120 |
+
|
| 121 |
+
# SageMath parsed files
|
| 122 |
+
*.sage.py
|
| 123 |
+
|
| 124 |
+
# Environments
|
| 125 |
+
.env
|
| 126 |
+
.venv
|
| 127 |
+
env/
|
| 128 |
+
venv/
|
| 129 |
+
ENV/
|
| 130 |
+
env.bak/
|
| 131 |
+
venv.bak/
|
| 132 |
+
|
| 133 |
+
# Spyder project settings
|
| 134 |
+
.spyderproject
|
| 135 |
+
.spyproject
|
| 136 |
+
|
| 137 |
+
# Rope project settings
|
| 138 |
+
.ropeproject
|
| 139 |
+
|
| 140 |
+
# mkdocs documentation
|
| 141 |
+
/site
|
| 142 |
+
|
| 143 |
+
# mypy
|
| 144 |
+
.mypy_cache/
|
| 145 |
+
.dmypy.json
|
| 146 |
+
dmypy.json
|
| 147 |
+
|
| 148 |
+
# Pyre type checker
|
| 149 |
+
.pyre/
|
| 150 |
+
|
| 151 |
+
# pytype static type analyzer
|
| 152 |
+
.pytype/
|
| 153 |
+
|
| 154 |
+
# Cython debug symbols
|
| 155 |
+
cython_debug/
|
| 156 |
+
|
| 157 |
+
# PyCharm
|
| 158 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 159 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 160 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 161 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 162 |
+
#.idea/
|
| 163 |
+
CKIP/
|
| 164 |
+
*.zip
|
| 165 |
+
*test*
|
util/sentence_dealer/README.md
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Sentence_Dealer
|
| 2 |
+
|
| 3 |
+
## Install the package
|
| 4 |
+
* 下載可能會花5分鐘左右
|
| 5 |
+
```
|
| 6 |
+
conda create -n dealer python=3.10 -y
|
| 7 |
+
conda activate dealer
|
| 8 |
+
cd sentence_dealer
|
| 9 |
+
|
| 10 |
+
#Install the Sentence_Dealer
|
| 11 |
+
pip install .
|
| 12 |
+
```
|
| 13 |
+
|
| 14 |
+
## Login Huggingface_hub
|
| 15 |
+
```
|
| 16 |
+
huggingface-cli login
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
## Quick Start
|
| 20 |
+
```
|
| 21 |
+
from sentence_dealer import Sentence_Dealer
|
| 22 |
+
|
| 23 |
+
#Modify the ckiptagger_path if you want
|
| 24 |
+
dealer = Sentence_Dealer(ckiptagger_path="./CKIP/")
|
| 25 |
+
sample_sentence = "李相赫是我永遠的神。"
|
| 26 |
+
|
| 27 |
+
grammar_ids, grammar_range = dealer.list_all_grammars(sentence=sample_sentence)
|
| 28 |
+
```
|
util/sentence_dealer/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# sentence_dealer/__init__.py
|
| 2 |
+
|
| 3 |
+
# 導入 Sentence_Dealer 類別
|
| 4 |
+
from .sentence_dealer import Sentence_Dealer
|
| 5 |
+
|
| 6 |
+
# 定義 package 的公開介面
|
| 7 |
+
__all__ = ["Sentence_Dealer"]
|
util/sentence_dealer/requirements.txt
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
absl-py==2.1.0
|
| 2 |
+
asttokens==2.4.1
|
| 3 |
+
astunparse==1.6.3
|
| 4 |
+
beautifulsoup4==4.12.3
|
| 5 |
+
cachetools==5.5.0
|
| 6 |
+
certifi==2024.8.30
|
| 7 |
+
charset-normalizer==3.4.0
|
| 8 |
+
ckiptagger==0.2.1
|
| 9 |
+
decorator==5.1.1
|
| 10 |
+
deep-translator==1.11.4
|
| 11 |
+
exceptiongroup==1.2.2
|
| 12 |
+
executing==2.1.0
|
| 13 |
+
filelock==3.16.1
|
| 14 |
+
flatbuffers==24.3.25
|
| 15 |
+
gast==0.6.0
|
| 16 |
+
gdown==5.2.0
|
| 17 |
+
google-auth==2.35.0
|
| 18 |
+
google-auth-oauthlib==1.2.1
|
| 19 |
+
google-pasta==0.2.0
|
| 20 |
+
grpcio==1.67.1
|
| 21 |
+
h5py==3.12.1
|
| 22 |
+
idna==3.10
|
| 23 |
+
ipython==8.29.0
|
| 24 |
+
jedi==0.19.1
|
| 25 |
+
keras==2.15.0
|
| 26 |
+
libclang==18.1.1
|
| 27 |
+
Markdown==3.7
|
| 28 |
+
markdown-it-py==3.0.0
|
| 29 |
+
MarkupSafe==3.0.2
|
| 30 |
+
matplotlib-inline==0.1.7
|
| 31 |
+
mdurl==0.1.2
|
| 32 |
+
ml-dtypes==0.2.0
|
| 33 |
+
namex==0.0.8
|
| 34 |
+
numpy==1.26.4
|
| 35 |
+
oauthlib==3.2.2
|
| 36 |
+
OpenCC==1.1.9
|
| 37 |
+
opt_einsum==3.4.0
|
| 38 |
+
optree==0.13.0
|
| 39 |
+
packaging==24.1
|
| 40 |
+
pandas==2.2.3
|
| 41 |
+
parso==0.8.4
|
| 42 |
+
pexpect==4.9.0
|
| 43 |
+
prompt_toolkit==3.0.48
|
| 44 |
+
protobuf==4.25.5
|
| 45 |
+
ptyprocess==0.7.0
|
| 46 |
+
pure_eval==0.2.3
|
| 47 |
+
pyasn1==0.6.1
|
| 48 |
+
pyasn1_modules==0.4.1
|
| 49 |
+
Pygments==2.18.0
|
| 50 |
+
PySocks==1.7.1
|
| 51 |
+
python-dateutil==2.9.0.post0
|
| 52 |
+
pytz==2024.2
|
| 53 |
+
requests==2.32.3
|
| 54 |
+
requests-oauthlib==2.0.0
|
| 55 |
+
rich==13.9.3
|
| 56 |
+
rsa==4.9
|
| 57 |
+
six==1.16.0
|
| 58 |
+
soupsieve==2.6
|
| 59 |
+
stack-data==0.6.3
|
| 60 |
+
tensorboard==2.15.2
|
| 61 |
+
tensorboard-data-server==0.7.2
|
| 62 |
+
tensorflow==2.15.0
|
| 63 |
+
tensorflow-estimator==2.15.0
|
| 64 |
+
tensorflow-io-gcs-filesystem==0.31.0
|
| 65 |
+
termcolor==2.5.0
|
| 66 |
+
tqdm==4.66.6
|
| 67 |
+
traitlets==5.14.3
|
| 68 |
+
typing_extensions==4.15.0
|
| 69 |
+
tzdata==2024.2
|
| 70 |
+
urllib3==2.2.3
|
| 71 |
+
wcwidth==0.2.13
|
| 72 |
+
Werkzeug==3.1.0
|
| 73 |
+
wrapt==1.14.1
|
| 74 |
+
huggingface_hub==1.2.3
|
util/sentence_dealer/sample.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Minimal usage example: build the analyser and list the grammar points
# detected in a single sentence.
from sentence_dealer import Sentence_Dealer

# CKIP model files are read from ./CKIP/ (the constructor downloads them
# into that directory when it does not exist yet).
dealer = Sentence_Dealer(ckiptagger_path="./CKIP/")
sample_sentence = "李相赫是我永遠的神。"

# grammar_ids[i] is the id of a detected grammar point and
# grammar_range[i] is its (start, end) character span in the sentence.
grammar_ids, grammar_range = dealer.list_all_grammars(sample_sentence)
|
util/sentence_dealer/sentence_dealer.py
ADDED
|
@@ -0,0 +1,400 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, opencc, json, re, sys, zipfile
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from ckiptagger import WS, POS, NER, data_utils
|
| 5 |
+
from deep_translator import GoogleTranslator
|
| 6 |
+
from IPython.display import display, HTML
|
| 7 |
+
from importlib import resources
|
| 8 |
+
from huggingface_hub import hf_hub_download
|
| 9 |
+
|
| 10 |
+
class Sentence_Dealer():
    """Chinese sentence analyser built on CKIP Tagger and JSON lookup tables.

    The instance segments sentences with CKIP ``WS``, tags them with CKIP
    ``POS``, looks every token up in a word -> TBCL level table, and matches
    the sentence against three grammar tables (literal words, regular
    expressions on the raw sentence, regular expressions on a POS-annotated
    sentence).
    """

    # Matches one bracketed POS tag (e.g. "[Na]") in a POS-annotated sentence.
    _POS_TAG_PATTERN = re.compile(r"\[([a-zA-Z_1-9])+?\]")
    # CKIP tags that list_all_grammars rewrites to the single tag "Vs".
    _VS_TAGS = ('VH', 'VHC', 'VI', 'VJ', 'VK', 'VL')

    def __init__(self, GPU=True, GPU_Num="0", ckiptagger_path="./CKIP/"):
        """Load the CKIP models and every JSON lookup table.

        Args:
            GPU: when true, ``CUDA_VISIBLE_DEVICES`` is set to ``GPU_Num`` and
                the CKIP models are created with ``disable_cuda=False``.
            GPU_Num: device id(s) exported through ``CUDA_VISIBLE_DEVICES``.
            ckiptagger_path: directory holding the CKIP model files; when it
                does not exist the models are downloaded into it first.
        """
        # NOTE: resolved against the current working directory, not this file.
        data_path = os.path.join("./util/sentence_dealer/data")

        if not os.path.exists(ckiptagger_path):
            # Download CKIP.zip from Huggingface if it doesn't exist.
            self._download_ckip(ckiptagger_path)

        if GPU:
            # os.environ only accepts strings; tolerate an int device id.
            os.environ["CUDA_VISIBLE_DEVICES"] = str(GPU_Num)
            self.ws = WS(ckiptagger_path, disable_cuda=False)
            self.pos = POS(ckiptagger_path, disable_cuda=False)
            self.ner = NER(ckiptagger_path, disable_cuda=False)
        else:
            self.ws = WS(ckiptagger_path)
            self.pos = POS(ckiptagger_path)
            self.ner = NER(ckiptagger_path)
        self._init_load_data(data_path)
        self._init_load_grammar_data(data_path + "/grammar")
        self.converter = opencc.OpenCC('t2s.json')

    def _download_ckip(self, ckip_path):
        """Download ``CKIP.zip`` into ``ckip_path``, unpack it and delete the zip.

        Raises:
            FileNotFoundError: when the download fails for any reason (the
                original exception is chained as ``__cause__``).
        """
        print("Downloading CKIP.zip from Huggingface...")
        try:
            # ``local_dir_use_symlinks`` is intentionally not passed: it is a
            # deprecated flag that recent huggingface_hub releases ignore, and
            # an unknown keyword would be misreported below as a download
            # failure.
            hf_hub_download(
                repo_id="TAIDE-EDU/CKIP-Tagger",
                filename="CKIP.zip",
                local_dir=ckip_path,
            )
        except Exception as e:
            raise FileNotFoundError(f"Failed to download CKIP.zip from Huggingface: {e}. Please download the CKIP.zip file manually.") from e

        zip_path = os.path.join(ckip_path, "CKIP.zip")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(f"{ckip_path}")
        os.remove(zip_path)

        print("CKIP Tagger downloaded successfully.")

    def _load_json(self, data_path):
        """Return the parsed content of the UTF-8 JSON file at ``data_path``."""
        with open(data_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _init_load_data(self, data_path):
        """Load the vocabulary / affix tables found directly in ``data_path``."""
        self.sim_affixes = self._load_json(data_path + "/sim_affix2.json")
        self.word2TBCL = self._load_json(data_path + "/word2TBCL.json")
        self.word2id = self._load_json(data_path + "/word2id.json")
        self.id2info = self._load_json(data_path + "/id2info.json")
        self.sim_affix_coerce_dictionary = self._load_json(data_path + "/sim_affix_coerce_dict.json")
        self.word_coerce_dictionary = self._load_json(data_path + "/word_coerce_dict.json")

    def _init_load_grammar_data(self, data_path):
        """Load the grammar tables found in ``data_path``.

        ``grammar_id2info`` maps each grammar id to a
        ``(grammar_point, example)`` tuple built from ``grammar_point.json``.
        """
        self.if_else_grammar = self._load_json(data_path + "/if_else.json")
        self.re_grammar = self._load_json(data_path + "/re.json")
        self.pos_grammar = self._load_json(data_path + "/pos.json")
        self.grammar_point = self._load_json(data_path + "/grammar_point.json")
        self.grammar_id2info = {
            grammar_sample["id"]: (grammar_sample["grammar_point"], grammar_sample["example"])
            for grammar_sample in self.grammar_point
        }

    def _sentence2words_TBCL_by_words(self, sentence, coerce_dictionary, debug= False):
        """Segment ``sentence`` and look up each token's TBCL level.

        Only the vocabulary table (``self.word2TBCL``) is consulted; tokens
        missing from it get the level ``"X"``.

        Args:
            sentence: text to segment.
            coerce_dictionary: forwarded to ``self.ws`` as ``coerce_dictionary``.
            debug: when true, print every ``token(level)`` pair.

        Returns:
            ``(words_seg, words_level)``, two sequences of equal length.
        """
        words_seg = self.ws([sentence], coerce_dictionary=coerce_dictionary)[0]
        words_level = [self.word2TBCL.get(word, "X") for word in words_seg]
        if debug:
            for word, level in zip(words_seg, words_level):
                print(f"{word}({level})", end="\u3000")
            print()
        return words_seg, words_level

    def sim_affix2TBCL(self, word, reason=False):
        """Split ``word`` around the first matching affix-like pattern.

        Each entry of ``self.sim_affixes`` that has a ``"pattern"`` key is
        tried in order with ``re.search``. For the first hit the word is cut
        into prefix / matched part / suffix (empty pieces are dropped); the
        matched part gets the number found in the entry's ``"ji"`` field
        (plus 0.5 when that field contains ``"*"``) and the other pieces get
        ``"X"``.

        Args:
            word: token to analyse.
            reason: when true, print which affix entry matched.

        Returns:
            ``(pieces, levels)`` as two lists of equal length, or
            ``([word], ["X"])`` when no pattern matches.
        """
        for sim_affix in self.sim_affixes:
            if "pattern" not in sim_affix:
                continue
            result = re.search(sim_affix["pattern"], word)
            if result is None:
                continue

            sim_affix_level = float(re.search(r"\d+", sim_affix["ji"]).group())
            if "*" in sim_affix["ji"]:
                sim_affix_level += 0.5

            start, end = result.span()
            # A list of pairs (not a dict keyed by the substring) so that a
            # prefix equal to the suffix or to the matched text is kept.
            pieces = [
                (word[:start], "X"),
                (word[start:end], sim_affix_level),
                (word[end:], "X"),
            ]
            pieces = [(piece, level) for piece, level in pieces if piece != ""]
            if reason:
                print("偵測到\"%s\"系列詞,其含義為\"%s\"。" %(sim_affix["sim_affix"], sim_affix["desc"]))
            return [piece for piece, _ in pieces], [level for _, level in pieces]
        return [word], ["X"]

    def sentence2words_TBCL(self, sentence, debug= False):
        """Segment ``sentence`` and grade every token, using all tables.

        Three passes are made: (1) segmentation with the affix coerce
        dictionary; (2) every still-unknown token is re-segmented with the
        word coerce dictionary; (3) every still-unknown token is split by
        ``sim_affix2TBCL`` and its unknown pieces are re-segmented once more.

        Args:
            sentence: text to analyse.
            debug: when true, print affix matches and the final
                ``token(level)`` pairs.

        Returns:
            ``(words, levels)``; unknown tokens carry the level ``"X"``.
        """
        first_words, first_levels = self._sentence2words_TBCL_by_words(
            sentence, self.sim_affix_coerce_dictionary)

        # Pass 2: re-segment tokens the vocabulary table does not know.
        second_words, second_levels = [], []
        for token, level in zip(first_words, first_levels):
            if level != "X":
                second_words.append(token)
                second_levels.append(level)
            else:
                cut_words, cut_levels = self._sentence2words_TBCL_by_words(
                    token, self.word_coerce_dictionary)
                second_words.extend(cut_words)
                second_levels.extend(cut_levels)

        # Pass 3: try the affix-like patterns on what is still unknown.
        final_words, final_levels = [], []
        for token, level in zip(second_words, second_levels):
            if level != "X":
                final_words.append(token)
                final_levels.append(level)
                continue
            pieces, piece_levels = self.sim_affix2TBCL(token, reason=debug)
            for piece, piece_level in zip(pieces, piece_levels):
                if piece_level != "X":
                    final_words.append(piece)
                    final_levels.append(piece_level)
                else:
                    sub_words, sub_levels = self._sentence2words_TBCL_by_words(
                        piece, self.word_coerce_dictionary)
                    final_words.extend(sub_words)
                    final_levels.extend(sub_levels)

        if debug:
            for word, level in zip(final_words, final_levels):
                print(f"{word}({level})", end="")
            print()
        return final_words, final_levels

    def sentence2TBCL_by_words(self, sentence, reason = False, debug= False):
        """Grade ``sentence`` from the levels of its words.

        The known word levels are sorted in descending order and the level at
        index ``int(count * 0.2)`` is taken; if it is at most 3 it is the
        result. Otherwise the level at index ``int(count * 0.25)`` is taken
        and the result is that level, but never less than 4.

        Args:
            sentence: text to grade.
            reason: when true, print explanations (also forwarded as the
                ``debug`` flag of ``sentence2words_TBCL``).
            debug: when true, print the sorted levels and the chosen indices.

        Returns:
            The sentence level, or ``"X"`` when no word has a known level.
        """
        TBCL_1_3_threshold = 0.2
        TBCL_4_6_threshold = 0.25
        _, words_level = self.sentence2words_TBCL(sentence, debug=reason)
        sorted_levels = sorted(
            (level for level in words_level if level != "X"), reverse=True)
        if not sorted_levels:
            if reason: print("未偵測到任何詞彙")
            return "X"

        low_index = int(len(sorted_levels) * TBCL_1_3_threshold)
        high_index = int(len(sorted_levels) * TBCL_4_6_threshold)
        if debug:
            print("sorted levels:", sorted_levels)
            print("tack the index %d for TBCL_1_3_threshold." %(low_index))
            print("tack the index %d for TBCL_4_6_threshold." %(high_index))

        # Levels 1-3 tolerate 20% over-level words, levels 4-6 tolerate 25%.
        tmp_level = sorted_levels[low_index]
        if tmp_level <= 3:
            sentence_level = tmp_level
        else:
            tmp_level = sorted_levels[high_index]
            sentence_level = 4 if tmp_level < 4 else tmp_level
        if reason:
            print("偵測到%.1f%%超等詞。" %((len([word_level for word_level in sorted_levels if word_level > sentence_level])/len(sorted_levels))*100))
        return sentence_level

    @classmethod
    def _build_pos_sentence(cls, words_seg, pos_list, fold_vs):
        """Return ``word[POS]word[POS]...``; fold ``_VS_TAGS`` to ``Vs`` if asked."""
        parts = []
        for word, pos in zip(words_seg, pos_list):
            if fold_vs and pos in cls._VS_TAGS:
                pos = 'Vs'
            parts.append(f"{word}[{pos}]")
        return "".join(parts).strip()

    def _collect_grammars(self, sentence, words_seg, pos_sentence):
        """Match the three grammar tables; return (ids, levels, ranges, patterns).

        Ranges are ``(start, end)`` character offsets into ``sentence``.
        ``patterns`` lists the regular expressions that matched (regex and
        POS tables only).
        """
        ids, levels, ranges, patterns = [], [], [], []

        # Literal table: a one-character grammar word must equal a segmented
        # token; longer ones only need to occur in the raw sentence.
        for grammar_name, (level, grammar_id) in self.if_else_grammar.items():
            if len(grammar_name) == 1:
                if grammar_name not in words_seg:
                    continue
                token_index = words_seg.index(grammar_name)
                range_start = len("".join(words_seg[:token_index]))
                range_end = range_start + len(words_seg[token_index])
            elif grammar_name in sentence:
                range_start = sentence.index(grammar_name)
                range_end = range_start + len(grammar_name)
            else:
                continue
            ids.append(grammar_id)
            levels.append(level)
            ranges.append((range_start, range_end))

        # Regex table, searched in the raw sentence.
        for grammar_pattern, (level, grammar_id) in self.re_grammar.items():
            detect = re.search(grammar_pattern, sentence)
            if detect:
                ids.append(grammar_id)
                levels.append(level)
                ranges.append(detect.span())
                patterns.append(grammar_pattern)

        # POS table, searched in the POS-annotated sentence. Offsets are
        # mapped back to the raw sentence by removing the "[TAG]" markers
        # that precede each boundary.
        strip_tags = self._POS_TAG_PATTERN.sub
        for grammar_pattern, (level, grammar_id) in self.pos_grammar.items():
            detect = re.search(grammar_pattern, pos_sentence)
            if detect:
                tagged_start, tagged_end = detect.span()
                ids.append(grammar_id)
                levels.append(level)
                patterns.append(grammar_pattern)
                ranges.append((
                    len(strip_tags("", pos_sentence[:tagged_start])),
                    len(strip_tags("", pos_sentence[:tagged_end])),
                ))
        return ids, levels, ranges, patterns

    @staticmethod
    def _merged_ranges(length, ranges):
        """Merge (start, end) spans over ``[0, length)`` into sorted disjoint runs."""
        covered = [False] * length
        for start, end in ranges:
            for position in range(max(start, 0), min(end, length)):
                covered[position] = True
        merged = []
        run_start = None
        for position, is_covered in enumerate(covered):
            if is_covered and run_start is None:
                run_start = position
            elif not is_covered and run_start is not None:
                merged.append((run_start, position))
                run_start = None
        if run_start is not None:
            # A run reaching the end of the text closes at ``length`` so the
            # last character is included.
            merged.append((run_start, length))
        return merged

    @staticmethod
    def _wrap_ranges(sentence, ranges, open_tag, close_tag):
        """Wrap each sorted, disjoint span of ``sentence`` in the given tags."""
        pieces = []
        cursor = 0
        for start, end in ranges:
            pieces.append(sentence[cursor:start])
            pieces.append(open_tag + sentence[start:end] + close_tag)
            cursor = end
        pieces.append(sentence[cursor:])
        return "".join(pieces)

    def list_all_grammars(self, sentence, debug = False):
        """List every grammar point detected in ``sentence``.

        POS tags VH, VHC, VI, VJ, VK and VL are rewritten to ``Vs`` before
        the POS grammar table is matched.

        Args:
            sentence: text to analyse.
            debug: when true, the POS-annotated sentence is returned too.

        Returns:
            ``(grammar_ids, grammar_ranges)``, or
            ``(grammar_ids, grammar_ranges, pos_sentence)`` when ``debug``.
        """
        words_seg, _ = self.sentence2words_TBCL(sentence)
        pos_list = self.sentence_pos(words_seg)
        pos_sentence = self._build_pos_sentence(words_seg, pos_list, fold_vs=True)
        find_grammars_id, _, find_grammars_range, _ = self._collect_grammars(
            sentence, words_seg, pos_sentence)

        if debug:
            return find_grammars_id, find_grammars_range, pos_sentence
        return find_grammars_id, find_grammars_range

    def sentence2TBCL_by_grammar(self, sentence, level_lower_bound = 0, reason = False, debug = False):
        """Grade ``sentence`` from the grammar points it contains.

        Args:
            sentence: text to grade.
            level_lower_bound: only grammar points above this level are
                highlighted / tabulated when ``reason`` is true.
            reason: when true, display the sentence with the matching spans
                highlighted and a table of the detected grammar points
                (IPython ``display``).
            debug: when true, print the raw detection lists.

        Returns:
            ``"X"`` when no grammar point is detected. Otherwise the highest
            detected level, or that level minus 0.5 when the latter is itself
            among the detected levels.
        """
        words_seg, _ = self.sentence2words_TBCL(sentence)
        pos_list = self.sentence_pos(words_seg)
        # NOTE(review): unlike list_all_grammars, the VH/VI/... -> "Vs"
        # folding is not applied here; kept as in the original, confirm
        # whether that difference is intended.
        pos_sentence = self._build_pos_sentence(words_seg, pos_list, fold_vs=False)
        find_grammars_id, find_grammars_level, find_grammars_range, pattern_list = \
            self._collect_grammars(sentence, words_seg, pos_sentence)

        if not find_grammars_level:
            if reason: print("未偵測到任何文法")
            return "X"

        if reason:
            print("找出大於level_lower_bound的語法位置: (預設0級)")
            color_style = ["<span style=\"color: #B22222;\";>", "</span>"]
            # Merge the spans of all grammar points above the lower bound.
            final_ranges = self._merged_ranges(
                len(sentence),
                [
                    grammar_range
                    for grammar_level, grammar_range in zip(find_grammars_level, find_grammars_range)
                    if grammar_level > level_lower_bound
                ],
            )
            show_sentence = self._wrap_ranges(
                sentence, final_ranges, color_style[0], color_style[1])
            display(HTML(show_sentence))

            print("HTML原始版本:\n")
            print(show_sentence)

            print("印出大於level_lower_bound的語法表格: (預設0級)")
            show_dict = {"grammar_id":[], "grammar_name":[], "grammar_level":[], "example":[]}
            for grammars_id, grammars_level in zip(find_grammars_id, find_grammars_level):
                # A table entry may carry one id or a list of ids.
                members = grammars_id if isinstance(grammars_id, list) else [grammars_id]
                for member in members:
                    if member in show_dict["grammar_id"]:
                        continue
                    show_dict["grammar_id"].append(member)
                    show_dict["grammar_level"].append(grammars_level)

            for grammar_id in show_dict["grammar_id"]:
                grammar_name, example = self.grammar_id2info[grammar_id]
                show_dict["grammar_name"].append(grammar_name)
                # Drop markup tags from the example text.
                show_dict["example"].append(re.sub("<.*?>", "", example))
            show_dataframe = pd.DataFrame.from_dict(show_dict)
            show_dataframe = show_dataframe[show_dataframe["grammar_level"] > level_lower_bound]
            display(HTML(show_dataframe.to_html()))

        if debug:
            print(find_grammars_id)
            print(find_grammars_level)
            print(pos_sentence)
            print(pattern_list)
            print(find_grammars_range)

        tmp_grammars_level = max(find_grammars_level)
        if (tmp_grammars_level - 0.5) in find_grammars_level:
            return tmp_grammars_level - 0.5
        return tmp_grammars_level

    def sentence_pos(self, words_seg, debug= False):
        """Return the POS tags ``self.pos`` assigns to the tokens ``words_seg``.

        Args:
            words_seg: list of segmented tokens of one sentence.
            debug: when true, print every ``token(tag)`` pair.
        """
        words_pos = self.pos([words_seg])[0]
        if debug:
            for word, token in zip(words_seg, words_pos):
                print(f"{word}({token})", end="\u3000")
            print()
        return words_pos

    def ids2pndas(self, pos_list, ids):
        """Build a vocabulary table (pandas DataFrame) for the given word ids.

        Args:
            pos_list: POS tag of each word, parallel to ``ids``.
            ids: keys (converted with ``str``) into ``self.id2info``.

        Returns:
            A DataFrame with one row per id and the columns
            序號, 繁體中文, 簡體中文, 注音, 拼音, 詞性, 例句. The simplified form is
            obtained from ``GoogleTranslator`` (network call); 例句 is always
            ``"X"``.
        """
        keys = ["序號", "繁體中文", "簡體中文", "注音", "拼音", "詞性", "例句"]
        rows = []
        translator = None
        for i, (index, pos) in enumerate(zip(ids, pos_list)):
            if translator is None:
                # Created lazily so an empty id list needs no translator.
                translator = GoogleTranslator(source='zh-TW', target='zh-CN')
            info = self.id2info[str(index)]
            rows.append({
                "序號": i,
                "繁體中文": info["word"],
                "簡體中文": translator.translate(info["word"]),
                "注音": info["bopomofo"],
                "拼音": info["pinyin"],
                "詞性": pos,
                "例句": "X",
            })
        # Build the frame once instead of concatenating inside the loop.
        return pd.DataFrame(rows, columns=keys)

    def sentence2word_list(self, sentence:str, need_level:int, sim_affix=False, debug=True):
        """Tabulate the distinct words of ``sentence`` whose level is ``need_level``.

        Args:
            sentence: text to analyse.
            need_level: level a word must have to be listed.
            sim_affix: unused; kept for interface compatibility.
            debug: forwarded to ``sentence_pos`` (prints token/tag pairs).

        Returns:
            The DataFrame produced by ``ids2pndas`` for the selected words
            (only words present in ``self.word2id`` are listed).
        """
        words, levels = self.sentence2words_TBCL(sentence)
        pos_list = self.sentence_pos(words, debug=debug)
        word_id_list = []
        words_result = []
        pos_result = []
        for word, level, pos in zip(words, levels, pos_list):
            if need_level == level and word in self.word2id and word not in words_result:
                word_id_list.append(self.word2id[word])
                words_result.append(word)
                pos_result.append(pos)
        return self.ids2pndas(pos_result, word_id_list)

    def label_sentence(self, sentence, grammar_range):
        """Return ``sentence`` with ``grammar_range`` wrapped in a blue ``<b>`` tag.

        Args:
            sentence: raw text.
            grammar_range: ``(start, end)`` character offsets to highlight.
        """
        color_style = ["<b style='color:blue;'>", "</b>"]
        start = grammar_range[0]
        end = grammar_range[1]
        return sentence[:start] + color_style[0] + sentence[start:end] + color_style[1] + sentence[end:]
|
util/sentence_dealer/setup.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import zipfile
|
| 3 |
+
from setuptools import setup, find_packages
|
| 4 |
+
|
| 5 |
+
# Read the dependency list from requirements.txt.
def parse_requirements(filename):
    """Return the requirement specifiers listed in ``filename``.

    Blank lines and comment lines (first non-blank character is ``#``) are
    skipped; every other line is returned with surrounding whitespace removed.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        # Test the stripped text so indented comments are skipped too.
        return [line for line in stripped_lines if line and not line.startswith('#')]
|
| 9 |
+
|
| 10 |
+
# Apply the dependency list. requirements.txt is resolved next to this
# setup.py so the build does not depend on the current working directory.
requirements = parse_requirements(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), 'requirements.txt')
)

setup(
    name="sentence_dealer",
    version="1.0.0",
    description="A sentence segmentation and POS tagging package",
    author="BCC",
    # The package sources live in the same directory as this setup.py.
    packages=['sentence_dealer'],
    package_dir={'sentence_dealer': './'},
    install_requires=requirements,
    include_package_data=True,
    # Ship the JSON tables and the CKIP model files with the package.
    package_data={
        "sentence_dealer": ["./data/**/*", "./CKIP/**/*"],
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.6",
)
|