Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,6 +4,7 @@ import glob
|
|
| 4 |
import csv
|
| 5 |
import threading
|
| 6 |
import shutil
|
|
|
|
| 7 |
from datetime import datetime, timedelta
|
| 8 |
from typing import Optional
|
| 9 |
|
|
@@ -11,6 +12,9 @@ import gradio as gr
|
|
| 11 |
import textstat
|
| 12 |
from openai import OpenAI
|
| 13 |
|
|
|
|
|
|
|
|
|
|
| 14 |
# =========================
|
| 15 |
# 設定(元コード踏襲)
|
| 16 |
# =========================
|
|
@@ -33,6 +37,9 @@ os.makedirs(REWRITE_DIR, exist_ok=True)
|
|
| 33 |
|
| 34 |
PASSAGES_DIR = os.getenv("PASSAGES_DIR", "passages")
|
| 35 |
|
|
|
|
|
|
|
|
|
|
| 36 |
if not API_KEY:
|
| 37 |
raise RuntimeError("API_KEY is not set (env: API_KEY)")
|
| 38 |
|
|
@@ -44,147 +51,77 @@ _rewrite_sem = threading.Semaphore(REWRITE_CONCURRENCY)
|
|
| 44 |
_stop_flag_lock = threading.Lock()
|
| 45 |
_stop_flag = False
|
| 46 |
|
|
|
|
| 47 |
# =========================
|
| 48 |
-
#
|
| 49 |
# =========================
|
| 50 |
-
def list_passage_files_sorted(passages_dir: str) -> list[tuple[int, str]]:
    """Collect ``pg<N>.txt`` files under *passages_dir*, ordered by ``N``.

    Returns a list of ``(N, path)`` tuples sorted ascending by the integer
    ``N``. Files matched by the ``pg*.txt`` glob whose basename is not exactly
    ``pg<digits>.txt`` are ignored.
    """
    numbered: list[tuple[int, str]] = []
    for candidate in glob.glob(os.path.join(passages_dir, "pg*.txt")):
        hit = re.match(r"pg(\d+)\.txt$", os.path.basename(candidate))
        if hit is None:
            continue
        numbered.append((int(hit.group(1)), candidate))
    # Order on the numeric id only; the sort is stable, so ties keep glob order.
    return sorted(numbered, key=lambda pair: pair[0])
|
| 62 |
-
|
| 63 |
-
|
| 64 |
def load_text(path: str) -> str:
    """Return the entire contents of the file at *path*, decoded as UTF-8."""
    with open(path, mode="r", encoding="utf-8") as src:
        contents = src.read()
    return contents
|
| 67 |
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
# =========================
|
| 70 |
-
#
|
| 71 |
# =========================
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
}
|
| 76 |
|
|
|
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
|
|
|
| 83 |
|
|
|
|
| 84 |
|
| 85 |
-
|
| 86 |
-
"""
|
| 87 |
-
|
| 88 |
-
- "legacy": 元の1本プロンプト(現状踏襲)
|
| 89 |
-
- "final_v1": 最終プロンプトセット(Constraints重視)
|
| 90 |
-
"""
|
| 91 |
-
level_to_flesch = {1: 90, 2: 75, 3: 65, 4: 55, 5: 40}
|
| 92 |
-
target_flesch = level_to_flesch[int(target_level)]
|
| 93 |
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
Requirements:
|
| 99 |
- Extract only the main text, excluding titles, author names, source information, chapter numbers, annotations, footers, etc.
|
| 100 |
- Clearly separate sections such as chapters by inserting blank lines between them.
|
| 101 |
- Maintain the original meaning faithfully; do not add or remove important information.
|
| 102 |
-
- Rewrite
|
| 103 |
-
|
| 104 |
-
Use simple, modern sentence structures and word order.
|
| 105 |
- Output only the rewritten text. Do not include explanatory notes or other additional text.
|
| 106 |
{text}
|
| 107 |
-
"""
|
| 108 |
-
|
| 109 |
-
# ---- final_v1: あなたの「最終プロンプトセット」をコードに落とし込み ----
|
| 110 |
-
level_name = {
|
| 111 |
-
1: "Level 90",
|
| 112 |
-
2: "Level 75",
|
| 113 |
-
3: "Level 65",
|
| 114 |
-
4: "Level 55",
|
| 115 |
-
5: "Level 40",
|
| 116 |
-
}[int(target_level)]
|
| 117 |
-
|
| 118 |
-
level_constraints = {
|
| 119 |
-
5: [
|
| 120 |
-
"- Average sentence length: 18–25 words",
|
| 121 |
-
"- Use abstract nouns (e.g., system, process, structure, responsibility)",
|
| 122 |
-
"- Allow multi-syllable words",
|
| 123 |
-
"- Use complex sentences with relative clauses",
|
| 124 |
-
"- Prefer formal, academic tone",
|
| 125 |
-
"- Avoid short sentences",
|
| 126 |
-
],
|
| 127 |
-
4: [
|
| 128 |
-
"- Average sentence length: 14–18 words",
|
| 129 |
-
"- Mix abstract and concrete vocabulary",
|
| 130 |
-
"- Avoid very short sentences",
|
| 131 |
-
"- Limit technical jargon",
|
| 132 |
-
"- Use clear logical flow",
|
| 133 |
-
],
|
| 134 |
-
3: [
|
| 135 |
-
"- Average sentence length: 10–14 words",
|
| 136 |
-
"- Use common vocabulary",
|
| 137 |
-
"- One main idea per sentence",
|
| 138 |
-
"- Prefer active voice",
|
| 139 |
-
"- Avoid abstract nominalizations",
|
| 140 |
-
"- Split long sentences into shorter independent sentences",
|
| 141 |
-
],
|
| 142 |
-
2: [
|
| 143 |
-
"- Average sentence length: 7–10 words",
|
| 144 |
-
"- Use everyday vocabulary only",
|
| 145 |
-
"- Avoid complex connectors (however, therefore, although)",
|
| 146 |
-
"- Use short, simple sentences",
|
| 147 |
-
"- Repeat key ideas using different simple wording",
|
| 148 |
-
],
|
| 149 |
-
1: [
|
| 150 |
-
"- Average sentence length: 4–6 words",
|
| 151 |
-
"- Use only basic vocabulary",
|
| 152 |
-
"- One action or fact per sentence",
|
| 153 |
-
"- Avoid abstract words completely",
|
| 154 |
-
"- Avoid explanations and reasoning",
|
| 155 |
-
"- Prefer many short declarative sentences",
|
| 156 |
-
],
|
| 157 |
-
}[int(target_level)]
|
| 158 |
-
|
| 159 |
-
constraints_block = "\n".join(level_constraints)
|
| 160 |
-
|
| 161 |
-
return f"""Rewrite the following text for learners at {level_name}.
|
| 162 |
-
|
| 163 |
-
Requirements:
|
| 164 |
-
- Target Flesch Reading Ease: around {target_flesch} (±3)
|
| 165 |
-
- Keep the original meaning.
|
| 166 |
-
- Do not add new information.
|
| 167 |
-
- Output only the rewritten text.
|
| 168 |
-
|
| 169 |
-
Constraints:
|
| 170 |
-
{constraints_block}
|
| 171 |
-
|
| 172 |
-
Additional handling (same as current system):
|
| 173 |
-
- Extract only the portions of the text that should be read as the main body, excluding title/author/source/chapter numbers/annotations/footers.
|
| 174 |
-
- When outputting, make sure sections divided by chapters are clearly distinguishable by leaving a blank line between them.
|
| 175 |
-
|
| 176 |
-
Text:
|
| 177 |
-
<<<
|
| 178 |
-
{text}
|
| 179 |
-
>>>
|
| 180 |
-
"""
|
| 181 |
|
| 182 |
|
| 183 |
# =========================
|
| 184 |
-
# 書き換え
|
| 185 |
# =========================
|
| 186 |
-
def rewrite_level(text: str, target_level: int
|
| 187 |
-
prompt = build_prompt(text=text, target_level=target_level
|
| 188 |
|
| 189 |
max_tokens_candidates = [3000, 2000, 1500, 1000]
|
| 190 |
last_error = None
|
|
@@ -223,7 +160,7 @@ def compute_metrics(text: str) -> tuple[float, int]:
|
|
| 223 |
|
| 224 |
|
| 225 |
# =========================
|
| 226 |
-
# CSV追記
|
| 227 |
# =========================
|
| 228 |
_csv_lock = threading.Lock()
|
| 229 |
|
|
@@ -240,7 +177,7 @@ def append_csv_row(row: dict):
|
|
| 240 |
|
| 241 |
|
| 242 |
# =========================
|
| 243 |
-
# txt追記(★ファイル名
|
| 244 |
# =========================
|
| 245 |
_txt_lock = threading.Lock()
|
| 246 |
|
|
@@ -248,19 +185,21 @@ _txt_lock = threading.Lock()
|
|
| 248 |
def append_rewrite_txt(
|
| 249 |
text_id: int,
|
| 250 |
target_level: int,
|
| 251 |
-
|
| 252 |
-
|
| 253 |
word_count: int,
|
| 254 |
rewritten_text: str,
|
| 255 |
):
|
| 256 |
ts = (datetime.utcnow() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")
|
| 257 |
-
|
| 258 |
-
|
|
|
|
| 259 |
|
| 260 |
block = (
|
| 261 |
f"# Text {text_id}\n"
|
| 262 |
f"Target Level: {target_level}\n"
|
| 263 |
-
f"Flesch
|
|
|
|
| 264 |
f"Word Count: {word_count}\n"
|
| 265 |
f"Timestamp (JST): {ts}\n"
|
| 266 |
f"Model: {MODEL}\n\n"
|
|
@@ -292,44 +231,85 @@ def get_stop() -> bool:
|
|
| 292 |
# UIロジック
|
| 293 |
# =========================
|
| 294 |
def init_state():
|
| 295 |
-
|
| 296 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
|
| 298 |
|
| 299 |
-
def start(level: int
|
| 300 |
set_stop(False)
|
| 301 |
st = init_state()
|
| 302 |
-
total = len(st["files"])
|
| 303 |
-
if total == 0:
|
| 304 |
-
return st, "passages/pg*.txt が見つかりません", "", "", "", None, None
|
| 305 |
|
| 306 |
-
|
| 307 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
|
|
|
|
|
|
|
| 309 |
|
| 310 |
-
|
|
|
|
| 311 |
set_stop(False)
|
| 312 |
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
total =
|
|
|
|
|
|
|
|
|
|
| 316 |
|
| 317 |
-
|
| 318 |
-
|
| 319 |
|
| 320 |
-
|
| 321 |
original = load_text(path)
|
| 322 |
|
| 323 |
-
rewritten = rewrite_level(original, target_level=level
|
| 324 |
-
|
| 325 |
|
|
|
|
| 326 |
ts = (datetime.utcnow() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")
|
| 327 |
|
| 328 |
append_csv_row({
|
| 329 |
"timestamp_jst": ts,
|
| 330 |
"Text#": text_id,
|
| 331 |
"target_level": level,
|
| 332 |
-
"flesch_reading_ease": f"{
|
| 333 |
"word_count": wc,
|
| 334 |
"rewritten_text": rewritten
|
| 335 |
})
|
|
@@ -337,59 +317,65 @@ def run_one(level: int, prompt_mode: str, state: dict):
|
|
| 337 |
append_rewrite_txt(
|
| 338 |
text_id=text_id,
|
| 339 |
target_level=level,
|
| 340 |
-
|
| 341 |
-
|
| 342 |
word_count=wc,
|
| 343 |
rewritten_text=rewritten,
|
| 344 |
)
|
| 345 |
|
| 346 |
-
|
|
|
|
|
|
|
| 347 |
|
| 348 |
-
|
| 349 |
header = (
|
| 350 |
f"#Text {text_id}\n"
|
| 351 |
f"Target Level: {level}\n"
|
| 352 |
-
f"
|
| 353 |
-
f"Flesch Reading Ease: {
|
| 354 |
f"Word Count: {wc}\n"
|
| 355 |
f"Saved CSV: {CSV_PATH}\n"
|
| 356 |
-
f"Saved TXT: {
|
| 357 |
)
|
| 358 |
-
|
|
|
|
| 359 |
return state, "1件処理しました。", header, progress, rewritten, None, None
|
| 360 |
|
| 361 |
|
| 362 |
-
def run_all(level: int,
|
| 363 |
set_stop(False)
|
| 364 |
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
total =
|
| 368 |
|
| 369 |
-
if
|
| 370 |
-
return state, "全て処理済みです。", "", f"{
|
| 371 |
|
| 372 |
last_header = ""
|
| 373 |
last_text = ""
|
| 374 |
|
| 375 |
-
while
|
| 376 |
if get_stop():
|
| 377 |
-
state["
|
| 378 |
-
|
|
|
|
| 379 |
|
| 380 |
-
text_id
|
|
|
|
| 381 |
original = load_text(path)
|
| 382 |
|
| 383 |
-
rewritten = rewrite_level(original, target_level=level
|
| 384 |
-
|
| 385 |
|
|
|
|
| 386 |
ts = (datetime.utcnow() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")
|
| 387 |
|
| 388 |
append_csv_row({
|
| 389 |
"timestamp_jst": ts,
|
| 390 |
"Text#": text_id,
|
| 391 |
"target_level": level,
|
| 392 |
-
"flesch_reading_ease": f"{
|
| 393 |
"word_count": wc,
|
| 394 |
"rewritten_text": rewritten
|
| 395 |
})
|
|
@@ -397,28 +383,29 @@ def run_all(level: int, prompt_mode: str, state: dict):
|
|
| 397 |
append_rewrite_txt(
|
| 398 |
text_id=text_id,
|
| 399 |
target_level=level,
|
| 400 |
-
|
| 401 |
-
|
| 402 |
word_count=wc,
|
| 403 |
rewritten_text=rewritten,
|
| 404 |
)
|
| 405 |
|
| 406 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
last_header = (
|
| 408 |
f"#Text {text_id}\n"
|
| 409 |
f"Target Level: {level}\n"
|
| 410 |
-
f"
|
| 411 |
-
f"Flesch Reading Ease: {
|
| 412 |
f"Word Count: {wc}\n"
|
| 413 |
f"Saved CSV: {CSV_PATH}\n"
|
| 414 |
-
f"Saved TXT: {
|
| 415 |
)
|
| 416 |
last_text = rewritten
|
| 417 |
|
| 418 |
-
|
| 419 |
-
state["idx"] = idx
|
| 420 |
-
|
| 421 |
-
return state, "全件処理が完了しました。", last_header, f"{idx} / {total}", last_text, None, None
|
| 422 |
|
| 423 |
|
| 424 |
def stop():
|
|
@@ -446,18 +433,14 @@ def reset_rewrite_folder():
|
|
| 446 |
|
| 447 |
|
| 448 |
# =========================
|
| 449 |
-
#
|
| 450 |
# =========================
|
| 451 |
def list_generated_txt_files() -> list[str]:
|
| 452 |
-
|
| 453 |
-
files = sorted(glob.glob(os.path.join(REWRITE_DIR, "Text_*_*_*.txt")))
|
| 454 |
return [os.path.basename(f) for f in files]
|
| 455 |
|
| 456 |
|
| 457 |
def build_single_txt_path(selected_name: str) -> str:
|
| 458 |
-
"""
|
| 459 |
-
選択されたtxtをダウンロード用に返す(gr.Fileに渡す)
|
| 460 |
-
"""
|
| 461 |
path = os.path.join(REWRITE_DIR, selected_name)
|
| 462 |
if not os.path.exists(path):
|
| 463 |
raise FileNotFoundError(f"Not found: {path}")
|
|
@@ -465,11 +448,6 @@ def build_single_txt_path(selected_name: str) -> str:
|
|
| 465 |
|
| 466 |
|
| 467 |
def build_zip_of_txts(mode: str, n_last: int) -> str:
|
| 468 |
-
"""
|
| 469 |
-
mode:
|
| 470 |
-
- 'all' : 全txt
|
| 471 |
-
- 'last_n' : 最新N個
|
| 472 |
-
"""
|
| 473 |
files = sorted(glob.glob(os.path.join(REWRITE_DIR, "Text_*.txt")), key=os.path.getmtime)
|
| 474 |
if not files:
|
| 475 |
raise FileNotFoundError("No generated txt files yet.")
|
|
@@ -477,12 +455,10 @@ def build_zip_of_txts(mode: str, n_last: int) -> str:
|
|
| 477 |
if mode == "last_n":
|
| 478 |
files = files[-max(1, int(n_last)):]
|
| 479 |
|
| 480 |
-
# ZIPは毎回作り直す
|
| 481 |
zip_path = os.path.join(OUT_DIR, "rewrite_passages.zip")
|
| 482 |
if os.path.exists(zip_path):
|
| 483 |
os.remove(zip_path)
|
| 484 |
|
| 485 |
-
# 一時ディレクトリに集めてからzip(shutil.make_archiveはディレクトリ単位)
|
| 486 |
tmp_dir = os.path.join(OUT_DIR, "_zip_tmp")
|
| 487 |
if os.path.exists(tmp_dir):
|
| 488 |
shutil.rmtree(tmp_dir)
|
|
@@ -492,8 +468,6 @@ def build_zip_of_txts(mode: str, n_last: int) -> str:
|
|
| 492 |
shutil.copy(fp, os.path.join(tmp_dir, os.path.basename(fp)))
|
| 493 |
|
| 494 |
shutil.make_archive(os.path.join(OUT_DIR, "rewrite_passages"), "zip", tmp_dir)
|
| 495 |
-
|
| 496 |
-
# 後片付け
|
| 497 |
shutil.rmtree(tmp_dir, ignore_errors=True)
|
| 498 |
|
| 499 |
return zip_path
|
|
@@ -518,21 +492,15 @@ with gr.Blocks() as demo:
|
|
| 518 |
|
| 519 |
state = gr.State(init_state())
|
| 520 |
|
| 521 |
-
# ★
|
| 522 |
-
prompt_mode_ui = gr.Dropdown(
|
| 523 |
-
choices=list(PROMPT_MODES.keys()),
|
| 524 |
-
value="既存(シンプル)",
|
| 525 |
-
label="Prompt Mode(試すプロンプトを選択)"
|
| 526 |
-
)
|
| 527 |
-
|
| 528 |
level = gr.Dropdown(choices=[1, 2, 3, 4, 5], value=1, label="Target Level (1..5)")
|
| 529 |
status = gr.Textbox(label="Status", interactive=False)
|
| 530 |
-
header = gr.Textbox(label="Result Header (#Text / FRE / Words)", lines=
|
| 531 |
progress = gr.Textbox(label="Progress", interactive=False)
|
| 532 |
output_text = gr.Textbox(label="Rewritten Text", lines=18, interactive=False)
|
| 533 |
|
| 534 |
with gr.Row():
|
| 535 |
-
start_btn = gr.Button("開始(
|
| 536 |
one_btn = gr.Button("次へ(1件処理)")
|
| 537 |
all_btn = gr.Button("全件実行(残りを処理)")
|
| 538 |
stop_btn = gr.Button("停止")
|
|
@@ -567,27 +535,24 @@ with gr.Blocks() as demo:
|
|
| 567 |
download_csv_btn = gr.Button("CSV(rewrite_scores.csv)をダウンロード")
|
| 568 |
download_csv_file = gr.File(label="Download (csv)")
|
| 569 |
|
| 570 |
-
# 既存の表示(参考)
|
| 571 |
gr.Markdown(f"📄 CSVパス: `{CSV_PATH}`")
|
| 572 |
gr.Markdown(f"📝 TXTフォルダ: `{REWRITE_DIR}`")
|
|
|
|
| 573 |
|
| 574 |
-
# ---- 既存ボタン(prompt_mode
|
| 575 |
-
def _pm_key(pm_label: str) -> str:
|
| 576 |
-
return PROMPT_MODES.get(pm_label, "legacy")
|
| 577 |
-
|
| 578 |
start_btn.click(
|
| 579 |
-
fn=
|
| 580 |
-
inputs=[level
|
| 581 |
outputs=[state, status, header, progress, output_text, download_one_file, download_zip_file]
|
| 582 |
)
|
| 583 |
one_btn.click(
|
| 584 |
-
fn=
|
| 585 |
-
inputs=[level,
|
| 586 |
outputs=[state, status, header, progress, output_text, download_one_file, download_zip_file]
|
| 587 |
)
|
| 588 |
all_btn.click(
|
| 589 |
-
fn=
|
| 590 |
-
inputs=[level,
|
| 591 |
outputs=[state, status, header, progress, output_text, download_one_file, download_zip_file]
|
| 592 |
)
|
| 593 |
|
|
|
|
| 4 |
import csv
|
| 5 |
import threading
|
| 6 |
import shutil
|
| 7 |
+
import random
|
| 8 |
from datetime import datetime, timedelta
|
| 9 |
from typing import Optional
|
| 10 |
|
|
|
|
| 12 |
import textstat
|
| 13 |
from openai import OpenAI
|
| 14 |
|
| 15 |
+
# ★追加:xlsx読込
|
| 16 |
+
import pandas as pd
|
| 17 |
+
|
| 18 |
# =========================
|
| 19 |
# 設定(元コード踏襲)
|
| 20 |
# =========================
|
|
|
|
| 37 |
|
| 38 |
PASSAGES_DIR = os.getenv("PASSAGES_DIR", "passages")
|
| 39 |
|
| 40 |
+
# ★追加:Excel
|
| 41 |
+
PASSAGE_INFO_XLSX = os.getenv("PASSAGE_INFO_XLSX", "passage_information.xlsx")
|
| 42 |
+
|
| 43 |
if not API_KEY:
|
| 44 |
raise RuntimeError("API_KEY is not set (env: API_KEY)")
|
| 45 |
|
|
|
|
| 51 |
_stop_flag_lock = threading.Lock()
|
| 52 |
_stop_flag = False
|
| 53 |
|
| 54 |
+
|
| 55 |
# =========================
|
| 56 |
+
# ユーティリティ
|
| 57 |
# =========================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
def load_text(path: str) -> str:
    """Read the file at *path* as UTF-8 text and return all of it."""
    with open(path, mode="r", encoding="utf-8") as handle:
        body = handle.read()
    return body
|
| 61 |
|
| 62 |
|
| 63 |
+
def target_flesch_from_level(level: int) -> int:
    """Map a target level (1-5) to its Flesch Reading Ease target score.

    *level* is coerced with ``int()`` first; a level outside 1..5 raises
    ``KeyError``.
    """
    normalized = int(level)
    return {1: 90, 2: 75, 3: 65, 4: 55, 5: 40}[normalized]
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def passage_path_from_text_id(text_id: int, passages_dir: str) -> str:
    """Build the path of the passage file for *text_id* inside *passages_dir*.

    Uses the ``pg{ID}.txt`` file naming. The path is only constructed here;
    whether the file exists is not checked.
    """
    filename = "pg{}.txt".format(text_id)
    return os.path.join(passages_dir, filename)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
# =========================
|
| 74 |
+
# Excel読み込み(Text# と flesch_score)
|
| 75 |
# =========================
|
| 76 |
+
def load_passage_info(xlsx_path: str) -> pd.DataFrame:
    """Load the passage table (``Text#`` and ``flesch_score``) from a workbook.

    Column headers are matched after trimming surrounding whitespace; an
    exact match wins, otherwise the match is case-insensitive. The matched
    columns are renamed to the canonical ``Text#`` / ``flesch_score``.
    Rows whose ``Text#`` is not a whole number, or whose ``flesch_score`` is
    not numeric, are dropped.

    Args:
        xlsx_path: Path of the Excel workbook to read.

    Returns:
        A DataFrame whose ``Text#`` column is ``int`` and whose
        ``flesch_score`` column is ``float``. Other columns pass through.

    Raises:
        FileNotFoundError: *xlsx_path* does not exist.
        ValueError: one of the two required columns is missing.
    """
    if not os.path.exists(xlsx_path):
        raise FileNotFoundError(f"Not found: {xlsx_path}")

    df = pd.read_excel(xlsx_path)

    # Headers can be non-strings (e.g. numbers), so stringify before trimming.
    trimmed = {str(col).strip(): col for col in df.columns}
    folded = {key.casefold(): col for key, col in trimmed.items()}

    resolved = {}
    for wanted in ("Text#", "flesch_score"):
        if wanted in trimmed:
            resolved[wanted] = trimmed[wanted]
        elif wanted.casefold() in folded:
            resolved[wanted] = folded[wanted.casefold()]

    if len(resolved) < 2:
        # Name the file actually read: the path is configurable.
        raise ValueError(
            f"{xlsx_path} must contain columns: 'Text#' and 'flesch_score'"
        )

    df = df.rename(columns={actual: canon for canon, actual in resolved.items()})

    text_ids = pd.to_numeric(df["Text#"], errors="coerce")
    scores = pd.to_numeric(df["flesch_score"], errors="coerce")

    # A fractional id cannot name a pg{ID}.txt file. Casting such a value to
    # the nullable Int64 dtype raises TypeError and would abort the whole
    # load, so those rows are filtered out together with the unparsable ones.
    keep = text_ids.notna() & scores.notna() & (text_ids % 1 == 0)

    df = df.loc[keep].copy()
    df["Text#"] = text_ids[keep].astype(int)
    df["flesch_score"] = scores[keep].astype(float)
    return df
|
| 99 |
|
| 100 |
+
|
| 101 |
+
# =========================
|
| 102 |
+
# プロンプト(★1本固定:元の1つ目だけ)
|
| 103 |
+
# =========================
|
| 104 |
+
def build_prompt(text: str, target_level: int) -> str:
    """Compose the single fixed rewrite prompt for *text* at *target_level*.

    The level is translated to a Flesch target score through
    ``target_flesch_from_level``; the passage is appended after the list of
    requirements. Leading and trailing whitespace of the assembled prompt is
    stripped.
    """
    score = target_flesch_from_level(target_level)
    parts = (
        f"Rewrite the following text to achieve a Flesch Readability Score of {score}.",
        "Requirements:",
        "- Extract only the main text, excluding titles, author names, source information, chapter numbers, annotations, footers, etc.",
        "- Clearly separate sections such as chapters by inserting blank lines between them.",
        "- Maintain the original meaning faithfully; do not add or remove important information.",
        "- Rewrite the text in clear, modern English suitable for learners, consistently eliminating archaic, outdated, or literary expressions and unnatural syntax common in older texts, regardless of the target level.",
        "- Key points to keep in mind when lowering reading comprehension difficulty include keeping sentences short, using familiar vocabulary, prioritizing simple sentence structures, avoiding jargon or providing clear explanations when used, and minimizing figurative language, idioms, and other expressions where meaning cannot be directly inferred from the words themselves.",
        "- Output only the rewritten text. Do not include explanatory notes or other additional text.",
        f"{text}",
    )
    return "\n".join(parts).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
|
| 120 |
# =========================
|
| 121 |
+
# 書き換え
|
| 122 |
# =========================
|
| 123 |
+
def rewrite_level(text: str, target_level: int) -> str:
|
| 124 |
+
prompt = build_prompt(text=text, target_level=target_level)
|
| 125 |
|
| 126 |
max_tokens_candidates = [3000, 2000, 1500, 1000]
|
| 127 |
last_error = None
|
|
|
|
| 160 |
|
| 161 |
|
| 162 |
# =========================
|
| 163 |
+
# CSV追記(元コード踏襲:列は増やさない)
|
| 164 |
# =========================
|
| 165 |
_csv_lock = threading.Lock()
|
| 166 |
|
|
|
|
| 177 |
|
| 178 |
|
| 179 |
# =========================
|
| 180 |
+
# txt追記(★ファイル名を Text# と Target level が分かる形に)
|
| 181 |
# =========================
|
| 182 |
_txt_lock = threading.Lock()
|
| 183 |
|
|
|
|
| 185 |
def append_rewrite_txt(
|
| 186 |
text_id: int,
|
| 187 |
target_level: int,
|
| 188 |
+
original_fre: float, # ★追加:元教材スコア
|
| 189 |
+
rewritten_fre: float,
|
| 190 |
word_count: int,
|
| 191 |
rewritten_text: str,
|
| 192 |
):
|
| 193 |
ts = (datetime.utcnow() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")
|
| 194 |
+
|
| 195 |
+
# ★要件:#Text と Target level がわかるファイル名
|
| 196 |
+
path = os.path.join(REWRITE_DIR, f"Text_{text_id}_Level_{target_level}.txt")
|
| 197 |
|
| 198 |
block = (
|
| 199 |
f"# Text {text_id}\n"
|
| 200 |
f"Target Level: {target_level}\n"
|
| 201 |
+
f"Original Flesch (from passage_information.xlsx): {original_fre:.2f}\n"
|
| 202 |
+
f"Rewritten Flesch Reading Ease: {rewritten_fre:.2f}\n"
|
| 203 |
f"Word Count: {word_count}\n"
|
| 204 |
f"Timestamp (JST): {ts}\n"
|
| 205 |
f"Model: {MODEL}\n\n"
|
|
|
|
| 231 |
# UIロジック
|
| 232 |
# =========================
|
| 233 |
def init_state():
    """Create a fresh UI state dict seeded with the passage table.

    The table is read from ``PASSAGE_INFO_XLSX`` via ``load_passage_info``.
    The candidate list, the original-score map and both counters start empty
    and are filled in by ``start()``.
    """
    passage_table = load_passage_info(PASSAGE_INFO_XLSX)

    state = {"df": passage_table}
    state["eligible_ids"] = []    # set by start() according to the level
    state["orig_score_map"] = {}  # text_id -> original Flesch score
    state["done"] = 0
    state["total"] = 0
    return state
|
| 245 |
|
| 246 |
|
| 247 |
+
def start(level: int):
    """Reset the run and build the shuffled work list for *level*.

    Clears the stop flag, reloads the passage table through ``init_state``,
    keeps the passages whose ``flesch_score`` is below the target score for
    *level* and whose ``pg{Text#}.txt`` file exists under ``PASSAGES_DIR``,
    then shuffles them.

    Each ``Text#`` is queued at most once: if the table lists the same id on
    several rows, the first row's score is kept. Otherwise a repeated row
    would inflate ``total`` and the same passage would be queued more than
    once.

    Args:
        level: Target level, mapped to a score by ``target_flesch_from_level``.

    Returns:
        The 7-tuple wired to the UI outputs: (state, status message, header,
        progress, rewritten text, single-file download, zip download).
    """
    set_stop(False)
    st = init_state()

    target_flesch = target_flesch_from_level(level)

    df = st["df"]
    # Requirement: only passages scoring below the target, i.e. harder ones.
    eligible = df[df["flesch_score"] < float(target_flesch)]

    # Keep only ids whose passages/pg{Text#}.txt exists; a missing file would
    # make the later load fail.
    ids = []
    score_map = {}
    for raw_id, raw_score in zip(eligible["Text#"], eligible["flesch_score"]):
        tid = int(raw_id)
        if tid in score_map:
            continue  # duplicate row for an id that is already queued
        if os.path.exists(passage_path_from_text_id(tid, PASSAGES_DIR)):
            ids.append(tid)
            score_map[tid] = float(raw_score)

    if not ids:
        msg = (
            f"条件に合う教材がありません。\n"
            f"- Target FRE: {target_flesch}\n"
            f"- 条件: flesch_score < {target_flesch}\n"
            f"- かつ {PASSAGES_DIR}/pg{{Text#}}.txt が存在"
        )
        return st, msg, "", "", "", None, None

    random.shuffle(ids)
    st["eligible_ids"] = ids
    st["orig_score_map"] = score_map
    st["done"] = 0
    st["total"] = len(ids)

    msg = f"準備完了: 条件に合う {st['total']} 件からランダムに処理します(Target FRE={target_flesch})。"
    return st, msg, "", f"0 / {st['total']}", "", None, None
|
| 284 |
|
| 285 |
+
|
| 286 |
+
def run_one(level: int, state: dict):
|
| 287 |
set_stop(False)
|
| 288 |
|
| 289 |
+
ids = state.get("eligible_ids", [])
|
| 290 |
+
done = int(state.get("done", 0))
|
| 291 |
+
total = int(state.get("total", 0))
|
| 292 |
+
|
| 293 |
+
if not ids:
|
| 294 |
+
return state, "全て処理済みです。", "", f"{done} / {total}", "", None, None
|
| 295 |
|
| 296 |
+
# ★ランダム(ただし未処理リストから先頭を取る:startでshuffle済み)
|
| 297 |
+
text_id = ids.pop(0)
|
| 298 |
|
| 299 |
+
path = passage_path_from_text_id(text_id, PASSAGES_DIR)
|
| 300 |
original = load_text(path)
|
| 301 |
|
| 302 |
+
rewritten = rewrite_level(original, target_level=level)
|
| 303 |
+
rewritten_fre, wc = compute_metrics(rewritten)
|
| 304 |
|
| 305 |
+
original_fre = float(state.get("orig_score_map", {}).get(text_id, float("nan")))
|
| 306 |
ts = (datetime.utcnow() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")
|
| 307 |
|
| 308 |
append_csv_row({
|
| 309 |
"timestamp_jst": ts,
|
| 310 |
"Text#": text_id,
|
| 311 |
"target_level": level,
|
| 312 |
+
"flesch_reading_ease": f"{rewritten_fre:.2f}",
|
| 313 |
"word_count": wc,
|
| 314 |
"rewritten_text": rewritten
|
| 315 |
})
|
|
|
|
| 317 |
append_rewrite_txt(
|
| 318 |
text_id=text_id,
|
| 319 |
target_level=level,
|
| 320 |
+
original_fre=original_fre,
|
| 321 |
+
rewritten_fre=rewritten_fre,
|
| 322 |
word_count=wc,
|
| 323 |
rewritten_text=rewritten,
|
| 324 |
)
|
| 325 |
|
| 326 |
+
done += 1
|
| 327 |
+
state["done"] = done
|
| 328 |
+
state["eligible_ids"] = ids
|
| 329 |
|
| 330 |
+
saved_txt = os.path.join(REWRITE_DIR, f"Text_{text_id}_Level_{level}.txt")
|
| 331 |
header = (
|
| 332 |
f"#Text {text_id}\n"
|
| 333 |
f"Target Level: {level}\n"
|
| 334 |
+
f"Original Flesch (xlsx): {original_fre:.2f}\n"
|
| 335 |
+
f"Rewritten Flesch Reading Ease: {rewritten_fre:.2f}\n"
|
| 336 |
f"Word Count: {wc}\n"
|
| 337 |
f"Saved CSV: {CSV_PATH}\n"
|
| 338 |
+
f"Saved TXT: {saved_txt}"
|
| 339 |
)
|
| 340 |
+
|
| 341 |
+
progress = f"{done} / {total}"
|
| 342 |
return state, "1件処理しました。", header, progress, rewritten, None, None
|
| 343 |
|
| 344 |
|
| 345 |
+
def run_all(level: int, state: dict):
|
| 346 |
set_stop(False)
|
| 347 |
|
| 348 |
+
ids = state.get("eligible_ids", [])
|
| 349 |
+
done = int(state.get("done", 0))
|
| 350 |
+
total = int(state.get("total", 0))
|
| 351 |
|
| 352 |
+
if not ids:
|
| 353 |
+
return state, "全て処理済みです。", "", f"{done} / {total}", "", None, None
|
| 354 |
|
| 355 |
last_header = ""
|
| 356 |
last_text = ""
|
| 357 |
|
| 358 |
+
while ids:
|
| 359 |
if get_stop():
|
| 360 |
+
state["eligible_ids"] = ids
|
| 361 |
+
state["done"] = done
|
| 362 |
+
return state, "停止しました。", last_header, f"{done} / {total}", last_text, None, None
|
| 363 |
|
| 364 |
+
text_id = ids.pop(0)
|
| 365 |
+
path = passage_path_from_text_id(text_id, PASSAGES_DIR)
|
| 366 |
original = load_text(path)
|
| 367 |
|
| 368 |
+
rewritten = rewrite_level(original, target_level=level)
|
| 369 |
+
rewritten_fre, wc = compute_metrics(rewritten)
|
| 370 |
|
| 371 |
+
original_fre = float(state.get("orig_score_map", {}).get(text_id, float("nan")))
|
| 372 |
ts = (datetime.utcnow() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")
|
| 373 |
|
| 374 |
append_csv_row({
|
| 375 |
"timestamp_jst": ts,
|
| 376 |
"Text#": text_id,
|
| 377 |
"target_level": level,
|
| 378 |
+
"flesch_reading_ease": f"{rewritten_fre:.2f}",
|
| 379 |
"word_count": wc,
|
| 380 |
"rewritten_text": rewritten
|
| 381 |
})
|
|
|
|
| 383 |
append_rewrite_txt(
|
| 384 |
text_id=text_id,
|
| 385 |
target_level=level,
|
| 386 |
+
original_fre=original_fre,
|
| 387 |
+
rewritten_fre=rewritten_fre,
|
| 388 |
word_count=wc,
|
| 389 |
rewritten_text=rewritten,
|
| 390 |
)
|
| 391 |
|
| 392 |
+
done += 1
|
| 393 |
+
state["done"] = done
|
| 394 |
+
state["eligible_ids"] = ids
|
| 395 |
+
|
| 396 |
+
saved_txt = os.path.join(REWRITE_DIR, f"Text_{text_id}_Level_{level}.txt")
|
| 397 |
last_header = (
|
| 398 |
f"#Text {text_id}\n"
|
| 399 |
f"Target Level: {level}\n"
|
| 400 |
+
f"Original Flesch (xlsx): {original_fre:.2f}\n"
|
| 401 |
+
f"Rewritten Flesch Reading Ease: {rewritten_fre:.2f}\n"
|
| 402 |
f"Word Count: {wc}\n"
|
| 403 |
f"Saved CSV: {CSV_PATH}\n"
|
| 404 |
+
f"Saved TXT: {saved_txt}"
|
| 405 |
)
|
| 406 |
last_text = rewritten
|
| 407 |
|
| 408 |
+
return state, "全件処理が完了しました。", last_header, f"{done} / {total}", last_text, None, None
|
|
|
|
|
|
|
|
|
|
| 409 |
|
| 410 |
|
| 411 |
def stop():
|
|
|
|
| 433 |
|
| 434 |
|
| 435 |
# =========================
|
| 436 |
+
# ダウンロード機能(ファイル名パターン更新)
|
| 437 |
# =========================
|
| 438 |
def list_generated_txt_files() -> list[str]:
    """List basenames of the ``Text_*_Level_*.txt`` files in ``REWRITE_DIR``.

    The names follow the lexicographic order of their full paths.
    """
    pattern = os.path.join(REWRITE_DIR, "Text_*_Level_*.txt")
    matches = glob.glob(pattern)
    matches.sort()
    return list(map(os.path.basename, matches))
|
| 441 |
|
| 442 |
|
| 443 |
def build_single_txt_path(selected_name: str) -> str:
|
|
|
|
|
|
|
|
|
|
| 444 |
path = os.path.join(REWRITE_DIR, selected_name)
|
| 445 |
if not os.path.exists(path):
|
| 446 |
raise FileNotFoundError(f"Not found: {path}")
|
|
|
|
| 448 |
|
| 449 |
|
| 450 |
def build_zip_of_txts(mode: str, n_last: int) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
files = sorted(glob.glob(os.path.join(REWRITE_DIR, "Text_*.txt")), key=os.path.getmtime)
|
| 452 |
if not files:
|
| 453 |
raise FileNotFoundError("No generated txt files yet.")
|
|
|
|
| 455 |
if mode == "last_n":
|
| 456 |
files = files[-max(1, int(n_last)):]
|
| 457 |
|
|
|
|
| 458 |
zip_path = os.path.join(OUT_DIR, "rewrite_passages.zip")
|
| 459 |
if os.path.exists(zip_path):
|
| 460 |
os.remove(zip_path)
|
| 461 |
|
|
|
|
| 462 |
tmp_dir = os.path.join(OUT_DIR, "_zip_tmp")
|
| 463 |
if os.path.exists(tmp_dir):
|
| 464 |
shutil.rmtree(tmp_dir)
|
|
|
|
| 468 |
shutil.copy(fp, os.path.join(tmp_dir, os.path.basename(fp)))
|
| 469 |
|
| 470 |
shutil.make_archive(os.path.join(OUT_DIR, "rewrite_passages"), "zip", tmp_dir)
|
|
|
|
|
|
|
| 471 |
shutil.rmtree(tmp_dir, ignore_errors=True)
|
| 472 |
|
| 473 |
return zip_path
|
|
|
|
| 492 |
|
| 493 |
state = gr.State(init_state())
|
| 494 |
|
| 495 |
+
# ★変更:プロンプト選択UIを削除(1本固定)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 496 |
level = gr.Dropdown(choices=[1, 2, 3, 4, 5], value=1, label="Target Level (1..5)")
|
| 497 |
status = gr.Textbox(label="Status", interactive=False)
|
| 498 |
+
header = gr.Textbox(label="Result Header (#Text / FRE / Words)", lines=8, interactive=False)
|
| 499 |
progress = gr.Textbox(label="Progress", interactive=False)
|
| 500 |
output_text = gr.Textbox(label="Rewritten Text", lines=18, interactive=False)
|
| 501 |
|
| 502 |
with gr.Row():
|
| 503 |
+
start_btn = gr.Button("開始(Excel読み込み+候補抽出)")
|
| 504 |
one_btn = gr.Button("次へ(1件処理)")
|
| 505 |
all_btn = gr.Button("全件実行(残りを処理)")
|
| 506 |
stop_btn = gr.Button("停止")
|
|
|
|
| 535 |
download_csv_btn = gr.Button("CSV(rewrite_scores.csv)をダウンロード")
|
| 536 |
download_csv_file = gr.File(label="Download (csv)")
|
| 537 |
|
|
|
|
| 538 |
gr.Markdown(f"📄 CSVパス: `{CSV_PATH}`")
|
| 539 |
gr.Markdown(f"📝 TXTフォルダ: `{REWRITE_DIR}`")
|
| 540 |
+
gr.Markdown(f"📘 Excel: `{PASSAGE_INFO_XLSX}` / passages dir: `{PASSAGES_DIR}`")
|
| 541 |
|
| 542 |
+
# ---- 既存ボタン(prompt_modeなし)----
|
|
|
|
|
|
|
|
|
|
| 543 |
start_btn.click(
|
| 544 |
+
fn=start,
|
| 545 |
+
inputs=[level],
|
| 546 |
outputs=[state, status, header, progress, output_text, download_one_file, download_zip_file]
|
| 547 |
)
|
| 548 |
one_btn.click(
|
| 549 |
+
fn=run_one,
|
| 550 |
+
inputs=[level, state],
|
| 551 |
outputs=[state, status, header, progress, output_text, download_one_file, download_zip_file]
|
| 552 |
)
|
| 553 |
all_btn.click(
|
| 554 |
+
fn=run_all,
|
| 555 |
+
inputs=[level, state],
|
| 556 |
outputs=[state, status, header, progress, output_text, download_one_file, download_zip_file]
|
| 557 |
)
|
| 558 |
|