# Prompt_Test / app.py
# (Hugging Face Space page header preserved as a comment:
#  uploader Toya0421, "Update app.py", commit f6a23cc verified)
import csv
import glob
import os
import random
import re
import shutil
import threading
from datetime import datetime, timedelta, timezone
from typing import Optional

import gradio as gr
import pandas as pd  # xlsx loading
import textstat
from openai import OpenAI
# =========================
# 設定(元コード踏襲)
# =========================
API_KEY = os.getenv("API_KEY")
BASE_URL = "https://openrouter.ai/api/v1"
MODEL = os.getenv("MODEL", "google/gemini-2.5-flash")
# ★追加:デバッグ(LLMへ送ったプロンプトをUI表示する)
SHOW_PROMPT_DEBUG = os.getenv("SHOW_PROMPT_DEBUG", "1").strip() not in ("0", "false", "False", "no", "NO")
# Free Space では /data は永続ではない可能性が高いので、
# ダウンロード前提で「カレント」に保存してOK
OUT_DIR = os.getenv("OUT_DIR", ".")
os.makedirs(OUT_DIR, exist_ok=True)
CSV_PATH = os.path.join(OUT_DIR, "rewrite_scores.csv")
_word_re = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?")
# 書き換え本文をtxtで蓄積するフォルダ
REWRITE_DIR = "rewrite_passages"
os.makedirs(REWRITE_DIR, exist_ok=True)
PASSAGES_DIR = os.getenv("PASSAGES_DIR", "passages")
# ★追加:Excel
PASSAGE_INFO_XLSX = os.getenv("PASSAGE_INFO_XLSX", "passage_information.xlsx")
if not API_KEY:
raise RuntimeError("API_KEY is not set (env: API_KEY)")
client = OpenAI(base_url=BASE_URL, api_key=API_KEY)
REWRITE_CONCURRENCY = int(os.getenv("REWRITE_CONCURRENCY", "2"))
_rewrite_sem = threading.Semaphore(REWRITE_CONCURRENCY)
_stop_flag_lock = threading.Lock()
_stop_flag = False
# =========================
# ユーティリティ
# =========================
def load_text(path: str) -> str:
    """Read a UTF-8 text file and return its full contents."""
    with open(path, "r", encoding="utf-8") as fh:
        return fh.read()
def target_flesch_from_level(level: int) -> int:
    """Map a difficulty level (1=easiest .. 5=hardest) to its target FRE.

    Raises KeyError for any level outside 1..5.
    """
    return {1: 90, 2: 80, 3: 70, 4: 60, 5: 50}[int(level)]
def passage_path_from_text_id(text_id: int, passages_dir: str) -> str:
    """Build the passage file path <passages_dir>/pg<ID>.txt (original naming)."""
    filename = f"pg{text_id}.txt"
    return os.path.join(passages_dir, filename)
# =========================
# Excel読み込み(Text# と flesch_score)
# =========================
def load_passage_info(xlsx_path: str) -> pd.DataFrame:
    """Load passage metadata from the Excel sheet.

    The sheet must contain a 'Text#' column (passage id) and a 'flesch_score'
    column (original Flesch Reading Ease). Column names are matched with
    surrounding whitespace stripped AND case-insensitively — the original
    comment promised case tolerance, but the code compared exact case.
    Non-string headers are also tolerated (str() before strip()).
    Rows with non-numeric ids or scores are dropped.

    Returns:
        DataFrame with int 'Text#' and float 'flesch_score' columns.

    Raises:
        FileNotFoundError: if xlsx_path does not exist.
        ValueError: if either required column is missing.
    """
    if not os.path.exists(xlsx_path):
        raise FileNotFoundError(f"Not found: {xlsx_path}")
    df = pd.read_excel(xlsx_path)
    # Map normalized (stripped, lower-cased) header -> actual column name.
    norm = {str(c).strip().lower(): c for c in df.columns}
    if "text#" not in norm or "flesch_score" not in norm:
        raise ValueError(
            "passage_information.xlsx must contain columns: 'Text#' and 'flesch_score'"
        )
    df = df.rename(columns={norm["text#"]: "Text#", norm["flesch_score"]: "flesch_score"})
    # Coerce to numeric; invalid cells become NaN and the row is dropped.
    df["Text#"] = pd.to_numeric(df["Text#"], errors="coerce").astype("Int64")
    df["flesch_score"] = pd.to_numeric(df["flesch_score"], errors="coerce")
    df = df.dropna(subset=["Text#", "flesch_score"]).copy()
    df["Text#"] = df["Text#"].astype(int)
    df["flesch_score"] = df["flesch_score"].astype(float)
    return df
# =========================
# プロンプト(★1本固定:置き換え)
# =========================
def build_prompt(text: str, target_level: int, original_fre: Optional[float] = None) -> str:
    """Build the single fixed rewrite prompt for *text* at *target_level*.

    The original FRE (from the Excel sheet) is embedded into the prompt;
    when absent or non-numeric it renders as NaN without altering the task.
    """
    # Level -> target FRE (1: 90 ... 5: 50); KeyError outside 1..5.
    target_flesch = {1: 90, 2: 80, 3: 70, 4: 60, 5: 50}[int(target_level)]
    if original_fre is None:
        original_fre_val = float("nan")
    else:
        try:
            original_fre_val = float(original_fre)
        except Exception:
            original_fre_val = float("nan")
    return f"""
Numerous Readability Manipulation:
The Flesch Reading Ease score is a numeric measure of text readability,
where higher scores indicate easier readability and lower scores indicate more difficult text.
In this task, we are trying to rewrite a given text into the target Flesch Reading Ease score.
Given the original draft (Flesch Reading Ease = {original_fre_val}):
[TEXT START]
{text}
[TEXT END]
Rewrite the above text to the difficulty level of:
Flesch Reading Ease = {target_flesch}
Follow the instructions below carefully.
- Reduce average sentence length while avoiding uniform or fixed-length sentences.
- Introduce natural variation in sentence length to maintain rhythm and avoid mechanical patterns.
- Prefer familiar, high-frequency vocabulary.
Qualitative Readability Control
・Content preservation
- Maintain the original meaning faithfully.
- Do not add new information.
- Do not remove important information.
- Do not introduce interpretations or opinions that are not present in the original text.
- Avoid jargon; if technical terms are necessary, explain them clearly in simple language.
・Clarity & Naturalness Refinement
- Rewrite the text in clear, modern English.
- Remove archaic expressions and unnatural or outdated syntax typical of older texts at all levels.
- Maintain a natural flow of reading rather than a sequence of evenly shortened sentences.
- Minimize figurative language, idioms, and expressions whose meanings are not directly inferable.
- Use simple and direct sentence structures.
Formatting Constraints
・Scope of rewriting
- Rewrite ONLY the main body text.
- Completely EXCLUDE titles, headings, chapter labels, author names, source information, footnotes, annotations, and introductions.
- Do NOT include any text other than the rewritten main body under any circumstances.
・Structure and formatting
- Preserve the original paragraph structure of the main text.
- Insert exactly ONE blank line between paragraphs.
- Do NOT create new section breaks, chapter divisions, or headings.
- Output only the rewritten text.
- Do not include explanations, comments, or metadata.
- Do not include [TEXT START] and [TEXT END] in the output.
""".strip()
# =========================
# 書き換え
# =========================
def rewrite_level(text: str, target_level: int, original_fre: Optional[float] = None):
    """Ask the LLM to rewrite *text* toward *target_level*.

    Returns a (rewritten_text, prompt) tuple so the UI can display the exact
    prompt that was sent (debug feature). Payment-related failures (message
    contains "402" or "more credits") are retried with progressively smaller
    max_tokens; any other exception is re-raised immediately.
    """
    prompt = build_prompt(text=text, target_level=target_level, original_fre=original_fre)
    last_error = None
    for max_tok in (3000, 2000, 1500, 1000):
        try:
            # The semaphore caps concurrent API calls (REWRITE_CONCURRENCY).
            with _rewrite_sem:
                resp = client.chat.completions.create(
                    model=MODEL,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.4,
                    max_tokens=max_tok,
                )
            return resp.choices[0].message.content.strip(), prompt
        except Exception as exc:
            last_error = exc
            message = str(exc)
            if "402" not in message and "more credits" not in message:
                raise
    raise RuntimeError(f"Rewrite failed after retries: {last_error}")
# =========================
# 指標(FRE + 単語数)
# =========================
def count_words_english(text: str) -> int:
    """Count English words: letter runs with one optional internal apostrophe."""
    matches = re.findall(r"[A-Za-z]+(?:'[A-Za-z]+)?", text)
    return len(matches)
def compute_metrics(text: str) -> tuple[float, int]:
    """Return (Flesch Reading Ease, English word count) for *text*."""
    readability = float(textstat.flesch_reading_ease(text))
    words = count_words_english(text)
    return readability, words
# =========================
# CSV追記(元コード踏襲:列は増やさない)
# =========================
_csv_lock = threading.Lock()


def append_csv_row(row: dict):
    """Append one result row to CSV_PATH, writing the header on first creation.

    The column set is fixed (kept identical to the original script); extra
    keys in *row* are ignored and missing ones become empty strings.
    """
    columns = ["timestamp_jst", "Text#", "target_level", "flesch_reading_ease", "word_count", "rewritten_text"]
    with _csv_lock:
        write_header = not os.path.exists(CSV_PATH)
        with open(CSV_PATH, "a", encoding="utf-8", newline="") as fh:
            writer = csv.DictWriter(fh, fieldnames=columns)
            if write_header:
                writer.writeheader()
            writer.writerow({name: row.get(name, "") for name in columns})
# =========================
# txt追記(★ファイル名を Text# と Target level が分かる形に)
# =========================
_txt_lock = threading.Lock()


def append_rewrite_txt(
    text_id: int,
    target_level: int,
    original_fre: float,   # original passage score from passage_information.xlsx
    rewritten_fre: float,
    word_count: int,
    rewritten_text: str,
):
    """Append one rewritten passage (with a metadata header) to its txt file.

    The file name encodes the passage id and target level
    (Text_<id>_Level_<level>.txt); repeated runs for the same pair append
    further blocks separated by a rule of '=' characters.
    """
    # JST = UTC+9. Use an aware UTC base — datetime.utcnow() is deprecated
    # since Python 3.12; the rendered string is unchanged.
    ts = (datetime.now(timezone.utc) + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")
    path = os.path.join(REWRITE_DIR, f"Text_{text_id}_Level_{target_level}.txt")
    block = (
        f"# Text {text_id}\n"
        f"Target Level: {target_level}\n"
        f"Original Flesch (from passage_information.xlsx): {original_fre:.2f}\n"
        f"Rewritten Flesch Reading Ease: {rewritten_fre:.2f}\n"
        f"Word Count: {word_count}\n"
        f"Timestamp (JST): {ts}\n"
        f"Model: {MODEL}\n\n"
        f"---- Rewritten Text ----\n"
        f"{rewritten_text}\n\n"
        f"{'=' * 80}\n"
    )
    with _txt_lock:
        with open(path, "a", encoding="utf-8") as f:
            f.write(block)
# =========================
# 停止フラグ
# =========================
def set_stop(flag: bool):
    """Set the global stop flag under its lock; True asks run_all to stop."""
    global _stop_flag
    with _stop_flag_lock:
        _stop_flag = flag
def get_stop() -> bool:
    """Read the global stop flag under its lock."""
    with _stop_flag_lock:
        return _stop_flag
# =========================
# UIロジック
# =========================
def init_state():
    """Build the initial UI state: passage metadata plus run bookkeeping."""
    # Load Text# / flesch_score pairs from the Excel sheet up front.
    info = load_passage_info(PASSAGE_INFO_XLSX)
    state = {
        "df": info,
        "eligible_ids": [],    # filled by start() according to the chosen level
        "orig_score_map": {},  # text_id -> original FRE from the sheet
        "done": 0,
        "total": 0,
    }
    return state
def start(level: int):
    """Select eligible passages for *level*, shuffle them, and reset progress.

    A passage is eligible when its original flesch_score is strictly below
    the target FRE (i.e. it is harder than the target) AND its source file
    passages/pg<Text#>.txt exists on disk.

    Returns the 8-tuple bound to the Gradio outputs: state, status message,
    header, progress, rewritten text, prompt debug, and two File placeholders.
    """
    set_stop(False)
    st = init_state()
    target_flesch = target_flesch_from_level(level)
    df = st["df"]
    # Harder-than-target passages only (lower FRE = harder text).
    eligible = df[df["flesch_score"] < float(target_flesch)].copy()
    ids = []
    score_map = {}
    for _, row in eligible.iterrows():
        tid = int(row["Text#"])
        if os.path.exists(passage_path_from_text_id(tid, PASSAGES_DIR)):
            ids.append(tid)
            score_map[tid] = float(row["flesch_score"])
    if not ids:
        msg = (
            f"条件に合う教材がありません。\n"
            f"- Target FRE: {target_flesch}\n"
            f"- 条件: flesch_score < {target_flesch}\n"
            f"- かつ {PASSAGES_DIR}/pg{{Text#}}.txt が存在"
        )
        # Keep the 8-slot output shape (including the prompt-debug slot).
        return st, msg, "", "", "", "", None, None
    random.shuffle(ids)
    st.update(eligible_ids=ids, orig_score_map=score_map, done=0, total=len(ids))
    msg = f"準備完了: 条件に合う {st['total']} 件からランダムに処理します(Target FRE={target_flesch})。"
    return st, msg, "", f"0 / {st['total']}", "", "", None, None
def run_one(level: int, state: dict):
    """Process one pending passage and persist its CSV row and txt block.

    Pops the head of the pending list (already shuffled by start()), rewrites
    it via the LLM, records metrics, and returns the 8-tuple bound to the
    Gradio outputs: state, status, header, progress, rewritten text,
    prompt debug text, and two None placeholders for the File components.
    """
    set_stop(False)
    ids = state.get("eligible_ids", [])
    done = int(state.get("done", 0))
    total = int(state.get("total", 0))
    if not ids:
        return state, "全て処理済みです。", "", f"{done} / {total}", "", "", None, None
    # Head of the pre-shuffled list = random without repetition.
    text_id = ids.pop(0)
    path = passage_path_from_text_id(text_id, PASSAGES_DIR)
    original = load_text(path)
    original_fre = float(state.get("orig_score_map", {}).get(text_id, float("nan")))
    # The original FRE is embedded into the prompt for context.
    rewritten, used_prompt = rewrite_level(original, target_level=level, original_fre=original_fre)
    rewritten_fre, wc = compute_metrics(rewritten)
    # JST = UTC+9; aware UTC base (datetime.utcnow() is deprecated since 3.12).
    ts = (datetime.now(timezone.utc) + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")
    append_csv_row({
        "timestamp_jst": ts,
        "Text#": text_id,
        "target_level": level,
        "flesch_reading_ease": f"{rewritten_fre:.2f}",
        "word_count": wc,
        "rewritten_text": rewritten
    })
    append_rewrite_txt(
        text_id=text_id,
        target_level=level,
        original_fre=original_fre,
        rewritten_fre=rewritten_fre,
        word_count=wc,
        rewritten_text=rewritten,
    )
    done += 1
    state["done"] = done
    state["eligible_ids"] = ids
    saved_txt = os.path.join(REWRITE_DIR, f"Text_{text_id}_Level_{level}.txt")
    header = (
        f"#Text {text_id}\n"
        f"Target Level: {level}\n"
        f"Original Flesch (xlsx): {original_fre:.2f}\n"
        f"Rewritten Flesch Reading Ease: {rewritten_fre:.2f}\n"
        f"Word Count: {wc}\n"
        f"Saved CSV: {CSV_PATH}\n"
        f"Saved TXT: {saved_txt}"
    )
    progress = f"{done} / {total}"
    prompt_debug_text = used_prompt if SHOW_PROMPT_DEBUG else ""
    return state, "1件処理しました。", header, progress, rewritten, prompt_debug_text, None, None
def run_all(level: int, state: dict):
    """Process every remaining passage, honoring the stop flag between items.

    Same 8-tuple contract as run_one; the header/text/prompt of the LAST
    processed item is what ends up displayed. The stop flag is only checked
    between items, never mid-request.
    """
    set_stop(False)
    ids = state.get("eligible_ids", [])
    done = int(state.get("done", 0))
    total = int(state.get("total", 0))
    if not ids:
        return state, "全て処理済みです。", "", f"{done} / {total}", "", "", None, None
    last_header = ""
    last_text = ""
    last_prompt = ""
    while ids:
        if get_stop():
            state["eligible_ids"] = ids
            state["done"] = done
            return state, "停止しました。", last_header, f"{done} / {total}", last_text, (last_prompt if SHOW_PROMPT_DEBUG else ""), None, None
        text_id = ids.pop(0)
        path = passage_path_from_text_id(text_id, PASSAGES_DIR)
        original = load_text(path)
        original_fre = float(state.get("orig_score_map", {}).get(text_id, float("nan")))
        rewritten, used_prompt = rewrite_level(original, target_level=level, original_fre=original_fre)
        rewritten_fre, wc = compute_metrics(rewritten)
        # JST = UTC+9; aware UTC base (datetime.utcnow() is deprecated since 3.12).
        ts = (datetime.now(timezone.utc) + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")
        append_csv_row({
            "timestamp_jst": ts,
            "Text#": text_id,
            "target_level": level,
            "flesch_reading_ease": f"{rewritten_fre:.2f}",
            "word_count": wc,
            "rewritten_text": rewritten
        })
        append_rewrite_txt(
            text_id=text_id,
            target_level=level,
            original_fre=original_fre,
            rewritten_fre=rewritten_fre,
            word_count=wc,
            rewritten_text=rewritten,
        )
        done += 1
        state["done"] = done
        state["eligible_ids"] = ids
        saved_txt = os.path.join(REWRITE_DIR, f"Text_{text_id}_Level_{level}.txt")
        last_header = (
            f"#Text {text_id}\n"
            f"Target Level: {level}\n"
            f"Original Flesch (xlsx): {original_fre:.2f}\n"
            f"Rewritten Flesch Reading Ease: {rewritten_fre:.2f}\n"
            f"Word Count: {wc}\n"
            f"Saved CSV: {CSV_PATH}\n"
            f"Saved TXT: {saved_txt}"
        )
        last_text = rewritten
        last_prompt = used_prompt
    return state, "全件処理が完了しました。", last_header, f"{done} / {total}", last_text, (last_prompt if SHOW_PROMPT_DEBUG else ""), None, None
def stop():
    """Request a stop; run_all stops after the in-flight item finishes."""
    set_stop(True)
    return "停止要求を受け付けました(処理中の1件が終わったタイミングで止まります)。"
def reset_csv():
    """Delete the accumulated score CSV (if present) and report the path."""
    with _csv_lock:
        # EAFP: a missing file is the same outcome as a successful delete.
        try:
            os.remove(CSV_PATH)
        except FileNotFoundError:
            pass
    return f"CSVを削除しました: {CSV_PATH}"
def reset_rewrite_folder():
    """Best-effort delete of all generated Text_*.txt files; returns a summary.

    Per-file deletion failures are ignored so one locked or already-removed
    file does not abort the sweep — but only OSError is swallowed now, so
    programming errors are no longer hidden (the original caught Exception).
    """
    removed = 0
    with _txt_lock:
        for fp in glob.glob(os.path.join(REWRITE_DIR, "Text_*.txt")):
            try:
                os.remove(fp)
                removed += 1
            except OSError:
                # Skip files we cannot delete; keep sweeping the rest.
                pass
    return f"rewrite_passages の Text_*.txt を削除しました({removed}件): {REWRITE_DIR}"
# =========================
# ダウンロード機能(ファイル名パターン更新)
# =========================
def list_generated_txt_files() -> list[str]:
    """Return base names of all generated txt files, sorted by name."""
    pattern = os.path.join(REWRITE_DIR, "Text_*_Level_*.txt")
    return [os.path.basename(p) for p in sorted(glob.glob(pattern))]
def build_single_txt_path(selected_name: str) -> str:
    """Resolve a dropdown selection to its file path inside REWRITE_DIR.

    Only the base name of the selection is used, so a crafted value such as
    '../secret' cannot escape the folder — the dropdown value reaches the
    filesystem directly, and Gradio endpoints are callable via API. Normal
    dropdown values are plain base names, so behavior is unchanged for them.

    Raises:
        FileNotFoundError: if no such file exists in REWRITE_DIR.
    """
    path = os.path.join(REWRITE_DIR, os.path.basename(selected_name))
    if not os.path.exists(path):
        raise FileNotFoundError(f"Not found: {path}")
    return path
def build_zip_of_txts(mode: str, n_last: int) -> str:
    """Zip generated txt files and return the archive path.

    mode "all" zips every Text_*.txt; "last_n" zips only the n_last most
    recently modified files (minimum 1). Files are staged into a temp
    directory because shutil.make_archive archives a whole directory tree.
    The staging dir is now removed even when copying/archiving fails
    (the original leaked it on error).

    Raises:
        FileNotFoundError: when nothing has been generated yet.
    """
    files = sorted(glob.glob(os.path.join(REWRITE_DIR, "Text_*.txt")), key=os.path.getmtime)
    if not files:
        raise FileNotFoundError("No generated txt files yet.")
    if mode == "last_n":
        files = files[-max(1, int(n_last)):]
    zip_path = os.path.join(OUT_DIR, "rewrite_passages.zip")
    if os.path.exists(zip_path):
        os.remove(zip_path)
    tmp_dir = os.path.join(OUT_DIR, "_zip_tmp")
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir, exist_ok=True)
    try:
        for fp in files:
            shutil.copy(fp, os.path.join(tmp_dir, os.path.basename(fp)))
        shutil.make_archive(os.path.join(OUT_DIR, "rewrite_passages"), "zip", tmp_dir)
    finally:
        # Always clean up the staging dir, even if copy/archive raised.
        shutil.rmtree(tmp_dir, ignore_errors=True)
    return zip_path
def download_csv() -> str:
    """Return the score CSV path for download; fail if nothing was written yet."""
    if os.path.exists(CSV_PATH):
        return CSV_PATH
    raise FileNotFoundError("rewrite_scores.csv is not created yet.")
def refresh_txt_dropdown() -> gr.Dropdown:
    """Rescan the txt folder and preselect the most recent file name."""
    names = list_generated_txt_files()
    latest = names[-1] if names else None
    return gr.Dropdown(choices=names, value=latest)
# =========================
# Gradio UI(Spaces向け)
# =========================
# Gradio UI layout and event wiring (Spaces-oriented).
with gr.Blocks() as demo:
    gr.Markdown("# 🔁 Passage Rewrite + FRE/Word Scoring (HF Spaces)")
    # Per-session state, seeded at import time from the Excel sheet.
    state = gr.State(init_state())
    level = gr.Dropdown(choices=[1, 2, 3, 4, 5], value=1, label="Target Level (1..5)")
    status = gr.Textbox(label="Status", interactive=False)
    header = gr.Textbox(label="Result Header (#Text / FRE / Words)", lines=8, interactive=False)
    progress = gr.Textbox(label="Progress", interactive=False)
    output_text = gr.Textbox(label="Rewritten Text", lines=18, interactive=False)
    # Debug view of the exact prompt sent to the LLM (toggled by SHOW_PROMPT_DEBUG).
    prompt_debug = gr.Textbox(
        label="🔍 Prompt sent to LLM (Debug)",
        lines=20,
        interactive=False,
        visible=SHOW_PROMPT_DEBUG
    )
    with gr.Row():
        start_btn = gr.Button("開始(Excel読み込み+候補抽出)")
        one_btn = gr.Button("次へ(1件処理)")
        all_btn = gr.Button("全件実行(残りを処理)")
        stop_btn = gr.Button("停止")
    with gr.Row():
        reset_btn = gr.Button("CSVリセット(削除)")
        reset_txt_btn = gr.Button("rewrite_passagesリセット(Text_*.txt削除)")
    gr.Markdown("## 📥 Download")
    gr.Markdown("Free Space では Files に生成物が見えないことがあるので、ここからダウンロードしてください。")
    with gr.Row():
        refresh_btn = gr.Button("txt一覧を更新")
        txt_dropdown = gr.Dropdown(
            choices=list_generated_txt_files(),
            label="生成済み txt を選択(1個ダウンロード)"
        )
        download_one_btn = gr.Button("選択したtxtをダウンロード")
        download_one_file = gr.File(label="Download (single txt)")
    with gr.Row():
        zip_mode = gr.Radio(
            choices=[("全txtをまとめてZIP", "all"), ("最新N個をZIP", "last_n")],
            value="all",
            label="ZIPの作り方"
        )
        last_n = gr.Number(value=5, precision=0, label="N(最新N個の場合)")
        download_zip_btn = gr.Button("ZIPを作ってダウンロード")
        download_zip_file = gr.File(label="Download (zip)")
    with gr.Row():
        download_csv_btn = gr.Button("CSV(rewrite_scores.csv)をダウンロード")
        download_csv_file = gr.File(label="Download (csv)")
    gr.Markdown(f"📄 CSVパス: `{CSV_PATH}`")
    gr.Markdown(f"📝 TXTフォルダ: `{REWRITE_DIR}`")
    gr.Markdown(f"📘 Excel: `{PASSAGE_INFO_XLSX}` / passages dir: `{PASSAGES_DIR}`")
    # The three run actions share the same 8-slot output signature.
    start_btn.click(
        fn=start,
        inputs=[level],
        outputs=[state, status, header, progress, output_text, prompt_debug, download_one_file, download_zip_file]
    )
    one_btn.click(
        fn=run_one,
        inputs=[level, state],
        outputs=[state, status, header, progress, output_text, prompt_debug, download_one_file, download_zip_file]
    )
    all_btn.click(
        fn=run_all,
        inputs=[level, state],
        outputs=[state, status, header, progress, output_text, prompt_debug, download_one_file, download_zip_file]
    )
    stop_btn.click(fn=stop, inputs=[], outputs=[status])
    reset_btn.click(fn=reset_csv, inputs=[], outputs=[status])
    reset_txt_btn.click(fn=reset_rewrite_folder, inputs=[], outputs=[status])
    # ---- Download UI ----
    refresh_btn.click(fn=refresh_txt_dropdown, inputs=[], outputs=[txt_dropdown])
    download_one_btn.click(fn=build_single_txt_path, inputs=[txt_dropdown], outputs=[download_one_file])
    download_zip_btn.click(fn=build_zip_of_txts, inputs=[zip_mode, last_n], outputs=[download_zip_file])
    download_csv_btn.click(fn=download_csv, inputs=[], outputs=[download_csv_file])
if __name__ == "__main__":
    # Queue limits backlogged requests per Space instance.
    demo.queue(max_size=64)
    demo.launch()