# mk_pptx / app.py
# syurein
# add
# d4ea988
import gradio as gr
import cv2
import numpy as np
import easyocr
from PIL import Image, ImageDraw
import os
import tempfile
import torch
from torch.hub import download_url_to_file
# Guarded import of python-pptx: when the package is missing, print an install
# hint instead of failing at import time. The pptx names stay undefined in that
# case, so PPTX generation cannot work until the package is installed.
try:
    from pptx import Presentation
    from pptx.util import Pt
    from pptx.dml.color import RGBColor
except ImportError:
    print("⚠️ python-pptx is not installed. Please run: pip install python-pptx")
# ==============================================================================
# 1. CPU/GPU両対応 LaMa (Inpaintingモデル) クラス
# ==============================================================================
class SafeLama:
    """LaMa inpainting model wrapper that runs on either CPU or GPU.

    Loads the TorchScript ``big-lama.pt`` checkpoint (downloading it into
    ``~/.cache`` on first use) and exposes it as a callable that takes a PIL
    image plus a PIL mask and returns the inpainted PIL image.
    """

    # Height and width are padded up to a multiple of this before inference.
    # NOTE(review): 8 mirrors the padding modulo used by the lama-cleaner
    # reference pipeline for this checkpoint -- confirm if the model changes.
    PAD_MODULO = 8

    def __init__(self):
        """Select the device, fetch the checkpoint if needed, and load it.

        Raises:
            Exception: whatever ``torch.jit.load`` raises when the checkpoint
                cannot be loaded (re-raised after logging).
        """
        # Use the GPU when available, otherwise fall back to the CPU.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"LaMa is running on: {self.device}")

        # Download the model file once and reuse the cached copy afterwards.
        self.model_url = "https://github.com/sanster/models/releases/download/add_big_lama/big-lama.pt"
        self.model_path = os.path.join(os.path.expanduser("~"), ".cache", "big-lama.pt")
        if not os.path.exists(self.model_path):
            print(f"Downloading LaMa model to {self.model_path}...")
            os.makedirs(os.path.dirname(self.model_path), exist_ok=True)
            download_url_to_file(self.model_url, self.model_path)

        # map_location places the weights on the selected device.
        try:
            self.model = torch.jit.load(self.model_path, map_location=self.device)
            self.model.eval()
            self.model.to(self.device)
            print("LaMa model loaded successfully.")
        except Exception as e:
            print(f"Fatal Error loading LaMa model: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    def __call__(self, image: Image.Image, mask: Image.Image) -> Image.Image:
        """Inpaint the masked region of ``image``.

        Args:
            image: PIL image; converted to RGB before processing.
            mask: PIL image (L or RGB), 0 = keep, 255 = remove.

        Returns:
            The inpainted RGB PIL image, with the same size as the input.
        """
        # Pre-processing: force 3 channels, convert to float arrays in [0, 1].
        img_np = np.array(image.convert("RGB")).astype(np.float32) / 255.0
        mask_np = np.array(mask.convert("L")).astype(np.float32) / 255.0
        height, width = img_np.shape[:2]

        # Pad bottom/right so both dimensions are multiples of PAD_MODULO;
        # the padding is cropped away again after inference.
        pad_h = (-height) % self.PAD_MODULO
        pad_w = (-width) % self.PAD_MODULO
        if pad_h or pad_w:
            img_np = np.pad(img_np, ((0, pad_h), (0, pad_w), (0, 0)), mode="symmetric")
            mask_np = np.pad(mask_np, ((0, pad_h), (0, pad_w)), mode="symmetric")

        # To tensors: image -> (1, 3, H, W), mask -> (1, 1, H, W).
        img_t = torch.from_numpy(img_np).permute(2, 0, 1).unsqueeze(0).to(self.device)
        mask_t = torch.from_numpy(mask_np).unsqueeze(0).unsqueeze(0).to(self.device)
        mask_t = (mask_t > 0.5).float()  # binarize

        # Inference
        with torch.no_grad():
            output = self.model(img_t, mask_t)

        # Post-processing: back to an HWC uint8 image at the original size.
        cur_res = output[0].permute(1, 2, 0).detach().cpu().numpy()
        cur_res = cur_res[:height, :width, :]
        cur_res = np.clip(cur_res * 255, 0, 255).astype(np.uint8)
        return Image.fromarray(cur_res)
# ==============================================================================
# 2. コアロジック (OCR, 描画, PPTX生成)
# ==============================================================================
class SlideCleanerCore:
    """Core logic: OCR detection, preview drawing, masking, inpainting, PPTX slides."""

    def __init__(self):
        """Create the EasyOCR reader and try to load the LaMa model."""
        print("Initializing EasyOCR...")
        use_gpu = torch.cuda.is_available()
        self.reader = easyocr.Reader(['ja', 'en'], gpu=use_gpu)
        # LaMa is optional: inpaint_image() uses OpenCV when this stays None.
        self.lama_model = None
        try:
            print("Loading LaMa wrapper...")
            self.lama_model = SafeLama()
        except Exception as e:
            print(f"LaMa load failed: {e}")

    def detect_text_initial(self, image_np):
        """Run OCR and return the list of editable box states.

        Args:
            image_np: image as a numpy array (height and width are read from
                ``image_np.shape[:2]``).

        Returns:
            A list of dicts ``{'bbox': [x1, y1, x2, y2], 'text': str,
            'active': True}``. Coordinates are plain Python ints clamped to
            the image bounds; every box starts as "to be erased".
        """
        print("Running OCR detection...")
        results = self.reader.readtext(image_np)
        img_h, img_w = (int(v) for v in image_np.shape[:2])

        box_states = []
        for (bbox, text, _prob) in results:
            (tl, tr, br, bl) = bbox
            # Axis-aligned bounding box of the quadrilateral, as integers,
            # clamped so that no box extends outside the image.
            x1 = max(0, int(min(tl[0], bl[0])))
            y1 = max(0, int(min(tl[1], tr[1])))
            x2 = min(img_w, int(max(tr[0], br[0])))
            y2 = min(img_h, int(max(bl[1], br[1])))
            box_states.append({
                'bbox': [x1, y1, x2, y2],
                'text': text,
                'active': True  # initial state: marked for removal
            })
        return box_states

    def draw_preview(self, image_np, box_states, temp_point=None, highlight_idx=None):
        """Render the preview image.

        - Active boxes: red frame (will be erased).
        - Highlighted box (waiting to be merged): cyan frame.
        - Start point of a manual box in progress: yellow dot.

        Returns:
            An RGB PIL image with the overlays composited on top.
        """
        pil_img = Image.fromarray(image_np).convert("RGBA")
        overlay = Image.new("RGBA", pil_img.size, (255, 255, 255, 0))
        draw = ImageDraw.Draw(overlay)

        for i, item in enumerate(box_states):
            if not item['active']:
                continue
            x1, y1, x2, y2 = item['bbox']
            if highlight_idx is not None and i == highlight_idx:
                # Box selected as the first half of a merge.
                fill_color = (0, 255, 255, 150)  # translucent cyan
                outline_color = "cyan"
            else:
                fill_color = (255, 0, 0, 100)  # translucent red
                outline_color = "red"
            draw.rectangle([x1, y1, x2, y2], fill=fill_color, outline=outline_color, width=2)

        # Start-point marker for manual-add mode.
        if temp_point:
            tx, ty = temp_point
            r = 5
            draw.ellipse((tx-r, ty-r, tx+r, ty+r), fill="yellow", outline="black")

        return Image.alpha_composite(pil_img, overlay).convert("RGB")

    def create_mask_from_states(self, image_shape, box_states, dilation=10):
        """Build the inpainting mask (uint8, 255 = erase) from the active boxes.

        Args:
            image_shape: shape tuple whose first two entries are (height, width).
            box_states: list of box dicts as produced by detect_text_initial().
            dilation: side length of the square dilation kernel in pixels;
                rounded to an int because UI sliders may deliver floats.
        """
        h, w = image_shape[:2]
        mask = np.zeros((h, w), dtype=np.uint8)
        for item in box_states:
            if item['active']:
                x1, y1, x2, y2 = [int(v) for v in item['bbox']]
                cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1)

        # np.ones() requires integer dimensions.
        dilation = int(round(dilation))
        if dilation > 0:
            kernel = np.ones((dilation, dilation), np.uint8)
            mask = cv2.dilate(mask, kernel, iterations=1)
        return mask

    def inpaint_image(self, image_np, mask_np, method):
        """Erase the masked regions and return the cleaned image array.

        Uses LaMa when ``method == "LaMa"`` and the model is loaded; otherwise
        (or when LaMa fails at runtime) uses OpenCV's Telea inpainting.
        """
        if not np.any(mask_np):
            # Nothing to erase: skip the expensive inpainting pass.
            return image_np.copy()

        if method == "LaMa" and self.lama_model is not None:
            try:
                pil_img = Image.fromarray(image_np)
                pil_mask = Image.fromarray(mask_np)
                return np.array(self.lama_model(pil_img, pil_mask))
            except Exception as e:
                # Best effort: one failing inference should not abort the batch.
                print(f"LaMa inpainting failed, falling back to OpenCV: {e}")

        # OpenCV fallback
        return cv2.inpaint(image_np, mask_np, 3, cv2.INPAINT_TELEA)

    def _add_text_box(self, slide, item, img_w, img_h, slide_width_emu, slide_height_emu):
        """Place one text box on ``slide`` at the position given by ``item['bbox']``."""
        # Force numpy scalars to plain Python ints.
        x1, y1, x2, y2 = [int(v) for v in item['bbox']]
        # Skip degenerate boxes.
        if x2 <= x1 or y2 <= y1:
            return

        # Position relative to the image (0.0 - 1.0).
        rel_x = x1 / img_w
        rel_y = y1 / img_h
        rel_w = (x2 - x1) / img_w
        rel_h = (y2 - y1) / img_h

        # Convert to slide coordinates (EMU).
        left = int(slide_width_emu * rel_x)
        top = int(slide_height_emu * rel_y)
        width = int(slide_width_emu * rel_w)
        height = int(slide_height_emu * rel_h)

        textbox = slide.shapes.add_textbox(left, top, width, height)
        tf = textbox.text_frame
        tf.word_wrap = True
        p = tf.paragraphs[0]
        p.text = str(item['text'])

        # Font size: about 70% of the box height, at least 8 pt
        # (1 pt = 12700 EMU).
        font_size_emu = slide_height_emu * rel_h * 0.7
        font_size = Pt(max(8, font_size_emu / 12700))

        # Set the properties as paragraph defaults and also explicitly on
        # every run, so the existing text carries its own formatting.
        for font in [p.font] + [run.font for run in p.runs]:
            font.size = font_size
            font.color.rgb = RGBColor(0, 0, 0)  # text color fixed to black
            font.name = "Meiryo"  # font with Japanese glyphs

    def add_slide_to_prs(self, prs, original_img_np, clean_img_np, box_states):
        """Append one slide: cleaned image as background plus one text box per active box.

        Errors are logged and swallowed (a failing text box is skipped, a
        failing slide is reported) so that one bad page does not abort the
        whole presentation. The temporary background file is always removed.
        """
        tmp_bg_path = None
        try:
            slide = prs.slides.add_slide(prs.slide_layouts[6])  # blank layout

            img_h, img_w = (int(v) for v in original_img_np.shape[:2])

            # Slide size (EMU)
            slide_width_emu = prs.slide_width
            slide_height_emu = prs.slide_height

            # Background image: reserve a temp path, close the handle, then
            # write the PNG by name.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_bg:
                tmp_bg_path = tmp_bg.name
            Image.fromarray(clean_img_np).save(tmp_bg_path)
            slide.shapes.add_picture(tmp_bg_path, 0, 0, width=slide_width_emu, height=slide_height_emu)

            for item in box_states:
                if not item['active']:
                    continue
                try:
                    self._add_text_box(slide, item, img_w, img_h, slide_width_emu, slide_height_emu)
                except Exception as e_box:
                    print(f"Skipping text box due to error: {e_box}")
        except Exception as e_slide:
            print(f"Error adding slide: {e_slide}")
        finally:
            # Clean up the temp file even when slide creation failed midway.
            if tmp_bg_path is not None and os.path.exists(tmp_bg_path):
                os.remove(tmp_bg_path)
# Create the shared core instance used by the UI handlers below.
# NOTE: this runs at import time, so SlideCleanerCore.__init__ executes as
# soon as the module is loaded.
core = SlideCleanerCore()
# ==============================================================================
# 3. UIロジック (イベントハンドラ)
# ==============================================================================
def on_files_upload(files):
    """Initialize the batch when files are uploaded.

    Args:
        files: list of uploaded files; each entry may be a plain path
            (``str`` / ``os.PathLike``) or an object exposing the path as
            ``.name``.

    Returns:
        ``(preview, batch_data, index, boxes, message)``. On empty input or
        when the first image cannot be processed, the first four values are
        ``None, [], 0, []`` and the message explains why.
    """
    if not files:
        return None, [], 0, [], "No files selected."

    # Accept both path strings and file wrappers that carry the path in .name.
    file_paths = [
        os.fspath(f) if isinstance(f, (str, os.PathLike)) else f.name
        for f in files
    ]

    # To save memory, keep only the path and box metadata per image rather
    # than the pixel data.
    batch_data = [{'path': p, 'boxes': None} for p in file_paths]

    # Load the first image for the initial display.
    try:
        with Image.open(file_paths[0]) as src:
            first_np = np.array(src.convert("RGB"))
        # Run OCR
        first_boxes = core.detect_text_initial(first_np)
        batch_data[0]['boxes'] = first_boxes
        preview = core.draw_preview(first_np, first_boxes)
        msg = f"Loaded {len(files)} images. Showing 1/{len(files)}."
    except Exception as e:
        return None, [], 0, [], f"Error loading image: {e}"

    return preview, batch_data, 0, first_boxes, msg
def load_image_at_index(batch_data, index):
    """Load the image at ``index``, running OCR if no boxes are cached yet.

    Args:
        batch_data: list of ``{'path': ..., 'boxes': ...}`` entries.
        index: position of the entry to load.

    Returns:
        ``(preview, boxes, image_array)``, or ``(None, None, None)`` when the
        index is out of range or loading fails.
    """
    if not batch_data or index < 0 or index >= len(batch_data):
        return None, None, None

    entry = batch_data[index]
    path = entry['path']
    try:
        # The context manager closes the file deterministically once the
        # pixel data has been copied into the array.
        with Image.open(path) as src:
            img_np = np.array(src.convert("RGB"))

        # Run OCR only when there is no cached result for this image.
        if entry['boxes'] is None:
            entry['boxes'] = core.detect_text_initial(img_np)
        boxes = entry['boxes']

        preview = core.draw_preview(img_np, boxes)
        return preview, boxes, img_np
    except Exception as e:
        print(f"Load error: {e}")
        return None, None, None
def navigate(direction, batch_data, current_index, current_boxes_state):
    """Handle the Prev/Next buttons.

    Stores the current edits, moves ``direction`` steps (clamped to the valid
    range) and loads the target image.

    Returns:
        ``(preview, batch_data, new_index, boxes, message)``. ``boxes`` is
        always a list: when the target image cannot be loaded it is ``[]``
        and the message says so, so later handlers never receive ``None``.
    """
    if not batch_data:
        return None, batch_data, 0, [], "No Data"

    last_index = len(batch_data) - 1
    # Guard against a stale index that no longer fits the batch.
    current_index = min(max(current_index, 0), last_index)

    # Save the edits made on the current image.
    batch_data[current_index]['boxes'] = current_boxes_state

    # Move and clamp to the valid range.
    new_index = min(max(current_index + direction, 0), last_index)

    preview, new_boxes, _ = load_image_at_index(batch_data, new_index)
    msg = f"Image {new_index+1}/{len(batch_data)}"
    if new_boxes is None:
        # Loading failed: keep the state iterable and tell the user.
        new_boxes = []
        msg += " (failed to load)"
    return preview, batch_data, new_index, new_boxes, msg
def _find_box_at(boxes, x, y):
    """Return the index of the first box whose bbox contains (x, y), or -1."""
    for idx, item in enumerate(boxes):
        x1, y1, x2, y2 = item['bbox']
        if x1 <= x <= x2 and y1 <= y <= y2:
            return idx
    return -1


def _merge_boxes(box_a, box_b):
    """Return a new active box covering both boxes, with their texts joined."""
    ax1, ay1, ax2, ay2 = box_a['bbox']
    bx1, by1, bx2, by2 = box_b['bbox']
    # Text order: the upper box first; on equal top edges, the left one first.
    if (ay1, ax1) <= (by1, bx1):
        first, second = box_a, box_b
    else:
        first, second = box_b, box_a
    return {
        'bbox': [min(ax1, bx1), min(ay1, by1), max(ax2, bx2), max(ay2, by2)],
        'text': str(first['text']) + " " + str(second['text']),
        'active': True,
    }


def canvas_click(batch_data, current_index, current_boxes_state, drawing_point, merge_src_idx, mode, evt: gr.SelectData):
    """Dispatch a click on the preview image according to the edit mode.

    Modes (matched by prefix of ``mode``):
      - "Toggle": flip the ``active`` flag of the clicked box.
      - "Merge": the first click selects a box, the second merges it with
        another one; clicking the same box or empty space cancels.
      - anything else (manual add): the first click sets the start corner,
        the second creates a box if it is larger than 5 px on both axes.

    Returns:
        Always six values, matching the six wired outputs:
        ``(preview, batch_data, boxes, drawing_point, merge_src_idx, message)``.
    """
    if not batch_data:
        return None, batch_data, current_boxes_state, drawing_point, merge_src_idx, "No Data"

    # Normalize state that may be stale or missing.
    if current_boxes_state is None:
        current_boxes_state = []
    if merge_src_idx is not None and not 0 <= merge_src_idx < len(current_boxes_state):
        # The remembered merge source no longer refers to an existing box.
        merge_src_idx = None

    try:
        with Image.open(batch_data[current_index]['path']) as src:
            img_np = np.array(src.convert("RGB"))
    except Exception as e:
        return None, batch_data, current_boxes_state, drawing_point, merge_src_idx, f"Error loading image: {e}"

    click_x, click_y = evt.index
    msg = ""

    # ------------------------------------------------------------------
    # A. Toggle mode (erase / keep)
    # ------------------------------------------------------------------
    if mode.startswith("Toggle"):
        hit_idx = _find_box_at(current_boxes_state, click_x, click_y)
        if hit_idx != -1:
            hit = current_boxes_state[hit_idx]
            hit['active'] = not hit['active']
        # Reset the state of the other modes.
        drawing_point = None
        merge_src_idx = None
        msg = "Toggled box state."

    # ------------------------------------------------------------------
    # B. Merge mode (combine two boxes)
    # ------------------------------------------------------------------
    elif mode.startswith("Merge"):
        clicked_idx = _find_box_at(current_boxes_state, click_x, click_y)
        if clicked_idx == -1:
            # Clicked empty space -> cancel.
            merge_src_idx = None
            msg = "Canceled."
        elif merge_src_idx is None:
            # First box selected.
            merge_src_idx = clicked_idx
            msg = "Select 2nd box to merge."
        elif merge_src_idx == clicked_idx:
            # Same box clicked twice -> cancel.
            merge_src_idx = None
            msg = "Merge canceled (Same box)."
        else:
            # Second box selected -> merge.
            new_box = _merge_boxes(
                current_boxes_state[merge_src_idx],
                current_boxes_state[clicked_idx],
            )
            # Remove the two originals, highest index first so the lower
            # index stays valid.
            for idx in sorted((merge_src_idx, clicked_idx), reverse=True):
                current_boxes_state.pop(idx)
            current_boxes_state.append(new_box)
            merge_src_idx = None
            msg = "Merged successfully!"
        drawing_point = None

    # ------------------------------------------------------------------
    # C. Manual add mode
    # ------------------------------------------------------------------
    else:
        merge_src_idx = None
        if drawing_point is None:
            # First click: remember the start corner.
            drawing_point = (click_x, click_y)
            msg = "Start point set. Click End point."
        else:
            # Second click: build the box from the two corners.
            sx, sy = drawing_point
            x1, x2 = int(min(sx, click_x)), int(max(sx, click_x))
            y1, y2 = int(min(sy, click_y)), int(max(sy, click_y))
            # Ignore boxes that are too small.
            if (x2 - x1) > 5 and (y2 - y1) > 5:
                current_boxes_state.append({'bbox': [x1, y1, x2, y2], 'text': "", 'active': True})
                msg = "Manual Box added."
            else:
                msg = "Box too small."
            drawing_point = None

    # Persist the edits and redraw; a pending merge source is highlighted.
    batch_data[current_index]['boxes'] = current_boxes_state
    preview = core.draw_preview(img_np, current_boxes_state, temp_point=drawing_point, highlight_idx=merge_src_idx)
    return preview, batch_data, current_boxes_state, drawing_point, merge_src_idx, msg
def generate_final_pptx_batch(batch_data, dilation, method, progress=gr.Progress()):
    """Build one PPTX containing a slide for every image in the batch.

    Args:
        batch_data: list of ``{'path': ..., 'boxes': ...}`` entries.
        dilation: mask expansion in pixels (rounded to an int).
        method: inpainting method name passed to ``core.inpaint_image``.
        progress: Gradio progress tracker.

    Returns:
        ``(message, pptx_path)``; ``pptx_path`` is ``None`` on failure.
    """
    if not batch_data:
        return "No data", None

    # Slide dimension limits in points (1 inch and 56 inches).
    # NOTE(review): taken from the PresentationML slide-size range
    # (914400 - 51206400 EMU) -- confirm against the spec.
    min_slide_pt = 72
    max_slide_pt = 4032

    try:
        # Sliders may deliver floats; the mask kernel needs an int.
        dilation = int(round(dilation))

        prs = Presentation()

        # The first image determines the slide size.
        first_path = batch_data[0]['path']
        if not os.path.exists(first_path):
            return "Error: Image file not found.", None
        with Image.open(first_path) as img0:
            width_px, height_px = img0.size

        # 1 px maps to 1 pt; oversized images are scaled down uniformly so
        # the longer side fits the maximum slide dimension.
        scale = min(1.0, max_slide_pt / max(width_px, height_px))
        prs.slide_width = Pt(max(min_slide_pt, width_px * scale))
        prs.slide_height = Pt(max(min_slide_pt, height_px * scale))

        added = 0
        skipped = 0
        for data in progress.tqdm(batch_data, desc="Generating Slides"):
            if not os.path.exists(data['path']):
                skipped += 1
                continue

            with Image.open(data['path']) as src:
                img_np = np.array(src.convert("RGB"))

            # Run OCR for images the user never opened.
            boxes = data['boxes']
            if boxes is None:
                boxes = core.detect_text_initial(img_np)

            # Build the mask and erase the text.
            mask = core.create_mask_from_states(img_np.shape, boxes, dilation)
            clean_img = core.inpaint_image(img_np, mask, method)

            core.add_slide_to_prs(prs, img_np, clean_img, boxes)
            added += 1

        # Save into a fresh temp directory so that concurrent requests do not
        # overwrite each other's result.
        out_dir = tempfile.mkdtemp(prefix="slide_fixer_")
        out_path = os.path.join(out_dir, "final_presentation.pptx")
        prs.save(out_path)

        msg = f"Completed! {added} slides merged."
        if skipped:
            msg += f" ({skipped} missing image(s) skipped.)"
        return msg, out_path

    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"Error during PPTX generation: {e}", None
# ==============================================================================
# 4. Gradio UI 構築
# ==============================================================================
# Theme: the Soft base theme with indigo/slate hues, then explicit color
# overrides for backgrounds, borders, inputs, the primary button, and text.
theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="slate",
    neutral_hue="slate",
    font=["Meiryo", "sans-serif"]
).set(
    body_background_fill="#0b0f19",
    block_background_fill="#111827",
    block_border_color="#374151",
    input_background_fill="#1f2937",
    button_primary_background_fill="#4f46e5",
    body_text_color="#f3f4f6",
    block_label_text_color="#f3f4f6",
    block_title_text_color="#f3f4f6"
)
# Extra CSS: hide the footer and give the container a full-viewport minimum height.
css = """
footer {visibility: hidden}
.gradio-container {min-height: 100vh;}
"""
with gr.Blocks(theme=theme, css=css, title="Slide Fixer Pro Max") as demo:
    gr.Markdown("# 🚀 Slide Fixer Pro Max")
    gr.Markdown("画像をアップロード -> 確認・編集 -> **文字が消えた編集可能なPPTX** を生成!")

    # ----- State -----
    batch_data_state = gr.State([])       # paths and OCR data of all images
    current_idx_state = gr.State(0)       # index of the image being shown
    current_boxes_state = gr.State([])    # box list of the image being shown
    draw_point_state = gr.State(None)     # manual-add mode: start corner
    merge_src_idx_state = gr.State(None)  # merge mode: index of the first box

    # ----- UI layout -----
    with gr.Row():
        files_input = gr.File(file_count="multiple", label="1. Upload Images (Multiple Support)")

    with gr.Row():
        # Left column: image preview and click editor
        with gr.Column(scale=2):
            canvas = gr.Image(label="Preview & Click Editor", interactive=False)

        # Right column: control panel
        with gr.Column(scale=1):
            info_text = gr.Textbox(label="Status", value="Waiting for upload...")

            # Paging
            with gr.Row():
                prev_btn = gr.Button("<< Prev Image")
                next_btn = gr.Button("Next Image >>")

            gr.Markdown("### 🛠️ Edit Mode")
            edit_mode = gr.Radio(
                [
                    "Toggle (Click to Remove/Keep)",
                    "Merge Boxes (Click 2 Boxes)",
                    "Add Manual Box (Click Start/End)"
                ],
                value="Toggle (Click to Remove/Keep)",
                label="Mode Select"
            )
            gr.Markdown("""
- **Toggle**: 赤枠をクリックで ON/OFF 切り替え。
- **Merge**: 離れた枠を2つクリックして合体。
- **Add**: 認識されなかった文字を手動で囲む。
""")
            gr.Markdown("---")

            # Settings
            dilation_sld = gr.Slider(0, 30, 10, label="Dilation (Mask Expansion)")
            method_radio = gr.Radio(["LaMa", "OpenCV"], value="LaMa", label="Inpaint Method")

            # Generate button
            gen_btn = gr.Button("✨ Generate Merged PPTX (All Black Text)", variant="primary", size="lg")
            dl_file = gr.File(label="Download Result")

    # ----- Event wiring -----
    # Switching to another image (upload or paging) also clears the pending
    # manual-add start point and merge selection; both refer to the image
    # that was on screen when they were set.
    def _upload_and_reset(files):
        return (*on_files_upload(files), None, None)

    def _make_nav_handler(direction):
        def _handler(batch_data, index, boxes):
            return (*navigate(direction, batch_data, index, boxes), None, None)
        return _handler

    image_change_outputs = [
        canvas,
        batch_data_state,
        current_idx_state,
        current_boxes_state,
        info_text,
        draw_point_state,
        merge_src_idx_state,
    ]
    nav_inputs = [batch_data_state, current_idx_state, current_boxes_state]

    # 1. Image upload
    files_input.upload(
        fn=_upload_and_reset,
        inputs=[files_input],
        outputs=image_change_outputs
    )

    # 2. Previous image
    prev_btn.click(
        fn=_make_nav_handler(-1),
        inputs=nav_inputs,
        outputs=image_change_outputs
    )

    # 3. Next image
    next_btn.click(
        fn=_make_nav_handler(1),
        inputs=nav_inputs,
        outputs=image_change_outputs
    )

    # 4. Click on the image (edit action)
    canvas.select(
        fn=canvas_click,
        inputs=[
            batch_data_state,
            current_idx_state,
            current_boxes_state,
            draw_point_state,
            merge_src_idx_state,
            edit_mode
        ],
        outputs=[
            canvas,
            batch_data_state,
            current_boxes_state,
            draw_point_state,
            merge_src_idx_state,
            info_text
        ]
    )

    # 5. PPTX generation
    gen_btn.click(
        fn=generate_final_pptx_batch,
        inputs=[batch_data_state, dilation_sld, method_radio],
        outputs=[info_text, dl_file]
    )
if __name__ == "__main__":
    # Launch configuration intended for running on HuggingFace Spaces:
    # enable the request queue and do not create a public share link.
    demo.queue().launch(share=False)