Spaces:

rein0421
/

mk_pptx

Sleeping

syurein

add

d4ea988 about 2 months ago

24.5 kB

	import gradio as gr
	import cv2
	import numpy as np
	import easyocr
	from PIL import Image, ImageDraw
	import os
	import tempfile
	import torch
	from torch.hub import download_url_to_file

	# Check that the PPTX library (python-pptx) can be imported
	try:
	from pptx import Presentation
	from pptx.util import Pt
	from pptx.dml.color import RGBColor
	except ImportError:
	print("⚠️ python-pptx is not installed. Please run: pip install python-pptx")

	# ==============================================================================
	# 1. LaMa (inpainting model) wrapper class that works on both CPU and GPU
	# ==============================================================================
	class SafeLama:
	    """LaMa inpainting model (TorchScript) that runs on GPU when available, else CPU.

	    On construction the checkpoint is downloaded to ``~/.cache/big-lama.pt``
	    if that file does not exist yet, and is then loaded with
	    ``torch.jit.load`` onto the selected device.
	    """

	    # Height and width are padded up to a multiple of this value before
	    # inference and the result is cropped back afterwards.
	    # NOTE(review): 8 follows the usual LaMa inference recipe (as used by
	    # lama-cleaner / IOPaint) - confirm against the deployed checkpoint.
	    PAD_MODULO = 8

	    def __init__(self):
	        # Use the GPU when one is available, otherwise fall back to the CPU.
	        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	        print(f"LaMa is running on: {self.device}")

	        # Model file location (remote URL and local cache path).
	        self.model_url = "https://github.com/sanster/models/releases/download/add_big_lama/big-lama.pt"
	        self.model_path = os.path.join(os.path.expanduser("~"), ".cache", "big-lama.pt")

	        if not os.path.exists(self.model_path):
	            print(f"Downloading LaMa model to {self.model_path}...")
	            os.makedirs(os.path.dirname(self.model_path), exist_ok=True)
	            download_url_to_file(self.model_url, self.model_path)

	        # map_location places the weights directly on the chosen device.
	        try:
	            self.model = torch.jit.load(self.model_path, map_location=self.device)
	            self.model.eval()
	            self.model.to(self.device)
	            print("LaMa model loaded successfully.")
	        except Exception as e:
	            print(f"Fatal Error loading LaMa model: {e}")
	            # Bare raise keeps the original traceback for the caller.
	            raise

	    @staticmethod
	    def _pad_to_modulo(array, modulo):
	        """Pad the first two axes (H, W) of *array* up to a multiple of *modulo*.

	        Padding is appended at the bottom/right using mirrored pixels, so the
	        original content stays at ``[:H, :W]``. The input array itself is
	        returned when no padding is needed.
	        """
	        height, width = array.shape[:2]
	        pad_h = (-height) % modulo
	        pad_w = (-width) % modulo
	        if pad_h == 0 and pad_w == 0:
	            return array
	        pad_spec = [(0, pad_h), (0, pad_w)] + [(0, 0)] * (array.ndim - 2)
	        return np.pad(array, pad_spec, mode="symmetric")

	    def __call__(self, image: "Image.Image", mask: "Image.Image") -> "Image.Image":
	        """Inpaint the masked region of *image*.

	        Args:
	            image: PIL image; converted to RGB before inference.
	            mask: PIL image of the same size (L or RGB); 0 = keep, 255 = remove.

	        Returns:
	            A PIL RGB image with the same size as *image*.
	        """
	        # Pre-processing: to NumPy, scaled to [0, 1]. Forcing RGB guarantees
	        # exactly three channels for the permute below.
	        img_np = np.array(image.convert("RGB")).astype(np.float32) / 255.0
	        mask_np = np.array(mask.convert("L")).astype(np.float32) / 255.0

	        # Remember the true size, then pad both inputs identically.
	        orig_h, orig_w = img_np.shape[:2]
	        img_np = self._pad_to_modulo(img_np, self.PAD_MODULO)
	        mask_np = self._pad_to_modulo(mask_np, self.PAD_MODULO)

	        # To tensors: image -> (1, 3, H, W), mask -> (1, 1, H, W).
	        img_t = torch.from_numpy(img_np).permute(2, 0, 1).unsqueeze(0).to(self.device)
	        mask_t = torch.from_numpy(mask_np).unsqueeze(0).unsqueeze(0).to(self.device)
	        mask_t = (mask_t > 0.5).float()  # binarize

	        # Inference
	        with torch.no_grad():
	            output = self.model(img_t, mask_t)

	        # Post-processing: back to HWC uint8, cropped to the original size.
	        cur_res = output[0].permute(1, 2, 0).detach().cpu().numpy()
	        cur_res = cur_res[:orig_h, :orig_w]
	        cur_res = np.clip(cur_res * 255, 0, 255).astype(np.uint8)

	        return Image.fromarray(cur_res)


	# ==============================================================================
	# 2. Core logic (OCR, preview drawing, PPTX generation)
	# ==============================================================================
	class SlideCleanerCore:
	    """Core operations of the app: OCR detection, preview drawing, mask
	    creation, inpainting and PPTX slide assembly."""

	    def __init__(self):
	        print("Initializing EasyOCR...")
	        use_gpu = torch.cuda.is_available()
	        self.reader = easyocr.Reader(['ja', 'en'], gpu=use_gpu)

	        # LaMa is optional: when loading fails, lama_model stays None and
	        # inpaint_image() uses the OpenCV fallback instead.
	        self.lama_model = None
	        try:
	            print("Loading LaMa wrapper...")
	            self.lama_model = SafeLama()
	        except Exception as e:
	            print(f"LaMa load failed: {e}")

	    def detect_text_initial(self, image_np):
	        """Run OCR and return the editable list of box states.

	        Each entry is a dict with 'bbox' ([x1, y1, x2, y2] as ints, the
	        axis-aligned bounds of the detected quadrilateral), 'text' and
	        'active' (True = marked for erasing).
	        """
	        print("Running OCR detection...")
	        results = self.reader.readtext(image_np)
	        box_states = []
	        for (bbox, text, prob) in results:
	            (tl, tr, br, bl) = bbox
	            # Convert the corner coordinates to plain ints.
	            x1 = int(min(tl[0], bl[0]))
	            y1 = int(min(tl[1], tr[1]))
	            x2 = int(max(tr[0], br[0]))
	            y2 = int(max(bl[1], br[1]))

	            box_states.append({
	                'bbox': [x1, y1, x2, y2],
	                'text': text,
	                'active': True  # initial state: marked for erasing
	            })
	        return box_states

	    def draw_preview(self, image_np, box_states, temp_point=None, highlight_idx=None):
	        """Render the preview image.

	        - Active boxes: translucent red with a red outline (to be erased).
	        - The box at highlight_idx (waiting to be merged): cyan.
	        - temp_point (start point of a manual box): yellow dot.

	        Returns a PIL RGB image.
	        """
	        pil_img = Image.fromarray(image_np).convert("RGBA")
	        overlay = Image.new("RGBA", pil_img.size, (255, 255, 255, 0))
	        draw = ImageDraw.Draw(overlay)

	        for i, item in enumerate(box_states):
	            if item['active']:
	                x1, y1, x2, y2 = item['bbox']

	                # Default colors
	                fill_color = (255, 0, 0, 100)  # translucent red
	                outline_color = "red"

	                # Highlight the box that is waiting for its merge partner
	                if highlight_idx is not None and i == highlight_idx:
	                    fill_color = (0, 255, 255, 150)  # translucent cyan
	                    outline_color = "cyan"

	                draw.rectangle([x1, y1, x2, y2], fill=fill_color, outline=outline_color, width=2)

	        # Start-point marker for manual-add mode
	        if temp_point:
	            tx, ty = temp_point
	            r = 5
	            draw.ellipse((tx-r, ty-r, tx+r, ty+r), fill="yellow", outline="black")

	        return Image.alpha_composite(pil_img, overlay).convert("RGB")

	    @staticmethod
	    def _kernel_size(dilation):
	        """Turn a (possibly fractional or negative) dilation value into a
	        non-negative integer kernel size."""
	        if dilation is None:
	            return 0
	        return max(0, int(round(float(dilation))))

	    def create_mask_from_states(self, image_shape, box_states, dilation=10):
	        """Build the inpainting mask (uint8, 255 inside active boxes).

	        Args:
	            image_shape: shape of the source image; only the first two
	                entries (height, width) are used.
	            box_states: list of box dicts; inactive boxes are ignored.
	            dilation: size of the square dilation kernel. Rounded to an int
	                because np.ones() rejects float sizes; 0 disables dilation.
	        """
	        h, w = image_shape[:2]
	        mask = np.zeros((h, w), dtype=np.uint8)

	        for item in box_states:
	            if item['active']:
	                x1, y1, x2, y2 = [int(v) for v in item['bbox']]
	                cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1)

	        kernel_size = self._kernel_size(dilation)
	        if kernel_size > 0:
	            kernel = np.ones((kernel_size, kernel_size), np.uint8)
	            mask = cv2.dilate(mask, kernel, iterations=1)

	        return mask

	    def inpaint_image(self, image_np, mask_np, method):
	        """Erase the masked area and return the result as a NumPy array.

	        Uses LaMa when method == "LaMa" and the model was loaded; otherwise
	        falls back to OpenCV's Telea inpainting.
	        """
	        if method == "LaMa" and self.lama_model is not None:
	            pil_img = Image.fromarray(image_np)
	            pil_mask = Image.fromarray(mask_np)
	            return np.array(self.lama_model(pil_img, pil_mask))
	        else:
	            # OpenCV fallback
	            return cv2.inpaint(image_np, mask_np, 3, cv2.INPAINT_TELEA)

	    @staticmethod
	    def _bbox_to_emu(bbox, img_w, img_h, slide_w_emu, slide_h_emu):
	        """Map a pixel bbox onto slide coordinates.

	        Returns (left, top, width, height) as ints in the slide's units, or
	        None when the bbox has no positive area.
	        """
	        # Force plain Python ints (values may arrive as NumPy scalars).
	        x1, y1, x2, y2 = [int(v) for v in bbox]
	        if x2 <= x1 or y2 <= y1:
	            return None

	        # Position/size relative to the image (0.0 - 1.0)
	        rel_x = x1 / img_w
	        rel_y = y1 / img_h
	        rel_w = (x2 - x1) / img_w
	        rel_h = (y2 - y1) / img_h

	        return (
	            int(slide_w_emu * rel_x),
	            int(slide_h_emu * rel_y),
	            int(slide_w_emu * rel_w),
	            int(slide_h_emu * rel_h),
	        )

	    def add_slide_to_prs(self, prs, original_img_np, clean_img_np, box_states):
	        """Append one slide to *prs*: the cleaned image as a full-slide
	        picture plus one text box per active box.

	        Errors are printed rather than raised (a failing text box is skipped,
	        a failing slide is reported), and the temporary background PNG is
	        always removed.
	        """
	        tmp_bg_path = None
	        try:
	            slide = prs.slides.add_slide(prs.slide_layouts[6])  # blank layout

	            # Image size as plain Python ints
	            img_h, img_w = (int(v) for v in original_img_np.shape[:2])

	            # Reserve a temp file name and close the handle before writing to
	            # it, so the path is not opened twice at the same time.
	            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_bg:
	                tmp_bg_path = tmp_bg.name
	            Image.fromarray(clean_img_np).save(tmp_bg_path)
	            slide.shapes.add_picture(tmp_bg_path, 0, 0, width=prs.slide_width, height=prs.slide_height)

	            # Slide size (EMU)
	            slide_width_emu = prs.slide_width
	            slide_height_emu = prs.slide_height

	            # Place one text box per active box
	            for item in box_states:
	                if not item['active']:
	                    continue
	                try:
	                    geometry = self._bbox_to_emu(
	                        item['bbox'], img_w, img_h, slide_width_emu, slide_height_emu
	                    )
	                    if geometry is None:
	                        # Degenerate coordinates: nothing to place.
	                        continue
	                    left, top, width, height = geometry

	                    textbox = slide.shapes.add_textbox(left, top, width, height)
	                    tf = textbox.text_frame
	                    tf.word_wrap = True

	                    p = tf.paragraphs[0]
	                    p.text = str(item['text'])

	                    # Font size is about 70% of the box height
	                    # (1 pt = 12700 EMU), never below 8 pt.
	                    font_size_pt = max(8, height * 0.7 / 12700)

	                    # Apply formatting to the paragraph defaults and to every
	                    # run. NOTE(review): paragraph-level defaults are
	                    # reportedly not honored by every renderer; run-level
	                    # properties are the explicit form - confirm.
	                    for font in [p.font] + [run.font for run in p.runs]:
	                        font.size = Pt(font_size_pt)
	                        font.color.rgb = RGBColor(0, 0, 0)  # text is always black
	                        font.name = "Meiryo"  # font with Japanese glyphs

	                except Exception as e_box:
	                    print(f"Skipping text box due to error: {e_box}")
	                    continue

	        except Exception as e_slide:
	            print(f"Error adding slide: {e_slide}")
	        finally:
	            # Remove the temp file even when slide creation failed midway.
	            if tmp_bg_path and os.path.exists(tmp_bg_path):
	                os.remove(tmp_bg_path)


	# Instantiate the shared core once at import time; the event handlers below use this module-level object.
	core = SlideCleanerCore()


	# ==============================================================================
	# 3. UI logic (event handlers)
	# ==============================================================================

	def on_files_upload(files):
	    """Initialize the batch when files are uploaded.

	    Args:
	        files: uploaded items, each either a plain path string or an object
	            exposing the path as ``.name``.

	    Returns:
	        (preview, batch_data, current_index, first_boxes, message). On empty
	        input or when the first image cannot be loaded, the preview is None
	        and both lists are empty.
	    """
	    if not files:
	        return None, [], 0, [], "No files selected."

	    # Accept both path strings and file wrappers carrying the path in .name.
	    file_paths = [f if isinstance(f, str) else f.name for f in files]

	    # To save memory only the path and metadata (boxes) are kept per image,
	    # not the pixel data.
	    batch_data = [{'path': p, 'boxes': None} for p in file_paths]

	    # Load the first image for the initial display.
	    try:
	        # The context manager closes the file handle once pixels are copied.
	        with Image.open(file_paths[0]) as src:
	            first_np = np.array(src.convert("RGB"))
	        # Run OCR
	        first_boxes = core.detect_text_initial(first_np)
	        batch_data[0]['boxes'] = first_boxes

	        preview = core.draw_preview(first_np, first_boxes)
	        msg = f"Loaded {len(files)} images. Showing 1/{len(files)}."
	    except Exception as e:
	        return None, [], 0, [], f"Error loading image: {e}"

	    return preview, batch_data, 0, first_boxes, msg

	def load_image_at_index(batch_data, index):
	    """Load the image at *index*, run OCR if it has no cached boxes, and
	    build its preview.

	    Returns (preview, boxes, image array), or (None, None, None) when the
	    index is out of range, the batch is empty, or loading fails.
	    """
	    failure = (None, None, None)

	    in_range = bool(batch_data) and 0 <= index < len(batch_data)
	    if not in_range:
	        return failure

	    entry = batch_data[index]
	    path = entry['path']
	    try:
	        pixels = np.array(Image.open(path).convert("RGB"))

	        # Reuse cached OCR results; otherwise run OCR and store them.
	        cached = entry['boxes']
	        if cached is None:
	            cached = core.detect_text_initial(pixels)
	            entry['boxes'] = cached

	        return core.draw_preview(pixels, cached), cached, pixels
	    except Exception as e:
	        print(f"Load error: {e}")
	        return failure

	def navigate(direction, batch_data, current_index, current_boxes_state):
	    """Handle the Prev/Next buttons.

	    Stores the current edits, moves by *direction* (clamped to the valid
	    range) and returns (preview, batch_data, new_index, new_boxes, message).
	    """
	    if not batch_data:
	        return None, batch_data, 0, [], "No Data"

	    # Persist the edits made on the image being left.
	    batch_data[current_index]['boxes'] = current_boxes_state

	    # Clamp the target index into [0, len - 1].
	    last_index = len(batch_data) - 1
	    target = min(max(current_index + direction, 0), last_index)

	    preview, boxes, _unused = load_image_at_index(batch_data, target)
	    status = f"Image {target+1}/{len(batch_data)}"

	    return preview, batch_data, target, boxes, status

	def _find_box_at(boxes, x, y):
	    """Return the index of the first box whose bbox contains (x, y), or -1."""
	    for i, item in enumerate(boxes):
	        x1, y1, x2, y2 = item['bbox']
	        if x1 <= x <= x2 and y1 <= y <= y2:
	            return i
	    return -1


	def canvas_click(batch_data, current_index, current_boxes_state, drawing_point, merge_src_idx, mode, evt: gr.SelectData):
	    """Dispatch a click on the preview image according to the edit mode.

	    Modes are selected by the prefix of *mode*:
	      - "Toggle...": flip the 'active' flag of the clicked box.
	      - "Merge...":  first click selects a box, second click merges it with
	                     another one (clicking the same box or empty space cancels).
	      - anything else: manual box, first click sets the start corner, second
	                     click sets the opposite corner.

	    Returns:
	        (preview, batch_data, current_boxes_state, drawing_point,
	        merge_src_idx, message) - always six values.
	    """
	    if not batch_data:
	        # Same six-value shape as the normal return at the end.
	        return None, batch_data, current_boxes_state, drawing_point, merge_src_idx, "No Data"

	    if current_boxes_state is None:
	        current_boxes_state = []

	    # Drop a pending merge selection that no longer points at an existing box.
	    if merge_src_idx is not None and not (0 <= merge_src_idx < len(current_boxes_state)):
	        merge_src_idx = None

	    path = batch_data[current_index]['path']
	    with Image.open(path) as src:
	        img_np = np.array(src.convert("RGB"))
	    click_x, click_y = evt.index
	    msg = ""

	    # ------------------------------------------------------------------
	    # A. Toggle mode (switch between erase / keep)
	    # ------------------------------------------------------------------
	    if mode.startswith("Toggle"):
	        hit_idx = _find_box_at(current_boxes_state, click_x, click_y)
	        if hit_idx != -1:
	            target = current_boxes_state[hit_idx]
	            target['active'] = not target['active']  # invert
	            msg = "Toggled box state."
	        else:
	            msg = "No box at click position."
	        # Reset the state of the other modes
	        drawing_point = None
	        merge_src_idx = None

	    # ------------------------------------------------------------------
	    # B. Merge mode (combine two boxes)
	    # ------------------------------------------------------------------
	    elif mode.startswith("Merge"):
	        clicked_idx = _find_box_at(current_boxes_state, click_x, click_y)

	        if clicked_idx == -1:
	            # Clicked on empty space -> cancel
	            merge_src_idx = None
	            msg = "Canceled."
	        elif merge_src_idx is None:
	            # First box selected
	            merge_src_idx = clicked_idx
	            msg = "Select 2nd box to merge."
	        elif merge_src_idx == clicked_idx:
	            # Same box clicked again -> cancel
	            merge_src_idx = None
	            msg = "Merge canceled (Same box)."
	        else:
	            # Second box selected -> merge
	            box_a = current_boxes_state[merge_src_idx]
	            box_b = current_boxes_state[clicked_idx]

	            # Union of the two rectangles
	            nx1 = min(box_a['bbox'][0], box_b['bbox'][0])
	            ny1 = min(box_a['bbox'][1], box_b['bbox'][1])
	            nx2 = max(box_a['bbox'][2], box_b['bbox'][2])
	            ny2 = max(box_a['bbox'][3], box_b['bbox'][3])

	            # Join the texts, the box with the smaller top Y first
	            if box_a['bbox'][1] < box_b['bbox'][1]:
	                new_text = str(box_a['text']) + " " + str(box_b['text'])
	            else:
	                new_text = str(box_b['text']) + " " + str(box_a['text'])

	            new_box = {'bbox': [nx1, ny1, nx2, ny2], 'text': new_text, 'active': True}

	            # Remove the two originals, highest index first so the other
	            # index stays valid.
	            for idx in sorted([merge_src_idx, clicked_idx], reverse=True):
	                current_boxes_state.pop(idx)

	            current_boxes_state.append(new_box)

	            merge_src_idx = None
	            msg = "Merged successfully!"

	        drawing_point = None

	    # ------------------------------------------------------------------
	    # C. Add-manual-box mode
	    # ------------------------------------------------------------------
	    else:
	        merge_src_idx = None
	        if drawing_point is None:
	            # First click: register the start point
	            drawing_point = (click_x, click_y)
	            msg = "Start point set. Click End point."
	        else:
	            # Second click: register the end point and create the box
	            sx, sy = drawing_point
	            x1, x2 = int(min(sx, click_x)), int(max(sx, click_x))
	            y1, y2 = int(min(sy, click_y)), int(max(sy, click_y))

	            # Ignore boxes that are too small
	            if (x2 - x1) > 5 and (y2 - y1) > 5:
	                current_boxes_state.append({'bbox': [x1, y1, x2, y2], 'text': "", 'active': True})
	                msg = "Manual Box added."
	            else:
	                msg = "Box too small."

	            drawing_point = None

	    # Store the edits and redraw; a pending merge source is highlighted.
	    batch_data[current_index]['boxes'] = current_boxes_state
	    preview = core.draw_preview(img_np, current_boxes_state, temp_point=drawing_point, highlight_idx=merge_src_idx)

	    return preview, batch_data, current_boxes_state, drawing_point, merge_src_idx, msg


	def generate_final_pptx_batch(batch_data, dilation, method, progress=gr.Progress()):
	    """Build one PPTX containing a slide for every image in the batch.

	    Args:
	        batch_data: list of {'path', 'boxes'} dicts; entries whose 'boxes'
	            is None are OCR'd here.
	        dilation: mask expansion; rounded to a non-negative int.
	        method: inpainting method name passed to core.inpaint_image.
	        progress: Gradio progress tracker.

	    Returns:
	        (status message, path of the saved .pptx) or (error message, None).
	    """
	    if not batch_data:
	        return "No data", None

	    try:
	        prs = Presentation()

	        # The first image determines the slide size.
	        if not os.path.exists(batch_data[0]['path']):
	            return "Error: Image file not found.", None

	        with Image.open(batch_data[0]['path']) as img0:
	            base_w, base_h = img0.size

	        # 1 px maps to 1 pt. Shrink proportionally when the longer side
	        # would exceed 56 inches (4032 pt), the largest slide dimension
	        # PowerPoint accepts; text boxes are placed relative to the slide,
	        # so they scale with it.
	        max_side_pt = 56 * 72
	        scale = min(1.0, max_side_pt / max(base_w, base_h))
	        prs.slide_width = Pt(base_w * scale)
	        prs.slide_height = Pt(base_h * scale)

	        # The slider may deliver a float; kernel sizes must be ints.
	        mask_dilation = max(0, int(round(float(dilation or 0))))

	        # Process every image
	        added = 0
	        for data in progress.tqdm(batch_data, desc="Generating Slides"):
	            if not os.path.exists(data['path']):
	                continue

	            with Image.open(data['path']) as src:
	                img_np = np.array(src.convert("RGB"))

	            # Run OCR now if it has not been run for this image yet
	            boxes = data['boxes']
	            if boxes is None:
	                boxes = core.detect_text_initial(img_np)

	            # Build the mask and inpaint
	            mask = core.create_mask_from_states(img_np.shape, boxes, mask_dilation)
	            clean_img = core.inpaint_image(img_np, mask, method)

	            # Add the slide
	            core.add_slide_to_prs(prs, img_np, clean_img, boxes)
	            added += 1

	        # Save into a fresh temp directory so concurrent sessions do not
	        # overwrite each other's result.
	        out_dir = tempfile.mkdtemp(prefix="slide_fixer_")
	        out_path = os.path.join(out_dir, "final_presentation.pptx")
	        prs.save(out_path)
	        return f"Completed! {added} slides merged.", out_path

	    except Exception as e:
	        import traceback
	        traceback.print_exc()
	        return f"Error during PPTX generation: {e}", None


	# ==============================================================================
	# 4. Gradio UI construction
	# ==============================================================================

	# Theme: start from the Soft preset, then apply the dark color overrides
	theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="slate", neutral_hue="slate", font=["Meiryo", "sans-serif"])
	theme = theme.set(
	    # text colors
	    body_text_color="#f3f4f6",
	    block_label_text_color="#f3f4f6",
	    block_title_text_color="#f3f4f6",
	    # surfaces
	    body_background_fill="#0b0f19",
	    block_background_fill="#111827",
	    input_background_fill="#1f2937",
	    # borders and accents
	    block_border_color="#374151",
	    button_primary_background_fill="#4f46e5",
	)

	# Extra CSS: hide the footer and make the container fill the viewport height
	css = """
	footer {visibility: hidden}
	.gradio-container {min-height: 100vh;}
	"""

	with gr.Blocks(theme=theme, css=css, title="Slide Fixer Pro Max") as demo:
	    gr.Markdown("# 🚀 Slide Fixer Pro Max")
	    gr.Markdown("画像をアップロード -> 確認・編集 -> 文字が消えた編集可能なPPTX を生成！")

	    # ----- State variables -----
	    batch_data_state = gr.State([])       # paths and OCR data of all images
	    current_idx_state = gr.State(0)       # index of the image being shown
	    current_boxes_state = gr.State([])    # box list of the image being shown
	    draw_point_state = gr.State(None)     # manual-add mode: start point
	    merge_src_idx_state = gr.State(None)  # merge mode: index of the first box

	    # ----- UI layout -----
	    with gr.Row():
	        files_input = gr.File(file_count="multiple", label="1. Upload Images (Multiple Support)")

	    with gr.Row():
	        # Left column: image preview and click editor
	        with gr.Column(scale=2):
	            canvas = gr.Image(label="Preview & Click Editor", interactive=False)

	        # Right column: control panel
	        with gr.Column(scale=1):
	            info_text = gr.Textbox(label="Status", value="Waiting for upload...")

	            # Paging
	            with gr.Row():
	                prev_btn = gr.Button("<< Prev Image")
	                next_btn = gr.Button("Next Image >>")

	            gr.Markdown("### 🛠️ Edit Mode")
	            edit_mode = gr.Radio(
	                [
	                    "Toggle (Click to Remove/Keep)",
	                    "Merge Boxes (Click 2 Boxes)",
	                    "Add Manual Box (Click Start/End)"
	                ],
	                value="Toggle (Click to Remove/Keep)",
	                label="Mode Select"
	            )
	            gr.Markdown("""
	            - Toggle: 赤枠をクリックで ON/OFF 切り替え。
	            - Merge: 離れた枠を2つクリックして合体。
	            - Add: 認識されなかった文字を手動で囲む。
	            """)

	            gr.Markdown("---")
	            # Settings. step=1 keeps the dilation an integer, since it is
	            # used as a kernel size downstream.
	            dilation_sld = gr.Slider(0, 30, 10, step=1, label="Dilation (Mask Expansion)")
	            method_radio = gr.Radio(["LaMa", "OpenCV"], value="LaMa", label="Inpaint Method")

	            # Generate button
	            gen_btn = gr.Button("✨ Generate Merged PPTX (All Black Text)", variant="primary", size="lg")
	            dl_file = gr.File(label="Download Result")

	    # ----- Event wiring -----

	    # A pending manual-box start point or merge selection belongs to the
	    # image it was made on, so both are cleared whenever the shown image
	    # changes (upload or navigation).
	    def _upload_and_reset(files):
	        return on_files_upload(files) + (None, None)

	    def _make_nav_handler(direction):
	        def _handler(batch, idx, boxes):
	            return navigate(direction, batch, idx, boxes) + (None, None)
	        return _handler

	    image_change_outputs = [
	        canvas,
	        batch_data_state,
	        current_idx_state,
	        current_boxes_state,
	        info_text,
	        draw_point_state,
	        merge_src_idx_state,
	    ]
	    nav_inputs = [batch_data_state, current_idx_state, current_boxes_state]

	    # 1. Image upload
	    files_input.upload(
	        fn=_upload_and_reset,
	        inputs=[files_input],
	        outputs=image_change_outputs
	    )

	    # 2. Previous image
	    prev_btn.click(
	        fn=_make_nav_handler(-1),
	        inputs=nav_inputs,
	        outputs=image_change_outputs
	    )

	    # 3. Next image
	    next_btn.click(
	        fn=_make_nav_handler(1),
	        inputs=nav_inputs,
	        outputs=image_change_outputs
	    )

	    # 4. Click on the image (edit action)
	    canvas.select(
	        fn=canvas_click,
	        inputs=[
	            batch_data_state,
	            current_idx_state,
	            current_boxes_state,
	            draw_point_state,
	            merge_src_idx_state,
	            edit_mode
	        ],
	        outputs=[
	            canvas,
	            batch_data_state,
	            current_boxes_state,
	            draw_point_state,
	            merge_src_idx_state,
	            info_text
	        ]
	    )

	    # 5. PPTX generation
	    gen_btn.click(
	        fn=generate_final_pptx_batch,
	        inputs=[batch_data_state, dilation_sld, method_radio],
	        outputs=[info_text, dl_file]
	    )

	if __name__ == "__main__":
	    # Launch settings for running on HuggingFace Spaces
	    demo.queue().launch(share=False)