Studymaker2

Sleeping

App Files Files Community

Studymaker2 / ppt_parser.py

g0th

Update ppt_parser.py

3b3c05a verified 7 months ago

raw

history blame

3.67 kB

	import io
	import json
	import logging
	import os

	from PIL import Image
	from pptx import Presentation
	from pptx.shapes.group import GroupShape
	from pptx.shapes.picture import Picture
	from pptx.util import Inches

	def print_json(item):
	    """Serialize ``item`` to a pretty-printed JSON string.

	    Despite the name, nothing is printed: the JSON text is returned. The
	    output is indented by four spaces and non-ASCII characters are written
	    as-is rather than as ``\\uXXXX`` escapes.
	    """
	    dump_options = {'ensure_ascii': False, 'indent': 4}
	    return json.dumps(item, **dump_options)

	def safe_font_attribute(run, attr):
	    """Read the attribute named ``attr`` from ``run.font`` without raising.

	    Any exception raised while reaching ``run.font`` or the attribute
	    itself is swallowed and ``None`` is returned in its place.
	    """
	    try:
	        value = getattr(run.font, attr)
	    except Exception:
	        # Best-effort lookup: an unreadable attribute is reported as None.
	        value = None
	    return value

	def safe_color(run):
	    """Return ``str(run.font.color.rgb)``, or ``None`` when unavailable.

	    ``None`` is returned when the colour object or its ``rgb`` value is
	    falsy, and also when any step of the lookup raises.
	    """
	    try:
	        color = run.font.color
	        if not color:
	            return None
	        rgb = color.rgb
	        if not rgb:
	            return None
	        return str(rgb)
	    except Exception:
	        # Best-effort lookup: a colour that cannot be read counts as absent.
	        return None

	def extract_paragraph_data(paragraph):
	    """Summarise a paragraph's text, alignment and first-run font settings.

	    Returns ``None`` when the paragraph has no runs. Otherwise returns a
	    dict with ``text``, ``align`` (whatever ``paragraph.alignment`` yields)
	    and ``font``. Font details are read from the first run only, so
	    formatting applied to later runs is not reported.
	    """
	    runs = paragraph.runs
	    if not runs:
	        return None
	    first_run = runs[0]
	    paragraph_data = {'text': paragraph.text, 'align': paragraph.alignment}
	    # Plain attributes share one lookup helper; colour has its own.
	    font = {
	        key: safe_font_attribute(first_run, key)
	        for key in ('name', 'bold', 'italic', 'underline')
	    }
	    font['color'] = safe_color(first_run)
	    font['language_id'] = safe_font_attribute(first_run, 'language_id')
	    paragraph_data['font'] = font
	    return paragraph_data

	def transfer_textbox_content_in_group(group_shape):
	    """Describe the text content of each shape inside a group shape.

	    Returns a dict keyed ``shape_<index>`` in iteration order. A shape with
	    a text frame maps to a dict holding ``type`` ("text"), ``location``
	    (``(left, top)``) and one ``paragraph_<n>`` entry per paragraph for
	    which ``extract_paragraph_data`` returns data. A shape without a text
	    frame maps to an empty dict.
	    """
	    # NOTE(review): shapes without a text frame still get an (empty) entry;
	    # confirm this matches the intended nesting of the original code.
	    members = {}
	    for index, member in enumerate(group_shape.shapes):
	        entry = {}
	        members[f"shape_{index}"] = entry
	        if not member.has_text_frame:
	            continue
	        entry['type'] = "text"
	        entry['location'] = (member.left, member.top)
	        for number, paragraph in enumerate(member.text_frame.paragraphs):
	            parsed = extract_paragraph_data(paragraph)
	            if parsed:
	                entry[f'paragraph_{number}'] = parsed
	    return members

	def _collect_paragraphs(text_frame, shape_item):
	    """Add a ``paragraph_<n>`` entry to ``shape_item`` for each paragraph that yields data."""
	    for r, paragraph in enumerate(text_frame.paragraphs):
	        data = extract_paragraph_data(paragraph)
	        if data:
	            shape_item[f'paragraph_{r}'] = data


	def _emu_to_inches(length):
	    """Convert an EMU length to inches; ``None`` is passed through."""
	    # Inches(1) is the EMU length of one inch, so dividing by it converts
	    # EMU to inches. Inches(length).inches would merely return the EMU
	    # number unchanged as a float.
	    return None if length is None else length / Inches(1)


	def _export_picture(shape, image_path):
	    """Describe a picture shape and write its image to ``image_path``.

	    Returns the picture's dict. ``image_path`` in that dict is set to
	    ``None`` when the image could not be written, so callers never receive
	    a path to a file that does not exist.
	    """
	    image = shape.image
	    shape_item = {
	        'type': "picture",
	        'image_path': image_path,
	        'size': image.size,
	        'dpi': image.dpi,
	        'location': (shape.left, shape.top),
	        'location_inches': (_emu_to_inches(shape.left), _emu_to_inches(shape.top)),
	    }
	    try:
	        with Image.open(io.BytesIO(image.blob)) as shape_image:
	            shape_image.save(image_path)
	    except Exception:
	        # Export stays best-effort: one unreadable image must not abort the
	        # whole parse, but the failure is logged instead of hidden.
	        logging.getLogger(__name__).warning(
	            "Could not export picture to %s", image_path, exc_info=True
	        )
	        shape_item['image_path'] = None
	    return shape_item


	def transfer_to_structure(pptx_file, images_dir_path):
	    """Parse a presentation into a JSON description and export its pictures.

	    Args:
	        pptx_file: Passed to ``Presentation``.
	        images_dir_path: Directory for the exported pictures; created if it
	            does not exist.

	    Returns:
	        A ``(json_text, image_path_list)`` tuple. ``json_text`` is the
	        ``print_json`` rendering of a dict keyed ``slide_<i>`` whose values
	        are dicts keyed ``shape_<j>``. ``image_path_list`` holds the paths
	        of the pictures that were actually written.

	    Shapes are classified in this order: text frame, group, picture. Any
	    other shape is recorded as an empty dict.
	    """
	    item = {}
	    prs = Presentation(pptx_file)
	    image_path_list = []

	    os.makedirs(images_dir_path, exist_ok=True)

	    for i, slide in enumerate(prs.slides):
	        slide_item = {}

	        for j, shape in enumerate(slide.shapes):
	            shape_item = {}

	            # Case 1: Normal text box
	            if shape.has_text_frame:
	                shape_item['type'] = "text"
	                _collect_paragraphs(shape.text_frame, shape_item)

	            # Case 2: Grouped shapes
	            elif isinstance(shape, GroupShape):
	                shape_item['type'] = "group"
	                shape_item['group_content'] = transfer_textbox_content_in_group(shape)

	            # Case 3: Picture
	            elif isinstance(shape, Picture):
	                # The slide index is part of the file name so that pictures
	                # sharing a shape index on different slides do not
	                # overwrite each other.
	                image_path = os.path.join(
	                    images_dir_path, f"slide_{i}_picture_{j}.png"
	                )
	                shape_item = _export_picture(shape, image_path)
	                if shape_item['image_path'] is not None:
	                    image_path_list.append(image_path)

	            slide_item[f"shape_{j}"] = shape_item

	        item[f"slide_{i}"] = slide_item

	    return print_json(item), image_path_list