import streamlit as st
import os
import numpy as np
import fitz  # PyMuPDF
import easyocr  # EasyOCR
import cv2
import shutil
import re
import threading
from PIL import Image
# NOTE: ``Pt`` is imported from both pptx.util and docx.shared; the later
# (docx) import wins. The original import order is kept on purpose so the
# name resolves exactly as before.
from pptx import Presentation
from pptx.util import Pt, Cm
from pptx.enum.text import PP_ALIGN
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from transformers import pipeline

# Setup EasyOCR reader
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"  # To address OpenMP runtime issue
reader = easyocr.Reader(['en'])  # Initialize EasyOCR

# Zoom applied in both directions when rasterizing PDF pages. PDF user space
# is 72 units per inch, so 3.5x renders at 252 DPI.
PDF_RENDER_ZOOM = 3.5


def convert_pdf_pages_to_images(pdf_path, image_folder_path):
    """Render every page of a PDF to a PNG file.

    Args:
        pdf_path: Filesystem path of the PDF, or a binary file-like object
            exposing ``read()`` (such as the object returned by
            ``st.file_uploader``), whose full content is read into memory.
        image_folder_path: Directory that receives the ``page_<n>.png``
            files. It is created when it does not exist.

    Returns:
        The list of written image paths, in page order.
    """
    if hasattr(pdf_path, "read"):
        # Uploaded files are streams, not paths: rewind and hand the raw
        # bytes to PyMuPDF.
        if hasattr(pdf_path, "seek"):
            pdf_path.seek(0)
        doc = fitz.open(stream=pdf_path.read(), filetype="pdf")
    else:
        doc = fitz.open(pdf_path)

    try:
        os.makedirs(image_folder_path, exist_ok=True)
        matrix = fitz.Matrix(PDF_RENDER_ZOOM, PDF_RENDER_ZOOM)
        images = []
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=matrix, alpha=False)
            image_path = os.path.join(image_folder_path, f"page_{page_num}.png")
            pix.save(image_path)
            images.append(image_path)
        return images
    finally:
        doc.close()


def detect_highlighted_regions(image_paths):
    """Locate yellow-highlighted areas in the given page images.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        A list of ``(image_path, (x1, y1, x2, y2))`` tuples, one per
        detected region. Regions keep the order of ``image_paths`` and are
        sorted top to bottom within each image.

    Raises:
        FileNotFoundError: If an image cannot be read by OpenCV.
    """
    # HSV bounds used to threshold the yellow highlight color.
    lower_yellow = np.array([20, 100, 100])
    upper_yellow = np.array([30, 255, 255])

    highlighted_regions = []
    for image_path in image_paths:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {image_path}")
        hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        mask = cv2.inRange(hsv, lower_yellow, upper_yellow)
        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        for contour in sort_contours(refine_contours(contours)):
            x, y, w, h = cv2.boundingRect(contour)
            highlighted_regions.append((image_path, (x, y, x + w, y + h)))
    return highlighted_regions


def refine_contours(contours, min_width=10, min_height=10):
    """Keep only contours whose bounding box exceeds the minimum size.

    Args:
        contours: Contours as returned by ``cv2.findContours``.
        min_width: Bounding-box width a contour must exceed (exclusive).
        min_height: Bounding-box height a contour must exceed (exclusive).

    Returns:
        A list with the contours that pass both size checks.
    """
    refined = []
    for contour in contours:
        _, _, w, h = cv2.boundingRect(contour)
        if w > min_width and h > min_height:
            refined.append(contour)
    return refined


def sort_contours(contours):
    """Return the contours sorted by the top edge of their bounding box."""
    return sorted(contours, key=lambda c: cv2.boundingRect(c)[1])


def extract_text_from_highlights(highlighted_regions):
    """Run OCR on every highlighted region.

    Args:
        highlighted_regions: Iterable of ``(image_path, (x1, y1, x2, y2))``
            tuples as produced by ``detect_highlighted_regions``.

    Returns:
        A list with one string per region: the OCR fragments of that region
        joined by single spaces.
    """
    extracted_texts = []
    loaded_path = None
    page_image = None
    for image_path, box in highlighted_regions:
        if image_path != loaded_path:
            # Regions of one page arrive consecutively, so decode each page
            # once and release the file handle right away.
            with Image.open(image_path) as raw_image:
                page_image = raw_image.convert('RGB')
            loaded_path = image_path
        cropped_image = page_image.crop(box)
        result = reader.readtext(np.array(cropped_image), detail=0)
        extracted_texts.append(" ".join(result))
    return extracted_texts


def count_words(text):
    """Return the number of ``\\w+`` runs (words) in ``text``."""
    return len(re.findall(r'\w+', text))


def delete_images(folder_path):
    """Delete the image files directly inside ``folder_path``.

    Files are selected by extension (png, jpg, jpeg, gif, bmp; case
    insensitive). A missing folder is treated as already clean. Failures to
    delete individual files are reported through ``st.write`` and do not
    stop the loop.
    """
    if not os.path.isdir(folder_path):
        return
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
    for file_name in os.listdir(folder_path):
        if not file_name.lower().endswith(image_suffixes):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            os.remove(file_path)
        except OSError as e:
            st.write(f"Error deleting {file_path}: {e}")


def save_text_to_word(extracted_texts, output_path):
    """Write the extracted texts to a Word document, summarizing long ones.

    Texts of at most 20 words are written unchanged. Longer texts are
    summarized with the ``t5-small`` summarization pipeline, capped at
    ``min(200, word count)`` for ``max_length``.

    Args:
        extracted_texts: Iterable of strings, one paragraph each.
        output_path: Destination path of the ``.docx`` file.
    """
    max_words = 200
    min_words = 20

    doc = Document()
    summarizer = None  # built lazily: loading the model is expensive
    for text in extracted_texts:
        num_words = count_words(text)
        if num_words <= min_words:
            summarized_text = text
        else:
            if summarizer is None:
                summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
            max_length = min(max_words, num_words)
            summarized_text = summarizer(text, max_length=max_length, min_length=min_words)[0]['summary_text']
        doc.add_paragraph(summarized_text)
    doc.save(output_path)

    # NOTE(review): legacy cleanup of a hard-coded "images" folder relative
    # to the working directory; kept for compatibility. delete_images() is a
    # no-op when that folder does not exist.
    delete_images("images")
def create_presentation_from_word(doc_path, pptx_path, template_path):
    """Build a PowerPoint presentation from a Word document.

    Paragraphs that appear before the first heading are placed on untitled
    content slides. Every heading starts a new titled slide, and the body
    text that follows is added to it, continuing on extra slides whenever a
    paragraph exceeds the per-slide word limit.

    A heading is either a single line such as ``1.2 Title`` or a bare
    number line (``1.2``) followed by a paragraph that starts with a
    lowercase letter; in the latter case both lines form the title.

    Args:
        doc_path: Word document to read.
        pptx_path: Where the presentation is saved.
        template_path: PowerPoint template passed to ``Presentation``.
    """
    doc = Document(doc_path)
    prs = Presentation(template_path)
    single_line_heading_pattern = re.compile(r'^\d+(\.\d+)*(\.\d+)*\s+[A-Z].*')
    multi_line_heading_number_pattern = re.compile(r'^\d+(\.\d+)*$')
    word_limit_per_slide = 110
    slide_layout = prs.slide_layouts[1]

    def add_content_slide():
        """Append an untitled slide and return its resized body placeholder."""
        slide = prs.slides.add_slide(slide_layout)
        body_shape = slide.placeholders[1]
        customize_content_shape(body_shape)
        return body_shape

    texts = [paragraph.text.strip() for paragraph in doc.paragraphs]
    content_started = False
    initial_text_frame = None  # text frame receiving pre-heading content
    content_shape = None       # body placeholder of the latest heading slide
    skip_next = False          # set when the next line was merged into a title

    for i, text in enumerate(texts):
        if skip_next:
            skip_next = False
            continue

        is_single_line_heading = single_line_heading_pattern.match(text) is not None
        if not content_started and not is_single_line_heading and not text.isdigit():
            parts = split_text_into_parts(text, word_limit_per_slide)
            for part_index, part in enumerate(parts):
                if initial_text_frame is None or part_index > 0:
                    initial_text_frame = add_content_slide().text_frame
                add_content_to_slide(initial_text_frame, part)
            continue

        content_started = True
        next_text = texts[i + 1] if i + 1 < len(texts) else ""
        # next_text[:1] is "" for an empty paragraph, so islower() is False
        # instead of raising IndexError.
        is_multi_line_heading = (
            multi_line_heading_number_pattern.match(text) is not None
            and next_text[:1].islower()
        )

        if is_single_line_heading or is_multi_line_heading:
            current_slide = prs.slides.add_slide(slide_layout)
            title_text = f"{text} {next_text}" if is_multi_line_heading else text
            customize_title_shape(current_slide.shapes.title, title_text)
            content_shape = current_slide.placeholders[1]
            customize_content_shape(content_shape)
            # The following line is already part of the title; do not add it
            # again as body text.
            skip_next = is_multi_line_heading
        elif content_shape is not None:
            parts = split_text_into_parts(text.replace('_', '.'), word_limit_per_slide)
            for part_index, part in enumerate(parts):
                if part_index > 0:
                    content_shape = add_content_slide()
                add_content_to_slide(content_shape.text_frame, part)

    prs.save(pptx_path)


def format_headings_in_word(doc_path, output_path):
    """Rewrite a Word document with bold headings and merged body text.

    Underscores are replaced by periods in every paragraph. Lines starting
    with ``Figure`` are dropped. Heading lines are written bold at 12 pt; a
    bare heading number also pulls in the following line as a bold line
    unless that line is a figure line. Consecutive non-heading paragraphs
    are joined with spaces into one justified paragraph.

    Args:
        doc_path: Word document to read.
        output_path: Where the reformatted document is saved.
    """
    doc = Document(doc_path)
    single_line_heading_pattern = re.compile(r'^\d+(\.\d+)*(\.\d+)*\s+[A-Z].*')
    heading_number_pattern = re.compile(r'^\d+(\.\d+)*$')
    figure_line_pattern = re.compile(r'^Figure\s+')
    new_doc = Document()

    # Normalize once so the current line and the look-ahead line are
    # classified from the same text.
    texts = [paragraph.text.strip().replace('_', '.') for paragraph in doc.paragraphs]
    total = len(texts)

    def is_heading(line):
        return bool(single_line_heading_pattern.match(line) or heading_number_pattern.match(line))

    def is_figure(line):
        return bool(figure_line_pattern.match(line))

    def add_bold_line(line):
        run = new_doc.add_paragraph().add_run(line)
        run.bold = True
        run.font.size = Pt(12)

    i = 0
    while i < total:
        text = texts[i]
        if is_figure(text):
            i += 1
            continue

        if is_heading(text):
            add_bold_line(text)
            if heading_number_pattern.match(text) and i + 1 < total and not is_figure(texts[i + 1]):
                i += 1
                add_bold_line(texts[i])
        else:
            text_content = [text]
            while i + 1 < total and not is_heading(texts[i + 1]) and not is_figure(texts[i + 1]):
                i += 1
                text_content.append(texts[i])
            new_paragraph = new_doc.add_paragraph(' '.join(text_content))
            new_paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
        i += 1

    new_doc.save(output_path)


def split_text_into_parts(text, limit):
    """Split ``text`` into chunks of roughly ``limit`` words.

    Sentences (split after ``.``, ``!`` or ``?`` followed by whitespace)
    are never cut: a chunk is closed as soon as adding the next sentence
    would exceed ``limit`` words, so a single sentence longer than
    ``limit`` becomes a chunk of its own.

    Returns:
        The list of chunks, each with words joined by single spaces. Empty
        or whitespace-only text yields an empty list.
    """
    parts = []
    part_words = []
    current_count = 0
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence_words = sentence.split()
        sentence_length = len(sentence_words)
        if current_count + sentence_length > limit:
            if part_words:
                parts.append(' '.join(part_words))
            part_words = sentence_words
            current_count = sentence_length
        else:
            part_words.extend(sentence_words)
            current_count += sentence_length
    if part_words:
        parts.append(' '.join(part_words))
    return parts


def _paragraph_for_sentence(text_frame, index):
    """Return the paragraph that should receive sentence number ``index``.

    The text frame's existing first paragraph is reused only for the first
    sentence and only while it is still empty; otherwise a new paragraph is
    appended.
    """
    if index > 0 or text_frame.paragraphs[0].text:
        return text_frame.add_paragraph()
    return text_frame.paragraphs[0]


def _title_font_size(length):
    """Map a title sentence length (characters) to a font size in points."""
    if length >= 51:
        return 18
    if length >= 41:
        return 22
    if length > 31:
        return 24
    return 28


def customize_title_shape(title_shape, title_text):
    """Fill and position the title placeholder of a slide.

    The title is split after each period followed by whitespace; every
    non-empty sentence gets its own justified Calibri paragraph whose font
    size shrinks as the sentence gets longer. Underscores are replaced by
    periods. The shape is then given a fixed size and position.
    """
    text_frame = title_shape.text_frame
    for index, sentence in enumerate(re.split(r'(?<=\.)\s+', title_text)):
        if not sentence:
            continue
        p = _paragraph_for_sentence(text_frame, index)
        p.text = sentence.replace('_', '.')
        p.alignment = PP_ALIGN.JUSTIFY
        font_size = _title_font_size(len(sentence))
        for run in p.runs:
            run.font.size = Pt(font_size)
            run.font.name = 'Calibri'

    title_shape.width = Cm(21)
    title_shape.height = Cm(2.5)
    title_shape.left = Cm(0.3)
    title_shape.top = Cm(0.4)


def customize_content_shape(content_shape):
    """Give the content placeholder its fixed size and position."""
    content_shape.width = Cm(24)
    content_shape.height = Cm(15)
    content_shape.left = Cm(0.3)
    content_shape.top = Cm(2.8)


def add_content_to_slide(text_frame, text):
    """Append ``text`` to ``text_frame``, one paragraph per sentence.

    Sentences are split after each period followed by whitespace. Each
    non-empty sentence becomes a justified 21 pt Calibri paragraph, with
    underscores replaced by periods.
    """
    for index, sentence in enumerate(re.split(r'(?<=\.)\s+', text)):
        if not sentence:
            continue
        p = _paragraph_for_sentence(text_frame, index)
        p.text = sentence.replace('_', '.')
        p.alignment = PP_ALIGN.JUSTIFY
        for run in p.runs:
            run.font.size = Pt(21)
            run.font.name = 'Calibri'


# Streamlit app
st.title("PDF to PPT Converter")

pdf_path = st.file_uploader("Select PDF:", type=["pdf"])
pptx_template_path = st.file_uploader("Select PowerPoint Template:", type=["pptx"])
output_dir = st.text_input("Enter Output Directory Path:")
output_dir = os.path.abspath(output_dir) if output_dir else None

if st.button("Convert PDF to PPT"):
    if pdf_path is None:
        st.warning("Please select a PDF file.")
    elif pptx_template_path is None:
        st.warning("Please select a PowerPoint template.")
    elif not output_dir:
        st.warning("Please enter the output directory path.")
    else:
        image_folder_path = os.path.join(output_dir, "images")
        pdf_filename = os.path.basename(pdf_path.name)
        pdf_stem = os.path.splitext(pdf_filename)[0]
        output_word_file = os.path.join(output_dir, f"{pdf_stem}_extracted_text.docx")
        formatted_output_word_file = os.path.join(output_dir, f"{pdf_stem}_formatted_extracted_text.docx")
        pptx_output_path = os.path.join(output_dir, f"{pdf_stem}_presentation.pptx")

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Process PDF to extract text and convert to PowerPoint presentation
        try:
            st.write("Converting PDF to Word...")
            image_paths = convert_pdf_pages_to_images(pdf_path, image_folder_path)
            highlighted_regions = detect_highlighted_regions(image_paths)
            extracted_texts = extract_text_from_highlights(highlighted_regions)
            save_text_to_word(extracted_texts, output_word_file)

            st.write("Formatting Word document...")
            format_headings_in_word(output_word_file, formatted_output_word_file)

            st.write("Creating PowerPoint presentation...")
            # The upload is a stream; rewind it in case it was read before.
            pptx_template_path.seek(0)
            create_presentation_from_word(formatted_output_word_file, pptx_output_path, pptx_template_path)
        except Exception as e:
            st.error(f"An error occurred: {e}")
        finally:
            # Remove the temporary page images whether or not the conversion
            # succeeded; the folder may not exist if rendering failed early.
            shutil.rmtree(image_folder_path, ignore_errors=True)