Spaces:

deveshsahu
/

ppt_generator

Runtime error

File size: 13,770 Bytes

4370a7e

import streamlit as st
import os
import numpy as np  
import fitz  # PyMuPDF
import easyocr  # EasyOCR
import cv2
import shutil
import re
import threading
from PIL import Image
from pptx import Presentation
from pptx.util import Pt, Cm
from pptx.enum.text import PP_ALIGN
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from transformers import pipeline


# Setup EasyOCR reader
# Must be set before the reader below is constructed; the original author added
# it to work around an OpenMP runtime error.
# NOTE(review): presumably the "duplicate libomp" abort raised when several
# bundled OpenMP runtimes get loaded in one process — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"  # To address OpenMP runtime issue
# Module-level OCR reader configured for English only; it is used by
# extract_text_from_highlights().
# NOTE(review): this is built on every execution of the script; if start-up is
# slow, consider caching the instance — confirm against the Streamlit version used.
reader = easyocr.Reader(['en'])  # Initialize EasyOCR

# Function to convert PDF pages to images
def convert_pdf_pages_to_images(pdf_path, image_folder_path, zoom=3.5):
    """Render every page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Filesystem path of the PDF, or a binary file-like object
            (anything exposing ``read()``, such as an uploaded file) that
            holds the PDF bytes.
        image_folder_path: Directory receiving ``page_<n>.png`` files. It is
            created when it does not exist yet.
        zoom: Scale factor applied to both axes while rasterising. PDFs are
            usually laid out at 72 DPI, so the default 3.5 gives roughly
            252 DPI, which keeps small text legible for OCR.

    Returns:
        list[str]: Paths of the written images, in page order.
    """
    if hasattr(pdf_path, "read"):
        # In-memory upload: hand the raw bytes to PyMuPDF instead of a path.
        if hasattr(pdf_path, "seek"):
            pdf_path.seek(0)  # the stream may already have been consumed
        doc = fitz.open(stream=pdf_path.read(), filetype="pdf")
    else:
        doc = fitz.open(pdf_path)

    images = []
    try:
        os.makedirs(image_folder_path, exist_ok=True)
        mat = fitz.Matrix(zoom, zoom)  # same matrix for every page
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=mat, alpha=False)  # render page to an image
            image_path = os.path.join(image_folder_path, f"page_{page_num}.png")
            pix.save(image_path)
            images.append(image_path)
    finally:
        # Release the document handle even when rendering fails part-way.
        doc.close()
    return images

# Function to detect highlighted regions in images
def detect_highlighted_regions(image_paths):
    """Locate yellow-highlighted areas on each page image.

    Every image is converted to HSV and thresholded to the range
    H 20-30, S 100-255, V 100-255. The external contours of the resulting
    mask are filtered with refine_contours() and ordered with sort_contours().

    Args:
        image_paths: Iterable of image file paths to scan.

    Returns:
        list[tuple]: ``(image_path, (x1, y1, x2, y2))`` entries, grouped by
        image in input order, with the boxes of one image in the order
        produced by sort_contours().

    Raises:
        ValueError: If an image cannot be read or decoded.
    """
    lower_yellow = np.array([20, 100, 100])
    upper_yellow = np.array([30, 255, 255])
    highlighted_regions = []
    for image_path in image_paths:
        image = cv2.imread(image_path)
        if image is None:
            # imread signals failure by returning None instead of raising;
            # fail here with a clear message rather than inside cvtColor.
            raise ValueError(f"Could not read image: {image_path}")
        hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        mask = cv2.inRange(hsv, lower_yellow, upper_yellow)
        # findContours returns (contours, hierarchy) on OpenCV 4 but
        # (image, contours, hierarchy) on OpenCV 3; [-2] is the contour list on both.
        contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
        contours = refine_contours(contours)  # drop tiny specks
        contours = sort_contours(contours)  # keep reading order for text extraction
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)
            highlighted_regions.append((image_path, (x, y, x + w, y + h)))
    return highlighted_regions

# Function to refine contours based on certain criteria
def refine_contours(contours):
    """Return the contours whose bounding box is over 10 wide and over 10 tall.

    Smaller contours are discarded; the relative order of the kept ones is
    unchanged.
    """
    def is_large_enough(candidate):
        # Only the box dimensions matter here, not its position.
        _, _, width, height = cv2.boundingRect(candidate)
        return width > 10 and height > 10  # Example criteria

    return [candidate for candidate in contours if is_large_enough(candidate)]

# Function to sort contours from top to bottom
def sort_contours(contours):
    """Return a new list of the contours ordered by the top edge of their bounding box."""
    def top_edge(contour):
        # boundingRect yields (x, y, w, h); index 1 is the y coordinate.
        return cv2.boundingRect(contour)[1]

    ordered = list(contours)
    ordered.sort(key=top_edge)
    return ordered

# Function to extract text from highlighted regions
def extract_text_from_highlights(highlighted_regions):
    """OCR the text inside each highlighted region.

    Args:
        highlighted_regions: Iterable of ``(image_path, (x1, y1, x2, y2))``
            entries as produced by detect_highlighted_regions().

    Returns:
        list[str]: One string per region, in input order; the fragments
        recognised in a region are joined with single spaces.
    """
    extracted_texts = []
    loaded_path = None
    loaded_image = None
    for image_path, (x1, y1, x2, y2) in highlighted_regions:
        if image_path != loaded_path:
            # Consecutive regions usually share a page, so decode each page
            # once instead of once per region, and close the file handle
            # as soon as the pixels have been copied out.
            with Image.open(image_path) as page_image:
                loaded_image = page_image.convert('RGB')
            loaded_path = image_path
        cropped_image = loaded_image.crop((x1, y1, x2, y2))
        # detail=0 makes the reader return plain strings only.
        result = reader.readtext(np.array(cropped_image), detail=0)
        extracted_texts.append(" ".join(result))
    return extracted_texts


# Function to count words in a text
def count_words(text):
    """Return the number of word-character runs (regex ``\\w+``) found in ``text``."""
    return sum(1 for _ in re.finditer(r'\w+', text))

# Function to delete images in a folder
def delete_images(folder_path):
    """Delete the image files located directly inside ``folder_path``.

    Files are selected by extension (.png, .jpg, .jpeg, .gif, .bmp,
    case-insensitive). Sub-folders are not descended into and other files are
    left alone. A missing folder is treated as "nothing to delete".

    Args:
        folder_path: Directory to clean.

    Returns:
        None. A file that cannot be removed is reported through ``st.write``
        and does not stop the clean-up of the remaining files.
    """
    if not os.path.isdir(folder_path):
        # Nothing to clean; avoids FileNotFoundError from os.listdir.
        return

    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
    for file_name in os.listdir(folder_path):
        if not file_name.lower().endswith(image_suffixes):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            os.remove(file_path)
        except OSError as e:
            # Best effort: report the failure and keep going.
            st.write(f"Error deleting {file_path}: {e}")

# Function to save extracted text to a Word document
def save_text_to_word(extracted_texts, output_path):
    """Write the extracted texts to a Word document, summarising long ones.

    Texts of at most 20 words are stored verbatim. Longer texts are passed
    through a ``t5-small`` summarization pipeline with ``min_length=20`` and
    ``max_length=min(200, word count)``. Each result becomes one paragraph.

    After saving, image files in a relative ``images`` folder (resolved
    against the current working directory) are deleted when that folder exists.

    Args:
        extracted_texts: Iterable of strings, one per highlighted region.
        output_path: Destination ``.docx`` path.
    """
    max_words = 200
    min_words = 20
    doc = Document()
    # Built on first use and then reused: loading the model for every long
    # text is expensive, and short-only inputs never need it at all.
    summarizer = None
    for text in extracted_texts:
        num_words = count_words(text)
        if num_words <= min_words:
            summarized_text = text
        else:
            if summarizer is None:
                summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
            max_length = min(max_words, num_words)
            summarized_text = summarizer(text, max_length=max_length, min_length=min_words)[0]['summary_text']
        doc.add_paragraph(summarized_text)
    doc.save(output_path)

    # NOTE(review): this path is relative to the working directory, not to the
    # folder the page images were rendered into — confirm it is still wanted.
    folder_path = "images"
    if os.path.isdir(folder_path):
        # Guarded so a missing folder no longer aborts after the document was saved.
        delete_images(folder_path)

# Function to create a presentation from a Word document
def create_presentation_from_word(doc_path, pptx_path, template_path):
    """Build a PowerPoint deck from the paragraphs of a Word document.

    Every slide uses layout index 1 of the template and writes body text into
    placeholder index 1.

    * Paragraphs before the first heading go onto untitled slides.
    * A heading starts a new titled slide. A heading is either a single line
      such as ``1.2 Title``, or a bare number such as ``1.2`` followed by a
      paragraph starting with a lowercase letter; in the latter case both
      lines form the title and the second line is consumed.
    * Other paragraphs are added to the body of the current heading slide.
    * Body text is split with split_text_into_parts() at about 110 words;
      every part after the first goes onto a fresh slide.

    Args:
        doc_path: Source ``.docx`` path.
        pptx_path: Destination ``.pptx`` path.
        template_path: Template passed to ``Presentation``.
    """
    doc = Document(doc_path)
    prs = Presentation(template_path)
    single_line_heading_pattern = re.compile(r'^\d+(\.\d+)*(\.\d+)*\s+[A-Z].*')
    multi_line_heading_number_pattern = re.compile(r'^\d+(\.\d+)*$')
    word_limit_per_slide = 110
    slide_layout = prs.slide_layouts[1]

    def add_content_only_slide():
        """Append a slide and return its resized body placeholder."""
        slide = prs.slides.add_slide(slide_layout)
        body_shape = slide.placeholders[1]
        customize_content_shape(body_shape)
        return body_shape

    # Read the paragraph list once instead of re-evaluating doc.paragraphs for
    # every look-ahead.
    paragraphs = doc.paragraphs
    content_started = False
    text_frame_for_initial_content = None
    content_shape = None  # body placeholder of the current heading slide, if any
    skip_next = False  # set when the next paragraph was merged into a title

    for i, paragraph in enumerate(paragraphs):
        if skip_next:
            # Already used as the second line of a multi-line heading; adding
            # it again would duplicate it as body text.
            skip_next = False
            continue

        text = paragraph.text.strip()

        if not content_started and single_line_heading_pattern.match(text) is None and not text.isdigit():
            parts = split_text_into_parts(text, word_limit_per_slide)
            # Position-based check: identical parts must still get their own slide.
            for part_index, part in enumerate(parts):
                if text_frame_for_initial_content is None or part_index > 0:
                    text_frame_for_initial_content = add_content_only_slide().text_frame
                add_content_to_slide(text_frame_for_initial_content, part)
            continue

        content_started = True
        next_text = paragraphs[i + 1].text.strip() if i + 1 < len(paragraphs) else ""
        # `next_text` may be empty; test for that before indexing its first character.
        is_multi_line_heading = bool(
            multi_line_heading_number_pattern.match(text)
            and next_text
            and next_text[0].islower()
        )

        if single_line_heading_pattern.match(text) or is_multi_line_heading:
            current_slide = prs.slides.add_slide(slide_layout)
            title_shape, content_shape = current_slide.shapes.title, current_slide.placeholders[1]

            title_text = f"{text} {next_text}" if is_multi_line_heading else text
            skip_next = is_multi_line_heading

            customize_title_shape(title_shape, title_text)
            customize_content_shape(content_shape)

        elif content_shape is not None:
            # Body text is only kept once a heading slide exists.
            parts = split_text_into_parts(text.replace('_', '.'), word_limit_per_slide)
            for part_index, part in enumerate(parts):
                if part_index > 0:
                    content_shape = add_content_only_slide()
                add_content_to_slide(content_shape.text_frame, part)

    prs.save(pptx_path)

# Function to format headings in a Word document
def format_headings_in_word(doc_path, output_path):
    """Rewrite a Word document with bold headings and merged body paragraphs.

    Each paragraph is stripped and has ``_`` replaced by ``.`` before any
    further processing. Then:

    * paragraphs starting with ``Figure`` followed by whitespace are dropped;
    * heading lines (``1.2 Title`` or a bare number such as ``1.2``) are
      written bold at 12 pt; a bare number also takes the following paragraph
      as a second bold line unless that paragraph is a figure line;
    * consecutive non-heading, non-figure paragraphs are joined with spaces
      into one justified paragraph.

    Args:
        doc_path: Source ``.docx`` path.
        output_path: Destination ``.docx`` path.
    """
    doc = Document(doc_path)
    single_line_heading_pattern = re.compile(r'^\d+(\.\d+)*(\.\d+)*\s+[A-Z].*')
    heading_number_pattern = re.compile(r'^\d+(\.\d+)*$')
    figure_line_pattern = re.compile(r'^Figure\s+')

    # Normalise every paragraph once. The look-ahead checks below must see the
    # same text as the current-line checks, otherwise a heading such as
    # "1_2 Title" is swallowed by the preceding body paragraph.
    texts = [paragraph.text.strip().replace('_', '.') for paragraph in doc.paragraphs]
    total = len(texts)

    new_doc = Document()

    def is_heading(candidate):
        """Tell whether the text is a single-line heading or a bare heading number."""
        return bool(single_line_heading_pattern.match(candidate) or heading_number_pattern.match(candidate))

    def add_heading_line(heading_text):
        """Append one bold 12 pt paragraph."""
        run = new_doc.add_paragraph().add_run(heading_text)
        run.bold = True
        run.font.size = Pt(12)

    i = 0
    while i < total:
        text = texts[i]

        if figure_line_pattern.match(text):
            i += 1
            continue

        if is_heading(text):
            add_heading_line(text)
            if heading_number_pattern.match(text) and i + 1 < total and not figure_line_pattern.match(texts[i + 1]):
                # A bare number: the next paragraph carries the heading title.
                i += 1
                add_heading_line(texts[i])
        else:
            text_content = [text]
            # Absorb following paragraphs until a heading or figure line starts.
            while i + 1 < total and not is_heading(texts[i + 1]) and not figure_line_pattern.match(texts[i + 1]):
                i += 1
                text_content.append(texts[i])
            new_paragraph = new_doc.add_paragraph(' '.join(text_content))
            new_paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY

        i += 1

    new_doc.save(output_path)

# Function to split text into parts ensuring each part has around 'limit' words without cutting sentences in the middle
def split_text_into_parts(text, limit):
    """Group the sentences of ``text`` into parts of at most ``limit`` words.

    Sentences are delimited by whitespace that follows ``.``, ``!`` or ``?``
    and are never cut: a sentence longer than ``limit`` forms a part of its
    own. Words inside a part are joined with single spaces.

    Returns:
        list[str]: The parts in order; empty when ``text`` holds no words.
    """
    chunks = []
    bucket = []  # words collected for the part being built

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        tokens = sentence.split()
        if len(bucket) + len(tokens) <= limit:
            bucket += tokens
            continue
        # The sentence does not fit: close the current part and start a new one.
        if bucket:
            chunks.append(' '.join(bucket))
        bucket = tokens

    if bucket:
        chunks.append(' '.join(bucket))

    return chunks

# Function to customize title shape in PowerPoint
def customize_title_shape(title_shape, title_text):
    """Fill a title shape with ``title_text`` and pin its size and position.

    The text is split after each period that is followed by whitespace, and
    every non-empty piece becomes its own justified Calibri paragraph. The
    font size shrinks with the piece length: 28 pt up to 31 characters,
    24 pt for 32-40, 22 pt for 41-50 and 18 pt from 51 on. The shape is then
    set to 21 cm x 2.5 cm at 0.3 cm from the left and 0.4 cm from the top.
    """
    for position, piece in enumerate(re.split(r'(?<=\.)\s+', title_text)):
        if not piece:
            continue

        frame = title_shape.text_frame
        # Reuse the first paragraph only while it is still empty.
        if position > 0 or frame.paragraphs[0].text:
            paragraph = frame.add_paragraph()
        else:
            paragraph = frame.paragraphs[0]
        paragraph.text = piece.replace('_', '.')

        length = len(piece)
        if length >= 51:
            points = 18
        elif length >= 41:
            points = 22
        elif length > 31:
            points = 24
        else:
            points = 28

        paragraph.alignment = PP_ALIGN.JUSTIFY
        for run in paragraph.runs:
            run.font.size = Pt(points)
            run.font.name = 'Calibri'

    title_shape.width = Cm(21)
    title_shape.height = Cm(2.5)
    title_shape.left = Cm(0.3)
    title_shape.top = Cm(0.4)

# Function to customize content shape in PowerPoint
def customize_content_shape(content_shape):
    """Resize the body shape to 24 cm x 15 cm and place it at (0.3 cm, 2.8 cm)."""
    geometry = (
        ("width", 24),
        ("height", 15),
        ("left", 0.3),
        ("top", 2.8),
    )
    for attribute, centimetres in geometry:
        setattr(content_shape, attribute, Cm(centimetres))

# Function to add content to slide in PowerPoint
def add_content_to_slide(text_frame, text):
    """Append ``text`` to a text frame, one paragraph per sentence.

    The text is split after each period that is followed by whitespace. Each
    non-empty piece has ``_`` replaced by ``.`` and is written as a justified
    paragraph in 21 pt Calibri. The frame's first paragraph is reused only
    for the very first piece and only while that paragraph is still empty.
    """
    for position, piece in enumerate(re.split(r'(?<=\.)\s+', text)):
        if not piece:
            continue

        if position > 0 or text_frame.paragraphs[0].text:
            paragraph = text_frame.add_paragraph()
        else:
            paragraph = text_frame.paragraphs[0]

        paragraph.text = piece.replace('_', '.')
        paragraph.alignment = PP_ALIGN.JUSTIFY
        for run in paragraph.runs:
            run.font.size = Pt(21)
            run.font.name = 'Calibri'

# Streamlit app
# Page flow: collect the PDF, the PowerPoint template and an output folder,
# then run extraction -> Word -> formatted Word -> PowerPoint on button click.
st.title("PDF to PPT Converter")

pdf_path = st.file_uploader("Select PDF:", type=["pdf"])
pptx_template_path = st.file_uploader("Select PowerPoint Template:", type=["pptx"])
output_dir = st.text_input("Enter Output Directory Path:")
output_dir = os.path.abspath(output_dir) if output_dir else None

if st.button("Convert PDF to PPT"):
    if pdf_path is None:
        st.warning("Please select a PDF file.")
    elif pptx_template_path is None:
        st.warning("Please select a PowerPoint template.")
    elif not output_dir:
        st.warning("Please enter the output directory path.")
    else:
        # Scratch folder for the rendered pages; removed again when done.
        image_folder_path = os.path.join(output_dir, "images")
        pdf_filename = os.path.basename(pdf_path.name)
        base_name = os.path.splitext(pdf_filename)[0]
        output_word_file = os.path.join(output_dir, f"{base_name}_extracted_text.docx")
        formatted_output_word_file = os.path.join(output_dir, f"{base_name}_formatted_extracted_text.docx")
        pptx_output_path = os.path.join(output_dir, f"{base_name}_presentation.pptx")

        # Process PDF to extract text and convert to PowerPoint presentation
        try:
            # Creates the output directory as well when it does not exist.
            os.makedirs(image_folder_path, exist_ok=True)

            # The uploader yields an in-memory file, not a path, so write it
            # to disk for the PDF renderer. It sits in the scratch folder and
            # is removed together with it.
            local_pdf_path = os.path.join(image_folder_path, pdf_filename)
            with open(local_pdf_path, "wb") as pdf_file:
                pdf_file.write(pdf_path.getvalue())

            st.write("Converting PDF to Word...")
            image_paths = convert_pdf_pages_to_images(local_pdf_path, image_folder_path)
            highlighted_regions = detect_highlighted_regions(image_paths)
            extracted_texts = extract_text_from_highlights(highlighted_regions)
            save_text_to_word(extracted_texts, output_word_file)

            st.write("Formatting Word document...")
            format_headings_in_word(output_word_file, formatted_output_word_file)

            st.write("Creating PowerPoint presentation...")
            create_presentation_from_word(formatted_output_word_file, pptx_output_path, pptx_template_path)
            st.success(f"Presentation saved to {pptx_output_path}")
        except Exception as e:
            # Top-level boundary of the app: show the failure instead of a traceback.
            st.error(f"An error occurred: {e}")
        finally:
            # Remove the scratch folder whether or not the conversion succeeded.
            shutil.rmtree(image_folder_path, ignore_errors=True)