File size: 4,616 Bytes
67bf68b
 
7877199
67bf68b
 
 
 
 
 
906d18c
 
 
4cd5238
67bf68b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7877199
23807f5
7877199
4166de3
7877199
23807f5
 
b5abbc4
23807f5
906d18c
 
 
 
 
23807f5
906d18c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4cd5238
906d18c
4cd5238
47c535b
 
 
 
 
 
 
 
4166de3
47c535b
 
 
 
 
 
4166de3
 
 
47c535b
906d18c
 
23807f5
906d18c
 
 
3b29aeb
c3a449c
 
 
4166de3
c3a449c
 
4166de3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from PyPDF2 import PdfReader
import re
import streamlit as st
import fitz
from transformers import pipeline
import os
import requests
import io
from PIL import Image
from pptx import Presentation
from pptx.util import Inches
import tempfile

# Hugging Face Inference API endpoint used to generate slide images.
API_URL = "https://api-inference.huggingface.co/models/runwayml/stable-diffusion-v1-5"

# SECURITY: the API token must never be committed to source control. It is
# read from the HF_TOKEN environment variable instead; any token that was
# previously hard-coded here should be considered leaked and revoked.
_hf_token = os.environ.get("HF_TOKEN", "").strip()

# Only send an Authorization header when a token is actually configured, so a
# missing token results in an anonymous request rather than a malformed
# "Bearer " header.
headers = {"Authorization": f"Bearer {_hf_token}"} if _hf_token else {}
# Summarization pipeline, constructed once at import time and reused for
# every paragraph extracted from the uploaded PDF.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

def query(payload, timeout=120):
    """POST ``payload`` as JSON to ``API_URL`` and return the raw response body.

    Args:
        payload: JSON-serializable request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` would block indefinitely on a stalled
            connection.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    # Fail loudly on HTTP errors: otherwise the error body would be handed
    # back to the caller as if it were a successful payload.
    response.raise_for_status()
    return response.content

def _group_blocks_into_paragraphs(blocks, spacing_threshold):
    """Merge one page's text blocks into paragraphs, splitting on large vertical gaps."""
    page_paragraphs = []
    current_paragraph = ""
    previous_bottom = None

    for block in blocks:
        # Each block starts with its bounding box (x0, y0, x1, y1) followed by its text.
        top, bottom, text = block[1], block[3], block[4]

        # The first block on a page has no predecessor, so its gap counts as 0.
        vertical_spacing = 0 if previous_bottom is None else top - previous_bottom

        if vertical_spacing > spacing_threshold:
            # A gap larger than the threshold closes the current paragraph.
            stripped = current_paragraph.strip()
            if stripped:
                page_paragraphs.append(stripped)
            current_paragraph = text
        else:
            current_paragraph += " " + text

        previous_bottom = bottom

    # Flush the last paragraph of the page; whitespace-only text is dropped
    # so callers never receive empty paragraphs.
    stripped = current_paragraph.strip()
    if stripped:
        page_paragraphs.append(stripped)

    return page_paragraphs


def extract_paragraphs_by_vertical_spacing(pdf_data, spacing_threshold=10):
    """Extract paragraphs from a PDF by grouping text blocks on vertical spacing.

    Consecutive text blocks on a page are joined into one paragraph until the
    gap between the bottom of one block and the top of the next exceeds
    ``spacing_threshold``. Paragraphs never span pages.

    Args:
        pdf_data: Raw bytes of the PDF document.
        spacing_threshold: Vertical gap (in the coordinate units reported by
            the PDF library) above which a new paragraph is started.

    Returns:
        A list of stripped, non-empty paragraph strings in page order. If the
        PDF cannot be opened or read, the error is printed and the paragraphs
        collected from the pages processed so far (possibly none) are returned.
    """
    paragraphs = []

    try:
        pdf_document = fitz.open(stream=io.BytesIO(pdf_data), filetype="pdf")
        try:
            for page_number in range(pdf_document.page_count):
                page = pdf_document.load_page(page_number)
                paragraphs.extend(
                    _group_blocks_into_paragraphs(page.get_text("blocks"), spacing_threshold)
                )
        finally:
            # Always release the document, even when a page fails to parse.
            pdf_document.close()
    except Exception as e:
        # Best-effort extraction: report the problem and return what we have.
        print(f"Erreur lors de l'extraction du PDF : {str(e)}")

    return paragraphs

# Streamlit page: upload a PDF, summarize each extracted paragraph onto its
# own slide with a generated illustration, then offer the deck for download.
st.title("PDF2SLIDE")

uploaded_file = st.file_uploader("Select a PDF", type=["pdf"])

if uploaded_file is not None:
    pdf_data = uploaded_file.read()

    paragraphs = extract_paragraphs_by_vertical_spacing(pdf_data)

    # Create a PowerPoint presentation
    prs = Presentation()

    summary_min_length = 10

    # Pixel size the generated image is resized to before being embedded.
    image_width = 800
    image_height = 600

    for i, paragraph in enumerate(paragraphs, start=1):
        # max_length must be an integer and must stay above min_length, even
        # for very short paragraphs.
        summary_max_length = max(summary_min_length + 1, len(paragraph) // 2)
        summary = summarizer(
            paragraph,
            max_length=summary_max_length,
            min_length=summary_min_length,
            do_sample=False,
        )
        summary_text = summary[0]['summary_text']

        # Create a slide
        slide = prs.slides.add_slide(prs.slide_layouts[5])

        # Add the paragraph heading and its summary to the slide
        left = top = Inches(1)
        width = height = Inches(5)
        txBox = slide.shapes.add_textbox(left, top, width, height)
        tf = txBox.text_frame
        p = tf.add_paragraph()
        p.text = f"Paragraph {i}:"
        p.space_after = Inches(0.1)
        p = tf.add_paragraph()
        p.text = summary_text

        # Generate the illustration. A failure here (network/HTTP error or a
        # response that is not a decodable image) should not discard the
        # whole deck, so the slide is kept without a picture instead.
        # Both requests' exceptions and Pillow's image-decoding errors derive
        # from OSError.
        try:
            image_bytes = query({
                "inputs": 'A picture without text about: ' + summary_text
            })
            image = Image.open(io.BytesIO(image_bytes))

            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
            image = image.resize((image_width, image_height), Image.LANCZOS)

            # Keep the PNG in memory rather than leaking a temp file per slide.
            image_buffer = io.BytesIO()
            image.save(image_buffer, format="PNG")
            image_buffer.seek(0)
        except OSError as error:
            st.warning(f"Could not generate an image for paragraph {i}: {error}")
        else:
            # Add the image to the slide
            slide.shapes.add_picture(image_buffer, Inches(1), Inches(2), width, height)

    # Serialize the presentation in memory; no temp file or open handle is
    # left behind after the script run.
    presentation_buffer = io.BytesIO()
    prs.save(presentation_buffer)

    # Display a download button for the PowerPoint file
    st.download_button(
        label="Download PowerPoint Presentation",
        data=presentation_buffer.getvalue(),
        key="download_ppt",
        file_name="PDF2SLIDE_Presentation.pptx",
        mime="application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )