File size: 4,696 Bytes
67bf68b
 
7877199
67bf68b
 
 
 
 
 
906d18c
6d9e930
2444fb8
4cd5238
67bf68b
 
 
 
 
 
 
 
ca96738
6d9e930
 
 
 
 
 
 
 
 
 
 
 
 
 
ca96738
 
67bf68b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e41304e
 
67bf68b
 
 
 
 
 
 
 
 
 
 
e41304e
67bf68b
 
 
 
 
 
 
 
 
 
 
7877199
23807f5
7877199
4166de3
7877199
23807f5
 
b5abbc4
23807f5
906d18c
 
 
 
 
23807f5
906d18c
4afa594
906d18c
f5cc285
 
 
 
 
 
 
 
906d18c
 
 
ec76e54
59e3acd
ec76e54
7df67c3
59e3acd
e41304e
6d9e930
 
 
2444fb8
906d18c
 
 
ca96738
6d9e930
f5cc285
906d18c
 
 
3b29aeb
c3a449c
 
 
4166de3
c3a449c
 
4166de3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from PyPDF2 import PdfReader
import re
import streamlit as st
import fitz
from transformers import pipeline
import os
import requests
import io
from PIL import Image
from pptx import Presentation
from pptx.util import Inches, Pt
import tempfile

# Hugging Face Inference API endpoint of the text-to-image model that
# illustrates each slide.
API_URL = "https://api-inference.huggingface.co/models/runwayml/stable-diffusion-v1-5"

# SECURITY: the access token is read from the HF_TOKEN environment variable
# instead of being committed to the source file. A token that was ever
# hard-coded here must be treated as leaked and revoked.
# When no token is configured, no Authorization header is sent at all rather
# than sending an empty bearer value.
_HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
headers = {"Authorization": f"Bearer {_HF_TOKEN}"} if _HF_TOKEN else {}

# Summarization pipeline used to condense each extracted paragraph; it is
# created once when the module is executed.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def query(payload, timeout=120):
    """Post *payload* to the image-generation endpoint and return the raw body.

    Args:
        payload: JSON-serializable request body sent to ``API_URL``.
        timeout: Seconds to wait for the server before aborting the request.
            Without a timeout a stalled connection would block forever.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.post(
        API_URL, headers=headers, json=payload, timeout=timeout
    )
    # The body of an error reply is not image data; fail here with a clear
    # HTTP error instead of handing it to the caller as if it were a picture.
    response.raise_for_status()
    return response.content

def add_line_breaks_to_summary(summary_text, line_length):
    """Wrap *summary_text* to lines of at most *line_length* characters.

    Words are never split: a single word longer than *line_length* is placed
    on a line of its own and is allowed to exceed the limit. Runs of
    whitespace in the input are collapsed to single spaces.

    Args:
        summary_text: The text to wrap.
        line_length: Maximum number of characters per output line.

    Returns:
        The wrapped text with lines joined by ``"\\n"``; an empty string when
        the input contains no words.
    """
    lines = []
    current_line = ""
    for word in summary_text.split():
        if not current_line:
            # The first word of a line is always accepted. This avoids
            # emitting an empty line when the word alone reaches the limit.
            current_line = word
        elif len(current_line) + 1 + len(word) <= line_length:
            # +1 accounts for the separating space.
            current_line = f"{current_line} {word}"
        else:
            lines.append(current_line)
            current_line = word
    if current_line:
        lines.append(current_line)
    return "\n".join(lines)

def _append_nonblank(paragraphs, text):
    """Append the stripped *text* to *paragraphs* unless it is blank."""
    stripped = text.strip()
    if stripped:
        paragraphs.append(stripped)


def extract_paragraphs_by_vertical_spacing(pdf_data, spacing_threshold=10):
    """Split the text of a PDF into paragraphs based on vertical gaps.

    Each page is read as a sequence of blocks. Consecutive blocks are merged
    into one paragraph until the gap between the bottom of the previous block
    and the top of the next one exceeds *spacing_threshold*. Paragraphs never
    span pages.

    Args:
        pdf_data: The PDF file content as bytes.
        spacing_threshold: Vertical gap (in page coordinate units) above
            which a new paragraph is started.

    Returns:
        A list of non-empty paragraph strings. Extraction is best effort: if
        the PDF cannot be processed, the error is printed and the paragraphs
        collected so far (possibly none) are returned.
    """
    paragraphs = []

    try:
        # The context manager closes the document even when a page fails to
        # parse, which the previous explicit close() did not guarantee.
        with fitz.open(stream=io.BytesIO(pdf_data), filetype="pdf") as pdf_document:
            for page_number in range(pdf_document.page_count):
                page = pdf_document.load_page(page_number)

                current_paragraph = ""
                previous_bottom = None

                for block in page.get_text("blocks"):
                    # Block tuples carry a type field after the text; only
                    # type 0 holds real text (per the PyMuPDF docs). Other
                    # blocks are skipped so placeholder strings for images do
                    # not end up in the paragraphs.
                    if len(block) > 6 and block[6] != 0:
                        continue

                    y0, y1, text = block[1], block[3], block[4]

                    # The first block of a page has no predecessor and
                    # therefore never opens a new paragraph by itself.
                    if previous_bottom is None:
                        vertical_spacing = 0
                    else:
                        vertical_spacing = y0 - previous_bottom

                    if vertical_spacing > spacing_threshold:
                        _append_nonblank(paragraphs, current_paragraph)
                        current_paragraph = text
                    else:
                        current_paragraph += " " + text

                    previous_bottom = y1

                # Flush whatever is left at the end of the page.
                _append_nonblank(paragraphs, current_paragraph)
    except Exception as e:
        # Deliberate best effort: report the problem and return what we have.
        print(f"Erreur lors de l'extraction du PDF : {str(e)}")

    return paragraphs

# Bounds for the summary length requested from the summarizer.
_MIN_SUMMARY_LENGTH = 10
# NOTE(review): 1024 is assumed to be the position limit of
# facebook/bart-large-cnn -- confirm against the model configuration.
_MAX_SUMMARY_LENGTH = 1024
# Width (in characters) at which summary text is wrapped on a slide.
_SUMMARY_WRAP_WIDTH = 80
# Side length of the square illustration and its distance to the slide bottom.
_IMAGE_SIDE = Inches(3)
_IMAGE_BOTTOM_MARGIN = Inches(0.5)


def _add_summary_slide(prs, summary_text, image_bytes):
    """Append a slide with *summary_text* at the top and the image below it.

    Args:
        prs: The presentation the slide is added to.
        summary_text: Already line-wrapped text shown in the text box.
        image_bytes: Encoded image data placed centered at the bottom.
    """
    slide = prs.slides.add_slide(prs.slide_layouts[5])

    # Integer division keeps the offset an integral EMU value.
    left = (prs.slide_width - _IMAGE_SIDE) // 2
    top = prs.slide_height - _IMAGE_SIDE - _IMAGE_BOTTOM_MARGIN
    # The picture is passed as an in-memory stream, so no temporary file has
    # to be created (and later cleaned up) for every slide.
    slide.shapes.add_picture(
        io.BytesIO(image_bytes), left, top, _IMAGE_SIDE, _IMAGE_SIDE
    )

    text_box = slide.shapes.add_textbox(
        Inches(1), Inches(1), Inches(8), Inches(2)
    )
    # A new text frame already contains one paragraph; reuse it instead of
    # adding a second one, which would leave an empty first line.
    text_paragraph = text_box.text_frame.paragraphs[0]
    text_paragraph.text = summary_text
    text_paragraph.space_after = Pt(0)


st.title("PDF2SLIDE")

uploaded_file = st.file_uploader("Select a PDF", type=["pdf"])

if uploaded_file is not None:
    pdf_data = uploaded_file.read()
    paragraphs = extract_paragraphs_by_vertical_spacing(pdf_data)

    if not paragraphs:
        st.warning("No text could be extracted from this PDF.")
    else:
        # Create a PowerPoint presentation with one slide per paragraph.
        prs = Presentation()

        for paragraph in paragraphs:
            # Half the character count is only a heuristic for the target
            # length. It must be an int, stay above min_length and below the
            # model limit.
            max_length = min(
                _MAX_SUMMARY_LENGTH,
                max(_MIN_SUMMARY_LENGTH + 1, len(paragraph) // 2),
            )
            summary = summarizer(
                paragraph,
                max_length=max_length,
                min_length=_MIN_SUMMARY_LENGTH,
                do_sample=False,
                truncation=True,  # overly long paragraphs are cut, not fatal
            )
            raw_summary = summary[0]['summary_text']

            # Generate an illustration for the summary.
            image_bytes = query({
                "inputs": 'A picture without text about: ' + raw_summary
            })

            _add_summary_slide(
                prs,
                add_line_breaks_to_summary(raw_summary, _SUMMARY_WRAP_WIDTH),
                image_bytes,
            )

        # Serialize the presentation in memory; nothing is left on disk and
        # no file handle stays open.
        presentation_buffer = io.BytesIO()
        prs.save(presentation_buffer)

        # Display a download button for the PowerPoint file
        st.download_button(
            label="Download PowerPoint Presentation",
            data=presentation_buffer.getvalue(),
            key="download_ppt",
            file_name="PDF2SLIDE_Presentation.pptx",
            mime="application/vnd.openxmlformats-officedocument.presentationml.presentation",
        )