File size: 3,045 Bytes
df3a05c
56bfb5c
c9787b4
6850f84
c9787b4
 
 
 
 
 
 
 
 
 
7726619
 
 
 
2f7c029
6850f84
 
 
2f7c029
 
6850f84
 
 
 
 
 
 
 
 
7d4e8c6
6850f84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c9787b4
 
 
 
 
 
 
 
a11cfb1
50a6f7b
 
2f7c029
7726619
6850f84
d41e885
 
f150a6c
498d518
7726619
f150a6c
7726619
 
 
498d518
 
7726619
c9787b4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from PyPDF2 import PdfReader
import re
import streamlit as st
import fitz
from transformers import pipeline
import os
import requests
import io
from PIL import Image

# Hosted inference endpoint used to turn a text prompt into an image.
API_URL = "https://api-inference.huggingface.co/models/runwayml/stable-diffusion-v1-5"

# SECURITY: the access token must never live in source control. It is read
# from the HF_TOKEN environment variable instead; any token that was
# previously committed here should be revoked. When the variable is unset,
# no Authorization header is sent rather than sending an empty bearer token.
_hf_token = os.environ.get("HF_TOKEN")
headers = {"Authorization": f"Bearer {_hf_token}"} if _hf_token else {}

# Summarization pipeline used by the script body below to condense each
# extracted paragraph.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def query(payload, timeout=120):
    """Post *payload* as JSON to ``API_URL`` and return the raw response body.

    Args:
        payload: JSON-serializable object sent as the request body.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely.

    Returns:
        The response body as bytes.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx HTTP status.
    """
    response = requests.post(
        API_URL, headers=headers, json=payload, timeout=timeout
    )
    # Fail loudly on HTTP errors instead of handing an error body back to a
    # caller that expects the bytes of a successful response.
    response.raise_for_status()
    return response.content

def _group_blocks_into_paragraphs(blocks, spacing_threshold):
    """Merge one page's text blocks into paragraphs using vertical gaps."""
    paragraphs = []
    parts = []
    previous_bottom = None

    def flush():
        # Drop paragraphs that are empty once surrounding whitespace is removed.
        paragraph = " ".join(parts).strip()
        if paragraph:
            paragraphs.append(paragraph)

    for block in blocks:
        # Block tuples are laid out as (x0, y0, x1, y1, text, ...). When a
        # seventh field is present it is treated as the block type, with 0
        # meaning text (per the PyMuPDF docs - confirm for the installed
        # version); other blocks carry placeholder text and are skipped.
        if len(block) > 6 and block[6] != 0:
            continue

        top, bottom, text = block[1], block[3], block[4]

        # Vertical gap between this block and the previous one on the page.
        vertical_spacing = 0 if previous_bottom is None else top - previous_bottom

        if vertical_spacing > spacing_threshold:
            # A gap above the threshold starts a new paragraph.
            flush()
            parts = [text]
        else:
            parts.append(text)

        previous_bottom = bottom

    # Emit the last paragraph of the page.
    flush()
    return paragraphs


def extract_paragraphs_by_vertical_spacing(pdf_data, spacing_threshold=10):
    """Extract paragraphs from a PDF by grouping vertically close text blocks.

    Each page is processed independently: consecutive blocks are joined with
    a space until the gap between the bottom of one block and the top of the
    next exceeds ``spacing_threshold``, which starts a new paragraph.

    Args:
        pdf_data: Raw bytes of the PDF file.
        spacing_threshold: Vertical gap, in the page's coordinate units,
            above which a block starts a new paragraph.

    Returns:
        A list of stripped, non-empty paragraph strings. Extraction is
        best-effort: if an error occurs, it is printed and the paragraphs
        collected so far (possibly none) are returned.
    """
    paragraphs = []

    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(stream=io.BytesIO(pdf_data), filetype="pdf") as pdf_document:
            for page in pdf_document:
                paragraphs.extend(
                    _group_blocks_into_paragraphs(
                        page.get_text("blocks"), spacing_threshold
                    )
                )
    except Exception as e:
        # Deliberately broad: a malformed upload must not crash the app.
        print(f"Erreur lors de l'extraction du PDF : {str(e)}")

    return paragraphs

#def extract_paragraph(texte):
#	paragraph = texte.split("\n\n")
#	return paragraph

# Streamlit page: upload a PDF, summarize each extracted paragraph, and show
# an image generated from each summary.
st.title("PDF2SLIDE")

uploaded_file = st.file_uploader("Selectionnez un PDF", type=["pdf"])

if uploaded_file is not None:
    pdf_data = uploaded_file.read()

    # Skip empty paragraphs: there is nothing to summarize in them.
    paragraphs = [
        paragraph
        for paragraph in extract_paragraphs_by_vertical_spacing(pdf_data)
        if paragraph
    ]

    min_summary_length = 10

    for i, paragraph in enumerate(paragraphs, start=1):
        # max_length must be an integer and must not fall below min_length,
        # which the original float division allowed for short paragraphs.
        max_summary_length = max(min_summary_length, len(paragraph) // 2)

        summary = summarizer(
            paragraph,
            max_length=max_summary_length,
            min_length=min_summary_length,
            do_sample=False,
            # Truncate inputs longer than the model accepts instead of failing.
            truncation=True,
        )
        summary_text = summary[0]['summary_text']
        st.text(f"Paragraphe {i}: {summary_text}")  # Show the paragraph summary

        try:
            # Use the summary text as the image-generation prompt.
            image_bytes = query({"inputs": summary_text})
            image = Image.open(io.BytesIO(image_bytes))
        except (requests.RequestException, OSError) as error:
            # A failed request or a non-image response (PIL raises an OSError
            # subclass) must not abort the remaining paragraphs.
            st.warning(
                f"Impossible de générer l'image pour le paragraphe {i} : {error}"
            )
            continue

        st.image(image)