PDF2SLIDE / app.py
newoz's picture
Update app.py
47c535b
from PyPDF2 import PdfReader
import re
import streamlit as st
import fitz
from transformers import pipeline
import os
import requests
import io
from PIL import Image
from pptx import Presentation
from pptx.util import Inches
import tempfile
# Hosted Stable Diffusion endpoint used to generate one illustration per slide.
API_URL = "https://api-inference.huggingface.co/models/runwayml/stable-diffusion-v1-5"
# The access token is read from the environment so that no credential lives in
# the source; set HF_API_TOKEN in the deployment's secret/variable settings.
# NOTE(review): the token that used to be hard-coded here has been published
# with the file and should be revoked.
_hf_api_token = os.environ.get("HF_API_TOKEN", "")
# Only send an Authorization header when a token is actually configured.
headers = {"Authorization": f"Bearer {_hf_api_token}"} if _hf_api_token else {}
# Summarization pipeline shared by every paragraph processed below.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
def query(payload, timeout=120):
    """Send *payload* to the image-generation endpoint and return the raw body.

    Args:
        payload: JSON-serializable request body, e.g. ``{"inputs": "<prompt>"}``.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the app forever.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    response = requests.post(
        API_URL, headers=headers, json=payload, timeout=timeout
    )
    # Without this check the body of an error response would be returned to
    # the caller as if it were image data.
    response.raise_for_status()
    return response.content
def _group_blocks_into_paragraphs(blocks, spacing_threshold):
    """Merge one page's text blocks into paragraphs based on vertical gaps."""
    paragraphs = []
    current_paragraph = ""
    previous_bottom = None

    def flush():
        # Emit the paragraph being built, ignoring whitespace-only content.
        stripped = current_paragraph.strip()
        if stripped:
            paragraphs.append(stripped)

    for block in blocks:
        # PyMuPDF block tuples end with a block type; anything other than 0
        # is not text (e.g. an image placeholder) and must not be merged in.
        if len(block) > 6 and block[6] != 0:
            continue
        y0, y1 = block[1], block[3]  # top and bottom edge of the block
        text = block[4]

        # Vertical distance between this block and the previous one.
        vertical_spacing = 0 if previous_bottom is None else y0 - previous_bottom

        if vertical_spacing > spacing_threshold:
            # A gap above the threshold starts a new paragraph.
            flush()
            current_paragraph = text
        else:
            current_paragraph += " " + text
        previous_bottom = y1

    # Emit the last paragraph of the page.
    flush()
    return paragraphs


def extract_paragraphs_by_vertical_spacing(pdf_data, spacing_threshold=10):
    """Split the text of a PDF into paragraphs using vertical whitespace.

    Each page is processed independently: consecutive text blocks are joined
    with a space until the gap between the bottom of one block and the top of
    the next exceeds ``spacing_threshold``, at which point a new paragraph
    starts. Paragraphs never span a page boundary.

    Args:
        pdf_data: Raw bytes of the PDF file.
        spacing_threshold: Minimum vertical gap, in page coordinate units,
            that separates two paragraphs.

    Returns:
        A list of stripped, non-empty paragraph strings in reading order.
        Extraction is best-effort: on error a message is printed and the
        paragraphs from pages fully processed so far (possibly none) are
        returned.
    """
    paragraphs = []
    pdf_document = None
    try:
        pdf_document = fitz.open(stream=io.BytesIO(pdf_data), filetype="pdf")
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            paragraphs.extend(
                _group_blocks_into_paragraphs(
                    page.get_text("blocks"), spacing_threshold
                )
            )
    except Exception as e:
        print(f"Erreur lors de l'extraction du PDF : {str(e)}")
    finally:
        # Release the document even when extraction fails part-way through.
        if pdf_document is not None:
            pdf_document.close()
    return paragraphs
# Streamlit page: upload a PDF, build one slide per extracted paragraph
# (summary text plus a generated illustration) and offer the deck for download.

# Shortest summary the model is asked to produce.
SUMMARY_MIN_LENGTH = 10
# Pixel size every generated image is resized to before it is embedded.
IMAGE_SIZE = (800, 600)

st.title("PDF2SLIDE")
uploaded_file = st.file_uploader("Select a PDF", type=["pdf"])
if uploaded_file is not None:
    pdf_data = uploaded_file.read()
    # Blank entries would only produce empty slides and meaningless prompts.
    paragraphs = [
        text
        for text in extract_paragraphs_by_vertical_spacing(pdf_data)
        if text.strip()
    ]

    if not paragraphs:
        st.warning("No text paragraphs could be extracted from this PDF.")
    else:
        # Create a PowerPoint presentation
        prs = Presentation()
        for i, paragraph in enumerate(paragraphs, start=1):
            # max_length must be an integer and must stay above min_length,
            # otherwise very short paragraphs make the request invalid.
            summary_max_length = max(SUMMARY_MIN_LENGTH + 1, len(paragraph) // 2)
            summary = summarizer(
                paragraph,
                max_length=summary_max_length,
                min_length=SUMMARY_MIN_LENGTH,
                do_sample=False,
                # Cut inputs that exceed the model's maximum input size
                # instead of failing on them.
                truncation=True,
            )
            summary_text = summary[0]['summary_text']

            # Create a slide
            slide = prs.slides.add_slide(prs.slide_layouts[5])

            # Add the summary to the slide
            left = top = Inches(1)
            width = height = Inches(5)
            txBox = slide.shapes.add_textbox(left, top, width, height)
            tf = txBox.text_frame
            p = tf.add_paragraph()
            p.text = f"Paragraph {i}:"
            p.space_after = Inches(0.1)
            p = tf.add_paragraph()
            p.text = summary_text

            # Generate the illustration. A failure here (network error, error
            # response, undecodable image) should cost only the picture, not
            # the whole presentation.
            try:
                image_bytes = query({
                    "inputs": 'A picture without text about: ' + summary_text
                })
                image = Image.open(io.BytesIO(image_bytes))
                # Image.ANTIALIAS no longer exists in current Pillow releases;
                # LANCZOS is the same filter under its current name.
                image = image.resize(IMAGE_SIZE, Image.LANCZOS)
                # Keep the PNG in memory so no temporary file is left behind.
                image_stream = io.BytesIO()
                image.save(image_stream, format="PNG")
            except (requests.RequestException, OSError) as exc:
                st.warning(
                    f"Paragraph {i}: image generation failed ({exc}); "
                    "the slide was created without a picture."
                )
            else:
                image_stream.seek(0)
                # Add the image to the slide
                slide.shapes.add_picture(
                    image_stream, Inches(1), Inches(2), width, height
                )

        # Serialize the presentation in memory rather than to a temp file
        # that would never be deleted.
        pptx_buffer = io.BytesIO()
        prs.save(pptx_buffer)

        # Display a download button for the PowerPoint file
        st.download_button(
            label="Download PowerPoint Presentation",
            data=pptx_buffer.getvalue(),
            key="download_ppt",
            file_name="PDF2SLIDE_Presentation.pptx",
            mime="application/vnd.openxmlformats-officedocument.presentationml.presentation",
        )