"""Streamlit app that turns an uploaded PDF into a PowerPoint presentation.

The PDF is split into paragraphs based on the vertical gap between text
blocks. Each paragraph is summarized with a BART summarization pipeline,
illustrated with an image requested from the Hugging Face Inference API,
and placed on its own slide. The finished deck is offered for download.
"""

import io
import os
import re
import tempfile

import fitz
import requests
import streamlit as st
from PIL import Image
from pptx import Presentation
from pptx.util import Inches, Pt
from PyPDF2 import PdfReader
from transformers import pipeline

API_URL = "https://api-inference.huggingface.co/models/runwayml/stable-diffusion-v1-5"

# SECURITY: an API token is committed in this file. It should be revoked and
# supplied through the HF_API_TOKEN environment variable instead. The literal
# fallback is kept only so that existing deployments keep working unchanged.
_HF_API_TOKEN = os.environ.get(
    "HF_API_TOKEN", "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"
)
headers = {"Authorization": f"Bearer {_HF_API_TOKEN}"}

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Summary tuning knobs used when building slides.
SUMMARY_MIN_LENGTH = 10
SUMMARY_LINE_LENGTH = 80

# Edge length of the (square) picture and its distance from the slide bottom.
IMAGE_SIZE = Inches(3)
IMAGE_BOTTOM_MARGIN = Inches(0.5)


def query(payload, timeout=120):
    """Send *payload* to the image-generation endpoint and return the body.

    Args:
        payload: JSON-serializable request body (e.g. ``{"inputs": "..."}``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The raw response body as bytes.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status, so an error body is never mistaken for
            image data.
    """
    response = requests.post(
        API_URL, headers=headers, json=payload, timeout=timeout
    )
    response.raise_for_status()
    return response.content


def add_line_breaks_to_summary(summary_text, line_length):
    """Wrap *summary_text* at word boundaries.

    Words are never split; a word longer than *line_length* is placed on a
    line of its own. Runs of whitespace in the input collapse to single
    spaces.

    Args:
        summary_text: Text to wrap.
        line_length: Maximum number of characters per line.

    Returns:
        The wrapped text, with lines joined by ``"\\n"``.
    """
    lines = []
    current_line = ""
    for word in summary_text.split():
        if not current_line:
            # First word of a line: no separator to account for, and never
            # emit an empty line even if the word alone exceeds the limit.
            current_line = word
        elif len(current_line) + 1 + len(word) <= line_length:
            current_line += " " + word
        else:
            lines.append(current_line)
            current_line = word
    if current_line:
        lines.append(current_line)
    return "\n".join(lines)


def extract_paragraphs_by_vertical_spacing(pdf_data, spacing_threshold=10):
    """Split the text of a PDF into paragraphs using vertical gaps.

    Consecutive text blocks on a page are merged into one paragraph until the
    gap between the bottom of one block and the top of the next exceeds
    *spacing_threshold*. Paragraphs never span pages.

    Args:
        pdf_data: The PDF file content as bytes.
        spacing_threshold: Minimum vertical gap that starts a new paragraph.

    Returns:
        A list of non-empty paragraph strings. If the PDF cannot be
        processed, the error is printed and the paragraphs collected so far
        (possibly none) are returned.
    """
    paragraphs = []
    try:
        pdf_stream = io.BytesIO(pdf_data)
        # The context manager closes the document even if extraction fails.
        with fitz.open(stream=pdf_stream, filetype="pdf") as pdf_document:
            for page in pdf_document:
                current_paragraph = ""
                previous_bottom = None
                for block in page.get_text("blocks"):
                    # PyMuPDF block tuples are documented as
                    # (x0, y0, x1, y1, text, block_no, block_type); a non-zero
                    # block_type marks an image block whose "text" is only a
                    # placeholder, so it is skipped.
                    if len(block) > 6 and block[6] != 0:
                        continue
                    _x0, y0, _x1, y1, text = block[:5]
                    if previous_bottom is None:
                        vertical_spacing = 0
                    else:
                        vertical_spacing = y0 - previous_bottom
                    if vertical_spacing > spacing_threshold:
                        if current_paragraph.strip():
                            paragraphs.append(current_paragraph.strip())
                        current_paragraph = text
                    else:
                        current_paragraph += " " + text
                    previous_bottom = y1
                # Flush whatever is left at the end of the page.
                if current_paragraph.strip():
                    paragraphs.append(current_paragraph.strip())
    except Exception as e:
        # Best-effort extraction: report the problem and return what we have.
        print(f"Erreur lors de l'extraction du PDF : {str(e)}")
    return paragraphs


def _summarize(paragraph):
    """Return the summarizer's summary text for *paragraph*."""
    # len(paragraph) counts characters, not tokens; it is kept as the original
    # rough heuristic but made an integer that is never below min_length.
    max_length = max(SUMMARY_MIN_LENGTH, len(paragraph) // 2)
    result = summarizer(
        paragraph,
        max_length=max_length,
        min_length=SUMMARY_MIN_LENGTH,
        do_sample=False,
    )
    return result[0]['summary_text']


def _add_summary_slide(prs, summary_text, image_bytes):
    """Append a slide with *summary_text* on top and an optional picture below."""
    slide = prs.slides.add_slide(prs.slide_layouts[5])

    if image_bytes:
        # Centered horizontally, sitting IMAGE_BOTTOM_MARGIN above the bottom.
        left = (prs.slide_width - IMAGE_SIZE) // 2
        top = prs.slide_height - IMAGE_SIZE - IMAGE_BOTTOM_MARGIN
        # A file-like object avoids leaving temporary image files on disk.
        slide.shapes.add_picture(
            io.BytesIO(image_bytes), left, top, IMAGE_SIZE, IMAGE_SIZE
        )

    text_box = slide.shapes.add_textbox(
        Inches(1), Inches(1), Inches(8), Inches(2)
    )
    paragraph = text_box.text_frame.add_paragraph()
    paragraph.text = summary_text
    paragraph.space_after = Pt(0)


def _build_presentation(paragraphs):
    """Build one slide per paragraph and return the .pptx file as bytes."""
    prs = Presentation()
    for paragraph in paragraphs:
        if not paragraph.strip():
            continue
        summary = _summarize(paragraph)
        wrapped_summary = add_line_breaks_to_summary(
            summary, SUMMARY_LINE_LENGTH
        )
        try:
            image_bytes = query({
                "inputs": 'A picture without text about: ' + summary
            })
        except requests.RequestException as exc:
            # Keep the slide (text only) rather than aborting the whole deck.
            st.warning(f"Image generation failed, slide added without image: {exc}")
            image_bytes = None
        _add_summary_slide(prs, wrapped_summary, image_bytes)

    # Serialize in memory so no temporary file or open handle is left behind.
    buffer = io.BytesIO()
    prs.save(buffer)
    return buffer.getvalue()


st.title("PDF2SLIDE")

uploaded_file = st.file_uploader("Select a PDF", type=["pdf"])

if uploaded_file is not None:
    pdf_data = uploaded_file.read()
    paragraphs = extract_paragraphs_by_vertical_spacing(pdf_data)
    presentation_bytes = _build_presentation(paragraphs)

    # Display a download button for the PowerPoint file
    st.download_button(
        label="Download PowerPoint Presentation",
        data=presentation_bytes,
        key="download_ppt",
        file_name="PDF2SLIDE_Presentation.pptx",
    )