# Source of a Hugging Face Space (viewer status at capture time: "Runtime error"; file size: 3,045 bytes).
from PyPDF2 import PdfReader
import re
import streamlit as st
import fitz
from transformers import pipeline
import os
import requests
import io
from PIL import Image
# Hugging Face Inference API endpoint used by query() to turn a text prompt into an image.
API_URL = "https://api-inference.huggingface.co/models/runwayml/stable-diffusion-v1-5"
# The access token is read from the HF_TOKEN environment variable instead of being
# committed to the repository. When it is not set, no Authorization header is sent.
# NOTE(review): the token that was previously hard-coded here has been published and
# should be revoked.
_hf_token = os.environ.get("HF_TOKEN")
headers = {"Authorization": f"Bearer {_hf_token}"} if _hf_token else {}
# Summarization pipeline shared by the whole script; created once at import time.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
def query(payload, timeout=120):
    """Send ``payload`` to the inference endpoint and return the raw response body.

    Args:
        payload: JSON-serialisable request body; the caller in this file sends
            ``{"inputs": <prompt text>}``.
        timeout: Seconds to wait for the server before giving up, so a stalled
            request cannot block the app indefinitely.

    Returns:
        The response body as bytes (the caller decodes it as an image).

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    # Fail here on an error status rather than returning an error payload that
    # the caller would then try to decode as an image.
    response.raise_for_status()
    return response.content
def extract_paragraphs_by_vertical_spacing(pdf_data, spacing_threshold=10):
    """Split a PDF into paragraphs using the vertical gap between text blocks.

    Each page is processed independently: consecutive blocks are merged into
    one paragraph until the gap between the bottom of one block and the top of
    the next exceeds ``spacing_threshold``.

    Args:
        pdf_data: Raw bytes of the PDF file.
        spacing_threshold: Vertical gap (in the page coordinates reported by
            ``page.get_text("blocks")``) above which a new paragraph starts.

    Returns:
        A list of non-empty, stripped paragraph strings in reading order.
        Extraction is best-effort: if an error occurs, it is printed and the
        paragraphs collected so far (possibly none) are returned.
    """
    paragraphs = []
    try:
        pdf_document = fitz.open(stream=io.BytesIO(pdf_data), filetype="pdf")
        try:
            for page_number in range(pdf_document.page_count):
                page = pdf_document.load_page(page_number)
                paragraphs.extend(
                    _paragraphs_from_blocks(page.get_text("blocks"), spacing_threshold)
                )
        finally:
            # Release the document even when a page fails part-way through.
            pdf_document.close()
    except Exception as e:
        print(f"Erreur lors de l'extraction du PDF : {str(e)}")
    return paragraphs


def _paragraphs_from_blocks(blocks, spacing_threshold):
    """Group one page's text blocks into paragraphs separated by large vertical gaps."""
    paragraphs = []
    current_parts = []
    previous_bottom = None

    def flush():
        # Whitespace-only content is dropped so callers never receive "".
        paragraph = " ".join(current_parts).strip()
        if paragraph:
            paragraphs.append(paragraph)

    for block in blocks:
        # Each block starts with (x0, y0, x1, y1, text); only the vertical
        # extent and the text are needed here.
        top, bottom, text = block[1], block[3], block[4]
        # The first block of a page has no predecessor, hence no gap.
        gap = 0 if previous_bottom is None else top - previous_bottom
        if gap > spacing_threshold:
            flush()
            current_parts = [text]
        else:
            current_parts.append(text)
        previous_bottom = bottom

    # Emit the last paragraph of the page.
    flush()
    return paragraphs
#def extract_paragraph(texte):
# paragraph = texte.split("\n\n")
# return paragraph
# Streamlit page: upload a PDF, then show a summary and a generated image
# for each paragraph extracted from it.
st.title("PDF2SLIDE")
uploaded_file = st.file_uploader("Selectionnez un PDF", type=["pdf"])
if uploaded_file is not None:
    pdf_data = uploaded_file.read()
    # Blank paragraphs are dropped up front so the numbering shown to the user
    # has no holes and the summarizer is never called on empty text.
    paragraphs = [
        p for p in extract_paragraphs_by_vertical_spacing(pdf_data) if p.strip()
    ]
    min_summary_length = 10
    for i, paragraph in enumerate(paragraphs, start=1):
        # Target roughly half the paragraph length, as an integer (the original
        # expression produced a float) and never below the minimum length.
        max_summary_length = max(min_summary_length + 1, len(paragraph) // 2)
        summary = summarizer(
            paragraph,
            max_length=max_summary_length,
            min_length=min_summary_length,
            do_sample=False,
            # Truncate paragraphs that exceed the model's input limit instead
            # of letting the pipeline fail on them.
            truncation=True,
        )
        summary_text = summary[0]['summary_text']
        st.text(f"Paragraphe {i}: {summary_text}")  # Show the paragraph summary
        try:
            # The summary text is used as the image-generation prompt.
            image_bytes = query({"inputs": summary_text})
            image = Image.open(io.BytesIO(image_bytes))
        except (requests.RequestException, OSError) as e:
            # A failed or non-image response for one paragraph must not take
            # down the whole page; report it and move on to the next one.
            st.warning(f"Paragraphe {i}: impossible de générer l'image ({e})")
            continue
        st.image(image)
|