Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -17,24 +17,32 @@ def pdf_to_text(pdf_file):
|
|
| 17 |
text += page_text + "\n"
|
| 18 |
except Exception as e:
|
| 19 |
return f"Error reading PDF: {str(e)}"
|
| 20 |
-
return text
|
| 21 |
|
| 22 |
def summarize_pdf(pdf_file):
|
| 23 |
"""Summarize the content of a PDF file."""
|
| 24 |
text = pdf_to_text(pdf_file)
|
| 25 |
-
if len(text
|
| 26 |
return "No text found in the PDF."
|
| 27 |
-
|
| 28 |
# Check if the text is too short for summarization
|
| 29 |
-
if len(text) < 50: #
|
| 30 |
return "The text extracted is too short for summarization."
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
# Attempt to summarize the text
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
# Create a Gradio interface
|
| 40 |
interface = gr.Interface(
|
|
|
|
| 17 |
text += page_text + "\n"
|
| 18 |
except Exception as e:
|
| 19 |
return f"Error reading PDF: {str(e)}"
|
| 20 |
+
return text.strip()
|
| 21 |
|
| 22 |
def summarize_pdf(pdf_file):
|
| 23 |
"""Summarize the content of a PDF file."""
|
| 24 |
text = pdf_to_text(pdf_file)
|
| 25 |
+
if len(text) == 0:
|
| 26 |
return "No text found in the PDF."
|
| 27 |
+
|
| 28 |
# Check if the text is too short for summarization
|
| 29 |
+
if len(text) < 50: # Adjust this threshold if necessary
|
| 30 |
return "The text extracted is too short for summarization."
|
| 31 |
|
| 32 |
+
# Split text if it's too long
|
| 33 |
+
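max_input_length = 1024 # Chunk size in characters (the slicing below is per character); BART's 1024 limit is in tokens, so this is not the same as the model's token limit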
|
| 34 |
+
text_chunks = [text[i:i + max_input_length] for i in range(0, len(text), max_input_length)]
|
| 35 |
+
|
| 36 |
# Attempt to summarize the text
|
| 37 |
+
summaries = []
|
| 38 |
+
for chunk in text_chunks:
|
| 39 |
+
try:
|
| 40 |
+
summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
|
| 41 |
+
summaries.append(summary[0]['summary_text'])
|
| 42 |
+
except Exception as e:
|
| 43 |
+
return f"Error summarizing text: {str(e)}"
|
| 44 |
+
|
| 45 |
+
return "\n\n".join(summaries) # Join summaries from chunks
|
| 46 |
|
| 47 |
# Create a Gradio interface
|
| 48 |
interface = gr.Interface(
|