akazmi committed on
Commit
14fdcd7
·
verified ·
1 Parent(s): f4613f8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -9
app.py CHANGED
@@ -17,24 +17,32 @@ def pdf_to_text(pdf_file):
17
  text += page_text + "\n"
18
  except Exception as e:
19
  return f"Error reading PDF: {str(e)}"
20
- return text
21
 
22
  def summarize_pdf(pdf_file):
23
  """Summarize the content of a PDF file."""
24
  text = pdf_to_text(pdf_file)
25
- if len(text.strip()) == 0:
26
  return "No text found in the PDF."
27
-
28
  # Check if the text is too short for summarization
29
- if len(text) < 50: # You can adjust this threshold
30
  return "The text extracted is too short for summarization."
31
 
 
 
 
 
32
  # Attempt to summarize the text
33
- try:
34
- summary = summarizer(text, max_length=130, min_length=30, do_sample=False)
35
- return summary[0]['summary_text']
36
- except Exception as e:
37
- return f"Error summarizing text: {str(e)}"
 
 
 
 
38
 
39
  # Create a Gradio interface
40
  interface = gr.Interface(
 
17
  text += page_text + "\n"
18
  except Exception as e:
19
  return f"Error reading PDF: {str(e)}"
20
+ return text.strip()
21
 
22
  def summarize_pdf(pdf_file):
23
  """Summarize the content of a PDF file."""
24
  text = pdf_to_text(pdf_file)
25
+ if len(text) == 0:
26
  return "No text found in the PDF."
27
+
28
  # Check if the text is too short for summarization
29
+ if len(text) < 50: # Adjust this threshold if necessary
30
  return "The text extracted is too short for summarization."
31
 
32
+ # Split text if it's too long
33
+ max_input_length = 1024 # BART's maximum token length
34
+ text_chunks = [text[i:i + max_input_length] for i in range(0, len(text), max_input_length)]
35
+
36
  # Attempt to summarize the text
37
+ summaries = []
38
+ for chunk in text_chunks:
39
+ try:
40
+ summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
41
+ summaries.append(summary[0]['summary_text'])
42
+ except Exception as e:
43
+ return f"Error summarizing text: {str(e)}"
44
+
45
+ return "\n\n".join(summaries) # Join summaries from chunks
46
 
47
  # Create a Gradio interface
48
  interface = gr.Interface(