ngupta949 committed on
Commit
ffb9e96
·
verified ·
1 Parent(s): 74c3cb8

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +21 -0
  2. requirements.txt +3 -0
  3. summarizer.py +22 -0
app.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from summarizer import extract_text_from_pdf, summarize_text
3
+
4
def summarize_pdf(file):
    """Summarize an uploaded PDF; this is the handler wired into the UI.

    Args:
        file: The uploaded file as handed over by the upload component. It is
            forwarded unchanged to ``extract_text_from_pdf``. ``None`` means
            nothing was uploaded.

    Returns:
        The summary text, or a human-readable message when no file was given
        or processing failed. This function never raises.
    """
    # Submitting the form without an upload yields None; answer with a clear
    # prompt instead of surfacing an opaque downstream exception.
    if file is None:
        return "Please upload a PDF file."

    try:
        text = extract_text_from_pdf(file)
        return summarize_text(text)
    except Exception as e:  # UI boundary: report the failure, don't crash
        return f"Error processing file: {e}"
11
+
12
# Components are created up front so the Interface call reads as plain wiring.
_pdf_upload = gr.File(label="Upload PDF")
_summary_box = gr.Textbox(label="Summary")

# One-input / one-output UI around summarize_pdf.
iface = gr.Interface(
    fn=summarize_pdf,
    inputs=_pdf_upload,
    outputs=_summary_box,
    title="PDF Document Summarizer",
    description="Upload a PDF file and get a machine-generated summary.",
)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ transformers
3
+ PyPDF2
summarizer.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import PyPDF2
3
+
4
# Build the summarization pipeline once at import time; summarize_text
# reuses this single instance for every request.
summarizer_pipeline = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)
6
+
7
def _read_pdf_text(stream):
    """Return the text of every page readable from *stream*, one page per line."""
    reader = PyPDF2.PdfReader(stream)
    # extract_text() may yield None/"" for pages without a text layer.
    page_texts = (page.extract_text() or "" for page in reader.pages)
    # Join with a newline so the last word of one page is not glued to the
    # first word of the next; pages with no text are skipped.
    return "\n".join(chunk for chunk in page_texts if chunk).strip()


def extract_text_from_pdf(file):
    """Extract the plain text of a PDF.

    Args:
        file: Either a filesystem path (``str`` or ``os.PathLike``) or an
            already opened binary file-like object.

    Returns:
        The concatenated page text, stripped of surrounding whitespace. The
        result is an empty string when no page yields any text.

    Raises:
        ValueError: If ``file`` is ``None``.
        Exception: Whatever ``open`` or PyPDF2 raises for unreadable input.
    """
    import os  # local import: only needed for the PathLike check

    if file is None:
        raise ValueError("No PDF file was provided.")

    if isinstance(file, (str, os.PathLike)):
        # The text must be read while the handle is still open.
        with open(file, "rb") as handle:
            return _read_pdf_text(handle)
    return _read_pdf_text(file)
17
+
18
def summarize_text(text, max_length=130, min_length=30, max_chars=3000):
    """Summarize *text* with the module-level summarization pipeline.

    Args:
        text: The text to summarize. ``None``, empty, or whitespace-only
            input is treated as "nothing to summarize".
        max_length: Upper bound on summary length, passed to the pipeline.
        min_length: Lower bound on summary length, passed to the pipeline.
        max_chars: Only the first ``max_chars`` characters of ``text`` are
            sent to the model, which keeps the request small.

    Returns:
        The generated summary, or a fixed message when there is no usable
        text.
    """
    if not text or not text.strip():
        return "No valid text found in the PDF."

    result = summarizer_pipeline(
        text[:max_chars],
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # A character cap does not bound the token count; let the tokenizer
        # cut anything that still exceeds the model's input limit.
        truncation=True,
    )
    return result[0]["summary_text"]