Aroy1997 committed on
Commit
ca4ec2e
·
verified ·
1 Parent(s): 737fcfe

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +199 -0
app.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # KEEPING YOUR ORIGINAL IMPORTS
2
+ import gradio as gr
3
+ import PyPDF2
4
+ import io
5
+ from transformers import pipeline, AutoTokenizer
6
+ import torch
7
+ import re
8
+ from typing import List, Tuple
9
+ import warnings
10
+ warnings.filterwarnings("ignore")
11
+
12
# QUESTION-ANSWERING ADDITION
# Extractive question-answering pipeline, constructed once when the module is
# imported and reused by answer_question_interface() for every question.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
14
+
15
+ # === YOUR ORIGINAL SUMMARIZER CLASS ===
16
class PDFSummarizer:
    """Extract the text of a PDF and summarize it with a seq2seq model.

    The workflow implemented by :meth:`process_pdf` is: extract text page by
    page, clean it, split it into word-bounded chunks, summarize each chunk,
    and (when there are more than two chunks) summarize the concatenated
    chunk summaries once more.
    """

    def __init__(self):
        """Load the summarization pipeline and its tokenizer.

        The distilled BART checkpoint is tried first (on GPU in float16 when
        CUDA is available). If loading fails for any reason, the full
        ``facebook/bart-large-cnn`` checkpoint is loaded on CPU instead.
        """
        self.model_name = "sshleifer/distilbart-cnn-12-6"
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        try:
            self.summarizer = pipeline(
                "summarization",
                model=self.model_name,
                device=0 if self.device == "cuda" else -1,
                framework="pt",
                model_kwargs={"torch_dtype": torch.float16 if self.device == "cuda" else torch.float32}
            )
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Model loaded successfully")
        except Exception as e:
            # Deliberate best-effort: any load failure falls back to a CPU model.
            print(f"Error loading model: {e}")
            self.model_name = "facebook/bart-large-cnn"
            self.summarizer = pipeline("summarization", model=self.model_name, device=-1)
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Fallback model loaded")

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Return the text of every non-empty page, each preceded by a page marker.

        Args:
            pdf_file: Raw bytes of the PDF document.

        Returns:
            The concatenated page texts, each introduced by a
            ``--- Page N ---`` line, with surrounding whitespace stripped.

        Raises:
            Exception: If the PDF cannot be read; the original error is chained.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            parts = []
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                # Guard against a None result for pages without a text layer.
                page_text = page.extract_text() or ""
                if page_text.strip():
                    parts.append(f"\n--- Page {page_num} ---\n{page_text}")
            return "".join(parts).strip()
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}") from e

    def clean_text(self, text: str) -> str:
        """Normalize extracted text for summarization.

        Removes the ``--- Page N ---`` markers, replaces every character that
        is not a word character, whitespace or basic punctuation with a space,
        and collapses whitespace runs to single spaces.
        """
        # Markers go first and whitespace is collapsed last, so neither the
        # marker removal nor the character replacement can leave runs of
        # spaces behind in the result.
        text = re.sub(r'--- Page \d+ ---', ' ', text)
        text = re.sub(r'[^\w\s.,!?;:()\-"]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def chunk_text(self, text: str, max_chunk_length: int = 512, max_chunks: int = 5) -> List[str]:
        """Group sentences into chunks of at most ``max_chunk_length`` words.

        Sentences are delimited by a period followed by a space. A single
        sentence longer than ``max_chunk_length`` becomes a chunk of its own.

        Args:
            text: Cleaned document text.
            max_chunk_length: Maximum number of whitespace-separated words per chunk.
            max_chunks: Maximum number of chunks returned (the rest of the
                document is dropped); ``None`` returns every chunk.

        Returns:
            The chunk strings in document order; an empty list for blank text.
        """
        text = text.strip()
        if not text:
            return []

        # Split on the space *after* a period so each sentence keeps its own
        # punctuation; nothing has to be re-appended (which used to double the
        # final period of the text).
        sentences = [s for s in re.split(r'(?<=\.) ', text) if s]

        chunks = []
        current = []
        current_words = 0
        for sentence in sentences:
            sentence_words = len(sentence.split())
            if current and current_words + sentence_words > max_chunk_length:
                chunks.append(" ".join(current))
                current, current_words = [], 0
            current.append(sentence)
            current_words += sentence_words
        if current:
            chunks.append(" ".join(current))
        return chunks[:max_chunks]

    def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str:
        """Summarize one chunk of text.

        Args:
            chunk: Text to summarize.
            max_length: Upper bound on the generated summary length.
            min_length: Lower bound on the generated summary length.

        Returns:
            The summary text, or an ``"Error summarizing chunk: ..."`` message
            if the pipeline raised.
        """
        try:
            summary = self.summarizer(
                chunk,
                # Generation lengths must be integers; callers may hand in floats.
                max_length=int(max_length),
                min_length=int(min_length),
                do_sample=False,
                truncation=True,
                early_stopping=True,
                num_beams=2
            )
            return summary[0]['summary_text']
        except Exception as e:
            return f"Error summarizing chunk: {str(e)}"

    def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
        """Run the full extract / clean / chunk / summarize workflow.

        Args:
            pdf_file: Raw bytes of the PDF document.
            summary_type: ``"Brief (Quick)"``, ``"Detailed"``, or anything else
                for the longest preset.

        Returns:
            A ``(summary, statistics_markdown, status)`` tuple. On failure the
            first element carries the error message and the other two are empty.
        """
        try:
            raw_text = self.extract_text_from_pdf(pdf_file)
            if not raw_text.strip():
                return "❌ Error: No text could be extracted from the PDF.", "", ""

            cleaned_text = self.clean_text(raw_text)
            word_count = len(cleaned_text.split())
            char_count = len(cleaned_text)
            if word_count < 50:
                return "❌ Error: PDF contains too little text to summarize.", "", ""

            chunks = self.chunk_text(cleaned_text)

            # (max_length, min_length) per summary preset.
            length_presets = {
                "Brief (Quick)": (60, 20),
                "Detailed": (100, 40),
            }
            max_len, min_len = length_presets.get(summary_type, (150, 60))

            chunk_summaries = []
            for i, chunk in enumerate(chunks):
                print(f"Processing chunk {i+1}/{len(chunks)}")
                chunk_summaries.append(self.summarize_chunk(chunk, max_len, min_len))
            combined_summary = " ".join(chunk_summaries)

            if len(chunks) <= 2:
                final_summary = combined_summary
            else:
                # Second pass over the chunk summaries. The length cap is kept
                # an integer (max_len * 1.5 alone would be a float).
                final_summary = self.summarize_chunk(
                    combined_summary,
                    max_length=min(200, int(max_len * 1.5)),
                    min_length=min_len
                )

            summary_words = len(final_summary.split())
            # An empty summary must not turn into a ZeroDivisionError.
            compression = f"{word_count / summary_words:.1f}:1" if summary_words else "n/a"

            summary_stats = "\n".join([
                "",
                "πŸ“Š **Document Statistics:**",
                f"- Original word count: {word_count:,}",
                f"- Original character count: {char_count:,}",
                # This figure is the number of text chunks, not PDF pages.
                f"- Chunks processed: {len(chunks)}",
                f"- Summary word count: {summary_words:,}",
                f"- Compression ratio: {compression}",
                "",
            ])
            return final_summary, summary_stats, "βœ… Summary generated successfully!"
        except Exception as e:
            return f"❌ Error processing PDF: {str(e)}", "", ""
131
+
132
# Single summarizer instance, created when the module is imported and used by
# summarize_pdf_interface().
pdf_summarizer = PDFSummarizer()
# Cleaned text of the most recently processed PDF: written by
# summarize_pdf_interface() and read by answer_question_interface().
# NOTE(review): this is module-level state, so it is presumably shared by all
# concurrent sessions of the app — confirm whether per-session state is needed.
global_pdf_text = ""  # used for QA
134
+
135
def summarize_pdf_interface(pdf_file, summary_type):
    """Summarize an uploaded PDF and cache its cleaned text for question answering.

    Args:
        pdf_file: Filesystem path of the uploaded PDF, or ``None`` when no file
            is selected.
        summary_type: Summary length preset forwarded to
            ``pdf_summarizer.process_pdf``.

    Returns:
        A ``(summary, statistics, status)`` tuple of strings. On failure the
        first element carries the error message and the other two are empty.
    """
    global global_pdf_text
    # Forget the previously cached document up front, so a cleared or
    # unreadable upload can never leave QA answering from stale text.
    global_pdf_text = ""

    if pdf_file is None:
        return "❌ Please upload a PDF file.", "", ""
    try:
        with open(pdf_file, 'rb') as f:
            pdf_content = f.read()
        global_pdf_text = pdf_summarizer.clean_text(pdf_summarizer.extract_text_from_pdf(pdf_content))
        summary, stats, status = pdf_summarizer.process_pdf(pdf_content, summary_type)
        return summary, stats, status
    except Exception as e:
        return f"❌ Error: {str(e)}", "", ""
147
+
148
+ # === NEW: QA FUNCTION ===
149
def answer_question_interface(question):
    """Answer a question about the most recently processed PDF.

    Args:
        question: The user's question text.

    Returns:
        The answer span extracted by ``qa_pipeline`` from ``global_pdf_text``,
        or a message starting with "❌" when the question is blank, no PDF
        text is cached yet, or the pipeline raises.
    """
    # Reject blank input here instead of letting the pipeline raise on it.
    if not question or not question.strip():
        return "❌ Please enter a question."
    if not global_pdf_text:
        return "❌ Please upload and summarize a PDF first."
    try:
        answer = qa_pipeline(question=question, context=global_pdf_text)
        return answer["answer"]
    except Exception as e:
        return f"❌ Error: {str(e)}"
157
+
158
+ # === GRADIO INTERFACE ===
159
def create_interface():
    """Assemble the Gradio Blocks UI and return it without launching.

    The layout has an upload/controls column beside a results column, followed
    by a question box. Summarization runs both when the button is clicked and
    when the uploaded file changes; a question is answered when it is submitted.
    """
    with gr.Blocks(title="πŸ“„ AI PDF Summarizer & QA", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# πŸ“„ PDF Summarizer + πŸ’¬ Question Answering")

        with gr.Row():
            # Left column: inputs and status line.
            with gr.Column(scale=1):
                pdf_input = gr.File(label="πŸ“ Upload PDF", file_types=[".pdf"], type="filepath")
                summary_type = gr.Radio(
                    choices=["Brief (Quick)", "Detailed", "Comprehensive"],
                    value="Detailed",
                    label="πŸ“ Summary Length"
                )
                summarize_btn = gr.Button("πŸš€ Generate Summary", variant="primary")
                status_output = gr.Textbox(label="πŸ“‹ Status", interactive=False, max_lines=2)
            # Right column: summary text and statistics.
            with gr.Column(scale=2):
                summary_output = gr.Textbox(label="πŸ“ Summary", lines=15, interactive=False)
                stats_output = gr.Markdown(label="πŸ“Š Document Statistics")

        # Both triggers share the same handler and wiring.
        summarize_wiring = dict(
            fn=summarize_pdf_interface,
            inputs=[pdf_input, summary_type],
            outputs=[summary_output, stats_output, status_output],
        )
        summarize_btn.click(**summarize_wiring)
        pdf_input.change(**summarize_wiring)

        gr.Markdown("## πŸ’¬ Ask a Question About the PDF")
        with gr.Row():
            question_input = gr.Textbox(label="❓ Your Question", placeholder="e.g. What is the main finding?")
            answer_output = gr.Textbox(label="πŸ’‘ Answer", interactive=False)
        question_input.submit(fn=answer_question_interface, inputs=question_input, outputs=answer_output)

    return demo
195
+
196
+ # === MAIN ===
197
if __name__ == "__main__":
    # Build the UI and start serving it only when run as a script.
    create_interface().launch()