Files changed (1) hide show
  1. app.py +128 -87
app.py CHANGED
@@ -1,4 +1,9 @@
1
- # KEEPING YOUR ORIGINAL IMPORTS
 
 
 
 
 
2
  import gradio as gr
3
  import PyPDF2
4
  import io
@@ -9,8 +14,9 @@ from typing import List, Tuple
9
  import warnings
10
  warnings.filterwarnings("ignore")
11
 
12
- # QUESTION-ANSWERING ADDITION
13
- qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
 
14
 
15
  # === SUMMARIZER CLASS ===
16
  class PDFSummarizer:
@@ -18,23 +24,28 @@ class PDFSummarizer:
18
  self.model_name = "sshleifer/distilbart-cnn-12-6"
19
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
20
  print(f"Using device: {self.device}")
21
-
22
- try:
23
- self.summarizer = pipeline(
24
- "summarization",
25
- model=self.model_name,
26
- device=0 if self.device == "cuda" else -1,
27
- framework="pt",
28
- model_kwargs={"torch_dtype": torch.float16 if self.device == "cuda" else torch.float32}
29
- )
30
- self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
31
- print("Model loaded successfully")
32
- except Exception as e:
33
- print(f"Error loading model: {e}")
34
- self.model_name = "facebook/bart-large-cnn"
35
- self.summarizer = pipeline("summarization", model=self.model_name, device=-1)
36
- self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
37
- print("Fallback model loaded")
 
 
 
 
 
38
 
39
  def extract_text_from_pdf(self, pdf_file) -> str:
40
  try:
@@ -42,12 +53,11 @@ class PDFSummarizer:
42
  text = ""
43
  for page_num, page in enumerate(pdf_reader.pages):
44
  page_text = page.extract_text()
45
- if page_text.strip():
46
- text += f"\n--- Page {page_num + 1} ---\n"
47
- text += page_text
48
  return text.strip()
49
  except Exception as e:
50
- raise Exception(f"Error extracting text from PDF: {str(e)}")
51
 
52
  def clean_text(self, text: str) -> str:
53
  text = re.sub(r'\s+', ' ', text)
@@ -57,143 +67,174 @@ class PDFSummarizer:
57
 
58
  def chunk_text(self, text: str, max_chunk_length: int = 512) -> List[str]:
59
  sentences = text.split('. ')
60
- chunks = []
61
- current_chunk = ""
62
  for sentence in sentences:
63
- potential_chunk = current_chunk + sentence + ". "
64
- if len(potential_chunk.split()) <= max_chunk_length:
65
- current_chunk = potential_chunk
66
  else:
67
  if current_chunk:
68
  chunks.append(current_chunk.strip())
69
  current_chunk = sentence + ". "
 
70
  if current_chunk:
71
  chunks.append(current_chunk.strip())
 
72
  return chunks[:5]
73
 
74
- def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str:
75
  try:
 
 
76
  summary = self.summarizer(
77
  chunk,
78
  max_length=max_length,
79
  min_length=min_length,
80
  do_sample=False,
81
  truncation=True,
82
- early_stopping=True,
83
  num_beams=2
84
  )
85
  return summary[0]['summary_text']
 
86
  except Exception as e:
87
  return f"Error summarizing chunk: {str(e)}"
88
 
89
  def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
90
  try:
91
  raw_text = self.extract_text_from_pdf(pdf_file)
 
92
  if not raw_text.strip():
93
- return "❌ Error: No text could be extracted from the PDF.", "", ""
 
94
  cleaned_text = self.clean_text(raw_text)
95
  word_count = len(cleaned_text.split())
96
  char_count = len(cleaned_text)
 
97
  if word_count < 50:
98
- return "❌ Error: PDF contains too little text to summarize.", "", ""
 
99
  chunks = self.chunk_text(cleaned_text)
 
100
  if summary_type == "Brief (Quick)":
101
  max_len, min_len = 60, 20
102
  elif summary_type == "Detailed":
103
  max_len, min_len = 100, 40
104
  else:
105
  max_len, min_len = 150, 60
106
- chunk_summaries = []
 
107
  for i, chunk in enumerate(chunks):
108
  print(f"Processing chunk {i+1}/{len(chunks)}")
109
- summary = self.summarize_chunk(chunk, max_len, min_len)
110
- chunk_summaries.append(summary)
111
- combined_summary = " ".join(chunk_summaries)
112
- if len(chunks) <= 2:
113
- final_summary = combined_summary
114
- else:
115
  final_summary = self.summarize_chunk(
116
- combined_summary,
117
- max_length=min(200, max_len * 1.5),
118
  min_length=min_len
119
  )
120
- summary_stats = f"""
 
 
 
121
  πŸ“Š **Document Statistics:**
122
  - Original word count: {word_count:,}
123
- - Original character count: {char_count:,}
124
- - Pages processed: {len(chunks)}
125
- - Summary word count: {len(final_summary.split()):,}
126
- - Compression ratio: {word_count / len(final_summary.split()):.1f}:1
127
- """
128
- return final_summary, summary_stats, "βœ… Summary generated successfully!"
 
 
129
  except Exception as e:
130
- return f"❌ Error processing PDF: {str(e)}", "", ""
 
131
 
132
  pdf_summarizer = PDFSummarizer()
133
- global_pdf_text = "" # used for QA
134
 
 
135
  def summarize_pdf_interface(pdf_file, summary_type):
136
  global global_pdf_text
 
137
  if pdf_file is None:
138
- return "❌ Please upload a PDF file.", "", ""
 
139
  try:
140
  with open(pdf_file, 'rb') as f:
141
  pdf_content = f.read()
142
- global_pdf_text = pdf_summarizer.clean_text(pdf_summarizer.extract_text_from_pdf(pdf_content))
143
- summary, stats, status = pdf_summarizer.process_pdf(pdf_content, summary_type)
144
- return summary, stats, status
 
 
 
 
145
  except Exception as e:
146
  return f"❌ Error: {str(e)}", "", ""
147
 
148
- # === NEW: QA FUNCTION ===
149
  def answer_question_interface(question):
 
 
150
  if not global_pdf_text:
151
- return "❌ Please upload and summarize a PDF first."
 
152
  try:
 
 
 
 
 
 
 
153
  answer = qa_pipeline(question=question, context=global_pdf_text)
154
  return answer["answer"]
 
155
  except Exception as e:
156
  return f"❌ Error: {str(e)}"
157
 
158
- # === GRADIO INTERFACE ===
 
159
  def create_interface():
160
- with gr.Blocks(title="πŸ“„ AI PDF Summarizer & QA", theme=gr.themes.Soft()) as interface:
161
- gr.Markdown("# πŸ“„ PDF Summarizer + πŸ’¬ Question Answering")
 
162
 
163
  with gr.Row():
164
- with gr.Column(scale=1):
165
- pdf_input = gr.File(label="πŸ“ Upload PDF", file_types=[".pdf"], type="filepath")
166
  summary_type = gr.Radio(
167
- choices=["Brief (Quick)", "Detailed", "Comprehensive"],
168
- value="Detailed",
169
- label="πŸ“ Summary Length"
170
  )
171
- summarize_btn = gr.Button("πŸš€ Generate Summary", variant="primary")
172
- status_output = gr.Textbox(label="πŸ“‹ Status", interactive=False, max_lines=2)
173
- with gr.Column(scale=2):
174
- summary_output = gr.Textbox(label="πŸ“ Summary", lines=15, interactive=False)
175
- stats_output = gr.Markdown(label="πŸ“Š Document Statistics")
176
-
177
- summarize_btn.click(
178
- fn=summarize_pdf_interface,
179
- inputs=[pdf_input, summary_type],
180
- outputs=[summary_output, stats_output, status_output]
181
- )
182
- pdf_input.change(
183
- fn=summarize_pdf_interface,
184
  inputs=[pdf_input, summary_type],
185
- outputs=[summary_output, stats_output, status_output]
186
  )
187
 
188
- gr.Markdown("## πŸ’¬ Ask a Question About the PDF")
189
- with gr.Row():
190
- question_input = gr.Textbox(label="❓ Your Question", placeholder="e.g. What is the main finding?")
191
- answer_output = gr.Textbox(label="πŸ’‘ Answer", interactive=False)
192
- question_input.submit(fn=answer_question_interface, inputs=question_input, outputs=answer_output)
 
 
193
 
194
- return interface
195
 
196
  # === MAIN ===
197
  if __name__ == "__main__":
198
- interface = create_interface()
199
- interface.launch()
 
 
1
+ # app.py: Gradio PDF summarizer and question-answering app
2
+ # === ENV FIXES (IMPORTANT FOR HF SPACES) ===
3
+ import os
4
+ os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
5
+
6
+ # === IMPORTS ===
7
  import gradio as gr
8
  import PyPDF2
9
  import io
 
14
  import warnings
15
  warnings.filterwarnings("ignore")
16
 
17
+ # === GLOBALS (LAZY LOADING) ===
18
+ qa_pipeline = None
19
+ global_pdf_text = ""
20
 
21
  # === SUMMARIZER CLASS ===
22
  class PDFSummarizer:
 
24
  self.model_name = "sshleifer/distilbart-cnn-12-6"
25
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
26
  print(f"Using device: {self.device}")
27
+
28
+ # Lazy init
29
+ self.summarizer = None
30
+ self.tokenizer = None
31
+
32
def load_model(self):
    """Load the summarization pipeline and tokenizer on first use.

    Returns immediately when ``self.summarizer`` is already set. Otherwise a
    "summarization" pipeline is built for ``self.model_name`` (device index 0
    when ``self.device`` is "cuda", otherwise -1) together with the matching
    tokenizer. If anything in that step raises, ``self.model_name`` is
    switched to "facebook/bart-large-cnn" and that model is loaded on CPU
    instead; an error raised by the fallback load is not caught here.
    """
    if self.summarizer is not None:
        # Already initialised by an earlier call.
        return

    try:
        print("Loading summarization model...")
        device_index = 0 if self.device == "cuda" else -1
        self.summarizer = pipeline(
            "summarization",
            model=self.model_name,
            device=device_index,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        print("Model loaded successfully")
    except Exception as e:
        print(f"Primary model failed: {e}")
        fallback_name = "facebook/bart-large-cnn"
        self.model_name = fallback_name
        self.summarizer = pipeline("summarization", model=fallback_name, device=-1)
        self.tokenizer = AutoTokenizer.from_pretrained(fallback_name)
        print("Fallback model loaded")
49
 
50
  def extract_text_from_pdf(self, pdf_file) -> str:
51
  try:
 
53
  text = ""
54
  for page_num, page in enumerate(pdf_reader.pages):
55
  page_text = page.extract_text()
56
+ if page_text and page_text.strip():
57
+ text += f"\n--- Page {page_num + 1} ---\n{page_text}"
 
58
  return text.strip()
59
  except Exception as e:
60
+ raise Exception(f"Error extracting text: {str(e)}")
61
 
62
  def clean_text(self, text: str) -> str:
63
  text = re.sub(r'\s+', ' ', text)
 
67
 
68
def chunk_text(self, text: str, max_chunk_length: int = 512, max_chunks: int = 5) -> List[str]:
    """Split text into sentence-aligned chunks of bounded word count.

    The text is split on ". "; each non-empty piece is treated as one
    sentence. Sentences are packed greedily into a chunk until adding the
    next one would exceed ``max_chunk_length`` whitespace-separated words.
    A single sentence longer than the limit is kept whole as its own chunk.

    Args:
        text: Text to split.
        max_chunk_length: Maximum number of words per chunk.
        max_chunks: Maximum number of chunks returned; later chunks are
            dropped.

    Returns:
        The first ``max_chunks`` chunks, in document order. Empty or
        whitespace-only input yields an empty list.
    """
    chunks: List[str] = []
    current: List[str] = []
    current_words = 0

    for piece in text.split('. '):
        sentence = piece.strip()
        if not sentence:
            # Trailing ". " or empty input: nothing to add.
            continue
        # Splitting removed the period from every sentence except possibly
        # the last; only restore it where it is actually missing, so the
        # final sentence does not end in "..".
        if not sentence.endswith(('.', '!', '?')):
            sentence += '.'

        sentence_words = len(sentence.split())
        if current and current_words + sentence_words > max_chunk_length:
            chunks.append(' '.join(current))
            current = []
            current_words = 0
        current.append(sentence)
        current_words += sentence_words

    if current:
        chunks.append(' '.join(current))

    return chunks[:max_chunks]
85
 
86
def summarize_chunk(self, chunk: str, max_length=100, min_length=30) -> str:
    """Summarize one chunk of text with the lazily loaded pipeline.

    Args:
        chunk: Text to summarize.
        max_length: Upper length bound passed to the summarizer.
        min_length: Lower length bound passed to the summarizer.

    Returns:
        The ``summary_text`` of the first pipeline result. Any exception
        (including one raised while loading the model) is converted into
        the string "Error summarizing chunk: <message>" instead of being
        raised.
    """
    generation_options = {
        "max_length": max_length,
        "min_length": min_length,
        "do_sample": False,
        "truncation": True,
        "num_beams": 2,
    }
    try:
        # Make sure the model exists before the first inference call.
        self.load_model()
        results = self.summarizer(chunk, **generation_options)
        return results[0]['summary_text']
    except Exception as exc:
        return f"Error summarizing chunk: {exc}"
102
 
103
def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
    """Extract, chunk and summarize a PDF, and report document statistics.

    Args:
        pdf_file: PDF input, passed unchanged to ``extract_text_from_pdf``.
        summary_type: "Brief (Quick)" or "Detailed"; any other value selects
            the longest length bounds.

    Returns:
        ``(summary, stats_markdown, status)`` on success. On any failure the
        first element is an error message starting with "❌" and the other
        two are empty strings; exceptions are not propagated.
    """
    # (max_length, min_length) handed to the summarizer per summary type.
    length_bounds = {
        "Brief (Quick)": (60, 20),
        "Detailed": (100, 40),
    }
    # summarize_chunk reports failures as a string with this prefix rather
    # than raising; such a string must not be presented as a summary.
    failure_prefix = "Error summarizing chunk:"

    try:
        raw_text = self.extract_text_from_pdf(pdf_file)
        if not raw_text.strip():
            return "❌ No text extracted from PDF.", "", ""

        cleaned_text = self.clean_text(raw_text)
        word_count = len(cleaned_text.split())
        char_count = len(cleaned_text)
        if word_count < 50:
            return "❌ Too little text to summarize.", "", ""

        chunks = self.chunk_text(cleaned_text)
        max_len, min_len = length_bounds.get(summary_type, (150, 60))

        summaries = []
        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i+1}/{len(chunks)}")
            chunk_summary = self.summarize_chunk(chunk, max_len, min_len)
            if chunk_summary.startswith(failure_prefix):
                return f"❌ {chunk_summary}", "", ""
            summaries.append(chunk_summary)

        combined = " ".join(summaries)

        if len(chunks) > 2:
            # Many partial summaries: condense them once more.
            final_summary = self.summarize_chunk(
                combined,
                max_length=min(200, int(max_len * 1.5)),
                min_length=min_len,
            )
            if final_summary.startswith(failure_prefix):
                return f"❌ {final_summary}", "", ""
        else:
            final_summary = combined

        summary_words = len(final_summary.split())
        if summary_words == 0:
            # Guard the compression ratio below against division by zero.
            return "❌ Summarization produced no text.", "", ""

        stats = "\n".join([
            "",
            "📊 **Document Statistics:**",
            f"- Original word count: {word_count:,}",
            f"- Characters: {char_count:,}",
            f"- Chunks: {len(chunks)}",
            f"- Summary words: {summary_words:,}",
            f"- Compression: {word_count / summary_words:.1f}:1",
            "",
        ])

        return final_summary, stats, "✅ Summary generated"

    except Exception as e:
        # UI boundary: report the failure instead of crashing the handler.
        return f"❌ Error: {str(e)}", "", ""
155
+
156
 
157
  pdf_summarizer = PDFSummarizer()
 
158
 
159
+ # === INTERFACE FUNCTIONS ===
160
def summarize_pdf_interface(pdf_file, summary_type):
    """Gradio handler: summarize the uploaded PDF and cache its text for QA.

    Args:
        pdf_file: Path of the uploaded PDF, or None when nothing is uploaded.
        summary_type: Summary length option forwarded to ``process_pdf``.

    Returns:
        The ``(summary, stats, status)`` tuple from ``process_pdf``, or an
        error message followed by two empty strings.
    """
    global global_pdf_text

    if pdf_file is None:
        return "❌ Upload a PDF.", "", ""

    # Forget the previous document first, so a failed read or extraction
    # cannot leave question answering pointed at a stale PDF.
    global_pdf_text = ""

    try:
        with open(pdf_file, 'rb') as f:
            pdf_content = f.read()

        # NOTE: process_pdf extracts the text again from the same bytes;
        # this copy exists only to serve answer_question_interface.
        global_pdf_text = pdf_summarizer.clean_text(
            pdf_summarizer.extract_text_from_pdf(pdf_content)
        )

        return pdf_summarizer.process_pdf(pdf_content, summary_type)

    except Exception as e:
        return f"❌ Error: {str(e)}", "", ""
178
 
179
+
180
def answer_question_interface(question):
    """Gradio handler: answer a question against the cached PDF text.

    The question-answering pipeline ("deepset/roberta-base-squad2") is
    created on the first call that needs it and kept in the module-level
    ``qa_pipeline``.

    Args:
        question: Question text entered by the user.

    Returns:
        The pipeline's "answer" field, a prompt to upload a PDF when no text
        is cached, or "❌ Error: <message>" if anything raises.
    """
    global qa_pipeline

    if global_pdf_text:
        try:
            if qa_pipeline is None:
                # First question: pay the model-loading cost now.
                print("Loading QA model...")
                qa_pipeline = pipeline(
                    "question-answering",
                    model="deepset/roberta-base-squad2",
                )

            result = qa_pipeline(question=question, context=global_pdf_text)
            return result["answer"]
        except Exception as exc:
            return f"❌ Error: {exc}"

    return "❌ Upload & summarize PDF first."
199
 
200
+
201
+ # === UI ===
202
def create_interface():
    """Build the Gradio Blocks UI for summarization and question answering.

    The summary button calls ``summarize_pdf_interface`` with the uploaded
    file and the selected summary type; submitting the question box calls
    ``answer_question_interface``.

    Returns:
        The ``gr.Blocks`` app, not yet launched.
    """
    with gr.Blocks(title="PDF Summarizer + QA") as app:

        gr.Markdown("# 📄 PDF Summarizer + 💬 QA")

        with gr.Row():
            with gr.Column():
                # Explicit labels: without them Gradio falls back to the
                # generic component name.
                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
                summary_type = gr.Radio(
                    ["Brief (Quick)", "Detailed", "Comprehensive"],
                    value="Detailed",
                    label="Summary Length",
                )
                btn = gr.Button("Generate Summary")
                status = gr.Textbox(label="Status")

            with gr.Column():
                summary = gr.Textbox(lines=15, label="Summary")
                stats = gr.Markdown()

        btn.click(
            summarize_pdf_interface,
            inputs=[pdf_input, summary_type],
            outputs=[summary, stats, status],
        )

        gr.Markdown("## Ask Questions")
        question = gr.Textbox(label="Your Question")
        answer = gr.Textbox(label="Answer")

        question.submit(answer_question_interface, inputs=question, outputs=answer)

    return app
234
 
 
235
 
236
  # === MAIN ===
237
if __name__ == "__main__":
    # Build the UI and start the Gradio server only when run as a script.
    app = create_interface()
    app.launch()