pdfSumAndQnA

Runtime error

App Files Community

Updated app.py

by nidhiguptahf - opened 24 days ago

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+128

-87

Files changed (1) hide show

app.py +128 -87

app.py CHANGED Viewed

@@ -1,4 +1,9 @@
-# KEEPING YOUR ORIGINAL IMPORTS
 import gradio as gr
 import PyPDF2
 import io
@@ -9,8 +14,9 @@ from typing import List, Tuple
 import warnings
 warnings.filterwarnings("ignore")
-# QUESTION-ANSWERING ADDITION
-qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
 # === SUMMARIZER CLASS ===
 class PDFSummarizer:
@@ -18,23 +24,28 @@ class PDFSummarizer:
         self.model_name = "sshleifer/distilbart-cnn-12-6"
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Using device: {self.device}")
-        try:
-            self.summarizer = pipeline(
-                "summarization",
-                model=self.model_name,
-                device=0 if self.device == "cuda" else -1,
-                framework="pt",
-                model_kwargs={"torch_dtype": torch.float16 if self.device == "cuda" else torch.float32}
-            )
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-            print("Model loaded successfully")
-        except Exception as e:
-            print(f"Error loading model: {e}")
-            self.model_name = "facebook/bart-large-cnn"
-            self.summarizer = pipeline("summarization", model=self.model_name, device=-1)
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-            print("Fallback model loaded")
     def extract_text_from_pdf(self, pdf_file) -> str:
         try:
@@ -42,12 +53,11 @@ class PDFSummarizer:
             text = ""
             for page_num, page in enumerate(pdf_reader.pages):
                 page_text = page.extract_text()
-                if page_text.strip():
-                    text += f"\n--- Page {page_num + 1} ---\n"
-                    text += page_text
             return text.strip()
         except Exception as e:
-            raise Exception(f"Error extracting text from PDF: {str(e)}")
     def clean_text(self, text: str) -> str:
         text = re.sub(r'\s+', ' ', text)
@@ -57,143 +67,174 @@ class PDFSummarizer:
     def chunk_text(self, text: str, max_chunk_length: int = 512) -> List[str]:
         sentences = text.split('. ')
-        chunks = []
-        current_chunk = ""
         for sentence in sentences:
-            potential_chunk = current_chunk + sentence + ". "
-            if len(potential_chunk.split()) <= max_chunk_length:
-                current_chunk = potential_chunk
             else:
                 if current_chunk:
                     chunks.append(current_chunk.strip())
                 current_chunk = sentence + ". "
         if current_chunk:
             chunks.append(current_chunk.strip())
         return chunks[:5]
-    def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str:
         try:
             summary = self.summarizer(
                 chunk,
                 max_length=max_length,
                 min_length=min_length,
                 do_sample=False,
                 truncation=True,
-                early_stopping=True,
                 num_beams=2
             )
             return summary[0]['summary_text']
         except Exception as e:
             return f"Error summarizing chunk: {str(e)}"
     def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
         try:
             raw_text = self.extract_text_from_pdf(pdf_file)
             if not raw_text.strip():
-                return "❌ Error: No text could be extracted from the PDF.", "", ""
             cleaned_text = self.clean_text(raw_text)
             word_count = len(cleaned_text.split())
             char_count = len(cleaned_text)
             if word_count < 50:
-                return "❌ Error: PDF contains too little text to summarize.", "", ""
             chunks = self.chunk_text(cleaned_text)
             if summary_type == "Brief (Quick)":
                 max_len, min_len = 60, 20
             elif summary_type == "Detailed":
                 max_len, min_len = 100, 40
             else:
                 max_len, min_len = 150, 60
-            chunk_summaries = []
             for i, chunk in enumerate(chunks):
                 print(f"Processing chunk {i+1}/{len(chunks)}")
-                summary = self.summarize_chunk(chunk, max_len, min_len)
-                chunk_summaries.append(summary)
-            combined_summary = " ".join(chunk_summaries)
-            if len(chunks) <= 2:
-                final_summary = combined_summary
-            else:
                 final_summary = self.summarize_chunk(
-                    combined_summary,
-                    max_length=min(200, max_len * 1.5),
                     min_length=min_len
                 )
-            summary_stats = f"""
 📊 **Document Statistics:**
 - Original word count: {word_count:,}
-- Original character count: {char_count:,}
-- Pages processed: {len(chunks)}
-- Summary word count: {len(final_summary.split()):,}
-- Compression ratio: {word_count / len(final_summary.split()):.1f}:1
-            """
-            return final_summary, summary_stats, "✅ Summary generated successfully!"
         except Exception as e:
-            return f"❌ Error processing PDF: {str(e)}", "", ""
 pdf_summarizer = PDFSummarizer()
-global_pdf_text = ""  # used for QA
 def summarize_pdf_interface(pdf_file, summary_type):
     global global_pdf_text
     if pdf_file is None:
-        return "❌ Please upload a PDF file.", "", ""
     try:
         with open(pdf_file, 'rb') as f:
             pdf_content = f.read()
-        global_pdf_text = pdf_summarizer.clean_text(pdf_summarizer.extract_text_from_pdf(pdf_content))
-        summary, stats, status = pdf_summarizer.process_pdf(pdf_content, summary_type)
-        return summary, stats, status
     except Exception as e:
         return f"❌ Error: {str(e)}", "", ""
-# === NEW: QA FUNCTION ===
 def answer_question_interface(question):
     if not global_pdf_text:
-        return "❌ Please upload and summarize a PDF first."
     try:
         answer = qa_pipeline(question=question, context=global_pdf_text)
         return answer["answer"]
     except Exception as e:
         return f"❌ Error: {str(e)}"
-# === GRADIO INTERFACE ===
 def create_interface():
-    with gr.Blocks(title="📄 AI PDF Summarizer & QA", theme=gr.themes.Soft()) as interface:
-        gr.Markdown("# 📄 PDF Summarizer + 💬 Question Answering")
         with gr.Row():
-            with gr.Column(scale=1):
-                pdf_input = gr.File(label="📁 Upload PDF", file_types=[".pdf"], type="filepath")
                 summary_type = gr.Radio(
-                    choices=["Brief (Quick)", "Detailed", "Comprehensive"],
-                    value="Detailed",
-                    label="📏 Summary Length"
                 )
-                summarize_btn = gr.Button("🚀 Generate Summary", variant="primary")
-                status_output = gr.Textbox(label="📋 Status", interactive=False, max_lines=2)
-            with gr.Column(scale=2):
-                summary_output = gr.Textbox(label="📝 Summary", lines=15, interactive=False)
-                stats_output = gr.Markdown(label="📊 Document Statistics")
-        summarize_btn.click(
-            fn=summarize_pdf_interface,
-            inputs=[pdf_input, summary_type],
-            outputs=[summary_output, stats_output, status_output]
-        )
-        pdf_input.change(
-            fn=summarize_pdf_interface,
             inputs=[pdf_input, summary_type],
-            outputs=[summary_output, stats_output, status_output]
         )
-        gr.Markdown("## 💬 Ask a Question About the PDF")
-        with gr.Row():
-            question_input = gr.Textbox(label="❓ Your Question", placeholder="e.g. What is the main finding?")
-            answer_output = gr.Textbox(label="💡 Answer", interactive=False)
-        question_input.submit(fn=answer_question_interface, inputs=question_input, outputs=answer_output)
-    return interface
 # === MAIN ===
 if __name__ == "__main__":
-    interface = create_interface()
-    interface.launch()

+```python
+# === ENV FIXES (IMPORTANT FOR HF SPACES) ===
+import os
+os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
+# === IMPORTS ===
 import gradio as gr
 import PyPDF2
 import io
 import warnings
 warnings.filterwarnings("ignore")
+# === GLOBALS (LAZY LOADING) ===
+qa_pipeline = None
+global_pdf_text = ""
 # === SUMMARIZER CLASS ===
 class PDFSummarizer:
         self.model_name = "sshleifer/distilbart-cnn-12-6"
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Using device: {self.device}")
+        # Lazy init
+        self.summarizer = None
+        self.tokenizer = None
+    def load_model(self):
+        if self.summarizer is None:
+            try:
+                print("Loading summarization model...")
+                self.summarizer = pipeline(
+                    "summarization",
+                    model=self.model_name,
+                    device=0 if self.device == "cuda" else -1
+                )
+                self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+                print("Model loaded successfully")
+            except Exception as e:
+                print(f"Primary model failed: {e}")
+                self.model_name = "facebook/bart-large-cnn"
+                self.summarizer = pipeline("summarization", model=self.model_name, device=-1)
+                self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+                print("Fallback model loaded")
     def extract_text_from_pdf(self, pdf_file) -> str:
         try:
             text = ""
             for page_num, page in enumerate(pdf_reader.pages):
                 page_text = page.extract_text()
+                if page_text and page_text.strip():
+                    text += f"\n--- Page {page_num + 1} ---\n{page_text}"
             return text.strip()
         except Exception as e:
+            raise Exception(f"Error extracting text: {str(e)}")
     def clean_text(self, text: str) -> str:
         text = re.sub(r'\s+', ' ', text)
     def chunk_text(self, text: str, max_chunk_length: int = 512) -> List[str]:
         sentences = text.split('. ')
+        chunks, current_chunk = [], ""
         for sentence in sentences:
+            temp = current_chunk + sentence + ". "
+            if len(temp.split()) <= max_chunk_length:
+                current_chunk = temp
             else:
                 if current_chunk:
                     chunks.append(current_chunk.strip())
                 current_chunk = sentence + ". "
         if current_chunk:
             chunks.append(current_chunk.strip())
         return chunks[:5]
+    def summarize_chunk(self, chunk: str, max_length=100, min_length=30) -> str:
         try:
+            self.load_model()
             summary = self.summarizer(
                 chunk,
                 max_length=max_length,
                 min_length=min_length,
                 do_sample=False,
                 truncation=True,
                 num_beams=2
             )
             return summary[0]['summary_text']
         except Exception as e:
             return f"Error summarizing chunk: {str(e)}"
     def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
         try:
             raw_text = self.extract_text_from_pdf(pdf_file)
             if not raw_text.strip():
+                return "❌ No text extracted from PDF.", "", ""
             cleaned_text = self.clean_text(raw_text)
             word_count = len(cleaned_text.split())
             char_count = len(cleaned_text)
             if word_count < 50:
+                return "❌ Too little text to summarize.", "", ""
             chunks = self.chunk_text(cleaned_text)
             if summary_type == "Brief (Quick)":
                 max_len, min_len = 60, 20
             elif summary_type == "Detailed":
                 max_len, min_len = 100, 40
             else:
                 max_len, min_len = 150, 60
+            summaries = []
             for i, chunk in enumerate(chunks):
                 print(f"Processing chunk {i+1}/{len(chunks)}")
+                summaries.append(self.summarize_chunk(chunk, max_len, min_len))
+            combined = " ".join(summaries)
+            if len(chunks) > 2:
                 final_summary = self.summarize_chunk(
+                    combined,
+                    max_length=min(200, int(max_len * 1.5)),
                     min_length=min_len
                 )
+            else:
+                final_summary = combined
+            stats = f"""
 📊 **Document Statistics:**
 - Original word count: {word_count:,}
+- Characters: {char_count:,}
+- Chunks: {len(chunks)}
+- Summary words: {len(final_summary.split()):,}
+- Compression: {word_count / len(final_summary.split()):.1f}:1
+"""
+            return final_summary, stats, "✅ Summary generated"
         except Exception as e:
+            return f"❌ Error: {str(e)}", "", ""
 pdf_summarizer = PDFSummarizer()
+# === INTERFACE FUNCTIONS ===
 def summarize_pdf_interface(pdf_file, summary_type):
     global global_pdf_text
     if pdf_file is None:
+        return "❌ Upload a PDF.", "", ""
     try:
         with open(pdf_file, 'rb') as f:
             pdf_content = f.read()
+        global_pdf_text = pdf_summarizer.clean_text(
+            pdf_summarizer.extract_text_from_pdf(pdf_content)
+        )
+        return pdf_summarizer.process_pdf(pdf_content, summary_type)
     except Exception as e:
         return f"❌ Error: {str(e)}", "", ""
 def answer_question_interface(question):
+    global qa_pipeline
     if not global_pdf_text:
+        return "❌ Upload & summarize PDF first."
     try:
+        if qa_pipeline is None:
+            print("Loading QA model...")
+            qa_pipeline = pipeline(
+                "question-answering",
+                model="deepset/roberta-base-squad2"
+            )
         answer = qa_pipeline(question=question, context=global_pdf_text)
         return answer["answer"]
     except Exception as e:
         return f"❌ Error: {str(e)}"
+# === UI ===
 def create_interface():
+    with gr.Blocks(title="PDF Summarizer + QA") as app:
+        gr.Markdown("# 📄 PDF Summarizer + 💬 QA")
         with gr.Row():
+            with gr.Column():
+                pdf_input = gr.File(file_types=[".pdf"])
                 summary_type = gr.Radio(
+                    ["Brief (Quick)", "Detailed", "Comprehensive"],
+                    value="Detailed"
                 )
+                btn = gr.Button("Generate Summary")
+                status = gr.Textbox(label="Status")
+            with gr.Column():
+                summary = gr.Textbox(lines=15, label="Summary")
+                stats = gr.Markdown()
+        btn.click(
+            summarize_pdf_interface,
             inputs=[pdf_input, summary_type],
+            outputs=[summary, stats, status]
         )
+        gr.Markdown("## Ask Questions")
+        question = gr.Textbox()
+        answer = gr.Textbox()
+        question.submit(answer_question_interface, inputs=question, outputs=answer)
+    return app
 # === MAIN ===
 if __name__ == "__main__":
+    app = create_interface()
+    app.launch()
+```