Layan22 commited on
Commit
565b2dd
·
verified ·
1 Parent(s): 8817e65

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -0
app.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import PyPDF2
3
+ import io
4
+ from transformers import pipeline, AutoTokenizer
5
+ import torch
6
+ import re
7
+ from typing import List, Tuple
8
+ import warnings
9
+ warnings.filterwarnings("ignore")
10
+
11
+ class PDFSummarizer:
12
def __init__(self):
    """Load the summarization pipeline and its tokenizer.

    Prefers a distilled BART checkpoint for speed; if it fails to load,
    falls back to the full BART-large CNN checkpoint on CPU.
    """
    # Distilled BART is several times faster than BART-large with
    # comparable summary quality.
    self.model_name = "sshleifer/distilbart-cnn-12-6"
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {self.device}")

    try:
        # fp16 on GPU halves memory and speeds up generation; fp32 on CPU.
        self.summarizer = pipeline(
            "summarization",
            model=self.model_name,
            device=0 if self.device == "cuda" else -1,
            framework="pt",
            model_kwargs={"torch_dtype": torch.float16 if self.device == "cuda" else torch.float32},
        )
        # Tokenizer is kept for input-length calculations elsewhere.
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        print("Model loaded successfully")
    except Exception as e:
        print(f"Error loading model: {e}")
        # Fallback: full BART-large CNN on CPU — slower but a very
        # widely mirrored checkpoint, so it loads reliably.
        self.model_name = "facebook/bart-large-cnn"
        self.summarizer = pipeline("summarization", model=self.model_name, device=-1)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        print("Fallback model loaded")
+
40
def extract_text_from_pdf(self, pdf_file) -> str:
    """Extract the text of every page from raw PDF bytes.

    Args:
        pdf_file: PDF file content as bytes.

    Returns:
        Page texts concatenated with "--- Page N ---" markers, stripped.

    Raises:
        Exception: if the PDF cannot be parsed or read.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
        parts = []
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            # extract_text() may yield None/"" for image-only pages;
            # guard before .strip() and skip pages with no text.
            page_text = page.extract_text() or ""
            if page_text.strip():
                parts.append(f"\n--- Page {page_num} ---\n")
                parts.append(page_text)
        # join() avoids quadratic string concatenation on large PDFs.
        return "".join(parts).strip()
    except Exception as e:
        # Re-raise as plain Exception (callers catch that type), but
        # chain the cause for debuggability.
        raise Exception(f"Error extracting text from PDF: {str(e)}") from e
+
56
def clean_text(self, text: str) -> str:
    """Normalize extracted PDF text for summarization.

    Collapses whitespace runs to single spaces, blanks out characters
    other than word characters and common punctuation, and drops the
    "--- Page N ---" markers inserted during extraction.
    """
    # (pattern, replacement) passes, applied in this exact order:
    # markers must outlive the first two passes so the third can match.
    passes = (
        (r'\s+', ' '),                    # collapse whitespace runs
        (r'[^\w\s.,!?;:()\-"]', ' '),     # blank unexpected symbols
        (r'--- Page \d+ ---', ''),        # remove page markers
    )
    cleaned = text
    for pattern, replacement in passes:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
+
66
def chunk_text(self, text: str, max_chunk_length: int = 512) -> List[str]:
    """Split text into word-limited chunks along sentence boundaries.

    Args:
        text: Cleaned document text.
        max_chunk_length: Maximum chunk size measured in whitespace-
            separated words (a fast proxy for tokenizer length).

    Returns:
        At most 5 chunks (capped for speed). Empty input yields [].
    """
    chunks: List[str] = []
    current_chunk = ""

    for sentence in text.split('. '):
        sentence = sentence.strip()
        if not sentence:
            # Skip empty fragments (empty input, ".. " artifacts).
            continue
        # split('. ') strips the terminator from all but the last
        # sentence; restore it without doubling an existing one.
        if not sentence.endswith('.'):
            sentence += '.'
        candidate = current_chunk + sentence + " "
        # Word count is a fast stand-in for tokenizer length.
        if len(candidate.split()) <= max_chunk_length:
            current_chunk = candidate
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    # Cap the number of chunks so end-to-end summarization stays fast.
    return chunks[:5]
+
89
+ def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str:
90
+ """Summarize a single chunk of text with speed optimizations"""
91
+ try:
92
+ # Speed optimizations
93
+ summary = self.summarizer(
94
+ chunk,
95
+ max_length=max_length,
96
+ min_length=min_length,
97
+ do_sample=False,
98
+ truncation=True,
99
+ early_stopping=True,
100
+ num_beams=2 # Reduced from default 4 for speed
101
+ )
102
+ return summary[0]['summary_text']
103
+ except Exception as e: