Layan22 commited on
Commit
565b2dd
·
verified ·
1 Parent(s): 8817e65

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -0
app.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import PyPDF2
3
+ import io
4
+ from transformers import pipeline, AutoTokenizer
5
+ import torch
6
+ import re
7
+ from typing import List, Tuple
8
+ import warnings
9
+ warnings.filterwarnings("ignore")
10
+
11
+ class PDFSummarizer:
12
def __init__(self):
    """Load the summarization pipeline and its tokenizer.

    Prefers a distilled BART checkpoint for speed; if it fails to load,
    falls back to the full BART-large CNN checkpoint on CPU.
    """
    # Distilled BART is several times faster than BART-large with
    # comparable summary quality.
    self.model_name = "sshleifer/distilbart-cnn-12-6"
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {self.device}")

    try:
        # fp16 on GPU halves memory and speeds up generation; fp32 on CPU.
        self.summarizer = pipeline(
            "summarization",
            model=self.model_name,
            device=0 if self.device == "cuda" else -1,
            framework="pt",
            model_kwargs={"torch_dtype": torch.float16 if self.device == "cuda" else torch.float32},
        )
        # Tokenizer is kept for input-length calculations elsewhere.
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        print("Model loaded successfully")
    except Exception as e:
        print(f"Error loading model: {e}")
        # Fallback: full BART-large CNN on CPU — slower but a very
        # widely mirrored checkpoint, so it loads reliably.
        self.model_name = "facebook/bart-large-cnn"
        self.summarizer = pipeline("summarization", model=self.model_name, device=-1)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        print("Fallback model loaded")
+
40
def extract_text_from_pdf(self, pdf_file) -> str:
    """Extract the text of every page from raw PDF bytes.

    Args:
        pdf_file: PDF file content as bytes.

    Returns:
        Page texts concatenated with "--- Page N ---" markers, stripped.

    Raises:
        Exception: if the PDF cannot be parsed or read.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
        parts = []
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            # extract_text() may yield None/"" for image-only pages;
            # guard before .strip() and skip pages with no text.
            page_text = page.extract_text() or ""
            if page_text.strip():
                parts.append(f"\n--- Page {page_num} ---\n")
                parts.append(page_text)
        # join() avoids quadratic string concatenation on large PDFs.
        return "".join(parts).strip()
    except Exception as e:
        # Re-raise as plain Exception (callers catch that type), but
        # chain the cause for debuggability.
        raise Exception(f"Error extracting text from PDF: {str(e)}") from e
+
56
def clean_text(self, text: str) -> str:
    """Normalize extracted PDF text for summarization.

    Collapses whitespace runs to single spaces, blanks out characters
    other than word characters and common punctuation, and drops the
    "--- Page N ---" markers inserted during extraction.
    """
    # (pattern, replacement) passes, applied in this exact order:
    # markers must outlive the first two passes so the third can match.
    passes = (
        (r'\s+', ' '),                    # collapse whitespace runs
        (r'[^\w\s.,!?;:()\-"]', ' '),     # blank unexpected symbols
        (r'--- Page \d+ ---', ''),        # remove page markers
    )
    cleaned = text
    for pattern, replacement in passes:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
+
66
def chunk_text(self, text: str, max_chunk_length: int = 512) -> List[str]:
    """Split text into word-limited chunks along sentence boundaries.

    Args:
        text: Cleaned document text.
        max_chunk_length: Maximum chunk size measured in whitespace-
            separated words (a fast proxy for tokenizer length).

    Returns:
        At most 5 chunks (capped for speed). Empty input yields [].
    """
    chunks: List[str] = []
    current_chunk = ""

    for sentence in text.split('. '):
        sentence = sentence.strip()
        if not sentence:
            # Skip empty fragments (empty input, ".. " artifacts).
            continue
        # split('. ') strips the terminator from all but the last
        # sentence; restore it without doubling an existing one.
        if not sentence.endswith('.'):
            sentence += '.'
        candidate = current_chunk + sentence + " "
        # Word count is a fast stand-in for tokenizer length.
        if len(candidate.split()) <= max_chunk_length:
            current_chunk = candidate
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    # Cap the number of chunks so end-to-end summarization stays fast.
    return chunks[:5]
+
89
+ def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str:
90
+ """Summarize a single chunk of text with speed optimizations"""
91
+ try:
92
+ # Speed optimizations
93
+ summary = self.summarizer(
94
+ chunk,
95
+ max_length=max_length,
96
+ min_length=min_length,
97
+ do_sample=False,
98
+ truncation=True,
99
+ early_stopping=True,
100
+ num_beams=2 # Reduced from default 4 for speed
101
+ )
102
+ return summary[0]['summary_text']
103
+ except Exception as e: