Azidan committed on
Commit
d484432
·
verified ·
1 Parent(s): 2884c9e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -22
app.py CHANGED
@@ -4,14 +4,19 @@ import torch
4
  import pdfplumber
5
  from docx import Document
6
  import os
 
 
 
 
 
7
 
8
- # Load faster model for CPU
9
  device = 0 if torch.cuda.is_available() else -1
10
  print(f"Using device: {'GPU' if device == 0 else 'CPU'}")
11
 
12
  summarizer = pipeline(
13
  "summarization",
14
- model="Falconsai/text_summarization", # Faster/smaller for quick tests
15
  device=device
16
  )
17
 
@@ -20,23 +25,35 @@ def extract_text(file_path):
20
  return ""
21
  file_path = str(file_path)
22
  filename = os.path.basename(file_path).lower()
 
23
  try:
24
  if filename.endswith('.pdf'):
25
- with pdfplumber.open(file_path) as pdf:
26
- return "\n".join(page.extract_text() or "" for page in pdf.pages)
 
 
 
 
 
 
 
 
 
27
  elif filename.endswith('.docx'):
28
  doc = Document(file_path)
29
- return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
30
  elif filename.endswith('.txt'):
31
  with open(file_path, "r", encoding="utf-8", errors="replace") as f:
32
- return f.read()
33
  else:
34
- return "Unsupported file. Please use .pdf, .docx, or .txt"
35
  except Exception as e:
36
- return f"Error reading file: {str(e)}"
 
 
37
 
38
  def summarize(input_text, file_path, detail_level, progress=gr.Progress()):
39
- progress(0, desc="Preparing text...")
40
 
41
  if file_path is not None:
42
  text = extract_text(file_path)
@@ -44,7 +61,7 @@ def summarize(input_text, file_path, detail_level, progress=gr.Progress()):
44
  text = input_text.strip()
45
 
46
  if not text:
47
- return "Please paste text or upload a valid lecture file."
48
 
49
  words = len(text.split())
50
  if words < 100:
@@ -53,21 +70,20 @@ def summarize(input_text, file_path, detail_level, progress=gr.Progress()):
53
  target_ratio = detail_level
54
  target_length = int(words * target_ratio)
55
 
56
- # Lower caps for speed and to avoid warnings
57
- max_l = max(200, min(512, target_length + 100))
58
  min_l = max(50, int(target_length * 0.65))
59
 
60
  if min_l >= max_l:
61
  min_l = max_l // 2
62
 
63
- progress(0.4, desc="Summarizing... (10–60 sec for long text)")
64
 
65
  try:
66
  result = summarizer(
67
  text,
68
  max_length=max_l,
69
  min_length=min_l,
70
- length_penalty=1.8,
71
  num_beams=4,
72
  early_stopping=True,
73
  do_sample=False,
@@ -76,19 +92,19 @@ def summarize(input_text, file_path, detail_level, progress=gr.Progress()):
76
  progress(1.0, desc="Done!")
77
  return result[0]['summary_text']
78
  except Exception as e:
79
- return f"Error: {str(e)}\n(Try shorter input or lower detail.)"
80
 
81
- # Interface with progress
82
  interface = gr.Interface(
83
  fn=summarize,
84
  inputs=[
85
- gr.Textbox(lines=12, placeholder="Paste lecture text...", label="Lecture Text (Paste)"),
86
- gr.File(file_types=[".pdf", ".docx", ".txt"], label="Upload Lecture File"),
87
- gr.Slider(0.15, 0.60, value=0.32, step=0.01, label="Detail Level (higher = longer)")
88
  ],
89
- outputs=gr.Textbox(label="Generated Summary"),
90
- title="Lecture Summarizer",
91
- description="Paste or upload lecture. Progress shows during generation. For long files, lower detail or upgrade to GPU.",
92
  flagging_mode="never",
93
  )
94
 
 
4
  import pdfplumber
5
  from docx import Document
6
  import os
7
+ from PyPDF2 import PdfReader
8
+ import fitz # PyMuPDF for better PDF handling
9
+ from PIL import Image
10
+ import pytesseract # For OCR on scanned PDFs
11
+ import io
12
 
13
+ # Load model
14
  device = 0 if torch.cuda.is_available() else -1
15
  print(f"Using device: {'GPU' if device == 0 else 'CPU'}")
16
 
17
  summarizer = pipeline(
18
  "summarization",
19
+ model="facebook/bart-large-cnn", # Better quality for lectures/books
20
  device=device
21
  )
22
 
 
25
  return ""
26
  file_path = str(file_path)
27
  filename = os.path.basename(file_path).lower()
28
+ text = ""
29
  try:
30
  if filename.endswith('.pdf'):
31
+ # Try PyMuPDF for better layout
32
+ doc = fitz.open(file_path)
33
+ for page in doc:
34
+ text += page.get_text("text") + "\n"
35
+ if not text.strip(): # If empty, try OCR as fallback (scanned PDF)
36
+ text = ""
37
+ for page in doc:
38
+ pix = page.get_pixmap()
39
+ img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
40
+ text += pytesseract.image_to_string(img) + "\n"
41
+ doc.close()
42
  elif filename.endswith('.docx'):
43
  doc = Document(file_path)
44
+ text = "\n".join(para.text for para in doc.paragraphs if para.text.strip())
45
  elif filename.endswith('.txt'):
46
  with open(file_path, "r", encoding="utf-8", errors="replace") as f:
47
+ text = f.read()
48
  else:
49
+ return "Unsupported file. Use .pdf, .docx, or .txt"
50
  except Exception as e:
51
+ return f"Error reading file: {str(e)} (try non-scanned PDF or shorter file)"
52
+
53
+ return text.strip()
54
 
55
  def summarize(input_text, file_path, detail_level, progress=gr.Progress()):
56
+ progress(0, desc="Extracting text...")
57
 
58
  if file_path is not None:
59
  text = extract_text(file_path)
 
61
  text = input_text.strip()
62
 
63
  if not text:
64
+ return "No text found check file or paste directly."
65
 
66
  words = len(text.split())
67
  if words < 100:
 
70
  target_ratio = detail_level
71
  target_length = int(words * target_ratio)
72
 
73
+ max_l = max(200, min(1024, target_length + 100)) # Balanced for quality/speed
 
74
  min_l = max(50, int(target_length * 0.65))
75
 
76
  if min_l >= max_l:
77
  min_l = max_l // 2
78
 
79
+ progress(0.4, desc="Summarizing... (10–60 sec, longer for books)")
80
 
81
  try:
82
  result = summarizer(
83
  text,
84
  max_length=max_l,
85
  min_length=min_l,
86
+ length_penalty=1.5, # Lower for more concise but coherent
87
  num_beams=4,
88
  early_stopping=True,
89
  do_sample=False,
 
92
  progress(1.0, desc="Done!")
93
  return result[0]['summary_text']
94
  except Exception as e:
95
+ return f"Error: {str(e)}\n(Try lower detail or shorter text section. For books, summarize chapter by chapter.)"
96
 
97
+ # Interface
98
  interface = gr.Interface(
99
  fn=summarize,
100
  inputs=[
101
+ gr.Textbox(lines=12, placeholder="Paste lecture/book text...", label="Text (Paste)"),
102
+ gr.File(file_types=[".pdf", ".docx", ".txt"], label="Upload File"),
103
+ gr.Slider(0.15, 0.60, value=0.25, step=0.01, label="Detail Level (higher = longer) – start low for books")
104
  ],
105
+ outputs=gr.Textbox(label="Summary"),
106
+ title="Lecture/Book Summarizer",
107
+ description="Improved for books like Goggins better extraction + OCR for scanned PDFs. Use low detail for long texts.",
108
  flagging_mode="never",
109
  )
110