Spaces:

Azidan
/

text-sum

Sleeping

App Files Community

Azidan committed 28 days ago

Commit

d484432

verified ·

1 Parent(s): 2884c9e

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -22

app.py CHANGED Viewed

@@ -4,14 +4,19 @@ import torch
 import pdfplumber
 from docx import Document
 import os
-# Load faster model for CPU
 device = 0 if torch.cuda.is_available() else -1
 print(f"Using device: {'GPU' if device == 0 else 'CPU'}")
 summarizer = pipeline(
     "summarization",
-    model="Falconsai/text_summarization",  # Faster/smaller for quick tests
     device=device
 )
@@ -20,23 +25,35 @@ def extract_text(file_path):
         return ""
     file_path = str(file_path)
     filename = os.path.basename(file_path).lower()
     try:
         if filename.endswith('.pdf'):
-            with pdfplumber.open(file_path) as pdf:
-                return "\n".join(page.extract_text() or "" for page in pdf.pages)
         elif filename.endswith('.docx'):
             doc = Document(file_path)
-            return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
         elif filename.endswith('.txt'):
             with open(file_path, "r", encoding="utf-8", errors="replace") as f:
-                return f.read()
         else:
-            return "Unsupported file. Please use .pdf, .docx, or .txt"
     except Exception as e:
-        return f"Error reading file: {str(e)}"
 def summarize(input_text, file_path, detail_level, progress=gr.Progress()):
-    progress(0, desc="Preparing text...")
     if file_path is not None:
         text = extract_text(file_path)
@@ -44,7 +61,7 @@ def summarize(input_text, file_path, detail_level, progress=gr.Progress()):
         text = input_text.strip()
     if not text:
-        return "Please paste text or upload a valid lecture file."
     words = len(text.split())
     if words < 100:
@@ -53,21 +70,20 @@ def summarize(input_text, file_path, detail_level, progress=gr.Progress()):
     target_ratio = detail_level
     target_length = int(words * target_ratio)
-    # Lower caps for speed and to avoid warnings
-    max_l = max(200, min(512, target_length + 100))
     min_l = max(50, int(target_length * 0.65))
     if min_l >= max_l:
         min_l = max_l // 2
-    progress(0.4, desc="Summarizing... (10–60 sec for long text)")
     try:
         result = summarizer(
             text,
             max_length=max_l,
             min_length=min_l,
-            length_penalty=1.8,
             num_beams=4,
             early_stopping=True,
             do_sample=False,
@@ -76,19 +92,19 @@ def summarize(input_text, file_path, detail_level, progress=gr.Progress()):
         progress(1.0, desc="Done!")
         return result[0]['summary_text']
     except Exception as e:
-        return f"Error: {str(e)}\n(Try shorter input or lower detail.)"
-# Interface with progress
 interface = gr.Interface(
     fn=summarize,
     inputs=[
-        gr.Textbox(lines=12, placeholder="Paste lecture text...", label="Lecture Text (Paste)"),
-        gr.File(file_types=[".pdf", ".docx", ".txt"], label="Upload Lecture File"),
-        gr.Slider(0.15, 0.60, value=0.32, step=0.01, label="Detail Level (higher = longer)")
     ],
-    outputs=gr.Textbox(label="Generated Summary"),
-    title="Lecture Summarizer",
-    description="Paste or upload lecture. Progress shows during generation. For long files, lower detail or upgrade to GPU.",
     flagging_mode="never",
 )

 import pdfplumber
 from docx import Document
 import os
+from PyPDF2 import PdfReader
+import fitz  # PyMuPDF for better PDF handling
+from PIL import Image
+import pytesseract  # For OCR on scanned PDFs
+import io
+# Load model
 device = 0 if torch.cuda.is_available() else -1
 print(f"Using device: {'GPU' if device == 0 else 'CPU'}")
 summarizer = pipeline(
     "summarization",
+    model="facebook/bart-large-cnn",  # Better quality for lectures/books
     device=device
 )
         return ""
     file_path = str(file_path)
     filename = os.path.basename(file_path).lower()
+    text = ""
     try:
         if filename.endswith('.pdf'):
+            # Try PyMuPDF for better layout
+            doc = fitz.open(file_path)
+            for page in doc:
+                text += page.get_text("text") + "\n"
+            if not text.strip():  # If empty, try OCR as fallback (scanned PDF)
+                text = ""
+                for page in doc:
+                    pix = page.get_pixmap()
+                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+                    text += pytesseract.image_to_string(img) + "\n"
+            doc.close()
         elif filename.endswith('.docx'):
             doc = Document(file_path)
+            text = "\n".join(para.text for para in doc.paragraphs if para.text.strip())
         elif filename.endswith('.txt'):
             with open(file_path, "r", encoding="utf-8", errors="replace") as f:
+                text = f.read()
         else:
+            return "Unsupported file. Use .pdf, .docx, or .txt"
     except Exception as e:
+        return f"Error reading file: {str(e)} (try non-scanned PDF or shorter file)"
+    return text.strip()
 def summarize(input_text, file_path, detail_level, progress=gr.Progress()):
+    progress(0, desc="Extracting text...")
     if file_path is not None:
         text = extract_text(file_path)
         text = input_text.strip()
     if not text:
+        return "No text found – check file or paste directly."
     words = len(text.split())
     if words < 100:
     target_ratio = detail_level
     target_length = int(words * target_ratio)
+    max_l = max(200, min(1024, target_length + 100))  # Balanced for quality/speed
     min_l = max(50, int(target_length * 0.65))
     if min_l >= max_l:
         min_l = max_l // 2
+    progress(0.4, desc="Summarizing... (10–60 sec, longer for books)")
     try:
         result = summarizer(
             text,
             max_length=max_l,
             min_length=min_l,
+            length_penalty=1.5,  # Lower for more concise but coherent
             num_beams=4,
             early_stopping=True,
             do_sample=False,
         progress(1.0, desc="Done!")
         return result[0]['summary_text']
     except Exception as e:
+        return f"Error: {str(e)}\n(Try lower detail or shorter text section. For books, summarize chapter by chapter.)"
+# Interface
 interface = gr.Interface(
     fn=summarize,
     inputs=[
+        gr.Textbox(lines=12, placeholder="Paste lecture/book text...", label="Text (Paste)"),
+        gr.File(file_types=[".pdf", ".docx", ".txt"], label="Upload File"),
+        gr.Slider(0.15, 0.60, value=0.25, step=0.01, label="Detail Level (higher = longer) – start low for books")
     ],
+    outputs=gr.Textbox(label="Summary"),
+    title="Lecture/Book Summarizer",
+    description="Improved for books like Goggins – better extraction + OCR for scanned PDFs. Use low detail for long texts.",
     flagging_mode="never",
 )