Spaces:

tejovanth
/

exampletwo

Sleeping

App Files Files Community

tejovanth committed on Apr 17, 2025

Commit

6f47432

verified ·

1 Parent(s): f738250

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -91

app.py CHANGED Viewed

@@ -1,109 +1,48 @@
 import gradio as gr
-import fitz  # PyMuPDF for PDFs
-import io
 import torch
 from transformers import pipeline
-from tqdm import tqdm
-import time
-# Check for GPU (mandatory for 5–10s target)
 device = 0 if torch.cuda.is_available() else -1
-if device == -1:
-    print("⚠️ Warning: GPU not detected. 5–10s target requires a GPU. Expect slower performance.")
-# Load summarization model (distilbart-cnn-6-6 is faster)
-summarizer = pipeline(
-    "summarization",
-    model="sshleifer/distilbart-cnn-6-6",
-    device=device,
-    torch_dtype=torch.float16 if device == 0 else torch.float32  # Quantize on GPU
-)
-# Function to extract text from PDFs or text files (skip images for speed)
 def extract_text(file_bytes):
-    try:
-        header = file_bytes[:4]
-        if header.startswith(b'%PDF'):
-            doc = fitz.open(stream=file_bytes, filetype="pdf")
-            text = ""
-            for page in tqdm(doc, desc="Extracting PDF pages", disable=True):  # Silent progress
-                text += page.get_text("text", flags=fitz.TEXTFLAGS_TEXT)  # Fast text-only extraction
-            doc.close()
-            return text
-        else:
-            try:
-                return file_bytes.decode("utf-8")
-            except UnicodeDecodeError:
-                return "❌ Unsupported file format (images not supported for speed)."
-    except Exception as e:
-        return f"❌ Error reading file: {str(e)}"
-# Function to chunk text
-def chunk_text(text, chunk_size=10000):
-    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
-# Summarize the extracted text
-def summarize_file(file_bytes):
-    start_time = time.time()
-    text = extract_text(file_bytes)
-    if not text or len(text.strip()) == 0:
-        return "❌ No text found in the uploaded file."
-    # Cap at 300,000 characters (optional, can remove for larger inputs)
-    if len(text) > 300000:
-        text = text[:300000]
-    # Chunk into 10,000-character segments (~30 chunks for 300,000 chars)
-    chunks = chunk_text(text, chunk_size=10000)
-    if not chunks:
-        return "❌ No valid chunks to summarize."
-    # Summarize with batch processing
     summaries = []
-    batch_size = 8 if device == 0 else 2  # Large batch on GPU, small on CPU
-    max_chunks = 15  # Limit to ~150,000 chars for 5–10s (adjust as needed)
-    for i in range(0, min(len(chunks), max_chunks), batch_size):
-        if time.time() - start_time > 8:  # Stop early if nearing 10s
-            summaries.append("⚠️ Stopped early to meet 5–10s target. Not all text summarized.")
             break
-        batch = chunks[i:i + batch_size]
         try:
-            batch_summaries = summarizer(
-                batch,
-                max_length=100,  # Shorter summaries for speed
-                min_length=20,
-                do_sample=False,
-                truncation=True,
-                batch_size=batch_size
-            )
-            for j, summary in enumerate(batch_summaries):
-                summaries.append(f"**Chunk {i+j+1} Summary**:\n{summary['summary_text']}")
-        except Exception as e:
-            summaries.append(f"**Chunk {i+1} Summary**: ❌ Error: {str(e)}")
-    # Add note if not all chunks processed
-    if len(chunks) > max_chunks:
-        summaries.append(f"⚠️ Only {max_chunks} of {len(chunks)} chunks processed (~{max_chunks*10000} chars). Full processing may take ~12–15s.")
-    combined_summary = "\n\n".join(summaries)
-    elapsed_time = time.time() - start_time
-    return f"**Total Characters Processed**: {min(len(text), max_chunks*10000)}\n**Time Taken**: {elapsed_time:.2f}s\n\n**Summaries**:\n{combined_summary}"
-# Gradio UI
 demo = gr.Interface(
-    fn=summarize_file,
-    inputs=gr.File(label="📄 Upload Notes (PDF or TXT)", type="binary"),
-    outputs=gr.Textbox(label="📝 Summarized Notes"),
-    title="📚 Ultra-Fast Note Summarizer",
-    description="Upload academic notes in PDF or TXT format (supports ~300,000 characters). Optimized for 5–10s runtime using a lightweight model and GPU. Images not supported for speed."
 )
-# Launch the interface
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+import fitz
 import torch
 from transformers import pipeline
+import time, io
 device = 0 if torch.cuda.is_available() else -1
+if device == -1: raise RuntimeError("GPU required for 5–10s target")
+summarizer = pipeline("summarization", model="google/pegasus-xsum", device=device, torch_dtype=torch.int8)
 def extract_text(file_bytes):
+    if file_bytes[:4].startswith(b'%PDF'):
+        doc = fitz.open(stream=file_bytes, filetype="pdf")
+        text = "".join(page.get_text("text", flags=16) for page in doc)
+        doc.close()
+        return text
+    try: return file_bytes.decode("utf-8")
+    except: return "❌ Unsupported format (PDF/TXT only)"
+async def summarize_file(file_bytes):
+    start = time.time()
+    text = extract_text(file_bytes)[:300000] or "❌ No text found"
+    if len(text.strip()) == 0: return text
+    chunks = [text[i:i+15000] for i in range(0, len(text), 15000)]
+    if not chunks: return "❌ No chunks to summarize"
     summaries = []
+    for i in range(0, len(chunks), 10):
+        if time.time() - start > 7:
+            summaries.append("⚠️ Stopped early")
             break
+        batch = chunks[i:i+10]
         try:
+            batch_summaries = summarizer(batch, max_length=40, min_length=10, do_sample=False, batch_size=10)
+            summaries.extend(f"**Chunk {i+j+1}**:\n{s['summary_text']}" for j, s in enumerate(batch_summaries))
+        except: summaries.append(f"**Chunk {i+1}**: ❌ Error")
+    return f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries)
 demo = gr.Interface(
+    fn=summarize_file, inputs=gr.File(label="📄 PDF/TXT Notes"),
+    outputs=gr.Textbox(label="📝 Summary"),
+    title="Fast Summarizer", description="300,000+ chars in ~5s (GPU)"
 )
 if __name__ == "__main__":
+    demo.launch(share=False)