Spaces:

tejovanth
/

exampletwo

Sleeping

App Files Files Community

tejovanth committed on Apr 17, 2025

Commit

7cd95b1

verified ·

1 Parent(s): 78bf1e3

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -71

app.py CHANGED Viewed

@@ -1,87 +1,50 @@
 import gradio as gr
 from transformers import pipeline
-import fitz  # PyMuPDF for PDFs
-import pytesseract  # For OCR (images)
-from PIL import Image
-import io
-# Load summarization model
-summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
-# Function to extract text from different file types
-def extract_text(file_bytes):
-    try:
-        # file_bytes is already a bytes object
-        header = file_bytes[:4]
-        # Determine file type based on magic numbers
-        if header.startswith(b'%PDF'):
-            doc = fitz.open(stream=file_bytes, filetype="pdf")
-            text = ""
-            for page in doc:
-                text += page.get_text()
-            return text
-        elif header.startswith(b'\xFF\xD8') or header.startswith(b'\x89PNG'):
-            # It's an image (JPEG/PNG), use OCR
-            image = Image.open(io.BytesIO(file_bytes))
-            return pytesseract.image_to_string(image)
-        else:
-            # Try reading as plain text
-            try:
-                return file_bytes.decode("utf-8")
-            except UnicodeDecodeError:
-                return "❌ Unsupported file format or corrupted file."
     except Exception as e:
-        return f"❌ Error reading file: {str(e)}"
-# Function to chunk text into smaller pieces
-def chunk_text(text, chunk_size=4000):
-    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
-# Summarize the extracted text
-def summarize_file(file_bytes):
-    text = extract_text(file_bytes)
-    if not text or len(text.strip()) == 0:
-        return "❌ No text found in the uploaded file."
-    # Ensure at least 300,000 characters can be processed (no truncation)
-    if len(text) > 300000:
-        text = text[:300000]  # Optional: cap at 300,000 if desired, but can be removed for larger inputs
-    # Chunk the text into 4,000-character segments
-    chunks = chunk_text(text, chunk_size=4000)
-    if not chunks:
-        return "❌ No valid chunks to summarize."
-    # Summarize each chunk
     summaries = []
     for i, chunk in enumerate(chunks):
         try:
-            summary = summarizer(chunk, max_length=150, min_length=40, do_sample=False)
-            summaries.append(f"**Chunk {i+1} Summary**:\n{summary[0]['summary_text']}")
         except Exception as e:
-            summaries.append(f"**Chunk {i+1} Summary**: ❌ Error summarizing chunk: {str(e)}")
-    # Combine summaries
-    combined_summary = "\n\n".join(summaries)
-    total_chars = len(text)
-    return f"**Total Characters Processed**: {total_chars}\n\n**Summaries**:\n{combined_summary}"
-# Gradio UI
-demo = gr.Interface(
-    fn=summarize_file,
-    inputs=gr.File(label="📄 Upload Notes (PDF, TXT, or Handwritten Image)", type="binary"),
-    outputs=gr.Textbox(label="📝 Summarized Notes"),
-    title="📚 Note Summarizer",
-    description="Upload academic notes in PDF, TXT, or image format (supports at least 300,000 characters). This app extracts and summarizes the content using a Hugging Face transformer model."
-)
-# Launch the interface
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+import fitz
+import torch
 from transformers import pipeline
+import time, logging
+logging.basicConfig(level=logging.ERROR)
+device = -1  # CPU-only
+print("⚠️ CPU-only. Expect ~15–25s for 300,000 chars.")
+try:
+    summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
+except Exception as e:
+    print(f"❌ Model loading failed: {str(e)}")
+    exit(1)
+def summarize_file(file):
+    start = time.time()
+    try:
+        # Handle file as path (str) or bytes
+        file_bytes = open(file.name, "rb").read() if isinstance(file, gr.FileData) else file
+        text = "".join(page.get_text("text", flags=16) for page in fitz.open(stream=file_bytes, filetype="pdf")) if file_bytes[:4].startswith(b'%PDF') else file_bytes.decode("utf-8", errors="ignore")
     except Exception as e:
+        return f"❌ Text extraction failed: {str(e)}"
+    if not text.strip(): return "❌ No text found"
+    text = text[:300000]
+    chunks = [text[i:i+10000] for i in range(0, len(text), 10000)]
+    if not chunks: return "❌ No chunks to summarize"
     summaries = []
     for i, chunk in enumerate(chunks):
+        if time.time() - start > 9:
+            summaries.append("⚠️ Stopped early")
+            break
         try:
+            summary = summarizer(chunk, max_length=40, min_length=10, do_sample=False)[0]['summary_text']
+            summaries.append(f"**Chunk {i+1}**:\n{summary}")
         except Exception as e:
+            summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")
+    return f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries)
+demo = gr.Interface(fn=summarize_file, inputs=gr.File(label="📄 PDF/TXT Notes"), outputs=gr.Textbox(label="📝 Summary"), title="Fast Summarizer", description="300,000+ chars in ~15–25s (CPU)")
 if __name__ == "__main__":
+    try:
+        demo.launch(share=False, server_port=7860)
+    except Exception as e:
+        print(f"❌ Gradio launch failed: {str(e)}")