tejovanth commited on
Commit
840cf52
·
verified ·
1 Parent(s): 9a7487f

Upload app (6).py

Browse files
Files changed (1) hide show
  1. app (6).py +87 -0
app (6).py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ import fitz # PyMuPDF for PDFs
4
+ import pytesseract # For OCR (images)
5
+ from PIL import Image
6
+ import io
7
+
8
+ # Load summarization model
9
+ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
10
+
11
# Function to extract text from different file types
def extract_text(file_bytes):
    """Extract plain text from raw uploaded-file bytes.

    The file type is sniffed from magic numbers in the first four bytes:
    ``%PDF`` -> PDF (PyMuPDF text extraction), ``FF D8`` / ``89 'PNG'`` ->
    JPEG/PNG (OCR via pytesseract). Anything else is tried as UTF-8 text.

    Args:
        file_bytes: Raw bytes of the uploaded file.

    Returns:
        The extracted text, or an error-message string starting with '❌'
        when the format is unsupported or reading fails.
    """
    try:
        # file_bytes is already a bytes object
        header = file_bytes[:4]

        # Determine file type based on magic numbers
        if header.startswith(b'%PDF'):
            # Context manager guarantees the document is closed even on
            # error (the original code leaked the open document).
            with fitz.open(stream=file_bytes, filetype="pdf") as doc:
                # join is linear; repeated += concatenation is quadratic
                return "".join(page.get_text() for page in doc)

        elif header.startswith(b'\xFF\xD8') or header.startswith(b'\x89PNG'):
            # It's an image (JPEG/PNG), use OCR
            image = Image.open(io.BytesIO(file_bytes))
            return pytesseract.image_to_string(image)

        else:
            # Try reading as plain text
            try:
                return file_bytes.decode("utf-8")
            except UnicodeDecodeError:
                return "❌ Unsupported file format or corrupted file."

    except Exception as e:
        # Top-level guard: report the failure to the UI instead of
        # crashing the Gradio request handler.
        return f"❌ Error reading file: {str(e)}"
39
+
40
# Function to chunk text into smaller pieces
def chunk_text(text, chunk_size=4000):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    Returns an empty list for empty input; the final piece may be shorter
    than *chunk_size*.
    """
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + chunk_size])
        start += chunk_size
    return pieces
43
+
44
# Summarize the extracted text
def summarize_file(file_bytes):
    """Extract text from an uploaded file and summarize it chunk by chunk.

    Returns a markdown string with the processed character count followed
    by one summary per 4,000-character chunk, or an '❌' message when no
    usable text is found.
    """
    text = extract_text(file_bytes)
    if not text or len(text.strip()) == 0:
        return "❌ No text found in the uploaded file."

    # Cap the input at 300,000 characters to bound processing time.
    capped = text[:300000] if len(text) > 300000 else text

    # Split into 4,000-character segments sized for the model.
    segments = chunk_text(capped, chunk_size=4000)
    if not segments:
        return "❌ No valid chunks to summarize."

    # Summarize each segment independently; a failure on one segment is
    # reported inline rather than aborting the whole run.
    parts = []
    for idx, segment in enumerate(segments, start=1):
        try:
            result = summarizer(segment, max_length=150, min_length=40, do_sample=False)
            parts.append(f"**Chunk {idx} Summary**:\n{result[0]['summary_text']}")
        except Exception as e:
            parts.append(f"**Chunk {idx} Summary**: ❌ Error summarizing chunk: {str(e)}")

    # Combine summaries
    combined = "\n\n".join(parts)
    return f"**Total Characters Processed**: {len(capped)}\n\n**Summaries**:\n{combined}"
72
+
73
+ # Gradio UI
74
+ demo = gr.Interface(
75
+ fn=summarize_file,
76
+ inputs=gr.File(label="πŸ“„ Upload Notes (PDF, TXT, or Handwritten Image)", type="binary"),
77
+ outputs=gr.Textbox(label="πŸ“ Summarized Notes"),
78
+ title="πŸ“š Note Summarizer",
79
+ description="Upload academic notes in PDF, TXT, or image format (supports at least 300,000 characters). This app extracts and summarizes the content using a Hugging Face transformer model."
80
+ )
81
+
82
+ # Launch the interface
83
+ if __name__ == "__main__":
84
+ demo.launch()
85
+
86
+
87
+