Commit Β·
23139be
1
Parent(s): 3d5b1f6
π ChunkiFy Phase 1: Initial app upload
Browse files- README copy.md +39 -0
- app.py +39 -0
- chunker.py +51 -0
- pdf_reader.py +22 -0
- requirements.txt +2 -0
README copy.md
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# π§© ChunkiFy β Smart PDF Text Chunking App
|
| 2 |
+
|
| 3 |
+
Slice long PDFs into clean, overlapping chunks.
|
| 4 |
+
Built for reading clarity, NLP projects, and AI pipelines.
|
| 5 |
+
|
| 6 |
+
## π What ChunkiFy Does
|
| 7 |
+
|
| 8 |
+
- π Upload any PDF
|
| 9 |
+
- π§ Extracts clean text using PyMuPDF
|
| 10 |
+
- π Splits into sentence-paragraphs
|
| 11 |
+
- π§© Merges smartly into ~200-word chunks with 50-word overlap
|
| 12 |
+
- β
Outputs clean, readable, and consistent content blocks
|
| 13 |
+
|
| 14 |
+
## π₯οΈ Built With
|
| 15 |
+
|
| 16 |
+
- Python
|
| 17 |
+
- PyMuPDF (`fitz`)
|
| 18 |
+
- Gradio
|
| 19 |
+
|
| 20 |
+
## π Use Cases
|
| 21 |
+
|
| 22 |
+
- AI/NLP text preprocessing
|
| 23 |
+
- Book summary readers
|
| 24 |
+
- Embedding pipelines (Phase 2)
|
| 25 |
+
- Focused learning from dense PDFs
|
| 26 |
+
|
| 27 |
+
## π Phase 1: Chunking Milestone
|
| 28 |
+
|
| 29 |
+
This is Phase 1 of a larger AI project.
|
| 30 |
+
Upcoming phases: vector embeddings, semantic search, PDF Q&A chatbot.
|
| 31 |
+
|
| 32 |
+
## π¨βπ» Made by Umer
|
| 33 |
+
|
| 34 |
+
A personal learning sprint turned into a useful microtool.
|
| 35 |
+
[Gradio App Live](link) | [GitHub](link)
|
| 36 |
+
|
| 37 |
+
---
|
| 38 |
+
|
| 39 |
+
> ChunkiFy is not just a toy. It's a foundational block in building smarter AI tools.
|
app.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py
|
| 2 |
+
import gradio as gr
|
| 3 |
+
from pdf_reader import extract_text_from_pdf
|
| 4 |
+
from chunker import chunk_text
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def process_pdf(file):
    """Run the full upload -> extract -> chunk -> format pipeline.

    Args:
        file: Gradio file-upload object; its ``.name`` is the on-disk
            path of the uploaded temp file (assumed — TODO confirm
            against the Gradio version pinned for this app).

    Returns:
        A single display string with every chunk under a
        ``--- Chunk N ---`` header, separated by blank lines.
    """
    # Gradio has already written the upload to disk; reuse that path.
    pdf_path = file.name

    # Extract raw text, then slice it into chunks.
    text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(text)

    # Defensive: tolerate a (chunks, extra) tuple return from chunk_text.
    if isinstance(chunks, tuple):
        chunks = chunks[0]

    # Render each chunk under a numbered header; join keeps one blank
    # line between sections and no trailing whitespace.
    sections = [
        f"--- Chunk {idx} ---\n{chunk.strip()}"
        for idx, chunk in enumerate(chunks, start=1)
    ]
    return "\n\n".join(sections)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# Gradio UI: one file-upload input mapped straight onto process_pdf,
# one large read-only textbox showing the formatted chunks.
demo = gr.Interface(
    fn=process_pdf,
    # Restrict the picker to PDFs; process_pdf assumes a PDF path.
    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
    # 30 visible lines plus a copy button, since chunk output is long.
    outputs=gr.Textbox(label="Chunked Output",
                       lines=30, show_copy_button=True),
    title="π§© ChunkiFy β Smart PDF Text Chunking",
    description="Slice long PDFs into clean, overlapping chunks using PyMuPDF + Gradio.",
)

# Start the local Gradio server only when run as a script, so importing
# this module (e.g. for tests) has no side effects.
if __name__ == "__main__":
    demo.launch()
|
chunker.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def chunk_text(text, max_words=200, overlap=50):
    """Split raw text into word-bounded chunks with word overlap.

    Short inputs (<= 500 total words) are returned as sentence-like
    fragments with no merging; longer inputs are merged greedily into
    chunks of at most ``max_words`` words, each new chunk seeded with
    the last ``overlap`` words of the previous one.

    Args:
        text: Raw extracted text (may contain hard line breaks).
        max_words: Soft cap on words per merged chunk.
        overlap: Number of trailing words carried into the next chunk.

    Returns:
        list[str]: sentence fragments (short input) or merged chunks.
    """
    # Step 1: Clean the text. Join broken line breaks, then collapse the
    # double spaces that joining can create. (The original second
    # replace was a no-op: it replaced a single space with a single
    # space — presumably garbled from '  ' -> ' '.)
    clean_text = text.replace('\n', ' ').replace('  ', ' ')

    # Step 2: Split into sentence-like fragments on ". " (the period
    # stays on the preceding fragment only for the final sentence).
    paragraphs = clean_text.split('. ')
    print(f"Total sentence-paragraphs found: {len(paragraphs)}")

    total_words = sum(len(p.split()) for p in paragraphs)
    print(f"Total word count: {total_words}")

    # Short documents need no merging — return the fragments directly.
    if total_words <= 500:
        print("Short PDF β returning sentence-paragraphs without merging.")
        return paragraphs

    print("Long PDF β using merged 200-word chunks with 50-word overlap.")

    chunks = []
    current_chunk = []        # fragments accumulated for the open chunk
    current_word_count = 0    # running word total of current_chunk

    for para in paragraphs:
        word_count = len(para.split())

        if current_word_count + word_count <= max_words:
            # Fragment still fits — keep accumulating.
            current_chunk.append(para)
            current_word_count += word_count
        else:
            # Flush the open chunk. NOTE: the original bound this value
            # to a local named ``chunk_text``, shadowing the function
            # itself; renamed to ``merged`` to remove the shadowing.
            merged = '. '.join(current_chunk).strip()
            chunks.append(merged)

            # Seed the next chunk with the last ``overlap`` words.
            # A negative slice already yields the whole list when it is
            # shorter than ``overlap``, so no length guard is needed.
            overlap_words = merged.split()[-overlap:]

            current_chunk = [' '.join(overlap_words), para]
            current_word_count = len(overlap_words) + word_count

    # Flush whatever is left after the loop.
    if current_chunk:
        chunks.append('. '.join(current_chunk).strip())

    return chunks
|
pdf_reader.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz # PyMuPDF
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in a PDF.

    Args:
        pdf_path: Filesystem path to the PDF.

    Returns:
        str: All page texts concatenated, with a newline appended
        after each page (empty string for a zero-page document).
    """
    pages = []
    # Context manager guarantees the document handle is closed even if
    # text extraction raises — the original leaked it on error.
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            print(f"π Processing page {i + 1} of {doc.page_count}")
            pages.append(page.get_text())

    # Single join instead of quadratic str += inside the loop; the
    # generator reproduces the original's trailing "\n" per page.
    return "".join(text + "\n" for text in pages)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Manual smoke test: run this module directly to preview extraction on a
# local sample PDF (hard-coded developer path — not used by the app).
if __name__ == "__main__":
    pdf_path = "/Users/mac/Desktop/brief-summary-of-atomic-habits.pdf"
    text = extract_text_from_pdf(pdf_path)
    print(text[:1000])  # Preview first 1000 characters
    # f-prefix removed: the string has no placeholders.
    print("β Text extraction complete.")
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
PyMuPDF
|